Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
7dcc95d
Preliminary creation of oneMKL RNG object in kernel4 (not using it yet).
Oct 12, 2022
2d2e0de
Declaring two RNG distribution objects: discrete and continuous.
L30nardoSV Oct 14, 2022
a42f765
Redefining rng-engine for 64 elements + passing rng engine and distrs…
L30nardoSV Oct 14, 2022
157c04a
Enabling and formatting printfs.
L30nardoSV Oct 14, 2022
f32f4bb
Enabling and formatting printfs.
L30nardoSV Oct 14, 2022
437079b
Enabling and formatting printfs.
L30nardoSV Oct 14, 2022
eae836d
Enabling and formatting printfs.
L30nardoSV Oct 14, 2022
67b0996
Enabling and formatting printfs.
L30nardoSV Oct 14, 2022
a9bed89
Adding local wi id.
L30nardoSV Oct 14, 2022
0b5d91f
Removing temp. private array + writing generated rngs directly into s…
L30nardoSV Oct 14, 2022
3286dde
Removing unused CUDA printfs + commenting out equivalent SYCL printfs.
L30nardoSV Oct 14, 2022
5f12191
Replacing call to gpu_randf() with call to oneapi generate_single().
L30nardoSV Oct 14, 2022
e46d720
Replacing gpu_randf() calls with oneapi generate_single() ones each.
L30nardoSV Oct 14, 2022
71f5bf2
Fixing seed and offset of RNG engine. Otherwise, some resulting energ…
L30nardoSV Oct 14, 2022
fc3ad66
Removing unused discrete RNG distribution object.
L30nardoSV Oct 14, 2022
197aeb7
Adding RNG engine and distribution to LS kernel (not yet calling RNG …
L30nardoSV Oct 14, 2022
0c39ba1
In LS: replacing gpu_randf() calls with oneapi generate_single() ones…
L30nardoSV Oct 14, 2022
277a17f
Commenting out again DOCK_TRACE (as it was initially).
L30nardoSV Oct 14, 2022
cb9df0c
In AD: replacing gpu_randf() call with oneapi generate_single() call.
L30nardoSV Oct 14, 2022
18ca574
In ADAM: replacing gpu_randf() call with oneapi generate_single() call.
L30nardoSV Oct 14, 2022
f283a72
Correcting RNG VecSize for philox from 64 (incorrect) to 16 (maximum …
L30nardoSV Oct 14, 2022
580561f
In Kernel4: adding preprocessor directives to select RNG engine.
L30nardoSV Oct 19, 2022
cb3a582
Defining <RNG_ONEMKL_TYPE> for different engine types.
L30nardoSV Oct 19, 2022
259dc3d
Adding macro <RNG_ORIGINAL> to enable original adhoc/manual RNG code.
L30nardoSV Oct 19, 2022
5152291
In SW: adding macro <RNG_ORIGINAL> to enable original adhoc/manual RN…
L30nardoSV Oct 19, 2022
471701c
In AD and ADAM: adding macro <RNG_ORIGINAL>.
L30nardoSV Oct 19, 2022
44eb582
Redefining macro name for oneMKL engine type.
L30nardoSV Oct 19, 2022
6de0661
Using <RNG_ONEMKL_DISTRIBUTION_TYPE> macro.
L30nardoSV Oct 19, 2022
2d4ba66
Minor comment reordering.
L30nardoSV Oct 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 41 additions & 9 deletions dpcpp/kernel3.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,13 @@ gpu_perform_LS_kernel(
int *evaluation_cnt,
float *offspring_energy,
float *sFloatAccumulator,
int *entity_id)
int *entity_id
#if !defined (RNG_ORIGINAL)
,
RNG_ONEMKL_ENGINE_TYPE* rng_engine,
RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr
#endif
)
// The GPU global function performs local search on the pre-defined entities of conformations_next.
// The number of blocks which should be started equals to num_of_lsentities*num_of_runs.
// This way the first num_of_lsentities entity of each population will be subjected to local search
Expand Down Expand Up @@ -85,8 +91,11 @@ gpu_perform_LS_kernel(
// If entity 0 is not selected according to LS-rate,
// choosing an other entity
if (100.0f *
gpu_randf(cData.pMem_prng_states, item_ct1) >
cData.dockpars.lsearch_rate) {
#if defined (RNG_ORIGINAL)
gpu_randf(cData.pMem_prng_states, item_ct1) > cData.dockpars.lsearch_rate) {
#else
oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) > cData.dockpars.lsearch_rate) {
#endif
*entity_id = cData.dockpars.num_of_lsentities;
}
}
Expand Down Expand Up @@ -138,11 +147,13 @@ gpu_perform_LS_kernel(
#ifdef SWAT3
genotype_deviate[gene_counter] =
*rho *
(2.0f *
gpu_randf(cData.pMem_prng_states, item_ct1) -
1.0f) *
(gpu_randf(cData.pMem_prng_states, item_ct1) <
gene_scale);
#if defined (RNG_ORIGINAL)
(2.0f * gpu_randf(cData.pMem_prng_states, item_ct1) - 1.0f) *
(gpu_randf(cData.pMem_prng_states, item_ct1) < gene_scale);
#else
(2.0f * oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) - 1.0f) *
(oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) < gene_scale);
#endif

// Translation genes
if (gene_counter < 3) {
Expand All @@ -157,7 +168,11 @@ gpu_perform_LS_kernel(
}
}
#else
#if defined (RNG_ORIGINAL)
genotype_deviate[gene_counter] = rho*(2.0f*gpu_randf(cData.pMem_prng_states)-1.0f)*(gpu_randf(cData.pMem_prng_states)<0.3f);
#else
genotype_deviate[gene_counter] = rho*(2.0f*oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine)-1.0f)*(oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine)<0.3f);
#endif

// Translation genes
if (gene_counter < 3) {
Expand Down Expand Up @@ -419,6 +434,17 @@ void gpu_perform_LS(
sycl::range<3>(1, 1, threads),
sycl::range<3>(1, 1, threads)),
[=](sycl::nd_item<3> item_ct1) {

#if !defined (RNG_ORIGINAL)
// Creating an RNG engine object
uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)];
uint64_t rng_offset = item_ct1.get_local_id(2) * threads;
RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset);

// Creating a continuous RNG distribution object
RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr;
#endif

gpu_perform_LS_kernel(
pMem_conformations_next, pMem_energies_next,
item_ct1, dpct_local_acc_ct1.get_pointer(),
Expand All @@ -429,7 +455,13 @@ void gpu_perform_LS(
evaluation_cnt_acc_ct1.get_pointer(),
offspring_energy_acc_ct1.get_pointer(),
sFloatAccumulator_acc_ct1.get_pointer(),
entity_id_acc_ct1.get_pointer());
entity_id_acc_ct1.get_pointer()
#if !defined (RNG_ORIGINAL)
,
&rng_engine,
&rng_continuous_distr
#endif
);
});
});
/*
Expand Down
81 changes: 58 additions & 23 deletions dpcpp/kernel4.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,15 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

#ifdef DOCK_TRACE
#ifdef __SYCL_DEVICE_ONLY__
#define CONSTANT __attribute__((opencl_constant))
#define CL_CONSTANT __attribute__((opencl_constant))
#else
#define CONSTANT
#define CL_CONSTANT
#endif
static const CONSTANT char FMT1[] = "DOCK_TRACE: %s globalID: %6d %20s %10.6f %20s %10.6f";
#define PRINTF(format, ...) { \
static const CL_CONSTANT char _format[] = format; \
sycl::ext::oneapi::experimental::printf(_format, ## __VA_ARGS__); }
#endif


void

gpu_gen_and_eval_newpops_kernel(
Expand All @@ -60,7 +61,13 @@ gpu_gen_and_eval_newpops_kernel(
float *sBestEnergy,
int *sBestID,
sycl::float3 *calc_coords,
float *sFloatAccumulator)
float *sFloatAccumulator
#if !defined (RNG_ORIGINAL)
,
RNG_ONEMKL_ENGINE_TYPE* rng_engine,
RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr
#endif
)
// The GPU global function
{

Expand Down Expand Up @@ -168,17 +175,24 @@ gpu_gen_and_eval_newpops_kernel(
gene_counter < 10;
gene_counter += item_ct1.get_local_range().get(2))
{
#if defined (RNG_ORIGINAL)
randnums[gene_counter] = gpu_randf(cData.pMem_prng_states, item_ct1);
#else
randnums[gene_counter] = oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine);
#endif
}

#if 0
if ((threadIdx.x == 0) && (blockIdx.x == 1))
{
printf("%06d ", blockIdx.x);
for (int i = 0; i < 10; i++)
printf("%12.6f ", randnums[i]);
printf("\n");
item_ct1.barrier(SYCL_MEMORY_SPACE);
if ( (item_ct1.get_group(2) == 1) && (item_ct1.get_local_id(2) == 0) ) {
PRINTF("\nLocal work-item id: %d\n", item_ct1.get_local_id(2));
for (uint32_t j = 0; j < 10; j++) {
PRINTF("randnums[%d]=%2.4f\n", j, randnums[j]);
}
}
item_ct1.barrier(SYCL_MEMORY_SPACE);
#endif

// Determining run ID
run_id = item_ct1.get_group(2) / cData.dockpars.pop_size;
/*
Expand Down Expand Up @@ -337,27 +351,31 @@ gpu_gen_and_eval_newpops_kernel(
{
// Notice: dockpars_mutation_rate was scaled down to [0,1] in host
// to reduce number of operations in device
if (/*100.0f**/ gpu_randf(cData.pMem_prng_states,
item_ct1) <
cData.dockpars.mutation_rate)
#if defined (RNG_ORIGINAL)
if (/*100.0f**/ gpu_randf(cData.pMem_prng_states, item_ct1) < cData.dockpars.mutation_rate)
#else
if (oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) < cData.dockpars.mutation_rate)
#endif
{
// Translation genes
if (gene_counter < 3) {
offspring_genotype[gene_counter] +=
cData.dockpars.abs_max_dmov *
(2.0f * gpu_randf(
cData.pMem_prng_states,
item_ct1) -
1.0f);
#if defined (RNG_ORIGINAL)
(2.0f * gpu_randf(cData.pMem_prng_states, item_ct1) - 1.0f);
#else
(2.0f * oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) - 1.0f);
#endif
}
// Orientation and torsion genes
else {
offspring_genotype[gene_counter] +=
cData.dockpars.abs_max_dang *
(2.0f * gpu_randf(
cData.pMem_prng_states,
item_ct1) -
1.0f);
#if defined (RNG_ORIGINAL)
(2.0f * gpu_randf(cData.pMem_prng_states, item_ct1) - 1.0f);
#else
(2.0f * oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) - 1.0f);
#endif
map_angle(offspring_genotype[gene_counter]);
}

Expand Down Expand Up @@ -467,6 +485,17 @@ void gpu_gen_and_eval_newpops(
sycl::range<3>(1, 1, threadsPerBlock)),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(32)]] {

#if !defined (RNG_ORIGINAL)
// Creating an RNG engine object
uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)];
uint64_t rng_offset = item_ct1.get_local_id(2) * threadsPerBlock;
RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset);

// Creating a continuous RNG distribution object
RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr;
#endif

gpu_gen_and_eval_newpops_kernel(
pMem_conformations_current,
pMem_energies_current, pMem_conformations_next,
Expand All @@ -480,7 +509,13 @@ void gpu_gen_and_eval_newpops(
sBestEnergy_acc_ct1.get_pointer(),
sBestID_acc_ct1.get_pointer(),
calc_coords_acc_ct1.get_pointer(),
sFloatAccumulator_acc_ct1.get_pointer());
sFloatAccumulator_acc_ct1.get_pointer()
#if !defined (RNG_ORIGINAL)
,
&rng_engine,
&rng_continuous_distr
#endif
);
});
});
/*
Expand Down
31 changes: 29 additions & 2 deletions dpcpp/kernel_ad.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,13 @@ gpu_gradient_minAD_kernel(
float *sFloatAccumulator,
float *rho,
int *cons_succ,
int *cons_fail)
int *cons_fail
#if !defined (RNG_ORIGINAL)
,
RNG_ONEMKL_ENGINE_TYPE* rng_engine,
RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr
#endif
)
// The GPU global function performs gradient-based minimization on (some) entities of conformations_next.
// The number of OpenCL compute units (CU) which should be started equals to num_of_minEntities*num_of_runs.
// This way the first num_of_lsentities entity of each population will be subjected to local search
Expand Down Expand Up @@ -133,7 +139,11 @@ gpu_gradient_minAD_kernel(
// If entity 0 is not selected according to LS-rate,
// choosing another entity
if (100.0f *
#if defined (RNG_ORIGINAL)
gpu_randf(cData.pMem_prng_states, item_ct1) >
#else
oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) >
#endif
cData.dockpars.lsearch_rate) {
*entity_id =
cData.dockpars
Expand Down Expand Up @@ -493,6 +503,17 @@ void gpu_gradient_minAD(
sycl::range<3>(1, 1, threads),
sycl::range<3>(1, 1, threads)),
[=](sycl::nd_item<3> item_ct1) {

#if !defined (RNG_ORIGINAL)
// Creating an RNG engine object
uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)];
uint64_t rng_offset = item_ct1.get_local_id(2) * threads;
RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset);

// Creating a continuous RNG distribution object
RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr;
#endif

gpu_gradient_minAD_kernel(
pMem_conformations_next, pMem_energies_next,
item_ct1, dpct_local_acc_ct1.get_pointer(),
Expand All @@ -501,7 +522,13 @@ void gpu_gradient_minAD(
sFloatAccumulator_acc_ct1.get_pointer(),
rho_acc_ct1.get_pointer(),
cons_succ_acc_ct1.get_pointer(),
cons_fail_acc_ct1.get_pointer());
cons_fail_acc_ct1.get_pointer()
#if !defined (RNG_ORIGINAL)
,
&rng_engine,
&rng_continuous_distr
#endif
);
});
});
/*
Expand Down
31 changes: 29 additions & 2 deletions dpcpp/kernel_adam.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,13 @@ gpu_gradient_minAdam_kernel(
GpuData cData,
int *entity_id,
float *best_energy,
float *sFloatAccumulator)
float *sFloatAccumulator
#if !defined (RNG_ORIGINAL)
,
RNG_ONEMKL_ENGINE_TYPE* rng_engine,
RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr
#endif
)
// The GPU global function performs gradient-based minimization on (some) entities of conformations_next.
// The number of OpenCL compute units (CU) which should be started equals to num_of_minEntities*num_of_runs.
// This way the first num_of_lsentities entity of each population will be subjected to local search
Expand Down Expand Up @@ -124,7 +130,11 @@ gpu_gradient_minAdam_kernel(
// If entity 0 is not selected according to LS-rate,
// choosing another entity
if (100.0f *
#if defined (RNG_ORIGINAL)
gpu_randf(cData.pMem_prng_states, item_ct1) >
#else
oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) >
#endif
cData.dockpars.lsearch_rate) {
*entity_id =
cData.dockpars
Expand Down Expand Up @@ -487,12 +497,29 @@ void gpu_gradient_minAdam(
sycl::range<3>(1, 1, threads),
sycl::range<3>(1, 1, threads)),
[=](sycl::nd_item<3> item_ct1) {

#if !defined (RNG_ORIGINAL)
// Creating an RNG engine object
uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)];
uint64_t rng_offset = item_ct1.get_local_id(2) * threads;
RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset);

// Creating a continuous RNG distribution object
RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr;
#endif

gpu_gradient_minAdam_kernel(
pMem_conformations_next, pMem_energies_next,
item_ct1, dpct_local_acc_ct1.get_pointer(),
*cData_ptr_ct1, entity_id_acc_ct1.get_pointer(),
best_energy_acc_ct1.get_pointer(),
sFloatAccumulator_acc_ct1.get_pointer());
sFloatAccumulator_acc_ct1.get_pointer()
#if !defined (RNG_ORIGINAL)
,
&rng_engine,
&rng_continuous_distr
#endif
);
});
});
/*
Expand Down
37 changes: 37 additions & 0 deletions dpcpp/kernels.dp.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,43 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "calcenergy.h"
#include "GpuData.h"
#include "dpcpp_migration.h"
#include "oneapi/mkl/rng/device.hpp"

// Throwing error if two oneMKL RNG engine types is selected
// If no oneMKL RNG is selected, then using original adhoc/manual RNG

//#define RNG_ONEMKL_MRG32K3A
//#define RNG_ONEMKL_PHILOX4X32X10
//#define RNG_ONEMKL_MCG31M1
//#define RNG_ONEMKL_MCG59

#if defined (RNG_ONEMKL_MRG32K3A) && (defined (RNG_ONEMKL_PHILOX4X32X10) || defined (RNG_ONEMKL_MCG31M1) || defined (RNG_ONEMKL_MCG59))
#error "RNG_ONEMKL_MRG32K3A is defined. Do not define additional ONEMKL RNG engines!"
#elif defined (RNG_ONEMKL_PHILOX4X32X10) && (defined (RNG_ONEMKL_MRG32K3A) || defined (RNG_ONEMKL_MCG31M1) || defined (RNG_ONEMKL_MCG59))
#error "RNG_ONEMKL_PHILOX4X32X10 is defined. Do not define additional ONEMKL RNG engines!"
#elif defined (RNG_ONEMKL_MCG31M1) && (defined (RNG_ONEMKL_MRG32K3A) || defined (RNG_ONEMKL_PHILOX4X32X10) || defined (RNG_ONEMKL_MCG59))
#error "RNG_ONEMKL_MCG31M1 is defined. Do not define additional ONEMKL RNG engines!"
#elif defined (RNG_ONEMKL_MCG59) && (defined (RNG_ONEMKL_MRG32K3A) || defined (RNG_ONEMKL_PHILOX4X32X10) || defined (RNG_ONEMKL_MCG31M1))
#error "RNG_ONEMKL_MCG59 is defined. Do not define additional ONEMKL RNG engines!"
#elif !defined (RNG_ONEMKL_MRG32K3A) && !defined (RNG_ONEMKL_PHILOX4X32X10) && !defined (RNG_ONEMKL_MCG31M1) && !defined (RNG_ONEMKL_MCG59)
#define RNG_ORIGINAL
#endif

// Defining data type for selected oneMKL RNG engine type
#if defined (RNG_ONEMKL_MRG32K3A)
#define RNG_ONEMKL_ENGINE_TYPE oneapi::mkl::rng::device::mrg32k3a<16>
#elif defined (RNG_ONEMKL_PHILOX4X32X10)
#define RNG_ONEMKL_ENGINE_TYPE oneapi::mkl::rng::device::philox4x32x10<16>
#elif defined (RNG_ONEMKL_MCG31M1)
#define RNG_ONEMKL_ENGINE_TYPE oneapi::mkl::rng::device::mcg31m1<16>
#elif defined (RNG_ONEMKL_MCG59)
#define RNG_ONEMKL_ENGINE_TYPE oneapi::mkl::rng::device::mcg59<16>
#endif

// Defining data type for oneMKL RNG distribution type
#if !defined (RNG_ORIGINAL)
#define RNG_ONEMKL_DISTRIBUTION_TYPE oneapi::mkl::rng::device::uniform<float>
#endif

inline uint64_t llitoulli(int64_t l)
{
Expand Down