diff --git a/dpcpp/kernel3.dp.cpp b/dpcpp/kernel3.dp.cpp index 25b0f51f..0c3f27b4 100644 --- a/dpcpp/kernel3.dp.cpp +++ b/dpcpp/kernel3.dp.cpp @@ -51,7 +51,13 @@ gpu_perform_LS_kernel( int *evaluation_cnt, float *offspring_energy, float *sFloatAccumulator, - int *entity_id) + int *entity_id + #if !defined (RNG_ORIGINAL) + , + RNG_ONEMKL_ENGINE_TYPE* rng_engine, + RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr + #endif +) // The GPU global function performs local search on the pre-defined entities of conformations_next. // The number of blocks which should be started equals to num_of_lsentities*num_of_runs. // This way the first num_of_lsentities entity of each population will be subjected to local search @@ -85,8 +91,11 @@ gpu_perform_LS_kernel( // If entity 0 is not selected according to LS-rate, // choosing an other entity if (100.0f * - gpu_randf(cData.pMem_prng_states, item_ct1) > - cData.dockpars.lsearch_rate) { + #if defined (RNG_ORIGINAL) + gpu_randf(cData.pMem_prng_states, item_ct1) > cData.dockpars.lsearch_rate) { + #else + oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) > cData.dockpars.lsearch_rate) { + #endif *entity_id = cData.dockpars.num_of_lsentities; } } @@ -138,11 +147,13 @@ gpu_perform_LS_kernel( #ifdef SWAT3 genotype_deviate[gene_counter] = *rho * - (2.0f * - gpu_randf(cData.pMem_prng_states, item_ct1) - - 1.0f) * - (gpu_randf(cData.pMem_prng_states, item_ct1) < - gene_scale); + #if defined (RNG_ORIGINAL) + (2.0f * gpu_randf(cData.pMem_prng_states, item_ct1) - 1.0f) * + (gpu_randf(cData.pMem_prng_states, item_ct1) < gene_scale); + #else + (2.0f * oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) - 1.0f) * + (oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) < gene_scale); + #endif // Translation genes if (gene_counter < 3) { @@ -157,7 +168,11 @@ gpu_perform_LS_kernel( } } #else + #if defined (RNG_ORIGINAL) genotype_deviate[gene_counter] = rho*(2.0f*gpu_randf(cData.pMem_prng_states)-1.0f)*(gpu_randf(cData.pMem_prng_states)<0.3f); + #else + genotype_deviate[gene_counter] = rho*(2.0f*oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine)-1.0f)*(oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine)<0.3f); + #endif // Translation genes if (gene_counter < 3) { @@ -419,6 +434,17 @@ void gpu_perform_LS( sycl::range<3>(1, 1, threads), sycl::range<3>(1, 1, threads)), [=](sycl::nd_item<3> item_ct1) { + + #if !defined (RNG_ORIGINAL) + // Creating an RNG engine object + uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)]; + uint64_t rng_offset = item_ct1.get_local_id(2) * threads; + RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset); + + // Creating a continuous RNG distribution object + RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr; + #endif + gpu_perform_LS_kernel( pMem_conformations_next, pMem_energies_next, item_ct1, dpct_local_acc_ct1.get_pointer(), @@ -429,7 +455,13 @@ void gpu_perform_LS( evaluation_cnt_acc_ct1.get_pointer(), offspring_energy_acc_ct1.get_pointer(), sFloatAccumulator_acc_ct1.get_pointer(), - entity_id_acc_ct1.get_pointer()); + entity_id_acc_ct1.get_pointer() + #if !defined (RNG_ORIGINAL) + , + &rng_engine, + &rng_continuous_distr + #endif + ); }); }); /* diff --git a/dpcpp/kernel4.dp.cpp b/dpcpp/kernel4.dp.cpp index 7547e103..bac08202 100644 --- a/dpcpp/kernel4.dp.cpp +++ b/dpcpp/kernel4.dp.cpp @@ -33,14 +33,15 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #ifdef DOCK_TRACE #ifdef __SYCL_DEVICE_ONLY__ - #define CONSTANT __attribute__((opencl_constant)) + #define CL_CONSTANT __attribute__((opencl_constant)) #else - #define CONSTANT + #define CL_CONSTANT #endif -static const CONSTANT char FMT1[] = "DOCK_TRACE: %s globalID: %6d %20s %10.6f %20s %10.6f"; +#define PRINTF(format, ...) { \ + static const CL_CONSTANT char _format[] = format; \ + sycl::ext::oneapi::experimental::printf(_format, ## __VA_ARGS__); } #endif - void gpu_gen_and_eval_newpops_kernel( @@ -60,7 +61,13 @@ gpu_gen_and_eval_newpops_kernel( float *sBestEnergy, int *sBestID, sycl::float3 *calc_coords, - float *sFloatAccumulator) + float *sFloatAccumulator + #if !defined (RNG_ORIGINAL) + , + RNG_ONEMKL_ENGINE_TYPE* rng_engine, + RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr + #endif + ) // The GPU global function { @@ -168,17 +175,24 @@ gpu_gen_and_eval_newpops_kernel( gene_counter < 10; gene_counter += item_ct1.get_local_range().get(2)) { + #if defined (RNG_ORIGINAL) randnums[gene_counter] = gpu_randf(cData.pMem_prng_states, item_ct1); + #else + randnums[gene_counter] = oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine); + #endif } + #if 0 - if ((threadIdx.x == 0) && (blockIdx.x == 1)) - { - printf("%06d ", blockIdx.x); - for (int i = 0; i < 10; i++) - printf("%12.6f ", randnums[i]); - printf("\n"); + item_ct1.barrier(SYCL_MEMORY_SPACE); + if ( (item_ct1.get_group(2) == 1) && (item_ct1.get_local_id(2) == 0) ) { + PRINTF("\nLocal work-item id: %d\n", item_ct1.get_local_id(2)); + for (uint32_t j = 0; j < 10; j++) { + PRINTF("randnums[%d]=%2.4f\n", j, randnums[j]); + } } + item_ct1.barrier(SYCL_MEMORY_SPACE); #endif + // Determining run ID run_id = item_ct1.get_group(2) / cData.dockpars.pop_size; /* @@ -337,27 +351,31 @@ gpu_gen_and_eval_newpops_kernel( { // Notice: dockpars_mutation_rate was scaled down to [0,1] in host // to reduce number of operations in device - if (/*100.0f**/ gpu_randf(cData.pMem_prng_states, - item_ct1) < - cData.dockpars.mutation_rate) + #if defined (RNG_ORIGINAL) + if (/*100.0f**/ gpu_randf(cData.pMem_prng_states, item_ct1) < cData.dockpars.mutation_rate) + #else + if (oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) < cData.dockpars.mutation_rate) + #endif { // Translation genes if (gene_counter < 3) { offspring_genotype[gene_counter] += cData.dockpars.abs_max_dmov * - (2.0f * gpu_randf( - cData.pMem_prng_states, - item_ct1) - - 1.0f); + #if defined (RNG_ORIGINAL) + (2.0f * gpu_randf(cData.pMem_prng_states, item_ct1) - 1.0f); + #else + (2.0f * oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) - 1.0f); + #endif } // Orientation and torsion genes else { offspring_genotype[gene_counter] += cData.dockpars.abs_max_dang * - (2.0f * gpu_randf( - cData.pMem_prng_states, - item_ct1) - - 1.0f); + #if defined (RNG_ORIGINAL) + (2.0f * gpu_randf(cData.pMem_prng_states, item_ct1) - 1.0f); + #else + (2.0f * oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) - 1.0f); + #endif map_angle(offspring_genotype[gene_counter]); } @@ -467,6 +485,17 @@ void gpu_gen_and_eval_newpops( sycl::range<3>(1, 1, threadsPerBlock)), [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { + + #if !defined (RNG_ORIGINAL) + // Creating an RNG engine object + uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)]; + uint64_t rng_offset = item_ct1.get_local_id(2) * threadsPerBlock; + RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset); + + // Creating a continuous RNG distribution object + RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr; + #endif + gpu_gen_and_eval_newpops_kernel( pMem_conformations_current, pMem_energies_current, pMem_conformations_next, @@ -480,7 +509,13 @@ void gpu_gen_and_eval_newpops( sBestEnergy_acc_ct1.get_pointer(), sBestID_acc_ct1.get_pointer(), calc_coords_acc_ct1.get_pointer(), - sFloatAccumulator_acc_ct1.get_pointer()); + sFloatAccumulator_acc_ct1.get_pointer() + #if !defined (RNG_ORIGINAL) + , + &rng_engine, + &rng_continuous_distr + #endif + ); }); }); /* diff --git a/dpcpp/kernel_ad.dp.cpp b/dpcpp/kernel_ad.dp.cpp index 7c7ee6a5..9cb0b3a6 100644 --- a/dpcpp/kernel_ad.dp.cpp +++ b/dpcpp/kernel_ad.dp.cpp @@ -75,7 +75,13 @@ gpu_gradient_minAD_kernel( float *sFloatAccumulator, float *rho, int *cons_succ, - int *cons_fail) + int *cons_fail + #if !defined (RNG_ORIGINAL) + , + RNG_ONEMKL_ENGINE_TYPE* rng_engine, + RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr + #endif +) // The GPU global function performs gradient-based minimization on (some) entities of conformations_next. // The number of OpenCL compute units (CU) which should be started equals to num_of_minEntities*num_of_runs. // This way the first num_of_lsentities entity of each population will be subjected to local search @@ -133,7 +139,11 @@ gpu_gradient_minAD_kernel( // If entity 0 is not selected according to LS-rate, // choosing another entity if (100.0f * + #if defined (RNG_ORIGINAL) gpu_randf(cData.pMem_prng_states, item_ct1) > + #else + oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) > + #endif cData.dockpars.lsearch_rate) { *entity_id = cData.dockpars @@ -493,6 +503,17 @@ void gpu_gradient_minAD( sycl::range<3>(1, 1, threads), sycl::range<3>(1, 1, threads)), [=](sycl::nd_item<3> item_ct1) { + + #if !defined (RNG_ORIGINAL) + // Creating an RNG engine object + uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)]; + uint64_t rng_offset = item_ct1.get_local_id(2) * threads; + RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset); + + // Creating a continuous RNG distribution object + RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr; + #endif + gpu_gradient_minAD_kernel( pMem_conformations_next, pMem_energies_next, item_ct1, dpct_local_acc_ct1.get_pointer(), @@ -501,7 +522,13 @@ void gpu_gradient_minAD( sFloatAccumulator_acc_ct1.get_pointer(), rho_acc_ct1.get_pointer(), cons_succ_acc_ct1.get_pointer(), - cons_fail_acc_ct1.get_pointer()); + cons_fail_acc_ct1.get_pointer() + #if !defined (RNG_ORIGINAL) + , + &rng_engine, + &rng_continuous_distr + #endif + ); }); }); /* diff --git a/dpcpp/kernel_adam.dp.cpp b/dpcpp/kernel_adam.dp.cpp index fe9c7b6a..674294f0 100644 --- a/dpcpp/kernel_adam.dp.cpp +++ b/dpcpp/kernel_adam.dp.cpp @@ -65,7 +65,13 @@ gpu_gradient_minAdam_kernel( GpuData cData, int *entity_id, float *best_energy, - float *sFloatAccumulator) + float *sFloatAccumulator + #if !defined (RNG_ORIGINAL) + , + RNG_ONEMKL_ENGINE_TYPE* rng_engine, + RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr + #endif +) // The GPU global function performs gradient-based minimization on (some) entities of conformations_next. // The number of OpenCL compute units (CU) which should be started equals to num_of_minEntities*num_of_runs. // This way the first num_of_lsentities entity of each population will be subjected to local search @@ -124,7 +130,11 @@ gpu_gradient_minAdam_kernel( // If entity 0 is not selected according to LS-rate, // choosing another entity if (100.0f * + #if defined (RNG_ORIGINAL) gpu_randf(cData.pMem_prng_states, item_ct1) > + #else + oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) > + #endif cData.dockpars.lsearch_rate) { *entity_id = cData.dockpars @@ -487,12 +497,29 @@ void gpu_gradient_minAdam( sycl::range<3>(1, 1, threads), sycl::range<3>(1, 1, threads)), [=](sycl::nd_item<3> item_ct1) { + + #if !defined (RNG_ORIGINAL) + // Creating an RNG engine object + uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)]; + uint64_t rng_offset = item_ct1.get_local_id(2) * threads; + RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset); + + // Creating a continuous RNG distribution object + RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr; + #endif + gpu_gradient_minAdam_kernel( pMem_conformations_next, pMem_energies_next, item_ct1, dpct_local_acc_ct1.get_pointer(), *cData_ptr_ct1, entity_id_acc_ct1.get_pointer(), best_energy_acc_ct1.get_pointer(), - sFloatAccumulator_acc_ct1.get_pointer()); + sFloatAccumulator_acc_ct1.get_pointer() + #if !defined (RNG_ORIGINAL) + , + &rng_engine, + &rng_continuous_distr + #endif + ); }); }); /* diff --git a/dpcpp/kernels.dp.cpp b/dpcpp/kernels.dp.cpp index 85d96758..22f4d181 100644 --- a/dpcpp/kernels.dp.cpp +++ b/dpcpp/kernels.dp.cpp @@ -31,6 +31,43 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "calcenergy.h" #include "GpuData.h" #include "dpcpp_migration.h" +#include "oneapi/mkl/rng/device.hpp" + +// Throwing error if two oneMKL RNG engine types is selected +// If no oneMKL RNG is selected, then using original adhoc/manual RNG + +//#define RNG_ONEMKL_MRG32K3A +//#define RNG_ONEMKL_PHILOX4X32X10 +//#define RNG_ONEMKL_MCG31M1 +//#define RNG_ONEMKL_MCG59 + +#if defined (RNG_ONEMKL_MRG32K3A) && (defined (RNG_ONEMKL_PHILOX4X32X10) || defined (RNG_ONEMKL_MCG31M1) || defined (RNG_ONEMKL_MCG59)) + #error "RNG_ONEMKL_MRG32K3A is defined. Do not define additional ONEMKL RNG engines!" +#elif defined (RNG_ONEMKL_PHILOX4X32X10) && (defined (RNG_ONEMKL_MRG32K3A) || defined (RNG_ONEMKL_MCG31M1) || defined (RNG_ONEMKL_MCG59)) + #error "RNG_ONEMKL_PHILOX4X32X10 is defined. Do not define additional ONEMKL RNG engines!" +#elif defined (RNG_ONEMKL_MCG31M1) && (defined (RNG_ONEMKL_MRG32K3A) || defined (RNG_ONEMKL_PHILOX4X32X10) || defined (RNG_ONEMKL_MCG59)) + #error "RNG_ONEMKL_MCG31M1 is defined. Do not define additional ONEMKL RNG engines!" +#elif defined (RNG_ONEMKL_MCG59) && (defined (RNG_ONEMKL_MRG32K3A) || defined (RNG_ONEMKL_PHILOX4X32X10) || defined (RNG_ONEMKL_MCG31M1)) + #error "RNG_ONEMKL_MCG59 is defined. Do not define additional ONEMKL RNG engines!" +#elif !defined (RNG_ONEMKL_MRG32K3A) && !defined (RNG_ONEMKL_PHILOX4X32X10) && !defined (RNG_ONEMKL_MCG31M1) && !defined (RNG_ONEMKL_MCG59) + #define RNG_ORIGINAL +#endif + +// Defining data type for selected oneMKL RNG engine type +#if defined (RNG_ONEMKL_MRG32K3A) + #define RNG_ONEMKL_ENGINE_TYPE oneapi::mkl::rng::device::mrg32k3a<16> +#elif defined (RNG_ONEMKL_PHILOX4X32X10) + #define RNG_ONEMKL_ENGINE_TYPE oneapi::mkl::rng::device::philox4x32x10<16> +#elif defined (RNG_ONEMKL_MCG31M1) + #define RNG_ONEMKL_ENGINE_TYPE oneapi::mkl::rng::device::mcg31m1<16> +#elif defined (RNG_ONEMKL_MCG59) + #define RNG_ONEMKL_ENGINE_TYPE oneapi::mkl::rng::device::mcg59<16> +#endif + +// Defining data type for oneMKL RNG distribution type +#if !defined (RNG_ORIGINAL) + #define RNG_ONEMKL_DISTRIBUTION_TYPE oneapi::mkl::rng::device::uniform +#endif inline uint64_t llitoulli(int64_t l) {