emascarenhas · L30nardoSV · Oct 12, 2022 · Oct 14, 2022 · Oct 14, 2022 · Oct 14, 2022
diff --git a/dpcpp/kernel3.dp.cpp b/dpcpp/kernel3.dp.cpp
@@ -51,7 +51,13 @@ gpu_perform_LS_kernel(
                       int *evaluation_cnt,
                       float *offspring_energy,
                       float *sFloatAccumulator,
-                      int *entity_id)
+                      int *entity_id
+						#if !defined (RNG_ORIGINAL)
+						,
+						RNG_ONEMKL_ENGINE_TYPE* rng_engine,
+						RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr
+						#endif
+)
 // The GPU global function performs local search on the pre-defined entities of conformations_next.
 // The number of blocks which should be started equals to num_of_lsentities*num_of_runs.
 // This way the first num_of_lsentities entity of each population will be subjected to local search
@@ -85,8 +91,11 @@ gpu_perform_LS_kernel(
                         // If entity 0 is not selected according to LS-rate,
 			// choosing an other entity
                         if (100.0f *
-                                gpu_randf(cData.pMem_prng_states, item_ct1) >
-                            cData.dockpars.lsearch_rate) {
+								#if defined (RNG_ORIGINAL)
+                                gpu_randf(cData.pMem_prng_states, item_ct1) > cData.dockpars.lsearch_rate) {
+								#else
+								oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) > cData.dockpars.lsearch_rate) {
+								#endif
                                 *entity_id = cData.dockpars.num_of_lsentities;
                         }
 		}
@@ -138,11 +147,13 @@ gpu_perform_LS_kernel(
 #ifdef SWAT3
                         genotype_deviate[gene_counter] =
                             *rho *
-                            (2.0f *
-                                 gpu_randf(cData.pMem_prng_states, item_ct1) -
-                             1.0f) *
-                            (gpu_randf(cData.pMem_prng_states, item_ct1) <
-                             gene_scale);
+							#if defined (RNG_ORIGINAL)
+                            (2.0f * gpu_randf(cData.pMem_prng_states, item_ct1) - 1.0f) *
+                            (gpu_randf(cData.pMem_prng_states, item_ct1) < gene_scale);
+							#else
+                            (2.0f * oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) - 1.0f) *
+                            (oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) < gene_scale);
+							#endif
 
                         // Translation genes
 			if (gene_counter < 3) {
@@ -157,7 +168,11 @@ gpu_perform_LS_kernel(
 				}
 			}
 #else
+			#if defined (RNG_ORIGINAL)
 			genotype_deviate[gene_counter] = rho*(2.0f*gpu_randf(cData.pMem_prng_states)-1.0f)*(gpu_randf(cData.pMem_prng_states)<0.3f);
+			#else
+			genotype_deviate[gene_counter] = rho*(2.0f*oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine)-1.0f)*(oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine)<0.3f);
+			#endif
 
 			// Translation genes
 			if (gene_counter < 3) {
@@ -419,6 +434,17 @@ void gpu_perform_LS(
                                           sycl::range<3>(1, 1, threads),
                                       sycl::range<3>(1, 1, threads)),
                     [=](sycl::nd_item<3> item_ct1) {
+
+							#if !defined (RNG_ORIGINAL)
+							// Creating an RNG engine object
+							uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)];
+							uint64_t rng_offset = item_ct1.get_local_id(2) * threads;
+							RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset);
+
+							// Creating a continuous RNG distribution object
+							RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr;
+							#endif
+
                             gpu_perform_LS_kernel(
                                 pMem_conformations_next, pMem_energies_next,
                                 item_ct1, dpct_local_acc_ct1.get_pointer(),
@@ -429,7 +455,13 @@ void gpu_perform_LS(
                                 evaluation_cnt_acc_ct1.get_pointer(),
                                 offspring_energy_acc_ct1.get_pointer(),
                                 sFloatAccumulator_acc_ct1.get_pointer(),
-                                entity_id_acc_ct1.get_pointer());
+                                entity_id_acc_ct1.get_pointer()
+								#if !defined (RNG_ORIGINAL)
+								,
+								&rng_engine,
+								&rng_continuous_distr
+								#endif
+								);
                     });
         });
         /*

diff --git a/dpcpp/kernel4.dp.cpp b/dpcpp/kernel4.dp.cpp
@@ -33,14 +33,15 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 
 #ifdef DOCK_TRACE
 #ifdef __SYCL_DEVICE_ONLY__
-          #define CONSTANT __attribute__((opencl_constant))
+          #define CL_CONSTANT __attribute__((opencl_constant))
 #else
-          #define CONSTANT
+          #define CL_CONSTANT
 #endif
-static const CONSTANT char FMT1[] = "DOCK_TRACE: %s globalID: %6d %20s %10.6f %20s %10.6f";
+#define PRINTF(format, ...) { \
+            static const CL_CONSTANT char _format[] = format; \
+            sycl::ext::oneapi::experimental::printf(_format, ## __VA_ARGS__); }
 #endif
 
-
 void
 
 gpu_gen_and_eval_newpops_kernel(
@@ -60,7 +61,13 @@ gpu_gen_and_eval_newpops_kernel(
                                 float *sBestEnergy,
                                 int *sBestID,
                                 sycl::float3 *calc_coords,
-                                float *sFloatAccumulator)
+                                float *sFloatAccumulator
+								#if !defined (RNG_ORIGINAL)
+								,
+								RNG_ONEMKL_ENGINE_TYPE* rng_engine,
+								RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr
+								#endif
+								)
 // The GPU global function
 {
 
@@ -168,17 +175,24 @@ gpu_gen_and_eval_newpops_kernel(
                      gene_counter < 10;
                      gene_counter += item_ct1.get_local_range().get(2))
                 {
+						#if defined (RNG_ORIGINAL)
                         randnums[gene_counter] = gpu_randf(cData.pMem_prng_states, item_ct1);
+						#else
+						randnums[gene_counter] = oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine);
+						#endif
                 }
+
 #if 0
-		if ((threadIdx.x == 0) && (blockIdx.x == 1))
-		{
-			printf("%06d ", blockIdx.x);
-			for (int i = 0; i < 10; i++)
-				printf("%12.6f ", randnums[i]);
-			printf("\n");
+		item_ct1.barrier(SYCL_MEMORY_SPACE);
+		if ( (item_ct1.get_group(2) == 1) && (item_ct1.get_local_id(2) == 0) ) {
+			PRINTF("\nLocal work-item id: %d\n", item_ct1.get_local_id(2));
+			for (uint32_t j = 0; j < 10; j++) {
+				PRINTF("randnums[%d]=%2.4f\n", j, randnums[j]);
+			}
 		}
+		item_ct1.barrier(SYCL_MEMORY_SPACE);
 #endif
+
 		// Determining run ID
                 run_id = item_ct1.get_group(2) / cData.dockpars.pop_size;
                 /*
@@ -337,27 +351,31 @@ gpu_gen_and_eval_newpops_kernel(
                 {
 			// Notice: dockpars_mutation_rate was scaled down to [0,1] in host
 			// to reduce number of operations in device
-                        if (/*100.0f**/ gpu_randf(cData.pMem_prng_states,
-                                                  item_ct1) <
-                            cData.dockpars.mutation_rate)
+						#if defined (RNG_ORIGINAL)
+                        if (/*100.0f**/ gpu_randf(cData.pMem_prng_states, item_ct1) < cData.dockpars.mutation_rate)
+						#else
+						if (oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) < cData.dockpars.mutation_rate)
+						#endif
                         {
 				// Translation genes
 				if (gene_counter < 3) {
                                         offspring_genotype[gene_counter] +=
                                             cData.dockpars.abs_max_dmov *
-                                            (2.0f * gpu_randf(
-                                                        cData.pMem_prng_states,
-                                                        item_ct1) -
-                                             1.0f);
+											#if defined (RNG_ORIGINAL)
+                                            (2.0f * gpu_randf(cData.pMem_prng_states, item_ct1) - 1.0f);
+											#else
+											(2.0f * oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) - 1.0f);
+											#endif
                                 }
 				// Orientation and torsion genes
 				else {
                                         offspring_genotype[gene_counter] +=
                                             cData.dockpars.abs_max_dang *
-                                            (2.0f * gpu_randf(
-                                                        cData.pMem_prng_states,
-                                                        item_ct1) -
-                                             1.0f);
+											#if defined (RNG_ORIGINAL)
+                                            (2.0f * gpu_randf(cData.pMem_prng_states, item_ct1) - 1.0f);
+											#else
+											(2.0f * oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) - 1.0f);
+											#endif
                                         map_angle(offspring_genotype[gene_counter]);
 				}
 
@@ -467,6 +485,17 @@ void gpu_gen_and_eval_newpops(
                                       sycl::range<3>(1, 1, threadsPerBlock)),
                     [=](sycl::nd_item<3> item_ct1) 
                     [[intel::reqd_sub_group_size(32)]] {
+
+							#if !defined (RNG_ORIGINAL)
+							// Creating an RNG engine object
+							uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)];
+							uint64_t rng_offset = item_ct1.get_local_id(2) * threadsPerBlock;
+							RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset);
+
+							// Creating a continuous RNG distribution object
+							RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr;
+							#endif
+
                             gpu_gen_and_eval_newpops_kernel(
                                 pMem_conformations_current,
                                 pMem_energies_current, pMem_conformations_next,
@@ -480,7 +509,13 @@ void gpu_gen_and_eval_newpops(
                                 sBestEnergy_acc_ct1.get_pointer(),
                                 sBestID_acc_ct1.get_pointer(),
                                 calc_coords_acc_ct1.get_pointer(),
-                                sFloatAccumulator_acc_ct1.get_pointer());
+                                sFloatAccumulator_acc_ct1.get_pointer()
+								#if !defined (RNG_ORIGINAL)
+								,
+								&rng_engine,
+								&rng_continuous_distr
+								#endif
+								);
                     });
         });
         /*

diff --git a/dpcpp/kernel_ad.dp.cpp b/dpcpp/kernel_ad.dp.cpp
@@ -75,7 +75,13 @@ gpu_gradient_minAD_kernel(
                           float *sFloatAccumulator,
                           float *rho,
                           int *cons_succ,
-                          int *cons_fail)
+                          int *cons_fail
+						#if !defined (RNG_ORIGINAL)
+						,
+						RNG_ONEMKL_ENGINE_TYPE* rng_engine,
+						RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr
+						#endif
+)
 // The GPU global function performs gradient-based minimization on (some) entities of conformations_next.
 // The number of OpenCL compute units (CU) which should be started equals to num_of_minEntities*num_of_runs.
 // This way the first num_of_lsentities entity of each population will be subjected to local search
@@ -133,7 +139,11 @@ gpu_gradient_minAD_kernel(
                         // If entity 0 is not selected according to LS-rate,
 			// choosing another entity
                         if (100.0f *
+								#if defined (RNG_ORIGINAL)
                                 gpu_randf(cData.pMem_prng_states, item_ct1) >
+								#else
+								oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) >
+								#endif
                             cData.dockpars.lsearch_rate) {
                                 *entity_id =
                                     cData.dockpars
@@ -493,6 +503,17 @@ void gpu_gradient_minAD(
                                           sycl::range<3>(1, 1, threads),
                                       sycl::range<3>(1, 1, threads)),
                     [=](sycl::nd_item<3> item_ct1) {
+
+							#if !defined (RNG_ORIGINAL)
+							// Creating an RNG engine object
+							uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)];
+							uint64_t rng_offset = item_ct1.get_local_id(2) * threads;
+							RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset);
+
+							// Creating a continuous RNG distribution object
+							RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr;
+							#endif
+
                             gpu_gradient_minAD_kernel(
                                 pMem_conformations_next, pMem_energies_next,
                                 item_ct1, dpct_local_acc_ct1.get_pointer(),
@@ -501,7 +522,13 @@ void gpu_gradient_minAD(
                                 sFloatAccumulator_acc_ct1.get_pointer(),
                                 rho_acc_ct1.get_pointer(),
                                 cons_succ_acc_ct1.get_pointer(),
-                                cons_fail_acc_ct1.get_pointer());
+                                cons_fail_acc_ct1.get_pointer()
+								#if !defined (RNG_ORIGINAL)
+								,
+								&rng_engine,
+								&rng_continuous_distr
+								#endif
+								);
                     });
         });
         /*

diff --git a/dpcpp/kernel_adam.dp.cpp b/dpcpp/kernel_adam.dp.cpp
@@ -65,7 +65,13 @@ gpu_gradient_minAdam_kernel(
                             GpuData cData,
                             int *entity_id,
                             float *best_energy,
-                            float *sFloatAccumulator)
+                            float *sFloatAccumulator
+						#if !defined (RNG_ORIGINAL)
+						,
+						RNG_ONEMKL_ENGINE_TYPE* rng_engine,
+						RNG_ONEMKL_DISTRIBUTION_TYPE* rng_continuous_distr
+						#endif
+)
 // The GPU global function performs gradient-based minimization on (some) entities of conformations_next.
 // The number of OpenCL compute units (CU) which should be started equals to num_of_minEntities*num_of_runs.
 // This way the first num_of_lsentities entity of each population will be subjected to local search
@@ -124,7 +130,11 @@ gpu_gradient_minAdam_kernel(
                         // If entity 0 is not selected according to LS-rate,
 			// choosing another entity
                         if (100.0f *
+								#if defined (RNG_ORIGINAL)
                                 gpu_randf(cData.pMem_prng_states, item_ct1) >
+								#else
+								oneapi::mkl::rng::device::generate_single(*rng_continuous_distr, *rng_engine) >
+								#endif
                             cData.dockpars.lsearch_rate) {
                                 *entity_id =
                                     cData.dockpars
@@ -487,12 +497,29 @@ void gpu_gradient_minAdam(
                                           sycl::range<3>(1, 1, threads),
                                       sycl::range<3>(1, 1, threads)),
                     [=](sycl::nd_item<3> item_ct1) {
+
+							#if !defined (RNG_ORIGINAL)
+							// Creating an RNG engine object
+							uint64_t rng_seed = cData_ptr_ct1->pMem_prng_states[item_ct1.get_global_id(2)];
+							uint64_t rng_offset = item_ct1.get_local_id(2) * threads;
+							RNG_ONEMKL_ENGINE_TYPE rng_engine(rng_seed, rng_offset);
+
+							// Creating a continuous RNG distribution object
+							RNG_ONEMKL_DISTRIBUTION_TYPE rng_continuous_distr;
+							#endif
+
                             gpu_gradient_minAdam_kernel(
                                 pMem_conformations_next, pMem_energies_next,
                                 item_ct1, dpct_local_acc_ct1.get_pointer(),
                                 *cData_ptr_ct1, entity_id_acc_ct1.get_pointer(),
                                 best_energy_acc_ct1.get_pointer(),
-                                sFloatAccumulator_acc_ct1.get_pointer());
+                                sFloatAccumulator_acc_ct1.get_pointer()
+								#if !defined (RNG_ORIGINAL)
+								,
+								&rng_engine,
+								&rng_continuous_distr
+								#endif
+								);
                     });
         });
         /*

diff --git a/dpcpp/kernels.dp.cpp b/dpcpp/kernels.dp.cpp
@@ -31,6 +31,43 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "calcenergy.h"
 #include "GpuData.h"
 #include "dpcpp_migration.h"
+#include "oneapi/mkl/rng/device.hpp"
+
+// Throwing error if two oneMKL RNG engine types is selected
+// If no oneMKL RNG is selected, then using original adhoc/manual RNG
+
+//#define RNG_ONEMKL_MRG32K3A
+//#define RNG_ONEMKL_PHILOX4X32X10
+//#define RNG_ONEMKL_MCG31M1
+//#define RNG_ONEMKL_MCG59
+
+#if defined (RNG_ONEMKL_MRG32K3A) && (defined (RNG_ONEMKL_PHILOX4X32X10) || defined (RNG_ONEMKL_MCG31M1) || defined (RNG_ONEMKL_MCG59))
+	#error "RNG_ONEMKL_MRG32K3A is defined. Do not define additional ONEMKL RNG engines!"
+#elif defined (RNG_ONEMKL_PHILOX4X32X10) && (defined (RNG_ONEMKL_MRG32K3A) || defined (RNG_ONEMKL_MCG31M1) || defined (RNG_ONEMKL_MCG59))
+	#error "RNG_ONEMKL_PHILOX4X32X10 is defined. Do not define additional ONEMKL RNG engines!"
+#elif defined (RNG_ONEMKL_MCG31M1) && (defined (RNG_ONEMKL_MRG32K3A) || defined (RNG_ONEMKL_PHILOX4X32X10) || defined (RNG_ONEMKL_MCG59))
+	#error "RNG_ONEMKL_MCG31M1 is defined. Do not define additional ONEMKL RNG engines!"
+#elif defined (RNG_ONEMKL_MCG59) && (defined (RNG_ONEMKL_MRG32K3A) || defined (RNG_ONEMKL_PHILOX4X32X10) || defined (RNG_ONEMKL_MCG31M1))
+	#error "RNG_ONEMKL_MCG59 is defined. Do not define additional ONEMKL RNG engines!"
+#elif !defined (RNG_ONEMKL_MRG32K3A) && !defined (RNG_ONEMKL_PHILOX4X32X10) && !defined (RNG_ONEMKL_MCG31M1) && !defined (RNG_ONEMKL_MCG59)
+	#define RNG_ORIGINAL
+#endif
+
+// Defining data type for selected oneMKL RNG engine type
+#if defined (RNG_ONEMKL_MRG32K3A)
+	#define RNG_ONEMKL_ENGINE_TYPE	oneapi::mkl::rng::device::mrg32k3a<16>
+#elif defined (RNG_ONEMKL_PHILOX4X32X10)
+	#define RNG_ONEMKL_ENGINE_TYPE	oneapi::mkl::rng::device::philox4x32x10<16>
+#elif defined (RNG_ONEMKL_MCG31M1)
+	#define RNG_ONEMKL_ENGINE_TYPE oneapi::mkl::rng::device::mcg31m1<16>
+#elif defined (RNG_ONEMKL_MCG59)
+	#define RNG_ONEMKL_ENGINE_TYPE oneapi::mkl::rng::device::mcg59<16>
+#endif
+
+// Defining data type for oneMKL RNG distribution type
+#if !defined (RNG_ORIGINAL)
+	#define RNG_ONEMKL_DISTRIBUTION_TYPE oneapi::mkl::rng::device::uniform<float>
+#endif
 
 inline uint64_t llitoulli(int64_t l)
 {