@@ -35,8 +35,10 @@ struct cpuGpu_offloadThreshold {
 template <typename T>
 class doGemm {
  public:
-  doGemm(const int iters, const int upperLimit)
+  doGemm(const int iters, const int startDim, const int upperLimit, const
+         bool cpuEnabled = true, const bool gpuEnabled = true)
       : iterations_(iters),
+        startDimention_(startDim),
         upperLimit_(upperLimit),
         doCPU_(cpuEnabled),
         doGPU_(gpuEnabled)
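
The constructor now takes a starting dimension plus runtime CPU/GPU enable flags in addition to the iteration count and upper limit. As a rough illustration only (not part of the commit), a driver could construct the class as below; the `doGemm.hh` include, the `collectData()` call, and the numeric values are assumptions for the sketch:

    // Illustrative driver sketch: sweep problem sizes from startDim to
    // upperLimit with GPU kernels disabled at runtime. Names/values hypothetical.
    #include "doGemm.hh"

    int main() {
      const int iterations = 10;    // assumed repetitions per problem size
      const int startDim = 1;       // new parameter: first dimension to benchmark
      const int upperLimit = 1024;  // largest dimension to benchmark
      doGemm<float> gemm(iterations, startDim, upperLimit,
                         /*cpuEnabled=*/true, /*gpuEnabled=*/false);
      gemm.collectData();           // assumed entry point that runs all sweeps
      return 0;
    }
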
@@ -68,12 +70,9 @@ class doGemm {
                     "_square_square_M=N=K.csv");
     for (int dim = startDimention_; dim <= upperLimit_; dim++) {
       // M = dim, N = dim, K = dim;
-      callKernels(csvFile, dim, dim, dim);
-    std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
-                                        getKernelName() + "_square.csv");
-    for (int dim = 1; dim <= upperLimit_; dim++) {
-      const int M = dim, N = dim, K = dim;
-      callDenseKernels(csvFile, M, N, K);
+      callDenseKernels(csvFile, dim, dim, dim);
+      std::ofstream csvFile = initCSVFile(std::string(CSV_DIR) + "/" +
+                                          getKernelName() + "_square.csv");
     }
     // Close file
     csvFile.close();
@@ -92,15 +91,11 @@ class doGemm {
     cpuGpu_unified_ = cpuGpu_offloadThreshold();
     csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() +
                           "_rectangular_16MxK.csv");
-    for (int dim = 16; dim <= upperLimit_; dim += 16) {
-      const int M = dim, N = dim, K = (dim / 16);
-      callDenseKernels(csvFile, M, N, K);
-                          "_tall-thin_short-wide_M=N_M=16K.csv");
     int K = startDimention_;
     int M = 16 * K;
     int N = 16 * K;
     while (M <= upperLimit_) {
-      callKernels(csvFile, M, N, K);
+      callDenseKernels(csvFile, M, N, K);
       M += 16;
       N += 16;
       K++;
@@ -123,11 +118,7 @@ class doGemm {
                           "_rectangular_Mx32.csv");
     if (upperLimit_ >= 32) {
       for (int dim = 1; dim <= upperLimit_; dim++) {
-        const int M = dim, N = dim, K = 32;
-        callDenseKernels(csvFile, M, N, K);
-      for (int dim = startDimention_; dim <= upperLimit_; dim++) {
-        // M = dim, N = dim, K = 32;
-        callKernels(csvFile, dim, dim, 32);
+        callDenseKernels(csvFile, dim, dim, 32);
       }
     }
     // Close file
@@ -150,7 +141,7 @@ class doGemm {
     N = startDimention_;
     K = 16 * M;
     while (K <= upperLimit_) {
-      callKernels(csvFile, M, N, K);
+      callDenseKernels(csvFile, M, N, K);
       M++;
       N++;
       K += 16;
@@ -174,7 +165,7 @@ class doGemm {
     if (upperLimit_ >= 32) {
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
         // M = 32, N = 32, K = dim;
-        callKernels(csvFile, 32, 32, dim);
+        callDenseKernels(csvFile, 32, 32, dim);
       }
     }
     // Close file
@@ -193,15 +184,8 @@ class doGemm {
     cpuGpu_unified_ = cpuGpu_offloadThreshold();
     csvFile = initCSVFile(std::string(CSV_DIR) + "/" + getKernelName() +
                           "_rectangular_Mx16K.csv");
-    for (int dim = 16; dim <= upperLimit_; dim += 16) {
-      const int M = (dim / 16), N = (dim / 16), K = dim;
-      callDenseKernels(csvFile, M, N, K);
-                          "_tall-thin_square_K=N_M=16K.csv");
-    K = startDimention_;
-    N = startDimention_;
-    M = 16 * K;
     while (M <= upperLimit_) {
-      callKernels(csvFile, M, N, K);
+      callDenseKernels(csvFile, M, N, K);
       M += 16;
       N++;
       K++;
@@ -225,7 +209,7 @@ class doGemm {
     if (upperLimit_ >= 32) {
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
         // M = dim, N = 32, K = 32;
-        callKernels(csvFile, dim, 32, 32);
+        callDenseKernels(csvFile, dim, 32, 32);
       }
     }
     // Close file
@@ -248,19 +232,19 @@ class doGemm {
     K = startDimention_;
     N = 16 * K;
     while (N <= upperLimit_) {
-      callKernels(csvFile, M, N, K);
+      callDenseKernels(csvFile, M, N, K);
       M++;
       N += 16;
       K++;
-    for (int dim = 1; dim <= upperLimit_; dim++) {
-      const int M = 32, N = 32, K = dim;
-      callDenseKernels(csvFile, M, N, K);
-    }
     }
     // Close file
     csvFile.close();
-    // Print offload results to stdout
-    printOffloadThreshold("Short and Wide (32 x K)");
+#if CPU_ENABLED && GPU_ENABLED
+    if (doCPU_ && doGPU_) {
+      // Print offload results to stdout
+      printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)");
+    }
+#endif
 
     // Square sparse matrix - sparse matrix multiplication
     cpuGpu_always_ = cpuGpu_offloadThreshold();
@@ -270,19 +254,15 @@ class doGemm {
                           "_sparse_square.csv");
     if (upperLimit_ >= 32) {
       for (int dim = 1; dim <= upperLimit_; dim++) {
-        const int N = dim;
-        callSparseKernels(csvFile, N, 0.99);
+        callSparseKernels(csvFile, dim, 0.99);
       }
     }
     // Close file
     csvFile.close();
-    // Print offload results to stdout
-    printOffloadThreshold("Sparse Square");
-
 #if CPU_ENABLED && GPU_ENABLED
-    if (doCPU_ && doGPU_) {
+    if (doCPU_ && doGPU_) {
       // Print offload results to stdout
-      printOffloadThreshold("Square x Short-and-Wide (M=K, N=16K)");
+      printOffloadThreshold("Sparse Square");
     }
 #endif
 
@@ -296,7 +276,7 @@ class doGemm {
     if (upperLimit_ >= 32) {
       for (int dim = startDimention_; dim <= upperLimit_; dim++) {
         // M = 32, N = dim, K = 32;
-        callKernels(csvFile, 32, dim, 32);
+        callDenseKernels(csvFile, 32, dim, 32);
       }
     }
     // Close file
@@ -501,14 +481,20 @@ class doGemm {
     const uint64_t flops = calcFlops(N, N, N);
     std::string kernelName = getKernelName();
 
-    spGemmCpu_.initialise(N, sparsity);
-    time_checksum_gflop cpuResult = spGemmCpu_.compute();
-    cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
-
-    // Perform the GPU kernels
-
+#if CPU_ENABLED
+    if (doCPU_) {
+      spGemmCpu_.initialise(N, sparsity);
+      time_checksum_gflop cpuResult = spGemmCpu_.compute();
+      cpuResult.gflops = calcGflops(flops, iterations_, cpuResult.runtime);
+      writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
+                     cpuResult.runtime, cpuResult.gflops);
+    }
+#endif
+#if GPU_ENABLED
+    // Perform the GPU kernels
     // - UNIFIED : data passed from host to device (and device to host) as
     //   needed
+    if (doGPU_) {
     spGemmGpu_.initialise(gpuOffloadType::unified, N, sparsity);
     time_checksum_gflop gpuResult_unified = spGemmGpu_.compute();
     gpuResult_unified.gflops =
@@ -525,13 +511,9 @@ class doGemm {
     time_checksum_gflop gpuResult_once = spGemmGpu_.compute();
     gpuResult_once.gflops =
         calcGflops(flops, iterations_, gpuResult_once.runtime);
-
-
     // ToDo -- non-default GPU operations
 
     // Write lines to CSV file
-    writeLineToCsv(csvFile, "cpu", kernelName, N, N, N, probSize, iterations_,
-                   cpuResult.runtime, cpuResult.gflops);
     writeLineToCsv(csvFile, "gpu_offloadOnce", kernelName, N, N, N, probSize,
                    iterations_, gpuResult_once.runtime, gpuResult_once.gflops);
     writeLineToCsv(csvFile, "gpu_offloadAlways", kernelName, N, N, N, probSize,
@@ -540,6 +522,10 @@ class doGemm {
     writeLineToCsv(csvFile, "gpu_unified", kernelName, N, N, N, probSize,
                    iterations_, gpuResult_unified.runtime,
                    gpuResult_unified.gflops);
+
+    }
+#endif
+
   }
 
   /** A function for calculating FLOPs performed by a GEMM.
@@ -569,7 +555,7 @@ class doGemm {
   }
 
   /** Print to stdout the offload thresholds. */
-  void printOffloadThreshold(std::string problemName) const {
+  void printOffloadThreshold(const std::string& problemName) const {
     std::vector<std::string> header = {
         "Device", "M", "N", "K", "Total Prob. Size (KiB)",
         "GFLOP/s", "CPU GFLOP/s"};
@@ -663,16 +649,14 @@ class doGemm {
 #if CPU_ENABLED
   /** The GEMM CPU kernel. */
   cpu::gemm_cpu<T> gemmCpu_;
+  cpu::sp_gemm_cpu<T> spGemmCpu_;
 #endif
 
-  cpu::sp_gemm_cpu<T> spGemmCpu_;
-
 #if GPU_ENABLED
   /** The GEMM GPU kernel. */
   gpu::gemm_gpu<T> gemmGpu_;
-#endif
-
   gpu::sp_gemm_gpu<T> spGemmGpu_;
+#endif
 
   /** The point at which offloading to GPU (offload once) becomes worthwhile. */
   cpuGpu_offloadThreshold cpuGpu_once_;
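
A pattern that recurs throughout this change pairs a compile-time switch (`CPU_ENABLED`, `GPU_ENABLED`) with a runtime flag (`doCPU_`, `doGPU_`): a kernel path is only compiled in when its macro is set and only executed when its flag is true, while offload-threshold reporting requires both paths. The stand-alone sketch below mimics that structure with hypothetical names (`runner`, `run`); it is not code from the benchmark:

    #include <iostream>

    // Stand-ins for the benchmark's compile-time switches; both enabled here so
    // the example is self-contained and compiles on its own.
    #define CPU_ENABLED 1
    #define GPU_ENABLED 1

    struct runner {
      bool doCPU_ = true;   // runtime flag, analogous to doGemm's doCPU_
      bool doGPU_ = false;  // runtime flag, analogous to doGemm's doGPU_

      void run() const {
    #if CPU_ENABLED
        if (doCPU_) std::cout << "CPU kernel would run here\n";
    #endif
    #if GPU_ENABLED
        if (doGPU_) std::cout << "GPU kernel would run here\n";
    #endif
    #if CPU_ENABLED && GPU_ENABLED
        // Comparing CPU and GPU results only makes sense if both paths ran.
        if (doCPU_ && doGPU_) std::cout << "print offload threshold here\n";
    #endif
      }
    };

    int main() {
      runner r;
      r.run();  // with the defaults above, only the CPU line is printed
      return 0;
    }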