llnl · rhornung67 · Dec 19, 2025 · Dec 17, 2025 · Dec 17, 2025 · Dec 18, 2025
diff --git a/docs/sphinx/user_guide/run.rst b/docs/sphinx/user_guide/run.rst
@@ -51,29 +51,12 @@ kernel.
 Running the Suite
 ==================
 
-After compilation, the main executable will reside in the ``bin`` subdirectory 
-of the build space. The executable will be able to run all kernels and 
-variants that have been built depending on which CMake options were specified
-to configure the build.
+After compilation, the RAJA Performance Suite executable will reside in the
+``bin`` subdirectory of the build space. The executable will be able to run
+all kernels and variants that have been built depending on which CMake options
+were specified to configure the build.
 
-To run the Suite in its default mode, type the executable name with no 
-command-line arguments::
-
-  $ ./bin/raja-perf.exe
-
-This will run all kernels and variants that have been built in their default
-configurations. Information describing how the Suite will run along with
-some information about each kernel will appear on the screen. More information
-about kernel and execution details will also appear in a run report files 
-generated in the run directory after Suite execution completes. 
-
-.. note:: You can pass the ``--dryrun`` command-line option to the executable
-          to see a summary of how the Suite will execute, by showing default
-          run parameters, without actually running it. You can also pass 
-          other command-line options when doing a "dry run" and you will see
-          that the given options are represented in the screen output.
-
-The Suite can be run in a variety of ways determined by the command-line 
+The Suite can be run in many different ways chosen by the command-line 
 options passed to the executable. For example, you can run or exclude subsets 
 of kernels, variants, or groups. You can also pass options to set problem 
 sizes, number of times each kernel is run (sampled), and many other run 
@@ -95,6 +78,27 @@ or::
 .. note:: To see all available Suite execution options, pass the `--help` or 
           `-h` option to the executable.
 
+.. important:: We do not describe most of the Suite execution options in this
+               guide since the runtime help output is the main reference for
+               available options, defaults, and arguments they accept.
+
+To run the Suite in its default mode, type the executable name with no 
+command-line arguments::
+
+  $ ./bin/raja-perf.exe
+
+This will run all kernels and variants that have been built in their default
+configurations. Information describing how the Suite will run along with
+some information about each kernel will appear on the screen. More information
+about kernel and execution details will also appear in run report files
+generated in the run directory after Suite execution completes. 
+
+.. note:: You can pass the ``--dryrun`` command-line option to the executable
+          to see a summary of how the Suite will execute, by showing run
+          parameters, without actually running it. You can pass any other
+          command-line options when doing a "dry run" and you will see
+          that the given options are represented in the screen output.
+
 Lastly, the program will report specific errors if given incorrect input, such
 as an option that requires a value and no value is provided. It will also emit 
 a summary of command-line arguments it was given if the input contains 
@@ -116,7 +120,8 @@ will report the following in the screen output::
     See run parameters or option messages above.
 
 The output indicates that the kernel input is invalid because the string Foo
-is not the name of a kernel in the Suite, while DAXPY is the name of a kernel. 
+is not the name of a kernel in the Suite, while DAXPY is the name of a kernel
+in the Suite.
 
 .. note:: The Suite executable will attempt to provide helpful information
           if it is given incorrect input, such as command-line arguments that 
@@ -130,8 +135,18 @@ is not the name of a kernel in the Suite, while DAXPY is the name of a kernel.
 Running with MPI
 ==================
 
-Running the Suite with MPI is just like running any other MPI application.
-For example, issuing the following command on a machine with slurm scheduling::
+The Suite can be configured and compiled to run in a distributed memory
+parallel mode using MPI. Running the Suite on multiple MPI ranks will execute
+the same code for each kernel on each rank with minimal synchronization points
+to gather execution timing data from all ranks. This capability is provided so
+that individual kernel performance more closely aligns with how such kernels 
+would perform in a real application. For example, compute node memory bandwidth
+may be different when running on a many-core system using OpenMP multithreading
+to exercise all cores than when each core is mapped to an MPI rank.
+
+Running the Suite on multiple MPI ranks is just like running any other MPI
+application. For example, issuing the following command on a machine with
+slurm scheduling::
 
   $ srun -n 2 ./bin/raja-perf.exe
 

diff --git a/src/common/Executor.cpp b/src/common/Executor.cpp
@@ -19,7 +19,7 @@
 #include "CudaDataUtils.hpp"
 #include "HipDataUtils.hpp"
 
-// Warmup kernels to run first to help reduce startup overheads in timings
+// Warmup kernels for default warmup mode
 #include "basic/DAXPY.hpp"
 #include "basic/REDUCE3_INT.hpp"
 #include "basic/INDEXLIST_3LOOP.hpp"
@@ -754,7 +754,9 @@ void Executor::runKernel(KernelBase* kernel, bool print_kernel_name)
 
 void Executor::runWarmupKernels()
 {
-  if ( run_params.getDisableWarmup() ) {
+  RunParams::WarmupMode warmup_mode = run_params.getWarmupMode();
+
+  if ( warmup_mode == RunParams::WarmupMode::Disable ) {
     return;
   } 
 
@@ -763,16 +765,28 @@ void Executor::runWarmupKernels()
   //
   // Get warmup kernels to run from input
   //
-  std::set<KernelID> kernel_ids = run_params.getWarmupKernelIDsToRun();
+  std::set<KernelID> warmup_kernel_ids;
+
+  if ( warmup_mode == RunParams::WarmupMode::Explicit ) {
 
-  if ( kernel_ids.empty() ) {
+    warmup_kernel_ids = run_params.getSpecifiedWarmupKernelIDs();
+
+  } else if ( warmup_mode == RunParams::WarmupMode::PerfRunSame ) {
 
     //
-    // If no warmup kernels were given, choose a warmup kernel for each feature
+    // Warmup kernels will be same as kernels specified to run in the suite
     //
+    for (size_t ik = 0; ik < kernels.size(); ++ik) {
+      KernelBase* kernel = kernels[ik];
+      warmup_kernel_ids.insert( kernel->getKernelID() );
+    } // iterate over kernels to run
+
+  } else if ( warmup_mode == RunParams::WarmupMode::Default ) {
 
     //
-    // For kernels to be run, assemble a set of feature IDs
+    // No warmup kernel input given; choose a warmup kernel for each feature
+    //
+    // First, assemble a set of feature IDs
     //
     std::set<FeatureID> feature_ids;
     for (size_t ik = 0; ik < kernels.size(); ++ik) {
@@ -788,7 +802,7 @@ void Executor::runWarmupKernels()
     } // iterate over kernels
 
     //
-    // Map feature IDs to set of warmup kernel IDs
+    // Map feature IDs to a rudimentary set of warmup kernel IDs
     //
     for ( auto fid = feature_ids.begin(); fid != feature_ids.end(); ++ fid ) {
 
@@ -797,29 +811,29 @@ void Executor::runWarmupKernels()
         case Forall:
         case Kernel:
         case Launch:
-          kernel_ids.insert(Basic_DAXPY); break;
+          warmup_kernel_ids.insert(Basic_DAXPY); break;
 
         case Sort:
-          kernel_ids.insert(Algorithm_SORT); break;
+          warmup_kernel_ids.insert(Algorithm_SORT); break;
 
         case Scan:
-          kernel_ids.insert(Basic_INDEXLIST_3LOOP); break;
+          warmup_kernel_ids.insert(Basic_INDEXLIST_3LOOP); break;
 
         case Workgroup:
-          kernel_ids.insert(Comm_HALO_PACKING_FUSED); break;
+          warmup_kernel_ids.insert(Comm_HALO_PACKING_FUSED); break;
 
         case Reduction:
-          kernel_ids.insert(Basic_REDUCE3_INT); break;
+          warmup_kernel_ids.insert(Basic_REDUCE3_INT); break;
 
         case Atomic:
-          kernel_ids.insert(Basic_PI_ATOMIC); break;
+          warmup_kernel_ids.insert(Basic_PI_ATOMIC); break;
 
         case View:
           break;
 
   #ifdef RAJA_PERFSUITE_ENABLE_MPI
         case MPI:
-          kernel_ids.insert(Comm_HALO_EXCHANGE_FUSED); break;
+          warmup_kernel_ids.insert(Comm_HALO_EXCHANGE_FUSED); break;
   #endif
 
         default:
@@ -835,7 +849,15 @@ void Executor::runWarmupKernels()
   //
   // Run warmup kernels
   //
-  for ( auto kid = kernel_ids.begin(); kid != kernel_ids.end(); ++ kid ) {
+  bool prev_state = KernelBase::setWarmupRun(true);
+
+  for ( auto kid = warmup_kernel_ids.begin();
+             kid != warmup_kernel_ids.end(); ++ kid ) {
+    //  
+    // Note that we create a new kernel object for each kernel to run
+    // in warmup so we don't pollute timing data, checksum data, etc.
+    // for kernels that will run for real later...
+    //
     KernelBase* kernel = getKernelObject(*kid, run_params);
 #if defined(RAJA_PERFSUITE_USE_CALIPER)
     kernel->caliperOff();
@@ -847,6 +869,8 @@ void Executor::runWarmupKernels()
     delete kernel;
   }
 
+  KernelBase::setWarmupRun(prev_state);
+
 }
 
 void Executor::outputRunData()
@@ -933,10 +957,12 @@ void Executor::writeCSVReport(ostream& file, CSVRepMode mode,
     //
     // Set basic table formatting parameters.
     //
-    const string kernel_col_name("Kernel  ");
+    const string kernel_name_col_header_variant("Variant  ");
+    const string kernel_name_col_header_tuning("Tuning  ");
     const string sepchr(" , ");
 
-    size_t kercol_width = kernel_col_name.size();
+    size_t kercol_width = max(kernel_name_col_header_variant.size(),
+                              kernel_name_col_header_tuning.size());
     for (size_t ik = 0; ik < kernels.size(); ++ik) {
       kercol_width = max(kercol_width, kernels[ik]->getName().size());
     }
@@ -969,7 +995,7 @@ void Executor::writeCSVReport(ostream& file, CSVRepMode mode,
     //
     // Print column variant name line.
     //
-    file <<left<< setw(kercol_width) << kernel_col_name;
+    file <<left<< setw(kercol_width) << kernel_name_col_header_variant;
     for (size_t iv = 0; iv < variant_ids.size(); ++iv) {
       for (size_t it = 0; it < tuning_names[variant_ids[iv]].size(); ++it) {
         file << sepchr <<left<< setw(vartuncol_width[iv][it])
@@ -981,7 +1007,7 @@ void Executor::writeCSVReport(ostream& file, CSVRepMode mode,
     //
     // Print column tuning name line.
     //
-    file <<left<< setw(kercol_width) << kernel_col_name;
+    file <<left<< setw(kercol_width) << kernel_name_col_header_tuning;
     for (size_t iv = 0; iv < variant_ids.size(); ++iv) {
       for (size_t it = 0; it < tuning_names[variant_ids[iv]].size(); ++it) {
         file << sepchr <<left<< setw(vartuncol_width[iv][it])

diff --git a/src/common/KernelBase.cpp b/src/common/KernelBase.cpp
@@ -19,6 +19,16 @@
 
 namespace rajaperf {
 
+//
+// Static method: stores warmup_run in s_warmup_run and returns the prior value
+//
+bool KernelBase::setWarmupRun(bool warmup_run)
+{
+  bool previous_state = s_warmup_run;  // saved so the caller can restore it
+  s_warmup_run = warmup_run;
+  return previous_state;
+}
+
 KernelBase::KernelBase(KernelID kid, const RunParams& params)
   : run_params(params)
 #if defined(RAJA_ENABLE_TARGET_OPENMP)
@@ -133,7 +143,9 @@ Index_type KernelBase::getTargetProblemSize() const
 Index_type KernelBase::getRunReps() const
 {
   Index_type run_reps = static_cast<Index_type>(0);
-  if (run_params.getInputState() == RunParams::CheckRun) {
+  if (s_warmup_run) {
+    run_reps = static_cast<Index_type>(1);
+  } else if (run_params.getInputState() == RunParams::CheckRun) {
     run_reps = static_cast<Index_type>(run_params.getCheckRunReps());
   } else {
     run_reps = static_cast<Index_type>(default_reps*run_params.getRepFactor());

diff --git a/src/common/KernelBase.hpp b/src/common/KernelBase.hpp
@@ -90,6 +90,15 @@ class KernelBase
     { return std::numeric_limits<size_t>::max(); }
   static std::string getDefaultTuningName() { return "default"; }
 
+  //
+  // Method to set the warmup state shared by all kernel objects: pass true
+  // to indicate that kernel runs are for warmup purposes, false otherwise.
+  //
+  // The warmup state before the method call is returned to facilitate
+  // reset mechanics.
+  //
+  static bool setWarmupRun(bool warmup_run);
+
   KernelBase(KernelID kid, const RunParams& params);
 
   virtual ~KernelBase();
@@ -631,7 +640,13 @@ class KernelBase
                         variant_tuning_method_pointer method);
 
   //
-  // Static properties of kernel, independent of run
+  // Boolean member shared by all kernel objects indicating whether they
+  // will be run for warmup purposes (true) or not (false).
+  //
+  static inline bool s_warmup_run = false;
+
+  //
+  // Persistent properties of kernel, independent of run
   //
   KernelID    kernel_id;
   std::string name;