From c261baa0a857d554cb10f54277f30decdd8b2e6a Mon Sep 17 00:00:00 2001
From: Dingqing Yang <dingqingy@nvidia.com>
Date: Tue, 24 Mar 2026 16:36:28 -0700
Subject: [PATCH 1/4] provide perf script util for locking GPU frequency.
 Helpful for correlation studies

Signed-off-by: Dingqing Yang <dingqingy@nvidia.com>
---
 scripts/performance/argument_parser.py  | 10 ++++++
 scripts/performance/perf_plugins.py     | 42 +++++++++++++++++++++++++
 scripts/performance/setup_experiment.py |  3 ++
 3 files changed, 55 insertions(+)
diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py
index 06193ff840..85281312d4 100644
--- a/scripts/performance/argument_parser.py
+++ b/scripts/performance/argument_parser.py
@@ -562,6 +562,16 @@ def parse_cli_args():
         type=bool_arg,
         required=False,
     )
+    performance_args.add_argument(
+        "-lgc",
+        "--lock_gpu_freq",
+        help="Lock GPU graphics clock to the specified frequency in MHz via "
+        "`sudo nvidia-smi -lgc <freq>`. Runs once per node before training. "
+        "Use `nvidia-smi -rgc` to reset after the job.",
+        type=int,
+        required=False,
+        default=None,
+    )
     performance_args.add_argument(
         "-en",
         "--enable_nsys",
diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index e420d06bb6..d8694ec8d7 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -188,6 +188,8 @@ class PerfEnvPlugin(Plugin):
             in order to not block DP level communication overlap.
         enable_vboost (bool): Whether to steer more power towards tensor cores via
             `sudo nvidia-smi boost-slider --vboost 1`. May not work on all systems.
+        lock_gpu_freq (int | None): Lock GPU graphics clock to the specified frequency in MHz via
+            `sudo nvidia-smi -lgc <freq>`. Runs once per node before training. None to disable.
         enable_manual_gc (bool): Enable manual garbage collection for better performance.
         manual_gc_interval (int): Interval for manual garbage collection. Default is 100.
         tp_size (int): Tensor parallelism size. Default is 1.
@@ -200,6 +202,7 @@ class PerfEnvPlugin(Plugin):
 
     enable_layernorm_sm_margin: bool = True
     enable_vboost: bool = False
+    lock_gpu_freq: int | None = None
     enable_manual_gc: bool = True
     manual_gc_interval: int = 100
     tp_size: int | None = None
@@ -416,6 +419,42 @@ def get_vboost_srun_cmd(nodes, job_dir):
                 else vboost_cmd
             )
 
+    def _set_lock_gpu_freq(
+        self, task: Union["run.Partial", "run.Script"], executor: "run.Executor", lock_gpu_freq: int | None
+    ):
+        def get_lock_gpu_freq_srun_cmd(job_dir, freq_mhz):
+            import shlex
+
+            lock_freq_cmd = "\n".join(
+                [
+                    "",
+                    "# Command 0: lock GPU graphics clock",
+                    " ".join(
+                        [
+                            "srun",
+                            "--ntasks-per-node=1",
+                            "--output",
+                            os.path.join(job_dir, "lock_gpu_freq.out"),
+                            "--error",
+                            os.path.join(job_dir, "lock_gpu_freq.err"),
+                            "bash -c",
+                            shlex.quote(f"sudo nvidia-smi -lgc {freq_mhz}"),
+                        ]
+                    ),
+                    "",
+                ]
+            )
+
+            return lock_freq_cmd
+
+        if lock_gpu_freq is not None and isinstance(executor, SlurmExecutor):
+            lock_freq_cmd = get_lock_gpu_freq_srun_cmd(executor.tunnel.job_dir, lock_gpu_freq)
+            executor.setup_lines = (
+                executor.setup_lines + lock_freq_cmd
+                if (executor.setup_lines and len(executor.setup_lines) > 0)
+                else lock_freq_cmd
+            )
+
     def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executor"):
         """Enable the performance environment settings"""
         workload_base_config = get_workload_base_config(
@@ -480,6 +519,9 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
         # Improve perf by steering power to tensor cores, may not work on all systems
         self._set_vboost(task, executor, self.enable_vboost)
 
+        # Lock GPU graphics clock frequency for stable performance measurements
+        self._set_lock_gpu_freq(task, executor, self.lock_gpu_freq)
+
         # Set model-specific environment variables
         self._set_model_specific_environment_variables(
             task,
diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py
index 2598c58b39..e7ff4e25cb 100755
--- a/scripts/performance/setup_experiment.py
+++ b/scripts/performance/setup_experiment.py
@@ -189,6 +189,7 @@ def main(
     detach: bool,
     dryrun: bool,
     enable_vboost: bool,
+    lock_gpu_freq: Optional[int],
     enable_nsys: bool,
     pytorch_profiler: bool,
     moe_a2a_overlap: bool,
@@ -354,6 +355,7 @@ def main(
         plugins.append(
             PerfEnvPlugin(
                 enable_vboost=enable_vboost,
+                lock_gpu_freq=lock_gpu_freq,
                 moe_a2a_overlap=moe_a2a_overlap,
                 tp_size=tp_size,
                 pp_size=pp_size,
@@ -599,6 +601,7 @@ def main(
         detach=args.detach,
         dryrun=args.dryrun,
         enable_vboost=args.enable_vboost,
+        lock_gpu_freq=args.lock_gpu_freq,
         enable_nsys=args.enable_nsys,
         pytorch_profiler=args.pytorch_profiler,
         moe_a2a_overlap=args.moe_a2a_overlap,

From 3b0d3ed41253acd28a00d5f951513bea397abe83 Mon Sep 17 00:00:00 2001
From: Dingqing Yang <dingqingy@nvidia.com>
Date: Mon, 6 Apr 2026 21:39:56 -0700
Subject: [PATCH 2/4] [perf] docs: add docstring and README entry for
 --lock_gpu_freq

Address review feedback: document when/why to use GPU clock locking
and add the new arg to the performance README.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Dingqing Yang <dingqingy@nvidia.com>
---
 scripts/performance/README.md       | 1 +
 scripts/performance/perf_plugins.py | 6 ++++++
 2 files changed, 7 insertions(+)

diff --git a/scripts/performance/README.md b/scripts/performance/README.md
index d936a19d84..7bfda8776d 100644
--- a/scripts/performance/README.md
+++ b/scripts/performance/README.md
@@ -175,6 +175,7 @@ python scripts/performance/setup_experiment.py
 - `-g/--gpu`: Target GPU type (`h100`, `b200`, `gb200`, `gb300`, `b300`).
 - `-c/--compute_dtype`: Compute precision (`bf16`, `fp8_cs`, `fp8_mx`, `fp8_sc`, `nvfp4`). Default `bf16`.
 - `-vb/--enable_vboost`: Enable VBoost (tensor core power steering). Pass `true` or `false`. Disabled by default.
+- `-lgc/--lock_gpu_freq`: Lock GPU graphics clock to a fixed frequency in MHz (e.g. `2100`). Useful for reducing run-to-run variance in benchmarks. Runs `nvidia-smi -lgc <freq>` once per node before training. Use `nvidia-smi -rgc` to reset after the job. Disabled by default.
 - `-en/--enable_nsys`: Enable Nsight Systems profiling. Disabled by default.
 - `-pyp/--pytorch_profiler`: Enable PyTorch profiler. Pass `true` or `false`. Disabled by default.
 - `--profiling_start_step`: Defines start step for profiling. Default `10`.
diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index d8694ec8d7..250a263012 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -422,6 +422,12 @@ def get_vboost_srun_cmd(nodes, job_dir):
     def _set_lock_gpu_freq(
         self, task: Union["run.Partial", "run.Script"], executor: "run.Executor", lock_gpu_freq: int | None
     ):
+        """Lock GPU graphics clocks to a fixed frequency before training.
+
+        This is useful for performance benchmarking where consistent GPU clock speeds
+        are needed to reduce run-to-run variance. Runs ``nvidia-smi -lgc <freq>`` once
+        per node via srun before the main training command.
+        """
         def get_lock_gpu_freq_srun_cmd(job_dir, freq_mhz):
             import shlex
 

From bf842321d528bbdf9a0164628460aac75ebae8cf Mon Sep 17 00:00:00 2001
From: Dingqing Yang <dingqingy@nvidia.com>
Date: Mon, 6 Apr 2026 21:44:32 -0700
Subject: [PATCH 3/4] [perf] docs: describe correlation-study use case for
 --lock_gpu_freq

Address review feedback: reword the --lock_gpu_freq docstring and README
entry to describe the silicon simulation correlation study use case.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Dingqing Yang <dingqingy@nvidia.com>
---
 scripts/performance/README.md       | 2 +-
 scripts/performance/perf_plugins.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/scripts/performance/README.md b/scripts/performance/README.md
index 7bfda8776d..937f0ed3f2 100644
--- a/scripts/performance/README.md
+++ b/scripts/performance/README.md
@@ -175,7 +175,7 @@ python scripts/performance/setup_experiment.py
 - `-g/--gpu`: Target GPU type (`h100`, `b200`, `gb200`, `gb300`, `b300`).
 - `-c/--compute_dtype`: Compute precision (`bf16`, `fp8_cs`, `fp8_mx`, `fp8_sc`, `nvfp4`). Default `bf16`.
 - `-vb/--enable_vboost`: Enable VBoost (tensor core power steering). Pass `true` or `false`. Disabled by default.
-- `-lgc/--lock_gpu_freq`: Lock GPU graphics clock to a fixed frequency in MHz (e.g. `2100`). Useful for reducing run-to-run variance in benchmarks. Runs `nvidia-smi -lgc <freq>` once per node before training. Use `nvidia-smi -rgc` to reset after the job. Disabled by default.
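+- `-lgc/--lock_gpu_freq`: Lock GPU graphics clock to a fixed frequency in MHz (e.g. `1200`). Used for silicon simulation correlation studies. Runs `sudo nvidia-smi -lgc <freq>` once per node before training; use `nvidia-smi -rgc` to reset after the job. Disabled by default.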
 - `-en/--enable_nsys`: Enable Nsight Systems profiling. Disabled by default.
 - `-pyp/--pytorch_profiler`: Enable PyTorch profiler. Pass `true` or `false`. Disabled by default.
 - `--profiling_start_step`: Defines start step for profiling. Default `10`.
diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index 250a263012..f503cffdcb 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -424,9 +424,8 @@ def _set_lock_gpu_freq(
     ):
         """Lock GPU graphics clocks to a fixed frequency before training.
 
-        This is useful for performance benchmarking where consistent GPU clock speeds
-        are needed to reduce run-to-run variance. Runs ``nvidia-smi -lgc <freq>`` once
-        per node via srun before the main training command.
+        Used for silicon simulation correlation studies where a fixed GPU
+        clock frequency is required to match simulation assumptions.
         """
         def get_lock_gpu_freq_srun_cmd(job_dir, freq_mhz):
             import shlex

From 6882c4a06617e2253d4ef9f8ca8032af70360c30 Mon Sep 17 00:00:00 2001
From: Dingqing Yang <dingqingy@nvidia.com>
Date: Mon, 6 Apr 2026 21:47:51 -0700
Subject: [PATCH 4/4] [perf] fix: lint blank line after docstring

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Signed-off-by: Dingqing Yang <dingqingy@nvidia.com>
---
 scripts/performance/perf_plugins.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py
index f503cffdcb..487b8594c6 100644
--- a/scripts/performance/perf_plugins.py
+++ b/scripts/performance/perf_plugins.py
@@ -427,6 +427,7 @@ def _set_lock_gpu_freq(
         Used for silicon simulation correlation studies where a fixed GPU
         clock frequency is required to match simulation assumptions.
         """
+
         def get_lock_gpu_freq_srun_cmd(job_dir, freq_mhz):
             import shlex