From c261baa0a857d554cb10f54277f30decdd8b2e6a Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Tue, 24 Mar 2026 16:36:28 -0700 Subject: [PATCH 1/4] provide perf script util for locking GPU frequency. Helpful for correlation studies Signed-off-by: Dingqing Yang --- scripts/performance/argument_parser.py | 10 ++++++ scripts/performance/perf_plugins.py | 42 +++++++++++++++++++++++++ scripts/performance/setup_experiment.py | 3 ++ 3 files changed, 55 insertions(+) diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py index 06193ff840..85281312d4 100644 --- a/scripts/performance/argument_parser.py +++ b/scripts/performance/argument_parser.py @@ -562,6 +562,16 @@ def parse_cli_args(): type=bool_arg, required=False, ) + performance_args.add_argument( + "-lgc", + "--lock_gpu_freq", + help="Lock GPU graphics clock to the specified frequency in MHz via " + "`sudo nvidia-smi -lgc FREQ_MHZ`. Runs once per node before training. " + "Use `nvidia-smi -rgc` to reset after the job.", + type=int, + required=False, + default=None, + ) performance_args.add_argument( "-en", "--enable_nsys", diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index e420d06bb6..d8694ec8d7 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -188,6 +188,8 @@ class PerfEnvPlugin(Plugin): in order to not block DP level communication overlap. enable_vboost (bool): Whether to steer more power towards tensor cores via `sudo nvidia-smi boost-slider --vboost 1`. May not work on all systems. + lock_gpu_freq (int | None): Lock GPU graphics clock to the specified frequency in MHz via + `sudo nvidia-smi -lgc FREQ_MHZ`. Runs once per node before training. None to disable. enable_manual_gc (bool): Enable manual garbage collection for better performance. manual_gc_interval (int): Interval for manual garbage collection. Default is 100. tp_size (int): Tensor parallelism size. Default is 1. 
@@ -200,6 +202,7 @@ class PerfEnvPlugin(Plugin): enable_layernorm_sm_margin: bool = True enable_vboost: bool = False + lock_gpu_freq: int | None = None enable_manual_gc: bool = True manual_gc_interval: int = 100 tp_size: int | None = None @@ -416,6 +419,42 @@ def get_vboost_srun_cmd(nodes, job_dir): else vboost_cmd ) + def _set_lock_gpu_freq( + self, task: Union["run.Partial", "run.Script"], executor: "run.Executor", lock_gpu_freq: int | None + ): + def get_lock_gpu_freq_srun_cmd(job_dir, freq_mhz): + import shlex + + lock_freq_cmd = "\n".join( + [ + "", + "# Command 0: lock GPU graphics clock", + " ".join( + [ + "srun", + "--ntasks-per-node=1", + "--output", + os.path.join(job_dir, "lock_gpu_freq.out"), + "--error", + os.path.join(job_dir, "lock_gpu_freq.err"), + "bash -c", + shlex.quote(f"sudo nvidia-smi -lgc {freq_mhz}"), + ] + ), + "", + ] + ) + + return lock_freq_cmd + + if lock_gpu_freq is not None and isinstance(executor, SlurmExecutor): + lock_freq_cmd = get_lock_gpu_freq_srun_cmd(executor.tunnel.job_dir, lock_gpu_freq) + executor.setup_lines = ( + executor.setup_lines + lock_freq_cmd + if (executor.setup_lines and len(executor.setup_lines) > 0) + else lock_freq_cmd + ) + def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executor"): """Enable the performance environment settings""" workload_base_config = get_workload_base_config( @@ -480,6 +519,9 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo # Improve perf by steering power to tensor cores, may not work on all systems self._set_vboost(task, executor, self.enable_vboost) + # Lock GPU graphics clock frequency for stable performance measurements + self._set_lock_gpu_freq(task, executor, self.lock_gpu_freq) + # Set model-specific environment variables self._set_model_specific_environment_variables( task, diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py index 2598c58b39..e7ff4e25cb 100755 --- 
a/scripts/performance/setup_experiment.py +++ b/scripts/performance/setup_experiment.py @@ -189,6 +189,7 @@ def main( detach: bool, dryrun: bool, enable_vboost: bool, + lock_gpu_freq: Optional[int], enable_nsys: bool, pytorch_profiler: bool, moe_a2a_overlap: bool, @@ -354,6 +355,7 @@ def main( plugins.append( PerfEnvPlugin( enable_vboost=enable_vboost, + lock_gpu_freq=lock_gpu_freq, moe_a2a_overlap=moe_a2a_overlap, tp_size=tp_size, pp_size=pp_size, @@ -599,6 +601,7 @@ def main( detach=args.detach, dryrun=args.dryrun, enable_vboost=args.enable_vboost, + lock_gpu_freq=args.lock_gpu_freq, enable_nsys=args.enable_nsys, pytorch_profiler=args.pytorch_profiler, moe_a2a_overlap=args.moe_a2a_overlap, From 3b0d3ed41253acd28a00d5f951513bea397abe83 Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Mon, 6 Apr 2026 21:39:56 -0700 Subject: [PATCH 2/4] [perf] docs: add docstring and README entry for --lock_gpu_freq Address review feedback: document when/why to use GPU clock locking and add the new arg to the performance README. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Dingqing Yang --- scripts/performance/README.md | 1 + scripts/performance/perf_plugins.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index d936a19d84..7bfda8776d 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -175,6 +175,7 @@ python scripts/performance/setup_experiment.py - `-g/--gpu`: Target GPU type (`h100`, `b200`, `gb200`, `gb300`, `b300`). - `-c/--compute_dtype`: Compute precision (`bf16`, `fp8_cs`, `fp8_mx`, `fp8_sc`, `nvfp4`). Default `bf16`. - `-vb/--enable_vboost`: Enable VBoost (tensor core power steering). Pass `true` or `false`. Disabled by default. +- `-lgc/--lock_gpu_freq`: Lock GPU graphics clock to a fixed frequency in MHz (e.g. `2100`). Useful for reducing run-to-run variance in benchmarks. Runs `nvidia-smi -lgc ` once per node before training. 
Use `nvidia-smi -rgc` to reset after the job. Disabled by default. - `-en/--enable_nsys`: Enable Nsight Systems profiling. Disabled by default. - `-pyp/--pytorch_profiler`: Enable PyTorch profiler. Pass `true` or `false`. Disabled by default. - `--profiling_start_step`: Defines start step for profiling. Default `10`. diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index d8694ec8d7..250a263012 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -422,6 +422,12 @@ def get_vboost_srun_cmd(nodes, job_dir): def _set_lock_gpu_freq( self, task: Union["run.Partial", "run.Script"], executor: "run.Executor", lock_gpu_freq: int | None ): + """Lock GPU graphics clocks to a fixed frequency before training. + + This is useful for performance benchmarking where consistent GPU clock speeds + are needed to reduce run-to-run variance. Runs ``nvidia-smi -lgc `` once + per node via srun before the main training command. + """ def get_lock_gpu_freq_srun_cmd(job_dir, freq_mhz): import shlex From bf842321d528bbdf9a0164628460aac75ebae8cf Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Mon, 6 Apr 2026 21:44:32 -0700 Subject: [PATCH 3/4] [perf] docs: add docstring and README entry for --lock_gpu_freq Address review feedback: document the correlation study use case for GPU clock locking and add the new arg to the performance README. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Dingqing Yang --- scripts/performance/README.md | 2 +- scripts/performance/perf_plugins.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/performance/README.md b/scripts/performance/README.md index 7bfda8776d..937f0ed3f2 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -175,7 +175,7 @@ python scripts/performance/setup_experiment.py - `-g/--gpu`: Target GPU type (`h100`, `b200`, `gb200`, `gb300`, `b300`). 
- `-c/--compute_dtype`: Compute precision (`bf16`, `fp8_cs`, `fp8_mx`, `fp8_sc`, `nvfp4`). Default `bf16`. - `-vb/--enable_vboost`: Enable VBoost (tensor core power steering). Pass `true` or `false`. Disabled by default. -- `-lgc/--lock_gpu_freq`: Lock GPU graphics clock to a fixed frequency in MHz (e.g. `2100`). Useful for reducing run-to-run variance in benchmarks. Runs `nvidia-smi -lgc ` once per node before training. Use `nvidia-smi -rgc` to reset after the job. Disabled by default. +- `-lgc/--lock_gpu_freq`: Lock GPU graphics clock to a fixed frequency in MHz (e.g. `1200`). Used for silicon simulation correlation studies. Disabled by default. - `-en/--enable_nsys`: Enable Nsight Systems profiling. Disabled by default. - `-pyp/--pytorch_profiler`: Enable PyTorch profiler. Pass `true` or `false`. Disabled by default. - `--profiling_start_step`: Defines start step for profiling. Default `10`. diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 250a263012..f503cffdcb 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -424,9 +424,8 @@ def _set_lock_gpu_freq( ): """Lock GPU graphics clocks to a fixed frequency before training. - This is useful for performance benchmarking where consistent GPU clock speeds - are needed to reduce run-to-run variance. Runs ``nvidia-smi -lgc `` once - per node via srun before the main training command. + Used for silicon simulation correlation studies where a fixed GPU + clock frequency is required to match simulation assumptions. 
""" def get_lock_gpu_freq_srun_cmd(job_dir, freq_mhz): import shlex From 6882c4a06617e2253d4ef9f8ca8032af70360c30 Mon Sep 17 00:00:00 2001 From: Dingqing Yang Date: Mon, 6 Apr 2026 21:47:51 -0700 Subject: [PATCH 4/4] [perf] fix: lint blank line after docstring Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: Dingqing Yang --- scripts/performance/perf_plugins.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index f503cffdcb..487b8594c6 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -427,6 +427,7 @@ def _set_lock_gpu_freq( Used for silicon simulation correlation studies where a fixed GPU clock frequency is required to match simulation assumptions. """ + def get_lock_gpu_freq_srun_cmd(job_dir, freq_mhz): import shlex