diff --git a/scripts/performance/README.md b/scripts/performance/README.md index 30513cdac9..fc193a21ac 100644 --- a/scripts/performance/README.md +++ b/scripts/performance/README.md @@ -216,6 +216,7 @@ Mounting cached files is not enough by itself. If `HF_HUB_OFFLINE` remains `0`, - `-g/--gpu`: Target GPU type (`h100`, `b200`, `gb200`, `gb300`, `b300`). - `-c/--compute_dtype`: Compute precision (`bf16`, `fp8_cs`, `fp8_mx`, `fp8_sc`, `nvfp4`). Default `bf16`. - `-vb/--enable_vboost`: Enable VBoost (tensor core power steering). Pass `true` or `false`. Disabled by default. +- `-lgc/--lock_gpu_freq`: Lock GPU graphics clock to a fixed frequency in MHz (e.g. `1200`). Used for silicon simulation correlation studies. Disabled by default. - `-en/--enable_nsys`: Enable Nsight Systems profiling. Disabled by default. - `-pyp/--pytorch_profiler`: Enable PyTorch profiler. Pass `true` or `false`. Disabled by default. - `--profiling_start_step`: Defines start step for profiling. Default `10`. diff --git a/scripts/performance/argument_parser.py b/scripts/performance/argument_parser.py index 79eb80f45a..d8a95d6e3b 100644 --- a/scripts/performance/argument_parser.py +++ b/scripts/performance/argument_parser.py @@ -608,6 +608,16 @@ def parse_cli_args(): type=bool_arg, required=False, ) + performance_args.add_argument( + "-lgc", + "--lock_gpu_freq", + help="Lock GPU graphics clock to the specified frequency in MHz via " + "`sudo nvidia-smi -lgc <freq>`. Runs once per node before training. " + "Use `nvidia-smi -rgc` to reset after the job.", + type=int, + required=False, + default=None, + ) performance_args.add_argument( "-en", "--enable_nsys", diff --git a/scripts/performance/perf_plugins.py b/scripts/performance/perf_plugins.py index 4a0b28dbe6..2909aace6a 100644 --- a/scripts/performance/perf_plugins.py +++ b/scripts/performance/perf_plugins.py @@ -188,6 +188,8 @@ class PerfEnvPlugin(Plugin): in order to not block DP level communication overlap. 
enable_vboost (bool): Whether to steer more power towards tensor cores via `sudo nvidia-smi boost-slider --vboost 1`. May not work on all systems. + lock_gpu_freq (int | None): Lock GPU graphics clock to the specified frequency in MHz via + `sudo nvidia-smi -lgc <freq>`. Runs once per node before training. None to disable. enable_manual_gc (bool): Enable manual garbage collection for better performance. manual_gc_interval (int): Interval for manual garbage collection. Default is 100. tp_size (int): Tensor parallelism size. Default is 1. @@ -200,6 +202,7 @@ class PerfEnvPlugin(Plugin): enable_layernorm_sm_margin: bool = True enable_vboost: bool = False + lock_gpu_freq: int | None = None enable_manual_gc: bool = True manual_gc_interval: int = 100 tp_size: int | None = None @@ -433,6 +436,48 @@ def get_vboost_srun_cmd(nodes, job_dir): else vboost_cmd ) + def _set_lock_gpu_freq( + self, task: Union["run.Partial", "run.Script"], executor: "run.Executor", lock_gpu_freq: int | None + ): + """Lock GPU graphics clocks to a fixed frequency before training. + + Used for silicon simulation correlation studies where a fixed GPU + clock frequency is required to match simulation assumptions. 
+ """ + + def get_lock_gpu_freq_srun_cmd(job_dir, freq_mhz): + import shlex + + lock_freq_cmd = "\n".join( + [ + "", + "# Command 0: lock GPU graphics clock", + " ".join( + [ + "srun", + "--ntasks-per-node=1", + "--output", + os.path.join(job_dir, "lock_gpu_freq.out"), + "--error", + os.path.join(job_dir, "lock_gpu_freq.err"), + "bash -c", + shlex.quote(f"sudo nvidia-smi -lgc {freq_mhz}"), + ] + ), + "", + ] + ) + + return lock_freq_cmd + + if lock_gpu_freq is not None and isinstance(executor, SlurmExecutor): + lock_freq_cmd = get_lock_gpu_freq_srun_cmd(executor.tunnel.job_dir, lock_gpu_freq) + executor.setup_lines = ( + executor.setup_lines + lock_freq_cmd + if (executor.setup_lines and len(executor.setup_lines) > 0) + else lock_freq_cmd + ) + def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executor"): """Enable the performance environment settings""" workload_base_config = get_workload_base_config( @@ -495,6 +540,9 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo # Improve perf by steering power to tensor cores, may not work on all systems self._set_vboost(task, executor, self.enable_vboost) + # Lock GPU graphics clock frequency for stable performance measurements + self._set_lock_gpu_freq(task, executor, self.lock_gpu_freq) + # Set model-specific environment variables self._set_model_specific_environment_variables( task, diff --git a/scripts/performance/setup_experiment.py b/scripts/performance/setup_experiment.py index 58ec76e886..c6b1a1cf99 100755 --- a/scripts/performance/setup_experiment.py +++ b/scripts/performance/setup_experiment.py @@ -209,6 +209,7 @@ def main( detach: bool, dryrun: bool, enable_vboost: bool, + lock_gpu_freq: Optional[int], enable_nsys: bool, pytorch_profiler: bool, moe_a2a_overlap: bool, @@ -398,6 +399,7 @@ def main( plugins.append( PerfEnvPlugin( enable_vboost=enable_vboost, + lock_gpu_freq=lock_gpu_freq, moe_a2a_overlap=moe_a2a_overlap, tp_size=tp_size, pp_size=pp_size, @@ 
-652,6 +654,7 @@ def main( detach=args.detach, dryrun=args.dryrun, enable_vboost=args.enable_vboost, + lock_gpu_freq=args.lock_gpu_freq, enable_nsys=args.enable_nsys, pytorch_profiler=args.pytorch_profiler, moe_a2a_overlap=args.moe_a2a_overlap,