Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions scripts/performance/argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -562,6 +562,16 @@ def parse_cli_args():
type=bool_arg,
required=False,
)
performance_args.add_argument(
"-lgc",
"--lock_gpu_freq",
help="Lock GPU graphics clock to the specified frequency in MHz via "
"`sudo nvidia-smi -lgc <freq>`. Runs once per node before training. "
"Use `nvidia-smi -rgc` to reset after the job.",
type=int,
required=False,
default=None,
)
performance_args.add_argument(
"-en",
"--enable_nsys",
Expand Down
42 changes: 42 additions & 0 deletions scripts/performance/perf_plugins.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,8 @@ class PerfEnvPlugin(Plugin):
in order to not block DP level communication overlap.
enable_vboost (bool): Whether to steer more power towards tensor cores via
`sudo nvidia-smi boost-slider --vboost 1`. May not work on all systems.
lock_gpu_freq (int | None): Lock GPU graphics clock to the specified frequency in MHz via
`sudo nvidia-smi -lgc <freq>`. Runs once per node before training. None to disable.
enable_manual_gc (bool): Enable manual garbage collection for better performance.
manual_gc_interval (int): Interval for manual garbage collection. Default is 100.
tp_size (int): Tensor parallelism size. Default is 1.
Expand All @@ -200,6 +202,7 @@ class PerfEnvPlugin(Plugin):

enable_layernorm_sm_margin: bool = True
enable_vboost: bool = False
lock_gpu_freq: int | None = None
enable_manual_gc: bool = True
manual_gc_interval: int = 100
tp_size: int | None = None
Expand Down Expand Up @@ -416,6 +419,42 @@ def get_vboost_srun_cmd(nodes, job_dir):
else vboost_cmd
)

def _set_lock_gpu_freq(
    self, task: Union["run.Partial", "run.Script"], executor: "run.Executor", lock_gpu_freq: int | None
):
    """Append an `srun` step that locks the GPU graphics clock to the executor's setup lines.

    The step runs `sudo nvidia-smi -lgc <lock_gpu_freq>` with one task per node and
    writes its stdout/stderr to `lock_gpu_freq.out` / `lock_gpu_freq.err` inside the
    executor's job directory.

    Args:
        task: The task being configured. Not used by this method.
        executor: The executor to modify. Only `SlurmExecutor` instances are touched;
            any other executor is left unchanged.
        lock_gpu_freq: Target graphics clock in MHz, or None to do nothing.
    """
    import shlex

    def get_lock_gpu_freq_srun_cmd(job_dir, freq_mhz):
        """Build the shell snippet (with leading and trailing newline) that locks the clock."""
        srun_cmd = " ".join(
            [
                "srun",
                "--ntasks-per-node=1",
                "--output",
                # Quote the log paths: the line is interpreted by a shell, so a job
                # directory with spaces or metacharacters would otherwise split or
                # alter the command. Plain paths are returned unchanged by quote().
                shlex.quote(os.path.join(job_dir, "lock_gpu_freq.out")),
                "--error",
                shlex.quote(os.path.join(job_dir, "lock_gpu_freq.err")),
                "bash -c",
                shlex.quote(f"sudo nvidia-smi -lgc {freq_mhz}"),
            ]
        )
        # Empty first/last entries give the snippet a leading and trailing newline so
        # it can be concatenated directly onto existing setup lines.
        return "\n".join(["", "# Command 0: lock GPU graphics clock", srun_cmd, ""])

    if lock_gpu_freq is None or not isinstance(executor, SlurmExecutor):
        return

    lock_freq_cmd = get_lock_gpu_freq_srun_cmd(executor.tunnel.job_dir, lock_gpu_freq)
    # setup_lines may be None or empty; in that case the snippet becomes the whole value.
    executor.setup_lines = (executor.setup_lines or "") + lock_freq_cmd

def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executor"):
"""Enable the performance environment settings"""
workload_base_config = get_workload_base_config(
Expand Down Expand Up @@ -480,6 +519,9 @@ def setup(self, task: Union["run.Partial", "run.Script"], executor: "run.Executo
# Improve perf by steering power to tensor cores, may not work on all systems
self._set_vboost(task, executor, self.enable_vboost)

# Lock GPU graphics clock frequency for stable performance measurements
self._set_lock_gpu_freq(task, executor, self.lock_gpu_freq)

# Set model-specific environment variables
self._set_model_specific_environment_variables(
task,
Expand Down
3 changes: 3 additions & 0 deletions scripts/performance/setup_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ def main(
detach: bool,
dryrun: bool,
enable_vboost: bool,
lock_gpu_freq: Optional[int],
enable_nsys: bool,
pytorch_profiler: bool,
moe_a2a_overlap: bool,
Expand Down Expand Up @@ -354,6 +355,7 @@ def main(
plugins.append(
PerfEnvPlugin(
enable_vboost=enable_vboost,
lock_gpu_freq=lock_gpu_freq,
moe_a2a_overlap=moe_a2a_overlap,
tp_size=tp_size,
pp_size=pp_size,
Expand Down Expand Up @@ -599,6 +601,7 @@ def main(
detach=args.detach,
dryrun=args.dryrun,
enable_vboost=args.enable_vboost,
lock_gpu_freq=args.lock_gpu_freq,
enable_nsys=args.enable_nsys,
pytorch_profiler=args.pytorch_profiler,
moe_a2a_overlap=args.moe_a2a_overlap,
Expand Down
Loading