Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions configs/gpu_perf_hpo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# HPO configuration for GPU performance benchmarking with the CLAIMED
# gpu_performance_test component on Vela (OpenShift / MLBatch).
#
# Hyperparameters:
#   batch_size   – DataLoader / training batch size
#   num_workers  – DataLoader worker count
#   hidden_dim   – MLP hidden layer width (training/inference cost)
#   depth        – MLP depth (training/inference cost)
#   matrix_size  – Matrix multiplication size (raw GPU compute)
#   gpu_num      – Number of GPUs to request per pod (launcher-level)
#
# Static args:
#   All remaining fixed CLI flags for gpu_performance_test.py
#
# Metrics:
#   The gpu_performance_test script prints "Samples/sec:" three times
#   (DataLoader, Training, Inference) and "GFLOPS:" once. iterate2 uses the
#   name#N syntax to select the Nth occurrence (0-based) of a repeated
#   metric name.
---
metrics:
  - "Samples/sec#0"  # DataLoader throughput (1st occurrence)
  - "Samples/sec#1"  # Training throughput (2nd occurrence)
  - "Samples/sec#2"  # Inference throughput (3rd occurrence)
  - "GFLOPS"         # quoted for consistency with the other metric names

# Search space: every hyperparameter is a categorical choice.
hpo:
  # Launcher-level: GPUs requested per pod, not a CLI flag of the script.
  gpu_num:
    type: categorical
    choices: [1, 2]

  batch_size:
    type: categorical
    choices: [32, 64, 128]

  num_workers:
    type: categorical
    choices: [8, 16, 32]

  hidden_dim:
    type: categorical
    choices: [500, 1000, 2000]

  depth:
    type: categorical
    choices: [500, 1000, 2000]

  matrix_size:
    type: categorical
    choices: [5000, 10000, 20000]

# Fixed CLI arguments passed to gpu_performance_test.py on every trial.
static:
  mode: single_gpu
  dataset_size: 100000
  steps: 1
  input_dim: 1000000
  num_classes: 100
  materialize_dir: "."
  cleanup: true  # boolean flag – true emits --cleanup (argparse store_true)
  iterations: 100
51 changes: 51 additions & 0 deletions configs/gridfm_graphkit_hpo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# HPO configuration for gridfm-graphkit HGNS PF case118
#
# Hyperparameters:
#   gpu_num   – number of GPUs to request from the WLM (launcher-level)
#   bfloat16  – boolean flag (presence = --bfloat16, absence = flag omitted)
#   tf32      – boolean flag (presence = --tf32, absence = flag omitted)
#   compile   – torch.compile mode; null disables the flag entirely
#   dataset   – group: selects config + data_path + exp_name together
#
# Static args:
#   all other fixed CLI args
#
# Metrics:
#   extracted from [performance] lines in trial output
---
metrics:
  - "case118_ieee/layer_0_residual"
  - "last epoch time"  # quoted: metric name contains spaces
  - "last epoch it/s"  # quoted: metric name contains spaces and a slash

hpo:
  gpu_num:
    type: categorical
    choices: [1, 2, 4]

  bfloat16:
    type: flag  # store_true: true → --bfloat16, false → flag omitted

  tf32:
    type: flag  # store_true: true → --tf32, false → flag omitted

  compile:
    type: categorical
    # null → the --compile flag is omitted entirely
    choices: ["max-autotune", "default", "reduce-overhead", null]

  num_workers:
    type: categorical
    choices: [8, 16, 24, 32]

  dataset:
    type: group  # one choice selects all bundled args together
    choices:
      case118:
        config: ./examples/config/HGNS_PF_datakit_case118.yaml
        # NOTE(review): user-specific home path — parameterize or document
        # before sharing this config.
        data_path: /u/rkie/

static:
  run_name: run1
  log_dir: logs
  # NOTE(review): kebab-case key while the rest of the file uses snake_case —
  # confirm the consumer expects "report-performance" exactly before renaming.
  report-performance: true
Loading
Loading