Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions configs/gpu_perf_hpo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
# HPO configuration for GPU performance benchmarking with the CLAIMED
# gpu_performance_test component on Vela (OpenShift / MLBatch).
#
# Hyperparameters:
#   batch_size   – DataLoader / training batch size
#   num_workers  – DataLoader worker count
#   hidden_dim   – MLP hidden layer width (training/inference cost)
#   depth        – MLP depth (training/inference cost)
#   matrix_size  – Matrix multiplication size (raw GPU compute)
#   gpu_num      – Number of GPUs to request per pod (launcher-level)
#
# Static args:
#   All remaining fixed CLI flags for gpu_performance_test.py
#
# Metrics:
#   The gpu_performance_test script prints "Samples/sec:" three times
#   (DataLoader, Training, Inference) and "GFLOPS:" once. iterate2 uses the
#   name#N syntax to select the Nth occurrence (0-based) of a repeated
#   metric name.
---
metrics:
  - "Samples/sec#0"  # DataLoader throughput (1st occurrence)
  - "Samples/sec#1"  # Training throughput (2nd occurrence)
  - "Samples/sec#2"  # Inference throughput (3rd occurrence)
  - "GFLOPS"         # quoted for consistency with the other metric names

# Search space: every hyperparameter is a categorical choice.
hpo:
  # Launcher-level: GPUs requested per pod, not a CLI flag of the script.
  gpu_num:
    type: categorical
    choices: [1, 2]

  batch_size:
    type: categorical
    choices: [32, 64, 128]

  num_workers:
    type: categorical
    choices: [8, 16, 32]

  hidden_dim:
    type: categorical
    choices: [500, 1000, 2000]

  depth:
    type: categorical
    choices: [500, 1000, 2000]

  matrix_size:
    type: categorical
    choices: [5000, 10000, 20000]

# Fixed CLI arguments passed to gpu_performance_test.py on every trial.
static:
  mode: single_gpu
  dataset_size: 100000
  steps: 1
  input_dim: 1000000
  num_classes: 100
  materialize_dir: "."
  cleanup: true  # boolean flag – true emits --cleanup (argparse store_true)
  iterations: 100
51 changes: 51 additions & 0 deletions configs/gridfm_graphkit_hpo.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# HPO configuration for gridfm-graphkit HGNS PF case118
#
# Hyperparameters:
#   gpu_num   – number of GPUs to request from the WLM (launcher-level)
#   bfloat16  – boolean flag (presence = --bfloat16, absence = flag omitted)
#   tf32      – boolean flag (presence = --tf32, absence = flag omitted)
#   compile   – torch.compile mode; null disables the flag entirely
#   dataset   – group: selects config + data_path + exp_name together
#
# Static args:
#   all other fixed CLI args
#
# Metrics:
#   extracted from [performance] lines in trial output
---
metrics:
  - "case118_ieee/layer_0_residual"
  - "last epoch time"  # quoted: metric name contains spaces
  - "last epoch it/s"  # quoted: metric name contains spaces and a slash

hpo:
  gpu_num:
    type: categorical
    choices: [1, 2, 4]

  bfloat16:
    type: flag  # store_true: true → --bfloat16, false → flag omitted

  tf32:
    type: flag  # store_true: true → --tf32, false → flag omitted

  compile:
    type: categorical
    # null → the --compile flag is omitted entirely
    choices: ["max-autotune", "default", "reduce-overhead", null]

  num_workers:
    type: categorical
    choices: [8, 16, 24, 32]

  dataset:
    type: group  # one choice selects all bundled args together
    choices:
      case118:
        config: ./examples/config/HGNS_PF_datakit_case118.yaml
        # NOTE(review): user-specific home path — parameterize or document
        # before sharing this config.
        data_path: /u/rkie/

static:
  run_name: run1
  log_dir: logs
  # NOTE(review): kebab-case key while the rest of the file uses snake_case —
  # confirm the consumer expects "report-performance" exactly before renaming.
  report-performance: true
Loading
Loading