diff --git a/README.md b/README.md index 58bf59d..3b2ddc0 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,49 @@ We recommend using python 3.10, 3.11 or 3.12 and also using a virtual environmen pip install terratorch-iterate ``` +### New instructions for iterate v0.3 +Iterate v0.3 can optimize over arbitrary code running on arbitrary workload managers. +Slurm and LSF are supported; Kubernetes/OpenShift and PBS are coming soon. + +From version 0.3 on, the previous iterate CLI is available as `iterate-classic`. Here are some usage examples for the new `iterate` command: + +#### Prerequisites +mkdir deleteme.iterate +cd deleteme.iterate +python -m venv .venv +source .venv/bin/activate +wget https://raw.githubusercontent.com/terrastackai/iterate/refs/heads/main/examples/bumpy_function.py +wget https://raw.githubusercontent.com/terrastackai/iterate/refs/heads/main/examples/bumpy_hpo.yaml +pip install terratorch-iterate==0.3 + +#### Run locally +``` +iterate \ + --script bumpy_function.py \ + --root-dir . \ + --optuna-study-name terratorch_hpo_nas_2 \ + --optuna-db-path "sqlite:///iterate_study.db" \ + --hpo-yaml bumpy_hpo.yaml \ + --wlm none \ + --metric yval +``` +#### Run on LSF +``` +iterate \ + --script bumpy_function.py \ + --root-dir . 
\ + --optuna-study-name terratorch_hpo_nas_2 \ + --optuna-db-path "sqlite:///iterate_study.db" \ + --hpo-yaml bumpy_hpo.yaml \ + --wlm lsf \ + --metric yval \ + --gpu-count 0 +``` +#### Useful commands +``` +pip install optuna-dashboard +optuna-dashboard --host 0.0.0.0 sqlite:///iterate_study.db +``` ### Suggested setup for development ```sh diff --git a/examples/bumpy_function.py b/examples/bumpy_function.py new file mode 100644 index 0000000..e098d76 --- /dev/null +++ b/examples/bumpy_function.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 +import argparse +import math + + +def bumpy_function_3d( + x, y, z, + global_mu, global_sigma, + mu_rest, sigma_rest, amps_rest, +): + """ + 3D smooth multimodal function with: + - one global optimum = 1 at global_mu = (mx,my,mz) + - multiple local optima < 1 + + f(p) = 1 - Π_k (1 - a_k * exp(-||p - mu_k||^2 / (2 sigma_k^2))) + """ + + def sqdist(p, q): + return (p[0] - q[0])**2 + (p[1] - q[1])**2 + (p[2] - q[2])**2 + + p = (x, y, z) + + # Global peak (amplitude = 1) + val = 1.0 - math.exp( + -sqdist(p, global_mu) / (2.0 * global_sigma**2) + ) + + # Local peaks + for mu_k, sig_k, a_k in zip(mu_rest, sigma_rest, amps_rest): + term = 1.0 - a_k * math.exp( + -sqdist(p, mu_k) / (2.0 * sig_k**2) + ) + val *= term + + return 1.0 - val + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Evaluate the 3D bumpy multimodal function.") + + parser.add_argument("--x", type=float, required=True) + parser.add_argument("--y", type=float, required=True) + parser.add_argument("--z", type=float, required=True) + parser.add_argument("--trial-number", type=int, default=0) + + parser.add_argument( + "--global-mu", + type=float, + nargs=3, + default=[0.0, 0.0, 0.0], + metavar=("MX", "MY", "MZ"), + ) + parser.add_argument("--global-sigma", type=float, default=0.7) + + parser.add_argument( + "--mu-rest", + type=float, + nargs="*", + default=[-2.0, 0.0, 0.0, 2.0, 0.0, 0.0], + help="Flat list of (x y z) triplets", + ) + parser.add_argument( 
+ "--sigma-rest", + type=float, + nargs="*", + default=[0.6, 0.6], + ) + parser.add_argument( + "--amps-rest", + type=float, + nargs="*", + default=[0.5, 0.8], + ) + + args = parser.parse_args() + + mu_rest = [ + tuple(args.mu_rest[i:i+3]) + for i in range(0, len(args.mu_rest), 3) + ] + + yval = bumpy_function_3d( + x=args.x, + y=args.y, + z=args.z, + global_mu=tuple(args.global_mu), + global_sigma=args.global_sigma, + mu_rest=mu_rest, + sigma_rest=args.sigma_rest, + amps_rest=args.amps_rest, + ) + + print(f'yval: {yval}, trial_number: {args.trial_number}') diff --git a/examples/bumpy_hpo.yaml b/examples/bumpy_hpo.yaml new file mode 100644 index 0000000..70a16bf --- /dev/null +++ b/examples/bumpy_hpo.yaml @@ -0,0 +1,30 @@ +# ======================= +# Static parameters - passed to the underlying training script as is +# ======================= + +static: + global-mu: 23 42 66 + +# ======================== +# Training hyperparameters - evaluated by optuna and passed to the underlying training script +# ======================== + +hpo: + x: + type: float + low: 1 + high: 100 + log: true + + y: + type: float + low: 1 + high: 100 + log: true + + z: + type: float + low: 1 + high: 100 + log: true + diff --git a/pyproject.toml b/pyproject.toml index 5c4fd83..019a146 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ include = ["terratorch_iterate*"] [project] name = "terratorch-iterate" -version = "0.2.3" +version = "0.3" requires-python = ">= 3.11" description = "A terratorch's plugin for benchmarking and hyperparameter optimization" authors = [ @@ -77,8 +77,8 @@ dependencies = [ ] [project.urls] -Homepage = "https://github.com/IBM/terratorch-iterate" -Issues = "https://github.com/IBM/terratorch-iterate/issues" +Homepage = "https://github.com/terrastackai/iterate" +Issues = "https://github.com/terrastackai/iterate/issues" [project.optional-dependencies] dev = [ @@ -113,7 +113,8 @@ line-length = 88 skip-string-normalization = true [project.scripts] 
-iterate = "terratorch_iterate.main:main" +iterate-classic = "terratorch_iterate.main:main" +iterate = "terratorch_iterate.iterate2:main" [tool.isort] multi_line_output = 3 diff --git a/terratorch_iterate/iterate2.py b/terratorch_iterate/iterate2.py new file mode 100644 index 0000000..b9741bf --- /dev/null +++ b/terratorch_iterate/iterate2.py @@ -0,0 +1,387 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import subprocess +import re +from pathlib import Path +from typing import Dict, Any, Optional, Literal + +import optuna +import yaml + +# ============================================================ +# CLI +# ============================================================ + +def parse_args(): + parser = argparse.ArgumentParser( + description="Generic Optuna HPO launcher with pluggable execution backend" + ) + + # ------------------------ + # Execution config + # ------------------------ + parser.add_argument("--script", required=True, help="Training script to execute") + parser.add_argument("--root-dir", default=None, help="Root dir (derived if omitted)") + parser.add_argument("--venv", default=".venv", help="Virtualenv dir, default: .venv (set empty to disable)") + parser.add_argument( + "--wlm", + choices=["lsf", "slurm", "openshift", "none"], + default="none", + help="Workload manager", + ) + parser.add_argument("--gpu-count", type=int, default=1, help="GPUs per trial") + parser.add_argument("--cpu-count", type=int, default=4, help="CPUs per trial") + parser.add_argument("--mem-gb", type=int, default=128, help="Memory (GB) per trial") + + # ------------------------ + # Optuna config + # ------------------------ + parser.add_argument("--optuna-study-name", required=True) + parser.add_argument("--optuna-db-path", required=True) + parser.add_argument("--optuna-n-trials", type=int, default=100) + + # ------------------------ + # HPO space + # ------------------------ + parser.add_argument( + "--hpo-json", + type=str, + default=None, + help="HPO 
def build_launcher_command(
    *,
    wlm: Literal["lsf", "slurm", "openshift", "none"],
    cmd: str,
    trial_id: int,
    out_file: str,
    err_file: str,
    gpu_count: int = 1,
    cpu_count: int = 4,
    mem_gb: int = 128,
):
    """
    Wrap a trial's shell command in the submission line of a workload manager.

    Args:
        wlm: Workload manager: "lsf" (bsub), "slurm" (srun), "openshift"
            (not implemented) or "none" (plain ``bash -c``).
        cmd: Shell command to run for the trial.
        trial_id: Trial number, used to name the job ``hpo_trial_<id>``.
        out_file: File receiving the trial's stdout.
        err_file: File receiving the trial's stderr.
        gpu_count: GPUs to request; 0 requests none.
        cpu_count: CPUs to request.
        mem_gb: Memory to request, in GB.

    Returns:
        A single shell command line as a string. With ``wlm="none"`` the CPU,
        GPU and memory values are not used.

    Raises:
        NotImplementedError: For ``wlm="openshift"``.
        ValueError: For any other unknown ``wlm`` value.
    """
    # Function-scope import so the block does not depend on the file header.
    import shlex

    # Quote everything that is spliced into the command line so that spaces,
    # double quotes or '$' in the command or in paths cannot break it.
    out_q = shlex.quote(str(out_file))
    err_q = shlex.quote(str(err_file))
    job_name = f"hpo_trial_{trial_id}"

    if wlm == "lsf":
        resources = [f"cpu={cpu_count}", f"mem={mem_gb}GB"]
        gpu_opt = ""
        # "> 0", not "> 1": a single GPU (the default) must be requested too.
        if gpu_count > 0:
            gpu_opt = f"-gpu num={gpu_count} "
            resources.insert(0, f"ngpus={gpu_count}")
        rusage = ", ".join(resources)
        return (
            f"bsub {gpu_opt}-K "
            f"-o {out_q} -e {err_q} "
            f"-R \"rusage[{rusage}]\" "
            f"-J {job_name} "
            f"{shlex.quote(cmd)}"
        )

    if wlm == "slurm":
        # Leave --gres out for CPU-only trials instead of asking for gpu:0.
        gres_opt = f"--gres=gpu:{gpu_count} " if gpu_count > 0 else ""
        return (
            f"srun {gres_opt}--cpus-per-task={cpu_count} --mem={mem_gb}G "
            f"--job-name={job_name} "
            f"--output={out_q} --error={err_q} "
            f"bash -c {shlex.quote(cmd)}"
        )

    if wlm == "openshift":
        raise NotImplementedError("OpenShift launcher not implemented yet")

    if wlm == "none":
        # The redirections are part of the inner command so they are applied
        # by the bash that runs the trial.
        inner = f"{cmd} > {out_q} 2> {err_q}"
        return f"bash -c {shlex.quote(inner)}"

    raise ValueError(f"Unknown workload manager: {wlm}")
def load_static_args(args) -> Dict[str, Any]:
    """
    Load the static arguments that are passed unchanged to every trial.

    Sources are tried in this order; the first one that is set wins:

    1. ``args.static_args_json`` - a JSON string.
    2. ``args.static_args_yaml`` - path to a YAML file.
    3. ``args.hpo_yaml`` - fallback: the ``static`` section of the HPO file.

    For sources 1 and 2 the payload may either be the key-value mapping
    itself or wrap it in a top-level ``static`` key. For the HPO-file
    fallback only the ``static`` section is used.

    Args:
        args: Parsed CLI namespace providing ``static_args_json``,
            ``static_args_yaml`` and ``hpo_yaml``.

    Returns:
        A dict of argument name -> value; empty if nothing is configured.

    Raises:
        ValueError: If the configured static arguments are not a mapping.
    """
    data = None
    from_hpo_file = False

    if args.static_args_json:
        data = json.loads(args.static_args_json)
    elif args.static_args_yaml:
        with open(args.static_args_yaml, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)
    elif args.hpo_yaml:  # Fallback: check the HPO file for a 'static' section
        from_hpo_file = True
        with open(args.hpo_yaml, "r", encoding="utf-8") as f:
            data = yaml.safe_load(f)

    if isinstance(data, dict) and "static" in data:
        data = data["static"]
    elif from_hpo_file:
        # An HPO file without a 'static' section holds no static arguments.
        # Returning the whole file here would hand the search-space specs to
        # the script as if they were CLI arguments.
        data = None

    # Covers "nothing configured", an empty file and an explicit 'static: null'.
    if not data:
        return {}

    if not isinstance(data, dict):
        raise ValueError(
            "Static arguments must be a mapping of name -> value, "
            f"got {type(data).__name__}"
        )
    return data
def extract_metric_from_log(path, metric: str):
    """
    Return the first value logged for ``metric`` in the file at ``path``.

    A value is recognised when the metric name is followed by optional
    whitespace, one of ``:``, ``=`` or a space, optional whitespace and a
    number, e.g. ``yval: 0.5``, ``yval=1e-3`` or ``yval 3``.

    Args:
        path: Log file to scan. It is read as UTF-8; undecodable bytes are
            ignored.
        metric: Exact metric name. It is matched literally and must not be
            the tail of a longer identifier, so ``loss`` does not match
            ``train_loss``.

    Returns:
        The first matching value as a float. ``nan`` and ``inf`` are returned
        as the corresponding float values.

    Raises:
        RuntimeError: If no value for ``metric`` is found.
    """
    number = (
        r"[-+]?(?:"
        r"(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"  # 12, 1.5, 1., .5, 1e-3
        r"|(?i:nan|inf(?:inity)?)\b"             # non-finite values
        r")"
    )
    # (?<!\w) keeps a short name from matching inside a longer one.
    pattern = re.compile(rf"(?<!\w){re.escape(metric)}\s*[:= ]\s*({number})")

    # Read the whole file: \s* may span a line break between name and value.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        text = f.read()

    # Only the first occurrence is needed, so search instead of findall.
    match = pattern.search(text)
    if match is None:
        raise RuntimeError(f"Metric '{metric}' not found in {path}")

    return float(match.group(1))
build_shell_command( + root_dir=root_dir, + script_path=script_path, + venv=args.venv if args.venv else None, + script_args=script_args, + ) + + launcher_cmd = build_launcher_command( + wlm=args.wlm, + cmd=shell_cmd, + trial_id=trial.number, + out_file=out_file, + err_file=err_file, + gpu_count=args.gpu_count, + cpu_count=args.cpu_count, + mem_gb=args.mem_gb, + ) + + # Execute trial + print(f"Trial {trial.number}: Executing command") + print(f" HPO params: {hpo_space.keys()}") + subprocess.run(launcher_cmd, shell=True, check=True) + + # Extract and return metric + metric_value = extract_metric_from_log(out_file, args.metric) + print(f"Trial {trial.number}: {args.metric} = {metric_value}") + + return metric_value + + # Create or load Optuna study + study = optuna.create_study( + study_name=args.optuna_study_name, + storage=args.optuna_db_path, + direction="maximize", + load_if_exists=True, + ) + + # Run optimization + study.optimize(objective, n_trials=args.optuna_n_trials) + + # Print best results + print("\n" + "="*60) + print("OPTIMIZATION COMPLETE") + print("="*60) + print(f"Best trial: {study.best_trial.number}") + print(f"Best value: {study.best_value}") + print(f"Best params:") + for key, value in study.best_params.items(): + print(f" {key}: {value}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/terratorch_iterate/main.py b/terratorch_iterate/main.py index 8f8c105..9dabddb 100644 --- a/terratorch_iterate/main.py +++ b/terratorch_iterate/main.py @@ -171,6 +171,7 @@ def _convert_config(args: Namespace): def main(): + print("DEPRECATED: iterate-classic is deprecated. 
Please use iterate instead.") parser = ArgumentParser() parser.add_argument("--defaults", type=Defaults) # to ignore model diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py index acc342d..dffbbe6 100644 --- a/tests/integration/test_main.py +++ b/tests/integration/test_main.py @@ -78,7 +78,7 @@ def test_main( abs_path = repo_home_dir / storage_uri storage_uri = str(abs_path.resolve()) experiment_name = config_data["experiment_name"] - arguments = ["terratorch", "--config", str(config_file.resolve())] + arguments = ["iterate-classic", "--config", str(config_file.resolve())] if hpo: arguments.insert(1, "--hpo") sys.argv = arguments diff --git a/tests/unit/test_iterate2.py b/tests/unit/test_iterate2.py new file mode 100644 index 0000000..cb0fe22 --- /dev/null +++ b/tests/unit/test_iterate2.py @@ -0,0 +1,19 @@ +import os + +def test_iterate2( ): + script = """ +iterate \ + --script ./examples/bumpy_function.py \ + --root-dir . \ + --optuna-study-name hpo \ + --optuna-db-path "sqlite:///iterate_study.db" \ + --hpo-yaml examples/bumpy_hpo.yaml \ + --optuna-n-trials 10 \ + --wlm none \ + --metric yval \ + --gpu-count 0 \ + --cpu-count 1 \ + --mem-gb 1 + """ + ret = os.system(script) + assert ret == 0 \ No newline at end of file