diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 00000000..cae17f4b --- /dev/null +++ b/tests/README.md @@ -0,0 +1,75 @@ +# YALIS Tests + +## Performance Regression Tests + +The performance tests live in `tests/performance/` and measure TTFT, TBT, +end-to-end latency, and throughput across a matrix of batch sizes, prompt +lengths, and decode lengths. + +### Quick start + +Performance testing is a two-step workflow: first record baselines on a +known-good branch, then compare against those baselines on a feature branch. + +#### 1. Generate baselines (on the develop / baseline branch) + +```bash +PERF_PYTEST_ARGS="--perf-update-baselines" ./tests/scripts/run_perf_regression_tests.sh +``` + +This runs every benchmark combination and writes the results to +`tests/performance/baselines/perf_baselines.json`. + +#### 2. Check for regressions (on your feature branch) + +```bash +./tests/scripts/run_perf_regression_tests.sh +``` + +Any metric that regresses beyond the tolerance (default 10 %) will cause the +test to fail with a detailed report. + +### Configuration + +The script uses `srun` to launch on a Slurm cluster. You can control the +number of GPUs with the `GPUS` environment variable: + +```bash +GPUS=4 ./tests/scripts/run_perf_regression_tests.sh +``` + +Additional pytest options can be passed through the `PERF_PYTEST_ARGS` +environment variable: + +| Option | Default | Description | +| ----------------------------- | ----------------------------------------------- | ------------------------------------------------- | +| `--perf-update-baselines` | off | Record new baselines instead of comparing. | +| `--perf-tolerance FLOAT` | `0.10` | Max allowed regression fraction (10 %). | +| `--perf-warmup-iters INT` | `3` | Warmup iterations before measurement. | +| `--perf-measure-iters INT` | `5` | Measurement iterations for averaging. 
To override these, create a `pytest.ini` with a `[pytest]` section (or add a
`[tool.pytest.ini_options]` table to `pyproject.toml`) containing the desired
values, or pass a custom configuration file with `-c PATH` through
`PERF_PYTEST_ARGS`.
def pytest_sessionstart(session):
    """Reject a non-positive ``--perf-measure-iters`` at session start.

    The option is read from ``session.config`` (falling back to 5 when it
    is not registered).  Any value below 1 aborts the run with
    ``pytest.UsageError``.
    """
    measure_iters = session.config.getoption(
        "--perf-measure-iters", default=5
    )
    if measure_iters >= 1:
        return
    raise pytest.UsageError("--perf-measure-iters must be at least 1")
class BaselineStore:
    """Thin wrapper around a JSON file that holds perf baselines.

    The in-memory document always has two top-level sections:
    ``"metadata"`` (a dict describing how the baselines were recorded) and
    ``"benchmarks"`` (a dict mapping a benchmark key to its stored entry).
    Nothing is written to disk until :meth:`flush` is called.
    """

    def __init__(self, path):
        """Remember *path* and load the document stored there, if any."""
        self.path = path
        self._data = self._load()

    # -- persistence ------------------------------------------------ #

    def _load(self):
        """Return the stored document, or an empty one if no file exists.

        Missing top-level sections are filled in so that ``get``, ``put``
        and ``set_metadata`` never hit a ``KeyError`` on a partially
        written or hand-edited file.
        """
        if os.path.exists(self.path):
            with open(self.path, encoding="utf-8") as f:
                data = json.load(f)
        else:
            data = {}
        data.setdefault("metadata", {})
        data.setdefault("benchmarks", {})
        return data

    def flush(self):
        """Write the current document to ``self.path`` as indented JSON."""
        parent = os.path.dirname(self.path)
        # A bare filename has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError -- only create the directory when there is one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(self.path, "w", encoding="utf-8") as f:
            json.dump(self._data, f, indent=2)

    # -- read / write ----------------------------------------------- #

    def get(self, key):
        """Return the entry stored under *key*, or ``None`` if absent."""
        return self._data["benchmarks"].get(key)

    def put(self, key, entry):
        """Store *entry* under *key*, replacing any previous entry."""
        self._data["benchmarks"][key] = entry

    def set_metadata(self, **kwargs):
        """Merge the given keyword arguments into the metadata section."""
        self._data["metadata"].update(kwargs)
@pytest.fixture(scope="session")
def baseline_store(
    request, model_id, dtype, attn_backend, use_paged_kv_caching
):
    """Provide the session's ``BaselineStore`` and persist it afterwards.

    The store is opened at ``--perf-baseline-path``.  With
    ``--perf-update-baselines`` its metadata is stamped with the current
    configuration, a UTC timestamp and the git commit; otherwise the stored
    metadata is validated against the current configuration.  After the
    session the store is flushed only in update mode and only on rank zero.
    """
    options = request.config
    store = BaselineStore(options.getoption("--perf-baseline-path"))
    recording = options.getoption("--perf-update-baselines")

    # The rank comes from launcher-provided environment variables
    # (torchrun / SLURM) rather than torch.distributed: the process group
    # may not be initialised yet when this session fixture is created, and
    # may already be destroyed (cleanup_dist is module-scoped) at teardown.
    env = os.environ
    launcher_rank = env.get("RANK", env.get("SLURM_PROCID", "0"))
    on_rank_zero = int(launcher_rank) == 0

    if not recording:
        _validate_baseline_config(
            store, model_id, dtype, attn_backend, use_paged_kv_caching
        )
    else:
        commit = _git_sha()
        stamp = datetime.now(timezone.utc).isoformat()
        store.set_metadata(
            **{
                "model": model_id,
                "attention_backend": attn_backend,
                "precision": dtype,
                "use_paged_kv_caching": use_paged_kv_caching,
                "updated_at": stamp,
                "git_commit": commit,
            }
        )

    yield store

    if recording and on_rank_zero:
        store.flush()
+ + Raises ``pytest.UsageError`` on mismatch so the session fails + immediately rather than producing misleading comparisons. + """ + meta = store._data.get("metadata", {}) + if not meta: + return # no baselines yet — nothing to validate + + checks = { + "model": (meta.get("model"), model_id), + "precision": (meta.get("precision"), dtype), + "attention_backend": (meta.get("attention_backend"), attn_backend), + "use_paged_kv_caching": ( + meta.get("use_paged_kv_caching"), + use_paged_kv_caching, + ), + } + + mismatches = [] + for field, (stored, current) in checks.items(): + if stored is None: + # Baseline was created before this field was tracked — skip. + continue + if stored != current: + mismatches.append( + f" {field}: baseline={stored!r}, current={current!r}" + ) + + if mismatches: + detail = "\n".join(mismatches) + raise pytest.UsageError( + f"Baseline config mismatch — the stored baselines were " + f"recorded with a different configuration:\n{detail}\n" + f"Re-run with --perf-update-baselines to regenerate." + ) + + +def _git_sha(): + try: + return ( + subprocess.check_output( + ["git", "rev-parse", "--short", "HEAD"], + stderr=subprocess.DEVNULL, + ) + .decode() + .strip() + ) + except Exception: + return "unknown" diff --git a/tests/performance/test_perf_regression.py b/tests/performance/test_perf_regression.py new file mode 100644 index 00000000..75d2b7e2 --- /dev/null +++ b/tests/performance/test_perf_regression.py @@ -0,0 +1,222 @@ +""" +Performance regression tests for YALIS. + +Workflow +-------- +1. On the develop (baseline) branch, generate baselines:: + + pytest tests/performance/ --perf-update-baselines + +2. On your feature branch, run the tests to check for regressions:: + + pytest tests/performance/ + + Any metric that regresses beyond the tolerance (default 10 %) + will cause the test to fail with a detailed report. + +Options +------- +--perf-tolerance FLOAT Allowed regression fraction (default 0.10). 
+--perf-warmup-iters INT Warmup iterations (default 3). +--perf-measure-iters INT Measurement iterations (default 5). +--perf-baseline-path PATH Path to the baselines JSON file. +""" + +import pytest +import torch.distributed as dist + +from tests.basic_correctness.utils import alpaca_prompt + +BATCH_SIZES = [1, 8] +PROMPT_LENGTHS = [128, 512] +DECODE_LENGTHS = [32, 128] + +# Metrics where *lower* is better (latencies). +_LOWER_IS_BETTER = {"ttft_ms", "tbt_ms", "e2e_ms"} +# Metrics where *higher* is better (throughput). +_HIGHER_IS_BETTER = {"throughput_tps"} + +_ALL_METRICS = [ + ("ttft_ms", "TTFT"), + ("tbt_ms", "TBT"), + ("throughput_tps", "Throughput"), + ("e2e_ms", "E2E"), +] + + +def _bench_key(batch_size, prompt_length, decode_length): + return ( + f"batch_{batch_size}" + f"_prompt_{prompt_length}" + f"_decode_{decode_length}" + ) + + +def _run_iterations(engine, prompts, decode_length, n_iters): + """Run *n_iters* generate calls and return the list of metric dicts.""" + collected = [] + for _ in range(n_iters): + _, metrics = engine.generate( + prompts, + report_throughput=False, + tokens_to_generate=decode_length, + ignore_eos=True, + ) + collected.append(metrics) + return collected + + +def _average_metrics(metrics_list): + n = len(metrics_list) + return { + "ttft_ms": sum(m["TTFT"] for m in metrics_list) / n, + "tbt_ms": sum(m["TBT"] for m in metrics_list) / n, + "throughput_tps": sum(m["Throughput"] for m in metrics_list) / n, + "e2e_ms": sum(m["E2E"] for m in metrics_list) / n, + } + + +def _check_regressions(baseline, current, tolerance): + """Return a list of (metric, baseline_val, current_val, pct) tuples + for every metric that regressed beyond *tolerance*.""" + regressions = [] + for key, label in _ALL_METRICS: + base_val = baseline[key] + curr_val = current[key] + + if base_val == 0: + continue + + if key in _LOWER_IS_BETTER: + pct = (curr_val - base_val) / base_val + regressed = pct > tolerance + else: + pct = (base_val - curr_val) / base_val + 
def _format_report(key, current, baseline, regressions, tolerance):
    """Render the failure report for one benchmark as a multi-line string.

    One row is produced per entry of ``_ALL_METRICS`` showing the baseline
    value, the current value and the relative change.  Rows whose label
    appears as the first element of a tuple in *regressions* are tagged as
    regressions; rows with a zero baseline show ``N/A`` instead of a
    percentage.  The tolerance is printed at the end.
    """

    def _is_regressed(label):
        return any(row[0] == label for row in regressions)

    report = [
        f"Performance regression detected for [{key}]:",
        "",
        f" {'Metric':<14} {'Baseline':>12} {'Current':>12} {'Change':>10}",
        f" {'-'*50}",
    ]

    for metric_key, label in _ALL_METRICS:
        before = baseline[metric_key]
        after = current[metric_key]

        if before == 0:
            # No meaningful relative change against a zero baseline.
            report.append(
                f" {label:<14} {before:>12.4f} {after:>12.4f}"
                f" N/A"
            )
            continue

        change = (after - before) / before
        tail = " << REGRESSION" if _is_regressed(label) else ""
        report.append(
            f" {label:<14} {before:>12.4f} {after:>12.4f}"
            f" {change:>+9.1%}{tail}"
        )

    report.extend(["", f" Tolerance: {tolerance:.0%}"])
    return "\n".join(report)
) + + # --- warmup --------------------------------------------------- # + _run_iterations(perf_engine, prompts, decode_length, warmup_iters) + + # --- measure -------------------------------------------------- # + raw = _run_iterations(perf_engine, prompts, decode_length, measure_iters) + current = _average_metrics(raw) + + # Only rank 0 performs the baseline comparison / update. + if dist.is_initialized() and dist.get_rank() != 0: + return + + # --- update mode: store and return ---------------------------- # + if update_mode: + baseline_store.put( + key, + { + "batch_size": batch_size, + "prompt_length": prompt_length, + "decode_length": decode_length, + **current, + }, + ) + perf_results.append( + { + "key": key, + "metrics": [ + (label, current[mkey]) for mkey, label in _ALL_METRICS + ], + } + ) + return + + # --- compare mode --------------------------------------------- # + baseline = baseline_store.get(key) + if baseline is None: + pytest.skip( + f"No baseline for {key}. " + "Run with --perf-update-baselines first." 
#!/bin/bash
# Launch the performance regression tests through srun.
#
# Environment variables read:
#   GPUS              number of tasks/GPUs to request (default: 1)
#   MASTER_PORT       rendezvous port (default: 29500)
#   YALIS_CACHE       YALIS cache directory (default: the path below)
#   PERF_PYTEST_ARGS  extra flags appended to the pytest command line
#   SCRATCH           base directory for the Hugging Face / inductor caches
NNODES=1
GPUS_DEFAULT=1
GPUS=${GPUS:-$GPUS_DEFAULT}


export MASTER_ADDR=$(hostname)
# Overridable so that concurrent runs on one node can pick distinct ports.
export MASTER_PORT="${MASTER_PORT:-29500}"
export WORLD_SIZE=${GPUS}

## nccl env vars to speedup stuff
export CUDA_DEVICE_MAX_CONNECTIONS=1
export NCCL_NET_GDR_LEVEL=PHB
export NCCL_CROSS_NIC=1
export NCCL_SOCKET_IFNAME=hsi
export MPICH_GPU_SUPPORT_ENABLED=0
# NOTE(review): this is fixed at device 0 even when GPUS > 1; presumably
# tests/get_rank_tests.sh remaps devices per rank -- confirm.
export CUDA_VISIBLE_DEVICES=0

# NOTE(review): if SCRATCH is unset these resolve to paths under "/".
export HF_HOME="$SCRATCH/hf_cache"
export TRANSFORMERS_HOME="$SCRATCH/hf_cache"
export HF_DATASETS_CACHE="$SCRATCH/hf_cache"
# Respect a caller-provided YALIS_CACHE; fall back to the original default.
export YALIS_CACHE="${YALIS_CACHE:-/pscratch/sd/p/prajwal/SpecDec/yalis/yalis/external}"
export TORCHINDUCTOR_CACHE_DIR="${SCRATCH}/.cache/torch_inductor"

SCRIPT="tests/performance/test_perf_regression.py"

export PYTHONPATH="$PYTHONPATH:."

chmod +x tests/get_rank_tests.sh

# Pass extra pytest flags via PERF_PYTEST_ARGS, e.g.:
#   PERF_PYTEST_ARGS="--perf-update-baselines" ./tests/scripts/run_perf_regression_tests.sh
#   PERF_PYTEST_ARGS="--perf-tolerance 0.15"   ./tests/scripts/run_perf_regression_tests.sh
EXTRA_ARGS=${PERF_PYTEST_ARGS:-}

perf_cmd="NCCL_CUMEM_ENABLE=0 srun -N $NNODES -n $GPUS -G $GPUS -c 16 --cpu-bind=cores ./tests/get_rank_tests.sh pytest $SCRIPT $EXTRA_ARGS"
# Quote the expansions so the command is printed and evaluated exactly as
# built, without an extra round of word splitting and glob expansion.
echo "$perf_cmd"
eval "$perf_cmd"