diff --git a/tests/README.md b/tests/README.md
new file mode 100644
index 00000000..cae17f4b
--- /dev/null
+++ b/tests/README.md
@@ -0,0 +1,75 @@
+# YALIS Tests
+
+## Performance Regression Tests
+
+The performance tests live in `tests/performance/` and measure TTFT, TBT,
+end-to-end latency, and throughput across a matrix of batch sizes, prompt
+lengths, and decode lengths.
+
+### Quick start
+
+Performance testing is a two-step workflow: first record baselines on a
+known-good branch, then compare against those baselines on a feature branch.
+
+#### 1. Generate baselines (on the develop / baseline branch)
+
+```bash
+PERF_PYTEST_ARGS="--perf-update-baselines" ./tests/scripts/run_perf_regression_tests.sh
+```
+
+This runs every benchmark combination and writes the results to
+`tests/performance/baselines/perf_baselines.json`.
+
+#### 2. Check for regressions (on your feature branch)
+
+```bash
+./tests/scripts/run_perf_regression_tests.sh
+```
+
+Any metric that regresses beyond the tolerance (default 10 %) will cause the
+test to fail with a detailed report.
+
+### Configuration
+
+The script uses `srun` to launch the tests on a Slurm cluster. Set the `GPUS`
+environment variable to choose the number of GPUs (default: 1):
+
+```bash
+GPUS=4 ./tests/scripts/run_perf_regression_tests.sh
+```
+
+Additional pytest options can be passed through the `PERF_PYTEST_ARGS`
+environment variable:
+
+| Option                        | Default                                         | Description                                       |
+| ----------------------------- | ----------------------------------------------- | ------------------------------------------------- |
+| `--perf-update-baselines`     | off                                             | Record new baselines instead of comparing.        |
+| `--perf-tolerance FLOAT`      | `0.10`                                          | Max allowed regression fraction (10 %).           |
+| `--perf-warmup-iters INT`     | `3`                                             | Warmup iterations before measurement.             |
+| `--perf-measure-iters INT`    | `5`                                             | Measurement iterations for averaging.             |
+| `--perf-baseline-path PATH`   | `tests/performance/baselines/perf_baselines.json` | Path to the baselines JSON file.                |
+
+Example — tighter tolerance with more measurement iterations:
+
+```bash
+PERF_PYTEST_ARGS="--perf-tolerance 0.05 --perf-measure-iters 10" \
+  ./tests/scripts/run_perf_regression_tests.sh
+```
+
+The model, precision, and attention backend are configured via pytest ini
+settings. The defaults (set in `tests/performance/conftest.py`) are:
+
+| Setting                  | Default                              |
+| ------------------------ | ------------------------------------ |
+| `model`                  | `meta-llama/Llama-3.1-8B-Instruct`  |
+| `dtype`                  | `bf16`                               |
+| `attn_backend`           | `sdpa`                               |
+| `use_paged_kv_caching`   | `False`                              |
+
+To override these, create a `pytest.ini` (or add a `[tool.pytest.ini_options]`
+table to `pyproject.toml`) with the desired values, or pass a custom
+`-c <ini-file>` through `PERF_PYTEST_ARGS`.
+
+## Correctness Tests
+
+<!-- TODO: Add correctness test documentation -->
diff --git a/tests/performance/__init__.py b/tests/performance/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/performance/baselines/.gitkeep b/tests/performance/baselines/.gitkeep
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/performance/conftest.py b/tests/performance/conftest.py
new file mode 100644
index 00000000..f3cca2d3
--- /dev/null
+++ b/tests/performance/conftest.py
@@ -0,0 +1,344 @@
+import json
+import os
+import subprocess
+from datetime import datetime, timezone
+
+import pytest
+import torch.distributed as dist
+from transformers import AutoTokenizer
+
+from yalis import ModelConfig, InferenceConfig, LLMEngine
+from tests.sample_dataset import AlpacaDataset
+
+BASELINE_DIR = os.path.join(os.path.dirname(__file__), "baselines")
+DEFAULT_BASELINE_PATH = os.path.join(BASELINE_DIR, "perf_baselines.json")
+
+_PERF_RESULTS_KEY = pytest.StashKey[list]()  # stash slot for perf results
+
+
+# ------------------------------------------------------------------ #
+#  Hooks                                                              #
+# ------------------------------------------------------------------ #
+
+
+def pytest_configure(config):
+    """Create the session-wide stash list that tests append results to."""
+    config.stash[_PERF_RESULTS_KEY] = []
+
+
+def pytest_sessionstart(session):
+    """Validate CLI options that must be positive."""
+    val = session.config.getoption("--perf-measure-iters", default=5)
+    if val < 1:
+        raise pytest.UsageError("--perf-measure-iters must be at least 1")
+
+
+def pytest_terminal_summary(terminalreporter, config):
+    """Print a performance comparison table after the test run."""
+    results = config.stash.get(_PERF_RESULTS_KEY, [])
+    if not results:
+        return
+
+    write = terminalreporter.write_line
+    update_mode = config.getoption("--perf-update-baselines", default=False)
+
+    if update_mode:
+        write("")
+        write("=== Performance baselines saved ===", bold=True)
+        for entry in results:
+            write(f"  [{entry['key']}]")
+            for label, value in entry["metrics"]:
+                write(f"    {label:<14} {value:>12.4f}")
+        write("")
+    else:
+        tolerance = config.getoption("--perf-tolerance", default=0.10)
+        write("")
+        write("=== Performance comparison ===", bold=True)
+        write(
+            f"  {'Benchmark':<40} {'Metric':<14}"
+            f" {'Baseline':>12} {'Current':>12} {'Change':>10}"
+        )
+        write(f"  {'-' * 92}")
+        for entry in results:
+            first = True
+            for label, base_val, curr_val, pct in entry["comparisons"]:
+                tag = entry["key"] if first else ""
+                marker = " !!" if abs(pct) > tolerance else ""
+                write(
+                    f"  {tag:<40} {label:<14}"
+                    f" {base_val:>12.4f} {curr_val:>12.4f}"
+                    f" {pct:>+9.1%}{marker}"
+                )
+                first = False
+        write("")
+        write(f"  Tolerance: {tolerance:.0%}")
+        write("")
+
+
+# ------------------------------------------------------------------ #
+#  CLI options                                                        #
+# ------------------------------------------------------------------ #
+
+
+def pytest_addoption(parser):
+    parser.addini(
+        "model",
+        "Model to use for the test",
+        type="string",
+        default="meta-llama/Llama-3.1-8B-Instruct",
+    )
+    parser.addini(
+        "dtype",
+        "Data type to use for the test",
+        type="string",
+        default="bf16",
+    )
+    parser.addini(
+        "attn_backend",
+        "Attention backend to use for the test",
+        type="string",
+        default="sdpa",
+    )
+    parser.addini(
+        "use_paged_kv_caching",
+        "Enable paged KV caching (requires flash backend)",
+        type="bool",
+        default=False,
+    )
+    parser.addoption(
+        "--perf-update-baselines",
+        action="store_true",
+        default=False,
+        help="Update performance baselines instead of comparing.",
+    )
+    parser.addoption(
+        "--perf-tolerance",
+        type=float,
+        default=0.10,
+        help="Max allowed regression fraction (default: 0.10 = 10%%).",
+    )
+    parser.addoption(
+        "--perf-warmup-iters",
+        type=int,
+        default=3,
+        help="Warmup iterations before measurement (default: 3).",
+    )
+    parser.addoption(
+        "--perf-measure-iters",
+        type=int,
+        default=5,
+        help="Measurement iterations for averaging (default: 5, min: 1).",
+    )
+    parser.addoption(
+        "--perf-baseline-path",
+        type=str,
+        default=DEFAULT_BASELINE_PATH,
+        help="Path to the baseline JSON file.",
+    )
+
+
+# ------------------------------------------------------------------ #
+#  Baseline store                                                     #
+# ------------------------------------------------------------------ #
+
+
+class BaselineStore:
+    """Thin wrapper around a JSON file that holds perf baselines."""
+
+    def __init__(self, path):
+        self.path = path
+        self._data = self._load()
+
+    # -- persistence ------------------------------------------------ #
+
+    def _load(self):
+        if os.path.exists(self.path):
+            with open(self.path) as f:
+                return json.load(f)
+        return {"metadata": {}, "benchmarks": {}}
+
+    def flush(self):
+        os.makedirs(os.path.dirname(self.path), exist_ok=True)
+        with open(self.path, "w") as f:
+            json.dump(self._data, f, indent=2)
+
+    # -- read / write ----------------------------------------------- #
+
+    def get(self, key):
+        return self._data["benchmarks"].get(key)
+
+    def put(self, key, entry):
+        self._data["benchmarks"][key] = entry
+
+    def set_metadata(self, **kwargs):
+        self._data["metadata"].update(kwargs)
+
+
+# ------------------------------------------------------------------ #
+#  Fixtures                                                           #
+# ------------------------------------------------------------------ #
+
+
+@pytest.fixture(scope="module", autouse=True)
+def cleanup_dist():  # autouse: wraps every test module without being requested
+    yield  # everything below runs as module teardown
+    if dist.is_initialized():
+        dist.barrier()  # synchronise the ranks before tearing the group down
+        dist.destroy_process_group()
+
+
+@pytest.fixture(scope="session")
+def model_id(request):  # value of the "model" ini setting
+    return request.config.getini("model")
+
+
+@pytest.fixture(scope="session")
+def dtype(request):  # value of the "dtype" ini setting, lower-cased
+    return request.config.getini("dtype").lower()
+
+
+@pytest.fixture(scope="session")
+def attn_backend(request):  # "attn_backend" ini setting, lower-cased
+    return request.config.getini("attn_backend").lower()
+
+
+@pytest.fixture(scope="session")
+def use_paged_kv_caching(request):  # ini setting registered with type "bool"
+    return request.config.getini("use_paged_kv_caching")
+
+
+@pytest.fixture(scope="module")
+def perf_engine(model_id, dtype, attn_backend, use_paged_kv_caching):
+    """LLMEngine configured for performance measurement."""
+    model_config = ModelConfig(model_name=model_id, precision=dtype)
+    inference_config = InferenceConfig(
+        max_batch_size=8,
+        max_length_of_generated_sequences=2048,
+        top_p=0.0,
+        temperature=0.0,
+        tp_dims=None,
+        attention_backend=attn_backend,
+        use_paged_kv_caching=use_paged_kv_caching,
+        prestore_kv_cache=True,
+    )
+    return LLMEngine(
+        model_config=model_config,
+        inference_config=inference_config,
+    )
+
+
+@pytest.fixture(scope="session")
+def tokenizer(model_id):
+    tok = AutoTokenizer.from_pretrained(model_id)
+    tok.pad_token = tok.eos_token
+    tok.padding_side = "left"
+    return tok
+
+
+@pytest.fixture(scope="session")
+def alpaca_dataset():  # one AlpacaDataset per session, fixed random_seed
+    return AlpacaDataset(random_seed=42)
+
+
+@pytest.fixture(scope="session")
+def perf_results(request):
+    """Return the stash list that pytest_terminal_summary later prints."""
+    return request.config.stash[_PERF_RESULTS_KEY]
+
+
+@pytest.fixture(scope="session")
+def baseline_store(
+    request, model_id, dtype, attn_backend, use_paged_kv_caching
+):
+    """Load (or create) the baseline store and flush on teardown."""
+    path = request.config.getoption("--perf-baseline-path")
+    store = BaselineStore(path)
+
+    update = request.config.getoption("--perf-update-baselines")
+
+    # Determine rank from env vars set by the launcher (torchrun / SLURM).
+    # dist may not yet be initialised (session fixture created early) or
+    # already torn down (cleanup_dist is module-scoped), so we must not
+    # rely on dist.is_initialized() / dist.get_rank().
+    _rank = os.environ.get("RANK", os.environ.get("SLURM_PROCID", "0"))
+    is_rank_zero = int(_rank) == 0
+
+    if update:
+        git_sha = _git_sha()
+        store.set_metadata(
+            model=model_id,
+            attention_backend=attn_backend,
+            precision=dtype,
+            use_paged_kv_caching=use_paged_kv_caching,
+            updated_at=datetime.now(timezone.utc).isoformat(),
+            git_commit=git_sha,
+        )
+    else:
+        _validate_baseline_config(
+            store, model_id, dtype, attn_backend, use_paged_kv_caching
+        )
+
+    yield store
+
+    if update and is_rank_zero:
+        store.flush()
+
+
+# ------------------------------------------------------------------ #
+#  Helpers                                                            #
+# ------------------------------------------------------------------ #
+
+
+def _validate_baseline_config(
+    store, model_id, dtype, attn_backend, use_paged_kv_caching
+):
+    """Verify the current test config matches the baseline metadata.
+
+    Raises ``pytest.UsageError`` on mismatch so the session fails
+    immediately rather than producing misleading comparisons.
+    """
+    meta = store._data.get("metadata", {})
+    if not meta:
+        return  # no baselines yet — nothing to validate
+
+    checks = {
+        "model": (meta.get("model"), model_id),
+        "precision": (meta.get("precision"), dtype),
+        "attention_backend": (meta.get("attention_backend"), attn_backend),
+        "use_paged_kv_caching": (
+            meta.get("use_paged_kv_caching"),
+            use_paged_kv_caching,
+        ),
+    }
+
+    mismatches = []
+    for field, (stored, current) in checks.items():
+        if stored is None:
+            # Baseline was created before this field was tracked — skip.
+            continue
+        if stored != current:
+            mismatches.append(
+                f"  {field}: baseline={stored!r}, current={current!r}"
+            )
+
+    if mismatches:
+        detail = "\n".join(mismatches)
+        raise pytest.UsageError(
+            f"Baseline config mismatch — the stored baselines were "
+            f"recorded with a different configuration:\n{detail}\n"
+            f"Re-run with --perf-update-baselines to regenerate."
+        )
+
+
+def _git_sha():
+    try:
+        return (
+            subprocess.check_output(
+                ["git", "rev-parse", "--short", "HEAD"],
+                stderr=subprocess.DEVNULL,
+            )
+            .decode()
+            .strip()
+        )
+    except Exception:
+        return "unknown"
diff --git a/tests/performance/test_perf_regression.py b/tests/performance/test_perf_regression.py
new file mode 100644
index 00000000..75d2b7e2
--- /dev/null
+++ b/tests/performance/test_perf_regression.py
@@ -0,0 +1,222 @@
+"""
+Performance regression tests for YALIS.
+
+Workflow
+--------
+1. On the develop (baseline) branch, generate baselines::
+
+       pytest tests/performance/ --perf-update-baselines
+
+2. On your feature branch, run the tests to check for regressions::
+
+       pytest tests/performance/
+
+   Any metric that regresses beyond the tolerance (default 10 %)
+   will cause the test to fail with a detailed report.
+
+Options
+-------
+--perf-tolerance FLOAT     Allowed regression fraction (default 0.10).
+--perf-warmup-iters INT    Warmup iterations (default 3).
+--perf-measure-iters INT   Measurement iterations (default 5).
+--perf-baseline-path PATH  Path to the baselines JSON file.
+"""
+
+import pytest
+import torch.distributed as dist
+
+from tests.basic_correctness.utils import alpaca_prompt
+
+BATCH_SIZES = [1, 8]  # these three lists parametrize test_perf_regression
+PROMPT_LENGTHS = [128, 512]
+DECODE_LENGTHS = [32, 128]
+
+# Metrics where *lower* is better (latencies).
+_LOWER_IS_BETTER = {"ttft_ms", "tbt_ms", "e2e_ms"}
+# Metrics where *higher* is better; no code in this module reads this set.
+_HIGHER_IS_BETTER = {"throughput_tps"}
+# (stored metric key, display label) pairs, in the order they are reported.
+_ALL_METRICS = [
+    ("ttft_ms", "TTFT"),
+    ("tbt_ms", "TBT"),
+    ("throughput_tps", "Throughput"),
+    ("e2e_ms", "E2E"),
+]
+
+
+def _bench_key(batch_size, prompt_length, decode_length):
+    return (
+        f"batch_{batch_size}"
+        f"_prompt_{prompt_length}"
+        f"_decode_{decode_length}"
+    )
+
+
+def _run_iterations(engine, prompts, decode_length, n_iters):
+    """Run *n_iters* generate calls and return the list of metric dicts."""
+    collected = []
+    for _ in range(n_iters):
+        _, metrics = engine.generate(
+            prompts,
+            report_throughput=False,
+            tokens_to_generate=decode_length,
+            ignore_eos=True,
+        )
+        collected.append(metrics)
+    return collected
+
+
+def _average_metrics(metrics_list):
+    n = len(metrics_list)
+    return {
+        "ttft_ms": sum(m["TTFT"] for m in metrics_list) / n,
+        "tbt_ms": sum(m["TBT"] for m in metrics_list) / n,
+        "throughput_tps": sum(m["Throughput"] for m in metrics_list) / n,
+        "e2e_ms": sum(m["E2E"] for m in metrics_list) / n,
+    }
+
+
+def _check_regressions(baseline, current, tolerance):
+    """Return a list of (metric, baseline_val, current_val, pct) tuples
+    for every metric that regressed beyond *tolerance*."""
+    regressions = []
+    for key, label in _ALL_METRICS:
+        base_val = baseline[key]
+        curr_val = current[key]
+
+        if base_val == 0:
+            continue
+
+        if key in _LOWER_IS_BETTER:
+            pct = (curr_val - base_val) / base_val
+            regressed = pct > tolerance
+        else:
+            pct = (base_val - curr_val) / base_val
+            regressed = pct > tolerance
+            pct = -pct  # show as negative when throughput drops
+
+        if regressed:
+            regressions.append((label, base_val, curr_val, pct))
+
+    return regressions
+
+
+def _format_report(key, current, baseline, regressions, tolerance):
+    """Build a human-readable report string."""
+    lines = [f"Performance regression detected for [{key}]:"]
+    lines.append("")
+    lines.append(
+        f"  {'Metric':<14} {'Baseline':>12} {'Current':>12} {'Change':>10}"
+    )
+    lines.append(f"  {'-'*50}")
+
+    for mkey, label in _ALL_METRICS:
+        base_val = baseline[mkey]
+        curr_val = current[mkey]
+        if base_val != 0:
+            pct = (curr_val - base_val) / base_val
+            marker = (
+                " << REGRESSION"
+                if any(r[0] == label for r in regressions)
+                else ""
+            )
+            lines.append(
+                f"  {label:<14} {base_val:>12.4f} {curr_val:>12.4f}"
+                f" {pct:>+9.1%}{marker}"
+            )
+        else:
+            lines.append(
+                f"  {label:<14} {base_val:>12.4f} {curr_val:>12.4f}"
+                f"       N/A"
+            )
+
+    lines.append("")
+    lines.append(f"  Tolerance: {tolerance:.0%}")
+    return "\n".join(lines)
+
+
+# ------------------------------------------------------------------ #
+#  Tests                                                              #
+# ------------------------------------------------------------------ #
+
+
+@pytest.mark.parametrize("batch_size", BATCH_SIZES)
+@pytest.mark.parametrize("prompt_length", PROMPT_LENGTHS)
+@pytest.mark.parametrize("decode_length", DECODE_LENGTHS)
+def test_perf_regression(
+    perf_engine,
+    tokenizer,
+    alpaca_dataset,
+    baseline_store,
+    perf_results,
+    batch_size,
+    prompt_length,
+    decode_length,
+    request,
+):  # one run per batch_size / prompt_length / decode_length combination
+    config = request.config
+    update_mode = config.getoption("--perf-update-baselines")
+    tolerance = config.getoption("--perf-tolerance")
+    warmup_iters = config.getoption("--perf-warmup-iters")
+    measure_iters = config.getoption("--perf-measure-iters")
+
+    key = _bench_key(batch_size, prompt_length, decode_length)
+
+    # Prompts come from the alpaca_prompt helper for this length and batch.
+    prompts = alpaca_prompt(
+        alpaca_dataset, tokenizer, prompt_length, batch_size
+    )
+
+    # Warmup: the same generate calls as below, with the metrics discarded.
+    _run_iterations(perf_engine, prompts, decode_length, warmup_iters)
+
+    # Measure: average each metric over measure_iters generate calls.
+    raw = _run_iterations(perf_engine, prompts, decode_length, measure_iters)
+    current = _average_metrics(raw)
+
+    # Every rank ran the calls above; only rank 0 compares or updates.
+    if dist.is_initialized() and dist.get_rank() != 0:
+        return
+
+    # Update mode: record this run as the new baseline, then stop.
+    if update_mode:
+        baseline_store.put(
+            key,
+            {
+                "batch_size": batch_size,
+                "prompt_length": prompt_length,
+                "decode_length": decode_length,
+                **current,
+            },
+        )
+        perf_results.append(
+            {
+                "key": key,
+                "metrics": [
+                    (label, current[mkey]) for mkey, label in _ALL_METRICS
+                ],
+            }
+        )
+        return
+
+    # Compare mode: skip the test when no baseline exists for this key.
+    baseline = baseline_store.get(key)
+    if baseline is None:
+        pytest.skip(
+            f"No baseline for {key}. "
+            "Run with --perf-update-baselines first."
+        )
+
+    regressions = _check_regressions(baseline, current, tolerance)
+    # Summary rows: signed relative change, 0 when the baseline is zero.
+    comparisons = []
+    for mkey, label in _ALL_METRICS:
+        base_val = baseline[mkey]
+        curr_val = current[mkey]
+        pct = (curr_val - base_val) / base_val if base_val != 0 else 0
+        comparisons.append((label, base_val, curr_val, pct))
+    perf_results.append({"key": key, "comparisons": comparisons})
+
+    if regressions:
+        report = _format_report(key, current, baseline, regressions, tolerance)
+        pytest.fail(report)
diff --git a/tests/scripts/run_perf_regression_tests.sh b/tests/scripts/run_perf_regression_tests.sh
new file mode 100755
index 00000000..6fb69ad4
--- /dev/null
+++ b/tests/scripts/run_perf_regression_tests.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Launch the perf regression tests via srun; run from the repository root.
+NNODES=1
+GPUS_DEFAULT=1
+GPUS=${GPUS:-$GPUS_DEFAULT}
+
+export MASTER_ADDR=$(hostname)
+export MASTER_PORT=29500
+export WORLD_SIZE=${GPUS}
+
+## nccl env vars to speedup stuff
+export CUDA_DEVICE_MAX_CONNECTIONS=1
+export NCCL_NET_GDR_LEVEL=PHB
+export NCCL_CROSS_NIC=1
+export NCCL_SOCKET_IFNAME=hsi
+export MPICH_GPU_SUPPORT_ENABLED=0
+export CUDA_VISIBLE_DEVICES=0  # NOTE(review): fixed to 0 even if GPUS > 1
+
+export HF_HOME="$SCRATCH/hf_cache"
+export TRANSFORMERS_HOME="$SCRATCH/hf_cache"
+export HF_DATASETS_CACHE="$SCRATCH/hf_cache"
+export YALIS_CACHE="${YALIS_CACHE:-/pscratch/sd/p/prajwal/SpecDec/yalis/yalis/external}"
+export TORCHINDUCTOR_CACHE_DIR="${SCRATCH}/.cache/torch_inductor"
+
+SCRIPT="tests/performance/test_perf_regression.py"
+
+export PYTHONPATH="$PYTHONPATH:."
+
+chmod +x tests/get_rank_tests.sh
+
+# Pass extra pytest flags via PERF_PYTEST_ARGS, e.g.:
+#   PERF_PYTEST_ARGS="--perf-update-baselines" ./tests/scripts/run_perf_regression_tests.sh
+#   PERF_PYTEST_ARGS="--perf-tolerance 0.15" ./tests/scripts/run_perf_regression_tests.sh
+EXTRA_ARGS=${PERF_PYTEST_ARGS:-}
+
+perf_cmd="NCCL_CUMEM_ENABLE=0 srun -N $NNODES -n $GPUS -G $GPUS -c 16 --cpu-bind=cores ./tests/get_rank_tests.sh pytest $SCRIPT $EXTRA_ARGS"
+echo "$perf_cmd"  # quoted: no word splitting or globbing before eval
+eval "$perf_cmd"