RouteWorks · RixinLiu · Dec 8, 2025 · Dec 3, 2025 · Dec 3, 2025 · Dec 3, 2025
diff --git a/.github/workflows/pr-evaluation.yml b/.github/workflows/pr-evaluation.yml
@@ -54,18 +54,49 @@ jobs:
 
           # Compare against base to show only changes in this PR
           # Use three-dot diff to show changes from merge-base to HEAD (only PR changes)
-          CHANGED_FILES=$(git diff --name-status "$BASE_SHA"...HEAD -- router_inference/predictions/*.json 2>&1 | awk '$1 == "A" || $1 == "M" {print $2}')
-          if [[ -z "$CHANGED_FILES" ]]; then
+          mapfile -t CHANGED_FILES < <(git diff --name-status "$BASE_SHA"...HEAD -- router_inference/predictions/*.json 2>/dev/null | awk '$1 == "A" || $1 == "M" {print $2}')
+
+          if [[ ${#CHANGED_FILES[@]} -eq 0 ]]; then
             echo "No changed prediction file detected; skipping evaluation."
             echo "router=" >> "$GITHUB_OUTPUT"
             exit 0
           fi
-          if [[ $(echo "$CHANGED_FILES" | wc -l) -ne 1 ]]; then
-            echo "Expected exactly one changed prediction file, found:" >&2
-            echo "$CHANGED_FILES" >&2
+
+          router_name=""
+          has_base=0
+          has_robustness=0
+
+          for file in "${CHANGED_FILES[@]}"; do
+            filename=$(basename "$file")
+            name="${filename%.json}"
+            if [[ "$name" == *-robustness ]]; then
+              has_robustness=1
+              name="${name%-robustness}"
+            else
+              has_base=1
+            fi
+
+            if [[ -z "$name" ]]; then
+              echo "Unable to determine router name from $file" >&2
+              exit 1
+            fi
+
+            if [[ -z "$router_name" ]]; then
+              router_name="$name"
+            elif [[ "$router_name" != "$name" ]]; then
+              echo "Prediction files belong to different routers:" >&2
+              printf '  %s\n' "${CHANGED_FILES[@]}" >&2
+              exit 1
+            fi
+          done
+
+          if [[ ${#CHANGED_FILES[@]} -ne 2 || $has_base -ne 1 || $has_robustness -ne 1 ]]; then
+            echo "Expected exactly two prediction files (router and router-robustness), found:" >&2
+            printf '  %s\n' "${CHANGED_FILES[@]}" >&2
             exit 1
           fi
-          ROUTER_NAME=$(basename "$CHANGED_FILES" .json)
+
+          ROUTER_NAME="$router_name"
           echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT"
 
           # Detect split based on prediction file size (from PR branch)
@@ -114,6 +145,8 @@ jobs:
           mkdir -p base/router_inference/predictions
           cp "pr/router_inference/predictions/${ROUTER_NAME}.json" \
              "base/router_inference/predictions/${ROUTER_NAME}.json"
+          cp "pr/router_inference/predictions/${ROUTER_NAME}-robustness.json" \
+             "base/router_inference/predictions/${ROUTER_NAME}-robustness.json"
           echo "Copied prediction file from PR to base workspace"
 
       - name: Evaluate submission
@@ -123,7 +156,7 @@ jobs:
         env:
           ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset
         run: |
-          set -euo pipefail
+          set -euo pipefail; trap 'cat evaluation_output.txt' EXIT
           # Uses base repo's evaluation script (safe - not from PR)
           BASE_SHA="${{ github.event.pull_request.base.sha }}"
           uv run python automation/process_pr_submission.py \
@@ -161,6 +194,9 @@ jobs:
             comment += `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`;
             comment += `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`;
             comment += `| **Number of Queries** | ${metrics.num_queries} |\n`;
+            // Guard on type: robustness_score may be absent or null in metrics.json, and null.toFixed() would throw.
+            const robustnessCell = typeof metrics.robustness_score === 'number' ? metrics.robustness_score.toFixed(4) : 'N/A';
+            comment += `| **Robustness Score** | ${robustnessCell} |\n`;
 
             // Add optimality scores if available
             if (metrics.optimality) {

diff --git a/README.md b/README.md
@@ -90,7 +90,9 @@ See the [`ModelInference`](./llm_inference/model_inference.py) class for the com
 
 ## 2. Get Routing Decisions
 
-Follow the steps below to obtain your router's model choices for each query. Start with the `sub_10` split (a 10% subset) for local testing. Once your setup works, you can evaluate on the `full` dataset for full local evaluation and official leaderboard submission.
+Follow the steps below to obtain your router's model choices for each query. Start with the `sub_10` split (a 10% subset) for local testing. Once your setup works, you can evaluate:
+- on the `full` dataset for full local evaluation and official leaderboard submission.
+- on the `robustness` dataset for robustness evaluation.
 
 ### Step 2.1: Prepare Config File
 
@@ -138,7 +140,7 @@ router = MyRouter(args.router_name)
 Finally, generate the prediction file:
 
 ```bash
-uv run python ./router_inference/generate_prediction_file.py your-router [sub_10|full]
+uv run python ./router_inference/generate_prediction_file.py your-router [sub_10|full|robustness]
 ```
 
 > [!NOTE]
@@ -148,10 +150,10 @@ uv run python ./router_inference/generate_prediction_file.py your-router [sub_10
 ### Step 2.3: Validate Config and Prediction Files
 
 ```bash
-uv run python ./router_inference/check_config_prediction_files.py your-router [sub_10|full]
+uv run python ./router_inference/check_config_prediction_files.py your-router [sub_10|full|robustness]
 ```
 
-This script checks: (1) all model names are valid, (2) prediction file has correct size (809 for `sub_10`, 8400 for `full`), and (3) all entries have valid `global_index`, `prompt`, and `prediction` fields.
+This script checks: (1) all model names are valid, (2) prediction file has correct size (809 for `sub_10`, 8400 for `full`, 420 for `robustness`), and (3) all entries have valid `global_index`, `prompt`, and `prediction` fields.
 
 ## 3. Run LLM Inference
 
@@ -162,22 +164,29 @@ uv run python ./llm_inference/run.py your-router
 ```
 
 The script loads your prediction file, makes API calls using the models specified in the `prediction` field, and saves results incrementally. It uses cached results when available and saves progress after each query, so you can safely interrupt and resume. Results are saved to `./cached_results/` for reuse across routers.
+> [!NOTE]
+> - For robustness evaluation, we only measure the model-selection flip ratio after adding noise to the original prompt, so no additional LLM inference is required for this stage.
 
 ## 4. Run Router Evaluation
 
 As the last step, run the evaluation script:
 
 ```bash
-uv run python ./llm_evaluation/run.py your-router [sub_10|full]
+uv run python ./llm_evaluation/run.py your-router [sub_10|full|robustness]
 ```
 
+> [!TIP]
+> - Use `sub_10` or `full` to evaluate on those datasets.
+> - Use `robustness` to run robustness-only evaluation (expects `<router_name>-robustness.json`).
+
 # Submitting to the leaderboard
 
 To get your router on the leaderboard, you can open a Pull Request with your router's prediction file to trigger our automated evaluation workflow. Details are as follows:
 
 1. **Add your files**:
    - `router_inference/config/<router_name>.json` - Your router configuration
    - `router_inference/predictions/<router_name>.json` - Your prediction file with `generated_result` fields populated
+   - `router_inference/predictions/<router_name>-robustness.json` - Your prediction file for the robustness evaluation (no `generated_result` fields are needed)
 2. **Open a Pull Request to `main` branch** - The automated workflow will:
    - Validate your submission
    - Run evaluation on the full dataset
@@ -213,6 +222,7 @@ Feel free to contact us for contributions and collaborations.
 
 ```
 Yifan Lu (yifan.lu@rice.edu)
+Rixin Liu (rixin.liu@rice.edu)
 Jiarong Xing (jxing@rice.edu)
 ```
 

diff --git a/automation/process_pr_submission.py b/automation/process_pr_submission.py
@@ -45,6 +45,8 @@
 from pathlib import Path
 from typing import Iterable, Optional
 
+from global_utils.robustness import compute_robustness_score
+
 
 REPO_ROOT = Path(__file__).resolve().parents[1]
 WORKTREES_DIR = REPO_ROOT / ".pr_worktrees"
@@ -149,11 +151,14 @@ def cleanup_worktree(worktree_path: Path, branch_name: str, *, keep: bool) -> No
 
 
 def ensure_prediction_file_added(
-    worktree_path: Path, base_ref: str, router_name: str
+    worktree_path: Path, base_ref: str, router_name: str, *, robustness: bool = False
 ) -> None:
     """Verify the PR adds or modifies a prediction file for the specified router."""
 
-    target_path = Path("router_inference") / "predictions" / f"{router_name}.json"
+    suffix = "-robustness" if robustness else ""
+    target_path = (
+        Path("router_inference") / "predictions" / f"{router_name}{suffix}.json"
+    )
 
     diff_cmd = [
         "git",
@@ -274,6 +279,59 @@ def compute_scores(prediction_file: Path) -> dict[str, float]:
     }
 
 
+def compute_robustness_score_from_predictions(
+    full_prediction_file: Path, robustness_prediction_file: Path
+) -> Optional[float]:
+    """Load both prediction files and score them with compute_robustness_score."""
+
+    # Read the full/sub_10 file first, then the robustness file.
+    payloads = []
+    for path in (full_prediction_file, robustness_prediction_file):
+        with path.open("r", encoding="utf-8") as handle:
+            payloads.append(json.load(handle))
+
+    if not all(isinstance(payload, list) for payload in payloads):
+        raise ValueError("Prediction payload must be a list of entries.")
+
+    full_predictions, robustness_predictions = payloads
+    return compute_robustness_score(full_predictions, robustness_predictions)
+
+
+def append_robustness_score_to_metrics(
+    metrics: dict[str, object],
+    prediction_file: Path,
+    robustness_prediction_file: Path,
+    metrics_path: Path,
+) -> dict[str, object]:
+    """
+    Add ``robustness_score`` to ``metrics`` unless the key is already present.
+
+    A newly computed score is also written to ``metrics_path`` as JSON.
+    """
+
+    if "robustness_score" in metrics:
+        return metrics
+
+    skip_reason, score = None, None
+    if not robustness_prediction_file.exists():
+        skip_reason = "⚠ Robustness prediction file not found; skipping robustness score computation."
+    else:
+        score = compute_robustness_score_from_predictions(
+            prediction_file, robustness_prediction_file
+        )
+        if score is None:
+            skip_reason = "⚠ Could not compute robustness score because no overlapping entries were found."
+    if skip_reason is not None:
+        print(skip_reason)
+        return metrics
+
+    metrics["robustness_score"] = score
+    with metrics_path.open("w", encoding="utf-8") as handle:
+        json.dump(metrics, handle, indent=2)
+    print(f"✔ Appended robustness_score={score:.4f} to metrics.json")
+    return metrics
+
+
 def compute_arena_score(
     cost: float,
     accuracy: float,
@@ -387,6 +445,9 @@ def main(argv: Optional[list[str]] = None) -> int:
 
         if not args.allow_existing_prediction:
             ensure_prediction_file_added(worktree_path, base_ref, args.router)
+            ensure_prediction_file_added(
+                worktree_path, base_ref, args.router, robustness=True
+            )
 
         if not args.skip_sync:
             run_command(["uv", "sync", "--locked"], cwd=worktree_path, capture=True)
@@ -437,6 +498,19 @@ def main(argv: Optional[list[str]] = None) -> int:
                 ).strip()
             )
 
+        robustness_prediction_file = prediction_file.with_name(
+            f"{args.router}-robustness.json"
+        )
+        if not robustness_prediction_file.exists():
+            raise FileNotFoundError(
+                textwrap.dedent(
+                    f"""
+                    Robustness prediction file not found: {robustness_prediction_file}
+                    Ensure the pull request includes router_inference/predictions/{args.router}-robustness.json
+                    """
+                ).strip()
+            )
+
         # Read metrics from metrics.json (required - no fallback)
         # llm_evaluation/run.py writes metrics.json to the current working directory (worktree_path)
         metrics_path = worktree_path / "metrics.json"
@@ -455,6 +529,10 @@ def main(argv: Optional[list[str]] = None) -> int:
         with open(metrics_path, "r") as f:
             metrics = json.load(f)
 
+        metrics = append_robustness_score_to_metrics(
+            metrics, prediction_file, robustness_prediction_file, metrics_path
+        )
+
         # Copy metrics.json to base directory (REPO_ROOT) for workflow to read
         base_metrics_path = REPO_ROOT / "metrics.json"
         shutil.copy2(metrics_path, base_metrics_path)
@@ -477,6 +555,8 @@ def main(argv: Optional[list[str]] = None) -> int:
 
         archived_prediction = run_dir / f"{args.router}.json"
         shutil.copy2(prediction_file, archived_prediction)
+        archived_robust_prediction = run_dir / f"{args.router}-robustness.json"
+        shutil.copy2(robustness_prediction_file, archived_robust_prediction)
 
         summary_payload: dict[str, object] = {
             "pr": args.pr,

diff --git a/global_utils/__init__.py b/global_utils/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright contributors to the RouterArena project
+# SPDX-License-Identifier: Apache-2.0
+
+"""Shared utilities for RouterArena scripts."""
+
+from .robustness import compute_robustness_score  # noqa: F401
+
+__all__ = ["compute_robustness_score"]
diff --git a/global_utils/robustness.py b/global_utils/robustness.py
@@ -0,0 +1,90 @@
+# SPDX-FileCopyrightText: Copyright contributors to the RouterArena project
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities for computing robustness metrics across scripts."""
+
+from __future__ import annotations
+
+from typing import Any, Optional
+
+from universal_model_names import ModelNameManager
+
+__all__ = ["compute_robustness_score"]
+
+
+def _normalize_model_name(
+    model_name: Optional[str], name_manager: ModelNameManager
+) -> Optional[str]:
+    """Look up the universal model name; on ValueError return the input as-is."""
+    if model_name is not None:
+        try:
+            return name_manager.get_universal_name(model_name)
+        except ValueError:
+            pass  # lookup rejected the name; fall through and return it unchanged
+    return model_name
+
+
+def compute_robustness_score(
+    full_predictions: list[dict[str, Any]],
+    robustness_predictions: list[dict[str, Any]],
+    *,
+    name_manager: ModelNameManager | None = None,
+) -> Optional[float]:
+    """
+    Compute the robustness (stability) score between two prediction sets.
+
+    Args:
+        full_predictions: Router predictions for the full/sub_10 split.
+        robustness_predictions: Predictions collected from the robustness split.
+        name_manager: Optional shared instance to reuse universal name cache.
+
+    Returns:
+        A float in [0, 1] representing stability (1 - flip ratio),
+        or ``None`` if no overlapping entries were found.
+    """
+
+    manager = name_manager or ModelNameManager()
+
+    def get_index(entry: dict[str, Any]) -> Optional[str]:
+        """Extract a normalized global index from an entry."""
+        # Test against None explicitly: ``or`` would drop a valid index of 0.
+        value = entry.get("global index")
+        if value is None:
+            value = entry.get("global_index")
+        return str(value) if value is not None else None
+
+    def normalize(name: object) -> Optional[str]:
+        """Normalize model names through the shared name manager."""
+        if not name:
+            return None
+        return _normalize_model_name(str(name), manager)
+
+    # Build a lookup of router selections from the full split.
+    full_map = {
+        key: entry
+        for entry in full_predictions
+        if isinstance(entry, dict)
+        and not entry.get("for_optimality", False)
+        and (key := get_index(entry)) is not None
+    }
+
+    if not full_map:
+        return None
+
+    # Pair up predictions sharing a global index; skip entries without a model.
+    matches = [
+        (full_map[key].get("prediction"), entry.get("prediction"))
+        for entry in robustness_predictions
+        if isinstance(entry, dict)
+        and (key := get_index(entry)) is not None
+        and key in full_map
+        and full_map[key].get("prediction")
+        and entry.get("prediction")
+    ]
+
+    if not matches:
+        return None
+
+    # A flip is a pair whose normalized model names differ.
+    flips = sum(normalize(a) != normalize(b) for a, b in matches)
+    return 1.0 - flips / len(matches)