Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
393e9d4
prep_datasets is ready
RixinLiu Dec 3, 2025
76e0d9c
generate_prediction_file.py is ready
RixinLiu Dec 3, 2025
a589e93
check_config_prediction_files.py is ready
RixinLiu Dec 3, 2025
0d783fa
llm_evaluation/run.py is ready
RixinLiu Dec 3, 2025
a77f91e
pr-automation is ready to test
RixinLiu Dec 4, 2025
2285c91
[Debug]
RixinLiu Dec 4, 2025
865f500
[Feat.] PR evaluation workflow with automatic robustness evaluation
RixinLiu Dec 4, 2025
9712ce2
[GEMINI SUGGESTION] Update try-catch in automation/process_pr_submiss…
RixinLiu Dec 4, 2025
56e09c3
[GEMINI SUGGESTION] Update try-catch in llm_evaluation/run.py
RixinLiu Dec 4, 2025
ee47815
[GEMINI SUGGESTION] Fix typo in scripts/process_datasets/prep_dataset…
RixinLiu Dec 4, 2025
83e741f
Refactor robustness score compute logic
RixinLiu Dec 4, 2025
dbcdcf8
Refine compute_robustness_score implementation
RixinLiu Dec 4, 2025
d783823
Handle robustness CLI errors by raising exceptions
RixinLiu Dec 4, 2025
ef5f776
Remove type ignore
RixinLiu Dec 4, 2025
6366111
Solve conflict between local utils and global utils
RixinLiu Dec 4, 2025
4120894
Replace arg --calculate-robustness-score with robustness
RixinLiu Dec 4, 2025
407aa2e
Should pass pre-commit
RixinLiu Dec 4, 2025
fd8e765
Refine code
RixinLiu Dec 7, 2025
2c24823
Update scripts/process_datasets/prep_datasets.py
RixinLiu Dec 8, 2025
94e22f3
Update router_inference/check_config_prediction_files.py
RixinLiu Dec 8, 2025
6883d6e
Ready to merge
RixinLiu Dec 8, 2025
1bc8176
Ready to merge
RixinLiu Dec 8, 2025
9e60ff7
Remove incorrect files
RixinLiu Dec 8, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 43 additions & 7 deletions .github/workflows/pr-evaluation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -54,18 +54,49 @@ jobs:

# Compare against base to show only changes in this PR
# Use three-dot diff to show changes from merge-base to HEAD (only PR changes)
CHANGED_FILES=$(git diff --name-status "$BASE_SHA"...HEAD -- router_inference/predictions/*.json 2>&1 | awk '$1 == "A" || $1 == "M" {print $2}')
if [[ -z "$CHANGED_FILES" ]]; then
mapfile -t CHANGED_FILES < <(git diff --name-status "$BASE_SHA"...HEAD -- router_inference/predictions/*.json 2>/dev/null | awk '$1 == "A" || $1 == "M" {print $2}')

if [[ ${#CHANGED_FILES[@]} -eq 0 ]]; then
Comment thread
jiarong0907 marked this conversation as resolved.
echo "No changed prediction file detected; skipping evaluation."
echo "router=" >> "$GITHUB_OUTPUT"
exit 0
fi
if [[ $(echo "$CHANGED_FILES" | wc -l) -ne 1 ]]; then
echo "Expected exactly one changed prediction file, found:" >&2
echo "$CHANGED_FILES" >&2

router_name=""
has_base=0
has_robustness=0

for file in "${CHANGED_FILES[@]}"; do
filename=$(basename "$file")
name="${filename%.json}"
if [[ "$name" == *-robustness ]]; then
has_robustness=1
name="${name%-robustness}"
else
has_base=1
fi

if [[ -z "$name" ]]; then
echo "Unable to determine router name from $file" >&2
exit 1
fi

if [[ -z "$router_name" ]]; then
router_name="$name"
elif [[ "$router_name" != "$name" ]]; then
echo "Prediction files belong to different routers:" >&2
printf ' %s\n' "${CHANGED_FILES[@]}" >&2
exit 1
fi
done

if [[ ${#CHANGED_FILES[@]} -ne 2 || $has_base -ne 1 || $has_robustness -ne 1 ]]; then
echo "Expected exactly two prediction files (router and router-robustness), found:" >&2
printf ' %s\n' "${CHANGED_FILES[@]}" >&2
exit 1
fi
ROUTER_NAME=$(basename "$CHANGED_FILES" .json)

ROUTER_NAME="$router_name"
echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT"

# Detect split based on prediction file size (from PR branch)
Expand Down Expand Up @@ -114,6 +145,8 @@ jobs:
mkdir -p base/router_inference/predictions
cp "pr/router_inference/predictions/${ROUTER_NAME}.json" \
"base/router_inference/predictions/${ROUTER_NAME}.json"
cp "pr/router_inference/predictions/${ROUTER_NAME}-robustness.json" \
"base/router_inference/predictions/${ROUTER_NAME}-robustness.json"
echo "Copied prediction file from PR to base workspace"

- name: Evaluate submission
Expand All @@ -123,7 +156,7 @@ jobs:
env:
ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset
run: |
set -euo pipefail
set -euo pipefail; trap 'cat evaluation_output.txt' EXIT
# Uses base repo's evaluation script (safe - not from PR)
BASE_SHA="${{ github.event.pull_request.base.sha }}"
uv run python automation/process_pr_submission.py \
Expand Down Expand Up @@ -161,6 +194,9 @@ jobs:
comment += `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`;
comment += `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`;
comment += `| **Number of Queries** | ${metrics.num_queries} |\n`;
const robustnessScore = metrics.robustness_score;
const robustnessCell = robustnessScore !== undefined ? robustnessScore.toFixed(4) : 'N/A';
comment += `| **Robustness Score** | ${robustnessCell} |\n`;

// Add optimality scores if available
if (metrics.optimality) {
Expand Down
20 changes: 15 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,9 @@ See the [`ModelInference`](./llm_inference/model_inference.py) class for the com

## 2. Get Routing Decisions

Follow the steps below to obtain your router's model choices for each query. Start with the `sub_10` split (a 10% subset) for local testing. Once your setup works, you can evaluate on the `full` dataset for full local evaluation and official leaderboard submission.
Follow the steps below to obtain your router's model choices for each query. Start with the `sub_10` split (a 10% subset) for local testing. Once your setup works, you can evaluate:
- on the `full` dataset for full local evaluation and official leaderboard submission.
- on the `robustness` dataset for robustness evaluation.

### Step 2.1: Prepare Config File

Expand Down Expand Up @@ -138,7 +140,7 @@ router = MyRouter(args.router_name)
Finally, generate the prediction file:

```bash
uv run python ./router_inference/generate_prediction_file.py your-router [sub_10|full]
uv run python ./router_inference/generate_prediction_file.py your-router [sub_10|full|robustness]
```

> [!NOTE]
Expand All @@ -148,10 +150,10 @@ uv run python ./router_inference/generate_prediction_file.py your-router [sub_10
### Step 2.3: Validate Config and Prediction Files

```bash
uv run python ./router_inference/check_config_prediction_files.py your-router [sub_10|full]
uv run python ./router_inference/check_config_prediction_files.py your-router [sub_10|full|robustness]
```

This script checks: (1) all model names are valid, (2) prediction file has correct size (809 for `sub_10`, 8400 for `full`), and (3) all entries have valid `global_index`, `prompt`, and `prediction` fields.
This script checks: (1) all model names are valid, (2) prediction file has correct size (809 for `sub_10`, 8400 for `full`, 420 for `robustness`), and (3) all entries have valid `global_index`, `prompt`, and `prediction` fields.

## 3. Run LLM Inference

Expand All @@ -162,22 +164,29 @@ uv run python ./llm_inference/run.py your-router
```

The script loads your prediction file, makes API calls using the models specified in the `prediction` field, and saves results incrementally. It uses cached results when available and saves progress after each query, so you can safely interrupt and resume. Results are saved to `./cached_results/` for reuse across routers.
> [!NOTE]
> Robustness evaluation only measures how often the router's model selection flips when noise is added to the original prompt, so it requires no additional LLM inference.

## 4. Run Router Evaluation

As the last step, run the evaluation script:

```bash
uv run python ./llm_evaluation/run.py your-router [sub_10|full]
uv run python ./llm_evaluation/run.py your-router [sub_10|full|--calculate-robustness-score]
Comment thread
jiarong0907 marked this conversation as resolved.
Outdated
```

> [!TIP]
> - Pass `sub_10` or `full` to evaluate on that dataset split.
> - Pass `--calculate-robustness-score` to run the robustness evaluation only.

Comment thread
jiarong0907 marked this conversation as resolved.
# Submitting to the leaderboard

To get your router on the leaderboard, you can open a Pull Request with your router's prediction file to trigger our automated evaluation workflow. Details are as follows:

1. **Add your files**:
- `router_inference/config/<router_name>.json` - Your router configuration
- `router_inference/predictions/<router_name>.json` - Your prediction file with `generated_result` fields populated
- `router_inference/predictions/<router_name>-robustness.json` - Your prediction file for robustness evaluation, no `generated_result` fields needed
2. **Open a Pull Request to `main` branch** - The automated workflow will:
- Validate your submission
- Run evaluation on the full dataset
Expand Down Expand Up @@ -213,6 +222,7 @@ Feel free to contact us for contributions and collaborations.

```
Yifan Lu (yifan.lu@rice.edu)
Rixin Liu (rixin.liu@rice.edu)
Jiarong Xing (jxing@rice.edu)
```

Expand Down
136 changes: 134 additions & 2 deletions automation/process_pr_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@
from pathlib import Path
from typing import Iterable, Optional

from universal_model_names import ModelNameManager


REPO_ROOT = Path(__file__).resolve().parents[1]
WORKTREES_DIR = REPO_ROOT / ".pr_worktrees"
Expand Down Expand Up @@ -149,11 +151,14 @@ def cleanup_worktree(worktree_path: Path, branch_name: str, *, keep: bool) -> No


def ensure_prediction_file_added(
worktree_path: Path, base_ref: str, router_name: str
worktree_path: Path, base_ref: str, router_name: str, *, robustness: bool = False
) -> None:
"""Verify the PR adds or modifies a prediction file for the specified router."""

target_path = Path("router_inference") / "predictions" / f"{router_name}.json"
suffix = "-robustness" if robustness else ""
target_path = (
Path("router_inference") / "predictions" / f"{router_name}{suffix}.json"
)

diff_cmd = [
"git",
Expand Down Expand Up @@ -274,6 +279,110 @@ def compute_scores(prediction_file: Path) -> dict[str, float]:
}


def compute_robustness_score_from_predictions(
    full_prediction_file: Path, robustness_prediction_file: Path
) -> Optional[float]:
    """Compute the fraction of prompts whose routing decision did not flip.

    Entries from the two prediction files are paired by their global index
    and their ``prediction`` values are compared after name normalization.

    Args:
        full_prediction_file: JSON prediction file for the full/sub_10 split.
        robustness_prediction_file: JSON prediction file for the robustness
            split.

    Returns:
        ``1.0 - flips / matched``, where ``matched`` counts robustness entries
        that pair with a baseline entry and have a prediction on both sides,
        and ``flips`` counts pairs whose normalized predictions differ.
        ``None`` when no baseline entry is usable or nothing could be paired.

    Raises:
        ValueError: If either file does not contain a JSON list.
    """

    manager = ModelNameManager()

    with full_prediction_file.open("r", encoding="utf-8") as full_handle:
        full_predictions = json.load(full_handle)
    with robustness_prediction_file.open("r", encoding="utf-8") as robustness_handle:
        robustness_predictions = json.load(robustness_handle)

    if not isinstance(full_predictions, list) or not isinstance(
        robustness_predictions, list
    ):
        raise ValueError("Prediction payload must be a list of entries.")

    def normalize(name: Optional[str]) -> Optional[str]:
        if name is None:
            return None
        try:
            # Presumably maps provider-specific aliases to one canonical
            # name -- TODO confirm against ModelNameManager.
            return manager.get_universal_name(name)
        except Exception:
            # Deliberate best-effort: a name the manager cannot resolve is
            # compared verbatim instead of aborting the whole score.
            return name

    def index_key(entry: dict) -> Optional[str]:
        # Compare against None explicitly: chaining the lookups with ``or``
        # would treat a legitimate index of 0 as missing and drop the entry.
        for field in ("global index", "global_index"):
            value = entry.get(field)
            if value is not None:
                return str(value)
        return None

    # Baseline lookup: first occurrence of each global index wins; entries
    # flagged ``for_optimality`` are excluded from the comparison.
    full_map: dict[str, dict[str, object]] = {}
    for entry in full_predictions:
        if not isinstance(entry, dict):
            continue
        if entry.get("for_optimality", False):
            continue
        key = index_key(entry)
        if key is None:
            continue
        full_map.setdefault(key, entry)

    if not full_map:
        return None

    matched = 0
    flips = 0
    for entry in robustness_predictions:
        if not isinstance(entry, dict):
            continue
        key = index_key(entry)
        if key is None:
            continue
        full_entry = full_map.get(key)
        if full_entry is None:
            continue

        full_model = full_entry.get("prediction")
        robust_model = entry.get("prediction")
        # A pair only counts when both sides actually chose a model.
        if not full_model or not robust_model:
            continue

        matched += 1
        if normalize(str(full_model)) != normalize(str(robust_model)):
            flips += 1

    if matched == 0:
        return None

    return 1.0 - flips / matched

Comment thread
jiarong0907 marked this conversation as resolved.

def append_robustness_score_to_metrics(
    metrics: dict[str, object],
    prediction_file: Path,
    robustness_prediction_file: Path,
    metrics_path: Path,
) -> dict[str, object]:
    """Add ``robustness_score`` to *metrics* when it is absent and computable.

    If the key is already present the mapping is returned untouched. Otherwise
    the score is computed from the two prediction files; on success it is
    stored in *metrics* and the whole mapping is rewritten to *metrics_path*
    as JSON. When the robustness file is missing or no score can be computed,
    a warning is printed and *metrics* is returned unchanged.

    Returns:
        The same *metrics* object that was passed in.
    """

    if "robustness_score" not in metrics:
        if robustness_prediction_file.exists():
            computed = compute_robustness_score_from_predictions(
                prediction_file, robustness_prediction_file
            )
            if computed is not None:
                metrics["robustness_score"] = computed
                # Persist first so the confirmation is printed only after
                # the file on disk includes the new field.
                with metrics_path.open("w", encoding="utf-8") as out_file:
                    json.dump(metrics, out_file, indent=2)
                print(f"✔ Appended robustness_score={computed:.4f} to metrics.json")
            else:
                print(
                    "⚠ Could not compute robustness score because no overlapping entries were found."
                )
        else:
            print(
                "⚠ Robustness prediction file not found; skipping robustness score computation."
            )

    return metrics


def compute_arena_score(
cost: float,
accuracy: float,
Expand Down Expand Up @@ -387,6 +496,9 @@ def main(argv: Optional[list[str]] = None) -> int:

if not args.allow_existing_prediction:
ensure_prediction_file_added(worktree_path, base_ref, args.router)
ensure_prediction_file_added(
worktree_path, base_ref, args.router, robustness=True
)

if not args.skip_sync:
run_command(["uv", "sync", "--locked"], cwd=worktree_path, capture=True)
Expand All @@ -410,6 +522,7 @@ def main(argv: Optional[list[str]] = None) -> int:
args.router,
args.split,
"--force",
"--calculate-robustness-score",
]

evaluation_logs = ""
Expand Down Expand Up @@ -437,6 +550,19 @@ def main(argv: Optional[list[str]] = None) -> int:
).strip()
)

robustness_prediction_file = prediction_file.with_name(
f"{args.router}-robustness.json"
)
if not robustness_prediction_file.exists():
raise FileNotFoundError(
textwrap.dedent(
f"""
Robustness prediction file not found: {robustness_prediction_file}
Ensure the pull request includes router_inference/predictions/{args.router}-robustness.json
"""
).strip()
)

# Read metrics from metrics.json (required - no fallback)
# llm_evaluation/run.py writes metrics.json to the current working directory (worktree_path)
metrics_path = worktree_path / "metrics.json"
Expand All @@ -455,6 +581,10 @@ def main(argv: Optional[list[str]] = None) -> int:
with open(metrics_path, "r") as f:
metrics = json.load(f)

metrics = append_robustness_score_to_metrics(
metrics, prediction_file, robustness_prediction_file, metrics_path
)

# Copy metrics.json to base directory (REPO_ROOT) for workflow to read
base_metrics_path = REPO_ROOT / "metrics.json"
shutil.copy2(metrics_path, base_metrics_path)
Expand All @@ -477,6 +607,8 @@ def main(argv: Optional[list[str]] = None) -> int:

archived_prediction = run_dir / f"{args.router}.json"
shutil.copy2(prediction_file, archived_prediction)
archived_robust_prediction = run_dir / f"{args.router}-robustness.json"
shutil.copy2(robustness_prediction_file, archived_robust_prediction)

summary_payload: dict[str, object] = {
"pr": args.pr,
Expand Down
Loading