RouteWorks · jiarong0907 · Nov 10, 2025 · Nov 7, 2025 · Nov 7, 2025 · Nov 7, 2025
diff --git a/.github/workflows/pr-evaluation.yml b/.github/workflows/pr-evaluation.yml
@@ -0,0 +1,156 @@
+# Evaluates a router prediction file submitted through a pull request.
+name: Router Submission Evaluation
+
+# Trigger only on PR activity that touches files under the predictions directory.
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+    paths:
+      - 'router_inference/predictions/**'
+
+jobs:
+  # Single job: detect the submitted prediction file, evaluate it, and comment on the PR.
+  evaluate-router:
+    # NOTE(review): the steps below execute repository scripts from the PR checkout
+    # on a self-hosted runner — confirm the runner is isolated from sensitive hosts.
+    runs-on: self-hosted
+    permissions:
+      # The job token may read repository contents and write to pull requests.
+      contents: read
+      pull-requests: write
+    steps:
+      # fetch-depth: 0 fetches full history; the detect step runs a three-dot
+      # `git diff` against the base branch, which needs a merge base to exist locally.
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      # Find the single prediction file this PR adds and derive the router name
+      # and dataset split from it.
+      # Outputs: `router` (empty when the PR adds no prediction file) and
+      # `split` (`full` or `sub_10`).
+      - name: Detect new prediction file
+        id: detect
+        shell: bash
+        env:
+          # Passed through the environment so the value is never expanded into the script text.
+          BASE_REF: ${{ github.base_ref }}
+        run: |
+          set -euo pipefail
+          # Compare against the PR's actual base branch instead of assuming `main`.
+          git fetch origin "$BASE_REF"
+          # The pathspec is quoted so git (not the shell) matches it; `:(glob)` keeps `*`
+          # from crossing directory separators. --diff-filter=A lists added files only and,
+          # unlike splitting --name-status with awk, does not truncate paths at whitespace.
+          NEW_FILES=$(git diff --name-only --diff-filter=A "origin/${BASE_REF}...HEAD" -- ':(glob)router_inference/predictions/*.json')
+          if [[ -z "$NEW_FILES" ]]; then
+            echo "No newly added prediction file detected; skipping evaluation."
+            echo "router=" >> "$GITHUB_OUTPUT"
+            exit 0
+          fi
+          if [[ $(echo "$NEW_FILES" | wc -l) -ne 1 ]]; then
+            echo "Expected exactly one new prediction file, found:" >&2
+            echo "$NEW_FILES" >&2
+            exit 1
+          fi
+          ROUTER_NAME=$(basename "$NEW_FILES" .json)
+          # The file name is chosen by the PR author and flows into later steps, so only a
+          # conservative character set is accepted (no quotes, spaces, or leading dashes).
+          if [[ ! "$ROUTER_NAME" =~ ^[A-Za-z0-9][A-Za-z0-9._-]*$ ]]; then
+            echo "Error: Invalid router name '$ROUTER_NAME'. Use only letters, digits, '.', '_' and '-'." >&2
+            exit 1
+          fi
+          echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT"
+
+          # Detect split based on prediction file size
+          PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json"
+          if [[ ! -f "$PREDICTION_FILE" ]]; then
+            echo "Error: Prediction file not found at $PREDICTION_FILE" >&2
+            exit 1
+          fi
+          # The path is handed to Python as an argument rather than spliced into the
+          # source text, so a crafted file name cannot inject Python code.
+          ENTRY_COUNT=$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1], encoding="utf-8"))))' "$PREDICTION_FILE")
+          echo "Prediction file contains $ENTRY_COUNT entries"
+
+          # 8400 entries selects the full split and 809 selects sub_10; any other
+          # count falls back to sub_10 with a warning.
+          if [[ "$ENTRY_COUNT" -eq 8400 ]]; then
+            SPLIT="full"
+          elif [[ "$ENTRY_COUNT" -eq 809 ]]; then
+            SPLIT="sub_10"
+          else
+            echo "Warning: Unexpected prediction file size ($ENTRY_COUNT entries). Defaulting to sub_10." >&2
+            SPLIT="sub_10"
+          fi
+          echo "split=$SPLIT" >> "$GITHUB_OUTPUT"
+
+      # Log what the detect step found. The outputs derive from a PR-supplied file
+      # name, so they are passed as environment variables instead of being expanded
+      # into the script text, where shell metacharacters would be executed.
+      - name: Show detected router
+        if: ${{ steps.detect.outputs.router != '' }}
+        env:
+          SUBMISSION_ROUTER: ${{ steps.detect.outputs.router }}
+          SUBMISSION_SPLIT: ${{ steps.detect.outputs.split }}
+        run: |
+          set -euo pipefail
+          echo "Detected router submission: ${SUBMISSION_ROUTER}"
+          echo "Detected split: ${SUBMISSION_SPLIT}"
+
+      # Create <workspace>/dataset and run the dataset preparation script before scoring.
+      - name: Prepare dataset
+        if: ${{ steps.detect.outputs.router != '' }}
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        run: |
+          set -euo pipefail
+          # A missing token only produces a warning; the preparation script still runs.
+          [[ -n "${HF_TOKEN:-}" ]] || echo "⚠ Warning: HF_TOKEN not set. This may fail if accessing private datasets."
+          # The dataset is rebuilt on every run rather than reused.
+          echo "Preparing dataset..."
+          dataset_dir="${{ github.workspace }}/dataset"
+          mkdir -p "$dataset_dir"
+          uv run python scripts/process_datasets/prep_datasets.py
+
+      # Run the evaluation script for the detected router and split, always print its
+      # log, and publish the extracted metrics as step outputs.
+      - name: Evaluate submission
+        if: ${{ steps.detect.outputs.router != '' }}
+        id: evaluate
+        env:
+          ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+          # PR-derived values go through the environment so they are never expanded
+          # into the script text.
+          SUBMISSION_PR: ${{ github.event.pull_request.number }}
+          SUBMISSION_ROUTER: ${{ steps.detect.outputs.router }}
+          SUBMISSION_SPLIT: ${{ steps.detect.outputs.split }}
+        run: |
+          set -euo pipefail
+          # Capture the exit status instead of letting `set -e` abort here; otherwise a
+          # failed evaluation would exit before its log is printed.
+          status=0
+          uv run python automation/process_pr_submission.py \
+            --pr "$SUBMISSION_PR" \
+            --router "$SUBMISSION_ROUTER" \
+            --split "$SUBMISSION_SPLIT" > evaluation_output.txt 2>&1 || status=$?
+          cat evaluation_output.txt
+          if [[ "$status" -ne 0 ]]; then
+            echo "Evaluation failed with exit code $status" >&2
+            exit "$status"
+          fi
+          # Extract metrics from output. The script prints key=value lines on stdout;
+          # tee keeps them in the log and also records them as outputs of this step.
+          if grep -q "Metrics:" evaluation_output.txt; then
+            python3 automation/extract_metrics.py evaluation_output.txt | tee -a "$GITHUB_OUTPUT"
+          fi
+
+      # Build a Markdown summary of the evaluation and post it as a comment on the PR.
+      # Metrics come from metrics.json when present, otherwise they are parsed from
+      # evaluation_output.txt.
+      - name: Post evaluation results as PR comment
+        if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }}
+        uses: actions/github-script@v7
+        env:
+          # PR-derived values are read from process.env so they are never expanded
+          # into the JavaScript source.
+          SUBMISSION_ROUTER: ${{ steps.detect.outputs.router }}
+          SUBMISSION_SPLIT: ${{ steps.detect.outputs.split }}
+        with:
+          script: |
+            const fs = require('fs');
+
+            const router = process.env.SUBMISSION_ROUTER;
+            const split = process.env.SUBMISSION_SPLIT;
+
+            // Render a metrics object as a Markdown table. Throws if a numeric field
+            // is missing, which the caller reports as a read error.
+            const renderMetrics = (metrics) => [
+              '### Metrics\n\n',
+              '| Metric | Value |\n',
+              '|--------|-------|\n',
+              `| **RouterArena Score** | ${metrics.arena_score.toFixed(4)} |\n`,
+              `| **Accuracy** | ${(metrics.accuracy * 100).toFixed(2)}% |\n`,
+              `| **Total Cost** | $${metrics.total_cost.toFixed(6)} |\n`,
+              `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`,
+              `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`,
+              `| **Number of Queries** | ${metrics.num_queries} |\n`,
+            ].join('');
+
+            let comment = '## 📊 Router Evaluation Results\n\n';
+            comment += `**Router:** \`${router}\`\n`;
+            comment += `**Dataset Split:** \`${split}\`\n\n`;
+
+            try {
+              if (fs.existsSync('metrics.json')) {
+                // Preferred source: the file written by automation/extract_metrics.py.
+                comment += renderMetrics(JSON.parse(fs.readFileSync('metrics.json', 'utf8')));
+              } else {
+                // Fallback: try to parse from evaluation output. The lazy match stops
+                // at the first closing brace, so it only handles a flat metrics object.
+                const output = fs.readFileSync('evaluation_output.txt', 'utf8');
+                const metricsMatch = output.match(/Metrics:\s*(\{[\s\S]*?\})/);
+                if (metricsMatch) {
+                  comment += renderMetrics(JSON.parse(metricsMatch[1]));
+                } else {
+                  comment += '⚠️ Could not parse evaluation metrics from output.\n';
+                }
+              }
+            } catch (error) {
+              comment += `⚠️ Error reading metrics: ${error.message}\n`;
+            }
+
+            comment += '\n---\n';
+            comment += '*Evaluation completed by RouterArena automated workflow*';
+
+            // Await the request so an API failure fails this step instead of
+            // surfacing as an unhandled promise rejection.
+            await github.rest.issues.createComment({
+              issue_number: context.payload.pull_request.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: comment
+            });
diff --git a/.gitignore b/.gitignore
@@ -14,6 +14,8 @@ dataset/
 *.log
 logs/
 /cached_results2/
+.pr_worktrees/
+pr_evaluations/
 
 # Environment files
 .venv

diff --git a/README.md b/README.md
@@ -58,7 +58,7 @@ The current leaderboard is computed considering the accuracy and overall cost fo
 
 <h2 align="left">Have your router on there!</h3>
 
-If you want your router on the leaderboard, please contact us via email at yifan.lu@rice.edu or jxing@rice.edu, or submit a GitHub issue. For fairness, we have withheld the ground truth answers for the full dataset. However, you can still test your router using the sub-sampled 10% dataset by following the steps below.
+If you want your router on the leaderboard, submit a Pull Request with your router's prediction file. For questions or issues, please open a GitHub issue. For fairness, we have withheld the ground truth answers for the full dataset. However, you can still test your router using the sub-sampled 10% dataset by following the steps below.
 
 ## Setup
 
@@ -116,7 +116,7 @@ Create a config file in `./router_inference/config/<router_name>.json`. We have
 
 *Note: The model name must be the same as the one used in `./universal_model_names.py` (see next step for details)*
 
-**Important**: For each model in your config, add an entry with the pricing per million tokens in this format:
+**Important**: For each model in your config, add an entry to `model_cost/cost.json` with its pricing per million tokens, in this format:
 
 ```json
 {
@@ -165,7 +165,27 @@ The script loads your prediction file, makes API calls using the models specifie
 
 ## LLM Evaluation and Compute RouterArena Score
 
-**Important**: For the `sub_10` split (testing), you can run evaluation locally and get RouterArena scores. For the `full` dataset (official leaderboard), ground truth answers are not available locally. After running LLM inference on the `full` dataset, submit your prediction file via GitHub issue or contact us at yifan.lu@rice.edu or jxing@rice.edu for official evaluation.
+**Important**: For the `sub_10` split (testing), you can run evaluation locally and get RouterArena scores. For the `full` dataset (official leaderboard), ground truth answers are not available locally, but you can submit your router for automatic evaluation via Pull Request.
+
+### Submitting Your Router for Official Evaluation
+
+After running LLM inference on the `full` dataset and ensuring your prediction file has `generated_result` fields populated, submit a Pull Request:
+
+1. **Fork the repository** and create a new branch
+2. **Add your files**:
+   - `router_inference/config/<router_name>.json` - Your router configuration
+   - `router_inference/predictions/<router_name>.json` - Your prediction file with `generated_result` fields populated
+3. **Open a Pull Request** - The automated workflow will:
+   - Validate your submission
+   - Run evaluation on the full dataset
+   - Post results as a comment on your PR
+   - Update the leaderboard upon approval
+
+**Note**: Make sure to run `llm_inference/run.py` first to populate the `generated_result` fields in your prediction file before submitting.
+
+For questions or issues, please open a GitHub issue.
+
+### Local Evaluation (sub_10 split)
 
 For local evaluation on the `sub_10` split, run the evaluation script:
 

diff --git a/automation/extract_metrics.py b/automation/extract_metrics.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+# SPDX-FileCopyrightText: Copyright contributors to the RouterArena project
+# SPDX-License-Identifier: Apache-2.0
+
+"""Extract metrics from evaluation output and save to JSON file."""
+
+import json
+import re
+import sys
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print("Usage: extract_metrics.py <output_file>")
+        sys.exit(1)
+
+    output_file = sys.argv[1]
+
+    try:
+        with open(output_file, "r") as f:
+            content = f.read()
+
+        # Find the Metrics JSON block
+        match = re.search(r"Metrics:\s*(\{.*?\})", content, re.DOTALL)
+        if match:
+            metrics_json = match.group(1)
+            metrics = json.loads(metrics_json)
+
+            # Write metrics to file
+            with open("metrics.json", "w") as mf:
+                json.dump(metrics, mf)
+
+            # Output key metrics as step outputs
+            print(f"accuracy={metrics['accuracy']}")
+            print(f"arena_score={metrics['arena_score']}")
+            print(f"total_cost={metrics['total_cost']}")
+            print(f"num_queries={metrics['num_queries']}")
+        else:
+            print("No metrics found in output", file=sys.stderr)
+            sys.exit(1)
+    except Exception as e:
+        print(f"Error extracting metrics: {e}", file=sys.stderr)
+        sys.exit(1)