[Feat] Support evaluation on modified prediction files #21
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Router Submission Evaluation | |
| on: | |
| pull_request: | |
| types: [opened, synchronize, reopened] | |
| paths: | |
| - "router_inference/predictions/**" | |
| jobs: | |
| evaluate-router: | |
| runs-on: self-hosted | |
| permissions: | |
| contents: read | |
| pull-requests: write | |
| steps: | |
| - name: Checkout PR branch for file detection | |
| uses: actions/checkout@v4 | |
| with: | |
| ref: ${{ github.event.pull_request.head.sha }} | |
| fetch-depth: 0 | |
| - name: Detect changed prediction file | |
| id: detect | |
| shell: bash | |
| run: | | |
| set -euo pipefail | |
| # Compare against the fork's base branch (the branch the PR was created from) | |
| # This ensures each router submission is evaluated independently | |
| BASE_REF="${{ github.event.pull_request.base.ref }}" | |
| BASE_SHA="${{ github.event.pull_request.base.sha }}" | |
| if [[ -z "$BASE_SHA" ]]; then | |
| echo "Error: Could not determine PR base SHA" >&2 | |
| exit 1 | |
| fi | |
| # Fetch the base branch to ensure it's available for comparison | |
| git fetch origin "$BASE_REF" || true | |
| # Try to fetch the specific base SHA if it's not already available | |
| if ! git cat-file -e "$BASE_SHA" 2>/dev/null; then | |
| echo "Base SHA $BASE_SHA not found locally, attempting to fetch..." | |
| git fetch origin "$BASE_SHA" || git fetch origin "$BASE_REF" || true | |
| fi | |
| # For PRs from forks, we want to compare against the fork's base branch state | |
| # Use three-dot diff to show changes from merge-base to HEAD (only PR changes) | |
| # This isolates the evaluation to changes in this specific fork submission | |
| CHANGED_FILES=$(git diff --name-status "$BASE_SHA"...HEAD -- router_inference/predictions/*.json 2>&1 | awk '$1 == "A" || $1 == "M" {print $2}') | |
| if [[ -z "$CHANGED_FILES" ]]; then | |
| echo "No changed prediction file detected; skipping evaluation." | |
| echo "router=" >> "$GITHUB_OUTPUT" | |
| exit 0 | |
| fi | |
| if [[ $(echo "$CHANGED_FILES" | wc -l) -ne 1 ]]; then | |
| echo "Expected exactly one changed prediction file, found:" >&2 | |
| echo "$CHANGED_FILES" >&2 | |
| exit 1 | |
| fi | |
| ROUTER_NAME=$(basename "$CHANGED_FILES" .json) | |
| echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT" | |
| # Detect split based on prediction file size (from PR branch) | |
| PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json" | |
| if [[ ! -f "$PREDICTION_FILE" ]]; then | |
| echo "Error: Prediction file not found at $PREDICTION_FILE" >&2 | |
| exit 1 | |
| fi | |
| ENTRY_COUNT=$(python3 -c "import json; print(len(json.load(open('$PREDICTION_FILE'))))") | |
| echo "Prediction file contains $ENTRY_COUNT entries" | |
| if [[ "$ENTRY_COUNT" -eq 8400 ]]; then | |
| SPLIT="full" | |
| elif [[ "$ENTRY_COUNT" -eq 809 ]]; then | |
| SPLIT="sub_10" | |
| else | |
| echo "Warning: Unexpected prediction file size ($ENTRY_COUNT entries). Defaulting to sub_10." >&2 | |
| SPLIT="sub_10" | |
| fi | |
| echo "split=$SPLIT" >> "$GITHUB_OUTPUT" | |
| - name: Continue using PR branch for evaluation | |
| if: ${{ steps.detect.outputs.router != '' }} | |
| run: | | |
| set -euo pipefail | |
| # We stay on the PR branch to use the code from the PR | |
| # This allows the PR to include both router submissions AND code improvements | |
| # The prediction file is already available from the detection step | |
| ROUTER_NAME="${{ steps.detect.outputs.router }}" | |
| PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json" | |
| # Verify the file exists and has content | |
| if [[ ! -f "$PREDICTION_FILE" ]]; then | |
| echo "Error: Prediction file not found at $PREDICTION_FILE" >&2 | |
| exit 1 | |
| fi | |
| echo "Using PR branch for evaluation (includes both router submission and code changes)" | |
| echo "Prediction file ready: $PREDICTION_FILE" | |
| - name: Show detected router | |
| if: ${{ steps.detect.outputs.router != '' }} | |
| run: | | |
| set -euo pipefail | |
| echo "Detected router submission: ${{ steps.detect.outputs.router }}" | |
| echo "Detected split: ${{ steps.detect.outputs.split }}" | |
| - name: Prepare dataset | |
| if: ${{ steps.detect.outputs.router != '' }} | |
| run: | | |
| set -euo pipefail | |
| # Prepare dataset from public repository | |
| echo "Preparing dataset..." | |
| mkdir -p "${{ github.workspace }}/dataset" | |
| uv run python scripts/process_datasets/prep_datasets.py | |
| - name: Evaluate submission | |
| if: ${{ steps.detect.outputs.router != '' }} | |
| id: evaluate | |
| env: | |
| ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset | |
| run: | | |
| set -euo pipefail | |
| # Use the PR's base branch SHA for comparison (fork's base, not upstream main) | |
| BASE_SHA="${{ github.event.pull_request.base.sha }}" | |
| uv run python automation/process_pr_submission.py \ | |
| --pr "${{ github.event.pull_request.number }}" \ | |
| --router "${{ steps.detect.outputs.router }}" \ | |
| --split "${{ steps.detect.outputs.split }}" \ | |
| --base-ref "$BASE_SHA" > evaluation_output.txt 2>&1 | |
| # Extract metrics from output | |
| if grep -q "Metrics:" evaluation_output.txt; then | |
| python3 automation/extract_metrics.py evaluation_output.txt | |
| fi | |
| cat evaluation_output.txt | |
| - name: Post evaluation results as PR comment | |
| if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }} | |
| uses: actions/github-script@v7 | |
| with: | |
| script: | | |
| const fs = require('fs'); | |
| const path = require('path'); | |
| let comment = '## 📊 Router Evaluation Results\n\n'; | |
| comment += `**Router:** \`${{ steps.detect.outputs.router }}\`\n`; | |
| comment += `**Dataset Split:** \`${{ steps.detect.outputs.split }}\`\n\n`; | |
| // Try to read metrics from file | |
| try { | |
| if (fs.existsSync('metrics.json')) { | |
| const metrics = JSON.parse(fs.readFileSync('metrics.json', 'utf8')); | |
| comment += '### Metrics\n\n'; | |
| comment += '| Metric | Value |\n'; | |
| comment += '|--------|-------|\n'; | |
| comment += `| **RouterArena Score** | ${metrics.arena_score.toFixed(4)} |\n`; | |
| comment += `| **Accuracy** | ${(metrics.accuracy * 100).toFixed(2)}% |\n`; | |
| comment += `| **Total Cost** | $${metrics.total_cost.toFixed(6)} |\n`; | |
| comment += `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`; | |
| comment += `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`; | |
| comment += `| **Number of Queries** | ${metrics.num_queries} |\n`; | |
| } else { | |
| // Fallback: try to parse from evaluation output | |
| const output = fs.readFileSync('evaluation_output.txt', 'utf8'); | |
| const metricsMatch = output.match(/Metrics:\s*(\{[\s\S]*?\})/); | |
| if (metricsMatch) { | |
| const metrics = JSON.parse(metricsMatch[1]); | |
| comment += '### Metrics\n\n'; | |
| comment += '| Metric | Value |\n'; | |
| comment += '|--------|-------|\n'; | |
| comment += `| **RouterArena Score** | ${metrics.arena_score.toFixed(4)} |\n`; | |
| comment += `| **Accuracy** | ${(metrics.accuracy * 100).toFixed(2)}% |\n`; | |
| comment += `| **Total Cost** | $${metrics.total_cost.toFixed(6)} |\n`; | |
| comment += `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`; | |
| comment += `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`; | |
| comment += `| **Number of Queries** | ${metrics.num_queries} |\n`; | |
| } else { | |
| comment += '⚠️ Could not parse evaluation metrics from output.\n'; | |
| } | |
| } | |
| } catch (error) { | |
| comment += `⚠️ Error reading metrics: ${error.message}\n`; | |
| } | |
| comment += '\n---\n'; | |
| comment += '*Evaluation completed by RouterArena automated workflow*'; | |
| github.rest.issues.createComment({ | |
| issue_number: context.payload.pull_request.number, | |
| owner: context.repo.owner, | |
| repo: context.repo.repo, | |
| body: comment | |
| }); |