[DON'T MERGE] Testing router submission via file modification #18
Workflow file for this run
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Evaluates a router submission whenever a pull request adds or modifies a
# prediction file under router_inference/predictions/.
#
# Flow: check out the PR head only to locate and stash the changed prediction
# file, then check out `main` and run the evaluation code from there, so the
# scripts that execute come from `main` rather than from the PR.
#
# Security: values that originate from the pull request (file names, refs,
# numbers) are handed to scripts through `env:` instead of being spliced into
# script text with ${{ }}; this prevents a crafted file name from being
# executed as shell or JavaScript.
name: Router Submission Evaluation

on:
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
      - "router_inference/predictions/**"

jobs:
  evaluate-router:
    runs-on: self-hosted
    permissions:
      contents: read
      # Needed by the final step, which posts the results as a PR comment.
      # NOTE(review): for PRs opened from forks the GITHUB_TOKEN is usually
      # read-only regardless of this setting - confirm repository settings.
      pull-requests: write
    steps:
      - name: Checkout PR branch for file detection
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}
          fetch-depth: 0

      - name: Detect changed prediction file
        id: detect
        shell: bash
        env:
          PR_BASE_REF: ${{ github.event.pull_request.base.ref }}
          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
        run: |
          set -euo pipefail

          # Outputs:
          #   router - basename (without .json) of the single added/modified
          #            prediction file, or empty when there is none
          #   split  - "full" or "sub_10", chosen from the number of entries

          if [[ -z "$PR_BASE_SHA" ]]; then
            echo "Error: Could not determine PR base SHA" >&2
            exit 1
          fi

          # Best effort: make the PR's base (target) branch available locally.
          git fetch origin "$PR_BASE_REF" || true

          # Try to fetch the specific base SHA if it is still not available.
          if ! git cat-file -e "$PR_BASE_SHA" 2>/dev/null; then
            echo "Base SHA $PR_BASE_SHA not found locally, attempting to fetch..."
            git fetch origin "$PR_BASE_SHA" || git fetch origin "$PR_BASE_REF" || true
          fi

          # Three-dot diff: changes from the merge base to HEAD, i.e. only what
          # this PR introduces. Keep added (A) and modified (M) files.
          # --name-status separates fields with tabs, so split on tabs to keep
          # paths containing spaces intact. git's stderr is left untouched so a
          # failing diff is visible in the log (pipefail aborts the step).
          CHANGED_FILES=$(git diff --name-status "$PR_BASE_SHA"...HEAD -- router_inference/predictions/*.json | awk -F'\t' '$1 == "A" || $1 == "M" {print $2}')

          if [[ -z "$CHANGED_FILES" ]]; then
            echo "No changed prediction file detected; skipping evaluation."
            echo "router=" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          if [[ $(echo "$CHANGED_FILES" | wc -l) -ne 1 ]]; then
            echo "Expected exactly one changed prediction file, found:" >&2
            echo "$CHANGED_FILES" >&2
            exit 1
          fi

          ROUTER_NAME=$(basename "$CHANGED_FILES" .json)
          echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT"

          # Detect the split from the number of entries in the PR's file.
          PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json"
          if [[ ! -f "$PREDICTION_FILE" ]]; then
            echo "Error: Prediction file not found at $PREDICTION_FILE" >&2
            exit 1
          fi

          # The path is passed as an argument, never embedded in Python source.
          ENTRY_COUNT=$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))))' "$PREDICTION_FILE")
          echo "Prediction file contains $ENTRY_COUNT entries"

          if [[ "$ENTRY_COUNT" -eq 8400 ]]; then
            SPLIT="full"
          elif [[ "$ENTRY_COUNT" -eq 809 ]]; then
            SPLIT="sub_10"
          else
            echo "Warning: Unexpected prediction file size ($ENTRY_COUNT entries). Defaulting to sub_10." >&2
            SPLIT="sub_10"
          fi
          echo "split=$SPLIT" >> "$GITHUB_OUTPUT"

          # Stash the file outside the workspace: the next checkout replaces the
          # working tree. RUNNER_TEMP is per-job, unlike a fixed /tmp path that
          # would be shared between jobs on a self-hosted machine.
          SAVED_DIR="${RUNNER_TEMP}/pr_predictions"
          mkdir -p "$SAVED_DIR"
          cp "$PREDICTION_FILE" "${SAVED_DIR}/${ROUTER_NAME}.json"
          echo "Saved prediction file to ${SAVED_DIR}/${ROUTER_NAME}.json"

      - name: Checkout main branch for evaluation
        if: ${{ steps.detect.outputs.router != '' }}
        uses: actions/checkout@v4
        with:
          ref: main
          fetch-depth: 0

      - name: Copy prediction file from PR branch
        if: ${{ steps.detect.outputs.router != '' }}
        env:
          SUBMISSION_ROUTER: ${{ steps.detect.outputs.router }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: |
          set -euo pipefail

          SAVED_FILE="${RUNNER_TEMP}/pr_predictions/${SUBMISSION_ROUTER}.json"
          PREDICTION_FILE="router_inference/predictions/${SUBMISSION_ROUTER}.json"
          mkdir -p "router_inference/predictions"

          # Always use the PR's version, overwriting any copy present on main.
          if [[ -f "$SAVED_FILE" ]]; then
            cp -f "$SAVED_FILE" "$PREDICTION_FILE"
            echo "Successfully copied prediction file from PR branch (overwrote any existing file)"
          else
            # Fallback: read the file straight from the PR head ref.
            echo "Prediction file not found at $SAVED_FILE, attempting to fetch from PR branch..."
            git fetch origin "pull/${PR_NUMBER}/head:pr-branch" || true
            # The redirection overwrites the destination if it already exists.
            if git show "pr-branch:${PREDICTION_FILE}" > "$PREDICTION_FILE" 2>/dev/null; then
              echo "Successfully retrieved prediction file from PR branch (overwrote any existing file)"
            else
              echo "Error: Could not retrieve prediction file from PR branch" >&2
              exit 1
            fi
          fi

          # Verify the file exists and is not empty.
          if [[ ! -s "$PREDICTION_FILE" ]]; then
            echo "Error: Prediction file was not copied successfully" >&2
            exit 1
          fi
          echo "Prediction file ready: $PREDICTION_FILE"

      - name: Show detected router
        if: ${{ steps.detect.outputs.router != '' }}
        env:
          SUBMISSION_ROUTER: ${{ steps.detect.outputs.router }}
          SUBMISSION_SPLIT: ${{ steps.detect.outputs.split }}
        run: |
          set -euo pipefail
          echo "Detected router submission: $SUBMISSION_ROUTER"
          echo "Detected split: $SUBMISSION_SPLIT"

      - name: Prepare dataset
        if: ${{ steps.detect.outputs.router != '' }}
        run: |
          set -euo pipefail
          echo "Preparing dataset..."
          mkdir -p "${{ github.workspace }}/dataset"
          uv run python scripts/process_datasets/prep_datasets.py

      - name: Evaluate submission
        if: ${{ steps.detect.outputs.router != '' }}
        id: evaluate
        env:
          ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset
          PR_NUMBER: ${{ github.event.pull_request.number }}
          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
          SUBMISSION_ROUTER: ${{ steps.detect.outputs.router }}
          SUBMISSION_SPLIT: ${{ steps.detect.outputs.split }}
        run: |
          set -euo pipefail

          # Capture the exit status instead of letting errexit abort right away,
          # so the evaluation log is printed even when the evaluation fails.
          # --base-ref receives the commit SHA of the PR's base.
          status=0
          uv run python automation/process_pr_submission.py \
            --pr "$PR_NUMBER" \
            --router "$SUBMISSION_ROUTER" \
            --split "$SUBMISSION_SPLIT" \
            --base-ref "$PR_BASE_SHA" > evaluation_output.txt 2>&1 || status=$?

          cat evaluation_output.txt

          if [[ "$status" -ne 0 ]]; then
            echo "Error: evaluation exited with status $status" >&2
            exit "$status"
          fi

          # Extract metrics from the output when the evaluation reported any.
          if grep -q "Metrics:" evaluation_output.txt; then
            python3 automation/extract_metrics.py evaluation_output.txt
          fi

      - name: Post evaluation results as PR comment
        if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }}
        uses: actions/github-script@v7
        env:
          SUBMISSION_ROUTER: ${{ steps.detect.outputs.router }}
          SUBMISSION_SPLIT: ${{ steps.detect.outputs.split }}
        with:
          script: |
            const fs = require('fs');

            // Read PR-derived values from the environment rather than having
            // them substituted into this source text.
            const { SUBMISSION_ROUTER: router, SUBMISSION_SPLIT: split } = process.env;

            /**
             * Renders the metrics object as a Markdown table.
             * Throws a TypeError when an expected numeric field is missing.
             */
            const renderMetricsTable = (metrics) =>
              [
                '### Metrics\n\n',
                '| Metric | Value |\n',
                '|--------|-------|\n',
                `| **RouterArena Score** | ${metrics.arena_score.toFixed(4)} |\n`,
                `| **Accuracy** | ${(metrics.accuracy * 100).toFixed(2)}% |\n`,
                `| **Total Cost** | $${metrics.total_cost.toFixed(6)} |\n`,
                `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`,
                `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`,
                `| **Number of Queries** | ${metrics.num_queries} |\n`,
              ].join('');

            /**
             * Loads metrics from metrics.json when it exists; otherwise looks
             * for a "Metrics: {...}" object in evaluation_output.txt.
             * Returns undefined when neither source yields metrics.
             */
            const loadMetrics = () => {
              if (fs.existsSync('metrics.json')) {
                return JSON.parse(fs.readFileSync('metrics.json', 'utf8'));
              }
              const output = fs.readFileSync('evaluation_output.txt', 'utf8');
              // Non-greedy: captures up to the first closing brace only.
              const metricsMatch = output.match(/Metrics:\s*(\{[\s\S]*?\})/);
              return metricsMatch ? JSON.parse(metricsMatch[1]) : undefined;
            };

            let comment = '## 📊 Router Evaluation Results\n\n';
            comment += `**Router:** \`${router}\`\n`;
            comment += `**Dataset Split:** \`${split}\`\n\n`;

            // A metrics problem is reported inside the comment, not as a failure.
            try {
              const metrics = loadMetrics();
              comment +=
                metrics === undefined
                  ? '⚠️ Could not parse evaluation metrics from output.\n'
                  : renderMetricsTable(metrics);
            } catch (error) {
              comment += `⚠️ Error reading metrics: ${error.message}\n`;
            }

            comment += '\n---\n';
            comment += '*Evaluation completed by RouterArena automated workflow*';

            // Await the request so an API failure fails this step instead of
            // surfacing as an unhandled rejection.
            await github.rest.issues.createComment({
              issue_number: context.payload.pull_request.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: comment,
            });