Skip to content

[DON'T MERGE] Testing router submission via file modification #17

[DON'T MERGE] Testing router submission via file modification

[DON'T MERGE] Testing router submission via file modification #17

Workflow file for this run

# Runs on pull requests that touch anything under router_inference/predictions/.
name: Router Submission Evaluation

on:
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
    paths:
      - 'router_inference/predictions/**'

jobs:
  evaluate-router:
    runs-on: self-hosted
    # Read access to repository contents for the checkouts; write access to
    # pull requests because the final step posts the results as a PR comment.
    permissions:
      contents: read
      pull-requests: write
    steps:
# First checkout: the pull request's head commit with complete history
# (fetch-depth 0), which the following detection step diffs against the base.
- name: Checkout PR branch for file detection
  uses: actions/checkout@v4
  with:
    fetch-depth: 0
    ref: ${{ github.event.pull_request.head.sha }}
# Finds the single prediction file added or modified by the pull request,
# derives the router name and dataset split from it, and stashes a copy of the
# file under /tmp/pr_predictions for the steps that run after main is checked out.
#
# Step outputs:
#   router - prediction file name without ".json"; empty when no prediction
#            file changed (later steps are skipped on an empty value)
#   split  - "full" for 8400 entries, "sub_10" for 809 entries or any other count
- name: Detect changed prediction file
  id: detect
  shell: bash
  env:
    # Event data is handed over through the environment instead of being
    # expanded into the script text, so it can never be parsed as shell syntax.
    BASE_REF: ${{ github.event.pull_request.base.ref }}
    BASE_SHA: ${{ github.event.pull_request.base.sha }}
  run: |
    set -euo pipefail

    if [[ -z "$BASE_SHA" ]]; then
      echo "Error: Could not determine PR base SHA" >&2
      exit 1
    fi

    # Best effort: make sure the PR's base branch and base commit exist locally.
    git fetch origin "$BASE_REF" || true
    if ! git cat-file -e "$BASE_SHA" 2>/dev/null; then
      echo "Base SHA $BASE_SHA not found locally, attempting to fetch..."
      git fetch origin "$BASE_SHA" || git fetch origin "$BASE_REF" || true
    fi

    # Three-dot diff: changes between the merge base and HEAD, i.e. only what
    # this pull request contributes. --diff-filter=AM keeps added and modified
    # files. The pathspec is quoted so git (not the shell) does the matching,
    # and the glob magic keeps "*" from matching across "/".
    # stderr is deliberately not redirected: if the diff fails, pipefail/errexit
    # abort the step and git's own error message stays visible in the log.
    CHANGED_FILES=$(git diff --name-only --diff-filter=AM "$BASE_SHA"...HEAD -- ':(glob)router_inference/predictions/*.json')

    if [[ -z "$CHANGED_FILES" ]]; then
      echo "No changed prediction file detected; skipping evaluation."
      echo "router=" >> "$GITHUB_OUTPUT"
      exit 0
    fi

    if [[ $(echo "$CHANGED_FILES" | wc -l) -ne 1 ]]; then
      echo "Expected exactly one changed prediction file, found:" >&2
      echo "$CHANGED_FILES" >&2
      exit 1
    fi

    ROUTER_NAME=$(basename "$CHANGED_FILES" .json)

    # The name is taken from a contributor-chosen file name and is reused by
    # later steps in paths, command lines and the PR comment, so only a
    # conservative character set is accepted.
    if [[ ! "$ROUTER_NAME" =~ ^[A-Za-z0-9._-]+$ ]]; then
      echo "Error: Invalid router name '$ROUTER_NAME' (allowed characters: letters, digits, '.', '_', '-')" >&2
      exit 1
    fi
    echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT"

    # Detect split based on prediction file size (from PR branch)
    PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json"
    if [[ ! -f "$PREDICTION_FILE" ]]; then
      echo "Error: Prediction file not found at $PREDICTION_FILE" >&2
      exit 1
    fi

    # The path is passed as an argument rather than pasted into the Python
    # source, so the file name cannot alter the code that is executed.
    ENTRY_COUNT=$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))))' "$PREDICTION_FILE")
    echo "Prediction file contains $ENTRY_COUNT entries"

    if [[ "$ENTRY_COUNT" -eq 8400 ]]; then
      SPLIT="full"
    elif [[ "$ENTRY_COUNT" -eq 809 ]]; then
      SPLIT="sub_10"
    else
      echo "Warning: Unexpected prediction file size ($ENTRY_COUNT entries). Defaulting to sub_10." >&2
      SPLIT="sub_10"
    fi
    echo "split=$SPLIT" >> "$GITHUB_OUTPUT"

    # Keep a copy outside the workspace; a later checkout replaces the
    # workspace contents with main.
    mkdir -p /tmp/pr_predictions
    cp "$PREDICTION_FILE" "/tmp/pr_predictions/${ROUTER_NAME}.json"
    echo "Saved prediction file to /tmp/pr_predictions/${ROUTER_NAME}.json"
# Second checkout: switch the workspace to main (complete history) once a
# router submission has been detected; the scripts invoked by the following
# steps are therefore the ones present on main.
- name: Checkout main branch for evaluation
  if: ${{ steps.detect.outputs.router != '' }}
  uses: actions/checkout@v4
  with:
    fetch-depth: 0
    ref: main
# Places the pull request's prediction file into the main checkout. The copy
# stashed under /tmp/pr_predictions by the detect step is preferred; if it is
# missing, the file is read from the fetched pull request head instead.
# Either way the PR's version overwrites any file of the same name on main.
- name: Copy prediction file from PR branch
  if: ${{ steps.detect.outputs.router != '' }}
  env:
    # Supplied via the environment so the values are never expanded into the
    # script text.
    ROUTER_NAME: ${{ steps.detect.outputs.router }}
    PR_NUMBER: ${{ github.event.pull_request.number }}
  run: |
    set -euo pipefail

    SAVED_FILE="/tmp/pr_predictions/${ROUTER_NAME}.json"
    PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json"
    mkdir -p "router_inference/predictions"

    if [[ -f "$SAVED_FILE" ]]; then
      cp -f "$SAVED_FILE" "$PREDICTION_FILE"
      echo "Successfully copied prediction file from PR branch (overwrote any existing file)"
    else
      # Fallback: try to fetch from PR branch directly
      echo "Prediction file not in /tmp, attempting to fetch from PR branch..."
      # The leading "+" forces the local ref to be updated even if a pr-branch
      # ref from an earlier run is still present on this runner.
      git fetch origin "+pull/${PR_NUMBER}/head:pr-branch" || true
      # The redirect truncates the destination, replacing any existing file.
      if git show "pr-branch:$PREDICTION_FILE" > "$PREDICTION_FILE" 2>/dev/null; then
        echo "Successfully retrieved prediction file from PR branch (overwrote any existing file)"
      else
        echo "Error: Could not retrieve prediction file from PR branch" >&2
        exit 1
      fi
    fi

    # Verify the file exists and has content (-s is false for an empty file).
    if [[ ! -s "$PREDICTION_FILE" ]]; then
      echo "Error: Prediction file was not copied successfully" >&2
      exit 1
    fi
    echo "Prediction file ready: $PREDICTION_FILE"
# Logs the router name and dataset split produced by the detect step.
- name: Show detected router
  if: ${{ steps.detect.outputs.router != '' }}
  env:
    # Read from the environment so the values are printed as data and never
    # become part of the script text.
    ROUTER_NAME: ${{ steps.detect.outputs.router }}
    SPLIT: ${{ steps.detect.outputs.split }}
  run: |
    set -euo pipefail
    echo "Detected router submission: $ROUTER_NAME"
    echo "Detected split: $SPLIT"
# Ensures a "dataset" directory exists in the workspace, then runs the
# repository's dataset preparation script through uv.
- name: Prepare dataset
  if: ${{ steps.detect.outputs.router != '' }}
  run: |
    set -euo pipefail
    mkdir -p "${{ github.workspace }}/dataset"
    echo "Preparing dataset..."
    uv run python scripts/process_datasets/prep_datasets.py
# Runs the submission evaluation for the detected router and split, capturing
# all output in evaluation_output.txt. When the output contains a "Metrics:"
# marker, the metrics extraction script is run on it. The later comment step
# reads metrics.json if present and falls back to evaluation_output.txt.
- name: Evaluate submission
  if: ${{ steps.detect.outputs.router != '' }}
  id: evaluate
  env:
    ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset
    # Supplied via the environment so the values are never expanded into the
    # script text.
    PR_NUMBER: ${{ github.event.pull_request.number }}
    ROUTER_NAME: ${{ steps.detect.outputs.router }}
    SPLIT: ${{ steps.detect.outputs.split }}
  run: |
    set -euo pipefail

    # Record the exit status instead of letting errexit abort here; otherwise
    # a failed evaluation would end the step before its captured output is
    # printed, leaving nothing to diagnose in the job log.
    status=0
    uv run python automation/process_pr_submission.py \
      --pr "$PR_NUMBER" \
      --router "$ROUTER_NAME" \
      --split "$SPLIT" > evaluation_output.txt 2>&1 || status=$?

    cat evaluation_output.txt

    if [[ "$status" -ne 0 ]]; then
      echo "Error: evaluation failed with exit code $status" >&2
      exit "$status"
    fi

    # Extract metrics from output
    if grep -q "Metrics:" evaluation_output.txt; then
      python3 automation/extract_metrics.py evaluation_output.txt
    fi
# Posts the evaluation results as a comment on the pull request. Metrics are
# read from metrics.json when that file exists; otherwise they are parsed from
# the "Metrics: {...}" section of evaluation_output.txt. If neither yields
# metrics, the comment carries a warning instead of a table.
# NOTE(review): for pull requests opened from forks the workflow token is
# typically read-only, in which case the comment call would be rejected -
# confirm against the repository's Actions settings.
- name: Post evaluation results as PR comment
  if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }}
  uses: actions/github-script@v7
  env:
    # Read through process.env inside the script so the values are treated as
    # data and never become part of the JavaScript source.
    ROUTER_NAME: ${{ steps.detect.outputs.router }}
    SPLIT: ${{ steps.detect.outputs.split }}
  with:
    script: |
      const fs = require('fs');

      // Renders a metrics object as a Markdown table. Throws when an expected
      // numeric field is missing; the caller reports that in the comment.
      const renderMetrics = (metrics) => [
        '### Metrics\n\n',
        '| Metric | Value |\n',
        '|--------|-------|\n',
        `| **RouterArena Score** | ${metrics.arena_score.toFixed(4)} |\n`,
        `| **Accuracy** | ${(metrics.accuracy * 100).toFixed(2)}% |\n`,
        `| **Total Cost** | $${metrics.total_cost.toFixed(6)} |\n`,
        `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`,
        `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`,
        `| **Number of Queries** | ${metrics.num_queries} |\n`,
      ].join('');

      let comment = '## 📊 Router Evaluation Results\n\n';
      comment += `**Router:** \`${process.env.ROUTER_NAME}\`\n`;
      comment += `**Dataset Split:** \`${process.env.SPLIT}\`\n\n`;

      try {
        // Stays undefined only when no metrics source could be found.
        let metrics;
        if (fs.existsSync('metrics.json')) {
          metrics = JSON.parse(fs.readFileSync('metrics.json', 'utf8'));
        } else {
          // Fallback: try to parse from evaluation output
          const output = fs.readFileSync('evaluation_output.txt', 'utf8');
          const metricsMatch = output.match(/Metrics:\s*(\{[\s\S]*?\})/);
          if (metricsMatch) {
            metrics = JSON.parse(metricsMatch[1]);
          }
        }

        if (metrics !== undefined) {
          comment += renderMetrics(metrics);
        } else {
          comment += '⚠️ Could not parse evaluation metrics from output.\n';
        }
      } catch (error) {
        comment += `⚠️ Error reading metrics: ${error.message}\n`;
      }

      comment += '\n---\n';
      comment += '*Evaluation completed by RouterArena automated workflow*';

      // Awaited so that a rejected API call fails the step instead of being
      // lost as an unhandled promise rejection.
      await github.rest.issues.createComment({
        issue_number: context.payload.pull_request.number,
        owner: context.repo.owner,
        repo: context.repo.repo,
        body: comment
      });