[Feat] Support evaluation on modified prediction files #21

Workflow file for this run

.github/workflows/pr-evaluation.yml at 34a10da

	# Evaluates the router prediction file changed by a pull request and posts
	# the resulting metrics back to the pull request as a comment.
	name: Router Submission Evaluation

	on:
	  pull_request:
	    types: [opened, synchronize, reopened]
	    paths:
	      - "router_inference/predictions/**"

	jobs:
	  evaluate-router:
	    # NOTE(review): this job checks out and executes code from the PR head on
	    # a self-hosted runner; confirm that untrusted PRs need approval to run.
	    runs-on: self-hosted
	    permissions:
	      contents: read
	      pull-requests: write
	    steps:
	      - name: Checkout PR branch for file detection
	        uses: actions/checkout@v4
	        with:
	          ref: ${{ github.event.pull_request.head.sha }}
	          # Full history so the three-dot diff below can find the merge base.
	          fetch-depth: 0

	      # Outputs:
	      #   router - basename (without .json) of the single added/modified
	      #            prediction file; empty when none changed.
	      #   split  - "full" for 8400 entries, otherwise "sub_10".
	      - name: Detect changed prediction file
	        id: detect
	        shell: bash
	        env:
	          # Event values are passed through the environment instead of being
	          # expanded into the script text.
	          BASE_REF: ${{ github.event.pull_request.base.ref }}
	          BASE_SHA: ${{ github.event.pull_request.base.sha }}
	        run: |
	          set -euo pipefail

	          if [[ -z "$BASE_SHA" ]]; then
	            echo "Error: Could not determine PR base SHA" >&2
	            exit 1
	          fi

	          # Best effort: make sure the PR's base branch is available locally.
	          git fetch origin "$BASE_REF" || true

	          # Try to fetch the specific base SHA if it is not already available.
	          if ! git cat-file -e "$BASE_SHA" 2>/dev/null; then
	            echo "Base SHA $BASE_SHA not found locally, attempting to fetch..."
	            git fetch origin "$BASE_SHA" || git fetch origin "$BASE_REF" || true
	          fi

	          # Three-dot diff lists changes from the merge base to HEAD, i.e. only
	          # what this PR changed. --diff-filter=AM keeps added/modified files;
	          # --name-only prints whole paths. stderr is left on the log so a git
	          # failure is visible. The quoted :(glob) pathspec is matched by git
	          # (not the shell) and its "*" does not cross directory separators.
	          CHANGED_FILES=$(
	            git diff --name-only --diff-filter=AM "$BASE_SHA"...HEAD -- \
	              ':(glob)router_inference/predictions/*.json'
	          )
	          if [[ -z "$CHANGED_FILES" ]]; then
	            echo "No changed prediction file detected; skipping evaluation."
	            echo "router=" >> "$GITHUB_OUTPUT"
	            exit 0
	          fi
	          if [[ $(echo "$CHANGED_FILES" | wc -l) -ne 1 ]]; then
	            echo "Expected exactly one changed prediction file, found:" >&2
	            echo "$CHANGED_FILES" >&2
	            exit 1
	          fi
	          ROUTER_NAME=$(basename "$CHANGED_FILES" .json)
	          echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT"

	          # Detect split based on prediction file size (from PR branch).
	          PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json"
	          if [[ ! -f "$PREDICTION_FILE" ]]; then
	            echo "Error: Prediction file not found at $PREDICTION_FILE" >&2
	            exit 1
	          fi
	          # The path is passed as an argument, never spliced into Python source.
	          ENTRY_COUNT=$(
	            python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))))' \
	              "$PREDICTION_FILE"
	          )
	          echo "Prediction file contains $ENTRY_COUNT entries"

	          if [[ "$ENTRY_COUNT" -eq 8400 ]]; then
	            SPLIT="full"
	          elif [[ "$ENTRY_COUNT" -eq 809 ]]; then
	            SPLIT="sub_10"
	          else
	            echo "Warning: Unexpected prediction file size ($ENTRY_COUNT entries). Defaulting to sub_10." >&2
	            SPLIT="sub_10"
	          fi
	          echo "split=$SPLIT" >> "$GITHUB_OUTPUT"

	      - name: Continue using PR branch for evaluation
	        if: ${{ steps.detect.outputs.router != '' }}
	        env:
	          # Derived from a PR-controlled filename; keep it out of script text.
	          ROUTER_NAME: ${{ steps.detect.outputs.router }}
	        run: |
	          set -euo pipefail
	          # The workspace stays on the PR head, so the evaluation uses the
	          # code and the prediction file from the PR.
	          PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json"

	          # Verify the file exists before continuing.
	          if [[ ! -f "$PREDICTION_FILE" ]]; then
	            echo "Error: Prediction file not found at $PREDICTION_FILE" >&2
	            exit 1
	          fi
	          echo "Using PR branch for evaluation (includes both router submission and code changes)"
	          echo "Prediction file ready: $PREDICTION_FILE"

	      - name: Show detected router
	        if: ${{ steps.detect.outputs.router != '' }}
	        env:
	          ROUTER_NAME: ${{ steps.detect.outputs.router }}
	          SPLIT: ${{ steps.detect.outputs.split }}
	        run: |
	          set -euo pipefail
	          echo "Detected router submission: $ROUTER_NAME"
	          echo "Detected split: $SPLIT"

	      - name: Prepare dataset
	        if: ${{ steps.detect.outputs.router != '' }}
	        run: |
	          set -euo pipefail
	          echo "Preparing dataset..."
	          mkdir -p "${{ github.workspace }}/dataset"
	          uv run python scripts/process_datasets/prep_datasets.py

	      - name: Evaluate submission
	        if: ${{ steps.detect.outputs.router != '' }}
	        id: evaluate
	        env:
	          ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset
	          # Base SHA of the pull request, handed to the script as --base-ref.
	          BASE_SHA: ${{ github.event.pull_request.base.sha }}
	          PR_NUMBER: ${{ github.event.pull_request.number }}
	          ROUTER_NAME: ${{ steps.detect.outputs.router }}
	          SPLIT: ${{ steps.detect.outputs.split }}
	        run: |
	          set -euo pipefail
	          # Capture the exit status instead of letting `set -e` abort here, so
	          # the captured output is printed even when the evaluation fails.
	          status=0
	          uv run python automation/process_pr_submission.py \
	            --pr "$PR_NUMBER" \
	            --router "$ROUTER_NAME" \
	            --split "$SPLIT" \
	            --base-ref "$BASE_SHA" > evaluation_output.txt 2>&1 || status=$?
	          cat evaluation_output.txt
	          if [[ "$status" -ne 0 ]]; then
	            echo "Error: evaluation exited with status $status" >&2
	            exit "$status"
	          fi
	          # Extract metrics from output.
	          if grep -q "Metrics:" evaluation_output.txt; then
	            python3 automation/extract_metrics.py evaluation_output.txt
	          fi

	      - name: Post evaluation results as PR comment
	        if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }}
	        uses: actions/github-script@v7
	        env:
	          # Read via process.env so the values are never parsed as JavaScript.
	          ROUTER_NAME: ${{ steps.detect.outputs.router }}
	          SPLIT: ${{ steps.detect.outputs.split }}
	        with:
	          script: |
	            const fs = require('fs');

	            // Renders a metrics object as a Markdown table. Throws when a
	            // numeric field is missing; the caller reports that in the comment.
	            function renderMetricsTable(metrics) {
	              let table = '### Metrics\n\n';
	              table += '| Metric | Value |\n';
	              table += '|--------|-------|\n';
	              table += `| RouterArena Score | ${metrics.arena_score.toFixed(4)} |\n`;
	              table += `| Accuracy | ${(metrics.accuracy * 100).toFixed(2)}% |\n`;
	              table += `| Total Cost | $${metrics.total_cost.toFixed(6)} |\n`;
	              table += `| Avg Cost per Query | $${metrics.avg_cost_per_query.toFixed(6)} |\n`;
	              table += `| Avg Cost per 1K Queries | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`;
	              table += `| Number of Queries | ${metrics.num_queries} |\n`;
	              return table;
	            }

	            let comment = '## 📊 Router Evaluation Results\n\n';
	            comment += `Router: \`${process.env.ROUTER_NAME}\`\n`;
	            comment += `Dataset Split: \`${process.env.SPLIT}\`\n\n`;

	            try {
	              if (fs.existsSync('metrics.json')) {
	                // Preferred source: metrics.json in the workspace.
	                const metrics = JSON.parse(fs.readFileSync('metrics.json', 'utf8'));
	                comment += renderMetricsTable(metrics);
	              } else {
	                // Fallback: parse the first "Metrics: {...}" object found in
	                // the captured evaluation output.
	                const output = fs.readFileSync('evaluation_output.txt', 'utf8');
	                const metricsMatch = output.match(/Metrics:\s*(\{[\s\S]*?\})/);
	                if (metricsMatch) {
	                  comment += renderMetricsTable(JSON.parse(metricsMatch[1]));
	                } else {
	                  comment += '⚠️ Could not parse evaluation metrics from output.\n';
	                }
	              }
	            } catch (error) {
	              comment += `⚠️ Error reading metrics: ${error.message}\n`;
	            }

	            comment += '\n---\n';
	            comment += 'Evaluation completed by RouterArena automated workflow';

	            // Awaited so that a failed API call fails the step.
	            await github.rest.issues.createComment({
	              issue_number: context.payload.pull_request.number,
	              owner: context.repo.owner,
	              repo: context.repo.repo,
	              body: comment
	            });

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Feat] Support evaluation on modified prediction files #21

Workflow file

[Feat] Support evaluation on modified prediction files #21

Uh oh!

Workflow file for this run