[DON'T MERGE] Update the eval procedure & Test the Evaluation Workflow #13

Workflow file for this run

.github/workflows/pr-evaluation.yml at 3ec07c3

	# Workflow that evaluates a router submission on pull requests.
	name: Router Submission Evaluation

	on:
	  pull_request:
	    # Run when a PR is opened, receives new commits, or is reopened.
	    types:
	      - opened
	      - synchronize
	      - reopened
	    # Only PRs that change something under the predictions directory
	    # trigger a run.
	    paths:
	      - 'router_inference/predictions/**'

	jobs:
	  evaluate-router:
	    # Token scope for this job: read repository contents, write to pull
	    # requests (used by the final step to post a comment).
	    permissions:
	      pull-requests: write
	      contents: read
	    # NOTE(review): this job executes scripts from the PR checkout on a
	    # self-hosted runner - confirm the runner is isolated from anything
	    # sensitive before enabling this for untrusted contributors.
	    runs-on: self-hosted
	    steps:
	- name: Checkout repository
	  uses: actions/checkout@v4
	  with:
	    # Depth 0 fetches the full history; the detection step diffs HEAD
	    # against origin/main and needs the merge base to be present.
	    fetch-depth: 0

	- name: Detect new prediction file
	  id: detect
	  shell: bash
	  # Finds the single prediction file added by this PR and derives two
	  # step outputs from it:
	  #   router - file name without the .json suffix; empty when the PR adds
	  #            no prediction file (later steps skip on an empty value).
	  #   split  - "full" or "sub_10", chosen from the number of entries.
	  # Fails when more than one prediction file is added, when the router
	  # name contains unexpected characters, or when the file is missing.
	  run: |
	    set -euo pipefail
	    git fetch origin main

	    # --diff-filter=A keeps only files added relative to the merge base
	    # with main, and --name-only prints the bare path, so a path that
	    # contains spaces is not cut short the way an awk '$2' field is.
	    # The quoted :(glob) pathspec is matched by git itself instead of
	    # being expanded by the shell; with glob magic '*' does not cross
	    # '/', so only files directly inside predictions/ are considered.
	    NEW_FILES=$(git diff --name-only --diff-filter=A origin/main...HEAD -- \
	      ':(glob)router_inference/predictions/*.json')
	    if [[ -z "$NEW_FILES" ]]; then
	      echo "No newly added prediction file detected; skipping evaluation."
	      echo "router=" >> "$GITHUB_OUTPUT"
	      exit 0
	    fi
	    if [[ $(echo "$NEW_FILES" | wc -l) -ne 1 ]]; then
	      echo "Expected exactly one new prediction file, found:" >&2
	      echo "$NEW_FILES" >&2
	      exit 1
	    fi
	    ROUTER_NAME=$(basename "$NEW_FILES" .json)

	    # The file name is chosen by the PR author and is therefore untrusted.
	    # Restrict it to a conservative character set before it is written to
	    # GITHUB_OUTPUT and consumed by later steps.
	    if [[ ! "$ROUTER_NAME" =~ ^[A-Za-z0-9._-]+$ ]]; then
	      echo "Error: Router name '$ROUTER_NAME' may only contain letters, digits, '.', '_' and '-'." >&2
	      exit 1
	    fi
	    echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT"

	    # Detect split based on prediction file size
	    PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json"
	    if [[ ! -f "$PREDICTION_FILE" ]]; then
	      echo "Error: Prediction file not found at $PREDICTION_FILE" >&2
	      exit 1
	    fi
	    # The path is passed as an argument rather than pasted into the Python
	    # source, so its content can never be interpreted as code.
	    ENTRY_COUNT=$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))))' "$PREDICTION_FILE")
	    echo "Prediction file contains $ENTRY_COUNT entries"

	    # 8400 entries selects the full split and 809 selects sub_10; any
	    # other size falls back to sub_10 with a warning.
	    if [[ "$ENTRY_COUNT" -eq 8400 ]]; then
	      SPLIT="full"
	    elif [[ "$ENTRY_COUNT" -eq 809 ]]; then
	      SPLIT="sub_10"
	    else
	      echo "Warning: Unexpected prediction file size ($ENTRY_COUNT entries). Defaulting to sub_10." >&2
	      SPLIT="sub_10"
	    fi
	    echo "split=$SPLIT" >> "$GITHUB_OUTPUT"

	- name: Show detected router
	  if: ${{ steps.detect.outputs.router != '' }}
	  shell: bash
	  # Logs the router name and split found by the detect step.
	  # The values are handed over as environment variables instead of being
	  # expanded into the script text, so their content is never parsed as
	  # shell syntax.
	  env:
	    DETECTED_ROUTER: ${{ steps.detect.outputs.router }}
	    DETECTED_SPLIT: ${{ steps.detect.outputs.split }}
	  run: |
	    set -euo pipefail
	    echo "Detected router submission: $DETECTED_ROUTER"
	    echo "Detected split: $DETECTED_SPLIT"

	- name: Prepare dataset
	  # Skipped when the detect step found no new prediction file.
	  if: steps.detect.outputs.router != ''
	  run: |
	    set -euo pipefail
	    # Prepare dataset from public repository
	    echo "Preparing dataset..."
	    # Create the dataset directory inside the workspace before running
	    # the preparation script.
	    mkdir -p "${GITHUB_WORKSPACE}/dataset"
	    uv run python scripts/process_datasets/prep_datasets.py

	- name: Evaluate submission
	  if: ${{ steps.detect.outputs.router != '' }}
	  id: evaluate
	  shell: bash
	  # Runs the submission evaluation and keeps its combined stdout/stderr
	  # in evaluation_output.txt for the comment step.
	  env:
	    ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset
	    # Workflow values reach the script as environment variables rather
	    # than being expanded into the script text, so their content is never
	    # parsed as shell syntax.
	    EVAL_PR_NUMBER: ${{ github.event.pull_request.number }}
	    EVAL_ROUTER: ${{ steps.detect.outputs.router }}
	    EVAL_SPLIT: ${{ steps.detect.outputs.split }}
	  run: |
	    set -euo pipefail
	    # tee streams the log to the job output while saving it to the file.
	    # With a plain redirect, a failing evaluation would abort the script
	    # under `set -e` before the log was ever printed. pipefail keeps the
	    # evaluator's exit status, so a failure still fails the step.
	    uv run python automation/process_pr_submission.py \
	      --pr "$EVAL_PR_NUMBER" \
	      --router "$EVAL_ROUTER" \
	      --split "$EVAL_SPLIT" 2>&1 | tee evaluation_output.txt
	    # Extract metrics from output
	    if grep -q "Metrics:" evaluation_output.txt; then
	      python3 automation/extract_metrics.py evaluation_output.txt
	    fi

	- name: Post evaluation results as PR comment
	  if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }}
	  uses: actions/github-script@v7
	  # Step outputs are passed through the environment and read with
	  # process.env, so their content is never spliced into the JavaScript
	  # source.
	  env:
	    ROUTER_NAME: ${{ steps.detect.outputs.router }}
	    DATASET_SPLIT: ${{ steps.detect.outputs.split }}
	  with:
	    script: |
	      const fs = require('fs');

	      const router = process.env.ROUTER_NAME;
	      const split = process.env.DATASET_SPLIT;

	      // Render the metrics object as a Markdown table. Throws when an
	      // expected numeric field is missing; the caller reports that as a
	      // warning instead of posting a half-built table.
	      function renderMetricsTable(metrics) {
	        return [
	          '### Metrics',
	          '',
	          '| Metric | Value |',
	          '|--------|-------|',
	          `| RouterArena Score | ${metrics.arena_score.toFixed(4)} |`,
	          `| Accuracy | ${(metrics.accuracy * 100).toFixed(2)}% |`,
	          `| Total Cost | $${metrics.total_cost.toFixed(6)} |`,
	          `| Avg Cost per Query | $${metrics.avg_cost_per_query.toFixed(6)} |`,
	          `| Avg Cost per 1K Queries | $${metrics.avg_cost_per_1000.toFixed(4)} |`,
	          `| Number of Queries | ${metrics.num_queries} |`,
	        ].join('\n') + '\n';
	      }

	      // Return the metrics object, or null when none can be found.
	      // metrics.json is preferred when it exists; otherwise fall back to
	      // the first {...} object that follows "Metrics:" in the raw log.
	      // NOTE(review): the non-greedy match stops at the first '}', so it
	      // assumes the logged metrics object is flat JSON - confirm.
	      function loadMetrics() {
	        if (fs.existsSync('metrics.json')) {
	          return JSON.parse(fs.readFileSync('metrics.json', 'utf8'));
	        }
	        const output = fs.readFileSync('evaluation_output.txt', 'utf8');
	        const match = output.match(/Metrics:\s*(\{[\s\S]*?\})/);
	        return match ? JSON.parse(match[1]) : null;
	      }

	      let comment = '## 📊 Router Evaluation Results\n\n';
	      comment += `Router: \`${router}\`\n`;
	      comment += `Dataset Split: \`${split}\`\n\n`;

	      try {
	        const metrics = loadMetrics();
	        comment += metrics
	          ? renderMetricsTable(metrics)
	          : '⚠️ Could not parse evaluation metrics from output.\n';
	      } catch (error) {
	        comment += `⚠️ Error reading metrics: ${error.message}\n`;
	      }

	      comment += '\n---\n';
	      comment += 'Evaluation completed by RouterArena automated workflow';

	      // Await the request so an API failure fails this step instead of
	      // surfacing as an unhandled promise rejection.
	      await github.rest.issues.createComment({
	        issue_number: context.payload.pull_request.number,
	        owner: context.repo.owner,
	        repo: context.repo.repo,
	        body: comment,
	      });

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[DON'T MERGE] Update the eval procedure & Test the Evaluation Workflow #13

Workflow file

[DON'T MERGE] Update the eval procedure & Test the Evaluation Workflow #13

Uh oh!

Workflow file for this run