-
Notifications
You must be signed in to change notification settings - Fork 27
Automated PR-Based Router Submission and Evaluation #13
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
053de02
Add router submission evaluation workflow
yl231 4b04bc5
[Testing Automatic Eval] upload predictions and config files
yl231 ecb5ae8
Sync dataset into worktree for evaluations
yl231 5482a16
Fix dataset preparation and environment variable handling in workflow
yl231 9834e43
Add PR comment posting and model cost validation
yl231 aba4b3a
Add glm-4-air-router submission for full dataset evaluation
yl231 a0cbfac
Fix evaluation workflow and add validation for generated_result fields
yl231 1620547
Fix generated_result format and update README for PR submission
yl231 69db384
Fix validation to accept generated_result as dictionary
yl231 f6ec20b
Restore config and predictions to match main branch
yl231 353fa74
Fixed PR comments.
yl231 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,156 @@ | ||
# Router Submission Evaluation
#
# Runs when a pull request touches router_inference/predictions/**. The job
# looks for exactly one newly added prediction JSON, infers the dataset split
# from its entry count, prepares the dataset, runs the evaluation script and
# posts the resulting metrics as a comment on the pull request.
#
# Values derived from the pull request (router name, split) are handed to
# scripts through `env:` rather than `${{ }}` interpolation so that an unusual
# file name cannot be interpreted as shell or JavaScript source.
#
# NOTE(review): this job runs on a self-hosted runner for pull_request events
# and executes scripts from the checked-out ref; confirm that PRs from forks
# require maintainer approval before workflows run.
name: Router Submission Evaluation

on:
  pull_request:
    types: [opened, synchronize, reopened]
    paths:
      - "router_inference/predictions/**"

jobs:
  evaluate-router:
    runs-on: self-hosted
    permissions:
      contents: read
      pull-requests: write
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          # Full history so that origin/main...HEAD can be diffed below.
          fetch-depth: 0

      - name: Detect new prediction file
        id: detect
        shell: bash
        run: |
          set -euo pipefail
          git fetch origin main
          # --name-status output is tab-separated; split on tabs so that a
          # path containing spaces is not truncated.
          NEW_FILES=$(git diff --name-status origin/main...HEAD -- router_inference/predictions/*.json | awk -F'\t' '$1 == "A" {print $2}')
          if [[ -z "$NEW_FILES" ]]; then
            echo "No newly added prediction file detected; skipping evaluation."
            # An empty router output makes every later step skip itself.
            echo "router=" >> "$GITHUB_OUTPUT"
            exit 0
          fi
          if [[ $(echo "$NEW_FILES" | wc -l) -ne 1 ]]; then
            echo "Expected exactly one new prediction file, found:" >&2
            echo "$NEW_FILES" >&2
            exit 1
          fi
          ROUTER_NAME=$(basename "$NEW_FILES" .json)
          echo "router=$ROUTER_NAME" >> "$GITHUB_OUTPUT"

          # Detect split based on prediction file size
          PREDICTION_FILE="router_inference/predictions/${ROUTER_NAME}.json"
          if [[ ! -f "$PREDICTION_FILE" ]]; then
            echo "Error: Prediction file not found at $PREDICTION_FILE" >&2
            exit 1
          fi
          # The path is passed as an argument instead of being spliced into
          # the Python source, so quotes in the file name cannot alter the code.
          ENTRY_COUNT=$(python3 -c 'import json, sys; print(len(json.load(open(sys.argv[1]))))' "$PREDICTION_FILE")
          echo "Prediction file contains $ENTRY_COUNT entries"

          if [[ "$ENTRY_COUNT" -eq 8400 ]]; then
            SPLIT="full"
          elif [[ "$ENTRY_COUNT" -eq 809 ]]; then
            SPLIT="sub_10"
          else
            echo "Warning: Unexpected prediction file size ($ENTRY_COUNT entries). Defaulting to sub_10." >&2
            SPLIT="sub_10"
          fi
          echo "split=$SPLIT" >> "$GITHUB_OUTPUT"

      - name: Show detected router
        if: ${{ steps.detect.outputs.router != '' }}
        shell: bash
        env:
          ROUTER_NAME: ${{ steps.detect.outputs.router }}
          SPLIT: ${{ steps.detect.outputs.split }}
        run: |
          set -euo pipefail
          echo "Detected router submission: $ROUTER_NAME"
          echo "Detected split: $SPLIT"

      - name: Prepare dataset
        if: ${{ steps.detect.outputs.router != '' }}
        shell: bash
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          set -euo pipefail
          # Verify HF_TOKEN is set if needed for private datasets
          if [[ -z "${HF_TOKEN:-}" ]]; then
            echo "⚠ Warning: HF_TOKEN not set. This may fail if accessing private datasets."
          fi
          # Always prepare dataset to ensure it's available
          echo "Preparing dataset..."
          mkdir -p "${{ github.workspace }}/dataset"
          uv run python scripts/process_datasets/prep_datasets.py

      - name: Evaluate submission
        if: ${{ steps.detect.outputs.router != '' }}
        id: evaluate
        shell: bash
        env:
          ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
          ROUTER_NAME: ${{ steps.detect.outputs.router }}
          SPLIT: ${{ steps.detect.outputs.split }}
        run: |
          set -euo pipefail
          # Guard against a metrics file left behind by an earlier run in the
          # same workspace (presumably already cleaned by checkout; cheap to
          # make sure).
          rm -f metrics.json
          # Capture the exit status instead of letting `set -e` abort here, so
          # the evaluation log is always printed, including on failure.
          status=0
          uv run python automation/process_pr_submission.py \
            --pr "$PR_NUMBER" \
            --router "$ROUTER_NAME" \
            --split "$SPLIT" > evaluation_output.txt 2>&1 || status=$?
          cat evaluation_output.txt
          if [[ "$status" -ne 0 ]]; then
            echo "Evaluation failed with exit code $status" >&2
            exit "$status"
          fi
          # Extract metrics from output
          if grep -q "Metrics:" evaluation_output.txt; then
            python3 automation/extract_metrics.py evaluation_output.txt
          fi

      - name: Post evaluation results as PR comment
        if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }}
        uses: actions/github-script@v7
        env:
          ROUTER_NAME: ${{ steps.detect.outputs.router }}
          SPLIT: ${{ steps.detect.outputs.split }}
        with:
          script: |
            const fs = require('fs');

            const { ROUTER_NAME: routerName, SPLIT: split } = process.env;

            // Build the whole table before appending anything, so a missing
            // metric throws here and never leaves a half-written table.
            const renderMetrics = (metrics) => [
              '### Metrics\n\n',
              '| Metric | Value |\n',
              '|--------|-------|\n',
              `| **RouterArena Score** | ${metrics.arena_score.toFixed(4)} |\n`,
              `| **Accuracy** | ${(metrics.accuracy * 100).toFixed(2)}% |\n`,
              `| **Total Cost** | $${metrics.total_cost.toFixed(6)} |\n`,
              `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`,
              `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`,
              `| **Number of Queries** | ${metrics.num_queries} |\n`,
            ].join('');

            // Prefer metrics.json; otherwise fall back to scanning the raw
            // evaluation log. Returns null when no metrics block is present.
            const loadMetrics = () => {
              if (fs.existsSync('metrics.json')) {
                return JSON.parse(fs.readFileSync('metrics.json', 'utf8'));
              }
              const output = fs.readFileSync('evaluation_output.txt', 'utf8');
              const metricsMatch = output.match(/Metrics:\s*(\{[\s\S]*?\})/);
              return metricsMatch ? JSON.parse(metricsMatch[1]) : null;
            };

            let comment = '## 📊 Router Evaluation Results\n\n';
            comment += `**Router:** \`${routerName}\`\n`;
            comment += `**Dataset Split:** \`${split}\`\n\n`;

            try {
              const metrics = loadMetrics();
              comment += metrics
                ? renderMetrics(metrics)
                : '⚠️ Could not parse evaluation metrics from output.\n';
            } catch (error) {
              // Still post a comment so the submitter sees what went wrong.
              comment += `⚠️ Error reading metrics: ${error.message}\n`;
            }

            comment += '\n---\n';
            comment += '*Evaluation completed by RouterArena automated workflow*';

            // Await the request so an API failure fails this step instead of
            // being dropped as an unhandled rejection.
            await github.rest.issues.createComment({
              issue_number: context.payload.pull_request.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: comment,
            });
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,8 @@ dataset/ | |
| *.log | ||
| logs/ | ||
| /cached_results2/ | ||
| .pr_worktrees/ | ||
| pr_evaluations/ | ||
|
|
||
| # Environment files | ||
| .venv | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,42 @@ | ||
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright contributors to the RouterArena project
# SPDX-License-Identifier: Apache-2.0

"""Extract metrics from evaluation output and save to JSON file.

Usage: extract_metrics.py <output_file>

The script looks for a ``Metrics:`` marker followed by a JSON object in the
given file, writes that object to ``metrics.json`` in the current working
directory and prints a few headline values as ``key=value`` lines on stdout.
It exits with status 1 when no metrics block is found or any step fails.
"""

import json
import re
import sys

# Marker that precedes the JSON object. The lookahead keeps the match ending
# exactly at the opening brace, which is where JSON decoding must start.
METRICS_MARKER = re.compile(r"Metrics:\s*(?=\{)")


def parse_metrics_block(content):
    """Return the first JSON object that follows a ``Metrics:`` marker.

    A real JSON decoder is used instead of a ``\\{.*?\\}`` regex: the
    non-greedy regex stops at the first closing brace, which truncates nested
    objects and values containing ``}``.

    Args:
        content: Full text of the evaluation output.

    Returns:
        The decoded metrics dictionary, or ``None`` if no marker is followed
        by a valid JSON object.
    """
    decoder = json.JSONDecoder()
    for marker in METRICS_MARKER.finditer(content):
        try:
            # raw_decode parses one JSON value starting at the given index and
            # ignores whatever text follows it.
            metrics, _ = decoder.raw_decode(content, marker.end())
        except json.JSONDecodeError:
            # Not valid JSON after this marker; try the next occurrence.
            continue
        return metrics
    return None


def main(argv):
    """Run the extraction for ``argv[1]`` and return the process exit code.

    Args:
        argv: Command-line arguments, including the program name at index 0.

    Returns:
        0 on success, 1 on a usage error, missing metrics, or any failure.
    """
    if len(argv) < 2:
        print("Usage: extract_metrics.py <output_file>")
        return 1

    output_file = argv[1]

    try:
        # Logs may contain arbitrary bytes; decode leniently so that a stray
        # byte does not prevent the metrics block from being found.
        with open(output_file, "r", encoding="utf-8", errors="replace") as f:
            content = f.read()

        metrics = parse_metrics_block(content)
        if metrics is None:
            print("No metrics found in output", file=sys.stderr)
            return 1

        # Write metrics to file
        with open("metrics.json", "w", encoding="utf-8") as mf:
            json.dump(metrics, mf)

        # Echo key metrics as key=value lines
        print(f"accuracy={metrics['accuracy']}")
        print(f"arena_score={metrics['arena_score']}")
        print(f"total_cost={metrics['total_cost']}")
        print(f"num_queries={metrics['num_queries']}")
    except Exception as e:
        print(f"Error extracting metrics: {e}", file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.