Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/pr-evaluation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -293,10 +293,23 @@ jobs:
comment += `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`;
comment += `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`;
comment += `| **Number of Queries** | ${metrics.num_queries} |\n`;
const abnormalCount = metrics.abnormal_count;
if (abnormalCount !== undefined) {
comment += `| **Abnormal Entries** | ${abnormalCount} |\n`;
}
const robustnessScore = metrics.robustness_score;
const robustnessCell = robustnessScore !== undefined ? robustnessScore.toFixed(4) : 'N/A';
comment += `| **Robustness Score** | ${robustnessCell} |\n`;

// Warn the author if some queries had no valid generation (scored as 0)
if (abnormalCount && abnormalCount > 0) {
const pct = ((abnormalCount / metrics.num_queries) * 100).toFixed(1);
comment += `\n> ⚠️ **${abnormalCount} of ${metrics.num_queries} queries (${pct}%) had no valid generation** `;
comment += `(inference failed / empty answer) and were scored as **incorrect (0)**. `;
comment += `These queries still count toward the denominator, so accuracy and cost reflect the full query set. `;
comment += `Please regenerate predictions for these queries and resubmit for a complete evaluation.\n`;
}

// Add optimality scores if available
if (metrics.optimality) {
comment += '\n### Optimality Metrics\n\n';
Expand Down
22 changes: 17 additions & 5 deletions automation/process_pr_submission.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,22 @@ def compute_scores(prediction_file: Path) -> dict[str, float]:
# Filter to regular predictions only (exclude optimality entries)
regular_predictions = [p for p in predictions if not p.get("for_optimality", False)]

accuracies = [
entry["accuracy"]
for entry in regular_predictions
if entry.get("accuracy") is not None
]
# Every regular query contributes to the accuracy denominator. Entries with no
# valid generation (success=False / missing / empty) are scored as 0, not dropped,
# so accuracy cannot be inflated over a self-selected answered subset.
accuracies = []
abnormal_count = 0
for entry in regular_predictions:
generated_result = entry.get("generated_result")
has_valid_generation = isinstance(
generated_result, dict
) and generated_result.get("success", False)
accuracy = entry.get("accuracy")
if not has_valid_generation or accuracy is None:
accuracy = 0.0
abnormal_count += 1
accuracies.append(accuracy)
Comment thread
yl231 marked this conversation as resolved.
Outdated

costs = [
entry["cost"]
for entry in regular_predictions
Expand All @@ -280,6 +291,7 @@ def compute_scores(prediction_file: Path) -> dict[str, float]:
"avg_cost_per_query": avg_cost_per_query,
"avg_cost_per_1000": avg_cost_per_1000,
"arena_score": arena_score,
"abnormal_count": abnormal_count,
}


Expand Down
38 changes: 27 additions & 11 deletions llm_evaluation/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -932,39 +932,51 @@ def compute_router_metrics(predictions: List[Dict[str, Any]], router_name: str)
regular_predictions = [p for p in predictions if not p.get("for_optimality", False)]
optimality_predictions = [p for p in predictions if p.get("for_optimality", False)]

# Extract accuracy and cost ONLY from regular predictions for RouterArena score
# Extract accuracy and cost ONLY from regular predictions for RouterArena score.
#
# IMPORTANT: every regular query must contribute to the accuracy denominator.
# Entries with no valid generation (inference failed / success=False / empty
# generated_answer) are NOT dropped — they are scored as 0 (wrong) and counted
# via ``abnormal_count``. Silently dropping them would let a submission inflate
# accuracy (averaged over a self-selected subset) and dilute cost (summed over
# the answered subset but divided by the full query count).
accuracies = []
costs = []
valid_cost_count = 0
abnormal_count = 0

for prediction in regular_predictions:
generated_result = prediction.get("generated_result")
has_valid_generation = isinstance(generated_result, dict) and generated_result.get(
"success", False
)
accuracy = prediction.get("accuracy")
if accuracy is not None:
accuracies.append(accuracy)

if not has_valid_generation or accuracy is None:
# No successful generation (or somehow unscored): count as wrong.
accuracy = 0.0
abnormal_count += 1

accuracies.append(accuracy)
Comment thread
yl231 marked this conversation as resolved.
Outdated

cost = prediction.get("cost")
if cost is not None and cost > 0:
costs.append(cost)
valid_cost_count += 1

# Check if any entries were evaluated
if not accuracies and not costs:
if not regular_predictions:
raise ValueError(
"No entries were evaluated. All prediction entries are missing 'generated_result' fields. "
"No regular (non-optimality) entries found. "
"Please run llm_inference/run.py first to generate model outputs before evaluation."
)

if not accuracies:
raise ValueError(
"No entries have accuracy values. Cannot compute RouterArena score without accuracy data."
)

if not costs:
raise ValueError(
"No entries have valid cost values. Cannot compute RouterArena score without cost data."
)

# Compute average accuracy
# Compute average accuracy over ALL regular queries (abnormal entries count as 0)
avg_accuracy = sum(accuracies) / len(accuracies) if accuracies else 0.0

# Compute total cost (sum of all costs)
Expand All @@ -989,6 +1001,9 @@ def compute_router_metrics(predictions: List[Dict[str, Any]], router_name: str)
)
logger.info(f"Queries with Accuracy: {len(accuracies)}")
logger.info(f"Queries with Valid Cost: {valid_cost_count}")
logger.info(
f"Abnormal Entries (no valid generation, scored as 0): {abnormal_count}"
)
logger.info(f"Average Accuracy: {avg_accuracy:.4f}")
logger.info(f"Total Cost: ${total_cost:.6f}")
if num_queries > 0:
Expand Down Expand Up @@ -1050,6 +1065,7 @@ def compute_router_metrics(predictions: List[Dict[str, Any]], router_name: str)
"avg_cost_per_query": total_cost / num_queries if num_queries > 0 else 0.0,
"avg_cost_per_1000": avg_cost_per_1000,
"num_queries": num_queries,
"abnormal_count": abnormal_count,
}

# Add optimality scores if available (reuse previously computed result)
Expand Down
Loading