RouteWorks · yl231 · May 29, 2026 · May 29, 2026 · May 29, 2026 · May 29, 2026
diff --git a/.github/workflows/pr-evaluation.yml b/.github/workflows/pr-evaluation.yml
@@ -293,10 +293,23 @@ jobs:
             comment += `| **Avg Cost per Query** | $${metrics.avg_cost_per_query.toFixed(6)} |\n`;
             comment += `| **Avg Cost per 1K Queries** | $${metrics.avg_cost_per_1000.toFixed(4)} |\n`;
             comment += `| **Number of Queries** | ${metrics.num_queries} |\n`;
+            const abnormalCount = metrics.abnormal_count;
+            if (abnormalCount !== undefined) {
+              comment += `| **Abnormal Entries** | ${abnormalCount} |\n`;
+            }
             const robustnessScore = metrics.robustness_score;
             const robustnessCell = robustnessScore !== undefined ? robustnessScore.toFixed(4) : 'N/A';
             comment += `| **Robustness Score** | ${robustnessCell} |\n`;
 
+            // Warn the author if some queries had no valid generation (scored as 0)
+            if (abnormalCount && abnormalCount > 0) {
+              const pct = ((abnormalCount / metrics.num_queries) * 100).toFixed(1);
+              comment += `\n> ⚠️ **${abnormalCount} of ${metrics.num_queries} queries (${pct}%) had no valid generation** `;
+              comment += `(inference failed / empty answer) and were scored as **incorrect (0)**. `;
+              comment += `These queries still count toward the denominator, so accuracy and cost reflect the full query set. `;
+              comment += `Please regenerate predictions for these queries and resubmit for a complete evaluation.\n`;
+            }
+
             // Add optimality scores if available
             if (metrics.optimality) {
               comment += '\n### Optimality Metrics\n\n';

diff --git a/automation/process_pr_submission.py b/automation/process_pr_submission.py
@@ -254,11 +254,22 @@ def compute_scores(prediction_file: Path) -> dict[str, float]:
     # Filter to regular predictions only (exclude optimality entries)
     regular_predictions = [p for p in predictions if not p.get("for_optimality", False)]
 
-    accuracies = [
-        entry["accuracy"]
-        for entry in regular_predictions
-        if entry.get("accuracy") is not None
-    ]
+    # Every regular query counts toward the accuracy denominator. Entries lacking a
+    # valid generation (generated_result not a dict, or success falsy) or an accuracy
+    # value are scored as 0, not dropped, so a self-selected subset cannot inflate it.
+    accuracies = []
+    abnormal_count = 0
+    for entry in regular_predictions:
+        generated_result = entry.get("generated_result")
+        has_valid_generation = isinstance(
+            generated_result, dict
+        ) and generated_result.get("success", False)
+        accuracy = entry.get("accuracy")
+        if not has_valid_generation or accuracy is None:
+            accuracy = 0.0
+            abnormal_count += 1
+        accuracies.append(accuracy)
+
     costs = [
         entry["cost"]
         for entry in regular_predictions
@@ -280,6 +291,7 @@ def compute_scores(prediction_file: Path) -> dict[str, float]:
         "avg_cost_per_query": avg_cost_per_query,
         "avg_cost_per_1000": avg_cost_per_1000,
         "arena_score": arena_score,
+        "abnormal_count": abnormal_count,
     }
 
 

diff --git a/llm_evaluation/run.py b/llm_evaluation/run.py
@@ -932,39 +932,51 @@ def compute_router_metrics(predictions: List[Dict[str, Any]], router_name: str)
     regular_predictions = [p for p in predictions if not p.get("for_optimality", False)]
     optimality_predictions = [p for p in predictions if p.get("for_optimality", False)]
 
-    # Extract accuracy and cost ONLY from regular predictions for RouterArena score
+    # Extract accuracy and cost ONLY from regular predictions for RouterArena score.
+    #
+    # IMPORTANT: every regular query must contribute to the accuracy denominator.
+    # Entries with no valid generation (``generated_result`` not a dict, or a falsy
+    # ``success`` flag) or with no ``accuracy`` are NOT dropped — they are scored as
+    # 0 (wrong) and counted via ``abnormal_count``, so accuracy is never averaged
+    # over a self-selected subset. NOTE: cost is not covered by this rule — the loop
+    # below still collects only positive ``cost`` values, so cost may still be diluted.
     accuracies = []
     costs = []
     valid_cost_count = 0
+    abnormal_count = 0
 
     for prediction in regular_predictions:
+        generated_result = prediction.get("generated_result")
+        has_valid_generation = isinstance(generated_result, dict) and generated_result.get(
+            "success", False
+        )
         accuracy = prediction.get("accuracy")
-        if accuracy is not None:
-            accuracies.append(accuracy)
+
+        if not has_valid_generation or accuracy is None:
+            # No successful generation (or somehow unscored): count as wrong.
+            accuracy = 0.0
+            abnormal_count += 1
+
+        accuracies.append(accuracy)
 
         cost = prediction.get("cost")
         if cost is not None and cost > 0:
             costs.append(cost)
             valid_cost_count += 1
 
     # Check if any entries were evaluated
-    if not accuracies and not costs:
+    if not regular_predictions:
         raise ValueError(
-            "No entries were evaluated. All prediction entries are missing 'generated_result' fields. "
+            "No regular (non-optimality) entries found. "
             "Please run llm_inference/run.py first to generate model outputs before evaluation."
         )
 
-    if not accuracies:
-        raise ValueError(
-            "No entries have accuracy values. Cannot compute RouterArena score without accuracy data."
-        )
-
     if not costs:
         raise ValueError(
             "No entries have valid cost values. Cannot compute RouterArena score without cost data."
         )
 
-    # Compute average accuracy
+    # Compute average accuracy over ALL regular queries (abnormal entries count as 0)
     avg_accuracy = sum(accuracies) / len(accuracies) if accuracies else 0.0
 
     # Compute total cost (sum of all costs)
@@ -989,6 +1001,9 @@ def compute_router_metrics(predictions: List[Dict[str, Any]], router_name: str)
         )
     logger.info(f"Queries with Accuracy: {len(accuracies)}")
     logger.info(f"Queries with Valid Cost: {valid_cost_count}")
+    logger.info(
+        f"Abnormal Entries (no valid generation, scored as 0): {abnormal_count}"
+    )
     logger.info(f"Average Accuracy: {avg_accuracy:.4f}")
     logger.info(f"Total Cost: ${total_cost:.6f}")
     if num_queries > 0:
@@ -1050,6 +1065,7 @@ def compute_router_metrics(predictions: List[Dict[str, Any]], router_name: str)
         "avg_cost_per_query": total_cost / num_queries if num_queries > 0 else 0.0,
         "avg_cost_per_1000": avg_cost_per_1000,
         "num_queries": num_queries,
+        "abnormal_count": abnormal_count,
     }
 
     # Add optimality scores if available (reuse previously computed result)