RouteWorks
diff --git a/README.md b/README.md
Lines changed: 11 additions & 9 deletions
diff --git a/config/eval_config/zero-shot/GPQA.json b/config/eval_config/zero-shot/GPQA.json
Lines changed: 1 addition & 1 deletion
diff --git a/llm_evaluation/batch_evaluate.py b/llm_evaluation/batch_evaluate.py
Lines changed: 133 additions & 37 deletions
@@ -34,11 +34,11 @@ For more details, please see our [website](https://routeworks.github.io/leaderbo
 
 | Rank | Router | Affiliation | Acc-Cost Arena | Accuracy | Cost/1K Queries | Optimal Selection | Optimal Cost | Optimal Accuracy | Latency | Robustness |
 |------|--------------------|-----------------------------|--------|----------|---------|-----------------|--------------|----------------|---------|------------|
-| 🥇 | [MIRT‑BERT](https://arxiv.org/pdf/2506.01048)&nbsp;[[Code]](https://github.com/Mercidaiha/IRT-Router) | 🎓&nbsp;USTC | 66.89 | 66.88 | $0.15 | 3.44 | 19.62 | 78.18 | 27.03 | 61.19 |
-| 🥈 | [Azure‑Router](https://ai.azure.com/catalog/models/model-router)&nbsp;[[Web]](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/model-router) | 💼&nbsp;Microsoft | 66.66 | 68.09 | $0.54 | 22.52 | 46.32 | 81.96 | — | 54.07 |
-| 🥉 | [NIRT‑BERT](https://arxiv.org/pdf/2506.01048)&nbsp;[[Code]](https://github.com/Mercidaiha/IRT-Router) | 🎓&nbsp;USTC | 66.12 | 66.34 | $0.21 | 3.83 | 14.04 | 77.88 | 10.42 | 49.29 |
-| 4 | [GPT‑5](https://openai.com/index/introducing-gpt-5/)| 💼&nbsp;OpenAI | 64.32 | 73.96 | $10.02 | — | — | — | — | — |
-| 5 | [vLLM‑SR](https://vllm-semantic-router.com/)&nbsp;[[Code]](https://github.com/vllm-project/semantic-router)&nbsp;[[HF]](https://huggingface.co/llm-semantic-router) | 🎓&nbsp;vLLM SR Team | 64.32 | 67.28 | $1.67 | 4.79 | 12.54 | 79.33 | 0.19 | 35.00 |
+| 🥇 | [vLLM‑SR](https://vllm-semantic-router.com/)&nbsp;[[Code]](https://github.com/vllm-project/semantic-router)&nbsp;[[HF]](https://huggingface.co/llm-semantic-router) | 🎓&nbsp;vLLM SR Team | 67.23 | 66.53 | $0.06 | 94.10 | 90.12 | 100.00 | — | 90.95 |
+| 🥈 | [MIRT‑BERT](https://arxiv.org/pdf/2506.01048)&nbsp;[[Code]](https://github.com/Mercidaiha/IRT-Router) | 🎓&nbsp;USTC | 66.89 | 66.88 | $0.15 | 3.44 | 19.62 | 78.18 | 27.03 | 61.19 |
+| 🥉 | [Azure‑Router](https://ai.azure.com/catalog/models/model-router)&nbsp;[[Web]](https://learn.microsoft.com/en-us/azure/ai-foundry/openai/concepts/model-router) | 💼&nbsp;Microsoft | 66.66 | 68.09 | $0.54 | 22.52 | 46.32 | 81.96 | — | 54.07 |
+| 4 | [NIRT‑BERT](https://arxiv.org/pdf/2506.01048)&nbsp;[[Code]](https://github.com/Mercidaiha/IRT-Router) | 🎓&nbsp;USTC | 66.12 | 66.34 | $0.21 | 3.83 | 14.04 | 77.88 | 10.42 | 49.29 |
+| 5 | [GPT‑5](https://openai.com/index/introducing-gpt-5/)| 💼&nbsp;OpenAI | 64.32 | 73.96 | $10.02 | — | — | — | — | — |
 | 6 | [CARROT](https://arxiv.org/abs/2502.03261)&nbsp;[[Code]](https://github.com/somerstep/CARROT)&nbsp;[[HF]](https://huggingface.co/CARROT-LLM-Routing) | 🎓&nbsp;UMich | 63.87 | 67.21 | $2.06 | 2.68 | 6.77 | 78.63 | 1.50 | 89.05 |
 | 7 | [Chayan](https://huggingface.co/adaptive-classifier/chayan)&nbsp;[[HF]](https://huggingface.co/adaptive-classifier/chayan) | 🎓&nbsp;Adaptive&nbsp;Classifier | 63.83 | 64.89 | $0.56 | 43.03 | 43.75 | 88.74 | — | — |
 | 8 | [RouterBench‑MLP](https://arxiv.org/pdf/2403.12031)&nbsp;[[Code]](https://github.com/withmartian/routerbench)&nbsp;[[HF]](https://huggingface.co/datasets/withmartian/routerbench) | 🎓&nbsp;Martian | 57.56 | 61.62 | $4.83 | 13.39 | 24.45 | 83.32 | 90.91 | 80.00 |
@@ -102,6 +102,7 @@ Create a config file in `./router_inference/config/<router_name>.json`. An examp
 {
   "pipeline_params": {
       "router_name": "your-router",
+      "router_cls_name": "your_router_class_name",
       "models": [
           "gpt-4o-mini",
           "claude-3-haiku-20240307",
@@ -111,7 +112,7 @@ Create a config file in `./router_inference/config/<router_name>.json`. An examp
 }
 ```
 
-For each model in your config, add an entry with the pricing per million tokens in this format at [`model_cost/cost.json`](./model_cost/cost.json):
+For each model in your config, add an entry with the pricing per million tokens in this format at [`model_cost/model_cost.json`](./model_cost/model_cost.json):
 
 ```json
 {
@@ -129,12 +130,13 @@ For each model in your config, add an entry with the pricing per million tokens
 
 Create your own router class by inheriting from `BaseRouter` and implementing the `_get_prediction()` method. See [`router_inference/router/example_router.py`](./router_inference/router/example_router.py) for a complete example.
 
-Then, modify [`router_inference/generate_prediction_file.py`](./router_inference/generate_prediction_file.py#L150) to use your router class:
+Then, modify [`router_inference/router/__init__.py`](./router_inference/router/__init__.py) to include your router class:
 
 ```python
-# Replace ExampleRouter with your router class
+# Import your router class
 from router_inference.router.my_router import MyRouter
-router = MyRouter(args.router_name)
+
+__all__ = ["BaseRouter", "ExampleRouter", "MyRouter"]
 ```
 
 Finally, generate the prediction file:
 
@@ -15,4 +15,4 @@
             "output_config": "output_config.json"
         }
     }
-}
+}
@@ -4,18 +4,23 @@
 """
 Batch Model Evaluation Script
 
-This script runs evaluation for multiple models in parallel using the universal model names
-from universal_model_names.py. It can process up to 16 models concurrently for efficiency.
+This script runs evaluation for multiple models sequentially.
+It processes models one by one, but each model evaluation uses
+query-level parallelism for efficiency.
+
+The script evaluates all models that:
+1. Have a cached results file (.jsonl) in the cached_results directory
+2. Are listed in model_cost.json (for cost calculation)
 
 Usage:
-    python batch_evaluate.py [--cached-results-dir CACHED_RESULTS_DIR] [--max-workers MAX_WORKERS] [--models MODEL1 MODEL2 ...]
+    python batch_evaluate.py [--cached-results-dir CACHED_RESULTS_DIR] [--num-workers NUM_WORKERS] [--model-cost-path PATH] [--rerun]
 """
 
 import os
 import sys
 import argparse
 import subprocess
-import concurrent.futures
+import json
 from typing import List, Optional
 import time
 
@@ -28,9 +33,48 @@
     print(f"Loaded {len(universal_names)} universal model names")
 except ImportError:
     print(
-        "Error: Could not import universal_model_names. Make sure the file exists in the parent directory."
+        "Warning: Could not import universal_model_names. Model name validation may be limited."
     )
-    sys.exit(1)
+    universal_names = []
+
+
def load_models_from_cost_config(
    cost_config_path: Optional[str], project_root: str
) -> List[str]:
    """
    Load the list of model names from model_cost.json.

    The model names are the top-level keys of the JSON object stored in the
    cost configuration file. Every failure mode (missing file, unreadable
    file, invalid JSON, JSON whose top level is not an object) is reported on
    stdout and results in an empty list, so callers only need to check for an
    empty result.

    Args:
        cost_config_path: Path to model_cost.json (can be relative or absolute).
                         If None or empty, the path defaults to
                         project_root/model_cost/model_cost.json. A relative
                         path is resolved against project_root.
        project_root: Path to project root directory

    Returns:
        List of model names (keys from model_cost.json) in file order, or an
        empty list if the configuration could not be loaded.
    """
    # If no path provided, use default location in project root
    if not cost_config_path:
        cost_config_path = os.path.join(project_root, "model_cost", "model_cost.json")
    elif not os.path.isabs(cost_config_path):
        # If relative path, make it relative to project root
        cost_config_path = os.path.join(project_root, cost_config_path)

    if not os.path.exists(cost_config_path):
        print(f"Error: model_cost.json not found at: {cost_config_path}")
        return []

    # Keep the try body limited to the I/O and parsing that can actually fail.
    # UnicodeDecodeError is listed explicitly: it is a ValueError, but not a
    # JSONDecodeError or OSError, so a non-UTF-8 file would otherwise crash.
    try:
        with open(cost_config_path, "r", encoding="utf-8") as f:
            cost_config = json.load(f)
    except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
        print(
            f"Error: Could not load or parse cost configuration from {cost_config_path}: {e}"
        )
        return []

    # Valid JSON is not necessarily an object; a list or scalar has no keys.
    if not isinstance(cost_config, dict):
        print(
            f"Error: Expected a JSON object keyed by model name in {cost_config_path}, "
            f"got {type(cost_config).__name__}"
        )
        return []

    models = list(cost_config)
    print(f"Loaded {len(models)} models from {cost_config_path}")
    return models
 
 
 def check_cached_results_exist(cached_results_dir: str, model_name: str) -> bool:
@@ -69,7 +113,7 @@ def get_available_models(
 
 
 def run_evaluation(
-    model_name: str, cached_results_dir: str, rerun: bool = False
+    model_name: str, cached_results_dir: str, num_workers: int = 16, rerun: bool = False
 ) -> dict:
     """Run evaluation for a single model."""
     start_time = time.time()
@@ -86,6 +130,8 @@ def run_evaluation(
             model_name,
             "--cached-results-dir",
             cached_results_dir,
+            "--num-workers",
+            str(num_workers),
         ]
 
         # Add rerun flag if specified
@@ -94,35 +140,73 @@ def run_evaluation(
 
         print(f"🔄 Starting evaluation for {model_name}")
 
-        result = subprocess.run(
+        # Run with real-time output streaming
+        # Use Popen to stream output while still being able to capture it for error reporting
+        process = subprocess.Popen(
             cmd,
             stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
+            stderr=subprocess.STDOUT,  # Merge stderr into stdout
             universal_newlines=True,
-            timeout=43200,  # 12 hours timeout per model
+            bufsize=1,  # Line buffered
         )
 
+        # Stream output in real time and collect it
+        stdout_lines: list[str] = []
+        try:
+            import sys as sys_module
+
+            # Read lines and print in real time
+            stdout_stream = process.stdout
+            if stdout_stream is None:
+                raise RuntimeError("stdout is None despite PIPE")
+            while True:
+                line = stdout_stream.readline()
+                if not line:
+                    if process.poll() is not None:
+                        break  # Process finished
+                    continue
+                sys_module.stdout.write(line)
+                sys_module.stdout.flush()
+                stdout_lines.append(line)
+
+            # The loop above exits once readline() returns an empty string (EOF)
+            # and the process has exited; wait() then just collects the exit status
+            # (the 12-hour timeout is only a safeguard at this point)
+            process.wait(timeout=43200)
+
+        except (KeyboardInterrupt, Exception) as e:
+            if process.poll() is None:
+                process.kill()
+                process.wait()
+            if isinstance(e, KeyboardInterrupt):
+                raise
+            # Continue to handle as error below
+
+        stdout_text = "".join(stdout_lines)
         duration = time.time() - start_time
 
-        if result.returncode == 0:
+        if process.returncode == 0:
             print(f"✅ Completed {model_name} in {duration:.1f}s")
             return {
                 "model_name": model_name,
                 "status": "success",
                 "duration": duration,
-                "stdout": result.stdout,
-                "stderr": result.stderr,
+                "stdout": stdout_text,
+                "stderr": "",  # Merged into stdout
             }
         else:
             print(f"❌ Failed {model_name} after {duration:.1f}s")
-            print(f"   Error: {result.stderr.strip()}")
+            # Extract error from last few lines if available
+            error_msg = (
+                "\n".join(stdout_lines[-10:]) if stdout_lines else "Unknown error"
+            )
+            print(f"   Error: {error_msg.strip()}")
             return {
                 "model_name": model_name,
                 "status": "failed",
                 "duration": duration,
-                "stdout": result.stdout,
-                "stderr": result.stderr,
-                "return_code": result.returncode,
+                "stdout": stdout_text,
+                "stderr": "",  # Merged into stdout
+                "return_code": process.returncode,
             }
 
     except subprocess.TimeoutExpired:
@@ -156,27 +240,50 @@ def main():
         help="Directory containing cached results (default: ../cached_results/)",
     )
     parser.add_argument(
-        "--max-workers",
+        "--num-workers",
         type=int,
         default=16,
-        help="Maximum number of parallel evaluations (default: 16)",
+        help="Number of parallel workers for query-level evaluation (default: 16)",
     )
     parser.add_argument(
         "--rerun",
         action="store_true",
         help="Force re-evaluation of all entries, even if already evaluated",
     )
+    parser.add_argument(
+        "--model-cost-path",
+        type=str,
+        default=None,
+        help="Path to model_cost.json file (default: {project_root}/model_cost/model_cost.json). Can be absolute or relative to project root.",
+    )
 
     args = parser.parse_args()
 
+    # --max-workers was renamed to --num-workers; no fallback for the old flag is implemented here.
+    # The value sets the query-level parallelism used inside each model evaluation;
+    # the models themselves are processed sequentially.
+    num_workers = args.num_workers
+
+    # Get project root directory (parent of llm_evaluation/)
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    project_root = os.path.dirname(script_dir)
+
     # Validate cached results directory
     if not os.path.exists(args.cached_results_dir):
         print(
             f"Error: Cached results directory does not exist: {args.cached_results_dir}"
         )
         return 1
 
-    model_list = universal_names
+    # Load models from model_cost.json instead of universal_names
+    # Default path is project_root/model_cost/model_cost.json
+    model_list = load_models_from_cost_config(args.model_cost_path, project_root)
+
+    if not model_list:
+        print(
+            "Error: No models found in model_cost.json. Cannot proceed with evaluation."
+        )
+        return 1
 
     # Get available models (those with cached results)
     available_models = get_available_models(args.cached_results_dir, model_list)
@@ -186,29 +293,18 @@ def main():
         return 1
 
     print(f"\n🚀 Starting batch evaluation of {len(available_models)} models")
-    print(f"📊 Using {min(args.max_workers, len(available_models))} parallel workers")
+    print(f"📊 Using {num_workers} parallel workers per model (sequential models)")
     print(f"📁 Cached results directory: {args.cached_results_dir}")
     print("=" * 80)
 
-    # Run evaluations in parallel
+    # Run evaluations sequentially (one model at a time)
+    # Each model evaluation will internally use query-level parallelism
     start_time = time.time()
     results = []
 
-    with concurrent.futures.ThreadPoolExecutor(
-        max_workers=args.max_workers
-    ) as executor:
-        # Submit all tasks
-        future_to_model = {
-            executor.submit(
-                run_evaluation, model, args.cached_results_dir, args.rerun
-            ): model
-            for model in available_models
-        }
-
-        # Collect results as they complete
-        for future in concurrent.futures.as_completed(future_to_model):
-            result = future.result()
-            results.append(result)
+    for model in available_models:
+        result = run_evaluation(model, args.cached_results_dir, num_workers, args.rerun)
+        results.append(result)
 
     # Print final summary
     total_duration = time.time() - start_time
Original file line number	Diff line number	Diff line change
`@@ -15,4 +15,4 @@`
`15`	`15`	`"output_config": "output_config.json"`
`16`	`16`	`}`
`17`	`17`	`}`
`18`		`-}`
	`18`	`+}`