Merge origin/main into research-pipeline: resolve conflicts and align split naming with README.md (use "sub_10" for the 10% split)

yl231 · yl231 · commit 8a5870ece415 · 2025-11-03T20:31:34.000-06:00
diff --git a/llm_evaluation/eval_reasoning.py b/llm_evaluation/eval_reasoning.py
@@ -3,6 +3,7 @@
 
 import os
 import json
+from typing import Any, Dict
 from metrics import (
     mcq_exact_match,
     mcq_accuracy,
@@ -201,8 +202,8 @@ def eval(pred_dir, eval_params, pipeline_config, all_data):
         scores: Dictionary of evaluation scores
         raw_results: Detailed results for each prediction
     """
-    scores = dict()
-    all_raw_results = dict()
+    scores: Dict[str, float] = {}
+    all_raw_results: Dict[str, Dict[str, Any]] = {}
 
     # Get the appropriate scorers for this dataset and metrics
     dataset_name = eval_params["dataset"]
diff --git a/llm_evaluation/evaluate_models.py b/llm_evaluation/evaluate_models.py
@@ -98,8 +98,8 @@ def load_all_data(self):
         print("Loading ground truth data...")
         try:
             # Load data directly without LiveCodeBench dependency
-            from datasets import load_from_disk  # type: ignore[import-untyped]
-            import pandas as pd  # type: ignore[import-untyped]
+            from datasets import load_from_disk
+            import pandas as pd
 
             # Load the router eval benchmark dataset
             router_eval_bench = load_from_disk("./dataset/routerarena")
@@ -368,14 +368,21 @@ def evaluate_model(self, model_name: str, rerun=False) -> Dict[str, Any]:
 
             # Evaluate each entry in this dataset
             for entry in dataset_entries:
-                global_index = entry.get("global_index")
+                global_index_val = entry.get("global_index")
                 generated_answer = entry.get("generated_answer", "")
 
                 try:
                     # Get ground truth for this entry
-                    ground_truth = self._get_ground_truth(global_index, dataset_name)
+                    if not isinstance(global_index_val, str):
+                        print(
+                            f"Warning: Invalid global_index {global_index_val} for dataset {dataset_name}"
+                        )
+                        continue
+                    ground_truth = self._get_ground_truth(
+                        global_index_val, dataset_name
+                    )
                     if ground_truth is None:
-                        print(f"Warning: No ground truth found for {global_index}")
+                        print(f"Warning: No ground truth found for {global_index_val}")
                         continue
 
                     # Evaluate using the appropriate scorer
@@ -415,7 +422,7 @@ def evaluate_model(self, model_name: str, rerun=False) -> Dict[str, Any]:
                         "metric": "error",
                         "inference_cost": 0.0,
                     }
-                    print(f"Error evaluating {global_index}: {e}")
+                    print(f"Error evaluating {global_index_val}: {e}")
                     continue
 
             dataset_scores[dataset_name] = len(dataset_entries)
@@ -432,7 +439,7 @@ def evaluate_model(self, model_name: str, rerun=False) -> Dict[str, Any]:
         print(f"Evaluation completed. Evaluated {evaluated_count} new entries.")
         return self._compile_final_results(universal_model_name, cached_results)
 
-    def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[str]:
+    def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[Any]:
         """Get ground truth for a specific global_index from the dataset."""
         # Load dataset if not already loaded
         if self.all_data is None:
@@ -475,7 +482,7 @@ def _get_ground_truth(self, global_index: str, dataset_name: str) -> Optional[st
         return None
 
     def _evaluate_single_entry(
-        self, generated_answer: str, ground_truth: str, scorer, dataset_name: str
+        self, generated_answer: str, ground_truth: Any, scorer, dataset_name: str
     ) -> tuple:
         """Evaluate a single entry using the appropriate scorer."""
         try:
@@ -593,7 +600,10 @@ def main():
 
     args = parser.parse_args()
 
-    universal_name = model_name_manager.get_universal_name(args.model_name)
+    if model_name_manager is not None:
+        universal_name = model_name_manager.get_universal_name(args.model_name)
+    else:
+        universal_name = args.model_name
     print(f"Input model name: {args.model_name}")
     print(f"Universal model name: {universal_name}")
 
diff --git a/llm_evaluation/livecodebench_util.py b/llm_evaluation/livecodebench_util.py
@@ -20,7 +20,10 @@
 import time
 import zlib
 from io import StringIO
-from typing import Optional
+from typing import Optional, Any, Dict
+
+# Global storage for original references used by reliability_guard
+originals: Dict[str, Any] = {}
 
 
 def has_code(response):
@@ -126,8 +129,7 @@ def post_process_tests_inputs(raw_text, is_stdin):
 
             # If no matches are found, fall back to line-by-line parsing
             cleaned_lines = cleaned_string.split("\n")
-            if test_cases is None:
-                test_cases = []
+            test_cases = []
             for line in cleaned_lines:
                 try:
                     test_case = json.loads(line)
@@ -229,10 +231,9 @@ def prepare_test_input_output_std(test_case):
 
 def run_test_func(completion, is_extracted, test_input, test_output):
     # print(f"inside: {completion}")
+    # Define the namespace in which to execute the completion code
+    namespace: Dict[str, Any] = {}
     if not is_extracted:
-        # Define the namespace in which to execute the completion code
-        namespace = {}
-
         # Execute the generated code in the namespace
 
         exec(completion, namespace)
@@ -273,8 +274,6 @@ def run_test_func(completion, is_extracted, test_input, test_output):
 
         return True, result_output
     else:
-        namespace = {}
-
         # Execute the generated code in the namespace
 
         exec(completion, namespace)
@@ -313,7 +312,7 @@ def run_test_std(completion, test_input, test_output):
         # Simulate that the code is being run as the main script
         completion = '__name__ = "__main__"\n' + completion
 
-    namespace = {}
+    namespace: Dict[str, Any] = {}
     exec(completion, namespace)
 
     output_value = output.getvalue().strip()
@@ -409,7 +408,7 @@ def swallow_io(redirect_input=True):
 
     with contextlib.redirect_stdout(stream), contextlib.redirect_stderr(stream):
         if redirect_input:
-            with contextlib.redirect_stdin(StringIO()):  # Redirect stdin if enabled
+            with redirect_stdin(StringIO()):  # Redirect stdin if enabled
                 yield stream
         else:
             yield stream  # Do not redirect stdin
@@ -443,8 +442,18 @@ def readable(self, *args, **kwargs):
         return False
 
 
-class redirect_stdin(contextlib._RedirectStream):  # type: ignore
-    _stream = "stdin"
+class redirect_stdin:
+    def __init__(self, new_target: Any):
+        self._new_target = new_target
+        self._old_target: Any = None
+
+    def __enter__(self) -> Any:
+        self._old_target = sys.stdin
+        sys.stdin = self._new_target
+        return self._new_target
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        sys.stdin = self._old_target
 
 
 @contextlib.contextmanager
@@ -484,7 +493,9 @@ def reliability_guard(maximum_memory_bytes: Optional[int] = None):
     builtins.exit = cast(Any, None)  # type: ignore[assignment]
     builtins.quit = cast(Any, None)  # type: ignore[assignment]
 
-    import os
+    # Prepare Any-typed aliases to avoid mypy assignment errors
+    os_mod: Any = os
+    subprocess_mod: Any = subprocess
 
     os.environ["OMP_NUM_THREADS"] = "1"
 
@@ -516,20 +527,51 @@ def reliability_guard(maximum_memory_bytes: Optional[int] = None):
     os.getcwd = cast(Any, None)  # type: ignore[assignment]
     os.chdir = cast(Any, None)  # type: ignore[assignment]
 
-    import shutil
+    # Disable destructive os functions (guard where platform-specific)
+    for name in [
+        "kill",
+        "system",
+        "putenv",
+        "remove",
+        "removedirs",
+        "rmdir",
+        "fchdir",
+        "setuid",
+        "fork",
+        "forkpty",
+        "killpg",
+        "rename",
+        "renames",
+        "truncate",
+        "replace",
+        "unlink",
+        "fchmod",
+        "fchown",
+        "chmod",
+        "chown",
+        "chroot",
+        "getcwd",
+        "chdir",
+        "lchflags",
+        "lchmod",
+        "lchown",
+    ]:
+        try:
+            setattr(os_mod, name, None)
+        except Exception:
+            pass
 
     shutil.rmtree = cast(Any, None)  # type: ignore[assignment]
     shutil.move = cast(Any, None)  # type: ignore[assignment]
     shutil.chown = cast(Any, None)  # type: ignore[assignment]
 
-    import subprocess
+    # Disable subprocess.Popen
+    setattr(subprocess_mod, "Popen", None)
 
     setattr(subprocess, "Popen", cast(Any, None))  # type: ignore[misc]
 
     # __builtins__["help"] = None   # this line is commented out as it results into error
 
-    import sys
-
     sys.modules["ipdb"] = None  # type: ignore[assignment]
     sys.modules["joblib"] = None  # type: ignore[assignment]
     sys.modules["resource"] = None  # type: ignore[assignment]
@@ -600,7 +642,7 @@ def restore_original_references():
         setattr(shutil, func_name, original_func)
 
     # Restore 'subprocess' functions
-    subprocess.Popen = originals["subprocess"]["Popen"]
+    setattr(subprocess, "Popen", originals["subprocess"]["Popen"])
 
     # Restore sys modules
     for module_name, original_module in originals["sys_modules"].items():
diff --git a/llm_evaluation/metrics.py b/llm_evaluation/metrics.py
@@ -130,7 +130,7 @@ def classification_score(prediction, ground_truth, **kwargs):
             score = 0.0
     else:
         best_match = None
-        highest_similarity = 0
+        highest_similarity = 0.0
         for string in all_classes:
             similarity = difflib.SequenceMatcher(None, string, prediction).ratio()
             if similarity > highest_similarity:
@@ -528,24 +528,25 @@ def math_equal(
 
     try:  # 1. numerical equal
         if is_digit(prediction) and is_digit(reference):
-            prediction = parse_digits(prediction)
-            reference = parse_digits(reference)
-            # number questions
-            if include_percentage:
-                gt_result = [reference / 100, reference, reference * 100]
-            else:
-                gt_result = [reference]
-            for item in gt_result:
-                try:
-                    if is_close:
-                        if numeric_equal(prediction, item):
-                            return True
-                    else:
-                        if item == prediction:
-                            return True
-                except Exception:
-                    continue
-            return False
+            pred_val = parse_digits(prediction)
+            ref_val = parse_digits(reference)
+            if pred_val is not None and ref_val is not None:
+                # number questions
+                if include_percentage:
+                    gt_result: list[float] = [ref_val / 100, ref_val, ref_val * 100]
+                else:
+                    gt_result = [ref_val]
+                for item in gt_result:
+                    try:
+                        if is_close:
+                            if numeric_equal(pred_val, item):
+                                return True
+                        else:
+                            if item == pred_val:
+                                return True
+                    except Exception:
+                        continue
+                return False
     except Exception:
         pass
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -100,3 +100,9 @@ packages = ["."]
 
 [tool.uv]
 dev-dependencies = []
+
+[tool.mypy]
+plugins = ['pydantic.mypy']
+ignore_missing_imports = true
+check_untyped_defs = true
+follow_imports = "silent"
diff --git a/router_inference/check_config_prediction_files.py b/router_inference/check_config_prediction_files.py
@@ -15,7 +15,7 @@
 Usage:
     python router_inference/check_config_prediction_files.py <router_name> <split>
 
-    split: either "10" for 10% split or "full" for full dataset
+    split: either "sub_10" for 10% split or "full" for full dataset
 """
 
 import argparse
@@ -89,15 +89,15 @@ def load_dataset(split: str) -> List[Dict[str, Any]]:
     Load dataset file.
 
     Args:
-        split: Either "10" or "full"
+        split: Either "sub_10" or "full"
 
     Returns:
         List of dataset entries
     """
     dataset_path = DATASET_PATHS.get(split)
 
     if not dataset_path:
-        raise ValueError(f"Invalid split: {split}. Must be '10' or 'full'")
+        raise ValueError(f"Invalid split: {split}. Must be 'sub_10' or 'full'")
 
     if not os.path.exists(dataset_path):
         raise FileNotFoundError(f"Dataset file not found: {dataset_path}")
@@ -143,15 +143,15 @@ def check_prediction_size(
 
     Args:
         predictions: List of prediction dictionaries
-        split: Either "10" or "full"
+        split: Either "sub_10" or "full"
 
     Returns:
         Tuple of (is_valid, error_message)
     """
     expected_size = EXPECTED_SIZES.get(split)
 
     if expected_size is None:
-        return False, f"Invalid split: {split}. Must be '10' or 'full'"
+        return False, f"Invalid split: {split}. Must be 'sub_10' or 'full'"
 
     actual_size = len(predictions)
 
diff --git a/router_inference/generate_prediction_file.py b/router_inference/generate_prediction_file.py
@@ -11,7 +11,7 @@
 Usage:
     python router_inference/generate_prediction_file.py <router_name> <split>
 
-    split: either "10" for 10% split (809 entries) or "full" (8400 entries)
+    split: either "sub_10" for 10% split (809 entries) or "full" (8400 entries)
 """
 
 import argparse
@@ -56,15 +56,15 @@ def load_dataset(split: str) -> List[Dict[str, Any]]:
     Load dataset file.
 
     Args:
-        split: Either "10" or "full"
+        split: Either "sub_10" or "full"
 
     Returns:
         List of dataset entries
     """
     dataset_path = DATASET_PATHS.get(split)
 
     if not dataset_path:
-        raise ValueError(f"Invalid split: {split}. Must be '10' or 'full'")
+        raise ValueError(f"Invalid split: {split}. Must be 'sub_10' or 'full'")
 
     if not os.path.exists(dataset_path):
         raise FileNotFoundError(f"Dataset file not found: {dataset_path}")