Fix GitHub hosted CI runner bugs for router evaluation (#84)

jiarong0907 · web-flow · commit d0e776ad4e09 · 2026-02-18T15:36:25.000-06:00
* try a different way to install uv

* fix the `uv run --active` flag usage.

* Add more logging to track progress

* enabling logging in the middle

* enable subprocess log

* fix pre-commit ci
diff --git a/.github/workflows/pr-evaluation.yml b/.github/workflows/pr-evaluation.yml
@@ -6,6 +6,10 @@ on:
 
 jobs:
   evaluate-router:
+    concurrency:
+      group: evaluate-router-pr-${{ github.event.issue.number }}
+      cancel-in-progress: false
+    timeout-minutes: 120
     if: >-
       github.event.issue.pull_request &&
       startsWith(github.event.comment.body, '/evaluate') &&
@@ -183,6 +187,20 @@ jobs:
           echo "Detected router submission: ${{ steps.detect.outputs.router }}"
           echo "Detected split: ${{ steps.detect.outputs.split }}"
 
+      - name: Set up Python
+        if: ${{ steps.detect.outputs.router != '' }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+
+      - name: Install uv
+        if: ${{ steps.detect.outputs.router != '' }}
+        run: |
+          set -euo pipefail
+          python -m pip install --upgrade pip
+          python -m pip install uv
+          uv --version
+
       - name: Prepare dataset
         if: ${{ steps.detect.outputs.router != '' }}
         working-directory: base
@@ -191,7 +209,7 @@ jobs:
           # Prepare dataset from public repository
           # Uses base repo's script (safe - not from PR)
           echo "Preparing dataset..."
-          mkdir -p "${{ github.workspace }}/dataset"
+          mkdir -p dataset
           uv run python scripts/process_datasets/prep_datasets.py
 
       - name: Copy PR prediction file to base workspace
@@ -221,17 +239,16 @@ jobs:
         id: evaluate
         working-directory: base
         env:
-          ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/dataset
+          ROUTERARENA_DATASET_DIR: ${{ github.workspace }}/base/dataset
         run: |
-          set -euo pipefail; trap 'cat evaluation_output.txt' EXIT
+          set -euo pipefail
           # Uses base repo's evaluation script (safe - not from PR)
           BASE_SHA="${{ steps.pr.outputs.base_sha }}"
           uv run python automation/process_pr_submission.py \
             --pr "${{ steps.pr.outputs.number }}" \
             --router "${{ steps.detect.outputs.router }}" \
             --split "${{ steps.detect.outputs.split }}" \
-            --base-ref "$BASE_SHA" > evaluation_output.txt 2>&1
-          cat evaluation_output.txt
+            --base-ref "$BASE_SHA" 2>&1 | tee evaluation_output.txt
 
       - name: Post evaluation results as PR comment
         if: ${{ steps.detect.outputs.router != '' && steps.evaluate.outcome == 'success' }}
diff --git a/automation/process_pr_submission.py b/automation/process_pr_submission.py
@@ -454,23 +454,29 @@ def main(argv: Optional[list[str]] = None) -> int:
             )
 
         if not args.skip_sync:
-            run_command(["uv", "sync", "--locked"], cwd=worktree_path, capture=True)
+            print("▶ Syncing dependencies with uv...", flush=True)
+            run_command(["uv", "sync", "--locked"], cwd=worktree_path, capture=False)
+            print("✔ Synced dependencies")
 
         validation_cmd = [
             "uv",
             "run",
-            "--activepython",
+            "--active",
             "router_inference/check_config_prediction_files.py",
             args.router,
             args.split,
             "--check-generated-result",
         ]
-        validation_result = run_command(validation_cmd, cwd=worktree_path, capture=True)
+        print("▶ Validating prediction/config files...", flush=True)
+        validation_result = run_command(
+            validation_cmd, cwd=worktree_path, capture=False
+        )
+        print("✔ Validated prediction files")
 
         evaluation_cmd = [
             "uv",
             "run",
-            "--activepython",
+            "--active",
             "llm_evaluation/run.py",
             args.router,
             args.split,
@@ -479,9 +485,11 @@ def main(argv: Optional[list[str]] = None) -> int:
 
         evaluation_logs = ""
         try:
+            print("▶ Running evaluation...", flush=True)
             evaluation_result = run_command(
-                evaluation_cmd, cwd=worktree_path, capture=True
+                evaluation_cmd, cwd=worktree_path, capture=False
             )
+            print("✔ Evaluated predictions")
             evaluation_logs = (evaluation_result.stdout or "") + (
                 evaluation_result.stderr or ""
             )