microsoft
diff --git a/‎experiments/eval/run.py‎
Lines changed: 13 additions & 15 deletions b/‎experiments/eval/run.py‎
Lines changed: 13 additions & 15 deletions
diff --git a/‎src/magentic_ui/eval/benchmarks/sentinelbench/README.md‎
Lines changed: 35 additions & 20 deletions b/‎src/magentic_ui/eval/benchmarks/sentinelbench/README.md‎
Lines changed: 35 additions & 20 deletions
diff --git a/‎src/magentic_ui/eval/benchmarks/sentinelbench/sentinelbench.py‎
Lines changed: 6 additions & 9 deletions b/‎src/magentic_ui/eval/benchmarks/sentinelbench/sentinelbench.py‎
Lines changed: 6 additions & 9 deletions
@@ -139,13 +139,11 @@ def create_sentinelbench_benchmark(data_dir="SentinelBench", name="SentinelBench
             elif args.get('use_full_variants'):
                 task_variants = SENTINELBENCH_TASK_VARIANTS
 
-            # Handle custom base URL if provided
-            base_website_path = "http://10.255.255.254:5173/"  # Default
-            if args.get('sentinelbench_url'):
-                base_website_path = args['sentinelbench_url']
-                # Ensure URL ends with slash
-                if not base_website_path.endswith('/'):
-                    base_website_path += '/'
+            # Use provided SentinelBench URL (has default value)
+            base_website_path = args['sentinelbench_url']
+            # Ensure URL ends with slash
+            if not base_website_path.endswith('/'):
+                base_website_path += '/'
 
             benchmark = SentinelBenchBenchmark(
                 data_dir=data_dir,
@@ -277,7 +275,7 @@ def main(
     difficulty: Annotated[Optional[str], typer.Option(help="⚡ Filter tasks by difficulty level or multiple levels separated by commas (e.g., 'easy,medium')", rich_help_panel="🛡️ SentinelBench Options")] = None,
     use_test_variants: Annotated[bool, typer.Option(help="🧪 Use test variants for SentinelBench (smaller set)", rich_help_panel="🛡️ SentinelBench Options")] = False,
     use_full_variants: Annotated[bool, typer.Option(help="🎛️ Use full variants for SentinelBench (all combinations)", rich_help_panel="🛡️ SentinelBench Options")] = False,
-    sentinelbench_url: Annotated[Optional[str], typer.Option(help="🌐 Override SentinelBench base URL (default: http://10.255.255.254:5173/)", rich_help_panel="🛡️ SentinelBench Options")] = None,
+    sentinelbench_url: Annotated[str, typer.Option(help="🌐 SentinelBench base URL", rich_help_panel="🛡️ SentinelBench Options")] = "https://sentinel-bench.vercel.app/",
 
     # Evaluation Options
     redo_eval: Annotated[bool, typer.Option(help="🔄 Redo evaluation even if results exist", rich_help_panel="📊 Evaluation Options")] = False,
@@ -375,13 +373,13 @@ def main(
             for info in filter_info:
                 typer.echo(f"   • {info}", color=True)
 
-        # Display custom URL info if provided
-        if args.get("sentinelbench_url"):
-            custom_url = args["sentinelbench_url"]
-            if not custom_url.endswith('/'):
-                custom_url += '/'
-            typer.echo("🌐 Custom SentinelBench URL:", color=True)
-            typer.echo(f"   • URL: [cyan]{custom_url}[/cyan]", color=True)
+        # Display SentinelBench URL info
+        if dataset == "SentinelBench":
+            sentinelbench_url = str(args["sentinelbench_url"])
+            if not sentinelbench_url.endswith('/'):
+                sentinelbench_url += '/'
+            typer.echo("🌐 SentinelBench URL:", color=True)
+            typer.echo(f"   • URL: [cyan]{sentinelbench_url}[/cyan]", color=True)
 
     # Save experiment args
     save_experiment_args(args, system_name)
 
@@ -1,20 +1,5 @@
-SentinelBench: A benchmark for evaluating AI agents on monitoring and long-term observation tasks.
-
 This benchmark focuses on testing AI agents' capabilities in persistent monitoring, state change detection, and task completion under varying complexity and noise levels.
 
-The benchmark includes 18 interactive web-based tasks designed around monitoring scenarios, from simple button pressing to complex social media monitoring.
-
-## Task Characterization
-
-Each task includes several dimensions for analysis:
-- **difficulty**: easy, medium, hard
-- **base_task**: underlying task type (e.g., reactor, animal-mover, button-presser)
-- **duration**: Short, Medium, Long
-- **criteria**: Objective, Subjective, Mixed
-- **activity**: Active (requires user interaction), Passive (monitoring/waiting)
-- **noise**: Low, Medium, High
-- **realism**: Playful, Realistic
-
 ## Usage
 
 To run SentinelBench evaluations:
@@ -23,6 +8,12 @@ To run SentinelBench evaluations:
 python experiments/eval/run.py --current-dir . --dataset SentinelBench --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
 ```
 
+**Note**: The above command uses the default SentinelBench URL (`https://sentinel-bench.vercel.app/`). If you're hosting SentinelBench locally or at a different URL, specify it with `--sentinelbench-url`:
+
+```bash
+python experiments/eval/run.py --current-dir . --dataset SentinelBench --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run --sentinelbench-url http://YOUR_HOST_IP:5173/
+```
+
 ### Task Filtering
 
 SentinelBench supports filtering tasks to run specific subsets:
@@ -48,11 +39,35 @@ python experiments/eval/run.py --current-dir . --dataset SentinelBench --split t
 python experiments/eval/run.py --current-dir . --dataset SentinelBench --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run --base-task animal-mover --difficulty medium
 ```
 
-## Local Hosting
+*For all examples above, add `--sentinelbench-url http://YOUR_HOST_IP:5173/` if using a custom URL instead of the default.*
+
+## URL Configuration
 
-SentinelBench is designed to be hosted locally during development and testing. The default configuration expects the benchmark website to be running at `http://172.25.159.193:5173/`.
+SentinelBench evaluations use `https://sentinel-bench.vercel.app/` by default. For local development or custom deployments, you can override this URL.
 
+### Using Default (Production) URL
+No additional configuration is needed; just run the commands as shown above.
+
+### Using Custom/Local URL
+Add `--sentinelbench-url` parameter to specify your custom URL:
+
+```bash
+python experiments/eval/run.py --dataset SentinelBench --sentinelbench-url http://localhost:5173/ [other options]
+```
+
+### Local Hosting Setup
 To host SentinelBench locally:
-1. Clone the SentinelBench repository
-2. Install dependencies and start the development server with: `npm run dev -- --host 0.0.0.0`
-3. Ensure it's accessible at the expected URL (http://172.25.159.193:5173/)
+1. Clone the MagenticUI repository  
+2. Navigate to the SentinelBench/ directory
+3. Install dependencies and start the development server with: `npm run dev -- --host 0.0.0.0`
+4. Note the IP address and port where the server is running (typically shown in the terminal output)
+5. Use this URL with the `--sentinelbench-url` parameter
+
+**Common local URLs:**
+- Local development: `http://localhost:5173/` or `http://127.0.0.1:5173/`
+- Network accessible: `http://YOUR_MACHINE_IP:5173/` (replace YOUR_MACHINE_IP with your actual IP)
+- Docker/VM: Check your container/VM's IP address
+
+## Running Analysis
+
+We provide all scripts needed to run the analysis in the tools/ subdirectory. That subdirectory also contains a README.md file explaining the order in which the tools should be run and how to make the best use of them.
@@ -26,7 +26,7 @@ def __init__(
         self,
         name: str = "SentinelBench",
         data_dir: Union[str, None] = None,
-        base_website_path: str = "http://10.255.255.254:5173/",
+        base_website_path: str = "https://sentinel-bench.vercel.app/",
         task_variants: Optional[Dict[str, List[Union[int, float]]]] = None,
     ):
         """
@@ -36,7 +36,7 @@ def __init__(
             name: Name of the benchmark
             data_dir: Directory containing the benchmark data
             base_website_path: The base path of the website to use for the SentinelBench.
-                              Make sure it ends with a slash. Default is http://10.255.255.254:5173/ for local testing.
+                              Make sure it ends with a slash. Defaults to https://sentinel-bench.vercel.app/
             task_variants: Dict of task_id -> list of parameter values.
                           E.g., {"reactor-easy": [60, 120, 3600]} for different duration values
         """
@@ -49,15 +49,12 @@ def __init__(
         self.task_variants = task_variants or {}
         self.default_params = SENTINELBENCH_DEFAULT_PARAMS
 
-        logging_msg = (
+        logging.info(
             f"[SentinelBench] Using base website path: {self.base_website_path}"
         )
-        if self.base_website_path == "http://10.255.255.254:5173/":
-            logging_msg += """
-            SentinelBench is currently configured for local testing at 10.255.255.254:5173.
-            Make sure you have the SentinelBench website running locally with 'npm run dev -- --host 0.0.0.0' before executing evaluations.
-            """
-        logging.info(logging_msg)
+        logging.info(
+            "[SentinelBench] Make sure the SentinelBench website is running and accessible at the provided URL before executing evaluations."
+        )
 
     def download_dataset(self) -> None:
         """