Skip to content

Commit 05da06a

Browse files
Fixes absolute paths and hardcoded values
1 parent ed7b34d commit 05da06a

File tree

6 files changed

+348
-125
lines changed

6 files changed

+348
-125
lines changed

experiments/eval/run.py

Lines changed: 13 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -139,13 +139,11 @@ def create_sentinelbench_benchmark(data_dir="SentinelBench", name="SentinelBench
139139
elif args.get('use_full_variants'):
140140
task_variants = SENTINELBENCH_TASK_VARIANTS
141141

142-
# Handle custom base URL if provided
143-
base_website_path = "http://10.255.255.254:5173/" # Default
144-
if args.get('sentinelbench_url'):
145-
base_website_path = args['sentinelbench_url']
146-
# Ensure URL ends with slash
147-
if not base_website_path.endswith('/'):
148-
base_website_path += '/'
142+
# Use provided SentinelBench URL (has default value)
143+
base_website_path = args['sentinelbench_url']
144+
# Ensure URL ends with slash
145+
if not base_website_path.endswith('/'):
146+
base_website_path += '/'
149147

150148
benchmark = SentinelBenchBenchmark(
151149
data_dir=data_dir,
@@ -277,7 +275,7 @@ def main(
277275
difficulty: Annotated[Optional[str], typer.Option(help="⚡ Filter tasks by difficulty level or multiple levels separated by commas (e.g., 'easy,medium')", rich_help_panel="🛡️ SentinelBench Options")] = None,
278276
use_test_variants: Annotated[bool, typer.Option(help="🧪 Use test variants for SentinelBench (smaller set)", rich_help_panel="🛡️ SentinelBench Options")] = False,
279277
use_full_variants: Annotated[bool, typer.Option(help="🎛️ Use full variants for SentinelBench (all combinations)", rich_help_panel="🛡️ SentinelBench Options")] = False,
280-
sentinelbench_url: Annotated[Optional[str], typer.Option(help="🌐 Override SentinelBench base URL (default: http://10.255.255.254:5173/)", rich_help_panel="🛡️ SentinelBench Options")] = None,
278+
sentinelbench_url: Annotated[str, typer.Option(help="🌐 SentinelBench base URL", rich_help_panel="🛡️ SentinelBench Options")] = "https://sentinel-bench.vercel.app/",
281279

282280
# Evaluation Options
283281
redo_eval: Annotated[bool, typer.Option(help="🔄 Redo evaluation even if results exist", rich_help_panel="📊 Evaluation Options")] = False,
@@ -375,13 +373,13 @@ def main(
375373
for info in filter_info:
376374
typer.echo(f" • {info}", color=True)
377375

378-
# Display custom URL info if provided
379-
if args.get("sentinelbench_url"):
380-
custom_url = args["sentinelbench_url"]
381-
if not custom_url.endswith('/'):
382-
custom_url += '/'
383-
typer.echo("🌐 Custom SentinelBench URL:", color=True)
384-
typer.echo(f" • URL: [cyan]{custom_url}[/cyan]", color=True)
376+
# Display SentinelBench URL info
377+
if dataset == "SentinelBench":
378+
sentinelbench_url = str(args["sentinelbench_url"])
379+
if not sentinelbench_url.endswith('/'):
380+
sentinelbench_url += '/'
381+
typer.echo("🌐 SentinelBench URL:", color=True)
382+
typer.echo(f" • URL: [cyan]{sentinelbench_url}[/cyan]", color=True)
385383

386384
# Save experiment args
387385
save_experiment_args(args, system_name)
Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,5 @@
1-
SentinelBench: A benchmark for evaluating AI agents on monitoring and long-term observation tasks.
2-
31
This benchmark focuses on testing AI agents' capabilities in persistent monitoring, state change detection, and task completion under varying complexity and noise levels.
42

5-
The benchmark includes 18 interactive web-based tasks designed around monitoring scenarios, from simple button pressing to complex social media monitoring.
6-
7-
## Task Characterization
8-
9-
Each task includes several dimensions for analysis:
10-
- **difficulty**: easy, medium, hard
11-
- **base_task**: underlying task type (e.g., reactor, animal-mover, button-presser)
12-
- **duration**: Short, Medium, Long
13-
- **criteria**: Objective, Subjective, Mixed
14-
- **activity**: Active (requires user interaction), Passive (monitoring/waiting)
15-
- **noise**: Low, Medium, High
16-
- **realism**: Playful, Realistic
17-
183
## Usage
194

205
To run SentinelBench evaluations:
@@ -23,6 +8,12 @@ To run SentinelBench evaluations:
238
python experiments/eval/run.py --current-dir . --dataset SentinelBench --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run
249
```
2510

11+
**Note**: The above command uses the default SentinelBench URL (`https://sentinel-bench.vercel.app/`). If you're hosting SentinelBench locally or at a different URL, specify it with `--sentinelbench-url`:
12+
13+
```bash
14+
python experiments/eval/run.py --current-dir . --dataset SentinelBench --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run --sentinelbench-url http://YOUR_HOST_IP:5173/
15+
```
16+
2617
### Task Filtering
2718

2819
SentinelBench supports filtering tasks to run specific subsets:
@@ -48,11 +39,35 @@ python experiments/eval/run.py --current-dir . --dataset SentinelBench --split t
4839
python experiments/eval/run.py --current-dir . --dataset SentinelBench --split test --run-id 1 --simulated-user-type none --parallel 1 --config experiments/endpoint_configs/config.yaml --mode run --base-task animal-mover --difficulty medium
4940
```
5041

51-
## Local Hosting
42+
*For all examples above, add `--sentinelbench-url http://YOUR_HOST_IP:5173/` if using a custom URL instead of the default.*
43+
44+
## URL Configuration
5245

53-
SentinelBench is designed to be hosted locally during development and testing. The default configuration expects the benchmark website to be running at `http://172.25.159.193:5173/`.
46+
SentinelBench evaluations use `https://sentinel-bench.vercel.app/` by default. For local development or custom deployments, you can override this URL.
5447

48+
### Using Default (Production) URL
49+
No additional configuration is needed; just run the commands as shown above.
50+
51+
### Using Custom/Local URL
52+
Add `--sentinelbench-url` parameter to specify your custom URL:
53+
54+
```bash
55+
python experiments/eval/run.py --dataset SentinelBench --sentinelbench-url http://localhost:5173/ [other options]
56+
```
57+
58+
### Local Hosting Setup
5559
To host SentinelBench locally:
56-
1. Clone the SentinelBench repository
57-
2. Install dependencies and start the development server with: `npm run dev -- --host 0.0.0.0`
58-
3. Ensure it's accessible at the expected URL (http://172.25.159.193:5173/)
60+
1. Clone the MagenticUI repository
61+
2. Navigate to the SentinelBench/ directory
62+
3. Install dependencies and start the development server with: `npm run dev -- --host 0.0.0.0`
63+
4. Note the IP address and port where the server is running (typically shown in the terminal output)
64+
5. Use this URL with the `--sentinelbench-url` parameter
65+
66+
**Common local URLs:**
67+
- Local development: `http://localhost:5173/` or `http://127.0.0.1:5173/`
68+
- Network accessible: `http://YOUR_MACHINE_IP:5173/` (replace YOUR_MACHINE_IP with your actual IP)
69+
- Docker/VM: Check your container/VM's IP address
70+
71+
## Running Analysis
72+
73+
We provide all of the scripts needed to run the analysis in the tools/ subdirectory. That subdirectory also contains a README.md file that explains the order in which the tools should be run and how to make the best use of them.

src/magentic_ui/eval/benchmarks/sentinelbench/sentinelbench.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def __init__(
2626
self,
2727
name: str = "SentinelBench",
2828
data_dir: Union[str, None] = None,
29-
base_website_path: str = "http://10.255.255.254:5173/",
29+
base_website_path: str = "https://sentinel-bench.vercel.app/",
3030
task_variants: Optional[Dict[str, List[Union[int, float]]]] = None,
3131
):
3232
"""
@@ -36,7 +36,7 @@ def __init__(
3636
name: Name of the benchmark
3737
data_dir: Directory containing the benchmark data
3838
base_website_path: The base path of the website to use for the SentinelBench.
39-
Make sure it ends with a slash. Default is http://10.255.255.254:5173/ for local testing.
39+
Make sure it ends with a slash. Defaults to https://sentinel-bench.vercel.app/
4040
task_variants: Dict of task_id -> list of parameter values.
4141
E.g., {"reactor-easy": [60, 120, 3600]} for different duration values
4242
"""
@@ -49,15 +49,12 @@ def __init__(
4949
self.task_variants = task_variants or {}
5050
self.default_params = SENTINELBENCH_DEFAULT_PARAMS
5151

52-
logging_msg = (
52+
logging.info(
5353
f"[SentinelBench] Using base website path: {self.base_website_path}"
5454
)
55-
if self.base_website_path == "http://10.255.255.254:5173/":
56-
logging_msg += """
57-
SentinelBench is currently configured for local testing at 10.255.255.254:5173.
58-
Make sure you have the SentinelBench website running locally with 'npm run dev -- --host 0.0.0.0' before executing evaluations.
59-
"""
60-
logging.info(logging_msg)
55+
logging.info(
56+
"[SentinelBench] Make sure the SentinelBench website is running and accessible at the provided URL before executing evaluations."
57+
)
6158

6259
def download_dataset(self) -> None:
6360
"""

0 commit comments

Comments
 (0)