Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
47f0bfd
Added browser_local flag to --magentic-cli to enable seeing browser w…
matheusmaldaner Aug 3, 2025
0e573e1
Minor change from using argparse to Typer
matheusmaldaner Aug 4, 2025
53157ee
Added citation to README
matheusmaldaner Aug 4, 2025
19bf0d6
rough UI integration of sentinel steps
matheusmaldaner Aug 5, 2025
21ae496
Updates prompt to ensure URL is passed in the details field
matheusmaldaner Aug 7, 2025
0d65087
Dynamic sleep_duration
matheusmaldaner Aug 12, 2025
f4ae04d
Lock file
matheusmaldaner Aug 13, 2025
2dba654
Poe check
matheusmaldaner Aug 13, 2025
bf85f38
Fixed dataset download
matheusmaldaner Aug 14, 2025
133f93c
Added sentinel-tasks and use-local-browser flags
matheusmaldaner Aug 14, 2025
8fec221
Revert the changes made locally to run.py
matheusmaldaner Aug 14, 2025
b814b9b
Remove current session tab (#318)
cheng-tan Aug 5, 2025
18aa63f
Mcp server list (#319)
cheng-tan Aug 5, 2025
f8285ff
Addresses Issue #137 (#339)
matheusmaldaner Aug 13, 2025
9ecf7f2
Add GitHub Actions workflow for automatic GitHub Pages deployment (#317)
mmurad2 Aug 13, 2025
2f881c1
Revert "Add GitHub Actions workflow for automatic GitHub Pages deploy…
husseinmozannar Aug 14, 2025
f9512c3
Add github pages (#342)
husseinmozannar Aug 14, 2025
d7ffa49
Rough testing changes
matheusmaldaner Aug 15, 2025
f02ff73
Last pushes
matheusmaldaner Aug 15, 2025
a300e45
SentinelBench fixes
matheusmaldaner Aug 17, 2025
2428f8b
Enables file upload tool and print plan to console to validate sentin…
matheusmaldaner Aug 17, 2025
a87680a
Enables task or difficulty specific runs for SentinelBench
matheusmaldaner Aug 18, 2025
7c2d738
Create function to compare results with and without sentinel tasks
matheusmaldaner Aug 18, 2025
36a7de8
Code to save partial results during eval when a task is canceled
matheusmaldaner Aug 18, 2025
0dc805d
Updated pricing
matheusmaldaner Aug 20, 2025
cf10c5f
Data exploration for WebGames
matheusmaldaner Aug 20, 2025
290c52a
Instructions to run
matheusmaldaner Aug 20, 2025
79d05ec
Introduced time limit flag and variants file for sentinel bench eval
matheusmaldaner Aug 21, 2025
ea8b68d
Fixed pathing
matheusmaldaner Aug 21, 2025
72d61a0
Fixed critical error in sleep_duration validation
matheusmaldaner Aug 21, 2025
30ba25b
gpt-5-mini price support
matheusmaldaner Aug 21, 2025
7d5e99b
Logging
matheusmaldaner Aug 22, 2025
bb62333
More changes to sentinelbench
matheusmaldaner Aug 22, 2025
6ef1e0e
syncing changes across computers
matheusmaldaner Aug 27, 2025
9163a76
Adds support for comparing two runs
matheusmaldaner Aug 28, 2025
35337a2
Updated task naming
matheusmaldaner Aug 28, 2025
47fa512
Removed duplicate test.jsonl
matheusmaldaner Aug 28, 2025
bd87dc6
Added --pretty-output flag
matheusmaldaner Aug 28, 2025
3315dfa
Support to run multiple tasks at once from command line
matheusmaldaner Aug 29, 2025
e54defc
Improve --help flag display
matheusmaldaner Aug 29, 2025
23f73ef
Hardcoded way of forcing timeout values for specific tasks, will get …
matheusmaldaner Aug 29, 2025
a9822f7
Fix sentinel task prompt
matheusmaldaner Aug 29, 2025
d1178eb
Small prompt fix
matheusmaldaner Aug 29, 2025
83de1d1
Fix final answer extraction when running parallel runs
matheusmaldaner Aug 29, 2025
a858173
Add `--sentinelbench-url` flag to change the base url
matheusmaldaner Aug 29, 2025
aacfcb3
Delete unused files
matheusmaldaner Sep 7, 2025
fd7425e
Poe check
matheusmaldaner Sep 23, 2025
15aef43
Init
matheusmaldaner Sep 23, 2025
a7cc3f7
Remove plots
matheusmaldaner Sep 23, 2025
56eb345
File to test task variants
matheusmaldaner Sep 23, 2025
0685420
Poe check
matheusmaldaner Sep 24, 2025
7834fc9
SentinelBench utility tools
matheusmaldaner Sep 28, 2025
1e48e53
Task variants
matheusmaldaner Sep 28, 2025
2420120
Poe check
matheusmaldaner Sep 28, 2025
3ec1441
Small changes on README and comments
matheusmaldaner Sep 28, 2025
ed7b34d
Small fixes and poe checks
matheusmaldaner Sep 28, 2025
05da06a
Fixes absolute paths and hardcoded values
matheusmaldaner Sep 28, 2025
042d8af
uv.lock
matheusmaldaner Sep 28, 2025
e073ff3
Merge branch 'main' into sentinel/evals
matheusmaldaner Sep 28, 2025
69ea146
All SentinelBench tasks and router configs
matheusmaldaner Sep 28, 2025
40d41f0
SentinelBench dump
matheusmaldaner Sep 28, 2025
bb67372
Ignores SentinelBench env variables and typo fix
matheusmaldaner Sep 30, 2025
1836bc0
Better exception handling + uv.lock file
matheusmaldaner Oct 1, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ debug

data
runs
plots

node_modules

Expand Down Expand Up @@ -211,3 +212,13 @@ venv.bak/
# Task centric memory related db and logs
**/memory_bank/
**/pagelogs/

# SentinelBench
SentinelBench/sentinelbench/node_modules/
SentinelBench/sentinelbench/dist/
SentinelBench/sentinelbench/.env
SentinelBench/sentinelbench/.env.local
SentinelBench/sentinelbench/.env.production
SentinelBench/sentinelbench/.env.development
SentinelBench/sentinelbench/.wrangler/
SentinelBench/sentinelbench/coverage/
1 change: 0 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,6 @@ To reproduce these experimental results, please see the following [instructions]

If you're interested in reading more, check out our [technical report](https://www.microsoft.com/en-us/research/wp-content/uploads/2025/07/magentic-ui-report.pdf) and [blog post](https://www.microsoft.com/en-us/research/blog/magentic-ui-an-experimental-human-centered-web-agent/).


## 🛠️ Installation
### Pre-Requisites

Expand Down
5 changes: 5 additions & 0 deletions SentinelBench/sentinelbench/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# 1. make a copy of this file and rename it to .env
# 2. populate the variables below with your credentials

# Cloudflare D1 Database
CLOUDFLARE_D1_DATABASE_ID=your-database-id-here
13 changes: 13 additions & 0 deletions SentinelBench/sentinelbench/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# SentinelBench

This is a collection of challenges for testing AI agents on long-running, persistent monitoring and conditional tasks.

They're designed to be:

- focused on sustained engagement and monitoring capabilities
- testing patience, persistence, and continuous observation
- requiring agents to maintain state across sessions
- challenging for AI agents due to time and attention requirements
- easy to evaluate
- each task provides a unique password on successful completion as well as the time it took the agent to complete.
- passwords follow an ANSWER_TIMEXXX format, which allows them to be easily evaluated for both accuracy and latency
26 changes: 26 additions & 0 deletions SentinelBench/sentinelbench/db/schema.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- One row per recorded task completion.
-- task_id, start_time and completion_time are mandatory; the remaining
-- columns are nullable request metadata.
CREATE TABLE IF NOT EXISTS completions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
task_id TEXT NOT NULL,
start_time DATETIME NOT NULL,
completion_time DATETIME NOT NULL,
user_agent TEXT,
ip_address TEXT,
user_id TEXT,
host TEXT,
url TEXT
);

-- Index for looking up completions by task.
CREATE INDEX IF NOT EXISTS idx_task_id ON completions(task_id);

-- One row per recorded task view.
-- task_id and view_time are mandatory; the remaining columns are nullable
-- request metadata.
CREATE TABLE IF NOT EXISTS views (
id INTEGER PRIMARY KEY AUTOINCREMENT,
task_id TEXT NOT NULL,
view_time DATETIME NOT NULL,
user_agent TEXT,
ip_address TEXT,
user_id TEXT,
host TEXT,
url TEXT
);

-- Index for looking up views by task.
CREATE INDEX IF NOT EXISTS idx_views_task_id ON views(task_id);
28 changes: 28 additions & 0 deletions SentinelBench/sentinelbench/eslint.config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import js from '@eslint/js'
import globals from 'globals'
import reactHooks from 'eslint-plugin-react-hooks'
import reactRefresh from 'eslint-plugin-react-refresh'
import tseslint from 'typescript-eslint'

// Paths that ESLint should not lint at all (build output).
const ignoredPaths = { ignores: ['dist'] }

// Rule sets inherited by every TypeScript source file.
const inheritedConfigs = [
  js.configs.recommended,
  ...tseslint.configs.recommended,
]

// React-specific rules: the recommended hooks rules plus a warning when a
// module exports something other than components (constant exports allowed).
const reactLintRules = {
  ...reactHooks.configs.recommended.rules,
  'react-refresh/only-export-components': [
    'warn',
    { allowConstantExport: true },
  ],
}

// Configuration applied to all .ts / .tsx files.
const typescriptSources = {
  extends: inheritedConfigs,
  files: ['**/*.{ts,tsx}'],
  languageOptions: {
    ecmaVersion: 2020,
    globals: globals.browser,
  },
  plugins: {
    'react-hooks': reactHooks,
    'react-refresh': reactRefresh,
  },
  rules: reactLintRules,
}

export default tseslint.config(ignoredPaths, typescriptSources)
56 changes: 56 additions & 0 deletions SentinelBench/sentinelbench/functions/api/record-completion.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import { D1Database } from "@cloudflare/workers-types";

/** Environment bindings this function reads from `context.env`. */
interface Env {
/** D1 database handle; the handler uses it to prepare and run the INSERT into `completions`. */
DB: D1Database;
}

/**
 * Shape the handler expects for the JSON body of a record-completion request.
 * Each field is written to the correspondingly named column of `completions`.
 */
export interface TaskCompletion {
/** Written to the `task_id` column. */
taskId: string;
/** Written to the `completion_time` column. NOTE(review): presumably an ISO timestamp — confirm with the client. */
completionTime: string;
/** Written to the `user_id` column; the handler substitutes "" when it is falsy. */
userId: string;
/** Written to the `start_time` column. NOTE(review): presumably an ISO timestamp — confirm with the client. */
startTime: string;
/** Written to the `host` column. */
host: string;
/** Written to the `url` column. */
url: string;
}

/**
 * Handles `POST` requests that record a task completion.
 *
 * Reads the JSON request body, checks that it carries the string fields
 * `taskId`, `startTime`, `completionTime`, `host` and `url`, and inserts one
 * row into the `completions` table together with the request's `User-Agent`
 * and `CF-Connecting-IP` headers.
 *
 * @param context - The incoming request plus the environment bindings
 *   (`env.DB` is the database the row is written to).
 * @returns A JSON response:
 *   - 200 `{ success: true }` when the row was stored;
 *   - 400 `{ success: false, error }` when the body is not valid JSON or a
 *     required field is missing or not a string;
 *   - 500 `{ success: false, error }` when the insert itself fails.
 */
export const onRequestPost = async (context: {
  request: Request;
  env: Env;
}) => {
  const jsonResponse = (
    body: { success: boolean; error?: string },
    status: number
  ): Response =>
    new Response(JSON.stringify(body), {
      headers: { "Content-Type": "application/json" },
      status,
    });

  // The body comes from an untrusted client: parse it as `unknown` and
  // report malformed JSON as a client error instead of a server failure.
  let payload: unknown;
  try {
    payload = await context.request.json();
  } catch (error) {
    console.error("Invalid task completion payload:", error);
    return jsonResponse({ success: false, error: "Invalid JSON body" }, 400);
  }

  if (typeof payload !== "object" || payload === null) {
    return jsonResponse(
      { success: false, error: "Request body must be a JSON object" },
      400
    );
  }

  // Safe after the runtime check above: a non-null object can be indexed.
  const { taskId, startTime, completionTime, userId, host, url } =
    payload as Record<string, unknown>;

  // D1's bind() rejects `undefined`, so a missing field would otherwise
  // surface as a misleading 500 "Failed to save task completion".
  if (
    typeof taskId !== "string" ||
    typeof startTime !== "string" ||
    typeof completionTime !== "string" ||
    typeof host !== "string" ||
    typeof url !== "string"
  ) {
    return jsonResponse(
      {
        success: false,
        error:
          "Missing or invalid fields: taskId, startTime, completionTime, host and url must be strings",
      },
      400
    );
  }

  try {
    await context.env.DB.prepare(
      `INSERT INTO completions (task_id, start_time, completion_time, user_agent, ip_address, user_id, host, url)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)`
    )
      .bind(
        taskId,
        startTime,
        completionTime,
        context.request.headers.get("User-Agent") ?? "",
        context.request.headers.get("CF-Connecting-IP") ?? "",
        // userId is optional: anything that is not a string is stored as "".
        typeof userId === "string" ? userId : "",
        host,
        url
      )
      .run();

    return jsonResponse({ success: true }, 200);
  } catch (error) {
    console.error("Error saving task completion:", error);
    return jsonResponse(
      { success: false, error: "Failed to save task completion" },
      500
    );
  }
};
54 changes: 54 additions & 0 deletions SentinelBench/sentinelbench/functions/api/record-view.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
import { D1Database } from "@cloudflare/workers-types";

/** Environment bindings this function reads from `context.env`. */
interface Env {
/** D1 database handle; the handler uses it to prepare and run the INSERT into `views`. */
DB: D1Database;
}

/**
 * Shape the handler expects for the JSON body of a record-view request.
 * Each field is written to the correspondingly named column of `views`.
 */
export interface TaskView {
/** Written to the `task_id` column. */
taskId: string;
/** Written to the `user_id` column; the handler substitutes "" when it is falsy. */
userId: string;
/** Written to the `view_time` column. */
viewTime: string; // ISO string
/** Written to the `host` column. */
host: string;
/** Written to the `url` column. */
url: string;
}

/**
 * Handles `POST` requests that record a task view.
 *
 * Reads the JSON request body, checks that it carries the string fields
 * `taskId`, `viewTime`, `host` and `url`, and inserts one row into the
 * `views` table together with the request's `User-Agent` and
 * `CF-Connecting-IP` headers.
 *
 * @param context - The incoming request plus the environment bindings
 *   (`env.DB` is the database the row is written to).
 * @returns A JSON response:
 *   - 200 `{ success: true }` when the row was stored;
 *   - 400 `{ success: false, error }` when the body is not valid JSON or a
 *     required field is missing or not a string;
 *   - 500 `{ success: false, error }` when the insert itself fails.
 */
export const onRequestPost = async (context: {
  request: Request;
  env: Env;
}) => {
  const jsonResponse = (
    body: { success: boolean; error?: string },
    status: number
  ): Response =>
    new Response(JSON.stringify(body), {
      headers: { "Content-Type": "application/json" },
      status,
    });

  // The body comes from an untrusted client: parse it as `unknown` and
  // report malformed JSON as a client error instead of a server failure.
  let payload: unknown;
  try {
    payload = await context.request.json();
  } catch (error) {
    console.error("Invalid task view payload:", error);
    return jsonResponse({ success: false, error: "Invalid JSON body" }, 400);
  }

  if (typeof payload !== "object" || payload === null) {
    return jsonResponse(
      { success: false, error: "Request body must be a JSON object" },
      400
    );
  }

  // Safe after the runtime check above: a non-null object can be indexed.
  const { taskId, viewTime, userId, host, url } = payload as Record<
    string,
    unknown
  >;

  // D1's bind() rejects `undefined`, so a missing field would otherwise
  // surface as a misleading 500 "Failed to save task view".
  if (
    typeof taskId !== "string" ||
    typeof viewTime !== "string" ||
    typeof host !== "string" ||
    typeof url !== "string"
  ) {
    return jsonResponse(
      {
        success: false,
        error:
          "Missing or invalid fields: taskId, viewTime, host and url must be strings",
      },
      400
    );
  }

  try {
    await context.env.DB.prepare(
      `INSERT INTO views (task_id, view_time, user_agent, ip_address, user_id, host, url)
VALUES (?, ?, ?, ?, ?, ?, ?)`
    )
      .bind(
        taskId,
        viewTime,
        context.request.headers.get("User-Agent") ?? "",
        context.request.headers.get("CF-Connecting-IP") ?? "",
        // userId is optional: anything that is not a string is stored as "".
        typeof userId === "string" ? userId : "",
        host,
        url
      )
      .run();

    return jsonResponse({ success: true }, 200);
  } catch (error) {
    console.error("Error saving task view:", error);
    return jsonResponse(
      { success: false, error: "Failed to save task view" },
      500
    );
  }
};
Loading