microsoft · matheusmaldaner · Aug 3, 2025 · Aug 4, 2025 · Aug 4, 2025 · Aug 5, 2025
diff --git a/.gitignore b/.gitignore
@@ -170,6 +170,7 @@ debug
 
 data
 runs
+plots
 
 node_modules
 
@@ -211,3 +212,13 @@ venv.bak/
 # Task centric memory related db and logs
 **/memory_bank/
 **/pagelogs/
+
+# SentinelBench
+SentinelBench/sentinelbench/node_modules/
+SentinelBench/sentinelbench/dist/
+SentinelBench/sentinelbench/.env
+SentinelBench/sentinelbench/.env.local
+SentinelBench/sentinelbench/.env.production
+SentinelBench/sentinelbench/.env.development
+SentinelBench/sentinelbench/.wrangler/
+SentinelBench/sentinelbench/coverage/
diff --git a/README.md b/README.md
@@ -128,7 +128,6 @@ To reproduce these experimental results, please see the following [instructions]
 
 If you're interested in reading more checkout our [technical report](https://www.microsoft.com/en-us/research/wp-content/uploads/2025/07/magentic-ui-report.pdf) and [blog post](https://www.microsoft.com/en-us/research/blog/magentic-ui-an-experimental-human-centered-web-agent/).
 
-
 ## 🛠️ Installation
 ### Pre-Requisites
 

diff --git a/SentinelBench/sentinelbench/.env.example b/SentinelBench/sentinelbench/.env.example
@@ -0,0 +1,5 @@
+# 1. make a copy of this file and rename it to .env
+# 2. populate the variables below with your credentials
+
+# Cloudflare D1 Database
+CLOUDFLARE_D1_DATABASE_ID=your-database-id-here
diff --git a/SentinelBench/sentinelbench/README.md b/SentinelBench/sentinelbench/README.md
@@ -0,0 +1,13 @@
+# SentinelBench
+
+This is a collection of challenges for testing AI agents on long-running, persistent monitoring and conditional tasks.
+
+They're designed to be:
+
+- focused on sustained engagement and monitoring capabilities
+- testing patience, persistence, and continuous observation  
+- requiring agents to maintain state across sessions
+- challenging for AI agents due to time and attention requirements
+- easy to evaluate
+  - each task provides a unique password on successful completion as well as the time it took the agent to complete.
+  - passwords follow an ANSWER_TIMEXXX format, which allows them to be easily evaluated for both accuracy and latency
diff --git a/SentinelBench/sentinelbench/db/schema.sql b/SentinelBench/sentinelbench/db/schema.sql
@@ -0,0 +1,26 @@
+CREATE TABLE IF NOT EXISTS completions ( -- one row per INSERT issued by functions/api/record-completion.ts
+    id INTEGER PRIMARY KEY AUTOINCREMENT, -- surrogate key; never supplied by the handler
+    task_id TEXT NOT NULL, -- taskId from the request body
+    start_time DATETIME NOT NULL, -- startTime from the request body
+    completion_time DATETIME NOT NULL, -- completionTime from the request body
+    user_agent TEXT, -- User-Agent request header ("" when the header is absent)
+    ip_address TEXT, -- CF-Connecting-IP request header ("" when the header is absent)
+    user_id TEXT, -- userId from the request body ("" when falsy)
+    host TEXT, -- host from the request body
+    url TEXT -- url from the request body
+);
+
+CREATE INDEX IF NOT EXISTS idx_task_id ON completions(task_id); -- index over completions.task_id
+
+CREATE TABLE IF NOT EXISTS views ( -- one row per INSERT issued by functions/api/record-view.ts
+    id INTEGER PRIMARY KEY AUTOINCREMENT, -- surrogate key; never supplied by the handler
+    task_id TEXT NOT NULL, -- taskId from the request body
+    view_time DATETIME NOT NULL, -- viewTime from the request body
+    user_agent TEXT, -- User-Agent request header ("" when the header is absent)
+    ip_address TEXT, -- CF-Connecting-IP request header ("" when the header is absent)
+    user_id TEXT, -- userId from the request body ("" when falsy)
+    host TEXT, -- host from the request body
+    url TEXT -- url from the request body
+);
+
+CREATE INDEX IF NOT EXISTS idx_views_task_id ON views(task_id); -- index over views.task_id
diff --git a/SentinelBench/sentinelbench/eslint.config.js b/SentinelBench/sentinelbench/eslint.config.js
@@ -0,0 +1,28 @@
+import js from '@eslint/js'
+import globals from 'globals'
+import reactHooks from 'eslint-plugin-react-hooks'
+import reactRefresh from 'eslint-plugin-react-refresh'
+import tseslint from 'typescript-eslint'
+
+export default tseslint.config( // flat ESLint config exported as the module default
+  { ignores: ['dist'] }, // the dist directory is excluded from linting
+  {
+    extends: [js.configs.recommended, ...tseslint.configs.recommended], // base JS rules plus typescript-eslint's recommended set
+    files: ['**/*.{ts,tsx}'], // this entry applies only to .ts and .tsx files
+    languageOptions: {
+      ecmaVersion: 2020,
+      globals: globals.browser, // browser globals from the `globals` package
+    },
+    plugins: {
+      'react-hooks': reactHooks,
+      'react-refresh': reactRefresh,
+    },
+    rules: {
+      ...reactHooks.configs.recommended.rules, // adopt the react-hooks plugin's recommended rules
+      'react-refresh/only-export-components': [
+        'warn', // reported as a warning, not an error
+        { allowConstantExport: true },
+      ],
+    },
+  },
+)
diff --git a/SentinelBench/sentinelbench/functions/api/record-completion.ts b/SentinelBench/sentinelbench/functions/api/record-completion.ts
@@ -0,0 +1,56 @@
+import { D1Database } from "@cloudflare/workers-types";
+
+interface Env { // shape of context.env as used by the handler in this file
+  DB: D1Database; // D1 database handle; the handler prepares and runs its INSERT on it
+}
+
+export interface TaskCompletion { // JSON body shape the POST handler in this file reads
+  taskId: string; // bound to completions.task_id
+  completionTime: string; // bound to completions.completion_time
+  userId: string; // bound to completions.user_id; the handler substitutes "" when falsy
+  startTime: string; // bound to completions.start_time
+  host: string; // bound to completions.host
+  url: string; // bound to completions.url
+}
+
+/** Builds a JSON response with the given HTTP status code. */
+const jsonResponse = (body: unknown, status: number): Response =>
+  new Response(JSON.stringify(body), {
+    headers: { "Content-Type": "application/json" },
+    status,
+  });
+
+/** Payload the handler accepts: `userId` may be omitted, every other field is a required string. */
+type ValidTaskCompletion = Omit<TaskCompletion, "userId"> & {
+  userId?: string | null;
+};
+
+/**
+ * Runtime check that an untrusted JSON value carries every field the INSERT binds.
+ * Needed because D1 rejects `undefined` bind values, so a missing field would
+ * otherwise only fail inside the database call.
+ */
+const isTaskCompletion = (value: unknown): value is ValidTaskCompletion => {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const fields = value as Record<string, unknown>;
+  const requiredKeys = ["taskId", "startTime", "completionTime", "host", "url"];
+  const userId = fields.userId;
+  return (
+    requiredKeys.every((key) => typeof fields[key] === "string") &&
+    (userId == null || typeof userId === "string")
+  );
+};
+
+/**
+ * POST handler that records a task completion in the `completions` table.
+ *
+ * @param context - Request plus environment; `env.DB` is the D1 database written to.
+ * @returns A JSON response: 200 `{ success: true }` once the row is inserted,
+ *   400 when the body is not valid JSON or lacks required string fields,
+ *   500 when the insert itself fails.
+ */
+export const onRequestPost = async (context: {
+  request: Request;
+  env: Env;
+}) => {
+  // Parse separately from the insert so a malformed body is reported as a client error.
+  let payload: unknown;
+  try {
+    payload = await context.request.json();
+  } catch {
+    return jsonResponse(
+      { success: false, error: "Request body must be valid JSON" },
+      400
+    );
+  }
+
+  if (!isTaskCompletion(payload)) {
+    return jsonResponse(
+      { success: false, error: "Missing or invalid task completion fields" },
+      400
+    );
+  }
+
+  try {
+    const { headers } = context.request;
+
+    await context.env.DB.prepare(
+      `INSERT INTO completions (task_id, start_time, completion_time, user_agent, ip_address, user_id, host, url)
+       VALUES (?, ?, ?, ?, ?, ?, ?, ?)`
+    )
+      .bind(
+        payload.taskId,
+        payload.startTime,
+        payload.completionTime,
+        headers.get("User-Agent") || "",
+        headers.get("CF-Connecting-IP") || "",
+        payload.userId || "",
+        payload.host,
+        payload.url
+      )
+      .run();
+
+    return jsonResponse({ success: true }, 200);
+  } catch (error) {
+    console.error("Error saving task completion:", error);
+    return jsonResponse(
+      { success: false, error: "Failed to save task completion" },
+      500
+    );
+  }
+};
diff --git a/SentinelBench/sentinelbench/functions/api/record-view.ts b/SentinelBench/sentinelbench/functions/api/record-view.ts
@@ -0,0 +1,54 @@
+import { D1Database } from "@cloudflare/workers-types";
+
+interface Env { // shape of context.env as used by the handler in this file
+  DB: D1Database; // D1 database handle; the handler prepares and runs its INSERT on it
+}
+
+export interface TaskView { // JSON body shape the POST handler in this file reads
+  taskId: string; // bound to views.task_id
+  userId: string; // bound to views.user_id; the handler substitutes "" when falsy
+  viewTime: string; // ISO string; bound to views.view_time
+  host: string; // bound to views.host
+  url: string; // bound to views.url
+}
+
+/** Builds a JSON response with the given HTTP status code. */
+const jsonResponse = (body: unknown, status: number): Response =>
+  new Response(JSON.stringify(body), {
+    headers: { "Content-Type": "application/json" },
+    status,
+  });
+
+/** Payload the handler accepts: `userId` may be omitted, every other field is a required string. */
+type ValidTaskView = Omit<TaskView, "userId"> & {
+  userId?: string | null;
+};
+
+/**
+ * Runtime check that an untrusted JSON value carries every field the INSERT binds.
+ * Needed because D1 rejects `undefined` bind values, so a missing field would
+ * otherwise only fail inside the database call.
+ */
+const isTaskView = (value: unknown): value is ValidTaskView => {
+  if (typeof value !== "object" || value === null) {
+    return false;
+  }
+  const fields = value as Record<string, unknown>;
+  const requiredKeys = ["taskId", "viewTime", "host", "url"];
+  const userId = fields.userId;
+  return (
+    requiredKeys.every((key) => typeof fields[key] === "string") &&
+    (userId == null || typeof userId === "string")
+  );
+};
+
+/**
+ * POST handler that records a task view in the `views` table.
+ *
+ * @param context - Request plus environment; `env.DB` is the D1 database written to.
+ * @returns A JSON response: 200 `{ success: true }` once the row is inserted,
+ *   400 when the body is not valid JSON or lacks required string fields,
+ *   500 when the insert itself fails.
+ */
+export const onRequestPost = async (context: {
+  request: Request;
+  env: Env;
+}) => {
+  // Parse separately from the insert so a malformed body is reported as a client error.
+  let payload: unknown;
+  try {
+    payload = await context.request.json();
+  } catch {
+    return jsonResponse(
+      { success: false, error: "Request body must be valid JSON" },
+      400
+    );
+  }
+
+  if (!isTaskView(payload)) {
+    return jsonResponse(
+      { success: false, error: "Missing or invalid task view fields" },
+      400
+    );
+  }
+
+  try {
+    const { headers } = context.request;
+
+    await context.env.DB.prepare(
+      `INSERT INTO views (task_id, view_time, user_agent, ip_address, user_id, host, url)
+       VALUES (?, ?, ?, ?, ?, ?, ?)`
+    )
+      .bind(
+        payload.taskId,
+        payload.viewTime,
+        headers.get("User-Agent") || "",
+        headers.get("CF-Connecting-IP") || "",
+        payload.userId || "",
+        payload.host,
+        payload.url
+      )
+      .run();
+
+    return jsonResponse({ success: true }, 200);
+  } catch (error) {
+    console.error("Error saving task view:", error);
+    return jsonResponse(
+      { success: false, error: "Failed to save task view" },
+      500
+    );
+  }
+};
Original file line number	Diff line number	Diff line change
Expand Up		@@ -128,7 +128,6 @@ To reproduce these experimental results, please see the following [instructions]

		If you're interested in reading more checkout our [technical report](https://www.microsoft.com/en-us/research/wp-content/uploads/2025/07/magentic-ui-report.pdf) and [blog post](https://www.microsoft.com/en-us/research/blog/magentic-ui-an-experimental-human-centered-web-agent/).


		## 🛠️ Installation
		### Pre-Requisites

Expand Down