
Commit 9afc0a8

update evals (#1139)
# why

Make it easier to parse/filter/group evals

# what changed

Evals tagged with more granular metadata and error parsing

# test plan

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent f426ba5 commit 9afc0a8

31 files changed: +497 −951 lines
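Per the commit message above, evals now carry more granular metadata and parsed errors so runs can be filtered and grouped. A minimal sketch of what that kind of tagging could look like; the `EvalMetadata` shape and `categorizeEvalError` helper below are illustrative assumptions, not the types actually introduced by this commit:

```typescript
// Hypothetical per-eval metadata; field names are assumptions chosen to
// illustrate "more granular metadata", not this repo's actual types.
interface EvalMetadata {
  name: string;                      // e.g. "agent/webvoyager"
  category: string;                  // e.g. "act", "extract", "agent"
  dataset?: string;                  // e.g. "gaia", "webvoyager"
  env: "LOCAL" | "BROWSERBASE";
  trials: number;
}

// Hypothetical error classifier: maps a raw thrown error onto a coarse
// bucket so failures can be grouped and filtered in eval reports.
function categorizeEvalError(err: unknown): "llm_api_key" | "timeout" | "unknown" {
  const message = err instanceof Error ? err.message : String(err);
  if (
    message.includes("No LLM API key or LLM Client configured") ||
    message.includes("API key is missing")
  ) {
    return "llm_api_key";
  }
  if (/timed? ?out/i.test(message)) return "timeout";
  return "unknown";
}
```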

evals/args.ts

Lines changed: 46 additions & 152 deletions
@@ -103,195 +103,89 @@ function buildUsage(detailed = false): string {
 `pnpm run evals [key=value]… [category <name>] | name=<evalName>`,
 );

-const examplesSection = `
-${chalk.magenta.underline("Examples")}
-
-${chalk.dim("# Run every evaluation locally with default settings")}
-${chalk.green("pnpm run evals")}
-
-${chalk.dim("# Same as above but in Browserbase with three trials")}
-${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("browserbase")} ${chalk.cyan("trials=")}${chalk.yellow("3")}
-
-${chalk.dim("# Run evals using the Stagehand API")}
-${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("browserbase")} ${chalk.cyan("api=")}${chalk.yellow("true")}
-
-${chalk.dim("# Run evals from only the 'act' category with a max of 4 running at any given time")}
-${chalk.green("pnpm run evals")} ${chalk.cyan("category")} ${chalk.yellow("act")} ${chalk.cyan("concurrency=")}${chalk.yellow("4")}
-
-${chalk.dim("# Execute a specific eval by filename")}
-${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("my_eval_name")}
-`;
-
 const body = dedent`
 ${chalk.magenta.underline("Keys\n")}
 ${chalk.cyan("env".padEnd(12))} ${"target environment".padEnd(24)}
-(default ${chalk.dim("LOCAL")}) [${chalk.yellow("browserbase")}, ${chalk.yellow("local")}]
+(default ${chalk.dim("LOCAL")}) [${chalk.yellow("BROWSERBASE")}, ${chalk.yellow("LOCAL")}] ${chalk.gray("← LOCAL sets api=false")}

 ${chalk.cyan("api".padEnd(12))} ${"use the Stagehand API".padEnd(24)}
-(default ${chalk.dim("false")}) [${chalk.yellow("true")}, ${chalk.yellow("false")}]
+(default ${chalk.dim("false")}) [${chalk.yellow("true")}, ${chalk.yellow("false")}]

-${chalk.cyan("trials".padEnd(12))} ${"number of trials per task".padEnd(24)}
-(default ${chalk.dim("3")})
+${chalk.cyan("trials".padEnd(12))} ${"number of trials".padEnd(24)}
+(default ${chalk.dim("10")})

 ${chalk.cyan("concurrency".padEnd(12))} ${"max parallel sessions".padEnd(24)}
-(default ${chalk.dim("3")})
+(default ${chalk.dim("10")})

 ${chalk.cyan("provider".padEnd(12))} ${"override LLM provider".padEnd(24)}
-(default ${chalk.dim(providerDefault || "varies by model")}) [${chalk.yellow("openai")}, ${chalk.yellow("anthropic")}, ${chalk.yellow("google")}, ${chalk.yellow("together")}, ${chalk.yellow("groq")}, ${chalk.yellow("cerebras")}]
+(default ${chalk.dim(providerDefault)}) [${chalk.yellow("OPENAI")}, ${chalk.yellow("ANTHROPIC")}, ${chalk.yellow("GOOGLE")}, ${chalk.yellow("TOGETHER")}, ${chalk.yellow("GROQ")}, ${chalk.yellow("CEREBRAS")}]

 ${chalk.cyan("max_k".padEnd(12))} ${"max test cases per dataset".padEnd(24)}
 (default ${chalk.dim("25")})

-${chalk.cyan("--dataset".padEnd(12))} ${"filter to specific benchmark".padEnd(24)}
-(optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}, ${chalk.yellow("webbench")}, ${chalk.yellow("osworld")}, ${chalk.yellow("onlineMind2Web")}]
+${chalk.cyan("--dataset".padEnd(12))} ${"filter dataset for benchmarks".padEnd(24)}
+(optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}]


 ${chalk.magenta.underline("Positional filters\n")}
-
-category <category_name>
-
-${chalk.gray("Available categories:")}
-${DEFAULT_EVAL_CATEGORIES.slice(0, 5)
-  .map((c) => chalk.yellow(c))
-  .join(", ")},
-${DEFAULT_EVAL_CATEGORIES.slice(5, 10)
-  .map((c) => chalk.yellow(c))
-  .join(", ")}${DEFAULT_EVAL_CATEGORIES.slice(10).length > 0 ? "," : ""}
-${DEFAULT_EVAL_CATEGORIES.slice(10)
-  .map((c) => chalk.yellow(c))
-  .join(", ")}
-`;
+category <category_name> one of: ${DEFAULT_EVAL_CATEGORIES.map((c) =>
+  chalk.yellow(c),
+).join(", ")}

-if (!detailed)
-  return `${header}\n\n${synopsis}\n\nFor more details: ${chalk.bold(
-    "pnpm run evals -man\n",
-  )}`;
-
-const externalBenchmarksSection = dedent`
-${chalk.magenta.underline("\nExternal Benchmarks\n")}
-
-${chalk.cyan.bold("WebBench")} - 5,607 real-world web automation tasks across 452 live websites
-
-${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/webbench")}
-
-${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=webbench pnpm run evals")}
-
-${chalk.gray("Environment Variables:")}
-
-EVAL_WEBBENCH_LIMIT max tasks to run (default: 25)
-EVAL_WEBBENCH_SAMPLE random sample count before limit
-EVAL_WEBBENCH_DIFFICULTY filter: [${chalk.yellow("easy")}, ${chalk.yellow("hard")}] (254 easy, 61 hard tasks)
-EVAL_WEBBENCH_CATEGORY filter: [${chalk.yellow("READ")}, ${chalk.yellow("CREATE")}, ${chalk.yellow("UPDATE")}, ${chalk.yellow("DELETE")}, ${chalk.yellow("FILE_MANIPULATION")}]
-EVAL_WEBBENCH_USE_HITL use only HITL dataset with difficulty ratings (true/false)
-
-${chalk.dim("Examples:")}
-
-${chalk.green("EVAL_WEBBENCH_DIFFICULTY=easy EVAL_WEBBENCH_LIMIT=10 pnpm run evals name=agent/webbench")}
-
-${chalk.green("EVAL_DATASET=webbench EVAL_WEBBENCH_CATEGORY=READ pnpm run evals")}
-
-
-${chalk.cyan.bold("GAIA")} - General AI Assistant benchmark for complex reasoning
-
-${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/gaia")}
-
-${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=gaia pnpm run evals")}
-
-${chalk.gray("Environment Variables:")}
-
-EVAL_GAIA_LIMIT max tasks to run (default: 25)
-EVAL_GAIA_SAMPLE random sample count before limit
-EVAL_GAIA_LEVEL filter by difficulty level [${chalk.yellow("1")}, ${chalk.yellow("2")}, ${chalk.yellow("3")}]
+${chalk.magenta.underline("\nExamples")}

-${chalk.dim("Example:")}
-
-${chalk.green("EVAL_GAIA_LEVEL=1 EVAL_GAIA_LIMIT=10 pnpm run evals name=agent/gaia")}
-
-
-${chalk.cyan.bold("WebVoyager")} - Web navigation and task completion benchmark
-
-${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/webvoyager")}
-
-${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=webvoyager pnpm run evals")}
-
-${chalk.gray("Environment Variables:")}
-
-EVAL_WEBVOYAGER_LIMIT max tasks to run (default: 25)
-EVAL_WEBVOYAGER_SAMPLE random sample count before limit
-
-${chalk.gray("Ground Truth Evaluation:")}
-
-WebVoyager uses ground truth answers for improved accuracy:
-• Checks agent's "Final Answer:" against reference answers
-• Supports golden (ideal) and possible (acceptable) answers
-• Falls back to screenshot evaluation when uncertain
-• Reference data: evals/datasets/webvoyager/reference-answers.json
-
-${chalk.dim("Example:")}
-
-${chalk.green("EVAL_WEBVOYAGER_SAMPLE=50 EVAL_WEBVOYAGER_LIMIT=10 pnpm run evals name=agent/webvoyager")}
-
-
-${chalk.cyan.bold("OSWorld")} - Chrome browser automation tasks from the OSWorld benchmark
-
-${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/osworld")}
-
-${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=osworld pnpm run evals")}
-
-${chalk.gray("Environment Variables:")}
+${chalk.dim("# Run every evaluation locally with default settings")}

-EVAL_OSWORLD_LIMIT max tasks to run (default: 25)
-EVAL_OSWORLD_SAMPLE random sample count before limit
-EVAL_OSWORLD_SOURCE filter by source: [${chalk.yellow("Mind2Web")}, ${chalk.yellow("test_task_1")}, ...]
-EVAL_OSWORLD_EVALUATION_TYPE filter by eval type: [${chalk.yellow("url_match")}, ${chalk.yellow("string_match")}, ${chalk.yellow("dom_state")}, ${chalk.yellow("custom")}]
-EVAL_OSWORLD_TIMEOUT timeout per task in milliseconds (default: 60000)
+${chalk.green("pnpm run evals")}

-${chalk.dim("Examples:")}

-${chalk.green("EVAL_OSWORLD_SOURCE=Mind2Web EVAL_OSWORLD_LIMIT=10 pnpm run evals name=agent/osworld")}
+${chalk.dim("# Same as above but in Browserbase with three trials")}

-${chalk.green("EVAL_DATASET=osworld EVAL_OSWORLD_EVALUATION_TYPE=url_match pnpm run evals")}
-
-
-${chalk.cyan.bold("Mind2Web")} - Real-world web interaction tasks for evaluating web agents
-
-${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/onlineMind2Web")}
+${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("BROWSERBASE")} ${chalk.cyan(
+  "trials=",
+)}${chalk.yellow("3")}

-${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=onlineMind2Web pnpm run evals")}

-${chalk.gray("Environment Variables:")}
+${chalk.dim("# Run evals using the Stagehand API")}

-EVAL_ONLINEMIND2WEB_LIMIT max tasks to run (default: 25)
-EVAL_ONLINEMIND2WEB_SAMPLE random sample count before limit
+${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("BROWSERBASE")} ${chalk.cyan(
+  "api=",
+)}${chalk.yellow("true")}
+
+
+${chalk.dim(
+  "# Run evals from only the 'act' category with a max of 4 running at any given time",
+)}

-${chalk.dim("Example:")}
+${chalk.green("pnpm run evals")} ${chalk.cyan("category")} ${chalk.yellow("act")} ${chalk.cyan(
+  "concurrency=",
+)}${chalk.yellow("4")}
+
+
+${chalk.dim("# Execute a specific eval by filename")}

-${chalk.green("EVAL_ONLINEMIND2WEB_SAMPLE=50 EVAL_ONLINEMIND2WEB_LIMIT=10 pnpm run evals name=agent/onlineMind2Web")}
+${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("my_eval_name")}
 `;

+if (!detailed)
+  return `${header}\n\n${synopsis}\n\nFor more details: ${chalk.bold(
+    "pnpm run evals -man\n",
+  )}`;
+
 const envSection = dedent`
-${chalk.magenta.underline("\nGlobal Environment Variables\n")}
-
-EVAL_ENV target environment, overridable via ${chalk.cyan("env=")}
-
-EVAL_TRIAL_COUNT number of trials, overridable via ${chalk.cyan("trials=")}
-
-EVAL_MAX_CONCURRENCY parallel sessions, overridable via ${chalk.cyan("concurrency=")}
-
-EVAL_PROVIDER LLM provider, overridable via ${chalk.cyan("provider=")}
-
-EVAL_MAX_K global limit for all benchmarks (overrides individual limits)
+${chalk.magenta.underline("\nEnvironment variables\n")}
+EVAL_ENV overridable via ${chalk.cyan("env=")}

-EVAL_DATASET filter to specific benchmark, overridable via ${chalk.cyan("--dataset=")}
+EVAL_TRIAL_COUNT overridable via ${chalk.cyan("trials=")}

-USE_API use Stagehand API, overridable via ${chalk.cyan("api=")}
+EVAL_MAX_CONCURRENCY overridable via ${chalk.cyan("concurrency=")}

-EVAL_MODELS comma-separated list of models to use
+EVAL_PROVIDER overridable via ${chalk.cyan("provider=")}

-AGENT_EVAL_MAX_STEPS max steps for agent tasks (default: 50)
+USE_API overridable via ${chalk.cyan("api=true")}
 `;

-return `${header}\n\n${synopsis}\n\n${body}\n${examplesSection}\n${externalBenchmarksSection}\n${envSection}\n`;
+return `${header}\n\n${synopsis}\n\n${body}\n${envSection}\n`;
 }

 const wantsHelp = rawArgs.some((a) => HELP_REGEX.test(a));
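The usage text above documents `key=value` tokens (env, api, trials, concurrency, provider, name) plus positional filters. As a rough sketch of how those tokens could be folded into an options object, with defaults taken from the new help text; the `EvalOptions` shape and `parseEvalArgs` helper are hypothetical, not this repo's actual parser:

```typescript
// Hypothetical option bag for the key=value flags shown in the usage text.
interface EvalOptions {
  env: "LOCAL" | "BROWSERBASE";
  api: boolean;
  trials: number;
  concurrency: number;
  provider?: string;
  name?: string;
}

// Fold key=value tokens into options; positional filters (e.g. "category act")
// would be handled elsewhere. Defaults mirror the help text (trials 10, concurrency 10).
function parseEvalArgs(argv: string[]): EvalOptions {
  const opts: EvalOptions = { env: "LOCAL", api: false, trials: 10, concurrency: 10 };
  for (const token of argv) {
    const [key, value] = token.split("=");
    if (value === undefined) continue; // not a key=value token
    switch (key) {
      case "env":
        opts.env = value.toUpperCase() === "BROWSERBASE" ? "BROWSERBASE" : "LOCAL";
        break;
      case "api":
        opts.api = value === "true";
        break;
      case "trials":
        opts.trials = Number(value);
        break;
      case "concurrency":
        opts.concurrency = Number(value);
        break;
      case "provider":
        opts.provider = value;
        break;
      case "name":
        opts.name = value;
        break;
    }
  }
  return opts;
}

// e.g. parseEvalArgs(["env=BROWSERBASE", "trials=3"]) →
// { env: "BROWSERBASE", api: false, trials: 3, concurrency: 10 }
```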

evals/cli.ts

Lines changed: 1 addition & 0 deletions
@@ -381,6 +381,7 @@ function handleRun(args: string[]): void {
   webbench: "agent/webbench",
   gaia: "agent/gaia",
   webvoyager: "agent/webvoyager",
+  osworld: "agent/osworld",
   onlineMind2Web: "agent/onlineMind2Web",
 };

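This hunk adds an `osworld` entry to the dataset-to-eval-name map inside `handleRun`. A hedged sketch of how such a map can resolve a dataset flag (e.g. `EVAL_DATASET` or `--dataset`) into the equivalent `name=` filter; `resolveDatasetEval` is a hypothetical helper, not code from this commit:

```typescript
// Dataset aliases mapped to the eval that runs them, mirroring the hunk above.
const DATASET_TO_EVAL: Record<string, string> = {
  webbench: "agent/webbench",
  gaia: "agent/gaia",
  webvoyager: "agent/webvoyager",
  osworld: "agent/osworld",
  onlineMind2Web: "agent/onlineMind2Web",
};

// Hypothetical resolver: turns a dataset flag into the eval name to run.
function resolveDatasetEval(dataset: string | undefined): string | undefined {
  if (!dataset) return undefined;
  const evalName = DATASET_TO_EVAL[dataset];
  if (!evalName) {
    throw new Error(
      `Unknown dataset "${dataset}". Expected one of: ${Object.keys(DATASET_TO_EVAL).join(", ")}`,
    );
  }
  return evalName; // e.g. "osworld" → "agent/osworld", equivalent to name=agent/osworld
}
```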

evals/deterministic/tests/Errors/apiKeyError.test.ts

Lines changed: 18 additions & 6 deletions
@@ -25,8 +25,12 @@ test.describe("API key/LLMClient error", () => {
 
 expect(errorThrown).toBeInstanceOf(Error);
 expect(
-  errorThrown?.message?.includes("No LLM API key or LLM Client configured") ||
-  errorThrown?.message?.includes("API key is missing. Pass it using the 'apiKey' parameter")
+  errorThrown?.message?.includes(
+    "No LLM API key or LLM Client configured",
+  ) ||
+    errorThrown?.message?.includes(
+      "API key is missing. Pass it using the 'apiKey' parameter",
+    ),
 ).toBe(true);
 
 await stagehand.close();
@@ -49,8 +53,12 @@ test.describe("API key/LLMClient error", () => {
 
 expect(errorThrown).toBeInstanceOf(Error);
 expect(
-  errorThrown?.message?.includes("No LLM API key or LLM Client configured") ||
-  errorThrown?.message?.includes("API key is missing. Pass it using the 'apiKey' parameter")
+  errorThrown?.message?.includes(
+    "No LLM API key or LLM Client configured",
+  ) ||
+    errorThrown?.message?.includes(
+      "API key is missing. Pass it using the 'apiKey' parameter",
+    ),
 ).toBe(true);
 
 await stagehand.close();
@@ -71,8 +79,12 @@ test.describe("API key/LLMClient error", () => {
 
 expect(errorThrown).toBeInstanceOf(Error);
 expect(
-  errorThrown?.message?.includes("No LLM API key or LLM Client configured") ||
-  errorThrown?.message?.includes("API key is missing. Pass it using the 'apiKey' parameter")
+  errorThrown?.message?.includes(
+    "No LLM API key or LLM Client configured",
+  ) ||
+    errorThrown?.message?.includes(
+      "API key is missing. Pass it using the 'apiKey' parameter",
+    ),
 ).toBe(true);
 
 await stagehand.close();
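All three hunks apply the same reformatting to an assertion that accepts either missing-key error message. If that check keeps growing, a small shared helper could centralize it; the sketch below is a hypothetical refactor, not part of this diff:

```typescript
import { expect } from "@playwright/test";

// Accepted error messages for a missing LLM API key, taken from the assertions above.
const MISSING_KEY_MESSAGES = [
  "No LLM API key or LLM Client configured",
  "API key is missing. Pass it using the 'apiKey' parameter",
];

// Asserts that the thrown error is an Error whose message matches one of the
// accepted missing-key messages.
function expectMissingApiKeyError(errorThrown: Error | undefined): void {
  expect(errorThrown).toBeInstanceOf(Error);
  expect(
    MISSING_KEY_MESSAGES.some((msg) => errorThrown?.message?.includes(msg)),
  ).toBe(true);
}
```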

0 commit comments
