
Commit 9afc0a8

update evals (#1139)
# why

Make it easier to parse/filter/group evals

# what changed

Evals tagged with more granular metadata and error parsing

# test plan

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
1 parent f426ba5 commit 9afc0a8

31 files changed: +497 −951 lines
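Per the commit message above, evals now carry more granular metadata and parsed errors so runs can be filtered and grouped. A minimal sketch of what that kind of tagging could look like; the `EvalMetadata` shape and `categorizeEvalError` helper below are illustrative assumptions, not the types actually introduced by this commit:

```typescript
// Hypothetical per-eval metadata; field names are assumptions chosen to
// illustrate "more granular metadata", not this repo's actual types.
interface EvalMetadata {
  name: string;                      // e.g. "agent/webvoyager"
  category: string;                  // e.g. "act", "extract", "agent"
  dataset?: string;                  // e.g. "gaia", "webvoyager"
  env: "LOCAL" | "BROWSERBASE";
  trials: number;
}

// Hypothetical error classifier: maps a raw thrown error onto a coarse
// bucket so failures can be grouped and filtered in eval reports.
function categorizeEvalError(err: unknown): "llm_api_key" | "timeout" | "unknown" {
  const message = err instanceof Error ? err.message : String(err);
  if (
    message.includes("No LLM API key or LLM Client configured") ||
    message.includes("API key is missing")
  ) {
    return "llm_api_key";
  }
  if (/timed? ?out/i.test(message)) return "timeout";
  return "unknown";
}
```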

evals/args.ts

Lines changed: 46 additions & 152 deletions
@@ -103,195 +103,89 @@ function buildUsage(detailed = false): string {
 `pnpm run evals [key=value]… [category <name>] | name=<evalName>`,
 );

-const examplesSection = `
-${chalk.magenta.underline("Examples")}
-
-${chalk.dim("# Run every evaluation locally with default settings")}
-${chalk.green("pnpm run evals")}
-
-${chalk.dim("# Same as above but in Browserbase with three trials")}
-${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("browserbase")} ${chalk.cyan("trials=")}${chalk.yellow("3")}
-
-${chalk.dim("# Run evals using the Stagehand API")}
-${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("browserbase")} ${chalk.cyan("api=")}${chalk.yellow("true")}
-
-${chalk.dim("# Run evals from only the 'act' category with a max of 4 running at any given time")}
-${chalk.green("pnpm run evals")} ${chalk.cyan("category")} ${chalk.yellow("act")} ${chalk.cyan("concurrency=")}${chalk.yellow("4")}
-
-${chalk.dim("# Execute a specific eval by filename")}
-${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("my_eval_name")}
-`;
-
 const body = dedent`
 ${chalk.magenta.underline("Keys\n")}
 ${chalk.cyan("env".padEnd(12))} ${"target environment".padEnd(24)}
-(default ${chalk.dim("LOCAL")}) [${chalk.yellow("browserbase")}, ${chalk.yellow("local")}]
+(default ${chalk.dim("LOCAL")}) [${chalk.yellow("BROWSERBASE")}, ${chalk.yellow("LOCAL")}] ${chalk.gray("← LOCAL sets api=false")}

 ${chalk.cyan("api".padEnd(12))} ${"use the Stagehand API".padEnd(24)}
-(default ${chalk.dim("false")}) [${chalk.yellow("true")}, ${chalk.yellow("false")}]
+(default ${chalk.dim("false")}) [${chalk.yellow("true")}, ${chalk.yellow("false")}]

-${chalk.cyan("trials".padEnd(12))} ${"number of trials per task".padEnd(24)}
-(default ${chalk.dim("3")})
+${chalk.cyan("trials".padEnd(12))} ${"number of trials".padEnd(24)}
+(default ${chalk.dim("10")})

 ${chalk.cyan("concurrency".padEnd(12))} ${"max parallel sessions".padEnd(24)}
-(default ${chalk.dim("3")})
+(default ${chalk.dim("10")})

 ${chalk.cyan("provider".padEnd(12))} ${"override LLM provider".padEnd(24)}
-(default ${chalk.dim(providerDefault || "varies by model")}) [${chalk.yellow("openai")}, ${chalk.yellow("anthropic")}, ${chalk.yellow("google")}, ${chalk.yellow("together")}, ${chalk.yellow("groq")}, ${chalk.yellow("cerebras")}]
+(default ${chalk.dim(providerDefault)}) [${chalk.yellow("OPENAI")}, ${chalk.yellow("ANTHROPIC")}, ${chalk.yellow("GOOGLE")}, ${chalk.yellow("TOGETHER")}, ${chalk.yellow("GROQ")}, ${chalk.yellow("CEREBRAS")}]

 ${chalk.cyan("max_k".padEnd(12))} ${"max test cases per dataset".padEnd(24)}
 (default ${chalk.dim("25")})

-${chalk.cyan("--dataset".padEnd(12))} ${"filter to specific benchmark".padEnd(24)}
-(optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}, ${chalk.yellow("webbench")}, ${chalk.yellow("osworld")}, ${chalk.yellow("onlineMind2Web")}]
+${chalk.cyan("--dataset".padEnd(12))} ${"filter dataset for benchmarks".padEnd(24)}
+(optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}]


 ${chalk.magenta.underline("Positional filters\n")}
-
-category <category_name>
-
-${chalk.gray("Available categories:")}
-${DEFAULT_EVAL_CATEGORIES.slice(0, 5)
-  .map((c) => chalk.yellow(c))
-  .join(", ")},
-${DEFAULT_EVAL_CATEGORIES.slice(5, 10)
-  .map((c) => chalk.yellow(c))
-  .join(", ")}${DEFAULT_EVAL_CATEGORIES.slice(10).length > 0 ? "," : ""}
-${DEFAULT_EVAL_CATEGORIES.slice(10)
-  .map((c) => chalk.yellow(c))
-  .join(", ")}
-`;
+category <category_name> one of: ${DEFAULT_EVAL_CATEGORIES.map((c) =>
+  chalk.yellow(c),
+).join(", ")}

-if (!detailed)
-  return `${header}\n\n${synopsis}\n\nFor more details: ${chalk.bold(
-    "pnpm run evals -man\n",
-  )}`;
-
-const externalBenchmarksSection = dedent`
-${chalk.magenta.underline("\nExternal Benchmarks\n")}
-
-${chalk.cyan.bold("WebBench")} - 5,607 real-world web automation tasks across 452 live websites
-
-${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/webbench")}
-
-${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=webbench pnpm run evals")}
-
-${chalk.gray("Environment Variables:")}
-
-EVAL_WEBBENCH_LIMIT max tasks to run (default: 25)
-EVAL_WEBBENCH_SAMPLE random sample count before limit
-EVAL_WEBBENCH_DIFFICULTY filter: [${chalk.yellow("easy")}, ${chalk.yellow("hard")}] (254 easy, 61 hard tasks)
-EVAL_WEBBENCH_CATEGORY filter: [${chalk.yellow("READ")}, ${chalk.yellow("CREATE")}, ${chalk.yellow("UPDATE")}, ${chalk.yellow("DELETE")}, ${chalk.yellow("FILE_MANIPULATION")}]
-EVAL_WEBBENCH_USE_HITL use only HITL dataset with difficulty ratings (true/false)
-
-${chalk.dim("Examples:")}
-
-${chalk.green("EVAL_WEBBENCH_DIFFICULTY=easy EVAL_WEBBENCH_LIMIT=10 pnpm run evals name=agent/webbench")}
-
-${chalk.green("EVAL_DATASET=webbench EVAL_WEBBENCH_CATEGORY=READ pnpm run evals")}
-
-
-${chalk.cyan.bold("GAIA")} - General AI Assistant benchmark for complex reasoning
-
-${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/gaia")}
-
-${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=gaia pnpm run evals")}
-
-${chalk.gray("Environment Variables:")}
-
-EVAL_GAIA_LIMIT max tasks to run (default: 25)
-EVAL_GAIA_SAMPLE random sample count before limit
-EVAL_GAIA_LEVEL filter by difficulty level [${chalk.yellow("1")}, ${chalk.yellow("2")}, ${chalk.yellow("3")}]
+${chalk.magenta.underline("\nExamples")}

-${chalk.dim("Example:")}
-
-${chalk.green("EVAL_GAIA_LEVEL=1 EVAL_GAIA_LIMIT=10 pnpm run evals name=agent/gaia")}
-
-
-${chalk.cyan.bold("WebVoyager")} - Web navigation and task completion benchmark
-
-${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/webvoyager")}
-
-${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=webvoyager pnpm run evals")}
-
-${chalk.gray("Environment Variables:")}
-
-EVAL_WEBVOYAGER_LIMIT max tasks to run (default: 25)
-EVAL_WEBVOYAGER_SAMPLE random sample count before limit
-
-${chalk.gray("Ground Truth Evaluation:")}
-
-WebVoyager uses ground truth answers for improved accuracy:
-• Checks agent's "Final Answer:" against reference answers
-• Supports golden (ideal) and possible (acceptable) answers
-• Falls back to screenshot evaluation when uncertain
-• Reference data: evals/datasets/webvoyager/reference-answers.json
-
-${chalk.dim("Example:")}
-
-${chalk.green("EVAL_WEBVOYAGER_SAMPLE=50 EVAL_WEBVOYAGER_LIMIT=10 pnpm run evals name=agent/webvoyager")}
-
-
-${chalk.cyan.bold("OSWorld")} - Chrome browser automation tasks from the OSWorld benchmark
-
-${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/osworld")}
-
-${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=osworld pnpm run evals")}
-
-${chalk.gray("Environment Variables:")}
+${chalk.dim("# Run every evaluation locally with default settings")}

-EVAL_OSWORLD_LIMIT max tasks to run (default: 25)
-EVAL_OSWORLD_SAMPLE random sample count before limit
-EVAL_OSWORLD_SOURCE filter by source: [${chalk.yellow("Mind2Web")}, ${chalk.yellow("test_task_1")}, ...]
-EVAL_OSWORLD_EVALUATION_TYPE filter by eval type: [${chalk.yellow("url_match")}, ${chalk.yellow("string_match")}, ${chalk.yellow("dom_state")}, ${chalk.yellow("custom")}]
-EVAL_OSWORLD_TIMEOUT timeout per task in milliseconds (default: 60000)
+${chalk.green("pnpm run evals")}

-${chalk.dim("Examples:")}

-${chalk.green("EVAL_OSWORLD_SOURCE=Mind2Web EVAL_OSWORLD_LIMIT=10 pnpm run evals name=agent/osworld")}
+${chalk.dim("# Same as above but in Browserbase with three trials")}

-${chalk.green("EVAL_DATASET=osworld EVAL_OSWORLD_EVALUATION_TYPE=url_match pnpm run evals")}
-
-
-${chalk.cyan.bold("Mind2Web")} - Real-world web interaction tasks for evaluating web agents
-
-${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/onlineMind2Web")}
+${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("BROWSERBASE")} ${chalk.cyan(
+  "trials=",
+)}${chalk.yellow("3")}

-${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=onlineMind2Web pnpm run evals")}

-${chalk.gray("Environment Variables:")}
+${chalk.dim("# Run evals using the Stagehand API")}

-EVAL_ONLINEMIND2WEB_LIMIT max tasks to run (default: 25)
-EVAL_ONLINEMIND2WEB_SAMPLE random sample count before limit
+${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("BROWSERBASE")} ${chalk.cyan(
+  "api=",
+)}${chalk.yellow("true")}
+
+
+${chalk.dim(
+  "# Run evals from only the 'act' category with a max of 4 running at any given time",
+)}

-${chalk.dim("Example:")}
+${chalk.green("pnpm run evals")} ${chalk.cyan("category")} ${chalk.yellow("act")} ${chalk.cyan(
+  "concurrency=",
+)}${chalk.yellow("4")}
+
+
+${chalk.dim("# Execute a specific eval by filename")}

-${chalk.green("EVAL_ONLINEMIND2WEB_SAMPLE=50 EVAL_ONLINEMIND2WEB_LIMIT=10 pnpm run evals name=agent/onlineMind2Web")}
+${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("my_eval_name")}
 `;

+if (!detailed)
+  return `${header}\n\n${synopsis}\n\nFor more details: ${chalk.bold(
+    "pnpm run evals -man\n",
+  )}`;
+
 const envSection = dedent`
-${chalk.magenta.underline("\nGlobal Environment Variables\n")}
-
-EVAL_ENV target environment, overridable via ${chalk.cyan("env=")}
-
-EVAL_TRIAL_COUNT number of trials, overridable via ${chalk.cyan("trials=")}
-
-EVAL_MAX_CONCURRENCY parallel sessions, overridable via ${chalk.cyan("concurrency=")}
-
-EVAL_PROVIDER LLM provider, overridable via ${chalk.cyan("provider=")}
-
-EVAL_MAX_K global limit for all benchmarks (overrides individual limits)
+${chalk.magenta.underline("\nEnvironment variables\n")}
+EVAL_ENV overridable via ${chalk.cyan("env=")}

-EVAL_DATASET filter to specific benchmark, overridable via ${chalk.cyan("--dataset=")}
+EVAL_TRIAL_COUNT overridable via ${chalk.cyan("trials=")}

-USE_API use Stagehand API, overridable via ${chalk.cyan("api=")}
+EVAL_MAX_CONCURRENCY overridable via ${chalk.cyan("concurrency=")}

-EVAL_MODELS comma-separated list of models to use
+EVAL_PROVIDER overridable via ${chalk.cyan("provider=")}

-AGENT_EVAL_MAX_STEPS max steps for agent tasks (default: 50)
+USE_API overridable via ${chalk.cyan("api=true")}
 `;

-return `${header}\n\n${synopsis}\n\n${body}\n${examplesSection}\n${externalBenchmarksSection}\n${envSection}\n`;
+return `${header}\n\n${synopsis}\n\n${body}\n${envSection}\n`;
 }

 const wantsHelp = rawArgs.some((a) => HELP_REGEX.test(a));
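The usage text above documents `key=value` tokens (env, api, trials, concurrency, provider, name) plus positional filters. As a rough sketch of how those tokens could be folded into an options object, with defaults taken from the new help text; the `EvalOptions` shape and `parseEvalArgs` helper are hypothetical, not this repo's actual parser:

```typescript
// Hypothetical option bag for the key=value flags shown in the usage text.
interface EvalOptions {
  env: "LOCAL" | "BROWSERBASE";
  api: boolean;
  trials: number;
  concurrency: number;
  provider?: string;
  name?: string;
}

// Fold key=value tokens into options; positional filters (e.g. "category act")
// would be handled elsewhere. Defaults mirror the help text (trials 10, concurrency 10).
function parseEvalArgs(argv: string[]): EvalOptions {
  const opts: EvalOptions = { env: "LOCAL", api: false, trials: 10, concurrency: 10 };
  for (const token of argv) {
    const [key, value] = token.split("=");
    if (value === undefined) continue; // not a key=value token
    switch (key) {
      case "env":
        opts.env = value.toUpperCase() === "BROWSERBASE" ? "BROWSERBASE" : "LOCAL";
        break;
      case "api":
        opts.api = value === "true";
        break;
      case "trials":
        opts.trials = Number(value);
        break;
      case "concurrency":
        opts.concurrency = Number(value);
        break;
      case "provider":
        opts.provider = value;
        break;
      case "name":
        opts.name = value;
        break;
    }
  }
  return opts;
}

// e.g. parseEvalArgs(["env=BROWSERBASE", "trials=3"]) →
// { env: "BROWSERBASE", api: false, trials: 3, concurrency: 10 }
```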

evals/cli.ts

Lines changed: 1 addition & 0 deletions
@@ -381,6 +381,7 @@ function handleRun(args: string[]): void {
   webbench: "agent/webbench",
   gaia: "agent/gaia",
   webvoyager: "agent/webvoyager",
+  osworld: "agent/osworld",
   onlineMind2Web: "agent/onlineMind2Web",
 };

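This hunk adds an `osworld` entry to the dataset-to-eval-name map inside `handleRun`. A hedged sketch of how such a map can resolve a dataset flag (e.g. `EVAL_DATASET` or `--dataset`) into the equivalent `name=` filter; `resolveDatasetEval` is a hypothetical helper, not code from this commit:

```typescript
// Dataset aliases mapped to the eval that runs them, mirroring the hunk above.
const DATASET_TO_EVAL: Record<string, string> = {
  webbench: "agent/webbench",
  gaia: "agent/gaia",
  webvoyager: "agent/webvoyager",
  osworld: "agent/osworld",
  onlineMind2Web: "agent/onlineMind2Web",
};

// Hypothetical resolver: turns a dataset flag into the eval name to run.
function resolveDatasetEval(dataset: string | undefined): string | undefined {
  if (!dataset) return undefined;
  const evalName = DATASET_TO_EVAL[dataset];
  if (!evalName) {
    throw new Error(
      `Unknown dataset "${dataset}". Expected one of: ${Object.keys(DATASET_TO_EVAL).join(", ")}`,
    );
  }
  return evalName; // e.g. "osworld" → "agent/osworld", equivalent to name=agent/osworld
}
```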

evals/deterministic/tests/Errors/apiKeyError.test.ts

Lines changed: 18 additions & 6 deletions
@@ -25,8 +25,12 @@ test.describe("API key/LLMClient error", () => {
 
 expect(errorThrown).toBeInstanceOf(Error);
 expect(
-  errorThrown?.message?.includes("No LLM API key or LLM Client configured") ||
-  errorThrown?.message?.includes("API key is missing. Pass it using the 'apiKey' parameter")
+  errorThrown?.message?.includes(
+    "No LLM API key or LLM Client configured",
+  ) ||
+    errorThrown?.message?.includes(
+      "API key is missing. Pass it using the 'apiKey' parameter",
+    ),
 ).toBe(true);
 
 await stagehand.close();
@@ -49,8 +53,12 @@ test.describe("API key/LLMClient error", () => {
 
 expect(errorThrown).toBeInstanceOf(Error);
 expect(
-  errorThrown?.message?.includes("No LLM API key or LLM Client configured") ||
-  errorThrown?.message?.includes("API key is missing. Pass it using the 'apiKey' parameter")
+  errorThrown?.message?.includes(
+    "No LLM API key or LLM Client configured",
+  ) ||
+    errorThrown?.message?.includes(
+      "API key is missing. Pass it using the 'apiKey' parameter",
+    ),
 ).toBe(true);
 
 await stagehand.close();
@@ -71,8 +79,12 @@ test.describe("API key/LLMClient error", () => {
 
 expect(errorThrown).toBeInstanceOf(Error);
 expect(
-  errorThrown?.message?.includes("No LLM API key or LLM Client configured") ||
-  errorThrown?.message?.includes("API key is missing. Pass it using the 'apiKey' parameter")
+  errorThrown?.message?.includes(
+    "No LLM API key or LLM Client configured",
+  ) ||
+    errorThrown?.message?.includes(
+      "API key is missing. Pass it using the 'apiKey' parameter",
+    ),
 ).toBe(true);
 
 await stagehand.close();
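All three hunks apply the same reformatting to an assertion that accepts either missing-key error message. If that check keeps growing, a small shared helper could centralize it; the sketch below is a hypothetical refactor, not part of this diff:

```typescript
import { expect } from "@playwright/test";

// Accepted error messages for a missing LLM API key, taken from the assertions above.
const MISSING_KEY_MESSAGES = [
  "No LLM API key or LLM Client configured",
  "API key is missing. Pass it using the 'apiKey' parameter",
];

// Asserts that the thrown error is an Error whose message matches one of the
// accepted missing-key messages.
function expectMissingApiKeyError(errorThrown: Error | undefined): void {
  expect(errorThrown).toBeInstanceOf(Error);
  expect(
    MISSING_KEY_MESSAGES.some((msg) => errorThrown?.message?.includes(msg)),
  ).toBe(true);
}
```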

0 commit comments
