@@ -103,195 +103,89 @@ function buildUsage(detailed = false): string {
     `pnpm run evals [key=value]… [category <name>] | name=<evalName>`,
   );
 
-  const examplesSection = `
-    ${chalk.magenta.underline("Examples")}
-
-    ${chalk.dim("# Run every evaluation locally with default settings")}
-    ${chalk.green("pnpm run evals")}
-
-    ${chalk.dim("# Same as above but in Browserbase with three trials")}
-    ${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("browserbase")} ${chalk.cyan("trials=")}${chalk.yellow("3")}
-
-    ${chalk.dim("# Run evals using the Stagehand API")}
-    ${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("browserbase")} ${chalk.cyan("api=")}${chalk.yellow("true")}
-
-    ${chalk.dim("# Run evals from only the 'act' category with a max of 4 running at any given time")}
-    ${chalk.green("pnpm run evals")} ${chalk.cyan("category")} ${chalk.yellow("act")} ${chalk.cyan("concurrency=")}${chalk.yellow("4")}
-
-    ${chalk.dim("# Execute a specific eval by filename")}
-    ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("my_eval_name")}
-  `;
-
   const body = dedent`
     ${chalk.magenta.underline("Keys\n")}
     ${chalk.cyan("env".padEnd(12))} ${"target environment".padEnd(24)}
-    (default ${chalk.dim("LOCAL")}) [${chalk.yellow("browserbase")}, ${chalk.yellow("local")}]
+    (default ${chalk.dim("LOCAL")}) [${chalk.yellow("BROWSERBASE")}, ${chalk.yellow("LOCAL")}] ${chalk.gray("← LOCAL sets api=false")}
 
     ${chalk.cyan("api".padEnd(12))} ${"use the Stagehand API".padEnd(24)}
-    (default ${chalk.dim("false")}) [${chalk.yellow("true")}, ${chalk.yellow("false")}]
+    (default ${chalk.dim("false")}) [${chalk.yellow("true")}, ${chalk.yellow("false")}]
 
-    ${chalk.cyan("trials".padEnd(12))} ${"number of trials per task".padEnd(24)}
-    (default ${chalk.dim("3")})
+    ${chalk.cyan("trials".padEnd(12))} ${"number of trials".padEnd(24)}
+    (default ${chalk.dim("10")})
 
     ${chalk.cyan("concurrency".padEnd(12))} ${"max parallel sessions".padEnd(24)}
-    (default ${chalk.dim("3")})
+    (default ${chalk.dim("10")})
 
     ${chalk.cyan("provider".padEnd(12))} ${"override LLM provider".padEnd(24)}
-    (default ${chalk.dim(providerDefault || "varies by model")}) [${chalk.yellow("openai")}, ${chalk.yellow("anthropic")}, ${chalk.yellow("google")}, ${chalk.yellow("together")}, ${chalk.yellow("groq")}, ${chalk.yellow("cerebras")}]
+    (default ${chalk.dim(providerDefault)}) [${chalk.yellow("OPENAI")}, ${chalk.yellow("ANTHROPIC")}, ${chalk.yellow("GOOGLE")}, ${chalk.yellow("TOGETHER")}, ${chalk.yellow("GROQ")}, ${chalk.yellow("CEREBRAS")}]
 
     ${chalk.cyan("max_k".padEnd(12))} ${"max test cases per dataset".padEnd(24)}
     (default ${chalk.dim("25")})
 
-    ${chalk.cyan("--dataset".padEnd(12))} ${"filter to specific benchmark".padEnd(24)}
-    (optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}, ${chalk.yellow("webbench")}, ${chalk.yellow("osworld")}, ${chalk.yellow("onlineMind2Web")}]
+    ${chalk.cyan("--dataset".padEnd(12))} ${"filter dataset for benchmarks".padEnd(24)}
+    (optional) [${chalk.yellow("gaia")}, ${chalk.yellow("webvoyager")}]
 
 
     ${chalk.magenta.underline("Positional filters\n")}
-
-    category <category_name>
-
-    ${chalk.gray("Available categories:")}
-    ${DEFAULT_EVAL_CATEGORIES.slice(0, 5)
-      .map((c) => chalk.yellow(c))
-      .join(", ")},
-    ${DEFAULT_EVAL_CATEGORIES.slice(5, 10)
-      .map((c) => chalk.yellow(c))
-      .join(", ")}${DEFAULT_EVAL_CATEGORIES.slice(10).length > 0 ? "," : ""}
-    ${DEFAULT_EVAL_CATEGORIES.slice(10)
-      .map((c) => chalk.yellow(c))
-      .join(", ")}
-  `;
+    category <category_name> one of: ${DEFAULT_EVAL_CATEGORIES.map((c) =>
+      chalk.yellow(c),
+    ).join(", ")}
 
-  if (!detailed)
-    return `${header}\n\n${synopsis}\n\nFor more details: ${chalk.bold(
-      "pnpm run evals -man\n",
-    )}`;
-
-  const externalBenchmarksSection = dedent`
-    ${chalk.magenta.underline("\nExternal Benchmarks\n")}
-
-    ${chalk.cyan.bold("WebBench")} - 5,607 real-world web automation tasks across 452 live websites
-
-    ${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/webbench")}
-
-    ${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=webbench pnpm run evals")}
-
-    ${chalk.gray("Environment Variables:")}
-
-    EVAL_WEBBENCH_LIMIT          max tasks to run (default: 25)
-    EVAL_WEBBENCH_SAMPLE         random sample count before limit
-    EVAL_WEBBENCH_DIFFICULTY     filter: [${chalk.yellow("easy")}, ${chalk.yellow("hard")}] (254 easy, 61 hard tasks)
-    EVAL_WEBBENCH_CATEGORY       filter: [${chalk.yellow("READ")}, ${chalk.yellow("CREATE")}, ${chalk.yellow("UPDATE")}, ${chalk.yellow("DELETE")}, ${chalk.yellow("FILE_MANIPULATION")}]
-    EVAL_WEBBENCH_USE_HITL       use only HITL dataset with difficulty ratings (true/false)
-
-    ${chalk.dim("Examples:")}
-
-    ${chalk.green("EVAL_WEBBENCH_DIFFICULTY=easy EVAL_WEBBENCH_LIMIT=10 pnpm run evals name=agent/webbench")}
-
-    ${chalk.green("EVAL_DATASET=webbench EVAL_WEBBENCH_CATEGORY=READ pnpm run evals")}
-
-
-    ${chalk.cyan.bold("GAIA")} - General AI Assistant benchmark for complex reasoning
-
-    ${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/gaia")}
-
-    ${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=gaia pnpm run evals")}
-
-    ${chalk.gray("Environment Variables:")}
-
-    EVAL_GAIA_LIMIT              max tasks to run (default: 25)
-    EVAL_GAIA_SAMPLE             random sample count before limit
-    EVAL_GAIA_LEVEL              filter by difficulty level [${chalk.yellow("1")}, ${chalk.yellow("2")}, ${chalk.yellow("3")}]
+    ${chalk.magenta.underline("\nExamples")}
 
-    ${chalk.dim("Example:")}
-
-    ${chalk.green("EVAL_GAIA_LEVEL=1 EVAL_GAIA_LIMIT=10 pnpm run evals name=agent/gaia")}
-
-
-    ${chalk.cyan.bold("WebVoyager")} - Web navigation and task completion benchmark
-
-    ${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/webvoyager")}
-
-    ${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=webvoyager pnpm run evals")}
-
-    ${chalk.gray("Environment Variables:")}
-
-    EVAL_WEBVOYAGER_LIMIT        max tasks to run (default: 25)
-    EVAL_WEBVOYAGER_SAMPLE       random sample count before limit
-
-    ${chalk.gray("Ground Truth Evaluation:")}
-
-    WebVoyager uses ground truth answers for improved accuracy:
-    • Checks agent's "Final Answer:" against reference answers
-    • Supports golden (ideal) and possible (acceptable) answers
-    • Falls back to screenshot evaluation when uncertain
-    • Reference data: evals/datasets/webvoyager/reference-answers.json
-
-    ${chalk.dim("Example:")}
-
-    ${chalk.green("EVAL_WEBVOYAGER_SAMPLE=50 EVAL_WEBVOYAGER_LIMIT=10 pnpm run evals name=agent/webvoyager")}
-
-
-    ${chalk.cyan.bold("OSWorld")} - Chrome browser automation tasks from the OSWorld benchmark
-
-    ${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/osworld")}
-
-    ${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=osworld pnpm run evals")}
-
-    ${chalk.gray("Environment Variables:")}
+    ${chalk.dim("# Run every evaluation locally with default settings")}
 
-    EVAL_OSWORLD_LIMIT            max tasks to run (default: 25)
-    EVAL_OSWORLD_SAMPLE           random sample count before limit
-    EVAL_OSWORLD_SOURCE           filter by source: [${chalk.yellow("Mind2Web")}, ${chalk.yellow("test_task_1")}, ...]
-    EVAL_OSWORLD_EVALUATION_TYPE  filter by eval type: [${chalk.yellow("url_match")}, ${chalk.yellow("string_match")}, ${chalk.yellow("dom_state")}, ${chalk.yellow("custom")}]
-    EVAL_OSWORLD_TIMEOUT          timeout per task in milliseconds (default: 60000)
+    ${chalk.green("pnpm run evals")}
 
-    ${chalk.dim("Examples:")}
 
-    ${chalk.green("EVAL_OSWORLD_SOURCE=Mind2Web EVAL_OSWORLD_LIMIT=10 pnpm run evals name=agent/osworld")}
+    ${chalk.dim("# Same as above but in Browserbase with three trials")}
 
-    ${chalk.green("EVAL_DATASET=osworld EVAL_OSWORLD_EVALUATION_TYPE=url_match pnpm run evals")}
-
-
-    ${chalk.cyan.bold("Mind2Web")} - Real-world web interaction tasks for evaluating web agents
-
-    ${chalk.dim("Run:")} ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("agent/onlineMind2Web")}
+    ${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("BROWSERBASE")} ${chalk.cyan(
+      "trials=",
+    )}${chalk.yellow("3")}
 
-    ${chalk.dim("Or:")} ${chalk.green("EVAL_DATASET=onlineMind2Web pnpm run evals")}
 
-    ${chalk.gray("Environment Variables:")}
+    ${chalk.dim("# Run evals using the Stagehand API")}
 
-    EVAL_ONLINEMIND2WEB_LIMIT     max tasks to run (default: 25)
-    EVAL_ONLINEMIND2WEB_SAMPLE    random sample count before limit
+    ${chalk.green("pnpm run evals")} ${chalk.cyan("env=")}${chalk.yellow("BROWSERBASE")} ${chalk.cyan(
+      "api=",
+    )}${chalk.yellow("true")}
+
+
+    ${chalk.dim(
+      "# Run evals from only the 'act' category with a max of 4 running at any given time",
+    )}
 
-    ${chalk.dim("Example:")}
+    ${chalk.green("pnpm run evals")} ${chalk.cyan("category")} ${chalk.yellow("act")} ${chalk.cyan(
+      "concurrency=",
+    )}${chalk.yellow("4")}
+
+
+    ${chalk.dim("# Execute a specific eval by filename")}
 
-    ${chalk.green("EVAL_ONLINEMIND2WEB_SAMPLE=50 EVAL_ONLINEMIND2WEB_LIMIT=10 pnpm run evals name=agent/onlineMind2Web")}
+    ${chalk.green("pnpm run evals")} ${chalk.cyan("name=")}${chalk.yellow("my_eval_name")}
   `;
 
+  if (!detailed)
+    return `${header}\n\n${synopsis}\n\nFor more details: ${chalk.bold(
+      "pnpm run evals -man\n",
+    )}`;
+
   const envSection = dedent`
-    ${chalk.magenta.underline("\nGlobal Environment Variables\n")}
-
-    EVAL_ENV              target environment, overridable via ${chalk.cyan("env=")}
-
-    EVAL_TRIAL_COUNT      number of trials, overridable via ${chalk.cyan("trials=")}
-
-    EVAL_MAX_CONCURRENCY  parallel sessions, overridable via ${chalk.cyan("concurrency=")}
-
-    EVAL_PROVIDER         LLM provider, overridable via ${chalk.cyan("provider=")}
-
-    EVAL_MAX_K            global limit for all benchmarks (overrides individual limits)
+    ${chalk.magenta.underline("\nEnvironment variables\n")}
+    EVAL_ENV              overridable via ${chalk.cyan("env=")}
 
-    EVAL_DATASET          filter to specific benchmark, overridable via ${chalk.cyan("--dataset=")}
+    EVAL_TRIAL_COUNT      overridable via ${chalk.cyan("trials=")}
 
-    USE_API               use Stagehand API, overridable via ${chalk.cyan("api=")}
+    EVAL_MAX_CONCURRENCY  overridable via ${chalk.cyan("concurrency=")}
 
-    EVAL_MODELS           comma-separated list of models to use
+    EVAL_PROVIDER         overridable via ${chalk.cyan("provider=")}
 
-    AGENT_EVAL_MAX_STEPS  max steps for agent tasks (default: 50)
+    USE_API               overridable via ${chalk.cyan("api=true")}
   `;
 
-  return `${header}\n\n${synopsis}\n\n${body}\n${examplesSection}\n${externalBenchmarksSection}\n${envSection}\n`;
+  return `${header}\n\n${synopsis}\n\n${body}\n${envSection}\n`;
 }
 
 const wantsHelp = rawArgs.some((a) => HELP_REGEX.test(a));
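
A note on the column layout used throughout the help text above: `padEnd` is always applied to the plain string before chalk colorizes it. The sketch below shows why that ordering matters; `usageRow` is an illustrative helper, not a function from this diff.

import chalk from "chalk";
import dedent from "dedent";

// Pad the plain string FIRST, then colorize. chalk wraps text in ANSI escape
// codes, so calling padEnd() on an already-colorized string would count the
// invisible escape characters toward the width and misalign the columns.
// `usageRow` is illustrative only; the diff above inlines this pattern.
function usageRow(key: string, description: string, note: string): string {
  return `${chalk.cyan(key.padEnd(12))} ${description.padEnd(24)} ${chalk.dim(note)}`;
}

const demo = dedent`
  ${chalk.magenta.underline("Keys\n")}
  ${usageRow("env", "target environment", "(default LOCAL)")}
  ${usageRow("trials", "number of trials", "(default 10)")}
`;

console.log(demo);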
0 commit comments
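For readers unfamiliar with the `key=value` plus positional-filter argument style that the usage text documents, here is a hypothetical sketch of a parser for it. The repo's actual argument handling is not part of this diff and may differ; `parseEvalArgs` and `EvalArgs` are invented for illustration.

// Hypothetical parser for the CLI grammar the help text describes:
//   pnpm run evals [key=value]… [category <name>] | name=<evalName>
interface EvalArgs {
  keys: Record<string, string>; // env=…, trials=…, api=…, name=…, --dataset=…
  category?: string;            // positional: `category <category_name>`
}

function parseEvalArgs(argv: string[]): EvalArgs {
  const keys: Record<string, string> = {};
  let category: string | undefined;
  for (let i = 0; i < argv.length; i++) {
    const arg = argv[i];
    const eq = arg.indexOf("=");
    if (eq > 0) {
      // key=value pair, e.g. trials=3 or --dataset=gaia
      keys[arg.slice(0, eq)] = arg.slice(eq + 1);
    } else if (arg === "category") {
      // positional filter consumes the next token
      category = argv[++i];
    }
  }
  return { keys, category };
}

// e.g. `pnpm run evals env=BROWSERBASE trials=3 category act`
// → { keys: { env: "BROWSERBASE", trials: "3" }, category: "act" }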