Skip to content

Commit 8e76ed7

Browse files
committed
fix: 5 critical logic bugs + security hardening + unit tests
1 parent 350a3b9 commit 8e76ed7

File tree

6 files changed

+517
-58
lines changed

6 files changed

+517
-58
lines changed

src/assertions.ts

Lines changed: 82 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ export function evaluate(trace: AgentTrace, expect: Expectations): AssertionResu
8080

8181
// output_matches
8282
if (expect.output_matches) {
83-
let re: RegExp;
83+
let re: RegExp | null = null;
8484
try {
8585
re = new RegExp(expect.output_matches);
8686
} catch (err: any) {
@@ -90,14 +90,15 @@ export function evaluate(trace: AgentTrace, expect: Expectations): AssertionResu
9090
expected: expect.output_matches,
9191
message: `Invalid regex: /${expect.output_matches}/ — ${err.message}`,
9292
});
93-
return results;
9493
}
95-
results.push({
96-
name: `output_matches: /${expect.output_matches}/`,
97-
passed: re.test(outputs),
98-
expected: expect.output_matches,
99-
actual: outputs.slice(0, 200),
100-
});
94+
if (re) {
95+
results.push({
96+
name: `output_matches: /${expect.output_matches}/`,
97+
passed: re.test(outputs),
98+
expected: expect.output_matches,
99+
actual: outputs.slice(0, 200),
100+
});
101+
}
101102
}
102103

103104
// max_steps
@@ -278,17 +279,10 @@ export function evaluate(trace: AgentTrace, expect: Expectations): AssertionResu
278279
}
279280
}
280281

281-
// custom
282+
// custom — safe property-access evaluator (no arbitrary code execution)
282283
if (expect.custom) {
283284
try {
284-
const fn = new Function(
285-
'trace',
286-
'steps',
287-
'toolCalls',
288-
'outputs',
289-
`return (${expect.custom})`,
290-
);
291-
const result = fn(trace, trace.steps, toolCalls, outputs);
285+
const result = evaluateSafeExpression(expect.custom, { trace, steps: trace.steps, toolCalls, outputs });
292286
results.push({
293287
name: `custom: ${expect.custom.slice(0, 60)}`,
294288
passed: !!result,
@@ -459,6 +453,77 @@ function deepPartialMatch(actual: Record<string, any>, expected: Record<string,
459453
return true;
460454
}
461455

456+
/**
457+
* Safe expression evaluator for custom assertions.
458+
* Only allows property access (dot notation, bracket notation) and comparison operators.
459+
* NO function calls, NO assignment, NO constructors, NO template literals.
460+
*
461+
* Supported: trace.steps.length > 0, toolCalls.length === 3, outputs.includes("hello")
462+
* Blocked: process.exit(), require('fs'), new Function(), import(), eval(), etc.
463+
*/
464+
export function evaluateSafeExpression(
465+
expr: string,
466+
context: Record<string, any>,
467+
): any {
468+
// Block dangerous patterns
469+
const BLOCKED_PATTERNS = [
470+
/\bnew\s+/, // new Function(), new (anything)
471+
/\bimport\s*\(/, // dynamic import()
472+
/\beval\s*\(/, // eval()
473+
/\brequire\s*\(/, // require()
474+
/\bFunction\s*\(/, // Function()
475+
/\b__proto__\b/, // prototype pollution
476+
/\bconstructor\b/, // constructor access
477+
/\bprototype\b/, // prototype access
478+
/\bprocess\b/, // process.exit, process.env
479+
/\bglobal(This)?\b/, // globalThis
480+
/\bwindow\b/, // browser global
481+
/\bself\b/, // worker global
482+
/`/, // template literals (can execute code)
483+
/\$\{/, // template literal interpolation
484+
/;\s*\w/, // multiple statements
485+
/\bwhile\b/, // loops
486+
/\bfor\b/, // loops
487+
/\bdelete\b/, // delete operator
488+
/\bvoid\b/, // void operator
489+
/\btypeof\b/, // typeof (unnecessary for assertions)
490+
/(?<![=!<>])=(?!=)/, // assignment (but not ==, ===, !=, !==, <=, >=)
491+
/\bthis\b/, // this reference
492+
];
493+
494+
for (const pattern of BLOCKED_PATTERNS) {
495+
if (pattern.test(expr)) {
496+
throw new Error(
497+
`Unsafe expression blocked: pattern "${pattern.source}" matched. ` +
498+
`Custom assertions only support property access and comparisons.`,
499+
);
500+
}
501+
}
502+
503+
// Only allow: identifiers, dots, brackets, numbers, strings, comparisons, boolean operators, parens
504+
// This is a whitelist approach on top of the blocklist
505+
const ALLOWED = /^[\w\s.[\]()'"<>=!&|+\-*/%,?:]+$/;
506+
if (!ALLOWED.test(expr)) {
507+
throw new Error(
508+
`Unsafe expression blocked: contains disallowed characters. ` +
509+
`Custom assertions only support property access and comparisons.`,
510+
);
511+
}
512+
513+
// Use Function with frozen context objects to prevent mutation
514+
const frozenContext: Record<string, any> = {};
515+
for (const [key, value] of Object.entries(context)) {
516+
frozenContext[key] = typeof value === 'object' && value !== null ? Object.freeze(value) : value;
517+
}
518+
519+
const keys = Object.keys(frozenContext);
520+
const values = Object.values(frozenContext);
521+
522+
// Build a function with limited scope
523+
const fn = new Function(...keys, `"use strict"; return (${expr})`);
524+
return fn(...values);
525+
}
526+
462527
/**
463528
* Standalone assertion: compare a trace against a named golden snapshot.
464529
* Returns an AssertionResult compatible with AgentProbe's result format.

src/codegen.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ const NL_PATTERNS: NLPattern[] = [
218218
name: `Costs under $${m[1]}`,
219219
input: 'Perform the task',
220220
trace: '',
221-
expect: { max_cost: parseFloat(m[1]) },
221+
expect: { max_cost_usd: parseFloat(m[1]) },
222222
}),
223223
},
224224
{

src/cost.ts

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -97,15 +97,20 @@ export function calculateCost(trace: AgentTrace): CostReport {
9797
}
9898

9999
/**
100-
* Find pricing for a model, fuzzy-matching known models.
100+
* Find pricing for a model, using exact match first, then longest prefix match.
101+
* Sorting by key length descending ensures 'o1-mini' matches before 'o1'.
101102
*/
102103
export function findPricing(model: string): { input: number; output: number } {
104+
// Exact match first
103105
if (PRICING[model]) return PRICING[model];
104-
// Fuzzy match: try prefix
106+
107+
// Longest prefix match: sort keys by length descending so more specific models match first
105108
const normalized = model.toLowerCase();
106-
for (const [key, val] of Object.entries(PRICING)) {
107-
if (normalized.includes(key) || key.includes(normalized)) return val;
109+
const sortedKeys = Object.keys(PRICING).sort((a, b) => b.length - a.length);
110+
for (const key of sortedKeys) {
111+
if (normalized.startsWith(key) || normalized.includes(key)) return PRICING[key];
108112
}
113+
109114
// Default: gpt-4o-mini pricing as fallback
110115
return { input: 0.15, output: 0.6 };
111116
}

src/runner.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,16 @@ async function runSingleTestInner(
210210
}
211211
}
212212

213+
// Zero-assertion warning: tests with no assertions should not silently pass
214+
if (assertions.length === 0) {
215+
console.warn(chalk.yellow(`⚠️ Test "${test.name}" has no assertions defined`));
216+
assertions.push({
217+
name: 'no_assertions',
218+
passed: false,
219+
message: 'No assertions defined — test cannot pass without assertions',
220+
});
221+
}
222+
213223
const passed = assertions.every((a) => a.passed);
214224

215225
// Restore env

0 commit comments

Comments
 (0)