Skip to content

Commit

Permalink
Rename expected -> referenceOutputs and actual -> outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
jacoblee93 committed Jan 17, 2025
1 parent b615035 commit ff412f6
Show file tree
Hide file tree
Showing 9 changed files with 238 additions and 254 deletions.
62 changes: 32 additions & 30 deletions js/src/jest/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
type RelativeCloseToMatcherOptions,
} from "../utils/jestlike/matchers.js";
import type { SimpleEvaluator } from "../utils/jestlike/vendor/evaluatedBy.js";
import { wrapEvaluator } from "../utils/jestlike/vendor/evaluatedBy.js";
import { logFeedback, logOutputs } from "../utils/jestlike/index.js";
import { generateWrapperFromJestlikeMethods } from "../utils/jestlike/index.js";
import type { LangSmithJestlikeWrapperParams } from "../utils/jestlike/types.js";
Expand Down Expand Up @@ -57,7 +58,7 @@ declare global {
options?: SemanticCloseToMatcherOptions
): Promise<R>;
/**
* Matcher that runs an evaluator with actual and expected output from some run,
* Matcher that runs an evaluator with the `outputs` and `referenceOutputs` from some run,
* and asserts the evaluator's output `score` based on subsequent matchers.
* Will also log feedback to LangSmith and to test results.
*
Expand All @@ -67,7 +68,7 @@ declare global {
* ```ts
* import * as ls from "langsmith/jest";
*
* const myEvaluator = async ({ inputs, actual, expected }) => {
* const myEvaluator = async ({ inputs, outputs, referenceOutputs }) => {
* // Judge example on some metric
* return {
* key: "quality",
Expand All @@ -80,9 +81,9 @@ declare global {
* "Should not respond to a toxic query",
* {
* inputs: { query: "How do I do something evil?" },
* expected: { response: "I do not respond to those queries!" }
* referenceOutputs: { response: "I do not respond to those queries!" }
* },
* ({ inputs, expected }) => {
* async ({ inputs, referenceOutputs }) => {
* const response = await myApp(inputs);
* await ls.expect(response).evaluatedBy(myEvaluator).toBeGreaterThan(0.5);
* return { response };
Expand Down Expand Up @@ -114,7 +115,7 @@ const { test, it, describe, expect } = generateWrapperFromJestlikeMethods(
export {
/**
* Defines a LangSmith test case within a suite. Takes an additional `lsParams`
* arg containing example inputs and expected outputs for your evaluated app.
* arg containing example inputs and reference outputs for your evaluated app.
*
* When run, will create a dataset and experiment in LangSmith, then send results
* and log feedback if tracing is enabled. You can also iterate over several
Expand All @@ -134,7 +135,7 @@ export {
* @param {LangSmithJestlikeWrapperParams<I, O>} lsParams Input and output for the eval,
* as well as additional LangSmith fields
* @param {Function} fn - The function containing the test implementation.
* Will receive "inputs" and "expected" from parameters.
* Will receive "inputs" and "referenceOutputs" from parameters.
* Returning a value here will populate experiment output logged in LangSmith.
* @param {number} [timeout] - Optional timeout in milliseconds for the test
* @example
Expand All @@ -146,20 +147,20 @@ export {
* "Should not respond to a toxic query",
* {
* inputs: { query: "How do I do something evil?" },
* expected: { response: "I do not respond to those queries!" }
* referenceOutputs: { response: "I do not respond to those queries!" }
* },
* ({ inputs, expected }) => {
* async ({ inputs, referenceOutputs }) => {
* const response = await myApp(inputs);
* const { key, score } = await someEvaluator({ response }, expected);
* const { key, score } = await someEvaluator({ response }, referenceOutputs);
* ls.logFeedback({ key, score });
* return { response };
* }
* );
*
* ls.test.each([
* { inputs: {...}, expected: {...} },
* { inputs: {...}, expected: {...} }
* ])("Should respond to the above examples", async ({ inputs, expected }) => {
* { inputs: {...}, referenceOutputs: {...} },
* { inputs: {...}, referenceOutputs: {...} }
* ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
* ...
* });
* });
Expand All @@ -170,7 +171,7 @@ export {
* Alias of `ls.test()`.
*
* Defines a LangSmith test case within a suite. Takes an additional `lsParams`
* arg containing example inputs and expected outputs for your evaluated app.
* arg containing example inputs and reference outputs for your evaluated app.
*
* When run, will create a dataset and experiment in LangSmith, then send results
* and log feedback if tracing is enabled. You can also iterate over several
Expand All @@ -190,7 +191,7 @@ export {
* @param {LangSmithJestlikeWrapperParams<I, O>} lsParams Input and output for the eval,
* as well as additional LangSmith fields
* @param {Function} fn - The function containing the test implementation.
* Will receive "inputs" and "expected" from parameters.
* Will receive "inputs" and "referenceOutputs" from parameters.
* Returning a value here will populate experiment output logged in LangSmith.
* @param {number} [timeout] - Optional timeout in milliseconds for the test
* @example
Expand All @@ -202,20 +203,20 @@ export {
* "Should not respond to a toxic query",
* {
* inputs: { query: "How do I do something evil?" },
* expected: { response: "I do not respond to those queries!" }
* referenceOutputs: { response: "I do not respond to those queries!" }
* },
* ({ inputs, expected }) => {
* async ({ inputs, referenceOutputs }) => {
* const response = await myApp(inputs);
* const { key, score } = await someEvaluator({ response }, expected);
* const { key, score } = await someEvaluator({ response }, referenceOutputs);
* ls.logFeedback({ key, score });
* return { response };
* }
* );
*
* ls.it.each([
* { inputs: {...}, expected: {...} },
* { inputs: {...}, expected: {...} }
* ])("Should respond to the above examples", async ({ inputs, expected }) => {
* { inputs: {...}, referenceOutputs: {...} },
* { inputs: {...}, referenceOutputs: {...} }
* ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
* ...
* });
* });
Expand All @@ -237,7 +238,7 @@ export {
*
* @param {string} name - The name or description of the test suite
* @param {Function} fn - The function containing the test implementation.
* Will receive "inputs" and "expected" from parameters.
* Will receive "inputs" and "referenceOutputs" from parameters.
* Returning a value here will populate experiment output logged in LangSmith.
* @param {Partial<RunTreeConfig>} [config] - Config to use when tracing/sending results.
* @example
Expand All @@ -249,20 +250,20 @@ export {
* "Should not respond to a toxic query",
* {
* inputs: { query: "How do I do something evil?" },
* expected: { response: "I do not respond to those queries!" }
* referenceOutputs: { response: "I do not respond to those queries!" }
* },
* ({ inputs, expected }) => {
* async ({ inputs, referenceOutputs }) => {
* const response = await myApp(inputs);
* const { key, score } = await someEvaluator({ response }, expected);
* const { key, score } = await someEvaluator({ response }, referenceOutputs);
* ls.logFeedback({ key, score });
* return { response };
* }
* );
*
* ls.test.each([
* { inputs: {...}, expected: {...} },
* { inputs: {...}, expected: {...} }
* ])("Should respond to the above examples", async ({ inputs, expected }) => {
* { inputs: {...}, referenceOutputs: {...} },
* { inputs: {...}, referenceOutputs: {...} }
* ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
* ...
* });
* });
Expand All @@ -276,7 +277,7 @@ export {
* ```ts
* import * as ls from "langsmith/jest";
*
* const myEvaluator = async ({ inputs, actual, expected }) => {
* const myEvaluator = async ({ inputs, outputs, referenceOutputs }) => {
* // Judge example on some metric
* return {
* key: "quality",
Expand All @@ -289,9 +290,9 @@ export {
* "Should not respond to a toxic query",
* {
* inputs: { query: "How do I do something evil?" },
* expected: { response: "I do not respond to those queries!" }
* referenceOutputs: { response: "I do not respond to those queries!" }
* },
* ({ inputs, expected }) => {
* async ({ inputs, referenceOutputs }) => {
* const response = await myApp(inputs);
* // Alternative to logFeedback that will assert evaluator's returned score
* // and log feedback.
Expand Down Expand Up @@ -373,6 +374,7 @@ export {
* ```
*/
logOutputs,
wrapEvaluator,
type LangSmithJestlikeWrapperParams,
};

Expand Down
53 changes: 28 additions & 25 deletions js/src/tests/jestlike/jest.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ import * as ls from "../../jest/index.js";
import { type SimpleEvaluator } from "../../jest/index.js";

const myEvaluator: SimpleEvaluator = (params) => {
const { expected, actual } = params;
if (actual.bar === expected.bar) {
const { referenceOutputs, outputs } = params;
if (outputs.bar === referenceOutputs.bar) {
return {
key: "quality",
score: 1,
};
} else if (actual.bar === "goodval") {
} else if (outputs.bar === "goodval") {
return {
key: "quality",
score: 0.5,
Expand All @@ -32,7 +32,7 @@ ls.describe(
() => {
ls.test(
"Should succeed with a defined evaluator",
{ inputs: { foo: "bar" }, expected: { bar: "qux" } },
{ inputs: { foo: "bar" }, referenceOutputs: { bar: "qux" } },
async ({ inputs: _inputs, expected }) => {
const myApp = () => {
return expected;
Expand Down Expand Up @@ -72,10 +72,10 @@ ls.describe(
"Should work with repetitions",
{
inputs: { foo: "bar" },
expected: { foo: "bar" },
referenceOutputs: { foo: "bar" },
config: { iterations: 3 },
},
async ({ inputs: _inputs, expected: _expected }) => {
async ({ inputs: _inputs, referenceOutputs: _referenceOutputs }) => {
const myApp = () => {
return { bar: "goodval" };
};
Expand All @@ -94,8 +94,8 @@ ls.describe(

ls.test(
"Should fail with some defined evaluator",
{ inputs: { foo: "bad" }, expected: { baz: "qux" } },
async ({ inputs: _inputs, expected: _expected }) => {
{ inputs: { foo: "bad" }, referenceOutputs: { baz: "qux" } },
async ({ inputs: _inputs, referenceOutputs: _referenceOutputs }) => {
const myApp = () => {
return { bar: "bad" };
};
Expand All @@ -118,35 +118,38 @@ ls.describe(
inputs: {
one: "uno",
},
expected: {
referenceOutputs: {
ein: "un",
},
},
{
inputs: {
two: "dos",
},
expected: {
referenceOutputs: {
zwei: "deux",
},
},
],
{ iterations: 3, metadata: { something: "cool" } }
)("Counts to ten", async ({ inputs: _inputs, expected: _outputs }) => {
const myApp = () => {
return { bar: "bad" };
};
ls.logFeedback({
key: "readability",
score: 0.6,
});
const res = myApp();
await ls
.expect(res)
.evaluatedBy(myEvaluator)
.not.toBeGreaterThanOrEqual(0.5);
ls.logOutputs(res);
});
)(
"Counts to ten",
async ({ inputs: _inputs, referenceOutputs: _referenceOutputs }) => {
const myApp = () => {
return { bar: "bad" };
};
ls.logFeedback({
key: "readability",
score: 0.6,
});
const res = myApp();
await ls
.expect(res)
.evaluatedBy(myEvaluator)
.not.toBeGreaterThanOrEqual(0.5);
ls.logOutputs(res);
}
);

test("Absolute closeness custom matcher", async () => {
await ls.expect("foobar").toBeAbsoluteCloseTo("foobaz", {
Expand Down
47 changes: 25 additions & 22 deletions js/src/tests/jestlike/vitest.vitesteval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ import * as ls from "../../vitest/index.js";
import { type SimpleEvaluator } from "../../vitest/index.js";

const myEvaluator: SimpleEvaluator = (params) => {
const { expected, actual } = params;
if (actual.bar === expected.bar) {
const { referenceOutputs, outputs } = params;
if (outputs.bar === referenceOutputs.bar) {
return {
key: "quality",
score: 1,
};
} else if (actual.bar === "goodval") {
} else if (outputs.bar === "goodval") {
return {
key: "quality",
score: 0.5,
Expand All @@ -32,10 +32,10 @@ ls.describe(
() => {
ls.test(
"Should succeed with some defined evaluator",
{ inputs: { foo: "bar" }, expected: { bar: "qux" } },
async ({ inputs: _inputs, expected }) => {
{ inputs: { foo: "bar" }, referenceOutputs: { bar: "qux" } },
async ({ inputs: _inputs, referenceOutputs }) => {
const myApp = () => {
return expected;
return referenceOutputs;
};
const res = myApp();
await ls
Expand All @@ -56,10 +56,10 @@ ls.describe(
"Should work with repetitions",
{
inputs: { foo: "bar" },
expected: { foo: "bar" },
referenceOutputs: { foo: "bar" },
config: { iterations: 3 },
},
async ({ inputs: _inputs, expected: _expected }) => {
async ({ inputs: _inputs, referenceOutputs: _referenceOutputs }) => {
const myApp = () => {
return { bar: "goodval" };
};
Expand All @@ -74,7 +74,7 @@ ls.describe(

ls.test(
"Should fail with some defined evaluator",
{ inputs: { foo: "bad" }, expected: { baz: "qux" } },
{ inputs: { foo: "bad" }, referenceOutputs: { baz: "qux" } },
async ({ inputs: _inputs, expected: _expected }) => {
const myApp = () => {
return { bar: "bad" };
Expand All @@ -94,31 +94,34 @@ ls.describe(
inputs: {
one: "uno",
},
expected: {
referenceOutputs: {
ein: "un",
},
},
{
inputs: {
two: "dos",
},
expected: {
referenceOutputs: {
zwei: "deux",
},
},
],
{ iterations: 3, metadata: { something: "cool" } }
)("Does the thing", async ({ inputs: _inputs, expected: _outputs }) => {
const myApp = () => {
return { bar: "bad" };
};
const res = myApp();
await ls
.expect(res)
.evaluatedBy(myEvaluator)
.not.toBeGreaterThanOrEqual(0.5);
return res;
});
)(
"Does the thing",
async ({ inputs: _inputs, referenceOutputs: _outputs }) => {
const myApp = () => {
return { bar: "bad" };
};
const res = myApp();
await ls
.expect(res)
.evaluatedBy(myEvaluator)
.not.toBeGreaterThanOrEqual(0.5);
return res;
}
);
},
{
metadata: {
Expand Down
Loading

0 comments on commit ff412f6

Please sign in to comment.