Rename expected -> referenceOutputs and actual -> outputs

langchain-ai · Jan 17, 2025 · ff412f6 · ff412f6
1 parent b615035
commit ff412f6
Show file tree

Hide file tree

Showing 9 changed files with 238 additions and 254 deletions.
diff --git a/js/src/jest/index.ts b/js/src/jest/index.ts
@@ -17,6 +17,7 @@ import {
   type RelativeCloseToMatcherOptions,
 } from "../utils/jestlike/matchers.js";
 import type { SimpleEvaluator } from "../utils/jestlike/vendor/evaluatedBy.js";
+import { wrapEvaluator } from "../utils/jestlike/vendor/evaluatedBy.js";
 import { logFeedback, logOutputs } from "../utils/jestlike/index.js";
 import { generateWrapperFromJestlikeMethods } from "../utils/jestlike/index.js";
 import type { LangSmithJestlikeWrapperParams } from "../utils/jestlike/types.js";
@@ -57,7 +58,7 @@ declare global {
         options?: SemanticCloseToMatcherOptions
       ): Promise<R>;
       /**
-       * Matcher that runs an evaluator with actual and expected output from some run,
+       * Matcher that runs an evaluator with actual outputs and referenceOutputs from some run,
        * and asserts the evaluator's output `score` based on subsequent matchers.
        * Will also log feedback to LangSmith and to test results.
        *
@@ -67,7 +68,7 @@ declare global {
        * ```ts
        * import * as ls from "langsmith/jest";
        *
-       * const myEvaluator = async ({ inputs, actual, expected }) => {
+       * const myEvaluator = async ({ inputs, outputs, referenceOutputs }) => {
        *   // Judge example on some metric
        *   return {
        *     key: "quality",
@@ -80,9 +81,9 @@ declare global {
        *     "Should not respond to a toxic query",
        *     {
        *       inputs: { query: "How do I do something evil?" },
-       *       expected: { response: "I do not respond to those queries!" }
+       *       referenceOutputs: { response: "I do not respond to those queries!" }
        *     },
-       *     ({ inputs, expected }) => {
+       *     async ({ inputs, referenceOutputs }) => {
        *       const response = await myApp(inputs);
        *       await ls.expect(response).evaluatedBy(myEvaluator).toBeGreaterThan(0.5);
        *       return { response };
@@ -114,7 +115,7 @@ const { test, it, describe, expect } = generateWrapperFromJestlikeMethods(
 export {
   /**
    * Defines a LangSmith test case within a suite. Takes an additional `lsParams`
-   * arg containing example inputs and expected outputs for your evaluated app.
+   * arg containing example inputs and reference outputs for your evaluated app.
    *
    * When run, will create a dataset and experiment in LangSmith, then send results
    * and log feedback if tracing is enabled. You can also iterate over several
@@ -134,7 +135,7 @@ export {
    * @param {LangSmithJestlikeWrapperParams<I, O>} lsParams Input and output for the eval,
    *   as well as additional LangSmith fields
    * @param {Function} fn - The function containing the test implementation.
-   *   Will receive "inputs" and "expected" from parameters.
+   *   Will receive "inputs" and "referenceOutputs" from parameters.
    *   Returning a value here will populate experiment output logged in LangSmith.
    * @param {number} [timeout] - Optional timeout in milliseconds for the test
    * @example
@@ -146,20 +147,20 @@ export {
    *     "Should not respond to a toxic query",
    *     {
    *       inputs: { query: "How do I do something evil?" },
-   *       expected: { response: "I do not respond to those queries!" }
+   *       referenceOutputs: { response: "I do not respond to those queries!" }
    *     },
-   *     ({ inputs, expected }) => {
+   *     async ({ inputs, referenceOutputs }) => {
    *       const response = await myApp(inputs);
-   *       const { key, score } = await someEvaluator({ response }, expected);
+   *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
    *       ls.logFeedback({ key, score });
    *       return { response };
    *     }
    *   );
    *
    *   ls.test.each([
-   *     { inputs: {...}, expected: {...} },
-   *     { inputs: {...}, expected: {...} }
-   *   ])("Should respond to the above examples", async ({ inputs, expected }) => {
+   *     { inputs: {...}, referenceOutputs: {...} },
+   *     { inputs: {...}, referenceOutputs: {...} }
+   *   ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
    *     ...
    *   });
    * });
@@ -170,7 +171,7 @@ export {
    * Alias of `ls.test()`.
    *
    * Defines a LangSmith test case within a suite. Takes an additional `lsParams`
-   * arg containing example inputs and expected outputs for your evaluated app.
+   * arg containing example inputs and reference outputs for your evaluated app.
    *
    * When run, will create a dataset and experiment in LangSmith, then send results
    * and log feedback if tracing is enabled. You can also iterate over several
@@ -190,7 +191,7 @@ export {
    * @param {LangSmithJestlikeWrapperParams<I, O>} lsParams Input and output for the eval,
    *   as well as additional LangSmith fields
    * @param {Function} fn - The function containing the test implementation.
-   *   Will receive "inputs" and "expected" from parameters.
+   *   Will receive "inputs" and "referenceOutputs" from parameters.
    *   Returning a value here will populate experiment output logged in LangSmith.
    * @param {number} [timeout] - Optional timeout in milliseconds for the test
    * @example
@@ -202,20 +203,20 @@ export {
    *     "Should not respond to a toxic query",
    *     {
    *       inputs: { query: "How do I do something evil?" },
-   *       expected: { response: "I do not respond to those queries!" }
+   *       referenceOutputs: { response: "I do not respond to those queries!" }
    *     },
-   *     ({ inputs, expected }) => {
+   *     async ({ inputs, referenceOutputs }) => {
    *       const response = await myApp(inputs);
-   *       const { key, score } = await someEvaluator({ response }, expected);
+   *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
    *       ls.logFeedback({ key, score });
    *       return { response };
    *     }
    *   );
    *
    *   ls.it.each([
-   *     { inputs: {...}, expected: {...} },
-   *     { inputs: {...}, expected: {...} }
-   *   ])("Should respond to the above examples", async ({ inputs, expected }) => {
+   *     { inputs: {...}, referenceOutputs: {...} },
+   *     { inputs: {...}, referenceOutputs: {...} }
+   *   ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
    *     ...
    *   });
    * });
@@ -237,7 +238,7 @@ export {
    *
    * @param {string} name - The name or description of the test suite
    * @param {Function} fn - The function containing the test implementation.
-   *   Will receive "inputs" and "expected" from parameters.
+   *   Will receive "inputs" and "referenceOutputs" from parameters.
    *   Returning a value here will populate experiment output logged in LangSmith.
    * @param {Partial<RunTreeConfig>} [config] - Config to use when tracing/sending results.
    * @example
@@ -249,20 +250,20 @@ export {
    *     "Should not respond to a toxic query",
    *     {
    *       inputs: { query: "How do I do something evil?" },
-   *       expected: { response: "I do not respond to those queries!" }
+   *       referenceOutputs: { response: "I do not respond to those queries!" }
    *     },
-   *     ({ inputs, expected }) => {
+   *     async ({ inputs, referenceOutputs }) => {
    *       const response = await myApp(inputs);
-   *       const { key, score } = await someEvaluator({ response }, expected);
+   *       const { key, score } = await someEvaluator({ response }, referenceOutputs);
    *       ls.logFeedback({ key, score });
    *       return { response };
    *     }
    *   );
    *
    *   ls.test.each([
-   *     { inputs: {...}, expected: {...} },
-   *     { inputs: {...}, expected: {...} }
-   *   ])("Should respond to the above examples", async ({ inputs, expected }) => {
+   *     { inputs: {...}, referenceOutputs: {...} },
+   *     { inputs: {...}, referenceOutputs: {...} }
+   *   ])("Should respond to the above examples", async ({ inputs, referenceOutputs }) => {
    *     ...
    *   });
    * });
@@ -276,7 +277,7 @@ export {
    * ```ts
    * import * as ls from "langsmith/jest";
    *
-   * const myEvaluator = async ({ inputs, actual, expected }) => {
+   * const myEvaluator = async ({ inputs, outputs, referenceOutputs }) => {
    *   // Judge example on some metric
    *   return {
    *     key: "quality",
@@ -289,9 +290,9 @@ export {
    *     "Should not respond to a toxic query",
    *     {
    *       inputs: { query: "How do I do something evil?" },
-   *       expected: { response: "I do not respond to those queries!" }
+   *       referenceOutputs: { response: "I do not respond to those queries!" }
    *     },
-   *     ({ inputs, expected }) => {
+   *     async ({ inputs, referenceOutputs }) => {
    *       const response = await myApp(inputs);
    *       // Alternative to logFeedback that will assert evaluator's returned score
    *       // and log feedback.
@@ -373,6 +374,7 @@ export {
    * ```
    */
   logOutputs,
+  wrapEvaluator,
   type LangSmithJestlikeWrapperParams,
 };
 

diff --git a/js/src/tests/jestlike/jest.test.ts b/js/src/tests/jestlike/jest.test.ts
@@ -5,13 +5,13 @@ import * as ls from "../../jest/index.js";
 import { type SimpleEvaluator } from "../../jest/index.js";
 
 const myEvaluator: SimpleEvaluator = (params) => {
-  const { expected, actual } = params;
-  if (actual.bar === expected.bar) {
+  const { referenceOutputs, outputs } = params;
+  if (outputs.bar === referenceOutputs.bar) {
     return {
       key: "quality",
       score: 1,
     };
-  } else if (actual.bar === "goodval") {
+  } else if (outputs.bar === "goodval") {
     return {
       key: "quality",
       score: 0.5,
@@ -32,7 +32,7 @@ ls.describe(
   () => {
     ls.test(
       "Should succeed with a defined evaluator",
-      { inputs: { foo: "bar" }, expected: { bar: "qux" } },
+      { inputs: { foo: "bar" }, referenceOutputs: { bar: "qux" } },
       async ({ inputs: _inputs, expected }) => {
         const myApp = () => {
           return expected;
@@ -72,10 +72,10 @@ ls.describe(
       "Should work with repetitions",
       {
         inputs: { foo: "bar" },
-        expected: { foo: "bar" },
+        referenceOutputs: { foo: "bar" },
         config: { iterations: 3 },
       },
-      async ({ inputs: _inputs, expected: _expected }) => {
+      async ({ inputs: _inputs, referenceOutputs: _referenceOutputs }) => {
         const myApp = () => {
           return { bar: "goodval" };
         };
@@ -94,8 +94,8 @@ ls.describe(
 
     ls.test(
       "Should fail with some defined evaluator",
-      { inputs: { foo: "bad" }, expected: { baz: "qux" } },
-      async ({ inputs: _inputs, expected: _expected }) => {
+      { inputs: { foo: "bad" }, referenceOutputs: { baz: "qux" } },
+      async ({ inputs: _inputs, referenceOutputs: _referenceOutputs }) => {
         const myApp = () => {
           return { bar: "bad" };
         };
@@ -118,35 +118,38 @@ ls.describe(
           inputs: {
             one: "uno",
           },
-          expected: {
+          referenceOutputs: {
             ein: "un",
           },
         },
         {
           inputs: {
             two: "dos",
           },
-          expected: {
+          referenceOutputs: {
             zwei: "deux",
           },
         },
       ],
       { iterations: 3, metadata: { something: "cool" } }
-    )("Counts to ten", async ({ inputs: _inputs, expected: _outputs }) => {
-      const myApp = () => {
-        return { bar: "bad" };
-      };
-      ls.logFeedback({
-        key: "readability",
-        score: 0.6,
-      });
-      const res = myApp();
-      await ls
-        .expect(res)
-        .evaluatedBy(myEvaluator)
-        .not.toBeGreaterThanOrEqual(0.5);
-      ls.logOutputs(res);
-    });
+    )(
+      "Counts to ten",
+      async ({ inputs: _inputs, referenceOutputs: _referenceOutputs }) => {
+        const myApp = () => {
+          return { bar: "bad" };
+        };
+        ls.logFeedback({
+          key: "readability",
+          score: 0.6,
+        });
+        const res = myApp();
+        await ls
+          .expect(res)
+          .evaluatedBy(myEvaluator)
+          .not.toBeGreaterThanOrEqual(0.5);
+        ls.logOutputs(res);
+      }
+    );
 
     test("Absolute closeness custom matcher", async () => {
       await ls.expect("foobar").toBeAbsoluteCloseTo("foobaz", {

diff --git a/js/src/tests/jestlike/vitest.vitesteval.ts b/js/src/tests/jestlike/vitest.vitesteval.ts
@@ -5,13 +5,13 @@ import * as ls from "../../vitest/index.js";
 import { type SimpleEvaluator } from "../../vitest/index.js";
 
 const myEvaluator: SimpleEvaluator = (params) => {
-  const { expected, actual } = params;
-  if (actual.bar === expected.bar) {
+  const { referenceOutputs, outputs } = params;
+  if (outputs.bar === referenceOutputs.bar) {
     return {
       key: "quality",
       score: 1,
     };
-  } else if (actual.bar === "goodval") {
+  } else if (outputs.bar === "goodval") {
     return {
       key: "quality",
       score: 0.5,
@@ -32,10 +32,10 @@ ls.describe(
   () => {
     ls.test(
       "Should succeed with some defined evaluator",
-      { inputs: { foo: "bar" }, expected: { bar: "qux" } },
-      async ({ inputs: _inputs, expected }) => {
+      { inputs: { foo: "bar" }, referenceOutputs: { bar: "qux" } },
+      async ({ inputs: _inputs, referenceOutputs }) => {
         const myApp = () => {
-          return expected;
+          return referenceOutputs;
         };
         const res = myApp();
         await ls
@@ -56,10 +56,10 @@ ls.describe(
       "Should work with repetitions",
       {
         inputs: { foo: "bar" },
-        expected: { foo: "bar" },
+        referenceOutputs: { foo: "bar" },
         config: { iterations: 3 },
       },
-      async ({ inputs: _inputs, expected: _expected }) => {
+      async ({ inputs: _inputs, referenceOutputs: _referenceOutputs }) => {
         const myApp = () => {
           return { bar: "goodval" };
         };
@@ -74,7 +74,7 @@ ls.describe(
 
     ls.test(
       "Should fail with some defined evaluator",
-      { inputs: { foo: "bad" }, expected: { baz: "qux" } },
+      { inputs: { foo: "bad" }, referenceOutputs: { baz: "qux" } },
       async ({ inputs: _inputs, expected: _expected }) => {
         const myApp = () => {
           return { bar: "bad" };
@@ -94,31 +94,34 @@ ls.describe(
           inputs: {
             one: "uno",
           },
-          expected: {
+          referenceOutputs: {
             ein: "un",
           },
         },
         {
           inputs: {
             two: "dos",
           },
-          expected: {
+          referenceOutputs: {
             zwei: "deux",
           },
         },
       ],
       { iterations: 3, metadata: { something: "cool" } }
-    )("Does the thing", async ({ inputs: _inputs, expected: _outputs }) => {
-      const myApp = () => {
-        return { bar: "bad" };
-      };
-      const res = myApp();
-      await ls
-        .expect(res)
-        .evaluatedBy(myEvaluator)
-        .not.toBeGreaterThanOrEqual(0.5);
-      return res;
-    });
+    )(
+      "Does the thing",
+      async ({ inputs: _inputs, referenceOutputs: _outputs }) => {
+        const myApp = () => {
+          return { bar: "bad" };
+        };
+        const res = myApp();
+        await ls
+          .expect(res)
+          .evaluatedBy(myEvaluator)
+          .not.toBeGreaterThanOrEqual(0.5);
+        return res;
+      }
+    );
   },
   {
     metadata: {