Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add inline evaluation results for summary evaluators #1348

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 3 additions & 5 deletions js/src/client.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
RawExample,
AttachmentInfo,
AttachmentData,
EvaluationResult,
EvaluationResults,
} from "./schemas.js";
import {
convertLangChainMessageToExample,
Expand All @@ -54,11 +56,7 @@
getRuntimeEnvironment,
} from "./utils/env.js";

import {
EvaluationResult,
EvaluationResults,
RunEvaluator,
} from "./evaluation/evaluator.js";
import { RunEvaluator } from "./evaluation/evaluator.js";
import { __version__ } from "./index.js";
import { assertUuid } from "./utils/_uuid.js";
import { warnOnce } from "./utils/warn.js";
Expand Down Expand Up @@ -424,7 +422,7 @@
// If there is an item on the queue we were unable to pop,
// just return it as a single batch.
if (popped.length === 0 && this.items.length > 0) {
const item = this.items.shift()!;

Check warning on line 425 in js/src/client.ts

View workflow job for this annotation

GitHub Actions / Check linting

Forbidden non-null assertion
popped.push(item);
poppedSizeBytes += item.size;
this.sizeBytes -= item.size;
Expand Down Expand Up @@ -847,7 +845,7 @@
if (this._serverInfo === undefined) {
try {
this._serverInfo = await this._getServerInfo();
} catch (e) {

Check warning on line 848 in js/src/client.ts

View workflow job for this annotation

GitHub Actions / Check linting

'e' is defined but never used. Allowed unused args must match /^_/u
console.warn(
`[WARNING]: LangSmith failed to fetch info on supported operations. Falling back to batch operations and default limits.`
);
Expand Down Expand Up @@ -1573,7 +1571,7 @@
treeFilter?: string;
isRoot?: boolean;
dataSourceType?: string;
}): Promise<any> {

Check warning on line 1574 in js/src/client.ts

View workflow job for this annotation

GitHub Actions / Check linting

Unexpected any. Specify a different type
let projectIds_ = projectIds || [];
if (projectNames) {
projectIds_ = [
Expand Down Expand Up @@ -1861,7 +1859,7 @@
`Failed to list shared examples: ${response.status} ${response.statusText}`
);
}
return result.map((example: any) => ({

Check warning on line 1862 in js/src/client.ts

View workflow job for this annotation

GitHub Actions / Check linting

Unexpected any. Specify a different type
...example,
_hostUrl: this.getHostUrl(),
}));
Expand Down Expand Up @@ -1998,7 +1996,7 @@
}
// projectId querying
return true;
} catch (e) {

Check warning on line 1999 in js/src/client.ts

View workflow job for this annotation

GitHub Actions / Check linting

'e' is defined but never used. Allowed unused args must match /^_/u
return false;
}
}
Expand Down Expand Up @@ -3373,7 +3371,7 @@
async _logEvaluationFeedback(
evaluatorResponse: EvaluationResult | EvaluationResults,
run?: Run,
sourceInfo?: { [key: string]: any }

Check warning on line 3374 in js/src/client.ts

View workflow job for this annotation

GitHub Actions / Check linting

Unexpected any. Specify a different type
): Promise<[results: EvaluationResult[], feedbacks: Feedback[]]> {
const evalResults: Array<EvaluationResult> =
this._selectEvalResults(evaluatorResponse);
Expand Down Expand Up @@ -3412,7 +3410,7 @@
public async logEvaluationFeedback(
evaluatorResponse: EvaluationResult | EvaluationResults,
run?: Run,
sourceInfo?: { [key: string]: any }

Check warning on line 3413 in js/src/client.ts

View workflow job for this annotation

GitHub Actions / Check linting

Unexpected any. Specify a different type
): Promise<EvaluationResult[]> {
const [results] = await this._logEvaluationFeedback(
evaluatorResponse,
Expand Down Expand Up @@ -3862,7 +3860,7 @@

public async createCommit(
promptIdentifier: string,
object: any,

Check warning on line 3863 in js/src/client.ts

View workflow job for this annotation

GitHub Actions / Check linting

Unexpected any. Specify a different type
options?: {
parentCommitHash?: string;
}
Expand Down Expand Up @@ -4094,7 +4092,7 @@
isPublic?: boolean;
isArchived?: boolean;
}
): Promise<Record<string, any>> {

Check warning on line 4095 in js/src/client.ts

View workflow job for this annotation

GitHub Actions / Check linting

Unexpected any. Specify a different type
if (!(await this.promptExists(promptIdentifier))) {
throw new Error("Prompt does not exist, you must create it first.");
}
Expand All @@ -4105,7 +4103,7 @@
throw await this._ownerConflictError("update a prompt", owner);
}

const payload: Record<string, any> = {};

Check warning on line 4106 in js/src/client.ts

View workflow job for this annotation

GitHub Actions / Check linting

Unexpected any. Specify a different type

if (options?.description !== undefined)
payload.description = options.description;
Expand Down
92 changes: 65 additions & 27 deletions js/src/evaluation/_runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ import { Client, RunTree, RunTreeConfig } from "../index.js";
import {
AttachmentInfo,
BaseRun,
EvaluationResult,
EvaluationResults,
Example,
ExperimentResultRow,
KVMap,
Run,
TracerSession,
Expand All @@ -15,12 +18,7 @@ import { atee } from "../utils/atee.js";
import { getLangChainEnvVarsMetadata } from "../utils/env.js";
import { printErrorStackTrace } from "../utils/error.js";
import { randomName } from "./_random_name.js";
import {
EvaluationResult,
EvaluationResults,
RunEvaluator,
runEvaluator,
} from "./evaluator.js";
import { RunEvaluator, runEvaluator } from "./evaluator.js";
import { LangSmithConflictError } from "../utils/error.js";
import { v4 as uuidv4 } from "uuid";
import {
Expand Down Expand Up @@ -64,6 +62,18 @@ type DeprecatedAsyncSummaryEvaluator = (
examples: Array<Example>
) => Promise<EvaluationResult | EvaluationResults>;

/**
 * Synchronous summary evaluator using positional arguments: the runs,
 * the examples, and the per-row experiment results. Returns a single
 * evaluation result or a batch of results.
 */
type SyncSummaryEvaluator = (
runs: Array<Run>,
examples: Array<Example>,
evaluationResults: Array<ExperimentResultRow>
) => EvaluationResult | EvaluationResults;

type AsyncSummaryEvaluator = (
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this different from SummaryEvaluatorT?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the type we wrap the evaluator into internally; I do not believe it is user-facing.

runs: Array<Run>,
examples: Array<Example>,
evaluationResults: Array<ExperimentResultRow>
) => Promise<EvaluationResult | EvaluationResults>;

// Summary evaluator runs over the whole dataset
export type SummaryEvaluatorT =
| DeprecatedSyncSummaryEvaluator
Expand All @@ -74,13 +84,15 @@ export type SummaryEvaluatorT =
inputs: Array<Record<string, any>>;
outputs: Array<Record<string, any>>;
referenceOutputs?: Array<Record<string, any>>;
evaluationResults?: Array<ExperimentResultRow>;
}) => EvaluationResult | EvaluationResults)
| ((args: {
runs: Array<Run>;
examples: Array<Example>;
inputs: Array<Record<string, any>>;
outputs: Array<Record<string, any>>;
referenceOutputs?: Array<Record<string, any>>;
evaluationResults?: Array<ExperimentResultRow>;
}) => Promise<EvaluationResult | EvaluationResults>);

/** @deprecated Use object parameter version instead: (args: { run, example, inputs, outputs, referenceOutputs }) => ... */
Expand Down Expand Up @@ -133,13 +145,17 @@ interface _ExperimentManagerArgs {
runs?: AsyncGenerator<Run>;
evaluationResults?: AsyncGenerator<EvaluationResults>;
summaryResults?: AsyncGenerator<
(runsArray: Run[]) => AsyncGenerator<EvaluationResults, any, unknown>,
(
Copy link
Collaborator

@jacoblee93 jacoblee93 Dec 21, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This typing is odd - isn't it a breaking change, since callers now have to specify evaluationResults?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah yeah - can I just make evaluationResults an optional param? Or is there a better solution to make this non-breaking?

runsArray: Run[],
evaluationResults: ExperimentResultRow[]
) => AsyncGenerator<EvaluationResults, any, unknown>,
any,
unknown
>;
examples?: Example[];
numRepetitions?: number;
_runsArray?: Run[];
_evaluationResultsArray?: ExperimentResultRow[];
includeAttachments?: boolean;
}

Expand Down Expand Up @@ -235,12 +251,6 @@ export function evaluate(
return _evaluate(target, options);
}

/**
 * A single row of experiment output, bundling a run, an example, and
 * the evaluation results recorded alongside them.
 */
export interface ExperimentResultRow {
run: Run;
example: Example;
evaluationResults: EvaluationResults;
}

/**
* Manage the execution of experiments.
*
Expand All @@ -255,7 +265,10 @@ export class _ExperimentManager {
_evaluationResults?: AsyncGenerator<EvaluationResults>;

_summaryResults?: AsyncGenerator<
(runsArray: Run[]) => AsyncGenerator<EvaluationResults, any, unknown>,
(
runsArray: Run[],
evaluationResults: ExperimentResultRow[]
) => AsyncGenerator<EvaluationResults, any, unknown>,
any,
unknown
>;
Expand All @@ -266,6 +279,8 @@ export class _ExperimentManager {

_runsArray?: Run[];

_evaluationResultsArray?: ExperimentResultRow[];

client: Client;

_experiment?: TracerSession;
Expand Down Expand Up @@ -558,6 +573,7 @@ export class _ExperimentManager {
client: this.client,
runs: this.runs,
_runsArray: this._runsArray,
_evaluationResultsArray: this._evaluationResultsArray,
evaluationResults: this._evaluationResults,
summaryResults: aggregateFeedbackGen,
includeAttachments: this._includeAttachments,
Expand All @@ -578,7 +594,15 @@ export class _ExperimentManager {
for await (const evaluationResult of this.evaluationResults) {
evaluationResults.push(evaluationResult);
}
if (!this._evaluationResultsArray) {
this._evaluationResultsArray = [];
}
for (let i = 0; i < this._runsArray.length; i++) {
this._evaluationResultsArray.push({
run: this._runsArray[i],
example: examples[i],
evaluationResults: evaluationResults[i],
});
yield {
run: this._runsArray[i],
example: examples[i],
Expand All @@ -598,7 +622,8 @@ export class _ExperimentManager {
// This is because runs array is not available until after this generator
// is set, so we need to pass it like so.
for await (const evaluationResults of evaluationResultsGenerator(
this._runsArray ?? []
this._runsArray ?? [],
this._evaluationResultsArray ?? []
)) {
results.push(...evaluationResults.results);
}
Expand Down Expand Up @@ -752,7 +777,12 @@ export class _ExperimentManager {

async *_applySummaryEvaluators(
summaryEvaluators: Array<SummaryEvaluatorT>
): AsyncGenerator<(runsArray: Run[]) => AsyncGenerator<EvaluationResults>> {
): AsyncGenerator<
(
runsArray: Run[],
evaluationResults: ExperimentResultRow[]
) => AsyncGenerator<EvaluationResults>
> {
const projectId = this._getExperiment().id;
const examples = await this.getExamples();

Expand All @@ -770,13 +800,18 @@ export class _ExperimentManager {

yield async function* (
this: _ExperimentManager,
runsArray: Run[]
runsArray: Run[],
evaluationResults: ExperimentResultRow[]
): AsyncGenerator<EvaluationResults> {
const aggregateFeedback = [];

for (const evaluator of wrappedEvaluators) {
try {
const summaryEvalResult = await evaluator(runsArray, examples);
const summaryEvalResult = await evaluator(
runsArray,
examples,
evaluationResults
);

const flattenedResults =
this.client._selectEvalResults(summaryEvalResult);
Expand Down Expand Up @@ -1114,17 +1149,16 @@ function _resolveData(
async function wrapSummaryEvaluators(
evaluators: SummaryEvaluatorT[],
optionsArray?: Partial<RunTreeConfig>[]
): Promise<
Array<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator>
> {
): Promise<Array<AsyncSummaryEvaluator | SyncSummaryEvaluator>> {
async function _wrap(
evaluator: SummaryEvaluatorT
): Promise<DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator> {
): Promise<AsyncSummaryEvaluator | SyncSummaryEvaluator> {
const evalName = evaluator.name || "BatchEvaluator";

const wrapperInner = (
runs: Run[],
examples: Example[]
examples: Example[],
evaluationResults: ExperimentResultRow[]
): Promise<EvaluationResult | EvaluationResults> => {
const wrapperSuperInner = traceable(
(
Expand All @@ -1145,19 +1179,25 @@ async function wrapSummaryEvaluators(
inputs: Record<string, any>[];
outputs: Record<string, any>[];
referenceOutputs?: Record<string, any>[];
evaluationResults?: ExperimentResultRow[];
}) => EvaluationResult | EvaluationResults
)({
runs,
examples,
inputs,
outputs,
referenceOutputs,
evaluationResults,
})
);
}
// Otherwise use the traditional positional (runs, examples, evaluationResults) signature
return Promise.resolve(
(evaluator as DeprecatedSyncSummaryEvaluator)(runs, examples)
(evaluator as SyncSummaryEvaluator)(
runs,
examples,
evaluationResults
)
);
},
{ ...optionsArray, name: evalName }
Expand All @@ -1174,9 +1214,7 @@ async function wrapSummaryEvaluators(
return wrapperInner;
}

const results: Array<
DeprecatedAsyncSummaryEvaluator | DeprecatedSyncSummaryEvaluator
> = [];
const results: Array<AsyncSummaryEvaluator | SyncSummaryEvaluator> = [];
for (let i = 0; i < evaluators.length; i++) {
results.push(await _wrap(evaluators[i]));
}
Expand Down
78 changes: 2 additions & 76 deletions js/src/evaluation/evaluator.ts
Original file line number Diff line number Diff line change
@@ -1,87 +1,13 @@
import {
EvaluationResult,
EvaluationResults,
Example,
FeedbackConfig,
Run,
ScoreType,
ValueType,
} from "../schemas.js";
import { v4 as uuidv4 } from "uuid";
import { TraceableFunction, traceable } from "../traceable.js";
import { RunTreeConfig } from "../run_trees.js";

/**
 * Represents a categorical class, identified by a required label and
 * an optional numeric value.
 */
export type Category = {
/**
 * The value of the category.
 */
value?: number;
/**
 * The label of the category.
 */
label: string;
};

/**
 * Represents the result of an evaluation.
 * Only `key` is required; every other field is optional.
 */
export type EvaluationResult = {
/**
 * The key associated with the evaluation result.
 */
key: string;
/**
 * The score of the evaluation result.
 */
score?: ScoreType;
/**
 * The value of the evaluation result.
 */
value?: ValueType;
/**
 * A comment associated with the evaluation result.
 */
comment?: string;
/**
 * A correction record associated with the evaluation result.
 */
correction?: Record<string, unknown>;
/**
 * Information about the evaluator.
 */
evaluatorInfo?: Record<string, unknown>;
/**
 * The source run ID of the evaluation result.
 * If set, a link to the source run will be available in the UI.
 */
sourceRunId?: string;
/**
 * The target run ID of the evaluation result.
 * If this is not set, the target run ID is assumed to be
 * the root of the trace.
 */
targetRunId?: string;

/**
 * The feedback config associated with the evaluation result.
 * If set, this will be used to define how a feedback key
 * should be interpreted.
 */
feedbackConfig?: FeedbackConfig;
};

/**
 * Batch evaluation results: a wrapper to use when your evaluator
 * returns multiple scores.
 */
export type EvaluationResults = {
/**
 * The evaluation results.
 */
results: Array<EvaluationResult>;
};

export interface RunEvaluator {
evaluateRun(
run: Run,
Expand Down
4 changes: 2 additions & 2 deletions js/src/evaluation/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
// Evaluation methods
export { RunEvaluator, EvaluationResult } from "./evaluator.js";
export { RunEvaluator } from "./evaluator.js";
export {
StringEvaluator,
GradingFunctionParams,
Expand All @@ -12,6 +12,6 @@ export {
type DataT,
type SummaryEvaluatorT,
type EvaluatorT,
type ExperimentResultRow,
} from "./_runner.js";
export { EvaluationResult, type EvaluationResults } from "../schemas.js";
export { evaluateComparative } from "./evaluate_comparative.js";
3 changes: 2 additions & 1 deletion js/src/evaluation/string_evaluator.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import { Example, Run, ScoreType, ValueType } from "../schemas.js";
import { EvaluationResult, RunEvaluator } from "./evaluator.js";
import { RunEvaluator } from "./evaluator.js";
import { EvaluationResult } from "../schemas.js";

export interface GradingFunctionResult {
key?: string;
Expand Down
Loading
Loading