Skip to content

Commit 1612f5a

Browse files
authored
feat(evals): add status enum for evaluation scores (#2169)
1 parent 8e13b78 commit 1612f5a

File tree

20 files changed

+257
-59
lines changed

20 files changed

+257
-59
lines changed

genkit-tools/common/src/eval/parser.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ export function enrichResultsWithScoring(
4545
evaluator,
4646
score: s.score,
4747
scoreId: s.id,
48+
status: s.status,
4849
rationale: s.details?.reasoning,
4950
error: s.error,
5051
traceId: scoredSample.traceId,

genkit-tools/common/src/types/eval.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,10 +130,13 @@ export type EvalInput = z.infer<typeof EvalInputSchema>;
130130
export const EvalInputDatasetSchema = z.array(EvalInputSchema);
131131
export type EvalInputDataset = z.infer<typeof EvalInputDatasetSchema>;
132132

133+
// Zod enum of the values accepted for the optional `status` field of EvalMetricSchema.
const EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);
134+
133135
export const EvalMetricSchema = z.object({
134136
evaluator: z.string(),
135137
scoreId: z.string().optional(),
136138
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
139+
status: EvalStatusEnumSchema.optional(),
137140
rationale: z.string().optional(),
138141
error: z.string().optional(),
139142
traceId: z.string().optional(),

genkit-tools/common/src/types/evaluator.ts

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,8 @@ export const BaseEvalDataPointSchema = BaseDataPointSchema.extend({
4646
testCaseId: z.string(),
4747
});
4848
export type BaseEvalDataPoint = z.infer<typeof BaseEvalDataPointSchema>;
49-
49+
/** Zod enum of the values a score's optional `status` may take: 'UNKNOWN', 'PASS' or 'FAIL'. */
50+
export const EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);
5051
/**
5152
* Zod schema for evaluation score
5253
*/
@@ -56,7 +57,7 @@ export const ScoreSchema = z.object({
5657
.describe('Optional ID to differentiate different scores')
5758
.optional(),
5859
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
59-
// TODO: use StatusSchema
60+
status: EvalStatusEnumSchema.optional(),
6061
error: z.string().optional(),
6162
details: z
6263
.object({

genkit-tools/genkit-schema.json

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,14 @@
413413
"$ref": "#/$defs/EvalFnResponse"
414414
}
415415
},
416+
"EvalStatusEnum": {
417+
"type": "string",
418+
"enum": [
419+
"UNKNOWN",
420+
"PASS",
421+
"FAIL"
422+
]
423+
},
416424
"Score": {
417425
"type": "object",
418426
"properties": {
@@ -427,6 +435,9 @@
427435
"boolean"
428436
]
429437
},
438+
"status": {
439+
"$ref": "#/$defs/EvalStatusEnum"
440+
},
430441
"error": {
431442
"type": "string"
432443
},

go/ai/gen.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,14 @@ type EvalRequest struct {
9292

9393
type EvalResponse []any
9494

95+
// EvalStatusEnum is a string type whose declared values are the
// EvalStatusEnum* constants defined alongside it.
type EvalStatusEnum string
96+
97+
// Declared EvalStatusEnum values. Each constant's string value is the same as
// the upper-case suffix of its identifier.
const (
98+
EvalStatusEnumUNKNOWN EvalStatusEnum = "UNKNOWN"
99+
EvalStatusEnumPASS EvalStatusEnum = "PASS"
100+
EvalStatusEnumFAIL EvalStatusEnum = "FAIL"
101+
)
102+
95103
type FinishReason string
96104

97105
const (

js/ai/src/evaluator.ts

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,15 @@ export const BaseEvalDataPointSchema = BaseDataPointSchema.extend({
3939
});
4040
export type BaseEvalDataPoint = z.infer<typeof BaseEvalDataPointSchema>;
4141

42+
// Zod enum used to validate the optional `status` field of ScoreSchema; it
// accepts the same three string values that the EvalStatusEnum enum declares.
const EvalStatusEnumSchema = z.enum(['UNKNOWN', 'PASS', 'FAIL']);
43+
44+
/**
 * Enum that indicates if an evaluation has passed or failed.
 *
 * Every member is a string enum member whose value is identical to its name,
 * so the values line up with the literals accepted by `EvalStatusEnumSchema`.
 */
45+
export enum EvalStatusEnum {
46+
UNKNOWN = 'UNKNOWN',
47+
PASS = 'PASS',
48+
FAIL = 'FAIL',
49+
}
50+
4251
export const ScoreSchema = z.object({
4352
id: z
4453
.string()
@@ -47,7 +56,7 @@ export const ScoreSchema = z.object({
4756
)
4857
.optional(),
4958
score: z.union([z.number(), z.string(), z.boolean()]).optional(),
50-
// TODO: use StatusSchema
59+
status: EvalStatusEnumSchema.optional(),
5160
error: z.string().optional(),
5261
details: z
5362
.object({
@@ -218,8 +227,10 @@ export function defineEvaluator<
218227
testCaseId: datapoint.testCaseId,
219228
evaluation: {
220229
error: `Evaluation of test case ${datapoint.testCaseId} failed: \n${(e as Error).stack}`,
230+
status: EvalStatusEnum.FAIL,
221231
},
222232
});
233+
// Throw to mark the span as failed.
223234
throw e;
224235
}
225236
}

js/ai/src/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ export {
3333
} from './embedder.js';
3434
export {
3535
BaseDataPointSchema,
36+
EvalStatusEnum,
3637
evaluate,
3738
evaluatorRef,
3839
type EvalResponses,

js/genkit/src/evaluator.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ export {
1919
BaseEvalDataPointSchema,
2020
EvalResponseSchema,
2121
EvalResponsesSchema,
22+
EvalStatusEnum,
2223
EvaluatorInfoSchema,
2324
ScoreSchema,
2425
evaluatorRef,

js/plugins/evaluators/src/index.ts

Lines changed: 82 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,11 @@
1414
* limitations under the License.
1515
*/
1616

17-
import { EmbedderReference, Genkit, ModelReference, z } from 'genkit';
17+
import { Genkit, z } from 'genkit';
1818
import {
1919
BaseEvalDataPoint,
2020
EvalResponse,
21+
EvalStatusEnum,
2122
Score,
2223
evaluatorRef,
2324
} from 'genkit/evaluator';
@@ -30,22 +31,18 @@ import {
3031
maliciousnessScore,
3132
regexp,
3233
} from './metrics/index.js';
33-
import { GenkitMetric } from './types.js';
34-
export { GenkitMetric };
34+
import {
35+
AnswerRelevancyGenkitMetricConfig,
36+
GenkitMetric,
37+
ResolvedConfig,
38+
isGenkitMetricConfig,
39+
type GenkitMetricConfig,
40+
type PluginOptions,
41+
} from './types.js';
42+
export { GenkitMetric, type GenkitMetricConfig, type PluginOptions };
3543

3644
const PLUGIN_NAME = 'genkitEval';
3745

38-
export interface PluginOptions<
39-
ModelCustomOptions extends z.ZodTypeAny,
40-
EmbedderCustomOptions extends z.ZodTypeAny,
41-
> {
42-
metrics?: Array<GenkitMetric>;
43-
judge?: ModelReference<ModelCustomOptions>;
44-
judgeConfig?: z.infer<ModelCustomOptions>;
45-
embedder?: EmbedderReference<EmbedderCustomOptions>;
46-
embedderOptions?: z.infer<EmbedderCustomOptions>;
47-
}
48-
4946
/**
5047
* Reference to the Genkit evaluator for a specified metric
5148
*/
@@ -75,15 +72,16 @@ export function genkitEval<
7572

7673
export default genkitEval;
7774

78-
function hasMetric(arr: GenkitMetric[] | undefined, metric: GenkitMetric) {
79-
return arr?.some((m) => m === metric);
80-
}
81-
82-
function fillScores(dataPoint: BaseEvalDataPoint, score: Score): EvalResponse {
83-
return {
84-
testCaseId: dataPoint.testCaseId,
85-
evaluation: score,
86-
};
75+
/**
 * Wraps a metric's score into the evaluator response for a single datapoint.
 *
 * The `status` on the returned evaluation is the value returned by
 * `statusOverrideFn` when one is supplied; otherwise the score's own `status`
 * is carried over unchanged (including when it is absent).
 *
 * @param dataPoint datapoint whose `testCaseId` identifies the response
 * @param score score produced by the metric for this datapoint
 * @param statusOverrideFn optional callback that decides the final status
 *   from the score
 * @returns the `testCaseId` paired with the score and its final status
 */
function fillScores(
  dataPoint: BaseEvalDataPoint,
  score: Score,
  statusOverrideFn?: (args: { score: Score }) => EvalStatusEnum
): EvalResponse {
  // An override, when present, always wins over the status computed by the metric.
  const finalStatus = statusOverrideFn
    ? statusOverrideFn({ score })
    : score.status;

  return {
    testCaseId: dataPoint.testCaseId,
    evaluation: { ...score, status: finalStatus },
  };
}
8886

8987
/**
@@ -96,23 +94,35 @@ export function genkitEvaluators<
9694
ai: Genkit,
9795
params: PluginOptions<ModelCustomOptions, EmbedderCustomOptions>
9896
) {
99-
let { metrics, judge, judgeConfig, embedder, embedderOptions } = params;
100-
if (!metrics) {
101-
metrics = [GenkitMetric.MALICIOUSNESS, GenkitMetric.FAITHFULNESS];
102-
} else if (!embedder && hasMetric(metrics, GenkitMetric.ANSWER_RELEVANCY)) {
103-
throw new Error('Embedder must be specified if computing answer relvancy');
97+
let { metrics } = params;
98+
if (metrics.length === 0) {
99+
throw new Error('No metrics configured in genkitEval plugin');
104100
}
105101
return metrics.map((metric) => {
106-
switch (metric) {
102+
const {
103+
type,
104+
judge,
105+
judgeConfig,
106+
embedder,
107+
embedderOptions,
108+
statusOverrideFn,
109+
} = resolveConfig(metric, params);
110+
const evaluator = `${PLUGIN_NAME}/${type.toLocaleLowerCase()}`;
111+
switch (type) {
107112
case GenkitMetric.ANSWER_RELEVANCY: {
108113
if (!judge) {
109114
throw new Error(
110115
'Judge llms must be specified if computing answer relvancy'
111116
);
112117
}
118+
if (!embedder) {
119+
throw new Error(
120+
'Embedder must be specified if computing answer relvancy'
121+
);
122+
}
113123
return ai.defineEvaluator(
114124
{
115-
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
125+
name: evaluator,
116126
displayName: 'Answer Relevancy',
117127
definition:
118128
'Assesses how pertinent the generated answer is to the given prompt',
@@ -126,7 +136,7 @@ export function genkitEvaluators<
126136
judgeConfig,
127137
embedderOptions
128138
);
129-
return fillScores(datapoint, answerRelevancy);
139+
return fillScores(datapoint, answerRelevancy, statusOverrideFn);
130140
}
131141
);
132142
}
@@ -138,7 +148,7 @@ export function genkitEvaluators<
138148
}
139149
return ai.defineEvaluator(
140150
{
141-
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
151+
name: evaluator,
142152
displayName: 'Faithfulness',
143153
definition:
144154
'Measures the factual consistency of the generated answer against the given context',
@@ -150,7 +160,7 @@ export function genkitEvaluators<
150160
datapoint,
151161
judgeConfig
152162
);
153-
return fillScores(datapoint, faithfulness);
163+
return fillScores(datapoint, faithfulness, statusOverrideFn);
154164
}
155165
);
156166
}
@@ -162,7 +172,7 @@ export function genkitEvaluators<
162172
}
163173
return ai.defineEvaluator(
164174
{
165-
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
175+
name: evaluator,
166176
displayName: 'Maliciousness',
167177
definition:
168178
'Measures whether the generated output intends to deceive, harm, or exploit',
@@ -174,14 +184,14 @@ export function genkitEvaluators<
174184
datapoint,
175185
judgeConfig
176186
);
177-
return fillScores(datapoint, maliciousness);
187+
return fillScores(datapoint, maliciousness, statusOverrideFn);
178188
}
179189
);
180190
}
181191
case GenkitMetric.REGEX: {
182192
return ai.defineEvaluator(
183193
{
184-
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
194+
name: evaluator,
185195
displayName: 'RegExp',
186196
definition: 'Tests output against the regexp provided as reference',
187197
},
@@ -193,29 +203,60 @@ export function genkitEvaluators<
193203
case GenkitMetric.DEEP_EQUAL: {
194204
return ai.defineEvaluator(
195205
{
196-
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
197-
displayName: 'Deep Equal',
206+
name: evaluator,
207+
displayName: 'Deep Equals',
198208
definition:
199209
'Tests equality of output against the provided reference',
200210
},
201211
async (datapoint: BaseEvalDataPoint) => {
202-
return fillScores(datapoint, await deepEqual(datapoint));
212+
return fillScores(
213+
datapoint,
214+
await deepEqual(datapoint),
215+
statusOverrideFn
216+
);
203217
}
204218
);
205219
}
206220
case GenkitMetric.JSONATA: {
207221
return ai.defineEvaluator(
208222
{
209-
name: `${PLUGIN_NAME}/${metric.toLocaleLowerCase()}`,
223+
name: evaluator,
210224
displayName: 'JSONata',
211225
definition:
212226
'Tests JSONata expression (provided in reference) against output',
213227
},
214228
async (datapoint: BaseEvalDataPoint) => {
215-
return fillScores(datapoint, await jsonata(datapoint));
229+
return fillScores(
230+
datapoint,
231+
await jsonata(datapoint),
232+
statusOverrideFn
233+
);
216234
}
217235
);
218236
}
219237
}
220238
});
221239
}
240+
241+
/**
 * Normalises one entry of the plugin's `metrics` list into a resolved config.
 *
 * - An entry that `isGenkitMetricConfig` rejects is used directly as the
 *   `type`, with every plugin-level option from `params` spread alongside it.
 * - A metric config object supplies its own `type` and `statusOverrideFn`.
 *   Its `judge` and `judgeConfig` fall back to the plugin-level values when
 *   nullish. `embedder` and `embedderOptions` are read from the metric only
 *   when its type is `GenkitMetric.ANSWER_RELEVANCY`, and are `undefined`
 *   for every other type.
 *
 * @param metric one entry of the configured metrics
 * @param params the plugin options the entry was configured under
 * @returns the settings to use when defining the evaluator for this metric
 */
function resolveConfig<M extends z.ZodTypeAny, E extends z.ZodTypeAny>(
  metric: GenkitMetricConfig<M, E>,
  params: PluginOptions<M, E>
): ResolvedConfig<M, E> {
  // Entry without per-metric settings: inherit everything from the plugin options.
  if (!isGenkitMetricConfig(metric)) {
    return { type: metric, ...params };
  }

  // Only answer-relevancy configs carry embedder settings.
  const relevancyConfig =
    metric.type === GenkitMetric.ANSWER_RELEVANCY
      ? (metric as AnswerRelevancyGenkitMetricConfig<M, E>)
      : undefined;

  return {
    type: metric.type,
    statusOverrideFn: metric.statusOverrideFn,
    judge: metric.judge ?? params.judge,
    judgeConfig: metric.judgeConfig ?? params.judgeConfig,
    embedder: relevancyConfig?.embedder,
    embedderOptions: relevancyConfig?.embedderOptions,
  } as ResolvedConfig<M, E>;
}

js/plugins/evaluators/src/metrics/answer_relevancy.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
import similarity from 'compute-cosine-similarity';
1818
import { Genkit, ModelArgument, z } from 'genkit';
1919
import { EmbedderArgument } from 'genkit/embedder';
20-
import { BaseEvalDataPoint, Score } from 'genkit/evaluator';
20+
import { BaseEvalDataPoint, EvalStatusEnum, Score } from 'genkit/evaluator';
2121
import path from 'path';
2222
import { getDirName, loadPromptFile, renderText } from './helper.js';
2323

@@ -103,11 +103,13 @@ export async function answerRelevancyScore<
103103
: answered
104104
? 'Cosine similarity'
105105
: 'Cosine similarity with penalty for insufficient answer';
106+
const finalScore = adjustedScore * (isNonCommittal ? 0 : 1);
106107
return {
107-
score: adjustedScore * (isNonCommittal ? 0 : 1),
108+
score: finalScore,
108109
details: {
109110
reasoning,
110111
},
112+
status: finalScore > 0.5 ? EvalStatusEnum.PASS : EvalStatusEnum.FAIL,
111113
};
112114
} catch (err) {
113115
console.debug(

0 commit comments

Comments
 (0)