
Commit 85ba576

andrei-rusu authored and radu-mocanu committed
add initial version of revamped coded evaluators
1 parent be12b16 commit 85ba576

40 files changed: +7002 −20 lines changed

src/uipath/_cli/_evals/_models/_output.py

Lines changed: 88 additions & 3 deletions
@@ -1,8 +1,10 @@
-from typing import List, Optional
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
 
 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel, ConfigDict, model_serializer
 from pydantic.alias_generators import to_camel
+from pydantic_core import core_schema
 
 from uipath._cli._runtime._contracts import UiPathRuntimeResult
 from uipath.eval.models.models import EvaluationResult, ScoreType
@@ -22,11 +24,15 @@ class EvaluationResultDto(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
     score: float
-    details: Optional[str] = None
+    details: Optional[str | BaseModel] = None
     evaluation_time: Optional[float] = None
 
     @model_serializer(mode="wrap")
-    def serialize_model(self, serializer, info):
+    def serialize_model(
+        self,
+        serializer: core_schema.SerializerFunctionWrapHandler,
+        info: core_schema.SerializationInfo,
+    ) -> Any:
         data = serializer(self)
         if self.details is None and isinstance(data, dict):
             data.pop("details", None)
@@ -96,3 +102,82 @@ def compute_average_score(self) -> None:
             eval_result.score for eval_result in self.evaluation_set_results
         ]
         self.score = sum(eval_item_scores) / len(eval_item_scores)
+
+    def calculate_final_score(
+        self,
+        evaluator_weights: Dict[str, float] | None = None,
+        default_weight: float = 1.0,
+    ) -> tuple[float, Dict[str, float]]:
+        """Aggregate evaluation results with deduplication and weighted scoring.
+
+        This function performs the following steps:
+        1. Flattens the nested evaluation_set_results structure
+        2. Deduplicates results by datapoint_id (evaluation_name) and evaluator_name (averages duplicates)
+        3. Calculates average score per evaluator across all datapoints
+        4. Computes final weighted score across evaluators
+
+        Args:
+            evaluator_weights: Optional dict mapping evaluator names to weights
+            default_weight: Default weight for evaluators not in evaluator_weights (default: 1.0)
+
+        Returns:
+            Tuple of (final_score, agg_metrics_per_evaluator)
+            - final_score: Weighted average across evaluators
+            - agg_metrics_per_evaluator: Dict mapping evaluator names to their average scores
+        """
+        if not self.evaluation_set_results:
+            return 0.0, {}
+
+        if evaluator_weights is None:
+            evaluator_weights = {}
+
+        # Step 1: Flatten the nested structure and group by datapoint_id and evaluator_name for deduplication
+        # datapoint_id = evaluation_name, evaluator_name from EvaluationRunResultDto
+        grouped_by_datapoint_evaluator: defaultdict[
+            str, defaultdict[str, list[float]]
+        ] = defaultdict(lambda: defaultdict(list))
+
+        for eval_run_result in self.evaluation_set_results:
+            datapoint_id = eval_run_result.evaluation_name
+            for eval_run_result_dto in eval_run_result.evaluation_run_results:
+                evaluator_name = eval_run_result_dto.evaluator_name
+                score = eval_run_result_dto.result.score
+                grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append(
+                    score
+                )
+
+        # Step 2: Deduplicate by averaging same evaluator results for same datapoint
+        dedup_scores: list[tuple[str, str, float]] = []
+        for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items():
+            for evaluator_name, scores_list in evaluators_dict.items():
+                if scores_list:
+                    # Average the scores for this evaluator on this datapoint
+                    avg_score = sum(scores_list) / len(scores_list)
+                    dedup_scores.append((datapoint_id, evaluator_name, avg_score))
+
+        # Step 3: Group by evaluator and calculate average score per evaluator
+        grouped_by_evaluator: defaultdict[str, list[float]] = defaultdict(list)
+        for _datapoint_id, evaluator_name, score in dedup_scores:
+            grouped_by_evaluator[evaluator_name].append(score)
+
+        agg_metrics_per_evaluator = {}
+        for evaluator_name, scores_list in grouped_by_evaluator.items():
+            avg_score = sum(scores_list) / len(scores_list)
+            agg_metrics_per_evaluator[evaluator_name] = avg_score
+
+        # Step 4: Calculate final weighted score
+        if not agg_metrics_per_evaluator:
+            return 0.0, {}
+
+        total_weighted_score = 0.0
+        total_weight = 0.0
+
+        for evaluator_name, avg_score in agg_metrics_per_evaluator.items():
+            weight = evaluator_weights.get(evaluator_name, default_weight)
+            total_weighted_score += avg_score * weight
+            total_weight += weight
+
+        final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0
+
+        self.score = final_score
+        return final_score, agg_metrics_per_evaluator
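
For context on the serializer change above: a wrap-mode model_serializer first asks pydantic for the default representation, then post-processes it, which is how a None details field ends up dropped from the serialized output. Below is a minimal standalone sketch of the same pattern; the ResultDto class and its fields are illustrative, not the actual DTO from this commit.

from typing import Any, Optional

from pydantic import BaseModel, model_serializer
from pydantic_core import core_schema


class ResultDto(BaseModel):
    score: float
    details: Optional[str] = None

    @model_serializer(mode="wrap")
    def serialize_model(
        self,
        serializer: core_schema.SerializerFunctionWrapHandler,
        info: core_schema.SerializationInfo,
    ) -> Any:
        # Let pydantic build the normal dict first ...
        data = serializer(self)
        # ... then drop "details" when it was never set.
        if self.details is None and isinstance(data, dict):
            data.pop("details", None)
        return data


print(ResultDto(score=0.9).model_dump())                 # {'score': 0.9}
print(ResultDto(score=0.9, details="ok").model_dump())   # {'score': 0.9, 'details': 'ok'}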

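The new calculate_final_score method boils down to a weighted mean over per-evaluator averages. The sketch below walks through the same arithmetic on plain tuples, with hypothetical datapoint names, evaluator names, and weights rather than the real DTO objects.

from collections import defaultdict

# Hypothetical flattened results: (datapoint_id, evaluator_name, score).
# ("dp1", "exact_match") appears twice to exercise the dedup-by-averaging step.
results = [
    ("dp1", "exact_match", 1.0),
    ("dp1", "exact_match", 0.0),  # duplicate run -> averaged to 0.5
    ("dp1", "llm_judge", 0.8),
    ("dp2", "exact_match", 1.0),
    ("dp2", "llm_judge", 0.6),
]
evaluator_weights = {"llm_judge": 2.0}  # illustrative weights
default_weight = 1.0

# Steps 1-2: average duplicate (datapoint, evaluator) scores.
per_pair: defaultdict[tuple[str, str], list[float]] = defaultdict(list)
for dp, ev, score in results:
    per_pair[(dp, ev)].append(score)

# Step 3: average per evaluator across datapoints.
per_evaluator: defaultdict[str, list[float]] = defaultdict(list)
for (dp, ev), scores in per_pair.items():
    per_evaluator[ev].append(sum(scores) / len(scores))
agg = {ev: sum(s) / len(s) for ev, s in per_evaluator.items()}
# exact_match: (0.5 + 1.0) / 2 = 0.75, llm_judge: (0.8 + 0.6) / 2 = 0.7

# Step 4: weighted mean across evaluators.
total_weight = sum(evaluator_weights.get(ev, default_weight) for ev in agg)
final_score = sum(
    score * evaluator_weights.get(ev, default_weight) for ev, score in agg.items()
) / total_weight
print(agg, round(final_score, 3))  # (0.75 * 1.0 + 0.7 * 2.0) / 3.0 ≈ 0.717

Unlike this sketch, the real method also guards against a zero total weight and falls back to 0.0 when there are no results at all.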