-from typing import List, Optional
+from collections import defaultdict
+from typing import Any, Dict, List, Optional
 
 from opentelemetry.sdk.trace import ReadableSpan
 from pydantic import BaseModel, ConfigDict, model_serializer
 from pydantic.alias_generators import to_camel
+from pydantic_core import core_schema
 
 from uipath._cli._runtime._contracts import UiPathRuntimeResult
 from uipath.eval.models.models import EvaluationResult, ScoreType
@@ -22,11 +24,15 @@ class EvaluationResultDto(BaseModel):
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
     score: float
-    details: Optional[str] = None
+    details: Optional[str | BaseModel] = None
     evaluation_time: Optional[float] = None
 
     @model_serializer(mode="wrap")
-    def serialize_model(self, serializer, info):
+    def serialize_model(
+        self,
+        serializer: core_schema.SerializerFunctionWrapHandler,
+        info: core_schema.SerializationInfo,
+    ) -> Any:
         data = serializer(self)
         if self.details is None and isinstance(data, dict):
             data.pop("details", None)
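For context on the retyped wrap serializer above, here is a minimal, self-contained sketch of the same pattern on a throwaway model (`_Demo` is illustrative, not the real `EvaluationResultDto`); it shows how a `None` `details` field gets dropped from the dump:

```python
from typing import Any, Optional

from pydantic import BaseModel, model_serializer
from pydantic_core import core_schema


class _Demo(BaseModel):
    score: float
    details: Optional[str] = None

    @model_serializer(mode="wrap")
    def _serialize(
        self,
        serializer: core_schema.SerializerFunctionWrapHandler,
        info: core_schema.SerializationInfo,
    ) -> Any:
        # Let pydantic build the usual dict first, then strip the unset field,
        # mirroring the serialize_model pattern in the hunk above.
        data = serializer(self)
        if self.details is None and isinstance(data, dict):
            data.pop("details", None)
        return data


print(_Demo(score=1.0).model_dump())                # {'score': 1.0}
print(_Demo(score=0.5, details="ok").model_dump())  # {'score': 0.5, 'details': 'ok'}
```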
@@ -96,3 +102,82 @@ def compute_average_score(self) -> None:
             eval_result.score for eval_result in self.evaluation_set_results
         ]
         self.score = sum(eval_item_scores) / len(eval_item_scores)
+
+    def calculate_final_score(
+        self,
+        evaluator_weights: Dict[str, float] | None = None,
+        default_weight: float = 1.0,
+    ) -> tuple[float, Dict[str, float]]:
+        """Aggregate evaluation results with deduplication and weighted scoring.
+
+        This method performs the following steps:
+        1. Flattens the nested evaluation_set_results structure
+        2. Deduplicates results by datapoint_id (evaluation_name) and
+           evaluator_name, averaging duplicate scores
+        3. Calculates the average score per evaluator across all datapoints
+        4. Computes the final weighted score across evaluators
+
+        Args:
+            evaluator_weights: Optional dict mapping evaluator names to weights.
+            default_weight: Weight used for evaluators missing from
+                evaluator_weights (default: 1.0).
+
+        Returns:
+            Tuple of (final_score, agg_metrics_per_evaluator):
+            - final_score: Weighted average across evaluators.
+            - agg_metrics_per_evaluator: Dict mapping evaluator names to
+              their average scores.
+        """
+        if not self.evaluation_set_results:
+            return 0.0, {}
+
+        if evaluator_weights is None:
+            evaluator_weights = {}
+
+        # Step 1: Flatten the nested structure, grouping scores by datapoint_id
+        # (the evaluation_name) and by evaluator_name from EvaluationRunResultDto.
+        grouped_by_datapoint_evaluator: defaultdict[
+            str, defaultdict[str, list[float]]
+        ] = defaultdict(lambda: defaultdict(list))
+
+        for eval_run_result in self.evaluation_set_results:
+            datapoint_id = eval_run_result.evaluation_name
+            for eval_run_result_dto in eval_run_result.evaluation_run_results:
+                evaluator_name = eval_run_result_dto.evaluator_name
+                score = eval_run_result_dto.result.score
+                grouped_by_datapoint_evaluator[datapoint_id][evaluator_name].append(
+                    score
+                )
+
+        # Step 2: Deduplicate by averaging same-evaluator results for the same datapoint
+        dedup_scores: list[tuple[str, str, float]] = []
+        for datapoint_id, evaluators_dict in grouped_by_datapoint_evaluator.items():
+            for evaluator_name, scores_list in evaluators_dict.items():
+                if scores_list:
+                    # Average the scores for this evaluator on this datapoint
+                    avg_score = sum(scores_list) / len(scores_list)
+                    dedup_scores.append((datapoint_id, evaluator_name, avg_score))
+
+        # Step 3: Group by evaluator and calculate the average score per evaluator
+        grouped_by_evaluator: defaultdict[str, list[float]] = defaultdict(list)
+        for _datapoint_id, evaluator_name, score in dedup_scores:
+            grouped_by_evaluator[evaluator_name].append(score)
+
+        agg_metrics_per_evaluator = {}
+        for evaluator_name, scores_list in grouped_by_evaluator.items():
+            avg_score = sum(scores_list) / len(scores_list)
+            agg_metrics_per_evaluator[evaluator_name] = avg_score
+
+        # Step 4: Calculate the final weighted score
+        if not agg_metrics_per_evaluator:
+            return 0.0, {}
+
+        total_weighted_score = 0.0
+        total_weight = 0.0
+
+        for evaluator_name, avg_score in agg_metrics_per_evaluator.items():
+            weight = evaluator_weights.get(evaluator_name, default_weight)
+            total_weighted_score += avg_score * weight
+            total_weight += weight
+
+        final_score = total_weighted_score / total_weight if total_weight > 0 else 0.0
+
+        self.score = final_score
+        return final_score, agg_metrics_per_evaluator
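To make the four docstring steps concrete, here is a standalone sketch of the aggregation; the (datapoint, evaluator, score) tuples stand in for the real `EvaluationSetResultDto` contents, and all names, scores, and weights below are invented for illustration:

```python
from collections import defaultdict

# Flattened (datapoint_id, evaluator_name, score) triples, as produced by Step 1.
raw_results = [
    ("dp-1", "exact-match", 1.0),
    ("dp-1", "exact-match", 0.0),  # duplicate run: averaged to 0.5 in Step 2
    ("dp-1", "llm-judge", 0.8),
    ("dp-2", "exact-match", 1.0),
    ("dp-2", "llm-judge", 0.6),
]

# Steps 1-2: group by (datapoint, evaluator) and average duplicates.
by_pair: defaultdict[tuple[str, str], list[float]] = defaultdict(list)
for dp, ev, score in raw_results:
    by_pair[(dp, ev)].append(score)
dedup = {pair: sum(s) / len(s) for pair, s in by_pair.items()}

# Step 3: average per evaluator across datapoints.
by_evaluator: defaultdict[str, list[float]] = defaultdict(list)
for (_dp, ev), score in dedup.items():
    by_evaluator[ev].append(score)
per_evaluator = {ev: sum(s) / len(s) for ev, s in by_evaluator.items()}
# {'exact-match': 0.75, 'llm-judge': 0.7}

# Step 4: weighted average across evaluators.
weights = {"exact-match": 1.0, "llm-judge": 2.0}
total_weight = sum(weights.get(ev, 1.0) for ev in per_evaluator)
final = sum(s * weights.get(ev, 1.0) for ev, s in per_evaluator.items()) / total_weight
print(round(final, 4))  # 0.7167 == (0.75 * 1.0 + 0.7 * 2.0) / 3.0
```

Note how the duplicate dp-1 runs are collapsed before any per-evaluator averaging, so a datapoint that was evaluated twice does not count twice in the final score.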