Commit 58f7bfc

feat: wire new evaluators

1 parent c5c541b commit 58f7bfc

File tree

17 files changed: +376 −62 lines changed

src/uipath/_cli/_evals/_models/_evaluation_set.py

Lines changed: 70 additions & 4 deletions

@@ -1,13 +1,29 @@
 from enum import IntEnum
-from typing import Any, Dict, List
+from typing import Annotated, Any, Dict, List, Literal, Union
 
-from pydantic import BaseModel, ConfigDict, Field
+from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
 from pydantic.alias_generators import to_camel
 
+from uipath.eval.coded_evaluators import BaseEvaluator
+from uipath.eval.evaluators import LegacyBaseEvaluator
+
 
 class EvaluationItem(BaseModel):
     """Individual evaluation item within an evaluation set."""
 
+    model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
+    id: str
+    name: str
+    inputs: Dict[str, Any]
+    evaluation_criterias: dict[str, dict[str, Any] | None] = Field(
+        ..., alias="evaluationCriterias"
+    )
+    expected_agent_behavior: str = Field(default="", alias="expectedAgentBehavior")
+
+
+class LegacyEvaluationItem(BaseModel):
+    """Individual evaluation item within an evaluation set."""
+
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
     id: str
@@ -28,12 +44,36 @@ class EvaluationItem(BaseModel):
 class EvaluationSet(BaseModel):
     """Complete evaluation set model."""
 
+    model_config = ConfigDict(
+        alias_generator=to_camel, populate_by_name=True, extra="allow"
+    )
+
+    id: str
+    name: str
+    version: Literal["1.0"] = "1.0"
+    evaluator_refs: List[str] = Field(default_factory=list)
+    evaluations: List[EvaluationItem] = Field(default_factory=list)
+
+    def extract_selected_evals(self, eval_ids) -> None:
+        selected_evals: list[EvaluationItem] = []
+        for evaluation in self.evaluations:
+            if evaluation.id in eval_ids:
+                selected_evals.append(evaluation)
+                eval_ids.remove(evaluation.id)
+        if len(eval_ids) > 0:
+            raise ValueError("Unknown evaluation ids: {}".format(eval_ids))
+        self.evaluations = selected_evals
+
+
+class LegacyEvaluationSet(BaseModel):
+    """Complete evaluation set model."""
+
     model_config = ConfigDict(alias_generator=to_camel, populate_by_name=True)
 
     id: str
     file_name: str
     evaluator_refs: List[str] = Field(default_factory=list)
-    evaluations: List[EvaluationItem] = Field(default_factory=list)
+    evaluations: List[LegacyEvaluationItem] = Field(default_factory=list)
     name: str
     batch_size: int = 10
     timeout_minutes: int = 20
@@ -42,7 +82,7 @@ class EvaluationSet(BaseModel):
     updated_at: str
 
     def extract_selected_evals(self, eval_ids) -> None:
-        selected_evals: list[EvaluationItem] = []
+        selected_evals: list[LegacyEvaluationItem] = []
        for evaluation in self.evaluations:
            if evaluation.id in eval_ids:
                selected_evals.append(evaluation)
@@ -56,3 +96,29 @@ class EvaluationStatus(IntEnum):
     PENDING = 0
     IN_PROGRESS = 1
     COMPLETED = 2
+
+
+def _discriminate_eval_set(
+    v: Any,
+) -> Literal["evaluation_set", "legacy_evaluation_set"]:
+    """Discriminator function that returns a tag based on version field."""
+    if isinstance(v, dict):
+        version = v.get("version")
+        if version == "1.0":
+            return "evaluation_set"
+    return "legacy_evaluation_set"
+
+
+AnyEvaluationSet = Annotated[
+    Union[
+        Annotated[EvaluationSet, Tag("evaluation_set")],
+        Annotated[LegacyEvaluationSet, Tag("legacy_evaluation_set")],
+    ],
+    Discriminator(_discriminate_eval_set),
+]
+
+AnyEvaluationItem = Union[EvaluationItem, LegacyEvaluationItem]
+
+AnyEvaluator = Annotated[
+    Union[LegacyBaseEvaluator[Any], BaseEvaluator[Any, Any, Any]], "List of evaluators"
+]
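
To see how AnyEvaluationSet behaves, here is a minimal sketch (not part of this commit) that runs a payload through pydantic's TypeAdapter; it assumes the names above are importable from the module. A payload whose version field equals "1.0" validates as the new EvaluationSet; anything else is tagged "legacy_evaluation_set" and must then satisfy LegacyEvaluationSet's required fields (file_name, updated_at, and so on).

from pydantic import TypeAdapter

from uipath._cli._evals._models._evaluation_set import (
    AnyEvaluationSet,
    EvaluationSet,
)

adapter = TypeAdapter(AnyEvaluationSet)

# "version": "1.0" makes _discriminate_eval_set return "evaluation_set",
# so this validates as the new model.
eval_set = adapter.validate_python(
    {"id": "set-1", "name": "smoke tests", "version": "1.0"}
)
assert isinstance(eval_set, EvaluationSet)

Note that extract_selected_evals mutates both the model (only matching evaluations are kept) and the eval_ids argument (matched ids are removed, and a ValueError is raised if any remain), so callers should pass a disposable collection.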
Lines changed: 146 additions & 0 deletions (new file)

@@ -0,0 +1,146 @@
+from typing import Annotated, Any, Literal, Union
+
+from pydantic import BaseModel, ConfigDict, Discriminator, Field, Tag
+
+from uipath.eval.coded_evaluators.base_evaluator import BaseEvaluatorConfig
+from uipath.eval.coded_evaluators.contains_evaluator import ContainsEvaluatorConfig
+from uipath.eval.models.models import (
+    EvaluatorType,
+    LegacyEvaluatorCategory,
+    LegacyEvaluatorType,
+)
+
+
+class EvaluatorBaseParams(BaseModel):
+    """Parameters for initializing the base evaluator."""
+
+    id: str
+    name: str
+    description: str
+    evaluator_type: LegacyEvaluatorType = Field(..., alias="type")
+    created_at: str = Field(..., alias="createdAt")
+    updated_at: str = Field(..., alias="updatedAt")
+    target_output_key: str = Field(..., alias="targetOutputKey")
+    file_name: str = Field(..., alias="fileName")
+
+
+class LLMEvaluatorParams(EvaluatorBaseParams):
+    category: Literal[LegacyEvaluatorCategory.LlmAsAJudge] = Field(
+        ..., alias="category"
+    )
+    prompt: str = Field(..., alias="prompt")
+    model: str = Field(..., alias="model")
+
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class TrajectoryEvaluatorParams(EvaluatorBaseParams):
+    category: Literal[LegacyEvaluatorCategory.Trajectory] = Field(..., alias="category")
+    prompt: str = Field(..., alias="prompt")
+    model: str = Field(..., alias="model")
+
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class EqualsEvaluatorParams(EvaluatorBaseParams):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class JsonSimilarityEvaluatorParams(EvaluatorBaseParams):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class UnknownEvaluatorParams(EvaluatorBaseParams):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+class UnknownEvaluatorConfig(BaseEvaluatorConfig):
+    model_config = ConfigDict(
+        validate_by_name=True, validate_by_alias=True, extra="allow"
+    )
+
+
+def legacy_evaluator_discriminator(data: Any) -> str:
+    if isinstance(data, dict):
+        category = data.get("category")
+        evaluator_type = data.get("type")
+        match category:
+            case LegacyEvaluatorCategory.LlmAsAJudge:
+                return "LLMEvaluatorParams"
+            case LegacyEvaluatorCategory.Trajectory:
+                return "TrajectoryEvaluatorParams"
+            case LegacyEvaluatorCategory.Deterministic:
+                match evaluator_type:
+                    case LegacyEvaluatorType.Equals:
+                        return "EqualsEvaluatorParams"
+                    case LegacyEvaluatorType.JsonSimilarity:
+                        return "JsonSimilarityEvaluatorParams"
+                    case _:
+                        return "UnknownEvaluatorParams"
+            case _:
+                return "UnknownEvaluatorParams"
+    else:
+        return "UnknownEvaluatorParams"
+
+
+def evaluator_config_discriminator(data: Any) -> str:
+    if isinstance(data, dict):
+        evaluator_type_id = data.get("evaluatorTypeId")
+        match evaluator_type_id:
+            case EvaluatorType.CONTAINS:
+                return "ContainsEvaluatorConfig"
+            case _:
+                return "UnknownEvaluatorConfig"
+    else:
+        return "UnknownEvaluatorConfig"
+
+
+EvaluatorLegacy = Annotated[
+    Union[
+        Annotated[
+            LLMEvaluatorParams,
+            Tag("LLMEvaluatorParams"),
+        ],
+        Annotated[
+            TrajectoryEvaluatorParams,
+            Tag("TrajectoryEvaluatorParams"),
+        ],
+        Annotated[
+            EqualsEvaluatorParams,
+            Tag("EqualsEvaluatorParams"),
+        ],
+        Annotated[
+            JsonSimilarityEvaluatorParams,
+            Tag("JsonSimilarityEvaluatorParams"),
+        ],
+        Annotated[
+            UnknownEvaluatorParams,
+            Tag("UnknownEvaluatorParams"),
+        ],
+    ],
+    Field(discriminator=Discriminator(legacy_evaluator_discriminator)),
+]
+
+EvaluatorConfig = Annotated[
+    Union[
+        Annotated[
+            ContainsEvaluatorConfig,
+            Tag("ContainsEvaluatorConfig"),
+        ],
+        Annotated[
+            UnknownEvaluatorConfig,
+            Tag("UnknownEvaluatorConfig"),
+        ],
+    ],
+    Field(discriminator=Discriminator(evaluator_config_discriminator)),
]
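
As a quick illustration (again a sketch, not part of this commit), the callable discriminators can be exercised directly; enum members are passed here because the raw JSON values depend on how LegacyEvaluatorCategory and LegacyEvaluatorType serialize.

# Sketch only; assumes the definitions above are in scope.
tag = legacy_evaluator_discriminator(
    {"category": LegacyEvaluatorCategory.LlmAsAJudge, "type": None}
)
assert tag == "LLMEvaluatorParams"

# Deterministic evaluators are discriminated a second time on "type".
tag = legacy_evaluator_discriminator(
    {
        "category": LegacyEvaluatorCategory.Deterministic,
        "type": LegacyEvaluatorType.JsonSimilarity,
    }
)
assert tag == "JsonSimilarityEvaluatorParams"

# Unrecognized or non-dict input degrades to the Unknown* tags instead of failing.
assert legacy_evaluator_discriminator(None) == "UnknownEvaluatorParams"
assert evaluator_config_discriminator({}) == "UnknownEvaluatorConfig"

Falling back to UnknownEvaluatorParams / UnknownEvaluatorConfig (both declared with extra="allow") means an unrecognized evaluator still deserializes instead of raising, which keeps older payloads loadable as new evaluator types are added.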

src/uipath/_cli/_evals/_models/_evaluator_base_params.py

Lines changed: 3 additions & 3 deletions

@@ -1,14 +1,14 @@
 from pydantic import BaseModel
 
-from uipath.eval.models.models import EvaluatorCategory, EvaluatorType
+from uipath.eval.models.models import LegacyEvaluatorCategory, LegacyEvaluatorType
 
 
 class EvaluatorBaseParams(BaseModel):
     """Parameters for initializing the base evaluator."""
 
     id: str
-    category: EvaluatorCategory
-    evaluator_type: EvaluatorType
+    category: LegacyEvaluatorCategory
+    evaluator_type: LegacyEvaluatorType
     name: str
     description: str
     created_at: str

src/uipath/_cli/_evals/_progress_reporter.py

Lines changed: 10 additions & 7 deletions

@@ -9,7 +9,10 @@
 from opentelemetry import trace
 
 from uipath import UiPath
-from uipath._cli._evals._models._evaluation_set import EvaluationItem, EvaluationStatus
+from uipath._cli._evals._models._evaluation_set import (
+    EvaluationStatus,
+    LegacyEvaluationItem,
+)
 from uipath._cli._evals._models._sw_reporting import (
     StudioWebAgentSnapshot,
     StudioWebProgressItem,
@@ -28,7 +31,7 @@
 )
 from uipath._utils import Endpoint, RequestSpec
 from uipath._utils.constants import ENV_TENANT_ID, HEADER_INTERNAL_TENANT_ID
-from uipath.eval.evaluators import BaseEvaluator
+from uipath.eval.evaluators import LegacyBaseEvaluator
 from uipath.eval.models import EvalItemResult, ScoreType
 from uipath.tracing import LlmOpsHttpExporter
 
@@ -85,7 +88,7 @@ async def create_eval_set_run(
         eval_set_id: str,
         agent_snapshot: StudioWebAgentSnapshot,
         no_of_evals: int,
-        evaluators: List[BaseEvaluator[Any]],
+        evaluators: List[LegacyBaseEvaluator[Any]],
     ) -> str:
         """Create a new evaluation set run in StudioWeb."""
         spec = self._create_eval_set_run_spec(eval_set_id, agent_snapshot, no_of_evals)
@@ -101,7 +104,7 @@ async def create_eval_set_run(
 
     @gracefully_handle_errors
     async def create_eval_run(
-        self, eval_item: EvaluationItem, eval_set_run_id: str
+        self, eval_item: LegacyEvaluationItem, eval_set_run_id: str
    ) -> str:
         """Create a new evaluation run in StudioWeb.
 
@@ -126,7 +129,7 @@ async def create_eval_run(
     async def update_eval_run(
         self,
         sw_progress_item: StudioWebProgressItem,
-        evaluators: dict[str, BaseEvaluator[Any]],
+        evaluators: dict[str, LegacyBaseEvaluator[Any]],
     ):
         """Update an evaluation run with results."""
         assertion_runs, evaluator_scores = self._collect_results(
@@ -300,7 +303,7 @@ def _extract_agent_snapshot(self, entrypoint: str) -> StudioWebAgentSnapshot:
     def _collect_results(
         self,
         eval_results: list[EvalItemResult],
-        evaluators: dict[str, BaseEvaluator[Any]],
+        evaluators: dict[str, LegacyBaseEvaluator[Any]],
     ) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
         assertion_runs: list[dict[str, Any]] = []
         evaluator_scores_list: list[dict[str, Any]] = []
@@ -365,7 +368,7 @@ def _update_eval_run_spec(
         )
 
     def _create_eval_run_spec(
-        self, eval_item: EvaluationItem, eval_set_run_id: str
+        self, eval_item: LegacyEvaluationItem, eval_set_run_id: str
     ) -> RequestSpec:
         return RequestSpec(
             method="POST",
