
Commit 4531d08

vertex-sdk-bot authored and copybara-github committed
feat: GenAI Evaluation: Release GenAI Evaluation SDK Autorater Config/Tuning/Evaluation to vertexai.preview module.
PiperOrigin-RevId: 720725088
1 parent 50a66b7 commit 4531d08

12 files changed (+918 / -8 lines)
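
The tests added below exercise the new tune_autorater and evaluate_autorater helpers in vertexai.preview.evaluation.autorater_utils. As a rough usage sketch for the tuning half, mirroring the arguments in test_tune_autorater (the project, location, and GCS dataset paths are placeholders, not values from this commit):

import vertexai
from vertexai.preview.evaluation import autorater_utils

# Placeholder project/location and dataset URIs -- substitute your own.
vertexai.init(project="my-project", location="us-central1")

# Tune an autorater model; parameters mirror test_tune_autorater below.
autorater_config = autorater_utils.tune_autorater(
    base_model="gemini-1.0-pro-001",
    train_dataset="gs://my-bucket/train_dataset.jsonl",
    validation_dataset="gs://my-bucket/validation_dataset.jsonl",
    epochs=300,
    learning_rate_multiplier=1.0,
)

# Per the tests, the returned AutoraterConfig points at the tuned model
# endpoint and can be passed to evaluate_autorater (see the note after the
# test file below).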

Diff for: setup.py

+2 lines

@@ -151,6 +151,8 @@
 evaluation_extra_require = [
     "pandas >= 1.0.0",
     "tqdm>=4.23.0",
+    "scikit-learn<1.6.0; python_version<='3.10'",
+    "scikit-learn; python_version>'3.10'",
 ]

 langchain_extra_require = [

Diff for: tests/unit/vertexai/test_autorater_utils.py

+349 lines

@@ -0,0 +1,349 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Unit tests for autorater utils."""

import copy
import datetime
from typing import Any, Dict, List
from unittest import mock
import uuid

from google import auth
from google.auth import credentials as auth_credentials
import vertexai
from google.cloud.aiplatform import compat
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform import utils as aiplatform_utils
from google.cloud.aiplatform_v1beta1.services import gen_ai_tuning_service
from google.cloud.aiplatform_v1beta1.types import job_state
from google.cloud.aiplatform_v1beta1.types import (
    tuning_job as gca_tuning_job,
)
from vertexai.preview import tuning
from vertexai.preview.evaluation import autorater_utils
from vertexai.preview.evaluation.metrics import pairwise_metric
from vertexai.preview.evaluation.metrics import pointwise_metric
import numpy as np
import pandas as pd
import pytest


AutoraterConfig = autorater_utils.AutoraterConfig
PointwiseMetric = pointwise_metric.PointwiseMetric
PairwiseMetric = pairwise_metric.PairwiseMetric

_TEST_PROJECT = "test-project"
_TEST_LOCATION = "us-central1"


_global_tuning_jobs: Dict[str, gca_tuning_job.TuningJob] = {}
_SCORE = "score"
_METRIC = "metric"
_PAIRWISE_CHOICE = "pairwise_choice"
_HUMAN_RATING = "human_rating"
_HUMAN_PAIRWISE_CHOICE = "human_pairwise_choice"
_ACCURACY_BALANCED = "accuracy_balanced"
_F1_SCORE_BALANCED = "f1_score_balanced"
_CONFUSION_MATRIX = "confusion_matrix"
_CONFUSION_MATRIX_LABELS = "confusion_matrix_labels"


@pytest.fixture
def google_auth_mock():
    with mock.patch.object(auth, "default") as google_auth_default_mock:
        google_auth_default_mock.return_value = (
            auth_credentials.AnonymousCredentials(),
            _TEST_PROJECT,
        )
        yield google_auth_default_mock


class MockGenAiTuningServiceClient(gen_ai_tuning_service.GenAiTuningServiceClient):
    """Mock GenAiTuningServiceClient."""

    @property
    def _tuning_jobs(self) -> Dict[str, gca_tuning_job.TuningJob]:
        return _global_tuning_jobs

    def create_tuning_job(
        self,
        *,
        parent: str,
        tuning_job: gca_tuning_job.TuningJob,
        **_,
    ) -> gca_tuning_job.TuningJob:
        tuning_job = copy.deepcopy(tuning_job)
        resource_id = uuid.uuid4().hex
        resource_name = f"{parent}/tuningJobs/{resource_id}"
        tuning_job.name = resource_name
        current_time = datetime.datetime.now(datetime.timezone.utc)
        tuning_job.tuned_model = gca_tuning_job.TunedModel(
            model=f"{parent}/models/123",
            endpoint=f"{parent}/endpoints/456",
        )
        tuning_job.state = job_state.JobState.JOB_STATE_SUCCEEDED
        tuning_job.create_time = current_time
        tuning_job.update_time = current_time
        self._tuning_jobs[resource_name] = tuning_job
        return tuning_job

    def get_tuning_job(self, *, name: str, **_) -> gca_tuning_job.TuningJob:
        tuning_job = self._tuning_jobs[name]
        tuning_job = copy.deepcopy(tuning_job)
        return tuning_job


class MockTuningJobClientWithOverride(aiplatform_utils.ClientWithOverride):
    _is_temporary = False
    _default_version = compat.V1
    _version_map = ((compat.V1, MockGenAiTuningServiceClient),)


@pytest.mark.usefixtures("google_auth_mock")
class TestAutoraterUtils:
    """Unit tests for generative model tuning."""

    def setup_method(self):
        vertexai.init(
            project=_TEST_PROJECT,
            location=_TEST_LOCATION,
        )

    def teardown_method(self):
        initializer.global_pool.shutdown(wait=True)

    @mock.patch.object(
        target=tuning.TuningJob,
        attribute="client_class",
        new=MockTuningJobClientWithOverride,
    )
    def test_tune_autorater(self):
        """Test tune_autorater."""
        autorater_config = autorater_utils.tune_autorater(
            base_model="gemini-1.0-pro-001",
            train_dataset="gs://test-bucket/train_dataset.jsonl",
            validation_dataset="gs://test-bucket/validation_dataset.jsonl",
            epochs=300,
            learning_rate_multiplier=1.0,
            time_out_hours=0,
        )
        assert autorater_config.autorater_model == (
            "projects/test-project/locations/us-central1/endpoints/456"
        )

    def test_evaluate_autorater(self):
        """Test evaluate_autorater."""
        autorater_config = autorater_utils.AutoraterConfig(
            autorater_model="projects/test-project/locations/us-central1/endpoints/456"
        )
        y_true_2_class = [1, 0, 1, 0, 1, 0]
        y_pred_2_class = [1, 0, 0, 1, 1, 0]
        y_true_multi_class = ["1", "2", "1", "1", "2", "3"]
        y_pred_multi_class = [
            "2",
            "2",
            "1",
            "1",
            "2",
            "1",
        ]
        metrics = [
            PairwiseMetric(
                metric="test_pairwise_2_class",
                metric_prompt_template="test prompt1",
            ),
            PointwiseMetric(
                metric="test_pointwise_multi_class",
                metric_prompt_template="test prompt2",
            ),
        ]
        autorater_eval_result = autorater_utils.evaluate_autorater(
            evaluate_autorater_input=pd.DataFrame(
                {
                    f"test_pairwise_2_class/{_PAIRWISE_CHOICE}": y_pred_2_class,
                    f"test_pairwise_2_class/{_HUMAN_PAIRWISE_CHOICE}": y_true_2_class,
                    f"test_pointwise_multi_class/{_SCORE}": y_pred_multi_class,
                    f"test_pointwise_multi_class/{_HUMAN_RATING}": y_true_multi_class,
                }
            ),
            eval_metrics=metrics,
            autorater_config=autorater_config,
            eval_dataset_metadata={
                "eval_dataset_path": "gs://test-bucket/eval_dataset.jsonl",
                "eval_dataset_size": 6,
            },
            unused_params=10,
        )
        expected_eval_results = [
            {
                _METRIC: metrics[0].metric_name,
                _ACCURACY_BALANCED: 2 / 3,
                _F1_SCORE_BALANCED: 2 / 3,
                _CONFUSION_MATRIX: np.array([[2, 1], [1, 2]]),
                _CONFUSION_MATRIX_LABELS: ["0", "1"],
            },
            {
                _METRIC: metrics[1].metric_name,
                _ACCURACY_BALANCED: 5 / 9,
                _F1_SCORE_BALANCED: 3 / 5,
                _CONFUSION_MATRIX: np.array([[2, 1, 0], [0, 2, 0], [1, 0, 0]]),
                _CONFUSION_MATRIX_LABELS: ["1.0", "2.0", "3.0"],
            },
        ]

        assert _compare_autorater_eval_result(
            autorater_eval_result.eval_result, expected_eval_results
        )
        assert autorater_eval_result.eval_dataset_metadata == {
            "eval_dataset_path": "gs://test-bucket/eval_dataset.jsonl",
            "eval_dataset_size": 6,
        }
        assert autorater_eval_result.autorater_config == autorater_config
        assert autorater_eval_result.unused_params == 10

    def test_evaluate_autorater_exceed_pointwise_limit(self):
        """Test evaluate_autorater."""
        autorater_config = autorater_utils.AutoraterConfig(
            autorater_model="projects/test-project/locations/us-central1/endpoints/456"
        )
        y_true_multi_class = [_ for _ in range(12)]
        y_pred_multi_class = [_ for _ in range(12)]
        metrics = [
            PointwiseMetric(
                metric="test_pointwise_multi_class",
                metric_prompt_template="test prompt2",
            ),
        ]
        autorater_eval_result = autorater_utils.evaluate_autorater(
            evaluate_autorater_input=pd.DataFrame(
                {
                    f"test_pointwise_multi_class/{_SCORE}": y_pred_multi_class,
                    f"test_pointwise_multi_class/{_HUMAN_RATING}": y_true_multi_class,
                }
            ),
            eval_metrics=metrics,
            autorater_config=autorater_config,
            eval_dataset_metadata={
                "eval_dataset_path": "gs://test-bucket/eval_dataset.jsonl",
                "eval_dataset_size": 6,
            },
            unused_params=10,
        )
        assert autorater_eval_result.eval_result == [
            {
                _METRIC: metrics[0].metric_name,
                _ACCURACY_BALANCED: 1.0,
                _F1_SCORE_BALANCED: 1.0,
            },
        ]
        assert autorater_eval_result.eval_dataset_metadata == {
            "eval_dataset_path": "gs://test-bucket/eval_dataset.jsonl",
            "eval_dataset_size": 6,
        }
        assert autorater_eval_result.autorater_config == autorater_config
        assert autorater_eval_result.unused_params == 10

    @mock.patch.object(
        target=tuning.TuningJob,
        attribute="client_class",
        new=MockTuningJobClientWithOverride,
    )
    def test_evaluate_autorater_with_skipped_results(self):
        """Test evaluate_autorater."""
        autorater_config = autorater_utils.AutoraterConfig(
            autorater_model="projects/test-project/locations/us-central1/endpoints/456"
        )
        y_true_2_class = ["1", "0", "1", "0", "1", "0", "Error", "1"]
        y_pred_2_class = ["1", "0", "0", "1", "1", "0", "0", "ERROR"]
        y_true_multi_class = ["1", "2", "1", 1, "2", "3", "1", "NaN"]
        y_pred_multi_class = ["2", "2.0", "1", 1.0, "2", "1", "NaN", "1"]
        metrics = [
            PairwiseMetric(
                metric="test_pairwise_2_class",
                metric_prompt_template="test prompt1",
            ),
            PointwiseMetric(
                metric="test_pointwise_multi_class",
                metric_prompt_template="test prompt2",
            ),
        ]
        autorater_eval_result = autorater_utils.evaluate_autorater(
            evaluate_autorater_input=pd.DataFrame(
                {
                    f"test_pairwise_2_class/{_PAIRWISE_CHOICE}": y_pred_2_class,
                    f"test_pairwise_2_class/{_HUMAN_PAIRWISE_CHOICE}": y_true_2_class,
                    f"test_pointwise_multi_class/{_SCORE}": y_pred_multi_class,
                    f"test_pointwise_multi_class/{_HUMAN_RATING}": y_true_multi_class,
                }
            ),
            eval_metrics=metrics,
            autorater_config=autorater_config,
            eval_dataset_metadata={
                "eval_dataset_path": "gs://test-bucket/eval_dataset.jsonl",
                "eval_dataset_size": 6,
            },
            unused_params=10,
        )
        expected_eval_results = [
            {
                _METRIC: metrics[0].metric_name,
                _ACCURACY_BALANCED: 2 / 3,
                _F1_SCORE_BALANCED: 2 / 3,
                _CONFUSION_MATRIX: np.array([[2, 1], [1, 2]]),
                _CONFUSION_MATRIX_LABELS: ["0", "1"],
            },
            {
                _METRIC: metrics[1].metric_name,
                _ACCURACY_BALANCED: 5 / 9,
                _F1_SCORE_BALANCED: 3 / 5,
                _CONFUSION_MATRIX: np.array([[2, 1, 0], [0, 2, 0], [1, 0, 0]]),
                _CONFUSION_MATRIX_LABELS: ["1.0", "2.0", "3.0"],
            },
        ]
        assert _compare_autorater_eval_result(
            autorater_eval_result.eval_result, expected_eval_results
        )
        assert autorater_eval_result.eval_dataset_metadata == {
            "eval_dataset_path": "gs://test-bucket/eval_dataset.jsonl",
            "eval_dataset_size": 6,
        }
        assert autorater_eval_result.autorater_config == autorater_config
        assert autorater_eval_result.unused_params == 10


def _compare_autorater_eval_result(
    actual_eval_results: List[Dict[str, Any]],
    expected_eval_results: List[Dict[str, Any]],
) -> bool:
    """Compare autorater eval result."""
    for actual, expected in zip(actual_eval_results, expected_eval_results):
        if actual[_METRIC] != expected[_METRIC]:
            return False
        if not _almost_equal(actual[_ACCURACY_BALANCED], expected[_ACCURACY_BALANCED]):
            return False
        if not _almost_equal(actual[_F1_SCORE_BALANCED], expected[_F1_SCORE_BALANCED]):
            return False
        if not (actual[_CONFUSION_MATRIX] == expected[_CONFUSION_MATRIX]).all():
            return False
        if actual[_CONFUSION_MATRIX_LABELS] != expected[_CONFUSION_MATRIX_LABELS]:
            return False
    return True


def _almost_equal(a: Any, b: Any) -> bool:
    """Compare two numbers with a small tolerance."""
    return abs(a - b) <= 1e-6
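
For reference, the tests above also show the DataFrame layout that evaluate_autorater expects: each metric contributes a prediction column and a human-label column named after the metric. A minimal sketch of that input with placeholder metric names and ratings (the column suffixes score, human_rating, pairwise_choice, and human_pairwise_choice are taken from the tests; everything else here is illustrative):

import pandas as pd
from vertexai.preview.evaluation import autorater_utils
from vertexai.preview.evaluation.metrics import pointwise_metric

# Pointwise metrics pair "<metric>/score" with "<metric>/human_rating";
# pairwise metrics pair "<metric>/pairwise_choice" with
# "<metric>/human_pairwise_choice".
metric = pointwise_metric.PointwiseMetric(
    metric="my_pointwise_metric",
    metric_prompt_template="...",
)
evaluate_autorater_input = pd.DataFrame(
    {
        "my_pointwise_metric/score": ["1", "2", "1", "2"],
        "my_pointwise_metric/human_rating": ["1", "2", "2", "2"],
    }
)
result = autorater_utils.evaluate_autorater(
    evaluate_autorater_input=evaluate_autorater_input,
    eval_metrics=[metric],
    autorater_config=autorater_utils.AutoraterConfig(
        autorater_model="projects/my-project/locations/us-central1/endpoints/123",
    ),
)
# result.eval_result holds per-metric balanced accuracy, balanced F1 and,
# for small label sets, a confusion matrix, as asserted in the tests above.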
