Commit 5f89836

vertex-sdk-bot authored and copybara-github committed

feat: Upload aggregate metrics and generative models used in an evaluation to GCS.

PiperOrigin-RevId: 713454093

1 parent 30a9403 · commit 5f89836

File tree: 3 files changed (+109 −27 lines)
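
For orientation, a minimal sketch of the behavior after this commit; the dataset, metrics, model, bucket, and file names below are placeholders rather than part of the commit, and the GCS layout shown matches the paths asserted in the updated test:

```python
from vertexai.evaluation import EvalTask

# Placeholder task setup; output_uri_prefix enables the GCS upload path.
eval_task = EvalTask(
    dataset=eval_dataset,                      # placeholder dataset
    metrics=metrics,                           # may include PairwiseMetric instances
    output_uri_prefix="gs://my-bucket/evals",  # placeholder GCS prefix
)
eval_result = eval_task.evaluate(model=candidate_model)  # placeholder model

# Previously only the metrics table was uploaded, directly under the prefix.
# Now a per-run folder (named after the output file's base name) also holds
# the aggregate metrics and the model names used:
#   gs://my-bucket/evals/<run-name>/<run-name>.csv
#   gs://my-bucket/evals/<run-name>/summary_metrics.json
```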

tests/unit/vertexai/test_evaluation.py (+28 −11)

```diff
@@ -558,13 +558,9 @@ def mock_experiment_tracker():
 
 
 @pytest.fixture
-def mock_storage_blob_upload_from_filename():
-    with mock.patch(
-        "google.cloud.storage.Blob.upload_from_filename"
-    ) as mock_blob_upload_from_filename, mock.patch(
-        "google.cloud.storage.Bucket.exists", return_value=True
-    ):
-        yield mock_blob_upload_from_filename
+def mock_storage_blob_from_string():
+    with mock.patch("google.cloud.storage.Blob.from_string") as mock_blob_from_string:
+        yield mock_blob_from_string
 
 
 @pytest.mark.usefixtures("google_auth_mock")
@@ -1948,8 +1944,29 @@ def test_pairtwise_metric_prompt_template_with_default_values(self):
             == _EXPECTED_PAIRWISE_PROMPT_TEMPLATE_WITH_DEFAULT_VALUES.strip()
         )
 
-    def test_upload_results(self, mock_storage_blob_upload_from_filename):
-        evaluation.utils.upload_evaluation_results(
-            _TEST_CSV, _TEST_BUCKET, _TEST_FILE_NAME
+    def test_upload_results(self, mock_storage_blob_from_string):
+        with mock.patch("json.dump") as mock_json_dump:
+            evaluation.utils.upload_evaluation_results(
+                MOCK_EVAL_RESULT,
+                _TEST_BUCKET,
+                _TEST_FILE_NAME,
+                "candidate_model",
+                "baseline_model",
+            )
+
+        mock_storage_blob_from_string.assert_any_call(
+            uri="gs://test-bucket/test-file-name/test-file-name.csv",
+            client=mock.ANY,
+        )
+        mock_storage_blob_from_string.assert_any_call(
+            uri="gs://test-bucket/test-file-name/summary_metrics.json",
+            client=mock.ANY,
+        )
+        mock_json_dump.assert_called_once_with(
+            {
+                "summary_metrics": MOCK_EVAL_RESULT.summary_metrics,
+                "candidate_model_name": "candidate_model",
+                "baseline_model_name": "baseline_model",
+            },
+            mock.ANY,
         )
-        assert mock_storage_blob_upload_from_filename.called_once_with(_TEST_CSV)
```
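
The fixture now patches google.cloud.storage.Blob.from_string because both uploads are funneled through the new _upload_file_to_gcs helper in utils.py (third file below). For reference, the underlying google-cloud-storage calls that helper makes look like this; a minimal sketch, assuming default credentials and placeholder paths:

```python
from google.cloud import storage

# Build a blob directly from a gs:// URI, then upload a local file to it.
client = storage.Client()  # project/credentials resolved from the environment
blob = storage.Blob.from_string(
    "gs://test-bucket/test-file-name/summary_metrics.json", client=client
)
blob.upload_from_filename("/tmp/summary_metrics.json")  # placeholder local path
```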

vertexai/evaluation/eval_task.py (+22 −1)

```diff
@@ -464,8 +464,29 @@ def evaluate(
             evaluation_service_qps=evaluation_service_qps,
             retry_timeout=retry_timeout,
         )
+
+        candidate_model_name = None
+        if isinstance(model, generative_models.GenerativeModel):
+            candidate_model_name = model._model_name
+
+        baseline_model_name = None
+        pairwise_metrics = [
+            metric
+            for metric in self.metrics
+            if isinstance(metric, pairwise_metric.PairwiseMetric)
+        ]
+        if pairwise_metrics:
+            # All pairwise metrics should have the same baseline model.
+            baseline_model = pairwise_metrics[0].baseline_model
+            if isinstance(baseline_model, generative_models.GenerativeModel):
+                baseline_model_name = baseline_model._model_name
+
         utils.upload_evaluation_results(
-            eval_result.metrics_table, self.output_uri_prefix, output_file_name
+            eval_result,
+            self.output_uri_prefix,
+            output_file_name,
+            candidate_model_name,
+            baseline_model_name,
         )
         return eval_result
 
```
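
The new block records the candidate model's name from model._model_name and, when pairwise metrics are configured, the baseline model's name from the first PairwiseMetric. A minimal sketch of the setup it inspects; the metric name, prompt template, and model ID are placeholders:

```python
from vertexai.evaluation import PairwiseMetric
from vertexai.generative_models import GenerativeModel

baseline = GenerativeModel("gemini-1.0-pro")  # placeholder model ID
metric = PairwiseMetric(
    metric="pairwise_coherence",      # placeholder metric name
    metric_prompt_template="...",     # template elided
    baseline_model=baseline,
)
# evaluate() reads pairwise_metrics[0].baseline_model._model_name. A baseline
# supplied as a plain callable has no _model_name, which is why the
# isinstance(..., GenerativeModel) guard leaves baseline_model_name as None.
```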

vertexai/evaluation/utils.py (+59 −15)

```diff
@@ -17,11 +17,12 @@
 
 import functools
 import io
+import json
 import os
 import tempfile
 import threading
 import time
-from typing import Any, Dict, Optional, TYPE_CHECKING, Union, Callable, Literal
+from typing import Any, Callable, Dict, Literal, Optional, TYPE_CHECKING, Union
 
 from google.cloud import bigquery
 from google.cloud import storage
@@ -33,6 +34,7 @@
 from google.cloud.aiplatform_v1.services import (
     evaluation_service as gapic_evaluation_services,
 )
+from vertexai.evaluation import _base as eval_base
 
 
 if TYPE_CHECKING:
@@ -276,35 +278,77 @@ def _upload_pandas_df_to_gcs(
             " Please provide a valid GCS path with `jsonl` or `csv` suffix."
         )
 
-    storage_client = storage.Client(
-        project=initializer.global_config.project,
-        credentials=initializer.global_config.credentials,
-    )
-    storage.Blob.from_string(
-        uri=upload_gcs_path, client=storage_client
-    ).upload_from_filename(filename=local_dataset_path)
+    _upload_file_to_gcs(upload_gcs_path, local_dataset_path)
+
+
+def _upload_evaluation_summary_to_gcs(
+    summary_metrics: Dict[str, float],
+    upload_gcs_path: str,
+    candidate_model_name: Optional[str] = None,
+    baseline_model_name: Optional[str] = None,
+):
+    """Uploads the evaluation summary to a GCS bucket."""
+    summary = {
+        "summary_metrics": summary_metrics,
+    }
+    if candidate_model_name:
+        summary["candidate_model_name"] = candidate_model_name
+    if baseline_model_name:
+        summary["baseline_model_name"] = baseline_model_name
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        local_summary_path = os.path.join(temp_dir, "summary_metrics.json")
+        json.dump(summary, open(local_summary_path, "w"))
+        _upload_file_to_gcs(upload_gcs_path, local_summary_path)
+
+
+def _upload_file_to_gcs(upload_gcs_path: str, filename: str):
+    storage_client = storage.Client(
+        project=initializer.global_config.project,
+        credentials=initializer.global_config.credentials,
+    )
+    storage.Blob.from_string(
+        uri=upload_gcs_path, client=storage_client
+    ).upload_from_filename(filename)
 
 
 def upload_evaluation_results(
-    dataset: "pd.DataFrame", destination_uri_prefix: str, file_name: str
+    eval_result: eval_base.EvalResult,
+    destination_uri_prefix: str,
+    file_name: str,
+    candidate_model_name: Optional[str] = None,
+    baseline_model_name: Optional[str] = None,
 ) -> None:
     """Uploads eval results to GCS destination.
 
     Args:
-        dataset: Pandas dataframe to upload.
+        eval_result: Eval results to upload.
         destination_uri_prefix: GCS folder to store the data.
-        file_name: File name to store the data.
+        file_name: File name to store the metrics table.
+        candidate_model_name: Optional. Candidate model name.
+        baseline_model_name: Optional. Baseline model name.
     """
     if not destination_uri_prefix:
        _ipython_utils.display_gen_ai_evaluation_results_button()
        return
+    if eval_result.metrics_table is None:
+        return
     if destination_uri_prefix.startswith(_GCS_PREFIX):
-        _, extension = os.path.splitext(file_name)
+        base_name, extension = os.path.splitext(file_name)
         file_type = extension.lower()[1:]
-        output_path = destination_uri_prefix + "/" + file_name
-        _upload_pandas_df_to_gcs(dataset, output_path, file_type)
+        output_folder = destination_uri_prefix + "/" + base_name
+        metrics_table_path = output_folder + "/" + file_name
+        _upload_pandas_df_to_gcs(
+            eval_result.metrics_table, metrics_table_path, file_type
+        )
+        _upload_evaluation_summary_to_gcs(
+            eval_result.summary_metrics,
+            output_folder + "/summary_metrics.json",
+            candidate_model_name,
+            baseline_model_name,
+        )
         _ipython_utils.display_gen_ai_evaluation_results_button(
-            output_path.split(_GCS_PREFIX)[1]
+            metrics_table_path.split(_GCS_PREFIX)[1]
         )
     else:
         raise ValueError(
```
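
Taken together, the utils.py changes let callers upload a full EvalResult rather than a bare DataFrame. Roughly how a direct call would look; a sketch only, with invented metric values, placeholder paths, and the assumption that eval_base.EvalResult can be keyword-constructed from its fields:

```python
import pandas as pd

from vertexai.evaluation import _base as eval_base
from vertexai.evaluation import utils

eval_result = eval_base.EvalResult(
    summary_metrics={"row_count": 2, "coherence/mean": 4.5},  # invented values
    metrics_table=pd.DataFrame({"response": ["a", "b"]}),     # placeholder table
)
utils.upload_evaluation_results(
    eval_result,
    "gs://my-bucket/evals",  # destination_uri_prefix (placeholder)
    "eval-results.csv",      # file_name; its base name becomes the subfolder
    "candidate_model",
    "baseline_model",
)
# Expected writes, per the path construction in upload_evaluation_results:
#   gs://my-bucket/evals/eval-results/eval-results.csv
#   gs://my-bucket/evals/eval-results/summary_metrics.json
```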
