feat: Upload aggregate metrics and generative models used in an evaluation to GCS. #4837

Merged 1 commit on Jan 23, 2025
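For context, a minimal usage sketch of the flow this PR extends (the bucket, dataset path, metric names, and model IDs below are illustrative, not taken from this PR): once evaluate() finishes, the metrics table and a summary_metrics.json holding the aggregate metrics and model names are uploaded under the configured GCS prefix.

# Hedged sketch; assumes vertexai is installed and initialized and the GCS bucket exists.
import vertexai
from vertexai.evaluation import EvalTask
from vertexai.generative_models import GenerativeModel

vertexai.init(project="my-project", location="us-central1")

eval_task = EvalTask(
    dataset="gs://my-bucket/eval_dataset.jsonl",     # illustrative dataset
    metrics=["coherence", "fluency"],                # illustrative built-in metrics
    output_uri_prefix="gs://my-bucket/eval-output",  # triggers the GCS upload path
)
eval_result = eval_task.evaluate(model=GenerativeModel("gemini-1.5-pro"))

# With this change, the run writes (the stem comes from the generated output file name):
#   gs://my-bucket/eval-output/<stem>/<stem>.csv            <- metrics table
#   gs://my-bucket/eval-output/<stem>/summary_metrics.json  <- aggregate metrics + model names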
39 changes: 28 additions & 11 deletions tests/unit/vertexai/test_evaluation.py
@@ -558,13 +558,9 @@ def mock_experiment_tracker():


@pytest.fixture
-def mock_storage_blob_upload_from_filename():
-    with mock.patch(
-        "google.cloud.storage.Blob.upload_from_filename"
-    ) as mock_blob_upload_from_filename, mock.patch(
-        "google.cloud.storage.Bucket.exists", return_value=True
-    ):
-        yield mock_blob_upload_from_filename
+def mock_storage_blob_from_string():
+    with mock.patch("google.cloud.storage.Blob.from_string") as mock_blob_from_string:
+        yield mock_blob_from_string


@pytest.mark.usefixtures("google_auth_mock")
@@ -1948,8 +1944,29 @@ def test_pairtwise_metric_prompt_template_with_default_values(self):
== _EXPECTED_PAIRWISE_PROMPT_TEMPLATE_WITH_DEFAULT_VALUES.strip()
)

-    def test_upload_results(self, mock_storage_blob_upload_from_filename):
-        evaluation.utils.upload_evaluation_results(
-            _TEST_CSV, _TEST_BUCKET, _TEST_FILE_NAME
-        )
-        assert mock_storage_blob_upload_from_filename.called_once_with(_TEST_CSV)
+    def test_upload_results(self, mock_storage_blob_from_string):
+        with mock.patch("json.dump") as mock_json_dump:
+            evaluation.utils.upload_evaluation_results(
+                MOCK_EVAL_RESULT,
+                _TEST_BUCKET,
+                _TEST_FILE_NAME,
+                "candidate_model",
+                "baseline_model",
+            )
+
+        mock_storage_blob_from_string.assert_any_call(
+            uri="gs://test-bucket/test-file-name/test-file-name.csv",
+            client=mock.ANY,
+        )
+        mock_storage_blob_from_string.assert_any_call(
+            uri="gs://test-bucket/test-file-name/summary_metrics.json",
+            client=mock.ANY,
+        )
+        mock_json_dump.assert_called_once_with(
+            {
+                "summary_metrics": MOCK_EVAL_RESULT.summary_metrics,
+                "candidate_model_name": "candidate_model",
+                "baseline_model_name": "baseline_model",
+            },
+            mock.ANY,
+        )
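MOCK_EVAL_RESULT is defined elsewhere in this test module; a hypothetical shape consistent with how the test uses it (field values below are illustrative):

# Hypothetical fixture sketch; the real MOCK_EVAL_RESULT lives elsewhere in test_evaluation.py.
import pandas as pd
from vertexai.evaluation import EvalResult

MOCK_EVAL_RESULT = EvalResult(
    summary_metrics={"row_count": 1, "mock_metric/mean": 1.0},
    metrics_table=pd.DataFrame({"response": ["test"], "mock_metric": [1.0]}),
)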
23 changes: 22 additions & 1 deletion vertexai/evaluation/eval_task.py
@@ -464,8 +464,29 @@ def evaluate(
evaluation_service_qps=evaluation_service_qps,
retry_timeout=retry_timeout,
)

+        candidate_model_name = None
+        if isinstance(model, generative_models.GenerativeModel):
+            candidate_model_name = model._model_name
+
+        baseline_model_name = None
+        pairwise_metrics = [
+            metric
+            for metric in self.metrics
+            if isinstance(metric, pairwise_metric.PairwiseMetric)
+        ]
+        if pairwise_metrics:
+            # All pairwise metrics should have the same baseline model.
+            baseline_model = pairwise_metrics[0].baseline_model
+            if isinstance(baseline_model, generative_models.GenerativeModel):
+                baseline_model_name = baseline_model._model_name
+
        utils.upload_evaluation_results(
-            eval_result.metrics_table, self.output_uri_prefix, output_file_name
+            eval_result,
+            self.output_uri_prefix,
+            output_file_name,
+            candidate_model_name,
+            baseline_model_name,
        )
return eval_result

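Note on where the names come from: candidate_model_name is read from the GenerativeModel passed to evaluate(), and baseline_model_name from the baseline_model of the first PairwiseMetric. A hedged sketch of such a metric (metric name, prompt-template helper, and model IDs are illustrative):

# Hedged sketch; only shows where PairwiseMetric.baseline_model comes from.
from vertexai.evaluation import MetricPromptTemplateExamples, PairwiseMetric
from vertexai.generative_models import GenerativeModel

pairwise_coherence = PairwiseMetric(
    metric="pairwise_coherence",
    metric_prompt_template=MetricPromptTemplateExamples.get_prompt_template(
        "pairwise_coherence"
    ),
    baseline_model=GenerativeModel("gemini-1.0-pro"),  # recorded as baseline_model_name
)
# Passing [pairwise_coherence] as the EvalTask metrics and a GenerativeModel to evaluate()
# yields both model names in the uploaded summary_metrics.json.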
74 changes: 59 additions & 15 deletions vertexai/evaluation/utils.py
@@ -17,11 +17,12 @@

import functools
import io
+import json
import os
import tempfile
import threading
import time
-from typing import Any, Dict, Optional, TYPE_CHECKING, Union, Callable, Literal
+from typing import Any, Callable, Dict, Literal, Optional, TYPE_CHECKING, Union

from google.cloud import bigquery
from google.cloud import storage
@@ -33,6 +34,7 @@
from google.cloud.aiplatform_v1.services import (
evaluation_service as gapic_evaluation_services,
)
+from vertexai.evaluation import _base as eval_base


if TYPE_CHECKING:
@@ -276,35 +278,77 @@ def _upload_pandas_df_to_gcs(
" Please provide a valid GCS path with `jsonl` or `csv` suffix."
)

-    storage_client = storage.Client(
-        project=initializer.global_config.project,
-        credentials=initializer.global_config.credentials,
-    )
-    storage.Blob.from_string(
-        uri=upload_gcs_path, client=storage_client
-    ).upload_from_filename(filename=local_dataset_path)
+    _upload_file_to_gcs(upload_gcs_path, local_dataset_path)


+def _upload_evaluation_summary_to_gcs(
+    summary_metrics: Dict[str, float],
+    upload_gcs_path: str,
+    candidate_model_name: Optional[str] = None,
+    baseline_model_name: Optional[str] = None,
+) -> None:
+    """Uploads the evaluation summary to a GCS bucket."""
+    summary = {
+        "summary_metrics": summary_metrics,
+    }
+    if candidate_model_name:
+        summary["candidate_model_name"] = candidate_model_name
+    if baseline_model_name:
+        summary["baseline_model_name"] = baseline_model_name
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        local_summary_path = os.path.join(temp_dir, "summary_metrics.json")
+        json.dump(summary, open(local_summary_path, "w"))
+        _upload_file_to_gcs(upload_gcs_path, local_summary_path)
+
+
+def _upload_file_to_gcs(upload_gcs_path: str, filename: str) -> None:
+    storage_client = storage.Client(
+        project=initializer.global_config.project,
+        credentials=initializer.global_config.credentials,
+    )
+    storage.Blob.from_string(
+        uri=upload_gcs_path, client=storage_client
+    ).upload_from_filename(filename)


def upload_evaluation_results(
-    dataset: "pd.DataFrame", destination_uri_prefix: str, file_name: str
+    eval_result: eval_base.EvalResult,
+    destination_uri_prefix: str,
+    file_name: str,
+    candidate_model_name: Optional[str] = None,
+    baseline_model_name: Optional[str] = None,
) -> None:
    """Uploads eval results to GCS destination.

    Args:
-        dataset: Pandas dataframe to upload.
+        eval_result: Eval results to upload.
        destination_uri_prefix: GCS folder to store the data.
-        file_name: File name to store the data.
+        file_name: File name to store the metrics table.
+        candidate_model_name: Optional. Candidate model name.
+        baseline_model_name: Optional. Baseline model name.
    """
    if not destination_uri_prefix:
        _ipython_utils.display_gen_ai_evaluation_results_button()
        return
+    if eval_result.metrics_table is None:
+        return
    if destination_uri_prefix.startswith(_GCS_PREFIX):
-        _, extension = os.path.splitext(file_name)
+        base_name, extension = os.path.splitext(file_name)
        file_type = extension.lower()[1:]
-        output_path = destination_uri_prefix + "/" + file_name
-        _upload_pandas_df_to_gcs(dataset, output_path, file_type)
+        output_folder = destination_uri_prefix + "/" + base_name
+        metrics_table_path = output_folder + "/" + file_name
+        _upload_pandas_df_to_gcs(
+            eval_result.metrics_table, metrics_table_path, file_type
+        )
+        _upload_evaluation_summary_to_gcs(
+            eval_result.summary_metrics,
+            output_folder + "/summary_metrics.json",
+            candidate_model_name,
+            baseline_model_name,
+        )
        _ipython_utils.display_gen_ai_evaluation_results_button(
-            output_path.split(_GCS_PREFIX)[1]
+            metrics_table_path.split(_GCS_PREFIX)[1]
        )
    else:
        raise ValueError(
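Worked example of the new path construction above (bucket and file names are illustrative):

# Traces the splitext/base_name logic from upload_evaluation_results with sample values.
import os

destination_uri_prefix = "gs://my-bucket/evals"      # illustrative prefix
file_name = "eval_results.csv"                       # illustrative output file name

base_name, extension = os.path.splitext(file_name)   # ("eval_results", ".csv")
file_type = extension.lower()[1:]                    # "csv"
output_folder = destination_uri_prefix + "/" + base_name
metrics_table_path = output_folder + "/" + file_name
# metrics_table_path -> "gs://my-bucket/evals/eval_results/eval_results.csv"
# summary JSON       -> "gs://my-bucket/evals/eval_results/summary_metrics.json"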