
Commit aabb579

vertex-sdk-bot authored and copybara-github committed
feat: Upload dataset URI, metric criteria, and rating rubrics used in an evaluation to GCS.
PiperOrigin-RevId: 723940715
1 parent d4cae46 commit aabb579

4 files changed: +49 −1 lines changed
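For orientation, here is a minimal usage sketch of the behavior this commit adds. It is a sketch only: the project, bucket, dataset path, criteria, and rubric are illustrative placeholders, and the calls assume the public EvalTask / PointwiseMetric / PointwiseMetricPromptTemplate APIs of vertexai.evaluation. When the dataset is passed as a GCS URI string and a metric carries a structured prompt template, the summary_metrics.json uploaded under output_uri_prefix now also records that URI plus each metric's criteria and rating rubric.

# Hypothetical setup; project, bucket, and metric details are placeholders.
import vertexai
from vertexai.evaluation import EvalTask, PointwiseMetric, PointwiseMetricPromptTemplate
from vertexai.generative_models import GenerativeModel

vertexai.init(project="my-project", location="us-central1")

fluency = PointwiseMetric(
    metric="fluency",
    metric_prompt_template=PointwiseMetricPromptTemplate(
        criteria={"fluency": "The response reads naturally and is grammatical."},
        rating_rubric={"1": "Not fluent", "5": "Very fluent"},
    ),
)

eval_task = EvalTask(
    dataset="gs://my-bucket/eval-dataset.csv",   # a string URI is what becomes dataset_uri
    metrics=[fluency],
    output_uri_prefix="gs://my-bucket/eval-results",
)
result = eval_task.evaluate(model=GenerativeModel("gemini-1.5-flash"))
# The summary_metrics.json written under output_uri_prefix now includes
# dataset_uri and a metric_descriptions entry for the "fluency" metric.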

tests/unit/vertexai/test_evaluation.py  (+13)

@@ -2145,6 +2145,8 @@ def test_upload_results(self, mock_storage_blob_from_string):
             _TEST_FILE_NAME,
             "candidate_model",
             "baseline_model",
+            "gs://test-bucket/test-dataset.csv",
+            [_TEST_POINTWISE_METRIC, _TEST_PAIRWISE_METRIC],
         )
 
         mock_storage_blob_from_string.assert_any_call(
@@ -2160,6 +2162,17 @@ def test_upload_results(self, mock_storage_blob_from_string):
                 "summary_metrics": MOCK_EVAL_RESULT.summary_metrics,
                 "candidate_model_name": "candidate_model",
                 "baseline_model_name": "baseline_model",
+                "dataset_uri": "gs://test-bucket/test-dataset.csv",
+                "metric_descriptions": {
+                    "test_pointwise_metric": {
+                        "criteria": _CRITERIA,
+                        "rating_rubric": _POINTWISE_RATING_RUBRIC,
+                    },
+                    "test_pairwise_metric": {
+                        "criteria": _CRITERIA,
+                        "rating_rubric": _PAIRWISE_RATING_RUBRIC,
+                    },
+                },
             },
             mock.ANY,
         )

vertexai/evaluation/eval_task.py  (+7)

@@ -284,6 +284,7 @@ def __init__(
             output_uri_prefix: GCS location to store the metrics_table from
                 evaluation results.
         """
+        self._raw_dataset = dataset
         self._dataset = utils.load_dataset(dataset)
         self._metrics = metrics
         self._experiment = experiment
@@ -481,12 +482,18 @@ def evaluate(
         if isinstance(baseline_model, generative_models.GenerativeModel):
             baseline_model_name = baseline_model._model_name
 
+        dataset_uri = None
+        if isinstance(self._raw_dataset, str):
+            dataset_uri = self._raw_dataset
+
         utils.upload_evaluation_results(
             eval_result,
             self.output_uri_prefix,
             output_file_name,
             candidate_model_name,
             baseline_model_name,
+            dataset_uri,
+            self.metrics,
         )
         return eval_result
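Since utils.load_dataset normalizes the input before it is stored on self._dataset, the original argument has to be kept as self._raw_dataset; a dataset URI is then recorded only when the task was constructed from a string path, not from an in-memory table. A tiny illustration of that check (the DataFrame below is a made-up stand-in):

import pandas as pd

# Mirrors the isinstance(self._raw_dataset, str) check in evaluate().
for raw_dataset in ("gs://test-bucket/test-dataset.csv", pd.DataFrame({"response": ["ok"]})):
    dataset_uri = raw_dataset if isinstance(raw_dataset, str) else None
    print(dataset_uri)  # the GCS path for the string case, None for the DataFrame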

vertexai/evaluation/metrics/_base.py  (+1)

@@ -68,6 +68,7 @@ def __init__(
                 the model-based evaluation. A freeform string is also accepted.
         """
         super().__init__(metric=metric)
+        self._raw_metric_prompt_template = metric_prompt_template
         self.metric_prompt_template = str(metric_prompt_template)

vertexai/evaluation/utils.py  (+28 −1)

@@ -22,7 +22,7 @@
 import tempfile
 import threading
 import time
-from typing import Any, Callable, Dict, Literal, Optional, TYPE_CHECKING, Union
+from typing import Any, Callable, Dict, List, Literal, Optional, TYPE_CHECKING, Union
 
 from google.cloud import bigquery
 from google.cloud import storage
@@ -35,6 +35,10 @@
     evaluation_service as gapic_evaluation_services,
 )
 from vertexai.evaluation import _base as eval_base
+from vertexai.evaluation.metrics import (
+    _base as metrics_base,
+    metric_prompt_template as metric_prompt_template_base,
+)
 
 
 if TYPE_CHECKING:
@@ -286,6 +290,8 @@ def _upload_evaluation_summary_to_gcs(
     upload_gcs_path: str,
     candidate_model_name: Optional[str] = None,
     baseline_model_name: Optional[str] = None,
+    dataset_uri: Optional[str] = None,
+    metrics: Optional[List[Union[str, metrics_base._Metric]]] = None,
 ) -> None:
     """Uploads the evaluation summary to a GCS bucket."""
     summary = {
@@ -295,6 +301,21 @@ def _upload_evaluation_summary_to_gcs(
         summary["candidate_model_name"] = candidate_model_name
     if baseline_model_name:
         summary["baseline_model_name"] = baseline_model_name
+    if dataset_uri:
+        summary["dataset_uri"] = dataset_uri
+
+    if metrics:
+        metric_descriptions = {}
+        for metric in metrics:
+            if isinstance(metric, metrics_base._ModelBasedMetric) and isinstance(
+                metric._raw_metric_prompt_template,
+                metric_prompt_template_base._MetricPromptTemplate,
+            ):
+                metric_descriptions[metric.metric_name] = {
+                    "criteria": metric._raw_metric_prompt_template._criteria,
+                    "rating_rubric": metric._raw_metric_prompt_template._rating_rubric,
+                }
+        summary["metric_descriptions"] = metric_descriptions
 
     with tempfile.TemporaryDirectory() as temp_dir:
         local_summary_path = os.path.join(temp_dir, "summary_metrics.json")
@@ -318,6 +339,8 @@ def upload_evaluation_results(
     file_name: str,
     candidate_model_name: Optional[str] = None,
     baseline_model_name: Optional[str] = None,
+    dataset_uri: Optional[str] = None,
+    metrics: Optional[List[Union[str, metrics_base._Metric]]] = None,
 ) -> None:
     """Uploads eval results to GCS destination.
 
@@ -327,6 +350,8 @@ def upload_evaluation_results(
         file_name: File name to store the metrics table.
         candidate_model_name: Optional. Candidate model name.
         baseline_model_name: Optional. Baseline model name.
+        dataset_uri: Optional. URI pointing to the dataset.
+        metrics: Optional. List of metrics used for evaluation.
     """
     if not destination_uri_prefix:
         _ipython_utils.display_gen_ai_evaluation_results_button()
@@ -346,6 +371,8 @@ def upload_evaluation_results(
             output_folder + "/summary_metrics.json",
             candidate_model_name,
             baseline_model_name,
+            dataset_uri,
+            metrics,
         )
         _ipython_utils.display_gen_ai_evaluation_results_button(
             metrics_table_path.split(_GCS_PREFIX)[1]
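Taken together, the summary_metrics.json written alongside the metrics table would take roughly the following shape. This is an illustrative payload, not output from this commit: the keys follow _upload_evaluation_summary_to_gcs above, the values are invented, and metrics built from a freeform string prompt template are skipped by the isinstance check, so only template-backed metrics appear under metric_descriptions.

# Illustrative summary payload; keys mirror _upload_evaluation_summary_to_gcs,
# values are placeholders.
summary = {
    "summary_metrics": {"row_count": 2, "fluency/mean": 4.5, "fluency/std": 0.5},
    "candidate_model_name": "candidate_model",
    "baseline_model_name": "baseline_model",  # only present for comparisons with a baseline
    "dataset_uri": "gs://test-bucket/test-dataset.csv",
    "metric_descriptions": {
        "fluency": {
            "criteria": {"fluency": "The response reads naturally and is grammatical."},
            "rating_rubric": {"1": "Not fluent", "5": "Very fluent"},
        },
    },
}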
