# -*- coding: utf-8 -*-
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import functools
import io
import json
import os
import tempfile
import threading
import time
from typing import Any, Callable, Dict, Literal, Optional, TYPE_CHECKING, Union

from google.cloud import bigquery
from google.cloud import storage
from google.cloud.aiplatform import base
from google.cloud.aiplatform import compat
from google.cloud.aiplatform import initializer
from google.cloud.aiplatform import utils
from google.cloud.aiplatform.utils import _ipython_utils
from google.cloud.aiplatform_v1.services import (
    evaluation_service as gapic_evaluation_services,
)
from vertexai.evaluation import _base as eval_base

if TYPE_CHECKING:
    import pandas as pd


_BQ_PREFIX = "bq://"
_GCS_PREFIX = "gs://"

_LOGGER = base.Logger(__name__)


class _EvaluationServiceClientWithOverride(utils.ClientWithOverride):
    _is_temporary = False
    _default_version = compat.V1
    _version_map = (
        (
            compat.V1,
            gapic_evaluation_services.EvaluationServiceClient,
        ),
    )


class RateLimiter:
    """Helper class for rate-limiting requests to Vertex AI to improve QoS.

    Attributes:
        seconds_per_event: The time interval (in seconds) between events to
            maintain the desired rate.
        last: The timestamp of the last event.
        _lock: A lock to ensure thread safety.
    """

    def __init__(self, rate: Optional[float] = None):
        """Initializes the rate limiter.

        A simple rate limiter for controlling the frequency of API calls. It
        enforces a minimum interval between consecutive events, equivalent to
        a token bucket with a capacity of one token. It is designed for
        traffic shaping and rate limiting in cases where the batch size
        (number of events per call) is always 1.

        Args:
            rate: The number of queries allowed per second.

        Raises:
            ValueError: If the rate is not positive.
        """
        if not rate or rate <= 0:
            raise ValueError("Rate must be a positive number")
        self.seconds_per_event = 1.0 / rate
        self.last = time.time() - self.seconds_per_event
        self._lock = threading.Lock()

    def _admit(self) -> float:
        """Checks if an event can be admitted; returns the remaining delay."""
        now = time.time()
        time_since_last = now - self.last
        if time_since_last >= self.seconds_per_event:
            self.last = now
            return 0
        return self.seconds_per_event - time_since_last

    def sleep_and_advance(self):
        """Blocks the current thread until the next event can be admitted."""
        with self._lock:
            delay = self._admit()
            if delay > 0:
                time.sleep(delay)
                self.last = time.time()


def rate_limit(rate: Optional[float] = None) -> Callable[[Any], Any]:
    """Decorator version of rate limiter."""

    def _rate_limit(method):
        limiter = RateLimiter(rate)

        @functools.wraps(method)
        def wrapper(*args, **kwargs):
            limiter.sleep_and_advance()
            return method(*args, **kwargs)

        return wrapper

    return _rate_limit
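
# Example (illustrative sketch, not part of this module's API): using the
# `rate_limit` decorator to cap a hypothetical per-row scoring helper at
# 2 queries per second. `score_row` and `rows` are made-up names.
#
#     @rate_limit(rate=2.0)
#     def score_row(row):
#         ...  # one evaluation request per call
#
#     for row in rows:
#         score_row(row)  # sleeps ~0.5s between consecutive calls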


def create_evaluation_service_client(
    api_base_path_override: Optional[str] = None,
) -> _EvaluationServiceClientWithOverride:
    """Creates a client for the evaluation service.

    Args:
        api_base_path_override: Optional. Override the default API base path.

    Returns:
        Instantiated Vertex AI EvaluationServiceClient with optional
        overrides.
    """
    return initializer.global_config.create_client(
        client_class=_EvaluationServiceClientWithOverride,
        location_override=initializer.global_config.location,
        api_base_path_override=api_base_path_override,
    )
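
# Usage sketch (assumes `vertexai.init(project=..., location=...)` has already
# populated the global config; the override value below is illustrative):
#
#     client = create_evaluation_service_client()
#     client = create_evaluation_service_client(
#         api_base_path_override="us-central1-aiplatform.googleapis.com"
#     )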


def load_dataset(
    source: Union[str, "pd.DataFrame", Dict[str, Any]],
) -> "pd.DataFrame":
    """Loads dataset from various sources into a DataFrame.

    Args:
        source: The dataset source. Supports the following dataset formats:
            * pandas.DataFrame: Used directly for evaluation.
            * Dict: Converted to a pandas DataFrame before evaluation.
            * str: Interpreted as a file path or URI. Supported formats include:
                * Local JSONL or CSV files: Loaded from the local filesystem.
                * GCS JSONL or CSV files: Loaded from Google Cloud Storage
                    (e.g., 'gs://bucket/data.csv').
                * BigQuery table URI: Loaded from Google Cloud BigQuery
                    (e.g., 'bq://project-id.dataset.table_name').

    Returns:
        The dataset in pandas DataFrame format.
    """
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            'Pandas is not installed. Please install the SDK using "pip install'
            ' google-cloud-aiplatform[evaluation]"'
        )
    if isinstance(source, pd.DataFrame):
        return source.copy()
    elif isinstance(source, dict):
        return pd.DataFrame(source)
    elif isinstance(source, str):
        if source.startswith(_BQ_PREFIX):
            return _load_bigquery(source[len(_BQ_PREFIX) :])

        _, extension = os.path.splitext(source)
        file_type = extension.lower()[1:]
        if file_type == "jsonl":
            return _load_jsonl(source)
        elif file_type == "csv":
            return _load_csv(source)
        else:
            raise ValueError(
                f"Unsupported file type: {file_type} from {source}. Please"
                " provide a valid GCS path with `jsonl` or `csv` suffix or a valid"
                " BigQuery table URI."
            )
    else:
        raise TypeError(
            "Unsupported dataset type. Must be a `pd.DataFrame`, Python dictionary,"
            " valid GCS path with `jsonl` or `csv` suffix or a valid BigQuery"
            " table URI."
        )
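
# Usage sketch: each `source` form below routes to a different loader. The
# bucket, file, and table names are placeholders.
#
#     df = load_dataset({"prompt": ["Say hi"], "response": ["Hi!"]})
#     df = load_dataset("data/eval_data.csv")
#     df = load_dataset("gs://my-bucket/eval_data.jsonl")
#     df = load_dataset("bq://my-project.my_dataset.my_table")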


def _load_jsonl(filepath: str) -> "pd.DataFrame":
    """Loads data from a JSONL file into a DataFrame."""
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            'Pandas is not installed. Please install the SDK using "pip install'
            ' google-cloud-aiplatform[evaluation]"'
        )
    if filepath.startswith(_GCS_PREFIX):
        file_contents = _read_gcs_file_contents(filepath)
        return pd.read_json(file_contents, lines=True)
    else:
        with open(filepath, "r") as f:
            return pd.read_json(f, lines=True)


def _load_csv(filepath: str) -> "pd.DataFrame":
    """Loads data from a CSV file into a DataFrame."""
    try:
        import pandas as pd
    except ImportError:
        raise ImportError(
            'Pandas is not installed. Please install the SDK using "pip install'
            ' google-cloud-aiplatform[evaluation]"'
        )
    if filepath.startswith(_GCS_PREFIX):
        file_contents = _read_gcs_file_contents(filepath)
        return pd.read_csv(io.StringIO(file_contents), encoding="utf-8")
    else:
        return pd.read_csv(filepath, encoding="utf-8")


def _load_bigquery(table_id: str) -> "pd.DataFrame":
    """Loads data from a BigQuery table into a DataFrame."""
    bigquery_client = bigquery.Client(project=initializer.global_config.project)
    table = bigquery_client.get_table(table_id)
    return bigquery_client.list_rows(table).to_dataframe()


def _read_gcs_file_contents(filepath: str) -> str:
    """Reads the contents of a file from Google Cloud Storage.

    Args:
        filepath: The GCS file path (e.g., 'gs://bucket_name/file.csv').

    Returns:
        str: The contents of the file.
    """
    storage_client = storage.Client(
        project=initializer.global_config.project,
        credentials=initializer.global_config.credentials,
    )
    bucket_name, blob_path = filepath[len(_GCS_PREFIX) :].split("/", 1)
    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(blob_path)
    return blob.download_as_string().decode("utf-8")


def _upload_pandas_df_to_gcs(
    df: "pd.DataFrame", upload_gcs_path: str, file_type: Literal["csv", "jsonl"]
) -> None:
    """Uploads the provided Pandas DataFrame to a GCS bucket.

    Args:
        df: The Pandas DataFrame to upload.
        upload_gcs_path: The GCS path to upload the data file.
        file_type: The file type of the data file.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        if file_type == "csv":
            local_dataset_path = os.path.join(temp_dir, "metrics_table.csv")
            df.to_csv(path_or_buf=local_dataset_path)
        elif file_type == "jsonl":
            local_dataset_path = os.path.join(temp_dir, "metrics_table.jsonl")
            df.to_json(path_or_buf=local_dataset_path, orient="records", lines=True)
        else:
            raise ValueError(
                f"Unsupported file type: {file_type} from {upload_gcs_path}."
                " Please provide a valid GCS path with `jsonl` or `csv` suffix."
            )
        _upload_file_to_gcs(upload_gcs_path, local_dataset_path)


def _upload_evaluation_summary_to_gcs(
    summary_metrics: Dict[str, float],
    upload_gcs_path: str,
    candidate_model_name: Optional[str] = None,
    baseline_model_name: Optional[str] = None,
) -> None:
    """Uploads the evaluation summary to a GCS bucket."""
    summary = {
        "summary_metrics": summary_metrics,
    }
    if candidate_model_name:
        summary["candidate_model_name"] = candidate_model_name
    if baseline_model_name:
        summary["baseline_model_name"] = baseline_model_name
    with tempfile.TemporaryDirectory() as temp_dir:
        local_summary_path = os.path.join(temp_dir, "summary_metrics.json")
        # Write via a context manager so the file handle is closed before upload.
        with open(local_summary_path, "w") as f:
            json.dump(summary, f)
        _upload_file_to_gcs(upload_gcs_path, local_summary_path)


def _upload_file_to_gcs(upload_gcs_path: str, filename: str) -> None:
    """Uploads a local file to the given GCS URI."""
    storage_client = storage.Client(
        project=initializer.global_config.project,
        credentials=initializer.global_config.credentials,
    )
    storage.Blob.from_string(
        uri=upload_gcs_path, client=storage_client
    ).upload_from_filename(filename)


def upload_evaluation_results(
    eval_result: eval_base.EvalResult,
    destination_uri_prefix: str,
    file_name: str,
    candidate_model_name: Optional[str] = None,
    baseline_model_name: Optional[str] = None,
) -> None:
    """Uploads eval results to a GCS destination.

    Args:
        eval_result: Eval results to upload.
        destination_uri_prefix: GCS folder to store the data.
        file_name: File name to store the metrics table.
        candidate_model_name: Optional. Candidate model name.
        baseline_model_name: Optional. Baseline model name.
    """
    if not destination_uri_prefix:
        _ipython_utils.display_gen_ai_evaluation_results_button()
        return
    if eval_result.metrics_table is None:
        return
    if destination_uri_prefix.startswith(_GCS_PREFIX):
        base_name, extension = os.path.splitext(file_name)
        file_type = extension.lower()[1:]
        output_folder = destination_uri_prefix + "/" + base_name
        metrics_table_path = output_folder + "/" + file_name
        _upload_pandas_df_to_gcs(
            eval_result.metrics_table, metrics_table_path, file_type
        )
        _upload_evaluation_summary_to_gcs(
            eval_result.summary_metrics,
            output_folder + "/summary_metrics.json",
            candidate_model_name,
            baseline_model_name,
        )
        _ipython_utils.display_gen_ai_evaluation_results_button(
            metrics_table_path.split(_GCS_PREFIX)[1]
        )
    else:
        raise ValueError(
            f"Unsupported destination URI: {destination_uri_prefix}."
            f" Please provide a valid GCS bucket URI prefix starting with"
            f" {_GCS_PREFIX}."
        )
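
# Usage sketch: persisting an EvalResult under a GCS prefix. The bucket name
# and file name are placeholders; `result` would come from an evaluation run.
#
#     upload_evaluation_results(
#         eval_result=result,
#         destination_uri_prefix="gs://my-bucket/eval_runs",
#         file_name="metrics_table.csv",
#         candidate_model_name="candidate-model",
#     )
#
# This writes gs://my-bucket/eval_runs/metrics_table/metrics_table.csv plus a
# summary_metrics.json in the same folder.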


def initialize_metric_column_mapping(
    metric_column_mapping: Optional[Dict[str, str]], dataset: "pd.DataFrame"
) -> Dict[str, str]:
    """Initializes the metric column mapping with dataset columns."""
    initialized_metric_column_mapping = {}
    for column in dataset.columns:
        initialized_metric_column_mapping[column] = column
    if metric_column_mapping:
        for key, value in metric_column_mapping.items():
            if key in initialized_metric_column_mapping:
                _LOGGER.warning(
                    f"Cannot override `{key}` column with `{key}:{value}` mapping"
                    f" because `{key}` column is present in the evaluation"
                    " dataset. `metric_column_mapping` cannot override keys"
                    " that are already in evaluation dataset columns."
                )
            else:
                initialized_metric_column_mapping[key] = value
    return initialized_metric_column_mapping
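
# Behavior sketch: dataset columns always win over user-supplied mappings.
# For a dataset with columns ["prompt", "response"]:
#
#     initialize_metric_column_mapping(
#         {"response": "model_output", "reference": "golden"}, dataset
#     )
#     # -> {"prompt": "prompt", "response": "response", "reference": "golden"}
#     # The "response" override is ignored and a warning is logged.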