googleapis · rey-esp · Jan 27, 2025 · Jan 27, 2025 · Jan 27, 2025 · Jan 28, 2025
@@ -19,6 +19,7 @@
 
 from typing import List, Literal, Optional, Union
 
+import bigframes_vendored.sklearn.decomposition._ml
 import bigframes_vendored.sklearn.decomposition._pca
 from google.cloud import bigquery
 
@@ -197,3 +198,177 @@ def score(
 
         # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE.
         return self._bqml_model.evaluate()
+
+
+@log_adapter.class_logger
+class MF(
+    base.UnsupervisedTrainablePredictor,
+    bigframes_vendored.sklearn.decomposition._mf.MF,
+):
+    # The user-facing documentation lives on the vendored base class.
+    __doc__ = bigframes_vendored.sklearn.decomposition._mf.MF.__doc__
+
+    def __init__(
+        self,
+        n_components: Optional[Union[int, float]] = None,
+        *,
+        num_factors: int,
+        user_col: str,
+        item_col: str,
+        l2_reg: float = 1.0,
+    ):
+        # ``n_components`` is retained so existing callers keep working, but it
+        # is not translated into a BigQuery ML option (see ``_bqml_options``).
+        self.n_components = n_components
+        self.num_factors = num_factors
+        self.user_col = user_col
+        self.item_col = item_col
+        self.l2_reg = l2_reg
+        # Populated by ``_fit`` or ``_from_bq``; ``None`` means "not fitted".
+        self._bqml_model: Optional[core.BqmlModel] = None
+        self._bqml_model_factory = globals.bqml_model_factory()
+
+    @classmethod
+    def _from_bq(
+        cls, session: bigframes.session.Session, bq_model: bigquery.Model
+    ) -> MF:
+        """Build an estimator that wraps an already-trained BigQuery model.
+
+        Args:
+            session: Session used to run later queries against the model.
+            bq_model: The BigQuery model resource to wrap.
+
+        Returns:
+            MF: Estimator whose ``_bqml_model`` points at ``bq_model``.
+        """
+        # BigQuery reports matrix factorization models with this type name.
+        assert bq_model.model_type == "MATRIX_FACTORIZATION"
+
+        kwargs = utils.retrieve_params_from_bq_model(
+            cls, bq_model, _BQML_PARAMS_MAPPING
+        )
+
+        # Recover the required constructor arguments from the last training
+        # run. Key names follow the BigQuery REST ``TrainingOptions`` resource.
+        # NOTE(review): confirm the keys against a trained model's metadata.
+        last_fitting = bq_model.training_runs[-1]["trainingOptions"]
+        for bq_key, arg_name, convert in (
+            ("numFactors", "num_factors", int),
+            ("userColumn", "user_col", str),
+            ("itemColumn", "item_col", str),
+            ("l2Regularization", "l2_reg", float),
+        ):
+            if bq_key in last_fitting:
+                kwargs[arg_name] = convert(last_fitting[bq_key])
+
+        model = cls(**kwargs)
+        model._bqml_model = core.BqmlModel(session, bq_model)
+        return model
+
+    @property
+    def _bqml_options(self) -> dict:
+        """The model options as they will be set for BQML"""
+        # Only matrix factorization options are emitted; the PCA options that
+        # ``n_components`` used to map to are not valid for this model type.
+        options: dict = {
+            "model_type": "matrix_factorization",
+            "num_factors": self.num_factors,
+            "user_col": self.user_col,
+            "item_col": self.item_col,
+            "l2_reg": self.l2_reg,
+        }
+
+        return options
+
+    def _fit(
+        self,
+        X: utils.ArrayType,
+        y=None,
+        transforms: Optional[List[str]] = None,
+    ) -> MF:
+        """Train a BigQuery ML model on ``X`` and remember it on the estimator.
+
+        Args:
+            X: Training data, converted to a BigQuery DataFrame before use.
+            y: Ignored.
+            transforms: Optional transform expressions handed to the model
+                factory unchanged.
+
+        Returns:
+            MF: ``self``, now holding the trained model.
+        """
+        (X,) = utils.batch_convert_to_dataframe(X)
+
+        self._bqml_model = self._bqml_model_factory.create_model(
+            X_train=X,
+            transforms=transforms,
+            options=self._bqml_options,
+        )
+        return self
+
+    # NOTE(review): the three properties below and ``detect_anomalies`` were
+    # carried over from the PCA estimator and are kept so the class interface
+    # is unchanged. They presumably do not apply to a matrix factorization
+    # model -- confirm and drop or replace them.
+    @property
+    def components_(self) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before calling components_.")
+
+        return self._bqml_model.principal_components()
+
+    @property
+    def explained_variance_(self) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError(
+                "A model must be fitted before calling explained_variance_."
+            )
+
+        return self._bqml_model.principal_component_info()[
+            ["principal_component_id", "eigenvalue"]
+        ].rename(columns={"eigenvalue": "explained_variance"})
+
+    @property
+    def explained_variance_ratio_(self) -> bpd.DataFrame:
+        if not self._bqml_model:
+            raise RuntimeError(
+                "A model must be fitted before calling explained_variance_ratio_."
+            )
+
+        return self._bqml_model.principal_component_info()[
+            ["principal_component_id", "explained_variance_ratio"]
+        ]
+
+    def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
+        # NOTE(review): this delegates to the generic ``predict`` of the
+        # wrapped model; BigQuery ML documents ML.RECOMMEND for matrix
+        # factorization models -- confirm which call is intended here.
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before predict")
+
+        # Convert with the model's own session so both live in one session.
+        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
+
+        return self._bqml_model.predict(X)
+
+    def detect_anomalies(
+        self,
+        X: utils.ArrayType,
+        *,
+        contamination: float = 0.1,
+    ) -> bpd.DataFrame:
+        """Detect the anomaly data points of the input.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
+                Series or a DataFrame to detect anomalies.
+            contamination (float, default 0.1):
+                Identifies the proportion of anomalies in the training dataset that are used to create the model.
+                The value must be in the range [0, 0.5].
+
+        Returns:
+            bigframes.dataframe.DataFrame: detected DataFrame."""
+        if contamination < 0.0 or contamination > 0.5:
+            raise ValueError(
+                f"contamination must be [0.0, 0.5], but is {contamination}."
+            )
+
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before detect_anomalies")
+
+        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)
+
+        return self._bqml_model.detect_anomalies(
+            X, options={"contamination": contamination}
+        )
+
+    def to_gbq(self, model_name: str, replace: bool = False) -> MF:
+        """Save the model to BigQuery.
+
+        Args:
+            model_name (str):
+                The name of the model.
+            replace (bool, default False):
+                Determine whether to replace if the model already exists. Default to False.
+
+        Returns:
+            MF: Saved model."""
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before it can be saved")
+
+        new_model = self._bqml_model.copy(model_name, replace)
+        return new_model.session.read_gbq_model(model_name)
+
+    def score(
+        self,
+        X=None,
+        y=None,
+    ) -> bpd.DataFrame:
+        """Return the evaluation metrics of the fitted model.
+
+        Both ``X`` and ``y`` are accepted for API compatibility and ignored.
+        """
+        if not self._bqml_model:
+            raise RuntimeError("A model must be fitted before score")
+
+        # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE.
+        return self._bqml_model.evaluate()
@@ -42,6 +42,7 @@
         "LINEAR_REGRESSION": linear_model.LinearRegression,
         "LOGISTIC_REGRESSION": linear_model.LogisticRegression,
         "KMEANS": cluster.KMeans,
+        "MF": decomposition.MF,
         "PCA": decomposition.PCA,
         "BOOSTED_TREE_REGRESSOR": ensemble.XGBRegressor,
         "BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier,
@@ -82,6 +83,7 @@
 def from_bq(
     session: bigframes.session.Session, bq_model: bigquery.Model
 ) -> Union[
+    decomposition.MF,
     decomposition.PCA,
     cluster.KMeans,
     linear_model.LinearRegression,

@@ -0,0 +1,97 @@
+""" Matrix Factorization.
+"""
+
+# Author: Alexandre Gramfort <[email protected]>
+#         Olivier Grisel <[email protected]>
+#         Mathieu Blondel <[email protected]>
+#         Denis A. Engemann <[email protected]>
+#         Michael Eickenberg <[email protected]>
+#         Giorgio Patrini <[email protected]>
+#
+# License: BSD 3 clause
+
+from abc import ABCMeta
+
+from bigframes_vendored.sklearn.base import BaseEstimator
+
+from bigframes import constants
+
+
+class MF(BaseEstimator, metaclass=ABCMeta):
+    """Matrix Factorization (MF).
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> from bigframes.ml.decomposition import MF
+        >>> X = bpd.DataFrame({
+        ...     "user": [0, 1, 2, 3, 4, 5, 6],
+        ...     "item": [0, 1, 0, 1, 0, 1, 0],
+        ...     "rating": [1.0, 2.0, 3.0, 4.0, 5.0, 4.0, 3.0],
+        ... })
+        >>> model = MF(num_factors=2, user_col="user", item_col="item", l2_reg=1.0)
+        >>> model = model.fit(X)
+
+    Args:
+        n_components (int, float or None, default None):
+            Retained from the PCA-derived constructor signature. It does not
+            correspond to any BigQuery ML matrix factorization training
+            option; use ``num_factors`` to control the size of the
+            factorization.
+        num_factors (int):
+            Specifies the number of latent factors to use. BigQuery ML accepts
+            an INT64 value between 2 and 200.
+        user_col (str):
+            The user column name.
+        item_col (str):
+            The item column name.
+        l2_reg (float):
+            The amount of L2 regularization applied, as a FLOAT64 value.
+            BigQuery ML's own default for this option is 1.0. The SQL-only
+            hyperparameter tuning forms (``HPARAM_RANGE``,
+            ``HPARAM_CANDIDATES``) cannot be expressed through this argument.
+    """
+
+    def fit(self, X, y=None):
+        """Fit the model according to the given training data.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
+                Training data. For a matrix factorization model it is expected
+                to contain the columns named by ``user_col`` and ``item_col``
+                together with a rating column.
+
+            y (default None):
+                Ignored.
+
+        Returns:
+            MF: Fitted estimator.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def score(self, X=None, y=None):
+        """Calculate evaluation metrics of the model.
+
+        .. note::
+
+            Output matches that of the BigQuery ML.EVALUATE function.
+            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#matrix_factorization_models
+            for the outputs relevant to this model type.
+
+        Args:
+            X (default None):
+                Ignored.
+
+            y (default None):
+                Ignored.
+
+        Returns:
+            bigframes.dataframe.DataFrame: DataFrame that represents model metrics.
+        """
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
+
+    def predict(self, X):
+        """Generate predictions for each row in X.
+
+        Args:
+            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
+                Series or a DataFrame to predict.
+
+        Returns:
+            bigframes.dataframe.DataFrame: Predicted DataFrames."""
+        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)