Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add support for creating a Matrix Factorization model #1330

Open
wants to merge 39 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
1d39560
feat: add support for creating a Matrix Factorization model
rey-esp Jan 27, 2025
e19c262
feat: add support for creating a Matrix Factorization model
rey-esp Jan 27, 2025
1bef4a2
feat: add support for creating a Matrix Factorization model
rey-esp Jan 27, 2025
d157cd7
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Jan 28, 2025
e336bde
Update bigframes/ml/decomposition.py
rey-esp Jan 28, 2025
d5f713a
Update bigframes/ml/decomposition.py
rey-esp Jan 28, 2025
5e3e443
Update bigframes/ml/decomposition.py
rey-esp Jan 28, 2025
34a60bc
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Jan 28, 2025
c116e8a
rating_col
rey-esp Jan 28, 2025
dedef39
(nearly) complete class
rey-esp Jan 28, 2025
e5165a9
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Jan 28, 2025
05eb854
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Jan 28, 2025
2787178
removem print()
rey-esp Jan 28, 2025
8c66e07
removem print()
rey-esp Jan 28, 2025
086b4dd
adding recommend
rey-esp Jan 29, 2025
8ed3ccd
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Jan 29, 2025
1b4eef9
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Jan 29, 2025
7c371ac
remove hyper parameter runing references
rey-esp Jan 30, 2025
7498c8c
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Jan 30, 2025
55ef06a
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Jan 30, 2025
29805b5
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 4, 2025
8de384a
swap predict in _mf for recommend
rey-esp Feb 4, 2025
647532b
recommend -> predict
rey-esp Feb 4, 2025
b340c4f
update predict doc string
rey-esp Feb 4, 2025
580de41
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 4, 2025
29ee357
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 5, 2025
bac2ece
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 6, 2025
3f22c23
Merge branch 'b338873783-matrix-factorization' of github.com:googleap…
rey-esp Feb 6, 2025
213f11d
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 6, 2025
aaf0d1f
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 10, 2025
4c90c1d
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 10, 2025
792bd64
Merge branch 'b338873783-matrix-factorization' of github.com:googleap…
rey-esp Feb 10, 2025
ed279be
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 11, 2025
ba5beb3
preparing test files
rey-esp Feb 12, 2025
86fb956
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 12, 2025
a29bbcf
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 13, 2025
8577833
add test data
rey-esp Feb 13, 2025
a92007c
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 19, 2025
a808429
Merge branch 'main' into b338873783-matrix-factorization
rey-esp Feb 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
175 changes: 175 additions & 0 deletions bigframes/ml/decomposition.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from typing import List, Literal, Optional, Union

import bigframes_vendored.sklearn.decomposition._mf
import bigframes_vendored.sklearn.decomposition._pca
from google.cloud import bigquery

Expand Down Expand Up @@ -197,3 +198,177 @@ def score(

# TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE.
return self._bqml_model.evaluate()


@log_adapter.class_logger
class MF(
    base.UnsupervisedTrainablePredictor,
    bigframes_vendored.sklearn.decomposition._mf.MF,
):
    # The user-facing documentation lives on the vendored base class.
    __doc__ = bigframes_vendored.sklearn.decomposition._mf.MF.__doc__

    def __init__(
        self,
        n_components: Optional[Union[int, float]] = None,
        *,
        num_factors: int,
        user_col: str,
        item_col: str,
        l2_reg: float = 1.0,
    ):
        """Store the hyperparameters; no BigQuery work happens here.

        Args:
            n_components: Kept in the signature so existing callers keep
                working. It is stored on the instance but is not turned into
                a BQML option, because the principal-component options it
                used to map to belong to PCA models.
            num_factors: Number of latent factors to use.
            user_col: The user column name.
            item_col: The item column name.
            l2_reg: L2 regularization value. Defaults to 1.0, the default
                stated in the class documentation.
        """
        self.n_components = n_components
        self.num_factors = num_factors
        self.user_col = user_col
        self.item_col = item_col
        self.l2_reg = l2_reg
        self._bqml_model: Optional[core.BqmlModel] = None
        self._bqml_model_factory = globals.bqml_model_factory()

    @classmethod
    def _from_bq(
        cls, session: bigframes.session.Session, bq_model: bigquery.Model
    ) -> MF:
        """Rebuild an estimator from an existing BigQuery ML model.

        Args:
            session: Session the restored model is bound to.
            bq_model: The BigQuery model resource to wrap.

        Returns:
            MF: An estimator whose constructor arguments come from the
            training options of the model's last training run.
        """
        # BigQuery names this model type MATRIX_FACTORIZATION. "MF" stays
        # accepted because the loader mapping in this change registers the
        # class under that key.
        assert bq_model.model_type in ("MF", "MATRIX_FACTORIZATION")

        # NOTE(review): _BQML_PARAMS_MAPPING is defined outside this block;
        # confirm it holds no entries that are invalid for this class, and that
        # the helper copes with a constructor that has required keyword-only
        # arguments.
        kwargs = utils.retrieve_params_from_bq_model(
            cls, bq_model, _BQML_PARAMS_MAPPING
        )

        last_fitting = bq_model.training_runs[-1]["trainingOptions"]
        # (constructor argument, trainingOptions field, Python type). The REST
        # API may serialize integers as strings, hence the explicit casts.
        mf_options = (
            ("num_factors", "numFactors", int),
            ("user_col", "userColumn", str),
            ("item_col", "itemColumn", str),
            ("l2_reg", "l2Regularization", float),
        )
        for param, rest_field, cast in mf_options:
            if rest_field in last_fitting:
                kwargs[param] = cast(last_fitting[rest_field])

        model = cls(**kwargs)
        model._bqml_model = core.BqmlModel(session, bq_model)
        return model

    @property
    def _bqml_options(self) -> dict:
        """The model options as they will be set for BQML"""
        # Every constructor hyperparameter is forwarded; previously they were
        # stored but never reached the CREATE MODEL statement.
        options: dict = {
            "model_type": "matrix_factorization",
            "num_factors": self.num_factors,
            "user_col": self.user_col,
            "item_col": self.item_col,
            "l2_reg": self.l2_reg,
        }

        return options

    def _fit(
        self,
        X: utils.ArrayType,
        y=None,
        transforms: Optional[List[str]] = None,
    ) -> MF:
        """Train a BQML model on X and keep a handle to it.

        Args:
            X: Training data; converted to a DataFrame before training.
            y: Ignored.
            transforms: Optional transforms passed through to model creation.

        Returns:
            MF: This estimator, now backed by a trained model.
        """
        (X,) = utils.batch_convert_to_dataframe(X)

        self._bqml_model = self._bqml_model_factory.create_model(
            X_train=X,
            transforms=transforms,
            options=self._bqml_options,
        )
        return self

    # NOTE(review): the three properties below query principal-component
    # information and were carried over unchanged; confirm that BQML serves
    # them for matrix factorization models or drop them before release.
    @property
    def components_(self) -> bpd.DataFrame:
        if not self._bqml_model:
            raise RuntimeError("A model must be fitted before calling components_.")

        return self._bqml_model.principal_components()

    @property
    def explained_variance_(self) -> bpd.DataFrame:
        if not self._bqml_model:
            raise RuntimeError(
                "A model must be fitted before calling explained_variance_."
            )

        return self._bqml_model.principal_component_info()[
            ["principal_component_id", "eigenvalue"]
        ].rename(columns={"eigenvalue": "explained_variance"})

    @property
    def explained_variance_ratio_(self) -> bpd.DataFrame:
        if not self._bqml_model:
            raise RuntimeError(
                "A model must be fitted before calling explained_variance_ratio_."
            )

        return self._bqml_model.principal_component_info()[
            ["principal_component_id", "explained_variance_ratio"]
        ]

    def predict(self, X: utils.ArrayType) -> bpd.DataFrame:
        """Run the fitted model on X.

        Args:
            X: Input data; converted to a DataFrame in the model's session.

        Returns:
            bigframes.dataframe.DataFrame: Output of the underlying BQML model.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        if not self._bqml_model:
            raise RuntimeError("A model must be fitted before predict")

        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)

        # NOTE(review): BQML documents ML.RECOMMEND for matrix factorization
        # models; confirm that BqmlModel.predict is the intended call here.
        return self._bqml_model.predict(X)

    def detect_anomalies(
        self,
        X: utils.ArrayType,
        *,
        contamination: float = 0.1,
    ) -> bpd.DataFrame:
        """Detect the anomaly data points of the input.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series):
                Series or a DataFrame to detect anomalies.
            contamination (float, default 0.1):
                Identifies the proportion of anomalies in the training dataset that are used to create the model.
                The value must be in the range [0, 0.5].

        Returns:
            bigframes.dataframe.DataFrame: detected DataFrame.

        Raises:
            ValueError: If contamination is outside [0.0, 0.5].
            RuntimeError: If the model has not been fitted.
        """
        # NOTE(review): carried over unchanged; confirm anomaly detection is
        # available for matrix factorization models.
        if contamination < 0.0 or contamination > 0.5:
            raise ValueError(
                f"contamination must be [0.0, 0.5], but is {contamination}."
            )

        if not self._bqml_model:
            raise RuntimeError("A model must be fitted before detect_anomalies")

        (X,) = utils.batch_convert_to_dataframe(X, session=self._bqml_model.session)

        return self._bqml_model.detect_anomalies(
            X, options={"contamination": contamination}
        )

    def to_gbq(self, model_name: str, replace: bool = False) -> MF:
        """Save the model to BigQuery.

        Args:
            model_name (str):
                The name of the model.
            replace (bool, default False):
                Determine whether to replace if the model already exists. Default to False.

        Returns:
            MF: Saved model.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        if not self._bqml_model:
            raise RuntimeError("A model must be fitted before it can be saved")

        new_model = self._bqml_model.copy(model_name, replace)
        return new_model.session.read_gbq_model(model_name)

    def score(
        self,
        X=None,
        y=None,
    ) -> bpd.DataFrame:
        """Return the evaluation metrics of the fitted model.

        Args:
            X: Ignored.
            y: Ignored.

        Returns:
            bigframes.dataframe.DataFrame: Result of evaluating the model.

        Raises:
            RuntimeError: If the model has not been fitted.
        """
        if not self._bqml_model:
            raise RuntimeError("A model must be fitted before score")

        # TODO(b/291973741): X param is ignored. Update BQML supports input in ML.EVALUATE.
        return self._bqml_model.evaluate()
2 changes: 2 additions & 0 deletions bigframes/ml/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
"LINEAR_REGRESSION": linear_model.LinearRegression,
"LOGISTIC_REGRESSION": linear_model.LogisticRegression,
"KMEANS": cluster.KMeans,
"MF": decomposition.MF,
"PCA": decomposition.PCA,
"BOOSTED_TREE_REGRESSOR": ensemble.XGBRegressor,
"BOOSTED_TREE_CLASSIFIER": ensemble.XGBClassifier,
Expand Down Expand Up @@ -82,6 +83,7 @@
def from_bq(
session: bigframes.session.Session, bq_model: bigquery.Model
) -> Union[
decomposition.MF,
decomposition.PCA,
cluster.KMeans,
linear_model.LinearRegression,
Expand Down
97 changes: 97 additions & 0 deletions third_party/bigframes_vendored/sklearn/decomposition/_mf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
""" Matrix Factorization.
"""

# Author: Alexandre Gramfort <[email protected]>
# Olivier Grisel <[email protected]>
# Mathieu Blondel <[email protected]>
# Denis A. Engemann <[email protected]>
# Michael Eickenberg <[email protected]>
# Giorgio Patrini <[email protected]>
#
# License: BSD 3 clause

from abc import ABCMeta

from bigframes_vendored.sklearn.base import BaseEstimator

from bigframes import constants


class MF(BaseEstimator, metaclass=ABCMeta):
    """Matrix Factorization (MF).

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> from bigframes.ml.decomposition import MF
        >>> X = bpd.DataFrame({
        ...     "user_id": [0, 0, 1, 1, 2, 2],
        ...     "item_id": [0, 1, 0, 1, 0, 1],
        ...     "rating": [1.0, 2.0, 3.0, 1.2, 5.0, 0.8],
        ... })
        >>> model = MF(num_factors=2, user_col="user_id", item_col="item_id", l2_reg=1.0)
        >>> model = model.fit(X)

    Args:
        n_components (int, float or None, default None):
            Number of components to keep. If n_components is not set, all
            components are kept, n_components = min(n_samples, n_features).
            If 0 < n_components < 1, select the number of components such that the amount of variance that needs to be explained is greater than the percentage specified by n_components.
        num_factors (int):
            Specifies the number of latent factors to use.
            An INT64 value between 2 and 200.
        user_col (str):
            The user column name.
        item_col (str):
            The item column name.
        l2_reg (float):
            The L2 regularization value, as a FLOAT64.
            BigQuery ML's own default for this option is 1.0.
    """

    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
                Series or DataFrame of shape (n_samples, n_features). Training vector,
                where `n_samples` is the number of samples and `n_features` is
                the number of features.

            y (default None):
                Ignored.

        Returns:
            MF: Fitted estimator.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    def score(self, X=None, y=None):
        """Calculate evaluation metrics of the model.

        .. note::

            Output matches that of the BigQuery ML.EVALUATE function.
            See: https://cloud.google.com/bigquery/docs/reference/standard-sql/bigqueryml-syntax-evaluate#matrix_factorization_models
            for the outputs relevant to this model type.

        Args:
            X (default None):
                Ignored.

            y (default None):
                Ignored.

        Returns:
            bigframes.dataframe.DataFrame: DataFrame that represents model metrics.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)

    def predict(self, X):
        """Generate predictions for X with the fitted matrix factorization model.

        Args:
            X (bigframes.dataframe.DataFrame or bigframes.series.Series or pandas.core.frame.DataFrame or pandas.core.series.Series):
                Series or a DataFrame to predict.

        Returns:
            bigframes.dataframe.DataFrame: Predicted DataFrames.
        """
        raise NotImplementedError(constants.ABSTRACT_METHOD_ERROR_MESSAGE)
Loading