online-ml · MaxHalford · Nov 12, 2024 · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024
diff --git a/.github/actions/install-env/action.yml b/.github/actions/install-env/action.yml
@@ -26,7 +26,7 @@ runs:
       uses: actions/cache@v4
       with:
         path: ~/.local # the path depends on the OS
-        key: poetry-2 # increment to reset cache
+        key: poetry-4 # increment to reset cache
 
     - name: Install poetry
       uses: snok/install-poetry@v1

diff --git a/build.py b/build.py
@@ -1,36 +1,23 @@
 import platform
-from distutils.command.build_ext import build_ext
-from distutils.errors import CCompilerError, DistutilsExecError, DistutilsPlatformError
+
+import numpy
 import setuptools
+from Cython.Build import cythonize
+from setuptools.command.build_ext import build_ext
+from setuptools.errors import CCompilerError
 from setuptools_rust import Binding, RustExtension
 
-try:
-    from numpy import __version__ as numpy_version
-    from numpy import get_include
-except ImportError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy"])
-    from numpy import __version__ as numpy_version
-    from numpy import get_include
-
-try:
-    from Cython.Build import cythonize
-except ImportError:
-    subprocess.check_call([sys.executable, "-m", "pip", "install", "Cython"])
-    from Cython.Build import cythonize  # type: ignore
-
-
 ext_modules = cythonize(
     module_list=[
         setuptools.Extension(
             "*",
-            sources=["**/*.pyx"],
-            include_dirs=[get_include()],
+            sources=["river/**/*.pyx"],
+            include_dirs=[numpy.get_include()],
             libraries=[] if platform.system() == "Windows" else ["m"],
             define_macros=[("NPY_NO_DEPRECATED_API", "NPY_1_7_API_VERSION")],
         )
     ],
     compiler_directives={
-        "language_level": 3,
         "binding": True,
         "embedsignature": True,
     },
@@ -47,13 +34,13 @@ class ExtBuilder(build_ext):
     def run(self):
         try:
             build_ext.run(self)
-        except (DistutilsPlatformError, FileNotFoundError):
+        except (FileNotFoundError):
             raise BuildFailed("File not found. Could not compile C extension.")
 
     def build_extension(self, ext):
         try:
             build_ext.build_extension(self, ext)
-        except (CCompilerError, DistutilsExecError, DistutilsPlatformError, ValueError):
+        except (CCompilerError, ValueError):
             raise BuildFailed("Could not compile C extension.")
 
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,5 +1,11 @@
 [build-system]
-requires = ["poetry-core>=1.0.0", "cython", "numpy", "setuptools", "wheel", "setuptools-rust"]
+requires = [
+    "poetry-core>=1.0.0",
+    "cython>3",
+    "numpy>=2.0.0",
+    "setuptools>70.1.0",
+    "setuptools-rust",
+]
 build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
@@ -20,7 +26,7 @@ include = [
     "river/datasets/*.zip",
     "river/stream/*.zip",
     "Cargo.toml",
-    "rust_src/**/*"
+    "rust_src/**/*",
 ]
 
 [tool.poetry.build]
@@ -29,34 +35,34 @@ script = "build.py"
 
 [tool.poetry.dependencies]
 python = "^3.9"
-numpy = "^1.23.0"
-scipy = "^1.12.1"
-pandas = "^2.1"
+numpy = ">=1.23.0"
+scipy = "^1.13.1"
+pandas = "^2.2.3"
 
 [tool.poetry.group.dev.dependencies]
 graphviz = "^0.20.1"
 gymnasium = "^0.29.0"
-matplotlib = "^3.0.2"
+matplotlib = "^3.8.4"
 mypy = "^1.11.1"
 pre-commit = "^3.5.0"
 pytest = "^7.4.2"
 ruff = "^0.4.10"
-scikit-learn = "^1.3.1"
+scikit-learn = "^1.5.1"
 sqlalchemy = "^2.0.22"
-sympy = "^1.10.1"
-pytest-xdist = {extras = ["psutil"], version = "^3.3.1"}
+sympy = "^1.12.1"
+pytest-xdist = { extras = ["psutil"], version = "^3.3.1" }
 ipykernel = "^6.26.0"
 ipython = "^8.17.2"
 rich = "^13.6.0"
 jupyter = "^1.0.0"
 mike = "^2.0.0"
-polars = "^0.20.8"
+polars = "^1.1.0"
 
 [tool.poetry.group.compat]
 optional = true
 
 [tool.poetry.group.compat.dependencies]
-scikit-learn = "^1.0.1"
+scikit-learn = "^1.5.1"
 sqlalchemy = "^2.0.0"
 
 [tool.poetry.group.docs]
@@ -84,7 +90,7 @@ optional = true
 
 [tool.poetry.group.benchmark.dependencies]
 "dominate" = "2.8.0"
-"scikit-learn" = "1.3.1"
+"scikit-learn" = "1.5.1"
 "tabulate" = "0.9.0"
 "vowpalwabbit" = "9.9.0"
 "watermark" = "2.4.3"
@@ -161,7 +167,7 @@ module = [
     "requests.*",
     "gymnasium.*",
     "sympy.*",
-    "polars.*"
+    "polars.*",
 ]
 ignore_missing_imports = true
 

diff --git a/river/compose/test_product.py b/river/compose/test_product.py
@@ -83,10 +83,10 @@ def test_issue_1253():
     >>> model = group1 + group1 * group2
     >>> XT = model.transform_many(X)
 
-    >>> XT.memory_usage().sum() // 1000
+    >>> XT.memory_usage().sum().item() // 1000
     85
 
-    >>> XT.sparse.to_dense().memory_usage().sum() // 1000
+    >>> XT.sparse.to_dense().memory_usage().sum().item() // 1000
     4455
 
     >>> X, y = datasets.make_regression(n_samples=6, n_features=2)

diff --git a/river/datasets/synth/anomaly_sine.py b/river/datasets/synth/anomaly_sine.py
@@ -139,4 +139,4 @@ def __iter__(self):
         self._generate_data()
 
         for xi, yi in itertools.zip_longest(self.X, self.y if hasattr(self.y, "__iter__") else []):
-            yield dict(zip(["sine", "cosine"], xi)), bool(yi)
+            yield dict(zip(["sine", "cosine"], xi.tolist())), bool(yi)
diff --git a/river/datasets/synth/logical.py b/river/datasets/synth/logical.py
@@ -67,7 +67,10 @@ def __iter__(self):
         X, Y = self._make_logical(n_tiles=self.n_tiles, shuffle=self.shuffle)
 
         for xi, yi in itertools.zip_longest(X, Y if hasattr(Y, "__iter__") else []):
-            yield dict(zip(self.feature_names, xi)), dict(zip(self.target_names, yi))
+            yield (
+                dict(zip(self.feature_names, xi.tolist())),
+                dict(zip(self.target_names, yi.tolist())),
+            )
 
     def _make_logical(self, n_tiles: int = 1, shuffle: bool = True):
         """Make toy dataset"""

diff --git a/river/ensemble/streaming_random_patches.py b/river/ensemble/streaming_random_patches.py
@@ -407,7 +407,7 @@ class SRPClassifier(BaseSRPEnsemble, base.Classifier):
     >>> metric = metrics.Accuracy()
 
     >>> evaluate.progressive_val_score(dataset, model, metric)
-    Accuracy: 72.17%
+    Accuracy: 72.77%
 
     Notes
     -----

diff --git a/river/facto/ffm.py b/river/facto/ffm.py
@@ -255,7 +255,7 @@ def __init__(
 
     def predict_one(self, x):
         x = self._ohe_cat_features(x)
-        return self._raw_dot(x)
+        return self._raw_dot(x).item()
 
 
 class FFMClassifier(FFM, base.Classifier):

diff --git a/river/facto/fm.py b/river/facto/fm.py
@@ -238,7 +238,7 @@ def __init__(
 
     def predict_one(self, x):
         x = self._ohe_cat_features(x)
-        return self._raw_dot(x)
+        return self._raw_dot(x).item()
 
 
 class FMClassifier(FM, base.Classifier):

diff --git a/river/facto/fwfm.py b/river/facto/fwfm.py
@@ -275,7 +275,7 @@ def __init__(
 
     def predict_one(self, x):
         x = self._ohe_cat_features(x)
-        return self._raw_dot(x)
+        return self._raw_dot(x).item()
 
 
 class FwFMClassifier(FwFM, base.Classifier):

diff --git a/river/facto/hofm.py b/river/facto/hofm.py
@@ -267,7 +267,7 @@ def __init__(
 
     def predict_one(self, x):
         x = self._ohe_cat_features(x)
-        return self._raw_dot(x)
+        return self._raw_dot(x).item()
 
 
 class HOFMClassifier(HOFM, base.Classifier):

diff --git a/river/forest/adaptive_random_forest.py b/river/forest/adaptive_random_forest.py
@@ -565,7 +565,7 @@ class ARFClassifier(BaseForest, base.Classifier):
     >>> metric = metrics.Accuracy()
 
     >>> evaluate.progressive_val_score(dataset, model, metric)
-    Accuracy: 67.97%
+    Accuracy: 67.57%
 
     The total number of warnings and drifts detected, respectively
     >>> model.n_warnings_detected(), model.n_drifts_detected()
@@ -849,7 +849,7 @@ class ARFRegressor(BaseForest, base.Regressor):
     >>> metric = metrics.MAE()
 
     >>> evaluate.progressive_val_score(dataset, model, metric)
-    MAE: 0.772113
+    MAE: 0.793949
 
     """
 

diff --git a/river/forest/online_extra_trees.py b/river/forest/online_extra_trees.py
@@ -614,7 +614,7 @@ class OXTRegressor(ExtraTrees, base.Regressor):
     >>> metric = metrics.RMSE()
 
     >>> evaluate.progressive_val_score(dataset, model, metric)
-    RMSE: 3.16212
+    RMSE: 2.849735
 
     References
     ----------

diff --git a/river/imblearn/chebyshev.py b/river/imblearn/chebyshev.py
@@ -162,10 +162,10 @@ class ChebyshevOverSampler(base.Wrapper, base.Regressor):
     ...     metrics.MAE(),
     ...     print_every=500
     ... )
-    [500] MAE: 1.629786
-    [1,000] MAE: 1.663799
-    [1,001] MAE: 1.66253
-    MAE: 1.66253
+    [500] MAE: 1.64417
+    [1,000] MAE: 1.676185
+    [1,001] MAE: 1.674668
+    MAE: 1.674668
 
     References
     ----------

diff --git a/river/linear_model/bayesian_lin_reg.py b/river/linear_model/bayesian_lin_reg.py
@@ -211,7 +211,7 @@ def predict_one(self, x, with_dist=False):
         """
 
         # Bishop equation 3.58
-        y_pred_mean = utils.math.dot(self._m, x)
+        y_pred_mean = 0.0 if not len(self._m) else utils.math.dot(self._m, x).item()
         if not with_dist:
             return y_pred_mean
 

diff --git a/river/naive_bayes/base.py b/river/naive_bayes/base.py
@@ -83,7 +83,7 @@ def one_hot_encode(y: pd.Series) -> pd.DataFrame:
     """
     classes = np.unique(y)
     indices = np.searchsorted(classes, y)
-    indptr = np.hstack((0, np.cumsum(np.in1d(y, classes))))
+    indptr = np.hstack((0, np.cumsum(np.isin(y, classes))))
     data = np.empty_like(indices)
     data.fill(1)
     return pd.DataFrame.sparse.from_spmatrix(

diff --git a/river/optim/initializers.py b/river/optim/initializers.py
@@ -80,7 +80,7 @@ class Normal(Initializer):
     >>> init = optim.initializers.Normal(mu=0, sigma=1, seed=42)
 
     >>> init(shape=1)
-    0.496714
+    np.float64(0.4967141...)
 
     >>> init(shape=2)
     array([-0.1382643 ,  0.64768854])

diff --git a/river/preprocessing/lda.py b/river/preprocessing/lda.py
@@ -209,7 +209,7 @@ def transform_one(self, x):
         # Sample empirical topic assignment:
         _, components = self._compute_statistics_components(words_indexes_list)
 
-        return dict(enumerate(components))
+        return dict(enumerate(components.tolist()))
 
     def _update_indexes(self, word_list: typing.Iterable[str]):
         """

diff --git a/river/preprocessing/scale.py b/river/preprocessing/scale.py
@@ -212,10 +212,12 @@ def learn_many(self, X: pd.DataFrame):
             a = old_count / (old_count + new_count)
             b = new_count / (old_count + new_count)
 
-            self.means[col] = a * old_mean + b * new_mean
+            self.means[col] = (a * old_mean + b * new_mean).item()
             if self.with_std:
-                self.vars[col] = a * old_var + b * new_var + a * b * (old_mean - new_mean) ** 2
-            self.counts[col] += new_count
+                self.vars[col] = (
+                    a * old_var + b * new_var + a * b * (old_mean - new_mean) ** 2
+                ).item()
+            self.counts[col] += new_count.item()
 
     def transform_many(self, X: pd.DataFrame):
         """Scale a mini-batch of features.

diff --git a/river/proba/beta.py b/river/proba/beta.py
@@ -92,20 +92,20 @@ def revert(self, x):
         else:
             self.beta -= 1
 
-    def __call__(self, p: float):
+    def __call__(self, p: float) -> float:
         return (
             p ** (self.alpha - 1) * (1 - p) ** (self.beta - 1) / _beta_func(self.alpha, self.beta)
         )
 
-    def sample(self):
+    def sample(self) -> float:
         return self._rng.betavariate(self.alpha, self.beta)
 
     @property
-    def mode(self):
+    def mode(self) -> float:
         try:
             return (self.alpha - 1) / (self.alpha + self.beta - 2)
         except ZeroDivisionError:
             return 0.5
 
-    def cdf(self, x):
-        return scipy.special.betainc(self.alpha, self.beta, x)
+    def cdf(self, x) -> float:
+        return scipy.special.betainc(self.alpha, self.beta, x).item()
diff --git a/river/proba/gaussian.py b/river/proba/gaussian.py
@@ -72,7 +72,7 @@ def update(self, x, w=1.0):
     def revert(self, x, w=1.0):
         self._var.revert(x, w)
 
-    def __call__(self, x):
+    def __call__(self, x) -> float:
         var = self._var.get()
         if var:
             try:
@@ -83,17 +83,17 @@ def __call__(self, x):
                 return 0.0
         return 0.0
 
-    def cdf(self, x):
+    def cdf(self, x) -> float:
         try:
             return 0.5 * (1.0 + math.erf((x - self.mu) / (self.sigma * math.sqrt(2.0))))
         except ZeroDivisionError:
             return 0.0
 
-    def sample(self):
+    def sample(self) -> float:
         return self._rng.gauss(self.mu, self.sigma)
 
     @property
-    def mode(self):
+    def mode(self) -> float:
         return self.mu
 
 
@@ -207,7 +207,7 @@ class MultivariateGaussian(base.MultivariateContinuousDistribution):
     >>> multi.mu['blue'] == single.mu
     True
     >>> multi.sigma['blue']['blue'] == single.sigma
-    True
+    np.True_
 
     """
 

diff --git a/river/reco/biased_mf.py b/river/reco/biased_mf.py
@@ -199,7 +199,7 @@ def predict_one(self, user, item, x=None):
         # Add the dot product of the user and the item latent vectors
         y_pred += np.dot(self.u_latents[user], self.i_latents[item])
 
-        return y_pred
+        return y_pred.item()
 
     def learn_one(self, user, item, y, x=None):
         # Update the global mean