[python-package] remove uses of deprecated NumPy random number generation APIs, require 'numpy>=1.17.0' (#6468)

jameslamb · web-flow · commit e0cda880fc74 · 2024-06-03T20:17:40.000-05:00
diff --git a/.gitignore b/.gitignore
@@ -405,7 +405,7 @@ python-package/lightgbm/VERSION.txt
 
 # R build artefacts
 **/autom4te.cache/
-conftest*
+R-package/conftest*
 R-package/config.status
 !R-package/data/agaricus.test.rda
 !R-package/data/agaricus.train.rda
diff --git a/docs/Python-Intro.rst b/docs/Python-Intro.rst
@@ -59,8 +59,9 @@ Many of the examples in this page use functionality from ``numpy``. To run the e
 
 .. code:: python
 
-    data = np.random.rand(500, 10)  # 500 entities, each contains 10 features
-    label = np.random.randint(2, size=500)  # binary target
+    rng = np.random.default_rng()
+    data = rng.uniform(size=(500, 10))  # 500 entities, each contains 10 features
+    label = rng.integers(low=0, high=2, size=(500, ))  # binary target
     train_data = lgb.Dataset(data, label=label)
 
 **To load a scipy.sparse.csr\_matrix array into Dataset:**
@@ -139,15 +140,17 @@ It doesn't need to convert to one-hot encoding, and is much faster than one-hot
 
 .. code:: python
 
-    w = np.random.rand(500, )
+    rng = np.random.default_rng()
+    w = rng.uniform(size=(500, ))
     train_data = lgb.Dataset(data, label=label, weight=w)
 
 or
 
 .. code:: python
 
     train_data = lgb.Dataset(data, label=label)
-    w = np.random.rand(500, )
+    rng = np.random.default_rng()
+    w = rng.uniform(size=(500, ))
     train_data.set_weight(w)
 
 And you can use ``Dataset.set_init_score()`` to set initial score, and ``Dataset.set_group()`` to set group/query data for ranking tasks.
@@ -249,7 +252,8 @@ A model that has been trained or loaded can perform predictions on datasets:
 .. code:: python
 
     # 7 entities, each contains 10 features
-    data = np.random.rand(7, 10)
+    rng = np.random.default_rng()
+    data = rng.uniform(size=(7, 10))
     ypred = bst.predict(data)
 
 If early stopping is enabled during training, you can get predictions from the best iteration with ``bst.best_iteration``:
diff --git a/examples/python-guide/logistic_regression.py b/examples/python-guide/logistic_regression.py
@@ -22,15 +22,15 @@
 #################
 # Simulate some binary data with a single categorical and
 #   single continuous predictor
-np.random.seed(0)
+rng = np.random.default_rng(seed=0)
 N = 1000
 X = pd.DataFrame({"continuous": range(N), "categorical": np.repeat([0, 1, 2, 3, 4], N / 5)})
 CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2]
 LINEAR_TERM = np.array(
     [-0.5 + 0.01 * X["continuous"][k] + CATEGORICAL_EFFECTS[X["categorical"][k]] for k in range(X.shape[0])]
-) + np.random.normal(0, 1, X.shape[0])
+) + rng.normal(loc=0, scale=1, size=X.shape[0])
 TRUE_PROB = expit(LINEAR_TERM)
-Y = np.random.binomial(1, TRUE_PROB, size=N)
+Y = rng.binomial(n=1, p=TRUE_PROB, size=N)
 DATA = {
     "X": X,
     "probability_labels": TRUE_PROB,
@@ -65,10 +65,9 @@ def experiment(objective, label_type, data):
     result : dict
         Experiment summary stats.
     """
-    np.random.seed(0)
     nrounds = 5
     lgb_data = data[f"lgb_with_{label_type}_labels"]
-    params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1}
+    params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1, "seed": 123}
     time_zero = time.time()
     gbm = lgb.train(params, lgb_data, num_boost_round=nrounds)
     y_fitted = gbm.predict(data["X"])
diff --git a/python-package/lightgbm/compat.py b/python-package/lightgbm/compat.py
@@ -37,18 +37,6 @@ def __init__(self, *args: Any, **kwargs: Any):
 
     concat = None
 
-"""numpy"""
-try:
-    from numpy.random import Generator as np_random_Generator
-except ImportError:
-
-    class np_random_Generator:  # type: ignore
-        """Dummy class for np.random.Generator."""
-
-        def __init__(self, *args: Any, **kwargs: Any):
-            pass
-
-
 """matplotlib"""
 try:
     import matplotlib  # noqa: F401
diff --git a/python-package/lightgbm/sklearn.py b/python-package/lightgbm/sklearn.py
@@ -41,7 +41,6 @@
     _LGBMModelBase,
     _LGBMRegressorBase,
     dt_DataTable,
-    np_random_Generator,
     pd_DataFrame,
 )
 from .engine import train
@@ -476,7 +475,7 @@ def __init__(
         colsample_bytree: float = 1.0,
         reg_alpha: float = 0.0,
         reg_lambda: float = 0.0,
-        random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None,
+        random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None,
         n_jobs: Optional[int] = None,
         importance_type: str = "split",
         **kwargs: Any,
@@ -739,7 +738,7 @@ def _process_params(self, stage: str) -> Dict[str, Any]:
 
         if isinstance(params["random_state"], np.random.RandomState):
             params["random_state"] = params["random_state"].randint(np.iinfo(np.int32).max)
-        elif isinstance(params["random_state"], np_random_Generator):
+        elif isinstance(params["random_state"], np.random.Generator):
             params["random_state"] = int(params["random_state"].integers(np.iinfo(np.int32).max))
         if self._n_classes > 2:
             for alias in _ConfigAliases.get("num_class"):
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
@@ -19,7 +19,7 @@ classifiers = [
     "Topic :: Scientific/Engineering :: Artificial Intelligence"
 ]
 dependencies = [
-    "numpy",
+    "numpy>=1.17.0",
     "scipy"
 ]
 description = "LightGBM Python Package"
@@ -156,6 +156,8 @@ select = [
     "E",
     # pyflakes
     "F",
+    # NumPy-specific rules
+    "NPY",
     # pylint
     "PL",
     # flake8-return: unnecessary assignment before return
diff --git a/tests/python_package_test/conftest.py b/tests/python_package_test/conftest.py
@@ -0,0 +1,12 @@
+import numpy as np
+import pytest
+
+
+@pytest.fixture(scope="function")
+def rng():
+    return np.random.default_rng()
+
+
+@pytest.fixture(scope="function")
+def rng_fixed_seed():
+    return np.random.default_rng(seed=42)
diff --git a/tests/python_package_test/test_basic.py b/tests/python_package_test/test_basic.py
diff --git a/tests/python_package_test/test_engine.py b/tests/python_package_test/test_engine.py
diff --git a/tests/python_package_test/test_sklearn.py b/tests/python_package_test/test_sklearn.py