[backport] Compatibility fixes for scikit-learn 1.6 (dmlc#11021, dmlc#11162) (dmlc#11205)

hcho3 · jameslamb · trivialfis · web-flow · commit b8cfb5691a31 · 2025-02-05T15:37:48.000-08:00
* Adapt to scikit-learn 1.6 estimator tag changes (dmlc#11021)
* More sklearn tag support. (dmlc#11162)
* [CI] Unpin scikit-learn
* Remove test_doc_link() test

---------

Co-authored-by: James Lamb <jaylamb20@gmail.com>
Co-authored-by: Jiaming Yuan <jm.yuan@outlook.com>
diff --git a/.gitignore b/.gitignore
@@ -139,11 +139,13 @@ credentials.csv
 .bloop
 
 # python tests
+*.bin
 demo/**/*.txt
 *.dmatrix
 .hypothesis
 __MACOSX/
 model*.json
+/tests/python/models/models/
 
 # R tests
 *.htm
diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml
@@ -62,6 +62,8 @@ extension-pkg-whitelist = ["numpy"]
 disable = [
     "attribute-defined-outside-init",
     "import-outside-toplevel",
+    "too-few-public-methods",
+    "too-many-ancestors",
     "too-many-nested-blocks",
     "unexpected-special-method-signature",
     "unsubscriptable-object",
diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py
@@ -45,32 +45,43 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
 
 # sklearn
 try:
+    from sklearn import __version__ as _sklearn_version
     from sklearn.base import BaseEstimator as XGBModelBase
     from sklearn.base import ClassifierMixin as XGBClassifierBase
     from sklearn.base import RegressorMixin as XGBRegressorBase
-    from sklearn.preprocessing import LabelEncoder
 
     try:
-        from sklearn.model_selection import KFold as XGBKFold
         from sklearn.model_selection import StratifiedKFold as XGBStratifiedKFold
     except ImportError:
-        from sklearn.cross_validation import KFold as XGBKFold
         from sklearn.cross_validation import StratifiedKFold as XGBStratifiedKFold
 
+    # sklearn.utils Tags types can be imported unconditionally once
+    # xgboost's minimum scikit-learn version is 1.6 or higher
+    try:
+        from sklearn.utils import Tags as _sklearn_Tags
+    except ImportError:
+        _sklearn_Tags = object
+
     SKLEARN_INSTALLED = True
 
 except ImportError:
     SKLEARN_INSTALLED = False
 
     # used for compatibility without sklearn
-    XGBModelBase = object
-    XGBClassifierBase = object
-    XGBRegressorBase = object
-    LabelEncoder = object
+    class XGBModelBase:  # type: ignore[no-redef]
+        """Dummy class for sklearn.base.BaseEstimator."""
+
+    class XGBClassifierBase:  # type: ignore[no-redef]
+        """Dummy class for sklearn.base.ClassifierMixin."""
+
+    class XGBRegressorBase:  # type: ignore[no-redef]
+        """Dummy class for sklearn.base.RegressorMixin."""
 
-    XGBKFold = None
     XGBStratifiedKFold = None
 
+    _sklearn_Tags = object
+    _sklearn_version = object
+
 
 _logger = logging.getLogger(__name__)
 
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
@@ -410,7 +410,7 @@ def c_array(
 def from_array_interface(interface: dict) -> NumpyOrCupy:
     """Convert array interface to numpy or cupy array"""
 
-    class Array:  # pylint: disable=too-few-public-methods
+    class Array:
         """Wrapper type for communicating with numpy and cupy."""
 
         _interface: Optional[dict] = None
diff --git a/python-package/xgboost/dask/__init__.py b/python-package/xgboost/dask/__init__.py
@@ -1,8 +1,6 @@
 # pylint: disable=too-many-arguments, too-many-locals
 # pylint: disable=missing-class-docstring, invalid-name
 # pylint: disable=too-many-lines
-# pylint: disable=too-few-public-methods
-# pylint: disable=import-error
 """
 Dask extensions for distributed training
 ----------------------------------------
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
@@ -29,7 +29,14 @@
 
 # Do not use class names on scikit-learn directly.  Re-define the classes on
 # .compat to guarantee the behavior without scikit-learn
-from .compat import SKLEARN_INSTALLED, XGBClassifierBase, XGBModelBase, XGBRegressorBase
+from .compat import (
+    SKLEARN_INSTALLED,
+    XGBClassifierBase,
+    XGBModelBase,
+    XGBRegressorBase,
+    _sklearn_Tags,
+    _sklearn_version,
+)
 from .config import config_context
 from .core import (
     Booster,
@@ -45,7 +52,7 @@
 from .training import train
 
 
-class XGBRankerMixIn:  # pylint: disable=too-few-public-methods
+class XGBRankerMixIn:
     """MixIn for ranking, defines the _estimator_type usually defined in scikit-learn
     base classes.
 
@@ -69,7 +76,7 @@ def _can_use_qdm(tree_method: Optional[str]) -> bool:
     return tree_method in ("hist", "gpu_hist", None, "auto")
 
 
-class _SklObjWProto(Protocol):  # pylint: disable=too-few-public-methods
+class _SklObjWProto(Protocol):
     def __call__(
         self,
         y_true: ArrayLike,
@@ -782,11 +789,52 @@ def __init__(
 
     def _more_tags(self) -> Dict[str, bool]:
         """Tags used for scikit-learn data validation."""
-        tags = {"allow_nan": True, "no_validation": True}
+        tags = {"allow_nan": True, "no_validation": True, "sparse": True}
         if hasattr(self, "kwargs") and self.kwargs.get("updater") == "shotgun":
             tags["non_deterministic"] = True
+
+        tags["categorical"] = self.enable_categorical
+        return tags
+
+    @staticmethod
+    def _update_sklearn_tags_from_dict(
+        *,
+        tags: _sklearn_Tags,
+        tags_dict: Dict[str, bool],
+    ) -> _sklearn_Tags:
+        """Update ``sklearn.utils.Tags`` inherited from ``scikit-learn`` base classes.
+
+        ``scikit-learn`` 1.6 introduced a dataclass-based interface for estimator tags.
+        ref: https://github.com/scikit-learn/scikit-learn/pull/29677
+
+        This method handles updating that instance based on the values in
+        ``self._more_tags()``.
+
+        """
+        tags.non_deterministic = tags_dict.get("non_deterministic", False)
+        tags.no_validation = tags_dict["no_validation"]
+        tags.input_tags.allow_nan = tags_dict["allow_nan"]
+        tags.input_tags.sparse = tags_dict["sparse"]
+        tags.input_tags.categorical = tags_dict["categorical"]
         return tags
 
+    def __sklearn_tags__(self) -> _sklearn_Tags:
+        # XGBModelBase.__sklearn_tags__() cannot be called unconditionally,
+        # because that method isn't defined for scikit-learn<1.6
+        if not hasattr(XGBModelBase, "__sklearn_tags__"):
+            err_msg = (
+                "__sklearn_tags__() should not be called when using scikit-learn<1.6. "
+                f"Detected version: {_sklearn_version}"
+            )
+            raise AttributeError(err_msg)
+
+        # take whatever tags are provided by BaseEstimator, then modify
+        # them with XGBoost-specific values
+        return self._update_sklearn_tags_from_dict(
+            tags=super().__sklearn_tags__(),  # pylint: disable=no-member
+            tags_dict=self._more_tags(),
+        )
+
     def __sklearn_is_fitted__(self) -> bool:
         return hasattr(self, "_Booster")
 
@@ -841,13 +889,27 @@ def get_params(self, deep: bool = True) -> Dict[str, Any]:
         """Get parameters."""
         # Based on: https://stackoverflow.com/questions/59248211
         # The basic flow in `get_params` is:
-        # 0. Return parameters in subclass first, by using inspect.
-        # 1. Return parameters in `XGBModel` (the base class).
+        # 0. Return parameters in subclass (self.__class__) first, by using inspect.
+        # 1. Return parameters in all parent classes (especially `XGBModel`).
         # 2. Return whatever in `**kwargs`.
         # 3. Merge them.
+        #
+        # This needs to accommodate being called recursively in the following
+        # inheritance graphs (and similar for classification and ranking):
+        #
+        #   XGBRFRegressor -> XGBRegressor -> XGBModel -> BaseEstimator
+        #                     XGBRegressor -> XGBModel -> BaseEstimator
+        #                                     XGBModel -> BaseEstimator
+        #
         params = super().get_params(deep)
         cp = copy.copy(self)
-        cp.__class__ = cp.__class__.__bases__[0]
+        # If the immediate parent defines get_params(), use that.
+        if callable(getattr(cp.__class__.__bases__[0], "get_params", None)):
+            cp.__class__ = cp.__class__.__bases__[0]
+        # Otherwise, skip it and assume the next class will have it.
+        # This is here primarily for cases where the first class in MRO is a scikit-learn mixin.
+        else:
+            cp.__class__ = cp.__class__.__bases__[1]
         params.update(cp.__class__.get_params(cp, deep))
         # if kwargs is a dict, update params accordingly
         if hasattr(self, "kwargs") and isinstance(self.kwargs, dict):
@@ -1431,7 +1493,7 @@ def _cls_predict_proba(n_classes: int, prediction: PredtT, vstack: Callable) ->
         Number of boosting rounds.
 """,
 )
-class XGBClassifier(XGBModel, XGBClassifierBase):
+class XGBClassifier(XGBClassifierBase, XGBModel):
     # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
     @_deprecate_positional_args
     def __init__(
@@ -1447,6 +1509,12 @@ def _more_tags(self) -> Dict[str, bool]:
         tags["multilabel"] = True
         return tags
 
+    def __sklearn_tags__(self) -> _sklearn_Tags:
+        tags = super().__sklearn_tags__()
+        tags_dict = self._more_tags()
+        tags.classifier_tags.multi_label = tags_dict["multilabel"]
+        return tags
+
     @_deprecate_positional_args
     def fit(
         self,
@@ -1717,7 +1785,7 @@ def fit(
     "Implementation of the scikit-learn API for XGBoost regression.",
     ["estimators", "model", "objective"],
 )
-class XGBRegressor(XGBModel, XGBRegressorBase):
+class XGBRegressor(XGBRegressorBase, XGBModel):
     # pylint: disable=missing-docstring
     @_deprecate_positional_args
     def __init__(
@@ -1731,6 +1799,13 @@ def _more_tags(self) -> Dict[str, bool]:
         tags["multioutput_only"] = False
         return tags
 
+    def __sklearn_tags__(self) -> _sklearn_Tags:
+        tags = super().__sklearn_tags__()
+        tags_dict = self._more_tags()
+        tags.target_tags.multi_output = tags_dict["multioutput"]
+        tags.target_tags.single_output = not tags_dict["multioutput_only"]
+        return tags
+
 
 @xgboost_model_doc(
     "scikit-learn API for XGBoost random forest regression.",
@@ -1858,7 +1933,7 @@ def _get_qid(
         `qid` can be a special column of input `X` instead of a separated parameter, see
         :py:meth:`fit` for more info.""",
 )
-class XGBRanker(XGBModel, XGBRankerMixIn):
+class XGBRanker(XGBRankerMixIn, XGBModel):
     # pylint: disable=missing-docstring,too-many-arguments,invalid-name
     @_deprecate_positional_args
     def __init__(self, *, objective: str = "rank:ndcg", **kwargs: Any):
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
@@ -2,8 +2,8 @@
 
 import base64
 
-# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
-# pylint: disable=too-few-public-methods, too-many-lines, too-many-branches
+# pylint: disable=fixme, protected-access, no-member, invalid-name
+# pylint: disable=too-many-lines, too-many-branches
 import json
 import logging
 import os
diff --git a/python-package/xgboost/spark/estimator.py b/python-package/xgboost/spark/estimator.py
@@ -1,7 +1,6 @@
 """Xgboost pyspark integration submodule for estimator API."""
 
-# pylint: disable=too-many-ancestors
-# pylint: disable=fixme, too-many-ancestors, protected-access, no-member, invalid-name
+# pylint: disable=fixme, protected-access, no-member, invalid-name
 # pylint: disable=unused-argument, too-many-locals
 
 import warnings
diff --git a/python-package/xgboost/spark/params.py b/python-package/xgboost/spark/params.py
@@ -2,7 +2,6 @@
 
 from typing import Dict
 
-# pylint: disable=too-few-public-methods
 from pyspark.ml.param import TypeConverters
 from pyspark.ml.param.shared import Param, Params
 
diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py
@@ -43,7 +43,7 @@ def _get_default_params_from_func(
     return filtered_params_dict
 
 
-class CommunicatorContext(CCtx):  # pylint: disable=too-few-public-methods
+class CommunicatorContext(CCtx):
     """Context with PySpark specific task ID."""
 
     def __init__(self, context: BarrierTaskContext, **args: Any) -> None:
diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py
@@ -564,7 +564,7 @@ def is_binary(self) -> bool:
         return self.max_rel == 1
 
 
-class PBM:  # pylint: disable=too-few-public-methods
+class PBM:
     """Simulate click data with position bias model. There are other models available in
     `ULTRA <https://github.com/ULTR-Community/ULTRA.git>`_ like the cascading model.
 
diff --git a/tests/ci_build/Dockerfile.gpu b/tests/ci_build/Dockerfile.gpu
@@ -27,8 +27,7 @@ RUN \
         "nccl>=${NCCL_SHORT_VER}" \
         dask \
         dask-cuda=$RAPIDS_VERSION_ARG* dask-cudf=$RAPIDS_VERSION_ARG* cupy \
-        numpy pytest pytest-timeout scipy \
-        "scikit-learn<=1.5.2" \
+        numpy pytest pytest-timeout scipy scikit-learn \
         pandas matplotlib wheel python-kubernetes urllib3 graphviz "hypothesis<=6.112" \
         "pyspark>=3.4.0" cloudpickle cuda-python && \
     mamba clean --all && \
diff --git a/tests/ci_build/conda_env/win64_test.yml b/tests/ci_build/conda_env/win64_test.yml
@@ -6,7 +6,7 @@ dependencies:
 - numpy
 - scipy
 - matplotlib
-- scikit-learn<=1.5.2
+- scikit-learn
 - pandas
 - pytest
 - boto3
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py