16 | 16 | from .metrics import BaseMetric, metrics
17 | 17 | from .models import BaseModel, Tree
18 | 18 | from .objectives import BaseObjective, objectives
| 19 | +from .shapley import global_shapley_attributions, local_shapley_attributions
19 | 20 | from .utils import PickleCunumericMixin, preround
20 | 21 |
21 | 22 | if TYPE_CHECKING:
@@ -48,6 +49,7 @@ def __init__(
48 | 49 |         self.random_state = random_state
49 | 50 |         self.model_init_: cn.ndarray
50 | 51 |         self.callbacks = callbacks
| 52 | +        self.metrics_: list[BaseMetric]
51 | 53 |         if not isinstance(base_models, tuple):
52 | 54 |             raise ValueError("base_models must be a tuple")
53 | 55 |         self.base_models = base_models
@@ -444,6 +446,113 @@ def dump_models(self) -> str:
444 | 446 |             text += str(m)
445 | 447 |         return text
446 | 448 |
| 449 | +    def global_attributions(
| 450 | +        self,
| 451 | +        X: cn.array,
| 452 | +        y: cn.array,
| 453 | +        metric: Optional[BaseMetric] = None,
| 454 | +        random_state: Optional[np.random.RandomState] = None,
| 455 | +        n_samples: int = 5,
| 456 | +        check_efficiency: bool = False,
| 457 | +    ) -> Tuple[cn.array, cn.array]:
| 458 | +        r"""Compute global feature attributions for the model. Global
| 459 | +        attributions show the effect of a feature on the model's loss function.
| 460 | +
| 461 | +        We use a Shapley value approach to compute the attributions:
| 462 | +        :math:`Sh_i(v)=\frac{1}{|N|!} \sum_{\sigma \in \mathfrak{S}_{|N|}} \big[ v([\sigma]_{i-1} \cup \{i\}) - v([\sigma]_{i-1}) \big],`
| 463 | +        where :math:`v` is the model's loss function, :math:`N` is the set of features, and :math:`\mathfrak{S}_{|N|}` is the set of all permutations of the features.
| 464 | +        :math:`[\sigma]_{i-1}` denotes the set of features ranked lower than :math:`i` in the ordering :math:`\sigma`.
| 465 | +
| 466 | +        In effect, the Shapley value shows the effect of adding a feature to the model, averaged over all possible orderings of the features. Here the above sum is approximated using an antithetic sampling method [#]_, where `n_samples` is the number of pairs of permutation samples drawn. The method also returns the standard error of the estimates, which decreases according to :math:`1/\sqrt{n\_samples}`.
| 467 | +
| 468 | +        This definition of attributions requires removing a feature from the active set. We use a random sample of values from X to fill in the missing feature values. This choice of background distribution corresponds to the 'interventional' Shapley value approach discussed in [#]_.
| 469 | +
| 470 | +
| 471 | +        .. [#] Mitchell, Rory, et al. "Sampling permutations for Shapley value estimation." Journal of Machine Learning Research 23.43 (2022): 1-46.
| 472 | +        .. [#] Covert, Ian, Scott M. Lundberg, and Su-In Lee. "Understanding global feature contributions with additive importance measures." Advances in Neural Information Processing Systems 33 (2020): 17212-17223.
| 473 | +
| 474 | +        The method uses memory (and time) proportional to :math:`n\_samples \times n\_features \times n\_background\_samples`. Reduce the number of background samples or the size of X to speed up computation and reduce memory usage; X does not need to be the entire training set to get useful estimates.
| 475 | +
| 476 | +        See the method :func:`~legateboost.LBBase.local_attributions` for the effect of features on individual prediction outputs.
| 477 | +
| 478 | +        Parameters
| 479 | +        ----------
| 480 | +        X : cn.array
| 481 | +            The input data.
| 482 | +        y : cn.array
| 483 | +            The target values.
| 484 | +        metric : BaseMetric, optional
| 485 | +            The metric used to evaluate the model. If None, the model's default metric is used.
| 486 | +        random_state : np.random.RandomState, optional
| 487 | +            The random state for reproducibility.
| 488 | +        n_samples : int, optional
| 489 | +            The number of sample pairs to use in the antithetic sampling method.
| 490 | +        check_efficiency : bool, optional
| 491 | +            If True, check that the Shapley values plus the null coalition loss add up to the final loss for X, y (the so-called efficiency property of Shapley values).
| 492 | +
| 493 | +        Returns
| 494 | +        -------
| 495 | +        cn.array
| 496 | +            The Shapley value estimates for each feature. The last value is the null coalition loss. The sum of this array equals the loss for X, y.
| 497 | +        cn.array
| 498 | +            The standard error of the Shapley value estimates, with respect to `n_samples`. The standard error decreases according to :math:`1/\sqrt{n\_samples}`.
| 499 | +        """  # noqa: E501
| 500 | +        check_is_fitted(self, "is_fitted_")
| 501 | +        return global_shapley_attributions(
| 502 | +            self,
| 503 | +            X,
| 504 | +            y,
| 505 | +            metric,
| 506 | +            random_state,
| 507 | +            n_samples,
| 508 | +            check_efficiency,
| 509 | +        )
| 510 | +
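As a quick orientation for reviewers, here is a minimal usage sketch of the new `global_attributions` API. The estimator choice, synthetic data, and hyperparameters are illustrative only (they are not part of this diff); any fitted legateboost estimator deriving from `LBBase` should behave the same way.

```python
import cunumeric as cn
import legateboost as lb

# Illustrative synthetic data; X is (n_rows, n_features).
X = cn.random.random((100, 5))
y = cn.random.random(100)

model = lb.LBRegressor(n_estimators=10).fit(X, y)

# One Shapley estimate per feature, with the null-coalition loss in the
# last slot; `se` holds the corresponding standard errors.
attributions, se = model.global_attributions(X, y, n_samples=10)

# Per the docstring, the attributions (including the null term) sum to
# the model's loss on (X, y).
print(attributions.sum())
```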
| 511 | +    def local_attributions(
| 512 | +        self,
| 513 | +        X: cn.array,
| 514 | +        X_background: cn.array,
| 515 | +        random_state: Optional[np.random.RandomState] = None,
| 516 | +        n_samples: int = 5,
| 517 | +        check_efficiency: bool = False,
| 518 | +    ) -> Tuple[cn.array, cn.array]:
| 519 | +        r"""Local feature attributions for model predictions. Shows the effect
| 520 | +        of a feature on each output prediction. See the definition of Shapley
| 521 | +        values in :func:`~legateboost.LBBase.global_attributions`; here
| 522 | +        :math:`v` is the model prediction function instead of the loss
| 523 | +        function.
| 524 | +
| 525 | +        Parameters
| 526 | +        ----------
| 527 | +        X : cn.array
| 528 | +            The input data.
| 529 | +        X_background : cn.array
| 530 | +            The background data used to fill in missing feature values. This could be a random sample of training data (e.g. 10-100 instances).
| 531 | +        random_state : np.random.RandomState, optional
| 532 | +            The random state for reproducibility.
| 533 | +        n_samples : int, optional
| 534 | +            The number of sample pairs to use in the antithetic sampling method.
| 535 | +        check_efficiency : bool, optional
| 536 | +            If True, check that the Shapley values plus the null prediction add up to the final predictions for X (the so-called efficiency property of Shapley values).
| 537 | +
| 538 | +
| 539 | +        Returns
| 540 | +        -------
| 541 | +        cn.array
| 542 | +            The Shapley value estimates for each feature. The final value is the 'null prediction', where all features are turned off. The sum of this array equals the model prediction.
| 543 | +        cn.array
| 544 | +            The standard error of the Shapley value estimates, with respect to `n_samples`. The standard error decreases according to :math:`1/\sqrt{n\_samples}`.
| 545 | +        """  # noqa: E501
| 546 | +        check_is_fitted(self, "is_fitted_")
| 547 | +        return local_shapley_attributions(
| 548 | +            self,
| 549 | +            X,
| 550 | +            X_background,
| 551 | +            random_state,
| 552 | +            n_samples,
| 553 | +            check_efficiency,
| 554 | +        )
| 555 | +
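A matching sketch for `local_attributions`, again with illustrative names and data. The background set is a small slice of the training data, as the docstring recommends, and `check_efficiency=True` exercises the efficiency property described above.

```python
import cunumeric as cn
import legateboost as lb

X = cn.random.random((200, 5))
y = cn.random.random(200)
model = lb.LBRegressor(n_estimators=10).fit(X, y)

# A small random sample of training rows stands in for 'removed' features.
X_background = X[:20]

# Per-prediction attributions for each feature, with the 'null prediction'
# in the final slot. check_efficiency verifies internally that the values
# sum back to the model's predictions on the explained rows.
shap_values, se = model.local_attributions(
    X[:50], X_background, n_samples=10, check_efficiency=True
)
```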
447 | 556 |
448 | 557 | class LBRegressor(LBBase, RegressorMixin):
449 | 558 |     """Implementation of a gradient boosting algorithm for regression problems.
@@ -856,7 +965,7 @@ def predict_proba(self, X: cn.ndarray) -> cn.ndarray:
856 | 965 |         check_is_fitted(self, "is_fitted_")
857 | 966 |         pred = self._objective_instance.transform(super()._predict(X))
858 | 967 |         if pred.shape[1] == 1:
859 | | -            pred = pred.squeeze()
| 968 | +            pred = pred.reshape(-1)
860 | 969 |         pred = cn.stack([1.0 - pred, pred], axis=1)
861 | 970 |
862 | 971 |
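The `predict_proba` change guards against a shape edge case: for a single-row input, `pred` has shape `(1, 1)` and `squeeze()` collapses it to a 0-d scalar, which breaks the subsequent `cn.stack(..., axis=1)`, whereas `reshape(-1)` always yields a 1-d vector. A small sketch of the failure mode, using numpy as a stand-in for the cunumeric API:

```python
import numpy as np

pred = np.array([[0.7]])  # single-row, single-output prediction

squeezed = pred.squeeze()  # shape () -- both length-1 dims removed
flat = pred.reshape(-1)    # shape (1,) -- always 1-d, for any row count

# Stacking 0-d arrays cannot use axis=1; this line raises an axis error:
# np.stack([1.0 - squeezed, squeezed], axis=1)

# The reshape(-1) version keeps the expected (n_rows, 2) output:
print(np.stack([1.0 - flat, flat], axis=1))  # [[0.3 0.7]]
```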