Commit 2293a7c

Implement Shapley values (#111)
1 parent 1bd3dda commit 2293a7c

File tree

8 files changed: +347 -9 lines changed
(new binary file, tracked via Git LFS)

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:74db56d0d128cada8bf0edebd8cb5e519f6603abe2b2be7564696a8d18d8c2c6
+size 180122

legateboost/legateboost.py (+110 -1)
@@ -16,6 +16,7 @@
 from .metrics import BaseMetric, metrics
 from .models import BaseModel, Tree
 from .objectives import BaseObjective, objectives
+from .shapley import global_shapley_attributions, local_shapley_attributions
 from .utils import PickleCunumericMixin, preround
 
 if TYPE_CHECKING:
@@ -48,6 +49,7 @@ def __init__(
         self.random_state = random_state
         self.model_init_: cn.ndarray
         self.callbacks = callbacks
+        self.metrics_: list[BaseMetric]
         if not isinstance(base_models, tuple):
             raise ValueError("base_models must be a tuple")
         self.base_models = base_models
@@ -444,6 +446,113 @@ def dump_models(self) -> str:
             text += str(m)
         return text
 
+    def global_attributions(
+        self,
+        X: cn.array,
+        y: cn.array,
+        metric: Optional[BaseMetric] = None,
+        random_state: Optional[np.random.RandomState] = None,
+        n_samples: int = 5,
+        check_efficiency: bool = False,
+    ) -> Tuple[cn.array, cn.array]:
+        r"""Compute global feature attributions for the model. Global
+        attributions show the effect of a feature on a model's loss function.
+
+        We use a Shapley value approach to compute the attributions:
+
+        :math:`Sh_i(v)=\frac{1}{|N|!} \sum_{\sigma \in \mathfrak{S}_d} \big[ v([\sigma]_{i-1} \cup\{i\}) - v([\sigma]_{i-1}) \big],`
+
+        where :math:`v` is the model's loss function, :math:`N` is the set of features, and :math:`\mathfrak{S}_d` is the set of all permutations of the features.
+        :math:`[\sigma]_{i-1}` represents the set of features ranked lower than :math:`i` in the ordering :math:`\sigma`.
+
+        In effect, the Shapley value shows the effect of adding a feature to the model, averaged over all possible orderings of the features. In our case the above function is approximated using an antithetic-sampling method [#]_, where `n_samples` corresponds to pairs of permutation samples. This method also returns the standard error, which decreases according to :math:`1/\sqrt{n\_samples}`.
+
+        This definition of attributions requires removing a feature from the active set. We use a random sample of values from X to fill in the missing feature values. This choice of background distribution corresponds to an 'interventional' Shapley value approach discussed in [#]_.
+
+        .. [#] Mitchell, Rory, et al. "Sampling permutations for Shapley value estimation." Journal of Machine Learning Research 23.43 (2022): 1-46.
+        .. [#] Covert, Ian, Scott M. Lundberg, and Su-In Lee. "Understanding global feature contributions with additive importance measures." Advances in Neural Information Processing Systems 33 (2020): 17212-17223.
+
+        The method uses memory (and time) proportional to :math:`n\_samples \times n\_features \times n\_background\_samples`. Reduce the number of background samples or the size of X to speed up computation and reduce memory usage. X does not need to be the entire training set to get useful estimates.
+
+        See the method :func:`~legateboost.BaseModel.local_attributions` for the effect of features on individual prediction outputs.
+
+        Parameters
+        ----------
+        X : cn.array
+            The input data.
+        y : cn.array
+            The target values.
+        metric : BaseMetric, optional
+            The metric to evaluate the model. If None, the model default metric is used.
+        random_state : int, optional
+            The random state for reproducibility.
+        n_samples : int, optional
+            The number of sample pairs to use in the antithetic sampling method.
+        check_efficiency : bool, optional
+            If True, check that Shapley values plus the null coalition loss add up to the final loss for X, y (the so-called efficiency property of Shapley values).
+
+        Returns
+        -------
+        cn.array
+            The Shapley value estimates for each feature. The last value is the null coalition loss. The sum of this array results in the loss for X, y.
+        cn.array
+            The standard error of the Shapley value estimates, with respect to `n_samples`. The standard error decreases according to :math:`1/\sqrt{n\_samples}`.
+        """  # noqa: E501
+        check_is_fitted(self, "is_fitted_")
+        return global_shapley_attributions(
+            self,
+            X,
+            y,
+            metric,
+            random_state,
+            n_samples,
+            check_efficiency,
+        )
+
+    def local_attributions(
+        self,
+        X: cn.array,
+        X_background: cn.array,
+        random_state: Optional[np.random.RandomState] = None,
+        n_samples: int = 5,
+        check_efficiency: bool = False,
+    ) -> Tuple[cn.array, cn.array]:
+        r"""Local feature attributions for model predictions. Shows the effect
+        of a feature on each output prediction. See the definition of Shapley
+        values in :func:`~legateboost.BaseModel.global_attributions`, where the
+        :math:`v` function is here the model prediction instead of the loss
+        function.
+
+        Parameters
+        ----------
+        X : cn.array
+            The input data.
+        X_background : cn.array
+            The background data to use for missing feature values. This could be a random sample of training data (e.g. between 10-100 instances).
+        random_state : int, optional
+            The random state for reproducibility.
+        n_samples : int
+            The number of sample pairs to use in the antithetic sampling method.
+        check_efficiency : bool
+            If True, check that Shapley values plus the null prediction add up to the final predictions for X (the so-called efficiency property of Shapley values).
+
+        Returns
+        -------
+        cn.array
+            The Shapley value estimates for each feature. The final value is the 'null prediction', where all features are turned off. The sum of this array results in the model prediction.
+        cn.array
+            The standard error of the Shapley value estimates, with respect to `n_samples`. The standard error decreases according to :math:`1/\sqrt{n\_samples}`.
+        """  # noqa: E501
+        check_is_fitted(self, "is_fitted_")
+        return local_shapley_attributions(
+            self,
+            X,
+            X_background,
+            random_state,
+            n_samples,
+            check_efficiency,
+        )
 
 class LBRegressor(LBBase, RegressorMixin):
     """Implementation of a gradient boosting algorithm for regression problems.
@@ -856,7 +965,7 @@ def predict_proba(self, X: cn.ndarray) -> cn.ndarray:
         check_is_fitted(self, "is_fitted_")
         pred = self._objective_instance.transform(super()._predict(X))
         if pred.shape[1] == 1:
-            pred = pred.squeeze()
+            pred = pred.reshape(-1)
         pred = cn.stack([1.0 - pred, pred], axis=1)
         return pred

legateboost/shapley.py (+134)
@@ -0,0 +1,134 @@
+from typing import TYPE_CHECKING, Optional, Tuple
+
+import numpy as np
+from sklearn.base import is_regressor
+from sklearn.utils.validation import check_random_state
+
+import cunumeric as cn
+
+from .metrics import BaseMetric
+
+# provide definitions for mypy without circular import at runtime
+if TYPE_CHECKING:
+    from .legateboost import LBBase
+
+
+def global_shapley_attributions(
+    model: "LBBase",
+    X: cn.array,
+    y: cn.array,
+    metric_in: Optional[BaseMetric] = None,
+    random_state: Optional[np.random.RandomState] = None,
+    n_samples: int = 5,
+    check_efficiency: bool = False,
+) -> Tuple[cn.array, cn.array]:
+    def predict_fn(X: cn.array) -> cn.array:
+        fn = model.predict if is_regressor(model) else model.predict_proba
+        return fn(X)
+
+    metric = metric_in if metric_in is not None else model._metrics[0]
+
+    random_state_ = check_random_state(random_state)
+    w = cn.ones(y.shape[0])
+    gen = cn.random.default_rng(seed=random_state_.randint(2**32))
+
+    # antithetic sampling
+    v_a = cn.empty((X.shape[1] + 1, n_samples))
+    v_b = cn.empty((X.shape[1] + 1, n_samples))
+
+    def eval_sample(p: cn.array, v: cn.array) -> None:
+        # cunumeric has no shuffle as of writing
+        # with replacement should be fine
+        X_temp = X[gen.integers(0, X.shape[0], X.shape[0])]
+        null_loss = metric.metric(y, predict_fn(X_temp), w)
+        v[-1] = null_loss
+        previous_loss = null_loss
+        for feature in p:
+            X_temp[:, feature] = X[:, feature]
+            loss = metric.metric(y, predict_fn(X_temp), w)
+            v[feature] = loss - previous_loss
+            previous_loss = loss
+
+    for i in range(n_samples):
+        p_a = random_state_.permutation(X.shape[1])
+        p_b = cn.flip(p_a)
+        eval_sample(p_a, v_a[:, i])
+        eval_sample(p_b, v_b[:, i])
+
+    v = (v_a + v_b) / 2
+    shapley_values = cn.mean(v, axis=1)
+    se = cn.std(v, axis=1, ddof=1) / cn.sqrt(n_samples)
+
+    if check_efficiency:
+        full_coalition_loss = metric.metric(y, predict_fn(X), cn.ones(y.shape[0]))
+        if not cn.isclose(cn.sum(shapley_values), full_coalition_loss):
+            raise ValueError("Shapley values do not sum up to the full coalition loss")
+    return shapley_values, se
+
+
+def local_shapley_attributions(
+    model: "LBBase",
+    X: cn.array,
+    X_background: cn.array,
+    random_state: Optional[np.random.RandomState] = None,
+    n_samples: int = 5,
+    check_efficiency: bool = False,
+) -> Tuple[cn.array, cn.array]:
+    def predict_fn(X: cn.array) -> cn.array:
+        fn = model.predict if is_regressor(model) else model.predict_proba
+        p = fn(X)
+        if p.ndim == 1:
+            return p.reshape(-1, 1)
+        return p
+
+    random_state_ = check_random_state(random_state)
+    n_background_samples = 5
+    gen = cn.random.default_rng(seed=random_state_.randint(2**32))
+
+    n_outputs = predict_fn(X[:1, ...]).shape[1]
+
+    # antithetic sampling
+    # perhaps we can do a running mean/se to avoid the last dimension
+    v_a = cn.zeros((X.shape[0], X.shape[1] + 1, n_outputs, n_samples))
+    v_b = cn.zeros((X.shape[0], X.shape[1] + 1, n_outputs, n_samples))
+
+    def eval_sample(p: cn.array, v: cn.array) -> None:
+        X_temp = X_background[
+            gen.integers(
+                0, X_background.shape[0], size=(X.shape[0], n_background_samples)
+            )
+        ]
+        null_pred = predict_fn(X_temp.reshape(X.shape[0] * n_background_samples, -1))
+        v[:, -1, :, i] = null_pred.reshape(
+            X.shape[0], n_background_samples, n_outputs
+        ).mean(axis=1)
+        previous_pred = null_pred
+        for feature in p:
+            X_temp[:, :, feature] = X[:, cn.newaxis, feature]
+            pred = predict_fn(X_temp.reshape(X.shape[0] * n_background_samples, -1))
+            v[:, feature, :, i] = (
+                (pred - previous_pred)
+                .reshape(X.shape[0], n_background_samples, n_outputs)
+                .mean(axis=1)
+            )
+            previous_pred = pred
+
+    for i in range(n_samples):
+        p_a = random_state_.permutation(X.shape[1])
+        p_b = cn.flip(p_a)
+        eval_sample(p_a, v_a)
+        eval_sample(p_b, v_b)
+
+    v = (v_a + v_b) / 2
+    shapley_values = cn.mean(v, axis=-1)
+    se = cn.std(v, axis=-1, ddof=1) / cn.sqrt(n_samples)
+
+    if check_efficiency:
+        pred = predict_fn(X)
+        if not cn.allclose(cn.sum(shapley_values, axis=1), pred):
+            raise ValueError("Shapley values do not sum up to the predictions")
+
+    if n_outputs == 1:
+        shapley_values = shapley_values[:, :, 0]
+        se = se[:, :, 0]
+    return shapley_values, se
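
The estimator in eval_sample is easiest to verify on a game whose Shapley values are known in closed form. The following self-contained NumPy sketch (illustrative, not part of the commit) runs the same antithetic permutation scheme on an additive set function, where each feature's exact Shapley value is its own weight and every sampled permutation recovers it exactly:

import numpy as np

rng = np.random.default_rng(0)
x = np.array([3.0, -1.0, 0.5, 2.0])  # exact Shapley values of the game below
d = len(x)

def v(mask):
    # additive game: a coalition is worth the sum of its members' weights
    return x[mask].sum()

n_samples = 50  # pairs of antithetic permutations
contrib = np.zeros((d, 2 * n_samples))
for s in range(n_samples):
    p = rng.permutation(d)
    for k, perm in enumerate((p, p[::-1])):  # a permutation and its reverse
        mask = np.zeros(d, dtype=bool)
        prev = v(mask)  # null coalition
        for feature in perm:
            mask[feature] = True  # add the feature to the coalition
            cur = v(mask)
            contrib[feature, 2 * s + k] = cur - prev  # marginal contribution
            prev = cur

shapley = contrib.mean(axis=1)
se = contrib.std(axis=1, ddof=1) / np.sqrt(contrib.shape[1])
assert np.allclose(shapley, x)  # additive game: every permutation is exact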

legateboost/test/test_shapley.py (+92)
@@ -0,0 +1,92 @@
+import pytest
+from sklearn.datasets import make_classification, make_regression
+
+import cunumeric as cn
+import legateboost as lb
+
+
+@pytest.mark.parametrize("random_state", range(2))
+@pytest.mark.parametrize("metric", [None, lb.metrics.MSEMetric()])
+@pytest.mark.parametrize("num_outputs", [1, 2])
+def test_regressor_global_shapley_attributions(random_state, metric, num_outputs):
+    X, y = make_regression(random_state=10, n_features=10, n_targets=num_outputs)
+    model = lb.LBRegressor(n_estimators=5).fit(X, y)
+    shapley, se = model.global_attributions(
+        X,
+        y,
+        metric,
+        n_samples=20,
+        random_state=random_state,
+        check_efficiency=True,
+    )
+    assert cn.isfinite(shapley).all()
+    assert cn.isfinite(se).all()
+    assert (se >= 0).all()
+
+
+@pytest.mark.parametrize("metric", [None, lb.metrics.ExponentialMetric()])
+@pytest.mark.parametrize("num_classes", [2, 3])
+def test_classifier_global_shapley_attributions(metric, num_classes):
+    X, y = make_classification(
+        random_state=10, n_features=10, n_classes=num_classes, n_clusters_per_class=1
+    )
+    model = lb.LBClassifier(n_estimators=5, random_state=9).fit(X, y)
+    shapley, se = model.global_attributions(
+        X,
+        y,
+        metric,
+        random_state=9,
+        check_efficiency=True,
+    )
+    assert cn.isfinite(shapley).all()
+    assert cn.isfinite(se).all()
+    assert (se >= 0).all()
+
+
+@pytest.mark.parametrize("random_state", range(2))
+@pytest.mark.parametrize("num_outputs", [1, 2])
+def test_regressor_local_shapley_attributions(random_state, num_outputs):
+    X, y = make_regression(random_state=10, n_features=10, n_targets=num_outputs)
+    model = lb.LBRegressor(n_estimators=5, random_state=random_state).fit(X, y)
+    X_background = X[:10]
+    shapley, se = model.local_attributions(
+        X,
+        X_background,
+        random_state=random_state,
+        check_efficiency=True,
+    )
+    if num_outputs > 1:
+        assert shapley.shape == (X.shape[0], X.shape[1] + 1, num_outputs)
+    else:
+        assert shapley.shape == (X.shape[0], X.shape[1] + 1)
+    assert cn.isfinite(shapley).all()
+    assert cn.isfinite(se).all()
+    assert (se >= 0).all()
+
+
+@pytest.mark.parametrize("random_state", range(2))
+@pytest.mark.parametrize("num_classes", [2, 3])
+def test_classifier_local_shapley_attributions(random_state, num_classes):
+    X, y = make_classification(
+        random_state=10, n_features=10, n_classes=num_classes, n_clusters_per_class=1
+    )
+    model = lb.LBClassifier(n_estimators=5, random_state=random_state).fit(X, y)
+    X_background = X[:10]
+    shapley, se = model.local_attributions(
+        X,
+        X_background,
+        random_state=random_state,
+        check_efficiency=True,
+    )
+    assert shapley.shape == (X.shape[0], X.shape[1] + 1, num_classes)
+    assert cn.isfinite(shapley).all()
+    assert cn.isfinite(se).all()
+    assert (se >= 0).all()
+
+    # Do a single row
+    shapley, se = model.local_attributions(
+        X[:1],
+        X_background,
+        random_state=random_state,
+        check_efficiency=True,
+    )
