16 | 16 | from .metrics import BaseMetric, metrics
17 | 17 | from .models import BaseModel, Tree
18 | 18 | from .objectives import BaseObjective, objectives
| 19 | +from .shapley import global_shapley_attributions, local_shapley_attributions
19 | 20 | from .utils import PickleCunumericMixin, preround
20 | 21 |
21 | 22 | if TYPE_CHECKING:
@@ -48,6 +49,7 @@ def __init__(
48 | 49 |         self.random_state = random_state
49 | 50 |         self.model_init_: cn.ndarray
50 | 51 |         self.callbacks = callbacks
| 52 | +        self.metrics_: list[BaseMetric]
51 | 53 |         if not isinstance(base_models, tuple):
52 | 54 |             raise ValueError("base_models must be a tuple")
53 | 55 |         self.base_models = base_models
@@ -444,6 +446,113 @@ def dump_models(self) -> str:
444 | 446 |             text += str(m)
445 | 447 |         return text
446 | 448 |
| 449 | +    def global_attributions(
| 450 | +        self,
| 451 | +        X: cn.array,
| 452 | +        y: cn.array,
| 453 | +        metric: Optional[BaseMetric] = None,
| 454 | +        random_state: Optional[np.random.RandomState] = None,
| 455 | +        n_samples: int = 5,
| 456 | +        check_efficiency: bool = False,
| 457 | +    ) -> Tuple[cn.array, cn.array]:
| 458 | +        r"""Compute global feature attributions for the model. Global
| 459 | +        attributions show the effect of a feature on the model's loss function.
| 460 | +
| 461 | +        We use a Shapley value approach to compute the attributions:
| 462 | +        :math:`Sh_i(v)=\frac{1}{|N|!} \sum_{\sigma \in \mathfrak{S}_{|N|}} \big[ v([\sigma]_{i-1} \cup \{i\}) - v([\sigma]_{i-1}) \big],`
| 463 | +        where :math:`v` is the model's loss function, :math:`N` is the set of features, and :math:`\mathfrak{S}_{|N|}` is the set of all permutations of the features.
| 464 | +        :math:`[\sigma]_{i-1}` denotes the set of features ranked lower than :math:`i` in the ordering :math:`\sigma`.
| 465 | +
| 466 | +        In effect, the Shapley value shows the effect of adding a feature to the model, averaged over all possible orderings of the features. Here the above sum is approximated using an antithetic sampling method [#]_, where `n_samples` is the number of pairs of permutation samples drawn. The method also returns the standard error of the estimates, which decreases according to :math:`1/\sqrt{n\_samples}`.
| 467 | +
| 468 | +        This definition of attributions requires removing a feature from the active set. We use a random sample of values from X to fill in the missing feature values. This choice of background distribution corresponds to the 'interventional' Shapley value approach discussed in [#]_.
| 469 | +
| 470 | +
| 471 | +        .. [#] Mitchell, Rory, et al. "Sampling permutations for Shapley value estimation." Journal of Machine Learning Research 23.43 (2022): 1-46.
| 472 | +        .. [#] Covert, Ian, Scott M. Lundberg, and Su-In Lee. "Understanding global feature contributions with additive importance measures." Advances in Neural Information Processing Systems 33 (2020): 17212-17223.
| 473 | +
| 474 | +        The method uses memory (and time) proportional to :math:`n\_samples \times n\_features \times n\_background\_samples`. Reduce the number of background samples or the size of X to speed up computation and reduce memory usage; X does not need to be the entire training set to get useful estimates.
| 475 | +
| 476 | +        See the method :func:`~legateboost.LBBase.local_attributions` for the effect of features on individual prediction outputs.
| 477 | +
| 478 | +        Parameters
| 479 | +        ----------
| 480 | +        X : cn.array
| 481 | +            The input data.
| 482 | +        y : cn.array
| 483 | +            The target values.
| 484 | +        metric : BaseMetric, optional
| 485 | +            The metric used to evaluate the model. If None, the model's default metric is used.
| 486 | +        random_state : np.random.RandomState, optional
| 487 | +            The random state for reproducibility.
| 488 | +        n_samples : int, optional
| 489 | +            The number of sample pairs to use in the antithetic sampling method.
| 490 | +        check_efficiency : bool, optional
| 491 | +            If True, check that the Shapley values plus the null coalition loss add up to the final loss for X, y (the so-called efficiency property of Shapley values).
| 492 | +
| 493 | +        Returns
| 494 | +        -------
| 495 | +        cn.array
| 496 | +            The Shapley value estimates for each feature. The last value is the null coalition loss. The sum of this array equals the loss for X, y.
| 497 | +        cn.array
| 498 | +            The standard error of the Shapley value estimates, with respect to `n_samples`. The standard error decreases according to :math:`1/\sqrt{n\_samples}`.
| 499 | +        """  # noqa: E501
| 500 | +        check_is_fitted(self, "is_fitted_")
| 501 | +        return global_shapley_attributions(
| 502 | +            self,
| 503 | +            X,
| 504 | +            y,
| 505 | +            metric,
| 506 | +            random_state,
| 507 | +            n_samples,
| 508 | +            check_efficiency,
| 509 | +        )
| 510 | +
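As a quick orientation for reviewers, here is a minimal usage sketch of the new `global_attributions` API. The estimator choice, synthetic data, and hyperparameters are illustrative only (they are not part of this diff); any fitted legateboost estimator deriving from `LBBase` should behave the same way.

```python
import cunumeric as cn
import legateboost as lb

# Illustrative synthetic data; X is (n_rows, n_features).
X = cn.random.random((100, 5))
y = cn.random.random(100)

model = lb.LBRegressor(n_estimators=10).fit(X, y)

# One Shapley estimate per feature, with the null-coalition loss in the
# last slot; `se` holds the corresponding standard errors.
attributions, se = model.global_attributions(X, y, n_samples=10)

# Per the docstring, the attributions (including the null term) sum to
# the model's loss on (X, y).
print(attributions.sum())
```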
| 511 | +    def local_attributions(
| 512 | +        self,
| 513 | +        X: cn.array,
| 514 | +        X_background: cn.array,
| 515 | +        random_state: Optional[np.random.RandomState] = None,
| 516 | +        n_samples: int = 5,
| 517 | +        check_efficiency: bool = False,
| 518 | +    ) -> Tuple[cn.array, cn.array]:
| 519 | +        r"""Local feature attributions for model predictions. Shows the effect
| 520 | +        of a feature on each output prediction. See the definition of Shapley
| 521 | +        values in :func:`~legateboost.LBBase.global_attributions`; here
| 522 | +        :math:`v` is the model prediction function instead of the loss
| 523 | +        function.
| 524 | +
| 525 | +        Parameters
| 526 | +        ----------
| 527 | +        X : cn.array
| 528 | +            The input data.
| 529 | +        X_background : cn.array
| 530 | +            The background data used to fill in missing feature values. This could be a random sample of training data (e.g. 10-100 instances).
| 531 | +        random_state : np.random.RandomState, optional
| 532 | +            The random state for reproducibility.
| 533 | +        n_samples : int, optional
| 534 | +            The number of sample pairs to use in the antithetic sampling method.
| 535 | +        check_efficiency : bool, optional
| 536 | +            If True, check that the Shapley values plus the null prediction add up to the final predictions for X (the so-called efficiency property of Shapley values).
| 537 | +
| 538 | +
| 539 | +        Returns
| 540 | +        -------
| 541 | +        cn.array
| 542 | +            The Shapley value estimates for each feature. The final value is the 'null prediction', where all features are turned off. The sum of this array equals the model prediction.
| 543 | +        cn.array
| 544 | +            The standard error of the Shapley value estimates, with respect to `n_samples`. The standard error decreases according to :math:`1/\sqrt{n\_samples}`.
| 545 | +        """  # noqa: E501
| 546 | +        check_is_fitted(self, "is_fitted_")
| 547 | +        return local_shapley_attributions(
| 548 | +            self,
| 549 | +            X,
| 550 | +            X_background,
| 551 | +            random_state,
| 552 | +            n_samples,
| 553 | +            check_efficiency,
| 554 | +        )
| 555 | +
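A matching sketch for `local_attributions`, again with illustrative names and data. The background set is a small slice of the training data, as the docstring recommends, and `check_efficiency=True` exercises the efficiency property described above.

```python
import cunumeric as cn
import legateboost as lb

X = cn.random.random((200, 5))
y = cn.random.random(200)
model = lb.LBRegressor(n_estimators=10).fit(X, y)

# A small random sample of training rows stands in for 'removed' features.
X_background = X[:20]

# Per-prediction attributions for each feature, with the 'null prediction'
# in the final slot. check_efficiency verifies internally that the values
# sum back to the model's predictions on the explained rows.
shap_values, se = model.local_attributions(
    X[:50], X_background, n_samples=10, check_efficiency=True
)
```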
447 | 556 |
448 | 557 | class LBRegressor(LBBase, RegressorMixin):
449 | 558 |     """Implementation of a gradient boosting algorithm for regression problems.
@@ -856,7 +965,7 @@ def predict_proba(self, X: cn.ndarray) -> cn.ndarray:
856 | 965 |         check_is_fitted(self, "is_fitted_")
857 | 966 |         pred = self._objective_instance.transform(super()._predict(X))
858 | 967 |         if pred.shape[1] == 1:
859 | | -            pred = pred.squeeze()
| 968 | +            pred = pred.reshape(-1)
860 | 969 |         pred = cn.stack([1.0 - pred, pred], axis=1)
861 | 970 |
862 | 971 |
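The `predict_proba` change guards against a shape edge case: for a single-row input, `pred` has shape `(1, 1)` and `squeeze()` collapses it to a 0-d scalar, which breaks the subsequent `cn.stack(..., axis=1)`, whereas `reshape(-1)` always yields a 1-d vector. A small sketch of the failure mode, using numpy as a stand-in for the cunumeric API:

```python
import numpy as np

pred = np.array([[0.7]])  # single-row, single-output prediction

squeezed = pred.squeeze()  # shape () -- both length-1 dims removed
flat = pred.reshape(-1)    # shape (1,) -- always 1-d, for any row count

# Stacking 0-d arrays cannot use axis=1; this line raises an axis error:
# np.stack([1.0 - squeezed, squeezed], axis=1)

# The reshape(-1) version keeps the expected (n_rows, 2) output:
print(np.stack([1.0 - flat, flat], axis=1))  # [[0.3 0.7]]
```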