feat: Add return_all argument to Boot.predict
saattrupdan committed Apr 23, 2023
1 parent 1bf7d18 commit 54e8a06
Showing 2 changed files with 71 additions and 20 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
@@ -8,6 +8,14 @@ and this project adheres to
[Semantic Versioning](http://semver.org/spec/v2.0.0.html).


## [Unreleased]
### Added
- Added a `return_all` argument to the `Boot.predict` method, which overrides the
  `uncertainty` and `quantiles` arguments and returns the raw bootstrap distribution
  over which the quantiles would normally be computed. This allows the bootstrap
  distribution to be used for purposes other than computing prediction intervals,
  as sketched below.
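  As a rough sketch (mirroring the doctest added to the `Boot` docstring in this
  commit, and assuming a fitted model called `linreg`), the new argument can be used
  as follows:

      >>> preds, bootstrap_samples = linreg.predict(
      ...     [10, 30, 1000, 50], return_all=True
      ... )

  The returned samples can then be used however needed, for instance
  `preds + np.quantile(bootstrap_samples, 0.9)` for a one-sided upper bound.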


## [v4.3.1] - 2023-03-20
### Fixed
- Previously, all the trees in `QuantileRegressionForest` were the same. This has now
83 changes: 63 additions & 20 deletions src/doubt/models/boot/boot.py
@@ -3,10 +3,11 @@
import copy
import multiprocessing as mp
from types import MethodType
-from typing import Callable, Optional, Tuple, Union
+from typing import Callable, List, Optional, Tuple, Union

import numpy as np
from joblib import Parallel, delayed
from numpy.typing import NDArray


class Boot:
@@ -48,9 +49,7 @@ class Boot:
Alternatively, we can output the whole bootstrap distribution::
>>> preds, intervals = boot.compute_statistic(
-...     np.mean,
-...     n_boots=3,
-...     return_all=True
+...     np.mean, n_boots=3, return_all=True
... )
>>> round(preds, 2)
4.06
@@ -69,6 +68,24 @@ class Boot:
>>> np.around(intervals, 2)
array([473.5 , 490.14])
Instead of specifying the uncertainty, we can also specify the precise
quantiles that we are interested in::
>>> preds, intervals = linreg.predict(
... [10, 30, 1000, 50], quantiles=[0.1, 0.5]
... )
>>> np.around(intervals, 2)
array([476.45, 481.82])
Lastly, as with the `compute_statistic` method, we can also output the whole
bootstrap distribution with the `return_all` argument::
>>> preds, bootstrap_samples = linreg.predict(
... [10, 30, 1000, 50], return_all=True
... )
>>> np.around(bootstrap_samples, 2)
array([-43.7 , -10.02, -8.63, ..., 8.28, 9.06, 10.19])
Sources:
[1]: Friedman, J., Hastie, T., & Tibshirani, R. (2001). The elements of
statistical learning (Vol. 1, No. 10). New York: Springer series in
@@ -144,7 +161,7 @@ def compute_statistic(
statistic: Callable[[np.ndarray], float],
n_boots: Optional[int] = None,
uncertainty: float = 0.05,
-quantiles: Optional[np.ndarray] = None,
+quantiles: Optional[Union[np.ndarray, List[float]]] = None,
return_all: bool = False,
) -> Union[float, Tuple[float, np.ndarray]]:
"""Compute bootstrapped statistic.
@@ -210,8 +227,9 @@ def predict(
X: np.ndarray,
n_boots: Optional[int] = None,
uncertainty: Optional[float] = None,
-quantiles: Optional[np.ndarray] = None,
-) -> Tuple[Union[float, np.ndarray], np.ndarray]:
+quantiles: Optional[Union[np.ndarray, List[float]]] = None,
+return_all: bool = False,
+) -> Union[Union[float, NDArray], Tuple[Union[float, NDArray], NDArray]]:
"""Compute bootstrapped predictions.
Args:
@@ -223,17 +241,21 @@
root of the data set. Defaults to None
uncertainty (float or None, optional):
The uncertainty used to compute the prediction interval of the bootstrapped
-prediction. If None then no prediction intervals are returned. Defaults to
-None.
+prediction. For instance, an uncertainty of 0.05 will yield a 95%
+prediction interval. Will not be used if `quantiles` or `return_all` is
+used. Defaults to None.
quantiles (sequence of floats or None, optional):
-List of quantiles to output, as an alternative to the `uncertainty`
-argument, and will not be used if that argument is set. If None then
-`uncertainty` is used. Defaults to None.
+List of quantiles to output. Will override the value of `uncertainty` if
+specified. Will not be used if `return_all` is True. Defaults to None.
return_all (bool, optional):
Whether the raw bootstrapped predictions should be returned. Will override
the values of both `quantiles` and `uncertainty`. Defaults to False.
Returns:
float array or pair of float arrays:
The bootstrapped predictions, and the confidence intervals if `uncertainty`
-is not None, or the specified quantiles if `quantiles` is not None.
+is not None, the specified quantiles if `quantiles` is not None, or the raw
+bootstrapped values if `return_all` is True.
"""
# Initialise random number generator
rng = np.random.default_rng(self.random_seed)
@@ -252,15 +274,15 @@ def predict(

# If no quantiles should be outputted then simply return the predictions of the
# underlying model
-if uncertainty is None and quantiles is None:
+if uncertainty is None and quantiles is None and not return_all:
return preds

# Ensure that the underlying model has been fitted before predicting. This is only
# a requirement if `uncertainty` is set, as we need access to `self.X_train`
if not hasattr(self, "X_train") or self.X_train is None:
raise RuntimeError(
"This model has not been fitted yet! Call fit() "
"before predicting new samples."
"This model has not been fitted yet! Call fit() before predicting new "
"samples."
)

# Store the number of data points in the training and test datasets
@@ -295,16 +317,37 @@ def predict(
# Add up the bootstrap predictions and the hybrid train/val residuals
C = np.array([m + o for m in bootstrap_preds for o in self.residuals])
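# Each column of `C` corresponds to one test sample; its quantiles are added to
# `preds` further down to form the prediction intervals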

-# Calculate the desired quantiles
+# If we are dealing with a single sample then we replace the predictions with the
+# first sample
+if onedim:
+    preds = preds[0]

# Next, we compute the associated uncertainty data, if requested. Firstly, if
# `return_all` is set then we return the raw bootstrapped predictions, being the
# `C` array
if return_all:
if onedim:
return preds, C[:, 0]
else:
return preds, C

# If the uncertainty has been specified and not the quantiles, then we compute
# the quantiles from the uncertainty
if quantiles is None and uncertainty is not None:
quantiles = [uncertainty / 2, 1 - uncertainty / 2]
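# For example, an uncertainty of 0.05 yields the quantiles [0.025, 0.975],
# corresponding to a central 95% prediction interval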

# Compute the quantiles of the `C` array
quantile_vals = np.transpose(np.quantile(C, q=quantiles or [], axis=0))
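# Assuming `C` is two-dimensional as constructed above, `quantile_vals` ends up with
# shape (n_test, n_quantiles) after the transpose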

-# Return the predictions and the desired quantiles
+# Add the quantiles to the predictions to get the final prediction intervals
+# as the uncertainty data
if onedim:
-    return preds[0], (preds + quantile_vals)[0]
+    intervals = preds + quantile_vals[0]
else:
-    return preds, np.expand_dims(preds, axis=1) + quantile_vals
+    intervals = np.expand_dims(preds, axis=1) + quantile_vals

# Return the predictions and the prediction intervals
return preds, intervals


def fit(self, X: np.ndarray, y: np.ndarray, n_boots: Optional[int] = None):
