From 460bcc776df5bf38ce44a2c7664d6b005848c7d3 Mon Sep 17 00:00:00 2001
From: Max Kuhn
Date: Tue, 16 May 2023 13:36:58 -0400
Subject: [PATCH 1/6] initial files

---
 .../survival-metrics/figs/RKM-1.svg           | 825 ++++++++++++++++++
 .../survival-metrics/figs/brier-scores-1.svg  | 153 ++++
 .../survival-metrics/figs/roc-5-1.svg         |  88 ++
 .../survival-metrics/figs/roc-scores-1.svg    | 146 ++++
 .../survival-metrics/figs/surv-hist-05-1.svg  | 190 ++++
 .../survival-metrics/figs/surv-plot-1.svg     |  84 ++
 .../survival-metrics/figs/usable-data-1.svg   |  79 ++
 .../survival-metrics/index.Rmarkdown.Rmd      | 335 +++++++
 .../survival-metrics/index.Rmarkdown.html     | 322 +++++++
 9 files changed, 2222 insertions(+)
 create mode 100644 content/learn/statistics/survival-metrics/figs/RKM-1.svg
 create mode 100644 content/learn/statistics/survival-metrics/figs/brier-scores-1.svg
 create mode 100644 content/learn/statistics/survival-metrics/figs/roc-5-1.svg
 create mode 100644 content/learn/statistics/survival-metrics/figs/roc-scores-1.svg
 create mode 100644 content/learn/statistics/survival-metrics/figs/surv-hist-05-1.svg
 create mode 100644 content/learn/statistics/survival-metrics/figs/surv-plot-1.svg
 create mode 100644 content/learn/statistics/survival-metrics/figs/usable-data-1.svg
 create mode 100644 content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd
 create mode 100644 content/learn/statistics/survival-metrics/index.Rmarkdown.html

diff --git a/content/learn/statistics/survival-metrics/figs/RKM-1.svg b/content/learn/statistics/survival-metrics/figs/RKM-1.svg
new file mode 100644
index 00000000..3f4b484b
--- /dev/null
+++ b/content/learn/statistics/survival-metrics/figs/RKM-1.svg
@@ -0,0 +1,825 @@
+[SVG figure omitted: step plot of the probability of being censored (0.00-1.00) over time (0-15)]

diff --git a/content/learn/statistics/survival-metrics/figs/brier-scores-1.svg b/content/learn/statistics/survival-metrics/figs/brier-scores-1.svg
new file mode 100644
index 00000000..aa7e2698
--- /dev/null
+++ b/content/learn/statistics/survival-metrics/figs/brier-scores-1.svg
@@ -0,0 +1,153 @@
+[SVG figure omitted: Brier score (0.00-0.25) over time (0-15)]

diff --git a/content/learn/statistics/survival-metrics/figs/roc-5-1.svg b/content/learn/statistics/survival-metrics/figs/roc-5-1.svg
new file mode 100644
index 00000000..d8ac4fc8
--- /dev/null
+++ b/content/learn/statistics/survival-metrics/figs/roc-5-1.svg
@@ -0,0 +1,88 @@
+[SVG figure omitted: ROC curve, sensitivity versus 1 - specificity, both axes 0.00-1.00]

diff --git a/content/learn/statistics/survival-metrics/figs/roc-scores-1.svg b/content/learn/statistics/survival-metrics/figs/roc-scores-1.svg
new file mode 100644
index 00000000..7f2614a3
--- /dev/null
+++ b/content/learn/statistics/survival-metrics/figs/roc-scores-1.svg
@@ -0,0 +1,146 @@
+[SVG figure omitted: ROC AUC (0.25-1.00) over time (0-15)]

diff --git a/content/learn/statistics/survival-metrics/figs/surv-hist-05-1.svg b/content/learn/statistics/survival-metrics/figs/surv-hist-05-1.svg
new file mode 100644
index 00000000..8454ca8b
--- /dev/null
+++ b/content/learn/statistics/survival-metrics/figs/surv-hist-05-1.svg
@@ -0,0 +1,190 @@
+[SVG figure omitted: histograms of Probability of Survival (0.00-1.00) against Sum of Weights (0-30), faceted by non-event and event]

diff --git a/content/learn/statistics/survival-metrics/figs/surv-plot-1.svg b/content/learn/statistics/survival-metrics/figs/surv-plot-1.svg
new file mode 100644
index 00000000..a083b00b
--- /dev/null
+++ b/content/learn/statistics/survival-metrics/figs/surv-plot-1.svg
@@ -0,0 +1,84 @@
+[SVG figure omitted: survival probability (0.00-1.00) over time (0-15)]

diff --git a/content/learn/statistics/survival-metrics/figs/usable-data-1.svg b/content/learn/statistics/survival-metrics/figs/usable-data-1.svg
new file mode 100644
index 00000000..32570a00
--- /dev/null
+++ b/content/learn/statistics/survival-metrics/figs/usable-data-1.svg
@@ -0,0 +1,79 @@
+[SVG figure omitted: Number of Usable Samples (0-250) over time (0-15)]

diff --git a/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd b/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd
new file mode 100644
index 00000000..8bc8ff79
--- /dev/null
+++ b/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd
@@ -0,0 +1,335 @@
+---
+title: "Dynamic
Performance Metrics for Event Time Data" +tags: [survival,censored,parsnip,yardstick] +categories: [statistical analysis] +type: learn-subsection +weight: 9 +description: | + Let's discuss how to compute modern performance metrics for time-to-event models. +--- + +```{r setup, include = FALSE, message = FALSE, warning = FALSE} +source(here::here("content/learn/common.R")) +``` + +```{r load, include = FALSE} +library(tidymodels) +pkgs <- c("tidymodels", "censored", "prodlim") +theme_set(theme_bw() + theme(legend.position = "top")) +tidymodels_prefer() +options(pillar.advice = FALSE, pillar.min_title_chars = Inf) +``` + +`r req_pkgs(pkgs)` You'll need the development versions of censored and parsnip. To install these, use + +```r +library(pak) + +pak::pak(c("tidymodels/censored", "tidymodels/parsnip")) +``` + +## Introduction + + +One trend in modern survival analysis is to compute time-dependent measures of performance. These are primarily driven by an increased focus on predictions for the probability of survival at a given time (as opposed to the predictions of event times or linear predictors). Since these are conditional on the time of evaluation, we call the dynamic performance metrics. + +Before we delve into the details, it is important to understand the inverse probability of censoring weights (IPCW) scheme and computations. This is described in [another article](https://www.tidymodels.org/learn/statistics/survival-ipcw/). + +The section below discusses the practical aspects of moving to a binary outcome and the statistical implications. Before we start, though, let's define the various types of times that will be mentioned: + +- Observed time: time recorded in the data +- Event time: observed times for actual events +- Evaluation time: the time, specified by the analyst, that the model is evaluated. + +For example, we'll simulate some data using the methods of [Bender _et al_ (2005)](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Generating+survival+times+to+simulate+Cox+proportional+hazards+models.%22&btnG=) using the prodlim package. Training and validation sets are simulated. We'll also load the censored package so that we can fit a model to these time-to-event data: + +```{r} +#| label: data + +library(tidymodels) +library(censored) +library(prodlim) + +set.seed(5882) +sim_dat <- SimSurv(1000) %>% + mutate(event_time = Surv(time, event)) %>% + select(event_time, X1, X2) + +set.seed(2) +split <- initial_split(sim_dat) +sim_tr <- training(split) +sim_val <- testing(split) + +## Resampling object +sim_rs <- vfold_cv(sim_tr) +``` + +We'll need a model to illustrate the code and concepts. Let's fit a bagged survival tree model to the training set: + +```{r} +#| label: bag-tree-fit +bag_tree_spec <- + bag_tree() %>% + set_mode("censored regression") %>% + set_engine("rpart") + +set.seed(17) +bag_tree_fit <- + bag_tree_spec %>% + fit(event_time ~ ., data = sim_tr) + +bag_tree_fit +``` + +Using this model, we'll make predictions of different types. + +## Survival Probability Prediction + +This censored regression model can make static predictions via the predicted event time using `predict(object, type = "time")`. It can also create dynamic predictions regarding the probability of survival for each data point at specific times. The syntax for this is + +```r +predict(object, new_data, type = "survival", eval_time = numeric()) +``` + +where `eval_time` is a vector of time points at which we want the corresponding survivor function estimates. 
Alternatively, we can use the `augment()` function to get both types of prediction and automatically attach them to the data. + +The largest event time in the training set is `r round(max(sim_tr$event_time[,1]), 3)` so we will use a set of evaluation times between zero and 18. From there, we use `augment()`: + +```{r} +#| label: val-pred + +time_points <- seq(0, 18, by = 0.25) + +val_pred <- augment(bag_tree_fit, sim_val, eval_time = time_points) +val_pred +``` + +The `.pred` column contains the dynamic predictions in a list column. Since `length(time_points)` is `r length(time_points)`, each data frame in the list column has that many rows: + +```{r} +#| label: val-pred-dynamic + +val_pred$.pred[[1]] +``` + +The yardstick package currently has two dynamic metrics. Each is described below. + +## Brier Score + +The Brier score is a metric that can be used with both classification and event-time models. For classification models, it computes the squared error between the observed outcome (encoded as 0/1) and the corresponding predicted probability for the class. + +A little math: suppose that the value $y_{ik}$ is a 0/1 indicator for whether the observed outcome $i$ corresponds to class $k$, and $\hat{p}_{ik}$ is the estimated class probability. The score is then: + +$$ +Brier_{class} = \frac{1}{N}\sum_{i=1}^N\sum_{k=1}^C (y_{ik} - \hat{p}_{ik})^2 +$$ +There are only two classes for survival models: events and non-events^[Again, see the previous article for more detail on this.]. The survival function estimate is the probability that corresponds to non-events. For example, if were has not been an event at the current evaluation time, our best model should estimate the survival probability to be near one. For observations that are events, the probability estimate is just one minus the survivor estimate. + +As [previously described](https://www.tidymodels.org/learn/statistics/survival-ipcw/), we weight each observation using a technique similar to propensity weights in causal inference. We'll denote these weights for evaluation time `t` as $w_{it}$. Denoting the survival probability estimate as $\hat{p}_{it}$, the [time-dependent Brier score](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Assessment+and+Comparison+of+Prognostic+Classification+Schemes+for+Survival+Data.%22&btnG=) is: + +$$ +Brier_{surv}(t) = \frac{1}{N}\sum_{i=1}^N w_{it}\left[\underbrace{I(y_{i} = 0)(y_{i} - \hat{p}_{it})^2}_\text{non-events} + \underbrace{I(y_{i} = 1)(y_{i} - (1 - \hat{p}_{it}))^2}_\text{events}\right] +$$ + +For this score, a perfect model has a score of zero, while an uninformative model would have a score of around 1/4. + +How do we compute this using the yardstick package? The function [`brier_survival()`](https://yardstick.tidymodels.org/reference/brier_survival.html) follows the same convention as the other metric functions. The main arguments are: + +- `data`: the data frame with the predictions (structured as the output produced by `augment()`, as shown above). +- `truth`: the name of the column with the `Surv` object. +- `...`: the name of the column with the dynamic predictions. Within tidymodels, this column is always called `.pred`. In other words, `.pred` should be passed without an argument name. + +Since the evaluation times and the case weights are within the `.pred` column, there is no need to specify these. 
Here are the results of our validation set: + +```{r} +#| label: val-pred-brier + +brier_scores <- + val_pred %>% + brier_survival(truth = event_time, .pred) +brier_scores +``` + +Over time: + +```{r} +#| label: brier-scores +brier_scores %>% + ggplot(aes(.eval_time, .estimate)) + + geom_hline(yintercept = 1 / 4, col = "red", lty = 3) + + geom_line() + + geom_point() + + labs(x = "time", y = "Brier score") +``` + +There is also an _integrated_ Brier score. This required evaluation times as inputs but instead of returning each result, it takes the area under the plot shown above. The syntax is the same but the result has a single row: + +```{r} +#| label: val-pred-brier-int + +val_pred %>% brier_survival_integrated(truth = event_time, .pred) +``` + +Again, smaller values are better. + +## Receiver Operating Characteristic (ROC) Curves + +Given that we can convert our event time data into a binary representation at time `t,` it is possible to compute any classification statistics of interest. Sensitivity and specificity are two interesting quantities: + +- sensitivity: How well do we predict the events? This is analogous to the true positive rate. +- specificity: How well do we predict the non-events? One minus specificity is the false positive rate. + +As with classification, we don't know what threshold to use for the survival estimate to convert the data to a binary format. The default of 50% may not be appropriate. + +To look a little closer, we'll collect the dynamic predictions and convert them to binary factors. + +```{r} +time_as_binary_event <- function(surv, eval_time) { + event_time <- parsnip:::.extract_surv_time(surv) # TODO we should export these + status <- parsnip:::.extract_surv_status(surv) + is_event_before_t <- event_time <= eval_time & status == 1 + + # Three possible contributions to the statistic from Graf 1999 + # Censoring time before eval_time, no contribution (Graf category 3) + binary_res <- rep(NA_character_, length(event_time)) + + # A real event prior to eval_time (Graf category 1) + binary_res <- if_else(is_event_before_t, "event", binary_res) + + # Observed time greater than eval_time (Graf category 2) + binary_res <- if_else(event_time > eval_time, "non-event", binary_res) + factor(binary_res, levels = c("event", "non-event")) +} + + +# Unnest the list columns and convert the data to binary format +binary_encoding <- + val_pred %>% + select(.pred, event_time) %>% + add_rowindex() %>% + unnest(.pred) %>% + mutate( + obs_class = time_as_binary_event(event_time, .eval_time), + pred_class = if_else(.pred_survival >= 1 / 2, "non-event", "event"), + pred_class = factor(pred_class, levels = c("event", "non-event")), + ) +``` + +Let's take an evaluation time of 5.00 as an example. + +```{r} +#| label: data-at-5 +data_at_5 <- + binary_encoding %>% + filter(.eval_time == 5.00 & !is.na(.weight_censored)) %>% + select(.eval_time, .pred_survival, .weight_censored, obs_class, pred_class, event_time) +data_at_5 +``` + +What does the distribution of the survival probabilities for each of the actual classes look like? 
+ + +```{r} +#| label: surv-hist-05 +#| warning: false +#| out-width: 70% +#| fig-width: 7 +#| fig-height: 7 + +data_at_5 %>% + ggplot(aes(x = .pred_survival, weight = .weight_censored)) + + geom_vline(xintercept = 1 / 2, col = "blue", lty = 2) + + geom_histogram(col = "white", bins = 30) + + facet_wrap(~obs_class, ncol = 1) + + lims(x = 0:1) + + labs(x = "Probability of Survival", y = "Sum of Weights") +``` + + +```{r} +#| label: conf-mat-05-hide +#| include: false +cls_set <- metric_set(accuracy, sens, spec) +stats_05 <- + data_at_5 %>% + cls_set(truth = obs_class, + estimate = pred_class, + case_weights = .weight_censored) + +pred_05 <- augment(bag_tree_fit, sim_val, eval_time = 5) + +curve_05 <-pred_05 %>% roc_curve_survival(truth = event_time, .pred) +auc_05 <-pred_05 %>% roc_auc_survival(truth = event_time, .pred) +``` + +More probability values are to the right of the 50% cutoff for the true non-events. Conversely, true events tend to have smaller probabilities. Using this cutoff, the sensitivity would be `r round(stats_05$.estimate[2] * 100, 1)`% and the specificity would be `r round(stats_05$.estimate[3] * 100, 1)`%. There are other possible cutoffs for the survival probabilities. Maybe one of these would have better statistics. + +ROC curves were designed as a general method that, given a collection of continuous predictions, determines an effective threshold such that values above the threshold indicate a specific event. For our purposes, the ROC curve will compute the sensitivity and specificity for _all possible_ probability thresholds. It then plots the true positive rate versus the false positive rate. Generally, we use the area under the ROC curve to quantify it with a single number. Values near one indicate a perfect model, while values near 1/2 occur with non-informative models. + +[Blanche _et al_ (2013)](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Review+and+comparison+of+ROC+curve+estimators+for+a+time-dependent+outcome+with+marker-dependent+censoring%22&btnG=) gives a good overview of ROC curves for survival analysis and their Section 4.3 is most relevant here. + +For our example at `t = 5.00`, the ROC curve is: + +```{r} +#| label: roc-5 +#| echo: false +curve_05 %>% + ggplot(aes(1 - specificity, sensitivity)) + + geom_abline(col = "red", lty = 3) + + geom_step(direction = "vh") + + coord_equal() +``` + +The area under this curve is `r round(auc_05$.estimate[1], 3)`. + +Since this is a dynamic metric, we compute the AUC for each evaluation time. The syntax is the same as the Brier code shown above: + + +```{r} +#| label: val-pred-roc + +roc_scores <- + val_pred %>% + roc_auc_survival(truth = event_time, .pred) +roc_scores +``` + +Over time: + +```{r} +#| label: roc-scores +roc_scores %>% + ggplot(aes(.eval_time, .estimate)) + + geom_hline(yintercept = 1 / 2, col = "red", lty = 3) + + geom_line() + + geom_point() + + labs(x = "time", y = "ROC AUC") +``` + +The initial variation is due to so few events at the early stages of analysis. + +We should maximize the AUC. This metric measures the separation between classes and the Brier score focuses more on accurate and well-calibrated predictions. It should not be surprising that each metric's results over time differ. + +## Tuning these metrics + +Many of the event time models available in tidymodels have tuning parameters. The `tune_*()` functions and `fit_resamples()` have an `eval_time` argument used to pass the evaluation times. The statistics are computed for these time points using out-of-sample data. 
+ +In some cases, such as [iterative search](https://www.tmwr.org/iterative-search.html) or [racing methods](https://www.tmwr.org/grid-search.html#racing), the functions need a single value to optimize. If a dynamic metric is used to guide the optimization, _the first evaluation time given by the user_ will be used. + +For example, if a model for these data was being optimized, and we wanted a time of 5.00 to guide the process, we would wish to use that value as the first element of `time_points`. + + +## Summary + +tidymodels has two time-dependent metrics for characterizing the performance of event-time models: + +* The Brier score measures the distance between the observed class result and the predicted probabilities. +* ROC curves try to measure the separation between the two classes based on the survival probabilities. + + +## Session information + +```{r si, echo = FALSE} +small_session(pkgs) +``` + diff --git a/content/learn/statistics/survival-metrics/index.Rmarkdown.html b/content/learn/statistics/survival-metrics/index.Rmarkdown.html new file mode 100644 index 00000000..506fc2a9 --- /dev/null +++ b/content/learn/statistics/survival-metrics/index.Rmarkdown.html @@ -0,0 +1,322 @@ +--- +title: "Dynamic Performance Metrics for Event Time Data" +tags: [survival,censored,parsnip,yardstick] +categories: [statistical analysis] +type: learn-subsection +weight: 9 +description: | + Let's discuss how to compute modern performance metrics for time-to-event models. +--- + + + +

To use the code in this article, you will need to install the following packages: censored, prodlim, and tidymodels. You’ll need the development versions of censored and parsnip. To install these, use

+
library(pak)
+
+pak::pak(c("tidymodels/censored", "tidymodels/parsnip"))
+
+

Introduction

+

One trend in modern survival analysis is to compute time-dependent measures of performance. These are primarily driven by an increased focus on predictions for the probability of survival at a given time (as opposed to the predictions of event times or linear predictors). Since these are conditional on the time of evaluation, we call them dynamic performance metrics.

+

Before we delve into the details, it is important to understand the inverse probability of censoring weights (IPCW) scheme and computations. This is described in another article.

+

The section below discusses the practical aspects of moving to a binary outcome and the statistical implications. Before we start, though, let’s define the various types of times that will be mentioned:
- Observed time: time recorded in the data
- Event time: observed times for actual events
- Evaluation time: the time, specified by the analyst, that the model is evaluated.

+ +

For example, we’ll simulate some data with the prodlim package, using the methods of Bender et al (2005). Training and validation sets are simulated. We’ll also load the censored package so that we can fit a model to these time-to-event data:

+
library(tidymodels)
+library(censored)
+#> Loading required package: survival
+library(prodlim)
+
+set.seed(5882)
+sim_dat <- SimSurv(1000) %>%
+  mutate(event_time = Surv(time, event)) %>%
+  select(event_time, X1, X2)
+
+set.seed(2)
+split   <- initial_split(sim_dat)
+sim_tr  <- training(split)
+sim_val <- testing(split)
+
+## Resampling object
+sim_rs <- vfold_cv(sim_tr)
+

We’ll need a model to illustrate the code and concepts. Let’s fit a bagged survival tree model to the training set:

+
bag_tree_spec <- 
+  bag_tree() %>% 
+  set_mode("censored regression") %>% 
+  set_engine("rpart")
+
+set.seed(17)
+bag_tree_fit <- 
+  bag_tree_spec %>% 
+  fit(event_time ~ ., data = sim_tr)
+
+bag_tree_fit
+#> parsnip model object
+#> 
+#> 
+#> Bagging survival trees with 25 bootstrap replications 
+#> 
+#> Call: bagging.data.frame(formula = event_time ~ ., data = data)
+

Using this model, we’ll make predictions of different types.

+
+
+

Survival Probability Prediction

+

This censored regression model can make static predictions via the predicted event time using predict(object, type = "time"). It can also create dynamic predictions regarding the probability of survival for each data point at specific times. The syntax for this is

+
predict(object, new_data, type = "survival", eval_time = numeric())
+

where eval_time is a vector of time points at which we want the corresponding survivor function estimates. Alternatively, we can use the augment() function to get both types of prediction and automatically attach them to the data.
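As a quick sketch of both prediction types on a few validation-set rows (these calls use only the arguments documented above):

predict(bag_tree_fit, new_data = sim_val[1:3, ], type = "time")
predict(bag_tree_fit, new_data = sim_val[1:3, ], type = "survival", eval_time = c(1, 5, 10))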

+

The largest event time in the training set is 18.1, so we will use a set of evaluation times between zero and 18. From there, we use augment():

+
time_points <- seq(0, 18, by = 0.25)
+
+val_pred <- augment(bag_tree_fit, sim_val, eval_time = time_points)
+val_pred
+#> # A tibble: 250 × 5
+#>    .pred             .pred_time event_time    X1      X2
+#>    <list>                 <dbl>     <Surv> <dbl>   <dbl>
+#>  1 <tibble [73 × 5]>       9.48      5.78      1 -0.630 
+#>  2 <tibble [73 × 5]>       6.67      3.26+     0  0.792 
+#>  3 <tibble [73 × 5]>       3.82      2.34      1  0.811 
+#>  4 <tibble [73 × 5]>       8.53      7.45+     1 -0.271 
+#>  5 <tibble [73 × 5]>       7.07      8.05      0  0.315 
+#>  6 <tibble [73 × 5]>       7.24     14.09      0  0.264 
+#>  7 <tibble [73 × 5]>      10.6       5.23+     0 -0.532 
+#>  8 <tibble [73 × 5]>      12.3       3.18+     0 -1.41  
+#>  9 <tibble [73 × 5]>      11.0       1.86      1 -0.851 
+#> 10 <tibble [73 × 5]>       6.03      7.20      1 -0.0937
+#> # ℹ 240 more rows
+

The .pred column contains the dynamic predictions in a list column. Since length(time_points) is 73, each data frame in the list column has that many rows:

+
val_pred$.pred[[1]]
+#> # A tibble: 73 × 5
+#>    .eval_time .pred_survival .weight_time .pred_censored .weight_censored
+#>         <dbl>          <dbl>        <dbl>          <dbl>            <dbl>
+#>  1       0             1            0              1                 1   
+#>  2       0.25          0.996        0.250          1                 1   
+#>  3       0.5           0.993        0.500          0.997             1.00
+#>  4       0.75          0.993        0.750          0.995             1.01
+#>  5       1             0.989        1.00           0.992             1.01
+#>  6       1.25          0.984        1.25           0.985             1.02
+#>  7       1.5           0.975        1.50           0.978             1.02
+#>  8       1.75          0.974        1.75           0.969             1.03
+#>  9       2             0.964        2.00           0.956             1.05
+#> 10       2.25          0.949        2.25           0.944             1.06
+#> # ℹ 63 more rows
+

The yardstick package currently has two dynamic metrics. Each is described below.

+
+
+

Brier Score

+

The Brier score is a metric that can be used with both classification and event-time models. For classification models, it computes the squared error between the observed outcome (encoded as 0/1) and the corresponding predicted probability for the class.

+

A little math: suppose that the value \(y_{ik}\) is a 0/1 indicator for whether the observed outcome \(i\) corresponds to class \(k\), and \(\hat{p}_{ik}\) is the estimated class probability. The score is then:

+

\[ +Brier_{class} = \frac{1}{N}\sum_{i=1}^N\sum_{k=1}^C (y_{ik} - \hat{p}_{ik})^2 +\] +There are only two classes for survival models: events and non-events1. The survival function estimate is the probability that corresponds to non-events. For example, if there has not been an event at the current evaluation time, our best model should estimate the survival probability to be near one. For observations that are events, the probability estimate is just one minus the survivor estimate.

+

As previously described, we weight each observation using a technique similar to propensity weights in causal inference. We’ll denote these weights for evaluation time t as \(w_{it}\). Denoting the survival probability estimate as \(\hat{p}_{it}\), the time-dependent Brier score is:

+

\[ +Brier_{surv}(t) = \frac{1}{N}\sum_{i=1}^N w_{it}\left[\underbrace{I(y_{i} = 0)(y_{i} - \hat{p}_{it})^2}_\text{non-events} + \underbrace{I(y_{i} = 1)(y_{i} - (1 - \hat{p}_{it}))^2}_\text{events}\right] +\]

+

For this score, a perfect model has a score of zero, while an uninformative model would have a score of around 1/4.
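To make the weighting concrete, here is a small hand-rolled sketch with made-up values (hypothetical numbers, not taken from the simulated data above). It compares the binary event status at a single time t to the predicted event probability, i.e., one minus the survival probability:

# Toy values at one evaluation time t (for illustration only)
surv_prob <- c(0.90, 0.75, 0.30, 0.10)  # predicted probability of survival at t
is_event  <- c(0, 0, 1, 1)              # 1 = event observed by time t
weights   <- c(1.05, 1.10, 1.20, 1.00)  # IPCW case weights

pred_event <- 1 - surv_prob             # predicted probability of an event by t
mean(weights * (is_event - pred_event)^2)

A good model keeps this value small: non-events with high survival probabilities and events with low ones both contribute little.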

+

How do we compute this using the yardstick package? The function brier_survival() follows the same convention as the other metric functions. The main arguments are:
- data: the data frame with the predictions (structured as the output produced by augment(), as shown above).
- truth: the name of the column with the Surv object.
- ...: the name of the column with the dynamic predictions. Within tidymodels, this column is always called .pred. In other words, .pred should be passed without an argument name.

+ +

Since the evaluation times and the case weights are within the .pred column, there is no need to specify these. Here are the results of our validation set:

+
brier_scores <-
+  val_pred %>% 
+  brier_survival(truth = event_time, .pred)
+brier_scores
+#> # A tibble: 73 × 4
+#>    .metric        .estimator .eval_time .estimate
+#>    <chr>          <chr>           <dbl>     <dbl>
+#>  1 brier_survival standard         0    0        
+#>  2 brier_survival standard         0.25 0.0000270
+#>  3 brier_survival standard         0.5  0.00418  
+#>  4 brier_survival standard         0.75 0.0121   
+#>  5 brier_survival standard         1    0.0243   
+#>  6 brier_survival standard         1.25 0.0431   
+#>  7 brier_survival standard         1.5  0.0521   
+#>  8 brier_survival standard         1.75 0.0677   
+#>  9 brier_survival standard         2    0.0857   
+#> 10 brier_survival standard         2.25 0.100    
+#> # ℹ 63 more rows
+

Over time:

+
brier_scores %>% 
+  ggplot(aes(.eval_time, .estimate)) + 
+  geom_hline(yintercept = 1 / 4, col = "red", lty = 3) +
+  geom_line() +
+  geom_point() + 
+  labs(x = "time", y = "Brier score")
+

+

There is also an integrated Brier score. This requires evaluation times as inputs but, instead of returning each individual result, it computes the area under the curve shown above. The syntax is the same, but the result has a single row:

+
val_pred %>% brier_survival_integrated(truth = event_time, .pred)
+#> # A tibble: 1 × 3
+#>   .metric                   .estimator .estimate
+#>   <chr>                     <chr>          <dbl>
+#> 1 brier_survival_integrated standard       0.121
+

Again, smaller values are better.
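As a rough check, the sketch below computes the trapezoidal area under the Brier curve and rescales it by the width of the evaluation-time window; the rescaling is an assumption about the convention used, so the value may not match brier_survival_integrated() exactly:

with(
  brier_scores,
  sum(diff(.eval_time) * (head(.estimate, -1) + tail(.estimate, -1)) / 2) /
    diff(range(.eval_time))
)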

+
+
+

Receiver Operating Characteristic (ROC) Curves

+

Given that we can convert our event time data into a binary representation at time t, it is possible to compute any classification statistics of interest. Sensitivity and specificity are two interesting quantities:
- sensitivity: How well do we predict the events? This is analogous to the true positive rate.
- specificity: How well do we predict the non-events? One minus specificity is the false positive rate.

+ +

As with classification, we don’t know what threshold to use for the survival estimate to convert the data to a binary format. The default of 50% may not be appropriate.

+

To look a little closer, we’ll collect the dynamic predictions and convert them to binary factors.

+
time_as_binary_event <- function(surv, eval_time) {
+  event_time <- parsnip:::.extract_surv_time(surv) # TODO we should export these
+  status <- parsnip:::.extract_surv_status(surv)
+  is_event_before_t <- event_time <= eval_time & status == 1
+
+  # Three possible contributions to the statistic from Graf 1999
+  # Censoring time before eval_time, no contribution (Graf category 3)
+  binary_res <- rep(NA_character_, length(event_time))
+
+  # A real event prior to eval_time (Graf category 1)
+  binary_res <- if_else(is_event_before_t, "event", binary_res)
+
+  # Observed time greater than eval_time (Graf category 2)
+  binary_res <- if_else(event_time > eval_time, "non-event", binary_res)
+  factor(binary_res, levels = c("event", "non-event"))
+}
+
+
+# Unnest the list columns and convert the data to binary format 
+binary_encoding <- 
+  val_pred %>% 
+  select(.pred, event_time) %>% 
+  add_rowindex() %>% 
+  unnest(.pred) %>% 
+  mutate(
+    obs_class = time_as_binary_event(event_time, .eval_time),
+    pred_class = if_else(.pred_survival >= 1 / 2, "non-event", "event"),
+    pred_class = factor(pred_class, levels = c("event", "non-event")),
+  )
+

Let’s take an evaluation time of 5.00 as an example.

+
data_at_5 <- 
+  binary_encoding %>% 
+  filter(.eval_time == 5.00 & !is.na(.weight_censored)) %>% 
+  select(.eval_time, .pred_survival, .weight_censored, obs_class, pred_class, event_time)
+data_at_5
+#> # A tibble: 205 × 6
+#>    .eval_time .pred_survival .weight_censored obs_class pred_class event_time
+#>         <dbl>          <dbl>            <dbl> <fct>     <fct>          <Surv>
+#>  1          5         0.835              1.28 non-event non-event       5.78 
+#>  2          5         0.327              1.06 event     event           2.34 
+#>  3          5         0.812              1.28 non-event non-event       7.45+
+#>  4          5         0.751              1.28 non-event non-event       8.05 
+#>  5          5         0.755              1.28 non-event non-event      14.09 
+#>  6          5         0.854              1.28 non-event non-event       5.23+
+#>  7          5         0.868              1.04 event     non-event       1.86 
+#>  8          5         0.664              1.28 non-event non-event       7.20 
+#>  9          5         0.0436             1.19 event     event           3.95 
+#> 10          5         0.375              1.28 non-event event           5.18+
+#> # ℹ 195 more rows
+

What does the distribution of the survival probabilities for each of the actual classes look like?

+
data_at_5 %>% 
+  ggplot(aes(x = .pred_survival, weight = .weight_censored)) + 
+  geom_vline(xintercept = 1 / 2, col = "blue", lty = 2) +
+  geom_histogram(col = "white", bins = 30) + 
+  facet_wrap(~obs_class, ncol = 1) +
+  lims(x = 0:1) +
+  labs(x = "Probability of Survival", y = "Sum of Weights")
+

+

More probability values are to the right of the 50% cutoff for the true non-events. Conversely, true events tend to have smaller probabilities. Using this cutoff, the sensitivity would be 70.3% and the specificity would be 83%. There are other possible cutoffs for the survival probabilities. Maybe one of these would have better statistics.
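The quoted sensitivity and specificity come from treating the time-5 data as a weighted two-class problem. One way to reproduce them is with yardstick classification metrics, using the censoring weights as case weights:

cls_set <- metric_set(accuracy, sens, spec)
data_at_5 %>%
  cls_set(truth = obs_class, estimate = pred_class, case_weights = .weight_censored)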

+

ROC curves were designed as a general method that, given a collection of continuous predictions, determines an effective threshold such that values above the threshold indicate a specific event. For our purposes, the ROC curve will compute the sensitivity and specificity for all possible probability thresholds. It then plots the true positive rate versus the false positive rate. Generally, we summarize the curve with a single number: the area under the ROC curve. Values near one indicate a perfect model, while values near 1/2 occur with non-informative models.

+

Blanche et al (2013) give a good overview of ROC curves for survival analysis; their Section 4.3 is most relevant here.

+

For our example at evaluation time t = 5.00, the ROC curve is shown below.
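It can be reproduced along these lines (mirroring a chunk that is hidden in the source document):

pred_05  <- augment(bag_tree_fit, sim_val, eval_time = 5)
curve_05 <- pred_05 %>% roc_curve_survival(truth = event_time, .pred)

curve_05 %>%
  ggplot(aes(1 - specificity, sensitivity)) +
  geom_abline(col = "red", lty = 3) +
  geom_step(direction = "vh") +
  coord_equal()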

+

+

The area under this curve is 0.838.
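That value comes from the AUC version of the metric at the single evaluation time (again mirroring the hidden chunk):

pred_05 %>% roc_auc_survival(truth = event_time, .pred)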

+

Since this is a dynamic metric, we compute the AUC for each evaluation time. The syntax is the same as the Brier code shown above:

+
roc_scores <-
+  val_pred %>% 
+  roc_auc_survival(truth = event_time, .pred)
+roc_scores
+#> # A tibble: 73 × 4
+#>    .metric          .estimator .eval_time .estimate
+#>    <chr>            <chr>           <dbl>     <dbl>
+#>  1 roc_auc_survival standard         0       0.5   
+#>  2 roc_auc_survival standard         0.25    0.5   
+#>  3 roc_auc_survival standard         0.5     0.0948
+#>  4 roc_auc_survival standard         0.75    0.851 
+#>  5 roc_auc_survival standard         1       0.697 
+#>  6 roc_auc_survival standard         1.25    0.697 
+#>  7 roc_auc_survival standard         1.5     0.719 
+#>  8 roc_auc_survival standard         1.75    0.757 
+#>  9 roc_auc_survival standard         2       0.745 
+#> 10 roc_auc_survival standard         2.25    0.765 
+#> # ℹ 63 more rows
+

Over time:

+
roc_scores %>% 
+  ggplot(aes(.eval_time, .estimate)) + 
+  geom_hline(yintercept = 1 / 2, col = "red", lty = 3) +
+  geom_line() +
+  geom_point() + 
+  labs(x = "time", y = "ROC AUC")
+

+

The initial variation occurs because there are so few events in the early stages of the analysis.
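One way to see this is to count, in the binary_encoding data from above, how many observations qualify as events at each evaluation time:

binary_encoding %>%
  filter(obs_class == "event") %>%
  count(.eval_time, name = "n_events")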

+

We want to maximize the ROC AUC. This metric measures the separation between classes, while the Brier score focuses more on accurate and well-calibrated predictions. It should not be surprising that each metric’s results differ over time.

+
+
+

Tuning these metrics

+

Many of the event time models available in tidymodels have tuning parameters. The tune_*() functions and fit_resamples() have an eval_time argument used to pass the evaluation times. The statistics are computed for these time points using out-of-sample data.

+

In some cases, such as iterative search or racing methods, the functions need a single value to optimize. If a dynamic metric is used to guide the optimization, the first evaluation time given by the user will be used.

+

For example, if a model for these data were being optimized and we wanted a time of 5.00 to guide the process, we would use that value as the first element of time_points.
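As a hypothetical sketch (the model, grid, and metric choices below are illustrative and not from the original article; only the eval_time behavior is documented above):

# Tune a survival decision tree, putting t = 5.00 first so that it drives
# any single-number optimization of the dynamic metric
tree_res <-
  decision_tree(cost_complexity = tune()) %>%
  set_mode("censored regression") %>%
  set_engine("rpart") %>%
  tune_grid(
    event_time ~ .,
    resamples = sim_rs,
    grid = 10,
    metrics = metric_set(brier_survival),
    eval_time = c(5, setdiff(time_points, 5))
  )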

+
+
+

Summary

+

tidymodels has two time-dependent metrics for characterizing the performance of event-time models:
- The Brier score measures the distance between the observed class result and the predicted probabilities.
- ROC curves try to measure the separation between the two classes based on the survival probabilities.

+ +
+
+

Session information

+
#> ─ Session info ─────────────────────────────────────────────────────
+#>  setting  value
+#>  version  R version 4.2.0 (2022-04-22)
+#>  os       macOS Big Sur/Monterey 10.16
+#>  system   x86_64, darwin17.0
+#>  ui       X11
+#>  language (EN)
+#>  collate  en_US.UTF-8
+#>  ctype    en_US.UTF-8
+#>  tz       America/New_York
+#>  date     2023-05-16
+#>  pandoc   3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
+#> 
+#> ─ Packages ─────────────────────────────────────────────────────────
+#>  package    * version    date (UTC) lib source
+#>  broom      * 1.0.4      2023-03-11 [1] CRAN (R 4.2.0)
+#>  censored   * 0.2.0.9000 2023-05-16 [1] Github (tidymodels/censored@f9eccb6)
+#>  dials      * 1.2.0      2023-04-03 [1] CRAN (R 4.2.0)
+#>  dplyr      * 1.1.2      2023-04-20 [1] CRAN (R 4.2.0)
+#>  ggplot2    * 3.4.2      2023-04-03 [1] CRAN (R 4.2.0)
+#>  infer      * 1.0.4      2022-12-02 [1] CRAN (R 4.2.0)
+#>  parsnip    * 1.1.0.9001 2023-05-16 [1] Github (tidymodels/parsnip@ab42409)
+#>  prodlim    * 2023.03.31 2023-04-02 [1] CRAN (R 4.2.0)
+#>  purrr      * 1.0.1      2023-01-10 [1] CRAN (R 4.2.0)
+#>  recipes    * 1.0.6      2023-04-25 [1] CRAN (R 4.2.0)
+#>  rlang        1.1.1      2023-04-28 [1] CRAN (R 4.2.0)
+#>  rsample    * 1.1.1      2022-12-07 [1] CRAN (R 4.2.0)
+#>  tibble     * 3.2.1      2023-03-20 [1] CRAN (R 4.2.0)
+#>  tidymodels * 1.1.0      2023-05-01 [1] CRAN (R 4.2.0)
+#>  tune       * 1.1.1.9001 2023-05-16 [1] Github (tidymodels/tune@fdc0016)
+#>  workflows  * 1.1.3      2023-02-22 [1] CRAN (R 4.2.0)
+#>  yardstick  * 1.2.0      2023-04-21 [1] CRAN (R 4.2.0)
+#> 
+#>  [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
+#> 
+#> ────────────────────────────────────────────────────────────────────
+
+
+
+
    +
  1. Again, see the previous article for more detail on this.↩︎

+
From 52312ec3062e2b37e2b58bad970d24744885ec19 Mon Sep 17 00:00:00 2001
From: Hannah Frick
Date: Mon, 22 May 2023 10:29:02 +0100
Subject: [PATCH 2/6] typos

---
 content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd b/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd
index 8bc8ff79..b2faddbf 100644
--- a/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd
+++ b/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd
@@ -31,11 +31,11 @@ pak::pak(c("tidymodels/censored", "tidymodels/parsnip"))
 ## Introduction
 
-One trend in modern survival analysis is to compute time-dependent measures of performance. These are primarily driven by an increased focus on predictions for the probability of survival at a given time (as opposed to the predictions of event times or linear predictors). Since these are conditional on the time of evaluation, we call the dynamic performance metrics.
+One trend in modern survival analysis is to compute time-dependent measures of performance. These are primarily driven by an increased focus on predictions for the probability of survival at a given time (as opposed to the predictions of event times or linear predictors). Since these are conditional on the time of evaluation, we call them dynamic performance metrics.
 
 Before we delve into the details, it is important to understand the inverse probability of censoring weights (IPCW) scheme and computations. This is described in [another article](https://www.tidymodels.org/learn/statistics/survival-ipcw/).
 
-The section below discusses the practical aspects of moving to a binary outcome and the statistical implications. Before we start, though, let's define the various types of times that will be mentioned:
+The sections below discuss the practical aspects of moving to a binary outcome and the statistical implications. Before we start, though, let's define the various types of times that will be mentioned:
 - Observed time: time recorded in the data
 - Event time: observed times for actual events

From 04e0376e447bc05609d092c3f13d6cc120f905f7 Mon Sep 17 00:00:00 2001
From: Hannah Frick
Date: Wed, 7 Jun 2023 12:27:10 +0100
Subject: [PATCH 3/6] edits

---
 .../survival-metrics/figs/brier-scores-1.svg  | 187 +++++++------
 .../survival-metrics/figs/roc-5-1.svg         |   2 +-
 .../survival-metrics/figs/roc-scores-1.svg    | 245 ++++++++++--------
 .../survival-metrics/figs/surv-hist-05-1.svg  | 152 +++++------
 .../survival-metrics/index.Rmarkdown.Rmd      |  82 +++---
 .../survival-metrics/index.Rmarkdown.html     | 228 ++++++++--------
 6 files changed, 461 insertions(+), 435 deletions(-)

diff --git a/content/learn/statistics/survival-metrics/figs/brier-scores-1.svg b/content/learn/statistics/survival-metrics/figs/brier-scores-1.svg
index aa7e2698..1693b1e4 100644
--- a/content/learn/statistics/survival-metrics/figs/brier-scores-1.svg
+++ b/content/learn/statistics/survival-metrics/figs/brier-scores-1.svg
[SVG diff omitted: Brier score plot re-rendered; the time axis now runs 0-20]

diff --git a/content/learn/statistics/survival-metrics/figs/roc-5-1.svg b/content/learn/statistics/survival-metrics/figs/roc-5-1.svg
index d8ac4fc8..c85ee4e0 100644
--- a/content/learn/statistics/survival-metrics/figs/roc-5-1.svg
+++ b/content/learn/statistics/survival-metrics/figs/roc-5-1.svg
[SVG diff omitted: one-line change to the ROC curve path]

diff --git a/content/learn/statistics/survival-metrics/figs/roc-scores-1.svg b/content/learn/statistics/survival-metrics/figs/roc-scores-1.svg
index 7f2614a3..06015ad7 100644
--- a/content/learn/statistics/survival-metrics/figs/roc-scores-1.svg
+++ b/content/learn/statistics/survival-metrics/figs/roc-scores-1.svg
[SVG diff omitted: ROC AUC plot re-rendered; the y axis now runs 0.5-0.9 and the time axis 0-20]

diff --git a/content/learn/statistics/survival-metrics/figs/surv-hist-05-1.svg b/content/learn/statistics/survival-metrics/figs/surv-hist-05-1.svg
index 8454ca8b..5aa4e8ab 100644
--- a/content/learn/statistics/survival-metrics/figs/surv-hist-05-1.svg
+++ b/content/learn/statistics/survival-metrics/figs/surv-hist-05-1.svg
[SVG diff omitted: histogram re-rendered; the y axis now runs 0-75 and the axis labels become "probability of survival" and "sum of weights"]

diff
--git a/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd b/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd index b2faddbf..f41e3581 100644 --- a/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd +++ b/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd @@ -23,25 +23,27 @@ options(pillar.advice = FALSE, pillar.min_title_chars = Inf) `r req_pkgs(pkgs)` You'll need the development versions of censored and parsnip. To install these, use ```r -library(pak) +#install.packages("pak") pak::pak(c("tidymodels/censored", "tidymodels/parsnip")) ``` - ## Introduction - One trend in modern survival analysis is to compute time-dependent measures of performance. These are primarily driven by an increased focus on predictions for the probability of survival at a given time (as opposed to the predictions of event times or linear predictors). Since these are conditional on the time of evaluation, we call them dynamic performance metrics. -Before we delve into the details, it is important to understand the inverse probability of censoring weights (IPCW) scheme and computations. This is described in [another article](https://www.tidymodels.org/learn/statistics/survival-ipcw/). +Many dynamic metrics are similar to those used for binary classification models. The basic idea is that, for a given time $t$ for model evaluation, we try to encode the observed event time data into a binary "has there been an event at time $t$?" version. We can also convert the predicted survival probabilities into predicted events/non-events based on a threshold (default is 0.50). The survival versions of these metrics need those binary versions of observed truth and predictions as well as a way to account for censoring. + +Censoring plays into the details of the conversion and is additionally captured in the form of weights. For details on both these aspects, see the [Accounting for Censoring in Performance Metrics for Event Time Data](FIXME) article. -The sections below discusses the practical aspects of moving to a binary outcome and the statistical implications. Before we start, though, let's define the various types of times that will be mentioned: +To start, let's define the various types of times that will be mentioned: - Observed time: time recorded in the data - Event time: observed times for actual events - Evaluation time: the time, specified by the analyst, that the model is evaluated. -For example, we'll simulate some data using the methods of [Bender _et al_ (2005)](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Generating+survival+times+to+simulate+Cox+proportional+hazards+models.%22&btnG=) using the prodlim package. Training and validation sets are simulated. We'll also load the censored package so that we can fit a model to these time-to-event data: +## Example data + +As an example, we'll simulate some data with the prodlim package, using the methods of [Bender _et al_ (2005)](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Generating+survival+times+to+simulate+Cox+proportional+hazards+models.%22&btnG=). A training and a validation set are simulated. We'll also load the censored package so that we can fit a model to these time-to-event data: ```{r} #| label: data @@ -51,7 +53,7 @@ library(censored) library(prodlim) set.seed(5882) -sim_dat <- SimSurv(1000) %>% +sim_dat <- SimSurv(2000) %>% mutate(event_time = Surv(time, event)) %>% select(event_time, X1, X2) @@ -68,16 +70,13 @@ We'll need a model to illustrate the code and concepts. 
Let's fit a bagged survi ```{r} #| label: bag-tree-fit -bag_tree_spec <- - bag_tree() %>% - set_mode("censored regression") %>% - set_engine("rpart") set.seed(17) bag_tree_fit <- - bag_tree_spec %>% + bag_tree() %>% + set_mode("censored regression") %>% + set_engine("rpart") %>% fit(event_time ~ ., data = sim_tr) - bag_tree_fit ``` @@ -93,18 +92,18 @@ predict(object, new_data, type = "survival", eval_time = numeric()) where `eval_time` is a vector of time points at which we want the corresponding survivor function estimates. Alternatively, we can use the `augment()` function to get both types of prediction and automatically attach them to the data. -The largest event time in the training set is `r round(max(sim_tr$event_time[,1]), 3)` so we will use a set of evaluation times between zero and 18. From there, we use `augment()`: +The largest event time in the training set is `r round(max(sim_tr$event_time[,1]), 3)` so we will use a set of evaluation times between zero and 21. ```{r} #| label: val-pred -time_points <- seq(0, 18, by = 0.25) +time_points <- seq(0, 21, by = 0.25) val_pred <- augment(bag_tree_fit, sim_val, eval_time = time_points) val_pred ``` -The `.pred` column contains the dynamic predictions in a list column. Since `length(time_points)` is `r length(time_points)`, each data frame in the list column has that many rows: +The observed data are in the `event_time` column. The predicted survival probabilities are in the `.pred` column. This is a list column with a data frame for each observation, containing the predictions at the `r length(time_points)` evaluation time points in the (nested) column `.pred_survival`. ```{r} #| label: val-pred-dynamic @@ -118,17 +117,16 @@ The yardstick package currently has two dynamic metrics. Each is described below The Brier score is a metric that can be used with both classification and event-time models. For classification models, it computes the squared error between the observed outcome (encoded as 0/1) and the corresponding predicted probability for the class. -A little math: suppose that the value $y_{ik}$ is a 0/1 indicator for whether the observed outcome $i$ corresponds to class $k$, and $\hat{p}_{ik}$ is the estimated class probability. The score is then: +A little math: suppose that the value $y_{ik}$ is a 0/1 indicator for whether the observed outcome $i$ corresponds to class $k$, and $\hat{p}_{ik}$ is the estimated class probability. The classification score is then: $$ Brier_{class} = \frac{1}{N}\sum_{i=1}^N\sum_{k=1}^C (y_{ik} - \hat{p}_{ik})^2 $$ -There are only two classes for survival models: events and non-events^[Again, see the previous article for more detail on this.]. The survival function estimate is the probability that corresponds to non-events. For example, if were has not been an event at the current evaluation time, our best model should estimate the survival probability to be near one. For observations that are events, the probability estimate is just one minus the survivor estimate. -As [previously described](https://www.tidymodels.org/learn/statistics/survival-ipcw/), we weight each observation using a technique similar to propensity weights in causal inference. We'll denote these weights for evaluation time `t` as $w_{it}$. 
Denoting the survival probability estimate as $\hat{p}_{it}$, the [time-dependent Brier score](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Assessment+and+Comparison+of+Prognostic+Classification+Schemes+for+Survival+Data.%22&btnG=) is:
+For survival models, we transform the event time data into a binary version $y_{it}$: is there an event at evaluation time $t$^[Again, see the [Accounting for Censoring in Performance Metrics for Event Time Data](FIXME) article for more on this.]. The survival function estimate $\hat{p}_{it}$ is the probability that corresponds to non-events at time $t$. For example, if there has not been an event at the current evaluation time, our best model should estimate the survival probability to be near one. For observations that are events, the probability estimate is just one minus the survivor estimate. To account for censoring, we also weight each observation with $w_{it}$. The [time-dependent Brier score](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Assessment+and+Comparison+of+Prognostic+Classification+Schemes+for+Survival+Data.%22&btnG=) is:
 
 $$
-Brier_{surv}(t) = \frac{1}{N}\sum_{i=1}^N w_{it}\left[\underbrace{I(y_{i} = 0)(y_{i} - \hat{p}_{it})^2}_\text{non-events} + \underbrace{I(y_{i} = 1)(y_{i} - (1 - \hat{p}_{it}))^2}_\text{events}\right]
+Brier_{surv}(t) = \frac{1}{N}\sum_{i=1}^N w_{it}\left[\underbrace{I(y_{it} = 0)(y_{it} - \hat{p}_{it})^2}_\text{non-events} + \underbrace{I(y_{it} = 1)(y_{it} - (1 - \hat{p}_{it}))^2}_\text{events}\right]
 $$
 
 For this score, a perfect model has a score of zero, while an uninformative model would have a score of around 1/4.
@@ -174,16 +172,16 @@ Again, smaller values are better.
 
 ## Receiver Operating Characteristic (ROC) Curves
 
-Given that we can convert our event time data into a binary representation at time `t,` it is possible to compute any classification statistics of interest. Sensitivity and specificity are two interesting quantities:
+When we not only turn the event time data into a binary representation but also the predicted probabilities, we are in well-charted classification metrics territory. Sensitivity and specificity are common quantities to compute; we do so here in their weighted versions to account for censoring:
 
-- sensitivity: How well do we predict the events? This is analogous to the true positive rate.
-- specificity: How well do we predict the non-events? One minus specificity is the false positive rate.
+- Sensitivity: How well do we predict the events? This is analogous to the true positive rate.
+- Specificity: How well do we predict the non-events? One minus specificity is the false positive rate.
 
-As with classification, we don't know what threshold to use for the survival estimate to convert the data to a binary format. The default of 50% may not be appropriate.
-
-To look a little closer, we'll collect the dynamic predictions and convert them to binary factors.
+Let's take a look at our example data and use an evaluation time of 5.00!
```{r} +#| label: data-at-5 + time_as_binary_event <- function(surv, eval_time) { event_time <- parsnip:::.extract_surv_time(surv) # TODO we should export these status <- parsnip:::.extract_surv_status(surv) @@ -201,8 +199,7 @@ time_as_binary_event <- function(surv, eval_time) { factor(binary_res, levels = c("event", "non-event")) } - -# Unnest the list columns and convert the data to binary format +# Unnest the list columns and convert the event time data to binary format binary_encoding <- val_pred %>% select(.pred, event_time) %>% @@ -211,14 +208,9 @@ binary_encoding <- mutate( obs_class = time_as_binary_event(event_time, .eval_time), pred_class = if_else(.pred_survival >= 1 / 2, "non-event", "event"), - pred_class = factor(pred_class, levels = c("event", "non-event")), + pred_class = factor(pred_class, levels = c("event", "non-event")) ) -``` -Let's take an evaluation time of 5.00 as an example. - -```{r} -#| label: data-at-5 data_at_5 <- binary_encoding %>% filter(.eval_time == 5.00 & !is.na(.weight_censored)) %>% @@ -226,8 +218,7 @@ data_at_5 <- data_at_5 ``` -What does the distribution of the survival probabilities for each of the actual classes look like? - +What does the distribution of the survival probabilities for each of the actual classes look like? ```{r} #| label: surv-hist-05 @@ -242,7 +233,7 @@ data_at_5 %>% geom_histogram(col = "white", bins = 30) + facet_wrap(~obs_class, ncol = 1) + lims(x = 0:1) + - labs(x = "Probability of Survival", y = "Sum of Weights") + labs(x = "probability of survival", y = "sum of weights") ``` @@ -252,14 +243,18 @@ data_at_5 %>% cls_set <- metric_set(accuracy, sens, spec) stats_05 <- data_at_5 %>% + mutate( + pred_class = if_else(.pred_survival >= 1 / 2, "non-event", "event"), + pred_class = factor(pred_class, levels = c("event", "non-event")) + ) %>% cls_set(truth = obs_class, estimate = pred_class, case_weights = .weight_censored) pred_05 <- augment(bag_tree_fit, sim_val, eval_time = 5) -curve_05 <-pred_05 %>% roc_curve_survival(truth = event_time, .pred) -auc_05 <-pred_05 %>% roc_auc_survival(truth = event_time, .pred) +curve_05 <- pred_05 %>% roc_curve_survival(truth = event_time, .pred) +auc_05 <- pred_05 %>% roc_auc_survival(truth = event_time, .pred) ``` More probability values are to the right of the 50% cutoff for the true non-events. Conversely, true events tend to have smaller probabilities. Using this cutoff, the sensitivity would be `r round(stats_05$.estimate[2] * 100, 1)`% and the specificity would be `r round(stats_05$.estimate[3] * 100, 1)`%. There are other possible cutoffs for the survival probabilities. Maybe one of these would have better statistics. @@ -268,7 +263,7 @@ ROC curves were designed as a general method that, given a collection of continu [Blanche _et al_ (2013)](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Review+and+comparison+of+ROC+curve+estimators+for+a+time-dependent+outcome+with+marker-dependent+censoring%22&btnG=) gives a good overview of ROC curves for survival analysis and their Section 4.3 is most relevant here. -For our example at `t = 5.00`, the ROC curve is: +For our example at evaluation time $t = 5.00$, the ROC curve is: ```{r} #| label: roc-5 @@ -282,8 +277,7 @@ curve_05 %>% The area under this curve is `r round(auc_05$.estimate[1], 3)`. -Since this is a dynamic metric, we compute the AUC for each evaluation time. The syntax is the same as the Brier code shown above: - +Since this is a dynamic metric, we compute the AUC for each evaluation time. 
The syntax is very similar to the Brier code shown above:
 
 ```{r}
 #| label: val-pred-roc
@@ -308,7 +302,7 @@ roc_scores %>%
 
 The initial variation is due to so few events at the early stages of analysis.
 
-We should maximize the AUC. This metric measures the separation between classes and the Brier score focuses more on accurate and well-calibrated predictions. It should not be surprising that each metric's results over time differ.
+The ROC curve measures the separation between classes, while the Brier score focuses more on accurate and well-calibrated predictions. It should not be surprising that each metric's results over time differ.
 
 ## Tuning these metrics
 
@@ -316,7 +310,7 @@ Many of the event time models available in tidymodels have tuning parameters. Th
 
 In some cases, such as [iterative search](https://www.tmwr.org/iterative-search.html) or [racing methods](https://www.tmwr.org/grid-search.html#racing), the functions need a single value to optimize. If a dynamic metric is used to guide the optimization, _the first evaluation time given by the user_ will be used.
 
-For example, if a model for these data was being optimized, and we wanted a time of 5.00 to guide the process, we would wish to use that value as the first element of `time_points`.
+For example, if a model for these data was being optimized, and we wanted a time of 5.00 to guide the process, we would need to use that value of 5.00 as the first element of `time_points`, the vector given to the `eval_time` argument in our example above.
 
 ## Summary
 
diff --git a/content/learn/statistics/survival-metrics/index.Rmarkdown.html b/content/learn/statistics/survival-metrics/index.Rmarkdown.html
index 506fc2a9..6d4b8692 100644
--- a/content/learn/statistics/survival-metrics/index.Rmarkdown.html
+++ b/content/learn/statistics/survival-metrics/index.Rmarkdown.html
@@ -11,27 +11,31 @@

To use the code in this article, you will need to install the following packages: censored, prodlim, and tidymodels. You’ll need the development versions of censored and parsnip. To install these, use

-
library(pak)
+
#install.packages("pak")
 
 pak::pak(c("tidymodels/censored", "tidymodels/parsnip"))

Introduction

-

One trend in modern survival analysis is to compute time-dependent measures of performance. These are primarily driven by an increased focus on predictions for the probability of survival at a given time (as opposed to the predictions of event times or linear predictors). Since these are conditional on the time of evaluation, we call the dynamic performance metrics.

-

Before we delve into the details, it is important to understand the inverse probability of censoring weights (IPCW) scheme and computations. This is described in another article.

-

The section below discusses the practical aspects of moving to a binary outcome and the statistical implications. Before we start, though, let’s define the various types of times that will be mentioned:

+

One trend in modern survival analysis is to compute time-dependent measures of performance. These are primarily driven by an increased focus on predictions for the probability of survival at a given time (as opposed to the predictions of event times or linear predictors). Since these are conditional on the time of evaluation, we call them dynamic performance metrics.

+

Many dynamic metrics are similar to those used for binary classification models. The basic idea is that, for a given time \(t\) for model evaluation, we try to encode the observed event time data into a binary “has there been an event at time \(t\)?” version. We can also convert the predicted survival probabilities into predicted events/non-events based on a threshold (default is 0.50). The survival versions of these metrics need those binary versions of observed truth and predictions as well as a way to account for censoring.

+

Censoring plays into the details of the conversion and is additionally captured in the form of weights. For details on both these aspects, see the Accounting for Censoring in Performance Metrics for Event Time Data article.

+

To start, let’s define the various types of times that will be mentioned:

  • Observed time: time recorded in the data
  • Event time: observed times for actual events
  • Evaluation time: the time, specified by the analyst, that the model is evaluated.
-

For example, we’ll simulate some data using the methods of Bender et al (2005) using the prodlim package. Training and validation sets are simulated. We’ll also load the censored package so that we can fit a model to these time-to-event data:

+
+
+

Example data

+

As an example, we’ll simulate some data with the prodlim package, using the methods of Bender et al (2005). A training and a validation set are simulated. We’ll also load the censored package so that we can fit a model to these time-to-event data:

library(tidymodels)
 library(censored)
 #> Loading required package: survival
 library(prodlim)
 
 set.seed(5882)
-sim_dat <- SimSurv(1000) %>%
+sim_dat <- SimSurv(2000) %>%
   mutate(event_time = Surv(time, event)) %>%
   select(event_time, X1, X2)
 
@@ -43,16 +47,12 @@ 

Introduction

## Resampling object sim_rs <- vfold_cv(sim_tr)

We’ll need a model to illustrate the code and concepts. Let’s fit a bagged survival tree model to the training set:

-
bag_tree_spec <- 
+
set.seed(17)
+bag_tree_fit <- 
   bag_tree() %>% 
   set_mode("censored regression") %>% 
-  set_engine("rpart")
-
-set.seed(17)
-bag_tree_fit <- 
-  bag_tree_spec %>% 
+  set_engine("rpart") %>% 
   fit(event_time ~ ., data = sim_tr)
-
 bag_tree_fit
 #> parsnip model object
 #> 
@@ -67,54 +67,53 @@ 

Survival Probability Prediction

This censored regression model can make static predictions via the predicted event time using predict(object, type = "time"). It can also create dynamic predictions regarding the probability of survival for each data point at specific times. The syntax for this is

predict(object, new_data, type = "survival", eval_time = numeric())

where eval_time is a vector of time points at which we want the corresponding survivor function estimates. Alternatively, we can use the augment() function to get both types of prediction and automatically attach them to the data.

-

The largest event time in the training set is 18.1 so we will use a set of evaluation times between zero and 18. From there, we use augment():

-
time_points <- seq(0, 18, by = 0.25)
+

The largest event time in the training set is 21.083, so we will use a set of evaluation times between zero and 21.

+
time_points <- seq(0, 21, by = 0.25)
 
 val_pred <- augment(bag_tree_fit, sim_val, eval_time = time_points)
 val_pred
-#> # A tibble: 250 × 5
+#> # A tibble: 500 × 5
 #>    .pred             .pred_time event_time    X1      X2
 #>    <list>                 <dbl>     <Surv> <dbl>   <dbl>
-#>  1 <tibble [73 × 5]>       9.48      5.78      1 -0.630 
-#>  2 <tibble [73 × 5]>       6.67      3.26+     0  0.792 
-#>  3 <tibble [73 × 5]>       3.82      2.34      1  0.811 
-#>  4 <tibble [73 × 5]>       8.53      7.45+     1 -0.271 
-#>  5 <tibble [73 × 5]>       7.07      8.05      0  0.315 
-#>  6 <tibble [73 × 5]>       7.24     14.09      0  0.264 
-#>  7 <tibble [73 × 5]>      10.6       5.23+     0 -0.532 
-#>  8 <tibble [73 × 5]>      12.3       3.18+     0 -1.41  
-#>  9 <tibble [73 × 5]>      11.0       1.86      1 -0.851 
-#> 10 <tibble [73 × 5]>       6.03      7.20      1 -0.0937
-#> # ℹ 240 more rows
-

The .pred column contains the dynamic predictions in a list column. Since length(time_points) is 73, each data frame in the list column has that many rows:

+#> 1 <tibble [85 × 5]> 6.66 4.83 1 -0.630 +#> 2 <tibble [85 × 5]> 6.66 6.11 1 -0.606 +#> 3 <tibble [85 × 5]> 7.47 6.60+ 1 -1.03 +#> 4 <tibble [85 × 5]> 3.29 2.72 1 0.811 +#> 5 <tibble [85 × 5]> 5.10 4.73+ 1 -0.376 +#> 6 <tibble [85 × 5]> 4.99 8.70 0 1.18 +#> 7 <tibble [85 × 5]> 7.23 10.82 1 -0.851 +#> 8 <tibble [85 × 5]> 6.46 6.89 0 0.493 +#> 9 <tibble [85 × 5]> 4.75 2.45+ 1 0.0207 +#> 10 <tibble [85 × 5]> 13.4 8.23+ 0 -1.52 +#> # ℹ 490 more rows
+

The observed data are in the event_time column. The predicted survival probabilities are in the .pred column. This is a list column with a data frame for each observation, containing the predictions at the 85 evaluation time points in the (nested) column .pred_survival.

val_pred$.pred[[1]]
-#> # A tibble: 73 × 5
+#> # A tibble: 85 × 5
 #>    .eval_time .pred_survival .weight_time .pred_censored .weight_censored
 #>         <dbl>          <dbl>        <dbl>          <dbl>            <dbl>
 #>  1       0             1            0              1                 1   
-#>  2       0.25          0.996        0.250          1                 1   
-#>  3       0.5           0.993        0.500          0.997             1.00
-#>  4       0.75          0.993        0.750          0.995             1.01
-#>  5       1             0.989        1.00           0.992             1.01
-#>  6       1.25          0.984        1.25           0.985             1.02
-#>  7       1.5           0.975        1.50           0.978             1.02
-#>  8       1.75          0.974        1.75           0.969             1.03
-#>  9       2             0.964        2.00           0.956             1.05
-#> 10       2.25          0.949        2.25           0.944             1.06
-#> # ℹ 63 more rows
+#> 2 0.25 1 0.250 0.999 1.00 +#> 3 0.5 0.999 0.500 0.996 1.00 +#> 4 0.75 0.992 0.750 0.993 1.01 +#> 5 1 0.988 1.00 0.991 1.01 +#> 6 1.25 0.980 1.25 0.987 1.01 +#> 7 1.5 0.972 1.50 0.981 1.02 +#> 8 1.75 0.959 1.75 0.971 1.03 +#> 9 2 0.938 2.00 0.966 1.04 +#> 10 2.25 0.925 2.25 0.959 1.04 +#> # ℹ 75 more rows

The yardstick package currently has two dynamic metrics. Each is described below.

Brier Score

The Brier score is a metric that can be used with both classification and event-time models. For classification models, it computes the squared error between the observed outcome (encoded as 0/1) and the corresponding predicted probability for the class.

-

A little math: suppose that the value \(y_{ik}\) is a 0/1 indicator for whether the observed outcome \(i\) corresponds to class \(k\), and \(\hat{p}_{ik}\) is the estimated class probability. The score is then:

+

A little math: suppose that the value \(y_{ik}\) is a 0/1 indicator for whether the observed outcome \(i\) corresponds to class \(k\), and \(\hat{p}_{ik}\) is the estimated class probability. The classification score is then:

\[ Brier_{class} = \frac{1}{N}\sum_{i=1}^N\sum_{k=1}^C (y_{ik} - \hat{p}_{ik})^2 -\] -There are only two classes for survival models: events and non-events1. The survival function estimate is the probability that corresponds to non-events. For example, if were has not been an event at the current evaluation time, our best model should estimate the survival probability to be near one. For observations that are events, the probability estimate is just one minus the survivor estimate.

-

As previously described, we weight each observation using a technique similar to propensity weights in causal inference. We’ll denote these weights for evaluation time t as \(w_{it}\). Denoting the survival probability estimate as \(\hat{p}_{it}\), the time-dependent Brier score is:

+\]

+

For survival models, we transform the event time data into a binary version \(y_{it}\): is there an event at evaluation time \(t\)1. The survival function estimate \(\hat{p}_{it}\) is the probability that corresponds to non-events at time \(t\). For example, if there has not been an event at the current evaluation time, our best model should estimate the survival probability to be near one. For observations that are events, the probability estimate is just one minus the survivor estimate. To account for censoring, we also weight each observation with \(w_{it}\). The time-dependent Brier score is:

\[
-Brier_{surv}(t) = \frac{1}{N}\sum_{i=1}^N w_{it}\left[\underbrace{I(y_{i} = 0)(y_{i} - \hat{p}_{it})^2}_\text{non-events} + \underbrace{I(y_{i} = 1)(y_{i} - (1 - \hat{p}_{it}))^2}_\text{events}\right]
+Brier_{surv}(t) = \frac{1}{N}\sum_{i=1}^N w_{it}\left[\underbrace{I(y_{it} = 0)((1 - y_{it}) - \hat{p}_{it})^2}_\text{non-events} + \underbrace{I(y_{it} = 1)(y_{it} - (1 - \hat{p}_{it}))^2}_\text{events}\right]
\]

For this score, a perfect model has a score of zero, while an uninformative model would have a score of around 1/4.
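Before turning to yardstick, a quick numeric illustration of the two bracketed terms may help. This is a minimal sketch with made-up values (the vectors y, p_surv, and w are hypothetical, not the validation data):

```r
# Minimal sketch of the time-dependent Brier score at one evaluation time,
# using made-up values. y is the binary event indicator at time t, p_surv
# is the predicted survival probability, and w is the censoring weight.
y      <- c(0, 0, 1, 1)
p_surv <- c(0.9, 0.6, 0.2, 0.7)
w      <- c(1.0, 1.1, 1.2, 1.0)

# Non-events are scored against the survival probability, events against
# the event probability (one minus survival).
mean(w * ifelse(y == 0, (1 - p_surv)^2, p_surv^2))
#> [1] 0.181
```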

How do we compute this using the yardstick package? The function brier_survival() follows the same convention as the other metric functions. The main arguments are:

@@ -128,20 +127,20 @@

Brier Score

val_pred %>% brier_survival(truth = event_time, .pred) brier_scores -#> # A tibble: 73 × 4 +#> # A tibble: 85 × 4 #> .metric .estimator .eval_time .estimate #> <chr> <chr> <dbl> <dbl> -#> 1 brier_survival standard 0 0 -#> 2 brier_survival standard 0.25 0.0000270 -#> 3 brier_survival standard 0.5 0.00418 -#> 4 brier_survival standard 0.75 0.0121 -#> 5 brier_survival standard 1 0.0243 -#> 6 brier_survival standard 1.25 0.0431 -#> 7 brier_survival standard 1.5 0.0521 -#> 8 brier_survival standard 1.75 0.0677 -#> 9 brier_survival standard 2 0.0857 -#> 10 brier_survival standard 2.25 0.100 -#> # ℹ 63 more rows
+#> 1 brier_survival standard 0 0 +#> 2 brier_survival standard 0.25 0 +#> 3 brier_survival standard 0.5 0.00202 +#> 4 brier_survival standard 0.75 0.00796 +#> 5 brier_survival standard 1 0.0266 +#> 6 brier_survival standard 1.25 0.0402 +#> 7 brier_survival standard 1.5 0.0563 +#> 8 brier_survival standard 1.75 0.0785 +#> 9 brier_survival standard 2 0.0895 +#> 10 brier_survival standard 2.25 0.0951 +#> # ℹ 75 more rows

Over time:

brier_scores %>% 
   ggplot(aes(.eval_time, .estimate)) + 
@@ -155,18 +154,17 @@ 

Brier Score

#> # A tibble: 1 × 3 #> .metric .estimator .estimate #> <chr> <chr> <dbl> -#> 1 brier_survival_integrated standard 0.121
+#> 1 brier_survival_integrated standard 0.113

Again, smaller values are better.
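Conceptually, the integrated score is the area under the Brier-score-versus-time curve, scaled by the width of the evaluation window. Here is a hedged sketch of that idea using the trapezoidal rule on brier_scores (yardstick's exact integration rule may differ):

```r
# Sketch: approximate the integrated Brier score with the trapezoidal rule,
# normalized by the width of the evaluation time window. The result should
# land close to the brier_survival_integrated() value above.
with(
  brier_scores,
  sum(diff(.eval_time) * (head(.estimate, -1) + tail(.estimate, -1)) / 2) /
    diff(range(.eval_time))
)
```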

Receiver Operating Characteristic (ROC) Curves

-

Given that we can convert our event time data into a binary representation at time t, it is possible to compute any classification statistics of interest. Sensitivity and specificity are two interesting quantities:

+

When we turn not only the event time data but also the predicted probabilities into a binary representation, we are in well-charted classification metrics territory. Sensitivity and specificity are common quantities to compute; we do so here in their weighted versions to account for censoring:

-  • sensitivity: How well do we predict the events? This is analogous to the true positive rate.
-  • specificity: How well do we predict the non-events? One minus specificity is the false positive rate.
+  • Sensitivity: How well do we predict the events? This is analogous to the true positive rate.
+  • Specificity: How well do we predict the non-events? One minus specificity is the false positive rate.
-

As with classification, we don’t know what threshold to use for the survival estimate to convert the data to a binary format. The default of 50% may not be appropriate.

-

To look a little closer, we’ll collect the dynamic predictions and convert them to binary factors.

+

Let’s take a look at our example data and use an evaluation time of 5.00!

time_as_binary_event <- function(surv, eval_time) {
   event_time <- parsnip:::.extract_surv_time(surv) # TODO we should export these
   status <- parsnip:::.extract_surv_status(surv)
@@ -184,8 +182,7 @@ 

Receiver Operating Characteristic (ROC) Curves

factor(binary_res, levels = c("event", "non-event")) } - -# Unnest the list columns and convert the data to binary format +# Unnest the list columns and convert the event time data to binary format binary_encoding <- val_pred %>% select(.pred, event_time) %>% @@ -194,28 +191,28 @@

Receiver Operating Characteristic (ROC) Curves

mutate( obs_class = time_as_binary_event(event_time, .eval_time), pred_class = if_else(.pred_survival >= 1 / 2, "non-event", "event"), - pred_class = factor(pred_class, levels = c("event", "non-event")), - )
-

Let’s take an evaluation time of 5.00 as an example.

-
data_at_5 <- 
+    pred_class = factor(pred_class, levels = c("event", "non-event"))
+  )
+
+data_at_5 <- 
   binary_encoding %>% 
   filter(.eval_time == 5.00 & !is.na(.weight_censored)) %>% 
-  select(.eval_time, .pred_survival, .weight_censored, obs_class, pred_class, event_time)
+  select(.eval_time, .pred_survival, .weight_censored, obs_class, event_time)
 data_at_5
-#> # A tibble: 205 × 6
-#>    .eval_time .pred_survival .weight_censored obs_class pred_class event_time
-#>         <dbl>          <dbl>            <dbl> <fct>     <fct>          <Surv>
-#>  1          5         0.835              1.28 non-event non-event       5.78 
-#>  2          5         0.327              1.06 event     event           2.34 
-#>  3          5         0.812              1.28 non-event non-event       7.45+
-#>  4          5         0.751              1.28 non-event non-event       8.05 
-#>  5          5         0.755              1.28 non-event non-event      14.09 
-#>  6          5         0.854              1.28 non-event non-event       5.23+
-#>  7          5         0.868              1.04 event     non-event       1.86 
-#>  8          5         0.664              1.28 non-event non-event       7.20 
-#>  9          5         0.0436             1.19 event     event           3.95 
-#> 10          5         0.375              1.28 non-event event           5.18+
-#> # ℹ 195 more rows
+#> # A tibble: 391 × 5 +#> .eval_time .pred_survival .weight_censored obs_class event_time +#> <dbl> <dbl> <dbl> <fct> <Surv> +#> 1 5 0.662 1.27 event 4.83 +#> 2 5 0.662 1.29 non-event 6.11 +#> 3 5 0.731 1.29 non-event 6.60+ +#> 4 5 0.213 1.07 event 2.72 +#> 5 5 0.499 1.29 non-event 8.70 +#> 6 5 0.711 1.29 non-event 10.82 +#> 7 5 0.668 1.29 non-event 6.89 +#> 8 5 0.901 1.29 non-event 8.23+ +#> 9 5 0.469 1.20 event 4.20 +#> 10 5 0.886 1.29 non-event 7.27+ +#> # ℹ 381 more rows

What does the distribution of the survival probabilities for each of the actual classes look like?

data_at_5 %>% 
   ggplot(aes(x = .pred_survival, weight = .weight_censored)) + 
@@ -223,33 +220,33 @@ 

Receiver Operating Characteristic (ROC) Curves

geom_histogram(col = "white", bins = 30) + facet_wrap(~obs_class, ncol = 1) + lims(x = 0:1) + - labs(x = "Probability of Survival", y = "Sum of Weights")
+ labs(x = "probability of survival", y = "sum of weights")

-

More probability values are to the right of the 50% cutoff for the true non-events. Conversely, true events tend to have smaller probabilities. Using this cutoff, the sensitivity would be 70.3% and the specificity would be 83%. There are other possible cutoffs for the survival probabilities. Maybe one of these would have better statistics.

+

More probability values are to the right of the 50% cutoff for the true non-events. Conversely, true events tend to have smaller probabilities. Using this cutoff, the sensitivity would be 66.8% and the specificity would be 82.3%. There are other possible cutoffs for the survival probabilities. Maybe one of these would have better statistics.
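Those two percentages can be reproduced with yardstick's regular classification metrics by passing the censoring weights as case weights. A sketch, where the mutate() call recreates the classes at the 50% cutoff:

```r
# Weighted accuracy, sensitivity, and specificity at the 50% cutoff; the
# censoring weights enter as case weights.
cls_set <- metric_set(accuracy, sens, spec)

data_at_5 %>% 
  mutate(
    pred_class = if_else(.pred_survival >= 1 / 2, "non-event", "event"),
    pred_class = factor(pred_class, levels = c("event", "non-event"))
  ) %>% 
  cls_set(truth = obs_class, estimate = pred_class, case_weights = .weight_censored)
```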

ROC curves were designed as a general method that, given a collection of continuous predictions, determines an effective threshold such that values above the threshold indicate a specific event. For our purposes, the ROC curve will compute the sensitivity and specificity for all possible probability thresholds. It then plots the true positive rate versus the false positive rate. Generally, we use the area under the ROC curve to quantify it with a single number. Values near one indicate a perfect model, while values near 1/2 occur with non-informative models.

Blanche et al (2013) gives a good overview of ROC curves for survival analysis and their Section 4.3 is most relevant here.

-

For our example at t = 5.00, the ROC curve is:

+

For our example at evaluation time \(t = 5.00\), the ROC curve is:
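A sketch of how such a curve can be computed: predict at the single evaluation time of interest and pass the result to roc_curve_survival(). (We assume here that the returned curve carries sensitivity and specificity columns, as the classification roc_curve() does.)

```r
# Time-dependent ROC curve at t = 5.00 (a sketch).
pred_05 <- augment(bag_tree_fit, sim_val, eval_time = 5)

pred_05 %>% 
  roc_curve_survival(truth = event_time, .pred) %>% 
  ggplot(aes(1 - specificity, sensitivity)) + 
  geom_abline(col = "red", lty = 3) +
  geom_path() + 
  coord_equal()
```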

-

The area under this curve is 0.838.

-

Since this is a dynamic metric, we compute the AUC for each evaluation time. The syntax is the same as the Brier code shown above:

+

The area under this curve is 0.807.

+

Since this is a dynamic metric, we compute the AUC for each evaluation time. The syntax is very similar to the Brier code shown above:

roc_scores <-
   val_pred %>% 
   roc_auc_survival(truth = event_time, .pred)
 roc_scores
-#> # A tibble: 73 × 4
+#> # A tibble: 85 × 4
 #>    .metric          .estimator .eval_time .estimate
 #>    <chr>            <chr>           <dbl>     <dbl>
-#>  1 roc_auc_survival standard         0       0.5   
-#>  2 roc_auc_survival standard         0.25    0.5   
-#>  3 roc_auc_survival standard         0.5     0.0948
-#>  4 roc_auc_survival standard         0.75    0.851 
-#>  5 roc_auc_survival standard         1       0.697 
-#>  6 roc_auc_survival standard         1.25    0.697 
-#>  7 roc_auc_survival standard         1.5     0.719 
-#>  8 roc_auc_survival standard         1.75    0.757 
-#>  9 roc_auc_survival standard         2       0.745 
-#> 10 roc_auc_survival standard         2.25    0.765 
-#> # ℹ 63 more rows
+#> 1 roc_auc_survival standard 0 0.5 +#> 2 roc_auc_survival standard 0.25 0.5 +#> 3 roc_auc_survival standard 0.5 0.869 +#> 4 roc_auc_survival standard 0.75 0.852 +#> 5 roc_auc_survival standard 1 0.734 +#> 6 roc_auc_survival standard 1.25 0.768 +#> 7 roc_auc_survival standard 1.5 0.792 +#> 8 roc_auc_survival standard 1.75 0.777 +#> 9 roc_auc_survival standard 2 0.771 +#> 10 roc_auc_survival standard 2.25 0.778 +#> # ℹ 75 more rows

Over time:

roc_scores %>% 
   ggplot(aes(.eval_time, .estimate)) + 
@@ -259,13 +256,13 @@ 

Receiver Operating Characteristic (ROC) Curves

labs(x = "time", y = "ROC AUC")

The initial variation is due to so few events at the early stages of analysis.

-

We should maximize the AUC. This metric measures the separation between classes and the Brier score focuses more on accurate and well-calibrated predictions. It should not be surprising that each metric’s results over time differ.

+

The ROC curve measures the separation between classes, while the Brier score focuses more on accurate and well-calibrated predictions. It should not be surprising that each metric's results over time differ.

Tuning these metrics

Many of the event time models available in tidymodels have tuning parameters. The tune_*() functions and fit_resamples() have an eval_time argument used to pass the evaluation times. The statistics are computed for these time points using out-of-sample data.

In some cases, such as iterative search or racing methods, the functions need a single value to optimize. If a dynamic metric is used to guide the optimization, the first evaluation time given by the user will be used.

-

For example, if a model for these data was being optimized, and we wanted a time of 5.00 to guide the process, we would wish to use that value as the first element of time_points.

+

For example, if a model for these data was being optimized, and we wanted a time of 5.00 to guide the process, we would need to use that value of 5.00 as the first element of time_points, the vector given to the eval_time argument in our example above.
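To make this concrete, here is a hypothetical sketch of such a tuning call. The tunable tree specification and the grid size are made up for illustration; the eval_time argument is the one described above:

```r
# Hypothetical sketch: tune a survival decision tree with a dynamic metric.
# Placing 5.00 first in eval_time makes it the value that guides any
# single-number optimization.
tree_spec <- 
  decision_tree(cost_complexity = tune()) %>% 
  set_mode("censored regression") %>% 
  set_engine("rpart")

tune_res <- 
  tree_spec %>% 
  tune_grid(
    event_time ~ X1 + X2,
    resamples = sim_rs,
    grid = 10,
    metrics = metric_set(brier_survival),
    eval_time = c(5, time_points[time_points != 5])
  )
```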

Summary

@@ -280,43 +277,44 @@

Session information

#> ─ Session info ─────────────────────────────────────────────────────
 #>  setting  value
 #>  version  R version 4.2.0 (2022-04-22)
-#>  os       macOS Big Sur/Monterey 10.16
-#>  system   x86_64, darwin17.0
+#>  os       macOS 13.4
+#>  system   aarch64, darwin20
 #>  ui       X11
 #>  language (EN)
 #>  collate  en_US.UTF-8
 #>  ctype    en_US.UTF-8
-#>  tz       America/New_York
-#>  date     2023-05-16
-#>  pandoc   3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
+#>  tz       Europe/London
+#>  date     2023-06-07
+#>  pandoc   2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
 #> 
 #> ─ Packages ─────────────────────────────────────────────────────────
 #>  package    * version    date (UTC) lib source
 #>  broom      * 1.0.4      2023-03-11 [1] CRAN (R 4.2.0)
-#>  censored   * 0.2.0.9000 2023-05-16 [1] Github (tidymodels/censored@f9eccb6)
+#>  censored   * 0.2.0.9000 2023-05-22 [1] Github (tidymodels/censored@f9eccb6)
 #>  dials      * 1.2.0      2023-04-03 [1] CRAN (R 4.2.0)
 #>  dplyr      * 1.1.2      2023-04-20 [1] CRAN (R 4.2.0)
 #>  ggplot2    * 3.4.2      2023-04-03 [1] CRAN (R 4.2.0)
 #>  infer      * 1.0.4      2022-12-02 [1] CRAN (R 4.2.0)
-#>  parsnip    * 1.1.0.9001 2023-05-16 [1] Github (tidymodels/parsnip@ab42409)
+#>  parsnip    * 1.1.0.9002 2023-06-05 [1] local
 #>  prodlim    * 2023.03.31 2023-04-02 [1] CRAN (R 4.2.0)
 #>  purrr      * 1.0.1      2023-01-10 [1] CRAN (R 4.2.0)
-#>  recipes    * 1.0.6      2023-04-25 [1] CRAN (R 4.2.0)
+#>  recipes    * 1.0.6.9000 2023-05-17 [1] local
 #>  rlang        1.1.1      2023-04-28 [1] CRAN (R 4.2.0)
-#>  rsample    * 1.1.1      2022-12-07 [1] CRAN (R 4.2.0)
+#>  rsample    * 1.1.1.9000 2023-04-21 [1] local
 #>  tibble     * 3.2.1      2023-03-20 [1] CRAN (R 4.2.0)
 #>  tidymodels * 1.1.0      2023-05-01 [1] CRAN (R 4.2.0)
-#>  tune       * 1.1.1.9001 2023-05-16 [1] Github (tidymodels/tune@fdc0016)
-#>  workflows  * 1.1.3      2023-02-22 [1] CRAN (R 4.2.0)
+#>  tune       * 1.1.1.9000 2023-04-20 [1] local
+#>  workflows  * 1.1.3.9000 2023-04-03 [1] local
 #>  yardstick  * 1.2.0      2023-04-21 [1] CRAN (R 4.2.0)
 #> 
-#>  [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
+#>  [1] /Users/hannah/Library/R/arm64/4.2/library
+#>  [2] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
 #> 
 #> ────────────────────────────────────────────────────────────────────

-  1. Again, see the previous article for more detail on this.↩︎
+  1. Again, see the Accounting for Censoring in Performance Metrics for Event Time Data article for more on this.↩︎

From b0e1816dd91834b05fde26e8d904b085e2bdb2a1 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Wed, 7 Jun 2023 12:32:08 +0100 Subject: [PATCH 4/6] fix file extension --- .../{index.Rmarkdown.Rmd => index.Rmarkdown} | 1 + .../survival-metrics/index.Rmarkdown.html | 320 -------------- .../survival-metrics/index.markdown | 407 ++++++++++++++++++ 3 files changed, 408 insertions(+), 320 deletions(-) rename content/learn/statistics/survival-metrics/{index.Rmarkdown.Rmd => index.Rmarkdown} (99%) delete mode 100644 content/learn/statistics/survival-metrics/index.Rmarkdown.html create mode 100644 content/learn/statistics/survival-metrics/index.markdown diff --git a/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd b/content/learn/statistics/survival-metrics/index.Rmarkdown similarity index 99% rename from content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd rename to content/learn/statistics/survival-metrics/index.Rmarkdown index f41e3581..bd7cff65 100644 --- a/content/learn/statistics/survival-metrics/index.Rmarkdown.Rmd +++ b/content/learn/statistics/survival-metrics/index.Rmarkdown @@ -27,6 +27,7 @@ options(pillar.advice = FALSE, pillar.min_title_chars = Inf) pak::pak(c("tidymodels/censored", "tidymodels/parsnip")) ``` + ## Introduction One trend in modern survival analysis is to compute time-dependent measures of performance. These are primarily driven by an increased focus on predictions for the probability of survival at a given time (as opposed to the predictions of event times or linear predictors). Since these are conditional on the time of evaluation, we call them dynamic performance metrics. diff --git a/content/learn/statistics/survival-metrics/index.Rmarkdown.html b/content/learn/statistics/survival-metrics/index.Rmarkdown.html deleted file mode 100644 index 6d4b8692..00000000 --- a/content/learn/statistics/survival-metrics/index.Rmarkdown.html +++ /dev/null @@ -1,320 +0,0 @@ ---- -title: "Dynamic Performance Metrics for Event Time Data" -tags: [survival,censored,parsnip,yardstick] -categories: [statistical analysis] -type: learn-subsection -weight: 9 -description: | - Let's discuss how to compute modern performance metrics for time-to-event models. ---- - - - -

To use the code in this article, you will need to install the following packages: censored, prodlim, and tidymodels. You’ll need the development versions of censored and parsnip. To install these, use

-
#install.packages("pak")
-
-pak::pak(c("tidymodels/censored", "tidymodels/parsnip"))
-
-

Introduction

-

One trend in modern survival analysis is to compute time-dependent measures of performance. These are primarily driven by an increased focus on predictions for the probability of survival at a given time (as opposed to the predictions of event times or linear predictors). Since these are conditional on the time of evaluation, we call them dynamic performance metrics.

-

Many dynamic metrics are similar to those used for binary classification models. The basic idea is that, for a given time \(t\) for model evaluation, we try to encode the observed event time data into a binary “has there been an event at time \(t\)?” version. We can also convert the predicted survival probabilities into predicted events/non-events based on a threshold (default is 0.50). The survival versions of these metrics need those binary versions of observed truth and predictions as well as a way to account for censoring.

-

Censoring plays into the details of the conversion and is additionally captured in the form of weights. For details on both these aspects, see the Accounting for Censoring in Performance Metrics for Event Time Data article.

-

To start, let’s define the various types of times that will be mentioned:

-
    -
  • Observed time: time recorded in the data
  • -
  • Event time: observed times for actual events
  • -
  • Evaluation time: the time, specified by the analyst, that the model is evaluated.
  • -
-
-
-

Example data

-

As an example, we’ll simulate some data with the prodlim package, using the methods of Bender et al (2005). A training and a validation set are simulated. We’ll also load the censored package so that we can fit a model to these time-to-event data:

-
library(tidymodels)
-library(censored)
-#> Loading required package: survival
-library(prodlim)
-
-set.seed(5882)
-sim_dat <- SimSurv(2000) %>%
-  mutate(event_time = Surv(time, event)) %>%
-  select(event_time, X1, X2)
-
-set.seed(2)
-split   <- initial_split(sim_dat)
-sim_tr  <- training(split)
-sim_val <- testing(split)
-
-## Resampling object
-sim_rs <- vfold_cv(sim_tr)
-

We’ll need a model to illustrate the code and concepts. Let’s fit a bagged survival tree model to the training set:

-
set.seed(17)
-bag_tree_fit <- 
-  bag_tree() %>% 
-  set_mode("censored regression") %>% 
-  set_engine("rpart") %>% 
-  fit(event_time ~ ., data = sim_tr)
-bag_tree_fit
-#> parsnip model object
-#> 
-#> 
-#> Bagging survival trees with 25 bootstrap replications 
-#> 
-#> Call: bagging.data.frame(formula = event_time ~ ., data = data)
-

Using this model, we’ll make predictions of different types.

-
-
-

Survival Probability Prediction

-

This censored regression model can make static predictions via the predicted event time using predict(object, type = "time"). It can also create dynamic predictions regarding the probability of survival for each data point at specific times. The syntax for this is

-
predict(object, new_data, type = "survival", eval_time = numeric())
-

where eval_time is a vector of time points at which we want the corresponding survivor function estimates. Alternatively, we can use the augment() function to get both types of prediction and automatically attach them to the data.

-

The largest event time in the training set is 21.083 so we will use a set of evaluation times between zero and 21.

-
time_points <- seq(0, 21, by = 0.25)
-
-val_pred <- augment(bag_tree_fit, sim_val, eval_time = time_points)
-val_pred
-#> # A tibble: 500 × 5
-#>    .pred             .pred_time event_time    X1      X2
-#>    <list>                 <dbl>     <Surv> <dbl>   <dbl>
-#>  1 <tibble [85 × 5]>       6.66      4.83      1 -0.630 
-#>  2 <tibble [85 × 5]>       6.66      6.11      1 -0.606 
-#>  3 <tibble [85 × 5]>       7.47      6.60+     1 -1.03  
-#>  4 <tibble [85 × 5]>       3.29      2.72      1  0.811 
-#>  5 <tibble [85 × 5]>       5.10      4.73+     1 -0.376 
-#>  6 <tibble [85 × 5]>       4.99      8.70      0  1.18  
-#>  7 <tibble [85 × 5]>       7.23     10.82      1 -0.851 
-#>  8 <tibble [85 × 5]>       6.46      6.89      0  0.493 
-#>  9 <tibble [85 × 5]>       4.75      2.45+     1  0.0207
-#> 10 <tibble [85 × 5]>      13.4       8.23+     0 -1.52  
-#> # ℹ 490 more rows
-

The observed data are in the event_time column. The predicted survival probabilities are in the .pred column. This is a list column with a data frame for each observation, containing the predictions at the 85 evaluation time points in the (nested) column .pred_survival.

-
val_pred$.pred[[1]]
-#> # A tibble: 85 × 5
-#>    .eval_time .pred_survival .weight_time .pred_censored .weight_censored
-#>         <dbl>          <dbl>        <dbl>          <dbl>            <dbl>
-#>  1       0             1            0              1                 1   
-#>  2       0.25          1            0.250          0.999             1.00
-#>  3       0.5           0.999        0.500          0.996             1.00
-#>  4       0.75          0.992        0.750          0.993             1.01
-#>  5       1             0.988        1.00           0.991             1.01
-#>  6       1.25          0.980        1.25           0.987             1.01
-#>  7       1.5           0.972        1.50           0.981             1.02
-#>  8       1.75          0.959        1.75           0.971             1.03
-#>  9       2             0.938        2.00           0.966             1.04
-#> 10       2.25          0.925        2.25           0.959             1.04
-#> # ℹ 75 more rows
-

The yardstick package currently has two dynamic metrics. Each is described below.

-
-
-

Brier Score

-

The Brier score is a metric that can be used with both classification and event-time models. For classification models, it computes the squared error between the observed outcome (encoded as 0/1) and the corresponding predicted probability for the class.

-

A little math: suppose that the value \(y_{ik}\) is a 0/1 indicator for whether the observed outcome \(i\) corresponds to class \(k\), and \(\hat{p}_{ik}\) is the estimated class probability. The classification score is then:

-

\[ -Brier_{class} = \frac{1}{N}\sum_{i=1}^N\sum_{k=1}^C (y_{ik} - \hat{p}_{ik})^2 -\]

-

For survival models, we transform the event time data into a binary version \(y_{it}\): is there an event at evaluation time \(t\)1. The survival function estimate \(\hat{p}_{it}\) is the probability that corresponds to non-events at time \(t\). For example, if were has not been an event at the current evaluation time, our best model should estimate the survival probability to be near one. For observations that are events, the probability estimate is just one minus the survivor estimate. To account for censoring, we also weight each observation with \(w_{it}\). The time-dependent Brier score is:

-

\[ -Brier_{surv}(t) = \frac{1}{N}\sum_{i=1}^N w_{it}\left[\underbrace{I(y_{it} = 0)(y_{it} - \hat{p}_{it})^2}_\text{non-events} + \underbrace{I(y_{it} = 1)(y_{it} - (1 - \hat{p}_{it}))^2}_\text{events}\right] -\]

-

For this score, a perfect model has a score of zero, while an uninformative model would have a score of around 1/4.

-

How do we compute this using the yardstick package? The function brier_survival() follows the same convention as the other metric functions. The main arguments are:

-
    -
  • data: the data frame with the predictions (structured as the output produced by augment(), as shown above).
  • -
  • truth: the name of the column with the Surv object.
  • -
  • ...: the name of the column with the dynamic predictions. Within tidymodels, this column is always called .pred. In other words, .pred should be passed without an argument name.
  • -
-

Since the evaluation times and the case weights are within the .pred column, there is no need to specify these. Here are the results of our validation set:

-
brier_scores <-
-  val_pred %>% 
-  brier_survival(truth = event_time, .pred)
-brier_scores
-#> # A tibble: 85 × 4
-#>    .metric        .estimator .eval_time .estimate
-#>    <chr>          <chr>           <dbl>     <dbl>
-#>  1 brier_survival standard         0      0      
-#>  2 brier_survival standard         0.25   0      
-#>  3 brier_survival standard         0.5    0.00202
-#>  4 brier_survival standard         0.75   0.00796
-#>  5 brier_survival standard         1      0.0266 
-#>  6 brier_survival standard         1.25   0.0402 
-#>  7 brier_survival standard         1.5    0.0563 
-#>  8 brier_survival standard         1.75   0.0785 
-#>  9 brier_survival standard         2      0.0895 
-#> 10 brier_survival standard         2.25   0.0951 
-#> # ℹ 75 more rows
-

Over time:

-
brier_scores %>% 
-  ggplot(aes(.eval_time, .estimate)) + 
-  geom_hline(yintercept = 1 / 4, col = "red", lty = 3) +
-  geom_line() +
-  geom_point() + 
-  labs(x = "time", y = "Brier score")
-

-

There is also an integrated Brier score. This required evaluation times as inputs but instead of returning each result, it takes the area under the plot shown above. The syntax is the same but the result has a single row:

-
val_pred %>% brier_survival_integrated(truth = event_time, .pred)
-#> # A tibble: 1 × 3
-#>   .metric                   .estimator .estimate
-#>   <chr>                     <chr>          <dbl>
-#> 1 brier_survival_integrated standard       0.113
-

Again, smaller values are better.

-
-
-

Receiver Operating Characteristic (ROC) Curves

-

When we not only turn the event time data into a binary representation but also the predicted probabilities, we are in well-chartered classification metrics territory. Sensitivity and specificity are common quantities to compute, we do so here in their weighted version to account for censoring:

-
    -
  • Sensitivity: How well do we predict the events? This is analogous to the true positive rate.
  • -
  • Specificity: How well do we predict the non-events? One minus specificity is the false positive rate.
  • -
-

Let’s take a look at our example data and use an evaluation time of 5.00!

-
time_as_binary_event <- function(surv, eval_time) {
-  event_time <- parsnip:::.extract_surv_time(surv) # TODO we should export these
-  status <- parsnip:::.extract_surv_status(surv)
-  is_event_before_t <- event_time <= eval_time & status == 1
-
-  # Three possible contributions to the statistic from Graf 1999
-  # Censoring time before eval_time, no contribution (Graf category 3)
-  binary_res <- rep(NA_character_, length(event_time))
-
-  # A real event prior to eval_time (Graf category 1)
-  binary_res <- if_else(is_event_before_t, "event", binary_res)
-
-  # Observed time greater than eval_time (Graf category 2)
-  binary_res <- if_else(event_time > eval_time, "non-event", binary_res)
-  factor(binary_res, levels = c("event", "non-event"))
-}
-
-# Unnest the list columns and convert the event time data to binary format 
-binary_encoding <- 
-  val_pred %>% 
-  select(.pred, event_time) %>% 
-  add_rowindex() %>% 
-  unnest(.pred) %>% 
-  mutate(
-    obs_class = time_as_binary_event(event_time, .eval_time),
-    pred_class = if_else(.pred_survival >= 1 / 2, "non-event", "event"),
-    pred_class = factor(pred_class, levels = c("event", "non-event"))
-  )
-
-data_at_5 <- 
-  binary_encoding %>% 
-  filter(.eval_time == 5.00 & !is.na(.weight_censored)) %>% 
-  select(.eval_time, .pred_survival, .weight_censored, obs_class, event_time)
-data_at_5
-#> # A tibble: 391 × 5
-#>    .eval_time .pred_survival .weight_censored obs_class event_time
-#>         <dbl>          <dbl>            <dbl> <fct>         <Surv>
-#>  1          5          0.662             1.27 event          4.83 
-#>  2          5          0.662             1.29 non-event      6.11 
-#>  3          5          0.731             1.29 non-event      6.60+
-#>  4          5          0.213             1.07 event          2.72 
-#>  5          5          0.499             1.29 non-event      8.70 
-#>  6          5          0.711             1.29 non-event     10.82 
-#>  7          5          0.668             1.29 non-event      6.89 
-#>  8          5          0.901             1.29 non-event      8.23+
-#>  9          5          0.469             1.20 event          4.20 
-#> 10          5          0.886             1.29 non-event      7.27+
-#> # ℹ 381 more rows
-

What does the distribution of the survival probabilities for each of the actual classes look like?

-
data_at_5 %>% 
-  ggplot(aes(x = .pred_survival, weight = .weight_censored)) + 
-  geom_vline(xintercept = 1 / 2, col = "blue", lty = 2) +
-  geom_histogram(col = "white", bins = 30) + 
-  facet_wrap(~obs_class, ncol = 1) +
-  lims(x = 0:1) +
-  labs(x = "probability of survival", y = "sum of weights")
-

-

More probability values are to the right of the 50% cutoff for the true non-events. Conversely, true events tend to have smaller probabilities. Using this cutoff, the sensitivity would be 66.8% and the specificity would be 82.3%. There are other possible cutoffs for the survival probabilities. Maybe one of these would have better statistics.

-

ROC curves were designed as a general method that, given a collection of continuous predictions, determines an effective threshold such that values above the threshold indicate a specific event. For our purposes, the ROC curve will compute the sensitivity and specificity for all possible probability thresholds. It then plots the true positive rate versus the false positive rate. Generally, we use the area under the ROC curve to quantify it with a single number. Values near one indicate a perfect model, while values near 1/2 occur with non-informative models.

-

Blanche et al (2013) gives a good overview of ROC curves for survival analysis and their Section 4.3 is most relevant here.

-

For our example at evaluation time \(t = 5.00\), the ROC curve is:

-

-

The area under this curve is 0.807.

-

Since this is a dynamic metric, we compute the AUC for each evaluation time. The syntax is very similar to the Brier code shown above:

-
roc_scores <-
-  val_pred %>% 
-  roc_auc_survival(truth = event_time, .pred)
-roc_scores
-#> # A tibble: 85 × 4
-#>    .metric          .estimator .eval_time .estimate
-#>    <chr>            <chr>           <dbl>     <dbl>
-#>  1 roc_auc_survival standard         0        0.5  
-#>  2 roc_auc_survival standard         0.25     0.5  
-#>  3 roc_auc_survival standard         0.5      0.869
-#>  4 roc_auc_survival standard         0.75     0.852
-#>  5 roc_auc_survival standard         1        0.734
-#>  6 roc_auc_survival standard         1.25     0.768
-#>  7 roc_auc_survival standard         1.5      0.792
-#>  8 roc_auc_survival standard         1.75     0.777
-#>  9 roc_auc_survival standard         2        0.771
-#> 10 roc_auc_survival standard         2.25     0.778
-#> # ℹ 75 more rows
-

Over time:

-
roc_scores %>% 
-  ggplot(aes(.eval_time, .estimate)) + 
-  geom_hline(yintercept = 1 / 2, col = "red", lty = 3) +
-  geom_line() +
-  geom_point() + 
-  labs(x = "time", y = "ROC AUC")
-

-

The initial variation is due to so few events at the early stages of analysis.

-

The ROC measures the separation between classes and the Brier score focuses more on accurate and well-calibrated predictions. It should not be surprising that each metric’s results over time differ.

-
-
-

Tuning these metrics

-

Many of the event time models available in tidymodels have tuning parameters. The tune_*() functions and fit_resamples() have an eval_time argument used to pass the evaluation times. The statistics are computed for these time points using out-of-sample data.

-

In some cases, such as iterative search or racing methods, the functions need a single value to optimize. If a dynamic metric is used to guide the optimization, the first evaluation time given by the user will be used.

-

For example, if a model for these data was being optimized, and we wanted a time of 5.00 to guide the process, we would need to use that value of 5.00 as the first element time_points, the vector given to the eval_time argument in our example above.

-
-
-

Summary

-

tidymodels has two time-dependent metrics for characterizing the performance of event-time models:

-
    -
  • The Brier score measures the distance between the observed class result and the predicted probabilities.
  • -
  • ROC curves try to measure the separation between the two classes based on the survival probabilities.
  • -
-
-
-

Session information

-
#> ─ Session info ─────────────────────────────────────────────────────
-#>  setting  value
-#>  version  R version 4.2.0 (2022-04-22)
-#>  os       macOS 13.4
-#>  system   aarch64, darwin20
-#>  ui       X11
-#>  language (EN)
-#>  collate  en_US.UTF-8
-#>  ctype    en_US.UTF-8
-#>  tz       Europe/London
-#>  date     2023-06-07
-#>  pandoc   2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
-#> 
-#> ─ Packages ─────────────────────────────────────────────────────────
-#>  package    * version    date (UTC) lib source
-#>  broom      * 1.0.4      2023-03-11 [1] CRAN (R 4.2.0)
-#>  censored   * 0.2.0.9000 2023-05-22 [1] Github (tidymodels/censored@f9eccb6)
-#>  dials      * 1.2.0      2023-04-03 [1] CRAN (R 4.2.0)
-#>  dplyr      * 1.1.2      2023-04-20 [1] CRAN (R 4.2.0)
-#>  ggplot2    * 3.4.2      2023-04-03 [1] CRAN (R 4.2.0)
-#>  infer      * 1.0.4      2022-12-02 [1] CRAN (R 4.2.0)
-#>  parsnip    * 1.1.0.9002 2023-06-05 [1] local
-#>  prodlim    * 2023.03.31 2023-04-02 [1] CRAN (R 4.2.0)
-#>  purrr      * 1.0.1      2023-01-10 [1] CRAN (R 4.2.0)
-#>  recipes    * 1.0.6.9000 2023-05-17 [1] local
-#>  rlang        1.1.1      2023-04-28 [1] CRAN (R 4.2.0)
-#>  rsample    * 1.1.1.9000 2023-04-21 [1] local
-#>  tibble     * 3.2.1      2023-03-20 [1] CRAN (R 4.2.0)
-#>  tidymodels * 1.1.0      2023-05-01 [1] CRAN (R 4.2.0)
-#>  tune       * 1.1.1.9000 2023-04-20 [1] local
-#>  workflows  * 1.1.3.9000 2023-04-03 [1] local
-#>  yardstick  * 1.2.0      2023-04-21 [1] CRAN (R 4.2.0)
-#> 
-#>  [1] /Users/hannah/Library/R/arm64/4.2/library
-#>  [2] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
-#> 
-#> ────────────────────────────────────────────────────────────────────
-
-
-
-
    -
  1. Again, see the Accounting for Censoring in Performance Metrics for Event Time Data article for more on this.↩︎

  2. -
-
diff --git a/content/learn/statistics/survival-metrics/index.markdown b/content/learn/statistics/survival-metrics/index.markdown new file mode 100644 index 00000000..c4cfecea --- /dev/null +++ b/content/learn/statistics/survival-metrics/index.markdown @@ -0,0 +1,407 @@ +--- +title: "Dynamic Performance Metrics for Event Time Data" +tags: [survival,censored,parsnip,yardstick] +categories: [statistical analysis] +type: learn-subsection +weight: 9 +description: | + Let's discuss how to compute modern performance metrics for time-to-event models. +--- + + + + + +To use the code in this article, you will need to install the following packages: censored, prodlim, and tidymodels. You'll need the development versions of censored and parsnip. To install these, use + +```r +#install.packages("pak") + +pak::pak(c("tidymodels/censored", "tidymodels/parsnip")) +``` + +## Introduction + +One trend in modern survival analysis is to compute time-dependent measures of performance. These are primarily driven by an increased focus on predictions for the probability of survival at a given time (as opposed to the predictions of event times or linear predictors). Since these are conditional on the time of evaluation, we call them dynamic performance metrics. + +Many dynamic metrics are similar to those used for binary classification models. The basic idea is that, for a given time `\(t\)` for model evaluation, we try to encode the observed event time data into a binary "has there been an event at time `\(t\)`?" version. We can also convert the predicted survival probabilities into predicted events/non-events based on a threshold (default is 0.50). The survival versions of these metrics need those binary versions of observed truth and predictions as well as a way to account for censoring. + +Censoring plays into the details of the conversion and is additionally captured in the form of weights. For details on both these aspects, see the [Accounting for Censoring in Performance Metrics for Event Time Data](FIXME) article. + +To start, let's define the various types of times that will be mentioned: + +- Observed time: time recorded in the data +- Event time: observed times for actual events +- Evaluation time: the time, specified by the analyst, that the model is evaluated. + +## Example data + +As an example, we'll simulate some data with the prodlim package, using the methods of [Bender _et al_ (2005)](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Generating+survival+times+to+simulate+Cox+proportional+hazards+models.%22&btnG=). A training and a validation set are simulated. We'll also load the censored package so that we can fit a model to these time-to-event data: + + +```r +library(tidymodels) +library(censored) +#> Loading required package: survival +library(prodlim) + +set.seed(5882) +sim_dat <- SimSurv(2000) %>% + mutate(event_time = Surv(time, event)) %>% + select(event_time, X1, X2) + +set.seed(2) +split <- initial_split(sim_dat) +sim_tr <- training(split) +sim_val <- testing(split) + +## Resampling object +sim_rs <- vfold_cv(sim_tr) +``` + +We'll need a model to illustrate the code and concepts. 
Let's fit a bagged survival tree model to the training set:
+
+
+```r
+set.seed(17)
+bag_tree_fit <- 
+  bag_tree() %>% 
+  set_mode("censored regression") %>% 
+  set_engine("rpart") %>% 
+  fit(event_time ~ ., data = sim_tr)
+bag_tree_fit
+#> parsnip model object
+#> 
+#> 
+#> Bagging survival trees with 25 bootstrap replications 
+#> 
+#> Call: bagging.data.frame(formula = event_time ~ ., data = data)
+```
+
+Using this model, we'll make predictions of different types.
+
+## Survival Probability Prediction
+
+This censored regression model can make static predictions via the predicted event time using `predict(object, type = "time")`. It can also create dynamic predictions regarding the probability of survival for each data point at specific times. The syntax for this is
+
+```r
+predict(object, new_data, type = "survival", eval_time = numeric())
+```
+
+where `eval_time` is a vector of time points at which we want the corresponding survivor function estimates. Alternatively, we can use the `augment()` function to get both types of prediction and automatically attach them to the data.
+
+The largest event time in the training set is 21.083, so we will use a set of evaluation times between zero and 21.
+
+
+```r
+time_points <- seq(0, 21, by = 0.25)
+
+val_pred <- augment(bag_tree_fit, sim_val, eval_time = time_points)
+val_pred
+#> # A tibble: 500 × 5
+#>    .pred             .pred_time event_time    X1      X2
+#>    <list>                 <dbl>     <Surv> <dbl>   <dbl>
+#>  1 <tibble [85 × 5]>       6.66      4.83      1 -0.630 
+#>  2 <tibble [85 × 5]>       6.66      6.11      1 -0.606 
+#>  3 <tibble [85 × 5]>       7.47      6.60+     1 -1.03  
+#>  4 <tibble [85 × 5]>       3.29      2.72      1  0.811 
+#>  5 <tibble [85 × 5]>       5.10      4.73+     1 -0.376 
+#>  6 <tibble [85 × 5]>       4.99      8.70      0  1.18  
+#>  7 <tibble [85 × 5]>       7.23     10.82      1 -0.851 
+#>  8 <tibble [85 × 5]>       6.46      6.89      0  0.493 
+#>  9 <tibble [85 × 5]>       4.75      2.45+     1  0.0207
+#> 10 <tibble [85 × 5]>      13.4       8.23+     0 -1.52  
+#> # ℹ 490 more rows
+```
+
+The observed data are in the `event_time` column. The predicted survival probabilities are in the `.pred` column. This is a list column with a data frame for each observation, containing the predictions at the 85 evaluation time points in the (nested) column `.pred_survival`.
+
+
+```r
+val_pred$.pred[[1]]
+#> # A tibble: 85 × 5
+#>    .eval_time .pred_survival .weight_time .pred_censored .weight_censored
+#>         <dbl>          <dbl>        <dbl>          <dbl>            <dbl>
+#>  1       0             1            0              1                 1   
+#>  2       0.25          1            0.250          0.999             1.00
+#>  3       0.5           0.999        0.500          0.996             1.00
+#>  4       0.75          0.992        0.750          0.993             1.01
+#>  5       1             0.988        1.00           0.991             1.01
+#>  6       1.25          0.980        1.25           0.987             1.01
+#>  7       1.5           0.972        1.50           0.981             1.02
+#>  8       1.75          0.959        1.75           0.971             1.03
+#>  9       2             0.938        2.00           0.966             1.04
+#> 10       2.25          0.925        2.25           0.959             1.04
+#> # ℹ 75 more rows
+```
+
+The yardstick package currently has two dynamic metrics. Each is described below.
+
+## Brier Score
+
+The Brier score is a metric that can be used with both classification and event-time models. For classification models, it computes the squared error between the observed outcome (encoded as 0/1) and the corresponding predicted probability for the class.
+
+A little math: suppose that the value `\(y_{ik}\)` is a 0/1 indicator for whether the observed outcome `\(i\)` corresponds to class `\(k\)`, and `\(\hat{p}_{ik}\)` is the estimated class probability. The classification score is then:
+
+$$
+Brier_{class} = \frac{1}{N}\sum_{i=1}^N\sum_{k=1}^C (y_{ik} - \hat{p}_{ik})^2
+$$
+
+For survival models, we transform the event time data into a binary version `\(y_{it}\)`: is there an event at evaluation time `\(t\)`^[Again, see the [Accounting for Censoring in Performance Metrics for Event Time Data](FIXME) article for more on this.]. The survival function estimate `\(\hat{p}_{it}\)` is the probability that corresponds to non-events at time `\(t\)`.
For example, if there has not been an event at the current evaluation time, our best model should estimate the survival probability to be near one. For observations that are events, the probability estimate is just one minus the survivor estimate. To account for censoring, we also weight each observation with `\(w_{it}\)`. The [time-dependent Brier score](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Assessment+and+Comparison+of+Prognostic+Classification+Schemes+for+Survival+Data.%22&btnG=) is:
+
+$$
+Brier_{surv}(t) = \frac{1}{N}\sum_{i=1}^N w_{it}\left[\underbrace{I(y_{it} = 0)((1 - y_{it}) - \hat{p}_{it})^2}_\text{non-events} + \underbrace{I(y_{it} = 1)(y_{it} - (1 - \hat{p}_{it}))^2}_\text{events}\right]
+$$
+
+For this score, a perfect model has a score of zero, while an uninformative model would have a score of around 1/4.
+
+How do we compute this using the yardstick package? The function [`brier_survival()`](https://yardstick.tidymodels.org/reference/brier_survival.html) follows the same convention as the other metric functions. The main arguments are:
+
+- `data`: the data frame with the predictions (structured as the output produced by `augment()`, as shown above).
+- `truth`: the name of the column with the `Surv` object.
+- `...`: the name of the column with the dynamic predictions. Within tidymodels, this column is always called `.pred`. In other words, `.pred` should be passed without an argument name.
+
+Since the evaluation times and the case weights are within the `.pred` column, there is no need to specify these. Here are the results of our validation set:
+
+
+```r
+brier_scores <-
+  val_pred %>% 
+  brier_survival(truth = event_time, .pred)
+brier_scores
+#> # A tibble: 85 × 4
+#>    .metric        .estimator .eval_time .estimate
+#>    <chr>          <chr>           <dbl>     <dbl>
+#>  1 brier_survival standard         0      0      
+#>  2 brier_survival standard         0.25   0      
+#>  3 brier_survival standard         0.5    0.00202
+#>  4 brier_survival standard         0.75   0.00796
+#>  5 brier_survival standard         1      0.0266 
+#>  6 brier_survival standard         1.25   0.0402 
+#>  7 brier_survival standard         1.5    0.0563 
+#>  8 brier_survival standard         1.75   0.0785 
+#>  9 brier_survival standard         2      0.0895 
+#> 10 brier_survival standard         2.25   0.0951 
+#> # ℹ 75 more rows
+```
+
+Over time:
+
+
+```r
+brier_scores %>% 
+  ggplot(aes(.eval_time, .estimate)) + 
+  geom_hline(yintercept = 1 / 4, col = "red", lty = 3) +
+  geom_line() +
+  geom_point() + 
+  labs(x = "time", y = "Brier score")
+```
+
+
+
+There is also an _integrated_ Brier score. This requires evaluation times as inputs, but instead of returning each result, it computes the area under the plot shown above. The syntax is the same but the result has a single row:
+
+
+```r
+val_pred %>% brier_survival_integrated(truth = event_time, .pred)
+#> # A tibble: 1 × 3
+#>   .metric                   .estimator .estimate
+#>   <chr>                     <chr>          <dbl>
+#> 1 brier_survival_integrated standard       0.113
+```
+
+Again, smaller values are better.
+
+## Receiver Operating Characteristic (ROC) Curves
+
+When we turn not only the event time data but also the predicted probabilities into a binary representation, we are in well-charted classification metrics territory. Sensitivity and specificity are common quantities to compute; we do so here in their weighted versions to account for censoring:
+
+- Sensitivity: How well do we predict the events? This is analogous to the true positive rate.
+- Specificity: How well do we predict the non-events? One minus specificity is the false positive rate.
+
+Let's take a look at our example data and use an evaluation time of 5.00!
+
+
+```r
+time_as_binary_event <- function(surv, eval_time) {
+  event_time <- parsnip:::.extract_surv_time(surv) # TODO we should export these
+  status <- parsnip:::.extract_surv_status(surv)
+  is_event_before_t <- event_time <= eval_time & status == 1
+  
+  # Three possible contributions to the statistic from Graf 1999
+  # Censoring time before eval_time, no contribution (Graf category 3)
+  binary_res <- rep(NA_character_, length(event_time))
+  
+  # A real event prior to eval_time (Graf category 1)
+  binary_res <- if_else(is_event_before_t, "event", binary_res)
+  
+  # Observed time greater than eval_time (Graf category 2)
+  binary_res <- if_else(event_time > eval_time, "non-event", binary_res)
+  factor(binary_res, levels = c("event", "non-event"))
+}
+
+# Unnest the list columns and convert the event time data to binary format
+binary_encoding <-
+  val_pred %>%
+  select(.pred, event_time) %>%
+  add_rowindex() %>%
+  unnest(.pred) %>%
+  mutate(
+    obs_class = time_as_binary_event(event_time, .eval_time),
+    pred_class = if_else(.pred_survival >= 1 / 2, "non-event", "event"),
+    pred_class = factor(pred_class, levels = c("event", "non-event"))
+  )
+
+data_at_5 <-
+  binary_encoding %>%
+  filter(.eval_time == 5.00 & !is.na(.weight_censored)) %>%
+  select(.eval_time, .pred_survival, .weight_censored, obs_class, pred_class, event_time)
+data_at_5
+#> # A tibble: 391 × 6
+#>    .eval_time .pred_survival .weight_censored obs_class pred_class event_time
+#>         <dbl>          <dbl>            <dbl> <fct>     <fct>          <Surv>
+#>  1          5          0.662             1.27 event     non-event       4.83 
+#>  2          5          0.662             1.29 non-event non-event       6.11 
+#>  3          5          0.731             1.29 non-event non-event       6.60+
+#>  4          5          0.213             1.07 event     event           2.72 
+#>  5          5          0.499             1.29 non-event event           8.70 
+#>  6          5          0.711             1.29 non-event non-event      10.82 
+#>  7          5          0.668             1.29 non-event non-event       6.89 
+#>  8          5          0.901             1.29 non-event non-event       8.23+
+#>  9          5          0.469             1.20 event     event           4.20 
+#> 10          5          0.886             1.29 non-event non-event       7.27+
+#> # ℹ 381 more rows
+```
+
+What does the distribution of the survival probabilities look like for each of the observed classes?
+
+
+```r
+data_at_5 %>%
+  ggplot(aes(x = .pred_survival, weight = .weight_censored)) +
+  geom_vline(xintercept = 1 / 2, col = "blue", lty = 2) +
+  geom_histogram(col = "white", bins = 30) + 
+  facet_wrap(~obs_class, ncol = 1) +
+  lims(x = 0:1) +
+  labs(x = "probability of survival", y = "sum of weights")
+```
+
+<img src="figs/surv-hist-05-1.svg" width="70%" style="display: block; margin: auto;" />
+
+More probability values are to the right of the 50% cutoff for the true non-events. Conversely, the true events tend to have smaller probabilities. Using this cutoff, the sensitivity would be 66.8% and the specificity would be 82.3%. There are other possible cutoffs for the survival probabilities; maybe one of these would have better statistics.
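+
+To make that arithmetic concrete, here is a minimal sketch of how the weighted sensitivity and specificity at the 50% cutoff could be computed directly from the columns of `data_at_5`. This is only an illustration of the definitions above, not how yardstick computes its ROC metrics internally.
+
+
+```r
+# Weighted sensitivity and specificity at the 50% cutoff: the censoring
+# weights take the place of the plain counts used in the unweighted versions.
+data_at_5 %>%
+  summarize(
+    sensitivity =
+      sum(.weight_censored[pred_class == "event" & obs_class == "event"]) /
+      sum(.weight_censored[obs_class == "event"]),
+    specificity =
+      sum(.weight_censored[pred_class == "non-event" & obs_class == "non-event"]) /
+      sum(.weight_censored[obs_class == "non-event"])
+  )
+```
+
+These ratios should reproduce (up to rounding) the percentages quoted above.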
+
+ROC curves were designed as a general method that, given a collection of continuous predictions, determines an effective threshold such that values above the threshold indicate a specific event. For our purposes, the ROC curve computes the sensitivity and specificity for _all possible_ probability thresholds. It then plots the true positive rate versus the false positive rate. Generally, we use the area under the ROC curve to summarize it with a single number. Values near one indicate a perfect model, while values near 1/2 occur with non-informative models.
+
+[Blanche _et al._ (2013)](https://scholar.google.com/scholar?hl=en&as_sdt=0%2C7&q=%22Review+and+comparison+of+ROC+curve+estimators+for+a+time-dependent+outcome+with+marker-dependent+censoring%22&btnG=) give a good overview of ROC curves for survival analysis; their Section 4.3 is the most relevant here.
+
+For our example at evaluation time `\(t = 5.00\)`, the ROC curve is:
+
+<img src="figs/roc-5-1.svg" width="672" />
+
+The area under this curve is 0.807.
+
+Since this is a dynamic metric, we compute the AUC for each evaluation time. The syntax is very similar to the Brier code shown above:
+
+
+```r
+roc_scores <-
+  val_pred %>% 
+  roc_auc_survival(truth = event_time, .pred)
+roc_scores
+#> # A tibble: 85 × 4
+#>    .metric          .estimator .eval_time .estimate
+#>    <chr>            <chr>           <dbl>     <dbl>
+#>  1 roc_auc_survival standard         0        0.5  
+#>  2 roc_auc_survival standard         0.25     0.5  
+#>  3 roc_auc_survival standard         0.5      0.869
+#>  4 roc_auc_survival standard         0.75     0.852
+#>  5 roc_auc_survival standard         1        0.734
+#>  6 roc_auc_survival standard         1.25     0.768
+#>  7 roc_auc_survival standard         1.5      0.792
+#>  8 roc_auc_survival standard         1.75     0.777
+#>  9 roc_auc_survival standard         2        0.771
+#> 10 roc_auc_survival standard         2.25     0.778
+#> # ℹ 75 more rows
+```
+
+Over time:
+
+
+```r
+roc_scores %>% 
+  ggplot(aes(.eval_time, .estimate)) + 
+  geom_hline(yintercept = 1 / 2, col = "red", lty = 3) +
+  geom_line() +
+  geom_point() + 
+  labs(x = "time", y = "ROC AUC")
+```
+
+<img src="figs/roc-scores-1.svg" width="672" />
+
+The initial variation is due to the small number of events in the early stages of the analysis.
+
+The ROC AUC measures the separation between classes, while the Brier score focuses more on accurate and well-calibrated predictions. It should not be surprising that the two metrics' results differ over time.
+
+## Tuning these metrics
+
+Many of the event time models available in tidymodels have tuning parameters. The `tune_*()` functions and `fit_resamples()` have an `eval_time` argument used to pass the evaluation times. The statistics are computed for these time points using out-of-sample data.
+
+In some cases, such as [iterative search](https://www.tmwr.org/iterative-search.html) or [racing methods](https://www.tmwr.org/grid-search.html#racing), the functions need a single value to optimize. If a dynamic metric is used to guide the optimization, _the first evaluation time given by the user_ will be used. For example, if a model for these data were being optimized and we wanted a time of 5.00 to guide the process, we would need to use that value of 5.00 as the first element of `time_points`, the vector given to the `eval_time` argument in our example above; a small sketch of this setup follows the summary below.
+
+## Summary
+
+tidymodels has two time-dependent metrics for characterizing the performance of event-time models:
+
+* The Brier score measures the distance between the observed class result and the predicted probabilities.
+* ROC curves try to measure the separation between the two classes based on the survival probabilities.
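+
+As promised, here is a minimal sketch of the tuning setup described above. The workflow `tree_wflow` and the resamples `sim_rs` are hypothetical placeholders (they are not created in this article); the `eval_time` argument is the one described in the tuning section.
+
+
+```r
+# A sketch, under stated assumptions: `tree_wflow` and `sim_rs` are
+# hypothetical stand-ins for a tunable workflow and a resampling object.
+surv_metrics <- metric_set(brier_survival, roc_auc_survival)
+
+set.seed(1)
+tuning_res <-
+  tune_grid(
+    tree_wflow,
+    resamples = sim_rs,
+    grid = 10,
+    metrics = surv_metrics,
+    # putting 5.00 first means it guides any single-number optimization
+    eval_time = unique(c(5.00, time_points))
+  )
+```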
+
+
+## Session information
+
+
+```
+#> ─ Session info ─────────────────────────────────────────────────────
+#>  setting  value
+#>  version  R version 4.2.0 (2022-04-22)
+#>  os       macOS 13.4
+#>  system   aarch64, darwin20
+#>  ui       X11
+#>  language (EN)
+#>  collate  en_US.UTF-8
+#>  ctype    en_US.UTF-8
+#>  tz       Europe/London
+#>  date     2023-06-07
+#>  pandoc   2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
+#> 
+#> ─ Packages ─────────────────────────────────────────────────────────
+#>  package    * version    date (UTC) lib source
+#>  broom      * 1.0.4      2023-03-11 [1] CRAN (R 4.2.0)
+#>  censored   * 0.2.0.9000 2023-05-22 [1] Github (tidymodels/censored@f9eccb6)
+#>  dials      * 1.2.0      2023-04-03 [1] CRAN (R 4.2.0)
+#>  dplyr      * 1.1.2      2023-04-20 [1] CRAN (R 4.2.0)
+#>  ggplot2    * 3.4.2      2023-04-03 [1] CRAN (R 4.2.0)
+#>  infer      * 1.0.4      2022-12-02 [1] CRAN (R 4.2.0)
+#>  parsnip    * 1.1.0.9002 2023-06-05 [1] local
+#>  prodlim    * 2023.03.31 2023-04-02 [1] CRAN (R 4.2.0)
+#>  purrr      * 1.0.1      2023-01-10 [1] CRAN (R 4.2.0)
+#>  recipes    * 1.0.6.9000 2023-05-17 [1] local
+#>  rlang        1.1.1      2023-04-28 [1] CRAN (R 4.2.0)
+#>  rsample    * 1.1.1.9000 2023-04-21 [1] local
+#>  tibble     * 3.2.1      2023-03-20 [1] CRAN (R 4.2.0)
+#>  tidymodels * 1.1.0      2023-05-01 [1] CRAN (R 4.2.0)
+#>  tune       * 1.1.1.9000 2023-04-20 [1] local
+#>  workflows  * 1.1.3.9000 2023-04-03 [1] local
+#>  yardstick  * 1.2.0      2023-04-21 [1] CRAN (R 4.2.0)
+#> 
+#>  [1] /Users/hannah/Library/R/arm64/4.2/library
+#>  [2] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
+#> 
+#> ────────────────────────────────────────────────────────────────────
+```

From d876a01220a7a122c552841447b41f4027623c46 Mon Sep 17 00:00:00 2001
From: Hannah Frick
Date: Wed, 7 Jun 2023 15:02:09 +0100
Subject: [PATCH 5/6] focus code shown on the metric rather than the immediate
 steps - those are laid out in the details article

---
 .../survival-metrics/index.Rmarkdown |  8 +--
 .../survival-metrics/index.markdown  | 65 +------------------
 2 files changed, 5 insertions(+), 68 deletions(-)

diff --git a/content/learn/statistics/survival-metrics/index.Rmarkdown b/content/learn/statistics/survival-metrics/index.Rmarkdown
index bd7cff65..49ec86c5 100644
--- a/content/learn/statistics/survival-metrics/index.Rmarkdown
+++ b/content/learn/statistics/survival-metrics/index.Rmarkdown
@@ -178,10 +178,11 @@ When we not only turn the event time data into a binary representation but also
 - Sensitivity: How well do we predict the events? This is analogous to the true positive rate.
 - Specificity: How well do we predict the non-events? One minus specificity is the false positive rate.
 
-Let's take a look at our example data and use an evaluation time of 5.00!
+These depend on the threshold used to turn predicted probabilities into predicted events/non-events. Let's take a look at the distribution of the survival probabilities for our example data at an evaluation time of 5.00. The distributions are separated by the observed class and weighted by the censoring weights. Details of both aspects are the same as for the Brier score and can be found in the [Accounting for Censoring in Performance Metrics for Event Time Data](FIXME) article.
 ```{r}
 #| label: data-at-5
+#| include: false
 
 time_as_binary_event <- function(surv, eval_time) {
   event_time <- parsnip:::.extract_surv_time(surv) # TODO we should export these
   status <- parsnip:::.extract_surv_status(surv)
   is_event_before_t <- event_time <= eval_time & status == 1
 
   # Three possible contributions to the statistic from Graf 1999
   # Censoring time before eval_time, no contribution (Graf category 3)
   binary_res <- rep(NA_character_, length(event_time))
 
   # A real event prior to eval_time (Graf category 1)
   binary_res <- if_else(is_event_before_t, "event", binary_res)
 
   # Observed time greater than eval_time (Graf category 2)
   binary_res <- if_else(event_time > eval_time, "non-event", binary_res)
   factor(binary_res, levels = c("event", "non-event"))
 }
 
 # Unnest the list columns and convert the event time data to binary format
 binary_encoding <-
   val_pred %>%
   select(.pred, event_time) %>%
   add_rowindex() %>%
   unnest(.pred) %>%
   mutate(
     obs_class = time_as_binary_event(event_time, .eval_time),
     pred_class = if_else(.pred_survival >= 1 / 2, "non-event", "event"),
     pred_class = factor(pred_class, levels = c("event", "non-event"))
   )
@@ -216,13 +217,12 @@ data_at_5 <-
   binary_encoding %>%
   filter(.eval_time == 5.00 & !is.na(.weight_censored)) %>%
   select(.eval_time, .pred_survival, .weight_censored, obs_class, pred_class, event_time)
-data_at_5
-```
-
-What does the distribution of the survival probabilities for each of the actual classes look like?
+```
 
 ```{r}
 #| label: surv-hist-05
+#| echo: false
 #| warning: false
 #| out-width: 70%
 #| fig-width: 7

diff --git a/content/learn/statistics/survival-metrics/index.markdown b/content/learn/statistics/survival-metrics/index.markdown
index c4cfecea..7a6be48a 100644
--- a/content/learn/statistics/survival-metrics/index.markdown
+++ b/content/learn/statistics/survival-metrics/index.markdown
@@ -219,72 +219,9 @@ When we not only turn the event time data into a binary representation but also
 - Sensitivity: How well do we predict the events? This is analogous to the true positive rate.
 - Specificity: How well do we predict the non-events? One minus specificity is the false positive rate.
 
-Let's take a look at our example data and use an evaluation time of 5.00!
+These depend on the threshold used to turn predicted probabilities into predicted events/non-events. Let's take a look at the distribution of the survival probabilities for our example data at an evaluation time of 5.00. The distributions are separated by the observed class and weighted by the censoring weights. Details of both aspects are the same as for the Brier score and can be found in the [Accounting for Censoring in Performance Metrics for Event Time Data](FIXME) article.
 
-```r
-time_as_binary_event <- function(surv, eval_time) {
-  event_time <- parsnip:::.extract_surv_time(surv) # TODO we should export these
-  status <- parsnip:::.extract_surv_status(surv)
-  is_event_before_t <- event_time <= eval_time & status == 1
-  
-  # Three possible contributions to the statistic from Graf 1999
-  # Censoring time before eval_time, no contribution (Graf category 3)
-  binary_res <- rep(NA_character_, length(event_time))
-  
-  # A real event prior to eval_time (Graf category 1)
-  binary_res <- if_else(is_event_before_t, "event", binary_res)
-  
-  # Observed time greater than eval_time (Graf category 2)
-  binary_res <- if_else(event_time > eval_time, "non-event", binary_res)
-  factor(binary_res, levels = c("event", "non-event"))
-}
-
-# Unnest the list columns and convert the event time data to binary format
-binary_encoding <-
-  val_pred %>%
-  select(.pred, event_time) %>%
-  add_rowindex() %>%
-  unnest(.pred) %>%
-  mutate(
-    obs_class = time_as_binary_event(event_time, .eval_time),
-    pred_class = if_else(.pred_survival >= 1 / 2, "non-event", "event"),
-    pred_class = factor(pred_class, levels = c("event", "non-event"))
-  )
-
-data_at_5 <-
-  binary_encoding %>%
-  filter(.eval_time == 5.00 & !is.na(.weight_censored)) %>%
-  select(.eval_time, .pred_survival, .weight_censored, obs_class, pred_class, event_time)
-data_at_5
-#> # A tibble: 391 × 6
-#>    .eval_time .pred_survival .weight_censored obs_class pred_class event_time
-#>         <dbl>          <dbl>            <dbl> <fct>     <fct>          <Surv>
-#>  1          5          0.662             1.27 event     non-event       4.83 
-#>  2          5          0.662             1.29 non-event non-event       6.11 
-#>  3          5          0.731             1.29 non-event non-event       6.60+
-#>  4          5          0.213             1.07 event     event           2.72 
-#>  5          5          0.499             1.29 non-event event           8.70 
-#>  6          5          0.711             1.29 non-event non-event      10.82 
-#>  7          5          0.668             1.29 non-event non-event       6.89 
-#>  8          5          0.901             1.29 non-event non-event       8.23+
-#>  9          5          0.469             1.20 event     event           4.20 
-#> 10          5          0.886             1.29 non-event non-event       7.27+
-#> # ℹ 381 more rows
-```
-
-What does the distribution of the survival probabilities for each of the actual classes look like?
-
-
-```r
-data_at_5 %>%
-  ggplot(aes(x = .pred_survival, weight = .weight_censored)) +
-  geom_vline(xintercept = 1 / 2, col = "blue", lty = 2) +
-  geom_histogram(col = "white", bins = 30) + 
-  facet_wrap(~obs_class, ncol = 1) +
-  lims(x = 0:1) +
-  labs(x = "probability of survival", y = "sum of weights")
-```

From dfa85b1ecf137dd449b664c5cdc8ccce7405b297 Mon Sep 17 00:00:00 2001
From: Hannah Frick
Date: Wed, 7 Jun 2023 16:23:42 +0100
Subject: [PATCH 6/6] remove unused plots

---
 .../survival-metrics/figs/RKM-1.svg         | 825 ------------------
 .../survival-metrics/figs/surv-plot-1.svg   |  84 --
 .../survival-metrics/figs/usable-data-1.svg |  79 --
 3 files changed, 988 deletions(-)
 delete mode 100644 content/learn/statistics/survival-metrics/figs/RKM-1.svg
 delete mode 100644 content/learn/statistics/survival-metrics/figs/surv-plot-1.svg
 delete mode 100644 content/learn/statistics/survival-metrics/figs/usable-data-1.svg

diff --git a/content/learn/statistics/survival-metrics/figs/RKM-1.svg b/content/learn/statistics/survival-metrics/figs/RKM-1.svg
deleted file mode 100644
index 3f4b484b..00000000
--- a/content/learn/statistics/survival-metrics/figs/RKM-1.svg
+++ /dev/null
@@ -1,825 +0,0 @@
[deleted SVG markup not shown: plot of "probability of being censored" (0.00 to 1.00) versus "time" (0 to 15)]

diff --git a/content/learn/statistics/survival-metrics/figs/surv-plot-1.svg b/content/learn/statistics/survival-metrics/figs/surv-plot-1.svg
deleted file mode 100644
index a083b00b..00000000
--- a/content/learn/statistics/survival-metrics/figs/surv-plot-1.svg
+++ /dev/null
@@ -1,84 +0,0 @@
[deleted SVG markup not shown: plot of "survival probability" (0.00 to 1.00) versus "time" (0 to 15)]
diff --git a/content/learn/statistics/survival-metrics/figs/usable-data-1.svg b/content/learn/statistics/survival-metrics/figs/usable-data-1.svg
deleted file mode 100644
index 32570a00..00000000
--- a/content/learn/statistics/survival-metrics/figs/usable-data-1.svg
+++ /dev/null
@@ -1,79 +0,0 @@
[deleted SVG markup not shown: plot of "Number of Usable Samples" (0 to 250) versus "time" (0 to 15)]