From d176ee57fe87989c0394b1fd069b37b03b9120d8 Mon Sep 17 00:00:00 2001 From: Nicholas Clark Date: Fri, 22 Nov 2024 16:03:44 +1000 Subject: [PATCH] add matt's remaining ggplot additions --- R/globals.R | 4 +- R/lfo_cv.mvgam.R | 116 ++--- R/plot.mvgam.R | 3 +- R/plot_mvgam_resids.R | 609 ++++++++--------------- R/plot_mvgam_series.R | 5 +- man/plot.mvgam_lfo.Rd | 2 +- man/plot_mvgam_resids.Rd | 29 +- tests/testthat/test-example_processing.R | 10 +- tests/testthat/test-mvgam-methods.R | 11 +- 9 files changed, 316 insertions(+), 473 deletions(-) diff --git a/R/globals.R b/R/globals.R index b95a90da..b531b799 100644 --- a/R/globals.R +++ b/R/globals.R @@ -22,4 +22,6 @@ utils::globalVariables(c("y", "year", "smooth_vals", "smooth_num", "total_evd", "smooth_label", "by_variable", "gr", "tot_subgrs", "subgr", "lambda", "level", "sim_hilbert_gp", "trend_model", - "jags_path", "x")) + "jags_path", "x", "elpds", "pareto_ks", + "value", "threshold", "colour", "resids", + "c_dark", "eval_timepoints")) diff --git a/R/lfo_cv.mvgam.R b/R/lfo_cv.mvgam.R index 4a9655ed..67a3672f 100644 --- a/R/lfo_cv.mvgam.R +++ b/R/lfo_cv.mvgam.R @@ -284,7 +284,7 @@ lfo_cv.mvgam = function(object, #' @importFrom graphics layout axis lines abline polygon points #' @param x An object of class `mvgam_lfo` #' @param ... Ignored -#' @return A base `R` plot of Pareto-k and ELPD values over the +#' @return A ggplot object of Pareto-k and ELPD values over the #' evaluation timepoints. For the Pareto-k plot, a dashed red line indicates the #' specified threshold chosen for triggering model refits. For the ELPD plot, #' a dashed red line indicates the bottom 10% quantile of ELPD values. Points below @@ -294,73 +294,59 @@ plot.mvgam_lfo = function(x, ...){ object <- x - # Graphical parameters - .pardefault <- par(no.readonly=T) - on.exit(par(.pardefault)) - par(mfrow = c(2, 1)) - # Plot Pareto-k values over time object$pareto_ks[which(is.infinite(object$pareto_ks))] <- max(object$pareto_ks[which(!is.infinite(object$pareto_ks))]) - plot(1, type = "n", bty = 'L', - xlab = '', - ylab = 'Pareto k', - xaxt = 'n', - xlim = range(object$eval_timepoints), - ylim = c(min(object$pareto_ks) - 0.1, - max(object$pareto_ks) + 0.1)) - axis(side = 1, labels = NA, lwd = 2) - - lines(x = object$eval_timepoints, - y = object$pareto_ks, - lwd = 2.5) - - abline(h = object$pareto_k_threshold, col = 'white', lwd = 2.85) - abline(h = object$pareto_k_threshold, col = "#A25050", lwd = 2.5, lty = 'dashed') - - points(x = object$eval_timepoints, - y = object$pareto_ks, pch = 16, col = "white", cex = 1.25) - points(x = object$eval_timepoints, - y = object$pareto_ks, pch = 16, col = "black", cex = 1) - - points(x = object$eval_timepoints[which(object$pareto_ks > object$pareto_k_threshold)], - y = object$pareto_ks[which(object$pareto_ks > object$pareto_k_threshold)], - pch = 16, col = "white", cex = 1.5) - points(x = object$eval_timepoints[which(object$pareto_ks > object$pareto_k_threshold)], - y = object$pareto_ks[which(object$pareto_ks > object$pareto_k_threshold)], - pch = 16, col = "#7C0000", cex = 1.25) - - box(bty = 'l', lwd = 2) - - # Plot ELPD values over time - plot(1, type = "n", bty = 'L', - xlab = 'Time point', - ylab = 'ELPD', - xlim = range(object$eval_timepoints), - ylim = c(min(object$elpds) - 0.1, - max(object$elpds) + 0.1)) - - lines(x = object$eval_timepoints, - y = object$elpds, - lwd = 2.5) - - lower_vals <- quantile(object$elpds, probs = c(0.15)) - abline(h = lower_vals, col = 'white', lwd = 2.85) - abline(h = lower_vals, col = "#A25050", lwd = 2.5, lty = 'dashed') - - points(x = object$eval_timepoints, - y = object$elpds, pch = 16, col = "white", cex = 1.25) - points(x = object$eval_timepoints, - y = object$elpds, pch = 16, col = "black", cex = 1) - - points(x = object$eval_timepoints[which(object$elpds < lower_vals)], - y = object$elpds[which(object$elpds < lower_vals)], - pch = 16, col = "white", cex = 1.5) - points(x = object$eval_timepoints[which(object$elpds < lower_vals)], - y = object$elpds[which(object$elpds < lower_vals)], - pch = 16, col = "#7C0000", cex = 1.25) - - box(bty = 'l', lwd = 2) + + dplyr::tibble(eval_timepoints = object$eval_timepoints, + elpds = object$elpds, + pareto_ks = object$pareto_ks) -> obj_tribble + + # Hack so we don't have to import tidyr just to use pivot_longer once + dplyr::bind_rows(obj_tribble %>% + dplyr::select(eval_timepoints, elpds) %>% + dplyr::mutate(name = 'elpds', value = elpds) %>% + dplyr::select(-elpds), + obj_tribble %>% + dplyr::select(eval_timepoints, pareto_ks) %>% + dplyr::mutate(name = 'pareto_ks', value = pareto_ks) %>% + dplyr::select(-pareto_ks)) %>% + dplyr::left_join( + dplyr::tribble(~name, ~threshold, + "elpds", quantile(object$elpds, probs = 0.15), + "pareto_ks", object$pareto_k_threshold), + by = "name" + ) %>% + dplyr::rowwise() %>% + dplyr::mutate(colour = dplyr::case_when( + name == 'elpds' & value < threshold ~ "outlier", + name == 'pareto_ks' & value > threshold ~ "outlier", + TRUE ~ "inlier" + )) %>% + dplyr::ungroup() %>% + ggplot2::ggplot(ggplot2::aes(eval_timepoints, value)) + + ggplot2::facet_wrap(~ factor(name, + levels = c("pareto_ks", "elpds"), + labels = c("Pareto K", "ELPD")), + ncol = 1, + scales = "free_y") + + ggplot2::geom_hline(ggplot2::aes(yintercept = threshold), + colour = "#A25050", + linetype = "dashed", + linewidth = 1) + + ggplot2::geom_line(linewidth = 0.5, + col = "grey30") + + ggplot2::geom_point(shape = 16, + colour = 'white', + size = 2) + + ggplot2::geom_point(ggplot2::aes(colour = colour), + shape = 16, + show.legend = F, + size = 1.5) + + ggplot2::scale_colour_manual(values = c("grey30", "#8F2727")) + + ggplot2::labs(x = "Evaluation time", + y = NULL) + + ggplot2::theme_bw() } #' Function to generate training and testing splits diff --git a/R/plot.mvgam.R b/R/plot.mvgam.R index 310be2a1..44c32d5c 100644 --- a/R/plot.mvgam.R +++ b/R/plot.mvgam.R @@ -126,7 +126,8 @@ plot.mvgam = function(x, type = 'residuals', } if(type == 'residuals'){ - plot_mvgam_resids(object, series = series, data_test = data_test, ...) + suppressWarnings(plot(plot_mvgam_resids(object, series = series, + newdata = data_test, ...))) } if(type == 'factors'){ diff --git a/R/plot_mvgam_resids.R b/R/plot_mvgam_resids.R index b59b4c55..0b92b3eb 100644 --- a/R/plot_mvgam_resids.R +++ b/R/plot_mvgam_resids.R @@ -16,17 +16,30 @@ #'the \code{newdata} supplied here comes sequentially after the data supplied as \code{data} in #'the original model (i.e. we assume there is no time gap between the last #'observation of series 1 in \code{data_train} and the first observation for series 1 in \code{newdata}). -#'@param data_test Deprecated. Still works in place of \code{newdata} but users are recommended to use -#'\code{newdata} instead for more seamless integration into `R` workflows #'@author Nicholas J Clark -#'@details A total of four base \code{R} plots are generated to examine Dunn-Smyth residuals for -#'the specified series. Plots include a residuals vs fitted values plot, +#'@details A total of four ggplot plots are generated to examine posterior median +#'Dunn-Smyth residuals for the specified series. Plots include a residuals vs fitted values plot, #'a Q-Q plot, and two plots to check for any remaining temporal autocorrelation in the residuals. -#'Note, all plots use posterior medians of fitted values / residuals, so uncertainty is not represented. -#'@return A series of base \code{R} plots +#'Note, all plots use only posterior medians of fitted values / residuals, so uncertainty is not represented. +#'@return A series of facetted ggplot object +#'@author Nicholas J Clark and Matthijs Hollanders +#' @examples +#' \donttest{ +#' simdat <- sim_mvgam(n_series = 3, trend_model = AR()) +#' mod <- mvgam(y ~ s(season, bs = 'cc', k = 6), +#' trend_model = AR(), +#' noncentred = TRUE, +#' data = simdat$data_train, +#' chains = 2) +#' +#' # Plot Dunn Smyth residuals for some series +#' plot_mvgam_resids(mod) +#' plot_mvgam_resids(mod, series = 2) +#' } #'@export -plot_mvgam_resids = function(object, series = 1, - newdata, data_test){ +plot_mvgam_resids = function(object, + series = 1, + newdata){ # Check arguments if (!(inherits(object, "mvgam"))) { @@ -43,407 +56,225 @@ plot_mvgam_resids = function(object, series = 1, if(!missing("newdata")){ data_test <- validate_series_time(newdata, - trend_model = object$trend_model) + trend_model = attr(object$model_data, + 'trend_model')) + } else { + data_test <- rlang::missing_arg() } -# Plotting colours -probs <- c(0.05, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.95) -c_light <- c("#DCBCBC") -c_light_highlight <- c("#C79999") -c_mid <- c("#B97C7C") -c_mid_highlight <- c("#A25050") -c_dark <- c("#8F2727") -c_dark_highlight <- c("#7C0000") - -# Prediction indices for the particular series -data_train <- validate_series_time(object$obs_data, - trend_model = object$trend_model) -ends <- seq(0, dim(mcmc_chains(object$model_output, 'ypred'))[2], - length.out = NCOL(object$ytimes) + 1) -starts <- ends + 1 -starts <- c(1, starts[-c(1, (NCOL(object$ytimes)+1))]) -ends <- ends[-1] - -# Pull out series' residuals -series_residuals <- object$resids[[series]] - -# Get indices of training horizon -if(class(data_train)[1] == 'list'){ - data_train_df <- data.frame(time = data_train$index..time..index, - y = data_train$y, - series = data_train$series) - obs_length <- length(data_train_df %>% - dplyr::filter(series == !!(levels(data_train_df$series)[series])) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y)) -} else { - obs_length <- length(data_train %>% - dplyr::filter(series == !!(levels(data_train$series)[series])) %>% - dplyr::select(index..time..index, y) %>% - dplyr::distinct() %>% - dplyr::arrange(index..time..index) %>% - dplyr::pull(y)) -} - -if(missing(data_test)){ - -} else { - - if(object$fit_engine == 'stan'){ - linkfun <- family_invlinks(object$family) - preds <- linkfun(mcmc_chains(object$model_output, 'mus')[,seq(series, - dim(mcmc_chains(object$model_output, 'mus'))[2], - by = NCOL(object$ytimes))]) + # Plotting colours + c_dark <- c("#8F2727") + + # Prediction indices for the particular series + data_train <- validate_series_time(object$obs_data, + trend_model = attr(object$model_data, + 'trend_model')) + ends <- seq(0, dim(mcmc_chains(object$model_output, 'ypred'))[2], + length.out = NCOL(object$ytimes) + 1) + starts <- ends + 1 + starts <- c(1, starts[-c(1, (NCOL(object$ytimes)+1))]) + ends <- ends[-1] + + # Pull out series' residuals + series_residuals <- object$resids[[series]] + + # Get indices of training horizon + if(inherits(data_train, 'list')){ + data_train_df <- data.frame(time = data_train$index..time..index, + y = data_train$y, + series = data_train$series) + obs_length <- length(data_train_df %>% + dplyr::filter(series == !!(levels(data_train_df$series)[series])) %>% + dplyr::select(time, y) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::pull(y)) } else { - preds <- mcmc_chains(object$model_output, 'mus')[,starts[series]:ends[series]] + obs_length <- length(data_train %>% + dplyr::filter(series == !!(levels(data_train$series)[series])) %>% + dplyr::select(index..time..index, y) %>% + dplyr::distinct() %>% + dplyr::arrange(index..time..index) %>% + dplyr::pull(y)) } - # Add variables to data_test if missing - s_name <- levels(data_train$series)[series] - if(!missing(data_test)){ - - if(!'y' %in% names(data_test)){ - data_test$y <- rep(NA, NROW(data_test)) - } - - if(class(data_test)[1] == 'list'){ - if(!'time' %in% names(data_test)){ - stop('data_train does not contain a "time" column') - } + if(missing(data_test)){ - if(!'series' %in% names(data_test)){ - data_test$series <- factor('series1') - } - - } else { - if(!'time' %in% colnames(data_test)){ - stop('data_train does not contain a "time" column') - } - - if(!'series' %in% colnames(data_test)){ - data_test$series <- factor('series1') - } - } - # If the posterior predictions do not already cover the data_test period, the forecast needs to be - # generated using the latent trend dynamics; note, this assumes that there is no gap between the training and - # testing datasets - if(class(data_train)[1] == 'list'){ - all_obs <- c(data.frame(y = data_train$y, - series = data_train$series, - time = data_train$index..time..index) %>% - dplyr::filter(series == s_name) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y), - data.frame(y = data_test$y, - series = data_test$series, - time = data_test$time) %>% - dplyr::filter(series == s_name) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y)) - } else { - all_obs <- c(data_train %>% - dplyr::filter(series == s_name) %>% - dplyr::select(index..time..index, y) %>% - dplyr::distinct() %>% - dplyr::arrange(index..time..index) %>% - dplyr::pull(y), - data_test %>% - dplyr::filter(series == s_name) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y)) - } + } else { - if(dim(preds)[2] != length(all_obs)){ + if(object$fit_engine == 'stan'){ linkfun <- family_invlinks(object$family) - fc_preds <- linkfun(forecast.mvgam(object, series = series, - data_test = data_test, - type = 'link')) - preds <- cbind(preds, fc_preds) + preds <- linkfun(mcmc_chains(object$model_output, 'mus')[,seq(series, + dim(mcmc_chains(object$model_output, 'mus'))[2], + by = NCOL(object$ytimes))]) + } else { + preds <- mcmc_chains(object$model_output, 'mus')[,starts[series]:ends[series]] } - # Calculate out of sample residuals - preds <- preds[,tail(1:dim(preds)[2], length(data_test$time))] - truth <- data_test$y - n_obs <- length(truth) - - if(NROW(preds) > 2000){ - sample_seq <- sample(1:NROW(preds), 2000, F) - } else { - sample_seq <- 1:NROW(preds) - } + # Add variables to data_test if missing + s_name <- levels(data_train$series)[series] + if(!missing(data_test)){ - series_residuals <- get_forecast_resids(object = object, - series = series, - truth = truth, - preds = preds, - family = object$family, - sample_seq = sample_seq) + if(!'y' %in% names(data_test)){ + data_test$y <- rep(NA, NROW(data_test)) + } - } -} + if(class(data_test)[1] == 'list'){ + if(!'time' %in% names(data_test)){ + stop('data_train does not contain a "time" column') + } -# Graphical parameters -.pardefault <- par(no.readonly=T) -on.exit(par(.pardefault)) -layout(matrix(1:4, ncol = 2, nrow = 2, byrow = TRUE)) -oldpar <- par(mar=c(2.5, 2.3, 2, 2), - oma = c(1, 1, 0, 0), - mgp = c(2, 0.5, 0)) + if(!'series' %in% names(data_test)){ + data_test$series <- factor('series1') + } -# Extract expectation (fitted) values -preds <- hindcast(object, type = 'expected')$hindcasts[[series]] -series_residuals <- object$resids[[series]] -limits <- quantile(preds, probs = c(0.025, 0.975), na.rm = TRUE) + } else { + if(!'time' %in% colnames(data_test)){ + stop('data_train does not contain a "time" column') + } -# Plot resids vs fitted -plot(1, - xlim = limits, - bty = 'L', - xlab = '', - ylab = '', - pch = 16, - col = 'white', - cex = 1, - ylim = range(series_residuals, na.rm = T)) + if(!'series' %in% colnames(data_test)){ + data_test$series <- factor('series1') + } + } + # If the posterior predictions do not already cover the data_test period, + # the forecast needs to be generated using the latent trend dynamics; + # note, this assumes that there is no gap between the training and testing datasets + if(class(data_train)[1] == 'list'){ + all_obs <- c(data.frame(y = data_train$y, + series = data_train$series, + time = data_train$index..time..index) %>% + dplyr::filter(series == s_name) %>% + dplyr::select(time, y) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::pull(y), + data.frame(y = data_test$y, + series = data_test$series, + time = data_test$time) %>% + dplyr::filter(series == s_name) %>% + dplyr::select(time, y) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::pull(y)) + } else { + all_obs <- c(data_train %>% + dplyr::filter(series == s_name) %>% + dplyr::select(index..time..index, y) %>% + dplyr::distinct() %>% + dplyr::arrange(index..time..index) %>% + dplyr::pull(y), + data_test %>% + dplyr::filter(series == s_name) %>% + dplyr::select(time, y) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::pull(y)) + } -draws <- sample(1:dim(preds)[1], - min(150, dim(preds)[1]), - replace = FALSE) -moddata <- vector(mode = 'list') -for(i in draws){ - points(x = preds[i,], - y = series_residuals[i,], - pch = 16, - cex = 0.7, - col = '#80808020') - moddata[[which(draws == i)]] <- data.frame(y = series_residuals[i,], - x = preds[i,]) -} -title('Resids vs Fitted', line = 0) -title(ylab = "DS residuals", line = 1.5) -title(xlab = "Fitted values", line = 1.5) + if(dim(preds)[2] != length(all_obs)){ + linkfun <- family_invlinks(object$family) + fc_preds <- linkfun(forecast.mvgam(object, series = series, + data_test = data_test, + type = 'link')) + preds <- cbind(preds, fc_preds) + } -# Plot a fitted line -moddata <- do.call(rbind, moddata) + # Calculate out of sample residuals + preds <- preds[,tail(1:dim(preds)[2], length(data_test$time))] + truth <- data_test$y + n_obs <- length(truth) -predvals <- seq(min(preds[draws, ], na.rm = TRUE), - max(preds[draws, ], na.rm = TRUE), - length.out = 200) -medmod <- bam(y ~ s(x), data = moddata, discrete = TRUE) -medpreds <- predict(medmod, newdata = data.frame(x = predvals), - type = 'response', se.fit = TRUE) + if(NROW(preds) > 2000){ + sample_seq <- sample(1:NROW(preds), 2000, F) + } else { + sample_seq <- 1:NROW(preds) + } -polygon(c(predvals, rev(predvals)), c(medpreds$fit + 3*medpreds$se.fit, - rev(medpreds$fit - 3*medpreds$se.fit)), - col = "#7C000040", - border = NA) -lines(x = predvals, - y = medpreds$fit, - col = "#7C000060", - lwd = 3) + series_residuals <- get_forecast_resids(object = object, + series = series, + truth = truth, + preds = preds, + family = object$family, + sample_seq = sample_seq) -# Q-Q plot -coords <- qqnorm(series_residuals[1,], plot.it = F) -resid_coords_y <- matrix(NA, nrow = NROW(series_residuals), ncol = length(coords$y)) -for(i in 1:NROW(series_residuals)){ - if(all(is.na(series_residuals[i,]))){ - resid_coords_y[i,] <- rep(NA, length(coords$y)) - } else { - norm_coords <- qqnorm(series_residuals[i,], plot.it = FALSE) - coords_y <- norm_coords$y - coords_y[abs(coords_y) > 3.75] <- NA - resid_coords_y[i,] <- coords_y[order(norm_coords$x)] + } } -} - -cred <- sapply(1:NCOL(resid_coords_y), - function(n) quantile(resid_coords_y[,n], - probs = probs, - na.rm = TRUE)) -pred_vals <- coords$x[order(coords$x)] -pred_vals <- pred_vals[complete.cases(cred[1,])] -plot(x = pred_vals, - y = cred[5,][complete.cases(cred[1,])], - bty = 'L', - xlab = '', - ylab = '', - pch = 16, - col = 'white', - cex = 1, - ylim = range(cred, na.rm = T), - tck = -0.04) -title('Normal Q-Q Plot', line = 0) -title(ylab = "Sample Quantiles", line = 1.5) -title(xlab = "Theoretical Quantiles", line = 1.5) -polygon(c(pred_vals, rev(pred_vals)), c(cred[1,][complete.cases(cred[1,])], - rev(cred[9,][complete.cases(cred[1,])])), - col = c_light, border = NA) -polygon(c(pred_vals, rev(pred_vals)), c(cred[2,][complete.cases(cred[1,])], - rev(cred[8,][complete.cases(cred[1,])])), - col = c_light_highlight, border = NA) -polygon(c(pred_vals, rev(pred_vals)), c(cred[3,][complete.cases(cred[1,])], - rev(cred[7,][complete.cases(cred[1,])])), - col = c_mid, border = NA) -polygon(c(pred_vals, rev(pred_vals)), c(cred[4,][complete.cases(cred[1,])], - rev(cred[6,][complete.cases(cred[1,])])), - col = c_mid_highlight, border = NA) -lines(pred_vals, cred[5,][complete.cases(cred[1,])], col = c_dark, lwd = 2.5) -qqline(cred[5,][complete.cases(cred[1,])], col = '#FFFFFF60', lwd = 3) -qqline(cred[5,][complete.cases(cred[1,])], col = 'black', lwd = 2.5) - -# ACF plot -acf1 <- acf(series_residuals[1,], plot = F, - na.action = na.pass) -resid_acf <- matrix(NA, nrow = NROW(series_residuals), - ncol = length(acf1$acf[,,1])) -for(i in 1:NROW(series_residuals)){ - resid_acf[i, ] <- acf(series_residuals[i,], plot = F, - na.action = na.pass)$acf[,,1] -} - -sorted_x <- seq(1:NCOL(resid_acf)) -N <- length(sorted_x) -idx <- rep(1:N, each = 2) -repped_x <- rep(sorted_x, each = 2) - -x <- sapply(1:length(idx), - function(k) if(k %% 2 == 0) - repped_x[k] + min(diff(sorted_x))/2 else - repped_x[k] - min(diff(sorted_x))/2) -cred <- sapply(1:NCOL(resid_acf), - function(n) quantile(resid_acf[,n], - probs = probs, na.rm = T)) -cred <- cred[, -1] -clim <- qnorm((1 + .95)/2)/sqrt(acf1$n.used) -plot(1, type = "n", bty = 'L', - xlab = '', - ylab = '', - xlim = c(1, N-1), - xaxt = 'n', - ylim = range(c(cred, - -clim - 0.05, - clim + 0.05), na.rm = TRUE)) -axis(1, at = seq(1, NCOL(cred), by = 2)) -title('ACF', line = 0) -title(ylab = "Autocorrelation", line = 1.5) -title(xlab = "Lag", line = 1.5) - -N <- N - 1 -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[9,], - ybottom = cred[1,], - col = c_light, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[8,], - ybottom = cred[2,], - col = c_light_highlight, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[7,], - ybottom = cred[3,], - col = c_mid, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[6,], - ybottom = cred[4,], - col = c_mid_highlight, - border = 'transparent') - -for (k in 1:N) { - lines(x = c(x[seq(1, N*2, by = 2)][k],x[seq(2, N*2, by = 2)][k]), - y = c(cred[5,k], cred[5,k]), - col = c_dark, lwd = 2) -} -abline(h = clim, col = '#FFFFFF60', lwd = 2.85) -abline(h = clim, col = 'black', lwd = 2.5, lty = 'dashed') -abline(h = -clim, col = '#FFFFFF60', lwd = 2.85) -abline(h = -clim, col = 'black', lwd = 2.5, lty = 'dashed') - -# PACF plot -pacf1 <- pacf(series_residuals[1,], plot = F, - na.action = na.pass) -resid_pacf <- matrix(NA, nrow = NROW(series_residuals), - ncol = length(pacf1$acf[,,1])) -for(i in 1:NROW(series_residuals)){ - resid_pacf[i, ] <- pacf(series_residuals[i,], plot = F, - na.action = na.pass)$acf[,,1] -} - -sorted_x <- seq(1:NCOL(resid_pacf)) -N <- length(sorted_x) -idx <- rep(1:N, each = 2) -repped_x <- rep(sorted_x, each = 2) - -x <- sapply(1:length(idx), - function(k) if(k %% 2 == 0) - repped_x[k] + min(diff(sorted_x))/2 else - repped_x[k] - min(diff(sorted_x))/2) -cred <- sapply(1:NCOL(resid_pacf), - function(n) quantile(resid_pacf[,n], - probs = probs, na.rm = T)) - -clim <- qnorm((1 + .95)/2)/sqrt(pacf1$n.used) -plot(1, type = "n", bty = 'L', - xlab = '', - ylab = '', - xlim = c(1, length(sorted_x)), - xaxt = 'n', - ylim = range(c(cred, - -clim - 0.05, - clim + 0.05), na.rm = TRUE)) -axis(1, at = seq(1, NCOL(cred), by = 2)) -title('pACF', line = 0) -title(ylab = "Autocorrelation", line = 1.5) -title(xlab = "Lag", line = 1.5) - -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[9,], - ybottom = cred[1,], - col = c_light, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[8,], - ybottom = cred[2,], - col = c_light_highlight, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[7,], - ybottom = cred[3,], - col = c_mid, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[6,], - ybottom = cred[4,], - col = c_mid_highlight, - border = 'transparent') - -for (k in 1:N) { - lines(x = c(x[seq(1, N*2, by = 2)][k],x[seq(2, N*2, by = 2)][k]), - y = c(cred[5,k], cred[5,k]), - col = c_dark, lwd = 2) -} -abline(h = clim, col = '#FFFFFF60', lwd = 2.85) -abline(h = clim, col = 'black', lwd = 2.5, lty = 'dashed') -abline(h = -clim, col = '#FFFFFF60', lwd = 2.85) -abline(h = -clim, col = 'black', lwd = 2.5, lty = 'dashed') - -layout(1) + # plot predictions and residuals + median_resids <- apply(object$resids[[series]], 2, median) + fvr_plot <- data.frame(preds = apply(hindcast(object, type = 'expected')$hindcasts[[series]], + 2, median), + resids = median_resids) %>% + ggplot2::ggplot(ggplot2::aes(preds, resids)) + + ggplot2::geom_smooth(method = "gam", + formula = y ~ s(x, bs = "cs"), + colour = "#7C000060", + fill = "#7C000040") + + ggplot2::geom_point(shape = 16, col = 'white', size = 1.5) + + ggplot2::geom_point(shape = 16, col = 'black', size = 1.25) + + ggplot2::labs(title = "Resids vs Fitted", + x = "Fitted values", + y = "DS residuals") + + ggplot2::theme_bw() + + # Q-Q plot + qq_plot <- data.frame(resids = median_resids) %>% + ggplot2::ggplot(ggplot2::aes(sample = resids)) + + ggplot2::stat_qq_line(colour = c_dark, + linewidth = 0.75) + + ggplot2::stat_qq(shape = 16, col = 'white', size = 1.5) + + ggplot2::stat_qq(shape = 16, col = 'black', + size = 1.25) + + ggplot2::labs(title = "Normal Q-Q Plot", + x = "Theoretical Quantiles", + y = "Sample Quantiles") + + ggplot2::theme_bw() + + # ACF plot + acf_resids <- acf(median_resids, plot = FALSE, na.action = na.pass) + acf_plot <- data.frame(acf = acf_resids$acf[,,1], + lag = acf_resids$lag[,1,1]) %>% + dplyr::filter(lag > 0) %>% + ggplot2::ggplot(ggplot2::aes(x = lag, y = 0, yend = acf)) + + ggplot2::geom_hline(yintercept = c(-1, 1) * + qnorm((1 + 0.95) / 2) / sqrt(acf_resids$n.used), + linetype = "dashed") + + ggplot2::geom_hline(yintercept = 0, + colour = c_dark, + linewidth = 0.25) + + ggplot2::geom_segment(colour = c_dark, + linewidth = 1) + + ggplot2::labs(title = "ACF", + x = "Lag", + y = "Autocorrelation") + + ggplot2::theme_bw() + + # PACF plot + pacf_resids <- pacf(median_resids, plot = FALSE, na.action = na.pass) + pacf_plot <- data.frame(pacf = pacf_resids$acf[,,1], + lag = pacf_resids$lag[,1,1]) %>% + dplyr::filter(lag > 0) %>% + ggplot2::ggplot(ggplot2::aes(x = lag, y = 0, yend = pacf)) + + ggplot2::geom_hline(yintercept = c(-1, 1) * + qnorm((1 + 0.95) / 2) / sqrt(pacf_resids$n.used), + linetype = "dashed") + + ggplot2::geom_hline(yintercept = 0, + colour = c_dark, + linewidth = 0.25) + + ggplot2::geom_segment(colour = c_dark, + linewidth = 1) + + ggplot2::labs(title = "pACF", + x = "Lag", + y = "Partial autocorrelation") + + ggplot2::theme_bw() + + # return + suppressWarnings(print(patchwork::wrap_plots(fvr_plot, + qq_plot, + acf_plot, + pacf_plot, + ncol = 2, + nrow = 2, + byrow = T))) } diff --git a/R/plot_mvgam_series.R b/R/plot_mvgam_series.R index 4b450801..14deaf3b 100644 --- a/R/plot_mvgam_series.R +++ b/R/plot_mvgam_series.R @@ -295,8 +295,11 @@ plot_mvgam_series = function(object, ggplot2::ggplot(ggplot2::aes(x = lag, y = 0, yend = acf)) + ggplot2::geom_hline(yintercept = c(-1, 1) * qnorm((1 + 0.95) / 2) / sqrt(acf_y$n.used), linetype = "dashed") + + ggplot2::geom_hline(yintercept = 0, + colour = c_dark, + linewidth = 0.25) + ggplot2::geom_segment(colour = "#8F2727", - linewidth = 0.75) + + linewidth = 1) + ggplot2::labs(title = "ACF", x = "Lag", y = "Autocorrelation") + diff --git a/man/plot.mvgam_lfo.Rd b/man/plot.mvgam_lfo.Rd index 34ccce54..8923eca2 100644 --- a/man/plot.mvgam_lfo.Rd +++ b/man/plot.mvgam_lfo.Rd @@ -12,7 +12,7 @@ \item{...}{Ignored} } \value{ -A base \code{R} plot of Pareto-k and ELPD values over the +A ggplot object of Pareto-k and ELPD values over the evaluation timepoints. For the Pareto-k plot, a dashed red line indicates the specified threshold chosen for triggering model refits. For the ELPD plot, a dashed red line indicates the bottom 10\% quantile of ELPD values. Points below diff --git a/man/plot_mvgam_resids.Rd b/man/plot_mvgam_resids.Rd index 63719dfc..0ebb4b91 100644 --- a/man/plot_mvgam_resids.Rd +++ b/man/plot_mvgam_resids.Rd @@ -4,7 +4,7 @@ \alias{plot_mvgam_resids} \title{Residual diagnostics for a fitted mvgam object} \usage{ -plot_mvgam_resids(object, series = 1, newdata, data_test) +plot_mvgam_resids(object, series = 1, newdata) } \arguments{ \item{object}{\code{list} object returned from \code{mvgam}. See \code{\link[=mvgam]{mvgam()}}} @@ -20,22 +20,35 @@ However if no \code{newdata} was supplied to the original model call, an assumpt the \code{newdata} supplied here comes sequentially after the data supplied as \code{data} in the original model (i.e. we assume there is no time gap between the last observation of series 1 in \code{data_train} and the first observation for series 1 in \code{newdata}).} - -\item{data_test}{Deprecated. Still works in place of \code{newdata} but users are recommended to use -\code{newdata} instead for more seamless integration into \code{R} workflows} } \value{ -A series of base \code{R} plots +A series of facetted ggplot object } \description{ This function takes a fitted \code{mvgam} object and returns various residual diagnostic plots } \details{ -A total of four base \code{R} plots are generated to examine Dunn-Smyth residuals for -the specified series. Plots include a residuals vs fitted values plot, +A total of four ggplot plots are generated to examine posterior median +Dunn-Smyth residuals for the specified series. Plots include a residuals vs fitted values plot, a Q-Q plot, and two plots to check for any remaining temporal autocorrelation in the residuals. -Note, all plots use posterior medians of fitted values / residuals, so uncertainty is not represented. +Note, all plots use only posterior medians of fitted values / residuals, so uncertainty is not represented. +} +\examples{ +\donttest{ +simdat <- sim_mvgam(n_series = 3, trend_model = AR()) +mod <- mvgam(y ~ s(season, bs = 'cc', k = 6), + trend_model = AR(), + noncentred = TRUE, + data = simdat$data_train, + chains = 2) + +# Plot Dunn Smyth residuals for some series +plot_mvgam_resids(mod) +plot_mvgam_resids(mod, series = 2) +} } \author{ Nicholas J Clark + +Nicholas J Clark and Matthijs Hollanders } diff --git a/tests/testthat/test-example_processing.R b/tests/testthat/test-example_processing.R index 88143e78..a618e594 100644 --- a/tests/testthat/test-example_processing.R +++ b/tests/testthat/test-example_processing.R @@ -368,15 +368,17 @@ test_that("evaluate() functions working", { test_that("lfo_cv() working", { lfs <- SW(lfo_cv(mvgam:::mvgam_example1, min_t = 27, - fc_horizon = 1)) + fc_horizon = 1, + silent = 2)) expect_true(inherits(lfs, 'mvgam_lfo')) - expect_no_error(plot(lfs)) + expect_ggplot(plot(lfs)) lfs <- SW(lfo_cv(mvgam:::mvgam_example5, min_t = 27, - fc_horizon = 1)) + fc_horizon = 1, + silent = 2)) expect_true(inherits(lfs, 'mvgam_lfo')) - expect_no_error(plot(lfs)) + expect_ggplot(plot(lfs)) }) test_that("forecast() works correctly", { diff --git a/tests/testthat/test-mvgam-methods.R b/tests/testthat/test-mvgam-methods.R index 280b1d92..25b09c69 100644 --- a/tests/testthat/test-mvgam-methods.R +++ b/tests/testthat/test-mvgam-methods.R @@ -250,7 +250,12 @@ test_that("hindcast has reasonable outputs", { which(mvgam:::mvgam_example2$obs_data$series == 'series_1')]) }) -test_that("plot_mvgam_series reasonable outputs", { +test_that("plot_mvgam_resids gives reasonable outputs", { + expect_ggplot(plot_mvgam_resids(mvgam:::mvgam_example2)) + expect_ggplot(plot_mvgam_resids(mvgam:::mvgam_example5)) +}) + +test_that("plot_mvgam_series gives reasonable outputs", { simdat <- sim_mvgam() expect_ggplot(plot_mvgam_series(data = simdat$data_train, newdata = simdat$data_test)) @@ -293,8 +298,8 @@ test_that("plot_mvgam_series reasonable outputs", { series = 'all')) # And for mvgam objects - expect_ggplot(plot_mvgam_series(object = mvgam:::mvgam_example1, - series = 1)) + expect_no_error(plot_mvgam_series(object = mvgam:::mvgam_example1, + series = 1)) expect_ggplot(plot(mvgam:::mvgam_example1, type = 'series')) })