From 4adaf0c3c17af5934accb1bbd852b2593f5fa47e Mon Sep 17 00:00:00 2001 From: Matthijs Hollanders Date: Thu, 24 Oct 2024 15:05:59 +1100 Subject: [PATCH 1/3] ggplot2 plot_mvgam_resids() --- R/plot_mvgam_resids.R | 575 ++++++++++++++---------------------------- 1 file changed, 194 insertions(+), 381 deletions(-) diff --git a/R/plot_mvgam_resids.R b/R/plot_mvgam_resids.R index ad769576..b4315f93 100644 --- a/R/plot_mvgam_resids.R +++ b/R/plot_mvgam_resids.R @@ -26,7 +26,7 @@ #'@return A series of base \code{R} plots #'@export plot_mvgam_resids = function(object, series = 1, - newdata, data_test){ + newdata, data_test) { # Check arguments if (!(inherits(object, "mvgam"))) { @@ -47,405 +47,218 @@ plot_mvgam_resids = function(object, series = 1, 'trend_model')) } -# Plotting colours -probs <- c(0.05, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.95) -c_light <- c("#DCBCBC") -c_light_highlight <- c("#C79999") -c_mid <- c("#B97C7C") -c_mid_highlight <- c("#A25050") -c_dark <- c("#8F2727") -c_dark_highlight <- c("#7C0000") - -# Prediction indices for the particular series -data_train <- validate_series_time(object$obs_data, - trend_model = attr(object$model_data, - 'trend_model')) -ends <- seq(0, dim(mcmc_chains(object$model_output, 'ypred'))[2], - length.out = NCOL(object$ytimes) + 1) -starts <- ends + 1 -starts <- c(1, starts[-c(1, (NCOL(object$ytimes)+1))]) -ends <- ends[-1] - -# Pull out series' residuals -series_residuals <- object$resids[[series]] - -# Get indices of training horizon -if(class(data_train)[1] == 'list'){ - data_train_df <- data.frame(time = data_train$index..time..index, - y = data_train$y, - series = data_train$series) - obs_length <- length(data_train_df %>% - dplyr::filter(series == !!(levels(data_train_df$series)[series])) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y)) -} else { - obs_length <- length(data_train %>% - dplyr::filter(series == !!(levels(data_train$series)[series])) %>% - dplyr::select(index..time..index, y) %>% - dplyr::distinct() %>% - dplyr::arrange(index..time..index) %>% - dplyr::pull(y)) -} - -if(missing(data_test)){ - -} else { - - if(object$fit_engine == 'stan'){ - linkfun <- family_invlinks(object$family) - preds <- linkfun(mcmc_chains(object$model_output, 'mus')[,seq(series, - dim(mcmc_chains(object$model_output, 'mus'))[2], - by = NCOL(object$ytimes))]) + # Plotting colours + probs <- c(0.05, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.95) + c_light <- c("#DCBCBC") + c_light_highlight <- c("#C79999") + c_mid <- c("#B97C7C") + c_mid_highlight <- c("#A25050") + c_dark <- c("#8F2727") + c_dark_highlight <- c("#7C0000") + + # Prediction indices for the particular series + data_train <- validate_series_time(object$obs_data, + trend_model = attr(object$model_data, + 'trend_model')) + ends <- seq(0, dim(mcmc_chains(object$model_output, 'ypred'))[2], + length.out = NCOL(object$ytimes) + 1) + starts <- ends + 1 + starts <- c(1, starts[-c(1, (NCOL(object$ytimes)+1))]) + ends <- ends[-1] + + # Pull out series' residuals + series_residuals <- object$resids[[series]] + + # Get indices of training horizon + if(class(data_train)[1] == 'list'){ + data_train_df <- data.frame(time = data_train$index..time..index, + y = data_train$y, + series = data_train$series) + obs_length <- length(data_train_df %>% + dplyr::filter(series == !!(levels(data_train_df$series)[series])) %>% + dplyr::select(time, y) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::pull(y)) } else { - preds <- mcmc_chains(object$model_output, 'mus')[,starts[series]:ends[series]] + obs_length <- length(data_train %>% + dplyr::filter(series == !!(levels(data_train$series)[series])) %>% + dplyr::select(index..time..index, y) %>% + dplyr::distinct() %>% + dplyr::arrange(index..time..index) %>% + dplyr::pull(y)) } - # Add variables to data_test if missing - s_name <- levels(data_train$series)[series] - if(!missing(data_test)){ - - if(!'y' %in% names(data_test)){ - data_test$y <- rep(NA, NROW(data_test)) - } + if(missing(data_test)){ - if(class(data_test)[1] == 'list'){ - if(!'time' %in% names(data_test)){ - stop('data_train does not contain a "time" column') - } - - if(!'series' %in% names(data_test)){ - data_test$series <- factor('series1') - } - - } else { - if(!'time' %in% colnames(data_test)){ - stop('data_train does not contain a "time" column') - } - - if(!'series' %in% colnames(data_test)){ - data_test$series <- factor('series1') - } - } - # If the posterior predictions do not already cover the data_test period, the forecast needs to be - # generated using the latent trend dynamics; note, this assumes that there is no gap between the training and - # testing datasets - if(class(data_train)[1] == 'list'){ - all_obs <- c(data.frame(y = data_train$y, - series = data_train$series, - time = data_train$index..time..index) %>% - dplyr::filter(series == s_name) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y), - data.frame(y = data_test$y, - series = data_test$series, - time = data_test$time) %>% - dplyr::filter(series == s_name) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y)) - } else { - all_obs <- c(data_train %>% - dplyr::filter(series == s_name) %>% - dplyr::select(index..time..index, y) %>% - dplyr::distinct() %>% - dplyr::arrange(index..time..index) %>% - dplyr::pull(y), - data_test %>% - dplyr::filter(series == s_name) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y)) - } + } else { - if(dim(preds)[2] != length(all_obs)){ + if(object$fit_engine == 'stan'){ linkfun <- family_invlinks(object$family) - fc_preds <- linkfun(forecast.mvgam(object, series = series, - data_test = data_test, - type = 'link')) - preds <- cbind(preds, fc_preds) + preds <- linkfun(mcmc_chains(object$model_output, 'mus')[,seq(series, + dim(mcmc_chains(object$model_output, 'mus'))[2], + by = NCOL(object$ytimes))]) + } else { + preds <- mcmc_chains(object$model_output, 'mus')[,starts[series]:ends[series]] } - # Calculate out of sample residuals - preds <- preds[,tail(1:dim(preds)[2], length(data_test$time))] - truth <- data_test$y - n_obs <- length(truth) - - if(NROW(preds) > 2000){ - sample_seq <- sample(1:NROW(preds), 2000, F) - } else { - sample_seq <- 1:NROW(preds) - } + # Add variables to data_test if missing + s_name <- levels(data_train$series)[series] + if(!missing(data_test)){ - series_residuals <- get_forecast_resids(object = object, - series = series, - truth = truth, - preds = preds, - family = object$family, - sample_seq = sample_seq) + if(!'y' %in% names(data_test)){ + data_test$y <- rep(NA, NROW(data_test)) + } - } -} + if(class(data_test)[1] == 'list'){ + if(!'time' %in% names(data_test)){ + stop('data_train does not contain a "time" column') + } -# Graphical parameters -.pardefault <- par(no.readonly=T) -on.exit(par(.pardefault)) -layout(matrix(1:4, ncol = 2, nrow = 2, byrow = TRUE)) -oldpar <- par(mar=c(2.5, 2.3, 2, 2), - oma = c(1, 1, 0, 0), - mgp = c(2, 0.5, 0)) + if(!'series' %in% names(data_test)){ + data_test$series <- factor('series1') + } -# Extract expectation (fitted) values -preds <- hindcast(object, type = 'expected')$hindcasts[[series]] -series_residuals <- object$resids[[series]] -limits <- quantile(preds, probs = c(0.025, 0.975), na.rm = TRUE) + } else { + if(!'time' %in% colnames(data_test)){ + stop('data_train does not contain a "time" column') + } -# Plot resids vs fitted -plot(1, - xlim = limits, - bty = 'L', - xlab = '', - ylab = '', - pch = 16, - col = 'white', - cex = 1, - ylim = range(series_residuals, na.rm = T)) + if(!'series' %in% colnames(data_test)){ + data_test$series <- factor('series1') + } + } + # If the posterior predictions do not already cover the data_test period, the forecast needs to be + # generated using the latent trend dynamics; note, this assumes that there is no gap between the training and + # testing datasets + if(class(data_train)[1] == 'list'){ + all_obs <- c(data.frame(y = data_train$y, + series = data_train$series, + time = data_train$index..time..index) %>% + dplyr::filter(series == s_name) %>% + dplyr::select(time, y) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::pull(y), + data.frame(y = data_test$y, + series = data_test$series, + time = data_test$time) %>% + dplyr::filter(series == s_name) %>% + dplyr::select(time, y) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::pull(y)) + } else { + all_obs <- c(data_train %>% + dplyr::filter(series == s_name) %>% + dplyr::select(index..time..index, y) %>% + dplyr::distinct() %>% + dplyr::arrange(index..time..index) %>% + dplyr::pull(y), + data_test %>% + dplyr::filter(series == s_name) %>% + dplyr::select(time, y) %>% + dplyr::distinct() %>% + dplyr::arrange(time) %>% + dplyr::pull(y)) + } -draws <- sample(1:dim(preds)[1], - min(150, dim(preds)[1]), - replace = FALSE) -moddata <- vector(mode = 'list') -for(i in draws){ - points(x = preds[i,], - y = series_residuals[i,], - pch = 16, - cex = 0.7, - col = '#80808020') - moddata[[which(draws == i)]] <- data.frame(y = series_residuals[i,], - x = preds[i,]) -} -title('Resids vs Fitted', line = 0) -title(ylab = "DS residuals", line = 1.5) -title(xlab = "Fitted values", line = 1.5) + if(dim(preds)[2] != length(all_obs)){ + linkfun <- family_invlinks(object$family) + fc_preds <- linkfun(forecast.mvgam(object, series = series, + data_test = data_test, + type = 'link')) + preds <- cbind(preds, fc_preds) + } -# Plot a fitted line -moddata <- do.call(rbind, moddata) + # Calculate out of sample residuals + preds <- preds[,tail(1:dim(preds)[2], length(data_test$time))] + truth <- data_test$y + n_obs <- length(truth) -predvals <- seq(min(preds[draws, ], na.rm = TRUE), - max(preds[draws, ], na.rm = TRUE), - length.out = 200) -medmod <- bam(y ~ s(x), data = moddata, discrete = TRUE) -medpreds <- predict(medmod, newdata = data.frame(x = predvals), - type = 'response', se.fit = TRUE) + if(NROW(preds) > 2000){ + sample_seq <- sample(1:NROW(preds), 2000, F) + } else { + sample_seq <- 1:NROW(preds) + } -polygon(c(predvals, rev(predvals)), c(medpreds$fit + 3*medpreds$se.fit, - rev(medpreds$fit - 3*medpreds$se.fit)), - col = "#7C000040", - border = NA) -lines(x = predvals, - y = medpreds$fit, - col = "#7C000060", - lwd = 3) + series_residuals <- get_forecast_resids(object = object, + series = series, + truth = truth, + preds = preds, + family = object$family, + sample_seq = sample_seq) -# Q-Q plot -coords <- qqnorm(series_residuals[1,], plot.it = F) -resid_coords_y <- matrix(NA, nrow = NROW(series_residuals), ncol = length(coords$y)) -for(i in 1:NROW(series_residuals)){ - if(all(is.na(series_residuals[i,]))){ - resid_coords_y[i,] <- rep(NA, length(coords$y)) - } else { - norm_coords <- qqnorm(series_residuals[i,], plot.it = FALSE) - coords_y <- norm_coords$y - coords_y[abs(coords_y) > 3.75] <- NA - resid_coords_y[i,] <- coords_y[order(norm_coords$x)] + } } -} -cred <- sapply(1:NCOL(resid_coords_y), - function(n) quantile(resid_coords_y[,n], - probs = probs, - na.rm = TRUE)) -pred_vals <- coords$x[order(coords$x)] -pred_vals <- pred_vals[complete.cases(cred[1,])] -plot(x = pred_vals, - y = cred[5,][complete.cases(cred[1,])], - bty = 'L', - xlab = '', - ylab = '', - pch = 16, - col = 'white', - cex = 1, - ylim = range(cred, na.rm = T), - tck = -0.04) -title('Normal Q-Q Plot', line = 0) -title(ylab = "Sample Quantiles", line = 1.5) -title(xlab = "Theoretical Quantiles", line = 1.5) -polygon(c(pred_vals, rev(pred_vals)), c(cred[1,][complete.cases(cred[1,])], - rev(cred[9,][complete.cases(cred[1,])])), - col = c_light, border = NA) -polygon(c(pred_vals, rev(pred_vals)), c(cred[2,][complete.cases(cred[1,])], - rev(cred[8,][complete.cases(cred[1,])])), - col = c_light_highlight, border = NA) -polygon(c(pred_vals, rev(pred_vals)), c(cred[3,][complete.cases(cred[1,])], - rev(cred[7,][complete.cases(cred[1,])])), - col = c_mid, border = NA) -polygon(c(pred_vals, rev(pred_vals)), c(cred[4,][complete.cases(cred[1,])], - rev(cred[6,][complete.cases(cred[1,])])), - col = c_mid_highlight, border = NA) -lines(pred_vals, cred[5,][complete.cases(cred[1,])], col = c_dark, lwd = 2.5) -qqline(cred[5,][complete.cases(cred[1,])], col = '#FFFFFF60', lwd = 3) -qqline(cred[5,][complete.cases(cred[1,])], col = 'black', lwd = 2.5) - -# ACF plot -acf1 <- acf(series_residuals[1,], plot = F, - na.action = na.pass) -resid_acf <- matrix(NA, nrow = NROW(series_residuals), - ncol = length(acf1$acf[,,1])) -for(i in 1:NROW(series_residuals)){ - resid_acf[i, ] <- acf(series_residuals[i,], plot = F, - na.action = na.pass)$acf[,,1] + # plot predictions and residuals (commented out code is my first attempt per draw...) + library(ggplot2) + library(patchwork) + # preds <- hindcast(object, type = 'expected')$hindcasts[[series]] + # resids <- object$resids[[series]] + # n_draw <- dim(preds)[[1]] + # n_obs <- dim(preds)[[2]] + # draws <- sample(1:n_draw, min(150, n_draw), + # replace = FALSE) + # fvr <- data.frame(preds = c(preds), + # resids = c(resids), + # obs = rep(1:n_obs, n_draw), + # .draw = rep(1:n_draw, each = n_obs)) %>% + # dplyr::filter(.draw %in% draws) |> + # ggplot(aes(preds, resids)) + + # geom_point(shape = 16, alpha = 0.2) + + # geom_smooth(method = "gam", colour = "#7C000060", fill = "#7C000040") + + # labs(title = "Resids vs Fitted", + # x = "DS residuals", + # y = "Fitted values") + median_resids <- apply(object$resids[[series]], 2, median) + fvr_plot <- data.frame(preds = apply(hindcast(object, type = 'expected')$hindcasts[[series]], 2, median), + resids = median_resids) |> + ggplot(aes(preds, resids)) + + geom_point(shape = 16, alpha = 2/3) + + geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"), colour = "#7C000060", fill = "#7C000040") + + labs(title = "Resids vs Fitted", + x = "DS residuals", + y = "Fitted values") + + # Q-Q plot + qq_plot <- data.frame(resids = median_resids) |> + ggplot(aes(sample = resids)) + + stat_qq(shape = 16, alpha = 2/3) + + stat_qq_line(colour = c_dark) + + labs(title = "Normal Q-Q Plot", + x = "Theoretical Quantiles", + y = "Sample Quantiles") + + # ACF plot + acf_resids <- acf(median_resids, plot = F, na.action = na.pass) + acf_plot <- data.frame(acf = acf_resids$acf[,,1], + lag = acf_resids$lag[,1,1]) |> + dplyr::filter(lag > 0) |> + ggplot(aes(x = lag, y = 0, yend = acf)) + + geom_hline(yintercept = c(-1, 1) * qnorm((1 + 0.95) / 2) / sqrt(acf_resids$n.used), + linetype = "dashed") + + geom_segment(colour = c_dark) + + labs(title = "ACF", + x = "Lag", + y = "Autocorrelation") + + + # PACF plot + pacf_resids <- pacf(median_resids, plot = F, na.action = na.pass) + pacf_plot <- data.frame(pacf = pacf_resids$acf[,,1], + lag = pacf_resids$lag[,1,1]) |> + dplyr::filter(lag > 0) |> + ggplot(aes(x = lag, y = 0, yend = pacf)) + + geom_hline(yintercept = c(-1, 1) * qnorm((1 + 0.95) / 2) / sqrt(pacf_resids$n.used), + linetype = "dashed") + + geom_segment(colour = c_dark) + + labs(title = "pACF", + x = "Lag", + y = "Autocorrelation") + + # return + suppressWarnings(print((fvr_plot | qq_plot) / (acf_plot | pacf_plot))) } -sorted_x <- seq(1:NCOL(resid_acf)) -N <- length(sorted_x) -idx <- rep(1:N, each = 2) -repped_x <- rep(sorted_x, each = 2) - -x <- sapply(1:length(idx), - function(k) if(k %% 2 == 0) - repped_x[k] + min(diff(sorted_x))/2 else - repped_x[k] - min(diff(sorted_x))/2) -cred <- sapply(1:NCOL(resid_acf), - function(n) quantile(resid_acf[,n], - probs = probs, na.rm = T)) -cred <- cred[, -1] -clim <- qnorm((1 + .95)/2)/sqrt(acf1$n.used) -plot(1, type = "n", bty = 'L', - xlab = '', - ylab = '', - xlim = c(1, N-1), - xaxt = 'n', - ylim = range(c(cred, - -clim - 0.05, - clim + 0.05), na.rm = TRUE)) -axis(1, at = seq(1, NCOL(cred), by = 2)) -title('ACF', line = 0) -title(ylab = "Autocorrelation", line = 1.5) -title(xlab = "Lag", line = 1.5) - -N <- N - 1 -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[9,], - ybottom = cred[1,], - col = c_light, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[8,], - ybottom = cred[2,], - col = c_light_highlight, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[7,], - ybottom = cred[3,], - col = c_mid, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[6,], - ybottom = cred[4,], - col = c_mid_highlight, - border = 'transparent') - -for (k in 1:N) { - lines(x = c(x[seq(1, N*2, by = 2)][k],x[seq(2, N*2, by = 2)][k]), - y = c(cred[5,k], cred[5,k]), - col = c_dark, lwd = 2) -} -abline(h = clim, col = '#FFFFFF60', lwd = 2.85) -abline(h = clim, col = 'black', lwd = 2.5, lty = 'dashed') -abline(h = -clim, col = '#FFFFFF60', lwd = 2.85) -abline(h = -clim, col = 'black', lwd = 2.5, lty = 'dashed') - -# PACF plot -pacf1 <- pacf(series_residuals[1,], plot = F, - na.action = na.pass) -resid_pacf <- matrix(NA, nrow = NROW(series_residuals), - ncol = length(pacf1$acf[,,1])) -for(i in 1:NROW(series_residuals)){ - resid_pacf[i, ] <- pacf(series_residuals[i,], plot = F, - na.action = na.pass)$acf[,,1] -} - -sorted_x <- seq(1:NCOL(resid_pacf)) -N <- length(sorted_x) -idx <- rep(1:N, each = 2) -repped_x <- rep(sorted_x, each = 2) - -x <- sapply(1:length(idx), - function(k) if(k %% 2 == 0) - repped_x[k] + min(diff(sorted_x))/2 else - repped_x[k] - min(diff(sorted_x))/2) -cred <- sapply(1:NCOL(resid_pacf), - function(n) quantile(resid_pacf[,n], - probs = probs, na.rm = T)) - -clim <- qnorm((1 + .95)/2)/sqrt(pacf1$n.used) -plot(1, type = "n", bty = 'L', - xlab = '', - ylab = '', - xlim = c(1, length(sorted_x)), - xaxt = 'n', - ylim = range(c(cred, - -clim - 0.05, - clim + 0.05), na.rm = TRUE)) -axis(1, at = seq(1, NCOL(cred), by = 2)) -title('pACF', line = 0) -title(ylab = "Autocorrelation", line = 1.5) -title(xlab = "Lag", line = 1.5) - -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[9,], - ybottom = cred[1,], - col = c_light, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[8,], - ybottom = cred[2,], - col = c_light_highlight, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[7,], - ybottom = cred[3,], - col = c_mid, - border = 'transparent') -rect(xleft = x[seq(1, N*2, by = 2)], - xright = x[seq(2, N*2, by = 2)], - ytop = cred[6,], - ybottom = cred[4,], - col = c_mid_highlight, - border = 'transparent') - -for (k in 1:N) { - lines(x = c(x[seq(1, N*2, by = 2)][k],x[seq(2, N*2, by = 2)][k]), - y = c(cred[5,k], cred[5,k]), - col = c_dark, lwd = 2) -} -abline(h = clim, col = '#FFFFFF60', lwd = 2.85) -abline(h = clim, col = 'black', lwd = 2.5, lty = 'dashed') -abline(h = -clim, col = '#FFFFFF60', lwd = 2.85) -abline(h = -clim, col = 'black', lwd = 2.5, lty = 'dashed') - -layout(1) - -} From 1b4177f7d177557e19e02a6580963d36b194dcef Mon Sep 17 00:00:00 2001 From: Matthijs Hollanders Date: Sat, 26 Oct 2024 12:36:45 +1100 Subject: [PATCH 2/3] ggplot2 plot_mvgam_factors() --- R/plot_mvgam_factors.R | 106 ++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/R/plot_mvgam_factors.R b/R/plot_mvgam_factors.R index bf0b0762..9477ec75 100644 --- a/R/plot_mvgam_factors.R +++ b/R/plot_mvgam_factors.R @@ -28,7 +28,7 @@ #'plot_mvgam_factors(mod) #'} #'@export -plot_mvgam_factors = function(object, plot = TRUE){ +plot_mvgam_factors = function(object, plot = TRUE) { # Check arguments if (!(inherits(object, "mvgam"))) { @@ -47,6 +47,7 @@ plot_mvgam_factors = function(object, plot = TRUE){ starts <- c(1, starts[-c(1, object$n_lv + 1)]) ends <- ends[-1] probs <- c(0.05, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.95) + probs <- c(0.2, 0.4, 0.6, 0.9) # Set up plot environment if(plot){ @@ -68,62 +69,57 @@ plot_mvgam_factors = function(object, plot = TRUE){ } } - # Loop across each lv and calculate probability that the lv was dropped - lv_estimates <- do.call(rbind, lapply(1:object$n_lv, function(x){ + lv_estimates <- lapply(1:object$n_lv, + \(x) { - if(object$fit_engine == 'stan'){ - inds_lv <- seq(x, dim(mcmc_chains(object$model_output, 'LV'))[2], by = object$n_lv) - preds <- mcmc_chains(object$model_output, 'LV')[,inds_lv] - } else { - preds <- mcmc_chains(object$model_output, 'LV')[,starts[x]:ends[x]] - } - - # Keep only the in-sample observations for testing against the null of white noise - preds <- preds[,1:(length(object$obs_data$y) / NCOL(object$ytimes))] + if(object$fit_engine == 'stan'){ + inds_lv <- seq(x, dim(mcmc_chains(object$model_output, 'LV'))[2], by = object$n_lv) + preds <- mcmc_chains(object$model_output, 'LV')[,inds_lv] + } else { + preds <- mcmc_chains(object$model_output, 'LV')[,starts[x]:ends[x]] + } - cred <- sapply(1:NCOL(preds), - function(n) quantile(preds[,n], - probs = probs)) - # If plot = TRUE, plot the LVs - if(plot){ - preds_last <- preds[1,] - ylim <- range(cred) - ylab <- paste0('Factor ', x) - pred_vals <- seq(1:length(preds_last)) - plot(1, type = "n", bty = 'L', - xlab = 'Time', - ylab = ylab, - xlim = c(0, length(preds_last)), - ylim = ylim) - polygon(c(pred_vals, rev(pred_vals)), c(cred[1,], rev(cred[9,])), - col = c_light, border = NA) - polygon(c(pred_vals, rev(pred_vals)), c(cred[2,], rev(cred[8,])), - col = c_light_highlight, border = NA) - polygon(c(pred_vals, rev(pred_vals)), c(cred[3,], rev(cred[7,])), - col = c_mid, border = NA) - polygon(c(pred_vals, rev(pred_vals)), c(cred[4,], rev(cred[6,])), - col = c_mid_highlight, border = NA) - lines(pred_vals, cred[5,], col = c_dark, lwd = 2.5) - box(bty = 'L', lwd = 2) - - } + # Keep only the in-sample observations for testing against the null of white noise + preds <- preds[,1:(length(object$obs_data$y) / NCOL(object$ytimes))] + n_draw <- nrow(preds) + n_obs <- ncol(preds) + factor_name <- paste0("Factor ", x) - # Calculate second derivatives of empirical medians and upper / lower intervals; - # factors with small second derivatives are moving in roughly a straight line and not - # likely contributing much (or at all) to the latent trend estimates - meds <- cred[5,] - uppers <- cred[8,] - lowers <- cred[2,] - data.frame('Contribution' = sum(abs(diff(diff(meds)) + - diff(diff(uppers)) + - diff(diff(lowers))))) - })) - - rownames(lv_estimates) <- paste0('Factor', 1:object$n_lv) - - if(plot){ - layout(1) - } + if (plot) { + library(ggplot2) + fig <- data.frame(preds = c(preds), + obs = rep(1:n_obs, each = n_draw), + .draw = rep(1:n_draw, n_obs)) |> + ggplot(aes(obs, preds)) + + ggdist::stat_lineribbon(point_interval = ggdist::median_qi, + .width = probs, + colour = c_dark, + show.legend = FALSE) + + scale_fill_manual(values = c(c_light, c_light_highlight, c_mid, c_mid_highlight)) + + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous(expand = c(0, 0)) + + labs(x = "Time", + y = factor_name) + } - lv_estimates / sum(lv_estimates) + # Calculate second derivatives of empirical medians and upper / lower intervals; + # factors with small second derivatives are moving in roughly a straight line and not + # likely contributing much (or at all) to the latent trend estimates + meds <- apply(preds, 2, median) + uppers <- apply(preds, 2, \(p) quantile(p, 0.8)) + lowers <- apply(preds, 2, \(p) quantile(p, 0.2)) + contribution <- data.frame('Contribution' = sum(abs(diff(diff(meds)) + + diff(diff(uppers)) + + diff(diff(lowers)))), + row.names = factor_name) + if (plot) { + list(fig = fig, contribution = contribution) + } else { + list(contribution = contribution) + } + }) + if (plot) + lapply(lv_estimates, \(x) print(x$fig)) + contributions <- do.call(rbind, lapply(lv_estimates, \(x) x$contribution)) + contributions / sum(contributions) } From 51ebcbcfa20b6ac873e934ed9d75d501d61da05d Mon Sep 17 00:00:00 2001 From: Matthijs Hollanders Date: Wed, 6 Nov 2024 15:19:23 +1100 Subject: [PATCH 3/3] updated DESCRIPTION and three plotting functions --- DESCRIPTION | 3 +- R/plot_mvgam_factors.R | 13 +- R/plot_mvgam_resids.R | 79 +++----- R/plot_mvgam_series.R | 404 +++++++++++------------------------------ 4 files changed, 143 insertions(+), 356 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7c5700b9..b36719c2 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,7 +31,8 @@ Imports: dplyr, magrittr, Matrix, - rlang + rlang, + patchwork (>= 1.3.0) Encoding: UTF-8 LazyData: true Roxygen: list(markdown = TRUE) diff --git a/R/plot_mvgam_factors.R b/R/plot_mvgam_factors.R index 9477ec75..fdcef672 100644 --- a/R/plot_mvgam_factors.R +++ b/R/plot_mvgam_factors.R @@ -86,20 +86,19 @@ plot_mvgam_factors = function(object, plot = TRUE) { factor_name <- paste0("Factor ", x) if (plot) { - library(ggplot2) fig <- data.frame(preds = c(preds), obs = rep(1:n_obs, each = n_draw), .draw = rep(1:n_draw, n_obs)) |> - ggplot(aes(obs, preds)) + + ggplot2::ggplot(ggplot2::aes(obs, preds)) + ggdist::stat_lineribbon(point_interval = ggdist::median_qi, .width = probs, colour = c_dark, show.legend = FALSE) + - scale_fill_manual(values = c(c_light, c_light_highlight, c_mid, c_mid_highlight)) + - scale_x_continuous(expand = c(0, 0)) + - scale_y_continuous(expand = c(0, 0)) + - labs(x = "Time", - y = factor_name) + ggplot2::scale_fill_manual(values = c(c_light, c_light_highlight, c_mid, c_mid_highlight)) + + ggplot2::scale_x_continuous(expand = c(0, 0)) + + ggplot2::scale_y_continuous(expand = c(0, 0)) + + ggplot2::labs(x = "Time", + y = factor_name) } # Calculate second derivatives of empirical medians and upper / lower intervals; diff --git a/R/plot_mvgam_resids.R b/R/plot_mvgam_resids.R index b4315f93..250bcab1 100644 --- a/R/plot_mvgam_resids.R +++ b/R/plot_mvgam_resids.R @@ -193,72 +193,51 @@ plot_mvgam_resids = function(object, series = 1, } # plot predictions and residuals (commented out code is my first attempt per draw...) - library(ggplot2) - library(patchwork) - # preds <- hindcast(object, type = 'expected')$hindcasts[[series]] - # resids <- object$resids[[series]] - # n_draw <- dim(preds)[[1]] - # n_obs <- dim(preds)[[2]] - # draws <- sample(1:n_draw, min(150, n_draw), - # replace = FALSE) - # fvr <- data.frame(preds = c(preds), - # resids = c(resids), - # obs = rep(1:n_obs, n_draw), - # .draw = rep(1:n_draw, each = n_obs)) %>% - # dplyr::filter(.draw %in% draws) |> - # ggplot(aes(preds, resids)) + - # geom_point(shape = 16, alpha = 0.2) + - # geom_smooth(method = "gam", colour = "#7C000060", fill = "#7C000040") + - # labs(title = "Resids vs Fitted", - # x = "DS residuals", - # y = "Fitted values") median_resids <- apply(object$resids[[series]], 2, median) fvr_plot <- data.frame(preds = apply(hindcast(object, type = 'expected')$hindcasts[[series]], 2, median), resids = median_resids) |> - ggplot(aes(preds, resids)) + - geom_point(shape = 16, alpha = 2/3) + - geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"), colour = "#7C000060", fill = "#7C000040") + - labs(title = "Resids vs Fitted", - x = "DS residuals", - y = "Fitted values") + ggplot2::ggplot(ggplot2::aes(preds, resids)) + + ggplot2::geom_point(shape = 16, alpha = 2/3) + + ggplot2::geom_smooth(method = "gam", formula = y ~ s(x, bs = "cs"), colour = "#7C000060", fill = "#7C000040") + + ggplot2::labs(title = "Resids vs Fitted", + x = "DS residuals", + y = "Fitted values") # Q-Q plot qq_plot <- data.frame(resids = median_resids) |> - ggplot(aes(sample = resids)) + - stat_qq(shape = 16, alpha = 2/3) + - stat_qq_line(colour = c_dark) + - labs(title = "Normal Q-Q Plot", - x = "Theoretical Quantiles", - y = "Sample Quantiles") + ggplot2::ggplot(ggplot2::aes(sample = resids)) + + ggplot2::stat_qq(shape = 16, alpha = 2/3) + + ggplot2::stat_qq_line(colour = c_dark) + + ggplot2::labs(title = "Normal Q-Q Plot", + x = "Theoretical Quantiles", + y = "Sample Quantiles") # ACF plot - acf_resids <- acf(median_resids, plot = F, na.action = na.pass) + acf_resids <- acf(median_resids, plot = FALSE, na.action = na.pass) acf_plot <- data.frame(acf = acf_resids$acf[,,1], lag = acf_resids$lag[,1,1]) |> dplyr::filter(lag > 0) |> - ggplot(aes(x = lag, y = 0, yend = acf)) + - geom_hline(yintercept = c(-1, 1) * qnorm((1 + 0.95) / 2) / sqrt(acf_resids$n.used), - linetype = "dashed") + - geom_segment(colour = c_dark) + - labs(title = "ACF", - x = "Lag", - y = "Autocorrelation") - + ggplot2::ggplot(ggplot2::aes(x = lag, y = 0, yend = acf)) + + ggplot2::geom_hline(yintercept = c(-1, 1) * qnorm((1 + 0.95) / 2) / sqrt(acf_resids$n.used), + linetype = "dashed") + + ggplot2::geom_segment(colour = c_dark) + + ggplot2::labs(title = "ACF", + x = "Lag", + y = "Autocorrelation") # PACF plot - pacf_resids <- pacf(median_resids, plot = F, na.action = na.pass) + pacf_resids <- pacf(median_resids, plot = FALSE, na.action = na.pass) pacf_plot <- data.frame(pacf = pacf_resids$acf[,,1], lag = pacf_resids$lag[,1,1]) |> dplyr::filter(lag > 0) |> - ggplot(aes(x = lag, y = 0, yend = pacf)) + - geom_hline(yintercept = c(-1, 1) * qnorm((1 + 0.95) / 2) / sqrt(pacf_resids$n.used), - linetype = "dashed") + - geom_segment(colour = c_dark) + - labs(title = "pACF", - x = "Lag", - y = "Autocorrelation") + ggplot2::ggplot(ggplot2::aes(x = lag, y = 0, yend = pacf)) + + ggplot2::geom_hline(yintercept = c(-1, 1) * qnorm((1 + 0.95) / 2) / sqrt(pacf_resids$n.used), + linetype = "dashed") + + ggplot2::geom_segment(colour = c_dark) + + ggplot2::labs(title = "pACF", + x = "Lag", + y = "Autocorrelation") # return - suppressWarnings(print((fvr_plot | qq_plot) / (acf_plot | pacf_plot))) + patchwork::wrap_plots(fvr_plot, qq_plot, acf_plot, pacf_plot, ncol = 2, nrow = 2, byrow = T) } - diff --git a/R/plot_mvgam_series.R b/R/plot_mvgam_series.R index a1e2141e..6c545c33 100644 --- a/R/plot_mvgam_series.R +++ b/R/plot_mvgam_series.R @@ -180,323 +180,131 @@ plot_mvgam_series = function(object, data_test$series <- droplevels(data_test$series) } + # only plot time series if(series == 'all'){ - n_plots <- length(levels(data_train$series)) - pages <- 1 - - if (n_plots > 4) pages <- 2 - if (n_plots > 8) pages <- 3 - if (n_plots > 12) pages <- 4 - if (pages != 0) { - ppp <- n_plots %/% pages - - if (n_plots %% pages != 0) { - ppp <- ppp + 1 - while (ppp * (pages - 1) >= n_plots) pages <- pages - 1 - } - - # Configure layout matrix - c <- r <- trunc(sqrt(ppp)) - if (c<1) r <- c <- 1 - if (c*r < ppp) c <- c + 1 - if (c*r < ppp) r <- r + 1 - - .pardefault <- par(no.readonly=T) - on.exit(par(.pardefault)) - oldpar <- par(mfrow = c(r,c)) - - } else { ppp <- 1; oldpar <- par()} - - all_ys <- lapply(seq_len(n_plots), function(x){ - if(log_scale){ - log( data.frame(y = data_train$y, - series = data_train$series, - time = data_train$time) %>% - dplyr::filter(series == levels(data_train$series)[x]) %>% - dplyr::pull(y) + 1) - } else { - data.frame(y = data_train$y, - series = data_train$series, - time = data_train$time) %>% - dplyr::filter(series == levels(data_train$series)[x]) %>% - dplyr::pull(y) - } - }) - - for(i in 1:n_plots){ - s_name <- levels(data_train$series)[i] - - truth <- data.frame(y = data_train$y, - series = data_train$series, - time = data_train$time) %>% - dplyr::filter(series == s_name) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y) - - if(log_scale){ - truth <- log(truth + 1) - ylab <- paste0('log(', y,' + 1)') - } else { - ylab <- paste0(y) - } - plot(1, type = "n", bty = 'L', - xlab = 'Time', - ylab = ylab, - xaxt = 'n', - ylim = range(unlist(all_ys), na.rm = TRUE), - xlim = c(0, length(c(truth)))) - axis(side = 1, - at = floor(seq(0, max(data_train$time) - - (min(data_train$time)-1), - length.out = 6)), - labels = floor(seq(min(data_train$time), - max(data_train$time), - length.out = 6))) - title(s_name, line = 0) - - if(n_plots > 1){ - if(lines){ - for(x in 1:n_plots){ - lines(all_ys[[x]], lwd = 1.85, col = 'grey85') - } - } else { - for(x in 1:n_plots){ - points(all_ys[[x]], col = 'grey85', pch = 16) - } - } + # create tibble + dat <- dplyr::as_tibble(data_train) |> + dplyr::distinct(time, y, series) - } - - if(lines){ - lines(x = 1:length(truth), y = truth, lwd = 3, col = "white") - lines(x = 1:length(truth), y = truth, lwd = 2.5, col = "#8F2727") - } else { - points(x = 1:length(truth), y = truth, cex = 1.2, col = "white", pch = 16) - points(x = 1:length(truth), y = truth, cex = 0.9, col = "#8F2727", pch = 16) - } + # log transform and y-axis labels + if(log_scale){ + dat$y <- log(dat$y + 1) + ylab <- paste0('log(', y,' + 1)') + } else { + ylab <- paste0(y) + } - box(bty = 'L', lwd = 2) + # create faceted time serires + plot_ts <- dat |> + ggplot2::ggplot(ggplot2::aes(time, y)) + + ggplot2::facet_wrap(~ series) + + ggplot2::labs(x = "Time", + y = ylab) + if (lines) { + plot_ts <- plot_ts + ggplot2::geom_line(colour = "#8F2727") + } else { + plot_ts <- plot_ts + ggplot2::geom_point(colour = "#8F2727") } - layout(1) + plot_ts + # multiple plots for one time series } else { + # series name s_name <- levels(data_train$series)[series] - truth <- data.frame(y = data_train$y, - series = data_train$series, - time = data_train$time) %>% - dplyr::filter(series == s_name) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y) - if(log_scale){ - truth <- log(truth + 1) + # training data + dat <- dplyr::as_tibble(data_train) |> + dplyr::filter(series == s_name) |> + dplyr::distinct(time, y) |> + dplyr::mutate(data = "train") + + # optionally bind test data + if (!missing(data_test)) { + dat <- dplyr::bind_rows(dat, + dplyr::as_tibble(data_test) |> + dplyr::filter(series == s_name) |> + dplyr::distinct(time, y) |> + dplyr::mutate(data = "test")) + } + + # log-transform y + if (log_scale) { + truth$y <- log(truth$y + 1) ylab <- paste0('log(', y,' + 1)') } else { ylab <- paste0(y) } - .pardefault <- par(no.readonly=T) - on.exit(par(.pardefault)) - layout(matrix(1:4, nrow = 2, byrow = TRUE)) - - if(!missing(data_test)){ - test <- data.frame(y = data_test$y, - series = data_test$series, - time = data_test$time) %>% - dplyr::filter(series == s_name) %>% - dplyr::select(time, y) %>% - dplyr::distinct() %>% - dplyr::arrange(time) %>% - dplyr::pull(y) - - plot(1, type = "n", bty = 'L', - xlab = '', - ylab = '', - xaxt = 'n', - ylim = range(c(truth, test), na.rm = TRUE), - xlim = c(0, length(c(truth, test)))) - axis(side = 1, - at = floor(seq(0, max(data_test$time) - - (min(data_train$time)-1), - length.out = 6)), - labels = floor(seq(min(data_train$time), - max(data_test$time), - length.out = 6))) - - title('Time series', line = 0) - title(ylab = ylab, line = 1.85) - title(xlab = "Time", line = 1.85) - - if(lines){ - lines(x = 1:length(truth), y = truth, lwd = 2, col = "#8F2727") - lines(x = (length(truth)+1):length(c(truth, test)), - y = test, lwd = 2, col = "black") + # time series + if (!missing(data_test)) { + plot_ts <- dat |> + ggplot2::ggplot(ggplot2::aes(time, y, colour = data)) + + ggplot2::geom_vline(xintercept = dat |> + dplyr::filter(data == "test") |> + dplyr::pull(time) |> + min(), + linetype = "dashed", + colour = "black") + + ggplot2::scale_colour_manual(values = c("black", "#8F2727")) + + ggplot2::labs(title = "Time series", + x = "Time", + y = ylab) + if (lines) { + plot_ts <- plot_ts + ggplot2::geom_line(show.legend = F) } else { - points(x = 1:length(truth), y = truth, pch = 16, col = "#8F2727") - points(x = (length(truth)+1):length(c(truth, test)), - y = test, pch = 16, col = "black") - } - - abline(v = length(truth)+1, col = '#FFFFFF60', lwd = 2.85) - abline(v = length(truth)+1, col = 'black', lwd = 2.5, lty = 'dashed') - box(bty = 'L', lwd = 2) - - if(missing(n_bins)){ - n_bins <- max(c(length(hist(c(truth, test), plot = F)$breaks), - 20)) - } - - hist(c(truth, test), border = "#8F2727", - lwd = 2, - freq = FALSE, - breaks = n_bins, - col = "#C79999", - ylab = '', - xlab = '', - main = '') - title('Histogram', line = 0) - title(ylab = 'Density', line = 1.85) - title(xlab = paste0(y), line = 1.85) - - acf(c(truth, test), - na.action = na.pass, bty = 'L', - lwd = 2.5, ci.col = 'black', col = "#8F2727", - main = '', ylab = '', xlab = '') - acf1 <- acf(c(truth, test), plot = F, - na.action = na.pass) - clim <- qnorm((1 + .95)/2)/sqrt(acf1$n.used) - abline(h = clim, col = '#FFFFFF', lwd = 2.85) - abline(h = clim, col = 'black', lwd = 2.5, lty = 'dashed') - abline(h = -clim, col = '#FFFFFF', lwd = 2.85) - abline(h = -clim, col = 'black', lwd = 2.5, lty = 'dashed') - box(bty = 'L', lwd = 2) - title('ACF', line = 0) - title(ylab = 'Autocorrelation', line = 1.85) - title(xlab = 'Lag', line = 1.85) - - ecdf_plotdat = function(vals, x){ - func <- ecdf(vals) - func(x) + plot_ts <- plot_ts + ggplot2::geom_point(show.legend = F) } - - plot_x <- seq(from = min(c(truth, test), na.rm = T), - to = max(c(truth, test), na.rm = T), - length.out = 100) - - plot(1, type = "n", bty = 'L', - xlab = '', - ylab = '', - xlim = c(min(plot_x), max(plot_x)), - ylim = c(0, 1)) - title('CDF', line = 0) - title(ylab = 'Empirical CDF', line = 1.85) - title(xlab = paste0(y), line = 1.85) - lines(x = plot_x, - y = ecdf_plotdat(c(truth, test), - plot_x), - col = "#8F2727", - lwd = 2.5) - box(bty = 'L', lwd = 2) - } else { - plot(1, type = "n", bty = 'L', - xlab = '', - ylab = '', - xaxt = 'n', - ylim = range(c(truth), na.rm = TRUE), - xlim = c(0, length(c(truth)))) - if(max(data_train$time < 6)){ - axis(side = 1, - at = 0:(max(data_train$time) - 1), - labels = 1:max(data_train$time)) - } else { - axis(side = 1, - at = floor(seq(0, max(data_train$time) - - (min(data_train$time)-1), - length.out = 6)), - labels = floor(seq(min(data_train$time), - max(data_train$time), - length.out = 6))) - } - - title('Time series', line = 0) - title(ylab = ylab, line = 1.85) - title(xlab = "Time", line = 1.85) - - if(lines){ - lines(x = 1:length(truth), y = truth, lwd = 2, col = "#8F2727") - } else { - points(x = 1:length(truth), y = truth, pch = 16, col = "#8F2727") - } - - box(bty = 'L', lwd = 2) - - if(missing(n_bins)){ - n_bins <- max(c(length(hist(c(truth), plot = F)$breaks), - 20)) - } - - hist(c(truth), border = "#8F2727", - lwd = 2, - freq = FALSE, - breaks = n_bins, - col = "#C79999", - ylab = '', - xlab = '', - main = '') - title('Histogram', line = 0) - title(ylab = 'Density', line = 1.85) - title(xlab = paste0(y), line = 1.85) - - - acf(c(truth), - na.action = na.pass, bty = 'L', - lwd = 2.5, ci.col = 'black', col = "#8F2727", - main = '', ylab = '', xlab = '') - acf1 <- acf(c(truth), plot = F, - na.action = na.pass) - clim <- qnorm((1 + .95)/2)/sqrt(acf1$n.used) - abline(h = clim, col = '#FFFFFF', lwd = 2.85) - abline(h = clim, col = 'black', lwd = 2.5, lty = 'dashed') - abline(h = -clim, col = '#FFFFFF', lwd = 2.85) - abline(h = -clim, col = 'black', lwd = 2.5, lty = 'dashed') - box(bty = 'L', lwd = 2) - title('ACF', line = 0) - title(ylab = 'Autocorrelation', line = 1.85) - title(xlab = 'Lag', line = 1.85) - - - ecdf_plotdat = function(vals, x){ - func <- ecdf(vals) - func(x) - } - - plot_x <- seq(from = min(truth, na.rm = T), - to = max(truth, na.rm = T), - length.out = 100) - plot(1, type = "n", bty = 'L', - xlab = '', - ylab = '', - xlim = c(min(plot_x), max(plot_x)), - ylim = c(0, 1)) - title('CDF', line = 0) - title(ylab = 'Empirical CDF', line = 1.85) - title(xlab = paste0(y), line = 1.85) - lines(x = plot_x, - y = ecdf_plotdat(truth, - plot_x), - col = "#8F2727", - lwd = 2.5) - box(bty = 'L', lwd = 2) + plot_ts <- dat |> + ggplot2::ggplot(ggplot2::aes(time, y)) + + ggplot2::labs(title = "Time series", + x = "Time", + y = ylab) + } + if (lines) { + plot_ts <- plot_ts + ggplot2::geom_line(colour = "#8F2727") + } else { + plot_ts <- plot_ts + ggplot2::geom_point(colour = "#8F2727") } + # histogram + if(missing(n_bins)){ + n_bins <- max(c(length(hist(c(dat$y), plot = F)$breaks), + 20)) + } + plot_hist <- dat |> + ggplot2::ggplot(ggplot2::aes(y)) + + ggplot2::geom_histogram(bins = n_bins, fill = "#C79999") + + ggplot2::labs(title = "Histogram", + x = "y", + y = "Count") + + # ACF + acf_y <- acf(dat$y, plot = F, na.action = na.pass) + plot_acf <- data.frame(acf = acf_y$acf[,,1], + lag = acf_y$lag[,1,1]) |> + ggplot2::ggplot(ggplot2::aes(x = lag, y = 0, yend = acf)) + + ggplot2::geom_hline(yintercept = c(-1, 1) * qnorm((1 + 0.95) / 2) / sqrt(acf_y$n.used), + linetype = "dashed") + + ggplot2::geom_segment(colour = "#8F2727") + + ggplot2::labs(title = "ACF", + x = "Lag", + y = "Autocorrelation") + + # ECDF plot + range_y <- range(dat$y, na.rm = T) + plot_ecdf <- data.frame(x = seq(range_y[1], range_y[2], length.out = 100)) |> + dplyr::mutate(y = ecdf(truth$y)(x)) |> + ggplot2::ggplot(ggplot2::aes(x, y)) + + ggplot2::geom_line(colour = "#8F2727") + + ggplot2::scale_y_continuous(limits = c(0, 1)) + + ggplot2::labs(title = "CDF", + x = "y", + y = "Empirical CDF") + + # plot + patchwork::wrap_plots(plot_ts, plot_hist, plot_acf, plot_ecdf, ncol = 2, nrow = 2, byrow = T) } - - layout(1) }