incorporate uncertainty in residual plots; add contributor ORCIDs

nicholasjclark · Nov 24, 2024 · 50351a3 · 50351a3
1 parent 072e49a
commit 50351a3
Show file tree

Hide file tree

Showing 36 changed files with 149 additions and 66 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,13 +1,13 @@
 Package: mvgam
 Title: Multivariate (Dynamic) Generalized Additive Models
 Version: 1.1.4
-Date: 2024-09-05
+Date: 2024-11-25
 Authors@R: c(person("Nicholas J", "Clark", , "[email protected]", 
                     role = c("aut", "cre"), comment = c(ORCID = "0000-0001-7131-3301")),
              person("Scott", "Pease", role = c("ctb"),
-                    comment = c("broom enhancements")),
+                    comment = c("broom enhancements", ORCID = "0009-0006-8977-9285")),
              person("Matthijs", "Hollanders", role = c("ctb"),
-                    comment = c("ggplot visualizations")))
+                    comment = c("ggplot visualizations", ORCID = "0000-0003-0796-1018")))
 Description: Fit Bayesian Dynamic Generalized Additive Models to multivariate observations. Users can build nonlinear State-Space models that can incorporate semiparametric effects in observation and process components, using a wide range of observation families. Estimation is performed using Markov Chain Monte Carlo with Hamiltonian Monte Carlo in the software 'Stan'. References: Clark & Wells (2022) <doi:10.1111/2041-210X.13974>.
 URL: https://github.com/nicholasjclark/mvgam, https://nicholasjclark.github.io/mvgam/
 BugReports: https://github.com/nicholasjclark/mvgam/issues

diff --git a/R/plot_mvgam_resids.R b/R/plot_mvgam_resids.R
@@ -7,11 +7,15 @@
 #'@importFrom mgcv bam
 #'@param object \code{list} object returned from \code{mvgam}. See [mvgam()]
 #'@param series \code{integer} specifying which series in the set is to be plotted
+#'@param n_draws \code{integer} specifying the number of posterior residual draws
+#'to use for calculating uncertainty in the "ACF" and "pACF" frames. Default is `100`
+#'@param n_points \code{integer} specifying the maximum number of points to show in the
+#'"Resids vs Fitted" and "Normal Q-Q Plot" frames. Default is `1000`
 #'@author Nicholas J Clark
 #'@details A total of four ggplot plots are generated to examine posterior
 #'Dunn-Smyth residuals for the specified series. Plots include a residuals vs fitted values plot,
 #'a Q-Q plot, and two plots to check for any remaining temporal autocorrelation in the residuals.
-#'Note, all plots use only report statistics from a sample of up to 20 posterior
+#'Note, all plots only report statistics from a sample of up to 100 posterior
 #'draws (to save computational time), so uncertainty in these relationships may not be adequately represented.
 #'@return A series of facetted ggplot object
 #'@author Nicholas J Clark and Matthijs Hollanders
@@ -31,45 +35,60 @@
 #' }
 #'@export
 plot_mvgam_resids = function(object,
-                             series = 1){
+                             series = 1,
+                             n_draws = 100L,
+                             n_points = 1000L){
 
   # Check arguments
   if (!(inherits(object, "mvgam"))) {
     stop('argument "object" must be of class "mvgam"')
   }
 
   validate_pos_integer(series)
+  validate_pos_integer(n_draws)
+  validate_pos_integer(n_points)
 
   if(series > NCOL(object$ytimes)){
     stop(paste0('object only contains data / predictions for ',
                 NCOL(object$ytimes), ' series'),
          call. = FALSE)
   }
 
-  # Plotting colours
-  c_dark <- c("#8F2727")
-
   # Take a sample of posterior draws to compute autocorrelation statistics
   # This is because acf(posterior_median_residual) can induce spurious patterns
   # due to the randomness of DS residuals;
   # rather, we want median(acf(residual_i)), where i indexes all possible draws
   # But this is computationally expensive for some models so we compromise
   # by only taking a few draws
-  n_draws <- NROW(object$resids[[series]])
-  n_samps <- min(20, n_draws)
+  n_total_draws <- NROW(object$resids[[series]])
+  n_samps <- min(n_draws, n_total_draws)
   hcs <- hindcast(object, type = 'expected')$hindcasts[[series]]
   resids <- object$resids[[series]]
 
   resid_df <- do.call(rbind, lapply(seq_len(n_samps), function(x){
     data.frame(preds = hcs[x, ],
-               resids = resids[x, ]) %>%
-                 dplyr::filter(!is.na(resids))
+               resids = resids[x, ],
+               .draw = x) %>%
+      dplyr::filter(!is.na(resids))
   }))
 
-  # Plot predictions and residuals
+  # Plot predictions and residuals (but limit number of points to n_points to
+  # speed up plotting)
+  if(NROW(resid_df) > n_points){
+    resid_df <- resid_df[sample(1:NROW(resid_df),
+                                n_points,
+                                replace = FALSE), ]
+  }
+
   fvr_plot <- ggplot2::ggplot(resid_df, ggplot2::aes(preds, resids)) +
-    ggplot2::geom_point(shape = 16, col = 'white', size = 1.25) +
-    ggplot2::geom_point(shape = 16, col = 'black', size = 1) +
+    ggplot2::geom_point(shape = 16,
+                        col = 'white',
+                        size = 1.25,
+                        alpha = 0.4) +
+    ggplot2::geom_point(shape = 16,
+                        col = 'black',
+                        size = 1,
+                        alpha = 0.4) +
     ggplot2::geom_smooth(method = "gam",
                          formula = y ~ s(x, bs = "cs"),
                          colour = "#7C000060",
@@ -81,11 +100,16 @@ plot_mvgam_resids = function(object,
 
   # Q-Q plot
   qq_plot <- ggplot2::ggplot(resid_df, ggplot2::aes(sample = resids)) +
-    ggplot2::stat_qq_line(colour = c_dark,
+    ggplot2::stat_qq_line(colour = "#8F2727",
                           linewidth = 1) +
-    ggplot2::stat_qq(shape = 16, col = 'white', size = 1.25) +
-    ggplot2::stat_qq(shape = 16, col = 'black',
-                     size = 1) +
+    ggplot2::stat_qq(shape = 16,
+                     col = 'white',
+                     size = 1.25,
+                     alpha = 0.4) +
+    ggplot2::stat_qq(shape = 16,
+                     col = 'black',
+                     size = 1,
+                     alpha = 0.4) +
     ggplot2::labs(title = "Normal Q-Q Plot",
                   x = "Theoretical Quantiles",
                   y = "Sample Quantiles") +
@@ -100,16 +124,34 @@ plot_mvgam_resids = function(object,
       dplyr::filter(lag > 0)
   })) %>%
     dplyr::group_by(lag) %>%
-    dplyr::summarise_all(median)
-  acf_plot <- ggplot2::ggplot(acf_stats, ggplot2::aes(x = lag, y = 0, yend = acf)) +
+    dplyr::mutate(ylow = quantile(acf, probs = 0.05),
+                  yqlow = quantile(acf, probs = 0.2),
+                  ymidlow = quantile(acf, probs = 0.25),
+                  ymidhigh = quantile(acf, probs = 0.75),
+                  yqhigh = quantile(acf, probs = 0.8),
+                  yhigh = quantile(acf, probs = 0.95)) %>%
+    dplyr::distinct()
+  acf_plot <- ggplot2::ggplot(acf_stats,
+                              ggplot2::aes(x = lag)) +
     ggplot2::geom_hline(yintercept = c(-1, 1) *
-                          qnorm((1 + 0.95) / 2) / acf_stats$denom[1],
+                          qnorm((1 + 0.95) / 2) /
+                          acf_stats$denom[1],
                         linetype = "dashed") +
     ggplot2::geom_hline(yintercept = 0,
-                        colour = c_dark,
+                        colour = "#7C0000",
                         linewidth = 0.25) +
-    ggplot2::geom_segment(colour = c_dark,
-                          linewidth = 1) +
+    ggplot2::geom_segment(colour = "#DCBCBC",
+                          linewidth = 1.5,
+                          ggplot2::aes(y = ylow,
+                                       yend = yhigh)) +
+    ggplot2::geom_segment(colour = "#B97C7C",
+                          linewidth = 1.5,
+                          ggplot2::aes(y = yqlow,
+                                       yend = yqhigh)) +
+    ggplot2::geom_segment(colour = "#7C0000",
+                          linewidth = 1.5,
+                          ggplot2::aes(y = ymidlow,
+                                       yend = ymidhigh)) +
     ggplot2::labs(title = "ACF",
                   x = "Lag",
                   y = "Autocorrelation") +
@@ -124,17 +166,35 @@ plot_mvgam_resids = function(object,
       dplyr::filter(lag > 0)
   })) %>%
     dplyr::group_by(lag) %>%
-    dplyr::summarise_all(median)
+    dplyr::mutate(ylow = quantile(pacf, probs = 0.05),
+                  yqlow = quantile(pacf, probs = 0.2),
+                  ymidlow = quantile(pacf, probs = 0.25),
+                  ymidhigh = quantile(pacf, probs = 0.75),
+                  yqhigh = quantile(pacf, probs = 0.8),
+                  yhigh = quantile(pacf, probs = 0.95)) %>%
+    dplyr::distinct()
 
-  pacf_plot <- ggplot2::ggplot(pacf_stats, ggplot2::aes(x = lag, y = 0, yend = pacf)) +
+  pacf_plot <- ggplot2::ggplot(pacf_stats,
+                               ggplot2::aes(x = lag)) +
     ggplot2::geom_hline(yintercept = c(-1, 1) *
-                          qnorm((1 + 0.95) / 2) / pacf_stats$denom[1],
+                          qnorm((1 + 0.95) / 2) /
+                          pacf_stats$denom[1],
                         linetype = "dashed") +
     ggplot2::geom_hline(yintercept = 0,
-                        colour = c_dark,
+                        colour = "#7C0000",
                         linewidth = 0.25) +
-    ggplot2::geom_segment(colour = c_dark,
-                          linewidth = 1) +
+    ggplot2::geom_segment(colour = "#DCBCBC",
+                          linewidth = 1.5,
+                          ggplot2::aes(y = ylow,
+                                       yend = yhigh)) +
+    ggplot2::geom_segment(colour = "#B97C7C",
+                          linewidth = 1.5,
+                          ggplot2::aes(y = yqlow,
+                                       yend = yqhigh)) +
+    ggplot2::geom_segment(colour = "#7C0000",
+                          linewidth = 1.5,
+                          ggplot2::aes(y = ymidlow,
+                                       yend = ymidhigh)) +
     ggplot2::labs(title = "pACF",
                   x = "Lag",
                   y = "Partial autocorrelation") +

diff --git a/README.Rmd b/README.Rmd
@@ -44,7 +44,7 @@ A series of [vignettes cover data formatting, forecasting and several extended c
 * [Phylogenetic smoothing using `mgcv`](https://ecogambler.netlify.app/blog/phylogenetic-smooths-mgcv/){target="_blank"}
 * [Incorporating time-varying seasonality in forecast models](https://ecogambler.netlify.app/blog/time-varying-seasonality/){target="_blank"}
 
-Please also feel free to use the [`mvgam` Discussion Board](https://github.com/nicholasjclark/mvgam/discussions) to hunt for or post other discussion topics related to the package.
+Please also feel free to use the [`mvgam` Discussion Board](https://github.com/nicholasjclark/mvgam/discussions) to hunt for or post other discussion topics related to the package, and do check out the [`mvgam` changelog](https://nicholasjclark.github.io/mvgam/news/index.html) for any updates about recent upgrades that the package has incorporated.
 
 ## Installation
 Install the stable version from `CRAN` using: `install.packages('mvgam')`, or install the development version from `GitHub` using:

diff --git a/README.md b/README.md
@@ -69,7 +69,10 @@ been compiled:
 
 Please also feel free to use the [`mvgam` Discussion
 Board](https://github.com/nicholasjclark/mvgam/discussions) to hunt for
-or post other discussion topics related to the package.
+or post other discussion topics related to the package, and do check out
+the [`mvgam`
+changelog](https://nicholasjclark.github.io/mvgam/news/index.html) for
+any updates about recent upgrades that the package has incorporated.
 
 ## Installation
 
@@ -242,29 +245,29 @@ summary(lynx_mvgam)
 #> 
 #> 
 #> GAM coefficient (beta) estimates:
-#>                 2.5%   50%  97.5% Rhat n_eff
-#> (Intercept)   6.4000  6.60  6.900 1.01   722
-#> s(season).1  -0.6500 -0.14  0.360 1.00  1143
-#> s(season).2   0.6800  1.30  1.900 1.01   693
-#> s(season).3   1.3000  1.90  2.600 1.00   725
-#> s(season).4  -0.0730  0.54  1.100 1.00   978
-#> s(season).5  -1.3000 -0.70 -0.140 1.00   997
-#> s(season).6  -1.2000 -0.57  0.097 1.00   894
-#> s(season).7   0.0071  0.72  1.300 1.00   974
-#> s(season).8   0.6200  1.40  2.100 1.00   903
-#> s(season).9  -0.3500  0.22  0.850 1.00   758
-#> s(season).10 -1.4000 -0.85 -0.330 1.00   911
+#>                2.5%   50%  97.5% Rhat n_eff
+#> (Intercept)   6.400  6.60  6.900    1   830
+#> s(season).1  -0.620 -0.13  0.390    1  1000
+#> s(season).2   0.750  1.30  1.900    1  1053
+#> s(season).3   1.300  1.90  2.500    1   826
+#> s(season).4  -0.110  0.54  1.100    1   898
+#> s(season).5  -1.300 -0.68 -0.077    1   878
+#> s(season).6  -1.200 -0.54  0.120    1   988
+#> s(season).7  -0.022  0.72  1.400    1  1057
+#> s(season).8   0.640  1.40  2.100    1   795
+#> s(season).9  -0.360  0.23  0.820    1   808
+#> s(season).10 -1.400 -0.88 -0.350    1  1073
 #> 
 #> Approximate significance of GAM smooths:
 #>            edf Ref.df Chi.sq p-value    
-#> s(season) 9.96     10   51.8  <2e-16 ***
+#> s(season) 9.97     10   48.9  <2e-16 ***
 #> ---
 #> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 #> 
 #> Latent trend parameter AR estimates:
 #>          2.5%  50% 97.5% Rhat n_eff
-#> ar1[1]   0.59 0.82  0.98    1   705
-#> sigma[1] 0.38 0.48  0.60    1   576
+#> ar1[1]   0.60 0.82  0.97    1   592
+#> sigma[1] 0.38 0.48  0.60    1   596
 #> 
 #> Stan MCMC diagnostics:
 #> n_eff / iter looks reasonable for all parameters
@@ -273,7 +276,7 @@ summary(lynx_mvgam)
 #> 0 of 2000 iterations saturated the maximum tree depth of 10 (0%)
 #> E-FMI indicated no pathological behavior
 #> 
-#> Samples were drawn using NUTS(diag_e) at Fri Nov 22 6:47:36 PM 2024.
+#> Samples were drawn using NUTS(diag_e) at Mon Nov 25 9:36:12 AM 2024.
 #> For each parameter, n_eff is a crude measure of effective sample size,
 #> and Rhat is the potential scale reduction factor on split MCMC chains
 #> (at convergence, Rhat = 1)
@@ -414,8 +417,8 @@ plot(lynx_mvgam, type = 'forecast', newdata = lynx_test)
 
 <img src="man/figures/README-unnamed-chunk-21-1.png" alt="Plotting forecast distributions using mvgam in R" width="60%" style="display: block; margin: auto;" />
 
-    #> Out of sample CRPS:
-    #> 2351.96370025
+    #> Out of sample DRPS:
+    #> 2395.89857175
 
 And the estimated latent trend component, again using the more flexible
 `plot_mvgam_...()` option to show first derivatives of the estimated
@@ -574,7 +577,7 @@ summary(mod, include_betas = FALSE)
 #> 0 of 2000 iterations saturated the maximum tree depth of 10 (0%)
 #> E-FMI indicated no pathological behavior
 #> 
-#> Samples were drawn using NUTS(diag_e) at Fri Nov 22 6:48:56 PM 2024.
+#> Samples were drawn using NUTS(diag_e) at Mon Nov 25 9:37:32 AM 2024.
 #> For each parameter, n_eff is a crude measure of effective sample size,
 #> and Rhat is the potential scale reduction factor on split MCMC chains
 #> (at convergence, Rhat = 1)

diff --git a/docs/authors.html b/docs/authors.html
diff --git a/docs/index.html b/docs/index.html
diff --git a/docs/reference/figures/README-unnamed-chunk-12-1.png b/docs/reference/figures/README-unnamed-chunk-12-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-13-1.png b/docs/reference/figures/README-unnamed-chunk-13-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-14-1.png b/docs/reference/figures/README-unnamed-chunk-14-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-15-1.png b/docs/reference/figures/README-unnamed-chunk-15-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-16-1.png b/docs/reference/figures/README-unnamed-chunk-16-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-17-1.png b/docs/reference/figures/README-unnamed-chunk-17-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-18-1.png b/docs/reference/figures/README-unnamed-chunk-18-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-19-1.png b/docs/reference/figures/README-unnamed-chunk-19-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-20-1.png b/docs/reference/figures/README-unnamed-chunk-20-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-21-1.png b/docs/reference/figures/README-unnamed-chunk-21-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-22-1.png b/docs/reference/figures/README-unnamed-chunk-22-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-23-1.png b/docs/reference/figures/README-unnamed-chunk-23-1.png
diff --git a/docs/reference/figures/README-unnamed-chunk-24-1.png b/docs/reference/figures/README-unnamed-chunk-24-1.png