fix error plotting factors with 2 levels

nicholasjclark · Jun 15, 2023 · dd94a5a · dd94a5a
1 parent 0562150
commit dd94a5a
Show file tree

Hide file tree

Showing 8 changed files with 238 additions and 80 deletions.
diff --git a/R/dynamic.R b/R/dynamic.R
@@ -55,9 +55,26 @@ dynamic = function(variable, rho = 5, stationary = TRUE){
 #' @noRd
 interpret_mvgam = function(formula, N){
 
+  facs <- colnames(attr(terms.formula(formula), 'factors'))
+
+  # Re-arrange so that random effects always come last
+  if(any(grepl('bs = \"re\"', facs, fixed = TRUE))){
+    newfacs <- facs[!grepl('bs = \"re\"', facs, fixed = TRUE)]
+    refacs <- facs[grepl('bs = \"re\"', facs, fixed = TRUE)]
+    int <- attr(terms.formula(formula), 'intercept')
+    newformula <- as.formula(paste(terms.formula(formula)[[2]], '~',paste(paste(newfacs, collapse = '+'),
+                                                                 '+',
+                                                                 paste(refacs, collapse = '+'),
+                                                                 collapse = '+'),
+                                   ifelse(int == 0, ' - 1', '')))
+  } else {
+    newformula <- formula
+  }
+  attr(newformula, '.Environment') <- attr(formula, '.Environment')
+
   # Check if any terms use the dynamic wrapper
-  response <- terms.formula(formula)[[2]]
-  tf <- terms.formula(formula, specials = c("dynamic"))
+  response <- terms.formula(newformula)[[2]]
+  tf <- terms.formula(newformula, specials = c("dynamic"))
   which_dynamics <- attr(tf,"specials")$dynamic
 
   # Update the formula to the correct Gaussian Process spline
@@ -96,7 +113,7 @@ interpret_mvgam = function(formula, N){
 
     # Replace dynamic terms with the correct specification
     update_terms <- vector(length = length(which_dynamics))
-    old_rhs <- as.character(formula)[3]
+    old_rhs <- as.character(newformula)[3]
     old_rhs <- gsub(" ", "", (old_rhs), fixed = TRUE)
     old_rhs <- gsub(',stationary=TRUE|,stationary=FALSE', '',
          old_rhs)
@@ -114,10 +131,10 @@ interpret_mvgam = function(formula, N){
 
     # Return the updated formula for passing to mgcv
     updated_formula <- as.formula(paste(response, '~', old_rhs))
-    attr(updated_formula, '.Environment') <- attr(formula, '.Environment')
+    attr(updated_formula, '.Environment') <- attr(newformula, '.Environment')
 
   } else {
-    updated_formula <- formula
+    updated_formula <- newformula
   }
 
   return(updated_formula)

diff --git a/R/evaluate_mvgams.R b/R/evaluate_mvgams.R
@@ -920,7 +920,7 @@ variogram_score = function(truth, fc, log = FALSE, weights){
       if(i == j){
         out[i,j] <- 0
       } else {
-        v_fc <- quantile(abs(fc[i,] - fc[j,]) ^ 0.5, 0.5)
+        v_fc <- quantile(abs(fc[i,] - fc[j,]) ^ 0.5, 0.5, na.rm = TRUE)
         v_dat <- abs(truth[i] - truth[j]) ^ 0.5
         out[i,j] <- 2 * weights[i, j] * ((v_dat - v_fc) ^ 2)
       }

diff --git a/R/get_mvgam_priors.R b/R/get_mvgam_priors.R
@@ -15,21 +15,41 @@
 #'Any other variables to be included in the linear predictor of \code{formula} must also be present
 #'@param data_train Deprecated. Still works in place of \code{data} but users are recommended to use
 #'\code{data} instead for more seamless integration into `R` workflows
-#'@param family \code{character}. Must be either 'nb' (for Negative Binomial), 'tw' (for Tweedie) or 'poisson'
+#'@param family \code{family} specifying the exponential observation family for the series. Currently supported
+#'families are:
+#'\itemize{
+#'   \item`nb()` for count data
+#'   \item`poisson()` for count data
+#'   \item`tweedie()` for count data (power parameter `p` fixed at `1.5`)
+#'   \item`gaussian()` for real-valued data
+#'   \item`betar()` for proportional data on `(0,1)`
+#'   \item`lognormal()` for non-negative real-valued data
+#'   \item`student_t()` for real-valued data
+#'   \item`Gamma()` for non-negative real-valued data}
+#'See \code{\link{mvgam_families}} for more details
 #'@param use_lv \code{logical}. If \code{TRUE}, use dynamic factors to estimate series'
 #'latent trends in a reduced dimension format. If \code{FALSE}, estimate independent latent trends for each series
 #'@param n_lv \code{integer} the number of latent dynamic factors to use if \code{use_lv == TRUE}.
 #'Cannot be \code{>n_series}. Defaults arbitrarily to \code{min(2, floor(n_series / 2))}
 #'@param trend_model \code{character} specifying the time series dynamics for the latent trend. Options are:
-#''None' (no latent trend component; i.e. the GAM component is all that contributes to the linear predictor,
-#'and the observation process is the only source of error; similarly to what is estimated by \code{\link[mgcv]{gam}}),
-#''RW' (random walk with possible drift),
-#''AR1' (AR1 model with intercept),
-#''AR2' (AR2 model with intercept) or
-#''AR3' (AR3 model with intercept) or
-#''VAR1' (with possible drift; only available in \code{Stan}) or
-#''GP' (Gaussian Process with squared exponential kernel;
-#'only available for estimation in \code{Stan})
+#'\itemize{
+#'   \item `None` (no latent trend component; i.e. the GAM component is all that contributes to the linear predictor,
+#'and the observation process is the only source of error; similarly to what is estimated by \code{\link[mgcv]{gam}})
+#'   \item `RW` (random walk with possible drift)
+#'   \item `AR1` (with possible drift)
+#'   \item `AR2` (with possible drift)
+#'   \item `AR3` (with possible drift)
+#'   \item `VAR1` (with possible drift; only available in \code{Stan})
+#'   \item `GP` (Gaussian Process with squared exponential kernel;
+#'only available in \code{Stan})}
+#'@param trend_map Optional `data.frame` specifying which series should depend on which latent
+#'trends. Useful for allowing multiple series to depend on the same latent trend process, but with
+#'different observation processes. If supplied, a latent factor model is set up by setting
+#'`use_lv = TRUE` and using the mapping to set up the shared trends. Needs to have column names
+#'`series` and `trend`, with integer values in the `trend` column to state which trend each series
+#'should depend on. The `series` column should have a single unique entry for each series in the
+#'data (names should perfectly match factor levels of the `series` variable in `data`). See examples
+#'in \code{\link{mvgam}} for details
 #'@param drift \code{logical} estimate a drift parameter in the latent trend components. Useful if the latent
 #'trend is expected to broadly follow a non-zero slope. Note that if the latent trend is more or less stationary,
 #'the drift parameter can become unidentifiable, especially if an intercept term is included in the GAM linear
@@ -125,6 +145,7 @@ get_mvgam_priors = function(formula,
                             n_lv,
                             use_stan = TRUE,
                             trend_model = 'None',
+                            trend_map,
                             drift = FALSE){
 
   # Validate the family argument
@@ -139,9 +160,6 @@ get_mvgam_priors = function(formula,
                                        "student",
                                        "Gamma"))
 
-  # Validate the trend argument
-  trend_model <- evaluate_trend_model(trend_model)
-
   if(missing("data") & missing("data_train")){
     stop('Argument "data" is missing with no default')
   }
@@ -150,6 +168,48 @@ get_mvgam_priors = function(formula,
     data_train <- data
   }
 
+  # Validate the trend argument
+  trend_model <- evaluate_trend_model(trend_model)
+
+  # Check trend_map is correctly specified
+  if(!missing(trend_map)){
+
+    # No point in trend mapping if trend model is 'None'
+    if(trend_model == 'None'){
+      stop('cannot set up latent trends when "trend_model = None"',
+           call. = FALSE)
+    }
+
+    # Trend mapping not supported by JAGS
+    if(!use_stan){
+      stop('trend mapping not available for JAGS',
+           call. = FALSE)
+    }
+
+    # trend_map must have an entry for each unique time series
+    if(!all(sort(trend_map$series) == sort(unique(data_train$series)))){
+      stop('argument "trend_map" must have an entry for every unique time series in "data"',
+           call. = FALSE)
+    }
+
+    # trend_map must not specify a greater number of trends than there are series
+    if(max(trend_map$trend) > length(unique(data_train$series))){
+      stop('argument "trend_map" specifies more latent trends than there are series in "data"',
+           call. = FALSE)
+    }
+
+    # trend_map must not skip any trends
+    if(!all(sort(unique(trend_map$trend)) == seq(1:max(trend_map$trend)))){
+      stop('argument "trend_map" must link at least one series to each latent trend')
+    }
+
+    # If trend_map correctly specified, set use_lv to TRUE
+    use_lv <- TRUE
+
+    # Model should be set up using dynamic factors of the correct length
+    n_lv <- max(trend_map$trend)
+  }
+
   # JAGS cannot support latent GP or VAR trends
   if(!use_stan & trend_model %in%c ('GP', 'VAR1')){
     warning('gaussian process and VAR trends not yet supported for JAGS; reverting to Stan')
@@ -442,27 +502,19 @@ get_mvgam_priors = function(formula,
 
   if(trend_model == 'RW'){
     if(use_stan){
-      if(use_lv){
-        trend_df <- data.frame(param_name = c('vector<lower=0>[n_lv] sigma;'),
-                               param_length = n_lv,
-                               param_info = c('trend sd'),
-                               prior = c('sigma ~ exponential(1);'),
-                               example_change = c(
-                                 paste0('sigma ~ exponential(',
-                                        round(runif(min = 0.01, max = 1, n = 1), 2),
-                                        ');'
-                                 )))
-      } else {
-        trend_df <- data.frame(param_name = c('vector<lower=0>[n_series] sigma;'),
-                               param_length = length(unique(data_train$series)),
-                               param_info = c('trend sd'),
-                               prior = c('sigma ~ exponential(1);'),
-                               example_change = c(
-                                 paste0('sigma ~ exponential(',
-                                        round(runif(min = 0.01, max = 1, n = 1), 2),
-                                        ');'
-                                 )))
-      }
+      trend_df <- data.frame(param_name = c(paste0('vector<lower=0>[',
+                                                   ifelse(use_lv, 'n_lv', 'n_series'),
+                                                   '] sigma;')),
+                             param_length = ifelse(use_lv,
+                                                   n_lv,
+                                                   length(unique(data_train$series))),
+                             param_info = c('trend sd'),
+                             prior = c('sigma ~ exponential(2);'),
+                             example_change = c(
+                             paste0('sigma ~ exponential(',
+                                    round(runif(min = 0.01, max = 1, n = 1), 2),
+                                    ');'
+                             )))
 
     } else {
       trend_df <- data.frame(param_name = c('vector<lower=0>[n_series] sigma'),
@@ -736,18 +788,23 @@ get_mvgam_priors = function(formula,
 
   # Remove options for trend variance priors if using a dynamic factor model
   if(use_lv){
-    trend_df %>%
-      dplyr::filter(!grepl(paste0('vector<lower=0>[',
-                                  ifelse(use_lv, 'n_lv', 'n_series'),
-                                  '] sigma;'), param_name)) -> trend_df
+    if(missing(trend_map)){
+      trend_df %>%
+        dplyr::filter(!grepl(paste0('vector<lower=0>[',
+                                    ifelse(use_lv, 'n_lv', 'n_series'),
+                                    '] sigma;'), param_name,
+                             fixed = TRUE)) -> trend_df
+    }
 
     if(use_stan){
-      trend_df <- rbind(trend_df,
-                        data.frame(param_name = c('vector[M] L;'),
-                                   param_length = n_lv * length(unique(data_train$series)),
-                                   param_info = c('factor loadings'),
-                                   prior = c('L ~ student_t(5, 0, 1);'),
-                                   example_change = 'L ~ std_normal();'))
+      if(missing(trend_map)){
+        trend_df <- rbind(trend_df,
+                          data.frame(param_name = c('vector[M] L;'),
+                                     param_length = n_lv * length(unique(data_train$series)),
+                                     param_info = c('factor loadings'),
+                                     prior = c('L ~ student_t(5, 0, 1);'),
+                                     example_change = 'L ~ std_normal();'))
+      }
     }
   }
 

diff --git a/R/lfo_cv.mvgam.R b/R/lfo_cv.mvgam.R
@@ -112,7 +112,8 @@ lfo_cv.mvgam = function(object,
 
   # Initialize the process for i = min_t, generating a
   # conditional forecast for all of the future data
-  data_splits <- cv_split(all_data, last_train = min_t)
+  data_splits <- cv_split(all_data, last_train = min_t,
+                          fc_horizon = fc_horizon)
 
   # Fit model to training and forecast the testing
   fit_past <- update(object,
@@ -161,7 +162,8 @@ lfo_cv.mvgam = function(object,
       refits <- c(refits, i)
 
       # Subset the data to now include the last set of training observations
-      data_splits <- cv_split(all_data, last_train = i)
+      data_splits <- cv_split(all_data, last_train = i,
+                              fc_horizon = fc_horizon)
 
       # Re-fit the model
       fit_past <- update(fit_past,
@@ -279,18 +281,24 @@ plot.mvgam_lfo = function(x, ...){
 
 #' Function to generate training and testing splits
 #' @noRd
-cv_split = function(data, last_train){
-  if(class(data)[1] == 'list'){
+cv_split = function(data, last_train, fc_horizon = 1){
+  if(inherits(data, 'list')){
 
     # Find indices of training and testing splits
     temp_dat = data.frame(time = data$time,
                           series = data$series) %>%
       dplyr::mutate(index = dplyr::row_number()) %>%
       dplyr::arrange(time, series)
+
     indices_train <- temp_dat %>%
       dplyr::filter(time <= last_train) %>%
       dplyr::pull(index)
 
+    indices_test <- temp_dat %>%
+      dplyr::filter(time > last_train &
+                      time <= (last_train + fc_horizon)) %>%
+      dplyr::pull(index)
+
     # Split
     data_train <- lapply(data, function(x){
       if(is.matrix(x)){
@@ -302,18 +310,20 @@ cv_split = function(data, last_train){
 
     data_test <- lapply(data, function(x){
       if(is.matrix(x)){
-        matrix(x[-indices_train,], ncol = NCOL(x))
+        matrix(x[indices_test,], ncol = NCOL(x))
       } else {
-        x[-indices_train]
+        x[indices_test]
       }
     })
 
   } else {
     data_train <- data %>%
       dplyr::filter(time <= last_train) %>%
       dplyr::arrange(time, series)
+
     data_test <- data %>%
-      dplyr::filter(time > last_train) %>%
+      dplyr::filter(time > last_train &
+                      time <= (last_train + fc_horizon)) %>%
       dplyr::arrange(time, series)
   }
 

diff --git a/R/mvgam.R b/R/mvgam.R
@@ -1347,7 +1347,8 @@ mvgam = function(formula,
                                         trend_map = trend_map,
                                         data_train = data_train,
                                         ytimes = ytimes,
-                                        n_lv = n_lv)
+                                        n_lv = n_lv,
+                                        trend_model = trend_model)
       vectorised$model_file <- trend_map_setup$model_file
       vectorised$model_data <- trend_map_setup$model_data
     }
@@ -1385,6 +1386,7 @@ mvgam = function(formula,
   #### Return only the model file and all data / inits needed to run the model
   # outside of mvgam ####
   if(!run_model){
+    unlink('base_gam.txt')
       output <- structure(list(call = orig_formula,
                                family = family_char,
                                trend_model = trend_model,