add variance as a prediction type
nicholasjclark committed Sep 20, 2023
1 parent 32a6d5e commit e6748e4
Showing 89 changed files with 1,748 additions and 451 deletions.
9 changes: 5 additions & 4 deletions NAMESPACE
@@ -92,12 +92,17 @@ importFrom(bayesplot,color_scheme_set)
importFrom(bayesplot,log_posterior)
importFrom(bayesplot,neff_ratio)
importFrom(bayesplot,nuts_params)
importFrom(brms,dstudent_t)
importFrom(brms,get_prior)
importFrom(brms,logm1)
importFrom(brms,lognormal)
importFrom(brms,mcmc_plot)
importFrom(brms,ndraws)
importFrom(brms,prior_string)
importFrom(brms,pstudent_t)
importFrom(brms,qstudent_t)
importFrom(brms,rstudent_t)
importFrom(brms,student)
importFrom(grDevices,devAskNewPage)
importFrom(grDevices,hcl.colors)
importFrom(grDevices,rgb)
@@ -164,7 +169,6 @@ importFrom(stats,dlnorm)
importFrom(stats,dnbinom)
importFrom(stats,dnorm)
importFrom(stats,dpois)
importFrom(stats,dt)
importFrom(stats,ecdf)
importFrom(stats,formula)
importFrom(stats,frequency)
@@ -187,12 +191,10 @@ importFrom(stats,poisson)
importFrom(stats,ppois)
importFrom(stats,predict)
importFrom(stats,printCoefmat)
importFrom(stats,pt)
importFrom(stats,qcauchy)
importFrom(stats,qnorm)
importFrom(stats,qqline)
importFrom(stats,qqnorm)
importFrom(stats,qt)
importFrom(stats,quantile)
importFrom(stats,rbeta)
importFrom(stats,reformulate)
@@ -201,7 +203,6 @@ importFrom(stats,rlnorm)
importFrom(stats,rnbinom)
importFrom(stats,rnorm)
importFrom(stats,rpois)
importFrom(stats,rt)
importFrom(stats,runif)
importFrom(stats,sd)
importFrom(stats,setNames)
19 changes: 8 additions & 11 deletions R/compute_edf.R
@@ -71,19 +71,16 @@ compute_edf = function(mgcv_model, object, rho_names, sigma_raw_names){
mu <- fitted(object, summary = FALSE)[best_draw, 1:length(eta)]
}

# Because some mvgam families do not (yet) have simple functions to compute
# prediction variance, we will resort to brute force sampling
preds <- do.call(rbind, lapply(1:75, function(x){
# luckily the predict function is vectorized!!
predict(object, process_error = FALSE, type = 'response')[best_draw,]
}))
pred_variance <- apply(preds, 2, var)
if(any(pred_variance == 0)){
pred_variance[which(pred_variance == 0)] <-
mu[which(pred_variance == 0)]
# Calculate variance using family's mean-variance relationship
mu_variance <- predict(object,
process_error = FALSE,
type = 'variance')[best_draw, 1:length(eta)]
if(any(mu_variance == 0)){
mu_variance[which(mu_variance == 0)] <-
mu[which(mu_variance == 0)]
}

w <- as.numeric(mgcv_model$family$mu.eta(eta) / pred_variance)
w <- as.numeric(mgcv_model$family$mu.eta(eta) / mu_variance)
XWX <- t(X) %*% (w * X)
rho <- mgcv_model$sp
lambda <- exp(rho)
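The change above replaces brute-force sampling of prediction variance with the family's analytic mean-variance relationship when building the weights for the effective-degrees-of-freedom calculation. A minimal standalone sketch of that weighting step, using made-up inputs and a negative binomial mean-variance relationship in place of the package's own `predict()` call:

```r
# Minimal sketch (not package code): the weights for the penalized working
# model now come from an analytic mean-variance relationship rather than
# repeated sampling. All inputs below are made up for illustration.
set.seed(1)
eta <- rnorm(10)                       # linear predictor values
mu  <- exp(eta)                        # fitted means under a log link
phi <- 5                               # hypothetical NB dispersion

mu_variance <- mu + mu^2 / phi         # negative binomial variance
mu_variance[mu_variance == 0] <- mu[mu_variance == 0]  # guard against zeros

mu_eta <- mu                           # d(mu)/d(eta) for the log link
w <- as.numeric(mu_eta / mu_variance)  # used downstream in t(X) %*% (w * X)
```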
5 changes: 4 additions & 1 deletion R/evaluate_mvgams.R
@@ -667,7 +667,7 @@ drps_score <- function(truth, fc, interval_width = 0.9,
}

#nsum <- 1000
nsum <- max(c(truth, fc)) + 1000
nsum <- max(c(truth, fc), na.rm = TRUE) + 1000
Fy = ecdf(fc)
ysum <- 0:nsum
indicator <- ifelse(ysum - truth >= 0, 1, 0)
@@ -714,6 +714,9 @@ sis_score <- function(truth, fc, interval_width = 0.9,
#' @importFrom scoringRules es_sample
#' @noRd
energy_score <- function(truth, fc, log = FALSE) {
# es_sample can't handle any NAs
has_nas <- apply(fc, 2, function(x) any(is.na(x)))
fc <- fc[,!has_nas]
if(log){
truth <- log(truth + 0.001)
fc <- log(fc + 0.001)
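The energy-score fix above drops any forecast draw containing missing values before scoring, because `scoringRules::es_sample()` cannot handle `NA`s. A standalone sketch of that filtering step with made-up data:

```r
library(scoringRules)

# Made-up example: a 3-dimensional observation and 500 forecast draws
# (one column per draw), a few of which contain NAs
set.seed(1)
truth <- c(2.1, 0.4, 5.7)
fc <- matrix(rnorm(3 * 500, mean = truth), nrow = 3)
fc[2, c(10, 50)] <- NA

# es_sample() can't handle any NAs, so drop draws (columns) that contain them
has_nas <- apply(fc, 2, function(x) any(is.na(x)))
fc <- fc[, !has_nas, drop = FALSE]

es_sample(y = truth, dat = fc)
```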
98 changes: 47 additions & 51 deletions R/families.R
@@ -1,6 +1,6 @@
#' Supported mvgam families
#' @importFrom stats make.link dgamma pgamma rgamma qnorm plnorm runif pbeta dlnorm dpois pnorm ppois plogis gaussian poisson Gamma dnbinom rnbinom dnorm dbeta
#' @importFrom brms lognormal
#' @importFrom brms lognormal student rstudent_t qstudent_t dstudent_t pstudent_t
#' @param link a specification for the family link function. At present these cannot
#' be changed
#' @details \code{mvgam} currently supports the following standard observation families:
@@ -20,12 +20,14 @@
#'\itemize{
#' \item \code{\link[brms]{lognormal}} for non-negative real-valued data
#' \item \code{tweedie} for count data (power parameter `p` fixed at `1.5`)
#' \item \code{student-t} for real-valued data
#' \item `student_t()` (or \code{\link[brms]{student}}) for real-valued data
#' }
#'Note that only `poisson()`, `nb()`, and `tweedie()` are available if
#'using `JAGS`. All families, apart from `tweedie()`, are supported if
#'using `Stan`.
#' @name mvgam_families
#' @details Note that currently it is not possible to change the default link
#' functions in `mvgam`, so any call to change these will be silently ignored
#' @author Nicholas J Clark
NULL

@@ -36,7 +38,7 @@ tweedie = function(link = 'log'){
structure(list(family = "tweedie", link = 'log', linkfun = linktemp$linkfun,
linkinv = linktemp$linkinv, mu.eta = linktemp$mu.eta,
valideta = linktemp$valideta),
class = c("extended.family","family"))
class = c("extended.family", "family"))
}

#' @rdname mvgam_families
@@ -46,7 +48,7 @@ student_t = function(link = 'identity'){
structure(list(family = "student", link = 'identity', linkfun = linktemp$linkfun,
linkinv = linktemp$linkinv, mu.eta = linktemp$mu.eta,
valideta = linktemp$valideta),
class = c("extended.family","family"))
class = c("extended.family", "family"))
}

#### Non-exported functions for performing family-specific tasks ####
@@ -71,57 +73,20 @@ beta_shapes = function(mu, phi) {
shape2 = (1 - mu) * phi))
}

# Scaled Student's t distribution
# Original author: mjskay (https://github.com/mjskay/ggdist/blob/master/R/student_t.R)
#'
#' Density, distribution function, quantile function and random generation for the
#' scaled Student's t distribution, parameterized by degrees of freedom (`df`),
#' location (`mu`), and scale (`sigma`).
#'
#' @inheritParams stats::dt
#' @param mu Location parameter (median)
#' @param sigma Scale parameter
#'
#' @name student_t
#' @importFrom stats dt pt qt rt
#' @noRd
dstudent_t = function(x, df, mu = 0, sigma = 1, log = FALSE) {
if (log) {
dt((x - mu)/sigma, df = df, log = TRUE) - log(sigma)
}
else {
dt((x - mu)/sigma, df = df) / sigma
}
}

#' @noRd
pstudent_t = function(q, df, mu = 0, sigma = 1, lower.tail = TRUE, log.p = FALSE) {
pt((q - mu)/sigma, df = df, lower.tail = lower.tail, log.p = log.p)
}

#' @noRd
qstudent_t = function(p, df, mu = 0, sigma = 1, lower.tail = TRUE, log.p = FALSE) {
qt(p, df = df, lower.tail = lower.tail, log.p = log.p)*sigma + mu
}

#' @noRd
rstudent_t = function(n, df, mu = 0, sigma = 1) {
as.vector(rt(n, df = df)*sigma + mu)
}

#' Generic prediction function
#' @importFrom stats predict
#' @param Xp A `mgcv` linear predictor matrix
#' @param family \code{character}. The `family` slot of the model's family argument
#' @param betas Vector of regression coefficients of length `NCOL(Xp)`
#' @param type Either `link`, `expected` or `response`
#' @param type Either `link`, `expected`, `response` or `variance`
#' @param family_pars Additional arguments for each specific observation process (i.e.
#' overdispersion parameter if `family == "nb"`)
#' @param density logical. Rather than calculating a prediction, evaluate the log-likelihood.
#' Use this option when particle filtering
#' @param truth Observation to use for evaluating the likelihood (if `density == TRUE`)
#' @details A generic prediction function that will make it easier to add new
#' response distributions in future
#' response distributions in future. Use `type = variance` for computing family-level
#' variance as a function of the mean
#' @noRd
mvgam_predict = function(Xp, family, betas,
type = 'link',
@@ -140,6 +105,8 @@ mvgam_predict = function(Xp, family, betas,
log = TRUE)
}

} else if(type == 'variance') {
out <- rep.int(1, NROW(Xp))
} else {
out <- rnorm(n = NROW(Xp),
mean = ((matrix(Xp, ncol = NCOL(Xp)) %*%
@@ -166,6 +133,12 @@ mvgam_predict = function(Xp, family, betas,
betas)) +
attr(Xp, 'model.offset'),
sdlog = as.vector(family_pars$sigma_obs))
} else if(type == 'variance'){
mu <- ((matrix(Xp, ncol = NCOL(Xp)) %*%
betas)) + attr(Xp, 'model.offset')
sd <- as.vector(family_pars$sigma_obs)
out <- (exp((sd) ^ 2) - 1) * exp((2 * mu + sd ^ 2))

} else {
mu <- as.vector((matrix(Xp, ncol = NCOL(Xp)) %*%
betas) + attr(Xp, 'model.offset'))
@@ -180,12 +153,16 @@ mvgam_predict = function(Xp, family, betas,
betas) + attr(Xp, 'model.offset'))
if(density){
out <- dstudent_t(truth,
df = family_pars$nu,
df = as.vector(family_pars$nu),
mu = out,
sigma = as.vector(family_pars$sigma_obs),
log = TRUE)
}

} else if(type == 'variance') {
out <- as.vector(family_pars$nu) /
(pmax(2.01, as.vector(family_pars$nu)) - 2)

} else {
out <- rstudent_t(n = NROW(Xp),
df = family_pars$nu,
@@ -211,7 +188,10 @@ mvgam_predict = function(Xp, family, betas,
lambda = exp(((matrix(Xp, ncol = NCOL(Xp)) %*%
betas)) +
attr(Xp, 'model.offset')))
} else {
} else if(type == 'variance'){
out <- exp(as.vector((matrix(Xp, ncol = NCOL(Xp)) %*%
betas) + attr(Xp, 'model.offset')))
} else {
out <- exp(((matrix(Xp, ncol = NCOL(Xp)) %*%
betas)) +
attr(Xp, 'model.offset'))
@@ -236,6 +216,10 @@ mvgam_predict = function(Xp, family, betas,
betas)) +
attr(Xp, 'model.offset')),
size = as.vector(family_pars$phi))
} else if(type == 'variance'){
mu <- exp(as.vector((matrix(Xp, ncol = NCOL(Xp)) %*%
betas) + attr(Xp, 'model.offset')))
out <- mu + mu^2 / as.vector(family_pars$phi)
} else {
out <- exp(((matrix(Xp, ncol = NCOL(Xp)) %*%
betas)) +
@@ -263,7 +247,11 @@ mvgam_predict = function(Xp, family, betas,
out <- rbeta(n = NROW(Xp),
shape1 = shape_pars$shape1,
shape2 = shape_pars$shape2)
} else {
} else if(type == 'variance'){
mu <- plogis(as.vector((matrix(Xp, ncol = NCOL(Xp)) %*%
betas) + attr(Xp, 'model.offset')))
out <- mu * (1 - mu) / (1 + exp(as.vector(family_pars$phi)))
} else {
out <- plogis(as.vector((matrix(Xp, ncol = NCOL(Xp)) %*%
betas) + attr(Xp, 'model.offset')))
}
@@ -288,7 +276,10 @@ mvgam_predict = function(Xp, family, betas,
exp(as.vector((matrix(Xp, ncol = NCOL(Xp)) %*%
betas) + attr(Xp, 'model.offset'))),
shape = as.vector(family_pars$shape))
} else {
} else if(type == 'variance'){
out <- as.vector((matrix(Xp, ncol = NCOL(Xp)) %*%
betas) + attr(Xp, 'model.offset')) ^ 2
} else {
out <- as.vector(family_pars$shape) /
(as.vector(family_pars$shape) / exp(as.vector((matrix(Xp, ncol = NCOL(Xp)) %*%
betas) + attr(Xp, 'model.offset'))))
@@ -319,7 +310,12 @@ mvgam_predict = function(Xp, family, betas,
# Power parameter is fixed
p = 1.5,
phi = as.vector(family_pars$phi)))
} else {
} else if(type == 'variance'){
out <- (exp(((matrix(Xp, ncol = NCOL(Xp)) %*%
betas)) +
attr(Xp, 'model.offset')) ^ 1.5) *
as.vector(family_pars$phi)
} else {
out <- exp(((matrix(Xp, ncol = NCOL(Xp)) %*%
betas)) +
attr(Xp, 'model.offset'))
@@ -353,9 +349,9 @@ family_to_mgcvfam = function(family){
} else if(family$family == 'student'){
gaussian()
} else if(family$family == 'lognormal'){
mgcv::tw()
mgcv::Tweedie(p = 1.5, link = 'identity')
} else if(family$family == 'tweedie'){
mgcv::Tweedie(p=1.5)
mgcv::Tweedie(p = 1.5, link = 'log')
} else {
family
}
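The `variance` branches added throughout `mvgam_predict()` encode standard mean-variance relationships for each observation family. A compact, hypothetical helper (illustration only, not part of the package) collecting a subset of those relationships, with dispersion parameters assumed on their natural scale; note that `mu` is on the log (linear predictor) scale for the lognormal case and that the Student-t branch, as in the diff, returns only the degrees-of-freedom factor:

```r
# Hypothetical helper (illustration only): the mean-variance relationships
# used in the 'variance' branches above. `mu` is the fitted mean, except
# for 'lognormal' where it is the log-scale (linear predictor) mean;
# `phi`, `sigma` and `nu` are dispersion / scale / df parameters.
family_variance <- function(family, mu, phi = NULL, sigma = NULL, nu = NULL){
  switch(family,
         gaussian  = rep.int(1, length(mu)),                  # unit variance
         poisson   = mu,                                      # Var = mu
         nb        = mu + mu^2 / phi,                         # Var = mu + mu^2 / phi
         lognormal = (exp(sigma^2) - 1) * exp(2 * mu + sigma^2),
         beta      = mu * (1 - mu) / (1 + phi),
         tweedie   = mu^1.5 * phi,                            # power fixed at 1.5
         student   = nu / (pmax(2.01, nu) - 2))               # df factor only, as above
}

# Example: negative binomial variance at mu = c(1, 5, 20) with phi = 5
family_variance('nb', mu = c(1, 5, 20), phi = 5)
```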
6 changes: 4 additions & 2 deletions R/predict.mvgam.R
@@ -10,7 +10,8 @@
#'If \code{expected} is used, predictions reflect the expectation of the response (the mean)
#'but ignore uncertainty in the observation process. When \code{response} is used,
#'the predictions take uncertainty in the observation process into account to return
#'predictions on the outcome scale
#'predictions on the outcome scale. When \code{variance} is used, the family-level variance
#'of the response, as determined by the mean-variance relationship, is returned
#'@param process_error Logical. If \code{TRUE} and a dynamic trend model was fit,
#'expected uncertainty in the process model is accounted for by using draws
#'from the latent trend SD parameters. If \code{FALSE}, uncertainty in the latent trend
@@ -54,7 +55,8 @@ predict.mvgam = function(object, newdata,

type <- match.arg(arg = type, choices = c("link",
"expected",
"response"))
"response",
"variance"))

# If a linear predictor was supplied for the latent process models, calculate
# predictions by assuming the trend is stationary (this is basically what brms)
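With the new option registered in `match.arg()`, requesting the family-level variance from a fitted model is just another `type` value. A hedged usage sketch, assuming `fit` is an already-fitted `mvgam` object (not something created in this commit):

```r
# Hypothetical usage, assuming `fit` is a fitted mvgam object.
# 'variance' returns the family-level variance implied by each draw's
# expected value and dispersion parameters (the mean-variance relationship)
var_preds <- predict(fit, type = 'variance', process_error = FALSE)

# For comparison, predictions on the expectation scale
mu_preds <- predict(fit, type = 'expected', process_error = FALSE)
```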
2 changes: 1 addition & 1 deletion R/sim_mvgam.R
@@ -29,7 +29,7 @@
#'@param trend_rel Deprecated. Use `prop_trend` instead
#'@param freq \code{integer}. The seasonal frequency of the series
#'@param family \code{family} specifying the exponential observation family for the series. Currently supported
#'families are: `nb()`, `poisson()`, `tweedie()`, `gaussian()`, `betar()`, `lognormal()`, `student_t()` and `Gamma()`
#'families are: `nb()`, `poisson()`, `tweedie()`, `gaussian()`, `betar()`, `lognormal()`, `student()` and `Gamma()`
#'@param phi \code{vector} of dispersion parameters for the series (i.e. `size` for Negative Binomial or
#'`phi` for Tweedie or Beta). If \code{length(phi) < n_series}, the first element of `phi` will
#'be replicated `n_series` times. Defaults to \code{5} for Negative Binomial and Tweedie; \code{10} for
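Since the documentation above now lists the Student-t family among the supported observation families, a hedged simulation sketch (only parameters named in the documentation above are set explicitly; every other argument is assumed to keep its default):

```r
library(mvgam)

# Hypothetical usage sketch: simulate series with a Student-t observation
# process; `prop_trend` and `freq` are named in the documentation above,
# all other arguments are left at their assumed defaults
sim_dat <- sim_mvgam(family = student_t(),
                     prop_trend = 0.4,
                     freq = 12)
str(sim_dat, max.level = 1)
```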