Adding distributions #106

Open · venpopov opened this issue Apr 12, 2024 · 5 comments

@venpopov (Contributor)

Currently we cannot add two distributions together:

library(distributional)
d <- dist_gamma(1,1) + dist_gamma(1,1)
density(d, 2)
#> Error in dist$inverse(at): Inverting transformations for distributions is not yet supported.

Created on 2024-04-12 with reprex v2.1.0

But this would be nice to have. The density of the sum of two independent random variables is the convolution of their densities. One way to implement this is to calculate the convolution on a grid of values and then create an approximation function that interpolates between grid points.
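Concretely, if Z = X + Y with X and Y independent, the density being approximated is

$$f_Z(z) = \int_{-\infty}^{\infty} f_X(x) \, f_Y(z - x) \, dx$$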

Here's an example function (not meant to be implemented as-is in the package, but to show how this works):

# Creates a function that gives the approximate density of the sum of two
# random variables via convolution of their densities.
# N_exponent is the exponent of the number of grid points (2^N_exponent);
# trunc is the quantile used to truncate unbounded supports.
conv_dens <- function(dist1, dist2, N_exponent = 14, trunc = 1e-6) {
  dist <- c(dist1, dist2)

  # grid limits: the support bounds, truncated at the trunc quantiles where infinite
  qlower <- quantile(dist, 0)
  qlower <- ifelse(qlower == -Inf, quantile(dist, trunc), qlower)
  qlower <- min(qlower)
  qupper <- quantile(dist, 1)
  qupper <- ifelse(qupper == Inf, quantile(dist, 1 - trunc), qupper)
  qupper <- max(qupper)

  N <- 2^N_exponent
  x <- seq(qlower, qupper, length.out = N)
  dx <- (qupper - qlower) / (N - 1)  # grid spacing

  # evaluate both densities on the grid and convolve; scaling by dx turns
  # the discrete convolution into an approximation of the integral
  dens <- density(dist, x)
  dens_z <- convolve(dens[[1]], rev(dens[[2]]), type = "open") * dx

  # the convolved density covers the range of possible sums
  xn <- seq(2 * qlower, 2 * qupper, length.out = N * 2 - 1)
  approxfun(xn, dens_z, rule = 2)
}

This function takes two distributional distributions and outputs a function that approximates the density of their convolution.
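For instance, a quick sanity check against a known closed form (the sum of two independent Gamma(1, 1) variables is Gamma(2, 1)):

f <- conv_dens(dist_gamma(1, 1), dist_gamma(1, 1))
f(2)             # approximate density of the sum at 2
dgamma(2, 2, 1)  # analytical density: 2 * exp(-2) ≈ 0.2707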

Here are a couple of examples, each plotting a histogram of sums of random draws from the two distributions, the approximated density, and the analytical density (if known):

plot_conv_dens <- function(dist1, dist2, analytical_dist = NULL, N_exponent = 14, xlim = c(-Inf, Inf)) {
  x <- generate(c(dist1,dist2), 100000)
  z <- x[[1]] + x[[2]]
  z <- z[z > xlim[1] & z < xlim[2]]
  hist(z, freq=FALSE, breaks=60, col = "grey90", border = "grey")
  fz <- conv_dens(dist1,dist2, N_exponent = N_exponent)
  
  zn <- seq(min(z),max(z),0.01)
  lines(zn, fz(zn), col="red")
  if (!is.null(analytical_dist)) {
    lines(zn, density(analytical_dist, zn)[[1]], col="blue", lty = 2)
    legend("topright", legend=c("empirical", "convolution", "analytical"), 
           col=c("black", "red", "blue"), lty=c(1,1,2))
  } else {
    legend("topright", legend=c("empirical", "convolution"), 
           col=c("black", "red"), lty=1)
  }
}

# visualization of adding two normals
dist1 <- dist_normal()
dist2 <- dist_normal()
plot_conv_dens(dist1, dist2, analytical_dist = dist_normal(0, sqrt(2)), N_exponent = 14)

# visualization of adding two gammas
dist1 <- dist_gamma(2, 2)
dist2 <- dist_gamma(2, 2)
plot_conv_dens(dist1, dist2, analytical_dist = dist_gamma(4, 2), N_exponent = 14)

# adding normal and lognormal
dist1 <- dist_normal()
dist2 <- dist_lognormal()
plot_conv_dens(dist1, dist2, N_exponent = 14, xlim = c(-5,20))

Created on 2024-04-12 with reprex v2.1.0

The trade-off between speed and approximation error can be controlled via N_exponent and the grid limits (the trunc argument):

approx_error <- function(dist1, dist2, analytical_dist, N_exponent = 14, trunc = 1e-6, xlim = c(-10,10)) {
  fz <- conv_dens(dist1, dist2, N_exponent = N_exponent, trunc = trunc)
  x <- seq(xlim[1],xlim[2],0.01)
  out <- data.frame(x = x, approx = fz(x), analytical = density(analytical_dist, x)[[1]])
  out$error <- out$approx - out$analytical
  out
}


# error in adding gammas
dist1 <- dist_gamma(2, 2)
dist2 <- dist_gamma(2, 2)
res <- approx_error(dist1, dist2, dist_gamma(4, 2), N_exponent = 14, xlim = c(0,10))
par(mfrow = c(1,2))
plot(res$x, res$error, type = "l", col = "red", xlab = "x", ylab = "absolute error")
plot(res$x, res$error / res$analytical, type = "l", col = "red", xlab = "x", ylab = "relative error")

# increase quantile truncation
res <- approx_error(dist1, dist2, dist_gamma(4, 2), N_exponent = 14, xlim = c(0,10), trunc = 1e-8)
plot(res$x, res$error, type = "l", col = "red", xlab = "x", ylab = "absolute error")
plot(res$x, res$error / res$analytical, type = "l", col = "red", xlab = "x", ylab = "relative error")

# decrease N_exponent
res <- approx_error(dist1, dist2, dist_gamma(4, 2), N_exponent = 5, xlim = c(0,10))
plot(res$x, res$error, type = "l", col = "red", xlab = "x", ylab = "absolute error")
plot(res$x, res$error / res$analytical, type = "l", col = "red", xlab = "x", ylab = "relative error")

Created on 2024-04-12 with reprex v2.1.0

Evaluation is naturally much slower than for the built-in distributions, but the cost does not depend on the number of evaluation points; it is dominated by constructing the initial density approximation:

# benchmark
library(microbenchmark)

dist1 <- dist_gamma(2, 2)
dist2 <- dist_gamma(2, 2)

microbenchmark(
  "single gamma" = density(dist1, 1),
  "N = 2^10" = conv_dens(dist1, dist2, N_exponent = 10)(1),
  "N = 2^14" = conv_dens(dist1, dist2, N_exponent = 14)(1)
)
#> Warning in microbenchmark(`single gamma` = density(dist1, 1), `N = 2^10` =
#> conv_dens(dist1, : less accurate nanosecond times to avoid potential integer
#> overflows
#> Unit: microseconds
#>          expr      min        lq       mean    median        uq       max neval
#>  single gamma   24.190   26.0145   30.27112   27.7980   30.9345    76.752   100
#>      N = 2^10  535.214  564.4265  701.69409  584.7215  607.3945 11424.568   100
#>      N = 2^14 7360.361 7544.4510 7984.87792 7669.7675 7983.4175 11559.417   100

at <- seq(0, 10, 0.01)
microbenchmark(
  "single gamma" = density(dist1, at),
  "N = 2^10" = conv_dens(dist1, dist2, N_exponent = 10)(at),
  "N = 2^14" = conv_dens(dist1, dist2, N_exponent = 14)(at)
)
#> Unit: microseconds
#>          expr      min        lq       mean    median       uq       max neval
#>  single gamma   66.789   68.5725   74.29528   70.8685   76.383   171.052   100
#>      N = 2^10  547.760  570.5150  677.67055  583.1020  610.367  2978.855   100
#>      N = 2^14 7302.018 7536.2510 7820.35312 7615.7910 7736.557 10931.666   100

Created on 2024-04-12 with reprex v2.1.0

Other operations

The above should work for + and -. Multiplication and division can be made to work for positive-valued distributions by first taking the logarithm of each distribution, computing the convolution density, and then exponentiating again.
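For example, a rough sketch of the product case (prod_dens is a hypothetical helper; this assumes log() on a distributional object yields a transformed distribution whose density() and quantile() work, so that conv_dens() applies to it):

prod_dens <- function(dist1, dist2, ...) {
  # density g of W = log(X) + log(Y) = log(X * Y), via the same convolution trick
  log_f <- conv_dens(log(dist1), log(dist2), ...)
  # change of variables: if Z = X * Y, then f_Z(z) = g(log(z)) / z for z > 0
  function(z) ifelse(z > 0, log_f(log(z)) / z, 0)
}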

What's needed to implement this

This will need a new dist_* class, since the logic is different from dist_transformed. Something like dist_convolved would do the trick, with a density method that uses the convolution logic above. For efficiency, the density approximation function would have to be stored within the dist object, so that it is not recomputed every time density() is called. The quantile, cdf, and generate methods will need more thought.
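As a rough illustration of the caching idea (a plain S3 sketch, not distributional's actual vctrs-based internals):

# build the approxfun once at construction time so that repeated
# density() calls reuse it instead of redoing the convolution
make_convolved <- function(dist1, dist2, N_exponent = 14) {
  pdf <- conv_dens(dist1, dist2, N_exponent = N_exponent)
  structure(list(dist = c(dist1, dist2), pdf = pdf), class = "dist_convolved_sketch")
}

density.dist_convolved_sketch <- function(x, at, ...) x$pdf(at)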

Thoughts?

@mitchelloharawild (Owner) commented Apr 18, 2024

Great idea, I had planned to create dist_convolved() when designing the package (as part of the mixture/transformed family) but haven't gotten around to it yet. I also wasn't sure if it fit the package, it starts to blur the idea of 'representing distributions' with modelling/methods (but I think implementing convolutions with the circular convolution theorem is ok).
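(For reference, the circular convolution theorem route is essentially what convolve() does internally; an explicit zero-padded version, just to illustrate the idea:)

fft_conv <- function(a, b) {
  # zero-pad to the linear-convolution length so circular wrap-around vanishes
  n <- length(a) + length(b) - 1
  a <- c(a, rep(0, n - length(a)))
  b <- c(b, rep(0, n - length(b)))
  # convolution theorem: conv = IFFT(FFT(a) * FFT(b));
  # R's inverse fft() is unnormalised, hence the division by n
  Re(fft(fft(a) * fft(b), inverse = TRUE)) / n
}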

A few suggestions:

  • Rather than N_exponent, simply n = 512 as the default should be fine for speed and consistency with stats::density().

A question:

  • What do you think of using other approximations of the density, rather than linear approximation over the grid? Something like a Gaussian mixture model? It's another model-like step and will add some time to the setup, but then we inherit all the methods from dist_mixture() and dist_normal(), and each distribution stores fewer parameters. Pinging @robjhyndman for his opinions too.

Would you like to start with creating dist_convolution(x, y) with a method for using the density?

I was also thinking of using a table of common convolutions, to produce something similar to how exp(dist_normal()) gives dist_lognormal() for convolutions. I've already done this for dist_normal() + dist_normal() in Ops.dist_normal for instance, but I think this should be moved into a lookup table in Ops.dist_default for when the operation involves two distributions.
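As a toy illustration of what such a lookup could look like (hypothetical structure, not the package's code; mean() and variance() are distributional's existing generics):

convolution_table <- list(
  # sum of independent normals: means add, variances add
  "dist_normal+dist_normal" = function(x, y) {
    dist_normal(mean(x) + mean(y), sqrt(variance(x) + variance(y)))
  }
)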

@robjhyndman (Contributor)

While Gaussian mixtures can, in theory, be used to approximate any smooth distribution, finding the mixture components can be computationally challenging. So I think using a numerical linear approximation of the cdf over a grid will be faster and more reliable.
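For instance, the grid already built inside conv_dens() could, in principle, also yield cdf and quantile approximations (hypothetical sketch reusing xn, dens_z, and dx from that function):

p  <- pmin(cumsum(dens_z) * dx, 1)                  # gridded cdf via cumulative sums
Fz <- approxfun(xn, p, rule = 2)                    # cdf
Qz <- approxfun(p, xn, rule = 2, ties = "ordered")  # quantile (inverse of the cdf)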

@mitchelloharawild added this to the v1.0.0 milestone Apr 19, 2024
@venpopov (Contributor, Author)

> Would you like to start with creating dist_convolution(x, y) with a method for using the density?

Glad you like it. I won't be able to start on this for a few weeks - lots going on at the moment. If you are not in a rush, I can probably get to it in about a month. If you want, you can also start in the meantime.

I think a table of common convolutions makes a lot of sense.

@mitchelloharawild (Owner) commented Apr 19, 2024

No worries, I'll take this one on then. I'm hoping to submit a CRAN release in ~3 weeks (not necessarily v1.0.0), but having symbolic derivatives and some convolutions would be useful for some upstream packages.

If you could review the changes before I submit to CRAN that would be appreciated!

@venpopov (Contributor, Author)

Happy to review
