diff --git a/NAMESPACE b/NAMESPACE index e7c766b2..6ecff560 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -22,6 +22,7 @@ export(tidy_cauchy) export(tidy_chisquare) export(tidy_combine_distributions) export(tidy_combined_autoplot) +export(tidy_distribution_comparison) export(tidy_distribution_summary_tbl) export(tidy_empirical) export(tidy_exponential) diff --git a/NEWS.md b/NEWS.md index ccb19659..fdb8a5fb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,7 +9,7 @@ None 2. Fix #187 - Add functions `ci_lo()` and `ci_hi()` 3. Fix #189 - Add function `tidy_bootstrap()` 4. Fix #190 - Add function `bootstrap_unnest_tbl()` -5. Fix +5. Fix #202 - Add function `tidy_distribution_comparison()` ## Minor Fixes and Improvements 1. Fix #176 - Update `_autoplot` functions to include cumulative mean MCMC chart diff --git a/R/util-distribution-comparison.R b/R/util-distribution-comparison.R new file mode 100644 index 00000000..6c1c02ea --- /dev/null +++ b/R/util-distribution-comparison.R @@ -0,0 +1,212 @@ +#' Compare Empirical Data to Distributions +#' +#' @family Empirical +#' +#' @author Steven P. Sanderson II, MPH +#' +#' @details The purpose of this function is to take some data set provided and +#' to try to find a distribution that may fit the best. A parameter of +#' `.distribution_type` must be set to either `continuous` or `discrete` in order +#' for this the function to try the appropriate types of distributions. +#' +#' The following distributions are used: +#' +#' Continuous: +#' - tidy_beta +#' - tidy_cauchy +#' - tidy_exponential +#' - tidy_gamma +#' - tidy_logistic +#' - tidy_lognormal +#' - tidy_pareto +#' - tidy_uniform +#' - tidy_weibull +#' +#' Discrete: +#' - tidy_binomial +#' - tidy_geometric +#' - tidy_hypergeometric +#' - tidy_poisson +#' +#' +#' @description Compare some empirical data set against different distributions +#' to help find the distribution that could be the best fit. +#' +#' @param .x The data set being passed to the function +#' @param .distribution_type What kind of data is it, can be one of `continuous` +#' or `discrete` +#' +#' @examples +#' xc <- mtcars$mpg +#' tidy_distribution_comparison(xc, "continuous") +#' +#' xd <- trunc(xc) +#' tidy_distribution_comparison(xd, "discrete") +#' +#' @return +#' An invisible list object. A tibble is printed. +#' +#' @export +#' + +tidy_distribution_comparison <- function(.x, .distribution_type = "continuous"){ + + # Tidyeval ---- + x_term <- as.numeric(.x) + n <- length(x_term) + dist_type <- tolower(as.character(.distribution_type)) + + if (!dist_type %in% c("continuous","discrete")){ + rlang::abort( + message = "The '.distribution_type' parameter must be either 'continuous' + or 'discrete'.", + use_cli_format = TRUE + ) + } + + # Get parameter estimates for distributions + if (dist_type == "continuous"){ + b <- util_beta_param_estimate(x_term)$parameter_tbl %>% + dplyr::filter(method == "NIST_MME") %>% + dplyr::select(dist_type, shape1, shape2) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + c <- util_cauchy_param_estimate(x_term)$parameter_tbl %>% + dplyr::select(dist_type, location, scale) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + e <- util_exponential_param_estimate(x_term)$parameter_tbl %>% + dplyr::select(dist_type, rate) %>% + dplyr::mutate(param_2 = NA) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + g <- util_gamma_param_estimate(x_term)$parameter_tbl %>% + dplyr::filter(method == "NIST_MME") %>% + dplyr::select(dist_type, shape, scale) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + l <- util_logistic_param_estimate(x_term)$parameter_tbl %>% + dplyr::filter(method == "EnvStats_MME") %>% + dplyr::select(dist_type, location, scale) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + ln <- util_lognormal_param_estimate(x_term)$parameter_tbl %>% + dplyr::filter(method == "EnvStats_MME") %>% + dplyr::select(dist_type, mean_log, sd_log) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + p <- util_pareto_param_estimate(x_term)$parameter_tbl %>% + dplyr::filter(method == "MLE") %>% + dplyr::select(dist_type, shape, scale) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + u <- util_uniform_param_estimate(x_term)$parameter_tbl %>% + dplyr::filter(method == "NIST_MME") %>% + dplyr::select(dist_type, min_est, max_est) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + w <- util_weibull_param_estimate(x_term)$parameter_tbl %>% + dplyr::select(dist_type, shape, scale) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + comp_tbl <- tidy_combine_distributions( + tidy_empirical(x_term, .distribution_type = dist_type), + tidy_beta(.n = n, .shape1 = round(b[[2]], 2), .shape2 = round(b[[3]], 2)), + tidy_cauchy(.n = n, .location = round(c[[2]], 2), .scale = round(c[[3]], 2)), + tidy_exponential(.n = n, .rate = round(e[[2]], 2)), + tidy_gamma(.n = n, .shape = round(g[[2]], 2), .scale = round(g[[3]], 2)), + tidy_logistic(.n = n, .location = round(l[[2]], 2), .scale = round(l[[3]], 2)), + tidy_lognormal(.n = n, .meanlog = round(ln[[2]], 2), .sdlog = round(ln[[3]], 2)), + tidy_pareto(.n = n, .shape = round(p[[2]], 2), .scale = round(p[[3]], 2)), + tidy_uniform(.n = n, .min = round(u[[2]], 2), .max = round(u[[3]], 2)), + tidy_weibull(.n = n, .shape = round(w[[2]], 2), .scale = round(w[[3]], 2)) + ) + + } else { + bn <- util_binomial_param_estimate(trunc(tidy_scale_zero_one_vec(x_term)))$parameter_tbl %>% + dplyr::select(dist_type, size, prob) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + ge <- util_geometric_param_estimate(x_term)$parameter_tbl %>% + dplyr::filter(method == "EnvStats_MME") %>% + dplyr::select(dist_type, shape) %>% + dplyr::mutate(param_2 = NA) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + h <- util_hypergeometric_param_estimate(.x = x_term, .total = n, .k = n)$parameter_tbl %>% + tidyr::drop_na() %>% + dplyr::slice(1) %>% + dplyr::select(dist_type, m, total) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + po <- util_poisson_param_estimate(x_term)$parameter_tbl %>% + dplyr::select(dist_type, lambda) %>% + dplyr::mutate(param_2 = NA) %>% + purrr::set_names("dist_type", "param_1", "param_2") + + comp_tbl <- tidy_combine_distributions( + tidy_empirical(.x = x_term, .distribution_type = dist_type), + tidy_binomial(.n = n, .size = round(bn[[2]], 2), .prob = round(bn[[3]], 2)), + tidy_geometric(.n = n, .prob = round(ge[[2]], 2)), + tidy_hypergeometric( + .n = n, + .m = trunc(h[[2]]), + .nn = n - trunc(h[[2]]), + .k = trunc(h[[2]]) + ), + tidy_poisson(.n = n, .lambda = round(po[[2]], 2)) + ) + + } + + # Deviance calculations ---- + deviance_tbl <- comp_tbl %>% + dplyr::select(dist_type, x, y) %>% + dplyr::group_by(dist_type, x) %>% + dplyr::mutate(y = stats::median(y)) %>% + dplyr::ungroup() %>% + dplyr::group_by(dist_type) %>% + dplyr::mutate(y = tidy_scale_zero_one_vec(y)) %>% + dplyr::ungroup() %>% + tidyr::pivot_wider( + id_cols = x, + names_from = dist_type, + values_from = y + ) %>% + dplyr::select(x, Empirical, dplyr::everything()) %>% + dplyr::mutate( + dplyr::across( + .cols = -c(x, Empirical), + .fns = ~ Empirical - . + ) + ) %>% + tidyr::drop_na() %>% + tidyr::pivot_longer( + cols = -x + ) + + total_deviance_tbl <- deviance_tbl %>% + dplyr::filter(!name == "Empirical") %>% + dplyr::group_by(name) %>% + dplyr::summarise(total_deviance = sum(value)) %>% + dplyr::ungroup() %>% + dplyr::mutate(total_deviance = abs(total_deviance)) %>% + dplyr::arrange(abs(total_deviance)) %>% + dplyr::rename(dist_with_params = name, + abs_tot_deviance = total_deviance) + + # Return ---- + attr(deviance_tbl, ".tibble_type") <- "deviance_comparison_tbl" + attr(total_deviance_tbl, ".tibble_type") <- "deviance_results_tbl" + + output <- list( + comparison_tbl = comp_tbl, + deviance_tbl = deviance_tbl, + total_deviance_tbl = total_deviance_tbl + ) + + return(invisible(output)) + + print(total_deviance_tbl) + +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 95c46c65..df7cdc00 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -29,6 +29,7 @@ reference: - title: Empirical Data Functions contents: - contains("_empirical") + - has_concept("Empirical") - title: Multiple Distribution Tibble desc: Generate multiple simulations of the same distribution with different parameters contents: diff --git a/man/tidy_distribution_comparison.Rd b/man/tidy_distribution_comparison.Rd new file mode 100644 index 00000000..89381ddd --- /dev/null +++ b/man/tidy_distribution_comparison.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/util-distribution-comparison.R +\name{tidy_distribution_comparison} +\alias{tidy_distribution_comparison} +\title{Compare Empirical Data to Distributions} +\usage{ +tidy_distribution_comparison(.x, .distribution_type = "continuous") +} +\arguments{ +\item{.x}{The data set being passed to the function} + +\item{.distribution_type}{What kind of data is it, can be one of \code{continuous} +or \code{discrete}} +} +\value{ +An invisible list object. A tibble is printed. +} +\description{ +Compare some empirical data set against different distributions +to help find the distribution that could be the best fit. +} +\details{ +The purpose of this function is to take some data set provided and +to try to find a distribution that may fit the best. A parameter of +\code{.distribution_type} must be set to either \code{continuous} or \code{discrete} in order +for this the function to try the appropriate types of distributions. + +The following distributions are used: + +Continuous: +\itemize{ +\item tidy_beta +\item tidy_cauchy +\item tidy_exponential +\item tidy_gamma +\item tidy_logistic +\item tidy_lognormal +\item tidy_pareto +\item tidy_uniform +\item tidy_weibull +} + +Discrete: +\itemize{ +\item tidy_binomial +\item tidy_geometric +\item tidy_hypergeometric +\item tidy_poisson +} +} +\examples{ +xc <- mtcars$mpg +tidy_distribution_comparison(xc, "continuous") + +xd <- trunc(xc) +tidy_distribution_comparison(xd, "discrete") + +} +\author{ +Steven P. Sanderson II, MPH +} +\concept{Empirical}