Skip to content

Commit

Permalink
Fixes #202
Browse files Browse the repository at this point in the history
  • Loading branch information
spsanderson committed Jun 7, 2022
1 parent 8def42e commit 3cd8dd6
Show file tree
Hide file tree
Showing 5 changed files with 277 additions and 1 deletion.
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ export(tidy_cauchy)
export(tidy_chisquare)
export(tidy_combine_distributions)
export(tidy_combined_autoplot)
export(tidy_distribution_comparison)
export(tidy_distribution_summary_tbl)
export(tidy_empirical)
export(tidy_exponential)
Expand Down
2 changes: 1 addition & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ None
2. Fix #187 - Add functions `ci_lo()` and `ci_hi()`
3. Fix #189 - Add function `tidy_bootstrap()`
4. Fix #190 - Add function `bootstrap_unnest_tbl()`
5. Fix
5. Fix #202 - Add function `tidy_distribution_comparison()`

## Minor Fixes and Improvements
1. Fix #176 - Update `_autoplot` functions to include cumulative mean MCMC chart
Expand Down
212 changes: 212 additions & 0 deletions R/util-distribution-comparison.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,212 @@
#' Compare Empirical Data to Distributions
#'
#' @family Empirical
#'
#' @author Steven P. Sanderson II, MPH
#'
#' @details The purpose of this function is to take some data set provided and
#' to try to find a distribution that may fit the best. A parameter of
#' `.distribution_type` must be set to either `continuous` or `discrete` in order
#' for this the function to try the appropriate types of distributions.
#'
#' The following distributions are used:
#'
#' Continuous:
#' - tidy_beta
#' - tidy_cauchy
#' - tidy_exponential
#' - tidy_gamma
#' - tidy_logistic
#' - tidy_lognormal
#' - tidy_pareto
#' - tidy_uniform
#' - tidy_weibull
#'
#' Discrete:
#' - tidy_binomial
#' - tidy_geometric
#' - tidy_hypergeometric
#' - tidy_poisson
#'
#'
#' @description Compare some empirical data set against different distributions
#' to help find the distribution that could be the best fit.
#'
#' @param .x The data set being passed to the function
#' @param .distribution_type What kind of data is it, can be one of `continuous`
#' or `discrete`
#'
#' @examples
#' xc <- mtcars$mpg
#' tidy_distribution_comparison(xc, "continuous")
#'
#' xd <- trunc(xc)
#' tidy_distribution_comparison(xd, "discrete")
#'
#' @return
#' An invisible list object. A tibble is printed.
#'
#' @export
#'

tidy_distribution_comparison <- function(.x, .distribution_type = "continuous"){

# Tidyeval ----
x_term <- as.numeric(.x)
n <- length(x_term)
dist_type <- tolower(as.character(.distribution_type))

if (!dist_type %in% c("continuous","discrete")){
rlang::abort(
message = "The '.distribution_type' parameter must be either 'continuous'
or 'discrete'.",
use_cli_format = TRUE
)
}

# Get parameter estimates for distributions
if (dist_type == "continuous"){
b <- util_beta_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, shape1, shape2) %>%
purrr::set_names("dist_type", "param_1", "param_2")

c <- util_cauchy_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, location, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2")

e <- util_exponential_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, rate) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2")

g <- util_gamma_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2")

l <- util_logistic_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, location, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2")

ln <- util_lognormal_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, mean_log, sd_log) %>%
purrr::set_names("dist_type", "param_1", "param_2")

p <- util_pareto_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "MLE") %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2")

u <- util_uniform_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "NIST_MME") %>%
dplyr::select(dist_type, min_est, max_est) %>%
purrr::set_names("dist_type", "param_1", "param_2")

w <- util_weibull_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, shape, scale) %>%
purrr::set_names("dist_type", "param_1", "param_2")

comp_tbl <- tidy_combine_distributions(
tidy_empirical(x_term, .distribution_type = dist_type),
tidy_beta(.n = n, .shape1 = round(b[[2]], 2), .shape2 = round(b[[3]], 2)),
tidy_cauchy(.n = n, .location = round(c[[2]], 2), .scale = round(c[[3]], 2)),
tidy_exponential(.n = n, .rate = round(e[[2]], 2)),
tidy_gamma(.n = n, .shape = round(g[[2]], 2), .scale = round(g[[3]], 2)),
tidy_logistic(.n = n, .location = round(l[[2]], 2), .scale = round(l[[3]], 2)),
tidy_lognormal(.n = n, .meanlog = round(ln[[2]], 2), .sdlog = round(ln[[3]], 2)),
tidy_pareto(.n = n, .shape = round(p[[2]], 2), .scale = round(p[[3]], 2)),
tidy_uniform(.n = n, .min = round(u[[2]], 2), .max = round(u[[3]], 2)),
tidy_weibull(.n = n, .shape = round(w[[2]], 2), .scale = round(w[[3]], 2))
)

} else {
bn <- util_binomial_param_estimate(trunc(tidy_scale_zero_one_vec(x_term)))$parameter_tbl %>%
dplyr::select(dist_type, size, prob) %>%
purrr::set_names("dist_type", "param_1", "param_2")

ge <- util_geometric_param_estimate(x_term)$parameter_tbl %>%
dplyr::filter(method == "EnvStats_MME") %>%
dplyr::select(dist_type, shape) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2")

h <- util_hypergeometric_param_estimate(.x = x_term, .total = n, .k = n)$parameter_tbl %>%
tidyr::drop_na() %>%
dplyr::slice(1) %>%
dplyr::select(dist_type, m, total) %>%
purrr::set_names("dist_type", "param_1", "param_2")

po <- util_poisson_param_estimate(x_term)$parameter_tbl %>%
dplyr::select(dist_type, lambda) %>%
dplyr::mutate(param_2 = NA) %>%
purrr::set_names("dist_type", "param_1", "param_2")

comp_tbl <- tidy_combine_distributions(
tidy_empirical(.x = x_term, .distribution_type = dist_type),
tidy_binomial(.n = n, .size = round(bn[[2]], 2), .prob = round(bn[[3]], 2)),
tidy_geometric(.n = n, .prob = round(ge[[2]], 2)),
tidy_hypergeometric(
.n = n,
.m = trunc(h[[2]]),
.nn = n - trunc(h[[2]]),
.k = trunc(h[[2]])
),
tidy_poisson(.n = n, .lambda = round(po[[2]], 2))
)

}

# Deviance calculations ----
deviance_tbl <- comp_tbl %>%
dplyr::select(dist_type, x, y) %>%
dplyr::group_by(dist_type, x) %>%
dplyr::mutate(y = stats::median(y)) %>%
dplyr::ungroup() %>%
dplyr::group_by(dist_type) %>%
dplyr::mutate(y = tidy_scale_zero_one_vec(y)) %>%
dplyr::ungroup() %>%
tidyr::pivot_wider(
id_cols = x,
names_from = dist_type,
values_from = y
) %>%
dplyr::select(x, Empirical, dplyr::everything()) %>%
dplyr::mutate(
dplyr::across(
.cols = -c(x, Empirical),
.fns = ~ Empirical - .
)
) %>%
tidyr::drop_na() %>%
tidyr::pivot_longer(
cols = -x
)

total_deviance_tbl <- deviance_tbl %>%
dplyr::filter(!name == "Empirical") %>%
dplyr::group_by(name) %>%
dplyr::summarise(total_deviance = sum(value)) %>%
dplyr::ungroup() %>%
dplyr::mutate(total_deviance = abs(total_deviance)) %>%
dplyr::arrange(abs(total_deviance)) %>%
dplyr::rename(dist_with_params = name,
abs_tot_deviance = total_deviance)

# Return ----
attr(deviance_tbl, ".tibble_type") <- "deviance_comparison_tbl"
attr(total_deviance_tbl, ".tibble_type") <- "deviance_results_tbl"

output <- list(
comparison_tbl = comp_tbl,
deviance_tbl = deviance_tbl,
total_deviance_tbl = total_deviance_tbl
)

return(invisible(output))

print(total_deviance_tbl)

}
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ reference:
- title: Empirical Data Functions
contents:
- contains("_empirical")
- has_concept("Empirical")
- title: Multiple Distribution Tibble
desc: Generate multiple simulations of the same distribution with different parameters
contents:
Expand Down
62 changes: 62 additions & 0 deletions man/tidy_distribution_comparison.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 3cd8dd6

Please sign in to comment.