From dba16de0f8657b33f4b531725be5da57a29e14d7 Mon Sep 17 00:00:00 2001 From: abichat Date: Tue, 5 Mar 2024 14:55:28 +0100 Subject: [PATCH 1/3] fix tidy for aggregate_list when not trained --- R/aggregate_list.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/R/aggregate_list.R b/R/aggregate_list.R index f65602e..6a847af 100644 --- a/R/aggregate_list.R +++ b/R/aggregate_list.R @@ -168,8 +168,7 @@ tidy.step_aggregate_list <- function(x, ...) { res <- tibble( terms = term_names, - pv = rlang::na_dbl, - kept = rlang::na_lgl + aggregate = rlang::na_chr ) } # Always return the step id: From f29df5c2191679751ae1afb9f482501f4fd2f1d6 Mon Sep 17 00:00:00 2001 From: abichat Date: Tue, 5 Mar 2024 15:38:13 +0100 Subject: [PATCH 2/3] readme seed --- README.Rmd | 4 ++++ README.md | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/README.Rmd b/README.Rmd index f6b2961..49749f5 100644 --- a/README.Rmd +++ b/README.Rmd @@ -56,6 +56,10 @@ list_family <- split(cheese_taxonomy$asv, cheese_taxonomy$family) head(list_family, 2) ``` +```{r seed, echo=FALSE} +set.seed(42) +``` + The following recipe will 1. aggregate the ASV variables at the family level, as defined by `list_family`; diff --git a/README.md b/README.md index cedebf0..c123cbc 100644 --- a/README.md +++ b/README.md @@ -137,19 +137,19 @@ tidy(rec, 3) #> # A tibble: 13 × 4 #> terms pv kept id #> -#> 1 Aspergillaceae 0.0608 FALSE select_kruskal_rtUAJ -#> 2 Debaryomycetaceae 0.0273 TRUE select_kruskal_rtUAJ -#> 3 Dipodascaceae 0.0273 TRUE select_kruskal_rtUAJ -#> 4 Dothioraceae 0.101 FALSE select_kruskal_rtUAJ -#> 5 Lichtheimiaceae 0.276 FALSE select_kruskal_rtUAJ -#> 6 Metschnikowiaceae 0.0509 FALSE select_kruskal_rtUAJ -#> 7 Mucoraceae 0.0608 FALSE select_kruskal_rtUAJ -#> 8 Phaffomycetaceae 0.0794 FALSE select_kruskal_rtUAJ -#> 9 Saccharomycetaceae 0.0273 TRUE select_kruskal_rtUAJ -#> 10 Saccharomycetales fam Incertae sedis 0.0221 TRUE select_kruskal_rtUAJ -#> 11 Trichomonascaceae 0.0625 FALSE select_kruskal_rtUAJ -#> 12 Trichosporonaceae 0.0273 TRUE select_kruskal_rtUAJ -#> 13 Wickerhamomyceteae 0.177 FALSE select_kruskal_rtUAJ +#> 1 Aspergillaceae 0.0608 FALSE select_kruskal_WKayj +#> 2 Debaryomycetaceae 0.0273 TRUE select_kruskal_WKayj +#> 3 Dipodascaceae 0.0273 TRUE select_kruskal_WKayj +#> 4 Dothioraceae 0.101 FALSE select_kruskal_WKayj +#> 5 Lichtheimiaceae 0.276 FALSE select_kruskal_WKayj +#> 6 Metschnikowiaceae 0.0509 FALSE select_kruskal_WKayj +#> 7 Mucoraceae 0.0608 FALSE select_kruskal_WKayj +#> 8 Phaffomycetaceae 0.0794 FALSE select_kruskal_WKayj +#> 9 Saccharomycetaceae 0.0273 TRUE select_kruskal_WKayj +#> 10 Saccharomycetales fam Incertae sedis 0.0221 TRUE select_kruskal_WKayj +#> 11 Trichomonascaceae 0.0625 FALSE select_kruskal_WKayj +#> 12 Trichosporonaceae 0.0273 TRUE select_kruskal_WKayj +#> 13 Wickerhamomyceteae 0.177 FALSE select_kruskal_WKayj ``` ## Notes From 5f711eeaf97e20936ee4c62aad130776e8113c9d Mon Sep 17 00:00:00 2001 From: abichat Date: Tue, 5 Mar 2024 17:52:04 +0100 Subject: [PATCH 3/3] step_aggregate_hclust --- NAMESPACE | 8 + R/aggregate_hclust.R | 196 +++++++++++++++++++++++++ R/aggregate_list.R | 5 +- _pkgdown.yml | 1 + dev/dev_history.R | 3 + man/step_aggregate_hclust.Rd | 89 +++++++++++ man/step_aggregate_list.Rd | 2 +- tests/testthat/test-aggregate_hclust.R | 93 ++++++++++++ 8 files changed, 394 insertions(+), 3 deletions(-) create mode 100644 R/aggregate_hclust.R create mode 100644 man/step_aggregate_hclust.Rd create mode 100644 tests/testthat/test-aggregate_hclust.R diff --git a/NAMESPACE b/NAMESPACE index cb873a3..971df69 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,23 +1,27 @@ # Generated by roxygen2: do not edit by hand +S3method(bake,step_aggregate_hclust) S3method(bake,step_aggregate_list) S3method(bake,step_rownormalize_tss) S3method(bake,step_select_background) S3method(bake,step_select_cv) S3method(bake,step_select_kruskal) S3method(bake,step_select_wilcoxon) +S3method(prep,step_aggregate_hclust) S3method(prep,step_aggregate_list) S3method(prep,step_rownormalize_tss) S3method(prep,step_select_background) S3method(prep,step_select_cv) S3method(prep,step_select_kruskal) S3method(prep,step_select_wilcoxon) +S3method(print,step_aggregate_hclust) S3method(print,step_aggregate_list) S3method(print,step_rownormalize_tss) S3method(print,step_select_background) S3method(print,step_select_cv) S3method(print,step_select_kruskal) S3method(print,step_select_wilcoxon) +S3method(tidy,step_aggregate_hclust) S3method(tidy,step_aggregate_list) S3method(tidy,step_rownormalize_tss) S3method(tidy,step_select_background) @@ -26,6 +30,7 @@ S3method(tidy,step_select_kruskal) S3method(tidy,step_select_wilcoxon) export("%>%") export(cv) +export(step_aggregate_hclust) export(step_aggregate_list) export(step_rownormalize_tss) export(step_select_background) @@ -54,6 +59,9 @@ importFrom(rlang,.data) importFrom(rlang,abort) importFrom(rlang,enquos) importFrom(stats,as.formula) +importFrom(stats,cutree) +importFrom(stats,dist) +importFrom(stats,hclust) importFrom(stats,kruskal.test) importFrom(stats,p.adjust) importFrom(stats,sd) diff --git a/R/aggregate_hclust.R b/R/aggregate_hclust.R new file mode 100644 index 0000000..3b8cf5f --- /dev/null +++ b/R/aggregate_hclust.R @@ -0,0 +1,196 @@ +#' Feature aggregation step based on a hierarchical clustering. +#' +#' Aggregate variables according to hierarchical clustering. +#' +#' @param recipe A recipe object. The step will be added to the sequence of +#' operations for this recipe. +#' @param ... One or more selector functions to choose variables +#' for this step. See [selections()] for more details. +#' @param role For model terms created by this step, what analysis role should +#' they be assigned? By default, the new columns created by this step from +#' the original variables will be used as `predictors` in a model. +#' @param trained A logical to indicate if the quantities for preprocessing +#' have been estimated. +#' @param n_clusters Number of cluster to create. +#' @param fun_agg Aggregation function like `sum` or `mean`. +#' @param dist_metric Default to `euclidean`. See [stats::dist()] for more +#' details. +#' @param linkage_method Deault to `complete`. See [stats::hclust()] for more +#' details. +#' @param res This parameter is only produced after the recipe has been trained. +#' @param prefix A character string for the prefix of the resulting new +#' variables. +#' @param keep_original_cols A logical to keep the original variables in +#' the output. Defaults to `FALSE`. +#' @param skip A logical. Should the step be skipped when the +#' recipe is baked by [bake()]? While all operations are baked +#' when [prep()] is run, some operations may not be able to be +#' conducted on new data (e.g. processing the outcome variable(s)). +#' Care should be taken when using `skip = TRUE` as it may affect +#' the computations for subsequent operations. +#' @param id A character string that is unique to this step to identify it. +#' +#' @return An updated version of recipe with the new step added to the +#' sequence of any existing operations. +#' +#' @export +#' +#' @importFrom recipes add_step rand_id +#' @importFrom rlang enquos +#' +#' @author Antoine Bichat +#' +#' @examples +#' rec <- +#' iris %>% +#' recipe(formula = Species ~ .) %>% +#' step_aggregate_hclust(all_numeric_predictors(), +#' n_clusters = 2, fun_agg = sum) %>% +#' prep() +#' rec +#' tidy(rec, 1) +#' juice(rec) +step_aggregate_hclust <- function(recipe, ..., role = "predictor", + trained = FALSE, + n_clusters, + fun_agg, + dist_metric = "euclidean", + linkage_method = "complete", + res = NULL, + prefix = "cl_", + keep_original_cols = FALSE, + skip = FALSE, + id = rand_id("aggregate_hclust")) { + + add_step( + recipe, + step_aggregate_hclust_new( + terms = enquos(...), + role = role, + trained = trained, + n_clusters = n_clusters, + fun_agg = fun_agg, + dist_metric = dist_metric, + linkage_method = linkage_method, + res = res, + prefix = prefix, + keep_original_cols = keep_original_cols, + skip = skip, + id = id + ) + ) +} + +#' @importFrom recipes step +step_aggregate_hclust_new <- function(terms, role, trained, + n_clusters, fun_agg, + dist_metric, linkage_method, + res, prefix, keep_original_cols, + skip, id) { + + step(subclass = "aggregate_hclust", + terms = terms, + role = role, + trained = trained, + n_clusters = n_clusters, + fun_agg = fun_agg, + dist_metric = dist_metric, + linkage_method = linkage_method, + res = res, + prefix = prefix, + keep_original_cols = keep_original_cols, + skip = skip, + id = id) +} + +#' @export +#' @importFrom dplyr mutate +#' @importFrom recipes recipes_eval_select +#' @importFrom rlang .data +#' @importFrom stats cutree dist hclust +#' @importFrom tibble enframe +prep.step_aggregate_hclust <- function(x, training, info = NULL, ...) { + col_names <- recipes_eval_select(x$terms, training, info) + check_type(training[, col_names], quant = TRUE) + + ct <- + training[, col_names] %>% + as.matrix() %>% + t() %>% + dist(method = x$dist_metric) %>% + hclust(method = x$linkage_method) %>% + cutree(k = x$n_clusters) + + res_agg_hclust <- + ct %>% + enframe(name = "terms", value = "aggregate") %>% + mutate(aggregate = paste0(x$prefix, .data$aggregate)) + + + step_aggregate_hclust_new( + terms = x$terms, + role = x$role, + trained = TRUE, + n_clusters = x$n_clusters, + fun_agg = x$fun_agg, + dist_metric = x$dist_metric, + linkage_method = x$linkage_method, + res = res_agg_hclust, + prefix = x$prefix, + keep_original_cols = x$keep_original_cols, + skip = x$skip, + id = x$id + ) +} + + +#' @export +#' @importFrom recipes check_new_data +bake.step_aggregate_hclust <- function(object, new_data, ...) { + col_names <- object$res$terms + check_new_data(col_names, object, new_data) + + list_agg_hc <- split(object$res$terms, object$res$aggregate) + + aggregate_var(new_data, list_agg = list_agg_hc, fun_agg = object$fun_agg, + prefix = object$prefix, + keep_original_cols = object$keep_original_cols) +} + +#' @export +#' @importFrom recipes print_step +print.step_aggregate_hclust <- + function(x, width = max(20, options()$width - 35), ...) { + title <- paste("`hclust` aggregation of ") + + print_step( + tr_obj = x$res$terms, + untr_obj = x$terms, + trained = x$trained, + title = title, + width = width + ) + invisible(x) + } + + +#' @rdname step_aggregate_hclust +#' @param x A `step_aggregate_hclust` object. +#' @export +#' @importFrom recipes is_trained sel2char +#' @importFrom tibble tibble +tidy.step_aggregate_hclust <- function(x, ...) { + if (is_trained(x)) { + res <- x$res + } else { + term_names <- sel2char(x$terms) + res <- + tibble( + terms = term_names, + aggregate = rlang::na_chr + ) + } + # Always return the step id: + res$id <- x$id + res +} diff --git a/R/aggregate_list.R b/R/aggregate_list.R index 6a847af..8655d47 100644 --- a/R/aggregate_list.R +++ b/R/aggregate_list.R @@ -12,7 +12,7 @@ #' @param trained A logical to indicate if the quantities for preprocessing #' have been estimated. #' @param list_agg Named list of . -#' @param fun_agg Function . +#' @param fun_agg Aggregation function like `sum` or `mean`. #' @param res This parameter is only produced after the recipe has been trained. #' @param prefix A character string for the prefix of the resulting new #' variables that are not named in `list_agg`. @@ -48,7 +48,8 @@ #' rec #' tidy(rec, 1) #' juice(rec) -step_aggregate_list <- function(recipe, ..., role = "predictor", trained = FALSE, +step_aggregate_list <- function(recipe, ..., role = "predictor", + trained = FALSE, list_agg = NULL, fun_agg = NULL, res = NULL, diff --git a/_pkgdown.yml b/_pkgdown.yml index 29da09e..ddc23e3 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -12,6 +12,7 @@ reference: - title: Feature Aggregation Steps contents: - '`step_aggregate_list`' + - '`step_aggregate_hclust`' - title: Feature Normalization Steps contents: - '`step_rownormalize_tss`' diff --git a/dev/dev_history.R b/dev/dev_history.R index 5213665..a77b1ab 100644 --- a/dev/dev_history.R +++ b/dev/dev_history.R @@ -59,6 +59,9 @@ library(testthat) # use_test("aggregate_list") +# use_r("aggregate_hclust") +# use_test("aggregate_hclust") + #### devtools::load_all() diff --git a/man/step_aggregate_hclust.Rd b/man/step_aggregate_hclust.Rd new file mode 100644 index 0000000..45a5d44 --- /dev/null +++ b/man/step_aggregate_hclust.Rd @@ -0,0 +1,89 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/aggregate_hclust.R +\name{step_aggregate_hclust} +\alias{step_aggregate_hclust} +\alias{tidy.step_aggregate_hclust} +\title{Feature aggregation step based on a hierarchical clustering.} +\usage{ +step_aggregate_hclust( + recipe, + ..., + role = "predictor", + trained = FALSE, + n_clusters, + fun_agg, + dist_metric = "euclidean", + linkage_method = "complete", + res = NULL, + prefix = "cl_", + keep_original_cols = FALSE, + skip = FALSE, + id = rand_id("aggregate_hclust") +) + +\method{tidy}{step_aggregate_hclust}(x, ...) +} +\arguments{ +\item{recipe}{A recipe object. The step will be added to the sequence of +operations for this recipe.} + +\item{...}{One or more selector functions to choose variables +for this step. See \code{\link[=selections]{selections()}} for more details.} + +\item{role}{For model terms created by this step, what analysis role should +they be assigned? By default, the new columns created by this step from +the original variables will be used as \code{predictors} in a model.} + +\item{trained}{A logical to indicate if the quantities for preprocessing +have been estimated.} + +\item{n_clusters}{Number of cluster to create.} + +\item{fun_agg}{Aggregation function like \code{sum} or \code{mean}.} + +\item{dist_metric}{Default to \code{euclidean}. See \code{\link[stats:dist]{stats::dist()}} for more +details.} + +\item{linkage_method}{Deault to \code{complete}. See \code{\link[stats:hclust]{stats::hclust()}} for more +details.} + +\item{res}{This parameter is only produced after the recipe has been trained.} + +\item{prefix}{A character string for the prefix of the resulting new +variables.} + +\item{keep_original_cols}{A logical to keep the original variables in +the output. Defaults to \code{FALSE}.} + +\item{skip}{A logical. Should the step be skipped when the +recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked +when \code{\link[=prep]{prep()}} is run, some operations may not be able to be +conducted on new data (e.g. processing the outcome variable(s)). +Care should be taken when using \code{skip = TRUE} as it may affect +the computations for subsequent operations.} + +\item{id}{A character string that is unique to this step to identify it.} + +\item{x}{A \code{step_aggregate_hclust} object.} +} +\value{ +An updated version of recipe with the new step added to the +sequence of any existing operations. +} +\description{ +Aggregate variables according to hierarchical clustering. +} +\examples{ +rec <- + iris \%>\% + recipe(formula = Species ~ .) \%>\% + step_aggregate_hclust(all_numeric_predictors(), + n_clusters = 2, fun_agg = sum) \%>\% + prep() +rec +tidy(rec, 1) +juice(rec) +} +\author{ +Antoine Bichat +} diff --git a/man/step_aggregate_list.Rd b/man/step_aggregate_list.Rd index fabbeb6..1cd2a38 100644 --- a/man/step_aggregate_list.Rd +++ b/man/step_aggregate_list.Rd @@ -37,7 +37,7 @@ have been estimated.} \item{list_agg}{Named list of .} -\item{fun_agg}{Function .} +\item{fun_agg}{Aggregation function like \code{sum} or \code{mean}.} \item{res}{This parameter is only produced after the recipe has been trained.} diff --git a/tests/testthat/test-aggregate_hclust.R b/tests/testthat/test-aggregate_hclust.R new file mode 100644 index 0000000..ae2f8a9 --- /dev/null +++ b/tests/testthat/test-aggregate_hclust.R @@ -0,0 +1,93 @@ +library(dplyr) + +data("cheese_abundance") + +nc <- sample(2:9, size = 1) +new_names <- paste0("hc_", 1:nc) + + +clustering <- + cheese_abundance %>% + select(where(is.numeric)) %>% + as.matrix() %>% + t() %>% + dist(method = "manhattan") %>% + hclust(method = "ward.D2") %>% + cutree(k = nc) + +rnd_asv <- sample(names(clustering), size = 1) +rnd_asv_group <- names(clustering[clustering == clustering[rnd_asv]]) + + +test_that("step_aggregate_hclust works", { + + rec <- + recipe(~ ., data = cheese_abundance) %>% + step_aggregate_hclust(all_numeric_predictors(), + n_clusters = nc, fun_agg = sum, + prefix = "hc_", dist_metric = "manhattan", + linkage_method = "ward.D2") + + expect_equal(nrow(tidy(rec, 1)), 1) + + prepped <- prep(rec) + hclust_tidy <- tidy(prepped, 1) + + expect_equal(nrow(hclust_tidy), ncol(cheese_abundance) - 3) + + + # check set equality on random cluster + expect_setequal( + hclust_tidy %>% + filter(aggregate %in% + filter(hclust_tidy, terms == rnd_asv)$aggregate) %>% + pull(terms), + rnd_asv_group) + + # check equality on groups cardinal + expect_setequal( + hclust_tidy %>% + count(aggregate) %>% + pull(n) %>% + sort(), + clustering %>% + table() %>% + sort() %>% + unname()) + + + expect_setequal(summary(prepped)$role, "predictor") + + baked <- bake(prepped, new_data = NULL) + + expect_equal(colnames(baked), + c("sample", "cheese", "rind_type", new_names)) + expect_equal(rowSums(baked[, -c(1:3)]), rowSums(cheese_abundance[, -c(1:3)])) + + cl <- sample(new_names, size = 1) + asv_in_cl <- + hclust_tidy %>% + filter(aggregate == cl) %>% + pull(terms) + + + expect_equal(baked[[cl]], + cheese_abundance %>% + select(all_of(asv_in_cl)) %>% + rowSums()) + + ## keep_original_cols + + baked2 <- + recipe(~ ., data = cheese_abundance) %>% + step_aggregate_hclust(all_numeric_predictors(), + n_clusters = nc, fun_agg = sum, + prefix = "hc_", dist_metric = "manhattan", + linkage_method = "ward.D2", + keep_original_cols = TRUE) %>% + prep() %>% + juice() + + expect_equal(colnames(baked2), + c(colnames(cheese_abundance), new_names)) +})