From dba16de0f8657b33f4b531725be5da57a29e14d7 Mon Sep 17 00:00:00 2001
From: abichat <a.bichat@yahoo.fr>
Date: Tue, 5 Mar 2024 14:55:28 +0100
Subject: [PATCH 1/3] fix tidy for aggregate_list when not trained

---
 R/aggregate_list.R | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/R/aggregate_list.R b/R/aggregate_list.R
index f65602e..6a847af 100644
--- a/R/aggregate_list.R
+++ b/R/aggregate_list.R
@@ -168,8 +168,7 @@ tidy.step_aggregate_list <- function(x, ...) {
     res <-
       tibble(
         terms = term_names,
-        pv = rlang::na_dbl,
-        kept = rlang::na_lgl
+        aggregate = rlang::na_chr
       )
   }
   # Always return the step id:

From f29df5c2191679751ae1afb9f482501f4fd2f1d6 Mon Sep 17 00:00:00 2001
From: abichat <a.bichat@yahoo.fr>
Date: Tue, 5 Mar 2024 15:38:13 +0100
Subject: [PATCH 2/3] readme seed

---
 README.Rmd |  4 ++++
 README.md  | 26 +++++++++++++-------------
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/README.Rmd b/README.Rmd
index f6b2961..49749f5 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -56,6 +56,10 @@ list_family <- split(cheese_taxonomy$asv, cheese_taxonomy$family)
 head(list_family, 2)
 ```
 
+```{r seed, echo=FALSE}
+set.seed(42)
+```
+
 The following recipe will 
 
 1. aggregate the ASV variables at the family level, as defined by `list_family`;
diff --git a/README.md b/README.md
index cedebf0..c123cbc 100644
--- a/README.md
+++ b/README.md
@@ -137,19 +137,19 @@ tidy(rec, 3)
 #> # A tibble: 13 × 4
 #>    terms                                    pv kept  id                  
 #>    <chr>                                 <dbl> <lgl> <chr>               
-#>  1 Aspergillaceae                       0.0608 FALSE select_kruskal_rtUAJ
-#>  2 Debaryomycetaceae                    0.0273 TRUE  select_kruskal_rtUAJ
-#>  3 Dipodascaceae                        0.0273 TRUE  select_kruskal_rtUAJ
-#>  4 Dothioraceae                         0.101  FALSE select_kruskal_rtUAJ
-#>  5 Lichtheimiaceae                      0.276  FALSE select_kruskal_rtUAJ
-#>  6 Metschnikowiaceae                    0.0509 FALSE select_kruskal_rtUAJ
-#>  7 Mucoraceae                           0.0608 FALSE select_kruskal_rtUAJ
-#>  8 Phaffomycetaceae                     0.0794 FALSE select_kruskal_rtUAJ
-#>  9 Saccharomycetaceae                   0.0273 TRUE  select_kruskal_rtUAJ
-#> 10 Saccharomycetales fam Incertae sedis 0.0221 TRUE  select_kruskal_rtUAJ
-#> 11 Trichomonascaceae                    0.0625 FALSE select_kruskal_rtUAJ
-#> 12 Trichosporonaceae                    0.0273 TRUE  select_kruskal_rtUAJ
-#> 13 Wickerhamomyceteae                   0.177  FALSE select_kruskal_rtUAJ
+#>  1 Aspergillaceae                       0.0608 FALSE select_kruskal_WKayj
+#>  2 Debaryomycetaceae                    0.0273 TRUE  select_kruskal_WKayj
+#>  3 Dipodascaceae                        0.0273 TRUE  select_kruskal_WKayj
+#>  4 Dothioraceae                         0.101  FALSE select_kruskal_WKayj
+#>  5 Lichtheimiaceae                      0.276  FALSE select_kruskal_WKayj
+#>  6 Metschnikowiaceae                    0.0509 FALSE select_kruskal_WKayj
+#>  7 Mucoraceae                           0.0608 FALSE select_kruskal_WKayj
+#>  8 Phaffomycetaceae                     0.0794 FALSE select_kruskal_WKayj
+#>  9 Saccharomycetaceae                   0.0273 TRUE  select_kruskal_WKayj
+#> 10 Saccharomycetales fam Incertae sedis 0.0221 TRUE  select_kruskal_WKayj
+#> 11 Trichomonascaceae                    0.0625 FALSE select_kruskal_WKayj
+#> 12 Trichosporonaceae                    0.0273 TRUE  select_kruskal_WKayj
+#> 13 Wickerhamomyceteae                   0.177  FALSE select_kruskal_WKayj
 ```
 
 ## Notes

From 5f711eeaf97e20936ee4c62aad130776e8113c9d Mon Sep 17 00:00:00 2001
From: abichat <a.bichat@yahoo.fr>
Date: Tue, 5 Mar 2024 17:52:04 +0100
Subject: [PATCH 3/3] step_aggregate_hclust

---
 NAMESPACE                              |   8 +
 R/aggregate_hclust.R                   | 196 +++++++++++++++++++++++++
 R/aggregate_list.R                     |   5 +-
 _pkgdown.yml                           |   1 +
 dev/dev_history.R                      |   3 +
 man/step_aggregate_hclust.Rd           |  89 +++++++++++
 man/step_aggregate_list.Rd             |   2 +-
 tests/testthat/test-aggregate_hclust.R |  93 ++++++++++++
 8 files changed, 394 insertions(+), 3 deletions(-)
 create mode 100644 R/aggregate_hclust.R
 create mode 100644 man/step_aggregate_hclust.Rd
 create mode 100644 tests/testthat/test-aggregate_hclust.R

diff --git a/NAMESPACE b/NAMESPACE
index cb873a3..971df69 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,23 +1,27 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(bake,step_aggregate_hclust)
 S3method(bake,step_aggregate_list)
 S3method(bake,step_rownormalize_tss)
 S3method(bake,step_select_background)
 S3method(bake,step_select_cv)
 S3method(bake,step_select_kruskal)
 S3method(bake,step_select_wilcoxon)
+S3method(prep,step_aggregate_hclust)
 S3method(prep,step_aggregate_list)
 S3method(prep,step_rownormalize_tss)
 S3method(prep,step_select_background)
 S3method(prep,step_select_cv)
 S3method(prep,step_select_kruskal)
 S3method(prep,step_select_wilcoxon)
+S3method(print,step_aggregate_hclust)
 S3method(print,step_aggregate_list)
 S3method(print,step_rownormalize_tss)
 S3method(print,step_select_background)
 S3method(print,step_select_cv)
 S3method(print,step_select_kruskal)
 S3method(print,step_select_wilcoxon)
+S3method(tidy,step_aggregate_hclust)
 S3method(tidy,step_aggregate_list)
 S3method(tidy,step_rownormalize_tss)
 S3method(tidy,step_select_background)
@@ -26,6 +30,7 @@ S3method(tidy,step_select_kruskal)
 S3method(tidy,step_select_wilcoxon)
 export("%>%")
 export(cv)
+export(step_aggregate_hclust)
 export(step_aggregate_list)
 export(step_rownormalize_tss)
 export(step_select_background)
@@ -54,6 +59,9 @@ importFrom(rlang,.data)
 importFrom(rlang,abort)
 importFrom(rlang,enquos)
 importFrom(stats,as.formula)
+importFrom(stats,cutree)
+importFrom(stats,dist)
+importFrom(stats,hclust)
 importFrom(stats,kruskal.test)
 importFrom(stats,p.adjust)
 importFrom(stats,sd)
diff --git a/R/aggregate_hclust.R b/R/aggregate_hclust.R
new file mode 100644
index 0000000..3b8cf5f
--- /dev/null
+++ b/R/aggregate_hclust.R
@@ -0,0 +1,196 @@
+#' Feature aggregation step based on a hierarchical clustering.
+#'
+#' Aggregate variables according to hierarchical clustering.
+#'
+#' @param recipe A recipe object. The step will be added to the sequence of
+#' operations for this recipe.
+#' @param ... One or more selector functions to choose variables
+#'  for this step. See [selections()] for more details.
+#' @param role For model terms created by this step, what analysis role should
+#'  they be assigned? By default, the new columns created by this step from
+#'  the original variables will be used as `predictors` in a model.
+#' @param trained A logical to indicate if the quantities for preprocessing
+#' have been estimated.
+#' @param n_clusters Number of cluster to create.
+#' @param fun_agg Aggregation function like `sum` or `mean`.
+#' @param dist_metric Default to `euclidean`. See [stats::dist()] for more
+#' details.
+#' @param linkage_method Deault to `complete`. See [stats::hclust()] for more
+#' details.
+#' @param res This parameter is only produced after the recipe has been trained.
+#' @param prefix A character string for the prefix of the resulting new
+#' variables.
+#' @param keep_original_cols A logical to keep the original variables in
+#' the output. Defaults to `FALSE`.
+#' @param skip A logical. Should the step be skipped when the
+#'  recipe is baked by [bake()]? While all operations are baked
+#'  when [prep()] is run, some operations may not be able to be
+#'  conducted on new data (e.g. processing the outcome variable(s)).
+#'  Care should be taken when using `skip = TRUE` as it may affect
+#'  the computations for subsequent operations.
+#' @param id A character string that is unique to this step to identify it.
+#'
+#' @return An updated version of recipe with the new step added to the
+#' sequence of any existing operations.
+#'
+#' @export
+#'
+#' @importFrom recipes add_step rand_id
+#' @importFrom rlang enquos
+#'
+#' @author Antoine Bichat
+#'
+#' @examples
+#' rec <-
+#'   iris %>%
+#'   recipe(formula = Species ~ .) %>%
+#'   step_aggregate_hclust(all_numeric_predictors(),
+#'                         n_clusters = 2, fun_agg = sum) %>%
+#'   prep()
+#' rec
+#' tidy(rec, 1)
+#' juice(rec)
+step_aggregate_hclust <- function(recipe, ..., role = "predictor",
+                                  trained = FALSE,
+                                  n_clusters,
+                                  fun_agg,
+                                  dist_metric = "euclidean",
+                                  linkage_method = "complete",
+                                  res = NULL,
+                                  prefix = "cl_",
+                                  keep_original_cols = FALSE,
+                                  skip = FALSE,
+                                  id = rand_id("aggregate_hclust")) {
+
+  add_step(
+    recipe,
+    step_aggregate_hclust_new(
+      terms = enquos(...),
+      role = role,
+      trained = trained,
+      n_clusters = n_clusters,
+      fun_agg = fun_agg,
+      dist_metric = dist_metric,
+      linkage_method = linkage_method,
+      res = res,
+      prefix = prefix,
+      keep_original_cols = keep_original_cols,
+      skip = skip,
+      id = id
+    )
+  )
+}
+
+#' @importFrom recipes step
+step_aggregate_hclust_new <- function(terms, role, trained,
+                                      n_clusters, fun_agg,
+                                      dist_metric, linkage_method,
+                                      res, prefix, keep_original_cols,
+                                      skip, id) {
+
+  step(subclass = "aggregate_hclust",
+       terms = terms,
+       role = role,
+       trained = trained,
+       n_clusters = n_clusters,
+       fun_agg = fun_agg,
+       dist_metric = dist_metric,
+       linkage_method = linkage_method,
+       res = res,
+       prefix = prefix,
+       keep_original_cols = keep_original_cols,
+       skip = skip,
+       id = id)
+}
+
+#' @export
+#' @importFrom dplyr mutate
+#' @importFrom recipes recipes_eval_select
+#' @importFrom rlang .data
+#' @importFrom stats cutree dist hclust
+#' @importFrom tibble enframe
+prep.step_aggregate_hclust <- function(x, training, info = NULL, ...) {
+  col_names <- recipes_eval_select(x$terms, training, info)
+  check_type(training[, col_names], quant = TRUE)
+
+  ct <-
+    training[, col_names] %>%
+    as.matrix() %>%
+    t() %>%
+    dist(method = x$dist_metric) %>%
+    hclust(method = x$linkage_method) %>%
+    cutree(k = x$n_clusters)
+
+  res_agg_hclust <-
+    ct %>%
+    enframe(name = "terms", value = "aggregate") %>%
+    mutate(aggregate = paste0(x$prefix, .data$aggregate))
+
+
+  step_aggregate_hclust_new(
+    terms = x$terms,
+    role = x$role,
+    trained = TRUE,
+    n_clusters = x$n_clusters,
+    fun_agg = x$fun_agg,
+    dist_metric = x$dist_metric,
+    linkage_method = x$linkage_method,
+    res = res_agg_hclust,
+    prefix = x$prefix,
+    keep_original_cols = x$keep_original_cols,
+    skip = x$skip,
+    id = x$id
+  )
+}
+
+
+#' @export
+#' @importFrom recipes check_new_data
+bake.step_aggregate_hclust <- function(object, new_data, ...) {
+  col_names <- object$res$terms
+  check_new_data(col_names, object, new_data)
+
+  list_agg_hc <- split(object$res$terms, object$res$aggregate)
+
+  aggregate_var(new_data, list_agg = list_agg_hc, fun_agg = object$fun_agg,
+                prefix = object$prefix,
+                keep_original_cols = object$keep_original_cols)
+}
+
+#' @export
+#' @importFrom recipes print_step
+print.step_aggregate_hclust <-
+  function(x, width = max(20, options()$width - 35), ...) {
+    title <- paste("`hclust` aggregation of ")
+
+    print_step(
+      tr_obj = x$res$terms,
+      untr_obj = x$terms,
+      trained = x$trained,
+      title = title,
+      width = width
+    )
+    invisible(x)
+  }
+
+
+#' @rdname step_aggregate_hclust
+#' @param x A `step_aggregate_hclust` object.
+#' @export
+#' @importFrom recipes is_trained sel2char
+#' @importFrom tibble tibble
+tidy.step_aggregate_hclust <- function(x, ...) {
+  if (is_trained(x)) {
+    res <- x$res
+  } else {
+    term_names <- sel2char(x$terms)
+    res <-
+      tibble(
+        terms = term_names,
+        aggregate = rlang::na_chr
+      )
+  }
+  # Always return the step id:
+  res$id <- x$id
+  res
+}
diff --git a/R/aggregate_list.R b/R/aggregate_list.R
index 6a847af..8655d47 100644
--- a/R/aggregate_list.R
+++ b/R/aggregate_list.R
@@ -12,7 +12,7 @@
 #' @param trained A logical to indicate if the quantities for preprocessing
 #' have been estimated.
 #' @param list_agg Named list of .
-#' @param fun_agg Function .
+#' @param fun_agg Aggregation function like `sum` or `mean`.
 #' @param res This parameter is only produced after the recipe has been trained.
 #' @param prefix A character string for the prefix of the resulting new
 #' variables that are not named in `list_agg`.
@@ -48,7 +48,8 @@
 #' rec
 #' tidy(rec, 1)
 #' juice(rec)
-step_aggregate_list <- function(recipe, ..., role = "predictor", trained = FALSE,
+step_aggregate_list <- function(recipe, ..., role = "predictor",
+                                trained = FALSE,
                                 list_agg = NULL,
                                 fun_agg = NULL,
                                 res = NULL,
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 29da09e..ddc23e3 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -12,6 +12,7 @@ reference:
 - title: Feature Aggregation Steps
   contents:
   - '`step_aggregate_list`'
+  - '`step_aggregate_hclust`'
 - title: Feature Normalization Steps
   contents:
   - '`step_rownormalize_tss`'
diff --git a/dev/dev_history.R b/dev/dev_history.R
index 5213665..a77b1ab 100644
--- a/dev/dev_history.R
+++ b/dev/dev_history.R
@@ -59,6 +59,9 @@ library(testthat)
 
 # use_test("aggregate_list")
 
+# use_r("aggregate_hclust")
+# use_test("aggregate_hclust")
+
 ####
 
 devtools::load_all()
diff --git a/man/step_aggregate_hclust.Rd b/man/step_aggregate_hclust.Rd
new file mode 100644
index 0000000..45a5d44
--- /dev/null
+++ b/man/step_aggregate_hclust.Rd
@@ -0,0 +1,89 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/aggregate_hclust.R
+\name{step_aggregate_hclust}
+\alias{step_aggregate_hclust}
+\alias{tidy.step_aggregate_hclust}
+\title{Feature aggregation step based on a hierarchical clustering.}
+\usage{
+step_aggregate_hclust(
+  recipe,
+  ...,
+  role = "predictor",
+  trained = FALSE,
+  n_clusters,
+  fun_agg,
+  dist_metric = "euclidean",
+  linkage_method = "complete",
+  res = NULL,
+  prefix = "cl_",
+  keep_original_cols = FALSE,
+  skip = FALSE,
+  id = rand_id("aggregate_hclust")
+)
+
+\method{tidy}{step_aggregate_hclust}(x, ...)
+}
+\arguments{
+\item{recipe}{A recipe object. The step will be added to the sequence of
+operations for this recipe.}
+
+\item{...}{One or more selector functions to choose variables
+for this step. See \code{\link[=selections]{selections()}} for more details.}
+
+\item{role}{For model terms created by this step, what analysis role should
+they be assigned? By default, the new columns created by this step from
+the original variables will be used as \code{predictors} in a model.}
+
+\item{trained}{A logical to indicate if the quantities for preprocessing
+have been estimated.}
+
+\item{n_clusters}{Number of cluster to create.}
+
+\item{fun_agg}{Aggregation function like \code{sum} or \code{mean}.}
+
+\item{dist_metric}{Default to \code{euclidean}. See \code{\link[stats:dist]{stats::dist()}} for more
+details.}
+
+\item{linkage_method}{Deault to \code{complete}. See \code{\link[stats:hclust]{stats::hclust()}} for more
+details.}
+
+\item{res}{This parameter is only produced after the recipe has been trained.}
+
+\item{prefix}{A character string for the prefix of the resulting new
+variables.}
+
+\item{keep_original_cols}{A logical to keep the original variables in
+the output. Defaults to \code{FALSE}.}
+
+\item{skip}{A logical. Should the step be skipped when the
+recipe is baked by \code{\link[=bake]{bake()}}? While all operations are baked
+when \code{\link[=prep]{prep()}} is run, some operations may not be able to be
+conducted on new data (e.g. processing the outcome variable(s)).
+Care should be taken when using \code{skip = TRUE} as it may affect
+the computations for subsequent operations.}
+
+\item{id}{A character string that is unique to this step to identify it.}
+
+\item{x}{A \code{step_aggregate_hclust} object.}
+}
+\value{
+An updated version of recipe with the new step added to the
+sequence of any existing operations.
+}
+\description{
+Aggregate variables according to hierarchical clustering.
+}
+\examples{
+rec <-
+  iris \%>\%
+  recipe(formula = Species ~ .) \%>\%
+  step_aggregate_hclust(all_numeric_predictors(),
+                        n_clusters = 2, fun_agg = sum) \%>\%
+  prep()
+rec
+tidy(rec, 1)
+juice(rec)
+}
+\author{
+Antoine Bichat
+}
diff --git a/man/step_aggregate_list.Rd b/man/step_aggregate_list.Rd
index fabbeb6..1cd2a38 100644
--- a/man/step_aggregate_list.Rd
+++ b/man/step_aggregate_list.Rd
@@ -37,7 +37,7 @@ have been estimated.}
 
 \item{list_agg}{Named list of .}
 
-\item{fun_agg}{Function .}
+\item{fun_agg}{Aggregation function like \code{sum} or \code{mean}.}
 
 \item{res}{This parameter is only produced after the recipe has been trained.}
 
diff --git a/tests/testthat/test-aggregate_hclust.R b/tests/testthat/test-aggregate_hclust.R
new file mode 100644
index 0000000..ae2f8a9
--- /dev/null
+++ b/tests/testthat/test-aggregate_hclust.R
@@ -0,0 +1,93 @@
+library(dplyr)
+
+data("cheese_abundance")
+
+nc <- sample(2:9, size = 1)
+new_names <- paste0("hc_", 1:nc)
+
+
+clustering <-
+  cheese_abundance %>%
+  select(where(is.numeric)) %>%
+  as.matrix() %>%
+  t() %>%
+  dist(method = "manhattan") %>%
+  hclust(method = "ward.D2") %>%
+  cutree(k = nc)
+
+rnd_asv <- sample(names(clustering), size = 1)
+rnd_asv_group <- names(clustering[clustering == clustering[rnd_asv]])
+
+
+test_that("step_aggregate_hclust works", {
+
+  rec <-
+    recipe(~ ., data = cheese_abundance) %>%
+    step_aggregate_hclust(all_numeric_predictors(),
+                          n_clusters = nc, fun_agg = sum,
+                          prefix = "hc_", dist_metric = "manhattan",
+                          linkage_method = "ward.D2")
+
+  expect_equal(nrow(tidy(rec, 1)), 1)
+
+  prepped <- prep(rec)
+  hclust_tidy <- tidy(prepped, 1)
+
+  expect_equal(nrow(hclust_tidy), ncol(cheese_abundance) - 3)
+
+
+ # check set equality on random cluster
+  expect_setequal(
+    hclust_tidy %>%
+      filter(aggregate %in%
+               filter(hclust_tidy, terms == rnd_asv)$aggregate) %>%
+      pull(terms),
+    rnd_asv_group)
+
+  # check equality on groups cardinal
+  expect_setequal(
+    hclust_tidy %>%
+      count(aggregate) %>%
+      pull(n) %>%
+      sort(),
+    clustering %>%
+      table() %>%
+      sort() %>%
+      unname())
+
+
+  expect_setequal(summary(prepped)$role, "predictor")
+
+  baked <- bake(prepped, new_data = NULL)
+
+  expect_equal(colnames(baked),
+               c("sample", "cheese", "rind_type", new_names))
+  expect_equal(rowSums(baked[, -c(1:3)]), rowSums(cheese_abundance[, -c(1:3)]))
+
+  cl <- sample(new_names, size = 1)
+  asv_in_cl <-
+    hclust_tidy %>%
+    filter(aggregate == cl) %>%
+    pull(terms)
+
+
+  expect_equal(baked[[cl]],
+               cheese_abundance %>%
+                 select(all_of(asv_in_cl)) %>%
+                 rowSums())
+
+  ## keep_original_cols
+
+  baked2 <-
+    recipe(~ ., data = cheese_abundance) %>%
+    step_aggregate_hclust(all_numeric_predictors(),
+                          n_clusters = nc, fun_agg = sum,
+                          prefix = "hc_", dist_metric = "manhattan",
+                          linkage_method = "ward.D2",
+                          keep_original_cols = TRUE) %>%
+    prep() %>%
+    juice()
+
+  expect_equal(colnames(baked2),
+               c(colnames(cheese_abundance), new_names))
+})