Correction conflit

abichat · Mar 6, 2024 · 84b8f74 · 84b8f74
2 parents f621c95 + 5f711ee
commit 84b8f74
Show file tree

Hide file tree

Showing 10 changed files with 413 additions and 19 deletions.
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,23 +1,27 @@
 # Generated by roxygen2: do not edit by hand
 
+S3method(bake,step_aggregate_hclust)
 S3method(bake,step_aggregate_list)
 S3method(bake,step_rownormalize_tss)
 S3method(bake,step_select_background)
 S3method(bake,step_select_cv)
 S3method(bake,step_select_kruskal)
 S3method(bake,step_select_wilcoxon)
+S3method(prep,step_aggregate_hclust)
 S3method(prep,step_aggregate_list)
 S3method(prep,step_rownormalize_tss)
 S3method(prep,step_select_background)
 S3method(prep,step_select_cv)
 S3method(prep,step_select_kruskal)
 S3method(prep,step_select_wilcoxon)
+S3method(print,step_aggregate_hclust)
 S3method(print,step_aggregate_list)
 S3method(print,step_rownormalize_tss)
 S3method(print,step_select_background)
 S3method(print,step_select_cv)
 S3method(print,step_select_kruskal)
 S3method(print,step_select_wilcoxon)
+S3method(tidy,step_aggregate_hclust)
 S3method(tidy,step_aggregate_list)
 S3method(tidy,step_rownormalize_tss)
 S3method(tidy,step_select_background)
@@ -26,6 +30,7 @@ S3method(tidy,step_select_kruskal)
 S3method(tidy,step_select_wilcoxon)
 export("%>%")
 export(cv)
+export(step_aggregate_hclust)
 export(step_aggregate_list)
 export(step_rownormalize_tss)
 export(step_select_background)
@@ -54,6 +59,9 @@ importFrom(rlang,.data)
 importFrom(rlang,abort)
 importFrom(rlang,enquos)
 importFrom(stats,as.formula)
+importFrom(stats,cutree)
+importFrom(stats,dist)
+importFrom(stats,hclust)
 importFrom(stats,kruskal.test)
 importFrom(stats,p.adjust)
 importFrom(stats,sd)

diff --git a/R/aggregate_hclust.R b/R/aggregate_hclust.R
@@ -0,0 +1,196 @@
+#' Feature aggregation step based on a hierarchical clustering.
+#'
+#' Aggregate variables according to hierarchical clustering.
+#'
+#' @param recipe A recipe object. The step will be added to the sequence of
+#' operations for this recipe.
+#' @param ... One or more selector functions to choose variables
+#'  for this step. See [selections()] for more details.
+#' @param role For model terms created by this step, what analysis role should
+#'  they be assigned? By default, the new columns created by this step from
+#'  the original variables will be used as `predictors` in a model.
+#' @param trained A logical to indicate if the quantities for preprocessing
+#' have been estimated.
+#' @param n_clusters Number of cluster to create.
+#' @param fun_agg Aggregation function like `sum` or `mean`.
+#' @param dist_metric Default to `euclidean`. See [stats::dist()] for more
+#' details.
+#' @param linkage_method Deault to `complete`. See [stats::hclust()] for more
+#' details.
+#' @param res This parameter is only produced after the recipe has been trained.
+#' @param prefix A character string for the prefix of the resulting new
+#' variables.
+#' @param keep_original_cols A logical to keep the original variables in
+#' the output. Defaults to `FALSE`.
+#' @param skip A logical. Should the step be skipped when the
+#'  recipe is baked by [bake()]? While all operations are baked
+#'  when [prep()] is run, some operations may not be able to be
+#'  conducted on new data (e.g. processing the outcome variable(s)).
+#'  Care should be taken when using `skip = TRUE` as it may affect
+#'  the computations for subsequent operations.
+#' @param id A character string that is unique to this step to identify it.
+#'
+#' @return An updated version of recipe with the new step added to the
+#' sequence of any existing operations.
+#'
+#' @export
+#'
+#' @importFrom recipes add_step rand_id
+#' @importFrom rlang enquos
+#'
+#' @author Antoine Bichat
+#'
+#' @examples
+#' rec <-
+#'   iris %>%
+#'   recipe(formula = Species ~ .) %>%
+#'   step_aggregate_hclust(all_numeric_predictors(),
+#'                         n_clusters = 2, fun_agg = sum) %>%
+#'   prep()
+#' rec
+#' tidy(rec, 1)
+#' juice(rec)
+step_aggregate_hclust <- function(recipe, ..., role = "predictor",
+                                  trained = FALSE,
+                                  n_clusters,
+                                  fun_agg,
+                                  dist_metric = "euclidean",
+                                  linkage_method = "complete",
+                                  res = NULL,
+                                  prefix = "cl_",
+                                  keep_original_cols = FALSE,
+                                  skip = FALSE,
+                                  id = rand_id("aggregate_hclust")) {
+
+  add_step(
+    recipe,
+    step_aggregate_hclust_new(
+      terms = enquos(...),
+      role = role,
+      trained = trained,
+      n_clusters = n_clusters,
+      fun_agg = fun_agg,
+      dist_metric = dist_metric,
+      linkage_method = linkage_method,
+      res = res,
+      prefix = prefix,
+      keep_original_cols = keep_original_cols,
+      skip = skip,
+      id = id
+    )
+  )
+}
+
+#' @importFrom recipes step
+step_aggregate_hclust_new <- function(terms, role, trained,
+                                      n_clusters, fun_agg,
+                                      dist_metric, linkage_method,
+                                      res, prefix, keep_original_cols,
+                                      skip, id) {
+
+  step(subclass = "aggregate_hclust",
+       terms = terms,
+       role = role,
+       trained = trained,
+       n_clusters = n_clusters,
+       fun_agg = fun_agg,
+       dist_metric = dist_metric,
+       linkage_method = linkage_method,
+       res = res,
+       prefix = prefix,
+       keep_original_cols = keep_original_cols,
+       skip = skip,
+       id = id)
+}
+
+#' @export
+#' @importFrom dplyr mutate
+#' @importFrom recipes recipes_eval_select
+#' @importFrom rlang .data
+#' @importFrom stats cutree dist hclust
+#' @importFrom tibble enframe
+prep.step_aggregate_hclust <- function(x, training, info = NULL, ...) {
+  col_names <- recipes_eval_select(x$terms, training, info)
+  check_type(training[, col_names], quant = TRUE)
+
+  ct <-
+    training[, col_names] %>%
+    as.matrix() %>%
+    t() %>%
+    dist(method = x$dist_metric) %>%
+    hclust(method = x$linkage_method) %>%
+    cutree(k = x$n_clusters)
+
+  res_agg_hclust <-
+    ct %>%
+    enframe(name = "terms", value = "aggregate") %>%
+    mutate(aggregate = paste0(x$prefix, .data$aggregate))
+
+
+  step_aggregate_hclust_new(
+    terms = x$terms,
+    role = x$role,
+    trained = TRUE,
+    n_clusters = x$n_clusters,
+    fun_agg = x$fun_agg,
+    dist_metric = x$dist_metric,
+    linkage_method = x$linkage_method,
+    res = res_agg_hclust,
+    prefix = x$prefix,
+    keep_original_cols = x$keep_original_cols,
+    skip = x$skip,
+    id = x$id
+  )
+}
+
+
+#' @export
+#' @importFrom recipes check_new_data
+bake.step_aggregate_hclust <- function(object, new_data, ...) {
+  col_names <- object$res$terms
+  check_new_data(col_names, object, new_data)
+
+  list_agg_hc <- split(object$res$terms, object$res$aggregate)
+
+  aggregate_var(new_data, list_agg = list_agg_hc, fun_agg = object$fun_agg,
+                prefix = object$prefix,
+                keep_original_cols = object$keep_original_cols)
+}
+
+#' @export
+#' @importFrom recipes print_step
+print.step_aggregate_hclust <-
+  function(x, width = max(20, options()$width - 35), ...) {
+    title <- paste("`hclust` aggregation of ")
+
+    print_step(
+      tr_obj = x$res$terms,
+      untr_obj = x$terms,
+      trained = x$trained,
+      title = title,
+      width = width
+    )
+    invisible(x)
+  }
+
+
+#' @rdname step_aggregate_hclust
+#' @param x A `step_aggregate_hclust` object.
+#' @export
+#' @importFrom recipes is_trained sel2char
+#' @importFrom tibble tibble
+tidy.step_aggregate_hclust <- function(x, ...) {
+  if (is_trained(x)) {
+    res <- x$res
+  } else {
+    term_names <- sel2char(x$terms)
+    res <-
+      tibble(
+        terms = term_names,
+        aggregate = rlang::na_chr
+      )
+  }
+  # Always return the step id:
+  res$id <- x$id
+  res
+}
diff --git a/R/aggregate_list.R b/R/aggregate_list.R
@@ -10,9 +10,9 @@
 #'  they be assigned? By default, the new columns created by this step from
 #'  the original variables will be used as `predictors` in a model.
 #' @param trained A logical to indicate if the quantities for preprocessing
-#' have been estimated.
+#' have been estimated.D
 #' @param list_agg Named list of aggregated variables.
-#' @param fun_agg Function to be used to aggregate variables.
+#' @param fun_agg  Aggregation function like `sum` or `mean`.
 #' @param res This parameter is only produced after the recipe has been trained.
 #' @param prefix A character string for the prefix of the resulting new
 #' variables that are not named in `list_agg`.
@@ -48,7 +48,8 @@
 #' rec
 #' tidy(rec, 1)
 #' juice(rec)
-step_aggregate_list <- function(recipe, ..., role = "predictor", trained = FALSE,
+step_aggregate_list <- function(recipe, ..., role = "predictor",
+                                trained = FALSE,
                                 list_agg = NULL,
                                 fun_agg = NULL,
                                 res = NULL,
@@ -168,8 +169,7 @@ tidy.step_aggregate_list <- function(x, ...) {
     res <-
       tibble(
         terms = term_names,
-        pv = rlang::na_dbl,
-        kept = rlang::na_lgl
+        aggregate = rlang::na_chr
       )
   }
   # Always return the step id:

diff --git a/README.Rmd b/README.Rmd
@@ -56,6 +56,10 @@ list_family <- split(cheese_taxonomy$asv, cheese_taxonomy$family)
 head(list_family, 2)
 ```
 
+```{r seed, echo=FALSE}
+set.seed(42)
+```
+
 The following recipe will 
 
 1. aggregate the ASV variables at the family level, as defined by `list_family`;

diff --git a/README.md b/README.md
@@ -137,19 +137,19 @@ tidy(rec, 3)
 #> # A tibble: 13 × 4
 #>    terms                                    pv kept  id                  
 #>    <chr>                                 <dbl> <lgl> <chr>               
-#>  1 Aspergillaceae                       0.0608 FALSE select_kruskal_rtUAJ
-#>  2 Debaryomycetaceae                    0.0273 TRUE  select_kruskal_rtUAJ
-#>  3 Dipodascaceae                        0.0273 TRUE  select_kruskal_rtUAJ
-#>  4 Dothioraceae                         0.101  FALSE select_kruskal_rtUAJ
-#>  5 Lichtheimiaceae                      0.276  FALSE select_kruskal_rtUAJ
-#>  6 Metschnikowiaceae                    0.0509 FALSE select_kruskal_rtUAJ
-#>  7 Mucoraceae                           0.0608 FALSE select_kruskal_rtUAJ
-#>  8 Phaffomycetaceae                     0.0794 FALSE select_kruskal_rtUAJ
-#>  9 Saccharomycetaceae                   0.0273 TRUE  select_kruskal_rtUAJ
-#> 10 Saccharomycetales fam Incertae sedis 0.0221 TRUE  select_kruskal_rtUAJ
-#> 11 Trichomonascaceae                    0.0625 FALSE select_kruskal_rtUAJ
-#> 12 Trichosporonaceae                    0.0273 TRUE  select_kruskal_rtUAJ
-#> 13 Wickerhamomyceteae                   0.177  FALSE select_kruskal_rtUAJ
+#>  1 Aspergillaceae                       0.0608 FALSE select_kruskal_WKayj
+#>  2 Debaryomycetaceae                    0.0273 TRUE  select_kruskal_WKayj
+#>  3 Dipodascaceae                        0.0273 TRUE  select_kruskal_WKayj
+#>  4 Dothioraceae                         0.101  FALSE select_kruskal_WKayj
+#>  5 Lichtheimiaceae                      0.276  FALSE select_kruskal_WKayj
+#>  6 Metschnikowiaceae                    0.0509 FALSE select_kruskal_WKayj
+#>  7 Mucoraceae                           0.0608 FALSE select_kruskal_WKayj
+#>  8 Phaffomycetaceae                     0.0794 FALSE select_kruskal_WKayj
+#>  9 Saccharomycetaceae                   0.0273 TRUE  select_kruskal_WKayj
+#> 10 Saccharomycetales fam Incertae sedis 0.0221 TRUE  select_kruskal_WKayj
+#> 11 Trichomonascaceae                    0.0625 FALSE select_kruskal_WKayj
+#> 12 Trichosporonaceae                    0.0273 TRUE  select_kruskal_WKayj
+#> 13 Wickerhamomyceteae                   0.177  FALSE select_kruskal_WKayj
 ```
 
 ## Notes

diff --git a/_pkgdown.yml b/_pkgdown.yml
@@ -12,6 +12,7 @@ reference:
 - title: Feature Aggregation Steps
   contents:
   - '`step_aggregate_list`'
+  - '`step_aggregate_hclust`'
 - title: Feature Normalization Steps
   contents:
   - '`step_rownormalize_tss`'

diff --git a/dev/dev_history.R b/dev/dev_history.R
@@ -59,6 +59,9 @@ library(testthat)
 
 # use_test("aggregate_list")
 
+# use_r("aggregate_hclust")
+# use_test("aggregate_hclust")
+
 ####
 
 devtools::load_all()