Skip to content

Commit

Permalink
Correction conflit
Browse files Browse the repository at this point in the history
  • Loading branch information
julieaubert committed Mar 6, 2024
2 parents f621c95 + 5f711ee commit 84b8f74
Show file tree
Hide file tree
Showing 10 changed files with 413 additions and 19 deletions.
8 changes: 8 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,23 +1,27 @@
# Generated by roxygen2: do not edit by hand

S3method(bake,step_aggregate_hclust)
S3method(bake,step_aggregate_list)
S3method(bake,step_rownormalize_tss)
S3method(bake,step_select_background)
S3method(bake,step_select_cv)
S3method(bake,step_select_kruskal)
S3method(bake,step_select_wilcoxon)
S3method(prep,step_aggregate_hclust)
S3method(prep,step_aggregate_list)
S3method(prep,step_rownormalize_tss)
S3method(prep,step_select_background)
S3method(prep,step_select_cv)
S3method(prep,step_select_kruskal)
S3method(prep,step_select_wilcoxon)
S3method(print,step_aggregate_hclust)
S3method(print,step_aggregate_list)
S3method(print,step_rownormalize_tss)
S3method(print,step_select_background)
S3method(print,step_select_cv)
S3method(print,step_select_kruskal)
S3method(print,step_select_wilcoxon)
S3method(tidy,step_aggregate_hclust)
S3method(tidy,step_aggregate_list)
S3method(tidy,step_rownormalize_tss)
S3method(tidy,step_select_background)
Expand All @@ -26,6 +30,7 @@ S3method(tidy,step_select_kruskal)
S3method(tidy,step_select_wilcoxon)
export("%>%")
export(cv)
export(step_aggregate_hclust)
export(step_aggregate_list)
export(step_rownormalize_tss)
export(step_select_background)
Expand Down Expand Up @@ -54,6 +59,9 @@ importFrom(rlang,.data)
importFrom(rlang,abort)
importFrom(rlang,enquos)
importFrom(stats,as.formula)
importFrom(stats,cutree)
importFrom(stats,dist)
importFrom(stats,hclust)
importFrom(stats,kruskal.test)
importFrom(stats,p.adjust)
importFrom(stats,sd)
Expand Down
196 changes: 196 additions & 0 deletions R/aggregate_hclust.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,196 @@
#' Feature aggregation step based on a hierarchical clustering.
#'
#' Aggregate variables according to hierarchical clustering.
#'
#' @param recipe A recipe object. The step will be added to the sequence of
#' operations for this recipe.
#' @param ... One or more selector functions to choose variables
#' for this step. See [selections()] for more details.
#' @param role For model terms created by this step, what analysis role should
#' they be assigned? By default, the new columns created by this step from
#' the original variables will be used as `predictors` in a model.
#' @param trained A logical to indicate if the quantities for preprocessing
#' have been estimated.
#' @param n_clusters Number of cluster to create.
#' @param fun_agg Aggregation function like `sum` or `mean`.
#' @param dist_metric Default to `euclidean`. See [stats::dist()] for more
#' details.
#' @param linkage_method Deault to `complete`. See [stats::hclust()] for more
#' details.
#' @param res This parameter is only produced after the recipe has been trained.
#' @param prefix A character string for the prefix of the resulting new
#' variables.
#' @param keep_original_cols A logical to keep the original variables in
#' the output. Defaults to `FALSE`.
#' @param skip A logical. Should the step be skipped when the
#' recipe is baked by [bake()]? While all operations are baked
#' when [prep()] is run, some operations may not be able to be
#' conducted on new data (e.g. processing the outcome variable(s)).
#' Care should be taken when using `skip = TRUE` as it may affect
#' the computations for subsequent operations.
#' @param id A character string that is unique to this step to identify it.
#'
#' @return An updated version of recipe with the new step added to the
#' sequence of any existing operations.
#'
#' @export
#'
#' @importFrom recipes add_step rand_id
#' @importFrom rlang enquos
#'
#' @author Antoine Bichat
#'
#' @examples
#' rec <-
#' iris %>%
#' recipe(formula = Species ~ .) %>%
#' step_aggregate_hclust(all_numeric_predictors(),
#' n_clusters = 2, fun_agg = sum) %>%
#' prep()
#' rec
#' tidy(rec, 1)
#' juice(rec)
step_aggregate_hclust <- function(recipe, ..., role = "predictor",
trained = FALSE,
n_clusters,
fun_agg,
dist_metric = "euclidean",
linkage_method = "complete",
res = NULL,
prefix = "cl_",
keep_original_cols = FALSE,
skip = FALSE,
id = rand_id("aggregate_hclust")) {

add_step(
recipe,
step_aggregate_hclust_new(
terms = enquos(...),
role = role,
trained = trained,
n_clusters = n_clusters,
fun_agg = fun_agg,
dist_metric = dist_metric,
linkage_method = linkage_method,
res = res,
prefix = prefix,
keep_original_cols = keep_original_cols,
skip = skip,
id = id
)
)
}

#' @importFrom recipes step
step_aggregate_hclust_new <- function(terms, role, trained,
n_clusters, fun_agg,
dist_metric, linkage_method,
res, prefix, keep_original_cols,
skip, id) {

step(subclass = "aggregate_hclust",
terms = terms,
role = role,
trained = trained,
n_clusters = n_clusters,
fun_agg = fun_agg,
dist_metric = dist_metric,
linkage_method = linkage_method,
res = res,
prefix = prefix,
keep_original_cols = keep_original_cols,
skip = skip,
id = id)
}

#' @export
#' @importFrom dplyr mutate
#' @importFrom recipes recipes_eval_select
#' @importFrom rlang .data
#' @importFrom stats cutree dist hclust
#' @importFrom tibble enframe
prep.step_aggregate_hclust <- function(x, training, info = NULL, ...) {
col_names <- recipes_eval_select(x$terms, training, info)
check_type(training[, col_names], quant = TRUE)

ct <-
training[, col_names] %>%
as.matrix() %>%
t() %>%
dist(method = x$dist_metric) %>%
hclust(method = x$linkage_method) %>%
cutree(k = x$n_clusters)

res_agg_hclust <-
ct %>%
enframe(name = "terms", value = "aggregate") %>%
mutate(aggregate = paste0(x$prefix, .data$aggregate))


step_aggregate_hclust_new(
terms = x$terms,
role = x$role,
trained = TRUE,
n_clusters = x$n_clusters,
fun_agg = x$fun_agg,
dist_metric = x$dist_metric,
linkage_method = x$linkage_method,
res = res_agg_hclust,
prefix = x$prefix,
keep_original_cols = x$keep_original_cols,
skip = x$skip,
id = x$id
)
}


#' @export
#' @importFrom recipes check_new_data
bake.step_aggregate_hclust <- function(object, new_data, ...) {
col_names <- object$res$terms
check_new_data(col_names, object, new_data)

list_agg_hc <- split(object$res$terms, object$res$aggregate)

aggregate_var(new_data, list_agg = list_agg_hc, fun_agg = object$fun_agg,
prefix = object$prefix,
keep_original_cols = object$keep_original_cols)
}

#' @export
#' @importFrom recipes print_step
print.step_aggregate_hclust <-
function(x, width = max(20, options()$width - 35), ...) {
title <- paste("`hclust` aggregation of ")

print_step(
tr_obj = x$res$terms,
untr_obj = x$terms,
trained = x$trained,
title = title,
width = width
)
invisible(x)
}


#' @rdname step_aggregate_hclust
#' @param x A `step_aggregate_hclust` object.
#' @export
#' @importFrom recipes is_trained sel2char
#' @importFrom tibble tibble
tidy.step_aggregate_hclust <- function(x, ...) {
if (is_trained(x)) {
res <- x$res
} else {
term_names <- sel2char(x$terms)
res <-
tibble(
terms = term_names,
aggregate = rlang::na_chr
)
}
# Always return the step id:
res$id <- x$id
res
}
10 changes: 5 additions & 5 deletions R/aggregate_list.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
#' they be assigned? By default, the new columns created by this step from
#' the original variables will be used as `predictors` in a model.
#' @param trained A logical to indicate if the quantities for preprocessing
#' have been estimated.
#' have been estimated.D
#' @param list_agg Named list of aggregated variables.
#' @param fun_agg Function to be used to aggregate variables.
#' @param fun_agg Aggregation function like `sum` or `mean`.
#' @param res This parameter is only produced after the recipe has been trained.
#' @param prefix A character string for the prefix of the resulting new
#' variables that are not named in `list_agg`.
Expand Down Expand Up @@ -48,7 +48,8 @@
#' rec
#' tidy(rec, 1)
#' juice(rec)
step_aggregate_list <- function(recipe, ..., role = "predictor", trained = FALSE,
step_aggregate_list <- function(recipe, ..., role = "predictor",
trained = FALSE,
list_agg = NULL,
fun_agg = NULL,
res = NULL,
Expand Down Expand Up @@ -168,8 +169,7 @@ tidy.step_aggregate_list <- function(x, ...) {
res <-
tibble(
terms = term_names,
pv = rlang::na_dbl,
kept = rlang::na_lgl
aggregate = rlang::na_chr
)
}
# Always return the step id:
Expand Down
4 changes: 4 additions & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ list_family <- split(cheese_taxonomy$asv, cheese_taxonomy$family)
head(list_family, 2)
```

```{r seed, echo=FALSE}
set.seed(42)
```

The following recipe will

1. aggregate the ASV variables at the family level, as defined by `list_family`;
Expand Down
26 changes: 13 additions & 13 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -137,19 +137,19 @@ tidy(rec, 3)
#> # A tibble: 13 × 4
#> terms pv kept id
#> <chr> <dbl> <lgl> <chr>
#> 1 Aspergillaceae 0.0608 FALSE select_kruskal_rtUAJ
#> 2 Debaryomycetaceae 0.0273 TRUE select_kruskal_rtUAJ
#> 3 Dipodascaceae 0.0273 TRUE select_kruskal_rtUAJ
#> 4 Dothioraceae 0.101 FALSE select_kruskal_rtUAJ
#> 5 Lichtheimiaceae 0.276 FALSE select_kruskal_rtUAJ
#> 6 Metschnikowiaceae 0.0509 FALSE select_kruskal_rtUAJ
#> 7 Mucoraceae 0.0608 FALSE select_kruskal_rtUAJ
#> 8 Phaffomycetaceae 0.0794 FALSE select_kruskal_rtUAJ
#> 9 Saccharomycetaceae 0.0273 TRUE select_kruskal_rtUAJ
#> 10 Saccharomycetales fam Incertae sedis 0.0221 TRUE select_kruskal_rtUAJ
#> 11 Trichomonascaceae 0.0625 FALSE select_kruskal_rtUAJ
#> 12 Trichosporonaceae 0.0273 TRUE select_kruskal_rtUAJ
#> 13 Wickerhamomyceteae 0.177 FALSE select_kruskal_rtUAJ
#> 1 Aspergillaceae 0.0608 FALSE select_kruskal_WKayj
#> 2 Debaryomycetaceae 0.0273 TRUE select_kruskal_WKayj
#> 3 Dipodascaceae 0.0273 TRUE select_kruskal_WKayj
#> 4 Dothioraceae 0.101 FALSE select_kruskal_WKayj
#> 5 Lichtheimiaceae 0.276 FALSE select_kruskal_WKayj
#> 6 Metschnikowiaceae 0.0509 FALSE select_kruskal_WKayj
#> 7 Mucoraceae 0.0608 FALSE select_kruskal_WKayj
#> 8 Phaffomycetaceae 0.0794 FALSE select_kruskal_WKayj
#> 9 Saccharomycetaceae 0.0273 TRUE select_kruskal_WKayj
#> 10 Saccharomycetales fam Incertae sedis 0.0221 TRUE select_kruskal_WKayj
#> 11 Trichomonascaceae 0.0625 FALSE select_kruskal_WKayj
#> 12 Trichosporonaceae 0.0273 TRUE select_kruskal_WKayj
#> 13 Wickerhamomyceteae 0.177 FALSE select_kruskal_WKayj
```

## Notes
Expand Down
1 change: 1 addition & 0 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ reference:
- title: Feature Aggregation Steps
contents:
- '`step_aggregate_list`'
- '`step_aggregate_hclust`'
- title: Feature Normalization Steps
contents:
- '`step_rownormalize_tss`'
Expand Down
3 changes: 3 additions & 0 deletions dev/dev_history.R
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ library(testthat)

# use_test("aggregate_list")

# use_r("aggregate_hclust")
# use_test("aggregate_hclust")

####

devtools::load_all()
Expand Down
Loading

0 comments on commit 84b8f74

Please sign in to comment.