Skip to content

Commit 0a55cf5

Browse files
DavisVaughanhadley
andauthored
Implement .by (#6528)
* Implement `.by` for `mutate()` and `summarise()` * Add tests related to #6100 * Add `.by` support in `filter()` * Add `.by` support in `slice()` family * Move the `.by` collision checks into the generics * Tweak `summarise_verbose()` to respect if the global option is `TRUE` This should override the `global_env()` reference check, so we can force verbosity in relevant documentation pages * Add a full documentation page specific to `.by` * Add section about verbs without `.by` support * NEWS bullet * Order groups by first appearance when using `.by` * NEWS bullet updates * We have decided that the `NULL` column case is too obscure to care about * NEWS tweaks based on feedback * Second pass on `.by` help page based on feedback * Include `.by` help page in pkgdown reference * Apply suggestions from code review Co-authored-by: Hadley Wickham <[email protected]> * Regenerate snapshots * Regenerate documentation * Ensure that `compute_by()` is type stable on `$data` It should always return a bare tibble, even though `group_data()` returns a data frame for data frame input. * Remove `TODO`s Co-authored-by: Hadley Wickham <[email protected]>
1 parent c348ef3 commit 0a55cf5

34 files changed

+1381
-129
lines changed

NEWS.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,47 @@
11
# dplyr (development version)
22

3+
* `.by` is a new experimental inline alternative to `group_by()` that supports
4+
_temporary_ grouping in the following key dplyr verbs: `mutate()`,
5+
`summarise()`, `filter()`, and the `slice()` family (#6528).
6+
7+
Rather than:
8+
9+
```
10+
starwars %>%
11+
group_by(species, homeworld) %>%
12+
summarise(mean_height = mean(height))
13+
```
14+
15+
You can now write:
16+
17+
```
18+
starwars %>%
19+
summarise(
20+
mean_height = mean(height),
21+
.by = c(species, homeworld)
22+
)
23+
```
24+
25+
The most useful reason to do this is because grouping with `.by` is
26+
_temporary_ and only affects the verb it is being applied to. An ungrouped
27+
data frame went into the `summarise()` call, so an ungrouped data frame will
28+
come out; with `.by`, you never need to remember to `ungroup()` afterwards.
29+
30+
Additionally, using `summarise()` or `slice()` with `.by` will never sort the
31+
results by the group key, unlike with `group_by()`. Instead, the results are
32+
returned using the existing ordering of the groups from the original data. We
33+
feel this is more predictable, better maintains any ordering you might have
34+
already applied with a previous call to `arrange()`, and provides a way to
35+
maintain the current ordering without having to resort to factors.
36+
37+
This exciting feature was inspired by
38+
[data.table](https://CRAN.R-project.org/package=data.table), where the
39+
equivalent syntax looks like:
40+
41+
```
42+
starwars[, .(mean_height = mean(height)), by = .(species, homeworld)]
43+
```
44+
345
* `summarise()` now correctly recycles named 0-column data frames (#6509).
446

547
* `.cols` and `.fns` are now required arguments in `across()`, `c_across()`,

R/by.R

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
#' Helper for consistent documentation of `.by`
2+
#'
3+
#' Use `@inheritParams args_by` to consistently document `.by`.
4+
#'
5+
#' @param .by `r lifecycle::badge("experimental")`
6+
#'
7+
#' <[`tidy-select`][dplyr_tidy_select]> Optionally, a selection of columns to
8+
#' temporarily group by using an inline alternative to [group_by()]. For
9+
#' details and examples, see [?dplyr_by][dplyr_by].
10+
#'
11+
#' @name args_by
12+
#' @keywords internal
13+
NULL
14+
15+
#' Temporary grouping with `.by`
16+
#'
17+
#' ```{r, echo = FALSE, results = "asis"}
18+
#' result <- rlang::with_options(
19+
#' knitr::knit_child("man/rmd/by.Rmd"),
20+
#' dplyr.summarise.inform = TRUE
21+
#' )
22+
#' cat(result, sep = "\n")
23+
#' ```
24+
#'
25+
#' @name dplyr_by
26+
NULL
27+
28+
compute_by <- function(by,
29+
data,
30+
...,
31+
by_arg = "by",
32+
data_arg = "data",
33+
error_call = caller_env()) {
34+
check_dots_empty0(...)
35+
36+
error_call <- dplyr_error_call(error_call)
37+
38+
by <- enquo(by)
39+
check_by(by, data, by_arg = by_arg, data_arg = data_arg, error_call = error_call)
40+
41+
if (is_grouped_df(data)) {
42+
type <- "grouped"
43+
names <- group_vars(data)
44+
data <- group_data(data)
45+
} else if (is_rowwise_df(data)) {
46+
type <- "rowwise"
47+
names <- group_vars(data)
48+
data <- group_data(data)
49+
} else {
50+
by <- eval_select_by(by, data, error_call = error_call)
51+
52+
if (length(by) == 0L) {
53+
# `by = NULL` or empty selection
54+
type <- "ungrouped"
55+
names <- by
56+
data <- group_data(data)
57+
data <- as_tibble(data)
58+
} else {
59+
type <- "grouped"
60+
names <- by
61+
data <- compute_by_groups(data, by, error_call = error_call)
62+
}
63+
}
64+
65+
new_by(type = type, names = names, data = data)
66+
}
67+
68+
compute_by_groups <- function(data, names, error_call = caller_env()) {
69+
data <- dplyr_col_select(data, names, error_call = error_call)
70+
info <- vec_group_loc(data)
71+
72+
size <- vec_size(info)
73+
74+
out <- dplyr_new_list(info$key)
75+
out[[".rows"]] <- new_list_of(info$loc, ptype = integer())
76+
out <- new_tibble(out, nrow = size)
77+
78+
out
79+
}
80+
81+
check_by <- function(by,
82+
data,
83+
...,
84+
by_arg = "by",
85+
data_arg = "data",
86+
error_call = caller_env()) {
87+
check_dots_empty0(...)
88+
89+
if (quo_is_null(by)) {
90+
return(invisible(NULL))
91+
}
92+
93+
if (is_grouped_df(data)) {
94+
message <- paste0(
95+
"Can't supply {.arg {by_arg}} when ",
96+
"{.arg {data_arg}} is a grouped data frame."
97+
)
98+
cli::cli_abort(message, call = error_call)
99+
}
100+
101+
if (is_rowwise_df(data)) {
102+
message <- paste0(
103+
"Can't supply {.arg {by_arg}} when ",
104+
"{.arg {data_arg}} is a rowwise data frame."
105+
)
106+
cli::cli_abort(message, call = error_call)
107+
}
108+
109+
invisible(NULL)
110+
}
111+
112+
eval_select_by <- function(by,
113+
data,
114+
error_call = caller_env()) {
115+
out <- tidyselect::eval_select(
116+
expr = by,
117+
data = data,
118+
allow_rename = FALSE,
119+
error_call = error_call
120+
)
121+
names(out)
122+
}
123+
124+
new_by <- function(type, names, data) {
125+
structure(list(type = type, names = names, data = data), class = "dplyr_by")
126+
}

R/conditions.R

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ cnd_bullet_cur_group_label <- function(what = "error") {
4747
}
4848

4949
cnd_bullet_rowwise_unlist <- function() {
50-
if (peek_mask()$is_rowwise_df()) {
50+
if (peek_mask()$is_rowwise()) {
5151
glue_data(peek_error_context(), "Did you mean: `{error_name} = list({error_expression})` ?")
5252
}
5353
}
@@ -131,9 +131,9 @@ dot_as_label <- function(expr) {
131131

132132
mask_type <- function(mask = peek_mask()) {
133133
if (mask$get_size() > 0) {
134-
if (mask$is_grouped_df()) {
134+
if (mask$is_grouped()) {
135135
return("grouped")
136-
} else if (mask$is_rowwise_df()) {
136+
} else if (mask$is_rowwise()) {
137137
return("rowwise")
138138
}
139139
}

R/data-mask.R

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
DataMask <- R6Class("DataMask",
22
public = list(
3-
initialize = function(data, verb, error_call) {
4-
rows <- group_rows(data)
3+
initialize = function(data, by, verb, error_call) {
4+
rows <- by$data$.rows
55
if (length(rows) == 0) {
66
# Specially handle case of zero groups
77
rows <- new_list_of(list(integer()), ptype = integer())
@@ -16,22 +16,23 @@ DataMask <- R6Class("DataMask",
1616
abort("Can't transform a data frame with duplicate names.", call = error_call)
1717
}
1818
names(data) <- names_bindings
19+
1920
private$size <- nrow(data)
2021
private$current_data <- dplyr_new_list(data)
2122

22-
private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows)
23-
private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows)
23+
private$grouped <- by$type == "grouped"
24+
private$rowwise <- by$type == "rowwise"
2425

25-
private$grouped_df <- is_grouped_df(data)
26-
private$rowwise_df <- is_rowwise_df(data)
26+
private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows, private$grouped, private$rowwise)
27+
private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows)
2728

28-
private$keys <- group_keys(data)
29-
private$group_vars <- group_vars(data)
29+
private$keys <- group_keys0(by$data)
30+
private$by_names <- by$names
3031
private$verb <- verb
3132
},
3233

3334
add_one = function(name, chunks, result) {
34-
if (self$is_rowwise_df()){
35+
if (self$is_rowwise()){
3536
is_scalar_list <- function(.x) {
3637
vec_is_list(.x) && length(.x) == 1L
3738
}
@@ -78,7 +79,7 @@ DataMask <- R6Class("DataMask",
7879
# `across(.fns = NULL)`. We should remove this when we defunct those.
7980
cols <- self$current_cols(vars)
8081

81-
if (self$is_rowwise_df()) {
82+
if (self$is_rowwise()) {
8283
cols <- map2(cols, names(cols), function(col, name) {
8384
if (vec_is_list(private$current_data[[name]])) {
8485
col <- list(col)
@@ -117,7 +118,7 @@ DataMask <- R6Class("DataMask",
117118
},
118119

119120
current_non_group_vars = function() {
120-
setdiff(self$current_vars(), private$group_vars)
121+
setdiff(self$current_vars(), private$by_names)
121122
},
122123

123124
get_current_group = function() {
@@ -172,12 +173,12 @@ DataMask <- R6Class("DataMask",
172173
})
173174
},
174175

175-
is_grouped_df = function() {
176-
private$grouped_df
176+
is_grouped = function() {
177+
private$grouped
177178
},
178179

179-
is_rowwise_df = function() {
180-
private$rowwise_df
180+
is_rowwise = function() {
181+
private$rowwise
181182
},
182183

183184
get_keys = function() {
@@ -215,8 +216,8 @@ DataMask <- R6Class("DataMask",
215216
# ptypes of all the variables
216217
current_data = list(),
217218

218-
# names of the grouping variables
219-
group_vars = character(),
219+
# names of the `by` variables
220+
by_names = character(),
220221

221222
# list of indices, one integer vector per group
222223
rows = NULL,
@@ -228,8 +229,8 @@ DataMask <- R6Class("DataMask",
228229
size = NULL,
229230

230231
# Type of data frame
231-
grouped_df = NULL,
232-
rowwise_df = NULL,
232+
grouped = NULL,
233+
rowwise = NULL,
233234

234235
verb = character()
235236
)

R/filter.R

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
#'
4949
#' @family single table verbs
5050
#' @inheritParams arrange
51+
#' @inheritParams args_by
5152
#' @param ... <[`data-masking`][dplyr_data_masking]> Expressions that return a
5253
#' logical value, and are defined in terms of the variables in `.data`.
5354
#' If multiple expressions are included, they are combined with the `&` operator.
@@ -105,23 +106,37 @@
105106
#' .data[[vars[[2]]]] > cond[[2]]
106107
#' )
107108
#' # Learn more in ?dplyr_data_masking
108-
filter <- function(.data, ..., .preserve = FALSE) {
109+
filter <- function(.data, ..., .by = NULL, .preserve = FALSE) {
110+
by <- enquo(.by)
111+
112+
if (!quo_is_null(by) && !is_false(.preserve)) {
113+
abort("Can't supply both `.by` and `.preserve`.")
114+
}
115+
109116
UseMethod("filter")
110117
}
111118

112119
#' @export
113-
filter.data.frame <- function(.data, ..., .preserve = FALSE) {
114-
loc <- filter_rows(.data, ...)
120+
filter.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) {
121+
loc <- filter_rows(.data, ..., .by = {{ .by }})
115122
dplyr_row_slice(.data, loc, preserve = .preserve)
116123
}
117124

118-
filter_rows <- function(.data, ..., error_call = caller_env()) {
125+
filter_rows <- function(.data, ..., .by = NULL, error_call = caller_env()) {
119126
error_call <- dplyr_error_call(error_call)
120127

121128
dots <- dplyr_quosures(...)
122129
check_filter(dots, error_call = error_call)
123130

124-
mask <- DataMask$new(.data, "filter", error_call = error_call)
131+
by <- compute_by(
132+
by = {{ .by }},
133+
data = .data,
134+
by_arg = ".by",
135+
data_arg = ".data",
136+
error_call = error_call
137+
)
138+
139+
mask <- DataMask$new(.data, by, "filter", error_call = error_call)
125140
on.exit(mask$forget(), add = TRUE)
126141

127142
dots <- filter_expand(dots, mask = mask, error_call = error_call)

R/group-by.R

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -244,9 +244,13 @@ add_computed_columns <- function(.data,
244244
if (any(needs_mutate)) {
245245
# TODO: use less of a hack
246246
if (inherits(.data, "data.frame")) {
247+
bare_data <- ungroup(.data)
248+
by <- compute_by(by = NULL, data = bare_data)
249+
247250
cols <- mutate_cols(
248-
ungroup(.data),
251+
bare_data,
249252
dplyr_quosures(!!!vars),
253+
by = by,
250254
error_call = error_call
251255
)
252256

R/group-data.R

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,13 @@ group_keys.data.frame <- function(.tbl, ...) {
9797
.tbl <- group_by(.tbl, ...)
9898
}
9999
out <- group_data(.tbl)
100-
.Call(`dplyr_group_keys`, out)
100+
group_keys0(out)
101101
}
102+
group_keys0 <- function(x) {
103+
# Compute keys directly from `group_data()` results
104+
.Call(`dplyr_group_keys`, x)
105+
}
106+
102107
#' @rdname group_data
103108
#' @export
104109
group_rows <- function(.data) {

0 commit comments

Comments
 (0)