-
Notifications
You must be signed in to change notification settings - Fork 2.1k
Implement .by
#6528
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Implement .by
#6528
Changes from all commits
0abeb53
b2b159e
22de139
2e7f5a7
26e703d
660dfc0
8743441
a56c439
bd47260
c5449d8
d581f68
244caa3
be5f977
0985ddf
ea80373
e7567c9
0a72ad6
9201a7f
fb6ee6d
ce6926f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
#' Helper for consistent documentation of `.by` | ||
#' | ||
#' Use `@inheritParams args_by` to consistently document `.by`. | ||
#' | ||
#' @param .by `r lifecycle::badge("experimental")` | ||
#' | ||
#' <[`tidy-select`][dplyr_tidy_select]> Optionally, a selection of columns to | ||
#' temporarily group by using an inline alternative to [group_by()]. For | ||
#' details and examples, see [?dplyr_by][dplyr_by]. | ||
#' | ||
#' @name args_by | ||
#' @keywords internal | ||
NULL | ||
|
||
#' Temporary grouping with `.by` | ||
#' | ||
#' ```{r, echo = FALSE, results = "asis"} | ||
#' result <- rlang::with_options( | ||
#' knitr::knit_child("man/rmd/by.Rmd"), | ||
#' dplyr.summarise.inform = TRUE | ||
#' ) | ||
#' cat(result, sep = "\n") | ||
#' ``` | ||
#' | ||
#' @name dplyr_by | ||
NULL | ||
|
||
# Build the `dplyr_by` grouping specification for `data`.
#
# `by` is captured with `enquo()` and validated by `check_by()` before
# anything else happens. The result is then assembled as follows:
#
# - `data` is a grouped data frame: type "grouped", using the group variables
#   and group data already stored on `data`.
# - `data` is a rowwise data frame: type "rowwise", built the same way.
# - otherwise `by` is evaluated as a selection against `data`:
#     - a non-empty selection gives type "grouped", with the group data
#       computed by `compute_by_groups()`;
#     - an empty selection (including `by = NULL`) gives type "ungrouped",
#       with `group_data(data)` converted to a tibble.
#
# `...` must be empty. `by_arg` and `data_arg` are forwarded to `check_by()`
# for its error messages. `error_call` is passed through
# `dplyr_error_call()` before being handed to the helpers.
#
# Returns the object created by `new_by()`.
compute_by <- function(by,
                       data,
                       ...,
                       by_arg = "by",
                       data_arg = "data",
                       error_call = caller_env()) {
  check_dots_empty0(...)

  error_call <- dplyr_error_call(error_call)

  by <- enquo(by)
  check_by(
    by,
    data,
    by_arg = by_arg,
    data_arg = data_arg,
    error_call = error_call
  )

  # Grouping already attached to `data` is used as is; `by` is not evaluated.
  if (is_grouped_df(data)) {
    return(new_by(
      type = "grouped",
      names = group_vars(data),
      data = group_data(data)
    ))
  }

  if (is_rowwise_df(data)) {
    return(new_by(
      type = "rowwise",
      names = group_vars(data),
      data = group_data(data)
    ))
  }

  selected <- eval_select_by(by, data, error_call = error_call)

  if (length(selected) == 0L) {
    # `by = NULL` or empty selection
    ungrouped <- as_tibble(group_data(data))
    return(new_by(type = "ungrouped", names = selected, data = ungrouped))
  }

  groups <- compute_by_groups(data, selected, error_call = error_call)
  new_by(type = "grouped", names = selected, data = groups)
}
|
||
# Compute group data for the columns of `data` named in `names`.
#
# The selected columns are passed to `vec_group_loc()`. Its `key` component
# becomes the leading columns of the result and its `loc` component becomes a
# `.rows` list-of column with an integer prototype. The result is a tibble
# with one row per element of the `vec_group_loc()` output.
compute_by_groups <- function(data, names, error_call = caller_env()) {
  keys <- dplyr_col_select(data, names, error_call = error_call)

  # NOTE(review): a PR reviewer remarked that this call could potentially be
  # switched out for another implementation, but that once the number of
  # groups reaches the 100k+ range the group index computation isn't the slow
  # part; the expression evaluation is.
  groups <- vec_group_loc(keys)

  n_groups <- vec_size(groups)

  cols <- dplyr_new_list(groups$key)
  cols[[".rows"]] <- new_list_of(groups$loc, ptype = integer())

  new_tibble(cols, nrow = n_groups)
}
|
||
# Validate that `by` may be used together with `data`.
#
# `by` is expected to be a quosure (it is tested with `quo_is_null()`). A
# `NULL` quosure is always accepted. Otherwise an error is raised through
# `cli::cli_abort()` when `data` is a grouped or a rowwise data frame.
#
# `...` must be empty. `by_arg` and `data_arg` are the argument names
# interpolated into the error message, and `error_call` is the call reported
# by the error.
#
# Returns `NULL`, invisibly, when the check passes.
check_by <- function(by,
                     data,
                     ...,
                     by_arg = "by",
                     data_arg = "data",
                     error_call = caller_env()) {
  check_dots_empty0(...)

  if (quo_is_null(by)) {
    return(invisible(NULL))
  }

  # Select the applicable message, if any. The abort below is issued from
  # this frame so that `{by_arg}` and `{data_arg}` can be interpolated by cli.
  msg <- NULL

  if (is_grouped_df(data)) {
    msg <- paste0(
      "Can't supply {.arg {by_arg}} when ",
      "{.arg {data_arg}} is a grouped data frame."
    )
  } else if (is_rowwise_df(data)) {
    msg <- paste0(
      "Can't supply {.arg {by_arg}} when ",
      "{.arg {data_arg}} is a rowwise data frame."
    )
  }

  if (!is.null(msg)) {
    cli::cli_abort(msg, call = error_call)
  }

  invisible(NULL)
}
|
||
# Evaluate the selection `by` against `data` with
# `tidyselect::eval_select()`, with renaming disallowed, and return only the
# names of the resulting selection.
eval_select_by <- function(by,
                           data,
                           error_call = caller_env()) {
  selection <- tidyselect::eval_select(
    expr = by,
    data = data,
    allow_rename = FALSE,
    error_call = error_call
  )

  names(selection)
}
|
||
# Low-level constructor for a `dplyr_by` object: a list with elements `type`,
# `names`, and `data`, carrying the class "dplyr_by". Inputs are stored as
# given, without validation.
new_by <- function(type, names, data) {
  out <- list(type = type, names = names, data = data)
  class(out) <- "dplyr_by"
  out
}
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do we want to use temporary or transient?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think I went with temporary in documentation because it seemed like an easier verb for users to understand, but I am not tied to it if you feel like transient is clearer