From 0abeb53a6eaafaebdab93c2bd8db04f5c2f7d34b Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Fri, 4 Nov 2022 17:34:29 -0400 Subject: [PATCH 01/20] Implement `.by` for `mutate()` and `summarise()` --- R/by.R | 100 +++++++++++++++++++++++++++++ R/conditions.R | 6 +- R/data-mask.R | 39 +++++------ R/filter.R | 11 +++- R/group-by.R | 6 +- R/group-data.R | 7 +- R/mutate.R | 18 ++++-- R/slice.R | 12 +++- R/summarise.R | 57 +++++++++++----- R/transmute.R | 7 +- man/args_by.Rd | 17 +++++ man/mutate.Rd | 8 +++ man/summarise.Rd | 11 +++- src/chop.cpp | 9 ++- src/dplyr.h | 2 +- src/init.cpp | 2 +- tests/testthat/_snaps/by.md | 33 ++++++++++ tests/testthat/_snaps/mutate.md | 16 +++++ tests/testthat/_snaps/summarise.md | 24 +++++++ tests/testthat/test-across.R | 6 +- tests/testthat/test-by.R | 75 ++++++++++++++++++++++ tests/testthat/test-mutate.R | 51 +++++++++++++++ tests/testthat/test-pick.R | 6 +- tests/testthat/test-summarise.R | 44 +++++++++++++ 24 files changed, 506 insertions(+), 61 deletions(-) create mode 100644 R/by.R create mode 100644 man/args_by.Rd create mode 100644 tests/testthat/_snaps/by.md create mode 100644 tests/testthat/test-by.R diff --git a/R/by.R b/R/by.R new file mode 100644 index 0000000000..2ed9d473a5 --- /dev/null +++ b/R/by.R @@ -0,0 +1,100 @@ +#' Helper for consistent documentation of `.by` +#' +#' Use `@inheritParams args_by` to consistently document `.by`. +#' +#' @param .by `r lifecycle::badge("experimental")` +#' +#' <[`tidy-select`][dplyr_tidy_select]> Optionally, select columns to group +#' by. This grouping will only be active for the duration of this verb. +#' +#' Can't be used when the data is already a grouped or rowwise data frame. +#' +#' @name args_by +#' @keywords internal +NULL + +compute_by <- function(by, + data, + ..., + by_arg = "by", + data_arg = "data", + error_call = caller_env()) { + check_dots_empty0(...) + + error_call <- dplyr_error_call(error_call) + + by <- enquo(by) + check_by(by, data, by_arg = by_arg, data_arg = data_arg, error_call = error_call) + + if (is_grouped_df(data)) { + type <- "grouped" + names <- group_vars(data) + data <- group_data(data) + } else if (is_rowwise_df(data)) { + type <- "rowwise" + names <- group_vars(data) + data <- group_data(data) + } else { + by <- eval_select_by(by, data, error_call = error_call) + + if (length(by) == 0L) { + # `by = NULL` or empty selection + type <- "ungrouped" + names <- by + data <- group_data(data) + } else { + type <- "grouped" + names <- by + data <- compute_groups(data, by, drop = TRUE) + } + } + + new_by(type = type, names = names, data = data) +} + +check_by <- function(by, + data, + ..., + by_arg = "by", + data_arg = "data", + error_call = caller_env()) { + check_dots_empty0(...) + + if (quo_is_null(by)) { + return(invisible(NULL)) + } + + if (is_grouped_df(data)) { + message <- paste0( + "{.arg {by_arg}} can't be supplied when ", + "{.arg {data_arg}} is a grouped data frame." + ) + cli::cli_abort(message, call = error_call) + } + + if (is_rowwise_df(data)) { + message <- paste0( + "{.arg {by_arg}} can't be supplied when ", + "{.arg {data_arg}} is a rowwise data frame." + ) + cli::cli_abort(message, call = error_call) + } + + invisible(NULL) +} + +eval_select_by <- function(by, + data, + error_call = caller_env()) { + out <- tidyselect::eval_select( + expr = by, + data = data, + allow_rename = FALSE, + error_call = error_call + ) + names(out) +} + +new_by <- function(type, names, data) { + structure(list(type = type, names = names, data = data), class = "dplyr_by") +} diff --git a/R/conditions.R b/R/conditions.R index 341e944084..d892362c7f 100644 --- a/R/conditions.R +++ b/R/conditions.R @@ -47,7 +47,7 @@ cnd_bullet_cur_group_label <- function(what = "error") { } cnd_bullet_rowwise_unlist <- function() { - if (peek_mask()$is_rowwise_df()) { + if (peek_mask()$is_rowwise()) { glue_data(peek_error_context(), "Did you mean: `{error_name} = list({error_expression})` ?") } } @@ -131,9 +131,9 @@ dot_as_label <- function(expr) { mask_type <- function(mask = peek_mask()) { if (mask$get_size() > 0) { - if (mask$is_grouped_df()) { + if (mask$is_grouped()) { return("grouped") - } else if (mask$is_rowwise_df()) { + } else if (mask$is_rowwise()) { return("rowwise") } } diff --git a/R/data-mask.R b/R/data-mask.R index 95f6d7bbed..2d5f63a446 100644 --- a/R/data-mask.R +++ b/R/data-mask.R @@ -1,7 +1,7 @@ DataMask <- R6Class("DataMask", public = list( - initialize = function(data, verb, error_call) { - rows <- group_rows(data) + initialize = function(data, by, verb, error_call) { + rows <- by$data$.rows if (length(rows) == 0) { # Specially handle case of zero groups rows <- new_list_of(list(integer()), ptype = integer()) @@ -16,22 +16,23 @@ DataMask <- R6Class("DataMask", abort("Can't transform a data frame with duplicate names.", call = error_call) } names(data) <- names_bindings + private$size <- nrow(data) private$current_data <- dplyr_new_list(data) - private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows) - private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows) + private$grouped <- by$type == "grouped" + private$rowwise <- by$type == "rowwise" - private$grouped_df <- is_grouped_df(data) - private$rowwise_df <- is_rowwise_df(data) + private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows, private$grouped, private$rowwise) + private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows) - private$keys <- group_keys(data) - private$group_vars <- group_vars(data) + private$keys <- group_keys0(by$data) + private$by_names <- by$names private$verb <- verb }, add_one = function(name, chunks, result) { - if (self$is_rowwise_df()){ + if (self$is_rowwise()){ is_scalar_list <- function(.x) { vec_is_list(.x) && length(.x) == 1L } @@ -78,7 +79,7 @@ DataMask <- R6Class("DataMask", # `across(.fns = NULL)`. We should remove this when we defunct those. cols <- self$current_cols(vars) - if (self$is_rowwise_df()) { + if (self$is_rowwise()) { cols <- map2(cols, names(cols), function(col, name) { if (vec_is_list(private$current_data[[name]])) { col <- list(col) @@ -117,7 +118,7 @@ DataMask <- R6Class("DataMask", }, current_non_group_vars = function() { - setdiff(self$current_vars(), private$group_vars) + setdiff(self$current_vars(), private$by_names) }, get_current_group = function() { @@ -172,12 +173,12 @@ DataMask <- R6Class("DataMask", }) }, - is_grouped_df = function() { - private$grouped_df + is_grouped = function() { + private$grouped }, - is_rowwise_df = function() { - private$rowwise_df + is_rowwise = function() { + private$rowwise }, get_keys = function() { @@ -215,8 +216,8 @@ DataMask <- R6Class("DataMask", # ptypes of all the variables current_data = list(), - # names of the grouping variables - group_vars = character(), + # names of the `by` variables + by_names = character(), # list of indices, one integer vector per group rows = NULL, @@ -228,8 +229,8 @@ DataMask <- R6Class("DataMask", size = NULL, # Type of data frame - grouped_df = NULL, - rowwise_df = NULL, + grouped = NULL, + rowwise = NULL, verb = character() ) diff --git a/R/filter.R b/R/filter.R index 92350e1f29..67ade15fbd 100644 --- a/R/filter.R +++ b/R/filter.R @@ -121,7 +121,16 @@ filter_rows <- function(.data, ..., error_call = caller_env()) { dots <- dplyr_quosures(...) check_filter(dots, error_call = error_call) - mask <- DataMask$new(.data, "filter", error_call = error_call) + # TODO: Expose `.by` + by <- compute_by( + by = NULL, + data = .data, + by_arg = ".by", + data_arg = ".data", + error_call = error_call + ) + + mask <- DataMask$new(.data, by, "filter", error_call = error_call) on.exit(mask$forget(), add = TRUE) dots <- filter_expand(dots, mask = mask, error_call = error_call) diff --git a/R/group-by.R b/R/group-by.R index e10c4258d9..8e501cbc72 100644 --- a/R/group-by.R +++ b/R/group-by.R @@ -244,9 +244,13 @@ add_computed_columns <- function(.data, if (any(needs_mutate)) { # TODO: use less of a hack if (inherits(.data, "data.frame")) { + bare_data <- ungroup(.data) + by <- compute_by(by = NULL, data = bare_data) + cols <- mutate_cols( - ungroup(.data), + bare_data, dplyr_quosures(!!!vars), + by = by, error_call = error_call ) diff --git a/R/group-data.R b/R/group-data.R index 5b64e4bc8b..1184bea642 100644 --- a/R/group-data.R +++ b/R/group-data.R @@ -97,8 +97,13 @@ group_keys.data.frame <- function(.tbl, ...) { .tbl <- group_by(.tbl, ...) } out <- group_data(.tbl) - .Call(`dplyr_group_keys`, out) + group_keys0(out) } +group_keys0 <- function(x) { + # Compute keys directly from `group_data()` results + .Call(`dplyr_group_keys`, x) +} + #' @rdname group_data #' @export group_rows <- function(.data) { diff --git a/R/mutate.R b/R/mutate.R index 50c11a4a3d..a3d2c24a93 100644 --- a/R/mutate.R +++ b/R/mutate.R @@ -147,6 +147,9 @@ mutate <- function(.data, ...) { } #' @rdname mutate +#' +#' @inheritParams args_by +#' #' @param .keep #' Control which columns from `.data` are retained in the output. Grouping #' columns and columns created by `...` are always kept. @@ -167,12 +170,15 @@ mutate <- function(.data, ...) { #' @export mutate.data.frame <- function(.data, ..., + .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL) { keep <- arg_match(.keep) - cols <- mutate_cols(.data, dplyr_quosures(...)) + by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") + + cols <- mutate_cols(.data, dplyr_quosures(...), by) used <- attr(cols, "used") out <- dplyr_col_modify(.data, cols) @@ -184,7 +190,7 @@ mutate.data.frame <- function(.data, cols <- compact_null(cols) cols_data <- names(.data) - cols_group <- group_vars(.data) + cols_group <- by$names cols_expr <- names(cols) cols_expr_modified <- intersect(cols_expr, cols_data) @@ -218,13 +224,13 @@ mutate.data.frame <- function(.data, # Helpers ----------------------------------------------------------------- -mutate_cols <- function(.data, dots, error_call = caller_env()) { +mutate_cols <- function(data, dots, by, error_call = caller_env()) { # Collect dots before setting up error handlers (#6178) force(dots) error_call <- dplyr_error_call(error_call) - mask <- DataMask$new(.data, "mutate", error_call = error_call) + mask <- DataMask$new(data, by, "mutate", error_call = error_call) old_current_column <- context_peek_bare("column") on.exit(context_poke("column", old_current_column), add = TRUE) @@ -240,7 +246,7 @@ mutate_cols <- function(.data, dots, error_call = caller_env()) { poke_error_context(dots, i, mask = mask) context_poke("column", old_current_column) - new_columns <- mutate_col(dots[[i]], .data, mask, new_columns) + new_columns <- mutate_col(dots[[i]], data, mask, new_columns) }, error = dplyr_error_handler( dots = dots, @@ -307,7 +313,7 @@ mutate_col <- function(dot, data, mask, new_columns) { chunks <- mask$resolve(name) } - if (inherits(data, "rowwise_df") && vec_is_list(result)) { + if (mask$is_rowwise() && vec_is_list(result)) { sizes <- list_sizes(result) wrong <- which(sizes != 1) if (length(wrong)) { diff --git a/R/slice.R b/R/slice.R index d7b023352b..542390b589 100644 --- a/R/slice.R +++ b/R/slice.R @@ -278,7 +278,17 @@ slice_rows <- function(.data, ..., error_call = caller_env()) { if (is_empty(dots)) { return(TRUE) } - mask <- DataMask$new(.data, "slice", error_call = error_call) + + # TODO: Expose `.by` + by <- compute_by( + by = NULL, + data = .data, + by_arg = ".by", + data_arg = ".data", + error_call = error_call + ) + + mask <- DataMask$new(.data, by, "slice", error_call = error_call) on.exit(mask$forget(), add = TRUE) chunks <- slice_eval(mask, dots, error_call = error_call) diff --git a/R/summarise.R b/R/summarise.R index 5aa2f090a3..0dee253cf7 100644 --- a/R/summarise.R +++ b/R/summarise.R @@ -33,6 +33,7 @@ #' #' @export #' @inheritParams arrange +#' @inheritParams args_by #' @param ... <[`data-masking`][dplyr_data_masking]> Name-value pairs of summary #' functions. The name will be the name of the variable in the result. #' @@ -116,7 +117,7 @@ #' var <- "mass" #' summarise(starwars, avg = mean(.data[[var]], na.rm = TRUE)) #' # Learn more in ?dplyr_data_masking -summarise <- function(.data, ..., .groups = NULL) { +summarise <- function(.data, ..., .by = NULL, .groups = NULL) { UseMethod("summarise") } #' @rdname summarise @@ -124,19 +125,39 @@ summarise <- function(.data, ..., .groups = NULL) { summarize <- summarise #' @export -summarise.data.frame <- function(.data, ..., .groups = NULL) { - cols <- summarise_cols(.data, dplyr_quosures(...)) - out <- summarise_build(.data, cols) +summarise.data.frame <- function(.data, ..., .by = NULL, .groups = NULL) { + by <- enquo(.by) + + if (!quo_is_null(by) && !is.null(.groups)) { + abort("Can't supply both `.by` and `.groups`.") + } + + by <- compute_by(!!by, .data, by_arg = ".by", data_arg = ".data") + + cols <- summarise_cols(.data, dplyr_quosures(...), by) + out <- summarise_build(by, cols) + + if (!is_tibble(.data)) { + # The `by` group data we build from is always a tibble, + # so we have to manually downcast as needed + out <- as.data.frame(out) + } + if (identical(.groups, "rowwise")) { out <- rowwise_df(out, character()) } + out } #' @export -summarise.grouped_df <- function(.data, ..., .groups = NULL) { - cols <- summarise_cols(.data, dplyr_quosures(...)) - out <- summarise_build(.data, cols) +summarise.grouped_df <- function(.data, ..., .by = NULL, .groups = NULL) { + # TODO: Is it right to even expose `.by` here? It lets us catch errors. + # Will always error if `.by != NULL` b/c you can't use it with grouped/rowwise dfs. + by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") + + cols <- summarise_cols(.data, dplyr_quosures(...), by) + out <- summarise_build(by, cols) verbose <- summarise_verbose(.groups, caller_env()) if (is.null(.groups)) { @@ -147,7 +168,7 @@ summarise.grouped_df <- function(.data, ..., .groups = NULL) { } } - group_vars <- group_vars(.data) + group_vars <- by$names if (identical(.groups, "drop_last")) { n <- length(group_vars) if (n > 1) { @@ -177,12 +198,16 @@ summarise.grouped_df <- function(.data, ..., .groups = NULL) { } #' @export -summarise.rowwise_df <- function(.data, ..., .groups = NULL) { - cols <- summarise_cols(.data, dplyr_quosures(...)) - out <- summarise_build(.data, cols) +summarise.rowwise_df <- function(.data, ..., .by = NULL, .groups = NULL) { + # TODO: Is it right to even expose `.by` here? It lets us catch errors. + # Will always error if `.by != NULL` b/c you can't use it with grouped/rowwise dfs. + by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") + + cols <- summarise_cols(.data, dplyr_quosures(...), by) + out <- summarise_build(by, cols) verbose <- summarise_verbose(.groups, caller_env()) - group_vars <- group_vars(.data) + group_vars <- by$names if (is.null(.groups) || identical(.groups, "keep")) { if (verbose && length(group_vars)) { new_groups <- glue_collapse(paste0("'", group_vars, "'"), sep = ", ") @@ -202,10 +227,10 @@ summarise.rowwise_df <- function(.data, ..., .groups = NULL) { out } -summarise_cols <- function(.data, dots, error_call = caller_env()) { +summarise_cols <- function(data, dots, by, error_call = caller_env()) { error_call <- dplyr_error_call(error_call) - mask <- DataMask$new(.data, "summarise", error_call = error_call) + mask <- DataMask$new(data, by, "summarise", error_call = error_call) old_current_column <- context_peek_bare("column") warnings_state <- env(warnings = list()) @@ -343,8 +368,8 @@ summarise_eval_one <- function(quo, mask) { list(chunks = chunks_k, types = types_k, results = result_k) } -summarise_build <- function(.data, cols) { - out <- group_keys(.data) +summarise_build <- function(by, cols) { + out <- group_keys0(by$data) if (!cols$all_one) { out <- vec_slice(out, rep(seq_len(nrow(out)), cols$sizes)) } diff --git a/R/transmute.R b/R/transmute.R index d6024b841c..106cde9860 100644 --- a/R/transmute.R +++ b/R/transmute.R @@ -39,7 +39,10 @@ transmute.data.frame <- function(.data, ...) { dots <- check_transmute_args(...) dots <- dplyr_quosures(!!!dots) - cols <- mutate_cols(.data, dots) + # We don't expose `.by` because `transmute()` is superseded + by <- compute_by(by = NULL, data = .data) + + cols <- mutate_cols(.data, dots, by) out <- dplyr_col_modify(.data, cols) @@ -53,7 +56,7 @@ transmute.data.frame <- function(.data, ...) { cols_expr <- names(cols) # Retain untouched group variables up front - cols_group <- group_vars(.data) + cols_group <- by$names cols_group <- setdiff(cols_group, cols_expr) cols_retain <- c(cols_group, cols_expr) diff --git a/man/args_by.Rd b/man/args_by.Rd new file mode 100644 index 0000000000..d6f6ee9687 --- /dev/null +++ b/man/args_by.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/by.R +\name{args_by} +\alias{args_by} +\title{Helper for consistent documentation of \code{.by}} +\arguments{ +\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, select columns to group +by. This grouping will only be active for the duration of this verb. + +Can't be used when the data is already a grouped or rowwise data frame.} +} +\description{ +Use \verb{@inheritParams args_by} to consistently document \code{.by}. +} +\keyword{internal} diff --git a/man/mutate.Rd b/man/mutate.Rd index 018a302be1..a1410b498a 100644 --- a/man/mutate.Rd +++ b/man/mutate.Rd @@ -10,6 +10,7 @@ mutate(.data, ...) \method{mutate}{data.frame}( .data, ..., + .by = NULL, .keep = c("all", "used", "unused", "none"), .before = NULL, .after = NULL @@ -32,6 +33,13 @@ if ungrouped). \item A data frame or tibble, to create multiple columns in the output. }} +\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, select columns to group +by. This grouping will only be active for the duration of this verb. + +Can't be used when the data is already a grouped or rowwise data frame.} + \item{.keep}{Control which columns from \code{.data} are retained in the output. Grouping columns and columns created by \code{...} are always kept. \itemize{ diff --git a/man/summarise.Rd b/man/summarise.Rd index 99f09dc56d..27fe68e537 100644 --- a/man/summarise.Rd +++ b/man/summarise.Rd @@ -5,9 +5,9 @@ \alias{summarize} \title{Summarise each group to fewer rows} \usage{ -summarise(.data, ..., .groups = NULL) +summarise(.data, ..., .by = NULL, .groups = NULL) -summarize(.data, ..., .groups = NULL) +summarize(.data, ..., .by = NULL, .groups = NULL) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a @@ -24,6 +24,13 @@ The value can be: \item A data frame, to add multiple columns from a single expression. }} +\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, select columns to group +by. This grouping will only be active for the duration of this verb. + +Can't be used when the data is already a grouped or rowwise data frame.} + \item{.groups}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Grouping structure of the result. \itemize{ \item "drop_last": dropping the last level of grouping. This was the diff --git a/src/chop.cpp b/src/chop.cpp index 44a89645d5..06c73a9825 100644 --- a/src/chop.cpp +++ b/src/chop.cpp @@ -66,7 +66,10 @@ void dplyr_lazy_vec_chop_ungrouped(SEXP chops_env, SEXP data) { UNPROTECT(1); } -SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows) { +SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows, SEXP ffi_grouped, SEXP ffi_rowwise) { + bool grouped = static_cast(LOGICAL_ELT(ffi_grouped, 0)); + bool rowwise = static_cast(LOGICAL_ELT(ffi_rowwise, 0)); + // a first environment to hide `.indices` and `.current_group` // this is for example used by funs:: SEXP indices_env = PROTECT(new_environment(2, R_EmptyEnv)); @@ -75,9 +78,9 @@ SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows) { // then an environment to hold the chops of the columns SEXP chops_env = PROTECT(new_environment(XLENGTH(data), indices_env)); - if (Rf_inherits(data, "grouped_df")) { + if (grouped) { dplyr_lazy_vec_chop_grouped(chops_env, rows, data, false); - } else if (Rf_inherits(data, "rowwise_df")) { + } else if (rowwise) { dplyr_lazy_vec_chop_grouped(chops_env, rows, data, true); } else { dplyr_lazy_vec_chop_ungrouped(chops_env, data); diff --git a/src/dplyr.h b/src/dplyr.h index 2517a539ed..d00e601f12 100644 --- a/src/dplyr.h +++ b/src/dplyr.h @@ -112,7 +112,7 @@ SEXP dplyr_group_keys(SEXP group_data); SEXP dplyr_mask_remove(SEXP env_private, SEXP s_name); SEXP dplyr_mask_add(SEXP env_private, SEXP s_name, SEXP ptype, SEXP chunks); -SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows); +SEXP dplyr_lazy_vec_chop(SEXP data, SEXP rows, SEXP ffi_grouped, SEXP ffi_rowwise); SEXP dplyr_data_masks_setup(SEXP chops, SEXP data, SEXP rows); SEXP env_resolved(SEXP env, SEXP names); void add_mask_binding(SEXP name, SEXP env_bindings, SEXP env_chops); diff --git a/src/init.cpp b/src/init.cpp index ba354ad591..7bf0b26f92 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -114,7 +114,7 @@ static const R_CallMethodDef CallEntries[] = { {"dplyr_mask_remove", (DL_FUNC)& dplyr_mask_remove, 2}, {"dplyr_mask_add", (DL_FUNC)& dplyr_mask_add, 4}, - {"dplyr_lazy_vec_chop_impl", (DL_FUNC)& dplyr_lazy_vec_chop, 2}, + {"dplyr_lazy_vec_chop_impl", (DL_FUNC)& dplyr_lazy_vec_chop, 4}, {"dplyr_data_masks_setup", (DL_FUNC)& dplyr_data_masks_setup, 3}, {"env_resolved", (DL_FUNC)& env_resolved, 2}, diff --git a/tests/testthat/_snaps/by.md b/tests/testthat/_snaps/by.md new file mode 100644 index 0000000000..d9ae71c204 --- /dev/null +++ b/tests/testthat/_snaps/by.md @@ -0,0 +1,33 @@ +# throws tidyselect errors + + Code + compute_by(by = y, data = df) + Condition + Error: + ! Can't subset columns that don't exist. + x Column `y` doesn't exist. + +# can't set `.by` with a grouped-df + + Code + compute_by(x, gdf) + Condition + Error: + ! `by` can't be supplied when `data` is a grouped data frame. + +# can't set `.by` with a rowwise-df + + Code + compute_by(x, rdf) + Condition + Error: + ! `by` can't be supplied when `data` is a rowwise data frame. + +# can tweak the error args + + Code + compute_by(x, gdf, by_arg = "x", data_arg = "dat") + Condition + Error: + ! `x` can't be supplied when `dat` is a grouped data frame. + diff --git a/tests/testthat/_snaps/mutate.md b/tests/testthat/_snaps/mutate.md index 1e3edd3e00..b406242fde 100644 --- a/tests/testthat/_snaps/mutate.md +++ b/tests/testthat/_snaps/mutate.md @@ -59,6 +59,22 @@ ! `y` must be size 1, not 2. i Did you mean: `y = list(x)` ? +# catches `.by` with grouped-df + + Code + mutate(gdf, .by = x) + Condition + Error in `mutate()`: + ! `.by` can't be supplied when `.data` is a grouped data frame. + +# catches `.by` with rowwise-df + + Code + mutate(rdf, .by = x) + Condition + Error in `mutate()`: + ! `.by` can't be supplied when `.data` is a rowwise data frame. + # mutate() deals with 0 groups (#5534) Code diff --git a/tests/testthat/_snaps/summarise.md b/tests/testthat/_snaps/summarise.md index c9fd3c2a86..987d1e8745 100644 --- a/tests/testthat/_snaps/summarise.md +++ b/tests/testthat/_snaps/summarise.md @@ -1,3 +1,27 @@ +# can't use `.by` with `.groups` + + Code + summarise(df, .by = x, .groups = "drop") + Condition + Error in `summarise()`: + ! Can't supply both `.by` and `.groups`. + +# catches `.by` with grouped-df + + Code + summarise(gdf, .by = x) + Condition + Error in `summarise()`: + ! `.by` can't be supplied when `.data` is a grouped data frame. + +# catches `.by` with rowwise-df + + Code + summarise(rdf, .by = x) + Condition + Error in `summarise()`: + ! `.by` can't be supplied when `.data` is a rowwise data frame. + # summarise() gives meaningful errors Code diff --git a/tests/testthat/test-across.R b/tests/testthat/test-across.R index b387ea284f..b662b7b5fa 100644 --- a/tests/testthat/test-across.R +++ b/tests/testthat/test-across.R @@ -913,7 +913,8 @@ test_that("expand_across() expands lambdas", { index = 1 ) - DataMask$new(mtcars, "mutate", call("caller")) + by <- compute_by(by = NULL, data = mtcars, error_call = call("caller")) + DataMask$new(mtcars, by, "mutate", call("caller")) expect_equal( map(expand_across(quo), quo_get_expr), @@ -934,7 +935,8 @@ test_that("expand_if_across() expands lambdas", { index = 1 ) - DataMask$new(mtcars, "mutate", call("caller")) + by <- compute_by(by = NULL, data = mtcars, error_call = call("caller")) + DataMask$new(mtcars, by, "mutate", call("caller")) expect_equal( map(expand_if_across(quo), quo_squash), diff --git a/tests/testthat/test-by.R b/tests/testthat/test-by.R new file mode 100644 index 0000000000..8e9b4ca9f5 --- /dev/null +++ b/tests/testthat/test-by.R @@ -0,0 +1,75 @@ +test_that("computes group data when `by` is set", { + df <- tibble(x = c(1, 1, 2, 2, 1)) + + out <- compute_by(by = x, data = df) + expect_identical(out$type, "grouped") + expect_identical(out$names, "x") + expect_identical(out$data, group_data(group_by(df, x))) + + # In particular, sets `drop = TRUE` + expect_true(attr(out$data, ".drop")) +}) + +test_that("extracts existing data when `by = NULL`", { + df <- tibble(x = c(1, 1, 2, 2, 1)) + out <- compute_by(by = NULL, data = df) + expect_identical(out$type, "ungrouped") + expect_identical(out$names, character()) + expect_identical(out$data, group_data(df)) + + gdf <- group_by(df, x) + out <- compute_by(by = NULL, data = gdf) + expect_identical(out$type, "grouped") + expect_identical(out$names, "x") + expect_identical(out$data, group_data(gdf)) + + rdf <- rowwise(df) + out <- compute_by(by = NULL, data = rdf) + expect_identical(out$type, "rowwise") + expect_identical(out$names, character()) + expect_identical(out$data, group_data(rdf)) +}) + +test_that("empty selection results in ungrouped group data", { + df <- tibble(x = 1) + + out <- compute_by(by = c(), data = df) + expect_identical(out$type, "ungrouped") + expect_identical(out$names, character()) + expect_identical(out$data, group_data(df)) +}) + +test_that("throws tidyselect errors", { + df <- tibble(x = 1) + + expect_snapshot(error = TRUE, { + compute_by(by = y, data = df) + }) +}) + +test_that("can't set `.by` with a grouped-df", { + df <- tibble(x = 1:5) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + compute_by(x, gdf) + }) +}) + +test_that("can't set `.by` with a rowwise-df", { + df <- tibble(x = 1:5) + rdf <- rowwise(df) + + expect_snapshot(error = TRUE, { + compute_by(x, rdf) + }) +}) + +test_that("can tweak the error args", { + df <- tibble(x = 1:5) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + compute_by(x, gdf, by_arg = "x", data_arg = "dat") + }) +}) diff --git a/tests/testthat/test-mutate.R b/tests/testthat/test-mutate.R index b0631a1557..122eee315b 100644 --- a/tests/testthat/test-mutate.R +++ b/tests/testthat/test-mutate.R @@ -3,16 +3,22 @@ test_that("empty mutate returns input", { gf <- group_by(df, x) expect_equal(mutate(df), df) + expect_equal(mutate(df, .by = x), df) expect_equal(mutate(gf), gf) expect_equal(mutate(df, !!!list()), df) + expect_equal(mutate(df, !!!list(), .by = x), df) expect_equal(mutate(gf, !!!list()), gf) }) test_that("rownames preserved", { df <- data.frame(x = c(1, 2), row.names = c("a", "b")) + df <- mutate(df, y = 2) expect_equal(row.names(df), c("a", "b")) + + df <- mutate(df, y = 2, .by = x) + expect_equal(row.names(df), c("a", "b")) }) test_that("mutations applied progressively", { @@ -75,6 +81,7 @@ test_that("assignments don't overwrite variables (#315)", { }) test_that("can mutate a data frame with zero columns and `NULL` column names", { + skip("Do we really care about this gremlin case? This data frame is unusable.") df <- new_data_frame(n = 2L) colnames(df) <- NULL expect_equal(mutate(df, x = 1), data.frame(x = c(1, 1))) @@ -382,6 +389,50 @@ test_that("DataMask$add() forces chunks (#4677)", { expect_equal(df$log_e_bf01, log(1 / 0.244)) }) +# .by ------------------------------------------------------------------------- + +test_that("can group transiently using `.by`", { + df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) + + out <- mutate(df, x = mean(x), .by = g) + + expect_identical(out$g, df$g) + expect_identical(out$x, c(3, 3, 2, 3, 2)) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains bare data.frame class", { + df <- data.frame(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) + out <- mutate(df, x = mean(x), .by = g) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("can `NULL` out the `.by` column", { + df <- tibble(x = 1:3) + + expect_identical( + mutate(df, x = NULL, .by = x), + new_tibble(list(), nrow = 3) + ) +}) + +test_that("catches `.by` with grouped-df", { + df <- tibble(x = 1) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + mutate(gdf, .by = x) + }) +}) + +test_that("catches `.by` with rowwise-df", { + df <- tibble(x = 1) + rdf <- rowwise(df) + + expect_snapshot(error = TRUE, { + mutate(rdf, .by = x) + }) +}) # .before, .after, .keep ------------------------------------------------------ diff --git a/tests/testthat/test-pick.R b/tests/testthat/test-pick.R index 8441cb82cf..48a439042a 100644 --- a/tests/testthat/test-pick.R +++ b/tests/testthat/test-pick.R @@ -503,7 +503,8 @@ test_that("`pick()` can be used inside `group_by()` wrappers", { test_that("`pick()` doesn't expand across anonymous function boundaries", { df <- tibble(x = 1, y = 2) - mask <- DataMask$new(df, verb = "mutate", error_call = current_env()) + by <- compute_by(by = NULL, data = df, error_call = current_env()) + mask <- DataMask$new(df, by, verb = "mutate", error_call = current_env()) # With inline `function() { }` calls (this also handles native R anonymous functions) quo <- dplyr_quosures(z = function() pick(y, x))$z @@ -516,7 +517,8 @@ test_that("`pick()` doesn't expand across anonymous function boundaries", { test_that("`pick()` expands embedded quosures", { df <- tibble(x = 1, y = 2) - mask <- DataMask$new(df, verb = "mutate", error_call = current_env()) + by <- compute_by(by = NULL, data = df, error_call = current_env()) + mask <- DataMask$new(df, by, verb = "mutate", error_call = current_env()) wrapper <- function(x) { dplyr_quosures(z = dense_rank({{x}})) diff --git a/tests/testthat/test-summarise.R b/tests/testthat/test-summarise.R index c42b39629c..44e3355bc1 100644 --- a/tests/testthat/test-summarise.R +++ b/tests/testthat/test-summarise.R @@ -257,6 +257,50 @@ test_that("summarise() silently skips when all results are NULL (#5708)", { expect_error(summarise(df, x = if(g == 1) 42)) }) +# .by ---------------------------------------------------------------------- + +test_that("can group transiently using `.by`", { + df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) + + out <- summarise(df, x = mean(x), .by = g) + + expect_identical(out$g, c(1, 2)) + expect_identical(out$x, c(3, 2)) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains bare data.frame class", { + df <- data.frame(g = c(1, 1, 2, 1, 2), x = c(5, 2, 1, 2, 3)) + out <- summarise(df, x = mean(x), .by = g) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("can't use `.by` with `.groups`", { + df <- tibble(x = 1) + + expect_snapshot(error = TRUE, { + summarise(df, .by = x, .groups = "drop") + }) +}) + +test_that("catches `.by` with grouped-df", { + df <- tibble(x = 1) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + summarise(gdf, .by = x) + }) +}) + +test_that("catches `.by` with rowwise-df", { + df <- tibble(x = 1) + rdf <- rowwise(df) + + expect_snapshot(error = TRUE, { + summarise(rdf, .by = x) + }) +}) + # errors ------------------------------------------------------------------- test_that("summarise() preserves the call stack on error (#5308)", { From b2b159eed85e3b5de2a4bed10ce8e9c757412ff1 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 7 Nov 2022 09:51:06 -0500 Subject: [PATCH 02/20] Add tests related to #6100 --- tests/testthat/test-mutate.R | 15 +++++++++++++++ tests/testthat/test-summarise.R | 17 +++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/tests/testthat/test-mutate.R b/tests/testthat/test-mutate.R index 122eee315b..0f21f4cbd0 100644 --- a/tests/testthat/test-mutate.R +++ b/tests/testthat/test-mutate.R @@ -407,6 +407,21 @@ test_that("transient grouping retains bare data.frame class", { expect_s3_class(out, class(df), exact = TRUE) }) +test_that("transient grouping retains data frame attributes (#6100)", { + # With data.frames or tibbles + df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1)) + tbl <- as_tibble(df) + + attr(df, "foo") <- "bar" + attr(tbl, "foo") <- "bar" + + out <- mutate(df, x = mean(x), .by = g) + expect_identical(attr(out, "foo"), "bar") + + out <- mutate(tbl, x = mean(x), .by = g) + expect_identical(attr(out, "foo"), "bar") +}) + test_that("can `NULL` out the `.by` column", { df <- tibble(x = 1:3) diff --git a/tests/testthat/test-summarise.R b/tests/testthat/test-summarise.R index 44e3355bc1..b1c5fa1e0f 100644 --- a/tests/testthat/test-summarise.R +++ b/tests/testthat/test-summarise.R @@ -275,6 +275,23 @@ test_that("transient grouping retains bare data.frame class", { expect_s3_class(out, class(df), exact = TRUE) }) +test_that("transient grouping drops data frame attributes", { + # Because `summarise()` theoretically creates a "new" data frame + + # With data.frames or tibbles + df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1)) + tbl <- as_tibble(df) + + attr(df, "foo") <- "bar" + attr(tbl, "foo") <- "bar" + + out <- summarise(df, x = mean(x), .by = g) + expect_null(attr(out, "foo")) + + out <- summarise(tbl, x = mean(x), .by = g) + expect_null(attr(out, "foo")) +}) + test_that("can't use `.by` with `.groups`", { df <- tibble(x = 1) From 22de139c85458d01490cd7235c1c12fa555330e0 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 7 Nov 2022 10:57:20 -0500 Subject: [PATCH 03/20] Add `.by` support in `filter()` --- R/filter.R | 18 ++++++---- man/filter.Rd | 9 ++++- tests/testthat/_snaps/filter.md | 24 ++++++++++++++ tests/testthat/test-filter.R | 59 +++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 7 deletions(-) diff --git a/R/filter.R b/R/filter.R index 67ade15fbd..780b1fa504 100644 --- a/R/filter.R +++ b/R/filter.R @@ -48,6 +48,7 @@ #' #' @family single table verbs #' @inheritParams arrange +#' @inheritParams args_by #' @param ... <[`data-masking`][dplyr_data_masking]> Expressions that return a #' logical value, and are defined in terms of the variables in `.data`. #' If multiple expressions are included, they are combined with the `&` operator. @@ -105,25 +106,30 @@ #' .data[[vars[[2]]]] > cond[[2]] #' ) #' # Learn more in ?dplyr_data_masking -filter <- function(.data, ..., .preserve = FALSE) { +filter <- function(.data, ..., .by = NULL, .preserve = FALSE) { UseMethod("filter") } #' @export -filter.data.frame <- function(.data, ..., .preserve = FALSE) { - loc <- filter_rows(.data, ...) +filter.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { + by <- enquo(.by) + + if (!quo_is_null(by) && !is_false(.preserve)) { + abort("Can't supply both `.by` and `.preserve`.") + } + + loc <- filter_rows(.data, ..., .by = !!by) dplyr_row_slice(.data, loc, preserve = .preserve) } -filter_rows <- function(.data, ..., error_call = caller_env()) { +filter_rows <- function(.data, ..., .by = NULL, error_call = caller_env()) { error_call <- dplyr_error_call(error_call) dots <- dplyr_quosures(...) check_filter(dots, error_call = error_call) - # TODO: Expose `.by` by <- compute_by( - by = NULL, + by = {{ .by }}, data = .data, by_arg = ".by", data_arg = ".data", diff --git a/man/filter.Rd b/man/filter.Rd index 431013b4e9..1b845050b9 100644 --- a/man/filter.Rd +++ b/man/filter.Rd @@ -4,7 +4,7 @@ \alias{filter} \title{Keep rows that match a condition} \usage{ -filter(.data, ..., .preserve = FALSE) +filter(.data, ..., .by = NULL, .preserve = FALSE) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a @@ -16,6 +16,13 @@ logical value, and are defined in terms of the variables in \code{.data}. If multiple expressions are included, they are combined with the \code{&} operator. Only rows for which all conditions evaluate to \code{TRUE} are kept.} +\item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, select columns to group +by. This grouping will only be active for the duration of this verb. + +Can't be used when the data is already a grouped or rowwise data frame.} + \item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure is recalculated based on the resulting data, otherwise the grouping is kept as is.} diff --git a/tests/testthat/_snaps/filter.md b/tests/testthat/_snaps/filter.md index 128e727f1e..405f797358 100644 --- a/tests/testthat/_snaps/filter.md +++ b/tests/testthat/_snaps/filter.md @@ -229,3 +229,27 @@ x y 1 1 1 +# can't use `.by` with `.preserve` + + Code + filter(df, .by = x, .preserve = TRUE) + Condition + Error in `filter()`: + ! Can't supply both `.by` and `.preserve`. + +# catches `.by` with grouped-df + + Code + filter(gdf, .by = x) + Condition + Error in `filter()`: + ! `.by` can't be supplied when `.data` is a grouped data frame. + +# catches `.by` with rowwise-df + + Code + filter(rdf, .by = x) + Condition + Error in `filter()`: + ! `.by` can't be supplied when `.data` is a rowwise data frame. + diff --git a/tests/testthat/test-filter.R b/tests/testthat/test-filter.R index f7c33d3cc7..ac78adc544 100644 --- a/tests/testthat/test-filter.R +++ b/tests/testthat/test-filter.R @@ -587,3 +587,62 @@ test_that("if_any() and if_all() work", { filter(df, x1 > 6 | x2 > 6) ) }) + +# .by ------------------------------------------------------------------------- + +test_that("can group transiently using `.by`", { + df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 10, 1, 2, 3)) + + out <- filter(df, x > mean(x), .by = g) + + expect_identical(out$g, c(1, 2)) + expect_identical(out$x, c(10, 3)) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains bare data.frame class", { + df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 10, 1, 2, 3)) + out <- filter(df, x > mean(x), .by = g) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains data frame attributes", { + # With data.frames or tibbles + df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 1)) + tbl <- as_tibble(df) + + attr(df, "foo") <- "bar" + attr(tbl, "foo") <- "bar" + + out <- filter(df, x > mean(x), .by = g) + expect_identical(attr(out, "foo"), "bar") + + out <- filter(tbl, x > mean(x), .by = g) + expect_identical(attr(out, "foo"), "bar") +}) + +test_that("can't use `.by` with `.preserve`", { + df <- tibble(x = 1) + + expect_snapshot(error = TRUE, { + filter(df, .by = x, .preserve = TRUE) + }) +}) + +test_that("catches `.by` with grouped-df", { + df <- tibble(x = 1) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + filter(gdf, .by = x) + }) +}) + +test_that("catches `.by` with rowwise-df", { + df <- tibble(x = 1) + rdf <- rowwise(df) + + expect_snapshot(error = TRUE, { + filter(rdf, .by = x) + }) +}) From 2e7f5a7020f4f957b0027b627148662130527ced Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 7 Nov 2022 13:26:16 -0500 Subject: [PATCH 04/20] Add `.by` support in `slice()` family --- R/slice.R | 156 ++++++++++++++++++++------------- man/slice.Rd | 37 ++++++-- tests/testthat/_snaps/slice.md | 52 +++++++++++ tests/testthat/test-slice.R | 88 +++++++++++++++++++ 4 files changed, 267 insertions(+), 66 deletions(-) diff --git a/R/slice.R b/R/slice.R index 542390b589..eb0dafc8da 100644 --- a/R/slice.R +++ b/R/slice.R @@ -20,6 +20,7 @@ #' operation, use [filter()] and [row_number()]. #' #' @family single table verbs +#' @inheritParams args_by #' @inheritParams arrange #' @inheritParams filter #' @param ... For `slice()`: <[`data-masking`][dplyr_data_masking]> Integer row @@ -118,52 +119,62 @@ #' filter(mtcars, row_number() == 1L) #' filter(mtcars, row_number() == n()) #' filter(mtcars, between(row_number(), 5, n())) -slice <- function(.data, ..., .preserve = FALSE) { +slice <- function(.data, ..., .by = NULL, .preserve = FALSE) { UseMethod("slice") } #' @export -slice.data.frame <- function(.data, ..., .preserve = FALSE) { - loc <- slice_rows(.data, ..., error_call = current_env()) +slice.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { + by <- enquo(.by) + + if (!quo_is_null(by) && !is_false(.preserve)) { + abort("Can't supply both `.by` and `.preserve`.") + } + + loc <- slice_rows(.data, ..., .by = !!by) dplyr_row_slice(.data, loc, preserve = .preserve) } #' @export #' @rdname slice -slice_head <- function(.data, ..., n, prop) { +slice_head <- function(.data, ..., n, prop, by = NULL) { check_slice_dots(..., n = n, prop = prop) UseMethod("slice_head") } #' @export -slice_head.data.frame <- function(.data, ..., n, prop) { +slice_head.data.frame <- function(.data, ..., n, prop, by = NULL) { size <- get_slice_size(n = n, prop = prop) idx <- function(n) { seq2(1, size(n)) } dplyr_local_error_call() - slice(.data, idx(dplyr::n())) + dplyr_local_slice_by_arg("by") + + slice(.data, idx(dplyr::n()), .by = {{ by }}) } #' @export #' @rdname slice -slice_tail <- function(.data, ..., n, prop) { +slice_tail <- function(.data, ..., n, prop, by = NULL) { check_slice_dots(..., n = n, prop = prop) UseMethod("slice_tail") } #' @export -slice_tail.data.frame <- function(.data, ..., n, prop) { +slice_tail.data.frame <- function(.data, ..., n, prop, by = NULL) { size <- get_slice_size(n = n, prop = prop) idx <- function(n) { seq2(n - size(n) + 1, n) } dplyr_local_error_call() - slice(.data, idx(dplyr::n())) + dplyr_local_slice_by_arg("by") + + slice(.data, idx(dplyr::n()), .by = {{ by }}) } #' @export @@ -178,7 +189,7 @@ slice_tail.data.frame <- function(.data, ..., n, prop) { #' If `FALSE`, `NA` values are sorted to the end (like in [arrange()]), so #' they will only be included if there are insufficient non-missing values to #' reach `n`/`prop`. -slice_min <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) { +slice_min <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) { check_required(order_by) check_slice_dots(..., n = n, prop = prop) check_bool(with_ties) @@ -188,28 +199,34 @@ slice_min <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = F } #' @export -slice_min.data.frame <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) { +slice_min.data.frame <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) { size <- get_slice_size(n = n, prop = prop) - dplyr_local_error_call() - slice(.data, local({ - n <- dplyr::n() - order_by <- {{ order_by }} - vec_assert(order_by, size = n) - - slice_rank_idx( - order_by, - size(n), - direction = "asc", - with_ties = with_ties, - na_rm = na_rm - ) - })) + dplyr_local_error_call() + dplyr_local_slice_by_arg("by") + + slice( + .data, + .by = {{ by }}, + local({ + n <- dplyr::n() + order_by <- {{ order_by }} + vec_assert(order_by, size = n) + + slice_rank_idx( + order_by, + size(n), + direction = "asc", + with_ties = with_ties, + na_rm = na_rm + ) + }) + ) } #' @export #' @rdname slice -slice_max <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) { +slice_max <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) { check_required(order_by) check_slice_dots(..., n = n, prop = prop) check_bool(with_ties) @@ -219,24 +236,30 @@ slice_max <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = F } #' @export -slice_max.data.frame <- function(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) { +slice_max.data.frame <- function(.data, order_by, ..., n, prop, by = NULL, with_ties = TRUE, na_rm = FALSE) { size <- get_slice_size(n = n, prop = prop) dplyr_local_error_call() - slice(.data, local({ - n <- dplyr::n() - order_by <- {{ order_by }} - - vec_assert(order_by, size = n) - - slice_rank_idx( - order_by, - size(n), - direction = "desc", - with_ties = with_ties, - na_rm = na_rm - ) - })) + dplyr_local_slice_by_arg("by") + + slice( + .data, + .by = {{ by }}, + local({ + n <- dplyr::n() + order_by <- {{ order_by }} + + vec_assert(order_by, size = n) + + slice_rank_idx( + order_by, + size(n), + direction = "desc", + with_ties = with_ties, + na_rm = na_rm + ) + }) + ) } #' @export @@ -246,7 +269,7 @@ slice_max.data.frame <- function(.data, order_by, ..., n, prop, with_ties = TRUE #' @param weight_by <[`data-masking`][dplyr_data_masking]> Sampling weights. #' This must evaluate to a vector of non-negative numbers the same length as #' the input. Weights are automatically standardised to sum to 1. -slice_sample <- function(.data, ..., n, prop, weight_by = NULL, replace = FALSE) { +slice_sample <- function(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE) { check_slice_dots(..., n = n, prop = prop) check_bool(replace) @@ -254,40 +277,45 @@ slice_sample <- function(.data, ..., n, prop, weight_by = NULL, replace = FALSE) } #' @export -slice_sample.data.frame <- function(.data, ..., n, prop, weight_by = NULL, replace = FALSE) { +slice_sample.data.frame <- function(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE) { size <- get_slice_size(n = n, prop = prop, allow_outsize = replace) dplyr_local_error_call() - slice(.data, local({ - weight_by <- {{ weight_by }} + dplyr_local_slice_by_arg("by") - n <- dplyr::n() - if (!is.null(weight_by)) { - weight_by <- vec_assert(weight_by, size = n, arg = "weight_by") - } - sample_int(n, size(n), replace = replace, wt = weight_by) - })) + slice( + .data, + .by = {{ by }}, + local({ + weight_by <- {{ weight_by }} + + n <- dplyr::n() + if (!is.null(weight_by)) { + weight_by <- vec_assert(weight_by, size = n, arg = "weight_by") + } + sample_int(n, size(n), replace = replace, wt = weight_by) + }) + ) } # helpers ----------------------------------------------------------------- -slice_rows <- function(.data, ..., error_call = caller_env()) { +slice_rows <- function(.data, ..., .by = NULL, error_call = caller_env()) { error_call <- dplyr_error_call(error_call) - dots <- enquos(...) - if (is_empty(dots)) { - return(TRUE) - } - - # TODO: Expose `.by` by <- compute_by( - by = NULL, + by = {{ .by }}, data = .data, - by_arg = ".by", + by_arg = the$slice_by_arg, data_arg = ".data", error_call = error_call ) + dots <- enquos(...) + if (is_empty(dots)) { + return(TRUE) + } + mask <- DataMask$new(.data, by, "slice", error_call = error_call) on.exit(mask$forget(), add = TRUE) @@ -510,6 +538,14 @@ slice_rank_idx <- function( which[order(ranks[which])] } +on_load({ + # Default used by `slice()` + the$slice_by_arg <- ".by" +}) +dplyr_local_slice_by_arg <- function(by_arg, frame = caller_env()) { + local_bindings(slice_by_arg = by_arg, .env = the, .frame = frame) +} + # Backports for R 3.5.0 utils ...length2 <- function(frame = caller_env()) { length(env_get(frame, "...")) diff --git a/man/slice.Rd b/man/slice.Rd index c75a4fd59b..28c24eaac0 100644 --- a/man/slice.Rd +++ b/man/slice.Rd @@ -9,17 +9,35 @@ \alias{slice_sample} \title{Subset rows using their positions} \usage{ -slice(.data, ..., .preserve = FALSE) +slice(.data, ..., .by = NULL, .preserve = FALSE) -slice_head(.data, ..., n, prop) +slice_head(.data, ..., n, prop, by = NULL) -slice_tail(.data, ..., n, prop) +slice_tail(.data, ..., n, prop, by = NULL) -slice_min(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) +slice_min( + .data, + order_by, + ..., + n, + prop, + by = NULL, + with_ties = TRUE, + na_rm = FALSE +) -slice_max(.data, order_by, ..., n, prop, with_ties = TRUE, na_rm = FALSE) +slice_max( + .data, + order_by, + ..., + n, + prop, + by = NULL, + with_ties = TRUE, + na_rm = FALSE +) -slice_sample(.data, ..., n, prop, weight_by = NULL, replace = FALSE) +slice_sample(.data, ..., n, prop, by = NULL, weight_by = NULL, replace = FALSE) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. a tibble), or a @@ -35,6 +53,13 @@ Indices beyond the number of rows in the input are silently ignored. For \verb{slice_*()}, these arguments are passed on to methods.} +\item{.by, by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} + +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, select columns to group +by. This grouping will only be active for the duration of this verb. + +Can't be used when the data is already a grouped or rowwise data frame.} + \item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure is recalculated based on the resulting data, otherwise the grouping is kept as is.} diff --git a/tests/testthat/_snaps/slice.md b/tests/testthat/_snaps/slice.md index d7d90c88ee..dde7d7d033 100644 --- a/tests/testthat/_snaps/slice.md +++ b/tests/testthat/_snaps/slice.md @@ -47,6 +47,30 @@ Caused by error in `1 + ""`: ! non-numeric argument to binary operator +# can't use `.by` with `.preserve` + + Code + slice(df, .by = x, .preserve = TRUE) + Condition + Error in `slice()`: + ! Can't supply both `.by` and `.preserve`. + +# catches `.by` with grouped-df + + Code + slice(gdf, .by = x) + Condition + Error in `slice()`: + ! `.by` can't be supplied when `.data` is a grouped data frame. + +# catches `.by` with rowwise-df + + Code + slice(rdf, .by = x) + Condition + Error in `slice()`: + ! `.by` can't be supplied when `.data` is a rowwise data frame. + # slice_helpers() call get_slice_size() Code @@ -196,6 +220,34 @@ * ..2 = 2 i Did you forget to name an argument? +# slice_helper `by` errors use correct error context and correct `by_arg` + + Code + slice_head(gdf, n = 1, by = x) + Condition + Error in `slice_head()`: + ! `by` can't be supplied when `.data` is a grouped data frame. + Code + slice_tail(gdf, n = 1, by = x) + Condition + Error in `slice_tail()`: + ! `by` can't be supplied when `.data` is a grouped data frame. + Code + slice_min(gdf, order_by = x, by = x) + Condition + Error in `slice_min()`: + ! `by` can't be supplied when `.data` is a grouped data frame. + Code + slice_max(gdf, order_by = x, by = x) + Condition + Error in `slice_max()`: + ! `by` can't be supplied when `.data` is a grouped data frame. + Code + slice_sample(gdf, n = 1, by = x) + Condition + Error in `slice_sample()`: + ! `by` can't be supplied when `.data` is a grouped data frame. + # slice_min/max() check size of `order_by=` (#5922) Code diff --git a/tests/testthat/test-slice.R b/tests/testthat/test-slice.R index 8e78cde13d..d3ad543c87 100644 --- a/tests/testthat/test-slice.R +++ b/tests/testthat/test-slice.R @@ -91,6 +91,63 @@ test_that("user errors are correctly labelled", { }) }) +test_that("can group transiently using `.by`", { + df <- tibble(g = c(1, 1, 2), x = c(1, 2, 3)) + + out <- slice(df, n(), .by = g) + + expect_identical(out$g, c(1, 2)) + expect_identical(out$x, c(2, 3)) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains bare data.frame class", { + df <- tibble(g = c(1, 1, 2), x = c(1, 2, 3)) + out <- slice(df, n(), .by = g) + expect_s3_class(out, class(df), exact = TRUE) +}) + +test_that("transient grouping retains data frame attributes", { + # With data.frames or tibbles + df <- data.frame(g = c(1, 1, 2), x = c(1, 2, 3)) + tbl <- as_tibble(df) + + attr(df, "foo") <- "bar" + attr(tbl, "foo") <- "bar" + + out <- slice(df, n(), .by = g) + expect_identical(attr(out, "foo"), "bar") + + out <- slice(tbl, n(), .by = g) + expect_identical(attr(out, "foo"), "bar") +}) + +test_that("can't use `.by` with `.preserve`", { + df <- tibble(x = 1) + + expect_snapshot(error = TRUE, { + slice(df, .by = x, .preserve = TRUE) + }) +}) + +test_that("catches `.by` with grouped-df", { + df <- tibble(x = 1) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + slice(gdf, .by = x) + }) +}) + +test_that("catches `.by` with rowwise-df", { + df <- tibble(x = 1) + rdf <- rowwise(df) + + expect_snapshot(error = TRUE, { + slice(rdf, .by = x) + }) +}) + # Slice variants ---------------------------------------------------------- test_that("slice_helpers() call get_slice_size()", { @@ -221,6 +278,19 @@ test_that("slice_helpers do call slice() and benefit from dispatch (#6084)", { expect_warning(sample_frac(nf, .5), "noisy") }) +test_that("slice_helper `by` errors use correct error context and correct `by_arg`", { + df <- tibble(x = 1) + gdf <- group_by(df, x) + + expect_snapshot(error = TRUE, { + slice_head(gdf, n = 1, by = x) + slice_tail(gdf, n = 1, by = x) + slice_min(gdf, order_by = x, by = x) + slice_max(gdf, order_by = x, by = x) + slice_sample(gdf, n = 1, by = x) + }) +}) + # slice_min/slice_max ----------------------------------------------------- test_that("min and max return ties by default", { @@ -287,6 +357,13 @@ test_that("slice_min/max() can order by multiple variables (#6176)", { expect_equal(slice_max(df, tibble(x, y), n = 1)$id, 2) }) +test_that("slice_min/max() work with `by`", { + df <- tibble(g = c(2, 2, 1, 1), x = c(1, 2, 3, 1)) + + expect_identical(slice_min(df, x, by = g), df[c(4, 1),]) + expect_identical(slice_max(df, x, by = g), df[c(3, 2),]) +}) + test_that("slice_min/max() check size of `order_by=` (#5922)", { expect_snapshot(error = TRUE, { slice_min(data.frame(x = 1:10), 1:6) @@ -360,6 +437,11 @@ test_that("slice_sample() handles negative n= and prop= (#6402)", { expect_equal(nrow(slice_sample(df, prop = -2)), 0) }) +test_that("slice_sample() works with `by`", { + df <- tibble(g = c(2, 2, 2, 1), x = c(1, 2, 3, 1)) + expect_identical(slice_sample(df, n = 2, by = g)$g, c(1, 2, 2)) +}) + # slice_head/slice_tail --------------------------------------------------- test_that("slice_head/slice_tail keep positive values", { @@ -404,3 +486,9 @@ test_that("slice_head/slice_tail handle infinite n/prop", { expect_identical(slice_head(df, prop = -Inf), df[0, ]) expect_identical(slice_tail(df, prop = -Inf), df[0, ]) }) + +test_that("slice_head/slice_tail work with `by`", { + df <- tibble(g = c(2, 2, 2, 1), x = c(1, 2, 3, 1)) + expect_identical(slice_head(df, n = 2, by = g), df[c(4, 1, 2),]) + expect_identical(slice_tail(df, n = 2, by = g), df[c(4, 2, 3),]) +}) From 26e703de802230f3d4aafd1be265ac056f5f9d19 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 7 Nov 2022 13:32:45 -0500 Subject: [PATCH 05/20] Move the `.by` collision checks into the generics --- R/filter.R | 12 ++++++------ R/slice.R | 12 ++++++------ R/summarise.R | 14 +++++++------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/R/filter.R b/R/filter.R index 780b1fa504..f7d3518141 100644 --- a/R/filter.R +++ b/R/filter.R @@ -107,18 +107,18 @@ #' ) #' # Learn more in ?dplyr_data_masking filter <- function(.data, ..., .by = NULL, .preserve = FALSE) { - UseMethod("filter") -} - -#' @export -filter.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { by <- enquo(.by) if (!quo_is_null(by) && !is_false(.preserve)) { abort("Can't supply both `.by` and `.preserve`.") } - loc <- filter_rows(.data, ..., .by = !!by) + UseMethod("filter") +} + +#' @export +filter.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { + loc <- filter_rows(.data, ..., .by = {{ .by }}) dplyr_row_slice(.data, loc, preserve = .preserve) } diff --git a/R/slice.R b/R/slice.R index eb0dafc8da..de6a271a83 100644 --- a/R/slice.R +++ b/R/slice.R @@ -120,18 +120,18 @@ #' filter(mtcars, row_number() == n()) #' filter(mtcars, between(row_number(), 5, n())) slice <- function(.data, ..., .by = NULL, .preserve = FALSE) { - UseMethod("slice") -} - -#' @export -slice.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { by <- enquo(.by) if (!quo_is_null(by) && !is_false(.preserve)) { abort("Can't supply both `.by` and `.preserve`.") } - loc <- slice_rows(.data, ..., .by = !!by) + UseMethod("slice") +} + +#' @export +slice.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { + loc <- slice_rows(.data, ..., .by = {{ .by }}) dplyr_row_slice(.data, loc, preserve = .preserve) } diff --git a/R/summarise.R b/R/summarise.R index 0dee253cf7..536c029db6 100644 --- a/R/summarise.R +++ b/R/summarise.R @@ -118,6 +118,12 @@ #' summarise(starwars, avg = mean(.data[[var]], na.rm = TRUE)) #' # Learn more in ?dplyr_data_masking summarise <- function(.data, ..., .by = NULL, .groups = NULL) { + by <- enquo(.by) + + if (!quo_is_null(by) && !is.null(.groups)) { + abort("Can't supply both `.by` and `.groups`.") + } + UseMethod("summarise") } #' @rdname summarise @@ -126,13 +132,7 @@ summarize <- summarise #' @export summarise.data.frame <- function(.data, ..., .by = NULL, .groups = NULL) { - by <- enquo(.by) - - if (!quo_is_null(by) && !is.null(.groups)) { - abort("Can't supply both `.by` and `.groups`.") - } - - by <- compute_by(!!by, .data, by_arg = ".by", data_arg = ".data") + by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") cols <- summarise_cols(.data, dplyr_quosures(...), by) out <- summarise_build(by, cols) From 660dfc031afb9fdb2659c9ed443783372701184c Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 7 Nov 2022 17:48:22 -0500 Subject: [PATCH 06/20] Tweak `summarise_verbose()` to respect if the global option is `TRUE` This should override the `global_env()` reference check, so we can force verbosity in relevant documentation pages --- R/summarise.R | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/R/summarise.R b/R/summarise.R index 536c029db6..0c1fc52d7e 100644 --- a/R/summarise.R +++ b/R/summarise.R @@ -421,9 +421,19 @@ summarise_bullets <- function(cnd, ...) { # messaging --------------------------------------------------------------- summarise_verbose <- function(.groups, .env) { - is.null(.groups) && - is_reference(topenv(.env), global_env()) && - !identical(getOption("dplyr.summarise.inform"), FALSE) + if (!is.null(.groups)) { + # User supplied `.groups` + return(FALSE) + } + + inform <- getOption("dplyr.summarise.inform") + + if (is_true(inform) || is_false(inform)) { + # User supplied global option + return(inform) + } + + is_reference(topenv(.env), global_env()) } summarise_inform <- function(..., .env = parent.frame()) { From 874344114b1dc2f1cfeb4b392ab1b5c8baa2689a Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Mon, 7 Nov 2022 18:14:41 -0500 Subject: [PATCH 07/20] Add a full documentation page specific to `.by` --- R/by.R | 21 ++++-- man/args_by.Rd | 7 +- man/dplyr_by.Rd | 180 +++++++++++++++++++++++++++++++++++++++++++++++ man/filter.Rd | 7 +- man/mutate.Rd | 7 +- man/rmd/by.Rmd | 118 +++++++++++++++++++++++++++++++ man/slice.Rd | 7 +- man/summarise.Rd | 7 +- 8 files changed, 330 insertions(+), 24 deletions(-) create mode 100644 man/dplyr_by.Rd create mode 100644 man/rmd/by.Rmd diff --git a/R/by.R b/R/by.R index 2ed9d473a5..fbfe7b0240 100644 --- a/R/by.R +++ b/R/by.R @@ -4,15 +4,28 @@ #' #' @param .by `r lifecycle::badge("experimental")` #' -#' <[`tidy-select`][dplyr_tidy_select]> Optionally, select columns to group -#' by. This grouping will only be active for the duration of this verb. -#' -#' Can't be used when the data is already a grouped or rowwise data frame. +#' <[`tidy-select`][dplyr_tidy_select]> Optionally, a selection of columns to +#' temporarily group by using an inline alternative to [group_by()]. For +#' details and examples, see [?dplyr_by][dplyr_by]. #' #' @name args_by #' @keywords internal NULL +#' Grouping with `.by` +#' +#' ```{r, echo = FALSE, results = "asis"} +#' result <- rlang::with_options( +#' knitr::knit_child("man/rmd/by.Rmd"), +#' dplyr.summarise.inform = TRUE +#' ) +#' cat(result, sep = "\n") +#' ``` +#' +#' @name dplyr_by +#' @keywords internal +NULL + compute_by <- function(by, data, ..., diff --git a/man/args_by.Rd b/man/args_by.Rd index d6f6ee9687..cdb9aa9e9d 100644 --- a/man/args_by.Rd +++ b/man/args_by.Rd @@ -6,10 +6,9 @@ \arguments{ \item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} -<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, select columns to group -by. This grouping will only be active for the duration of this verb. - -Can't be used when the data is already a grouped or rowwise data frame.} +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to +temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For +details and examples, see \link[=dplyr_by]{?dplyr_by}.} } \description{ Use \verb{@inheritParams args_by} to consistently document \code{.by}. diff --git a/man/dplyr_by.Rd b/man/dplyr_by.Rd new file mode 100644 index 0000000000..9ebc641c9a --- /dev/null +++ b/man/dplyr_by.Rd @@ -0,0 +1,180 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/by.R +\name{dplyr_by} +\alias{dplyr_by} +\title{Grouping with \code{.by}} +\description{ +There are two ways to group in dplyr: +\itemize{ +\item Persistent grouping with \code{\link[=group_by]{group_by()}} +\item Temporary grouping with \code{.by} +} + +This help page is dedicated to explaining where and why you might want to use the latter. +The most important reason to use \code{.by} is that you \emph{never} have to remember to use \code{\link[=ungroup]{ungroup()}} after it. +The grouping is always temporary, meaning that if an ungrouped data frame goes in, then an ungrouped data frame comes out, regardless of the number of grouping columns that are specified. + +The following dplyr verbs support \code{.by}: +\itemize{ +\item \code{\link[=mutate]{mutate()}} +\item \code{\link[=summarise]{summarise()}} +\item \code{\link[=filter]{filter()}} +\item \code{\link[=slice]{slice()}} and its variants, such as \code{\link[=slice_head]{slice_head()}} +} + +Let's take a look at the two grouping approaches using this \code{expenses} data set, which tracks costs accumulated across various \code{id}s and \code{region}s: + +\if{html}{\out{
}}\preformatted{expenses <- tibble( + id = c(1, 2, 1, 3, 1, 2, 3), + region = c("A", "A", "A", "B", "B", "A", "A"), + cost = c(25, 20, 19, 12, 9, 6, 6) +) +expenses +#> # A tibble: 7 x 3 +#> id region cost +#> +#> 1 1 A 25 +#> 2 2 A 20 +#> 3 1 A 19 +#> 4 3 B 12 +#> 5 1 B 9 +#> 6 2 A 6 +#> 7 3 A 6 +}\if{html}{\out{
}} + +Imagine that you wanted to compute the average cost per region. +You'd probably write something like this: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + group_by(region) \%>\% + summarise(cost = mean(cost)) +#> # A tibble: 2 x 2 +#> region cost +#> +#> 1 A 15.2 +#> 2 B 10.5 +}\if{html}{\out{
}} + +As of dplyr 1.1.0, an additional option is available that lets you specify the grouping \emph{inline} within the verb: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + summarise(cost = mean(cost), .by = region) +#> # A tibble: 2 x 2 +#> region cost +#> +#> 1 A 15.2 +#> 2 B 10.5 +}\if{html}{\out{
}} + +This great idea comes from \href{https://CRAN.R-project.org/package=data.table}{data.table}, where the equivalent syntax looks something like \code{expenses[, .(cost = mean(cost)), by = region]}. + +Grouping with \code{.by} is temporary, meaning that since \code{expenses} was an ungrouped data frame, the result after applying \code{.by} will also always be an ungrouped data frame, full stop. +Compare that with \code{group_by() \%>\% summarise()}, where \code{summarise()} generally peels off 1 layer of grouping by default, with a message that it is doing so if there were originally more than one grouping columns: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + group_by(id, region) \%>\% + summarise(cost = mean(cost)) +#> `summarise()` has grouped output by 'id'. You can override using the `.groups` +#> argument. +#> # A tibble: 5 x 3 +#> # Groups: id [3] +#> id region cost +#> +#> 1 1 A 22 +#> 2 1 B 9 +#> 3 2 A 13 +#> 4 3 A 6 +#> 5 3 B 12 +}\if{html}{\out{
}} + +This behavior is sometimes useful to sequentially "roll up" a data frame that has been grouped by multiple columns, but often you just want to group temporarily. +Traditionally, you'd either \code{ungroup()} after the \code{summarise()} or explicitly set \code{.groups = "drop"} to achieve this: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + group_by(id, region) \%>\% + summarise(cost = mean(cost), .groups = "drop") +#> # A tibble: 5 x 3 +#> id region cost +#> +#> 1 1 A 22 +#> 2 1 B 9 +#> 3 2 A 13 +#> 4 3 A 6 +#> 5 3 B 12 +}\if{html}{\out{
}} + +Because \code{.by} grouping is temporary, you don't need to worry about ungrouping and it never needs to emit a message to remind you what is happening: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + summarise(cost = mean(cost), .by = c(id, region)) +#> # A tibble: 5 x 3 +#> id region cost +#> +#> 1 1 A 22 +#> 2 1 B 9 +#> 3 2 A 13 +#> 4 3 A 6 +#> 5 3 B 12 +}\if{html}{\out{
}} + +Note that we specified multiple columns to group by using the \link[=dplyr_tidy_select]{tidy-select} syntax \code{c(id, region)}. +If you have a character vector of column names you'd like to group by, you can do so with \code{.by = all_of(my_cols)}. + +To prevent surprising results, you can't use \code{.by} on an existing grouped data frame: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + group_by(id) \%>\% + summarise(cost = mean(cost), .by = c(id, region)) +#> Error in `summarise()`: +#> ! `.by` can't be supplied when `.data` is a grouped data frame. +}\if{html}{\out{
}} + +So far we've focused on the usage of \code{.by} with \code{summarise()}, but \code{.by} works with a number of other dplyr verbs. +For example, you could append the mean cost per region onto the original data frame as a new column rather than computing a summary: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + mutate(cost_by_region = mean(cost), .by = region) +#> # A tibble: 7 x 4 +#> id region cost cost_by_region +#> +#> 1 1 A 25 15.2 +#> 2 2 A 20 15.2 +#> 3 1 A 19 15.2 +#> 4 3 B 12 10.5 +#> 5 1 B 9 10.5 +#> 6 2 A 6 15.2 +#> 7 3 A 6 15.2 +}\if{html}{\out{
}} + +Or you could slice out the maximum cost per combination of id and region: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + slice_max(cost, n = 1, by = c(id, region)) +#> # A tibble: 5 x 3 +#> id region cost +#> +#> 1 1 A 25 +#> 2 1 B 9 +#> 3 2 A 20 +#> 4 3 A 6 +#> 5 3 B 12 +}\if{html}{\out{
}} + +Again, note that the result of this \code{slice_max()} is an ungrouped data frame, which is probably what you'd expect here. +Compare that to the \code{group_by()} approach, which uses persistent grouping and returns another grouped data frame: + +\if{html}{\out{
}}\preformatted{expenses \%>\% + group_by(id, region) \%>\% + slice_max(cost, n = 1) +#> # A tibble: 5 x 3 +#> # Groups: id, region [5] +#> id region cost +#> +#> 1 1 A 25 +#> 2 1 B 9 +#> 3 2 A 20 +#> 4 3 A 6 +#> 5 3 B 12 +}\if{html}{\out{
}} +} +\keyword{internal} diff --git a/man/filter.Rd b/man/filter.Rd index 1b845050b9..6bd7bf7469 100644 --- a/man/filter.Rd +++ b/man/filter.Rd @@ -18,10 +18,9 @@ Only rows for which all conditions evaluate to \code{TRUE} are kept.} \item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} -<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, select columns to group -by. This grouping will only be active for the duration of this verb. - -Can't be used when the data is already a grouped or rowwise data frame.} +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to +temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For +details and examples, see \link[=dplyr_by]{?dplyr_by}.} \item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure diff --git a/man/mutate.Rd b/man/mutate.Rd index a1410b498a..666446c3a8 100644 --- a/man/mutate.Rd +++ b/man/mutate.Rd @@ -35,10 +35,9 @@ if ungrouped). \item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} -<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, select columns to group -by. This grouping will only be active for the duration of this verb. - -Can't be used when the data is already a grouped or rowwise data frame.} +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to +temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For +details and examples, see \link[=dplyr_by]{?dplyr_by}.} \item{.keep}{Control which columns from \code{.data} are retained in the output. Grouping columns and columns created by \code{...} are always kept. diff --git a/man/rmd/by.Rmd b/man/rmd/by.Rmd new file mode 100644 index 0000000000..699e4082ee --- /dev/null +++ b/man/rmd/by.Rmd @@ -0,0 +1,118 @@ +--- +output: html_document +editor_options: + chunk_output_type: console +--- + +```{r setup, include=FALSE} +knitr::opts_chunk$set(echo = TRUE) +``` + +There are two ways to group in dplyr: + +- Persistent grouping with [group_by()] + +- Temporary grouping with `.by` + +This help page is dedicated to explaining where and why you might want to use the latter. +The most important reason to use `.by` is that you *never* have to remember to use [ungroup()] after it. +The grouping is always temporary, meaning that if an ungrouped data frame goes in, then an ungrouped data frame comes out, regardless of the number of grouping columns that are specified. + +The following dplyr verbs support `.by`: + +- [mutate()] + +- [summarise()] + +- [filter()] + +- [slice()] and its variants, such as [slice_head()] + +Let's take a look at the two grouping approaches using this `expenses` data set, which tracks costs accumulated across various `id`s and `region`s: + +```{r} +expenses <- tibble( + id = c(1, 2, 1, 3, 1, 2, 3), + region = c("A", "A", "A", "B", "B", "A", "A"), + cost = c(25, 20, 19, 12, 9, 6, 6) +) +expenses +``` + +Imagine that you wanted to compute the average cost per region. +You'd probably write something like this: + +```{r} +expenses %>% + group_by(region) %>% + summarise(cost = mean(cost)) +``` + +As of dplyr 1.1.0, an additional option is available that lets you specify the grouping *inline* within the verb: + +```{r} +expenses %>% + summarise(cost = mean(cost), .by = region) +``` + +This great idea comes from [data.table](https://CRAN.R-project.org/package=data.table), where the equivalent syntax looks something like `expenses[, .(cost = mean(cost)), by = region]`. + +Grouping with `.by` is temporary, meaning that since `expenses` was an ungrouped data frame, the result after applying `.by` will also always be an ungrouped data frame, full stop. +Compare that with `group_by() %>% summarise()`, where `summarise()` generally peels off 1 layer of grouping by default, with a message that it is doing so if there were originally more than one grouping columns: + +```{r} +expenses %>% + group_by(id, region) %>% + summarise(cost = mean(cost)) +``` + +This behavior is sometimes useful to sequentially "roll up" a data frame that has been grouped by multiple columns, but often you just want to group temporarily. +Traditionally, you'd either `ungroup()` after the `summarise()` or explicitly set `.groups = "drop"` to achieve this: + +```{r} +expenses %>% + group_by(id, region) %>% + summarise(cost = mean(cost), .groups = "drop") +``` + +Because `.by` grouping is temporary, you don't need to worry about ungrouping and it never needs to emit a message to remind you what is happening: + +```{r} +expenses %>% + summarise(cost = mean(cost), .by = c(id, region)) +``` + +Note that we specified multiple columns to group by using the [tidy-select][dplyr_tidy_select] syntax `c(id, region)`. +If you have a character vector of column names you'd like to group by, you can do so with `.by = all_of(my_cols)`. + +To prevent surprising results, you can't use `.by` on an existing grouped data frame: + +```{r, error=TRUE} +expenses %>% + group_by(id) %>% + summarise(cost = mean(cost), .by = c(id, region)) +``` + +So far we've focused on the usage of `.by` with `summarise()`, but `.by` works with a number of other dplyr verbs. +For example, you could append the mean cost per region onto the original data frame as a new column rather than computing a summary: + +```{r} +expenses %>% + mutate(cost_by_region = mean(cost), .by = region) +``` + +Or you could slice out the maximum cost per combination of id and region: + +```{r} +expenses %>% + slice_max(cost, n = 1, by = c(id, region)) +``` + +Again, note that the result of this `slice_max()` is an ungrouped data frame, which is probably what you'd expect here. +Compare that to the `group_by()` approach, which uses persistent grouping and returns another grouped data frame: + +```{r} +expenses %>% + group_by(id, region) %>% + slice_max(cost, n = 1) +``` diff --git a/man/slice.Rd b/man/slice.Rd index 28c24eaac0..cfb89c59de 100644 --- a/man/slice.Rd +++ b/man/slice.Rd @@ -55,10 +55,9 @@ For \verb{slice_*()}, these arguments are passed on to methods.} \item{.by, by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} -<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, select columns to group -by. This grouping will only be active for the duration of this verb. - -Can't be used when the data is already a grouped or rowwise data frame.} +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to +temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For +details and examples, see \link[=dplyr_by]{?dplyr_by}.} \item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure diff --git a/man/summarise.Rd b/man/summarise.Rd index 27fe68e537..7c2055fb20 100644 --- a/man/summarise.Rd +++ b/man/summarise.Rd @@ -26,10 +26,9 @@ The value can be: \item{.by}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} -<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, select columns to group -by. This grouping will only be active for the duration of this verb. - -Can't be used when the data is already a grouped or rowwise data frame.} +<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to +temporarily group by using an inline alternative to \code{\link[=group_by]{group_by()}}. For +details and examples, see \link[=dplyr_by]{?dplyr_by}.} \item{.groups}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} Grouping structure of the result. \itemize{ From a56c43924bf9676c1dbdd4ee940c34f0eb18ab49 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 8 Nov 2022 09:23:06 -0500 Subject: [PATCH 08/20] Add section about verbs without `.by` support --- man/dplyr_by.Rd | 15 +++++++++++++++ man/rmd/by.Rmd | 16 ++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/man/dplyr_by.Rd b/man/dplyr_by.Rd index 9ebc641c9a..55f116ded9 100644 --- a/man/dplyr_by.Rd +++ b/man/dplyr_by.Rd @@ -176,5 +176,20 @@ Compare that to the \code{group_by()} approach, which uses persistent grouping a #> 4 3 A 6 #> 5 3 B 12 }\if{html}{\out{}} +\subsection{Verbs without \code{.by} support}{ + +If a dplyr verb doesn't support \code{.by}, then that typically means that the verb isn't inherently affected by grouping. +For example, \code{\link[=pull]{pull()}} and \code{\link[=rename]{rename()}} don't support \code{.by}, because specifying columns to group by would not affect their implementations. + +That said, there are a few exceptions to this where sometimes a dplyr verb doesn't support \code{.by}, but \emph{does} have special support for grouped data frames created by \code{\link[=group_by]{group_by()}}. +This is typically because the verbs are required to retain the grouping columns, for example: +\itemize{ +\item \code{\link[=select]{select()}} always retains grouping columns, with a message if any aren't specified in the \code{select()} call. +\item \code{\link[=distinct]{distinct()}} and \code{\link[=count]{count()}} place unspecified grouping columns at the front of the data frame before computing their results. +\item \code{\link[=arrange]{arrange()}} has a \code{.by_group} argument to optionally order by grouping columns first. +} + +If \code{group_by()} didn't exist, then these verbs would not have special support for grouped data frames. +} } \keyword{internal} diff --git a/man/rmd/by.Rmd b/man/rmd/by.Rmd index 699e4082ee..3380b27fa7 100644 --- a/man/rmd/by.Rmd +++ b/man/rmd/by.Rmd @@ -116,3 +116,19 @@ expenses %>% group_by(id, region) %>% slice_max(cost, n = 1) ``` + +### Verbs without `.by` support + +If a dplyr verb doesn't support `.by`, then that typically means that the verb isn't inherently affected by grouping. +For example, [pull()] and [rename()] don't support `.by`, because specifying columns to group by would not affect their implementations. + +That said, there are a few exceptions to this where sometimes a dplyr verb doesn't support `.by`, but *does* have special support for grouped data frames created by [group_by()]. +This is typically because the verbs are required to retain the grouping columns, for example: + +- [select()] always retains grouping columns, with a message if any aren't specified in the `select()` call. + +- [distinct()] and [count()] place unspecified grouping columns at the front of the data frame before computing their results. + +- [arrange()] has a `.by_group` argument to optionally order by grouping columns first. + +If `group_by()` didn't exist, then these verbs would not have special support for grouped data frames. From bd4726046ac1c87e344a3be4fc37e95575c2f901 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Tue, 8 Nov 2022 11:34:56 -0500 Subject: [PATCH 09/20] NEWS bullet --- NEWS.md | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/NEWS.md b/NEWS.md index 001ec4e5e1..1a657a8c16 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,43 @@ # dplyr (development version) +* `.by` is a new experimental inline alternative to `group_by()` that supports + _temporary_ grouping in the following key dplyr verbs: `mutate()`, + `summarise()`, `filter()`, and the `slice()` family (#6528). + + Rather than: + + ``` + starwars %>% + group_by(species, homeworld) %>% + summarise(mean_height = mean(height)) + ``` + + You can now write: + + ``` + starwars %>% + summarise(mean_height = mean(height), .by = c(species, homeworld)) + ``` + + The most useful reason to do this is because grouping with `.by` is + _temporary_ and only affects the verb it is being applied to. An ungrouped + data frame went into the `summarise()` call, so an ungrouped data frame will + come out. With `.by`, you never need to remember to `ungroup()` afterwards. + + Compare that with the `group_by()` example, where one layer of grouping would + be peeled off and the output of `summarise()` would be another grouped data + frame with a message informing you of how `summarise()` had re-grouped the + result. This typically requires either the usage of `.groups` to silence the + message or an explicit `ungroup()` afterwards. + + This exciting feature was inspired by + [data.table](https://CRAN.R-project.org/package=data.table), where the + equivalent syntax looks like: + + ``` + starwars[, .(mean_height = mean(height)), by = .(species, homeworld)] + ``` + * `summarise()` now correctly recycles named 0-column data frames (#6509). * `.cols` and `.fns` are now required arguments in `across()`, `c_across()`, From c5449d83b6c5826d829cf0493350a3390fc5eb8e Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Wed, 16 Nov 2022 16:22:39 -0500 Subject: [PATCH 10/20] Order groups by first appearance when using `.by` --- R/by.R | 15 ++++++- man/dplyr_by.Rd | 73 +++++++++++++++++++++++++++++---- man/rmd/by.Rmd | 33 +++++++++++++++ tests/testthat/test-by.R | 24 +++++++++-- tests/testthat/test-slice.R | 19 ++++++--- tests/testthat/test-summarise.R | 9 ++++ 6 files changed, 156 insertions(+), 17 deletions(-) diff --git a/R/by.R b/R/by.R index fbfe7b0240..4363fe742e 100644 --- a/R/by.R +++ b/R/by.R @@ -58,13 +58,26 @@ compute_by <- function(by, } else { type <- "grouped" names <- by - data <- compute_groups(data, by, drop = TRUE) + data <- compute_by_groups(data, by, error_call = error_call) } } new_by(type = type, names = names, data = data) } +compute_by_groups <- function(data, names, error_call = caller_env()) { + data <- dplyr_col_select(data, names, error_call = error_call) + info <- vec_group_loc(data) + + size <- vec_size(info) + + out <- dplyr_new_list(info$key) + out[[".rows"]] <- new_list_of(info$loc, ptype = integer()) + out <- new_tibble(out, nrow = size) + + out +} + check_by <- function(by, data, ..., diff --git a/man/dplyr_by.Rd b/man/dplyr_by.Rd index 55f116ded9..e60332a1a3 100644 --- a/man/dplyr_by.Rd +++ b/man/dplyr_by.Rd @@ -111,10 +111,10 @@ Because \code{.by} grouping is temporary, you don't need to worry about ungroupi #> id region cost #> #> 1 1 A 22 -#> 2 1 B 9 -#> 3 2 A 13 -#> 4 3 A 6 -#> 5 3 B 12 +#> 2 2 A 13 +#> 3 3 B 12 +#> 4 1 B 9 +#> 5 3 A 6 }\if{html}{\out{}} Note that we specified multiple columns to group by using the \link[=dplyr_tidy_select]{tidy-select} syntax \code{c(id, region)}. @@ -154,10 +154,10 @@ Or you could slice out the maximum cost per combination of id and region: #> id region cost #> #> 1 1 A 25 -#> 2 1 B 9 -#> 3 2 A 20 -#> 4 3 A 6 -#> 5 3 B 12 +#> 2 2 A 20 +#> 3 3 B 12 +#> 4 1 B 9 +#> 5 3 A 6 }\if{html}{\out{}} Again, note that the result of this \code{slice_max()} is an ungrouped data frame, which is probably what you'd expect here. @@ -176,6 +176,63 @@ Compare that to the \code{group_by()} approach, which uses persistent grouping a #> 4 3 A 6 #> 5 3 B 12 }\if{html}{\out{}} +\subsection{Result ordering}{ + +When used with \code{.by}, \code{summarise()} and \code{slice()} both return results where the group keys are ordered by their existing ordering in the original data rather than by their values. +This is different from \code{group_by()}, which has always sorted the keys in ascending order. + +\if{html}{\out{
}}\preformatted{df <- tibble( + month = c("jan", "jan", "feb", "feb", "mar"), + temp = c(20, 25, 18, 20, 40) +) + +# Uses ordering by "first appearance" in the original data +df \%>\% + summarise(average_temp = mean(temp), .by = month) +#> # A tibble: 3 x 2 +#> month average_temp +#> +#> 1 jan 22.5 +#> 2 feb 19 +#> 3 mar 40 + +# Sorts in ascending order +df \%>\% + group_by(month) \%>\% + summarise(average_temp = mean(temp)) +#> # A tibble: 3 x 2 +#> month average_temp +#> +#> 1 feb 19 +#> 2 jan 22.5 +#> 3 mar 40 + +# Same with `slice()` +df \%>\% + slice_max(temp, by = month) +#> # A tibble: 3 x 2 +#> month temp +#> +#> 1 jan 25 +#> 2 feb 20 +#> 3 mar 40 + +df \%>\% + group_by(month) \%>\% + slice_max(temp) \%>\% + ungroup() +#> # A tibble: 3 x 2 +#> month temp +#> +#> 1 feb 20 +#> 2 jan 25 +#> 3 mar 40 +}\if{html}{\out{
}} + +If you need sorted group keys, we recommend that you explicitly use \code{\link[=arrange]{arrange()}} either before or after the call to \code{summarise()} or \code{slice()}. +This also gives you full access to all of \code{arrange()}'s features, such as \code{desc()} and the \code{.locale} argument. +} + \subsection{Verbs without \code{.by} support}{ If a dplyr verb doesn't support \code{.by}, then that typically means that the verb isn't inherently affected by grouping. diff --git a/man/rmd/by.Rmd b/man/rmd/by.Rmd index 3380b27fa7..1a43879be6 100644 --- a/man/rmd/by.Rmd +++ b/man/rmd/by.Rmd @@ -117,6 +117,39 @@ expenses %>% slice_max(cost, n = 1) ``` +### Result ordering + +When used with `.by`, `summarise()` and `slice()` both return results where the group keys are ordered by their existing ordering in the original data rather than by their values. +This is different from `group_by()`, which has always sorted the keys in ascending order. + +```{r} +df <- tibble( + month = c("jan", "jan", "feb", "feb", "mar"), + temp = c(20, 25, 18, 20, 40) +) + +# Uses ordering by "first appearance" in the original data +df %>% + summarise(average_temp = mean(temp), .by = month) + +# Sorts in ascending order +df %>% + group_by(month) %>% + summarise(average_temp = mean(temp)) + +# Same with `slice()` +df %>% + slice_max(temp, by = month) + +df %>% + group_by(month) %>% + slice_max(temp) %>% + ungroup() +``` + +If you need sorted group keys, we recommend that you explicitly use [arrange()] either before or after the call to `summarise()` or `slice()`. +This also gives you full access to all of `arrange()`'s features, such as `desc()` and the `.locale` argument. + ### Verbs without `.by` support If a dplyr verb doesn't support `.by`, then that typically means that the verb isn't inherently affected by grouping. diff --git a/tests/testthat/test-by.R b/tests/testthat/test-by.R index 8e9b4ca9f5..bbcf660209 100644 --- a/tests/testthat/test-by.R +++ b/tests/testthat/test-by.R @@ -4,10 +4,28 @@ test_that("computes group data when `by` is set", { out <- compute_by(by = x, data = df) expect_identical(out$type, "grouped") expect_identical(out$names, "x") - expect_identical(out$data, group_data(group_by(df, x))) - # In particular, sets `drop = TRUE` - expect_true(attr(out$data, ".drop")) + expect_identical( + out$data, + tibble(x = c(1, 2), ".rows" := list_of(c(1L, 2L, 5L), c(3L, 4L))) + ) +}) + +test_that("computes `by` group data in order of appearance", { + df <- tibble( + x = c(5, 4, 5, 5), + y = c(2, 3, 1, 2) + ) + + out <- compute_by(by = c(x, y), data = df) + + expect <- tibble( + x = c(5, 4, 5), + y = c(2, 3, 1), + ".rows" := list_of(c(1L, 4L), 2L, 3L) + ) + + expect_identical(out$data, expect) }) test_that("extracts existing data when `by = NULL`", { diff --git a/tests/testthat/test-slice.R b/tests/testthat/test-slice.R index d3ad543c87..95ba3350d7 100644 --- a/tests/testthat/test-slice.R +++ b/tests/testthat/test-slice.R @@ -122,6 +122,15 @@ test_that("transient grouping retains data frame attributes", { expect_identical(attr(out, "foo"), "bar") }) +test_that("transient grouping orders by first appearance", { + df <- tibble(g = c(2, 1, 2, 0), x = c(4, 2, 8, 5)) + + out <- slice(df, which(x == max(x)), .by = g) + + expect_identical(out$g, c(2, 1, 0)) + expect_identical(out$x, c(8, 2, 5)) +}) + test_that("can't use `.by` with `.preserve`", { df <- tibble(x = 1) @@ -360,8 +369,8 @@ test_that("slice_min/max() can order by multiple variables (#6176)", { test_that("slice_min/max() work with `by`", { df <- tibble(g = c(2, 2, 1, 1), x = c(1, 2, 3, 1)) - expect_identical(slice_min(df, x, by = g), df[c(4, 1),]) - expect_identical(slice_max(df, x, by = g), df[c(3, 2),]) + expect_identical(slice_min(df, x, by = g), df[c(1, 4),]) + expect_identical(slice_max(df, x, by = g), df[c(2, 3),]) }) test_that("slice_min/max() check size of `order_by=` (#5922)", { @@ -439,7 +448,7 @@ test_that("slice_sample() handles negative n= and prop= (#6402)", { test_that("slice_sample() works with `by`", { df <- tibble(g = c(2, 2, 2, 1), x = c(1, 2, 3, 1)) - expect_identical(slice_sample(df, n = 2, by = g)$g, c(1, 2, 2)) + expect_identical(slice_sample(df, n = 2, by = g)$g, c(2, 2, 1)) }) # slice_head/slice_tail --------------------------------------------------- @@ -489,6 +498,6 @@ test_that("slice_head/slice_tail handle infinite n/prop", { test_that("slice_head/slice_tail work with `by`", { df <- tibble(g = c(2, 2, 2, 1), x = c(1, 2, 3, 1)) - expect_identical(slice_head(df, n = 2, by = g), df[c(4, 1, 2),]) - expect_identical(slice_tail(df, n = 2, by = g), df[c(4, 2, 3),]) + expect_identical(slice_head(df, n = 2, by = g), df[c(1, 2, 4),]) + expect_identical(slice_tail(df, n = 2, by = g), df[c(2, 3, 4),]) }) diff --git a/tests/testthat/test-summarise.R b/tests/testthat/test-summarise.R index b1c5fa1e0f..31f85f3f47 100644 --- a/tests/testthat/test-summarise.R +++ b/tests/testthat/test-summarise.R @@ -292,6 +292,15 @@ test_that("transient grouping drops data frame attributes", { expect_null(attr(out, "foo")) }) +test_that("transient grouping orders by first appearance", { + df <- tibble(g = c(2, 1, 2, 0), x = c(4, 2, 8, 5)) + + out <- summarise(df, x = mean(x), .by = g) + + expect_identical(out$g, c(2, 1, 0)) + expect_identical(out$x, c(6, 2, 5)) +}) + test_that("can't use `.by` with `.groups`", { df <- tibble(x = 1) From d581f6846e53d941f928653248464768ba3ce3c4 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Wed, 16 Nov 2022 16:45:28 -0500 Subject: [PATCH 11/20] NEWS bullet updates --- NEWS.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/NEWS.md b/NEWS.md index 1a657a8c16..60e40ccf9a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -30,6 +30,13 @@ result. This typically requires either the usage of `.groups` to silence the message or an explicit `ungroup()` afterwards. + Additionally, using `summarise()` or `slice()` with `.by` will never sort the + results by the group key, unlike with `group_by()`. Instead, the results are + returned using the existing ordering of the groups from the original data. We + feel this is more predictable, better maintains any ordering you might have + already applied with a previous call to `arrange()`, and provides a way to + maintain the current ordering without having to resort to factors. + This exciting feature was inspired by [data.table](https://CRAN.R-project.org/package=data.table), where the equivalent syntax looks like: From 244caa3a43613efc946619a2e5165c92eaf16ecf Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Thu, 17 Nov 2022 15:36:07 -0500 Subject: [PATCH 12/20] We have decided that the `NULL` column case is too obscure to care about --- tests/testthat/test-mutate.R | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/testthat/test-mutate.R b/tests/testthat/test-mutate.R index 0f21f4cbd0..baed61ef78 100644 --- a/tests/testthat/test-mutate.R +++ b/tests/testthat/test-mutate.R @@ -80,10 +80,8 @@ test_that("assignments don't overwrite variables (#315)", { expect_equal(out, tibble(x = 1, y = 2, z = 10)) }) -test_that("can mutate a data frame with zero columns and `NULL` column names", { - skip("Do we really care about this gremlin case? This data frame is unusable.") +test_that("can mutate a data frame with zero columns", { df <- new_data_frame(n = 2L) - colnames(df) <- NULL expect_equal(mutate(df, x = 1), data.frame(x = c(1, 1))) }) From be5f9771f6262f8c790ba20ec3e88be0c51b82a0 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Thu, 17 Nov 2022 15:59:53 -0500 Subject: [PATCH 13/20] NEWS tweaks based on feedback --- NEWS.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/NEWS.md b/NEWS.md index 60e40ccf9a..b07057f90c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -16,19 +16,16 @@ ``` starwars %>% - summarise(mean_height = mean(height), .by = c(species, homeworld)) + summarise( + mean_height = mean(height), + .by = c(species, homeworld) + ) ``` The most useful reason to do this is because grouping with `.by` is _temporary_ and only affects the verb it is being applied to. An ungrouped data frame went into the `summarise()` call, so an ungrouped data frame will - come out. With `.by`, you never need to remember to `ungroup()` afterwards. - - Compare that with the `group_by()` example, where one layer of grouping would - be peeled off and the output of `summarise()` would be another grouped data - frame with a message informing you of how `summarise()` had re-grouped the - result. This typically requires either the usage of `.groups` to silence the - message or an explicit `ungroup()` afterwards. + come out; with `.by`, you never need to remember to `ungroup()` afterwards. Additionally, using `summarise()` or `slice()` with `.by` will never sort the results by the group key, unlike with `group_by()`. Instead, the results are From 0985ddf0e717b0f33fcfe629cd7693745eb87b81 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Thu, 17 Nov 2022 16:50:17 -0500 Subject: [PATCH 14/20] Second pass on `.by` help page based on feedback --- R/by.R | 2 +- man/dplyr_by.Rd | 109 ++++++++++++++++-------------------------------- man/rmd/by.Rmd | 66 ++++++++++++----------------- 3 files changed, 62 insertions(+), 115 deletions(-) diff --git a/R/by.R b/R/by.R index 4363fe742e..09132542c5 100644 --- a/R/by.R +++ b/R/by.R @@ -12,7 +12,7 @@ #' @keywords internal NULL -#' Grouping with `.by` +#' Temporary grouping with `.by` #' #' ```{r, echo = FALSE, results = "asis"} #' result <- rlang::with_options( diff --git a/man/dplyr_by.Rd b/man/dplyr_by.Rd index e60332a1a3..415d684691 100644 --- a/man/dplyr_by.Rd +++ b/man/dplyr_by.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/by.R \name{dplyr_by} \alias{dplyr_by} -\title{Grouping with \code{.by}} +\title{Temporary grouping with \code{.by}} \description{ There are two ways to group in dplyr: \itemize{ @@ -11,16 +11,29 @@ There are two ways to group in dplyr: } This help page is dedicated to explaining where and why you might want to use the latter. -The most important reason to use \code{.by} is that you \emph{never} have to remember to use \code{\link[=ungroup]{ungroup()}} after it. -The grouping is always temporary, meaning that if an ungrouped data frame goes in, then an ungrouped data frame comes out, regardless of the number of grouping columns that are specified. +Grouping radically affects the computation of the dplyr verb you use it with, and one of the goals of \code{.by} is to allow you to place that grouping specification alongside the code that actually uses it. +As an added benefit, with \code{.by} you no longer need to remember to \code{\link[=ungroup]{ungroup()}} after \code{\link[=summarise]{summarise()}}, and \code{summarise()} won't ever message you about how it's handling the groups! -The following dplyr verbs support \code{.by}: +This great idea comes from \href{https://CRAN.R-project.org/package=data.table}{data.table}, which allows you to specify \code{by} alongside modifications in \code{j}, like: \code{dt[, .(x = mean(x)), by = g]}. +\subsection{Supported verbs}{ \itemize{ \item \code{\link[=mutate]{mutate()}} \item \code{\link[=summarise]{summarise()}} \item \code{\link[=filter]{filter()}} \item \code{\link[=slice]{slice()}} and its variants, such as \code{\link[=slice_head]{slice_head()}} } +} + +\subsection{Differences between \code{.by} and \code{group_by()}}{\tabular{ll}{ + \code{.by} \tab \code{group_by()} \cr + Grouping only affects a single verb \tab Grouping is persistent across multiple verbs \cr + Selects variables with \link[=dplyr_tidy_select]{tidy-select} \tab Computes expressions with \link[=dplyr_data_masking]{data-masking} \cr + Summaries use existing order of group keys \tab Summaries sort group keys in ascending order \cr +} + +} + +\subsection{Using \code{.by}}{ Let's take a look at the two grouping approaches using this \code{expenses} data set, which tracks costs accumulated across various \code{id}s and \code{region}s: @@ -55,7 +68,7 @@ You'd probably write something like this: #> 2 B 10.5 }\if{html}{\out{}} -As of dplyr 1.1.0, an additional option is available that lets you specify the grouping \emph{inline} within the verb: +Instead, you can now specify the grouping \emph{inline} within the verb: \if{html}{\out{
}}\preformatted{expenses \%>\% summarise(cost = mean(cost), .by = region) @@ -66,34 +79,29 @@ As of dplyr 1.1.0, an additional option is available that lets you specify the g #> 2 B 10.5 }\if{html}{\out{
}} -This great idea comes from \href{https://CRAN.R-project.org/package=data.table}{data.table}, where the equivalent syntax looks something like \code{expenses[, .(cost = mean(cost)), by = region]}. - -Grouping with \code{.by} is temporary, meaning that since \code{expenses} was an ungrouped data frame, the result after applying \code{.by} will also always be an ungrouped data frame, full stop. -Compare that with \code{group_by() \%>\% summarise()}, where \code{summarise()} generally peels off 1 layer of grouping by default, with a message that it is doing so if there were originally more than one grouping columns: +Grouping with \code{.by} is temporary, meaning that since \code{expenses} was an ungrouped data frame, the result after applying \code{.by} will also always be an ungrouped data frame, regardless of the number of grouping columns. \if{html}{\out{
}}\preformatted{expenses \%>\% - group_by(id, region) \%>\% - summarise(cost = mean(cost)) -#> `summarise()` has grouped output by 'id'. You can override using the `.groups` -#> argument. + summarise(cost = mean(cost), .by = c(id, region)) #> # A tibble: 5 x 3 -#> # Groups: id [3] #> id region cost #> #> 1 1 A 22 -#> 2 1 B 9 -#> 3 2 A 13 -#> 4 3 A 6 -#> 5 3 B 12 +#> 2 2 A 13 +#> 3 3 B 12 +#> 4 1 B 9 +#> 5 3 A 6 }\if{html}{\out{
}} -This behavior is sometimes useful to sequentially "roll up" a data frame that has been grouped by multiple columns, but often you just want to group temporarily. -Traditionally, you'd either \code{ungroup()} after the \code{summarise()} or explicitly set \code{.groups = "drop"} to achieve this: +Compare that with \code{group_by() \%>\% summarise()}, where \code{summarise()} generally peels off 1 layer of grouping by default, typically with a message that it is doing so: \if{html}{\out{
}}\preformatted{expenses \%>\% group_by(id, region) \%>\% - summarise(cost = mean(cost), .groups = "drop") + summarise(cost = mean(cost)) +#> `summarise()` has grouped output by 'id'. You can override using the `.groups` +#> argument. #> # A tibble: 5 x 3 +#> # Groups: id [3] #> id region cost #> #> 1 1 A 22 @@ -103,22 +111,11 @@ Traditionally, you'd either \code{ungroup()} after the \code{summarise()} or exp #> 5 3 B 12 }\if{html}{\out{
}} -Because \code{.by} grouping is temporary, you don't need to worry about ungrouping and it never needs to emit a message to remind you what is happening: - -\if{html}{\out{
}}\preformatted{expenses \%>\% - summarise(cost = mean(cost), .by = c(id, region)) -#> # A tibble: 5 x 3 -#> id region cost -#> -#> 1 1 A 22 -#> 2 2 A 13 -#> 3 3 B 12 -#> 4 1 B 9 -#> 5 3 A 6 -}\if{html}{\out{
}} +Because \code{.by} grouping is temporary, you don't need to worry about ungrouping, and it never needs to emit a message to remind you what it is doing with the groups. -Note that we specified multiple columns to group by using the \link[=dplyr_tidy_select]{tidy-select} syntax \code{c(id, region)}. +Note that with \code{.by} we specified multiple columns to group by using the \link[=dplyr_tidy_select]{tidy-select} syntax \code{c(id, region)}. If you have a character vector of column names you'd like to group by, you can do so with \code{.by = all_of(my_cols)}. +It will group by the columns in the order they were provided. To prevent surprising results, you can't use \code{.by} on an existing grouped data frame: @@ -159,27 +156,12 @@ Or you could slice out the maximum cost per combination of id and region: #> 4 1 B 9 #> 5 3 A 6 }\if{html}{\out{}} +} -Again, note that the result of this \code{slice_max()} is an ungrouped data frame, which is probably what you'd expect here. -Compare that to the \code{group_by()} approach, which uses persistent grouping and returns another grouped data frame: - -\if{html}{\out{
}}\preformatted{expenses \%>\% - group_by(id, region) \%>\% - slice_max(cost, n = 1) -#> # A tibble: 5 x 3 -#> # Groups: id, region [5] -#> id region cost -#> -#> 1 1 A 25 -#> 2 1 B 9 -#> 3 2 A 20 -#> 4 3 A 6 -#> 5 3 B 12 -}\if{html}{\out{
}} \subsection{Result ordering}{ -When used with \code{.by}, \code{summarise()} and \code{slice()} both return results where the group keys are ordered by their existing ordering in the original data rather than by their values. -This is different from \code{group_by()}, which has always sorted the keys in ascending order. +When used with \code{.by}, \code{summarise()} and \code{slice()} both maintain the ordering of the existing data. +This is different from \code{group_by()}, which has always sorted the group keys in ascending order. \if{html}{\out{
}}\preformatted{df <- tibble( month = c("jan", "jan", "feb", "feb", "mar"), @@ -206,27 +188,6 @@ df \%>\% #> 1 feb 19 #> 2 jan 22.5 #> 3 mar 40 - -# Same with `slice()` -df \%>\% - slice_max(temp, by = month) -#> # A tibble: 3 x 2 -#> month temp -#> -#> 1 jan 25 -#> 2 feb 20 -#> 3 mar 40 - -df \%>\% - group_by(month) \%>\% - slice_max(temp) \%>\% - ungroup() -#> # A tibble: 3 x 2 -#> month temp -#> -#> 1 feb 20 -#> 2 jan 25 -#> 3 mar 40 }\if{html}{\out{
}} If you need sorted group keys, we recommend that you explicitly use \code{\link[=arrange]{arrange()}} either before or after the call to \code{summarise()} or \code{slice()}. diff --git a/man/rmd/by.Rmd b/man/rmd/by.Rmd index 1a43879be6..65756e8685 100644 --- a/man/rmd/by.Rmd +++ b/man/rmd/by.Rmd @@ -15,10 +15,12 @@ There are two ways to group in dplyr: - Temporary grouping with `.by` This help page is dedicated to explaining where and why you might want to use the latter. -The most important reason to use `.by` is that you *never* have to remember to use [ungroup()] after it. -The grouping is always temporary, meaning that if an ungrouped data frame goes in, then an ungrouped data frame comes out, regardless of the number of grouping columns that are specified. +Grouping radically affects the computation of the dplyr verb you use it with, and one of the goals of `.by` is to allow you to place that grouping specification alongside the code that actually uses it. +As an added benefit, with `.by` you no longer need to remember to [ungroup()] after [summarise()], and `summarise()` won't ever message you about how it's handling the groups! -The following dplyr verbs support `.by`: +This great idea comes from [data.table](https://CRAN.R-project.org/package=data.table), which allows you to specify `by` alongside modifications in `j`, like: `dt[, .(x = mean(x)), by = g]`. + +### Supported verbs - [mutate()] @@ -28,6 +30,16 @@ The following dplyr verbs support `.by`: - [slice()] and its variants, such as [slice_head()] +### Differences between `.by` and `group_by()` + +| `.by` | `group_by()` | +|----------------------------------|-------------------------------------| +| Grouping only affects a single verb | Grouping is persistent across multiple verbs | +| Selects variables with [tidy-select][dplyr_tidy_select] | Computes expressions with [data-masking][dplyr_data_masking] | +| Summaries use existing order of group keys | Summaries sort group keys in ascending order | + +### Using `.by` + Let's take a look at the two grouping approaches using this `expenses` data set, which tracks costs accumulated across various `id`s and `region`s: ```{r} @@ -48,42 +60,33 @@ expenses %>% summarise(cost = mean(cost)) ``` -As of dplyr 1.1.0, an additional option is available that lets you specify the grouping *inline* within the verb: +Instead, you can now specify the grouping *inline* within the verb: ```{r} expenses %>% summarise(cost = mean(cost), .by = region) ``` -This great idea comes from [data.table](https://CRAN.R-project.org/package=data.table), where the equivalent syntax looks something like `expenses[, .(cost = mean(cost)), by = region]`. - -Grouping with `.by` is temporary, meaning that since `expenses` was an ungrouped data frame, the result after applying `.by` will also always be an ungrouped data frame, full stop. -Compare that with `group_by() %>% summarise()`, where `summarise()` generally peels off 1 layer of grouping by default, with a message that it is doing so if there were originally more than one grouping columns: +Grouping with `.by` is temporary, meaning that since `expenses` was an ungrouped data frame, the result after applying `.by` will also always be an ungrouped data frame, regardless of the number of grouping columns. ```{r} expenses %>% - group_by(id, region) %>% - summarise(cost = mean(cost)) + summarise(cost = mean(cost), .by = c(id, region)) ``` -This behavior is sometimes useful to sequentially "roll up" a data frame that has been grouped by multiple columns, but often you just want to group temporarily. -Traditionally, you'd either `ungroup()` after the `summarise()` or explicitly set `.groups = "drop"` to achieve this: +Compare that with `group_by() %>% summarise()`, where `summarise()` generally peels off 1 layer of grouping by default, typically with a message that it is doing so: ```{r} expenses %>% group_by(id, region) %>% - summarise(cost = mean(cost), .groups = "drop") + summarise(cost = mean(cost)) ``` -Because `.by` grouping is temporary, you don't need to worry about ungrouping and it never needs to emit a message to remind you what is happening: - -```{r} -expenses %>% - summarise(cost = mean(cost), .by = c(id, region)) -``` +Because `.by` grouping is temporary, you don't need to worry about ungrouping, and it never needs to emit a message to remind you what it is doing with the groups. -Note that we specified multiple columns to group by using the [tidy-select][dplyr_tidy_select] syntax `c(id, region)`. +Note that with `.by` we specified multiple columns to group by using the [tidy-select][dplyr_tidy_select] syntax `c(id, region)`. If you have a character vector of column names you'd like to group by, you can do so with `.by = all_of(my_cols)`. +It will group by the columns in the order they were provided. To prevent surprising results, you can't use `.by` on an existing grouped data frame: @@ -108,19 +111,10 @@ expenses %>% slice_max(cost, n = 1, by = c(id, region)) ``` -Again, note that the result of this `slice_max()` is an ungrouped data frame, which is probably what you'd expect here. -Compare that to the `group_by()` approach, which uses persistent grouping and returns another grouped data frame: - -```{r} -expenses %>% - group_by(id, region) %>% - slice_max(cost, n = 1) -``` - ### Result ordering -When used with `.by`, `summarise()` and `slice()` both return results where the group keys are ordered by their existing ordering in the original data rather than by their values. -This is different from `group_by()`, which has always sorted the keys in ascending order. +When used with `.by`, `summarise()` and `slice()` both maintain the ordering of the existing data. +This is different from `group_by()`, which has always sorted the group keys in ascending order. ```{r} df <- tibble( @@ -136,15 +130,6 @@ df %>% df %>% group_by(month) %>% summarise(average_temp = mean(temp)) - -# Same with `slice()` -df %>% - slice_max(temp, by = month) - -df %>% - group_by(month) %>% - slice_max(temp) %>% - ungroup() ``` If you need sorted group keys, we recommend that you explicitly use [arrange()] either before or after the call to `summarise()` or `slice()`. @@ -165,3 +150,4 @@ This is typically because the verbs are required to retain the grouping columns, - [arrange()] has a `.by_group` argument to optionally order by grouping columns first. If `group_by()` didn't exist, then these verbs would not have special support for grouped data frames. + From ea8037317d6c532d21b9da6521ad35dcbfb92124 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Thu, 17 Nov 2022 16:59:40 -0500 Subject: [PATCH 15/20] Include `.by` help page in pkgdown reference --- R/by.R | 1 - _pkgdown.yml | 1 + man/dplyr_by.Rd | 1 - 3 files changed, 1 insertion(+), 2 deletions(-) diff --git a/R/by.R b/R/by.R index 09132542c5..f812f41bc2 100644 --- a/R/by.R +++ b/R/by.R @@ -23,7 +23,6 @@ NULL #' ``` #' #' @name dplyr_by -#' @keywords internal NULL compute_by <- function(by, diff --git a/_pkgdown.yml b/_pkgdown.yml index 69b824e1bc..ce0f44a6c0 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -41,6 +41,7 @@ reference: contents: - count - group_by + - dplyr_by - rowwise - summarise - 'n' diff --git a/man/dplyr_by.Rd b/man/dplyr_by.Rd index 415d684691..8c42555bcb 100644 --- a/man/dplyr_by.Rd +++ b/man/dplyr_by.Rd @@ -210,4 +210,3 @@ This is typically because the verbs are required to retain the grouping columns, If \code{group_by()} didn't exist, then these verbs would not have special support for grouped data frames. } } -\keyword{internal} From e7567c9d9c1d5d9902db76962632e9435b753a9b Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Thu, 17 Nov 2022 17:01:42 -0500 Subject: [PATCH 16/20] Apply suggestions from code review Co-authored-by: Hadley Wickham --- R/by.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R/by.R b/R/by.R index f812f41bc2..7715edea65 100644 --- a/R/by.R +++ b/R/by.R @@ -91,7 +91,7 @@ check_by <- function(by, if (is_grouped_df(data)) { message <- paste0( - "{.arg {by_arg}} can't be supplied when ", + "Can't supply {.arg {by_arg}} when ", "{.arg {data_arg}} is a grouped data frame." ) cli::cli_abort(message, call = error_call) @@ -99,7 +99,7 @@ check_by <- function(by, if (is_rowwise_df(data)) { message <- paste0( - "{.arg {by_arg}} can't be supplied when ", + "Can't supply {.arg {by_arg}} when ", "{.arg {data_arg}} is a rowwise data frame." ) cli::cli_abort(message, call = error_call) From 0a72ad6ec08fe3153aad87a1a280bceee25a63c8 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Thu, 17 Nov 2022 17:05:16 -0500 Subject: [PATCH 17/20] Regenerate snapshots --- tests/testthat/_snaps/by.md | 6 +++--- tests/testthat/_snaps/filter.md | 4 ++-- tests/testthat/_snaps/mutate.md | 4 ++-- tests/testthat/_snaps/slice.md | 14 +++++++------- tests/testthat/_snaps/summarise.md | 4 ++-- 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/tests/testthat/_snaps/by.md b/tests/testthat/_snaps/by.md index d9ae71c204..343d99012e 100644 --- a/tests/testthat/_snaps/by.md +++ b/tests/testthat/_snaps/by.md @@ -13,7 +13,7 @@ compute_by(x, gdf) Condition Error: - ! `by` can't be supplied when `data` is a grouped data frame. + ! Can't supply `by` when `data` is a grouped data frame. # can't set `.by` with a rowwise-df @@ -21,7 +21,7 @@ compute_by(x, rdf) Condition Error: - ! `by` can't be supplied when `data` is a rowwise data frame. + ! Can't supply `by` when `data` is a rowwise data frame. # can tweak the error args @@ -29,5 +29,5 @@ compute_by(x, gdf, by_arg = "x", data_arg = "dat") Condition Error: - ! `x` can't be supplied when `dat` is a grouped data frame. + ! Can't supply `x` when `dat` is a grouped data frame. diff --git a/tests/testthat/_snaps/filter.md b/tests/testthat/_snaps/filter.md index 405f797358..4c4f7fd619 100644 --- a/tests/testthat/_snaps/filter.md +++ b/tests/testthat/_snaps/filter.md @@ -243,7 +243,7 @@ filter(gdf, .by = x) Condition Error in `filter()`: - ! `.by` can't be supplied when `.data` is a grouped data frame. + ! Can't supply `.by` when `.data` is a grouped data frame. # catches `.by` with rowwise-df @@ -251,5 +251,5 @@ filter(rdf, .by = x) Condition Error in `filter()`: - ! `.by` can't be supplied when `.data` is a rowwise data frame. + ! Can't supply `.by` when `.data` is a rowwise data frame. diff --git a/tests/testthat/_snaps/mutate.md b/tests/testthat/_snaps/mutate.md index b406242fde..2211198702 100644 --- a/tests/testthat/_snaps/mutate.md +++ b/tests/testthat/_snaps/mutate.md @@ -65,7 +65,7 @@ mutate(gdf, .by = x) Condition Error in `mutate()`: - ! `.by` can't be supplied when `.data` is a grouped data frame. + ! Can't supply `.by` when `.data` is a grouped data frame. # catches `.by` with rowwise-df @@ -73,7 +73,7 @@ mutate(rdf, .by = x) Condition Error in `mutate()`: - ! `.by` can't be supplied when `.data` is a rowwise data frame. + ! Can't supply `.by` when `.data` is a rowwise data frame. # mutate() deals with 0 groups (#5534) diff --git a/tests/testthat/_snaps/slice.md b/tests/testthat/_snaps/slice.md index dde7d7d033..750561e94f 100644 --- a/tests/testthat/_snaps/slice.md +++ b/tests/testthat/_snaps/slice.md @@ -61,7 +61,7 @@ slice(gdf, .by = x) Condition Error in `slice()`: - ! `.by` can't be supplied when `.data` is a grouped data frame. + ! Can't supply `.by` when `.data` is a grouped data frame. # catches `.by` with rowwise-df @@ -69,7 +69,7 @@ slice(rdf, .by = x) Condition Error in `slice()`: - ! `.by` can't be supplied when `.data` is a rowwise data frame. + ! Can't supply `.by` when `.data` is a rowwise data frame. # slice_helpers() call get_slice_size() @@ -226,27 +226,27 @@ slice_head(gdf, n = 1, by = x) Condition Error in `slice_head()`: - ! `by` can't be supplied when `.data` is a grouped data frame. + ! Can't supply `by` when `.data` is a grouped data frame. Code slice_tail(gdf, n = 1, by = x) Condition Error in `slice_tail()`: - ! `by` can't be supplied when `.data` is a grouped data frame. + ! Can't supply `by` when `.data` is a grouped data frame. Code slice_min(gdf, order_by = x, by = x) Condition Error in `slice_min()`: - ! `by` can't be supplied when `.data` is a grouped data frame. + ! Can't supply `by` when `.data` is a grouped data frame. Code slice_max(gdf, order_by = x, by = x) Condition Error in `slice_max()`: - ! `by` can't be supplied when `.data` is a grouped data frame. + ! Can't supply `by` when `.data` is a grouped data frame. Code slice_sample(gdf, n = 1, by = x) Condition Error in `slice_sample()`: - ! `by` can't be supplied when `.data` is a grouped data frame. + ! Can't supply `by` when `.data` is a grouped data frame. # slice_min/max() check size of `order_by=` (#5922) diff --git a/tests/testthat/_snaps/summarise.md b/tests/testthat/_snaps/summarise.md index 987d1e8745..21f084e581 100644 --- a/tests/testthat/_snaps/summarise.md +++ b/tests/testthat/_snaps/summarise.md @@ -12,7 +12,7 @@ summarise(gdf, .by = x) Condition Error in `summarise()`: - ! `.by` can't be supplied when `.data` is a grouped data frame. + ! Can't supply `.by` when `.data` is a grouped data frame. # catches `.by` with rowwise-df @@ -20,7 +20,7 @@ summarise(rdf, .by = x) Condition Error in `summarise()`: - ! `.by` can't be supplied when `.data` is a rowwise data frame. + ! Can't supply `.by` when `.data` is a rowwise data frame. # summarise() gives meaningful errors From 9201a7f08d1239e80a96a907ae71381d9cd40801 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Thu, 17 Nov 2022 17:13:01 -0500 Subject: [PATCH 18/20] Regenerate documentation --- man/dplyr_by.Rd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/man/dplyr_by.Rd b/man/dplyr_by.Rd index 8c42555bcb..9c9a85de1a 100644 --- a/man/dplyr_by.Rd +++ b/man/dplyr_by.Rd @@ -123,7 +123,7 @@ To prevent surprising results, you can't use \code{.by} on an existing grouped d group_by(id) \%>\% summarise(cost = mean(cost), .by = c(id, region)) #> Error in `summarise()`: -#> ! `.by` can't be supplied when `.data` is a grouped data frame. +#> ! Can't supply `.by` when `.data` is a grouped data frame. }\if{html}{\out{}} So far we've focused on the usage of \code{.by} with \code{summarise()}, but \code{.by} works with a number of other dplyr verbs. From fb6ee6d9183744de70e2a9835241d5a73237b3a8 Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Thu, 17 Nov 2022 18:04:55 -0500 Subject: [PATCH 19/20] Ensure that `compute_by()` is type stable on `$data` It should always return a bare tibble, even though `group_data()` returns a data frame for data frame input. --- R/by.R | 1 + tests/testthat/test-by.R | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/R/by.R b/R/by.R index 7715edea65..7d291e07e2 100644 --- a/R/by.R +++ b/R/by.R @@ -54,6 +54,7 @@ compute_by <- function(by, type <- "ungrouped" names <- by data <- group_data(data) + data <- as_tibble(data) } else { type <- "grouped" names <- by diff --git a/tests/testthat/test-by.R b/tests/testthat/test-by.R index bbcf660209..de4dc1a41d 100644 --- a/tests/testthat/test-by.R +++ b/tests/testthat/test-by.R @@ -29,6 +29,13 @@ test_that("computes `by` group data in order of appearance", { }) test_that("extracts existing data when `by = NULL`", { + df <- data.frame(x = c(1, 1, 2, 2, 1)) + out <- compute_by(by = NULL, data = df) + expect_identical(out$type, "ungrouped") + expect_identical(out$names, character()) + # `compute_by()` is always type stable on `$data` and returns a bare tibble + expect_identical(out$data, as_tibble(group_data(df))) + df <- tibble(x = c(1, 1, 2, 2, 1)) out <- compute_by(by = NULL, data = df) expect_identical(out$type, "ungrouped") From ce6926f984311b7c03909773da773a018101e45a Mon Sep 17 00:00:00 2001 From: DavisVaughan Date: Thu, 17 Nov 2022 18:05:50 -0500 Subject: [PATCH 20/20] Remove `TODO`s --- R/summarise.R | 2 -- 1 file changed, 2 deletions(-) diff --git a/R/summarise.R b/R/summarise.R index 0c1fc52d7e..bad974ac9a 100644 --- a/R/summarise.R +++ b/R/summarise.R @@ -152,7 +152,6 @@ summarise.data.frame <- function(.data, ..., .by = NULL, .groups = NULL) { #' @export summarise.grouped_df <- function(.data, ..., .by = NULL, .groups = NULL) { - # TODO: Is it right to even expose `.by` here? It lets us catch errors. # Will always error if `.by != NULL` b/c you can't use it with grouped/rowwise dfs. by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data") @@ -199,7 +198,6 @@ summarise.grouped_df <- function(.data, ..., .by = NULL, .groups = NULL) { #' @export summarise.rowwise_df <- function(.data, ..., .by = NULL, .groups = NULL) { - # TODO: Is it right to even expose `.by` here? It lets us catch errors. # Will always error if `.by != NULL` b/c you can't use it with grouped/rowwise dfs. by <- compute_by({{ .by }}, .data, by_arg = ".by", data_arg = ".data")