tidyverse · DavisVaughan · Nov 17, 2022 · Nov 4, 2022 · Nov 7, 2022 · Nov 7, 2022
diff --git a/NEWS.md b/NEWS.md
@@ -1,5 +1,47 @@
 # dplyr (development version)
 
+* `.by` is a new experimental inline alternative to `group_by()` that supports
+  _temporary_ grouping in the following key dplyr verbs: `mutate()`,
+  `summarise()`, `filter()`, and the `slice()` family (#6528).
+
+  Rather than:
+
+  ```
+  starwars %>%
+    group_by(species, homeworld) %>%
+    summarise(mean_height = mean(height))
+  ```
+
+  You can now write:
+
+  ```
+  starwars %>%
+    summarise(
+      mean_height = mean(height),
+      .by = c(species, homeworld)
+    )
+  ```
+
+  The most useful reason to do this is because grouping with `.by` is
+  _temporary_ and only affects the verb it is being applied to. An ungrouped
+  data frame went into the `summarise()` call, so an ungrouped data frame will
+  come out; with `.by`, you never need to remember to `ungroup()` afterwards.
+
+  Additionally, using `summarise()` or `slice()` with `.by` will never sort the
+  results by the group key, unlike with `group_by()`. Instead, the results are
+  returned using the existing ordering of the groups from the original data. We
+  feel this is more predictable, better maintains any ordering you might have
+  already applied with a previous call to `arrange()`, and provides a way to
+  maintain the current ordering without having to resort to factors.
+
+  This exciting feature was inspired by
+  [data.table](https://CRAN.R-project.org/package=data.table), where the
+  equivalent syntax looks like:
+
+  ```
+  starwars[, .(mean_height = mean(height)), by = .(species, homeworld)]
+  ```
+
 * `summarise()` now correctly recycles named 0-column data frames (#6509).
 
 * `.cols` and `.fns` are now required arguments in `across()`, `c_across()`,

diff --git a/R/by.R b/R/by.R
@@ -0,0 +1,126 @@
+#' Helper for consistent documentation of `.by`
+#'
+#' Use `@inheritParams args_by` to consistently document `.by`.
+#'
+#' @param .by `r lifecycle::badge("experimental")`
+#'
+#'   <[`tidy-select`][dplyr_tidy_select]> Optionally, a selection of columns to
+#'   temporarily group by using an inline alternative to [group_by()]. For
+#'   details and examples, see [?dplyr_by][dplyr_by].
+#'
+#' @name args_by
+#' @keywords internal
+NULL
+
+#' Temporary grouping with `.by`
+#'
+#' ```{r, echo = FALSE, results = "asis"}
+#' result <- rlang::with_options(
+#'   knitr::knit_child("man/rmd/by.Rmd"),
+#'   dplyr.summarise.inform = TRUE
+#' )
+#' cat(result, sep = "\n")
+#' ```
+#'
+#' @name dplyr_by
+NULL
+
+compute_by <- function(by,
+                       data,
+                       ...,
+                       by_arg = "by",
+                       data_arg = "data",
+                       error_call = caller_env()) {
+  check_dots_empty0(...)
+
+  error_call <- dplyr_error_call(error_call)
+
+  by <- enquo(by)
+  check_by(by, data, by_arg = by_arg, data_arg = data_arg, error_call = error_call)
+
+  if (is_grouped_df(data)) {
+    type <- "grouped"
+    names <- group_vars(data)
+    data <- group_data(data)
+  } else if (is_rowwise_df(data)) {
+    type <- "rowwise"
+    names <- group_vars(data)
+    data <- group_data(data)
+  } else {
+    by <- eval_select_by(by, data, error_call = error_call)
+
+    if (length(by) == 0L) {
+      # `by = NULL` or empty selection
+      type <- "ungrouped"
+      names <- by
+      data <- group_data(data)
+      data <- as_tibble(data)
+    } else {
+      type <- "grouped"
+      names <- by
+      data <- compute_by_groups(data, by, error_call = error_call)
+    }
+  }
+
+  new_by(type = type, names = names, data = data)
+}
+
+compute_by_groups <- function(data, names, error_call = caller_env()) {
+  data <- dplyr_col_select(data, names, error_call = error_call)
+  info <- vec_group_loc(data)
+
+  size <- vec_size(info)
+
+  out <- dplyr_new_list(info$key)
+  out[[".rows"]] <- new_list_of(info$loc, ptype = integer())
+  out <- new_tibble(out, nrow = size)
+
+  out
+}
+
+check_by <- function(by,
+                     data,
+                     ...,
+                     by_arg = "by",
+                     data_arg = "data",
+                     error_call = caller_env()) {
+  check_dots_empty0(...)
+
+  if (quo_is_null(by)) {
+    return(invisible(NULL))
+  }
+
+  if (is_grouped_df(data)) {
+    message <- paste0(
+      "Can't supply {.arg {by_arg}} when ",
+      "{.arg {data_arg}} is a grouped data frame."
+    )
+    cli::cli_abort(message, call = error_call)
+  }
+
+  if (is_rowwise_df(data)) {
+    message <- paste0(
+      "Can't supply {.arg {by_arg}} when ",
+      "{.arg {data_arg}} is a rowwise data frame."
+    )
+    cli::cli_abort(message, call = error_call)
+  }
+
+  invisible(NULL)
+}
+
+eval_select_by <- function(by,
+                           data,
+                           error_call = caller_env()) {
+  out <- tidyselect::eval_select(
+    expr = by,
+    data = data,
+    allow_rename = FALSE,
+    error_call = error_call
+  )
+  names(out)
+}
+
+new_by <- function(type, names, data) {
+  structure(list(type = type, names = names, data = data), class = "dplyr_by")
+}
diff --git a/R/conditions.R b/R/conditions.R
@@ -47,7 +47,7 @@ cnd_bullet_cur_group_label <- function(what = "error") {
 }
 
 cnd_bullet_rowwise_unlist <- function() {
-  if (peek_mask()$is_rowwise_df()) {
+  if (peek_mask()$is_rowwise()) {
     glue_data(peek_error_context(), "Did you mean: `{error_name} = list({error_expression})` ?")
   }
 }
@@ -131,9 +131,9 @@ dot_as_label <- function(expr) {
 
 mask_type <- function(mask = peek_mask()) {
   if (mask$get_size() > 0) {
-    if (mask$is_grouped_df()) {
+    if (mask$is_grouped()) {
       return("grouped")
-    } else if (mask$is_rowwise_df()) {
+    } else if (mask$is_rowwise()) {
       return("rowwise")
     }
   }

diff --git a/R/data-mask.R b/R/data-mask.R
@@ -1,7 +1,7 @@
 DataMask <- R6Class("DataMask",
   public = list(
-    initialize = function(data, verb, error_call) {
-      rows <- group_rows(data)
+    initialize = function(data, by, verb, error_call) {
+      rows <- by$data$.rows
       if (length(rows) == 0) {
         # Specially handle case of zero groups
         rows <- new_list_of(list(integer()), ptype = integer())
@@ -16,22 +16,23 @@ DataMask <- R6Class("DataMask",
         abort("Can't transform a data frame with duplicate names.", call = error_call)
       }
       names(data) <- names_bindings
+
       private$size <- nrow(data)
       private$current_data <- dplyr_new_list(data)
 
-      private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows)
-      private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows)
+      private$grouped <- by$type == "grouped"
+      private$rowwise <- by$type == "rowwise"
 
-      private$grouped_df <- is_grouped_df(data)
-      private$rowwise_df <- is_rowwise_df(data)
+      private$chops <- .Call(dplyr_lazy_vec_chop_impl, data, rows, private$grouped, private$rowwise)
+      private$mask <- .Call(dplyr_data_masks_setup, private$chops, data, rows)
 
-      private$keys <- group_keys(data)
-      private$group_vars <- group_vars(data)
+      private$keys <- group_keys0(by$data)
+      private$by_names <- by$names
       private$verb <- verb
     },
 
     add_one = function(name, chunks, result) {
-      if (self$is_rowwise_df()){
+      if (self$is_rowwise()){
         is_scalar_list <- function(.x) {
           vec_is_list(.x) && length(.x) == 1L
         }
@@ -78,7 +79,7 @@ DataMask <- R6Class("DataMask",
       # `across(.fns = NULL)`. We should remove this when we defunct those.
       cols <- self$current_cols(vars)
 
-      if (self$is_rowwise_df()) {
+      if (self$is_rowwise()) {
         cols <- map2(cols, names(cols), function(col, name) {
           if (vec_is_list(private$current_data[[name]])) {
             col <- list(col)
@@ -117,7 +118,7 @@ DataMask <- R6Class("DataMask",
     },
 
     current_non_group_vars = function() {
-      setdiff(self$current_vars(), private$group_vars)
+      setdiff(self$current_vars(), private$by_names)
     },
 
     get_current_group = function() {
@@ -172,12 +173,12 @@ DataMask <- R6Class("DataMask",
       })
     },
 
-    is_grouped_df = function() {
-      private$grouped_df
+    is_grouped = function() {
+      private$grouped
     },
 
-    is_rowwise_df = function() {
-      private$rowwise_df
+    is_rowwise = function() {
+      private$rowwise
     },
 
     get_keys = function() {
@@ -215,8 +216,8 @@ DataMask <- R6Class("DataMask",
     # ptypes of all the variables
     current_data = list(),
 
-    # names of the grouping variables
-    group_vars = character(),
+    # names of the `by` variables
+    by_names = character(),
 
     # list of indices, one integer vector per group
     rows = NULL,
@@ -228,8 +229,8 @@ DataMask <- R6Class("DataMask",
     size = NULL,
 
     # Type of data frame
-    grouped_df = NULL,
-    rowwise_df = NULL,
+    grouped = NULL,
+    rowwise = NULL,
 
     verb = character()
   )

diff --git a/R/filter.R b/R/filter.R
@@ -48,6 +48,7 @@
 #'
 #' @family single table verbs
 #' @inheritParams arrange
+#' @inheritParams args_by
 #' @param ... <[`data-masking`][dplyr_data_masking]> Expressions that return a
 #'   logical value, and are defined in terms of the variables in `.data`.
 #'   If multiple expressions are included, they are combined with the `&` operator.
@@ -105,23 +106,37 @@
 #'     .data[[vars[[2]]]] > cond[[2]]
 #'   )
 #' # Learn more in ?dplyr_data_masking
-filter <- function(.data, ..., .preserve = FALSE) {
+filter <- function(.data, ..., .by = NULL, .preserve = FALSE) {
+  by <- enquo(.by)
+
+  if (!quo_is_null(by) && !is_false(.preserve)) {
+    abort("Can't supply both `.by` and `.preserve`.")
+  }
+
   UseMethod("filter")
 }
 
 #' @export
-filter.data.frame <- function(.data, ..., .preserve = FALSE) {
-  loc <- filter_rows(.data, ...)
+filter.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) {
+  loc <- filter_rows(.data, ..., .by = {{ .by }})
   dplyr_row_slice(.data, loc, preserve = .preserve)
 }
 
-filter_rows <- function(.data, ..., error_call = caller_env()) {
+filter_rows <- function(.data, ..., .by = NULL, error_call = caller_env()) {
   error_call <- dplyr_error_call(error_call)
 
   dots <- dplyr_quosures(...)
   check_filter(dots, error_call = error_call)
 
-  mask <- DataMask$new(.data, "filter", error_call = error_call)
+  by <- compute_by(
+    by = {{ .by }},
+    data = .data,
+    by_arg = ".by",
+    data_arg = ".data",
+    error_call = error_call
+  )
+
+  mask <- DataMask$new(.data, by, "filter", error_call = error_call)
   on.exit(mask$forget(), add = TRUE)
 
   dots <- filter_expand(dots, mask = mask, error_call = error_call)

diff --git a/R/group-by.R b/R/group-by.R
@@ -244,9 +244,13 @@ add_computed_columns <- function(.data,
   if (any(needs_mutate)) {
     # TODO: use less of a hack
     if (inherits(.data, "data.frame")) {
+      bare_data <- ungroup(.data)
+      by <- compute_by(by = NULL, data = bare_data)
+
       cols <- mutate_cols(
-        ungroup(.data),
+        bare_data,
         dplyr_quosures(!!!vars),
+        by = by,
         error_call = error_call
       )
 

diff --git a/R/group-data.R b/R/group-data.R
@@ -97,8 +97,13 @@ group_keys.data.frame <- function(.tbl, ...) {
     .tbl <- group_by(.tbl, ...)
   }
   out <- group_data(.tbl)
-  .Call(`dplyr_group_keys`, out)
+  group_keys0(out)
 }
+group_keys0 <- function(x) {
+  # Compute keys directly from `group_data()` results
+  .Call(`dplyr_group_keys`, x)
+}
+
 #' @rdname group_data
 #' @export
 group_rows <- function(.data) {