From 15bb164479d058fbd00b96f0ef3d51d081946d6b Mon Sep 17 00:00:00 2001 From: Lorenzo ISELLA Date: Thu, 12 Feb 2026 14:35:41 +0100 Subject: [PATCH 1/5] r: add support for dplyr::filter_out() --- r/R/dplyr-filter.R | 61 +++++++++++++++++++++++++++- r/tests/testthat/test-dplyr-filter.R | 41 +++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index 18f5c929affb..5d0af8ef47f5 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -47,7 +47,7 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) call = expr ) } - out <- set_filters(out, filt) + out <- set_filters(out, filt, exclude = FALSE) } if (by$from_by) { @@ -59,7 +59,58 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) } filter.Dataset <- filter.ArrowTabular <- filter.RecordBatchReader <- filter.arrow_dplyr_query -set_filters <- function(.data, expressions) { +filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { + try_arrow_dplyr({ + # TODO something with the .preserve argument + out <- as_adq(.data) + + by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data") + + if (by$from_by) { + out$group_by_vars <- by$names + } + + expanded_filters <- expand_across(out, quos(...)) + if (length(expanded_filters) == 0) { + # Nothing to do + return(as_adq(.data)) + } + + # tidy-eval the filter expressions inside an Arrow data_mask + mask <- arrow_mask(out) + + combined <- NULL + + for (expr in expanded_filters) { + filt <- arrow_eval(expr, mask) + + if (length(mask$.aggregations)) { + arrow_not_supported( + .actual_msg = "Expression not supported in filter_out() in Arrow", + call = expr + ) + } + + # arrow_eval() may return either an Expression or a list_of + if (is_list_of(filt, "Expression")) { + filt <- Reduce("&", filt) + } + + combined <- if (is.null(combined)) filt else (combined & filt) + } + + out <- set_filters(out, combined, exclude = TRUE) + + if (by$from_by) { + out$group_by_vars <- character() + } + + out + }) +} +filter_out.Dataset <- filter_out.ArrowTabular <- filter_out.RecordBatchReader <- filter_out.arrow_dplyr_query + +set_filters <- function(.data, expressions, exclude = FALSE) { if (length(expressions)) { if (is_list_of(expressions, "Expression")) { # expressions is a list of Expressions. AND them together and set them on .data @@ -70,6 +121,12 @@ set_filters <- function(.data, expressions) { stop("filter expressions must be either an expression or a list of expressions", call. = FALSE) } + if (isTRUE(exclude)) { + # dplyr::filter_out() semantics: drop rows where predicate is TRUE; + # keep rows where predicate is FALSE or NA. + new_filter <- (!new_filter) | is.na(new_filter) + } + if (isTRUE(.data$filtered_rows)) { # TRUE is default (i.e. no filter yet), so we don't need to & with it .data$filtered_rows <- new_filter diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index d56e25fca329..9bf81b9a4f04 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -498,3 +498,44 @@ test_that("filter() with aggregation expressions errors", { "not supported in filter" ) }) + +test_that("filter_out() basic", { + compare_dplyr_binding( + .input |> + filter_out(chr == "b") |> + select(chr, int, lgl) |> + collect(), + tbl + ) +}) + +test_that("filter_out() keeps NA values in predicate result", { + compare_dplyr_binding( + .input |> + filter_out(lgl) |> + select(chr, int, lgl) |> + collect(), + tbl + ) +}) + +test_that("filter_out() with multiple conditions", { + compare_dplyr_binding( + .input |> + filter_out(dbl > 2, chr %in% c("d", "f")) |> + collect(), + tbl + ) +}) + +test_that("More complex select/filter_out", { + compare_dplyr_binding( + .input |> + filter_out(dbl > 2, chr == "d" | chr == "f") |> + select(chr, int, lgl) |> + filter(int < 5) |> + select(int, chr) |> + collect(), + tbl + ) +}) From d6f4671cbcc1088b6c3704a25c618c9a45f044a3 Mon Sep 17 00:00:00 2001 From: Lorenzo ISELLA Date: Fri, 13 Feb 2026 16:09:20 +0100 Subject: [PATCH 2/5] GH-49257: [R] Support dplyr::filter_out() in Arrow backend --- r/R/arrow-package.R | 1 + r/R/dplyr-filter.R | 124 ++++++++++++++++++++++---------------------- 2 files changed, 64 insertions(+), 61 deletions(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index a1167433c932..5a596dffe3cd 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -38,6 +38,7 @@ supported_dplyr_methods <- list( select = NULL, filter = NULL, + filter_out = NULL, collect = NULL, summarise = c( "window functions not currently supported;", diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index 5d0af8ef47f5..da4fd4bd6618 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -17,68 +17,30 @@ # The following S3 methods are registered on load if dplyr is present -filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { - try_arrow_dplyr({ - # TODO something with the .preserve argument - out <- as_adq(.data) - - by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data") - - if (by$from_by) { - out$group_by_vars <- by$names - } - - expanded_filters <- expand_across(out, quos(...)) - if (length(expanded_filters) == 0) { - # Nothing to do - return(as_adq(.data)) - } - - # tidy-eval the filter expressions inside an Arrow data_mask - mask <- arrow_mask(out) - for (expr in expanded_filters) { - filt <- arrow_eval(expr, mask) - if (length(mask$.aggregations)) { - # dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it. - # But we could, the same way it works in mutate() via join, if someone asks. - # Until then, just error. - arrow_not_supported( - .actual_msg = "Expression not supported in filter() in Arrow", - call = expr - ) - } - out <- set_filters(out, filt, exclude = FALSE) - } +apply_filter_impl <- function(.data, ..., .by = NULL, .preserve = FALSE, + exclude = FALSE, verb = c("filter", "filter_out")) { + verb <- match.arg(verb) - if (by$from_by) { - out$group_by_vars <- character() - } + # TODO something with the .preserve argument + out <- as_adq(.data) - out - }) -} -filter.Dataset <- filter.ArrowTabular <- filter.RecordBatchReader <- filter.arrow_dplyr_query - -filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { - try_arrow_dplyr({ - # TODO something with the .preserve argument - out <- as_adq(.data) - - by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data") + by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data") - if (by$from_by) { - out$group_by_vars <- by$names - } + if (by$from_by) { + out$group_by_vars <- by$names + } - expanded_filters <- expand_across(out, quos(...)) - if (length(expanded_filters) == 0) { - # Nothing to do - return(as_adq(.data)) - } + expanded_filters <- expand_across(out, quos(...)) + if (length(expanded_filters) == 0) { + # Nothing to do + return(as_adq(.data)) + } - # tidy-eval the filter expressions inside an Arrow data_mask - mask <- arrow_mask(out) + # tidy-eval the filter expressions inside an Arrow data_mask + mask <- arrow_mask(out) + if (isTRUE(exclude)) { + # filter_out(): combine all predicates with &, then exclude combined <- NULL for (expr in expanded_filters) { @@ -86,12 +48,11 @@ filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FAL if (length(mask$.aggregations)) { arrow_not_supported( - .actual_msg = "Expression not supported in filter_out() in Arrow", + .actual_msg = sprintf("Expression not supported in %s() in Arrow", verb), call = expr ) } - # arrow_eval() may return either an Expression or a list_of if (is_list_of(filt, "Expression")) { filt <- Reduce("&", filt) } @@ -100,12 +61,53 @@ filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FAL } out <- set_filters(out, combined, exclude = TRUE) + } else { + # filter(): apply each predicate sequentially + for (expr in expanded_filters) { + filt <- arrow_eval(expr, mask) - if (by$from_by) { - out$group_by_vars <- character() + if (length(mask$.aggregations)) { + arrow_not_supported( + .actual_msg = sprintf("Expression not supported in %s() in Arrow", verb), + call = expr + ) + } + + out <- set_filters(out, filt, exclude = FALSE) } + } - out + if (by$from_by) { + out$group_by_vars <- character() + } + + out +} + +filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { + try_arrow_dplyr({ + apply_filter_impl( + .data, + ..., + .by = {{ .by }}, + .preserve = .preserve, + exclude = FALSE, + verb = "filter" + ) + }) +} +filter.Dataset <- filter.ArrowTabular <- filter.RecordBatchReader <- filter.arrow_dplyr_query + +filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { + try_arrow_dplyr({ + apply_filter_impl( + .data, + ..., + .by = {{ .by }}, + .preserve = .preserve, + exclude = TRUE, + verb = "filter_out" + ) }) } filter_out.Dataset <- filter_out.ArrowTabular <- filter_out.RecordBatchReader <- filter_out.arrow_dplyr_query From 7134175a095f93125a9e6869128bc5a2a9a70467 Mon Sep 17 00:00:00 2001 From: Lorenzo Isella Date: Sat, 14 Feb 2026 18:13:45 +0100 Subject: [PATCH 3/5] GH-49257: [R] Run air format --- r/R/dplyr-filter.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index da4fd4bd6618..0ccb5fb8944d 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -17,8 +17,14 @@ # The following S3 methods are registered on load if dplyr is present -apply_filter_impl <- function(.data, ..., .by = NULL, .preserve = FALSE, - exclude = FALSE, verb = c("filter", "filter_out")) { +apply_filter_impl <- function( + .data, + ..., + .by = NULL, + .preserve = FALSE, + exclude = FALSE, + verb = c("filter", "filter_out") +) { verb <- match.arg(verb) # TODO something with the .preserve argument From 6ae4f4c7978439851c7bbe973c7ef66e2768640f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 16 Feb 2026 10:26:00 +0000 Subject: [PATCH 4/5] Tweak parameters, add another test --- r/R/dplyr-filter.R | 52 ++++++++++++++++++---------- r/tests/testthat/test-dplyr-filter.R | 7 ++++ 2 files changed, 40 insertions(+), 19 deletions(-) diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index 0ccb5fb8944d..26fa1bf7d5f2 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -22,11 +22,8 @@ apply_filter_impl <- function( ..., .by = NULL, .preserve = FALSE, - exclude = FALSE, - verb = c("filter", "filter_out") + negate = FALSE ) { - verb <- match.arg(verb) - # TODO something with the .preserve argument out <- as_adq(.data) @@ -45,16 +42,19 @@ apply_filter_impl <- function( # tidy-eval the filter expressions inside an Arrow data_mask mask <- arrow_mask(out) - if (isTRUE(exclude)) { - # filter_out(): combine all predicates with &, then exclude + if (isTRUE(negate)) { + # filter_out(): combine all predicates with &, then negate combined <- NULL for (expr in expanded_filters) { filt <- arrow_eval(expr, mask) if (length(mask$.aggregations)) { + # dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. arrow_not_supported( - .actual_msg = sprintf("Expression not supported in %s() in Arrow", verb), + .actual_msg = "Expression not supported in filter_out() in Arrow", call = expr ) } @@ -66,20 +66,23 @@ apply_filter_impl <- function( combined <- if (is.null(combined)) filt else (combined & filt) } - out <- set_filters(out, combined, exclude = TRUE) + out <- set_filters(out, combined, negate = TRUE) } else { # filter(): apply each predicate sequentially for (expr in expanded_filters) { filt <- arrow_eval(expr, mask) if (length(mask$.aggregations)) { + # dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it. + # But we could, the same way it works in mutate() via join, if someone asks. + # Until then, just error. arrow_not_supported( - .actual_msg = sprintf("Expression not supported in %s() in Arrow", verb), + .actual_msg = "Expression not supported in filter() in Arrow", call = expr ) } - out <- set_filters(out, filt, exclude = FALSE) + out <- set_filters(out, filt, negate = FALSE) } } @@ -90,35 +93,43 @@ apply_filter_impl <- function( out } -filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { +filter.arrow_dplyr_query <- function( + .data, + ..., + .by = NULL, + .preserve = FALSE +) { try_arrow_dplyr({ apply_filter_impl( .data, ..., .by = {{ .by }}, .preserve = .preserve, - exclude = FALSE, - verb = "filter" + negate = FALSE ) }) } filter.Dataset <- filter.ArrowTabular <- filter.RecordBatchReader <- filter.arrow_dplyr_query -filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { +filter_out.arrow_dplyr_query <- function( + .data, + ..., + .by = NULL, + .preserve = FALSE +) { try_arrow_dplyr({ apply_filter_impl( .data, ..., .by = {{ .by }}, .preserve = .preserve, - exclude = TRUE, - verb = "filter_out" + negate = TRUE ) }) } filter_out.Dataset <- filter_out.ArrowTabular <- filter_out.RecordBatchReader <- filter_out.arrow_dplyr_query -set_filters <- function(.data, expressions, exclude = FALSE) { +set_filters <- function(.data, expressions, negate = FALSE) { if (length(expressions)) { if (is_list_of(expressions, "Expression")) { # expressions is a list of Expressions. AND them together and set them on .data @@ -126,10 +137,13 @@ set_filters <- function(.data, expressions, exclude = FALSE) { } else if (inherits(expressions, "Expression")) { new_filter <- expressions } else { - stop("filter expressions must be either an expression or a list of expressions", call. = FALSE) + stop( + "filter expressions must be either an expression or a list of expressions", + call. = FALSE + ) } - if (isTRUE(exclude)) { + if (isTRUE(negate)) { # dplyr::filter_out() semantics: drop rows where predicate is TRUE; # keep rows where predicate is FALSE or NA. new_filter <- (!new_filter) | is.na(new_filter) diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index 9bf81b9a4f04..3912e518ed08 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -538,4 +538,11 @@ test_that("More complex select/filter_out", { collect(), tbl ) + + compare_dplyr_binding( + .input |> + filter_out(!is.na(int)) |> + collect(), + tbl + ) }) From 2ebd4faa41aa94b3bd0f682c1f0e485b856c7e2f Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Mon, 16 Feb 2026 11:49:23 +0000 Subject: [PATCH 5/5] Run make doc --- r/R/dplyr-funcs-doc.R | 3 ++- r/man/acero.Rd | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index bbd1c91a0213..9293d14c94c0 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -19,7 +19,7 @@ #' Functions available in Arrow dplyr queries #' -#' The `arrow` package contains methods for 37 `dplyr` table functions, many of +#' The `arrow` package contains methods for 38 `dplyr` table functions, many of #' which are "verbs" that do transformations to one or more tables. #' The package also has mappings of 224 R functions to the corresponding #' functions in the Arrow compute library. These allow you to write code inside @@ -45,6 +45,7 @@ #' * [`distinct()`][dplyr::distinct()]: `.keep_all = TRUE` returns a non-missing value if present, only returning missing values if all are missing. #' * [`explain()`][dplyr::explain()] #' * [`filter()`][dplyr::filter()] +#' * [`filter_out()`][dplyr::filter_out()] #' * [`full_join()`][dplyr::full_join()]: the `copy` argument is ignored #' * [`glimpse()`][dplyr::glimpse()] #' * [`group_by()`][dplyr::group_by()] diff --git a/r/man/acero.Rd b/r/man/acero.Rd index dcaca04d2f2c..ee156cc9129b 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -7,7 +7,7 @@ \alias{arrow-dplyr} \title{Functions available in Arrow dplyr queries} \description{ -The \code{arrow} package contains methods for 37 \code{dplyr} table functions, many of +The \code{arrow} package contains methods for 38 \code{dplyr} table functions, many of which are "verbs" that do transformations to one or more tables. The package also has mappings of 224 R functions to the corresponding functions in the Arrow compute library. These allow you to write code inside @@ -32,6 +32,7 @@ Table into an R \code{tibble}. \item \code{\link[dplyr:distinct]{distinct()}}: \code{.keep_all = TRUE} returns a non-missing value if present, only returning missing values if all are missing. \item \code{\link[dplyr:explain]{explain()}} \item \code{\link[dplyr:filter]{filter()}} +\item \code{\link[dplyr:filter]{filter_out()}} \item \code{\link[dplyr:mutate-joins]{full_join()}}: the \code{copy} argument is ignored \item \code{\link[dplyr:glimpse]{glimpse()}} \item \code{\link[dplyr:group_by]{group_by()}} @@ -198,7 +199,7 @@ Valid values are "s", "ms" (default), "us", "ns". \itemize{ \item \code{\link[dplyr:across]{across()}} \item \code{\link[dplyr:between]{between()}} -\item \code{\link[dplyr:case_when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported +\item \code{\link[dplyr:case-and-replace-when]{case_when()}}: \code{.ptype} and \code{.size} arguments not supported \item \code{\link[dplyr:coalesce]{coalesce()}} \item \code{\link[dplyr:desc]{desc()}} \item \code{\link[dplyr:across]{if_all()}}