From 15bb164479d058fbd00b96f0ef3d51d081946d6b Mon Sep 17 00:00:00 2001 From: Lorenzo ISELLA Date: Thu, 12 Feb 2026 14:35:41 +0100 Subject: [PATCH 1/3] r: add support for dplyr::filter_out() --- r/R/dplyr-filter.R | 61 +++++++++++++++++++++++++++- r/tests/testthat/test-dplyr-filter.R | 41 +++++++++++++++++++ 2 files changed, 100 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index 18f5c929aff..5d0af8ef47f 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -47,7 +47,7 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) call = expr ) } - out <- set_filters(out, filt) + out <- set_filters(out, filt, exclude = FALSE) } if (by$from_by) { @@ -59,7 +59,58 @@ filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) } filter.Dataset <- filter.ArrowTabular <- filter.RecordBatchReader <- filter.arrow_dplyr_query -set_filters <- function(.data, expressions) { +filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { + try_arrow_dplyr({ + # TODO something with the .preserve argument + out <- as_adq(.data) + + by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data") + + if (by$from_by) { + out$group_by_vars <- by$names + } + + expanded_filters <- expand_across(out, quos(...)) + if (length(expanded_filters) == 0) { + # Nothing to do + return(as_adq(.data)) + } + + # tidy-eval the filter expressions inside an Arrow data_mask + mask <- arrow_mask(out) + + combined <- NULL + + for (expr in expanded_filters) { + filt <- arrow_eval(expr, mask) + + if (length(mask$.aggregations)) { + arrow_not_supported( + .actual_msg = "Expression not supported in filter_out() in Arrow", + call = expr + ) + } + + # arrow_eval() may return either an Expression or a list_of + if (is_list_of(filt, "Expression")) { + filt <- Reduce("&", filt) + } + + combined <- if (is.null(combined)) filt else (combined & filt) + } + + out <- set_filters(out, combined, exclude = TRUE) + + if (by$from_by) { + out$group_by_vars <- character() + } + + out + }) +} +filter_out.Dataset <- filter_out.ArrowTabular <- filter_out.RecordBatchReader <- filter_out.arrow_dplyr_query + +set_filters <- function(.data, expressions, exclude = FALSE) { if (length(expressions)) { if (is_list_of(expressions, "Expression")) { # expressions is a list of Expressions. AND them together and set them on .data @@ -70,6 +121,12 @@ set_filters <- function(.data, expressions) { stop("filter expressions must be either an expression or a list of expressions", call. = FALSE) } + if (isTRUE(exclude)) { + # dplyr::filter_out() semantics: drop rows where predicate is TRUE; + # keep rows where predicate is FALSE or NA. + new_filter <- (!new_filter) | is.na(new_filter) + } + if (isTRUE(.data$filtered_rows)) { # TRUE is default (i.e. no filter yet), so we don't need to & with it .data$filtered_rows <- new_filter diff --git a/r/tests/testthat/test-dplyr-filter.R b/r/tests/testthat/test-dplyr-filter.R index d56e25fca32..9bf81b9a4f0 100644 --- a/r/tests/testthat/test-dplyr-filter.R +++ b/r/tests/testthat/test-dplyr-filter.R @@ -498,3 +498,44 @@ test_that("filter() with aggregation expressions errors", { "not supported in filter" ) }) + +test_that("filter_out() basic", { + compare_dplyr_binding( + .input |> + filter_out(chr == "b") |> + select(chr, int, lgl) |> + collect(), + tbl + ) +}) + +test_that("filter_out() keeps NA values in predicate result", { + compare_dplyr_binding( + .input |> + filter_out(lgl) |> + select(chr, int, lgl) |> + collect(), + tbl + ) +}) + +test_that("filter_out() with multiple conditions", { + compare_dplyr_binding( + .input |> + filter_out(dbl > 2, chr %in% c("d", "f")) |> + collect(), + tbl + ) +}) + +test_that("More complex select/filter_out", { + compare_dplyr_binding( + .input |> + filter_out(dbl > 2, chr == "d" | chr == "f") |> + select(chr, int, lgl) |> + filter(int < 5) |> + select(int, chr) |> + collect(), + tbl + ) +}) From d6f4671cbcc1088b6c3704a25c618c9a45f044a3 Mon Sep 17 00:00:00 2001 From: Lorenzo ISELLA Date: Fri, 13 Feb 2026 16:09:20 +0100 Subject: [PATCH 2/3] GH-49257: [R] Support dplyr::filter_out() in Arrow backend --- r/R/arrow-package.R | 1 + r/R/dplyr-filter.R | 124 ++++++++++++++++++++++---------------------- 2 files changed, 64 insertions(+), 61 deletions(-) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index a1167433c93..5a596dffe3c 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -38,6 +38,7 @@ supported_dplyr_methods <- list( select = NULL, filter = NULL, + filter_out = NULL, collect = NULL, summarise = c( "window functions not currently supported;", diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index 5d0af8ef47f..da4fd4bd661 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -17,68 +17,30 @@ # The following S3 methods are registered on load if dplyr is present -filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { - try_arrow_dplyr({ - # TODO something with the .preserve argument - out <- as_adq(.data) - - by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data") - - if (by$from_by) { - out$group_by_vars <- by$names - } - - expanded_filters <- expand_across(out, quos(...)) - if (length(expanded_filters) == 0) { - # Nothing to do - return(as_adq(.data)) - } - - # tidy-eval the filter expressions inside an Arrow data_mask - mask <- arrow_mask(out) - for (expr in expanded_filters) { - filt <- arrow_eval(expr, mask) - if (length(mask$.aggregations)) { - # dplyr lets you filter on e.g. x < mean(x), but we haven't implemented it. - # But we could, the same way it works in mutate() via join, if someone asks. - # Until then, just error. - arrow_not_supported( - .actual_msg = "Expression not supported in filter() in Arrow", - call = expr - ) - } - out <- set_filters(out, filt, exclude = FALSE) - } +apply_filter_impl <- function(.data, ..., .by = NULL, .preserve = FALSE, + exclude = FALSE, verb = c("filter", "filter_out")) { + verb <- match.arg(verb) - if (by$from_by) { - out$group_by_vars <- character() - } + # TODO something with the .preserve argument + out <- as_adq(.data) - out - }) -} -filter.Dataset <- filter.ArrowTabular <- filter.RecordBatchReader <- filter.arrow_dplyr_query - -filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { - try_arrow_dplyr({ - # TODO something with the .preserve argument - out <- as_adq(.data) - - by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data") + by <- compute_by({{ .by }}, out, by_arg = ".by", data_arg = ".data") - if (by$from_by) { - out$group_by_vars <- by$names - } + if (by$from_by) { + out$group_by_vars <- by$names + } - expanded_filters <- expand_across(out, quos(...)) - if (length(expanded_filters) == 0) { - # Nothing to do - return(as_adq(.data)) - } + expanded_filters <- expand_across(out, quos(...)) + if (length(expanded_filters) == 0) { + # Nothing to do + return(as_adq(.data)) + } - # tidy-eval the filter expressions inside an Arrow data_mask - mask <- arrow_mask(out) + # tidy-eval the filter expressions inside an Arrow data_mask + mask <- arrow_mask(out) + if (isTRUE(exclude)) { + # filter_out(): combine all predicates with &, then exclude combined <- NULL for (expr in expanded_filters) { @@ -86,12 +48,11 @@ filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FAL if (length(mask$.aggregations)) { arrow_not_supported( - .actual_msg = "Expression not supported in filter_out() in Arrow", + .actual_msg = sprintf("Expression not supported in %s() in Arrow", verb), call = expr ) } - # arrow_eval() may return either an Expression or a list_of if (is_list_of(filt, "Expression")) { filt <- Reduce("&", filt) } @@ -100,12 +61,53 @@ filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FAL } out <- set_filters(out, combined, exclude = TRUE) + } else { + # filter(): apply each predicate sequentially + for (expr in expanded_filters) { + filt <- arrow_eval(expr, mask) - if (by$from_by) { - out$group_by_vars <- character() + if (length(mask$.aggregations)) { + arrow_not_supported( + .actual_msg = sprintf("Expression not supported in %s() in Arrow", verb), + call = expr + ) + } + + out <- set_filters(out, filt, exclude = FALSE) } + } - out + if (by$from_by) { + out$group_by_vars <- character() + } + + out +} + +filter.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { + try_arrow_dplyr({ + apply_filter_impl( + .data, + ..., + .by = {{ .by }}, + .preserve = .preserve, + exclude = FALSE, + verb = "filter" + ) + }) +} +filter.Dataset <- filter.ArrowTabular <- filter.RecordBatchReader <- filter.arrow_dplyr_query + +filter_out.arrow_dplyr_query <- function(.data, ..., .by = NULL, .preserve = FALSE) { + try_arrow_dplyr({ + apply_filter_impl( + .data, + ..., + .by = {{ .by }}, + .preserve = .preserve, + exclude = TRUE, + verb = "filter_out" + ) }) } filter_out.Dataset <- filter_out.ArrowTabular <- filter_out.RecordBatchReader <- filter_out.arrow_dplyr_query From 7134175a095f93125a9e6869128bc5a2a9a70467 Mon Sep 17 00:00:00 2001 From: Lorenzo Isella Date: Sat, 14 Feb 2026 18:13:45 +0100 Subject: [PATCH 3/3] GH-49257: [R] Run air format --- r/R/dplyr-filter.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/r/R/dplyr-filter.R b/r/R/dplyr-filter.R index da4fd4bd661..0ccb5fb8944 100644 --- a/r/R/dplyr-filter.R +++ b/r/R/dplyr-filter.R @@ -17,8 +17,14 @@ # The following S3 methods are registered on load if dplyr is present -apply_filter_impl <- function(.data, ..., .by = NULL, .preserve = FALSE, - exclude = FALSE, verb = c("filter", "filter_out")) { +apply_filter_impl <- function( + .data, + ..., + .by = NULL, + .preserve = FALSE, + exclude = FALSE, + verb = c("filter", "filter_out") +) { verb <- match.arg(verb) # TODO something with the .preserve argument