diff --git a/NEWS.md b/NEWS.md index 89fb9f405..5412986c5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -39,6 +39,7 @@ * `boolean_arithmetic_linter()` finds many more cases like `sum(x | y) == 0` where the total of a known-logical vector is compared to 0 (#1580, @MichaelChirico). * `expect_lint()` has a new argument `ignore_order` (default `FALSE`), which, if `TRUE`, allows the `checks=` to be provided in arbitary order vs. how `lint()` produces them (@MichaelChirico). * `undesirable_function_linter()` accepts unnamed entries, treating them as undesirable functions, e.g. `undesirable_function_linter("sum")` (#2536, @MichaelChirico). +* `any_duplicated_linter()` is extended to recognize some usages from {dplyr} and {data.table} that could be replaced by `anyDuplicated()`, e.g. `n_distinct(col) == n()` or `uniqueN(col) == .N` (#2482, @MichaelChirico). ### New linters diff --git a/R/any_duplicated_linter.R b/R/any_duplicated_linter.R index bada723cb..d0883c1b9 100644 --- a/R/any_duplicated_linter.R +++ b/R/any_duplicated_linter.R @@ -35,7 +35,7 @@ #' @export any_duplicated_linter <- function() { any_duplicated_xpath <- " - following-sibling::expr[1][expr[1][SYMBOL_FUNCTION_CALL[text() = 'duplicated']]] + following-sibling::expr[1][expr[1]/SYMBOL_FUNCTION_CALL[text() = 'duplicated']] /parent::expr[ count(expr) = 2 or (count(expr) = 3 and SYMBOL_SUB[text() = 'na.rm']) @@ -56,32 +56,70 @@ any_duplicated_linter <- function() { //{ c('EQ', 'NE', 'GT', 'LT') } /parent::expr /expr[ - expr[1][SYMBOL_FUNCTION_CALL[text() = 'length']] - and expr[expr[1][ + expr[1]/SYMBOL_FUNCTION_CALL[text() = 'length'] + and expr/expr[1][ SYMBOL_FUNCTION_CALL[text() = 'unique'] and ( following-sibling::expr = parent::expr - /parent::expr - /parent::expr - /expr - /expr[1][SYMBOL_FUNCTION_CALL[text()= 'length']] - /following-sibling::expr - or - following-sibling::expr[OP-DOLLAR or LBB]/expr[1] = + /parent::expr + /parent::expr + /expr + /expr[1][SYMBOL_FUNCTION_CALL[text() = 'length']] + /following-sibling::expr + or following-sibling::expr[OP-DOLLAR or LBB]/expr[1] = parent::expr + /parent::expr + /parent::expr + /expr + /expr[1][SYMBOL_FUNCTION_CALL[text() = 'nrow']] + /following-sibling::expr + or parent::expr /parent::expr /parent::expr - /expr - /expr[1][SYMBOL_FUNCTION_CALL[text()= 'nrow']] - /following-sibling::expr + /expr[ + SYMBOL[text() = '.N'] + or (expr/SYMBOL_FUNCTION_CALL[text() = 'n'] and count(expr) = 1) + ] ) - ]] + ] ] ") length_unique_xpath <- paste(length_unique_xpath_parts, collapse = " | ") + distinct_xpath <- glue(" + //{ c('EQ', 'NE', 'GT', 'LT') } + /parent::expr + /expr[ + expr[1][ + SYMBOL_FUNCTION_CALL[text() = 'uniqueN' or text() = 'n_distinct'] + and ( + following-sibling::expr = + parent::expr + /parent::expr + /expr + /expr[1][SYMBOL_FUNCTION_CALL[text() = 'length' or text() = 'nrow']] + /following-sibling::expr + or following-sibling::expr[OP-DOLLAR or LBB]/expr[1] = + parent::expr + /parent::expr + /expr + /expr[1][SYMBOL_FUNCTION_CALL[text() = 'nrow']] + /following-sibling::expr + or parent::expr + /parent::expr + /expr[ + SYMBOL[text() = '.N'] + or (expr/SYMBOL_FUNCTION_CALL[text() = 'n'] and count(expr) = 1) + ] + ) + ] + ] + ") + uses_nrow_xpath <- "./parent::expr/expr/expr[1]/SYMBOL_FUNCTION_CALL[text() = 'nrow']" + uses_dtn_xpath <- "./parent::expr/expr/SYMBOL[text() = '.N']" + uses_dplyr_xpath <- "./parent::expr/expr/expr[1]/SYMBOL_FUNCTION_CALL[text() = 'n']" Linter(linter_level = "expression", function(source_expression) { xml <- source_expression$xml_parsed_content @@ -96,18 +134,39 @@ any_duplicated_linter <- function() { ) length_unique_expr <- xml_find_all(xml, length_unique_xpath) - lint_message <- ifelse( - is.na(xml_find_first(length_unique_expr, uses_nrow_xpath)), - "anyDuplicated(x) == 0L is better than length(unique(x)) == length(x).", + length_unique_lint_message <- character(length(length_unique_expr)) + length_unique_lint_message[] <- "anyDuplicated(x) == 0L is better than length(unique(x)) == length(x)." + length_unique_lint_message[!is.na(xml_find_first(length_unique_expr, uses_nrow_xpath))] <- "anyDuplicated(DF$col) == 0L is better than length(unique(DF$col)) == nrow(DF)" - ) + length_unique_lint_message[!is.na(xml_find_first(length_unique_expr, uses_dtn_xpath))] <- + "anyDuplicated(x) == 0L is better than length(unique(x)) == .N" + length_unique_lint_message[!is.na(xml_find_first(length_unique_expr, uses_dplyr_xpath))] <- + "anyDuplicated(x) == 0L is better than length(unique(x)) == n()." length_unique_lints <- xml_nodes_to_lints( length_unique_expr, source_expression = source_expression, - lint_message = lint_message, + lint_message = length_unique_lint_message, + type = "warning" + ) + + distinct_expr <- xml_find_all(xml, distinct_xpath) + distinct_lint_message_fmt <- character(length(distinct_expr)) + distinct_lint_message_fmt[] <- "anyDuplicated(x) == 0L is better than %s(x) == length(x)." + distinct_lint_message_fmt[!is.na(xml_find_first(distinct_expr, uses_nrow_xpath))] <- + "anyDuplicated(DF$col) == 0L is better than %s(DF$col) == nrow(DF)" + distinct_lint_message_fmt[!is.na(xml_find_first(distinct_expr, uses_dtn_xpath))] <- + "anyDuplicated(x) == 0L is better than %s(x) == .N" + distinct_lint_message_fmt[!is.na(xml_find_first(distinct_expr, uses_dplyr_xpath))] <- + "anyDuplicated(x) == 0L is better than %s(x) == n()." + + distinct_lint_message <- sprintf(distinct_lint_message_fmt, xp_call_name(distinct_expr)) + distinct_lints <- xml_nodes_to_lints( + distinct_expr, + source_expression = source_expression, + lint_message = distinct_lint_message, type = "warning" ) - c(any_duplicated_lints, length_unique_lints) + c(any_duplicated_lints, length_unique_lints, distinct_lints) }) } diff --git a/tests/testthat/test-any_duplicated_linter.R b/tests/testthat/test-any_duplicated_linter.R index 22100b1cf..bcf6fb84a 100644 --- a/tests/testthat/test-any_duplicated_linter.R +++ b/tests/testthat/test-any_duplicated_linter.R @@ -1,12 +1,12 @@ test_that("any_duplicated_linter skips allowed usages", { linter <- any_duplicated_linter() - expect_lint("x <- any(y)", NULL, linter) - expect_lint("y <- duplicated(z)", NULL, linter) + expect_no_lint("x <- any(y)", linter) + expect_no_lint("y <- duplicated(z)", linter) # extended usage of any is not covered - expect_lint("any(duplicated(y), b)", NULL, linter) - expect_lint("any(b, duplicated(y))", NULL, linter) + expect_no_lint("any(duplicated(y), b)", linter) + expect_no_lint("any(b, duplicated(y))", linter) }) test_that("any_duplicated_linter blocks simple disallowed usages", { @@ -28,10 +28,10 @@ test_that("any_duplicated_linter catches length(unique()) equivalencies too", { # non-matches ## different variable - expect_lint("length(unique(x)) == length(y)", NULL, linter) + expect_no_lint("length(unique(x)) == length(y)", linter) ## different table - expect_lint("length(unique(DF$x)) == nrow(DT)", NULL, linter) - expect_lint("length(unique(l1$DF$x)) == nrow(l2$DF)", NULL, linter) + expect_no_lint("length(unique(DF$x)) == nrow(DT)", linter) + expect_no_lint("length(unique(l1$DF$x)) == nrow(l2$DF)", linter) # lintable usage expect_lint("length(unique(x)) == length(x)", lint_msg_x, linter) @@ -51,6 +51,27 @@ test_that("any_duplicated_linter catches length(unique()) equivalencies too", { expect_lint("length(x) > length(unique(x))", lint_msg_x, linter) }) +test_that("dplyr & data.table equivalents are also linted", { + linter <- any_duplicated_linter() + + expect_no_lint("uniqueN(x) == nrow(y)", linter) + expect_no_lint("n_distinct(x) == nrow(y)", linter) + expect_no_lint("x[, length(unique(col)) == .N()]", linter) + # some other n function, not dplyr::n + expect_no_lint("x %>% summarize(length(unique(col)) == n(2))", linter) + expect_no_lint("x %>% summarize(length(unique(col)) == n)", linter) + + expect_lint("uniqueN(x) == nrow(x)", rex::rex("uniqueN(DF$col) == nrow(DF)"), linter) + expect_lint("data.table::uniqueN(x) == nrow(x)", rex::rex("uniqueN(DF$col) == nrow(DF)"), linter) + expect_lint("x[, length(unique(col)) == .N]", rex::rex("length(unique(x)) == .N"), linter) + expect_lint("x[, uniqueN(col) == .N]", rex::rex("uniqueN(x) == .N"), linter) + + expect_lint("n_distinct(x) == nrow(x)", rex::rex("n_distinct(DF$col) == nrow(DF)"), linter) + expect_lint("dplyr::n_distinct(x) == nrow(x)", rex::rex("n_distinct(DF$col) == nrow(DF)"), linter) + expect_lint("x %>% summarize(length(unique(col)) == n())", rex::rex("length(unique(x)) == n()"), linter) + expect_lint("x %>% summarize(n_distinct(col) == n())", rex::rex("n_distinct(x) == n()"), linter) +}) + test_that("any_duplicated_linter catches expression with two types of lint", { linter <- any_duplicated_linter() lint_msg <- rex::rex("anyDuplicated(DF$col) == 0L is better than length(unique(DF$col)) == nrow(DF)")