Skip to content

Improve the nofuzz system to allow specific exclusions #2832

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: fuzz-assignment
Choose a base branch
from
117 changes: 68 additions & 49 deletions .dev/ast_fuzz_test.R
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ if (
}

contents[wrong_number_def_idx] <-
'wrong_number_fmt <- "got %d lints instead of %d%s\\nFile contents:\\n%s"'
' wrong_number_fmt <- "got %d lints instead of %d%s\\nFile contents:\\n%s"'
contents[wrong_number_use_idx] <-
gsub("\\)$", ", readChar(file, file.size(file)))", contents[wrong_number_use_idx])
writeLines(contents, expect_lint_file)
Expand All @@ -66,61 +66,81 @@ withr::defer({

suppressMessages(pkgload::load_all())

can_parse <- \(lines) !inherits(tryCatch(parse(text = lines), error = identity), "error")
get_str <- \(x) tail(unlist(strsplit(x, ": ", fixed = TRUE)), 1L)

# beware lazy eval: originally tried adding a withr::defer() in each iteration, but
# this effectively only runs the last 'defer' expression as the names are only
# evaluated at run-time. So instead keep track of all edits in this object.
# this approach to implementing 'nofuzz' feels painfully manual, but I couldn't
# figure out how else to get 'testthat' to give us what we need -- the failures
# object in the reporter is frustratingly inconsistent in whether the trace
# exists, and even if it does, we'd have to text-mangle to get the corresponding
# file names out. Also, the trace 'srcref' happens under keep.source=FALSE,
# so we lose any associated comments anyway. even that would not solve the issue
# of getting top-level exclusions done for 'nofuzz start|end' ranges, except
# maybe if it enabled us to reuse lintr's own exclude() system.
# therefore we take this approach: pass over the test suite first and comment out
# any tests/units that have been marked 'nofuzz'. restore later. one consequence
# is there's no support for fuzzer-specific exclusion, e.g. we fully disable
# the unnecessary_placeholder_linter() tests because |> and _ placeholders differ.
# these have to be enabled/disabled at runtime as it's not possible to disentagle which
# fuzzer caused the error ex-post (and it might be the interaction of >1 at issue).
# an earlier approach was like the current 'nofuzz' -- just comment out the troublesome
# tests from being run at all. But that led to a very quickly growing set of tests being
# skipped totally, which also hid some issues that are surfaced by the current approach.
# Another idea would be to just leave the enable/disable calls as code in the test suite,
# but I prefer the current approach of leaving them as comments: (1) it's more consistent
# with the 'nolint' exclusion system and (2) it doesn't distract the casual reader as much.
test_restorations <- list()
for (test_file in list.files("tests/testthat", pattern = "^test-", full.names = TRUE)) {
xml <- read_xml(xmlparsedata::xml_parse_data(parse(test_file, keep.source = TRUE)))
# parent::* to catch top-level comments (exprlist). matches one-line nofuzz and start/end ranges.
nofuzz_lines <- xml_find_all(xml, "//COMMENT[contains(text(), 'nofuzz')]/parent::*")
if (length(nofuzz_lines) == 0L) next

test_original <- test_lines <- readLines(test_file)

for (nofuzz_line in nofuzz_lines) {
comments <- xml_find_all(nofuzz_line, "COMMENT[contains(text(), 'nofuzz')]")
comment_text <- xml_text(comments)
# handle start/end ranges first.
start_idx <- grep("nofuzz start", comment_text, fixed = TRUE)
end_idx <- grep("nofuzz end", comment_text, fixed = TRUE)
if (length(start_idx) != length(end_idx) || any(end_idx < start_idx)) {
stop(sprintf(
"Mismatched '# nofuzz start' (%s), '# nofuzz end' (%s) in %s",
toString(start_idx), toString(end_idx), test_file
))
test_lines <- readLines(test_file)
one_expr_idx <- grep("# nofuzz", test_lines, fixed = TRUE)
range_start_idx <- grep("^\\s*# fuzzer disable:", test_lines)
if (length(one_expr_idx) == 0L && length(range_start_idx) == 0L) next

test_original <- test_lines
pd <- getParseData(parse(test_file))

for (start_line in rev(one_expr_idx)) {
end_line <- start_line
while (end_line <= length(test_lines) && !can_parse(test_lines[start_line:end_line])) {
end_line <- end_line + 1L
}

comment_ranges <- Map(`:`,
as.integer(xml_attr(comments[start_idx], "line1")),
as.integer(xml_attr(comments[end_idx], "line1"))
)
for (comment_range in comment_ranges) {
test_lines[comment_range] <- paste("#", test_lines[comment_range])
if (end_line > length(test_lines)) {
stop("Unable to parse any expression starting from line ", start_line)
}
comment_txt <- subset(pd, line1 == start_line & token == "COMMENT", select = "text", drop = TRUE)
# blanket disable means the test cannot be run. this happens e.g. for tests of encoding
# that are too complicated to deal with in this GHA.
if (comment_txt == "# nofuzz") {
test_lines[start_line:end_line] <- ""
} else {
deactivated <- get_str(comment_txt)
test_lines <- c(
head(test_lines, start_line - 1L),
sprintf("deactivate_fuzzers('%s')", deactivated),
test_lines[start_line:end_line],
sprintf("activate_fuzzers('%s')", deactivated),
tail(test_lines, -end_line)
)
}
}

if (length(start_idx) > 0L && !any(!start_idx & !end_idx)) next
if (length(one_expr_idx)) {
writeLines(test_lines, test_file)
pd <- getParseData(parse(test_file))
range_start_idx <- grep("^\\s*# fuzzer disable:", test_lines)
}

# NB: one-line tests line expect_lint(...) # nofuzz are not supported,
# since the comment will attach to the parent test_that() & thus comment
# out the whole unit. Easiest solution is just to spread out those few tests for now.
comment_range <- as.integer(xml_attr(nofuzz_line, "line1")):as.integer(xml_attr(nofuzz_line, "line2"))
test_lines[comment_range] <- paste("#", test_lines[comment_range])
range_end_idx <- grep("^\\s*# fuzzer enable:", test_lines)

if (length(range_start_idx) != length(range_end_idx) || any(range_end_idx < range_start_idx)) {
stop(sprintf(
"Mismatched '# fuzzer disable' (%s), '# fuzzer enable' (%s) in %s",
toString(range_start_idx), toString(range_end_idx), test_file
))
}

for (ii in seq_along(range_start_idx)) {
start_line <- test_lines[range_start_idx[ii]]
test_lines[range_start_idx[ii]] <-
gsub("#.*", sprintf("deactivate_fuzzers('%s')", get_str(start_line)), start_line)
end_line <- test_lines[range_end_idx[ii]]
test_lines[range_end_idx[ii]] <-
gsub("#.*", sprintf("activate_fuzzers('%s')", get_str(end_line)), end_line)
}

writeLines(test_lines, test_file)
if (length(range_start_idx)) writeLines(test_lines, test_file)

test_restorations <- c(test_restorations, list(list(file = test_file, lines = test_original)))
}
withr::defer(for (restoration in test_restorations) writeLines(restoration$lines, restoration$file))
Expand All @@ -134,8 +154,7 @@ all_classes <- unlist(lapply(
reporter$get_results(),
\(test) lapply(test$results, \(x) class(x)[1L])
))
cat("Summary of test statuses:\n")
print(table(all_classes))
print(table(`Summary of test statuses:` = all_classes))

# ignore any test that failed for expected reasons, e.g. some known lint metadata changes
# about line numbers or the contents of the line. this saves us having to pepper tons of
Expand All @@ -160,7 +179,7 @@ if (length(invalid_failures) > 0L) {
\(x) sprintf("%s:%s", x$file, x$test),
character(1L)
)
cat("Some fuzzed tests failed unexpectedly!\n")
cat(sprintf("%d fuzzed tests failed unexpectedly!\n", length(invalid_failures)))
print(invalid_failures)
stop("Use # nofuzz [start|end] to mark false positives or fix any bugs.")
stop("Fix any bugs, or use '# nofuzz'/'# fuzzer [dis|en]able' to mark false positives.")
}
46 changes: 38 additions & 8 deletions .dev/maybe_fuzz_content.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,7 @@ maybe_fuzz_content <- function(file, lines) {
file.copy(file, new_file, copy.mode = FALSE)
}

apply_fuzzers(new_file, fuzzers = list(
function_lambda_fuzzer,
pipe_fuzzer,
dollar_at_fuzzer,
comment_injection_fuzzer,
assignment_fuzzer
))
apply_fuzzers(new_file, fuzzers = .fuzzers$active)

new_file
}
Expand Down Expand Up @@ -106,7 +100,7 @@ apply_fuzzers <- function(f, fuzzers) {
return(invisible())
}

unedited <- lines <- readLines(f)
unedited <- lines <- readLines(f, warn = FALSE)
for (fuzzer in fuzzers) {
updated_lines <- fuzzer(pd, lines)
if (is.null(updated_lines)) next # skip some I/O if we can
Expand All @@ -122,3 +116,39 @@ apply_fuzzers <- function(f, fuzzers) {

invisible()
}

.fuzzers <- new.env()
.fuzzers$active <- list(
assignment = assignment_fuzzer,
comment_injection = comment_injection_fuzzer,
dollar_at = dollar_at_fuzzer,
function_lambda = function_lambda_fuzzer,
pipe = pipe_fuzzer
)
.fuzzers$inactive <- list()

deactivate_fuzzers <- function(names_str) {
req <- unlist(strsplit(names_str, " ", fixed = TRUE))
if (!all(req %in% names(.fuzzers$active))) {
stop(sprintf(
"Invalid attempt to deactivate fuzzers: '%s'\n Currently active fuzzers: %s\n Currently inactive fuzzers: %s",
names_str, toString(names(.fuzzers$active)), toString(names(.fuzzers$inactive))
))
}
.fuzzers$inactive[req] <- .fuzzers$active[req]
.fuzzers$active[req] <- NULL
invisible()
}

activate_fuzzers <- function(names_str) {
req <- unlist(strsplit(names_str, " ", fixed = TRUE))
if (!all(req %in% names(.fuzzers$inactive))) {
stop(sprintf(
"Invalid attempt to activate fuzzers: '%s'\n Currently active fuzzers: %s\n Currently inactive fuzzers: %s",
names_str, toString(names(.fuzzers$active)), toString(names(.fuzzers$inactive))
))
}
.fuzzers$active[req] <- .fuzzers$inactive[req]
.fuzzers$inactive[req] <- NULL
invisible()
}
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
* `boolean_arithmetic_linter()` finds many more cases like `sum(x | y) == 0` where the total of a known-logical vector is compared to 0 (#1580, @MichaelChirico).
* New argument `include_s4_slots` for the `xml_find_function_calls()` entry in the `get_source_expressions()` to govern whether calls of the form `s4Obj@fun()` are included in the result (#2820, @MichaelChirico).
* General handling of logic around where comments can appear in code has been improved (#2822, @MichaelChirico). In many cases, this is a tiny robustness fix for weird edge cases unlikely to be found in practice, but in others, this improves practical linter precision (reduced false positives and/or false negatives). The affected linters (with annotations for changes noteworthy enough to have gotten a dedicated bug) are:
+ `any_duplicated_linter()`
+ `brace_linter()`
+ `coalesce_linter()`
+ `comparison_negation_linter()` #2826
Expand All @@ -42,6 +43,8 @@
+ `if_switch_linter()`
+ `ifelse_censor_linter()` #2826
+ `implicit_assignment_linter()`
+ `is_numeric_linter()`
+ `keyword_quote_linter()`
+ `length_test_linter()`
+ `literal_coercion_linter()` #2824
+ `matrix_apply_linter()` #2825
Expand Down
69 changes: 36 additions & 33 deletions R/any_duplicated_linter.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,50 +52,53 @@ any_duplicated_linter <- function() {
# the final parent::expr/expr gets us to the expr on the other side of EQ;
# this lets us match on either side of EQ, where following-sibling
# assumes we are before EQ, preceding-sibling assumes we are after EQ.
length_unique_xpath_parts <- glue("
//{ c('EQ', 'NE', 'GT', 'LT') }
/parent::expr
/expr[
expr[1][SYMBOL_FUNCTION_CALL[text() = 'length']]
and expr[expr[1][
SYMBOL_FUNCTION_CALL[text() = 'unique']
and (
following-sibling::expr =
parent::expr
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text()= 'length']]
/following-sibling::expr
or
following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
parent::expr
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text()= 'nrow']]
/following-sibling::expr
)
]]
]
")
length_unique_xpath <- paste(length_unique_xpath_parts, collapse = " | ")
length_comparison_xpath <- "
parent::expr
/parent::expr[expr/SYMBOL_FUNCTION_CALL[text() = 'length']]
/parent::expr[EQ or NE or GT or LT]
"
length_unique_xpath <- "
expr/expr/expr[1][
SYMBOL_FUNCTION_CALL[text() = 'unique']
and (
following-sibling::expr =
parent::expr
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text() = 'length']]
/following-sibling::expr
or
following-sibling::expr[OP-DOLLAR or LBB]/expr[1] =
parent::expr
/parent::expr
/parent::expr
/expr
/expr[1][SYMBOL_FUNCTION_CALL[text() = 'nrow']]
/following-sibling::expr
)
]"

uses_nrow_xpath <- "./parent::expr/expr/expr[1]/SYMBOL_FUNCTION_CALL[text() = 'nrow']"
uses_nrow_xpath <- "./expr/expr[1]/SYMBOL_FUNCTION_CALL[text() = 'nrow']"

Linter(linter_level = "expression", function(source_expression) {
xml <- source_expression$xml_parsed_content
xml_calls <- source_expression$xml_find_function_calls("any")
any_calls <- source_expression$xml_find_function_calls("any")
unique_calls <- source_expression$xml_find_function_calls("unique")

any_duplicated_expr <- xml_find_all(xml_calls, any_duplicated_xpath)
any_duplicated_expr <- xml_find_all(any_calls, any_duplicated_xpath)
any_duplicated_lints <- xml_nodes_to_lints(
any_duplicated_expr,
source_expression = source_expression,
lint_message = "anyDuplicated(x, ...) > 0 is better than any(duplicated(x), ...).",
type = "warning"
)

length_unique_expr <- xml_find_all(xml, length_unique_xpath)
in_length_comparison <- !is.na(xml_find_first(unique_calls, length_comparison_xpath))
unique_calls <- strip_comments_from_subtree(
xml_parent(xml_parent(xml_parent(unique_calls[in_length_comparison])))
)
is_length_unique <- !is.na(xml_find_first(unique_calls, length_unique_xpath))
length_unique_expr <- unique_calls[is_length_unique]
lint_message <- ifelse(
is.na(xml_find_first(length_unique_expr, uses_nrow_xpath)),
"anyDuplicated(x) == 0L is better than length(unique(x)) == length(x).",
Expand Down
18 changes: 8 additions & 10 deletions R/is_numeric_linter.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,12 @@ is_numeric_linter <- function() {

# testing things like is.numeric(x) || is.integer(x)
or_xpath <- glue("
//OR2
/parent::expr[
expr/{is_numeric_expr}
and expr/{is_integer_expr}
and
expr/{is_numeric_expr}/following-sibling::expr[1]
= expr/{is_integer_expr}/following-sibling::expr[1]
]
//OR2/parent::expr[expr/{is_numeric_expr} and expr/{is_integer_expr}]
")
node_match_xpath <- glue("self::*[
expr/{is_numeric_expr}/following-sibling::expr[1]
= expr/{is_integer_expr}/following-sibling::expr[1]
]")

# testing class(x) %in% c("numeric", "integer")
class_xpath <- "
Expand All @@ -69,9 +66,10 @@ is_numeric_linter <- function() {
Linter(linter_level = "expression", function(source_expression) {
xml <- source_expression$xml_parsed_content

or_expr <- xml_find_all(xml, or_xpath)
or_expr <- strip_comments_from_subtree(xml_find_all(xml, or_xpath))
expr_match <- !is.na(xml_find_first(or_expr, node_match_xpath))
or_lints <- xml_nodes_to_lints(
or_expr,
or_expr[expr_match],
source_expression = source_expression,
lint_message = paste(
"Use `is.numeric(x)` instead of the equivalent `is.numeric(x) || is.integer(x)`.",
Expand Down
2 changes: 1 addition & 1 deletion R/keyword_quote_linter.R
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ keyword_quote_linter <- function() {
)

extraction_expr <- extraction_expr[invalid_extraction_quoting]
extractor <- xml_find_chr(extraction_expr, "string(preceding-sibling::*[1])")
extractor <- xml_find_chr(extraction_expr, "string(preceding-sibling::*[not(self::COMMENT)][1])")
gen_extractor <- ifelse(extractor == "$", "[[", "slot()")

extraction_lints <- xml_nodes_to_lints(
Expand Down
Loading