From 171781d9411e06021c80d56c3f8551c6b76eb085 Mon Sep 17 00:00:00 2001
From: Daniel <mail@danielluedecke.de>
Date: Thu, 30 May 2024 14:13:20 +0200
Subject: [PATCH] Check for misspelled colnames in `report_sample()`

---
 DESCRIPTION                         |  3 +-
 NEWS.md                             |  1 +
 R/report_sample.R                   |  6 +++
 R/utils_misspelled_variables.R      | 75 +++++++++++++++++++++++++++++
 tests/testthat/test-report_sample.R |  2 +
 5 files changed, 86 insertions(+), 1 deletion(-)
 create mode 100644 R/utils_misspelled_variables.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 7ad0c624..5fbfc3dd 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: report
 Type: Package
 Title: Automated Reporting of Results and Statistical Models
-Version: 0.5.8.3
+Version: 0.5.8.4
 Authors@R:
     c(person(given = "Dominique",
              family = "Makowski",
@@ -148,6 +148,7 @@ Collate:
     'report_table.R'
     'utils_error_message.R'
     'utils_grouped_df.R'
+    'utils_misspelled_variables.R'
     'zzz.R'
 Roxygen: list(markdown = TRUE)
 Remotes: easystats/insight, easystats/datawizard, easystats/parameters, easystats/performance, easystats/modelbased
diff --git a/NEWS.md b/NEWS.md
index 1d5f9e21..2693e9bf 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -13,6 +13,7 @@ Minor changes
 
 * `report` now supports reporting of Bayesian model comparison with variables of class `brms::loo_compare`.
 * `report` now supports reporting of BayesFactor objects with variables of class `BFBayesFactor`.
+* `report_sample()` now suggests valid column names for misspelled columns in the `select`, `by`, `weights` and `exclude` arguments.
 
 # report 0.5.8
 
diff --git a/R/report_sample.R b/R/report_sample.R
index 41d6a8f8..83b95115 100644
--- a/R/report_sample.R
+++ b/R/report_sample.R
@@ -114,6 +114,12 @@ report_sample <- function(data,
     select <- colnames(data)[select]
   }
 
+  # sanity check for existing columns
+  .check_spelling(data, select)
+  .check_spelling(data, exclude)
+  .check_spelling(data, by)
+  .check_spelling(data, weights)
+
   # variables to keep
   if (!is.null(weights)) {
     select <- unique(c(select, weights))
diff --git a/R/utils_misspelled_variables.R b/R/utils_misspelled_variables.R
new file mode 100644
index 00000000..6537cf85
--- /dev/null
+++ b/R/utils_misspelled_variables.R
@@ -0,0 +1,75 @@
+# Errors when `select` names columns missing from `data`, suggesting close matches.
+# Uses `all(nzchar())`: `isTRUE(nzchar())` is FALSE for any multi-name input.
+.check_spelling <- function(data, select) {
+  wrong_arg <- paste0("specified in `", deparse(substitute(select)), "` ")
+  if (!is.null(select) && all(nzchar(select)) && !all(select %in% colnames(data))) {
+    not_found <- select[!select %in% colnames(data)]
+    insight::format_error(
+      paste0(
+        sprintf("The following column(s) %sdon't exist in the dataset: ", wrong_arg),
+        datawizard::text_concatenate(not_found), "."
+      ),
+      .misspelled_string(colnames(data), not_found, "Possibly misspelled?")
+    )
+  }
+}
+
+
+#' Approximate grep: indices of elements of `x` that nearly match `pattern`
+#' @examples
+#' colnames(iris)
+#' p <- sprintf("(%s){~%i}", "Spela", 2)
+#' grep(pattern = p, x = colnames(iris), ignore.case = FALSE)
+#' @keywords internal
+#' @noRd
+.fuzzy_grep <- function(x, pattern, precision = NULL) {
+  n_chars <- nchar(pattern)
+  # default tolerance: a third of the pattern length, rounded
+  max_cost <- if (is.null(precision)) round(n_chars / 3) else precision
+  if (max_cost > n_chars) {
+    # tolerance exceeds the pattern length: nothing is returned
+    return(NULL)
+  }
+  grep(pattern = sprintf("(%s){~%i}", pattern, max_cost), x = x, ignore.case = FALSE)
+}
+
+
+#' Build a "Did you mean ...?" hint for misspelled names
+#'
+#' Lists entries of `source` that fuzzily match any element of `searchterm`
+#' (at most five); yields `default_message` if `searchterm` is empty or nothing matches.
+#' @keywords internal
+#' @noRd
+.misspelled_string <- function(source, searchterm, default_message = NULL) {
+  if (is.null(searchterm) || length(searchterm) < 1) {
+    return(default_message)
+  }
+  # used for many matches
+  more_found <- ""
+  # guess the misspelled string; `unique()` because several search terms can
+  # match the same candidate, which would otherwise be suggested repeatedly
+  possible_strings <- unique(unlist(lapply(searchterm, function(s) {
+    source[.fuzzy_grep(source, s)] # nolint
+  }), use.names = FALSE))
+  if (length(possible_strings) > 0) {
+    msg <- "Did you mean "
+    if (length(possible_strings) > 1) {
+      # make sure we don't print dozens of alternatives for larger data frames
+      if (length(possible_strings) > 5) {
+        more_found <- sprintf(
+          " We even found %i more possible matches, not shown here.",
+          length(possible_strings) - 5
+        )
+        possible_strings <- possible_strings[1:5]
+      }
+      msg <- paste0(msg, "one of ", datawizard::text_concatenate(possible_strings, enclose = "\"", last = " or "))
+    } else {
+      msg <- paste0(msg, "\"", possible_strings, "\"")
+    }
+    msg <- paste0(msg, "?", more_found)
+  } else {
+    msg <- default_message
+  }
+  # no double white space
+  insight::trim_ws(msg)
+}
diff --git a/tests/testthat/test-report_sample.R b/tests/testthat/test-report_sample.R
index a012f969..c3d99e35 100644
--- a/tests/testthat/test-report_sample.R
+++ b/tests/testthat/test-report_sample.R
@@ -51,6 +51,8 @@ test_that("report_sample check input", {
   data(iris)
   expect_error(report_sample(lm(Sepal.Length ~ Species, data = iris)))
   expect_silent(report_sample(iris$Species))
+  expect_error(report_sample(iris, by = "Spedies"), regexp = "The following column")
+  expect_error(report_sample(iris, select = "Spedies"), regexp = "The following column")
 })
 
 test_that("report_sample default", {