898: detect initial accumulation (#933)

* detect initial accumulation * update docs * add test * add news item * use inform not warn * update test * insert PR number * rephrase news * improve variable names as suggested by @jamesmbaazam Co-authored-by: James Azam <[email protected]> * add i * missed one Co-authored-by: James Azam <[email protected]> * break line * fix tests --------- Co-authored-by: Sam Abbott <[email protected]> Co-authored-by: James Azam <[email protected]>
epiforecasts · Jan 31, 2025 · 844b8f9 · 844b8f9
1 parent 73a4a88
commit 844b8f9
Show file tree

Hide file tree

Showing 4 changed files with 44 additions and 18 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -2,7 +2,7 @@
 
 ## Model changes
 
-- The models now support more complex patterns of aggregating reported cases by accumulating them over multiple time points, as well as mixtures of accumulation and missingness via the new `fill_missing()` function and a logical `accumulate` column that can be supplied with the data. By @sbfnk in #851 and reviewed by @seabbs and @jamesmbaazam..
+- The models now support more complex patterns of aggregating reported cases by accumulating them over multiple time points, as well as mixtures of accumulation and missingness via the new `fill_missing()` function and a logical `accumulate` column that can be supplied with the data. If the accumulation frequency is fixed in the data this is detected when using `fill_missing()`.  By @sbfnk in #851 and #933 and reviewed by @seabbs and @jamesmbaazam.
 
   ```r
   # Deprecated

diff --git a/R/preprocessing.R b/R/preprocessing.R
@@ -36,7 +36,10 @@
 ##'   have a sufficient number of modelled observations accumulated onto the
 ##'   first data point. For modelling weekly incidence data this should be set
 ##'   to 7. If accumulating and the first data point is not NA and this is
-##'   argument is not set, then that data point will be removed with a warning.
+##'   argument is not set, then if all dates in the data have the same gap this
+##'   will be taken as initial accumulation and a warning given to inform the
+##'   user. If not all gaps are the same the first data point will be removed
+##'   with a warning.
 ##' @param obs_column Character (default: "confirm"). If given, only the column
 ##'   specified here will be used for checking missingness. This is useful if
 ##'   using a data set that has multiple columns of hwich one of them
@@ -74,31 +77,46 @@ fill_missing <- function(data,
   assert_character(missing_obs)
   assert_character(obs_column)
   assert_character(by, null.ok = TRUE)
-  if (!missing(initial_accumulate)) {
-    assert_integerish(initial_accumulate, lower = 1)
-  }
   assert_names(
     colnames(data),
     must.include = c("date", by, obs_column),
     disjunct.from = "accumulate"
   )
   assert_date(data$date, any.missing = FALSE)
 
-  data <- as.data.table(data)
-
   missing_dates <- arg_match(missing_dates)
   missing_obs <- arg_match(missing_obs)
 
+  data <- as.data.table(data)
+
+  if (missing(initial_accumulate)) {
+    ## detect frequency of accumulation if possible
+    missing_date_patterns <- data[, list(pattern = unique(diff(date))), by = by]
+    unique_patterns <- unique(missing_date_patterns$pattern)
+    if (length(unique_patterns) == 1 && unique_patterns > 1) {
+      cli_inform(
+        c(
+          "!" =
+            "Detected fixed accumulation frequency of {unique_patterns}.
+            This will be used for initial accumulation. Use
+            {.var initial_accumulate} to change this behaviour. To silence this
+            warning, set {.var initial_accumulate} to {unique_patterns}."
+        )
+      )
+      initial_accumulate <- unique_patterns
+    }
+  } else {
+    assert_integerish(initial_accumulate, lower = 1)
+  }
+
   ## first, processing missing dates
   initial_add <- ifelse(missing(initial_accumulate), 1, initial_accumulate)
 
   cols <- list(
     date = seq(min(data$date) - initial_add + 1, max(data$date), by = "day")
   )
-  if (!is.null(by))  {
-    for (by_col in by) {
-      cols[[by_col]] <- unique(data[[by_col]])
-    }
+  for (by_col in by) {
+    cols[[by_col]] <- unique(data[[by_col]])
   }
 
   complete <- do.call(CJ, cols)

diff --git a/man/fill_missing.Rd b/man/fill_missing.Rd
diff --git a/tests/testthat/test-preprocessing.R b/tests/testthat/test-preprocessing.R
@@ -28,21 +28,26 @@ test_that("fill_missing works with by columns", {
 })
 
 test_that("fill_missing warns about initial data points", {
-  expect_warning(
+  expect_message(
     fill_missing(cases, missing_dates = "accumulate"),
+    "Detected fixed accumulation frequency"
+  )
+  shifted <- copy(cases)
+  shifted[1, date := date + 1]
+  expect_warning(
+    fill_missing(shifted, missing_dates = "accumulate"),
     "Initial data point not marked as accumulated"
   )
 })
 
 test_that("add_horizon works", {
-  expect_warning(
-    fill_missing(cases, missing_dates = "accumulate"),
-    "Initial data point not marked as accumulated"
-  )
+  expect_equal(nrow(add_horizon(cases, horizon = 7L)), nrow(cases) + 7L)
 })
 
 test_that("add_horizon identifies gaps correctly", {
-  filled <- fill_missing(cases, missing_dates = "accumulate", initial_accumulate = 7)
+  filled <- fill_missing(
+    cases, missing_dates = "accumulate", initial_accumulate = 7
+  )
   expect_message(
     result <- add_horizon(filled, horizon = 7),
     "Forecasts accumulated every 7 days"