From 844b8f98d05717a69c84b986f79f43789e59dc22 Mon Sep 17 00:00:00 2001 From: Sebastian Funk Date: Fri, 31 Jan 2025 11:03:44 +0000 Subject: [PATCH] 898: detect initial accumulation (#933) * detect initial accumulation * update docs * add test * add news item * use inform not warn * update test * insert PR number * rephrase news * improve variable names as suggested by @jamesmbaazam Co-authored-by: James Azam * add i * missed one Co-authored-by: James Azam * break line * fix tests --------- Co-authored-by: Sam Abbott Co-authored-by: James Azam --- NEWS.md | 2 +- R/preprocessing.R | 38 +++++++++++++++++++++-------- man/fill_missing.Rd | 5 +++- tests/testthat/test-preprocessing.R | 17 ++++++++----- 4 files changed, 44 insertions(+), 18 deletions(-) diff --git a/NEWS.md b/NEWS.md index 89e427909..065801527 100644 --- a/NEWS.md +++ b/NEWS.md @@ -2,7 +2,7 @@ ## Model changes -- The models now support more complex patterns of aggregating reported cases by accumulating them over multiple time points, as well as mixtures of accumulation and missingness via the new `fill_missing()` function and a logical `accumulate` column that can be supplied with the data. By @sbfnk in #851 and reviewed by @seabbs and @jamesmbaazam.. +- The models now support more complex patterns of aggregating reported cases by accumulating them over multiple time points, as well as mixtures of accumulation and missingness via the new `fill_missing()` function and a logical `accumulate` column that can be supplied with the data. If the accumulation frequency is fixed in the data this is detected when using `fill_missing()`. By @sbfnk in #851 and #933 and reviewed by @seabbs and @jamesmbaazam. ```r # Deprecated diff --git a/R/preprocessing.R b/R/preprocessing.R index 6056f8022..10302611f 100644 --- a/R/preprocessing.R +++ b/R/preprocessing.R @@ -36,7 +36,10 @@ ##' have a sufficient number of modelled observations accumulated onto the ##' first data point. For modelling weekly incidence data this should be set ##' to 7. If accumulating and the first data point is not NA and this is -##' argument is not set, then that data point will be removed with a warning. +##' argument is not set, then if all dates in the data have the same gap this +##' will be taken as initial accumulation and a warning given to inform the +##' user. If not all gaps are the same the first data point will be removed +##' with a warning. ##' @param obs_column Character (default: "confirm"). If given, only the column ##' specified here will be used for checking missingness. This is useful if ##' using a data set that has multiple columns of hwich one of them @@ -74,9 +77,6 @@ fill_missing <- function(data, assert_character(missing_obs) assert_character(obs_column) assert_character(by, null.ok = TRUE) - if (!missing(initial_accumulate)) { - assert_integerish(initial_accumulate, lower = 1) - } assert_names( colnames(data), must.include = c("date", by, obs_column), @@ -84,21 +84,39 @@ fill_missing <- function(data, ) assert_date(data$date, any.missing = FALSE) - data <- as.data.table(data) - missing_dates <- arg_match(missing_dates) missing_obs <- arg_match(missing_obs) + data <- as.data.table(data) + + if (missing(initial_accumulate)) { + ## detect frequency of accumulation if possible + missing_date_patterns <- data[, list(pattern = unique(diff(date))), by = by] + unique_patterns <- unique(missing_date_patterns$pattern) + if (length(unique_patterns) == 1 && unique_patterns > 1) { + cli_inform( + c( + "!" = + "Detected fixed accumulation frequency of {unique_patterns}. + This will be used for initial accumulation. Use + {.var initial_accumulate} to change this behaviour. To silence this + warning, set {.var initial_accumulate} to {unique_patterns}." + ) + ) + initial_accumulate <- unique_patterns + } + } else { + assert_integerish(initial_accumulate, lower = 1) + } + ## first, processing missing dates initial_add <- ifelse(missing(initial_accumulate), 1, initial_accumulate) cols <- list( date = seq(min(data$date) - initial_add + 1, max(data$date), by = "day") ) - if (!is.null(by)) { - for (by_col in by) { - cols[[by_col]] <- unique(data[[by_col]]) - } + for (by_col in by) { + cols[[by_col]] <- unique(data[[by_col]]) } complete <- do.call(CJ, cols) diff --git a/man/fill_missing.Rd b/man/fill_missing.Rd index 9e37e0b02..ac9a2582d 100644 --- a/man/fill_missing.Rd +++ b/man/fill_missing.Rd @@ -44,7 +44,10 @@ then dates are added to the beginning of the data set to get be able to have a sufficient number of modelled observations accumulated onto the first data point. For modelling weekly incidence data this should be set to 7. If accumulating and the first data point is not NA and this is -argument is not set, then that data point will be removed with a warning.} +argument is not set, then if all dates in the data have the same gap this +will be taken as initial accumulation and a warning given to inform the +user. If not all gaps are the same the first data point will be removed +with a warning.} \item{obs_column}{Character (default: "confirm"). If given, only the column specified here will be used for checking missingness. This is useful if diff --git a/tests/testthat/test-preprocessing.R b/tests/testthat/test-preprocessing.R index 667ae5f6f..00d0e5da6 100644 --- a/tests/testthat/test-preprocessing.R +++ b/tests/testthat/test-preprocessing.R @@ -28,21 +28,26 @@ test_that("fill_missing works with by columns", { }) test_that("fill_missing warns about initial data points", { - expect_warning( + expect_message( fill_missing(cases, missing_dates = "accumulate"), + "Detected fixed accumulation frequency" + ) + shifted <- copy(cases) + shifted[1, date := date + 1] + expect_warning( + fill_missing(shifted, missing_dates = "accumulate"), "Initial data point not marked as accumulated" ) }) test_that("add_horizon works", { - expect_warning( - fill_missing(cases, missing_dates = "accumulate"), - "Initial data point not marked as accumulated" - ) + expect_equal(nrow(add_horizon(cases, horizon = 7L)), nrow(cases) + 7L) }) test_that("add_horizon identifies gaps correctly", { - filled <- fill_missing(cases, missing_dates = "accumulate", initial_accumulate = 7) + filled <- fill_missing( + cases, missing_dates = "accumulate", initial_accumulate = 7 + ) expect_message( result <- add_horizon(filled, horizon = 7), "Forecasts accumulated every 7 days"