Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update derived categories via expression #476

Draft
wants to merge 18 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ Collate:
'auth.R'
'batches.R'
'case-variables.R'
'case-when-variable.R'
'categories.R'
'category.R'
'change-category-id.R'
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ export(addSubvariable)
export(addSubvariables)
export(addSummaryStat)
export(addVariables)
export(alterArrayExpr)
export(alterCategoriesExpr)
export(analyses)
export(analysis)
Expand All @@ -119,6 +120,7 @@ export(availableGeodataFeatures)
export(batches)
export(bin)
export(caseExpr)
export(caseWhenExpr)
export(categoriesFromLevels)
export(cd)
export(changeCategoryID)
Expand Down Expand Up @@ -230,6 +232,7 @@ export(logout)
export(makeArray)
export(makeArrayGadget)
export(makeCaseVariable)
export(makeCaseWhenVariable)
export(makeFrame)
export(makeMR)
export(makeMRFromText)
Expand Down
22 changes: 21 additions & 1 deletion R/AllClasses.R
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,27 @@ setClass("Categories", contains = "AbstractCategories")

#' @rdname Categories
#' @export
Categories <- GenericConstructor("Categories")
Categories <- function(..., data = NULL) {
# Fill in ids if missing
if (is.null(data)) data <- list(...)

# use try because we haven't validated that they're category-like yet
used_ids <- try(vapply(data, function(x) x$id %||% NA, numeric(1)), silent = TRUE)
if (!inherits(used_ids, "try-error") && any(is.na(used_ids))) {
all_ids <- used_ids
all_ids[is.na(used_ids)] <- setdiff(
seq_along(data),
used_ids
)[seq_len(sum(is.na(used_ids)))]

data <- mapply(function(cat, used_id, all_id) {
if (is.na(used_id)) cat$id <- all_id
cat
}, data, used_ids, all_ids, SIMPLIFY = FALSE)
}

new("Categories", data)
}

#' @rdname Categories
#' @export
Expand Down
2 changes: 1 addition & 1 deletion R/R-to-variable.R
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ setGeneric("toVariable", function(x, ...) standardGeneric("toVariable"))

#' @rdname toVariable
#' @export
setMethod("toVariable", "CrunchExpr", function(x, ...) {
setMethod("toVariable", "CrunchVarOrExpr", function(x, ...) {
structure(list(derivation = zcl(x), ...), class = "VariableDefinition")
})

Expand Down
162 changes: 162 additions & 0 deletions R/case-when-variable.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#' Create a variable from categorical variables or categories based on conditions
#'
#' Conditions are specified using a series of formulas: the left-hand side is
#' the condition that must be true (a `CrunchLogicalExpr`) and the right-hand
#' side is where to get the value if the condition on the left-hand side is
#' true. This must be either a Crunch Categorical variable or a Category.
#'
#' @param ... formulas where the left hand side is a `CrunchLogicalExpression` (or `TRUE`
#' to indicate the "else" case that will be met if all the other expression are
#' not met) and the right hand side is a CrunchVariable that should be filled in,
#' a `Category` object describing the Category it should be used, a string
#' which will be the name of the `Category` or `NA` to indicate that it should
#' be replaced with the system missing value. For `makeCaseWhenVariable()`
#' non-formula arguments will be passed to `[VarDef()]`
#' @param data A CrunchDataset to use if variable aliases are left bare in the
#' formulas.
#' @param cases A list of formulas that match the description in `...` or a list of
#' lists with named items, "expression" (like the left-hand side of the formulas above),
#' "fill" for a variable to fill in, or "name", "id", and other items that describe a
#' category.
#' @param name For `makeCaseWhenVariable()` the name of the variable to create.
#'
#' @return `makeCaseWhenVariable()` returns a `VariableDefinition` and
#' `caseWhenExpr()` returns an expression
#' @export
#' @examples
#' \dontrun{
#' ds$new_var <- makeCaseWhenVariable(
#' ds$x %in% c("a", "b") ~ ds$y, # can fill with a variable
#' ds$x %in% c("c", "d") ~ Category(name = "c or d", numeric_value = 10), # or a Category
#' # If none of the categories match, will be set to missing unless you
#' # specify an "else" case with `TRUE` in the left hand side
#' TRUE ~ Category(name = "catch all"),
#' name = "combined x and y"
#' )
#'
#' ds$brand_x_pref <- makeCaseWhenVariable(
#' ds$brand[[1]] == "Brand X" ~ ds$pref[[1]],
#' ds$brand[[2]] == "Brand X" ~ ds$pref[[2]],
#' ds$brand[[3]] == "Brand X" ~ ds$pref[[3]]
#' name = "brand x preference"
#' )
#'
#' ds$rebased_x <- makeCaseWhenVariable(
#' ds$skipped_x != "Yes" ~ ds$x,
#' name = "rebased x"
#' )
#'
#' # caseWhenExpr can be used inside other expressions
#' ds$brand_x_prefer_high <- VarDef(
#' selectCategories(
#' caseWhenExpr(
#' ds$brand[[1]] == "Brand X" ~ ds$pref[[1]],
#' ds$brand[[2]] == "Brand X" ~ ds$pref[[2]],
#' ds$brand[[3]] == "Brand X" ~ ds$pref[[3]]
#' ),
#' c("Best", "Very Good")
#' ),
#' name = "brand x preference selected"
#' )
#'
#' # Using lists in `cases` argument can be helpful when working programmatically
#' fill_var <- ds$x
#' fill_condition <- ds$skipped_x != "Yes"
#'
#' ds$rebased_x2 <- makeCaseWhenVariable(
#' cases = list(list(fill = fill_var, expression = fill_condition)),
#' name = "rebased x 2"
#' )
#' }
makeCaseWhenVariable <- function(..., data = NULL, cases = NULL, name) {
dots <- list(...)
formula_dots <- vapply(dots, function(x) inherits(x, "formula"), logical(1))

args <- list(
data = caseWhenExpr(data = data, cases = c(cases, unname(dots[formula_dots]))),
name = name
)
args <- c(args, dots[!formula_dots])

do.call(VarDef, args)
}

#' @export
#' @rdname makeCaseWhenVariable
caseWhenExpr <- function(..., data = NULL, cases = NULL) {
cases <- unname(c(cases, list(...)))
case_fills <- lapply(cases, parse_case_when_formula, data = data)

# Get set of unique IDs that fill in for when IDs are missing
used_ids <- vapply(case_fills, function(x) x$id %||% NA, numeric(1))
case_ids <- used_ids
case_ids[is.na(used_ids)] <- setdiff(
seq_along(case_fills),
used_ids
)[seq_len(sum(is.na(used_ids)))]

cases <- mapply(function(case_fill, case_id) {
# Make a temporary cases for expressions that will be filled in
if ("fill" %in% names(case_fill)) {
list(
expression = case_fill$expression,
id = as.integer(case_id),
name = paste0("casefill__internal", case_id)
)
} else {
case_fill
}
}, case_fills, case_ids, SIMPLIFY = FALSE)

need_fills <- vapply(case_fills, function(x) "fill" %in% names(x), logical(1))

if (!any(need_fills)) return(caseExpr(cases = cases))

fills <- lapply(which(need_fills), function(cf_num) {
case_fill <- case_fills[[cf_num]]
list(fill = case_fill$fill, id = case_ids[cf_num])
})

fillExpr(caseExpr(cases = cases), fills = fills)
}

parse_case_when_formula <- function(formula, data) {
if (is.list(formula)) {
if (identical(formula$expression, TRUE)) formula$expression <- "else"
return(formula)
}

if (length(formula) != 3) {
halt(
"The condition provided must be a proper formula: ",
deparseAndFlatten(formula)
)
}

expr <- evalLHS(formula, data)
if (!inherits(expr, c("logical", "CrunchLogicalExpr"))) {
halt(
"The left-hand side provided must be a logical or a ",
"CrunchLogicalExpr: ", dQuote(LHS_string(formula))
)
}
if (identical(expr, TRUE)) expr <- "else"

rhs <- evalRHS(formula, data)
if (is.variable(rhs)) {
rhs <- list(fill = rhs)
} else if (inherits(rhs, "Category")) {
rhs <- lapply(rhs, identity)
} else if (is.character(rhs)) {
rhs <- list(name = rhs)
} else if (is.na(rhs)) {
list(name = "No Data", missing = TRUE)
} else {
halt(
"The right-hand side provided must be a Category, CrunchVariable ",
"string, or `NA`: ", dQuote(RHS_string(formula))
)
}

c(list(expression = expr), rhs)
}
77 changes: 75 additions & 2 deletions R/categories.R
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,70 @@ is.3vl <- function(cats) {
)
}

updateCategoriesDerivation <- function(x, value) {
old_cats <- categories(x)

if (!setequal(ids(old_cats), ids(value))) {
halt("Updating categories must preserve the same ids for derived variables")
}

cat_changes <- lapply(ids(value), function(cat_id) {
old <- old_cats[[which(ids(old_cats) == cat_id)]]
new <- value[[which(ids(value) == cat_id)]]
defaults <- list(
numeric_value = NA_real_, missing = FALSE, selected = FALSE, date = NA_character_
)

old <- modifyList(defaults, unclass(old))
new <- modifyList(defaults, unclass(new))

changed <- vapply(names(new), function(attr) {
!isTRUE(all.equal(old[[attr]], new[[attr]])) && !(is.na(old[[attr]]) && is.null(new[[attr]])) && !(is.null(old[[attr]]) && is.na(new[[attr]]))
}, logical(1))

new[names(new) == "id" | changed]
})

# We'll use `selectCategories()` for selection changes, but `alterCategories()` for everything else
sel_changes <- vapply(cat_changes, function(x) x$selected %||% NA, logical(1))
names(sel_changes) <- names(value)
sel_changes <- sel_changes[!is.na(sel_changes)]

nonsel_changes <- lapply(cat_changes, function(x) {
names <- setdiff(names(x), "selected")
if (identical(names, "id")) return(NULL)

x[names]
})
nonsel_changes <- nonsel_changes[lengths(nonsel_changes) > 0]
if (length(nonsel_changes) == 0) nonsel_changes <- NULL

old_order <- ids(old_cats)
new_order <- ids(value)
if (identical(old_order, new_order)) new_order <- NULL


# can't use derivation because it mangles urls for array subvariables
derivation <- CrunchExpr(expression = entity(x)@body$derivation)
if (!is.null(nonsel_changes) || !is.null(new_order)) {
derivation <- alterCategoriesExpr(
derivation,
categories = unname(nonsel_changes),
category_order = new_order
)
}
if (length(sel_changes) > 0) {
current_sels <- ids(old_cats[is.selected(old_cats)])
drop_sels <- names(sel_changes[!sel_changes])
add_sels <- names(sel_changes[sel_changes])
new_sels <- c(setdiff(current_sels, drop_sels), add_sels)

derivation <- selectCategories(derivation, new_sels, collapse = FALSE)
}
derivation(x) <- derivation
x
}

#' Get and set Categories on Variables
#'
#' @param x a Variable
Expand Down Expand Up @@ -303,7 +367,12 @@ setMethod(
setMethod(
"categories<-", c("CategoricalVariable", "Categories"),
function(x, value) {
ent <- setEntitySlot(entity(x), "categories", value)
if (!is.derived(x)) {
ent <- setEntitySlot(entity(x), "categories", value)
} else {
x <- updateCategoriesDerivation(x, value)
}

dropCache(cubeURL(x))
return(x)
}
Expand All @@ -313,7 +382,11 @@ setMethod(
setMethod(
"categories<-", c("CategoricalArrayVariable", "Categories"),
function(x, value) {
ent <- setEntitySlot(entity(x), "categories", value)
if (!is.derived(x)) {
ent <- setEntitySlot(entity(x), "categories", value)
} else {
x <- updateCategoriesDerivation(x, value)
}
lapply(subvariableURLs(tuple(x)), dropCache) ## Subvariables will update too
dropCache(cubeURL(x))
return(x)
Expand Down
2 changes: 1 addition & 1 deletion R/category.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
is.category <- function(x) inherits(x, "Category")

setValidity("Category", function(object) {
is.cat <- all(c("id", "name") %in% names(object))
is.cat <- all(c("name") %in% names(object))
if (!all(is.cat)) {
val <- "Not a category"
} else {
Expand Down
Loading