Skip to content

Commit

Permalink
Improve unnest() performance (#1141)
Browse files Browse the repository at this point in the history
* Simplify `unnest()` to improve performance

It is now a simple combination of `unchop()` + `unpack()` on the remaining df-cols.

* Add a test for unnesting df-cols

* Add skipped tests for lists of `NULL`

* Add a test and NEWS bullet for the error with mixed type list-cols

* Remove `skip()`s now that #1140 is merged
  • Loading branch information
DavisVaughan authored Aug 26, 2021
1 parent 37cb5d0 commit 36e5399
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 42 deletions.
6 changes: 6 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# tidyr (development version)

* `unnest()` is now much faster (@mgirlich, @DavisVaughan, #1127).

* `unnest()` no longer allows unnesting a list-col containing a mix of vector
and data frame elements. Previously, this only worked by accident, and is
considered an off-label usage of `unnest()` that has now become an error.

* `unchop()` is now much faster, which propagates through to various functions,
such as `unnest()`, `unnest_longer()`, `unnest_wider()`, and
`separate_rows()` (@mgirlich, @DavisVaughan, #1127).
Expand Down
40 changes: 1 addition & 39 deletions R/nest.R
Original file line number Diff line number Diff line change
Expand Up @@ -329,20 +329,9 @@ unnest.data.frame <- function(
.id = "DEPRECATED",
.sep = "DEPRECATED",
.preserve = "DEPRECATED") {

cols <- tidyselect::eval_select(enquo(cols), data)

if (nrow(data) == 0) {
for (col in names(cols)) {
data[[col]] <- as_empty_df(data[[col]], col = col)
}
} else {
for (col in names(cols)) {
data[[col]] <- map(data[[col]], as_df, col = col)
}
}

data <- unchop(data, any_of(cols), keep_empty = keep_empty, ptype = ptype)
cols <- cols[map_lgl(unclass(data)[cols], is.data.frame)]
unpack(data, any_of(cols), names_sep = names_sep, names_repair = names_repair)
}

Expand Down Expand Up @@ -370,30 +359,3 @@ unnest.rowwise_df <- function(

out
}

# helpers -----------------------------------------------------------------

# n cols, n rows
as_df <- function(x, col) {
if (is.null(x)) {
x
} else if (is.data.frame(x)) {
x
} else if (vec_is(x)) {
# Preserves vec_size() invariant
new_data_frame(list(x), names = col)
} else {
stop("Input must be list of vectors", call. = FALSE)
}
}

as_empty_df <- function(x, col) {
if (is_list_of(x)) {
x
} else if (is.list(x)) {
list_of(.ptype = tibble(!!col := unspecified()))
} else {
stop("Input must be list of vectors", call. = FALSE)
}
}

14 changes: 14 additions & 0 deletions tests/testthat/_snaps/nest.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# bad inputs generate errors

Code
unnest(df, y)
Error <vctrs_error_scalar_type>
Input must be a vector, not a function.

# multiple columns must be same length

Code
Expand All @@ -12,3 +19,10 @@
Error <rlang_error>
In row 1, can't recycle input of size 2 to size 3.

# unnesting column of mixed vector / data frame input is an error

Code
unnest(df, x)
Error <vctrs_error_incompatible_type>
Can't combine `..1` <double> and `..2` <tbl_df>.

27 changes: 24 additions & 3 deletions tests/testthat/test-nest.R
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ test_that("empty rows still affect output type", {

test_that("bad inputs generate errors", {
df <- tibble(x = 1, y = list(mean))
expect_error(unnest(df, y), "must be list of vectors")
expect_snapshot(error = TRUE, unnest(df, y))
})

test_that("unesting combines augmented vectors", {
Expand Down Expand Up @@ -204,6 +204,15 @@ test_that("can use non-syntactic names", {
expect_named(out, "foo bar")
})

test_that("unpacks df-cols (#1112)", {
df <- tibble(x = 1, y = tibble(a = 1, b = 2))
expect_identical(unnest(df, y), tibble(x = 1, a = 1, b = 2))
})

test_that("unnesting column of mixed vector / data frame input is an error", {
df <- tibble(x = list(1, tibble(a = 1)))
expect_snapshot(error = TRUE, unnest(df, x))
})

# other methods -----------------------------------------------------------------

Expand Down Expand Up @@ -233,15 +242,27 @@ test_that("can unnest empty data frame", {
expect_equal(out, tibble(x = integer(), y = unspecified()))
})

test_that("unnesting bare lists of NULLs is equivalent to unnesting empty lists", {
df <- tibble(x = 1L, y = list(NULL))
out <- unnest(df, y)
expect_identical(out, tibble(x = integer(), y = unspecified()))
})

test_that("unnest() preserves ptype", {
tbl <- tibble(x = integer(), y = list_of(ptype = tibble(a = integer())))
res <- unnest(tbl, y)
expect_equal(res, tibble(x = integer(), a = integer()))
})

test_that("errors on bad inputs", {
test_that("unnesting typed lists of NULLs retains ptype", {
df <- tibble(x = 1L, y = list_of(NULL, .ptype = tibble(a = integer())))
out <- unnest(df, y)
expect_identical(out, tibble(x = integer(), a = integer()))
})

test_that("skips over vector columns", {
df <- tibble(x = integer(), y = list())
expect_error(unnest(df, x), "list of vectors")
expect_identical(unnest(df, x), df)
})

test_that("unnest keeps list cols", {
Expand Down

0 comments on commit 36e5399

Please sign in to comment.