From 2c662aab4aba83f25697711c459536ec03166f04 Mon Sep 17 00:00:00 2001
From: Neal Richardson
Date: Wed, 14 Nov 2018 15:42:59 -0800
Subject: [PATCH 1/2] Content-Encoding: gzip for large enough request bodies

---
 DESCRIPTION               |  2 +-
 NEWS.md                   |  1 +
 R/api.R                   | 30 ++++++++++++++++++++++++------
 R/batches.R               |  4 +++-
 R/projects.R              |  3 ++-
 inst/crunch-test.R        |  1 +
 man/crunchAPI.Rd          |  9 +++++++--
 tests/testthat/test-api.R | 12 ++++++++++++
 8 files changed, 51 insertions(+), 11 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 747e29391..797cbe2eb 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -42,7 +42,7 @@ Suggests:
VignetteBuilder: knitr
Language: en-US
Encoding: UTF-8
-RoxygenNote: 6.1.0
+RoxygenNote: 6.1.1
Roxygen: list(markdown = TRUE)
LazyData: true
Collate:
diff --git a/NEWS.md b/NEWS.md
index d65d3edd6..73ec2e4d1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -4,6 +4,7 @@
* Support for `"."` as a folder path/segment, referencing the current folder. `cd(project, ".")` returns `project`; `mv(project, ds, ".")` moves `ds` into `project`.
* Fix bug in printing folders that contain entities with excessively long names
* Improved speed for calculating insertions (subtotals, headers, etc.) on large cubes (speed ups of ~25x on large, realistic cubes).
+* Use `gzip` compression of request bodies larger than 1KB for faster transmission over the network.
* First draft support for deck creation and manipulation.

# crunch 1.24.2
diff --git a/R/api.R b/R/api.R
index f6ec3e095..038a78188 100644
--- a/R/api.R
+++ b/R/api.R
@@ -1,22 +1,40 @@
#' Main Crunch API handling function
#' @param http.verb character in GET, PUT, POST, PATCH, DELETE
#' @param url character URL to do the verb on
-#' @param ... additional arguments passed to `GET`, `PUT`,
-#' `POST`, `PATCH`, or `DELETE`
#' @param config list of config parameters. See httr documentation.
+#' @param body request body. See httr documentation.
+#' @param encode How to encode `body` if it is a named list. Unlike in `httr`,
+#' the default is `"json"`.
#' @param status.handlers named list of specific HTTP statuses and a response
#' function to call in the case where that status is returned. Passed to the
#' [handleAPIresponse()] function.
+#' @param ... additional arguments passed to `GET`, `PUT`,
+#' `POST`, `PATCH`, or `DELETE`
#' @keywords internal
-crunchAPI <- function(http.verb, url, config = list(), status.handlers = list(), ...) {
+crunchAPI <- function(http.verb,
+                      url,
+                      config = list(),
+                      body = NULL,
+                      encode = "json",
+                      status.handlers = list(),
+                      ...) {
    url ## force lazy eval of url
    if (isTRUE(getOption("crunch.debug"))) {
        ## TODO: work this into httpcache.log
-        payload <- list(...)$body
-        if (!is.null(payload)) try(cat("\n", payload, "\n"), silent = TRUE)
+        if (!is.null(body)) try(cat("\n", body, "\n"), silent = TRUE)
    }
    FUN <- get(http.verb, envir = asNamespace("httpcache"))
-    x <- FUN(url, ..., config = c(get_crunch_config(), config))
+    baseconfig <- get_crunch_config()
+    if (!is.null(body) && encode == "json") {
+        # TODO: push the toJSON to here?
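+        # Callers currently serialize `body` with toJSON() before passing it
+        # in (hence the TODO above), so httr's `encode` won't add a JSON
+        # Content-Type for us; declare it explicitly on the request config.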
+        baseconfig <- c(baseconfig, add_headers(`Content-Type`="application/json"))
+        if (object.size(body) > getOption("crunch.min.gzip.bytes", 1024L)) {
+            # Compress
+            body <- memCompress(body, "gzip")
+            baseconfig <- c(baseconfig, add_headers(`Content-Encoding`="gzip"))
+        }
+    }
+    x <- FUN(url, ..., body = body, encode = encode, config = c(baseconfig, config))
    out <- handleAPIresponse(x, special.statuses = status.handlers)
    return(out)
}
diff --git a/R/batches.R b/R/batches.R
index 7a10b7503..b3187017a 100644
--- a/R/batches.R
+++ b/R/batches.R
@@ -50,7 +50,9 @@ createSource <- function(file, url, ...) {
    if (!missing(file)) {
        if (file.exists(file)) {
            u <- crPOST(sources_url,
-                body = list(uploaded_file = upload_file(file)), ...
+                body = list(uploaded_file = upload_file(file)),
+                encode = "multipart",
+                ...
            )
        } else {
            halt("File not found")
diff --git a/R/projects.R b/R/projects.R
index 7759d0697..37de9bf12 100644
--- a/R/projects.R
+++ b/R/projects.R
@@ -82,7 +82,8 @@ icon <- function(x) {
#' @export
`icon<-` <- function(x, value) {
    crPUT(shojiURL(x, "views", "icon"),
-        body = list(icon = upload_file(value))
+        body = list(icon = upload_file(value)),
+        encode = "multipart"
    )
    dropOnly(absoluteURL("../", self(x))) ## Invalidate catalog
    return(refresh(x))
diff --git a/inst/crunch-test.R b/inst/crunch-test.R
index 62e9b5c3b..75536e7eb 100644
--- a/inst/crunch-test.R
+++ b/inst/crunch-test.R
@@ -79,6 +79,7 @@ cubify <- function(..., dims) {
with_mock_crunch <- function(expr) {
    opts <- temp.options(
        crunch.api = "https://app.crunch.io/api/",
+        crunch.min.gzip.bytes = 1e9, ## Don't gzip request bodies in tests (until httptest can handle)
        httptest.mock.paths = c(".", "../inst/", system.file(package = "crunch"))
    )
    with(
diff --git a/man/crunchAPI.Rd b/man/crunchAPI.Rd
index 1f283c963..6192a8ff3 100644
--- a/man/crunchAPI.Rd
+++ b/man/crunchAPI.Rd
@@ -4,8 +4,8 @@
\alias{crunchAPI}
\title{Main Crunch API handling function}
\usage{
-crunchAPI(http.verb, url, config = list(), status.handlers = list(),
-  ...)
+crunchAPI(http.verb, url, config = list(), body = NULL,
+  encode = "json", status.handlers = list(), ...)
}
\arguments{
\item{http.verb}{character in GET, PUT, POST, PATCH, DELETE}

\item{url}{character URL to do the verb on}

\item{config}{list of config parameters. See httr documentation.}

+\item{body}{request body. See httr documentation.}
+
+\item{encode}{How to encode \code{body} if it is a named list. Unlike in \code{httr},
+the default is \code{"json"}.}
+
\item{status.handlers}{named list of specific HTTP statuses and a response
function to call in the case where that status is returned.
Passed to the \code{\link[=handleAPIresponse]{handleAPIresponse()}} function.}
diff --git a/tests/testthat/test-api.R b/tests/testthat/test-api.R
index 823f74e40..28b08c54e 100644
--- a/tests/testthat/test-api.R
+++ b/tests/testthat/test-api.R
@@ -79,6 +79,18 @@ with_mock_crunch({
        expect_true(featureFlag("this_is_on"))
        expect_false(featureFlag("this_is_off"))
    })
+
+    test_that("Request bodies are gzipped (if large enough)", {
+        with(temp.option(crunch.min.gzip.bytes=0), {
+            expect_header(
+                expect_error( # httptest errors printing the gzipped body
+                    # expect_POST(
+                    crPOST("https://app.crunch.io/api/", body = '{"value":1}')
+                ),
+                "Content-Encoding: gzip"
+            )
+        })
+    })
})

test_that("retry", {

From 1bdbf0648b3dbb4036b25fe65cf0644c0c448a06 Mon Sep 17 00:00:00 2001
From: Neal Richardson
Date: Wed, 14 Nov 2018 16:10:20 -0800
Subject: [PATCH 2/2] Comment, test move, and workaround for mocked vignettes

---
 R/api.R                      |  6 +++++-
 tests/testthat/test-api.R    | 39 +++++++++++++++++++------------------
 vignettes/crunch.Rmd         |  1 +
 vignettes/fork-and-merge.Rmd | 45 +++++++++++++++++++----------------------
 4 files changed, 50 insertions(+), 41 deletions(-)

diff --git a/R/api.R b/R/api.R
index 038a78188..bec75f902 100644
--- a/R/api.R
+++ b/R/api.R
@@ -29,7 +29,11 @@ crunchAPI <- function(http.verb,
        # TODO: push the toJSON to here?
        baseconfig <- c(baseconfig, add_headers(`Content-Type`="application/json"))
        if (object.size(body) > getOption("crunch.min.gzip.bytes", 1024L)) {
-            # Compress
+            # Compress. But only compress above a minimum threshold because
+            # gzip can make small objects a little bigger,
+            # + compression/decompression overhead.
+            # getOption makes this configurable for tests and also to provide a
+            # way to turn it off in case of emergency.
            body <- memCompress(body, "gzip")
            baseconfig <- c(baseconfig, add_headers(`Content-Encoding`="gzip"))
        }
diff --git a/tests/testthat/test-api.R b/tests/testthat/test-api.R
index 28b08c54e..ee10e97da 100644
--- a/tests/testthat/test-api.R
+++ b/tests/testthat/test-api.R
@@ -91,6 +91,13 @@ with_mock_crunch({
            )
        })
    })
+
+    test_that("Other request headers", {
+        uncached({ # So we go to httr for these and not read cached responses
+            expect_header(crGET("https://app.crunch.io/api/"),
+                "User-Agent.*rcrunch", ignore.case=TRUE)
+        })
+    })
})

test_that("retry", {
@@ -108,29 +115,25 @@ test_that("retry", {
    )
})

+test_that("crunchUserAgent", {
+    expect_true(grepl("rcrunch", crunchUserAgent()))
+    expect_true(grepl("libcurl", crunchUserAgent()))
+    expect_error(
+        crunchUserAgent("anotherpackage/3.1.4"),
+        NA
+    )
+    expect_true(grepl("anotherpackage", crunchUserAgent("anotherpackage")))
+})
+
if (run.integration.tests) {
-    test_that("Request headers", {
+    test_that("Requests send Accept-Encoding: gzip", {
+        # libcurl adds this according to CURLOPT_ACCEPT_ENCODING, which appears
+        # to be on by default. expect_header() can't catch this because it gets
+        # added below httr, so we can't test this with mocks
        skip_if_disconnected()
        r <- crGET("http://httpbin.org/gzip")
        expect_true(r$gzipped)
        expect_true(grepl("gzip", r$headers[["Accept-Encoding"]]))
-        expect_true(grepl("rcrunch", r$headers[["User-Agent"]]))
-    })
-
-    test_that("crunchUserAgent", {
-        expect_true(grepl("rcrunch", crunchUserAgent()))
-        expect_true(grepl("libcurl", crunchUserAgent()))
-        expect_error(
-            crunchUserAgent("anotherpackage/3.1.4"),
-            NA
-        )
-        expect_true(grepl("anotherpackage", crunchUserAgent("anotherpackage")))
-    })
-
-    with_test_authentication({
-        test_that("API root can be fetched", {
-            expect_true(is.shojiObject(getAPIRoot()))
-        })
    })

    test_that("API calls throw an error if user is not authenticated", {
diff --git a/vignettes/crunch.Rmd b/vignettes/crunch.Rmd
index 7fd8ee7a8..79e4732f8 100644
--- a/vignettes/crunch.Rmd
+++ b/vignettes/crunch.Rmd
@@ -16,6 +16,7 @@ library(crunch)
```{r, results='hide', include = FALSE}
library(httptest)
+options(crunch.min.gzip.bytes = 1e9) ## Don't gzip request bodies in tests (until httptest can handle)
start_vignette("crunch")
login()
```
diff --git a/vignettes/fork-and-merge.Rmd b/vignettes/fork-and-merge.Rmd
index 12c4b13fa..97caf0b12 100644
--- a/vignettes/fork-and-merge.Rmd
+++ b/vignettes/fork-and-merge.Rmd
@@ -8,9 +8,9 @@ vignette: >
  %\VignetteEncoding{UTF-8}
---

-One of the main benefits of Crunch is that it lets analysts and clients work with the same datasets. Instead of emailing datasets to clients, you can update the live dataset and ensure that they will see see the most up-to-date information. The potential problem with this setup is that it can become difficult to make provisional changes to the dataset without publishing it to the client. Sometimes an analyst wants to investigate or experiment with a dataset without the risk of sending incorrect or confusing information to the end user This is why we implemented a fork-edit-merge workflow for Crunch datasets.
+One of the main benefits of Crunch is that it lets analysts and clients work with the same datasets. Instead of emailing datasets to clients, you can update the live dataset and ensure that they will see the most up-to-date information. The potential problem with this setup is that it can become difficult to make provisional changes to the dataset without publishing it to the client. Sometimes an analyst wants to investigate or experiment with a dataset without the risk of sending incorrect or confusing information to the end user. This is why we implemented a fork-edit-merge workflow for Crunch datasets.

-"Fork" originates in computer version control systems and just means to take a copy of something with the intention of making some changes to the copy, and then incorporating those changes back into the original. A helpful mnemonic is to think of a path which forks off from the main road, but then rejoins it later on. To see how this works lets first upload a new dataset to Crunch.
+"Fork" originates in computer version control systems and just means to take a copy of something with the intention of making some changes to the copy, and then incorporating those changes back into the original. A helpful mnemonic is to think of a path which forks off from the main road, but then rejoins it later on. To see how this works let's first upload a new dataset to Crunch.
```{r, eval = FALSE, message=FALSE}
library(crunch)
login()
```

```{r, results='hide', echo=FALSE, message=FALSE}
library(crunch)
library(httptest)
+options(crunch.min.gzip.bytes = 1e9) ## Don't gzip request bodies in tests (until httptest can handle)
start_vignette("fork-and-merge")
```
@@ -26,32 +27,32 @@ start_vignette("fork-and-merge")
```

```{r load dataset}
ds <- newDataset(SO_survey, "stackoverflow_survey")
```

-Imagine that this dataset is shared with several users, and you want to update it without affecting their usage. You might also want to consult with other analysts or decision makers to make sure that the data is accurate before sharing it with clients. To do this you call `forkDataset()` to create a copy of the dataset.
+Imagine that this dataset is shared with several users, and you want to update it without affecting their usage. You might also want to consult with other analysts or decision makers to make sure that the data is accurate before sharing it with clients. To do this you call `forkDataset()` to create a copy of the dataset.

```{r fork dataset}
forked_ds <- forkDataset(ds)
```

-You now have a copied dataset which is identical to the original, and are free to make changes without fear of disrupting the client's experience. You can add or remove variables, delete records, or change the dataset's organization. These changes will be isolated to your fork and won't be visible to the end user until you decide to merge the fork back into the original dataset. This lets you edit the dataset with confidence because your work is isolated.
+You now have a copied dataset which is identical to the original, and are free to make changes without fear of disrupting the client's experience. You can add or remove variables, delete records, or change the dataset's organization. These changes will be isolated to your fork and won't be visible to the end user until you decide to merge the fork back into the original dataset. This lets you edit the dataset with confidence because your work is isolated.

-In this case, let's create a new categorical array variable.
+In this case, let's create a new categorical array variable.

```{r state change1, include=FALSE}
change_state()
```

```{r create Multiple Response Variable}
-ds$ImportantHiringCA <- makeArray(ds[, c("ImportantHiringTechExp", "ImportantHiringPMExp")],
-    name = "importantCatArray")
+ds$ImportantHiringCA <- makeArray(ds[, c("ImportantHiringTechExp", "ImportantHiringPMExp")],
+    name = "importantCatArray")
```

-Our forked dataset has diverged from the original dataset. Which we can see by comparing their names.
+Our forked dataset has diverged from the original dataset, which we can see by comparing their names.

```{r compare datasets}
all(names(ds) %in% names(forked_ds))
```

-You can work with the forked dataset as long as you like, if you want to see it in the web App or share it with other analysts by you can do so by calling `webApp(forked_ds)`. You might create many forks and discard most of them without merging them into the original dataset.
+You can work with the forked dataset as long as you like. If you want to see it in the web App or share it with other analysts, you can do so by calling `webApp(forked_ds)`. You might create many forks and discard most of them without merging them into the original dataset.

-If you do end up with changes to the forked dataset that you want to include in the original dataset you can do so with the `mergeFork()` function. This function figures out what changes you made the fork, and then applies those changes to the original dataset.
+If you do end up with changes to the forked dataset that you want to include in the original dataset, you can do so with the `mergeFork()` function. This function figures out what changes you made to the fork, and then applies those changes to the original dataset.

```{r state change2, include=FALSE}
change_state()
```
ds <- mergeFork(ds, forked_ds)
```

-After merging the original dataset includes the categorical array variable which we created on the fork.
+After merging, the original dataset includes the categorical array variable which we created on the fork.

```{r check successful merge}
ds$ImportantHiringCA
```

-It's possible to to make changes to a fork which can't be easily merged into the original dataset. For instance if, while we were working on this fork someone added another variable called `ImportantHiringCA` to the original dataset the merge might fail because there's no safe way to reconcile the two forks. This is called a "merge conflict" and there are a couple best practices that you can follow to avoid this problem:
+It's possible to make changes to a fork which can't be easily merged into the original dataset. For instance, if someone added another variable called `ImportantHiringCA` to the original dataset while we were working on this fork, the merge might fail because there's no safe way to reconcile the two forks. This is called a "merge conflict", and there are a couple of best practices that you can follow to avoid this problem:

1) **Make minimal changes to dataset forks.** Instead of making lots of changes to a fork, make a couple of small change to the fork, merge it back into the original dataset, and a create a new fork for the next set of changes
2) **Have other analysts work on their own forks.** It's easier to avoid conflicts if each member of the team makes changes to their own fork and then periodically merges those changes back into the original dataset. This lets you coordinate the order that you want to apply changes to the original dataset, and so avoid some merge conflicts.

## Appending data

-Another good use of the fork-edit-merge workflow is when you want to append data to an existing dataset. When appending data you usually want to check that the append operation completed successfully before publishing the data to users. This might come up if you are adding a second wave of a survey, or including some additional data which came in after the dataset was initially sent to clients. The first step is to upload the second survey wave as its own dataset.
+Another good use of the fork-edit-merge workflow is when you want to append data to an existing dataset. When appending data you usually want to check that the append operation completed successfully before publishing the data to users. This might come up if you are adding a second wave of a survey, or including some additional data which came in after the dataset was initially sent to clients. The first step is to upload the second survey wave as its own dataset.
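+Sketched end to end (and assuming the second wave shares the first wave's variables), the cycle we are about to walk through looks roughly like this:
+
+```{r, eval = FALSE}
+wave2 <- newDataset(SO_survey, "SO_survey_wave2") # upload the new wave
+ds_fork <- forkDataset(ds)                        # work on a copy of the original
+ds_fork <- appendDataset(ds_fork, wave2)          # append onto the fork
+nrow(ds_fork)                                     # sanity-check the combined row count
+ds <- mergeFork(ds, ds_fork)                      # publish once it looks right
+```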
```{r state change3, include=FALSE} change_state() @@ -82,7 +83,7 @@ wave2 <- newDataset(SO_survey, "SO_survey_wave2") ``` -We then fork the original dataset and append the new wave onto the forked dataset. +We then fork the original dataset and append the new wave onto the forked dataset. ```{r state change4, include=FALSE} change_state() ``` @@ -97,7 +98,7 @@ nrow(ds) nrow(ds_fork) ``` -Once we've confirmed that the append completed successfully we can merge the forked dataset back into the original one. +Once we've confirmed that the append completed successfully we can merge the forked dataset back into the original one. ```{r state change5, include=FALSE} change_state() ``` @@ -105,7 +106,7 @@ change_state() ds <- mergeFork(ds, ds_fork) ``` -`ds` now has the additional rows. +`ds` now has the additional rows. ```{r} nrow(ds) @@ -113,7 +114,7 @@ nrow(ds) ## Merging datasets -Merging two datasets together can often be the source of unexpected behavior like misaligning or overwriting variables, and so it's a good candidate for this workflow. Let's create a fake dataset with household size to merge onto the original one. +Merging two datasets together can often be the source of unexpected behavior like misaligning or overwriting variables, and so it's a good candidate for this workflow. Let's create a fake dataset with household size to merge onto the original one. ```{r state change6, include=FALSE} change_state() ``` @@ -127,20 +128,20 @@ house_table$HouseholdSize <- sample( house_ds <- newDataset(house_table, "House Size") ``` -There are a few reasons why we might not want to merge this new table onto our user facing data. For instance we might make a mistake in constructing the table, or have some category names which don't quite match up. Merging the data onto a forked dataset again gives us the safety to make changes and verify accuracy without affecting client-facing data. +There are a few reasons why we might not want to merge this new table onto our user facing data. For instance we might make a mistake in constructing the table, or have some category names which don't quite match up. Merging the data onto a forked dataset again gives us the safety to make changes and verify accuracy without affecting client-facing data. ```{r fork and merge recode, message=FALSE, results='hide'} ds_fork <- forkDataset(ds) ds_fork <- merge(ds_fork, house_ds, by = "Respondent") ``` -Before merging the fork back into the original dataset, we can check that everything went well with the join. +Before merging the fork back into the original dataset, we can check that everything went well with the join. ```{r check new data} crtabs(~ TabsSpaces + HouseholdSize, ds_fork) ``` -And finally once we're comfortable that everything went as expected we can send the data to the client by merging the fork back to the original dataset. +And finally once we're comfortable that everything went as expected we can send the data to the client by merging the fork back to the original dataset. ```{r state change7, include=FALSE} change_state() ``` @@ -151,7 +152,7 @@ ds$HouseholdSize ## Conclusion -Forking and merging datasets is a great way to make changes to the data. It allows you to verify your work and get approval before putting the data in front of clients, and gives you the freedom to make changes and mistakes without worrying about disrupting production data. +Forking and merging datasets is a great way to make changes to the data. 
It allows you to verify your work and get approval before putting the data in front of clients, and gives you the freedom to make changes and mistakes without worrying about disrupting production data.
+Forking and merging datasets is a great way to make changes to the data. It allows you to verify your work and get approval before putting the data in front of clients, and gives you the freedom to make changes and mistakes without worrying about disrupting production data.

```{r, results='hide', echo=FALSE, message=FALSE}