Closed
Description
Hi all,
I have been playing around with httr2::url_parse
and I believe I have got some performance improvements.
library(rlang)
# Original httr2
# Note: used `httr2:::` to simplify reprex example
url_parse <- function(url) {
httr2:::check_string(url)
# https://datatracker.ietf.org/doc/html/rfc3986#appendix-B
pieces <- parse_match(url, "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?")
scheme <- pieces[[2]]
authority <- pieces[[4]]
path <- pieces[[5]]
query <- pieces[[7]]
if (!is.null(query)) {
query <- query_parse(query)
}
fragment <- pieces[[9]]
# https://datatracker.ietf.org/doc/html/rfc3986#section-3.2
pieces <- parse_match(authority %||% "", "^(([^@]+)@)?([^:]+)?(:([^#]+))?")
userinfo <- pieces[[2]]
if (!is.null(userinfo)) {
if (grepl(":", userinfo)) {
userinfo <- httr2:::parse_in_half(userinfo, ":")
} else {
userinfo <- list(userinfo, NULL)
}
}
hostname <- pieces[[3]]
port <- pieces[[5]]
structure(
list(
scheme = scheme,
hostname = hostname,
username = userinfo[[1]],
password = userinfo[[2]],
port = port,
path = path,
query = query,
fragment = fragment
),
class = "httr2_url"
)
}
# remove loop when applying NULL to empty elements
parse_match <- function(x, pattern) {
m <- regexec(pattern, x, perl = TRUE)
pieces <- regmatches(x, m)[[1]][-1]
# replace empty element with null
pieces[pieces == ""] <- list(NULL)
return(pieces)
}
# same as original httr2
query_parse <- function(x) {
x <- gsub("^\\?", "", x) # strip leading ?, if present
params <- parse_name_equals_value(httr2:::parse_delim(x, "&"))
if (length(params) == 0) {
return(NULL)
}
out <- as.list(curl::curl_unescape(params))
names(out) <- curl::curl_unescape(names(params))
out
}
# use matrix operations instead
parse_name_equals_value <- function(x) {
if (length(x) == 0) return(NULL)
pieces <- strsplit(x, "=")
pieces_matrix <- do.call(rbind, pieces)
if (ncol(pieces_matrix) == 1) {
pieces_matrix <- cbind(pieces_matrix, rep("", nrow(pieces_matrix)))
}
# If only one piece, assume it's a field name with empty value
found <- pieces_matrix[,1] == pieces_matrix[,2]
pieces_matrix[found, 2] <- ""
set_names(as.list(pieces_matrix[,2]), pieces_matrix[,1])
}
url <- 'https://someurl.com/with/query_string?i=main&mode=front&sid=12ab&enc=+Hello'
(bm <- bench::mark(
httr = httr::parse_url(url),
httr2 = httr2::url_parse(url),
new_mthd = url_parse(url),
urltools = urltools::url_parse(url),
adaR = adaR::ada_url_parse(url),
check = F
))
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 httr 241µs 265µs 3596. 387.71KB 35.8
#> 2 httr2 207µs 223µs 4322. 466.62KB 37.2
#> 3 new_mthd 129µs 142µs 6575. 4.5MB 39.3
#> 4 urltools 125µs 136µs 6892. 7.45MB 44.0
#> 5 adaR 190µs 207µs 4656. 1.06MB 44.8
bm |> ggplot2::autoplot()
#> Loading required namespace: tidyr
urls <- list(
"/",
"//google.com",
"file:///",
"http://google.com/",
"http://google.com/path",
"http://google.com/path?a=1&b=2",
"http://google.com:80/path?a=1&b=2",
"http://google.com:80/path?a=1&b=2#frag",
"http://google.com:80/path?a=1&b=2&c=%7B1%7B2%7D3%7D#frag",
"http://[email protected]:80/path?a=1&b=2",
"http://user:[email protected]:80/path?a=1&b=2",
"svn+ssh://my.svn.server/repo/trunk",
'https://someurl.com/with/query_string?i=main&mode=front&sid=12ab&enc=+Hello',
"http://user:[email protected]:80/path?a=1&b=2&c={1{2}3}#frag"
)
(bm <- bench::mark(
httr = lapply(urls, httr::parse_url),
httr2 = lapply(urls, httr2::url_parse),
adaR = lapply(urls, adaR::ada_url_parse),
urltools = lapply(urls, urltools::url_parse),
new_mthd = lapply(urls, url_parse),
check = F,
iterations = 1000
))
#> # A tibble: 5 × 6
#> expression min median `itr/sec` mem_alloc `gc/sec`
#> <bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl>
#> 1 httr 2.48ms 2.67ms 368. 161.9KB 17.4
#> 2 httr2 2.25ms 2.4ms 405. 70.9KB 19.5
#> 3 adaR 2.67ms 2.84ms 345. 34.9KB 23.6
#> 4 urltools 1.74ms 1.88ms 524. 34.9KB 22.4
#> 5 new_mthd 1.59ms 1.7ms 556. 62.9KB 20.8
bm |> ggplot2::autoplot()
Created on 2024-01-25 with reprex v2.1.0
I seems to be a nice performance enhancement with results comparable to urltools
. However this is only a small benchmark on my computer.
I am happy to raise a PR for this if the changes makes sense :)
Note: it passes test-url.R
unit tests as well :)
Metadata
Metadata
Assignees
Labels
No labels