Minor performance enhancement to url_parse #429

Closed
@DyfanJones

Description

Hi all,

I have been playing around with httr2::url_parse and I believe I have found some performance improvements.

library(rlang)

# Original httr2 
# Note: used `httr2:::` to simplify reprex example
url_parse <- function(url) {
  httr2:::check_string(url)
  
  # https://datatracker.ietf.org/doc/html/rfc3986#appendix-B
  pieces <- parse_match(url, "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?")
  
  scheme <- pieces[[2]]
  authority <- pieces[[4]]
  path <- pieces[[5]]
  query <- pieces[[7]]
  if (!is.null(query)) {
    query <- query_parse(query)
  }
  fragment <- pieces[[9]]
  
  # https://datatracker.ietf.org/doc/html/rfc3986#section-3.2
  pieces <- parse_match(authority %||% "", "^(([^@]+)@)?([^:]+)?(:([^#]+))?")
  
  userinfo <- pieces[[2]]
  if (!is.null(userinfo)) {
    if (grepl(":", userinfo)) {
      userinfo <- httr2:::parse_in_half(userinfo, ":")
    } else {
      userinfo <- list(userinfo, NULL)
    }
  }
  hostname <- pieces[[3]]
  port <- pieces[[5]]
  
  structure(
    list(
      scheme = scheme,
      hostname = hostname,
      username = userinfo[[1]],
      password = userinfo[[2]],
      port = port,
      path = path,
      query = query,
      fragment = fragment
    ),
    class = "httr2_url"
  )
}
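
# For reference, a hand-traced sketch of what the rewritten parser
# should return (not captured output; assumes httr2's internal helpers
# behave as above):
u <- url_parse("http://user:[email protected]:80/path?a=1&b=2#frag")
# u$scheme   -> "http"
# u$hostname -> "google.com"
# u$username -> "user"
# u$password -> "pass"
# u$port     -> "80"
# u$path     -> "/path"
# u$query    -> list(a = "1", b = "2")
# u$fragment -> "frag"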

# avoid the element-wise loop when replacing empty matches with NULL
parse_match <- function(x, pattern) {
  m <- regexec(pattern, x, perl = TRUE)
  pieces <- regmatches(x, m)[[1]][-1]
  
  # replace empty elements with NULL (coerces pieces to a list)
  pieces[pieces == ""] <- list(NULL)
  return(pieces)
}
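
# The vectorised replacement above works because assigning a list into
# an atomic vector coerces the whole vector to a list; a minimal
# illustration (not part of the original timing runs):
x <- c("a", "", "b")
x[x == ""] <- list(NULL)
str(x)
#> List of 3
#>  $ : chr "a"
#>  $ : NULL
#>  $ : chr "b"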

# same as original httr2
query_parse <- function(x) {
  x <- gsub("^\\?", "", x) # strip leading ?, if present
  params <- parse_name_equals_value(httr2:::parse_delim(x, "&"))
  
  if (length(params) == 0) {
    return(NULL)
  }
  
  out <- as.list(curl::curl_unescape(params))
  names(out) <- curl::curl_unescape(names(params))
  out
}
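
# Expected behaviour, traced by hand rather than run: a valueless
# parameter comes back as an empty string (via the recycling trick in
# parse_name_equals_value() below, which also emits an rbind() warning),
# and percent-escapes are decoded by curl::curl_unescape():
# query_parse("?a=1&b=2&c")        -> list(a = "1", b = "2", c = "")
# query_parse("?a=1&enc=%20hello") -> list(a = "1", enc = " hello")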

# use matrix operations instead
parse_name_equals_value <- function(x) {
  if (length(x) == 0) return(NULL)
  pieces <- strsplit(x, "=")
  pieces_matrix <- do.call(rbind, pieces)
  
  if (ncol(pieces_matrix) == 1) {
    pieces_matrix <- cbind(pieces_matrix, rep("", nrow(pieces_matrix)))
  }
  
  # strsplit() drops a missing value, so rbind() recycles the name into
  # the value column; where name == value, assume it was a valueless
  # field and reset the value to ""
  found <- pieces_matrix[, 1] == pieces_matrix[, 2]
  pieces_matrix[found, 2] <- ""
  set_names(as.list(pieces_matrix[, 2]), pieces_matrix[, 1])
}
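
# A minimal illustration of the recycling that the name == value check
# relies on: strsplit() drops the missing value and rbind() recycles
# the lone name across both columns (with a warning), after which the
# check blanks the value. Caveat: a genuine pair whose name equals its
# value (e.g. "a=a") would be blanked as well.
pieces <- strsplit(c("a=1", "b"), "=")
do.call(rbind, pieces) # warns about recycling
#>      [,1] [,2]
#> [1,] "a"  "1"
#> [2,] "b"  "b"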


url <- 'https://someurl.com/with/query_string?i=main&mode=front&sid=12ab&enc=+Hello'

(bm <- bench::mark(
  httr = httr::parse_url(url),
  httr2 = httr2::url_parse(url),
  new_mthd = url_parse(url),
  urltools = urltools::url_parse(url),
  adaR = adaR::ada_url_parse(url),
  check = FALSE
))
#> # A tibble: 5 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 httr          241µs    265µs     3596.  387.71KB     35.8
#> 2 httr2         207µs    223µs     4322.  466.62KB     37.2
#> 3 new_mthd      129µs    142µs     6575.     4.5MB     39.3
#> 4 urltools      125µs    136µs     6892.    7.45MB     44.0
#> 5 adaR          190µs    207µs     4656.    1.06MB     44.8

bm |> ggplot2::autoplot()
#> Loading required namespace: tidyr

urls <- list(
  "/",
  "//google.com",
  "file:///",
  "http://google.com/",
  "http://google.com/path",
  "http://google.com/path?a=1&b=2",
  "http://google.com:80/path?a=1&b=2",
  "http://google.com:80/path?a=1&b=2#frag",
  "http://google.com:80/path?a=1&b=2&c=%7B1%7B2%7D3%7D#frag",
  "http://[email protected]:80/path?a=1&b=2",
  "http://user:[email protected]:80/path?a=1&b=2",
  "svn+ssh://my.svn.server/repo/trunk",
  'https://someurl.com/with/query_string?i=main&mode=front&sid=12ab&enc=+Hello',
  "http://user:[email protected]:80/path?a=1&b=2&c={1{2}3}#frag"
)

(bm <- bench::mark(
  httr = lapply(urls, httr::parse_url),
  httr2 = lapply(urls, httr2::url_parse),
  adaR = lapply(urls, adaR::ada_url_parse),
  urltools = lapply(urls, urltools::url_parse),
  new_mthd = lapply(urls, url_parse),
  check = FALSE,
  iterations = 1000
))
#> # A tibble: 5 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 httr         2.48ms   2.67ms      368.   161.9KB     17.4
#> 2 httr2        2.25ms    2.4ms      405.    70.9KB     19.5
#> 3 adaR         2.67ms   2.84ms      345.    34.9KB     23.6
#> 4 urltools     1.74ms   1.88ms      524.    34.9KB     22.4
#> 5 new_mthd     1.59ms    1.7ms      556.    62.9KB     20.8
bm |> ggplot2::autoplot()

Created on 2024-01-25 with reprex v2.1.0

It seems to be a nice performance enhancement, with results comparable to urltools. However, this is only a small benchmark on my computer.

I am happy to raise a PR for this if the changes make sense :)

Note: it passes the test-url.R unit tests as well :)
