Minor performance enhancement to url_parse #429

Closed
@DyfanJones

Description

Hi all,

I have been playing around with httr2::url_parse and I believe I have found some performance improvements.

library(rlang)

# Original httr2 
# Note: used `httr2:::` to simplify reprex example
url_parse <- function(url) {
  httr2:::check_string(url)
  
  # https://datatracker.ietf.org/doc/html/rfc3986#appendix-B
  pieces <- parse_match(url, "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?")
  
  scheme <- pieces[[2]]
  authority <- pieces[[4]]
  path <- pieces[[5]]
  query <- pieces[[7]]
  if (!is.null(query)) {
    query <- query_parse(query)
  }
  fragment <- pieces[[9]]
  
  # https://datatracker.ietf.org/doc/html/rfc3986#section-3.2
  pieces <- parse_match(authority %||% "", "^(([^@]+)@)?([^:]+)?(:([^#]+))?")
  
  userinfo <- pieces[[2]]
  if (!is.null(userinfo)) {
    if (grepl(":", userinfo)) {
      userinfo <- httr2:::parse_in_half(userinfo, ":")
    } else {
      userinfo <- list(userinfo, NULL)
    }
  }
  hostname <- pieces[[3]]
  port <- pieces[[5]]
  
  structure(
    list(
      scheme = scheme,
      hostname = hostname,
      username = userinfo[[1]],
      password = userinfo[[2]],
      port = port,
      path = path,
      query = query,
      fragment = fragment
    ),
    class = "httr2_url"
  )
}
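
# For reference, a hand-traced sketch of what the rewritten parser
# should return (not captured output; assumes httr2's internal helpers
# behave as above):
u <- url_parse("http://user:[email protected]:80/path?a=1&b=2#frag")
# u$scheme   -> "http"
# u$hostname -> "google.com"
# u$username -> "user"
# u$password -> "pass"
# u$port     -> "80"
# u$path     -> "/path"
# u$query    -> list(a = "1", b = "2")
# u$fragment -> "frag"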

# avoid the element-wise loop when replacing empty matches with NULL
parse_match <- function(x, pattern) {
  m <- regexec(pattern, x, perl = TRUE)
  pieces <- regmatches(x, m)[[1]][-1]
  
  # replace empty elements with NULL (coerces pieces to a list)
  pieces[pieces == ""] <- list(NULL)
  return(pieces)
}
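
# The vectorised replacement above works because assigning a list into
# an atomic vector coerces the whole vector to a list; a minimal
# illustration (not part of the original timing runs):
x <- c("a", "", "b")
x[x == ""] <- list(NULL)
str(x)
#> List of 3
#>  $ : chr "a"
#>  $ : NULL
#>  $ : chr "b"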

# same as original httr2
query_parse <- function(x) {
  x <- gsub("^\\?", "", x) # strip leading ?, if present
  params <- parse_name_equals_value(httr2:::parse_delim(x, "&"))
  
  if (length(params) == 0) {
    return(NULL)
  }
  
  out <- as.list(curl::curl_unescape(params))
  names(out) <- curl::curl_unescape(names(params))
  out
}
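
# Expected behaviour, traced by hand rather than run: a valueless
# parameter comes back as an empty string (via the recycling trick in
# parse_name_equals_value() below, which also emits an rbind() warning),
# and percent-escapes are decoded by curl::curl_unescape():
# query_parse("?a=1&b=2&c")        -> list(a = "1", b = "2", c = "")
# query_parse("?a=1&enc=%20hello") -> list(a = "1", enc = " hello")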

# use matrix operations instead
parse_name_equals_value <- function(x) {
  if (length(x) == 0) return(NULL)
  pieces <- strsplit(x, "=")
  pieces_matrix <- do.call(rbind, pieces)
  
  if (ncol(pieces_matrix) == 1) {
    pieces_matrix <- cbind(pieces_matrix, rep("", nrow(pieces_matrix)))
  }
  
  # strsplit() drops a missing value, so rbind() recycles the name into
  # the value column; where name == value, assume it was a valueless
  # field and reset the value to ""
  found <- pieces_matrix[, 1] == pieces_matrix[, 2]
  pieces_matrix[found, 2] <- ""
  set_names(as.list(pieces_matrix[, 2]), pieces_matrix[, 1])
}
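
# A minimal illustration of the recycling that the name == value check
# relies on: strsplit() drops the missing value and rbind() recycles
# the lone name across both columns (with a warning), after which the
# check blanks the value. Caveat: a genuine pair whose name equals its
# value (e.g. "a=a") would be blanked as well.
pieces <- strsplit(c("a=1", "b"), "=")
do.call(rbind, pieces) # warns about recycling
#>      [,1] [,2]
#> [1,] "a"  "1"
#> [2,] "b"  "b"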


url <- 'https://someurl.com/with/query_string?i=main&mode=front&sid=12ab&enc=+Hello'

(bm <- bench::mark(
  httr = httr::parse_url(url),
  httr2 = httr2::url_parse(url),
  new_mthd = url_parse(url),
  urltools = urltools::url_parse(url),
  adaR = adaR::ada_url_parse(url),
  check = FALSE
))
#> # A tibble: 5 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 httr          241µs    265µs     3596.  387.71KB     35.8
#> 2 httr2         207µs    223µs     4322.  466.62KB     37.2
#> 3 new_mthd      129µs    142µs     6575.     4.5MB     39.3
#> 4 urltools      125µs    136µs     6892.    7.45MB     44.0
#> 5 adaR          190µs    207µs     4656.    1.06MB     44.8

bm |> ggplot2::autoplot()
#> Loading required namespace: tidyr

urls <- list(
  "/",
  "//google.com",
  "file:///",
  "http://google.com/",
  "http://google.com/path",
  "http://google.com/path?a=1&b=2",
  "http://google.com:80/path?a=1&b=2",
  "http://google.com:80/path?a=1&b=2#frag",
  "http://google.com:80/path?a=1&b=2&c=%7B1%7B2%7D3%7D#frag",
  "http://[email protected]:80/path?a=1&b=2",
  "http://user:[email protected]:80/path?a=1&b=2",
  "svn+ssh://my.svn.server/repo/trunk",
  'https://someurl.com/with/query_string?i=main&mode=front&sid=12ab&enc=+Hello',
  "http://user:[email protected]:80/path?a=1&b=2&c={1{2}3}#frag"
)

(bm <- bench::mark(
  httr = lapply(urls, httr::parse_url),
  httr2 = lapply(urls, httr2::url_parse),
  adaR = lapply(urls, adaR::ada_url_parse),
  urltools = lapply(urls, urltools::url_parse),
  new_mthd = lapply(urls, url_parse),
  check = FALSE,
  iterations = 1000
))
#> # A tibble: 5 × 6
#>   expression      min   median `itr/sec` mem_alloc `gc/sec`
#>   <bch:expr> <bch:tm> <bch:tm>     <dbl> <bch:byt>    <dbl>
#> 1 httr         2.48ms   2.67ms      368.   161.9KB     17.4
#> 2 httr2        2.25ms    2.4ms      405.    70.9KB     19.5
#> 3 adaR         2.67ms   2.84ms      345.    34.9KB     23.6
#> 4 urltools     1.74ms   1.88ms      524.    34.9KB     22.4
#> 5 new_mthd     1.59ms    1.7ms      556.    62.9KB     20.8
bm |> ggplot2::autoplot()

Created on 2024-01-25 with reprex v2.1.0

It seems to be a nice performance enhancement, with results comparable to urltools. However, this is only a small benchmark on my computer.

I am happy to raise a PR for this if the changes make sense :)

Note: it passes the test-url.R unit tests as well :)
