Skip to content

Commit

Permalink
Implement req_perform_spider()
Browse files Browse the repository at this point in the history
Fixes#456
  • Loading branch information
hadley committed Nov 15, 2024
1 parent d9eb6f7 commit da6754e
Showing 1 changed file with 149 additions and 0 deletions.
149 changes: 149 additions & 0 deletions R/spider.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#' @param hash_key A function with argument `req` that returns the components
#' of the request that should be used for computing equality. By default,
#' `hash_key` inspects the `url`, `body`, and `headers`, which should be
#' adequate for most needs.
#' @param progress_label A 1function with `req` that returns a string used to
#' label the progress bar. The default displays the URL which is most useful
#' for spidering HTML sites.
#' @examples
#' url <- "https://ggplot2.tidyverse.org/"
#' req <- request(url)
#' req_perform_spider(req, next_reqs = spider_descendents(url))
req_perform_spider <- function(
req,
next_reqs,
path = NULL,
on_error = c("stop", "return", "continue"),
hash_key = NULL,
progress = TRUE,
progress_label = NULL
) {

check_request(req)
check_function2(next_reqs, args = c("resp", "req"))
check_string(path, allow_empty = FALSE, allow_null = TRUE)
on_error <- match.arg(on_error)
check_function2(hash_key, args = "req", allow_null = TRUE)
check_function2(progress_label, args = "req", allow_null = TRUE)
check_bool(progress)

Check warning on line 28 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L22-L28

Added lines #L22 - L28 were not covered by tests

hash_key <- hash_key %||% function(req) req[c("url", "body", "headers")]
progress_label <- progress_label %||% function(req) req$url

Check warning on line 31 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L30-L31

Added lines #L30 - L31 were not covered by tests

get_path <- function(hash) {
if (is.null(path)) {
NULL
} else {
glue::glue(path)
}
}

Check warning on line 39 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L33-L39

Added lines #L33 - L39 were not covered by tests

todo <- fastmap::fastqueue()
done <- fastmap::fastmap()
seen <- fastmap::fastmap()

Check warning on line 43 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L41-L43

Added lines #L41 - L43 were not covered by tests

todo$add(req)

Check warning on line 45 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L45

Added line #L45 was not covered by tests

if (progress) {
cli::cli_progress_bar(
type = "custom",
total = NA,
format = "Spidering {done$size()}/{done$size() + todo$size()}: {progress_label(req)}"
)

Check warning on line 52 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L47-L52

Added lines #L47 - L52 were not covered by tests
}

while (todo$size() > 0) {
req <- todo$remove()
if (progress) cli::cli_progress_update()

Check warning on line 57 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L55-L57

Added lines #L55 - L57 were not covered by tests

req_hash <- hash(hash_key(req))
resp <- req_perform(req, path = get_path(req_hash))
done$set(req_hash, resp)
seen$set(req_hash, TRUE)

Check warning on line 62 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L59-L62

Added lines #L59 - L62 were not covered by tests

up_next <- next_reqs(req, resp)
for (req in up_next) {
req_hash <- hash(hash_key(req))
if (!seen$has(req_hash)) {
seen$set(req_hash, TRUE)
todo$add(req)
}
}
}

Check warning on line 72 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L64-L72

Added lines #L64 - L72 were not covered by tests

unname(done$as_list())

Check warning on line 74 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L74

Added line #L74 was not covered by tests
}


#' @export
#' @rdname req_perform_spider
spider_descendents <- function(home_url) {
force(home_url)
function(req, resp) {
html <- resp_body_html(resp)

Check warning on line 83 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L81-L83

Added lines #L81 - L83 were not covered by tests

a <- xml2::xml_find_all(html, "//a[@href]")
href <- xml2::xml_attr(a, "href")
href <- xml2::url_absolute(href, resp_url(resp))
href <- href[map_lgl(href, can_parse)]
href <- map_chr(href, strip_fragment)
href <- unique(href)

Check warning on line 90 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L85-L90

Added lines #L85 - L90 were not covered by tests

descendents <- href[map_lgl(href, url_is_child, home_url)]

Check warning on line 92 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L92

Added line #L92 was not covered by tests

map(descendents, function(path) req_url(req, path))
}

Check warning on line 95 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L94-L95

Added lines #L94 - L95 were not covered by tests
}

url_is_child <- function(child, parent) {
parent <- url_parse(parent)
child <- url_parse(child)

Check warning on line 100 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L99-L100

Added lines #L99 - L100 were not covered by tests

identical(child$scheme, parent$scheme) &&
identical(child$hostname, parent$hostname) &&
identical(child$port, parent$port) &&
path_is_child(child$path, parent$path)

Check warning on line 105 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L102-L105

Added lines #L102 - L105 were not covered by tests
}

# path_is_child("/foo2", "/foo")
# path_is_child("/foo/bar", "/foo")
path_is_child <- function(child, parent) {
parent <- normalize_path(parent)
child <- normalize_path(child)

Check warning on line 112 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L111-L112

Added lines #L111 - L112 were not covered by tests

if (startsWith(child, parent)) {
if (nchar(child) > nchar(parent)) {
i <- nchar(parent) + 1
substring(child, i, i) == "/"

Check warning on line 117 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L114-L117

Added lines #L114 - L117 were not covered by tests
} else {
FALSE

Check warning on line 119 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L119

Added line #L119 was not covered by tests
}
} else {
FALSE

Check warning on line 122 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L122

Added line #L122 was not covered by tests
}
}

normalize_path <- function(path) {
# strip index.html and friends
path <- sub("(index|default)\\.[a-z]+$", "", path, ignore.case = TRUE)

Check warning on line 128 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L128

Added line #L128 was not covered by tests
# strip trailing /
path <- sub("/$", "", path)

Check warning on line 130 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L130

Added line #L130 was not covered by tests
# url_parse ensures it always starts with /
path

Check warning on line 132 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L132

Added line #L132 was not covered by tests
}

strip_fragment <- function(url) {
url <- url_parse(url)
url$fragment <- NULL
url_build(url)

Check warning on line 138 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L136-L138

Added lines #L136 - L138 were not covered by tests
}

can_parse <- function(url) {
tryCatch(
{
url_parse(url)
TRUE
},
error = function(cnd) FALSE
)

Check warning on line 148 in R/spider.R

View check run for this annotation

Codecov / codecov/patch

R/spider.R#L142-L148

Added lines #L142 - L148 were not covered by tests
}

0 comments on commit da6754e

Please sign in to comment.