From a42de3c5001cf611bbb24fae98a993d1a77fc923 Mon Sep 17 00:00:00 2001 From: lisalevinson Date: Wed, 10 Aug 2022 17:52:00 -0400 Subject: [PATCH] escape hyphen in URL regex updated to be compatible with R 4.0 and PCRE2 --- R/word_count.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/word_count.R b/R/word_count.R index 2235470..e3538fa 100644 --- a/R/word_count.R +++ b/R/word_count.R @@ -51,7 +51,7 @@ function( ## unnest_tokens() splits URLs by default into multiple tokens if (!isTRUE(split_urls)) { # borrowed from: https://stackoverflow.com/a/8234912/2338862 - url_regex <- "((([A-Za-z]{3,9}:(?:\\/\\/)?)(?:[-;:&=+$,\\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=+$,\\w]+@)[A-Za-z0-9.-]+)((?:\\/[\\+~%\\/.\\w-_]*)?\\??(?:[-\\\\+=&;%@.\\w_]*)#?(?:[\\w]*))?)" + url_regex <- "((([A-Za-z]{3,9}:(?:\\/\\/)?)(?:[-;:&=+$,\\w]+@)?[A-Za-z0-9.-]+|(?:www.|[-;:&=+$,\\w]+@)[A-Za-z0-9.-]+)((?:\\/[\\+~%\\/.\\w\\-_]*)?\\??(?:[-\\\\+=&;%@.\\w_]*)#?(?:[\\w]*))?)" char <- gsub(url_regex, "URL", char, perl = TRUE) }