From f735abded361d5ca375eefde595de7a3b94cd4f1 Mon Sep 17 00:00:00 2001
From: MihalisP
Date: Sun, 31 May 2020 22:30:33 +0300
Subject: [PATCH] added disregarded protocols

Extend the href filter in getOutgoingUrls so that links whose lowercased
href contains "about:", "tel:", "data:", "whatsapp:", "viber:", "sms:",
"android-app:" or "fb-messenger:" are skipped, alongside the existing
"javascript:", "mailto:" and "@" checks.

---
Notes (not part of the commit message):
 * The "fb-messenger:" test appears once; a second, identical test at the
   end of the condition was redundant and has been dropped.
 * Leading whitespace of the hunk lines was reconstructed and should be
   checked against the target file before applying.
 * The post-image blob id on the index line was not recomputed.

 .../java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java
index afa9c47e0..7bbac546c 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/TikaHtmlParser.java
@@ -101,8 +101,12 @@ private Set getOutgoingUrls(String contextURL, HtmlContentHandler conten
             }
 
             String hrefLoweredCase = href.trim().toLowerCase();
-            if (!hrefLoweredCase.contains("javascript:") &&
-                !hrefLoweredCase.contains("mailto:") && !hrefLoweredCase.contains("@")) {
+            if (!hrefLoweredCase.contains("about:") && !hrefLoweredCase.contains("tel:") &&
+                !hrefLoweredCase.contains("data:") && !hrefLoweredCase.contains("whatsapp:") &&
+                !hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("viber:") &&
+                !hrefLoweredCase.contains("sms:") && !hrefLoweredCase.contains("android-app:") &&
+                !hrefLoweredCase.contains("fb-messenger:") && !hrefLoweredCase.contains("mailto:") &&
+                !hrefLoweredCase.contains("@")) {
                 // Prefer page's content charset to encode href url
                 Charset hrefCharset = ((contentCharset == null) || contentCharset.isEmpty()) ?
                                       StandardCharsets.UTF_8 : Charset.forName(contentCharset);