diff --git a/pom.xml b/pom.xml
index 5a48a842a..c33e206f4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
     <artifactId>crawler4j</artifactId>
     <packaging>jar</packaging>
     <name>crawler4j</name>
-    <version>4.3-SNAPSHOT</version>
+    <version>4.3-SNAPSHOT-code911</version>
     <description>Open Source Web Crawler for Java</description>
     <url>https://github.com/yasserg/crawler4j</url>
@@ -138,7 +138,7 @@
         <dependency>
             <groupId>org.apache.tika</groupId>
             <artifactId>tika-parsers</artifactId>
-            <version>1.5</version>
+            <version>1.11</version>
         </dependency>
diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index dcc9b37d1..ca368d3a5 100644
--- a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -109,7 +109,7 @@ public class WebCrawler implements Runnable {
    * @param crawlController
    *            the controller that manages this crawling session
    */
-  public void init(int id, CrawlController crawlController) {
+  public void init(int id, CrawlController crawlController) throws InstantiationException, IllegalAccessException {
     this.myId = id;
     this.pageFetcher = crawlController.getPageFetcher();
     this.robotstxtServer = crawlController.getRobotstxtServer();
diff --git a/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java b/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
index 5d3e6304f..ed49870b0 100644
--- a/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
+++ b/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
@@ -228,8 +228,9 @@ public PageFetchResult fetchPage(WebURL webUrl)
       // Applying Politeness delay
       synchronized (mutex) {
         long now = (new Date()).getTime();
-        if ((now - lastFetchTime) < config.getPolitenessDelay()) {
-          Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime));
+        int delay = config.getPolitenessDelay();
+        if ((now - lastFetchTime) < delay) {
+          Thread.sleep(delay - (now - lastFetchTime));
         }
         lastFetchTime = (new Date()).getTime();
       }
diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java b/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java
new file mode 100644
index 000000000..4317ae60d
--- /dev/null
+++ b/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java
@@ -0,0 +1,27 @@
+package edu.uci.ics.crawler4j.parser;
+
+import org.apache.tika.parser.html.HtmlMapper;
+
+/**
+ * Maps all HTML tags (does not discard any of them).
+ *
+ * @author Andrey Nikolaev (vajadhava@gmail.com)
+ */
+public class AllTagMapper implements HtmlMapper {
+
+    @Override
+    public String mapSafeElement(String name) {
+        return name.toLowerCase();
+    }
+
+    @Override
+    public boolean isDiscardElement(String name) {
+        return false;
+    }
+
+    @Override
+    public String mapSafeAttribute(String elementName, String attributeName) {
+        return attributeName.toLowerCase();
+    }
+
+}
diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
index 2d3d17b6a..e5e0c4a4c 100644
--- a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
+++ b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
@@ -32,6 +32,7 @@ public class HtmlContentHandler extends DefaultHandler {
 
   private enum Element {
     A,
+    SCRIPT,
     AREA,
     LINK,
     IFRAME,
@@ -89,11 +90,10 @@ public void startElement(String uri, String localName, String qName, Attributes
         addToOutgoingUrls(href, localName);
 
       }
-    } else if (element == Element.IMG) {
-      String imgSrc = attributes.getValue("src");
-      if (imgSrc != null) {
-        addToOutgoingUrls(imgSrc, localName);
-
+    } else if ((element == Element.IMG) || (element == Element.SCRIPT)) {
+      String src = attributes.getValue("src");
+      if (src != null) {
+        addToOutgoingUrls(src, localName);
       }
     } else if ((element == Element.IFRAME) || (element == Element.FRAME) || (element == Element.EMBED)) {
       String src = attributes.getValue("src");
diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
index aca2778bc..dbe36174a 100644
--- a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
+++ b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
@@ -1,20 +1,15 @@
 /**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE
+ * file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file
+ * to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
+ * License. You may obtain a copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
  */
-
 package edu.uci.ics.crawler4j.parser;
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
@@ -39,122 +34,125 @@
 import edu.uci.ics.crawler4j.url.WebURL;
 import edu.uci.ics.crawler4j.util.Net;
 import edu.uci.ics.crawler4j.util.Util;
+import org.apache.tika.parser.html.HtmlMapper;
 
 /**
  * @author Yasser Ganjisaffar
  */
 public class Parser extends Configurable {
 
-  protected static final Logger logger = LoggerFactory.getLogger(Parser.class);
-
-  private final HtmlParser htmlParser;
-  private final ParseContext parseContext;
-
-  public Parser(CrawlConfig config) {
-    super(config);
-    htmlParser = new HtmlParser();
-    parseContext = new ParseContext();
-  }
-
-  public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
-    if (Util.hasBinaryContent(page.getContentType())) { // BINARY
-      BinaryParseData parseData = new BinaryParseData();
-      if (config.isIncludeBinaryContentInCrawling()) {
-        if (config.isProcessBinaryContentInCrawling()) {
-          parseData.setBinaryContent(page.getContentData());
-        } else {
-          parseData.setHtml("<html></html>");
-        }
-        page.setParseData(parseData);
-        if (parseData.getHtml() == null) {
-          throw new ParseException();
-        }
-        parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
-      } else {
-        throw new NotAllowedContentException();
-      }
-    } else if (Util.hasPlainTextContent(page.getContentType())) { // plain Text
-      try {
-        TextParseData parseData = new TextParseData();
-        if (page.getContentCharset() == null) {
-          parseData.setTextContent(new String(page.getContentData()));
-        } else {
-          parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
-        }
-        parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
-        page.setParseData(parseData);
-      } catch (Exception e) {
-        logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
-        throw new ParseException();
-      }
-    } else { // isHTML
-      Metadata metadata = new Metadata();
-      HtmlContentHandler contentHandler = new HtmlContentHandler();
-      try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
-        htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
-      } catch (Exception e) {
-        logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
-        throw new ParseException();
-      }
-
-      if (page.getContentCharset() == null) {
-        page.setContentCharset(metadata.get("Content-Encoding"));
-      }
-
-      HtmlParseData parseData = new HtmlParseData();
-      parseData.setText(contentHandler.getBodyText().trim());
-      parseData.setTitle(metadata.get(DublinCore.TITLE));
-      parseData.setMetaTags(contentHandler.getMetaTags());
-      // Please note that identifying language takes less than 10 milliseconds
-      LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
-      page.setLanguage(languageIdentifier.getLanguage());
-
-      Set<WebURL> outgoingUrls = new HashSet<>();
-
-      String baseURL = contentHandler.getBaseUrl();
-      if (baseURL != null) {
-        contextURL = baseURL;
-      }
-
-      int urlCount = 0;
-      for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
-
-        String href = urlAnchorPair.getHref();
-        if ((href == null) || href.trim().isEmpty()) {
-          continue;
-        }
+    protected static final Logger logger = LoggerFactory.getLogger(Parser.class);
+
+    private final HtmlParser htmlParser;
+    private final ParseContext parseContext;
 
-        String hrefLoweredCase = href.trim().toLowerCase();
-        if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:") &&
-            !hrefLoweredCase.contains("@")) {
-          String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
-          if (url != null) {
-            WebURL webURL = new WebURL();
-            webURL.setURL(url);
-            webURL.setTag(urlAnchorPair.getTag());
-            webURL.setAnchor(urlAnchorPair.getAnchor());
-            outgoingUrls.add(webURL);
-            urlCount++;
-            if (urlCount > config.getMaxOutgoingLinksToFollow()) {
-              break;
+    public Parser(CrawlConfig config) throws InstantiationException, IllegalAccessException {
+        super(config);
+        htmlParser = new HtmlParser();
+        parseContext = new ParseContext();
+        // override the default HTML mapping so that all tags are mapped (for example the script tag)
+        parseContext.set(HtmlMapper.class, AllTagMapper.class.newInstance());
+    }
+
+    public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
+        if (Util.hasBinaryContent(page.getContentType())) { // BINARY
+            BinaryParseData parseData = new BinaryParseData();
+            if (config.isIncludeBinaryContentInCrawling()) {
+                if (config.isProcessBinaryContentInCrawling()) {
+                    parseData.setBinaryContent(page.getContentData());
+                } else {
+                    parseData.setHtml("<html></html>");
+                }
+                page.setParseData(parseData);
+                if (parseData.getHtml() == null) {
+                    throw new ParseException();
+                }
+                parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
+            } else {
+                throw new NotAllowedContentException();
             }
-          }
-        }
-      }
-      parseData.setOutgoingUrls(outgoingUrls);
-
-      try {
-        if (page.getContentCharset() == null) {
-          parseData.setHtml(new String(page.getContentData()));
-        } else {
-          parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
-        }
+        } else if (Util.hasPlainTextContent(page.getContentType())) { // plain Text
+            try {
+                TextParseData parseData = new TextParseData();
+                if (page.getContentCharset() == null) {
+                    parseData.setTextContent(new String(page.getContentData()));
+                } else {
+                    parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
+                }
+                parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
+                page.setParseData(parseData);
+            } catch (Exception e) {
+                logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
+                throw new ParseException();
+            }
+        } else { // isHTML
+            Metadata metadata = new Metadata();
+            HtmlContentHandler contentHandler = new HtmlContentHandler();
+            try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
+                htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
+            } catch (Exception e) {
+                logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
+                throw new ParseException();
+            }
+
+            if (page.getContentCharset() == null) {
+                page.setContentCharset(metadata.get("Content-Encoding"));
+            }
+
+            HtmlParseData parseData = new HtmlParseData();
+            parseData.setText(contentHandler.getBodyText().trim());
+            parseData.setTitle(metadata.get(DublinCore.TITLE));
+            parseData.setMetaTags(contentHandler.getMetaTags());
+            // Please note that identifying language takes less than 10 milliseconds
+            LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
+            page.setLanguage(languageIdentifier.getLanguage());
+
+            Set<WebURL> outgoingUrls = new HashSet<>();
 
-        page.setParseData(parseData);
-      } catch (UnsupportedEncodingException e) {
-        logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
-        throw new ParseException();
-      }
+            String baseURL = contentHandler.getBaseUrl();
+            if (baseURL != null) {
+                contextURL = baseURL;
+            }
+
+            int urlCount = 0;
+            for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
+
+                String href = urlAnchorPair.getHref();
+                if ((href == null) || href.trim().isEmpty()) {
+                    continue;
+                }
+
+                String hrefLoweredCase = href.trim().toLowerCase();
+                if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:")
+                        && !hrefLoweredCase.contains("@")) {
+                    String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
+                    if (url != null) {
+                        WebURL webURL = new WebURL();
+                        webURL.setURL(url);
+                        webURL.setTag(urlAnchorPair.getTag());
+                        webURL.setAnchor(urlAnchorPair.getAnchor());
+                        outgoingUrls.add(webURL);
+                        urlCount++;
+                        if (urlCount > config.getMaxOutgoingLinksToFollow()) {
+                            break;
+                        }
+                    }
+                }
+            }
+            parseData.setOutgoingUrls(outgoingUrls);
+
+            try {
+                if (page.getContentCharset() == null) {
+                    parseData.setHtml(new String(page.getContentData()));
+                } else {
+                    parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
+                }
+
+                page.setParseData(parseData);
+            } catch (UnsupportedEncodingException e) {
+                logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
+                throw new ParseException();
+            }
+        }
     }
-  }
-}
\ No newline at end of file
+}
diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java b/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java
index 3879735d4..1e50b4d90 100644
--- a/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java
+++ b/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java
@@ -40,13 +40,13 @@ public class Downloader {
   private final Parser parser;
   private final PageFetcher pageFetcher;
 
-  public Downloader() {
+  public Downloader() throws InstantiationException, IllegalAccessException {
     CrawlConfig config = new CrawlConfig();
     parser = new Parser(config);
     pageFetcher = new PageFetcher(config);
   }
 
-  public static void main(String[] args) {
+  public static void main(String[] args) throws InstantiationException, IllegalAccessException {
     Downloader downloader = new Downloader();
     downloader.processUrl("http://en.wikipedia.org/wiki/Main_Page/");
     downloader.processUrl("http://www.yahoo.com/");
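
Usage note (reviewer sketch, not part of the patch): with AllTagMapper registered in the ParseContext, Tika no longer discards script elements, so HtmlContentHandler sees them, records their src attribute, and the links show up in HtmlParseData.getOutgoingUrls() tagged "script". Anything that constructs Parser directly (like the Downloader example above) now has to declare or handle InstantiationException and IllegalAccessException. The sketch below shows how a crawler built on the patched code could pick up script links; the class name, storage folder, and seed URL are placeholders, and the controller wiring is the stock crawler4j 4.x API, whose CrawlController constructor already declares throws Exception.

    import edu.uci.ics.crawler4j.crawler.CrawlConfig;
    import edu.uci.ics.crawler4j.crawler.CrawlController;
    import edu.uci.ics.crawler4j.crawler.Page;
    import edu.uci.ics.crawler4j.crawler.WebCrawler;
    import edu.uci.ics.crawler4j.fetcher.PageFetcher;
    import edu.uci.ics.crawler4j.parser.HtmlParseData;
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
    import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
    import edu.uci.ics.crawler4j.url.WebURL;

    public class ScriptLinkCrawler extends WebCrawler {

        @Override
        public boolean shouldVisit(Page referringPage, WebURL url) {
            // placeholder filter: stay on the seed host
            return url.getURL().startsWith("http://example.com/");
        }

        @Override
        public void visit(Page page) {
            if (page.getParseData() instanceof HtmlParseData) {
                HtmlParseData htmlParseData = (HtmlParseData) page.getParseData();
                for (WebURL link : htmlParseData.getOutgoingUrls()) {
                    // "script" is the tag HtmlContentHandler records for <script src=...> elements
                    if ("script".equals(link.getTag())) {
                        logger.info("script resource: {}", link.getURL());
                    }
                }
            }
        }

        public static void main(String[] args) throws Exception {
            CrawlConfig config = new CrawlConfig();
            config.setCrawlStorageFolder("/tmp/crawler4j-script-demo"); // placeholder folder
            PageFetcher pageFetcher = new PageFetcher(config);
            RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
            CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
            controller.addSeed("http://example.com/"); // placeholder seed
            controller.start(ScriptLinkCrawler.class, 1);
        }
    }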