diff --git a/pom.xml b/pom.xml
index 5a48a842a..c33e206f4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -4,7 +4,7 @@
<artifactId>crawler4j</artifactId>
<packaging>jar</packaging>
<name>crawler4j</name>
- <version>4.3-SNAPSHOT</version>
+ <version>4.3-SNAPSHOT-code911</version>
<description>Open Source Web Crawler for Java</description>
<url>https://github.com/yasserg/crawler4j</url>
@@ -138,7 +138,7 @@
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
- <version>1.5</version>
+ <version>1.11</version>
diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index dcc9b37d1..ca368d3a5 100644
--- a/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -109,7 +109,7 @@ public class WebCrawler implements Runnable {
* @param crawlController
* the controller that manages this crawling session
*/
- public void init(int id, CrawlController crawlController) {
+ public void init(int id, CrawlController crawlController) throws InstantiationException, IllegalAccessException {
this.myId = id;
this.pageFetcher = crawlController.getPageFetcher();
this.robotstxtServer = crawlController.getRobotstxtServer();
diff --git a/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java b/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
index 5d3e6304f..ed49870b0 100644
--- a/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
+++ b/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
@@ -228,8 +228,9 @@ public PageFetchResult fetchPage(WebURL webUrl)
// Applying Politeness delay
synchronized (mutex) {
long now = (new Date()).getTime();
- if ((now - lastFetchTime) < config.getPolitenessDelay()) {
- Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime));
+ int delay = config.getPolitenessDelay();
+ if ((now - lastFetchTime) < delay) {
+ Thread.sleep(delay - (now - lastFetchTime));
}
lastFetchTime = (new Date()).getTime();
}
diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java b/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java
new file mode 100644
index 000000000..4317ae60d
--- /dev/null
+++ b/src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java
@@ -0,0 +1,27 @@
+package edu.uci.ics.crawler4j.parser;
+
+import org.apache.tika.parser.html.HtmlMapper;
+
+/**
+ * Maps all HTML tags and does not discard any of them.
+ *
+ * @author Andrey Nikolaev (vajadhava@gmail.com)
+ */
+public class AllTagMapper implements HtmlMapper {
+
+ @Override
+ public String mapSafeElement(String name) {
+ return name.toLowerCase();
+ }
+
+ @Override
+ public boolean isDiscardElement(String name) {
+ return false;
+ }
+
+ @Override
+ public String mapSafeAttribute(String elementName, String attributeName) {
+ return attributeName.toLowerCase();
+ }
+
+}
diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
index 2d3d17b6a..e5e0c4a4c 100644
--- a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
+++ b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java
@@ -32,6 +32,7 @@ public class HtmlContentHandler extends DefaultHandler {
private enum Element {
A,
+ SCRIPT,
AREA,
LINK,
IFRAME,
@@ -89,11 +90,10 @@ public void startElement(String uri, String localName, String qName, Attributes
addToOutgoingUrls(href, localName);
}
- } else if (element == Element.IMG) {
- String imgSrc = attributes.getValue("src");
- if (imgSrc != null) {
- addToOutgoingUrls(imgSrc, localName);
-
+ } else if ((element == Element.IMG) || (element == Element.SCRIPT)) {
+ String src = attributes.getValue("src");
+ if (src != null) {
+ addToOutgoingUrls(src, localName);
}
} else if ((element == Element.IFRAME) || (element == Element.FRAME) || (element == Element.EMBED)) {
String src = attributes.getValue("src");
diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
index aca2778bc..dbe36174a 100644
--- a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
+++ b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
@@ -1,20 +1,15 @@
/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE
+ * file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file
+ * to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
+ * License. You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
+ * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+ * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations under the License.
*/
-
package edu.uci.ics.crawler4j.parser;
import java.io.ByteArrayInputStream;
@@ -39,122 +34,125 @@
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Net;
import edu.uci.ics.crawler4j.util.Util;
+import org.apache.tika.parser.html.HtmlMapper;
/**
* @author Yasser Ganjisaffar
*/
public class Parser extends Configurable {
- protected static final Logger logger = LoggerFactory.getLogger(Parser.class);
-
- private final HtmlParser htmlParser;
- private final ParseContext parseContext;
-
- public Parser(CrawlConfig config) {
- super(config);
- htmlParser = new HtmlParser();
- parseContext = new ParseContext();
- }
-
- public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
- if (Util.hasBinaryContent(page.getContentType())) { // BINARY
- BinaryParseData parseData = new BinaryParseData();
- if (config.isIncludeBinaryContentInCrawling()) {
- if (config.isProcessBinaryContentInCrawling()) {
- parseData.setBinaryContent(page.getContentData());
- } else {
- parseData.setHtml("");
- }
- page.setParseData(parseData);
- if (parseData.getHtml() == null) {
- throw new ParseException();
- }
- parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
- } else {
- throw new NotAllowedContentException();
- }
- } else if (Util.hasPlainTextContent(page.getContentType())) { // plain Text
- try {
- TextParseData parseData = new TextParseData();
- if (page.getContentCharset() == null) {
- parseData.setTextContent(new String(page.getContentData()));
- } else {
- parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
- }
- parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
- page.setParseData(parseData);
- } catch (Exception e) {
- logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
- throw new ParseException();
- }
- } else { // isHTML
- Metadata metadata = new Metadata();
- HtmlContentHandler contentHandler = new HtmlContentHandler();
- try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
- htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
- } catch (Exception e) {
- logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
- throw new ParseException();
- }
-
- if (page.getContentCharset() == null) {
- page.setContentCharset(metadata.get("Content-Encoding"));
- }
-
- HtmlParseData parseData = new HtmlParseData();
- parseData.setText(contentHandler.getBodyText().trim());
- parseData.setTitle(metadata.get(DublinCore.TITLE));
- parseData.setMetaTags(contentHandler.getMetaTags());
- // Please note that identifying language takes less than 10 milliseconds
- LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
- page.setLanguage(languageIdentifier.getLanguage());
-
- Set<WebURL> outgoingUrls = new HashSet<>();
-
- String baseURL = contentHandler.getBaseUrl();
- if (baseURL != null) {
- contextURL = baseURL;
- }
-
- int urlCount = 0;
- for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
-
- String href = urlAnchorPair.getHref();
- if ((href == null) || href.trim().isEmpty()) {
- continue;
- }
+ protected static final Logger logger = LoggerFactory.getLogger(Parser.class);
+
+ private final HtmlParser htmlParser;
+ private final ParseContext parseContext;
- String hrefLoweredCase = href.trim().toLowerCase();
- if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:") &&
- !hrefLoweredCase.contains("@")) {
- String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
- if (url != null) {
- WebURL webURL = new WebURL();
- webURL.setURL(url);
- webURL.setTag(urlAnchorPair.getTag());
- webURL.setAnchor(urlAnchorPair.getAnchor());
- outgoingUrls.add(webURL);
- urlCount++;
- if (urlCount > config.getMaxOutgoingLinksToFollow()) {
- break;
+ public Parser(CrawlConfig config) throws InstantiationException, IllegalAccessException {
+ super(config);
+ htmlParser = new HtmlParser();
+ parseContext = new ParseContext();
+ // override the default HTML mapping so that all HTML tags are mapped (for example, the script tag)
+ parseContext.set(HtmlMapper.class, AllTagMapper.class.newInstance());
+ }
+
+ public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
+ if (Util.hasBinaryContent(page.getContentType())) { // BINARY
+ BinaryParseData parseData = new BinaryParseData();
+ if (config.isIncludeBinaryContentInCrawling()) {
+ if (config.isProcessBinaryContentInCrawling()) {
+ parseData.setBinaryContent(page.getContentData());
+ } else {
+ parseData.setHtml("");
+ }
+ page.setParseData(parseData);
+ if (parseData.getHtml() == null) {
+ throw new ParseException();
+ }
+ parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
+ } else {
+ throw new NotAllowedContentException();
}
- }
- }
- }
- parseData.setOutgoingUrls(outgoingUrls);
-
- try {
- if (page.getContentCharset() == null) {
- parseData.setHtml(new String(page.getContentData()));
- } else {
- parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
- }
+ } else if (Util.hasPlainTextContent(page.getContentType())) { // plain Text
+ try {
+ TextParseData parseData = new TextParseData();
+ if (page.getContentCharset() == null) {
+ parseData.setTextContent(new String(page.getContentData()));
+ } else {
+ parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
+ }
+ parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
+ page.setParseData(parseData);
+ } catch (Exception e) {
+ logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
+ throw new ParseException();
+ }
+ } else { // isHTML
+ Metadata metadata = new Metadata();
+ HtmlContentHandler contentHandler = new HtmlContentHandler();
+ try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
+ htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
+ } catch (Exception e) {
+ logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
+ throw new ParseException();
+ }
+
+ if (page.getContentCharset() == null) {
+ page.setContentCharset(metadata.get("Content-Encoding"));
+ }
+
+ HtmlParseData parseData = new HtmlParseData();
+ parseData.setText(contentHandler.getBodyText().trim());
+ parseData.setTitle(metadata.get(DublinCore.TITLE));
+ parseData.setMetaTags(contentHandler.getMetaTags());
+ // Please note that identifying language takes less than 10 milliseconds
+ LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
+ page.setLanguage(languageIdentifier.getLanguage());
+
+ Set<WebURL> outgoingUrls = new HashSet<>();
- page.setParseData(parseData);
- } catch (UnsupportedEncodingException e) {
- logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
- throw new ParseException();
- }
+ String baseURL = contentHandler.getBaseUrl();
+ if (baseURL != null) {
+ contextURL = baseURL;
+ }
+
+ int urlCount = 0;
+ for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
+
+ String href = urlAnchorPair.getHref();
+ if ((href == null) || href.trim().isEmpty()) {
+ continue;
+ }
+
+ String hrefLoweredCase = href.trim().toLowerCase();
+ if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:")
+ && !hrefLoweredCase.contains("@")) {
+ String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
+ if (url != null) {
+ WebURL webURL = new WebURL();
+ webURL.setURL(url);
+ webURL.setTag(urlAnchorPair.getTag());
+ webURL.setAnchor(urlAnchorPair.getAnchor());
+ outgoingUrls.add(webURL);
+ urlCount++;
+ if (urlCount > config.getMaxOutgoingLinksToFollow()) {
+ break;
+ }
+ }
+ }
+ }
+ parseData.setOutgoingUrls(outgoingUrls);
+
+ try {
+ if (page.getContentCharset() == null) {
+ parseData.setHtml(new String(page.getContentData()));
+ } else {
+ parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
+ }
+
+ page.setParseData(parseData);
+ } catch (UnsupportedEncodingException e) {
+ logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
+ throw new ParseException();
+ }
+ }
}
- }
-}
\ No newline at end of file
+}
diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java b/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java
index 3879735d4..1e50b4d90 100644
--- a/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java
+++ b/src/test/java/edu/uci/ics/crawler4j/examples/localdata/Downloader.java
@@ -40,13 +40,13 @@ public class Downloader {
private final Parser parser;
private final PageFetcher pageFetcher;
- public Downloader() {
+ public Downloader() throws InstantiationException, IllegalAccessException {
CrawlConfig config = new CrawlConfig();
parser = new Parser(config);
pageFetcher = new PageFetcher(config);
}
- public static void main(String[] args) {
+ public static void main(String[] args) throws InstantiationException, IllegalAccessException {
Downloader downloader = new Downloader();
downloader.processUrl("http://en.wikipedia.org/wiki/Main_Page/");
downloader.processUrl("http://www.yahoo.com/");
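
A minimal standalone sketch of what the new AllTagMapper enables (not part of this diff; it assumes Tika 1.11 on the classpath and the AllTagMapper class added above). Registering the mapper on a ParseContext makes Tika's HtmlParser forward elements it would normally discard, such as <script>, to the content handler, which is what lets HtmlContentHandler pick up script src URLs:

import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;

import edu.uci.ics.crawler4j.parser.AllTagMapper;

public class AllTagMapperDemo {
    public static void main(String[] args) throws Exception {
        // Register the custom mapper; without it Tika's HtmlParser drops
        // "unsafe" elements such as <script> before they reach the handler.
        ParseContext context = new ParseContext();
        context.set(HtmlMapper.class, new AllTagMapper());

        String html = "<html><body><script src=\"/app.js\"></script><p>hello</p></body></html>";
        BodyContentHandler handler = new BodyContentHandler();
        new HtmlParser().parse(
                new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8)),
                handler, new Metadata(), context);

        // With AllTagMapper registered the script element is mapped through,
        // so a handler like crawler4j's HtmlContentHandler can read its src attribute.
        System.out.println(handler);
    }
}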