Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
<artifactId>crawler4j</artifactId>
<packaging>jar</packaging>
<name>crawler4j</name>
<version>4.3-SNAPSHOT</version>
<version>4.3-SNAPSHOT-code911</version>
<description>Open Source Web Crawler for Java</description>
<url>https://github.com/yasserg/crawler4j</url>
<licenses>
Expand Down Expand Up @@ -138,7 +138,7 @@
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-parsers</artifactId>
<version>1.5</version>
<version>1.11</version>
</dependency>

<!-- Test Dependencies -->
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ public class WebCrawler implements Runnable {
* @param crawlController
* the controller that manages this crawling session
*/
public void init(int id, CrawlController crawlController) {
public void init(int id, CrawlController crawlController) throws InstantiationException, IllegalAccessException {
this.myId = id;
this.pageFetcher = crawlController.getPageFetcher();
this.robotstxtServer = crawlController.getRobotstxtServer();
Expand Down
5 changes: 3 additions & 2 deletions src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -228,8 +228,9 @@ public PageFetchResult fetchPage(WebURL webUrl)
// Applying Politeness delay
synchronized (mutex) {
long now = (new Date()).getTime();
if ((now - lastFetchTime) < config.getPolitenessDelay()) {
Thread.sleep(config.getPolitenessDelay() - (now - lastFetchTime));
int delay = config.getPolitenessDelay();
if ((now - lastFetchTime) < delay) {
Thread.sleep(delay - (now - lastFetchTime));
}
lastFetchTime = (new Date()).getTime();
}
Expand Down
27 changes: 27 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/parser/AllTagMapper.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package edu.uci.ics.crawler4j.parser;

import java.util.Locale;

import org.apache.tika.parser.html.HtmlMapper;

/**
 * An {@link HtmlMapper} that accepts every HTML element and attribute,
 * instead of discarding the ones Tika's default mapper considers unsafe
 * (for example {@code <script>} tags).
 *
 * @author Andrey Nikolaev ([email protected])
 */
public class AllTagMapper implements HtmlMapper {

    @Override
    public String mapSafeElement(String name) {
        // Use Locale.ROOT: HTML tag names are ASCII, and the default-locale
        // toLowerCase() mis-cases "I" under e.g. the Turkish locale.
        return name.toLowerCase(Locale.ROOT);
    }

    @Override
    public boolean isDiscardElement(String name) {
        // Never discard any element — we want every tag to reach the handler.
        return false;
    }

    @Override
    public String mapSafeAttribute(String elementName, String attributeName) {
        // Pass every attribute through, normalized to lowercase (locale-safe).
        return attributeName.toLowerCase(Locale.ROOT);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ public class HtmlContentHandler extends DefaultHandler {

private enum Element {
A,
SCRIPT,
AREA,
LINK,
IFRAME,
Expand Down Expand Up @@ -89,11 +90,10 @@ public void startElement(String uri, String localName, String qName, Attributes
addToOutgoingUrls(href, localName);

}
} else if (element == Element.IMG) {
String imgSrc = attributes.getValue("src");
if (imgSrc != null) {
addToOutgoingUrls(imgSrc, localName);

} else if ((element == Element.IMG) || (element == Element.SCRIPT)) {
String src = attributes.getValue("src");
if (src != null) {
addToOutgoingUrls(src, localName);
}
} else if ((element == Element.IFRAME) || (element == Element.FRAME) || (element == Element.EMBED)) {
String src = attributes.getValue("src");
Expand Down
242 changes: 120 additions & 122 deletions src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
Original file line number Diff line number Diff line change
@@ -1,20 +1,15 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE
* file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file
* to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
* specific language governing permissions and limitations under the License.
*/

package edu.uci.ics.crawler4j.parser;

import java.io.ByteArrayInputStream;
Expand All @@ -39,122 +34,125 @@
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Net;
import edu.uci.ics.crawler4j.util.Util;
import org.apache.tika.parser.html.HtmlMapper;

/**
* @author Yasser Ganjisaffar
*/
public class Parser extends Configurable {

protected static final Logger logger = LoggerFactory.getLogger(Parser.class);

private final HtmlParser htmlParser;
private final ParseContext parseContext;

public Parser(CrawlConfig config) {
super(config);
htmlParser = new HtmlParser();
parseContext = new ParseContext();
}

public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
if (Util.hasBinaryContent(page.getContentType())) { // BINARY
BinaryParseData parseData = new BinaryParseData();
if (config.isIncludeBinaryContentInCrawling()) {
if (config.isProcessBinaryContentInCrawling()) {
parseData.setBinaryContent(page.getContentData());
} else {
parseData.setHtml("<html></html>");
}
page.setParseData(parseData);
if (parseData.getHtml() == null) {
throw new ParseException();
}
parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
} else {
throw new NotAllowedContentException();
}
} else if (Util.hasPlainTextContent(page.getContentType())) { // plain Text
try {
TextParseData parseData = new TextParseData();
if (page.getContentCharset() == null) {
parseData.setTextContent(new String(page.getContentData()));
} else {
parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
}
parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
page.setParseData(parseData);
} catch (Exception e) {
logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
throw new ParseException();
}
} else { // isHTML
Metadata metadata = new Metadata();
HtmlContentHandler contentHandler = new HtmlContentHandler();
try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
} catch (Exception e) {
logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
throw new ParseException();
}

if (page.getContentCharset() == null) {
page.setContentCharset(metadata.get("Content-Encoding"));
}

HtmlParseData parseData = new HtmlParseData();
parseData.setText(contentHandler.getBodyText().trim());
parseData.setTitle(metadata.get(DublinCore.TITLE));
parseData.setMetaTags(contentHandler.getMetaTags());
// Please note that identifying language takes less than 10 milliseconds
LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
page.setLanguage(languageIdentifier.getLanguage());

Set<WebURL> outgoingUrls = new HashSet<>();

String baseURL = contentHandler.getBaseUrl();
if (baseURL != null) {
contextURL = baseURL;
}

int urlCount = 0;
for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {

String href = urlAnchorPair.getHref();
if ((href == null) || href.trim().isEmpty()) {
continue;
}
protected static final Logger logger = LoggerFactory.getLogger(Parser.class);

private final HtmlParser htmlParser;
private final ParseContext parseContext;

String hrefLoweredCase = href.trim().toLowerCase();
if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:") &&
!hrefLoweredCase.contains("@")) {
String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
if (url != null) {
WebURL webURL = new WebURL();
webURL.setURL(url);
webURL.setTag(urlAnchorPair.getTag());
webURL.setAnchor(urlAnchorPair.getAnchor());
outgoingUrls.add(webURL);
urlCount++;
if (urlCount > config.getMaxOutgoingLinksToFollow()) {
break;
public Parser(CrawlConfig config) throws InstantiationException, IllegalAccessException {
super(config);
htmlParser = new HtmlParser();
parseContext = new ParseContext();
// Override the default HTML mapping so that all HTML tags are mapped (for example, the script tag)
parseContext.set(HtmlMapper.class, AllTagMapper.class.newInstance());
}

public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
if (Util.hasBinaryContent(page.getContentType())) { // BINARY
BinaryParseData parseData = new BinaryParseData();
if (config.isIncludeBinaryContentInCrawling()) {
if (config.isProcessBinaryContentInCrawling()) {
parseData.setBinaryContent(page.getContentData());
} else {
parseData.setHtml("<html></html>");
}
page.setParseData(parseData);
if (parseData.getHtml() == null) {
throw new ParseException();
}
parseData.setOutgoingUrls(Net.extractUrls(parseData.getHtml()));
} else {
throw new NotAllowedContentException();
}
}
}
}
parseData.setOutgoingUrls(outgoingUrls);

try {
if (page.getContentCharset() == null) {
parseData.setHtml(new String(page.getContentData()));
} else {
parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
}
} else if (Util.hasPlainTextContent(page.getContentType())) { // plain Text
try {
TextParseData parseData = new TextParseData();
if (page.getContentCharset() == null) {
parseData.setTextContent(new String(page.getContentData()));
} else {
parseData.setTextContent(new String(page.getContentData(), page.getContentCharset()));
}
parseData.setOutgoingUrls(Net.extractUrls(parseData.getTextContent()));
page.setParseData(parseData);
} catch (Exception e) {
logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
throw new ParseException();
}
} else { // isHTML
Metadata metadata = new Metadata();
HtmlContentHandler contentHandler = new HtmlContentHandler();
try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
htmlParser.parse(inputStream, contentHandler, metadata, parseContext);
} catch (Exception e) {
logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL());
throw new ParseException();
}

if (page.getContentCharset() == null) {
page.setContentCharset(metadata.get("Content-Encoding"));
}

HtmlParseData parseData = new HtmlParseData();
parseData.setText(contentHandler.getBodyText().trim());
parseData.setTitle(metadata.get(DublinCore.TITLE));
parseData.setMetaTags(contentHandler.getMetaTags());
// Please note that identifying language takes less than 10 milliseconds
LanguageIdentifier languageIdentifier = new LanguageIdentifier(parseData.getText());
page.setLanguage(languageIdentifier.getLanguage());

Set<WebURL> outgoingUrls = new HashSet<>();

page.setParseData(parseData);
} catch (UnsupportedEncodingException e) {
logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
throw new ParseException();
}
String baseURL = contentHandler.getBaseUrl();
if (baseURL != null) {
contextURL = baseURL;
}

int urlCount = 0;
for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {

String href = urlAnchorPair.getHref();
if ((href == null) || href.trim().isEmpty()) {
continue;
}

String hrefLoweredCase = href.trim().toLowerCase();
if (!hrefLoweredCase.contains("javascript:") && !hrefLoweredCase.contains("mailto:")
&& !hrefLoweredCase.contains("@")) {
String url = URLCanonicalizer.getCanonicalURL(href, contextURL);
if (url != null) {
WebURL webURL = new WebURL();
webURL.setURL(url);
webURL.setTag(urlAnchorPair.getTag());
webURL.setAnchor(urlAnchorPair.getAnchor());
outgoingUrls.add(webURL);
urlCount++;
if (urlCount > config.getMaxOutgoingLinksToFollow()) {
break;
}
}
}
}
parseData.setOutgoingUrls(outgoingUrls);

try {
if (page.getContentCharset() == null) {
parseData.setHtml(new String(page.getContentData()));
} else {
parseData.setHtml(new String(page.getContentData(), page.getContentCharset()));
}

page.setParseData(parseData);
} catch (UnsupportedEncodingException e) {
logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
throw new ParseException();
}
}
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,13 @@ public class Downloader {
private final Parser parser;
private final PageFetcher pageFetcher;

public Downloader() {
public Downloader() throws InstantiationException, IllegalAccessException {
CrawlConfig config = new CrawlConfig();
parser = new Parser(config);
pageFetcher = new PageFetcher(config);
}

public static void main(String[] args) {
public static void main(String[] args) throws InstantiationException, IllegalAccessException {
Downloader downloader = new Downloader();
downloader.processUrl("http://en.wikipedia.org/wiki/Main_Page/");
downloader.processUrl("http://www.yahoo.com/");
Expand Down