Retry on ContentFetchError #437

Open · wants to merge 4 commits into master
@@ -213,6 +213,17 @@ public class CrawlConfig {
*/
private CookieStore cookieStore;

/**
* Maximum number of times a failing WebURL will be retried before giving up.
* Default value is zero.
*/
private short maxRetries = 0;

/**
* If true, connections to non-responding servers are retried as long as the maxRetries limit has not been reached.
*/
private boolean allowRetryConnectionError = false;

/**
* DNS resolver to use, {@link SystemDefaultDnsResolver} is default.
*/
@@ -732,6 +743,22 @@ public void setBatchReadSize(int batchReadSize) {
this.batchReadSize = batchReadSize;
}

public short getMaxRetries() {
return maxRetries;
}

public void setMaxRetries(short maxRetries) {
this.maxRetries = maxRetries;
}

public boolean isAllowRetryConnectionError() {
return allowRetryConnectionError;
}

public void setAllowRetryConnectionError(boolean allowRetryConnectionError) {
this.allowRetryConnectionError = allowRetryConnectionError;
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
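For review purposes, a minimal usage sketch of the two new options. The storage folder, seed URL, and MyCrawler class below are placeholders, and the controller wiring is the usual crawler4j setup rather than part of this change:

CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder("/tmp/crawl");        // placeholder storage folder
config.setMaxRetries((short) 3);                   // retry each failing WebURL up to 3 times
config.setAllowRetryConnectionError(true);         // also retry when the connection itself is refused

PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
controller.addSeed("https://www.example.com/");    // placeholder seed
controller.start(MyCrawler.class, 8);              // MyCrawler is a hypothetical WebCrawler subclass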
118 changes: 114 additions & 4 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -18,12 +18,15 @@
package edu.uci.ics.crawler4j.crawler;

import java.io.IOException;
import java.net.ConnectException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.http.HttpStatus;
import org.apache.http.conn.HttpHostConnectException;
import org.apache.http.impl.EnglishReasonPhraseCatalog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -106,6 +109,10 @@ public class WebCrawler implements Runnable {

private int batchReadSize;

private short maxRetries;

private boolean allowRetryConnectionError;

/**
* Initializes the current instance of the crawler
*
@@ -127,6 +134,8 @@ public void init(int id, CrawlController crawlController)
this.myController = crawlController;
this.isWaitingForNewURLs = false;
this.batchReadSize = crawlController.getConfig().getBatchReadSize();
this.maxRetries = crawlController.getConfig().getMaxRetries();
this.allowRetryConnectionError = crawlController.getConfig().isAllowRetryConnectionError();
}

/**
@@ -238,7 +247,7 @@ protected void onUnexpectedStatusCode(String urlStr, int statusCode, String cont
*
* @param webUrl URL which content failed to be fetched
*
* @deprecated use {@link #onContentFetchError(Page)}
* @deprecated use {@link #onContentFetchError(Page, Throwable)}
*/
@Deprecated
protected void onContentFetchError(WebURL webUrl) {
@@ -251,13 +260,41 @@ protected void onContentFetchError(WebURL webUrl) {
* This function is called if the content of a url could not be fetched.
*
* @param page Partial page object
*
* @deprecated use {@link #onContentFetchError(Page, Throwable)}
*/
@Deprecated
protected void onContentFetchError(Page page) {
logger.warn("Can't fetch content of: {}", page.getWebURL().getURL());
onContentFetchError(page.getWebURL());
// Do nothing by default (except basic logging)
// Subclasses can override this to add their custom functionality
}

/**
* This function is called if the content of a URL could not be fetched.
*
* @param page Partial page object
* @param exception the exception that prevented the content from being fetched
*/
protected void onContentFetchError(Page page, Throwable exception) {
onContentFetchError(page);
// Do nothing by default (except basic logging)
// Subclasses can override this to add their custom functionality
}

/**
* This function is called when the content of a URL could not be fetched but the
* retry limit has not yet been reached. Subclasses may override it to decide,
* based on the Page or the exception, whether the URL should be re-scheduled.
* Returning false means the URL will not be scheduled again.
*
* @param page Partial page object
* @param exception the exception that prevented the content from being fetched
* @return true to allow scheduling the WebURL again, false to abort the re-schedule
*/
protected boolean onContentFetchErrorNotFinal(Page page, Throwable exception) {
// Call onContentFetchError for backward compatibility. Should be removed
onContentFetchError(page, exception);
return true;
}

/**
* This function is called when an unhandled exception was encountered during fetching
*
@@ -296,6 +333,40 @@ protected void onParseError(WebURL webUrl) {
// Subclasses can override this to add their custom functionality
}

/**
* This function is called if a connection error occurs.
*
* @param page page constructed around the URL that could not be connected to
* @param e Exception thrown
*/
protected void onConnectionError(Page page, ConnectException e) {
logger.warn("Connection error. URL: {} . Error: {}", page.getWebURL().getURL(), e.toString());
}

/**
* This function is called when a connection error occurs but the retry limit has
* not yet been reached. Subclasses may override it to decide, based on the Page or
* the exception, whether the URL should be re-scheduled. Returning false means the
* URL will not be scheduled again.
*
* @param page page constructed around the URL that could not be connected to
* @param e Exception thrown
* @return true if the URL should be scheduled again, false otherwise
*/
protected boolean onConnectionErrorNotFinal(Page page, ConnectException e) {
logger.warn("Connection error. Scheduling again. URL: {}. Error: {}", page.getWebURL().getURL(), e.toString());
return true;
}

/**
* This function is called if an UnknownHostException is thrown.
*
* @param page page constructed around the URL whose host could not be resolved
* @param e Exception thrown
*/
protected void onUnknownHost(Page page, UnknownHostException e) {
logger.warn("Unknown host for URL: {}", page.getWebURL().getURL());
}

/**
* The CrawlController instance that has created this crawler instance will
* call this function just before terminating this crawler thread. Classes
@@ -569,12 +640,27 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
} catch (ParseException pe) {
onParseError(curURL, pe);
} catch (ContentFetchException | SocketTimeoutException cfe) {
onContentFetchError(curURL);
onContentFetchError(page);
if (curURL.getFailedFetches() < maxRetries) {
if (onContentFetchErrorNotFinal(page, cfe)) {
scheduleAgain0(curURL);
}
} else {
onContentFetchError(page, cfe);
}
} catch (NotAllowedContentException nace) {
logger.debug(
"Skipping: {} as it contains binary content which you configured not to crawl",
curURL.getURL());
} catch (HttpHostConnectException e) {
if (allowRetryConnectionError && curURL.getFailedFetches() < maxRetries) {
if (onConnectionErrorNotFinal(page, e)) {
scheduleAgain0(curURL);
}
} else {
onConnectionError(page, e);
}
} catch (UnknownHostException e) {
onUnknownHost(page, e);
} catch (IOException | InterruptedException | RuntimeException e) {
onUnhandledException(curURL, e);
} finally {
@@ -584,6 +670,30 @@
}
}

/**
* Schedules a URL that previously failed, provided getFailedFetches < maxRetries.
*
* A true return value does not guarantee that the URL was scheduled: it may still have been rejected by the frontier.
* @param curURL the URL to re-schedule
* @return true if the URL was passed to the frontier, false otherwise
*/
protected boolean scheduleAgain(WebURL curURL) {
if (curURL.getFailedFetches() < maxRetries) {
scheduleAgain0(curURL);
return true;
}
return false;
}

private void scheduleAgain0(WebURL curURL) {
curURL.incrementFailedFetches();
frontier.schedule(curURL);
}

protected short getMaxRetries() {
return maxRetries;
}

public Thread getThread() {
return myThread;
}
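A sketch of how a crawler might use the new non-final hooks; the class name, retry cap, and host string below are hypothetical and not part of this PR:

import java.net.ConnectException;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

public class RetryAwareCrawler extends WebCrawler {

    @Override
    protected boolean onConnectionErrorNotFinal(Page page, ConnectException e) {
        // Hypothetical policy: only re-schedule the first two connection failures;
        // returning false aborts the re-schedule for this URL.
        return page.getWebURL().getFailedFetches() < 2;
    }

    @Override
    protected boolean onContentFetchErrorNotFinal(Page page, Throwable exception) {
        // Hypothetical policy: never retry fetches against a known slow host.
        // Note: the default implementation also calls onContentFetchError(Page, Throwable),
        // so overriding it here skips that callback for retried URLs.
        return !page.getWebURL().getURL().contains("slow.example.com");
    }
}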
@@ -38,6 +38,7 @@ public WebURL entryToObject(TupleInput input) {
webURL.setDepth(input.readShort());
webURL.setPriority(input.readByte());
webURL.setAnchor(input.readString());
webURL.setFailedFetches(input.readShort());
return webURL;
}

@@ -50,5 +51,6 @@ public void objectToEntry(WebURL url, TupleOutput output) {
output.writeShort(url.getDepth());
output.writeByte(url.getPriority());
output.writeString(url.getAnchor());
output.writeShort(url.getFailedFetches());
}
}
13 changes: 13 additions & 0 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
@@ -48,6 +48,7 @@ public class WebURL implements Serializable {
private String tag;
private Map<String, String> attributes;
private TLDList tldList;
private int failedFetches = 0;

/**
* Set the TLDList if you want {@linkplain #getDomain()} and
@@ -249,6 +250,18 @@ public String getAttribute(String name) {
return attributes.getOrDefault(name, "");
}

public int getFailedFetches() {
return failedFetches;
}

public void setFailedFetches(int failedFetches) {
this.failedFetches = failedFetches;
}

public void incrementFailedFetches() {
this.failedFetches++;
}

@Override
public int hashCode() {
return url.hashCode();
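And a trivial sketch of the bookkeeping the new counter enables; the URL and limit here are illustrative only:

WebURL url = new WebURL();
url.setURL("https://www.example.com/flaky");   // placeholder URL
short maxRetries = 3;                          // as configured via CrawlConfig.setMaxRetries()

// processPage() increments the counter each time a failed URL is re-scheduled,
// so after maxRetries failures the URL falls through to the final error callback.
while (url.getFailedFetches() < maxRetries) {
    url.incrementFailedFetches();
}
// url.getFailedFetches() is now 3 and the URL will not be scheduled again.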