Retry on ContentFetchError #437

Open · wants to merge 4 commits into master
@@ -213,6 +213,17 @@ public class CrawlConfig {
*/
private CookieStore cookieStore;

/**
* Maximum number of times a failing WebURL will be retried before giving up.
* Default value is zero.
*/
private short maxRetries = 0;

/**
* If true, connections to non-responding servers are retried as long as the maxRetries limit has not been reached.
*/
private boolean allowRetryConnectionError = false;

/**
* DNS resolver to use, {@link SystemDefaultDnsResolver} is default.
*/
@@ -732,6 +743,22 @@ public void setBatchReadSize(int batchReadSize) {
this.batchReadSize = batchReadSize;
}

public short getMaxRetries() {
return maxRetries;
}

public void setMaxRetries(short maxRetries) {
this.maxRetries = maxRetries;
}

public boolean isAllowRetryConnectionError() {
return allowRetryConnectionError;
}

public void setAllowRetryConnectionError(boolean allowRetryConnectionError) {
this.allowRetryConnectionError = allowRetryConnectionError;
}

@Override
public String toString() {
StringBuilder sb = new StringBuilder();
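For review purposes, a minimal usage sketch of the two new options. The storage folder, seed URL, and MyCrawler class below are placeholders, and the controller wiring is the usual crawler4j setup rather than part of this change:

CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder("/tmp/crawl");        // placeholder storage folder
config.setMaxRetries((short) 3);                   // retry each failing WebURL up to 3 times
config.setAllowRetryConnectionError(true);         // also retry when the connection itself is refused

PageFetcher pageFetcher = new PageFetcher(config);
RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);
CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer);
controller.addSeed("https://www.example.com/");    // placeholder seed
controller.start(MyCrawler.class, 8);              // MyCrawler is a hypothetical WebCrawler subclass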
118 changes: 114 additions & 4 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -18,12 +18,15 @@
package edu.uci.ics.crawler4j.crawler;

import java.io.IOException;
import java.net.ConnectException;
import java.net.SocketTimeoutException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.http.HttpStatus;
import org.apache.http.conn.HttpHostConnectException;
import org.apache.http.impl.EnglishReasonPhraseCatalog;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -106,6 +109,10 @@ public class WebCrawler implements Runnable {

private int batchReadSize;

private short maxRetries;

private boolean allowRetryConnectionError;

/**
* Initializes the current instance of the crawler
*
@@ -127,6 +134,8 @@ public void init(int id, CrawlController crawlController)
this.myController = crawlController;
this.isWaitingForNewURLs = false;
this.batchReadSize = crawlController.getConfig().getBatchReadSize();
this.maxRetries = crawlController.getConfig().getMaxRetries();
this.allowRetryConnectionError = crawlController.getConfig().isAllowRetryConnectionError();
}

/**
@@ -238,7 +247,7 @@ protected void onUnexpectedStatusCode(String urlStr, int statusCode, String cont
*
* @param webUrl URL which content failed to be fetched
*
* @deprecated use {@link #onContentFetchError(Page)}
* @deprecated use {@link #onContentFetchError(Page, Throwable)}
*/
@Deprecated
protected void onContentFetchError(WebURL webUrl) {
@@ -251,13 +260,41 @@ protected void onContentFetchError(WebURL webUrl) {
* This function is called if the content of a url could not be fetched.
*
* @param page Partial page object
*
* @deprecated use {@link #onContentFetchError(Page, Throwable)}
*/
@Deprecated
protected void onContentFetchError(Page page) {
logger.warn("Can't fetch content of: {}", page.getWebURL().getURL());
onContentFetchError(page.getWebURL());
// Do nothing by default (except basic logging)
// Subclasses can override this to add their custom functionality
}

/**
* This function is called if the content of a URL could not be fetched.
*
* @param page Partial page object
* @param exception the exception that prevented the content from being fetched
*/
protected void onContentFetchError(Page page, Throwable exception) {
onContentFetchError(page);
// Do nothing by default (except basic logging)
// Subclasses can override this to add their custom functionality
}

/**
* This function is called when the content of a URL could not be fetched but the
* retry limit has not yet been reached. Subclasses may override it to decide,
* based on the Page or the exception, whether the URL should be re-scheduled.
* Returning false means the URL will not be scheduled again.
*
* @param page Partial page object
* @param exception the exception that prevented the content from being fetched
* @return true to allow scheduling the WebURL again, false to abort the re-schedule
*/
protected boolean onContentFetchErrorNotFinal(Page page, Throwable exception) {
// Call onContentFetchError for backward compatibility. Should be removed
onContentFetchError(page, exception);
return true;
}

/**
* This function is called when an unhandled exception was encountered during fetching
*
@@ -296,6 +333,40 @@ protected void onParseError(WebURL webUrl) {
// Subclasses can override this to add their custom functionality
}

/**
* This function is called if a connection error occurs.
*
* @param page page constructed around the URL that could not be connected to
* @param e Exception thrown
*/
protected void onConnectionError(Page page, ConnectException e) {
logger.warn("Connection error. URL: {} . Error: {}", page.getWebURL().getURL(), e.toString());
}

/**
* This function is called when a connection error occurs but the retry limit has
* not yet been reached. Subclasses may override it to decide, based on the Page or
* the exception, whether the URL should be re-scheduled. Returning false means the
* URL will not be scheduled again.
*
* @param page page constructed around the URL that could not be connected to
* @param e Exception thrown
* @return true if the URL should be scheduled again, false otherwise
*/
protected boolean onConnectionErrorNotFinal(Page page, ConnectException e) {
logger.warn("Connection error. Scheduling again. URL: {}. Error: {}", page.getWebURL().getURL(), e.toString());
return true;
}

/**
* This function is called if an UnknownHostException is thrown.
*
* @param page page constructed around the URL whose host could not be resolved
* @param e Exception thrown
*/
protected void onUnknownHost(Page page, UnknownHostException e) {
logger.warn("Unknown host for URL: {}", page.getWebURL().getURL());
}

/**
* The CrawlController instance that has created this crawler instance will
* call this function just before terminating this crawler thread. Classes
@@ -569,12 +640,27 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
} catch (ParseException pe) {
onParseError(curURL, pe);
} catch (ContentFetchException | SocketTimeoutException cfe) {
onContentFetchError(curURL);
onContentFetchError(page);
if (curURL.getFailedFetches() < maxRetries) {
if (onContentFetchErrorNotFinal(page, cfe)) {
scheduleAgain0(curURL);
}
} else {
onContentFetchError(page, cfe);
}
} catch (NotAllowedContentException nace) {
logger.debug(
"Skipping: {} as it contains binary content which you configured not to crawl",
curURL.getURL());
} catch (HttpHostConnectException e) {
if (allowRetryConnectionError && curURL.getFailedFetches() < maxRetries) {
if (onConnectionErrorNotFinal(page, e)) {
scheduleAgain0(curURL);
}
} else {
onConnectionError(page, e);
}
} catch (UnknownHostException e) {
onUnknownHost(page, e);
} catch (IOException | InterruptedException | RuntimeException e) {
onUnhandledException(curURL, e);
} finally {
@@ -584,6 +670,30 @@
}
}

/**
* Schedules a URL that previously failed, provided getFailedFetches < maxRetries.
*
* A true return value does not guarantee that the URL was scheduled: it may still have been rejected by the frontier.
* @param curURL the URL to re-schedule
* @return true if the URL was passed to the frontier, false otherwise
*/
protected boolean scheduleAgain(WebURL curURL) {
if (curURL.getFailedFetches() < maxRetries) {
scheduleAgain0(curURL);
return true;
}
return false;
}

private void scheduleAgain0(WebURL curURL) {
curURL.incrementFailedFetches();
frontier.schedule(curURL);
}

protected short getMaxRetries() {
return maxRetries;
}

public Thread getThread() {
return myThread;
}
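A sketch of how a crawler might use the new non-final hooks; the class name, retry cap, and host string below are hypothetical and not part of this PR:

import java.net.ConnectException;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

public class RetryAwareCrawler extends WebCrawler {

    @Override
    protected boolean onConnectionErrorNotFinal(Page page, ConnectException e) {
        // Hypothetical policy: only re-schedule the first two connection failures;
        // returning false aborts the re-schedule for this URL.
        return page.getWebURL().getFailedFetches() < 2;
    }

    @Override
    protected boolean onContentFetchErrorNotFinal(Page page, Throwable exception) {
        // Hypothetical policy: never retry fetches against a known slow host.
        // Note: the default implementation also calls onContentFetchError(Page, Throwable),
        // so overriding it here skips that callback for retried URLs.
        return !page.getWebURL().getURL().contains("slow.example.com");
    }
}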
@@ -38,6 +38,7 @@ public WebURL entryToObject(TupleInput input) {
webURL.setDepth(input.readShort());
webURL.setPriority(input.readByte());
webURL.setAnchor(input.readString());
webURL.setFailedFetches(input.readShort());
return webURL;
}

@@ -50,5 +51,6 @@ public void objectToEntry(WebURL url, TupleOutput output) {
output.writeShort(url.getDepth());
output.writeByte(url.getPriority());
output.writeString(url.getAnchor());
output.writeShort(url.getFailedFetches());
}
}
13 changes: 13 additions & 0 deletions crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
@@ -48,6 +48,7 @@ public class WebURL implements Serializable {
private String tag;
private Map<String, String> attributes;
private TLDList tldList;
private int failedFetches = 0;

/**
* Set the TLDList if you want {@linkplain #getDomain()} and
@@ -249,6 +250,18 @@ public String getAttribute(String name) {
return attributes.getOrDefault(name, "");
}

public int getFailedFetches() {
return failedFetches;
}

public void setFailedFetches(int failedFetches) {
this.failedFetches = failedFetches;
}

public void incrementFailedFetches() {
this.failedFetches++;
}

@Override
public int hashCode() {
return url.hashCode();
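And a trivial sketch of the bookkeeping the new counter enables; the URL and limit here are illustrative only:

WebURL url = new WebURL();
url.setURL("https://www.example.com/flaky");   // placeholder URL
short maxRetries = 3;                          // as configured via CrawlConfig.setMaxRetries()

// processPage() increments the counter each time a failed URL is re-scheduled,
// so after maxRetries failures the URL falls through to the final error callback.
while (url.getFailedFetches() < maxRetries) {
    url.incrementFailedFetches();
}
// url.getFailedFetches() is now 3 and the URL will not be scheduled again.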