diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java index 6f7c6573b..7135522af 100644 --- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java +++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java @@ -448,20 +448,17 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException onRedirectedStatusCode(page); if (myController.getConfig().isFollowRedirects()) { + if (curURL.isFollowRedirectsInmediatly() && curURL.getMaxInmediateRedirects() > 0) { + followRedirectInmediatly(curURL, movedToUrl); + return; + } int newDocId = docIdServer.getDocId(movedToUrl); if (newDocId > 0) { logger.debug("Redirect page: {} is already seen", curURL); return; } - WebURL webURL = new WebURL(); - webURL.setTldList(myController.getTldList()); - webURL.setURL(movedToUrl); - webURL.setParentDocid(curURL.getParentDocid()); - webURL.setParentUrl(curURL.getParentUrl()); - webURL.setDepth(curURL.getDepth()); - webURL.setDocid(-1); - webURL.setAnchor(curURL.getAnchor()); + WebURL webURL = createRedirectedWebURL(curURL, movedToUrl); if (shouldVisit(page, webURL)) { if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) { webURL.setDocid(docIdServer.getNewDocID(movedToUrl)); @@ -584,6 +581,50 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException } } + /** + * Creates a new WebURL based on provided WebURL data. + * + * Subclasses may use additional parameters or use subclasses of WebURL. 
+ * + * @param curURL the URL whose fetch was redirected; its parent docid, parent URL, depth and anchor are copied + * @param movedToUrl the URL the redirect points to + * @return a new WebURL for movedToUrl with its docid set to -1 + */ + protected WebURL createRedirectedWebURL(WebURL curURL, String movedToUrl) { + WebURL webURL = new WebURL(); + webURL.setTldList(myController.getTldList()); + webURL.setURL(movedToUrl); + webURL.setParentDocid(curURL.getParentDocid()); + webURL.setParentUrl(curURL.getParentUrl()); + webURL.setDepth(curURL.getDepth()); + webURL.setAnchor(curURL.getAnchor()); + webURL.setDocid(-1); + return webURL; + } + + /** + * Processes the redirected page without scheduling it, even if it was already seen. + * + * @param curURL the URL whose fetch was redirected; its remaining immediate-redirect count, minus one, is passed on + * @param movedToUrl the URL the redirect points to, processed right away + * @throws IOException if thrown while processing the redirect target + * @throws InterruptedException if thrown while processing the redirect target + * @throws ParseException if thrown while processing the redirect target + */ + protected void followRedirectInmediatly(WebURL curURL, String movedToUrl) + throws IOException, InterruptedException, ParseException { + WebURL webURL = createRedirectedWebURL(curURL, movedToUrl); + webURL.setFollowRedirectsInmediatly(true); + int newDocId = docIdServer.getDocId(movedToUrl); + if (newDocId < 0) { + // Repeated visits are accepted: a new docId is generated only when the URL has no docId yet. 
+ newDocId = docIdServer.getNewDocID(movedToUrl); + } + webURL.setDocid(newDocId); + webURL.setMaxInmediateRedirects((short)(curURL.getMaxInmediateRedirects() - 1)); + this.processPage(webURL); + } + public Thread getThread() { return myThread; } diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java index 1406da166..8efc8c8a7 100644 --- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java +++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java @@ -38,6 +38,8 @@ public WebURL entryToObject(TupleInput input) { webURL.setDepth(input.readShort()); webURL.setPriority(input.readByte()); webURL.setAnchor(input.readString()); + webURL.setFollowRedirectsInmediatly(input.readBoolean()); + webURL.setMaxInmediateRedirects(input.readShort()); return webURL; } @@ -50,5 +52,7 @@ public void objectToEntry(WebURL url, TupleOutput output) { output.writeShort(url.getDepth()); output.writeByte(url.getPriority()); output.writeString(url.getAnchor()); + output.writeBoolean(url.isFollowRedirectsInmediatly()); + output.writeShort(url.getMaxInmediateRedirects()); } } \ No newline at end of file diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java index f7f0c1d93..2cf5bfd62 100644 --- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java +++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java @@ -48,6 +48,8 @@ public class WebURL implements Serializable { private String tag; private Map attributes; private TLDList tldList; + private boolean followRedirectsInmediatly = false; + private short maxInmediateRedirects = 10; /** * Set the TLDList if you want {@linkplain #getDomain()} and @@ -249,6 +251,22 @@ public String getAttribute(String name) { return attributes.getOrDefault(name, ""); } + public boolean 
isFollowRedirectsInmediatly() { + return followRedirectsInmediatly; + } + + public void setFollowRedirectsInmediatly(boolean followRedirectsInmediatly) { + this.followRedirectsInmediatly = followRedirectsInmediatly; + } + + public short getMaxInmediateRedirects() { + return maxInmediateRedirects; + } + + public void setMaxInmediateRedirects(short maxInmediateRedirects) { + this.maxInmediateRedirects = maxInmediateRedirects; + } + @Override public int hashCode() { return url.hashCode(); @@ -272,4 +290,5 @@ public boolean equals(Object o) { public String toString() { return url; } + } \ No newline at end of file