Added POST capabilities #419

Open · wants to merge 20 commits into base: master
@@ -21,7 +21,7 @@
* Several core components of crawler4j extend this class
* to make them configurable.
*
* @deprecated This will removed without notice.
* @deprecated This will be removed without notice.
* @author Yasser Ganjisaffar
*/
@Deprecated
@@ -511,20 +511,52 @@ public void addSeed(String pageUrl) throws IOException, InterruptedException {
* @throws IOException
*/
public void addSeed(String pageUrl, int docId) throws IOException, InterruptedException {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
WebURL webUrl = new WebURL();
webUrl.setURL(pageUrl);
webUrl.setDocid(docId);
addSeed(webUrl);
}

/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling. You can also
* specify a specific document id to be assigned to this seed URL. This
* document id needs to be unique. Note that if you add three seeds with
* document ids 1, 2, and 7, then the next URL found during the crawl will
* get a doc id of 8. You also need to make sure that seeds are added in
* increasing order of document ids.
*
* Specifying doc ids is mainly useful when you have had a previous crawl
* and have stored the results and want to start a new crawl with seeds
* which get the same document ids as the previous crawl.
*
* NOTE: This method modifies the provided WebURL: its URL is replaced by
* its canonical form, its depth is set to 0, and the tldList is attached.
*
* @param pageUrl
* the URL of the seed
*
* @throws InterruptedException
* @throws IOException
*/
public void addSeed(WebURL pageUrl) throws IOException, InterruptedException {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl.getURL());
if (canonicalUrl == null) {
logger.error("Invalid seed URL: {}", pageUrl);
} else {
int docId = pageUrl.getDocid();
pageUrl.setURL(canonicalUrl);
if (docId < 0) {
docId = docIdServer.getDocId(canonicalUrl);
docId = docIdServer.getDocId(pageUrl);
if (docId > 0) {
logger.trace("This URL is already seen.");
return;
}
docId = docIdServer.getNewDocID(canonicalUrl);
docId = docIdServer.getNewDocID(pageUrl);
pageUrl.setDocid(docId);
} else {
try {
docIdServer.addUrlAndDocId(canonicalUrl, docId);
docIdServer.addUrlAndDocId(pageUrl);
} catch (RuntimeException e) {
if (config.isHaltOnError()) {
throw e;
@@ -534,13 +566,10 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
}
}

WebURL webUrl = new WebURL();
webUrl.setTldList(tldList);
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (robotstxtServer.allows(webUrl)) {
frontier.schedule(webUrl);
pageUrl.setTldList(tldList);
pageUrl.setDepth((short) 0);
if (robotstxtServer.allows(pageUrl)) {
frontier.schedule(pageUrl);
} else {
// using the WARN level here, as the user specifically asked to add this seed
logger.warn("Robots.txt does not allow this seed: {}", pageUrl);
@@ -564,14 +593,38 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
* the document id that you want to be assigned to this URL.
* @throws UnsupportedEncodingException
*
*
*/
public void addSeenUrl(String url, int docId) throws UnsupportedEncodingException {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(url);
WebURL webUrl = new WebURL();
webUrl.setURL(url);
webUrl.setDocid(docId);
addSeenUrl(webUrl);
}

/**
* This function can be called to assign a specific document id to a URL. This
* feature is useful when you have had a previous crawl, have stored the
* URLs and their associated document ids, and want a new crawl which is
* aware of the previously seen URLs and won't re-crawl them.
*
* Note that if you add three seen URLs with document ids 1, 2, and 7, then
* the next URL found during the crawl will get a doc id of 8. You also need
* to make sure to add seen URLs in increasing order of document ids.
*
* @param url
* the URL of the page
* @throws UnsupportedEncodingException
*
*/
public void addSeenUrl(WebURL url) throws UnsupportedEncodingException {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(url.getURL());
if (canonicalUrl == null) {
logger.error("Invalid Url: {} (can't cannonicalize it!)", url);
} else {
url.setURL(canonicalUrl);
try {
docIdServer.addUrlAndDocId(canonicalUrl, docId);
docIdServer.addUrlAndDocId(url);
} catch (RuntimeException e) {
if (config.isHaltOnError()) {
throw e;
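Usage note (not part of the diff): a minimal sketch of the new WebURL-based seeding API. Only the getters isPost() and getParamsPost() appear in this changeset, so the POST setters below are assumptions and are left commented out; check the WebURL changes in this PR for the actual names.

import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.url.WebURL;

class SeedingExample {
    static void addSeeds(CrawlController controller) throws Exception {
        // A POST seed: the fetcher turns it into an HttpPost (see the PageFetcher changes below).
        WebURL postSeed = new WebURL();
        postSeed.setURL("https://www.example.com/search");
        postSeed.setDocid(-1); // addSeed(WebURL) only auto-assigns a doc id when docid < 0
        // postSeed.setPost(true);              // assumed setter, not shown in this diff
        // postSeed.setParamsPost(formParams);  // assumed setter, not shown in this diff
        controller.addSeed(postSeed);

        // Re-register a URL from a previous crawl so it is not crawled again;
        // seen URLs must be added in increasing order of document ids.
        WebURL seen = new WebURL();
        seen.setURL("https://www.example.com/old-page");
        seen.setDocid(7);
        controller.addSeenUrl(seen);
    }
}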
@@ -448,23 +448,24 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
onRedirectedStatusCode(page);

if (myController.getConfig().isFollowRedirects()) {
int newDocId = docIdServer.getDocId(movedToUrl);
WebURL webURL = new WebURL();
webURL.setURL(movedToUrl);

int newDocId = docIdServer.getDocId(webURL);
if (newDocId > 0) {
logger.debug("Redirect page: {} is already seen", curURL);
return;
}

WebURL webURL = new WebURL();
webURL.setTldList(myController.getTldList());
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
webURL.setAnchor(curURL.getAnchor());
if (shouldVisit(page, webURL)) {
if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
webURL.setDocid(docIdServer.getNewDocID(webURL));
frontier.schedule(webURL);
} else {
logger.debug(
@@ -489,13 +490,20 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
}

} else { // if status code is 200
if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
String fetchedUrl;
WebURL fetchedWebURL = fetchResult.getFetchedWebUrl();
if (fetchedWebURL != null) {
fetchedUrl = fetchedWebURL.getURL();
} else {
fetchedUrl = null;
}
if (!curURL.getURL().equals(fetchedUrl)) {
if (docIdServer.isSeenBefore(fetchResult.getFetchedWebUrl())) {
logger.debug("Redirect page: {} has already been seen", curURL);
return;
}
curURL.setURL(fetchResult.getFetchedUrl());
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
curURL.setURL(fetchedUrl);
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedWebUrl()));
}

if (!fetchResult.fetchContent(page,
@@ -519,7 +527,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
for (WebURL webURL : parseData.getOutgoingUrls()) {
webURL.setParentDocid(curURL.getDocid());
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
int newdocid = docIdServer.getDocId(webURL);
if (newdocid > 0) {
// This is not the first time that this Url is visited. So, we set the
// depth to a negative number.
@@ -531,7 +539,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
if (shouldVisit(page, webURL)) {
if (robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
webURL.setDocid(docIdServer.getNewDocID(webURL));
toSchedule.add(webURL);
} else {
logger.debug(
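Usage note (not part of the diff): since the POST flag travels with the WebURL that the crawler processes, a crawler subclass can distinguish POST results in visit(). An illustrative sketch, assuming the flag is still set on the Page's WebURL at visit time:

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

public class PostAwareCrawler extends WebCrawler {

    @Override
    public void visit(Page page) {
        // isPost() is the flag PageFetcher uses to decide between HttpGet and HttpPost.
        if (page.getWebURL().isPost()) {
            logger.info("Visited POST result: {}", page.getWebURL().getURL());
        } else {
            logger.info("Visited: {}", page.getWebURL().getURL());
        }
    }
}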
@@ -27,6 +27,7 @@
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.url.WebURL;

/**
* @author Yasser Ganjisaffar
@@ -40,6 +41,7 @@ public class PageFetchResult {
protected HttpEntity entity = null;
protected Header[] responseHeaders = null;
protected String fetchedUrl = null;
protected WebURL fetchedWebUrl = null;
protected String movedToUrl = null;

public PageFetchResult(boolean haltOnError) {
@@ -70,12 +72,30 @@ public void setResponseHeaders(Header[] responseHeaders) {
this.responseHeaders = responseHeaders;
}

@Deprecated
public String getFetchedUrl() {
return fetchedUrl;
}

@Deprecated
public void setFetchedUrl(String fetchedUrl) {
this.fetchedUrl = fetchedUrl;
WebURL fetchedWebURL = new WebURL();
fetchedWebURL.setURL(fetchedUrl);
setFetchedWebUrl(fetchedWebURL);
}

public WebURL getFetchedWebUrl() {
return fetchedWebUrl;
}

public void setFetchedWebUrl(WebURL fetchedWebUrl) {
this.fetchedWebUrl = fetchedWebUrl;
// Compatibility until deprecated methods are deleted
if (fetchedWebUrl != null) {
this.fetchedUrl = fetchedWebUrl.getURL();
} else {
this.fetchedUrl = null;
}
}

public boolean fetchContent(Page page, int maxBytes) throws SocketTimeoutException, IOException {
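Migration note (not part of the diff): callers of the deprecated string accessor keep working through the compatibility field kept in sync above, but the WebURL accessor is now the primary one. A small sketch of the intended replacement:

import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.url.WebURL;

class FetchedUrlMigration {
    // Prefer getFetchedWebUrl(); fall back to the deprecated string accessor.
    static String fetchedUrlOf(PageFetchResult result) {
        WebURL fetchedWebUrl = result.getFetchedWebUrl();
        return (fetchedWebUrl != null) ? fetchedWebUrl.getURL() : result.getFetchedUrl();
    }
}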
@@ -255,10 +255,12 @@ public PageFetchResult fetchPage(WebURL webUrl)
throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
// Getting URL, setting headers & content
PageFetchResult fetchResult = new PageFetchResult(config.isHaltOnError());
String toFetchURL = webUrl.getURL();
String toFetchURL;
HttpUriRequest request = null;
try {
request = newHttpUriRequest(toFetchURL);
request = newHttpUriRequest(webUrl);
toFetchURL = request.getURI().toString();
webUrl.setURL(toFetchURL);
if (config.getPolitenessDelay() > 0) {
// Applying Politeness delay
synchronized (mutex) {
@@ -293,11 +295,12 @@ public PageFetchResult fetchPage(WebURL webUrl)
fetchResult.setMovedToUrl(movedToUrl);
}
} else if (statusCode >= 200 && statusCode <= 299) { // is 2XX, everything looks ok
fetchResult.setFetchedUrl(toFetchURL);
fetchResult.setFetchedWebUrl(webUrl);
String uri = request.getURI().toString();
if (!uri.equals(toFetchURL)) {
if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
fetchResult.setFetchedUrl(uri);
webUrl.setURL(uri);
fetchResult.setFetchedWebUrl(webUrl);
}
}

@@ -345,10 +348,34 @@ public synchronized void shutDown() {
* @param url the url to be fetched
* @return the HttpUriRequest for the given url
*/
@Deprecated
protected HttpUriRequest newHttpUriRequest(String url) {
return new HttpGet(url);
}

/**
* Creates a new HttpUriRequest for the given url. By default a plain HttpGet is created
* for regular URLs and a HttpPost with url-encoded form parameters for URLs marked as
* POST. Subclasses may override this method and provide their own logic.
*
* @param url the url to be fetched
* @return the HttpUriRequest for the given url
*/
protected HttpUriRequest newHttpUriRequest(WebURL url) {
if (!url.isPost()) {
return this.newHttpUriRequest(url.getURL());
}
HttpPost req = new HttpPost(url.getURL());
if (url.getParamsPost() == null || url.getParamsPost().isEmpty()) {
return req;
}
List<BasicNameValuePair> pairs = url.getParamsPost().getAsList();
if (pairs != null && pairs.size() > 0) {
// Strictly unnecessary check, kept as a defensive guard.
req.setEntity(new UrlEncodedFormEntity(pairs, StandardCharsets.UTF_8));
}
return req;
}

protected CrawlConfig getConfig() {
return config;
}
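Usage note (not part of the diff): since newHttpUriRequest(WebURL) stays protected and is documented as an extension point, a subclass can still decorate the request while keeping the GET/POST dispatch above. A hedged sketch; the header is purely illustrative and the constructor signature should be matched to the PageFetcher constructor of the crawler4j version in use:

import org.apache.http.client.methods.HttpUriRequest;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;

public class HeaderAddingPageFetcher extends PageFetcher {

    public HeaderAddingPageFetcher(CrawlConfig config) {
        super(config);
    }

    @Override
    protected HttpUriRequest newHttpUriRequest(WebURL url) {
        // The default implementation picks HttpGet or HttpPost based on url.isPost().
        HttpUriRequest request = super.newHttpUriRequest(url);
        request.addHeader("X-Example-Header", "demo"); // illustrative header only
        return request;
    }
}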
@@ -28,6 +28,7 @@
import com.sleepycat.je.OperationStatus;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;

/**
@@ -68,6 +69,7 @@ public DocIDServer(Environment env, CrawlConfig config) {
* @param url the URL for which the docid is returned.
* @return the docid of the url if it is seen before. Otherwise -1 is returned.
*/
@Deprecated
public int getDocId(String url) {
synchronized (mutex) {
OperationStatus result = null;
@@ -93,6 +95,17 @@ public int getDocId(String url) {
}
}

/**
* Returns the docid of an already seen url.
*
* @param url the URL for which the docid is returned.
* @return the docid of the url if it is seen before. Otherwise -1 is returned.
*/
public int getDocId(WebURL url) {
return getDocId(url.encode());
}

@Deprecated
public int getNewDocID(String url) {
synchronized (mutex) {
try {
@@ -117,6 +130,11 @@ public int getNewDocID(String url) {
}
}

public int getNewDocID(WebURL url) {
return getNewDocID(url.encode());
}

@Deprecated
public void addUrlAndDocId(String url, int docId) {
synchronized (mutex) {
if (docId <= lastDocID) {
@@ -139,10 +157,19 @@ public void addUrlAndDocId(String url, int docId) {
}
}

public void addUrlAndDocId(WebURL url) {
addUrlAndDocId(url.encode(), url.getDocid());
}

@Deprecated
public boolean isSeenBefore(String url) {
return getDocId(url) != -1;
}

public boolean isSeenBefore(WebURL url) {
return isSeenBefore(url.encode());
}

public final int getDocCount() {
try {
return (int) docIDsDB.count();
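Usage note (not part of the diff): the new overloads key the doc-id store by url.encode() rather than the raw URL string. Presumably that encoding also folds in the POST marker and parameters so a GET and a POST to the same address can receive distinct doc ids, but that depends on how WebURL.encode() is implemented elsewhere in this PR. A hedged sketch of the intended call pattern:

import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.url.WebURL;

class DocIdExample {
    // Assign a doc id to a WebURL using the WebURL-keyed overloads added here.
    static void assignDocId(DocIDServer docIdServer, WebURL url) {
        if (docIdServer.isSeenBefore(url)) {
            url.setDocid(docIdServer.getDocId(url));    // reuse the existing id
        } else {
            url.setDocid(docIdServer.getNewDocID(url)); // allocate a fresh id
        }
    }
}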