No DocIDs will be created if maxPagesToFetch is reached (most times). #430

Open · wants to merge 2 commits into master
@@ -513,40 +513,56 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
         parser.parse(page, curURL.getURL());

         if (shouldFollowLinksIn(page.getWebURL())) {
-            ParseData parseData = page.getParseData();
-            List<WebURL> toSchedule = new ArrayList<>();
-            int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
-            for (WebURL webURL : parseData.getOutgoingUrls()) {
-                webURL.setParentDocid(curURL.getDocid());
-                webURL.setParentUrl(curURL.getURL());
-                int newdocid = docIdServer.getDocId(webURL.getURL());
-                if (newdocid > 0) {
-                    // This is not the first time that this Url is visited. So, we set the
-                    // depth to a negative number.
-                    webURL.setDepth((short) -1);
-                    webURL.setDocid(newdocid);
-                } else {
-                    webURL.setDocid(-1);
-                    webURL.setDepth((short) (curURL.getDepth() + 1));
-                    if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
-                        if (shouldVisit(page, webURL)) {
-                            if (robotstxtServer.allows(webURL)) {
-                                webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
-                                toSchedule.add(webURL);
-                            } else {
-                                logger.debug(
-                                    "Not visiting: {} as per the server's \"robots.txt\" " +
-                                    "policy", webURL.getURL());
-                            }
-                        } else {
-                            logger.debug(
-                                "Not visiting: {} as per your \"shouldVisit\" policy",
-                                webURL.getURL());
-                        }
-                    }
-                }
-            }
-            frontier.scheduleAll(toSchedule);
+            if (frontier.canFetchPages()) {
+                ParseData parseData = page.getParseData();
+                List<WebURL> toSchedule = new ArrayList<>();
+                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
+                long remaining = frontier.numberToReachMaxPagesToFetch();
+                for (WebURL webURL : parseData.getOutgoingUrls()) {
+                    // This is not thread safe. Other threads may reduce the remaining count.
+                    if (remaining == 0) {
+                        logger.debug("Ignoring remaining links in page {}, "
+                                     + "because maxPagesToFetch was reached",
+                                     page.getWebURL().getURL());
+                        break;
+                    }
+                    webURL.setParentDocid(curURL.getDocid());
+                    webURL.setParentUrl(curURL.getURL());
+                    int newdocid = docIdServer.getDocId(webURL.getURL());
+                    if (newdocid > 0) {
+                        // This is not the first time that this Url is visited. So, we set the
+                        // depth to a negative number.
+                        webURL.setDepth((short) -1);
+                        webURL.setDocid(newdocid);
+                    } else {
+                        webURL.setDocid(-1);
+                        webURL.setDepth((short) (curURL.getDepth() + 1));
+                        if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
+                            if (shouldVisit(page, webURL)) {
+                                if (robotstxtServer.allows(webURL)) {
+                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
+                                    remaining--;
+                                    toSchedule.add(webURL);
+                                } else {
+                                    logger.debug(
+                                        "Not visiting: {} as per the server's \"robots.txt\" " +
+                                        "policy", webURL.getURL());
+                                }
+                            } else {
+                                logger.debug(
+                                    "Not visiting: {} as per your \"shouldVisit\" policy",
+                                    webURL.getURL());
+                            }
+                        }
+                    }
+                }
+                frontier.scheduleAll(toSchedule);
+            } else {
+                logger.debug("Not looking for links in page {}, "
+                             + "because maxPagesToFetch was reached",
+                             page.getWebURL().getURL());
+            }

         } else {
             logger.debug("Not looking for links in page {}, "
                 + "as per your \"shouldFollowLinksInPage\" policy",
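The countdown in this loop is deliberately approximate, as the added comment notes: each thread snapshots remaining once per page and decrements a local copy, so two threads processing pages concurrently can both see budget left and together schedule, and assign DocIDs to, slightly more than maxPagesToFetch. That is the "(most times)" in the PR title. A strictly bounded variant would have to reserve budget atomically; the sketch below illustrates that alternative. It is not code from this PR, and the ScheduleBudget class and tryReserve method are invented for the example.

import java.util.concurrent.atomic.AtomicInteger;

// Hypothetical sketch, not part of this PR: a budget that can never hand out
// more schedule slots than maxPagesToFetch, even under concurrent callers.
public class ScheduleBudget {

    private final int maxPagesToFetch;                // -1 means unlimited, as in CrawlConfig
    private final AtomicInteger scheduled = new AtomicInteger(0);

    public ScheduleBudget(int maxPagesToFetch) {
        this.maxPagesToFetch = maxPagesToFetch;
    }

    // Atomically claims one slot; returns false once the budget is spent. A crawler
    // thread would call this per outgoing link instead of decrementing a local
    // copy of "remaining".
    public boolean tryReserve() {
        if (maxPagesToFetch < 0) {
            return true;                              // unlimited crawl, nothing to count
        }
        while (true) {
            int current = scheduled.get();
            if (current >= maxPagesToFetch) {
                return false;                         // budget exhausted
            }
            if (scheduled.compareAndSet(current, current + 1)) {
                return true;                          // slot claimed
            }
            // lost the CAS race to another thread; re-read and retry
        }
    }
}

The PR instead accepts the benign race: overshooting by a few links only creates a few extra DocIDs, a reasonable trade against paying for a CAS loop or lock on every outgoing link.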
@@ -161,6 +161,23 @@ public void getNextURLs(int max, List<WebURL> result) {
         }
     }

+    public boolean canFetchPages() {
+        int maxPagesToFetch = config.getMaxPagesToFetch();
+        return maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch;
+    }
+
+    public long numberToReachMaxPagesToFetch() {
+        int maxPagesToFetch = config.getMaxPagesToFetch();
+        if (maxPagesToFetch < 0) {
+            return -1;
+        }
+        long remaining = maxPagesToFetch - scheduledPages;
+        if (remaining < 0) {
+            return 0;
+        }
+        return remaining;
+    }
+
     public void setProcessed(WebURL webURL) {
         counters.increment(Counters.ReservedCounterNames.PROCESSED_PAGES);
         if (inProcessPages != null) {
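Together these helpers encode a small contract: canFetchPages() says whether any scheduling budget is left, while numberToReachMaxPagesToFetch() returns -1 for an unlimited crawl, 0 when the budget is reached (or overshot by the race described above), and the remaining count otherwise. The standalone sketch below restates that contract with the logic from the diff; the plain maxPagesToFetch and scheduledPages fields are stand-ins for the real Frontier's config value and counter.

// Minimal, self-contained illustration of the two new Frontier helpers.
public class BudgetDemo {

    int maxPagesToFetch;   // stand-in for config.getMaxPagesToFetch(); -1 = unlimited
    long scheduledPages;   // stand-in for the Frontier's scheduled-pages counter

    boolean canFetchPages() {
        return maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch;
    }

    long numberToReachMaxPagesToFetch() {
        if (maxPagesToFetch < 0) {
            return -1;                            // unlimited: no budget to count down
        }
        long remaining = maxPagesToFetch - scheduledPages;
        return remaining < 0 ? 0 : remaining;     // clamp to 0 in case of overshoot
    }

    public static void main(String[] args) {
        BudgetDemo d = new BudgetDemo();
        d.maxPagesToFetch = 100;
        d.scheduledPages = 40;
        System.out.println(d.canFetchPages());                // true
        System.out.println(d.numberToReachMaxPagesToFetch()); // 60
        d.scheduledPages = 100;
        System.out.println(d.canFetchPages());                // false
        System.out.println(d.numberToReachMaxPagesToFetch()); // 0
        d.maxPagesToFetch = -1;
        System.out.println(d.canFetchPages());                // true
        System.out.println(d.numberToReachMaxPagesToFetch()); // -1
    }
}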