From d8c6690c116653fc552f966b3562b7c743d2311e Mon Sep 17 00:00:00 2001
From: dgoiko <12698425+dgoiko@users.noreply.github.com>
Date: Fri, 24 Jan 2020 23:10:40 +0100
Subject: [PATCH 1/2] No DocIDs will be created if maxPagesToFetch was reached

This is not a thread-safe solution: one thread may fill the DocIDServer
while another thread is looping. However, the amount of memory wasted
will be decreased.
---
 .../uci/ics/crawler4j/crawler/WebCrawler.java | 70 ++++++++++++-------
 .../uci/ics/crawler4j/frontier/Frontier.java  | 17 +++++
 2 files changed, 60 insertions(+), 27 deletions(-)

diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index 6f7c6573b..11874ff18 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -513,40 +513,56 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
             parser.parse(page, curURL.getURL());
 
             if (shouldFollowLinksIn(page.getWebURL())) {
-                ParseData parseData = page.getParseData();
-                List<WebURL> toSchedule = new ArrayList<>();
-                int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
-                for (WebURL webURL : parseData.getOutgoingUrls()) {
-                    webURL.setParentDocid(curURL.getDocid());
-                    webURL.setParentUrl(curURL.getURL());
-                    int newdocid = docIdServer.getDocId(webURL.getURL());
-                    if (newdocid > 0) {
-                        // This is not the first time that this Url is visited. So, we set the
-                        // depth to a negative number.
-                        webURL.setDepth((short) -1);
-                        webURL.setDocid(newdocid);
-                    } else {
-                        webURL.setDocid(-1);
-                        webURL.setDepth((short) (curURL.getDepth() + 1));
-                        if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
-                            if (shouldVisit(page, webURL)) {
-                                if (robotstxtServer.allows(webURL)) {
-                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
-                                    toSchedule.add(webURL);
+                if (frontier.canFetchPages()) {
+                    ParseData parseData = page.getParseData();
+                    List<WebURL> toSchedule = new ArrayList<>();
+                    int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
+                    // This is not thread safe. Other threads may reduce the remaining count.
+                    long remaining = frontier.numberToReachMaxPagesToFetch();
+                    for (WebURL webURL : parseData.getOutgoingUrls()) {
+                        if (remaining == 0) {
+                            logger.debug("Ignoring remaining links in page {}, "
+                                             + "because maxPagesToFetch was reached",
+                                         page.getWebURL().getURL());
+                            break;
+                        }
+                        webURL.setParentDocid(curURL.getDocid());
+                        webURL.setParentUrl(curURL.getURL());
+                        int newdocid = docIdServer.getDocId(webURL.getURL());
+                        if (newdocid > 0) {
+                            // This is not the first time that this Url is visited. So, we set the
+                            // depth to a negative number.
+                            webURL.setDepth((short) -1);
+                            webURL.setDocid(newdocid);
+                        } else {
+                            webURL.setDocid(-1);
+                            webURL.setDepth((short) (curURL.getDepth() + 1));
+                            if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
+                                if (shouldVisit(page, webURL)) {
+                                    if (robotstxtServer.allows(webURL)) {
+                                        webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
+                                        remaining--;
+                                        toSchedule.add(webURL);
+                                    } else {
+                                        logger.debug(
+                                            "Not visiting: {} as per the server's \"robots.txt\" " +
+                                            "policy", webURL.getURL());
+                                    }
                                 } else {
                                     logger.debug(
-                                        "Not visiting: {} as per the server's \"robots.txt\" " +
-                                        "policy", webURL.getURL());
+                                        "Not visiting: {} as per your \"shouldVisit\" policy",
+                                        webURL.getURL());
                                 }
-                            } else {
-                                logger.debug(
-                                    "Not visiting: {} as per your \"shouldVisit\" policy",
-                                    webURL.getURL());
                             }
                         }
                     }
+                    frontier.scheduleAll(toSchedule);
+                } else {
+                    logger.debug("Not looking for links in page {}, "
+                                     + "because maxPagesToFetch was reached",
+                                 page.getWebURL().getURL());
                 }
-                frontier.scheduleAll(toSchedule);
+
             } else {
                 logger.debug("Not looking for links in page {}, "
                                  + "as per your \"shouldFollowLinksInPage\" policy",
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
index d80ebdf5a..e525727bb 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
@@ -161,6 +161,23 @@ public void getNextURLs(int max, List<WebURL> result) {
         }
     }
 
+    public boolean canFetchPages() {
+        int maxPagesToFetch = config.getMaxPagesToFetch();
+        return maxPagesToFetch < 0 || scheduledPages < maxPagesToFetch;
+    }
+
+    public long numberToReachMaxPagesToFetch() {
+        int maxPagesToFetch = config.getMaxPagesToFetch();
+      if (maxPagesToFetch < 0) {
+          return -1;
+      }
+      long remaining = maxPagesToFetch - scheduledPages;
+      if (remaining < 0) {
+          return 0;
+      }
+      return remaining;
+    }
+
     public void setProcessed(WebURL webURL) {
         counters.increment(Counters.ReservedCounterNames.PROCESSED_PAGES);
         if (inProcessPages != null) {

From 2564c098d3445e360ab47380ee450beefb7987e9 Mon Sep 17 00:00:00 2001
From: dgoiko <12698425+dgoiko@users.noreply.github.com>
Date: Fri, 24 Jan 2020 23:17:13 +0100
Subject: [PATCH 2/2] Style fixes

---
 .../uci/ics/crawler4j/crawler/WebCrawler.java    |  2 +-
 .../edu/uci/ics/crawler4j/frontier/Frontier.java | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index 11874ff18..372b6534f 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -517,9 +517,9 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
                     ParseData parseData = page.getParseData();
                     List<WebURL> toSchedule = new ArrayList<>();
                     int maxCrawlDepth = myController.getConfig().getMaxDepthOfCrawling();
-                    // This is not thread safe. Other threads may reduce the remaining count.
                     long remaining = frontier.numberToReachMaxPagesToFetch();
                     for (WebURL webURL : parseData.getOutgoingUrls()) {
+                        // This is not thread safe. Other threads may reduce the remaining count.
                         if (remaining == 0) {
                             logger.debug("Ignoring remaining links in page {}, "
                                              + "because maxPagesToFetch was reached",
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
index e525727bb..1cae8fff7 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
@@ -168,14 +168,14 @@ public boolean canFetchPages() {
 
     public long numberToReachMaxPagesToFetch() {
         int maxPagesToFetch = config.getMaxPagesToFetch();
-      if (maxPagesToFetch < 0) {
-          return -1;
-      }
-      long remaining = maxPagesToFetch - scheduledPages;
-      if (remaining < 0) {
-          return 0;
-      }
-      return remaining;
+        if (maxPagesToFetch < 0) {
+            return -1;
+        }
+        long remaining = maxPagesToFetch - scheduledPages;
+        if (remaining < 0) {
+            return 0;
+        }
+        return remaining;
     }
 
     public void setProcessed(WebURL webURL) {
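
Taken together, the two patches make WebCrawler consult the Frontier's scheduling budget before creating new DocIDs. The fragment below is an illustrative sketch of that call pattern, not part of the patches themselves: it assumes the frontier, docIdServer and parseData references available inside WebCrawler.processPage, and passesFilters() is a hypothetical stand-in for the depth, shouldVisit and robots.txt checks shown above.

    // Sketch only: how the new Frontier helpers are meant to be used.
    // numberToReachMaxPagesToFetch() returns -1 when no limit is configured,
    // so "remaining" never hits 0 in that case and all links are considered.
    if (frontier.canFetchPages()) {
        long remaining = frontier.numberToReachMaxPagesToFetch();
        List<WebURL> toSchedule = new ArrayList<>();
        for (WebURL webURL : parseData.getOutgoingUrls()) {
            if (remaining == 0) {
                break;                           // budget used up; ignore the remaining links
            }
            if (passesFilters(webURL)) {         // hypothetical: depth, shouldVisit, robots.txt
                webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
                remaining--;                     // decrement only when a DocID is actually created
                toSchedule.add(webURL);
            }
        }
        frontier.scheduleAll(toSchedule);
    }

As the first commit message concedes, the budget is only best effort: another thread may schedule pages between the numberToReachMaxPagesToFetch() call and scheduleAll(), so a few extra DocIDs can still be created, but far fewer than without the check.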