diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java
index cdfa1a344..ea97f1ef3 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java
@@ -13,6 +13,7 @@
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
+ *
  */

 package edu.uci.ics.crawler4j.crawler;
@@ -21,7 +22,7 @@
  * Several core components of crawler4j extend this class
  * to make them configurable.
  *
- * @deprecated This will removed without notice.
+ * @deprecated This will be removed without notice.
  * @author Yasser Ganjisaffar
  */
 @Deprecated
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
index 2f7b8b321..7cdec287d 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
@@ -30,10 +30,13 @@
 import com.sleepycat.je.Environment;
 import com.sleepycat.je.EnvironmentConfig;

-import edu.uci.ics.crawler4j.fetcher.PageFetcher;
+import edu.uci.ics.crawler4j.fetcher.PageFetcherInterface;
 import edu.uci.ics.crawler4j.frontier.DocIDServer;
+import edu.uci.ics.crawler4j.frontier.DocIDServerInterface;
 import edu.uci.ics.crawler4j.frontier.Frontier;
+import edu.uci.ics.crawler4j.frontier.FrontierInterface;
 import edu.uci.ics.crawler4j.parser.Parser;
+import edu.uci.ics.crawler4j.parser.ParserInterface;
 import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
 import edu.uci.ics.crawler4j.url.TLDList;
 import edu.uci.ics.crawler4j.url.URLCanonicalizer;
@@ -75,28 +78,28 @@ public class CrawlController {
      */
     protected boolean shuttingDown;

-    protected PageFetcher pageFetcher;
+    protected PageFetcherInterface pageFetcher;
     protected RobotstxtServer robotstxtServer;
-    protected Frontier frontier;
-    protected DocIDServer docIdServer;
+    protected FrontierInterface frontier;
+    protected DocIDServerInterface docIdServer;
     protected TLDList tldList;

     protected final Object waitingLock = new Object();
     protected final Environment env;

-    protected Parser parser;
+    protected ParserInterface parser;

-    public CrawlController(CrawlConfig config, PageFetcher pageFetcher,
+    public CrawlController(CrawlConfig config, PageFetcherInterface pageFetcher,
                            RobotstxtServer robotstxtServer) throws Exception {
         this(config, pageFetcher, null, robotstxtServer, null);
     }

-    public CrawlController(CrawlConfig config, PageFetcher pageFetcher,
+    public CrawlController(CrawlConfig config, PageFetcherInterface pageFetcher,
                            RobotstxtServer robotstxtServer, TLDList tldList) throws Exception {
         this(config, pageFetcher, null, robotstxtServer, tldList);
     }

-    public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parser,
+    public CrawlController(CrawlConfig config, PageFetcherInterface pageFetcher, ParserInterface parser,
                            RobotstxtServer robotstxtServer, TLDList tldList) throws Exception {
         config.validate();
         this.config = config;
@@ -153,7 +156,7 @@ public CrawlController(CrawlConfig config, PageFetcher pageFetcher, Parser parse
         robotstxtServer.setCrawlConfig(config);
     }

-    public Parser getParser() {
+    public ParserInterface getParser() {
         return parser;
     }
@@ -582,11 +585,11 @@ public void addSeenUrl(String url, int docId) throws UnsupportedEncodingExceptio
         }
     }

-    public PageFetcher getPageFetcher() {
+    public PageFetcherInterface getPageFetcher() {
         return pageFetcher;
     }

-    public void setPageFetcher(PageFetcher pageFetcher) {
+    public void setPageFetcher(PageFetcherInterface pageFetcher) {
         this.pageFetcher = pageFetcher;
     }
@@ -598,19 +601,19 @@ public void setRobotstxtServer(RobotstxtServer robotstxtServer) {
         this.robotstxtServer = robotstxtServer;
     }

-    public Frontier getFrontier() {
+    public FrontierInterface getFrontier() {
         return frontier;
     }

-    public void setFrontier(Frontier frontier) {
+    public void setFrontier(FrontierInterface frontier) {
         this.frontier = frontier;
     }

-    public DocIDServer getDocIdServer() {
+    public DocIDServerInterface getDocIdServer() {
         return docIdServer;
     }

-    public void setDocIdServer(DocIDServer docIdServer) {
+    public void setDocIdServer(DocIDServerInterface docIdServer) {
         this.docIdServer = docIdServer;
     }
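With the constructors and accessors above typed against the interfaces, callers can supply any conforming implementation. A minimal wiring sketch under that assumption — the no-op lambda is hypothetical and only works because ParserInterface (added later in this patch) declares a single abstract method; the PageFetcher(CrawlConfig) and RobotstxtConfig() constructors are the existing crawler4j API:

    CrawlConfig config = new CrawlConfig();
    PageFetcherInterface pageFetcher = new PageFetcher(config);
    RobotstxtServer robotstxtServer = new RobotstxtServer(new RobotstxtConfig(), pageFetcher);

    // Any ParserInterface implementation can be injected through the
    // five-argument constructor; a lambda suffices for a stub.
    ParserInterface noOpParser = (page, contextURL) -> { /* skip parsing */ };
    CrawlController controller =
        new CrawlController(config, pageFetcher, noOpParser, robotstxtServer, null);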
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index 6f7c6573b..f143cc9bd 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -32,13 +32,13 @@
 import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
 import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
 import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
-import edu.uci.ics.crawler4j.fetcher.PageFetcher;
-import edu.uci.ics.crawler4j.frontier.DocIDServer;
-import edu.uci.ics.crawler4j.frontier.Frontier;
+import edu.uci.ics.crawler4j.fetcher.PageFetcherInterface;
+import edu.uci.ics.crawler4j.frontier.DocIDServerInterface;
+import edu.uci.ics.crawler4j.frontier.FrontierInterface;
 import edu.uci.ics.crawler4j.parser.HtmlParseData;
 import edu.uci.ics.crawler4j.parser.NotAllowedContentException;
 import edu.uci.ics.crawler4j.parser.ParseData;
-import edu.uci.ics.crawler4j.parser.Parser;
+import edu.uci.ics.crawler4j.parser.ParserInterface;
 import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
 import edu.uci.ics.crawler4j.url.WebURL;
@@ -71,12 +71,12 @@ public class WebCrawler implements Runnable {
     /**
      * The parser that is used by this crawler instance to parse the content of the fetched pages.
      */
-    private Parser parser;
+    private ParserInterface parser;

     /**
      * The fetcher that is used by this crawler instance to fetch the content of pages from the web.
      */
-    private PageFetcher pageFetcher;
+    private PageFetcherInterface pageFetcher;

     /**
      * The RobotstxtServer instance that is used by this crawler instance to
@@ -87,12 +87,12 @@ public class WebCrawler implements Runnable {
     /**
      * The DocIDServer that is used by this crawler instance to map each URL to a unique docid.
     */
-    private DocIDServer docIdServer;
+    private DocIDServerInterface docIdServer;

     /**
      * The Frontier object that manages the crawl queue.
      */
-    private Frontier frontier;
+    private FrontierInterface frontier;

     /**
      * Is the current crawler instance waiting for new URLs? This field is
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
index 9b42a5a8f..4fb2a4ba0 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
@@ -78,7 +78,7 @@
 /**
  * @author Yasser Ganjisaffar
  */
-public class PageFetcher {
+public class PageFetcher implements PageFetcherInterface {
     protected static final Logger logger = LoggerFactory.getLogger(PageFetcher.class);
     protected final Object mutex = new Object();
     /**
@@ -251,6 +251,7 @@ private void doFormLogin(FormAuthInfo authInfo) {
         }
     }

+    @Override
     public PageFetchResult fetchPage(WebURL webUrl)
         throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
         // Getting URL, setting headers & content
@@ -331,6 +332,7 @@ public PageFetchResult fetchPage(WebURL webUrl)
         }
     }

+    @Override
     public synchronized void shutDown() {
         if (connectionMonitorThread != null) {
             connectionManager.shutdown();
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcherInterface.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcherInterface.java
new file mode 100644
index 000000000..e1a52c29f
--- /dev/null
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcherInterface.java
@@ -0,0 +1,15 @@
+package edu.uci.ics.crawler4j.fetcher;
+
+import java.io.IOException;
+
+import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
+import edu.uci.ics.crawler4j.url.WebURL;
+
+public interface PageFetcherInterface {
+
+    PageFetchResult fetchPage(WebURL webUrl) throws InterruptedException, IOException,
+        PageBiggerThanMaxSizeException;
+
+    void shutDown();
+
+}
\ No newline at end of file
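The extracted PageFetcherInterface makes the fetcher swappable. A sketch of what that enables — LoggingPageFetcher is a hypothetical decorator, not part of this patch; it wraps any other implementation, e.g. the default PageFetcher:

    package edu.uci.ics.crawler4j.fetcher;

    import java.io.IOException;

    import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
    import edu.uci.ics.crawler4j.url.WebURL;

    // Hypothetical decorator: logs every fetch, then delegates to the
    // wrapped PageFetcherInterface implementation.
    public class LoggingPageFetcher implements PageFetcherInterface {

        private final PageFetcherInterface delegate;

        public LoggingPageFetcher(PageFetcherInterface delegate) {
            this.delegate = delegate;
        }

        @Override
        public PageFetchResult fetchPage(WebURL webUrl)
            throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
            System.out.println("fetching " + webUrl.getURL());
            return delegate.fetchPage(webUrl);
        }

        @Override
        public void shutDown() {
            delegate.shutDown();
        }
    }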
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java
index fe3d5b8a2..e6c63e596 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java
@@ -34,7 +34,7 @@
  * @author Yasser Ganjisaffar
  */

-public class DocIDServer {
+public class DocIDServer implements DocIDServerInterface {
     private static final Logger logger = LoggerFactory.getLogger(DocIDServer.class);

     private final Database docIDsDB;
@@ -68,6 +68,7 @@ public DocIDServer(Environment env, CrawlConfig config) {
      * @param url the URL for which the docid is returned.
      * @return the docid of the url if it is seen before. Otherwise -1 is returned.
      */
+    @Override
     public int getDocId(String url) {
         synchronized (mutex) {
             OperationStatus result = null;
@@ -93,6 +94,7 @@ public int getDocId(String url) {
         }
     }

+    @Override
     public int getNewDocID(String url) {
         synchronized (mutex) {
             try {
@@ -117,6 +119,7 @@ public int getNewDocID(String url) {
         }
     }

+    @Override
     public void addUrlAndDocId(String url, int docId) {
         synchronized (mutex) {
             if (docId <= lastDocID) {
@@ -139,10 +142,12 @@ public void addUrlAndDocId(String url, int docId) {
         }
     }

+    @Override
     public boolean isSeenBefore(String url) {
         return getDocId(url) != -1;
     }

+    @Override
     public final int getDocCount() {
         try {
             return (int) docIDsDB.count();
@@ -152,6 +157,7 @@ public final int getDocCount() {
         }
     }

+    @Override
     public void close() {
         try {
             docIDsDB.close();
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServerInterface.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServerInterface.java
new file mode 100644
index 000000000..f695dd51a
--- /dev/null
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServerInterface.java
@@ -0,0 +1,23 @@
+package edu.uci.ics.crawler4j.frontier;
+
+public interface DocIDServerInterface {
+
+    /**
+     * Returns the docid of an already seen url.
+     *
+     * @param url the URL for which the docid is returned.
+     * @return the docid of the url if it is seen before. Otherwise -1 is returned.
+     */
+    int getDocId(String url);
+
+    int getNewDocID(String url);
+
+    void addUrlAndDocId(String url, int docId);
+
+    boolean isSeenBefore(String url);
+
+    int getDocCount();
+
+    void close();
+
+}
\ No newline at end of file
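DocIDServerInterface likewise decouples callers from the Berkeley DB backend. A sketch of an alternative implementation — InMemoryDocIDServer is hypothetical, e.g. for unit tests; its getNewDocID mirrors DocIDServer's behavior of reusing an existing docid:

    package edu.uci.ics.crawler4j.frontier;

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.atomic.AtomicInteger;

    // Hypothetical in-memory docid store; needs no database environment.
    public class InMemoryDocIDServer implements DocIDServerInterface {

        private final Map<String, Integer> docIds = new ConcurrentHashMap<>();
        private final AtomicInteger lastDocId = new AtomicInteger();

        @Override
        public int getDocId(String url) {
            return docIds.getOrDefault(url, -1);
        }

        @Override
        public int getNewDocID(String url) {
            // reuse an existing docid, otherwise assign the next one
            return docIds.computeIfAbsent(url, u -> lastDocId.incrementAndGet());
        }

        @Override
        public void addUrlAndDocId(String url, int docId) {
            docIds.put(url, docId);
            lastDocId.updateAndGet(last -> Math.max(last, docId));
        }

        @Override
        public boolean isSeenBefore(String url) {
            return docIds.containsKey(url);
        }

        @Override
        public int getDocCount() {
            return docIds.size();
        }

        @Override
        public void close() {
            // nothing to release for the in-memory variant
        }
    }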
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
index d80ebdf5a..202d57aad 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/Frontier.java
@@ -32,7 +32,7 @@
 * @author Yasser Ganjisaffar
 */

-public class Frontier {
+public class Frontier implements FrontierInterface {
     protected static final Logger logger = LoggerFactory.getLogger(Frontier.class);

     private static final String DATABASE_NAME = "PendingURLsDB";
@@ -82,6 +82,7 @@ public Frontier(Environment env, CrawlConfig config) {
         }
     }

+    @Override
     public void scheduleAll(List<WebURL> urls) {
         int maxPagesToFetch = config.getMaxPagesToFetch();
         synchronized (mutex) {
@@ -109,6 +110,7 @@ public void scheduleAll(List<WebURL> urls) {
         }
     }

+    @Override
     public void schedule(WebURL url) {
         int maxPagesToFetch = config.getMaxPagesToFetch();
         synchronized (mutex) {
@@ -124,6 +126,7 @@ public void schedule(WebURL url) {
         }
     }

+    @Override
     public void getNextURLs(int max, List<WebURL> result) {
         while (true) {
             synchronized (mutex) {
@@ -161,6 +164,7 @@ public void getNextURLs(int max, List<WebURL> result) {
         }
     }

+    @Override
     public void setProcessed(WebURL webURL) {
         counters.increment(Counters.ReservedCounterNames.PROCESSED_PAGES);
         if (inProcessPages != null) {
@@ -170,10 +174,12 @@ public void setProcessed(WebURL webURL) {
         }
     }

+    @Override
     public long getQueueLength() {
         return workQueues.getLength();
     }

+    @Override
     public long getNumberOfAssignedPages() {
         if (inProcessPages != null) {
             return inProcessPages.getLength();
@@ -182,18 +188,22 @@ public long getNumberOfAssignedPages() {
         }
     }

+    @Override
     public long getNumberOfProcessedPages() {
         return counters.getValue(Counters.ReservedCounterNames.PROCESSED_PAGES);
     }

+    @Override
     public long getNumberOfScheduledPages() {
         return counters.getValue(Counters.ReservedCounterNames.SCHEDULED_PAGES);
     }

+    @Override
     public boolean isFinished() {
         return isFinished;
     }

+    @Override
     public void close() {
         workQueues.close();
         counters.close();
@@ -202,6 +212,7 @@ public void close() {
         }
     }

+    @Override
     public void finish() {
         isFinished = true;
         synchronized (waitingList) {
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/FrontierInterface.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/FrontierInterface.java
new file mode 100644
index 000000000..9755c1ead
--- /dev/null
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/FrontierInterface.java
@@ -0,0 +1,31 @@
+package edu.uci.ics.crawler4j.frontier;
+
+import java.util.List;
+
+import edu.uci.ics.crawler4j.url.WebURL;
+
+public interface FrontierInterface {
+
+    void scheduleAll(List<WebURL> urls);
+
+    void schedule(WebURL url);
+
+    void getNextURLs(int max, List<WebURL> result);
+
+    void setProcessed(WebURL webURL);
+
+    long getQueueLength();
+
+    long getNumberOfAssignedPages();
+
+    long getNumberOfProcessedPages();
+
+    long getNumberOfScheduledPages();
+
+    boolean isFinished();
+
+    void close();
+
+    void finish();
+
+}
\ No newline at end of file
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
index b1aefad0f..b5f688714 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java
@@ -31,7 +31,7 @@
 /**
  * @author Yasser Ganjisaffar
 */
-public class Parser {
+public class Parser implements ParserInterface {

     private static final Logger logger = LoggerFactory.getLogger(Parser.class);

@@ -61,6 +61,7 @@ public Parser(CrawlConfig config, HtmlParser htmlParser, TLDList tldList) {
         this.net = new Net(config, tldList);
     }

+    @Override
     public void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException {
         if (Util.hasBinaryContent(page.getContentType())) { // BINARY
             BinaryParseData parseData = new BinaryParseData();
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ParserInterface.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ParserInterface.java
new file mode 100644
index 000000000..69d1feb1a
--- /dev/null
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/parser/ParserInterface.java
@@ -0,0 +1,10 @@
+package edu.uci.ics.crawler4j.parser;
+
+import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
+
+public interface ParserInterface {
+
+    void parse(Page page, String contextURL) throws NotAllowedContentException, ParseException;
+
+}
\ No newline at end of file
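With Frontier and Parser also behind interfaces, monitoring code can depend on the contract alone. A short sketch, assuming an already constructed CrawlController and an SLF4J logger in scope (as used throughout the codebase):

    // Crawl progress read through FrontierInterface; identical for the
    // Berkeley DB Frontier or any custom implementation.
    FrontierInterface frontier = controller.getFrontier();
    logger.info("{} queued, {} assigned, {} processed",
        frontier.getQueueLength(),
        frontier.getNumberOfAssignedPages(),
        frontier.getNumberOfProcessedPages());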
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtServer.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtServer.java
index e8bfba806..4bbdf5c40 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtServer.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/robotstxt/RobotstxtServer.java
@@ -33,7 +33,7 @@
 import edu.uci.ics.crawler4j.crawler.Page;
 import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
 import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
-import edu.uci.ics.crawler4j.fetcher.PageFetcher;
+import edu.uci.ics.crawler4j.fetcher.PageFetcherInterface;
 import edu.uci.ics.crawler4j.url.WebURL;
 import edu.uci.ics.crawler4j.util.Util;
@@ -50,15 +50,15 @@ public class RobotstxtServer {
     protected final Map<String, HostDirectives> host2directivesCache = new HashMap<>();

-    protected PageFetcher pageFetcher;
+    protected PageFetcherInterface pageFetcher;

     private final int maxBytes;

-    public RobotstxtServer(RobotstxtConfig config, PageFetcher pageFetcher) {
+    public RobotstxtServer(RobotstxtConfig config, PageFetcherInterface pageFetcher) {
         this(config, pageFetcher, 16384);
     }

-    public RobotstxtServer(RobotstxtConfig config, PageFetcher pageFetcher, int maxBytes) {
+    public RobotstxtServer(RobotstxtConfig config, PageFetcherInterface pageFetcher, int maxBytes) {
         this.config = config;
         this.pageFetcher = pageFetcher;
         this.maxBytes = maxBytes;
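Taken together, the refactoring keeps the default wiring unchanged while opening every collaborator for substitution. An end-to-end sketch under the new signatures — addSeed, start, and setCrawlStorageFolder are the existing crawler4j API; MyCrawler stands in for any WebCrawler subclass:

    CrawlConfig config = new CrawlConfig();
    config.setCrawlStorageFolder("/tmp/crawl");

    // Any PageFetcherInterface works here; the default PageFetcher shown.
    PageFetcherInterface fetcher = new PageFetcher(config);
    RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);
    CrawlController controller = new CrawlController(config, fetcher, robots);

    controller.addSeed("https://www.ics.uci.edu/");
    controller.start(MyCrawler.class, 8);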