Added POST capabilities #419

Open · wants to merge 20 commits into base: master
@@ -21,7 +21,7 @@
* Several core components of crawler4j extend this class
* to make them configurable.
*
* @deprecated This will removed without notice.
* @deprecated This will be removed without notice.
* @author Yasser Ganjisaffar
*/
@Deprecated
@@ -511,20 +511,52 @@ public void addSeed(String pageUrl) throws IOException, InterruptedException {
* @throws IOException
*/
public void addSeed(String pageUrl, int docId) throws IOException, InterruptedException {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
WebURL webUrl = new WebURL();
webUrl.setURL(pageUrl);
webUrl.setDocid(docId);
addSeed(webUrl);
}

/**
* Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
* to extract new URLs in it and follow them for crawling. You can also
* specify a specific document id to be assigned to this seed URL. This
* document id needs to be unique. Note that if you add three seeds with
* document ids 1, 2, and 7, then the next URL found during the crawl will
* get a doc id of 8. You also need to make sure that seeds are added in
* increasing order of document ids.
*
* Specifying doc ids is mainly useful when you have had a previous crawl
* and have stored the results and want to start a new crawl with seeds
* which get the same document ids as the previous crawl.
*
* NOTE: This method modifies the provided WebURL: its URL is replaced by
* its canonical form, its depth is set to 0, and the tldList is attached.
*
* @param pageUrl
* the URL of the seed
*
* @throws InterruptedException
* @throws IOException
*/
public void addSeed(WebURL pageUrl) throws IOException, InterruptedException {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl.getURL());
if (canonicalUrl == null) {
logger.error("Invalid seed URL: {}", pageUrl);
} else {
int docId = pageUrl.getDocid();
pageUrl.setURL(canonicalUrl);
if (docId < 0) {
docId = docIdServer.getDocId(canonicalUrl);
docId = docIdServer.getDocId(pageUrl);
if (docId > 0) {
logger.trace("This URL is already seen.");
return;
}
docId = docIdServer.getNewDocID(canonicalUrl);
docId = docIdServer.getNewDocID(pageUrl);
pageUrl.setDocid(docId);
} else {
try {
docIdServer.addUrlAndDocId(canonicalUrl, docId);
docIdServer.addUrlAndDocId(pageUrl);
} catch (RuntimeException e) {
if (config.isHaltOnError()) {
throw e;
@@ -534,13 +566,10 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
}
}

WebURL webUrl = new WebURL();
webUrl.setTldList(tldList);
webUrl.setURL(canonicalUrl);
webUrl.setDocid(docId);
webUrl.setDepth((short) 0);
if (robotstxtServer.allows(webUrl)) {
frontier.schedule(webUrl);
pageUrl.setTldList(tldList);
pageUrl.setDepth((short) 0);
if (robotstxtServer.allows(pageUrl)) {
frontier.schedule(pageUrl);
} else {
// using the WARN level here, as the user specifically asked to add this seed
logger.warn("Robots.txt does not allow this seed: {}", pageUrl);
@@ -564,14 +593,38 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
* the document id that you want to be assigned to this URL.
* @throws UnsupportedEncodingException
*
*
*/
public void addSeenUrl(String url, int docId) throws UnsupportedEncodingException {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(url);
WebURL webUrl = new WebURL();
webUrl.setURL(url);
webUrl.setDocid(docId);
addSeenUrl(webUrl);
}

/**
* This function can be called to assign a specific document id to a URL. This
* feature is useful when you have had a previous crawl, have stored the
* URLs and their associated document ids, and want a new crawl which is
* aware of the previously seen URLs and won't re-crawl them.
*
* Note that if you add three seen URLs with document ids 1, 2, and 7, then
* the next URL found during the crawl will get a doc id of 8. You also need
* to make sure to add seen URLs in increasing order of document ids.
*
* @param url
* the URL of the page
* @throws UnsupportedEncodingException
*
*/
public void addSeenUrl(WebURL url) throws UnsupportedEncodingException {
String canonicalUrl = URLCanonicalizer.getCanonicalURL(url.getURL());
if (canonicalUrl == null) {
logger.error("Invalid Url: {} (can't cannonicalize it!)", url);
} else {
url.setURL(canonicalUrl);
try {
docIdServer.addUrlAndDocId(canonicalUrl, docId);
docIdServer.addUrlAndDocId(url);
} catch (RuntimeException e) {
if (config.isHaltOnError()) {
throw e;
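Usage note (not part of the diff): a minimal sketch of the new WebURL-based seeding API. Only the getters isPost() and getParamsPost() appear in this changeset, so the POST setters below are assumptions and are left commented out; check the WebURL changes in this PR for the actual names.

import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.url.WebURL;

class SeedingExample {
    static void addSeeds(CrawlController controller) throws Exception {
        // A POST seed: the fetcher turns it into an HttpPost (see the PageFetcher changes below).
        WebURL postSeed = new WebURL();
        postSeed.setURL("https://www.example.com/search");
        postSeed.setDocid(-1); // addSeed(WebURL) only auto-assigns a doc id when docid < 0
        // postSeed.setPost(true);              // assumed setter, not shown in this diff
        // postSeed.setParamsPost(formParams);  // assumed setter, not shown in this diff
        controller.addSeed(postSeed);

        // Re-register a URL from a previous crawl so it is not crawled again;
        // seen URLs must be added in increasing order of document ids.
        WebURL seen = new WebURL();
        seen.setURL("https://www.example.com/old-page");
        seen.setDocid(7);
        controller.addSeenUrl(seen);
    }
}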
@@ -448,23 +448,24 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
onRedirectedStatusCode(page);

if (myController.getConfig().isFollowRedirects()) {
int newDocId = docIdServer.getDocId(movedToUrl);
WebURL webURL = new WebURL();
webURL.setURL(movedToUrl);

int newDocId = docIdServer.getDocId(webURL);
if (newDocId > 0) {
logger.debug("Redirect page: {} is already seen", curURL);
return;
}

WebURL webURL = new WebURL();
webURL.setTldList(myController.getTldList());
webURL.setURL(movedToUrl);
webURL.setParentDocid(curURL.getParentDocid());
webURL.setParentUrl(curURL.getParentUrl());
webURL.setDepth(curURL.getDepth());
webURL.setDocid(-1);
webURL.setAnchor(curURL.getAnchor());
if (shouldVisit(page, webURL)) {
if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
webURL.setDocid(docIdServer.getNewDocID(webURL));
frontier.schedule(webURL);
} else {
logger.debug(
@@ -489,13 +490,20 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
}

} else { // if status code is 200
if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
String fetchedUrl;
WebURL fetchedWebURL = fetchResult.getFetchedWebUrl();
if (fetchedWebURL != null) {
fetchedUrl = fetchedWebURL.getURL();
} else {
fetchedUrl = null;
}
if (!curURL.getURL().equals(fetchedUrl)) {
if (docIdServer.isSeenBefore(fetchResult.getFetchedWebUrl())) {
logger.debug("Redirect page: {} has already been seen", curURL);
return;
}
curURL.setURL(fetchResult.getFetchedUrl());
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
curURL.setURL(fetchedUrl);
curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedWebUrl()));
}

if (!fetchResult.fetchContent(page,
@@ -519,7 +527,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
for (WebURL webURL : parseData.getOutgoingUrls()) {
webURL.setParentDocid(curURL.getDocid());
webURL.setParentUrl(curURL.getURL());
int newdocid = docIdServer.getDocId(webURL.getURL());
int newdocid = docIdServer.getDocId(webURL);
if (newdocid > 0) {
// This is not the first time that this Url is visited. So, we set the
// depth to a negative number.
@@ -531,7 +539,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
if (shouldVisit(page, webURL)) {
if (robotstxtServer.allows(webURL)) {
webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
webURL.setDocid(docIdServer.getNewDocID(webURL));
toSchedule.add(webURL);
} else {
logger.debug(
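Usage note (not part of the diff): since the POST flag travels with the WebURL that the crawler processes, a crawler subclass can distinguish POST results in visit(). An illustrative sketch, assuming the flag is still set on the Page's WebURL at visit time:

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;

public class PostAwareCrawler extends WebCrawler {

    @Override
    public void visit(Page page) {
        // isPost() is the flag PageFetcher uses to decide between HttpGet and HttpPost.
        if (page.getWebURL().isPost()) {
            logger.info("Visited POST result: {}", page.getWebURL().getURL());
        } else {
            logger.info("Visited: {}", page.getWebURL().getURL());
        }
    }
}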
@@ -27,6 +27,7 @@
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.url.WebURL;

/**
* @author Yasser Ganjisaffar
@@ -40,6 +41,7 @@ public class PageFetchResult {
protected HttpEntity entity = null;
protected Header[] responseHeaders = null;
protected String fetchedUrl = null;
protected WebURL fetchedWebUrl = null;
protected String movedToUrl = null;

public PageFetchResult(boolean haltOnError) {
@@ -70,12 +72,30 @@ public void setResponseHeaders(Header[] responseHeaders) {
this.responseHeaders = responseHeaders;
}

@Deprecated
public String getFetchedUrl() {
return fetchedUrl;
}

@Deprecated
public void setFetchedUrl(String fetchedUrl) {
this.fetchedUrl = fetchedUrl;
WebURL fetchedWebURL = new WebURL();
fetchedWebURL.setURL(fetchedUrl);
setFetchedWebUrl(fetchedWebURL);
}

public WebURL getFetchedWebUrl() {
return fetchedWebUrl;
}

public void setFetchedWebUrl(WebURL fetchedWebUrl) {
this.fetchedWebUrl = fetchedWebUrl;
// Compatibility until deprecated methods are deleted
if (fetchedWebUrl != null) {
this.fetchedUrl = fetchedWebUrl.getURL();
} else {
this.fetchedUrl = null;
}
}

public boolean fetchContent(Page page, int maxBytes) throws SocketTimeoutException, IOException {
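Migration note (not part of the diff): callers of the deprecated string accessor keep working through the compatibility field kept in sync above, but the WebURL accessor is now the primary one. A small sketch of the intended replacement:

import edu.uci.ics.crawler4j.fetcher.PageFetchResult;
import edu.uci.ics.crawler4j.url.WebURL;

class FetchedUrlMigration {
    // Prefer getFetchedWebUrl(); fall back to the deprecated string accessor.
    static String fetchedUrlOf(PageFetchResult result) {
        WebURL fetchedWebUrl = result.getFetchedWebUrl();
        return (fetchedWebUrl != null) ? fetchedWebUrl.getURL() : result.getFetchedUrl();
    }
}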
@@ -255,10 +255,12 @@ public PageFetchResult fetchPage(WebURL webUrl)
throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
// Getting URL, setting headers & content
PageFetchResult fetchResult = new PageFetchResult(config.isHaltOnError());
String toFetchURL = webUrl.getURL();
String toFetchURL;
HttpUriRequest request = null;
try {
request = newHttpUriRequest(toFetchURL);
request = newHttpUriRequest(webUrl);
toFetchURL = request.getURI().toString();
webUrl.setURL(toFetchURL);
if (config.getPolitenessDelay() > 0) {
// Applying Politeness delay
synchronized (mutex) {
@@ -293,11 +295,12 @@ public PageFetchResult fetchPage(WebURL webUrl)
fetchResult.setMovedToUrl(movedToUrl);
}
} else if (statusCode >= 200 && statusCode <= 299) { // is 2XX, everything looks ok
fetchResult.setFetchedUrl(toFetchURL);
fetchResult.setFetchedWebUrl(webUrl);
String uri = request.getURI().toString();
if (!uri.equals(toFetchURL)) {
if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
fetchResult.setFetchedUrl(uri);
webUrl.setURL(uri);
fetchResult.setFetchedWebUrl(webUrl);
}
}

@@ -345,10 +348,34 @@ public synchronized void shutDown() {
* @param url the url to be fetched
* @return the HttpUriRequest for the given url
*/
@Deprecated
protected HttpUriRequest newHttpUriRequest(String url) {
return new HttpGet(url);
}

/**
* Creates a new HttpUriRequest for the given url. By default a plain HttpGet is created
* for regular URLs and a HttpPost with url-encoded form parameters for URLs marked as
* POST. Subclasses may override this method and provide their own logic.
*
* @param url the url to be fetched
* @return the HttpUriRequest for the given url
*/
protected HttpUriRequest newHttpUriRequest(WebURL url) {
if (!url.isPost()) {
return this.newHttpUriRequest(url.getURL());
}
HttpPost req = new HttpPost(url.getURL());
if (url.getParamsPost() == null || url.getParamsPost().isEmpty()) {
return req;
}
List<BasicNameValuePair> pairs = url.getParamsPost().getAsList();
if (pairs != null && pairs.size() > 0) {
// Strictly unnecessary check, kept as a defensive guard.
req.setEntity(new UrlEncodedFormEntity(pairs, StandardCharsets.UTF_8));
}
return req;
}

protected CrawlConfig getConfig() {
return config;
}
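Usage note (not part of the diff): since newHttpUriRequest(WebURL) stays protected and is documented as an extension point, a subclass can still decorate the request while keeping the GET/POST dispatch above. A hedged sketch; the header is purely illustrative and the constructor signature should be matched to the PageFetcher constructor of the crawler4j version in use:

import org.apache.http.client.methods.HttpUriRequest;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.url.WebURL;

public class HeaderAddingPageFetcher extends PageFetcher {

    public HeaderAddingPageFetcher(CrawlConfig config) {
        super(config);
    }

    @Override
    protected HttpUriRequest newHttpUriRequest(WebURL url) {
        // The default implementation picks HttpGet or HttpPost based on url.isPost().
        HttpUriRequest request = super.newHttpUriRequest(url);
        request.addHeader("X-Example-Header", "demo"); // illustrative header only
        return request;
    }
}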
@@ -28,6 +28,7 @@
import com.sleepycat.je.OperationStatus;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.url.WebURL;
import edu.uci.ics.crawler4j.util.Util;

/**
@@ -68,6 +69,7 @@ public DocIDServer(Environment env, CrawlConfig config) {
* @param url the URL for which the docid is returned.
* @return the docid of the url if it is seen before. Otherwise -1 is returned.
*/
@Deprecated
public int getDocId(String url) {
synchronized (mutex) {
OperationStatus result = null;
@@ -93,6 +95,17 @@ public int getDocId(String url) {
}
}

/**
* Returns the docid of an already seen url.
*
* @param url the URL for which the docid is returned.
* @return the docid of the url if it is seen before. Otherwise -1 is returned.
*/
public int getDocId(WebURL url) {
return getDocId(url.encode());
}

@Deprecated
public int getNewDocID(String url) {
synchronized (mutex) {
try {
@@ -117,6 +130,11 @@ public int getNewDocID(String url) {
}
}

public int getNewDocID(WebURL url) {
return getNewDocID(url.encode());
}

@Deprecated
public void addUrlAndDocId(String url, int docId) {
synchronized (mutex) {
if (docId <= lastDocID) {
@@ -139,10 +157,19 @@ public void addUrlAndDocId(String url, int docId) {
}
}

public void addUrlAndDocId(WebURL url) {
addUrlAndDocId(url.encode(), url.getDocid());
}

@Deprecated
public boolean isSeenBefore(String url) {
return getDocId(url) != -1;
}

public boolean isSeenBefore(WebURL url) {
return isSeenBefore(url.encode());
}

public final int getDocCount() {
try {
return (int) docIDsDB.count();
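Usage note (not part of the diff): the new overloads key the doc-id store by url.encode() rather than the raw URL string. Presumably that encoding also folds in the POST marker and parameters so a GET and a POST to the same address can receive distinct doc ids, but that depends on how WebURL.encode() is implemented elsewhere in this PR. A hedged sketch of the intended call pattern:

import edu.uci.ics.crawler4j.frontier.DocIDServer;
import edu.uci.ics.crawler4j.url.WebURL;

class DocIdExample {
    // Assign a doc id to a WebURL using the WebURL-keyed overloads added here.
    static void assignDocId(DocIDServer docIdServer, WebURL url) {
        if (docIdServer.isSeenBefore(url)) {
            url.setDocid(docIdServer.getDocId(url));    // reuse the existing id
        } else {
            url.setDocid(docIdServer.getNewDocID(url)); // allocate a fresh id
        }
    }
}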