diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java
index cdfa1a344..ecf14f323 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Configurable.java
@@ -21,7 +21,7 @@
  * Several core components of crawler4j extend this class
  * to make them configurable.
  *
- * @deprecated This will removed without notice.
+ * @deprecated This will be removed without notice.
  * @author Yasser Ganjisaffar
  */
 @Deprecated
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
index 2f7b8b321..5cbaa760d 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlController.java
@@ -511,20 +511,52 @@ public void addSeed(String pageUrl) throws IOException, InterruptedException {
      * @throws IOException
      */
     public void addSeed(String pageUrl, int docId) throws IOException, InterruptedException {
-        String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl);
+        WebURL webUrl = new WebURL();
+        webUrl.setURL(pageUrl);
+        webUrl.setDocid(docId);
+        addSeed(webUrl);
+    }
+
+    /**
+     * Adds a new seed URL. A seed URL is a URL that is fetched by the crawler
+     * to extract new URLs in it and follow them for crawling. You can also
+     * specify a specific document id to be assigned to this seed URL. This
+     * document id needs to be unique. Also, note that if you add three seeds
+     * with document ids 1, 2, and 7, then the next URL found during the crawl
+     * will get doc id 8. You also need to make sure that seeds are added in
+     * increasing order of document ids.
+     *
+     * Specifying doc ids is mainly useful when you have had a previous crawl
+     * and have stored the results and want to start a new crawl with seeds
+     * which get the same document ids as the previous crawl.
+     *
+     * NOTE: It will modify the provided URL to set it to a canonical form.
+     * It will also set depth 0 and add the tldList to the WebURL.
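+     *
+     * <p>For illustration only, a minimal usage sketch (assuming a configured
+     * {@code CrawlController} instance named {@code controller}):</p>
+     * <pre>{@code
+     * WebURL seed = new WebURL();
+     * seed.setURL("https://www.example.com/"); // hypothetical seed URL
+     * seed.setDocid(-1);                       // let the DocIDServer assign one
+     * controller.addSeed(seed);
+     * }</pre>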
+     *
+     * @param pageUrl
+     *            the URL of the seed
+     *
+     * @throws InterruptedException
+     * @throws IOException
+     */
+    public void addSeed(WebURL pageUrl) throws IOException, InterruptedException {
+        String canonicalUrl = URLCanonicalizer.getCanonicalURL(pageUrl.getURL());
         if (canonicalUrl == null) {
             logger.error("Invalid seed URL: {}", pageUrl);
         } else {
+            int docId = pageUrl.getDocid();
+            pageUrl.setURL(canonicalUrl);
             if (docId < 0) {
-                docId = docIdServer.getDocId(canonicalUrl);
+                docId = docIdServer.getDocId(pageUrl);
                 if (docId > 0) {
                     logger.trace("This URL is already seen.");
                     return;
                 }
-                docId = docIdServer.getNewDocID(canonicalUrl);
+                docId = docIdServer.getNewDocID(pageUrl);
+                pageUrl.setDocid(docId);
             } else {
                 try {
-                    docIdServer.addUrlAndDocId(canonicalUrl, docId);
+                    docIdServer.addUrlAndDocId(pageUrl);
                 } catch (RuntimeException e) {
                     if (config.isHaltOnError()) {
                         throw e;
@@ -534,13 +566,10 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
             }
         }
 
-        WebURL webUrl = new WebURL();
-        webUrl.setTldList(tldList);
-        webUrl.setURL(canonicalUrl);
-        webUrl.setDocid(docId);
-        webUrl.setDepth((short) 0);
-        if (robotstxtServer.allows(webUrl)) {
-            frontier.schedule(webUrl);
+        pageUrl.setTldList(tldList);
+        pageUrl.setDepth((short) 0);
+        if (robotstxtServer.allows(pageUrl)) {
+            frontier.schedule(pageUrl);
         } else {
             // using the WARN level here, as the user specifically asked to add this seed
             logger.warn("Robots.txt does not allow this seed: {}", pageUrl);
@@ -564,14 +593,38 @@ public void addSeed(String pageUrl, int docId) throws IOException, InterruptedEx
      *            the document id that you want to be assigned to this URL.
      * @throws UnsupportedEncodingException
      *
+     *
      */
     public void addSeenUrl(String url, int docId) throws UnsupportedEncodingException {
-        String canonicalUrl = URLCanonicalizer.getCanonicalURL(url);
+        WebURL webUrl = new WebURL();
+        webUrl.setURL(url);
+        webUrl.setDocid(docId);
+        addSeenUrl(webUrl);
+    }
+
+    /**
+     * This function can be called to assign a specific document id to a URL. This
+     * feature is useful when you have had a previous crawl and have stored the
+     * URLs and their associated document ids, and want a new crawl that is aware
+     * of the previously seen URLs and won't re-crawl them.
+     *
+     * Note that if you add three seen URLs with document ids 1, 2, and 7, then
+     * the next URL found during the crawl will get doc id 8. You also need to
+     * make sure that seen URLs are added in increasing order of document ids.
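+     *
+     * <p>For illustration only, a minimal sketch that replays a doc id recorded
+     * by a previous crawl (assuming a configured {@code CrawlController} named
+     * {@code controller}):</p>
+     * <pre>{@code
+     * WebURL seen = new WebURL();
+     * seen.setURL("https://www.example.com/old-page"); // hypothetical URL
+     * seen.setDocid(7); // doc id stored by the previous crawl
+     * controller.addSeenUrl(seen);
+     * }</pre>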
+     *
+     * @param url
+     *            the URL of the page
+     * @throws UnsupportedEncodingException
+     *
+     */
+    public void addSeenUrl(WebURL url) throws UnsupportedEncodingException {
+        String canonicalUrl = URLCanonicalizer.getCanonicalURL(url.getURL());
         if (canonicalUrl == null) {
             logger.error("Invalid Url: {} (can't cannonicalize it!)", url);
         } else {
+            url.setURL(canonicalUrl);
             try {
-                docIdServer.addUrlAndDocId(canonicalUrl, docId);
+                docIdServer.addUrlAndDocId(url);
             } catch (RuntimeException e) {
                 if (config.isHaltOnError()) {
                     throw e;
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
index 6f7c6573b..04de5443a 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/WebCrawler.java
@@ -448,15 +448,16 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
                 onRedirectedStatusCode(page);
 
                 if (myController.getConfig().isFollowRedirects()) {
-                    int newDocId = docIdServer.getDocId(movedToUrl);
+                    WebURL webURL = new WebURL();
+                    webURL.setURL(movedToUrl);
+
+                    int newDocId = docIdServer.getDocId(webURL);
                     if (newDocId > 0) {
                         logger.debug("Redirect page: {} is already seen", curURL);
                         return;
                     }
 
-                    WebURL webURL = new WebURL();
                     webURL.setTldList(myController.getTldList());
-                    webURL.setURL(movedToUrl);
                     webURL.setParentDocid(curURL.getParentDocid());
                     webURL.setParentUrl(curURL.getParentUrl());
                     webURL.setDepth(curURL.getDepth());
@@ -464,7 +465,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
                     webURL.setAnchor(curURL.getAnchor());
                     if (shouldVisit(page, webURL)) {
                         if (!shouldFollowLinksIn(webURL) || robotstxtServer.allows(webURL)) {
-                            webURL.setDocid(docIdServer.getNewDocID(movedToUrl));
+                            webURL.setDocid(docIdServer.getNewDocID(webURL));
                             frontier.schedule(webURL);
                         } else {
                             logger.debug(
@@ -489,13 +490,20 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
                 }
             } else { // if status code is 200
-                if (!curURL.getURL().equals(fetchResult.getFetchedUrl())) {
-                    if (docIdServer.isSeenBefore(fetchResult.getFetchedUrl())) {
+                String fetchedUrl;
+                WebURL fetchedWebURL = fetchResult.getFetchedWebUrl();
+                if (fetchedWebURL != null) {
+                    fetchedUrl = fetchedWebURL.getURL();
+                } else {
+                    fetchedUrl = null;
+                }
+                if (!curURL.getURL().equals(fetchedUrl)) {
+                    if (docIdServer.isSeenBefore(fetchResult.getFetchedWebUrl())) {
                         logger.debug("Redirect page: {} has already been seen", curURL);
                         return;
                     }
-                    curURL.setURL(fetchResult.getFetchedUrl());
-                    curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedUrl()));
+                    curURL.setURL(fetchedUrl);
+                    curURL.setDocid(docIdServer.getNewDocID(fetchResult.getFetchedWebUrl()));
                 }
 
                 if (!fetchResult.fetchContent(page,
@@ -519,7 +527,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
                     for (WebURL webURL : parseData.getOutgoingUrls()) {
                         webURL.setParentDocid(curURL.getDocid());
                         webURL.setParentUrl(curURL.getURL());
-                        int newdocid = docIdServer.getDocId(webURL.getURL());
+                        int newdocid = docIdServer.getDocId(webURL);
                         if (newdocid > 0) {
                             // This is not the first time that this Url is visited. So, we set the
                             // depth to a negative number.
@@ -531,7 +539,7 @@ private void processPage(WebURL curURL) throws IOException, InterruptedException
                         if ((maxCrawlDepth == -1) || (curURL.getDepth() < maxCrawlDepth)) {
                             if (shouldVisit(page, webURL)) {
                                 if (robotstxtServer.allows(webURL)) {
-                                    webURL.setDocid(docIdServer.getNewDocID(webURL.getURL()));
+                                    webURL.setDocid(docIdServer.getNewDocID(webURL));
                                     toSchedule.add(webURL);
                                 } else {
                                     logger.debug(
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetchResult.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetchResult.java
index 5a81b00f9..eafff59b1 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetchResult.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetchResult.java
@@ -27,6 +27,7 @@
 import org.slf4j.LoggerFactory;
 
 import edu.uci.ics.crawler4j.crawler.Page;
+import edu.uci.ics.crawler4j.url.WebURL;
 
 /**
  * @author Yasser Ganjisaffar
@@ -40,6 +41,7 @@ public class PageFetchResult {
     protected HttpEntity entity = null;
     protected Header[] responseHeaders = null;
     protected String fetchedUrl = null;
+    protected WebURL fetchedWebUrl = null;
     protected String movedToUrl = null;
 
     public PageFetchResult(boolean haltOnError) {
@@ -70,12 +72,30 @@ public void setResponseHeaders(Header[] responseHeaders) {
         this.responseHeaders = responseHeaders;
     }
 
+    @Deprecated
     public String getFetchedUrl() {
         return fetchedUrl;
     }
 
+    @Deprecated
     public void setFetchedUrl(String fetchedUrl) {
-        this.fetchedUrl = fetchedUrl;
+        WebURL fetchedWebURL = new WebURL();
+        fetchedWebURL.setURL(fetchedUrl);
+        setFetchedWebUrl(fetchedWebURL);
+    }
+
+    public WebURL getFetchedWebUrl() {
+        return fetchedWebUrl;
+    }
+
+    public void setFetchedWebUrl(WebURL fetchedWebUrl) {
+        this.fetchedWebUrl = fetchedWebUrl;
+        // Compatibility until the deprecated methods are removed
+        if (fetchedWebUrl != null) {
+            this.fetchedUrl = fetchedWebUrl.getURL();
+        } else {
+            this.fetchedUrl = null;
+        }
     }
 
     public boolean fetchContent(Page page, int maxBytes) throws SocketTimeoutException, IOException {
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
index 9b42a5a8f..ace917852 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/fetcher/PageFetcher.java
@@ -255,10 +255,12 @@ public PageFetchResult fetchPage(WebURL webUrl)
         throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
         // Getting URL, setting headers & content
         PageFetchResult fetchResult = new PageFetchResult(config.isHaltOnError());
-        String toFetchURL = webUrl.getURL();
+        String toFetchURL;
         HttpUriRequest request = null;
         try {
-            request = newHttpUriRequest(toFetchURL);
+            request = newHttpUriRequest(webUrl);
+            toFetchURL = request.getURI().toString();
+            webUrl.setURL(toFetchURL);
             if (config.getPolitenessDelay() > 0) {
                 // Applying Politeness delay
                 synchronized (mutex) {
@@ -293,11 +295,12 @@ public PageFetchResult fetchPage(WebURL webUrl)
                     fetchResult.setMovedToUrl(movedToUrl);
                 }
             } else if (statusCode >= 200 && statusCode <= 299) { // is 2XX, everything looks ok
-                fetchResult.setFetchedUrl(toFetchURL);
+                fetchResult.setFetchedWebUrl(webUrl);
                 String uri = request.getURI().toString();
                 if (!uri.equals(toFetchURL)) {
                     if (!URLCanonicalizer.getCanonicalURL(uri).equals(toFetchURL)) {
-                        fetchResult.setFetchedUrl(uri);
+                        webUrl.setURL(uri);
+                        fetchResult.setFetchedWebUrl(webUrl);
                     }
                 }
 
@@ -345,10 +348,34 @@ public synchronized void shutDown() {
      * @param url the url to be fetched
      * @return the HttpUriRequest for the given url
      */
+    @Deprecated
     protected HttpUriRequest newHttpUriRequest(String url) {
         return new HttpGet(url);
     }
 
+    /**
+     * Creates a new HttpUriRequest for the given url. The default is to create an HttpGet without
+     * any further configuration. Subclasses may override this method and provide their own logic.
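+     *
+     * <p>For illustration only, a sketch of a WebURL that will be fetched with a
+     * POST request (the URL and parameter names are hypothetical):</p>
+     * <pre>{@code
+     * WebURL url = new WebURL();
+     * url.setURL("https://www.example.com/search");
+     * url.setPost(true);
+     * url.addPostParameter("q", "crawler4j");
+     * // fetchPage(url) now sends a POST with a UTF-8 form-encoded entity
+     * }</pre>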
+     *
+     * @param url the url to be fetched
+     * @return the HttpUriRequest for the given url
+     */
+    protected HttpUriRequest newHttpUriRequest(WebURL url) {
+        if (!url.isPost()) {
+            return this.newHttpUriRequest(url.getURL());
+        }
+        HttpPost req = new HttpPost(url.getURL());
+        if (url.getParamsPost() == null || url.getParamsPost().isEmpty()) {
+            return req;
+        }
+        List<BasicNameValuePair> pairs = url.getParamsPost().getAsList();
+        if (pairs != null && pairs.size() > 0) {
+            // Redundant check: the parameter list was already verified non-empty above.
+            req.setEntity(new UrlEncodedFormEntity(pairs, StandardCharsets.UTF_8));
+        }
+        return req;
+    }
+
     protected CrawlConfig getConfig() {
         return config;
     }
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java
index fe3d5b8a2..fff5c228b 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/DocIDServer.java
@@ -28,6 +28,7 @@
 import com.sleepycat.je.OperationStatus;
 
 import edu.uci.ics.crawler4j.crawler.CrawlConfig;
+import edu.uci.ics.crawler4j.url.WebURL;
 import edu.uci.ics.crawler4j.util.Util;
 
 /**
@@ -68,6 +69,7 @@ public DocIDServer(Environment env, CrawlConfig config) {
      * @param url the URL for which the docid is returned.
      * @return the docid of the url if it is seen before. Otherwise -1 is returned.
      */
+    @Deprecated
     public int getDocId(String url) {
         synchronized (mutex) {
             OperationStatus result = null;
@@ -93,6 +95,17 @@ public int getDocId(String url) {
         }
     }
 
+    /**
+     * Returns the docid of an already seen url.
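+     *
+     * <p>Note: the lookup key is {@link WebURL#encode()}, so a GET and a POST
+     * request for the same URL, or two POST requests with different parameters,
+     * are tracked under different doc ids.</p>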
+     *
+     * @param url the URL for which the docid is returned.
+     * @return the docid of the url if it is seen before. Otherwise -1 is returned.
+     */
+    public int getDocId(WebURL url) {
+        return getDocId(url.encode());
+    }
+
+    @Deprecated
     public int getNewDocID(String url) {
         synchronized (mutex) {
             try {
@@ -117,6 +130,11 @@ public int getNewDocID(String url) {
         }
     }
 
+    public int getNewDocID(WebURL url) {
+        return getNewDocID(url.encode());
+    }
+
+    @Deprecated
     public void addUrlAndDocId(String url, int docId) {
         synchronized (mutex) {
             if (docId <= lastDocID) {
@@ -139,10 +157,19 @@ public void addUrlAndDocId(String url, int docId) {
         }
     }
 
+    public void addUrlAndDocId(WebURL url) {
+        addUrlAndDocId(url.encode(), url.getDocid());
+    }
+
+    @Deprecated
     public boolean isSeenBefore(String url) {
         return getDocId(url) != -1;
     }
 
+    public boolean isSeenBefore(WebURL url) {
+        return isSeenBefore(url.encode());
+    }
+
     public final int getDocCount() {
         try {
             return (int) docIDsDB.count();
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java
index 1406da166..629861ba2 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/frontier/WebURLTupleBinding.java
@@ -17,6 +17,8 @@
 
 package edu.uci.ics.crawler4j.frontier;
 
+import org.apache.http.message.BasicNameValuePair;
+
 import com.sleepycat.bind.tuple.TupleBinding;
 import com.sleepycat.bind.tuple.TupleInput;
 import com.sleepycat.bind.tuple.TupleOutput;
@@ -38,6 +40,18 @@ public WebURL entryToObject(TupleInput input) {
         webURL.setDepth(input.readShort());
         webURL.setPriority(input.readByte());
         webURL.setAnchor(input.readString());
+        webURL.setPost(input.readBoolean());
+        if (webURL.isPost()) {
+            try {
+                while (true) {
+                    String name = input.readString();
+                    String value = input.readString();
+                    webURL.addPostParameter(name, value);
+                }
+            } catch (IndexOutOfBoundsException e) {
+                // Do nothing, no more parameters to fetch
+            }
+        }
         return webURL;
     }
 
@@ -50,5 +64,12 @@ public void objectToEntry(WebURL url, TupleOutput output) {
         output.writeShort(url.getDepth());
         output.writeByte(url.getPriority());
         output.writeString(url.getAnchor());
+        output.writeBoolean(url.isPost());
+        if (url.isPost() && url.getParamsPost() != null) {
+            for (BasicNameValuePair param : url.getParamsPost().getAsList()) {
+                output.writeString(param.getName());
+                output.writeString(param.getValue());
+            }
+        }
     }
 }
\ No newline at end of file
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/PostParameters.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/PostParameters.java
new file mode 100644
index 000000000..f005bb51d
--- /dev/null
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/PostParameters.java
@@ -0,0 +1,38 @@
+package edu.uci.ics.crawler4j.url;
+
+import java.util.List;
+
+import org.apache.http.message.BasicNameValuePair;
+
+/**
+ *
+ * @author Dario
+ *
+ */
+public interface PostParameters {
+
+    String encode();
+
+    boolean addParameter(String key, String value) throws IllegalArgumentException;
+
+    /**
+     * Removes a parameter from the list using its key.
+     * Implementations may throw an IllegalArgumentException if the key is null,
+     * or may simply return false.
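+     *
+     * <p>For instance, {@code params.removeParameter("q", 1)} removes only the
+     * first pair named {@code q} ({@code "q"} being a hypothetical key).</p>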
+     *
+     * @param key name of the pair to be removed
+     * @param maxOcurrences maximum number of occurrences to remove (HTTP accepts duplicate keys)
+     * @return true if the list changed, false otherwise
+     * @throws IllegalArgumentException if key is null or maxOcurrences < 1
+     */
+    boolean removeParameter(String key, int maxOcurrences) throws IllegalArgumentException;
+
+    boolean isEmpty();
+
+    /**
+     * Gets the parameters as a List of BasicNameValuePair.
+     *
+     * Implementations may return a copy that does not affect the internal list,
+     * but they should clearly state it.
+     *
+     * @return the parameters as a list of name / value pairs
+     */
+    List<BasicNameValuePair> getAsList();
+}
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/SimplePostParameters.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/SimplePostParameters.java
new file mode 100644
index 000000000..56bae2c3d
--- /dev/null
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/SimplePostParameters.java
@@ -0,0 +1,116 @@
+package edu.uci.ics.crawler4j.url;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.http.message.BasicNameValuePair;
+
+public class SimplePostParameters implements PostParameters {
+
+    public static final String PAIR_SEPARATOR = "``--``";
+
+    public static final String VALUE_SEPARATOR = "=";
+
+    private List<BasicNameValuePair> paramsPost;
+
+    public SimplePostParameters() {
+        this.paramsPost = new ArrayList<>();
+    }
+
+    public SimplePostParameters(List<BasicNameValuePair> paramsPost) {
+        this.paramsPost = new ArrayList<>();
+        if (paramsPost != null) {
+            this.paramsPost.addAll(paramsPost);
+        }
+    }
+
+    @Override
+    public String encode() {
+        return encodePostAttributes(paramsPost);
+    }
+
+    public boolean addParameter(BasicNameValuePair pair) throws IllegalArgumentException {
+        if (pair == null) {
+            throw new IllegalArgumentException("pair cannot be null");
+        }
+        if (pair.getName() == null || pair.getName().isEmpty()) {
+            throw new IllegalArgumentException("key cannot be null or empty");
+        }
+        return paramsPost.add(pair);
+    }
+
+    @Override
+    public boolean addParameter(String key, String value) throws IllegalArgumentException {
+        if (key == null || key.isEmpty()) {
+            throw new IllegalArgumentException("key cannot be null or empty");
+        }
+        return this.paramsPost.add(new BasicNameValuePair(key, value));
+    }
+
+    @Override
+    public boolean removeParameter(String key, int maxOcurrences) throws IllegalArgumentException {
+        if (key == null || key.isEmpty()) {
+            throw new IllegalArgumentException("key cannot be null or empty");
+        }
+        if (maxOcurrences < 1) {
+            throw new IllegalArgumentException("maxOcurrences must be a positive number");
+        }
+        Iterator<BasicNameValuePair> it = paramsPost.iterator();
+        boolean changes = false;
+        while (maxOcurrences > 0 && it.hasNext()) {
+            BasicNameValuePair curr = it.next();
+            if (key.equals(curr.getName())) {
+                it.remove();
+                maxOcurrences--;
+                changes = true;
+            }
+        }
+        return changes;
+    }
+
+    protected static String encodePostAttributes(List<BasicNameValuePair> postAttributes) {
+        if (postAttributes == null || postAttributes.isEmpty()) {
+            return "";
+        }
+        List<String> pares = new ArrayList<>();
+        for (BasicNameValuePair par : postAttributes) {
+            if (par == null) {
+                continue;
+            }
+            pares.add(par.getName() + VALUE_SEPARATOR + par.getValue());
+        }
+        return String.join(PAIR_SEPARATOR, pares);
+    }
+
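+    /**
+     * Decodes a string produced by {@link #encode()} back into a
+     * {@code SimplePostParameters} instance, or returns null for a null or empty
+     * input. For illustration, a round-trip sketch (hypothetical values):
+     *
+     * <pre>{@code
+     * SimplePostParameters params = new SimplePostParameters();
+     * params.addParameter("user", "alice");
+     * params.addParameter("lang", "en");
+     * String encoded = params.encode(); // "user=alice``--``lang=en"
+     * SimplePostParameters decoded = SimplePostParameters.decodePostAtributes(encoded);
+     * }</pre>
+     */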
+    public static SimplePostParameters decodePostAtributes(String encodedUrl) {
+        if (encodedUrl == null || encodedUrl.isEmpty()) {
+            return null;
+        }
+        List<BasicNameValuePair> list = new ArrayList<>();
+        for (String pair : encodedUrl.split(PAIR_SEPARATOR)) {
+            if (pair == null) {
+                continue;
+            }
+            String[] splitted = pair.split(VALUE_SEPARATOR, 2);
+            if (splitted.length > 1) {
+                list.add(new BasicNameValuePair(splitted[0], splitted[1]));
+            } else {
+                list.add(new BasicNameValuePair(splitted[0], ""));
+            }
+        }
+        return new SimplePostParameters(list);
+    }
+
+    @Override
+    public boolean isEmpty() {
+        return paramsPost.isEmpty();
+    }
+
+    @Override
+    public List<BasicNameValuePair> getAsList() {
+        return paramsPost;
+    }
+
+}
diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
index f7f0c1d93..d0ebecb92 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/url/WebURL.java
@@ -33,10 +33,12 @@ public class WebURL implements Serializable {
 
     private static final long serialVersionUID = 1L;
 
+    public static final String POST_SEPARATOR = "<<>>";
+
     @PrimaryKey
     private String url;
 
-    private int docid;
+    private int docid = -1;
     private int parentDocid;
     private String parentUrl;
     private short depth;
@@ -48,6 +50,49 @@ public class WebURL implements Serializable {
     private String tag;
     private Map<String, String> attributes;
     private TLDList tldList;
+    private boolean post;
+    private PostParameters paramsPost;
+
+    public PostParameters getParamsPost() {
+        return paramsPost;
+    }
+
+    public void setParamsPost(PostParameters paramsPost) {
+        this.paramsPost = paramsPost;
+    }
+
+    /**
+     * Adds a POST key / value pair. Subclasses may override this behaviour for optimization.
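+     *
+     * <p>For illustration only (hypothetical URL and parameter names):</p>
+     * <pre>{@code
+     * WebURL url = new WebURL();
+     * url.setURL("https://www.example.com/login");
+     * url.setPost(true);
+     * url.addPostParameter("user", "alice");
+     * url.addPostParameter("password", "secret");
+     * }</pre>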
+     *
+     * @param key the parameter name
+     * @param value the parameter value
+     * @return true if the pair was added to the parameter list
+     * @throws IllegalArgumentException if key is null
+     * @see PostParameters#addParameter(String, String)
+     */
+    public boolean addPostParameter(String key, String value) throws IllegalArgumentException {
+        if (paramsPost == null) {
+            paramsPost = createEmptyPostParams();
+        }
+        return paramsPost.addParameter(key, value);
+    }
+
+    /**
+     * Returns true if this WebURL represents a POST request.
+     *
+     * @return true for a POST request, false for a GET request
+     */
+    public boolean isPost() {
+        return post;
+    }
+
+    /**
+     * Configures the HTTP request type to be POST (true) or GET (false).
+     *
+     * @param post true to configure a POST request, false for a GET request.
+     */
+    public void setPost(boolean post) {
+        this.post = post;
+    }
 
     /**
      * Set the TLDList if you want {@linkplain #getDomain()} and
@@ -249,6 +294,10 @@ public String getAttribute(String name) {
         return attributes.getOrDefault(name, "");
     }
 
+    protected PostParameters createEmptyPostParams() {
+        return new SimplePostParameters();
+    }
+
     @Override
     public int hashCode() {
         return url.hashCode();
@@ -272,4 +321,94 @@ public boolean equals(Object o) {
     public String toString() {
         return url;
     }
+
+    /**
+     * Encodes the URL and the POST parameters in a string to store in the DocIDServer.
+     *
+     * This is what identifies this URL as already visited or new.
+     *
+     * @return the encoded representation of this WebURL
+     */
+    public String encode() {
+        return encodeWebURL(this);
+    }
+
+    /**
+     * Encodes the URL and the POST parameters in a string to store in the DocIDServer.
+     *
+     * This is what identifies this URL as already visited or new.
+     *
+     * @param url the WebURL to encode
+     * @return the encoded representation of the given WebURL
+     */
+    public static String encodeWebURL(WebURL url) {
+        if (url == null || url.getURL() == null) {
+            return null;
+        }
+        if (!url.isPost()) {
+            return url.getURL();
+        }
+        if (url.getParamsPost() != null) {
+            return url.getURL() + POST_SEPARATOR + url.getParamsPost().encode();
+        } else {
+            return url.getURL() + POST_SEPARATOR;
+        }
+    }
+
+    /**
+     * Converts an encoded String into an instance of WebURL.
+     *
+     * Strings encoded by subclasses of WebURL may not be compatible.
+     *
+     * @param url string with the URL and POST parameters included
+     * @return the {@link WebURL} that represents the string provided
+     */
+    public static WebURL decodeString(String url) {
+        if (url == null) {
+            return null;
+        }
+        WebURL result = new WebURL();
+        if (isPost(url)) {
+            result.setPost(true);
+            // Check if there's something useful after POST_SEPARATOR.
+            if (hasPostParams(url)) {
+                // There are valid parameters.
+                String[] splitted = url.split(POST_SEPARATOR, 2);
+                result.setURL(splitted[0]);
+                if (splitted.length > 1) {
+                    result.setParamsPost(SimplePostParameters.decodePostAtributes(splitted[1]));
+                }
+            } else {
+                result.setURL(url.replaceAll(POST_SEPARATOR, ""));
+            }
+        } else {
+            result.setURL(url);
+        }
+        return result;
+    }
+
+    public static boolean isPost(String encodedUrl) {
+        if (encodedUrl == null) {
+            return false;
+        }
+        return encodedUrl.contains(POST_SEPARATOR);
+    }
+
+    protected static boolean hasPostParams(String encodedUrl) {
+        // Check if the URL has POST parameters after the separator
+        if (encodedUrl == null) {
+            return false;
+        }
+        String[] parts = encodedUrl.split(POST_SEPARATOR);
+        if (parts.length > 1) {
+            for (int i = 1; i < parts.length; i++) {
+                if (parts[i] != null && !parts[i].isEmpty()) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
 }
\ No newline at end of file
diff --git a/crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/fetcher/PageFetcherHtmlOnly.java b/crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/fetcher/PageFetcherHtmlOnly.java
index 6a238adc5..6d4c9aa56 100644
--- a/crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/fetcher/PageFetcherHtmlOnly.java
+++ b/crawler4j/src/test/java/edu/uci/ics/crawler4j/tests/fetcher/PageFetcherHtmlOnly.java
@@ -43,7 +43,7 @@ public PageFetchResult fetchPage(WebURL webUrl)
         HttpResponse response = httpClient.execute(head);
         fetchResult.setEntity(response.getEntity());
         fetchResult.setResponseHeaders(response.getAllHeaders());
-        fetchResult.setFetchedUrl(toFetchURL);
+        fetchResult.setFetchedWebUrl(webUrl);
         fetchResult.setStatusCode(response.getStatusLine().getStatusCode());
         String contentType = response.containsHeader("Content-Type") ?