Began work on asynchronous crawling #157

Open · wants to merge 2 commits into master
17 changes: 17 additions & 0 deletions build.sbt
@@ -0,0 +1,17 @@
name := "crawler4j"
organization := "edu.uci.ics"
version := "5.0-SNAPSHOT"
publishMavenStyle := true
autoScalaLibrary := false
crossPaths := false

javacOptions ++= Seq("-source", "1.7", "-target", "1.7")

libraryDependencies += "org.slf4j" % "slf4j-api" % "1.7.10"
libraryDependencies += "org.apache.httpcomponents" % "httpasyncclient" % "4.1.2"
libraryDependencies += "org.apache.httpcomponents" % "httpclient" % "4.4"
libraryDependencies += "com.sleepycat" % "je" % "5.0.73"
libraryDependencies += "org.apache.tika" % "tika-parsers" % "1.5"
libraryDependencies += "uk.org.lidalia" % "lidalia-slf4j-ext" % "1.0.0"

libraryDependencies += "junit" % "junit" % "4.11" % "test"
9 changes: 8 additions & 1 deletion pom.xml
@@ -4,7 +4,7 @@
<artifactId>crawler4j</artifactId>
<packaging>jar</packaging>
<name>crawler4j</name>
<version>4.3-SNAPSHOT</version>
<version>5.0-SNAPSHOT</version>
<description>Open Source Web Crawler for Java</description>
<url>https://github.com/yasserg/crawler4j</url>
<licenses>
@@ -130,6 +130,13 @@
</dependency>

<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpasyncclient</artifactId>
<version>4.1.2</version>
<scope>compile</scope>
</dependency>

<dependency>
<groupId>com.sleepycat</groupId>
<artifactId>je</artifactId>
<version>5.0.73</version>
25 changes: 20 additions & 5 deletions src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java
@@ -17,16 +17,15 @@

package edu.uci.ics.crawler4j.crawler;

import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;
import org.apache.http.Header;
import org.apache.http.message.BasicHeader;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;

import org.apache.http.Header;
import org.apache.http.message.BasicHeader;

import edu.uci.ics.crawler4j.crawler.authentication.AuthInfo;

public class CrawlConfig {

/**
@@ -164,6 +163,11 @@ public class CrawlConfig {
*/
private List<AuthInfo> authInfos;

/**
* Whether SSL certificate verification should be skipped (insecure; intended for testing).
*/
private boolean skipSSLVerification = false;

/**
* Validates the configs specified by this instance.
*
@@ -518,4 +522,15 @@ public String toString() {
sb.append("Proxy password: " + getProxyPassword() + "\n");
return sb.toString();
}

public boolean getSkipSSLVerification() {
return skipSSLVerification;
}

/**
* Whether SSL certificate verification should be skipped.
*/
public void setSkipSSLVerification(boolean skipSSLVerification) {
this.skipSSLVerification = skipSSLVerification;
}
}
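For reference, a minimal sketch of how the new flag would be consumed, assuming the usual crawler4j setup (the storage folder value and the fetcher wiring are illustrative, not part of this diff):

CrawlConfig config = new CrawlConfig();
config.setCrawlStorageFolder("/tmp/crawl"); // illustrative path, not from this diff
config.setSkipSSLVerification(true);        // accept any certificate; testing only
PageAsyncFetcher fetcher = new PageAsyncFetcher(config);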
178 changes: 178 additions & 0 deletions src/main/java/edu/uci/ics/crawler4j/fetcher/PageAsyncFetcher.java
@@ -0,0 +1,178 @@
package edu.uci.ics.crawler4j.fetcher;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.exceptions.PageBiggerThanMaxSizeException;
import edu.uci.ics.crawler4j.url.WebURL;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.HttpUriRequest;
import org.apache.http.concurrent.FutureCallback;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.ssl.NoopHostnameVerifier;
import org.apache.http.conn.ssl.TrustStrategy;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.nio.client.CloseableHttpAsyncClient;
import org.apache.http.impl.nio.client.HttpAsyncClientBuilder;
import org.apache.http.impl.nio.conn.PoolingNHttpClientConnectionManager;
import org.apache.http.impl.nio.reactor.DefaultConnectingIOReactor;
import org.apache.http.impl.nio.reactor.IOReactorConfig;
import org.apache.http.nio.conn.NoopIOSessionStrategy;
import org.apache.http.nio.conn.SchemeIOSessionStrategy;
import org.apache.http.nio.conn.ssl.SSLIOSessionStrategy;
import org.apache.http.nio.reactor.IOReactorException;
import org.apache.http.ssl.SSLContexts;

import javax.net.ssl.SSLContext;
import java.io.IOException;
import java.security.KeyManagementException;
import java.security.KeyStoreException;
import java.security.NoSuchAlgorithmException;
import java.security.cert.X509Certificate;
import java.util.Date;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;

/**
* Asynchronous page fetcher built on Apache HttpAsyncClient's non-blocking I/O.
* Created by lost on 9/7/2016.
*/
public class PageAsyncFetcher extends PageFetcherBase {
protected final CloseableHttpAsyncClient httpClient;
protected final PoolingNHttpClientConnectionManager connectionManager;
private final AtomicLong nextCleanupTimeMs = new AtomicLong();

public PageAsyncFetcher(CrawlConfig config) {
super(config);

if ((config.getAuthInfos() != null) && !config.getAuthInfos().isEmpty()) {
throw new UnsupportedOperationException("Authentication is not implemented");
}

if (config.getPolitenessDelay() > 0) {
throw new UnsupportedOperationException("PolitenessDelay is not implemented");
}

RequestConfig requestConfig = getRequestConfig(config);

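// Per-scheme I/O session strategies: plain NIO sessions for http, TLS sessions
// for https (with verification optionally disabled by the new config flag).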
SchemeIOSessionStrategy sslSessionStrategy = config.getSkipSSLVerification()
? buildNoVerificationSSLSessionStrategy()
: SSLIOSessionStrategy.getSystemDefaultStrategy();
Registry<SchemeIOSessionStrategy> uriSchemeSessionRegistry = RegistryBuilder.<SchemeIOSessionStrategy>create()
.register("https", sslSessionStrategy)
.register("http", NoopIOSessionStrategy.INSTANCE)
.build();
try {
connectionManager = new PoolingNHttpClientConnectionManager(
new DefaultConnectingIOReactor(IOReactorConfig.DEFAULT), uriSchemeSessionRegistry);
} catch (IOReactorException e) {
throw new RuntimeException(e);
}
connectionManager.setMaxTotal(config.getMaxTotalConnections());
connectionManager.setDefaultMaxPerRoute(config.getMaxConnectionsPerHost());

HttpAsyncClientBuilder clientBuilder = HttpAsyncClientBuilder.create();
clientBuilder.setDefaultRequestConfig(requestConfig);
clientBuilder.setConnectionManager(connectionManager);
clientBuilder.setUserAgent(config.getUserAgentString());
clientBuilder.setDefaultHeaders(config.getDefaultHeaders());

if (config.getProxyHost() != null) {
if (config.getProxyUsername() != null) {
BasicCredentialsProvider credentialsProvider = getCredentialsProvider(config);
clientBuilder.setDefaultCredentialsProvider(credentialsProvider);
}

HttpHost proxy = new HttpHost(config.getProxyHost(), config.getProxyPort());
clientBuilder.setProxy(proxy);
logger.debug("Working through Proxy: {}", proxy.getHostName());
}

httpClient = clientBuilder.build();
nextCleanupTimeMs.set(new Date().getTime());
updateLastCleanup();
}

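// Claims the cleanup slot via compare-and-set: once the scheduled time has passed,
// exactly one concurrent caller wins and triggers a cleanup pass.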
private boolean updateLastCleanup() {
long now = new Date().getTime();
long scheduledCleanup = nextCleanupTimeMs.get();
if (scheduledCleanup < now) {
return nextCleanupTimeMs.compareAndSet(scheduledCleanup, now);
}

return false;
}

public void fetchPage(WebURL webUrl, final FutureCallback<PageFetchResult> callback)
throws InterruptedException, IOException, PageBiggerThanMaxSizeException {
if (webUrl == null)
throw new IllegalArgumentException("Must supply webUrl");
if (callback == null)
throw new IllegalArgumentException("Must supply callback");

cleanupIfNecessary();

// Getting URL, setting headers & content
final PageFetchResult fetchResult = new PageFetchResult();
final String toFetchURL = webUrl.getURL();
final HttpUriRequest request = newHttpUriRequest(toFetchURL);

boolean submitted = false;
try {
// TODO Applying Politeness delay
httpClient.execute(request, new FutureCallback<HttpResponse>() {
@Override
public void completed(HttpResponse response) {
try {
ParseResponse(fetchResult, toFetchURL, request, response);
} catch (Exception e) {
// Report the failure and stop; the callback must not also receive completed().
callback.failed(e);
return;
}
callback.completed(fetchResult);
}

@Override
public void failed(Exception e) {
callback.failed(e);
}

@Override
public void cancelled() {
callback.cancelled();
}
});
submitted = true;
} finally { // runs also when execute() throws
// Abort only if the request was never handed to the reactor; aborting after a
// successful submit would cancel the in-flight exchange before the callback fires.
if (!submitted) {
request.abort();
}
}
}

public void shutdown() throws IOException {
connectionManager.shutdown();
}

private void cleanupIfNecessary() {
if (!updateLastCleanup())
return;

cleanupConnections();
}

protected void cleanupConnections() {
connectionManager.closeExpiredConnections();
connectionManager.closeIdleConnections(30, TimeUnit.SECONDS);
}

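// Trust-all TLS strategy: accepts every certificate chain and skips hostname
// verification. Unsafe anywhere but controlled test environments.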
private static SSLIOSessionStrategy buildNoVerificationSSLSessionStrategy() {
try {
SSLContext sslContext = SSLContexts.custom().loadTrustMaterial(null, new TrustStrategy() {
@Override
public boolean isTrusted(final X509Certificate[] chain, String authType) {
return true;
}
}).build();
return new SSLIOSessionStrategy(sslContext, NoopHostnameVerifier.INSTANCE);
} catch (KeyStoreException | NoSuchAlgorithmException | KeyManagementException e) {
throw new RuntimeException(e);
}
}
}
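A minimal caller sketch for the async API, assuming PageFetchResult and WebURL keep their existing 4.x accessors (hypothetical usage, not part of this diff):

WebURL url = new WebURL();
url.setURL("https://example.com/");
fetcher.fetchPage(url, new FutureCallback<PageFetchResult>() {
    @Override
    public void completed(PageFetchResult result) {
        // Consume the status and body, then release the pooled connection.
        System.out.println(result.getFetchedUrl() + " -> " + result.getStatusCode());
        result.discardContentIfNotConsumed();
    }
    @Override
    public void failed(Exception ex) {
        ex.printStackTrace(); // network, protocol, or parse failure
    }
    @Override
    public void cancelled() {
    }
});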