Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Timeoutable regular expressions in RobotstxtServer #429

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package edu.uci.ics.crawler4j.crawler.exceptions;

/**
 * Unchecked exception raised when the evaluation of a regular expression exceeds
 * its time budget.
 *
 * <p>When created through
 * {@link #RegexpTimeoutException(String, String, long)} the exception also carries
 * the expression, the input and the timeout that was exceeded. The other
 * constructors leave those details unset ({@code null} / {@code 0}).
 */
public class RegexpTimeoutException extends RuntimeException {
    private static final long serialVersionUID = 6437153127902393756L;

    /** Regular expression that timed out, or {@code null} when not provided. */
    private final String regularExpression;

    /** Input the expression was applied to, or {@code null} when not provided. */
    private final String stringToMatch;

    /** Timeout in milliseconds that was exceeded, or {@code 0} when not provided. */
    private final long timeoutMillis;

    /**
     * Creates an exception describing a timeout for a concrete expression and input.
     *
     * @param regularExpression the regular expression being evaluated
     * @param stringToMatch the input the expression was applied to
     * @param timeoutMillis the timeout, in milliseconds, that was exceeded
     */
    public RegexpTimeoutException(String regularExpression, String stringToMatch, long timeoutMillis) {
        super(describe(regularExpression, stringToMatch, timeoutMillis));
        this.regularExpression = regularExpression;
        this.stringToMatch = stringToMatch;
        this.timeoutMillis = timeoutMillis;
    }

    /** Creates an exception without message, cause or timeout details. */
    public RegexpTimeoutException() {
        super();
        this.regularExpression = null;
        this.stringToMatch = null;
        this.timeoutMillis = 0;
    }

    /**
     * Creates an exception with a custom message and no timeout details.
     *
     * @param message the detail message
     */
    public RegexpTimeoutException(String message) {
        super(message);
        this.regularExpression = null;
        this.stringToMatch = null;
        this.timeoutMillis = 0;
    }

    /**
     * Creates an exception wrapping a cause, with no timeout details.
     *
     * @param cause the underlying cause
     */
    public RegexpTimeoutException(Throwable cause) {
        super(cause);
        this.regularExpression = null;
        this.stringToMatch = null;
        this.timeoutMillis = 0;
    }

    /**
     * Creates an exception with a custom message and cause, and no timeout details.
     *
     * @param message the detail message
     * @param cause the underlying cause
     */
    public RegexpTimeoutException(String message, Throwable cause) {
        super(message, cause);
        this.regularExpression = null;
        this.stringToMatch = null;
        this.timeoutMillis = 0;
    }

    /** Builds the detail message used by the three-argument constructor. */
    private static String describe(String regularExpression, String stringToMatch, long timeoutMillis) {
        StringBuilder text = new StringBuilder("Timeout occurred after ");
        text.append(timeoutMillis);
        text.append("ms while processing regular expression '");
        text.append(regularExpression);
        text.append("' on input '");
        text.append(stringToMatch);
        text.append("'!");
        return text.toString();
    }

    /** @return the regular expression that timed out, or {@code null} if unknown */
    public String getRegularExpression() {
        return this.regularExpression;
    }

    /** @return the input the expression was applied to, or {@code null} if unknown */
    public String getStringToMatch() {
        return this.stringToMatch;
    }

    /** @return the exceeded timeout in milliseconds, or {@code 0} if unknown */
    public long getTimeoutMillis() {
        return this.timeoutMillis;
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,25 @@ public class RobotstxtConfig {
*/
private int cacheSize = 500;

/**
 * The number of milliseconds before a regexp times out. -1 means no timeout.
 * Enabling a timeout reduces regexp performance.
 */
private long timeout = -1;

/**
 * If true, the system will consider a timed-out regexp to be a matching one.
 */
private boolean matchOnTimeout = false;

/**
 * In order to be able to time out inside a matcher, the system checks for a timeout
 * while reading the CharSequence. This parameter sets the number of characters that
 * will be read between timeout checks. Higher values mean less CPU overhead and
 * less accuracy on timeout. Default value is 30000000. Null means default.
 */
private Integer checkTimeoutInterval = null;

public boolean isEnabled() {
    // Plain accessor for the enabled flag of this configuration.
    return this.enabled;
}
Expand Down Expand Up @@ -77,4 +96,28 @@ public void setIgnoreUADiscrimination(boolean ignore) {
public boolean getIgnoreUADiscrimination() {
    // Plain accessor for the ignoreUADiscrimination flag of this configuration.
    return this.ignoreUADiscrimination;
}

/**
 * @return the configured regexp timeout in milliseconds
 */
public long getTimeout() {
    return this.timeout;
}

/**
 * @param timeout the regexp timeout in milliseconds
 */
public void setTimeout(final long timeout) {
    this.timeout = timeout;
}

/**
 * @return the configured matchOnTimeout flag
 */
public boolean isMatchOnTimeout() {
    return this.matchOnTimeout;
}

/**
 * @param matchOnTimeout the new value of the matchOnTimeout flag
 */
public void setMatchOnTimeout(final boolean matchOnTimeout) {
    this.matchOnTimeout = matchOnTimeout;
}

/**
 * @return the configured timeout check interval; may be {@code null}
 */
public Integer getCheckTimeoutInterval() {
    return this.checkTimeoutInterval;
}

/**
 * @param checkTimeoutInterval the timeout check interval to store; may be {@code null}
 */
public void setCheckTimeoutInterval(final Integer checkTimeoutInterval) {
    this.checkTimeoutInterval = checkTimeoutInterval;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,8 @@ public static HostDirectives parse(String content, RobotstxtConfig config) {
if (userAgents.isEmpty()) {
userAgents.add("*");
}
uaDirectives = new UserAgentDirectives(userAgents);
uaDirectives = new UserAgentDirectives(userAgents, config.getTimeout(),
config.isMatchOnTimeout(), config.getCheckTimeoutInterval());
}
uaDirectives.add(rule, value);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package edu.uci.ics.crawler4j.robotstxt;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.uci.ics.crawler4j.crawler.exceptions.RegexpTimeoutException;
import edu.uci.ics.crawler4j.util.RegularExpressionUtils;

/**
 * A {@link PathRule} whose matchers are created through
 * {@link RegularExpressionUtils#createMatcherWithTimeout}, so that matching can be
 * aborted with a {@link RegexpTimeoutException}.
 */
public class TimeoutablePathRule extends PathRule {
    protected static final Logger logger = LoggerFactory.getLogger(TimeoutablePathRule.class);

    /** Check interval used when the caller does not supply one. */
    public static final int defaultCheckInterval = 30000000;

    /** Timeout, in milliseconds, handed to the matcher factory. */
    private final long timeout;

    /** Result reported by {@link #matches(String)} when matching times out. */
    private final boolean matchOnTimeout;

    /** Check interval handed to the matcher factory. */
    private final int checkInterval;

    /**
     * Create a new path rule, based on the specified pattern
     *
     * @param type Either HostDirectives.ALLOWS or HostDirectives.DISALLOWS
     * @param pattern The pattern for this rule
     * @param timeout Timeout in milliseconds passed to the matcher factory
     * @param matchOnTimeout Value returned by {@link #matches(String)} on timeout
     * @param checkInterval Check interval passed to the matcher factory
     */
    public TimeoutablePathRule(int type, String pattern, long timeout, boolean matchOnTimeout, int checkInterval) {
        super(type, pattern);
        this.checkInterval = checkInterval;
        this.matchOnTimeout = matchOnTimeout;
        this.timeout = timeout;
    }

    /**
     * Create a new path rule, based on the specified pattern, using
     * {@link #defaultCheckInterval} as check interval
     *
     * @param type Either HostDirectives.ALLOWS or HostDirectives.DISALLOWS
     * @param pattern The pattern for this rule
     * @param timeout Timeout in milliseconds passed to the matcher factory
     * @param matchOnTimeout Value returned by {@link #matches(String)} on timeout
     */
    public TimeoutablePathRule(int type, String pattern, long timeout, boolean matchOnTimeout) {
        super(type, pattern);
        this.checkInterval = defaultCheckInterval;
        this.matchOnTimeout = matchOnTimeout;
        this.timeout = timeout;
    }

    /**
     * Check if the specified path matches a robots.txt pattern
     *
     * @param pattern The pattern to match
     * @param path The path to match with the pattern
     * @param timeout Timeout in milliseconds passed to the matcher factory
     * @param matchOnTimeout When true, a timeout is reported as a match instead of being thrown
     * @param checkInterval Check interval passed to the matcher factory
     * @return True when the pattern matches (or timed out with matchOnTimeout set), false if it does not
     * @throws RegexpTimeoutException if the regexp times out and matchOnTimeout is false
     */
    public static boolean matchesRobotsPattern(String pattern, String path, long timeout, boolean matchOnTimeout,
            int checkInterval) throws RegexpTimeoutException {
        try {
            return RegularExpressionUtils
                    .createMatcherWithTimeout(path, robotsPatternToRegexp(pattern), timeout, checkInterval)
                    .matches();
        } catch (RegexpTimeoutException timedOut) {
            if (!matchOnTimeout) {
                throw timedOut;
            }
            return true;
        }
    }

    /**
     * Same as {@link #matchesRobotsPattern(String, String, long, boolean, int)} with
     * {@link #defaultCheckInterval} as check interval
     *
     * @param pattern The pattern to match
     * @param path The path to match with the pattern
     * @param timeout Timeout in milliseconds passed to the matcher factory
     * @param matchOnTimeout When true, a timeout is reported as a match instead of being thrown
     * @return True when the pattern matches (or timed out with matchOnTimeout set), false if it does not
     */
    public static boolean matchesRobotsPattern(String pattern, String path, long timeout, boolean matchOnTimeout) {
        return matchesRobotsPattern(pattern, path, timeout, matchOnTimeout, defaultCheckInterval);
    }

    /**
     * Check if the specified path matches this rule. A timeout is logged as a
     * warning and answered with the configured matchOnTimeout value.
     *
     * @param path The path to match with this pattern
     * @return True when the path matches, false when it does not
     */
    public boolean matches(String path) {
        final boolean matched;
        try {
            matched = RegularExpressionUtils.createMatcherWithTimeout(path, pattern, timeout, checkInterval)
                    .matches();
        } catch (RegexpTimeoutException timedOut) {
            logger.warn(timedOut.toString());
            return matchOnTimeout;
        }
        return matched;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ public class UserAgentDirectives {
private Double crawlDelay = null;
private Set<PathRule> pathRules = new HashSet<>();

// Milliseconds before regular expressions time out.
private long timeout;
// If true, a timeout will mean a match.
private boolean matchOnTimeout;
// Number of characters read between timeout checks.
private int checkInterval;

/**
* Comparator used to order the list of matching path rules in such a way
* that the most specific match (= longest) match comes first.
Expand Down Expand Up @@ -78,6 +82,32 @@ public int compare(PathRule lhs, PathRule rhs) {
*/
public UserAgentDirectives(Set<String> userAgents) {
    // timeout -1, matchOnTimeout false; a null check interval selects
    // TimeoutablePathRule.defaultCheckInterval in the delegate constructor.
    this(userAgents, -1, false, null);
}

/**
 * Create a UserAgentDirectives clause that uses the default check interval
 *
 * @param userAgents The list user agents for this rule
 * @param timeout milliseconds before regular expressions timeout.
 * @param matchOnTimeout if true, a timeout will mean a match.
 */
public UserAgentDirectives(Set<String> userAgents, long timeout, boolean matchOnTimeout) {
    this.userAgents = userAgents;
    this.timeout = timeout;
    this.matchOnTimeout = matchOnTimeout;
    this.checkInterval = TimeoutablePathRule.defaultCheckInterval;
}

/**
 * Create a UserAgentDirectives clause
 *
 * @param userAgents The list user agents for this rule
 * @param timeout milliseconds before regular expressions timeout.
 * @param matchOnTimeout if true, a timeout will mean a match.
 * @param checkInterval number of characters read between timeout checks;
 *        {@code null} selects the default (30000000)
 */
public UserAgentDirectives(Set<String> userAgents, long timeout, boolean matchOnTimeout, Integer checkInterval) {
    this.userAgents = userAgents;
    this.timeout = timeout;
    this.matchOnTimeout = matchOnTimeout;
    this.checkInterval = (checkInterval != null) ? checkInterval : TimeoutablePathRule.defaultCheckInterval;
}

/**
Expand Down Expand Up @@ -195,9 +225,20 @@ public void add(String rule, String value) {
} else if (rule.equals("host")) {
this.preferredHost = value;
} else if (rule.equals("allow")) {
this.pathRules.add(new PathRule(HostDirectives.ALLOWED, value));
if (timeout < 0) {
this.pathRules.add(new PathRule(HostDirectives.ALLOWED, value));
} else {
this.pathRules.add(new TimeoutablePathRule(HostDirectives.ALLOWED, value, timeout, matchOnTimeout,
checkInterval));
}

} else if (rule.equals("disallow")) {
this.pathRules.add(new PathRule(HostDirectives.DISALLOWED, value));
if (timeout < 0) {
this.pathRules.add(new PathRule(HostDirectives.DISALLOWED, value));
} else {
this.pathRules.add(new TimeoutablePathRule(HostDirectives.DISALLOWED, value, timeout, matchOnTimeout,
checkInterval));
}
} else {
logger.error("Invalid key in robots.txt passed to UserAgentRules: {}", rule);
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
package edu.uci.ics.crawler4j.util;

import java.util.regex.Matcher;
import java.util.regex.Pattern;

import edu.uci.ics.crawler4j.crawler.exceptions.RegexpTimeoutException;

/**
 * Allows to create timeoutable regular expressions.
 *
 * <p>The input is wrapped in a {@link CharSequence} that looks at the clock while
 * the regex engine reads characters and throws a {@link RegexpTimeoutException}
 * once the deadline has passed.
 *
 * <p>Limitations: Can only throw RuntimeException. Decreases performance.
 *
 * <p>Posted by Kris on Stack Overflow.
 *
 * <p>Modified by dgoiko to execute the timeout check only every n chars.
 * Now {@code timeout < 0} means no timeout.
 *
 * @author Kris https://stackoverflow.com/a/910798/9465588
 *
 */
public class RegularExpressionUtils {

    // demonstrates behavior for regular expression running into catastrophic backtracking for given input
    public static void main(String[] args) {
        long millis = System.currentTimeMillis();
        // This checkInterval produces a < 500 ms delay. Higher checkInterval will produce higher delays on timeout.
        Matcher matcher = createMatcherWithTimeout(
                "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx", "(x+x+)+y", 2000, 30000000);
        try {
            System.out.println(matcher.matches());
        } catch (RuntimeException e) {
            System.out.println("Operation timed out after " + (System.currentTimeMillis() - millis) + " milliseconds");
        }
    }

    /**
     * Compiles the expression and creates a matcher that aborts after the given timeout.
     *
     * @param stringToMatch the input to match
     * @param regularExpression the regular expression source
     * @param timeoutMillis timeout in milliseconds; a negative value means no timeout
     * @param checkInterval number of character reads between two clock checks
     * @return a matcher over the (possibly wrapped) input
     */
    public static Matcher createMatcherWithTimeout(String stringToMatch, String regularExpression, long timeoutMillis,
            int checkInterval) {
        Pattern pattern = Pattern.compile(regularExpression);
        return createMatcherWithTimeout(stringToMatch, pattern, timeoutMillis, checkInterval);
    }

    /**
     * Creates a matcher that aborts with a {@link RegexpTimeoutException} after the given timeout.
     *
     * @param stringToMatch the input to match
     * @param regularExpressionPattern the compiled pattern
     * @param timeoutMillis timeout in milliseconds; a negative value means no timeout,
     *        in which case a plain matcher is returned
     * @param checkInterval number of character reads between two clock checks; values
     *        of zero or less make every character read check the clock
     * @return a matcher over the (possibly wrapped) input
     */
    public static Matcher createMatcherWithTimeout(String stringToMatch, Pattern regularExpressionPattern,
            long timeoutMillis, int checkInterval) {
        if (timeoutMillis < 0) {
            return regularExpressionPattern.matcher(stringToMatch);
        }
        CharSequence charSequence = new TimeoutRegexCharSequence(stringToMatch, timeoutMillis, stringToMatch,
                regularExpressionPattern.pattern(), checkInterval);
        return regularExpressionPattern.matcher(charSequence);
    }

    /**
     * CharSequence decorator that throws a {@link RegexpTimeoutException} from
     * {@link #charAt(int)} once its deadline has passed.
     */
    private static class TimeoutRegexCharSequence implements CharSequence {

        private final CharSequence inner;

        private final long timeoutMillis;

        /** Absolute deadline, in {@link System#currentTimeMillis()} terms. */
        private final long timeoutTime;

        private final String stringToMatch;

        private final String regularExpression;

        private final int checkInterval;

        /** Character reads performed since the last clock check. */
        private int attempts;

        /** Creates a wrapper whose deadline starts counting now. */
        TimeoutRegexCharSequence(CharSequence inner, long timeoutMillis, String stringToMatch,
                String regularExpression, int checkInterval) {
            this(inner, timeoutMillis, deadlineAfter(timeoutMillis), stringToMatch, regularExpression,
                    checkInterval);
        }

        /** Creates a wrapper bound to an already computed deadline. */
        private TimeoutRegexCharSequence(CharSequence inner, long timeoutMillis, long timeoutTime,
                String stringToMatch, String regularExpression, int checkInterval) {
            super();
            this.inner = inner;
            this.timeoutMillis = timeoutMillis;
            this.timeoutTime = timeoutTime;
            this.stringToMatch = stringToMatch;
            this.regularExpression = regularExpression;
            this.checkInterval = checkInterval;
            this.attempts = 0;
        }

        /** Computes "now + timeoutMillis", saturating instead of overflowing. */
        private static long deadlineAfter(long timeoutMillis) {
            long now = System.currentTimeMillis();
            long deadline = now + timeoutMillis;
            // The only caller passes a non-negative timeout, so a smaller result means the
            // addition overflowed; saturate rather than produce a deadline in the past.
            return deadline < now ? Long.MAX_VALUE : deadline;
        }

        @Override
        public char charAt(int index) {
            // ">=" rather than "==": with a negative checkInterval the counter would never
            // be equal to it and the timeout would silently never fire.
            if (this.attempts >= this.checkInterval) {
                if (System.currentTimeMillis() > timeoutTime) {
                    throw new RegexpTimeoutException(regularExpression, stringToMatch, timeoutMillis);
                }
                this.attempts = 0;
            } else {
                this.attempts++;
            }

            return inner.charAt(index);
        }

        @Override
        public int length() {
            return inner.length();
        }

        @Override
        public CharSequence subSequence(int start, int end) {
            // Share the original deadline: a sub-sequence must not get a fresh time budget.
            return new TimeoutRegexCharSequence(inner.subSequence(start, end), timeoutMillis, timeoutTime,
                    stringToMatch, regularExpression, checkInterval);
        }

        @Override
        public String toString() {
            return inner.toString();
        }
    }

}