Skip to content

Commit

Permalink
DelegatorProtocol to filter with regexps on URLs, fix #1110
Browse files Browse the repository at this point in the history
Signed-off-by: Julien Nioche <[email protected]>
  • Loading branch information
jnioche committed Oct 19, 2023
1 parent 0623bde commit e233c85
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 18 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,17 @@
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import org.apache.storm.Config;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.slf4j.LoggerFactory;

/**
* Protocol implementation that enables selection from a collection of sub-protocols using filters
* based on each call's metadata
* based on each call's metadata and URL.
*
* <p>Is configured like this
* <p>It is configured like this
*
* <pre>
* protocol.delegator.config:
Expand All @@ -43,19 +44,25 @@
* - className: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
* filters:
* robots.txt:
* - className: "com.digitalpebble.stormcrawler.protocol.okhttp.HttpProtocol"
* regex:
* - \.pdf
* - \.doc
* - className: "com.digitalpebble.stormcrawler.protocol.selenium.SeleniumProtocol"
* </pre>
*
* The last one in the list must not have filters as it is used as a default value. The protocols
* are tried for matches in the order in which they are listed in the configuration. The first to
* match gets used to fetch a URL.
* Typically, the last one in the list has neither filters nor regexes, so that it matches
* everything and acts as the default. The protocols are tried for matches in the order in which
* they are listed in the configuration. The first to match gets used to fetch a URL.
*
* <p>A filter without a value is valid; in that case we just test for the presence of the key.
*
* <p>A special value <i>robots.txt</i> can be used in the filtering rules to trigger on robots.txt
* files. This is automatically generated by the DelegatorProtocol, you don't need to add it to the
* metadata explicitly.
*
* <p>A regex is considered a hit if it is found anywhere in the URL; it does not have to match
* the entire URL. The operator logic applies to the regexes as well.
*
* @since 2.2
*/
public class DelegatorProtocol implements Protocol {
Expand Down Expand Up @@ -83,6 +90,8 @@ static class FilteredProtocol {
final List<Filter> filters = new ArrayList<>();
final String id;

final List<java.util.regex.Pattern> urlPatterns = new ArrayList<>();

enum Operator {
AND,
OR
Expand All @@ -99,15 +108,16 @@ Protocol getProtocolInstance() {
/** Filterless implementation * */
public FilteredProtocol(
@Nullable String id, @NotNull String protocolImpl, @NotNull Config config) {
this(id, protocolImpl, config, null, null);
this(id, protocolImpl, config, null, null, null);
}

public FilteredProtocol(
@Nullable String id,
@NotNull String protocolImpl,
@NotNull Config config,
@Nullable Map<String, String> filterImpls,
@Nullable String op) {
@Nullable String op,
@Nullable List<String> regexps) {

protoInstance =
InitialisationUtil.initializeFromQualifiedName(protocolImpl, Protocol.class);
Expand All @@ -123,15 +133,21 @@ public FilteredProtocol(
this.operator = Operator.valueOf(op);
}

// regular expressions
if (regexps != null) {
regexps.forEach(s -> urlPatterns.add(Pattern.compile(s)));
}

this.id = id;

// log filters found
LOG.info(
"Loaded {} filters for {}; id {}; operator {}",
"Loaded {} filters for {}; id {}; operator {}; regexp {}",
filters.size(),
protocolImpl,
id,
operator);
operator,
urlPatterns.size());
}

public ProtocolResponse getProtocolOutput(String url, Metadata metadata) throws Exception {
Expand All @@ -146,9 +162,9 @@ public void cleanup() {
protoInstance.cleanup();
}

boolean isMatch(final Metadata metadata) {
// if this FP has no filters - it can handle anything
if (filters.isEmpty()) return true;
boolean isMatch(final String url, final Metadata metadata) {
// if this FP has no filters nor regexps - it can handle anything
if (filters.isEmpty() && urlPatterns.isEmpty()) return true;

boolean atLeastOneMatch = false;

Expand Down Expand Up @@ -179,7 +195,18 @@ boolean isMatch(final Metadata metadata) {
else if (operator.equals(Operator.OR) && match) return true;
}

// if we get to this point and the operator is AND is means everything has
// same approach with the URLs
for (Pattern p : urlPatterns) {
boolean found = p.asPredicate().test(url);
if (found) {
atLeastOneMatch = true;
}
// optimisation
if (operator.equals(Operator.AND) && !found) return false;
else if (operator.equals(Operator.OR) && found) return true;
}

// if we get to this point and the operator is AND, it means everything has
// matched
// but if the operator is OR we need to check that something has matched at all

Expand Down Expand Up @@ -207,15 +234,21 @@ public void configure(@NotNull Config conf) {
final Object filters = subConf.get("filters");
final String operator = (String) subConf.get("operator");
final String id = (String) subConf.get("id");
final Object regexp = subConf.get("regex");

FilteredProtocol protocol;
if (filters == null) {
if (filters == null && regexp == null) {
protocol = new FilteredProtocol(id, className, conf);
} else {
// noinspection unchecked
protocol =
new FilteredProtocol(
id, className, conf, (Map<String, String>) filters, operator);
id,
className,
conf,
(Map<String, String>) filters,
operator,
(List<String>) regexp);
}
protocols.add(protocol);
}
Expand All @@ -238,7 +271,7 @@ public void configure(@NotNull Config conf) {
final FilteredProtocol getProtocolFor(String url, Metadata metadata) {

for (FilteredProtocol p : protocols) {
if (p.isMatch(metadata)) {
if (p.isMatch(url, metadata)) {
return p;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ public void getProtocolTest() throws FileNotFoundException {

pf = superProto.getProtocolFor("https://www.example-two.com", meta);

Assert.assertEquals(pf.id, "third");
// URLs
meta = new Metadata();

pf = superProto.getProtocolFor("https://www.example-two.com/large.pdf", meta);

Assert.assertEquals(pf.id, "fourth");

pf = superProto.getProtocolFor("https://www.example-two.com/large.doc", meta);

Assert.assertEquals(pf.id, "fourth");
}
}
6 changes: 6 additions & 0 deletions core/src/test/resources/delegator-conf.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,12 @@ config:
filters:
ping:
pong:
- className: "com.digitalpebble.stormcrawler.protocol.DummyProtocol"
id: "fourth"
operator: OR
regex:
- \.pdf
- \.doc
- className: "com.digitalpebble.stormcrawler.protocol.DummyProtocol"
id: "default"

0 comments on commit e233c85

Please sign in to comment.