diff --git a/.gitignore b/.gitignore index 79912b364..b624a1e92 100644 --- a/.gitignore +++ b/.gitignore @@ -7,8 +7,5 @@ javadocs/* logs/ # IntelliJ IDEA -**/.idea/dictionaries/ -**/.idea/dataSources.* -**/.idea/libraries/ -**/.idea/tasks.xml -**/.idea/workspace.xml +**/.idea +*.iml diff --git a/.idea/.name b/.idea/.name deleted file mode 100644 index 3dd7124b0..000000000 --- a/.idea/.name +++ /dev/null @@ -1 +0,0 @@ -crawler4j \ No newline at end of file diff --git a/pom.xml b/pom.xml index c41b8aa99..2798ff491 100644 --- a/pom.xml +++ b/pom.xml @@ -1,190 +1,188 @@ - - 4.0.0 - edu.uci.ics - crawler4j - jar - crawler4j - 4.3-SNAPSHOT - Open Source Web Crawler for Java - https://github.com/yasserg/crawler4j - - - The Apache Software License, Version 2.0 - http://www.apache.org/licenses/LICENSE-2.0.txt - repo - - - - https://github.com/yasserg/crawler4j - scm:git:git@github.com:yasserg/crawler4j.git - scm:git:git@github.com:yasserg/crawler4j.git - + + 4.0.0 + edu.uci.ics + crawler4j + jar + crawler4j + 4.3-SNAPSHOT + Open Source Web Crawler for Java + https://github.com/yasserg/crawler4j + + + The Apache Software License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + https://github.com/yasserg/crawler4j + scm:git:git@github.com:yasserg/crawler4j.git + scm:git:git@github.com:yasserg/crawler4j.git + - - org.sonatype.oss - oss-parent - 7 - + + org.sonatype.oss + oss-parent + 7 + - - - - org.apache.maven.plugins - maven-compiler-plugin - 3.2 - - 1.7 - 1.7 - - - - org.apache.maven.plugins - maven-jar-plugin - 2.5 - - - **/*.properties - - - - - - org.apache.maven.plugins - maven-source-plugin - 2.4 - - - attach-sources - - jar - - - - - - - org.apache.maven.plugins - maven-javadoc-plugin - 2.10.1 - - - attach-javadocs - - jar - - - - - - maven-assembly-plugin - 2.5.3 - - - jar-with-dependencies - - - - - make-fat-jar - package - - single - - - crawler4j-${project.version} - - - - - - org.apache.maven.plugins - maven-checkstyle-plugin - 2.17 - 
- - compile - compile - - checkstyle.xml - UTF-8 - true - true - true - - - check - - - - - - com.puppycrawl.tools - checkstyle - 7.1 - - - - - + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.2 + + 1.7 + 1.7 + + + + org.apache.maven.plugins + maven-jar-plugin + 2.5 + + + **/*.properties + + + + + + org.apache.maven.plugins + maven-source-plugin + 2.4 + + + attach-sources + + jar + + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.1 + + + attach-javadocs + + jar + + + + + + maven-assembly-plugin + 2.5.3 + + + jar-with-dependencies + + + + + make-fat-jar + package + + single + + + crawler4j-${project.version} + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + 2.17 + + + compile + compile + + checkstyle.xml + UTF-8 + true + true + true + + + check + + + + + + com.puppycrawl.tools + checkstyle + 7.1 + + + + + - + + + + org.slf4j + slf4j-api + 1.7.21 + + + + ch.qos.logback + logback-classic + 1.1.7 + runtime + + + + com.google.guava + guava + 19.0 + + + org.apache.httpcomponents + httpclient + 4.4 + compile + - + + com.sleepycat + je + 5.0.73 + - - - org.slf4j - slf4j-api - 1.7.21 - - - - ch.qos.logback - logback-classic - 1.1.7 - runtime - - - - com.google.guava - guava - 19.0 - - - org.apache.httpcomponents - httpclient - 4.4 - compile - + + org.apache.tika + tika-parsers + 1.5 + - - com.sleepycat - je - 5.0.73 - + + + junit + junit + 4.11 + test + + - - org.apache.tika - tika-parsers - 1.5 - - - - - junit - junit - 4.11 - test - - - - - - oracleReleases - Oracle Released Java Packages - http://download.oracle.com/maven - default - - + + + oracleReleases + Oracle Released Java Packages + http://download.oracle.com/maven + default + + diff --git a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java index 27ab1d3e4..cf2e68274 100644 --- a/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java +++ 
b/src/main/java/edu/uci/ics/crawler4j/crawler/CrawlConfig.java @@ -164,6 +164,14 @@ public class CrawlConfig { */ private List authInfos; + /** + * Possibility to filter out certain parts of the html but + * will still follow the links in between the filter tags. + * eg. tag name "crawlerfilter", then all the html between those tags will not be + * processed in the text. + */ + private String htmlFilterTag = null; + /** * Validates the configs specified by this instance. * @@ -502,6 +510,26 @@ public void setAuthInfos(List authInfos) { this.authInfos = authInfos; } + /** + * + * @return the HTML filter tag + */ + public String getHtmlFilterTag() { + return htmlFilterTag; + } + + /** + * Possibility to filter out certain parts of the html but + * will still follow the links in between the filter tags. + * eg. tag name "crawlerfilter", then all the html between those tags will not be + * processed in the text. + * + * @param htmlFilterTag String containing the html filter tag eg "crawler-filter" + */ + public void setHtmlFilterTag(String htmlFilterTag) { + this.htmlFilterTag = htmlFilterTag.toLowerCase(); + } + @Override public String toString() { StringBuilder sb = new StringBuilder(); @@ -523,6 +551,7 @@ public String toString() { sb.append("Proxy port: " + getProxyPort() + "\n"); sb.append("Proxy username: " + getProxyUsername() + "\n"); sb.append("Proxy password: " + getProxyPassword() + "\n"); + sb.append("HTML filter tags: " + getHtmlFilterTag() + "\n"); return sb.toString(); } } diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/HTMLSchema.java b/src/main/java/edu/uci/ics/crawler4j/parser/HTMLSchema.java new file mode 100644 index 000000000..c85a2c392 --- /dev/null +++ b/src/main/java/edu/uci/ics/crawler4j/parser/HTMLSchema.java @@ -0,0 +1,12 @@ +package edu.uci.ics.crawler4j.parser; + +public class HTMLSchema extends org.ccil.cowan.tagsoup.HTMLSchema { + + public HTMLSchema(String htmlFilterTag) { + super(); + if (htmlFilterTag != null && 
!htmlFilterTag.isEmpty()) { + elementType(htmlFilterTag, M_PCDATA | M_INLINE | M_BLOCK, M_BLOCK, 0); + parent(htmlFilterTag, "body"); + } + } +} diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java index 54cb5d74d..3e1726e10 100644 --- a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java +++ b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentHandler.java @@ -22,12 +22,16 @@ import java.util.List; import java.util.Map; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; public class HtmlContentHandler extends DefaultHandler { + protected static final Logger logger = LoggerFactory.getLogger(HtmlContentHandler.class); + private static final int MAX_ANCHOR_LENGTH = 100; private enum Element { @@ -71,11 +75,14 @@ public static Element getElement(String name) { private ExtractedUrlAnchorPair curUrl = null; private boolean anchorFlag = false; private final StringBuilder anchorText = new StringBuilder(); + private String htmlFilterTag = null; + private boolean isWithinFilteredHtml = false; - public HtmlContentHandler() { + public HtmlContentHandler(String htmlFilterTag) { isWithinBodyElement = false; bodyText = new StringBuilder(); outgoingUrls = new ArrayList<>(); + this.htmlFilterTag = htmlFilterTag; } @Override @@ -83,6 +90,10 @@ public void startElement(String uri, String localName, String qName, Attributes throws SAXException { Element element = HtmlFactory.getElement(localName); + if (htmlFilterTag != null && localName.equals(htmlFilterTag)) { + isWithinFilteredHtml = true; + } + if ((element == Element.A) || (element == Element.AREA) || (element == Element.LINK)) { String href = attributes.getValue("href"); if (href != null) { @@ -151,6 +162,10 @@ private void addToOutgoingUrls(String href, String tag) { @Override public void 
endElement(String uri, String localName, String qName) throws SAXException { Element element = HtmlFactory.getElement(localName); + if (htmlFilterTag != null && localName.equals(htmlFilterTag)) { + isWithinFilteredHtml = false; + } + if ((element == Element.A) || (element == Element.AREA) || (element == Element.LINK)) { anchorFlag = false; if (curUrl != null) { @@ -173,7 +188,7 @@ public void endElement(String uri, String localName, String qName) throws SAXExc @Override public void characters(char[] ch, int start, int length) throws SAXException { - if (isWithinBodyElement) { + if (isWithinBodyElement && !isWithinFilteredHtml) { if (bodyText.length() > 0) { bodyText.append(' '); } diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentMapper.java b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentMapper.java new file mode 100644 index 000000000..40a4d009e --- /dev/null +++ b/src/main/java/edu/uci/ics/crawler4j/parser/HtmlContentMapper.java @@ -0,0 +1,25 @@ +package edu.uci.ics.crawler4j.parser; + +import java.util.LinkedHashMap; +import java.util.Map; + +import org.apache.tika.parser.html.DefaultHtmlMapper; + +public class HtmlContentMapper extends DefaultHtmlMapper { + private static Map customSafeElements = new LinkedHashMap<>(); + + public HtmlContentMapper(String htmlFilterTag) { + if (htmlFilterTag != null) { + customSafeElements.put(htmlFilterTag.toUpperCase(), htmlFilterTag.toLowerCase()); + } + } + + @Override + public String mapSafeElement(String name) { + String mapSafeElement = super.mapSafeElement(name); + if (customSafeElements.size() > 0 && mapSafeElement == null) { + mapSafeElement = customSafeElements.get(name); + } + return mapSafeElement; + } +} diff --git a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java index fbc065bde..8a9842656 100644 --- a/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java +++ b/src/main/java/edu/uci/ics/crawler4j/parser/Parser.java @@ -27,7 
+27,9 @@ import org.apache.tika.metadata.DublinCore; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.html.HtmlMapper; import org.apache.tika.parser.html.HtmlParser; +import org.ccil.cowan.tagsoup.Schema; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -91,8 +93,11 @@ public void parse(Page page, String contextURL) } } else { // isHTML Metadata metadata = new Metadata(); - HtmlContentHandler contentHandler = new HtmlContentHandler(); + HtmlContentHandler contentHandler = new HtmlContentHandler(config.getHtmlFilterTag()); try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) { + parseContext.set(Schema.class, new HTMLSchema(config.getHtmlFilterTag())); + parseContext.set(HtmlMapper.class, + new HtmlContentMapper(config.getHtmlFilterTag())); htmlParser.parse(inputStream, contentHandler, metadata, parseContext); } catch (Exception e) { logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL()); diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawlController.java b/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawlController.java index 7ffb34747..d3ecffca1 100644 --- a/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawlController.java +++ b/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawlController.java @@ -98,6 +98,17 @@ public static void main(String[] args) throws Exception { */ config.setResumableCrawling(false); + /* + * This config parameter is used to filter out certain parts of the text. + * When text / links are between the <crawlerfilter> ... </crawlerfilter> tags, + * the text will not be added to the Text. + * This will not affect the list of links, they will still be processed. + * Use case: to exclude the content of the navigation or footer but still + * want to follow the links. 
+ * This will not work on the given seeds but is to give an idea for future projects + */ + config.setHtmlFilterTag("crawlerfilter"); + /* * Instantiate the controller for this crawl. */ diff --git a/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java b/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java index a49c1f353..eb7e34e20 100644 --- a/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java +++ b/src/test/java/edu/uci/ics/crawler4j/examples/basic/BasicCrawler.java @@ -79,8 +79,13 @@ public void visit(Page page) { Set links = htmlParseData.getOutgoingUrls(); logger.debug("Text length: {}", text.length()); + logger.debug("Text : {}", text); logger.debug("Html length: {}", html.length()); + logger.debug("Html : {}", html); logger.debug("Number of outgoing links: {}", links.size()); + for (WebURL link : links) { + logger.debug("Link : {}", link.getURL()); + } } Header[] responseHeaders = page.getFetchResponseHeaders(); diff --git a/src/test/java/edu/uci/ics/crawler4j/tests/HtmlContentHandlerTest.java b/src/test/java/edu/uci/ics/crawler4j/tests/HtmlContentHandlerTest.java index 3c2a41827..34b6bd6a8 100644 --- a/src/test/java/edu/uci/ics/crawler4j/tests/HtmlContentHandlerTest.java +++ b/src/test/java/edu/uci/ics/crawler4j/tests/HtmlContentHandlerTest.java @@ -6,39 +6,46 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.html.HtmlMapper; import org.apache.tika.parser.html.HtmlParser; +import org.ccil.cowan.tagsoup.Schema; import org.junit.Test; +import edu.uci.ics.crawler4j.parser.HTMLSchema; import edu.uci.ics.crawler4j.parser.HtmlContentHandler; +import edu.uci.ics.crawler4j.parser.HtmlContentMapper; public class HtmlContentHandlerTest { private HtmlParser parser = new HtmlParser(); private ParseContext parseContext = new ParseContext(); - private HtmlContentHandler parseHtml(String html) throws Exception { + private HtmlContentHandler 
parseHtml(String html, String htmlFilterTag) throws Exception { ByteArrayInputStream bais = new ByteArrayInputStream(html.getBytes()); Metadata metadata = new Metadata(); - HtmlContentHandler contentHandler = new HtmlContentHandler(); + parseContext.set(Schema.class, new HTMLSchema(htmlFilterTag)); + parseContext.set(HtmlMapper.class, new HtmlContentMapper(htmlFilterTag)); + HtmlContentHandler contentHandler = new HtmlContentHandler(htmlFilterTag); parser.parse(bais, contentHandler, metadata, parseContext); return contentHandler; } @Test public void testEmpty() throws Exception { - HtmlContentHandler parse = parseHtml(""); + HtmlContentHandler parse = parseHtml("", null); assertEquals("", parse.getBodyText()); } @Test public void testParaInBody() throws Exception { - HtmlContentHandler parse = parseHtml("

Hello there

"); + HtmlContentHandler parse = parseHtml("

Hello there

", null); assertEquals("Hello there", parse.getBodyText()); } @Test public void test2ParaInBody() throws Exception { - HtmlContentHandler parse = parseHtml("

Hello there

mr

"); + HtmlContentHandler parse = + parseHtml("

Hello there

mr

", null); assertEquals("Hello there mr", parse.getBodyText()); } @@ -46,8 +53,16 @@ public void test2ParaInBody() throws Exception { public void testTableInBody() throws Exception { HtmlContentHandler parse = parseHtml( "" + - ""); + "", null); assertEquals("Hello there mr bear", parse.getBodyText()); } + @Test + public void testFilterHtmlTagInBody() throws Exception { + HtmlContentHandler parse = parseHtml( + "

Hello there

should not be in ", + "crawlfilter"); + assertEquals("Hello there", parse.getBodyText()); + } + } diff --git a/src/test/resources/log4j.xml b/src/test/resources/log4j.xml new file mode 100755 index 000000000..a4ece93a4 --- /dev/null +++ b/src/test/resources/log4j.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + + + + \ No newline at end of file
Hellothere
mrbear
mrbear