From 1f6e3a6449abb4bc2dab14e65c5c36bece7deeaf Mon Sep 17 00:00:00 2001 From: cnsgithub Date: Fri, 29 Mar 2019 09:22:32 +0100 Subject: [PATCH 1/2] closes #399: on-the-fly calculation of checksum --- .../edu/uci/ics/crawler4j/crawler/Page.java | 26 ++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java index 1c911a816..b29684ad8 100644 --- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java +++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java @@ -20,7 +20,11 @@ import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; +import java.security.DigestInputStream; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import org.apache.commons.codec.digest.DigestUtils; import org.apache.http.Header; import org.apache.http.HttpEntity; import org.apache.http.entity.ContentType; @@ -65,6 +69,11 @@ public class Page { */ protected byte[] contentData; + /** + * The checksum of this page's content. + */ + protected String contentChecksum; + /** * The ContentType of this page. * For example: "text/html; charset=UTF-8" @@ -121,7 +130,13 @@ protected byte[] toByteArray(HttpEntity entity, int maxBytes) throws IOException if (entity == null) { return new byte[0]; } - try (InputStream is = entity.getContent()) { + MessageDigest md; + try { + md = MessageDigest.getInstance("MD5"); + } catch (NoSuchAlgorithmException e) { + throw new RuntimeException("Cannot calculate checksum", e); + } + try (InputStream is = new DigestInputStream(entity.getContent(), md)) { int size = (int) entity.getContentLength(); int readBufferLength = size; @@ -150,6 +165,8 @@ protected byte[] toByteArray(HttpEntity entity, int maxBytes) throws IOException } } return buffer.toByteArray(); + } finally { + contentChecksum = DigestUtils.md5Hex(md.digest()); } } @@ -256,6 +273,13 @@ public void setContentData(byte[] contentData) { this.contentData = contentData; } + /** + * @return checksum of this page's content. + */ + public String getContentChecksum() { + return contentChecksum; + } + /** * @return ContentType of this page. * For example: "text/html; charset=UTF-8" From 1f6ce2c342c47cdc3644188005982aba6c28b42e Mon Sep 17 00:00:00 2001 From: cnsgithub Date: Fri, 29 Mar 2019 09:28:03 +0100 Subject: [PATCH 2/2] checkstyle violations --- crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java index b29684ad8..694c4add6 100644 --- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java +++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java @@ -279,7 +279,7 @@ public void setContentData(byte[] contentData) { public String getContentChecksum() { return contentChecksum; } - + /** * @return ContentType of this page. * For example: "text/html; charset=UTF-8"