diff --git a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java
index 1c911a816..694c4add6 100644
--- a/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java
+++ b/crawler4j/src/main/java/edu/uci/ics/crawler4j/crawler/Page.java
@@ -20,7 +20,11 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.Charset;
+import java.security.DigestInputStream;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 
+import org.apache.commons.codec.binary.Hex;
 import org.apache.http.Header;
 import org.apache.http.HttpEntity;
 import org.apache.http.entity.ContentType;
@@ -65,6 +69,11 @@ public class Page {
      */
     protected byte[] contentData;
 
+    /**
+     * The hex-encoded MD5 checksum of the content bytes read by toByteArray.
+     */
+    protected String contentChecksum;
+
     /**
      * The ContentType of this page.
      * For example: "text/html; charset=UTF-8"
@@ -121,7 +130,13 @@ protected byte[] toByteArray(HttpEntity entity, int maxBytes) throws IOException
         if (entity == null) {
             return new byte[0];
         }
-        try (InputStream is = entity.getContent()) {
+        MessageDigest md;
+        try {
+            md = MessageDigest.getInstance("MD5");
+        } catch (NoSuchAlgorithmException e) {
+            throw new RuntimeException("Cannot calculate checksum", e);
+        }
+        try (InputStream is = new DigestInputStream(entity.getContent(), md)) {
             int size = (int) entity.getContentLength();
             int readBufferLength = size;
 
@@ -150,6 +165,9 @@ protected byte[] toByteArray(HttpEntity entity, int maxBytes) throws IOException
                 }
             }
             return buffer.toByteArray();
+        } finally {
+            // md.digest() already is the MD5 of the bytes read; hex-encode it, do not hash it again.
+            contentChecksum = Hex.encodeHexString(md.digest());
         }
     }
 
@@ -256,6 +274,13 @@ public void setContentData(byte[] contentData) {
         this.contentData = contentData;
     }
 
+    /**
+     * @return hex-encoded MD5 checksum of the content bytes read by toByteArray.
+     */
+    public String getContentChecksum() {
+        return contentChecksum;
+    }
+
     /**
      * @return ContentType of this page.
      * For example: "text/html; charset=UTF-8"