diff --git a/.hgignore b/.hgignore
deleted file mode 100644
index c72a731..0000000
--- a/.hgignore
+++ /dev/null
@@ -1,17 +0,0 @@
-syntax: glob
-*.swp
-*.log
-*.pyc
-*.pyo
-*.warc
-*.gz
-login.txt
-.DS_Store
-build/*
-dist/*
-hanzo_warc_tools.egg-info/*
-*~
-*.orig
-debian/*
-*.deb
-test-reports/*
diff --git a/.hgtags b/.hgtags
deleted file mode 100644
index 26ce816..0000000
--- a/.hgtags
+++ /dev/null
@@ -1,9 +0,0 @@
-58d7d99406b04e7c36bfba1c91e2b06f558c22ee hanzo-4.0-rc0
-764a52f90a951a8c4acc9c9f60f5d8321662d418 hanzo-4.0-rc1
-94b65646332e5e86f3d274f66e38ce26cc30ccad hanzo-4.0
-092e8d0615ecc5ace8b067edbeacd5e3b12c9be0 hanzo-4.1-rc0
-8f64ab5556344065cd68e0cf8265af87e6b9d0cf hanzo-4.1-rc1
-8ceff9fcde584ec577048dbd9a13743d31dfc74f hanzo-4.1-rc2
-f54be58d0d8b3aa47b3f935a732a7b5752f0e92e hanzo-4.1-rc4
-0a1d728557b8d29b15b3796f83b6a9dc7f25abff build_success-2012-09-14T15-24-42.616660024
-741fe327f233f936cd65c6e2c415cd01f9fc9871 build_success-2012-09-14T16-25-56.483325901
diff --git a/.travis.yml b/.travis.yml
index 86d04d3..40ba390 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,14 +6,10 @@ python:
- 3.3
- 3.4
- 3.5
+ - 3.5-dev
- nightly
- pypy
- pypy3
-matrix:
- allow_failures:
- - python: 3.5
- - python: nightly
-
script: python setup.py test
diff --git a/README b/README
deleted file mode 100644
index c51c46e..0000000
--- a/README
+++ /dev/null
@@ -1,76 +0,0 @@
-dependencies
- setuptools
- unittest2
- python 2.6
-
-hanzo warc tools:
-
- warcvalid.py
- returns 0 if the arguments are all valid arc/warc files
- non zero on error
-
- warcdump.py - writes human readable summary of warcfiles:
- usage: python warcdump.py foo.warc foo.warc.gz
- autodetects input format when filenames are passed
- i.e recordgzip vs plaintext, warc vs arc
-
- assumes uncompressed warc on stdin if no args
-
- warcfilter.py
- python warcfilter.py pattern file file file
- searches all headers for regex pattern
- use -i to invert search
- use -U to constrain to url
- use -T to constrain to record type
- use -C to constrain to content-type
-
- autodetects and stdin like warcdump
-
- prints out a warc format by default.
-
- warc2warc.py:
- python warc2warc
-
- autodetects compression on file
- args, assumes uncompressed stdin if none
-
- use -Z to write compressed output
-
- i.e warc2warc -Z input > input.gz
-
- should ignore buggy records in input
-
- arc2warc.py
- creates a crappy warc file from arc files on input
- a handful of headers are preserved
- use -Z to write compressed output
- i.e arc2warc -Z input.arc > input.warc.gz
-
- warcindex.py
- spits out an index like this:
-#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length
-warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=request 193
-warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=response 3279474
- not great, but a start
-
-notes:
-
- arc2warc uses the conversion rules from the earlier arc2warc.c
- as a starter for converting the headers
-
- I haven't profiled the code yet (and don't plan to until it falls over)
-
- warcvalid barely skirts some of the iso standard:
- missing things: strict whitespace, required headers check...
- mime quoted printable header encoding
- treating headers as utf8
-
-things left to do (in no order):
- lots more testing.
- supporting pre 1.0 warc files
- add more documentation
- support more commandline options for output and filenames
- s3 urls
-
-
--- tef thomas.figg@hanzoarchives.com
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..7740805
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,87 @@
+.. image:: https://travis-ci.org/internetarchive/warctools.svg?branch=master
+ :target: https://travis-ci.org/internetarchive/warctools
+
+warctools
+=========
+
+dependencies
+
+- setuptools
+- unittest2
+- python 2.7, 3.2+
+
+hanzo warc tools:
+
+warcvalid.py
+ returns 0 if the arguments are all valid arc/warc files
+    non-zero on error
+
+warcdump.py - writes a human-readable summary of warcfiles:
+ usage: ``python warcdump.py foo.warc foo.warc.gz``
+
+ autodetects input format when filenames are passed,
+    i.e. recordgzip vs plaintext, warc vs arc
+
+ assumes uncompressed warc on stdin if no args
+
+warcfilter.py
+ ``python warcfilter.py pattern file file file`` --
+ searches all headers for regex pattern
+
+ - use -i to invert search
+ - use -U to constrain to url
+ - use -T to constrain to record type
+ - use -C to constrain to content-type
+
+    autodetects formats and reads stdin like warcdump
+
+    prints matching records in warc format by default.
+
+warc2warc.py:
+    ``python warc2warc``
+
+    autodetects compression on file args,
+    assumes uncompressed stdin if none
+
+ use -Z to write compressed output
+
+    i.e. ``warc2warc -Z input > input.gz``
+
+ should ignore buggy records in input
+
+arc2warc.py
+ creates a crappy warc file from arc files on input
+ a handful of headers are preserved
+ use -Z to write compressed output,
+ i.e ``arc2warc -Z input.arc > input.warc.gz``
+
+warcindex.py
+ spits out an index like this::
+
+ #WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length
+ warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=request 193
+ warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=response 3279474
+
+ not great, but a start
+
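+warctools can also be used as a library; the tools above are thin
+wrappers around it. A minimal sketch of reading records (the filename
+here is hypothetical; ``open_archive`` and ``read_records`` are the
+calls the bundled tools themselves use)::
+
+    from hanzo.warctools import WarcRecord
+
+    fh = WarcRecord.open_archive('example.warc.gz', gzip='auto')
+    for offset, record, errors in fh.read_records(limit=None):
+        if record:
+            print(record.type, record.url)
+    fh.close()
+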
+notes:
+ arc2warc uses the conversion rules from the earlier arc2warc.c
+ as a starter for converting the headers
+
+ I haven't profiled the code yet (and don't plan to until it falls over)
+
+    warcvalid barely skirts some of the ISO standard:
+    missing things: strict whitespace, required-headers check,
+    MIME quoted-printable header encoding,
+    treating headers as UTF-8
+
+things left to do (in no order):
+
+- lots more testing.
+- supporting pre 1.0 warc files
+- add more documentation
+- support more commandline options for output and filenames
+- s3 urls
+
+
+-- tef thomas.figg@hanzoarchives.com
diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py
index 545b59c..ec69b19 100644
--- a/hanzo/warctools/arc.py
+++ b/hanzo/warctools/arc.py
@@ -68,7 +68,6 @@ def rx(pat):
nl_rx = rx('^\r\n|\r|\n$')
length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') #pylint: disable-msg=E1101
type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$') #pylint: disable-msg=E1101
-SPLIT = re.compile(br'\b\s|\s\b').split
class ArcParser(ArchiveParser):
"""A parser for arc archives."""
@@ -115,16 +114,21 @@ def parse(self, stream, offset, line=None):
# configure parser instance
self.version = arc_version.split()[0]
self.headers = arc_names_line.strip().split()
-
+
+ # raj: some v1 ARC files are incorrectly sending a v2 header names line
+        if arc_names_line == b'URL IP-address Archive-date Content-type Result-code Checksum Location Offset Filepath Archive-length\n':
+            if arc_version == b'1 0 InternetArchive' and 5 == len(line.split(b' ')):
+                self.headers = [b'URL', b'IP-address', b'Archive-date', b'Content-type', b'Archive-length']
+
# now we have read header field in record body
# we can extract the headers from the current record,
# and read the length field
# which is in a different place with v1 and v2
-
- # read headers
+
+ # read headers
arc_headers = self.parse_header_list(line)
-
+
# extract content, ignoring header lines parsed already
content_type, content_length, errors = \
self.get_content_headers(arc_headers)
@@ -139,7 +143,11 @@ def parse(self, stream, offset, line=None):
raw_headers=raw_headers)
else:
if not self.headers:
- raise Exception('missing filedesc')
+ #raj: some arc files are missing the filedesc:// line
+ #raise Exception('missing filedesc')
+                self.version = b'1'
+                self.headers = [b'URL', b'IP-address', b'Archive-date', b'Content-type', b'Archive-length']
+
headers = self.parse_header_list(line)
content_type, content_length, errors = \
self.get_content_headers(headers)
@@ -157,21 +165,32 @@ def trim(self, stream):
return ()
def parse_header_list(self, line):
- # some people use ' ' as the empty value. lovely.
- line = line.rstrip(b'\r\n')
- values = SPLIT(line)
- if len(self.headers) != len(values):
- if self.headers[0] in (ArcRecord.URL, ArcRecord.CONTENT_TYPE):
- # fencepost
- values = [s[::-1] for s in reversed(SPLIT(line[::-1], len(self.headers)-1))]
- else:
- values = SPLIT(line, len(self.headers)-1)
-
- if len(self.headers) != len(values):
- raise Exception('missing headers %s %s'%(",".join(values), ",".join(self.headers)))
-
- return list(zip(self.headers, values))
-
+ values = line.strip().split(b' ')
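+        # Accepted header-line shapes (space-separated byte values), e.g.:
+        #   4 fields: url ip date length                  (alexa, no content-type)
+        #   5 fields: url ip date content-type length     (the normal case)
+        #   6 fields: url ip date content-type; charset length
+        # urls with unescaped spaces (>5 fields) are re-joined first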
+ num_values = len(values)
+
+ #raj: some headers contain urls with unescaped spaces
+ if num_values > 5:
+            if re.match(br'^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]) and re.match(br'^\d{14}$', values[-3]) and re.match(br'^\d+$', values[-1]):
+ values = [b'%20'.join(values[0:-4]), values[-4], values[-3], values[-2], values[-1]]
+ num_values = len(values)
+
+ if 4 == num_values:
+ #raj: alexa arc files don't always have content-type in header
+ return list(zip(self.short_headers, values))
+ elif 5 == num_values:
+ #normal case
+ #raj: some old alexa arcs have ip-address and date transposed in the header
+            if re.match(br'^\d{14}$', values[1]) and re.match(br'^(?:\d{1,3}\.){3}\d{1,3}$', values[2]):
+ values[1], values[2] = values[2], values[1]
+
+ return list(zip(self.headers, values))
+ elif 6 == num_values:
+ #raj: some old alexa arcs have "content-type; charset" in the header
+ v = values[0:4]+values[5:]
+            v[3] = v[3].rstrip(b';')
+ return list(zip(self.headers, v))
+ else:
+ raise Exception('invalid number of header fields')
@staticmethod
def get_content_headers(headers):
@@ -195,3 +214,8 @@ def get_content_headers(headers):
register_record_type(re.compile(br'^filedesc://'), ArcRecord)
+
+#raj: some arc files are missing the filedesc:// line
+url_record_regex = re.compile(br'^https?://\S+ (?:\d{1,3}\.){3}\d{1,3} \d{14} \S+ \d+$')
+register_record_type(url_record_regex, ArcRecord)
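+# e.g. it matches a bare record line like
+# b'http://example.com/ 207.241.224.2 20010101000000 text/html 1234'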
+
diff --git a/hanzo/warctools/gz.py b/hanzo/warctools/gz.py
new file mode 100644
index 0000000..5a62a98
--- /dev/null
+++ b/hanzo/warctools/gz.py
@@ -0,0 +1,199 @@
+import struct
+import sys
+import os
+import zlib
+import io
+import gzip
+
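+# Reads a multi-member gzip file (e.g. a .warc.gz with one member per
+# record) and yields a file-like reader per member, along with the
+# compressed byte offset where that member starts. A usage sketch (the
+# filename is hypothetical):
+#
+#   with open('example.warc.gz', 'rb') as f:
+#       for member in MultiMemberGzipReader(f):
+#           print(member.member_offset, len(member.read()))
+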
+class MultiMemberGzipReader(object):
+ class InputBuffer(object):
+ MIN_CHUNK_SIZE = 1024
+ READ_SIZE = 8192
+
+ def __init__(self, fileobj):
+ self.fileobj = fileobj
+ self._offset = 0
+ self._buf = b''
+ self._buf_offset = 0
+
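+            # _offset counts bytes read from fileobj; _buf_offset is the
+            # read cursor within _buf, so tell() gives the logical position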
+ def _refill(self):
+ bytes_read = self.fileobj.read(self.READ_SIZE)
+ self._offset += len(bytes_read)
+ self._buf = self._buf[self._buf_offset:] + bytes_read
+ self._buf_offset = 0
+
+ def next_bytes(self, size):
+            """Return the next size bytes; unlike read(), size is required."""
+ if self._buf_offset + size - 1 >= len(self._buf):
+ self._refill()
+ try:
+ return self._buf[self._buf_offset:self._buf_offset+size]
+ finally:
+ self._buf_offset += size
+
+ def next_chunk(self):
+ if len(self._buf) - self._buf_offset < self.MIN_CHUNK_SIZE:
+ self._refill()
+ try:
+ return self._buf[self._buf_offset:]
+ finally:
+ self._buf_offset = len(self._buf)
+
+ def rewind(self, n):
+ if n < 0 or n > self._buf_offset:
+ raise IndexError
+ self._buf_offset -= n
+
+ def tell(self):
+ return self._offset - len(self._buf) + self._buf_offset
+
+ class GzipMemberReader(object):
+ def __init__(self, parent, member_offset):
+ self._parent = parent
+ self.eof = False
+ self.member_offset = member_offset
+
+ def _read_chunk(self, size=-1, delim=None):
+ if self.eof:
+ return b''
+
+ res = self._parent._decompress_until(size, delim)
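+            # the parent flags _new_member once this member is fully drained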
+ if self._parent._new_member:
+ self.eof = True
+
+ return res
+
+ def readline(self, size=-1):
+ return self._read_chunk(size, b'\n')
+
+ def read(self, size=-1):
+ return self._read_chunk(size)
+
+ def __iter__(self):
+ return iter(self.readline, b'')
+
+ def close(self):
+ self.eof = True
+
+ def __init__(self, fileobj):
+ self._cbuf = self.InputBuffer(fileobj)
+ self._decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
+ self._dbuf = b''
+ self._new_member = True
+ self._cbuf_new_member = True
+ self._member_offset = 0
+
+ def __iter__(self):
+ return self
+
+ def _decompress_until(self, size=-1, delim=None):
+        """Decompress data within the current member until delim is found,
+        size bytes are available, or the end of the member is reached. After
+        the end of the member is reached, subsequent calls return b''
+        (until the next call to self.__next__())."""
+ if self._new_member:
+ return b''
+ while True:
+ end = None
+ if delim is not None:
+ delim_offset = self._dbuf.find(delim, 0, size)
+ if delim_offset >= 0:
+ end = delim_offset + len(delim)
+ if end is None and size >= 0 and size < len(self._dbuf):
+ end = size
+ if end is None and self._cbuf_new_member:
+ end = len(self._dbuf)
+
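+            # everything decompressed has been handed out and the member's
+            # compressed data is exhausted: flag the member boundary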
+ if end == len(self._dbuf) and self._cbuf_new_member:
+ self._new_member = True
+
+ if end is not None:
+ res = self._dbuf[:end]
+ self._dbuf = self._dbuf[end:]
+ return res
+
+ tmp_cbuf = self._cbuf.next_chunk()
+ if tmp_cbuf == b'':
+ raise EOFError(
+ 'Compressed file ended before the end-of-stream '
+ 'marker was reached')
+ self._dbuf += self._decompressor.decompress(tmp_cbuf)
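+            # zlib leaves bytes past the deflate stream in unused_data; push
+            # them back, let _skip_eof eat the trailer, and note where the
+            # next member starts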
+ if self._decompressor.unused_data != b'':
+ self._cbuf.rewind(len(self._decompressor.unused_data))
+ self._skip_eof()
+ self._cbuf_new_member = True
+ self._member_offset = self._cbuf.tell()
+ self._decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
+
+ def __next__(self):
+ while not self._new_member:
+ self._decompress_until(8192)
+
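+        # peek one byte to tell a following member apart from end of file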
+ if self._cbuf.next_bytes(1) != b'':
+ self._cbuf.rewind(1)
+ res = self.GzipMemberReader(self, self._member_offset)
+ self._skip_gzip_header()
+ self._cbuf_new_member = False
+ self._new_member = False
+ return res
+ else:
+ raise StopIteration
+
+ # python2
+ def next(self):
+ return self.__next__()
+
+ def __enter__(self):
+ return self
+
+ def __exit__(self, type, value, traceback):
+ pass
+
+ def _skip_gzip_header(self):
+ magic = self._cbuf.next_bytes(2)
+
+ if magic != b'\037\213':
+ raise OSError('Not a gzipped file (%r)' % magic)
+
+ (method, flag, self._last_mtime) = struct.unpack(
+ "