diff --git a/.hgignore b/.hgignore
deleted file mode 100644
index c72a731..0000000
--- a/.hgignore
+++ /dev/null
@@ -1,17 +0,0 @@
-syntax: glob
-*.swp
-*.log
-*.pyc
-*.pyo
-*.warc
-*.gz
-login.txt
-.DS_Store
-build/*
-dist/*
-hanzo_warc_tools.egg-info/*
-*~
-*.orig
-debian/*
-*.deb
-test-reports/*
diff --git a/.hgtags b/.hgtags
deleted file mode 100644
index 26ce816..0000000
--- a/.hgtags
+++ /dev/null
@@ -1,9 +0,0 @@
-58d7d99406b04e7c36bfba1c91e2b06f558c22ee hanzo-4.0-rc0
-764a52f90a951a8c4acc9c9f60f5d8321662d418 hanzo-4.0-rc1
-94b65646332e5e86f3d274f66e38ce26cc30ccad hanzo-4.0
-092e8d0615ecc5ace8b067edbeacd5e3b12c9be0 hanzo-4.1-rc0
-8f64ab5556344065cd68e0cf8265af87e6b9d0cf hanzo-4.1-rc1
-8ceff9fcde584ec577048dbd9a13743d31dfc74f hanzo-4.1-rc2
-f54be58d0d8b3aa47b3f935a732a7b5752f0e92e hanzo-4.1-rc4
-0a1d728557b8d29b15b3796f83b6a9dc7f25abff build_success-2012-09-14T15-24-42.616660024
-741fe327f233f936cd65c6e2c415cd01f9fc9871 build_success-2012-09-14T16-25-56.483325901
diff --git a/.travis.yml b/.travis.yml
index 86d04d3..40ba390 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -6,14 +6,10 @@ python:
   - 3.3
   - 3.4
   - 3.5
+  - 3.5-dev
   - nightly
   - pypy
   - pypy3
-matrix:
-  allow_failures:
-    - python: 3.5
-    - python: nightly
-
 script: python setup.py test
diff --git a/README b/README
deleted file mode 100644
index c51c46e..0000000
--- a/README
+++ /dev/null
@@ -1,76 +0,0 @@
-dependencies
-    setuptools
-    unittest2
-    python 2.6
-
-hanzo warc tools:
-
-warcvalid.py
-    returns 0 if the arguments are all valid arc/warc files
-    non zero on error
-
-warcdump.py - writes human readable summary of warcfiles:
-    usage: python warcdump.py foo.warc foo.warc.gz
-    autodetects input format when filenames are passed
-    i.e recordgzip vs plaintext, warc vs arc
-
-    assumes uncompressed warc on stdin if no args
-
-warcfilter.py
-    python warcfilter.py pattern file file file
-    searches all headers for regex pattern
-    use -i to invert search
-    use -U to constrain to url
-    use -T to constrain to record type
-    use -C to constrain to content-type
-
-    autodetects and stdin like warcdump
-
-    prints out a warc format by default.
-
-warc2warc.py:
-    python warc2warc
-
-    autodetects compression on file
-    args, assumes uncompressed stdin if none
-
-    use -Z to write compressed output
-
-    i.e warc2warc -Z input > input.gz
-
-    should ignore buggy records in input
-
-arc2warc.py
-    creates a crappy warc file from arc files on input
-    a handful of headers are preserved
-    use -Z to write compressed output
-    i.e arc2warc -Z input.arc > input.warc.gz
-
-warcindex.py
-    spits out an index like this:
-#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length
-warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=request 193
-warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=response 3279474
-    not great, but a start
-
-notes:
-
-    arc2warc uses the conversion rules from the earlier arc2warc.c
-    as a starter for converting the headers
-
-    I haven't profiled the code yet (and don't plan to until it falls over)
-
-    warcvalid barely skirts some of the iso standard:
-    missing things: strict whitespace, required headers check...
-    mime quoted printable header encoding
-    treating headers as utf8
-
-things left to do (in no order):
-    lots more testing.
-    supporting pre 1.0 warc files
-    add more documentation
-    support more commandline options for output and filenames
-    s3 urls
-
-
--- tef thomas.figg@hanzoarchives.com
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..7740805
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,87 @@
+.. image:: https://travis-ci.org/internetarchive/warctools.svg?branch=master
+    :target: https://travis-ci.org/internetarchive/warctools
+
+warctools
+=========
+
+dependencies
+
+- setuptools
+- unittest2
+- python 2.7, 3.2+
+
+hanzo warc tools:
+
+warcvalid.py
+    returns 0 if the arguments are all valid arc/warc files,
+    non zero on error
+
+warcdump.py - writes a human-readable summary of warcfiles:
+    usage: ``python warcdump.py foo.warc foo.warc.gz``
+
+    autodetects input format when filenames are passed,
+    i.e. recordgzip vs plaintext, warc vs arc
+
+    assumes uncompressed warc on stdin if no args
+
+warcfilter.py
+    ``python warcfilter.py pattern file file file`` --
+    searches all headers for regex pattern
+
+    - use -i to invert search
+    - use -U to constrain to url
+    - use -T to constrain to record type
+    - use -C to constrain to content-type
+
+    autodetects input and reads stdin like warcdump
+
+    prints out a warc format by default.
+
+warc2warc.py:
+    ``python warc2warc``
+
+    autodetects compression on file
+    args, assumes uncompressed stdin if none
+
+    use -Z to write compressed output,
+    i.e. ``warc2warc -Z input > input.gz``
+
+    should ignore buggy records in input
+
+arc2warc.py
+    creates a crappy warc file from arc files on input
+    a handful of headers are preserved
+    use -Z to write compressed output,
+    i.e. ``arc2warc -Z input.arc > input.warc.gz``
+
+warcindex.py
+    spits out an index like this::
+
+        #WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length
+        warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=request 193
+        warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=response 3279474
+
+    not great, but a start
+
+notes:
+    arc2warc uses the conversion rules from the earlier arc2warc.c
+    as a starter for converting the headers
+
+    I haven't profiled the code yet (and don't plan to until it falls over)
+
+    warcvalid barely skirts some of the iso standard:
+    missing things: strict whitespace, required headers check...
+    mime quoted printable header encoding
+    treating headers as utf8
+
+things left to do (in no order):
+
+- lots more testing.
+- supporting pre 1.0 warc files
+- add more documentation
+- support more commandline options for output and filenames
+- s3 urls
+
+
+-- tef thomas.figg@hanzoarchives.com
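(A minimal library-level sketch, not part of this patch: it assumes the ``WarcRecord.open_archive``/``read_records`` interface that warcindex.py drives, so treat the exact names as illustrative.)

    from hanzo.warctools import WarcRecord

    # sketch: walk every record of a plain or record-gzipped warc/arc,
    # the way warcindex.py does; API names assumed from the tree
    fh = WarcRecord.open_archive('foo.warc.gz', gzip='auto')
    try:
        for offset, record, errors in fh.read_records(limit=None):
            if record:
                print(offset, record.type, record.url)
            elif errors:
                print(offset, 'errors:', errors)
    finally:
        fh.close()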
diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py
index 545b59c..ec69b19 100644
--- a/hanzo/warctools/arc.py
+++ b/hanzo/warctools/arc.py
@@ -68,7 +68,6 @@ def rx(pat):
 nl_rx = rx('^\r\n|\r|\n$')
 length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') #pylint: disable-msg=E1101
 type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$') #pylint: disable-msg=E1101
-SPLIT = re.compile(br'\b\s|\s\b').split
 
 class ArcParser(ArchiveParser):
     """A parser for arc archives."""
@@ -115,16 +114,21 @@ def parse(self, stream, offset, line=None):
             # configure parser instance
             self.version = arc_version.split()[0]
             self.headers = arc_names_line.strip().split()
-
+
+            # raj: some v1 ARC files incorrectly send a v2 header names line
+            if arc_names_line == b'URL IP-address Archive-date Content-type Result-code Checksum Location Offset Filepath Archive-length\n':
+                if arc_version == b'1 0 InternetArchive' and 5 == len(line.split(b' ')):
+                    self.headers = [b'URL', b'IP-address', b'Archive-date', b'Content-type', b'Archive-length']
+
             # now we have read header field in record body
             # we can extract the headers from the current record,
             # and read the length field
             # which is in a different place with v1 and v2
-
-            # read headers
+
+            # read headers
             arc_headers = self.parse_header_list(line)
-
+
             # extract content, ignoring header lines parsed already
             content_type, content_length, errors = \
                 self.get_content_headers(arc_headers)
@@ -139,7 +143,11 @@ def parse(self, stream, offset, line=None):
                                 raw_headers=raw_headers)
         else:
             if not self.headers:
-                raise Exception('missing filedesc')
+                # raj: some arc files are missing the filedesc:// line
+                #raise Exception('missing filedesc')
+                self.version = b'1'
+                self.headers = [b'URL', b'IP-address', b'Archive-date', b'Content-type', b'Archive-length']
+
             headers = self.parse_header_list(line)
             content_type, content_length, errors = \
                 self.get_content_headers(headers)
@@ -157,21 +165,32 @@ def trim(self, stream):
         return ()
 
     def parse_header_list(self, line):
-        # some people use ' ' as the empty value. lovely.
-        line = line.rstrip(b'\r\n')
-        values = SPLIT(line)
-        if len(self.headers) != len(values):
-            if self.headers[0] in (ArcRecord.URL, ArcRecord.CONTENT_TYPE):
-                # fencepost
-                values = [s[::-1] for s in reversed(SPLIT(line[::-1], len(self.headers)-1))]
-            else:
-                values = SPLIT(line, len(self.headers)-1)
-
-        if len(self.headers) != len(values):
-            raise Exception('missing headers %s %s'%(",".join(values), ",".join(self.headers)))
-
-        return list(zip(self.headers, values))
-
+        values = line.strip().split(b' ')
+        num_values = len(values)
+
+        # raj: some headers contain urls with unescaped spaces
+        if num_values > 5:
+            if re.match(br'^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]) and re.match(br'^\d{14}$', values[-3]) and re.match(br'^\d+$', values[-1]):
+                values = [b'%20'.join(values[0:-4]), values[-4], values[-3], values[-2], values[-1]]
+                num_values = len(values)
+
+        if 4 == num_values:
+            # raj: alexa arc files don't always have content-type in header
+            return list(zip(self.short_headers, values))
+        elif 5 == num_values:
+            # normal case
+            # raj: some old alexa arcs have ip-address and date transposed in the header
+            if re.match(br'^\d{14}$', values[1]) and re.match(br'^(?:\d{1,3}\.){3}\d{1,3}$', values[2]):
+                values[1], values[2] = values[2], values[1]
+
+            return list(zip(self.headers, values))
+        elif 6 == num_values:
+            # raj: some old alexa arcs have "content-type; charset" in the header
+            v = values[0:4] + values[5:]
+            v[3] = v[3].rstrip(b';')
+            return list(zip(self.headers, v))
+        else:
+            raise Exception('invalid number of header fields')
 
     @staticmethod
     def get_content_headers(headers):
@@ -195,3 +214,8 @@ def get_content_headers(headers):
 
 
 register_record_type(re.compile(br'^filedesc://'), ArcRecord)
+
+# raj: some arc files are missing the filedesc:// line
+url_record_regex = re.compile(br'^https?://\S+ (?:\d{1,3}\.){3}\d{1,3} \d{14} \S+ \d+$')
+register_record_type(url_record_regex, ArcRecord)
+
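(The header recovery rules above are easier to see in isolation. Here is a standalone sketch -- a hypothetical helper mirroring the heuristics this patch adds, not code from the tree -- that normalises a v1 ARC url-record line into the five expected fields:)

    import re

    IP_RX = re.compile(br'^(?:\d{1,3}\.){3}\d{1,3}$')  # dotted-quad ip-address
    DATE_RX = re.compile(br'^\d{14}$')                 # 14-digit archive-date

    def normalize_v1_fields(line):
        # target layout: url, ip-address, archive-date, content-type, archive-length
        values = line.strip().split(b' ')

        # a url with unescaped spaces pushes the field count past 5;
        # re-join the leading fields when the trailing ones still line up
        if len(values) > 5 and IP_RX.match(values[-4]) \
                and DATE_RX.match(values[-3]) and values[-1].isdigit():
            values = [b'%20'.join(values[:-4])] + values[-4:]

        # some old alexa arcs transpose ip-address and archive-date
        if len(values) == 5 and DATE_RX.match(values[1]) and IP_RX.match(values[2]):
            values[1], values[2] = values[2], values[1]

        return values

    assert normalize_v1_fields(
        b'http://example.com/a b.gif 1.2.3.4 20010101120000 image/gif 42'
    ) == [b'http://example.com/a%20b.gif', b'1.2.3.4',
          b'20010101120000', b'image/gif', b'42']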
diff --git a/hanzo/warctools/gz.py b/hanzo/warctools/gz.py
new file mode 100644
index 0000000..5a62a98
--- /dev/null
+++ b/hanzo/warctools/gz.py
@@ -0,0 +1,199 @@
+import struct
+import sys
+import os
+import zlib
+import io
+import gzip
+
+class MultiMemberGzipReader(object):
+    """Reads a file made of concatenated gzip members (e.g. a
+    record-gzipped warc), exposing each member as a file-like object."""
+
+    class InputBuffer(object):
+        MIN_CHUNK_SIZE = 1024
+        READ_SIZE = 8192
+
+        def __init__(self, fileobj):
+            self.fileobj = fileobj
+            self._offset = 0
+            self._buf = b''
+            self._buf_offset = 0
+
+        def _refill(self):
+            bytes_read = self.fileobj.read(self.READ_SIZE)
+            self._offset += len(bytes_read)
+            self._buf = self._buf[self._buf_offset:] + bytes_read
+            self._buf_offset = 0
+
+        def next_bytes(self, size):
+            """size is required here"""
+            if self._buf_offset + size - 1 >= len(self._buf):
+                self._refill()
+            try:
+                return self._buf[self._buf_offset:self._buf_offset+size]
+            finally:
+                self._buf_offset += size
+
+        def next_chunk(self):
+            if len(self._buf) - self._buf_offset < self.MIN_CHUNK_SIZE:
+                self._refill()
+            try:
+                return self._buf[self._buf_offset:]
+            finally:
+                self._buf_offset = len(self._buf)
+
+        def rewind(self, n):
+            if n < 0 or n > self._buf_offset:
+                raise IndexError
+            self._buf_offset -= n
+
+        def tell(self):
+            return self._offset - len(self._buf) + self._buf_offset
+
+    class GzipMemberReader(object):
+        def __init__(self, parent, member_offset):
+            self._parent = parent
+            self.eof = False
+            self.member_offset = member_offset
+
+        def _read_chunk(self, size=-1, delim=None):
+            if self.eof:
+                return b''
+
+            res = self._parent._decompress_until(size, delim)
+            if self._parent._new_member:
+                self.eof = True
+
+            return res
+
+        def readline(self, size=-1):
+            return self._read_chunk(size, b'\n')
+
+        def read(self, size=-1):
+            return self._read_chunk(size)
+
+        def __iter__(self):
+            return iter(self.readline, b'')
+
+        def close(self):
+            self.eof = True
+
+    def __init__(self, fileobj):
+        self._cbuf = self.InputBuffer(fileobj)
+        self._decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
+        self._dbuf = b''
+        self._new_member = True
+        self._cbuf_new_member = True
+        self._member_offset = 0
+
+    def __iter__(self):
+        return self
+
+    def _decompress_until(self, size=-1, delim=None):
+        """Decompresses within until delim is found, size is reached, or the
+        end of the member. After the end of the member is reached, subsequent
+        calls return b'' (until the next call to self.__next__())."""
+        if self._new_member:
+            return b''
+        while True:
+            end = None
+            if delim is not None:
+                delim_offset = self._dbuf.find(delim, 0, size)
+                if delim_offset >= 0:
+                    end = delim_offset + len(delim)
+            if end is None and size >= 0 and size < len(self._dbuf):
+                end = size
+            if end is None and self._cbuf_new_member:
+                end = len(self._dbuf)
+
+            if end == len(self._dbuf) and self._cbuf_new_member:
+                self._new_member = True
+
+            if end is not None:
+                res = self._dbuf[:end]
+                self._dbuf = self._dbuf[end:]
+                return res
+
+            tmp_cbuf = self._cbuf.next_chunk()
+            if tmp_cbuf == b'':
+                raise EOFError(
+                        'Compressed file ended before the end-of-stream '
+                        'marker was reached')
+            self._dbuf += self._decompressor.decompress(tmp_cbuf)
+            if self._decompressor.unused_data != b'':
+                self._cbuf.rewind(len(self._decompressor.unused_data))
+                self._skip_eof()
+                self._cbuf_new_member = True
+                self._member_offset = self._cbuf.tell()
+                self._decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
+
+    def __next__(self):
+        while not self._new_member:
+            self._decompress_until(8192)
+
+        if self._cbuf.next_bytes(1) != b'':
+            self._cbuf.rewind(1)
+            res = self.GzipMemberReader(self, self._member_offset)
+            self._skip_gzip_header()
+            self._cbuf_new_member = False
+            self._new_member = False
+            return res
+        else:
+            raise StopIteration
+
+    # python2
+    def next(self):
+        return self.__next__()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        pass
+
+    def _skip_gzip_header(self):
+        magic = self._cbuf.next_bytes(2)
+
+        if magic != b'\037\213':
+            raise OSError('Not a gzipped file (%r)' % magic)
+
+        (method, flag, self._last_mtime) = struct.unpack( "
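(A minimal usage sketch for the reader above, assuming only the iteration protocol shown in the diff: each gzip member of a record-compressed warc holds one record, and member_offset is the compressed offset an index would store.)

    from hanzo.warctools.gz import MultiMemberGzipReader

    with open('foo.warc.gz', 'rb') as f:
        for member in MultiMemberGzipReader(f):
            first_line = member.readline()  # e.g. b'WARC/1.0\r\n'
            print(member.member_offset, first_line.rstrip())
            # no need to drain the member by hand: __next__ decompresses
            # past any unread remainder before yielding the next member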