From 27fb0814f3cb9f87f646ff6b5a5587df3cbf9510 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Mar 2016 20:43:50 +0000 Subject: [PATCH 01/13] invalidate content_file of previous record when reading next one --- hanzo/warctools/stream.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/hanzo/warctools/stream.py b/hanzo/warctools/stream.py index 1fecc91..5b71eff 100644 --- a/hanzo/warctools/stream.py +++ b/hanzo/warctools/stream.py @@ -67,7 +67,11 @@ def read_records(self, limit=1, offsets=True): Record is an object and errors is an empty list or record is none and errors is a list""" nrecords = 0 + record = None while limit is None or nrecords < limit: + if record: + # invalidate content_file of previous record + record.content_file = None offset, record, errors = self._read_record(offsets) nrecords += 1 yield (offset, record, errors) @@ -75,7 +79,11 @@ def read_records(self, limit=1, offsets=True): break def __iter__(self): + record = None while True: + if record: + # invalidate content_file of previous record + record.content_file = None _, record, errors = self._read_record(offsets=False) if record: yield record From 98341bbce759f1655d0259b4ea83dbddcd30fc9a Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 22 Mar 2016 20:45:40 +0000 Subject: [PATCH 02/13] port over raj hacks from old cdx-writer branch to accept arcs missing filedesc header, and split line only on space character --- hanzo/warctools/arc.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py index 545b59c..49465e3 100644 --- a/hanzo/warctools/arc.py +++ b/hanzo/warctools/arc.py @@ -68,7 +68,10 @@ def rx(pat): nl_rx = rx('^\r\n|\r|\n$') length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') #pylint: disable-msg=E1101 type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$') #pylint: disable-msg=E1101 -SPLIT = re.compile(br'\b\s|\s\b').split +#raj/noah: change the call to split below to only split on space (some arcs +#have a \x0c formfeed character in the url) +# SPLIT = re.compile(br'\b\s|\s\b').split +SPLIT = re.compile(br'\b | \b').split class ArcParser(ArchiveParser): """A parser for arc archives.""" @@ -115,16 +118,16 @@ def parse(self, stream, offset, line=None): # configure parser instance self.version = arc_version.split()[0] self.headers = arc_names_line.strip().split() - + # now we have read header field in record body # we can extract the headers from the current record, # and read the length field # which is in a different place with v1 and v2 - - # read headers + + # read headers arc_headers = self.parse_header_list(line) - + # extract content, ignoring header lines parsed already content_type, content_length, errors = \ self.get_content_headers(arc_headers) @@ -139,7 +142,11 @@ def parse(self, stream, offset, line=None): raw_headers=raw_headers) else: if not self.headers: - raise Exception('missing filedesc') + #raj: some arc files are missing the filedesc:// line + #raise Exception('missing filedesc') + self.version = '1' + self.headers = ['URL', 'IP-address', 'Archive-date', 'Content-type', 'Archive-length'] + headers = self.parse_header_list(line) content_type, content_length, errors = \ self.get_content_headers(headers) @@ -169,7 +176,7 @@ def parse_header_list(self, line): if len(self.headers) != len(values): raise Exception('missing headers %s %s'%(",".join(values), ",".join(self.headers))) - + return list(zip(self.headers, values)) @@ -195,3 +202,8 @@ def get_content_headers(headers): register_record_type(re.compile(br'^filedesc://'), ArcRecord) + +#raj: some arc files are missing the filedesc:// line +url_record_regex = re.compile('^https?://\S+ (?:\d{1,3}\.){3}\d{1,3} \d{14} \S+ \d+$') +register_record_type(url_record_regex, ArcRecord) + From fb47352438acaee77e7897511413a3d686da299e Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Mon, 28 Mar 2016 22:55:01 +0000 Subject: [PATCH 03/13] new class MultiMemberGzipReader for better member-at-a-time gzip parsing (fixes fundamental problem with old approach, that a read on a short gzip member would read into the following member) --- .travis.yml | 7 +- hanzo/warctools/gz.py | 194 +++++++++++++++++++ hanzo/warctools/stream.py | 47 +---- hanzo/warctools/tests/test_warctools.py | 240 +++++++++++++++++++++++- setup.py | 2 +- 5 files changed, 443 insertions(+), 47 deletions(-) create mode 100644 hanzo/warctools/gz.py diff --git a/.travis.yml b/.travis.yml index 8c2dc14..f59ffda 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,11 +1,12 @@ -# vim: set sw=4 et: -# - language: python python: - "2.7" - "3.2" - "3.3" + - "3.4" + - "3.5" + - "dev" + - "nightly" - "pypy" script: python setup.py test diff --git a/hanzo/warctools/gz.py b/hanzo/warctools/gz.py new file mode 100644 index 0000000..c7db399 --- /dev/null +++ b/hanzo/warctools/gz.py @@ -0,0 +1,194 @@ +import struct +import sys +import os +import zlib +import io +import gzip + +class MultiMemberGzipReader(object): + class InputBuffer(object): + MIN_CHUNK_SIZE = 1024 + READ_SIZE = 8192 + + def __init__(self, fileobj): + self.fileobj = fileobj + self._offset = 0 + self._buf = b'' + self._buf_offset = 0 + + def _refill(self): + bytes_read = self.fileobj.read(self.READ_SIZE) + self._offset += len(bytes_read) + self._buf = self._buf[self._buf_offset:] + bytes_read + self._buf_offset = 0 + + def next_bytes(self, size): + """size is required here""" + if self._buf_offset + size - 1 >= len(self._buf): + self._refill() + try: + return self._buf[self._buf_offset:self._buf_offset+size] + finally: + self._buf_offset += size + + def next_chunk(self): + if len(self._buf) - self._buf_offset < self.MIN_CHUNK_SIZE: + self._refill() + try: + return self._buf[self._buf_offset:] + finally: + self._buf_offset = len(self._buf) + + def rewind(self, n): + if n < 0 or n > self._buf_offset: + raise IndexError + self._buf_offset -= n + + def tell(self): + return self._offset - len(self._buf) + self._buf_offset + + class GzipMemberReader(object): + def __init__(self, parent, member_offset): + self._parent = parent + self.eof = False + self.member_offset = member_offset + + def _read_chunk(self, size=-1, delim=None): + if self.eof: + return b'' + + res = self._parent._decompress_until(size, delim) + if self._parent._new_member: + self.eof = True + + return res + + def readline(self, size=-1): + return self._read_chunk(size, b'\n') + + def read(self, size=-1): + return self._read_chunk(size) + + def __iter__(self): + return iter(self.readline, b'') + + def close(self): + self.eof = True + + def __init__(self, fileobj): + self._cbuf = self.InputBuffer(fileobj) + self._decompressor = zlib.decompressobj(-zlib.MAX_WBITS) + self._dbuf = b'' + self._new_member = True + self._cbuf_new_member = True + self._member_offset = 0 + + def __iter__(self): + return self + + def _decompress_until(self, size=-1, delim=None): + """Decompresses within until delim is found, size is reached, or the + end of the member. After the end of the member is reached, subsequent + calls return b'' (until the next call to self.__next__()).""" + if self._new_member: + return b'' + while True: + end = None + if delim is not None: + delim_offset = self._dbuf.find(delim, 0, size) + if delim_offset >= 0: + end = delim_offset + len(delim) + if end is None and size >= 0 and size < len(self._dbuf): + end = size + if end is None and self._cbuf_new_member: + end = len(self._dbuf) + + if end == len(self._dbuf) and self._cbuf_new_member: + self._new_member = True + + if end is not None: + res = self._dbuf[:end] + self._dbuf = self._dbuf[end:] + return res + + self._dbuf += self._decompressor.decompress(self._cbuf.next_chunk()) + if self._decompressor.unused_data != b'': + self._cbuf.rewind(len(self._decompressor.unused_data)) + self._skip_eof() + self._cbuf_new_member = True + self._member_offset = self._cbuf.tell() + self._decompressor = zlib.decompressobj(-zlib.MAX_WBITS) + + def __next__(self): + while not self._new_member: + self._decompress_until(8192) + + if self._cbuf.next_bytes(1) != b'': + self._cbuf.rewind(1) + res = self.GzipMemberReader(self, self._member_offset) + self._skip_gzip_header() + self._cbuf_new_member = False + self._new_member = False + return res + else: + raise StopIteration + + # python2 + def next(self): + return self.__next__() + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + pass + + def _skip_gzip_header(self): + magic = self._cbuf.next_bytes(2) + + if magic != b'\037\213': + raise OSError('Not a gzipped file (%r)' % magic) + + (method, flag, self._last_mtime) = struct.unpack( + " Date: Mon, 28 Mar 2016 23:14:30 +0000 Subject: [PATCH 04/13] oops travis has no such version of python "dev" --- .travis.yml | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index f59ffda..83f75e5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,12 +1,13 @@ language: python python: - - "2.7" - - "3.2" - - "3.3" - - "3.4" - - "3.5" - - "dev" - - "nightly" - - "pypy" + - 2.7 + - 3.2 + - 3.3 + - 3.4 + - 3.5 + - 3.5-dev + - nightly + - pypy + - pypy3 script: python setup.py test From 6cc3575961d3fae84acc2fc0ad078b86f8b35094 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 Mar 2016 06:39:11 +0000 Subject: [PATCH 05/13] handle arc record header corner cases (copied raj code from ihttps://github.com/internetarchive/warctools/blob/cdx-writer/hanzo/warctools/arc.py --- hanzo/warctools/arc.py | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py index 49465e3..d1b70de 100644 --- a/hanzo/warctools/arc.py +++ b/hanzo/warctools/arc.py @@ -164,21 +164,32 @@ def trim(self, stream): return () def parse_header_list(self, line): - # some people use ' ' as the empty value. lovely. - line = line.rstrip(b'\r\n') - values = SPLIT(line) - if len(self.headers) != len(values): - if self.headers[0] in (ArcRecord.URL, ArcRecord.CONTENT_TYPE): - # fencepost - values = [s[::-1] for s in reversed(SPLIT(line[::-1], len(self.headers)-1))] - else: - values = SPLIT(line, len(self.headers)-1) - - if len(self.headers) != len(values): - raise Exception('missing headers %s %s'%(",".join(values), ",".join(self.headers))) - - return list(zip(self.headers, values)) - + values = SPLIT(line.strip()) + num_values = len(values) + + #raj: some headers contain urls with unescaped spaces + if num_values > 5: + if re.match('^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]) and re.match('^\d{14}$', values[-3]) and re.match('^\d+$', values[-1]): + values = ['%20'.join(values[0:-4]), values[-4], values[-3], values[-2], values[-1]] + num_values = len(values) + + if 4 == num_values: + #raj: alexa arc files don't always have content-type in header + return list(zip(self.short_headers, values)) + elif 5 == num_values: + #normal case + #raj: some old alexa arcs have ip-address and date transposed in the header + if re.match('^\d{14}$', values[1]) and re.match('^(?:\d{1,3}\.){3}\d{1,3}$', values[2]): + values[1], values[2] = values[2], values[1] + + return list(zip(self.headers, values)) + elif 6 == num_values: + #raj: some old alexa arcs have "content-type; charset" in the header + v = values[0:4]+values[5:] + v[3] = v[3].rstrip(';') + return list(zip(self.headers, v)) + else: + raise Exception('invalid number of header fields') @staticmethod def get_content_headers(headers): From af79aa5b8934ad7cc7578cd259e90fbb672c48c8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Tue, 29 Mar 2016 07:30:10 +0000 Subject: [PATCH 06/13] fix more corner cases, fix py --- hanzo/warctools/arc.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py index d1b70de..7e31f3e 100644 --- a/hanzo/warctools/arc.py +++ b/hanzo/warctools/arc.py @@ -119,6 +119,11 @@ def parse(self, stream, offset, line=None): self.version = arc_version.split()[0] self.headers = arc_names_line.strip().split() + # raj: some v1 ARC files are incorrectly sending a v2 header names line + if arc_names_line == 'URL IP-address Archive-date Content-type Result-code Checksum Location Offset Filepath Archive-length\n': + if arc_version == '1 0 InternetArchive' and 5 == len(line.split(' ')): + self.headers = ['URL', 'IP-address', 'Archive-date', 'Content-type', 'Archive-length'] + # now we have read header field in record body # we can extract the headers from the current record, # and read the length field @@ -164,13 +169,13 @@ def trim(self, stream): return () def parse_header_list(self, line): - values = SPLIT(line.strip()) + values = line.strip().split(b' ') num_values = len(values) #raj: some headers contain urls with unescaped spaces if num_values > 5: - if re.match('^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]) and re.match('^\d{14}$', values[-3]) and re.match('^\d+$', values[-1]): - values = ['%20'.join(values[0:-4]), values[-4], values[-3], values[-2], values[-1]] + if re.match(b'^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]) and re.match('^\d{14}$', values[-3]) and re.match('^\d+$', values[-1]): + values = [b'%20'.join(values[0:-4]), values[-4], values[-3], values[-2], values[-1]] num_values = len(values) if 4 == num_values: @@ -179,7 +184,7 @@ def parse_header_list(self, line): elif 5 == num_values: #normal case #raj: some old alexa arcs have ip-address and date transposed in the header - if re.match('^\d{14}$', values[1]) and re.match('^(?:\d{1,3}\.){3}\d{1,3}$', values[2]): + if re.match(b'^\d{14}$', values[1]) and re.match(b'^(?:\d{1,3}\.){3}\d{1,3}$', values[2]): values[1], values[2] = values[2], values[1] return list(zip(self.headers, values)) From 13fc74dfda9968ee985115d4a5e3cda53940577b Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 Mar 2016 19:38:17 +0000 Subject: [PATCH 07/13] never snarf content as a side effect of looking at record.content_type --- hanzo/warctools/record.py | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/hanzo/warctools/record.py b/hanzo/warctools/record.py index 9d9d094..b1f52cf 100644 --- a/hanzo/warctools/record.py +++ b/hanzo/warctools/record.py @@ -55,10 +55,6 @@ def error(self, *args): def type(self): return self.get_header(self.TYPE) - @property - def content_type(self): - return self.content[0] - @property def content_file(self): """ @@ -103,15 +99,13 @@ def content(self): @property def content_type(self): - """If self.content tuple was supplied, or has already been snarfed, or - we don't have a Content-Type header, return self.content[0]. Otherwise, - return the value of the Content-Type header.""" - if self._content is None: - content_type = self.get_header(self.CONTENT_TYPE) - if content_type is not None: - return content_type - - return self.content[0] + """If self.content tuple was supplied, or has already been snarfed, + return self.content[0]. Otherwise, return the value of the Content-Type + header.""" + if self._content: + return self._content[0] + else: + return self.get_header(self.CONTENT_TYPE) @property def content_length(self): From 9c346e0550581ac526b057381eea7ddfaac4f236 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 Mar 2016 19:38:27 +0000 Subject: [PATCH 08/13] improve error message --- hanzo/warctools/stream.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hanzo/warctools/stream.py b/hanzo/warctools/stream.py index b1c959c..e9502de 100644 --- a/hanzo/warctools/stream.py +++ b/hanzo/warctools/stream.py @@ -25,7 +25,7 @@ def open_record_stream(record_class=None, filename=None, file_handle=None, record_class = guess_record_type(file_handle) if record_class == None: - raise Exception('Failed to guess compression') + raise Exception('Failed to guess record type') record_parser = record_class.make_parser() From 2d3cbc5b0652b01fcf83b2ee56a97d202de4ec26 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Wed, 30 Mar 2016 19:39:43 +0000 Subject: [PATCH 09/13] remove unusued SPLIT thing --- hanzo/warctools/arc.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py index 7e31f3e..ec69b19 100644 --- a/hanzo/warctools/arc.py +++ b/hanzo/warctools/arc.py @@ -68,10 +68,6 @@ def rx(pat): nl_rx = rx('^\r\n|\r|\n$') length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') #pylint: disable-msg=E1101 type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$') #pylint: disable-msg=E1101 -#raj/noah: change the call to split below to only split on space (some arcs -#have a \x0c formfeed character in the url) -# SPLIT = re.compile(br'\b\s|\s\b').split -SPLIT = re.compile(br'\b | \b').split class ArcParser(ArchiveParser): """A parser for arc archives.""" From fb49c7c3e394697eb99a8149bc54f098462c7a84 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 2 Sep 2016 12:32:46 -0700 Subject: [PATCH 10/13] change to RST (restructured text) and add travis-ci badge --- README | 76 ----------------------------------------------- README.rst | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 76 deletions(-) delete mode 100644 README create mode 100644 README.rst diff --git a/README b/README deleted file mode 100644 index c51c46e..0000000 --- a/README +++ /dev/null @@ -1,76 +0,0 @@ -dependencies - setuptools - unittest2 - python 2.6 - -hanzo warc tools: - - warcvalid.py - returns 0 if the arguments are all valid arc/warc files - non zero on error - - warcdump.py - writes human readable summary of warcfiles: - usage: python warcdump.py foo.warc foo.warc.gz - autodetects input format when filenames are passed - i.e recordgzip vs plaintext, warc vs arc - - assumes uncompressed warc on stdin if no args - - warcfilter.py - python warcfilter.py pattern file file file - searches all headers for regex pattern - use -i to invert search - use -U to constrain to url - use -T to constrain to record type - use -C to constrain to content-type - - autodetects and stdin like warcdump - - prints out a warc format by default. - - warc2warc.py: - python warc2warc - - autodetects compression on file - args, assumes uncompressed stdin if none - - use -Z to write compressed output - - i.e warc2warc -Z input > input.gz - - should ignore buggy records in input - - arc2warc.py - creates a crappy warc file from arc files on input - a handful of headers are preserved - use -Z to write compressed output - i.e arc2warc -Z input.arc > input.warc.gz - - warcindex.py - spits out an index like this: -#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length -warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=request 193 -warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=response 3279474 - not great, but a start - -notes: - - arc2warc uses the conversion rules from the earlier arc2warc.c - as a starter for converting the headers - - I haven't profiled the code yet (and don't plan to until it falls over) - - warcvalid barely skirts some of the iso standard: - missing things: strict whitespace, required headers check... - mime quoted printable header encoding - treating headers as utf8 - -things left to do (in no order): - lots more testing. - supporting pre 1.0 warc files - add more documentation - support more commandline options for output and filenames - s3 urls - - --- tef thomas.figg@hanzoarchives.com diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..a1797f6 --- /dev/null +++ b/README.rst @@ -0,0 +1,87 @@ +.. image:: https://travis-ci.org/internetarchive/warctools.svg?branch=master + :target: https://travis-ci.org/internetarchive/warctools + +warctools +========= + +dependencies + + - setuptools + - unittest2 + - python 2.7, 3.2+ + +hanzo warc tools: + +warcvalid.py + returns 0 if the arguments are all valid arc/warc files + non zero on error + +warcdump.py - writes human readable summary of warcfiles: + usage: ``python warcdump.py foo.warc foo.warc.gz`` + + autodetects input format when filenames are passed, + i.e recordgzip vs plaintext, warc vs arc + + assumes uncompressed warc on stdin if no args + +warcfilter.py + ``python warcfilter.py pattern file file file`` -- + searches all headers for regex pattern + + - use -i to invert search + - use -U to constrain to url + - use -T to constrain to record type + - use -C to constrain to content-type + + autodetects and stdin like warcdump + + prints out a warc format by default. + +warc2warc.py: + ``python warc2warc `` + + autodetects compression on file + args, assumes uncompressed stdin if none + + use -Z to write compressed output + + i.e warc2warc -Z input > input.gz + + should ignore buggy records in input + +arc2warc.py + creates a crappy warc file from arc files on input + a handful of headers are preserved + use -Z to write compressed output, + i.e ``arc2warc -Z input.arc > input.warc.gz`` + +warcindex.py + spits out an index like this:: + + #WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length + warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=request 193 + warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf application/http;msgtype=response 3279474 + + not great, but a start + +notes: + arc2warc uses the conversion rules from the earlier arc2warc.c + as a starter for converting the headers + + I haven't profiled the code yet (and don't plan to until it falls over) + + warcvalid barely skirts some of the iso standard: + missing things: strict whitespace, required headers check... + mime quoted printable header encoding + treating headers as utf8 + +things left to do (in no order): + + - lots more testing. + - supporting pre 1.0 warc files + - add more documentation + - support more commandline options for output and filenames + - s3 urls + + +-- tef thomas.figg@hanzoarchives.com From f0b8866de966888bd493e0681716e5f9bdfbe9d6 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 2 Sep 2016 12:33:59 -0700 Subject: [PATCH 11/13] unindent lists to make them look normal --- README.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index a1797f6..7740805 100644 --- a/README.rst +++ b/README.rst @@ -6,9 +6,9 @@ warctools dependencies - - setuptools - - unittest2 - - python 2.7, 3.2+ +- setuptools +- unittest2 +- python 2.7, 3.2+ hanzo warc tools: @@ -77,11 +77,11 @@ notes: things left to do (in no order): - - lots more testing. - - supporting pre 1.0 warc files - - add more documentation - - support more commandline options for output and filenames - - s3 urls +- lots more testing. +- supporting pre 1.0 warc files +- add more documentation +- support more commandline options for output and filenames +- s3 urls -- tef thomas.figg@hanzoarchives.com From 1e5b380fe23623ab7dc3421285eaffaf1055c335 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Fri, 2 Sep 2016 13:08:27 -0700 Subject: [PATCH 12/13] not using mercurial anymore --- .hgignore | 17 ----------------- .hgtags | 9 --------- 2 files changed, 26 deletions(-) delete mode 100644 .hgignore delete mode 100644 .hgtags diff --git a/.hgignore b/.hgignore deleted file mode 100644 index c72a731..0000000 --- a/.hgignore +++ /dev/null @@ -1,17 +0,0 @@ -syntax: glob -*.swp -*.log -*.pyc -*.pyo -*.warc -*.gz -login.txt -.DS_Store -build/* -dist/* -hanzo_warc_tools.egg-info/* -*~ -*.orig -debian/* -*.deb -test-reports/* diff --git a/.hgtags b/.hgtags deleted file mode 100644 index 26ce816..0000000 --- a/.hgtags +++ /dev/null @@ -1,9 +0,0 @@ -58d7d99406b04e7c36bfba1c91e2b06f558c22ee hanzo-4.0-rc0 -764a52f90a951a8c4acc9c9f60f5d8321662d418 hanzo-4.0-rc1 -94b65646332e5e86f3d274f66e38ce26cc30ccad hanzo-4.0 -092e8d0615ecc5ace8b067edbeacd5e3b12c9be0 hanzo-4.1-rc0 -8f64ab5556344065cd68e0cf8265af87e6b9d0cf hanzo-4.1-rc1 -8ceff9fcde584ec577048dbd9a13743d31dfc74f hanzo-4.1-rc2 -f54be58d0d8b3aa47b3f935a732a7b5752f0e92e hanzo-4.1-rc4 -0a1d728557b8d29b15b3796f83b6a9dc7f25abff build_success-2012-09-14T15-24-42.616660024 -741fe327f233f936cd65c6e2c415cd01f9fc9871 build_success-2012-09-14T16-25-56.483325901 From cf898415d1731f18f3d02ccff6afa0a63175fbd8 Mon Sep 17 00:00:00 2001 From: Noah Levitt Date: Thu, 26 Jan 2017 09:55:11 -0800 Subject: [PATCH 13/13] fix bug spinning forever in case gzip input stream ends in the middle of the data section, and add tests for incomplete gzip input --- hanzo/warctools/gz.py | 7 ++++++- hanzo/warctools/tests/test_warctools.py | 25 +++++++++++++++++++------ setup.py | 2 +- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/hanzo/warctools/gz.py b/hanzo/warctools/gz.py index c7db399..5a62a98 100644 --- a/hanzo/warctools/gz.py +++ b/hanzo/warctools/gz.py @@ -111,7 +111,12 @@ def _decompress_until(self, size=-1, delim=None): self._dbuf = self._dbuf[end:] return res - self._dbuf += self._decompressor.decompress(self._cbuf.next_chunk()) + tmp_cbuf = self._cbuf.next_chunk() + if tmp_cbuf == b'': + raise EOFError( + 'Compressed file ended before the end-of-stream ' + 'marker was reached') + self._dbuf += self._decompressor.decompress(tmp_cbuf) if self._decompressor.unused_data != b'': self._cbuf.rewind(len(self._decompressor.unused_data)) self._skip_eof() diff --git a/hanzo/warctools/tests/test_warctools.py b/hanzo/warctools/tests/test_warctools.py index ab05fdf..05bfd35 100644 --- a/hanzo/warctools/tests/test_warctools.py +++ b/hanzo/warctools/tests/test_warctools.py @@ -570,17 +570,30 @@ def test_readline_with_size(self): # def test_invalid_gzip(self): # # XXX # # test crc mismatch - # # test end of stream in middle of - # # - magic - # # - various parts of gzip header - # # - data - # # - gzip footer # # test invalid # # - magic # # - gzip header # # - data # # - gzip footer - # pass + + # test end of stream in middle of + # - magic + # - various parts of gzip header + # - data + # - gzip footer + def test_incomplete_gzip(self): + for l in range(1, len(self.GZ_8BYTE)): + with BytesIO(self.GZ_8BYTE[:l]) as f: + with MultiMemberGzipReader(f) as g: + with self.assertRaises(Exception): + for m in g: + buf = m.read() + + # sanity check of the full gzip + with BytesIO(self.GZ_8BYTE) as f: + with MultiMemberGzipReader(f) as g: + for m in g: + self.assertEqual(b'abcdefgh', m.read()) # def test_bad_behavior(self): # # XXX diff --git a/setup.py b/setup.py index 35a0b18..ecd4c1b 100755 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ from setuptools import setup setup(name='warctools', - version="5.0.dev1", + version="5.0.dev2", license="MIT License", description='Command line tools and libraries for handling and manipulating WARC files (and HTTP contents)', author='Thomas Figg',