From 27fb0814f3cb9f87f646ff6b5a5587df3cbf9510 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Tue, 22 Mar 2016 20:43:50 +0000
Subject: [PATCH 01/13] invalidate content_file of previous record when reading
 next one

---
 hanzo/warctools/stream.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/hanzo/warctools/stream.py b/hanzo/warctools/stream.py
index 1fecc91..5b71eff 100644
--- a/hanzo/warctools/stream.py
+++ b/hanzo/warctools/stream.py
@@ -67,7 +67,11 @@ def read_records(self, limit=1, offsets=True):
         Record is an object and errors is an empty list
         or record is none and errors is a list"""
         nrecords = 0
+        record = None
         while limit is None or nrecords < limit:
+            if record:
+                # invalidate content_file of previous record
+                record.content_file = None
             offset, record, errors = self._read_record(offsets)
             nrecords += 1
             yield (offset, record, errors)
@@ -75,7 +79,11 @@ def read_records(self, limit=1, offsets=True):
                 break
 
     def __iter__(self):
+        record = None
         while True:
+            if record:
+                # invalidate content_file of previous record
+                record.content_file = None
             _, record, errors = self._read_record(offsets=False)
             if record:
                 yield record

From 98341bbce759f1655d0259b4ea83dbddcd30fc9a Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Tue, 22 Mar 2016 20:45:40 +0000
Subject: [PATCH 02/13] port over raj hacks from old cdx-writer branch to
 accept arcs missing filedesc header, and split line only on space character

---
 hanzo/warctools/arc.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py
index 545b59c..49465e3 100644
--- a/hanzo/warctools/arc.py
+++ b/hanzo/warctools/arc.py
@@ -68,7 +68,10 @@ def rx(pat):
 nl_rx = rx('^\r\n|\r|\n$')
 length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') #pylint: disable-msg=E1101
 type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$')     #pylint: disable-msg=E1101
-SPLIT = re.compile(br'\b\s|\s\b').split
+#raj/noah: change the call to split below to only split on space (some arcs
+#have a \x0c formfeed character in the url)
+# SPLIT = re.compile(br'\b\s|\s\b').split
+SPLIT = re.compile(br'\b | \b').split
 
 class ArcParser(ArchiveParser):
     """A parser for arc archives."""
@@ -115,16 +118,16 @@ def parse(self, stream, offset, line=None):
             # configure parser instance
             self.version = arc_version.split()[0]
             self.headers = arc_names_line.strip().split()
-            
+
             # now we have read header field in record body
             # we can extract the headers from the current record,
             # and read the length field
 
             # which is in a different place with v1 and v2
-        
-            # read headers 
+
+            # read headers
             arc_headers = self.parse_header_list(line)
-            
+
             # extract content, ignoring header lines parsed already
             content_type, content_length, errors = \
                 self.get_content_headers(arc_headers)
@@ -139,7 +142,11 @@ def parse(self, stream, offset, line=None):
                                      raw_headers=raw_headers)
         else:
             if not self.headers:
-                raise Exception('missing filedesc')
+                #raj: some arc files are missing the filedesc:// line
+                #raise Exception('missing filedesc')
+                self.version = '1'
+                self.headers = ['URL', 'IP-address', 'Archive-date', 'Content-type', 'Archive-length']
+
             headers = self.parse_header_list(line)
             content_type, content_length, errors = \
                 self.get_content_headers(headers)
@@ -169,7 +176,7 @@ def parse_header_list(self, line):
 
         if len(self.headers) != len(values):
             raise Exception('missing headers %s %s'%(",".join(values), ",".join(self.headers)))
-                
+
         return list(zip(self.headers, values))
 
 
@@ -195,3 +202,8 @@ def get_content_headers(headers):
 
 
 register_record_type(re.compile(br'^filedesc://'), ArcRecord)
+
+#raj: some arc files are missing the filedesc:// line
+url_record_regex = re.compile('^https?://\S+ (?:\d{1,3}\.){3}\d{1,3} \d{14} \S+ \d+$')
+register_record_type(url_record_regex, ArcRecord)
+

From fb47352438acaee77e7897511413a3d686da299e Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Mon, 28 Mar 2016 22:55:01 +0000
Subject: [PATCH 03/13] new class MultiMemberGzipReader for better
 member-at-a-time gzip parsing (fixes fundamental problem with old approach,
 that a read on a short gzip member would read into the following member)

---
 .travis.yml                             |   7 +-
 hanzo/warctools/gz.py                   | 194 +++++++++++++++++++
 hanzo/warctools/stream.py               |  47 +----
 hanzo/warctools/tests/test_warctools.py | 240 +++++++++++++++++++++++-
 setup.py                                |   2 +-
 5 files changed, 443 insertions(+), 47 deletions(-)
 create mode 100644 hanzo/warctools/gz.py

diff --git a/.travis.yml b/.travis.yml
index 8c2dc14..f59ffda 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,11 +1,12 @@
-# vim: set sw=4 et:
-#
-
 language: python
 python:
     - "2.7"
     - "3.2"
     - "3.3"
+    - "3.4"
+    - "3.5"
+    - "dev"
+    - "nightly"
     - "pypy"
 script: python setup.py test
 
diff --git a/hanzo/warctools/gz.py b/hanzo/warctools/gz.py
new file mode 100644
index 0000000..c7db399
--- /dev/null
+++ b/hanzo/warctools/gz.py
@@ -0,0 +1,194 @@
+import struct
+import sys
+import os
+import zlib
+import io
+import gzip
+
+class MultiMemberGzipReader(object):
+    class InputBuffer(object):
+        MIN_CHUNK_SIZE = 1024
+        READ_SIZE = 8192
+
+        def __init__(self, fileobj):
+            self.fileobj = fileobj
+            self._offset = 0
+            self._buf = b''
+            self._buf_offset = 0
+
+        def _refill(self):
+            bytes_read = self.fileobj.read(self.READ_SIZE)
+            self._offset += len(bytes_read)
+            self._buf = self._buf[self._buf_offset:] + bytes_read
+            self._buf_offset = 0
+
+        def next_bytes(self, size):
+            """size is required here"""
+            if self._buf_offset + size - 1 >= len(self._buf):
+                self._refill()
+            try:
+                return self._buf[self._buf_offset:self._buf_offset+size]
+            finally:
+                self._buf_offset += size
+
+        def next_chunk(self):
+            if len(self._buf) - self._buf_offset < self.MIN_CHUNK_SIZE:
+                self._refill()
+            try:
+                return self._buf[self._buf_offset:]
+            finally:
+                self._buf_offset = len(self._buf)
+
+        def rewind(self, n):
+            if n < 0 or n > self._buf_offset:
+                raise IndexError
+            self._buf_offset -= n
+
+        def tell(self):
+            return self._offset - len(self._buf) + self._buf_offset
+
+    class GzipMemberReader(object):
+        def __init__(self, parent, member_offset):
+            self._parent = parent
+            self.eof = False
+            self.member_offset = member_offset
+
+        def _read_chunk(self, size=-1, delim=None):
+            if self.eof:
+                return b''
+
+            res = self._parent._decompress_until(size, delim)
+            if self._parent._new_member:
+                self.eof = True
+
+            return res
+
+        def readline(self, size=-1):
+            return self._read_chunk(size, b'\n')
+
+        def read(self, size=-1):
+            return self._read_chunk(size)
+
+        def __iter__(self):
+            return iter(self.readline, b'')
+
+        def close(self):
+            self.eof = True
+
+    def __init__(self, fileobj):
+        self._cbuf = self.InputBuffer(fileobj)
+        self._decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
+        self._dbuf = b''
+        self._new_member = True
+        self._cbuf_new_member = True
+        self._member_offset = 0
+
+    def __iter__(self):
+        return self
+
+    def _decompress_until(self, size=-1, delim=None):
+        """Decompresses within until delim is found, size is reached, or the
+        end of the member. After the end of the member is reached, subsequent
+        calls return b'' (until the next call to self.__next__())."""
+        if self._new_member:
+            return b''
+        while True:
+            end = None
+            if delim is not None:
+                delim_offset = self._dbuf.find(delim, 0, size)
+                if delim_offset >= 0:
+                    end = delim_offset + len(delim)
+            if end is None and size >= 0 and size < len(self._dbuf):
+                end = size
+            if end is None and self._cbuf_new_member:
+                end = len(self._dbuf)
+
+            if end == len(self._dbuf) and self._cbuf_new_member:
+                self._new_member = True
+
+            if end is not None:
+                res = self._dbuf[:end]
+                self._dbuf = self._dbuf[end:]
+                return res
+
+            self._dbuf += self._decompressor.decompress(self._cbuf.next_chunk())
+            if self._decompressor.unused_data != b'':
+                self._cbuf.rewind(len(self._decompressor.unused_data))
+                self._skip_eof()
+                self._cbuf_new_member = True
+                self._member_offset = self._cbuf.tell()
+                self._decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
+
+    def __next__(self):
+        while not self._new_member:
+            self._decompress_until(8192)
+
+        if self._cbuf.next_bytes(1) != b'':
+            self._cbuf.rewind(1)
+            res = self.GzipMemberReader(self, self._member_offset)
+            self._skip_gzip_header()
+            self._cbuf_new_member = False
+            self._new_member = False
+            return res
+        else:
+            raise StopIteration
+
+    # python2
+    def next(self):
+        return self.__next__()
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, type, value, traceback):
+        pass
+
+    def _skip_gzip_header(self):
+        magic = self._cbuf.next_bytes(2)
+
+        if magic != b'\037\213':
+            raise OSError('Not a gzipped file (%r)' % magic)
+
+        (method, flag, self._last_mtime) = struct.unpack(
+                "<BBIxx", self._cbuf.next_bytes(8))
+        if method != 8:
+            raise OSError('Unknown compression method')
+
+        if flag & gzip.FEXTRA:
+            # Read & discard the extra field, if present
+            extra_len, = struct.unpack("<H", self._cbuf.next_bytes(2))
+            self._cbuf.next_bytes(extra_len)
+
+        if flag & gzip.FNAME:
+            # Read and discard a null-terminated string containing the filename
+            while True:
+                s = self._cbuf.next_bytes(1)
+                if not s or s == b'\000':
+                    break
+
+        if flag & gzip.FCOMMENT:
+            # Read and discard a null-terminated string containing a comment
+            while True:
+                s = self._cbuf.next_bytes(1)
+                if not s or s==b'\000':
+                    break
+
+        if flag & gzip.FHCRC:
+            s = self._cbuf.next_bytes(2)
+
+        return True
+
+    def _skip_eof(self):
+        crc32, isize = struct.unpack("<II", self._cbuf.next_bytes(8))
+        # if crc32 != self._crc:
+        #     raise OSError("CRC check failed %s != %s" % (hex(crc32),
+        #                                                  hex(self._crc)))
+        # elif isize != (self._stream_size & 0xffffffff):
+        #     raise OSError("Incorrect length of data produced")
+
+        while True:
+            bite = self._cbuf.next_bytes(1)
+            if bite != b'\0':
+                self._cbuf.rewind(1)
+                break
+
diff --git a/hanzo/warctools/stream.py b/hanzo/warctools/stream.py
index 5b71eff..b1c959c 100644
--- a/hanzo/warctools/stream.py
+++ b/hanzo/warctools/stream.py
@@ -4,6 +4,7 @@
 import re
 
 from hanzo.warctools.archive_detect import is_gzip_file, guess_record_type
+from hanzo.warctools.gz import MultiMemberGzipReader
 
 def open_record_stream(record_class=None, filename=None, file_handle=None,
                        mode="rb", gzip="auto", offset=None, length=None):
@@ -134,7 +135,7 @@ def _read(self, count=None):
         else:
             result = self.fh.read()
 
-        if self.bytes_to_eoc is not None:
+        if result and self.bytes_to_eoc is not None:
             self.bytes_to_eoc -= len(result)
 
         return result
@@ -188,59 +189,25 @@ def readline(self, maxlen=None):
 
 CHUNK_SIZE = 8192 # the size to read in, make this bigger things go faster.
 
-class GeeZipFile(gzip.GzipFile):
-    """Extends gzip.GzipFile to remember self.member_offset, the raw file
-    offset of the current gzip member."""
-
-    def __init__(self, filename=None, mode=None,
-                 compresslevel=9, fileobj=None, mtime=None):
-        # ignore mtime for python 2.6
-        gzip.GzipFile.__init__(self, filename=filename, mode=mode, compresslevel=compresslevel, fileobj=fileobj)
-        self.member_offset = None
-
-    # hook in to the place we seem to be able to reliably get the raw gzip
-    # member offset
-    def _read(self, size=1024):
-        if self._new_member:
-            try:
-                # works for python3.2
-                self.member_offset = self.fileobj.tell() - self.fileobj._length + (self.fileobj._read or 0)
-            except AttributeError:
-                # works for python2.7
-                self.member_offset = self.fileobj.tell()
-
-        return gzip.GzipFile._read(self, size)
-
 class GzipRecordStream(RecordStream):
     """A stream to read/write concatted file made up of gzipped
     archive records"""
     def __init__(self, file_handle, record_parser):
-        RecordStream.__init__(self, GeeZipFile(fileobj=file_handle), record_parser)
         self.raw_fh = file_handle
+        self.multi_member_gzip_reader = MultiMemberGzipReader(self.raw_fh)
+        RecordStream.__init__(self, None, record_parser)
 
     def _read_record(self, offsets):
-        if self.bytes_to_eoc is not None:
-            self._skip_to_eoc()  # skip to end of previous record
         self.bytes_to_eoc = None
-
-        # handle any sort of valid or invalid record terminator
-        while True:
-            line = self.fh.readline()
-            if not re.match(br'^[\r\n]+$', line):
-                break
-
-        record, errors, _offset = \
-            self.record_parser.parse(self, offset=None, line=line)
-
+        self.fh = self.multi_member_gzip_reader.next()
+        record, errors, _ = self.record_parser.parse(stream=self, offset=None)
         offset = self.fh.member_offset
-
         return offset, record, errors
 
     def seek(self, offset, pos=0):
         """Same as a seek on a file"""
         self.raw_fh.seek(offset, pos)
-        # trick to avoid closing and recreating GzipFile, does it always work?
-        self.fh._new_member = True
+        self.fh = MultiMemberGzipReader(raw_fh)
 
 class GzipFileStream(RecordStream):
     """A stream to read/write gzipped file made up of all archive records"""
diff --git a/hanzo/warctools/tests/test_warctools.py b/hanzo/warctools/tests/test_warctools.py
index 4576da5..ab05fdf 100644
--- a/hanzo/warctools/tests/test_warctools.py
+++ b/hanzo/warctools/tests/test_warctools.py
@@ -12,6 +12,7 @@
 import tempfile
 import gzip
 from hanzo import warctools, httptools
+from hanzo.warctools.gz import MultiMemberGzipReader
 
 try:
     from io import BytesIO
@@ -50,7 +51,7 @@ def _test_terminator(self, terminator):
             self._run_checks(fin, terminator, False)
         finally:
             fin.close()
-        
+
         fin = self._arc_gz(terminator)
         try:
             self._run_checks(fin, terminator, True)
@@ -98,7 +99,7 @@ def _run_checks(self, fin, terminator, gzipped):
     def runTest(self):
         # anything works as long as it contains only \r and \n and ends with \n
         self._test_terminator(b'\n') # the good one
-        self._test_terminator(b'\r\n\r\n') 
+        self._test_terminator(b'\r\n\r\n')
         self._test_terminator(b'\r\n')
         self._test_terminator(b'\n\r\n')
         self._test_terminator(b'\n\n\r\n')
@@ -153,7 +154,7 @@ def _test_terminator(self, terminator):
             self._run_checks(fin, terminator, False)
         finally:
             fin.close()
-        
+
         fin = self._warc_gz(terminator)
         try:
             self._run_checks(fin, terminator, True)
@@ -360,6 +361,239 @@ def test_write_using_stream_gz(self):
             record.write_to(f, gzip=True)
         f.close()
 
+class MultiMemberGzipTest(unittest.TestCase):
+    GZ_0BYTE = b'\x1f\x8b\x08\x00%\xb6\xf1V\x00\x03\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00'      # b''
+    GZ_1BYTE = b'\x1f\x8b\x08\x00%\xb6\xf1V\x00\x03K\x04\x00C\xbe\xb7\xe8\x01\x00\x00\x00'        # b'a'
+    GZ_2BYTE = b'\x1f\x8b\x08\x00%\xb6\xf1V\x00\x03KL\x02\x00mH\x83\x9e\x02\x00\x00\x00'          # b'ab'
+    GZ_3BYTE = b'\x1f\x8b\x08\x00%\xb6\xf1V\x00\x03KLJ\x06\x00\xc2A$5\x03\x00\x00\x00'            # b'abc
+    GZ_4BYTE = b'\x1f\x8b\x08\x00%\xb6\xf1V\x00\x03KLJN\x01\x00\x11\xcd\x82\xed\x04\x00\x00\x00'  # b'abcd'
+    GZ_5BYTE = b'\x1f\x8b\x08\x00%\xb6\xf1V\x00\x03KLJNI\x05\x00e\xd8\x87\x85\x05\x00\x00\x00'    # b'abcde'
+    GZ_6BYTE = b'\x1f\x8b\x08\x00%\xb6\xf1V\x00\x03KLJNIM\x03\x00\xef9\x8eK\x06\x00\x00\x00'      # b'abcdef'
+    GZ_7BYTE = b'\x1f\x8b\x08\x00%\xb6\xf1V\x00\x03KLJNIMK\x07\x00\xa6j*1\x07\x00\x00\x00'        # b'abcdefg'
+    GZ_8BYTE = b'\x1f\x8b\x08\x00%\xb6\xf1V\x00\x03KLJNIMK\xcf\x00\x00P*\xef\xae\x08\x00\x00\x00' # b'abcdefgh'
+    def test_small_members(self):
+        multimember = (self.GZ_0BYTE + self.GZ_1BYTE + self.GZ_2BYTE + self.GZ_3BYTE + self.GZ_4BYTE
+                       + self.GZ_5BYTE + self.GZ_6BYTE + self.GZ_7BYTE + self.GZ_8BYTE)
+        f = BytesIO(multimember)
+
+        g = MultiMemberGzipReader(f)
+
+        # empty gzip member
+        m = g.next()
+        self.assertTrue(isinstance(
+            m, MultiMemberGzipReader.GzipMemberReader))
+        self.assertEquals(m.readline(), b'')
+        self.assertEquals(m.readline(), b'')
+
+        m = g.next()
+        self.assertTrue(isinstance(
+            m, MultiMemberGzipReader.GzipMemberReader))
+        self.assertEquals(m.readline(), b'a')
+        self.assertEquals(m.readline(), b'')
+        self.assertEquals(m.readline(), b'')
+
+        m = g.next()
+        self.assertTrue(isinstance(
+            m, MultiMemberGzipReader.GzipMemberReader))
+        self.assertEquals(m.readline(), b'ab')
+        self.assertEquals(m.readline(), b'')
+        self.assertEquals(m.readline(), b'')
+
+        m = g.next()
+        self.assertTrue(isinstance(
+            m, MultiMemberGzipReader.GzipMemberReader))
+        self.assertEquals(m.readline(), b'abc')
+        self.assertEquals(m.readline(), b'')
+        self.assertEquals(m.readline(), b'')
+
+        m = g.next()
+        self.assertTrue(isinstance(
+            m, MultiMemberGzipReader.GzipMemberReader))
+        self.assertEquals(m.readline(), b'abcd')
+        self.assertEquals(m.readline(), b'')
+        self.assertEquals(m.readline(), b'')
+
+        m = g.next()
+        self.assertTrue(isinstance(
+            m, MultiMemberGzipReader.GzipMemberReader))
+        self.assertEquals(m.readline(), b'abcde')
+        self.assertEquals(m.readline(), b'')
+        self.assertEquals(m.readline(), b'')
+
+        m = g.next()
+        self.assertTrue(isinstance(
+            m, MultiMemberGzipReader.GzipMemberReader))
+        self.assertEquals(m.readline(), b'abcdef')
+        self.assertEquals(m.readline(), b'')
+        self.assertEquals(m.readline(), b'')
+
+        m = g.next()
+        self.assertTrue(isinstance(
+            m, MultiMemberGzipReader.GzipMemberReader))
+        self.assertEquals(m.readline(), b'abcdefg')
+        self.assertEquals(m.readline(), b'')
+        self.assertEquals(m.readline(), b'')
+
+        m = g.next()
+        self.assertTrue(isinstance(
+            m, MultiMemberGzipReader.GzipMemberReader))
+        self.assertEquals(m.readline(), b'abcdefgh')
+        self.assertEquals(m.readline(), b'')
+        self.assertEquals(m.readline(), b'')
+
+        self.assertRaises(StopIteration, g.next)
+        self.assertRaises(StopIteration, g.next)
+        self.assertRaises(StopIteration, g.next)
+
+    def test_one_member_long_lines(self):
+        # all the same length
+        line_length = 100000
+        with BytesIO() as f:
+            with gzip.GzipFile(fileobj=f, mode='wb') as g:
+                g.write(b'm' * line_length + b'\n')
+                g.write(b'm' * line_length + b'\n')
+                g.write(b'm' * line_length + b'\n')
+                g.write(b'm' * line_length + b'\n')
+                g.write(b'm' * line_length + b'\n')
+                g.write(b'm' * line_length + b'\n')
+            gz = f.getvalue()
+
+        member_count = 0
+        with BytesIO(gz) as f:
+            with MultiMemberGzipReader(f) as g:
+                for member in g:
+                    member_count += 1
+                    self.assertEquals(member.member_offset, 0)
+                    self.assertEquals(member.readline(),
+                                      b'm' * line_length + b'\n')
+                    self.assertEquals(member.readline(),
+                                      b'm' * line_length + b'\n')
+                    self.assertEquals(member.readline(),
+                                      b'm' * line_length + b'\n')
+                    self.assertEquals(member.readline(),
+                                      b'm' * line_length + b'\n')
+                    self.assertEquals(member.readline(),
+                                      b'm' * line_length + b'\n')
+                    self.assertEquals(member.readline(),
+                                      b'm' * line_length + b'\n')
+                    self.assertEquals(member.readline(), b'')
+                    self.assertEquals(member.readline(), b'')
+                    self.assertEquals(member.readline(), b'')
+
+        self.assertEqual(member_count, 1)
+
+        # increasing lengths
+        line_length = 11
+        with BytesIO() as f:
+            with gzip.GzipFile(fileobj=f, mode='wb') as g:
+                while line_length < 500000:
+                    g.write(b'z' * line_length + b'\n')
+                    line_length *= 2
+            gz = f.getvalue()
+
+        f0 = BytesIO(gz)
+        g0 = gzip.GzipFile(fileobj=f0)
+
+        f1 = BytesIO(gz)
+        g1 = MultiMemberGzipReader(f1)
+        m1 = g1.next()
+
+        i = 0
+        while True:
+            l0 = g0.readline()
+            l1 = m1.readline()
+            self.assertEqual(l0, l1)
+            if not l0:
+                break
+
+        self.assertRaises(StopIteration, g1.next)
+
+    def test_large_members(self):
+        multimember = b''
+        line_length = 11
+        member_offsets = []
+        for n_lines in range(1, 6):
+            with BytesIO() as f:
+                with gzip.GzipFile(fileobj=f, mode='wb') as g:
+                    for n in range(n_lines):
+                        member_offsets.append(len(multimember))
+                        g.write(b'x' * line_length + b'\n')
+                        line_length *= 2
+                multimember += f.getvalue()
+        with BytesIO(multimember) as f:
+            with MultiMemberGzipReader(f) as g:
+                line_length = 11
+                member_count = 0
+                for member in g:
+                    line_count = 0
+                    member_count += 1
+                    for line in member:
+                        self.assertEquals(len(line), line_length + 1)
+                        line_length *= 2
+                        line_count += 1
+                    self.assertEquals(line_count, member_count)
+
+    def test_readline_with_size(self):
+        with BytesIO() as f:
+            with gzip.GzipFile(fileobj=f, mode='wb') as g:
+                g.write(b'x' * 80 + b'\n')
+                g.write(b'x' * 80 + b'\n')
+                g.write(b'x' * 80 + b'\n')
+            three_line_gz = f.getvalue()
+
+        with BytesIO(three_line_gz) as f:
+            with MultiMemberGzipReader(f) as g:
+                for m in g:
+                    self.assertEquals(m.readline(size=40), b'x' * 40)
+                    self.assertEquals(m.readline(size=80), b'x' * 40 + b'\n')
+                    self.assertEquals(m.readline(size=120), b'x' * 80 + b'\n')
+                    self.assertEquals(m.readline(size=120), b'x' * 80 + b'\n')
+                    self.assertEquals(m.readline(size=80), b'')
+
+    ### TODO ###
+    # def test_skip_ahead(self):
+    #     # XXX test skipping ahead to next member after reading
+    #     #  - no data
+    #     #  - some data
+    #     pass
+    # def test_read(self):
+    #     # XXX test read() with and without size
+    #     pass
+
+    # def test_unusual_gzip(self):
+    #     # XXX
+    #     # test with trailing null bytes
+    #     # test with various flags in header
+    #     # test with crazy long FNAME, FCOMMENT
+    #     pass
+
+    # def test_invalid_gzip(self):
+    #     # XXX
+    #     # test crc mismatch
+    #     # test end of stream in middle of
+    #     #  - magic
+    #     #  - various parts of gzip header
+    #     #  - data
+    #     #  - gzip footer
+    #     # test invalid
+    #     #  - magic
+    #     #  - gzip header
+    #     #  - data
+    #     #  - gzip footer
+    #     pass
+
+    # def test_bad_behavior(self):
+    #     # XXX
+    #     # test at different stages of decompression:
+    #     #  - reading from wrapped fileobj
+    #     #  - seeking in wrapped fileobj
+    #     #  - closing wrapped fileobj
+    #     pass
+
+    # def test_delim(self):
+    #     # XXX test GzipMemberReader._read_chunk() with multibyte delimiter
+    #     # with size parameter cutting off read in middle of multibyte delimiter
+    #     pass
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/setup.py b/setup.py
index 1bcd252..35a0b18 100755
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 from setuptools import setup
 
 setup(name='warctools',
-    version="4.8.3",
+    version="5.0.dev1",
     license="MIT License",
     description='Command line tools and libraries for handling and manipulating WARC files (and HTTP contents)',
     author='Thomas Figg',

From 7803c2778174f5a783b87a010669f523354cb201 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Mon, 28 Mar 2016 23:14:30 +0000
Subject: [PATCH 04/13] oops travis has no such version of python "dev"

---
 .travis.yml | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index f59ffda..83f75e5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,12 +1,13 @@
 language: python
 python:
-    - "2.7"
-    - "3.2"
-    - "3.3"
-    - "3.4"
-    - "3.5"
-    - "dev"
-    - "nightly"
-    - "pypy"
+    - 2.7
+    - 3.2
+    - 3.3
+    - 3.4
+    - 3.5
+    - 3.5-dev
+    - nightly
+    - pypy
+    - pypy3
 script: python setup.py test
 

From 6cc3575961d3fae84acc2fc0ad078b86f8b35094 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Tue, 29 Mar 2016 06:39:11 +0000
Subject: [PATCH 05/13] handle arc record header corner cases (copied raj code
 from
 https://github.com/internetarchive/warctools/blob/cdx-writer/hanzo/warctools/arc.py)

---
 hanzo/warctools/arc.py | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py
index 49465e3..d1b70de 100644
--- a/hanzo/warctools/arc.py
+++ b/hanzo/warctools/arc.py
@@ -164,21 +164,32 @@ def trim(self, stream):
         return ()
 
     def parse_header_list(self, line):
-        # some people use ' ' as the empty value. lovely.
-        line = line.rstrip(b'\r\n')
-        values = SPLIT(line)
-        if len(self.headers) != len(values):
-            if self.headers[0] in (ArcRecord.URL, ArcRecord.CONTENT_TYPE):
-                # fencepost
-                values = [s[::-1] for s in reversed(SPLIT(line[::-1], len(self.headers)-1))]
-            else:
-                values = SPLIT(line, len(self.headers)-1)
-
-        if len(self.headers) != len(values):
-            raise Exception('missing headers %s %s'%(",".join(values), ",".join(self.headers)))
-
-        return list(zip(self.headers, values))
-
+        values = SPLIT(line.strip())
+        num_values = len(values)
+
+        #raj: some headers contain urls with unescaped spaces
+        if num_values > 5:
+            if re.match('^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]) and re.match('^\d{14}$', values[-3]) and re.match('^\d+$', values[-1]):
+                values = ['%20'.join(values[0:-4]), values[-4], values[-3], values[-2], values[-1]]
+                num_values = len(values)
+
+        if 4 == num_values:
+            #raj: alexa arc files don't always have content-type in header
+            return list(zip(self.short_headers, values))
+        elif 5 == num_values:
+            #normal case
+            #raj: some old alexa arcs have ip-address and date transposed in the header
+            if re.match('^\d{14}$', values[1]) and re.match('^(?:\d{1,3}\.){3}\d{1,3}$', values[2]):
+                values[1], values[2] = values[2], values[1]
+
+            return list(zip(self.headers, values))
+        elif 6 == num_values:
+            #raj: some old alexa arcs have "content-type; charset" in the header
+            v = values[0:4]+values[5:]
+            v[3] = v[3].rstrip(';')
+            return list(zip(self.headers, v))
+        else:
+            raise Exception('invalid number of header fields')
 
     @staticmethod
     def get_content_headers(headers):

From af79aa5b8934ad7cc7578cd259e90fbb672c48c8 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Tue, 29 Mar 2016 07:30:10 +0000
Subject: [PATCH 06/13] fix more corner cases, fix py3 bytes/str handling

---
 hanzo/warctools/arc.py | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py
index d1b70de..7e31f3e 100644
--- a/hanzo/warctools/arc.py
+++ b/hanzo/warctools/arc.py
@@ -119,6 +119,11 @@ def parse(self, stream, offset, line=None):
             self.version = arc_version.split()[0]
             self.headers = arc_names_line.strip().split()
 
+            # raj: some v1 ARC files are incorrectly sending a v2 header names line
+            if arc_names_line == 'URL IP-address Archive-date Content-type Result-code Checksum Location Offset Filepath Archive-length\n':
+                if arc_version == '1 0 InternetArchive' and 5 == len(line.split(' ')):
+                    self.headers = ['URL', 'IP-address', 'Archive-date', 'Content-type', 'Archive-length']
+
             # now we have read header field in record body
             # we can extract the headers from the current record,
             # and read the length field
@@ -164,13 +169,13 @@ def trim(self, stream):
         return ()
 
     def parse_header_list(self, line):
-        values = SPLIT(line.strip())
+        values = line.strip().split(b' ')
         num_values = len(values)
 
         #raj: some headers contain urls with unescaped spaces
         if num_values > 5:
-            if re.match('^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]) and re.match('^\d{14}$', values[-3]) and re.match('^\d+$', values[-1]):
-                values = ['%20'.join(values[0:-4]), values[-4], values[-3], values[-2], values[-1]]
+            if re.match(b'^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]) and re.match('^\d{14}$', values[-3]) and re.match('^\d+$', values[-1]):
+                values = [b'%20'.join(values[0:-4]), values[-4], values[-3], values[-2], values[-1]]
                 num_values = len(values)
 
         if 4 == num_values:
@@ -179,7 +184,7 @@ def parse_header_list(self, line):
         elif 5 == num_values:
             #normal case
             #raj: some old alexa arcs have ip-address and date transposed in the header
-            if re.match('^\d{14}$', values[1]) and re.match('^(?:\d{1,3}\.){3}\d{1,3}$', values[2]):
+            if re.match(b'^\d{14}$', values[1]) and re.match(b'^(?:\d{1,3}\.){3}\d{1,3}$', values[2]):
                 values[1], values[2] = values[2], values[1]
 
             return list(zip(self.headers, values))

From 13fc74dfda9968ee985115d4a5e3cda53940577b Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Wed, 30 Mar 2016 19:38:17 +0000
Subject: [PATCH 07/13] never snarf content as a side effect of looking at
 record.content_type

---
 hanzo/warctools/record.py | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

diff --git a/hanzo/warctools/record.py b/hanzo/warctools/record.py
index 9d9d094..b1f52cf 100644
--- a/hanzo/warctools/record.py
+++ b/hanzo/warctools/record.py
@@ -55,10 +55,6 @@ def error(self, *args):
     def type(self):
         return self.get_header(self.TYPE)
 
-    @property
-    def content_type(self):
-        return self.content[0]
-
     @property
     def content_file(self):
         """
@@ -103,15 +99,13 @@ def content(self):
 
     @property
     def content_type(self):
-        """If self.content tuple was supplied, or has already been snarfed, or
-        we don't have a Content-Type header, return self.content[0]. Otherwise, 
-        return the value of the Content-Type header."""
-        if self._content is None:
-            content_type = self.get_header(self.CONTENT_TYPE)
-            if content_type is not None:
-                return content_type
-
-        return self.content[0]
+        """If self.content tuple was supplied, or has already been snarfed,
+        return self.content[0]. Otherwise, return the value of the Content-Type
+        header."""
+        if self._content:
+            return self._content[0]
+        else:
+            return self.get_header(self.CONTENT_TYPE)
 
     @property
     def content_length(self):

From 9c346e0550581ac526b057381eea7ddfaac4f236 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Wed, 30 Mar 2016 19:38:27 +0000
Subject: [PATCH 08/13] improve error message

---
 hanzo/warctools/stream.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hanzo/warctools/stream.py b/hanzo/warctools/stream.py
index b1c959c..e9502de 100644
--- a/hanzo/warctools/stream.py
+++ b/hanzo/warctools/stream.py
@@ -25,7 +25,7 @@ def open_record_stream(record_class=None, filename=None, file_handle=None,
         record_class = guess_record_type(file_handle)
 
     if record_class == None:
-        raise Exception('Failed to guess compression')
+        raise Exception('Failed to guess record type')
 
     record_parser = record_class.make_parser()
 

From 2d3cbc5b0652b01fcf83b2ee56a97d202de4ec26 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Wed, 30 Mar 2016 19:39:43 +0000
Subject: [PATCH 09/13] remove unused SPLIT thing

---
 hanzo/warctools/arc.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/hanzo/warctools/arc.py b/hanzo/warctools/arc.py
index 7e31f3e..ec69b19 100644
--- a/hanzo/warctools/arc.py
+++ b/hanzo/warctools/arc.py
@@ -68,10 +68,6 @@ def rx(pat):
 nl_rx = rx('^\r\n|\r|\n$')
 length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') #pylint: disable-msg=E1101
 type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$')     #pylint: disable-msg=E1101
-#raj/noah: change the call to split below to only split on space (some arcs
-#have a \x0c formfeed character in the url)
-# SPLIT = re.compile(br'\b\s|\s\b').split
-SPLIT = re.compile(br'\b | \b').split
 
 class ArcParser(ArchiveParser):
     """A parser for arc archives."""

From fb49c7c3e394697eb99a8149bc54f098462c7a84 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Fri, 2 Sep 2016 12:32:46 -0700
Subject: [PATCH 10/13] change to RST (restructured text) and add travis-ci
 badge

---
 README     | 76 -----------------------------------------------
 README.rst | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 76 deletions(-)
 delete mode 100644 README
 create mode 100644 README.rst

diff --git a/README b/README
deleted file mode 100644
index c51c46e..0000000
--- a/README
+++ /dev/null
@@ -1,76 +0,0 @@
-dependencies
-	setuptools
-	unittest2
-	python 2.6
-
-hanzo warc tools:
-
-    warcvalid.py
-        returns 0 if the arguments are all valid arc/warc files
-        non zero on error 
-
-    warcdump.py - writes human readable summary of warcfiles:
-        usage: python warcdump.py foo.warc foo.warc.gz
-        autodetects input format when filenames are passed
-        i.e recordgzip vs plaintext, warc vs arc
-
-        assumes uncompressed warc on stdin if no args
-
-    warcfilter.py 
-        python warcfilter.py pattern file file file
-            searches all headers for regex pattern
-        use -i to invert search
-        use -U to constrain to url
-        use -T to constrain to record type
-        use -C to constrain to content-type
-            
-        autodetects and stdin like warcdump
-
-        prints out a warc format by default.
-
-    warc2warc.py:
-        python warc2warc <input files>
-
-        autodetects compression on file
-        args, assumes uncompressed stdin if none
-
-        use -Z to write compressed output
-
-        i.e warc2warc -Z input > input.gz
-
-        should ignore buggy records in input
-
-    arc2warc.py
-        creates a crappy warc file from arc files on input
-        a handful of headers are preserved
-        use -Z to write compressed output
-        i.e arc2warc -Z input.arc > input.warc.gz
-
-    warcindex.py
-        spits out an index like this:
-#WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length
-warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf <urn:uuid:fd1255a8-d07c-11df-b125-12313b0a18c6> application/http;msgtype=request 193
-warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf <urn:uuid:fd2614f8-d07c-11df-b125-12313b0a18c6> application/http;msgtype=response 3279474
-        not great, but a start
-                
-notes:
-
-    arc2warc uses the conversion rules from the earlier arc2warc.c
-    as a starter for converting the headers
-
-    I haven't profiled the code yet (and don't plan to until it falls over)
-
-    warcvalid barely skirts some of the iso standard:
-        missing things: strict whitespace, required headers check...
-	mime quoted printable header encoding
-	treating headers as utf8
-
-things left to do (in no order):
-    lots more testing.
-    supporting pre 1.0 warc files
-    add more documentation
-    support more commandline options for output and filenames
-    s3 urls
-
-
--- tef thomas.figg@hanzoarchives.com
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..a1797f6
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,87 @@
+.. image:: https://travis-ci.org/internetarchive/warctools.svg?branch=master
+    :target: https://travis-ci.org/internetarchive/warctools
+
+warctools
+=========
+
+dependencies
+
+	- setuptools
+	- unittest2
+	- python 2.7, 3.2+
+
+hanzo warc tools:
+
+warcvalid.py
+        returns 0 if the arguments are all valid arc/warc files
+        non zero on error 
+
+warcdump.py - writes human readable summary of warcfiles:
+        usage: ``python warcdump.py foo.warc foo.warc.gz``
+        
+        autodetects input format when filenames are passed,
+        i.e. recordgzip vs plaintext, warc vs arc
+
+        assumes uncompressed warc on stdin if no args
+
+warcfilter.py 
+        ``python warcfilter.py pattern file file file`` -- 
+        searches all headers for regex pattern
+        
+        - use -i to invert search
+        - use -U to constrain to url
+        - use -T to constrain to record type
+        - use -C to constrain to content-type
+            
+        autodetects and stdin like warcdump
+
+        prints out a warc format by default.
+
+warc2warc.py:
+        ``python warc2warc <input files>``
+
+        autodetects compression on file
+        args, assumes uncompressed stdin if none
+
+        use -Z to write compressed output
+
+        i.e. ``warc2warc -Z input > input.gz``
+
+        should ignore buggy records in input
+
+arc2warc.py
+        creates a crappy warc file from arc files on input
+        a handful of headers are preserved
+        use -Z to write compressed output,
+        i.e. ``arc2warc -Z input.arc > input.warc.gz``
+
+warcindex.py
+        spits out an index like this::
+        
+            #WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length
+            warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf <urn:uuid:fd1255a8-d07c-11df-b125-12313b0a18c6> application/http;msgtype=request 193
+            warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf <urn:uuid:fd2614f8-d07c-11df-b125-12313b0a18c6> application/http;msgtype=response 3279474
+            
+        not great, but a start
+                
+notes:
+    arc2warc uses the conversion rules from the earlier arc2warc.c
+    as a starter for converting the headers
+
+    I haven't profiled the code yet (and don't plan to until it falls over)
+
+    warcvalid barely skirts some of the iso standard:
+        missing things: strict whitespace, required headers check...
+	mime quoted printable header encoding
+	treating headers as utf8
+
+things left to do (in no order):
+
+    - lots more testing.
+    - supporting pre 1.0 warc files
+    - add more documentation
+    - support more commandline options for output and filenames
+    - s3 urls
+
+
+-- tef thomas.figg@hanzoarchives.com

From f0b8866de966888bd493e0681716e5f9bdfbe9d6 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Fri, 2 Sep 2016 12:33:59 -0700
Subject: [PATCH 11/13] unindent lists to make them look normal

---
 README.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.rst b/README.rst
index a1797f6..7740805 100644
--- a/README.rst
+++ b/README.rst
@@ -6,9 +6,9 @@ warctools
 
 dependencies
 
-	- setuptools
-	- unittest2
-	- python 2.7, 3.2+
+- setuptools
+- unittest2
+- python 2.7, 3.2+
 
 hanzo warc tools:
 
@@ -77,11 +77,11 @@ notes:
 
 things left to do (in no order):
 
-    - lots more testing.
-    - supporting pre 1.0 warc files
-    - add more documentation
-    - support more commandline options for output and filenames
-    - s3 urls
+- lots more testing.
+- supporting pre 1.0 warc files
+- add more documentation
+- support more commandline options for output and filenames
+- s3 urls
 
 
 -- tef thomas.figg@hanzoarchives.com

From 1e5b380fe23623ab7dc3421285eaffaf1055c335 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Fri, 2 Sep 2016 13:08:27 -0700
Subject: [PATCH 12/13] not using mercurial anymore

---
 .hgignore | 17 -----------------
 .hgtags   |  9 ---------
 2 files changed, 26 deletions(-)
 delete mode 100644 .hgignore
 delete mode 100644 .hgtags

diff --git a/.hgignore b/.hgignore
deleted file mode 100644
index c72a731..0000000
--- a/.hgignore
+++ /dev/null
@@ -1,17 +0,0 @@
-syntax: glob
-*.swp
-*.log
-*.pyc
-*.pyo
-*.warc
-*.gz
-login.txt
-.DS_Store
-build/*
-dist/*
-hanzo_warc_tools.egg-info/*
-*~
-*.orig
-debian/*
-*.deb
-test-reports/*
diff --git a/.hgtags b/.hgtags
deleted file mode 100644
index 26ce816..0000000
--- a/.hgtags
+++ /dev/null
@@ -1,9 +0,0 @@
-58d7d99406b04e7c36bfba1c91e2b06f558c22ee hanzo-4.0-rc0
-764a52f90a951a8c4acc9c9f60f5d8321662d418 hanzo-4.0-rc1
-94b65646332e5e86f3d274f66e38ce26cc30ccad hanzo-4.0
-092e8d0615ecc5ace8b067edbeacd5e3b12c9be0 hanzo-4.1-rc0
-8f64ab5556344065cd68e0cf8265af87e6b9d0cf hanzo-4.1-rc1
-8ceff9fcde584ec577048dbd9a13743d31dfc74f hanzo-4.1-rc2
-f54be58d0d8b3aa47b3f935a732a7b5752f0e92e hanzo-4.1-rc4
-0a1d728557b8d29b15b3796f83b6a9dc7f25abff build_success-2012-09-14T15-24-42.616660024
-741fe327f233f936cd65c6e2c415cd01f9fc9871 build_success-2012-09-14T16-25-56.483325901

From cf898415d1731f18f3d02ccff6afa0a63175fbd8 Mon Sep 17 00:00:00 2001
From: Noah Levitt <nlevitt@archive.org>
Date: Thu, 26 Jan 2017 09:55:11 -0800
Subject: [PATCH 13/13] fix bug spinning forever in case gzip input stream ends
 in the middle of the data section, and add tests for incomplete gzip input

---
 hanzo/warctools/gz.py                   |  7 ++++++-
 hanzo/warctools/tests/test_warctools.py | 25 +++++++++++++++++++------
 setup.py                                |  2 +-
 3 files changed, 26 insertions(+), 8 deletions(-)

diff --git a/hanzo/warctools/gz.py b/hanzo/warctools/gz.py
index c7db399..5a62a98 100644
--- a/hanzo/warctools/gz.py
+++ b/hanzo/warctools/gz.py
@@ -111,7 +111,12 @@ def _decompress_until(self, size=-1, delim=None):
                 self._dbuf = self._dbuf[end:]
                 return res
 
-            self._dbuf += self._decompressor.decompress(self._cbuf.next_chunk())
+            tmp_cbuf = self._cbuf.next_chunk()
+            if tmp_cbuf == b'':
+                raise EOFError(
+                        'Compressed file ended before the end-of-stream '
+                        'marker was reached')
+            self._dbuf += self._decompressor.decompress(tmp_cbuf)
             if self._decompressor.unused_data != b'':
                 self._cbuf.rewind(len(self._decompressor.unused_data))
                 self._skip_eof()
diff --git a/hanzo/warctools/tests/test_warctools.py b/hanzo/warctools/tests/test_warctools.py
index ab05fdf..05bfd35 100644
--- a/hanzo/warctools/tests/test_warctools.py
+++ b/hanzo/warctools/tests/test_warctools.py
@@ -570,17 +570,30 @@ def test_readline_with_size(self):
     # def test_invalid_gzip(self):
     #     # XXX
     #     # test crc mismatch
-    #     # test end of stream in middle of
-    #     #  - magic
-    #     #  - various parts of gzip header
-    #     #  - data
-    #     #  - gzip footer
     #     # test invalid
     #     #  - magic
     #     #  - gzip header
     #     #  - data
     #     #  - gzip footer
-    #     pass
+
+    # test end of stream in middle of
+    #  - magic
+    #  - various parts of gzip header
+    #  - data
+    #  - gzip footer
+    def test_incomplete_gzip(self):
+        for l in range(1, len(self.GZ_8BYTE)):
+            with BytesIO(self.GZ_8BYTE[:l]) as f:
+                with MultiMemberGzipReader(f) as g:
+                    with self.assertRaises(Exception):
+                        for m in g:
+                            buf = m.read()
+
+        # sanity check of the full gzip
+        with BytesIO(self.GZ_8BYTE) as f:
+            with MultiMemberGzipReader(f) as g:
+                for m in g:
+                    self.assertEqual(b'abcdefgh', m.read())
 
     # def test_bad_behavior(self):
     #     # XXX
diff --git a/setup.py b/setup.py
index 35a0b18..ecd4c1b 100755
--- a/setup.py
+++ b/setup.py
@@ -4,7 +4,7 @@
 from setuptools import setup
 
 setup(name='warctools',
-    version="5.0.dev1",
+    version="5.0.dev2",
     license="MIT License",
     description='Command line tools and libraries for handling and manipulating WARC files (and HTTP contents)',
     author='Thomas Figg',