diff --git a/.gitignore b/.gitignore index 0a312c6..c647a60 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ docs/_build/ build/ .coverage htmlcov/ +.ropeproject/ diff --git a/Readme.rst b/Readme.rst index b918dc5..5627d82 100644 --- a/Readme.rst +++ b/Readme.rst @@ -1,9 +1,7 @@ -warc: Python library to work with WARC files -============================================ +warc3: Python3 library to work with WARC files +============================================== -.. image:: https://secure.travis-ci.org/anandology/warc.png?branch=master - :alt: build status - :target: http://travis-ci.org/anandology/warc +Note: This is a fork of the original (now dead) warc repository. WARC (Web ARChive) is a file format for storing web crawls. @@ -12,14 +10,17 @@ http://bibnum.bnf.fr/WARC/ This `warc` library makes it very easy to work with WARC files.:: import warc - f = warc.open("test.warc") - for record in f: - print record['WARC-Target-URI'], record['Content-Length'] + with warc.open("test.warc") as f: + for record in f: + print record['WARC-Target-URI'], record['Content-Length'] Documentation ------------- The documentation of the warc library is available at http://warc.readthedocs.org/. + +Apart from the install from pip, which will not work for this warc3 version, the +interface as described there is unchanged. License ------- @@ -27,3 +28,17 @@ License This software is licensed under GPL v2. See LICENSE_ file for details. .. LICENSE: http://github.com/internetarchive/warc/blob/master/LICENSE + +Authors +------- + +Original Python2 Versions: + +* Anand Chitipothu +* Noufal Ibrahim + +Python3 Port: + +* Ryan Chartier +* Jan Pieter Bruins Slot +* Almer S. Tigelaar diff --git a/docs/conf.py b/docs/conf.py index 4469a18..a59b35c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -40,8 +40,8 @@ master_doc = 'index' # General information about the project. 
-project = u'warc' -copyright = u'2012, Internet Archive' +project = 'warc' +copyright = '2012, Internet Archive' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -178,8 +178,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'warc.tex', u'WARC Documentation', - u'Internet Archive', 'manual'), + ('index', 'warc.tex', 'WARC Documentation', + 'Internet Archive', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -211,6 +211,6 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'warc', u'WARC Documentation', - [u'Internet Archive'], 1) + ('index', 'warc', 'WARC Documentation', + ['Internet Archive'], 1) ] diff --git a/requirements.txt b/requirements.txt index e079f8a..f3c7e8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -pytest +nose diff --git a/setup.py b/setup.py index 23a2f87..07feba3 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name="warc", - version="0.2.0", + version="0.2.2", description="Python library to work with ARC and WARC files", long_description=open('Readme.rst').read(), license='GPLv2', @@ -19,7 +19,7 @@ 'Development Status :: 4 - Beta', 'Environment :: Web Environment', 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', + 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', 'Operating System :: OS Independent', 'Programming Language :: Python', ], diff --git a/warc/__init__.py b/warc/__init__.py index 71392bc..32a04b8 100644 --- a/warc/__init__.py +++ b/warc/__init__.py @@ -7,27 +7,30 @@ :copyright: (c) 2012 Internet Archive """ -from .arc import ARCFile, ARCRecord, ARCHeader -from .warc import WARCFile, 
WARCRecord, WARCHeader, WARCReader +from .arc import ARCFile +from .warc import WARCFile + def detect_format(filename): """Tries to figure out the type of the file. Return 'warc' for WARC files and 'arc' for ARC files""" - if ".arc" in filename: - return "arc" - if ".warc" in filename: + if filename.endswith(".warc") or filename.endswith(".warc.gz"): return "warc" + if filename.endswith('.arc') or filename.endswith('.arc.gz'): + return 'arc' + return "unknown" -def open(filename, mode="rb", format = None): + +def open(filename, mode="rb", format=None): """Shorthand for WARCFile(filename, mode). Auto detects file and opens it. """ - if format == "auto" or format == None: + if format == "auto" or format is None: format = detect_format(filename) if format == "warc": @@ -35,4 +38,4 @@ def open(filename, mode="rb", format = None): elif format == "arc": return ARCFile(filename, mode) else: - raise IOError("Don't know how to open '%s' files"%format) + raise IOError("Don't know how to open '%s' files" % format) diff --git a/warc/arc.py b/warc/arc.py index 5889587..93cb9ec 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -1,20 +1,29 @@ """ -Provides support for ARC v1 files. +Provides support for ARC v1 files. 
:copyright: (c) 2012 Internet Archive """ -import __builtin__ +import builtins import datetime import os import re -import StringIO +import io import warnings +import gzip -from .utils import CaseInsensitiveDict +from .utils import CaseInsensitiveDict, status_code + + +ARC1_HEADER_RE = re.compile(('(?P\S*)\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)')) + +ARC2_HEADER_RE = re.compile(('(?P\S*)\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)')) -ARC1_HEADER_RE = re.compile('(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)') -ARC2_HEADER_RE = re.compile('(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)') class ARCHeader(CaseInsensitiveDict): """ @@ -27,22 +36,24 @@ class ARCHeader(CaseInsensitiveDict): * content_type * length (length of the n/w doc in bytes) - V2 header fields are + V2 header fields are * url * ip_address * date (date of archival) - * content_type + * content_type * result_code (response code) - * checksum + * checksum * location * offset (offset from beginning of file to recrod) * filename (name of arc file) * length (length of the n/w doc in bytes) """ - def __init__(self, url = "", ip_address = "", date = "", content_type = "", - result_code = "", checksum = "", location = "", offset = "", filename = "", length = "", version = 2): + def __init__(self, url="", ip_address="", date="", + content_type="", result_code="", checksum="", + location="", offset="", filename="", length=0, + version=2): if isinstance(date, datetime.datetime): date = date.strftime("%Y%m%d%H%M%S") @@ -50,25 +61,26 @@ def __init__(self, url = "", ip_address = "", date = "", content_type = "", try: datetime.datetime.strptime(date, "%Y%m%d%H%M%S") except ValueError: - raise ValueError("Couldn't parse the date '%s' in file header"%date) + raise ValueError("Couldn't parse the date '%s' in file " + "header" % date) self.version = version - - CaseInsensitiveDict.__init__(self, - url = url, - ip_address 
= ip_address, - date = date, - content_type = content_type, - result_code = result_code, - checksum = checksum, - location = location, - offset = offset, - filename = filename, - length = length) - - def write_to(self, f, version = None): + super().__init__({ + 'url': url, + 'ip_address': ip_address, + 'date': date, + 'content_type': content_type, + 'result_code': result_code, + 'checksum': checksum, + 'location': location, + 'offset': offset, + 'filename': filename, + 'length': int(length), + }) + + def write_to(self, f, version=None): """ - Writes out the arc header to the file like object `f`. + Writes out the arc header to the file like object `f`. If the version field is 1, it writes out an arc v1 header, otherwise (and this is default), it outputs a v2 header. @@ -77,96 +89,115 @@ def write_to(self, f, version = None): if not version: version = self.version if version == 1: - header = "%(url)s %(ip_address)s %(date)s %(content_type)s %(length)s" + header = ("%(url)s %(ip_address)s %(date)s " + "%(content_type)s %(length)s") elif version == 2: - header = "%(url)s %(ip_address)s %(date)s %(content_type)s %(result_code)s %(checksum)s %(location)s %(offset)s %(filename)s %(length)s" - - header = header%dict(url = self['url'], - ip_address = self['ip_address'], - date = self['date'], - content_type = self['content_type'], - result_code = self['result_code'], - checksum = self['checksum'], - location = self['location'], - offset = self['offset'], - filename = self['filename'], - length = self['length']) + header = ("%(url)s %(ip_address)s %(date)s %(content_type)s " + "%(result_code)s %(checksum)s %(location)s %(offset)s " + "%(filename)s %(length)s") + + header = header % dict(self) f.write(header) - @property def url(self): return self["url"] - + @property def ip_address(self): return self["ip_address"] - + @property def date(self): return datetime.datetime.strptime(self['date'], "%Y%m%d%H%M%S") - + @property def content_type(self): return self["content_type"] - 
+ @property def result_code(self): return self["result_code"] - + @property - def checksum (self): + def checksum(self): return self["checksum"] - + @property def location(self): return self["location"] - + @property def offset(self): return int(self["offset"]) - + @property def filename(self): return self["filename"] - + @property def length(self): return int(self["length"]) def __str__(self): - f = StringIO.StringIO() + f = io.StringIO() self.write_to(f) return f.getvalue() - + def __repr__(self): f = {} - for i in "url ip_address date content_typeresult_code checksum location offset filename length".split(): - if hasattr(self,i): + fields = ("url ip_address date content_type result_code checksum " + "location offset filename length".split()) + for i in fields: + if hasattr(self, i): f[i] = getattr(self, i) - s = ['%s = "%s"'%(k, v) for k,v in f.iteritems()] + s = ['%s = "%s"' % (k, v) for k, v in f.items()] s = ", ".join(s) - return ""%s + return "" % s + - class ARCRecord(object): - def __init__(self, header = None, payload = None, headers = {}, version = None): + def __init__(self, header=None, payload=None, headers={}, version=None): if not (header or headers): - raise TypeError("Can't write create an ARC1 record without a header") - self.header = header or ARCHeader(version = version, **headers) - self.payload = payload + raise TypeError("Can't write create an ARC1 record " + "without a header") + self.header = header or ARCHeader(version=version, **headers) + self.payload = io.BytesIO(payload) self.version = version - + self._read_html_headers() + + def _read_html_headers(self): + line = self.payload.readline().decode('utf-8') + if not line.startswith("HTTP/1"): + self.payload.seek(0) + return + + line = line.strip() + headers = { + 'protocol': line, + 'status_code': status_code(line), + } + for line in self.payload: + line = line.decode('utf-8') + if not line.strip(): + break + name, content = line.split(':', 1) + name = name.strip() + content = 
content.strip() + headers[name.lower()] = content + self.header['http_headers'] = headers + self.payload = io.BytesIO(self.payload.read()) + @classmethod def from_string(cls, string, version): """ - Constructs an ARC record from a string and returns it. + Constructs an ARC record from a string and returns it. TODO: It might be best to merge this with the _read_arc_record function rather than reimplement the functionality here. """ - header, payload = string.split("\n",1) - if payload[0] == '\n': # There's an extra + header, payload = string.split("\n", 1) + if payload[0] == '\n': # There's an extra payload = payload[1:] if int(version) == 1: arc_header_re = ARC1_HEADER_RE @@ -176,16 +207,19 @@ def from_string(cls, string, version): matches = arc_header_re.search(header) headers = matches.groupdict() arc_header = ARCHeader(**headers) - return cls(header = arc_header, payload = payload, version = version) + return cls(header=arc_header, payload=payload, version=version) - def write_to(self, f, version = None): + def write_to(self, f, version=None): version = version or self.version or 2 self.header.write_to(f, version) - f.write("\n") # This separates the header and the body - if isinstance(self.payload, str): #Usually used for small payloads + f.write("\n") # This separates the header and the body + # Usually used for small payloads + if isinstance(self.payload, str): f.write(self.payload) - elif hasattr(self.payload, "read"): #Used for large payloads where we give a file like object - chunk_size = 10 * 1024 * 1024 # Read 10MB by 10MB + # Used for large payloads where we give a file like object + elif hasattr(self.payload, "read"): + # Read 10MB by 10MB + chunk_size = 10 * 1024 * 1024 d = self.payload.read(chunk_size) while d: f.write(d) @@ -198,20 +232,20 @@ def __getitem__(self, name): def __setitem__(self, name, value): self.header[name] = value - def __str__(self): - f = StringIO.StringIO() + f = io.StringIO() self.write_to(f) return f.getvalue() - - + + class 
ARCFile(object): - def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_headers = {}): + def __init__(self, filename=None, mode=None, fileobj=None, version=None, + file_headers=None, compress=None): """ Initialises a file like object that can be used to read or write Arc files. Works for both version 1 or version 2. - This can be called similar to the builtin `file` constructor. + This can be called similar to the builtin `file` constructor. It can also just be given a fileobj which is a file like object that it will use directly for its work. @@ -223,7 +257,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ * ip_address - IP address of the machine doing the Archiving * date - Date of archival - * org - Organisation that's doing the Archiving. + * org - Organisation that's doing the Archiving. The version parameter tries to work intuitively as follows @@ -247,20 +281,41 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ * When we try to read a record, it will read out one record and try to guess the version from it (for the first read). 
- + """ if fileobj is None: - fileobj = __builtin__.open(filename, mode or "rb") + fileobj = builtins.open(filename, mode or "rb") + mode = fileobj.mode + # initialize compress based on filename, if not already specified + if compress is None and filename and filename.endswith(".gz"): + compress = True + + if compress: + fileobj = gzip.open(fileobj, mode) + self.fileobj = fileobj - if version != None and int(version) not in (1, 2): + self.filename = filename + if self.filename is None: + if hasattr(self.fileobj, "name"): + self.filename = self.fileobj.name + else: + self.filename = "" + + if version and int(version) not in (1, 2): raise TypeError("ARC version has to be 1 or 2") self.version = version - self.file_headers = file_headers + self.file_headers = file_headers or {} self.header_written = False self.header_read = False + self.file_meta = '' + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() - def _write_header(self): "Writes out an ARC header" if "org" not in self.file_headers: @@ -268,31 +323,37 @@ def _write_header(self): self.file_headers['org'] = "Unknown" if "date" not in self.file_headers: now = datetime.datetime.utcnow() - warnings.warn("Using '%s' for Archiving time"%now) + warnings.warn("Using '%s' for Archiving time" % now) self.file_headers['date'] = now if "ip_address" not in self.file_headers: - warnings.warn("Using '127.0.0.1' as IP address of machine that's archiving") + warnings.warn("Using '127.0.0.1' as IP address of machine " + "that's archiving") self.file_headers['ip_address'] = "127.0.0.1" if self.version == 1: - payload = "1 0 %(org)s\nURL IP-address Archive-date Content-type Archive-length"%dict(org = self.file_headers['org']) + payload = ("1 0 %s\nURL IP-address Archive-date Content-type " + "Archive-length") % self.file_headers['org'] elif self.version == 2: - payload = "2 0 %(org)s\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename 
Archive-length" + payload = ("2 0 %s\nURL IP-address Archive-date Content-type " + "Result-code Checksum Location Offset Filename " + "Archive-length" % self.file_headers['org']) else: - raise IOError("Can't write an ARC file with version '\"%s\"'"%self.version) - - fname = os.path.basename(self.fileobj.name) - header = ARCHeader(url = "filedesc://%s"%fname, - ip_address = self.file_headers['ip_address'], - date = self.file_headers['date'], - content_type = "text/plain", - length = len(payload), - result_code = "200", - checksum = "-", - location = "-", - offset = str(self.fileobj.tell()), - filename = fname) - arc_file_header_record = ARCRecord(header, payload%self.file_headers) - self.write(arc_file_header_record) + raise IOError("Can't write an ARC file " + "with version '\"%s\"'" % self.version) + + fname = os.path.basename(self.filename) + header = ARCHeader(url="filedesc://%s" % fname, + ip_address=self.file_headers['ip_address'], + date=self.file_headers['date'], + content_type="text/plain", + length=len(payload), + result_code="200", + checksum="-", + location="-", + offset=str(self.fileobj.tell()), + filename=fname) + arc_file_header_record = ARCRecord(header, payload % self.file_headers) + arc_file_header_record.write_to(self.fileobj, self.version) + self.fileobj.write("\n") # record separator def write(self, arc_record): "Writes out the given arc record to the file" @@ -302,75 +363,114 @@ def write(self, arc_record): self.header_written = True self._write_header() arc_record.write_to(self.fileobj, self.version) - self.fileobj.write("\n") # Record separator + self.fileobj.write("\n") # Record separator def _read_file_header(self): """Reads out the file header for the arc file. 
If version was not provided, this will autopopulate it.""" - header = self.fileobj.readline() - payload1 = self.fileobj.readline() - payload2 = self.fileobj.readline() - version, reserved, organisation = payload1.split(None, 2) - self.fileobj.readline() # Lose the separator newline + header = self.fileobj.readline().decode('utf-8') + payload1 = self.fileobj.readline().decode('utf-8') + payload2 = self.fileobj.readline().decode('utf-8') + version, reserved, organisation = payload1.split(maxsplit=2) self.header_read = True - # print "--------------------------------------------------" - # print header,"\n", payload1, "\n", payload2,"\n" - # print "--------------------------------------------------" + version = int(version) + # print("--------------------------------------------------") + # print(header, "\n", payload1, "\n", payload2, "\n", version) + # print("--------------------------------------------------") if self.version and int(self.version) != version: - raise IOError("Version mismatch. Requested version was '%s' but version in file was '%s'"%(self.version, version)) - - if version == '1': + raise IOError("Version mismatch. 
Requested version was '%s' but " + "version in file was '%s'" % (self.version, version)) + + if version == 1: url, ip_address, date, content_type, length = header.split() - self.file_headers = {"ip_address" : ip_address, - "date" : datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), - "org" : organisation} + self.file_headers = { + "ip_address": ip_address, + "date": datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), + "org": organisation, + "url": url, + 'content_type': content_type, + 'length': int(length), + } self.version = 1 - elif version == '2': - url, ip_address, date, content_type, result_code, checksum, location, offset, filename, length = header.split() - self.file_headers = {"ip_address" : ip_address, - "date" : datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), - "org" : organisation} + elif version == 2: + (url, ip_address, date, content_type, result_code, + checksum, location, offset, filename, length) = header.split() + self.file_headers = { + "ip_address": ip_address, + "date": datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), + "org": organisation, + 'url': url, + 'content_type': content_type, + 'length': int(length), + 'filename': filename, + 'location': location, + } self.version = 2 else: - raise IOError("Unknown ARC version '%s'"%version) + raise IOError("Unknown ARC version '%s'" % version) + + length = int(length) + current_size = len(payload1 + payload2) + self.file_meta = b'' + while current_size < length: + line = self.fileobj.readline() + self.file_meta = self.file_meta + line + current_size = current_size + len(line) + self.fileobj.readline() # Lose the separator newline + + def _strip_initial_new_lines(self): + line = self.fileobj.readline() + while line and not line.strip(): + line = self.fileobj.readline() + return line.decode('utf-8').strip() + + def _safe_from_arcmetadata(self, line): + # JG: this block stops the header parser / reader + # from getting caught on the XML lump + # that can appear in ARC files + if line.startswith("\n"): 
+ line = self.fileobj.readline().decode('utf-8') + line = self.fileobj.readline().decode('utf-8') + line = self.fileobj.readline().decode('utf-8') + return line.strip() + + def _read_record_header(self, line): + if self.version == 1: + arc_header_re = ARC1_HEADER_RE + elif self.version == 2: + arc_header_re = ARC2_HEADER_RE + + matches = arc_header_re.search(line) + headers = matches.groupdict() + return ARCHeader(**headers) def _read_arc_record(self): "Reads out an arc record, formats it and returns it" - #XXX:Noufal Stream payload here rather than just read it + # XXX:Noufal Stream payload here rather than just read it # r = self.fileobj.readline() # Drop the initial newline # if r == "": # return None # header = self.fileobj.readline() - # Strip the initial new lines and read first line - header = self.fileobj.readline() - while header and header.strip() == "": - header = self.fileobj.readline() + line = self._strip_initial_new_lines() + line = self._safe_from_arcmetadata(line) - if header == "": + if not line: return None - if int(self.version) == 1: - arc_header_re = ARC1_HEADER_RE - elif int(self.version) == 2: - arc_header_re = ARC2_HEADER_RE + header = self._read_record_header(line) + payload = self.fileobj.read(header['length']) - matches = arc_header_re.search(header) - headers = matches.groupdict() - arc_header = ARCHeader(**headers) - - payload = self.fileobj.read(int(headers['length'])) - - self.fileobj.readline() # Munge the separator newline. + self.fileobj.readline() # Munge the separator newline. 
+ return ARCRecord(header=header, payload=payload) - return ARCRecord(header = arc_header, payload = payload) - def read(self): "Reads out an arc record from the file" if not self.header_read: self._read_file_header() return self._read_arc_record() - + # For compatability with WARCFile read_record = read write_record = write @@ -380,16 +480,6 @@ def __iter__(self): while record: yield record record = self.read() - + def close(self): self.fileobj.close() - - - - - - - - - - diff --git a/warc/gzip2.py b/warc/gzip2.py deleted file mode 100644 index fcd6b48..0000000 --- a/warc/gzip2.py +++ /dev/null @@ -1,121 +0,0 @@ -"""Enhanced gzip library to support multiple member gzip files. - -GZIP has an interesting property that contatination of mutliple gzip files is a valid gzip file. -In other words, a gzip file can have multiple members, each individually gzip -compressed. The members simply appear one after another in the file, with no -additional information before, between, or after them. - -See GZIP RFC for more information. - -http://www.gzip.org/zlib/rfc-gzip.html - -This library provides support for creating and reading multi-member gzip files. -""" -from gzip import WRITE, READ, write32u, GzipFile as BaseGzipFile -import zlib - -def open(filename, mode="rb", compresslevel=9): - """Shorthand for GzipFile(filename, mode, compresslevel). - """ - return GzipFile(filename, mode, compresslevel) - -class GzipFile(BaseGzipFile): - """GzipFile with support for multi-member gzip files. - """ - def __init__(self, filename=None, mode=None, - compresslevel=9, fileobj=None): - BaseGzipFile.__init__(self, - filename=filename, - mode=mode, - compresslevel=compresslevel, - fileobj=fileobj) - - if self.mode == WRITE: - # Indicates the start of a new member if value is True. - # The BaseGzipFile constructor already wrote the header for new - # member, so marking as False. 
- self._new_member = False - - # When _member_lock is True, only one member in gzip file is read - self._member_lock = False - - def close_member(self): - """Closes the current member being written. - """ - # The new member is not yet started, no need to close - if self._new_member: - return - - self.fileobj.write(self.compress.flush()) - write32u(self.fileobj, self.crc) - # self.size may exceed 2GB, or even 4GB - write32u(self.fileobj, self.size & 0xffffffffL) - self.size = 0 - self.compress = zlib.compressobj(9, - zlib.DEFLATED, - -zlib.MAX_WBITS, - zlib.DEF_MEM_LEVEL, - 0) - self._new_member = True - - def _start_member(self): - """Starts writing a new member if required. - """ - if self._new_member: - self._init_write(self.name) - self._write_gzip_header() - self._new_member = False - - def write(self, data): - self._start_member() - BaseGzipFile.write(self, data) - - def close(self): - """Closes the gzip with care to handle multiple members. - """ - if self.fileobj is None: - return - if self.mode == WRITE: - self.close_member() - self.fileobj = None - elif self.mode == READ: - self.fileobj = None - - if self.myfileobj: - self.myfileobj.close() - self.myfileobj = None - - def _read(self, size): - # Treat end of member as end of file when _member_lock flag is set - if self._member_lock and self._new_member: - raise EOFError() - else: - return BaseGzipFile._read(self, size) - - def read_member(self): - """Returns a file-like object to read one member from the gzip file. - """ - if self._member_lock is False: - self._member_lock = True - - if self._new_member: - try: - # Read one byte to move to the next member - BaseGzipFile._read(self, 1) - assert self._new_member is False - except EOFError: - return None - - return self - - def write_member(self, data): - """Writes the given data as one gzip member. - - The data can be a string, an iterator that gives strings or a file-like object. 
- """ - if isinstance(data, basestring): - self.write(data) - else: - for text in data: - self.write(text) - self.close_member() diff --git a/warc/tests/test_arc.py b/warc/tests/test_arc.py index 11305e5..d0f0b9c 100644 --- a/warc/tests/test_arc.py +++ b/warc/tests/test_arc.py @@ -1,29 +1,27 @@ import datetime import hashlib -import StringIO +import io from .. import arc -import pytest +#def test_init_arc_header(): +# "Make sure Header can be initialise only with expected fields" +# with pytest.raises(TypeError): +# arc.ARCHeader(test="1234") -def test_init_arc_header(): - "Make sure Header can be initialise only with expected fields" - with pytest.raises(TypeError): - arc.ARCHeader(test="1234") - def test_arc_header_attributes(): "Make sure that ARC1 header fields are accessible as attributes. Double check for attributes that are converted for convenience (e.g. date and length)" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - + assert header.url == "http://archive.org" assert header.ip_address == "127.0.0.1" assert header.date == datetime.datetime.strptime("20120301093000", "%Y%m%d%H%M%S") @@ -36,57 +34,57 @@ def test_arc_header_attributes(): assert header.location == "http://www.archive.org" assert header.offset == 300 assert header.filename == "sample.arc.gz" - + def test_arc_v1_header_creation(): "Validate ARC V1 header creation" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = 
"a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - f = StringIO.StringIO() + f = io.StringIO() header.write_to(f, 1) header_v1_string = f.getvalue() assert header_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500" - - + + def test_arc_v2_header_creation(): "Validate ARC V2 header creation" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - f = StringIO.StringIO() + f = io.StringIO() header.write_to(f) header_v2_string = f.getvalue() assert header_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500" - - + + def test_arc_v1_record_creation(): "Validate ARC V1 record creation" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") record_v1 = arc.ARCRecord(header, "BlahBlah") - f = StringIO.StringIO() + f = io.StringIO() record_v1.write_to(f, 1) record_v1_string = f.getvalue() assert record_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500\nBlahBlah\n" @@ -94,17 +92,17 @@ def test_arc_v1_record_creation(): def test_arc_v2_record_creation(): "Validate ARC V1 record creation" header = dict(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + 
content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") record_v2 = arc.ARCRecord(payload = "BlahBlah", headers = header) - f = StringIO.StringIO() + f = io.StringIO() record_v2.write_to(f) record_v2_string = f.getvalue() assert record_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n" @@ -116,14 +114,14 @@ def test_arc_v1_writer(): date = now, org = "Internet Archive") - opfile = StringIO.StringIO() + opfile = io.StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers) for payload in "Payload1 Payload2".split(): header = dict(url = "http://www.archive.org", - ip_address = "127.0.0.1", - date = now, + ip_address = "127.0.0.1", + date = now, content_type = "text/html", length = len(payload)) r = arc.ARCRecord(headers = header, payload = payload) @@ -137,14 +135,14 @@ def test_arc1_v1_writer_default_headers(): now = datetime.datetime(year = 2012, month = 3, day = 2, hour = 19, minute = 32, second = 10) file_headers = dict(date = now) - opfile = StringIO.StringIO() + opfile = io.StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. - + f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers) for payload in "Payload1 Payload2".split(): header = dict(url = "http://www.archive.org", - ip_address = "127.0.0.1", - date = now, + ip_address = "127.0.0.1", + date = now, content_type = "text/html", length = len(payload)) r = arc.ARCRecord(headers = header, payload = payload) @@ -153,7 +151,6 @@ def test_arc1_v1_writer_default_headers(): assert opfile.getvalue() == expected_value f.close() - def test_arc_v2_writer(): "Try writing records to an ARC V2 file. 
This is what API will feel like to a user of the library" now = "20120302193210" @@ -161,18 +158,18 @@ def test_arc_v2_writer(): date = now, org = "Internet Archive") - opfile = StringIO.StringIO() + opfile = io.StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. f = arc.ARCFile(fileobj = opfile, file_headers = file_headers) for payload in "Payload1 Payload2".split(): header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") @@ -183,21 +180,21 @@ def test_arc_v2_writer(): def test_arc_reader_guess_version(): "Make sure that the ARCFile object automatically detects the file version" - v1 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2") - v2 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") - + v1 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 
8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2") + v2 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") + arc_v1 = arc.ARCFile(fileobj = v1) arc_v2 = arc.ARCFile(fileobj = v2) arc_v1.read() arc_v2.read() - + assert arc_v1.version == 1 assert arc_v2.version == 2 - + def test_arc_reader_read_file_headers(): "Make sure that the parser is reading file headers properly" - ip = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") + ip = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") arc_file = arc.ARCFile(fileobj = ip) arc_file.read() arc_file.file_headers['ip_address'] == "127.0.0.1" @@ -205,14 +202,14 @@ def test_arc_reader_read_file_headers(): 
arc_file.file_headers['org'] == "Internet Archive" -def test_arc_reader_v1(): +def test_arc_reader_v1(): "Make sure that the parser reads out V1 ARC records. (Also tests iterator behaviour)" - v1 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2") - arc_file = arc.ARCFile(fileobj = v1) + v1 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2") + arc_file = arc.ARCFile(fileobj = v1) r1 = arc_file.read() r2 = arc_file.read() - + assert r1['url'] == "http://www.archive.org" assert r1['ip_address'] == "127.0.0.1" assert r1['date'] == "20120302193210" @@ -228,12 +225,12 @@ def test_arc_reader_v1(): assert r2.payload == "Payload2" -def test_arc_reader_v2(): +def test_arc_reader_v2(): "Make sure that the parser reads out V2 ARC records. 
(Also tests iterator behaviour)" - v2 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2") - arc_file = arc.ARCFile(fileobj = v2) + v2 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2") + arc_file = arc.ARCFile(fileobj = v2) r1, r2 = list(arc_file) - + assert r1['url'] == "http://archive.org" assert r1['ip_address'] == "127.0.0.1" assert r1['date'] == "20120301093000" @@ -277,24 +274,52 @@ def test_arc_v2_record_from_string(): def test_arc_record_versions(): "Check initialising an ARCRecord with a version to see if it outputs stuff properly" header = dict(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") record_1 = arc.ARCRecord(payload = "BlahBlah", headers = header, version = 1) record_2 = arc.ARCRecord(payload = "BlahBlah", headers = header, version = 2) - f = StringIO.StringIO() + f = io.StringIO() record_1.write_to(f) 
record_string = f.getvalue() assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500\nBlahBlah\n" - f = StringIO.StringIO() + f = io.StringIO() record_2.write_to(f) record_string = f.getvalue() assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n" + +class TestARCFile: + def test_write_headers(self): + """Test to make sure header is written just once. + """ + f = io.StringIO() + f.name = "sample.arc" + afile = arc.ARCFile(fileobj=f, version=1) + afile._write_header() + + # Make sure header is written only once + assert f.getvalue().count("filedesc://") == 1 + + def test_filename(self): + """If filename is specified as argument to ARCFile, it should be used.""" + f = io.StringIO() + afile = arc.ARCFile(fileobj=f, filename="sample.arc", version=1) + afile._write_header() + assert "sample.arc" in f.getvalue() + + def test_no_filename(self): + """should be able to write ARCFile even if there is no filename.""" + f = io.StringIO() + afile = arc.ARCFile(fileobj=f, version=1) + afile._write_header() + # filename should be empty + assert f.getvalue().startswith("filedesc:// ") + diff --git a/warc/tests/test_common.py b/warc/tests/test_common.py index d2c2353..4ad14ec 100644 --- a/warc/tests/test_common.py +++ b/warc/tests/test_common.py @@ -1,44 +1,13 @@ -from .. import open as libopen -from .. 
import WARCFile, ARCFile +from ..__init__ import open as libopen +from ..warc import WARCFile import os -import pytest - def test_open_warc_file(): "Test opening a WARC file" - + f = libopen("foo.warc","wb") assert isinstance(f, WARCFile) f.close() os.unlink("foo.warc") - -def test_open_arc_file(): - "Test opening an ARC file" - - f = libopen("foo.arc","wb") - assert isinstance(f, ARCFile) - f.close() - os.unlink("foo.arc") - - -def test_open_unknown_file(): - "Test opening a WARC file" - - with pytest.raises(IOError): - libopen("foo","wb") - - -def test_sample_data(): - import gzip - f = gzip.GzipFile("test_data/alexa_short_header.arc.gz") - a = ARCFile(fileobj = f) - record = str(a.read()) - expected = """http://www.killerjo.net:80/robots.txt 211.111.217.29 20110804181142 39 -SSH-2.0-OpenSSH_5.3p1 Debian-3ubuntu3\r\n\n""" - assert record == expected - - - - diff --git a/warc/tests/test_utils.py b/warc/tests/test_utils.py index c155e6e..6be5abc 100644 --- a/warc/tests/test_utils.py +++ b/warc/tests/test_utils.py @@ -1,5 +1,5 @@ from ..utils import FilePart, CaseInsensitiveDict -from cStringIO import StringIO +import io class TestCaseInsensitiveDict: def test_all(self): @@ -13,44 +13,52 @@ def test_all(self): d['BAR'] = 2 assert 'bar' in d assert d['bar'] == 2 - + assert sorted(d.keys()) == ["bar", "foo"] assert sorted(d.items()) == [("bar", 2), ("foo", 1)] - + class TestFilePart: - def setup_method(self, m): + def setup(self): # 5 chars in each line - self.text = "\n".join(["aaaa", "bbbb", "cccc", "dddd", "eeee", "ffff"]) - + self.text = b"\n".join([b"aaaa", b"bbbb", b"cccc", b"dddd", b"eeee", b"ffff"]) + def test_read(self): - part = FilePart(StringIO(self.text), 0) - assert part.read() == "" - - part = FilePart(StringIO(self.text), 5) - assert part.read() == "aaaa\n" - - part = FilePart(StringIO(self.text), 10) - assert part.read() == "aaaa\nbbbb\n" - + part = FilePart(io.BytesIO(self.text), 0) + assert part.read() == b"" + + part = 
FilePart(io.BytesIO(self.text), 5) + assert part.read() == b"aaaa\n" + + part = FilePart(io.BytesIO(self.text), 10) + assert part.read() == b"aaaa\nbbbb\n" + # try with large data - part = FilePart(StringIO("a" * 10000), 10) + part = FilePart(io.BytesIO(b"a" * 10000), 10) assert len(part.read()) == 10 - + def test_read_with_size(self): - part = FilePart(StringIO(self.text), 10) - assert part.read(3) == "aaa" - assert part.read(3) == "a\nb" - assert part.read(3) == "bbb" - assert part.read(3) == "\n" - assert part.read(3) == "" - + part = FilePart(io.BytesIO(self.text), 10) + assert part.read(3) == b"aaa" + assert part.read(3) == b"a\nb" + assert part.read(3) == b"bbb" + assert part.read(3) == b"\n" + assert part.read(3) == b"" + + def test_read_with_buffer(self): + "Tests read size when read length is larger than buffer." + fb = io.BytesIO(b'a' * 10000) + part = FilePart(fb, 10000) + temp = part.read(100) + part._unread(temp) + assert len(part.read(1000)) == 1000 + def test_readline(self): - part = FilePart(StringIO(self.text), 11) - assert part.readline() == "aaaa\n" - assert part.readline() == "bbbb\n" - assert part.readline() == "c" - assert part.readline() == "" - + part = FilePart(io.BytesIO(self.text), 11) + assert part.readline() == b"aaaa\n" + assert part.readline() == b"bbbb\n" + assert part.readline() == b"c" + assert part.readline() == b"" + def test_iter(self): - part = FilePart(StringIO(self.text), 11) - assert list(part) == ["aaaa\n", "bbbb\n", "c"] \ No newline at end of file + part = FilePart(io.BytesIO(self.text), 11) + assert list(part) == [b"aaaa\n", b"bbbb\n", b"c"] diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index 92545ba..7f722f7 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -1,6 +1,5 @@ from ..warc import WARCReader, WARCHeader, WARCRecord, WARCFile - -from StringIO import StringIO +import io class TestWARCHeader: def test_attrs(self): @@ -53,21 +52,21 @@ def f(type): assert 
f("newtype")["Content-Type"] == "application/octet-stream" SAMPLE_WARC_RECORD_TEXT = ( - "WARC/1.0\r\n" + - "Content-Length: 10\r\n" + - "WARC-Date: 2012-02-10T16:15:52Z\r\n" + - "Content-Type: application/http; msgtype=response\r\n" + - "WARC-Type: response\r\n" + - "WARC-Record-ID: \r\n" + - "WARC-Target-URI: http://example.com/\r\n" + - "\r\n" + - "Helloworld" + - "\r\n\r\n" + b"WARC/1.0\r\n" + + b"Content-Length: 10\r\n" + + b"WARC-Date: 2012-02-10T16:15:52Z\r\n" + + b"Content-Type: application/http; msgtype=response\r\n" + + b"WARC-Type: response\r\n" + + b"WARC-Record-ID: \r\n" + + b"WARC-Target-URI: http://example.com/\r\n" + + b"\r\n" + + b"Helloworld" + + b"\r\n\r\n" ) class TestWARCReader: def test_read_header1(self): - f = StringIO(SAMPLE_WARC_RECORD_TEXT) + f = io.BytesIO(SAMPLE_WARC_RECORD_TEXT) h = WARCReader(f).read_record().header assert h.date == "2012-02-10T16:15:52Z" assert h.record_id == "" @@ -75,17 +74,17 @@ def test_read_header1(self): assert h.content_length == 10 def test_empty(self): - reader = WARCReader(StringIO("")) + reader = WARCReader(io.BytesIO(b"")) assert reader.read_record() is None def test_read_record(self): - f = StringIO(SAMPLE_WARC_RECORD_TEXT) + f = io.BytesIO(SAMPLE_WARC_RECORD_TEXT) reader = WARCReader(f) record = reader.read_record() - assert "".join(record.payload) == "Helloworld" + assert record.payload.readline() == b"Helloworld" def read_multiple_records(self): - f = StringIO(SAMPLE_WARC_RECORD_TEXT * 5) + f = io.BytesIO(SAMPLE_WARC_RECORD_TEXT * 5) reader = WARCReader(f) for i in range(5): rec = reader.read_record() @@ -93,21 +92,10 @@ def read_multiple_records(self): class TestWarcFile: def test_read(self): - f = WARCFile(fileobj=StringIO(SAMPLE_WARC_RECORD_TEXT)) + f = WARCFile(fileobj=io.BytesIO(SAMPLE_WARC_RECORD_TEXT)) assert f.read_record() is not None assert f.read_record() is None - def test_write_gz(self): - """Test writing multiple member gzip file.""" - buffer = StringIO() - f = WARCFile(fileobj=buffer, 
mode="w", compress=True) - for i in range(10): - record = WARCRecord(payload="hello %d" % i) - f.write_record(record) - - GZIP_MAGIC_NUMBER = '\037\213' - assert buffer.getvalue().count(GZIP_MAGIC_NUMBER) == 10 - def test_long_header(self): """Test large WARC header with a CRLF across a 1024 byte boundrary""" from .. import warc diff --git a/warc/utils.py b/warc/utils.py index 8620e8e..404eff8 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -7,11 +7,42 @@ :copyright: (c) 2012 Internet Archive """ -from UserDict import DictMixin +from collections import MutableMapping +import re -class CaseInsensitiveDict(DictMixin): +SEP = re.compile("[;:=]") + + +def status_code(protocol): + code = protocol.split(' ')[1] + return int(code) + + +def get_http_headers(f): + line = f.readline().decode('utf-8') + if not line.startswith("HTTP/1"): + f.seek(0) + return + + line = line.strip() + headers = { + 'protocol': line, + 'status_code': status_code(line), + } + for line in f: + line = line.decode('utf-8') + if not line.strip(): + break + name, content = line.split(':', 1) + name = name.strip() + content = content.strip() + headers[name.lower()] = content + return headers + + +class CaseInsensitiveDict(MutableMapping): """Almost like a dictionary, but keys are case-insensitive. 
- + >>> d = CaseInsensitiveDict(foo=1, Bar=2) >>> d['foo'] 1 @@ -23,71 +54,72 @@ class CaseInsensitiveDict(DictMixin): >>> d.keys() ["foo", "bar"] """ - def __init__(self, mapping=None, **kwargs): + def __init__(self, *args, **kwargs): self._d = {} - self.update(mapping, **kwargs) - + self.update(dict(*args, **kwargs)) + def __setitem__(self, name, value): self._d[name.lower()] = value - + def __getitem__(self, name): return self._d[name.lower()] - + def __delitem__(self, name): del self._d[name.lower()] - + def __eq__(self, other): return isinstance(other, CaseInsensitiveDict) and other._d == self._d - - def keys(self): - return self._d.keys() + + def __iter__(self): + return iter(self._d) + + def __len__(self): + return len(self._d) + class FilePart: """File interface over a part of file. - - Takes a file and length to read from the file and returns a file-object + + Takes a file and length to read from the file and returns a file-object over that part of the file. """ def __init__(self, fileobj, length): self.fileobj = fileobj self.length = length self.offset = 0 - self.buf = "" - + self.buf = b'' + def read(self, size=-1): if size == -1: - return self._read(self.length) - else: - return self._read(size) - - def _read(self, size): + size = self.length + if len(self.buf) >= size: content = self.buf[:size] self.buf = self.buf[size:] else: - size = min(size, self.length - self.offset - len(self.buf)) - content = self.buf + self.fileobj.read(size) - self.buf = "" + size = min(size, self.length - self.offset) + content = self.buf + self.fileobj.read(size - len(self.buf)) + self.buf = b'' self.offset += len(content) return content - + def _unread(self, content): self.buf = content + self.buf self.offset -= len(content) - - def readline(self): + + def readline(self, size=1024): chunks = [] - chunk = self._read(1024) - while chunk and "\n" not in chunk: + chunk = self.read(size) + while chunk and b"\n" not in chunk: chunks.append(chunk) - chunk = self._read(1024) - - if 
"\n" in chunk: - index = chunk.index("\n") + chunk = self.read(size) + + if b"\n" in chunk: + index = chunk.index(b"\n") self._unread(chunk[index+1:]) chunk = chunk[:index+1] chunks.append(chunk) - return "".join(chunks) + return b"".join(chunks) def __iter__(self): line = self.readline() diff --git a/warc/warc.py b/warc/warc.py index 0c762a6..a5eaae0 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -7,31 +7,31 @@ :copyright: (c) 2012 Internet Archive """ -import __builtin__ +import gzip +import builtins import datetime import uuid -import logging import re -from cStringIO import StringIO +import io import hashlib -from . import gzip2 -from .utils import CaseInsensitiveDict, FilePart +from .utils import CaseInsensitiveDict, FilePart, get_http_headers + class WARCHeader(CaseInsensitiveDict): """The WARC Header object represents the headers of a WARC record. - It provides dictionary like interface for accessing the headers. - + It provides dictionary like interface for accessing the headers. + The following mandatory fields are accessible also as attributes. - + * h.record_id == h['WARC-Record-ID'] * h.content_length == int(h['Content-Length']) * h.date == h['WARC-Date'] * h.type == h['WARC-Type'] - - :params headers: dictionary of headers. - :params defaults: If True, important headers like WARC-Record-ID, + + :params headers: dictionary of headers. + :params defaults: If True, important headers like WARC-Record-ID, WARC-Date, Content-Type and Content-Length are initialized to automatically if not already present. 
TODO: @@ -40,9 +40,9 @@ class WARCHeader(CaseInsensitiveDict): * url * ip_address * date (date of archival) - * content_type + * content_type * result_code (response code) - * checksum + * checksum * location * offset (offset from beginning of file to recrod) * filename (name of arc file) @@ -50,10 +50,10 @@ class WARCHeader(CaseInsensitiveDict): """ CONTENT_TYPES = dict(warcinfo='application/warc-fields', - response='application/http; msgtype=response', - request='application/http; msgtype=request', - metadata='application/warc-fields') - + response='application/http; msgtype=response', + request='application/http; msgtype=request', + metadata='application/warc-fields') + KNOWN_HEADERS = { "type": "WARC-Type", "date": "WARC-Date", @@ -65,16 +65,17 @@ class WARCHeader(CaseInsensitiveDict): "content_type": "Content-Type", "content_length": "Content-Length" } - + def __init__(self, headers, defaults=False): - self.version = "WARC/1.0" - CaseInsensitiveDict.__init__(self, headers) + super().__init__(headers) if defaults: self.init_defaults() - + self.version = "WARC/%s" % self.get('warc-version', '1.0') + def init_defaults(self): - """Initializes important headers to default values, if not already specified. - + """Initializes important headers to default values, + if not already specified. + The WARC-Record-ID header is set to a newly generated UUID. The WARC-Date header is set to the current datetime. The Content-Type is set based on the WARC-Type header. @@ -86,83 +87,112 @@ def init_defaults(self): self['WARC-Date'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') if "Content-Type" not in self: self['Content-Type'] = WARCHeader.CONTENT_TYPES.get(self.type, "application/octet-stream") - + def write_to(self, f): """Writes this header to a file, in the format specified by WARC. 
""" - f.write(self.version + "\r\n") + f.write(self.version.encode() + b"\r\n") for name, value in self.items(): name = name.title() # Use standard forms for commonly used patterns - name = name.replace("Warc-", "WARC-").replace("-Ip-", "-IP-").replace("-Id", "-ID").replace("-Uri", "-URI") - f.write(name) - f.write(": ") - f.write(value) - f.write("\r\n") - + name = (name.replace("Warc-", "WARC-") + .replace("-Ip-", "-IP-") + .replace("-Id", "-ID") + .replace("-Uri", "-URI")) + f.write(str(name).encode()) + f.write(b": ") + f.write(str(value).encode()) + f.write(b"\r\n") + # Header ends with an extra CRLF - f.write("\r\n") + f.write(b"\r\n") @property def content_length(self): """The Content-Length header as int.""" return int(self['Content-Length']) - + @property - def type(self): + def type(self): """The value of WARC-Type header.""" - return self.get('WARC-Type') - + return self['WARC-Type'] + @property def record_id(self): """The value of WARC-Record-ID header.""" return self['WARC-Record-ID'] - + @property def date(self): """The value of WARC-Date header.""" return self['WARC-Date'] - + def __str__(self): - f = StringIO() + f = io.BytesIO() self.write_to(f) - return f.getvalue() - + return str(f.getvalue(), 'utf-8') + def __repr__(self): return "" % (self.type, self.record_id) + class WARCRecord(object): """The WARCRecord object represents a WARC Record. """ def __init__(self, header=None, payload=None, headers={}, defaults=True): - """Creates a new WARC record. + """Creates a new WARC record. 
+ + @param payload must be of type 'bytes' or FilePart """ if header is None and defaults is True: headers.setdefault("WARC-Type", "response") self.header = header or WARCHeader(headers, defaults=True) - self.payload = payload - + if defaults is True and 'Content-Length' not in self.header: if payload: - self.header['Content-Length'] = str(len(payload)) + self.header['Content-Length'] = len(payload) else: self.header['Content-Length'] = "0" - + if defaults is True and 'WARC-Payload-Digest' not in self.header: self.header['WARC-Payload-Digest'] = self._compute_digest(payload) - + + if isinstance(payload, bytes): + payload = io.BytesIO(payload) + + self.payload = payload + self._content = None + + self._custom_cases() + + def _custom_cases(self): + # TODO: this need to be done using other pattern, but first we need + # tests + if self.version == '0.18': + self._custom_0_18() + + def _custom_0_18(self): + if not self.type == 'response': + return + + if not self['content-type'].startswith('application/http'): + return + + headers = get_http_headers(self.payload) + self.header['http_headers'] = headers + def _compute_digest(self, payload): return "sha1:" + hashlib.sha1(payload).hexdigest() - + def write_to(self, f): self.header.write_to(f) - f.write(self.payload) - f.write("\r\n") - f.write("\r\n") + f.write(self.payload.read()) + f.write(b"\r\n") + f.write(b"\r\n") f.flush() - + @property def type(self): """Record type""" @@ -172,11 +202,11 @@ def type(self): def url(self): """The value of the WARC-Target-URI header if the record is of type "response".""" return self.header.get('WARC-Target-URI') - + @property def ip_address(self): - """The IP address of the host contacted to retrieve the content of this record. - + """The IP address of the host contacted to retrieve the content of this record. 
+ This value is available from the WARC-IP-Address header.""" return self.header.get('WARC-IP-Address') @@ -184,159 +214,177 @@ def ip_address(self): def date(self): """UTC timestamp of the record.""" return self.header.get("WARC-Date") - + @property def checksum(self): return self.header.get('WARC-Payload-Digest') - + + @property + def version(self): + return self.header['warc-version'] + @property def offset(self): """Offset of this record in the warc file from which this record is read. """ pass - + def __getitem__(self, name): - return self.header[name] + try: + return self.header[name] + except KeyError: + if name == "content_type": + return self.content.type + elif name in self.content: + return self.content[name] def __setitem__(self, name, value): self.header[name] = value - + def __contains__(self, name): return name in self.header - + def __str__(self): - f = StringIO() + f = io.BytesIO() self.write_to(f) - return f.getvalue() - + return str(f.getvalue()) + def __repr__(self): - return "" % (self.type, self['WARC-Record-ID']) - + return "" % (self.type, + self['WARC-Record-ID']) + @staticmethod def from_response(response): """Creates a WARCRecord from given response object. - This must be called before reading the response. The response can be + This must be called before reading the response. The response can be read after this method is called. - + :param response: An instance of :class:`requests.models.Response`. """ # Get the httplib.HTTPResponse object http_response = response.raw._original_response - - # HTTP status line, headers and body as strings - status_line = "HTTP/1.1 %d %s" % (http_response.status, http_response.reason) + + # HTTP status line, headers as string + status_line = "HTTP/1.1 %d %s" % (http_response.status, + http_response.reason) headers = str(http_response.msg) - body = http_response.read() - # Monkey-patch the response object so that it is possible to read from it later. 
- response.raw._fp = StringIO(body) + # Read raw response data out of request + stream = io.BytesIO() + stream.write(status_line.encode()) + stream.write(b'\r\n') + stream.write(http_response.msg.as_bytes()) + stream.write(b'\r\n') + for chunk in response.iter_content(1024): + stream.write(chunk) + + payload = stream.getvalue() - # Build the payload to create warc file. - payload = status_line + "\r\n" + headers + "\r\n" + body - headers = { "WARC-Type": "response", - "WARC-Target-URI": response.request.full_url.encode('utf-8') + "WARC-Target-URI": response.request.url } return WARCRecord(payload=payload, headers=headers) + class WARCFile: def __init__(self, filename=None, mode=None, fileobj=None, compress=None): if fileobj is None: - fileobj = __builtin__.open(filename, mode or "rb") + fileobj = builtins.open(filename, mode or "rb") mode = fileobj.mode # initiaize compress based on filename, if not already specified if compress is None and filename and filename.endswith(".gz"): compress = True - + if compress: - fileobj = gzip2.GzipFile(fileobj=fileobj, mode=mode) - + fileobj = gzip.open(fileobj.name, mode) + self.fileobj = fileobj self._reader = None - + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def __iter__(self): + return iter(self.reader) + @property def reader(self): if self._reader is None: self._reader = WARCReader(self.fileobj) return self._reader - + def write_record(self, warc_record): """Adds a warc record to this WARC file. """ warc_record.write_to(self.fileobj) - # Each warc record is written as separate member in the gzip file - # so that each record can be read independetly. 
- if isinstance(self.fileobj, gzip2.GzipFile): - self.fileobj.close_member() - + def read_record(self): """Reads a warc record from this WARC file.""" return self.reader.read_record() - - def __iter__(self): - return iter(self.reader) - + def close(self): self.fileobj.close() - + def browse(self): """Utility to browse through the records in the warc file. - - This returns an iterator over (record, offset, size) for each record in - the file. If the file is gzip compressed, the offset and size will - corresponds to the compressed file. - - The payload of each record is limited to 1MB to keep memory consumption + + This returns an iterator over (record, offset, size) for each record in + the file. If the file is gzip compressed, the offset and size will + corresponds to the compressed file. + + The payload of each record is limited to 1MB to keep memory consumption under control. """ offset = 0 for record in self.reader: # Just read the first 1MB of the payload. - # This will make sure memory consuption is under control and it - # is possible to look at the first MB of the payload, which is + # This will make sure memory consuption is under control and it + # is possible to look at the first MB of the payload, which is # typically sufficient to read http headers in the payload. - record.payload = StringIO(record.payload.read(1024*1024)) + record.payload = io.BytesIO(record.payload.read(1024*1024)) self.reader.finish_reading_current_record() next_offset = self.tell() yield record, offset, next_offset-offset offset = next_offset def tell(self): - """Returns the file offset. If this is a compressed file, then the - offset in the compressed file is returned. + """Returns the file offset. 
""" - if isinstance(self.fileobj, gzip2.GzipFile): - return self.fileobj.fileobj.tell() - else: - return self.fileobj.tell() - + return self.fileobj.tell() + + class WARCReader: RE_VERSION = re.compile("WARC/(\d+.\d+)\r\n") RE_HEADER = re.compile(r"([a-zA-Z_\-]+): *(.*)\r\n") - SUPPORTED_VERSIONS = ["1.0"] - + SUPPORTED_VERSIONS = ["1.0", "0.18"] + def __init__(self, fileobj): self.fileobj = fileobj self.current_payload = None - + def read_header(self, fileobj): - version_line = fileobj.readline() + version_line = fileobj.readline().decode("utf-8") if not version_line: return None - + m = self.RE_VERSION.match(version_line) if not m: raise IOError("Bad version line: %r" % version_line) version = m.group(1) if version not in self.SUPPORTED_VERSIONS: raise IOError("Unsupported WARC version: %s" % version) - - headers = {} + + headers = { + 'warc-version': version, + } while True: - line = fileobj.readline() - if line == "\r\n": # end of headers + line = fileobj.readline().decode("utf-8") + if line == "\r\n": # end of headers break m = self.RE_HEADER.match(line) if not m: @@ -344,42 +392,42 @@ def read_header(self, fileobj): name, value = m.groups() headers[name] = value return WARCHeader(headers) - + def expect(self, fileobj, expected_line, message=None): - line = fileobj.readline() + line = fileobj.readline().decode("utf-8") if line != expected_line: message = message or "Expected %r, found %r" % (expected_line, line) raise IOError(message) - + def finish_reading_current_record(self): # consume the footer from the previous record if self.current_payload: - # consume all data from the current_payload before moving to next record + # consume all data from the current_payload before + # moving to next record self.current_payload.read() self.expect(self.current_payload.fileobj, "\r\n") - self.expect(self.current_payload.fileobj, "\r\n") + if self.current_payload.length: + self.expect(self.current_payload.fileobj, "\r\n") self.current_payload = None def 
#! /usr/bin/env python3
"""warcscrape: command-line tool to filter, count and extract records
from WARC (Web ARChive) files.

Modes (selected with -dump):
  * warc    -- write every record that survives the filters to a new WARC file
  * content -- reconstruct the archived sites' file trees on disk
"""
import os
import re
import argparse
import sys
import mimetypes
from urllib.parse import urlparse, unquote
from pprint import pprint
import shutil

# Global tally printed at the end of parse().
# Maps counter-name -> int, or group-name -> {key -> int}.
counts = {}


class filterObject:
    """One parsed filter expression of the form ``[!][http:]<header>:<value>``.

    Attributes:
        result: True for a positive filter ("keep when value present"),
                False when the expression was prefixed with "!".
        http:   True when the filter targets the record's HTTP headers
                rather than its WARC headers.
        k, v:   header name and substring to look for (lower-cased).
    """

    def __init__(self, string):
        self.result = True
        if string[0] == "!":          # a leading "!" negates the filter
            self.result = False
            string = string[1:]

        parts = string.lower().split(":")

        self.http = (parts[0] == 'http')
        if self.http:                 # drop the "http" routing prefix
            del parts[0]

        self.k = parts[0]
        # Rejoin so values that themselves contain ":" are kept whole
        # (the previous code silently discarded everything past parts[1]).
        self.v = ":".join(parts[1:])


def inc(obj, header=False, dic=False):
    """Increment a counter in the global `counts` tally.

    obj    -- the key to count, or a mapping when `header` is given.
    header -- optional key to look up in `obj` first (missing -> None).
    dic    -- optional sub-dictionary name to group the counter under.
    """
    if header:
        try:
            obj = obj[header]
        except KeyError:
            obj = None                # count missing headers under None

    holder = counts
    if dic:
        holder = counts.setdefault(dic, {})

    holder[obj] = holder.get(obj, 0) + 1


def warc_records(string, path):
    """Yield every record of each ``*.warc`` file in `path` whose
    filename matches the regular expression `string`."""
    # Imported lazily so the pure helper functions in this module stay
    # importable even when the third-party `warc` package is absent.
    import warc
    for filename in os.listdir(path):
        if re.search(string, filename) and ".warc" in filename:
            print("parsing", filename)
            with warc.open(path + filename) as warc_file:
                for record in warc_file:
                    yield record


def checkFilter(filters, record):
    """Return True when `record` satisfies every filter in `filters`.

    A record fails an http-targeted filter outright when it carries no
    HTTP payload.  NOTE: a negated filter still requires the header to
    be present -- a missing header always rejects the record.
    """
    for f in filters:
        if f.http:
            if not record.http:
                return False
            headers = record.http
        else:
            headers = record.header

        value = headers.get(f.k, None)
        # The substring test must agree with the filter's polarity:
        # found for positive filters, not-found for negated ones.
        if not value or (f.v in value) != f.result:
            return False
    return True


def _unique_path(path):
    """Return `path`, or `path` with "(n)" inserted before the final
    extension when a file of that name already exists."""
    index = path.rfind(".")
    candidate = path
    n = 0
    while os.path.isfile(candidate):
        n += 1
        candidate = path[:index] + "(" + str(n) + ")" + path[index:]
    return candidate


def _dump_content(record, output_path):
    """Write the record's HTTP payload into a directory tree mirroring
    the archived site (helper for parse(), -dump content)."""
    url = urlparse(unquote(record['WARC-Target-URI']))

    # Split the URL path into a directory part and a file name.
    index = url.path.rfind("/") + 1
    file = url.path[index:]
    path = url.path[:index]

    # Bare directory requests become index.html inside that directory.
    if "." not in file:
        path += file
        if not path.endswith("/"):
            path += "/"
        file = 'index.html'

    # Dots in directory names would confuse the extension handling below.
    path = path.replace(".", "-")
    host = url.hostname.replace('www.', '', 1)
    path = output_path + host + path

    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except OSError:
            # Component too long for the filesystem: shorten each path
            # component to 25 characters and retry.
            path = "/".join([i[:25] for i in path.split("/")])
            os.makedirs(path)

    # Make the file extension agree with the HTTP content type.
    index = file.index(".")   # always present: defaulted to index.html above
    suffix = file[index:]
    content = record.http.get("content_type", "")
    if suffix not in mimetypes.guess_all_extensions(content):
        suffix = mimetypes.guess_extension(content)
        if suffix:
            file = file[:index] + suffix
        else:
            inc(record.http, "content_type", "unknown mime type")

    # Keep gzip-encoded bodies recognisable.
    if record.http.get("content-encoding", None) == "gzip":
        file += ".gz"

    # Never overwrite an existing file; insert "(n)" before the suffix.
    path = _unique_path(path + file)

    with open(path, 'wb') as fp:
        record.http.write_payload_to(fp)


def parse(args):
    """Walk every matching WARC file, applying the filters, tallying
    header counts, and optionally dumping records (see -dump)."""
    # Truncate the output warc once up front; records are appended below.
    if args.dump == "warc":
        print("Recording", args.dump, "to", args.output + ".")
        with open(args.output_path + args.output, "wb"):
            pass

    for record in warc_records(args.string, args.path):
        try:
            # Drop entries the user filtered out.
            if not checkFilter(args.filter, record):
                continue

            # Tally counters.  argparse's -silence stores False, so the
            # counting (and the final report) runs by default.
            if args.silence:
                inc("records")
                inc(record, "warc-type", "types")
                inc(record, "content_type", "warc-content")
                if record.http:
                    inc(record.http, "content_type", "http-content")
                    inc(record.http, "error", "status")

            if args.dump == "warc":
                with open(args.output_path + args.output, "ab") as output:
                    record.write_to(output)

            if args.dump == "content":
                _dump_content(record, args.output_path)
        except Exception:
            # A bare `except:` would also trap KeyboardInterrupt and
            # SystemExit; catch Exception so Ctrl-C still works.
            if args.error:
                print("Error in record. Recording to error.warc.")
                # Append: mode "wb" here would overwrite all but the
                # last failing record.
                with open(args.output_path + "error.warc", "ab") as fp:
                    record.write_to(fp)
            else:
                raise

    # Print the tallies gathered above.
    if args.silence:
        print("-----------------------------")
        for name in counts:
            print("\nCount of {}.".format(name))
            pprint(counts[name])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Extracts attributes from warc files.')
    parser.add_argument("filter", nargs='*', help="Attributes to filter by. Entries that do not contain filtered elements are ignored. Example: warc-type:response, would ignore all warc entries that are not responses. Attributes in an HTTP object should be prefixed by 'http'. Example, http:error:200.")
    parser.add_argument("-silence", action="store_false", help="Silences output of warc data.")
    parser.add_argument("-error", action="store_true", help="Silences most errors and records problematic warc entries to error.warc.")
    parser.add_argument("-string", default="", help="Regular expression to limit parsed warc files. Defaults to empty string.")
    parser.add_argument("-path", default="./", help="Path to folder containing warc files. Defaults to current folder.")
    parser.add_argument("-output_path", default="data/", help="Path to folder to dump content files. Defaults to data/ folder.")
    parser.add_argument("-output", default="output.warc", help="File to output warc contents. Defaults to 'output.warc'.")
    parser.add_argument("-dump", choices=['warc', 'content'], type=str, help="Dumps all entries that survived filter. 'warc' creates a filtered warc file. 'content' tries to reproduce file structure of archived websites.")
    args = parser.parse_args()

    # Normalise directory arguments to end in "/".
    if args.path[-1] != "/":
        args.path += "/"
    if args.output_path[-1] != "/":
        args.output_path += "/"

    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    # Content extraction only makes sense for HTTP response records.
    if args.dump == "content":
        args.filter.append("warc-type:response")
        args.filter.append("content-type:application/http")

    args.filter = [filterObject(i) for i in args.filter]

    args.string = re.compile(args.string)
    parse(args)