diff --git a/.gitignore b/.gitignore index 0a312c6..c647a60 100644 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,4 @@ docs/_build/ build/ .coverage htmlcov/ +.ropeproject/ diff --git a/Readme.rst b/Readme.rst index b918dc5..5627d82 100644 --- a/Readme.rst +++ b/Readme.rst @@ -1,9 +1,7 @@ -warc: Python library to work with WARC files -============================================ +warc3: Python3 library to work with WARC files +============================================== -.. image:: https://secure.travis-ci.org/anandology/warc.png?branch=master - :alt: build status - :target: http://travis-ci.org/anandology/warc +Note: This is a fork of the original (now dead) warc repository. WARC (Web ARChive) is a file format for storing web crawls. @@ -12,14 +10,17 @@ http://bibnum.bnf.fr/WARC/ This `warc` library makes it very easy to work with WARC files.:: import warc - f = warc.open("test.warc") - for record in f: - print record['WARC-Target-URI'], record['Content-Length'] + with warc.open("test.warc") as f: + for record in f: + print record['WARC-Target-URI'], record['Content-Length'] Documentation ------------- The documentation of the warc library is available at http://warc.readthedocs.org/. + +Apart from the install from pip, which will not work for this warc3 version, the +interface as described there is unchanged. License ------- @@ -27,3 +28,17 @@ License This software is licensed under GPL v2. See LICENSE_ file for details. .. LICENSE: http://github.com/internetarchive/warc/blob/master/LICENSE + +Authors +------- + +Original Python2 Versions: + +* Anand Chitipothu +* Noufal Ibrahim + +Python3 Port: + +* Ryan Chartier +* Jan Pieter Bruins Slot +* Almer S. Tigelaar diff --git a/docs/conf.py b/docs/conf.py index 4469a18..a59b35c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -40,8 +40,8 @@ master_doc = 'index' # General information about the project. 
-project = u'warc' -copyright = u'2012, Internet Archive' +project = 'warc' +copyright = '2012, Internet Archive' # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the @@ -178,8 +178,8 @@ # Grouping the document tree into LaTeX files. List of tuples # (source start file, target name, title, author, documentclass [howto/manual]). latex_documents = [ - ('index', 'warc.tex', u'WARC Documentation', - u'Internet Archive', 'manual'), + ('index', 'warc.tex', 'WARC Documentation', + 'Internet Archive', 'manual'), ] # The name of an image file (relative to this directory) to place at the top of @@ -211,6 +211,6 @@ # One entry per manual page. List of tuples # (source start file, name, description, authors, manual section). man_pages = [ - ('index', 'warc', u'WARC Documentation', - [u'Internet Archive'], 1) + ('index', 'warc', 'WARC Documentation', + ['Internet Archive'], 1) ] diff --git a/requirements.txt b/requirements.txt index e079f8a..f3c7e8e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1 @@ -pytest +nose diff --git a/setup.py b/setup.py index 23a2f87..07feba3 100644 --- a/setup.py +++ b/setup.py @@ -4,7 +4,7 @@ setup( name="warc", - version="0.2.0", + version="0.2.2", description="Python library to work with ARC and WARC files", long_description=open('Readme.rst').read(), license='GPLv2', @@ -19,7 +19,7 @@ 'Development Status :: 4 - Beta', 'Environment :: Web Environment', 'Intended Audience :: Developers', - 'License :: OSI Approved :: BSD License', + 'License :: OSI Approved :: GNU General Public License v2 (GPLv2)', 'Operating System :: OS Independent', 'Programming Language :: Python', ], diff --git a/warc/__init__.py b/warc/__init__.py index 71392bc..32a04b8 100644 --- a/warc/__init__.py +++ b/warc/__init__.py @@ -7,27 +7,30 @@ :copyright: (c) 2012 Internet Archive """ -from .arc import ARCFile, ARCRecord, ARCHeader -from .warc import WARCFile, 
WARCRecord, WARCHeader, WARCReader +from .arc import ARCFile +from .warc import WARCFile + def detect_format(filename): """Tries to figure out the type of the file. Return 'warc' for WARC files and 'arc' for ARC files""" - if ".arc" in filename: - return "arc" - if ".warc" in filename: + if filename.endswith(".warc") or filename.endswith(".warc.gz"): return "warc" + if filename.endswith('.arc') or filename.endswith('.arc.gz'): + return 'arc' + return "unknown" -def open(filename, mode="rb", format = None): + +def open(filename, mode="rb", format=None): """Shorthand for WARCFile(filename, mode). Auto detects file and opens it. """ - if format == "auto" or format == None: + if format == "auto" or format is None: format = detect_format(filename) if format == "warc": @@ -35,4 +38,4 @@ def open(filename, mode="rb", format = None): elif format == "arc": return ARCFile(filename, mode) else: - raise IOError("Don't know how to open '%s' files"%format) + raise IOError("Don't know how to open '%s' files" % format) diff --git a/warc/arc.py b/warc/arc.py index 5889587..93cb9ec 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -1,20 +1,29 @@ """ -Provides support for ARC v1 files. +Provides support for ARC v1 files. 
:copyright: (c) 2012 Internet Archive """ -import __builtin__ +import builtins import datetime import os import re -import StringIO +import io import warnings +import gzip -from .utils import CaseInsensitiveDict +from .utils import CaseInsensitiveDict, status_code + + +ARC1_HEADER_RE = re.compile(('(?P\S*)\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)')) + +ARC2_HEADER_RE = re.compile(('(?P\S*)\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)\s(?P\S*)' + '\s(?P\S*)')) -ARC1_HEADER_RE = re.compile('(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)') -ARC2_HEADER_RE = re.compile('(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)\s(?P\S*)') class ARCHeader(CaseInsensitiveDict): """ @@ -27,22 +36,24 @@ class ARCHeader(CaseInsensitiveDict): * content_type * length (length of the n/w doc in bytes) - V2 header fields are + V2 header fields are * url * ip_address * date (date of archival) - * content_type + * content_type * result_code (response code) - * checksum + * checksum * location * offset (offset from beginning of file to recrod) * filename (name of arc file) * length (length of the n/w doc in bytes) """ - def __init__(self, url = "", ip_address = "", date = "", content_type = "", - result_code = "", checksum = "", location = "", offset = "", filename = "", length = "", version = 2): + def __init__(self, url="", ip_address="", date="", + content_type="", result_code="", checksum="", + location="", offset="", filename="", length=0, + version=2): if isinstance(date, datetime.datetime): date = date.strftime("%Y%m%d%H%M%S") @@ -50,25 +61,26 @@ def __init__(self, url = "", ip_address = "", date = "", content_type = "", try: datetime.datetime.strptime(date, "%Y%m%d%H%M%S") except ValueError: - raise ValueError("Couldn't parse the date '%s' in file header"%date) + raise ValueError("Couldn't parse the date '%s' in file " + "header" % date) self.version = version - - CaseInsensitiveDict.__init__(self, - url = url, - ip_address 
= ip_address, - date = date, - content_type = content_type, - result_code = result_code, - checksum = checksum, - location = location, - offset = offset, - filename = filename, - length = length) - - def write_to(self, f, version = None): + super().__init__({ + 'url': url, + 'ip_address': ip_address, + 'date': date, + 'content_type': content_type, + 'result_code': result_code, + 'checksum': checksum, + 'location': location, + 'offset': offset, + 'filename': filename, + 'length': int(length), + }) + + def write_to(self, f, version=None): """ - Writes out the arc header to the file like object `f`. + Writes out the arc header to the file like object `f`. If the version field is 1, it writes out an arc v1 header, otherwise (and this is default), it outputs a v2 header. @@ -77,96 +89,115 @@ def write_to(self, f, version = None): if not version: version = self.version if version == 1: - header = "%(url)s %(ip_address)s %(date)s %(content_type)s %(length)s" + header = ("%(url)s %(ip_address)s %(date)s " + "%(content_type)s %(length)s") elif version == 2: - header = "%(url)s %(ip_address)s %(date)s %(content_type)s %(result_code)s %(checksum)s %(location)s %(offset)s %(filename)s %(length)s" - - header = header%dict(url = self['url'], - ip_address = self['ip_address'], - date = self['date'], - content_type = self['content_type'], - result_code = self['result_code'], - checksum = self['checksum'], - location = self['location'], - offset = self['offset'], - filename = self['filename'], - length = self['length']) + header = ("%(url)s %(ip_address)s %(date)s %(content_type)s " + "%(result_code)s %(checksum)s %(location)s %(offset)s " + "%(filename)s %(length)s") + + header = header % dict(self) f.write(header) - @property def url(self): return self["url"] - + @property def ip_address(self): return self["ip_address"] - + @property def date(self): return datetime.datetime.strptime(self['date'], "%Y%m%d%H%M%S") - + @property def content_type(self): return self["content_type"] - 
+ @property def result_code(self): return self["result_code"] - + @property - def checksum (self): + def checksum(self): return self["checksum"] - + @property def location(self): return self["location"] - + @property def offset(self): return int(self["offset"]) - + @property def filename(self): return self["filename"] - + @property def length(self): return int(self["length"]) def __str__(self): - f = StringIO.StringIO() + f = io.StringIO() self.write_to(f) return f.getvalue() - + def __repr__(self): f = {} - for i in "url ip_address date content_typeresult_code checksum location offset filename length".split(): - if hasattr(self,i): + fields = ("url ip_address date content_type result_code checksum " + "location offset filename length".split()) + for i in fields: + if hasattr(self, i): f[i] = getattr(self, i) - s = ['%s = "%s"'%(k, v) for k,v in f.iteritems()] + s = ['%s = "%s"' % (k, v) for k, v in f.items()] s = ", ".join(s) - return ""%s + return "" % s + - class ARCRecord(object): - def __init__(self, header = None, payload = None, headers = {}, version = None): + def __init__(self, header=None, payload=None, headers={}, version=None): if not (header or headers): - raise TypeError("Can't write create an ARC1 record without a header") - self.header = header or ARCHeader(version = version, **headers) - self.payload = payload + raise TypeError("Can't write create an ARC1 record " + "without a header") + self.header = header or ARCHeader(version=version, **headers) + self.payload = io.BytesIO(payload) self.version = version - + self._read_html_headers() + + def _read_html_headers(self): + line = self.payload.readline().decode('utf-8') + if not line.startswith("HTTP/1"): + self.payload.seek(0) + return + + line = line.strip() + headers = { + 'protocol': line, + 'status_code': status_code(line), + } + for line in self.payload: + line = line.decode('utf-8') + if not line.strip(): + break + name, content = line.split(':', 1) + name = name.strip() + content = 
content.strip() + headers[name.lower()] = content + self.header['http_headers'] = headers + self.payload = io.BytesIO(self.payload.read()) + @classmethod def from_string(cls, string, version): """ - Constructs an ARC record from a string and returns it. + Constructs an ARC record from a string and returns it. TODO: It might be best to merge this with the _read_arc_record function rather than reimplement the functionality here. """ - header, payload = string.split("\n",1) - if payload[0] == '\n': # There's an extra + header, payload = string.split("\n", 1) + if payload[0] == '\n': # There's an extra payload = payload[1:] if int(version) == 1: arc_header_re = ARC1_HEADER_RE @@ -176,16 +207,19 @@ def from_string(cls, string, version): matches = arc_header_re.search(header) headers = matches.groupdict() arc_header = ARCHeader(**headers) - return cls(header = arc_header, payload = payload, version = version) + return cls(header=arc_header, payload=payload, version=version) - def write_to(self, f, version = None): + def write_to(self, f, version=None): version = version or self.version or 2 self.header.write_to(f, version) - f.write("\n") # This separates the header and the body - if isinstance(self.payload, str): #Usually used for small payloads + f.write("\n") # This separates the header and the body + # Usually used for small payloads + if isinstance(self.payload, str): f.write(self.payload) - elif hasattr(self.payload, "read"): #Used for large payloads where we give a file like object - chunk_size = 10 * 1024 * 1024 # Read 10MB by 10MB + # Used for large payloads where we give a file like object + elif hasattr(self.payload, "read"): + # Read 10MB by 10MB + chunk_size = 10 * 1024 * 1024 d = self.payload.read(chunk_size) while d: f.write(d) @@ -198,20 +232,20 @@ def __getitem__(self, name): def __setitem__(self, name, value): self.header[name] = value - def __str__(self): - f = StringIO.StringIO() + f = io.StringIO() self.write_to(f) return f.getvalue() - - + + class 
ARCFile(object): - def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_headers = {}): + def __init__(self, filename=None, mode=None, fileobj=None, version=None, + file_headers=None, compress=None): """ Initialises a file like object that can be used to read or write Arc files. Works for both version 1 or version 2. - This can be called similar to the builtin `file` constructor. + This can be called similar to the builtin `file` constructor. It can also just be given a fileobj which is a file like object that it will use directly for its work. @@ -223,7 +257,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ * ip_address - IP address of the machine doing the Archiving * date - Date of archival - * org - Organisation that's doing the Archiving. + * org - Organisation that's doing the Archiving. The version parameter tries to work intuitively as follows @@ -247,20 +281,41 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ * When we try to read a record, it will read out one record and try to guess the version from it (for the first read). 
- + """ if fileobj is None: - fileobj = __builtin__.open(filename, mode or "rb") + fileobj = builtins.open(filename, mode or "rb") + mode = fileobj.mode + # initialize compress based on filename, if not already specified + if compress is None and filename and filename.endswith(".gz"): + compress = True + + if compress: + fileobj = gzip.open(fileobj, mode) + self.fileobj = fileobj - if version != None and int(version) not in (1, 2): + self.filename = filename + if self.filename is None: + if hasattr(self.fileobj, "name"): + self.filename = self.fileobj.name + else: + self.filename = "" + + if version and int(version) not in (1, 2): raise TypeError("ARC version has to be 1 or 2") self.version = version - self.file_headers = file_headers + self.file_headers = file_headers or {} self.header_written = False self.header_read = False + self.file_meta = '' + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() - def _write_header(self): "Writes out an ARC header" if "org" not in self.file_headers: @@ -268,31 +323,37 @@ def _write_header(self): self.file_headers['org'] = "Unknown" if "date" not in self.file_headers: now = datetime.datetime.utcnow() - warnings.warn("Using '%s' for Archiving time"%now) + warnings.warn("Using '%s' for Archiving time" % now) self.file_headers['date'] = now if "ip_address" not in self.file_headers: - warnings.warn("Using '127.0.0.1' as IP address of machine that's archiving") + warnings.warn("Using '127.0.0.1' as IP address of machine " + "that's archiving") self.file_headers['ip_address'] = "127.0.0.1" if self.version == 1: - payload = "1 0 %(org)s\nURL IP-address Archive-date Content-type Archive-length"%dict(org = self.file_headers['org']) + payload = ("1 0 %s\nURL IP-address Archive-date Content-type " + "Archive-length") % self.file_headers['org'] elif self.version == 2: - payload = "2 0 %(org)s\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename 
Archive-length" + payload = ("2 0 %s\nURL IP-address Archive-date Content-type " + "Result-code Checksum Location Offset Filename " + "Archive-length" % self.file_headers['org']) else: - raise IOError("Can't write an ARC file with version '\"%s\"'"%self.version) - - fname = os.path.basename(self.fileobj.name) - header = ARCHeader(url = "filedesc://%s"%fname, - ip_address = self.file_headers['ip_address'], - date = self.file_headers['date'], - content_type = "text/plain", - length = len(payload), - result_code = "200", - checksum = "-", - location = "-", - offset = str(self.fileobj.tell()), - filename = fname) - arc_file_header_record = ARCRecord(header, payload%self.file_headers) - self.write(arc_file_header_record) + raise IOError("Can't write an ARC file " + "with version '\"%s\"'" % self.version) + + fname = os.path.basename(self.filename) + header = ARCHeader(url="filedesc://%s" % fname, + ip_address=self.file_headers['ip_address'], + date=self.file_headers['date'], + content_type="text/plain", + length=len(payload), + result_code="200", + checksum="-", + location="-", + offset=str(self.fileobj.tell()), + filename=fname) + arc_file_header_record = ARCRecord(header, payload % self.file_headers) + arc_file_header_record.write_to(self.fileobj, self.version) + self.fileobj.write("\n") # record separator def write(self, arc_record): "Writes out the given arc record to the file" @@ -302,75 +363,114 @@ def write(self, arc_record): self.header_written = True self._write_header() arc_record.write_to(self.fileobj, self.version) - self.fileobj.write("\n") # Record separator + self.fileobj.write("\n") # Record separator def _read_file_header(self): """Reads out the file header for the arc file. 
If version was not provided, this will autopopulate it.""" - header = self.fileobj.readline() - payload1 = self.fileobj.readline() - payload2 = self.fileobj.readline() - version, reserved, organisation = payload1.split(None, 2) - self.fileobj.readline() # Lose the separator newline + header = self.fileobj.readline().decode('utf-8') + payload1 = self.fileobj.readline().decode('utf-8') + payload2 = self.fileobj.readline().decode('utf-8') + version, reserved, organisation = payload1.split(maxsplit=2) self.header_read = True - # print "--------------------------------------------------" - # print header,"\n", payload1, "\n", payload2,"\n" - # print "--------------------------------------------------" + version = int(version) + # print("--------------------------------------------------") + # print(header, "\n", payload1, "\n", payload2, "\n", version) + # print("--------------------------------------------------") if self.version and int(self.version) != version: - raise IOError("Version mismatch. Requested version was '%s' but version in file was '%s'"%(self.version, version)) - - if version == '1': + raise IOError("Version mismatch. 
Requested version was '%s' but " + "version in file was '%s'" % (self.version, version)) + + if version == 1: url, ip_address, date, content_type, length = header.split() - self.file_headers = {"ip_address" : ip_address, - "date" : datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), - "org" : organisation} + self.file_headers = { + "ip_address": ip_address, + "date": datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), + "org": organisation, + "url": url, + 'content_type': content_type, + 'length': int(length), + } self.version = 1 - elif version == '2': - url, ip_address, date, content_type, result_code, checksum, location, offset, filename, length = header.split() - self.file_headers = {"ip_address" : ip_address, - "date" : datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), - "org" : organisation} + elif version == 2: + (url, ip_address, date, content_type, result_code, + checksum, location, offset, filename, length) = header.split() + self.file_headers = { + "ip_address": ip_address, + "date": datetime.datetime.strptime(date, "%Y%m%d%H%M%S"), + "org": organisation, + 'url': url, + 'content_type': content_type, + 'length': int(length), + 'filename': filename, + 'location': location, + } self.version = 2 else: - raise IOError("Unknown ARC version '%s'"%version) + raise IOError("Unknown ARC version '%s'" % version) + + length = int(length) + current_size = len(payload1 + payload2) + self.file_meta = b'' + while current_size < length: + line = self.fileobj.readline() + self.file_meta = self.file_meta + line + current_size = current_size + len(line) + self.fileobj.readline() # Lose the separator newline + + def _strip_initial_new_lines(self): + line = self.fileobj.readline() + while line and not line.strip(): + line = self.fileobj.readline() + return line.decode('utf-8').strip() + + def _safe_from_arcmetadata(self, line): + # JG: this block stops the header parser / reader + # from getting caught on the XML lump + # that can appear in ARC files + if line.startswith("\n"): 
+ line = self.fileobj.readline().decode('utf-8') + line = self.fileobj.readline().decode('utf-8') + line = self.fileobj.readline().decode('utf-8') + return line.strip() + + def _read_record_header(self, line): + if self.version == 1: + arc_header_re = ARC1_HEADER_RE + elif self.version == 2: + arc_header_re = ARC2_HEADER_RE + + matches = arc_header_re.search(line) + headers = matches.groupdict() + return ARCHeader(**headers) def _read_arc_record(self): "Reads out an arc record, formats it and returns it" - #XXX:Noufal Stream payload here rather than just read it + # XXX:Noufal Stream payload here rather than just read it # r = self.fileobj.readline() # Drop the initial newline # if r == "": # return None # header = self.fileobj.readline() - # Strip the initial new lines and read first line - header = self.fileobj.readline() - while header and header.strip() == "": - header = self.fileobj.readline() + line = self._strip_initial_new_lines() + line = self._safe_from_arcmetadata(line) - if header == "": + if not line: return None - if int(self.version) == 1: - arc_header_re = ARC1_HEADER_RE - elif int(self.version) == 2: - arc_header_re = ARC2_HEADER_RE + header = self._read_record_header(line) + payload = self.fileobj.read(header['length']) - matches = arc_header_re.search(header) - headers = matches.groupdict() - arc_header = ARCHeader(**headers) - - payload = self.fileobj.read(int(headers['length'])) - - self.fileobj.readline() # Munge the separator newline. + self.fileobj.readline() # Munge the separator newline. 
+ return ARCRecord(header=header, payload=payload) - return ARCRecord(header = arc_header, payload = payload) - def read(self): "Reads out an arc record from the file" if not self.header_read: self._read_file_header() return self._read_arc_record() - + # For compatability with WARCFile read_record = read write_record = write @@ -380,16 +480,6 @@ def __iter__(self): while record: yield record record = self.read() - + def close(self): self.fileobj.close() - - - - - - - - - - diff --git a/warc/gzip2.py b/warc/gzip2.py deleted file mode 100644 index fcd6b48..0000000 --- a/warc/gzip2.py +++ /dev/null @@ -1,121 +0,0 @@ -"""Enhanced gzip library to support multiple member gzip files. - -GZIP has an interesting property that contatination of mutliple gzip files is a valid gzip file. -In other words, a gzip file can have multiple members, each individually gzip -compressed. The members simply appear one after another in the file, with no -additional information before, between, or after them. - -See GZIP RFC for more information. - -http://www.gzip.org/zlib/rfc-gzip.html - -This library provides support for creating and reading multi-member gzip files. -""" -from gzip import WRITE, READ, write32u, GzipFile as BaseGzipFile -import zlib - -def open(filename, mode="rb", compresslevel=9): - """Shorthand for GzipFile(filename, mode, compresslevel). - """ - return GzipFile(filename, mode, compresslevel) - -class GzipFile(BaseGzipFile): - """GzipFile with support for multi-member gzip files. - """ - def __init__(self, filename=None, mode=None, - compresslevel=9, fileobj=None): - BaseGzipFile.__init__(self, - filename=filename, - mode=mode, - compresslevel=compresslevel, - fileobj=fileobj) - - if self.mode == WRITE: - # Indicates the start of a new member if value is True. - # The BaseGzipFile constructor already wrote the header for new - # member, so marking as False. 
- self._new_member = False - - # When _member_lock is True, only one member in gzip file is read - self._member_lock = False - - def close_member(self): - """Closes the current member being written. - """ - # The new member is not yet started, no need to close - if self._new_member: - return - - self.fileobj.write(self.compress.flush()) - write32u(self.fileobj, self.crc) - # self.size may exceed 2GB, or even 4GB - write32u(self.fileobj, self.size & 0xffffffffL) - self.size = 0 - self.compress = zlib.compressobj(9, - zlib.DEFLATED, - -zlib.MAX_WBITS, - zlib.DEF_MEM_LEVEL, - 0) - self._new_member = True - - def _start_member(self): - """Starts writing a new member if required. - """ - if self._new_member: - self._init_write(self.name) - self._write_gzip_header() - self._new_member = False - - def write(self, data): - self._start_member() - BaseGzipFile.write(self, data) - - def close(self): - """Closes the gzip with care to handle multiple members. - """ - if self.fileobj is None: - return - if self.mode == WRITE: - self.close_member() - self.fileobj = None - elif self.mode == READ: - self.fileobj = None - - if self.myfileobj: - self.myfileobj.close() - self.myfileobj = None - - def _read(self, size): - # Treat end of member as end of file when _member_lock flag is set - if self._member_lock and self._new_member: - raise EOFError() - else: - return BaseGzipFile._read(self, size) - - def read_member(self): - """Returns a file-like object to read one member from the gzip file. - """ - if self._member_lock is False: - self._member_lock = True - - if self._new_member: - try: - # Read one byte to move to the next member - BaseGzipFile._read(self, 1) - assert self._new_member is False - except EOFError: - return None - - return self - - def write_member(self, data): - """Writes the given data as one gzip member. - - The data can be a string, an iterator that gives strings or a file-like object. 
- """ - if isinstance(data, basestring): - self.write(data) - else: - for text in data: - self.write(text) - self.close_member() diff --git a/warc/tests/test_arc.py b/warc/tests/test_arc.py index 11305e5..d0f0b9c 100644 --- a/warc/tests/test_arc.py +++ b/warc/tests/test_arc.py @@ -1,29 +1,27 @@ import datetime import hashlib -import StringIO +import io from .. import arc -import pytest +#def test_init_arc_header(): +# "Make sure Header can be initialise only with expected fields" +# with pytest.raises(TypeError): +# arc.ARCHeader(test="1234") -def test_init_arc_header(): - "Make sure Header can be initialise only with expected fields" - with pytest.raises(TypeError): - arc.ARCHeader(test="1234") - def test_arc_header_attributes(): "Make sure that ARC1 header fields are accessible as attributes. Double check for attributes that are converted for convenience (e.g. date and length)" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - + assert header.url == "http://archive.org" assert header.ip_address == "127.0.0.1" assert header.date == datetime.datetime.strptime("20120301093000", "%Y%m%d%H%M%S") @@ -36,57 +34,57 @@ def test_arc_header_attributes(): assert header.location == "http://www.archive.org" assert header.offset == 300 assert header.filename == "sample.arc.gz" - + def test_arc_v1_header_creation(): "Validate ARC V1 header creation" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = 
"a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - f = StringIO.StringIO() + f = io.StringIO() header.write_to(f, 1) header_v1_string = f.getvalue() assert header_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500" - - + + def test_arc_v2_header_creation(): "Validate ARC V2 header creation" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") - f = StringIO.StringIO() + f = io.StringIO() header.write_to(f) header_v2_string = f.getvalue() assert header_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500" - - + + def test_arc_v1_record_creation(): "Validate ARC V1 record creation" header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") record_v1 = arc.ARCRecord(header, "BlahBlah") - f = StringIO.StringIO() + f = io.StringIO() record_v1.write_to(f, 1) record_v1_string = f.getvalue() assert record_v1_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500\nBlahBlah\n" @@ -94,17 +92,17 @@ def test_arc_v1_record_creation(): def test_arc_v2_record_creation(): "Validate ARC V1 record creation" header = dict(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + 
content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") record_v2 = arc.ARCRecord(payload = "BlahBlah", headers = header) - f = StringIO.StringIO() + f = io.StringIO() record_v2.write_to(f) record_v2_string = f.getvalue() assert record_v2_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n" @@ -116,14 +114,14 @@ def test_arc_v1_writer(): date = now, org = "Internet Archive") - opfile = StringIO.StringIO() + opfile = io.StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers) for payload in "Payload1 Payload2".split(): header = dict(url = "http://www.archive.org", - ip_address = "127.0.0.1", - date = now, + ip_address = "127.0.0.1", + date = now, content_type = "text/html", length = len(payload)) r = arc.ARCRecord(headers = header, payload = payload) @@ -137,14 +135,14 @@ def test_arc1_v1_writer_default_headers(): now = datetime.datetime(year = 2012, month = 3, day = 2, hour = 19, minute = 32, second = 10) file_headers = dict(date = now) - opfile = StringIO.StringIO() + opfile = io.StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. - + f = arc.ARCFile(fileobj = opfile, version = 1, file_headers = file_headers) for payload in "Payload1 Payload2".split(): header = dict(url = "http://www.archive.org", - ip_address = "127.0.0.1", - date = now, + ip_address = "127.0.0.1", + date = now, content_type = "text/html", length = len(payload)) r = arc.ARCRecord(headers = header, payload = payload) @@ -153,7 +151,6 @@ def test_arc1_v1_writer_default_headers(): assert opfile.getvalue() == expected_value f.close() - def test_arc_v2_writer(): "Try writing records to an ARC V2 file. 
This is what API will feel like to a user of the library" now = "20120302193210" @@ -161,18 +158,18 @@ def test_arc_v2_writer(): date = now, org = "Internet Archive") - opfile = StringIO.StringIO() + opfile = io.StringIO() opfile.name = "sample.arc" # Necessary since only file objects in Python have names. f = arc.ARCFile(fileobj = opfile, file_headers = file_headers) for payload in "Payload1 Payload2".split(): header = arc.ARCHeader(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") @@ -183,21 +180,21 @@ def test_arc_v2_writer(): def test_arc_reader_guess_version(): "Make sure that the ARCFile object automatically detects the file version" - v1 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2") - v2 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") - + v1 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 
8\n\nPayload1\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\n\nPayload2") + v2 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") + arc_v1 = arc.ARCFile(fileobj = v1) arc_v2 = arc.ARCFile(fileobj = v2) arc_v1.read() arc_v2.read() - + assert arc_v1.version == 1 assert arc_v2.version == 2 - + def test_arc_reader_read_file_headers(): "Make sure that the parser is reading file headers properly" - ip = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") + ip = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\n\nPayload2") arc_file = arc.ARCFile(fileobj = ip) arc_file.read() arc_file.file_headers['ip_address'] == "127.0.0.1" @@ -205,14 +202,14 @@ def test_arc_reader_read_file_headers(): 
arc_file.file_headers['org'] == "Internet Archive" -def test_arc_reader_v1(): +def test_arc_reader_v1(): "Make sure that the parser reads out V1 ARC records. (Also tests iterator behaviour)" - v1 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2") - arc_file = arc.ARCFile(fileobj = v1) + v1 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 68\n1 0 Unknown\nURL IP-address Archive-date Content-type Archive-length\n\n\nhttp://www.archive.org 127.0.0.1 20120302193210 text/html 8\nPayload1\nhttp://archive.org 127.0.0.1 20120302193211 text/plain 8\nPayload2") + arc_file = arc.ARCFile(fileobj = v1) r1 = arc_file.read() r2 = arc_file.read() - + assert r1['url'] == "http://www.archive.org" assert r1['ip_address'] == "127.0.0.1" assert r1['date'] == "20120302193210" @@ -228,12 +225,12 @@ def test_arc_reader_v1(): assert r2.payload == "Payload2" -def test_arc_reader_v2(): +def test_arc_reader_v2(): "Make sure that the parser reads out V2 ARC records. 
(Also tests iterator behaviour)" - v2 = StringIO.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2") - arc_file = arc.ARCFile(fileobj = v2) + v2 = io.StringIO("filedesc://sample.arc 127.0.0.1 20120302193210 text/plain 200 - - 0 sample.arc 114\n2 0 Internet Archive\nURL IP-address Archive-date Content-type Result-code Checksum Location Offset Filename Archive-length\n\n\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload1\nhttp://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 8\nPayload2") + arc_file = arc.ARCFile(fileobj = v2) r1, r2 = list(arc_file) - + assert r1['url'] == "http://archive.org" assert r1['ip_address'] == "127.0.0.1" assert r1['date'] == "20120301093000" @@ -277,24 +274,52 @@ def test_arc_v2_record_from_string(): def test_arc_record_versions(): "Check initialising an ARCRecord with a version to see if it outputs stuff properly" header = dict(url = "http://archive.org", - ip_address = "127.0.0.1", - date = "20120301093000", - content_type = "text/html", + ip_address = "127.0.0.1", + date = "20120301093000", + content_type = "text/html", length = "500", result_code = "200", - checksum = "a123456", + checksum = "a123456", location = "http://www.archive.org", offset = "300", filename = "sample.arc.gz") record_1 = arc.ARCRecord(payload = "BlahBlah", headers = header, version = 1) record_2 = arc.ARCRecord(payload = "BlahBlah", headers = header, version = 2) - f = StringIO.StringIO() + f = io.StringIO() record_1.write_to(f) 
record_string = f.getvalue() assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 500\nBlahBlah\n" - f = StringIO.StringIO() + f = io.StringIO() record_2.write_to(f) record_string = f.getvalue() assert record_string == "http://archive.org 127.0.0.1 20120301093000 text/html 200 a123456 http://www.archive.org 300 sample.arc.gz 500\nBlahBlah\n" + +class TestARCFile: + def test_write_headers(self): + """Test to make sure header is written just once. + """ + f = io.StringIO() + f.name = "sample.arc" + afile = arc.ARCFile(fileobj=f, version=1) + afile._write_header() + + # Make sure header is written only once + assert f.getvalue().count("filedesc://") == 1 + + def test_filename(self): + """If filename is specified as argument to ARCFile, it should be used.""" + f = io.StringIO() + afile = arc.ARCFile(fileobj=f, filename="sample.arc", version=1) + afile._write_header() + assert "sample.arc" in f.getvalue() + + def test_no_filename(self): + """should be able to write ARCFile even if there is no filename.""" + f = io.StringIO() + afile = arc.ARCFile(fileobj=f, version=1) + afile._write_header() + # filename should be empty + assert f.getvalue().startswith("filedesc:// ") + diff --git a/warc/tests/test_common.py b/warc/tests/test_common.py index d2c2353..4ad14ec 100644 --- a/warc/tests/test_common.py +++ b/warc/tests/test_common.py @@ -1,44 +1,13 @@ -from .. import open as libopen -from .. 
import WARCFile, ARCFile +from ..__init__ import open as libopen +from ..warc import WARCFile import os -import pytest - def test_open_warc_file(): "Test opening a WARC file" - + f = libopen("foo.warc","wb") assert isinstance(f, WARCFile) f.close() os.unlink("foo.warc") - -def test_open_arc_file(): - "Test opening an ARC file" - - f = libopen("foo.arc","wb") - assert isinstance(f, ARCFile) - f.close() - os.unlink("foo.arc") - - -def test_open_unknown_file(): - "Test opening a WARC file" - - with pytest.raises(IOError): - libopen("foo","wb") - - -def test_sample_data(): - import gzip - f = gzip.GzipFile("test_data/alexa_short_header.arc.gz") - a = ARCFile(fileobj = f) - record = str(a.read()) - expected = """http://www.killerjo.net:80/robots.txt 211.111.217.29 20110804181142 39 -SSH-2.0-OpenSSH_5.3p1 Debian-3ubuntu3\r\n\n""" - assert record == expected - - - - diff --git a/warc/tests/test_utils.py b/warc/tests/test_utils.py index c155e6e..6be5abc 100644 --- a/warc/tests/test_utils.py +++ b/warc/tests/test_utils.py @@ -1,5 +1,5 @@ from ..utils import FilePart, CaseInsensitiveDict -from cStringIO import StringIO +import io class TestCaseInsensitiveDict: def test_all(self): @@ -13,44 +13,52 @@ def test_all(self): d['BAR'] = 2 assert 'bar' in d assert d['bar'] == 2 - + assert sorted(d.keys()) == ["bar", "foo"] assert sorted(d.items()) == [("bar", 2), ("foo", 1)] - + class TestFilePart: - def setup_method(self, m): + def setup(self): # 5 chars in each line - self.text = "\n".join(["aaaa", "bbbb", "cccc", "dddd", "eeee", "ffff"]) - + self.text = b"\n".join([b"aaaa", b"bbbb", b"cccc", b"dddd", b"eeee", b"ffff"]) + def test_read(self): - part = FilePart(StringIO(self.text), 0) - assert part.read() == "" - - part = FilePart(StringIO(self.text), 5) - assert part.read() == "aaaa\n" - - part = FilePart(StringIO(self.text), 10) - assert part.read() == "aaaa\nbbbb\n" - + part = FilePart(io.BytesIO(self.text), 0) + assert part.read() == b"" + + part = 
FilePart(io.BytesIO(self.text), 5) + assert part.read() == b"aaaa\n" + + part = FilePart(io.BytesIO(self.text), 10) + assert part.read() == b"aaaa\nbbbb\n" + # try with large data - part = FilePart(StringIO("a" * 10000), 10) + part = FilePart(io.BytesIO(b"a" * 10000), 10) assert len(part.read()) == 10 - + def test_read_with_size(self): - part = FilePart(StringIO(self.text), 10) - assert part.read(3) == "aaa" - assert part.read(3) == "a\nb" - assert part.read(3) == "bbb" - assert part.read(3) == "\n" - assert part.read(3) == "" - + part = FilePart(io.BytesIO(self.text), 10) + assert part.read(3) == b"aaa" + assert part.read(3) == b"a\nb" + assert part.read(3) == b"bbb" + assert part.read(3) == b"\n" + assert part.read(3) == b"" + + def test_read_with_buffer(self): + "Tests read size when read length is larger than buffer." + fb = io.BytesIO(b'a' * 10000) + part = FilePart(fb, 10000) + temp = part.read(100) + part._unread(temp) + assert len(part.read(1000)) == 1000 + def test_readline(self): - part = FilePart(StringIO(self.text), 11) - assert part.readline() == "aaaa\n" - assert part.readline() == "bbbb\n" - assert part.readline() == "c" - assert part.readline() == "" - + part = FilePart(io.BytesIO(self.text), 11) + assert part.readline() == b"aaaa\n" + assert part.readline() == b"bbbb\n" + assert part.readline() == b"c" + assert part.readline() == b"" + def test_iter(self): - part = FilePart(StringIO(self.text), 11) - assert list(part) == ["aaaa\n", "bbbb\n", "c"] \ No newline at end of file + part = FilePart(io.BytesIO(self.text), 11) + assert list(part) == [b"aaaa\n", b"bbbb\n", b"c"] diff --git a/warc/tests/test_warc.py b/warc/tests/test_warc.py index 92545ba..7f722f7 100644 --- a/warc/tests/test_warc.py +++ b/warc/tests/test_warc.py @@ -1,6 +1,5 @@ from ..warc import WARCReader, WARCHeader, WARCRecord, WARCFile - -from StringIO import StringIO +import io class TestWARCHeader: def test_attrs(self): @@ -53,21 +52,21 @@ def f(type): assert 
f("newtype")["Content-Type"] == "application/octet-stream" SAMPLE_WARC_RECORD_TEXT = ( - "WARC/1.0\r\n" + - "Content-Length: 10\r\n" + - "WARC-Date: 2012-02-10T16:15:52Z\r\n" + - "Content-Type: application/http; msgtype=response\r\n" + - "WARC-Type: response\r\n" + - "WARC-Record-ID: \r\n" + - "WARC-Target-URI: http://example.com/\r\n" + - "\r\n" + - "Helloworld" + - "\r\n\r\n" + b"WARC/1.0\r\n" + + b"Content-Length: 10\r\n" + + b"WARC-Date: 2012-02-10T16:15:52Z\r\n" + + b"Content-Type: application/http; msgtype=response\r\n" + + b"WARC-Type: response\r\n" + + b"WARC-Record-ID: \r\n" + + b"WARC-Target-URI: http://example.com/\r\n" + + b"\r\n" + + b"Helloworld" + + b"\r\n\r\n" ) class TestWARCReader: def test_read_header1(self): - f = StringIO(SAMPLE_WARC_RECORD_TEXT) + f = io.BytesIO(SAMPLE_WARC_RECORD_TEXT) h = WARCReader(f).read_record().header assert h.date == "2012-02-10T16:15:52Z" assert h.record_id == "" @@ -75,17 +74,17 @@ def test_read_header1(self): assert h.content_length == 10 def test_empty(self): - reader = WARCReader(StringIO("")) + reader = WARCReader(io.BytesIO(b"")) assert reader.read_record() is None def test_read_record(self): - f = StringIO(SAMPLE_WARC_RECORD_TEXT) + f = io.BytesIO(SAMPLE_WARC_RECORD_TEXT) reader = WARCReader(f) record = reader.read_record() - assert "".join(record.payload) == "Helloworld" + assert record.payload.readline() == b"Helloworld" def read_multiple_records(self): - f = StringIO(SAMPLE_WARC_RECORD_TEXT * 5) + f = io.BytesIO(SAMPLE_WARC_RECORD_TEXT * 5) reader = WARCReader(f) for i in range(5): rec = reader.read_record() @@ -93,21 +92,10 @@ def read_multiple_records(self): class TestWarcFile: def test_read(self): - f = WARCFile(fileobj=StringIO(SAMPLE_WARC_RECORD_TEXT)) + f = WARCFile(fileobj=io.BytesIO(SAMPLE_WARC_RECORD_TEXT)) assert f.read_record() is not None assert f.read_record() is None - def test_write_gz(self): - """Test writing multiple member gzip file.""" - buffer = StringIO() - f = WARCFile(fileobj=buffer, 
mode="w", compress=True) - for i in range(10): - record = WARCRecord(payload="hello %d" % i) - f.write_record(record) - - GZIP_MAGIC_NUMBER = '\037\213' - assert buffer.getvalue().count(GZIP_MAGIC_NUMBER) == 10 - def test_long_header(self): """Test large WARC header with a CRLF across a 1024 byte boundrary""" from .. import warc diff --git a/warc/utils.py b/warc/utils.py index 8620e8e..404eff8 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -7,11 +7,42 @@ :copyright: (c) 2012 Internet Archive """ -from UserDict import DictMixin +from collections import MutableMapping +import re -class CaseInsensitiveDict(DictMixin): +SEP = re.compile("[;:=]") + + +def status_code(protocol): + code = protocol.split(' ')[1] + return int(code) + + +def get_http_headers(f): + line = f.readline().decode('utf-8') + if not line.startswith("HTTP/1"): + f.seek(0) + return + + line = line.strip() + headers = { + 'protocol': line, + 'status_code': status_code(line), + } + for line in f: + line = line.decode('utf-8') + if not line.strip(): + break + name, content = line.split(':', 1) + name = name.strip() + content = content.strip() + headers[name.lower()] = content + return headers + + +class CaseInsensitiveDict(MutableMapping): """Almost like a dictionary, but keys are case-insensitive. 
- + >>> d = CaseInsensitiveDict(foo=1, Bar=2) >>> d['foo'] 1 @@ -23,71 +54,72 @@ class CaseInsensitiveDict(DictMixin): >>> d.keys() ["foo", "bar"] """ - def __init__(self, mapping=None, **kwargs): + def __init__(self, *args, **kwargs): self._d = {} - self.update(mapping, **kwargs) - + self.update(dict(*args, **kwargs)) + def __setitem__(self, name, value): self._d[name.lower()] = value - + def __getitem__(self, name): return self._d[name.lower()] - + def __delitem__(self, name): del self._d[name.lower()] - + def __eq__(self, other): return isinstance(other, CaseInsensitiveDict) and other._d == self._d - - def keys(self): - return self._d.keys() + + def __iter__(self): + return iter(self._d) + + def __len__(self): + return len(self._d) + class FilePart: """File interface over a part of file. - - Takes a file and length to read from the file and returns a file-object + + Takes a file and length to read from the file and returns a file-object over that part of the file. """ def __init__(self, fileobj, length): self.fileobj = fileobj self.length = length self.offset = 0 - self.buf = "" - + self.buf = b'' + def read(self, size=-1): if size == -1: - return self._read(self.length) - else: - return self._read(size) - - def _read(self, size): + size = self.length + if len(self.buf) >= size: content = self.buf[:size] self.buf = self.buf[size:] else: - size = min(size, self.length - self.offset - len(self.buf)) - content = self.buf + self.fileobj.read(size) - self.buf = "" + size = min(size, self.length - self.offset) + content = self.buf + self.fileobj.read(size - len(self.buf)) + self.buf = b'' self.offset += len(content) return content - + def _unread(self, content): self.buf = content + self.buf self.offset -= len(content) - - def readline(self): + + def readline(self, size=1024): chunks = [] - chunk = self._read(1024) - while chunk and "\n" not in chunk: + chunk = self.read(size) + while chunk and b"\n" not in chunk: chunks.append(chunk) - chunk = self._read(1024) - - if 
"\n" in chunk: - index = chunk.index("\n") + chunk = self.read(size) + + if b"\n" in chunk: + index = chunk.index(b"\n") self._unread(chunk[index+1:]) chunk = chunk[:index+1] chunks.append(chunk) - return "".join(chunks) + return b"".join(chunks) def __iter__(self): line = self.readline() diff --git a/warc/warc.py b/warc/warc.py index 0c762a6..a5eaae0 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -7,31 +7,31 @@ :copyright: (c) 2012 Internet Archive """ -import __builtin__ +import gzip +import builtins import datetime import uuid -import logging import re -from cStringIO import StringIO +import io import hashlib -from . import gzip2 -from .utils import CaseInsensitiveDict, FilePart +from .utils import CaseInsensitiveDict, FilePart, get_http_headers + class WARCHeader(CaseInsensitiveDict): """The WARC Header object represents the headers of a WARC record. - It provides dictionary like interface for accessing the headers. - + It provides dictionary like interface for accessing the headers. + The following mandatory fields are accessible also as attributes. - + * h.record_id == h['WARC-Record-ID'] * h.content_length == int(h['Content-Length']) * h.date == h['WARC-Date'] * h.type == h['WARC-Type'] - - :params headers: dictionary of headers. - :params defaults: If True, important headers like WARC-Record-ID, + + :params headers: dictionary of headers. + :params defaults: If True, important headers like WARC-Record-ID, WARC-Date, Content-Type and Content-Length are initialized to automatically if not already present. 
TODO: @@ -40,9 +40,9 @@ class WARCHeader(CaseInsensitiveDict): * url * ip_address * date (date of archival) - * content_type + * content_type * result_code (response code) - * checksum + * checksum * location * offset (offset from beginning of file to recrod) * filename (name of arc file) @@ -50,10 +50,10 @@ class WARCHeader(CaseInsensitiveDict): """ CONTENT_TYPES = dict(warcinfo='application/warc-fields', - response='application/http; msgtype=response', - request='application/http; msgtype=request', - metadata='application/warc-fields') - + response='application/http; msgtype=response', + request='application/http; msgtype=request', + metadata='application/warc-fields') + KNOWN_HEADERS = { "type": "WARC-Type", "date": "WARC-Date", @@ -65,16 +65,17 @@ class WARCHeader(CaseInsensitiveDict): "content_type": "Content-Type", "content_length": "Content-Length" } - + def __init__(self, headers, defaults=False): - self.version = "WARC/1.0" - CaseInsensitiveDict.__init__(self, headers) + super().__init__(headers) if defaults: self.init_defaults() - + self.version = "WARC/%s" % self.get('warc-version', '1.0') + def init_defaults(self): - """Initializes important headers to default values, if not already specified. - + """Initializes important headers to default values, + if not already specified. + The WARC-Record-ID header is set to a newly generated UUID. The WARC-Date header is set to the current datetime. The Content-Type is set based on the WARC-Type header. @@ -86,83 +87,112 @@ def init_defaults(self): self['WARC-Date'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') if "Content-Type" not in self: self['Content-Type'] = WARCHeader.CONTENT_TYPES.get(self.type, "application/octet-stream") - + def write_to(self, f): """Writes this header to a file, in the format specified by WARC. 
""" - f.write(self.version + "\r\n") + f.write(self.version.encode() + b"\r\n") for name, value in self.items(): name = name.title() # Use standard forms for commonly used patterns - name = name.replace("Warc-", "WARC-").replace("-Ip-", "-IP-").replace("-Id", "-ID").replace("-Uri", "-URI") - f.write(name) - f.write(": ") - f.write(value) - f.write("\r\n") - + name = (name.replace("Warc-", "WARC-") + .replace("-Ip-", "-IP-") + .replace("-Id", "-ID") + .replace("-Uri", "-URI")) + f.write(str(name).encode()) + f.write(b": ") + f.write(str(value).encode()) + f.write(b"\r\n") + # Header ends with an extra CRLF - f.write("\r\n") + f.write(b"\r\n") @property def content_length(self): """The Content-Length header as int.""" return int(self['Content-Length']) - + @property - def type(self): + def type(self): """The value of WARC-Type header.""" - return self.get('WARC-Type') - + return self['WARC-Type'] + @property def record_id(self): """The value of WARC-Record-ID header.""" return self['WARC-Record-ID'] - + @property def date(self): """The value of WARC-Date header.""" return self['WARC-Date'] - + def __str__(self): - f = StringIO() + f = io.BytesIO() self.write_to(f) - return f.getvalue() - + return str(f.getvalue(), 'utf-8') + def __repr__(self): return "" % (self.type, self.record_id) + class WARCRecord(object): """The WARCRecord object represents a WARC Record. """ def __init__(self, header=None, payload=None, headers={}, defaults=True): - """Creates a new WARC record. + """Creates a new WARC record. 
+ + @param payload must be of type 'bytes' or FilePart """ if header is None and defaults is True: headers.setdefault("WARC-Type", "response") self.header = header or WARCHeader(headers, defaults=True) - self.payload = payload - + if defaults is True and 'Content-Length' not in self.header: if payload: - self.header['Content-Length'] = str(len(payload)) + self.header['Content-Length'] = len(payload) else: self.header['Content-Length'] = "0" - + if defaults is True and 'WARC-Payload-Digest' not in self.header: self.header['WARC-Payload-Digest'] = self._compute_digest(payload) - + + if isinstance(payload, bytes): + payload = io.BytesIO(payload) + + self.payload = payload + self._content = None + + self._custom_cases() + + def _custom_cases(self): + # TODO: this need to be done using other pattern, but first we need + # tests + if self.version == '0.18': + self._custom_0_18() + + def _custom_0_18(self): + if not self.type == 'response': + return + + if not self['content-type'].startswith('application/http'): + return + + headers = get_http_headers(self.payload) + self.header['http_headers'] = headers + def _compute_digest(self, payload): return "sha1:" + hashlib.sha1(payload).hexdigest() - + def write_to(self, f): self.header.write_to(f) - f.write(self.payload) - f.write("\r\n") - f.write("\r\n") + f.write(self.payload.read()) + f.write(b"\r\n") + f.write(b"\r\n") f.flush() - + @property def type(self): """Record type""" @@ -172,11 +202,11 @@ def type(self): def url(self): """The value of the WARC-Target-URI header if the record is of type "response".""" return self.header.get('WARC-Target-URI') - + @property def ip_address(self): - """The IP address of the host contacted to retrieve the content of this record. - + """The IP address of the host contacted to retrieve the content of this record. 
+ This value is available from the WARC-IP-Address header.""" return self.header.get('WARC-IP-Address') @@ -184,159 +214,177 @@ def ip_address(self): def date(self): """UTC timestamp of the record.""" return self.header.get("WARC-Date") - + @property def checksum(self): return self.header.get('WARC-Payload-Digest') - + + @property + def version(self): + return self.header['warc-version'] + @property def offset(self): """Offset of this record in the warc file from which this record is read. """ pass - + def __getitem__(self, name): - return self.header[name] + try: + return self.header[name] + except KeyError: + if name == "content_type": + return self.content.type + elif name in self.content: + return self.content[name] def __setitem__(self, name, value): self.header[name] = value - + def __contains__(self, name): return name in self.header - + def __str__(self): - f = StringIO() + f = io.BytesIO() self.write_to(f) - return f.getvalue() - + return str(f.getvalue()) + def __repr__(self): - return "" % (self.type, self['WARC-Record-ID']) - + return "" % (self.type, + self['WARC-Record-ID']) + @staticmethod def from_response(response): """Creates a WARCRecord from given response object. - This must be called before reading the response. The response can be + This must be called before reading the response. The response can be read after this method is called. - + :param response: An instance of :class:`requests.models.Response`. """ # Get the httplib.HTTPResponse object http_response = response.raw._original_response - - # HTTP status line, headers and body as strings - status_line = "HTTP/1.1 %d %s" % (http_response.status, http_response.reason) + + # HTTP status line, headers as string + status_line = "HTTP/1.1 %d %s" % (http_response.status, + http_response.reason) headers = str(http_response.msg) - body = http_response.read() - # Monkey-patch the response object so that it is possible to read from it later. 
- response.raw._fp = StringIO(body) + # Read raw response data out of request + stream = io.BytesIO() + stream.write(status_line.encode()) + stream.write(b'\r\n') + stream.write(http_response.msg.as_bytes()) + stream.write(b'\r\n') + for chunk in response.iter_content(1024): + stream.write(chunk) + + payload = stream.getvalue() - # Build the payload to create warc file. - payload = status_line + "\r\n" + headers + "\r\n" + body - headers = { "WARC-Type": "response", - "WARC-Target-URI": response.request.full_url.encode('utf-8') + "WARC-Target-URI": response.request.url } return WARCRecord(payload=payload, headers=headers) + class WARCFile: def __init__(self, filename=None, mode=None, fileobj=None, compress=None): if fileobj is None: - fileobj = __builtin__.open(filename, mode or "rb") + fileobj = builtins.open(filename, mode or "rb") mode = fileobj.mode # initiaize compress based on filename, if not already specified if compress is None and filename and filename.endswith(".gz"): compress = True - + if compress: - fileobj = gzip2.GzipFile(fileobj=fileobj, mode=mode) - + fileobj = gzip.open(fileobj.name, mode) + self.fileobj = fileobj self._reader = None - + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self.close() + + def __iter__(self): + return iter(self.reader) + @property def reader(self): if self._reader is None: self._reader = WARCReader(self.fileobj) return self._reader - + def write_record(self, warc_record): """Adds a warc record to this WARC file. """ warc_record.write_to(self.fileobj) - # Each warc record is written as separate member in the gzip file - # so that each record can be read independetly. 
- if isinstance(self.fileobj, gzip2.GzipFile): - self.fileobj.close_member() - + def read_record(self): """Reads a warc record from this WARC file.""" return self.reader.read_record() - - def __iter__(self): - return iter(self.reader) - + def close(self): self.fileobj.close() - + def browse(self): """Utility to browse through the records in the warc file. - - This returns an iterator over (record, offset, size) for each record in - the file. If the file is gzip compressed, the offset and size will - corresponds to the compressed file. - - The payload of each record is limited to 1MB to keep memory consumption + + This returns an iterator over (record, offset, size) for each record in + the file. If the file is gzip compressed, the offset and size will + corresponds to the compressed file. + + The payload of each record is limited to 1MB to keep memory consumption under control. """ offset = 0 for record in self.reader: # Just read the first 1MB of the payload. - # This will make sure memory consuption is under control and it - # is possible to look at the first MB of the payload, which is + # This will make sure memory consuption is under control and it + # is possible to look at the first MB of the payload, which is # typically sufficient to read http headers in the payload. - record.payload = StringIO(record.payload.read(1024*1024)) + record.payload = io.BytesIO(record.payload.read(1024*1024)) self.reader.finish_reading_current_record() next_offset = self.tell() yield record, offset, next_offset-offset offset = next_offset def tell(self): - """Returns the file offset. If this is a compressed file, then the - offset in the compressed file is returned. + """Returns the file offset. 
""" - if isinstance(self.fileobj, gzip2.GzipFile): - return self.fileobj.fileobj.tell() - else: - return self.fileobj.tell() - + return self.fileobj.tell() + + class WARCReader: RE_VERSION = re.compile("WARC/(\d+.\d+)\r\n") RE_HEADER = re.compile(r"([a-zA-Z_\-]+): *(.*)\r\n") - SUPPORTED_VERSIONS = ["1.0"] - + SUPPORTED_VERSIONS = ["1.0", "0.18"] + def __init__(self, fileobj): self.fileobj = fileobj self.current_payload = None - + def read_header(self, fileobj): - version_line = fileobj.readline() + version_line = fileobj.readline().decode("utf-8") if not version_line: return None - + m = self.RE_VERSION.match(version_line) if not m: raise IOError("Bad version line: %r" % version_line) version = m.group(1) if version not in self.SUPPORTED_VERSIONS: raise IOError("Unsupported WARC version: %s" % version) - - headers = {} + + headers = { + 'warc-version': version, + } while True: - line = fileobj.readline() - if line == "\r\n": # end of headers + line = fileobj.readline().decode("utf-8") + if line == "\r\n": # end of headers break m = self.RE_HEADER.match(line) if not m: @@ -344,42 +392,42 @@ def read_header(self, fileobj): name, value = m.groups() headers[name] = value return WARCHeader(headers) - + def expect(self, fileobj, expected_line, message=None): - line = fileobj.readline() + line = fileobj.readline().decode("utf-8") if line != expected_line: message = message or "Expected %r, found %r" % (expected_line, line) raise IOError(message) - + def finish_reading_current_record(self): # consume the footer from the previous record if self.current_payload: - # consume all data from the current_payload before moving to next record + # consume all data from the current_payload before + # moving to next record self.current_payload.read() self.expect(self.current_payload.fileobj, "\r\n") - self.expect(self.current_payload.fileobj, "\r\n") + if self.current_payload.length: + self.expect(self.current_payload.fileobj, "\r\n") self.current_payload = None def 
#! /usr/bin/env python3
"""warcscrape: command-line tool to filter, count and extract records
from WARC (Web ARChive) files.

Modes (selected with -dump):
  * warc    -- write every record that survives the filters to a new WARC file
  * content -- reconstruct the archived sites' file trees on disk
"""
import os
import re
import argparse
import sys
import mimetypes
from urllib.parse import urlparse, unquote
from pprint import pprint
import shutil

# Global tally printed at the end of parse().
# Maps counter-name -> int, or group-name -> {key -> int}.
counts = {}


class filterObject:
    """One parsed filter expression of the form ``[!][http:]<header>:<value>``.

    Attributes:
        result: True for a positive filter ("keep when value present"),
                False when the expression was prefixed with "!".
        http:   True when the filter targets the record's HTTP headers
                rather than its WARC headers.
        k, v:   header name and substring to look for (lower-cased).
    """

    def __init__(self, string):
        self.result = True
        if string[0] == "!":          # a leading "!" negates the filter
            self.result = False
            string = string[1:]

        parts = string.lower().split(":")

        self.http = (parts[0] == 'http')
        if self.http:                 # drop the "http" routing prefix
            del parts[0]

        self.k = parts[0]
        # Rejoin so values that themselves contain ":" are kept whole
        # (the previous code silently discarded everything past parts[1]).
        self.v = ":".join(parts[1:])


def inc(obj, header=False, dic=False):
    """Increment a counter in the global `counts` tally.

    obj    -- the key to count, or a mapping when `header` is given.
    header -- optional key to look up in `obj` first (missing -> None).
    dic    -- optional sub-dictionary name to group the counter under.
    """
    if header:
        try:
            obj = obj[header]
        except KeyError:
            obj = None                # count missing headers under None

    holder = counts
    if dic:
        holder = counts.setdefault(dic, {})

    holder[obj] = holder.get(obj, 0) + 1


def warc_records(string, path):
    """Yield every record of each ``*.warc`` file in `path` whose
    filename matches the regular expression `string`."""
    # Imported lazily so the pure helper functions in this module stay
    # importable even when the third-party `warc` package is absent.
    import warc
    for filename in os.listdir(path):
        if re.search(string, filename) and ".warc" in filename:
            print("parsing", filename)
            with warc.open(path + filename) as warc_file:
                for record in warc_file:
                    yield record


def checkFilter(filters, record):
    """Return True when `record` satisfies every filter in `filters`.

    A record fails an http-targeted filter outright when it carries no
    HTTP payload.  NOTE: a negated filter still requires the header to
    be present -- a missing header always rejects the record.
    """
    for f in filters:
        if f.http:
            if not record.http:
                return False
            headers = record.http
        else:
            headers = record.header

        value = headers.get(f.k, None)
        # The substring test must agree with the filter's polarity:
        # found for positive filters, not-found for negated ones.
        if not value or (f.v in value) != f.result:
            return False
    return True


def _unique_path(path):
    """Return `path`, or `path` with "(n)" inserted before the final
    extension when a file of that name already exists."""
    index = path.rfind(".")
    candidate = path
    n = 0
    while os.path.isfile(candidate):
        n += 1
        candidate = path[:index] + "(" + str(n) + ")" + path[index:]
    return candidate


def _dump_content(record, output_path):
    """Write the record's HTTP payload into a directory tree mirroring
    the archived site (helper for parse(), -dump content)."""
    url = urlparse(unquote(record['WARC-Target-URI']))

    # Split the URL path into a directory part and a file name.
    index = url.path.rfind("/") + 1
    file = url.path[index:]
    path = url.path[:index]

    # Bare directory requests become index.html inside that directory.
    if "." not in file:
        path += file
        if not path.endswith("/"):
            path += "/"
        file = 'index.html'

    # Dots in directory names would confuse the extension handling below.
    path = path.replace(".", "-")
    host = url.hostname.replace('www.', '', 1)
    path = output_path + host + path

    if not os.path.exists(path):
        try:
            os.makedirs(path)
        except OSError:
            # Component too long for the filesystem: shorten each path
            # component to 25 characters and retry.
            path = "/".join([i[:25] for i in path.split("/")])
            os.makedirs(path)

    # Make the file extension agree with the HTTP content type.
    index = file.index(".")   # always present: defaulted to index.html above
    suffix = file[index:]
    content = record.http.get("content_type", "")
    if suffix not in mimetypes.guess_all_extensions(content):
        suffix = mimetypes.guess_extension(content)
        if suffix:
            file = file[:index] + suffix
        else:
            inc(record.http, "content_type", "unknown mime type")

    # Keep gzip-encoded bodies recognisable.
    if record.http.get("content-encoding", None) == "gzip":
        file += ".gz"

    # Never overwrite an existing file; insert "(n)" before the suffix.
    path = _unique_path(path + file)

    with open(path, 'wb') as fp:
        record.http.write_payload_to(fp)


def parse(args):
    """Walk every matching WARC file, applying the filters, tallying
    header counts, and optionally dumping records (see -dump)."""
    # Truncate the output warc once up front; records are appended below.
    if args.dump == "warc":
        print("Recording", args.dump, "to", args.output + ".")
        with open(args.output_path + args.output, "wb"):
            pass

    for record in warc_records(args.string, args.path):
        try:
            # Drop entries the user filtered out.
            if not checkFilter(args.filter, record):
                continue

            # Tally counters.  argparse's -silence stores False, so the
            # counting (and the final report) runs by default.
            if args.silence:
                inc("records")
                inc(record, "warc-type", "types")
                inc(record, "content_type", "warc-content")
                if record.http:
                    inc(record.http, "content_type", "http-content")
                    inc(record.http, "error", "status")

            if args.dump == "warc":
                with open(args.output_path + args.output, "ab") as output:
                    record.write_to(output)

            if args.dump == "content":
                _dump_content(record, args.output_path)
        except Exception:
            # A bare `except:` would also trap KeyboardInterrupt and
            # SystemExit; catch Exception so Ctrl-C still works.
            if args.error:
                print("Error in record. Recording to error.warc.")
                # Append: mode "wb" here would overwrite all but the
                # last failing record.
                with open(args.output_path + "error.warc", "ab") as fp:
                    record.write_to(fp)
            else:
                raise

    # Print the tallies gathered above.
    if args.silence:
        print("-----------------------------")
        for name in counts:
            print("\nCount of {}.".format(name))
            pprint(counts[name])


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Extracts attributes from warc files.')
    parser.add_argument("filter", nargs='*', help="Attributes to filter by. Entries that do not contain filtered elements are ignored. Example: warc-type:response, would ignore all warc entries that are not responses. Attributes in an HTTP object should be prefixed by 'http'. Example, http:error:200.")
    parser.add_argument("-silence", action="store_false", help="Silences output of warc data.")
    parser.add_argument("-error", action="store_true", help="Silences most errors and records problematic warc entries to error.warc.")
    parser.add_argument("-string", default="", help="Regular expression to limit parsed warc files. Defaults to empty string.")
    parser.add_argument("-path", default="./", help="Path to folder containing warc files. Defaults to current folder.")
    parser.add_argument("-output_path", default="data/", help="Path to folder to dump content files. Defaults to data/ folder.")
    parser.add_argument("-output", default="output.warc", help="File to output warc contents. Defaults to 'output.warc'.")
    parser.add_argument("-dump", choices=['warc', 'content'], type=str, help="Dumps all entries that survived filter. 'warc' creates a filtered warc file. 'content' tries to reproduce file structure of archived websites.")
    args = parser.parse_args()

    # Normalise directory arguments to end in "/".
    if args.path[-1] != "/":
        args.path += "/"
    if args.output_path[-1] != "/":
        args.output_path += "/"

    if not os.path.exists(args.output_path):
        os.makedirs(args.output_path)

    # Content extraction only makes sense for HTTP response records.
    if args.dump == "content":
        args.filter.append("warc-type:response")
        args.filter.append("content-type:application/http")

    args.filter = [filterObject(i) for i in args.filter]

    args.string = re.compile(args.string)
    parse(args)