diff --git a/warc/utils.py b/warc/utils.py index 8fb783b..bfb8bc5 100644 --- a/warc/utils.py +++ b/warc/utils.py @@ -14,7 +14,7 @@ class CaseInsensitiveDict(DictMixin): """Almost like a dictionary, but keys are case-insensitive. - + >>> d = CaseInsensitiveDict(foo=1, Bar=2) >>> d['foo'] 1 @@ -29,16 +29,16 @@ class CaseInsensitiveDict(DictMixin): def __init__(self, *args, **kwargs): self._d = {} self.update(*args, **kwargs) - + def __setitem__(self, name, value): self._d[name.lower()] = value - + def __getitem__(self, name): return self._d[name.lower()] - + def __delitem__(self, name): del self._d[name.lower()] - + def __eq__(self, other): return isinstance(other, CaseInsensitiveDict) and other._d == self._d @@ -54,22 +54,22 @@ def keys(self): class FilePart: """File interface over a part of file. - - Takes a file and length to read from the file and returns a file-object + + Takes a file and length to read from the file and returns a file-object over that part of the file. """ def __init__(self, fileobj, length): self.fileobj = fileobj self.length = length self.offset = 0 - self.buf = "" - + self.buf = self.fileobj.read(0) + def read(self, size=-1): if size == -1: return self._read(self.length) else: return self._read(size) - + def _read(self, size): if len(self.buf) >= size: content = self.buf[:size] @@ -77,21 +77,25 @@ def _read(self, size): else: size = min(size, self.length - self.offset - len(self.buf)) content = self.buf + self.fileobj.read(size) - self.buf = "" + self.buf = type(self.buf)() self.offset += len(content) + + if isinstance(content, bytes): + content = content.decode("utf-8") + return content - + def _unread(self, content): self.buf = content + self.buf self.offset -= len(content) - + def readline(self): chunks = [] chunk = self._read(1024) while chunk and "\n" not in chunk: chunks.append(chunk) chunk = self._read(1024) - + if "\n" in chunk: index = chunk.index("\n") self._unread(chunk[index+1:]) diff --git a/warc/warc.py b/warc/warc.py index d15d839..5900b23 100644 --- a/warc/warc.py +++ b/warc/warc.py @@ -26,17 +26,17 @@ class WARCHeader(CaseInsensitiveDict): """The WARC Header object represents the headers of a WARC record. - It provides dictionary like interface for accessing the headers. - + It provides dictionary like interface for accessing the headers. + The following mandatory fields are accessible also as attributes. - + * h.record_id == h['WARC-Record-ID'] * h.content_length == int(h['Content-Length']) * h.date == h['WARC-Date'] * h.type == h['WARC-Type'] - - :params headers: dictionary of headers. - :params defaults: If True, important headers like WARC-Record-ID, + + :params headers: dictionary of headers. + :params defaults: If True, important headers like WARC-Record-ID, WARC-Date, Content-Type and Content-Length are initialized to automatically if not already present. TODO: @@ -45,9 +45,9 @@ class WARCHeader(CaseInsensitiveDict): * url * ip_address * date (date of archival) - * content_type + * content_type * result_code (response code) - * checksum + * checksum * location * offset (offset from beginning of file to recrod) * filename (name of arc file) @@ -58,7 +58,7 @@ class WARCHeader(CaseInsensitiveDict): response='application/http; msgtype=response', request='application/http; msgtype=request', metadata='application/warc-fields') - + KNOWN_HEADERS = { "type": "WARC-Type", "date": "WARC-Date", @@ -70,16 +70,16 @@ class WARCHeader(CaseInsensitiveDict): "content_type": "Content-Type", "content_length": "Content-Length" } - + def __init__(self, headers, defaults=False): self.version = b"WARC/1.0" CaseInsensitiveDict.__init__(self, headers) if defaults: self.init_defaults() - + def init_defaults(self): """Initializes important headers to default values, if not already specified. - + The WARC-Record-ID header is set to a newly generated UUID. The WARC-Date header is set to the current datetime. The Content-Type is set based on the WARC-Type header. @@ -91,7 +91,7 @@ def init_defaults(self): self['WARC-Date'] = datetime.datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%SZ') if "Content-Type" not in self: self['Content-Type'] = WARCHeader.CONTENT_TYPES.get(self.type, "application/octet-stream") - + def write_to(self, f): """Writes this header to a file, in the format specified by WARC. """ @@ -104,7 +104,7 @@ def write_to(self, f): f.write(b": ") f.write(value.encode('utf-8')) f.write(b"\r\n") - + # Header ends with an extra CRLF f.write(b"\r\n") @@ -114,7 +114,7 @@ def content_length(self): return int(self['Content-Length']) @property - def type(self): + def type(self): """The value of WARC-Type header.""" return self.get('WARC-Type') @@ -122,18 +122,18 @@ def type(self): def record_id(self): """The value of WARC-Record-ID header.""" return self['WARC-Record-ID'] - + @property def date(self): """The value of WARC-Date header.""" return self['WARC-Date'] - + def __str__(self): from io import BytesIO f = BytesIO() self.write_to(f) return f.getvalue().decode('utf-8') - + def __repr__(self): return "" % (self.type, self.record_id) @@ -141,7 +141,7 @@ class WARCRecord(object): """The WARCRecord object represents a WARC Record. """ def __init__(self, header=None, payload=None, headers={}, defaults=True): - """Creates a new WARC record. + """Creates a new WARC record. """ if header is None and defaults is True: @@ -149,19 +149,19 @@ def __init__(self, header=None, payload=None, headers={}, defaults=True): self.header = header or WARCHeader(headers, defaults=True) self.payload = payload - + if defaults is True and 'Content-Length' not in self.header: if payload: self.header['Content-Length'] = str(len(payload)) else: self.header['Content-Length'] = "0" - + if defaults is True and 'WARC-Payload-Digest' not in self.header: self.header['WARC-Payload-Digest'] = self._compute_digest(payload) - + def _compute_digest(self, payload): return "sha1:" + hashlib.sha1(payload).hexdigest() - + def write_to(self, f): if isinstance(self.payload, bytes): line_break = b"\r\n" @@ -172,7 +172,7 @@ def write_to(self, f): f.write(line_break) f.write(line_break) f.flush() - + @property def type(self): """Record type""" @@ -182,11 +182,11 @@ def type(self): def url(self): """The value of the WARC-Target-URI header if the record is of type "response".""" return self.header.get('WARC-Target-URI') - + @property def ip_address(self): - """The IP address of the host contacted to retrieve the content of this record. - + """The IP address of the host contacted to retrieve the content of this record. + This value is available from the WARC-IP-Address header.""" return self.header.get('WARC-IP-Address') @@ -194,46 +194,46 @@ def ip_address(self): def date(self): """UTC timestamp of the record.""" return self.header.get("WARC-Date") - + @property def checksum(self): return self.header.get('WARC-Payload-Digest') - + @property def offset(self): """Offset of this record in the warc file from which this record is read. """ pass - + def __getitem__(self, name): return self.header[name] def __setitem__(self, name, value): self.header[name] = value - + def __contains__(self, name): return name in self.header - + def __str__(self): f = StringIO() self.write_to(f) return f.getvalue() - + def __repr__(self): return "" % (self.type, self['WARC-Record-ID']) - + @staticmethod def from_response(response): """Creates a WARCRecord from given response object. - This must be called before reading the response. The response can be + This must be called before reading the response. The response can be read after this method is called. - + :param response: An instance of :class:`requests.models.Response`. """ # Get the httplib.HTTPResponse object http_response = response.raw._original_response - + # HTTP status line, headers and body as strings status_line = "HTTP/1.1 %d %s" % (http_response.status, http_response.reason) headers = str(http_response.msg) @@ -244,7 +244,7 @@ def from_response(response): # Build the payload to create warc file. payload = status_line + "\r\n" + headers + "\r\n" + body - + headers = { "WARC-Type": "response", "WARC-Target-URI": response.request.full_url.encode('utf-8') @@ -259,19 +259,19 @@ def __init__(self, filename=None, mode=None, fileobj=None, compress=None): # initiaize compress based on filename, if not already specified if compress is None and filename and filename.endswith(".gz"): compress = True - + if compress: fileobj = gzip2.GzipFile(fileobj=fileobj, mode=mode) - + self.fileobj = fileobj self._reader = None - + @property def reader(self): if self._reader is None: self._reader = WARCReader(self.fileobj) return self._reader - + def write_record(self, warc_record): """Adds a warc record to this WARC file. """ @@ -280,32 +280,32 @@ def write_record(self, warc_record): # so that each record can be read independetly. if isinstance(self.fileobj, gzip2.GzipFile): self.fileobj.close_member() - + def read_record(self): """Reads a warc record from this WARC file.""" return self.reader.read_record() - + def __iter__(self): return iter(self.reader) - + def close(self): self.fileobj.close() - + def browse(self): """Utility to browse through the records in the warc file. - - This returns an iterator over (record, offset, size) for each record in - the file. If the file is gzip compressed, the offset and size will - corresponds to the compressed file. - - The payload of each record is limited to 1MB to keep memory consumption + + This returns an iterator over (record, offset, size) for each record in + the file. If the file is gzip compressed, the offset and size will + corresponds to the compressed file. + + The payload of each record is limited to 1MB to keep memory consumption under control. """ offset = 0 for record in self.reader: # Just read the first 1MB of the payload. - # This will make sure memory consuption is under control and it - # is possible to look at the first MB of the payload, which is + # This will make sure memory consuption is under control and it + # is possible to look at the first MB of the payload, which is # typically sufficient to read http headers in the payload. record.payload = StringIO(record.payload.read(1024*1024)) self.reader.finish_reading_current_record() @@ -314,23 +314,23 @@ def browse(self): offset = next_offset def tell(self): - """Returns the file offset. If this is a compressed file, then the + """Returns the file offset. If this is a compressed file, then the offset in the compressed file is returned. """ if isinstance(self.fileobj, gzip2.GzipFile): return self.fileobj.fileobj.tell() else: - return self.fileobj.tell() - + return self.fileobj.tell() + class WARCReader: RE_VERSION = re.compile("WARC/(\d+.\d+)\r\n") RE_HEADER = re.compile(r"([a-zA-Z_\-]+): *(.*)\r\n") SUPPORTED_VERSIONS = ["1.0"] - + def __init__(self, fileobj): self.fileobj = fileobj self.current_payload = None - + def read_header(self, fileobj): version_line = fileobj.readline() if not version_line: @@ -343,7 +343,7 @@ def read_header(self, fileobj): version = m.group(1) if version not in self.SUPPORTED_VERSIONS: raise IOError("Unsupported WARC version: %s" % version) - + headers = {} while True: line = fileobj.readline() @@ -357,20 +357,20 @@ def read_header(self, fileobj): name, value = m.groups() headers[name] = value return WARCHeader(headers) - + def expect(self, fileobj, expected_line, message=None): line = fileobj.readline() if line != expected_line: message = message or "Expected %r, found %r" % (expected_line, line) raise IOError(message) - + def finish_reading_current_record(self): # consume the footer from the previous record if self.current_payload: # consume all data from the current_payload before moving to next record self.current_payload.read() - self.expect(self.current_payload.fileobj, "\r\n") - self.expect(self.current_payload.fileobj, "\r\n") + self.expect(self.current_payload.fileobj, b"\r\n") + self.expect(self.current_payload.fileobj, b"\r\n") self.current_payload = None def read_record(self): @@ -382,11 +382,11 @@ def read_record(self): return None else: fileobj = self.fileobj - + header = self.read_header(fileobj) if header is None: return None - + self.current_payload = FilePart(fileobj, header.content_length) record = WARCRecord(header, self.current_payload, defaults=False) return record