diff --git a/warc/arc.py b/warc/arc.py index 5889587..e4e25df 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -259,6 +259,7 @@ def __init__(self, filename=None, mode=None, fileobj=None, version = None, file_ self.file_headers = file_headers self.header_written = False self.header_read = False + self.file_meta = '' def _write_header(self): @@ -311,7 +312,6 @@ def _read_file_header(self): payload1 = self.fileobj.readline() payload2 = self.fileobj.readline() version, reserved, organisation = payload1.split(None, 2) - self.fileobj.readline() # Lose the separator newline self.header_read = True # print "--------------------------------------------------" # print header,"\n", payload1, "\n", payload2,"\n" @@ -334,6 +334,14 @@ def _read_file_header(self): else: raise IOError("Unknown ARC version '%s'"%version) + current = len(payload1) + len(payload2) + self.file_meta = '' + while current < int(length): + line = self.fileobj.readline() + current = current + len(line) + self.file_meta = self.file_meta + line + self.fileobj.readline() # Lose the separator newline + def _read_arc_record(self): "Reads out an arc record, formats it and returns it" #XXX:Noufal Stream payload here rather than just read it