diff --git a/warc/arc.py b/warc/arc.py index 5889587..1d20916 100644 --- a/warc/arc.py +++ b/warc/arc.py @@ -346,6 +346,17 @@ def _read_arc_record(self): header = self.fileobj.readline() while header and header.strip() == "": header = self.fileobj.readline() + + #JG: this block keeps the header parser / reader from getting stuck + #on the block of XML that can appear in ARC files, by reading past it + #NOTE(review): blank lines are already skipped above, so startswith("\n") may never match - confirm + + if header.startswith("\n"): + header = self.fileobj.readline() + header = self.fileobj.readline() + header = self.fileobj.readline() + if header == "": return None