17 changes: 0 additions & 17 deletions .hgignore

This file was deleted.

9 changes: 0 additions & 9 deletions .hgtags

This file was deleted.

6 changes: 1 addition & 5 deletions .travis.yml
@@ -6,14 +6,10 @@ python:
- 3.3
- 3.4
- 3.5
- 3.5-dev
- nightly
- pypy
- pypy3

matrix:
allow_failures:
- python: 3.5
- python: nightly

script: python setup.py test

76 changes: 0 additions & 76 deletions README

This file was deleted.

87 changes: 87 additions & 0 deletions README.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
.. image:: https://travis-ci.org/internetarchive/warctools.svg?branch=master
    :target: https://travis-ci.org/internetarchive/warctools

warctools
=========

dependencies

- setuptools
- unittest2
- python 2.7, 3.2+

hanzo warc tools:

warcvalid.py
returns 0 if all of the arguments are valid arc/warc files,
non-zero on error
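
for scripting, the exit status can be checked directly; a minimal
sketch (assumes warcvalid.py is on your path)::

    import subprocess
    import sys

    # warcvalid exits 0 only when every argument parses as arc/warc
    def all_valid(paths):
        return subprocess.call([sys.executable, 'warcvalid.py'] + list(paths)) == 0

    print(all_valid(['foo.warc', 'foo.warc.gz']))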

warcdump.py - writes a human-readable summary of warc files:
usage: ``python warcdump.py foo.warc foo.warc.gz``

autodetects the input format when filenames are passed,
i.e. record-gzip vs plaintext, warc vs arc

assumes uncompressed warc on stdin if no args

warcfilter.py
``python warcfilter.py pattern file file file`` --
searches all headers for regex pattern

- use -i to invert search
- use -U to constrain to url
- use -T to constrain to record type
- use -C to constrain to content-type

autodetects input format and reads stdin like warcdump

prints output in warc format by default.
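
for example, to pull html records out of a crawl using the documented
pattern form (a sketch; the flags above compose the same way)::

    import subprocess
    import sys

    # keep records whose headers match the regex; add -i to invert,
    # or -C to constrain the match to the content-type header
    subprocess.call([sys.executable, 'warcfilter.py', 'text/html', 'crawl.warc.gz'])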

warc2warc.py:
``python warc2warc <input files>``

autodetects compression of file arguments,
assumes uncompressed stdin if none are given

use -Z to write compressed output,
e.g. ``warc2warc -Z input > input.gz``

should ignore buggy records in input
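
the same copy can be written against the record api; a rough sketch,
assuming the reader interface used by these tools
(``WarcRecord.open_archive`` / ``read_records`` / ``write_to``)::

    import sys
    from hanzo.warctools import WarcRecord

    # copy records to compressed output on stdout, skipping any
    # records that failed to parse
    fh = WarcRecord.open_archive('input.warc', gzip="auto")
    for offset, record, errors in fh.read_records(limit=None):
        if record:
            record.write_to(sys.stdout.buffer, gzip=True)
    fh.close()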

arc2warc.py
creates a crappy warc file from arc files on input
a handful of headers are preserved
use -Z to write compressed output,
e.g. ``arc2warc -Z input.arc > input.warc.gz``

warcindex.py
spits out an index like this::

    #WARC filename offset warc-type warc-subject-uri warc-record-id content-type content-length
    warccrap/mywarc.warc 1196018 request /images/slides/hanzo_markm__wwwoh.pdf <urn:uuid:fd1255a8-d07c-11df-b125-12313b0a18c6> application/http;msgtype=request 193
    warccrap/mywarc.warc 1196631 response http://www.hanzoarchives.com/images/slides/hanzo_markm__wwwoh.pdf <urn:uuid:fd2614f8-d07c-11df-b125-12313b0a18c6> application/http;msgtype=response 3279474

not great, but a start
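
the columns map onto record attributes; a sketch of the same index
via the reader api (attribute names as used elsewhere in this repo)::

    from hanzo.warctools import WarcRecord

    # print one index line per record, mirroring warcindex.py's columns
    def index(filename):
        fh = WarcRecord.open_archive(filename, gzip="auto")
        for offset, record, errors in fh.read_records(limit=None):
            if record:
                print(filename, offset, record.type, record.url,
                      record.id, record.content_type, record.content_length)
        fh.close()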

notes:
arc2warc uses the conversion rules from the earlier arc2warc.c
as a starting point for converting the headers

I haven't profiled the code yet (and don't plan to until it falls over)

warcvalid only loosely follows the iso standard;
missing things: strict whitespace checks, required-headers check,
mime quoted-printable header encoding,
treating headers as utf8

things left to do (in no order):

- lots more testing.
- support pre-1.0 warc files
- add more documentation
- support more command-line options for output and filenames
- s3 urls


-- tef [email protected]
66 changes: 45 additions & 21 deletions hanzo/warctools/arc.py
@@ -68,7 +68,6 @@ def rx(pat):
nl_rx = rx('^\r\n|\r|\n$')
length_rx = rx(b'^' + ArcRecord.CONTENT_LENGTH + b'$') #pylint: disable-msg=E1101
type_rx = rx(b'^' + ArcRecord.CONTENT_TYPE + b'$') #pylint: disable-msg=E1101
SPLIT = re.compile(br'\b\s|\s\b').split

class ArcParser(ArchiveParser):
    """A parser for arc archives."""
@@ -115,16 +114,21 @@ def parse(self, stream, offset, line=None):
            # configure parser instance
            self.version = arc_version.split()[0]
            self.headers = arc_names_line.strip().split()

            # raj: some v1 ARC files are incorrectly sending a v2 header names line
            if arc_names_line == b'URL IP-address Archive-date Content-type Result-code Checksum Location Offset Filepath Archive-length\n':
                if arc_version == b'1 0 InternetArchive' and 5 == len(line.split(b' ')):
                    self.headers = [b'URL', b'IP-address', b'Archive-date', b'Content-type', b'Archive-length']

            # now that we have read the header-names line from the record body,
            # we can extract the headers from the current record and read the
            # length field, which is in a different place for v1 and v2

            # read headers
            arc_headers = self.parse_header_list(line)

            # extract content, ignoring header lines parsed already
            content_type, content_length, errors = \
                self.get_content_headers(arc_headers)
@@ -139,7 +143,11 @@ def parse(self, stream, offset, line=None):
                               raw_headers=raw_headers)
        else:
            if not self.headers:
                raise Exception('missing filedesc')
                # raj: some arc files are missing the filedesc:// line
                #raise Exception('missing filedesc')
                self.version = b'1'
                self.headers = [b'URL', b'IP-address', b'Archive-date', b'Content-type', b'Archive-length']

            headers = self.parse_header_list(line)
            content_type, content_length, errors = \
                self.get_content_headers(headers)
@@ -157,21 +165,32 @@ def trim(self, stream):
        return ()

    def parse_header_list(self, line):
        # some people use ' ' as the empty value. lovely.
        line = line.rstrip(b'\r\n')
        values = SPLIT(line)
        if len(self.headers) != len(values):
            if self.headers[0] in (ArcRecord.URL, ArcRecord.CONTENT_TYPE):
                # fencepost
                values = [s[::-1] for s in reversed(SPLIT(line[::-1], len(self.headers)-1))]
            else:
                values = SPLIT(line, len(self.headers)-1)

            if len(self.headers) != len(values):
                raise Exception('missing headers %s %s'%(",".join(values), ",".join(self.headers)))

        return list(zip(self.headers, values))

        values = line.strip().split(b' ')
        num_values = len(values)

        #raj: some headers contain urls with unescaped spaces
        if num_values > 5:
            if re.match(br'^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]) and re.match(br'^\d{14}$', values[-3]) and re.match(br'^\d+$', values[-1]):
                values = [b'%20'.join(values[0:-4]), values[-4], values[-3], values[-2], values[-1]]
                num_values = len(values)

        if 4 == num_values:
            #raj: alexa arc files don't always have content-type in header
            return list(zip(self.short_headers, values))
        elif 5 == num_values:
            #normal case
            #raj: some old alexa arcs have ip-address and date transposed in the header
            if re.match(br'^\d{14}$', values[1]) and re.match(br'^(?:\d{1,3}\.){3}\d{1,3}$', values[2]):
                values[1], values[2] = values[2], values[1]

            return list(zip(self.headers, values))
        elif 6 == num_values:
            #raj: some old alexa arcs have "content-type; charset" in the header
            v = values[0:4]+values[5:]
            v[3] = v[3].rstrip(b';')
            return list(zip(self.headers, v))
        else:
            raise Exception('invalid number of header fields')

    @staticmethod
    def get_content_headers(headers):
@@ -195,3 +214,8 @@ def get_content_headers(headers):


register_record_type(re.compile(br'^filedesc://'), ArcRecord)

# raj: some arc files are missing the filedesc:// line
url_record_regex = re.compile(br'^https?://\S+ (?:\d{1,3}\.){3}\d{1,3} \d{14} \S+ \d+$')
register_record_type(url_record_regex, ArcRecord)
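
The fallback rules above are easiest to see on concrete inputs. A standalone
sketch of the same heuristics applied to hypothetical v1 header lines (the
real logic lives in ArcParser.parse_header_list):

    import re

    # hypothetical v1 arc header lines, as bytes, one per quirk handled above
    lines = [
        b'http://example.com/a b.pdf 10.0.0.1 20060101120000 application/pdf 1234',
        b'http://example.com/x 20060101120000 10.0.0.1 text/html 99',
        b'http://example.com/y 10.0.0.1 20060101120000 text/html; charset=utf-8 99',
    ]

    for line in lines:
        values = line.strip().split(b' ')
        if len(values) > 5 and re.match(br'^(?:\d{1,3}\.){3}\d{1,3}$', values[-4]):
            # unescaped spaces in the url: re-join the leading fields with %20
            values = [b'%20'.join(values[:-4])] + values[-4:]
        if len(values) == 6:
            # "content-type; charset" split the line: drop charset, trim the ';'
            values = values[:4] + values[5:]
            values[3] = values[3].rstrip(b';')
        if re.match(br'^\d{14}$', values[1]):
            # date and ip-address transposed: swap them back
            values[1], values[2] = values[2], values[1]
        print(values)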
