
Commit 9660132

Implement format autodetection for decompression
1 parent 0262f89 commit 9660132

5 files changed: +277 -33 lines changed

snappy/__main__.py

Lines changed: 21 additions & 29 deletions
@@ -2,27 +2,8 @@
 import io
 import sys
 
-from .snappy import stream_compress, stream_decompress
-from .hadoop_snappy import (
-    stream_compress as hadoop_stream_compress,
-    stream_decompress as hadoop_stream_decompress)
-
-
-FRAMING_FORMAT = 'framing'
-
-HADOOP_FORMAT = 'hadoop_snappy'
-
-DEFAULT_FORMAT = FRAMING_FORMAT
-
-COMPRESS_METHODS = {
-    FRAMING_FORMAT: stream_compress,
-    HADOOP_FORMAT: hadoop_stream_compress,
-}
-
-DECOMPRESS_METHODS = {
-    FRAMING_FORMAT: stream_decompress,
-    HADOOP_FORMAT: hadoop_stream_decompress,
-}
+from . import snappy_formats as formats
+from .snappy import UncompressError
 
 
 def cmdline_main():
@@ -58,9 +39,11 @@ def cmdline_main():
     parser.add_argument(
         '-t',
         dest='target_format',
-        default=DEFAULT_FORMAT,
-        choices=[FRAMING_FORMAT, HADOOP_FORMAT],
-        help='Target format, default is {}'.format(DEFAULT_FORMAT)
+        default=formats.DEFAULT_FORMAT,
+        choices=formats.ALL_SUPPORTED_FORMATS,
+        help=(
+            'Target format, default is "{}"'.format(formats.DEFAULT_FORMAT)
+        )
     )
 
     parser.add_argument(
@@ -79,18 +62,27 @@ def cmdline_main():
     )
 
     args = parser.parse_args()
-    if args.compress:
-        method = COMPRESS_METHODS[args.target_format]
-    else:
-        method = DECOMPRESS_METHODS[args.target_format]
 
     # workaround for https://bugs.python.org/issue14156
     if isinstance(args.infile, io.TextIOWrapper):
         args.infile = stdin
     if isinstance(args.outfile, io.TextIOWrapper):
         args.outfile = stdout
 
-    method(args.infile, args.outfile)
+    additional_args = {}
+    if args.compress:
+        method = formats.get_compress_function(args.target_format)
+    else:
+        try:
+            method, read_chunk = formats.get_decompress_function(
+                args.target_format,
+                args.infile
+            )
+        except UncompressError as err:
+            sys.exit("Failed to get decompress function: {}".format(err))
+        additional_args['start_chunk'] = read_chunk
+
+    method(args.infile, args.outfile, **additional_args)
 
 
 if __name__ == "__main__":
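For reference, a minimal sketch (not part of the commit) of the decompression path that cmdline_main now takes; the file names here are hypothetical:

# Hedged sketch of the new decompression flow, assuming the module layout
# above; 'example.snappy' and 'example' are hypothetical file names.
from snappy import snappy_formats as formats

with open("example.snappy", "rb") as infile, open("example", "wb") as outfile:
    # 'auto' makes get_decompress_function peek at the stream header to pick
    # a format; read_chunk holds the bytes consumed during detection.
    method, read_chunk = formats.get_decompress_function('auto', infile)
    method(infile, outfile, start_chunk=read_chunk)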

snappy/hadoop_snappy.py

Lines changed: 31 additions & 1 deletion
@@ -26,6 +26,7 @@
     _compress, _uncompress,
     stream_compress as _stream_compress,
     stream_decompress as _stream_decompress,
+    check_format as _check_format,
    UncompressError,
     _CHUNK_MAX)
 
@@ -112,6 +113,26 @@ def __init__(self):
         # total uncompressed data length of the current block
         self._uncompressed_length = 0
 
+    @staticmethod
+    def check_format(data):
+        """Checks that the given data block is long enough to hold the two
+        big-endian four-byte integers that start every hadoop-snappy block:
+        the uncompressed block length as the first integer, and the
+        compressed subblock length as the second integer.
+        This is a simple plausibility check that the data is the start of
+        a hadoop-snappy stream; the relation between the two integers is
+        not validated.
+        Raises UncompressError if the data is too short.
+        :return: None
+        """
+        int_size = _INT_SIZE
+        if len(data) < int_size * 2:
+            raise UncompressError("Too short data length")
+        # We can't actually be sure about the format here: the assumption
+        # that the compressed length is less than the uncompressed length
+        # is not true in general. So don't check anything beyond the length.
+        return
+
     def decompress(self, data):
         """Decompress 'data', returning a string containing the uncompressed
         data corresponding to at least part of the data in string. This data
@@ -178,8 +199,17 @@ def stream_compress(src, dst, blocksize=SNAPPY_BUFFER_SIZE_DEFAULT):
     )
 
 
-def stream_decompress(src, dst, blocksize=_STREAM_TO_STREAM_BLOCK_SIZE):
+def stream_decompress(src, dst, blocksize=_STREAM_TO_STREAM_BLOCK_SIZE,
+                      start_chunk=None):
     return _stream_decompress(
         src, dst, blocksize=blocksize,
+        decompressor_cls=StreamDecompressor,
+        start_chunk=start_chunk
+    )
+
+
+def check_format(fin=None, chunk=None, blocksize=_STREAM_TO_STREAM_BLOCK_SIZE):
+    return _check_format(
+        fin=fin, chunk=chunk, blocksize=blocksize,
         decompressor_cls=StreamDecompressor
     )
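The length check above leans on the hadoop-snappy block layout. A small illustrative sketch, assuming _INT_SIZE == 4, i.e. two big-endian 32-bit integers open every block (the lengths below are made up):

# Illustrative sketch of a hadoop-snappy block header, assuming
# _INT_SIZE == 4: a big-endian uncompressed block length followed by the
# big-endian length of the first compressed subblock.
import struct

header = struct.pack(">II", 65536, 4321)  # hypothetical lengths
uncompressed_len, subblock_len = struct.unpack(">II", header)
assert (uncompressed_len, subblock_len) == (65536, 4321)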

snappy/snappy.py

Lines changed: 46 additions & 3 deletions
@@ -195,6 +195,25 @@ def __init__(self):
         self._buf = b""
         self._header_found = False
 
+    @staticmethod
+    def check_format(data):
+        """Checks that the given data starts with the snappy framing format
+        stream identifier.
+        Raises UncompressError if it doesn't start with the identifier.
+        :return: None
+        """
+        if len(data) < 6:
+            raise UncompressError("Too short data length")
+        chunk_type = struct.unpack("<L", data[:4])[0]
+        size = (chunk_type >> 8)
+        chunk_type &= 0xff
+        if (chunk_type != _IDENTIFIER_CHUNK or
+                size != len(_STREAM_IDENTIFIER)):
+            raise UncompressError("stream missing snappy identifier")
+        chunk = data[4:4 + size]
+        if chunk != _STREAM_IDENTIFIER:
+            raise UncompressError("stream has invalid snappy identifier")
+
     def decompress(self, data):
         """Decompress 'data', returning a string containing the uncompressed
         data corresponding to at least part of the data in string. This data
@@ -279,17 +298,41 @@ def stream_compress(src,
 def stream_decompress(src,
                       dst,
                       blocksize=_STREAM_TO_STREAM_BLOCK_SIZE,
-                      decompressor_cls=StreamDecompressor):
+                      decompressor_cls=StreamDecompressor,
+                      start_chunk=None):
     """Takes an incoming file-like object and an outgoing file-like object,
     reads data from src, decompresses it, and writes it to dst. 'src' should
     support the read method, and 'dst' should support the write method.
 
     The default blocksize is good for almost every scenario.
+    :param decompressor_cls: class that implements the `decompress` method,
+        like StreamDecompressor in this module
+    :param start_chunk: start block of data that has already been read from
+        the input stream (to detect the format, for example)
     """
     decompressor = decompressor_cls()
     while True:
-        buf = src.read(blocksize)
-        if not buf: break
+        if start_chunk:
+            buf = start_chunk
+            start_chunk = None
+        else:
+            buf = src.read(blocksize)
+            if not buf: break
         buf = decompressor.decompress(buf)
         if buf: dst.write(buf)
     decompressor.flush() # makes sure the stream ended well
+
+
+def check_format(fin=None, chunk=None,
+                 blocksize=_STREAM_TO_STREAM_BLOCK_SIZE,
+                 decompressor_cls=StreamDecompressor):
+    ok = True
+    if chunk is None:
+        chunk = fin.read(blocksize)
+        if not chunk:
+            raise UncompressError("Empty input stream")
+    try:
+        decompressor_cls.check_format(chunk)
+    except UncompressError as err:
+        ok = False
+    return ok, chunk
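For reference, a sketch of the header this check accepts, assuming _IDENTIFIER_CHUNK == 0xff and _STREAM_IDENTIFIER == b"sNaPpY" as defined elsewhere in snappy.py: one chunk-type byte packed together with a 3-byte little-endian length, followed by the magic bytes.

# Sketch of the framing-format stream identifier that check_format above
# expects, under the constant-value assumptions stated in the lead-in.
import struct

magic = b"sNaPpY"
header = struct.pack("<L", (len(magic) << 8) | 0xff) + magic
# StreamDecompressor.check_format(header) should return without raising
# UncompressError.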

snappy/snappy_formats.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+"""Constants and functions to handle the target format.
+ALL_SUPPORTED_FORMATS - list of supported formats
+get_decompress_function - returns the stream decompress function for the
+given format (specified or autodetected)
+get_compress_function - returns the compress function for the given
+format (specified or default)
+"""
+from .snappy import (
+    stream_compress, stream_decompress, check_format, UncompressError)
+from .hadoop_snappy import (
+    stream_compress as hadoop_stream_compress,
+    stream_decompress as hadoop_stream_decompress,
+    check_format as hadoop_check_format)
+
+
+FRAMING_FORMAT = 'framing'
+
+HADOOP_FORMAT = 'hadoop_snappy'
+
+# Means format autodetection.
+# For compression, the framing format will be used.
+# For decompression, we will try to detect the format from the input
+# stream header.
+FORMAT_AUTO = 'auto'
+
+DEFAULT_FORMAT = FORMAT_AUTO
+
+ALL_SUPPORTED_FORMATS = [FRAMING_FORMAT, HADOOP_FORMAT, FORMAT_AUTO]
+
+_COMPRESS_METHODS = {
+    FRAMING_FORMAT: stream_compress,
+    HADOOP_FORMAT: hadoop_stream_compress,
+}
+
+_DECOMPRESS_METHODS = {
+    FRAMING_FORMAT: stream_decompress,
+    HADOOP_FORMAT: hadoop_stream_decompress,
+}
+
+# We will use the framing format as the default for compression.
+# For decompression, if the format is not given explicitly, we will try
+# to guess it from the file header.
+_DEFAULT_COMPRESS_FORMAT = FRAMING_FORMAT
+
+# The tuple contains an ordered sequence of format-checking functions
+# paired with format-specific decompression functions.
+# The framing format has a header that can be recognized.
+# The hadoop-snappy format has no special header; it contains only the
+# uncompressed block length integer and the compressed subblock length.
+# So we check the framing format first and, if it does not match, then
+# check for the hadoop-snappy format.
+_DECOMPRESS_FORMAT_FUNCS = (
+    (check_format, stream_decompress),
+    (hadoop_check_format, hadoop_stream_decompress),
+)
+
+
+def guess_format_by_header(fin):
+    """Tries to guess the compression format of the given input file by its
+    header.
+    :return: tuple of the decompression method and the chunk that was taken
+    from the input for format detection.
+    """
+    chunk = None
+    for check_method, decompress_func in _DECOMPRESS_FORMAT_FUNCS:
+        ok, chunk = check_method(fin=fin, chunk=chunk)
+        if not ok:
+            continue
+        return decompress_func, chunk
+    raise UncompressError("Can't detect archive format")
+
+
+def get_decompress_function(specified_format, fin):
+    if specified_format == FORMAT_AUTO:
+        decompress_func, read_chunk = guess_format_by_header(fin)
+        return decompress_func, read_chunk
+    return _DECOMPRESS_METHODS[specified_format], None
+
+
+def get_compress_function(specified_format):
+    if specified_format == FORMAT_AUTO:
+        return _COMPRESS_METHODS[_DEFAULT_COMPRESS_FORMAT]
+    return _COMPRESS_METHODS[specified_format]
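A hedged round-trip sketch of the API this new module exposes, using only in-memory streams:

# Round-trip sketch using the functions above with io.BytesIO streams.
import io
from snappy import snappy_formats as formats

src, compressed = io.BytesIO(b"hello world"), io.BytesIO()
formats.get_compress_function(formats.FORMAT_AUTO)(src, compressed)
compressed.seek(0)

# With FORMAT_AUTO the framing header is detected; read_chunk carries the
# bytes already consumed from the stream during detection.
decompress, read_chunk = formats.get_decompress_function(
    formats.FORMAT_AUTO, compressed)
out = io.BytesIO()
decompress(compressed, out, start_chunk=read_chunk)
assert out.getvalue() == b"hello world"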

test_formats.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+import io
+import os
+from unittest import TestCase
+
+from snappy import snappy_formats as formats
+from snappy.snappy import _CHUNK_MAX, UncompressError
+
+
+class TestFormatBase(TestCase):
+    compress_format = formats.FORMAT_AUTO
+    decompress_format = formats.FORMAT_AUTO
+    success = True
+
+    def runTest(self):
+        data = os.urandom(1024 * 256 * 2) + os.urandom(13245 * 2)
+        compress_func = formats.get_compress_function(self.compress_format)
+        instream = io.BytesIO(data)
+        compressed_stream = io.BytesIO()
+        compress_func(instream, compressed_stream)
+        compressed_stream.seek(0)
+        if not self.success:
+            with self.assertRaises(UncompressError) as err:
+                decompress_func, read_chunk = formats.get_decompress_function(
+                    self.decompress_format, compressed_stream
+                )
+                decompressed_stream = io.BytesIO()
+                decompress_func(
+                    compressed_stream,
+                    decompressed_stream,
+                    start_chunk=read_chunk
+                )
+            return
+        decompress_func, read_chunk = formats.get_decompress_function(
+            self.decompress_format, compressed_stream
+        )
+        decompressed_stream = io.BytesIO()
+        decompress_func(
+            compressed_stream,
+            decompressed_stream,
+            start_chunk=read_chunk
+        )
+        decompressed_stream.seek(0)
+        self.assertEqual(data, decompressed_stream.read())
+
+
+class TestFormatFramingFraming(TestFormatBase):
+    compress_format = formats.FRAMING_FORMAT
+    decompress_format = formats.FRAMING_FORMAT
+    success = True
+
+
+class TestFormatFramingHadoop(TestFormatBase):
+    compress_format = formats.FRAMING_FORMAT
+    decompress_format = formats.HADOOP_FORMAT
+    success = False
+
+
+class TestFormatFramingAuto(TestFormatBase):
+    compress_format = formats.FRAMING_FORMAT
+    decompress_format = formats.FORMAT_AUTO
+    success = True
+
+
+class TestFormatHadoopHadoop(TestFormatBase):
+    compress_format = formats.HADOOP_FORMAT
+    decompress_format = formats.HADOOP_FORMAT
+    success = True
+
+
+class TestFormatHadoopFraming(TestFormatBase):
+    compress_format = formats.HADOOP_FORMAT
+    decompress_format = formats.FRAMING_FORMAT
+    success = False
+
+
+class TestFormatHadoopAuto(TestFormatBase):
+    compress_format = formats.HADOOP_FORMAT
+    decompress_format = formats.FORMAT_AUTO
+    success = True
+
+
+class TestFormatAutoFraming(TestFormatBase):
+    compress_format = formats.FORMAT_AUTO
+    decompress_format = formats.FRAMING_FORMAT
+    success = True
+
+
+class TestFormatAutoHadoop(TestFormatBase):
+    compress_format = formats.FORMAT_AUTO
+    decompress_format = formats.HADOOP_FORMAT
+    success = False
+
+
+if __name__ == "__main__":
+    import unittest
+    unittest.main()
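Further pairs can be covered with subclasses in the same pattern; for example, a hypothetical auto-to-auto case (not in this commit) should succeed, since auto compression emits the framing format and auto decompression detects it:

# Hypothetical extra case, not part of this commit.
class TestFormatAutoAuto(TestFormatBase):
    compress_format = formats.FORMAT_AUTO
    decompress_format = formats.FORMAT_AUTO
    success = True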
