|
| 1 | +"""Constants and functions to handle the target format. |
| 2 | +ALL_SUPPORTED_FORMATS - list of supported formats |
| 3 | +get_decompress_function - returns stream decompress function for a current |
| 4 | + format (specified or autodetected) |
| 5 | +get_compress_function - returns compress function for a current format |
| 6 | + (specified or default) |
| 7 | +""" |
| 8 | +from .snappy import ( |
| 9 | + stream_compress, stream_decompress, check_format, UncompressError) |
| 10 | +from .hadoop_snappy import ( |
| 11 | + stream_compress as hadoop_stream_compress, |
| 12 | + stream_decompress as hadoop_stream_decompress, |
| 13 | + check_format as hadoop_check_format) |
| 14 | + |
| 15 | + |
# Names of the explicit stream formats.
FRAMING_FORMAT = 'framing'
HADOOP_FORMAT = 'hadoop_snappy'

# Pseudo-format requesting automatic selection:
#   * compression uses the framing format;
#   * decompression tries to detect the format from the input stream header.
FORMAT_AUTO = 'auto'

# Format selection used by default.
DEFAULT_FORMAT = FORMAT_AUTO

# Every value accepted as a format name, including the auto pseudo-format.
ALL_SUPPORTED_FORMATS = [FRAMING_FORMAT, HADOOP_FORMAT, FORMAT_AUTO]
| 29 | + |
# Compression falls back to the framing format when no explicit format is
# requested. Decompression, when the format is not given explicitly, tries
# to guess it from the file header instead.
_DEFAULT_COMPRESS_FORMAT = FRAMING_FORMAT

# Stream compress function for each explicit format name.
_COMPRESS_METHODS = {
    FRAMING_FORMAT: stream_compress,
    HADOOP_FORMAT: hadoop_stream_compress,
}

# Stream decompress function for each explicit format name.
_DECOMPRESS_METHODS = {
    FRAMING_FORMAT: stream_decompress,
    HADOOP_FORMAT: hadoop_stream_decompress,
}

# Ordered (format check function, decompress function) pairs used for
# auto-detection.
# The framing format has its own recognizable header, whereas the Hadoop
# snappy format has no special header: it contains only the uncompressed
# block length integer and the length of the compressed subblock.
# Therefore the framing format is checked first, and only if that fails is
# the Hadoop snappy format checked.
_DECOMPRESS_FORMAT_FUNCS = (
    (check_format, stream_decompress),
    (hadoop_check_format, hadoop_stream_decompress),
)
| 56 | + |
| 57 | + |
def guess_format_by_header(fin):
    """Try to guess the compression format of an input file by its header.

    Each entry of ``_DECOMPRESS_FORMAT_FUNCS`` is tried in order. The chunk
    returned by one check function is handed to the next one, starting with
    ``None`` for the first check.

    :param fin: input file object passed to the format check functions.
    :return: tuple of the decompression function for the first format whose
        check succeeded and the chunk that check returned (data taken from
        the input for format detection).
    :raises UncompressError: if none of the format checks succeeds.
    """
    probe = None
    for is_format, decompressor in _DECOMPRESS_FORMAT_FUNCS:
        matched, probe = is_format(fin=fin, chunk=probe)
        if matched:
            return decompressor, probe
    raise UncompressError("Can't detect archive format")
| 71 | + |
| 72 | + |
def get_decompress_function(specified_format, fin):
    """Return the stream decompress function for the requested format.

    :param specified_format: a key of ``_DECOMPRESS_METHODS`` or
        ``FORMAT_AUTO`` to detect the format from the input header.
    :param fin: input file object; only used for auto-detection.
    :return: tuple of the decompress function and the chunk read during
        detection (``None`` when the format was given explicitly).
    :raises KeyError: if ``specified_format`` is neither ``FORMAT_AUTO`` nor
        a key of ``_DECOMPRESS_METHODS``.
    """
    # Explicit format: plain table lookup, nothing is read from the input.
    if specified_format != FORMAT_AUTO:
        return _DECOMPRESS_METHODS[specified_format], None

    method, consumed = guess_format_by_header(fin)
    return method, consumed
| 78 | + |
| 79 | + |
def get_compress_function(specified_format):
    """Return the stream compress function for the requested format.

    :param specified_format: a key of ``_COMPRESS_METHODS`` or
        ``FORMAT_AUTO``, which selects ``_DEFAULT_COMPRESS_FORMAT``.
    :return: the compress function registered for the resolved format.
    :raises KeyError: if the resolved format is not a key of
        ``_COMPRESS_METHODS``.
    """
    # Auto has no compress function of its own; map it to the default format.
    resolved = (
        _DEFAULT_COMPRESS_FORMAT
        if specified_format == FORMAT_AUTO
        else specified_format
    )
    return _COMPRESS_METHODS[resolved]
0 commit comments