"""The module implements compression/decompression with snappy using
Hadoop snappy format: https://github.com/kubo/snzip#hadoop-snappy-format

Expected usage like:

    import snappy

    src = 'uncompressed'
    dst = 'compressed'
    dst2 = 'decompressed'

    with open(src, 'rb') as fin, open(dst, 'wb') as fout:
        snappy.hadoop_stream_compress(fin, fout)

    with open(dst, 'rb') as fin, open(dst2, 'wb') as fout:
        snappy.hadoop_stream_decompress(fin, fout)

    with open(src, 'rb') as fin1, open(dst2, 'rb') as fin2:
        assert fin1.read() == fin2.read()
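
Layout of each block handled by this module (as implemented in
StreamCompressor.add_chunk and StreamDecompressor.decompress below):

    [uncompressed length: 4-byte big-endian uint]
    [compressed length: 4-byte big-endian uint][snappy-compressed bytes]
    ... possibly further (compressed length, compressed bytes) subblocks
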
"""

import struct

from .snappy import (
    _compress, _uncompress,
    stream_compress as _stream_compress,
    stream_decompress as _stream_decompress,
    check_format as _check_format,
    UncompressError,
    _CHUNK_MAX)


SNAPPY_BUFFER_SIZE_DEFAULT = 256 * 1024
_STREAM_TO_STREAM_BLOCK_SIZE = _CHUNK_MAX

_INT_SIZE = 4


def pack_int(num):
    """Pack an integer into big-endian 4-byte unsigned form."""
    big_endian_uint = struct.pack('>I', num)
    return big_endian_uint


def unpack_int(data):
    """Unpack a big-endian 4-byte unsigned value into an integer."""
    return struct.unpack('>I', data)[0]


class StreamCompressor(object):

    """This class implements the compressor-side of the hadoop snappy
    format, taken from https://github.com/kubo/snzip#hadoop-snappy-format

    Keep in mind that this compressor object does no buffering of its input
    to size chunks appropriately for you: every call to
    StreamCompressor.compress results in a separate call to the underlying
    snappy compression method.
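
    A minimal usage sketch (hypothetical file names; every chunk read from
    the source is framed independently):

        compressor = StreamCompressor()
        with open('plain', 'rb') as fin, open('framed', 'wb') as fout:
            while True:
                chunk = fin.read(SNAPPY_BUFFER_SIZE_DEFAULT)
                if not chunk:
                    break
                fout.write(compressor.compress(chunk))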
    """

    def __init__(self):
        pass

    def add_chunk(self, data):
        """Add a chunk containing 'data', returning a string that is
        compressed. This data should be concatenated to
        the tail end of an existing Snappy stream. In the absence of any
        internal buffering, no data is left in any internal buffers, and so
        unlike zlib.compress, this method returns everything.
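
        The returned block is laid out as (a sketch in terms of the helpers
        defined above):

            pack_int(len(data)) + pack_int(len(compressed)) + compressed

        where compressed is the raw snappy compression of data.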
        """
        out = []
        uncompressed_length = len(data)
        out.append(pack_int(uncompressed_length))
        compressed_chunk = _compress(data)
        compressed_length = len(compressed_chunk)
        out.append(pack_int(compressed_length))
        out.append(compressed_chunk)
        return b"".join(out)

    def compress(self, data):
        """This method is simply an alias for compatibility with zlib
        compressobj's compress method.
        """
        return self.add_chunk(data)

    def flush(self, mode=None):
        """This method does nothing and only exists for compatibility with
        the zlib compressobj.
        """
        pass

    def copy(self):
        """This method exists for compatibility with the zlib compressobj.
        """
        return StreamCompressor()


class StreamDecompressor(object):

    """This class implements the decompressor-side of the hadoop snappy
    format.

    This class matches a subset of the interface found for the zlib module's
    decompression objects (see zlib.decompressobj). Specifically, it currently
    implements the decompress method without the max_length option, the flush
    method without the length option, and the copy method.
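
    A minimal sketch of incremental use, assuming framed holds bytes in the
    hadoop snappy format (input that does not yet form a complete subblock
    is buffered internally until more data arrives):

        decompressor = StreamDecompressor()
        out = decompressor.decompress(framed[:10])
        out += decompressor.decompress(framed[10:])
        decompressor.flush()  # raises UncompressError on truncated input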
    """

    __slots__ = ["_buf", "_block_length", "_uncompressed_length"]

    def __init__(self):
        self._buf = b""
        # expected uncompressed length of the current block (from its header)
        self._block_length = 0
        # uncompressed data length accumulated so far for the current block
        self._uncompressed_length = 0

    @staticmethod
    def check_format(data):
        """Check that the given data is long enough to hold the two
        big-endian four-byte integers that start a block in the hadoop
        snappy format: the uncompressed block length followed by the
        compressed length of the first subblock.
        Raises UncompressError if the data is too short.
        :return: None
        """
        int_size = _INT_SIZE
        if len(data) < int_size * 2:
            raise UncompressError("Too short data length")
        # We can't actually be sure about the format here.
        # The assumption that the compressed data length is less than the
        # uncompressed length is not true in general.
        # So, beyond the length check, don't check anything.
        return

    def decompress(self, data):
        """Decompress 'data', returning a string containing the uncompressed
        data corresponding to at least part of the input. This output should
        be concatenated to the output produced by any preceding calls to
        the decompress() method. Some of the input data may be preserved in
        internal buffers for later processing.
        """
        int_size = _INT_SIZE
        self._buf += data
        uncompressed = []
        while True:
            if len(self._buf) < int_size:
                return b"".join(uncompressed)
            next_start = 0
            if not self._block_length:
                # Start of a new block: read its uncompressed length.
                self._block_length = unpack_int(self._buf[:int_size])
                self._buf = self._buf[int_size:]
                if len(self._buf) < int_size:
                    return b"".join(uncompressed)
            compressed_length = unpack_int(
                self._buf[next_start:next_start + int_size]
            )
            next_start += int_size
            if len(self._buf) < compressed_length + next_start:
                return b"".join(uncompressed)
            chunk = self._buf[
                next_start:next_start + compressed_length
            ]
            self._buf = self._buf[next_start + compressed_length:]
            uncompressed_chunk = _uncompress(chunk)
            self._uncompressed_length += len(uncompressed_chunk)
            uncompressed.append(uncompressed_chunk)
            if self._uncompressed_length == self._block_length:
                # Here we have uncompressed all subblocks of the current block
                self._uncompressed_length = 0
                self._block_length = 0
                continue

    def flush(self):
        """All pending input is processed, and a string containing the
        remaining uncompressed output is returned. After calling flush(), the
        decompress() method cannot be called again; the only realistic action
        is to delete the object.
        """
        if self._buf != b"":
            raise UncompressError("chunk truncated")
        return b""

    def copy(self):
        """Returns a copy of the decompression object. This can be used to save
        the state of the decompressor midway through the data stream in order
        to speed up random seeks into the stream at a future point.
        """
        copy = StreamDecompressor()
        copy._buf = self._buf
        copy._block_length = self._block_length
        copy._uncompressed_length = self._uncompressed_length
        return copy


def stream_compress(src, dst, blocksize=SNAPPY_BUFFER_SIZE_DEFAULT):
    """Compress the file-like object src into the file-like object dst
    using the hadoop snappy format, reading blocksize bytes at a time.
    """
    return _stream_compress(
        src, dst, blocksize=blocksize, compressor_cls=StreamCompressor
    )


def stream_decompress(src, dst, blocksize=_STREAM_TO_STREAM_BLOCK_SIZE,
                      start_chunk=None):
    """Decompress the hadoop-snappy-framed file-like object src into the
    file-like object dst.
    """
    return _stream_decompress(
        src, dst, blocksize=blocksize,
        decompressor_cls=StreamDecompressor,
        start_chunk=start_chunk
    )


def check_format(fin=None, chunk=None, blocksize=_STREAM_TO_STREAM_BLOCK_SIZE):
    """Check whether fin (a file-like object) or chunk (a byte string)
    looks like the start of a stream in the hadoop snappy format
    (see StreamDecompressor.check_format).
    """
    return _check_format(
        fin=fin, chunk=chunk, blocksize=blocksize,
        decompressor_cls=StreamDecompressor
    )