From 2a80329a137ae5a9226e43d3a4f5e9850b21f8a2 Mon Sep 17 00:00:00 2001 From: David Manthey Date: Mon, 15 Apr 2024 09:55:46 -0400 Subject: [PATCH] Add a dedup option. Also, refactor ifdsFirst option to be closer to the COGs standard. --- .circleci/config.yml | 2 +- CHANGELOG.md | 6 + tests/test_write_tiff.py | 38 +++- tifftools/commands.py | 14 +- tifftools/constants.py | 9 +- tifftools/tifftools.py | 371 ++++++++++++++++++++++++--------------- tox.ini | 4 +- 7 files changed, 293 insertions(+), 151 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 13a37b5..42d6616 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -95,7 +95,7 @@ jobs: steps: - checkout - tox: - env: flake8 + env: lint release: docker: - image: python:3.8 diff --git a/CHANGELOG.md b/CHANGELOG.md index 76a4ffa..bea97d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Change Log +## Version 1.5.0 + +### Features +- Add a deduplicate option to avoid writing all identical data blocks ([#92](../../pull/92)) +- Refactor how ifds-first option emits data so it is closer to the COGs standard ([#92](../../pull/92)) + ## Version 1.4.1 ### Improvements diff --git a/tests/test_write_tiff.py b/tests/test_write_tiff.py index 0ac5fe9..3b8859c 100644 --- a/tests/test_write_tiff.py +++ b/tests/test_write_tiff.py @@ -1,4 +1,5 @@ import copy +import hashlib import logging import os @@ -152,6 +153,30 @@ def test_write_bigtiff_with_repeated_offset_data(tmp_path): assert destinfo['bigtiff'] is False +def test_write_with_dedup(tmp_path): + path = os.path.join(os.path.dirname(__file__), 'data', 'good_single.tif') + info = tifftools.read_tiff(path) + uniqueString = b'UNIQUESTRING' + info['ifds'][0]['tags'][23456] = { + 'datatype': tifftools.Datatype.UNDEFINED, + 'data': uniqueString + } + info['ifds'][0]['tags'][23457] = { + 'datatype': tifftools.Datatype.UNDEFINED, + 'data': uniqueString + } + destpath = tmp_path / 'sample.tiff' + tifftools.write_tiff(info, destpath) + assert len(open(destpath, 'rb').read().split(uniqueString)) == 3 + dest2path = tmp_path / 'sample2.tiff' + tifftools.write_tiff(info, dest2path, dedup=True) + assert len(open(dest2path, 'rb').read().split(uniqueString)) == 2 + info2 = tifftools.read_tiff(dest2path) + dest3path = tmp_path / 'sample3.tiff' + tifftools.write_tiff(info2, dest3path) + assert open(destpath, 'rb').read() == open(dest3path, 'rb').read() + + def test_write_bytecount_data(tmp_path): path = os.path.join(os.path.dirname(__file__), 'data', 'good_single.tif') info = tifftools.read_tiff(path) @@ -227,5 +252,14 @@ def test_write_ifds_first(tmp_path): destpath = tmp_path / 'sample.tiff' tifftools.write_tiff(info, destpath, ifdsFirst=True) len = os.path.getsize(destpath) - tifftools.write_tiff(info, destpath, allowExisting=True) - assert len == os.path.getsize(destpath) + destpath2 = tmp_path / 'sample2.tiff' + tifftools.write_tiff(info, destpath2) + assert len == os.path.getsize(destpath2) + + if hasattr(hashlib, 'file_digest'): + info = tifftools.read_tiff(destpath) + destpath3 = tmp_path / 'sample3.tiff' + tifftools.write_tiff(info, destpath3) + assert ( + hashlib.file_digest(open(destpath2, 'rb'), 'sha512').hexdigest() == + hashlib.file_digest(open(destpath3, 'rb'), 'sha512').hexdigest()) diff --git a/tifftools/commands.py b/tifftools/commands.py index a47987c..16df2a4 100644 --- a/tifftools/commands.py +++ b/tifftools/commands.py @@ -59,7 +59,9 @@ def tiff_concat(source, output, overwrite=False, **kwargs): nextInfo = read_tiff(path) ifds.extend(nextInfo['ifds']) _apply_flags_to_ifd(ifds, **kwargs) - write_tiff(ifds, output, allowExisting=overwrite, ifdsFirst=kwargs.get('ifdsfirst', False)) + write_tiff(ifds, output, allowExisting=overwrite, + ifdsFirst=kwargs.get('ifdsfirst', False), + dedup=kwargs.get('dedup', False)) def _tiff_dump_tag(tag, taginfo, linePrefix, max, dest=None, max_text=None, ifd=None): @@ -360,7 +362,8 @@ def tiff_split(source, prefix=None, subifds=False, overwrite=False, **kwargs): logger.info('Writing %s', outputPath) _apply_flags_to_ifd(ifd, **kwargs) write_tiff(ifd, outputPath, allowExisting=overwrite, - ifdsFirst=kwargs.get('ifdsfirst', False)) + ifdsFirst=kwargs.get('ifdsfirst', False), + dedup=kwargs.get('dedup', False)) def _value_to_types_numeric(results): @@ -527,7 +530,9 @@ def _tiff_set(source, output=None, setlist=None, unset=None, setfrom=None, else: ifd['tags'][int(tag)] = setinfo['ifds'][0]['tags'][int(tag)] _apply_flags_to_ifd(info, **kwargs) - write_tiff(info, output, allowExisting=overwrite, ifdsFirst=kwargs.get('ifdsfirst', False)) + write_tiff(info, output, allowExisting=overwrite, + ifdsFirst=kwargs.get('ifdsfirst', False), + dedup=kwargs.get('dedup', False)) def tiff_set(source, output=None, overwrite=False, setlist=None, unset=None, @@ -599,6 +604,9 @@ def main(args=None): }, { 'args': ('--ifdsfirst', '--ifds-first'), 'kwargs': dict(action='store_true', help='Store IFDs before their related data.'), + }, { + 'args': ('--dedup', '--deduplicate'), + 'kwargs': dict(action='store_true', help='Do not repeat identical data.'), }, { 'args': ('--stop-on-warning', '-X'), 'kwargs': dict( diff --git a/tifftools/constants.py b/tifftools/constants.py index 8a08e4d..e1c7be4 100644 --- a/tifftools/constants.py +++ b/tifftools/constants.py @@ -28,9 +28,10 @@ def __str__(self): return '%d (0x%X)' % (self.value, self.value) def __getitem__(self, key): - if hasattr(self, str(key)): + try: return getattr(self, str(key)) - raise KeyError(key) + except AttributeError: + raise KeyError(key) def __int__(self): return self.value @@ -61,9 +62,7 @@ def __hash__(self): return hash((type(self).__name__, self.value)) def get(self, key, default=None): - if hasattr(self, str(key)): - return getattr(self, str(key)) - return default + return getattr(self, str(key), default) class TiffTag(TiffConstant): diff --git a/tifftools/tifftools.py b/tifftools/tifftools.py index 586bd57..b46cf0f 100755 --- a/tifftools/tifftools.py +++ b/tifftools/tifftools.py @@ -1,9 +1,12 @@ #!/usr/bin/env python3 import functools +import hashlib import logging import os +import shutil import struct +import tempfile from .constants import Datatype, Tag, get_or_create_tag from .exceptions import MustBeBigTiffError, TifftoolsError @@ -11,6 +14,8 @@ logger = logging.getLogger(__name__) +_DEDUP_HASH_METHOD = 'sha256' + def check_offset(filelen, offset, length): """ @@ -154,6 +159,7 @@ def read_ifd(tiff, info, ifdOffset, ifdList, tagSet=Tag): :param ifdList: a list that this ifd will be appended to. :param tagSet: the TiffConstantSet class to use for tags. """ + logger.debug(f'read_ifd: {ifdOffset} (0x{ifdOffset:X})') bom = info['endianPack'] if not check_offset(info['size'], ifdOffset, 16 if info['bigtiff'] else 6): return @@ -269,7 +275,8 @@ def read_ifd_tag_data(tiff, info, ifd, tagSet=Tag): break -def write_tiff(ifds, path, bigEndian=None, bigtiff=None, allowExisting=False, ifdsFirst=False): +def write_tiff(ifds, path, bigEndian=None, bigtiff=None, allowExisting=False, + ifdsFirst=False, dedup=False): """ Write a tiff file based on data in a list of ifds. @@ -292,9 +299,17 @@ def write_tiff(ifds, path, bigEndian=None, bigtiff=None, allowExisting=False, if just convert to bigtiff, but actually rewrites the file to avoid unaccounted bytes in the file. :param allowExisting: if False, raise an error if the path already exists. - :param ifdsFirst: if True, write IFDs before their respective data. - Otherwise, IFDs are written after their data. IFDs are always adjacent - to their data. + :param ifdsFirst: if True, write IFDs before their respective data. When + this is not set, data is stored (mixed tag and offset data),(ifd), + (mixed tag and offset data),(ifd),... When it is set, data is stored + (ifd),(tag data),(ifd),(tag data),...,(offset data),(offset data),... + This is not quite the COG specification, as that requires only the + strip or tile offset data to be at the end, and that data to be ordered + with the smallest image first, but if there are multiple conceptual + images, each one in turn (e.g., level0,level1,level2,...,level0,level1, + level2,...,...). + :param dedup: if False, all data is written. If True, data blocks that are + identical are only written once. """ if isinstance(ifds, dict): bigEndian = ifds.get('bigEndian') if bigEndian is None else bigEndian @@ -302,30 +317,57 @@ def write_tiff(ifds, path, bigEndian=None, bigtiff=None, allowExisting=False, if ifds = ifds.get('ifds', [ifds]) bigEndian = ifds[0].get('bigEndian', False) if bigEndian is None else bigEndian bigtiff = ifds[0].get('bigtiff', False) if bigtiff is None else bigtiff - if not allowExisting and not is_filelike_object(path) and os.path.exists(path): - raise TifftoolsError('File already exists') + finalpath = path + if not is_filelike_object(path) and os.path.exists(path): + if not allowExisting: + raise TifftoolsError('File already exists') + with tempfile.NamedTemporaryFile( + prefix=os.path.basename(path), dir=os.path.dirname(path)) as temppath: + path = temppath.name rewriteBigtiff = False - with OpenPathOrFobj(path, 'wb') as dest: - bom = '>' if bigEndian else '<' - header = b'II' if not bigEndian else b'MM' - if bigtiff: - header += struct.pack(bom + 'HHHQ', 0x2B, 8, 0, 0) - ifdPtr = len(header) - 8 - else: - header += struct.pack(bom + 'HL', 0x2A, 0) - ifdPtr = len(header) - 4 - dest.write(header) - for ifd in ifds: - try: - ifdPtr = write_ifd(dest, bom, bigtiff, ifd, ifdPtr, ifdsFirst=ifdsFirst) - except MustBeBigTiffError: - # This can only be raised if bigtiff is false - rewriteBigtiff = True - break - if rewriteBigtiff: - dest.seek(0) - dest.truncate(0) - write_tiff(ifds, dest, bigEndian, True) + try: + with OpenPathOrFobj(path, 'wb') as dest: + bom = '>' if bigEndian else '<' + header = b'II' if not bigEndian else b'MM' + if bigtiff: + header += struct.pack(bom + 'HHHQ', 0x2B, 8, 0, 0) + ifdPtr = len(header) - 8 + else: + header += struct.pack(bom + 'HL', 0x2A, 0) + ifdPtr = len(header) - 4 + dest.write(header) + origifdPtr = ifdPtr + for datadest, ifddest in _ifdsPass(ifdsFirst, dest): + ifdPtr = origifdPtr + if bool(dedup): + dedup = {'hashes': {}, 'reused': 0} + for ifd in ifds: + try: + ifdPtr = write_ifd( + datadest, ifddest, bom, bigtiff, ifd, ifdPtr, + ifdsFirst=ifdsFirst, dedup=dedup) + except MustBeBigTiffError: + # This can only be raised if bigtiff is false + rewriteBigtiff = True + break + if rewriteBigtiff: + dest.seek(0) + dest.truncate(0) + write_tiff(ifds, dest, bigEndian, True, ifdsFirst=ifdsFirst, dedup=bool(dedup)) + elif dedup and dedup['reused']: + logger.info('Deduplication reused %d block(s)', dedup['reused']) + except Exception: + if path != finalpath: + os.unlink(path) + raise + else: + if path != finalpath: + # By copying the tempfile to the existing destination, the target + # path keeps its inode + with open(finalpath, 'r+b') as fdest, open(path, 'rb') as fsrc: + fdest.truncate(0) + shutil.copyfileobj(fsrc, fdest) + os.unlink(path) class _WriteTracker(): @@ -350,7 +392,7 @@ def seek(self, offset, whence=os.SEEK_SET): self.pos if whence == os.SEEK_CUR else 0)) + offset -def _ifdsPass(ifdsFirst, ifdsPass, origdest, ifddest): +def _ifdsPass(ifdsFirst, dest): """ To handle writing IFDs before or after their associated data, return a pair of pointers to handle writing data. For writing IFDs after the data, @@ -362,23 +404,17 @@ def _ifdsPass(ifdsFirst, ifdsPass, origdest, ifddest): data. Lasttly, we write the actual data. :param ifdsFirst: if True, ifds are written before data. - :param ifdsPass: ignored if idsFirst is False, otherwise a pass number in - the set of {0, 1, 2}. - :param origdest: the original destination I/O pointer. - :param ifddest: the most recent ifd I/O pointer. + :param dest: the original destination I/O pointer. + :yields: the data destination I/O pointer and the ifd destination I/O + pointer. """ if not ifdsFirst: - return origdest, origdest - if ifdsPass == 0: - dest = _WriteTracker(origdest.tell()) - ifddest = _WriteTracker(0) - elif ifdsPass == 1: - dest = _WriteTracker(origdest.tell() + ifddest.tell()) - ifddest = origdest + yield dest, dest else: - dest = origdest ifddest = _WriteTracker(0) - return dest, ifddest + yield _WriteTracker(dest.tell()), ifddest + yield _WriteTracker(dest.tell() + ifddest.tell()), dest + yield dest, _WriteTracker(0) def _adjustTaginfoForNonBigtiff(bigtiff, taginfo): @@ -407,7 +443,7 @@ def _checkDataForNonBigtiff(bigtiff, data): in a uint32 value. :param bigtiff: True if this is a bigtiff. - :param data: an array of integersto check. + :param data: an array of integers to check. """ if not bigtiff and any(val for val in data if val >= 0x100000000): raise MustBeBigTiffError('The file is large enough it must be in bigtiff format.') @@ -457,11 +493,13 @@ def _writeDeferredData(bigtiff, bom, dest, ifd, ifdrecord, deferredData): return ifdrecord -def write_ifd(dest, bom, bigtiff, ifd, ifdPtr, tagSet=Tag, ifdsFirst=False): +def write_ifd(datadest, ifddest, bom, bigtiff, ifd, ifdPtr, tagSet=Tag, + ifdsFirst=False, dedup=False): """ Write an IFD to a TIFF file. This copies image data from other tiff files. - :param dest: the open file handle to write. + :param datadest: the open file handle to write offset data. + :param ifddest: the open file handle to write ids and tag data. :param bom: either '<' or '>' for using struct to encode values based on endian. :param bigtiff: True if this is a bigtiff. @@ -472,110 +510,135 @@ def write_ifd(dest, bom, bigtiff, ifd, ifdPtr, tagSet=Tag, ifdsFirst=False): :param ifdsFirst: if True, write IFDs before their respective data. Otherwise, IFDs are written after their data. IFDs are always adjacent to their data. + :param dedup: if False, all data is written. Otherwise, a dictionary with + 'hashes' and 'reused', where 'hashes' is a dictionary with keys of + hashed data that have been written and values of the offsets where it + was written, and 'reused' is a count of data blocks that were + deduplicated. :return: the ifdPtr for the next ifd that could be written. """ ptrpack = 'Q' if bigtiff else 'L' tagdatalen = 8 if bigtiff else 4 - dest.seek(0, os.SEEK_END) - origPos = dest.tell() - origdest = ifddest = dest - for ifdsPass in range(3 if ifdsFirst else 1): - dest, ifddest = _ifdsPass(ifdsFirst, ifdsPass, origdest, ifddest) - ifdrecord = struct.pack(bom + ('Q' if bigtiff else 'H'), len(ifd['tags'])) - subifdPtrs = {} - deferredData = {} - with OpenPathOrFobj(ifd.get('path_or_fobj', False), 'rb') as src: - for tag, taginfo in sorted(ifd['tags'].items()): - tag = get_or_create_tag( - tag, tagSet, **({'datatype': Datatype[taginfo['datatype']]} - if taginfo.get('datatype') else {})) - if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8): - data = [0] * len(taginfo['ifds']) - taginfo = taginfo.copy() - taginfo['datatype'] = Datatype.IFD8 if bigtiff else Datatype.IFD - else: - data = taginfo['data'] - count = len(data) - if tag.isOffsetData(): - taginfo = taginfo.copy() - taginfo['datatype'] = Datatype.LONG8 if bigtiff else Datatype.LONG - if isinstance(tag.bytecounts, str): - if ifdsFirst: - deferredData[int(tagSet[tag.bytecounts])] = { - 'tag': tagSet[tag.bytecounts], - 'data': ifd['tags'][int(tagSet[tag.bytecounts])]['data'][:], - } - deferredData[int(tag)] = { - 'tag': tag, - 'data': data[:], - 'write': ( - dest, src, data, - deferredData[int(tagSet[tag.bytecounts])]['data'], - ifd['size']), - 'taginfo': taginfo, - } - else: - data = write_tag_data( - dest, src, data, - ifd['tags'][int(tagSet[tag.bytecounts])]['data'], - ifd['size']) + # dest.seek(0, os.SEEK_END) + # origPos = dest.tell() + # origdest = ifddest = dest + nextifdPtr = None + ifdrecord = struct.pack(bom + ('Q' if bigtiff else 'H'), len(ifd['tags'])) + subifdPtrs = {} + deferredData = {} + ifdpos = ifddest.tell() + if ifdsFirst: + ifdlen = ( + len(ifdrecord) + (20 if bigtiff else 12) * len(ifd['tags']) + (8 if bigtiff else 4)) + ifddest.write(b'\x00' * ifdlen) + with OpenPathOrFobj(ifd.get('path_or_fobj', False), 'rb') as src: + for tag, taginfo in sorted(ifd['tags'].items()): + tag = get_or_create_tag( + tag, tagSet, **({'datatype': Datatype[taginfo['datatype']]} + if taginfo.get('datatype') else {})) + if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8): + data = [0] * len(taginfo['ifds']) + taginfo = taginfo.copy() + taginfo['datatype'] = Datatype.IFD8 if bigtiff else Datatype.IFD + else: + data = taginfo['data'] + count = len(data) + if tag.isOffsetData(): + taginfo = taginfo.copy() + taginfo['datatype'] = Datatype.LONG8 if bigtiff else Datatype.LONG + if isinstance(tag.bytecounts, str): + if ifdsFirst: + deferredData[int(tagSet[tag.bytecounts])] = { + 'tag': tagSet[tag.bytecounts], + 'data': ifd['tags'][int(tagSet[tag.bytecounts])]['data'][:], + } + deferredData[int(tag)] = { + 'tag': tag, + 'data': data[:], + 'write': ( + datadest, src, data, + deferredData[int(tagSet[tag.bytecounts])]['data'], + ifd['size'], dedup), + 'taginfo': taginfo, + } else: data = write_tag_data( - dest, src, data, [tag.bytecounts] * count, ifd['size']) - _checkDataForNonBigtiff(bigtiff, data) - _adjustTaginfoForNonBigtiff(bigtiff, taginfo) - if Datatype[taginfo['datatype']].pack: - pack = Datatype[taginfo['datatype']].pack - count //= len(pack) - data = struct.pack(bom + pack * count, *data) - elif Datatype[taginfo['datatype']] == Datatype.ASCII: - # Handle null-seperated lists - data = (data if isinstance(data, bytes) else data.encode()) + b'\x00' - count = len(data) + ifddest, src, data, + ifd['tags'][int(tagSet[tag.bytecounts])]['data'], + ifd['size'], dedup) else: - data = taginfo['data'] - tagrecord = struct.pack(bom + 'HH' + ptrpack, tag, taginfo['datatype'], count) - if len(data) <= tagdatalen: - if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8): - subifdPtrs[tag] = -(len(ifdrecord) + len(tagrecord)) - if int(tag) in deferredData: - deferredData[int(tag)]['ifdoffset'] = len(ifdrecord) + len(tagrecord) - tagrecord += data + b'\x00' * (tagdatalen - len(data)) - else: - # word alignment - if dest.tell() % 2: - dest.write(b'\x00') - if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8): - subifdPtrs[tag] = dest.tell() - _checkDataForNonBigtiff(bigtiff, [dest.tell()]) - tagrecord += struct.pack(bom + ptrpack, dest.tell()) - if int(tag) in deferredData: - deferredData[int(tag)]['offset'] = dest.tell() - dest.write(data) - ifdrecord += tagrecord - ifdrecord = _writeDeferredData(bigtiff, bom, dest, ifd, ifdrecord, deferredData) - _checkDataForNonBigtiff(bigtiff, [dest.tell()]) - pos = dest.tell() - # ifds are expected to be on word boundaries - if pos % 2: - dest.write(b'\x00') - pos = dest.tell() - dest.seek(ifdPtr) - dest.write(struct.pack(bom + ptrpack, origPos if ifdsFirst else pos)) - dest.seek(0, os.SEEK_END) - ifddest.write(ifdrecord) - nextifdPtr = dest.tell() - ifddest.write(struct.pack(bom + ptrpack, 0)) - write_sub_ifds(dest, bom, bigtiff, ifd, pos, subifdPtrs, ifdsFirst=ifdsFirst) + data = write_tag_data( + ifddest, src, data, [tag.bytecounts] * count, + ifd['size'], dedup) + _checkDataForNonBigtiff(bigtiff, data) + _adjustTaginfoForNonBigtiff(bigtiff, taginfo) + if Datatype[taginfo['datatype']].pack: + pack = Datatype[taginfo['datatype']].pack + count //= len(pack) + data = struct.pack(bom + pack * count, *data) + elif Datatype[taginfo['datatype']] == Datatype.ASCII: + # Handle null-seperated lists + data = (data if isinstance(data, bytes) else data.encode()) + b'\x00' + count = len(data) + else: + data = taginfo['data'] + tagrecord = struct.pack(bom + 'HH' + ptrpack, tag, taginfo['datatype'], count) + if len(data) <= tagdatalen: + if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8): + subifdPtrs[tag] = -(len(ifdrecord) + len(tagrecord)) + if int(tag) in deferredData: + deferredData[int(tag)]['ifdoffset'] = len(ifdrecord) + len(tagrecord) + tagrecord += data + b'\x00' * (tagdatalen - len(data)) + else: + # word alignment for tag position + if ifddest.tell() % 2: + ifddest.write(b'\x00') + h = None + tpos = ifddest.tell() + if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8): + subifdPtrs[tag] = tpos + elif dedup: + h = hashlib.new(_DEDUP_HASH_METHOD, data).digest() + if h in dedup['hashes']: + tpos = dedup['hashes'][h] + else: + dedup['hashes'][h] = tpos + h = None + _checkDataForNonBigtiff(bigtiff, [tpos]) + tagrecord += struct.pack(bom + ptrpack, tpos) + if int(tag) in deferredData: + deferredData[int(tag)]['offset'] = tpos + if not dedup or h is None: + ifddest.write(data) + ifdrecord += tagrecord + ifdrecord = _writeDeferredData(bigtiff, bom, ifddest, ifd, ifdrecord, deferredData) + _checkDataForNonBigtiff(bigtiff, [ifddest.tell(), datadest.tell()]) + pos = ifddest.tell() + # ifds are expected to be on word boundaries + if pos % 2: + ifddest.write(b'\x00') + pos = ifddest.tell() + ifddest.seek(ifdPtr) + ifddest.write(struct.pack(bom + ptrpack, ifdpos if ifdsFirst else pos)) + ifddest.seek(ifdpos if ifdsFirst else 0, os.SEEK_SET if ifdsFirst else os.SEEK_END) + ifddest.write(ifdrecord) + nextifdPtr = ifddest.tell() + ifddest.write(struct.pack(bom + ptrpack, 0)) + ifddest.seek(0, os.SEEK_END) + write_sub_ifds(datadest, ifddest, bom, bigtiff, ifd, + ifdpos if ifdsFirst else pos, subifdPtrs, + ifdsFirst=ifdsFirst, dedup=dedup) return nextifdPtr -def write_sub_ifds(dest, bom, bigtiff, ifd, parentPos, subifdPtrs, tagSet=Tag, ifdsFirst=False): +def write_sub_ifds(datadest, ifddest, bom, bigtiff, ifd, parentPos, subifdPtrs, + tagSet=Tag, ifdsFirst=False, dedup=False): """ Write any number of SubIFDs to a TIFF file. These can be based on tags other than the SubIFD tag. - :param dest: the open file handle to write. + :param datadest: the open file handle to write offset data. + :param ifddest: the open file handle to write ifd and tag data. :param bom: eithter '<' or '>' for using struct to encode values based on endian. :param bigtiff: True if this is a bigtiff. @@ -591,6 +654,11 @@ def write_sub_ifds(dest, bom, bigtiff, ifd, parentPos, subifdPtrs, tagSet=Tag, i :param ifdsFirst: if True, write IFDs before their respective data. Otherwise, IFDs are written after their data. IFDs are always adjacent to their data. + :param dedup: if False, all data is written. Otherwise, a dictionary with + 'hashes' and 'reused', where 'hashes' is a dictionary with keys of + hashed data that have been written and values of the offsets where it + was written, and 'reused' is a count of data blocks that were + deduplicated. """ tagdatalen = 8 if bigtiff else 4 for tag, subifdPtr in subifdPtrs.items(): @@ -602,12 +670,13 @@ def write_sub_ifds(dest, bom, bigtiff, ifd, parentPos, subifdPtrs, tagSet=Tag, i nextSubifdPtr = subifdPtr for ifdInSubifd in subifd: nextSubifdPtr = write_ifd( - dest, bom, bigtiff, ifdInSubifd, nextSubifdPtr, - getattr(tag, 'tagset', None), ifdsFirst=ifdsFirst) + datadest, ifddest, bom, bigtiff, ifdInSubifd, + nextSubifdPtr, getattr(tag, 'tagset', None), + ifdsFirst=ifdsFirst, dedup=dedup) subifdPtr += tagdatalen -def write_tag_data(dest, src, offsets, lengths, srclen): +def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False): """ Copy data from a source tiff to a destination tiff, return a list of offsets where data was written. @@ -617,6 +686,11 @@ def write_tag_data(dest, src, offsets, lengths, srclen): :param offsets: an array of offsets where data will be copied from. :param lengths: an array of lengths to copy from each offset. :param srclen: the length of the source file. + :param dedup: if False, all data is written. Otherwise, a dictionary with + 'hashes' and 'reused', where 'hashes' is a dictionary with keys of + hashed data that have been written and values of the offsets where it + was written, and 'reused' is a count of data blocks that were + deduplicated. :return: the offsets in the destination file corresponding to the data copied. """ @@ -633,6 +707,7 @@ def write_tag_data(dest, src, offsets, lengths, srclen): offset, idx = offsetList[olidx] length = lengths[idx] if offset and check_offset(srclen, offset, length): + # if a block repeats a previous block, continue the pattern if lastOffset == offset and lastLength == length: destOffsets[idx] = destOffsets[lastOffsetIdx] olidx += 1 @@ -640,16 +715,36 @@ def write_tag_data(dest, src, offsets, lengths, srclen): lastOffset, lastLength, lastOffsetIdx = offset, length, idx src.seek(offset) destOffsets[idx] = dest.tell() + tells = {'idx': [idx], 'pos': destOffsets[idx], 'offset': offset} # Group reads when possible; the biggest overhead is in the actual # read call - while (olidx + 1 < len(offsetList) and + while (not dedup and olidx + 1 < len(offsetList) and offsetList[olidx + 1][0] == offsetList[olidx][0] + lengths[idx] and check_offset(srclen, offsetList[olidx + 1][0], lengths[offsetList[olidx + 1][1]])): destOffsets[offsetList[olidx + 1][1]] = destOffsets[idx] + lengths[idx] + tells['idx'].append(offsetList[olidx + 1][1]) olidx += 1 offset, idx = offsetList[olidx] length += lengths[idx] + if dedup: + readlen = length + h = hashlib.new(_DEDUP_HASH_METHOD) + while readlen: + data = src.read(min(readlen, COPY_CHUNKSIZE)) + h.update(data) + readlen -= len(data) + h = h.digest() + if h in dedup['hashes']: + hpos = dedup['hashes'][h] + for tidx in tells['idx']: + destOffsets[tidx] = destOffsets[tidx] - tells['pos'] + hpos + dedup['reused'] += 1 + logger.debug('Deduplication: %d', dedup['reused']) + length = 0 + else: + dedup['hashes'][h] = tells['pos'] + src.seek(tells['offset']) while length: data = src.read(min(length, COPY_CHUNKSIZE)) dest.write(data) diff --git a/tox.ini b/tox.ini index 2a6d609..7c98d6b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,7 +1,7 @@ [tox] envlist = test-py{38,39,310,311,312} - flake8 + lint [testenv] deps = @@ -15,7 +15,7 @@ deps = commands = pytest --cov {envsitepackagesdir}/tifftools {posargs} -[testenv:flake8] +[testenv:lint] basepython = python3 skipsdist = true skip_install = true