From 2a80329a137ae5a9226e43d3a4f5e9850b21f8a2 Mon Sep 17 00:00:00 2001
From: David Manthey <david.manthey@kitware.com>
Date: Mon, 15 Apr 2024 09:55:46 -0400
Subject: [PATCH] Add a dedup option.

Also, refactor the ifdsFirst option to be closer to the COG standard.
---
 .circleci/config.yml     |   2 +-
 CHANGELOG.md             |   6 +
 tests/test_write_tiff.py |  38 +++-
 tifftools/commands.py    |  14 +-
 tifftools/constants.py   |   9 +-
 tifftools/tifftools.py   | 371 ++++++++++++++++++++++++---------------
 tox.ini                  |   4 +-
 7 files changed, 293 insertions(+), 151 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 13a37b5..42d6616 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -95,7 +95,7 @@ jobs:
     steps:
       - checkout
       - tox:
-          env: flake8
+          env: lint
   release:
     docker:
       - image: python:3.8
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 76a4ffa..bea97d6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Change Log
 
+## Version 1.5.0
+
+### Features
+- Add a deduplicate option to avoid writing identical data blocks more than once ([#92](../../pull/92))
+- Refactor how the ifds-first option emits data so it is closer to the COG standard ([#92](../../pull/92))
+
 ## Version 1.4.1
 
 ### Improvements
diff --git a/tests/test_write_tiff.py b/tests/test_write_tiff.py
index 0ac5fe9..3b8859c 100644
--- a/tests/test_write_tiff.py
+++ b/tests/test_write_tiff.py
@@ -1,4 +1,5 @@
 import copy
+import hashlib
 import logging
 import os
 
@@ -152,6 +153,30 @@ def test_write_bigtiff_with_repeated_offset_data(tmp_path):
     assert destinfo['bigtiff'] is False
 
 
+def test_write_with_dedup(tmp_path):
+    path = os.path.join(os.path.dirname(__file__), 'data', 'good_single.tif')
+    info = tifftools.read_tiff(path)
+    uniqueString = b'UNIQUESTRING'
+    info['ifds'][0]['tags'][23456] = {
+        'datatype': tifftools.Datatype.UNDEFINED,
+        'data': uniqueString
+    }
+    info['ifds'][0]['tags'][23457] = {
+        'datatype': tifftools.Datatype.UNDEFINED,
+        'data': uniqueString
+    }
+    destpath = tmp_path / 'sample.tiff'
+    tifftools.write_tiff(info, destpath)
+    assert len(open(destpath, 'rb').read().split(uniqueString)) == 3
+    dest2path = tmp_path / 'sample2.tiff'
+    tifftools.write_tiff(info, dest2path, dedup=True)
+    assert len(open(dest2path, 'rb').read().split(uniqueString)) == 2
+    info2 = tifftools.read_tiff(dest2path)
+    dest3path = tmp_path / 'sample3.tiff'
+    tifftools.write_tiff(info2, dest3path)
+    assert open(destpath, 'rb').read() == open(dest3path, 'rb').read()
+
+
 def test_write_bytecount_data(tmp_path):
     path = os.path.join(os.path.dirname(__file__), 'data', 'good_single.tif')
     info = tifftools.read_tiff(path)
@@ -227,5 +252,14 @@ def test_write_ifds_first(tmp_path):
     destpath = tmp_path / 'sample.tiff'
     tifftools.write_tiff(info, destpath, ifdsFirst=True)
     len = os.path.getsize(destpath)
-    tifftools.write_tiff(info, destpath, allowExisting=True)
-    assert len == os.path.getsize(destpath)
+    destpath2 = tmp_path / 'sample2.tiff'
+    tifftools.write_tiff(info, destpath2)
+    assert len == os.path.getsize(destpath2)
+
+    if hasattr(hashlib, 'file_digest'):
+        info = tifftools.read_tiff(destpath)
+        destpath3 = tmp_path / 'sample3.tiff'
+        tifftools.write_tiff(info, destpath3)
+        assert (
+            hashlib.file_digest(open(destpath2, 'rb'), 'sha512').hexdigest() ==
+            hashlib.file_digest(open(destpath3, 'rb'), 'sha512').hexdigest())
diff --git a/tifftools/commands.py b/tifftools/commands.py
index a47987c..16df2a4 100644
--- a/tifftools/commands.py
+++ b/tifftools/commands.py
@@ -59,7 +59,9 @@ def tiff_concat(source, output, overwrite=False, **kwargs):
         nextInfo = read_tiff(path)
         ifds.extend(nextInfo['ifds'])
     _apply_flags_to_ifd(ifds, **kwargs)
-    write_tiff(ifds, output, allowExisting=overwrite, ifdsFirst=kwargs.get('ifdsfirst', False))
+    write_tiff(ifds, output, allowExisting=overwrite,
+               ifdsFirst=kwargs.get('ifdsfirst', False),
+               dedup=kwargs.get('dedup', False))
 
 
 def _tiff_dump_tag(tag, taginfo, linePrefix, max, dest=None, max_text=None, ifd=None):
@@ -360,7 +362,8 @@ def tiff_split(source, prefix=None, subifds=False, overwrite=False, **kwargs):
         logger.info('Writing %s', outputPath)
         _apply_flags_to_ifd(ifd, **kwargs)
         write_tiff(ifd, outputPath, allowExisting=overwrite,
-                   ifdsFirst=kwargs.get('ifdsfirst', False))
+                   ifdsFirst=kwargs.get('ifdsfirst', False),
+                   dedup=kwargs.get('dedup', False))
 
 
 def _value_to_types_numeric(results):
@@ -527,7 +530,9 @@ def _tiff_set(source, output=None, setlist=None, unset=None, setfrom=None,
             else:
                 ifd['tags'][int(tag)] = setinfo['ifds'][0]['tags'][int(tag)]
     _apply_flags_to_ifd(info, **kwargs)
-    write_tiff(info, output, allowExisting=overwrite, ifdsFirst=kwargs.get('ifdsfirst', False))
+    write_tiff(info, output, allowExisting=overwrite,
+               ifdsFirst=kwargs.get('ifdsfirst', False),
+               dedup=kwargs.get('dedup', False))
 
 
 def tiff_set(source, output=None, overwrite=False, setlist=None, unset=None,
@@ -599,6 +604,9 @@ def main(args=None):
     }, {
         'args': ('--ifdsfirst', '--ifds-first'),
         'kwargs': dict(action='store_true', help='Store IFDs before their related data.'),
+    }, {
+        'args': ('--dedup', '--deduplicate'),
+        'kwargs': dict(action='store_true', help='Do not repeat identical data.'),
     }, {
         'args': ('--stop-on-warning', '-X'),
         'kwargs': dict(
diff --git a/tifftools/constants.py b/tifftools/constants.py
index 8a08e4d..e1c7be4 100644
--- a/tifftools/constants.py
+++ b/tifftools/constants.py
@@ -28,9 +28,10 @@ def __str__(self):
         return '%d (0x%X)' % (self.value, self.value)
 
     def __getitem__(self, key):
-        if hasattr(self, str(key)):
+        try:
             return getattr(self, str(key))
-        raise KeyError(key)
+        except AttributeError:
+            raise KeyError(key)
 
     def __int__(self):
         return self.value
@@ -61,9 +62,7 @@ def __hash__(self):
         return hash((type(self).__name__, self.value))
 
     def get(self, key, default=None):
-        if hasattr(self, str(key)):
-            return getattr(self, str(key))
-        return default
+        return getattr(self, str(key), default)
 
 
 class TiffTag(TiffConstant):
diff --git a/tifftools/tifftools.py b/tifftools/tifftools.py
index 586bd57..b46cf0f 100755
--- a/tifftools/tifftools.py
+++ b/tifftools/tifftools.py
@@ -1,9 +1,12 @@
 #!/usr/bin/env python3
 
 import functools
+import hashlib
 import logging
 import os
+import shutil
 import struct
+import tempfile
 
 from .constants import Datatype, Tag, get_or_create_tag
 from .exceptions import MustBeBigTiffError, TifftoolsError
@@ -11,6 +14,8 @@
 
 logger = logging.getLogger(__name__)
 
+_DEDUP_HASH_METHOD = 'sha256'
+
 
 def check_offset(filelen, offset, length):
     """
@@ -154,6 +159,7 @@ def read_ifd(tiff, info, ifdOffset, ifdList, tagSet=Tag):
     :param ifdList: a list that this ifd will be appended to.
     :param tagSet: the TiffConstantSet class to use for tags.
     """
+    logger.debug(f'read_ifd: {ifdOffset} (0x{ifdOffset:X})')
     bom = info['endianPack']
     if not check_offset(info['size'], ifdOffset, 16 if info['bigtiff'] else 6):
         return
@@ -269,7 +275,8 @@ def read_ifd_tag_data(tiff, info, ifd, tagSet=Tag):
                         break
 
 
-def write_tiff(ifds, path, bigEndian=None, bigtiff=None, allowExisting=False, ifdsFirst=False):
+def write_tiff(ifds, path, bigEndian=None, bigtiff=None, allowExisting=False,
+               ifdsFirst=False, dedup=False):
     """
     Write a tiff file based on data in a list of ifds.
 
@@ -292,9 +299,17 @@ def write_tiff(ifds, path, bigEndian=None, bigtiff=None, allowExisting=False, if
         just convert to bigtiff, but actually rewrites the file to avoid
         unaccounted bytes in the file.
     :param allowExisting: if False, raise an error if the path already exists.
-    :param ifdsFirst: if True, write IFDs before their respective data.
-        Otherwise, IFDs are written after their data.  IFDs are always adjacent
-        to their data.
+    :param ifdsFirst: if True, write IFDs before their respective data.  When
+        this is not set, data is stored (mixed tag and offset data),(ifd),
+        (mixed tag and offset data),(ifd),...  When it is set, data is stored
+        (ifd),(tag data),(ifd),(tag data),...,(offset data),(offset data),...
+        This is not quite the COG specification, as that requires only the
+        strip or tile offset data to be at the end, and that data to be ordered
+        with the smallest image first, but if there are multiple conceptual
+        images, each one in turn (e.g., level0,level1,level2,...,level0,level1,
+        level2,...,...).
+    :param dedup: if False, all data is written.  If True, data blocks that are
+        identical are only written once.
     """
     if isinstance(ifds, dict):
         bigEndian = ifds.get('bigEndian') if bigEndian is None else bigEndian
@@ -302,30 +317,57 @@ def write_tiff(ifds, path, bigEndian=None, bigtiff=None, allowExisting=False, if
         ifds = ifds.get('ifds', [ifds])
     bigEndian = ifds[0].get('bigEndian', False) if bigEndian is None else bigEndian
     bigtiff = ifds[0].get('bigtiff', False) if bigtiff is None else bigtiff
-    if not allowExisting and not is_filelike_object(path) and os.path.exists(path):
-        raise TifftoolsError('File already exists')
+    finalpath = path
+    if not is_filelike_object(path) and os.path.exists(path):
+        if not allowExisting:
+            raise TifftoolsError('File already exists')
+        with tempfile.NamedTemporaryFile(
+                prefix=os.path.basename(path), dir=os.path.dirname(path)) as temppath:
+            path = temppath.name
     rewriteBigtiff = False
-    with OpenPathOrFobj(path, 'wb') as dest:
-        bom = '>' if bigEndian else '<'
-        header = b'II' if not bigEndian else b'MM'
-        if bigtiff:
-            header += struct.pack(bom + 'HHHQ', 0x2B, 8, 0, 0)
-            ifdPtr = len(header) - 8
-        else:
-            header += struct.pack(bom + 'HL', 0x2A, 0)
-            ifdPtr = len(header) - 4
-        dest.write(header)
-        for ifd in ifds:
-            try:
-                ifdPtr = write_ifd(dest, bom, bigtiff, ifd, ifdPtr, ifdsFirst=ifdsFirst)
-            except MustBeBigTiffError:
-                # This can only be raised if bigtiff is false
-                rewriteBigtiff = True
-                break
-        if rewriteBigtiff:
-            dest.seek(0)
-            dest.truncate(0)
-            write_tiff(ifds, dest, bigEndian, True)
+    try:
+        with OpenPathOrFobj(path, 'wb') as dest:
+            bom = '>' if bigEndian else '<'
+            header = b'II' if not bigEndian else b'MM'
+            if bigtiff:
+                header += struct.pack(bom + 'HHHQ', 0x2B, 8, 0, 0)
+                ifdPtr = len(header) - 8
+            else:
+                header += struct.pack(bom + 'HL', 0x2A, 0)
+                ifdPtr = len(header) - 4
+            dest.write(header)
+            origifdPtr = ifdPtr
+            for datadest, ifddest in _ifdsPass(ifdsFirst, dest):
+                ifdPtr = origifdPtr
+                if bool(dedup):
+                    dedup = {'hashes': {}, 'reused': 0}
+                for ifd in ifds:
+                    try:
+                        ifdPtr = write_ifd(
+                            datadest, ifddest, bom, bigtiff, ifd, ifdPtr,
+                            ifdsFirst=ifdsFirst, dedup=dedup)
+                    except MustBeBigTiffError:
+                        # This can only be raised if bigtiff is false
+                        rewriteBigtiff = True
+                        break
+            if rewriteBigtiff:
+                dest.seek(0)
+                dest.truncate(0)
+                write_tiff(ifds, dest, bigEndian, True, ifdsFirst=ifdsFirst, dedup=bool(dedup))
+            elif dedup and dedup['reused']:
+                logger.info('Deduplication reused %d block(s)', dedup['reused'])
+    except Exception:
+        if path != finalpath:
+            os.unlink(path)
+        raise
+    else:
+        if path != finalpath:
+            # By copying the tempfile to the existing destination, the target
+            # path keeps its inode
+            with open(finalpath, 'r+b') as fdest, open(path, 'rb') as fsrc:
+                fdest.truncate(0)
+                shutil.copyfileobj(fsrc, fdest)
+            os.unlink(path)
 
 
 class _WriteTracker():
@@ -350,7 +392,7 @@ def seek(self, offset, whence=os.SEEK_SET):
             self.pos if whence == os.SEEK_CUR else 0)) + offset
 
 
-def _ifdsPass(ifdsFirst, ifdsPass, origdest, ifddest):
+def _ifdsPass(ifdsFirst, dest):
     """
     To handle writing IFDs before or after their associated data, return a
     pair of pointers to handle writing data.  For writing IFDs after the data,
@@ -362,23 +404,17 @@ def _ifdsPass(ifdsFirst, ifdsPass, origdest, ifddest):
     data.  Lasttly, we write the actual data.
 
     :param ifdsFirst: if True, ifds are written before data.
-    :param ifdsPass: ignored if idsFirst is False, otherwise a pass number in
-        the set of {0, 1, 2}.
-    :param origdest: the original destination I/O pointer.
-    :param ifddest: the most recent ifd I/O pointer.
+    :param dest: the original destination I/O pointer.
+    :yields: the data destination I/O pointer and the ifd destination I/O
+        pointer.
     """
     if not ifdsFirst:
-        return origdest, origdest
-    if ifdsPass == 0:
-        dest = _WriteTracker(origdest.tell())
-        ifddest = _WriteTracker(0)
-    elif ifdsPass == 1:
-        dest = _WriteTracker(origdest.tell() + ifddest.tell())
-        ifddest = origdest
+        yield dest, dest
     else:
-        dest = origdest
         ifddest = _WriteTracker(0)
-    return dest, ifddest
+        yield _WriteTracker(dest.tell()), ifddest
+        yield _WriteTracker(dest.tell() + ifddest.tell()), dest
+        yield dest, _WriteTracker(0)
 
 
 def _adjustTaginfoForNonBigtiff(bigtiff, taginfo):
@@ -407,7 +443,7 @@ def _checkDataForNonBigtiff(bigtiff, data):
     in a uint32 value.
 
     :param bigtiff: True if this is a bigtiff.
-    :param data: an array of integersto check.
+    :param data: an array of integers to check.
     """
     if not bigtiff and any(val for val in data if val >= 0x100000000):
         raise MustBeBigTiffError('The file is large enough it must be in bigtiff format.')
@@ -457,11 +493,13 @@ def _writeDeferredData(bigtiff, bom, dest, ifd, ifdrecord, deferredData):
     return ifdrecord
 
 
-def write_ifd(dest, bom, bigtiff, ifd, ifdPtr, tagSet=Tag, ifdsFirst=False):
+def write_ifd(datadest, ifddest, bom, bigtiff, ifd, ifdPtr, tagSet=Tag,
+              ifdsFirst=False, dedup=False):
     """
     Write an IFD to a TIFF file.  This copies image data from other tiff files.
 
-    :param dest: the open file handle to write.
+    :param datadest: the open file handle to write offset data.
+    :param ifddest: the open file handle to write ifds and tag data.
     :param bom: either '<' or '>' for using struct to encode values based on
         endian.
     :param bigtiff: True if this is a bigtiff.
@@ -472,110 +510,135 @@ def write_ifd(dest, bom, bigtiff, ifd, ifdPtr, tagSet=Tag, ifdsFirst=False):
     :param ifdsFirst: if True, write IFDs before their respective data.
         Otherwise, IFDs are written after their data.  IFDs are always adjacent
         to their data.
+    :param dedup: if False, all data is written.  Otherwise, a dictionary with
+        'hashes' and 'reused', where 'hashes' is a dictionary with keys of
+        hashed data that have been written and values of the offsets where it
+        was written, and 'reused' is a count of data blocks that were
+        deduplicated.
     :return: the ifdPtr for the next ifd that could be written.
     """
     ptrpack = 'Q' if bigtiff else 'L'
     tagdatalen = 8 if bigtiff else 4
-    dest.seek(0, os.SEEK_END)
-    origPos = dest.tell()
-    origdest = ifddest = dest
-    for ifdsPass in range(3 if ifdsFirst else 1):
-        dest, ifddest = _ifdsPass(ifdsFirst, ifdsPass, origdest, ifddest)
-        ifdrecord = struct.pack(bom + ('Q' if bigtiff else 'H'), len(ifd['tags']))
-        subifdPtrs = {}
-        deferredData = {}
-        with OpenPathOrFobj(ifd.get('path_or_fobj', False), 'rb') as src:
-            for tag, taginfo in sorted(ifd['tags'].items()):
-                tag = get_or_create_tag(
-                    tag, tagSet, **({'datatype': Datatype[taginfo['datatype']]}
-                                    if taginfo.get('datatype') else {}))
-                if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8):
-                    data = [0] * len(taginfo['ifds'])
-                    taginfo = taginfo.copy()
-                    taginfo['datatype'] = Datatype.IFD8 if bigtiff else Datatype.IFD
-                else:
-                    data = taginfo['data']
-                count = len(data)
-                if tag.isOffsetData():
-                    taginfo = taginfo.copy()
-                    taginfo['datatype'] = Datatype.LONG8 if bigtiff else Datatype.LONG
-                    if isinstance(tag.bytecounts, str):
-                        if ifdsFirst:
-                            deferredData[int(tagSet[tag.bytecounts])] = {
-                                'tag': tagSet[tag.bytecounts],
-                                'data': ifd['tags'][int(tagSet[tag.bytecounts])]['data'][:],
-                            }
-                            deferredData[int(tag)] = {
-                                'tag': tag,
-                                'data': data[:],
-                                'write': (
-                                    dest, src, data,
-                                    deferredData[int(tagSet[tag.bytecounts])]['data'],
-                                    ifd['size']),
-                                'taginfo': taginfo,
-                            }
-                        else:
-                            data = write_tag_data(
-                                dest, src, data,
-                                ifd['tags'][int(tagSet[tag.bytecounts])]['data'],
-                                ifd['size'])
+    # dest.seek(0, os.SEEK_END)
+    # origPos = dest.tell()
+    # origdest = ifddest = dest
+    nextifdPtr = None
+    ifdrecord = struct.pack(bom + ('Q' if bigtiff else 'H'), len(ifd['tags']))
+    subifdPtrs = {}
+    deferredData = {}
+    ifdpos = ifddest.tell()
+    if ifdsFirst:
+        ifdlen = (
+            len(ifdrecord) + (20 if bigtiff else 12) * len(ifd['tags']) + (8 if bigtiff else 4))
+        ifddest.write(b'\x00' * ifdlen)
+    with OpenPathOrFobj(ifd.get('path_or_fobj', False), 'rb') as src:
+        for tag, taginfo in sorted(ifd['tags'].items()):
+            tag = get_or_create_tag(
+                tag, tagSet, **({'datatype': Datatype[taginfo['datatype']]}
+                                if taginfo.get('datatype') else {}))
+            if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8):
+                data = [0] * len(taginfo['ifds'])
+                taginfo = taginfo.copy()
+                taginfo['datatype'] = Datatype.IFD8 if bigtiff else Datatype.IFD
+            else:
+                data = taginfo['data']
+            count = len(data)
+            if tag.isOffsetData():
+                taginfo = taginfo.copy()
+                taginfo['datatype'] = Datatype.LONG8 if bigtiff else Datatype.LONG
+                if isinstance(tag.bytecounts, str):
+                    if ifdsFirst:
+                        deferredData[int(tagSet[tag.bytecounts])] = {
+                            'tag': tagSet[tag.bytecounts],
+                            'data': ifd['tags'][int(tagSet[tag.bytecounts])]['data'][:],
+                        }
+                        deferredData[int(tag)] = {
+                            'tag': tag,
+                            'data': data[:],
+                            'write': (
+                                datadest, src, data,
+                                deferredData[int(tagSet[tag.bytecounts])]['data'],
+                                ifd['size'], dedup),
+                            'taginfo': taginfo,
+                        }
                     else:
                         data = write_tag_data(
-                            dest, src, data, [tag.bytecounts] * count, ifd['size'])
-                    _checkDataForNonBigtiff(bigtiff, data)
-                _adjustTaginfoForNonBigtiff(bigtiff, taginfo)
-                if Datatype[taginfo['datatype']].pack:
-                    pack = Datatype[taginfo['datatype']].pack
-                    count //= len(pack)
-                    data = struct.pack(bom + pack * count, *data)
-                elif Datatype[taginfo['datatype']] == Datatype.ASCII:
-                    # Handle null-seperated lists
-                    data = (data if isinstance(data, bytes) else data.encode()) + b'\x00'
-                    count = len(data)
+                            ifddest, src, data,
+                            ifd['tags'][int(tagSet[tag.bytecounts])]['data'],
+                            ifd['size'], dedup)
                 else:
-                    data = taginfo['data']
-                tagrecord = struct.pack(bom + 'HH' + ptrpack, tag, taginfo['datatype'], count)
-                if len(data) <= tagdatalen:
-                    if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8):
-                        subifdPtrs[tag] = -(len(ifdrecord) + len(tagrecord))
-                    if int(tag) in deferredData:
-                        deferredData[int(tag)]['ifdoffset'] = len(ifdrecord) + len(tagrecord)
-                    tagrecord += data + b'\x00' * (tagdatalen - len(data))
-                else:
-                    # word alignment
-                    if dest.tell() % 2:
-                        dest.write(b'\x00')
-                    if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8):
-                        subifdPtrs[tag] = dest.tell()
-                    _checkDataForNonBigtiff(bigtiff, [dest.tell()])
-                    tagrecord += struct.pack(bom + ptrpack, dest.tell())
-                    if int(tag) in deferredData:
-                        deferredData[int(tag)]['offset'] = dest.tell()
-                    dest.write(data)
-                ifdrecord += tagrecord
-            ifdrecord = _writeDeferredData(bigtiff, bom, dest, ifd, ifdrecord, deferredData)
-        _checkDataForNonBigtiff(bigtiff, [dest.tell()])
-        pos = dest.tell()
-        # ifds are expected to be on word boundaries
-        if pos % 2:
-            dest.write(b'\x00')
-            pos = dest.tell()
-        dest.seek(ifdPtr)
-        dest.write(struct.pack(bom + ptrpack, origPos if ifdsFirst else pos))
-        dest.seek(0, os.SEEK_END)
-        ifddest.write(ifdrecord)
-        nextifdPtr = dest.tell()
-        ifddest.write(struct.pack(bom + ptrpack, 0))
-    write_sub_ifds(dest, bom, bigtiff, ifd, pos, subifdPtrs, ifdsFirst=ifdsFirst)
+                    data = write_tag_data(
+                        ifddest, src, data, [tag.bytecounts] * count,
+                        ifd['size'], dedup)
+                _checkDataForNonBigtiff(bigtiff, data)
+            _adjustTaginfoForNonBigtiff(bigtiff, taginfo)
+            if Datatype[taginfo['datatype']].pack:
+                pack = Datatype[taginfo['datatype']].pack
+                count //= len(pack)
+                data = struct.pack(bom + pack * count, *data)
+            elif Datatype[taginfo['datatype']] == Datatype.ASCII:
+                # Handle null-separated lists
+                data = (data if isinstance(data, bytes) else data.encode()) + b'\x00'
+                count = len(data)
+            else:
+                data = taginfo['data']
+            tagrecord = struct.pack(bom + 'HH' + ptrpack, tag, taginfo['datatype'], count)
+            if len(data) <= tagdatalen:
+                if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8):
+                    subifdPtrs[tag] = -(len(ifdrecord) + len(tagrecord))
+                if int(tag) in deferredData:
+                    deferredData[int(tag)]['ifdoffset'] = len(ifdrecord) + len(tagrecord)
+                tagrecord += data + b'\x00' * (tagdatalen - len(data))
+            else:
+                # word alignment for tag position
+                if ifddest.tell() % 2:
+                    ifddest.write(b'\x00')
+                h = None
+                tpos = ifddest.tell()
+                if tag.isIFD() or taginfo.get('datatype') in (Datatype.IFD, Datatype.IFD8):
+                    subifdPtrs[tag] = tpos
+                elif dedup:
+                    h = hashlib.new(_DEDUP_HASH_METHOD, data).digest()
+                    if h in dedup['hashes']:
+                        tpos = dedup['hashes'][h]
+                    else:
+                        dedup['hashes'][h] = tpos
+                        h = None
+                _checkDataForNonBigtiff(bigtiff, [tpos])
+                tagrecord += struct.pack(bom + ptrpack, tpos)
+                if int(tag) in deferredData:
+                    deferredData[int(tag)]['offset'] = tpos
+                if not dedup or h is None:
+                    ifddest.write(data)
+            ifdrecord += tagrecord
+        ifdrecord = _writeDeferredData(bigtiff, bom, ifddest, ifd, ifdrecord, deferredData)
+    _checkDataForNonBigtiff(bigtiff, [ifddest.tell(), datadest.tell()])
+    pos = ifddest.tell()
+    # ifds are expected to be on word boundaries
+    if pos % 2:
+        ifddest.write(b'\x00')
+        pos = ifddest.tell()
+    ifddest.seek(ifdPtr)
+    ifddest.write(struct.pack(bom + ptrpack, ifdpos if ifdsFirst else pos))
+    ifddest.seek(ifdpos if ifdsFirst else 0, os.SEEK_SET if ifdsFirst else os.SEEK_END)
+    ifddest.write(ifdrecord)
+    nextifdPtr = ifddest.tell()
+    ifddest.write(struct.pack(bom + ptrpack, 0))
+    ifddest.seek(0, os.SEEK_END)
+    write_sub_ifds(datadest, ifddest, bom, bigtiff, ifd,
+                   ifdpos if ifdsFirst else pos, subifdPtrs,
+                   ifdsFirst=ifdsFirst, dedup=dedup)
     return nextifdPtr
 
 
-def write_sub_ifds(dest, bom, bigtiff, ifd, parentPos, subifdPtrs, tagSet=Tag, ifdsFirst=False):
+def write_sub_ifds(datadest, ifddest, bom, bigtiff, ifd, parentPos, subifdPtrs,
+                   tagSet=Tag, ifdsFirst=False, dedup=False):
     """
     Write any number of SubIFDs to a TIFF file.  These can be based on tags
     other than the SubIFD tag.
 
-    :param dest: the open file handle to write.
+    :param datadest: the open file handle to write offset data.
+    :param ifddest: the open file handle to write ifd and tag data.
     :param bom: eithter '<' or '>' for using struct to encode values based on
         endian.
     :param bigtiff: True if this is a bigtiff.
@@ -591,6 +654,11 @@ def write_sub_ifds(dest, bom, bigtiff, ifd, parentPos, subifdPtrs, tagSet=Tag, i
     :param ifdsFirst: if True, write IFDs before their respective data.
         Otherwise, IFDs are written after their data.  IFDs are always adjacent
         to their data.
+    :param dedup: if False, all data is written.  Otherwise, a dictionary with
+        'hashes' and 'reused', where 'hashes' is a dictionary with keys of
+        hashed data that have been written and values of the offsets where it
+        was written, and 'reused' is a count of data blocks that were
+        deduplicated.
     """
     tagdatalen = 8 if bigtiff else 4
     for tag, subifdPtr in subifdPtrs.items():
@@ -602,12 +670,13 @@ def write_sub_ifds(dest, bom, bigtiff, ifd, parentPos, subifdPtrs, tagSet=Tag, i
             nextSubifdPtr = subifdPtr
             for ifdInSubifd in subifd:
                 nextSubifdPtr = write_ifd(
-                    dest, bom, bigtiff, ifdInSubifd, nextSubifdPtr,
-                    getattr(tag, 'tagset', None), ifdsFirst=ifdsFirst)
+                    datadest, ifddest, bom, bigtiff, ifdInSubifd,
+                    nextSubifdPtr, getattr(tag, 'tagset', None),
+                    ifdsFirst=ifdsFirst, dedup=dedup)
             subifdPtr += tagdatalen
 
 
-def write_tag_data(dest, src, offsets, lengths, srclen):
+def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False):
     """
     Copy data from a source tiff to a destination tiff, return a list of
     offsets where data was written.
@@ -617,6 +686,11 @@ def write_tag_data(dest, src, offsets, lengths, srclen):
     :param offsets: an array of offsets where data will be copied from.
     :param lengths: an array of lengths to copy from each offset.
     :param srclen: the length of the source file.
+    :param dedup: if False, all data is written.  Otherwise, a dictionary with
+        'hashes' and 'reused', where 'hashes' is a dictionary with keys of
+        hashed data that have been written and values of the offsets where it
+        was written, and 'reused' is a count of data blocks that were
+        deduplicated.
     :return: the offsets in the destination file corresponding to the data
         copied.
     """
@@ -633,6 +707,7 @@ def write_tag_data(dest, src, offsets, lengths, srclen):
         offset, idx = offsetList[olidx]
         length = lengths[idx]
         if offset and check_offset(srclen, offset, length):
+            # if a block repeats a previous block, continue the pattern
             if lastOffset == offset and lastLength == length:
                 destOffsets[idx] = destOffsets[lastOffsetIdx]
                 olidx += 1
@@ -640,16 +715,36 @@ def write_tag_data(dest, src, offsets, lengths, srclen):
             lastOffset, lastLength, lastOffsetIdx = offset, length, idx
             src.seek(offset)
             destOffsets[idx] = dest.tell()
+            tells = {'idx': [idx], 'pos': destOffsets[idx], 'offset': offset}
             # Group reads when possible; the biggest overhead is in the actual
             # read call
-            while (olidx + 1 < len(offsetList) and
+            while (not dedup and olidx + 1 < len(offsetList) and
                    offsetList[olidx + 1][0] == offsetList[olidx][0] + lengths[idx] and
                    check_offset(srclen, offsetList[olidx + 1][0],
                                 lengths[offsetList[olidx + 1][1]])):
                 destOffsets[offsetList[olidx + 1][1]] = destOffsets[idx] + lengths[idx]
+                tells['idx'].append(offsetList[olidx + 1][1])
                 olidx += 1
                 offset, idx = offsetList[olidx]
                 length += lengths[idx]
+            if dedup:
+                readlen = length
+                h = hashlib.new(_DEDUP_HASH_METHOD)
+                while readlen:
+                    data = src.read(min(readlen, COPY_CHUNKSIZE))
+                    h.update(data)
+                    readlen -= len(data)
+                h = h.digest()
+                if h in dedup['hashes']:
+                    hpos = dedup['hashes'][h]
+                    for tidx in tells['idx']:
+                        destOffsets[tidx] = destOffsets[tidx] - tells['pos'] + hpos
+                    dedup['reused'] += 1
+                    logger.debug('Deduplication: %d', dedup['reused'])
+                    length = 0
+                else:
+                    dedup['hashes'][h] = tells['pos']
+                    src.seek(tells['offset'])
             while length:
                 data = src.read(min(length, COPY_CHUNKSIZE))
                 dest.write(data)
diff --git a/tox.ini b/tox.ini
index 2a6d609..7c98d6b 100644
--- a/tox.ini
+++ b/tox.ini
@@ -1,7 +1,7 @@
 [tox]
 envlist =
   test-py{38,39,310,311,312}
-  flake8
+  lint
 
 [testenv]
 deps =
@@ -15,7 +15,7 @@ deps =
 commands =
   pytest --cov {envsitepackagesdir}/tifftools {posargs}
 
-[testenv:flake8]
+[testenv:lint]
 basepython = python3
 skipsdist = true
 skip_install = true