Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Change Log

## Version 1.5.2

### Improvements
- Simplify the deduplication code ([#94](../../pull/94))

## Version 1.5.1

### Improvements
Expand Down
26 changes: 25 additions & 1 deletion tests/test_write_tiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,31 @@ def test_write_with_dedup(tmp_path):
info2 = tifftools.read_tiff(dest2path)
dest3path = tmp_path / 'sample3.tiff'
tifftools.write_tiff(info2, dest3path)
assert open(destpath, 'rb').read() == open(dest3path, 'rb').read()
assert len(open(dest3path, 'rb').read().split(uniqueString)) == 3


def test_write_with_dedup_and_ifdsfirst(tmp_path):
    """Verify dedup=True combined with ifdsFirst=True collapses identical tag data.

    Two tags carry the same byte payload.  A plain write stores the payload
    twice; a deduplicated, IFDs-first write stores it once; and rereading the
    deduplicated file and writing it plainly duplicates it again.

    :param tmp_path: pytest-supplied temporary directory for output files.
    """
    path = os.path.join(os.path.dirname(__file__), 'data', 'good_single.tif')
    info = tifftools.read_tiff(path)
    uniqueString = b'UNIQUESTRING'
    # Attach the identical payload under two distinct (unassigned) tag numbers
    # so the dedup logic has duplicate data to collapse.
    for tagnum in (23456, 23457):
        info['ifds'][0]['tags'][tagnum] = {
            'datatype': tifftools.Datatype.UNDEFINED,
            'data': uniqueString,
        }

    def _occurrences(filepath):
        # Count non-overlapping appearances of the payload; close the file
        # promptly rather than leaking the handle.
        with open(filepath, 'rb') as fptr:
            return fptr.read().count(uniqueString)

    destpath = tmp_path / 'sample.tiff'
    tifftools.write_tiff(info, destpath)
    assert _occurrences(destpath) == 2
    dest2path = tmp_path / 'sample2.tiff'
    tifftools.write_tiff(info, dest2path, dedup=True, ifdsFirst=True)
    assert _occurrences(dest2path) == 1
    info2 = tifftools.read_tiff(dest2path)
    dest3path = tmp_path / 'sample3.tiff'
    tifftools.write_tiff(info2, dest3path)
    assert _occurrences(dest3path) == 2


def test_write_bytecount_data(tmp_path):
Expand Down
42 changes: 20 additions & 22 deletions tifftools/tifftools.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,8 @@ def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False):
offsetList = sorted([(offset, idx) for idx, offset in enumerate(offsets)])
olidx = 0
lastOffset = lastLength = lastOffsetIdx = None
blocks = []
desttell = dest.tell()
while olidx < len(offsetList):
offset, idx = offsetList[olidx]
length = lengths[idx]
Expand All @@ -715,27 +717,15 @@ def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False):
olidx += 1
continue
lastOffset, lastLength, lastOffsetIdx = offset, length, idx
src.seek(offset)
destOffsets[idx] = dest.tell()
tells = {'idx': [idx], 'pos': destOffsets[idx], 'offset': offset}
# Group reads when possible; the biggest overhead is in the actual
# read call
while (not dedup and olidx + 1 < len(offsetList) and
offsetList[olidx + 1][0] == offsetList[olidx][0] + lengths[idx] and
check_offset(srclen, offsetList[olidx + 1][0],
lengths[offsetList[olidx + 1][1]])):
destOffsets[offsetList[olidx + 1][1]] = destOffsets[idx] + lengths[idx]
tells['idx'].append(offsetList[olidx + 1][1])
olidx += 1
offset, idx = offsetList[olidx]
length += lengths[idx]
destOffsets[idx] = desttell
if dedup:
hashkey = (hash(getattr(src, 'name', src)), offset)
if hashkey in dedup['hashlog']:
h = dedup['hashlog'][hashkey]
else:
readlen = length
h = hashlib.new(_DEDUP_HASH_METHOD)
src.seek(offset)
while readlen:
data = src.read(min(readlen, COPY_CHUNKSIZE))
h.update(data)
Expand All @@ -744,16 +734,24 @@ def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False):
dedup['hashlog'][hashkey] = h
if h in dedup['hashes']:
hpos = dedup['hashes'][h]
for tidx in tells['idx']:
destOffsets[tidx] = destOffsets[tidx] - tells['pos'] + hpos
destOffsets[idx] = hpos
dedup['reused'] += 1
length = 0
else:
dedup['hashes'][h] = tells['pos']
src.seek(tells['offset'])
while length:
data = src.read(min(length, COPY_CHUNKSIZE))
dest.write(data)
length -= len(data)
dedup['hashes'][h] = destOffsets[idx]
# Group reads when possible; the biggest overhead is in the actual
# read call
if length:
if len(blocks) and offset == blocks[-1][0] + blocks[-1][1]:
blocks[-1] = (blocks[-1][0], blocks[-1][1] + length)
else:
blocks.append((offset, length))
desttell += length
olidx += 1
for offset, length in blocks:
src.seek(offset)
while length:
data = src.read(min(length, COPY_CHUNKSIZE))
dest.write(data)
length -= len(data)
return destOffsets