diff --git a/CHANGELOG.md b/CHANGELOG.md index fcfa4f9..f0e46e9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,10 @@ # Change Log +## Version 1.5.2 + +### Improvements +- Simplify the deduplication code ([#94](../../pull/94)) + ## Version 1.5.1 ### Improvements diff --git a/tests/test_write_tiff.py b/tests/test_write_tiff.py index 3b8859c..ae19c81 100644 --- a/tests/test_write_tiff.py +++ b/tests/test_write_tiff.py @@ -174,7 +174,31 @@ def test_write_with_dedup(tmp_path): info2 = tifftools.read_tiff(dest2path) dest3path = tmp_path / 'sample3.tiff' tifftools.write_tiff(info2, dest3path) - assert open(destpath, 'rb').read() == open(dest3path, 'rb').read() + assert len(open(dest3path, 'rb').read().split(uniqueString)) == 3 + + +def test_write_with_dedup_and_ifdsfirst(tmp_path): + path = os.path.join(os.path.dirname(__file__), 'data', 'good_single.tif') + info = tifftools.read_tiff(path) + uniqueString = b'UNIQUESTRING' + info['ifds'][0]['tags'][23456] = { + 'datatype': tifftools.Datatype.UNDEFINED, + 'data': uniqueString + } + info['ifds'][0]['tags'][23457] = { + 'datatype': tifftools.Datatype.UNDEFINED, + 'data': uniqueString + } + destpath = tmp_path / 'sample.tiff' + tifftools.write_tiff(info, destpath) + assert len(open(destpath, 'rb').read().split(uniqueString)) == 3 + dest2path = tmp_path / 'sample2.tiff' + tifftools.write_tiff(info, dest2path, dedup=True, ifdsFirst=True) + assert len(open(dest2path, 'rb').read().split(uniqueString)) == 2 + info2 = tifftools.read_tiff(dest2path) + dest3path = tmp_path / 'sample3.tiff' + tifftools.write_tiff(info2, dest3path) + assert len(open(dest3path, 'rb').read().split(uniqueString)) == 3 def test_write_bytecount_data(tmp_path): diff --git a/tifftools/tifftools.py b/tifftools/tifftools.py index d0a903c..1f9aea2 100755 --- a/tifftools/tifftools.py +++ b/tifftools/tifftools.py @@ -705,6 +705,8 @@ def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False): offsetList = sorted([(offset, idx) for idx, offset in enumerate(offsets)]) olidx = 0 lastOffset = lastLength = lastOffsetIdx = None + blocks = [] + desttell = dest.tell() while olidx < len(offsetList): offset, idx = offsetList[olidx] length = lengths[idx] @@ -715,20 +717,7 @@ def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False): olidx += 1 continue lastOffset, lastLength, lastOffsetIdx = offset, length, idx - src.seek(offset) - destOffsets[idx] = dest.tell() - tells = {'idx': [idx], 'pos': destOffsets[idx], 'offset': offset} - # Group reads when possible; the biggest overhead is in the actual - # read call - while (not dedup and olidx + 1 < len(offsetList) and - offsetList[olidx + 1][0] == offsetList[olidx][0] + lengths[idx] and - check_offset(srclen, offsetList[olidx + 1][0], - lengths[offsetList[olidx + 1][1]])): - destOffsets[offsetList[olidx + 1][1]] = destOffsets[idx] + lengths[idx] - tells['idx'].append(offsetList[olidx + 1][1]) - olidx += 1 - offset, idx = offsetList[olidx] - length += lengths[idx] + destOffsets[idx] = desttell if dedup: hashkey = (hash(getattr(src, 'name', src)), offset) if hashkey in dedup['hashlog']: @@ -736,6 +725,7 @@ def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False): else: readlen = length h = hashlib.new(_DEDUP_HASH_METHOD) + src.seek(offset) while readlen: data = src.read(min(readlen, COPY_CHUNKSIZE)) h.update(data) @@ -744,16 +734,24 @@ def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False): dedup['hashlog'][hashkey] = h if h in dedup['hashes']: hpos = dedup['hashes'][h] - for tidx in tells['idx']: - destOffsets[tidx] = destOffsets[tidx] - tells['pos'] + hpos + destOffsets[idx] = hpos dedup['reused'] += 1 length = 0 else: - dedup['hashes'][h] = tells['pos'] - src.seek(tells['offset']) - while length: - data = src.read(min(length, COPY_CHUNKSIZE)) - dest.write(data) - length -= len(data) + dedup['hashes'][h] = destOffsets[idx] + # Group reads when possible; the biggest overhead is in the actual + # read call + if length: + if len(blocks) and offset == blocks[-1][0] + blocks[-1][1]: + blocks[-1] = (blocks[-1][0], blocks[-1][1] + length) + else: + blocks.append((offset, length)) + desttell += length olidx += 1 + for offset, length in blocks: + src.seek(offset) + while length: + data = src.read(min(length, COPY_CHUNKSIZE)) + dest.write(data) + length -= len(data) return destOffsets