Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
# Change Log

## Version 1.5.2

### Improvements
- Simplify the deduplication code ([#94](../../pull/94))

## Version 1.5.1

### Improvements
Expand Down
26 changes: 25 additions & 1 deletion tests/test_write_tiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,31 @@ def test_write_with_dedup(tmp_path):
info2 = tifftools.read_tiff(dest2path)
dest3path = tmp_path / 'sample3.tiff'
tifftools.write_tiff(info2, dest3path)
assert open(destpath, 'rb').read() == open(dest3path, 'rb').read()
assert len(open(dest3path, 'rb').read().split(uniqueString)) == 3


def test_write_with_dedup_and_ifdsfirst(tmp_path):
    """Verify dedup=True combined with ifdsFirst=True collapses identical tag data.

    Two tags carry the same byte payload.  A plain write stores the payload
    twice; a deduplicated, IFDs-first write stores it once; and rereading the
    deduplicated file and writing it plainly duplicates it again.

    :param tmp_path: pytest-supplied temporary directory for output files.
    """
    path = os.path.join(os.path.dirname(__file__), 'data', 'good_single.tif')
    info = tifftools.read_tiff(path)
    uniqueString = b'UNIQUESTRING'
    # Attach the identical payload under two distinct (unassigned) tag numbers
    # so the dedup logic has duplicate data to collapse.
    for tagnum in (23456, 23457):
        info['ifds'][0]['tags'][tagnum] = {
            'datatype': tifftools.Datatype.UNDEFINED,
            'data': uniqueString,
        }

    def _occurrences(filepath):
        # Count non-overlapping appearances of the payload; close the file
        # promptly rather than leaking the handle.
        with open(filepath, 'rb') as fptr:
            return fptr.read().count(uniqueString)

    destpath = tmp_path / 'sample.tiff'
    tifftools.write_tiff(info, destpath)
    assert _occurrences(destpath) == 2
    dest2path = tmp_path / 'sample2.tiff'
    tifftools.write_tiff(info, dest2path, dedup=True, ifdsFirst=True)
    assert _occurrences(dest2path) == 1
    info2 = tifftools.read_tiff(dest2path)
    dest3path = tmp_path / 'sample3.tiff'
    tifftools.write_tiff(info2, dest3path)
    assert _occurrences(dest3path) == 2


def test_write_bytecount_data(tmp_path):
Expand Down
42 changes: 20 additions & 22 deletions tifftools/tifftools.py
Original file line number Diff line number Diff line change
Expand Up @@ -705,6 +705,8 @@ def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False):
offsetList = sorted([(offset, idx) for idx, offset in enumerate(offsets)])
olidx = 0
lastOffset = lastLength = lastOffsetIdx = None
blocks = []
desttell = dest.tell()
while olidx < len(offsetList):
offset, idx = offsetList[olidx]
length = lengths[idx]
Expand All @@ -715,27 +717,15 @@ def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False):
olidx += 1
continue
lastOffset, lastLength, lastOffsetIdx = offset, length, idx
src.seek(offset)
destOffsets[idx] = dest.tell()
tells = {'idx': [idx], 'pos': destOffsets[idx], 'offset': offset}
# Group reads when possible; the biggest overhead is in the actual
# read call
while (not dedup and olidx + 1 < len(offsetList) and
offsetList[olidx + 1][0] == offsetList[olidx][0] + lengths[idx] and
check_offset(srclen, offsetList[olidx + 1][0],
lengths[offsetList[olidx + 1][1]])):
destOffsets[offsetList[olidx + 1][1]] = destOffsets[idx] + lengths[idx]
tells['idx'].append(offsetList[olidx + 1][1])
olidx += 1
offset, idx = offsetList[olidx]
length += lengths[idx]
destOffsets[idx] = desttell
if dedup:
hashkey = (hash(getattr(src, 'name', src)), offset)
if hashkey in dedup['hashlog']:
h = dedup['hashlog'][hashkey]
else:
readlen = length
h = hashlib.new(_DEDUP_HASH_METHOD)
src.seek(offset)
while readlen:
data = src.read(min(readlen, COPY_CHUNKSIZE))
h.update(data)
Expand All @@ -744,16 +734,24 @@ def write_tag_data(dest, src, offsets, lengths, srclen, dedup=False):
dedup['hashlog'][hashkey] = h
if h in dedup['hashes']:
hpos = dedup['hashes'][h]
for tidx in tells['idx']:
destOffsets[tidx] = destOffsets[tidx] - tells['pos'] + hpos
destOffsets[idx] = hpos
dedup['reused'] += 1
length = 0
else:
dedup['hashes'][h] = tells['pos']
src.seek(tells['offset'])
while length:
data = src.read(min(length, COPY_CHUNKSIZE))
dest.write(data)
length -= len(data)
dedup['hashes'][h] = destOffsets[idx]
# Group reads when possible; the biggest overhead is in the actual
# read call
if length:
if len(blocks) and offset == blocks[-1][0] + blocks[-1][1]:
blocks[-1] = (blocks[-1][0], blocks[-1][1] + length)
else:
blocks.append((offset, length))
desttell += length
olidx += 1
for offset, length in blocks:
src.seek(offset)
while length:
data = src.read(min(length, COPY_CHUNKSIZE))
dest.write(data)
length -= len(data)
return destOffsets