Skip to content

Commit 001108d

Browse files
authored
Merge pull request #261 from openzim/dedup_zim_items
Add utility to deduplicate ZIM items and replace them with redirects at ZIM creation time
2 parents dc21be3 + f86b79c commit 001108d

File tree

4 files changed

+194
-1
lines changed

4 files changed

+194
-1
lines changed

CHANGELOG.md

+4
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- New `zim.dedup.Deduplicator` class to handle automatic deduplication of content before adding to the ZIM (#33)
13+
1014
### Changed
1115

1216
- Upgrade to wombat 3.8.11 (#256)

pyproject.toml

+3-1
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,9 @@ dependencies = [
3232
"pillow>=7.0.0,<12.0",
3333
"urllib3>=1.26.5,<2.4.0",
3434
"piexif==1.1.3", # this dep is a nightmare in terms of release management, better pinned just like in optimize-images anyway
35-
"idna>=2.5,<4.0"
35+
"idna>=2.5,<4.0",
36+
"xxhash>=2.0,<4.0",
37+
"types-xxhash>=2.0,<4.0",
3638
]
3739
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
3840

src/zimscraperlib/zim/dedup.py

+80
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import pathlib
2+
import re
3+
from typing import Any
4+
5+
import xxhash
6+
from libzim.writer import Hint # pyright: ignore[reportMissingModuleSource]
7+
8+
from zimscraperlib.zim.creator import Creator
9+
10+
CONTENT_BUFFER_READ_SIZE = 1024 * 1024  # bytes read per chunk when hashing a file (1 MiB)
11+
12+
13+
class Deduplicator:
    """Automatically deduplicate potential ZIM items before adding them to the ZIM

    This class automatically computes the digest of every item added to the ZIM, and
    either adds the entry (if the item is not yet inside the ZIM) or an alias (if an
    item with the same digest has already been added inside the ZIM).

    This class must be configured with filters to specify which item paths to
    consider. It is of course possible to consider all paths (i.e. all items) with a
    wide regex or to operate on a subset (e.g. all images) with more precise filters.
    An item is considered for deduplication if any filter matches (filters are applied
    with `re.Pattern.match`, i.e. anchored at the beginning of the path). It is
    recommended to properly configure these filters to save time / memory by
    automatically ignoring items which are known to always be different and / or be
    too numerous.

    Only the digest and path of items matching the filters are computed and stored.

    The xxh32 algorithm (https://github.com/Cyan4973/xxHash) is used for its minimal
    memory and CPU footprint, so the sheer memory consumption will come from the paths
    we have to keep. This hashing algorithm is not meant for security purposes, but
    this is not our use case.

    NOTE(review): xxh32 digests are 32 bits wide; two different contents sharing a
    digest would be silently aliased to one another. Keep filters narrow when the
    number of candidate items is very large.
    """

    def __init__(self, creator: Creator):
        # creator receiving the items and aliases
        self.creator = creator
        # an item is a deduplication candidate when any of these matches its path
        self.filters: list[re.Pattern[str]] = []
        # digest of content => path of the first item added with that content
        self.added_items: dict[bytes, str] = {}

    def _compute_digest(
        self, *, fpath: pathlib.Path | None, content: bytes | str | None
    ) -> bytes:
        """Return the xxh32 digest of content when set, of the file at fpath otherwise

        Raises:
            ValueError: neither content nor fpath has been passed
        """
        # compare with None (not truthiness) so that explicitly passed empty content
        # is hashed as such instead of being considered missing
        if content is not None:
            return xxhash.xxh32(
                content.encode() if isinstance(content, str) else content
            ).digest()
        if fpath is None:
            raise ValueError("Either content or fpath are mandatory")
        hasher = xxhash.xxh32()
        with open(fpath, "rb") as fh:
            # read content in chunks so that big files are never fully held in memory
            while chunk := fh.read(CONTENT_BUFFER_READ_SIZE):
                hasher.update(chunk)
        return hasher.digest()

    def add_item_for(
        self,
        path: str,
        title: str | None = None,
        *,
        fpath: pathlib.Path | None = None,
        content: bytes | str | None = None,
        **kwargs: Any,
    ) -> None:
        """Add an item at given path or an alias

        When path matches one of the filters, the digest of the content is computed:
        - if an item with the same digest has already been added through this
          deduplicator, an alias to that first item is created at path (title falls
          back to path, and the FRONT_ARTICLE hint is set when `is_front` is truthy
          in kwargs; other kwargs are not used for aliases)
        - otherwise the path is recorded for that digest and the item is added

        When path does not match any filter, the item is added as-is, with all
        arguments forwarded to the creator.

        Raises:
            ValueError: path matches a filter and neither content nor fpath is passed
        """
        if any(_filter.match(path) is not None for _filter in self.filters):
            digest = self._compute_digest(fpath=fpath, content=content)
            existing_item = self.added_items.get(digest)
            if existing_item is not None:
                self.creator.add_alias(
                    path,
                    targetPath=existing_item,
                    title=title or path,
                    hints={Hint.FRONT_ARTICLE: True} if kwargs.get("is_front") else {},
                )
                return
            # first time this content is seen: future duplicates will point to it
            self.added_items[digest] = path

        self.creator.add_item_for(path, title, fpath=fpath, content=content, **kwargs)

tests/zim/test_dedup.py

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import pathlib
2+
import re
3+
from typing import Any
4+
5+
import pytest
6+
7+
from zimscraperlib.zim import Archive, Creator
8+
from zimscraperlib.zim.dedup import Deduplicator
9+
10+
11+
def test_deduplicator(
    tmp_path: pathlib.Path,
    png_image: pathlib.Path,
    html_file: pathlib.Path,
    html_str: str,
    html_str_cn: str,
):
    """Build the same ZIM with and without a Deduplicator and compare both

    Both ZIMs must expose the same entries with the same content; the deduplicated
    one must be smaller and must have recorded only the first item of every
    distinct content found under the `dedup/` prefix.
    """
    main_path = "welcome"
    png_data = png_image.read_bytes()

    # paths of items sharing the html_str content, added with a title and is_front
    front_html_items = (
        ("welcome1", "wel1"),
        ("welcome2", "wel2"),
        ("dedup/welcome3", "wel3"),
        ("dedup/welcome4", "wel4"),
        ("prefix/dedup/welcome5", "wel5"),
    )

    def add_items(creator_or_deduplicator: Any):
        """Add the very same set of items through a Creator or a Deduplicator"""
        add = creator_or_deduplicator.add_item_for
        for item_path, item_title in front_html_items:
            add(item_path, item_title, content=html_str, is_front=True)
        add("image1", None, fpath=png_image)
        add("image2", None, content=png_data)
        add("dedup/image3", None, fpath=png_image)
        add("dedup/image4", None, content=png_data)
        add("dedup/html", None, fpath=html_file)
        add("dedup/html_cn", None, content=html_str_cn)
        add("prefix/dedup/image5", None, content=png_data)

    # reference ZIM: everything is added directly through the Creator
    fpath_without_dedup = tmp_path / "zim_without_dedup.zim"
    with Creator(fpath_without_dedup, main_path).config_dev_metadata() as creator:
        add_items(creator)

    assert fpath_without_dedup.exists()

    # deduplicated ZIM: only paths starting with `dedup/` match one of the filters
    fpath_with_dedup = tmp_path / "zim_with_dedup.zim"
    with Creator(fpath_with_dedup, main_path).config_dev_metadata() as creator:
        deduplicator = Deduplicator(creator)
        for pattern in ("^foo/.*$", "^dedup/.*$", "^bar/.*$"):
            deduplicator.filters.append(re.compile(pattern))
        add_items(deduplicator)

        # added_items contains only original items, not the duplicates
        originals = set(deduplicator.added_items.values())
        assert originals == {"dedup/welcome3", "dedup/image3", "dedup/html_cn"}

    assert fpath_with_dedup.exists()

    # check that deduplication has a consequence on ZIM size
    size_saved = fpath_without_dedup.lstat().st_size - fpath_with_dedup.lstat().st_size
    assert size_saved > 3000  # 3291 as of libzim 9.3

    html_paths = [item_path for item_path, _ in front_html_items] + ["dedup/html"]
    img_paths = [
        "image1",
        "image2",
        "dedup/image3",
        "dedup/image4",
        "prefix/dedup/image5",
    ]

    # both ZIMs must be indistinguishable from a reader perspective
    for zim_path in [fpath_with_dedup, fpath_without_dedup]:
        reader = Archive(zim_path)

        assert reader.all_entry_count == 24

        for html_path in html_paths:
            assert bytes(reader.get_item(html_path).content).decode() == html_str
        assert bytes(reader.get_item("dedup/html_cn").content).decode() == html_str_cn

        for img_path in img_paths:
            assert bytes(reader.get_item(img_path).content) == png_data
100+
101+
102+
def test_missing_content(tmp_path: pathlib.Path):
    """Adding a filtered item with neither content nor fpath must raise"""
    with Creator(tmp_path / "test.zim", "foo").config_dev_metadata() as creator:
        deduplicator = Deduplicator(creator)
        # match every path so that the digest computation is attempted
        deduplicator.filters.append(re.compile(".*"))
        with pytest.raises(Exception, match="Either content or fpath are mandatory"):
            deduplicator.add_item_for("welcome", None)

0 commit comments

Comments
 (0)