import pathlib
import re
from typing import Any

import xxhash
from libzim.writer import Hint  # pyright: ignore[reportMissingModuleSource]

from zimscraperlib.zim.creator import Creator

CONTENT_BUFFER_READ_SIZE = 1048576  # 1M


class Deduplicator:
    """Automatically deduplicate potential ZIM items before adding them to the ZIM

    This class automatically computes the digest of every item added to the ZIM, and
    either adds the entry (if item is not yet inside the ZIM) or an alias (if an item
    with the same digest has already been added inside the ZIM).

    This class must be configured with filters to specify which item paths to
    consider. It is of course possible to consider all paths (i.e. all items) with a
    wide regex or to operate on a subset (e.g. all images) with more precise filters.
    Item is considered for deduplication if any filter matches (filters are applied
    with `re.Pattern.match`, i.e. anchored at the beginning of the path). It is
    recommended to properly configure these filters to save time / memory by
    automatically ignoring items which are known to always be different and / or be
    too numerous.

    Only the digest and path of items matching the filters are computed and stored.

    The xxh32 algorithm (https://github.com/Cyan4973/xxHash) which is known to be good
    at avoiding collision with minimal memory and CPU footprint is used, so the sheer
    memory consumption will come from the paths we have to keep. This hashing algorithm
    is not meant for security purpose since one might infer original content from
    hashes, but this is not our use case.
    """

    def __init__(self, creator: Creator):
        # Creator which receives the real items and the aliases
        self.creator = creator
        # compiled patterns; a path is deduplicated if any of them matches
        self.filters: list[re.Pattern[str]] = []
        # digest of content => path of the first item added with this content
        self.added_items: dict[bytes, str] = {}

    @staticmethod
    def _compute_digest(
        *,
        fpath: pathlib.Path | None,
        content: bytes | str | None,
    ) -> bytes:
        """Compute the xxh32 digest of `content` if set, of the file at `fpath` otherwise

        Raises:
            ValueError: if neither `content` nor `fpath` is provided
        """
        # `is not None` (and not a truthiness test) so that an empty content is
        # hashed as such instead of being mistaken for a missing one
        if content is not None:
            return xxhash.xxh32(
                content.encode() if isinstance(content, str) else content
            ).digest()

        if not fpath:
            raise ValueError("Either content or fpath are mandatory")

        hasher = xxhash.xxh32()
        with open(fpath, "rb") as fh:
            # read content in chunks to keep memory usage bounded on big files
            while chunk := fh.read(CONTENT_BUFFER_READ_SIZE):
                hasher.update(chunk)
        return hasher.digest()

    def add_item_for(
        self,
        path: str,
        title: str | None = None,
        *,
        fpath: pathlib.Path | None = None,
        content: bytes | str | None = None,
        **kwargs: Any,
    ) -> None:
        """Add an item at given path or an alias

        If `path` matches one of the filters and an item with the same digest has
        already been added through this deduplicator, an alias pointing to that
        first item is created instead of a new item. In all other cases, the call
        is forwarded as-is to the creator's `add_item_for`.

        Arguments:
            path: path of the entry in the ZIM
            title: title of the entry; aliases fall back to `path` when not set
            fpath: path of the file holding the content (used if `content` is None)
            content: content of the item
            kwargs: passed to the creator's `add_item_for`; only `is_front` is
              honored when an alias is created

        Raises:
            ValueError: if `path` matches a filter and neither `content` nor `fpath`
              is provided
        """
        if any(_filter.match(path) is not None for _filter in self.filters):
            digest = self._compute_digest(fpath=fpath, content=content)

            existing_path = self.added_items.get(digest)
            if existing_path is not None:
                self.creator.add_alias(
                    path,
                    targetPath=existing_path,
                    title=title or path,
                    hints={Hint.FRONT_ARTICLE: True} if kwargs.get("is_front") else {},
                )
                return

            # first time we see this content: remember where it lives
            self.added_items[digest] = path

        self.creator.add_item_for(path, title, fpath=fpath, content=content, **kwargs)
import pathlib
import re
from typing import Any

import pytest

from zimscraperlib.zim import Archive, Creator
from zimscraperlib.zim.dedup import Deduplicator


def test_deduplicator(
    tmp_path: pathlib.Path,
    png_image: pathlib.Path,
    html_file: pathlib.Path,
    html_str: str,
    html_str_cn: str,
):
    """Build the same ZIM with and without deduplication and compare both

    Both ZIMs must expose the same entries with the same content, the deduplicated
    one being smaller and only remembering the first item of each content.
    """
    main_path = "welcome"
    png_data = png_image.read_bytes()

    def add_items(creator_or_deduplicator: Any):
        """Add the very same sequence of items to a Creator or a Deduplicator"""
        add = creator_or_deduplicator.add_item_for

        # order matters: the first item matching a filter becomes the original
        for item_path, item_title in (
            ("welcome1", "wel1"),
            ("welcome2", "wel2"),
            ("dedup/welcome3", "wel3"),
            ("dedup/welcome4", "wel4"),
            ("prefix/dedup/welcome5", "wel5"),
        ):
            add(item_path, item_title, content=html_str, is_front=True)

        add("image1", None, fpath=png_image)
        add("image2", None, content=png_data)
        add("dedup/image3", None, fpath=png_image)
        add("dedup/image4", None, content=png_data)
        add("dedup/html", None, fpath=html_file)
        add("dedup/html_cn", None, content=html_str_cn)
        add("prefix/dedup/image5", None, content=png_data)

    def read_item(archive: Any, item_path: str) -> bytes:
        """Return raw content of the item at given path"""
        return bytes(archive.get_item(item_path).content)

    # reference ZIM, every item is added straight to the creator
    fpath_without_dedup = tmp_path / "zim_without_dedup.zim"
    with Creator(fpath_without_dedup, main_path).config_dev_metadata() as creator:
        add_items(creator)

    assert fpath_without_dedup.exists()

    # same items, going through a deduplicator watching only some prefixes
    fpath_with_dedup = tmp_path / "zim_with_dedup.zim"
    with Creator(fpath_with_dedup, main_path).config_dev_metadata() as creator:
        deduplicator = Deduplicator(creator)
        for pattern in ("^foo/.*$", "^dedup/.*$", "^bar/.*$"):
            deduplicator.filters.append(re.compile(pattern))
        add_items(deduplicator)

        # added_items contains only original items, not the duplicates
        original_paths = set(deduplicator.added_items.values())
        assert original_paths == {
            "dedup/welcome3",
            "dedup/image3",
            "dedup/html_cn",
        }

    assert fpath_with_dedup.exists()

    # check that deduplication has a consequence on ZIM size
    size_saved = fpath_without_dedup.lstat().st_size - fpath_with_dedup.lstat().st_size
    assert size_saved > 3000  # 3291 as of libzim 9.3

    html_paths = (
        "welcome1",
        "welcome2",
        "dedup/welcome3",
        "dedup/welcome4",
        "prefix/dedup/welcome5",
        "dedup/html",
    )
    img_paths = (
        "image1",
        "image2",
        "dedup/image3",
        "dedup/image4",
        "prefix/dedup/image5",
    )

    # both ZIMs must be indistinguishable from a reader perspective
    for zim_path in (fpath_with_dedup, fpath_without_dedup):
        reader = Archive(zim_path)

        assert reader.all_entry_count == 24

        for html_path in html_paths:
            assert read_item(reader, html_path).decode() == html_str
        assert read_item(reader, "dedup/html_cn").decode() == html_str_cn

        for img_path in img_paths:
            assert read_item(reader, img_path) == png_data


def test_missing_content(tmp_path: pathlib.Path):
    """Adding a filtered item without content nor fpath must be rejected"""
    zim_creator = Creator(tmp_path / "test.zin", "foo").config_dev_metadata()
    with zim_creator as creator:
        dedup = Deduplicator(creator)
        # match-all filter so that the digest computation is attempted
        dedup.filters.append(re.compile(".*"))
        with pytest.raises(Exception, match="Either content or fpath are mandatory"):
            dedup.add_item_for("welcome", None)