openzim · benoit74 · May 9, 2025 · Apr 29, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Added
+
+- New `zim.dedup.Deduplicator` class to handle automatic deduplication of content before adding to the ZIM (#33)
+
 ### Changed
 
 - Upgrade to wombat 3.8.11 (#256)

diff --git a/pyproject.toml b/pyproject.toml
@@ -32,7 +32,9 @@ dependencies = [
   "pillow>=7.0.0,<12.0",
   "urllib3>=1.26.5,<2.4.0",
   "piexif==1.1.3", # this dep is a nightmare in terms of release management, better pinned just like in optimize-images anyway
-  "idna>=2.5,<4.0"
+  "idna>=2.5,<4.0",
+  "xxhash>=2.0,<4.0",
+  "types-xxhash>=2.0,<4.0",
 ]
 dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]
 

diff --git a/src/zimscraperlib/zim/dedup.py b/src/zimscraperlib/zim/dedup.py
@@ -0,0 +1,80 @@
+import pathlib
+import re
+from typing import Any
+
+import xxhash
+from libzim.writer import Hint  # pyright: ignore[reportMissingModuleSource]
+
+from zimscraperlib.zim.creator import Creator
+
+CONTENT_BUFFER_READ_SIZE = 1048576  # 1M
+
+
+class Deduplicator:
+    """Automatically deduplicate potential ZIM items before adding them to the ZIM
+
+    This class computes the digest of every item submitted with a path matching one
+    of its filters, and either adds the entry (if no item with same digest has been
+    added yet) or only an alias to the first item added with the same digest. Items
+    matching no filter are added without any check.
+
+    This class must be configured with filters (compiled regexes appended to the
+    `filters` list) to specify which items paths to consider; no item is considered
+    while the list is empty. It is of course possible to consider all paths with a
+    wide regex or to operate on a subset (e.g. all images) with more precise filters.
+    Item is considered for deduplication if any filter matches the beginning of its
+    path. It is recommended to properly configure these filters to save time /
+    memory by ignoring items known to always differ and / or be too numerous.
+
+    Only the digest and path of the first matching item seen with a given content are
+    stored (in `added_items`). The xxh32 algorithm (https://github.com/Cyan4973/xxHash)
+    is used for its minimal memory and CPU footprint, so the sheer memory consumption
+    will come from the paths we have to keep. Digests are only 32 bits wide and this
+    hash is not meant for security: distinct contents sharing a digest get aliased.
+    """
+
+    def __init__(self, creator: Creator) -> None:
+        self.creator = creator
+        self.filters: list[re.Pattern[str]] = []
+        self.added_items: dict[bytes, str] = {}
+
+    def add_item_for(
+        self,
+        path: str,
+        title: str | None = None,
+        *,
+        fpath: pathlib.Path | None = None,
+        content: bytes | str | None = None,
+        **kwargs: Any,
+    ) -> None:
+        """Add an item at given path, or an alias to an already added identical item
+
+        Paths matching no filter go straight to the creator. Otherwise `content` (if
+        not None, even when empty) or else the file at `fpath` is hashed, and a
+        ValueError is raised when both are None. Aliases only honor kwarg `is_front`.
+        """
+        if any(_filter.match(path) is not None for _filter in self.filters):
+            if content is not None:  # empty content is still a content to hash
+                digest = xxhash.xxh32(
+                    content.encode() if isinstance(content, str) else content
+                ).digest()
+            elif fpath is not None:
+                xxh32 = xxhash.xxh32()
+                with open(fpath, "rb") as f:
+                    while data := f.read(CONTENT_BUFFER_READ_SIZE):  # chunked read
+                        xxh32.update(data)
+                digest = xxh32.digest()
+            else:
+                raise ValueError("Either content or fpath are mandatory")
+
+            if (existing_item := self.added_items.get(digest)) is not None:
+                self.creator.add_alias(
+                    path,
+                    targetPath=existing_item,
+                    title=title or path,
+                    hints={Hint.FRONT_ARTICLE: True} if kwargs.get("is_front") else {},
+                )
+                return
+            self.added_items[digest] = path
+
+        self.creator.add_item_for(path, title, fpath=fpath, content=content, **kwargs)
diff --git a/tests/zim/test_dedup.py b/tests/zim/test_dedup.py
@@ -0,0 +1,107 @@
+import pathlib
+import re
+from typing import Any
+
+import pytest
+
+from zimscraperlib.zim import Archive, Creator
+from zimscraperlib.zim.dedup import Deduplicator
+
+
+def test_deduplicator(
+    tmp_path: pathlib.Path,
+    png_image: pathlib.Path,
+    html_file: pathlib.Path,
+    html_str: str,
+    html_str_cn: str,
+):
+    """Build the same ZIM with and without a Deduplicator and compare both
+
+    Only paths starting with `dedup/` match one of the configured filters, so
+    duplicates located elsewhere are not recorded by the deduplicator.
+    """
+    main_path = "welcome"
+    png_data = png_image.read_bytes()
+
+    # (path, title, keyword arguments) of every item, in insertion order
+    items: list[tuple[str, str | None, dict[str, Any]]] = [
+        ("welcome1", "wel1", {"content": html_str, "is_front": True}),
+        ("welcome2", "wel2", {"content": html_str, "is_front": True}),
+        ("dedup/welcome3", "wel3", {"content": html_str, "is_front": True}),
+        ("dedup/welcome4", "wel4", {"content": html_str, "is_front": True}),
+        ("prefix/dedup/welcome5", "wel5", {"content": html_str, "is_front": True}),
+        ("image1", None, {"fpath": png_image}),
+        ("image2", None, {"content": png_data}),
+        ("dedup/image3", None, {"fpath": png_image}),
+        ("dedup/image4", None, {"content": png_data}),
+        ("dedup/html", None, {"fpath": html_file}),
+        ("dedup/html_cn", None, {"content": html_str_cn}),
+        ("prefix/dedup/image5", None, {"content": png_data}),
+    ]
+
+    def add_items(creator_or_deduplicator: Any):
+        # the very same calls are issued on a Creator and on a Deduplicator
+        for path, title, options in items:
+            creator_or_deduplicator.add_item_for(path, title, **options)
+
+    # reference ZIM: every item is handed directly to the creator
+    fpath_without_dedup = tmp_path / "zim_without_dedup.zim"
+    with Creator(fpath_without_dedup, main_path).config_dev_metadata() as creator:
+        add_items(creator)
+
+    assert fpath_without_dedup.exists()
+
+    # same items, this time added through a Deduplicator
+    fpath_with_dedup = tmp_path / "zim_with_dedup.zim"
+    with Creator(fpath_with_dedup, main_path).config_dev_metadata() as creator:
+        deduplicator = Deduplicator(creator)
+        for pattern in ("^foo/.*$", "^dedup/.*$", "^bar/.*$"):
+            deduplicator.filters.append(re.compile(pattern))
+        add_items(deduplicator)
+
+        # added_items contains only original items, not the duplicates
+        originals = set(deduplicator.added_items.values())
+        assert originals == {"dedup/welcome3", "dedup/image3", "dedup/html_cn"}
+
+    assert fpath_with_dedup.exists()
+
+    # check that deduplication has a consequence on ZIM size
+    size_without_dedup = fpath_without_dedup.lstat().st_size
+    size_with_dedup = fpath_with_dedup.lstat().st_size
+    assert size_without_dedup - size_with_dedup > 3000  # 3291 as of libzim 9.3
+
+    html_paths = (
+        "welcome1",
+        "welcome2",
+        "dedup/welcome3",
+        "dedup/welcome4",
+        "prefix/dedup/welcome5",
+        "dedup/html",
+    )
+    img_paths = (
+        "image1",
+        "image2",
+        "dedup/image3",
+        "dedup/image4",
+        "prefix/dedup/image5",
+    )
+
+    # both ZIMs must expose the same number of entries and the same content
+    for zim_path in (fpath_with_dedup, fpath_without_dedup):
+        reader = Archive(zim_path)
+        assert reader.all_entry_count == 24
+
+        for html_path in html_paths:
+            assert bytes(reader.get_item(html_path).content).decode() == html_str
+        assert bytes(reader.get_item("dedup/html_cn").content).decode() == html_str_cn
+
+        for img_path in img_paths:
+            assert bytes(reader.get_item(img_path).content) == png_data
+
+
+def test_missing_content(tmp_path: pathlib.Path):  # item without data must raise
+    with Creator(tmp_path / "test.zim", "foo").config_dev_metadata() as creator:
+        deduplicator = Deduplicator(creator)
+        deduplicator.filters.append(re.compile(".*"))  # consider every path
+        with pytest.raises(Exception, match="Either content or fpath are mandatory"):
+            deduplicator.add_item_for("welcome", None)  # neither content nor fpath