Skip to content

Add utility to deduplicate ZIM items and replace them with redirects at ZIM creation time #261

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Added

- New `zim.dedup.Deduplicator` class to handle automatic deduplication of content before adding to the ZIM (#33)

### Changed

- Upgrade to wombat 3.8.11 (#256)
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,9 @@ dependencies = [
"pillow>=7.0.0,<12.0",
"urllib3>=1.26.5,<2.4.0",
"piexif==1.1.3", # this dep is a nightmare in terms of release management, better pinned just like in optimize-images anyway
"idna>=2.5,<4.0"
"idna>=2.5,<4.0",
"xxhash>=2.0,<4.0",
"types-xxhash>=2.0,<4.0",
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

Expand Down
80 changes: 80 additions & 0 deletions src/zimscraperlib/zim/dedup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import pathlib
import re
from typing import Any

import xxhash
from libzim.writer import Hint # pyright: ignore[reportMissingModuleSource]

from zimscraperlib.zim.creator import Creator

# Chunk size (in bytes) used when reading a file to compute its digest
CONTENT_BUFFER_READ_SIZE = 1024 * 1024  # 1 MiB


class Deduplicator:
    """Automatically deduplicate potential ZIM items before adding them to the ZIM

    This class automatically computes the digest of every item added to the ZIM, and
    either adds the entry (if item is not yet inside the ZIM) or an alias (if an item
    with the same digest has already been added inside the ZIM).

    This class must be configured with filters to specify which item paths to
    consider. It is of course possible to consider all paths (i.e. all items) with a
    wide regex or to operate on a subset (e.g. all images) with more precise filters.
    Item is considered for deduplication if any filter matches. It is recommended to
    properly configure these filters to save time / memory by automatically ignoring
    items which are known to always be different and / or be too numerous.

    Only the digest and path of items matching the filters are computed and stored.

    The xxh32 algorithm (https://github.com/Cyan4973/xxHash) which is known to be good
    at avoiding collision with minimal memory and CPU footprint is used, so the sheer
    memory consumption will come from the paths we have to keep. This hashing algorithm
    is not meant for security purpose since one might infer original content from
    hashes, but this is not our use case.
    """

    # NOTE(review): xxh32 digests are only 32 bits wide. Two different contents
    # sharing a digest would silently be aliased to each other; consider a wider
    # digest (e.g. xxh64 / xxh3_128) if filters may match a very large number of
    # items.

    def __init__(self, creator: Creator):
        # Creator receiving the real items and the aliases
        self.creator = creator
        # Patterns tested with `match()`, i.e. anchored at the beginning of the path;
        # an item is considered for deduplication as soon as one pattern matches
        self.filters: list[re.Pattern[str]] = []
        # Content digest => path of the first item added with this content
        self.added_items: dict[bytes, str] = {}

    @staticmethod
    def _compute_digest(
        fpath: pathlib.Path | None, content: bytes | str | None
    ) -> bytes:
        """Return the xxh32 digest of `content` if given, else of the file at `fpath`

        Raises ValueError when neither content nor fpath is provided.
        """
        # Explicit None check: empty content ("" or b"") is legitimate content and
        # must be hashed as such, not reported as missing.
        # NOTE(review): when both are passed, content is hashed; this assumes the
        # Creator also gives precedence to content over fpath -- confirm.
        if content is not None:
            return xxhash.xxh32(
                content.encode() if isinstance(content, str) else content
            ).digest()

        if fpath is None:
            raise ValueError("Either content or fpath are mandatory")

        hasher = xxhash.xxh32()
        with open(fpath, "rb") as f:
            # read content in chunks to keep memory usage bounded on big files
            while data := f.read(CONTENT_BUFFER_READ_SIZE):
                hasher.update(data)
        return hasher.digest()

    def add_item_for(
        self,
        path: str,
        title: str | None = None,
        *,
        fpath: pathlib.Path | None = None,
        content: bytes | str | None = None,
        **kwargs: Any,
    ):
        """Add an item at given path or an alias

        If `path` matches none of the configured filters, the call is forwarded
        untouched to the creator's `add_item_for`.

        Otherwise the content digest is computed. If an item with the same digest
        has already been added through this deduplicator, an alias pointing to it
        is created at `path` (titled `title`, or `path` when no title is given, and
        flagged as front article when `is_front` is passed truthy in kwargs).
        If not, the digest is recorded and the item is added normally.

        Raises ValueError if the path matches a filter and neither `content` nor
        `fpath` is provided.
        """
        if any(_filter.match(path) is not None for _filter in self.filters):
            digest = self._compute_digest(fpath, content)

            existing_item = self.added_items.get(digest)
            if existing_item is not None:
                self.creator.add_alias(
                    path,
                    targetPath=existing_item,
                    title=title or path,
                    hints={Hint.FRONT_ARTICLE: True} if kwargs.get("is_front") else {},
                )
                return

            # first time this content is seen: remember where it lives
            self.added_items[digest] = path

        self.creator.add_item_for(path, title, fpath=fpath, content=content, **kwargs)
107 changes: 107 additions & 0 deletions tests/zim/test_dedup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import pathlib
import re
from typing import Any

import pytest

from zimscraperlib.zim import Archive, Creator
from zimscraperlib.zim.dedup import Deduplicator


def test_deduplicator(
    tmp_path: pathlib.Path,
    png_image: pathlib.Path,
    html_file: pathlib.Path,
    html_str: str,
    html_str_cn: str,
):
    """Build the same ZIM with and without deduplication and compare them

    Both ZIMs must expose the same entries with the same content, but the
    deduplicated one must be smaller and must only have recorded the first item
    of each distinct content among the paths matching the filters.
    """
    main_path = "welcome"
    png_data = png_image.read_bytes()

    # (path, title, keyword arguments) of every item, in insertion order
    front_html: dict[str, Any] = {"content": html_str, "is_front": True}
    entries: list[tuple[str, str | None, dict[str, Any]]] = [
        ("welcome1", "wel1", front_html),
        ("welcome2", "wel2", front_html),
        ("dedup/welcome3", "wel3", front_html),
        ("dedup/welcome4", "wel4", front_html),
        ("prefix/dedup/welcome5", "wel5", front_html),
        ("image1", None, {"fpath": png_image}),
        ("image2", None, {"content": png_data}),
        ("dedup/image3", None, {"fpath": png_image}),
        ("dedup/image4", None, {"content": png_data}),
        ("dedup/html", None, {"fpath": html_file}),
        ("dedup/html_cn", None, {"content": html_str_cn}),
        ("prefix/dedup/image5", None, {"content": png_data}),
    ]

    def add_items(creator_or_deduplicator: Any):
        # works with both objects since they share the `add_item_for` signature
        for item_path, item_title, item_kwargs in entries:
            creator_or_deduplicator.add_item_for(item_path, item_title, **item_kwargs)

    # reference ZIM: every item is stored as a real entry
    plain_zim = tmp_path / "zim_without_dedup.zim"
    with Creator(plain_zim, main_path).config_dev_metadata() as creator:
        add_items(creator)

    assert plain_zim.exists()

    # deduplicated ZIM: only paths starting with foo/, dedup/ or bar/ are considered
    dedup_zim = tmp_path / "zim_with_dedup.zim"
    with Creator(dedup_zim, main_path).config_dev_metadata() as creator:
        deduplicator = Deduplicator(creator)
        deduplicator.filters.extend(
            re.compile(pattern) for pattern in ("^foo/.*$", "^dedup/.*$", "^bar/.*$")
        )
        add_items(deduplicator)

        # added_items contains only original items, not the duplicates
        recorded_paths = set(deduplicator.added_items.values())
        assert recorded_paths == {
            "dedup/welcome3",
            "dedup/image3",
            "dedup/html_cn",
        }

    assert dedup_zim.exists()

    # check that deduplication has a consequence on ZIM size
    saved_bytes = plain_zim.lstat().st_size - dedup_zim.lstat().st_size
    assert saved_bytes > 3000  # 3291 as of libzim 9.3

    html_paths = (
        "welcome1",
        "welcome2",
        "dedup/welcome3",
        "dedup/welcome4",
        "prefix/dedup/welcome5",
        "dedup/html",
    )
    image_paths = (
        "image1",
        "image2",
        "dedup/image3",
        "dedup/image4",
        "prefix/dedup/image5",
    )

    # content must be readable at every path, be it a real entry or an alias
    for zim_path in (dedup_zim, plain_zim):
        reader = Archive(zim_path)

        assert reader.all_entry_count == 24

        for html_path in html_paths:
            assert bytes(reader.get_item(html_path).content).decode() == html_str
        assert bytes(reader.get_item("dedup/html_cn").content).decode() == html_str_cn

        for img_path in image_paths:
            assert bytes(reader.get_item(img_path).content) == png_data


def test_missing_content(tmp_path: pathlib.Path):
    """Adding an item matching a filter without content nor fpath must fail"""
    with Creator(tmp_path / "test.zim", "foo").config_dev_metadata() as creator:
        deduplicator = Deduplicator(creator)
        # match every path so that the digest computation is always attempted
        deduplicator.filters.append(re.compile(".*"))
        with pytest.raises(Exception, match="Either content or fpath are mandatory"):
            deduplicator.add_item_for("welcome", None)