Commit
deduplication: account for all punct chars in tokenization (#783)
* Fixing tokenizer not stripping "Ideographic Full Stop" (Chinese full stop)

* Bolder fix with Unicode category stripping

* Implementing the new implementation as a fallback

* Removing reformats

* Creating function for duplicate sample length code and adding tests for sample_tokens

* Removing reformats

* Removing redundant comments

* Removing sys dependency in deduplication and making translate table static
reinoldus authored Feb 8, 2025
1 parent fbdffe3 commit 139dfd6
Showing 2 changed files with 88 additions and 7 deletions.
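For context, a minimal sketch of the bug being fixed (not part of the commit, standard library only): str.strip(string.punctuation) knows only ASCII punctuation, so CJK marks such as the ideographic full stop survive, while a Unicode category lookup identifies them as punctuation.

import string
import unicodedata

text = "这是一个测试。"
# string.punctuation is ASCII-only, so the ideographic full stop survives:
print(text.strip(string.punctuation))  # 这是一个测试。
# a Unicode category check, as used by the new translation table, catches it:
print(unicodedata.category("。"))  # Po (punctuation, other)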
57 changes: 57 additions & 0 deletions tests/deduplication_tests.py
@@ -24,6 +24,24 @@ def test_hashes():
assert generate_hash_filename(content) == "42LNugG3Sc95646i"


def test_content_fingerprint():
"Test content fingerprint generation for different types of text"
# Test regular Latin text
text = "Hello world! This is a test string with some numbers 123."
fingerprint = trafilatura.deduplication.content_fingerprint(text)
assert fingerprint == "5efdce9f2b554683"

# Test Chinese text
chinese_text = "这是一个测试。我们在测试中文。"
chinese_fingerprint = trafilatura.deduplication.content_fingerprint(chinese_text)
assert chinese_fingerprint == "ff377edee6edfb78"

# Test mixed text
mixed_text = "Hello世界。This is混合文本!"
mixed_fingerprint = trafilatura.deduplication.content_fingerprint(mixed_text)
assert mixed_fingerprint == "24979dc6c8a26a5"


def test_simhash():
"Test similarity calculation based on Simhash class."
@@ -122,8 +140,47 @@ def test_dedup():
assert trafilatura.htmlprocessing.process_node(my_p, options) is None


def test_sample_tokens(monkeypatch):
"Test token sampling functions including fallback for non-latin text"

call_counter = {'fallback': 0, 'main': 0}
original_fallback = trafilatura.deduplication.sample_tokens_fallback

def spy_fallback(*args, **kwargs):
call_counter['fallback'] += 1
return original_fallback(*args, **kwargs)

monkeypatch.setattr(trafilatura.deduplication, 'sample_tokens_fallback', spy_fallback)

# Test regular text
text = "Hello world! This is a test string with some numbers 123."
tokens = trafilatura.deduplication.sample_tokens(text)
assert len(tokens) > 0
assert "Hello" in tokens
assert "world" in tokens
assert "123" in tokens
assert call_counter['fallback'] == 0, "Fallback shouldn't be called for Latin character text"

# Test Chinese text with Chinese punctuation
chinese_text = "这是一个测试。我们在测试中文。"
tokens = trafilatura.deduplication.sample_tokens(chinese_text)
assert len(tokens) == 2
assert "这是一个测试" in tokens
assert "我们在测试中文" in tokens
assert call_counter['fallback'] == 1, "Fallback should be called for Chinese text"

# Test mixed text using the default sample tokens method
mixed_text = "Hello世界。This is混合文本!"
tokens = trafilatura.deduplication.sample_tokens(mixed_text)
assert len(tokens) == 1
assert 'is混合文本' in tokens
assert call_counter['fallback'] == 1, "Fallback shouldn't be called due to blank"


if __name__ == "__main__":
test_hashes()
test_simhash()
test_lrucache()
test_dedup()
    test_sample_tokens(pytest.MonkeyPatch())  # the test needs the fixture; instantiated directly (pytest >= 6.2, assuming pytest is imported)
test_content_fingerprint()
38 changes: 31 additions & 7 deletions trafilatura/deduplication.py
@@ -5,24 +5,25 @@

import re
import string
import unicodedata

from difflib import SequenceMatcher
from functools import lru_cache
from hashlib import blake2b
from operator import add
from threading import RLock
from typing import Any, Dict, List, Optional, Union

from lxml.etree import _Element

from .settings import LRU_SIZE
from .utils import trim


STRIP_EXTENSION = re.compile(r"\.[^/?#]{2,63}$")

BIN_COUNT_FUNC = getattr(int, "bit_count", lambda x: bin(x).count("1"))

# Static translation table mapping every Unicode punctuation character
# (categories starting with "P") to a space, built once at import time,
# e.g. "Hello世界。".translate(PUNCT_TBL) == "Hello世界 "
PUNCT_TBL = str.maketrans({i: ' ' for i in range(0x110000) if unicodedata.category(chr(i)).startswith('P')})


@lru_cache(maxsize=1024)
def is_similar_domain(reference: str, new_string: str, threshold: float = 0.5) -> bool:
@@ -32,6 +33,28 @@ def is_similar_domain(reference: str, new_string: str, threshold: float = 0.5) -> bool:
return SequenceMatcher(None, reference, new_string).ratio() >= threshold


def _get_sample_by_length(tokens: List[str], target_length: int) -> List[str]:
    """Sample tokens by length, relaxing the minimum token length step by step
    (> 4 chars down to > 0) until the sample holds at least half of target_length."""
    sample = []
    for i in range(4, -1, -1):
        sample = [t for t in tokens if len(t) > i]
        if len(sample) >= target_length / 2:
            return sample
    return sample


def sample_tokens_fallback(inputstring: str, length: int = 64) -> List[str]:
    """Fallback used when the primary sample_tokens function produces an empty
    token list. This is mostly relevant for languages such as Mandarin which
    use non-Latin punctuation, e.g. 。"""
    # Replace all punctuation with spaces using the static translation table
    clean_text = inputstring.translate(PUNCT_TBL)
    tokens = [t for t in clean_text.split() if t.isalnum()]
    return _get_sample_by_length(tokens, length)


def sample_tokens(inputstring: str, length: int = 64) -> List[str]:
"""Split input into list of tokens and adjust length threshold to make sure
there is enough data."""
@@ -40,11 +63,12 @@ def sample_tokens(inputstring: str, length: int = 64) -> List[str]:
        token = token.strip(string.punctuation)
        if token.isalnum():
            tokens.append(token)
-    sample = []
-    for i in range(4, -1, -1):
-        sample = [t for t in tokens if len(t) > i]
-        if len(sample) >= length / 2:
-            return sample

    sample = _get_sample_by_length(tokens, length)

    if len(sample) == 0:
        return sample_tokens_fallback(inputstring, length)

    return sample


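As a usage sketch (not part of the commit), the new fallback path can be exercised as follows; the expected values match the assertions in the test suite above.

from trafilatura.deduplication import sample_tokens

# Latin-script text is served by the primary whitespace tokenizer:
print(sample_tokens("Hello world! This is a test string with some numbers 123."))
# Chinese text contains no spaces: the primary pass yields no tokens,
# so the Unicode-category fallback splits on 。 instead:
print(sample_tokens("这是一个测试。我们在测试中文。"))  # ['这是一个测试', '我们在测试中文']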
