Commit
deduplication: account for all punct chars in tokenization (#783)
* Fixing tokenizer not stripping "Ideographic Full Stop" (Chinese full stop)

* Bolder fix with Unicode category stripping

* Implementing the new implementation as a fallback

* Removing reformats

* Creating function for duplicate sample length code and adding tests for sample_tokens

* Removing reformats

* Removing redundant comments

* Removing sys dependency in deduplication and making translate table static
reinoldus authored Feb 8, 2025
1 parent fbdffe3 commit 139dfd6
Showing 2 changed files with 88 additions and 7 deletions.
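For context, a minimal sketch of the bug being fixed (not part of the commit, standard library only): str.strip(string.punctuation) knows only ASCII punctuation, so CJK marks such as the ideographic full stop survive, while a Unicode category lookup identifies them as punctuation.

import string
import unicodedata

text = "这是一个测试。"
# string.punctuation is ASCII-only, so the ideographic full stop survives:
print(text.strip(string.punctuation))  # 这是一个测试。
# a Unicode category check, as used by the new translation table, catches it:
print(unicodedata.category("。"))  # Po (punctuation, other)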
57 changes: 57 additions & 0 deletions tests/deduplication_tests.py
@@ -24,6 +24,24 @@ def test_hashes():
assert generate_hash_filename(content) == "42LNugG3Sc95646i"


def test_content_fingerprint():
"Test content fingerprint generation for different types of text"
# Test regular Latin text
text = "Hello world! This is a test string with some numbers 123."
fingerprint = trafilatura.deduplication.content_fingerprint(text)
assert fingerprint == "5efdce9f2b554683"

# Test Chinese text
chinese_text = "这是一个测试。我们在测试中文。"
chinese_fingerprint = trafilatura.deduplication.content_fingerprint(chinese_text)
assert chinese_fingerprint == "ff377edee6edfb78"

# Test mixed text
mixed_text = "Hello世界。This is混合文本!"
mixed_fingerprint = trafilatura.deduplication.content_fingerprint(mixed_text)
assert mixed_fingerprint == "24979dc6c8a26a5"


def test_simhash():
"Test similarity calculation based on Simhash class."
@@ -122,8 +140,47 @@ def test_dedup():
assert trafilatura.htmlprocessing.process_node(my_p, options) is None


def test_sample_tokens(monkeypatch):
"Test token sampling functions including fallback for non-latin text"

call_counter = {'fallback': 0, 'main': 0}
original_fallback = trafilatura.deduplication.sample_tokens_fallback

def spy_fallback(*args, **kwargs):
call_counter['fallback'] += 1
return original_fallback(*args, **kwargs)

monkeypatch.setattr(trafilatura.deduplication, 'sample_tokens_fallback', spy_fallback)

# Test regular text
text = "Hello world! This is a test string with some numbers 123."
tokens = trafilatura.deduplication.sample_tokens(text)
assert len(tokens) > 0
assert "Hello" in tokens
assert "world" in tokens
assert "123" in tokens
assert call_counter['fallback'] == 0, "Fallback shouldn't be called for Latin character text"

# Test Chinese text with Chinese punctuation
chinese_text = "这是一个测试。我们在测试中文。"
tokens = trafilatura.deduplication.sample_tokens(chinese_text)
assert len(tokens) == 2
assert "这是一个测试" in tokens
assert "我们在测试中文" in tokens
assert call_counter['fallback'] == 1, "Fallback should be called for Chinese text"

# Test mixed text using the default sample tokens method
mixed_text = "Hello世界。This is混合文本!"
tokens = trafilatura.deduplication.sample_tokens(mixed_text)
assert len(tokens) == 1
assert 'is混合文本' in tokens
assert call_counter['fallback'] == 1, "Fallback shouldn't be called due to blank"


if __name__ == "__main__":
test_hashes()
test_simhash()
test_lrucache()
test_dedup()
    test_sample_tokens(pytest.MonkeyPatch())  # the test needs the fixture; instantiated directly (pytest >= 6.2, assuming pytest is imported)
test_content_fingerprint()
38 changes: 31 additions & 7 deletions trafilatura/deduplication.py
@@ -5,24 +5,25 @@

import re
import string
import unicodedata

from difflib import SequenceMatcher
from functools import lru_cache
from hashlib import blake2b
from operator import add
from threading import RLock
from typing import Any, Dict, List, Optional, Union

from lxml.etree import _Element

from .settings import LRU_SIZE
from .utils import trim


STRIP_EXTENSION = re.compile(r"\.[^/?#]{2,63}$")

BIN_COUNT_FUNC = getattr(int, "bit_count", lambda x: bin(x).count("1"))

# Static translation table mapping every Unicode punctuation character
# (categories starting with "P") to a space, built once at import time,
# e.g. "Hello世界。".translate(PUNCT_TBL) == "Hello世界 "
PUNCT_TBL = str.maketrans({i: ' ' for i in range(0x110000) if unicodedata.category(chr(i)).startswith('P')})


@lru_cache(maxsize=1024)
def is_similar_domain(reference: str, new_string: str, threshold: float = 0.5) -> bool:
@@ -32,6 +33,28 @@ def is_similar_domain(reference: str, new_string: str, threshold: float = 0.5) -> bool:
return SequenceMatcher(None, reference, new_string).ratio() >= threshold


def _get_sample_by_length(tokens: List[str], target_length: int) -> List[str]:
    """Sample tokens by length, relaxing the minimum token length step by step
    (> 4 chars down to > 0) until the sample holds at least half of target_length."""
    sample = []
    for i in range(4, -1, -1):
        sample = [t for t in tokens if len(t) > i]
        if len(sample) >= target_length / 2:
            return sample
    return sample


def sample_tokens_fallback(inputstring: str, length: int = 64) -> List[str]:
    """Fallback used when the primary sample_tokens function produces an empty
    token list. This is mostly relevant for languages such as Mandarin which
    use non-Latin punctuation, e.g. 。"""
    # Replace all punctuation with spaces using the static translation table
    clean_text = inputstring.translate(PUNCT_TBL)
    tokens = [t for t in clean_text.split() if t.isalnum()]
    return _get_sample_by_length(tokens, length)


def sample_tokens(inputstring: str, length: int = 64) -> List[str]:
"""Split input into list of tokens and adjust length threshold to make sure
there is enough data."""
@@ -40,11 +63,12 @@ def sample_tokens(inputstring: str, length: int = 64) -> List[str]:
        token = token.strip(string.punctuation)
        if token.isalnum():
            tokens.append(token)
-    sample = []
-    for i in range(4, -1, -1):
-        sample = [t for t in tokens if len(t) > i]
-        if len(sample) >= length / 2:
-            return sample

    sample = _get_sample_by_length(tokens, length)

    if len(sample) == 0:
        return sample_tokens_fallback(inputstring, length)

    return sample


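As a usage sketch (not part of the commit), the new fallback path can be exercised as follows; the expected values match the assertions in the test suite above.

from trafilatura.deduplication import sample_tokens

# Latin-script text is served by the primary whitespace tokenizer:
print(sample_tokens("Hello world! This is a test string with some numbers 123."))
# Chinese text contains no spaces: the primary pass yields no tokens,
# so the Unicode-category fallback splits on 。 instead:
print(sample_tokens("这是一个测试。我们在测试中文。"))  # ['这是一个测试', '我们在测试中文']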
