Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixing tokenizer not stripping "Ideographic Full Stop" (Chinese full stop) #783

Merged
merged 8 commits into from
Feb 8, 2025
57 changes: 57 additions & 0 deletions tests/deduplication_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,24 @@ def test_hashes():
assert generate_hash_filename(content) == "42LNugG3Sc95646i"


def test_content_fingerprint():
    "Test content fingerprint generation for different types of text"
    # Latin-script text: handled by the primary whitespace tokenizer
    text = "Hello world! This is a test string with some numbers 123."
    fingerprint = trafilatura.deduplication.content_fingerprint(text)
    assert fingerprint == "5efdce9f2b554683"

    # Chinese text with ideographic full stops (。): exercises the
    # punctuation-stripping fallback tokenizer
    chinese_text = "这是一个测试。我们在测试中文。"
    chinese_fingerprint = trafilatura.deduplication.content_fingerprint(chinese_text)
    assert chinese_fingerprint == "ff377edee6edfb78"

    # Mixed Latin/CJK text
    mixed_text = "Hello世界。This is混合文本!"
    mixed_fingerprint = trafilatura.deduplication.content_fingerprint(mixed_text)
    assert mixed_fingerprint == "24979dc6c8a26a5"


def test_simhash():
"Test similarity calculation based on Simhash class."
Expand Down Expand Up @@ -122,8 +140,47 @@ def test_dedup():
assert trafilatura.htmlprocessing.process_node(my_p, options) is None


def test_sample_tokens(monkeypatch):
    "Test token sampling functions including fallback for non-latin text"

    # Record every invocation of the fallback tokenizer while delegating
    # to the real implementation, so each assertion can check whether the
    # fallback path was taken.
    fallback_calls = []
    real_fallback = trafilatura.deduplication.sample_tokens_fallback

    def counting_fallback(*args, **kwargs):
        fallback_calls.append(args)
        return real_fallback(*args, **kwargs)

    monkeypatch.setattr(trafilatura.deduplication, 'sample_tokens_fallback', counting_fallback)

    # Plain Latin text: the primary tokenizer suffices
    latin_text = "Hello world! This is a test string with some numbers 123."
    tokens = trafilatura.deduplication.sample_tokens(latin_text)
    assert len(tokens) > 0
    for expected in ("Hello", "world", "123"):
        assert expected in tokens
    assert len(fallback_calls) == 0, "Fallback shouldn't be called for Latin character text"

    # Chinese text with Chinese punctuation triggers the fallback path
    chinese_text = "这是一个测试。我们在测试中文。"
    tokens = trafilatura.deduplication.sample_tokens(chinese_text)
    assert len(tokens) == 2
    assert "这是一个测试" in tokens
    assert "我们在测试中文" in tokens
    assert len(fallback_calls) == 1, "Fallback should be called for Chinese text"

    # Mixed text: whitespace is present, so the primary tokenizer still yields
    # tokens and the fallback counter stays unchanged
    mixed_text = "Hello世界。This is混合文本!"
    tokens = trafilatura.deduplication.sample_tokens(mixed_text)
    assert len(tokens) == 1
    assert 'is混合文本' in tokens
    assert len(fallback_calls) == 1, "Fallback shouldn't be called due to blank"


if __name__ == "__main__":
    test_hashes()
    test_simhash()
    test_lrucache()
    test_dedup()
    # Bug fix: test_sample_tokens() requires the pytest `monkeypatch` fixture,
    # so calling it without arguments raised TypeError when this file was run
    # as a plain script. Build a MonkeyPatch instance manually instead.
    import pytest

    monkeypatch = pytest.MonkeyPatch()
    try:
        test_sample_tokens(monkeypatch)
    finally:
        monkeypatch.undo()
    test_content_fingerprint()
40 changes: 33 additions & 7 deletions trafilatura/deduplication.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,25 +4,28 @@
# 3.11+: from typing import Self

import re
import sys
import string

from difflib import SequenceMatcher
from functools import lru_cache
from hashlib import blake2b
from operator import add
from threading import RLock
from typing import Any, Dict, List, Optional, Union

import unicodedata
from lxml.etree import _Element

from .settings import LRU_SIZE
from .utils import trim


STRIP_EXTENSION = re.compile(r"\.[^/?#]{2,63}$")

BIN_COUNT_FUNC = getattr(int, "bit_count", lambda x: bin(x).count("1"))

# Translation table for str.translate(): maps every Unicode codepoint whose
# general category starts with "P" (all punctuation classes, including the
# ideographic full stop "。" U+3002) to a space. Used to strip non-Latin
# punctuation that string.punctuation does not cover.
# NOTE: upper bound is sys.maxunicode + 1 so the last codepoint is included
# (range() excludes its stop value).
PUNCT_TBL = dict.fromkeys(
    (i for i in range(sys.maxunicode + 1)
     if unicodedata.category(chr(i)).startswith("P")),
    ord(" "),
)


@lru_cache(maxsize=1024)
def is_similar_domain(reference: str, new_string: str, threshold: float = 0.5) -> bool:
Expand All @@ -32,6 +35,28 @@ def is_similar_domain(reference: str, new_string: str, threshold: float = 0.5) -
return SequenceMatcher(None, reference, new_string).ratio() >= threshold


def _get_sample_by_length(tokens: List[str], target_length: int) -> List[str]:
"""Helper function to get a sample of tokens based on length criteria."""
sample = []
for i in range(4, -1, -1):
sample = [t for t in tokens if len(t) > i]
if len(sample) >= target_length / 2:
return sample
return sample


def sample_tokens_fallback(inputstring: str, length: int = 64) -> List[str]:
    """
    Fallback tokenizer used when the primary sample_tokens function
    yields an empty token list. This is mostly relevant for languages
    such as Mandarin which use non-Latin punctuation, e.g. the
    ideographic full stop "。".
    """
    # Replace all punctuation with spaces using translation table
    clean_text = inputstring.translate(PUNCT_TBL)
    # keep only purely alphanumeric chunks after splitting on whitespace
    tokens = [t for t in clean_text.split() if t.isalnum()]
    return _get_sample_by_length(tokens, length)


def sample_tokens(inputstring: str, length: int = 64) -> List[str]:
"""Split input into list of tokens and adjust length threshold to make sure
there is enough data."""
Expand All @@ -40,11 +65,12 @@ def sample_tokens(inputstring: str, length: int = 64) -> List[str]:
token = token.strip(string.punctuation)
if token.isalnum():
tokens.append(token)
sample = []
for i in range(4, -1, -1):
sample = [t for t in tokens if len(t) > i]
if len(sample) >= length / 2:
return sample

sample = _get_sample_by_length(tokens, length)

if len(sample) == 0:
return sample_tokens_fallback(inputstring, length)

return sample


Expand Down
Loading