From 79dce4857914fead2ffe55eb787cad6d5cf14643 Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Tue, 31 Oct 2023 21:21:01 +0100
Subject: [PATCH] :bug: Regression on some detection cases showcased in the
 documentation (#371) (#378), and added a noise (md) probe that identifies
 malformed Arabic representation due to the presence of letters in isolated
 form (credit to my wife, thanks!)

---
 CHANGELOG.md                 |  3 +++
 charset_normalizer/md.py     | 33 ++++++++++++++++++++++++++++++++-
 charset_normalizer/utils.py  | 22 ++++++++++++++++++++++
 tests/test_base_detection.py |  8 ++++++++
 tests/test_large_payload.py  |  3 +++
 tests/test_mess_detection.py |  2 +-
 6 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ffbccaf3..de66da4f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ### Fixed
 - Unintentional memory usage regression when using large payload that match several encoding (#376)
+- Regression on some detection cases showcased in the documentation (#371)
 
+### Added
+- Noise (md) probe that identifies malformed Arabic representation due to the presence of letters in isolated form (credit to my wife)
 
 ## [3.3.1](https://github.com/Ousret/charset_normalizer/compare/3.3.0...3.3.1) (2023-10-22)
 
diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py
index 103dfdd6..77897aae 100644
--- a/charset_normalizer/md.py
+++ b/charset_normalizer/md.py
@@ -9,6 +9,8 @@
 )
 from .utils import (
     is_accentuated,
+    is_arabic,
+    is_arabic_isolated_form,
     is_case_variable,
     is_cjk,
     is_emoticon,
@@ -127,8 +129,9 @@ def reset(self) -> None:  # pragma: no cover
 
     @property
     def ratio(self) -> float:
-        if self._character_count == 0 or self._character_count < 8:
+        if self._character_count < 8:
             return 0.0
+
         ratio_of_accentuation: float = self._accentuated_count / self._character_count
         return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
 
@@ -455,6 +458,34 @@ def ratio(self) -> float:
         return self._successive_upper_lower_count_final / self._character_count
 
 
+class ArabicIsolatedFormPlugin(MessDetectorPlugin):
+    def __init__(self) -> None:
+        self._character_count: int = 0
+        self._isolated_form_count: int = 0
+
+    def reset(self) -> None:  # pragma: no cover
+        self._character_count = 0
+        self._isolated_form_count = 0
+
+    def eligible(self, character: str) -> bool:
+        return is_arabic(character)
+
+    def feed(self, character: str) -> None:
+        self._character_count += 1
+
+        if is_arabic_isolated_form(character):
+            self._isolated_form_count += 1
+
+    @property
+    def ratio(self) -> float:
+        if self._character_count < 8:
+            return 0.0
+
+        isolated_form_usage: float = self._isolated_form_count / self._character_count
+
+        return isolated_form_usage
+
+
 @lru_cache(maxsize=1024)
 def is_suspiciously_successive_range(
     unicode_range_a: Optional[str], unicode_range_b: Optional[str]
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index b5ee8459..e5cbbf4c 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -32,6 +32,8 @@ def is_accentuated(character: str) -> bool:
         or "WITH DIAERESIS" in description
         or "WITH CIRCUMFLEX" in description
         or "WITH TILDE" in description
+        or "WITH MACRON" in description
+        or "WITH RING ABOVE" in description
     )
 
 
@@ -174,6 +176,26 @@ def is_thai(character: str) -> bool:
     return "THAI" in character_name
 
 
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_arabic(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:
+        return False
+
+    return "ARABIC" in character_name
+
+
+@lru_cache(maxsize=UTF8_MAXIMAL_ALLOCATION)
+def is_arabic_isolated_form(character: str) -> bool:
+    try:
+        character_name = unicodedata.name(character)
+    except ValueError:
+        return False
+
+    return "ARABIC" in character_name and "ISOLATED FORM" in character_name
+
+
 @lru_cache(maxsize=len(UNICODE_RANGES_COMBINED))
 def is_unicode_range_secondary(range_name: str) -> bool:
     return any(keyword in range_name for keyword in UNICODE_SECONDARY_RANGE_KEYWORD)
diff --git a/tests/test_base_detection.py b/tests/test_base_detection.py
index cb80003a..3180a500 100644
--- a/tests/test_base_detection.py
+++ b/tests/test_base_detection.py
@@ -115,3 +115,11 @@ def test_alphabets_property():
     assert "Basic Latin" in best_guess.alphabets
     assert "Emoticons range(Emoji)" in best_guess.alphabets
     assert best_guess.alphabets.count("Basic Latin") == 1
+
+
+def test_doc_example_short_cp1251():
+    best_guess = from_bytes(
+        'Bсеки човек има право на образование.'.encode('cp1251')
+    ).best()
+
+    assert best_guess.encoding == "cp1251"
diff --git a/tests/test_large_payload.py b/tests/test_large_payload.py
index b3893e3a..04526d38 100644
--- a/tests/test_large_payload.py
+++ b/tests/test_large_payload.py
@@ -12,6 +12,7 @@ def test_large_payload_u8_sig_basic_entry():
     assert best_guess.encoding == "utf_8", "Large U8 payload case detection wrongly detected!"
     assert best_guess.bom is True, "SIG/BOM property should be True"
     assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
+    assert best_guess._string is not None, "str should be decoded before direct access (sig available)"
 
 
 def test_large_payload_ascii_basic_entry():
@@ -22,6 +23,7 @@ def test_large_payload_ascii_basic_entry():
     assert best_guess.encoding == "ascii", "Large ASCII payload case detection wrongly detected!"
     assert best_guess.bom is False, "SIG/BOM property should be False"
     assert len(best_guess.raw) == len(payload), "Large payload should remain untouched when accessed through .raw"
+    assert best_guess._string is None, "str should not be decoded until direct access"
 
 
 def test_misleading_large_sequence():
@@ -32,5 +34,6 @@ def test_misleading_large_sequence():
     assert len(guesses) > 0
     match = guesses.best()
     assert match is not None
+    assert match._string is not None, "str should be cached as only match"
     assert match.encoding == 'utf_8'
     assert str(match) is not None
diff --git a/tests/test_mess_detection.py b/tests/test_mess_detection.py
index 46eed879..d70fee45 100644
--- a/tests/test_mess_detection.py
+++ b/tests/test_mess_detection.py
@@ -12,7 +12,7 @@
         ("´Á¥½³ø§i -- ±i®Ìºû, ³¯·Ø©v", 0.5, 1.),
         ("ïstanbul, T■rkiye'nin en kalabal»k, iktisadi ve k■lt■rel aÓ»dan en —nemli", 0.1, 0.5),
         ("Parce que Óa, c'est la vÕritable histoire de la rencontre avec votre Tante Robin.", 0.01, 0.5),
-        ("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 2.0),
+        ("""ØĢØŠØģاØĶŲ„ Ų„Ųˆ ØĢŲ† اŲ„Ų†Ø§Øģ ŲŠŲˆŲ… Ų…ا ØģŲˆŲŲŠØŠØģاØĶŲ„ŲˆŲ†ØŒ ØŊØđŲ†Ø§ Ų†ØģŲ…Øđ ØđŲ† (ŲØąŲˆØŊŲˆ) ŲˆØ§Ų„ØŪا؊Ų…""", 0.8, 3.0),
         ("""ÇáÚŞáíÉ , ÇáÊäæíã ÇáãÛäÇØíÓí æ / Ãæ ÇáÇŞÊÑÇÍ""", 0.8, 2.5),
         ("""hishamkoc@yahoo.com ุชุฑุฌู…ู€ู€ุฉ ู‡ู€ุดู€ู€ู€ุงู… ุงู„ู€ู‚ู€ู€ู€ู€ู„ุงูRadoZ ุชู€ู€ู€ุนู€ู€ู€ู€ุฏูŠู€ู€ู„ ุงู„ู€ู€ู€ุชู€ู€ู€ู€ูˆู‚ู€ู€ูŠู€ู€ู€ู€ุช ู…ู€ู€ู€ู† ู‚ู€ู€ุจู€ู€ู„""", 0.5, 2.0)
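
Reviewer note (not part of the patch): a minimal sketch of what the new probe reacts to, assuming this branch of charset_normalizer is installed. It calls mess_ratio from charset_normalizer.md, which aggregates every MessDetectorPlugin subclass, including the new ArabicIsolatedFormPlugin; the sample strings below are illustrative, not taken from the test suite.

    # Compare noise scores for ordinary Arabic letters vs. the same idea written
    # with Unicode "ISOLATED FORM" presentation characters, the malformation the
    # new probe targets.
    import unicodedata

    from charset_normalizer.md import mess_ratio

    # Build a run of isolated-form letters from their Unicode names, mirroring the
    # name-based check done by is_arabic_isolated_form(). Eight characters are used
    # so the plugin's minimum character count is met.
    isolated = unicodedata.lookup("ARABIC LETTER ALEF ISOLATED FORM") * 4
    isolated += unicodedata.lookup("ARABIC LETTER LAM ISOLATED FORM") * 4

    regular = "السلام عليكم"  # ordinary Arabic letters from the U+0600 block

    # maximum_threshold=1.0 disables the early exit so the two scores are comparable.
    print(mess_ratio(isolated, maximum_threshold=1.0))  # expected to be noticeably higher...
    print(mess_ratio(regular, maximum_threshold=1.0))   # ...than the score for well-formed text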