From 70a2367b8a4dd8ca260d4a40ad2f18819803a939 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Thu, 12 Sep 2019 18:55:21 +0200 Subject: [PATCH 01/10] Added --threshold option to CLI Possibility to tweak maximum amount of chaos allowed from CLI --- charset_normalizer/cli/normalizer.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py index 5af8f01e..3d3fb9e7 100644 --- a/charset_normalizer/cli/normalizer.py +++ b/charset_normalizer/cli/normalizer.py @@ -1,9 +1,10 @@ import argparse import sys -from charset_normalizer import CharsetNormalizerMatches from prettytable import PrettyTable +from charset_normalizer import CharsetNormalizerMatches + def query_yes_no(question, default="yes"): """Ask a yes/no question via input() and return their answer. @@ -56,6 +57,8 @@ def cli_detect(argv=None): help='Replace file when trying to normalize it instead of creating a new one.') parser.add_argument('--force', action="store_true", default=False, dest='force', help='Replace file without asking if you are sure, use this flag with caution.') + parser.add_argument('--threshold', action="store", default=0.2, type=float, dest='threshold', + help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.") args = parser.parse_args(argv) @@ -72,10 +75,15 @@ def cli_detect(argv=None): print('Use --force in addition of --replace only.', file=sys.stderr) return 1 + if args.threshold < 0. or args.threshold > 1.: + print('--threshold VALUE should be between 0. AND 1.') + return 1 + for my_file in args.file: matches = CharsetNormalizerMatches.from_fp( - my_file + my_file, + threshold=args.threshold ) if len(matches) == 0: From 0b8c445b0612261ca1a6261832313d9ad429a598 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Thu, 12 Sep 2019 18:56:45 +0200 Subject: [PATCH 02/10] Add methods to UnicodeRangeIdentify --- charset_normalizer/unicode.py | 125 ++++++++++++++++++++++++++++++++-- 1 file changed, 118 insertions(+), 7 deletions(-) diff --git a/charset_normalizer/unicode.py b/charset_normalizer/unicode.py index fd70c306..69a7d46d 100644 --- a/charset_normalizer/unicode.py +++ b/charset_normalizer/unicode.py @@ -1,7 +1,8 @@ # coding: utf-8 -from charset_normalizer.constant import UNICODE_RANGES_ZIP, UNICODE_RANGES_NAMES from functools import lru_cache +from charset_normalizer.constant import UNICODE_RANGES_ZIP, UNICODE_RANGES_NAMES, UNICODE_SECONDARY_RANGE_KEYWORD + class UnicodeRangeIdentify: @@ -52,7 +53,7 @@ def is_latin(letter): :param str letter: :return: """ - return 'Latin' in UnicodeRangeIdentify.find_letter_type(letter) + return 'Latin' in (UnicodeRangeIdentify.find_letter_type(letter) or '') @staticmethod @lru_cache(maxsize=8192) @@ -65,9 +66,10 @@ def is_punc(letter): if letter.isspace(): return True r_name = UnicodeRangeIdentify.find_letter_type(letter) - return "Punctuation" in r_name or \ - 'Forms' in r_name or \ - letter in 'º¯—–‒‐⁃«‹?!;.:^$*»£¹¿~ª؟©±¡{}[]|¼½¾⅕⅙⅛™℠‼⁇❝❞¶⁋√↑↓�' + return r_name is not None and \ + ("Punctuation" in r_name or + 'Forms' in r_name or + letter in 'º¯—–‒‐⁃«‹?!;.:^$*»£¹¿~ª؟©±¡{}[]|¼½¾⅕⅙⅛™℠‼⁇❝❞¶⁋√↑↓�¤`') @staticmethod @lru_cache(maxsize=8192) @@ -77,7 +79,7 @@ def is_cjk(letter): :param str letter: :return: """ - return 'CJK' in UnicodeRangeIdentify.find_letter_type(letter) + return 'CJK' in (UnicodeRangeIdentify.find_letter_type(letter) or '') @staticmethod def unravel_suspicious_ranges(str_len, encountered_unicode_range_occurrences): @@ -110,13 
+112,15 @@ def is_suspiciously_successive_range(range_name_a, range_name_b): :param str range_name_b: :return: """ + if range_name_a is None or range_name_b is None: + return True dec_range_name_a, dec_range_name_b = range_name_a.split(), range_name_b.split() if range_name_a == range_name_b: return False - if 'Latin' in range_name_a or 'Latin' in range_name_b: + if 'Latin' in range_name_a and 'Latin' in range_name_b: return False for el in dec_range_name_a: @@ -130,3 +134,110 @@ def is_suspiciously_successive_range(range_name_a, range_name_b): return False return True + + @staticmethod + def classification(word): + """ + :param str word: + :return: + """ + cla_ = dict() + + for el in word: + if el.isspace(): + raise IOError('Classification should not be invoked with sentences !') + u_name = UnicodeRangeIdentify.find_letter_type(el) + if u_name is None: + u_name = 'Unknown' + if u_name not in cla_: + cla_[u_name] = 0 + cla_[u_name] += 1 + + return cla_ + + @staticmethod + @lru_cache(maxsize=512) + def is_range_secondary(u_range): + """ + :param str u_range: + :return: + """ + try: + UnicodeRangeIdentify.get_range_id(u_range) + except ValueError: + return True + + for keyword in UNICODE_SECONDARY_RANGE_KEYWORD: + if keyword in u_range: + return True + + return False + + @staticmethod + def part_punc(word): + """ + Determine how much of the word is composed of punc sign + :param str word: + :return: + """ + return [UnicodeRangeIdentify.is_punc(el) for el in word].count(True) / len(word) + + @staticmethod + def part_accent(word): + """ + Determine how much of the word is composed of accentuated letter + :param word: + :return: + """ + return [UnicodeRangeIdentify.is_accentuated(el) for el in word].count(True) / len(word) + + @staticmethod + def word_to_range_list(word): + """ + + :param str word: + :return: + """ + return [UnicodeRangeIdentify.find_letter_type(el) for el in word] + + @staticmethod + def word_to_range_continue(word): + """ + + :param str word: + :return: + """ + l_ = list() + + for el in word: + u_name = UnicodeRangeIdentify.find_letter_type(el) + if len(l_) == 0: + l_.append( + ( + u_name, + 1 + ) + ) + else: + if UnicodeRangeIdentify.is_suspiciously_successive_range(u_name, l_[-1][0]) is True: + l_.append( + ( + u_name, + 1 + ) + ) + else: + l_[-1] = ( + u_name, + l_[-1][1]+1 + ) + + return l_ + + @staticmethod + def part_lonely_range(word): + """ + :param str word: + :return: + """ + return [u_occ_cont == 1 for u_name, u_occ_cont in UnicodeRangeIdentify.word_to_range_continue(word)].count(True) / len(word) From 3d748047209ebb27a2153ebcedffb79f5753a35b Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Thu, 12 Sep 2019 18:57:26 +0200 Subject: [PATCH 03/10] Add probe on words Support class to ProbeChaos --- charset_normalizer/__init__.py | 1 + charset_normalizer/probe_words.py | 69 +++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+) create mode 100644 charset_normalizer/probe_words.py diff --git a/charset_normalizer/__init__.py b/charset_normalizer/__init__.py index 830e790b..8bf9349a 100644 --- a/charset_normalizer/__init__.py +++ b/charset_normalizer/__init__.py @@ -3,4 +3,5 @@ from charset_normalizer.unicode import UnicodeRangeIdentify from charset_normalizer.probe_chaos import ProbeChaos from charset_normalizer.probe_coherence import ProbeCoherence +from charset_normalizer.probe_words import ProbeWords from charset_normalizer.legacy import detect diff --git a/charset_normalizer/probe_words.py b/charset_normalizer/probe_words.py new file mode 100644 index 
00000000..b3a2cc06 --- /dev/null +++ b/charset_normalizer/probe_words.py @@ -0,0 +1,69 @@ +from functools import lru_cache + +from charset_normalizer.probe_coherence import HashableCounter +from charset_normalizer.unicode import UnicodeRangeIdentify + + +@lru_cache(maxsize=8192) +class ProbeWords: + + def __init__(self, w_counter): + """ + :param HashableCounter w_counter: + """ + self._w_counter = w_counter + + self._words = list() + self._nb_words = 0 + + self._suspicious = list() + + if w_counter is not None: + self._words = list(w_counter.keys()) + self._nb_words = len(self._words) + + self._probe() + + def __add__(self, other): + """ + + :param ProbeWords other: + :return: + """ + k_ = ProbeWords(None) + + k_._nb_words = self._nb_words + other._nb_words + k_._suspicious = self._suspicious + other._suspicious + + return k_ + + def _probe(self): + + for el in self._words: + + w_len = len(el) + classification = UnicodeRangeIdentify.classification(el) + + c_ = 0 + + is_latin_based = all(['Latin' in el for el in list(classification.keys())]) + + if len(classification.keys()) > 1: + for u_name, u_occ in classification.items(): + + if UnicodeRangeIdentify.is_range_secondary(u_name) is True: + c_ += u_occ + + if (not is_latin_based and c_ > int(w_len / 4)) \ + or (is_latin_based and c_ > int(w_len / 2)) \ + or (UnicodeRangeIdentify.part_punc(el) > 0.4 and len(classification.keys()) > 1) \ + or (not is_latin_based and UnicodeRangeIdentify.part_accent(el) > 0.4) \ + or (not is_latin_based and len(el) > 10 and UnicodeRangeIdentify.part_lonely_range(el) > 0.3): + self._suspicious.append(el) + else: + pass + + @property + def ratio(self): + return len(self._suspicious) / self._nb_words if self._nb_words > 5 else 0. + From fe5b6df3d4e54803baf3a1e14b77003a53d217ec Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Thu, 12 Sep 2019 18:57:49 +0200 Subject: [PATCH 04/10] Adjust Chaos Ratio with Probe Words --- charset_normalizer/probe_chaos.py | 71 ++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 6 deletions(-) diff --git a/charset_normalizer/probe_chaos.py b/charset_normalizer/probe_chaos.py index 4ec0fa79..7f5d87c1 100644 --- a/charset_normalizer/probe_chaos.py +++ b/charset_normalizer/probe_chaos.py @@ -1,14 +1,15 @@ # coding: utf-8 import re +from functools import lru_cache from dragonmapper.hanzi import MIXED, BOTH, UNKNOWN from dragonmapper.hanzi import identify as s_identify from zhon.hanzi import sentence as cjc_sentence_re +from charset_normalizer.probe_coherence import HashableCounter +from charset_normalizer.probe_words import ProbeWords from charset_normalizer.unicode import UnicodeRangeIdentify -from functools import lru_cache - @lru_cache(maxsize=8192) class ProbeChaos: @@ -48,14 +49,62 @@ def __init__(self, string, giveup_threshold=0.09): self.total_upper_accent_encountered_inner = 0 self.total_unaccented_letter_encountered = 0 + self._probe_word = ProbeWords(HashableCounter(self._string.split())) + self.gave_up = False if len(self._string) >= 10: self._probe() + def __add__(self, other): + """ + :param ProbeChaos other: + :return: + """ + k_ = ProbeChaos('', self._threshold) + + k_.successive_upper_lower = self.successive_upper_lower + other.successive_upper_lower + k_.successive_accent = self.successive_accent + other.successive_accent + k_.successive_different_unicode_range = self.successive_different_unicode_range + other.successive_different_unicode_range + + for el in self.encountered_unicode_range: + k_.encountered_unicode_range.add(el) + + for el in 
other.encountered_unicode_range: + k_.encountered_unicode_range.add(el) + + k_.encountered_punc_sign = self.encountered_punc_sign + other.encountered_punc_sign + k_.unprintable = self.unprintable + other.unprintable + k_.encountered_white_space = self.encountered_white_space + other.encountered_white_space + k_.not_encountered_white_space = self.not_encountered_white_space + other.not_encountered_white_space + + for u_name, u_occ in self.encountered_unicode_range_occurrences.items(): + if u_name not in k_.encountered_unicode_range_occurrences.keys(): + k_.encountered_unicode_range_occurrences[u_name] = 0 + k_.encountered_unicode_range_occurrences[u_name] += u_occ + + for u_name, u_occ in other.encountered_unicode_range_occurrences.items(): + if u_name not in k_.encountered_unicode_range_occurrences.keys(): + k_.encountered_unicode_range_occurrences[u_name] = 0 + k_.encountered_unicode_range_occurrences[u_name] += u_occ + + k_.not_encountered_white_space_reset = self.not_encountered_white_space_reset + other.not_encountered_white_space_reset + k_.total_letter_encountered = self.total_letter_encountered + other.total_letter_encountered + k_.total_lower_letter_encountered = self.total_lower_letter_encountered + other.total_lower_letter_encountered + k_.total_upper_accent_encountered = self.total_upper_accent_encountered + other.total_upper_accent_encountered + k_.total_upper_accent_encountered_inner = self.total_upper_accent_encountered_inner + other.total_upper_accent_encountered_inner + k_.total_unaccented_letter_encountered = self.total_unaccented_letter_encountered + other.total_unaccented_letter_encountered + + k_._probe_word = self._probe_word + other._probe_word + + k_._string = self._string + other._string + + return k_ + def _probe(self): c__ = False + upper_lower_m = False for c, i_ in zip(self._string, range(0, len(self._string))): @@ -133,7 +182,13 @@ def _probe(self): continue if (is_lower and self.previous_printable_letter.isupper()) or (is_upper and self.previous_printable_letter.islower()): - self.successive_upper_lower += 1 + if not upper_lower_m: + upper_lower_m = True + else: + self.successive_upper_lower += 1 + upper_lower_m = False + else: + upper_lower_m = False if is_latin: self.previous_encountered_unicode_range = u_name @@ -154,6 +209,8 @@ def _probe(self): @staticmethod def _unravel_cjk_suspicious_chinese(string, encountered_unicode_range_occurrences): + if len(string) <= 10: + return UNKNOWN encountered_unicode_range = encountered_unicode_range_occurrences.keys() @@ -161,8 +218,10 @@ def _unravel_cjk_suspicious_chinese(string, encountered_unicode_range_occurrence i_ = s_identify(string) if i_ in [MIXED, BOTH]: return encountered_unicode_range_occurrences['CJK Unified Ideographs'] - elif i_ != UNKNOWN and len(re.findall(cjc_sentence_re, string)) == 0: - return encountered_unicode_range_occurrences['CJK Unified Ideographs'] + elif i_ != UNKNOWN and len(re.findall(cjc_sentence_re, string)) > 0: + return -encountered_unicode_range_occurrences['CJK Unified Ideographs'] + elif i_ != UNKNOWN: + return int(encountered_unicode_range_occurrences['CJK Unified Ideographs']*0.3) return UNKNOWN @@ -178,4 +237,4 @@ def ratio(self): r_ = self.total_upper_accent_encountered if self.total_letter_encountered > 0 and self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5 else 0 z_ = UnicodeRangeIdentify.unravel_suspicious_ranges(len(self._string), self.encountered_unicode_range_occurrences) p_ = self.encountered_punc_sign if self.encountered_punc_sign / 
len(self._string) > 0.2 else 0 - return (r_ + p_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string) # + len(self.encountered_unicode_range)-1 + return ((r_ + p_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string)) + self._probe_word.ratio # + len(self.encountered_unicode_range)-1 From cca713c1b60b0b27e1012a0d7c3120abf8726273 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Thu, 12 Sep 2019 18:58:41 +0200 Subject: [PATCH 05/10] Change ratio calc formula for coherence --- charset_normalizer/probe_coherence.py | 33 ++++++++++++++++++--------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/charset_normalizer/probe_coherence.py b/charset_normalizer/probe_coherence.py index 0bdddfa4..8faeb4c9 100644 --- a/charset_normalizer/probe_coherence.py +++ b/charset_normalizer/probe_coherence.py @@ -1,13 +1,10 @@ # coding: utf-8 -import statistics -from collections import Counter - -from cached_property import cached_property - import json +from collections import Counter +from functools import lru_cache from os.path import dirname, realpath, exists -from functools import lru_cache +from cached_property import cached_property class HashableCounter(Counter): @@ -56,6 +53,15 @@ def most_likely(self): k_ = [self.index_of_rates[str(el[0])][str(el[1])] for el in sorted(p_, key=lambda tup: sum(tup))] return [item for sublist in k_ for item in sublist][:3] + def ratio_of(self, language): + """ + :param str language: + :return: + """ + if language.capitalize() not in self.rank_per_lang: + return 1. + return self.rank_per_lang[language.capitalize()] + @cached_property def ratio(self): """ @@ -65,11 +71,16 @@ def ratio(self): :return: Ratio as floating number :rtype: float """ - p_ = [(float(el), float(sorted(self.index_of_rates[str(el)].keys())[0])) for el in - sorted([float(el) for el in list(self.index_of_rates.keys())])] - - return statistics.mean([sum(el) for el in p_[:2]]) if len( - self.rank_per_lang.keys()) > 0 else 1. + # p_ = [(float(el), float(sorted(self.index_of_rates[str(el)].keys())[0])) for el in + # sorted([float(el) for el in list(self.index_of_rates.keys())])] + # + # return statistics.mean([sum(el) for el in p_[:2]]) if len( + # self.rank_per_lang.keys()) > 0 else 1. + languages = self.most_likely + if len(languages) == 0: + return 1. + ratios = [self.rank_per_lang[lg] for lg in languages] + return sum(ratios) def _probe(self): From 1d7e874a61710b0eb1905ba6043827097db81857 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Thu, 12 Sep 2019 18:59:40 +0200 Subject: [PATCH 06/10] from_bytes method revised give up on dict() working. 
--- charset_normalizer/normalizer.py | 124 ++++++++++++++----------------- 1 file changed, 56 insertions(+), 68 deletions(-) diff --git a/charset_normalizer/normalizer.py b/charset_normalizer/normalizer.py index 4dfb57fb..c4c80b2f 100644 --- a/charset_normalizer/normalizer.py +++ b/charset_normalizer/normalizer.py @@ -1,17 +1,16 @@ # coding: utf-8 +import collections import re import statistics from encodings.aliases import aliases from os.path import basename, splitext -import collections +from platform import python_version_tuple from cached_property import cached_property -from charset_normalizer.probe_coherence import ProbeCoherence, HashableCounter -from charset_normalizer.probe_chaos import ProbeChaos from charset_normalizer.constant import BYTE_ORDER_MARK - -from platform import python_version_tuple +from charset_normalizer.probe_chaos import ProbeChaos +from charset_normalizer.probe_coherence import ProbeCoherence, HashableCounter class CharsetNormalizerMatch: @@ -93,8 +92,13 @@ def language(self): :return: Most used/probable language in text :rtype: str """ - languages = ProbeCoherence(self.char_counter).most_likely - return languages[0] if len(languages) > 0 else ('English' if len(self.alphabets) == 1 and self.alphabets[0] == 'Basic Latin' else 'Unknown') + probe_coherence = ProbeCoherence(self.char_counter) + languages = probe_coherence.most_likely + + if len(languages) == 0: + return 'English' if len(self.alphabets) == 1 and self.alphabets[0] == 'Basic Latin' else 'Unknown' + + return languages[0] @cached_property def chaos(self): @@ -194,7 +198,7 @@ def __len__(self): return len(self._matches) @staticmethod - def normalize(path, steps=10, chunk_size=512, threshold=0.09): + def normalize(path, steps=10, chunk_size=512, threshold=0.20): """ :param str path: :param int steps: @@ -226,7 +230,7 @@ def normalize(path, steps=10, chunk_size=512, threshold=0.09): return b_ @staticmethod - def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09): + def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20): """ Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported charset encoding. @@ -244,7 +248,7 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09): supported = sorted(aliases.items()) if py_need_sort else aliases.items() tested = set() - working = dict() + matches = list() maximum_length = len(sequences) @@ -286,70 +290,54 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09): except LookupError: continue - chaos_measures = list() - ranges_encountered_t = dict() - decoded_len_t = 0 - - successive_chaos_zero = 0 r_ = range( 0 if bom_available is False else bom_len, maximum_length, int(maximum_length / steps) ) - p_ = len(r_) - - for i in r_: - - chunk = sequences[i:i + chunk_size] - decoded = str(chunk, encoding=p, errors='ignore') - - probe_chaos = ProbeChaos(decoded, giveup_threshold=threshold) - chaos_measure, ranges_encountered = probe_chaos.ratio, probe_chaos.encountered_unicode_range_occurrences - - for k, e in ranges_encountered.items(): - if k not in ranges_encountered_t.keys(): - ranges_encountered_t[k] = 0 - ranges_encountered_t[k] += e - - if bom_available is True: - if chaos_measure > 0.: - chaos_measure /= 2 - else: - chaos_measure = -1. - - if chaos_measure > threshold: - if p in working.keys(): - del working[p] - break - elif chaos_measure == 0.: - successive_chaos_zero += 1 - if steps > 2 and successive_chaos_zero > p_ / 2: - break - elif chaos_measure > 0. 
and successive_chaos_zero > 0: - successive_chaos_zero = 0 - - chaos_measures.append(chaos_measure) - - if p not in working.keys(): - working[p] = dict() - - if p in working.keys(): - working[p]['ratio'] = statistics.mean(chaos_measures) - working[p]['ranges'] = ranges_encountered_t - working[p]['chaos'] = sum(chaos_measures) - working[p]['len'] = decoded_len_t - working[p]['bom'] = bom_available - working[p]['bom_len'] = bom_len - - if p == 'ascii' and p in working.keys() and working[p]['ratio'] == 0.: - break - - return CharsetNormalizerMatches( - [CharsetNormalizerMatch(sequences if working[enc]['bom'] is False else sequences[working[enc]['bom_len']:], enc, working[enc]['ratio'], working[enc]['ranges'], working[enc]['bom']) for enc in - (sorted(working.keys()) if py_need_sort else working.keys()) if working[enc]['ratio'] <= threshold]) + + measures = [ProbeChaos(str(sequences[i:i + chunk_size], encoding=p, errors='ignore'), giveup_threshold=threshold) for i in r_] + ratios = [el.ratio for el in measures] + nb_gave_up = [el.gave_up is True or el.ratio >= threshold for el in measures].count(True) + + chaos_means = statistics.mean(ratios) + chaos_median = statistics.median(ratios) + chaos_min = min(ratios) + chaos_max = max(ratios) + + if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median > threshold: + # print(p, 'is too much chaos for decoded input !') + continue + + encountered_unicode_range_occurrences = dict() + + for el in measures: + for u_name, u_occ in el.encountered_unicode_range_occurrences.items(): + if u_name not in encountered_unicode_range_occurrences.keys(): + encountered_unicode_range_occurrences[u_name] = 0 + encountered_unicode_range_occurrences[u_name] += u_occ + + # print(p, 'U RANGES', encountered_unicode_range_occurrences) + + matches.append( + CharsetNormalizerMatch( + sequences if not bom_available else sequences[bom_len:], + p, + chaos_means, + encountered_unicode_range_occurrences, + bom_available + ) + ) + + # print(p, nb_gave_up, chaos_means, chaos_median, chaos_min, chaos_max, matches[-1].coherence, matches[-1].language) + + if (p == 'ascii' and chaos_median == 0.) 
or bom_available is True: + return CharsetNormalizerMatches([matches[-1]]) + + return CharsetNormalizerMatches(matches) @staticmethod - def from_fp(fp, steps=10, chunk_size=512, threshold=0.09): + def from_fp(fp, steps=10, chunk_size=512, threshold=0.20): """ :param io.BinaryIO fp: :param int steps: @@ -365,7 +353,7 @@ def from_fp(fp, steps=10, chunk_size=512, threshold=0.09): ) @staticmethod - def from_path(path, steps=10, chunk_size=512, threshold=0.09): + def from_path(path, steps=10, chunk_size=512, threshold=0.20): """ :param str path: :param int steps: From 576891715aaf4ffa39fe6492acd45fccddc187e2 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Thu, 12 Sep 2019 18:59:52 +0200 Subject: [PATCH 07/10] Add constant UNICODE_SECONDARY_RANGE_KEYWORD --- charset_normalizer/constant.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py index c16ed562..1ec27a16 100644 --- a/charset_normalizer/constant.py +++ b/charset_normalizer/constant.py @@ -569,6 +569,24 @@ "Variation Selectors Supplement" ] +UNICODE_SECONDARY_RANGE_KEYWORD = [ + 'Supplement', + 'Extended', + 'Extensions', + 'Modifier', + 'Marks', + 'Punctuation', + 'Symbols', + 'Forms', + 'Operators', + 'Miscellaneous', + 'Drawing', + 'Block', + 'Shapes', + 'Supplemental', + 'Tags' +] + BYTE_ORDER_MARK = { 'utf_8': BOM_UTF8, 'utf_7': [ From 124fb8afeb61e2d3026b88ac7e64755074f31e1c Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Thu, 12 Sep 2019 19:00:37 +0200 Subject: [PATCH 08/10] Tests improvements --- test/test_on_byte.py | 189 +++++++++++++++++++++------------------ test/test_on_file.py | 4 +- test/test_probe_chaos.py | 2 +- 3 files changed, 103 insertions(+), 92 deletions(-) diff --git a/test/test_on_byte.py b/test/test_on_byte.py index 52a22176..c68aeb74 100644 --- a/test/test_on_byte.py +++ b/test/test_on_byte.py @@ -6,95 +6,106 @@ class TestBytes(unittest.TestCase): def test_bom_detection(self): - self.assertFalse( - CnM.from_bytes( - '我没有埋怨,磋砣的只是一些时间。'.encode('gb18030') - ).best().first().byte_order_mark - ) - - self.assertTrue( - CnM.from_bytes( - (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030') - ).best().first().byte_order_mark - ) - - self.assertTrue( - CnM.from_bytes( - b'\x2b\x2f\x76\x38' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_7') - ).best().first().byte_order_mark - ) - - self.assertTrue( - CnM.from_bytes( - b'\xef\xbb\xbf' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8') - ).best().first().byte_order_mark - ) + with self.subTest('GB18030 UNAVAILABLE SIG'): + self.assertFalse( + CnM.from_bytes( + '我没有埋怨,磋砣的只是一些时间。'.encode('gb18030') + ).best().first().byte_order_mark + ) + + with self.subTest('GB18030 AVAILABLE SIG'): + self.assertTrue( + CnM.from_bytes( + (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030') + ).best().first().byte_order_mark + ) + + with self.subTest('UTF-7 AVAILABLE BOM'): + self.assertTrue( + CnM.from_bytes( + b'\x2b\x2f\x76\x38' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_7') + ).best().first().byte_order_mark + ) + + with self.subTest('UTF-8 AVAILABLE BOM'): + self.assertTrue( + CnM.from_bytes( + b'\xef\xbb\xbf' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8') + ).best().first().byte_order_mark + ) def test_encode_decode(self): - self.assertEqual( - CnM.from_bytes( - '我没有埋怨,磋砣的只是一些时间。'.encode('gb18030') - ).best().first().encoding, - 'gb18030' - ) - - self.assertEqual( - CnM.from_bytes( - (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030') - ).best().first().encoding, - 'gb18030' - ) - - self.assertEqual( - CnM.from_bytes( - 
'我没有埋怨,磋砣的只是一些时间。'.encode('utf_8') - ).best().first().encoding, - 'utf_8' - ) - - self.assertEqual( - CnM.from_bytes( - '我没有埋怨,蹉跎的只是一些时间。'.encode('utf_7') - ).best().first().encoding, - 'utf_7' - ) - - self.assertEqual( - CnM.from_bytes( - b'\x2b\x2f\x76\x38'+'我没有埋怨,磋砣的只是一些时间。'.encode('utf_7') - ).best().first().encoding, - 'utf_7' - ) - - - - self.assertEqual( - CnM.from_bytes( - 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно,'.encode('utf_7') - ).best().first().encoding, - 'utf_7' - ) - - self.assertEqual( - CnM.from_bytes( - b'\xef\xbb\xbf' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8') - ).best().first().encoding, - 'utf_8' - ) - - self.assertEqual( - CnM.from_bytes( - 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, ' - 'поне що се отнася до началното и основното образование.'.encode('utf_8') - ).best().first().encoding, - 'utf_8' - ) - - self.assertEqual( - CnM.from_bytes( - 'Bсеки човек има право на образование.'.encode( - 'utf_8') - ).best().first().encoding, - 'utf_8' - ) + with self.subTest('Encode & Detect GB18030 WITHOUT SIG'): + self.assertEqual( + CnM.from_bytes( + '我没有埋怨,磋砣的只是一些时间。'.encode('gb18030') + ).best().first().encoding, + 'gb18030' + ) + + with self.subTest('Encode & Detect GB18030 WITH SIG (CJK)'): + self.assertEqual( + CnM.from_bytes( + (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030') + ).best().first().encoding, + 'gb18030' + ) + + with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CJK)'): + self.assertEqual( + CnM.from_bytes( + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8') + ).best().first().encoding, + 'utf_8' + ) + + with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CJK)'): + self.assertEqual( + CnM.from_bytes( + '我没有埋怨,蹉跎的只是一些时间。'.encode('utf_7') + ).best().first().encoding, + 'utf_7' + ) + + with self.subTest('Encode & Detect UTF-7 WITH BOM (CJK)'): + self.assertEqual( + CnM.from_bytes( + b'\x2b\x2f\x76\x38'+'我没有埋怨,磋砣的只是一些时间。'.encode('utf_7') + ).best().first().encoding, + 'utf_7' + ) + + with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CYRILLIC)'): + self.assertEqual( + CnM.from_bytes( + 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно,'.encode('utf_7') + ).best().first().encoding, + 'utf_7' + ) + + with self.subTest('Encode & Detect UTF-8 WITH SIG (CJK)'): + self.assertEqual( + CnM.from_bytes( + b'\xef\xbb\xbf' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8') + ).best().first().encoding, + 'utf_8' + ) + + with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'): + self.assertEqual( + CnM.from_bytes( + 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, ' + 'поне що се отнася до началното и основното образование.'.encode('utf_8') + ).best().first().encoding, + 'utf_8' + ) + + with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'): + self.assertEqual( + CnM.from_bytes( + 'Bсеки човек има право на образование.'.encode( + 'utf_8') + ).best().first().encoding, + 'utf_8' + ) diff --git a/test/test_on_file.py b/test/test_on_file.py index 34cf3096..a1d726ef 100644 --- a/test/test_on_file.py +++ b/test/test_on_file.py @@ -11,11 +11,11 @@ class TestFileCharsetNormalizer(unittest.TestCase): SHOULD_BE = { 'sample.1.ar.srt': 'cp1256', 'sample.1.fr.srt': 'cp1252', - 'sample.1.gr.srt': 'iso8859_7', + 'sample.1.gr.srt': 'cp1253', 'sample.1.he.srt': 'cp1255', 'sample.1.hi.srt': 'ascii', 'sample.1.ru.srt': 'cp1251', - 'sample.1.tu.srt': 'cp1256', # Not actually the good one. But kinda readable. + 'sample.1.tu.srt': 'cp1252', # Not actually the good one. 
But kinda readable. 'sample.2.ar.srt': 'cp1256', 'sample.3.ar.srt': 'utf_8', 'sample.4.ar.srt': 'cp1256', diff --git a/test/test_probe_chaos.py b/test/test_probe_chaos.py index 0d322c9f..a064533b 100644 --- a/test/test_probe_chaos.py +++ b/test/test_probe_chaos.py @@ -7,7 +7,7 @@ class TestProbeChaos(unittest.TestCase): def test_not_gibberish(self): - self.assertEqual( + self.assertLessEqual( ProbeChaos('典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。').ratio, 0. ) From 77014f89ad5fa81aef6a76017c40c38e233c2284 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Thu, 12 Sep 2019 19:00:47 +0200 Subject: [PATCH 09/10] bump 0.3.0 --- setup.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/setup.py b/setup.py index 113d2f7d..55894ab8 100644 --- a/setup.py +++ b/setup.py @@ -8,12 +8,12 @@ # Package meta-data. NAME = 'charset_normalizer' -DESCRIPTION = 'The Real First Universal Charset Detector. Offer a viable solution alternative to Chardet.' +DESCRIPTION = 'The Real First Universal Charset Detector. No Cpp Bindings, Using Voodoo and Magical Artifacts.' URL = 'https://github.com/ousret/charset_normalizer' EMAIL = 'ahmed.tahri@cloudnursery.dev' AUTHOR = 'Ahmed TAHRI @Ousret' -REQUIRES_PYTHON = '>=3.4.0' -VERSION = '0.2.3' +REQUIRES_PYTHON = '>=3.5.0' +VERSION = '0.3.0' REQUIRED = [ 'cached_property', @@ -67,7 +67,6 @@ 'Operating System :: OS Independent', 'Programming Language :: Python', 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', 'Programming Language :: Python :: 3.7', From d5473af5ba817333dcff4b19b98a7a80eeb95699 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Thu, 12 Sep 2019 19:01:40 +0200 Subject: [PATCH 10/10] readme upd8 charset_normalizer become slower from revision 0.3 100ms/file --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 9f2ca0db..060d1293 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ This project offer you a alternative to **Universal Charset Encoding Detector**, | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) | | ------------- | :-------------: | :------------------: | :------------------: | -| `Fast` | ❌
🐌🐌 | ✅<br> | ✅<br>⚡ | +| `Fast` | ❌<br> | ❌<br> | ✅<br>⚡ | | `Universal**` | ❌ | ✅ | ❌ | | `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ | | `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
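
For context, a minimal usage sketch of the API as it stands after this series. The sample bytes and the 0.2 value below are illustrative only (the same limit is exposed on the CLI through the new --threshold flag); this is not part of the patches themselves.

```python
from charset_normalizer import CharsetNormalizerMatches as CnM

# Illustrative payload only; bytes read from any file behave the same way.
payload = '我没有埋怨,磋砣的只是一些时间。'.encode('gb18030')

# `threshold` is the knob added by this series: the maximum mean chaos ratio a
# decoded candidate may carry before it is discarded (0. <= threshold <= 1.).
matches = CnM.from_bytes(payload, steps=10, chunk_size=512, threshold=0.2)

if len(matches) == 0:
    print('Unable to identify originating encoding.')
else:
    best_guess = matches.best().first()
    print(best_guess.encoding)         # expected: 'gb18030', as asserted in test_on_byte.py
    print(best_guess.byte_order_mark)  # False: no BOM/SIG prepended to this payload
```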