diff --git a/README.md b/README.md
index 9f2ca0db..060d1293 100644
--- a/README.md
+++ b/README.md
@@ -24,7 +24,7 @@ This project offers you an alternative to **Universal Charset Encoding Detector**,
| Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
| ------------- | :-------------: | :------------------: | :------------------: |
-| `Fast` | ❌<br>🐌🐌 | ✅ | ✅<br>⚡ |
+| `Fast` | ❌ | ❌ | ✅<br>⚡ |
| `Universal**` | ❌ | ✅ | ❌ |
| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
diff --git a/charset_normalizer/__init__.py b/charset_normalizer/__init__.py
index 830e790b..8bf9349a 100644
--- a/charset_normalizer/__init__.py
+++ b/charset_normalizer/__init__.py
@@ -3,4 +3,5 @@
from charset_normalizer.unicode import UnicodeRangeIdentify
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence
+from charset_normalizer.probe_words import ProbeWords
from charset_normalizer.legacy import detect
diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py
index 5af8f01e..3d3fb9e7 100644
--- a/charset_normalizer/cli/normalizer.py
+++ b/charset_normalizer/cli/normalizer.py
@@ -1,9 +1,10 @@
import argparse
import sys
-from charset_normalizer import CharsetNormalizerMatches
from prettytable import PrettyTable
+from charset_normalizer import CharsetNormalizerMatches
+
def query_yes_no(question, default="yes"):
"""Ask a yes/no question via input() and return their answer.
@@ -56,6 +57,8 @@ def cli_detect(argv=None):
help='Replace file when trying to normalize it instead of creating a new one.')
parser.add_argument('--force', action="store_true", default=False, dest='force',
help='Replace file without asking if you are sure, use this flag with caution.')
+ parser.add_argument('--threshold', action="store", default=0.2, type=float, dest='threshold',
+ help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.")
args = parser.parse_args(argv)
@@ -72,10 +75,15 @@ def cli_detect(argv=None):
print('Use --force in addition of --replace only.', file=sys.stderr)
return 1
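+    # Validate the user-supplied chaos threshold before probing any file.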
+ if args.threshold < 0. or args.threshold > 1.:
+        print('--threshold VALUE should be between 0. and 1.', file=sys.stderr)
+ return 1
+
for my_file in args.file:
matches = CharsetNormalizerMatches.from_fp(
- my_file
+ my_file,
+ threshold=args.threshold
)
if len(matches) == 0:
diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py
index c16ed562..1ec27a16 100644
--- a/charset_normalizer/constant.py
+++ b/charset_normalizer/constant.py
@@ -569,6 +569,24 @@
"Variation Selectors Supplement"
]
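+# Keywords marking a unicode range as "secondary": characters from such ranges are
+# less expected inside ordinary words and raise suspicion in ProbeWords.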
+UNICODE_SECONDARY_RANGE_KEYWORD = [
+ 'Supplement',
+ 'Extended',
+ 'Extensions',
+ 'Modifier',
+ 'Marks',
+ 'Punctuation',
+ 'Symbols',
+ 'Forms',
+ 'Operators',
+ 'Miscellaneous',
+ 'Drawing',
+ 'Block',
+ 'Shapes',
+ 'Supplemental',
+ 'Tags'
+]
+
BYTE_ORDER_MARK = {
'utf_8': BOM_UTF8,
'utf_7': [
diff --git a/charset_normalizer/normalizer.py b/charset_normalizer/normalizer.py
index 4dfb57fb..c4c80b2f 100644
--- a/charset_normalizer/normalizer.py
+++ b/charset_normalizer/normalizer.py
@@ -1,17 +1,16 @@
# coding: utf-8
+import collections
import re
import statistics
from encodings.aliases import aliases
from os.path import basename, splitext
-import collections
+from platform import python_version_tuple
from cached_property import cached_property
-from charset_normalizer.probe_coherence import ProbeCoherence, HashableCounter
-from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.constant import BYTE_ORDER_MARK
-
-from platform import python_version_tuple
+from charset_normalizer.probe_chaos import ProbeChaos
+from charset_normalizer.probe_coherence import ProbeCoherence, HashableCounter
class CharsetNormalizerMatch:
@@ -93,8 +92,13 @@ def language(self):
:return: Most used/probable language in text
:rtype: str
"""
- languages = ProbeCoherence(self.char_counter).most_likely
- return languages[0] if len(languages) > 0 else ('English' if len(self.alphabets) == 1 and self.alphabets[0] == 'Basic Latin' else 'Unknown')
+ probe_coherence = ProbeCoherence(self.char_counter)
+ languages = probe_coherence.most_likely
+
+ if len(languages) == 0:
+ return 'English' if len(self.alphabets) == 1 and self.alphabets[0] == 'Basic Latin' else 'Unknown'
+
+ return languages[0]
@cached_property
def chaos(self):
@@ -194,7 +198,7 @@ def __len__(self):
return len(self._matches)
@staticmethod
- def normalize(path, steps=10, chunk_size=512, threshold=0.09):
+ def normalize(path, steps=10, chunk_size=512, threshold=0.20):
"""
:param str path:
:param int steps:
@@ -226,7 +230,7 @@ def normalize(path, steps=10, chunk_size=512, threshold=0.09):
return b_
@staticmethod
- def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
+ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20):
"""
Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
charset encoding.
@@ -244,7 +248,7 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
supported = sorted(aliases.items()) if py_need_sort else aliases.items()
tested = set()
- working = dict()
+ matches = list()
maximum_length = len(sequences)
@@ -286,70 +290,54 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.09):
except LookupError:
continue
- chaos_measures = list()
- ranges_encountered_t = dict()
- decoded_len_t = 0
-
- successive_chaos_zero = 0
r_ = range(
0 if bom_available is False else bom_len,
maximum_length,
int(maximum_length / steps)
)
- p_ = len(r_)
-
- for i in r_:
-
- chunk = sequences[i:i + chunk_size]
- decoded = str(chunk, encoding=p, errors='ignore')
-
- probe_chaos = ProbeChaos(decoded, giveup_threshold=threshold)
- chaos_measure, ranges_encountered = probe_chaos.ratio, probe_chaos.encountered_unicode_range_occurrences
-
- for k, e in ranges_encountered.items():
- if k not in ranges_encountered_t.keys():
- ranges_encountered_t[k] = 0
- ranges_encountered_t[k] += e
-
- if bom_available is True:
- if chaos_measure > 0.:
- chaos_measure /= 2
- else:
- chaos_measure = -1.
-
- if chaos_measure > threshold:
- if p in working.keys():
- del working[p]
- break
- elif chaos_measure == 0.:
- successive_chaos_zero += 1
- if steps > 2 and successive_chaos_zero > p_ / 2:
- break
- elif chaos_measure > 0. and successive_chaos_zero > 0:
- successive_chaos_zero = 0
-
- chaos_measures.append(chaos_measure)
-
- if p not in working.keys():
- working[p] = dict()
-
- if p in working.keys():
- working[p]['ratio'] = statistics.mean(chaos_measures)
- working[p]['ranges'] = ranges_encountered_t
- working[p]['chaos'] = sum(chaos_measures)
- working[p]['len'] = decoded_len_t
- working[p]['bom'] = bom_available
- working[p]['bom_len'] = bom_len
-
- if p == 'ascii' and p in working.keys() and working[p]['ratio'] == 0.:
- break
-
- return CharsetNormalizerMatches(
- [CharsetNormalizerMatch(sequences if working[enc]['bom'] is False else sequences[working[enc]['bom_len']:], enc, working[enc]['ratio'], working[enc]['ranges'], working[enc]['bom']) for enc in
- (sorted(working.keys()) if py_need_sort else working.keys()) if working[enc]['ratio'] <= threshold])
+
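+            # Decode evenly spaced chunks of the payload with the candidate encoding and
+            # measure the chaos of each chunk.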
+ measures = [ProbeChaos(str(sequences[i:i + chunk_size], encoding=p, errors='ignore'), giveup_threshold=threshold) for i in r_]
+ ratios = [el.ratio for el in measures]
+ nb_gave_up = [el.gave_up is True or el.ratio >= threshold for el in measures].count(True)
+
+ chaos_means = statistics.mean(ratios)
+ chaos_median = statistics.median(ratios)
+ chaos_min = min(ratios)
+ chaos_max = max(ratios)
+
+ if (len(r_) >= 4 and nb_gave_up > len(r_) / 4) or chaos_median > threshold:
+ # print(p, 'is too much chaos for decoded input !')
+ continue
+
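+            # Merge the per-chunk unicode range counters into a single tally for this encoding.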
+ encountered_unicode_range_occurrences = dict()
+
+ for el in measures:
+ for u_name, u_occ in el.encountered_unicode_range_occurrences.items():
+ if u_name not in encountered_unicode_range_occurrences.keys():
+ encountered_unicode_range_occurrences[u_name] = 0
+ encountered_unicode_range_occurrences[u_name] += u_occ
+
+ # print(p, 'U RANGES', encountered_unicode_range_occurrences)
+
+ matches.append(
+ CharsetNormalizerMatch(
+ sequences if not bom_available else sequences[bom_len:],
+ p,
+ chaos_means,
+ encountered_unicode_range_occurrences,
+ bom_available
+ )
+ )
+
+ # print(p, nb_gave_up, chaos_means, chaos_median, chaos_min, chaos_max, matches[-1].coherence, matches[-1].language)
+
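+            # A zero-chaos ASCII payload or an explicit byte order mark is decisive:
+            # stop probing other encodings and return this single match.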
+ if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
+ return CharsetNormalizerMatches([matches[-1]])
+
+ return CharsetNormalizerMatches(matches)
@staticmethod
- def from_fp(fp, steps=10, chunk_size=512, threshold=0.09):
+ def from_fp(fp, steps=10, chunk_size=512, threshold=0.20):
"""
:param io.BinaryIO fp:
:param int steps:
@@ -365,7 +353,7 @@ def from_fp(fp, steps=10, chunk_size=512, threshold=0.09):
)
@staticmethod
- def from_path(path, steps=10, chunk_size=512, threshold=0.09):
+ def from_path(path, steps=10, chunk_size=512, threshold=0.20):
"""
:param str path:
:param int steps:
diff --git a/charset_normalizer/probe_chaos.py b/charset_normalizer/probe_chaos.py
index 4ec0fa79..7f5d87c1 100644
--- a/charset_normalizer/probe_chaos.py
+++ b/charset_normalizer/probe_chaos.py
@@ -1,14 +1,15 @@
# coding: utf-8
import re
+from functools import lru_cache
from dragonmapper.hanzi import MIXED, BOTH, UNKNOWN
from dragonmapper.hanzi import identify as s_identify
from zhon.hanzi import sentence as cjc_sentence_re
+from charset_normalizer.probe_coherence import HashableCounter
+from charset_normalizer.probe_words import ProbeWords
from charset_normalizer.unicode import UnicodeRangeIdentify
-from functools import lru_cache
-
@lru_cache(maxsize=8192)
class ProbeChaos:
@@ -48,14 +49,62 @@ def __init__(self, string, giveup_threshold=0.09):
self.total_upper_accent_encountered_inner = 0
self.total_unaccented_letter_encountered = 0
+ self._probe_word = ProbeWords(HashableCounter(self._string.split()))
+
self.gave_up = False
if len(self._string) >= 10:
self._probe()
+ def __add__(self, other):
+ """
+ :param ProbeChaos other:
+ :return:
+ """
+ k_ = ProbeChaos('', self._threshold)
+
+ k_.successive_upper_lower = self.successive_upper_lower + other.successive_upper_lower
+ k_.successive_accent = self.successive_accent + other.successive_accent
+ k_.successive_different_unicode_range = self.successive_different_unicode_range + other.successive_different_unicode_range
+
+ for el in self.encountered_unicode_range:
+ k_.encountered_unicode_range.add(el)
+
+ for el in other.encountered_unicode_range:
+ k_.encountered_unicode_range.add(el)
+
+ k_.encountered_punc_sign = self.encountered_punc_sign + other.encountered_punc_sign
+ k_.unprintable = self.unprintable + other.unprintable
+ k_.encountered_white_space = self.encountered_white_space + other.encountered_white_space
+ k_.not_encountered_white_space = self.not_encountered_white_space + other.not_encountered_white_space
+
+ for u_name, u_occ in self.encountered_unicode_range_occurrences.items():
+ if u_name not in k_.encountered_unicode_range_occurrences.keys():
+ k_.encountered_unicode_range_occurrences[u_name] = 0
+ k_.encountered_unicode_range_occurrences[u_name] += u_occ
+
+ for u_name, u_occ in other.encountered_unicode_range_occurrences.items():
+ if u_name not in k_.encountered_unicode_range_occurrences.keys():
+ k_.encountered_unicode_range_occurrences[u_name] = 0
+ k_.encountered_unicode_range_occurrences[u_name] += u_occ
+
+ k_.not_encountered_white_space_reset = self.not_encountered_white_space_reset + other.not_encountered_white_space_reset
+ k_.total_letter_encountered = self.total_letter_encountered + other.total_letter_encountered
+ k_.total_lower_letter_encountered = self.total_lower_letter_encountered + other.total_lower_letter_encountered
+ k_.total_upper_accent_encountered = self.total_upper_accent_encountered + other.total_upper_accent_encountered
+ k_.total_upper_accent_encountered_inner = self.total_upper_accent_encountered_inner + other.total_upper_accent_encountered_inner
+ k_.total_unaccented_letter_encountered = self.total_unaccented_letter_encountered + other.total_unaccented_letter_encountered
+
+ k_._probe_word = self._probe_word + other._probe_word
+
+ k_._string = self._string + other._string
+
+ return k_
+
def _probe(self):
c__ = False
+ upper_lower_m = False
for c, i_ in zip(self._string, range(0, len(self._string))):
@@ -133,7 +182,13 @@ def _probe(self):
continue
if (is_lower and self.previous_printable_letter.isupper()) or (is_upper and self.previous_printable_letter.islower()):
- self.successive_upper_lower += 1
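+                # Count an upper/lower case alternation only every second time it is
+                # seen, halving the penalty for camelCase-like sequences.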
+ if not upper_lower_m:
+ upper_lower_m = True
+ else:
+ self.successive_upper_lower += 1
+ upper_lower_m = False
+            else:
+                upper_lower_m = False
if is_latin:
self.previous_encountered_unicode_range = u_name
@@ -154,6 +209,8 @@ def _probe(self):
@staticmethod
def _unravel_cjk_suspicious_chinese(string, encountered_unicode_range_occurrences):
+ if len(string) <= 10:
+ return UNKNOWN
encountered_unicode_range = encountered_unicode_range_occurrences.keys()
@@ -161,8 +218,10 @@ def _unravel_cjk_suspicious_chinese(string, encountered_unicode_range_occurrence
i_ = s_identify(string)
if i_ in [MIXED, BOTH]:
return encountered_unicode_range_occurrences['CJK Unified Ideographs']
- elif i_ != UNKNOWN and len(re.findall(cjc_sentence_re, string)) == 0:
- return encountered_unicode_range_occurrences['CJK Unified Ideographs']
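+        # Chinese text containing genuine CJK sentence punctuation now lowers the chaos
+        # score (negative contribution); otherwise only ~30% of the ideograph count is
+        # kept as a penalty.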
+ elif i_ != UNKNOWN and len(re.findall(cjc_sentence_re, string)) > 0:
+ return -encountered_unicode_range_occurrences['CJK Unified Ideographs']
+ elif i_ != UNKNOWN:
+ return int(encountered_unicode_range_occurrences['CJK Unified Ideographs']*0.3)
return UNKNOWN
@@ -178,4 +237,4 @@ def ratio(self):
r_ = self.total_upper_accent_encountered if self.total_letter_encountered > 0 and self.total_unaccented_letter_encountered / self.total_letter_encountered < 0.5 else 0
z_ = UnicodeRangeIdentify.unravel_suspicious_ranges(len(self._string), self.encountered_unicode_range_occurrences)
p_ = self.encountered_punc_sign if self.encountered_punc_sign / len(self._string) > 0.2 else 0
- return (r_ + p_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string) # + len(self.encountered_unicode_range)-1
+ return ((r_ + p_ + self.successive_upper_lower + self.successive_accent + self.successive_different_unicode_range + self.not_encountered_white_space + self.unprintable + z_ + ProbeChaos._unravel_cjk_suspicious_chinese.__func__(self._string, self.encountered_unicode_range_occurrences)) / len(self._string)) + self._probe_word.ratio # + len(self.encountered_unicode_range)-1
diff --git a/charset_normalizer/probe_coherence.py b/charset_normalizer/probe_coherence.py
index 0bdddfa4..8faeb4c9 100644
--- a/charset_normalizer/probe_coherence.py
+++ b/charset_normalizer/probe_coherence.py
@@ -1,13 +1,10 @@
# coding: utf-8
-import statistics
-from collections import Counter
-
-from cached_property import cached_property
-
import json
+from collections import Counter
+from functools import lru_cache
from os.path import dirname, realpath, exists
-from functools import lru_cache
+from cached_property import cached_property
class HashableCounter(Counter):
@@ -56,6 +53,15 @@ def most_likely(self):
k_ = [self.index_of_rates[str(el[0])][str(el[1])] for el in sorted(p_, key=lambda tup: sum(tup))]
return [item for sublist in k_ for item in sublist][:3]
+ def ratio_of(self, language):
+ """
+ :param str language:
+        :return: Rank of the given language (lower is better); 1. when the language was not ranked
+ """
+ if language.capitalize() not in self.rank_per_lang:
+ return 1.
+ return self.rank_per_lang[language.capitalize()]
+
@cached_property
def ratio(self):
"""
@@ -65,11 +71,16 @@ def ratio(self):
:return: Ratio as floating number
:rtype: float
"""
- p_ = [(float(el), float(sorted(self.index_of_rates[str(el)].keys())[0])) for el in
- sorted([float(el) for el in list(self.index_of_rates.keys())])]
-
- return statistics.mean([sum(el) for el in p_[:2]]) if len(
- self.rank_per_lang.keys()) > 0 else 1.
+ # p_ = [(float(el), float(sorted(self.index_of_rates[str(el)].keys())[0])) for el in
+ # sorted([float(el) for el in list(self.index_of_rates.keys())])]
+ #
+ # return statistics.mean([sum(el) for el in p_[:2]]) if len(
+ # self.rank_per_lang.keys()) > 0 else 1.
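+        # The coherence ratio is now the summed rank of the most likely languages;
+        # the lower the sum, the more coherent the decoded text. 1. means no match.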
+ languages = self.most_likely
+ if len(languages) == 0:
+ return 1.
+ ratios = [self.rank_per_lang[lg] for lg in languages]
+ return sum(ratios)
def _probe(self):
diff --git a/charset_normalizer/probe_words.py b/charset_normalizer/probe_words.py
new file mode 100644
index 00000000..b3a2cc06
--- /dev/null
+++ b/charset_normalizer/probe_words.py
@@ -0,0 +1,69 @@
+from functools import lru_cache
+
+from charset_normalizer.probe_coherence import HashableCounter
+from charset_normalizer.unicode import UnicodeRangeIdentify
+
+
+@lru_cache(maxsize=8192)
+class ProbeWords:
+
+ def __init__(self, w_counter):
+ """
+ :param HashableCounter w_counter:
+ """
+ self._w_counter = w_counter
+
+ self._words = list()
+ self._nb_words = 0
+
+ self._suspicious = list()
+
+ if w_counter is not None:
+ self._words = list(w_counter.keys())
+ self._nb_words = len(self._words)
+
+ self._probe()
+
+ def __add__(self, other):
+ """
+
+ :param ProbeWords other:
+ :return:
+ """
+ k_ = ProbeWords(None)
+
+ k_._nb_words = self._nb_words + other._nb_words
+ k_._suspicious = self._suspicious + other._suspicious
+
+ return k_
+
+ def _probe(self):
+
+ for el in self._words:
+
+ w_len = len(el)
+ classification = UnicodeRangeIdentify.classification(el)
+
+ c_ = 0
+
+            is_latin_based = all('Latin' in u_name for u_name in classification.keys())
+
+ if len(classification.keys()) > 1:
+ for u_name, u_occ in classification.items():
+
+ if UnicodeRangeIdentify.is_range_secondary(u_name) is True:
+ c_ += u_occ
+
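+            # A word is flagged as suspicious when secondary unicode ranges, punctuation,
+            # accents or isolated range switches make up too large a share of it.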
+ if (not is_latin_based and c_ > int(w_len / 4)) \
+ or (is_latin_based and c_ > int(w_len / 2)) \
+ or (UnicodeRangeIdentify.part_punc(el) > 0.4 and len(classification.keys()) > 1) \
+ or (not is_latin_based and UnicodeRangeIdentify.part_accent(el) > 0.4) \
+ or (not is_latin_based and len(el) > 10 and UnicodeRangeIdentify.part_lonely_range(el) > 0.3):
+ self._suspicious.append(el)
+
+ @property
+ def ratio(self):
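+        """
+        :return: Proportion of words flagged as suspicious; 0. when five words or fewer were observed
+        :rtype: float
+        """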
+ return len(self._suspicious) / self._nb_words if self._nb_words > 5 else 0.
+
diff --git a/charset_normalizer/unicode.py b/charset_normalizer/unicode.py
index fd70c306..69a7d46d 100644
--- a/charset_normalizer/unicode.py
+++ b/charset_normalizer/unicode.py
@@ -1,7 +1,8 @@
# coding: utf-8
-from charset_normalizer.constant import UNICODE_RANGES_ZIP, UNICODE_RANGES_NAMES
from functools import lru_cache
+from charset_normalizer.constant import UNICODE_RANGES_ZIP, UNICODE_RANGES_NAMES, UNICODE_SECONDARY_RANGE_KEYWORD
+
class UnicodeRangeIdentify:
@@ -52,7 +53,7 @@ def is_latin(letter):
:param str letter:
:return:
"""
- return 'Latin' in UnicodeRangeIdentify.find_letter_type(letter)
+ return 'Latin' in (UnicodeRangeIdentify.find_letter_type(letter) or '')
@staticmethod
@lru_cache(maxsize=8192)
@@ -65,9 +66,10 @@ def is_punc(letter):
if letter.isspace():
return True
r_name = UnicodeRangeIdentify.find_letter_type(letter)
- return "Punctuation" in r_name or \
- 'Forms' in r_name or \
- letter in 'º¯—–‒‐⁃«‹?!;.:^$*»£¹¿~ª؟©±¡{}[]|¼½¾⅕⅙⅛™℠‼⁇❝❞¶⁋√↑↓�'
+ return r_name is not None and \
+ ("Punctuation" in r_name or
+ 'Forms' in r_name or
+ letter in 'º¯—–‒‐⁃«‹?!;.:^$*»£¹¿~ª؟©±¡{}[]|¼½¾⅕⅙⅛™℠‼⁇❝❞¶⁋√↑↓�¤`')
@staticmethod
@lru_cache(maxsize=8192)
@@ -77,7 +79,7 @@ def is_cjk(letter):
:param str letter:
:return:
"""
- return 'CJK' in UnicodeRangeIdentify.find_letter_type(letter)
+ return 'CJK' in (UnicodeRangeIdentify.find_letter_type(letter) or '')
@staticmethod
def unravel_suspicious_ranges(str_len, encountered_unicode_range_occurrences):
@@ -110,13 +112,15 @@ def is_suspiciously_successive_range(range_name_a, range_name_b):
:param str range_name_b:
:return:
"""
+ if range_name_a is None or range_name_b is None:
+ return True
dec_range_name_a, dec_range_name_b = range_name_a.split(), range_name_b.split()
if range_name_a == range_name_b:
return False
- if 'Latin' in range_name_a or 'Latin' in range_name_b:
+ if 'Latin' in range_name_a and 'Latin' in range_name_b:
return False
for el in dec_range_name_a:
@@ -130,3 +134,110 @@ def is_suspiciously_successive_range(range_name_a, range_name_b):
return False
return True
+
+ @staticmethod
+ def classification(word):
+ """
+ :param str word:
+ :return:
+ """
+ cla_ = dict()
+
+ for el in word:
+ if el.isspace():
+                raise IOError('Classification should not be invoked on sentences!')
+ u_name = UnicodeRangeIdentify.find_letter_type(el)
+ if u_name is None:
+ u_name = 'Unknown'
+ if u_name not in cla_:
+ cla_[u_name] = 0
+ cla_[u_name] += 1
+
+ return cla_
+
+ @staticmethod
+ @lru_cache(maxsize=512)
+ def is_range_secondary(u_range):
+ """
+ :param str u_range:
+ :return:
+ """
+ try:
+ UnicodeRangeIdentify.get_range_id(u_range)
+ except ValueError:
+ return True
+
+ for keyword in UNICODE_SECONDARY_RANGE_KEYWORD:
+ if keyword in u_range:
+ return True
+
+ return False
+
+ @staticmethod
+ def part_punc(word):
+ """
+        Determine what proportion of the word is composed of punctuation signs
+ :param str word:
+ :return:
+ """
+ return [UnicodeRangeIdentify.is_punc(el) for el in word].count(True) / len(word)
+
+ @staticmethod
+ def part_accent(word):
+ """
+        Determine what proportion of the word is composed of accentuated letters
+        :param str word:
+ :return:
+ """
+ return [UnicodeRangeIdentify.is_accentuated(el) for el in word].count(True) / len(word)
+
+ @staticmethod
+ def word_to_range_list(word):
+ """
+
+ :param str word:
+ :return:
+ """
+ return [UnicodeRangeIdentify.find_letter_type(el) for el in word]
+
+ @staticmethod
+ def word_to_range_continue(word):
+ """
+
+ :param str word:
+ :return:
+ """
+ l_ = list()
+
+ for el in word:
+ u_name = UnicodeRangeIdentify.find_letter_type(el)
+ if len(l_) == 0:
+ l_.append(
+ (
+ u_name,
+ 1
+ )
+ )
+ else:
+ if UnicodeRangeIdentify.is_suspiciously_successive_range(u_name, l_[-1][0]) is True:
+ l_.append(
+ (
+ u_name,
+ 1
+ )
+ )
+ else:
+ l_[-1] = (
+ u_name,
+ l_[-1][1]+1
+ )
+
+ return l_
+
+ @staticmethod
+ def part_lonely_range(word):
+ """
+ :param str word:
+ :return:
+ """
+ return [u_occ_cont == 1 for u_name, u_occ_cont in UnicodeRangeIdentify.word_to_range_continue(word)].count(True) / len(word)
diff --git a/setup.py b/setup.py
index 113d2f7d..55894ab8 100644
--- a/setup.py
+++ b/setup.py
@@ -8,12 +8,12 @@
# Package meta-data.
NAME = 'charset_normalizer'
-DESCRIPTION = 'The Real First Universal Charset Detector. Offer a viable solution alternative to Chardet.'
+DESCRIPTION = 'The Real First Universal Charset Detector. No Cpp Bindings, Using Voodoo and Magical Artifacts.'
URL = 'https://github.com/ousret/charset_normalizer'
EMAIL = 'ahmed.tahri@cloudnursery.dev'
AUTHOR = 'Ahmed TAHRI @Ousret'
-REQUIRES_PYTHON = '>=3.4.0'
-VERSION = '0.2.3'
+REQUIRES_PYTHON = '>=3.5.0'
+VERSION = '0.3.0'
REQUIRED = [
'cached_property',
@@ -67,7 +67,6 @@
'Operating System :: OS Independent',
'Programming Language :: Python',
'Programming Language :: Python :: 3',
- 'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.7',
diff --git a/test/test_on_byte.py b/test/test_on_byte.py
index 52a22176..c68aeb74 100644
--- a/test/test_on_byte.py
+++ b/test/test_on_byte.py
@@ -6,95 +6,106 @@
class TestBytes(unittest.TestCase):
def test_bom_detection(self):
- self.assertFalse(
- CnM.from_bytes(
- '我没有埋怨,磋砣的只是一些时间。'.encode('gb18030')
- ).best().first().byte_order_mark
- )
-
- self.assertTrue(
- CnM.from_bytes(
- (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030')
- ).best().first().byte_order_mark
- )
-
- self.assertTrue(
- CnM.from_bytes(
- b'\x2b\x2f\x76\x38' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_7')
- ).best().first().byte_order_mark
- )
-
- self.assertTrue(
- CnM.from_bytes(
- b'\xef\xbb\xbf' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')
- ).best().first().byte_order_mark
- )
+ with self.subTest('GB18030 UNAVAILABLE SIG'):
+ self.assertFalse(
+ CnM.from_bytes(
+ '我没有埋怨,磋砣的只是一些时间。'.encode('gb18030')
+ ).best().first().byte_order_mark
+ )
+
+ with self.subTest('GB18030 AVAILABLE SIG'):
+ self.assertTrue(
+ CnM.from_bytes(
+ (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030')
+ ).best().first().byte_order_mark
+ )
+
+ with self.subTest('UTF-7 AVAILABLE BOM'):
+ self.assertTrue(
+ CnM.from_bytes(
+ b'\x2b\x2f\x76\x38' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_7')
+ ).best().first().byte_order_mark
+ )
+
+ with self.subTest('UTF-8 AVAILABLE BOM'):
+ self.assertTrue(
+ CnM.from_bytes(
+ b'\xef\xbb\xbf' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')
+ ).best().first().byte_order_mark
+ )
def test_encode_decode(self):
- self.assertEqual(
- CnM.from_bytes(
- '我没有埋怨,磋砣的只是一些时间。'.encode('gb18030')
- ).best().first().encoding,
- 'gb18030'
- )
-
- self.assertEqual(
- CnM.from_bytes(
- (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030')
- ).best().first().encoding,
- 'gb18030'
- )
-
- self.assertEqual(
- CnM.from_bytes(
- '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')
- ).best().first().encoding,
- 'utf_8'
- )
-
- self.assertEqual(
- CnM.from_bytes(
- '我没有埋怨,蹉跎的只是一些时间。'.encode('utf_7')
- ).best().first().encoding,
- 'utf_7'
- )
-
- self.assertEqual(
- CnM.from_bytes(
- b'\x2b\x2f\x76\x38'+'我没有埋怨,磋砣的只是一些时间。'.encode('utf_7')
- ).best().first().encoding,
- 'utf_7'
- )
-
-
-
- self.assertEqual(
- CnM.from_bytes(
- 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно,'.encode('utf_7')
- ).best().first().encoding,
- 'utf_7'
- )
-
- self.assertEqual(
- CnM.from_bytes(
- b'\xef\xbb\xbf' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')
- ).best().first().encoding,
- 'utf_8'
- )
-
- self.assertEqual(
- CnM.from_bytes(
- 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, '
- 'поне що се отнася до началното и основното образование.'.encode('utf_8')
- ).best().first().encoding,
- 'utf_8'
- )
-
- self.assertEqual(
- CnM.from_bytes(
- 'Bсеки човек има право на образование.'.encode(
- 'utf_8')
- ).best().first().encoding,
- 'utf_8'
- )
+ with self.subTest('Encode & Detect GB18030 WITHOUT SIG'):
+ self.assertEqual(
+ CnM.from_bytes(
+ '我没有埋怨,磋砣的只是一些时间。'.encode('gb18030')
+ ).best().first().encoding,
+ 'gb18030'
+ )
+
+ with self.subTest('Encode & Detect GB18030 WITH SIG (CJK)'):
+ self.assertEqual(
+ CnM.from_bytes(
+ (u'\uFEFF' + '我没有埋怨,磋砣的只是一些时间。').encode('gb18030')
+ ).best().first().encoding,
+ 'gb18030'
+ )
+
+ with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CJK)'):
+ self.assertEqual(
+ CnM.from_bytes(
+ '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')
+ ).best().first().encoding,
+ 'utf_8'
+ )
+
+ with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CJK)'):
+ self.assertEqual(
+ CnM.from_bytes(
+ '我没有埋怨,蹉跎的只是一些时间。'.encode('utf_7')
+ ).best().first().encoding,
+ 'utf_7'
+ )
+
+ with self.subTest('Encode & Detect UTF-7 WITH BOM (CJK)'):
+ self.assertEqual(
+ CnM.from_bytes(
+ b'\x2b\x2f\x76\x38'+'我没有埋怨,磋砣的只是一些时间。'.encode('utf_7')
+ ).best().first().encoding,
+ 'utf_7'
+ )
+
+ with self.subTest('Encode & Detect UTF-7 WITHOUT BOM (CYRILLIC)'):
+ self.assertEqual(
+ CnM.from_bytes(
+ 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно,'.encode('utf_7')
+ ).best().first().encoding,
+ 'utf_7'
+ )
+
+ with self.subTest('Encode & Detect UTF-8 WITH SIG (CJK)'):
+ self.assertEqual(
+ CnM.from_bytes(
+ b'\xef\xbb\xbf' + '我没有埋怨,磋砣的只是一些时间。'.encode('utf_8')
+ ).best().first().encoding,
+ 'utf_8'
+ )
+
+ with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
+ self.assertEqual(
+ CnM.from_bytes(
+ 'Bсеки човек има право на образование. Oбразованието трябва да бъде безплатно, '
+ 'поне що се отнася до началното и основното образование.'.encode('utf_8')
+ ).best().first().encoding,
+ 'utf_8'
+ )
+
+ with self.subTest('Encode & Detect UTF-8 WITHOUT SIG (CYRILLIC)'):
+ self.assertEqual(
+ CnM.from_bytes(
+ 'Bсеки човек има право на образование.'.encode(
+ 'utf_8')
+ ).best().first().encoding,
+ 'utf_8'
+ )
diff --git a/test/test_on_file.py b/test/test_on_file.py
index 34cf3096..a1d726ef 100644
--- a/test/test_on_file.py
+++ b/test/test_on_file.py
@@ -11,11 +11,11 @@ class TestFileCharsetNormalizer(unittest.TestCase):
SHOULD_BE = {
'sample.1.ar.srt': 'cp1256',
'sample.1.fr.srt': 'cp1252',
- 'sample.1.gr.srt': 'iso8859_7',
+ 'sample.1.gr.srt': 'cp1253',
'sample.1.he.srt': 'cp1255',
'sample.1.hi.srt': 'ascii',
'sample.1.ru.srt': 'cp1251',
- 'sample.1.tu.srt': 'cp1256', # Not actually the good one. But kinda readable.
+ 'sample.1.tu.srt': 'cp1252', # Not actually the good one. But kinda readable.
'sample.2.ar.srt': 'cp1256',
'sample.3.ar.srt': 'utf_8',
'sample.4.ar.srt': 'cp1256',
diff --git a/test/test_probe_chaos.py b/test/test_probe_chaos.py
index 0d322c9f..a064533b 100644
--- a/test/test_probe_chaos.py
+++ b/test/test_probe_chaos.py
@@ -7,7 +7,7 @@ class TestProbeChaos(unittest.TestCase):
def test_not_gibberish(self):
- self.assertEqual(
+ self.assertLessEqual(
ProbeChaos('典肇乎庚辰年十二月廿一,及己丑年二月十九,收各方語言二百五十,合逾七百萬目;二十大卷佔八成,單英文卷亦過二百萬。悉文乃天下有志共筆而成;有意助之,幾網路、隨纂作,大典茁焉。').ratio,
0.
)