From a2a4682ba7055314bd11d8ec8787d703918cbd9a Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Mon, 30 Sep 2019 20:01:29 +0200
Subject: [PATCH] Release 1.3.0 (#17)

* Backport unicodedata for v12 impl into python if available

* Add aliases to CharsetNormalizerMatches class

* Add feature preemptive behaviour, looking for encoding declaration

* import aliases in __init__

* Change text in Why. More concise.

* bump 1.3.0

* initial docs work in progress
---
 README.md                                 |   5 +-
 charset_normalizer/__init__.py            |   3 +-
 charset_normalizer/hook.py                |   6 +
 charset_normalizer/normalizer.py          |  39 ++++-
 charset_normalizer/probe_inherent_sign.py |  39 +++++
 docs/Makefile                             |  20 +++
 docs/advanced_search.rst                  |  20 +++
 docs/conf.py                              | 163 +++++++++++++++++++++
 docs/getstarted.rst                       |  70 +++++++++
 docs/handling_result.rst                  |   5 +
 docs/index.rst                            |  48 +++++++
 docs/requirements.txt                     |   1 +
 docs/support.rst                          | 167 ++++++++++++++++++++++
 setup.py                                  |   5 +-
 14 files changed, 579 insertions(+), 12 deletions(-)
 create mode 100644 charset_normalizer/probe_inherent_sign.py
 create mode 100755 docs/Makefile
 create mode 100755 docs/advanced_search.rst
 create mode 100755 docs/conf.py
 create mode 100755 docs/getstarted.rst
 create mode 100755 docs/handling_result.rst
 create mode 100755 docs/index.rst
 create mode 100755 docs/requirements.txt
 create mode 100755 docs/support.rst

diff --git a/README.md b/README.md
index 532f3a67..d0773fb6 100644
--- a/README.md
+++ b/README.md
@@ -106,9 +106,8 @@ See wiki for advanced usages. *Todo, not yet available.*
 ## 😇 Why
-When I started using Chardet, I noticed that this library was wrong most of the time
-when it's not about Unicode, Gb or Big5. That because some charset are easily identifiable
-because of there standards and Chardet does a really good job at identifying them.
+When I started using Chardet, I noticed that it was no longer reliable and, more importantly,
+that it is unmaintained, and most likely always will be.
I **don't care** about the **originating charset** encoding, that because **two different table** can produce **two identical file.** diff --git a/charset_normalizer/__init__.py b/charset_normalizer/__init__.py index 13492872..c1366cc1 100644 --- a/charset_normalizer/__init__.py +++ b/charset_normalizer/__init__.py @@ -1,5 +1,6 @@ # coding: utf-8 -from charset_normalizer.normalizer import CharsetNormalizerMatches, CharsetNormalizerMatch +from charset_normalizer.normalizer import CharsetNormalizerMatches, CharsetNormalizerMatch, \ + CharsetDetector, CharsetDoctor, EncodingDetector # Aliases from charset_normalizer.unicode import UnicodeRangeIdentify from charset_normalizer.probe_chaos import ProbeChaos from charset_normalizer.probe_coherence import ProbeCoherence diff --git a/charset_normalizer/hook.py b/charset_normalizer/hook.py index a0b2c5a8..2994a213 100644 --- a/charset_normalizer/hook.py +++ b/charset_normalizer/hook.py @@ -12,3 +12,9 @@ def charset_normalizer_hook(exctype, value, traceback): sys.excepthook = charset_normalizer_hook + +try: + import unicodedata2 + sys.modules['unicodedata'] = unicodedata2 +except ImportError: + pass diff --git a/charset_normalizer/normalizer.py b/charset_normalizer/normalizer.py index 922c7048..a6845876 100644 --- a/charset_normalizer/normalizer.py +++ b/charset_normalizer/normalizer.py @@ -15,6 +15,8 @@ from charset_normalizer.encoding import is_multi_byte_encoding +from charset_normalizer.probe_inherent_sign import any_specified_encoding + from loguru import logger from hashlib import sha256 @@ -319,7 +321,7 @@ def normalize(path, steps=10, chunk_size=512, threshold=0.20): return b_ @staticmethod - def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False): + def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False): """ Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported charset encoding. @@ -327,7 +329,8 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation :param bytes sequences: Actual sequence of bytes to analyse :param float threshold: Maximum amount of chaos allowed on first pass :param int chunk_size: Size to extract and analyse in each step - :param int steps: Number of steps + :param int steps: Number of steps/block to extract from sequence + :param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding :param bool explain: Print on screen what is happening when searching for a match :param list[str] cp_isolation: Finite list of encoding to use when searching for a match :param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match @@ -381,6 +384,13 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation tested = set() matches = list() + specified_encoding = any_specified_encoding(sequences) if preemptive_behaviour is True else None + + if specified_encoding is not None: + warn( + 'Trying to detect encoding on a sequence that seems to declare a encoding ({}).'.format(specified_encoding) + ) + for support in supported: k, p = support @@ -493,8 +503,16 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation cnm ) + if specified_encoding is not None and p == specified_encoding: + logger.info('{encoding} is most likely the one. 
' + 'Because it is specified in analysed byte sequence and ' + 'initial test passed successfully. ' + 'Disable this behaviour by setting preemptive_behaviour ' + 'to False', encoding=specified_encoding) + return CharsetNormalizerMatches([cnm]) if any(fingerprint_tests) is False else CharsetNormalizerMatches([matches[fingerprint_tests.index(True)]]) + if (p == 'ascii' and chaos_median == 0.) or bom_available is True: - logger.info('{encoding} is the most likely the one. {bom_available}', + logger.info('{encoding} is most likely the one. {bom_available}', encoding=p, bom_available='BOM/SIG available' if bom_available else '') @@ -503,13 +521,14 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation return CharsetNormalizerMatches(matches) @staticmethod - def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False): + def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False): """ :param io.BinaryIO fp: :param int steps: :param int chunk_size: :param float threshold: :param bool explain: Print on screen what is happening when searching for a match + :param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding :param list[str] cp_isolation: Finite list of encoding to use when searching for a match :param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match :return: List of potential matches @@ -522,16 +541,18 @@ def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_ threshold, cp_isolation, cp_exclusion, + preemptive_behaviour, explain ) @staticmethod - def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False): + def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False): """ :param str path: :param int steps: :param int chunk_size: :param float threshold: + :param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding :param bool explain: Print on screen what is happening when searching for a match :param list[str] cp_isolation: Finite list of encoding to use when searching for a match :param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match @@ -539,7 +560,7 @@ def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, :rtype: CharsetNormalizerMatches """ with open(path, 'rb') as fp: - return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, explain) + return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, preemptive_behaviour, explain) @cached_property def could_be_from_charset(self): @@ -596,3 +617,9 @@ def best(self): return CharsetNormalizerMatches( sorted_matches[:nb_lowest_ratio+1] ) + + +# Some aliases to CharsetNormalizerMatches, because it is too long for a class name. 
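+# The aliases below point to the exact same class, so, for instance (illustrative only),
+#   CharsetDetector.from_path('./data/sample.1.ar.srt').best().first()
+# behaves identically to calling CharsetNormalizerMatches.from_path(...) with the longer name.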
+CharsetDetector = CharsetNormalizerMatches +EncodingDetector = CharsetNormalizerMatches +CharsetDoctor = CharsetNormalizerMatches diff --git a/charset_normalizer/probe_inherent_sign.py b/charset_normalizer/probe_inherent_sign.py new file mode 100644 index 00000000..5b2a536c --- /dev/null +++ b/charset_normalizer/probe_inherent_sign.py @@ -0,0 +1,39 @@ +from re import findall, compile, IGNORECASE +from encodings.aliases import aliases + +RE_POSSIBLE_ENCODING_INDICATION = compile( + r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)', + IGNORECASE +) + + +def any_specified_encoding(sequence): + """ + Search in sequence (ASCII-mode) if there is any sign of declared encoding. + :param bytes sequence: + :return: Declared encoding if any else None + :rtype: str + """ + if not isinstance(sequence, bytes) and not isinstance(sequence, bytearray): + raise TypeError + + seq_len = len(sequence) + + results = findall( + RE_POSSIBLE_ENCODING_INDICATION, + sequence[:seq_len if seq_len <= 2048 else int(seq_len*0.3)].decode('ascii', errors='ignore') + ) # type: list[str] + + if len(results) == 0: + return None + + for specified_encoding in results: + specified_encoding = specified_encoding.lower().replace('-', '_') + + for a, b in aliases.items(): + if a == specified_encoding: + return b + if b == specified_encoding: + return b + + return None diff --git a/docs/Makefile b/docs/Makefile new file mode 100755 index 00000000..b6888c16 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = python -msphinx +SPHINXPROJ = Charset Normalizer +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/advanced_search.rst b/docs/advanced_search.rst new file mode 100755 index 00000000..dca714d9 --- /dev/null +++ b/docs/advanced_search.rst @@ -0,0 +1,20 @@ +Advanced Search +=============== + +Charset Normalizer method ``from_bytes``, ``from_fp`` and ``from_path`` provide some +optional parameters that can be tweaked. + +As follow :: + + CharsetDetector.from_bytes( + my_byte_str, + steps=10, # Number of steps/block to extract from my_byte_str + chunk_size=512, # Set block size of each extraction + threshold=0.2, # Maximum amount of chaos allowed on first pass + cp_isolation=None, # Finite list of encoding to use when searching for a match + cp_exclusion=None, # Finite list of encoding to avoid when searching for a match + preemptive_behaviour=True, # Determine if we should look into my_byte_str (ASCII-Mode) for pre-defined encoding + explain=False # Print on screen what is happening when searching for a match + ) + +!! Warning !! Work in Progress Documentation !! diff --git a/docs/conf.py b/docs/conf.py new file mode 100755 index 00000000..dd93ebcd --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# charset-normalizer documentation build configuration file, created by +# sphinx-quickstart on Fri Jun 16 04:30:35 2017. 
+# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +from recommonmark.parser import CommonMarkParser +import sphinx_rtd_theme + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +# source_suffix = '.rst' + +source_parsers = { + '.md': CommonMarkParser, +} + +source_suffix = ['.rst', '.md'] + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'charset_normalizer' +copyright = '2019, Ahmed TAHRI' +author = 'Ahmed TAHRI @Ousret' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.1' +# The full version, including alpha/beta/rc tags. +release = '1.1.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. 
+htmlhelp_basename = 'charset-normalizer-doc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'charset-normalizer.tex', 'Charset Normalizer Documentation', + 'Ahmed TAHRI', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'charset-normalizer', 'Charset Normalizer Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Charset Normalizer', 'Charsert Normalizer Documentation', + author, 'charset-normalizer', '🎁 Maintained library on encoding & language detection. 🚀No Cpp Bindings, Using Voodoo and Magical Artifacts. 🔎 Like Chardet', + 'Miscellaneous'), +] diff --git a/docs/getstarted.rst b/docs/getstarted.rst new file mode 100755 index 00000000..cce2468d --- /dev/null +++ b/docs/getstarted.rst @@ -0,0 +1,70 @@ +Installation +============ + +This installs a package that can be used from Python (``import charset_normalizer``). + +To install for all users on the system, administrator rights (root) +may be required. + +From PyPI +--------- +Charset Normalizer can be installed from PyPI:: + + pip install charset-normalizer + +You may enable extra feature Unicode Data v12 backport as follow:: + + pip install charset-normalizer[UnicodeDataBackport] + +From git via dev-master +----------------------- +You can install from dev-master branch using git:: + + git clone https://github.com/Ousret/charset_normalizer.git + cd charset_normalizer/ + python setup.py install + +Basic Usage +=========== + +The new way +----------- + +You may want to get right to it. :: + + from charset_normalizer import CharsetDetector + + # This is going to print out your sequence once encoding has been detected + print( + CharsetDetector.from_bytes( + my_byte_str + ).best().first() + ) + + # You could also want the same from a file + print( + CharsetDetector.from_path( + './data/sample.1.ar.srt' + ).best().first() + ) + + +Backward compatibility +---------------------- + +If you were used to python chardet, we are providing the very same ``detect()`` method as chardet. + + :: + + from charset_normalizer import detect + + # This will behave exactly the same as python chardet + result = detect(my_byte_str) + + if result['encoding'] is not None: + print('got', result['encoding'], 'as detected encoding') + + +You may upgrade your code with ease. +CTRL + R ``from chardet import detect`` to ``from charset_normalizer import detect``. 
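+
+A minimal sketch of what you might do with that result dict; ``my_byte_str`` and the decode
+step are illustrative only, the library itself only guarantees the ``detect()`` call shown above ::
+
+    from charset_normalizer import detect
+
+    result = detect(my_byte_str)
+
+    if result['encoding'] is not None:
+        # decode with the guessed encoding, exactly as you would with chardet
+        text = my_byte_str.decode(result['encoding'])
+        print(text)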
+
diff --git a/docs/handling_result.rst b/docs/handling_result.rst
new file mode 100755
index 00000000..a0213c53
--- /dev/null
+++ b/docs/handling_result.rst
@@ -0,0 +1,5 @@
+================
+ Handling Result
+================
+
+!! Warning !! Work in Progress Documentation !!
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100755
index 00000000..3064b639
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,48 @@
+===================
+ Charset Normalizer
+===================
+
+Overview
+========
+
+A library that helps you read text from an unknown charset encoding.
+Motivated by chardet, I'm trying to resolve the issue by taking a different approach.
+All IANA character set names for which the Python core library provides codecs are supported.
+
+.. image:: https://repository-images.githubusercontent.com/200259335/d3da9600-dedc-11e9-83e8-081f597505df
+   :width: 500px
+   :scale: 100 %
+   :alt: CLI Charset Normalizer
+   :align: right
+
+
+It is released under the MIT license, see LICENSE for more
+details. Be aware that no warranty of any kind is provided with this package.
+
+Copyright (C) 2019 Ahmed TAHRI @Ousret
+
+!! Warning !! Work in Progress Documentation !!
+
+Features
+========
+
+- Encoding detection on a buffer, bytes or file.
+- Transpose any encoded content to Unicode, the best we can.
+- Detect spoken language in text.
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+   support
+   getstarted
+   advanced_search
+   handling_result
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100755
index 00000000..483a4e96
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1 @@
+sphinx_rtd_theme
diff --git a/docs/support.rst b/docs/support.rst
new file mode 100755
index 00000000..6a48da73
--- /dev/null
+++ b/docs/support.rst
@@ -0,0 +1,167 @@
+=================
+ Support
+=================
+
+!! Warning !! Work in Progress Documentation !!
+
+-------------------
+Supported Encodings
+-------------------
+
+Charset Normalizer is able to detect any of the following encodings.
+ ++-----------------+----------------------------------------------------------------------------------------------------------------------------------+ +| IANA Code Page | Aliases | ++=================+==================================================================================================================================+ +| ascii | 646, ansi_x3.4_1968, ansi_x3_4_1968, ansi_x3.4_1986, cp367, csascii, ibm367, iso646_us, iso_646.irv_1991, iso_ir_6, us, us_ascii | +| big5 | big5_tw, csbig5, x_mac_trad_chinese | +| big5hkscs | big5_hkscs, hkscs | +| cp037 | 037, csibm037, ebcdic_cp_ca, ebcdic_cp_nl, ebcdic_cp_us, ebcdic_cp_wt, ibm037, ibm039 | +| cp1026 | 1026, csibm1026, ibm1026 | +| cp1125 | 1125, ibm1125, cp866u, ruscii | +| cp1140 | 1140, ibm1140 | +| cp1250 | 1250, windows_1250 | +| cp1251 | 1251, windows_1251 | +| cp1252 | 1252, windows_1252 | +| cp1253 | 1253, windows_1253 | +| cp1254 | 1254, windows_1254 | +| cp1255 | 1255, windows_1255 | +| cp1256 | 1256, windows_1256 | +| cp1257 | 1257, windows_1257 | +| cp1258 | 1258, windows_1258 | +| cp273 | 273, ibm273, csibm273 | +| cp424 | 424, csibm424, ebcdic_cp_he, ibm424 | +| cp437 | 437, cspc8codepage437, ibm437 | +| cp500 | 500, csibm500, ebcdic_cp_be, ebcdic_cp_ch, ibm500 | +| cp775 | 775, cspc775baltic, ibm775 | +| cp850 | 850, cspc850multilingual, ibm850 | +| cp852 | 852, cspcp852, ibm852 | +| cp855 | 855, csibm855, ibm855 | +| cp857 | 857, csibm857, ibm857 | +| cp858 | 858, csibm858, ibm858 | +| cp860 | 860, csibm860, ibm860 | +| cp861 | 861, cp_is, csibm861, ibm861 | +| cp862 | 862, cspc862latinhebrew, ibm862 | +| cp863 | 863, csibm863, ibm863 | +| cp864 | 864, csibm864, ibm864 | +| cp865 | 865, csibm865, ibm865 | +| cp866 | 866, csibm866, ibm866 | +| cp869 | 869, cp_gr, csibm869, ibm869 | +| cp932 | 932, ms932, mskanji, ms_kanji | +| cp949 | 949, ms949, uhc | +| cp950 | 950, ms950 | +| euc_jis_2004 | jisx0213, eucjis2004, euc_jis2004 | +| euc_jisx0213 | eucjisx0213 | +| euc_jp | eucjp, ujis, u_jis | +| euc_kr | euckr, korean, ksc5601, ks_c_5601, ks_c_5601_1987, ksx1001, ks_x_1001, x_mac_korean | +| gb18030 | gb18030_2000 | +| gb2312 | chinese, csiso58gb231280, euc_cn, euccn, eucgb2312_cn, gb2312_1980, gb2312_80, iso_ir_58, x_mac_simp_chinese | +| gbk | 936, cp936, ms936 | +| hp_roman8 | roman8, r8, csHPRoman8 | +| hz | hzgb, hz_gb, hz_gb_2312 | +| iso2022_jp | csiso2022jp, iso2022jp, iso_2022_jp | +| iso2022_jp_1 | iso2022jp_1, iso_2022_jp_1 | +| iso2022_jp_2 | iso2022jp_2, iso_2022_jp_2 | +| iso2022_jp_2004 | iso_2022_jp_2004, iso2022jp_2004 | +| iso2022_jp_3 | iso2022jp_3, iso_2022_jp_3 | +| iso2022_jp_ext | iso2022jp_ext, iso_2022_jp_ext | +| iso2022_kr | csiso2022kr, iso2022kr, iso_2022_kr | +| iso8859_10 | csisolatin6, iso_8859_10, iso_8859_10_1992, iso_ir_157, l6, latin6 | +| iso8859_11 | thai, iso_8859_11, iso_8859_11_2001 | +| iso8859_13 | iso_8859_13, l7, latin7 | +| iso8859_14 | iso_8859_14, iso_8859_14_1998, iso_celtic, iso_ir_199, l8, latin8 | +| iso8859_15 | iso_8859_15, l9, latin9 | +| iso8859_16 | iso_8859_16, iso_8859_16_2001, iso_ir_226, l10, latin10 | +| iso8859_2 | csisolatin2, iso_8859_2, iso_8859_2_1987, iso_ir_101, l2, latin2 | +| iso8859_3 | csisolatin3, iso_8859_3, iso_8859_3_1988, iso_ir_109, l3, latin3 | +| iso8859_4 | csisolatin4, iso_8859_4, iso_8859_4_1988, iso_ir_110, l4, latin4 | +| iso8859_5 | csisolatincyrillic, cyrillic, iso_8859_5, iso_8859_5_1988, iso_ir_144 | +| iso8859_6 | arabic, asmo_708, csisolatinarabic, ecma_114, iso_8859_6, iso_8859_6_1987, iso_ir_127 | +| iso8859_7 | 
csisolatingreek, ecma_118, elot_928, greek, greek8, iso_8859_7, iso_8859_7_1987, iso_ir_126 | +| iso8859_8 | csisolatinhebrew, hebrew, iso_8859_8, iso_8859_8_1988, iso_ir_138 | +| iso8859_9 | csisolatin5, iso_8859_9, iso_8859_9_1989, iso_ir_148, l5, latin5 | +| johab | cp1361, ms1361 | +| koi8_r | cskoi8r | +| kz1048 | kz_1048, rk1048, strk1048_2002 | +| latin_1 | 8859, cp819, csisolatin1, ibm819, iso8859, iso8859_1, iso_8859_1, iso_8859_1_1987, iso_ir_100, l1, latin, latin1 | +| mac_cyrillic | maccyrillic | +| mac_greek | macgreek | +| mac_iceland | maciceland | +| mac_latin2 | maccentraleurope, maclatin2 | +| mac_roman | macintosh, macroman | +| mac_turkish | macturkish | +| mbcs | ansi, dbcs | +| ptcp154 | csptcp154, pt154, cp154, cyrillic_asian | +| quopri_codec | quopri, quoted_printable, quotedprintable | +| rot_13 | rot13 | +| shift_jis | csshiftjis, shiftjis, sjis, s_jis, x_mac_japanese | +| shift_jis_2004 | shiftjis2004, sjis_2004, s_jis_2004 | +| shift_jisx0213 | shiftjisx0213, sjisx0213, s_jisx0213 | +| tactis | tis260 | +| tis_620 | tis620, tis_620_0, tis_620_2529_0, tis_620_2529_1, iso_ir_166 | +| utf_16 | u16, utf16 | +| utf_16_be | unicodebigunmarked, utf_16be | +| utf_16_le | unicodelittleunmarked, utf_16le | +| utf_32 | u32, utf32 | +| utf_32_be | utf_32be | +| utf_32_le | utf_32le | +| utf_7 | u7, utf7, unicode_1_1_utf_7 | +| utf_8 | u8, utf, utf8, utf8_ucs2, utf8_ucs4 | ++-----------------+----------------------------------------------------------------------------------------------------------------------------------+ + +------- +Supported Languages +------- + +Those language can be detected inside your content. All of these are specified in ./charset_normalizer/assets/frequencies.json . + +['English', + 'German', + 'French', + 'Dutch', + 'Italian', + 'Polish', + 'Spanish', + 'Russian', + 'Japanese', + 'Portuguese', + 'Swedish', + 'Chinese', + 'Catalan', + 'Ukrainian', + 'Norwegian', + 'Finnish', + 'Vietnamese', + 'Czech', + 'Hungarian', + 'Korean', + 'Indonesian', + 'Turkish', + 'Romanian', + 'Farsi', + 'Arabic', + 'Danish', + 'Esperanto', + 'Serbian', + 'Lithuanian', + 'Slovene', + 'Slovak', + 'Malay', + 'Hebrew', + 'Bulgarian', + 'Kazakh', + 'Baque', + 'Volapük', + 'Croatian', + 'Hindi', + 'Estonian', + 'Azeri', + 'Galician', + 'Simple English', + 'Nynorsk', + 'Thai', + 'Greek', + 'Macedonian', + 'Serbocroatian', + 'Tamil', + 'Classical Chinese'] diff --git a/setup.py b/setup.py index adf8c442..41f9da33 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ EMAIL = 'ahmed.tahri@cloudnursery.dev' AUTHOR = 'Ahmed TAHRI @Ousret' REQUIRES_PYTHON = '>=3.5.0' -VERSION = '1.2.0' +VERSION = '1.3.0' REQUIRED = [ 'cached_property', @@ -24,7 +24,8 @@ ] EXTRAS = { - 'permit to generate frequencies.json': ['requests_html', 'requests'], + 'LetterFrequency': ['requests_html', 'requests'], + 'UnicodeDataBackport': ['unicodedata2'] }
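
Below is a minimal usage sketch of the preemptive declaration probe introduced in this patch.
The sample payload and the expected 'latin_1' mapping are illustrative assumptions, inferred from
the regular expression and the encodings.aliases lookup in probe_inherent_sign.py; all other names
come from the modules defined above.

    from charset_normalizer import CharsetDetector
    from charset_normalizer.probe_inherent_sign import any_specified_encoding

    payload = b'<?xml version="1.0" encoding="ISO-8859-1"?><note>Ol\xe1 mundo</note>'

    # The probe ASCII-decodes the head of the sequence (errors ignored), looks for an
    # encoding/charset/coding declaration and resolves it through encodings.aliases,
    # so 'ISO-8859-1' should come back as the codec name 'latin_1'.
    print(any_specified_encoding(payload))

    # from_bytes() runs the same probe when preemptive_behaviour=True (the default) and may
    # return early with the declared encoding once the initial chaos test passes.
    print(CharsetDetector.from_bytes(payload).best().first())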