From a2a4682ba7055314bd11d8ec8787d703918cbd9a Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Mon, 30 Sep 2019 20:01:29 +0200
Subject: [PATCH] Release 1.3.0 (#17)

* Backport unicodedata for v12 impl into python if available

* Add aliases to CharsetNormalizerMatches class

* Add feature preemptive behaviour, looking for encoding declaration

* import aliases in __init__

* Change text in Why. More concise.

* bump 1.3.0

* initial docs work in progress
---
 README.md                                 |   5 +-
 charset_normalizer/__init__.py            |   3 +-
 charset_normalizer/hook.py                |   6 +
 charset_normalizer/normalizer.py          |  39 ++++-
 charset_normalizer/probe_inherent_sign.py |  39 +++++
 docs/Makefile                             |  20 +++
 docs/advanced_search.rst                  |  20 +++
 docs/conf.py                              | 163 +++++++++++++++++++++
 docs/getstarted.rst                       |  70 +++++++++
 docs/handling_result.rst                  |   5 +
 docs/index.rst                            |  48 +++++++
 docs/requirements.txt                     |   1 +
 docs/support.rst                          | 167 ++++++++++++++++++++++
 setup.py                                  |   5 +-
 14 files changed, 579 insertions(+), 12 deletions(-)
 create mode 100644 charset_normalizer/probe_inherent_sign.py
 create mode 100755 docs/Makefile
 create mode 100755 docs/advanced_search.rst
 create mode 100755 docs/conf.py
 create mode 100755 docs/getstarted.rst
 create mode 100755 docs/handling_result.rst
 create mode 100755 docs/index.rst
 create mode 100755 docs/requirements.txt
 create mode 100755 docs/support.rst

diff --git a/README.md b/README.md
index 532f3a67..d0773fb6 100644
--- a/README.md
+++ b/README.md
@@ -106,9 +106,8 @@ See wiki for advanced usages. *Todo, not yet available.*
 ## 😇 Why
-When I started using Chardet, I noticed that this library was wrong most of the time
-when it's not about Unicode, Gb or Big5. That because some charset are easily identifiable
-because of there standards and Chardet does a really good job at identifying them.
+When I started using Chardet, I noticed that it was no longer reliable and, more importantly,
+that it is unmaintained, and most likely always will be.
I **don't care** about the **originating charset** encoding, that because **two different table** can produce **two identical file.** diff --git a/charset_normalizer/__init__.py b/charset_normalizer/__init__.py index 13492872..c1366cc1 100644 --- a/charset_normalizer/__init__.py +++ b/charset_normalizer/__init__.py @@ -1,5 +1,6 @@ # coding: utf-8 -from charset_normalizer.normalizer import CharsetNormalizerMatches, CharsetNormalizerMatch +from charset_normalizer.normalizer import CharsetNormalizerMatches, CharsetNormalizerMatch, \ + CharsetDetector, CharsetDoctor, EncodingDetector # Aliases from charset_normalizer.unicode import UnicodeRangeIdentify from charset_normalizer.probe_chaos import ProbeChaos from charset_normalizer.probe_coherence import ProbeCoherence diff --git a/charset_normalizer/hook.py b/charset_normalizer/hook.py index a0b2c5a8..2994a213 100644 --- a/charset_normalizer/hook.py +++ b/charset_normalizer/hook.py @@ -12,3 +12,9 @@ def charset_normalizer_hook(exctype, value, traceback): sys.excepthook = charset_normalizer_hook + +try: + import unicodedata2 + sys.modules['unicodedata'] = unicodedata2 +except ImportError: + pass diff --git a/charset_normalizer/normalizer.py b/charset_normalizer/normalizer.py index 922c7048..a6845876 100644 --- a/charset_normalizer/normalizer.py +++ b/charset_normalizer/normalizer.py @@ -15,6 +15,8 @@ from charset_normalizer.encoding import is_multi_byte_encoding +from charset_normalizer.probe_inherent_sign import any_specified_encoding + from loguru import logger from hashlib import sha256 @@ -319,7 +321,7 @@ def normalize(path, steps=10, chunk_size=512, threshold=0.20): return b_ @staticmethod - def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False): + def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False): """ Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported charset encoding. @@ -327,7 +329,8 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation :param bytes sequences: Actual sequence of bytes to analyse :param float threshold: Maximum amount of chaos allowed on first pass :param int chunk_size: Size to extract and analyse in each step - :param int steps: Number of steps + :param int steps: Number of steps/block to extract from sequence + :param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding :param bool explain: Print on screen what is happening when searching for a match :param list[str] cp_isolation: Finite list of encoding to use when searching for a match :param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match @@ -381,6 +384,13 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation tested = set() matches = list() + specified_encoding = any_specified_encoding(sequences) if preemptive_behaviour is True else None + + if specified_encoding is not None: + warn( + 'Trying to detect encoding on a sequence that seems to declare a encoding ({}).'.format(specified_encoding) + ) + for support in supported: k, p = support @@ -493,8 +503,16 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation cnm ) + if specified_encoding is not None and p == specified_encoding: + logger.info('{encoding} is most likely the one. 
' + 'Because it is specified in analysed byte sequence and ' + 'initial test passed successfully. ' + 'Disable this behaviour by setting preemptive_behaviour ' + 'to False', encoding=specified_encoding) + return CharsetNormalizerMatches([cnm]) if any(fingerprint_tests) is False else CharsetNormalizerMatches([matches[fingerprint_tests.index(True)]]) + if (p == 'ascii' and chaos_median == 0.) or bom_available is True: - logger.info('{encoding} is the most likely the one. {bom_available}', + logger.info('{encoding} is most likely the one. {bom_available}', encoding=p, bom_available='BOM/SIG available' if bom_available else '') @@ -503,13 +521,14 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation return CharsetNormalizerMatches(matches) @staticmethod - def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False): + def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False): """ :param io.BinaryIO fp: :param int steps: :param int chunk_size: :param float threshold: :param bool explain: Print on screen what is happening when searching for a match + :param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding :param list[str] cp_isolation: Finite list of encoding to use when searching for a match :param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match :return: List of potential matches @@ -522,16 +541,18 @@ def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_ threshold, cp_isolation, cp_exclusion, + preemptive_behaviour, explain ) @staticmethod - def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False): + def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False): """ :param str path: :param int steps: :param int chunk_size: :param float threshold: + :param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding :param bool explain: Print on screen what is happening when searching for a match :param list[str] cp_isolation: Finite list of encoding to use when searching for a match :param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match @@ -539,7 +560,7 @@ def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, :rtype: CharsetNormalizerMatches """ with open(path, 'rb') as fp: - return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, explain) + return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, preemptive_behaviour, explain) @cached_property def could_be_from_charset(self): @@ -596,3 +617,9 @@ def best(self): return CharsetNormalizerMatches( sorted_matches[:nb_lowest_ratio+1] ) + + +# Some aliases to CharsetNormalizerMatches, because it is too long for a class name. 
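+# The aliases below point to the exact same class, so, for instance (illustrative only),
+#   CharsetDetector.from_path('./data/sample.1.ar.srt').best().first()
+# behaves identically to calling CharsetNormalizerMatches.from_path(...) with the longer name.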
+CharsetDetector = CharsetNormalizerMatches +EncodingDetector = CharsetNormalizerMatches +CharsetDoctor = CharsetNormalizerMatches diff --git a/charset_normalizer/probe_inherent_sign.py b/charset_normalizer/probe_inherent_sign.py new file mode 100644 index 00000000..5b2a536c --- /dev/null +++ b/charset_normalizer/probe_inherent_sign.py @@ -0,0 +1,39 @@ +from re import findall, compile, IGNORECASE +from encodings.aliases import aliases + +RE_POSSIBLE_ENCODING_INDICATION = compile( + r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)', + IGNORECASE +) + + +def any_specified_encoding(sequence): + """ + Search in sequence (ASCII-mode) if there is any sign of declared encoding. + :param bytes sequence: + :return: Declared encoding if any else None + :rtype: str + """ + if not isinstance(sequence, bytes) and not isinstance(sequence, bytearray): + raise TypeError + + seq_len = len(sequence) + + results = findall( + RE_POSSIBLE_ENCODING_INDICATION, + sequence[:seq_len if seq_len <= 2048 else int(seq_len*0.3)].decode('ascii', errors='ignore') + ) # type: list[str] + + if len(results) == 0: + return None + + for specified_encoding in results: + specified_encoding = specified_encoding.lower().replace('-', '_') + + for a, b in aliases.items(): + if a == specified_encoding: + return b + if b == specified_encoding: + return b + + return None diff --git a/docs/Makefile b/docs/Makefile new file mode 100755 index 00000000..b6888c16 --- /dev/null +++ b/docs/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = python -msphinx +SPHINXPROJ = Charset Normalizer +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) \ No newline at end of file diff --git a/docs/advanced_search.rst b/docs/advanced_search.rst new file mode 100755 index 00000000..dca714d9 --- /dev/null +++ b/docs/advanced_search.rst @@ -0,0 +1,20 @@ +Advanced Search +=============== + +Charset Normalizer method ``from_bytes``, ``from_fp`` and ``from_path`` provide some +optional parameters that can be tweaked. + +As follow :: + + CharsetDetector.from_bytes( + my_byte_str, + steps=10, # Number of steps/block to extract from my_byte_str + chunk_size=512, # Set block size of each extraction + threshold=0.2, # Maximum amount of chaos allowed on first pass + cp_isolation=None, # Finite list of encoding to use when searching for a match + cp_exclusion=None, # Finite list of encoding to avoid when searching for a match + preemptive_behaviour=True, # Determine if we should look into my_byte_str (ASCII-Mode) for pre-defined encoding + explain=False # Print on screen what is happening when searching for a match + ) + +!! Warning !! Work in Progress Documentation !! diff --git a/docs/conf.py b/docs/conf.py new file mode 100755 index 00000000..dd93ebcd --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +# +# charset-normalizer documentation build configuration file, created by +# sphinx-quickstart on Fri Jun 16 04:30:35 2017. 
+# +# This file is execfile()d with the current directory set to its +# containing dir. +# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +# import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) +from recommonmark.parser import CommonMarkParser +import sphinx_rtd_theme + +# -- General configuration ------------------------------------------------ + +# If your documentation needs a minimal Sphinx version, state it here. +# +# needs_sphinx = '1.0' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +# source_suffix = ['.rst', '.md'] +# source_suffix = '.rst' + +source_parsers = { + '.md': CommonMarkParser, +} + +source_suffix = ['.rst', '.md'] + +# The master toctree document. +master_doc = 'index' + +# General information about the project. +project = 'charset_normalizer' +copyright = '2019, Ahmed TAHRI' +author = 'Ahmed TAHRI @Ousret' + +# The version info for the project you're documenting, acts as replacement for +# |version| and |release|, also used in various other places throughout the +# built documents. +# +# The short X.Y version. +version = '1.1' +# The full version, including alpha/beta/rc tags. +release = '1.1.1' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This patterns also effect to html_static_path and html_extra_path +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'sphinx_rtd_theme' + +html_theme_path = [sphinx_rtd_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# +# html_theme_options = {} + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + + +# -- Options for HTMLHelp output ------------------------------------------ + +# Output file base name for HTML help builder. 
+htmlhelp_basename = 'charset-normalizer-doc' + + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', + + # Latex figure (float) alignment + # + # 'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'charset-normalizer.tex', 'Charset Normalizer Documentation', + 'Ahmed TAHRI', 'manual'), +] + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'charset-normalizer', 'Charset Normalizer Documentation', + [author], 1) +] + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'Charset Normalizer', 'Charsert Normalizer Documentation', + author, 'charset-normalizer', '🎁 Maintained library on encoding & language detection. 🚀No Cpp Bindings, Using Voodoo and Magical Artifacts. 🔎 Like Chardet', + 'Miscellaneous'), +] diff --git a/docs/getstarted.rst b/docs/getstarted.rst new file mode 100755 index 00000000..cce2468d --- /dev/null +++ b/docs/getstarted.rst @@ -0,0 +1,70 @@ +Installation +============ + +This installs a package that can be used from Python (``import charset_normalizer``). + +To install for all users on the system, administrator rights (root) +may be required. + +From PyPI +--------- +Charset Normalizer can be installed from PyPI:: + + pip install charset-normalizer + +You may enable extra feature Unicode Data v12 backport as follow:: + + pip install charset-normalizer[UnicodeDataBackport] + +From git via dev-master +----------------------- +You can install from dev-master branch using git:: + + git clone https://github.com/Ousret/charset_normalizer.git + cd charset_normalizer/ + python setup.py install + +Basic Usage +=========== + +The new way +----------- + +You may want to get right to it. :: + + from charset_normalizer import CharsetDetector + + # This is going to print out your sequence once encoding has been detected + print( + CharsetDetector.from_bytes( + my_byte_str + ).best().first() + ) + + # You could also want the same from a file + print( + CharsetDetector.from_path( + './data/sample.1.ar.srt' + ).best().first() + ) + + +Backward compatibility +---------------------- + +If you were used to python chardet, we are providing the very same ``detect()`` method as chardet. + + :: + + from charset_normalizer import detect + + # This will behave exactly the same as python chardet + result = detect(my_byte_str) + + if result['encoding'] is not None: + print('got', result['encoding'], 'as detected encoding') + + +You may upgrade your code with ease. +CTRL + R ``from chardet import detect`` to ``from charset_normalizer import detect``. 
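+
+A minimal sketch of what you might do with that result dict; ``my_byte_str`` and the decode
+step are illustrative only, the library itself only guarantees the ``detect()`` call shown above ::
+
+    from charset_normalizer import detect
+
+    result = detect(my_byte_str)
+
+    if result['encoding'] is not None:
+        # decode with the guessed encoding, exactly as you would with chardet
+        text = my_byte_str.decode(result['encoding'])
+        print(text)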
+
diff --git a/docs/handling_result.rst b/docs/handling_result.rst
new file mode 100755
index 00000000..a0213c53
--- /dev/null
+++ b/docs/handling_result.rst
@@ -0,0 +1,5 @@
+================
+ Handling Result
+================
+
+!! Warning !! Work in Progress Documentation !!
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100755
index 00000000..3064b639
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,48 @@
+===================
+ Charset Normalizer
+===================
+
+Overview
+========
+
+A library that helps you read text from an unknown charset encoding.
+Motivated by chardet, I'm trying to resolve the issue by taking a different approach.
+All IANA character set names for which the Python core library provides codecs are supported.
+
+.. image:: https://repository-images.githubusercontent.com/200259335/d3da9600-dedc-11e9-83e8-081f597505df
+   :width: 500px
+   :scale: 100 %
+   :alt: CLI Charset Normalizer
+   :align: right
+
+
+It is released under the MIT license, see LICENSE for more
+details. Be aware that no warranty of any kind is provided with this package.
+
+Copyright (C) 2019 Ahmed TAHRI @Ousret
+
+!! Warning !! Work in Progress Documentation !!
+
+Features
+========
+
+- Encoding detection on a buffer, bytes or file.
+- Transpose any encoded content to Unicode, the best we can.
+- Detect spoken language in text.
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+
+   support
+   getstarted
+   advanced_search
+   handling_result
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`modindex`
+* :ref:`search`
\ No newline at end of file
diff --git a/docs/requirements.txt b/docs/requirements.txt
new file mode 100755
index 00000000..483a4e96
--- /dev/null
+++ b/docs/requirements.txt
@@ -0,0 +1 @@
+sphinx_rtd_theme
diff --git a/docs/support.rst b/docs/support.rst
new file mode 100755
index 00000000..6a48da73
--- /dev/null
+++ b/docs/support.rst
@@ -0,0 +1,167 @@
+=================
+ Support
+=================
+
+!! Warning !! Work in Progress Documentation !!
+
+-------------------
+Supported Encodings
+-------------------
+
+Charset Normalizer is able to detect any of the following encodings.
+ ++-----------------+----------------------------------------------------------------------------------------------------------------------------------+ +| IANA Code Page | Aliases | ++=================+==================================================================================================================================+ +| ascii | 646, ansi_x3.4_1968, ansi_x3_4_1968, ansi_x3.4_1986, cp367, csascii, ibm367, iso646_us, iso_646.irv_1991, iso_ir_6, us, us_ascii | +| big5 | big5_tw, csbig5, x_mac_trad_chinese | +| big5hkscs | big5_hkscs, hkscs | +| cp037 | 037, csibm037, ebcdic_cp_ca, ebcdic_cp_nl, ebcdic_cp_us, ebcdic_cp_wt, ibm037, ibm039 | +| cp1026 | 1026, csibm1026, ibm1026 | +| cp1125 | 1125, ibm1125, cp866u, ruscii | +| cp1140 | 1140, ibm1140 | +| cp1250 | 1250, windows_1250 | +| cp1251 | 1251, windows_1251 | +| cp1252 | 1252, windows_1252 | +| cp1253 | 1253, windows_1253 | +| cp1254 | 1254, windows_1254 | +| cp1255 | 1255, windows_1255 | +| cp1256 | 1256, windows_1256 | +| cp1257 | 1257, windows_1257 | +| cp1258 | 1258, windows_1258 | +| cp273 | 273, ibm273, csibm273 | +| cp424 | 424, csibm424, ebcdic_cp_he, ibm424 | +| cp437 | 437, cspc8codepage437, ibm437 | +| cp500 | 500, csibm500, ebcdic_cp_be, ebcdic_cp_ch, ibm500 | +| cp775 | 775, cspc775baltic, ibm775 | +| cp850 | 850, cspc850multilingual, ibm850 | +| cp852 | 852, cspcp852, ibm852 | +| cp855 | 855, csibm855, ibm855 | +| cp857 | 857, csibm857, ibm857 | +| cp858 | 858, csibm858, ibm858 | +| cp860 | 860, csibm860, ibm860 | +| cp861 | 861, cp_is, csibm861, ibm861 | +| cp862 | 862, cspc862latinhebrew, ibm862 | +| cp863 | 863, csibm863, ibm863 | +| cp864 | 864, csibm864, ibm864 | +| cp865 | 865, csibm865, ibm865 | +| cp866 | 866, csibm866, ibm866 | +| cp869 | 869, cp_gr, csibm869, ibm869 | +| cp932 | 932, ms932, mskanji, ms_kanji | +| cp949 | 949, ms949, uhc | +| cp950 | 950, ms950 | +| euc_jis_2004 | jisx0213, eucjis2004, euc_jis2004 | +| euc_jisx0213 | eucjisx0213 | +| euc_jp | eucjp, ujis, u_jis | +| euc_kr | euckr, korean, ksc5601, ks_c_5601, ks_c_5601_1987, ksx1001, ks_x_1001, x_mac_korean | +| gb18030 | gb18030_2000 | +| gb2312 | chinese, csiso58gb231280, euc_cn, euccn, eucgb2312_cn, gb2312_1980, gb2312_80, iso_ir_58, x_mac_simp_chinese | +| gbk | 936, cp936, ms936 | +| hp_roman8 | roman8, r8, csHPRoman8 | +| hz | hzgb, hz_gb, hz_gb_2312 | +| iso2022_jp | csiso2022jp, iso2022jp, iso_2022_jp | +| iso2022_jp_1 | iso2022jp_1, iso_2022_jp_1 | +| iso2022_jp_2 | iso2022jp_2, iso_2022_jp_2 | +| iso2022_jp_2004 | iso_2022_jp_2004, iso2022jp_2004 | +| iso2022_jp_3 | iso2022jp_3, iso_2022_jp_3 | +| iso2022_jp_ext | iso2022jp_ext, iso_2022_jp_ext | +| iso2022_kr | csiso2022kr, iso2022kr, iso_2022_kr | +| iso8859_10 | csisolatin6, iso_8859_10, iso_8859_10_1992, iso_ir_157, l6, latin6 | +| iso8859_11 | thai, iso_8859_11, iso_8859_11_2001 | +| iso8859_13 | iso_8859_13, l7, latin7 | +| iso8859_14 | iso_8859_14, iso_8859_14_1998, iso_celtic, iso_ir_199, l8, latin8 | +| iso8859_15 | iso_8859_15, l9, latin9 | +| iso8859_16 | iso_8859_16, iso_8859_16_2001, iso_ir_226, l10, latin10 | +| iso8859_2 | csisolatin2, iso_8859_2, iso_8859_2_1987, iso_ir_101, l2, latin2 | +| iso8859_3 | csisolatin3, iso_8859_3, iso_8859_3_1988, iso_ir_109, l3, latin3 | +| iso8859_4 | csisolatin4, iso_8859_4, iso_8859_4_1988, iso_ir_110, l4, latin4 | +| iso8859_5 | csisolatincyrillic, cyrillic, iso_8859_5, iso_8859_5_1988, iso_ir_144 | +| iso8859_6 | arabic, asmo_708, csisolatinarabic, ecma_114, iso_8859_6, iso_8859_6_1987, iso_ir_127 | +| iso8859_7 | 
csisolatingreek, ecma_118, elot_928, greek, greek8, iso_8859_7, iso_8859_7_1987, iso_ir_126 | +| iso8859_8 | csisolatinhebrew, hebrew, iso_8859_8, iso_8859_8_1988, iso_ir_138 | +| iso8859_9 | csisolatin5, iso_8859_9, iso_8859_9_1989, iso_ir_148, l5, latin5 | +| johab | cp1361, ms1361 | +| koi8_r | cskoi8r | +| kz1048 | kz_1048, rk1048, strk1048_2002 | +| latin_1 | 8859, cp819, csisolatin1, ibm819, iso8859, iso8859_1, iso_8859_1, iso_8859_1_1987, iso_ir_100, l1, latin, latin1 | +| mac_cyrillic | maccyrillic | +| mac_greek | macgreek | +| mac_iceland | maciceland | +| mac_latin2 | maccentraleurope, maclatin2 | +| mac_roman | macintosh, macroman | +| mac_turkish | macturkish | +| mbcs | ansi, dbcs | +| ptcp154 | csptcp154, pt154, cp154, cyrillic_asian | +| quopri_codec | quopri, quoted_printable, quotedprintable | +| rot_13 | rot13 | +| shift_jis | csshiftjis, shiftjis, sjis, s_jis, x_mac_japanese | +| shift_jis_2004 | shiftjis2004, sjis_2004, s_jis_2004 | +| shift_jisx0213 | shiftjisx0213, sjisx0213, s_jisx0213 | +| tactis | tis260 | +| tis_620 | tis620, tis_620_0, tis_620_2529_0, tis_620_2529_1, iso_ir_166 | +| utf_16 | u16, utf16 | +| utf_16_be | unicodebigunmarked, utf_16be | +| utf_16_le | unicodelittleunmarked, utf_16le | +| utf_32 | u32, utf32 | +| utf_32_be | utf_32be | +| utf_32_le | utf_32le | +| utf_7 | u7, utf7, unicode_1_1_utf_7 | +| utf_8 | u8, utf, utf8, utf8_ucs2, utf8_ucs4 | ++-----------------+----------------------------------------------------------------------------------------------------------------------------------+ + +------- +Supported Languages +------- + +Those language can be detected inside your content. All of these are specified in ./charset_normalizer/assets/frequencies.json . + +['English', + 'German', + 'French', + 'Dutch', + 'Italian', + 'Polish', + 'Spanish', + 'Russian', + 'Japanese', + 'Portuguese', + 'Swedish', + 'Chinese', + 'Catalan', + 'Ukrainian', + 'Norwegian', + 'Finnish', + 'Vietnamese', + 'Czech', + 'Hungarian', + 'Korean', + 'Indonesian', + 'Turkish', + 'Romanian', + 'Farsi', + 'Arabic', + 'Danish', + 'Esperanto', + 'Serbian', + 'Lithuanian', + 'Slovene', + 'Slovak', + 'Malay', + 'Hebrew', + 'Bulgarian', + 'Kazakh', + 'Baque', + 'Volapük', + 'Croatian', + 'Hindi', + 'Estonian', + 'Azeri', + 'Galician', + 'Simple English', + 'Nynorsk', + 'Thai', + 'Greek', + 'Macedonian', + 'Serbocroatian', + 'Tamil', + 'Classical Chinese'] diff --git a/setup.py b/setup.py index adf8c442..41f9da33 100644 --- a/setup.py +++ b/setup.py @@ -13,7 +13,7 @@ EMAIL = 'ahmed.tahri@cloudnursery.dev' AUTHOR = 'Ahmed TAHRI @Ousret' REQUIRES_PYTHON = '>=3.5.0' -VERSION = '1.2.0' +VERSION = '1.3.0' REQUIRED = [ 'cached_property', @@ -24,7 +24,8 @@ ] EXTRAS = { - 'permit to generate frequencies.json': ['requests_html', 'requests'], + 'LetterFrequency': ['requests_html', 'requests'], + 'UnicodeDataBackport': ['unicodedata2'] }
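
Below is a minimal usage sketch of the preemptive declaration probe introduced in this patch.
The sample payload and the expected 'latin_1' mapping are illustrative assumptions, inferred from
the regular expression and the encodings.aliases lookup in probe_inherent_sign.py; all other names
come from the modules defined above.

    from charset_normalizer import CharsetDetector
    from charset_normalizer.probe_inherent_sign import any_specified_encoding

    payload = b'<?xml version="1.0" encoding="ISO-8859-1"?><note>Ol\xe1 mundo</note>'

    # The probe ASCII-decodes the head of the sequence (errors ignored), looks for an
    # encoding/charset/coding declaration and resolves it through encodings.aliases,
    # so 'ISO-8859-1' should come back as the codec name 'latin_1'.
    print(any_specified_encoding(payload))

    # from_bytes() runs the same probe when preemptive_behaviour=True (the default) and may
    # return early with the declared encoding once the initial chaos test passes.
    print(CharsetDetector.from_bytes(payload).best().first())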