Release 1.3.0 (#17)
* Backport unicodedata for v12 impl into python if available

* Add aliases to CharsetNormalizerMatches class

* Add feature preemptive behaviour, looking for encoding declaration

* import aliases in __init__

* Change text in Why. More concise.

* bump 1.3.0

* initial docs

work in progress
Ousret authored Sep 30, 2019
1 parent f44ecb6 commit a2a4682
Showing 14 changed files with 579 additions and 12 deletions.
5 changes: 2 additions & 3 deletions README.md
@@ -106,9 +106,8 @@ See wiki for advanced usages. *Todo, not yet available.*

## 😇 Why

When I started using Chardet, I noticed that this library was wrong most of the time
when the input was not Unicode, GB or Big5. That is because some charsets are easily identifiable
thanks to their standards, and Chardet does a really good job at identifying them.
When I started using Chardet, I noticed that this library had become unreliable
and is unmaintained, and most likely never will be again.

I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical files.**
3 changes: 2 additions & 1 deletion charset_normalizer/__init__.py
@@ -1,5 +1,6 @@
# coding: utf-8
from charset_normalizer.normalizer import CharsetNormalizerMatches, CharsetNormalizerMatch
from charset_normalizer.normalizer import CharsetNormalizerMatches, CharsetNormalizerMatch, \
CharsetDetector, CharsetDoctor, EncodingDetector # Aliases
from charset_normalizer.unicode import UnicodeRangeIdentify
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence
6 changes: 6 additions & 0 deletions charset_normalizer/hook.py
@@ -12,3 +12,9 @@ def charset_normalizer_hook(exctype, value, traceback):


sys.excepthook = charset_normalizer_hook

try:
import unicodedata2
sys.modules['unicodedata'] = unicodedata2
except ImportError:
pass
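
The block above swaps the standard library `unicodedata` module for the `unicodedata2` backport (which ships a newer Unicode database, v12 at the time of this release) whenever that package is installed, and silently keeps the stdlib module otherwise. A minimal sketch of the observable effect, assuming `unicodedata2` is installed:

    import charset_normalizer.hook  # importing the module runs the try/except above
    import unicodedata

    # If unicodedata2 was importable, the stdlib name now resolves to the backport,
    # so a newer Unicode database version should be reported here.
    print(unicodedata.unidata_version)
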
39 changes: 33 additions & 6 deletions charset_normalizer/normalizer.py
@@ -15,6 +15,8 @@

from charset_normalizer.encoding import is_multi_byte_encoding

from charset_normalizer.probe_inherent_sign import any_specified_encoding

from loguru import logger

from hashlib import sha256
@@ -319,15 +321,16 @@ def normalize(path, steps=10, chunk_size=512, threshold=0.20):
return b_

@staticmethod
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False):
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
charset encoding.
Will test input like this (with steps=4 & chunk_size=4) --> [#### #### #### ####]
:param bytes sequences: Actual sequence of bytes to analyse
:param float threshold: Maximum amount of chaos allowed on first pass
:param int chunk_size: Size to extract and analyse in each step
:param int steps: Number of steps
:param int steps: Number of steps/block to extract from sequence
:param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding
:param bool explain: Print on screen what is happening when searching for a match
:param list[str] cp_isolation: Finite list of encoding to use when searching for a match
:param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match
@@ -381,6 +384,13 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation
tested = set()
matches = list()

specified_encoding = any_specified_encoding(sequences) if preemptive_behaviour is True else None

if specified_encoding is not None:
warn(
'Trying to detect encoding on a sequence that seems to declare an encoding ({}).'.format(specified_encoding)
)

for support in supported:

k, p = support
Expand Down Expand Up @@ -493,8 +503,16 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation
cnm
)

if specified_encoding is not None and p == specified_encoding:
logger.info('{encoding} is most likely the one. '
'Because it is specified in analysed byte sequence and '
'initial test passed successfully. '
'Disable this behaviour by setting preemptive_behaviour '
'to False', encoding=specified_encoding)
return CharsetNormalizerMatches([cnm]) if any(fingerprint_tests) is False else CharsetNormalizerMatches([matches[fingerprint_tests.index(True)]])

if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
logger.info('{encoding} is the most likely the one. {bom_available}',
logger.info('{encoding} is most likely the one. {bom_available}',
encoding=p,
bom_available='BOM/SIG available' if bom_available else '')

@@ -503,13 +521,14 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation
return CharsetNormalizerMatches(matches)

@staticmethod
def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False):
def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
:param io.BinaryIO fp:
:param int steps:
:param int chunk_size:
:param float threshold:
:param bool explain: Print on screen what is happening when searching for a match
:param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding
:param list[str] cp_isolation: Finite list of encoding to use when searching for a match
:param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match
:return: List of potential matches
@@ -522,24 +541,26 @@ def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain
)

@staticmethod
def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False):
def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
:param str path:
:param int steps:
:param int chunk_size:
:param float threshold:
:param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding
:param bool explain: Print on screen what is happening when searching for a match
:param list[str] cp_isolation: Finite list of encoding to use when searching for a match
:param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match
:return: List of potential matches
:rtype: CharsetNormalizerMatches
"""
with open(path, 'rb') as fp:
return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, explain)
return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, preemptive_behaviour, explain)

@cached_property
def could_be_from_charset(self):
@@ -596,3 +617,9 @@ def best(self):
return CharsetNormalizerMatches(
sorted_matches[:nb_lowest_ratio+1]
)


# Some aliases to CharsetNormalizerMatches, because it is too long for a class name.
CharsetDetector = CharsetNormalizerMatches
EncodingDetector = CharsetNormalizerMatches
CharsetDoctor = CharsetNormalizerMatches
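
Taken together, the aliases and the new `preemptive_behaviour` flag can be exercised as below. This is an illustrative sketch rather than code from the repository; `my_bytes` is a made-up payload that declares its own charset in-band:

    from charset_normalizer import CharsetDetector  # alias of CharsetNormalizerMatches

    # Hypothetical payload carrying an in-band encoding declaration.
    my_bytes = b'<?xml version="1.0" encoding="utf-8"?><root>Bonjour</root>'

    # With preemptive_behaviour=True (the default), the declared encoding is taken as a
    # strong hint: once it passes the initial tests, from_bytes returns early instead of
    # scanning every remaining codec.
    matches = CharsetDetector.from_bytes(my_bytes, preemptive_behaviour=True)
    print(matches.best().could_be_from_charset)  # expected to include 'utf_8'
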
39 changes: 39 additions & 0 deletions charset_normalizer/probe_inherent_sign.py
@@ -0,0 +1,39 @@
from re import findall, compile, IGNORECASE
from encodings.aliases import aliases

RE_POSSIBLE_ENCODING_INDICATION = compile(
r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)',
IGNORECASE
)


def any_specified_encoding(sequence):
"""
Search in sequence (ASCII-mode) if there is any sign of declared encoding.
:param bytes sequence:
:return: Declared encoding if any else None
:rtype: str
"""
if not isinstance(sequence, bytes) and not isinstance(sequence, bytearray):
raise TypeError

seq_len = len(sequence)

results = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[:seq_len if seq_len <= 2048 else int(seq_len*0.3)].decode('ascii', errors='ignore')
) # type: list[str]

if len(results) == 0:
return None

for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace('-', '_')

for a, b in aliases.items():
if a == specified_encoding:
return b
if b == specified_encoding:
return b

return None
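
A quick illustration of how `any_specified_encoding` resolves a declaration through the stdlib `encodings.aliases` table (made-up inputs, shown only as a sketch):

    from charset_normalizer.probe_inherent_sign import any_specified_encoding

    # 'ISO-8859-1' is lower-cased and normalised to 'iso_8859_1', which the alias
    # table maps to its canonical codec name 'latin_1'.
    print(any_specified_encoding(b'<meta charset="ISO-8859-1">'))  # -> 'latin_1'

    # No declaration present: the function returns None.
    print(any_specified_encoding(b'plain ASCII with no hint'))     # -> None
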
20 changes: 20 additions & 0 deletions docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python -msphinx
SPHINXPROJ = Charset Normalizer
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
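
With Sphinx and the packages imported by conf.py below (`sphinx_rtd_theme`, `recommonmark`) installed, running `make html` from the docs directory should build the HTML documentation into `_build` (standard Sphinx behaviour, not something stated in this commit).
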
20 changes: 20 additions & 0 deletions docs/advanced_search.rst
@@ -0,0 +1,20 @@
Advanced Search
===============

Charset Normalizer methods ``from_bytes``, ``from_fp`` and ``from_path`` provide some
optional parameters that can be tweaked.

As follows ::

CharsetDetector.from_bytes(
my_byte_str,
steps=10, # Number of steps/block to extract from my_byte_str
chunk_size=512, # Set block size of each extraction
threshold=0.2, # Maximum amount of chaos allowed on first pass
cp_isolation=None, # Finite list of encoding to use when searching for a match
cp_exclusion=None, # Finite list of encoding to avoid when searching for a match
preemptive_behaviour=True, # Determine if we should look into my_byte_str (ASCII-Mode) for pre-defined encoding
explain=False # Print on screen what is happening when searching for a match
)
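
For instance, restricting the search to a couple of candidate codecs could look like this (an illustrative sketch; the names are standard Python codec names) ::

    CharsetDetector.from_path(
        './my_subtitle.srt',               # hypothetical input file
        cp_isolation=['utf_8', 'latin_1']  # only these two encodings will be tested
    )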

!! Warning !! Work in Progress Documentation !!
163 changes: 163 additions & 0 deletions docs/conf.py
@@ -0,0 +1,163 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# charset-normalizer documentation build configuration file, created by
# sphinx-quickstart on Fri Jun 16 04:30:35 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
from recommonmark.parser import CommonMarkParser
import sphinx_rtd_theme

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = []

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
# source_suffix = '.rst'

source_parsers = {
'.md': CommonMarkParser,
}

source_suffix = ['.rst', '.md']

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'charset_normalizer'
copyright = '2019, Ahmed TAHRI'
author = 'Ahmed TAHRI @Ousret'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '1.1'
# The full version, including alpha/beta/rc tags.
release = '1.1.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'charset-normalizer-doc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',

# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',

# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',

# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'charset-normalizer.tex', 'Charset Normalizer Documentation',
'Ahmed TAHRI', 'manual'),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'charset-normalizer', 'Charset Normalizer Documentation',
[author], 1)
]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'Charset Normalizer', 'Charset Normalizer Documentation',
author, 'charset-normalizer', '🎁 Maintained library on encoding & language detection. 🚀No Cpp Bindings, Using Voodoo and Magical Artifacts. 🔎 Like Chardet',
'Miscellaneous'),
]
