Release 1.3.0 (#17)
* Backport unicodedata for v12 impl into python if available

* Add aliases to CharsetNormalizerMatches class

* Add feature preemptive behaviour, looking for encoding declaration

* import aliases in __init__

* Change text in Why. More concise.

* bump 1.3.0

* initial docs

work in progress
Ousret authored Sep 30, 2019
1 parent f44ecb6 commit a2a4682
Showing 14 changed files with 579 additions and 12 deletions.
5 changes: 2 additions & 3 deletions README.md
@@ -106,9 +106,8 @@ See wiki for advanced usages. *Todo, not yet available.*

## 😇 Why

When I started using Chardet, I noticed that this library was wrong most of the time
when the input was not Unicode, GB or Big5. That is because some charsets are easily identifiable
thanks to their standards, and Chardet does a really good job at identifying them.
When I started using Chardet, I noticed that this library had become unreliable
and is unmaintained, and most likely never will be again.

I **don't care** about the **originating charset** encoding, because **two different tables** can
produce **two identical files.**
3 changes: 2 additions & 1 deletion charset_normalizer/__init__.py
@@ -1,5 +1,6 @@
# coding: utf-8
from charset_normalizer.normalizer import CharsetNormalizerMatches, CharsetNormalizerMatch
from charset_normalizer.normalizer import CharsetNormalizerMatches, CharsetNormalizerMatch, \
CharsetDetector, CharsetDoctor, EncodingDetector # Aliases
from charset_normalizer.unicode import UnicodeRangeIdentify
from charset_normalizer.probe_chaos import ProbeChaos
from charset_normalizer.probe_coherence import ProbeCoherence
6 changes: 6 additions & 0 deletions charset_normalizer/hook.py
@@ -12,3 +12,9 @@ def charset_normalizer_hook(exctype, value, traceback):


sys.excepthook = charset_normalizer_hook

try:
import unicodedata2
sys.modules['unicodedata'] = unicodedata2
except ImportError:
pass
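
The block above swaps the standard library `unicodedata` module for the `unicodedata2` backport (which ships a newer Unicode database, v12 at the time of this release) whenever that package is installed, and silently keeps the stdlib module otherwise. A minimal sketch of the observable effect, assuming `unicodedata2` is installed:

    import charset_normalizer.hook  # importing the module runs the try/except above
    import unicodedata

    # If unicodedata2 was importable, the stdlib name now resolves to the backport,
    # so a newer Unicode database version should be reported here.
    print(unicodedata.unidata_version)
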
39 changes: 33 additions & 6 deletions charset_normalizer/normalizer.py
@@ -15,6 +15,8 @@

from charset_normalizer.encoding import is_multi_byte_encoding

from charset_normalizer.probe_inherent_sign import any_specified_encoding

from loguru import logger

from hashlib import sha256
@@ -319,15 +321,16 @@ def normalize(path, steps=10, chunk_size=512, threshold=0.20):
return b_

@staticmethod
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False):
def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
Take a sequence of bytes that could potentially be decoded to str and discard all obvious non supported
charset encoding.
Will test input like this (with steps=4 & chunk_size=4) --> [#### #### #### ####]
:param bytes sequences: Actual sequence of bytes to analyse
:param float threshold: Maximum amount of chaos allowed on first pass
:param int chunk_size: Size to extract and analyse in each step
:param int steps: Number of steps
:param int steps: Number of steps/block to extract from sequence
:param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding
:param bool explain: Print on screen what is happening when searching for a match
:param list[str] cp_isolation: Finite list of encoding to use when searching for a match
:param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match
@@ -381,6 +384,13 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation
tested = set()
matches = list()

specified_encoding = any_specified_encoding(sequences) if preemptive_behaviour is True else None

if specified_encoding is not None:
warn(
'Trying to detect encoding on a sequence that seems to declare an encoding ({}).'.format(specified_encoding)
)

for support in supported:

k, p = support
Expand Down Expand Up @@ -493,8 +503,16 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation
cnm
)

if specified_encoding is not None and p == specified_encoding:
logger.info('{encoding} is most likely the one. '
'Because it is specified in analysed byte sequence and '
'initial test passed successfully. '
'Disable this behaviour by setting preemptive_behaviour '
'to False', encoding=specified_encoding)
return CharsetNormalizerMatches([cnm]) if any(fingerprint_tests) is False else CharsetNormalizerMatches([matches[fingerprint_tests.index(True)]])

if (p == 'ascii' and chaos_median == 0.) or bom_available is True:
logger.info('{encoding} is the most likely the one. {bom_available}',
logger.info('{encoding} is most likely the one. {bom_available}',
encoding=p,
bom_available='BOM/SIG available' if bom_available else '')

@@ -503,13 +521,14 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation
return CharsetNormalizerMatches(matches)

@staticmethod
def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False):
def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
:param io.BinaryIO fp:
:param int steps:
:param int chunk_size:
:param float threshold:
:param bool explain: Print on screen what is happening when searching for a match
:param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding
:param list[str] cp_isolation: Finite list of encoding to use when searching for a match
:param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match
:return: List of potential matches
@@ -522,24 +541,26 @@ def from_fp(fp, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_
threshold,
cp_isolation,
cp_exclusion,
preemptive_behaviour,
explain
)

@staticmethod
def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, explain=False):
def from_path(path, steps=10, chunk_size=512, threshold=0.20, cp_isolation=None, cp_exclusion=None, preemptive_behaviour=True, explain=False):
"""
:param str path:
:param int steps:
:param int chunk_size:
:param float threshold:
:param bool preemptive_behaviour: Determine if we should look into sequence (ASCII-Mode) for pre-defined encoding
:param bool explain: Print on screen what is happening when searching for a match
:param list[str] cp_isolation: Finite list of encoding to use when searching for a match
:param list[str] cp_exclusion: Finite list of encoding to avoid when searching for a match
:return: List of potential matches
:rtype: CharsetNormalizerMatches
"""
with open(path, 'rb') as fp:
return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, explain)
return CharsetNormalizerMatches.from_fp(fp, steps, chunk_size, threshold, cp_isolation, cp_exclusion, preemptive_behaviour, explain)

@cached_property
def could_be_from_charset(self):
@@ -596,3 +617,9 @@ def best(self):
return CharsetNormalizerMatches(
sorted_matches[:nb_lowest_ratio+1]
)


# Some aliases to CharsetNormalizerMatches, because it is too long for a class name.
CharsetDetector = CharsetNormalizerMatches
EncodingDetector = CharsetNormalizerMatches
CharsetDoctor = CharsetNormalizerMatches
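
Taken together, the aliases and the new `preemptive_behaviour` flag can be exercised as below. This is an illustrative sketch rather than code from the repository; `my_bytes` is a made-up payload that declares its own charset in-band:

    from charset_normalizer import CharsetDetector  # alias of CharsetNormalizerMatches

    # Hypothetical payload carrying an in-band encoding declaration.
    my_bytes = b'<?xml version="1.0" encoding="utf-8"?><root>Bonjour</root>'

    # With preemptive_behaviour=True (the default), the declared encoding is taken as a
    # strong hint: once it passes the initial tests, from_bytes returns early instead of
    # scanning every remaining codec.
    matches = CharsetDetector.from_bytes(my_bytes, preemptive_behaviour=True)
    print(matches.best().could_be_from_charset)  # expected to include 'utf_8'
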
39 changes: 39 additions & 0 deletions charset_normalizer/probe_inherent_sign.py
@@ -0,0 +1,39 @@
from re import findall, compile, IGNORECASE
from encodings.aliases import aliases

RE_POSSIBLE_ENCODING_INDICATION = compile(
r'(?:(?:encoding)|(?:charset)|(?:coding))(?:[\:= ]{1,10})(?:[\"\']?)([a-zA-Z0-9\-_]+)(?:[\"\']?)',
IGNORECASE
)


def any_specified_encoding(sequence):
"""
Search in sequence (ASCII-mode) if there is any sign of declared encoding.
:param bytes sequence:
:return: Declared encoding if any else None
:rtype: str
"""
if not isinstance(sequence, bytes) and not isinstance(sequence, bytearray):
raise TypeError

seq_len = len(sequence)

results = findall(
RE_POSSIBLE_ENCODING_INDICATION,
sequence[:seq_len if seq_len <= 2048 else int(seq_len*0.3)].decode('ascii', errors='ignore')
) # type: list[str]

if len(results) == 0:
return None

for specified_encoding in results:
specified_encoding = specified_encoding.lower().replace('-', '_')

for a, b in aliases.items():
if a == specified_encoding:
return b
if b == specified_encoding:
return b

return None
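
A quick illustration of how `any_specified_encoding` resolves a declaration through the stdlib `encodings.aliases` table (made-up inputs, shown only as a sketch):

    from charset_normalizer.probe_inherent_sign import any_specified_encoding

    # 'ISO-8859-1' is lower-cased and normalised to 'iso_8859_1', which the alias
    # table maps to its canonical codec name 'latin_1'.
    print(any_specified_encoding(b'<meta charset="ISO-8859-1">'))  # -> 'latin_1'

    # No declaration present: the function returns None.
    print(any_specified_encoding(b'plain ASCII with no hint'))     # -> None
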
20 changes: 20 additions & 0 deletions docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = python -msphinx
SPHINXPROJ = Charset Normalizer
SOURCEDIR = .
BUILDDIR = _build

# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
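
With Sphinx and the packages imported by conf.py below (`sphinx_rtd_theme`, `recommonmark`) installed, running `make html` from the docs directory should build the HTML documentation into `_build` (standard Sphinx behaviour, not something stated in this commit).
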
20 changes: 20 additions & 0 deletions docs/advanced_search.rst
@@ -0,0 +1,20 @@
Advanced Search
===============

Charset Normalizer methods ``from_bytes``, ``from_fp`` and ``from_path`` provide some
optional parameters that can be tweaked.

As follows ::

CharsetDetector.from_bytes(
my_byte_str,
steps=10, # Number of steps/block to extract from my_byte_str
chunk_size=512, # Set block size of each extraction
threshold=0.2, # Maximum amount of chaos allowed on first pass
cp_isolation=None, # Finite list of encoding to use when searching for a match
cp_exclusion=None, # Finite list of encoding to avoid when searching for a match
preemptive_behaviour=True, # Determine if we should look into my_byte_str (ASCII-Mode) for pre-defined encoding
explain=False # Print on screen what is happening when searching for a match
)
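
For instance, restricting the search to a couple of candidate codecs could look like this (an illustrative sketch; the names are standard Python codec names) ::

    CharsetDetector.from_path(
        './my_subtitle.srt',               # hypothetical input file
        cp_isolation=['utf_8', 'latin_1']  # only these two encodings will be tested
    )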

!! Warning !! Work in Progress Documentation !!
163 changes: 163 additions & 0 deletions docs/conf.py
@@ -0,0 +1,163 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# charset-normalizer documentation build configuration file, created by
# sphinx-quickstart on Fri Jun 16 04:30:35 2017.
#
# This file is execfile()d with the current directory set to its
# containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
from recommonmark.parser import CommonMarkParser
import sphinx_rtd_theme

# -- General configuration ------------------------------------------------

# If your documentation needs a minimal Sphinx version, state it here.
#
# needs_sphinx = '1.0'

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = []

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
# source_suffix = ['.rst', '.md']
# source_suffix = '.rst'

source_parsers = {
'.md': CommonMarkParser,
}

source_suffix = ['.rst', '.md']

# The master toctree document.
master_doc = 'index'

# General information about the project.
project = 'charset_normalizer'
copyright = '2019, Ahmed TAHRI'
author = 'Ahmed TAHRI @Ousret'

# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '1.1'
# The full version, including alpha/beta/rc tags.
release = '1.1.1'

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = None

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This patterns also effect to html_static_path and html_extra_path
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'

# If true, `todo` and `todoList` produce output, else they produce nothing.
todo_include_todos = False


# -- Options for HTML output ----------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#
# html_theme_options = {}

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']


# -- Options for HTMLHelp output ------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'charset-normalizer-doc'


# -- Options for LaTeX output ---------------------------------------------

latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#
# 'papersize': 'letterpaper',

# The font size ('10pt', '11pt' or '12pt').
#
# 'pointsize': '10pt',

# Additional stuff for the LaTeX preamble.
#
# 'preamble': '',

# Latex figure (float) alignment
#
# 'figure_align': 'htbp',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
# author, documentclass [howto, manual, or own class]).
latex_documents = [
(master_doc, 'charset-normalizer.tex', 'Charset Normalizer Documentation',
'Ahmed TAHRI', 'manual'),
]


# -- Options for manual page output ---------------------------------------

# One entry per manual page. List of tuples
# (source start file, name, description, authors, manual section).
man_pages = [
(master_doc, 'charset-normalizer', 'Charset Normalizer Documentation',
[author], 1)
]


# -- Options for Texinfo output -------------------------------------------

# Grouping the document tree into Texinfo files. List of tuples
# (source start file, target name, title, author,
# dir menu entry, description, category)
texinfo_documents = [
(master_doc, 'Charset Normalizer', 'Charset Normalizer Documentation',
author, 'charset-normalizer', '🎁 Maintained library on encoding & language detection. 🚀No Cpp Bindings, Using Voodoo and Magical Artifacts. 🔎 Like Chardet',
'Miscellaneous'),
]
