From e770c9911f874cd367ac361104a5eaca0e2da9f3 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 14 Aug 2022 19:56:50 +0200
Subject: [PATCH 01/63] :bookmark: Bump version to 3.0.0 b1

---
 charset_normalizer/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index af7e749e..1bee3dc5 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "2.1.0"
+__version__ = "3.0.0b1"
 VERSION = __version__.split(".")

From 4e9b2d32223ded694a5fd7b48e4d26032c9149dc Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 14 Aug 2022 19:57:29 +0200
Subject: [PATCH 02/63] :wrench: Add support to build Whl using MYPYC

---
 setup.py | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/setup.py b/setup.py
index 298d12be..415e9051 100644
--- a/setup.py
+++ b/setup.py
@@ -3,11 +3,38 @@
 
 import io
 import os
+import sys
 
 from re import search
 
 from setuptools import find_packages, setup
 
+USE_MYPYC = False
+
+if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc":
+    sys.argv.pop(1)
+    USE_MYPYC = True
+if os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1":
+    USE_MYPYC = True
+
+if USE_MYPYC:
+    from mypyc.build import mypycify
+
+    MYPYC_MODULES = mypycify([
+        "charset_normalizer/__init__.py",
+        "charset_normalizer/api.py",
+        "charset_normalizer/cd.py",
+        "charset_normalizer/constant.py",
+        "charset_normalizer/legacy.py",
+        "charset_normalizer/md.py",
+        "charset_normalizer/models.py",
+        "charset_normalizer/utils.py",
+        "charset_normalizer/assets/__init__.py"
+    ], opt_level="4")
+else:
+    MYPYC_MODULES = None
+
+
 def get_version():
     with open('charset_normalizer/version.py') as version_file:
         return search(r"""__version__\s+=\s+(['"])(?P<version>.+?)\1""",
@@ -83,4 +110,5 @@ def get_version():
         'Bug Reports': 'https://github.com/Ousret/charset_normalizer/issues',
         'Documentation': 'https://charset-normalizer.readthedocs.io/en/latest',
     },
+    ext_modules=MYPYC_MODULES
 )

From 482d2e38551e9ee0606284eb34fe4dae9920d3b1 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 14 Aug 2022 19:58:02 +0200
Subject: [PATCH 03/63] :wrench: remove opt level override

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 415e9051..0f6ae096 100644
--- a/setup.py
+++ b/setup.py
@@ -30,7 +30,7 @@
         "charset_normalizer/models.py",
         "charset_normalizer/utils.py",
         "charset_normalizer/assets/__init__.py"
-    ], opt_level="4")
+    ])
 else:
     MYPYC_MODULES = None
 

From e74851a4910fbc8bdb486ea020f4f845de7ba2ec Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 14 Aug 2022 20:31:06 +0200
Subject: [PATCH 04/63] :fire: remove deprecated

---
 charset_normalizer/__init__.py | 15 ++--------
 charset_normalizer/api.py      | 45 ----------------------------
 charset_normalizer/legacy.py   | 54 +---------------------------------
 charset_normalizer/models.py   | 44 ---------------------------
 charset_normalizer/utils.py    |  8 +----
 5 files changed, 4 insertions(+), 162 deletions(-)

diff --git a/charset_normalizer/__init__.py b/charset_normalizer/__init__.py
index 2dcaf56f..ebb5da89 100644
--- a/charset_normalizer/__init__.py
+++ b/charset_normalizer/__init__.py
@@ -21,14 +21,8 @@
 """
 import logging
 
-from .api import from_bytes, from_fp, from_path, normalize
-from .legacy import (
-    CharsetDetector,
-    CharsetDoctor,
-    CharsetNormalizerMatch,
-    CharsetNormalizerMatches,
-    detect,
-)
+from .api import from_bytes, from_fp, from_path
+from .legacy import detect
 from .models import CharsetMatch, CharsetMatches
 from .utils import set_logging_handler
 from .version import VERSION, __version__
 
@@ -37,14 +31,9 @@
     "from_fp",
     "from_path",
     "from_bytes",
-    "normalize",
     "detect",
     "CharsetMatch",
     "CharsetMatches",
-    "CharsetNormalizerMatch",
-    "CharsetNormalizerMatches",
-    "CharsetDetector",
-    "CharsetDoctor",
     "__version__",
     "VERSION",
     "set_logging_handler",
diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index 3697291c..c4d3c7c3 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -1,6 +1,5 @@
 import logging
 from os import PathLike
-from os.path import basename, splitext
 from typing import Any, BinaryIO, List, Optional, Set
 
 from .cd import (
@@ -532,47 +531,3 @@ def from_path(
         preemptive_behaviour,
         explain,
     )
-
-
-def normalize(
-    path: "PathLike[Any]",
-    steps: int = 5,
-    chunk_size: int = 512,
-    threshold: float = 0.20,
-    cp_isolation: Optional[List[str]] = None,
-    cp_exclusion: Optional[List[str]] = None,
-    preemptive_behaviour: bool = True,
-) -> CharsetMatch:
-    """
-    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
-    """
-    results = from_path(
-        path,
-        steps,
-        chunk_size,
-        threshold,
-        cp_isolation,
-        cp_exclusion,
-        preemptive_behaviour,
-    )
-
-    filename = basename(path)
-    target_extensions = list(splitext(filename))
-
-    if len(results) == 0:
-        raise IOError(
-            'Unable to normalize "{}", no encoding charset seems to fit.'.format(
-                filename
-            )
-        )
-
-    result = results.best()
-
-    target_extensions[0] += "-" + result.encoding  # type: ignore
-
-    with open(
-        "{}".format(str(path).replace(filename, "".join(target_extensions))), "wb"
-    ) as fp:
-        fp.write(result.output())  # type: ignore
-
-    return result  # type: ignore
diff --git a/charset_normalizer/legacy.py b/charset_normalizer/legacy.py
index cdebe2b8..b266d176 100644
--- a/charset_normalizer/legacy.py
+++ b/charset_normalizer/legacy.py
@@ -1,9 +1,7 @@
-import warnings
 from typing import Dict, Optional, Union
 
-from .api import from_bytes, from_fp, from_path, normalize
+from .api import from_bytes
 from .constant import CHARDET_CORRESPONDENCE
-from .models import CharsetMatch, CharsetMatches
 
 
 def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
@@ -43,53 +41,3 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]:
         "language": language,
         "confidence": confidence,
     }
-
-
-class CharsetNormalizerMatch(CharsetMatch):
-    pass
-
-
-class CharsetNormalizerMatches(CharsetMatches):
-    @staticmethod
-    def from_fp(*args, **kwargs):  # type: ignore
-        warnings.warn(  # pragma: nocover
-            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
-            "and scheduled to be removed in 3.0",
-            DeprecationWarning,
-        )
-        return from_fp(*args, **kwargs)  # pragma: nocover
-
-    @staticmethod
-    def from_bytes(*args, **kwargs):  # type: ignore
-        warnings.warn(  # pragma: nocover
-            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
-            "and scheduled to be removed in 3.0",
-            DeprecationWarning,
-        )
-        return from_bytes(*args, **kwargs)  # pragma: nocover
-
-    @staticmethod
-    def from_path(*args, **kwargs):  # type: ignore
-        warnings.warn(  # pragma: nocover
-            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
-            "and scheduled to be removed in 3.0",
-            DeprecationWarning,
-        )
-        return from_path(*args, **kwargs)  # pragma: nocover
-
-    @staticmethod
-    def normalize(*args, **kwargs):  # type: ignore
-        warnings.warn(  # pragma: nocover
-            "staticmethod from_fp, from_bytes,
-            "staticmethod from_fp, from_bytes, from_path and normalize are deprecated "
-            "and scheduled to be removed in 3.0",
-            DeprecationWarning,
-        )
-        return normalize(*args, **kwargs)  # pragma: nocover
-
-
-class CharsetDetector(CharsetNormalizerMatches):
-    pass
-
-
-class CharsetDoctor(CharsetNormalizerMatches):
-    pass
diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index ccb0d475..b52bae78 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -1,12 +1,8 @@
-import warnings
-from collections import Counter
 from encodings.aliases import aliases
 from hashlib import sha256
 from json import dumps
-from re import sub
 from typing import (
     Any,
-    Counter as TypeCounter,
     Dict,
     Iterator,
     List,
@@ -16,7 +12,6 @@
 )
 
 from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
-from .md import mess_ratio
 from .utils import iana_name, is_multi_byte_encoding, unicode_range
 
 
@@ -78,45 +73,6 @@ def __lt__(self, other: object) -> bool:
     def multi_byte_usage(self) -> float:
         return 1.0 - len(str(self)) / len(self.raw)
 
-    @property
-    def chaos_secondary_pass(self) -> float:
-        """
-        Check once again chaos in decoded text, except this time, with full content.
-        Use with caution, this can be very slow.
-        Notice: Will be removed in 3.0
-        """
-        warnings.warn(
-            "chaos_secondary_pass is deprecated and will be removed in 3.0",
-            DeprecationWarning,
-        )
-        return mess_ratio(str(self), 1.0)
-
-    @property
-    def coherence_non_latin(self) -> float:
-        """
-        Coherence ratio on the first non-latin language detected if ANY.
-        Notice: Will be removed in 3.0
-        """
-        warnings.warn(
-            "coherence_non_latin is deprecated and will be removed in 3.0",
-            DeprecationWarning,
-        )
-        return 0.0
-
-    @property
-    def w_counter(self) -> TypeCounter[str]:
-        """
-        Word counter instance on decoded text.
-        Notice: Will be removed in 3.0
-        """
-        warnings.warn(
-            "w_counter is deprecated and will be removed in 3.0", DeprecationWarning
-        )
-
-        string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower())
-
-        return Counter(string_printable_only.split())
-
     def __str__(self) -> str:
         # Lazy Str Loading
         if self._string is None:
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index 859f212b..c2375114 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -1,10 +1,4 @@
-try:
-    # WARNING: unicodedata2 support is going to be removed in 3.0
-    # Python is quickly catching up.
-    import unicodedata2 as unicodedata
-except ImportError:
-    import unicodedata  # type: ignore[no-redef]
-
+import unicodedata
 import importlib
 import logging
 from codecs import IncrementalDecoder

From cd4be0dd19ac82b79206353a9468ff6abb806e23 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 14 Aug 2022 20:37:58 +0200
Subject: [PATCH 05/63] :fire: remove test_normalize_fp as target fn been removed

---
 tests/test_normalize_fp.py | 20 --------------------
 1 file changed, 20 deletions(-)
 delete mode 100644 tests/test_normalize_fp.py

diff --git a/tests/test_normalize_fp.py b/tests/test_normalize_fp.py
deleted file mode 100644
index e2ce364a..00000000
--- a/tests/test_normalize_fp.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import pytest
-from charset_normalizer import normalize
-from os.path import exists
-from os import unlink
-
-
-def test_normalize_fp_creation():
-    guesses = normalize(
-        "./data/sample-arabic-1.txt"
-    )
-
-    predicted_path = "./data/sample-arabic-1-{}.txt".format(guesses.best().encoding)
-    path_exist = exists(
-        "./data/sample-arabic-1-{}.txt".format(guesses.best().encoding)
-    )
-
-    assert path_exist is True
-
-    if path_exist:
-        unlink(predicted_path)

From 0d89020aa6d959ad9f03e542914e846ab8ff26cc Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 14 Aug 2022 20:38:52 +0200
Subject: [PATCH 06/63] :fire: remove extra unicodedata backport support

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 0f6ae096..1f556726 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,7 @@ def get_version():
 
 REQUIRED = []
 EXTRAS = {
-    'unicode_backport': ['unicodedata2']
+    'unicode_backport': []
 }
 
 here = os.path.abspath(os.path.dirname(__file__))

From b3c0d5a38d982534a1a6fb8f4e2708e8a9b06da3 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 14 Aug 2022 20:39:49 +0200
Subject: [PATCH 07/63] :art: reformat models.py and utils.py

---
 charset_normalizer/models.py | 10 +---------
 charset_normalizer/utils.py  |  2 +-
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index b52bae78..7a8ff565 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -1,15 +1,7 @@
 from encodings.aliases import aliases
 from hashlib import sha256
 from json import dumps
-from typing import (
-    Any,
-    Dict,
-    Iterator,
-    List,
-    Optional,
-    Tuple,
-    Union,
-)
+from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
 from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
 from .utils import iana_name, is_multi_byte_encoding, unicode_range
diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py
index c2375114..425d8365 100644
--- a/charset_normalizer/utils.py
+++ b/charset_normalizer/utils.py
@@ -1,6 +1,6 @@
-import unicodedata
 import importlib
 import logging
+import unicodedata
 from codecs import IncrementalDecoder
 from encodings.aliases import aliases
 from functools import lru_cache

From 6f6300a7e97d5abcf6752c21d2e92fcc62490fc3 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 14 Aug 2022 20:42:01 +0200
Subject: [PATCH 08/63] :art: fix flake8 F401 '.constant.NOT_PRINTABLE_PATTERN' imported but unused

---
 charset_normalizer/constant.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py
index ac840c46..2e181638 100644
--- a/charset_normalizer/constant.py
+++ b/charset_normalizer/constant.py
@@ -489,8 +489,6 @@
 KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"}
 ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"}
 
-NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+")
-
 LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES)
 
 # Logging LEVEL bellow DEBUG

From 0262569567eedd7279a5c122a644e0eafff9ee3f Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 14 Aug 2022 20:44:00 +0200
Subject: [PATCH 09/63] :fire: remove NOT_PRINTABLE_PATTERN import in models.py

---
 charset_normalizer/models.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index 7a8ff565..2da82cbd 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -3,7 +3,7 @@
 from json import dumps
 from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
 
-from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE
+from .constant import TOO_BIG_SEQUENCE
 from .utils import iana_name, is_multi_byte_encoding, unicode_range
 
 

From c0283d904d991682509163338b7aef7b4a673de6 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 15 Aug 2022 16:33:59 +0200
Subject: [PATCH 10/63] :zap: Only "compile" md.py for whl size sake

We do not need to optimize everything, most of the time is spent in md.py
---
 setup.py | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/setup.py b/setup.py
index 1f556726..476d65d9 100644
--- a/setup.py
+++ b/setup.py
@@ -21,15 +21,7 @@ from mypyc.build import mypycify
 
     MYPYC_MODULES = mypycify([
-        "charset_normalizer/__init__.py",
-        "charset_normalizer/api.py",
-        "charset_normalizer/cd.py",
-        "charset_normalizer/constant.py",
-        "charset_normalizer/legacy.py",
-        "charset_normalizer/md.py",
-        "charset_normalizer/models.py",
-        "charset_normalizer/utils.py",
-        "charset_normalizer/assets/__init__.py"
+        "charset_normalizer/md.py"
     ])
 else:
     MYPYC_MODULES = None
 

From 6328f7c288249297318c8a99520c9431e02b9cc7 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 15 Aug 2022 17:14:07 +0200
Subject: [PATCH 11/63] :sparkle: Add mypyc gha minimum testing

---
 .github/workflows/mypyc-verify.yml | 37 ++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 .github/workflows/mypyc-verify.yml

diff --git a/.github/workflows/mypyc-verify.yml b/.github/workflows/mypyc-verify.yml
new file mode 100644
index 00000000..499487c8
--- /dev/null
+++ b/.github/workflows/mypyc-verify.yml
@@ -0,0 +1,37 @@
+name: MYPYC Run
+
+on: [push, pull_request]
+
+jobs:
+  detection_coverage:
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: [3.9]
+        os: [ubuntu-latest]
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      run: |
+        pip install -U pip setuptools
+        pip install -r dev-requirements.txt
+        pip uninstall -y charset-normalizer
+    - name: Install the package
+      run: |
+        python setup.py --use-mypyc install
+    - name: Clone the complete dataset
+      run: |
+        git clone https://github.com/Ousret/char-dataset.git
+    - name: Coverage WITH preemptive
+      run: |
+        python ./bin/coverage.py --coverage 97 --with-preemptive
+    - name: Coverage WITHOUT preemptive
+      run: |
+        python ./bin/coverage.py --coverage 95

From 31f2673f4955c8b077b1ea569f4d1b1b0fce7876 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 15 Aug 2022 17:14:46 +0200
Subject: [PATCH 12/63] :sparkle: initial ci update to include building wheels (specific) +universal

---
 .github/workflows/python-publish.yml | 90 +++++++++++++++++++++++++++
 1 file changed, 87 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 88986a21..2393715e 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -145,11 +145,88 @@ jobs:
       - name: Integration Tests with Requests
         run: |
           python ./bin/integration.py
+
+  universal-wheel:
+    runs-on: ubuntu-latest
+    needs:
+      - integration
+    steps:
+      - uses: actions/checkout@v2
+      - name: Set up Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: '3.x'
+      - name: Update pip, setuptools, wheel and twine
+        run: |
+          python -m pip install --upgrade pip
+          pip install setuptools wheel twine
+      - name: Build Wheel
+        env:
+          CHARSET_NORMALIZER_USE_MYPYC: '0'
+        run: python setup.py sdist bdist_wheel
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: dist
+
+  build-wheels:
+    name: Build wheels on ${{ matrix.os }} ${{ matrix.qemu }}
+    runs-on: ${{ matrix.os }}-latest
+    needs: universal-wheel
+    strategy:
+      matrix:
+        os: [ ubuntu, windows, macos ]
+        qemu: [ '' ]
+        include:
+          # Split ubuntu job for the sake of speed-up
+          - os: ubuntu
+            qemu: aarch64
+          - os: ubuntu
+            qemu: ppc64le
+          - os: ubuntu
+            qemu: s390x
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+        with:
+          submodules: true
+      - name: Set up QEMU
+        if: ${{ matrix.qemu }}
+        uses: docker/setup-qemu-action@v2
+        with:
+          platforms: all
+        id: qemu
+      - name: Prepare emulation
+        run: |
+          if [[ -n "${{ matrix.qemu }}" ]]; then
+            # Build emulated architectures only if QEMU is set,
+            # use default "auto" otherwise
+            echo "CIBW_ARCHS_LINUX=${{ matrix.qemu }}" >> $GITHUB_ENV
+          fi
+        shell: bash
+      - name: Setup Python
+        uses: actions/setup-python@v4
+      - name: Update pip, wheel, setuptools, build, twine
+        run: |
+          python -m pip install -U pip wheel setuptools build twine
+      - name: Install dev-dependencies
+        run: |
+          pip install -r dev-requirements.txt
+      - name: Build Wheel
+        env:
+          CHARSET_NORMALIZER_USE_MYPYC: '1'
+        run: python setup.py sdist bdist_wheel
+      - name: Upload artifacts
+        uses: actions/upload-artifact@v3
+        with:
+          name: dist
+          path: dist
+
   deploy:
 
     runs-on: ubuntu-latest
     needs:
-      - integration
+      - build-wheels
 
     steps:
       - uses: actions/checkout@v2
@@ -161,10 +238,17 @@ jobs:
         run: |
           python -m pip install --upgrade pip
           pip install setuptools wheel twine
-      - name: Build and publish
+      - name: Download disctributions
+        uses: actions/download-artifact@v3
+        with:
+          name: dist
+          path: dist
+      - name: Collected dists
+        run: |
+          tree dist
+      - name: Publish
         env:
           TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
           TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
         run: |
-          python setup.py sdist bdist_wheel
           twine upload dist/*

From e8d7405a45e6d73aed4e269f34cd3466e742bb91 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 15 Aug 2022 17:26:54 +0200
Subject: [PATCH 13/63] :pencil: Add CHANGELOG entry for first beta

---
 CHANGELOG.md | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 68f38819..692b608e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,17 @@
 All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
+## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
+
+### Changed
+- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
+
+### Removed
+- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches
+- Breaking: Top-level function `normalize`
+- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch
+- Support for the backport `unicodedata2`
+
 ## [2.1.0](https://github.com/Ousret/charset_normalizer/compare/2.0.12...2.1.0) (2022-06-19)
 
 ### Added

From 316b5beeeb21c5d1cf5dff27c577c36634a43da5 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 15 Aug 2022 17:39:24 +0200
Subject: [PATCH 14/63] :pencil: Update README

---
 README.md | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index d58ede1b..f7084441 100644
--- a/README.md
+++ b/README.md
@@ -53,12 +53,12 @@ This package offer better performance than its counterpart Chardet. Here are som
 
 | Package | Accuracy | Mean per file (ms) | File per sec (est) |
 | ------------- | :-------------: | :------------------: | :------------------: |
 | [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec |
-| charset-normalizer | **98 %** | **39 ms** | 26 file/sec |
+| charset-normalizer | **98 %** | **10 ms** | 100 file/sec |
 
 | Package | 99th percentile | 95th percentile | 50th percentile |
 | ------------- | :-------------: | :------------------: | :------------------: |
 | [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms |
-| charset-normalizer | 400 ms | 200 ms | 15 ms |
+| charset-normalizer | 100 ms | 50 ms | 5 ms |
 
 Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload.
 
 > Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability
 > (eg. Supported Encoding) Challenge-them if you want.
-
-[cchardet](https://github.com/PyYoshi/cChardet) is a non-native (cpp binding) and unmaintained faster alternative with
-a better accuracy than chardet but lower than this package. If speed is the most important factor, you should try it.
 
 ## ✨ Installation
 
 Using PyPi for latest stable
 ```sh
 pip install charset-normalizer -U
 ```
-
-If you want a more up-to-date `unicodedata` than the one available in your Python setup.
-```sh
-pip install charset-normalizer[unicode_backport] -U
-```
 
 ## 🚀 Basic Usage
 
 ### CLI

From 82fb1b2441e5fc0e2371a950528b759c8558d90c Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 15 Aug 2022 17:54:56 +0200
Subject: [PATCH 15/63] :pencil: Add a bit of docs about this

---
 docs/community/speedup.rst     | 45 ++++++++++++++++++++++++++++
 docs/community/why_migrate.rst |  4 +--
 docs/index.rst                 |  1 +
 3 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 docs/community/speedup.rst

diff --git a/docs/community/speedup.rst b/docs/community/speedup.rst
new file mode 100644
index 00000000..cf810ab7
--- /dev/null
+++ b/docs/community/speedup.rst
@@ -0,0 +1,45 @@
+Optional speedup extension
+===========================
+
+Why?
+-------
+
+charset-normalizer will always remain pure Python, meaning that a environment without any build-capabilities will
+run this program without any additional requirements.
+
+Nonetheless, starting from the version 3.0 we introduce and publish some platform specific wheels including a
+pre-build extension.
+
+Most of the time is spent in the module `md.py` so we decided to "compile it" using Mypyc.
+
+(1) It does not require to have a separate code base
+(2) Our project code base is rather simple and lightweight
+(3) Mypyc is robust enough today
+(4) Four times faster!
+
+How?
+-------
+
+If your platform and/or architecture is not served by this swift optimization you may compile it easily yourself.
+Following those instructions (provided you have the necessary toolchain installed):
+
+ ::
+
+    git clone https://github.com/Ousret/charset_normalizer.git
+    cd charset_normalizer
+    git checkout 3.0
+    pip install -r dev-requirements.txt
+    python setup.py --use-mypyc install
+
+
+How not to?
+-------
+
+You may install charset-normalizer without any specific (pre-built wheel) by directly using the universal wheel
+(most likely hosted on PyPi or any valid mirror you use)
+
+ ::
+
+    pip install https://........./charset_normalizer-3.0.0b2-py3-none-any.whl
+
+Directly.
diff --git a/docs/community/why_migrate.rst b/docs/community/why_migrate.rst
index 717fc3b5..1909c770 100644
--- a/docs/community/why_migrate.rst
+++ b/docs/community/why_migrate.rst
@@ -4,13 +4,13 @@ Why should I migrate to Charset-Normalizer?
 There is so many reason to migrate your current project. Here are some of them:
 
 - Remove ANY license ambiguity/restriction for projects bundling Chardet (even indirectly).
-- X5 faster than Chardet in average and X3 faster in 99% of the cases AND support 3 times more encoding.
+- X10 faster than Chardet in average and X6 faster in 99% of the cases AND support 3 times more encoding.
 - Never return a encoding if not suited for the given decoder. Eg. Never get UnicodeDecodeError!
 - Actively maintained, open to contributors.
 - Have the backward compatible function ``detect`` that come from Chardet.
 - Truly detect the language used in the text.
 - It is, for the first time, really universal! As there is no specific probe per charset.
-- The package size is X4 lower than Chardet's (5.0)!
+- The package size is X2~X4 lower than Chardet's (5.0)! (Depends on your arch)
 - Propose much more options/public kwargs to tweak the detection as you sees fit!
 - Using static typing to ease your development.
 - Detect Unicode content better than Chardet or cChardet does.
diff --git a/docs/index.rst b/docs/index.rst
index 2398a7f0..ca065097 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -72,6 +72,7 @@ Community Guide
 .. toctree::
    :maxdepth: 2
 
+   community/speedup
    community/faq
    community/why_migrate

From 09402e6a58733db00dadfdf3a4c3cf47a2540da0 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 15 Aug 2022 18:05:04 +0200
Subject: [PATCH 16/63] :wrench: Add py matrix build specific wheels

---
 .github/workflows/python-publish.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 2393715e..af243734 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -177,6 +177,7 @@ jobs:
       matrix:
         os: [ ubuntu, windows, macos ]
         qemu: [ '' ]
+        python-version: [ 3.6, 3.7, 3.8, 3.9, "3.10" ]
         include:
           # Split ubuntu job for the sake of speed-up
          - os: ubuntu
@@ -206,6 +207,8 @@ jobs:
         shell: bash
       - name: Setup Python
         uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
       - name: Update pip, wheel, setuptools, build, twine
         run: |
           python -m pip install -U pip wheel setuptools build twine

From c19faca176e351da420270864134e5fdbcceb683 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Mon, 15 Aug 2022 22:49:32 +0200
Subject: [PATCH 17/63] Use cibuildwheel action

---
 .github/workflows/python-publish.yml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index af243734..54a312f2 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -1,6 +1,7 @@
 name: Release-Deployment CI
 
 on:
+  workflow_dispatch:
   release:
     types: [created]
@@ -177,7 +178,6 @@ jobs:
       matrix:
         os: [ ubuntu, windows, macos ]
         qemu: [ '' ]
-        python-version: [ 3.6, 3.7, 3.8, 3.9, "3.10" ]
         include:
           # Split ubuntu job for the sake of speed-up
          - os: ubuntu
@@ -206,18 +207,18 @@ jobs:
         shell: bash
       - name: Setup Python
         uses: actions/setup-python@v4
-        with:
-          python-version: ${{ matrix.python-version }}
       - name: Update pip, wheel, setuptools, build, twine
         run: |
           python -m pip install -U pip wheel setuptools build twine
       - name: Install dev-dependencies
         run: |
           pip install -r dev-requirements.txt
-      - name: Build Wheel
+      - name: Build wheels
+        uses: pypa/cibuildwheel@2.7.0
         env:
+          CIBW_ARCHS_MACOS: x86_64 arm64 universal2
+          CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
           CHARSET_NORMALIZER_USE_MYPYC: '1'
-        run: python setup.py sdist bdist_wheel
       - name: Upload artifacts
         uses: actions/upload-artifact@v3
         with:

From 05b7e7eec46468f938dac7a1f4da2055c2d2d3cc Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Mon, 15 Aug 2022 23:23:38 +0200
Subject: [PATCH 18/63] Update python-publish.yml

---
 .github/workflows/python-publish.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 54a312f2..124b97cb 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -1,7 +1,6 @@
 name: Release-Deployment CI
 
 on:
-  workflow_dispatch:
   release:
     types: [created]

From 68f5aff1ddadab41cedf24f35b16714b420e4e95 Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Tue, 16 Aug 2022 17:00:22 +0200
Subject: [PATCH 19/63] Update python-publish.yml

---
 .github/workflows/python-publish.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 124b97cb..bd1dee35 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -1,6 +1,7 @@
 name: Release-Deployment CI
 
 on:
+  workflow_dispatch:
   release:
     types: [created]
@@ -209,15 +210,12 @@ jobs:
       - name: Update pip, wheel, setuptools, build, twine
         run: |
           python -m pip install -U pip wheel setuptools build twine
-      - name: Install dev-dependencies
-        run: |
-          pip install -r dev-requirements.txt
       - name: Build wheels
         uses: pypa/cibuildwheel@2.7.0
         env:
           CIBW_ARCHS_MACOS: x86_64 arm64 universal2
           CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
-          CHARSET_NORMALIZER_USE_MYPYC: '1'
+          CIBW_DEPENDENCY_VERSIONS: ./dev-requirements.txt

From 35f79f607a796a5dcc08373f0835a9588da117fe Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Tue, 16 Aug 2022 17:12:45 +0200
Subject: [PATCH 20/63] Update python-publish.yml

---
 .github/workflows/python-publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index bd1dee35..545a9ac0 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -215,7 +215,7 @@ jobs:
       - name: Build wheels
         uses: pypa/cibuildwheel@2.7.0
         env:
           CIBW_ARCHS_MACOS: x86_64 arm64 universal2
           CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
-          CIBW_DEPENDENCY_VERSIONS: ./dev-requirements.txt
+          CIBW_BEFORE_BUILD: pip install -r dev-requirements.txt

From 57a84852fd31c98e667a501ec9165356269d08aa Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Tue, 16 Aug 2022 17:27:04 +0200
Subject: [PATCH 21/63] Update python-publish.yml

---
 .github/workflows/python-publish.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 545a9ac0..452abb97 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -216,6 +216,7 @@ jobs:
         env:
           CIBW_ARCHS_MACOS: x86_64 arm64 universal2
           CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
           CIBW_BEFORE_BUILD: pip install -r dev-requirements.txt
+          CIBW_SKIP: pp*
       - name: Upload artifacts
         uses: actions/upload-artifact@v3
         with:

From 2f5130a4c27e5924d910c6c4fa365b93f2431a08 Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Tue, 16 Aug 2022 17:57:56 +0200
Subject: [PATCH 22/63] Update python-publish.yml

---
 .github/workflows/python-publish.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 452abb97..fd1d2a9d 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -221,7 +221,7 @@ jobs:
       - name: Upload artifacts
         uses: actions/upload-artifact@v3
         with:
           name: dist
-          path: dist
+          path: ./wheelhouse/*.whl
 
   deploy:

From 0a0e20b7479af103bb9de7f37744c1d2623e765a Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Tue, 16 Aug 2022 17:59:59 +0200
Subject: [PATCH 23/63] Update python-publish.yml

---
 .github/workflows/python-publish.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index fd1d2a9d..d0f405d4 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -1,7 +1,6 @@
 name: Release-Deployment CI
 
 on:
-  workflow_dispatch:
   release:
     types: [created]

From 443ab7d5bab1040a1c758b3364aa951bf929efda Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Fri, 19 Aug 2022 23:06:17 +0200
Subject: [PATCH 24/63] :fire: remove unicodedata2 import ver in cli

---
 charset_normalizer/cli/normalizer.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py
index b8b652a5..4902d36e 100644
--- a/charset_normalizer/cli/normalizer.py
+++ b/charset_normalizer/cli/normalizer.py
@@ -5,10 +5,7 @@
 from platform import python_version
 from typing import List, Optional
 
-try:
-    from unicodedata2 import unidata_version
-except ImportError:
-    from unicodedata import unidata_version
+from unicodedata import unidata_version
 
 from charset_normalizer import from_fp
 from charset_normalizer.models import CliDetectionResult

From b580e970440825c18758cd84e97f86e8ff2fc951 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Fri, 19 Aug 2022 23:11:24 +0200
Subject: [PATCH 25/63] :sparkle: normalizer --version specify if extra speedup is present

---
 charset_normalizer/cli/normalizer.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py
index 4902d36e..0a0a0648 100644
--- a/charset_normalizer/cli/normalizer.py
+++ b/charset_normalizer/cli/normalizer.py
@@ -9,6 +9,7 @@
 
 from charset_normalizer import from_fp
 from charset_normalizer.models import CliDetectionResult
+import charset_normalizer.md as md_module
 from charset_normalizer.version import __version__
 
 
@@ -121,8 +122,8 @@
     parser.add_argument(
         "--version",
         action="version",
-        version="Charset-Normalizer {} - Python {} - Unicode {}".format(
-            __version__, python_version(), unidata_version
+        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
+            __version__, python_version(), unidata_version, "OFF" if md_module.__file__.lower().endswith(".py") else "ON"
         ),
         help="Show version information and exit.",
     )

From eb4577c0d2c82d925a8e00d2b245269717ca6d66 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Fri, 19 Aug 2022 23:13:05 +0200
Subject: [PATCH 26/63] :bookmark: bump to beta2

---
 charset_normalizer/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index 1bee3dc5..c05c9cd1 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "3.0.0b1"
+__version__ = "3.0.0b2"
 VERSION = __version__.split(".")

From 03a25998d2ca6a5d41a1b8771fe1f54845d56ad3 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Fri, 19 Aug 2022 23:16:28 +0200
Subject: [PATCH 27/63] :pencil: Add changelog entry

---
 CHANGELOG.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 692b608e..b797f747 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
 
+### Added
+- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
+
 ### Changed
 - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
 

From 97b87f0905c6a10d979299c74986c6d753d01e69 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Fri, 19 Aug 2022 23:21:52 +0200
Subject: [PATCH 28/63] :pencil: update changelog

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b797f747..29915d29 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,11 +2,13 @@
 All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
-## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
+## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-19)
 
 ### Added
 - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
 
+## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
+
 ### Changed
 - Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1
 

From 1755db9cee1495cd29a1c06ac98deec524eff69e Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Fri, 19 Aug 2022 23:23:03 +0200
Subject: [PATCH 29/63] :pencil: update speedup doc

---
 docs/community/speedup.rst | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/docs/community/speedup.rst b/docs/community/speedup.rst
index cf810ab7..e88b40b6 100644
--- a/docs/community/speedup.rst
+++ b/docs/community/speedup.rst
@@ -25,11 +25,8 @@ Following those instructions (provided you have the necessary toolchain installe
 
  ::
 
-    git clone https://github.com/Ousret/charset_normalizer.git
-    cd charset_normalizer
-    git checkout 3.0
-    pip install -r dev-requirements.txt
-    python setup.py --use-mypyc install
+    export CHARSET_NORMALIZER_USE_MYPYC=1
+    pip install charset-normalizer --no-binary :all:
 
 
 How not to?
@@ -40,6 +37,6 @@ You may install charset-normalizer without any specific (pre-built wheel) by dir
 
  ::
 
-    pip install https://........./charset_normalizer-3.0.0b2-py3-none-any.whl
+    pip install charset-normalizer --no-binary :all:
 
 Directly.

From 1faeed046660898cd7a727711c9706245aeccbf8 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Fri, 19 Aug 2022 23:33:30 +0200
Subject: [PATCH 30/63] :heavy_check_mark: Verify that --version work as intended for CLI

---
 tests/test_cli.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 16601750..440ce9df 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -28,6 +28,12 @@ def test_single_file(self):
             )
         )
 
+    def test_version_output_success(self):
+        with self.assertRaises(SystemExit):
+            cli_detect(
+                ['--version']
+            )
+
     def test_single_file_normalize(self):
         self.assertEqual(
             0,

From 8e5af122d10135e42f1653dfe054ca144dc69725 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Fri, 19 Aug 2022 23:33:44 +0200
Subject: [PATCH 31/63] :art: reformat normalizer.py

---
 charset_normalizer/cli/normalizer.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py
index 0a0a0648..70293895 100644
--- a/charset_normalizer/cli/normalizer.py
+++ b/charset_normalizer/cli/normalizer.py
@@ -4,12 +4,11 @@
 from os.path import abspath
 from platform import python_version
 from typing import List, Optional
-
 from unicodedata import unidata_version
 
+import charset_normalizer.md as md_module
 from charset_normalizer import from_fp
 from charset_normalizer.models import CliDetectionResult
-import charset_normalizer.md as md_module
 from charset_normalizer.version import __version__
 
 
@@ -123,7 +122,10 @@
         "--version",
         action="version",
         version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
-            __version__, python_version(), unidata_version, "OFF" if md_module.__file__.lower().endswith(".py") else "ON"
+            __version__,
+            python_version(),
+            unidata_version,
+            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
         ),
         help="Show version information and exit.",
     )

From 368d0600921270ab657e6b1234f7fe13fb594b97 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Fri, 19 Aug 2022 23:59:39 +0200
Subject: [PATCH 32/63] :fire: remove method first() and best() from class CharsetMatch

---
 CHANGELOG.md                 |  3 +++
 charset_normalizer/models.py | 12 ------------
 2 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 29915d29..afc5fdb7 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Added
 - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
 
+### Removed
+- Breaking: Method `first()` and `best()` from CharsetMatch
+
 ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
 
 ### Changed
diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py
index 2da82cbd..07cbc12a 100644
--- a/charset_normalizer/models.py
+++ b/charset_normalizer/models.py
@@ -200,18 +200,6 @@ def could_be_from_charset(self) -> List[str]:
         """
         return [self._encoding] + [m.encoding for m in self._leaves]
 
-    def first(self) -> "CharsetMatch":
-        """
-        Kept for BC reasons. Will be removed in 3.0.
-        """
-        return self
-
-    def best(self) -> "CharsetMatch":
-        """
-        Kept for BC reasons. Will be removed in 3.0.
-        """
-        return self
-
     def output(self, encoding: str = "utf_8") -> bytes:
         """
         Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.

From 8e5171a61726d2a7d571f3c896d17f5491aa5688 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 21 Aug 2022 19:41:46 +0200
Subject: [PATCH 33/63] :fire: :art: remove unused import "warnings"

---
 charset_normalizer/api.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index 375d3b0c..c4d3c7c3 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -1,5 +1,4 @@
 import logging
-import warnings
 from os import PathLike
 from typing import Any, BinaryIO, List, Optional, Set

From 1957898fc2de2f026baff7da04611958e1fbc02a Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 21 Aug 2022 20:00:07 +0200
Subject: [PATCH 34/63] :art: Fix warnings in Sphinx docs generation process

Close #196
---
 docs/api.rst               | 11 +++++++----
 docs/community/speedup.rst |  2 +-
 docs/conf.py               |  4 ++--
 docs/index.rst             |  1 -
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/docs/api.rst b/docs/api.rst
index 47a985e5..48b74951 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -14,11 +14,9 @@ Those functions are publicly exposed and are protected through our BC guarantee.
 .. autofunction:: from_fp
 .. autofunction:: from_path
 
-.. autofunction:: normalize
-
-.. autoclass:: charset_normalizer.CharsetMatches
+.. autoclass:: charset_normalizer.models.CharsetMatches
    :inherited-members:
-.. autoclass:: charset_normalizer.CharsetMatch
+.. autoclass:: charset_normalizer.models.CharsetMatch
    :inherited-members:
 
 .. autofunction:: detect
@@ -99,3 +97,8 @@ Some reusable functions used across the project. We do not guarantee the BC in t
 
 .. autofunction:: charset_normalizer.utils.range_scan
 .. autofunction:: charset_normalizer.utils.is_cp_similar
+
+
+.. class:: os.PathLike
+
+   Used as a generic way to accept AnyStr for paths.
diff --git a/docs/community/speedup.rst b/docs/community/speedup.rst
index e88b40b6..cefc57e4 100644
--- a/docs/community/speedup.rst
+++ b/docs/community/speedup.rst
@@ -30,7 +30,7 @@ Following those instructions (provided you have the necessary toolchain installe
 
 
 How not to?
--------
+-----------
 
 You may install charset-normalizer without any specific (pre-built wheel) by directly using the universal wheel
 (most likely hosted on PyPi or any valid mirror you use)
diff --git a/docs/conf.py b/docs/conf.py
index 5cfe028b..3e675d42 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -81,7 +81,7 @@
 #
 # This is also used if you do content translation via gettext catalogs.
 # Usually you set "language" from the command line for these cases.
-language = None
+language = "en"
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
@@ -113,7 +113,7 @@
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
+html_static_path = []
 
 
 # -- Options for HTMLHelp output ------------------------------------------
diff --git a/docs/index.rst b/docs/index.rst
index ca065097..05d5f98a 100755
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -13,7 +13,6 @@ It aims to be as generic as possible.
 
 .. image:: https://repository-images.githubusercontent.com/200259335/d3da9600-dedc-11e9-83e8-081f597505df
    :width: 500px
-   :scale: 100 %
    :alt: CLI Charset Normalizer
    :align: right
 

From 1eeb42354544cc332a95d1bb2a496fad3ad872d9 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 21 Aug 2022 20:07:15 +0200
Subject: [PATCH 35/63] :pencil: update changelog

---
 CHANGELOG.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4607bae1..033b0640 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,7 +2,7 @@
 All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
-## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-19)
+## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21)
 
 ### Added
 - `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl)
@@ -10,6 +10,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Removed
 - Breaking: Method `first()` and `best()` from CharsetMatch
 
+### Fixed
+- Sphinx warnings when generating the documentation
+
 ## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15)
 
 ### Changed

From f119e43c1290be169cda5a866711e3984ebcfbad Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 21 Aug 2022 20:49:43 +0200
Subject: [PATCH 36/63] :pencil: update docs support section

---
 README.md             | 2 +-
 docs/user/support.rst | 7 +++----
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index f7084441..b7819f97 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ This project offers you an alternative to **Universal Charset Encoding Detector*
 | `Detect spoken language` | ❌ | :heavy_check_mark: | N/A |
 | `UnicodeDecodeError Safety` | ❌ | :heavy_check_mark: | ❌ |
 | `Whl Size` | 193.6 kB | 39.5 kB | ~200 kB |
-| `Supported Encoding` | 33 | :tada: [93](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40
+| `Supported Encoding` | 33 | :tada: [90](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40
 
 <p align="center">
 <img alt="Reading Normalized Text"/><img alt="Cat Reading Text"/>
diff --git a/docs/user/support.rst b/docs/user/support.rst
index 8b624933..0dbf06b9 100644
--- a/docs/user/support.rst
+++ b/docs/user/support.rst
@@ -92,13 +92,10 @@ mac_iceland     maciceland
 mac_latin2      maccentraleurope, maclatin2
 mac_roman       macintosh, macroman
 mac_turkish     macturkish
-mbcs            ansi, dbcs
 ptcp154         csptcp154, pt154, cp154, cyrillic_asian
-rot_13          rot13
 shift_jis       csshiftjis, shiftjis, sjis, s_jis, x_mac_japanese
 shift_jis_2004  shiftjis2004, sjis_2004, s_jis_2004
 shift_jisx0213  shiftjisx0213, sjisx0213, s_jisx0213
-tactis          tis260
 tis_620         tis620, tis_620_0, tis_620_2529_0, tis_620_2529_1, iso_ir_166
 utf_16          u16, utf16
 utf_16_be       unicodebigunmarked, utf_16be
@@ -107,9 +104,11 @@ utf_32          u32, utf32
 utf_32_be       utf_32be
 utf_32_le       utf_32le
 utf_8           u8, utf, utf8, utf8_ucs2, utf8_ucs4 (+utf_8_sig)
-utf_7           u7, unicode-1-1-utf-7
+utf_7*          u7, unicode-1-1-utf-7
 =============== ===============================================================================================================================
 
+*: Only if a SIG/mark is found.
+
 -------------------
 Supported Languages
 -------------------

From 216d1c673615451b6fbfc82865da66ccfd8d66a5 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 21 Aug 2022 20:51:20 +0200
Subject: [PATCH 37/63] make sure utf-7 is not "detected" without a mark/sig

conflict with ascii detector. cannot enable it without compromises.
---
 charset_normalizer/api.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py
index 0be42c01..c2e54356 100644
--- a/charset_normalizer/api.py
+++ b/charset_normalizer/api.py
@@ -199,6 +199,13 @@ def from_bytes(
                 encoding_iana,
             )
             continue
 
+        if encoding_iana in {"utf_7"} and not bom_or_sig_available:
+            logger.log(
+                TRACE,
+                "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.",
+                encoding_iana,
+            )
+            continue
 
         try:
             is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana)

From 03aa701dae61f4c71c0910147061cb3730e0a16f Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sun, 21 Aug 2022 20:53:43 +0200
Subject: [PATCH 38/63] :pencil: update changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 033b0640..9dee06a8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 ### Removed
 - Breaking: Method `first()` and `best()` from CharsetMatch
+- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII)
 
 ### Fixed
 - Sphinx warnings when generating the documentation

From c12a07a513c4aa5328cb5c340954647193f87e28 Mon Sep 17 00:00:00 2001
From: Ahmed TAHRI
Date: Sat, 1 Oct 2022 21:00:09 +0200
Subject: [PATCH 39/63] :wrench: switch to static metadata (setup.cfg) and use 'build'

---
 .github/workflows/chardet-bc.yml        |  3 +-
 .github/workflows/detector-coverage.yml |  3 +-
 .github/workflows/integration.yml       |  3 +-
 .github/workflows/lint.yml              |  3 +-
 .github/workflows/mypyc-verify.yml      |  7 +-
 .github/workflows/performance.yml       |  3 +-
 .github/workflows/python-publish.yml    | 20 ++++--
 .github/workflows/run-tests.yml         |  3 +-
 README.md                               | 14 ++--
 dev-requirements.txt                    | 22 ++++---
 setup.cfg                               | 58 ++++++++++++++++-
 setup.py                                | 86 +++----------------------
 12 files changed, 115 insertions(+), 110 deletions(-)

diff --git a/.github/workflows/chardet-bc.yml b/.github/workflows/chardet-bc.yml
index 0bbeaec8..dfbc64cc 100644
--- a/.github/workflows/chardet-bc.yml
+++ b/.github/workflows/chardet-bc.yml
@@ -25,7 +25,8 @@ jobs:
         pip uninstall -y charset-normalizer
     - name: Install the package
       run: |
-        python setup.py install
+        python -m build
+        pip install ./dist/*.whl
     - name: Clone the complete dataset
       run: |
        git clone https://github.com/Ousret/char-dataset.git
diff --git a/.github/workflows/detector-coverage.yml b/.github/workflows/detector-coverage.yml
index 19eed9ae..1527f22b 100644
--- a/.github/workflows/detector-coverage.yml
+++ b/.github/workflows/detector-coverage.yml
@@ -25,7 +25,8 @@ jobs:
         pip uninstall -y charset-normalizer
     - name: Install the package
       run: |
-        python setup.py install
+        python -m build
+        pip install ./dist/*.whl
     - name: Clone the complete dataset
       run: |
        git clone https://github.com/Ousret/char-dataset.git
diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml
index f74a56d2..00aa98eb 100644
--- a/.github/workflows/integration.yml
+++ b/.github/workflows/integration.yml
@@ -28,7 +28,8 @@ jobs:
         pip uninstall -y charset-normalizer
     - name: Install the package
       run: |
-        python setup.py install
+        python -m build
+        pip install ./dist/*.whl
     - name: Clone the complete dataset
       run: |
        git clone https://github.com/Ousret/char-dataset.git
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 877b890e..4f1f12f4 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -25,7 +25,8 @@ jobs:
         pip uninstall -y charset-normalizer
     - name: Install the package
       run: |
-        python setup.py install
+        python -m build
+        pip install ./dist/*.whl
     - name: Type checking (Mypy)
       run: |
         mypy --strict charset_normalizer
diff --git a/.github/workflows/mypyc-verify.yml b/.github/workflows/mypyc-verify.yml
index 499487c8..e9b2a9c7 100644
--- a/.github/workflows/mypyc-verify.yml
+++ b/.github/workflows/mypyc-verify.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.9]
+        python-version: [3.6, 3.7, 3.8, 3.9, "3.10"]
         os: [ubuntu-latest]
 
     steps:
@@ -24,8 +24,11 @@ jobs:
         pip install -r dev-requirements.txt
         pip uninstall -y charset-normalizer
     - name: Install the package
+      env:
+        CHARSET_NORMALIZER_USE_MYPYC: '1'
       run: |
-        python setup.py --use-mypyc install
+        python -m build --no-isolation
+        pip install ./dist/*.whl
     - name: Clone the complete dataset
       run: |
         git clone https://github.com/Ousret/char-dataset.git
diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml
index fddd9d30..e675061a 100644
--- a/.github/workflows/performance.yml
+++ b/.github/workflows/performance.yml
@@ -25,7 +25,8 @@ jobs:
         pip uninstall -y charset-normalizer
     - name: Install the package
       run: |
-        python setup.py install
+        python -m build
+        pip install ./dist/*.whl
     - name: Clone the complete dataset
       run: |
        git clone https://github.com/Ousret/char-dataset.git
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index fd1d2a9d..4a4f4e48 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -29,7 +29,8 @@ jobs:
         pip uninstall -y charset-normalizer
       - name: Install the package
         run: |
-          python setup.py install
+          python -m build
+          pip install ./dist/*.whl
       - name: Type checking (Mypy)
         run: |
           mypy charset_normalizer
@@ -51,7 +52,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [ 3.6, 3.7, 3.8, 3.9, "3.10" ]
+        python-version: [ 3.6, 3.7, 3.8, 3.9, "3.10", "3.11-dev" ]
         os: [ ubuntu-latest ]
 
     steps:
@@ -67,7 +68,8 @@ jobs:
         pip uninstall -y charset-normalizer
       - name: Install the package
         run: |
-          python setup.py install
+          python -m build
+          pip install ./dist/*.whl
       - name: Run tests
         run: |
          pytest
@@ -96,7 +98,8 @@ jobs:
         pip uninstall -y charset-normalizer
       - name: Install the package
         run: |
-          python setup.py install
+          python -m build
+          pip install ./dist/*.whl
       - name: Clone the complete dataset
         run: |
           git clone https://github.com/Ousret/char-dataset.git
@@ -136,7 +139,8 @@ jobs:
         pip uninstall -y charset-normalizer
       - name: Install the package
         run: |
-          python setup.py install
+          python -m build
+          pip install ./dist/*.whl
       - name: Clone the complete dataset
         run: |
           git clone https://github.com/Ousret/char-dataset.git
@@ -163,7 +167,7 @@ jobs:
       - name: Build Wheel
         env:
           CHARSET_NORMALIZER_USE_MYPYC: '0'
-        run: python setup.py sdist bdist_wheel
+        run: python -m build
       - name: Upload artifacts
         uses: actions/upload-artifact@v3
         with:
@@ -211,10 +215,12 @@ jobs:
         run: |
           python -m pip install -U pip wheel setuptools build twine
       - name: Build wheels
-        uses: pypa/cibuildwheel@2.7.0
+        uses: pypa/cibuildwheel@2.10.2
         env:
+          CIBW_BUILD_FRONTEND: "build"
           CIBW_ARCHS_MACOS: x86_64 arm64 universal2
           CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1'
+          CIBW_CONFIG_SETTINGS: "--no-isolation"
           CIBW_BEFORE_BUILD: pip install -r dev-requirements.txt
           CIBW_SKIP: pp*
       - name: Upload artifacts
diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml
index 2e999729..27dc5d5f 100644
--- a/.github/workflows/run-tests.yml
+++ b/.github/workflows/run-tests.yml
@@ -25,7 +25,8 @@ jobs:
         pip uninstall -y charset-normalizer
     - name: Install the package
       run: |
-        python setup.py install
+        python -m build --no-isolation
+        pip install ./dist/*.whl
     - name: Run tests
       run: |
        pytest
diff --git a/README.md b/README.md
index b7819f97..27736830 100644
--- a/README.md
+++ b/README.md
@@ -25,14 +25,14 @@ This project offers you an alternative to **Universal Charset Encoding Detector*
 
 | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) |
 | ------------- | :-------------: | :------------------: | :------------------: |
-| `Fast` | ❌<br> | :heavy_check_mark:<br> | :heavy_check_mark: <br>|
-| `Universal**` | ❌ | :heavy_check_mark: | ❌ |
-| `Reliable` **without** distinguishable standards | ❌ | :heavy_check_mark: | :heavy_check_mark: |
-| `Reliable` **with** distinguishable standards | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: |
+| `Fast` | ❌<br> | ✅<br> | ✅ <br>|
+| `Universal**` | ❌ | ✅ | ❌ |
+| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ |
+| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ |
_restrictive_ | MIT | MPL-1.1
_restrictive_ | -| `Native Python` | :heavy_check_mark: | :heavy_check_mark: | ❌ | -| `Detect spoken language` | ❌ | :heavy_check_mark: | N/A | -| `UnicodeDecodeError Safety` | ❌ | :heavy_check_mark: | ❌ | +| `Native Python` | ✅ | ✅ | ❌ | +| `Detect spoken language` | ❌ | ✅ | N/A | +| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ | | `Whl Size` | 193.6 kB | 39.5 kB | ~200 kB | | `Supported Encoding` | 33 | :tada: [90](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 diff --git a/dev-requirements.txt b/dev-requirements.txt index 8e77fe94..df125d66 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,10 +1,12 @@ -pytest -pytest-cov -codecov -chardet>=5.0,<5.1 -Flask>=2.0,<3.0 -requests>=2.26,<3.0 -black==22.8.0 -flake8==5.0.4 -mypy==0.971 -isort +black==22.8.0 +flake8==5.0.4 +mypy>=0.970 +Flask==2.2.2 +chardet==5.0.0 +isort==5.10.1 +codecov==2.1.12 +pytest==7.1.3 +pytest-cov==4.0.0 +build==0.8.0 +requests==2.28.1 +wheel==0.37.1 diff --git a/setup.cfg b/setup.cfg index bb4f9c50..8000f5cd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,59 @@ +[metadata] +name = charset-normalizer +description = The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet. +long_description = file: README.md, CHANGELOG.md, LICENSE +long_description_content_type = text/markdown +keywords = encoding, charset, charset-detector, detector, normalization, unicode, chardet, detect +url = https://github.com/Ousret/charset_normalizer +license = MIT +author_email = ahmed.tahri@cloudnursery.dev +author = Ahmed TAHRI +python_requires = >=3.6.0 +project_urls = + Bug Reports = https://github.com/Ousret/charset_normalizer/issues + Documentation = https://charset-normalizer.readthedocs.io/en/latest +classifiers = + Development Status :: 5 - Production/Stable + License :: OSI Approved :: MIT License + Intended Audience :: Developers + Topic :: Software Development :: Libraries :: Python Modules + Operating System :: OS Independent + Programming Language :: Python + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Programming Language :: Python :: Implementation :: PyPy + Topic :: Text Processing :: Linguistic + Topic :: Utilities + Typing :: Typed + +[options.packages.find] +exclude = + tests + *.tests + *.tests.* + tests.* + docs* + data* + +[options.extras_require] +unicode_backport = + +[options.entry_points] +console_scripts = + normalizer = charset_normalizer.cli.normalizer:cli_detect + +[options] +packages = find: +include_package_data = True + +[options.package_data] +charset_normalizer = py.typed + [tool:pytest] addopts = --cov=charset_normalizer --cov-report=term-missing -rxXs @@ -11,4 +67,4 @@ ignore_missing_imports = True [tool:isort] profile = black -combine_as_imports = True \ No newline at end of file +combine_as_imports = True diff --git a/setup.py b/setup.py index 476d65d9..7c64a695 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,17 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import io import os import sys from re import search -from setuptools import find_packages, setup +from setuptools import setup + + +def get_version(): + with open('charset_normalizer/version.py') as version_file: + return search(r"""__version__\s+=\s+(['"])(?P.+?)\1""", + version_file.read()).group('version') 
USE_MYPYC = False @@ -26,81 +31,8 @@ else: MYPYC_MODULES = None - -def get_version(): - with open('charset_normalizer/version.py') as version_file: - return search(r"""__version__\s+=\s+(['"])(?P.+?)\1""", - version_file.read()).group('version') - - -# Package meta-data. -NAME = 'charset-normalizer' -DESCRIPTION = 'The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.' -URL = 'https://github.com/ousret/charset_normalizer' -EMAIL = 'ahmed.tahri@cloudnursery.dev' -AUTHOR = 'Ahmed TAHRI @Ousret' -REQUIRES_PYTHON = '>=3.6.0' -VERSION = get_version() - -REQUIRED = [] - -EXTRAS = { - 'unicode_backport': [] -} - -here = os.path.abspath(os.path.dirname(__file__)) - -try: - with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: - long_description = '\n' + f.read() -except FileNotFoundError: - long_description = DESCRIPTION - setup( - name=NAME, - version=VERSION, - description=DESCRIPTION, - long_description=long_description.replace(':heavy_check_mark:', '✅'), - long_description_content_type='text/markdown', - author=AUTHOR, - author_email=EMAIL, - python_requires=REQUIRES_PYTHON, - url=URL, - keywords=['encoding', 'i18n', 'txt', 'text', 'charset', 'charset-detector', 'normalization', 'unicode', 'chardet'], - packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]), - install_requires=REQUIRED, - extras_require=EXTRAS, - include_package_data=True, - package_data={"charset_normalizer": ["py.typed"]}, - license='MIT', - entry_points={ - 'console_scripts': - [ - 'normalizer = charset_normalizer.cli.normalizer:cli_detect' - ] - }, - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'License :: OSI Approved :: MIT License', - 'Intended Audience :: Developers', - 'Topic :: Software Development :: Libraries :: Python Modules', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Topic :: Text Processing :: Linguistic', - 'Topic :: Utilities', - 'Programming Language :: Python :: Implementation :: PyPy', - 'Typing :: Typed' - ], - project_urls={ - 'Bug Reports': 'https://github.com/Ousret/charset_normalizer/issues', - 'Documentation': 'https://charset-normalizer.readthedocs.io/en/latest', - }, + name="charset-normalizer", + version=get_version(), ext_modules=MYPYC_MODULES ) From b2da4cbd914da2a4e8dc1a3023210f20616d82e3 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sat, 1 Oct 2022 21:06:55 +0200 Subject: [PATCH 40/63] :wrench: Lax on Flask version range (py 3.6) --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index df125d66..ec379eee 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,7 @@ black==22.8.0 flake8==5.0.4 mypy>=0.970 -Flask==2.2.2 +Flask>=2.0.3,<=2.2.2 chardet==5.0.0 isort==5.10.1 codecov==2.1.12 From 95253c85a24204120422266364b020ef9d60e23e Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sat, 1 Oct 2022 21:11:41 +0200 Subject: [PATCH 41/63] :wrench: Lax on pytest version range (py 3.6) --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index ec379eee..c394f232 100644 --- 
a/dev-requirements.txt +++ b/dev-requirements.txt @@ -5,7 +5,7 @@ Flask>=2.0.3,<=2.2.2 chardet==5.0.0 isort==5.10.1 codecov==2.1.12 -pytest==7.1.3 +pytest>=7.0.0,<8.0 pytest-cov==4.0.0 build==0.8.0 requests==2.28.1 From a28be6ba9156d9bd95e95f3f604e6a77519cde26 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sat, 1 Oct 2022 21:13:20 +0200 Subject: [PATCH 42/63] :wrench: Lax on requests version range (py 3.6) --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index c394f232..1760f9f5 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -8,5 +8,5 @@ codecov==2.1.12 pytest>=7.0.0,<8.0 pytest-cov==4.0.0 build==0.8.0 -requests==2.28.1 +requests>=2.27.1,<3.0.0 wheel==0.37.1 From 02969005a82bfc28f10dcc5d4ae4108dd53018ea Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sat, 1 Oct 2022 21:43:43 +0200 Subject: [PATCH 43/63] :fire: remove codeql action --- .github/workflows/codeql-analysis.yml | 56 --------------------------- 1 file changed, 56 deletions(-) delete mode 100644 .github/workflows/codeql-analysis.yml diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml deleted file mode 100644 index 1a7014d5..00000000 --- a/.github/workflows/codeql-analysis.yml +++ /dev/null @@ -1,56 +0,0 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -name: "CodeQL" - -on: - push: - branches: [master, develop] - pull_request: - # The branches below must be a subset of the branches above - branches: [master, develop] - schedule: - - cron: '0 23 * * 0' - -jobs: - analyze: - name: Analyze - runs-on: ubuntu-latest - - strategy: - matrix: - python-version: [3.9] - fail-fast: false - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - with: - # We must fetch at least the immediate parents so that if this is - # a pull request then we can checkout the head. - fetch-depth: 2 - - # If this run was triggered by a pull request event, then checkout - # the head of the pull request instead of the merge commit. - - run: git checkout HEAD^2 - if: ${{ github.event_name == 'pull_request' }} - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v1 - with: - languages: "python" - # If you wish to specify custom queries, you can do so here or in a config file. - # By default, queries listed here will override any specified in a config file. - # Prefix the list here with "+" to use these queries and those in the config file. - # queries: ./path/to/local/query, your-org/your-repo/queries@main - - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
- # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v1 - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v1 From 0e91fb606606ffb4a25f7380c1704b49272c9f10 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sat, 1 Oct 2022 22:16:31 +0200 Subject: [PATCH 44/63] :bug: Fix CLI --normalize opt using fullpath in args --- charset_normalizer/cli/normalizer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py index 70293895..77f351f7 100644 --- a/charset_normalizer/cli/normalizer.py +++ b/charset_normalizer/cli/normalizer.py @@ -1,7 +1,7 @@ import argparse import sys from json import dumps -from os.path import abspath +from os.path import abspath, realpath, dirname, basename, join from platform import python_version from typing import List, Optional from unicodedata import unidata_version @@ -234,7 +234,10 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: my_file.close() continue - o_: List[str] = my_file.name.split(".") + dir_path = dirname(realpath(my_file.name)) + file_name = basename(realpath(my_file.name)) + + o_: List[str] = file_name.split(".") if args.replace is False: o_.insert(-1, best_guess.encoding) @@ -255,7 +258,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: continue try: - x_[0].unicode_path = abspath("./{}".format(".".join(o_))) + x_[0].unicode_path = join(dir_path, ".".join(o_)) with open(x_[0].unicode_path, "w", encoding="utf-8") as fp: fp.write(str(best_guess)) From 5910d20efca877ea32f28ad9b09fddf500f01a6c Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sat, 1 Oct 2022 22:17:14 +0200 Subject: [PATCH 45/63] :heavy_check_mark: Ensure tests run with cibuildwheel --- .github/workflows/python-publish.yml | 2 ++ tests/test_cli.py | 50 ++++++++++++++++------------ tests/test_full_detection.py | 8 ++++- 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 4a4f4e48..2042d90e 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -222,6 +222,8 @@ jobs: CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1' CIBW_CONFIG_SETTINGS: "--no-isolation" CIBW_BEFORE_BUILD: pip install -r dev-requirements.txt + CIBW_TEST_REQUIRES: pytest codecov pytest-cov + CIBW_TEST_COMMAND: pytest {package}/tests CIBW_SKIP: pp* - name: Upload artifacts uses: actions/upload-artifact@v3 diff --git a/tests/test_cli.py b/tests/test_cli.py index 440ce9df..d42bf46b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,7 +2,12 @@ from charset_normalizer.cli.normalizer import cli_detect, query_yes_no from unittest.mock import patch from os.path import exists -from os import remove +from os import remove, path, pardir + +DIR_PATH = path.join( + path.dirname(path.realpath(__file__)), + pardir +) class TestCommandLineInterface(unittest.TestCase): @@ -24,7 +29,7 @@ def test_single_file(self): self.assertEqual( 0, cli_detect( - ['./data/sample-arabic-1.txt'] + [DIR_PATH + '/data/sample-arabic-1.txt'] ) ) @@ -38,16 +43,19 @@ def test_single_file_normalize(self): self.assertEqual( 0, cli_detect( - ['./data/sample-arabic-1.txt', '--normalize'] + [ + DIR_PATH + '/data/sample-arabic-1.txt', + '--normalize' + ] ) ) self.assertTrue( - exists('./data/sample-arabic-1.cp1256.txt') + exists(DIR_PATH + '/data/sample-arabic-1.cp1256.txt') ) try: - 
remove('./data/sample-arabic-1.cp1256.txt') + remove(DIR_PATH + '/data/sample-arabic-1.cp1256.txt') except: pass @@ -55,7 +63,7 @@ def test_single_verbose_file(self): self.assertEqual( 0, cli_detect( - ['./data/sample-arabic-1.txt', '--verbose'] + [DIR_PATH + '/data/sample-arabic-1.txt', '--verbose'] ) ) @@ -64,9 +72,9 @@ def test_multiple_file(self): 0, cli_detect( [ - './data/sample-arabic-1.txt', - './data/sample-french.txt', - './data/sample-chinese.txt' + DIR_PATH + '/data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-french.txt', + DIR_PATH + '/data/sample-chinese.txt' ] ) ) @@ -77,9 +85,9 @@ def test_with_alternative(self): cli_detect( [ '-a', - './data/sample-arabic-1.txt', - './data/sample-french.txt', - './data/sample-chinese.txt' + DIR_PATH + '/data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-french.txt', + DIR_PATH + '/data/sample-chinese.txt' ] ) ) @@ -90,9 +98,9 @@ def test_with_minimal_output(self): cli_detect( [ '-m', - './data/sample-arabic-1.txt', - './data/sample-french.txt', - './data/sample-chinese.txt' + DIR_PATH + '/data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-french.txt', + DIR_PATH + '/data/sample-chinese.txt' ] ) ) @@ -104,9 +112,9 @@ def test_with_minimal_and_alt(self): [ '-m', '-a', - './data/sample-arabic-1.txt', - './data/sample-french.txt', - './data/sample-chinese.txt' + DIR_PATH + '/data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-french.txt', + DIR_PATH + '/data/sample-chinese.txt' ] ) ) @@ -115,7 +123,7 @@ def test_non_existent_file(self): with self.assertRaises(SystemExit) as cm: cli_detect( - ['./data/not_found_data.txt'] + [DIR_PATH + '/data/not_found_data.txt'] ) self.assertEqual(cm.exception.code, 2) @@ -125,7 +133,7 @@ def test_replace_without_normalize(self): self.assertEqual( cli_detect( [ - './data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-arabic-1.txt', '--replace' ] ), @@ -136,7 +144,7 @@ def test_force_replace_without_replace(self): self.assertEqual( cli_detect( [ - './data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-arabic-1.txt', '--force' ] ), diff --git a/tests/test_full_detection.py b/tests/test_full_detection.py index 96e0b797..fd8ac80c 100644 --- a/tests/test_full_detection.py +++ b/tests/test_full_detection.py @@ -1,5 +1,11 @@ from charset_normalizer.api import from_path import pytest +from os import path, pardir + +DIR_PATH = path.join( + path.dirname(path.realpath(__file__)), + pardir +) @pytest.mark.parametrize( @@ -30,7 +36,7 @@ def test_elementary_detection( expected_charset: str, expected_language: str, ): - best_guess = from_path("./data/{}".format(input_data_file)).best() + best_guess = from_path(DIR_PATH + "/data/{}".format(input_data_file)).best() assert best_guess is not None, "Elementary detection has failed upon '{}'".format(input_data_file) assert best_guess.encoding == expected_charset, "Elementary charset detection has failed upon '{}'".format(input_data_file) From 093889b224e64310670c49e0758c061a36ce6a59 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sat, 1 Oct 2022 22:20:34 +0200 Subject: [PATCH 46/63] :art: apply isort on normalizer.py --- charset_normalizer/cli/normalizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py index 77f351f7..ad26b4d0 100644 --- a/charset_normalizer/cli/normalizer.py +++ b/charset_normalizer/cli/normalizer.py @@ -1,7 +1,7 @@ import argparse import sys from json import dumps -from os.path import abspath, realpath, dirname, basename, join +from os.path import 
abspath, basename, dirname, join, realpath from platform import python_version from typing import List, Optional from unicodedata import unidata_version From d0df3f49377992dd3ec32e83bd2538bd03dae52d Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sun, 2 Oct 2022 13:56:19 +0200 Subject: [PATCH 47/63] :sparkle: Extend the capability of explain=True when cp_isolation contain at most two entries, will log in details the Mess-detector results --- CHANGELOG.md | 11 +++++++++++ charset_normalizer/api.py | 6 +++++- charset_normalizer/md.py | 21 +++++++++++++++++++-- 3 files changed, 35 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9dee06a8..f1c893c9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,17 @@ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [3.0.0b3](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0b3) (2022-10-??) + +### Added +- Extend the capability of explain=True when cp_isolation contain at most two entries, will log in details the Mess-detector results + +### Changed +- Build with static metadata using 'build' frontend + +### Fixed +- CLI with opt --normalize fail when using full path for files + ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21) ### Added diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index c2e54356..1edd92f0 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -302,7 +302,11 @@ def from_bytes( ): md_chunks.append(chunk) - md_ratios.append(mess_ratio(chunk, threshold)) + md_ratios.append( + mess_ratio( + chunk, threshold, explain is True and len(cp_isolation) <= 2 + ) + ) if md_ratios[-1] >= threshold: early_stop_count += 1 diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index 31808af8..8c0eb095 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -1,7 +1,12 @@ from functools import lru_cache +from logging import getLogger from typing import List, Optional -from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD +from .constant import ( + COMMON_SAFE_ASCII_CHARACTERS, + TRACE, + UNICODE_SECONDARY_RANGE_KEYWORD, +) from .utils import ( is_accentuated, is_ascii, @@ -547,7 +552,19 @@ def mess_ratio( break if debug: + logger = getLogger("charset_normalizer") + + logger.log( + TRACE, + "Mess-detector extended-analysis start. 
" + f"{intermediary_mean_mess_ratio_calc=} {mean_mess_ratio=} {maximum_threshold=}", + ) + + if len(decoded_sequence) > 16: + logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}") + logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}") + for dt in detectors: # pragma: nocover - print(dt.__class__, dt.ratio) + logger.log(TRACE, f"{dt.__class__}: {dt.ratio}") return round(mean_mess_ratio, 3) From 32cbafeef71ef6988f6815dea51fb48820c6cc55 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sun, 2 Oct 2022 13:56:56 +0200 Subject: [PATCH 48/63] :wrench: run_checks.sh adjust black target lvl py36 --- bin/run_checks.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/run_checks.sh b/bin/run_checks.sh index 0ae730eb..1e135b35 100755 --- a/bin/run_checks.sh +++ b/bin/run_checks.sh @@ -8,7 +8,7 @@ fi set -x ${PREFIX}pytest -${PREFIX}black --check --diff --target-version=py35 charset_normalizer +${PREFIX}black --check --diff --target-version=py36 charset_normalizer ${PREFIX}flake8 charset_normalizer ${PREFIX}mypy charset_normalizer ${PREFIX}isort --check --diff charset_normalizer From b5ef79832c851f40c65c667b2a93962072aa2a9a Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sun, 2 Oct 2022 15:29:22 +0200 Subject: [PATCH 49/63] :ambulance: Fix invalid syntax fstring eq auto format (py 36) --- charset_normalizer/md.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index 8c0eb095..0152f326 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -557,7 +557,7 @@ def mess_ratio( logger.log( TRACE, "Mess-detector extended-analysis start. " - f"{intermediary_mean_mess_ratio_calc=} {mean_mess_ratio=} {maximum_threshold=}", + f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} maximum_threshold={maximum_threshold}", ) if len(decoded_sequence) > 16: From 2cb15cf6380f3795fa98c3ac51c01f28bf6c4eb8 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sun, 2 Oct 2022 15:38:36 +0200 Subject: [PATCH 50/63] Amend commit d0df3f49377992dd3ec32e83bd2538bd03dae52d --- CHANGELOG.md | 2 +- charset_normalizer/api.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1c893c9..b1fce47f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ## [3.0.0b3](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0b3) (2022-10-??) 
### Added -- Extend the capability of explain=True when cp_isolation contain at most two entries, will log in details the Mess-detector results +- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results ### Changed - Build with static metadata using 'build' frontend diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index 1edd92f0..c54fda32 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -304,7 +304,7 @@ def from_bytes( md_ratios.append( mess_ratio( - chunk, threshold, explain is True and len(cp_isolation) <= 2 + chunk, threshold, explain is True and cp_isolation and len(cp_isolation) <= 2 ) ) From 9b4a2095f5a3ad84ec8d4a42dd48b64fae3cec06 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sun, 2 Oct 2022 15:44:52 +0200 Subject: [PATCH 51/63] :art: reformat file (flake8) --- charset_normalizer/api.py | 2 +- charset_normalizer/md.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index c54fda32..dbb2abf9 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -304,7 +304,7 @@ def from_bytes( md_ratios.append( mess_ratio( - chunk, threshold, explain is True and cp_isolation and len(cp_isolation) <= 2 + chunk, threshold, explain is True and 1 <= len(cp_isolation) <= 2 ) ) diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index 0152f326..d62a8bda 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -557,7 +557,8 @@ def mess_ratio( logger.log( TRACE, "Mess-detector extended-analysis start. " - f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} maximum_threshold={maximum_threshold}", + f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} " + f"maximum_threshold={maximum_threshold}", ) if len(decoded_sequence) > 16: From 5e2368ef47d61f73aac353b1859082d58e90f784 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Sun, 2 Oct 2022 17:11:24 +0200 Subject: [PATCH 52/63] :art: reformat api.py --- charset_normalizer/api.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index dbb2abf9..bee14481 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -304,7 +304,9 @@ def from_bytes( md_ratios.append( mess_ratio( - chunk, threshold, explain is True and 1 <= len(cp_isolation) <= 2 + chunk, + threshold, + explain is True and 1 <= len(cp_isolation) <= 2, ) ) From c76a83d5f01552364fe3da9bf3ea3010e491f7bf Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Thu, 6 Oct 2022 17:59:12 +0200 Subject: [PATCH 53/63] :sparkle: Support for alternative language frequency set + :fire: Coherence detector no longer return 'Simple English' instead return 'English' --- CHANGELOG.md | 4 ++ charset_normalizer/assets/__init__.py | 58 ++++++++++++++------------- charset_normalizer/cd.py | 31 +++++++++++++- tests/test_coherence_detection.py | 17 +++++++- 4 files changed, 80 insertions(+), 30 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b1fce47f..089e0bfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Added - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results +- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES ### Changed - Build with static metadata using 'build' frontend @@ -13,6 +14,9 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Fixed - CLI with opt --normalize fail when using full path for files +### Removed +- Coherence detector no longer return 'Simple English' instead return 'English' + ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21) ### Added diff --git a/charset_normalizer/assets/__init__.py b/charset_normalizer/assets/__init__.py index 3c33ba30..029be349 100644 --- a/charset_normalizer/assets/__init__.py +++ b/charset_normalizer/assets/__init__.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from typing import Dict, List +# Language label that contain the em dash "—" +# character are to be considered alternative seq to origin FREQUENCIES: Dict[str, List[str]] = { "English": [ "e", @@ -30,6 +32,34 @@ "z", "q", ], + "English—": [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "m", + "u", + "f", + "p", + "g", + "w", + "b", + "y", + "v", + "k", + "j", + "x", + "z", + "q", + ], "German": [ "e", "n", @@ -956,34 +986,6 @@ "ö", "y", ], - "Simple English": [ - "e", - "a", - "t", - "i", - "o", - "n", - "s", - "r", - "h", - "l", - "d", - "c", - "m", - "u", - "f", - "p", - "g", - "w", - "b", - "y", - "v", - "k", - "j", - "x", - "z", - "q", - ], "Thai": [ "า", "น", diff --git a/charset_normalizer/cd.py b/charset_normalizer/cd.py index ee4b7424..a294257e 100644 --- a/charset_normalizer/cd.py +++ b/charset_normalizer/cd.py @@ -289,6 +289,33 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: return sorted(merge, key=lambda x: x[1], reverse=True) +def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches: + """ + We shall NOT return "English—" in CoherenceMatches because it is an alternative + of "English". This function only keeps the best match and remove the em-dash in it. 
+ """ + index_results: Dict[str, List[float]] = dict() + + for result in results: + language, ratio = result + no_em_name: str = language.replace("—", "") + + if no_em_name not in index_results: + index_results[no_em_name] = [] + + index_results[no_em_name].append(ratio) + + if any(len(index_results[e]) > 1 for e in index_results): + filtered_results: CoherenceMatches = [] + + for language in index_results: + filtered_results.append((language, max(index_results[language]))) + + return filtered_results + + return results + + @lru_cache(maxsize=2048) def coherence_ratio( decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None @@ -336,4 +363,6 @@ def coherence_ratio( if sufficient_match_count >= 3: break - return sorted(results, key=lambda x: x[1], reverse=True) + return sorted( + filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True + ) diff --git a/tests/test_coherence_detection.py b/tests/test_coherence_detection.py index 6ad95927..7e399132 100644 --- a/tests/test_coherence_detection.py +++ b/tests/test_coherence_detection.py @@ -1,5 +1,5 @@ import pytest -from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features +from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features, filter_alt_coherence_matches @pytest.mark.parametrize( @@ -39,3 +39,18 @@ def test_target_features(language, expected_have_accents, expected_pure_latin): assert target_have_accents is expected_have_accents assert target_pure_latin is expected_pure_latin + + +@pytest.mark.parametrize( + "matches, expected_return", + [ + ([("English", 0.88,), ("English—", 0.99)], [("English", 0.99)]), + ([("English", 0.88,), ("English—", 0.99), ("English——", 0.999)], [("English", 0.999)]), + ([("English", 0.88,), ("English—", 0.77)], [("English", 0.88)]), + ([("English", 0.88,), ("Italian", 0.77)], [("English", 0.88), ("Italian", 0.77)]), + ] +) +def test_filter_alt_coherence_matches(matches, expected_return): + results = filter_alt_coherence_matches(matches) + + assert results == expected_return From 70c551a203c3c93593c21b90902de0e059566d6c Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 18 Oct 2022 09:06:18 +0200 Subject: [PATCH 54/63] :sparkle: Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio --- CHANGELOG.md | 1 + charset_normalizer/api.py | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 089e0bfd..230f448a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Added - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results - Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES +- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio ### Changed - Build with static metadata using 'build' frontend diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index bee14481..6c7e8983 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -39,6 +39,7 @@ def from_bytes( cp_exclusion: Optional[List[str]] = None, preemptive_behaviour: bool = True, explain: bool = False, + language_threshold: float = 0.1, ) -> CharsetMatches: """ Given a raw bytes sequence, return the best possibles charset usable to render str objects. @@ -400,7 +401,9 @@ def from_bytes( if encoding_iana != "ascii": for chunk in md_chunks: chunk_languages = coherence_ratio( - chunk, 0.1, ",".join(target_languages) if target_languages else None + chunk, + language_threshold, + ",".join(target_languages) if target_languages else None, ) cd_ratios.append(chunk_languages) @@ -502,6 +505,7 @@ def from_fp( cp_exclusion: Optional[List[str]] = None, preemptive_behaviour: bool = True, explain: bool = False, + language_threshold: float = 0.1, ) -> CharsetMatches: """ Same thing than the function from_bytes but using a file pointer that is already ready. @@ -516,6 +520,7 @@ def from_fp( cp_exclusion, preemptive_behaviour, explain, + language_threshold, ) @@ -528,6 +533,7 @@ def from_path( cp_exclusion: Optional[List[str]] = None, preemptive_behaviour: bool = True, explain: bool = False, + language_threshold: float = 0.1, ) -> CharsetMatches: """ Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode. @@ -543,4 +549,5 @@ def from_path( cp_exclusion, preemptive_behaviour, explain, + language_threshold, ) From 14689be661abbd86769597149db7564b4ce5d899 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 18 Oct 2022 09:07:31 +0200 Subject: [PATCH 55/63] :wrench: Make the language detection stricter Improve the condition on issue #200 --- CHANGELOG.md | 1 + charset_normalizer/assets/__init__.py | 388 +++++++++++++++++++++++--- charset_normalizer/cd.py | 43 ++- tests/test_full_detection.py | 2 +- 4 files changed, 387 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 230f448a..112c8a42 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Changed - Build with static metadata using 'build' frontend +- Make the language detection stricter ### Fixed - CLI with opt --normalize fail when using full path for files diff --git a/charset_normalizer/assets/__init__.py b/charset_normalizer/assets/__init__.py index 029be349..9075930d 100644 --- a/charset_normalizer/assets/__init__.py +++ b/charset_normalizer/assets/__init__.py @@ -256,33 +256,303 @@ "ж", "ц", ], + # Jap-Kanji "Japanese": [ + "人", + "一", + "大", + "亅", + "丁", + "丨", + "竹", + "笑", + "口", + "日", + "今", + "二", + "彳", + "行", + "十", + "土", + "丶", + "寸", + "寺", + "時", + "乙", + "丿", + "乂", + "气", + "気", + "冂", + "巾", + "亠", + "市", + "目", + "儿", + "見", + "八", + "小", + "凵", + "県", + "月", + "彐", + "門", + "間", + "木", + "東", + "山", + "出", + "本", + "中", + "刀", + "分", + "耳", + "又", + "取", + "最", + "言", + "田", + "心", + "思", + "刂", + "前", + "京", + "尹", + "事", + "生", + "厶", + "云", + "会", + "未", + "来", + "白", + "冫", + "楽", + "灬", + "馬", + "尸", + "尺", + "駅", + "明", + "耂", + "者", + "了", + "阝", + "都", + "高", + "卜", + "占", + "厂", + "广", + "店", + "子", + "申", + "奄", + "亻", + "俺", + "上", + "方", + "冖", + "学", + "衣", + "艮", + "食", + "自", + ], + # Jap-Katakana + "Japanese—": [ + "ー", + "ン", + "ス", + "・", + "ル", + "ト", + "リ", + "イ", + "ア", + "ラ", + "ッ", + "ク", + "ド", + "シ", + "レ", + "ジ", + "タ", + "フ", + "ロ", + "カ", + "テ", + "マ", + "ィ", + "グ", + "バ", + "ム", + "プ", + "オ", + "コ", + "デ", + "ニ", + "ウ", + "メ", + "サ", + "ビ", + "ナ", + "ブ", + "ャ", + "エ", + "ュ", + "チ", + "キ", + "ズ", + "ダ", + "パ", + "ミ", + "ェ", + "ョ", + "ハ", + "セ", + "ベ", + "ガ", + "モ", + "ツ", + "ネ", + "ボ", + "ソ", + "ノ", + "ァ", + "ヴ", + "ワ", + "ポ", + "ペ", + "ピ", + "ケ", + "ゴ", + "ギ", + "ザ", + "ホ", + "ゲ", + "ォ", + "ヤ", + "ヒ", + "ユ", + "ヨ", + "ヘ", + "ゼ", + "ヌ", + "ゥ", + "ゾ", + "ヶ", + "ヂ", + "ヲ", + "ヅ", + "ヵ", + "ヱ", + "ヰ", + "ヮ", + "ヽ", + "゠", + "ヾ", + "ヷ", + "ヿ", + "ヸ", + "ヹ", + "ヺ", + ], + # Jap-Hiragana + "Japanese——": [ "の", "に", "る", "た", - "は", - "ー", "と", + "は", "し", + "い", "を", "で", "て", "が", - "い", - "ン", - "れ", "な", - "年", - "ス", - "っ", - "ル", + "れ", "か", "ら", - "あ", "さ", - "も", + "っ", "り", + "す", + "あ", + "も", + "こ", + "ま", + "う", + "く", + "よ", + "き", + "ん", + "め", + "お", + "け", + "そ", + "つ", + "だ", + "や", + "え", + "ど", + "わ", + "ち", + "み", + "せ", + "じ", + "ば", + "へ", + "び", + "ず", + "ろ", + "ほ", + "げ", + "む", + "べ", + "ひ", + "ょ", + "ゆ", + "ぶ", + "ご", + "ゃ", + "ね", + "ふ", + "ぐ", + "ぎ", + "ぼ", + "ゅ", + "づ", + "ざ", + "ぞ", + "ぬ", + "ぜ", + "ぱ", + "ぽ", + "ぷ", + "ぴ", + "ぃ", + "ぁ", + "ぇ", + "ぺ", + "ゞ", + "ぢ", + "ぉ", + "ぅ", + "ゐ", + "ゝ", + "ゑ", + "゛", + "゜", + "ゎ", + "ゔ", + "゚", + "ゟ", + "゙", + "ゕ", + "ゖ", ], "Portuguese": [ "a", @@ -370,6 +640,77 @@ "就", "出", "会", + "可", + "也", + "你", + "对", + "生", + "能", + "而", + "子", + "那", + "得", + "于", + "着", + "下", + "自", + "之", + "年", + "过", + "发", + "后", + "作", + "里", + "用", + "道", + "行", + "所", + "然", + "家", + "种", + "事", + "成", + "方", + "多", + "经", + "么", + "去", + "法", + "学", + "如", + "都", + "同", + "现", + "当", + "没", + "动", + "面", + "起", + "看", + "定", + "天", + "分", + "还", + "进", + "好", + "小", + "部", + "其", + "些", + "主", + "样", + "理", + "心", + "她", + "本", + "前", + "开", + "但", + "因", + "只", + "从", + "想", + "实", ], "Ukrainian": [ "о", @@ -1068,31 +1409,6 @@ "ஒ", "ஸ", ], - "Classical Chinese": [ - "之", - "年", - "為", - "也", - "以", - "一", - "人", - "其", - "者", - "國", - "有", - "二", - "十", - "於", - "曰", - "三", - "不", - "大", - "而", - "子", - "中", - "五", - "四", - ], "Kazakh": [ "а", "ы", diff --git a/charset_normalizer/cd.py b/charset_normalizer/cd.py index a294257e..ae2813fb 100644 --- 
a/charset_normalizer/cd.py +++ b/charset_normalizer/cd.py @@ -105,7 +105,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]: ): return ["Japanese"] if iana_name.startswith("gb") or iana_name in ZH_NAMES: - return ["Chinese", "Classical Chinese"] + return ["Chinese"] if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES: return ["Korean"] @@ -179,22 +179,45 @@ def characters_popularity_compare( character_approved_count: int = 0 FREQUENCIES_language_set = set(FREQUENCIES[language]) - for character in ordered_characters: + ordered_characters_count: int = len(ordered_characters) + target_language_characters_count: int = len(FREQUENCIES[language]) + + large_alphabet: bool = target_language_characters_count > 26 + + for character, character_rank in zip( + ordered_characters, range(0, ordered_characters_count) + ): if character not in FREQUENCIES_language_set: continue + character_rank_in_language: int = FREQUENCIES[language].index(character) + expected_projection_ratio: float = ( + target_language_characters_count / ordered_characters_count + ) + character_rank_projection: int = int(character_rank * expected_projection_ratio) + + if ( + large_alphabet is False + and abs(character_rank_projection - character_rank_in_language) > 4 + ): + continue + + if ( + large_alphabet is True + and abs(character_rank_projection - character_rank_in_language) + < target_language_characters_count / 3 + ): + character_approved_count += 1 + continue + characters_before_source: List[str] = FREQUENCIES[language][ - 0 : FREQUENCIES[language].index(character) + 0:character_rank_in_language ] characters_after_source: List[str] = FREQUENCIES[language][ - FREQUENCIES[language].index(character) : - ] - characters_before: List[str] = ordered_characters[ - 0 : ordered_characters.index(character) - ] - characters_after: List[str] = ordered_characters[ - ordered_characters.index(character) : + character_rank_in_language: ] + characters_before: List[str] = ordered_characters[0:character_rank] + characters_after: List[str] = ordered_characters[character_rank:] before_match_count: int = len( set(characters_before) & set(characters_before_source) diff --git a/tests/test_full_detection.py b/tests/test_full_detection.py index fd8ac80c..adff8801 100644 --- a/tests/test_full_detection.py +++ b/tests/test_full_detection.py @@ -16,7 +16,7 @@ ('sample-arabic.txt', 'utf_8', 'Arabic'), ('sample-russian-3.txt', 'utf_8', 'Russian'), ('sample-french.txt', 'utf_8', 'French'), - ('sample-chinese.txt', 'big5', 'Classical Chinese'), + ('sample-chinese.txt', 'big5', 'Chinese'), ('sample-greek.txt', 'cp1253', 'Greek'), ('sample-greek-2.txt', 'cp1253', 'Greek'), ('sample-hebrew-2.txt', 'cp1255', 'Hebrew'), From 8f91aa4eb6fbb81a1024b2e4c93626e2383b6419 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 18 Oct 2022 09:08:48 +0200 Subject: [PATCH 56/63] :bug: TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it --- CHANGELOG.md | 1 + charset_normalizer/md.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 112c8a42..46e56147 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 
### Fixed - CLI with opt --normalize fail when using full path for files +- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it ### Removed - Coherence detector no longer return 'Simple English' instead return 'English' diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index d62a8bda..56e9321a 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -128,7 +128,7 @@ def reset(self) -> None: # pragma: no cover @property def ratio(self) -> float: - if self._character_count == 0: + if self._character_count == 0 or self._character_count < 8: return 0.0 ratio_of_accentuation: float = self._accentuated_count / self._character_count return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0 From e0010ff55b4d4b553838789576035c45685dceb6 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 18 Oct 2022 09:10:02 +0200 Subject: [PATCH 57/63] :bookmark: Bump version rc1 --- charset_normalizer/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index c05c9cd1..25bf3bcf 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "3.0.0b2" +__version__ = "3.0.0rc1" VERSION = __version__.split(".") From 840a6e08a9f1ac3d64b77b05be6c9ae52451bc87 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 18 Oct 2022 20:27:06 +0200 Subject: [PATCH 58/63] :wrench: Ensure proper version lock (i) ensure build are reproductible (ii) still support python 3.6 --- dev-requirements.txt | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 1760f9f5..1bc2ee26 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,12 +1,24 @@ -black==22.8.0 flake8==5.0.4 -mypy>=0.970 -Flask>=2.0.3,<=2.2.2 chardet==5.0.0 isort==5.10.1 codecov==2.1.12 -pytest>=7.0.0,<8.0 pytest-cov==4.0.0 build==0.8.0 -requests>=2.27.1,<3.0.0 wheel==0.37.1 + +# The vast majority of project dropped Python 3.6 +# This is to ensure build are reproducible >=3.6 +black==22.8.0; python_version < "3.7" +black==22.10.0; python_version >= "3.7" + +mypy==0.982; python_version >= "3.7" +mypy==0.970; python_version < "3.7" + +Flask==2.2.2; python_version >= "3.7" +Flask==2.0.3; python_version < "3.7" + +pytest==7.0.0; python_version < "3.7" +pytest==7.1.3; python_version >= "3.7" + +requests==2.27.1; python_version < "3.7" +requests==2.28.1; python_version >= "3.7" From 9b8b048767901f08c3d7ec40beb0c975d7eda438 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 18 Oct 2022 20:29:04 +0200 Subject: [PATCH 59/63] :wrench: set target-version to py36 black autofix script --- bin/run_autofix.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/run_autofix.sh b/bin/run_autofix.sh index f853cacd..e88f45c6 100755 --- a/bin/run_autofix.sh +++ b/bin/run_autofix.sh @@ -7,5 +7,5 @@ fi set -x -${PREFIX}black --target-version=py35 charset_normalizer +${PREFIX}black --target-version=py36 charset_normalizer ${PREFIX}isort charset_normalizer From 13d9a99617af987227b7f36c8d46152991c1806a Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 18 Oct 2022 20:43:44 +0200 Subject: [PATCH 60/63] :wrench: mypy ver lock for py 3.6 revised --- dev-requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev-requirements.txt b/dev-requirements.txt index 1bc2ee26..91e06b88 100644 --- 
a/dev-requirements.txt +++ b/dev-requirements.txt @@ -12,7 +12,7 @@ black==22.8.0; python_version < "3.7" black==22.10.0; python_version >= "3.7" mypy==0.982; python_version >= "3.7" -mypy==0.970; python_version < "3.7" +mypy==0.971; python_version < "3.7" Flask==2.2.2; python_version >= "3.7" Flask==2.0.3; python_version < "3.7" From f8e1153a0f9a392227f770a97e1f6d8a84a5e22e Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 18 Oct 2022 20:48:05 +0200 Subject: [PATCH 61/63] :pencil: Adjust speedup docs section --- docs/community/speedup.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/community/speedup.rst b/docs/community/speedup.rst index cefc57e4..ea45b297 100644 --- a/docs/community/speedup.rst +++ b/docs/community/speedup.rst @@ -26,6 +26,7 @@ Following those instructions (provided you have the necessary toolchain installe :: export CHARSET_NORMALIZER_USE_MYPYC=1 + pip install mypy build wheel pip install charset-normalizer --no-binary :all: From b15f416535fbfd49120bb2284672ef52616a27e0 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 18 Oct 2022 20:51:50 +0200 Subject: [PATCH 62/63] :pencil: Update CHANGELOG.md rc1 publish --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 46e56147..0bd07b40 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). -## [3.0.0b3](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0b3) (2022-10-??) +## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18) ### Added - Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results From 6367d5343791c8e1e9f54fe3055f80cd41b73ce8 Mon Sep 17 00:00:00 2001 From: Ahmed TAHRI Date: Tue, 18 Oct 2022 20:55:22 +0200 Subject: [PATCH 63/63] :pencil: Missing CHANGELOG entry and add language_threshold to docs::advanced usage --- CHANGELOG.md | 1 + docs/user/advanced_search.rst | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bd07b40..dcfd8f76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Removed - Coherence detector no longer return 'Simple English' instead return 'English' +- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese' ## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21) diff --git a/docs/user/advanced_search.rst b/docs/user/advanced_search.rst index b4441e58..a269cd10 100644 --- a/docs/user/advanced_search.rst +++ b/docs/user/advanced_search.rst @@ -18,7 +18,8 @@ As follow :: cp_isolation=None, # Finite list of encoding to use when searching for a match cp_exclusion=None, # Finite list of encoding to avoid when searching for a match preemptive_behaviour=True, # Determine if we should look into my_byte_str (ASCII-Mode) for pre-defined encoding - explain=False # Print on screen what is happening when searching for a match + explain=False, # Print on screen what is happening when searching for a match + language_threshold=0.1 # Minimum coherence ratio / language ratio match accepted )
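
The two tuning knobs introduced in this series, the extended `explain=True` TRACE output of the Mess-detector (PATCH 47, refined in PATCH 50 and PATCH 51) and the `language_threshold` parameter (PATCH 54), can be combined in a single call. A minimal usage sketch follows; the payload bytes and the `cp_isolation` values are illustrative assumptions, while the parameter names and the 0.1 default come straight from the diffs above ::

    from charset_normalizer import from_bytes

    # Illustrative payload: Bulgarian text encoded with a legacy Cyrillic codepage.
    payload = "Всеки човек има право на образование.".encode("cp1251")

    results = from_bytes(
        payload,
        cp_isolation=["cp1251", "utf_8"],  # 1 <= len(cp_isolation) <= 2, so explain=True
                                           # also emits the Mess-detector TRACE details
        explain=True,                      # log what happens while searching for a match
        language_threshold=0.1,            # minimum accepted coherence ratio (the default)
    )

    best_guess = results.best()  # may be None when no charset fits

    if best_guess is not None:
        print(best_guess.encoding, best_guess.language)  # e.g. "cp1251 Bulgarian"

Raising `language_threshold` above 0.1 makes the coherence layer stricter about which languages it reports for a match; lowering it keeps more marginal candidates in the results.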