diff --git a/.github/workflows/chardet-bc.yml b/.github/workflows/chardet-bc.yml index 0bbeaec8..dfbc64cc 100644 --- a/.github/workflows/chardet-bc.yml +++ b/.github/workflows/chardet-bc.yml @@ -25,7 +25,8 @@ jobs: pip uninstall -y charset-normalizer - name: Install the package run: | - python setup.py install + python -m build + pip install ./dist/*.whl - name: Clone the complete dataset run: | git clone https://github.com/Ousret/char-dataset.git diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml deleted file mode 100644 index 1a7014d5..00000000 --- a/.github/workflows/codeql-analysis.yml +++ /dev/null @@ -1,56 +0,0 @@ -# For most projects, this workflow file will not need changing; you simply need -# to commit it to your repository. -# -# You may wish to alter this file to override the set of languages analyzed, -# or to provide custom queries or build logic. -name: "CodeQL" - -on: - push: - branches: [master, develop] - pull_request: - # The branches below must be a subset of the branches above - branches: [master, develop] - schedule: - - cron: '0 23 * * 0' - -jobs: - analyze: - name: Analyze - runs-on: ubuntu-latest - - strategy: - matrix: - python-version: [3.9] - fail-fast: false - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - with: - # We must fetch at least the immediate parents so that if this is - # a pull request then we can checkout the head. - fetch-depth: 2 - - # If this run was triggered by a pull request event, then checkout - # the head of the pull request instead of the merge commit. - - run: git checkout HEAD^2 - if: ${{ github.event_name == 'pull_request' }} - - # Initializes the CodeQL tools for scanning. - - name: Initialize CodeQL - uses: github/codeql-action/init@v1 - with: - languages: "python" - # If you wish to specify custom queries, you can do so here or in a config file. - # By default, queries listed here will override any specified in a config file. - # Prefix the list here with "+" to use these queries and those in the config file. - # queries: ./path/to/local/query, your-org/your-repo/queries@main - - # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
- # If this step fails, then you should remove it and run the build manually (see below) - - name: Autobuild - uses: github/codeql-action/autobuild@v1 - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v1 diff --git a/.github/workflows/detector-coverage.yml b/.github/workflows/detector-coverage.yml index 19eed9ae..1527f22b 100644 --- a/.github/workflows/detector-coverage.yml +++ b/.github/workflows/detector-coverage.yml @@ -25,7 +25,8 @@ jobs: pip uninstall -y charset-normalizer - name: Install the package run: | - python setup.py install + python -m build + pip install ./dist/*.whl - name: Clone the complete dataset run: | git clone https://github.com/Ousret/char-dataset.git diff --git a/.github/workflows/integration.yml b/.github/workflows/integration.yml index f74a56d2..00aa98eb 100644 --- a/.github/workflows/integration.yml +++ b/.github/workflows/integration.yml @@ -28,7 +28,8 @@ jobs: pip uninstall -y charset-normalizer - name: Install the package run: | - python setup.py install + python -m build + pip install ./dist/*.whl - name: Clone the complete dataset run: | git clone https://github.com/Ousret/char-dataset.git diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 877b890e..4f1f12f4 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -25,7 +25,8 @@ jobs: pip uninstall -y charset-normalizer - name: Install the package run: | - python setup.py install + python -m build + pip install ./dist/*.whl - name: Type checking (Mypy) run: | mypy --strict charset_normalizer diff --git a/.github/workflows/mypyc-verify.yml b/.github/workflows/mypyc-verify.yml new file mode 100644 index 00000000..e9b2a9c7 --- /dev/null +++ b/.github/workflows/mypyc-verify.yml @@ -0,0 +1,40 @@ +name: MYPYC Run + +on: [push, pull_request] + +jobs: + detection_coverage: + runs-on: ${{ matrix.os }} + + strategy: + fail-fast: false + matrix: + python-version: [3.6, 3.7, 3.8, 3.9, "3.10"] + os: [ubuntu-latest] + + steps: + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + pip install -U pip setuptools + pip install -r dev-requirements.txt + pip uninstall -y charset-normalizer + - name: Install the package + env: + CHARSET_NORMALIZER_USE_MYPYC: '1' + run: | + python -m build --no-isolation + pip install ./dist/*.whl + - name: Clone the complete dataset + run: | + git clone https://github.com/Ousret/char-dataset.git + - name: Coverage WITH preemptive + run: | + python ./bin/coverage.py --coverage 97 --with-preemptive + - name: Coverage WITHOUT preemptive + run: | + python ./bin/coverage.py --coverage 95 diff --git a/.github/workflows/performance.yml b/.github/workflows/performance.yml index fddd9d30..e675061a 100644 --- a/.github/workflows/performance.yml +++ b/.github/workflows/performance.yml @@ -25,7 +25,8 @@ jobs: pip uninstall -y charset-normalizer - name: Install the package run: | - python setup.py install + python -m build + pip install ./dist/*.whl - name: Clone the complete dataset run: | git clone https://github.com/Ousret/char-dataset.git diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index d9e664c1..2042d90e 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -29,7 +29,8 @@ jobs: pip uninstall -y charset-normalizer - name: Install the package run: | - python setup.py install + python -m build + pip 
install ./dist/*.whl - name: Type checking (Mypy) run: | mypy charset_normalizer @@ -51,7 +52,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: [ 3.6, 3.7, 3.8, 3.9, "3.10" ] + python-version: [ 3.6, 3.7, 3.8, 3.9, "3.10", "3.11-dev" ] os: [ ubuntu-latest ] steps: @@ -67,7 +68,8 @@ jobs: pip uninstall -y charset-normalizer - name: Install the package run: | - python setup.py install + python -m build + pip install ./dist/*.whl - name: Run tests run: | pytest @@ -96,7 +98,8 @@ jobs: pip uninstall -y charset-normalizer - name: Install the package run: | - python setup.py install + python -m build + pip install ./dist/*.whl - name: Clone the complete dataset run: | git clone https://github.com/Ousret/char-dataset.git @@ -136,7 +139,8 @@ jobs: pip uninstall -y charset-normalizer - name: Install the package run: | - python setup.py install + python -m build + pip install ./dist/*.whl - name: Clone the complete dataset run: | git clone https://github.com/Ousret/char-dataset.git @@ -146,11 +150,92 @@ jobs: - name: Integration Tests with Requests run: | python ./bin/integration.py + universal-wheel: + runs-on: ubuntu-latest + needs: + - integration + steps: + - uses: actions/checkout@v2 + - name: Set up Python + uses: actions/setup-python@v2 + with: + python-version: '3.x' + - name: Update pip, setuptools, wheel and twine + run: | + python -m pip install --upgrade pip + pip install setuptools wheel twine + - name: Build Wheel + env: + CHARSET_NORMALIZER_USE_MYPYC: '0' + run: python -m build + - name: Upload artifacts + uses: actions/upload-artifact@v3 + with: + name: dist + path: dist + + build-wheels: + name: Build wheels on ${{ matrix.os }} ${{ matrix.qemu }} + runs-on: ${{ matrix.os }}-latest + needs: universal-wheel + strategy: + matrix: + os: [ ubuntu, windows, macos ] + qemu: [ '' ] + include: + # Split ubuntu job for the sake of speed-up + - os: ubuntu + qemu: aarch64 + - os: ubuntu + qemu: ppc64le + - os: ubuntu + qemu: s390x + steps: + - name: Checkout + uses: actions/checkout@v3 + with: + submodules: true + - name: Set up QEMU + if: ${{ matrix.qemu }} + uses: docker/setup-qemu-action@v2 + with: + platforms: all + id: qemu + - name: Prepare emulation + run: | + if [[ -n "${{ matrix.qemu }}" ]]; then + # Build emulated architectures only if QEMU is set, + # use default "auto" otherwise + echo "CIBW_ARCHS_LINUX=${{ matrix.qemu }}" >> $GITHUB_ENV + fi + shell: bash + - name: Setup Python + uses: actions/setup-python@v4 + - name: Update pip, wheel, setuptools, build, twine + run: | + python -m pip install -U pip wheel setuptools build twine + - name: Build wheels + uses: pypa/cibuildwheel@2.10.2 + env: + CIBW_BUILD_FRONTEND: "build" + CIBW_ARCHS_MACOS: x86_64 arm64 universal2 + CIBW_ENVIRONMENT: CHARSET_NORMALIZER_USE_MYPYC='1' + CIBW_CONFIG_SETTINGS: "--no-isolation" + CIBW_BEFORE_BUILD: pip install -r dev-requirements.txt + CIBW_TEST_REQUIRES: pytest codecov pytest-cov + CIBW_TEST_COMMAND: pytest {package}/tests + CIBW_SKIP: pp* + - name: Upload artifacts + uses: actions/upload-artifact@v3 + with: + name: dist + path: ./wheelhouse/*.whl + deploy: runs-on: ubuntu-latest needs: - - integration + - build-wheels steps: - uses: actions/checkout@v2 @@ -162,10 +247,17 @@ jobs: run: | python -m pip install --upgrade pip pip install setuptools wheel twine - - name: Build and publish + - name: Download disctributions + uses: actions/download-artifact@v3 + with: + name: dist + path: dist + - name: Collected dists + run: | + tree dist + - name: Publish env: TWINE_USERNAME: ${{ 
secrets.PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} run: | - python setup.py sdist bdist_wheel twine upload dist/* diff --git a/.github/workflows/run-tests.yml b/.github/workflows/run-tests.yml index 2e999729..27dc5d5f 100644 --- a/.github/workflows/run-tests.yml +++ b/.github/workflows/run-tests.yml @@ -25,7 +25,8 @@ jobs: pip uninstall -y charset-normalizer - name: Install the package run: | - python setup.py install + python -m build --no-isolation + pip install ./dist/*.whl - name: Run tests run: | pytest diff --git a/CHANGELOG.md b/CHANGELOG.md index b80e7cd1..dcfd8f76 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,48 @@ All notable changes to charset-normalizer will be documented in this file. This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). +## [3.0.0rc1](https://github.com/Ousret/charset_normalizer/compare/3.0.0b2...3.0.0rc1) (2022-10-18) + +### Added +- Extend the capability of explain=True when cp_isolation contains at most two entries (min one), will log in details of the Mess-detector results +- Support for alternative language frequency set in charset_normalizer.assets.FREQUENCIES +- Add parameter `language_threshold` in `from_bytes`, `from_path` and `from_fp` to adjust the minimum expected coherence ratio + +### Changed +- Build with static metadata using 'build' frontend +- Make the language detection stricter + +### Fixed +- CLI with opt --normalize fail when using full path for files +- TooManyAccentuatedPlugin induce false positive on the mess detection when too few alpha character have been fed to it + +### Removed +- Coherence detector no longer return 'Simple English' instead return 'English' +- Coherence detector no longer return 'Classical Chinese' instead return 'Chinese' + +## [3.0.0b2](https://github.com/Ousret/charset_normalizer/compare/3.0.0b1...3.0.0b2) (2022-08-21) + +### Added +- `normalizer --version` now specify if current version provide extra speedup (meaning mypyc compilation whl) + +### Removed +- Breaking: Method `first()` and `best()` from CharsetMatch +- UTF-7 will no longer appear as "detected" without a recognized SIG/mark (is unreliable/conflict with ASCII) + +### Fixed +- Sphinx warnings when generating the documentation + +## [3.0.0b1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...3.0.0b1) (2022-08-15) + +### Changed +- Optional: Module `md.py` can be compiled using Mypyc to provide an extra speedup up to 4x faster than v2.1 + +### Removed +- Breaking: Class aliases CharsetDetector, CharsetDoctor, CharsetNormalizerMatch and CharsetNormalizerMatches +- Breaking: Top-level function `normalize` +- Breaking: Properties `chaos_secondary_pass`, `coherence_non_latin` and `w_counter` from CharsetMatch +- Support for the backport `unicodedata2` + ## [2.1.1](https://github.com/Ousret/charset_normalizer/compare/2.1.0...2.1.1) (2022-08-19) ### Deprecated diff --git a/README.md b/README.md index d58ede1b..27736830 100644 --- a/README.md +++ b/README.md @@ -25,16 +25,16 @@ This project offers you an alternative to **Universal Charset Encoding Detector* | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) | | ------------- | :-------------: | :------------------: | :------------------: | -| `Fast` | ❌
| :heavy_check_mark:
| :heavy_check_mark:
| -| `Universal**` | ❌ | :heavy_check_mark: | ❌ | -| `Reliable` **without** distinguishable standards | ❌ | :heavy_check_mark: | :heavy_check_mark: | -| `Reliable` **with** distinguishable standards | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| `Fast` | ❌
| ✅
| ✅
| +| `Universal**` | ❌ | ✅ | ❌ | +| `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ | +| `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ | | `License` | LGPL-2.1
_restrictive_ | MIT | MPL-1.1
_restrictive_ | -| `Native Python` | :heavy_check_mark: | :heavy_check_mark: | ❌ | -| `Detect spoken language` | ❌ | :heavy_check_mark: | N/A | -| `UnicodeDecodeError Safety` | ❌ | :heavy_check_mark: | ❌ | +| `Native Python` | ✅ | ✅ | ❌ | +| `Detect spoken language` | ❌ | ✅ | N/A | +| `UnicodeDecodeError Safety` | ❌ | ✅ | ❌ | | `Whl Size` | 193.6 kB | 39.5 kB | ~200 kB | -| `Supported Encoding` | 33 | :tada: [93](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40 +| `Supported Encoding` | 33 | :tada: [90](https://charset-normalizer.readthedocs.io/en/latest/user/support.html#supported-encodings) | 40

Reading Normalized TextCat Reading Text @@ -53,12 +53,12 @@ This package offer better performance than its counterpart Chardet. Here are som | Package | Accuracy | Mean per file (ms) | File per sec (est) | | ------------- | :-------------: | :------------------: | :------------------: | | [chardet](https://github.com/chardet/chardet) | 86 % | 200 ms | 5 file/sec | -| charset-normalizer | **98 %** | **39 ms** | 26 file/sec | +| charset-normalizer | **98 %** | **10 ms** | 100 file/sec | | Package | 99th percentile | 95th percentile | 50th percentile | | ------------- | :-------------: | :------------------: | :------------------: | | [chardet](https://github.com/chardet/chardet) | 1200 ms | 287 ms | 23 ms | -| charset-normalizer | 400 ms | 200 ms | 15 ms | +| charset-normalizer | 100 ms | 50 ms | 5 ms | Chardet's performance on larger file (1MB+) are very poor. Expect huge difference on large payload. @@ -68,9 +68,6 @@ Chardet's performance on larger file (1MB+) are very poor. Expect huge differenc > Keep in mind that the stats are generous and that Chardet accuracy vs our is measured using Chardet initial capability > (eg. Supported Encoding) Challenge-them if you want. -[cchardet](https://github.com/PyYoshi/cChardet) is a non-native (cpp binding) and unmaintained faster alternative with -a better accuracy than chardet but lower than this package. If speed is the most important factor, you should try it. - ## ✨ Installation Using PyPi for latest stable @@ -78,11 +75,6 @@ Using PyPi for latest stable pip install charset-normalizer -U ``` -If you want a more up-to-date `unicodedata` than the one available in your Python setup. -```sh -pip install charset-normalizer[unicode_backport] -U -``` - ## 🚀 Basic Usage ### CLI diff --git a/bin/run_autofix.sh b/bin/run_autofix.sh index f853cacd..e88f45c6 100755 --- a/bin/run_autofix.sh +++ b/bin/run_autofix.sh @@ -7,5 +7,5 @@ fi set -x -${PREFIX}black --target-version=py35 charset_normalizer +${PREFIX}black --target-version=py36 charset_normalizer ${PREFIX}isort charset_normalizer diff --git a/bin/run_checks.sh b/bin/run_checks.sh index 0ae730eb..1e135b35 100755 --- a/bin/run_checks.sh +++ b/bin/run_checks.sh @@ -8,7 +8,7 @@ fi set -x ${PREFIX}pytest -${PREFIX}black --check --diff --target-version=py35 charset_normalizer +${PREFIX}black --check --diff --target-version=py36 charset_normalizer ${PREFIX}flake8 charset_normalizer ${PREFIX}mypy charset_normalizer ${PREFIX}isort --check --diff charset_normalizer diff --git a/charset_normalizer/__init__.py b/charset_normalizer/__init__.py index 2dcaf56f..ebb5da89 100644 --- a/charset_normalizer/__init__.py +++ b/charset_normalizer/__init__.py @@ -21,14 +21,8 @@ """ import logging -from .api import from_bytes, from_fp, from_path, normalize -from .legacy import ( - CharsetDetector, - CharsetDoctor, - CharsetNormalizerMatch, - CharsetNormalizerMatches, - detect, -) +from .api import from_bytes, from_fp, from_path +from .legacy import detect from .models import CharsetMatch, CharsetMatches from .utils import set_logging_handler from .version import VERSION, __version__ @@ -37,14 +31,9 @@ "from_fp", "from_path", "from_bytes", - "normalize", "detect", "CharsetMatch", "CharsetMatches", - "CharsetNormalizerMatch", - "CharsetNormalizerMatches", - "CharsetDetector", - "CharsetDoctor", "__version__", "VERSION", "set_logging_handler", diff --git a/charset_normalizer/api.py b/charset_normalizer/api.py index b6c37e8b..6c7e8983 100644 --- a/charset_normalizer/api.py +++ b/charset_normalizer/api.py @@ -1,7 +1,5 @@ import 
logging -import warnings from os import PathLike -from os.path import basename, splitext from typing import Any, BinaryIO, List, Optional, Set from .cd import ( @@ -41,6 +39,7 @@ def from_bytes( cp_exclusion: Optional[List[str]] = None, preemptive_behaviour: bool = True, explain: bool = False, + language_threshold: float = 0.1, ) -> CharsetMatches: """ Given a raw bytes sequence, return the best possibles charset usable to render str objects. @@ -201,6 +200,13 @@ def from_bytes( encoding_iana, ) continue + if encoding_iana in {"utf_7"} and not bom_or_sig_available: + logger.log( + TRACE, + "Encoding %s won't be tested as-is because detection is unreliable without BOM/SIG.", + encoding_iana, + ) + continue try: is_multi_byte_decoder: bool = is_multi_byte_encoding(encoding_iana) @@ -297,7 +303,13 @@ def from_bytes( ): md_chunks.append(chunk) - md_ratios.append(mess_ratio(chunk, threshold)) + md_ratios.append( + mess_ratio( + chunk, + threshold, + explain is True and 1 <= len(cp_isolation) <= 2, + ) + ) if md_ratios[-1] >= threshold: early_stop_count += 1 @@ -389,7 +401,9 @@ def from_bytes( if encoding_iana != "ascii": for chunk in md_chunks: chunk_languages = coherence_ratio( - chunk, 0.1, ",".join(target_languages) if target_languages else None + chunk, + language_threshold, + ",".join(target_languages) if target_languages else None, ) cd_ratios.append(chunk_languages) @@ -491,6 +505,7 @@ def from_fp( cp_exclusion: Optional[List[str]] = None, preemptive_behaviour: bool = True, explain: bool = False, + language_threshold: float = 0.1, ) -> CharsetMatches: """ Same thing than the function from_bytes but using a file pointer that is already ready. @@ -505,6 +520,7 @@ def from_fp( cp_exclusion, preemptive_behaviour, explain, + language_threshold, ) @@ -517,6 +533,7 @@ def from_path( cp_exclusion: Optional[List[str]] = None, preemptive_behaviour: bool = True, explain: bool = False, + language_threshold: float = 0.1, ) -> CharsetMatches: """ Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode. @@ -532,53 +549,5 @@ def from_path( cp_exclusion, preemptive_behaviour, explain, + language_threshold, ) - - -def normalize( - path: "PathLike[Any]", - steps: int = 5, - chunk_size: int = 512, - threshold: float = 0.20, - cp_isolation: Optional[List[str]] = None, - cp_exclusion: Optional[List[str]] = None, - preemptive_behaviour: bool = True, -) -> CharsetMatch: - """ - Take a (text-based) file path and try to create another file next to it, this time using UTF-8. 
- """ - warnings.warn( - "normalize is deprecated and will be removed in 3.0", - DeprecationWarning, - ) - - results = from_path( - path, - steps, - chunk_size, - threshold, - cp_isolation, - cp_exclusion, - preemptive_behaviour, - ) - - filename = basename(path) - target_extensions = list(splitext(filename)) - - if len(results) == 0: - raise IOError( - 'Unable to normalize "{}", no encoding charset seems to fit.'.format( - filename - ) - ) - - result = results.best() - - target_extensions[0] += "-" + result.encoding # type: ignore - - with open( - "{}".format(str(path).replace(filename, "".join(target_extensions))), "wb" - ) as fp: - fp.write(result.output()) # type: ignore - - return result # type: ignore diff --git a/charset_normalizer/assets/__init__.py b/charset_normalizer/assets/__init__.py index 3c33ba30..9075930d 100644 --- a/charset_normalizer/assets/__init__.py +++ b/charset_normalizer/assets/__init__.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from typing import Dict, List +# Language label that contain the em dash "—" +# character are to be considered alternative seq to origin FREQUENCIES: Dict[str, List[str]] = { "English": [ "e", @@ -30,6 +32,34 @@ "z", "q", ], + "English—": [ + "e", + "a", + "t", + "i", + "o", + "n", + "s", + "r", + "h", + "l", + "d", + "c", + "m", + "u", + "f", + "p", + "g", + "w", + "b", + "y", + "v", + "k", + "j", + "x", + "z", + "q", + ], "German": [ "e", "n", @@ -226,33 +256,303 @@ "ж", "ц", ], + # Jap-Kanji "Japanese": [ + "人", + "一", + "大", + "亅", + "丁", + "丨", + "竹", + "笑", + "口", + "日", + "今", + "二", + "彳", + "行", + "十", + "土", + "丶", + "寸", + "寺", + "時", + "乙", + "丿", + "乂", + "气", + "気", + "冂", + "巾", + "亠", + "市", + "目", + "儿", + "見", + "八", + "小", + "凵", + "県", + "月", + "彐", + "門", + "間", + "木", + "東", + "山", + "出", + "本", + "中", + "刀", + "分", + "耳", + "又", + "取", + "最", + "言", + "田", + "心", + "思", + "刂", + "前", + "京", + "尹", + "事", + "生", + "厶", + "云", + "会", + "未", + "来", + "白", + "冫", + "楽", + "灬", + "馬", + "尸", + "尺", + "駅", + "明", + "耂", + "者", + "了", + "阝", + "都", + "高", + "卜", + "占", + "厂", + "广", + "店", + "子", + "申", + "奄", + "亻", + "俺", + "上", + "方", + "冖", + "学", + "衣", + "艮", + "食", + "自", + ], + # Jap-Katakana + "Japanese—": [ + "ー", + "ン", + "ス", + "・", + "ル", + "ト", + "リ", + "イ", + "ア", + "ラ", + "ッ", + "ク", + "ド", + "シ", + "レ", + "ジ", + "タ", + "フ", + "ロ", + "カ", + "テ", + "マ", + "ィ", + "グ", + "バ", + "ム", + "プ", + "オ", + "コ", + "デ", + "ニ", + "ウ", + "メ", + "サ", + "ビ", + "ナ", + "ブ", + "ャ", + "エ", + "ュ", + "チ", + "キ", + "ズ", + "ダ", + "パ", + "ミ", + "ェ", + "ョ", + "ハ", + "セ", + "ベ", + "ガ", + "モ", + "ツ", + "ネ", + "ボ", + "ソ", + "ノ", + "ァ", + "ヴ", + "ワ", + "ポ", + "ペ", + "ピ", + "ケ", + "ゴ", + "ギ", + "ザ", + "ホ", + "ゲ", + "ォ", + "ヤ", + "ヒ", + "ユ", + "ヨ", + "ヘ", + "ゼ", + "ヌ", + "ゥ", + "ゾ", + "ヶ", + "ヂ", + "ヲ", + "ヅ", + "ヵ", + "ヱ", + "ヰ", + "ヮ", + "ヽ", + "゠", + "ヾ", + "ヷ", + "ヿ", + "ヸ", + "ヹ", + "ヺ", + ], + # Jap-Hiragana + "Japanese——": [ "の", "に", "る", "た", - "は", - "ー", "と", + "は", "し", + "い", "を", "で", "て", "が", - "い", - "ン", - "れ", "な", - "年", - "ス", - "っ", - "ル", + "れ", "か", "ら", - "あ", "さ", - "も", + "っ", "り", + "す", + "あ", + "も", + "こ", + "ま", + "う", + "く", + "よ", + "き", + "ん", + "め", + "お", + "け", + "そ", + "つ", + "だ", + "や", + "え", + "ど", + "わ", + "ち", + "み", + "せ", + "じ", + "ば", + "へ", + "び", + "ず", + "ろ", + "ほ", + "げ", + "む", + "べ", + "ひ", + "ょ", + "ゆ", + "ぶ", + "ご", + "ゃ", + "ね", + "ふ", + "ぐ", + "ぎ", + "ぼ", + "ゅ", + "づ", + "ざ", + "ぞ", + "ぬ", + "ぜ", + "ぱ", + "ぽ", + "ぷ", + "ぴ", + "ぃ", + "ぁ", + "ぇ", + "ぺ", + "ゞ", + "ぢ", + "ぉ", + "ぅ", + "ゐ", 
+ "ゝ", + "ゑ", + "゛", + "゜", + "ゎ", + "ゔ", + "゚", + "ゟ", + "゙", + "ゕ", + "ゖ", ], "Portuguese": [ "a", @@ -340,6 +640,77 @@ "就", "出", "会", + "可", + "也", + "你", + "对", + "生", + "能", + "而", + "子", + "那", + "得", + "于", + "着", + "下", + "自", + "之", + "年", + "过", + "发", + "后", + "作", + "里", + "用", + "道", + "行", + "所", + "然", + "家", + "种", + "事", + "成", + "方", + "多", + "经", + "么", + "去", + "法", + "学", + "如", + "都", + "同", + "现", + "当", + "没", + "动", + "面", + "起", + "看", + "定", + "天", + "分", + "还", + "进", + "好", + "小", + "部", + "其", + "些", + "主", + "样", + "理", + "心", + "她", + "本", + "前", + "开", + "但", + "因", + "只", + "从", + "想", + "实", ], "Ukrainian": [ "о", @@ -956,34 +1327,6 @@ "ö", "y", ], - "Simple English": [ - "e", - "a", - "t", - "i", - "o", - "n", - "s", - "r", - "h", - "l", - "d", - "c", - "m", - "u", - "f", - "p", - "g", - "w", - "b", - "y", - "v", - "k", - "j", - "x", - "z", - "q", - ], "Thai": [ "า", "น", @@ -1066,31 +1409,6 @@ "ஒ", "ஸ", ], - "Classical Chinese": [ - "之", - "年", - "為", - "也", - "以", - "一", - "人", - "其", - "者", - "國", - "有", - "二", - "十", - "於", - "曰", - "三", - "不", - "大", - "而", - "子", - "中", - "五", - "四", - ], "Kazakh": [ "а", "ы", diff --git a/charset_normalizer/cd.py b/charset_normalizer/cd.py index ee4b7424..ae2813fb 100644 --- a/charset_normalizer/cd.py +++ b/charset_normalizer/cd.py @@ -105,7 +105,7 @@ def mb_encoding_languages(iana_name: str) -> List[str]: ): return ["Japanese"] if iana_name.startswith("gb") or iana_name in ZH_NAMES: - return ["Chinese", "Classical Chinese"] + return ["Chinese"] if iana_name.startswith("iso2022_kr") or iana_name in KO_NAMES: return ["Korean"] @@ -179,22 +179,45 @@ def characters_popularity_compare( character_approved_count: int = 0 FREQUENCIES_language_set = set(FREQUENCIES[language]) - for character in ordered_characters: + ordered_characters_count: int = len(ordered_characters) + target_language_characters_count: int = len(FREQUENCIES[language]) + + large_alphabet: bool = target_language_characters_count > 26 + + for character, character_rank in zip( + ordered_characters, range(0, ordered_characters_count) + ): if character not in FREQUENCIES_language_set: continue + character_rank_in_language: int = FREQUENCIES[language].index(character) + expected_projection_ratio: float = ( + target_language_characters_count / ordered_characters_count + ) + character_rank_projection: int = int(character_rank * expected_projection_ratio) + + if ( + large_alphabet is False + and abs(character_rank_projection - character_rank_in_language) > 4 + ): + continue + + if ( + large_alphabet is True + and abs(character_rank_projection - character_rank_in_language) + < target_language_characters_count / 3 + ): + character_approved_count += 1 + continue + characters_before_source: List[str] = FREQUENCIES[language][ - 0 : FREQUENCIES[language].index(character) + 0:character_rank_in_language ] characters_after_source: List[str] = FREQUENCIES[language][ - FREQUENCIES[language].index(character) : - ] - characters_before: List[str] = ordered_characters[ - 0 : ordered_characters.index(character) - ] - characters_after: List[str] = ordered_characters[ - ordered_characters.index(character) : + character_rank_in_language: ] + characters_before: List[str] = ordered_characters[0:character_rank] + characters_after: List[str] = ordered_characters[character_rank:] before_match_count: int = len( set(characters_before) & set(characters_before_source) @@ -289,6 +312,33 @@ def merge_coherence_ratios(results: List[CoherenceMatches]) -> CoherenceMatches: return sorted(merge, 
key=lambda x: x[1], reverse=True) +def filter_alt_coherence_matches(results: CoherenceMatches) -> CoherenceMatches: + """ + We shall NOT return "English—" in CoherenceMatches because it is an alternative + of "English". This function only keeps the best match and remove the em-dash in it. + """ + index_results: Dict[str, List[float]] = dict() + + for result in results: + language, ratio = result + no_em_name: str = language.replace("—", "") + + if no_em_name not in index_results: + index_results[no_em_name] = [] + + index_results[no_em_name].append(ratio) + + if any(len(index_results[e]) > 1 for e in index_results): + filtered_results: CoherenceMatches = [] + + for language in index_results: + filtered_results.append((language, max(index_results[language]))) + + return filtered_results + + return results + + @lru_cache(maxsize=2048) def coherence_ratio( decoded_sequence: str, threshold: float = 0.1, lg_inclusion: Optional[str] = None @@ -336,4 +386,6 @@ def coherence_ratio( if sufficient_match_count >= 3: break - return sorted(results, key=lambda x: x[1], reverse=True) + return sorted( + filter_alt_coherence_matches(results), key=lambda x: x[1], reverse=True + ) diff --git a/charset_normalizer/cli/normalizer.py b/charset_normalizer/cli/normalizer.py index b8b652a5..ad26b4d0 100644 --- a/charset_normalizer/cli/normalizer.py +++ b/charset_normalizer/cli/normalizer.py @@ -1,15 +1,12 @@ import argparse import sys from json import dumps -from os.path import abspath +from os.path import abspath, basename, dirname, join, realpath from platform import python_version from typing import List, Optional +from unicodedata import unidata_version -try: - from unicodedata2 import unidata_version -except ImportError: - from unicodedata import unidata_version - +import charset_normalizer.md as md_module from charset_normalizer import from_fp from charset_normalizer.models import CliDetectionResult from charset_normalizer.version import __version__ @@ -124,8 +121,11 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: parser.add_argument( "--version", action="version", - version="Charset-Normalizer {} - Python {} - Unicode {}".format( - __version__, python_version(), unidata_version + version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format( + __version__, + python_version(), + unidata_version, + "OFF" if md_module.__file__.lower().endswith(".py") else "ON", ), help="Show version information and exit.", ) @@ -234,7 +234,10 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: my_file.close() continue - o_: List[str] = my_file.name.split(".") + dir_path = dirname(realpath(my_file.name)) + file_name = basename(realpath(my_file.name)) + + o_: List[str] = file_name.split(".") if args.replace is False: o_.insert(-1, best_guess.encoding) @@ -255,7 +258,7 @@ def cli_detect(argv: Optional[List[str]] = None) -> int: continue try: - x_[0].unicode_path = abspath("./{}".format(".".join(o_))) + x_[0].unicode_path = join(dir_path, ".".join(o_)) with open(x_[0].unicode_path, "w", encoding="utf-8") as fp: fp.write(str(best_guess)) diff --git a/charset_normalizer/constant.py b/charset_normalizer/constant.py index e679f79c..3188108d 100644 --- a/charset_normalizer/constant.py +++ b/charset_normalizer/constant.py @@ -489,8 +489,6 @@ KO_NAMES: Set[str] = {"johab", "cp949", "euc_kr"} ZH_NAMES: Set[str] = {"big5", "cp950", "big5hkscs", "hz"} -NOT_PRINTABLE_PATTERN = re_compile(r"[0-9\W\n\r\t]+") - LANGUAGE_SUPPORTED_COUNT: int = len(FREQUENCIES) # Logging LEVEL below DEBUG diff --git 
a/charset_normalizer/legacy.py b/charset_normalizer/legacy.py index cdebe2b8..b266d176 100644 --- a/charset_normalizer/legacy.py +++ b/charset_normalizer/legacy.py @@ -1,9 +1,7 @@ -import warnings from typing import Dict, Optional, Union -from .api import from_bytes, from_fp, from_path, normalize +from .api import from_bytes from .constant import CHARDET_CORRESPONDENCE -from .models import CharsetMatch, CharsetMatches def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]: @@ -43,53 +41,3 @@ def detect(byte_str: bytes) -> Dict[str, Optional[Union[str, float]]]: "language": language, "confidence": confidence, } - - -class CharsetNormalizerMatch(CharsetMatch): - pass - - -class CharsetNormalizerMatches(CharsetMatches): - @staticmethod - def from_fp(*args, **kwargs): # type: ignore - warnings.warn( # pragma: nocover - "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " - "and scheduled to be removed in 3.0", - DeprecationWarning, - ) - return from_fp(*args, **kwargs) # pragma: nocover - - @staticmethod - def from_bytes(*args, **kwargs): # type: ignore - warnings.warn( # pragma: nocover - "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " - "and scheduled to be removed in 3.0", - DeprecationWarning, - ) - return from_bytes(*args, **kwargs) # pragma: nocover - - @staticmethod - def from_path(*args, **kwargs): # type: ignore - warnings.warn( # pragma: nocover - "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " - "and scheduled to be removed in 3.0", - DeprecationWarning, - ) - return from_path(*args, **kwargs) # pragma: nocover - - @staticmethod - def normalize(*args, **kwargs): # type: ignore - warnings.warn( # pragma: nocover - "staticmethod from_fp, from_bytes, from_path and normalize are deprecated " - "and scheduled to be removed in 3.0", - DeprecationWarning, - ) - return normalize(*args, **kwargs) # pragma: nocover - - -class CharsetDetector(CharsetNormalizerMatches): - pass - - -class CharsetDoctor(CharsetNormalizerMatches): - pass diff --git a/charset_normalizer/md.py b/charset_normalizer/md.py index 31808af8..56e9321a 100644 --- a/charset_normalizer/md.py +++ b/charset_normalizer/md.py @@ -1,7 +1,12 @@ from functools import lru_cache +from logging import getLogger from typing import List, Optional -from .constant import COMMON_SAFE_ASCII_CHARACTERS, UNICODE_SECONDARY_RANGE_KEYWORD +from .constant import ( + COMMON_SAFE_ASCII_CHARACTERS, + TRACE, + UNICODE_SECONDARY_RANGE_KEYWORD, +) from .utils import ( is_accentuated, is_ascii, @@ -123,7 +128,7 @@ def reset(self) -> None: # pragma: no cover @property def ratio(self) -> float: - if self._character_count == 0: + if self._character_count == 0 or self._character_count < 8: return 0.0 ratio_of_accentuation: float = self._accentuated_count / self._character_count return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0 @@ -547,7 +552,20 @@ def mess_ratio( break if debug: + logger = getLogger("charset_normalizer") + + logger.log( + TRACE, + "Mess-detector extended-analysis start. 
" + f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} " + f"maximum_threshold={maximum_threshold}", + ) + + if len(decoded_sequence) > 16: + logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}") + logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}") + for dt in detectors: # pragma: nocover - print(dt.__class__, dt.ratio) + logger.log(TRACE, f"{dt.__class__}: {dt.ratio}") return round(mean_mess_ratio, 3) diff --git a/charset_normalizer/models.py b/charset_normalizer/models.py index bc16bfb6..7f8ca389 100644 --- a/charset_normalizer/models.py +++ b/charset_normalizer/models.py @@ -1,22 +1,9 @@ -import warnings -from collections import Counter from encodings.aliases import aliases from hashlib import sha256 from json import dumps -from re import sub -from typing import ( - Any, - Counter as TypeCounter, - Dict, - Iterator, - List, - Optional, - Tuple, - Union, -) - -from .constant import NOT_PRINTABLE_PATTERN, TOO_BIG_SEQUENCE -from .md import mess_ratio +from typing import Any, Dict, Iterator, List, Optional, Tuple, Union + +from .constant import TOO_BIG_SEQUENCE from .utils import iana_name, is_multi_byte_encoding, unicode_range @@ -78,45 +65,6 @@ def __lt__(self, other: object) -> bool: def multi_byte_usage(self) -> float: return 1.0 - len(str(self)) / len(self.raw) - @property - def chaos_secondary_pass(self) -> float: - """ - Check once again chaos in decoded text, except this time, with full content. - Use with caution, this can be very slow. - Notice: Will be removed in 3.0 - """ - warnings.warn( - "chaos_secondary_pass is deprecated and will be removed in 3.0", - DeprecationWarning, - ) - return mess_ratio(str(self), 1.0) - - @property - def coherence_non_latin(self) -> float: - """ - Coherence ratio on the first non-latin language detected if ANY. - Notice: Will be removed in 3.0 - """ - warnings.warn( - "coherence_non_latin is deprecated and will be removed in 3.0", - DeprecationWarning, - ) - return 0.0 - - @property - def w_counter(self) -> TypeCounter[str]: - """ - Word counter instance on decoded text. - Notice: Will be removed in 3.0 - """ - warnings.warn( - "w_counter is deprecated and will be removed in 3.0", DeprecationWarning - ) - - string_printable_only = sub(NOT_PRINTABLE_PATTERN, " ", str(self).lower()) - - return Counter(string_printable_only.split()) - def __str__(self) -> str: # Lazy Str Loading if self._string is None: @@ -252,18 +200,6 @@ def could_be_from_charset(self) -> List[str]: """ return [self._encoding] + [m.encoding for m in self._leaves] - def first(self) -> "CharsetMatch": - """ - Kept for BC reasons. Will be removed in 3.0. - """ - return self - - def best(self) -> "CharsetMatch": - """ - Kept for BC reasons. Will be removed in 3.0. - """ - return self - def output(self, encoding: str = "utf_8") -> bytes: """ Method to get re-encoded bytes payload using given target encoding. Default to UTF-8. diff --git a/charset_normalizer/utils.py b/charset_normalizer/utils.py index 859f212b..425d8365 100644 --- a/charset_normalizer/utils.py +++ b/charset_normalizer/utils.py @@ -1,12 +1,6 @@ -try: - # WARNING: unicodedata2 support is going to be removed in 3.0 - # Python is quickly catching up. 
- import unicodedata2 as unicodedata -except ImportError: - import unicodedata # type: ignore[no-redef] - import importlib import logging +import unicodedata from codecs import IncrementalDecoder from encodings.aliases import aliases from functools import lru_cache diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py index 64c0dbde..25bf3bcf 100644 --- a/charset_normalizer/version.py +++ b/charset_normalizer/version.py @@ -2,5 +2,5 @@ Expose version """ -__version__ = "2.1.1" +__version__ = "3.0.0rc1" VERSION = __version__.split(".") diff --git a/dev-requirements.txt b/dev-requirements.txt index 8e77fe94..91e06b88 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,10 +1,24 @@ -pytest -pytest-cov -codecov -chardet>=5.0,<5.1 -Flask>=2.0,<3.0 -requests>=2.26,<3.0 -black==22.8.0 -flake8==5.0.4 -mypy==0.971 -isort +flake8==5.0.4 +chardet==5.0.0 +isort==5.10.1 +codecov==2.1.12 +pytest-cov==4.0.0 +build==0.8.0 +wheel==0.37.1 + +# The vast majority of project dropped Python 3.6 +# This is to ensure build are reproducible >=3.6 +black==22.8.0; python_version < "3.7" +black==22.10.0; python_version >= "3.7" + +mypy==0.982; python_version >= "3.7" +mypy==0.971; python_version < "3.7" + +Flask==2.2.2; python_version >= "3.7" +Flask==2.0.3; python_version < "3.7" + +pytest==7.0.0; python_version < "3.7" +pytest==7.1.3; python_version >= "3.7" + +requests==2.27.1; python_version < "3.7" +requests==2.28.1; python_version >= "3.7" diff --git a/docs/api.rst b/docs/api.rst index 47a985e5..48b74951 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -14,11 +14,9 @@ Those functions are publicly exposed and are protected through our BC guarantee. .. autofunction:: from_fp .. autofunction:: from_path -.. autofunction:: normalize - -.. autoclass:: charset_normalizer.CharsetMatches +.. autoclass:: charset_normalizer.models.CharsetMatches :inherited-members: -.. autoclass:: charset_normalizer.CharsetMatch +.. autoclass:: charset_normalizer.models.CharsetMatch :inherited-members: .. autofunction:: detect @@ -99,3 +97,8 @@ Some reusable functions used across the project. We do not guarantee the BC in t .. autofunction:: charset_normalizer.utils.range_scan .. autofunction:: charset_normalizer.utils.is_cp_similar + + +.. class:: os.PathLike + + Used as a generic way to accept AnyStr for paths. diff --git a/docs/community/speedup.rst b/docs/community/speedup.rst new file mode 100644 index 00000000..ea45b297 --- /dev/null +++ b/docs/community/speedup.rst @@ -0,0 +1,43 @@ +Optional speedup extension +=========================== + +Why? +------- + +charset-normalizer will always remain pure Python, meaning that a environment without any build-capabilities will +run this program without any additional requirements. + +Nonetheless, starting from the version 3.0 we introduce and publish some platform specific wheels including a +pre-build extension. + +Most of the time is spent in the module `md.py` so we decided to "compile it" using Mypyc. + +(1) It does not require to have a separate code base +(2) Our project code base is rather simple and lightweight +(3) Mypyc is robust enough today +(4) Four times faster! + +How? +------- + +If your platform and/or architecture is not served by this swift optimization you may compile it easily yourself. +Following those instructions (provided you have the necessary toolchain installed): + + :: + + export CHARSET_NORMALIZER_USE_MYPYC=1 + pip install mypy build wheel + pip install charset-normalizer --no-binary :all: + + +How not to? 
+----------- + +You may install charset-normalizer without any specific (pre-built wheel) by directly using the universal wheel +(most likely hosted on PyPi or any valid mirror you use) + + :: + + pip install charset-normalizer --no-binary :all: + +Directly. diff --git a/docs/community/why_migrate.rst b/docs/community/why_migrate.rst index 717fc3b5..1909c770 100644 --- a/docs/community/why_migrate.rst +++ b/docs/community/why_migrate.rst @@ -4,13 +4,13 @@ Why should I migrate to Charset-Normalizer? There is so many reason to migrate your current project. Here are some of them: - Remove ANY license ambiguity/restriction for projects bundling Chardet (even indirectly). -- X5 faster than Chardet in average and X3 faster in 99% of the cases AND support 3 times more encoding. +- X10 faster than Chardet in average and X6 faster in 99% of the cases AND support 3 times more encoding. - Never return a encoding if not suited for the given decoder. Eg. Never get UnicodeDecodeError! - Actively maintained, open to contributors. - Have the backward compatible function ``detect`` that come from Chardet. - Truly detect the language used in the text. - It is, for the first time, really universal! As there is no specific probe per charset. -- The package size is X4 lower than Chardet's (5.0)! +- The package size is X2~X4 lower than Chardet's (5.0)! (Depends on your arch) - Propose much more options/public kwargs to tweak the detection as you sees fit! - Using static typing to ease your development. - Detect Unicode content better than Chardet or cChardet does. diff --git a/docs/conf.py b/docs/conf.py index 5cfe028b..3e675d42 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -81,7 +81,7 @@ # # This is also used if you do content translation via gettext catalogs. # Usually you set "language" from the command line for these cases. -language = None +language = "en" # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. @@ -113,7 +113,7 @@ # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = [] # -- Options for HTMLHelp output ------------------------------------------ diff --git a/docs/index.rst b/docs/index.rst index 2398a7f0..05d5f98a 100755 --- a/docs/index.rst +++ b/docs/index.rst @@ -13,7 +13,6 @@ It aims to be as generic as possible. .. image:: https://repository-images.githubusercontent.com/200259335/d3da9600-dedc-11e9-83e8-081f597505df :width: 500px - :scale: 100 % :alt: CLI Charset Normalizer :align: right @@ -72,6 +71,7 @@ Community Guide .. 
toctree:: :maxdepth: 2 + community/speedup community/faq community/why_migrate diff --git a/docs/user/advanced_search.rst b/docs/user/advanced_search.rst index b4441e58..a269cd10 100644 --- a/docs/user/advanced_search.rst +++ b/docs/user/advanced_search.rst @@ -18,7 +18,8 @@ As follow :: cp_isolation=None, # Finite list of encoding to use when searching for a match cp_exclusion=None, # Finite list of encoding to avoid when searching for a match preemptive_behaviour=True, # Determine if we should look into my_byte_str (ASCII-Mode) for pre-defined encoding - explain=False # Print on screen what is happening when searching for a match + explain=False, # Print on screen what is happening when searching for a match + language_threshold=0.1 # Minimum coherence ratio / language ratio match accepted ) diff --git a/docs/user/support.rst b/docs/user/support.rst index 8b624933..0dbf06b9 100644 --- a/docs/user/support.rst +++ b/docs/user/support.rst @@ -92,13 +92,10 @@ mac_iceland maciceland mac_latin2 maccentraleurope, maclatin2 mac_roman macintosh, macroman mac_turkish macturkish -mbcs ansi, dbcs ptcp154 csptcp154, pt154, cp154, cyrillic_asian -rot_13 rot13 shift_jis csshiftjis, shiftjis, sjis, s_jis, x_mac_japanese shift_jis_2004 shiftjis2004, sjis_2004, s_jis_2004 shift_jisx0213 shiftjisx0213, sjisx0213, s_jisx0213 -tactis tis260 tis_620 tis620, tis_620_0, tis_620_2529_0, tis_620_2529_1, iso_ir_166 utf_16 u16, utf16 utf_16_be unicodebigunmarked, utf_16be @@ -107,9 +104,11 @@ utf_32 u32, utf32 utf_32_be utf_32be utf_32_le utf_32le utf_8 u8, utf, utf8, utf8_ucs2, utf8_ucs4 (+utf_8_sig) -utf_7 u7, unicode-1-1-utf-7 +utf_7* u7, unicode-1-1-utf-7 =============== =============================================================================================================================== +*: Only if a SIG/mark is found. + ------------------- Supported Languages ------------------- diff --git a/setup.cfg b/setup.cfg index bb4f9c50..8000f5cd 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,3 +1,59 @@ +[metadata] +name = charset-normalizer +description = The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet. 
+long_description = file: README.md, CHANGELOG.md, LICENSE +long_description_content_type = text/markdown +keywords = encoding, charset, charset-detector, detector, normalization, unicode, chardet, detect +url = https://github.com/Ousret/charset_normalizer +license = MIT +author_email = ahmed.tahri@cloudnursery.dev +author = Ahmed TAHRI +python_requires = >=3.6.0 +project_urls = + Bug Reports = https://github.com/Ousret/charset_normalizer/issues + Documentation = https://charset-normalizer.readthedocs.io/en/latest +classifiers = + Development Status :: 5 - Production/Stable + License :: OSI Approved :: MIT License + Intended Audience :: Developers + Topic :: Software Development :: Libraries :: Python Modules + Operating System :: OS Independent + Programming Language :: Python + Programming Language :: Python :: 3 + Programming Language :: Python :: 3.6 + Programming Language :: Python :: 3.7 + Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 + Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 + Programming Language :: Python :: Implementation :: PyPy + Topic :: Text Processing :: Linguistic + Topic :: Utilities + Typing :: Typed + +[options.packages.find] +exclude = + tests + *.tests + *.tests.* + tests.* + docs* + data* + +[options.extras_require] +unicode_backport = + +[options.entry_points] +console_scripts = + normalizer = charset_normalizer.cli.normalizer:cli_detect + +[options] +packages = find: +include_package_data = True + +[options.package_data] +charset_normalizer = py.typed + [tool:pytest] addopts = --cov=charset_normalizer --cov-report=term-missing -rxXs @@ -11,4 +67,4 @@ ignore_missing_imports = True [tool:isort] profile = black -combine_as_imports = True \ No newline at end of file +combine_as_imports = True diff --git a/setup.py b/setup.py index 298d12be..7c64a695 100644 --- a/setup.py +++ b/setup.py @@ -1,11 +1,11 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import io import os +import sys from re import search -from setuptools import find_packages, setup +from setuptools import setup def get_version(): @@ -14,73 +14,25 @@ def get_version(): version_file.read()).group('version') -# Package meta-data. -NAME = 'charset-normalizer' -DESCRIPTION = 'The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet.' 
-URL = 'https://github.com/ousret/charset_normalizer' -EMAIL = 'ahmed.tahri@cloudnursery.dev' -AUTHOR = 'Ahmed TAHRI @Ousret' -REQUIRES_PYTHON = '>=3.6.0' -VERSION = get_version() +USE_MYPYC = False -REQUIRED = [] +if len(sys.argv) > 1 and sys.argv[1] == "--use-mypyc": + sys.argv.pop(1) + USE_MYPYC = True +if os.getenv("CHARSET_NORMALIZER_USE_MYPYC", None) == "1": + USE_MYPYC = True -EXTRAS = { - 'unicode_backport': ['unicodedata2'] -} +if USE_MYPYC: + from mypyc.build import mypycify -here = os.path.abspath(os.path.dirname(__file__)) - -try: - with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f: - long_description = '\n' + f.read() -except FileNotFoundError: - long_description = DESCRIPTION + MYPYC_MODULES = mypycify([ + "charset_normalizer/md.py" + ]) +else: + MYPYC_MODULES = None setup( - name=NAME, - version=VERSION, - description=DESCRIPTION, - long_description=long_description.replace(':heavy_check_mark:', '✅'), - long_description_content_type='text/markdown', - author=AUTHOR, - author_email=EMAIL, - python_requires=REQUIRES_PYTHON, - url=URL, - keywords=['encoding', 'i18n', 'txt', 'text', 'charset', 'charset-detector', 'normalization', 'unicode', 'chardet'], - packages=find_packages(exclude=["tests", "*.tests", "*.tests.*", "tests.*"]), - install_requires=REQUIRED, - extras_require=EXTRAS, - include_package_data=True, - package_data={"charset_normalizer": ["py.typed"]}, - license='MIT', - entry_points={ - 'console_scripts': - [ - 'normalizer = charset_normalizer.cli.normalizer:cli_detect' - ] - }, - classifiers=[ - 'Development Status :: 5 - Production/Stable', - 'License :: OSI Approved :: MIT License', - 'Intended Audience :: Developers', - 'Topic :: Software Development :: Libraries :: Python Modules', - 'Operating System :: OS Independent', - 'Programming Language :: Python', - 'Programming Language :: Python :: 3', - 'Programming Language :: Python :: 3.6', - 'Programming Language :: Python :: 3.7', - 'Programming Language :: Python :: 3.8', - 'Programming Language :: Python :: 3.9', - 'Programming Language :: Python :: 3.10', - 'Programming Language :: Python :: 3.11', - 'Topic :: Text Processing :: Linguistic', - 'Topic :: Utilities', - 'Programming Language :: Python :: Implementation :: PyPy', - 'Typing :: Typed' - ], - project_urls={ - 'Bug Reports': 'https://github.com/Ousret/charset_normalizer/issues', - 'Documentation': 'https://charset-normalizer.readthedocs.io/en/latest', - }, + name="charset-normalizer", + version=get_version(), + ext_modules=MYPYC_MODULES ) diff --git a/tests/test_cli.py b/tests/test_cli.py index 16601750..d42bf46b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -2,7 +2,12 @@ from charset_normalizer.cli.normalizer import cli_detect, query_yes_no from unittest.mock import patch from os.path import exists -from os import remove +from os import remove, path, pardir + +DIR_PATH = path.join( + path.dirname(path.realpath(__file__)), + pardir +) class TestCommandLineInterface(unittest.TestCase): @@ -24,24 +29,33 @@ def test_single_file(self): self.assertEqual( 0, cli_detect( - ['./data/sample-arabic-1.txt'] + [DIR_PATH + '/data/sample-arabic-1.txt'] ) ) + def test_version_output_success(self): + with self.assertRaises(SystemExit): + cli_detect( + ['--version'] + ) + def test_single_file_normalize(self): self.assertEqual( 0, cli_detect( - ['./data/sample-arabic-1.txt', '--normalize'] + [ + DIR_PATH + '/data/sample-arabic-1.txt', + '--normalize' + ] ) ) self.assertTrue( - exists('./data/sample-arabic-1.cp1256.txt') + 
exists(DIR_PATH + '/data/sample-arabic-1.cp1256.txt') ) try: - remove('./data/sample-arabic-1.cp1256.txt') + remove(DIR_PATH + '/data/sample-arabic-1.cp1256.txt') except: pass @@ -49,7 +63,7 @@ def test_single_verbose_file(self): self.assertEqual( 0, cli_detect( - ['./data/sample-arabic-1.txt', '--verbose'] + [DIR_PATH + '/data/sample-arabic-1.txt', '--verbose'] ) ) @@ -58,9 +72,9 @@ def test_multiple_file(self): 0, cli_detect( [ - './data/sample-arabic-1.txt', - './data/sample-french.txt', - './data/sample-chinese.txt' + DIR_PATH + '/data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-french.txt', + DIR_PATH + '/data/sample-chinese.txt' ] ) ) @@ -71,9 +85,9 @@ def test_with_alternative(self): cli_detect( [ '-a', - './data/sample-arabic-1.txt', - './data/sample-french.txt', - './data/sample-chinese.txt' + DIR_PATH + '/data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-french.txt', + DIR_PATH + '/data/sample-chinese.txt' ] ) ) @@ -84,9 +98,9 @@ def test_with_minimal_output(self): cli_detect( [ '-m', - './data/sample-arabic-1.txt', - './data/sample-french.txt', - './data/sample-chinese.txt' + DIR_PATH + '/data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-french.txt', + DIR_PATH + '/data/sample-chinese.txt' ] ) ) @@ -98,9 +112,9 @@ def test_with_minimal_and_alt(self): [ '-m', '-a', - './data/sample-arabic-1.txt', - './data/sample-french.txt', - './data/sample-chinese.txt' + DIR_PATH + '/data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-french.txt', + DIR_PATH + '/data/sample-chinese.txt' ] ) ) @@ -109,7 +123,7 @@ def test_non_existent_file(self): with self.assertRaises(SystemExit) as cm: cli_detect( - ['./data/not_found_data.txt'] + [DIR_PATH + '/data/not_found_data.txt'] ) self.assertEqual(cm.exception.code, 2) @@ -119,7 +133,7 @@ def test_replace_without_normalize(self): self.assertEqual( cli_detect( [ - './data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-arabic-1.txt', '--replace' ] ), @@ -130,7 +144,7 @@ def test_force_replace_without_replace(self): self.assertEqual( cli_detect( [ - './data/sample-arabic-1.txt', + DIR_PATH + '/data/sample-arabic-1.txt', '--force' ] ), diff --git a/tests/test_coherence_detection.py b/tests/test_coherence_detection.py index 6ad95927..7e399132 100644 --- a/tests/test_coherence_detection.py +++ b/tests/test_coherence_detection.py @@ -1,5 +1,5 @@ import pytest -from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features +from charset_normalizer.cd import encoding_languages, mb_encoding_languages, is_multi_byte_encoding, get_target_features, filter_alt_coherence_matches @pytest.mark.parametrize( @@ -39,3 +39,18 @@ def test_target_features(language, expected_have_accents, expected_pure_latin): assert target_have_accents is expected_have_accents assert target_pure_latin is expected_pure_latin + + +@pytest.mark.parametrize( + "matches, expected_return", + [ + ([("English", 0.88,), ("English—", 0.99)], [("English", 0.99)]), + ([("English", 0.88,), ("English—", 0.99), ("English——", 0.999)], [("English", 0.999)]), + ([("English", 0.88,), ("English—", 0.77)], [("English", 0.88)]), + ([("English", 0.88,), ("Italian", 0.77)], [("English", 0.88), ("Italian", 0.77)]), + ] +) +def test_filter_alt_coherence_matches(matches, expected_return): + results = filter_alt_coherence_matches(matches) + + assert results == expected_return diff --git a/tests/test_full_detection.py b/tests/test_full_detection.py index 96e0b797..adff8801 100644 --- a/tests/test_full_detection.py +++ 
b/tests/test_full_detection.py @@ -1,5 +1,11 @@ from charset_normalizer.api import from_path import pytest +from os import path, pardir + +DIR_PATH = path.join( + path.dirname(path.realpath(__file__)), + pardir +) @pytest.mark.parametrize( @@ -10,7 +16,7 @@ ('sample-arabic.txt', 'utf_8', 'Arabic'), ('sample-russian-3.txt', 'utf_8', 'Russian'), ('sample-french.txt', 'utf_8', 'French'), - ('sample-chinese.txt', 'big5', 'Classical Chinese'), + ('sample-chinese.txt', 'big5', 'Chinese'), ('sample-greek.txt', 'cp1253', 'Greek'), ('sample-greek-2.txt', 'cp1253', 'Greek'), ('sample-hebrew-2.txt', 'cp1255', 'Hebrew'), @@ -30,7 +36,7 @@ def test_elementary_detection( expected_charset: str, expected_language: str, ): - best_guess = from_path("./data/{}".format(input_data_file)).best() + best_guess = from_path(DIR_PATH + "/data/{}".format(input_data_file)).best() assert best_guess is not None, "Elementary detection has failed upon '{}'".format(input_data_file) assert best_guess.encoding == expected_charset, "Elementary charset detection has failed upon '{}'".format(input_data_file) diff --git a/tests/test_normalize_fp.py b/tests/test_normalize_fp.py deleted file mode 100644 index e2ce364a..00000000 --- a/tests/test_normalize_fp.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest -from charset_normalizer import normalize -from os.path import exists -from os import unlink - - -def test_normalize_fp_creation(): - guesses = normalize( - "./data/sample-arabic-1.txt" - ) - - predicted_path = "./data/sample-arabic-1-{}.txt".format(guesses.best().encoding) - path_exist = exists( - "./data/sample-arabic-1-{}.txt".format(guesses.best().encoding) - ) - - assert path_exist is True - - if path_exist: - unlink(predicted_path)
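A few usage notes on the API surface changed by this patch. `from_bytes`, `from_path` and `from_fp` gain a `language_threshold` keyword (default 0.1) that is forwarded to `coherence_ratio` in place of the previously hard-coded 0.1. A minimal sketch of tightening language detection with it — the sample payload and the 0.3 value are illustrative, not taken from the patch:

```python
from charset_normalizer import from_bytes

payload = "Всеки човек има право на образование.".encode("cp1251")

# Raising language_threshold above its 0.1 default makes the coherence
# (language) detector stricter: a chunk must match a language frequency
# table more convincingly before it contributes to the final guess.
results = from_bytes(payload, language_threshold=0.3)

best_guess = results.best()  # CharsetMatches.best() is still part of the 3.0 API
if best_guess is not None:
    print(best_guess.encoding, best_guess.language)
```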
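The extended `explain=True` behaviour only applies when `cp_isolation` holds one or two encodings; in that case `mess_ratio` is called with its debug flag set and each mess-detector plugin logs its ratio at TRACE level. A short sketch, with an illustrative sample text:

```python
from charset_normalizer import from_bytes

# With explain=True and cp_isolation restricted to at most two encodings,
# the mess detector emits per-plugin TRACE details for every analysed chunk.
from_bytes(
    "Été comme hiver, l'encodage reste détectable.".encode("cp1252"),
    cp_isolation=["cp1252", "utf_8"],
    explain=True,
)
```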
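`filter_alt_coherence_matches` exists because the frequency table now carries em-dash suffixed alternatives ("English—", "Japanese—", "Japanese——"); the filter folds them back onto the base language and keeps the best ratio. The behaviour, as pinned down by the new test in tests/test_coherence_detection.py:

```python
from charset_normalizer.cd import filter_alt_coherence_matches

matches = [("English", 0.88), ("English—", 0.99)]

# The alternative entry is merged into its base language, which keeps the
# highest ratio observed across all of its variants.
print(filter_alt_coherence_matches(matches))  # [('English', 0.99)]
```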
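Whether the mypyc speedup is active can be checked outside the CLI with the same test `normalizer --version` now performs: a compiled wheel ships `charset_normalizer.md` as an extension module rather than a plain `.py` file.

```python
import charset_normalizer.md as md_module

# A mypyc-built wheel provides md as a compiled extension (.so / .pyd);
# if the module file still ends with ".py", the pure-Python code path is in use.
speedup_enabled = not md_module.__file__.lower().endswith(".py")
print("SpeedUp", "ON" if speedup_enabled else "OFF")
```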
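Finally, since `normalize()` and the `CharsetNormalizerMatches` aliases are removed, 2.x callers can rebuild the same behaviour from the surviving API: `from_path`, `CharsetMatches.best()` and `CharsetMatch.output()`. A hedged sketch — the helper name, error message and file-naming scheme are illustrative, not a drop-in restoration of the deleted code:

```python
from os.path import basename, splitext

from charset_normalizer import from_path


def utf8_copy(path: str) -> str:
    """Write a UTF-8 re-encoded copy of `path` next to it and return the new path."""
    result = from_path(path).best()

    if result is None:
        raise IOError(f"Unable to detect a suitable encoding for {path!r}")

    stem, extension = splitext(basename(path))
    target = path.replace(basename(path), f"{stem}-{result.encoding}{extension}")

    with open(target, "wb") as fp:
        fp.write(result.output())  # output() re-encodes, defaulting to UTF-8

    return target
```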