From cfa2fda7cc9adba1bdeae9d6bc8977b17f83c6c9 Mon Sep 17 00:00:00 2001 From: TAHRI Ahmed R Date: Fri, 11 Oct 2019 14:58:15 +0200 Subject: [PATCH] Add more reliable numbers/stats (#24) * some numbers on how well it performs * some numbers on how well it performs * Update README.md * diff between ftfy and chardet --- README.md | 10 +- paper/README.md | 343 ++++++++++++++++++++++++++++++++++++++++++++++ paper/features.py | 283 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 635 insertions(+), 1 deletion(-) create mode 100644 paper/README.md create mode 100644 paper/features.py diff --git a/README.md b/README.md index 3745b164..68bd7023 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ This project offers you an alternative to **Universal Charset Encoding Detector* | Feature | [Chardet](https://github.com/chardet/chardet) | Charset Normalizer | [cChardet](https://github.com/PyYoshi/cChardet) | | ------------- | :-------------: | :------------------: | :------------------: | -| `Fast` | ❌
| ✅
| ✅
⚡ | +| `Fast` | ❌
| ❌
| ✅
| | `Universal**` | ❌ | ✅ | ❌ | | `Reliable` **without** distinguishable standards | ❌ | ✅ | ✅ | | `Reliable` **with** distinguishable standards | ✅ | ✅ | ✅ | @@ -45,6 +45,12 @@ This project offers you an alternative to **Universal Charset Encoding Detector* | `Detect spoken language` | ❌ | ✅ | N/A | | `Supported Encoding` | 30 | :tada: [90](https://charset-normalizer.readthedocs.io/en/latest/support.html) | 40 +| Package | Accuracy | Mean per file (ns) | File per sec (est) | +| ------------- | :-------------: | :------------------: | :------------------: | +| [chardet](https://github.com/chardet/chardet) | 93.5 % | 126 081 168 ns | 7.931 file/sec | +| [cchardet](https://github.com/PyYoshi/cChardet) | 97.0 % | 1 668 145 ns | **599.468 file/sec** | +| charset-normalizer | **97.25 %** | 209 503 253 ns | 4.773 file/sec | +

Reading Normalized TextCat Reading Text @@ -119,6 +125,8 @@ What I want is to get readable text, the best I can. In a way, **I'm brute forcing text decoding.** How cool is that ? 😎 +Don't confuse the **ftfy** package with charset-normalizer or chardet. ftfy's goal is to repair Unicode strings, whereas charset-normalizer's goal is to convert a raw file in an unknown encoding to Unicode. + ## 🍰 How - Discard all charset encoding table that could not fit the binary content. diff --git a/paper/README.md b/paper/README.md new file mode 100644 index 00000000..68f67c80 --- /dev/null +++ b/paper/README.md @@ -0,0 +1,343 @@ +## State of the art + +pre-requisite +------------- + + - chardet + - cchardet + - charset-normalizer + +These tests were run on charset-normalizer (1.3.1), chardet (3.0.4) and cchardet (2.1.4). + + - files used in the Python chardet tests + - files used in the Python charset-normalizer tests + - script features.py + +how it is computed +------------------ + + - `from time import perf_counter_ns` for performance measurement + - reading the file before measuring, so file I/O is not counted + - using the folder name as the expected encoding (assuming it is the right one) + - when the guessed encoding is not equal to the target encoding, trying to verify whether it matters + +*if it matters*: if a package guessed cp1252 instead of cp1254, the bytes are decoded using both and the output strings are compared. +If they are equal, then it does not matter. 
+ +run it yourself +--------------- + +```sh +git clone https://github.com/ousret/charset_normalizer.git +cd ./charset_normalizer/paper/ +python features.py +``` + +## Global results + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | +| ------------- | :-------------: | :------------------: | :------------------: | +| [chardet](https://github.com/chardet/chardet) | 93.5 % | 126 081 168 ns | 7.931 file/sec | +| [cchardet](https://github.com/PyYoshi/cChardet) | 97.0 % | 1 668 145 ns | **599.468 file/sec** | +| charset-normalizer | **97.25 %** | 209 503 253 ns | 4.773 file/sec | + +### Per encoding + +ascii +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 201824 | 4954.812 | | +| cchardet | 100.0 | 53517 | 18685.651 | | +| charset-normalizer | 75.0 | 48069575 | 20.803 | | + +big5 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 187318613 | 5.338 | | +| cchardet | 100.0 | 420464 | 2378.325 | | +| charset-normalizer | 100.0 | 29030648 | 34.446 | | + +cp932 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 281405894 | 3.554 | | +| cchardet | 0.0 | 448856 | 2227.886 | shift_jis | +| charset-normalizer | 100.0 | 407520555 | 2.454 | | + +cp949 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 235318707 | 4.25 | | +| cchardet | 50.0 | 275061 | 
3635.557 | euc_kr | +| charset-normalizer | 100.0 | 283099072 | 3.532 | | + + +euc_jp +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 285472520 | 3.503 | | +| cchardet | 96.55 | 152344 | 6564.092 | cp1252 | +| charset-normalizer | 96.55 | 223891487 | 4.466 | cp1252 | + +euc_kr +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 206658004 | 4.839 | | +| cchardet | 100.0 | 379409 | 2635.678 | | +| charset-normalizer | 96.88 | 237868022 | 4.204 | gb2312 | + +gb2312 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 204280068 | 4.895 | | +| cchardet | 75.0 | 406609 | 2459.365 | gb18030 | +| charset-normalizer | 90.0 | 267700099 | 3.736 | euc_jis_2004, big5hkscs | + + +cp855 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 48681383 | 20.542 | | +| cchardet | 100.0 | 3109405 | 321.605 | | +| charset-normalizer | 100.0 | 131318072 | 7.615 | | + +cp866 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 34948580 | 28.613 | | +| cchardet | 100.0 | 2120040 | 471.689 | | +| charset-normalizer | 100.0 | 134351478 | 7.443 | | + +iso2022_jp +----- + +| Package | Accuracy | Mean per file (ns) | 
File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 487661 | 2050.605 | | +| cchardet | 100.0 | 11730 | 85251.492 | | +| charset-normalizer | 100.0 | 53079390 | 18.84 | | + + +iso2022_kr +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 80494 | 12423.286 | | +| cchardet | 100.0 | 8798 | 113662.196 | | +| charset-normalizer | 100.0 | 28608808 | 34.954 | | + + +latin_1 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 10655475 | 93.848 | | +| cchardet | 100.0 | 158693 | 6301.475 | | +| charset-normalizer | 100.0 | 219482041 | 4.556 | | + + +iso8859_2 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 9.09 | 59510154 | 16.804 | latin_1 | +| cchardet | 100.0 | 1072225 | 932.64 | | +| charset-normalizer | 100.0 | 364305515 | 2.745 | | + + +iso8859_5 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 41274746 | 24.228 | | +| cchardet | 100.0 | 1931337 | 517.776 | | +| charset-normalizer | 100.0 | 235748017 | 4.242 | | + + +iso8859_6 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| 
chardet | 0.0 | 8770859 | 114.014 | mac_cyrillic | +| cchardet | 100.0 | 248962 | 4016.677 | | +| charset-normalizer | 100.0 | 58873846 | 16.985 | | + +iso8859_7 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 90.91 | 26917723 | 37.15 | cp1253 | +| cchardet | 100.0 | 475691 | 2102.205 | | +| charset-normalizer | 100.0 | 219005790 | 4.566 | | + +iso8859_9 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 0.0 | 16149994 | 61.92 | latin_1 | +| cchardet | 100.0 | 303994 | 3289.539 | | +| charset-normalizer | 33.33 | 313328579 | 3.192 | cp1252 | + +koi8_r +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 48102976 | 20.789 | | +| cchardet | 100.0 | 4506055 | 221.924 | | +| charset-normalizer | 100.0 | 285609163 | 3.501 | | + + +mac_cyrillic +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 39849823 | 25.094 | | +| cchardet | 100.0 | 2161876 | 462.561 | | +| charset-normalizer | 100.0 | 273638883 | 3.654 | | + + +shift_jis +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 261271302 | 3.827 | | +| cchardet | 100.0 | 199649 | 5008.79 | | +| charset-normalizer | 96.67 | 394586122 | 2.534 | cp932 | + + 
+tis_620 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 89417757 | 11.183 | | +| cchardet | 100.0 | 2988084 | 334.663 | | +| charset-normalizer | 100.0 | 367614610 | 2.72 | | + + +utf_16 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 24339 | 41086.322 | | +| cchardet | 100.0 | 5718 | 174886.324 | | +| charset-normalizer | 100.0 | 99274930 | 10.073 | | + + +utf_32 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 39001 | 25640.368 | | +| cchardet | 100.0 | 9383 | 106575.722 | | +| charset-normalizer | 100.0 | 181745938 | 5.502 | | + + +utf_8 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 86177619 | 11.604 | | +| cchardet | 100.0 | 53297 | 18762.782 | | +| charset-normalizer | 100.0 | 275800555 | 3.626 | | + + +cp1250 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 16.67 | 570631080 | 1.752 | cp1252, latin_1 | +| cchardet | 100.0 | 12933035 | 77.321 | | +| charset-normalizer | 100.0 | 86223475 | 11.598 | | + + +cp1251 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: 
| :------------------: | :------------------: | +| chardet | 100.0 | 39826493 | 25.109 | | +| cchardet | 100.0 | 2292002 | 436.3 | | +| charset-normalizer | 100.0 | 68469359 | 14.605 | | + + +cp1252 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 28613134 | 34.949 | | +| cchardet | 50.0 | 330630 | 3024.529 | iso8859_7, iso8859_9 | +| charset-normalizer | 50.0 | 200121270 | 4.997 | cp437 | + + +cp1253 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 66.67 | 47471533 | 21.065 | iso8859_7 | +| cchardet | 100.0 | 2930658 | 341.22 | | +| charset-normalizer | 100.0 | 168027309 | 5.951 | | + + +cp1254 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 0.0 | 3659364 | 273.272 | latin_1 | +| cchardet | 100.0 | 84747 | 11799.828 | | +| charset-normalizer | 0.0 | 77320217 | 12.933 | cp1252 | + + +cp1255 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 100.0 | 93093102 | 10.742 | | +| cchardet | 100.0 | 2764209 | 361.767 | | +| charset-normalizer | 100.0 | 119266978 | 8.385 | | + +cp1256 +----- + +| Package | Accuracy | Mean per file (ns) | File per sec (est) | Confuse it with (sometime) | +| ------------- | :-------------: | :------------------: | :------------------: | :------------------: | +| chardet | 0.0 | 437728948 | 2.285 | mac_cyrillic | +| cchardet | 100.0 | 14977980 | 66.765 | 
| +| charset-normalizer | 100.0 | 336711209 | 2.97 | | + diff --git a/paper/features.py b/paper/features.py new file mode 100644 index 00000000..8078bb29 --- /dev/null +++ b/paper/features.py @@ -0,0 +1,283 @@ +from loguru import logger + +from charset_normalizer.normalizer import CharsetNormalizerMatches as CnM +from chardet import detect as chardet_detect +from cchardet import detect as cchardet_detect + +from charset_normalizer.version import __version__ as charset_normalizer_ver +from chardet import __version__ as chardet_version +from cchardet import __version__ as cchardet_version + +from time import perf_counter_ns, sleep +from prettytable import PrettyTable +from encodings.aliases import aliases + +from os.path import basename +from glob import glob +from statistics import mean + +report = { + + 'chardet': { + 'success': 0, + 'failure': 0, + 'performances': list(), + 'encodings': dict() + }, + + 'cchardet': { + 'success': 0, + 'failure': 0, + 'performances': list(), + 'encodings': dict() + }, + + 'charset-normalizer': { + 'success': 0, + 'failure': 0, + 'performances': list(), + 'encodings': dict() + }, + +} + + +def also_could_be(encoding): + if encoding == 'uhc': # for cchardet. 
UHC ==> EUC_KR + return 'euc_kr' + if 'sig' in encoding: + encoding = encoding.replace('_sig', '') + for a, b in aliases.items(): + if a == encoding: + return b + return encoding + + +def does_it_matter(raw_bytes, detected_encoding, should_be_encoding): + """ + :param bytes raw_bytes: + :param str detected_encoding: + :param str should_be_encoding: + :return: + """ + if detected_encoding == should_be_encoding: + return False + try: + return raw_bytes.decode(detected_encoding) != raw_bytes.decode(should_be_encoding) + except UnicodeDecodeError: + return True + + +if __name__ == '__main__': + + logger.warning( + "Running feature benchmark on charset-normalizer ({v1}), chardet ({v2}) and cchardet ({v3}).", + v1=charset_normalizer_ver, + v2=chardet_version, + v3=cchardet_version + ) + + files_queue = glob('../data/chardet/**/*') + + for path in files_queue: + + target_encoding_dir = path.split('\\')[-2].lower().replace('-', '_') + + if target_encoding_dir.startswith('windows_'): + target_encoding_dir = '_'.join(target_encoding_dir.split('_')[:2]) + if target_encoding_dir.startswith('iso'): + target_encoding_dir = 'iso{n1}_{n2}'.format(n1=target_encoding_dir.split('_')[1], + n2=target_encoding_dir.split('_')[2]) + should_be_encoding = None + + for a, b in aliases.items(): + if a == target_encoding_dir: + should_be_encoding = b + break + elif b == target_encoding_dir: + should_be_encoding = target_encoding_dir + break + + if should_be_encoding is None: + logger.warning('{encoding} could not be identified in encodings.aliases.') + continue + + if should_be_encoding not in report['charset-normalizer']['encodings'].keys(): + for competitor in report.keys(): + report[competitor]['encodings'][should_be_encoding] = { + 'performances': list(), + 'success': 0, + 'failure': 0, + 'confuse_it_with': set() + } + + logger.info('File "{filename}" is going to be tested. 
File owner claim it is encoded with {target_encoding}.', filename=basename(path), target_encoding=should_be_encoding) + + before_reading_file = perf_counter_ns() + + raw_content = open(path, 'rb').read() + + after_reading_file = perf_counter_ns() + + logger.info('"{filename}" was loaded within {ts} nanoseconds.', filename=basename(path), ts=(after_reading_file-before_reading_file)) + + before_chardet_decide = perf_counter_ns() + + chardet_result = chardet_detect(raw_content) + + after_chardet_decide = perf_counter_ns() + + logger.info('"{filename}" content was identified within {ts} nanoseconds by CHARDET.', filename=basename(path), + ts=(after_chardet_decide - before_chardet_decide)) + + report['chardet']['performances'].append(after_chardet_decide - before_chardet_decide) + report['chardet']['encodings'][should_be_encoding]['performances'].append(after_chardet_decide - before_chardet_decide) + + if chardet_result.get('encoding') is None: + logger.warning('"{filename}" content could not identified by CHARDET.') + report['chardet']['failure'] += 1 + report['chardet']['encodings'][should_be_encoding]['failure'] += 1 + elif should_be_encoding != also_could_be(chardet_result.get('encoding').lower().replace('-', '_')) and does_it_matter(raw_content, should_be_encoding, also_could_be(chardet_result.get('encoding').lower().replace('-', '_'))) is True: + logger.error('"{filename}" content could not identified properly by CHARDET. 
({got} instead of {should_be})', filename=basename(path), got=chardet_result.get('encoding').replace('-', '_'), should_be=should_be_encoding) + report['chardet']['failure'] += 1 + report['chardet']['encodings'][should_be_encoding]['failure'] += 1 + report['chardet']['encodings'][should_be_encoding]['confuse_it_with'].add( + also_could_be(chardet_result.get('encoding').lower().replace('-', '_'))) + else: + report['chardet']['success'] += 1 + report['chardet']['encodings'][should_be_encoding]['success'] += 1 + + before_cchardet_decide = perf_counter_ns() + + cchardet_result = cchardet_detect(raw_content) + + after_cchardet_decide = perf_counter_ns() + + logger.info('"{filename}" content was identified within {ts} nanoseconds by CCHARDET.', filename=basename(path), + ts=(after_cchardet_decide - before_cchardet_decide)) + + report['cchardet']['performances'].append(after_cchardet_decide - before_cchardet_decide) + report['cchardet']['encodings'][should_be_encoding]['performances'].append( + after_cchardet_decide - before_cchardet_decide) + + if cchardet_result.get('encoding') is None: + logger.warning('"{filename}" content could not identified by CCHARDET.') + report['cchardet']['failure'] += 1 + report['cchardet']['encodings'][should_be_encoding]['failure'] += 1 + elif should_be_encoding != also_could_be(cchardet_result.get('encoding').lower().replace('-', '_')) and does_it_matter(raw_content, should_be_encoding, also_could_be(cchardet_result.get('encoding').lower().replace('-', '_'))) is True: + logger.error('"{filename}" content could not identified properly by CCHARDET. 
({got} instead of {should_be})', filename=basename(path), got=cchardet_result.get('encoding').replace('-', '_'), should_be=should_be_encoding) + report['cchardet']['failure'] += 1 + report['cchardet']['encodings'][should_be_encoding]['failure'] += 1 + report['cchardet']['encodings'][should_be_encoding]['confuse_it_with'].add( + also_could_be(cchardet_result.get('encoding').lower().replace('-', '_'))) + else: + report['cchardet']['success'] += 1 + report['cchardet']['encodings'][should_be_encoding]['success'] += 1 + + before_cn_decide = perf_counter_ns() + + cn_result = CnM.from_bytes(raw_content).best().first() + + after_cn_decide = perf_counter_ns() + + logger.info('"{filename}" content was identified within {ts} nanoseconds by CHARSET-NORMALIZER.', filename=basename(path), + ts=(after_cn_decide - before_cn_decide)) + + report['charset-normalizer']['performances'].append(after_cn_decide - before_cn_decide) + report['charset-normalizer']['encodings'][should_be_encoding]['performances'].append( + after_cn_decide - before_cn_decide) + + if cn_result is None: + logger.warning('"{filename}" content could not identified by CHARSET-NORMALIZER.', filename=basename(path)) + report['charset-normalizer']['failure'] += 1 + report['charset-normalizer']['encodings'][should_be_encoding]['failure'] += 1 + elif should_be_encoding not in ' '.join(cn_result.could_be_from_charset): + logger.error('"{filename}" content could not identified properly by CHARSET-NORMALIZER. 
({got} instead of {should_be})', filename=basename(path), got=cn_result.encoding, should_be=should_be_encoding) + report['charset-normalizer']['failure'] += 1 + report['charset-normalizer']['encodings'][should_be_encoding]['failure'] += 1 + report['charset-normalizer']['encodings'][should_be_encoding]['confuse_it_with'].add( + cn_result.encoding) + else: + report['charset-normalizer']['success'] += 1 + report['charset-normalizer']['encodings'][should_be_encoding]['success'] += 1 + + # Publish result + logger.info('Publishing results') + + sleep(0.5) + + x = PrettyTable( + [ + 'Package', + 'Success Rate', + 'Mean per file (ns)', + 'File per sec (est)' + ] + ) + + for package in report.keys(): + + mean_perf_ns = round( + mean( + report[package]['performances'] + ) + ) + + x.add_row( + [ + package, + round( + (report[package]['success'] / (report[package]['success'] + report[package]['failure'])) * 100, + ndigits=2 + ), + mean_perf_ns, + round( + 1. / (mean_perf_ns / 1e+9), + ndigits=3 + ) + ] + ) + + print( + x + ) + + y = PrettyTable( + [ + 'Package', + 'Encoding', + 'Success Rate', + 'Mean per file (ns)', + 'File per sec (est)', + 'Confuse it with (sometime)' + ] + ) + + for encoding in report['charset-normalizer']['encodings'].keys(): + for package in report.keys(): + mean_perf_ns = round( + mean( + report[package]['encodings'][encoding]['performances'] + ) + ) + + y.add_row( + [ + package, + encoding, + round( + (report[package]['encodings'][encoding]['success'] / (report[package]['encodings'][encoding]['success'] + report[package]['encodings'][encoding]['failure'])) * 100, + ndigits=2 + ), + mean_perf_ns, + round( + 1. / (mean_perf_ns / 1e+9), + ndigits=3 + ), + ', '.join(report[package]['encodings'][encoding]['confuse_it_with']) + ] + ) + + print( + y + )