From b0e4e94edf18d824af4928a154a5d2b3c9d00884 Mon Sep 17 00:00:00 2001
From: TAHRI Ahmed R
Date: Fri, 13 Dec 2019 14:42:23 +0100
Subject: [PATCH] Bugfix when passing empty bytes to detector (#29)

* Bugfix when passing empty seq to detector

* Add test for empty bytes case

* bump version 1.3.2
---
 charset_normalizer/normalizer.py |  8 ++++++++
 charset_normalizer/version.py    |  2 +-
 test/test_on_byte.py             | 12 ++++++++++++
 3 files changed, 21 insertions(+), 1 deletion(-)

diff --git a/charset_normalizer/normalizer.py b/charset_normalizer/normalizer.py
index 9adad169..72ec86d7 100644
--- a/charset_normalizer/normalizer.py
+++ b/charset_normalizer/normalizer.py
@@ -348,6 +348,14 @@ def from_bytes(sequences, steps=10, chunk_size=512, threshold=0.20, cp_isolation
         if not explain:
             logger.disable('charset_normalizer')
 
+        if len(sequences) == 0:
+            return CharsetNormalizerMatch(
+                sequences,
+                'utf-8',
+                0.,
+                []
+            )
+
         too_small_sequence = len(sequences) < 24
 
         if too_small_sequence is True:
diff --git a/charset_normalizer/version.py b/charset_normalizer/version.py
index cf6c110e..d966375e 100644
--- a/charset_normalizer/version.py
+++ b/charset_normalizer/version.py
@@ -2,5 +2,5 @@
 Expose version
 """
 
-__version__ = "1.3.1"
+__version__ = "1.3.2"
 VERSION = __version__.split('.')
diff --git a/test/test_on_byte.py b/test/test_on_byte.py
index 34f46922..2ea73ac0 100644
--- a/test/test_on_byte.py
+++ b/test/test_on_byte.py
@@ -10,6 +10,18 @@ def test_too_short_none(self):
             CnM.from_bytes(b'\xfe\xff').best().first()
         )
 
+    def test_empty_bytes(self):
+        r = CnM.from_bytes(b'').best().first()
+
+        self.assertIsNotNone(
+            r
+        )
+
+        self.assertEqual(
+            'utf-8',
+            r.encoding
+        )
+
     def test_bom_detection(self):
         with self.subTest('GB18030 UNAVAILABLE SIG'):
             self.assertFalse(