diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 64b0d46..a2ddf52 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -127,6 +127,43 @@ def extract_pdf(data): return '', {} +def trial_decode(text): + """ + Tries to detect text encoding using `magic`. If the detected encoding is + not supported, try utf-8, then iso-8859-1, and ultimately fall back to + decoding as utf-8 replacing invalid chars with `U+FFFD` (the replacement + character). + + This is far from an ideal solution, but the extractor and the rest of the + pipeline need a unicode object. + """ + with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m: + content_encoding = m.id_buffer(text) + + forced_decoding = False + try: + result = text.decode(content_encoding) + except LookupError: + # If the detected encoding is not supported, we try to decode it as + # utf-8. + try: + result = text.decode('utf-8') + except UnicodeDecodeError: + # Is there a better way of doing this than nesting try/except + # blocks? This smells really bad. + try: + result = text.decode('iso-8859-1') + except UnicodeDecodeError: + # If neither utf-8 nor iso-8859-1 is capable of handling + # this text, we just decode it using utf-8 and replace invalid + # chars with U+FFFD. + # Two somewhat arbitrary decisions were made here: use utf-8 + # and use 'replace' instead of 'ignore'. + result = text.decode('utf-8', 'replace') + forced_decoding = True + + return result, forced_decoding + + class Extractor(Worker): #TODO: need to verify some exceptions when trying to convert 'evil' PDFs #TODO: should 'replace_with' be '' when extracting from HTML?
@@ -153,20 +190,15 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m: - content_encoding = m.id_buffer(text) - try: - text = text.decode(content_encoding) + text, forced_decoding = trial_decode(text) + + if isinstance(text, unicode): # HTMLParser only handles unicode objects. We can't pass the text # through it if we don't know the encoding, and it's possible we # also shouldn't. There's no way of knowing if it's a badly encoded # html or a binary blob that happens do have bytes that look liked # html entities. text = HTMLParser().unescape(text) - except LookupError: - # If the detected encoding is not supported, we just treat the - # content as we used to: ignoring it's encoding. - pass text = clean(text) @@ -176,4 +208,4 @@ def process(self, file_data): language = cld.detect(text)[1] return {'text': text, 'file_metadata': metadata, 'language': language, - 'mimetype': file_mime_type} + 'mimetype': file_mime_type, 'forced_decoding': forced_decoding}