From 5deaba1987b85da4128b2d748b8fa1f7516cc4b5 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 23 Oct 2014 12:54:16 -0200 Subject: [PATCH 1/4] Tries to improve encoding handling in `Extractor` Instead of immediately falling back to returning an unencoded string, we try to use utf-8 if we cannot accurately detect the content encoding. This is far from ideal, but since returning an unencoded string brings problems further down the pipeline (other workers are not able to decode it), this is a step to reduce these problems. --- pypln/backend/workers/extractor.py | 41 ++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 64b0d46..d2070ee 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -127,6 +127,36 @@ def extract_pdf(data): return '', {} +def trial_decode(text): + """ + Tries to detect text encoding using `magic`. If the detected encoding is + not supported, try utf-8 and ultimately falls back to returning the string + as it was given. + + This is far from an ideal solution, but the extractor and the rest of the + pipeline need an unicode object. + """ + with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m: + content_encoding = m.id_buffer(text) + + try: + result = text.decode(content_encoding) + except LookupError: + # If the detected encoding is not supported, we try to decode it as + # utf-8. + try: + result = text.decode('utf-8') + except UnicodeDecodeError: + # If utf-8 is also not capable of handling this text, we just + # treat the content as we used to: ignoring it's encoding. + result = text + + # TODO: Maybe try ISO-8859-1 if UTF-8 raises an UnicodeDecodeError and + # only then fallback to returning a string? + + return result + + class Extractor(Worker): #TODO: need to verify some exceptions when trying to convert 'evil' PDFs #TODO: should 'replace_with' be '' when extracting from HTML? 
@@ -153,20 +183,15 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m: - content_encoding = m.id_buffer(text) - try: - text = text.decode(content_encoding) + text = trial_decode(text) + + if isinstance(text, unicode): # HTMLParser only handles unicode objects. We can't pass the text # through it if we don't know the encoding, and it's possible we # also shouldn't. There's no way of knowing if it's a badly encoded # html or a binary blob that happens do have bytes that look liked # html entities. text = HTMLParser().unescape(text) - except LookupError: - # If the detected encoding is not supported, we just treat the - # content as we used to: ignoring it's encoding. - pass text = clean(text) From 708abe5c9d3c4293ac4399ebc4e549724b921fdd Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 5 Nov 2014 13:06:08 -0200 Subject: [PATCH 2/4] Also tries to decode text as iso-8859-1 if utf-8 fails to parse it --- pypln/backend/workers/extractor.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index d2070ee..c411e37 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -147,12 +147,15 @@ def trial_decode(text): try: result = text.decode('utf-8') except UnicodeDecodeError: - # If utf-8 is also not capable of handling this text, we just - # treat the content as we used to: ignoring it's encoding. - result = text - - # TODO: Maybe try ISO-8859-1 if UTF-8 raises an UnicodeDecodeError and - # only then fallback to returning a string? + # Is there a better way of doing this than nesting try/except + # blocks? This smells really bad. 
+ try: + result = text.decode('iso-8859-1') + except UnicodeDecodeError: + # If neither utf-8 nor iso-885901 work are capable of handling + # this text, we just treat the content as we used to: ignoring + # it's encoding. + result = text return result From f65f2dfe09b42fc4bbce99cc68b5848ae0a68ff4 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 5 Nov 2014 13:11:59 -0200 Subject: [PATCH 3/4] Always return a unicode object from `trial_decode` Since the rest of our workflow depends on the text being a unicode object, we need to make sure this is what we return from our decode process. If even after trying different codecs we cannot successfully decode the string, we will do it using utf-8 and replacing invalid chars with the unicode replacement character. I had to make two decisions here that could be changed further down the line: the encoding to use and the error handling. I decided on utf-8 because I think it is the most common encoding in our use cases and on `replace` instead of `ignore` so we would have some kind of evidence of where the invalid characters were (and that they existed in the first place). --- pypln/backend/workers/extractor.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index c411e37..acba619 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -130,8 +130,8 @@ def extract_pdf(data): def trial_decode(text): """ Tries to detect text encoding using `magic`. If the detected encoding is - not supported, try utf-8 and ultimately falls back to returning the string - as it was given. + not supported, try utf-8, iso-8859-1 and ultimately falls back to decoding + as utf-8 replacing invalid chars with `U+FFFD` (the replacement character). This is far from an ideal solution, but the extractor and the rest of the pipeline need an unicode object. 
@@ -153,9 +153,11 @@ def trial_decode(text): result = text.decode('iso-8859-1') except UnicodeDecodeError: # If neither utf-8 nor iso-885901 work are capable of handling - # this text, we just treat the content as we used to: ignoring - # it's encoding. - result = text + # this text, we just decode it using utf-8 and replace invalid + # chars with U+FFFD. + # Two somewhat arbitrary decisions were made here: use utf-8 + # and use 'replace' instead of 'ignore'. + result = text.decode('utf-8', 'replace') return result From c8690b129fda7f2b5ace6c7a9ad684021fdd57fa Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 5 Nov 2014 15:14:42 -0200 Subject: [PATCH 4/4] Adds metadata to the document specifying if the text was forcibly decoded --- pypln/backend/workers/extractor.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index acba619..a2ddf52 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -139,6 +139,7 @@ def trial_decode(text): with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m: content_encoding = m.id_buffer(text) + forced_decoding = False try: result = text.decode(content_encoding) except LookupError: @@ -158,8 +159,9 @@ def trial_decode(text): # Two somewhat arbitrary decisions were made here: use utf-8 # and use 'replace' instead of 'ignore'. result = text.decode('utf-8', 'replace') + forced_decoding = True - return result + return result, forced_decoding class Extractor(Worker): @@ -188,7 +190,7 @@ def process(self, file_data): return {'mimetype': 'unknown', 'text': "", 'file_metadata': {}, 'language': ""} - text = trial_decode(text) + text, forced_decoding = trial_decode(text) if isinstance(text, unicode): # HTMLParser only handles unicode objects. 
We can't pass the text @@ -206,4 +208,4 @@ def process(self, file_data): language = cld.detect(text)[1] return {'text': text, 'file_metadata': metadata, 'language': language, - 'mimetype': file_mime_type} + 'mimetype': file_mime_type, 'forced_decoding': forced_decoding}