NAMD · flavioamieiro · Nov 5, 2014 · Oct 23, 2014 · Nov 5, 2014 · Nov 5, 2014
diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py
@@ -127,6 +127,43 @@ def extract_pdf(data):
         return '', {}
 
 
+def trial_decode(text):
+    """
+    Decode the byte string `text` into a unicode object.
+
+    The encoding reported by `magic` is tried first, followed by utf-8 and
+    iso-8859-1. A candidate is skipped when its codec is unknown to Python
+    (LookupError) or when it cannot decode the bytes (UnicodeDecodeError).
+    If every candidate fails, the text is decoded as utf-8 with invalid
+    chars replaced by `U+FFFD` (the replacement character).
+
+    Returns a `(result, forced_decoding)` tuple; `forced_decoding` is True
+    only when the lossy 'replace' fallback was used.
+
+    This is far from an ideal solution, but the extractor and the rest of the
+    pipeline need an unicode object.
+    """
+    with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
+        content_encoding = m.id_buffer(text)
+
+    # NOTE: iso-8859-1 maps every possible byte value, so it acts as a
+    # catch-all and the 'replace' fallback below is only a safety net.
+    for encoding in (content_encoding, 'utf-8', 'iso-8859-1'):
+        try:
+            result = text.decode(encoding)
+        except (LookupError, UnicodeDecodeError):
+            # Either Python has no codec with this name, or `magic` guessed
+            # an encoding that cannot actually decode these bytes. Move on
+            # to the next candidate instead of letting the error escape.
+            continue
+        else:
+            return result, False
+
+    # Two somewhat arbitrary decisions were made here: use utf-8 and use
+    # 'replace' instead of 'ignore'.
+    return text.decode('utf-8', 'replace'), True
+
+
 class Extractor(Worker):
     #TODO: need to verify some exceptions when trying to convert 'evil' PDFs
     #TODO: should 'replace_with' be '' when extracting from HTML?
@@ -153,20 +190,15 @@ def process(self, file_data):
             return {'mimetype': 'unknown', 'text': "",
                     'file_metadata': {}, 'language': ""}
 
-        with magic.Magic(flags=magic.MAGIC_MIME_ENCODING) as m:
-            content_encoding = m.id_buffer(text)
-        try:
-            text = text.decode(content_encoding)
+        text, forced_decoding = trial_decode(text)
+
+        if isinstance(text, unicode):
             # HTMLParser only handles unicode objects. We can't pass the text
             # through it if we don't know the encoding, and it's possible we
             # also shouldn't. There's no way of knowing if it's a badly encoded
             # html or a binary blob that happens do have bytes that look liked
             # html entities.
             text = HTMLParser().unescape(text)
-        except LookupError:
-            # If the detected encoding is not supported, we just treat the
-            # content as we used to: ignoring it's encoding.
-            pass
 
         text = clean(text)
 
@@ -176,4 +208,4 @@ def process(self, file_data):
             language = cld.detect(text)[1]
 
         return {'text': text, 'file_metadata': metadata, 'language': language,
-                'mimetype': file_mime_type}
+                'mimetype': file_mime_type, 'forced_decoding': forced_decoding}