NAMD · fccoelho · Jun 26, 2015 · Jun 26, 2015
diff --git a/pypln/backend/workers/elastic_indexer.py b/pypln/backend/workers/elastic_indexer.py
@@ -32,6 +32,10 @@ def process(self, document):
         doc_type = document.pop('doc_type')
         file_id = document["file_id"]
         ES.indices.create(index_name, ignore=400)
+        # We need to remove the raw contents of the file.
+        # See `test_regression_indexing_should_not_include_contents` in
+        # tests/test_elastic_indexer.py for details.
+        document.pop('contents')
         result = ES.index(index=index_name, doc_type=doc_type,
                 body=document, id=file_id)
         return result
diff --git a/requirements/development.txt b/requirements/development.txt
@@ -3,3 +3,4 @@
 nose
 epydoc
 sphinx
+mock
diff --git a/tests/test_elastic_indexer.py b/tests/test_elastic_indexer.py
@@ -8,6 +8,8 @@
 __docformat__ = 'restructuredtext en'
 
 
+from mock import patch
+
 from pypln.backend.workers.elastic_indexer import ElasticIndexer
 from .utils import TaskTest
 from elasticsearch import Elasticsearch
@@ -26,9 +28,37 @@ def test_indexing_go_through(self):
             'index_name': "test_pypln",
             'doc_type': 'document',
             'file_id': 'deadbeef',
-            'text': "Om nama Shivaya "*100
+            'text': "Om nama Shivaya "*100,
+            'contents': 'raw_file_contents',
         }
 
         self.document.update(doc)
         ElasticIndexer().delay(self.fake_id)
         assert self.document['created']  # must be True
+
+    @patch('pypln.backend.workers.elastic_indexer.ES')
+    def test_regression_indexing_should_not_include_contents(self, ES):
+        """
+        The raw file contents must not be sent to the index, for two reasons:
+        1) they are not relevant to the search (the `text` attribute should
+        carry the relevant content), and 2) they may be in a binary format
+        that is not serializable.
+
+        See https://github.com/NAMD/pypln.backend/issues/176 for details.
+        """
+        doc = {
+            'index_name': "test_pypln",
+            'doc_type': 'document',
+            'file_id': 'deadbeef',
+            'text': "Om nama Shivaya "*100,
+            'contents': 'raw_file_contents',
+        }
+
+        self.document.update(doc)
+        ElasticIndexer().delay(self.fake_id)
+        # drop the keys that must not appear in the body passed to ES.index
+        index_name = doc.pop("index_name")
+        doc_type = doc.pop('doc_type')
+        doc.pop('contents')
+        ES.index.assert_called_with(body=doc, id=doc['file_id'],
+                doc_type=doc_type, index=index_name)
-Original file line number
+Diff line change
@@ Expand Up / @@ -3,3 +3,4 @@ @@
     nose
     epydoc
     sphinx
+    mock