Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixes ElasticIndexer for binary files #177

Merged
merged 1 commit into from
Jun 26, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions pypln/backend/workers/elastic_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ def process(self, document):
doc_type = document.pop('doc_type')
file_id = document["file_id"]
ES.indices.create(index_name, ignore=400)
# We need to remove the raw contents of the file.
# See `test_regression_indexing_should_not_include_contents` in
# tests/test_elastic_indexer.py for details.
document.pop('contents')
result = ES.index(index=index_name, doc_type=doc_type,
body=document, id=file_id)
return result
1 change: 1 addition & 0 deletions requirements/development.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
nose
epydoc
sphinx
mock
32 changes: 31 additions & 1 deletion tests/test_elastic_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
__docformat__ = 'restructuredtext en'


from mock import patch

from pypln.backend.workers.elastic_indexer import ElasticIndexer
from .utils import TaskTest
from elasticsearch import Elasticsearch
Expand All @@ -26,9 +28,37 @@ def test_indexing_go_through(self):
'index_name': "test_pypln",
'doc_type': 'document',
'file_id': 'deadbeef',
'text': "Om nama Shivaya "*100
'text': "Om nama Shivaya "*100,
'contents': 'raw_file_contents',
}

self.document.update(doc)
ElasticIndexer().delay(self.fake_id)
assert self.document['created'] # must be True

@patch('pypln.backend.workers.elastic_indexer.ES')
def test_regression_indexing_should_not_include_contents(self, ES):
    """
    Regression test: raw file contents must not be passed to the index.

    A document carrying a `contents` key is stored, the worker is run
    against a patched `ES` object, and the keyword arguments of the last
    `ES.index` call are checked: the body must not have the `contents`
    entry (nor `index_name` / `doc_type`, which are passed separately).

    Rationale: 1) the raw contents are not relevant to the search, since
    the `text` attribute should include the relevant content, and 2) they
    may be in a binary format that will not be serializable.

    See https://github.com/NAMD/pypln.backend/issues/176 for details.
    """
    doc = {
        'index_name': "test_pypln",
        'doc_type': 'document',
        'file_id': 'deadbeef',
        'text': "Om nama Shivaya "*100,
        'contents': 'raw_file_contents',
    }

    self.document.update(doc)
    ElasticIndexer().delay(self.fake_id)

    # Keys that are not expected to be part of the indexed body.
    not_indexed = ('index_name', 'doc_type', 'contents')
    expected_body = dict(
        (key, value) for key, value in doc.items()
        if key not in not_indexed
    )
    ES.index.assert_called_with(index=doc['index_name'],
                                doc_type=doc['doc_type'],
                                body=expected_body,
                                id=doc['file_id'])