
Commit d3e1d78 (2 parents: 07cc3fc + 5569227)
Merge pull request #179 from flavioamieiro/feature/remove_mongodict
Feature/remove mongodict

29 files changed: +384, -415 lines

pypln/backend/celery_task.py (+9, -13)

@@ -16,11 +16,9 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-
+import pymongo
 from celery import Task
 
-from pypln.backend.mongodict_adapter import MongoDictAdapter
-
 # This import may look like an unused imported, but it is not.
 # When our base task class is defined, the Celery app must have already been
 # instantiated, otherwise when this code is imported elsewhere (like in a
@@ -33,6 +31,11 @@
 from pypln.backend import config
 
 
+mongo_client = pymongo.MongoClient(host=config.MONGODB_CONFIG["host"],
+                                   port=config.MONGODB_CONFIG["port"])
+database = mongo_client[config.MONGODB_CONFIG["database"]]
+document_collection = database[config.MONGODB_CONFIG["collection"]]
+
 class PyPLNTask(Task):
     """
     A base class for PyPLN tasks. It is in charge of getting the document
@@ -48,16 +51,9 @@ def run(self, document_id):
         It will call the `process` method with a dictionary containing all the
         document information and will update de database with results.
         """
-        document = MongoDictAdapter(doc_id=document_id,
-                                    host=config.MONGODB_CONFIG['host'],
-                                    port=config.MONGODB_CONFIG['port'],
-                                    database=config.MONGODB_CONFIG['database'])
-        # Create a dictionary out of our document. We could simply pass
-        # it on to the process method, but for now we won't let the user
-        # manipulate the MongoDict directly.
-        dic = {k: v for k, v in document.iteritems()}
-        result = self.process(dic)
-        document.update(result)
+        document = document_collection.find_one({"_id": document_id})
+        result = self.process(document)
+        document_collection.update({"_id": document_id}, {"$set": result})
        return document_id
 
    def process(self, document):
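A minimal sketch (not part of this commit; the WordCount worker and the sample document are hypothetical) of how a task built on the refactored base class behaves: process() receives the plain dict returned by find_one(), and whatever it returns is written back onto the same document via a $set update.

from pypln.backend.celery_task import PyPLNTask, document_collection

class WordCount(PyPLNTask):
    def process(self, document):
        # `document` is the dict loaded by PyPLNTask.run(); the returned
        # dict is merged into the stored document with "$set".
        return {'word_count': len(document.get('text', '').split())}

# Assuming a worker is consuming the queue (or CELERY_ALWAYS_EAGER is set
# for testing), the task is dispatched by document _id only:
doc_id = document_collection.insert({'text': 'some plain text'})
WordCount().delay(doc_id)
print document_collection.find_one({'_id': doc_id})['word_count']  # 3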

pypln/backend/config.py (+1)

@@ -5,6 +5,7 @@ def get_store_config():
     defaults = {'host': 'localhost',
                 'port': '27017',
                 'database': 'pypln_dev',
+                'collection': 'documents',
                 'gridfs_collection': 'files',
                 }
     config = ConfigParser.ConfigParser(defaults=defaults)
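A small sketch of why this one-line change is backwards compatible: ConfigParser falls back to the defaults dict for any option missing from the config file, so existing deployments pick up collection = 'documents' without touching their configuration. The [store] section name and the inline file below are illustrative, not taken from this commit.

import ConfigParser
from StringIO import StringIO

defaults = {'host': 'localhost', 'port': '27017', 'database': 'pypln_dev',
            'collection': 'documents', 'gridfs_collection': 'files'}
config = ConfigParser.ConfigParser(defaults=defaults)
config.readfp(StringIO("[store]\nhost = mongo.example.com\n"))

assert config.get('store', 'host') == 'mongo.example.com'  # from the file
assert config.get('store', 'collection') == 'documents'    # from the defaults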

pypln/backend/mongodict_adapter.py (-63)

This file was deleted.

pypln/backend/workers/elastic_indexer.py (+6)

@@ -36,6 +36,12 @@ def process(self, document):
         # See `test_regression_indexing_should_not_include_contents` in
         # tests/test_elastic_indexer.py for details.
         document.pop('contents')
+        # We also need to exclude _id, because ObjectId's won't be
+        # serializable.
+        document.pop("_id")
+
         result = ES.index(index=index_name, doc_type=doc_type,
                           body=document, id=file_id)
+        index_id = result.pop("_id")
+        result["index_id"] = index_id
         return result
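A quick illustration (not from the diff; the sample document is made up) of why _id has to be popped before indexing: bson.ObjectId values are not JSON serializable, so a payload that still contains one cannot be encoded for Elasticsearch.

import json
from bson import ObjectId

document = {'_id': ObjectId(), 'file_id': 'some_file_id', 'text': 'some text'}
try:
    json.dumps(document)
except TypeError:
    # ObjectId('...') is not JSON serializable
    document.pop('_id')
json.dumps(document)  # fine once the ObjectId is gone

The index_id rename at the end serves a related purpose: the worker's return value is applied to the document with $set, and MongoDB rejects updates that try to set _id, so the id returned by Elasticsearch is stored under index_id instead.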

pypln/backend/workers/extractor.py (+6, -4)

@@ -17,6 +17,7 @@
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
 
+import base64
 import shlex
 
 from HTMLParser import HTMLParser
@@ -169,15 +170,16 @@ class Extractor(PyPLNTask):
     #TODO: should 'replace_with' be '' when extracting from HTML?
 
     def process(self, file_data):
+        contents = base64.b64decode(file_data['contents'])
         with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
-            file_mime_type = m.id_buffer(file_data['contents'])
+            file_mime_type = m.id_buffer(contents)
         metadata = {}
         if file_mime_type == 'text/plain':
-            text = file_data['contents']
+            text = contents
         elif file_mime_type == 'text/html':
-            text = parse_html(file_data['contents'], True, ['script', 'style'])
+            text = parse_html(contents, True, ['script', 'style'])
         elif file_mime_type == 'application/pdf':
-            text, metadata = extract_pdf(file_data['contents'])
+            text, metadata = extract_pdf(contents)
         else:
             # If we can't detect the mimetype we add a flag that can be read by
             # the frontend to provide more information on why the document
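A minimal sketch of the new contract between GridFSDataRetriever and Extractor: 'contents' now travels through MongoDB as a base64 string and must be decoded back to raw bytes before mime detection. The sample payload is made up; the magic.Magic/id_buffer calls simply mirror the ones in the diff.

import base64
import magic

stored = base64.b64encode('plain text example')  # as saved by GridFSDataRetriever
contents = base64.b64decode(stored)              # as recovered by Extractor

with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
    mime_type = m.id_buffer(contents)            # 'text/plain' for this sample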

pypln/backend/workers/gridfs_data_retriever.py (+10, -1)

@@ -16,6 +16,7 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+import base64
 from bson import ObjectId
 from gridfs import GridFS
 import pymongo
@@ -31,9 +32,17 @@ def process(self, document):
         gridfs = GridFS(database, config.MONGODB_CONFIG['gridfs_collection'])
 
         file_data = gridfs.get(ObjectId(document['file_id']))
+
+        # We decided to store 'contents' as a base64 encoded string in the
+        # database to avoid possible corruption of files. For example: when
+        # it's a pdf, the process of storing the data as utf-8 in mongo might
+        # be corrupting the file. This wasn't a problem before, because
+        # MongoDict pickled everything before storing.
+        contents = base64.b64encode(file_data.read())
+
         result = {'length': file_data.length,
                   'md5': file_data.md5,
                   'filename': file_data.filename,
                   'upload_date': file_data.upload_date,
-                  'contents': file_data.read()}
+                  'contents': contents}
         return result
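A short sketch of the corruption issue the new comment describes (the byte string below is an arbitrary example, not from the repository): raw PDF bytes are generally not valid UTF-8, so handling them as text can fail or mangle them, while the base64 round trip is byte-for-byte lossless and produces plain ASCII that is safe to store in MongoDB.

import base64

pdf_bytes = '%PDF-1.4\n\xc3\x28\x00\xff'  # arbitrary binary, not valid UTF-8
try:
    pdf_bytes.decode('utf-8')
except UnicodeDecodeError:
    pass  # this is the kind of failure base64 encoding avoids

encoded = base64.b64encode(pdf_bytes)     # ASCII-only string
assert base64.b64decode(encoded) == pdf_bytes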

pypln/backend/workers/gridfs_file_deleter.py (+3, -3)

@@ -25,9 +25,9 @@
 class GridFSFileDeleter(PyPLNTask):
 
     def process(self, document):
-        database = pymongo.MongoClient(host=config.MONGODB_CONFIG['host'],
-                                       port=config.MONGODB_CONFIG['port']
-                                       )[config.MONGODB_CONFIG['database']]
+        mongo_client = pymongo.MongoClient(host=config.MONGODB_CONFIG["host"],
+                                           port=config.MONGODB_CONFIG["port"])
+        database = mongo_client[config.MONGODB_CONFIG["database"]]
         gridfs = GridFS(database, config.MONGODB_CONFIG['gridfs_collection'])
 
         gridfs.delete(ObjectId(document['file_id']))

pypln/backend/workers/tokenizer.py (-1)

@@ -16,7 +16,6 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-from mongodict import MongoDict
 from nltk import word_tokenize, sent_tokenize
 
 from pypln.backend.celery_task import PyPLNTask

pypln/backend/workers/trigrams.py (+10, -2)

@@ -42,6 +42,14 @@ def process(self, document):
         tr = defaultdict(lambda: [])
         for m in metrics:
             for res in trigram_finder.score_ngrams(getattr(trigram_measures,m)):
-                tr[res[0]].append(res[1])
+                # We cannot store the trigram as a tuple (mongo keys need to be
+                # strings). We decided to join tokens using spaces since a
+                # space will never be in a token.
+                key = u' '.join(res[0])
+                # Mongo cannot have `.` or `$` in key names. Unfortunatelly
+                # this means we need to replace them with placeholders.
+                key = key.replace(u'$', u'\dollarsign')
+                key = key.replace(u'.', u'\dot')
+                tr[key].append(res[1])
 
-        return {'trigram_rank': dict(tr), 'metrics':metrics}
+        return {'trigram_rank': tr, 'metrics':metrics}
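For consumers of the stored data, a small sketch (the helper names are hypothetical, not part of this commit) of the key sanitization above and how to reverse it: trigram tuples are joined with spaces, and the '$' and '.' characters MongoDB forbids in key names are swapped for placeholders.

def sanitize_key(trigram):
    # tuple -> MongoDB-safe string key, as done in process() above
    key = u' '.join(trigram)
    key = key.replace(u'$', u'\dollarsign')
    return key.replace(u'.', u'\dot')

def restore_key(key):
    # reverse mapping for readers of 'trigram_rank'
    key = key.replace(u'\dollarsign', u'$')
    return tuple(key.replace(u'\dot', u'.').split(u' '))

assert sanitize_key((u'worth', u'US$', u'1.5')) == u'worth US\dollarsign 1\dot5'
assert restore_key(u'worth US\dollarsign 1\dot5') == (u'worth', u'US$', u'1.5')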

requirements/production.txt (-1)

@@ -1,7 +1,6 @@
 # Common
 celery
 pymongo==2.8.1
-mongodict
 
 # The newest pyparsing (2.0) only supports python 3,
 # so we explicitly install 1.5.7 (the last version that

tests/test_celery_task.py (+39)

@@ -0,0 +1,39 @@
+# coding: utf-8
+#
+# Copyright 2015 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+from pypln.backend.celery_task import PyPLNTask
+from utils import TaskTest
+
+class FakeTask(PyPLNTask):
+    def process(self, document):
+        return {'result': document['input']}
+
+class TestCeleryTask(TaskTest):
+    def test_task_should_get_the_correct_document(self):
+        """This is a regression test. PyPLNTask was not filtering by _id. It
+        was getting the first document it found. """
+
+        # This is just preparing the expected input in the database
+        wrong_doc_id = self.collection.insert({'input': 'wrong'}, w=1)
+        correct_doc_id = self.collection.insert({'input': 'correct'}, w=1)
+
+        FakeTask().delay(correct_doc_id)
+
+        refreshed_doc = self.collection.find_one({'_id': correct_doc_id})
+
+        self.assertEqual(refreshed_doc['result'], 'correct')

tests/test_elastic_indexer.py (+7, -5)

@@ -32,9 +32,10 @@ def test_indexing_go_through(self):
             'contents': 'raw_file_contents',
         }
 
-        self.document.update(doc)
-        ElasticIndexer().delay(self.fake_id)
-        assert self.document['created'] # must be True
+        doc_id = self.collection.insert(doc, w=1)
+        ElasticIndexer().delay(doc_id)
+        refreshed_document = self.collection.find_one({'_id': doc_id})
+        self.assertTrue(refreshed_document['created'])
 
     @patch('pypln.backend.workers.elastic_indexer.ES')
     def test_regression_indexing_should_not_include_contents(self, ES):
@@ -54,11 +55,12 @@ def test_regression_indexing_should_not_include_contents(self, ES):
             'contents': 'raw_file_contents',
         }
 
-        self.document.update(doc)
-        ElasticIndexer().delay(self.fake_id)
+        doc_id = self.collection.insert(doc, w=1)
+        ElasticIndexer().delay(doc_id)
         # remove properties that won't be indexed
         index_name = doc.pop("index_name")
         doc_type = doc.pop('doc_type')
         doc.pop('contents')
+        doc.pop('_id')
         ES.index.assert_called_with(body=doc, id=doc['file_id'],
                                     doc_type=doc_type, index=index_name)
