
Commit d3e1d78 (2 parents: 07cc3fc + 5569227)
Merge pull request #179 from flavioamieiro/feature/remove_mongodict
Feature/remove mongodict

29 files changed: +384, -415 lines

pypln/backend/celery_task.py (+9, -13)

@@ -16,11 +16,9 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-
+import pymongo
 from celery import Task
 
-from pypln.backend.mongodict_adapter import MongoDictAdapter
-
 # This import may look like an unused imported, but it is not.
 # When our base task class is defined, the Celery app must have already been
 # instantiated, otherwise when this code is imported elsewhere (like in a
@@ -33,6 +31,11 @@
 from pypln.backend import config
 
 
+mongo_client = pymongo.MongoClient(host=config.MONGODB_CONFIG["host"],
+                                   port=config.MONGODB_CONFIG["port"])
+database = mongo_client[config.MONGODB_CONFIG["database"]]
+document_collection = database[config.MONGODB_CONFIG["collection"]]
+
 class PyPLNTask(Task):
     """
     A base class for PyPLN tasks. It is in charge of getting the document
@@ -48,16 +51,9 @@ def run(self, document_id):
         It will call the `process` method with a dictionary containing all the
         document information and will update de database with results.
         """
-        document = MongoDictAdapter(doc_id=document_id,
-                                    host=config.MONGODB_CONFIG['host'],
-                                    port=config.MONGODB_CONFIG['port'],
-                                    database=config.MONGODB_CONFIG['database'])
-        # Create a dictionary out of our document. We could simply pass
-        # it on to the process method, but for now we won't let the user
-        # manipulate the MongoDict directly.
-        dic = {k: v for k, v in document.iteritems()}
-        result = self.process(dic)
-        document.update(result)
+        document = document_collection.find_one({"_id": document_id})
+        result = self.process(document)
+        document_collection.update({"_id": document_id}, {"$set": result})
        return document_id
 
    def process(self, document):
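A minimal sketch (not part of this commit; the WordCount worker and the sample document are hypothetical) of how a task built on the refactored base class behaves: process() receives the plain dict returned by find_one(), and whatever it returns is written back onto the same document via a $set update.

from pypln.backend.celery_task import PyPLNTask, document_collection

class WordCount(PyPLNTask):
    def process(self, document):
        # `document` is the dict loaded by PyPLNTask.run(); the returned
        # dict is merged into the stored document with "$set".
        return {'word_count': len(document.get('text', '').split())}

# Assuming a worker is consuming the queue (or CELERY_ALWAYS_EAGER is set
# for testing), the task is dispatched by document _id only:
doc_id = document_collection.insert({'text': 'some plain text'})
WordCount().delay(doc_id)
print document_collection.find_one({'_id': doc_id})['word_count']  # 3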

pypln/backend/config.py (+1)

@@ -5,6 +5,7 @@ def get_store_config():
     defaults = {'host': 'localhost',
                 'port': '27017',
                 'database': 'pypln_dev',
+                'collection': 'documents',
                 'gridfs_collection': 'files',
                 }
     config = ConfigParser.ConfigParser(defaults=defaults)
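A small sketch of why this one-line change is backwards compatible: ConfigParser falls back to the defaults dict for any option missing from the config file, so existing deployments pick up collection = 'documents' without touching their configuration. The [store] section name and the inline file below are illustrative, not taken from this commit.

import ConfigParser
from StringIO import StringIO

defaults = {'host': 'localhost', 'port': '27017', 'database': 'pypln_dev',
            'collection': 'documents', 'gridfs_collection': 'files'}
config = ConfigParser.ConfigParser(defaults=defaults)
config.readfp(StringIO("[store]\nhost = mongo.example.com\n"))

assert config.get('store', 'host') == 'mongo.example.com'  # from the file
assert config.get('store', 'collection') == 'documents'    # from the defaults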

pypln/backend/mongodict_adapter.py (-63)

This file was deleted.

pypln/backend/workers/elastic_indexer.py (+6)

@@ -36,6 +36,12 @@ def process(self, document):
         # See `test_regression_indexing_should_not_include_contents` in
         # tests/test_elastic_indexer.py for details.
         document.pop('contents')
+        # We also need to exclude _id, because ObjectId's won't be
+        # serializable.
+        document.pop("_id")
+
         result = ES.index(index=index_name, doc_type=doc_type,
                           body=document, id=file_id)
+        index_id = result.pop("_id")
+        result["index_id"] = index_id
         return result
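A quick illustration (not from the diff; the sample document is made up) of why _id has to be popped before indexing: bson.ObjectId values are not JSON serializable, so a payload that still contains one cannot be encoded for Elasticsearch.

import json
from bson import ObjectId

document = {'_id': ObjectId(), 'file_id': 'some_file_id', 'text': 'some text'}
try:
    json.dumps(document)
except TypeError:
    # ObjectId('...') is not JSON serializable
    document.pop('_id')
json.dumps(document)  # fine once the ObjectId is gone

The index_id rename at the end serves a related purpose: the worker's return value is applied to the document with $set, and MongoDB rejects updates that try to set _id, so the id returned by Elasticsearch is stored under index_id instead.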

pypln/backend/workers/extractor.py (+6, -4)

@@ -17,6 +17,7 @@
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
 
+import base64
 import shlex
 
 from HTMLParser import HTMLParser
@@ -169,15 +170,16 @@ class Extractor(PyPLNTask):
     #TODO: should 'replace_with' be '' when extracting from HTML?
 
     def process(self, file_data):
+        contents = base64.b64decode(file_data['contents'])
         with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
-            file_mime_type = m.id_buffer(file_data['contents'])
+            file_mime_type = m.id_buffer(contents)
         metadata = {}
         if file_mime_type == 'text/plain':
-            text = file_data['contents']
+            text = contents
         elif file_mime_type == 'text/html':
-            text = parse_html(file_data['contents'], True, ['script', 'style'])
+            text = parse_html(contents, True, ['script', 'style'])
         elif file_mime_type == 'application/pdf':
-            text, metadata = extract_pdf(file_data['contents'])
+            text, metadata = extract_pdf(contents)
         else:
             # If we can't detect the mimetype we add a flag that can be read by
             # the frontend to provide more information on why the document
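A minimal sketch of the new contract between GridFSDataRetriever and Extractor: 'contents' now travels through MongoDB as a base64 string and must be decoded back to raw bytes before mime detection. The sample payload is made up; the magic.Magic/id_buffer calls simply mirror the ones in the diff.

import base64
import magic

stored = base64.b64encode('plain text example')  # as saved by GridFSDataRetriever
contents = base64.b64decode(stored)              # as recovered by Extractor

with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
    mime_type = m.id_buffer(contents)            # 'text/plain' for this sample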

pypln/backend/workers/gridfs_data_retriever.py (+10, -1)

@@ -16,6 +16,7 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+import base64
 from bson import ObjectId
 from gridfs import GridFS
 import pymongo
@@ -31,9 +32,17 @@ def process(self, document):
         gridfs = GridFS(database, config.MONGODB_CONFIG['gridfs_collection'])
 
         file_data = gridfs.get(ObjectId(document['file_id']))
+
+        # We decided to store 'contents' as a base64 encoded string in the
+        # database to avoid possible corruption of files. For example: when
+        # it's a pdf, the process of storing the data as utf-8 in mongo might
+        # be corrupting the file. This wasn't a problem before, because
+        # MongoDict pickled everything before storing.
+        contents = base64.b64encode(file_data.read())
+
         result = {'length': file_data.length,
                   'md5': file_data.md5,
                   'filename': file_data.filename,
                   'upload_date': file_data.upload_date,
-                  'contents': file_data.read()}
+                  'contents': contents}
         return result
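A short sketch of the corruption issue the new comment describes (the byte string below is an arbitrary example, not from the repository): raw PDF bytes are generally not valid UTF-8, so handling them as text can fail or mangle them, while the base64 round trip is byte-for-byte lossless and produces plain ASCII that is safe to store in MongoDB.

import base64

pdf_bytes = '%PDF-1.4\n\xc3\x28\x00\xff'  # arbitrary binary, not valid UTF-8
try:
    pdf_bytes.decode('utf-8')
except UnicodeDecodeError:
    pass  # this is the kind of failure base64 encoding avoids

encoded = base64.b64encode(pdf_bytes)     # ASCII-only string
assert base64.b64decode(encoded) == pdf_bytes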

pypln/backend/workers/gridfs_file_deleter.py (+3, -3)

@@ -25,9 +25,9 @@
 class GridFSFileDeleter(PyPLNTask):
 
     def process(self, document):
-        database = pymongo.MongoClient(host=config.MONGODB_CONFIG['host'],
-                                       port=config.MONGODB_CONFIG['port']
-                                       )[config.MONGODB_CONFIG['database']]
+        mongo_client = pymongo.MongoClient(host=config.MONGODB_CONFIG["host"],
+                                           port=config.MONGODB_CONFIG["port"])
+        database = mongo_client[config.MONGODB_CONFIG["database"]]
         gridfs = GridFS(database, config.MONGODB_CONFIG['gridfs_collection'])
 
         gridfs.delete(ObjectId(document['file_id']))

pypln/backend/workers/tokenizer.py (-1)

@@ -16,7 +16,6 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-from mongodict import MongoDict
 from nltk import word_tokenize, sent_tokenize
 
 from pypln.backend.celery_task import PyPLNTask

pypln/backend/workers/trigrams.py (+10, -2)

@@ -42,6 +42,14 @@ def process(self, document):
         tr = defaultdict(lambda: [])
         for m in metrics:
             for res in trigram_finder.score_ngrams(getattr(trigram_measures,m)):
-                tr[res[0]].append(res[1])
+                # We cannot store the trigram as a tuple (mongo keys need to be
+                # strings). We decided to join tokens using spaces since a
+                # space will never be in a token.
+                key = u' '.join(res[0])
+                # Mongo cannot have `.` or `$` in key names. Unfortunatelly
+                # this means we need to replace them with placeholders.
+                key = key.replace(u'$', u'\dollarsign')
+                key = key.replace(u'.', u'\dot')
+                tr[key].append(res[1])
 
-        return {'trigram_rank': dict(tr), 'metrics':metrics}
+        return {'trigram_rank': tr, 'metrics':metrics}
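For consumers of the stored data, a small sketch (the helper names are hypothetical, not part of this commit) of the key sanitization above and how to reverse it: trigram tuples are joined with spaces, and the '$' and '.' characters MongoDB forbids in key names are swapped for placeholders.

def sanitize_key(trigram):
    # tuple -> MongoDB-safe string key, as done in process() above
    key = u' '.join(trigram)
    key = key.replace(u'$', u'\dollarsign')
    return key.replace(u'.', u'\dot')

def restore_key(key):
    # reverse mapping for readers of 'trigram_rank'
    key = key.replace(u'\dollarsign', u'$')
    return tuple(key.replace(u'\dot', u'.').split(u' '))

assert sanitize_key((u'worth', u'US$', u'1.5')) == u'worth US\dollarsign 1\dot5'
assert restore_key(u'worth US\dollarsign 1\dot5') == (u'worth', u'US$', u'1.5')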

requirements/production.txt (-1)

@@ -1,7 +1,6 @@
 # Common
 celery
 pymongo==2.8.1
-mongodict
 
 # The newest pyparsing (2.0) only supports python 3,
 # so we explicitly install 1.5.7 (the last version that

tests/test_celery_task.py (+39)

@@ -0,0 +1,39 @@
+# coding: utf-8
+#
+# Copyright 2015 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+from pypln.backend.celery_task import PyPLNTask
+from utils import TaskTest
+
+class FakeTask(PyPLNTask):
+    def process(self, document):
+        return {'result': document['input']}
+
+class TestCeleryTask(TaskTest):
+    def test_task_should_get_the_correct_document(self):
+        """This is a regression test. PyPLNTask was not filtering by _id. It
+        was getting the first document it found. """
+
+        # This is just preparing the expected input in the database
+        wrong_doc_id = self.collection.insert({'input': 'wrong'}, w=1)
+        correct_doc_id = self.collection.insert({'input': 'correct'}, w=1)
+
+        FakeTask().delay(correct_doc_id)
+
+        refreshed_doc = self.collection.find_one({'_id': correct_doc_id})
+
+        self.assertEqual(refreshed_doc['result'], 'correct')

tests/test_elastic_indexer.py (+7, -5)

@@ -32,9 +32,10 @@ def test_indexing_go_through(self):
             'contents': 'raw_file_contents',
         }
 
-        self.document.update(doc)
-        ElasticIndexer().delay(self.fake_id)
-        assert self.document['created'] # must be True
+        doc_id = self.collection.insert(doc, w=1)
+        ElasticIndexer().delay(doc_id)
+        refreshed_document = self.collection.find_one({'_id': doc_id})
+        self.assertTrue(refreshed_document['created'])
 
     @patch('pypln.backend.workers.elastic_indexer.ES')
     def test_regression_indexing_should_not_include_contents(self, ES):
@@ -54,11 +55,12 @@ def test_regression_indexing_should_not_include_contents(self, ES):
             'contents': 'raw_file_contents',
         }
 
-        self.document.update(doc)
-        ElasticIndexer().delay(self.fake_id)
+        doc_id = self.collection.insert(doc, w=1)
+        ElasticIndexer().delay(doc_id)
         # remove properties that won't be indexed
         index_name = doc.pop("index_name")
         doc_type = doc.pop('doc_type')
         doc.pop('contents')
+        doc.pop('_id')
         ES.index.assert_called_with(body=doc, id=doc['file_id'],
                                     doc_type=doc_type, index=index_name)
