From f8b4b87b54c88999ef514c59e16cbe64a9af0d34 Mon Sep 17 00:00:00 2001
From: Flavio Amieiro
Date: Thu, 6 Aug 2015 17:59:32 +0200
Subject: [PATCH 01/25] Removes MongoDictAdapter

The idea behind this is to have fewer queries and to make the storing
format less opaque. This should bring a performance improvement and
also make it easier to inspect the database directly.

Tests are still broken because they rely on the old format.
---
 pypln/backend/celery_task.py       |  22 +++---
 pypln/backend/config.py            |   1 +
 pypln/backend/mongodict_adapter.py |  63 ---------------
 tests/test_mongodict_adapter.py    | 120 -----------------------------
 4 files changed, 10 insertions(+), 196 deletions(-)
 delete mode 100644 pypln/backend/mongodict_adapter.py
 delete mode 100644 tests/test_mongodict_adapter.py

diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py
index ed1c957..ba0cf8b 100644
--- a/pypln/backend/celery_task.py
+++ b/pypln/backend/celery_task.py
@@ -16,11 +16,9 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-
+import pymongo
 from celery import Task
 
-from pypln.backend.mongodict_adapter import MongoDictAdapter
-
 # This import may look like an unused imported, but it is not.
 # When our base task class is defined, the Celery app must have already been
 # instantiated, otherwise when this code is imported elsewhere (like in a
@@ -33,6 +31,11 @@
 
 from pypln.backend import config
 
+mongo_client = pymongo.MongoClient(host=config.MONGODB_CONFIG["host"],
+                                   port=config.MONGODB_CONFIG["port"])
+database = mongo_client[config.MONGODB_CONFIG["database"]]
+document_collection = database[config.MONGODB_CONFIG["collection"]]
+
 class PyPLNTask(Task):
     """
     A base class for PyPLN tasks. It is in charge of getting the document
@@ -48,16 +51,9 @@ def run(self, document_id):
         It will call the `process` method with a dictionary containing all
         the document information and will update de database with results.
         """
-        document = MongoDictAdapter(doc_id=document_id,
-                                    host=config.MONGODB_CONFIG['host'],
-                                    port=config.MONGODB_CONFIG['port'],
-                                    database=config.MONGODB_CONFIG['database'])
-        # Create a dictionary out of our document. We could simply pass
-        # it on to the process method, but for now we won't let the user
-        # manipulate the MongoDict directly.
-        dic = {k: v for k, v in document.iteritems()}
-        result = self.process(dic)
-        document.update(result)
+        document = document_collection.find_one()
+        result = self.process(document)
+        document_collection.update({"_id": document_id}, {"$set": result})
         return document_id
 
     def process(self, document):
diff --git a/pypln/backend/config.py b/pypln/backend/config.py
index f06fb1e..f074c3a 100644
--- a/pypln/backend/config.py
+++ b/pypln/backend/config.py
@@ -5,6 +5,7 @@ def get_store_config():
     defaults = {'host': 'localhost',
                 'port': '27017',
                 'database': 'pypln_dev',
+                'collection': 'documents',
                 'gridfs_collection': 'files',
                 }
     config = ConfigParser.ConfigParser(defaults=defaults)
diff --git a/pypln/backend/mongodict_adapter.py b/pypln/backend/mongodict_adapter.py
deleted file mode 100644
index b57d322..0000000
--- a/pypln/backend/mongodict_adapter.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# coding: utf-8
-#
-# Copyright 2012 NAMD-EMAP-FGV
-#
-# This file is part of PyPLN. You can get more information at: http://pypln.org/.
-# -# PyPLN is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# PyPLN is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with PyPLN. If not, see . - -from mongodict import MongoDict - - -class MongoDictAdapter(MongoDict): - #TODO: implement clear, __iter__, __len__ and contains with filters by id - def __init__(self, doc_id, *args, **kwargs): - self.doc_id = doc_id - self.prefix = 'id:{}:'.format(self.doc_id) - self.prefixed_id_query = {'$regex': - '^{}'.format(self.prefix)} - return super(MongoDictAdapter, self).__init__(*args, **kwargs) - - def __setitem__(self, key, value): - key = 'id:{}:{}'.format(self.doc_id, key) - return super(MongoDictAdapter, self).__setitem__(key, value) - - def __getitem__(self, key): - key = 'id:{}:{}'.format(self.doc_id, key) - return super(MongoDictAdapter, self).__getitem__(key) - - def __delitem__(self, key): - key = 'id:{}:{}'.format(self.doc_id, key) - return super(MongoDictAdapter, self).__delitem__(key) - - def __contains__(self, key): - # If this is being called by other methods (like __delitem__) - # it will already have the prefix - if not key.startswith('id:'): - key = 'id:{}:{}'.format(self.doc_id, key) - return super(MongoDictAdapter, self).__contains__(key) - - has_key = __contains__ - - def __iter__(self): - query_result = self._collection.find({'_id': - self.prefixed_id_query}, {'_id': 1}) - keys = (k['_id'].replace(self.prefix, '', 1) for k in query_result) - return keys - - def __len__(self): - return self._collection.find({'_id': self.prefixed_id_query}).count() - - def clear(self): - self._collection.remove({'_id': self.prefixed_id_query}) diff --git a/tests/test_mongodict_adapter.py b/tests/test_mongodict_adapter.py deleted file mode 100644 index af1880d..0000000 --- a/tests/test_mongodict_adapter.py +++ /dev/null @@ -1,120 +0,0 @@ -# coding: utf-8 -# -# Copyright 2012 NAMD-EMAP-FGV -# -# This file is part of PyPLN. You can get more information at: http://pypln.org/. -# -# PyPLN is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# PyPLN is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with PyPLN. If not, see . 
- -import pickle -import unittest - -from bson import Binary -import pymongo - -from pypln.backend.mongodict_adapter import MongoDictAdapter - - - -class TestMongoDictAdapter(unittest.TestCase): - db_name = 'test_mongodictbyid' - - def setUp(self): - self.fake_id = '1234' - self.document = MongoDictAdapter(self.fake_id, database=self.db_name) - self.db = pymongo.Connection()[self.db_name] - - def tearDown(self): - self.db.main.remove({}) - - @classmethod - def tearDownClass(cls): - pymongo.MongoClient().drop_database(cls.db_name) - - def test_creating_a_new_key_should_saved_the_information(self): - self.document['new_key'] = 'value' - stored_value = self.db.main.find_one( - {'_id': 'id:{}:new_key'.format(self.fake_id)}) - self.assertIsNotNone(stored_value) - # This decodes the value with the defaults for MongoDict - decoded_value = pickle.loads(str(stored_value['v'])) - self.assertEqual(decoded_value, 'value') - - def test_reading_an_existing_key_should_read_saved_information(self): - encoded_value = Binary(pickle.dumps( - 'value', protocol=pickle.HIGHEST_PROTOCOL)) - - self.db.main.insert( - {'_id': 'id:{}:key'.format(self.fake_id), 'v': encoded_value}) - - self.assertEqual(self.document['key'], 'value') - - def test_deleting_an_existing_key_should_delete_saved_information(self): - encoded_value = Binary(pickle.dumps( - 'value', protocol=pickle.HIGHEST_PROTOCOL)) - - self.db.main.insert( - {'_id': 'id:{}:key'.format(self.fake_id), 'v': encoded_value}) - - self.assertEqual(self.document['key'], 'value') - del self.document['key'] - - stored_value = self.db.main.find_one( - {'_id': 'id:{}:key'.format(self.fake_id)}) - self.assertIsNone(stored_value) - - def test_iterating_through_keys_does_not_bring_keys_from_other_docs(self): - self.document['key_1'] = 1 - self.document['key_2'] = 2 - other_document = MongoDictAdapter('other_id', database=self.db_name) - other_document['other_key'] = 3 - keys = [k for k in self.document] - - self.assertIn('key_1', keys) - self.assertIn('key_2', keys) - self.assertNotIn('key_3', keys) - - self.assertEquals(['key_1', 'key_2'], self.document.keys()) - - def test_clear_should_not_remove_keys_for_other_docs(self): - self.document['key_1'] = 1 - self.document['key_2'] = 2 - other_document = MongoDictAdapter('other_id', database=self.db_name) - other_document['other_key'] = 3 - - self.document.clear() - - with self.assertRaises(KeyError): - self.document['key_1'] - self.document['key_2'] - - self.assertEqual(other_document['other_key'], 3) - - def test_return_correct_length(self): - self.document['key_1'] = 1 - self.document['key_2'] = 2 - other_document = MongoDictAdapter('other_id', database=self.db_name) - other_document['other_key'] = 3 - - self.assertEquals(len(self.document), 2) - - def test_contains(self): - self.document['key'] = 1 - self.assertIn('key', self.document) - self.assertNotIn('inexistent_key', self.document) - - def test_has_key(self): - self.document['key'] = 1 - self.assertTrue(self.document.has_key('key')) - self.assertFalse(self.document.has_key('inexistent_key')) From f8ecdd4fa386afe7b9c489f27440a9fc345c5d33 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 6 Aug 2015 19:26:54 +0200 Subject: [PATCH 02/25] Adapts test utils for the new database format Includes changes for the freqdist worker tests. 
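The expected frequency distribution below also changes from a list of
tuples to a list of lists: BSON has no tuple type, so pymongo stores
tuples as arrays and hands them back as plain Python lists. A minimal
illustration of that round-trip (the collection name is hypothetical):

    collection.insert({'freqdist': [(u'is', 2), (u'the', 2)]})
    collection.find_one()['freqdist']  # -> [[u'is', 2], [u'the', 2]]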
---
 tests/test_worker_freqdist.py | 14 +++++++-------
 tests/utils.py                |  6 ++----
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/tests/test_worker_freqdist.py b/tests/test_worker_freqdist.py
index d8d61a9..8a97193 100644
--- a/tests/test_worker_freqdist.py
+++ b/tests/test_worker_freqdist.py
@@ -22,18 +22,18 @@
 
 class TestFreqDistWorker(TaskTest):
     def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self):
-        tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is',
-                  'yellow', '.']
+        tokens = [u'The', u'sky', u'is', u'blue', u',', u'the', u'sun', u'is',
+                  u'yellow', u'.']
 
-        expected_fd = [('is', 2), ('the', 2), ('blue', 1), ('sun', 1),
-                       ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)]
+        expected_fd = [[u'is', 2], [u'the', 2], [u'blue', 1], [u'sun', 1],
+                       [u'sky', 1], [u',', 1], [u'yellow', 1], [u'.', 1]]
 
         # This is just preparing the expected input in the database
-        self.document['tokens'] = tokens
+        doc_id = self.collection.insert({'tokens': tokens})
 
-        FreqDist().delay(self.fake_id)
+        FreqDist().delay(doc_id)
 
-        resulting_fd = self.document['freqdist']
+        resulting_fd = self.collection.find_one({'_id': doc_id})['freqdist']
 
         self.assertEqual(resulting_fd, expected_fd)
diff --git a/tests/utils.py b/tests/utils.py
index 04e7c7c..452bc79 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -21,7 +21,6 @@
 import unittest
 
 import pymongo
-from pypln.backend.mongodict_adapter import MongoDictAdapter
 
 from pypln.backend.celery_app import app
 from pypln.backend import config
@@ -33,12 +32,11 @@ class TaskTest(unittest.TestCase):
 
     def setUp(self):
         app.conf.update(CELERY_ALWAYS_EAGER=True)
-        self.fake_id = '1234'
-        self.document = MongoDictAdapter(self.fake_id, database=self.db_name)
         self.db = pymongo.Connection()[self.db_name]
+        self.collection = self.db[config.MONGODB_CONFIG['collection']]
 
     def tearDown(self):
-        self.db.main.remove({})
+        self.collection.remove({})
 
     @classmethod
     def setUpClass(cls):

From acca8e8fb08bf366fa8f66744da3cfe581205518 Mon Sep 17 00:00:00 2001
From: Flavio Amieiro
Date: Thu, 6 Aug 2015 19:30:15 +0200
Subject: [PATCH 03/25] Fixes the name of the property for the index id in
 elasticindexer

---
 pypln/backend/workers/elastic_indexer.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/pypln/backend/workers/elastic_indexer.py b/pypln/backend/workers/elastic_indexer.py
index faf8119..f5b55c3 100644
--- a/pypln/backend/workers/elastic_indexer.py
+++ b/pypln/backend/workers/elastic_indexer.py
@@ -36,6 +36,12 @@ def process(self, document):
         # See `test_regression_indexing_should_not_include_contents` in
         # tests/test_elastic_indexer.py for details.
         document.pop('contents')
+        # We also need to exclude _id, because ObjectId's won't be
+        # serializable.
+ document.pop("_id") + result = ES.index(index=index_name, doc_type=doc_type, body=document, id=file_id) + index_id = result.pop("_id") + result["index_id"] = index_id return result From 3b735cd4df5ed7dadf6b13159d51772f1fc337be Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 6 Aug 2015 19:31:21 +0200 Subject: [PATCH 04/25] Adapts elasticsearch worker tests --- tests/test_elastic_indexer.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/tests/test_elastic_indexer.py b/tests/test_elastic_indexer.py index a35dd23..b76dacd 100644 --- a/tests/test_elastic_indexer.py +++ b/tests/test_elastic_indexer.py @@ -32,9 +32,10 @@ def test_indexing_go_through(self): 'contents': 'raw_file_contents', } - self.document.update(doc) - ElasticIndexer().delay(self.fake_id) - assert self.document['created'] # must be True + doc_id = self.collection.insert(doc) + ElasticIndexer().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertTrue(refreshed_document['created']) @patch('pypln.backend.workers.elastic_indexer.ES') def test_regression_indexing_should_not_include_contents(self, ES): @@ -54,11 +55,12 @@ def test_regression_indexing_should_not_include_contents(self, ES): 'contents': 'raw_file_contents', } - self.document.update(doc) - ElasticIndexer().delay(self.fake_id) + doc_id = self.collection.insert(doc) + ElasticIndexer().delay(doc_id) # remove properties that won't be indexed index_name = doc.pop("index_name") doc_type = doc.pop('doc_type') doc.pop('contents') + doc.pop('_id') ES.index.assert_called_with(body=doc, id=doc['file_id'], doc_type=doc_type, index=index_name) From 4ae26540123b3c0b5ba80a0cf4740d25e863d5a7 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 6 Aug 2015 19:33:46 +0200 Subject: [PATCH 05/25] Adapts tests for bigrams worker to the removal of MongoDict --- tests/test_worker_bigrams.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index 91334b9..bf0225e 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -33,11 +33,12 @@ def test_bigrams_should_return_correct_score(self): tokens = [w for w in nltk.corpus.genesis.words('english-web.txt')] - self.document['tokens'] = tokens + doc_id = self.collection.insert({'tokens': tokens}) bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens) expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u',', u'which') - Bigrams().delay(self.fake_id) - bigram_rank = self.document['bigram_rank'] + Bigrams().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + bigram_rank = refreshed_document['bigram_rank'] result = bigram_rank[0][1][0] self.assertEqual(result, expected) From ad901fb086f776426a95c51f61cf9f71b9bb1b3e Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Mon, 10 Aug 2015 17:41:48 +0200 Subject: [PATCH 06/25] Fixes huge bug in PyPLNTask PyPLNTask wasn't filtering by the document's "_id". It was returning the first document it found. This commit includes a regression test and fixes the bug. 
---
 pypln/backend/celery_task.py |  2 +-
 tests/test_celery_task.py    | 39 ++++++++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_celery_task.py

diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py
index ba0cf8b..bced3a2 100644
--- a/pypln/backend/celery_task.py
+++ b/pypln/backend/celery_task.py
@@ -51,7 +51,7 @@ def run(self, document_id):
         It will call the `process` method with a dictionary containing all
         the document information and will update de database with results.
         """
-        document = document_collection.find_one()
+        document = document_collection.find_one({"_id": document_id})
         result = self.process(document)
         document_collection.update({"_id": document_id}, {"$set": result})
         return document_id
diff --git a/tests/test_celery_task.py b/tests/test_celery_task.py
new file mode 100644
index 0000000..fd1adde
--- /dev/null
+++ b/tests/test_celery_task.py
@@ -0,0 +1,39 @@
+# coding: utf-8
+#
+# Copyright 2015 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+from pypln.backend.celery_task import PyPLNTask
+from utils import TaskTest
+
+class FakeTask(PyPLNTask):
+    def process(self, document):
+        return {'result': document['input']}
+
+class TestCeleryTask(TaskTest):
+    def test_task_should_get_the_correct_document(self):
+        """This is a regression test. PyPLNTask was not filtering by _id. It
+        was getting the first document it found. """
+
+        # This is just preparing the expected input in the database
+        wrong_doc_id = self.collection.insert({'input': 'wrong'}, w=1)
+        correct_doc_id = self.collection.insert({'input': 'correct'}, w=1)
+
+        FakeTask().delay(correct_doc_id)
+
+        refreshed_doc = self.collection.find_one({'_id': correct_doc_id})
+
+        self.assertEqual(refreshed_doc['result'], 'correct')

From 38e3f7e5a91f5705713d690c8d8b348f18e848c6 Mon Sep 17 00:00:00 2001
From: Flavio Amieiro
Date: Mon, 10 Aug 2015 18:12:46 +0200
Subject: [PATCH 07/25] Changes GridFSDataRetriever to base64 encode the
 contents before storing

This is necessary because the content of the file might be binary data
(such as a pdf), and we don't want to change it by encoding/decoding.
This was not an issue before because MongoDict used to pickle this
before storing.

This kind of adaptation is probably going to happen in other places
where the mapping from python objects to json is not straightforward.
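A minimal sketch of the intended round-trip, assuming the consumer of
'contents' decodes it again before use (the Extractor is adapted to do
exactly that in a later commit in this series):

    import base64

    # Storing: the file may be arbitrary binary data (e.g. a pdf).
    encoded = base64.b64encode(file_data.read())

    # Reading back: decode before touching the raw bytes.
    contents = base64.b64decode(encoded)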
--- .../backend/workers/gridfs_data_retriever.py | 11 ++++++- tests/test_worker_gridfs_data_retriever.py | 31 +++++++++++++------ 2 files changed, 31 insertions(+), 11 deletions(-) diff --git a/pypln/backend/workers/gridfs_data_retriever.py b/pypln/backend/workers/gridfs_data_retriever.py index 268cb21..68ed916 100644 --- a/pypln/backend/workers/gridfs_data_retriever.py +++ b/pypln/backend/workers/gridfs_data_retriever.py @@ -16,6 +16,7 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import base64 from bson import ObjectId from gridfs import GridFS import pymongo @@ -31,9 +32,17 @@ def process(self, document): gridfs = GridFS(database, config.MONGODB_CONFIG['gridfs_collection']) file_data = gridfs.get(ObjectId(document['file_id'])) + + # We decided to store 'contents' as a base64 encoded string in the + # database to avoid possible corruption of files. For example: when + # it's a pdf, the process of storing the data as utf-8 in mongo might + # be corrupting the file. This wasn't a problem before, because + # MongoDict pickled everything before storing. + contents = base64.b64encode(file_data.read()) + result = {'length': file_data.length, 'md5': file_data.md5, 'filename': file_data.filename, 'upload_date': file_data.upload_date, - 'contents': file_data.read()} + 'contents': contents} return result diff --git a/tests/test_worker_gridfs_data_retriever.py b/tests/test_worker_gridfs_data_retriever.py index ffdfce5..da54f2f 100644 --- a/tests/test_worker_gridfs_data_retriever.py +++ b/tests/test_worker_gridfs_data_retriever.py @@ -17,6 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import base64 import bson from gridfs import GridFS from pypln.backend.workers import GridFSDataRetriever @@ -31,19 +32,29 @@ def test_extract_file_data_from_GridFS(self): new_file_id = gridfs.put(content) expected_file_data = gridfs.get(new_file_id) - self.document['file_id'] = str(new_file_id) - GridFSDataRetriever().delay(self.fake_id) + data = {'file_id': str(new_file_id)} + doc_id = self.collection.insert(data) + GridFSDataRetriever().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(self.document['contents'], content) - self.assertEqual(self.document['length'], expected_file_data.length) - self.assertEqual(self.document['md5'], expected_file_data.md5) - self.assertEqual(self.document['filename'], expected_file_data.filename) - self.assertEqual(self.document['upload_date'], expected_file_data.upload_date) - self.assertEqual(self.document['contents'], expected_file_data.read()) + self.assertEqual(refreshed_document['contents'], + base64.b64encode(content)) + self.assertEqual(refreshed_document['length'], + expected_file_data.length) + self.assertEqual(refreshed_document['md5'], expected_file_data.md5) + self.assertEqual(refreshed_document['filename'], + expected_file_data.filename) + self.assertEqual(refreshed_document['upload_date'], + expected_file_data.upload_date) + self.assertEqual(refreshed_document['contents'], + base64.b64encode(expected_file_data.read())) def test_task_raises_exception_when_file_does_not_exist(self): - self.document['file_id'] = "Inexistent document" - result = GridFSDataRetriever().delay(self.fake_id) + data = {'file_id': "Inexistent document"} + doc_id = self.collection.insert(data) + result = GridFSDataRetriever().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + 
self.assertTrue(result.failed()) self.assertEqual(result.status, "FAILURE") self.assertIsInstance(result.info, bson.errors.InvalidId) From 2c7bb074a150ab4378ceabf16a1e43a5ee2dc499 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Mon, 10 Aug 2015 18:27:44 +0200 Subject: [PATCH 08/25] Adapts Extractor to the removal of MongoDict Not only this means fixing tests, but also decoding the base64 encoded string that represents the contents of the file (see 38e3f7e5a9 for more info). Thanks @israelst for the help discussing this. --- pypln/backend/workers/extractor.py | 10 +- tests/test_worker_extractor.py | 171 +++++++++++++++++------------ 2 files changed, 106 insertions(+), 75 deletions(-) diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 09e1b32..110730b 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -17,6 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import base64 import shlex from HTMLParser import HTMLParser @@ -169,15 +170,16 @@ class Extractor(PyPLNTask): #TODO: should 'replace_with' be '' when extracting from HTML? def process(self, file_data): + contents = base64.b64decode(file_data['contents']) with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m: - file_mime_type = m.id_buffer(file_data['contents']) + file_mime_type = m.id_buffer(contents) metadata = {} if file_mime_type == 'text/plain': - text = file_data['contents'] + text = contents elif file_mime_type == 'text/html': - text = parse_html(file_data['contents'], True, ['script', 'style']) + text = parse_html(contents, True, ['script', 'style']) elif file_mime_type == 'application/pdf': - text, metadata = extract_pdf(file_data['contents']) + text, metadata = extract_pdf(contents) else: # If we can't detect the mimetype we add a flag that can be read by # the frontend to provide more information on why the document diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index 489bc02..3b03736 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -17,6 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import base64 import os from textwrap import dedent from pypln.backend.workers import Extractor @@ -28,54 +29,64 @@ class TestExtractorWorker(TaskTest): def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'test.txt') - self.document.update({'filename': filename, - 'contents': open(filename).read()}) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) - self.assertEqual(self.document['file_metadata'], {}) - self.assertEqual(self.document['mimetype'], 'text/plain') + doc_id = self.collection.insert({'filename': filename, + 'contents': base64.b64encode(open(filename).read())}) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) + self.assertEqual(refreshed_document['file_metadata'], {}) + self.assertEqual(refreshed_document['mimetype'], 'text/plain') def test_extraction_from_html_file(self): expected = "This is a test file. I'm testing PyPLN extractor worker!" 
filename = os.path.join(DATA_DIR, 'test.html') - data = {'filename': filename, 'contents': open(filename).read()} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) - self.assertEqual(self.document['file_metadata'], {}) - self.assertEqual(self.document['mimetype'], 'text/html') + # When saving directly to mongodb we always get everything back from + # the database as unicode. Because of that, the extractor is having + # problems when there is a non-ascii character in the content. This + # wasn't a problem before because with mongodict we used to keep a + # pickled representation of the data. + data = {'filename': filename, + 'contents': base64.b64encode(open(filename).read())} + doc_id = self.collection.insert(data) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) + self.assertEqual(refreshed_document['file_metadata'], {}) + self.assertEqual(refreshed_document['mimetype'], 'text/html') def test_extraction_from_pdf_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'test.pdf') - data = {'filename': filename, 'contents': open(filename).read()} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) + data = {'filename': filename, + 'contents': base64.b64encode(open(filename).read())} + doc_id = self.collection.insert(data) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) # Check that the expected metadata is a subset of what # our Extractor found (it may have found more details # depending on the toolset used to extract metadata) metadata_expected = { - 'Author': 'Álvaro Justen', - 'Creator': 'Writer', - 'Producer': 'LibreOffice 3.5', - 'CreationDate': 'Fri Jun 1 17:07:57 2012', - 'Tagged': 'no', - 'Pages': '1', - 'Encrypted': 'no', - 'Page size': '612 x 792 pts (letter)', - 'Optimized': 'no', - 'PDF version': '1.4', + u'Author': u'Álvaro Justen', + u'Creator': u'Writer', + u'Producer': u'LibreOffice 3.5', + u'CreationDate': u'Fri Jun 1 17:07:57 2012', + u'Tagged': u'no', + u'Pages': u'1', + u'Encrypted': u'no', + u'Page size': u'612 x 792 pts (letter)', + u'Optimized': u'no', + u'PDF version': u'1.4', } metadata_expected_set = set(metadata_expected.iteritems()) - metadata = self.document['file_metadata'] + metadata = refreshed_document['file_metadata'] metadata_set = set(metadata.iteritems()) diff_set = metadata_expected_set - metadata_set self.assertTrue(metadata_expected_set.issubset(metadata_set), ("Extracted metadata is not a subset of the expected metadata. 
" "Items missing or with different values: {}").format( u", ".join(unicode(item) for item in diff_set))) - self.assertEqual(self.document['mimetype'], 'application/pdf') + self.assertEqual(refreshed_document['mimetype'], 'application/pdf') def test_extraction_from_html(self): contents = dedent(''' @@ -101,9 +112,10 @@ def test_extraction_from_html(self): ''') - data = {'filename': 'test.html', 'contents': contents} - self.document.update(data) - result = Extractor().delay(self.fake_id) + data = {'filename': 'test.html', + 'contents': base64.b64encode(contents)} + doc_id = self.collection.insert(data) + Extractor().delay(doc_id) expected = dedent(''' Testing @@ -121,75 +133,92 @@ def test_extraction_from_html(self): bla1 bla2''').strip() - self.assertEqual(self.document['text'], expected) - self.assertEqual(self.document['mimetype'], 'text/html') + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) + self.assertEqual(refreshed_document['mimetype'], 'text/html') def test_language_detection_pt(self): text_pt = 'Esse texto foi escrito por Álvaro em Português.' - data_pt = {'filename': 'text-pt.txt', 'contents': text_pt} - self.document.update(data_pt) - Extractor().delay(self.fake_id).get() - self.assertEqual(self.document['language'], 'pt') + data_pt = {'filename': 'text-pt.txt', + 'contents': base64.b64encode(text_pt)} + doc_id = self.collection.insert(data_pt) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['language'], 'pt') def test_language_detection_es(self): text_es = 'Este texto ha sido escrito en Español por Álvaro.' - data_es = {'filename': 'text-es.txt', 'contents': text_es} - self.document.update(data_es) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['language'], 'es') + data_es = {'filename': 'text-es.txt', + 'contents': base64.b64encode(text_es)} + doc_id = self.collection.insert(data_es) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['language'], 'es') def test_language_detection_en(self): text_en = 'This text was written by Álvaro in English.' - data_en = {'filename': 'text-en.txt', 'contents': text_en} - self.document.update(data_en) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['language'], 'en') + data_en = {'filename': 'text-en.txt', + 'contents': base64.b64encode(text_en)} + doc_id = self.collection.insert(data_en) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['language'], 'en') def test_unescape_html_entities(self): expected = (u"This text has html . 
Álvaro asked me to make" " sure it also has non ascii chars.") filename = os.path.join(DATA_DIR, 'test_html_entities.txt') - data = {'filename': filename, 'contents': open(filename).read()} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) + data = {'filename': filename, + 'contents': base64.b64encode(open(filename).read())} + doc_id = self.collection.insert(data) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) def test_should_detect_encoding_and_return_a_unicode_object(self): expected = u"Flávio" filename = os.path.join(DATA_DIR, 'test_iso-8859-1.txt') - data = {'filename': filename, 'contents': open(filename).read()} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) - self.assertEqual(type(self.document['text']), unicode) + data = {'filename': filename, + 'contents': base64.b64encode(open(filename).read())} + doc_id = self.collection.insert(data) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) + self.assertEqual(type(refreshed_document['text']), unicode) def test_should_guess_mimetype_for_file_without_extension(self): contents = "This is a test file. I'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'text_file') - data = {'filename': filename, 'contents': contents} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['mimetype'], 'text/plain') + data = {'filename': filename, + 'contents': base64.b64encode(contents)} + doc_id = self.collection.insert(data) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['mimetype'], 'text/plain') def test_unknown_mimetype_should_be_flagged(self): filename = os.path.join(DATA_DIR, 'random_file') # we can't put the expected text content here, so we'll just make sure # it's equal to the input content, since contents = open(filename).read() - data = {'filename': filename, 'contents': contents} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['mimetype'], 'unknown') - self.assertEqual(self.document['text'], "") - self.assertEqual(self.document['language'], "") - self.assertEqual(self.document['file_metadata'], {}) + data = {'filename': filename, + 'contents': base64.b64encode(contents)} + doc_id = self.collection.insert(data) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['mimetype'], 'unknown') + self.assertEqual(refreshed_document['text'], "") + self.assertEqual(refreshed_document['language'], "") + self.assertEqual(refreshed_document['file_metadata'], {}) def test_unknown_encoding_should_be_ignored(self): filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt') expected = u"This file has a weird byte (\x96) that makes it impossible for libmagic to recognize it's encoding." 
- data = {'filename': filename, 'contents': open(filename).read()} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) - self.assertEqual(self.document['file_metadata'], {}) - self.assertEqual(self.document['language'], 'en') + data = {'filename': filename, + 'contents': base64.b64encode(open(filename).read())} + doc_id = self.collection.insert(data) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) + self.assertEqual(refreshed_document['file_metadata'], {}) + self.assertEqual(refreshed_document['language'], 'en') From d3ca42e586be6e5c9f029bc095a3c6b7ee05a792 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Mon, 31 Aug 2015 14:47:15 -0300 Subject: [PATCH 09/25] Adapts GridFSFileDeleter to the removal of MongoDict --- pypln/backend/workers/gridfs_file_deleter.py | 6 +++--- tests/test_worker_gridfs_file_deleter.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/pypln/backend/workers/gridfs_file_deleter.py b/pypln/backend/workers/gridfs_file_deleter.py index c1dc15f..36ea082 100644 --- a/pypln/backend/workers/gridfs_file_deleter.py +++ b/pypln/backend/workers/gridfs_file_deleter.py @@ -25,9 +25,9 @@ class GridFSFileDeleter(PyPLNTask): def process(self, document): - database = pymongo.MongoClient(host=config.MONGODB_CONFIG['host'], - port=config.MONGODB_CONFIG['port'] - )[config.MONGODB_CONFIG['database']] + mongo_client = pymongo.MongoClient(host=config.MONGODB_CONFIG["host"], + port=config.MONGODB_CONFIG["port"]) + database = mongo_client[config.MONGODB_CONFIG["database"]] gridfs = GridFS(database, config.MONGODB_CONFIG['gridfs_collection']) gridfs.delete(ObjectId(document['file_id'])) diff --git a/tests/test_worker_gridfs_file_deleter.py b/tests/test_worker_gridfs_file_deleter.py index 60f9613..cec90b3 100644 --- a/tests/test_worker_gridfs_file_deleter.py +++ b/tests/test_worker_gridfs_file_deleter.py @@ -31,16 +31,16 @@ def test_delete_file_from_GridFS(self): new_file_id = gridfs.put(content) expected_file_data = gridfs.get(new_file_id) - self.document['file_id'] = str(new_file_id) + doc_id = self.collection.insert({'file_id': new_file_id}) self.assertTrue(gridfs.exists(new_file_id)) - GridFSFileDeleter().delay(self.fake_id) + GridFSFileDeleter().delay(doc_id) self.assertFalse(gridfs.exists(new_file_id)) def test_task_raises_exception_when_file_does_not_exist(self): - self.document['file_id'] = "Inexistent document" - result = GridFSFileDeleter().delay(self.fake_id) + doc_id = self.collection.insert({'file_id': "Inexistent document"}) + result = GridFSFileDeleter().delay(doc_id) self.assertTrue(result.failed()) self.assertEqual(result.status, "FAILURE") self.assertIsInstance(result.info, bson.errors.InvalidId) From a1880b8c0fbb6ab1b181f1653e923f61d941d987 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Mon, 31 Aug 2015 15:42:42 -0300 Subject: [PATCH 10/25] Adapts Lemmatizer tests to the removal of MongoDict --- tests/test_worker_lemmatizer_pt.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_worker_lemmatizer_pt.py b/tests/test_worker_lemmatizer_pt.py index 2dc4c62..fb96cd5 100644 --- a/tests/test_worker_lemmatizer_pt.py +++ b/tests/test_worker_lemmatizer_pt.py @@ -42,8 +42,9 @@ def test_lemmatizer_should_return_a_list_with_lemmas(self): ''').strip() + '\n\n' - self.document['palavras_raw'] = palavras_output - self.document['palavras_raw_ran'] = True - 
result = Lemmatizer().delay(self.fake_id) + doc = {'palavras_raw': palavras_output, 'palavras_raw_ran': True} + doc_id = self.collection.insert(doc) + result = Lemmatizer().delay(doc_id) expected = 'eu saber que em este momento falar para todo Brasil .'.split() - self.assertEqual(self.document['lemmas'], expected) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['lemmas'], expected) From 077d6a93afcffd1f19a201f00415a1688a0c081f Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Mon, 31 Aug 2015 17:19:32 -0300 Subject: [PATCH 11/25] Adapts NounPhrase worker to the removal of MongoDict --- tests/test_worker_palavras_noun_phrase.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_worker_palavras_noun_phrase.py b/tests/test_worker_palavras_noun_phrase.py index 950f8ae..d9e601e 100644 --- a/tests/test_worker_palavras_noun_phrase.py +++ b/tests/test_worker_palavras_noun_phrase.py @@ -44,9 +44,10 @@ def test_noun_phrase_worker_should_return_a_list_with_phrases(self): ''').strip() + '\n\n' - self.document.update({'palavras_raw': palavras_output, + doc_id = self.collection.insert({'palavras_raw': palavras_output, 'palavras_raw_ran': True}) - NounPhrase().delay(self.fake_id) + NounPhrase().delay(doc_id) expected = ['_este *momento', 'todo o *povo de_ _o Brasil .', '_o *Brasil .'] - self.assertEqual(self.document['noun_phrases'], expected) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['noun_phrases'], expected) From 23db88009b82ff9dd53a295b14d49f354f5f704f Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Mon, 31 Aug 2015 17:51:08 -0300 Subject: [PATCH 12/25] Adapts WordCloud worker to the removal of MongoDict --- tests/test_worker_wordcloud.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_worker_wordcloud.py b/tests/test_worker_wordcloud.py index 56c48f2..dd0f6e0 100644 --- a/tests/test_worker_wordcloud.py +++ b/tests/test_worker_wordcloud.py @@ -29,12 +29,13 @@ class TestFreqDistWorker(TaskTest): name = "WordCloud" def test_wordcloud_should_return_a_base64_encoded_png(self): - self.document['freqdist'] = [('is', 2), ('the', 2), ('blue', 1), ('sun', 1), - ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)] - self.document['language'] = 'en' - WordCloud().delay(self.fake_id).get() + doc = {'freqdist': [('is', 2), ('the', 2), ('blue', 1), ('sun', 1), + ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)], 'language': 'en'} + doc_id = self.collection.insert(doc) + WordCloud().delay(doc_id) - raw_png_data = base64.b64decode(self.document['wordcloud']) + refreshed_document = self.collection.find_one({'_id': doc_id}) + raw_png_data = base64.b64decode(refreshed_document['wordcloud']) fake_file = StringIO(raw_png_data) img = Image.open(fake_file) From 02c620dff5b3034dab1f32f55946673efc643733 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Mon, 31 Aug 2015 17:53:29 -0300 Subject: [PATCH 13/25] Removes trailing `_` from Trigram worker test name --- tests/test_worker_trigrams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py index 6b1818f..d99b676 100644 --- a/tests/test_worker_trigrams.py +++ b/tests/test_worker_trigrams.py @@ -26,7 +26,7 @@ class TestTrigramWorker(TaskTest): - def test_Trigrams_should_return_correct_score_(self): + def test_Trigrams_should_return_correct_score(self): tokens = [w for w in 
nltk.corpus.genesis.words('english-web.txt')] trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens) From 565c380b11397c29e8b6c8889c9be42e787d72f6 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 1 Sep 2015 10:13:54 -0300 Subject: [PATCH 14/25] Adapts Tokenizer test to the removal of MongoDict --- tests/test_worker_tokenizer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/test_worker_tokenizer.py b/tests/test_worker_tokenizer.py index 6d3e4d3..eb99a93 100644 --- a/tests/test_worker_tokenizer.py +++ b/tests/test_worker_tokenizer.py @@ -23,17 +23,19 @@ class TestTokenizerWorker(TaskTest): def test_tokenizer_should_receive_text_and_return_tokens(self): - self.document['text'] = 'The sky is blue, the sun is yellow. This is another sentence.' + doc = {'text': 'The sky is blue, the sun is yellow. This is another sentence.'} expected_tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', 'yellow', '.', 'This', 'is', 'another', 'sentence', '.'] expected_sentences = [['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', 'yellow', '.'], ['This', 'is', 'another', 'sentence', '.']] - Tokenizer().delay(self.fake_id) + doc_id = self.collection.insert(doc) + Tokenizer().delay(doc_id) - tokens = self.document['tokens'] - sentences = self.document['sentences'] + refreshed_document = self.collection.find_one({'_id': doc_id}) + tokens = refreshed_document['tokens'] + sentences = refreshed_document['sentences'] self.assertEqual(tokens, expected_tokens) self.assertEqual(sentences, expected_sentences) From d8f7435cd13504f492ed560a7ce50db89d47b850 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 1 Sep 2015 10:19:31 -0300 Subject: [PATCH 15/25] Adapts the Statistics worker tests to the removal of MongoDict --- tests/test_worker_statistics.py | 48 ++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 21 deletions(-) diff --git a/tests/test_worker_statistics.py b/tests/test_worker_statistics.py index efc412f..d3f3185 100644 --- a/tests/test_worker_statistics.py +++ b/tests/test_worker_statistics.py @@ -23,26 +23,32 @@ class TestStatisticsWorker(TaskTest): def test_simple(self): - self.document['sentences'] = [['this', 'is', 'a', 'test', '.'], - ['this', 'is', 'another', '!']] - self.document['freqdist'] = [('this', 2), ('is', 2), ('a', 1), - ('test', 1), ('.', 1), ('another', 1), ('!', 1)] - Statistics().delay(self.fake_id) - self.assertEqual(self.document['average_sentence_length'], 4.5) - self.assertEqual(self.document['average_sentence_repertoire'], 1) - self.assertAlmostEqual(self.document['momentum_1'], 1.2857, places=3) - self.assertAlmostEqual(self.document['momentum_2'], 1.8571, places=3) - self.assertEqual(self.document['momentum_3'], 3) - self.assertAlmostEqual(self.document['momentum_4'], 5.2857, places=3) - self.assertAlmostEqual(self.document['repertoire'], 0.7777, places=3) + doc = {'sentences': [['this', 'is', 'a', 'test', '.'], ['this', 'is', + 'another', '!']], 'freqdist': [('this', 2), ('is', 2), ('a', 1), + ('test', 1), ('.', 1), ('another', 1), ('!', 1)]} + doc_id = self.collection.insert(doc) + Statistics().delay(doc_id) + + refreshed_document = self.collection.find_one({'_id': doc_id}) + + self.assertEqual(refreshed_document['average_sentence_length'], 4.5) + self.assertEqual(refreshed_document['average_sentence_repertoire'], 1) + self.assertAlmostEqual(refreshed_document['momentum_1'], 1.2857, places=3) + self.assertAlmostEqual(refreshed_document['momentum_2'], 1.8571, places=3) + 
self.assertEqual(refreshed_document['momentum_3'], 3) + self.assertAlmostEqual(refreshed_document['momentum_4'], 5.2857, places=3) + self.assertAlmostEqual(refreshed_document['repertoire'], 0.7777, places=3) def test_zero_division_error(self): - self.document.update({'freqdist': [], 'sentences': []}) - Statistics().delay(self.fake_id) - self.assertEqual(self.document['average_sentence_length'], 0) - self.assertEqual(self.document['average_sentence_repertoire'], 0) - self.assertEqual(self.document['momentum_1'], 0) - self.assertEqual(self.document['momentum_2'], 0) - self.assertEqual(self.document['momentum_3'], 0) - self.assertEqual(self.document['momentum_4'], 0) - self.assertEqual(self.document['repertoire'], 0) + doc_id = self.collection.insert({'freqdist': [], 'sentences': []}) + + Statistics().delay(doc_id) + + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['average_sentence_length'], 0) + self.assertEqual(refreshed_document['average_sentence_repertoire'], 0) + self.assertEqual(refreshed_document['momentum_1'], 0) + self.assertEqual(refreshed_document['momentum_2'], 0) + self.assertEqual(refreshed_document['momentum_3'], 0) + self.assertEqual(refreshed_document['momentum_4'], 0) + self.assertEqual(refreshed_document['repertoire'], 0) From dfc69f0a4245a6c3ec344fac1b437e76ccb49e73 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 1 Sep 2015 10:22:20 -0300 Subject: [PATCH 16/25] Adapts Spellchecker worker tests to the removal of MongoDict --- tests/test_worker_spellchecker.py | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/tests/test_worker_spellchecker.py b/tests/test_worker_spellchecker.py index 7700b75..646b89b 100644 --- a/tests/test_worker_spellchecker.py +++ b/tests/test_worker_spellchecker.py @@ -28,19 +28,23 @@ class TestSpellcheckerWorker(TaskTest): def test_spellchek_pt(self): text = u"Meu cachoro é um pastor" - self.document.update({'text': text, 'language': 'pt_BR'}) - spellchecker.SpellingChecker().delay(self.fake_id) - self.assertEqual(len(self.document['spelling_errors']), 1) - self.assertIn('cachoro', self.document['spelling_errors'][0]) - self.assertIn('cachorro', self.document['spelling_errors'][0][2]) - self.assertEqual(self.document['spelling_errors'][0][1], 4) + doc_id = self.collection.insert({'text': text, 'language': 'pt_BR'}) + spellchecker.SpellingChecker().delay(doc_id) + + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(len(refreshed_document['spelling_errors']), 1) + self.assertIn('cachoro', refreshed_document['spelling_errors'][0]) + self.assertIn('cachorro', refreshed_document['spelling_errors'][0][2]) + self.assertEqual(refreshed_document['spelling_errors'][0][1], 4) def test_spellchek_en(self): text = u"The cat bit the doggyo" - self.document.update({'text': text, 'language': 'en'}) - spellchecker.SpellingChecker().delay(self.fake_id) - self.assertEqual(len(self.document['spelling_errors']), 1) - self.assertIn('doggyo', self.document['spelling_errors'][0]) - self.assertIn('doggy', self.document['spelling_errors'][0][2]) - self.assertEqual(self.document['spelling_errors'][0][1], 16) + doc_id = self.collection.insert({'text': text, 'language': 'en'}) + spellchecker.SpellingChecker().delay(doc_id) + + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(len(refreshed_document['spelling_errors']), 1) + self.assertIn('doggyo', refreshed_document['spelling_errors'][0]) + 
self.assertIn('doggy', refreshed_document['spelling_errors'][0][2]) + self.assertEqual(refreshed_document['spelling_errors'][0][1], 16) From bc9a1744ffc6d8862f1e58f246b4844a09c4d224 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 1 Sep 2015 10:25:57 -0300 Subject: [PATCH 17/25] Adapts POS worker tests to the removal of MongoDict --- tests/test_worker_pos.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/tests/test_worker_pos.py b/tests/test_worker_pos.py index 6413c18..2d64716 100644 --- a/tests/test_worker_pos.py +++ b/tests/test_worker_pos.py @@ -27,15 +27,16 @@ def test_pos_should_return_a_list_of_tuples_with_token_classification_and_offset text = 'The sky is blue, the sun is yellow.' tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', 'yellow', '.'] - expected = [('The', 'DT', 0), ('sky', 'NN', 4), ('is', 'VBZ', 8), - ('blue', 'JJ', 11), (',', ',', 15), ('the', 'DT', 17), - ('sun', 'NN', 21), ('is', 'VBZ', 25), ('yellow', 'JJ', 28), - ('.', '.', 34)] - self.document.update({'text': text, 'tokens': tokens, + expected = [['The', 'DT', 0], ['sky', 'NN', 4], ['is', 'VBZ', 8], + ['blue', 'JJ', 11], [',', ',', 15], ['the', 'DT', 17], + ['sun', 'NN', 21], ['is', 'VBZ', 25], ['yellow', 'JJ', 28], + ['.', '.', 34]] + doc_id = self.collection.insert({'text': text, 'tokens': tokens, 'language': 'en'}) - POS().delay(self.fake_id) - self.assertEqual(self.document['pos'], expected) - self.assertEqual(self.document['tagset'], 'en-nltk') + POS().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['pos'], expected) + self.assertEqual(refreshed_document['tagset'], 'en-nltk') def test_pos_should_run_pt_palavras_if_text_is_in_portuguese(self): text = 'Isso é uma frase em português.' @@ -51,9 +52,10 @@ def test_pos_should_run_pt_palavras_if_text_is_in_portuguese(self): ''').strip() + '\n\n' # '.' is the only named entity here. 
- expected = [(u'.', u'.', 29)] - self.document.update({'text': text, 'tokens': tokens, + expected = [[u'.', u'.', 29]] + doc_id = self.collection.insert({'text': text, 'tokens': tokens, 'language': 'pt', 'palavras_raw': palavras_raw}) - POS().delay(self.fake_id) - self.assertEqual(self.document['pos'], expected) - self.assertEqual(self.document['tagset'], 'pt-palavras') + POS().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['pos'], expected) + self.assertEqual(refreshed_document['tagset'], 'pt-palavras') From ba8b40f0b6817ebc210c5443a273f1fcea33d119 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 1 Sep 2015 10:35:20 -0300 Subject: [PATCH 18/25] Adapts SemanticTagger worker tests to the removal of MongoDict --- tests/test_worker_palavras_semantic_tagger.py | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/test_worker_palavras_semantic_tagger.py b/tests/test_worker_palavras_semantic_tagger.py index aa75a2a..0be2e11 100644 --- a/tests/test_worker_palavras_semantic_tagger.py +++ b/tests/test_worker_palavras_semantic_tagger.py @@ -54,11 +54,12 @@ def test_basic_semantic_tags(self): 'Verbs_related_human_things': ['falo'] } - self.document.update({'palavras_raw': palavras_output, + doc_id = self.collection.insert({'palavras_raw': palavras_output, 'palavras_raw_ran': True}) - SemanticTagger().delay(self.fake_id) + SemanticTagger().delay(doc_id) - self.assertEqual(self.document['semantic_tags'], expected_tags) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['semantic_tags'], expected_tags) def test_ambiguous_tags(self): @@ -77,12 +78,13 @@ def test_ambiguous_tags(self): ''').strip() + '\n\n' expected_tags = { - 'Non_Tagged': ['Eu', 'bem', 'enquanto', 'ele', 'está', 'em', - 'o'], - 'Place and spatial': ['canto'], - 'Verbs_related_human_things': ['canto'] + 'Non_Tagged': [u'Eu', u'bem', u'enquanto', u'ele', u'está', + u'em', u'o'], + 'Place and spatial': [u'canto'], + 'Verbs_related_human_things': [u'canto'] } - self.document.update({'palavras_raw': palavras_output, + doc_id = self.collection.insert({'palavras_raw': palavras_output, 'palavras_raw_ran': True}) - SemanticTagger().delay(self.fake_id) - self.assertEqual(self.document['semantic_tags'], expected_tags) + SemanticTagger().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['semantic_tags'], expected_tags) From 53515b47c9f45fb2a7c1cd9a419f1740c4a0ebdf Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 1 Sep 2015 10:38:38 -0300 Subject: [PATCH 19/25] Adapts PalavrasRaw worker tests to the removal of MongoDict --- tests/test_worker_palavras_raw.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py index 41e9c61..6242bd1 100644 --- a/tests/test_worker_palavras_raw.py +++ b/tests/test_worker_palavras_raw.py @@ -30,23 +30,26 @@ class TestPalavrasRawWorker(TaskTest): def test_should_run_only_if_language_is_portuguese(self): if palavras_raw.palavras_installed(): - self.document.update({'text': 'There was a rock on the way.', + doc_id = self.collection.insert({'text': 'There was a rock on the way.', 'language': 'en'}) - palavras_raw.PalavrasRaw().delay(self.fake_id) - self.assertEqual(self.document['palavras_raw_ran'], False) + palavras_raw.PalavrasRaw().delay(doc_id) + refreshed_document = 
self.collection.find_one({'_id': doc_id})
+            self.assertEqual(refreshed_document['palavras_raw_ran'], False)
 
     def test_palavras_not_installed(self):
         palavras_raw.BASE_PARSER = '/not-found'
-        self.document.update({'text': 'Tinha uma pedra no meio do caminho.',
-                              'language': 'pt'})
-        palavras_raw.PalavrasRaw().delay(self.fake_id)
-        self.assertEqual(self.document['palavras_raw_ran'], False)
+        doc_id = self.collection.insert(
+            {'text': 'Tinha uma pedra no meio do caminho.',
+             'language': 'pt'})
+        palavras_raw.PalavrasRaw().delay(doc_id)
+        refreshed_document = self.collection.find_one({'_id': doc_id})
+        self.assertEqual(refreshed_document['palavras_raw_ran'], False)
 
     def test_palavras_should_return_raw_if_it_is_installed(self):
         palavras_raw.BASE_PARSER = ORIGINAL_PATH
-        self.document.update(
+        doc_id = self.collection.insert(
             {'text': 'Eu sei que neste momento falo para todo Brasil.',
              'language': 'pt'})
         expected_raw = dedent('''
@@ -63,6 +66,7 @@ def test_palavras_should_return_raw_if_it_is_installed(self):
             $. #11->0
             ''').strip() + '\n\n'
 
-        result = palavras_raw.PalavrasRaw().delay(self.fake_id)
-        self.assertEqual(self.document['palavras_raw'], expected_raw)
-        self.assertEqual(self.document['palavras_raw_ran'], True)
+        result = palavras_raw.PalavrasRaw().delay(doc_id)
+        refreshed_document = self.collection.find_one({'_id': doc_id})
+        self.assertEqual(refreshed_document['palavras_raw'], expected_raw)
+        self.assertEqual(refreshed_document['palavras_raw_ran'], True)

From 72a54a5bad92a1972fcd53d1c5c165e295306c55 Mon Sep 17 00:00:00 2001
From: Flavio Amieiro
Date: Mon, 14 Sep 2015 23:30:30 -0300
Subject: [PATCH 20/25] Adapts Trigrams worker to the removal of MongoDict

This commit was co-authored by Israel Teixeira

We needed to store the trigram results as strings (since that's the
only possible type for mongo keys). We decided to turn the tuples into
strings joined by spaces because spaces are never going to be part of a
token.

Also, mongo keys can contain neither `.` nor `$`, so we decided to
replace those with `\dot` and `\dollarsign` respectively.

Thanks a lot @israelst for the help!
---
 pypln/backend/workers/trigrams.py | 12 ++++++++++--
 tests/test_worker_trigrams.py     | 11 ++++++-----
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py
index d1972c2..4ad46ef 100644
--- a/pypln/backend/workers/trigrams.py
+++ b/pypln/backend/workers/trigrams.py
@@ -42,6 +42,14 @@ def process(self, document):
         tr = defaultdict(lambda: [])
         for m in metrics:
             for res in trigram_finder.score_ngrams(getattr(trigram_measures,m)):
-                tr[res[0]].append(res[1])
+                # We cannot store the trigram as a tuple (mongo keys need to be
+                # strings). We decided to join tokens using spaces since a
+                # space will never be in a token.
+                key = u' '.join(res[0])
+                # Mongo cannot have `.` or `$` in key names. Unfortunately
+                # this means we need to replace them with placeholders.
+ key = key.replace(u'$', u'\dollarsign') + key = key.replace(u'.', u'\dot') + tr[key].append(res[1]) - return {'trigram_rank': dict(tr), 'metrics':metrics} + return {'trigram_rank': tr, 'metrics':metrics} diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py index d99b676..e8539f3 100644 --- a/tests/test_worker_trigrams.py +++ b/tests/test_worker_trigrams.py @@ -30,9 +30,10 @@ def test_Trigrams_should_return_correct_score(self): tokens = [w for w in nltk.corpus.genesis.words('english-web.txt')] trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens) - expected = trigram_finder.score_ngram(trigram_measures.chi_sq, u'olive', u'leaf',u'plucked') - self.document['tokens'] = tokens - Trigrams().delay(self.fake_id) - trigram_rank = self.document['trigram_rank'] - result = trigram_rank[(u'olive', u'leaf',u'plucked')][0] + expected = trigram_finder.score_ngram(trigram_measures.chi_sq, u'olive', u'leaf', u'plucked') + doc_id = self.collection.insert({'tokens': tokens}, w=1) + Trigrams().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + trigram_rank = refreshed_document['trigram_rank'] + result = trigram_rank[u'olive leaf plucked'][0] self.assertEqual(result, expected) From 48d41a7ccd7a9a77762f41aa8a9bbaa66966eaf0 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Mon, 14 Sep 2015 23:42:53 -0300 Subject: [PATCH 21/25] Removes last references to MongoDict --- pypln/backend/workers/tokenizer.py | 1 - requirements/production.txt | 1 - tests/test_worker_bigrams.py | 4 ++-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pypln/backend/workers/tokenizer.py b/pypln/backend/workers/tokenizer.py index d6f30d0..fd5e37a 100644 --- a/pypln/backend/workers/tokenizer.py +++ b/pypln/backend/workers/tokenizer.py @@ -16,7 +16,6 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . -from mongodict import MongoDict from nltk import word_tokenize, sent_tokenize from pypln.backend.celery_task import PyPLNTask diff --git a/requirements/production.txt b/requirements/production.txt index 2c80c43..e19f0fe 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -1,7 +1,6 @@ # Common celery pymongo==2.8.1 -mongodict # The newest pyparsing (2.0) only supports python 3, # so we explicitly install 1.5.7 (the last version that diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index bf0225e..b9fcd4a 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -28,8 +28,8 @@ class TestBigramWorker(TaskTest): def test_bigrams_should_return_correct_score(self): # We need this list comprehension because we need to save the word list - # in MongoDict (thus, it needs to be pickleable). Also, a list is what - # will be available to the worker in real situations. + # in mongo (thus, it needs to be json serializable). Also, a list is + # what will be available to the worker in real situations. 
tokens = [w for w in nltk.corpus.genesis.words('english-web.txt')] From b12833e3eef14e43f5a0dd61e61eb231ef4ac4dd Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 16 Sep 2015 16:50:30 -0300 Subject: [PATCH 22/25] Adds `w=1` to all inserts in tests as suggested by @fccoelho --- tests/test_elastic_indexer.py | 4 ++-- tests/test_worker_bigrams.py | 2 +- tests/test_worker_extractor.py | 24 +++++++++---------- tests/test_worker_freqdist.py | 2 +- tests/test_worker_gridfs_data_retriever.py | 4 ++-- tests/test_worker_gridfs_file_deleter.py | 4 ++-- tests/test_worker_lemmatizer_pt.py | 2 +- tests/test_worker_palavras_noun_phrase.py | 2 +- tests/test_worker_palavras_raw.py | 6 ++--- tests/test_worker_palavras_semantic_tagger.py | 4 ++-- tests/test_worker_pos.py | 4 ++-- tests/test_worker_spellchecker.py | 5 ++-- tests/test_worker_statistics.py | 4 ++-- tests/test_worker_tokenizer.py | 2 +- tests/test_worker_wordcloud.py | 2 +- 15 files changed, 36 insertions(+), 35 deletions(-) diff --git a/tests/test_elastic_indexer.py b/tests/test_elastic_indexer.py index b76dacd..faaafab 100644 --- a/tests/test_elastic_indexer.py +++ b/tests/test_elastic_indexer.py @@ -32,7 +32,7 @@ def test_indexing_go_through(self): 'contents': 'raw_file_contents', } - doc_id = self.collection.insert(doc) + doc_id = self.collection.insert(doc, w=1) ElasticIndexer().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertTrue(refreshed_document['created']) @@ -55,7 +55,7 @@ def test_regression_indexing_should_not_include_contents(self, ES): 'contents': 'raw_file_contents', } - doc_id = self.collection.insert(doc) + doc_id = self.collection.insert(doc, w=1) ElasticIndexer().delay(doc_id) # remove properties that won't be indexed index_name = doc.pop("index_name") diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index b9fcd4a..3ee2b74 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -33,7 +33,7 @@ def test_bigrams_should_return_correct_score(self): tokens = [w for w in nltk.corpus.genesis.words('english-web.txt')] - doc_id = self.collection.insert({'tokens': tokens}) + doc_id = self.collection.insert({'tokens': tokens}, w=1) bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens) expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u',', u'which') diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index 3b03736..d7819a5 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -30,7 +30,7 @@ def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'test.txt') doc_id = self.collection.insert({'filename': filename, - 'contents': base64.b64encode(open(filename).read())}) + 'contents': base64.b64encode(open(filename).read())}, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['text'], expected) @@ -47,7 +47,7 @@ def test_extraction_from_html_file(self): # pickled representation of the data. 
data = {'filename': filename, 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data) + doc_id = self.collection.insert(data, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['text'], expected) @@ -59,7 +59,7 @@ def test_extraction_from_pdf_file(self): filename = os.path.join(DATA_DIR, 'test.pdf') data = {'filename': filename, 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data) + doc_id = self.collection.insert(data, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['text'], expected) @@ -114,7 +114,7 @@ def test_extraction_from_html(self): ''') data = {'filename': 'test.html', 'contents': base64.b64encode(contents)} - doc_id = self.collection.insert(data) + doc_id = self.collection.insert(data, w=1) Extractor().delay(doc_id) expected = dedent(''' Testing @@ -141,7 +141,7 @@ def test_language_detection_pt(self): text_pt = 'Esse texto foi escrito por Álvaro em Português.' data_pt = {'filename': 'text-pt.txt', 'contents': base64.b64encode(text_pt)} - doc_id = self.collection.insert(data_pt) + doc_id = self.collection.insert(data_pt, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['language'], 'pt') @@ -150,7 +150,7 @@ def test_language_detection_es(self): text_es = 'Este texto ha sido escrito en Español por Álvaro.' data_es = {'filename': 'text-es.txt', 'contents': base64.b64encode(text_es)} - doc_id = self.collection.insert(data_es) + doc_id = self.collection.insert(data_es, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['language'], 'es') @@ -159,7 +159,7 @@ def test_language_detection_en(self): text_en = 'This text was written by Álvaro in English.' 
data_en = {'filename': 'text-en.txt', 'contents': base64.b64encode(text_en)} - doc_id = self.collection.insert(data_en) + doc_id = self.collection.insert(data_en, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['language'], 'en') @@ -170,7 +170,7 @@ def test_unescape_html_entities(self): filename = os.path.join(DATA_DIR, 'test_html_entities.txt') data = {'filename': filename, 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data) + doc_id = self.collection.insert(data, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['text'], expected) @@ -180,7 +180,7 @@ def test_should_detect_encoding_and_return_a_unicode_object(self): filename = os.path.join(DATA_DIR, 'test_iso-8859-1.txt') data = {'filename': filename, 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data) + doc_id = self.collection.insert(data, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['text'], expected) @@ -191,7 +191,7 @@ def test_should_guess_mimetype_for_file_without_extension(self): filename = os.path.join(DATA_DIR, 'text_file') data = {'filename': filename, 'contents': base64.b64encode(contents)} - doc_id = self.collection.insert(data) + doc_id = self.collection.insert(data, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['mimetype'], 'text/plain') @@ -203,7 +203,7 @@ def test_unknown_mimetype_should_be_flagged(self): contents = open(filename).read() data = {'filename': filename, 'contents': base64.b64encode(contents)} - doc_id = self.collection.insert(data) + doc_id = self.collection.insert(data, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['mimetype'], 'unknown') @@ -216,7 +216,7 @@ def test_unknown_encoding_should_be_ignored(self): expected = u"This file has a weird byte (\x96) that makes it impossible for libmagic to recognize it's encoding." 
data = {'filename': filename, 'contents': base64.b64encode(open(filename).read())} - doc_id = self.collection.insert(data) + doc_id = self.collection.insert(data, w=1) Extractor().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['text'], expected) diff --git a/tests/test_worker_freqdist.py b/tests/test_worker_freqdist.py index 8a97193..bde9c98 100644 --- a/tests/test_worker_freqdist.py +++ b/tests/test_worker_freqdist.py @@ -30,7 +30,7 @@ def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(sel # This is just preparing the expected input in the database - doc_id = self.collection.insert({'tokens': tokens}) + doc_id = self.collection.insert({'tokens': tokens}, w=1) FreqDist().delay(doc_id) diff --git a/tests/test_worker_gridfs_data_retriever.py b/tests/test_worker_gridfs_data_retriever.py index da54f2f..1012627 100644 --- a/tests/test_worker_gridfs_data_retriever.py +++ b/tests/test_worker_gridfs_data_retriever.py @@ -33,7 +33,7 @@ def test_extract_file_data_from_GridFS(self): expected_file_data = gridfs.get(new_file_id) data = {'file_id': str(new_file_id)} - doc_id = self.collection.insert(data) + doc_id = self.collection.insert(data, w=1) GridFSDataRetriever().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) @@ -51,7 +51,7 @@ def test_extract_file_data_from_GridFS(self): def test_task_raises_exception_when_file_does_not_exist(self): data = {'file_id': "Inexistent document"} - doc_id = self.collection.insert(data) + doc_id = self.collection.insert(data, w=1) result = GridFSDataRetriever().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) diff --git a/tests/test_worker_gridfs_file_deleter.py b/tests/test_worker_gridfs_file_deleter.py index cec90b3..149d9d0 100644 --- a/tests/test_worker_gridfs_file_deleter.py +++ b/tests/test_worker_gridfs_file_deleter.py @@ -31,7 +31,7 @@ def test_delete_file_from_GridFS(self): new_file_id = gridfs.put(content) expected_file_data = gridfs.get(new_file_id) - doc_id = self.collection.insert({'file_id': new_file_id}) + doc_id = self.collection.insert({'file_id': new_file_id}, w=1) self.assertTrue(gridfs.exists(new_file_id)) GridFSFileDeleter().delay(doc_id) @@ -39,7 +39,7 @@ def test_delete_file_from_GridFS(self): self.assertFalse(gridfs.exists(new_file_id)) def test_task_raises_exception_when_file_does_not_exist(self): - doc_id = self.collection.insert({'file_id': "Inexistent document"}) + doc_id = self.collection.insert({'file_id': "Inexistent document"}, w=1) result = GridFSFileDeleter().delay(doc_id) self.assertTrue(result.failed()) self.assertEqual(result.status, "FAILURE") diff --git a/tests/test_worker_lemmatizer_pt.py b/tests/test_worker_lemmatizer_pt.py index fb96cd5..3887d81 100644 --- a/tests/test_worker_lemmatizer_pt.py +++ b/tests/test_worker_lemmatizer_pt.py @@ -43,7 +43,7 @@ def test_lemmatizer_should_return_a_list_with_lemmas(self): ''').strip() + '\n\n' doc = {'palavras_raw': palavras_output, 'palavras_raw_ran': True} - doc_id = self.collection.insert(doc) + doc_id = self.collection.insert(doc, w=1) result = Lemmatizer().delay(doc_id) expected = 'eu saber que em este momento falar para todo Brasil .'.split() refreshed_document = self.collection.find_one({'_id': doc_id}) diff --git a/tests/test_worker_palavras_noun_phrase.py b/tests/test_worker_palavras_noun_phrase.py index d9e601e..68adcaa 100644 --- a/tests/test_worker_palavras_noun_phrase.py +++ b/tests/test_worker_palavras_noun_phrase.py @@ -45,7 
+45,7 @@ def test_noun_phrase_worker_should_return_a_list_with_phrases(self): ''').strip() + '\n\n' doc_id = self.collection.insert({'palavras_raw': palavras_output, - 'palavras_raw_ran': True}) + 'palavras_raw_ran': True}, w=1) NounPhrase().delay(doc_id) expected = ['_este *momento', 'todo o *povo de_ _o Brasil .', '_o *Brasil .'] diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py index 6242bd1..90a3845 100644 --- a/tests/test_worker_palavras_raw.py +++ b/tests/test_worker_palavras_raw.py @@ -31,7 +31,7 @@ class TestPalavrasRawWorker(TaskTest): def test_should_run_only_if_language_is_portuguese(self): if palavras_raw.palavras_installed(): doc_id = self.collection.insert({'text': 'There was a rock on the way.', - 'language': 'en'}) + 'language': 'en'}, w=1) palavras_raw.PalavrasRaw().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) @@ -41,7 +41,7 @@ def test_palavras_not_installed(self): palavras_raw.BASE_PARSER = '/not-found' doc_id = self.collection.insert( {'text': 'Tinha uma pedra no meio do caminho.', - 'language': 'pt'}) + 'language': 'pt'}, w=1) palavras_raw.PalavrasRaw().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['palavras_raw_ran'], False) @@ -51,7 +51,7 @@ def test_palavras_should_return_raw_if_it_is_installed(self): palavras_raw.BASE_PARSER = ORIGINAL_PATH doc_id = self.collection.insert( {'text': 'Eu sei que neste momento falo para todo Brasil.', - 'language': 'pt'}) + 'language': 'pt'}, w=1) expected_raw = dedent(''' Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2 sei [saber] V PR 1S IND VFIN @FS-STA #2->0 diff --git a/tests/test_worker_palavras_semantic_tagger.py b/tests/test_worker_palavras_semantic_tagger.py index 0be2e11..1b3abbe 100644 --- a/tests/test_worker_palavras_semantic_tagger.py +++ b/tests/test_worker_palavras_semantic_tagger.py @@ -55,7 +55,7 @@ def test_basic_semantic_tags(self): } doc_id = self.collection.insert({'palavras_raw': palavras_output, - 'palavras_raw_ran': True}) + 'palavras_raw_ran': True}, w=1) SemanticTagger().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) @@ -84,7 +84,7 @@ def test_ambiguous_tags(self): 'Verbs_related_human_things': [u'canto'] } doc_id = self.collection.insert({'palavras_raw': palavras_output, - 'palavras_raw_ran': True}) + 'palavras_raw_ran': True}, w=1) SemanticTagger().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['semantic_tags'], expected_tags) diff --git a/tests/test_worker_pos.py b/tests/test_worker_pos.py index 2d64716..68a3bc2 100644 --- a/tests/test_worker_pos.py +++ b/tests/test_worker_pos.py @@ -32,7 +32,7 @@ def test_pos_should_return_a_list_of_tuples_with_token_classification_and_offset ['sun', 'NN', 21], ['is', 'VBZ', 25], ['yellow', 'JJ', 28], ['.', '.', 34]] doc_id = self.collection.insert({'text': text, 'tokens': tokens, - 'language': 'en'}) + 'language': 'en'}, w=1) POS().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['pos'], expected) @@ -54,7 +54,7 @@ def test_pos_should_run_pt_palavras_if_text_is_in_portuguese(self): # '.' is the only named entity here. 
expected = [[u'.', u'.', 29]] doc_id = self.collection.insert({'text': text, 'tokens': tokens, - 'language': 'pt', 'palavras_raw': palavras_raw}) + 'language': 'pt', 'palavras_raw': palavras_raw}, w=1) POS().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) self.assertEqual(refreshed_document['pos'], expected) diff --git a/tests/test_worker_spellchecker.py b/tests/test_worker_spellchecker.py index 646b89b..b81bb93 100644 --- a/tests/test_worker_spellchecker.py +++ b/tests/test_worker_spellchecker.py @@ -28,7 +28,8 @@ class TestSpellcheckerWorker(TaskTest): def test_spellchek_pt(self): text = u"Meu cachoro é um pastor" - doc_id = self.collection.insert({'text': text, 'language': 'pt_BR'}) + doc_id = self.collection.insert({'text': text, 'language': 'pt_BR'}, + w=1) spellchecker.SpellingChecker().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) @@ -39,7 +40,7 @@ def test_spellchek_pt(self): def test_spellchek_en(self): text = u"The cat bit the doggyo" - doc_id = self.collection.insert({'text': text, 'language': 'en'}) + doc_id = self.collection.insert({'text': text, 'language': 'en'}, w=1) spellchecker.SpellingChecker().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) diff --git a/tests/test_worker_statistics.py b/tests/test_worker_statistics.py index d3f3185..3370e8d 100644 --- a/tests/test_worker_statistics.py +++ b/tests/test_worker_statistics.py @@ -26,7 +26,7 @@ def test_simple(self): doc = {'sentences': [['this', 'is', 'a', 'test', '.'], ['this', 'is', 'another', '!']], 'freqdist': [('this', 2), ('is', 2), ('a', 1), ('test', 1), ('.', 1), ('another', 1), ('!', 1)]} - doc_id = self.collection.insert(doc) + doc_id = self.collection.insert(doc, w=1) Statistics().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) @@ -40,7 +40,7 @@ def test_simple(self): self.assertAlmostEqual(refreshed_document['repertoire'], 0.7777, places=3) def test_zero_division_error(self): - doc_id = self.collection.insert({'freqdist': [], 'sentences': []}) + doc_id = self.collection.insert({'freqdist': [], 'sentences': []}, w=1) Statistics().delay(doc_id) diff --git a/tests/test_worker_tokenizer.py b/tests/test_worker_tokenizer.py index eb99a93..9d59cac 100644 --- a/tests/test_worker_tokenizer.py +++ b/tests/test_worker_tokenizer.py @@ -30,7 +30,7 @@ def test_tokenizer_should_receive_text_and_return_tokens(self): expected_sentences = [['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', 'yellow', '.'], ['This', 'is', 'another', 'sentence', '.']] - doc_id = self.collection.insert(doc) + doc_id = self.collection.insert(doc, w=1) Tokenizer().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) diff --git a/tests/test_worker_wordcloud.py b/tests/test_worker_wordcloud.py index dd0f6e0..23ed090 100644 --- a/tests/test_worker_wordcloud.py +++ b/tests/test_worker_wordcloud.py @@ -31,7 +31,7 @@ class TestFreqDistWorker(TaskTest): def test_wordcloud_should_return_a_base64_encoded_png(self): doc = {'freqdist': [('is', 2), ('the', 2), ('blue', 1), ('sun', 1), ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)], 'language': 'en'} - doc_id = self.collection.insert(doc) + doc_id = self.collection.insert(doc, w=1) WordCloud().delay(doc_id) refreshed_document = self.collection.find_one({'_id': doc_id}) From 187114889b20ab4bc5fed504bd424cd32f6029a0 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 29 Sep 2015 22:13:05 -0300 Subject: [PATCH 23/25] Adds test for trigrams with dots and dollar signs --- 
 tests/test_worker_trigrams.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py
index e8539f3..6c5ad38 100644
--- a/tests/test_worker_trigrams.py
+++ b/tests/test_worker_trigrams.py
@@ -37,3 +37,15 @@ def test_Trigrams_should_return_correct_score(self):
         trigram_rank = refreshed_document['trigram_rank']
         result = trigram_rank[u'olive leaf plucked'][0]
         self.assertEqual(result, expected)
+
+    def test_Trigrams_may_contain_dots_and_dollar_signs(self):
+        tokens = ['$', 'test', '.']
+        trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
+        expected = trigram_finder.score_ngram(trigram_measures.chi_sq, u'$',
+                                              u'test', u'.')
+        doc_id = self.collection.insert({'tokens': tokens}, w=1)
+        Trigrams().delay(doc_id)
+        refreshed_document = self.collection.find_one({'_id': doc_id})
+        trigram_rank = refreshed_document['trigram_rank']
+        result = trigram_rank[u'\dollarsign test \dot'][0]
+        self.assertEqual(result, expected)

From 3bd3896b32c6cef3e607920eff8a5861f3b9d3a9 Mon Sep 17 00:00:00 2001
From: Flavio Amieiro
Date: Tue, 29 Sep 2015 22:47:58 -0300
Subject: [PATCH 24/25] Adds test for bigrams with dots and dollar signs

---
 tests/test_worker_bigrams.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py
index 3ee2b74..6676b4e 100644
--- a/tests/test_worker_bigrams.py
+++ b/tests/test_worker_bigrams.py
@@ -42,3 +42,15 @@ def test_bigrams_should_return_correct_score(self):
         bigram_rank = refreshed_document['bigram_rank']
         result = bigram_rank[0][1][0]
         self.assertEqual(result, expected)
+
+    def test_bigrams_could_contain_dollar_signs_and_dots(self):
+        tokens = ['$', '.']
+        doc_id = self.collection.insert({'tokens': tokens}, w=1)
+        bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
+        expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u'$', u'.')
+
+        Bigrams().delay(doc_id)
+        refreshed_document = self.collection.find_one({'_id': doc_id})
+        bigram_rank = refreshed_document['bigram_rank']
+        result = bigram_rank[0][1][0]
+        self.assertEqual(result, expected)

From 55692278aefc6d4709928c9f905bb49b89b1cd52 Mon Sep 17 00:00:00 2001
From: Flavio Amieiro
Date: Wed, 30 Sep 2015 11:59:46 -0300
Subject: [PATCH 25/25] Start checking the values in bigram and trigram tests

As @fccoelho pointed out, we were just running the same code as the
worker. If there was an error in the way we call nltk (and we got
'None' for example), we would still have a valid assertion. We now get
a known value and test against that (a short sketch of this pattern
appears after this patch).
---
 tests/test_worker_bigrams.py  | 14 ++++++++------
 tests/test_worker_trigrams.py | 15 ++++++++-------
 2 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py
index 6676b4e..de605e2 100644
--- a/tests/test_worker_bigrams.py
+++ b/tests/test_worker_bigrams.py
@@ -34,23 +34,25 @@ def test_bigrams_should_return_correct_score(self):
                   nltk.corpus.genesis.words('english-web.txt')]
         doc_id = self.collection.insert({'tokens': tokens}, w=1)
-        bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
-        expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u',', u'which')
 
         Bigrams().delay(doc_id)
         refreshed_document = self.collection.find_one({'_id': doc_id})
         bigram_rank = refreshed_document['bigram_rank']
         result = bigram_rank[0][1][0]
-        self.assertEqual(result, expected)
+        # This is the value of the chi_sq measure for this bigram in this
+        # collocation
+        expected_chi_sq = 95.59393417173634
+        self.assertEqual(result, expected_chi_sq)
 
     def test_bigrams_could_contain_dollar_signs_and_dots(self):
         tokens = ['$', '.']
         doc_id = self.collection.insert({'tokens': tokens}, w=1)
-        bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
-        expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u'$', u'.')
 
         Bigrams().delay(doc_id)
         refreshed_document = self.collection.find_one({'_id': doc_id})
         bigram_rank = refreshed_document['bigram_rank']
         result = bigram_rank[0][1][0]
-        self.assertEqual(result, expected)
+        # 2.0 is the value of the chi_sq measure for this bigram in this
+        # collocation
+        expected_chi_sq = 2.0
+        self.assertEqual(result, expected_chi_sq)

diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py
index 6c5ad38..93575e9 100644
--- a/tests/test_worker_trigrams.py
+++ b/tests/test_worker_trigrams.py
@@ -29,23 +29,24 @@ class TestTrigramWorker(TaskTest):
     def test_Trigrams_should_return_correct_score(self):
         tokens = [w for w in nltk.corpus.genesis.words('english-web.txt')]
-        trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
-        expected = trigram_finder.score_ngram(trigram_measures.chi_sq, u'olive', u'leaf', u'plucked')
         doc_id = self.collection.insert({'tokens': tokens}, w=1)
         Trigrams().delay(doc_id)
         refreshed_document = self.collection.find_one({'_id': doc_id})
         trigram_rank = refreshed_document['trigram_rank']
         result = trigram_rank[u'olive leaf plucked'][0]
-        self.assertEqual(result, expected)
+        # This is the value of the chi_sq measure for this trigram in this
+        # collocation
+        expected_chi_sq = 1940754916.9623578
+        self.assertEqual(result, expected_chi_sq)
 
     def test_Trigrams_may_contain_dots_and_dollar_signs(self):
         tokens = ['$', 'test', '.']
-        trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
-        expected = trigram_finder.score_ngram(trigram_measures.chi_sq, u'$',
-                                              u'test', u'.')
         doc_id = self.collection.insert({'tokens': tokens}, w=1)
         Trigrams().delay(doc_id)
         refreshed_document = self.collection.find_one({'_id': doc_id})
         trigram_rank = refreshed_document['trigram_rank']
         result = trigram_rank[u'\dollarsign test \dot'][0]
-        self.assertEqual(result, expected)
+        # This is the value of the chi_sq measure for this trigram in this
+        # collocation
+        expected_chi_sq = 10.5
+        self.assertEqual(result, expected_chi_sq)
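
A minimal sketch of the key-encoding scheme described in PATCH 20. The
helper names `encode_ngram_key` and `decode_ngram_key` are illustrative
and not part of the patched worker; the separator and placeholders are
the ones the commit message describes (`u'\\dollarsign'` is the same
string value as the patch's `u'\dollarsign'`, just written without an
ambiguous backslash):

def encode_ngram_key(ngram):
    # Spaces never occur inside a token, so u' ' is a safe separator.
    key = u' '.join(ngram)
    # Mongo key names can contain neither `$` nor `.`, so both are
    # replaced with placeholders before the key is stored.
    return key.replace(u'$', u'\\dollarsign').replace(u'.', u'\\dot')

def decode_ngram_key(key):
    # Recover the token tuple; this assumes no token itself contains
    # one of the placeholder strings.
    key = key.replace(u'\\dollarsign', u'$').replace(u'\\dot', u'.')
    return tuple(key.split(u' '))

assert encode_ngram_key((u'$', u'test', u'.')) == u'\\dollarsign test \\dot'
assert decode_ngram_key(u'olive leaf plucked') == (u'olive', u'leaf', u'plucked')

The workers themselves only encode; anything that reads `trigram_rank`
back out of mongo and wants the original tuples would need the inverse
mapping sketched above.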
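
The `w=1` added throughout PATCH 22 is a write-concern argument:
presumably the point of the suggestion is that the insert only returns
after the server acknowledges the write, and a failed write raises
instead of being dropped silently, which is what a test wants before it
fires the worker and reads the document back. A minimal sketch,
assuming pymongo 2.x (the version pinned in requirements/production.txt),
a locally running mongod, and placeholder database/collection names:

import pymongo

client = pymongo.MongoClient('localhost', 27017)
collection = client['pypln_test']['documents']

# Acknowledged insert: blocks until the server confirms the write,
# raising (e.g. pymongo.errors.OperationFailure) on failure.
doc_id = collection.insert({'tokens': ['the', 'sky', 'is', 'blue']}, w=1)

# The document is therefore guaranteed to be visible to whatever
# worker or assertion runs next.
assert collection.find_one({'_id': doc_id}) is not None

Under pymongo 3.x the `insert` method is gone; the equivalent call
would be `insert_one`, whose writes are acknowledged by default.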
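
PATCH 25's change, reduced to its core: the old tests computed
`expected` with the same nltk calls the worker makes, so a bug in those
calls (a None score, say) would still satisfy the assertion, while the
fixed tests pin a value known in advance. A sketch of the difference on
the two-token bigram case from PATCH 24 (the 2.0 comes from the patch
itself):

import nltk

tokens = ['$', '.']
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)

# Anti-pattern: "expected" is produced by the very calls the worker
# under test makes, so an error in them is invisible to the test.
mirrored = finder.score_ngram(bigram_measures.chi_sq, u'$', u'.')

# Fixed pattern: assert against an independently known value.
expected_chi_sq = 2.0
assert mirrored == expected_chi_sq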