diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py index ed1c957..bced3a2 100644 --- a/pypln/backend/celery_task.py +++ b/pypln/backend/celery_task.py @@ -16,11 +16,9 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . - +import pymongo from celery import Task -from pypln.backend.mongodict_adapter import MongoDictAdapter - # This import may look like an unused import, but it is not. # When our base task class is defined, the Celery app must have already been # instantiated, otherwise when this code is imported elsewhere (like in a @@ -33,6 +31,11 @@ from pypln.backend import config +mongo_client = pymongo.MongoClient(host=config.MONGODB_CONFIG["host"], + port=config.MONGODB_CONFIG["port"]) +database = mongo_client[config.MONGODB_CONFIG["database"]] +document_collection = database[config.MONGODB_CONFIG["collection"]] + class PyPLNTask(Task): """ A base class for PyPLN tasks. It is in charge of getting the document @@ -48,16 +51,9 @@ def run(self, document_id): It will call the `process` method with a dictionary containing all the document information and will update the database with results. """ - document = MongoDictAdapter(doc_id=document_id, - host=config.MONGODB_CONFIG['host'], - port=config.MONGODB_CONFIG['port'], - database=config.MONGODB_CONFIG['database']) - # Create a dictionary out of our document. We could simply pass - # it on to the process method, but for now we won't let the user - # manipulate the MongoDict directly. - dic = {k: v for k, v in document.iteritems()} - result = self.process(dic) - document.update(result) + document = document_collection.find_one({"_id": document_id}) + result = self.process(document) + document_collection.update({"_id": document_id}, {"$set": result}) return document_id def process(self, document): diff --git a/pypln/backend/config.py b/pypln/backend/config.py index f06fb1e..f074c3a 100644 --- a/pypln/backend/config.py +++ b/pypln/backend/config.py @@ -5,6 +5,7 @@ def get_store_config(): defaults = {'host': 'localhost', 'port': '27017', 'database': 'pypln_dev', + 'collection': 'documents', 'gridfs_collection': 'files', } config = ConfigParser.ConfigParser(defaults=defaults) diff --git a/pypln/backend/mongodict_adapter.py b/pypln/backend/mongodict_adapter.py deleted file mode 100644 index b57d322..0000000 --- a/pypln/backend/mongodict_adapter.py +++ /dev/null @@ -1,63 +0,0 @@ -# coding: utf-8 -# -# Copyright 2012 NAMD-EMAP-FGV -# -# This file is part of PyPLN. You can get more information at: http://pypln.org/. -# -# PyPLN is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# PyPLN is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with PyPLN. If not, see . 
- -from mongodict import MongoDict - - -class MongoDictAdapter(MongoDict): - #TODO: implement clear, __iter__, __len__ and contains with filters by id - def __init__(self, doc_id, *args, **kwargs): - self.doc_id = doc_id - self.prefix = 'id:{}:'.format(self.doc_id) - self.prefixed_id_query = {'$regex': - '^{}'.format(self.prefix)} - return super(MongoDictAdapter, self).__init__(*args, **kwargs) - - def __setitem__(self, key, value): - key = 'id:{}:{}'.format(self.doc_id, key) - return super(MongoDictAdapter, self).__setitem__(key, value) - - def __getitem__(self, key): - key = 'id:{}:{}'.format(self.doc_id, key) - return super(MongoDictAdapter, self).__getitem__(key) - - def __delitem__(self, key): - key = 'id:{}:{}'.format(self.doc_id, key) - return super(MongoDictAdapter, self).__delitem__(key) - - def __contains__(self, key): - # If this is being called by other methods (like __delitem__) - # it will already have the prefix - if not key.startswith('id:'): - key = 'id:{}:{}'.format(self.doc_id, key) - return super(MongoDictAdapter, self).__contains__(key) - - has_key = __contains__ - - def __iter__(self): - query_result = self._collection.find({'_id': - self.prefixed_id_query}, {'_id': 1}) - keys = (k['_id'].replace(self.prefix, '', 1) for k in query_result) - return keys - - def __len__(self): - return self._collection.find({'_id': self.prefixed_id_query}).count() - - def clear(self): - self._collection.remove({'_id': self.prefixed_id_query}) diff --git a/pypln/backend/workers/elastic_indexer.py b/pypln/backend/workers/elastic_indexer.py index faf8119..f5b55c3 100644 --- a/pypln/backend/workers/elastic_indexer.py +++ b/pypln/backend/workers/elastic_indexer.py @@ -36,6 +36,12 @@ def process(self, document): # See `test_regression_indexing_should_not_include_contents` in # tests/test_elastic_indexer.py for details. document.pop('contents') + # We also need to exclude _id, because ObjectId's won't be + # serializable. + document.pop("_id") + result = ES.index(index=index_name, doc_type=doc_type, body=document, id=file_id) + index_id = result.pop("_id") + result["index_id"] = index_id return result diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py index 09e1b32..110730b 100644 --- a/pypln/backend/workers/extractor.py +++ b/pypln/backend/workers/extractor.py @@ -17,6 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import base64 import shlex from HTMLParser import HTMLParser @@ -169,15 +170,16 @@ class Extractor(PyPLNTask): #TODO: should 'replace_with' be '' when extracting from HTML? 
def process(self, file_data): + contents = base64.b64decode(file_data['contents']) with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m: - file_mime_type = m.id_buffer(file_data['contents']) + file_mime_type = m.id_buffer(contents) metadata = {} if file_mime_type == 'text/plain': - text = file_data['contents'] + text = contents elif file_mime_type == 'text/html': - text = parse_html(file_data['contents'], True, ['script', 'style']) + text = parse_html(contents, True, ['script', 'style']) elif file_mime_type == 'application/pdf': - text, metadata = extract_pdf(file_data['contents']) + text, metadata = extract_pdf(contents) else: # If we can't detect the mimetype we add a flag that can be read by # the frontend to provide more information on why the document diff --git a/pypln/backend/workers/gridfs_data_retriever.py b/pypln/backend/workers/gridfs_data_retriever.py index 268cb21..68ed916 100644 --- a/pypln/backend/workers/gridfs_data_retriever.py +++ b/pypln/backend/workers/gridfs_data_retriever.py @@ -16,6 +16,7 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import base64 from bson import ObjectId from gridfs import GridFS import pymongo @@ -31,9 +32,17 @@ def process(self, document): gridfs = GridFS(database, config.MONGODB_CONFIG['gridfs_collection']) file_data = gridfs.get(ObjectId(document['file_id'])) + + # We decided to store 'contents' as a base64 encoded string in the + # database to avoid possible corruption of files. For example: when + # it's a pdf, the process of storing the data as utf-8 in mongo might + # be corrupting the file. This wasn't a problem before, because + # MongoDict pickled everything before storing. + contents = base64.b64encode(file_data.read()) + result = {'length': file_data.length, 'md5': file_data.md5, 'filename': file_data.filename, 'upload_date': file_data.upload_date, - 'contents': file_data.read()} + 'contents': contents} return result diff --git a/pypln/backend/workers/gridfs_file_deleter.py b/pypln/backend/workers/gridfs_file_deleter.py index c1dc15f..36ea082 100644 --- a/pypln/backend/workers/gridfs_file_deleter.py +++ b/pypln/backend/workers/gridfs_file_deleter.py @@ -25,9 +25,9 @@ class GridFSFileDeleter(PyPLNTask): def process(self, document): - database = pymongo.MongoClient(host=config.MONGODB_CONFIG['host'], - port=config.MONGODB_CONFIG['port'] - )[config.MONGODB_CONFIG['database']] + mongo_client = pymongo.MongoClient(host=config.MONGODB_CONFIG["host"], + port=config.MONGODB_CONFIG["port"]) + database = mongo_client[config.MONGODB_CONFIG["database"]] gridfs = GridFS(database, config.MONGODB_CONFIG['gridfs_collection']) gridfs.delete(ObjectId(document['file_id'])) diff --git a/pypln/backend/workers/tokenizer.py b/pypln/backend/workers/tokenizer.py index d6f30d0..fd5e37a 100644 --- a/pypln/backend/workers/tokenizer.py +++ b/pypln/backend/workers/tokenizer.py @@ -16,7 +16,6 @@ # # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . 
-from mongodict import MongoDict from nltk import word_tokenize, sent_tokenize from pypln.backend.celery_task import PyPLNTask diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py index d1972c2..4ad46ef 100644 --- a/pypln/backend/workers/trigrams.py +++ b/pypln/backend/workers/trigrams.py @@ -42,6 +42,14 @@ def process(self, document): tr = defaultdict(lambda: []) for m in metrics: for res in trigram_finder.score_ngrams(getattr(trigram_measures,m)): - tr[res[0]].append(res[1]) + # We cannot store the trigram as a tuple (mongo keys need to be + # strings). We decided to join tokens using spaces since a + # space will never be in a token. + key = u' '.join(res[0]) + # Mongo cannot have `.` or `$` in key names. Unfortunately + # this means we need to replace them with placeholders. + key = key.replace(u'$', u'\dollarsign') + key = key.replace(u'.', u'\dot') + tr[key].append(res[1]) - return {'trigram_rank': dict(tr), 'metrics':metrics} + return {'trigram_rank': tr, 'metrics':metrics} diff --git a/requirements/production.txt b/requirements/production.txt index 2c80c43..e19f0fe 100644 --- a/requirements/production.txt +++ b/requirements/production.txt @@ -1,7 +1,6 @@ # Common celery pymongo==2.8.1 -mongodict # The newest pyparsing (2.0) only supports python 3, # so we explicitly install 1.5.7 (the last version that diff --git a/tests/test_celery_task.py b/tests/test_celery_task.py new file mode 100644 index 0000000..fd1adde --- /dev/null +++ b/tests/test_celery_task.py @@ -0,0 +1,39 @@ +# coding: utf-8 +# +# Copyright 2015 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see . +from pypln.backend.celery_task import PyPLNTask +from utils import TaskTest + +class FakeTask(PyPLNTask): + def process(self, document): + return {'result': document['input']} + +class TestCeleryTask(TaskTest): + def test_task_should_get_the_correct_document(self): + """This is a regression test. PyPLNTask was not filtering by _id. It + was getting the first document it found. 
""" + + # This is just preparing the expected input in the database + wrong_doc_id = self.collection.insert({'input': 'wrong'}, w=1) + correct_doc_id = self.collection.insert({'input': 'correct'}, w=1) + + FakeTask().delay(correct_doc_id) + + refreshed_doc = self.collection.find_one({'_id': correct_doc_id}) + + self.assertEqual(refreshed_doc['result'], 'correct') diff --git a/tests/test_elastic_indexer.py b/tests/test_elastic_indexer.py index a35dd23..faaafab 100644 --- a/tests/test_elastic_indexer.py +++ b/tests/test_elastic_indexer.py @@ -32,9 +32,10 @@ def test_indexing_go_through(self): 'contents': 'raw_file_contents', } - self.document.update(doc) - ElasticIndexer().delay(self.fake_id) - assert self.document['created'] # must be True + doc_id = self.collection.insert(doc, w=1) + ElasticIndexer().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertTrue(refreshed_document['created']) @patch('pypln.backend.workers.elastic_indexer.ES') def test_regression_indexing_should_not_include_contents(self, ES): @@ -54,11 +55,12 @@ def test_regression_indexing_should_not_include_contents(self, ES): 'contents': 'raw_file_contents', } - self.document.update(doc) - ElasticIndexer().delay(self.fake_id) + doc_id = self.collection.insert(doc, w=1) + ElasticIndexer().delay(doc_id) # remove properties that won't be indexed index_name = doc.pop("index_name") doc_type = doc.pop('doc_type') doc.pop('contents') + doc.pop('_id') ES.index.assert_called_with(body=doc, id=doc['file_id'], doc_type=doc_type, index=index_name) diff --git a/tests/test_mongodict_adapter.py b/tests/test_mongodict_adapter.py deleted file mode 100644 index af1880d..0000000 --- a/tests/test_mongodict_adapter.py +++ /dev/null @@ -1,120 +0,0 @@ -# coding: utf-8 -# -# Copyright 2012 NAMD-EMAP-FGV -# -# This file is part of PyPLN. You can get more information at: http://pypln.org/. -# -# PyPLN is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# PyPLN is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with PyPLN. If not, see . 
- -import pickle -import unittest - -from bson import Binary -import pymongo - -from pypln.backend.mongodict_adapter import MongoDictAdapter - - - -class TestMongoDictAdapter(unittest.TestCase): - db_name = 'test_mongodictbyid' - - def setUp(self): - self.fake_id = '1234' - self.document = MongoDictAdapter(self.fake_id, database=self.db_name) - self.db = pymongo.Connection()[self.db_name] - - def tearDown(self): - self.db.main.remove({}) - - @classmethod - def tearDownClass(cls): - pymongo.MongoClient().drop_database(cls.db_name) - - def test_creating_a_new_key_should_saved_the_information(self): - self.document['new_key'] = 'value' - stored_value = self.db.main.find_one( - {'_id': 'id:{}:new_key'.format(self.fake_id)}) - self.assertIsNotNone(stored_value) - # This decodes the value with the defaults for MongoDict - decoded_value = pickle.loads(str(stored_value['v'])) - self.assertEqual(decoded_value, 'value') - - def test_reading_an_existing_key_should_read_saved_information(self): - encoded_value = Binary(pickle.dumps( - 'value', protocol=pickle.HIGHEST_PROTOCOL)) - - self.db.main.insert( - {'_id': 'id:{}:key'.format(self.fake_id), 'v': encoded_value}) - - self.assertEqual(self.document['key'], 'value') - - def test_deleting_an_existing_key_should_delete_saved_information(self): - encoded_value = Binary(pickle.dumps( - 'value', protocol=pickle.HIGHEST_PROTOCOL)) - - self.db.main.insert( - {'_id': 'id:{}:key'.format(self.fake_id), 'v': encoded_value}) - - self.assertEqual(self.document['key'], 'value') - del self.document['key'] - - stored_value = self.db.main.find_one( - {'_id': 'id:{}:key'.format(self.fake_id)}) - self.assertIsNone(stored_value) - - def test_iterating_through_keys_does_not_bring_keys_from_other_docs(self): - self.document['key_1'] = 1 - self.document['key_2'] = 2 - other_document = MongoDictAdapter('other_id', database=self.db_name) - other_document['other_key'] = 3 - keys = [k for k in self.document] - - self.assertIn('key_1', keys) - self.assertIn('key_2', keys) - self.assertNotIn('key_3', keys) - - self.assertEquals(['key_1', 'key_2'], self.document.keys()) - - def test_clear_should_not_remove_keys_for_other_docs(self): - self.document['key_1'] = 1 - self.document['key_2'] = 2 - other_document = MongoDictAdapter('other_id', database=self.db_name) - other_document['other_key'] = 3 - - self.document.clear() - - with self.assertRaises(KeyError): - self.document['key_1'] - self.document['key_2'] - - self.assertEqual(other_document['other_key'], 3) - - def test_return_correct_length(self): - self.document['key_1'] = 1 - self.document['key_2'] = 2 - other_document = MongoDictAdapter('other_id', database=self.db_name) - other_document['other_key'] = 3 - - self.assertEquals(len(self.document), 2) - - def test_contains(self): - self.document['key'] = 1 - self.assertIn('key', self.document) - self.assertNotIn('inexistent_key', self.document) - - def test_has_key(self): - self.document['key'] = 1 - self.assertTrue(self.document.has_key('key')) - self.assertFalse(self.document.has_key('inexistent_key')) diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index 91334b9..de605e2 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -28,16 +28,31 @@ class TestBigramWorker(TaskTest): def test_bigrams_should_return_correct_score(self): # We need this list comprehension because we need to save the word list - # in MongoDict (thus, it needs to be pickleable). Also, a list is what - # will be available to the worker in real situations. 
+ # in mongo (thus, it needs to be json serializable). Also, a list is + # what will be available to the worker in real situations. tokens = [w for w in nltk.corpus.genesis.words('english-web.txt')] - self.document['tokens'] = tokens - bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens) - expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u',', u'which') + doc_id = self.collection.insert({'tokens': tokens}, w=1) - Bigrams().delay(self.fake_id) - bigram_rank = self.document['bigram_rank'] + Bigrams().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + bigram_rank = refreshed_document['bigram_rank'] result = bigram_rank[0][1][0] - self.assertEqual(result, expected) + # This is the value of the chi_sq measure for this bigram in this + # collocation + expected_chi_sq = 95.59393417173634 + self.assertEqual(result, expected_chi_sq) + + def test_bigrams_could_contain_dollar_signs_and_dots(self): + tokens = ['$', '.'] + doc_id = self.collection.insert({'tokens': tokens}, w=1) + + Bigrams().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + bigram_rank = refreshed_document['bigram_rank'] + result = bigram_rank[0][1][0] + # 2.0 is the value of the chi_sq measure for this bigram in this + # collocation + expected_chi_sq = 2.0 + self.assertEqual(result, expected_chi_sq) diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py index 489bc02..d7819a5 100644 --- a/tests/test_worker_extractor.py +++ b/tests/test_worker_extractor.py @@ -17,6 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import base64 import os from textwrap import dedent from pypln.backend.workers import Extractor @@ -28,54 +29,64 @@ class TestExtractorWorker(TaskTest): def test_extraction_from_text_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'test.txt') - self.document.update({'filename': filename, - 'contents': open(filename).read()}) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) - self.assertEqual(self.document['file_metadata'], {}) - self.assertEqual(self.document['mimetype'], 'text/plain') + doc_id = self.collection.insert({'filename': filename, + 'contents': base64.b64encode(open(filename).read())}, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) + self.assertEqual(refreshed_document['file_metadata'], {}) + self.assertEqual(refreshed_document['mimetype'], 'text/plain') def test_extraction_from_html_file(self): expected = "This is a test file. I'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'test.html') - data = {'filename': filename, 'contents': open(filename).read()} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) - self.assertEqual(self.document['file_metadata'], {}) - self.assertEqual(self.document['mimetype'], 'text/html') + # When saving directly to mongodb we always get everything back from + # the database as unicode. Because of that, the extractor is having + # problems when there is a non-ascii character in the content. This + # wasn't a problem before because with mongodict we used to keep a + # pickled representation of the data. 
+ data = {'filename': filename, + 'contents': base64.b64encode(open(filename).read())} + doc_id = self.collection.insert(data, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) + self.assertEqual(refreshed_document['file_metadata'], {}) + self.assertEqual(refreshed_document['mimetype'], 'text/html') def test_extraction_from_pdf_file(self): expected = "This is a test file.\nI'm testing PyPLN extractor worker!" filename = os.path.join(DATA_DIR, 'test.pdf') - data = {'filename': filename, 'contents': open(filename).read()} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) + data = {'filename': filename, + 'contents': base64.b64encode(open(filename).read())} + doc_id = self.collection.insert(data, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) # Check that the expected metadata is a subset of what # our Extractor found (it may have found more details # depending on the toolset used to extract metadata) metadata_expected = { - 'Author': 'Álvaro Justen', - 'Creator': 'Writer', - 'Producer': 'LibreOffice 3.5', - 'CreationDate': 'Fri Jun 1 17:07:57 2012', - 'Tagged': 'no', - 'Pages': '1', - 'Encrypted': 'no', - 'Page size': '612 x 792 pts (letter)', - 'Optimized': 'no', - 'PDF version': '1.4', + u'Author': u'Álvaro Justen', + u'Creator': u'Writer', + u'Producer': u'LibreOffice 3.5', + u'CreationDate': u'Fri Jun 1 17:07:57 2012', + u'Tagged': u'no', + u'Pages': u'1', + u'Encrypted': u'no', + u'Page size': u'612 x 792 pts (letter)', + u'Optimized': u'no', + u'PDF version': u'1.4', } metadata_expected_set = set(metadata_expected.iteritems()) - metadata = self.document['file_metadata'] + metadata = refreshed_document['file_metadata'] metadata_set = set(metadata.iteritems()) diff_set = metadata_expected_set - metadata_set self.assertTrue(metadata_expected_set.issubset(metadata_set), ("Extracted metadata is not a subset of the expected metadata. " "Items missing or with different values: {}").format( u", ".join(unicode(item) for item in diff_set))) - self.assertEqual(self.document['mimetype'], 'application/pdf') + self.assertEqual(refreshed_document['mimetype'], 'application/pdf') def test_extraction_from_html(self): contents = dedent(''' @@ -101,9 +112,10 @@ def test_extraction_from_html(self): ''') - data = {'filename': 'test.html', 'contents': contents} - self.document.update(data) - result = Extractor().delay(self.fake_id) + data = {'filename': 'test.html', + 'contents': base64.b64encode(contents)} + doc_id = self.collection.insert(data, w=1) + Extractor().delay(doc_id) expected = dedent(''' Testing @@ -121,75 +133,92 @@ def test_extraction_from_html(self): bla1 bla2''').strip() - self.assertEqual(self.document['text'], expected) - self.assertEqual(self.document['mimetype'], 'text/html') + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) + self.assertEqual(refreshed_document['mimetype'], 'text/html') def test_language_detection_pt(self): text_pt = 'Esse texto foi escrito por Álvaro em Português.' 
- data_pt = {'filename': 'text-pt.txt', 'contents': text_pt} - self.document.update(data_pt) - Extractor().delay(self.fake_id).get() - self.assertEqual(self.document['language'], 'pt') + data_pt = {'filename': 'text-pt.txt', + 'contents': base64.b64encode(text_pt)} + doc_id = self.collection.insert(data_pt, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['language'], 'pt') def test_language_detection_es(self): text_es = 'Este texto ha sido escrito en Español por Álvaro.' - data_es = {'filename': 'text-es.txt', 'contents': text_es} - self.document.update(data_es) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['language'], 'es') + data_es = {'filename': 'text-es.txt', + 'contents': base64.b64encode(text_es)} + doc_id = self.collection.insert(data_es, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['language'], 'es') def test_language_detection_en(self): text_en = 'This text was written by Álvaro in English.' - data_en = {'filename': 'text-en.txt', 'contents': text_en} - self.document.update(data_en) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['language'], 'en') + data_en = {'filename': 'text-en.txt', + 'contents': base64.b64encode(text_en)} + doc_id = self.collection.insert(data_en, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['language'], 'en') def test_unescape_html_entities(self): expected = (u"This text has html . Álvaro asked me to make" " sure it also has non ascii chars.") filename = os.path.join(DATA_DIR, 'test_html_entities.txt') - data = {'filename': filename, 'contents': open(filename).read()} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) + data = {'filename': filename, + 'contents': base64.b64encode(open(filename).read())} + doc_id = self.collection.insert(data, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) def test_should_detect_encoding_and_return_a_unicode_object(self): expected = u"Flávio" filename = os.path.join(DATA_DIR, 'test_iso-8859-1.txt') - data = {'filename': filename, 'contents': open(filename).read()} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) - self.assertEqual(type(self.document['text']), unicode) + data = {'filename': filename, + 'contents': base64.b64encode(open(filename).read())} + doc_id = self.collection.insert(data, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) + self.assertEqual(type(refreshed_document['text']), unicode) def test_should_guess_mimetype_for_file_without_extension(self): contents = "This is a test file. I'm testing PyPLN extractor worker!" 
filename = os.path.join(DATA_DIR, 'text_file') - data = {'filename': filename, 'contents': contents} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['mimetype'], 'text/plain') + data = {'filename': filename, + 'contents': base64.b64encode(contents)} + doc_id = self.collection.insert(data, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['mimetype'], 'text/plain') def test_unknown_mimetype_should_be_flagged(self): filename = os.path.join(DATA_DIR, 'random_file') # we can't put the expected text content here, so we'll just make sure # it's equal to the input content, since contents = open(filename).read() - data = {'filename': filename, 'contents': contents} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['mimetype'], 'unknown') - self.assertEqual(self.document['text'], "") - self.assertEqual(self.document['language'], "") - self.assertEqual(self.document['file_metadata'], {}) + data = {'filename': filename, + 'contents': base64.b64encode(contents)} + doc_id = self.collection.insert(data, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['mimetype'], 'unknown') + self.assertEqual(refreshed_document['text'], "") + self.assertEqual(refreshed_document['language'], "") + self.assertEqual(refreshed_document['file_metadata'], {}) def test_unknown_encoding_should_be_ignored(self): filename = os.path.join(DATA_DIR, 'encoding_unknown_to_libmagic.txt') expected = u"This file has a weird byte (\x96) that makes it impossible for libmagic to recognize it's encoding." - data = {'filename': filename, 'contents': open(filename).read()} - self.document.update(data) - Extractor().delay(self.fake_id) - self.assertEqual(self.document['text'], expected) - self.assertEqual(self.document['file_metadata'], {}) - self.assertEqual(self.document['language'], 'en') + data = {'filename': filename, + 'contents': base64.b64encode(open(filename).read())} + doc_id = self.collection.insert(data, w=1) + Extractor().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['text'], expected) + self.assertEqual(refreshed_document['file_metadata'], {}) + self.assertEqual(refreshed_document['language'], 'en') diff --git a/tests/test_worker_freqdist.py b/tests/test_worker_freqdist.py index d8d61a9..bde9c98 100644 --- a/tests/test_worker_freqdist.py +++ b/tests/test_worker_freqdist.py @@ -22,18 +22,18 @@ class TestFreqDistWorker(TaskTest): def test_freqdist_should_return_a_list_of_tuples_with_frequency_distribution(self): - tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', - 'yellow', '.'] + tokens = [u'The', u'sky', u'is', u'blue', u',', u'the', u'sun', u'is', + u'yellow', u'.'] - expected_fd = [('is', 2), ('the', 2), ('blue', 1), ('sun', 1), - ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)] + expected_fd = [[u'is', 2], [u'the', 2], [u'blue', 1], [u'sun', 1], + [u'sky', 1], [u',', 1], [u'yellow', 1], [u'.', 1]] # This is just preparing the expected input in the database - self.document['tokens'] = tokens + doc_id = self.collection.insert({'tokens': tokens}, w=1) - FreqDist().delay(self.fake_id) + FreqDist().delay(doc_id) - resulting_fd = self.document['freqdist'] + resulting_fd = self.collection.find_one({'_id': doc_id})['freqdist'] self.assertEqual(resulting_fd, expected_fd) diff --git 
a/tests/test_worker_gridfs_data_retriever.py b/tests/test_worker_gridfs_data_retriever.py index ffdfce5..1012627 100644 --- a/tests/test_worker_gridfs_data_retriever.py +++ b/tests/test_worker_gridfs_data_retriever.py @@ -17,6 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import base64 import bson from gridfs import GridFS from pypln.backend.workers import GridFSDataRetriever @@ -31,19 +32,29 @@ def test_extract_file_data_from_GridFS(self): new_file_id = gridfs.put(content) expected_file_data = gridfs.get(new_file_id) - self.document['file_id'] = str(new_file_id) - GridFSDataRetriever().delay(self.fake_id) + data = {'file_id': str(new_file_id)} + doc_id = self.collection.insert(data, w=1) + GridFSDataRetriever().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) - self.assertEqual(self.document['contents'], content) - self.assertEqual(self.document['length'], expected_file_data.length) - self.assertEqual(self.document['md5'], expected_file_data.md5) - self.assertEqual(self.document['filename'], expected_file_data.filename) - self.assertEqual(self.document['upload_date'], expected_file_data.upload_date) - self.assertEqual(self.document['contents'], expected_file_data.read()) + self.assertEqual(refreshed_document['contents'], + base64.b64encode(content)) + self.assertEqual(refreshed_document['length'], + expected_file_data.length) + self.assertEqual(refreshed_document['md5'], expected_file_data.md5) + self.assertEqual(refreshed_document['filename'], + expected_file_data.filename) + self.assertEqual(refreshed_document['upload_date'], + expected_file_data.upload_date) + self.assertEqual(refreshed_document['contents'], + base64.b64encode(expected_file_data.read())) def test_task_raises_exception_when_file_does_not_exist(self): - self.document['file_id'] = "Inexistent document" - result = GridFSDataRetriever().delay(self.fake_id) + data = {'file_id': "Inexistent document"} + doc_id = self.collection.insert(data, w=1) + result = GridFSDataRetriever().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertTrue(result.failed()) self.assertEqual(result.status, "FAILURE") self.assertIsInstance(result.info, bson.errors.InvalidId) diff --git a/tests/test_worker_gridfs_file_deleter.py b/tests/test_worker_gridfs_file_deleter.py index 60f9613..149d9d0 100644 --- a/tests/test_worker_gridfs_file_deleter.py +++ b/tests/test_worker_gridfs_file_deleter.py @@ -31,16 +31,16 @@ def test_delete_file_from_GridFS(self): new_file_id = gridfs.put(content) expected_file_data = gridfs.get(new_file_id) - self.document['file_id'] = str(new_file_id) + doc_id = self.collection.insert({'file_id': new_file_id}, w=1) self.assertTrue(gridfs.exists(new_file_id)) - GridFSFileDeleter().delay(self.fake_id) + GridFSFileDeleter().delay(doc_id) self.assertFalse(gridfs.exists(new_file_id)) def test_task_raises_exception_when_file_does_not_exist(self): - self.document['file_id'] = "Inexistent document" - result = GridFSFileDeleter().delay(self.fake_id) + doc_id = self.collection.insert({'file_id': "Inexistent document"}, w=1) + result = GridFSFileDeleter().delay(doc_id) self.assertTrue(result.failed()) self.assertEqual(result.status, "FAILURE") self.assertIsInstance(result.info, bson.errors.InvalidId) diff --git a/tests/test_worker_lemmatizer_pt.py b/tests/test_worker_lemmatizer_pt.py index 2dc4c62..3887d81 100644 --- a/tests/test_worker_lemmatizer_pt.py +++ b/tests/test_worker_lemmatizer_pt.py @@ -42,8 
+42,9 @@ def test_lemmatizer_should_return_a_list_with_lemmas(self): ''').strip() + '\n\n' - self.document['palavras_raw'] = palavras_output - self.document['palavras_raw_ran'] = True - result = Lemmatizer().delay(self.fake_id) + doc = {'palavras_raw': palavras_output, 'palavras_raw_ran': True} + doc_id = self.collection.insert(doc, w=1) + result = Lemmatizer().delay(doc_id) expected = 'eu saber que em este momento falar para todo Brasil .'.split() - self.assertEqual(self.document['lemmas'], expected) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['lemmas'], expected) diff --git a/tests/test_worker_palavras_noun_phrase.py b/tests/test_worker_palavras_noun_phrase.py index 950f8ae..68adcaa 100644 --- a/tests/test_worker_palavras_noun_phrase.py +++ b/tests/test_worker_palavras_noun_phrase.py @@ -44,9 +44,10 @@ def test_noun_phrase_worker_should_return_a_list_with_phrases(self): ''').strip() + '\n\n' - self.document.update({'palavras_raw': palavras_output, - 'palavras_raw_ran': True}) - NounPhrase().delay(self.fake_id) + doc_id = self.collection.insert({'palavras_raw': palavras_output, + 'palavras_raw_ran': True}, w=1) + NounPhrase().delay(doc_id) expected = ['_este *momento', 'todo o *povo de_ _o Brasil .', '_o *Brasil .'] - self.assertEqual(self.document['noun_phrases'], expected) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['noun_phrases'], expected) diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py index 41e9c61..90a3845 100644 --- a/tests/test_worker_palavras_raw.py +++ b/tests/test_worker_palavras_raw.py @@ -30,25 +30,28 @@ class TestPalavrasRawWorker(TaskTest): def test_should_run_only_if_language_is_portuguese(self): if palavras_raw.palavras_installed(): - self.document.update({'text': 'There was a rock on the way.', - 'language': 'en'}) + doc_id = self.collection.insert({'text': 'There was a rock on the way.', + 'language': 'en'}, w=1) - palavras_raw.PalavrasRaw().delay(self.fake_id) - self.assertEqual(self.document['palavras_raw_ran'], False) + palavras_raw.PalavrasRaw().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['palavras_raw_ran'], False) def test_palavras_not_installed(self): palavras_raw.BASE_PARSER = '/not-found' - self.document.update({'text': 'Tinha uma pedra no meio do caminho.', - 'language': 'pt'}) - palavras_raw.PalavrasRaw().delay(self.fake_id) - self.assertEqual(self.document['palavras_raw_ran'], False) + doc_id = self.collection.insert( + {'text': 'Tinha uma pedra no meio do caminho.', + 'language': 'pt'}, w=1) + palavras_raw.PalavrasRaw().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['palavras_raw_ran'], False) def test_palavras_should_return_raw_if_it_is_installed(self): palavras_raw.BASE_PARSER = ORIGINAL_PATH - self.document.update( + doc_id = self.collection.insert( {'text': 'Eu sei que neste momento falo para todo Brasil.', - 'language': 'pt'}) + 'language': 'pt'}, w=1) expected_raw = dedent(''' Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2 sei [saber] V PR 1S IND VFIN @FS-STA #2->0 @@ -63,6 +66,7 @@ def test_palavras_should_return_raw_if_it_is_installed(self): $. 
#11->0 ''').strip() + '\n\n' - result = palavras_raw.PalavrasRaw().delay(self.fake_id) - self.assertEqual(self.document['palavras_raw'], expected_raw) - self.assertEqual(self.document['palavras_raw_ran'], True) + result = palavras_raw.PalavrasRaw().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['palavras_raw'], expected_raw) + self.assertEqual(refreshed_document['palavras_raw_ran'], True) diff --git a/tests/test_worker_palavras_semantic_tagger.py b/tests/test_worker_palavras_semantic_tagger.py index aa75a2a..1b3abbe 100644 --- a/tests/test_worker_palavras_semantic_tagger.py +++ b/tests/test_worker_palavras_semantic_tagger.py @@ -54,11 +54,12 @@ def test_basic_semantic_tags(self): 'Verbs_related_human_things': ['falo'] } - self.document.update({'palavras_raw': palavras_output, - 'palavras_raw_ran': True}) - SemanticTagger().delay(self.fake_id) + doc_id = self.collection.insert({'palavras_raw': palavras_output, + 'palavras_raw_ran': True}, w=1) + SemanticTagger().delay(doc_id) - self.assertEqual(self.document['semantic_tags'], expected_tags) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['semantic_tags'], expected_tags) def test_ambiguous_tags(self): @@ -77,12 +78,13 @@ def test_ambiguous_tags(self): ''').strip() + '\n\n' expected_tags = { - 'Non_Tagged': ['Eu', 'bem', 'enquanto', 'ele', 'está', 'em', - 'o'], - 'Place and spatial': ['canto'], - 'Verbs_related_human_things': ['canto'] + 'Non_Tagged': [u'Eu', u'bem', u'enquanto', u'ele', u'está', + u'em', u'o'], + 'Place and spatial': [u'canto'], + 'Verbs_related_human_things': [u'canto'] } - self.document.update({'palavras_raw': palavras_output, - 'palavras_raw_ran': True}) - SemanticTagger().delay(self.fake_id) - self.assertEqual(self.document['semantic_tags'], expected_tags) + doc_id = self.collection.insert({'palavras_raw': palavras_output, + 'palavras_raw_ran': True}, w=1) + SemanticTagger().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['semantic_tags'], expected_tags) diff --git a/tests/test_worker_pos.py b/tests/test_worker_pos.py index 6413c18..68a3bc2 100644 --- a/tests/test_worker_pos.py +++ b/tests/test_worker_pos.py @@ -27,15 +27,16 @@ def test_pos_should_return_a_list_of_tuples_with_token_classification_and_offset text = 'The sky is blue, the sun is yellow.' 
tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', 'yellow', '.'] - expected = [('The', 'DT', 0), ('sky', 'NN', 4), ('is', 'VBZ', 8), - ('blue', 'JJ', 11), (',', ',', 15), ('the', 'DT', 17), - ('sun', 'NN', 21), ('is', 'VBZ', 25), ('yellow', 'JJ', 28), - ('.', '.', 34)] - self.document.update({'text': text, 'tokens': tokens, - 'language': 'en'}) - POS().delay(self.fake_id) - self.assertEqual(self.document['pos'], expected) - self.assertEqual(self.document['tagset'], 'en-nltk') + expected = [['The', 'DT', 0], ['sky', 'NN', 4], ['is', 'VBZ', 8], + ['blue', 'JJ', 11], [',', ',', 15], ['the', 'DT', 17], + ['sun', 'NN', 21], ['is', 'VBZ', 25], ['yellow', 'JJ', 28], + ['.', '.', 34]] + doc_id = self.collection.insert({'text': text, 'tokens': tokens, + 'language': 'en'}, w=1) + POS().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['pos'], expected) + self.assertEqual(refreshed_document['tagset'], 'en-nltk') def test_pos_should_run_pt_palavras_if_text_is_in_portuguese(self): text = 'Isso é uma frase em português.' @@ -51,9 +52,10 @@ def test_pos_should_run_pt_palavras_if_text_is_in_portuguese(self): ''').strip() + '\n\n' # '.' is the only named entity here. - expected = [(u'.', u'.', 29)] - self.document.update({'text': text, 'tokens': tokens, - 'language': 'pt', 'palavras_raw': palavras_raw}) - POS().delay(self.fake_id) - self.assertEqual(self.document['pos'], expected) - self.assertEqual(self.document['tagset'], 'pt-palavras') + expected = [[u'.', u'.', 29]] + doc_id = self.collection.insert({'text': text, 'tokens': tokens, + 'language': 'pt', 'palavras_raw': palavras_raw}, w=1) + POS().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['pos'], expected) + self.assertEqual(refreshed_document['tagset'], 'pt-palavras') diff --git a/tests/test_worker_spellchecker.py b/tests/test_worker_spellchecker.py index 7700b75..b81bb93 100644 --- a/tests/test_worker_spellchecker.py +++ b/tests/test_worker_spellchecker.py @@ -28,19 +28,24 @@ class TestSpellcheckerWorker(TaskTest): def test_spellchek_pt(self): text = u"Meu cachoro é um pastor" - self.document.update({'text': text, 'language': 'pt_BR'}) - spellchecker.SpellingChecker().delay(self.fake_id) - self.assertEqual(len(self.document['spelling_errors']), 1) - self.assertIn('cachoro', self.document['spelling_errors'][0]) - self.assertIn('cachorro', self.document['spelling_errors'][0][2]) - self.assertEqual(self.document['spelling_errors'][0][1], 4) + doc_id = self.collection.insert({'text': text, 'language': 'pt_BR'}, + w=1) + spellchecker.SpellingChecker().delay(doc_id) + + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(len(refreshed_document['spelling_errors']), 1) + self.assertIn('cachoro', refreshed_document['spelling_errors'][0]) + self.assertIn('cachorro', refreshed_document['spelling_errors'][0][2]) + self.assertEqual(refreshed_document['spelling_errors'][0][1], 4) def test_spellchek_en(self): text = u"The cat bit the doggyo" - self.document.update({'text': text, 'language': 'en'}) - spellchecker.SpellingChecker().delay(self.fake_id) - self.assertEqual(len(self.document['spelling_errors']), 1) - self.assertIn('doggyo', self.document['spelling_errors'][0]) - self.assertIn('doggy', self.document['spelling_errors'][0][2]) - self.assertEqual(self.document['spelling_errors'][0][1], 16) + doc_id = self.collection.insert({'text': text, 'language': 'en'}, w=1) + 
spellchecker.SpellingChecker().delay(doc_id) + + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(len(refreshed_document['spelling_errors']), 1) + self.assertIn('doggyo', refreshed_document['spelling_errors'][0]) + self.assertIn('doggy', refreshed_document['spelling_errors'][0][2]) + self.assertEqual(refreshed_document['spelling_errors'][0][1], 16) diff --git a/tests/test_worker_statistics.py b/tests/test_worker_statistics.py index efc412f..3370e8d 100644 --- a/tests/test_worker_statistics.py +++ b/tests/test_worker_statistics.py @@ -23,26 +23,32 @@ class TestStatisticsWorker(TaskTest): def test_simple(self): - self.document['sentences'] = [['this', 'is', 'a', 'test', '.'], - ['this', 'is', 'another', '!']] - self.document['freqdist'] = [('this', 2), ('is', 2), ('a', 1), - ('test', 1), ('.', 1), ('another', 1), ('!', 1)] - Statistics().delay(self.fake_id) - self.assertEqual(self.document['average_sentence_length'], 4.5) - self.assertEqual(self.document['average_sentence_repertoire'], 1) - self.assertAlmostEqual(self.document['momentum_1'], 1.2857, places=3) - self.assertAlmostEqual(self.document['momentum_2'], 1.8571, places=3) - self.assertEqual(self.document['momentum_3'], 3) - self.assertAlmostEqual(self.document['momentum_4'], 5.2857, places=3) - self.assertAlmostEqual(self.document['repertoire'], 0.7777, places=3) + doc = {'sentences': [['this', 'is', 'a', 'test', '.'], ['this', 'is', + 'another', '!']], 'freqdist': [('this', 2), ('is', 2), ('a', 1), + ('test', 1), ('.', 1), ('another', 1), ('!', 1)]} + doc_id = self.collection.insert(doc, w=1) + Statistics().delay(doc_id) + + refreshed_document = self.collection.find_one({'_id': doc_id}) + + self.assertEqual(refreshed_document['average_sentence_length'], 4.5) + self.assertEqual(refreshed_document['average_sentence_repertoire'], 1) + self.assertAlmostEqual(refreshed_document['momentum_1'], 1.2857, places=3) + self.assertAlmostEqual(refreshed_document['momentum_2'], 1.8571, places=3) + self.assertEqual(refreshed_document['momentum_3'], 3) + self.assertAlmostEqual(refreshed_document['momentum_4'], 5.2857, places=3) + self.assertAlmostEqual(refreshed_document['repertoire'], 0.7777, places=3) def test_zero_division_error(self): - self.document.update({'freqdist': [], 'sentences': []}) - Statistics().delay(self.fake_id) - self.assertEqual(self.document['average_sentence_length'], 0) - self.assertEqual(self.document['average_sentence_repertoire'], 0) - self.assertEqual(self.document['momentum_1'], 0) - self.assertEqual(self.document['momentum_2'], 0) - self.assertEqual(self.document['momentum_3'], 0) - self.assertEqual(self.document['momentum_4'], 0) - self.assertEqual(self.document['repertoire'], 0) + doc_id = self.collection.insert({'freqdist': [], 'sentences': []}, w=1) + + Statistics().delay(doc_id) + + refreshed_document = self.collection.find_one({'_id': doc_id}) + self.assertEqual(refreshed_document['average_sentence_length'], 0) + self.assertEqual(refreshed_document['average_sentence_repertoire'], 0) + self.assertEqual(refreshed_document['momentum_1'], 0) + self.assertEqual(refreshed_document['momentum_2'], 0) + self.assertEqual(refreshed_document['momentum_3'], 0) + self.assertEqual(refreshed_document['momentum_4'], 0) + self.assertEqual(refreshed_document['repertoire'], 0) diff --git a/tests/test_worker_tokenizer.py b/tests/test_worker_tokenizer.py index 6d3e4d3..9d59cac 100644 --- a/tests/test_worker_tokenizer.py +++ b/tests/test_worker_tokenizer.py @@ -23,17 +23,19 @@ class 
TestTokenizerWorker(TaskTest): def test_tokenizer_should_receive_text_and_return_tokens(self): - self.document['text'] = 'The sky is blue, the sun is yellow. This is another sentence.' + doc = {'text': 'The sky is blue, the sun is yellow. This is another sentence.'} expected_tokens = ['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', 'yellow', '.', 'This', 'is', 'another', 'sentence', '.'] expected_sentences = [['The', 'sky', 'is', 'blue', ',', 'the', 'sun', 'is', 'yellow', '.'], ['This', 'is', 'another', 'sentence', '.']] - Tokenizer().delay(self.fake_id) + doc_id = self.collection.insert(doc, w=1) + Tokenizer().delay(doc_id) - tokens = self.document['tokens'] - sentences = self.document['sentences'] + refreshed_document = self.collection.find_one({'_id': doc_id}) + tokens = refreshed_document['tokens'] + sentences = refreshed_document['sentences'] self.assertEqual(tokens, expected_tokens) self.assertEqual(sentences, expected_sentences) diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py index 6b1818f..93575e9 100644 --- a/tests/test_worker_trigrams.py +++ b/tests/test_worker_trigrams.py @@ -26,13 +26,27 @@ class TestTrigramWorker(TaskTest): - def test_Trigrams_should_return_correct_score_(self): + def test_Trigrams_should_return_correct_score(self): tokens = [w for w in nltk.corpus.genesis.words('english-web.txt')] - trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens) - expected = trigram_finder.score_ngram(trigram_measures.chi_sq, u'olive', u'leaf',u'plucked') - self.document['tokens'] = tokens - Trigrams().delay(self.fake_id) - trigram_rank = self.document['trigram_rank'] - result = trigram_rank[(u'olive', u'leaf',u'plucked')][0] - self.assertEqual(result, expected) + doc_id = self.collection.insert({'tokens': tokens}, w=1) + Trigrams().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + trigram_rank = refreshed_document['trigram_rank'] + result = trigram_rank[u'olive leaf plucked'][0] + # This is the value of the chi_sq measure for this trigram in this + # collocation + expected_chi_sq = 1940754916.9623578 + self.assertEqual(result, expected_chi_sq) + + def test_Trigrams_may_contain_dots_and_dollar_signs(self): + tokens = ['$', 'test', '.'] + doc_id = self.collection.insert({'tokens': tokens}, w=1) + Trigrams().delay(doc_id) + refreshed_document = self.collection.find_one({'_id': doc_id}) + trigram_rank = refreshed_document['trigram_rank'] + result = trigram_rank[u'\dollarsign test \dot'][0] + # This is the value of the chi_sq measure for this trigram in this + # collocation + expected_chi_sq = 10.5 + self.assertEqual(result, expected_chi_sq) diff --git a/tests/test_worker_wordcloud.py b/tests/test_worker_wordcloud.py index 56c48f2..23ed090 100644 --- a/tests/test_worker_wordcloud.py +++ b/tests/test_worker_wordcloud.py @@ -29,12 +29,13 @@ class TestFreqDistWorker(TaskTest): name = "WordCloud" def test_wordcloud_should_return_a_base64_encoded_png(self): - self.document['freqdist'] = [('is', 2), ('the', 2), ('blue', 1), ('sun', 1), - ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)] - self.document['language'] = 'en' - WordCloud().delay(self.fake_id).get() + doc = {'freqdist': [('is', 2), ('the', 2), ('blue', 1), ('sun', 1), + ('sky', 1), (',', 1), ('yellow', 1), ('.', 1)], 'language': 'en'} + doc_id = self.collection.insert(doc, w=1) + WordCloud().delay(doc_id) - raw_png_data = base64.b64decode(self.document['wordcloud']) + refreshed_document = self.collection.find_one({'_id': doc_id}) + raw_png_data = 
base64.b64decode(refreshed_document['wordcloud']) fake_file = StringIO(raw_png_data) img = Image.open(fake_file) diff --git a/tests/utils.py b/tests/utils.py index 04e7c7c..452bc79 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -21,7 +21,6 @@ import unittest import pymongo -from pypln.backend.mongodict_adapter import MongoDictAdapter from pypln.backend.celery_app import app from pypln.backend import config @@ -33,12 +32,11 @@ class TaskTest(unittest.TestCase): def setUp(self): app.conf.update(CELERY_ALWAYS_EAGER=True) - self.fake_id = '1234' - self.document = MongoDictAdapter(self.fake_id, database=self.db_name) self.db = pymongo.Connection()[self.db_name] + self.collection = self.db[config.MONGODB_CONFIG['collection']] def tearDown(self): - self.db.main.remove({}) + self.collection.remove({}) @classmethod def setUpClass(cls):