diff --git a/pypln/backend/celery_task.py b/pypln/backend/celery_task.py
index ed1c957..bced3a2 100644
--- a/pypln/backend/celery_task.py
+++ b/pypln/backend/celery_task.py
@@ -16,11 +16,9 @@
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-
+import pymongo
from celery import Task
-from pypln.backend.mongodict_adapter import MongoDictAdapter
-
# This import may look like an unused import, but it is not.
# When our base task class is defined, the Celery app must have already been
# instantiated, otherwise when this code is imported elsewhere (like in a
@@ -33,6 +31,11 @@
from pypln.backend import config
+mongo_client = pymongo.MongoClient(host=config.MONGODB_CONFIG["host"],
+ port=config.MONGODB_CONFIG["port"])
+database = mongo_client[config.MONGODB_CONFIG["database"]]
+document_collection = database[config.MONGODB_CONFIG["collection"]]
+
class PyPLNTask(Task):
"""
A base class for PyPLN tasks. It is in charge of getting the document
@@ -48,16 +51,9 @@ def run(self, document_id):
It will call the `process` method with a dictionary containing all the
document information and will update the database with results.
"""
- document = MongoDictAdapter(doc_id=document_id,
- host=config.MONGODB_CONFIG['host'],
- port=config.MONGODB_CONFIG['port'],
- database=config.MONGODB_CONFIG['database'])
- # Create a dictionary out of our document. We could simply pass
- # it on to the process method, but for now we won't let the user
- # manipulate the MongoDict directly.
- dic = {k: v for k, v in document.iteritems()}
- result = self.process(dic)
- document.update(result)
+ document = document_collection.find_one({"_id": document_id})
+ result = self.process(document)
+ document_collection.update({"_id": document_id}, {"$set": result})
return document_id
def process(self, document):
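
For context, a minimal sketch of the new task flow (assumptions: a running
MongoDB, celery configured to run tasks eagerly as in the test suite, and a
hypothetical WordCounter worker that is not part of this patch):

    from pypln.backend.celery_task import PyPLNTask, document_collection

    class WordCounter(PyPLNTask):  # hypothetical example worker
        def process(self, document):
            # `document` is the plain dict returned by find_one()
            return {'word_count': len(document['text'].split())}

    # run() fetches the document by _id, calls process() and merges the
    # result back into the same document with a $set update.
    doc_id = document_collection.insert({'text': 'a b c'}, w=1)
    WordCounter().delay(doc_id)
    assert document_collection.find_one({'_id': doc_id})['word_count'] == 3
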
diff --git a/pypln/backend/config.py b/pypln/backend/config.py
index f06fb1e..f074c3a 100644
--- a/pypln/backend/config.py
+++ b/pypln/backend/config.py
@@ -5,6 +5,7 @@ def get_store_config():
defaults = {'host': 'localhost',
'port': '27017',
'database': 'pypln_dev',
+ 'collection': 'documents',
'gridfs_collection': 'files',
}
config = ConfigParser.ConfigParser(defaults=defaults)
diff --git a/pypln/backend/mongodict_adapter.py b/pypln/backend/mongodict_adapter.py
deleted file mode 100644
index b57d322..0000000
--- a/pypln/backend/mongodict_adapter.py
+++ /dev/null
@@ -1,63 +0,0 @@
-# coding: utf-8
-#
-# Copyright 2012 NAMD-EMAP-FGV
-#
-# This file is part of PyPLN. You can get more information at: http://pypln.org/.
-#
-# PyPLN is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# PyPLN is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-
-from mongodict import MongoDict
-
-
-class MongoDictAdapter(MongoDict):
- #TODO: implement clear, __iter__, __len__ and contains with filters by id
- def __init__(self, doc_id, *args, **kwargs):
- self.doc_id = doc_id
- self.prefix = 'id:{}:'.format(self.doc_id)
- self.prefixed_id_query = {'$regex':
- '^{}'.format(self.prefix)}
- return super(MongoDictAdapter, self).__init__(*args, **kwargs)
-
- def __setitem__(self, key, value):
- key = 'id:{}:{}'.format(self.doc_id, key)
- return super(MongoDictAdapter, self).__setitem__(key, value)
-
- def __getitem__(self, key):
- key = 'id:{}:{}'.format(self.doc_id, key)
- return super(MongoDictAdapter, self).__getitem__(key)
-
- def __delitem__(self, key):
- key = 'id:{}:{}'.format(self.doc_id, key)
- return super(MongoDictAdapter, self).__delitem__(key)
-
- def __contains__(self, key):
- # If this is being called by other methods (like __delitem__)
- # it will already have the prefix
- if not key.startswith('id:'):
- key = 'id:{}:{}'.format(self.doc_id, key)
- return super(MongoDictAdapter, self).__contains__(key)
-
- has_key = __contains__
-
- def __iter__(self):
- query_result = self._collection.find({'_id':
- self.prefixed_id_query}, {'_id': 1})
- keys = (k['_id'].replace(self.prefix, '', 1) for k in query_result)
- return keys
-
- def __len__(self):
- return self._collection.find({'_id': self.prefixed_id_query}).count()
-
- def clear(self):
- self._collection.remove({'_id': self.prefixed_id_query})
diff --git a/pypln/backend/workers/elastic_indexer.py b/pypln/backend/workers/elastic_indexer.py
index faf8119..f5b55c3 100644
--- a/pypln/backend/workers/elastic_indexer.py
+++ b/pypln/backend/workers/elastic_indexer.py
@@ -36,6 +36,12 @@ def process(self, document):
# See `test_regression_indexing_should_not_include_contents` in
# tests/test_elastic_indexer.py for details.
document.pop('contents')
+ # We also need to exclude _id, because ObjectIds won't be JSON
+ # serializable.
+ document.pop("_id")
+
result = ES.index(index=index_name, doc_type=doc_type,
body=document, id=file_id)
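+ # Elasticsearch's response carries its own `_id`; rename it so the
+ # `$set` in PyPLNTask.run does not try to overwrite the immutable
+ # mongo `_id` of the document being processed.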
+ index_id = result.pop("_id")
+ result["index_id"] = index_id
return result
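
Why popping `_id` matters, as a minimal sketch of the failure mode (assumes
only the stdlib json module and the bson package bundled with pymongo):

    import json
    from bson import ObjectId

    doc = {'_id': ObjectId(), 'text': 'hello'}
    try:
        json.dumps(doc)  # the Elasticsearch client JSON-encodes the body
    except TypeError:
        pass  # ObjectId is not JSON serializable
    doc.pop('_id')
    json.dumps(doc)  # fine once _id is removed
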
diff --git a/pypln/backend/workers/extractor.py b/pypln/backend/workers/extractor.py
index 09e1b32..110730b 100644
--- a/pypln/backend/workers/extractor.py
+++ b/pypln/backend/workers/extractor.py
@@ -17,6 +17,7 @@
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+import base64
import shlex
from HTMLParser import HTMLParser
@@ -169,15 +170,16 @@ class Extractor(PyPLNTask):
#TODO: should 'replace_with' be '' when extracting from HTML?
def process(self, file_data):
+ contents = base64.b64decode(file_data['contents'])
with magic.Magic(flags=magic.MAGIC_MIME_TYPE) as m:
- file_mime_type = m.id_buffer(file_data['contents'])
+ file_mime_type = m.id_buffer(contents)
metadata = {}
if file_mime_type == 'text/plain':
- text = file_data['contents']
+ text = contents
elif file_mime_type == 'text/html':
- text = parse_html(file_data['contents'], True, ['script', 'style'])
+ text = parse_html(contents, True, ['script', 'style'])
elif file_mime_type == 'application/pdf':
- text, metadata = extract_pdf(file_data['contents'])
+ text, metadata = extract_pdf(contents)
else:
# If we can't detect the mimetype we add a flag that can be read by
# the frontend to provide more information on why the document
diff --git a/pypln/backend/workers/gridfs_data_retriever.py b/pypln/backend/workers/gridfs_data_retriever.py
index 268cb21..68ed916 100644
--- a/pypln/backend/workers/gridfs_data_retriever.py
+++ b/pypln/backend/workers/gridfs_data_retriever.py
@@ -16,6 +16,7 @@
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+import base64
from bson import ObjectId
from gridfs import GridFS
import pymongo
@@ -31,9 +32,17 @@ def process(self, document):
gridfs = GridFS(database, config.MONGODB_CONFIG['gridfs_collection'])
file_data = gridfs.get(ObjectId(document['file_id']))
+
+ # We decided to store 'contents' as a base64-encoded string in the
+ # database to avoid possible corruption of files. For example, when
+ # the file is a PDF, storing the raw data as UTF-8 in mongo might
+ # corrupt it. This wasn't a problem before, because MongoDict
+ # pickled everything before storing.
+ contents = base64.b64encode(file_data.read())
+
result = {'length': file_data.length,
'md5': file_data.md5,
'filename': file_data.filename,
'upload_date': file_data.upload_date,
- 'contents': file_data.read()}
+ 'contents': contents}
return result
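
The retriever now encodes and the extractor decodes, so the two changes form
a plain base64 round trip; a minimal sketch with a hypothetical byte string:

    import base64

    raw = '%PDF-1.4\x00\xff'  # hypothetical binary header; not valid UTF-8
    encoded = base64.b64encode(raw)  # plain ASCII, safe to store in mongo
    assert base64.b64decode(encoded) == raw
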
diff --git a/pypln/backend/workers/gridfs_file_deleter.py b/pypln/backend/workers/gridfs_file_deleter.py
index c1dc15f..36ea082 100644
--- a/pypln/backend/workers/gridfs_file_deleter.py
+++ b/pypln/backend/workers/gridfs_file_deleter.py
@@ -25,9 +25,9 @@
class GridFSFileDeleter(PyPLNTask):
def process(self, document):
- database = pymongo.MongoClient(host=config.MONGODB_CONFIG['host'],
- port=config.MONGODB_CONFIG['port']
- )[config.MONGODB_CONFIG['database']]
+ mongo_client = pymongo.MongoClient(host=config.MONGODB_CONFIG["host"],
+ port=config.MONGODB_CONFIG["port"])
+ database = mongo_client[config.MONGODB_CONFIG["database"]]
gridfs = GridFS(database, config.MONGODB_CONFIG['gridfs_collection'])
gridfs.delete(ObjectId(document['file_id']))
diff --git a/pypln/backend/workers/tokenizer.py b/pypln/backend/workers/tokenizer.py
index d6f30d0..fd5e37a 100644
--- a/pypln/backend/workers/tokenizer.py
+++ b/pypln/backend/workers/tokenizer.py
@@ -16,7 +16,6 @@
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-from mongodict import MongoDict
from nltk import word_tokenize, sent_tokenize
from pypln.backend.celery_task import PyPLNTask
diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py
index d1972c2..4ad46ef 100644
--- a/pypln/backend/workers/trigrams.py
+++ b/pypln/backend/workers/trigrams.py
@@ -42,6 +42,14 @@ def process(self, document):
tr = defaultdict(lambda: [])
for m in metrics:
for res in trigram_finder.score_ngrams(getattr(trigram_measures,m)):
- tr[res[0]].append(res[1])
+ # We cannot store the trigram as a tuple (mongo keys need to be
+ # strings). We decided to join tokens using spaces since a
+ # space will never be in a token.
+ key = u' '.join(res[0])
+ # Mongo cannot have `.` or `$` in key names. Unfortunately
+ # this means we need to replace them with placeholders.
+ key = key.replace(u'$', u'\dollarsign')
+ key = key.replace(u'.', u'\dot')
+ tr[key].append(res[1])
- return {'trigram_rank': dict(tr), 'metrics':metrics}
+ return {'trigram_rank': tr, 'metrics':metrics}
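
A sketch of the key sanitization above (Python 2 string semantics, as in the
rest of the codebase; note that `\d` is not a recognized escape sequence, so
u'\dollarsign' keeps a literal backslash):

    tokens = (u'U.S.', u'$', u'dollars')  # hypothetical trigram
    key = u' '.join(tokens)  # tokens never contain spaces
    key = key.replace(u'$', u'\dollarsign')
    key = key.replace(u'.', u'\dot')
    assert key == u'U\dotS\dot \dollarsign dollars'
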
diff --git a/requirements/production.txt b/requirements/production.txt
index 2c80c43..e19f0fe 100644
--- a/requirements/production.txt
+++ b/requirements/production.txt
@@ -1,7 +1,6 @@
# Common
celery
pymongo==2.8.1
-mongodict
# The newest pyparsing (2.0) only supports python 3,
# so we explicitly install 1.5.7 (the last version that
diff --git a/tests/test_celery_task.py b/tests/test_celery_task.py
new file mode 100644
index 0000000..fd1adde
--- /dev/null
+++ b/tests/test_celery_task.py
@@ -0,0 +1,39 @@
+# coding: utf-8
+#
+# Copyright 2015 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+from pypln.backend.celery_task import PyPLNTask
+from utils import TaskTest
+
+class FakeTask(PyPLNTask):
+ def process(self, document):
+ return {'result': document['input']}
+
+class TestCeleryTask(TaskTest):
+ def test_task_should_get_the_correct_document(self):
+ """This is a regression test. PyPLNTask was not filtering by _id. It
+ was getting the first document it found. """
+
+ # This is just preparing the expected input in the database
+ wrong_doc_id = self.collection.insert({'input': 'wrong'}, w=1)
+ correct_doc_id = self.collection.insert({'input': 'correct'}, w=1)
+
+ FakeTask().delay(correct_doc_id)
+
+ refreshed_doc = self.collection.find_one({'_id': correct_doc_id})
+
+ self.assertEqual(refreshed_doc['result'], 'correct')
diff --git a/tests/test_elastic_indexer.py b/tests/test_elastic_indexer.py
index a35dd23..faaafab 100644
--- a/tests/test_elastic_indexer.py
+++ b/tests/test_elastic_indexer.py
@@ -32,9 +32,10 @@ def test_indexing_go_through(self):
'contents': 'raw_file_contents',
}
- self.document.update(doc)
- ElasticIndexer().delay(self.fake_id)
- assert self.document['created'] # must be True
+ doc_id = self.collection.insert(doc, w=1)
+ ElasticIndexer().delay(doc_id)
+ refreshed_document = self.collection.find_one({'_id': doc_id})
+ self.assertTrue(refreshed_document['created'])
@patch('pypln.backend.workers.elastic_indexer.ES')
def test_regression_indexing_should_not_include_contents(self, ES):
@@ -54,11 +55,12 @@ def test_regression_indexing_should_not_include_contents(self, ES):
'contents': 'raw_file_contents',
}
- self.document.update(doc)
- ElasticIndexer().delay(self.fake_id)
+ doc_id = self.collection.insert(doc, w=1)
+ ElasticIndexer().delay(doc_id)
# remove properties that won't be indexed
index_name = doc.pop("index_name")
doc_type = doc.pop('doc_type')
doc.pop('contents')
+ doc.pop('_id')
ES.index.assert_called_with(body=doc, id=doc['file_id'],
doc_type=doc_type, index=index_name)
diff --git a/tests/test_mongodict_adapter.py b/tests/test_mongodict_adapter.py
deleted file mode 100644
index af1880d..0000000
--- a/tests/test_mongodict_adapter.py
+++ /dev/null
@@ -1,120 +0,0 @@
-# coding: utf-8
-#
-# Copyright 2012 NAMD-EMAP-FGV
-#
-# This file is part of PyPLN. You can get more information at: http://pypln.org/.
-#
-# PyPLN is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# PyPLN is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
-
-import pickle
-import unittest
-
-from bson import Binary
-import pymongo
-
-from pypln.backend.mongodict_adapter import MongoDictAdapter
-
-
-
-class TestMongoDictAdapter(unittest.TestCase):
- db_name = 'test_mongodictbyid'
-
- def setUp(self):
- self.fake_id = '1234'
- self.document = MongoDictAdapter(self.fake_id, database=self.db_name)
- self.db = pymongo.Connection()[self.db_name]
-
- def tearDown(self):
- self.db.main.remove({})
-
- @classmethod
- def tearDownClass(cls):
- pymongo.MongoClient().drop_database(cls.db_name)
-
- def test_creating_a_new_key_should_saved_the_information(self):
- self.document['new_key'] = 'value'
- stored_value = self.db.main.find_one(
- {'_id': 'id:{}:new_key'.format(self.fake_id)})
- self.assertIsNotNone(stored_value)
- # This decodes the value with the defaults for MongoDict
- decoded_value = pickle.loads(str(stored_value['v']))
- self.assertEqual(decoded_value, 'value')
-
- def test_reading_an_existing_key_should_read_saved_information(self):
- encoded_value = Binary(pickle.dumps(
- 'value', protocol=pickle.HIGHEST_PROTOCOL))
-
- self.db.main.insert(
- {'_id': 'id:{}:key'.format(self.fake_id), 'v': encoded_value})
-
- self.assertEqual(self.document['key'], 'value')
-
- def test_deleting_an_existing_key_should_delete_saved_information(self):
- encoded_value = Binary(pickle.dumps(
- 'value', protocol=pickle.HIGHEST_PROTOCOL))
-
- self.db.main.insert(
- {'_id': 'id:{}:key'.format(self.fake_id), 'v': encoded_value})
-
- self.assertEqual(self.document['key'], 'value')
- del self.document['key']
-
- stored_value = self.db.main.find_one(
- {'_id': 'id:{}:key'.format(self.fake_id)})
- self.assertIsNone(stored_value)
-
- def test_iterating_through_keys_does_not_bring_keys_from_other_docs(self):
- self.document['key_1'] = 1
- self.document['key_2'] = 2
- other_document = MongoDictAdapter('other_id', database=self.db_name)
- other_document['other_key'] = 3
- keys = [k for k in self.document]
-
- self.assertIn('key_1', keys)
- self.assertIn('key_2', keys)
- self.assertNotIn('key_3', keys)
-
- self.assertEquals(['key_1', 'key_2'], self.document.keys())
-
- def test_clear_should_not_remove_keys_for_other_docs(self):
- self.document['key_1'] = 1
- self.document['key_2'] = 2
- other_document = MongoDictAdapter('other_id', database=self.db_name)
- other_document['other_key'] = 3
-
- self.document.clear()
-
- with self.assertRaises(KeyError):
- self.document['key_1']
- self.document['key_2']
-
- self.assertEqual(other_document['other_key'], 3)
-
- def test_return_correct_length(self):
- self.document['key_1'] = 1
- self.document['key_2'] = 2
- other_document = MongoDictAdapter('other_id', database=self.db_name)
- other_document['other_key'] = 3
-
- self.assertEquals(len(self.document), 2)
-
- def test_contains(self):
- self.document['key'] = 1
- self.assertIn('key', self.document)
- self.assertNotIn('inexistent_key', self.document)
-
- def test_has_key(self):
- self.document['key'] = 1
- self.assertTrue(self.document.has_key('key'))
- self.assertFalse(self.document.has_key('inexistent_key'))
diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py
index 91334b9..de605e2 100644
--- a/tests/test_worker_bigrams.py
+++ b/tests/test_worker_bigrams.py
@@ -28,16 +28,31 @@
class TestBigramWorker(TaskTest):
def test_bigrams_should_return_correct_score(self):
# We need this list comprehension because we need to save the word list
- # in MongoDict (thus, it needs to be pickleable). Also, a list is what
- # will be available to the worker in real situations.
+ # in mongo (thus, it needs to be json serializable). Also, a list is
+ # what will be available to the worker in real situations.
tokens = [w for w in
nltk.corpus.genesis.words('english-web.txt')]
- self.document['tokens'] = tokens
- bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
- expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u',', u'which')
+ doc_id = self.collection.insert({'tokens': tokens}, w=1)
- Bigrams().delay(self.fake_id)
- bigram_rank = self.document['bigram_rank']
+ Bigrams().delay(doc_id)
+ refreshed_document = self.collection.find_one({'_id': doc_id})
+ bigram_rank = refreshed_document['bigram_rank']
result = bigram_rank[0][1][0]
- self.assertEqual(result, expected)
+ # This is the value of the chi_sq measure for this bigram in this
+ # collocation
+ expected_chi_sq = 95.59393417173634
+ self.assertEqual(result, expected_chi_sq)
+
+ def test_bigrams_could_contain_dollar_signs_and_dots(self):
+ tokens = ['$', '.']
+ doc_id = self.collection.insert({'tokens': tokens}, w=1)
+
+ Bigrams().delay(doc_id)
+ refreshed_document = self.collection.find_one({'_id': doc_id})
+ bigram_rank = refreshed_document['bigram_rank']
+ result = bigram_rank[0][1][0]
+ # 2.0 is the value of the chi_sq measure for this bigram in this
+ # collocation
+ expected_chi_sq = 2.0
+ self.assertEqual(result, expected_chi_sq)
diff --git a/tests/test_worker_extractor.py b/tests/test_worker_extractor.py
index 489bc02..d7819a5 100644
--- a/tests/test_worker_extractor.py
+++ b/tests/test_worker_extractor.py
@@ -17,6 +17,7 @@
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.
+import base64
import os
from textwrap import dedent
from pypln.backend.workers import Extractor
@@ -28,54 +29,64 @@ class TestExtractorWorker(TaskTest):
def test_extraction_from_text_file(self):
expected = "This is a test file.\nI'm testing PyPLN extractor worker!"
filename = os.path.join(DATA_DIR, 'test.txt')
- self.document.update({'filename': filename,
- 'contents': open(filename).read()})
- Extractor().delay(self.fake_id)
- self.assertEqual(self.document['text'], expected)
- self.assertEqual(self.document['file_metadata'], {})
- self.assertEqual(self.document['mimetype'], 'text/plain')
+ doc_id = self.collection.insert({'filename': filename,
+ 'contents': base64.b64encode(open(filename).read())}, w=1)
+ Extractor().delay(doc_id)
+ refreshed_document = self.collection.find_one({'_id': doc_id})
+ self.assertEqual(refreshed_document['text'], expected)
+ self.assertEqual(refreshed_document['file_metadata'], {})
+ self.assertEqual(refreshed_document['mimetype'], 'text/plain')
def test_extraction_from_html_file(self):
expected = "This is a test file. I'm testing PyPLN extractor worker!"
filename = os.path.join(DATA_DIR, 'test.html')
- data = {'filename': filename, 'contents': open(filename).read()}
- self.document.update(data)
- Extractor().delay(self.fake_id)
- self.assertEqual(self.document['text'], expected)
- self.assertEqual(self.document['file_metadata'], {})
- self.assertEqual(self.document['mimetype'], 'text/html')
+ # When saving directly to mongodb we always get everything back from
+ # the database as unicode. Because of that, the extractor had problems
+ # when there was a non-ascii character in the content. This wasn't a
+ # problem before, because mongodict used to keep a pickled
+ # representation of the data.
+ data = {'filename': filename,
+ 'contents': base64.b64encode(open(filename).read())}
+ doc_id = self.collection.insert(data, w=1)
+ Extractor().delay(doc_id)
+ refreshed_document = self.collection.find_one({'_id': doc_id})
+ self.assertEqual(refreshed_document['text'], expected)
+ self.assertEqual(refreshed_document['file_metadata'], {})
+ self.assertEqual(refreshed_document['mimetype'], 'text/html')
def test_extraction_from_pdf_file(self):
expected = "This is a test file.\nI'm testing PyPLN extractor worker!"
filename = os.path.join(DATA_DIR, 'test.pdf')
- data = {'filename': filename, 'contents': open(filename).read()}
- self.document.update(data)
- Extractor().delay(self.fake_id)
- self.assertEqual(self.document['text'], expected)
+ data = {'filename': filename,
+ 'contents': base64.b64encode(open(filename).read())}
+ doc_id = self.collection.insert(data, w=1)
+ Extractor().delay(doc_id)
+ refreshed_document = self.collection.find_one({'_id': doc_id})
+ self.assertEqual(refreshed_document['text'], expected)
# Check that the expected metadata is a subset of what
# our Extractor found (it may have found more details
# depending on the toolset used to extract metadata)
metadata_expected = {
- 'Author': 'Álvaro Justen',
- 'Creator': 'Writer',
- 'Producer': 'LibreOffice 3.5',
- 'CreationDate': 'Fri Jun 1 17:07:57 2012',
- 'Tagged': 'no',
- 'Pages': '1',
- 'Encrypted': 'no',
- 'Page size': '612 x 792 pts (letter)',
- 'Optimized': 'no',
- 'PDF version': '1.4',
+ u'Author': u'Álvaro Justen',
+ u'Creator': u'Writer',
+ u'Producer': u'LibreOffice 3.5',
+ u'CreationDate': u'Fri Jun 1 17:07:57 2012',
+ u'Tagged': u'no',
+ u'Pages': u'1',
+ u'Encrypted': u'no',
+ u'Page size': u'612 x 792 pts (letter)',
+ u'Optimized': u'no',
+ u'PDF version': u'1.4',
}
metadata_expected_set = set(metadata_expected.iteritems())
- metadata = self.document['file_metadata']
+ metadata = refreshed_document['file_metadata']
metadata_set = set(metadata.iteritems())
diff_set = metadata_expected_set - metadata_set
self.assertTrue(metadata_expected_set.issubset(metadata_set),
("Extracted metadata is not a subset of the expected metadata. "
"Items missing or with different values: {}").format(
u", ".join(unicode(item) for item in diff_set)))
- self.assertEqual(self.document['mimetype'], 'application/pdf')
+ self.assertEqual(refreshed_document['mimetype'], 'application/pdf')
def test_extraction_from_html(self):
contents = dedent('''
@@ -101,9 +112,10 @@ def test_extraction_from_html(self):