From 201d6f8eb8a3adb6aa7f23252e1ff47287becf0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= Date: Tue, 18 Dec 2012 19:38:47 -0200 Subject: [PATCH 01/13] Added bigram worker: returns a pickled bigram finder object --- pypln/backend/workers/bigrams.py | 46 ++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 pypln/backend/workers/bigrams.py diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py new file mode 100644 index 0000000..6926824 --- /dev/null +++ b/pypln/backend/workers/bigrams.py @@ -0,0 +1,46 @@ +# coding: utf-8 +# +# Copyright 2012 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see . + +from pypelinin import Worker + +import nltk +from nltk.collocations import BigramCollocationFinder +import cPickle + + +def _put_offset(text, tagged_text): + result = [] + position = 0 + for token, classification in tagged_text: + token_position = text.find(token, position) + result.append((token, classification, token_position)) + position = token_position + len(token) - 1 + return result + +class Bigrams(Worker): + """ + Returns pickled bigram finder + """ + requires = ['tokens'] + bigram_measures = nltk.collocations.BigramAssocMeasures() + + def process(self, tokens): + bigram_finder = BigramCollocationFinder.from_words(tokens) + + return {'bigram_finder': cPickle.dumps(bigram_finder)} From dee05e4f771ecc9f7a4e0e01d762bf6a0fcd22f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= Date: Tue, 18 Dec 2012 19:42:20 -0200 Subject: [PATCH 02/13] Added trigram worker as well --- pypln/backend/workers/bigrams.py | 8 ------- pypln/backend/workers/trigrams.py | 37 +++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) create mode 100644 pypln/backend/workers/trigrams.py diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index 6926824..962c165 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -24,14 +24,6 @@ import cPickle -def _put_offset(text, tagged_text): - result = [] - position = 0 - for token, classification in tagged_text: - token_position = text.find(token, position) - result.append((token, classification, token_position)) - position = token_position + len(token) - 1 - return result class Bigrams(Worker): """ diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py new file mode 100644 index 0000000..bffa2c0 --- /dev/null +++ b/pypln/backend/workers/trigrams.py @@ -0,0 +1,37 @@ +# coding: utf-8 +# +# Copyright 2012 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see . + +from pypelinin import Worker + +import nltk +from nltk.collocations import TrigramCollocationFinder +import cPickle + + + +class Trigrams(Worker): + """ + Returns pickled bigram finder + """ + requires = ['tokens'] + + def process(self, tokens): + trigram_finder = TrigramCollocationFinder.from_words(tokens) + + return {'trigram_finder': cPickle.dumps(trigram_finder)} From f1055704d8efc3cb295f5e3dbf5de075eee995fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= Date: Tue, 18 Dec 2012 20:02:40 -0200 Subject: [PATCH 03/13] added tests for bigram and trigram workers --- tests/test_worker_bigrams.py | 44 +++++++++++++++++++++++++++++++++++ tests/test_worker_trigrams.py | 44 +++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 tests/test_worker_bigrams.py create mode 100644 tests/test_worker_trigrams.py diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py new file mode 100644 index 0000000..a7b1ba1 --- /dev/null +++ b/tests/test_worker_bigrams.py @@ -0,0 +1,44 @@ +# coding: utf-8 +# +# Copyright 2012 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see . + +import unittest +from pypln.backend.workers.bigrams import Bigrams +import nltk +import cPickle + +bigram_measures = nltk.collocations.BigramAssocMeasures() + + +class TestBigramWorker(unittest.TestCase): + def test_bigrams_should_return_10_best_bigrams_in_this_order(self): + tokens = nltk.corpus.genesis.words('english-web.txt') + finder = cPickle.loads(Bigrams().process({'tokens':tokens})['bigram_finder']) + expected = [(u'Allon', u'Bacuth'), + (u'Ashteroth', u'Karnaim'), + (u'Ben', u'Ammi'), + (u'En', u'Mishpat'), + (u'Jegar', u'Sahadutha'), + (u'Salt', u'Sea'), + (u'Whoever', u'sheds'), + (u'appoint', u'overseers'), + (u'aromatic', u'resin'), + (u'cutting', u'instrument')] + result = finder.nbest(bigram_measures.pmi,10) + self.assertEqual(result, expected) + diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py new file mode 100644 index 0000000..42f6748 --- /dev/null +++ b/tests/test_worker_trigrams.py @@ -0,0 +1,44 @@ +# coding: utf-8 +# +# Copyright 2012 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see . + +import unittest +from pypln.backend.workers.trigrams import Trigrams +import nltk +import cPickle + +trigram_measures = nltk.collocations.TrigramAssocMeasures() + + +class TestTrigramWorker(unittest.TestCase): + def test_Trigrams_should_return_10_best_trigrams_in_this_order(self): + tokens = nltk.corpus.genesis.words('english-web.txt') + finder = cPickle.loads(Trigrams().process({'tokens':tokens})['trigram_finder']) + expected = [(u'olive', u'leaf', u'plucked'), + (u'rider', u'falls', u'backward'), + (u'sewed', u'fig', u'leaves'), + (u'yield', u'royal', u'dainties'), + (u'during', u'mating', u'season'), + (u'Salt', u'Sea', u').'), + (u'Sea', u').', u'Twelve'), + (u'Their', u'hearts', u'failed'), + (u'Valley', u').', u'Melchizedek'), + (u'doing', u'forced', u'labor')] + result = finder.nbest(trigram_measures.pmi,10) + self.assertEqual(result, expected) + From caf235e62323b0ef15e754b9ae085eb691aa2410 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= Date: Tue, 18 Dec 2012 21:08:39 -0200 Subject: [PATCH 04/13] fixed bug, tests passing --- pypln/backend/workers/bigrams.py | 4 ++-- pypln/backend/workers/trigrams.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index 962c165..a176716 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -32,7 +32,7 @@ class Bigrams(Worker): requires = ['tokens'] bigram_measures = nltk.collocations.BigramAssocMeasures() - def process(self, tokens): - bigram_finder = BigramCollocationFinder.from_words(tokens) + def process(self, document): + bigram_finder = BigramCollocationFinder.from_words(document['tokens']) return {'bigram_finder': cPickle.dumps(bigram_finder)} diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py index bffa2c0..263c630 100644 --- a/pypln/backend/workers/trigrams.py +++ b/pypln/backend/workers/trigrams.py @@ -31,7 +31,7 @@ class Trigrams(Worker): """ requires = ['tokens'] - def process(self, tokens): - trigram_finder = TrigramCollocationFinder.from_words(tokens) + def process(self, document): + trigram_finder = TrigramCollocationFinder.from_words(document['tokens']) return {'trigram_finder': cPickle.dumps(trigram_finder)} From e986ea920a71a9c21ba055c1baf9819d5f66be45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?= Date: Wed, 19 Dec 2012 20:10:36 -0200 Subject: [PATCH 05/13] Fix PEP8 + legibility in {bi,tri}gram workers --- pypln/backend/workers/bigrams.py | 12 ++++-------- pypln/backend/workers/trigrams.py | 11 ++++------- tests/test_worker_bigrams.py | 14 ++++++++------ tests/test_worker_trigrams.py | 14 ++++++++------ 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index a176716..a4afd70 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -17,22 +17,18 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . -from pypelinin import Worker +import cPickle import nltk -from nltk.collocations import BigramCollocationFinder -import cPickle +from nltk.collocations import BigramCollocationFinder +from pypelinin import Worker class Bigrams(Worker): - """ - Returns pickled bigram finder - """ + """Create a NLTK bigram finder and return the pickled object""" requires = ['tokens'] - bigram_measures = nltk.collocations.BigramAssocMeasures() def process(self, document): bigram_finder = BigramCollocationFinder.from_words(document['tokens']) - return {'bigram_finder': cPickle.dumps(bigram_finder)} diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py index 263c630..3fb65d4 100644 --- a/pypln/backend/workers/trigrams.py +++ b/pypln/backend/workers/trigrams.py @@ -17,21 +17,18 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . -from pypelinin import Worker +import cPickle import nltk -from nltk.collocations import TrigramCollocationFinder -import cPickle +from pypelinin import Worker +from nltk.collocations import TrigramCollocationFinder class Trigrams(Worker): - """ - Returns pickled bigram finder - """ + """Create a NLTK trigram finder and return the pickled object""" requires = ['tokens'] def process(self, document): trigram_finder = TrigramCollocationFinder.from_words(document['tokens']) - return {'trigram_finder': cPickle.dumps(trigram_finder)} diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index a7b1ba1..edfe5b5 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -17,18 +17,21 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import cPickle import unittest -from pypln.backend.workers.bigrams import Bigrams + import nltk -import cPickle -bigram_measures = nltk.collocations.BigramAssocMeasures() +from pypln.backend.workers.bigrams import Bigrams class TestBigramWorker(unittest.TestCase): def test_bigrams_should_return_10_best_bigrams_in_this_order(self): + bigram_measures = nltk.collocations.BigramAssocMeasures() tokens = nltk.corpus.genesis.words('english-web.txt') - finder = cPickle.loads(Bigrams().process({'tokens':tokens})['bigram_finder']) + worker_result = Bigrams().process({'tokens': tokens}) + finder = cPickle.loads(worker_result['bigram_finder']) + expected = [(u'Allon', u'Bacuth'), (u'Ashteroth', u'Karnaim'), (u'Ben', u'Ammi'), @@ -39,6 +42,5 @@ def test_bigrams_should_return_10_best_bigrams_in_this_order(self): (u'appoint', u'overseers'), (u'aromatic', u'resin'), (u'cutting', u'instrument')] - result = finder.nbest(bigram_measures.pmi,10) + result = finder.nbest(bigram_measures.pmi, 10) self.assertEqual(result, expected) - diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py index 42f6748..39375b8 100644 --- a/tests/test_worker_trigrams.py +++ b/tests/test_worker_trigrams.py @@ -17,18 +17,21 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import cPickle import unittest -from pypln.backend.workers.trigrams import Trigrams + import nltk -import cPickle -trigram_measures = nltk.collocations.TrigramAssocMeasures() +from pypln.backend.workers.trigrams import Trigrams class TestTrigramWorker(unittest.TestCase): def test_Trigrams_should_return_10_best_trigrams_in_this_order(self): + trigram_measures = nltk.collocations.TrigramAssocMeasures() tokens = nltk.corpus.genesis.words('english-web.txt') - finder = cPickle.loads(Trigrams().process({'tokens':tokens})['trigram_finder']) + worker_result = Trigrams().process({'tokens': tokens}) + finder = cPickle.loads(worker_result['trigram_finder']) + expected = [(u'olive', u'leaf', u'plucked'), (u'rider', u'falls', u'backward'), (u'sewed', u'fig', u'leaves'), @@ -39,6 +42,5 @@ def test_Trigrams_should_return_10_best_trigrams_in_this_order(self): (u'Their', u'hearts', u'failed'), (u'Valley', u').', u'Melchizedek'), (u'doing', u'forced', u'labor')] - result = finder.nbest(trigram_measures.pmi,10) + result = finder.nbest(trigram_measures.pmi, 10) self.assertEqual(result, expected) - From 38a2b55a7a0d23ac71940c98ae7ad30317e17af1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= Date: Sat, 22 Dec 2012 13:03:08 -0200 Subject: [PATCH 06/13] changed bigram and trigram workers to return tables in jSON format --- pypln/backend/workers/bigrams.py | 22 +++++++++++++++++++--- pypln/backend/workers/trigrams.py | 17 +++++++++++++++-- tests/test_worker_bigrams.py | 19 +++++-------------- tests/test_worker_trigrams.py | 18 +++++------------- 4 files changed, 44 insertions(+), 32 deletions(-) diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index a176716..6d6a74d 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -21,7 +21,7 @@ import nltk from nltk.collocations import BigramCollocationFinder -import cPickle +from collections import defaultdict @@ -30,9 +30,25 @@ class Bigrams(Worker): Returns pickled bigram finder """ requires = ['tokens'] - bigram_measures = nltk.collocations.BigramAssocMeasures() + def process(self, document): + #todo: support filtering by stopwords + bigram_measures = nltk.collocations.BigramAssocMeasures() + metrics = ['chi_sq', + 'dice', + 'jaccard', + 'likelihood_ratio', + 'mi_like', + 'phi_sq', + 'pmi', + 'poisson_stirling', + 'raw_freq', + 'student_t'] bigram_finder = BigramCollocationFinder.from_words(document['tokens']) + br = defaultdict(lambda :[]) + for m in metrics: + for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)): + br[res[0]].append(res[1]) - return {'bigram_finder': cPickle.dumps(bigram_finder)} + return {'metrics':metrics,'bigram_rank': br} diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py index 263c630..f7f9f10 100644 --- a/pypln/backend/workers/trigrams.py +++ b/pypln/backend/workers/trigrams.py @@ -21,7 +21,7 @@ import nltk from nltk.collocations import TrigramCollocationFinder -import cPickle +from collections import defaultdict @@ -32,6 +32,19 @@ class Trigrams(Worker): requires = ['tokens'] def process(self, document): + trigram_measures = nltk.collocations.TrigramAssocMeasures() + metrics = ['chi_sq', + 'jaccard', + 'likelihood_ratio', + 'mi_like', + 'pmi', + 'poisson_stirling', + 'raw_freq', + 'student_t'] trigram_finder = TrigramCollocationFinder.from_words(document['tokens']) + tr = defaultdict(lambda: []) + for m in metrics: + for res in trigram_finder.score_ngrams(getattr(trigram_measures,m)): + tr[res[0]].append(res[1]) - return {'trigram_finder': cPickle.dumps(trigram_finder)} + return {'trigram_rank': tr, 'metrics':metrics} diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index a7b1ba1..cf2fad9 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -20,25 +20,16 @@ import unittest from pypln.backend.workers.bigrams import Bigrams import nltk -import cPickle bigram_measures = nltk.collocations.BigramAssocMeasures() class TestBigramWorker(unittest.TestCase): - def test_bigrams_should_return_10_best_bigrams_in_this_order(self): + def test_bigrams_should_return_correct_score(self): tokens = nltk.corpus.genesis.words('english-web.txt') - finder = cPickle.loads(Bigrams().process({'tokens':tokens})['bigram_finder']) - expected = [(u'Allon', u'Bacuth'), - (u'Ashteroth', u'Karnaim'), - (u'Ben', u'Ammi'), - (u'En', u'Mishpat'), - (u'Jegar', u'Sahadutha'), - (u'Salt', u'Sea'), - (u'Whoever', u'sheds'), - (u'appoint', u'overseers'), - (u'aromatic', u'resin'), - (u'cutting', u'instrument')] - result = finder.nbest(bigram_measures.pmi,10) + bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens) + expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u'Allon',u'Bacuth') + bigram_rank = Bigrams().process({'tokens':tokens})['bigram_rank'] + result = bigram_rank[(u'Allon', u'Bacuth')][0] self.assertEqual(result, expected) diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py index 42f6748..ac30e6f 100644 --- a/tests/test_worker_trigrams.py +++ b/tests/test_worker_trigrams.py @@ -26,19 +26,11 @@ class TestTrigramWorker(unittest.TestCase): - def test_Trigrams_should_return_10_best_trigrams_in_this_order(self): + def test_Trigrams_should_return_correct_score_(self): tokens = nltk.corpus.genesis.words('english-web.txt') - finder = cPickle.loads(Trigrams().process({'tokens':tokens})['trigram_finder']) - expected = [(u'olive', u'leaf', u'plucked'), - (u'rider', u'falls', u'backward'), - (u'sewed', u'fig', u'leaves'), - (u'yield', u'royal', u'dainties'), - (u'during', u'mating', u'season'), - (u'Salt', u'Sea', u').'), - (u'Sea', u').', u'Twelve'), - (u'Their', u'hearts', u'failed'), - (u'Valley', u').', u'Melchizedek'), - (u'doing', u'forced', u'labor')] - result = finder.nbest(trigram_measures.pmi,10) + trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens) + expected = trigram_finder.score_ngram(trigram_measures.chi_sq, u'olive', u'leaf',u'plucked') + trigram_rank = Trigrams().process({'tokens':tokens})['trigram_rank'] + result = trigram_rank[(u'olive', u'leaf',u'plucked')][0] self.assertEqual(result, expected) From 92ed9220cae4ff4a4f379cb7a089478d8a2aa587 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= Date: Sat, 22 Dec 2012 13:14:29 -0200 Subject: [PATCH 07/13] fix docstrings --- pypln/backend/workers/bigrams.py | 2 +- pypln/backend/workers/trigrams.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index 8655dca..bd6d105 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -27,7 +27,7 @@ class Bigrams(Worker): - """Create a NLTK bigram finder and return the pickled object""" + """Create a NLTK bigram finder and return a table in JSON format""" requires = ['tokens'] diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py index 3ed0f6e..453ca71 100644 --- a/pypln/backend/workers/trigrams.py +++ b/pypln/backend/workers/trigrams.py @@ -26,7 +26,7 @@ class Trigrams(Worker): - """Create a NLTK trigram finder and return the pickled object""" + """Create a NLTK trigram finder and returns a table in JSON format""" requires = ['tokens'] def process(self, document): From 8149731b4b297aea97f2a834b8205a4f96922360 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Tue, 15 Jan 2013 15:53:44 -0200 Subject: [PATCH 08/13] Adds Bigrams to available workers --- pypln/backend/workers/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py index 301f831..f4f406a 100644 --- a/pypln/backend/workers/__init__.py +++ b/pypln/backend/workers/__init__.py @@ -22,6 +22,7 @@ from freqdist import FreqDist from pos import POS from statistics import Statistics +from bigrams import Bigrams -__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics'] +__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics', 'Bigrams'] From 86d018b7f50c964dd61e77faea7ccc722235d8a0 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 16 Jan 2013 12:35:48 -0200 Subject: [PATCH 09/13] Adds regression tests for pickling of bigram worker output --- tests/test_worker_bigrams.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index cf2fad9..7ef71f7 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -17,6 +17,7 @@ # You should have received a copy of the GNU General Public License # along with PyPLN. If not, see . +import cPickle import unittest from pypln.backend.workers.bigrams import Bigrams import nltk @@ -33,3 +34,10 @@ def test_bigrams_should_return_correct_score(self): result = bigram_rank[(u'Allon', u'Bacuth')][0] self.assertEqual(result, expected) + def test_worker_output_should_be_pickleable(self): + """The workers run under multiprocessing, so their result is + pickled. This is a regression test.""" + tokens = nltk.corpus.genesis.words('english-web.txt') + result = Bigrams().process({'tokens':tokens}) + # This should not raise an exception. + cPickle.dumps(result) From f583dd8de91b84a8a3217f8585beb3fe626a2d73 Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 16 Jan 2013 12:49:55 -0200 Subject: [PATCH 10/13] Cast defaultdict to dict to avoid pickling the generator function --- pypln/backend/workers/bigrams.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index bd6d105..299d21a 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -49,4 +49,4 @@ def process(self, document): for m in metrics: for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)): br[res[0]].append(res[1]) - return {'metrics':metrics,'bigram_rank': br} + return {'metrics': metrics, 'bigram_rank': dict(br)} From b1f33d01a76cc1d26dac242037c3c8b096d2cd6b Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 16 Jan 2013 13:48:29 -0200 Subject: [PATCH 11/13] Fixes typo in test_mongo_store --- tests/test_mongo_store.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_mongo_store.py b/tests/test_mongo_store.py index 2d2a346..7ebc919 100644 --- a/tests/test_mongo_store.py +++ b/tests/test_mongo_store.py @@ -41,7 +41,7 @@ def setUp(self): self.monitoring = self.db[db_conf['monitoring_collection']] self.gridfs = gridfs.GridFS(self.db, db_conf['gridfs_collection']) self.db[db_conf['gridfs_collection'] + '.files'].drop() - self.db[db_conf['gridfs_collection'] + '.chuncks'].drop() + self.db[db_conf['gridfs_collection'] + '.chunks'].drop() self.mongodict = MongoDict(host=db_conf['host'], port=db_conf['port'], database=db_conf['database'], collection=db_conf['analysis_collection']) From f85afbb633f8625d38e4b2be262000af5888128c Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Wed, 16 Jan 2013 13:52:05 -0200 Subject: [PATCH 12/13] Adds regression test to the error when saving worker output --- tests/test_worker_bigrams.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index 7ef71f7..dc3423e 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -18,14 +18,36 @@ # along with PyPLN. If not, see . import cPickle +import gridfs +import nltk import unittest + +from mongodict import MongoDict +from pymongo import Connection + +from pypln.backend.mongo_store import MongoDBStore from pypln.backend.workers.bigrams import Bigrams -import nltk +from .utils import default_config bigram_measures = nltk.collocations.BigramAssocMeasures() class TestBigramWorker(unittest.TestCase): + def _prepare_store(self): + self.db_conf = db_conf = default_config['store'] + self.connection = Connection(host=db_conf['host'], + port=db_conf['port']) + self.connection.drop_database(db_conf['database']) + self.db = self.connection[db_conf['database']] + self.monitoring = self.db[db_conf['monitoring_collection']] + self.gridfs = gridfs.GridFS(self.db, db_conf['gridfs_collection']) + self.db[db_conf['gridfs_collection'] + '.files'].drop() + self.db[db_conf['gridfs_collection'] + '.chunks'].drop() + self.mongodict = MongoDict(host=db_conf['host'], port=db_conf['port'], + database=db_conf['database'], + collection=db_conf['analysis_collection']) + self.store = MongoDBStore(**db_conf) + def test_bigrams_should_return_correct_score(self): tokens = nltk.corpus.genesis.words('english-web.txt') bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens) @@ -41,3 +63,13 @@ def test_worker_output_should_be_pickleable(self): result = Bigrams().process({'tokens':tokens}) # This should not raise an exception. cPickle.dumps(result) + + def test_saving_worker_output_should_work(self): + """Saving the worker output should work. This is a regression test.""" + self._prepare_store() + tokens = nltk.corpus.genesis.words('english-web.txt')[:100] + result = Bigrams().process({'tokens': tokens}) + info = {'data': {'id': 789, '_id': 'eggs'}, 'worker': 'Bigrams', + 'worker_requires': ['tokens'], 'worker_result': result} + self.store.save(info) + self.connection.drop_database(self.db) From ea6ca3d4db16d971ec46b7962ee03312585bc44e Mon Sep 17 00:00:00 2001 From: Flavio Amieiro Date: Thu, 17 Jan 2013 18:40:05 -0200 Subject: [PATCH 13/13] Convert bigram worker output into a list of tuples so it can be saved --- pypln/backend/workers/bigrams.py | 2 +- tests/test_worker_bigrams.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py index 299d21a..b83797f 100644 --- a/pypln/backend/workers/bigrams.py +++ b/pypln/backend/workers/bigrams.py @@ -49,4 +49,4 @@ def process(self, document): for m in metrics: for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)): br[res[0]].append(res[1]) - return {'metrics': metrics, 'bigram_rank': dict(br)} + return {'metrics': metrics, 'bigram_rank': br.items()} diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py index dc3423e..8eacbf5 100644 --- a/tests/test_worker_bigrams.py +++ b/tests/test_worker_bigrams.py @@ -51,9 +51,9 @@ def _prepare_store(self): def test_bigrams_should_return_correct_score(self): tokens = nltk.corpus.genesis.words('english-web.txt') bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens) - expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u'Allon',u'Bacuth') + expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u',', u'which') bigram_rank = Bigrams().process({'tokens':tokens})['bigram_rank'] - result = bigram_rank[(u'Allon', u'Bacuth')][0] + result = bigram_rank[0][1][0] self.assertEqual(result, expected) def test_worker_output_should_be_pickleable(self):