NAMD · flavioamieiro · Jan 17, 2013 · Dec 18, 2012 · Dec 18, 2012 · Dec 18, 2012
diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py
@@ -22,6 +22,7 @@
 from freqdist import FreqDist
 from pos import POS
 from statistics import Statistics
+from bigrams import Bigrams
 
 
-__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics']
+__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics', 'Bigrams']
diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py
@@ -0,0 +1,52 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+
+import cPickle
+
+import nltk
+from collections import defaultdict
+
+from nltk.collocations import BigramCollocationFinder
+from pypelinin import Worker
+
+
+class Bigrams(Worker):
+    """Score every bigram in the document's tokens with NLTK measures.
+
+    The result holds the metric names and, for each bigram, its scores in
+    that same order.
+    """
+    requires = ['tokens']
+
+    def process(self, document):
+        # TODO: support filtering by stopwords
+        measures = nltk.collocations.BigramAssocMeasures()
+        metrics = ['chi_sq', 'dice', 'jaccard', 'likelihood_ratio',
+                   'mi_like', 'phi_sq', 'pmi', 'poisson_stirling',
+                   'raw_freq', 'student_t']
+        finder = BigramCollocationFinder.from_words(document['tokens'])
+        # bigram -> list of scores, appended in the order of `metrics`.
+        scores = defaultdict(list)
+        for name in metrics:
+            scorer = getattr(measures, name)
+            for ngram, value in finder.score_ngrams(scorer):
+                scores[ngram].append(value)
+        # Only the (bigram, scores) pairs are returned, so the defaultdict
+        # itself never leaves the worker.
+        return {'metrics': metrics, 'bigram_rank': scores.items()}
diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py
@@ -0,0 +1,48 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+
+from pypelinin import Worker
+
+import nltk
+from nltk.collocations import TrigramCollocationFinder
+from collections import defaultdict
+
+
+
+class Trigrams(Worker):
+    """Score every trigram in the document's tokens with NLTK measures."""
+    requires = ['tokens']
+
+    def process(self, document):
+        """Return the metric names and a trigram -> scores mapping.
+
+        Each list of scores follows the order of the 'metrics' entry.
+        """
+        measures = nltk.collocations.TrigramAssocMeasures()
+        metrics = ['chi_sq', 'jaccard', 'likelihood_ratio', 'mi_like',
+                   'pmi', 'poisson_stirling', 'raw_freq', 'student_t']
+        finder = TrigramCollocationFinder.from_words(document['tokens'])
+        scores = defaultdict(list)
+        for name in metrics:
+            scorer = getattr(measures, name)
+            for ngram, value in finder.score_ngrams(scorer):
+                scores[ngram].append(value)
+        # Hand back a plain dict: a defaultdict built on a lambda cannot be
+        # pickled, and worker results are pickled between processes.
+        return {'trigram_rank': dict(scores), 'metrics': metrics}
diff --git a/tests/test_mongo_store.py b/tests/test_mongo_store.py
@@ -41,7 +41,7 @@ def setUp(self):
         self.monitoring = self.db[db_conf['monitoring_collection']]
         self.gridfs = gridfs.GridFS(self.db, db_conf['gridfs_collection'])
         self.db[db_conf['gridfs_collection'] + '.files'].drop()
-        self.db[db_conf['gridfs_collection'] + '.chuncks'].drop()
+        self.db[db_conf['gridfs_collection'] + '.chunks'].drop()
         self.mongodict = MongoDict(host=db_conf['host'], port=db_conf['port'],
                                    database=db_conf['database'],
                                    collection=db_conf['analysis_collection'])

diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py
@@ -0,0 +1,75 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+
+import cPickle
+import gridfs
+import nltk
+import unittest
+
+from mongodict import MongoDict
+from pymongo import Connection
+
+from pypln.backend.mongo_store import MongoDBStore
+from pypln.backend.workers.bigrams import Bigrams
+from .utils import default_config
+
+bigram_measures = nltk.collocations.BigramAssocMeasures()  # provides the chi_sq scorer used in the score test
+
+
+class TestBigramWorker(unittest.TestCase):
+    def _prepare_store(self):
+        """Connect to the database named in the default config and empty it."""
+        self.db_conf = db_conf = default_config['store']
+        self.connection = Connection(host=db_conf['host'], port=db_conf['port'])
+        self.connection.drop_database(db_conf['database'])
+        self.db = self.connection[db_conf['database']]
+        self.monitoring = self.db[db_conf['monitoring_collection']]
+        self.gridfs = gridfs.GridFS(self.db, db_conf['gridfs_collection'])
+        self.db[db_conf['gridfs_collection'] + '.files'].drop()
+        self.db[db_conf['gridfs_collection'] + '.chunks'].drop()
+        self.mongodict = MongoDict(host=db_conf['host'], port=db_conf['port'],
+                                   database=db_conf['database'],
+                                   collection=db_conf['analysis_collection'])
+        self.store = MongoDBStore(**db_conf)
+
+    def test_bigrams_should_return_correct_score(self):
+        tokens = nltk.corpus.genesis.words('english-web.txt')
+        finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
+        expected = finder.score_ngram(bigram_measures.chi_sq, u',', u'which')
+        # Look the bigram up by key; the order of the pairs is not guaranteed.
+        ranks = dict(Bigrams().process({'tokens': tokens})['bigram_rank'])
+        self.assertEqual(ranks[(u',', u'which')][0], expected)
+
+    def test_worker_output_should_be_pickleable(self):
+        """The workers run under multiprocessing, so their result is
+        pickled. This is a regression test."""
+        tokens = nltk.corpus.genesis.words('english-web.txt')
+        # This should not raise an exception.
+        cPickle.dumps(Bigrams().process({'tokens': tokens}))
+
+    def test_saving_worker_output_should_work(self):
+        """Saving the worker output should work. This is a regression test."""
+        self._prepare_store()
+        # Registered up front so the database is dropped even if save() fails.
+        self.addCleanup(self.connection.drop_database, self.db)
+        tokens = nltk.corpus.genesis.words('english-web.txt')[:100]
+        result = Bigrams().process({'tokens': tokens})
+        info = {'data': {'id': 789, '_id': 'eggs'}, 'worker': 'Bigrams',
+                'worker_requires': ['tokens'], 'worker_result': result}
+        self.store.save(info)
diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py
@@ -0,0 +1,36 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+from pypln.backend.workers.trigrams import Trigrams
+import nltk
+import cPickle
+
+trigram_measures = nltk.collocations.TrigramAssocMeasures()  # provides the chi_sq scorer used in the score test
+
+
+class TestTrigramWorker(unittest.TestCase):
+    def test_Trigrams_should_return_correct_score_(self):
+        """The first score of a known trigram must equal NLTK's chi_sq value."""
+        tokens = nltk.corpus.genesis.words('english-web.txt')
+        ngram = (u'olive', u'leaf', u'plucked')
+        finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
+        ranks = Trigrams().process({'tokens': tokens})['trigram_rank']
+        self.assertEqual(ranks[ngram][0], finder.score_ngram(trigram_measures.chi_sq, *ngram))
+