From 201d6f8eb8a3adb6aa7f23252e1ff47287becf0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= <fccoelho@gmail.com>
Date: Tue, 18 Dec 2012 19:38:47 -0200
Subject: [PATCH 01/13] Added bigram worker: returns a pickled bigram finder
 object

---
 pypln/backend/workers/bigrams.py | 46 ++++++++++++++++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 pypln/backend/workers/bigrams.py

diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py
new file mode 100644
index 0000000..6926824
--- /dev/null
+++ b/pypln/backend/workers/bigrams.py
@@ -0,0 +1,46 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+
+from pypelinin import Worker
+
+import nltk
+from nltk.collocations import BigramCollocationFinder
+import cPickle
+
+
+def _put_offset(text, tagged_text):
+    result = []
+    position = 0
+    for token, classification in tagged_text:
+        token_position = text.find(token, position)
+        result.append((token, classification, token_position))
+        position = token_position + len(token) - 1
+    return result
+
+class Bigrams(Worker):
+    """
+    Returns pickled bigram finder
+    """
+    requires = ['tokens']
+    bigram_measures = nltk.collocations.BigramAssocMeasures()
+
+    def process(self, tokens):
+        bigram_finder = BigramCollocationFinder.from_words(tokens)
+
+        return {'bigram_finder': cPickle.dumps(bigram_finder)}

From dee05e4f771ecc9f7a4e0e01d762bf6a0fcd22f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= <fccoelho@gmail.com>
Date: Tue, 18 Dec 2012 19:42:20 -0200
Subject: [PATCH 02/13] Added trigram worker as well

---
 pypln/backend/workers/bigrams.py  |  8 -------
 pypln/backend/workers/trigrams.py | 37 +++++++++++++++++++++++++++++++
 2 files changed, 37 insertions(+), 8 deletions(-)
 create mode 100644 pypln/backend/workers/trigrams.py

diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py
index 6926824..962c165 100644
--- a/pypln/backend/workers/bigrams.py
+++ b/pypln/backend/workers/bigrams.py
@@ -24,14 +24,6 @@
 import cPickle
 
 
-def _put_offset(text, tagged_text):
-    result = []
-    position = 0
-    for token, classification in tagged_text:
-        token_position = text.find(token, position)
-        result.append((token, classification, token_position))
-        position = token_position + len(token) - 1
-    return result
 
 class Bigrams(Worker):
     """
diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py
new file mode 100644
index 0000000..bffa2c0
--- /dev/null
+++ b/pypln/backend/workers/trigrams.py
@@ -0,0 +1,37 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+
+from pypelinin import Worker
+
+import nltk
+from nltk.collocations import TrigramCollocationFinder
+import cPickle
+
+
+
+class Trigrams(Worker):
+    """
+    Returns pickled bigram finder
+    """
+    requires = ['tokens']
+
+    def process(self, tokens):
+        trigram_finder = TrigramCollocationFinder.from_words(tokens)
+
+        return {'trigram_finder': cPickle.dumps(trigram_finder)}

From f1055704d8efc3cb295f5e3dbf5de075eee995fc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= <fccoelho@gmail.com>
Date: Tue, 18 Dec 2012 20:02:40 -0200
Subject: [PATCH 03/13] added tests for bigram and trigram workers

---
 tests/test_worker_bigrams.py  | 44 +++++++++++++++++++++++++++++++++++
 tests/test_worker_trigrams.py | 44 +++++++++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+)
 create mode 100644 tests/test_worker_bigrams.py
 create mode 100644 tests/test_worker_trigrams.py

diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py
new file mode 100644
index 0000000..a7b1ba1
--- /dev/null
+++ b/tests/test_worker_bigrams.py
@@ -0,0 +1,44 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+from pypln.backend.workers.bigrams import Bigrams
+import nltk
+import cPickle
+
+bigram_measures = nltk.collocations.BigramAssocMeasures()
+
+
+class TestBigramWorker(unittest.TestCase):
+    def test_bigrams_should_return_10_best_bigrams_in_this_order(self):
+        tokens = nltk.corpus.genesis.words('english-web.txt')
+        finder = cPickle.loads(Bigrams().process({'tokens':tokens})['bigram_finder'])
+        expected = [(u'Allon', u'Bacuth'),
+                    (u'Ashteroth', u'Karnaim'),
+                    (u'Ben', u'Ammi'),
+                    (u'En', u'Mishpat'),
+                    (u'Jegar', u'Sahadutha'),
+                    (u'Salt', u'Sea'),
+                    (u'Whoever', u'sheds'),
+                    (u'appoint', u'overseers'),
+                    (u'aromatic', u'resin'),
+                    (u'cutting', u'instrument')]
+        result = finder.nbest(bigram_measures.pmi,10)
+        self.assertEqual(result, expected)
+
diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py
new file mode 100644
index 0000000..42f6748
--- /dev/null
+++ b/tests/test_worker_trigrams.py
@@ -0,0 +1,44 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+from pypln.backend.workers.trigrams import Trigrams
+import nltk
+import cPickle
+
+trigram_measures = nltk.collocations.TrigramAssocMeasures()
+
+
+class TestTrigramWorker(unittest.TestCase):
+    def test_Trigrams_should_return_10_best_trigrams_in_this_order(self):
+        tokens = nltk.corpus.genesis.words('english-web.txt')
+        finder = cPickle.loads(Trigrams().process({'tokens':tokens})['trigram_finder'])
+        expected = [(u'olive', u'leaf', u'plucked'),
+                    (u'rider', u'falls', u'backward'),
+                    (u'sewed', u'fig', u'leaves'),
+                    (u'yield', u'royal', u'dainties'),
+                    (u'during', u'mating', u'season'),
+                    (u'Salt', u'Sea', u').'),
+                    (u'Sea', u').', u'Twelve'),
+                    (u'Their', u'hearts', u'failed'),
+                    (u'Valley', u').', u'Melchizedek'),
+                    (u'doing', u'forced', u'labor')]
+        result = finder.nbest(trigram_measures.pmi,10)
+        self.assertEqual(result, expected)
+

From caf235e62323b0ef15e754b9ae085eb691aa2410 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= <fccoelho@gmail.com>
Date: Tue, 18 Dec 2012 21:08:39 -0200
Subject: [PATCH 04/13] fixed bug, tests passing

---
 pypln/backend/workers/bigrams.py  | 4 ++--
 pypln/backend/workers/trigrams.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py
index 962c165..a176716 100644
--- a/pypln/backend/workers/bigrams.py
+++ b/pypln/backend/workers/bigrams.py
@@ -32,7 +32,7 @@ class Bigrams(Worker):
     requires = ['tokens']
     bigram_measures = nltk.collocations.BigramAssocMeasures()
 
-    def process(self, tokens):
-        bigram_finder = BigramCollocationFinder.from_words(tokens)
+    def process(self, document):
+        bigram_finder = BigramCollocationFinder.from_words(document['tokens'])
 
         return {'bigram_finder': cPickle.dumps(bigram_finder)}
diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py
index bffa2c0..263c630 100644
--- a/pypln/backend/workers/trigrams.py
+++ b/pypln/backend/workers/trigrams.py
@@ -31,7 +31,7 @@ class Trigrams(Worker):
     """
     requires = ['tokens']
 
-    def process(self, tokens):
-        trigram_finder = TrigramCollocationFinder.from_words(tokens)
+    def process(self, document):
+        trigram_finder = TrigramCollocationFinder.from_words(document['tokens'])
 
         return {'trigram_finder': cPickle.dumps(trigram_finder)}

From e986ea920a71a9c21ba055c1baf9819d5f66be45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=C3=81lvaro=20Justen=20aka=20Turicas?=
 <alvaro@justen.eng.br>
Date: Wed, 19 Dec 2012 20:10:36 -0200
Subject: [PATCH 05/13] Fix PEP8 + legibility in {bi,tri}gram workers

---
 pypln/backend/workers/bigrams.py  | 12 ++++--------
 pypln/backend/workers/trigrams.py | 11 ++++-------
 tests/test_worker_bigrams.py      | 14 ++++++++------
 tests/test_worker_trigrams.py     | 14 ++++++++------
 4 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py
index a176716..a4afd70 100644
--- a/pypln/backend/workers/bigrams.py
+++ b/pypln/backend/workers/bigrams.py
@@ -17,22 +17,18 @@
 # You should have received a copy of the GNU General Public License
 # along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
 
-from pypelinin import Worker
+import cPickle
 
 import nltk
-from nltk.collocations import BigramCollocationFinder
-import cPickle
 
+from nltk.collocations import BigramCollocationFinder
+from pypelinin import Worker
 
 
 class Bigrams(Worker):
-    """
-    Returns pickled bigram finder
-    """
+    """Create a NLTK bigram finder and return the pickled object"""
     requires = ['tokens']
-    bigram_measures = nltk.collocations.BigramAssocMeasures()
 
     def process(self, document):
         bigram_finder = BigramCollocationFinder.from_words(document['tokens'])
-
         return {'bigram_finder': cPickle.dumps(bigram_finder)}
diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py
index 263c630..3fb65d4 100644
--- a/pypln/backend/workers/trigrams.py
+++ b/pypln/backend/workers/trigrams.py
@@ -17,21 +17,18 @@
 # You should have received a copy of the GNU General Public License
 # along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
 
-from pypelinin import Worker
+import cPickle
 
 import nltk
-from nltk.collocations import TrigramCollocationFinder
-import cPickle
 
+from pypelinin import Worker
+from nltk.collocations import TrigramCollocationFinder
 
 
 class Trigrams(Worker):
-    """
-    Returns pickled bigram finder
-    """
+    """Create a NLTK trigram finder and return the pickled object"""
     requires = ['tokens']
 
     def process(self, document):
         trigram_finder = TrigramCollocationFinder.from_words(document['tokens'])
-
         return {'trigram_finder': cPickle.dumps(trigram_finder)}
diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py
index a7b1ba1..edfe5b5 100644
--- a/tests/test_worker_bigrams.py
+++ b/tests/test_worker_bigrams.py
@@ -17,18 +17,21 @@
 # You should have received a copy of the GNU General Public License
 # along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
 
+import cPickle
 import unittest
-from pypln.backend.workers.bigrams import Bigrams
+
 import nltk
-import cPickle
 
-bigram_measures = nltk.collocations.BigramAssocMeasures()
+from pypln.backend.workers.bigrams import Bigrams
 
 
 class TestBigramWorker(unittest.TestCase):
     def test_bigrams_should_return_10_best_bigrams_in_this_order(self):
+        bigram_measures = nltk.collocations.BigramAssocMeasures()
         tokens = nltk.corpus.genesis.words('english-web.txt')
-        finder = cPickle.loads(Bigrams().process({'tokens':tokens})['bigram_finder'])
+        worker_result = Bigrams().process({'tokens': tokens})
+        finder = cPickle.loads(worker_result['bigram_finder'])
+
         expected = [(u'Allon', u'Bacuth'),
                     (u'Ashteroth', u'Karnaim'),
                     (u'Ben', u'Ammi'),
@@ -39,6 +42,5 @@ def test_bigrams_should_return_10_best_bigrams_in_this_order(self):
                     (u'appoint', u'overseers'),
                     (u'aromatic', u'resin'),
                     (u'cutting', u'instrument')]
-        result = finder.nbest(bigram_measures.pmi,10)
+        result = finder.nbest(bigram_measures.pmi, 10)
         self.assertEqual(result, expected)
-
diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py
index 42f6748..39375b8 100644
--- a/tests/test_worker_trigrams.py
+++ b/tests/test_worker_trigrams.py
@@ -17,18 +17,21 @@
 # You should have received a copy of the GNU General Public License
 # along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
 
+import cPickle
 import unittest
-from pypln.backend.workers.trigrams import Trigrams
+
 import nltk
-import cPickle
 
-trigram_measures = nltk.collocations.TrigramAssocMeasures()
+from pypln.backend.workers.trigrams import Trigrams
 
 
 class TestTrigramWorker(unittest.TestCase):
     def test_Trigrams_should_return_10_best_trigrams_in_this_order(self):
+        trigram_measures = nltk.collocations.TrigramAssocMeasures()
         tokens = nltk.corpus.genesis.words('english-web.txt')
-        finder = cPickle.loads(Trigrams().process({'tokens':tokens})['trigram_finder'])
+        worker_result = Trigrams().process({'tokens': tokens})
+        finder = cPickle.loads(worker_result['trigram_finder'])
+
         expected = [(u'olive', u'leaf', u'plucked'),
                     (u'rider', u'falls', u'backward'),
                     (u'sewed', u'fig', u'leaves'),
@@ -39,6 +42,5 @@ def test_Trigrams_should_return_10_best_trigrams_in_this_order(self):
                     (u'Their', u'hearts', u'failed'),
                     (u'Valley', u').', u'Melchizedek'),
                     (u'doing', u'forced', u'labor')]
-        result = finder.nbest(trigram_measures.pmi,10)
+        result = finder.nbest(trigram_measures.pmi, 10)
         self.assertEqual(result, expected)
-

From 38a2b55a7a0d23ac71940c98ae7ad30317e17af1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= <fccoelho@gmail.com>
Date: Sat, 22 Dec 2012 13:03:08 -0200
Subject: [PATCH 06/13] changed bigram and trigram workers to return tables in
 JSON format

---
 pypln/backend/workers/bigrams.py  | 22 +++++++++++++++++++---
 pypln/backend/workers/trigrams.py | 17 +++++++++++++++--
 tests/test_worker_bigrams.py      | 19 +++++--------------
 tests/test_worker_trigrams.py     | 18 +++++-------------
 4 files changed, 44 insertions(+), 32 deletions(-)

diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py
index a176716..6d6a74d 100644
--- a/pypln/backend/workers/bigrams.py
+++ b/pypln/backend/workers/bigrams.py
@@ -21,7 +21,7 @@
 
 import nltk
 from nltk.collocations import BigramCollocationFinder
-import cPickle
+from collections import defaultdict
 
 
 
@@ -30,9 +30,25 @@ class Bigrams(Worker):
     Returns pickled bigram finder
     """
     requires = ['tokens']
-    bigram_measures = nltk.collocations.BigramAssocMeasures()
+
 
     def process(self, document):
+        #todo: support filtering by stopwords
+        bigram_measures = nltk.collocations.BigramAssocMeasures()
+        metrics = ['chi_sq',
+               'dice',
+               'jaccard',
+               'likelihood_ratio',
+               'mi_like',
+               'phi_sq',
+               'pmi',
+               'poisson_stirling',
+               'raw_freq',
+               'student_t']
         bigram_finder = BigramCollocationFinder.from_words(document['tokens'])
+        br = defaultdict(lambda :[])
+        for m in metrics:
+            for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)):
+                br[res[0]].append(res[1])
 
-        return {'bigram_finder': cPickle.dumps(bigram_finder)}
+        return {'metrics':metrics,'bigram_rank': br}
diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py
index 263c630..f7f9f10 100644
--- a/pypln/backend/workers/trigrams.py
+++ b/pypln/backend/workers/trigrams.py
@@ -21,7 +21,7 @@
 
 import nltk
 from nltk.collocations import TrigramCollocationFinder
-import cPickle
+from collections import defaultdict
 
 
 
@@ -32,6 +32,19 @@ class Trigrams(Worker):
     requires = ['tokens']
 
     def process(self, document):
+        trigram_measures = nltk.collocations.TrigramAssocMeasures()
+        metrics = ['chi_sq',
+                   'jaccard',
+                   'likelihood_ratio',
+                   'mi_like',
+                   'pmi',
+                   'poisson_stirling',
+                   'raw_freq',
+                   'student_t']
         trigram_finder = TrigramCollocationFinder.from_words(document['tokens'])
+        tr = defaultdict(lambda: [])
+        for m in metrics:
+            for res in trigram_finder.score_ngrams(getattr(trigram_measures,m)):
+                tr[res[0]].append(res[1])
 
-        return {'trigram_finder': cPickle.dumps(trigram_finder)}
+        return {'trigram_rank': tr, 'metrics':metrics}
diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py
index a7b1ba1..cf2fad9 100644
--- a/tests/test_worker_bigrams.py
+++ b/tests/test_worker_bigrams.py
@@ -20,25 +20,16 @@
 import unittest
 from pypln.backend.workers.bigrams import Bigrams
 import nltk
-import cPickle
 
 bigram_measures = nltk.collocations.BigramAssocMeasures()
 
 
 class TestBigramWorker(unittest.TestCase):
-    def test_bigrams_should_return_10_best_bigrams_in_this_order(self):
+    def test_bigrams_should_return_correct_score(self):
         tokens = nltk.corpus.genesis.words('english-web.txt')
-        finder = cPickle.loads(Bigrams().process({'tokens':tokens})['bigram_finder'])
-        expected = [(u'Allon', u'Bacuth'),
-                    (u'Ashteroth', u'Karnaim'),
-                    (u'Ben', u'Ammi'),
-                    (u'En', u'Mishpat'),
-                    (u'Jegar', u'Sahadutha'),
-                    (u'Salt', u'Sea'),
-                    (u'Whoever', u'sheds'),
-                    (u'appoint', u'overseers'),
-                    (u'aromatic', u'resin'),
-                    (u'cutting', u'instrument')]
-        result = finder.nbest(bigram_measures.pmi,10)
+        bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
+        expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u'Allon',u'Bacuth')
+        bigram_rank = Bigrams().process({'tokens':tokens})['bigram_rank']
+        result = bigram_rank[(u'Allon', u'Bacuth')][0]
         self.assertEqual(result, expected)
 
diff --git a/tests/test_worker_trigrams.py b/tests/test_worker_trigrams.py
index 42f6748..ac30e6f 100644
--- a/tests/test_worker_trigrams.py
+++ b/tests/test_worker_trigrams.py
@@ -26,19 +26,11 @@
 
 
 class TestTrigramWorker(unittest.TestCase):
-    def test_Trigrams_should_return_10_best_trigrams_in_this_order(self):
+    def test_Trigrams_should_return_correct_score_(self):
         tokens = nltk.corpus.genesis.words('english-web.txt')
-        finder = cPickle.loads(Trigrams().process({'tokens':tokens})['trigram_finder'])
-        expected = [(u'olive', u'leaf', u'plucked'),
-                    (u'rider', u'falls', u'backward'),
-                    (u'sewed', u'fig', u'leaves'),
-                    (u'yield', u'royal', u'dainties'),
-                    (u'during', u'mating', u'season'),
-                    (u'Salt', u'Sea', u').'),
-                    (u'Sea', u').', u'Twelve'),
-                    (u'Their', u'hearts', u'failed'),
-                    (u'Valley', u').', u'Melchizedek'),
-                    (u'doing', u'forced', u'labor')]
-        result = finder.nbest(trigram_measures.pmi,10)
+        trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens)
+        expected = trigram_finder.score_ngram(trigram_measures.chi_sq, u'olive', u'leaf',u'plucked')
+        trigram_rank = Trigrams().process({'tokens':tokens})['trigram_rank']
+        result = trigram_rank[(u'olive', u'leaf',u'plucked')][0]
         self.assertEqual(result, expected)
 

From 92ed9220cae4ff4a4f379cb7a089478d8a2aa587 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fl=C3=A1vio=20C=2E=20Coelho?= <fccoelho@gmail.com>
Date: Sat, 22 Dec 2012 13:14:29 -0200
Subject: [PATCH 07/13] fix docstrings

---
 pypln/backend/workers/bigrams.py  | 2 +-
 pypln/backend/workers/trigrams.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py
index 8655dca..bd6d105 100644
--- a/pypln/backend/workers/bigrams.py
+++ b/pypln/backend/workers/bigrams.py
@@ -27,7 +27,7 @@
 
 
 class Bigrams(Worker):
-    """Create a NLTK bigram finder and return the pickled object"""
+    """Create a NLTK bigram finder and return a table in JSON format"""
     requires = ['tokens']
 
 
diff --git a/pypln/backend/workers/trigrams.py b/pypln/backend/workers/trigrams.py
index 3ed0f6e..453ca71 100644
--- a/pypln/backend/workers/trigrams.py
+++ b/pypln/backend/workers/trigrams.py
@@ -26,7 +26,7 @@
 
 
 class Trigrams(Worker):
-    """Create a NLTK trigram finder and return the pickled object"""
+    """Create a NLTK trigram finder and returns a table in JSON format"""
     requires = ['tokens']
 
     def process(self, document):

From 8149731b4b297aea97f2a834b8205a4f96922360 Mon Sep 17 00:00:00 2001
From: Flavio Amieiro <amieiro.flavio@gmail.com>
Date: Tue, 15 Jan 2013 15:53:44 -0200
Subject: [PATCH 08/13] Adds Bigrams to available workers

---
 pypln/backend/workers/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py
index 301f831..f4f406a 100644
--- a/pypln/backend/workers/__init__.py
+++ b/pypln/backend/workers/__init__.py
@@ -22,6 +22,7 @@
 from freqdist import FreqDist
 from pos import POS
 from statistics import Statistics
+from bigrams import Bigrams
 
 
-__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics']
+__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics', 'Bigrams']

From 86d018b7f50c964dd61e77faea7ccc722235d8a0 Mon Sep 17 00:00:00 2001
From: Flavio Amieiro <amieiro.flavio@gmail.com>
Date: Wed, 16 Jan 2013 12:35:48 -0200
Subject: [PATCH 09/13] Adds regression tests for pickling of bigram worker
 output

---
 tests/test_worker_bigrams.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py
index cf2fad9..7ef71f7 100644
--- a/tests/test_worker_bigrams.py
+++ b/tests/test_worker_bigrams.py
@@ -17,6 +17,7 @@
 # You should have received a copy of the GNU General Public License
 # along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
 
+import cPickle
 import unittest
 from pypln.backend.workers.bigrams import Bigrams
 import nltk
@@ -33,3 +34,10 @@ def test_bigrams_should_return_correct_score(self):
         result = bigram_rank[(u'Allon', u'Bacuth')][0]
         self.assertEqual(result, expected)
 
+    def test_worker_output_should_be_pickleable(self):
+        """The workers run under multiprocessing, so their result is
+        pickled. This is a regression test."""
+        tokens = nltk.corpus.genesis.words('english-web.txt')
+        result = Bigrams().process({'tokens':tokens})
+        # This should not raise an exception.
+        cPickle.dumps(result)

From f583dd8de91b84a8a3217f8585beb3fe626a2d73 Mon Sep 17 00:00:00 2001
From: Flavio Amieiro <amieiro.flavio@gmail.com>
Date: Wed, 16 Jan 2013 12:49:55 -0200
Subject: [PATCH 10/13] Cast defaultdict to dict to avoid pickling the
 default factory lambda

---
 pypln/backend/workers/bigrams.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py
index bd6d105..299d21a 100644
--- a/pypln/backend/workers/bigrams.py
+++ b/pypln/backend/workers/bigrams.py
@@ -49,4 +49,4 @@ def process(self, document):
         for m in metrics:
             for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)):
                 br[res[0]].append(res[1])
-        return {'metrics':metrics,'bigram_rank': br}
+        return {'metrics': metrics, 'bigram_rank': dict(br)}

From b1f33d01a76cc1d26dac242037c3c8b096d2cd6b Mon Sep 17 00:00:00 2001
From: Flavio Amieiro <amieiro.flavio@gmail.com>
Date: Wed, 16 Jan 2013 13:48:29 -0200
Subject: [PATCH 11/13] Fixes typo in test_mongo_store

---
 tests/test_mongo_store.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_mongo_store.py b/tests/test_mongo_store.py
index 2d2a346..7ebc919 100644
--- a/tests/test_mongo_store.py
+++ b/tests/test_mongo_store.py
@@ -41,7 +41,7 @@ def setUp(self):
         self.monitoring = self.db[db_conf['monitoring_collection']]
         self.gridfs = gridfs.GridFS(self.db, db_conf['gridfs_collection'])
         self.db[db_conf['gridfs_collection'] + '.files'].drop()
-        self.db[db_conf['gridfs_collection'] + '.chuncks'].drop()
+        self.db[db_conf['gridfs_collection'] + '.chunks'].drop()
         self.mongodict = MongoDict(host=db_conf['host'], port=db_conf['port'],
                                    database=db_conf['database'],
                                    collection=db_conf['analysis_collection'])

From f85afbb633f8625d38e4b2be262000af5888128c Mon Sep 17 00:00:00 2001
From: Flavio Amieiro <amieiro.flavio@gmail.com>
Date: Wed, 16 Jan 2013 13:52:05 -0200
Subject: [PATCH 12/13] Adds regression test for the error when saving worker
 output

---
 tests/test_worker_bigrams.py | 34 +++++++++++++++++++++++++++++++++-
 1 file changed, 33 insertions(+), 1 deletion(-)

diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py
index 7ef71f7..dc3423e 100644
--- a/tests/test_worker_bigrams.py
+++ b/tests/test_worker_bigrams.py
@@ -18,14 +18,36 @@
 # along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
 
 import cPickle
+import gridfs
+import nltk
 import unittest
+
+from mongodict import MongoDict
+from pymongo import Connection
+
+from pypln.backend.mongo_store import MongoDBStore
 from pypln.backend.workers.bigrams import Bigrams
-import nltk
+from .utils import default_config
 
 bigram_measures = nltk.collocations.BigramAssocMeasures()
 
 
 class TestBigramWorker(unittest.TestCase):
+    def _prepare_store(self):
+        self.db_conf = db_conf = default_config['store']
+        self.connection = Connection(host=db_conf['host'],
+                                     port=db_conf['port'])
+        self.connection.drop_database(db_conf['database'])
+        self.db = self.connection[db_conf['database']]
+        self.monitoring = self.db[db_conf['monitoring_collection']]
+        self.gridfs = gridfs.GridFS(self.db, db_conf['gridfs_collection'])
+        self.db[db_conf['gridfs_collection'] + '.files'].drop()
+        self.db[db_conf['gridfs_collection'] + '.chunks'].drop()
+        self.mongodict = MongoDict(host=db_conf['host'], port=db_conf['port'],
+                                   database=db_conf['database'],
+                                   collection=db_conf['analysis_collection'])
+        self.store = MongoDBStore(**db_conf)
+
     def test_bigrams_should_return_correct_score(self):
         tokens = nltk.corpus.genesis.words('english-web.txt')
         bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
@@ -41,3 +63,13 @@ def test_worker_output_should_be_pickleable(self):
         result = Bigrams().process({'tokens':tokens})
         # This should not raise an exception.
         cPickle.dumps(result)
+
+    def test_saving_worker_output_should_work(self):
+        """Saving the worker output should work. This is a regression test."""
+        self._prepare_store()
+        tokens = nltk.corpus.genesis.words('english-web.txt')[:100]
+        result = Bigrams().process({'tokens': tokens})
+        info = {'data': {'id': 789, '_id': 'eggs'}, 'worker': 'Bigrams',
+                'worker_requires': ['tokens'], 'worker_result': result}
+        self.store.save(info)
+        self.connection.drop_database(self.db)

From ea6ca3d4db16d971ec46b7962ee03312585bc44e Mon Sep 17 00:00:00 2001
From: Flavio Amieiro <amieiro.flavio@gmail.com>
Date: Thu, 17 Jan 2013 18:40:05 -0200
Subject: [PATCH 13/13] Convert bigram worker output into a list of tuples so
 it can be saved

---
 pypln/backend/workers/bigrams.py | 2 +-
 tests/test_worker_bigrams.py     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/pypln/backend/workers/bigrams.py b/pypln/backend/workers/bigrams.py
index 299d21a..b83797f 100644
--- a/pypln/backend/workers/bigrams.py
+++ b/pypln/backend/workers/bigrams.py
@@ -49,4 +49,4 @@ def process(self, document):
         for m in metrics:
             for res in bigram_finder.score_ngrams(getattr(bigram_measures,m)):
                 br[res[0]].append(res[1])
-        return {'metrics': metrics, 'bigram_rank': dict(br)}
+        return {'metrics': metrics, 'bigram_rank': br.items()}
diff --git a/tests/test_worker_bigrams.py b/tests/test_worker_bigrams.py
index dc3423e..8eacbf5 100644
--- a/tests/test_worker_bigrams.py
+++ b/tests/test_worker_bigrams.py
@@ -51,9 +51,9 @@ def _prepare_store(self):
     def test_bigrams_should_return_correct_score(self):
         tokens = nltk.corpus.genesis.words('english-web.txt')
         bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
-        expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u'Allon',u'Bacuth')
+        expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u',', u'which')
         bigram_rank = Bigrams().process({'tokens':tokens})['bigram_rank']
-        result = bigram_rank[(u'Allon', u'Bacuth')][0]
+        result = bigram_rank[0][1][0]
         self.assertEqual(result, expected)
 
     def test_worker_output_should_be_pickleable(self):