Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/bigrams #127

Merged
merged 15 commits into from
Jan 17, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pypln/backend/workers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from freqdist import FreqDist
from pos import POS
from statistics import Statistics
from bigrams import Bigrams


__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics']
__all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics', 'Bigrams']
52 changes: 52 additions & 0 deletions pypln/backend/workers/bigrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# coding: utf-8
#
# Copyright 2012 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

import cPickle

import nltk
from collections import defaultdict

from nltk.collocations import BigramCollocationFinder
from pypelinin import Worker


class Bigrams(Worker):
    """Score the bigrams of a tokenized document with NLTK measures.

    ``process`` builds a ``BigramCollocationFinder`` from
    ``document['tokens']`` and applies each association measure named in
    ``metrics`` to it. The result is a dict with two entries:

    * ``'metrics'``: the measure names, in the order they were applied;
    * ``'bigram_rank'``: the ``items()`` of a mapping from each bigram to
      the scores collected for it, appended in that same order.
    """
    requires = ['tokens']

    def process(self, document):
        # TODO: support filtering by stopwords
        measures = nltk.collocations.BigramAssocMeasures()
        metrics = [
            'chi_sq',
            'dice',
            'jaccard',
            'likelihood_ratio',
            'mi_like',
            'phi_sq',
            'pmi',
            'poisson_stirling',
            'raw_freq',
            'student_t',
        ]
        finder = BigramCollocationFinder.from_words(document['tokens'])
        scores_by_bigram = defaultdict(list)
        for metric_name in metrics:
            # Resolve the measure once per metric, not once per bigram.
            score_fn = getattr(measures, metric_name)
            for bigram, score in finder.score_ngrams(score_fn):
                scores_by_bigram[bigram].append(score)
        return {'metrics': metrics, 'bigram_rank': scores_by_bigram.items()}
48 changes: 48 additions & 0 deletions pypln/backend/workers/trigrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# coding: utf-8
#
# Copyright 2012 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

from pypelinin import Worker

import nltk
from nltk.collocations import TrigramCollocationFinder
from collections import defaultdict



class Trigrams(Worker):
    """Score the trigrams of a tokenized document with NLTK measures.

    ``process`` builds a ``TrigramCollocationFinder`` from
    ``document['tokens']`` and applies each association measure named in
    ``metrics`` to it. The result is a dict with two entries:

    * ``'trigram_rank'``: a mapping from each trigram (a tuple of three
      tokens) to the scores collected for it, appended in the order of
      ``'metrics'``;
    * ``'metrics'``: the measure names, in the order they were applied.
    """
    requires = ['tokens']

    def process(self, document):
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        metrics = [
            'chi_sq',
            'jaccard',
            'likelihood_ratio',
            'mi_like',
            'pmi',
            'poisson_stirling',
            'raw_freq',
            'student_t',
        ]
        trigram_finder = TrigramCollocationFinder.from_words(document['tokens'])
        # The mapping itself is returned to the caller, so its default
        # factory must be picklable: ``list`` is, a lambda is not. Using a
        # lambda here made the whole worker result impossible to pickle.
        tr = defaultdict(list)
        for m in metrics:
            measure = getattr(trigram_measures, m)
            for trigram, score in trigram_finder.score_ngrams(measure):
                tr[trigram].append(score)

        return {'trigram_rank': tr, 'metrics': metrics}
2 changes: 1 addition & 1 deletion tests/test_mongo_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def setUp(self):
self.monitoring = self.db[db_conf['monitoring_collection']]
self.gridfs = gridfs.GridFS(self.db, db_conf['gridfs_collection'])
self.db[db_conf['gridfs_collection'] + '.files'].drop()
self.db[db_conf['gridfs_collection'] + '.chuncks'].drop()
self.db[db_conf['gridfs_collection'] + '.chunks'].drop()
self.mongodict = MongoDict(host=db_conf['host'], port=db_conf['port'],
database=db_conf['database'],
collection=db_conf['analysis_collection'])
Expand Down
75 changes: 75 additions & 0 deletions tests/test_worker_bigrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# coding: utf-8
#
# Copyright 2012 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

import cPickle
import gridfs
import nltk
import unittest

from mongodict import MongoDict
from pymongo import Connection

from pypln.backend.mongo_store import MongoDBStore
from pypln.backend.workers.bigrams import Bigrams
from .utils import default_config

bigram_measures = nltk.collocations.BigramAssocMeasures()


class TestBigramWorker(unittest.TestCase):
    """Tests for the Bigrams worker: scores, pickling and storage."""

    def _prepare_store(self):
        """Reset the configured test database and open a MongoDBStore on it."""
        self.db_conf = db_conf = default_config['store']
        self.connection = Connection(host=db_conf['host'],
                                     port=db_conf['port'])
        # Start from an empty database so earlier runs cannot interfere.
        self.connection.drop_database(db_conf['database'])
        self.db = self.connection[db_conf['database']]
        self.monitoring = self.db[db_conf['monitoring_collection']]
        self.gridfs = gridfs.GridFS(self.db, db_conf['gridfs_collection'])
        self.db[db_conf['gridfs_collection'] + '.files'].drop()
        self.db[db_conf['gridfs_collection'] + '.chunks'].drop()
        self.mongodict = MongoDict(host=db_conf['host'], port=db_conf['port'],
                                   database=db_conf['database'],
                                   collection=db_conf['analysis_collection'])
        self.store = MongoDBStore(**db_conf)

    def test_bigrams_should_return_correct_score(self):
        tokens = nltk.corpus.genesis.words('english-web.txt')
        bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(tokens)
        expected = bigram_finder.score_ngram(bigram_measures.chi_sq, u',', u'which')
        bigram_rank = Bigrams().process({'tokens': tokens})['bigram_rank']
        # Look the bigram up by key instead of taking bigram_rank[0]: the
        # pairs come from a dict, whose iteration order is not something
        # the test should depend on.
        scores = dict(bigram_rank)[(u',', u'which')]
        # chi_sq is the first metric the worker applies, hence index 0.
        self.assertEqual(scores[0], expected)

    def test_worker_output_should_be_pickleable(self):
        """The workers run under multiprocessing, so their result is
        pickled. This is a regression test."""
        tokens = nltk.corpus.genesis.words('english-web.txt')
        result = Bigrams().process({'tokens': tokens})
        # This should not raise an exception.
        cPickle.dumps(result)

    def test_saving_worker_output_should_work(self):
        """Saving the worker output should work. This is a regression test."""
        self._prepare_store()
        try:
            tokens = nltk.corpus.genesis.words('english-web.txt')[:100]
            result = Bigrams().process({'tokens': tokens})
            info = {'data': {'id': 789, '_id': 'eggs'}, 'worker': 'Bigrams',
                    'worker_requires': ['tokens'], 'worker_result': result}
            self.store.save(info)
        finally:
            # Drop the test database even when the save fails, so a failing
            # run does not leave data behind for later tests.
            self.connection.drop_database(self.db)
36 changes: 36 additions & 0 deletions tests/test_worker_trigrams.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# coding: utf-8
#
# Copyright 2012 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

import unittest
from pypln.backend.workers.trigrams import Trigrams
import nltk
import cPickle

trigram_measures = nltk.collocations.TrigramAssocMeasures()


class TestTrigramWorker(unittest.TestCase):
    """Checks the Trigrams worker against a score computed directly by NLTK."""

    def test_Trigrams_should_return_correct_score_(self):
        target = (u'olive', u'leaf', u'plucked')
        words = nltk.corpus.genesis.words('english-web.txt')
        finder = nltk.collocations.TrigramCollocationFinder.from_words(words)
        expected = finder.score_ngram(trigram_measures.chi_sq, *target)
        worker_output = Trigrams().process({'tokens': words})
        # chi_sq is the first metric the worker applies, hence index 0.
        result = worker_output['trigram_rank'][target][0]
        self.assertEqual(result, expected)