diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py index 7c501d4..463caec 100644 --- a/pypln/backend/workers/__init__.py +++ b/pypln/backend/workers/__init__.py @@ -24,7 +24,8 @@ from statistics import Statistics from bigrams import Bigrams from stanford_ner import StanfordNER +from palavras_raw import PalavrasRaw __all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics', - 'Bigrams', 'StanfordNER'] + 'Bigrams', 'StanfordNER', 'PalavrasRaw'] diff --git a/pypln/backend/workers/palavras_raw.py b/pypln/backend/workers/palavras_raw.py new file mode 100644 index 0000000..fbbdcac --- /dev/null +++ b/pypln/backend/workers/palavras_raw.py @@ -0,0 +1,48 @@ +# coding: utf-8 +# +# Copyright 2012 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see <http://www.gnu.org/licenses/>. + +import os +import subprocess +import sys + +from pypelinin import Worker + + +PALAVRAS_ENCODING = sys.getfilesystemencoding() +BASE_PARSER = '/opt/palavras/por.pl' +PARSER_MODE = '--dep' + +def palavras_installed(): + return os.path.exists(BASE_PARSER) + +class PalavrasRaw(Worker): + requires = ['text', 'language'] + + def process(self, document): + if document['language'] != 'pt' or not palavras_installed(): + return {} + + text = document['text'] + process = subprocess.Popen([BASE_PARSER, PARSER_MODE], + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + stdout, stderr = process.communicate(text.encode(PALAVRAS_ENCODING)) + + return {'palavras_raw': stdout} diff --git a/pypln/backend/workers/pos/__init__.py b/pypln/backend/workers/pos/__init__.py index 2f10f48..a353da1 100644 --- a/pypln/backend/workers/pos/__init__.py +++ b/pypln/backend/workers/pos/__init__.py @@ -21,13 +21,14 @@ import en_nltk import pt_palavras +from pypln.backend.workers.palavras_raw import palavras_installed MAPPING = { 'en': en_nltk.pos, 'pt': pt_palavras.pos, } -if not pt_palavras.palavras_installed(): +if not palavras_installed(): del(MAPPING['pt']) def put_offset(text, tagged_text): @@ -40,7 +41,7 @@ def put_offset(text, tagged_text): return result class POS(Worker): - requires = ['text', 'tokens', 'language'] + requires = ['text', 'tokens', 'language', 'palavras_raw'] def process(self, document): tagged_text_with_offset = None diff --git a/pypln/backend/workers/pos/pt_palavras.py b/pypln/backend/workers/pos/pt_palavras.py index 47bb56a..5279b00 100644 --- a/pypln/backend/workers/pos/pt_palavras.py +++ b/pypln/backend/workers/pos/pt_palavras.py @@ -21,10 +21,7 @@ import subprocess import sys - PALAVRAS_ENCODING = sys.getfilesystemencoding() -BASE_PARSER = '/opt/palavras/por.pl' -PARSER_MODE = '--syn' WORD_CLASSES = { 'N': 'Nouns', 'PROP': 'Proper nouns', @@ -45,19 +42,14 @@ 'NW': 'Non Word', } -def call_palavras(text): - process = subprocess.Popen([BASE_PARSER, PARSER_MODE], - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE) - stdout, stderr = process.communicate(text.encode(PALAVRAS_ENCODING)) - return stdout def pos(document): - text = document['text'] - palavras_output = call_palavras(text) + if 'palavras_raw' not in document: + return '', [] + + palavras_output = document['palavras_raw'] tagged_text = [] - for line in palavras_output.split('\n')[1:]: + for line in palavras_output.split('\n'): line = line.strip().decode(PALAVRAS_ENCODING) if line.isspace() or line == '': continue @@ -82,6 +74,3 @@ def pos(document): pos_tag = tags[0] tagged_text.append((word, pos_tag)) return 'pt-palavras', tagged_text - -def palavras_installed(): - return os.path.exists(BASE_PARSER) diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py new file mode 100644 index 0000000..63c38c4 --- /dev/null +++ b/tests/test_worker_palavras_raw.py @@ -0,0 +1,64 @@ +# coding: utf-8 +# +# Copyright 2012 NAMD-EMAP-FGV +# +# This file is part of PyPLN. You can get more information at: http://pypln.org/. +# +# PyPLN is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# PyPLN is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with PyPLN. If not, see <http://www.gnu.org/licenses/>. + +import unittest + +from textwrap import dedent + +from pypln.backend.workers import palavras_raw + + +ORIGINAL_PATH = palavras_raw.BASE_PARSER + +class TestPalavrasRawWorker(unittest.TestCase): + + def test_should_run_only_if_language_is_portuguese(self): + if palavras_raw.palavras_installed(): + document = {'text': 'There was a rock on the way.', 'language': 'en'} + result = palavras_raw.PalavrasRaw().process(document) + self.assertEqual(result, {}) + + def test_palavras_not_installed(self): + palavras_raw.BASE_PARSER = '/not-found' + document = {'text': 'Tinha uma pedra no meio do caminho.', 'language': 'pt'} + result = palavras_raw.PalavrasRaw().process(document) + self.assertEqual(result, {}) + + + def test_palavras_should_return_raw_if_it_is_installed(self): + palavras_raw.BASE_PARSER = ORIGINAL_PATH + document = {'text': 'Eu sei que neste momento falo para todo Brasil.', + 'language': 'pt'} + expected_raw = dedent(''' + Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2 + sei [saber] <fmc> <mv> V PR 1S IND VFIN @FS-STA #2->0 + que [que] <clb> <clb-fs> KS @SUB #3->7 + em [em] <sam-> PRP @PIV> #4->7 + este [este] <-sam> <dem> DET M S @>N #5->6 + momento [momento] <dur> <f-q> N M S @P< #6->4 + falo [falar] <vH> <mv> V PR 1S IND VFIN @FS-<ACC #7->2 + para [para] PRP @<ADVL #8->7 + todo [todo] <quant> DET M S @>N #9->10 + Brasil [Brasil] <civ> <newlex> <*> PROP M S @P< [Brasil] <*> PROP M S @P< #10->8 + $. #11->0 + </s> + ''').strip() + '\n\n' + result = palavras_raw.PalavrasRaw().process(document) + expected_result = {'palavras_raw': expected_raw} + self.assertEqual(expected_result, result) diff --git a/tests/test_worker_pos_pt_palavras.py b/tests/test_worker_pos_pt_palavras.py index 7578f66..b9b4422 100644 --- a/tests/test_worker_pos_pt_palavras.py +++ b/tests/test_worker_pos_pt_palavras.py @@ -25,25 +25,30 @@ class TestPosWorker(unittest.TestCase): + def test_should_return_None_if_palavras_raw_does_not_exist(self): + result = pt_palavras.pos({'text': 'Isso é um teste.'}) + expected = '', [] + self.assertEqual(result, expected) + def test_(self): palavras_output = dedent(''' - $START - Eu\t[eu] <*> PERS M/F 1S NOM @SUBJ> - sei\t[saber] V PR 1S IND VFIN @FMV - que\t[que] KS @#FS-<ACC @SUB - em\t[em] <sam-> PRP @PIV> - este\t[este] <-sam> <dem> DET M S @>N - momento\t[momento] <dur> <f-q> N M S @P< - falo\t[falar] <vH> V PR 1S IND VFIN @FMV - para\t[para] PRP @<ADVL - todo\t[todo] <quant> DET M S @>N - Brasil\t[Brasil] <*> <newlex> PROP M S @P< \t[Brasil] <*> PROP M S @P< - $. - ''').strip() + Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2 + sei [saber] <fmc> <mv> V PR 1S IND VFIN @FS-STA #2->0 + que [que] <clb> <clb-fs> KS @SUB #3->7 + em [em] <sam-> PRP @PIV> #4->7 + este [este] <-sam> <dem> DET M S @>N #5->6 + momento [momento] <dur> <f-q> N M S @P< #6->4 + falo [falar] <vH> <mv> V PR 1S IND VFIN @FS-<ACC #7->2 + para [para] PRP @<ADVL #8->7 + todo [todo] <quant> DET M S @>N #9->10 + Brasil [Brasil] <civ> <newlex> <*> PROP M S @P< [Brasil] <*> PROP M S @P< #10->8 + $. #11->0 + </s> + ''').strip() + '\n\n' expected = ('pt-palavras', [('Eu', 'PERS'), ('sei', 'V'), ('que', 'KS'), ('em', 'PRP'), ('este', 'DET'), ('momento', 'N'), ('falo', 'V'), ('para', 'PRP'), ('todo', 'DET'), ('Brasil', 'PROP'), ('.', '.')]) - pt_palavras.call_palavras = lambda x: palavras_output - result = pt_palavras.pos({'text': 'anything'}) + result = pt_palavras.pos({'text': 'anything', + 'palavras_raw': palavras_output}) self.assertEqual(expected, result)