Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/palavras full output #157

Merged
merged 5 commits into from
Oct 21, 2013
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions pypln/backend/workers/palavras_raw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# coding: utf-8
#
# Copyright 2012 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

import os
import subprocess
import sys

from pypelinin import Worker


# Encoding applied to the text piped into the PALAVRAS parser: the file
# system encoding reported by Python on this machine.
PALAVRAS_ENCODING = sys.getfilesystemencoding()
# Path of the PALAVRAS parser script. palavras_installed() checks that it
# exists, and the PalavrasRaw worker launches it as a subprocess.
BASE_PARSER = '/opt/palavras/por.pl'
# Sole command-line argument handed to the parser (presumably selects the
# dependency-style output -- confirm against the PALAVRAS documentation).
PARSER_MODE = '--dep'

def palavras_installed():
    """Tell whether the PALAVRAS parser can be found on this machine.

    Returns True when the path stored in ``BASE_PARSER`` exists on disk and
    False otherwise.
    """
    parser_found = os.path.exists(BASE_PARSER)
    return parser_found

class PalavrasRaw(Worker):
    """Worker that records the unprocessed PALAVRAS output for a document."""

    requires = ['text', 'language']

    def process(self, document):
        """Run PALAVRAS over ``document['text']`` and return its raw output.

        Returns an empty dict when ``document['language']`` is not ``'pt'``
        or when the parser is not installed. Otherwise returns a dict whose
        ``'palavras_raw'`` key holds everything the parser wrote to stdout.
        """
        # The language is tested first, so the installation check only runs
        # for Portuguese documents.
        if document['language'] != 'pt':
            return {}
        if not palavras_installed():
            return {}

        source_text = document['text']
        command = [BASE_PARSER, PARSER_MODE]
        parser = subprocess.Popen(
            command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # stderr is captured so it does not leak to the console, then dropped.
        raw_output, _ = parser.communicate(
            source_text.encode(PALAVRAS_ENCODING))

        return {'palavras_raw': raw_output}
5 changes: 3 additions & 2 deletions pypln/backend/workers/pos/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,14 @@

import en_nltk
import pt_palavras
from pypln.backend.workers.palavras_raw import palavras_installed


# Maps a language code to the `pos` function of the module that handles
# that language.
MAPPING = {
'en': en_nltk.pos,
'pt': pt_palavras.pos,
}
if not pt_palavras.palavras_installed():
if not palavras_installed():
del(MAPPING['pt'])

def put_offset(text, tagged_text):
Expand All @@ -40,7 +41,7 @@ def put_offset(text, tagged_text):
return result

class POS(Worker):
requires = ['text', 'tokens', 'language']
requires = ['text', 'tokens', 'language', 'palavras_raw']

def process(self, document):
tagged_text_with_offset = None
Expand Down
21 changes: 5 additions & 16 deletions pypln/backend/workers/pos/pt_palavras.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,7 @@
import subprocess
import sys


# Encoding applied to the text sent to PALAVRAS and used to decode each line
# of its output: the file system encoding reported by Python.
PALAVRAS_ENCODING = sys.getfilesystemencoding()
# Path of the PALAVRAS parser script that call_palavras() launches and whose
# existence palavras_installed() checks.
BASE_PARSER = '/opt/palavras/por.pl'
# Command-line argument passed to the parser by call_palavras().
PARSER_MODE = '--syn'
WORD_CLASSES = {
'N': 'Nouns',
'PROP': 'Proper nouns',
Expand All @@ -45,19 +42,14 @@
'NW': 'Non Word',
}

def call_palavras(text):
    """Feed *text* to the PALAVRAS parser and return what it prints.

    The text is encoded with ``PALAVRAS_ENCODING`` and written to the
    parser's stdin. The return value is the parser's stdout exactly as
    ``communicate()`` hands it back; stderr is captured and dropped.
    """
    command = [BASE_PARSER, PARSER_MODE]
    parser = subprocess.Popen(
        command,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    output, _ = parser.communicate(text.encode(PALAVRAS_ENCODING))
    return output

def pos(document):
text = document['text']
palavras_output = call_palavras(text)
if 'palavras_raw' not in document:
return '', []

palavras_output = document['palavras_raw']
tagged_text = []
for line in palavras_output.split('\n')[1:]:
for line in palavras_output.split('\n'):
line = line.strip().decode(PALAVRAS_ENCODING)
if line.isspace() or line == '':
continue
Expand All @@ -82,6 +74,3 @@ def pos(document):
pos_tag = tags[0]
tagged_text.append((word, pos_tag))
return 'pt-palavras', tagged_text

def palavras_installed():
    """Tell whether the PALAVRAS parser can be found on this machine.

    Returns True when the path stored in ``BASE_PARSER`` exists on disk and
    False otherwise.
    """
    parser_found = os.path.exists(BASE_PARSER)
    return parser_found
64 changes: 64 additions & 0 deletions tests/test_worker_palavras_raw.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# coding: utf-8
#
# Copyright 2012 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

import unittest

from textwrap import dedent

from pypln.backend.workers import palavras_raw


ORIGINAL_PATH = palavras_raw.BASE_PARSER

class TestPalavrasRawWorker(unittest.TestCase):
    """Tests for the PalavrasRaw worker.

    Tests that need the real PALAVRAS parser are reported as skipped when it
    is not installed, instead of passing without asserting or failing.
    """

    def setUp(self):
        # One test points BASE_PARSER at a bogus path. Restoring the original
        # value after every test keeps the outcome independent of the order
        # in which tests run.
        self.addCleanup(setattr, palavras_raw, 'BASE_PARSER', ORIGINAL_PATH)

    def test_should_run_only_if_language_is_portuguese(self):
        # Only meaningful with the parser present: otherwise an empty result
        # could be caused by the missing parser rather than by the language.
        if not palavras_raw.palavras_installed():
            self.skipTest('PALAVRAS is not installed')
        document = {'text': 'There was a rock on the way.', 'language': 'en'}
        result = palavras_raw.PalavrasRaw().process(document)
        self.assertEqual(result, {})

    def test_palavras_not_installed(self):
        palavras_raw.BASE_PARSER = '/not-found'
        document = {'text': 'Tinha uma pedra no meio do caminho.',
                    'language': 'pt'}
        result = palavras_raw.PalavrasRaw().process(document)
        self.assertEqual(result, {})

    def test_palavras_should_return_raw_if_it_is_installed(self):
        palavras_raw.BASE_PARSER = ORIGINAL_PATH
        if not palavras_raw.palavras_installed():
            self.skipTest('PALAVRAS is not installed')
        document = {'text': 'Eu sei que neste momento falo para todo Brasil.',
                    'language': 'pt'}
        expected_raw = dedent('''
Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2
sei [saber] <fmc> <mv> V PR 1S IND VFIN @FS-STA #2->0
que [que] <clb> <clb-fs> KS @SUB #3->7
em [em] <sam-> PRP @PIV> #4->7
este [este] <-sam> <dem> DET M S @>N #5->6
momento [momento] <dur> <f-q> N M S @P< #6->4
falo [falar] <vH> <mv> V PR 1S IND VFIN @FS-<ACC #7->2
para [para] PRP @<ADVL #8->7
todo [todo] <quant> DET M S @>N #9->10
Brasil [Brasil] <civ> <newlex> <*> PROP M S @P< [Brasil] <*> PROP M S @P< #10->8
$. #11->0
</s>
''').strip() + '\n\n'
        result = palavras_raw.PalavrasRaw().process(document)
        expected_result = {'palavras_raw': expected_raw}
        self.assertEqual(expected_result, result)
35 changes: 20 additions & 15 deletions tests/test_worker_pos_pt_palavras.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,30 @@


class TestPosWorker(unittest.TestCase):
def test_should_return_None_if_palavras_raw_does_not_exist(self):
    """pos() reports no tagger and no tags when 'palavras_raw' is absent."""
    # Review remark on this test: "This test is failing!" -- it expected
    # ('', None), while pos() returns ('', []) for a document that has no
    # 'palavras_raw' key. The expectation now matches that return value.
    result = pt_palavras.pos({'text': 'Isso é um teste.'})
    expected = '', []
    self.assertEqual(result, expected)

def test_(self):
# Hands a canned PALAVRAS output to pos() and compares the returned
# (tagger name, [(word, tag), ...]) pair with the expected one.
palavras_output = dedent('''
$START
Eu\t[eu] <*> PERS M/F 1S NOM @SUBJ>
sei\t[saber] V PR 1S IND VFIN @FMV
que\t[que] KS @#FS-<ACC @SUB
em\t[em] <sam-> PRP @PIV>
este\t[este] <-sam> <dem> DET M S @>N
momento\t[momento] <dur> <f-q> N M S @P<
falo\t[falar] <vH> V PR 1S IND VFIN @FMV
para\t[para] PRP @<ADVL
todo\t[todo] <quant> DET M S @>N
Brasil\t[Brasil] <*> <newlex> PROP M S @P< \t[Brasil] <*> PROP M S @P<
$.
''').strip()
Eu [eu] <*> PERS M/F 1S NOM @SUBJ> #1->2
sei [saber] <fmc> <mv> V PR 1S IND VFIN @FS-STA #2->0
que [que] <clb> <clb-fs> KS @SUB #3->7
em [em] <sam-> PRP @PIV> #4->7
este [este] <-sam> <dem> DET M S @>N #5->6
momento [momento] <dur> <f-q> N M S @P< #6->4
falo [falar] <vH> <mv> V PR 1S IND VFIN @FS-<ACC #7->2
para [para] PRP @<ADVL #8->7
todo [todo] <quant> DET M S @>N #9->10
Brasil [Brasil] <civ> <newlex> <*> PROP M S @P< [Brasil] <*> PROP M S @P< #10->8
$. #11->0
</s>
''').strip() + '\n\n'
expected = ('pt-palavras', [('Eu', 'PERS'), ('sei', 'V'), ('que', 'KS'),
('em', 'PRP'), ('este', 'DET'), ('momento', 'N'),
('falo', 'V'), ('para', 'PRP'), ('todo', 'DET'),
('Brasil', 'PROP'), ('.', '.')])
pt_palavras.call_palavras = lambda x: palavras_output
result = pt_palavras.pos({'text': 'anything'})
result = pt_palavras.pos({'text': 'anything',
'palavras_raw': palavras_output})
self.assertEqual(expected, result)