NAMD · turicas · Oct 21, 2013 · Oct 17, 2013 · Oct 17, 2013 · Oct 21, 2013
diff --git a/pypln/backend/workers/palavras_raw.py b/pypln/backend/workers/palavras_raw.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+import subprocess
+import sys
+
+from pypelinin import Worker
+
+
+# Codec used to encode the text piped to the parser's stdin.
+# NOTE(review): this is the filesystem encoding; confirm it can always
+# represent Portuguese text (e.g. when running under an ASCII locale).
+PALAVRAS_ENCODING = sys.getfilesystemencoding()
+# Parser executable and the mode flag it is invoked with.
+BASE_PARSER = '/opt/palavras/por.pl'
+PARSER_MODE = '--dep'
+
+
+def palavras_installed():
+    """Return True if the BASE_PARSER path exists."""
+    return os.path.exists(BASE_PARSER)
+
+
+class PalavrasRaw(Worker):
+    """Worker that stores the parser's raw output under 'palavras_raw'."""
+
+    requires = ['text', 'language']
+
+    def process(self, document):
+        """Run the parser on ``document['text']`` and return its stdout.
+
+        Returns ``{'palavras_raw': stdout}`` for documents whose language
+        is 'pt' when the parser is installed, and ``{}`` otherwise.
+        """
+        is_portuguese = document['language'] == 'pt'
+        if not (is_portuguese and palavras_installed()):
+            return {}
+
+        source_text = document['text']
+        parser = subprocess.Popen([BASE_PARSER, PARSER_MODE],
+                                  stdin=subprocess.PIPE,
+                                  stdout=subprocess.PIPE,
+                                  stderr=subprocess.PIPE)
+        # stderr is captured but its content is discarded.
+        raw_output, _ = parser.communicate(
+            source_text.encode(PALAVRAS_ENCODING))
+        return {'palavras_raw': raw_output}
diff --git a/pypln/backend/workers/pos/__init__.py b/pypln/backend/workers/pos/__init__.py
@@ -21,13 +21,14 @@
 
 import en_nltk
 import pt_palavras
+from pypln.backend.workers.palavras_raw import palavras_installed
 
 
 MAPPING = {
            'en': en_nltk.pos,
            'pt': pt_palavras.pos,
 }
-if not pt_palavras.palavras_installed():
+if not palavras_installed():
     del(MAPPING['pt'])
 
 def put_offset(text, tagged_text):
@@ -40,7 +41,7 @@ def put_offset(text, tagged_text):
     return result
 
 class POS(Worker):
-    requires = ['text', 'tokens', 'language']
+    requires = ['text', 'tokens', 'language', 'palavras_raw']
 
     def process(self, document):
         tagged_text_with_offset = None

diff --git a/pypln/backend/workers/pos/pt_palavras.py b/pypln/backend/workers/pos/pt_palavras.py
@@ -21,10 +21,7 @@
 import subprocess
 import sys
 
-
 PALAVRAS_ENCODING = sys.getfilesystemencoding()
-BASE_PARSER = '/opt/palavras/por.pl'
-PARSER_MODE = '--syn'
 WORD_CLASSES = {
                 'N': 'Nouns',
                 'PROP': 'Proper nouns',
@@ -45,19 +42,14 @@
                 'NW': 'Non Word',
 }
 
-def call_palavras(text):
-    process = subprocess.Popen([BASE_PARSER, PARSER_MODE],
-                               stdin=subprocess.PIPE,
-                               stdout=subprocess.PIPE,
-                               stderr=subprocess.PIPE)
-    stdout, stderr = process.communicate(text.encode(PALAVRAS_ENCODING))
-    return stdout
 
 def pos(document):
-    text = document['text']
-    palavras_output = call_palavras(text)
+    if 'palavras_raw' not in document:
+        return '', []
+
+    palavras_output = document['palavras_raw']
     tagged_text = []
-    for line in palavras_output.split('\n')[1:]:
+    for line in palavras_output.split('\n'):
         line = line.strip().decode(PALAVRAS_ENCODING)
         if line.isspace() or line == '':
             continue
@@ -82,6 +74,3 @@ def pos(document):
                 pos_tag = tags[0]
                 tagged_text.append((word, pos_tag))
     return 'pt-palavras', tagged_text
-
-def palavras_installed():
-    return os.path.exists(BASE_PARSER)
diff --git a/tests/test_worker_palavras_raw.py b/tests/test_worker_palavras_raw.py
@@ -0,0 +1,80 @@
+# coding: utf-8
+#
+# Copyright 2012 NAMD-EMAP-FGV
+#
+# This file is part of PyPLN. You can get more information at: http://pypln.org/.
+#
+# PyPLN is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# PyPLN is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with PyPLN.  If not, see <http://www.gnu.org/licenses/>.
+
+import unittest
+
+from textwrap import dedent
+
+from pypln.backend.workers import palavras_raw
+
+
+ORIGINAL_PATH = palavras_raw.BASE_PARSER
+
+
+class TestPalavrasRawWorker(unittest.TestCase):
+    """Tests for the PalavrasRaw worker."""
+
+    def tearDown(self):
+        # Tests repoint BASE_PARSER to simulate a missing parser; restore it
+        # here so that no test depends on another one running first.
+        palavras_raw.BASE_PARSER = ORIGINAL_PATH
+
+    def test_should_run_only_if_language_is_portuguese(self):
+        # Without the parser the worker returns {} for every language, so
+        # the language check is only observable when it is installed.
+        if not palavras_raw.palavras_installed():
+            self.skipTest('PALAVRAS is not installed')
+        document = {'text': 'There was a rock on the way.',
+                    'language': 'en'}
+        result = palavras_raw.PalavrasRaw().process(document)
+        self.assertEqual(result, {})
+
+    def test_palavras_not_installed(self):
+        palavras_raw.BASE_PARSER = '/not-found'
+        document = {'text': 'Tinha uma pedra no meio do caminho.',
+                    'language': 'pt'}
+        result = palavras_raw.PalavrasRaw().process(document)
+        self.assertEqual(result, {})
+
+    def test_palavras_should_return_raw_if_it_is_installed(self):
+        # The expectation below is real parser output, so this test cannot
+        # run on machines where the parser is absent.
+        if not palavras_raw.palavras_installed():
+            self.skipTest('PALAVRAS is not installed')
+        document = {'text': 'Eu sei que neste momento falo para todo Brasil.',
+                    'language': 'pt'}
+        # Tabs are written as \t escapes so that editors cannot silently
+        # turn them into spaces.
+        expected_raw = dedent('''
+        Eu \t[eu] <*> PERS M/F 1S NOM @SUBJ>  #1->2
+        sei \t[saber] <fmc> <mv> V PR 1S IND VFIN @FS-STA  #2->0
+        que \t[que] <clb> <clb-fs> KS @SUB  #3->7
+        em \t[em] <sam-> PRP @PIV>  #4->7
+        este \t[este] <-sam> <dem> DET M S @>N  #5->6
+        momento \t[momento] <dur> <f-q> N M S @P<  #6->4
+        falo \t[falar] <vH> <mv> V PR 1S IND VFIN @FS-<ACC  #7->2
+        para \t[para] PRP @<ADVL  #8->7
+        todo \t[todo] <quant> DET M S @>N  #9->10
+        Brasil \t[Brasil] <civ> <newlex> <*> PROP M S @P< \t[Brasil] <*> PROP M S @P<  #10->8
+        $. #11->0
+        </s>
+        ''').strip() + '\n\n'
+        result = palavras_raw.PalavrasRaw().process(document)
+        expected_result = {'palavras_raw': expected_raw}
+        self.assertEqual(expected_result, result)
diff --git a/tests/test_worker_pos_pt_palavras.py b/tests/test_worker_pos_pt_palavras.py
@@ -25,25 +25,33 @@
 
 
 class TestPosWorker(unittest.TestCase):
+    def test_should_return_None_if_palavras_raw_does_not_exist(self):
+        # Without 'palavras_raw' pos() returns an empty tagset name and an
+        # empty list of tagged tokens (not None, despite this test's name).
+        result = pt_palavras.pos({'text': 'Isso é um teste.'})
+        expected = '', []
+        self.assertEqual(result, expected)
+
     def test_(self):
+        # Tabs are written as \t escapes so they survive editing.
         palavras_output = dedent('''
-        $START
-        Eu\t[eu] <*> PERS M/F 1S NOM @SUBJ>
-        sei\t[saber] V PR 1S IND VFIN @FMV
-        que\t[que] KS @#FS-<ACC @SUB
-        em\t[em] <sam-> PRP @PIV>
-        este\t[este] <-sam> <dem> DET M S @>N
-        momento\t[momento] <dur> <f-q> N M S @P<
-        falo\t[falar] <vH> V PR 1S IND VFIN @FMV
-        para\t[para] PRP @<ADVL
-        todo\t[todo] <quant> DET M S @>N
-        Brasil\t[Brasil] <*> <newlex> PROP M S @P< \t[Brasil] <*> PROP M S @P<
-        $.
-        ''').strip()
+        Eu \t[eu] <*> PERS M/F 1S NOM @SUBJ>  #1->2
+        sei \t[saber] <fmc> <mv> V PR 1S IND VFIN @FS-STA  #2->0
+        que \t[que] <clb> <clb-fs> KS @SUB  #3->7
+        em \t[em] <sam-> PRP @PIV>  #4->7
+        este \t[este] <-sam> <dem> DET M S @>N  #5->6
+        momento \t[momento] <dur> <f-q> N M S @P<  #6->4
+        falo \t[falar] <vH> <mv> V PR 1S IND VFIN @FS-<ACC  #7->2
+        para \t[para] PRP @<ADVL  #8->7
+        todo \t[todo] <quant> DET M S @>N  #9->10
+        Brasil \t[Brasil] <civ> <newlex> <*> PROP M S @P< \t[Brasil] <*> PROP M S @P<  #10->8
+        $. #11->0
+        </s>
+        ''').strip() + '\n\n'
         expected = ('pt-palavras', [('Eu', 'PERS'), ('sei', 'V'), ('que', 'KS'),
                     ('em', 'PRP'), ('este', 'DET'), ('momento', 'N'),
                     ('falo', 'V'), ('para', 'PRP'), ('todo', 'DET'),
                     ('Brasil', 'PROP'), ('.', '.')])
-        pt_palavras.call_palavras = lambda x: palavras_output
-        result = pt_palavras.pos({'text': 'anything'})
+        result = pt_palavras.pos({'text': 'anything',
+                                  'palavras_raw': palavras_output})
         self.assertEqual(expected, result)