diff --git a/pypln/backend/workers/__init__.py b/pypln/backend/workers/__init__.py index eb8bb99..4ebdd00 100644 --- a/pypln/backend/workers/__init__.py +++ b/pypln/backend/workers/__init__.py @@ -23,7 +23,6 @@ from pos import POS from statistics import Statistics from bigrams import Bigrams -from stanford_ner import StanfordNER from palavras_raw import PalavrasRaw from lemmatizer_pt import Lemmatizer from palavras_noun_phrase import NounPhrase @@ -31,5 +30,5 @@ __all__ = ['Extractor', 'Tokenizer', 'FreqDist', 'POS', 'Statistics', - 'Bigrams', 'StanfordNER', 'PalavrasRaw', 'Lemmatizer', - 'NounPhrase', 'SemanticTagger'] + 'Bigrams', 'PalavrasRaw', 'Lemmatizer', 'NounPhrase', + 'SemanticTagger'] diff --git a/pypln/backend/workers/stanford_ner.py b/pypln/backend/workers/stanford_ner.py deleted file mode 100644 index 563a716..0000000 --- a/pypln/backend/workers/stanford_ner.py +++ /dev/null @@ -1,57 +0,0 @@ -# coding: utf-8 -# -# Copyright 2012 NAMD-EMAP-FGV -# -# This file is part of PyPLN. You can get more information at: http://pypln.org/. -# -# PyPLN is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# PyPLN is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with PyPLN. If not, see . - -from pypelinin import Worker -import ner - -NER_HOST="localhost" -NER_PORT=4242 - -class NERWrapper(ner.SocketNER): - - def __slashTags_parse_entities(self, tagged_text): - """Return a list of token tuples (entity_type, token) parsed - from slashTags-format tagged text. - - :param tagged_text: slashTag-format entity tagged text - """ - return (match.groups()[::-1] for match in - ner.client.SLASHTAGS_EPATTERN.finditer(tagged_text)) - - def get_entities_as_tuples(self, text): - """ - """ - if self.oformat != 'slashTags': - raise NotImplementedError("get_entities_as_tuples is not " - "implemented for output formats other than slashTags") - tagged_text = self.tag_text(text) - entities = self.__slashTags_parse_entities(tagged_text) - return entities - -class StanfordNER(Worker): - requires = ['text'] - - def process(self, document): - text = document['text'] - tagger = NERWrapper(host=NER_HOST, - port=NER_PORT, output_format="slashTags") - - entities = list(tagger.get_entities_as_tuples(text.encode('utf-8'))) - - return {'named_entities': entities} diff --git a/scripts/download_stanford_ner.sh b/scripts/download_stanford_ner.sh deleted file mode 100755 index 5d72087..0000000 --- a/scripts/download_stanford_ner.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash -# -# Copyright 2012 NAMD-EMAP-FGV -# -# This file is part of PyPLN. You can get more information at: http://pypln.org/. -# -# PyPLN is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# PyPLN is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with PyPLN. If not, see . - - -SCRIPT_PATH=$(dirname $(readlink -f $0))/ - -NER_DIRNAME="stanford-ner-2013-06-20" - -NER_DIR="$SCRIPT_PATH/$NER_DIRNAME" -NER_SHA1SUM="1589ac1b477a7894ca98d783d27c5b5b73f51d3d stanford-ner-2013-06-20.zip" - -DOWNLOAD_URL="http://nlp.stanford.edu/software/$NER_DIRNAME.zip" -ANSWER="Y" -read -p "download Stanford NER? [Y/n] " ANSWER -if [ "$ANSWER" = "y" -o "$ANSWER" = "Y" ] -then - cd "$SCRIPT_PATH" - wget -c "$DOWNLOAD_URL" - if [ "$(sha1sum $NER_DIRNAME.zip)" != "$NER_SHA1SUM" ] - then - echo "Something is wrong. NER zip file is different from expected." - exit 1 - fi - unzip -x $NER_DIRNAME.zip -else - exit 1 -fi diff --git a/scripts/start_development_environment.sh b/scripts/start_development_environment.sh index 83cd81a..239d6d4 100755 --- a/scripts/start_development_environment.sh +++ b/scripts/start_development_environment.sh @@ -28,20 +28,6 @@ echo "| This script is intended for development only. |" echo "| Please do not use it to run a production environment. |" echo "+-------------------------------------------------------+" -echo "Starting Stanford NER..." -NER_DIRNAME="stanford-ner-2013-06-20" -NER_DIR="$SCRIPT_PATH/scripts/$NER_DIRNAME" -if [ -d "$NER_DIR" ] -then - "$SCRIPT_PATH/scripts/start_stanford_ner_in_dev_environment.sh" & - NER_PID=$! - echo "Stanford NER has PID $NER_PID" -else - echo "Can't find Stanford NER." - echo "Run $SCRIPT_PATH/scripts/download_stanford_ner.sh to download it." - exit 0 -fi - echo "Starting router..." "$SCRIPT_PATH/pypln/backend/router.py" & ROUTER_PID=$! diff --git a/scripts/start_stanford_ner_in_dev_environment.sh b/scripts/start_stanford_ner_in_dev_environment.sh deleted file mode 100755 index 03a388f..0000000 --- a/scripts/start_stanford_ner_in_dev_environment.sh +++ /dev/null @@ -1,31 +0,0 @@ -#!/bin/bash -# -# Copyright 2012 NAMD-EMAP-FGV -# -# This file is part of PyPLN. You can get more information at: http://pypln.org/. -# -# PyPLN is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# PyPLN is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with PyPLN. If not, see . - - -SCRIPT_PATH=$(dirname $(readlink -f $0))/ - -NER_DIRNAME="stanford-ner-2013-06-20" - -NER_DIR="$SCRIPT_PATH/$NER_DIRNAME" - -NER_PORT=4242 -NER_CLASSIFIER="classifiers/english.muc.7class.distsim.crf.ser.gz" - -cd "$NER_DIR" -exec java -mx500m -cp stanford-ner.jar edu.stanford.nlp.ie.NERServer -port $NER_PORT -loadClassifier "$NER_CLASSIFIER" diff --git a/tests/test_worker_stanford_ner.py b/tests/test_worker_stanford_ner.py deleted file mode 100644 index f096f64..0000000 --- a/tests/test_worker_stanford_ner.py +++ /dev/null @@ -1,63 +0,0 @@ -# coding: utf-8 -# -# Copyright 2012 NAMD-EMAP-FGV -# -# This file is part of PyPLN. You can get more information at: http://pypln.org/. -# -# PyPLN is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# PyPLN is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with PyPLN. If not, see . - -import unittest -from pypln.backend.workers import StanfordNER - - -class TestStanfordNERWorker(unittest.TestCase): - - def test_ner_should_return_marked_entities(self): - text = 'The sky is blue, the sun is yellow.' - - # Sample text from https://en.wikipedia.org/wiki/Dijkstra%27s_algorithm - text = (u"Dijkstra's algorithm, conceived by Dutch computer scientist " - u"Edsger Dijkstra in 1956 and published in 1959,[1][2] is a graph " - u"search algorithm that solves the single-source shortest path " - u"problem for a graph with non-negative edge path costs, producing " - u"a shortest path tree. This algorithm is often used in routing as " - u"a subroutine in other graph algorithms, or in GPS Technology. " - u"I'll add a unicode character here just for completion: Flávio." - ) - self.maxDiff = None - expected = [('O', 'Dijkstra'), ('O', "'s"), ('O', 'algorithm'), - ('O', ','), ('O', 'conceived'), ('O', 'by'), ('O', 'Dutch'), - ('O', 'computer'), ('O', 'scientist'), ('PERSON', 'Edsger'), - ('PERSON', 'Dijkstra'), ('O', 'in'), ('DATE', '1956'), ('O', 'and'), - ('O', 'published'), ('O', 'in'), ('DATE', '1959'), ('O', ','), - ('O', '-LSB-'), ('O', '1'), ('O', '-RSB-'), ('O', '-LSB-'), - ('O', '2'), ('O', '-RSB-'), ('O', 'is'), ('O', 'a'), ('O', 'graph'), - ('O', 'search'), ('O', 'algorithm'), ('O', 'that'), ('O', 'solves'), - ('O', 'the'), ('O', 'single-source'), ('O', 'shortest'), - ('O', 'path'), ('O', 'problem'), ('O', 'for'), ('O', 'a'), - ('O', 'graph'), ('O', 'with'), ('O', 'non-negative'), ('O', 'edge'), - ('O', 'path'), ('O', 'costs'), ('O', ','), ('O', 'producing'), - ('O', 'a'), ('O', 'shortest'), ('O', 'path'), ('O', 'tree'), - ('O', '.'), ('O', 'This'), ('O', 'algorithm'), ('O', 'is'), - ('O', 'often'), ('O', 'used'), ('O', 'in'), ('O', 'routing'), - ('O', 'as'), ('O', 'a'), ('O', 'subroutine'), ('O', 'in'), - ('O', 'other'), ('O', 'graph'), ('O', 'algorithms'), ('O', ','), - ('O', 'or'), ('O', 'in'), ('O', 'GPS'), ('O', 'Technology'), - ('O', '.'), ('O', 'I'), ('O', "'ll"), ('O', 'add'), ('O', 'a'), - ('O', 'unicode'), ('O', 'character'), ('O', 'here'), ('O', 'just'), - ('O', 'for'), ('O', 'completion'), ('O', ':'), - ('O', 'Fl\xc3\xa1vio'), ('O', '.')] - - result = StanfordNER().process({'text': text}) - self.assertEqual(result, {'named_entities': expected})