
Add a Bitdeli Badge to README #178

Merged: 92 commits, merged Jul 7, 2015
Commits
bdd2ae5
Basic spellchecker worker ready
fccoelho Feb 22, 2013
6c7657b
Tests passing on spellchecker worker
fccoelho Feb 22, 2013
3a16e27
Added pre-instancing of checkers, as per sugestion of @turicas.
fccoelho Feb 26, 2013
a0a23fb
Added handling of unsupported language by returning None instead of l…
fccoelho Feb 27, 2013
33ff882
added a test for english
fccoelho Feb 27, 2013
c5aac9e
Removes workaround for nltk not returning unicode stopwords
flavioamieiro Nov 5, 2014
6c5b022
Merge branch 'bugfix/remove_workaround_for_nltk_issue' into develop
flavioamieiro Nov 5, 2014
d4c6873
Fixes `UnicodeDecodeError` in `PalavrasRaw` worker
flavioamieiro Nov 11, 2014
f196c62
Merge branch 'bugfix/fix_unicodedecodeerror_in_palavras_worker' into …
flavioamieiro Nov 11, 2014
43048ac
Makes sure we use the correct codec in all Palavras workers
flavioamieiro Nov 11, 2014
48360d8
Fixes even more `Unicode{En,De}codeError`s
flavioamieiro Nov 11, 2014
63e430c
Adds a property to tell if PalavrasRaw was run for this document
flavioamieiro Jan 16, 2015
ab61046
Makes sure workers that depend directly on palavras don't run if pala…
flavioamieiro Jan 16, 2015
5680345
Merge branch 'feature/fix_palavras_exceptions' into develop
flavioamieiro Jan 16, 2015
c7fe275
Merge pull request #146 from fccoelho/feature/spellchecker
fccoelho Mar 22, 2015
35908ca
Fixes tokenizer test
flavioamieiro Apr 14, 2015
cf36ac2
Fixes extractor test
flavioamieiro Apr 14, 2015
3eef9df
Adds pyenchant to requirements
flavioamieiro Apr 14, 2015
641957f
Updates palavras related tests
flavioamieiro Apr 14, 2015
d049be3
Merge branch 'fix_tests' into develop
flavioamieiro Apr 14, 2015
1d18679
WIP - First draft of worker using a celery task
flavioamieiro Apr 14, 2015
6b5706d
Removes unused code from the tokenizer (and it's test)
flavioamieiro Apr 15, 2015
acb90f1
Uses `task.delay().get()` instead of `task.apply()`
flavioamieiro Apr 15, 2015
2eeeff9
Adapts freqdist worker to use celery
flavioamieiro Apr 15, 2015
a9c3ab4
Adds first draft of a mongodict subclass to represent documents
flavioamieiro Apr 15, 2015
f7ca6dc
adds the file that defines the celery app (this was missing from prev…
flavioamieiro Apr 15, 2015
3184f65
Renames `MongoDictById` to `MongoDictAdapter`
flavioamieiro Apr 16, 2015
9e3d73a
Finishes `MongoDictAdapter`
flavioamieiro Apr 16, 2015
3244460
Improve tests for freqdist worker
flavioamieiro Apr 16, 2015
ec8ba0d
Creates a base class for all our workers
flavioamieiro Apr 16, 2015
d09f03f
Creates base test class for pypln tasks
flavioamieiro Apr 16, 2015
e8c511e
Uses fake_id consistently in freqdist test
flavioamieiro Apr 16, 2015
58112f7
Renames `tokenizer` -> `Tokenizer`
flavioamieiro Apr 16, 2015
0ef0079
Adds note about the import that is holding the app togheter
flavioamieiro Apr 16, 2015
aefb3aa
Migrates the tokenizer test to the new class based approach
flavioamieiro Apr 16, 2015
72c9e22
Migrates wordcloud worker to Celery
flavioamieiro Apr 18, 2015
8b15114
Migrates the Statistics worker to a Celery task
flavioamieiro Apr 19, 2015
6aefdaa
Migrates Bigrams worker to a Celery Task
flavioamieiro Apr 19, 2015
6b41dbc
Migrates `PalavrasRaw` worker to a Celery task
flavioamieiro Apr 20, 2015
97af711
Migrates palavras NounPhrase worker to a Celery task
flavioamieiro Apr 21, 2015
5fe3bf1
Migrates palavras SemmanticTagger worker to a Celery task
flavioamieiro Apr 21, 2015
9c370bf
Migrates POS worker to a Celery task
flavioamieiro Apr 21, 2015
d1a8b71
Adds test to check if POS worker routes portuguese documents to the p…
flavioamieiro Apr 21, 2015
8f3d0b7
Migrates Trigram worker to Celery task
flavioamieiro Apr 22, 2015
d23201c
Removes unnecessary import in `test_worker_wordcloud.py`
flavioamieiro Apr 22, 2015
e25eed3
Migrates Spellchecker worker to a Celery Task
flavioamieiro Apr 22, 2015
0dbdfb4
Migrates Lemmatizer worker to a Celery task
flavioamieiro Apr 22, 2015
51648d6
commented out pyrex from requirements/development.txt
fccoelho Apr 22, 2015
a028a73
Renames Celery app to 'pypln_workers'
flavioamieiro Apr 23, 2015
bd119d6
WIP: starts to change Extractor worker into a Celery task
flavioamieiro Apr 23, 2015
2f6fe45
Fixes and documents the issue with the app import
flavioamieiro Apr 23, 2015
f6173dd
Changes all the Extractor tests to use it as a Celery task
flavioamieiro Apr 23, 2015
f790d98
Adds Task to retrieve filedata from GridFS
flavioamieiro Apr 24, 2015
89a995b
Removes pypelinin structure
flavioamieiro Apr 24, 2015
01aed6a
Adds copyright notice in the files that didn't have it
flavioamieiro Apr 24, 2015
d73a821
Adds a config module
flavioamieiro Apr 28, 2015
807f115
Moves GridFS config to config module
flavioamieiro Apr 28, 2015
a26aad7
Substitutes Make target `run` by `run-celery`
flavioamieiro Apr 29, 2015
3c1a4ab
Uses a dictionary with mongodb configuration
flavioamieiro Apr 29, 2015
b4b15f2
Makes sure tests only run if the database name starts with `test`
flavioamieiro Apr 29, 2015
1ffd33d
Adds the possibility of having a local configuration module
flavioamieiro Apr 29, 2015
1ef7252
Fixes 'tests' and 'tests-x' make targets
flavioamieiro Apr 29, 2015
b0a7e43
Updates README to reflect the changes in the project
flavioamieiro Apr 29, 2015
e2f3ec3
Adds GridFSDataRetriever to the exported attributes of pypln.backend.…
flavioamieiro Apr 30, 2015
3d06fb3
Adds script to run celery in production
flavioamieiro May 7, 2015
25ab920
Makes run_celery.sh script executable
flavioamieiro May 7, 2015
e5fee58
Makes sure `GridFSDataRetriever` connects to the correct mongo database
flavioamieiro May 11, 2015
5ed6e42
Makes sure we use the correct hostname and port when using MongoDictA…
flavioamieiro May 12, 2015
217903d
Merge branch 'feature/celery' into develop
flavioamieiro May 18, 2015
02dd767
Gets pypln storage configuration from config file if available
flavioamieiro May 18, 2015
fc9013a
Adds a small section to `README.rst` about creating new workers
flavioamieiro May 19, 2015
44454fe
Update README.rst
fccoelho May 20, 2015
b609af4
Implements a worker to index documents in an elasticsearch server. Bu…
fccoelho May 20, 2015
6b032da
Added test for elastic_indexer
fccoelho May 20, 2015
6d973c6
Pins the pymongo version for now
flavioamieiro May 20, 2015
129c28c
Adds configuration for the result backend and the message broker
flavioamieiro May 20, 2015
8e3324f
Adds celery username and password to configuration
flavioamieiro May 20, 2015
e1475a2
Adds index_name as a parameter to the indexing call
flavioamieiro May 25, 2015
9a7602f
Ignores error when trying to delete a index that still doesn't exist …
flavioamieiro May 25, 2015
c8459c0
Fixes typo in the Indexer test name and removes trailing whitespace
flavioamieiro May 25, 2015
b91523d
Changes test index name
flavioamieiro May 25, 2015
1c693f2
Adds `ElasticIndexer` to the list of exported workers
flavioamieiro Jun 16, 2015
2aaa10f
Uses the file_id generated by gridfs instead of id generated by postgres
flavioamieiro Jun 16, 2015
e2d7748
Fixes ElasticIndexer test
flavioamieiro Jun 19, 2015
21debe8
Removes unnecessary trailing lines in `elastic_indexer.py`
flavioamieiro Jun 19, 2015
983fcb4
Merge pull request #174 from flavioamieiro/feature/elastic-indexer
fccoelho Jun 22, 2015
e4d0cb8
Adds a worker that deletes a file from GridFS
flavioamieiro Jun 22, 2015
9ae6f25
Merge pull request #175 from flavioamieiro/feature/delete-file-worker
fccoelho Jun 22, 2015
8ab3b3e
Removes unused variable declaration
flavioamieiro Jun 22, 2015
579e0bc
Fixes ElasticIndexer for binary files
flavioamieiro Jun 26, 2015
cba555a
Merge pull request #177 from flavioamieiro/bugfix/indexing_contents
fccoelho Jun 26, 2015
97ffeb1
Add a Bitdeli badge to README
bitdeli-chef Jul 7, 2015
1 change: 1 addition & 0 deletions .gitignore
@@ -13,3 +13,4 @@ MANIFEST
.directory
*.db
.env
local_config.py
10 changes: 5 additions & 5 deletions Makefile
@@ -18,15 +18,15 @@

test:
@clear
nosetests -dvs
nosetests -dvs tests/

test-workers:
@clear
nosetests -dsv tests/test_worker_*.py

test-x:
@clear
nosetests -dvsx
nosetests -dvsx tests/

doc:
@clear
@@ -37,8 +37,8 @@ clean:
find -regex '.*\.pyc' -exec rm {} \;
find -regex '.*~' -exec rm {} \;

run:
@./scripts/start_development_environment.sh
run-celery:
celery worker --app=pypln.backend.celery_app:app -l info


.PHONY: test test-x doc clean test-workers run
.PHONY: test test-x doc clean test-workers run-celery
38 changes: 31 additions & 7 deletions README.rst
@@ -2,13 +2,10 @@ PyPLN
=====

PyPLN is a distributed pipeline for natural language processing, made in Python.
We use `NLTK <http://nltk.org/>`_ and `ZeroMQ <http://www.zeromq.org/>`_ as
We use `NLTK <http://nltk.org/>`_ and `Celery <http://www.celeryproject.org>`_ as
our foundations. The goal of the project is to create an easy way to use NLTK
for processing big corpora, with a Web interface.

We don't have a production release yet, but it's scheduled on our
`next milestone <https://github.com/namd/pypln.backend/issues?milestone=1>`_.

PyPLN is sponsored by `Fundação Getulio Vargas <http://portal.fgv.br/>`_.

License
@@ -58,11 +55,38 @@ To run tests::

workon pypln.backend
pip install -r requirements/development.txt
echo "MONGODB_CONFIG = {'host': 'localhost', 'port': 27017, 'database': 'test_pypln_dev', 'gridfs_collection': 'files'}" >> pypln/backend/local_config.py
make test

See our `code guidelines <https://github.com/namd/pypln.backend/blob/develop/CONTRIBUTING.rst>`_.

.. TODO: The PYTHONPATH issue should be fixed once we organize the directory
structure. As soon as this is fixed, we must update these instructions.
Creating a new Task
~~~~~~~~~~~~~~~~~~~

All analyses in PyPLN are performed by our workers. Every worker is a Celery
task that can be included in the canvas that will run when a document is
received in pypln.web.

New workers are very easy to create. All you need to do is write a subclass of `PyPLNTask <https://github.com/NAMD/pypln.backend/blob/develop/pypln/backend/celery_task.py#L36>`_
that implements a "process" method. This method will receive the document as a
dictionary, and should return a dictionary that will be used to update the
existing document. As an example::


from pypln.backend.celery_task import PyPLNTask

class FreqDist(PyPLNTask):
def process(self, document):
value = document['value']
square = value ** 2
return {'squared_value': square}


This worker assumes that a previous worker has already included "value" in the
document, and uses it to add a new key called "squared_value".
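The dict-in/dict-out contract can also be tried outside a running pipeline. Below is a minimal standalone sketch in which `PyPLNTask` is a local stand-in for the real base class, so no Celery broker or MongoDB instance is needed:

```python
# Standalone sketch of the worker contract described above. This `PyPLNTask`
# is a local stand-in for pypln.backend.celery_task.PyPLNTask, so the example
# runs without Celery or MongoDB.
class PyPLNTask(object):
    def run(self, document):
        # The real base class loads the document from MongoDB by its id and
        # saves the result; here we just update the dictionary in place.
        document.update(self.process(document))
        return document


class FreqDist(PyPLNTask):
    def process(self, document):
        value = document['value']
        return {'squared_value': value ** 2}


doc = FreqDist().run({'value': 3})
print(doc['squared_value'])  # prints 9
```

The stand-in keeps only the shape of the real class: subclasses implement `process`, and the base class decides where the input comes from and where the output goes.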


.. image:: https://d2weczhvl823v0.cloudfront.net/NAMD/pypln.backend/trend.png
:alt: Bitdeli badge
:target: https://bitdeli.com/free

See our `code guidelines <https://github.com/namd/pypln.backend/blob/develop/CONTRIBUTING.rst>`_.
51 changes: 0 additions & 51 deletions pypln/backend/broker.py

This file was deleted.

28 changes: 28 additions & 0 deletions pypln/backend/celery_app.py
@@ -0,0 +1,28 @@
# coding: utf-8
#
# Copyright 2015 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

from celery import Celery
import config

app = Celery('pypln_workers', backend='mongodb',
broker='amqp://', include=['pypln.backend.workers'])
app.conf.update(
BROKER_URL=config.BROKER_URL,
CELERY_RESULT_BACKEND=config.CELERY_RESULT_BACKEND,
)
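For reference, the connection URLs this app ends up using have the shapes below. These are the same format strings used in `config.py`; the credentials and hosts shown are the stock RabbitMQ/MongoDB defaults, given purely as an illustration, not a real deployment:

```python
# Illustration of the URL formats assembled in config.py and consumed by the
# Celery app above. Values are the stock RabbitMQ/MongoDB defaults.
broker_url = 'amqp://{}:{}@{}:{}//'.format('guest', 'guest', 'localhost', 5672)
result_backend = 'mongodb://{}:{}'.format('localhost', 27017)

print(broker_url)      # amqp://guest:guest@localhost:5672//
print(result_backend)  # mongodb://localhost:27017
```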
70 changes: 70 additions & 0 deletions pypln/backend/celery_task.py
@@ -0,0 +1,70 @@
# coding: utf-8
#
# Copyright 2015 NAMD-EMAP-FGV
#
# This file is part of PyPLN. You can get more information at: http://pypln.org/.
#
# PyPLN is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# PyPLN is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with PyPLN. If not, see <http://www.gnu.org/licenses/>.

from celery import Task

from pypln.backend.mongodict_adapter import MongoDictAdapter

# This import may look like an unused import, but it is not.
# When our base task class is defined, the Celery app must have already been
# instantiated, otherwise when this code is imported elsewhere (like in a
# client that will call a task, for example) celery will fall back to the
# default app, and our configuration will be ignored. This is not an issue in
# the documented project layout, because there the app is imported in the
# modules that define the tasks (in order to use the `app.task` decorator).
from pypln.backend.celery_app import app

from pypln.backend import config


class PyPLNTask(Task):
"""
A base class for PyPLN tasks. It is in charge of getting the document
information based on the document id (that should be passed as an argument
by Celery), calling the `process` method, and saving this information on
the database. It will also return the document id, so the rest of the
pipeline has access to it.
"""

def run(self, document_id):
"""
This method is called by Celery, and should not be overridden.
It will call the `process` method with a dictionary containing all the
document information and will update the database with the results.
"""
document = MongoDictAdapter(doc_id=document_id,
host=config.MONGODB_CONFIG['host'],
port=config.MONGODB_CONFIG['port'],
database=config.MONGODB_CONFIG['database'])
# Create a dictionary out of our document. We could simply pass
# it on to the process method, but for now we won't let the user
# manipulate the MongoDict directly.
dic = {k: v for k, v in document.iteritems()}
result = self.process(dic)
document.update(result)
return document_id

def process(self, document):
"""
This method should be implemented by subclasses. It is responsible for
performing the analysis itself. It will receive a dictionary as a
parameter (containing all the current information on the document)
and must return a dictionary with the keys to be saved in the database.
"""
raise NotImplementedError
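The reason `run` returns the document id is that in a Celery chain each task's return value becomes the next task's argument. The sketch below mirrors that flow with a local dictionary standing in for MongoDB and a local base class standing in for `PyPLNTask`; `WordCount` is a made-up worker for illustration (the real pipeline has workers like `Tokenizer` and `FreqDist`):

```python
# Sketch of why run() returns the document id: in a chain, each task's return
# value is handed to the next task. STORE and PyPLNTask are local stand-ins,
# so this runs without Celery or MongoDB; WordCount is a hypothetical worker.
STORE = {42: {'contents': 'hello world'}}


class PyPLNTask(object):
    def run(self, document_id):
        document = STORE[document_id]
        document.update(self.process(dict(document)))
        return document_id  # handed to the next task in the pipeline


class Tokenizer(PyPLNTask):
    def process(self, document):
        return {'tokens': document['contents'].split()}


class WordCount(PyPLNTask):
    def process(self, document):
        return {'word_count': len(document['tokens'])}


doc_id = WordCount().run(Tokenizer().run(42))
print(STORE[doc_id]['word_count'])  # prints 2
```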
48 changes: 48 additions & 0 deletions pypln/backend/config.py
@@ -0,0 +1,48 @@
import os
import ConfigParser


def get_store_config():
config_filename = os.path.expanduser('~/.pypln_store_config')
defaults = {'host': 'localhost',
'port': '27017',
'database': 'pypln_dev',
'gridfs_collection': 'files',
}
config = ConfigParser.ConfigParser(defaults=defaults)
config.add_section('store')
config.read(config_filename)
store_config = dict(config.items('store'))
# The database port needs to be an integer, but ConfigParser will treat
# everything as a string unless you use the specific method to retrieve the
# value.
store_config['port'] = config.getint('store', 'port')
return store_config

MONGODB_CONFIG = get_store_config()
ELASTICSEARCH_CONFIG = {
'hosts': ['127.0.0.1', '172.16.4.46', '172.16.4.52'],
}

def get_broker_config():
defaults = {
"host": "localhost",
"port": "5672",
"user": "guest",
"password": "guest",
}
celery_config = ConfigParser.ConfigParser(defaults=defaults)
celery_config.add_section('broker')
celery_config.read(os.path.expanduser('~/.pypln_celery_config'))
return dict(celery_config.items('broker'))

CELERY_BROKER_CONFIG = get_broker_config()

BROKER_URL = 'amqp://{}:{}@{}:{}//'.format(
CELERY_BROKER_CONFIG['user'], CELERY_BROKER_CONFIG['password'],
CELERY_BROKER_CONFIG['host'], CELERY_BROKER_CONFIG['port'])

CELERY_RESULT_BACKEND = 'mongodb://{}:{}'.format(MONGODB_CONFIG['host'],
MONGODB_CONFIG['port'])
try:
from local_config import *
except ImportError:
pass
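The defaults-plus-file pattern in `get_store_config` can be sketched with Python 3's `configparser` (the Python 2 `ConfigParser` used above behaves the same way here): reading a missing file is silently ignored, so the defaults win, and `getint` handles the string-to-integer conversion the comment describes.

```python
# Sketch of the defaults-plus-override pattern from get_store_config(),
# ported to Python 3's configparser. Reading a nonexistent file is silently
# ignored, so the defaults apply.
import configparser

defaults = {'host': 'localhost', 'port': '27017',
            'database': 'pypln_dev', 'gridfs_collection': 'files'}
config = configparser.ConfigParser(defaults=defaults)
config.add_section('store')
config.read('/nonexistent/.pypln_store_config')  # missing file: no error
store_config = dict(config.items('store'))
# ConfigParser stores every value as a string; getint does the conversion.
store_config['port'] = config.getint('store', 'port')

print(store_config['host'], store_config['port'])  # localhost 27017
```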