This repository was archived by the owner on Jul 19, 2018. It is now read-only.

autoschedule extension added #39

Open · wants to merge 1 commit into master
62 changes: 62 additions & 0 deletions scrapylib/autoschedule.py
@@ -0,0 +1,62 @@
import os

import scrapinghub
from scrapy import log
from scrapy import signals
from scrapy.exceptions import NotConfigured


class AutoSchedule(object):
    '''Schedule follow-up Scrapinghub jobs when a spider closes.

    AUTOSCHEDULE_SETTINGS maps each close reason to the jobs to schedule,
    per spider: {reason: {spider_name: [{param: value, ...}, ...]}}
    '''
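
    # A hypothetical configuration, for illustration only (the spider and
    # parameter names below are assumptions, not part of this patch):
    #
    #     AUTOSCHEDULE_SETTINGS = {
    #         'finished': {
    #             'products': [
    #                 {'spider': 'products_export', 'units': 'units'},
    #             ],
    #         },
    #     }
    #
    # When the 'products' spider closes with reason 'finished', one
    # 'products_export' job is scheduled; string values such as 'units' are
    # resolved against attributes of the closing spider (see _compile_params).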

    def __init__(self, settings):
        self.settings = settings.getdict('AUTOSCHEDULE_SETTINGS')
        self.apikey = settings.get('AUTOSCHEDULE_APIKEY')
        # Both the schedule map and the API key are required.
        if not self.settings or not self.apikey:
            raise NotConfigured

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        if not settings.getbool('AUTOSCHEDULE_ENABLED'):
            raise NotConfigured
        o = cls(settings)
        # Schedule the configured follow-up jobs when the spider closes.
        crawler.signals.connect(o.spider_closed,
                                signal=signals.spider_closed)
        return o

    def _compile_params(self, settings_params, spider):
        params = {}
        for k, v in settings_params.items():
            if isinstance(v, basestring):
                # A string value names an attribute of the closing spider;
                # fall back to the literal string when no such attribute
                # exists.
                new_v = getattr(spider, v, v)
                # Callables (e.g. spider methods) cannot be passed as
                # scheduling parameters, so keep the original string instead.
                params[k] = v if hasattr(new_v, '__call__') else new_v
            else:
                params[k] = v
        return params
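
    # Worked example (hypothetical values): with spider.units = 100 and
    # spider.parse a method, _compile_params({'units': 'units', 'cb': 'parse',
    # 'priority': 2}, spider) returns {'units': 100, 'cb': 'parse',
    # 'priority': 2}: the callable resolves back to its original string and
    # the non-string value passes through untouched.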

    def spider_closed(self, spider, reason):
        # Jobs are keyed by close reason, then by spider name; use an empty
        # default so unconfigured reasons are a no-op instead of an error.
        settings = self.settings.get(reason, {}).get(spider.name)
        if not settings:
            return

        compiled_params_list = []
        for params in settings:
            params = self._compile_params(params, spider)
            compiled_params_list.append(params)

        conn = scrapinghub.Connection(self.apikey)
        project = conn[os.environ.get('SCRAPY_PROJECT_ID')]
        current_job_key = os.environ.get('SCRAPY_JOB')
        for settings_params in compiled_params_list:
            # Tag each scheduled job with the key of the job that spawned it.
            params = {'parent_job_key': current_job_key}
            params.update(settings_params)
            self._schedule(project, params)

    def _schedule(self, project, params):
        job = project.schedule(**params)
        log.msg('Scheduled {spider} spider, job {job}, params {params}'
                .format(spider=params.get('spider'), job=job, params=params),
                level=log.INFO)
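
For context, a minimal sketch of how this extension might be wired into a
project's settings.py. The extension priority and all values below are
illustrative assumptions, not part of this patch:

# settings.py (sketch; assumed values)
EXTENSIONS = {
    'scrapylib.autoschedule.AutoSchedule': 500,
}
AUTOSCHEDULE_ENABLED = True
AUTOSCHEDULE_APIKEY = '0123456789abcdef'  # placeholder Scrapinghub API key
AUTOSCHEDULE_SETTINGS = {
    'finished': {
        'products': [
            {'spider': 'products_export'},
        ],
    },
}

The extension also reads SCRAPY_PROJECT_ID and SCRAPY_JOB from the
environment to locate the project and tag the parent job, so it assumes it
is running on the Scrapinghub platform, where those variables are set.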
82 changes: 82 additions & 0 deletions tests/test_autoschedule.py
@@ -0,0 +1,82 @@
import unittest

from scrapy.utils.test import get_crawler
from scrapy.exceptions import NotConfigured
from scrapy.spider import Spider

from scrapylib.autoschedule import AutoSchedule


class AutoScheduleTestCase(unittest.TestCase):

    ext_cls = AutoSchedule

    def _mock_crawler(self, settings=None):
        # Stub engine/downloader so the extension can be built and exercised
        # without starting a real crawl.
        class MockedDownloader(object):
            slots = {}

        class MockedEngine(object):
            downloader = MockedDownloader()
            fake_spider_closed_result = None

            def close_spider(self, spider, reason):
                self.fake_spider_closed_result = (spider, reason)

        crawler = get_crawler(settings)
        crawler.engine = MockedEngine()
        return crawler

    def test_enabled(self):
        settings = {'AUTOSCHEDULE_ENABLED': True}
        crawler = self._mock_crawler(settings)
        self.assertRaises(NotConfigured, self.ext_cls.from_crawler, crawler)
        settings['AUTOSCHEDULE_APIKEY'] = '123'
        crawler = self._mock_crawler(settings)
        self.assertRaises(NotConfigured, self.ext_cls.from_crawler, crawler)
        settings['AUTOSCHEDULE_SETTINGS'] = {}
        crawler = self._mock_crawler(settings)
        self.assertRaises(NotConfigured, self.ext_cls.from_crawler, crawler)
        # finally enabled
        settings['AUTOSCHEDULE_SETTINGS'] = {'finished': {'foo': []}}
        crawler = self._mock_crawler(settings)
        self.ext_cls.from_crawler(crawler)

    def test_compile_params(self):
        spider = Spider('foo', pstr='string', pint=1, pcall=iter, plist=[])
        jobs = [{
            'spider': 'bar',
            'string': 'pstr',
            'int': 'pint',
            'call': 'pcall',
            'constant': 'constant1',
        }, {
            'spider': 'foo_bar',
            'string': 'pstr',
            'call': 'pcall',
            'list': 'plist',
            'constant': 'constant2',
        }]
        settings = {
            'AUTOSCHEDULE_APIKEY': '123',
            'AUTOSCHEDULE_ENABLED': True,
            'AUTOSCHEDULE_SETTINGS': {
                'finished': {
                    'foo': jobs
                }
            }
        }
        crawler = self._mock_crawler(settings)
        ext = self.ext_cls.from_crawler(crawler)
        scheduled = []
        # Capture the params that would be passed to Scrapinghub instead of
        # actually scheduling jobs.
        ext._schedule = lambda project, job_params: scheduled.append(job_params)
        ext.spider_closed(spider, 'finished')
        for j, p in zip(jobs, scheduled):
            for k, v in p.items():
                if k == 'parent_job_key':
                    continue

                sv = getattr(spider, j[k], v)
                if not hasattr(sv, '__call__'):
                    # Non-callable spider attributes (and literal constants)
                    # are substituted into the compiled params.
                    self.assertEqual(sv, v)
                else:
                    # Callable attributes are left as the original setting
                    # string, not resolved.
                    self.assertEqual(v, j[k])
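
A quick way to run just this module (assuming the scrapylib checkout, with
scrapy and scrapinghub installed, is on the PYTHONPATH):

python -m unittest tests.test_autoschedule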