scraper.py
from airbrakepy.logging.handlers import AirbrakeHandler
from config import SETTINGS, pg_connect
from datetime import datetime
import click
import isodate
import json
import logging
import lxml.html
import psycopg2
import redis
import sys
import urllib2

# initialize logger
logging.basicConfig()
logger = logging.getLogger("scraper")
if SETTINGS['PYTHON_ENV'] == 'development' or SETTINGS['PYTHON_ENV'] == 'test':
    logger.setLevel(logging.DEBUG)
else:
    logger.setLevel(logging.INFO)
handler = AirbrakeHandler(SETTINGS['AIRBRAKE_API_KEY'], environment=SETTINGS['PYTHON_ENV'], component_name='scraper', node_name='data25c')
handler.setLevel(logging.ERROR)
logger.addHandler(handler)

# initialize redis connection
redis_data = redis.StrictRedis.from_url(SETTINGS['REDIS_URL'])


class Scraper:
    def __init__(self, dom):
        self.dom = dom

    def title(self):
        for element in self.dom.xpath('//html/head/title'):
            return element.text
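
# A minimal sketch of title extraction in isolation (the HTML snippet below is
# illustrative only; assumes lxml is installed, as imported above):
#
#   >>> dom = lxml.html.fromstring('<html><head><title>Hello</title></head></html>')
#   >>> Scraper(dom).title()
#   'Hello'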


def scrape(url):
    try:
        # open url
        request = urllib2.Request(url, None, { 'User-Agent': 'twentyfivec/1.0 (https://www.25c.com/)' })
        html = urllib2.urlopen(request).read()
    except Exception:
        logger.exception("%s: unable to download", url)
        # TODO re-enqueue for retry
        return
    try:
        dom = lxml.html.fromstring(html)
        return Scraper(dom)
    except Exception:
        logger.exception('%s: unable to scrape html', url)


def process_message(message):
    try:
        # parse JSON data
        data = json.loads(message)
    except ValueError:
        logger.warning('unparseable message=%s', message)
        return
    try:
        logger.info("%s: scraping...", data['url'])
        scraper = scrape(data['url'])
        if scraper is not None:
            click.insert_title(data['url'], scraper.title())
            logger.info("%s: %s", data['url'], scraper.title())
    except Exception:
        logger.exception('%s: unexpected exception', data['url'])


def process_queue():
    # block until a message arrives on the scraper queue
    message = redis_data.brpop('QUEUE_SCRAPER', 0)
    # brpop returns a (queue, payload) tuple; process the payload
    process_message(message[1])


def enqueue_url(url):
    logger.info("Enqueueing: %s", url)
    redis_data.lpush('QUEUE_SCRAPER', json.dumps({ 'url': url }))
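
# A minimal usage sketch for producers (assumes REDIS_URL in config points at
# the same redis instance this worker reads from; the URL is illustrative):
#
#   >>> from scraper import enqueue_url
#   >>> enqueue_url('http://example.com/some-page')
#
# The worker loop pops the resulting {"url": ...} payload from QUEUE_SCRAPER
# and scrapes it via process_queue() above.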


def rescrape_all():
    pg_data = None
    cursor = None
    try:
        pg_data = pg_connect(SETTINGS['DATABASE_URL'])
        cursor = pg_data.cursor()
        cursor.execute("SELECT DISTINCT(referrer) FROM clicks")
        for row in cursor:
            enqueue_url(row[0])
    except Exception:
        logger.exception('Unexpected exception re-enqueueing referrer urls for scraping')
    finally:
        # guard against pg_connect or cursor() failing before these were assigned
        if cursor is not None:
            cursor.close()
        if pg_data is not None:
            pg_data.commit()
            pg_data.close()


if __name__ == '__main__':
    logger.info("Starting scraper...")
    while True:
        process_queue()
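
# An operational sketch (assumption: rescrape_all is invoked by hand or from a
# one-off job rather than by this worker loop), e.g.:
#
#   $ python -c "import scraper; scraper.rescrape_all()"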