-
Notifications
You must be signed in to change notification settings - Fork 48
/
Copy pathrandom_useragent.py
49 lines (41 loc) · 1.7 KB
/
random_useragent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#!/usr/bin/python
# -*-coding:utf-8-*-
"""Scrapy Middleware to set a random User-Agent for every Request.
Downloader Middleware which uses a file containing a list of
user-agents and sets a random one for each request.
"""
import random
from scrapy import signals
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
# Module metadata (authorship, licensing and release information).
__author__ = "Srinivasan Rangarajan"
__copyright__ = "Copyright 2016, Srinivasan Rangarajan"
__credits__ = ["Srinivasan Rangarajan"]
__license__ = "MIT"
__version__ = "0.2"
__maintainer__ = "Srinivasan Rangarajan"
# NOTE(review): the value below looks like an e-mail obfuscation placeholder
# rather than a real address -- confirm against the upstream project.
__email__ = "[email protected]"
__status__ = "Development"
class RandomUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that picks a random User-Agent per request.

    The candidate User-Agent strings are read once, at construction time,
    from the file named by the ``USER_AGENT_LIST`` setting (one entry per
    line, surrounding whitespace stripped, blank lines ignored). When that
    setting is unset, or the file yields no usable entries, the single
    value of the ``USER_AGENT`` setting (or the ``user_agent`` argument) is
    used instead.
    """

    def __init__(self, settings, user_agent='Scrapy'):
        """Build the list of candidate User-Agent strings.

        Args:
            settings: Object exposing ``get(name, default)``; in
                ``from_crawler`` this is ``crawler.settings``.
            user_agent: Fallback User-Agent used when ``USER_AGENT`` is
                not available from ``settings``.

        Raises:
            IOError/OSError: If the file named by ``USER_AGENT_LIST``
                cannot be opened.
        """
        super(RandomUserAgentMiddleware, self).__init__()
        self.user_agent = user_agent

        # Default used when no list file is configured or the file
        # contains nothing usable.
        fallback = settings.get('USER_AGENT', user_agent)

        user_agent_list_file = settings.get('USER_AGENT_LIST')
        if not user_agent_list_file:
            # USER_AGENT_LIST is not set: use the default USER_AGENT or
            # whatever was passed to the middleware.
            self.user_agent_list = [fallback]
            return

        with open(user_agent_list_file, 'r') as f:
            stripped = [line.strip() for line in f]

        # Drop blank lines: an empty entry would be selected at random and
        # then skipped in process_request, silently leaving some requests
        # without a randomised User-Agent.
        self.user_agent_list = [ua for ua in stripped if ua]

        if not self.user_agent_list:
            # An empty file would make random.choice() raise IndexError on
            # every request; behave as if no list had been configured.
            self.user_agent_list = [fallback]

    @classmethod
    def from_crawler(cls, crawler):
        """Create the middleware from ``crawler`` and hook ``spider_opened``.

        The instance is built from ``crawler.settings`` and its
        ``spider_opened`` method is connected to the ``spider_opened``
        signal before the instance is returned.
        """
        obj = cls(crawler.settings)
        crawler.signals.connect(obj.spider_opened,
                                signal=signals.spider_opened)
        return obj

    def process_request(self, request, spider):
        """Give ``request`` a randomly chosen User-Agent header.

        The header is applied with ``setdefault``, and only when the chosen
        value is truthy. Nothing is returned.
        """
        user_agent = random.choice(self.user_agent_list)
        if user_agent:
            request.headers.setdefault('User-Agent', user_agent)