Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add compatibility with scrapinghub/splash virtual browser #42

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 29 additions & 13 deletions scrapy_proxies/randomproxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@
import base64
import logging

# Matches "scheme://[user:pass@]host:port".
#   group(1): scheme including '://' (e.g. 'http://')
#   group(2): optional 'user:pass@' credentials, INCLUDING the trailing '@'
#   group(3): host:port remainder
proxy_regex = r'(\w+://)([^:]+?:.+@)?(.+)'
# Module-level logger for this middleware.
log = logging.getLogger('scrapy.proxies')


class Mode:
    """Proxy-selection strategies supported by the middleware."""

    # Pick a fresh random proxy for every request.
    RANDOMIZE_PROXY_EVERY_REQUESTS = 0
    # Pick one random proxy once and reuse it for all requests.
    RANDOMIZE_PROXY_ONCE = 1
    # Use the single proxy supplied via the CUSTOM_PROXY setting.
    SET_CUSTOM_PROXY = 2

Expand All @@ -43,7 +43,7 @@ def __init__(self, settings):
fin = open(self.proxy_list)
try:
for line in fin.readlines():
parts = re.match('(\w+://)([^:]+?:[^@]+?@)?(.+)', line.strip())
parts = re.match(proxy_regex, line.strip())
if not parts:
continue

Expand All @@ -61,7 +61,7 @@ def __init__(self, settings):
elif self.mode == Mode.SET_CUSTOM_PROXY:
custom_proxy = settings.get('CUSTOM_PROXY')
self.proxies = {}
parts = re.match('(\w+://)([^:]+?:[^@]+?@)?(.+)', custom_proxy.strip())
parts = re.match(proxy_regex, custom_proxy.strip())
if not parts:
raise ValueError('CUSTOM_PROXY is not well formatted')

Expand All @@ -78,9 +78,12 @@ def from_crawler(cls, crawler):
return cls(crawler.settings)

def process_request(self, request, spider):
if self.mode < 0:
log.warning("Skipping Random Proxy selection(disabled)!")
return;
# Don't overwrite with a random one (server-side state for IP)
if 'proxy' in request.meta:
if request.meta["exception"] is False:
if 'proxy' in request.meta or ('splash' in request.meta and 'proxy' in request.meta['splash']['args']):
if request.meta.get("exception", False) is False:
return
request.meta["exception"] = False
if len(self.proxies) == 0:
Expand All @@ -93,20 +96,20 @@ def process_request(self, request, spider):

proxy_user_pass = self.proxies[proxy_address]

if proxy_user_pass:
request.meta['proxy'] = proxy_address
basic_auth = 'Basic ' + base64.b64encode(proxy_user_pass.encode()).decode()
request.headers['Proxy-Authorization'] = basic_auth
else:
log.debug('Proxy user pass not found')
self.add_scrapy_proxy(request, proxy_address, proxy_user_pass)

log.debug('Using proxy <%s>, %d proxies left' % (
proxy_address, len(self.proxies)))

def process_exception(self, request, exception, spider):
if 'proxy' not in request.meta:
if self.mode < 0 or ('proxy' not in request.meta and not('splash' in request.meta and 'proxy' in request.meta['splash']['args'])):
return
if self.mode == Mode.RANDOMIZE_PROXY_EVERY_REQUESTS or self.mode == Mode.RANDOMIZE_PROXY_ONCE:
proxy = request.meta['proxy']
if ('splash' in request.meta and 'proxy' in request.meta['splash']['args']):
parts = re.match(proxy_regex, request.meta['splash']['args']['proxy'].strip())
proxy = parts.group(1) + parts.group(3)
else:
proxy = request.meta['proxy']
try:
del self.proxies[proxy]
except KeyError:
Expand All @@ -116,3 +119,16 @@ def process_exception(self, request, exception, spider):
self.chosen_proxy = random.choice(list(self.proxies.keys()))
log.info('Removing failed proxy <%s>, %d proxies left' % (
proxy, len(self.proxies)))

def add_scrapy_proxy(self, request, address, user_pass = None):

if('splash' in request.meta):
# In case there is splash, just forward the proxy to it
parts = re.match('(\w+://)([\w\W]+)', address.strip())
request.meta['splash']['args']['proxy'] = parts.group(1) + ((user_pass + '@') if len(user_pass) > 0 else '') + parts.group(2)
else:
request.meta['proxy'] = address
if user_pass:
basic_auth = 'Basic ' + base64.b64encode(user_pass.encode()).decode()
request.headers['Proxy-Authorization'] = basic_auth