From db6aded0775a7a6891d2fd1191d77592443d2e5d Mon Sep 17 00:00:00 2001 From: Alessio Pollero Date: Fri, 23 Mar 2018 12:44:31 +0100 Subject: [PATCH 1/3] Add compatibility with scrapinghub/splash virtual browser --- scrapy_proxies/randomproxy.py | 39 +++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/scrapy_proxies/randomproxy.py b/scrapy_proxies/randomproxy.py index 06a21ff..2a65c7c 100644 --- a/scrapy_proxies/randomproxy.py +++ b/scrapy_proxies/randomproxy.py @@ -23,9 +23,9 @@ import base64 import logging +proxy_regex = r'(\w+://)([^:]+?:[^@]+?@)?(.+)' log = logging.getLogger('scrapy.proxies') - class Mode: RANDOMIZE_PROXY_EVERY_REQUESTS, RANDOMIZE_PROXY_ONCE, SET_CUSTOM_PROXY = range(3) @@ -43,7 +43,7 @@ def __init__(self, settings): fin = open(self.proxy_list) try: for line in fin.readlines(): - parts = re.match('(\w+://)([^:]+?:[^@]+?@)?(.+)', line.strip()) + parts = re.match(proxy_regex, line.strip()) if not parts: continue @@ -61,7 +61,7 @@ def __init__(self, settings): elif self.mode == Mode.SET_CUSTOM_PROXY: custom_proxy = settings.get('CUSTOM_PROXY') self.proxies = {} - parts = re.match('(\w+://)([^:]+?:[^@]+?@)?(.+)', custom_proxy.strip()) + parts = re.match(proxy_regex, custom_proxy.strip()) if not parts: raise ValueError('CUSTOM_PROXY is not well formatted') @@ -79,8 +79,8 @@ def from_crawler(cls, crawler): def process_request(self, request, spider): # Don't overwrite with a random one (server-side state for IP) - if 'proxy' in request.meta: - if request.meta["exception"] is False: + if 'proxy' in request.meta or ('splash' in request.meta and 'proxy' in request.meta['splash']['args']): + if request.meta.get("exception", False) is False: return request.meta["exception"] = False if len(self.proxies) == 0: @@ -93,20 +93,20 @@ def process_request(self, request, spider): proxy_user_pass = self.proxies[proxy_address] - if proxy_user_pass: - request.meta['proxy'] = proxy_address - basic_auth = 'Basic ' + base64.b64encode(proxy_user_pass.encode()).decode() - request.headers['Proxy-Authorization'] = basic_auth - else: - log.debug('Proxy user pass not found') + self.add_scrapy_proxy(request, proxy_address, proxy_user_pass) + log.debug('Using proxy <%s>, %d proxies left' % ( proxy_address, len(self.proxies))) def process_exception(self, request, exception, spider): - if 'proxy' not in request.meta: + if 'proxy' not in request.meta and not('splash' in request.meta and 'proxy' in request.meta['splash']['args']): return if self.mode == Mode.RANDOMIZE_PROXY_EVERY_REQUESTS or self.mode == Mode.RANDOMIZE_PROXY_ONCE: - proxy = request.meta['proxy'] + if ('splash' in request.meta and 'proxy' in request.meta['splash']['args']): + parts = re.match(proxy_regex, request.meta['splash']['args']['proxy'].strip()) + proxy = parts.group(1) + parts.group(3) + else: + proxy = request.meta['proxy'] try: del self.proxies[proxy] except KeyError: @@ -116,3 +116,16 @@ def process_exception(self, request, exception, spider): self.chosen_proxy = random.choice(list(self.proxies.keys())) log.info('Removing failed proxy <%s>, %d proxies left' % ( proxy, len(self.proxies))) + + def add_scrapy_proxy(self, request, address, user_pass = None): + + if('splash' in request.meta): + # In case there is splash, just forward the proxy to it + parts = re.match('(\w+://)([\w\W]+)', address.strip()) + request.meta['splash']['args']['proxy'] = parts.group(1) + ((user_pass + '@') if len(user_pass) > 0 else '') + parts.group(2) + else: + request.meta['proxy'] = address + if user_pass: + basic_auth = 'Basic ' + base64.b64encode(user_pass.encode()).decode() + request.headers['Proxy-Authorization'] = basic_auth + From 3ce09d57f9fb190734a7928a7d2cd3fa0120e5ce Mon Sep 17 00:00:00 2001 From: Alessio Pollero Date: Fri, 23 Mar 2018 17:29:36 +0100 Subject: [PATCH 2/3] Fix @ sign not allowed in password field of proxy url(fixes #40) --- scrapy_proxies/randomproxy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scrapy_proxies/randomproxy.py b/scrapy_proxies/randomproxy.py index 2a65c7c..b2011b1 100644 --- a/scrapy_proxies/randomproxy.py +++ b/scrapy_proxies/randomproxy.py @@ -23,7 +23,7 @@ import base64 import logging -proxy_regex = r'(\w+://)([^:]+?:[^@]+?@)?(.+)' +proxy_regex = r'(\w+://)([^:]+?:.+@)?(.+)' log = logging.getLogger('scrapy.proxies') class Mode: From dcefdda2684ff7c97d54e4b0fb63e9e30df02cd9 Mon Sep 17 00:00:00 2001 From: Alessio Pollero Date: Mon, 26 Mar 2018 17:05:23 +0200 Subject: [PATCH 3/3] Allow to disable random proxy selection from configuration file(mode < 0) --- scrapy_proxies/randomproxy.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scrapy_proxies/randomproxy.py b/scrapy_proxies/randomproxy.py index b2011b1..da49e84 100644 --- a/scrapy_proxies/randomproxy.py +++ b/scrapy_proxies/randomproxy.py @@ -78,6 +78,9 @@ def from_crawler(cls, crawler): return cls(crawler.settings) def process_request(self, request, spider): + if self.mode < 0: + log.warning("Skipping Random Proxy selection(disabled)!") + return; # Don't overwrite with a random one (server-side state for IP) if 'proxy' in request.meta or ('splash' in request.meta and 'proxy' in request.meta['splash']['args']): if request.meta.get("exception", False) is False: @@ -99,7 +102,7 @@ def process_request(self, request, spider): proxy_address, len(self.proxies))) def process_exception(self, request, exception, spider): - if 'proxy' not in request.meta and not('splash' in request.meta and 'proxy' in request.meta['splash']['args']): + if self.mode < 0 or ('proxy' not in request.meta and not('splash' in request.meta and 'proxy' in request.meta['splash']['args'])): return if self.mode == Mode.RANDOMIZE_PROXY_EVERY_REQUESTS or self.mode == Mode.RANDOMIZE_PROXY_ONCE: if ('splash' in request.meta and 'proxy' in request.meta['splash']['args']):