Skip to content

add sub proxy pool mechanics #213

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Mar 25, 2024
5 changes: 4 additions & 1 deletion proxypool/processors/getter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from proxypool.storages.redis import RedisClient
from proxypool.setting import PROXY_NUMBER_MAX
from proxypool.crawlers import __all__ as crawlers_cls

from proxypool.testers import __all__ as testers_cls

class Getter(object):
"""
Expand All @@ -16,6 +16,8 @@ def __init__(self):
self.redis = RedisClient()
self.crawlers_cls = crawlers_cls
self.crawlers = [crawler_cls() for crawler_cls in self.crawlers_cls]
self.testers_cls = testers_cls
self.testers = [tester_cls() for tester_cls in self.testers_cls]

def is_full(self):
"""
Expand All @@ -36,6 +38,7 @@ def run(self):
logger.info(f'crawler {crawler} to get proxy')
for proxy in crawler.crawl():
self.redis.add(proxy)
# also register the proxy in every tester's sub-pool, seeded with that
# tester's own initial score (the default equals the global PROXY_SCORE_INIT)
for tester in self.testers:
    self.redis.add(proxy, score=tester.proxy_score_init, redis_key=tester.key)


if __name__ == '__main__':
Expand Down
21 changes: 17 additions & 4 deletions proxypool/processors/server.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from flask import Flask, g, request
from proxypool.exceptions import PoolEmptyException
from proxypool.storages.redis import RedisClient
from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV
from proxypool.setting import API_HOST, API_PORT, API_THREADED, API_KEY, IS_DEV, PROXY_RAND_KEY_DEGRADED
import functools

__all__ = ['app']
Expand Down Expand Up @@ -53,10 +54,19 @@ def index():
@auth_required
def get_proxy():
"""
get a random proxy
get a random proxy, can query the specific sub-pool according the (redis) key
if PROXY_RAND_KEY_DEGRADED is set to True, will get a universal random proxy if no proxy found in the sub-pool
:return: get a random proxy
"""
key = request.args.get('key')
conn = get_conn()
# return conn.random(key).string() if key else conn.random().string()
if key:
try:
return conn.random(key).string()
except PoolEmptyException:
if not PROXY_RAND_KEY_DEGRADED:
raise
return conn.random().string()


Expand All @@ -67,8 +77,10 @@ def get_proxy_all():
get a random proxy
:return: get a random proxy
"""
key = request.args.get('key')

conn = get_conn()
proxies = conn.all()
proxies = conn.all(key) if key else conn.all()
proxies_string = ''
if proxies:
for proxy in proxies:
Expand All @@ -85,7 +97,8 @@ def get_count():
:return: count, int
"""
conn = get_conn()
return str(conn.count())
key = request.args.get('key')
return str(conn.count(key)) if key else conn.count()


if __name__ == '__main__':
Expand Down
28 changes: 28 additions & 0 deletions proxypool/processors/tester.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
TEST_DONT_SET_MAX_SCORE
from aiohttp import ClientProxyConnectionError, ServerDisconnectedError, ClientOSError, ClientHttpProxyError
from asyncio import TimeoutError
from proxypool.testers import __all__ as testers_cls

EXCEPTIONS = (
ClientProxyConnectionError,
Expand All @@ -30,6 +31,8 @@ def __init__(self):
"""
self.redis = RedisClient()
self.loop = asyncio.get_event_loop()
self.testers_cls = testers_cls
self.testers = [tester_cls() for tester_cls in self.testers_cls]

async def test(self, proxy: Proxy):
"""
Expand Down Expand Up @@ -63,8 +66,33 @@ async def test(self, proxy: Proxy):
else:
self.redis.decrease(proxy)
logger.debug(f'proxy {proxy.string()} is invalid, decrease score')
# if independent tester class found, create new set of storage and do the extra test
for tester in self.testers:
key = tester.key
if self.redis.exists(proxy, key):
test_url = tester.test_url
headers = tester.headers()
cookies = tester.cookies()
async with session.get(test_url, proxy=f'http://{proxy.string()}',
timeout=TEST_TIMEOUT,
headers=headers,
cookies=cookies,
allow_redirects=False) as response:
resp_text = await response.text()
is_valid = await tester.parse(resp_text, test_url, proxy.string())
if is_valid:
if tester.test_dont_set_max_score:
logger.info(f'key[{key}] proxy {proxy.string()} is valid, remain current score')
else:
self.redis.max(proxy, key, tester.proxy_score_max)
logger.info(f'key[{key}] proxy {proxy.string()} is valid, set max score')
else:
self.redis.decrease(proxy, tester.key, tester.proxy_score_min)
logger.info(f'key[{key}] proxy {proxy.string()} is invalid, decrease score')

except EXCEPTIONS:
self.redis.decrease(proxy)
[self.redis.decrease(proxy, tester.key, tester.proxy_score_min) for tester in self.testers]
logger.debug(f'proxy {proxy.string()} is invalid, decrease score')

@logger.catch
Expand Down
2 changes: 2 additions & 0 deletions proxypool/setting.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@
PROXY_SCORE_MAX = env.int('PROXY_SCORE_MAX', 100)
PROXY_SCORE_MIN = env.int('PROXY_SCORE_MIN', 0)
PROXY_SCORE_INIT = env.int('PROXY_SCORE_INIT', 10)
# whether to get a universal random proxy if no proxy exists in the sub-pool identified by a specific key
# (read from its own environment variable rather than the unrelated TEST_ANONYMOUS)
PROXY_RAND_KEY_DEGRADED = env.bool('PROXY_RAND_KEY_DEGRADED', True)

# definition of proxy number
PROXY_NUMBER_MAX = 50000
Expand Down
50 changes: 25 additions & 25 deletions proxypool/storages/redis.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __init__(self, host=REDIS_HOST, port=REDIS_PORT, password=REDIS_PASSWORD, db
self.db = redis.StrictRedis(
host=host, port=port, password=password, db=db, decode_responses=True, **kwargs)

def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
def add(self, proxy: Proxy, score=PROXY_SCORE_INIT, redis_key=REDIS_KEY) -> int:
"""
add proxy and set it to init score
:param proxy: proxy, ip:port, like 8.8.8.8:88
Expand All @@ -44,12 +44,12 @@ def add(self, proxy: Proxy, score=PROXY_SCORE_INIT) -> int:
if not is_valid_proxy(f'{proxy.host}:{proxy.port}'):
logger.info(f'invalid proxy {proxy}, throw it')
return
if not self.exists(proxy):
if not self.exists(proxy, redis_key):
if IS_REDIS_VERSION_2:
return self.db.zadd(REDIS_KEY, score, proxy.string())
return self.db.zadd(REDIS_KEY, {proxy.string(): score})
return self.db.zadd(redis_key, score, proxy.string())
return self.db.zadd(redis_key, {proxy.string(): score})

def random(self) -> Proxy:
def random(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> Proxy:
"""
get random proxy
firstly try to get proxy with max score
Expand All @@ -59,74 +59,74 @@ def random(self) -> Proxy:
"""
# try to get proxy with max score
proxies = self.db.zrangebyscore(
REDIS_KEY, PROXY_SCORE_MAX, PROXY_SCORE_MAX)
redis_key, proxy_score_max, proxy_score_max)
if len(proxies):
return convert_proxy_or_proxies(choice(proxies))
# else get proxy by rank
proxies = self.db.zrevrange(
REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX)
redis_key, proxy_score_min, proxy_score_max)
if len(proxies):
return convert_proxy_or_proxies(choice(proxies))
# else raise error
raise PoolEmptyException

def decrease(self, proxy: Proxy) -> int:
def decrease(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN) -> int:
"""
decrease score of proxy, if small than PROXY_SCORE_MIN, delete it
:param proxy: proxy
:return: new score
"""
if IS_REDIS_VERSION_2:
self.db.zincrby(REDIS_KEY, proxy.string(), -1)
self.db.zincrby(redis_key, proxy.string(), -1)
else:
self.db.zincrby(REDIS_KEY, -1, proxy.string())
score = self.db.zscore(REDIS_KEY, proxy.string())
self.db.zincrby(redis_key, -1, proxy.string())
score = self.db.zscore(redis_key, proxy.string())
logger.info(f'{proxy.string()} score decrease 1, current {score}')
if score <= PROXY_SCORE_MIN:
if score <= proxy_score_min:
logger.info(f'{proxy.string()} current score {score}, remove')
self.db.zrem(REDIS_KEY, proxy.string())
self.db.zrem(redis_key, proxy.string())

def exists(self, proxy: Proxy) -> bool:
def exists(self, proxy: Proxy, redis_key=REDIS_KEY) -> bool:
"""
if proxy exists
:param proxy: proxy
:return: if exists, bool
"""
return not self.db.zscore(REDIS_KEY, proxy.string()) is None
return not self.db.zscore(redis_key, proxy.string()) is None

def max(self, proxy: Proxy) -> int:
def max(self, proxy: Proxy, redis_key=REDIS_KEY, proxy_score_max=PROXY_SCORE_MAX) -> int:
"""
set proxy to max score
:param proxy: proxy
:return: new score
"""
logger.info(f'{proxy.string()} is valid, set to {PROXY_SCORE_MAX}')
logger.info(f'{proxy.string()} is valid, set to {proxy_score_max}')
if IS_REDIS_VERSION_2:
return self.db.zadd(REDIS_KEY, PROXY_SCORE_MAX, proxy.string())
return self.db.zadd(REDIS_KEY, {proxy.string(): PROXY_SCORE_MAX})
return self.db.zadd(redis_key, proxy_score_max, proxy.string())
return self.db.zadd(redis_key, {proxy.string(): proxy_score_max})

def count(self) -> int:
def count(self, redis_key=REDIS_KEY) -> int:
"""
get count of proxies
:return: count, int
"""
return self.db.zcard(REDIS_KEY)
return self.db.zcard(redis_key)

def all(self) -> List[Proxy]:
def all(self, redis_key=REDIS_KEY, proxy_score_min=PROXY_SCORE_MIN, proxy_score_max=PROXY_SCORE_MAX) -> List[Proxy]:
"""
get all proxies
:return: list of proxies
"""
return convert_proxy_or_proxies(self.db.zrangebyscore(REDIS_KEY, PROXY_SCORE_MIN, PROXY_SCORE_MAX))
return convert_proxy_or_proxies(self.db.zrangebyscore(redis_key, proxy_score_min, proxy_score_max))

def batch(self, cursor, count) -> List[Proxy]:
def batch(self, cursor, count, redis_key=REDIS_KEY) -> List[Proxy]:
"""
get batch of proxies
:param cursor: scan cursor
:param count: scan count
:return: list of proxies
"""
cursor, proxies = self.db.zscan(REDIS_KEY, cursor, count=count)
cursor, proxies = self.db.zscan(redis_key, cursor, count=count)
return cursor, convert_proxy_or_proxies([i[0] for i in proxies])


Expand Down
16 changes: 16 additions & 0 deletions proxypool/testers/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import importlib.util
import inspect
import pkgutil
import sys

from .base import BaseTester


# Load every module of this package and collect the concrete subclasses of
# BaseTester; the resulting list of classes is exposed as __all__.
classes = []
for finder, module_name, is_pkg in pkgutil.walk_packages(__path__):
    # finder.find_module() was removed from importlib finders in Python 3.12
    # and Loader.load_module() is deprecated; the spec-based API is used instead.
    spec = finder.find_spec(module_name)
    module = importlib.util.module_from_spec(spec)
    # load_module() registered the module under its bare name; keep doing so
    # in case a tester module relies on being present in sys.modules.
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    for attr_name, value in inspect.getmembers(module):
        # Re-export the module's members at package level, but skip dunders so
        # the package's own __name__/__spec__/__file__ are not overwritten.
        if not attr_name.startswith('__'):
            globals()[attr_name] = value
        if inspect.isclass(value) and issubclass(value, BaseTester) and value is not BaseTester \
                and not getattr(value, 'ignore', False):
            classes.append(value)
__all__ = __ALL__ = classes

19 changes: 19 additions & 0 deletions proxypool/testers/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from proxypool.setting import TEST_DONT_SET_MAX_SCORE, PROXY_SCORE_INIT, PROXY_SCORE_MAX, PROXY_SCORE_MIN


class BaseTester(object):
    """
    Base class for a site-specific proxy tester backed by its own sub-pool.

    Subclasses override the class attributes and, where needed, the
    ``headers``/``cookies``/``parse`` hooks.
    """
    # URL that is requested through the proxy under test
    test_url = ""
    # redis key of the sub-pool that belongs to this tester
    key = ""
    # when true, a valid proxy keeps its current score instead of being set to the max
    test_dont_set_max_score = TEST_DONT_SET_MAX_SCORE
    # score bounds for this sub-pool; they default to the global settings
    proxy_score_init = PROXY_SCORE_INIT
    proxy_score_max = PROXY_SCORE_MAX
    proxy_score_min = PROXY_SCORE_MIN

    def headers(self):
        """
        extra request headers for the test request
        :return: headers, None by default
        """
        return None

    def cookies(self):
        """
        extra cookies for the test request
        :return: cookies, None by default
        """
        return None

    async def parse(self, html, url, proxy, expr='{"code":0'):
        """
        decide from the response body whether the proxy passed the test
        :param html: response text of the test request
        :param url: url that was requested (unused by the base implementation)
        :param proxy: proxy string (unused by the base implementation)
        :param expr: marker that must occur in html
        :return: True if expr is contained in html, else False
        """
        return expr in html