Skip to content

Commit 72e024a

Browse files
committed Feb 23, 2017
update struct
1 parent 05277b9 commit 72e024a

File tree

9 files changed

+156
-176
lines changed

9 files changed

+156
-176
lines changed
 

Diff for: .idea/workspace.xml

+123-78
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Diff for: proxypool/__init__.py

-1
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-
__author__ = 'WiseDoge'

Diff for: proxypool/api.py

-13
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,3 @@
1-
"""
2-
-------------------------------------------------
3-
File Name: api.py
4-
Description: API模块,运行后打开浏览器,访问
5-
http://127.0.0.1:5000/进入主页。
6-
访问 http://127.0.0.1:5000/get
7-
从代理池中获取一个代理。
8-
访问 http://127.0.0.1:5000/count
9-
获取代理池中可用代理的总数。
10-
Author: Liu
11-
Date: 2016/12/9
12-
-------------------------------------------------
13-
"""
141
from flask import Flask, g
152

163
from .db import RedisClient

Diff for: proxypool/db.py

+15-20
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,6 @@
1-
"""
2-
-------------------------------------------------
3-
File Name: db.py
4-
Description: 数据库操作模块,负责对象与底层数据库
5-
的交互。
6-
Author: Liu
7-
Date: 2016/12/9
8-
-------------------------------------------------
9-
"""
101
import redis
11-
12-
from .error import PoolEmptyError
13-
from .setting import HOST, PORT
2+
from proxypool.error import PoolEmptyError
3+
from proxypool.setting import HOST, PORT
144

155

166
class RedisClient(object):
@@ -19,21 +9,21 @@ class RedisClient(object):
199
"""
2010

2111
def __init__(self, host=HOST, port=PORT):
22-
self.__db = redis.Redis(host, port)
12+
self._db = redis.Redis(host, port)
2313

2414
def get(self, count=1):
2515
"""从Pool中获取一定量数据。"""
26-
proxies = self.__db.lrange("proxies", 0, count - 1)
27-
self.__db.ltrim("proxies", count, -1)
16+
proxies = self._db.lrange("proxies", 0, count - 1)
17+
self._db.ltrim("proxies", count, -1)
2818
return proxies
2919

3020
def put(self, proxy):
3121
"""将代理压入Pool中。
3222
用Redis的set容器来负责去重,如果proxy能被压入proxy_set,
3323
就将其放入proxy pool中,否则不压入。
3424
"""
35-
if self.__db.sadd("proxy_set", proxy):
36-
self.__db.rpush("proxies", proxy)
25+
if self._db.sadd("set", proxy):
26+
self._db.rpush("proxies", proxy)
3727
else:
3828
pass
3929

@@ -47,17 +37,22 @@ def pop(self):
4737
"""弹出一个可用代理。
4838
"""
4939
try:
50-
return self.__db.blpop("proxies", 30)[1].decode('utf-8')
40+
return self._db.blpop("proxies", 30)[1].decode('utf-8')
5141
except:
5242
raise PoolEmptyError
5343

5444
@property
5545
def queue_len(self):
5646
"""获取proxy pool的大小。
5747
"""
58-
return self.__db.llen("proxies")
48+
return self._db.llen("proxies")
5949

6050
def flush(self):
6151
"""刷新Redis中的全部内容,测试用。
6252
"""
63-
self.__db.flushall()
53+
self._db.flushall()
54+
55+
56+
if __name__ == '__main__':
57+
conn = RedisClient()
58+
print(conn.get(20))

Diff for: proxypool/error.py

-10
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,3 @@
1-
"""
2-
-------------------------------------------------
3-
File Name: error.py
4-
Description: 异常模块
5-
Author: Liu
6-
Date: 2016/12/9
7-
-------------------------------------------------
8-
"""
9-
10-
111
class ResourceDepletionError(Exception):
122
"""
133
资源枯竭异常,如果从所有抓取网站都抓不到可用的代理资源,

Diff for: proxypool/getter.py

+1-16
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,3 @@
1-
"""
2-
-------------------------------------------------
3-
File Name: proxyGetter.py
4-
Description: 代理抓取模块,负责与网络的交互。
5-
注意,代理网站的HTML结构可能会时常的更新,
6-
会导致本文件下的抓取函数失效,所以,在运行
7-
代理池之前,需要更新一下FreeProxyGetter类
8-
中以crawl_开头的方法。
9-
Author: Liu
10-
Date: 2016/12/9
11-
-------------------------------------------------
12-
"""
13-
14-
import time
15-
161
from .utils import get_page
172
from pyquery import PyQuery as pq
183

@@ -43,7 +28,7 @@ class FreeProxyGetter(object, metaclass=ProxyMetaclass):
4328
添加器会自动识别并调用此类函数。
4429
"""
4530

46-
def get_raw_proxies(self, callback, count=40):
31+
def get_raw_proxies(self, callback):
4732
proxies = []
4833
print('Callback', callback)
4934
for proxy in eval("self.{}()".format(callback)):

Diff for: proxypool/schedule.py

+13-23
Original file line numberDiff line numberDiff line change
@@ -1,56 +1,49 @@
1-
"""
2-
-------------------------------------------------
3-
File Name: schedule.py
4-
Description: 调度器模块,
5-
包含ValidityTester,PoolAdder,
6-
Schedule三个类,负责维护代理池。
7-
Author: Liu
8-
Date: 2016/12/9
9-
-------------------------------------------------
10-
"""
111
import time
122
from multiprocessing import Process
133
import asyncio
144
import aiohttp
15-
from .db import RedisClient
16-
from .error import ResourceDepletionError
17-
from .getter import FreeProxyGetter
18-
from .setting import *
5+
from proxypool.db import RedisClient
6+
from proxypool.error import ResourceDepletionError
7+
from proxypool.getter import FreeProxyGetter
8+
from proxypool.setting import *
199

2010

2111
class ValidityTester(object):
2212
"""
2313
检验器,负责对未知的代理进行异步检测。
2414
"""
2515
# 用百度的首页来检验
26-
test_api = 'http://www.baidu.com'
16+
test_api = TEST_API
2717

2818
def __init__(self):
2919
self._raw_proxies = None
3020
self._usable_proxies = []
3121

3222
def set_raw_proxies(self, proxies):
33-
"""设置待检测的代理。
23+
"""
24+
设置待检测的代理。
3425
"""
3526
self._raw_proxies = proxies
3627
self._usable_proxies = []
3728

3829
async def test_single_proxy(self, proxy):
39-
"""检测单个代理,如果可用,则将其加入_usable_proxies
30+
"""
31+
检测单个代理,如果可用,则将其加入_usable_proxies
4032
"""
4133
async with aiohttp.ClientSession() as session:
4234
try:
4335
real_proxy = 'http://' + proxy
4436
print('Testing', real_proxy)
4537
async with session.get(self.test_api, proxy=real_proxy, timeout=15) as response:
4638
await response
47-
print('Response from', proxy)
4839
self._usable_proxies.append(proxy)
40+
print('Valid proxy', proxy)
4941
except Exception:
50-
pass
42+
print('Invalid proxy', proxy)
5143

5244
def test(self):
53-
"""异步检测_raw_proxies中的全部代理。
45+
"""
46+
异步检测_raw_proxies中的全部代理。
5447
"""
5548
print('ValidityTester is working')
5649
loop = asyncio.get_event_loop()
@@ -92,9 +85,6 @@ def add_to_queue(self):
9285
for callback_label in range(self._crawler.__CrawlFuncCount__):
9386
callback = self._crawler.__CrawlFunc__[callback_label]
9487
raw_proxies = self._crawler.get_raw_proxies(callback)
95-
self._tester.set_raw_proxies(raw_proxies)
96-
self._tester.test()
97-
self._conn.put_many(self._tester.get_usable_proxies())
9888
proxy_count += len(raw_proxies)
9989
if proxy_count == 0:
10090
raise ResourceDepletionError

Diff for: proxypool/setting.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,3 @@
1-
"""
2-
-------------------------------------------------
3-
File Name: setting.py
4-
Description: 设置模块,包含了一些常量。
5-
Author: Liu
6-
Date: 2016/12/9
7-
-------------------------------------------------
8-
"""
9-
101
# Redis Host
112
HOST = 'localhost'
123
# Redis PORT
@@ -18,3 +9,5 @@
189

1910
VALID_CHECK_CYCLE = 600
2011
POOL_LEN_CHECK_CYCLE = 20
12+
13+
TEST_API='http://www.baidu.com'

Diff for: proxypool/utils.py

+2-6
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,18 @@
11
import requests
2-
import lxml
32
import asyncio
4-
import time
53
import aiohttp
6-
from bs4 import BeautifulSoup
74
from requests.exceptions import ConnectionError
85

96
base_headers = {
10-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
11-
(KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
7+
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
128
'Accept-Encoding': 'gzip, deflate, sdch',
139
'Accept-Language': 'zh-CN,zh;q=0.8'
1410
}
1511

1612

1713
def get_page(url, options={}):
1814
headers = dict(base_headers, **options)
19-
print('Getting', url, headers)
15+
print('Getting', url)
2016
try:
2117
r = requests.get(url, headers=headers)
2218
print('Getting result', url, r.status_code)

0 commit comments

Comments (0)
Please sign in to comment.