
Commit 5c63bcf

♻️ Redesigned Boilerplate (#39)
* ♻️ Refactor design
* 🗑️ Deprecate selectolax
* ♻️ Improve contracts and base design
* ✅ Add rocketry serve test
* 🦺 Update AioHttpEngine `cookies` and `proxy` types
* ✅ Add engine test for attrs
* 🎨 Update pre-commit, `pyproject.toml`, and `setup.cfg`
* 🎨 Apply the formatting config across the whole project
* 🔧 Update config files
* 🚨 Fix mypy type errors
* 🎨 Improve the core adaptor
* ✅ Update tests
* 👷 Update req-dev
* 👷 Update tox.ini
* 🎨 Improve and centralize protos
* ⚡️ Update `_get_cookie` and add a test for an engine that was never set up

---------

Co-authored-by: Sadegh Yazdani <[email protected]>
1 parent 4d9459e commit 5c63bcf
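
The headline change for downstream code is the engine rename from `AioHTTP` to `AioHttpEngine`. A minimal migration sketch, assuming (as the diffs below suggest) that only the class name changed:

```python
# Before this commit the engine was imported as AioHTTP:
# from fastcrawler.engine import AioHTTP
# engine = AioHTTP()

# After this commit the same engine is exposed under its new name:
from fastcrawler.engine import AioHttpEngine

engine = AioHttpEngine()  # constructor arguments keep their defaults (None / 100)
```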


51 files changed: +624 −343 lines

.pre-commit-config.yaml

Lines changed: 6 additions & 1 deletion
```diff
@@ -7,7 +7,7 @@ repos:
       - id: check-yaml
       - id: check-added-large-files
   - repo: 'https://github.com/psf/black'
-    rev: 23.3.0
+    rev: 23.7.0
     hooks:
       - id: black
   - repo: 'https://github.com/PyCQA/flake8'
@@ -18,3 +18,8 @@ repos:
     rev: v1.4.1
     hooks:
       - id: mypy
+        name: mypy (fastcrawler)
+        files: ^fastcrawler/
+      # - id: mypy
+      #   name: mypy (test)
+      #   files: ^test/
```

README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-# fastcrawler
+# fastcrawler
```

docs_src/initilizing_project/sample1/main.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -2,5 +2,5 @@
 from fastcrawler import FastCrawler
 
 app = FastCrawler(
-    crawlers=wiki_spider
+    crawlers=wiki_spider,
 )
```

docs_src/initilizing_project/sample1/wikipedia.py

Lines changed: 7 additions & 4 deletions
```diff
@@ -2,7 +2,7 @@
 
 
 from fastcrawler import BaseModel, Crawler, CSSField, Spider, XPATHField
-from fastcrawler.engine import AioHTTP
+from fastcrawler.engine import AioHttpEngine
 
 
 class PageResolver(BaseModel):
@@ -16,21 +16,24 @@ class ArticleData(BaseModel):
 
 
 class WikiBaseSpider(Spider):
-    engine = AioHTTP
+    engine = AioHttpEngine
     concurrency = 100
 
 
 class WikiArticleFinder(WikiBaseSpider):
     data_model = PageResolver
     req_count = 1_000_000
-    start_url = ["https://meta.wikimedia.org/wiki/List_of_Wikipedias", ]
+    start_url = [
+        "https://meta.wikimedia.org/wiki/List_of_Wikipedias",
+    ]
 
 
 class WikiArticleRetirever(WikiBaseSpider):
     data_model = ArticleData
     req_count = 1_000_000
 
-    async def save_data(self, data: ArticleData): ...  # save parsed data to database
+    async def save_data(self, data: ArticleData):
+        ...  # save parsed data to database
 
 
 wiki_spider = Crawler(WikiArticleFinder >> WikiArticleRetirever)
```

fastcrawler/__init__.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -1,5 +1,7 @@
 from .core import Crawler, FastCrawler, Spider
-from .parsers import BaseModel, CSSField, XPATHField, RegexField
+from .engine import AioHttpEngine
+from .parsers import BaseModel, CSSField, RegexField, XPATHField
+from .schedule import RocketryApplication, RocketryController
 from .utils import Depends
 
 __all__ = [
@@ -10,5 +12,8 @@
     "Depends",
     "Spider",
     "Crawler",
-    "FastCrawler"
+    "FastCrawler",
+    "RocketryApplication",
+    "RocketryController",
+    "AioHttpEngine",
 ]
```

fastcrawler/core/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -5,5 +5,5 @@
 __all__ = [
     "Crawler",
     "Spider",
-    "FastCrawler"
+    "FastCrawler",
 ]
```

fastcrawler/core/registery.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -9,6 +9,7 @@ class CrawlerMeta(type):
 
     DONT TOUCH THIS CLASS UNLESS YOU KNOW WHAT YOU ARE DOING.
     """
+
     def __init__(cls, name, bases, dct):
         super().__init__(name, bases, dct)
         cls._instances = {}
```

fastcrawler/core/spider.py

Lines changed: 16 additions & 14 deletions
```diff
@@ -1,11 +1,25 @@
 from typing import List
 
 
-class SpiderMetaClass(type):
+class Spider:
+    """
+    Spider class to create the actual spider interface
+    so that configuration of each spider can be given
+    as class properties from the inheritanced class from spider
+
+    instances property hold the instances that were set by metaclass
+    that is connected to current spider class
+    """
+
+    instances: List["Spider"]
+
+    def __init__(self):
+        ...
+
     def __rshift__(self, other: "Spider") -> "Spider":
         """
         leveraged RSHIFT method for magic in flow >>
-        objA >> objB >> objC >> objD
+        clsA >> clsB >> clsC >> clsD
 
         Must be used as metaclass to inject behaviour to subclass
@@ -17,15 +31,3 @@ def __rshift__(self, other: "Spider") -> "Spider":
         self.instances.append(other)
         setattr(other, "instances", self.instances)
         return other
-
-
-class Spider(metaclass=SpiderMetaClass):
-    """
-    Spider class to create the actual spider interface
-    so that configuration of each spider can be given
-    as class properties from the inheritanced class from spider
-
-    instances property hold the instances that were set by metaclass
-    that is connected to current spider class
-    """
-    instances: List["Spider"]
```
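
For context, this redesign moves the `>>` chaining off a metaclass and onto `Spider` itself. Below is a minimal, self-contained sketch of that chaining contract; the hunk only shows the tail of `__rshift__`, so the seeding of `instances` and the subclass names here are assumptions:

```python
from typing import List


class Spider:
    instances: List["Spider"]

    def __rshift__(self, other: "Spider") -> "Spider":
        # Assumption: the elided middle of the real method creates the
        # list on first use, registering the head of the chain.
        if not hasattr(self, "instances"):
            self.instances = [self]
        self.instances.append(other)
        setattr(other, "instances", self.instances)
        return other


class Finder(Spider): ...


class Retriever(Spider): ...


# Each ">>" returns the right-hand spider, and every spider in the chain
# ends up sharing one `instances` list, in flow order.
tail = Finder() >> Retriever()
print([type(s).__name__ for s in tail.instances])  # ['Finder', 'Retriever']
```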

fastcrawler/engine/__init__.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -1,8 +1,8 @@
-from .aio import AioHTTP
-from .base import ProxySetting, SetCookieParam
+from .aio import AioHttpEngine
+from .contracts import ProxySetting, SetCookieParam
 
 __all__ = [
     "ProxySetting",
     "SetCookieParam",
-    "AioHTTP",
+    "AioHttpEngine",
 ]
```

fastcrawler/engine/aio.py

Lines changed: 68 additions & 28 deletions
```diff
@@ -1,13 +1,15 @@
 import asyncio
+from typing import Any
 
 import pydantic
 from aiohttp import BasicAuth, ClientSession, TCPConnector
+from aiohttp.client import ClientResponse
 from aiohttp.cookiejar import Morsel
 
-from fastcrawler.engine.base import ProxySetting, SetCookieParam
+from fastcrawler.engine.contracts import ProxySetting, Response, SetCookieParam
 
 
-class AioHTTP:
+class AioHttpEngine:
     def __init__(
         self,
         cookies: list[SetCookieParam] | None = None,
@@ -17,9 +19,9 @@ def __init__(
         connection_limit: int = 100,
     ):
         """Initialize a new engine instance with given cookie, header, useragent, and proxy"""
-        self.session = None
+        self.session: None | ClientSession = None
         self._cookies = (
-            [(cookie.name, self.get_morsel_cookie(cookie)) for cookie in cookies]
+            [(cookie.name, self._get_morsel_cookie(cookie)) for cookie in cookies]
             if cookies is not None
             else None
         )
@@ -30,29 +32,39 @@ def __init__(
 
         self._connector = TCPConnector(limit_per_host=connection_limit)
 
-        self._proxy = {}
+        self._proxy: dict[Any, Any] = {}
+        self.proxy_dct = proxy
         if proxy:
             proxy_url = f"{proxy.protocol}{proxy.server}:{proxy.port}"
             self._proxy["proxy"] = proxy_url
             if proxy.username and proxy.password:
-                auth = BasicAuth(login=proxy.username, password=proxy.password)
-                self._proxy["proxy_auth"] = auth
+                self._proxy["proxy_auth"] = BasicAuth(
+                    login=proxy.username, password=proxy.password
+                )
 
     @property
-    def cookies(self):
-        return self._cookies
+    def cookies(self) -> list[SetCookieParam] | None:
+        """Return cookies"""
+        cookies = None
+        if self._cookies is not None:
+            cookies = [self._get_cookie(cookie) for _, cookie in self._cookies]
+
+        return cookies
 
     @property
-    def headers(self):
+    def headers(self) -> dict:
+        """Return headers"""
         return self._headers
 
     @property
-    def proxy(self):
-        return self._proxy
+    def proxy(self) -> ProxySetting | None:
+        """Return proxy setting"""
+        return self.proxy_dct
 
-    def get_morsel_cookie(self, cookie: SetCookieParam) -> Morsel:
+    @staticmethod
+    def _get_morsel_cookie(cookie: SetCookieParam) -> Morsel:
         """Converts a SetCookieParam object to an Morsel object."""
-        morsel_obj = Morsel()
+        morsel_obj: Morsel = Morsel()
         morsel_obj.set(cookie.name, cookie.value, cookie.value)
         morsel_obj.update(
             dict(
@@ -66,6 +78,21 @@ def get_morsel_cookie(self, cookie: SetCookieParam) -> Morsel:
         )
         return morsel_obj
 
+    @staticmethod
+    def _get_cookie(cookie: Morsel) -> SetCookieParam:
+        """convert Morsel object to SetCookieParam object"""
+        cookie_params = {
+            "name": cookie.key,
+            "value": cookie.value,
+            "domain": cookie.get("domain"),
+            "path": cookie.get("path"),
+            "expires": cookie.get("expires"),
+            "httpOnly": cookie.get("httponly"),
+            "secure": cookie.get("secure"),
+            "sameSite": cookie.get("samesite"),
+        }
+        return SetCookieParam(**cookie_params)
+
     async def __aenter__(self):
         """Async context manager support for engine -> ENTER"""
         await self.setup()
@@ -79,46 +106,59 @@ async def setup(self, **kwargs) -> None:
         """Set-up up the engine for crawling purpose."""
         self.session = ClientSession(
             connector=self._connector,
-            cookies=self.cookies,
+            cookies=self._cookies,
             headers=self.headers,
             trust_env=True,
             **kwargs,
         )
 
     async def teardown(self) -> None:
         """Cleans up the engine."""
-        await self.session.close()
+        if self.session:
+            await self.session.close()
 
-    async def base(self, url: pydantic.AnyUrl, method: str, data: dict, **kwargs) -> str:
+    async def base(
+        self, url: pydantic.AnyUrl, method: str, data: dict | None, **kwargs
+    ) -> Response | None:
         """Base Method for protocol to retrieve a list of URL."""
-
-        async with self.session.request(
-            method, url, data=data, headers=self.headers, **self.proxy, **kwargs
-        ) as response:
-            return await response.text()
-
-    async def get(self, urls: list[pydantic.AnyUrl], **kwargs) -> list[str] | str:
+        if self.session:
+            async with self.session.request(
+                method, str(url), data=data, headers=self.headers, **self._proxy, **kwargs
+            ) as response:
+                return await self.translate_to_response(response)
+        return None
+
+    async def get(self, urls: list[pydantic.AnyUrl], **kwargs) -> list[Response]:
         """GET HTTP Method for protocol to retrieve a list of URL."""
         tasks = [self.base(url, "GET", None, **kwargs) for url in urls]
         return await asyncio.gather(*tasks)
 
     async def post(
         self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs
-    ) -> list[str] | str:
+    ) -> list[Response]:
         """POST HTTP Method for protocol to crawl a list of URL."""
         tasks = [self.base(url, "POST", data=data, **kwargs) for url, data in zip(urls, datas)]
         return await asyncio.gather(*tasks)
 
     async def put(
         self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs
-    ) -> list[str] | str:
+    ) -> list[Response]:
         """PUT HTTP Method for protocol to crawl a list of URL."""
-        tasks = [self.base(url, "PUT", data=data) for url, data in zip(urls, datas)]
+        tasks = [self.base(url, "PUT", data=data, **kwargs) for url, data in zip(urls, datas)]
         return await asyncio.gather(*tasks)
 
     async def delete(
         self, urls: list[pydantic.AnyUrl], datas: list[dict], **kwargs
-    ) -> list[str] | str:
+    ) -> list[Response]:
         """DELETE HTTP Method for protocol to crawl a list of URL."""
         tasks = [self.base(url, "DELETE", data=data, **kwargs) for url, data in zip(urls, datas)]
         return await asyncio.gather(*tasks)
+
+    async def translate_to_response(self, response_obj: ClientResponse) -> Response:
+        """Translate aiohttp response object to Response object"""
+        return Response(
+            text=await response_obj.text(),
+            status_code=response_obj.status,
+            headers=response_obj.headers,
+            cookie=response_obj.cookies,
+        )
```
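
Taken together, these hunks change every HTTP verb method from returning raw body strings to returning `Response` models. A usage sketch under the new signatures; the URL is a placeholder, and `__aenter__` returning `self` is an assumption since that part of the class is outside the hunk:

```python
import asyncio

from fastcrawler.engine import AioHttpEngine


async def main() -> None:
    # __aenter__/__aexit__ wrap setup()/teardown() around the block.
    async with AioHttpEngine() as engine:
        responses = await engine.get(["https://example.com"])
        for response in responses:
            # base() yields None when the session was never set up, so the
            # items are checked before use even though get() is annotated
            # as list[Response].
            if response is not None:
                print(response.status_code, (response.text or "")[:80])


asyncio.run(main())
```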

fastcrawler/engine/constants.py

Whitespace-only changes.

fastcrawler/engine/base.py renamed to fastcrawler/engine/contracts.py

Lines changed: 18 additions & 8 deletions
```diff
@@ -7,15 +7,15 @@
 
 
 class SetCookieParam(pydantic.BaseModel):
-    name: str
-    value: str
+    name: str = ""
+    value: str = ""
     url: str | None = None
     domain: str = ""
-    path: str | None = None
-    expires: float | None = None
-    httpOnly: bool | None = None
-    secure: bool | None = None
-    sameSite: Literal["Lax", "None", "Strict"] | None = None
+    path: str = ""
+    expires: str = ""
+    httpOnly: str = ""
+    secure: str = ""
+    sameSite: str | Literal["Lax", "None", "Strict"] = ""
 
 
 class ProxySetting(pydantic.BaseModel):
@@ -26,6 +26,13 @@ class ProxySetting(pydantic.BaseModel):
     password: str | None = None
 
 
+class Response(pydantic.BaseModel):
+    text: str | None = None
+    status_code: int | None = None
+    headers: dict | None = None
+    cookie: dict | None = None
+
+
 class EngineProto(Protocol):
     def __init__(
         self,
@@ -34,7 +41,7 @@ def __init__(
         useragent: str | None,
         proxy: ProxySetting | None,
     ):
-        """Initialize a new engine instance with given cookie(s), header(s), useragent, and proxy"""
+        "Initialize a new engine instance with given cookie(s), header(s), useragent, and proxy"
 
     async def __aenter__(self):
         """Async context manager support for engine -> ENTER"""
@@ -62,3 +69,6 @@ async def put(self, urls: list[pydantic.AnyUrl], datas: list[dict]) -> str:
 
     async def delete(self, urls: list[pydantic.AnyUrl], datas: list[dict]) -> str:
         """DELETE HTTP Method for protocol to crawl a list of URL."""
+
+    async def translate_to_response(self, response_obj: type) -> Response:
+        """Translate the response object to a Response object"""
```
