Skip to content

Commit 0199697

Browse files
committed
Initial commit
0 parents  commit 0199697

File tree

9 files changed

+231
-0
lines changed

9 files changed

+231
-0
lines changed

.gitignore

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Created by .ignore support plugin (hsz.mobi)
2+
### Python template
3+
# Byte-compiled / optimized / DLL files
4+
__pycache__/
5+
*.py[cod]
6+
*$py.class
7+
8+
# C extensions
9+
*.so
10+
11+
# Distribution / packaging
12+
.Python
13+
env/
14+
build/
15+
develop-eggs/
16+
dist/
17+
downloads/
18+
eggs/
19+
.eggs/
20+
lib/
21+
lib64/
22+
parts/
23+
sdist/
24+
var/
25+
*.egg-info/
26+
.installed.cfg
27+
*.egg
28+
29+
# PyInstaller
30+
# Usually these files are written by a python script from a template
31+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
32+
*.manifest
33+
*.spec
34+
35+
# Installer logs
36+
pip-log.txt
37+
pip-delete-this-directory.txt
38+
39+
# Unit test / coverage reports
40+
htmlcov/
41+
.tox/
42+
.coverage
43+
.coverage.*
44+
.cache
45+
nosetests.xml
46+
coverage.xml
47+
*,cover
48+
.hypothesis/
49+
reports/
50+
51+
# Translations
52+
*.mo
53+
*.pot
54+
55+
# Django stuff:
56+
*.log
57+
local_settings.py
58+
59+
# Flask instance folder
60+
instance/
61+
62+
# Scrapy stuff:
63+
.scrapy
64+
65+
# Sphinx documentation
66+
docs/_build/
67+
.tmpdocs/
68+
69+
# PyBuilder
70+
target/
71+
72+
# IPython Notebook
73+
.ipynb_checkpoints
74+
75+
# pyenv
76+
.python-version
77+
78+
# celery beat schedule file
79+
celerybeat-schedule
80+
81+
# dotenv
82+
.env
83+
84+
# virtualenv
85+
.venv
86+
venv/
87+
venv-jenkins*/
88+
ENV/
89+
90+
# Spyder project settings
91+
.spyderproject
92+
93+
# Rope project settings
94+
.ropeproject
95+
96+
# .idea is the directory for pycharm project files
97+
.idea
98+
99+
# MACOS stuff
100+
.DS_Store

LICENCE

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
2+
Version 2, December 2004
3+
4+
Copyright (C) 2018 Clément Denoix <[email protected]>
5+
6+
Everyone is permitted to copy and distribute verbatim or modified
7+
copies of this license document, and changing it is allowed as long
8+
as the name is changed.
9+
10+
DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
11+
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
12+
13+
0. You just DO WHAT THE FUCK YOU WANT TO.

MANIFEST.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
include requirements.txt

README.md

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
[![PyPI](https://img.shields.io/pypi/v/scrapy_cloudflare_middleware.svg)](https://pypi.python.org/pypi/scrapy_cloudflare_middleware)
2+
3+
## Scrapy "CloudFlare" middleware
4+
5+
A Scrapy middleware to bypass CloudFlare's anti-bot protection, based on [cloudflare-scrape](https://github.com/Anorov/cloudflare-scrape).
6+
7+
### Installation
8+
```
9+
pip install scrapy_cloudflare_middleware
10+
```
11+
12+
### Usage
13+
14+
Add the middleware to your `DOWNLOADER_MIDDLEWARES` settings
15+
16+
```python
17+
DOWNLOADER_MIDDLEWARES = {
18+
# The priority of 560 is important, because we want this middleware to kick in just before the scrapy built-in `RetryMiddleware`.
19+
'scrapy_cloudflare_middleware.middlewares.CloudFlareMiddleware': 560
20+
}
21+
```
22+
23+
Done.
24+
Happy scraping !
25+
26+
27+
28+
29+
30+

requirements.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Scrapy>=1.0
2+
cfscrape>=1.9.4

scrapy_cloudflare_middleware/__init__.py

Whitespace-only changes.
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""This module contains the ``CloudFlareMiddleware``"""
2+
3+
from cfscrape import get_tokens
4+
5+
import logging
6+
7+
8+
class CloudFlareMiddleware:
    """Scrapy downloader middleware to bypass CloudFlare's anti-bot protection.

    When a response looks like CloudFlare's JavaScript challenge page, the
    clearance cookies are fetched with ``cfscrape.get_tokens`` and the
    original request is re-scheduled with those cookies attached.
    """

    @staticmethod
    def is_cloudflare_challenge(response):
        """Test if the given response contains CloudFlare's anti-bot protection.

        Parameters
        ----------
        response
            The response to inspect. Its ``status``, ``headers`` and ``text``
            attributes are read.

        Returns
        -------
        bool
            ``True`` when the status is 503, the ``Server`` header starts with
            ``cloudflare`` and the body contains both challenge markers
            (``jschl_vc`` and ``jschl_answer``); ``False`` otherwise.

        """

        return (
            response.status == 503
            # The header value is compared as bytes, so the fallback must be
            # bytes too: a missing ``Server`` header then simply yields
            # ``False`` instead of a str/bytes ``TypeError``.
            and response.headers.get('Server', b'').startswith(b'cloudflare')
            and 'jschl_vc' in response.text
            and 'jschl_answer' in response.text
        )

    def process_response(self, request, response, spider):
        """Handle a Scrapy response.

        Parameters
        ----------
        request
            The request that produced ``response``.
        response
            The response being processed.
        spider
            The running spider; its ``USER_AGENT`` setting is forwarded to
            ``get_tokens``.

        Returns
        -------
        The untouched ``response`` when it is not a CloudFlare challenge,
        otherwise the same ``request`` updated with the CloudFlare cookies so
        that it gets downloaded again.

        """

        if not self.is_cloudflare_challenge(response):
            return response

        logger = logging.getLogger('cloudflaremiddleware')

        logger.debug(
            'Cloudflare protection detected on %s, trying to bypass...',
            response.url
        )

        # ``get_tokens`` returns a pair; only the cookie mapping is needed
        # here (the second element is presumably the user agent - unused).
        cloudflare_tokens, __ = get_tokens(
            request.url,
            user_agent=spider.settings.get('USER_AGENT')
        )

        logger.debug(
            'Successfully bypassed the protection for %s, re-scheduling the request',
            response.url
        )

        request.cookies.update(cloudflare_tokens)
        request.priority = 99999
        # This request has already been seen by the scheduler's duplicate
        # filter; without ``dont_filter`` the re-scheduled request would be
        # silently dropped instead of being downloaded again.
        request.dont_filter = True

        return request

setup.cfg

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
[metadata]
2+
name = scrapy_cloudflare_middleware
3+
version = 0.0.1
4+
url = https://github.com/clemfromspace/scrapy-cloudflare-middleware
5+
license = DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
6+
description = A Scrapy Middleware to bypass the CloudFlare's anti-bot protection
7+
long_description = file:README.md
8+
9+
[options]
10+
include_package_data = true

setup.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
"""This module contains the packaging routine for the ``scrapy-algolia-exporter`` package"""
2+
3+
from setuptools import setup, find_packages
4+
from pip.download import PipSession
5+
from pip.req import parse_requirements
6+
7+
8+
def get_requirements(source):
    """Get the requirements from the given ``source``

    The file is parsed directly instead of going through pip's private
    ``parse_requirements`` API, which is not a supported interface and was
    moved in pip 10. Blank lines, comments and pip option lines (anything
    starting with ``-``, e.g. ``-r other.txt``) are skipped.

    Parameters
    ----------
    source: str
        The filename containing the requirements

    Returns
    -------
    list of str
        The requirement specifiers, one per non-empty line, in file order

    """
    import re

    requirements = []

    with open(source) as requirements_file:
        for raw_line in requirements_file:
            # Drop full-line and trailing comments. A ``#`` only starts a
            # comment at the beginning of the line or after whitespace, so
            # URL fragments such as ``#egg=name`` are preserved.
            requirement = re.sub(r'(^|\s+)#.*$', '', raw_line).strip()

            if not requirement or requirement.startswith('-'):
                continue

            requirements.append(requirement)

    return requirements
21+
22+
# Static metadata (name, version, description, ...) is declared in setup.cfg;
# only the values that have to be computed at build time are passed here.
setup(
    install_requires=get_requirements('requirements.txt'),
    packages=find_packages(),
)
26+
27+

0 commit comments

Comments
 (0)