Initial commit

clemfromspace · clemfromspace · commit 01996972073f · 2018-02-10T19:45:57.000+09:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,100 @@
+# Created by .ignore support plugin (hsz.mobi)
+### Python template
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*,cover
+.hypothesis/
+reports/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+
+# Flask instance folder
+instance/
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+.tmpdocs/
+
+# PyBuilder
+target/
+
+# IPython Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# dotenv
+.env
+
+# virtualenv
+.venv
+venv/
+venv-jenkins*/
+ENV/
+
+# Spyder project settings
+.spyderproject
+
+# Rope project settings
+.ropeproject
+
+# .idea is the directory for pycharm project files
+.idea
+
+# MACOS stuff
+.DS_Store
diff --git a/LICENCE b/LICENCE
@@ -0,0 +1,13 @@
+            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+                    Version 2, December 2004
+
+ Copyright (C) 2018 Clément Denoix <clement.denoix@gmail.com>
+
+ Everyone is permitted to copy and distribute verbatim or modified
+ copies of this license document, and changing it is allowed as long
+ as the name is changed.
+
+            DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. You just DO WHAT THE FUCK YOU WANT TO.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include requirements.txt
diff --git a/README.md b/README.md
@@ -0,0 +1,30 @@
+[![PyPI](https://img.shields.io/pypi/v/scrapy_cloudflare_middleware.svg)](https://pypi.python.org/pypi/scrapy_cloudflare_middleware)
+
+## Scrapy "CloudFlare" middleware
+
+A Scrapy middleware to bypass CloudFlare's anti-bot protection, based on [cloudflare-scrape](https://github.com/Anorov/cloudflare-scrape).
+
+### Installation
+```
+pip install scrapy_cloudflare_middleware
+```
+
+### Usage
+
+Add the middleware to your `DOWNLOADER_MIDDLEWARES` settings
+
+```python
+DOWNLOADER_MIDDLEWARES = {
+    # The priority of 560 is important, because we want this middleware to kick in just before the scrapy built-in `RetryMiddleware`.
+    'scrapy_cloudflare_middleware.middlewares.CloudFlareMiddleware': 560
+}
+```
+
+Done.
+Happy scraping !
+
+
+
+
+
+
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,2 @@
+Scrapy>=1.0
+cfscrape>=1.9.4
diff --git a/scrapy_cloudflare_middleware/__init__.py b/scrapy_cloudflare_middleware/__init__.py
diff --git a/scrapy_cloudflare_middleware/middlewares.py b/scrapy_cloudflare_middleware/middlewares.py
@@ -0,0 +1,48 @@
+"""This module contains the ``CloudFlareMiddleware``"""
+
+from cfscrape import get_tokens
+
+import logging
+
+
+class CloudFlareMiddleware:
+    """Scrapy downloader middleware to bypass CloudFlare's anti-bot protection
+
+    Responses recognised as a CloudFlare challenge are solved with
+    ``get_tokens`` and the originating request is handed back to Scrapy with
+    the returned tokens added to its cookies.
+    """
+
+    @staticmethod
+    def is_cloudflare_challenge(response):
+        """Test if the given response contains the cloudflare's anti-bot protection
+
+        Parameters
+        ----------
+        response
+            The response to inspect; its ``status``, ``headers`` and ``text``
+            are read.
+
+        Returns
+        -------
+        bool
+            ``True`` for a 503 response whose ``Server`` header starts with
+            ``cloudflare`` and whose text contains both the ``jschl_vc`` and
+            the ``jschl_answer`` markers.
+
+        """
+
+        # The fallback has to be bytes, like the header value it stands in
+        # for: calling ``str.startswith`` with a bytes prefix raises
+        # ``TypeError``, which would crash on any 503 without a Server header.
+        return (
+            response.status == 503
+            and response.headers.get('Server', b'').startswith(b'cloudflare')
+            and 'jschl_vc' in response.text
+            and 'jschl_answer' in response.text
+        )
+
+    def process_response(self, request, response, spider):
+        """Handle a Scrapy response
+
+        Parameters
+        ----------
+        request
+            The request that produced ``response``.
+        response
+            The downloaded response.
+        spider
+            The running spider; its ``USER_AGENT`` setting is forwarded to
+            ``get_tokens``.
+
+        Returns
+        -------
+        The untouched ``response`` when it is not a CloudFlare challenge,
+        otherwise ``request`` (updated in place with the tokens returned by
+        ``get_tokens``) so that it gets scheduled again.
+
+        """
+
+        if not self.is_cloudflare_challenge(response):
+            return response
+
+        logger = logging.getLogger('cloudflaremiddleware')
+
+        logger.debug(
+            'Cloudflare protection detected on %s, trying to bypass...',
+            response.url
+        )
+
+        cloudflare_tokens, __ = get_tokens(
+            request.url,
+            user_agent=spider.settings.get('USER_AGENT')
+        )
+
+        logger.debug(
+            'Successfully bypassed the protection for %s, re-scheduling the request',
+            response.url
+        )
+
+        request.cookies.update(cloudflare_tokens)
+        request.priority = 99999
+        # The request keeps its URL, so the scheduler's duplicate filter would
+        # drop it as already seen unless filtering is disabled for it.
+        request.dont_filter = True
+
+        return request
diff --git a/setup.cfg b/setup.cfg
@@ -0,0 +1,10 @@
+[metadata]
+name = scrapy_cloudflare_middleware
+version = 0.0.1
+url = https://github.com/clemfromspace/scrapy-cloudflare-middleware
+license = DO WHAT THE FUCK YOU WANT TO PUBLIC LICENSE
+description = A Scrapy Middleware to bypass the CloudFlare's anti-bot protection
+long_description = file:README.md
+
+[options]
+include_package_data = true
diff --git a/setup.py b/setup.py
@@ -0,0 +1,27 @@
+"""This module contains the packaging routine for the ``scrapy_cloudflare_middleware`` package"""
+
+from setuptools import setup, find_packages
+from pip.download import PipSession
+from pip.req import parse_requirements
+
+
+def get_requirements(source):
+    """Get the requirements from the given ``source``
+
+    The file is read directly rather than through pip's requirement parser,
+    which is an internal pip API and not meant to be imported. Blank lines,
+    comments and pip option lines (those starting with ``-``) are skipped.
+
+    Parameters
+    ----------
+    source: str
+        The filename containing the requirements
+
+    Returns
+    -------
+    list of str
+        The requirement specifiers, in the order they appear in the file
+
+    """
+
+    requirements = []
+
+    with open(source) as requirements_file:
+        for line in requirements_file:
+            # Drop a trailing " # comment" as well as surrounding whitespace
+            requirement = line.partition(' #')[0].strip()
+
+            if requirement and not requirement.startswith(('#', '-')):
+                requirements.append(requirement)
+
+    return requirements
+
+# Only the values that have to be computed are passed here; everything else
+# is left to setuptools' own configuration handling.
+setup(
+    install_requires=get_requirements('requirements.txt'),
+    packages=find_packages(),
+)
+
+