diff --git a/src/http_crawler/__init__.py b/src/http_crawler/__init__.py index fd2f990..8730875 100644 --- a/src/http_crawler/__init__.py +++ b/src/http_crawler/__init__.py @@ -9,7 +9,7 @@ __version__ = '0.1.2' -def crawl(base_url, follow_external_links=True): +def crawl(base_url, follow_external_links=True, follow_redirects=True): base_netloc = urlparse(base_url).netloc seen = set([base_url]) @@ -19,7 +19,10 @@ def crawl(base_url, follow_external_links=True): while todo: url = todo.pop() - rsp = session.get(url) + rsp = session.get(url, allow_redirects=follow_redirects) + + if rsp.status_code // 100 == 3: + continue yield rsp diff --git a/tests/redirect-site/index.html b/tests/redirect-site/index.html new file mode 100644 index 0000000..0deca7c --- /dev/null +++ b/tests/redirect-site/index.html @@ -0,0 +1,7 @@ + + +

This is a test page

+ page-1 + page-2 + + diff --git a/tests/redirect-site/redirect-new-path/page-1.html b/tests/redirect-site/redirect-new-path/page-1.html new file mode 100644 index 0000000..e965047 --- /dev/null +++ b/tests/redirect-site/redirect-new-path/page-1.html @@ -0,0 +1 @@ +Hello diff --git a/tests/redirect-site/redirect-new-path/page-2.html b/tests/redirect-site/redirect-new-path/page-2.html new file mode 100644 index 0000000..216e97c --- /dev/null +++ b/tests/redirect-site/redirect-new-path/page-2.html @@ -0,0 +1 @@ +World diff --git a/tests/test_http_crawler.py b/tests/test_http_crawler.py index f55ece9..a8968ba 100644 --- a/tests/test_http_crawler.py +++ b/tests/test_http_crawler.py @@ -16,14 +16,28 @@ def serve(): serving = True + class HTTPHandler(SimpleHTTPRequestHandler): + def do_GET(self): + path = self.path.split('/') + if path[-2] == 'redirect-old-path': + path[-2] = 'redirect-new-path' + newpath = '/'.join(path) + self.send_response(301) + self.send_header('Location', newpath) + self.send_header('content-type', 'text/html') + self.end_headers() + else: + return super(HTTPHandler, self).do_GET() + def _serve(dir, port): base_dir = os.path.join('tests', dir) os.chdir(base_dir) - server = HTTPServer(('', port), SimpleHTTPRequestHandler) + server = HTTPServer(('', port), HTTPHandler) server.serve_forever() Process(target=_serve, args=('site', 8000), daemon=True).start() Process(target=_serve, args=('external-site', 8001), daemon=True).start() + Process(target=_serve, args=('redirect-site', 8002), daemon=True).start() def test_crawl(): @@ -112,3 +126,30 @@ def test_extract_urls_from_css(): '/assets/somefont.eot', '/assets/somefont.ttf', } + + +def test_with_redirect(): + serve() + + rsps = list(http_crawler.crawl('http://localhost:8002/')) + actual_urls = set([resp.url for resp in rsps]) + expected_urls = set([ + 'http://localhost:8002/', + 'http://localhost:8002/redirect-new-path/page-2.html', + 'http://localhost:8002/redirect-new-path/page-1.html' + ]) + + assert actual_urls == expected_urls + + +def test_without_redirect(): + serve() + + rsps = list(http_crawler.crawl('http://localhost:8002/', + follow_redirects=False)) + actual_urls = set([resp.url for resp in rsps]) + expected_urls = set([ + 'http://localhost:8002/', + ]) + + assert actual_urls == expected_urls