Skip to content

Commit

Permalink
See #13: basic logout url detection
Browse files Browse the repository at this point in the history
For now, links with "logout" in the URL, or with "logout" or "log out" in the
link text, will be skipped. This detection is off when autologin is disabled.
  • Loading branch information
lopuhin committed Apr 14, 2016
1 parent 209d563 commit fd4acea
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 9 deletions.
14 changes: 8 additions & 6 deletions tests/test_autologin.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,16 +106,18 @@ def __init__(self):
super().__init__()
self.putChild(b'hidden', authenticated_text(html(
'<a href="/one">one</a> | '
'<a href="/one?action=logout">one</a> | '
'<a href="/logout1">logout1</a> | '
'<a href="/one?action=l0gout">one</a> | ' # LOGOUT_URL
'<a href="/one?action=logout">one</a> | ' # _looks_like_logout
'<a href="/one?action=lo9out">Logout</a> | ' # _looks_like_logout
'<a href="/l0gout1">l0gout1</a> | '
'<a href="/two">two</a> | '
'<a href="/logout2">logout2</a> | '
'<a href="/l0gout2">l0gout2</a> | '
'<a href="/three">three</a>'
))())
self.putChild(b'one', authenticated_text(html('1'))())
self.putChild(b'logout1', self._Logout())
self.putChild(b'l0gout1', self._Logout())
self.putChild(b'two', authenticated_text(html('2'))())
self.putChild(b'logout2', self._Logout())
self.putChild(b'l0gout2', self._Logout())
self.putChild(b'three', authenticated_text(html('3'))())


Expand All @@ -127,7 +129,7 @@ def settings(self):
'USERNAME': 'admin',
'PASSWORD': 'secret',
'LOGIN_URL': '/login',
'LOGOUT_URL': 'action=logout',
'LOGOUT_URL': 'action=l0gout',
'FILES_STORE': 'file://' + self.tempdir.name,
}

Expand Down
2 changes: 1 addition & 1 deletion tests/test_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class Follow(Resource):
def __init__(self):
super().__init__()
self.putChild(b'', text_resource(
html('<a href="/one">one</a> | <a href="/two">two</a>'))())
html('<a href="/one">one</a> | <a href="/two">Logout</a>'))())
self.putChild(b'one', text_resource('one')())
self.putChild(b'two', text_resource('two')())

Expand Down
36 changes: 34 additions & 2 deletions undercrawler/spiders/base_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,8 @@ def request(url, meta=None, **kwargs):
# Pagination requests are sent twice, but we don't care because
# they'll be filtered out by a dupefilter.
normal_urls = {link.url for link in
self.link_extractor.extract_links(response)}
self.link_extractor.extract_links(response)
if not self._looks_like_logout(link)}
for url in normal_urls:
yield request(url)

Expand Down Expand Up @@ -152,7 +153,9 @@ def download_files(self, response, normal_urls, parent_item):
urls = set()
for extractor in [
self.images_link_extractor, self.files_link_extractor]:
urls.update(link.url for link in extractor.extract_links(response))
urls.update(
link.url for link in extractor.extract_links(response)
if not self._looks_like_logout(link))
urls.difference_update(normal_urls)
for url in urls:
yield self.cdr_item(
Expand Down Expand Up @@ -247,6 +250,11 @@ def files_link_extractor(self):
def handled_search_forms(self):
    """Return the set stored under ``'handled_search_forms'`` in ``self.state``.

    The entry is created as an empty set on first access. Later calls
    return the same set object, so additions made by callers persist in
    ``self.state``.
    """
    crawl_state = self.state
    seen_forms = crawl_state.setdefault('handled_search_forms', set())
    return seen_forms

def _looks_like_logout(self, link):
    """Tell whether ``link`` should be treated as a logout link.

    Always returns False when the ``AUTOLOGIN_ENABLED`` setting is off;
    otherwise the decision is delegated to the module-level
    ``_looks_like_logout`` function.
    """
    autologin_on = self.settings.getbool('AUTOLOGIN_ENABLED')
    # Inside a method body this name resolves to the module-level function,
    # not to this method, so there is no recursion here.
    return _looks_like_logout(link) if autologin_on else False


@contextlib.contextmanager
def _dont_increase_depth(response):
Expand Down Expand Up @@ -310,3 +318,27 @@ def _looks_like_url(txt):
if re.match(r"\w+\.html", txt):
return True
return False


def _looks_like_logout(link):
    """
    Return True if link looks like a logout link.

    A link is treated as a logout link when its anchor text contains
    "logout" or "log out", or when its URL contains "logout". Matching is
    case-insensitive. Runs of whitespace in the anchor text (newlines, tabs,
    non-breaking spaces, repeated spaces) are collapsed to a single space
    before matching, so text that is wrapped across lines in the HTML is
    still detected.

    >>> from scrapy.link import Link
    >>> _looks_like_logout(Link('/logout', text='Log out'))
    True
    >>> _looks_like_logout(Link('/Logout-me', text='Exit'))
    True
    >>> _looks_like_logout(Link('/exit', text='Log out'))
    True
    >>> _looks_like_logout(Link('/exit', text='Logout'))
    True
    >>> _looks_like_logout(Link('/exit', text='Log   out'))
    True
    >>> _looks_like_logout(Link('/exit', text='Exit'))
    False
    """
    # str.split() with no argument splits on any Unicode whitespace, so
    # re-joining the pieces normalises every whitespace run to one space.
    # A missing text (None) is handled like an empty one.
    text = ' '.join((link.text or '').lower().split())
    if 'logout' in text or 'log out' in text:
        return True
    return 'logout' in link.url.lower()

0 comments on commit fd4acea

Please sign in to comment.