File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 11# encoding: utf-8
22
3+ require 'tempfile'
34require 'typhoeus'
45require 'bloomfilter-rb'
56require 'nokogiri'
@@ -16,6 +17,7 @@ def initialize(url, options = {})
1617 @exclude_urls_with_hash = options [ :exclude_urls_with_hash ]
1718 @exclude_urls_with_extensions = options [ :exclude_urls_with_extensions ]
1819 @proxy_list = options [ :proxy_list ]
20+ @cookies_enabled = options [ :enable_cookies ]
1921 end
2022
2123 def crawl ( options = { } )
@@ -47,6 +49,11 @@ def crawl(options = {})
4749 options [ :proxy ] = "#{ ip } :#{ port } " unless ip . nil?
4850 options [ :proxy_username ] = user unless user . nil?
4951 options [ :proxy_password ] = pass unless pass . nil?
52+ if @cookies_enabled
53+ cookie_file = Tempfile . new 'cookies'
54+ options [ :cookiefile ] = cookie_file
55+ options [ :cookiejar ] = cookie_file
56+ end
5057
5158 request = Typhoeus ::Request . new ( q , options )
5259
@@ -56,7 +63,7 @@ def crawl(options = {})
5663
5764 links = Nokogiri ::HTML . parse ( response . body ) . xpath ( './/a/@href' ) . map ( &:to_s )
5865 links . each do |link |
59- next if link . match ( /^\( |^javascript:|^mailto:/ )
66+ next if link . match ( /^\( |^javascript:|^mailto:|^#|^ \s *$ / )
6067 begin
6168
6269 if internal_link? ( link , response . effective_url ) &&
You can’t perform that action at this time.
0 commit comments