Skip to content

Commit 73eb8ac

Browse files
committed
allows enabling of cookies + more robust link rejection
1 parent 591cf54 commit 73eb8ac

1 file changed

Lines changed: 8 additions & 1 deletion

File tree

lib/arachnid.rb

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# encoding: utf-8
22

3+
require 'tempfile'
34
require 'typhoeus'
45
require 'bloomfilter-rb'
56
require 'nokogiri'
@@ -16,6 +17,7 @@ def initialize(url, options = {})
1617
@exclude_urls_with_hash = options[:exclude_urls_with_hash]
1718
@exclude_urls_with_extensions = options[:exclude_urls_with_extensions]
1819
@proxy_list = options[:proxy_list]
20+
@cookies_enabled = options[:enable_cookies]
1921
end
2022

2123
def crawl(options = {})
@@ -47,6 +49,11 @@ def crawl(options = {})
4749
options[:proxy] = "#{ip}:#{port}" unless ip.nil?
4850
options[:proxy_username] = user unless user.nil?
4951
options[:proxy_password] = pass unless pass.nil?
52+
if @cookies_enabled
53+
cookie_file = Tempfile.new 'cookies'
54+
options[:cookiefile] = cookie_file
55+
options[:cookiejar] = cookie_file
56+
end
5057

5158
request = Typhoeus::Request.new(q, options)
5259

@@ -56,7 +63,7 @@ def crawl(options = {})
5663

5764
links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href').map(&:to_s)
5865
links.each do |link|
59-
next if link.match(/^\(|^javascript:|^mailto:/)
66+
next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$/)
6067
begin
6168

6269
if internal_link?(link, response.effective_url) &&

0 commit comments

Comments
 (0)