From ed065c86e895fa8cb68acec4a29c2fe2b24e81af Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Tue, 13 Jan 2015 23:49:11 +0100
Subject: [PATCH 01/18] rejects hrefs that contain javascript code

---
 lib/arachnid.rb | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index c0442a0..89e4758 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -155,6 +155,7 @@ def ignore_extensions(url)
 	end
 
 	def sanitize_link(url)
+		return false if url.start_with? 'javascript'
 		begin
 			return url.gsub(/\s+/, "%20")
 		rescue
@@ -171,4 +172,4 @@ def make_absolute( href, root )
 	  	end
 	end
 
-end
\ No newline at end of file
+end

From 253f69c269ddcdc75b65dd64aba5c9dcf60d8673 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Wed, 14 Jan 2015 00:00:27 +0100
Subject: [PATCH 02/18] simplifies a few lines

---
 README.md       |  2 +-
 lib/arachnid.rb | 31 +++++--------------------------
 2 files changed, 6 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index 1342636..3ca3b26 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,7 @@ Arachnid was built to run on Ruby 1.9.2 I'll be honest, I haven't really tested
 
     require 'arachnid'
 
-    Arachnid.new("http://domain.com", {:exclude_urls_with_images => true}).crawl({:threads => 2, :max_urls => 1000}) do |response|
+    Arachnid.new("http://domain.com", {:exclude_urls_with_extensions => ['.jpg']}).crawl({:threads => 2, :max_urls => 1000}) do |response|
       
         #"response" is just a Typhoeus response object.
         puts response.effective_url
diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index 89e4758..b3dfd6c 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -52,7 +52,7 @@ def crawl(options = {})
 						links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
 
 						links.each do |link|
-							if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
+							if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && extension_not_ignored?(link))
 								
 								sanitized_link = sanitize_link(split_url_at_hash(link))
 								if(sanitized_link)
@@ -110,48 +110,27 @@ def parse_domain(url)
 	end
 
 	def internal_link?(url, effective_url)
-
 		absolute_url = make_absolute(url, effective_url)
-
 		parsed_url = parse_domain(absolute_url)
-		if(@domain == parsed_url)
-			return true
-		else
-			return false
-		end
+		@domain == parsed_url
 	end
 
 	def split_url_at_hash(url)
 		return url.to_s unless @split_url_at_hash
-
 		return url.to_s.split('#')[0]
-
 	end
 
 	def no_hash_in_url?(url)
 		return true unless @exclude_urls_with_hash
 
-		if(url.to_s.scan(/#/).size > 0)
-			return false
-		else
-			return true
-		end
+		! url.to_s.scan(/#/).size > 0
 	end
 
-	def ignore_extensions(url)
+	def extension_not_ignored?(url)
 		return true if url.to_s.length == 0
 		return true unless @exclude_urls_with_extensions
 
-		not_found = true
-
-		@exclude_urls_with_extensions.each do |e|
-			if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
-				not_found = false
-				puts "#{e} Found At URL: #{url}" if @debug
-			end
-		end
-
-		return not_found
+		@exclude_urls_with_extensions.find { |e| url.to_s.downcase.end_with? e.to_s.downcase }.nil?
 	end
 
 	def sanitize_link(url)

From 3d57caf724d34722d20c782419ad6c00ba976194 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Wed, 14 Jan 2015 00:06:10 +0100
Subject: [PATCH 03/18] adds a few tests

---
 test/arachnid_test.rb | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 test/arachnid_test.rb

diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb
new file mode 100644
index 0000000..93e9bc4
--- /dev/null
+++ b/test/arachnid_test.rb
@@ -0,0 +1,18 @@
+require_relative '../lib/arachnid'
+
+require "minitest/autorun"
+
+class ArachnidTest < Minitest::Test
+  def test_ignores_specified_extensions
+    arachnid = Arachnid.new 'example.com', exclude_urls_with_extensions: ['.jpg']
+
+    assert arachnid.extension_not_ignored?('http://example.org/example')
+    refute arachnid.extension_not_ignored?('http://example.org/example.jpg')
+  end
+
+  def test_does_not_sanitize_hrefs_with_javascript_code
+    arachnid = Arachnid.new 'example.com', exclude_urls_with_extensions: ['.jpg']
+
+    refute arachnid.sanitize_link('javascript:void(0)')
+  end
+end

From dee37ba1a29d511b802e24dd2e3e1bd5f2cf15da Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Thu, 15 Jan 2015 09:55:27 +0100
Subject: [PATCH 04/18] rejects hrefs starting with '(' or mailto

---
 lib/arachnid.rb       |  2 +-
 test/arachnid_test.rb | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index b3dfd6c..3fbe2dd 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -134,7 +134,7 @@ def extension_not_ignored?(url)
 	end
 
 	def sanitize_link(url)
-		return false if url.start_with? 'javascript'
+		return false if url.match(/^javascript|^\(|^mailto/)
 		begin
 			return url.gsub(/\s+/, "%20")
 		rescue
diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb
index 93e9bc4..a9cd13c 100644
--- a/test/arachnid_test.rb
+++ b/test/arachnid_test.rb
@@ -10,9 +10,19 @@ def test_ignores_specified_extensions
     refute arachnid.extension_not_ignored?('http://example.org/example.jpg')
   end
 
-  def test_does_not_sanitize_hrefs_with_javascript_code
-    arachnid = Arachnid.new 'example.com', exclude_urls_with_extensions: ['.jpg']
+  def test_sanitizes_a_normal_href
+    arachnid = Arachnid.new 'example.com'
+
+    assert arachnid.sanitize_link('http://example.com/page.html')
+  end
+
+  def test_does_not_sanitize_hrefs_with_javascript_or_mailto
+    arachnid = Arachnid.new 'example.com'
 
     refute arachnid.sanitize_link('javascript:void(0)')
+    refute arachnid.sanitize_link('(javascript:void(0))')
+    refute arachnid.sanitize_link('mailto:info@example.com')
+    refute arachnid.sanitize_link('(mailto:info@example.com)')
   end
+
 end

From 9d7facc254cbaf954b3329385de3ea2a7c9ea548 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Thu, 15 Jan 2015 14:33:48 +0100
Subject: [PATCH 05/18] upgrades typhoeus dependency

---
 arachnid.gemspec | 4 ++--
 lib/arachnid.rb  | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arachnid.gemspec b/arachnid.gemspec
index bae3659..4255248 100644
--- a/arachnid.gemspec
+++ b/arachnid.gemspec
@@ -9,8 +9,8 @@ Gem::Specification.new do |s|
   s.files       = ["lib/arachnid.rb"]
   s.homepage    = 'https://github.com/dchuk/Arachnid'
 
-  s.add_dependency('typhoeus',    '0.3.2')
+  s.add_dependency('typhoeus',    '0.7.0')
   s.add_dependency('bloomfilter-rb',    '2.1.1')
   s.add_dependency('nokogiri',    '1.5.0')
   s.add_dependency('domainatrix')
-end
\ No newline at end of file
+end
diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index 3fbe2dd..16852a8 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -41,9 +41,9 @@ def crawl(options = {})
 				begin
 					ip,port,user,pass = grab_proxy
  
-					request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
-					request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
-					request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
+					request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true) if ip == nil
+					request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
+					request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
 
 					request.on_complete do |response|
 

From d3e56c8089d8a7817eb7aa55727d53f2945933f0 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Sat, 17 Jan 2015 01:26:12 +0100
Subject: [PATCH 06/18] makes sanitization more robust

---
 lib/arachnid.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index 16852a8..6ed0903 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -134,7 +134,7 @@ def extension_not_ignored?(url)
 	end
 
 	def sanitize_link(url)
-		return false if url.match(/^javascript|^\(|^mailto/)
+		return false if url.strip.match(/^javascript:|^\(|^mailto:|^about:/)
 		begin
 			return url.gsub(/\s+/, "%20")
 		rescue

From 25aafc1f1f705361dd992f7c22fb4f9f51d156d3 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Sat, 17 Jan 2015 02:03:45 +0100
Subject: [PATCH 07/18] prevents explosion of global queue

---
 lib/arachnid.rb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index 6ed0903..8235d47 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -34,9 +34,9 @@ def crawl(options = {})
 		@global_queue << @start_url
 		
 		while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls))
-			temp_queue = @global_queue
-
-			temp_queue.each do |q|
+			(0...@global_queue.length).each do |i|
+        next if i >= @global_queue.length
+        q = @global_queue[i]
 
 				begin
 					ip,port,user,pass = grab_proxy
@@ -59,7 +59,7 @@ def crawl(options = {})
 
 									absolute_link = make_absolute(sanitized_link, response.effective_url)
 									if(absolute_link)
-										@global_queue << absolute_link
+										(@global_queue << absolute_link) unless @global_queue.include?(absolute_link)
 									end
 								end
 							end

From 591cf54db9fdf1d2c33d8b38b6434937473b818b Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Sat, 17 Jan 2015 11:56:42 +0100
Subject: [PATCH 08/18] fixes bugs + refactors

---
 lib/arachnid.rb       | 200 +++++++++++++++++++-----------------------
 test/arachnid_test.rb |  17 ++--
 2 files changed, 100 insertions(+), 117 deletions(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index 8235d47..adc62d0 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -8,147 +8,131 @@
 
 class Arachnid
 
-	def initialize(url, options = {})
-		@start_url = url
-		@domain = parse_domain(url)
+  def initialize(url, options = {})
+    @start_url = url
+    @debug = options[:debug]
+    @domain = parse_domain(url)
+    @split_url_at_hash = options[:split_url_at_hash]
+    @exclude_urls_with_hash = options[:exclude_urls_with_hash]
+    @exclude_urls_with_extensions = options[:exclude_urls_with_extensions]
+    @proxy_list = options[:proxy_list]
+  end
 
-		@split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
-		@exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
-		@exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
-		@proxy_list = options[:proxy_list] ? options[:proxy_list] : false
-		
-		@debug = options[:debug] ? options[:debug] : false
-	end
+  def crawl(options = {})
+    threads = options[:threads] || 1
+    max_urls = options[:max_urls]
 
-	def crawl(options = {})
+    @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
+    @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+    @global_queue = []
 
-		#defaults to 1 thread so people don't do a stupid amount of crawling on unsuspecting domains
-		threads = options[:threads] ? options[:threads] : 1
-		#defaults to -1 so it will always keep running until it runs out of urls
-		max_urls = options[:max_urls] ? options[:max_urls] : nil
+    @global_queue << @start_url
 
-		@hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
-		@global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
-		@global_queue = []
+    while not @global_queue.empty?
 
-		@global_queue << @start_url
-		
-		while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls))
-			(0...@global_queue.length).each do |i|
-        next if i >= @global_queue.length
-        q = @global_queue[i]
+      @global_queue.size.times do
+        q = @global_queue.shift
 
-				begin
-					ip,port,user,pass = grab_proxy
- 
-					request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true) if ip == nil
-					request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
-					request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
+        if !max_urls.nil? && @global_visited.size >= max_urls
+          @global_queue = []
+          break
+        end
 
-					request.on_complete do |response|
+        @global_visited.insert(q)
+        puts "Processing link: #{q}" if @debug
 
-						yield response
+        ip,port,user,pass = grab_proxy
 
-						links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
+        options = {timeout: 10000, followlocation:true}
+        options[:proxy] = "#{ip}:#{port}" unless ip.nil?
+        options[:proxy_username] = user unless user.nil?
+        options[:proxy_password] = pass unless pass.nil?
 
-						links.each do |link|
-							if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && extension_not_ignored?(link))
-								
-								sanitized_link = sanitize_link(split_url_at_hash(link))
-								if(sanitized_link)
+        request = Typhoeus::Request.new(q, options)
 
-									absolute_link = make_absolute(sanitized_link, response.effective_url)
-									if(absolute_link)
-										(@global_queue << absolute_link) unless @global_queue.include?(absolute_link)
-									end
-								end
-							end
-						end
+        request.on_complete do |response|
 
-					end
+          yield response
 
-					@hydra.queue request
+          links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href').map(&:to_s)
+          links.each do |link|
+            next if link.match(/^\(|^javascript:|^mailto:/)
+            begin
 
-				rescue URI::InvalidURIError, NoMethodError => e
-					puts "Exception caught: #{e}" if @debug == true
-				end
+              if internal_link?(link, response.effective_url) && 
+                !@global_visited.include?(make_absolute(link, response.effective_url)) &&
+                no_hash_in_url?(link) &&
+                extension_not_ignored?(link)
 
-				@global_visited.insert(q)
-				@global_queue.delete(q)
+                absolute_link = make_absolute(sanitize_link(split_url_at_hash(link)), response.effective_url)
+                @global_queue << absolute_link unless @global_queue.include?(absolute_link)
+              end
 
-			end
+            rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
+              $stderr.puts "#{e.class}: ignored link #{link}"
+            end
+          end
 
-			@hydra.run
+        end
 
-		end
+        @hydra.queue request
 
-	end
+      end
+      puts "Running the hydra" if @debug
+      @hydra.run
+    end
 
-	def grab_proxy
+  end
 
-		return nil unless @proxy_list
+  def grab_proxy
+    return nil unless @proxy_list
 
-		return @proxy_list.sample.split(':')
+    @proxy_list.sample.split(':')
+  end
 
-	end
+  def parse_domain(url)
+    puts "Parsing URL: #{url}" if @debug
 
-	def parse_domain(url)
-		puts "Parsing URL: #{url}" if @debug
+    parsed_domain = Domainatrix.parse(url)
 
-		begin
-			parsed_domain = Domainatrix.parse(url)
+    if(parsed_domain.subdomain != "")
+      parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
+    else
+      parsed_domain.domain + '.' + parsed_domain.public_suffix
+    end
+  end
 
-			if(parsed_domain.subdomain != "")
-				parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
-			else
-				parsed_domain.domain + '.' + parsed_domain.public_suffix
-			end
-		rescue NoMethodError, Addressable::URI::InvalidURIError => e
-			puts "URL Parsing Exception (#{url}): #{e}"
-			return nil
-		end
-	end
+  def internal_link?(url, effective_url)
+    absolute_url = make_absolute(url, effective_url)
+    parsed_url = parse_domain(absolute_url)
+    @domain == parsed_url
+  end
 
-	def internal_link?(url, effective_url)
-		absolute_url = make_absolute(url, effective_url)
-		parsed_url = parse_domain(absolute_url)
-		@domain == parsed_url
-	end
+  def split_url_at_hash(url)
+    return url unless @split_url_at_hash
 
-	def split_url_at_hash(url)
-		return url.to_s unless @split_url_at_hash
-		return url.to_s.split('#')[0]
-	end
+    url.split('#')[0]
+  end
 
-	def no_hash_in_url?(url)
-		return true unless @exclude_urls_with_hash
+  def no_hash_in_url?(url)
+    !@exclude_urls_with_hash || url.scan(/#/).empty?
+  end
 
-		! url.to_s.scan(/#/).size > 0
-	end
+  def extension_not_ignored?(url)
+    return true if url.empty?
+    return true unless @exclude_urls_with_extensions
 
-	def extension_not_ignored?(url)
-		return true if url.to_s.length == 0
-		return true unless @exclude_urls_with_extensions
+    @exclude_urls_with_extensions.find { |e| url.downcase.end_with? e.downcase }.nil?
+  end
 
-		@exclude_urls_with_extensions.find { |e| url.to_s.downcase.end_with? e.to_s.downcase }.nil?
-	end
+  def sanitize_link(url)
+    url.gsub(/\s+/, "%20")
+  end
 
-	def sanitize_link(url)
-		return false if url.strip.match(/^javascript:|^\(|^mailto:|^about:/)
-		begin
-			return url.gsub(/\s+/, "%20")
-		rescue
-			return false
-		end
-	end
+  def make_absolute( href, root )
+    URI.parse(root).merge(URI.parse(split_url_at_hash(href.gsub(/\s+/, "%20")))).to_s
+  end
 
-	def make_absolute( href, root )
+end
 
-		begin
-	  		URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
-	  	rescue URI::InvalidURIError, URI::InvalidComponentError => e
-	  		return false
-	  	end
-	end
 
-end
diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb
index a9cd13c..7522518 100644
--- a/test/arachnid_test.rb
+++ b/test/arachnid_test.rb
@@ -10,19 +10,18 @@ def test_ignores_specified_extensions
     refute arachnid.extension_not_ignored?('http://example.org/example.jpg')
   end
 
-  def test_sanitizes_a_normal_href
+  def test_parses_domain
     arachnid = Arachnid.new 'example.com'
 
-    assert arachnid.sanitize_link('http://example.com/page.html')
+    assert_equal arachnid.parse_domain('www.example.com/link'), 'www.example.com'
   end
 
-  def test_does_not_sanitize_hrefs_with_javascript_or_mailto
-    arachnid = Arachnid.new 'example.com'
-
-    refute arachnid.sanitize_link('javascript:void(0)')
-    refute arachnid.sanitize_link('(javascript:void(0))')
-    refute arachnid.sanitize_link('mailto:info@example.com')
-    refute arachnid.sanitize_link('(mailto:info@example.com)')
+  def test_hash_detection
+    arachnid = Arachnid.new 'example.com', exclude_urls_with_hash: true
+    refute arachnid.no_hash_in_url? 'http://www.example.com/link#1'
+    
+    arachnid = Arachnid.new 'example.com', exclude_urls_with_hash: false
+    assert arachnid.no_hash_in_url? 'http://www.example.com/link#1'
   end
 
 end

From 73eb8aca86418d635b6991750ec6e13d21fce9ff Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Sat, 17 Jan 2015 14:25:00 +0100
Subject: [PATCH 09/18] allows enabling of cookies + more robust link rejection

---
 lib/arachnid.rb | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index adc62d0..a349900 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -1,5 +1,6 @@
 # encoding: utf-8
 
+require 'tempfile'
 require 'typhoeus'
 require 'bloomfilter-rb'
 require 'nokogiri'
@@ -16,6 +17,7 @@ def initialize(url, options = {})
     @exclude_urls_with_hash = options[:exclude_urls_with_hash]
     @exclude_urls_with_extensions = options[:exclude_urls_with_extensions]
     @proxy_list = options[:proxy_list]
+    @cookies_enabled = options[:enable_cookies]
   end
 
   def crawl(options = {})
@@ -47,6 +49,11 @@ def crawl(options = {})
         options[:proxy] = "#{ip}:#{port}" unless ip.nil?
         options[:proxy_username] = user unless user.nil?
         options[:proxy_password] = pass unless pass.nil?
+        if @cookies_enabled
+          cookie_file = Tempfile.new 'cookies'
+          options[:cookiefile] = cookie_file
+          options[:cookiejar] = cookie_file
+        end
 
         request = Typhoeus::Request.new(q, options)
 
@@ -56,7 +63,7 @@ def crawl(options = {})
 
           links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href').map(&:to_s)
           links.each do |link|
-            next if link.match(/^\(|^javascript:|^mailto:/)
+            next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$/)
             begin
 
               if internal_link?(link, response.effective_url) && 

From 0c28d98765f3e5cd3a84218709f364852ef9e244 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Sat, 17 Jan 2015 14:29:21 +0100
Subject: [PATCH 10/18] removes the parsing note in debug mode

---
 lib/arachnid.rb | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index a349900..fea933e 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -98,8 +98,6 @@ def grab_proxy
   end
 
   def parse_domain(url)
-    puts "Parsing URL: #{url}" if @debug
-
     parsed_domain = Domainatrix.parse(url)
 
     if(parsed_domain.subdomain != "")

From 9d1e36422b6dc53bc4f3dd68e9a4cc7e30a85c9a Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Sat, 17 Jan 2015 14:35:06 +0100
Subject: [PATCH 11/18] allows crawling from multiple entry points

---
 lib/arachnid.rb | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index fea933e..7d57711 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -9,10 +9,10 @@
 
 class Arachnid
 
-  def initialize(url, options = {})
-    @start_url = url
+  def initialize(urls, options = {})
+    @start_urls = urls.is_a?(Array) ? urls : [urls]
     @debug = options[:debug]
-    @domain = parse_domain(url)
+    @domain = parse_domain(@start_urls[0])
     @split_url_at_hash = options[:split_url_at_hash]
     @exclude_urls_with_hash = options[:exclude_urls_with_hash]
     @exclude_urls_with_extensions = options[:exclude_urls_with_extensions]
@@ -28,7 +28,7 @@ def crawl(options = {})
     @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
     @global_queue = []
 
-    @global_queue << @start_url
+    @global_queue.concat @start_urls
 
     while not @global_queue.empty?
 

From 4344d230cf459be0f76bb64b447328ee53ae336b Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Sat, 17 Jan 2015 14:40:25 +0100
Subject: [PATCH 12/18] adds a url filter

---
 lib/arachnid.rb | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index 7d57711..b48b76d 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -23,6 +23,7 @@ def initialize(urls, options = {})
   def crawl(options = {})
     threads = options[:threads] || 1
     max_urls = options[:max_urls]
+    filter = options[:filter]
 
     @hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
     @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
@@ -40,6 +41,10 @@ def crawl(options = {})
           break
         end
 
+        if filter
+          next unless filter.call(q)
+        end
+
         @global_visited.insert(q)
         puts "Processing link: #{q}" if @debug
 

From fd3998dae9bb15f1c1cb0a8f1ce5bd8aee58ce79 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Sun, 25 Jan 2015 20:41:16 +0100
Subject: [PATCH 13/18] better escapes the URLs before parsing them

---
 lib/arachnid.rb       | 11 +++++++----
 test/arachnid_test.rb |  6 ++++++
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index b48b76d..eef93ba 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -76,12 +76,12 @@ def crawl(options = {})
                 no_hash_in_url?(link) &&
                 extension_not_ignored?(link)
 
-                absolute_link = make_absolute(sanitize_link(split_url_at_hash(link)), response.effective_url)
+                absolute_link = make_absolute(split_url_at_hash(link), response.effective_url)
                 @global_queue << absolute_link unless @global_queue.include?(absolute_link)
               end
 
             rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
-              $stderr.puts "#{e.class}: ignored link #{link}"
+              $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message})"
             end
           end
 
@@ -136,11 +136,14 @@ def extension_not_ignored?(url)
   end
 
   def sanitize_link(url)
-    url.gsub(/\s+/, "%20")
+    hash_position = url.index('#')
+    left_part = hash_position ? url[0,hash_position] : url
+    sanitized = left_part.gsub(/[ éèêàâôïûùÉÈÊÀÂÔÏÛÙöäüßÖÄÜ]/) {|w| CGI::escape(w)}
+    sanitized + (hash_position ? url[hash_position..-1] : "")
   end
 
   def make_absolute( href, root )
-    URI.parse(root).merge(URI.parse(split_url_at_hash(href.gsub(/\s+/, "%20")))).to_s
+    URI.parse(root).merge(URI.parse(sanitize_link(split_url_at_hash(href)))).to_s
   end
 
 end
diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb
index 7522518..c021098 100644
--- a/test/arachnid_test.rb
+++ b/test/arachnid_test.rb
@@ -3,6 +3,12 @@
 require "minitest/autorun"
 
 class ArachnidTest < Minitest::Test
+  def test_sanitizes_url
+    arachnid = Arachnid.new 'example.com'
+
+    assert_equal "http://example.com/%C3%A9#anchor", arachnid.sanitize_link("http://example.com/é#anchor")
+  end
+
   def test_ignores_specified_extensions
     arachnid = Arachnid.new 'example.com', exclude_urls_with_extensions: ['.jpg']
 

From 07882c8e1916324a2369e8d81f48009a715cdab4 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Tue, 27 Jan 2015 13:51:49 +0100
Subject: [PATCH 14/18] uses addressable to make urls absolute

---
 arachnid.gemspec      |  1 +
 lib/arachnid.rb       | 15 ++++-----------
 test/arachnid_test.rb |  7 +++++--
 3 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/arachnid.gemspec b/arachnid.gemspec
index 4255248..d5a1dc5 100644
--- a/arachnid.gemspec
+++ b/arachnid.gemspec
@@ -9,6 +9,7 @@ Gem::Specification.new do |s|
   s.files       = ["lib/arachnid.rb"]
   s.homepage    = 'https://github.com/dchuk/Arachnid'
 
+  s.add_dependency('addressable',    '2.3.6')
   s.add_dependency('typhoeus',    '0.7.0')
   s.add_dependency('bloomfilter-rb',    '2.1.1')
   s.add_dependency('nokogiri',    '1.5.0')
diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index eef93ba..1d7659f 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -5,7 +5,7 @@
 require 'bloomfilter-rb'
 require 'nokogiri'
 require 'domainatrix'
-require 'uri'
+require 'addressable/uri'
 
 class Arachnid
 
@@ -80,7 +80,7 @@ def crawl(options = {})
                 @global_queue << absolute_link unless @global_queue.include?(absolute_link)
               end
 
-            rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e
+            rescue => e
               $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message})"
             end
           end
@@ -135,15 +135,8 @@ def extension_not_ignored?(url)
     @exclude_urls_with_extensions.find { |e| url.downcase.end_with? e.downcase }.nil?
   end
 
-  def sanitize_link(url)
-    hash_position = url.index('#')
-    left_part = hash_position ? url[0,hash_position] : url
-    sanitized = left_part.gsub(/[ éèêàâôïûùÉÈÊÀÂÔÏÛÙöäüßÖÄÜ]/) {|w| CGI::escape(w)}
-    sanitized + (hash_position ? url[hash_position..-1] : "")
-  end
-
-  def make_absolute( href, root )
-    URI.parse(root).merge(URI.parse(sanitize_link(split_url_at_hash(href)))).to_s
+  def make_absolute(href, root)
+    Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s
   end
 
 end
diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb
index c021098..000c9c3 100644
--- a/test/arachnid_test.rb
+++ b/test/arachnid_test.rb
@@ -3,10 +3,13 @@
 require "minitest/autorun"
 
 class ArachnidTest < Minitest::Test
-  def test_sanitizes_url
+  def test_makes_a_url_absolute
     arachnid = Arachnid.new 'example.com'
 
-    assert_equal "http://example.com/%C3%A9#anchor", arachnid.sanitize_link("http://example.com/é#anchor")
+    assert_equal "http://example.com/é#anchor", arachnid.make_absolute("/é#anchor", "http://example.com")
+    assert_equal "http://example.com/é#anchor", arachnid.make_absolute("é#anchor", "http://example.com/a")
+    assert_equal "http://example.com/a/é#anchor", arachnid.make_absolute("é#anchor", "http://example.com/a/b")
+    assert_equal "http://other.org/a", arachnid.make_absolute("http://other.org/a", "http://example.com")
   end
 
   def test_ignores_specified_extensions

From e0aa176624e54828f4c9e305c8ed8612779cf82b Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Wed, 28 Jan 2015 14:39:03 +0100
Subject: [PATCH 15/18] adds more info to invalid URI error

---
 lib/arachnid.rb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index 1d7659f..63e1706 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -81,7 +81,7 @@ def crawl(options = {})
               end
 
             rescue => e
-              $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message})"
+              $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message}) on page #{q}"
             end
           end
 

From 3d09fcb36431d159c29fd96ac7f70a2e83a59079 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Sun, 8 Feb 2015 17:43:13 +0100
Subject: [PATCH 16/18] skips about: urls and urls outside of the domain

---
 lib/arachnid.rb | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index 63e1706..38037b4 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -64,11 +64,13 @@ def crawl(options = {})
 
         request.on_complete do |response|
 
+          next unless parse_domain(response.effective_url) == @domain
+
           yield response
 
           links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href').map(&:to_s)
           links.each do |link|
-            next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$/)
+            next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)
             begin
 
               if internal_link?(link, response.effective_url) && 

From cef7c0b33737b7f9c94e595d2bdbad7bd8cb7284 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Mon, 9 Feb 2015 13:51:42 +0100
Subject: [PATCH 17/18] only prints invalid uris in debug mode

---
 lib/arachnid.rb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index 38037b4..1572987 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -82,8 +82,8 @@ def crawl(options = {})
                 @global_queue << absolute_link unless @global_queue.include?(absolute_link)
               end
 
-            rescue => e
-              $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message}) on page #{q}"
+            rescue Addressable::URI::InvalidURIError => e
+              $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message}) on page #{q}" if @debug
             end
           end
 

From c5baee9b6753c8fa81fa64b65981f3d3ee8be912 Mon Sep 17 00:00:00 2001
From: Matthieu Tanguay-Carel <matthieutc@gmail.com>
Date: Mon, 9 Feb 2015 14:23:23 +0100
Subject: [PATCH 18/18] makes parse_domain a class method

---
 lib/arachnid.rb       | 8 ++++----
 test/arachnid_test.rb | 4 +---
 2 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index 1572987..ba84681 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -12,7 +12,7 @@ class Arachnid
   def initialize(urls, options = {})
     @start_urls = urls.is_a?(Array) ? urls : [urls]
     @debug = options[:debug]
-    @domain = parse_domain(@start_urls[0])
+    @domain = Arachnid.parse_domain(@start_urls[0])
     @split_url_at_hash = options[:split_url_at_hash]
     @exclude_urls_with_hash = options[:exclude_urls_with_hash]
     @exclude_urls_with_extensions = options[:exclude_urls_with_extensions]
@@ -64,7 +64,7 @@ def crawl(options = {})
 
         request.on_complete do |response|
 
-          next unless parse_domain(response.effective_url) == @domain
+          next unless Arachnid.parse_domain(response.effective_url) == @domain
 
           yield response
 
@@ -104,7 +104,7 @@ def grab_proxy
     @proxy_list.sample.split(':')
   end
 
-  def parse_domain(url)
+  def self.parse_domain(url)
     parsed_domain = Domainatrix.parse(url)
 
     if(parsed_domain.subdomain != "")
@@ -116,7 +116,7 @@ def parse_domain(url)
 
   def internal_link?(url, effective_url)
     absolute_url = make_absolute(url, effective_url)
-    parsed_url = parse_domain(absolute_url)
+    parsed_url = Arachnid.parse_domain(absolute_url)
     @domain == parsed_url
   end
 
diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb
index 000c9c3..26d3cad 100644
--- a/test/arachnid_test.rb
+++ b/test/arachnid_test.rb
@@ -20,9 +20,7 @@ def test_ignores_specified_extensions
   end
 
   def test_parses_domain
-    arachnid = Arachnid.new 'example.com'
-
-    assert_equal arachnid.parse_domain('www.example.com/link'), 'www.example.com'
+    assert_equal Arachnid.parse_domain('www.example.com/link'), 'www.example.com'
   end
 
   def test_hash_detection