From ed065c86e895fa8cb68acec4a29c2fe2b24e81af Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Tue, 13 Jan 2015 23:49:11 +0100 Subject: [PATCH 01/18] rejects href that contain javascript code --- lib/arachnid.rb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index c0442a0..89e4758 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -155,6 +155,7 @@ def ignore_extensions(url) end def sanitize_link(url) + return false if url.start_with? 'javascript' begin return url.gsub(/\s+/, "%20") rescue @@ -171,4 +172,4 @@ def make_absolute( href, root ) end end -end \ No newline at end of file +end From 253f69c269ddcdc75b65dd64aba5c9dcf60d8673 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Wed, 14 Jan 2015 00:00:27 +0100 Subject: [PATCH 02/18] simplifies a few lines --- README.md | 2 +- lib/arachnid.rb | 31 +++++-------------------------- 2 files changed, 6 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 1342636..3ca3b26 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ Arachnid was built to run on Ruby 1.9.2 I'll be honest, I haven't really tested require 'arachnid' - Arachnid.new("http://domain.com", {:exclude_urls_with_images => true}).crawl({:threads => 2, :max_urls => 1000}) do |response| + Arachnid.new("http://domain.com", {:exclude_urls_with_extensions => ['.jpg']}).crawl({:threads => 2, :max_urls => 1000}) do |response| #"response" is just a Typhoeus response object. puts response.effective_url diff --git a/lib/arachnid.rb b/lib/arachnid.rb index 89e4758..b3dfd6c 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -52,7 +52,7 @@ def crawl(options = {}) links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href') links.each do |link| - if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link)) + if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && extension_not_ignored?(link)) sanitized_link = sanitize_link(split_url_at_hash(link)) if(sanitized_link) @@ -110,48 +110,27 @@ def parse_domain(url) end def internal_link?(url, effective_url) - absolute_url = make_absolute(url, effective_url) - parsed_url = parse_domain(absolute_url) - if(@domain == parsed_url) - return true - else - return false - end + @domain == parsed_url end def split_url_at_hash(url) return url.to_s unless @split_url_at_hash - return url.to_s.split('#')[0] - end def no_hash_in_url?(url) return true unless @exclude_urls_with_hash - if(url.to_s.scan(/#/).size > 0) - return false - else - return true - end + ! url.to_s.scan(/#/).size > 0 end - def ignore_extensions(url) + def extension_not_ignored?(url) return true if url.to_s.length == 0 return true unless @exclude_urls_with_extensions - not_found = true - - @exclude_urls_with_extensions.each do |e| - if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase) - not_found = false - puts "#{e} Found At URL: #{url}" if @debug - end - end - - return not_found + @exclude_urls_with_extensions.find { |e| url.to_s.downcase.end_with? e.to_s.downcase }.nil? end def sanitize_link(url) From 3d57caf724d34722d20c782419ad6c00ba976194 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Wed, 14 Jan 2015 00:06:10 +0100 Subject: [PATCH 03/18] adds a few tests --- test/arachnid_test.rb | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 test/arachnid_test.rb diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb new file mode 100644 index 0000000..93e9bc4 --- /dev/null +++ b/test/arachnid_test.rb @@ -0,0 +1,18 @@ +require_relative '../lib/arachnid' + +require "minitest/autorun" + +class ArachnidTest < Minitest::Test + def test_ignores_specified_extensions + arachnid = Arachnid.new 'example.com', exclude_urls_with_extensions: ['.jpg'] + + assert arachnid.extension_not_ignored?('http://example.org/example') + refute arachnid.extension_not_ignored?('http://example.org/example.jpg') + end + + def test_does_not_sanitize_hrefs_with_javascript_code + arachnid = Arachnid.new 'example.com', exclude_urls_with_extensions: ['.jpg'] + + refute arachnid.sanitize_link('javascript:void(0)') + end +end From dee37ba1a29d511b802e24dd2e3e1bd5f2cf15da Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Thu, 15 Jan 2015 09:55:27 +0100 Subject: [PATCH 04/18] rejects hrefs starting with '(' or mailto --- lib/arachnid.rb | 2 +- test/arachnid_test.rb | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index b3dfd6c..3fbe2dd 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -134,7 +134,7 @@ def extension_not_ignored?(url) end def sanitize_link(url) - return false if url.start_with? 'javascript' + return false if url.match(/^javascript|^\(|^mailto/) begin return url.gsub(/\s+/, "%20") rescue diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb index 93e9bc4..a9cd13c 100644 --- a/test/arachnid_test.rb +++ b/test/arachnid_test.rb @@ -10,9 +10,19 @@ def test_ignores_specified_extensions refute arachnid.extension_not_ignored?('http://example.org/example.jpg') end - def test_does_not_sanitize_hrefs_with_javascript_code - arachnid = Arachnid.new 'example.com', exclude_urls_with_extensions: ['.jpg'] + def test_sanitizes_a_normal_href + arachnid = Arachnid.new 'example.com' + + assert arachnid.sanitize_link('http://example.com/page.html') + end + + def test_does_not_sanitize_hrefs_with_javascript_or_mailto + arachnid = Arachnid.new 'example.com' refute arachnid.sanitize_link('javascript:void(0)') + refute arachnid.sanitize_link('(javascript:void(0))') + refute arachnid.sanitize_link('mailto:info@example.com') + refute arachnid.sanitize_link('(mailto:info@example.com)') end + end From 9d7facc254cbaf954b3329385de3ea2a7c9ea548 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Thu, 15 Jan 2015 14:33:48 +0100 Subject: [PATCH 05/18] upgrades typhoeus dependency --- arachnid.gemspec | 4 ++-- lib/arachnid.rb | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arachnid.gemspec b/arachnid.gemspec index bae3659..4255248 100644 --- a/arachnid.gemspec +++ b/arachnid.gemspec @@ -9,8 +9,8 @@ Gem::Specification.new do |s| s.files = ["lib/arachnid.rb"] s.homepage = 'https://github.com/dchuk/Arachnid' - s.add_dependency('typhoeus', '0.3.2') + s.add_dependency('typhoeus', '0.7.0') s.add_dependency('bloomfilter-rb', '2.1.1') s.add_dependency('nokogiri', '1.5.0') s.add_dependency('domainatrix') -end \ No newline at end of file +end diff --git a/lib/arachnid.rb b/lib/arachnid.rb index 3fbe2dd..16852a8 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -41,9 +41,9 @@ def crawl(options = {}) begin ip,port,user,pass = grab_proxy - request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil - request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil - request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil + request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true) if ip == nil + request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil + request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil request.on_complete do |response| From d3e56c8089d8a7817eb7aa55727d53f2945933f0 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Sat, 17 Jan 2015 01:26:12 +0100 Subject: [PATCH 06/18] makes sanitization more robust --- lib/arachnid.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index 16852a8..6ed0903 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -134,7 +134,7 @@ def extension_not_ignored?(url) end def sanitize_link(url) - return false if url.match(/^javascript|^\(|^mailto/) + return false if url.strip.match(/^javascript:|^\(|^mailto:|^about:/) begin return url.gsub(/\s+/, "%20") rescue From 25aafc1f1f705361dd992f7c22fb4f9f51d156d3 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Sat, 17 Jan 2015 02:03:45 +0100 Subject: [PATCH 07/18] prevents explosion of global queue --- lib/arachnid.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index 6ed0903..8235d47 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -34,9 +34,9 @@ def crawl(options = {}) @global_queue << @start_url while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls)) - temp_queue = @global_queue - - temp_queue.each do |q| + (0...@global_queue.length).each do |i| + next if i >= @global_queue.length + q = @global_queue[i] begin ip,port,user,pass = grab_proxy @@ -59,7 +59,7 @@ def crawl(options = {}) absolute_link = make_absolute(sanitized_link, response.effective_url) if(absolute_link) - @global_queue << absolute_link + (@global_queue << absolute_link) unless @global_queue.include?(absolute_link) end end end From 591cf54db9fdf1d2c33d8b38b6434937473b818b Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Sat, 17 Jan 2015 11:56:42 +0100 Subject: [PATCH 08/18] fixes bugs + refactors --- lib/arachnid.rb | 200 +++++++++++++++++++----------------------- test/arachnid_test.rb | 17 ++-- 2 files changed, 100 insertions(+), 117 deletions(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index 8235d47..adc62d0 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -8,147 +8,131 @@ class Arachnid - def initialize(url, options = {}) - @start_url = url - @domain = parse_domain(url) + def initialize(url, options = {}) + @start_url = url + @debug = options[:debug] + @domain = parse_domain(url) + @split_url_at_hash = options[:split_url_at_hash] + @exclude_urls_with_hash = options[:exclude_urls_with_hash] + @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] + @proxy_list = options[:proxy_list] + end - @split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false - @exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false - @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false - @proxy_list = options[:proxy_list] ? options[:proxy_list] : false - - @debug = options[:debug] ? options[:debug] : false - end + def crawl(options = {}) + threads = options[:threads] || 1 + max_urls = options[:max_urls] - def crawl(options = {}) + @hydra = Typhoeus::Hydra.new(:max_concurrency => threads) + @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false) + @global_queue = [] - #defaults to 1 thread so people don't do a stupid amount of crawling on unsuspecting domains - threads = options[:threads] ? options[:threads] : 1 - #defaults to -1 so it will always keep running until it runs out of urls - max_urls = options[:max_urls] ? options[:max_urls] : nil + @global_queue << @start_url - @hydra = Typhoeus::Hydra.new(:max_concurrency => threads) - @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false) - @global_queue = [] + while not @global_queue.empty? - @global_queue << @start_url - - while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls)) - (0...@global_queue.length).each do |i| - next if i >= @global_queue.length - q = @global_queue[i] + @global_queue.size.times do + q = @global_queue.shift - begin - ip,port,user,pass = grab_proxy - - request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true) if ip == nil - request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil - request = Typhoeus::Request.new(q, :timeout => 10000, :followlocation => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil + if !max_urls.nil? && @global_visited.size >= max_urls + @global_queue = [] + break + end - request.on_complete do |response| + @global_visited.insert(q) + puts "Processing link: #{q}" if @debug - yield response + ip,port,user,pass = grab_proxy - links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href') + options = {timeout: 10000, followlocation:true} + options[:proxy] = "#{ip}:#{port}" unless ip.nil? + options[:proxy_username] = user unless user.nil? + options[:proxy_password] = pass unless pass.nil? - links.each do |link| - if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && extension_not_ignored?(link)) - - sanitized_link = sanitize_link(split_url_at_hash(link)) - if(sanitized_link) + request = Typhoeus::Request.new(q, options) - absolute_link = make_absolute(sanitized_link, response.effective_url) - if(absolute_link) - (@global_queue << absolute_link) unless @global_queue.include?(absolute_link) - end - end - end - end + request.on_complete do |response| - end + yield response - @hydra.queue request + links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href').map(&:to_s) + links.each do |link| + next if link.match(/^\(|^javascript:|^mailto:/) + begin - rescue URI::InvalidURIError, NoMethodError => e - puts "Exception caught: #{e}" if @debug == true - end + if internal_link?(link, response.effective_url) && + !@global_visited.include?(make_absolute(link, response.effective_url)) && + no_hash_in_url?(link) && + extension_not_ignored?(link) - @global_visited.insert(q) - @global_queue.delete(q) + absolute_link = make_absolute(sanitize_link(split_url_at_hash(link)), response.effective_url) + @global_queue << absolute_link unless @global_queue.include?(absolute_link) + end - end + rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e + $stderr.puts "#{e.class}: ignored link #{link}" + end + end - @hydra.run + end - end + @hydra.queue request - end + end + puts "Running the hydra" if @debug + @hydra.run + end - def grab_proxy + end - return nil unless @proxy_list + def grab_proxy + return nil unless @proxy_list - return @proxy_list.sample.split(':') + @proxy_list.sample.split(':') + end - end + def parse_domain(url) + puts "Parsing URL: #{url}" if @debug - def parse_domain(url) - puts "Parsing URL: #{url}" if @debug + parsed_domain = Domainatrix.parse(url) - begin - parsed_domain = Domainatrix.parse(url) + if(parsed_domain.subdomain != "") + parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix + else + parsed_domain.domain + '.' + parsed_domain.public_suffix + end + end - if(parsed_domain.subdomain != "") - parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix - else - parsed_domain.domain + '.' + parsed_domain.public_suffix - end - rescue NoMethodError, Addressable::URI::InvalidURIError => e - puts "URL Parsing Exception (#{url}): #{e}" - return nil - end - end + def internal_link?(url, effective_url) + absolute_url = make_absolute(url, effective_url) + parsed_url = parse_domain(absolute_url) + @domain == parsed_url + end - def internal_link?(url, effective_url) - absolute_url = make_absolute(url, effective_url) - parsed_url = parse_domain(absolute_url) - @domain == parsed_url - end + def split_url_at_hash(url) + return url unless @split_url_at_hash - def split_url_at_hash(url) - return url.to_s unless @split_url_at_hash - return url.to_s.split('#')[0] - end + url.split('#')[0] + end - def no_hash_in_url?(url) - return true unless @exclude_urls_with_hash + def no_hash_in_url?(url) + !@exclude_urls_with_hash || url.scan(/#/).empty? + end - ! url.to_s.scan(/#/).size > 0 - end + def extension_not_ignored?(url) + return true if url.empty? + return true unless @exclude_urls_with_extensions - def extension_not_ignored?(url) - return true if url.to_s.length == 0 - return true unless @exclude_urls_with_extensions + @exclude_urls_with_extensions.find { |e| url.downcase.end_with? e.downcase }.nil? + end - @exclude_urls_with_extensions.find { |e| url.to_s.downcase.end_with? e.to_s.downcase }.nil? - end + def sanitize_link(url) + url.gsub(/\s+/, "%20") + end - def sanitize_link(url) - return false if url.strip.match(/^javascript:|^\(|^mailto:|^about:/) - begin - return url.gsub(/\s+/, "%20") - rescue - return false - end - end + def make_absolute( href, root ) + URI.parse(root).merge(URI.parse(split_url_at_hash(href.gsub(/\s+/, "%20")))).to_s + end - def make_absolute( href, root ) +end - begin - URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s - rescue URI::InvalidURIError, URI::InvalidComponentError => e - return false - end - end -end diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb index a9cd13c..7522518 100644 --- a/test/arachnid_test.rb +++ b/test/arachnid_test.rb @@ -10,19 +10,18 @@ def test_ignores_specified_extensions refute arachnid.extension_not_ignored?('http://example.org/example.jpg') end - def test_sanitizes_a_normal_href + def test_parses_domain arachnid = Arachnid.new 'example.com' - assert arachnid.sanitize_link('http://example.com/page.html') + assert_equal arachnid.parse_domain('www.example.com/link'), 'www.example.com' end - def test_does_not_sanitize_hrefs_with_javascript_or_mailto - arachnid = Arachnid.new 'example.com' - - refute arachnid.sanitize_link('javascript:void(0)') - refute arachnid.sanitize_link('(javascript:void(0))') - refute arachnid.sanitize_link('mailto:info@example.com') - refute arachnid.sanitize_link('(mailto:info@example.com)') + def test_hash_detection + arachnid = Arachnid.new 'example.com', exclude_urls_with_hash: true + refute arachnid.no_hash_in_url? 'http://www.example.com/link#1' + + arachnid = Arachnid.new 'example.com', exclude_urls_with_hash: false + assert arachnid.no_hash_in_url? 'http://www.example.com/link#1' end end From 73eb8aca86418d635b6991750ec6e13d21fce9ff Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Sat, 17 Jan 2015 14:25:00 +0100 Subject: [PATCH 09/18] allows enabling of cookies + more robust link rejection --- lib/arachnid.rb | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index adc62d0..a349900 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -1,5 +1,6 @@ # encoding: utf-8 +require 'tempfile' require 'typhoeus' require 'bloomfilter-rb' require 'nokogiri' @@ -16,6 +17,7 @@ def initialize(url, options = {}) @exclude_urls_with_hash = options[:exclude_urls_with_hash] @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] @proxy_list = options[:proxy_list] + @cookies_enabled = options[:enable_cookies] end def crawl(options = {}) @@ -47,6 +49,11 @@ def crawl(options = {}) options[:proxy] = "#{ip}:#{port}" unless ip.nil? options[:proxy_username] = user unless user.nil? options[:proxy_password] = pass unless pass.nil? + if @cookies_enabled + cookie_file = Tempfile.new 'cookies' + options[:cookiefile] = cookie_file + options[:cookiejar] = cookie_file + end request = Typhoeus::Request.new(q, options) @@ -56,7 +63,7 @@ def crawl(options = {}) links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href').map(&:to_s) links.each do |link| - next if link.match(/^\(|^javascript:|^mailto:/) + next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$/) begin if internal_link?(link, response.effective_url) && From 0c28d98765f3e5cd3a84218709f364852ef9e244 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Sat, 17 Jan 2015 14:29:21 +0100 Subject: [PATCH 10/18] removes the parsing note in debug mode --- lib/arachnid.rb | 2 -- 1 file changed, 2 deletions(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index a349900..fea933e 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -98,8 +98,6 @@ def grab_proxy end def parse_domain(url) - puts "Parsing URL: #{url}" if @debug - parsed_domain = Domainatrix.parse(url) if(parsed_domain.subdomain != "") From 9d1e36422b6dc53bc4f3dd68e9a4cc7e30a85c9a Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Sat, 17 Jan 2015 14:35:06 +0100 Subject: [PATCH 11/18] allows crawling from multiple entry points --- lib/arachnid.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index fea933e..7d57711 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -9,10 +9,10 @@ class Arachnid - def initialize(url, options = {}) - @start_url = url + def initialize(urls, options = {}) + @start_urls = urls.is_a?(Array) ? urls : [urls] @debug = options[:debug] - @domain = parse_domain(url) + @domain = parse_domain(@start_urls[0]) @split_url_at_hash = options[:split_url_at_hash] @exclude_urls_with_hash = options[:exclude_urls_with_hash] @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] @@ -28,7 +28,7 @@ def crawl(options = {}) @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false) @global_queue = [] - @global_queue << @start_url + @global_queue.concat @start_urls while not @global_queue.empty? From 4344d230cf459be0f76bb64b447328ee53ae336b Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Sat, 17 Jan 2015 14:40:25 +0100 Subject: [PATCH 12/18] adds a url filter --- lib/arachnid.rb | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index 7d57711..b48b76d 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -23,6 +23,7 @@ def initialize(urls, options = {}) def crawl(options = {}) threads = options[:threads] || 1 max_urls = options[:max_urls] + filter = options[:filter] @hydra = Typhoeus::Hydra.new(:max_concurrency => threads) @global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false) @@ -40,6 +41,10 @@ def crawl(options = {}) break end + if filter + next unless filter.call(q) + end + @global_visited.insert(q) puts "Processing link: #{q}" if @debug From fd3998dae9bb15f1c1cb0a8f1ce5bd8aee58ce79 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Sun, 25 Jan 2015 20:41:16 +0100 Subject: [PATCH 13/18] better escapes the URLs before parsing them --- lib/arachnid.rb | 11 +++++++---- test/arachnid_test.rb | 6 ++++++ 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index b48b76d..eef93ba 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -76,12 +76,12 @@ def crawl(options = {}) no_hash_in_url?(link) && extension_not_ignored?(link) - absolute_link = make_absolute(sanitize_link(split_url_at_hash(link)), response.effective_url) + absolute_link = make_absolute(split_url_at_hash(link), response.effective_url) @global_queue << absolute_link unless @global_queue.include?(absolute_link) end rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e - $stderr.puts "#{e.class}: ignored link #{link}" + $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message})" end end @@ -136,11 +136,14 @@ def extension_not_ignored?(url) end def sanitize_link(url) - url.gsub(/\s+/, "%20") + hash_position = url.index('#') + left_part = hash_position ? url[0,hash_position] : url + sanitized = left_part.gsub(/[ éèêàâôïûùÉÈÊÀÂÔÏÛÙöäüßÖÄÜ]/) {|w| CGI::escape(w)} + sanitized + (hash_position ? url[hash_position..-1] : "") end def make_absolute( href, root ) - URI.parse(root).merge(URI.parse(split_url_at_hash(href.gsub(/\s+/, "%20")))).to_s + URI.parse(root).merge(URI.parse(sanitize_link(split_url_at_hash(href)))).to_s end end diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb index 7522518..c021098 100644 --- a/test/arachnid_test.rb +++ b/test/arachnid_test.rb @@ -3,6 +3,12 @@ require "minitest/autorun" class ArachnidTest < Minitest::Test + def test_sanitizes_url + arachnid = Arachnid.new 'example.com' + + assert_equal "http://example.com/%C3%A9#anchor", arachnid.sanitize_link("http://example.com/é#anchor") + end + def test_ignores_specified_extensions arachnid = Arachnid.new 'example.com', exclude_urls_with_extensions: ['.jpg'] From 07882c8e1916324a2369e8d81f48009a715cdab4 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Tue, 27 Jan 2015 13:51:49 +0100 Subject: [PATCH 14/18] uses addressable to make urls absolute --- arachnid.gemspec | 1 + lib/arachnid.rb | 15 ++++----------- test/arachnid_test.rb | 7 +++++-- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/arachnid.gemspec b/arachnid.gemspec index 4255248..d5a1dc5 100644 --- a/arachnid.gemspec +++ b/arachnid.gemspec @@ -9,6 +9,7 @@ Gem::Specification.new do |s| s.files = ["lib/arachnid.rb"] s.homepage = 'https://github.com/dchuk/Arachnid' + s.add_dependency('addressable', '2.3.6') s.add_dependency('typhoeus', '0.7.0') s.add_dependency('bloomfilter-rb', '2.1.1') s.add_dependency('nokogiri', '1.5.0') diff --git a/lib/arachnid.rb b/lib/arachnid.rb index eef93ba..1d7659f 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -5,7 +5,7 @@ require 'bloomfilter-rb' require 'nokogiri' require 'domainatrix' -require 'uri' +require 'addressable/uri' class Arachnid @@ -80,7 +80,7 @@ def crawl(options = {}) @global_queue << absolute_link unless @global_queue.include?(absolute_link) end - rescue URI::InvalidURIError, Addressable::URI::InvalidURIError => e + rescue => e $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message})" end end @@ -135,15 +135,8 @@ def extension_not_ignored?(url) @exclude_urls_with_extensions.find { |e| url.downcase.end_with? e.downcase }.nil? end - def sanitize_link(url) - hash_position = url.index('#') - left_part = hash_position ? url[0,hash_position] : url - sanitized = left_part.gsub(/[ éèêàâôïûùÉÈÊÀÂÔÏÛÙöäüßÖÄÜ]/) {|w| CGI::escape(w)} - sanitized + (hash_position ? url[hash_position..-1] : "") - end - - def make_absolute( href, root ) - URI.parse(root).merge(URI.parse(sanitize_link(split_url_at_hash(href)))).to_s + def make_absolute(href, root) + Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s end end diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb index c021098..000c9c3 100644 --- a/test/arachnid_test.rb +++ b/test/arachnid_test.rb @@ -3,10 +3,13 @@ require "minitest/autorun" class ArachnidTest < Minitest::Test - def test_sanitizes_url + def test_makes_a_url_absolute arachnid = Arachnid.new 'example.com' - assert_equal "http://example.com/%C3%A9#anchor", arachnid.sanitize_link("http://example.com/é#anchor") + assert_equal "http://example.com/é#anchor", arachnid.make_absolute("/é#anchor", "http://example.com") + assert_equal "http://example.com/é#anchor", arachnid.make_absolute("é#anchor", "http://example.com/a") + assert_equal "http://example.com/a/é#anchor", arachnid.make_absolute("é#anchor", "http://example.com/a/b") + assert_equal "http://other.org/a", arachnid.make_absolute("http://other.org/a", "http://example.com") end def test_ignores_specified_extensions From e0aa176624e54828f4c9e305c8ed8612779cf82b Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Wed, 28 Jan 2015 14:39:03 +0100 Subject: [PATCH 15/18] adds more info to invalid URI error --- lib/arachnid.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index 1d7659f..63e1706 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -81,7 +81,7 @@ def crawl(options = {}) end rescue => e - $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message})" + $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message}) on page #{q}" end end From 3d09fcb36431d159c29fd96ac7f70a2e83a59079 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Sun, 8 Feb 2015 17:43:13 +0100 Subject: [PATCH 16/18] skips about urls and urls outside of domain --- lib/arachnid.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index 63e1706..38037b4 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -64,11 +64,13 @@ def crawl(options = {}) request.on_complete do |response| + next unless parse_domain(response.effective_url) == @domain + yield response links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href').map(&:to_s) links.each do |link| - next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$/) + next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/) begin if internal_link?(link, response.effective_url) && From cef7c0b33737b7f9c94e595d2bdbad7bd8cb7284 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Mon, 9 Feb 2015 13:51:42 +0100 Subject: [PATCH 17/18] only prints invalid uris in debug mode --- lib/arachnid.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index 38037b4..1572987 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -82,8 +82,8 @@ def crawl(options = {}) @global_queue << absolute_link unless @global_queue.include?(absolute_link) end - rescue => e - $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message}) on page #{q}" + rescue Addressable::URI::InvalidURIError => e + $stderr.puts "#{e.class}: Ignored link #{link} (#{e.message}) on page #{q}" if @debug end end From c5baee9b6753c8fa81fa64b65981f3d3ee8be912 Mon Sep 17 00:00:00 2001 From: Matthieu Tanguay-Carel Date: Mon, 9 Feb 2015 14:23:23 +0100 Subject: [PATCH 18/18] makes parse_domain a class method --- lib/arachnid.rb | 8 ++++---- test/arachnid_test.rb | 4 +--- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/lib/arachnid.rb b/lib/arachnid.rb index 1572987..ba84681 100644 --- a/lib/arachnid.rb +++ b/lib/arachnid.rb @@ -12,7 +12,7 @@ class Arachnid def initialize(urls, options = {}) @start_urls = urls.is_a?(Array) ? urls : [urls] @debug = options[:debug] - @domain = parse_domain(@start_urls[0]) + @domain = Arachnid.parse_domain(@start_urls[0]) @split_url_at_hash = options[:split_url_at_hash] @exclude_urls_with_hash = options[:exclude_urls_with_hash] @exclude_urls_with_extensions = options[:exclude_urls_with_extensions] @@ -64,7 +64,7 @@ def crawl(options = {}) request.on_complete do |response| - next unless parse_domain(response.effective_url) == @domain + next unless Arachnid.parse_domain(response.effective_url) == @domain yield response @@ -104,7 +104,7 @@ def grab_proxy @proxy_list.sample.split(':') end - def parse_domain(url) + def self.parse_domain(url) parsed_domain = Domainatrix.parse(url) if(parsed_domain.subdomain != "") @@ -116,7 +116,7 @@ def parse_domain(url) def internal_link?(url, effective_url) absolute_url = make_absolute(url, effective_url) - parsed_url = parse_domain(absolute_url) + parsed_url = Arachnid.parse_domain(absolute_url) @domain == parsed_url end diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb index 000c9c3..26d3cad 100644 --- a/test/arachnid_test.rb +++ b/test/arachnid_test.rb @@ -20,9 +20,7 @@ def test_ignores_specified_extensions end def test_parses_domain - arachnid = Arachnid.new 'example.com' - - assert_equal arachnid.parse_domain('www.example.com/link'), 'www.example.com' + assert_equal Arachnid.parse_domain('www.example.com/link'), 'www.example.com' end def test_hash_detection