diff --git a/README.md b/README.md
index 1342636..3ca3b26 100644
--- a/README.md
+++ b/README.md
@@ -68,7 +68,7 @@ Arachnid was built to run on Ruby 1.9.2 I'll be honest, I haven't really tested
 
 	require 'arachnid'
 
-	Arachnid.new("http://domain.com", {:exclude_urls_with_images => true}).crawl({:threads => 2, :max_urls => 1000}) do |response|
+	Arachnid.new("http://domain.com", {:exclude_urls_with_extensions => ['.jpg']}).crawl({:threads => 2, :max_urls => 1000}) do |response|
 
 		#"response" is just a Typhoeus response object.
 		puts response.effective_url
diff --git a/arachnid.gemspec b/arachnid.gemspec
index bae3659..d5a1dc5 100644
--- a/arachnid.gemspec
+++ b/arachnid.gemspec
@@ -9,8 +9,9 @@ Gem::Specification.new do |s|
 	s.files = ["lib/arachnid.rb"]
 	s.homepage = 'https://github.com/dchuk/Arachnid'
 
-	s.add_dependency('typhoeus', '0.3.2')
+	s.add_dependency('addressable', '2.3.6')
+	s.add_dependency('typhoeus', '0.7.0')
 	s.add_dependency('bloomfilter-rb', '2.1.1')
 	s.add_dependency('nokogiri', '1.5.0')
 	s.add_dependency('domainatrix')
-end
\ No newline at end of file
+end
diff --git a/lib/arachnid.rb b/lib/arachnid.rb
index c0442a0..ba84681 100644
--- a/lib/arachnid.rb
+++ b/lib/arachnid.rb
@@ -1,174 +1,146 @@
 # encoding: utf-8
+require 'tempfile'
 require 'typhoeus'
 require 'bloomfilter-rb'
 require 'nokogiri'
 require 'domainatrix'
-require 'uri'
+require 'addressable/uri'
 
 class Arachnid
 
-	def initialize(url, options = {})
-		@start_url = url
-		@domain = parse_domain(url)
+	def initialize(urls, options = {})
+		@start_urls = urls.is_a?(Array) ? urls : [urls]
+		@debug = options[:debug]
+		@domain = Arachnid.parse_domain(@start_urls[0])
+		@split_url_at_hash = options[:split_url_at_hash]
+		@exclude_urls_with_hash = options[:exclude_urls_with_hash]
+		@exclude_urls_with_extensions = options[:exclude_urls_with_extensions]
+		@proxy_list = options[:proxy_list]
+		@cookies_enabled = options[:enable_cookies]
+	end
 
-		@split_url_at_hash = options[:split_url_at_hash] ? options[:split_url_at_hash] : false
-		@exclude_urls_with_hash = options[:exclude_urls_with_hash] ? options[:exclude_urls_with_hash] : false
-		@exclude_urls_with_extensions = options[:exclude_urls_with_extensions] ? options[:exclude_urls_with_extensions] : false
-		@proxy_list = options[:proxy_list] ? options[:proxy_list] : false
-
-		@debug = options[:debug] ? options[:debug] : false
-	end
+	def crawl(options = {})
+		threads = options[:threads] || 1
+		max_urls = options[:max_urls]
+		filter = options[:filter]
 
-	def crawl(options = {})
+		@hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
+		@global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
+		@global_queue = []
 
-		#defaults to 1 thread so people don't do a stupid amount of crawling on unsuspecting domains
-		threads = options[:threads] ? options[:threads] : 1
-		#defaults to -1 so it will always keep running until it runs out of urls
-		max_urls = options[:max_urls] ? options[:max_urls] : nil
+		@global_queue.concat @start_urls
 
-		@hydra = Typhoeus::Hydra.new(:max_concurrency => threads)
-		@global_visited = BloomFilter::Native.new(:size => 1000000, :hashes => 5, :seed => 1, :bucket => 8, :raise => false)
-		@global_queue = []
+		until @global_queue.empty?
 
-		@global_queue << @start_url
-
-		while(@global_queue.size > 0 && (max_urls.nil? || @global_visited.size.to_i < max_urls))
-			temp_queue = @global_queue
+			@global_queue.size.times do
+				q = @global_queue.shift
 
-			temp_queue.each do |q|
+				if !max_urls.nil? && @global_visited.size >= max_urls
+					@global_queue = []
+					break
+				end
 
-				begin
-					ip,port,user,pass = grab_proxy
-
-					request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true) if ip == nil
-					request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}") if ip != nil && user == nil
-					request = Typhoeus::Request.new(q, :timeout => 10000, :follow_location => true, :proxy => "#{ip}:#{port}", :proxy_username => user, :proxy_password => pass) if user != nil
+				if filter
+					next unless filter.call(q)
+				end
 
-					request.on_complete do |response|
+				@global_visited.insert(q)
+				puts "Processing link: #{q}" if @debug
 
-						yield response
+				ip,port,user,pass = grab_proxy
 
-						links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href')
+				# Typhoeus 0.7 takes :timeout in seconds (0.3.x took milliseconds),
+				# and :cookiefile/:cookiejar expect file paths, not Tempfile objects.
+				request_options = {:timeout => 10, :followlocation => true}
+				request_options[:proxy] = "#{ip}:#{port}" unless ip.nil?
+				request_options[:proxy_username] = user unless user.nil?
+				request_options[:proxy_password] = pass unless pass.nil?
+				if @cookies_enabled
+					cookie_file = Tempfile.new 'cookies'
+					request_options[:cookiefile] = cookie_file.path
+					request_options[:cookiejar] = cookie_file.path
+				end
 
-						links.each do |link|
-							if(internal_link?(link, response.effective_url) && !@global_visited.include?(make_absolute(link, response.effective_url)) && no_hash_in_url?(link) && ignore_extensions(link))
-
-								sanitized_link = sanitize_link(split_url_at_hash(link))
-								if(sanitized_link)
+				request = Typhoeus::Request.new(q, request_options)
 
-									absolute_link = make_absolute(sanitized_link, response.effective_url)
-									if(absolute_link)
-										@global_queue << absolute_link
-									end
-								end
-							end
-						end
+				request.on_complete do |response|
 
-					end
+					next unless Arachnid.parse_domain(response.effective_url) == @domain
 
-					@hydra.queue request
+					yield response
 
-				rescue URI::InvalidURIError, NoMethodError => e
-					puts "Exception caught: #{e}" if @debug == true
-				end
+					links = Nokogiri::HTML.parse(response.body).xpath('.//a/@href').map(&:to_s)
+					links.each do |link|
+						next if link.match(/^\(|^javascript:|^mailto:|^#|^\s*$|^about:/)
+						begin
 
-				@global_visited.insert(q)
-				@global_queue.delete(q)
+							if internal_link?(link, response.effective_url) &&
+									!@global_visited.include?(make_absolute(link, response.effective_url)) &&
+									no_hash_in_url?(link) &&
+									extension_not_ignored?(link)
 
-			end
+								absolute_link = make_absolute(split_url_at_hash(link), response.effective_url)
+								@global_queue << absolute_link unless @global_queue.include?(absolute_link)
+							end
 
-			@hydra.run
+						rescue Addressable::URI::InvalidURIError => e
+							$stderr.puts "#{e.class}: Ignored link #{link} (#{e.message}) on page #{q}" if @debug
+						end
+					end
 
-		end
+				end
 
-	end
+				@hydra.queue request
 
-	def grab_proxy
+			end
+			puts "Running the hydra" if @debug
+			@hydra.run
+		end
 
-		return nil unless @proxy_list
+	end
 
-		return @proxy_list.sample.split(':')
+	def grab_proxy
+		return nil unless @proxy_list
 
-	end
+		@proxy_list.sample.split(':')
+	end
 
-	def parse_domain(url)
-		puts "Parsing URL: #{url}" if @debug
+	def self.parse_domain(url)
+		parsed_domain = Domainatrix.parse(url)
 
-		begin
-			parsed_domain = Domainatrix.parse(url)
+		if(parsed_domain.subdomain != "")
+			parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
+		else
+			parsed_domain.domain + '.' + parsed_domain.public_suffix
+		end
+	end
 
-			if(parsed_domain.subdomain != "")
-				parsed_domain.subdomain + '.' + parsed_domain.domain + '.' + parsed_domain.public_suffix
-			else
-				parsed_domain.domain + '.' + parsed_domain.public_suffix
-			end
-		rescue NoMethodError, Addressable::URI::InvalidURIError => e
-			puts "URL Parsing Exception (#{url}): #{e}"
-			return nil
-		end
-	end
 
+	def internal_link?(url, effective_url)
+		absolute_url = make_absolute(url, effective_url)
+		parsed_url = Arachnid.parse_domain(absolute_url)
+		@domain == parsed_url
+	end
 
-	def internal_link?(url, effective_url)
+	def split_url_at_hash(url)
+		return url unless @split_url_at_hash
 
-		absolute_url = make_absolute(url, effective_url)
+		url.split('#')[0]
+	end
 
-		parsed_url = parse_domain(absolute_url)
-		if(@domain == parsed_url)
-			return true
-		else
-			return false
-		end
-	end
+	def no_hash_in_url?(url)
+		!@exclude_urls_with_hash || url.scan(/#/).empty?
+	end
 
-	def split_url_at_hash(url)
-		return url.to_s unless @split_url_at_hash
+	def extension_not_ignored?(url)
+		return true if url.empty?
+		return true unless @exclude_urls_with_extensions
 
-		return url.to_s.split('#')[0]
+		@exclude_urls_with_extensions.find { |e| url.downcase.end_with? e.downcase }.nil?
+	end
 
-	end
+	def make_absolute(href, root)
+		Addressable::URI.parse(root).join(Addressable::URI.parse(href)).to_s
+	end
 
-	def no_hash_in_url?(url)
-		return true unless @exclude_urls_with_hash
+end
 
-		if(url.to_s.scan(/#/).size > 0)
-			return false
-		else
-			return true
-		end
-	end
-	def ignore_extensions(url)
-		return true if url.to_s.length == 0
-		return true unless @exclude_urls_with_extensions
-
-		not_found = true
-
-		@exclude_urls_with_extensions.each do |e|
-			if(url.to_s.length > e.size && url.to_s[-e.size .. -1].downcase == e.to_s.downcase)
-				not_found = false
-				puts "#{e} Found At URL: #{url}" if @debug
-			end
-		end
-
-		return not_found
-	end
-
-	def sanitize_link(url)
-		begin
-			return url.gsub(/\s+/, "%20")
-		rescue
-			return false
-		end
-	end
-
-	def make_absolute( href, root )
-
-		begin
-			URI.parse(root).merge(URI.parse(split_url_at_hash(href.to_s.gsub(/\s+/, "%20")))).to_s
-		rescue URI::InvalidURIError, URI::InvalidComponentError => e
-			return false
-		end
-	end
-
-end
\ No newline at end of file
diff --git a/test/arachnid_test.rb b/test/arachnid_test.rb
new file mode 100644
index 0000000..26d3cad
--- /dev/null
+++ b/test/arachnid_test.rb
@@ -0,0 +1,34 @@
+require_relative '../lib/arachnid'
+
+require "minitest/autorun"
+
+class ArachnidTest < Minitest::Test
+  def test_makes_a_url_absolute
+    arachnid = Arachnid.new 'example.com'
+
+    assert_equal "http://example.com/é#anchor", arachnid.make_absolute("/é#anchor", "http://example.com")
+    assert_equal "http://example.com/é#anchor", arachnid.make_absolute("é#anchor", "http://example.com/a")
+    assert_equal "http://example.com/a/é#anchor", arachnid.make_absolute("é#anchor", "http://example.com/a/b")
+    assert_equal "http://other.org/a", arachnid.make_absolute("http://other.org/a", "http://example.com")
+  end
+
+  def test_ignores_specified_extensions
+    arachnid = Arachnid.new 'example.com', exclude_urls_with_extensions: ['.jpg']
+
+    assert arachnid.extension_not_ignored?('http://example.org/example')
+    refute arachnid.extension_not_ignored?('http://example.org/example.jpg')
+  end
+
+  def test_parses_domain
+    assert_equal 'www.example.com', Arachnid.parse_domain('www.example.com/link')
+  end
+
+  def test_hash_detection
+    arachnid = Arachnid.new 'example.com', exclude_urls_with_hash: true
+    refute arachnid.no_hash_in_url? 'http://www.example.com/link#1'
+
+    arachnid = Arachnid.new 'example.com', exclude_urls_with_hash: false
+    assert arachnid.no_hash_in_url? 'http://www.example.com/link#1'
+  end
+
+end
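
Reviewer note: a minimal usage sketch of the API this patch introduces (multiple start URLs, crawl's new :filter option, :proxy_list, and :enable_cookies). The domain, proxy address, and filter logic below are hypothetical placeholders, not values taken from the patch:

	require 'arachnid'

	# Everything below is illustrative only.
	spider = Arachnid.new(['http://example.com', 'http://example.com/blog'],
		:exclude_urls_with_extensions => ['.jpg', '.pdf'],
		:proxy_list => ['127.0.0.1:8080'],    # grab_proxy expects "ip:port" or "ip:port:user:pass"
		:enable_cookies => true)

	spider.crawl(:threads => 2, :max_urls => 100,
		:filter => lambda { |url| !url.include?('/admin') }) do |response|
		puts response.effective_url    # response is a Typhoeus::Response
	end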
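
One behavioral change worth flagging: parse_domain is now a class method (Arachnid.parse_domain) and, unlike the removed instance method, no longer rescues NoMethodError or Addressable::URI::InvalidURIError to return nil. Callers that relied on nil for unparseable URLs would need to guard the call themselves; a hedged sketch (some_url is a placeholder):

	domain = begin
		Arachnid.parse_domain(some_url)
	rescue StandardError
		nil    # restores the old nil-on-failure behavior
	end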