diff --git a/lib/scrapify.rb b/lib/scrapify.rb index cd54bfd..7cb73da 100644 --- a/lib/scrapify.rb +++ b/lib/scrapify.rb @@ -2,6 +2,7 @@ require 'active_support/core_ext/class/attribute_accessors' require 'active_support/core_ext/hash/keys' require 'active_support/core_ext/object/blank' +require 'active_support/core_ext/array/wrap' require 'nokogiri' require 'uri' require 'net/http' diff --git a/lib/scrapify/base.rb b/lib/scrapify/base.rb index 6defe59..84ab058 100644 --- a/lib/scrapify/base.rb +++ b/lib/scrapify/base.rb @@ -38,26 +38,28 @@ def attribute(name, options={}, &block) to_array = options[:array] define_singleton_method "#{name}_values" do values = [] - self.doc = parse_html(url) - while self.doc - page_values = self.doc.send(parser, selector).map do |element| - if block - yield element - else - content = element.content - if matcher - match_data = content.scan(matcher).map &:first - options[:array] ? match_data : match_data.first + Array.wrap(url).each do |uri| + self.doc = parse_html(uri) + while self.doc + page_values = self.doc.send(parser, selector).map do |element| + if block + yield element else - content.strip + content = element.content + if matcher + match_data = content.scan(matcher).map &:first + options[:array] ? match_data : match_data.first + else + content.strip + end end end - end - values += page_values - if next_page_selector and (next_page_url = self.doc.send(next_page_selector.keys.first, next_page_selector.values.first).first) and !next_page_url.content.blank? - self.doc = parse_html(next_page_url.content) - else - self.doc = nil + values += page_values + if next_page_selector and (next_page_url = self.doc.send(next_page_selector.keys.first, next_page_selector.values.first).first) and !next_page_url.content.blank? + self.doc = parse_html(next_page_url.content) + else + self.doc = nil + end end end values diff --git a/spec/book.rb b/spec/models/book.rb similarity index 100% rename from spec/book.rb rename to spec/models/book.rb diff --git a/spec/models/magazine.rb b/spec/models/magazine.rb new file mode 100644 index 0000000..9f72cc4 --- /dev/null +++ b/spec/models/magazine.rb @@ -0,0 +1,9 @@ +class Magazine + include Scrapify::Base + html ["http://www.magazines.com/pages/1", "http://www.magazines.com/pages/2", "http://www.magazines.com/pages/3"] + + attribute :issue, css: ".issue" + attribute :title, css: ".title" + + key :issue +end \ No newline at end of file diff --git a/spec/pizza.rb b/spec/models/pizza.rb similarity index 100% rename from spec/pizza.rb rename to spec/models/pizza.rb diff --git a/spec/multiple_pages_spec.rb b/spec/multiple_pages_spec.rb new file mode 100644 index 0000000..5875e00 --- /dev/null +++ b/spec/multiple_pages_spec.rb @@ -0,0 +1,55 @@ +require 'spec_helper' +require 'test_models' + +describe Scrapify do + + before do + page1 = "http://www.magazines.com/pages/1" + FakeWeb.register_uri :get, page1, + :body => <<-HTML + + HTML + page2 = "http://www.magazines.com/pages/2" + FakeWeb.register_uri :get, page2, + :body => <<-HTML + + HTML + page3 = "http://www.magazines.com/pages/3" + FakeWeb.register_uri :get, page3, + :body => <<-HTML + + HTML + end + + it "should crawl and fetch data from multiple pages" do + Magazine.all.to_json.should == [ + {issue: 'i1', title: 'title1'}, + {issue: 'i2', title: 'title2'}, + {issue: 'i3', title: 'title3'}, + {issue: 'i4', title: 'title4'}, + {issue: 'i5', title: 'title5'}, + {issue: 'i6', title: 'title6'}, + {issue: 'i7', title: 'title7'}, + {issue: 'i8', title: 'title8'}, + {issue: 'i9', title: 'title9'} + ].to_json + end +end \ No newline at end of file diff --git a/spec/test_models.rb b/spec/test_models.rb index 93cc3cd..2433c33 100644 --- a/spec/test_models.rb +++ b/spec/test_models.rb @@ -1,2 +1,3 @@ -require 'pizza' -require 'book' \ No newline at end of file +require 'models/pizza' +require 'models/book' +require 'models/magazine' \ No newline at end of file