Skip to content

Commit

Permalink
#29 html accepts array of pages
Browse files Browse the repository at this point in the history
  • Loading branch information
sathish316 committed Jul 17, 2012
1 parent 07170fe commit f5186ce
Show file tree
Hide file tree
Showing 7 changed files with 87 additions and 19 deletions.
1 change: 1 addition & 0 deletions lib/scrapify.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
require 'active_support/core_ext/class/attribute_accessors'
require 'active_support/core_ext/hash/keys'
require 'active_support/core_ext/object/blank'
require 'active_support/core_ext/array/wrap'
require 'nokogiri'
require 'uri'
require 'net/http'
Expand Down
36 changes: 19 additions & 17 deletions lib/scrapify/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,26 +38,28 @@ def attribute(name, options={}, &block)
to_array = options[:array]
define_singleton_method "#{name}_values" do
values = []
self.doc = parse_html(url)
while self.doc
page_values = self.doc.send(parser, selector).map do |element|
if block
yield element
else
content = element.content
if matcher
match_data = content.scan(matcher).map &:first
options[:array] ? match_data : match_data.first
Array.wrap(url).each do |uri|
self.doc = parse_html(uri)
while self.doc
page_values = self.doc.send(parser, selector).map do |element|
if block
yield element
else
content.strip
content = element.content
if matcher
match_data = content.scan(matcher).map &:first
options[:array] ? match_data : match_data.first
else
content.strip
end
end
end
end
values += page_values
if next_page_selector and (next_page_url = self.doc.send(next_page_selector.keys.first, next_page_selector.values.first).first) and !next_page_url.content.blank?
self.doc = parse_html(next_page_url.content)
else
self.doc = nil
values += page_values
if next_page_selector and (next_page_url = self.doc.send(next_page_selector.keys.first, next_page_selector.values.first).first) and !next_page_url.content.blank?
self.doc = parse_html(next_page_url.content)
else
self.doc = nil
end
end
end
values
Expand Down
File renamed without changes.
9 changes: 9 additions & 0 deletions spec/models/magazine.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Spec fixture model: a magazine listing spread over three separate pages.
class Magazine
  include Scrapify::Base

  # `html` is handed an array here (one entry per page) rather than a
  # single URL; the multiple-pages spec expects records from all three.
  html %w[
    http://www.magazines.com/pages/1
    http://www.magazines.com/pages/2
    http://www.magazines.com/pages/3
  ]

  # Each attribute is read from the elements carrying the CSS class of the
  # same name (".issue", ".title").
  [:issue, :title].each do |field|
    attribute field, css: ".#{field}"
  end

  # NOTE(review): presumably marks :issue as the identifying attribute of a
  # record -- confirm against Scrapify::Base.
  key :issue
end
File renamed without changes.
55 changes: 55 additions & 0 deletions spec/multiple_pages_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
require 'spec_helper'
require 'test_models'

# Checks that a model declared with an array of page URLs (Magazine)
# returns the records of every page, in page order.
describe Scrapify do

  # Serve three canned pages through FakeWeb. Page N carries three
  # issue/title pairs, so pages 1..3 together hold i1..i9 / title1..title9.
  before do
    (1..9).each_slice(3).with_index(1) do |numbers, page|
      rows = numbers.map do |n|
        %(<div class="issue">i#{n}</div><div class="title">title#{n}</div>)
      end
      # Same markup as a hand-written fixture: one <ul> with a single <li>
      # wrapping the rows, one element per line, trailing newline included.
      markup = ["<ul>", "<li>", *rows, "</li>", "</ul>"].join("\n") + "\n"

      FakeWeb.register_uri :get, "http://www.magazines.com/pages/#{page}", :body => markup
    end
  end

  it "should crawl and fetch data from multiple pages" do
    # Nine records in page order; compared as JSON, like the actual value.
    expected = (1..9).map { |n| {issue: "i#{n}", title: "title#{n}"} }

    Magazine.all.to_json.should == expected.to_json
  end
end
5 changes: 3 additions & 2 deletions spec/test_models.rb
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
require 'pizza'
require 'book'
require 'models/pizza'
require 'models/book'
require 'models/magazine'

0 comments on commit f5186ce

Please sign in to comment.