1
-
2
- # amazon_spider.rb
3
1
require 'kimurai'
2
+ require 'net/http'
3
+ require 'open-uri'
4
+ require 'webdrivers'
4
5
5
- class AmazonSpider < Kimurai ::Base
6
- @name = "amazon_spider"
6
+ class ProductsSpider < Kimurai ::Base
7
+ @name = 'products_spider'
7
8
@engine = :selenium_chrome
8
- @start_urls = [ "https://www.amazon.com/" ]
9
+ @start_urls = [ 'https://www.amazon.com/s?k=motherboard' ]
10
+ @config = {
11
+ user_agent : 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36' ,
12
+ before_request : { delay : 4 ..7 }
13
+ }
9
14
10
15
def parse ( response , url :, data : { } )
11
- browser . fill_in "field-keywords" , with : "Web Scraping Books"
12
- browser . click_on "Go"
13
-
14
- # Walk through pagination and collect products urls:
15
- urls = [ ]
16
- loop do
17
- response = browser . current_response
18
- response . xpath ( "//li//a[contains(@class, 's-access-detail-page')]" ) . each do |a |
19
- urls << a [ :href ] . sub ( /ref=.+/ , "" )
20
- end
21
-
22
- browser . find ( :xpath , "//a[@id='pagnNextLink']" , wait : 1 ) . click rescue break
16
+ response . xpath ( "/html/body/div[1]/div[2]/div[1]/div/div[1]" ) . each do |a |
17
+ request_to :parse_repo_page , url : absolute_url ( a [ :href ] , base : url )
23
18
end
24
19
25
- # Process all collected urls concurrently within 3 threads:
26
- in_parallel ( :parse_book_page , urls , threads : 3 )
20
+ if next_page = response . at_xpath ( "/html/body/div[1]/div[2]/div[1]/div/div[1]/div/span[3]/div[2]/div[23]/span/div/div/ul/li[7]" )
21
+ request_to :parse , url : absolute_url ( next_page [ :href ] , base : url )
22
+ end
27
23
end
28
24
29
- def parse_book_page ( response , url :, data : { } )
25
+ def parse_repo_page ( response , url :, data : { } )
30
26
item = { }
31
27
32
- item [ :title ] = response . xpath ( "//h1/span[@id]" ) . text . squish
33
- item [ :url ] = url
34
- item [ :price ] = response . xpath ( "(//span[contains(@class, 'a-color-price')])[1]" ) . text . squish . presence
35
- item [ :publisher ] = response . xpath ( "//h2[text()='Product details']/following::b[text()='Publisher:']/following-sibling::text()[1]" ) . text . squish . presence
36
-
37
- save_to "books.json" , item , format : :pretty_json
28
+ item [ :item_name ] = response . xpath ( "/html/body/div[1]/div[2]/div[1]/div/div[1]/div/span[3]/div[2]/div[2]/div/span/div/div/div[2]/div[2]/div/div/div[1]/h2" ) . text
29
+ item [ :item_description ] = response . xpath ( "/html/body/div[1]/div[2]/div[1]/div/div[1]/div/span[3]/div[2]/div[2]/div/span/div/div/div[2]/div[2]/div/div/div[3]/div[1]/div/div[1]/div/a" ) . text
30
+
31
+ save_to "results.json" , item , format : :pretty_json
38
32
end
39
33
end
40
-
41
- AmazonSpider . crawl!
0 commit comments