Skip to content

Commit 65c0480

Browse files
committed
update v5 - had success in scraping
1 parent 4d1e244 commit 65c0480

20 files changed

+27588
-6144
lines changed

Gemfile

+4-4
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,13 @@ gem 'jbuilder', '~> 2.7'
2727
# Use Active Storage variant
2828
# gem 'image_processing', '~> 1.2'
2929

30+
gem 'json', '~> 2.5', '>= 2.5.1'
3031
gem 'kimurai'
32+
gem 'net-http'
33+
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
3134
gem 'open-uri'
3235
gem 'rexml', '~> 3.2', '>= 3.2.4'
33-
34-
gem 'chromedriver-helper'
35-
gem 'selenium-webdriver'
36+
gem 'webdrivers', '~> 4.6'
3637

3738
# Reduces boot times through caching; required in config/boot.rb
3839
gem 'bootsnap', '>= 1.4.4', require: false
@@ -54,4 +55,3 @@ end
5455

5556
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem
5657
gem 'tzinfo-data', platforms: %i[mingw mswin x64_mingw jruby]
57-
gem 'webdrivers'

Gemfile.lock

+13-9
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,6 @@ GEM
6262
zeitwerk (~> 2.3)
6363
addressable (2.8.0)
6464
public_suffix (>= 2.0.2, < 5.0)
65-
archive-zip (0.12.0)
66-
io-like (~> 0.3.0)
6765
bindex (0.8.1)
6866
bootsnap (1.7.5)
6967
msgpack (~> 1.0)
@@ -81,9 +79,6 @@ GEM
8179
capybara (>= 2.4.4, < 4)
8280
mechanize (~> 2.7.0)
8381
childprocess (3.0.0)
84-
chromedriver-helper (2.1.1)
85-
archive-zip (~> 0.10)
86-
nokogiri (~> 1.8)
8782
chronic (0.10.2)
8883
cliver (0.3.2)
8984
coderay (1.1.3)
@@ -102,9 +97,10 @@ GEM
10297
domain_name (~> 0.5)
10398
i18n (1.8.10)
10499
concurrent-ruby (~> 1.0)
105-
io-like (0.3.1)
100+
io-wait (0.1.0)
106101
jbuilder (2.11.2)
107102
activesupport (>= 5.0.0)
103+
json (2.5.1)
108104
kimurai (1.4.0)
109105
activesupport
110106
capybara (>= 2.15, < 4.0)
@@ -147,9 +143,15 @@ GEM
147143
minitest (5.14.4)
148144
msgpack (1.4.2)
149145
murmurhash3 (0.1.6)
146+
net-http (0.1.1)
147+
net-protocol
148+
uri
150149
net-http-digest_auth (1.4.1)
151150
net-http-persistent (4.0.1)
152151
connection_pool (~> 2.2)
152+
net-protocol (0.1.1)
153+
io-wait
154+
timeout
153155
nio4r (2.5.7)
154156
nokogiri (1.11.7-x86_64-linux)
155157
racc (~> 1.4)
@@ -237,6 +239,7 @@ GEM
237239
tilt (2.0.10)
238240
time (0.1.0)
239241
date
242+
timeout (0.1.1)
240243
turbolinks (5.2.1)
241244
turbolinks-source (~> 5.2)
242245
turbolinks-source (5.2.0)
@@ -277,22 +280,23 @@ PLATFORMS
277280
DEPENDENCIES
278281
bootsnap (>= 1.4.4)
279282
byebug
280-
chromedriver-helper
281283
jbuilder (~> 2.7)
284+
json (~> 2.5, >= 2.5.1)
282285
kimurai
283286
listen (~> 3.3)
287+
net-http
288+
nokogiri (~> 1.11, >= 1.11.7)
284289
open-uri
285290
pg (~> 1.1)
286291
puma (~> 5.0)
287292
rails (~> 6.1.3, >= 6.1.3.2)
288293
rexml (~> 3.2, >= 3.2.4)
289294
sass-rails (>= 6)
290-
selenium-webdriver
291295
spring
292296
turbolinks (~> 5)
293297
tzinfo-data
294298
web-console (>= 4.1.0)
295-
webdrivers
299+
webdrivers (~> 4.6)
296300
webpacker (~> 5.0)
297301

298302
RUBY VERSION
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Place all the styles related to the WebScrappers controller here.
2+
// They will automatically be included in application.css.
3+
// You can use Sass (SCSS) here: https://sass-lang.com/

app/controllers/products_controller.rb

+1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def create
4444
else
4545
format.html { render :new, status: :unprocessable_entity }
4646
format.json { render json: @product.errors, status: :unprocessable_entity }
47+
render :new
4748
end
4849
end
4950
end

app/controllers/web_scraper_controller.rb

+2-12
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,5 @@ def index
33
end
44

55
def new
6-
WebScraper.crawl!
7-
redirect_to root_path, notice: 'Results stored in results.json file.'
8-
end
9-
10-
def download
11-
send_file(
12-
"#{Rails.root}/results.json",
13-
filename: "scraping_data.json",
14-
type: "application/json"
15-
)
16-
end
17-
end
6+
WebScrapper.crawl!
7+
end
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
class WebScrappersController < ApplicationController
2+
def index
3+
end
4+
end

app/helpers/web_scrappers_helper.rb

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
module WebScrappersHelper
2+
end

app/models/github_spider.rb

+10-15
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,18 @@
1+
require 'kimurai'
2+
require 'webdrivers'
3+
14
class Spider < Kimurai::Base
5+
@name = ''
6+
@start_urls = []
27
@engine = :selenium_chrome
3-
@start_urls = ["https://pcpartpicker.com/products/motherboard/"]
4-
5-
def parse(response, url:, data: {})
6-
# Process request to `parse_product` method with `https://example.com/some_product` url:
7-
request_to :parse_product, url: "https://pcpartpicker.com/products/motherboard/"
8-
end
9-
10-
def parse_product(response, url:, data: {})
11-
puts "From page https://pcpartpicker.com/products/motherboard/ !"
8+
@config = {
9+
user_agent: "Chrome/68.0.3440.84"
10+
}
11+
def self.process()
12+
self.crawl!
1213
end
1314
end
1415

15-
16-
17-
18-
19-
# product spider
20-
2116
require 'kimurai'
2217

2318
class ProductsSpider < Kimurai::Base

app/models/products_spider.rb

+20-28
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,33 @@
1-
2-
# amazon_spider.rb
31
require 'kimurai'
2+
require 'net/http'
3+
require 'open-uri'
4+
require 'webdrivers'
45

5-
class AmazonSpider < Kimurai::Base
6-
@name = "amazon_spider"
6+
class ProductsSpider < Kimurai::Base
7+
@name = 'products_spider'
78
@engine = :selenium_chrome
8-
@start_urls = ["https://www.amazon.com/"]
9+
@start_urls = ['https://www.amazon.com/s?k=motherboard']
10+
@config = {
11+
user_agent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
12+
before_request: { delay: 4..7 }
13+
}
914

1015
def parse(response, url:, data: {})
11-
browser.fill_in "field-keywords", with: "Web Scraping Books"
12-
browser.click_on "Go"
13-
14-
# Walk through pagination and collect products urls:
15-
urls = []
16-
loop do
17-
response = browser.current_response
18-
response.xpath("//li//a[contains(@class, 's-access-detail-page')]").each do |a|
19-
urls << a[:href].sub(/ref=.+/, "")
20-
end
21-
22-
browser.find(:xpath, "//a[@id='pagnNextLink']", wait: 1).click rescue break
16+
response.xpath("/html/body/div[1]/div[2]/div[1]/div/div[1]").each do |a|
17+
request_to :parse_repo_page, url: absolute_url(a[:href], base: url)
2318
end
2419

25-
# Process all collected urls concurrently within 3 threads:
26-
in_parallel(:parse_book_page, urls, threads: 3)
20+
if next_page = response.at_xpath("/html/body/div[1]/div[2]/div[1]/div/div[1]/div/span[3]/div[2]/div[23]/span/div/div/ul/li[7]")
21+
request_to :parse, url: absolute_url(next_page[:href], base: url)
22+
end
2723
end
2824

29-
def parse_book_page(response, url:, data: {})
25+
def parse_repo_page(response, url:, data: {})
3026
item = {}
3127

32-
item[:title] = response.xpath("//h1/span[@id]").text.squish
33-
item[:url] = url
34-
item[:price] = response.xpath("(//span[contains(@class, 'a-color-price')])[1]").text.squish.presence
35-
item[:publisher] = response.xpath("//h2[text()='Product details']/following::b[text()='Publisher:']/following-sibling::text()[1]").text.squish.presence
36-
37-
save_to "books.json", item, format: :pretty_json
28+
item[:item_name] = response.xpath("/html/body/div[1]/div[2]/div[1]/div/div[1]/div/span[3]/div[2]/div[2]/div/span/div/div/div[2]/div[2]/div/div/div[1]/h2").text
29+
item[:item_description] = response.xpath("/html/body/div[1]/div[2]/div[1]/div/div[1]/div/span[3]/div[2]/div[2]/div/span/div/div/div[2]/div[2]/div/div/div[3]/div[1]/div/div[1]/div/a").text
30+
31+
save_to "results.json", item, format: :pretty_json
3832
end
3933
end
40-
41-
AmazonSpider.crawl!

app/models/web_scrapper.rb

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
require 'kimurai'
2+
3+
class WebScrapper < Kimurai::Base
4+
@name = "web_scrapper_spider"
5+
@engine = :selenium_chrome
6+
@start_urls = ["https://metaruby.com/"]
7+
@config = {
8+
user_agent: "Chrome/68.0.3440.84"
9+
}
10+
11+
attr_accessor :blogs
12+
13+
def parse(response, url:, data: { })
14+
@blogs = []
15+
16+
# Get all rows inside the table using XPATH
17+
posts_headers_path = "//table[@class='topic-list ember-view']//tbody//tr"
18+
count = response.xpath(posts_headers_path).count
19+
20+
loop do
21+
# Scroll until it reaches the end.
22+
browser.execute_script("window.scrollBy(0,10000)") ; sleep 2
23+
response = browser.current_response
24+
25+
new_count = response.xpath(posts_headers_path).count
26+
if count == new_count
27+
# Parse & store the data.
28+
parse_data(response)
29+
logger.info "> Pagination is done" and break
30+
else
31+
count = new_count
32+
logger.info "> Continue scrolling, current count is #{count}..."
33+
end
34+
end
35+
36+
logger.info "> Data saved to results.json"
37+
end
38+
39+
def parse_data(response)
40+
response.xpath("//table[@class='topic-list ember-view']//tbody//tr").each do |tr|
41+
scrapped_data = {
42+
title: tr.at('td[1]//span').text,
43+
category: tr.at('td[1]//div//span').text,
44+
date: tr.at('td[3]').text.strip
45+
}
46+
blogs << scrapped_data
47+
save_to "results.json", scrapped_data.as_json, format: :json
48+
end
49+
end
50+
end
+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<h1>WebScrappers#index</h1>
2+
<p>Find me in app/views/web_scrappers/index.html.erb</p>
3+
4+
5+
<%= link_to 'Start Scrap', new_web_scrapper_path %>

config/routes.rb

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# frozen_string_literal: true
22

33
Rails.application.routes.draw do
4+
45
resources :products do
56
match '/scrape', to: 'products#scrape', via: :post, on: :collection
67
end

dist/index.js

+56
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/index.js.map

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/interface.js

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/interfaces.js.map

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

es6.js

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
const pcp3 = require("/home/jarenc/Rails_Activities/web_scraper/dist")
2+
3+
async function returnParts() {
4+
let parts = await pcp3.getPartsList('test')
5+
console.log(parts)
6+
}
7+
returnParts()

0 commit comments

Comments
 (0)