Skip to content

Commit 65c0480

Browse files
committed
update v5 - had success in scraping
1 parent 4d1e244 commit 65c0480

20 files changed

+27588
-6144
lines changed

Gemfile

+4-4
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,13 @@ gem 'jbuilder', '~> 2.7'
2727
# Use Active Storage variant
2828
# gem 'image_processing', '~> 1.2'
2929

30+
gem 'json', '~> 2.5', '>= 2.5.1'
3031
gem 'kimurai'
32+
gem 'net-http'
33+
gem 'nokogiri', '~> 1.11', '>= 1.11.7'
3134
gem 'open-uri'
3235
gem 'rexml', '~> 3.2', '>= 3.2.4'
33-
34-
gem 'chromedriver-helper'
35-
gem 'selenium-webdriver'
36+
gem 'webdrivers', '~> 4.6'
3637

3738
# Reduces boot times through caching; required in config/boot.rb
3839
gem 'bootsnap', '>= 1.4.4', require: false
@@ -54,4 +55,3 @@ end
5455

5556
# Windows does not include zoneinfo files, so bundle the tzinfo-data gem
5657
gem 'tzinfo-data', platforms: %i[mingw mswin x64_mingw jruby]
57-
gem 'webdrivers'

Gemfile.lock

+13-9
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,6 @@ GEM
6262
zeitwerk (~> 2.3)
6363
addressable (2.8.0)
6464
public_suffix (>= 2.0.2, < 5.0)
65-
archive-zip (0.12.0)
66-
io-like (~> 0.3.0)
6765
bindex (0.8.1)
6866
bootsnap (1.7.5)
6967
msgpack (~> 1.0)
@@ -81,9 +79,6 @@ GEM
8179
capybara (>= 2.4.4, < 4)
8280
mechanize (~> 2.7.0)
8381
childprocess (3.0.0)
84-
chromedriver-helper (2.1.1)
85-
archive-zip (~> 0.10)
86-
nokogiri (~> 1.8)
8782
chronic (0.10.2)
8883
cliver (0.3.2)
8984
coderay (1.1.3)
@@ -102,9 +97,10 @@ GEM
10297
domain_name (~> 0.5)
10398
i18n (1.8.10)
10499
concurrent-ruby (~> 1.0)
105-
io-like (0.3.1)
100+
io-wait (0.1.0)
106101
jbuilder (2.11.2)
107102
activesupport (>= 5.0.0)
103+
json (2.5.1)
108104
kimurai (1.4.0)
109105
activesupport
110106
capybara (>= 2.15, < 4.0)
@@ -147,9 +143,15 @@ GEM
147143
minitest (5.14.4)
148144
msgpack (1.4.2)
149145
murmurhash3 (0.1.6)
146+
net-http (0.1.1)
147+
net-protocol
148+
uri
150149
net-http-digest_auth (1.4.1)
151150
net-http-persistent (4.0.1)
152151
connection_pool (~> 2.2)
152+
net-protocol (0.1.1)
153+
io-wait
154+
timeout
153155
nio4r (2.5.7)
154156
nokogiri (1.11.7-x86_64-linux)
155157
racc (~> 1.4)
@@ -237,6 +239,7 @@ GEM
237239
tilt (2.0.10)
238240
time (0.1.0)
239241
date
242+
timeout (0.1.1)
240243
turbolinks (5.2.1)
241244
turbolinks-source (~> 5.2)
242245
turbolinks-source (5.2.0)
@@ -277,22 +280,23 @@ PLATFORMS
277280
DEPENDENCIES
278281
bootsnap (>= 1.4.4)
279282
byebug
280-
chromedriver-helper
281283
jbuilder (~> 2.7)
284+
json (~> 2.5, >= 2.5.1)
282285
kimurai
283286
listen (~> 3.3)
287+
net-http
288+
nokogiri (~> 1.11, >= 1.11.7)
284289
open-uri
285290
pg (~> 1.1)
286291
puma (~> 5.0)
287292
rails (~> 6.1.3, >= 6.1.3.2)
288293
rexml (~> 3.2, >= 3.2.4)
289294
sass-rails (>= 6)
290-
selenium-webdriver
291295
spring
292296
turbolinks (~> 5)
293297
tzinfo-data
294298
web-console (>= 4.1.0)
295-
webdrivers
299+
webdrivers (~> 4.6)
296300
webpacker (~> 5.0)
297301

298302
RUBY VERSION
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
// Place all the styles related to the WebScrappers controller here.
2+
// They will automatically be included in application.css.
3+
// You can use Sass (SCSS) here: https://sass-lang.com/

app/controllers/products_controller.rb

+1
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ def create
4444
else
4545
format.html { render :new, status: :unprocessable_entity }
4646
format.json { render json: @product.errors, status: :unprocessable_entity }
47+
render :new
4748
end
4849
end
4950
end

app/controllers/web_scraper_controller.rb

+2-12
Original file line numberDiff line numberDiff line change
@@ -3,15 +3,5 @@ def index
33
end
44

55
def new
6-
WebScraper.crawl!
7-
redirect_to root_path, notice: 'Results stored in results.json file.'
8-
end
9-
10-
def download
11-
send_file(
12-
"#{Rails.root}/results.json",
13-
filename: "scraping_data.json",
14-
type: "application/json"
15-
)
16-
end
17-
end
6+
WebScrapper.crawl!
7+
end
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
class WebScrappersController < ApplicationController
2+
def index
3+
end
4+
end

app/helpers/web_scrappers_helper.rb

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
module WebScrappersHelper
2+
end

app/models/github_spider.rb

+10-15
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,18 @@
1+
require 'kimurai'
2+
require 'webdrivers'
3+
14
class Spider < Kimurai::Base
5+
@name = ''
6+
@start_urls = []
27
@engine = :selenium_chrome
3-
@start_urls = ["https://pcpartpicker.com/products/motherboard/"]
4-
5-
def parse(response, url:, data: {})
6-
# Process request to `parse_product` method with `https://example.com/some_product` url:
7-
request_to :parse_product, url: "https://pcpartpicker.com/products/motherboard/"
8-
end
9-
10-
def parse_product(response, url:, data: {})
11-
puts "From page https://pcpartpicker.com/products/motherboard/ !"
8+
@config = {
9+
user_agent: "Chrome/68.0.3440.84"
10+
}
11+
def self.process()
12+
self.crawl!
1213
end
1314
end
1415

15-
16-
17-
18-
19-
# product spider
20-
2116
require 'kimurai'
2217

2318
class ProductsSpider < Kimurai::Base

app/models/products_spider.rb

+20-28
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,33 @@
1-
2-
# amazon_spider.rb
31
require 'kimurai'
2+
require 'net/http'
3+
require 'open-uri'
4+
require 'webdrivers'
45

5-
class AmazonSpider < Kimurai::Base
6-
@name = "amazon_spider"
6+
class ProductsSpider < Kimurai::Base
7+
@name = 'products_spider'
78
@engine = :selenium_chrome
8-
@start_urls = ["https://www.amazon.com/"]
9+
@start_urls = ['https://www.amazon.com/s?k=motherboard']
10+
@config = {
11+
user_agent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36',
12+
before_request: { delay: 4..7 }
13+
}
914

1015
def parse(response, url:, data: {})
11-
browser.fill_in "field-keywords", with: "Web Scraping Books"
12-
browser.click_on "Go"
13-
14-
# Walk through pagination and collect products urls:
15-
urls = []
16-
loop do
17-
response = browser.current_response
18-
response.xpath("//li//a[contains(@class, 's-access-detail-page')]").each do |a|
19-
urls << a[:href].sub(/ref=.+/, "")
20-
end
21-
22-
browser.find(:xpath, "//a[@id='pagnNextLink']", wait: 1).click rescue break
16+
response.xpath("/html/body/div[1]/div[2]/div[1]/div/div[1]").each do |a|
17+
request_to :parse_repo_page, url: absolute_url(a[:href], base: url)
2318
end
2419

25-
# Process all collected urls concurrently within 3 threads:
26-
in_parallel(:parse_book_page, urls, threads: 3)
20+
if next_page = response.at_xpath("/html/body/div[1]/div[2]/div[1]/div/div[1]/div/span[3]/div[2]/div[23]/span/div/div/ul/li[7]")
21+
request_to :parse, url: absolute_url(next_page[:href], base: url)
22+
end
2723
end
2824

29-
def parse_book_page(response, url:, data: {})
25+
def parse_repo_page(response, url:, data: {})
3026
item = {}
3127

32-
item[:title] = response.xpath("//h1/span[@id]").text.squish
33-
item[:url] = url
34-
item[:price] = response.xpath("(//span[contains(@class, 'a-color-price')])[1]").text.squish.presence
35-
item[:publisher] = response.xpath("//h2[text()='Product details']/following::b[text()='Publisher:']/following-sibling::text()[1]").text.squish.presence
36-
37-
save_to "books.json", item, format: :pretty_json
28+
item[:item_name] = response.xpath("/html/body/div[1]/div[2]/div[1]/div/div[1]/div/span[3]/div[2]/div[2]/div/span/div/div/div[2]/div[2]/div/div/div[1]/h2").text
29+
item[:item_description] = response.xpath("/html/body/div[1]/div[2]/div[1]/div/div[1]/div/span[3]/div[2]/div[2]/div/span/div/div/div[2]/div[2]/div/div/div[3]/div[1]/div/div[1]/div/a").text
30+
31+
save_to "results.json", item, format: :pretty_json
3832
end
3933
end
40-
41-
AmazonSpider.crawl!

app/models/web_scrapper.rb

+50
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
require 'kimurai'
2+
3+
class WebScrapper < Kimurai::Base
4+
@name = "web_scrapper_spider"
5+
@engine = :selenium_chrome
6+
@start_urls = ["https://metaruby.com/"]
7+
@config = {
8+
user_agent: "Chrome/68.0.3440.84"
9+
}
10+
11+
attr_accessor :blogs
12+
13+
def parse(response, url:, data: { })
14+
@blogs = []
15+
16+
# Get all rows inside the table using XPATH
17+
posts_headers_path = "//table[@class='topic-list ember-view']//tbody//tr"
18+
count = response.xpath(posts_headers_path).count
19+
20+
loop do
21+
# Scroll until it reaches the end.
22+
browser.execute_script("window.scrollBy(0,10000)") ; sleep 2
23+
response = browser.current_response
24+
25+
new_count = response.xpath(posts_headers_path).count
26+
if count == new_count
27+
# Parse & store the data.
28+
parse_data(response)
29+
logger.info "> Pagination is done" and break
30+
else
31+
count = new_count
32+
logger.info "> Continue scrolling, current count is #{count}..."
33+
end
34+
end
35+
36+
logger.info "> Data saved to results.json"
37+
end
38+
39+
def parse_data(response)
40+
response.xpath("//table[@class='topic-list ember-view']//tbody//tr").each do |tr|
41+
scrapped_data = {
42+
title: tr.at('td[1]//span').text,
43+
category: tr.at('td[1]//div//span').text,
44+
date: tr.at('td[3]').text.strip
45+
}
46+
blogs << scrapped_data
47+
save_to "results.json", scrapped_data.as_json, format: :json
48+
end
49+
end
50+
end
+5
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
<h1>WebScrappers#index</h1>
2+
<p>Find me in app/views/web_scrappers/index.html.erb</p>
3+
4+
5+
<%= link_to 'Start Scrap', new_web_scrapper_path %>

config/routes.rb

+1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# frozen_string_literal: true
22

33
Rails.application.routes.draw do
4+
45
resources :products do
56
match '/scrape', to: 'products#scrape', via: :post, on: :collection
67
end

dist/index.js

+56
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/index.js.map

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/interface.js

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dist/interfaces.js.map

+1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

es6.js

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
const pcp3 = require("/home/jarenc/Rails_Activities/web_scraper/dist")
2+
3+
async function returnParts() {
4+
let parts = await pcp3.getPartsList('test')
5+
console.log(parts)
6+
}
7+
returnParts()

0 commit comments

Comments
 (0)