# session.rb (249 lines, 217 loc, 7.86 KB)
# Source: a fork of vifreefly/kimuraframework.
require 'capybara'
require 'nokogiri'
require 'json'
require_relative 'session/config'
module Capybara
class Session
# Spider object this session reports to. `visit` only adds its delay, retry,
# counting and logging behaviour when a spider is set; `logger` delegates to it.
attr_accessor :spider
# Keep the `visit` implementation that existed before this file redefines it,
# so the redefined `visit` below can delegate the actual navigation to it.
alias_method :original_visit, :visit
# Navigates to +visit_uri+.
#
# Without an attached spider this simply delegates to +original_visit+.
# With a spider it additionally:
#   * waits for +delay+ (see +process_delay+) before the request;
#   * applies the per-request options (see +check_request_options+)
#     unless +skip_request_options+ is true;
#   * bumps +driver.requests+ / +driver.responses+ and logs progress;
#   * handles errors according to +match_error?+:
#       - :to_skip errors are logged and +false+ is returned;
#       - :to_retry errors are retried up to +max_retries+ times, sleeping
#         15, 30, 45, ... seconds between attempts. Once the retries are used
#         up the error is re-raised, unless +skip_error_on_failure?+ allows
#         skipping it, in which case +false+ is returned;
#       - any other error is re-raised.
#
# @param visit_uri [String] address to open
# @param delay [Numeric, Range, nil] pause before the request; defaults to
#   config.before_request[:delay]
# @param skip_request_options [Boolean] skip +check_request_options+
# @param max_retries [Integer] retry attempts for :to_retry errors
# @return [Boolean] true when the page was requested successfully, false when
#   the request error was skipped (with a spider attached); otherwise whatever
#   +original_visit+ returns
def visit(visit_uri, delay: config.before_request[:delay], skip_request_options: false, max_retries: 3)
  return original_visit(visit_uri) unless spider

  process_delay(delay) if delay
  retries = 0
  sleep_interval = 0

  begin
    check_request_options(visit_uri) unless skip_request_options
    driver.requests += 1
    logger.info "Browser: started get request to: #{visit_uri}"
    spider.class.update(:visits, :requests) if spider.with_info
    original_visit(visit_uri)
  rescue => e
    if match_error?(e, type: :to_skip)
      logger.error "Browser: skip request error: #{e.inspect}, url: #{visit_uri}"
      spider.add_event(:requests_errors, e.inspect) if spider.with_info
      false
    elsif match_error?(e, type: :to_retry)
      logger.error "Browser: retry request error: #{e.inspect}, url: #{visit_uri}"
      spider.add_event(:requests_errors, e.inspect) if spider.with_info

      if (retries += 1) <= max_retries
        # Back off a little longer on every attempt: 15s, 30s, 45s, ...
        sleep_interval += 15
        logger.info "Browser: sleep #{sleep_interval} seconds and process retry № #{retries} to the url: #{visit_uri}"
        sleep sleep_interval
        retry
      else
        logger.error "Browser: all retries (#{retries - 1}) to the url #{visit_uri} are gone"
        raise e unless skip_error_on_failure?(e)

        # The error is configured to be skipped: report the failed visit the
        # same way the :to_skip branch does instead of leaking nil.
        false
      end
    else
      raise e
    end
  else
    driver.responses += 1
    logger.info "Browser: finished get request to: #{visit_uri}"
    spider.class.update(:visits, :responses) if spider.with_info
    driver.visited = true unless driver.visited
    true
  ensure
    # Runs after every attempt, successful or not.
    if spider.with_info
      logger.info "Info: visits: requests: #{spider.class.visits[:requests]}, responses: #{spider.class.visits[:responses]}"
    end

    if memory = driver.current_memory
      logger.debug "Browser: driver.current_memory: #{memory}"
    end
  end
end
# Quits the current driver (if any) and forgets it.
#
# A Net::ReadTimeout raised by the first `quit` (seen with Selenium-like
# drivers) is answered with exactly one more `quit` attempt; an error from
# that second attempt propagates and leaves @driver untouched.
# Returns whatever the logger call returns.
def destroy_driver!
  return logger.warn("Browser: driver #{mode} is not present") unless @driver

  begin
    @driver.quit
  rescue Net::ReadTimeout
    # handle Net::ReadTimeout error for Selenium like drivers
    @driver.quit
  end

  @driver = nil
  logger.info("Browser: driver #{mode} has been destroyed")
end
# Restarts the browser behind this session and logs the new pid/port.
#
# Poltergeist-like modes restart the existing browser in place and zero the
# request/response counters; every other mode destroys the driver and calls
# `driver` again (presumably the accessor builds a fresh driver on demand --
# confirm against Capybara::Session#driver).
def restart!
  restart_in_place = mode.match?(/poltergeist/)

  if restart_in_place
    @driver.browser.restart
    @driver.requests = 0
    @driver.responses = 0
  else
    destroy_driver!
    driver
  end

  logger.info("Browser: driver has been restarted: name: #{mode}, pid: #{driver.pid}, port: #{driver.port}")
end
# Parses the body of the current page.
#
# @param response_type [Symbol] :html (default) or :json
# @return [Object, nil] the result of Nokogiri::HTML for :html, the result of
#   JSON.parse for :json, nil for any other +response_type+
#
# For :html the encoding handed to Nokogiri depends on config.encoding:
#   * falsy  -- no encoding is passed;
#   * :auto  -- the charset is sniffed from the first <meta ... charset=...>
#               in the markup (nil when none is found);
#   * other  -- the configured value is passed as is.
def current_response(response_type = :html)
  case response_type
  when :html
    # Ask the driver for the markup once, so sniffing and parsing see the
    # same document.
    html = body
    encoding = config.encoding

    if encoding == :auto
      # Sniff on a copy: force_encoding relabels its receiver in place (and
      # raises on a frozen string), which must not leak into the string that
      # is parsed below. ISO-8859-1 maps every byte, so the transcoding to
      # UTF-8 cannot fail whatever the real encoding is.
      charset = html.dup.force_encoding("ISO-8859-1").encode("UTF-8")[/<meta.*?charset=["']?([\w+\-]*)/i, 1]
      # An empty capture (e.g. charset="") carries no information.
      charset = nil if charset && charset.empty?
      Nokogiri::HTML(html, nil, charset)
    elsif encoding
      Nokogiri::HTML(html, nil, encoding)
    else
      Nokogiri::HTML(html)
    end
  when :json
    JSON.parse(body)
  end
end
###
# Runs the given block inside a new browser tab/window, then closes that tab
# and returns to the tab that was active before (the switching itself is done
# by `within_window`).
#
# The new tab is obtained in one of two ways; `action` wins when both are given:
#
# Usage (url): a new tab is opened and `url` is visited in it
#   browser.within_new_window_by(url: "https://google.com") do
#     # do some stuff; the tab is closed automatically afterwards
#   end
#
# Usage (action): the tab is opened by some action, for example by clicking
# on a particular element
#   action = -> { browser.find("//some/element/path").click }
#   browser.within_new_window_by(action: action) do
#     # do some stuff; the tab is closed automatically afterwards
#   end
#
# The tab is closed even when the block (or the initial visit) raises, so a
# failing block no longer leaves a stray window behind.
#
# @param action [#call, nil] callable expected to open a new window
# @param url [String, nil] address to open in a new window
# @return [Object, nil] the value of the block; nil when neither `action`
#   nor `url` is given (the block is not run in that case)
def within_new_window_by(action: nil, url: nil)
  return unless action || url

  new_window = action ? window_opened_by { action.call } : open_new_window

  within_window(new_window) do
    begin
      visit(url) unless action
      yield
    ensure
      current_window.close
    end
  end
end
###
# Scrolls the current page down to the end of the document.
#
# The target offset is the document's own scroll height (the larger of the
# <body> and root element values) rather than a fixed pixel amount, so pages
# taller than a hard-coded offset are scrolled all the way down too.
def scroll_to_bottom
  execute_script(
    "var b = document.body, e = document.documentElement; " \
    "window.scrollTo(0, Math.max(b ? b.scrollHeight : 0, e ? e.scrollHeight : 0));"
  )
end
private
# Tells whether +e+ may be swallowed once its retries are used up.
#
# True when config.retry_request_errors contains a Hash entry whose
# :skip_on_failure value is truthy and whose :error is among the ancestors of
# e's class. Entries that are not hashes never qualify.
def skip_error_on_failure?(e)
  lineage = e.class.ancestors

  config.retry_request_errors.any? do |rule|
    next false unless rule.kind_of?(Hash)

    rule[:skip_on_failure] && lineage.include?(rule[:error])
  end
end
# Checks +e+ against one of the configured error lists.
#
# type :to_retry reads config.retry_request_errors, type :to_skip reads
# config.skip_request_errors. A list entry is either
#   * a class/module -- matches when it is among the ancestors of e's class, or
#   * a Hash -- matches when its :error is among those ancestors and, if a
#     :message is present, e.message matches it (Regexp => match?, anything
#     else => include?).
def match_error?(e, type:)
  rules =
    if type == :to_retry
      config.retry_request_errors
    elsif type == :to_skip
      config.skip_request_errors
    end
  lineage = e.class.ancestors

  rules.any? do |rule|
    next lineage.include?(rule) unless rule.kind_of?(Hash)

    class_matches = lineage.include?(rule[:error])
    pattern = rule[:message]
    next class_matches unless pattern.present?

    message_matches =
      if pattern.kind_of?(Regexp)
        e.message&.match?(pattern)
      else
        e.message&.include?(pattern)
      end
    message_matches && class_matches
  end
end
# Sleeps before a request.
#
# +delay+ is used as the number of seconds directly, unless it is a Range, in
# which case a random value is drawn from it with rand. The chosen interval
# is logged (rounded to two digits) before sleeping.
def process_delay(delay)
  pause = delay.instance_of?(Range) ? rand(delay) : delay
  amount = pause.round(2)
  unit = 'second'.pluralize(pause)

  logger.debug("Browser: sleep #{amount} #{unit} before request...")
  sleep(pause)
end
# Applies the per-request browser options from `config` right before
# +url_to_visit+ is requested, in this order: driver restart limits, cookies,
# user agent, proxy.
#
# @param url_to_visit [String] the address about to be requested; Selenium
#   modes may visit it first so that cookies can be set on a loaded page
def check_request_options(url_to_visit)
  restart_driver_if_limit_exceeded
  prepare_request_cookies(url_to_visit)
  rotate_request_user_agent
  rotate_request_proxy
end

# Restarts the driver when config.restart_if[:memory_limit] or
# config.restart_if[:requests_limit] is set and reached.
def restart_driver_if_limit_exceeded
  limits = config.restart_if

  if (memory_limit = limits[:memory_limit])
    memory = driver.current_memory
    # current_memory may be nil; the limit is skipped in that case
    if memory && memory >= memory_limit
      logger.warn "Browser: memory_limit #{memory_limit} of driver.current_memory (#{memory}) is exceeded (engine: #{mode})"
      restart!
    end
  end

  if (requests_limit = limits[:requests_limit])
    requests = driver.requests
    if requests >= requests_limit
      logger.warn "Browser: requests_limit #{requests_limit} of driver.requests (#{requests}) is exceeded (engine: #{mode})"
      restart!
    end
  end
end

# Handles the initial cookies plus the :clear_cookies and
# :clear_and_set_cookies options of config.before_request.
def prepare_request_cookies(url_to_visit)
  # (Selenium only) if config.cookies present and browser was just created,
  # visit url_to_visit first and only then set cookies:
  if driver.visited.nil? && config.cookies && mode.match?(/selenium/)
    visit(url_to_visit, skip_request_options: true)
    set_configured_cookies
  end

  if config.before_request[:clear_cookies]
    driver.clear_cookies
    logger.debug "Browser: cleared cookies before request"
  end

  if config.before_request[:clear_and_set_cookies]
    driver.clear_cookies

    # (Selenium only) if browser is not visited yet any page, visit url_to_visit
    # first and then set cookies (needs after browser restart):
    if driver.visited.nil? && mode.match?(/selenium/)
      visit(url_to_visit, skip_request_options: true)
    end

    set_configured_cookies
    logger.debug "Browser: cleared and set cookies before request"
  end
end

# Hands every cookie from config.cookies to the driver. Having no cookies
# configured is treated as an empty list instead of failing on nil.
def set_configured_cookies
  (config.cookies || []).each do |cookie|
    driver.set_cookie(cookie[:name], cookie[:value], cookie)
  end
end

# Sets a new User-Agent header (from the config.user_agent callable) when
# config.before_request[:change_user_agent] is enabled.
def rotate_request_user_agent
  return unless config.before_request[:change_user_agent]

  driver.add_header("User-Agent", config.user_agent.call)
  logger.debug "Browser: changed user_agent before request"
end

# Sets a new proxy when config.before_request[:change_proxy] is enabled. The
# config.proxy callable returns a colon-separated string whose parts are
# passed to driver.set_proxy as separate arguments.
def rotate_request_proxy
  return unless config.before_request[:change_proxy]

  proxy_string = config.proxy.call
  driver.set_proxy(*proxy_string.split(":"))
  logger.debug "Browser: changed proxy before request"
end
# Logger used for every "Browser: ..." message of this session; it is the
# logger of the attached spider, so a spider must be set before it is used.
def logger
  owner = spider
  owner.logger
end
end
end