Skip to content

Commit

Permalink
Merge pull request #193 from Progress1/col_upg
Browse files (browse the repository at this point in the history)
Update collectors 2 (Web collector: support for latest Selenium version)
  • Loading branch information
milankowww authored Nov 29, 2023
2 parents b915724 + 58929c1 commit 1b4da69
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 43 deletions.
13 changes: 9 additions & 4 deletions src/collectors/collectors/atom_collector.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
import datetime
import hashlib
import uuid
import traceback
import feedparser
import requests
from bs4 import BeautifulSoup
from dateutil.parser import parse

from .base_collector import BaseCollector
from managers import log_manager
from shared.schema.news_item import NewsItemData
from shared.schema.parameter import Parameter, ParameterType

@@ -29,6 +31,7 @@ def collect(self, source):
feed_url = source.parameter_values['ATOM_FEED_URL']
user_agent = source.parameter_values['USER_AGENT']
interval = source.parameter_values['REFRESH_INTERVAL']
log_manager.log_collector_activity("atom", source.name, "Starting collector for url: {}".format(feed_url))

proxies = {}
if 'PROXY_SERVER' in source.parameter_values:
@@ -49,16 +52,14 @@ def collect(self, source):

news_items = []

limit = BaseCollector.history(interval)
for feed_entry in feed['entries']:

limit = BaseCollector.history(interval)
published = feed_entry['updated']
published = parse(published, tzinfos=BaseCollector.timezone_info())

if str(published) > str(limit):

link_for_article = feed_entry['link']

log_manager.log_collector_activity("atom", source.name, "Processing entry [{}]".format(link_for_article))
if proxies:
page = requests.get(link_for_article, headers={'User-Agent': user_agent}, proxies=proxies)
else:
@@ -84,4 +85,8 @@ def collect(self, source):

BaseCollector.publish(news_items, source)
except Exception as error:
log_manager.log_collector_activity("atom", source.name, "ATOM collection exceptionally failed")
BaseCollector.print_exception(source, error)
log_manager.log_debug(traceback.format_exc())

log_manager.log_debug("{} collection finished.".format(self.type))
76 changes: 37 additions & 39 deletions src/collectors/collectors/web_collector.py
Original file line number Diff line number Diff line change
@@ -12,7 +14,9 @@
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options as ChromeOptions
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.firefox.service import Service as FirefoxService
from urllib.parse import urlparse
import os
import dateparser
@@ -137,17 +139,17 @@ def __find_element_by(driver, element_selector):

element = None
if prefix == 'id':
element = driver.find_element_by_id(selector)
element = driver.find_element(By.ID, selector)
if prefix == 'name':
element = driver.find_element_by_name(selector)
element = driver.find_element(By.NAME, selector)
elif prefix == 'xpath':
element = driver.find_element_by_xpath(selector)
element = driver.find_element(By.XPATH, selector)
elif prefix in [ 'tag_name', 'tag' ]:
element = driver.find_element_by_tag_name(selector)
element = driver.find_element(By.TAG_NAME, selector)
elif prefix in [ 'class_name', 'class' ]:
element = driver.find_element_by_class_name(selector)
element = driver.find_element(By.CLASS_NAME, selector)
elif prefix in [ 'css_selector', 'css' ]:
element = driver.find_element_by_css_selector(selector)
element = driver.find_element(By.CSS_SELECTOR, selector)

return element

@@ -177,17 +179,17 @@ def __find_elements_by(driver, element_selector):

elements = None
if prefix == 'id':
elements = [ driver.find_element_by_id(selector) ]
elements = [ driver.find_element(By.ID, selector) ]
if prefix == 'name':
elements = driver.find_elements_by_name(selector)
elements = driver.find_element(By.NAME, selector)
elif prefix == 'xpath':
elements = driver.find_elements_by_xpath(selector)
elements = driver.find_element(By.XPATH, selector)
elif prefix in [ 'tag_name', 'tag' ]:
elements = driver.find_elements_by_tag_name(selector)
elements = driver.find_elements(By.TAG_NAME, selector)
elif prefix in [ 'class_name', 'class' ]:
elements = driver.find_elements_by_class_name(selector)
elements = driver.find_element(By.CLASS_NAME, selector)
elif prefix in [ 'css_selector', 'css' ]:
elements = driver.find_elements_by_css_selector(selector)
elements = driver.find_elements(By.CSS_SELECTOR, selector)
return elements

@staticmethod
@@ -218,7 +220,6 @@ def __wait_for_new_tab(browser, timeout, current_tab):
browser.switch_to.window(tab)
return


def __close_other_tabs(self, browser, handle_to_keep, fallback_url):
try:
handles_to_close = copy.copy(browser.window_handles)
@@ -369,23 +370,21 @@ def __get_headless_driver_chrome(self):
chrome_options.add_argument("--headless")
chrome_options.add_argument('--ignore-certificate-errors')
chrome_options.add_argument('--incognito')
chrome_service = ChromeService(executable_path=chrome_driver_executable)
if self.user_agent:
chrome_options.add_argument('user-agent=' + self.user_agent)
if self.tor_service.lower() == 'yes':
socks_proxy = "socks5://127.0.0.1:9050"
chrome_options.add_argument('--proxy-server={}'.format(socks_proxy))
driver = webdriver.Chrome(executable_path=chrome_driver_executable, options=chrome_options)
elif self.proxy:
webdriver.DesiredCapabilities.CHROME['proxy'] = {
"proxyType": "MANUAL",
"httpProxy": self.proxy,
"ftpProxy": self.proxy,
"sslProxy": self.proxy
}
driver = webdriver.Chrome(executable_path=chrome_driver_executable, options=chrome_options)
else:
driver = webdriver.Chrome(executable_path=chrome_driver_executable, options=chrome_options)


driver = webdriver.Chrome(service=chrome_service, options=chrome_options)
log_manager.log_debug('Chrome driver initialized.')
return driver

@@ -407,30 +406,26 @@ def __get_headless_driver_firefox(self):
if self.user_agent:
firefox_options.add_argument('user-agent=' + self.user_agent)

profile = webdriver.FirefoxProfile()
firefox_capabilities = webdriver.DesiredCapabilities.FIREFOX
firefox_capabilities['marionette'] = True

if self.tor_service.lower() == 'yes':
profile.set_preference('network.proxy.type', 1) # manual proxy config
profile.set_preference('network.proxy.socks', '127.0.0.1')
profile.set_preference('network.proxy.socks_port', 9050)
profile.set_preference('network.proxy.no_proxies_on', f'localhost, ::1, 127.0.0.1, {core_url_host}, 127.0.0.0/8');
firefox_options.set_preference('network.proxy.type', 1) # manual proxy config
firefox_options.set_preference('network.proxy.socks', '127.0.0.1')
firefox_options.set_preference('network.proxy.socks_port', 9050)
firefox_options.set_preference('network.proxy.no_proxies_on', f'localhost, ::1, 127.0.0.1, {core_url_host}, 127.0.0.0/8');

elif self.proxy:
profile.set_preference('network.proxy.type', 1) # manual proxy config
profile.set_preference('network.proxy.http', self.proxy_host)
profile.set_preference('network.proxy.http_port', int(self.proxy_port))
profile.set_preference('network.proxy.ssl', self.proxy_host)
profile.set_preference('network.proxy.ssl_port', int(self.proxy_port))
profile.set_preference('network.proxy.ftp', self.proxy)
profile.set_preference('network.proxy.ftp_port', int(self.proxy_port))
profile.set_preference('network.proxy.no_proxies_on', f'localhost, ::1, 127.0.0.1, {core_url_host}, 127.0.0.0/8');
firefox_options.set_preference('network.proxy.type', 1) # manual proxy config
firefox_options.set_preference('network.proxy.http', self.proxy_host)
firefox_options.set_preference('network.proxy.http_port', int(self.proxy_port))
firefox_options.set_preference('network.proxy.ssl', self.proxy_host)
firefox_options.set_preference('network.proxy.ssl_port', int(self.proxy_port))
firefox_options.set_preference('network.proxy.ftp', self.proxy)
firefox_options.set_preference('network.proxy.ftp_port', int(self.proxy_port))
firefox_options.set_preference('network.proxy.no_proxies_on', f'localhost, ::1, 127.0.0.1, {core_url_host}, 127.0.0.0/8');
else:
profile.set_preference('network.proxy.type', 0) # no proxy
firefox_options.set_preference('network.proxy.type', 0) # no proxy

profile.update_preferences()
driver = webdriver.Firefox(profile, executable_path=firefox_driver_executable, options=firefox_options, capabilities=firefox_capabilities)
firefox_service = FirefoxService(executable_path=firefox_driver_executable)
driver = webdriver.Firefox(service=firefox_service, options=firefox_options)

log_manager.log_debug('Firefox driver initialized.')
return driver
@@ -518,8 +513,11 @@ def __browse_title_page(self, index_url):
popup = WebDriverWait(browser, 10).until(EC.presence_of_element_located(self.__get_element_locator(self.selectors['popup_close'])))
except Exception as ex:
log_manager.log_collector_activity('web', self.source.name, 'Popup find error: ' + traceback.format_exc())
if popup:
popup.click()
try:
if popup:
popup.click()
except Exception as ex:
log_manager.log_collector_activity('web', self.source.name, 'Popup click error: ' + traceback.format_exc())

# if there is a "load more" selector, click on it!
page = 1
Expand Down

0 comments on commit 1b4da69

Please sign in to comment.