Skip to content

Commit

Permalink
Merge pull request #435 from multiflexi/improve_web_collector
Browse files: browse the repository at this point in the history
Fix date time format for web collector
  • Loading branch information
Progress1 authored Dec 11, 2024
2 parents b8a0f28 + a1c141c commit f133e32
Showing 1 changed file with 10 additions and 6 deletions.
16 changes: 10 additions & 6 deletions src/collectors/collectors/web_collector.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -19,8 +19,8 @@
from selenium.webdriver.firefox.service import Service as FirefoxService
from urllib.parse import urlparse
import os
import dateparser
import re
from dateutil.parser import parse

from .base_collector import BaseCollector
from managers.log_manager import logger
Expand Down Expand Up @@ -843,11 +843,15 @@ def __process_article_page(self, index_url, browser):
if not article_description:
article_description = self.__smart_truncate(article_full_text)

extracted_date = None
published_str = self.__find_element_text_by(browser, self.selectors["published"])
if not published_str:
published_str = "today"
published = dateparser.parse(published_str, settings={"DATE_ORDER": "DMY"})
published_str = published.strftime("%Y-%m-%d %H:%M") # remove microseconds/seconds from the screen, looks ugly
if published_str:
extracted_date = parse(published_str, fuzzy=True)
now = datetime.datetime.now()
if extracted_date:
published_str = extracted_date.strftime("%d.%m.%Y - %H:%M")
else:
published_str = now.strftime("%d.%m.%Y - %H:%M")

link = current_url

Expand All @@ -863,7 +867,7 @@ def __process_article_page(self, index_url, browser):
link,
published_str,
author,
datetime.datetime.now(),
now,
article_full_text,
self.source.id,
[],
Expand Down

0 comments on commit f133e32

Please sign in to comment.