Skip to content

Commit

Permalink
Merge pull request #177 from Progress1/regex_bot
Browse files Browse the repository at this point in the history
Better regex bots
  • Loading branch information
milankowww authored Oct 20, 2023
2 parents fb67094 + 80fc42b commit 6e5f292
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 26 deletions.
55 changes: 34 additions & 21 deletions src/bots/bots/analyst_bot.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import re

from .base_bot import BaseBot
from managers.log_manager import log_debug, log_bot_activity
from shared.schema import news_item
from shared.schema.parameter import Parameter, ParameterType
from remote.core_api import CoreApi
Expand Down Expand Up @@ -28,15 +29,21 @@ class AnalystBot(BaseBot):
def execute(self, preset):
try:
source_group = preset.parameter_values['SOURCE_GROUP']
regexp = preset.parameter_values['REGULAR_EXPRESSION'].replace(' ', '')
attr_name = preset.parameter_values['ATTRIBUTE_NAME'].replace(' ', '')
regexp = preset.parameter_values['REGULAR_EXPRESSION']
attr_name = preset.parameter_values['ATTRIBUTE_NAME']
interval = preset.parameter_values['REFRESH_INTERVAL']

regexp = regexp.split(',')
attr_name = attr_name.split(',')
# support for multiple regexps
regexp = regexp.split(';;;')
attr_name = attr_name.split(';;;')
if len(regexp) > len(attr_name):
regexp = regexp[:len(attr_name)]
elif len(attr_name) > len(regexp):
attr_name = attr_name[:len(regexp)]

bots_params = dict(zip(attr_name, regexp))
limit = BaseBot.history(interval)
log_bot_activity(preset.name, 'running with date limit {}'.format(limit))
news_items_data, code = CoreApi.get_news_items_data(limit)
if code == 200 and news_items_data is not None:
for item in news_items_data:
Expand All @@ -46,26 +53,32 @@ def execute(self, preset):
preview = item['review']
content = item['content']

analyzed_text = ''.join([title, preview, content]).split()
analyzed_text = [item.replace('.', '') if item.endswith('.') else item
for item in analyzed_text]
analyzed_text = [item.replace(',', '') if item.endswith(',') else item
for item in analyzed_text]
analyzed_text = ' '.join([title, preview, content])

for element in analyzed_text:
attributes = []
for key, value in bots_params.items():
finding = re.search(value, element)
if finding:
attributes = []
for key, value in bots_params.items():
uniq_list = []
# print('Key:', key, 'Regex:', value, flush=True)
for finding in re.finditer(value, analyzed_text):
if len(finding.groups()) > 0:
found_value = finding.group(1)
else:
found_value = finding.group(0)
value = found_value
binary_mime_type = ''
binary_value = ''
# print('Found:', found_value, flush=True)
if found_value not in uniq_list:
uniq_list.append(found_value)

# The app checks the ID + value combination in the DB before INSERT (attribute_value_identical), so drop duplicate values here first (faster)
for found_value in uniq_list:
binary_mime_type = ''
binary_value = ''
news_attribute = news_item.NewsItemAttribute(key, found_value, binary_mime_type, binary_value)
attributes.append(news_attribute)

news_attribute = news_item.NewsItemAttribute(key, value, binary_mime_type, binary_value)
attributes.append(news_attribute)
news_item_attributes_schema = news_item.NewsItemAttributeSchema(many=True)
CoreApi.update_news_item_attributes(news_item_id, news_item_attributes_schema.dump(attributes))
if len(attributes) > 0:
log_debug('Processing item id: {}, {}, Found: {}'.format(news_item_id, item['collected'], len(attributes)))
news_item_attributes_schema = news_item.NewsItemAttributeSchema(many=True)
CoreApi.update_news_item_attributes(news_item_id, news_item_attributes_schema.dump(attributes))

except Exception as error:
BaseBot.print_exception(preset, error)
Expand Down
14 changes: 12 additions & 2 deletions src/bots/bots/base_bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import traceback

from managers import time_manager
from managers.log_manager import log_debug
from shared.schema import bot, bot_preset
from shared.schema.parameter import Parameter, ParameterType
from remote.core_api import CoreApi
Expand All @@ -12,8 +13,9 @@ class BaseBot:
name = "Base Bot"
description = "Base abstract type for all bots"

# the real values are stored in the 'parameter' table
parameters = [
Parameter(0, "REFRESH_INTERVAL", "Refresh Interval", "How often and when is this bot doing its job. Examples:<ul><li>10 --- perform the task every 10 minutes</li><li>10:30 --- perform the task every day at 10:30</li><li>Tuesday,10:30 --- perform the task every Tuesday at 10:30</li></ul>",
Parameter(0, "REFRESH_INTERVAL", "Refresh interval (0 to disable)", "How often and when is this bot doing its job. Examples:<ul><li>10 --- perform the task every 10 minutes</li><li>10:30 --- perform the task every day at 10:30</li><li>Tuesday,10:30 --- perform the task every Tuesday at 10:30</li></ul>",
ParameterType.NUMBER)
]

Expand Down Expand Up @@ -70,16 +72,23 @@ def initialize(self):
self.bot_presets = preset_schema.load(response)

for preset in self.bot_presets:
self.execute(preset)
interval = preset.parameter_values["REFRESH_INTERVAL"]
# do not schedule if no interval is set
if interval == '' or interval == '0':
log_debug("scheduling '{}' disabled".format(str(preset.name)))
continue

self.execute(preset)

if interval:
if interval[0].isdigit() and ':' in interval:
log_debug("scheduling '{}' at: {}".format(str(preset.name), str(interval)))
time_manager.schedule_job_every_day(interval, self.execute, preset)
elif interval[0].isalpha():
interval = interval.split(',')
day = interval[0].strip()
at = interval[1].strip()
log_debug("scheduling '{}' at: {} {}".format(str(preset.name), str(day), str(at)))
if day == 'Monday':
time_manager.schedule_job_on_monday(at, self.execute, preset)
elif day == 'Tuesday':
Expand All @@ -95,4 +104,5 @@ def initialize(self):
else:
time_manager.schedule_job_on_sunday(at, self.execute, preset)
else:
log_debug("scheduling '{}' for {}".format(str(preset.name), int(interval)))
time_manager.schedule_job_minutes(int(interval), self.execute, preset)
4 changes: 2 additions & 2 deletions src/bots/managers/log_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,6 @@ def log_critical(message):
def log_system_activity(module, message):
log_info("[{}] {}".format(module, message))

def log_collector_activity(collector_type, collector, message):
log_text = "COLLECTOR {} '{}': {}".format(collector_type, collector, message)
def log_bot_activity(bot, message):
log_text = "BOT '{}': {}".format(bot, message)
log_info(log_text)
2 changes: 1 addition & 1 deletion src/core/model/news_item.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def latest_collected(cls):
@classmethod
def get_all_news_items_data(cls, limit):
limit = datetime.strptime(limit, '%d.%m.%Y - %H:%M')
news_items_data = cls.query.filter(cls.collected > limit).all()
news_items_data = cls.query.filter(cls.collected >= limit).all()
news_items_data_schema = NewsItemDataSchema(many=True)
return news_items_data_schema.dump(news_items_data)

Expand Down

0 comments on commit 6e5f292

Please sign in to comment.