From 4758d514e5352adec2167befe2088c4e5e181ace Mon Sep 17 00:00:00 2001 From: Paul Tikken Date: Tue, 21 Nov 2023 12:06:21 +0000 Subject: [PATCH] addressed slow queries on initialization of the database; added several minor bulk import optimizations --- CveXplore/VERSION | 2 +- CveXplore/database/maintenance/DownloadHandler.py | 4 +++- CveXplore/database/maintenance/Sources_process.py | 1 + CveXplore/database/maintenance/api_handlers.py | 9 +++++---- CveXplore/database/maintenance/main_updater.py | 12 +++++++++--- 5 files changed, 19 insertions(+), 9 deletions(-) diff --git a/CveXplore/VERSION b/CveXplore/VERSION index bf707c6fb..44e32aeac 100644 --- a/CveXplore/VERSION +++ b/CveXplore/VERSION @@ -1 +1 @@ -0.3.17.dev12 \ No newline at end of file +0.3.17.dev13 \ No newline at end of file diff --git a/CveXplore/database/maintenance/DownloadHandler.py b/CveXplore/database/maintenance/DownloadHandler.py index 39b7c612e..2d6526310 100644 --- a/CveXplore/database/maintenance/DownloadHandler.py +++ b/CveXplore/database/maintenance/DownloadHandler.py @@ -152,7 +152,7 @@ def chunk_list(self, lst: list, number: int) -> list: Yield successive n-sized chunks from lst. """ for i in range(0, len(lst), number): - yield lst[i: i + number] + yield lst[i : i + number] def _db_bulk_writer(self, batch: list): """ @@ -162,6 +162,8 @@ def _db_bulk_writer(self, batch: list): try: if self.feed_type.lower() == "epss": self.database["cves"].bulk_write(batch, ordered=False) + elif self.feed_type.lower() == "cves" or self.feed_type.lower() == "cpe": + self.database[self.feed_type.lower()].insert_many(batch, ordered=False) else: self.database[self.feed_type.lower()].bulk_write(batch, ordered=False) except BulkWriteError as err: diff --git a/CveXplore/database/maintenance/Sources_process.py b/CveXplore/database/maintenance/Sources_process.py index 72e3f5dce..5fc26eeda 100644 --- a/CveXplore/database/maintenance/Sources_process.py +++ b/CveXplore/database/maintenance/Sources_process.py @@ -1107,6 +1107,7 @@ def __init__(self): MongoAddIndex( index=[("padded_version", ASCENDING)], name="padded_version" ), + MongoAddIndex(index=[("lastModified", ASCENDING)], name="lastModified"), ], "cpeother": [ MongoUniqueIndex(index=[("id", ASCENDING)], name="id", unique=True) diff --git a/CveXplore/database/maintenance/api_handlers.py b/CveXplore/database/maintenance/api_handlers.py index 7a3d22610..2425b6c20 100644 --- a/CveXplore/database/maintenance/api_handlers.py +++ b/CveXplore/database/maintenance/api_handlers.py @@ -32,10 +32,11 @@ def process_item(self, item: dict): doc=item, ).entry else: - return DatabaseAction( - action=DatabaseAction.actions.InsertOne, - doc=item, - ).entry + # return DatabaseAction( + # action=DatabaseAction.actions.InsertOne, + # doc=item, + # ).entry + return item @abstractmethod def process_the_item(self, *args): diff --git a/CveXplore/database/maintenance/main_updater.py b/CveXplore/database/maintenance/main_updater.py index 8165d5afc..7896c7cc4 100644 --- a/CveXplore/database/maintenance/main_updater.py +++ b/CveXplore/database/maintenance/main_updater.py @@ -106,7 +106,9 @@ def update(self, update_source: str | list = None): self.datasource.set_handlers_for_collections() self.logger.info(f"Database update complete!") - self.logger.info(f"Update Total duration: {timedelta(seconds=time.time() - start_time)}") + self.logger.info( + f"Update Total duration: {timedelta(seconds=time.time() - start_time)}" + ) def populate(self, populate_source: str | list = None): """ @@ -159,7 +161,9 @@ def populate(self, populate_source: str | list = None): self.datasource.set_handlers_for_collections() self.logger.info(f"Database population complete!") - self.logger.info(f"Populate total duration: {timedelta(seconds=time.time() - start_time)}") + self.logger.info( + f"Populate total duration: {timedelta(seconds=time.time() - start_time)}" + ) def initialize(self): """ @@ -183,4 +187,6 @@ def initialize(self): self.update() self.logger.info(f"Database initialization complete!") - self.logger.info(f"Initialization total duration: {timedelta(seconds=time.time() - start_time)}") + self.logger.info( + f"Initialization total duration: {timedelta(seconds=time.time() - start_time)}" + )