Skip to content

Commit

Permalink
fix: improve efficiency of querying headword counts
Browse files Browse the repository at this point in the history
  • Loading branch information
Crissium committed Mar 2, 2024
1 parent 0fb04fb commit ea0cc0b
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 10 deletions.
37 changes: 32 additions & 5 deletions server/app/db_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def get_cursor() -> sqlite3.Cursor:
return local_storage.cursor


def create_table_entries() -> None:
def init_db() -> None:
cursor = get_cursor()
cursor.execute('''create table if not exists entries (
key text, -- the entry in lowercase and without accents
Expand All @@ -60,6 +60,13 @@ def create_table_entries() -> None:
offset integer, -- offset of the entry in the dictionary file
size integer -- size of the definition in bytes
)''')
cursor.execute('''create table if not exists headword_counts (
dictionary_name text primary key,
count integer
)''')
# Note: we shouldn't use triggers to update the table automatically,
# because it is statement-level rather than transaction-level,
# i.e., it would run after each insertion, which is not efficient.


def dictionary_exists(dictionary_name: str) -> bool:
Expand All @@ -80,8 +87,15 @@ def add_entry(key: str,
cursor.execute('insert into entries values (?, ?, ?, ?, ?)', (key, dictionary_name, word, offset, size))


def commit() -> None:
get_connection().commit()
def commit_new_entries(dictionary_name: str) -> None:
	"""Commit newly added entries and refresh the cached headword count.

	:param dictionary_name: name of the dictionary whose entries were just inserted

	The count is maintained here explicitly rather than by a trigger,
	because a trigger fires per statement and would run after every
	single insertion, which is inefficient for bulk loads.
	"""
	cursor = get_cursor()
	# Recompute the headword count for this dictionary and upsert it into the
	# cache table. The SELECT runs on the same connection as the pending entry
	# inserts, so it already sees the uncommitted rows — one commit at the end
	# covers both the entries and the refreshed count atomically (the previous
	# intermediate commit was redundant and doubled the fsync cost).
	cursor.execute('''insert or replace into headword_counts (dictionary_name, count)
	select ?, count(*) from entries
	where dictionary_name = ?''', (dictionary_name, dictionary_name))
	get_connection().commit()


def create_ngram_table(stores_keys: bool) -> None:
Expand Down Expand Up @@ -138,10 +152,22 @@ def get_entries(key: str, dictionary_name: str) -> list[tuple[str, int, int]]:
return cursor.fetchall()


# Serialises writers of the headword_counts cache table.
lock_headword_counts = threading.Lock()

def headword_count_of_dictionary(dictionary_name: str) -> int:
	"""Return the number of headwords in the given dictionary.

	Serves the value from the ``headword_counts`` cache table when present;
	otherwise falls back to counting rows in ``entries`` and stores the
	result in the cache for subsequent calls.

	:param dictionary_name: name of the dictionary to count
	:return: the headword count (0 when the dictionary has no entries)
	"""
	cursor = get_cursor()
	cursor.execute('select count from headword_counts where dictionary_name = ?', (dictionary_name,))
	cached_count = cursor.fetchone()
	if cached_count:
		return cached_count[0]
	cursor.execute('select count(*) from entries where dictionary_name = ?', (dictionary_name,))
	count = cursor.fetchone()[0]
	with lock_headword_counts:
		# Use 'insert or replace' (matching commit_new_entries): the cache
		# check above happens outside the lock, so two threads can both miss
		# the cache and both reach this point — a plain 'insert' would then
		# raise sqlite3.IntegrityError on the dictionary_name primary key
		# for the second writer.
		cursor.execute('insert or replace into headword_counts (dictionary_name, count) values (?, ?)',
			(dictionary_name, count))
		get_connection().commit()
	return count


def get_entries_with_headword(word: str, dictionary_name: str) -> list[tuple[int, int]]:
Expand All @@ -167,6 +193,7 @@ def get_entries_all(dictionary_name: str) -> list[tuple[str, str, int, int]]:
def delete_dictionary(dictionary_name: str) -> None:
	"""Remove every trace of a dictionary from the database.

	Deletes the dictionary's rows from both the ``entries`` table and the
	``headword_counts`` cache, then commits.

	:param dictionary_name: name of the dictionary to purge
	"""
	cursor = get_cursor()
	for table in ('entries', 'headword_counts'):
		cursor.execute(f'delete from {table} where dictionary_name = ?', (dictionary_name,))
	get_connection().commit()


Expand Down
2 changes: 1 addition & 1 deletion server/app/dictionaries.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def __init__(self, app: Flask) -> None:

self.settings = Settings()

db_manager.create_table_entries()
db_manager.init_db()

self._dictionaries: dict[str, BaseReader] = dict()
# on HDD it would confuse the I/O scheduler to load the dictionaries in parallel
Expand Down
2 changes: 1 addition & 1 deletion server/app/dicts/dsl_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def __init__(self,
for headword in headwords:
db_manager.add_entry(self.simplify(headword), self.name, headword, offset, size)
headwords.clear()
db_manager.commit()
db_manager.commit_new_entries(self.name)
db_manager.create_index()
logger.info(f'Entries of dictionary {self.name} added to database')
# Whether compressed originally or not, we need to compress it now
Expand Down
2 changes: 1 addition & 1 deletion server/app/dicts/mdict_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(self,
key.decode('UTF-8'),
offset,
length)
db_manager.commit()
db_manager.commit_new_entries(self.name)
db_manager.create_index()
logger.info(f'Entries of dictionary {self.name} added to database')

Expand Down
2 changes: 1 addition & 1 deletion server/app/dicts/stardict_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def __init__(self,
word_decoded = word_str.decode('utf-8')
for offset, size in spans:
db_manager.add_entry(self.simplify(word_decoded), self.name, word_decoded, offset, size)
db_manager.commit()
db_manager.commit_new_entries(self.name)
db_manager.create_index()
logger.info(f'Entries of dictionary {self.name} added to database')

Expand Down
2 changes: 1 addition & 1 deletion server/updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
project_directory = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
windows_save_path = os.path.join(os.path.dirname(project_directory), 'SilverDict-windows.zip')
unix_save_path = os.path.join(project_directory, 'SilverDict.zip')
current_version = 'v1.1.0'
current_version = 'v1.1.1'


def _get_latest_version_and_release_note() -> tuple[str, str]:
Expand Down

0 comments on commit ea0cc0b

Please sign in to comment.