Skip to content

Commit dcd36e4

Browse files
author
4gac
committed
PVQ-3473 fix: crashing when detecting language on meaningless data
1 parent 359712f commit dcd36e4

File tree

1 file changed

+22
-13
lines changed

1 file changed

+22
-13
lines changed

src/lang_detect.py

Lines changed: 22 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
import tempfile
55
from collections import Counter
66

7-
from langdetect import detect
7+
from langdetect import LangDetectException, detect
88
from pdfixsdk.Pdfix import GetPdfix, PdeText, kPdeText, kSaveFull
99

1010

@@ -15,7 +15,10 @@ def __init__(self, message: str = "") -> None:
1515

1616

1717
def detect_lang_for_text(text: str) -> str:
18-
return detect(text)
18+
try:
19+
return detect(text)
20+
except LangDetectException as e:
21+
raise e
1922

2023

2124
def get_text(element, words) -> None:
@@ -158,38 +161,44 @@ def detect_lang_pdf_2_txt(
158161
words: list[str] = []
159162
get_text(container, words)
160163

161-
print(words)
162-
print(i)
163164
if words:
164-
lang = detect_lang_for_text(" ".join(words))
165-
lang_list.append(lang)
165+
try:
166+
lang = detect_lang_for_text(" ".join(words))
167+
lang_list.append(lang)
168+
except LangDetectException:
169+
continue
166170

167171
# Count the frequency of each string
168172
string_counts = Counter(lang_list)
169173

170174
# Get the string(s) that occur the most
171175
most_common_lang = string_counts.most_common(1)
172176

173-
print("Detected language: " + most_common_lang[0][0])
177+
if most_common_lang:
178+
print("Detected language: " + most_common_lang[0][0])
174179

175180
if not os.path.exists(os.path.dirname(out_path)):
176181
os.makedirs(os.path.dirname(out_path))
177182
with open(out_path, "w", encoding="utf-8") as f:
178-
f.write(most_common_lang[0][0])
183+
if most_common_lang:
184+
f.write(most_common_lang[0][0])
185+
else:
186+
print("No language detected")
187+
f.write("")
188+
sys.exit(1)
179189

180190

181191
def detect_lang_txt_2_txt(input: str, output: str) -> None:
182192
try:
183193
with open(input, "r", encoding="utf-8") as infile:
184194
text = infile.read()
185195

186-
if not text:
187-
detected_language = detect_lang_for_text(text)
196+
detected_language = detect_lang_for_text(text)
188197

189-
print("Detected language: " + detected_language)
198+
print("Detected language: " + detected_language)
190199

191-
with open(output, "w", encoding="utf-8") as outfile:
192-
outfile.write(detected_language)
200+
with open(output, "w", encoding="utf-8") as outfile:
201+
outfile.write(detected_language)
193202

194203
except Exception as e:
195204
print(f"An error occurred: {str(e)}", file=sys.stderr)

0 commit comments

Comments
 (0)