4
4
import tempfile
5
5
from collections import Counter
6
6
7
- from langdetect import detect
7
+ from langdetect import LangDetectException , detect
8
8
from pdfixsdk .Pdfix import GetPdfix , PdeText , kPdeText , kSaveFull
9
9
10
10
@@ -15,7 +15,10 @@ def __init__(self, message: str = "") -> None:
15
15
16
16
17
17
def detect_lang_for_text(text: str) -> str:
    """Detect the language of ``text`` with ``langdetect.detect``.

    Args:
        text: The text whose language should be detected.

    Returns:
        The value returned by ``detect`` for ``text``.

    Raises:
        LangDetectException: Propagated unchanged from ``detect`` when
            detection fails; callers are expected to handle it.
    """
    # Deliberately no try/except: catching LangDetectException only to
    # re-raise the same object is a no-op for callers, so the exception is
    # simply allowed to propagate.
    return detect(text)
19
22
20
23
21
24
def get_text (element , words ) -> None :
@@ -158,38 +161,44 @@ def detect_lang_pdf_2_txt(
158
161
words : list [str ] = []
159
162
get_text (container , words )
160
163
161
- print (words )
162
- print (i )
163
164
if words :
164
- lang = detect_lang_for_text (" " .join (words ))
165
- lang_list .append (lang )
165
+ try :
166
+ lang = detect_lang_for_text (" " .join (words ))
167
+ lang_list .append (lang )
168
+ except LangDetectException :
169
+ continue
166
170
167
171
# Count the frequency of each string
168
172
string_counts = Counter (lang_list )
169
173
170
174
# Get the string(s) that occur the most
171
175
most_common_lang = string_counts .most_common (1 )
172
176
173
- print ("Detected language: " + most_common_lang [0 ][0 ])
177
+ if most_common_lang :
178
+ print ("Detected language: " + most_common_lang [0 ][0 ])
174
179
175
180
if not os .path .exists (os .path .dirname (out_path )):
176
181
os .makedirs (os .path .dirname (out_path ))
177
182
with open (out_path , "w" , encoding = "utf-8" ) as f :
178
- f .write (most_common_lang [0 ][0 ])
183
+ if most_common_lang :
184
+ f .write (most_common_lang [0 ][0 ])
185
+ else :
186
+ print ("No language detected" )
187
+ f .write ("" )
188
+ sys .exit (1 )
179
189
180
190
181
191
def detect_lang_txt_2_txt (input : str , output : str ) -> None :
182
192
try :
183
193
with open (input , "r" , encoding = "utf-8" ) as infile :
184
194
text = infile .read ()
185
195
186
- if not text :
187
- detected_language = detect_lang_for_text (text )
196
+ detected_language = detect_lang_for_text (text )
188
197
189
- print ("Detected language: " + detected_language )
198
+ print ("Detected language: " + detected_language )
190
199
191
- with open (output , "w" , encoding = "utf-8" ) as outfile :
192
- outfile .write (detected_language )
200
+ with open (output , "w" , encoding = "utf-8" ) as outfile :
201
+ outfile .write (detected_language )
193
202
194
203
except Exception as e :
195
204
print (f"An error occurred: { str (e )} " , file = sys .stderr )
0 commit comments