-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathgenerate.py
103 lines (79 loc) · 2.72 KB
/
generate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
"""
Created on Mon JUN 1 12:41:30 2020
@author: mohabmes
script used to clean and process Arabic texts for sinai-corpus
"""
import re
import Arabycia.pyaramorph as pam
def removeNumber(text):
return re.sub(r'\d+', '', text)
def removeUnnecessaryChar(text):
text = " ".join(text.split())
text = re.sub(r'[،؛﴾﴿٪]+', '', text)
text = re.sub(r'[\r\t ]+', ' ', text)
return removeNumber(text)
def removeNonArabicChar(text):
text = re.sub(
r'[^0-9\u0600-\u06ff\u0750-\u077f\ufb50-\ufbc1\ufbd3-\ufd3f\ufd50-\ufd8f\ufd50-\ufd8f\ufe70-\ufefc\uFDF0-\uFDFD.0-9]+',
' ', text)
return text
def sentTokenize(text):
text = re.sub(r'[.؟!,,]', '\n', text)
return text
def clean(text):
text = removeUnnecessaryChar(text)
text = removeNonArabicChar(text)
return sentTokenize(text)
def find_best_series(all_solution, text):
word=text.split()
# print(word)
final_pos = ""
for i in range(0, len(word)):
best_solution_index = -1
best_solution_diff = 1000
# print(all_solution[i])
if len(all_solution[i]['solution']) == 0 :
final_pos += "[Unidentified] "
continue
else:
current_word_sol = all_solution[i]['solution']
for index in range(0, len(current_word_sol)):
dif = MED(current_word_sol[index]['word'][0], word[i])
if dif < best_solution_diff:
best_solution_diff = dif
best_solution_index = index
pos = current_word_sol[best_solution_index]['pos']
word_trans = current_word_sol[best_solution_index]['word'][1]
word_res = []
for part in pos:
if part != "":
word_res.append(part.split('/')[1])
final_pos += word_trans + ':' + "/".join(word_res) + " "
return final_pos
def analyze_text(text_list, pam = pam.Analyzer()):
result =""
for sent in text_list:
if sent not in [" ", "\n", " \n"]:
all_sol = pam.analyze_text(sent)
pos = find_best_series(all_sol, sent)
result += pos + "\n"
print("Done")
return result
def remove_duplicates(text_list):
text_set = set(text_list)
print(len(text_list) - len(text_set), " Duplicates")
text = list(text_set)
text = "\n".join(text)
return text
def cleanDir(path):
files = scan_directory(path)
cnt = 1
sz = len(files)
for file in files:
print("{}\{} - {}".format(cnt, sz, file))
text = readFile(file, "r")
clean_text = clean(text)
clean_text = remove_duplicates(clean_text.split("\n"))
analyzed_text = analyze_text(clean_text.split("\n"))
createFile(str(cnt)+".txt", analyzed_text, "w")
cnt += 1