import copy
import warnings

import arpa  # ARPA-format language-model package; lm objects are passed into score_sentences()
from asl_data import SinglesData

def recognize(models: dict, test_set: SinglesData):
    """ Recognize test word sequences from word models set

    :param models: dict of trained models
        {'SOMEWORD': GaussianHMM model object, 'SOMEOTHERWORD': GaussianHMM model object, ...}
    :param test_set: SinglesData object
    :return: (list, list) as probabilities, guesses
        both lists are ordered by the test set word_id
        probabilities is a list of dictionaries where each key is a word and each value is its log likelihood
            [{'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
             {'SOMEWORD': LogLvalue, 'SOMEOTHERWORD': LogLvalue, ...},
            ]
        guesses is a list of the best guess words ordered by the test set word_id
            ['WORDGUESS0', 'WORDGUESS1', 'WORDGUESS2', ...]
    """
    warnings.filterwarnings("ignore", category=DeprecationWarning)
    word_ids = test_set.get_all_sequences().keys()
    probabilities = [dict() for _ in word_ids]
    guesses = []
    # test_set contains words. Score each test word against every trained model,
    # building a dict with the log likelihood for each candidate word; the
    # candidate with the highest score becomes the best guess.
    for word_id in word_ids:
        X, lengths = test_set.get_item_Xlengths(word_id)
        for candidate_word, model in models.items():
            try:
                score = model.score(X, lengths)
            except Exception:
                # Some models cannot score some sequences; treat those as "impossible"
                score = float("-inf")
            probabilities[word_id][candidate_word] = score
    for dct in probabilities:
        guesses.append(max(dct.keys(), key=lambda w: dct[w]))
    return probabilities, guesses
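
# Example usage (a hedged sketch: `models` is assumed to be a dict of trained
# GaussianHMM objects keyed by word and `test_set` a SinglesData instance,
# both built elsewhere in the project, as the docstring above implies):
#
#     probabilities, guesses = recognize(models, test_set)
#     print(guesses[:3])  # best per-word guesses, e.g. ['JOHN', 'WRITE', 'HOMEWORK']
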
def normalize_probabilities(probabilities: list, verbose=False):
    """
    The scores of individual words can be positive or negative. To define the
    top candidates for sentence scoring, normalize the scores to [0:1].

    :param probabilities: list of dicts, one per word in the test_set
        keys: word
        values: score for this word at this test word index
    :return: list of dicts. Same as probabilities, but with each score min-max normalized to [0:1]
    """
    prob_norm = []
    for i in range(len(probabilities)):
        prob_norm.append({})
        max_s = max(probabilities[i].values())
        min_s = min(probabilities[i].values())
        if max_s == min_s:
            # Guard against division by zero when all candidates scored the same
            prob_norm[i] = {w: 1.0 for w in probabilities[i]}
            continue
        for w in probabilities[i].keys():
            prob_norm[i][w] = (probabilities[i][w] - min_s) / (max_s - min_s)
    if verbose:
        print("Scores normalized [0:1]")
        print()
        # print("Showing normalized word scores, sorted (only top 10 for the first 3 words = 1st test sentence)...")
        # for i in range(3):
        #     l = sorted(prob_norm[i].items(), key=lambda x: x[1], reverse=True)
        #     for word, score in l[:10]:
        #         print("{0:10} : {1}".format(word, score))
        #     print()
    return prob_norm
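
# A worked example of the min-max normalization above (illustrative numbers):
# scores {'JOHN': -50.0, 'MARY': -80.0, 'GO': -170.0} give min_s = -170 and
# max_s = -50, so JOHN -> 120/120 = 1.0, MARY -> 90/120 = 0.75, GO -> 0.0.
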
def get_candidates(probabilities: list, guesses: list, sentence: list, test_set, threshold_score=0.9, threshold_number=0, verbose=False):
    """
    For each word in the sentence, gather the candidate words with high probabilities. Each word's
    score has already been normalized against the top-scoring word's score (see
    normalize_probabilities); only candidates whose normalized score is at least threshold_score
    are kept.
    Example: if the 0-gram model thinks the word should be JOHN, then JOHN's normalized score = 1.0
    and MARY's score could be 0.7545 (a fictitious number).

    :param probabilities: list of dicts,
        one dict for each word in the test_set.
        Each dict contains all possible words as keys and their matching probability for the test word as values
    :param guesses: a list of str
        Best-guess word from the 0-gram model for each test word
    :param sentence: a list of indices of test words that form a test sentence
    :param test_set: SinglesData object
        The built test_set (see the Jupyter Notebook for more info)
    :param threshold_score: int or float
        Minimum normalized score a word needs to be kept as a candidate
    :param threshold_number: int
        Maximum number of candidates kept per word; 0 means "derive it from the sentence length"
    :return: dict of dicts
        keys: index of the word in the test sentence
        values: dict mapping each candidate word to its normalized score
        example: {0: {'JOHN': 0.905, ...}, ...}
    """
    # Cap the number of candidates per word in the sentence. For longer
    # sentences the cap must be lower to keep the number of sentence
    # combinations computationally feasible, and a hard maximum avoids
    # introducing sentences with a high log_s but too-low individual word scores.
    if threshold_number == 0:
        # Allow at most ~10,000 sentence combinations
        threshold_number = int(10000 ** (1. / len(sentence)))
        if threshold_number > 10:
            threshold_number = 10
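    # A worked example of the cap above: for a 5-word sentence,
    # int(10000 ** (1 / 5)) = int(6.31...) = 6 candidates per word, giving at
    # most 6**5 = 7776 <= 10000 candidate sentences to build and score.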
    candidates = {}
    for i in sentence:  # for each word in the test sentence, gather the candidates
        l = []
        for k in probabilities[i].keys():
            # The normalized word score needs to be >= the threshold
            if probabilities[i][k] >= threshold_score:
                # valid candidate word for this index in the test sentence
                l.append((k, probabilities[i][k]))
        l.sort(key=lambda x: x[1], reverse=True)
        candidates[i] = dict(l[:threshold_number])
    if verbose:
        for i in sentence:
            print("Selected candidates for index {} (threshold_score = {}, threshold_number = {}):".format(i, threshold_score, threshold_number))
            print(guesses[i], ":", probabilities[i][guesses[i]], "\t Correct word:", test_set.wordlist[i])
            print("============================================")
            for word, score in sorted(candidates[i].items(), key=lambda x: x[1], reverse=True):
                print("{0:16}{1}".format(word, score))
            print("--------------------------------------------")
            print()
    return candidates
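
# Example usage (a hedged sketch; in this project the per-sentence index lists
# are expected to come from test_set.sentences_index):
#
#     prob_norm = normalize_probabilities(probabilities)
#     sentence = test_set.sentences_index[video_num]  # word indices of one test sentence
#     candidates = get_candidates(prob_norm, guesses, sentence, test_set,
#                                 threshold_score=0.9)
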
def score_sentences(candidates, guesses, sentence, lm, verbose=False):
    """
    From the possible candidates, create all possible sentences and score them with lm.log_s

    :param candidates: dict of dicts
        keys: index of the word in the test sentence
        values: dict mapping each candidate word to its normalized score
        example: {0: {'JOHN': 0.905, ...}, ...}
    :param guesses: list of str
        Best-guess words of the 0-gram model for the test words
    :param sentence: a list of indices of test words that form a test sentence
    :param lm: language model object from the arpa module
    :return: dict
        keys: tuple of words that form a sentence of the test_set
        values: scores according to lm.log_s(), normalized [0:1]
    """
    def score_words_as_sentence(words: list):
        """
        Take a deepcopy of the words, strip any trailing digit and score the
        sentence according to the language model's log_s() function.
        """
        words = copy.deepcopy(words)
        # Strip a trailing digit: numbered word variants occur in the word
        # models but not in the LM. Only the last character is checked,
        # because in 'IX-1P' the inner digit must not be removed!
        for i, w in enumerate(words):
            if w[-1] in "123456789":
                words[i] = w[:-1]
        # return the sentence score
        return lm.log_s(words)
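    # Example (hypothetical word forms): ['JOHN', 'GO1', 'IX-1P'] is scored as
    # the sentence 'JOHN GO IX-1P' - the trailing variant digit of 'GO1' is
    # stripped while the inner digit of 'IX-1P' is kept.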
    def build_and_score_sentences(sol, i):
        """
        Recursive function to create all possible sentences from the selected
        candidates for each position in the sentence.
        """
        sol = copy.deepcopy(sol)
        if i < 0:
            # All positions filled: score this complete sentence
            sentence_scores[tuple(sol)] = score_words_as_sentence(sol)
            return
        for k in candidates[sentence[i]].keys():
            # Try every candidate word at position i, then recurse on the rest
            sol[i] = str(k)
            build_and_score_sentences(sol, i - 1)
        return
    def normalize_sentence_scores(sentence_scores):
        """
        Normalize all sentence scores to [0:1]
        """
        min_s = min(sentence_scores.values())
        max_s = max(sentence_scores.values())
        if max_s == min_s:
            # Guard against division by zero when there is only one candidate
            # sentence (or all sentences scored the same)
            return {k: 1.0 for k in sentence_scores}
        ss_norm = {}
        for k in sentence_scores.keys():
            ss_norm[k] = (sentence_scores[k] - min_s) / (max_s - min_s)
        # if verbose:
        #     print("SENTENCE SCORES:")
        #     for e in sorted(ss_norm.values(), reverse=True):
        #         print(e)
        return ss_norm
    if verbose:
        lens = []
        total = 1
        for k in candidates.keys():
            l = len(candidates[k])
            lens.append(l)
            total *= l
        print("Number of candidates per word: {}".format(lens))
        print("A total of {} sentences will be built and scored".format(total))
        print()
    sentence_scores = {}  # Empty dict, populated by build_and_score_sentences()
    sol = [guesses[i] for i in sentence]  # Best-guess sentence to start with
    build_and_score_sentences(sol, len(sentence) - 1)
    ss_norm = normalize_sentence_scores(sentence_scores)
    if verbose:
        print('Scored sentences (only top 10):')
        l = [(" ".join(k), v) for k, v in ss_norm.items()]
        l.sort(key=lambda x: x[1], reverse=True)
        for i in range(min(10, len(l))):
            print("{0:20} ---> {1}".format(l[i][0], l[i][1]))
        print("...")
        print()
    return ss_norm
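
# Example (a hedged sketch; `lm` is an ARPA-format language model loaded via
# the arpa package, e.g. lm = arpa.loadf(<lm file>)[0], and the scores shown
# are illustrative):
#
#     sentence_scores = score_sentences(candidates, guesses, sentence, lm)
#     # e.g. {('JOHN', 'WRITE', 'HOMEWORK'): 1.0, ('JOHN', 'WRITE', 'CHICKEN'): 0.42, ...}
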
def combine_scores(sentence_scores, candidates, sentence, word_weight=1.0, sentence_weight=1.0, verbose=False):
    """
    For all sentences we want to score, get the sentence score and the scores
    of the individual words in the sentence.
    Then apply some magic to get a total_score and return the winner!

    :param sentence_scores: dict
        keys: tuple of all the words in a sentence we want to test
        values: score according to the lm.log_s() function
        example: {('JOHN', 'WRITE', 'HOMEWORK'): -4.65564, ...}
    :param candidates: dict of dicts
        keys: index of the word in the test sentence
        values: dict mapping each candidate word to its normalized score
        example: {0: {'JOHN': 0.905, ...}, ...}
    :param sentence: a list of indices of test words that form a test sentence
    :param word_weight: float
        Weight applied to the summed individual word scores
    :param sentence_weight: float
        Weight applied to the sentence score
    :return: tuple ((best_sentence, best_score), scores)
        best_sentence and best_score form the winning tuple: our best guess for
        a sentence from the test_set, taking into account individual word scores
        and sentence scores.
        scores is a list containing such a tuple for every candidate sentence.
    """
    best_sentence = []
    best_score = float("-inf")
    scores = []
    # For each candidate sentence
    for words in sentence_scores.keys():
        s_score = sentence_scores[words]  # log_s score for the sentence
        w_scores = 0.0
        # Alternative strategy: w_scores = []
        for i in range(len(sentence)):
            # Accumulate the individual word scores
            w = words[i]
            idx = sentence[i]
            ind_normalized_score = candidates[idx][w]
            w_scores += ind_normalized_score
            # Alternative strategy: w_scores.append(ind_normalized_score)
        # HERE IS THE MEAT OF THE SCORING - this should be tuned just right
        weighted_w_scores = word_weight * w_scores
        weighted_s_score = sentence_weight * s_score
        total_score = weighted_w_scores + weighted_s_score
        # Alternative strategy: total_score = sum(w_scores) / len(w_scores) * s_score
        if verbose:
            print("Individual word scores:", weighted_w_scores, "Sentence score:", weighted_s_score, "Total score:", total_score)
        scores.append((tuple(words), total_score))
        # Keep track of the winner
        if total_score > best_score:
            best_score = total_score
            best_sentence = words
            # print("New best sentence:", best_sentence)
    scores.sort(key=lambda x: x[1], reverse=True)
    if verbose:
        print("Showing top 10 results after combining sentence scores with individual word scores:")
        for e in scores[:10]:
            print("For words: {}, total score: {}".format(e[0], e[1]))
        print()
    return (best_sentence, best_score), scores
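
# End-to-end sketch of how these functions chain together (hedged: `models`,
# `test_set` and the LM file come from the project notebook; `video_num` is
# illustrative):
#
#     probabilities, guesses = recognize(models, test_set)
#     prob_norm = normalize_probabilities(probabilities)
#     lm = arpa.loadf("language_model.lm")[0]
#     for video_num, sentence in test_set.sentences_index.items():
#         candidates = get_candidates(prob_norm, guesses, sentence, test_set)
#         sentence_scores = score_sentences(candidates, guesses, sentence, lm)
#         (best_sentence, best_score), _ = combine_scores(sentence_scores, candidates, sentence)
#         print(" ".join(best_sentence))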