-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathSequentialEmbeddings.py
65 lines (56 loc) · 2.34 KB
/
SequentialEmbeddings.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- coding: utf-8 -*-
import numpy as np
import itertools
import collections
import glob
import os
from ReichstagEmbeddings import Embeddings
# Define class for sequential embedding
class SequentialEmbedding(Embeddings):
    """An ordered collection of embedding spaces, one per time slice.

    Wraps a mapping of slice index (int, starting at 1) -> Embeddings
    instance and offers cross-slice similarity / neighbourhood queries.

    Credits: This implementation is based on
    https://github.com/williamleif/histwords/blob/master/representations/embedding.py
    """

    def __init__(self, embedding_spaces, **kwargs):
        # Mapping of slice index -> Embeddings instance (insertion-ordered).
        self.embeds = embedding_spaces

    @classmethod
    def load(cls, model_folder, **kwargs):
        """Load all non-compass ``*.model`` files from ``model_folder``.

        Files are sorted by path before numbering: ``glob.glob`` returns
        results in arbitrary, filesystem-dependent order, which would make
        the slice index -> model assignment non-deterministic otherwise.

        Returns:
            A new instance whose ``embeds`` maps 1-based slice indices to
            loaded Embeddings, in sorted-filename order.
        """
        embeds = collections.OrderedDict()
        i = 1
        for fn in sorted(glob.glob(f'{model_folder}/*.model')):
            # The compass model is the shared reference space, not a time slice.
            if os.path.splitext(os.path.basename(fn))[0] != 'compass':
                embeds[i] = Embeddings.load(fn)
                i += 1
        # Use cls so subclasses get instances of their own type.
        return cls(embeds)

    def get_embed(self, slice):
        """Return the embedding space for the given slice index.

        BUG FIX: ``load`` keys ``self.embeds`` with ints, but this method
        previously looked up ``str(slice)`` and therefore always raised
        ``KeyError``. Look up the key as given (int), consistent with
        ``get_seq_closest``.
        """
        return self.embeds[slice]

    def get_time_sims(self, word1, word2):
        """Return an OrderedDict: slice index -> similarity(word1, word2)."""
        time_sims = collections.OrderedDict()
        for slice_idx, embed in self.embeds.items():
            time_sims[slice_idx] = embed.wv.similarity(word1, word2)
        return time_sims

    def get_nearest_neighbors(self, word, n=3):
        """Return the union of the top-``n`` neighbours of ``word`` across all slices."""
        neighbour_set = set()
        for embed in self.embeds.values():
            closest = embed.wv.most_similar(word, topn=n)
            for neighbour, score in closest:
                neighbour_set.add(neighbour)
        return neighbour_set

    def get_seq_closest(self, word, start_slice, num_slices=5, n=10):
        """Return the ``n`` words whose similarity to ``word``, summed over
        ``num_slices`` consecutive slices starting at ``start_slice``, is highest.

        Raises:
            KeyError: if a requested slice index is not present in ``self.embeds``.
        """
        closest = collections.defaultdict(float)
        for slice_idx in range(start_slice, start_slice + num_slices):
            embed = self.embeds[slice_idx]
            # Over-fetch (n*10) per slice so words missing from some slices'
            # top-n can still accumulate a competitive total score.
            slice_closest = embed.wv.most_similar(word, topn=n * 10)
            for neigh, score in slice_closest:
                closest[neigh] += score
        return sorted(closest, key=lambda w: closest[w], reverse=True)[0:n]

    def get_intra_word_similarites(self, word, slices=(1, 2, 3, 4, 5)):
        """
        How similar are vectors of the same word trained in different slices?

        Prints, for each pair of slices, the dot product of the word's vectors.
        NOTE(review): this is a raw dot product — it equals cosine similarity
        only if the vectors are unit-normalised; confirm against training setup.
        (Default changed from a mutable list to an equivalent tuple.)
        """
        print(f'Similarity vals for word {word} ...')
        print('\n')
        for s1, s2 in itertools.combinations(slices, 2):
            print(f'between slice {s1} and {s2}:')
            print(np.dot(self.embeds[s1].wv[word], self.embeds[s2].wv[word]))
            print('\n')