-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathword_relations.py
101 lines (81 loc) · 3.63 KB
/
word_relations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# coding: utf-8
import os, pickle, gensim, matplotlib
import matplotlib.pyplot as plt
from matplotlib import pylab
from sklearn.decomposition import PCA
import numpy as np
from simpleemotionaltweets import *
from collections import defaultdict, namedtuple
# Load a previously saved gensim Word2Vec model. The path below looks like a
# placeholder and has to point at a real saved model file.
model = gensim.models.Word2Vec.load('path/to/vector/model')
'''
this example provides the result when v(king)-v(man)+v(woman)
Thus the vectors to be added are given in positive section and
the vectors to subtract should be in negative section.
'''
# NOTE: the return value of this call is neither stored nor printed, so it is
# only visible when the line is evaluated in an interactive session.
model.most_similar(positive=[u'könig', u'frau'], negative=[u'mann'])
'''
The same vector arithmetic also works well for the following examples:
sonne - hitze = mond
model.most_similar(positive=[u'sonne'], negative=[u'hitze'])
frau + kind = mutter
planet + wasser = erde
planet - wasser = saturn, jupiter
haus + film = kino
computer + telefonieren = handy
koenig - mann + frau = koenigin
'''
def getPlottingVecs(nouns_in, limit=10):
    '''
    Return the 2-dimensional PCA projections of the vectors of the given words.

    The vectors of the words in nouns_in are stacked together with the vectors
    of the first `limit` entries of the model vocabulary. PCA (2 components,
    whitened) is fitted on all of these vectors so that it finds a more
    generalised plane, but only the projections of nouns_in are returned.

    Parameters:
        nouns_in: sequence of words to project; each word is looked up in
                  wiki_model.
        limit:    number of additional vocabulary entries fed to the PCA
                  (default 10, the value that used to be hard-coded).
                  Values noted by the original author:
                    - 0 for the nintendo, xbox, computer example, as the
                      variation in that vocabulary is enough to plot the
                      differences;
                    - around 50 for the countries-capitals example;
                    - 50 for the king queen example.

    Returns:
        A list with one 2-element vector per word in nouns_in, in the same
        order as nouns_in.
    '''
    # NOTE(review): wiki_model is not assigned anywhere in the visible part of
    # this file (the model loaded at module level is bound to `model`).
    # Presumably it comes from the star import or should be `model` - confirm.
    voc = [wiki_model[word] for word in nouns_in]

    # Pad the sample with up to `limit` vocabulary entries. Iterating the
    # vocabulary mapping directly yields its keys, which avoids the
    # Python 2-only iteritems() call.
    for idx, word in enumerate(wiki_model.vocab):
        if idx >= limit:
            break
        voc.append(wiki_model[word])

    pca_vecs = PCA(n_components=2, whiten=True).fit(voc).transform(voc)

    # The requested words were placed first in voc, so their projections are
    # the leading rows of the PCA output.
    return list(pca_vecs[:len(nouns_in)])
def plotSimilarVectors(data, label, clrs, offsets=None):
    '''
    Plot the 2d vectors of the words computed via PCA as an annotated scatter
    plot and show the figure.

    Parameters:
        data:    sequence of 2-d points; element i is drawn at
                 (data[i][0], data[i][1]).
        label:   sequence with the annotation text for each point.
        clrs:    sequence with the marker color for each point.
        offsets: optional sequence with the (x, y) offset, in points, of each
                 annotation text relative to its point. When omitted, the
                 module-level list `pos` is used, which is what the function
                 always did before this parameter existed.
    '''
    if offsets is None:
        # Backward compatibility: existing callers rely on the global `pos`.
        offsets = pos

    _, ax = plt.subplots(figsize=(9, 9))
    for idx, val in enumerate(data):
        point = (val[0], val[1])
        ax.scatter(point[0], point[1], marker='o', s=55, color=clrs[idx])
        ax.annotate(label[idx],
                    xy=point,
                    xytext=offsets[idx],
                    textcoords='offset points',
                    arrowprops=dict(arrowstyle="->", connectionstyle="arc3"))

    # Pass a real boolean: the string 'on' only works through string-to-bool
    # coercion / truthiness, which also makes 'off' misleading.
    ax.grid(True)
    ax.set_xlabel('vector space')
    ax.set_ylabel('vector space')
    plt.show()
'''
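Any one of the following example sets can be tested; its nouns, pos and clrs assignments have to be copied out of this string to take effect.
The pos list gives the offset, in points, of each word's annotation text relative to its plotted point.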
nouns = [ u'nintendo', u'xbox', u'playstation',u'computer',u'handy',u'fernseher', u'pc', u'laptop', u'digitalkamera',u'demokratie', u'landwirtschaft']
pos = [(-70,40), (40,-40), (-30,-40), (-70,40), (-80,10) , (-50,70), (-70,40), (-70,-40),(20,-60),(-70,40),(-90,40)]
clrs = ['blue','blue','blue','blue','blue','blue','blue','blue','blue','blue','blue','blue','blue','blue',
'blue','blue','blue','blue']
nouns = [u'usa', u'washington',u'deutschland','berlin', u'russland', u'moskau',u'türkei',u'ankara',
u'portugal',u'lissabon', u'japan',u'tokio', u'schweden',u'stockholm',
u'griechenland',u'athen',u'bulgarien',u'sofia']
clrs = ['green','green','blue','blue','brown','brown','red','red','orange','orange','black','black','cyan','cyan',
'magenta','magenta','pink','pink']
pos = [(35,-45), (35,-45), (-30,-65), (-30,-45), (30,-45), (35,-55),(35,-45), (35,0), (35,0),
(35,0), (35,-45),(35,-35), (25,0), (35,0), (35,0), (35,0), (-35,45), (35,0)]
nouns = [u'könig',u'frau',u'mann',u'königin']
pos = [(15,-15), (-45,-5), (-10,-25), (15,-25)]
clrs = ['green','green','blue','blue']
'''
# Project the selected example words to 2-D and show the annotated scatter plot.
# NOTE(review): `nouns` and `clrs` (and the `pos` list read inside
# plotSimilarVectors) are only assigned inside the string literal above, so one
# example set presumably has to be copied out of it before this runs - confirm.
plotting_vecs = getPlottingVecs(nouns)
plotSimilarVectors(plotting_vecs, nouns, clrs)