# vocab.py
from scipy.interpolate import interp1d
from logger import logger
from utils import caption_tokenize
from VideoDataset.videohandler import VideoHandler
from config import getVocabConfig
import os
import numpy as np
import pickle

# Read-only: path to the pretrained GloVe vectors file.
GLOVE_FILE = getVocabConfig()['GLOVE_FILE']
# Cached word embeddings: read if the file exists, otherwise built from GLOVE_FILE and written.
WORD_EMBEDDED_CACHE = getVocabConfig()['WORD_EMBEDDED_CACHE']
# Vocabulary pickle: read if present, otherwise built from the training captions and written.
VOCAB_FILE = getVocabConfig()['VOCAB_FILE']


class Vocab:
    OUTDIM_EMB = 300      # dimensionality of the GloVe word vectors
    WORD_MIN_FREQ = 5     # minimum frequency in training captions for a word to enter the vocab
    VOCAB_SIZE = 9448     # expected vocabulary size (asserted after the vocab is built)
    CAPTION_LEN = 15      # number of caption token slots per caption, before prefix/suffix markers

    def __init__(self, data, train_ids, data_dir, working_dir):
        # data = dict(video id => list of captions)
        logger.debug("Glove File %s\nEmbedding File %s\nVocab File %s\n" % (GLOVE_FILE, WORD_EMBEDDED_CACHE, VOCAB_FILE))
        # Special tokens are mapped onto strings that already occur in the GloVe vocabulary,
        # which the assertion below verifies.
        self.specialWords = dict()
        self.specialWords['START'] = '>'
        self.specialWords['END'] = '<'
        self.specialWords['NONE'] = '?!?'
        self.specialWords['EXTRA'] = '___'
        freshWordEmbedding = self.loadWordEmbedding(GLOVE_FILE)
        for word, enc in self.specialWords.iteritems():
            assert enc in self.wordEmbedding
        self.buildVocab(data, train_ids, freshWordEmbedding)
        logger.debug("Vocab Build Completed")

    def loadWordEmbedding(self, glove_file):
        # Returns True if the embedding was freshly built from the GloVe file
        # (and therefore still needs trimming), False if it was loaded from the cache.
        isEmbeddingPresent = os.path.exists(WORD_EMBEDDED_CACHE)
        logger.debug("Embedding Present %s " % isEmbeddingPresent)
        if isEmbeddingPresent:
            with open(WORD_EMBEDDED_CACHE, 'rb') as f:
                self.wordEmbedding = pickle.load(f)
            logger.debug("Embedding Loaded")
            return False
        else:
            self.wordEmbedding = dict()
            with open(glove_file, 'r') as f:
                for i, line in enumerate(f):
                    tokens = line.split()
                    tokens = [str(tok) for tok in tokens]
                    word = tokens[0]
                    self.wordEmbedding[word] = np.asarray(tokens[1:], dtype='float32')
            # Rescale every embedding component from [minVal, maxVal] to [-1, 1]
            # via the linear map x -> -1 + 2 * (x - minVal) / (maxVal - minVal).
            minVal = float('inf')
            maxVal = -minVal
            for v in self.wordEmbedding.values():
                for x in v:
                    minVal = min(minVal, x)
                    maxVal = max(maxVal, x)
            mapper = interp1d([minVal, maxVal], [-1, 1])
            logger.info("Mapping minVal[%f], maxVal[%f] to [-1,1]" % (minVal, maxVal))
            for w in self.wordEmbedding:
                self.wordEmbedding[w] = mapper(self.wordEmbedding[w])
            logger.debug("Cross check embedding for 'good': %s" % str(self.wordEmbedding['good']))
            self.saveEmbedding()
            return True

    def saveEmbedding(self):
        with open(WORD_EMBEDDED_CACHE, 'wb') as f:
            pickle.dump(self.wordEmbedding, f)
        logger.info("Embedding Saved!")

    def buildVocab(self, data, train_ids, trimEmbedding):
        if os.path.exists(VOCAB_FILE):
            with open(VOCAB_FILE, 'rb') as f:
                logger.debug("Vocab Loading from File")
                self.ind2word = pickle.load(f)
            logger.debug("Vocab Loaded")
        else:
            logger.debug("Building Vocab")
            x = {}  # word -> frequency over the training captions
            allWords = set()
            for w in self.wordEmbedding.keys():
                allWords.add(w)
            logger.debug("Cached all Embedded Words")
            for _id, captions in data.iteritems():
                if _id not in train_ids:
                    continue
                for cap in captions:
                    for w in caption_tokenize(cap):
                        if w not in allWords:
                            continue
                        if w not in x:
                            x[w] = 1
                        else:
                            x[w] += 1
            assert 'tshirt' not in x
            assert 'tshirt' not in allWords
            logger.debug("Iterated over all captions")
            # Vocabulary = special tokens + every embedded word seen at least WORD_MIN_FREQ times.
            self.ind2word = []
            for w, enc in self.specialWords.iteritems():
                self.ind2word.append(enc)
            self.ind2word.extend([w for w in x.keys() if x[w] >= Vocab.WORD_MIN_FREQ])
            with open(VOCAB_FILE, 'wb') as f:
                pickle.dump(self.ind2word, f)
            logger.debug("Vocab File saved")
        logger.info("Vocab Size : %d" % len(self.ind2word))
        self.word2ind = dict()
        for i, w in enumerate(self.ind2word):
            self.word2ind[w] = i
        assert 'tshirt' not in self.wordEmbedding
        assert 'tshirt' not in self.word2ind
        logger.debug("Words to be in vocab %d found %d" % (Vocab.VOCAB_SIZE, len(self.ind2word)))
        assert len(self.ind2word) == Vocab.VOCAB_SIZE
        if trimEmbedding:
            # Keep embeddings only for words that made it into the vocabulary, then re-save the cache.
            newEmbedding = dict()
            logger.debug("Trimming Word Embedding")
            for w in self.ind2word:
                newEmbedding[w] = self.wordEmbedding[w]
            self.wordEmbedding = newEmbedding
            logger.debug("Trimming Word Embedding Done")
            self.saveEmbedding()

    def get_filteredword(self, w):
        # Map out-of-vocabulary words to the EXTRA placeholder token.
        if w in self.word2ind:
            return w
        return self.specialWords['EXTRA']

    def fit_caption_tokens(self, tokens, length, addPrefix, addSuffix):
        # Truncate/pad the token list to `length` and optionally wrap it with START/END markers.
        tok = []
        tokens = tokens[0:length]
        if addPrefix:
            tok.append(self.specialWords['START'])
        tok.extend(tokens)
        if addSuffix:
            tok.append(self.specialWords['END'])
        for i in range(length - len(tokens)):
            tok.append(self.specialWords['NONE'])
        return tok
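    # For example, fit_caption_tokens(['a', 'man', 'runs'], 5, True, False) returns
    # ['>', 'a', 'man', 'runs', '?!?', '?!?']: the START marker, then the tokens
    # (truncated to `length` if longer), then NONE padding until `length` word slots are filled.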

    def onehot_word(self, w):
        encode = [0] * Vocab.VOCAB_SIZE
        encode[self.word2ind[w]] = 1
        return encode

    def word_fromonehot(self, onehot):
        index = np.argmax(onehot)
        return self.ind2word[index]
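    # Since word_fromonehot takes an argmax, it decodes not only strict one-hot vectors but
    # also any score or probability vector over the vocabulary (e.g. a softmax output).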

    def get_caption_encoded(self, caption, glove, addPrefix, addSuffix):
        tokens = caption_tokenize(caption)
        tokens = self.fit_caption_tokens(tokens, Vocab.CAPTION_LEN, addPrefix, addSuffix)
        tokens = [self.get_filteredword(x) for x in tokens]
        # logger.debug("Working on Caption %s " % str(tokens))
        if glove:
            return [self.wordEmbedding[x] for x in tokens]
        else:
            return [self.onehot_word(x) for x in tokens]
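    # With glove=True this yields one OUTDIM_EMB-dimensional embedding vector per token slot
    # (CAPTION_LEN slots, plus one for each requested prefix/suffix marker); with glove=False
    # it yields the same number of VOCAB_SIZE-length one-hot vectors.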

    def get_caption_from_indexs(self, indx):
        return ' '.join([self.ind2word[x] for x in indx])


def vocabBuilder(datadir, workdir):
    # Build the VideoHandler for the dataset and the Vocab over its training captions.
    vHandler = VideoHandler(datadir, VideoHandler.s_fname_train, VideoHandler.s_fname_test)
    train_ids = vHandler.get_otrain_ids()
    captionData = vHandler.getCaptionData()
    vocab = Vocab(captionData, train_ids, datadir, workdir)
    return [vHandler, vocab]


# if __name__ == "__main__":
#     vocabBuilder("/home/gagan.cs14/btp", "/home/gagan.cs14/btp_VideoCaption")
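
# A minimal usage sketch; the directory arguments below are placeholders for the dataset
# and working directories, and the caption string is purely illustrative:
#
# if __name__ == "__main__":
#     vHandler, vocab = vocabBuilder("/path/to/dataset", "/path/to/workdir")
#     caption = "a man is playing a guitar"
#     glove_seq = vocab.get_caption_encoded(caption, glove=True, addPrefix=True, addSuffix=False)
#     onehot_seq = vocab.get_caption_encoded(caption, glove=False, addPrefix=False, addSuffix=True)
#     logger.info("Encoded %d GloVe vectors and %d one-hot vectors" % (len(glove_seq), len(onehot_seq)))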