-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDataProjector.py
133 lines (113 loc) · 7.23 KB
/
DataProjector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Pedram Daee <<[email protected]>>.
# A DataProjector is an object that projects data objects to a latent space.
# At the moment we use LSI-type methods to do this.
import numpy
from gensim import corpora, models, similarities,matutils
import numpy as np
import os.path
import time
import scipy.sparse
class DataProjector:
    """Project a corpus and its terms into a latent (LSI) feature space.

    Typical use: construct, call ``generate_latent_space()``, then
    ``create_feature_matrices()``; afterwards ``item_fv``, ``doc_fv`` and
    ``doc_fv_new`` return latent feature vectors.

    All intermediate models and matrices are cached as files under
    ``./temp``.
    NOTE(review): the cache is looked up by file name only, not by ``params``
    or by the data, so stale files from a different corpus or a different
    ``num_latent_dims`` would be loaded silently -- confirm this is intended.
    """

    def __init__(self, data_orig, params):
        """Store the original data, the parameters and empty model slots.

        data_orig -- the original data object. This class reads
            ``data_orig.corpus`` (and its ``num_terms``, ``num_docs`` and
            ``num_nnz``), ``data_orig.dictionary``, ``data_orig.num_features``
            and ``data_orig.num_data``.
        params -- mapping of settings; the keys read here are
            ``"num_latent_dims"`` and ``"normalize_terms"``.
        """
        self.params = params
        self.num_terms = data_orig.corpus.num_terms    # total number of features (items of views, aka terms)
        self.num_docs = data_orig.corpus.num_docs      # total number of data (snapshots)
        self.num_features = params["num_latent_dims"]  # number of latent dimensions
        self.data_orig = data_orig                     # keep the original data
        self.corpus_normalized = None  # the corpus in tf-idf (normalized) format
        self.tfidf = None              # the tf-idf model of the input corpus
        self.corpus_lsi = None         # the corpus in the LSI space
        self.lsi = None                # the LSI transformation of corpus_normalized
        self.svd_v = None              # the V matrix in lsi[X] = U^-1*X = V*S
        # Filled in by create_feature_matrices(); initialised here so that the
        # attributes always exist on the instance.
        self.term_f_mat = None         # latent feature matrix of the terms
        self.doc_f_mat = None          # latent feature matrix of the documents

    def _ensure_temp_dir(self):
        """Create the ./temp cache directory if it does not exist yet."""
        if not os.path.isdir('./temp'):
            os.makedirs('./temp')

    def generate_latent_space(self):
        """Build (or load from ./temp) the tf-idf and LSI models of the corpus.

        Sets ``self.tfidf``, ``self.corpus_normalized``, ``self.lsi``,
        ``self.svd_v`` and ``self.corpus_lsi``. The models are only rebuilt
        when at least one of the four cache files is missing.
        """
        # for now just use Gensim's LSA for latent space
        cache_files = (
            './temp/corp1.lsi',
            './temp/corp1.tfidf',
            './temp/corpus_normalized.mm',
            './temp/corp1.svd_v.npy',
        )
        if all(os.path.isfile(path) for path in cache_files):
            print('Loading LSI model from folder /temp...')
            self.lsi = models.LsiModel.load('./temp/corp1.lsi')
            self.tfidf = models.TfidfModel.load('./temp/corp1.tfidf')
            self.svd_v = np.load('./temp/corp1.svd_v.npy')
            self.corpus_normalized = corpora.MmCorpus('./temp/corpus_normalized.mm')
        else:
            # use libraries from gensim to build the LSI model
            print('Create latent space and save it in /temp...')
            self._ensure_temp_dir()  # the save calls below fail if ./temp is missing
            t1 = time.time()
            # todo: maybe I don't need to do tfidf, but if I do I should also do it for the query
            self.tfidf = models.TfidfModel(self.data_orig.corpus)
            self.tfidf.save('./temp/corp1.tfidf')
            # tf-idf is used as a basic normalization of the corpus
            self.corpus_normalized = self.tfidf[self.data_orig.corpus]
            # save the normalized corpus
            corpora.MmCorpus.serialize('./temp/corpus_normalized.mm', self.corpus_normalized)
            # initialize an LSI transformation
            self.lsi = models.LsiModel(self.corpus_normalized,
                                       id2word=self.data_orig.dictionary,
                                       num_topics=self.num_features)
            self.lsi.save('./temp/corp1.lsi')
            # Given a model lsi = LsiModel(X, ...), with the truncated singular value
            # decomposition of the corpus X being X = U*S*V^T, doing lsi[X] computes
            # U^-1*X, which equals V*S. So to get V, divide lsi[X] by S.
            singular_values = self.lsi.projection.s
            lsi_dense = matutils.corpus2dense(self.lsi[self.corpus_normalized],
                                              num_terms=len(singular_values))
            # The division is element-wise: NumPy broadcasts the 1-D vector of
            # singular values over the rows, so column j is divided by s[j].
            self.svd_v = lsi_dense.T / singular_values
            np.save('./temp/corp1.svd_v.npy', self.svd_v)
            t2 = time.time()
            t_latent = t2 - t1
            print('Latent space creation took %f second' % t_latent)
        # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
        self.corpus_lsi = self.lsi[self.corpus_normalized]

    def create_feature_matrices(self):
        """Build (or load from ./temp) the term and document feature matrices.

        This creates the feature matrices introduced in [1]: the keyword
        (term) space is projected to the latent space first and, based on the
        document transformation idea, the documents are projected as well.

        Sets ``self.term_f_mat`` and ``self.doc_f_mat``. When they have to be
        computed, ``generate_latent_space()`` must have run first, because
        ``self.svd_v``, ``self.lsi`` and ``self.corpus_normalized`` are read.
        """
        if os.path.isfile('./temp/term_f_mat.npy') and os.path.isfile('./temp/doc_f_mat.npy'):
            self.term_f_mat = np.load('./temp/term_f_mat.npy')
            self.doc_f_mat = np.load('./temp/doc_f_mat.npy')
            return

        self._ensure_temp_dir()  # the save calls below fail if ./temp is missing
        t1 = time.time()
        # this extra division by S is necessary based on the LSI in wiki
        w = self.svd_v / self.lsi.projection.s
        # Use sparse matrices rather than dense ones to do the calculations (saves memory).
        # NOTE(review): the sizes come from data_orig (num_features, num_data,
        # corpus.num_nnz), not from corpus_normalized itself -- confirm they
        # always describe the tf-idf corpus as well.
        M_T_sparse = matutils.corpus2csc(self.corpus_normalized,
                                         num_terms=self.data_orig.num_features,
                                         num_docs=self.data_orig.num_data,
                                         num_nnz=self.data_orig.corpus.num_nnz)
        self.term_f_mat = M_T_sparse.dot(w)
        np.save('./temp/term_f_mat.npy', self.term_f_mat)
        t2 = time.time()

        # Based on the assumptions in [1], P(t_i|d_j) is needed in the original
        # space, so the document vectors are normalized to sum up to one.
        if self.params["normalize_terms"]:
            # take the sum over terms for each doc
            sum_over_terms = M_T_sparse.sum(axis=0).A.ravel()
            # create an inverted diagonal matrix of the sums
            sum_over_terms_diag = scipy.sparse.diags(1.0 / sum_over_terms, 0)
            # divide every document column by its sum via a matrix product
            M_T_sparse_normalized_T = M_T_sparse.dot(sum_over_terms_diag).transpose()
        else:
            M_T_sparse_normalized_T = M_T_sparse.transpose()
        self.doc_f_mat = M_T_sparse_normalized_T.dot(self.term_f_mat)
        np.save('./temp/doc_f_mat.npy', self.doc_f_mat)
        t3 = time.time()

        t_term_mat = t2 - t1
        t_doc_mat = t3 - t2
        t_total = t3 - t1
        print('Creating term matrix %f second' % t_term_mat)
        print('Creating document matrix %f second' % t_doc_mat)
        print('Total %f second' % t_total)

    def item_fv(self, index_item):
        """Return the row of ``term_f_mat`` for the term ``index_item``."""
        return self.term_f_mat[index_item][:]

    def doc_fv(self, index_doc):
        """Return the row of ``doc_f_mat`` for the document ``index_doc``."""
        # there would be new docs generated in every iteration. Should I update
        # the latent space? "no" at the moment
        return self.doc_f_mat[index_doc][:]

    def doc_fv_new(self, new_doc_fv):
        """Return the latent feature vector of a document that is not in the corpus.

        Feedbacks are on new docs (not in the corpus). It is enough to
        transform the new doc to the latent space, which is done as
        ``fv * self.term_f_mat``.

        new_doc_fv -- bag-of-words representation of the document; it is
            passed through ``self.tfidf`` and the result is read as
            ``(term_id, weight)`` pairs.
            NOTE(review): the caller needs to make sure the term ids match the
            current dictionary.

        When ``params["normalize_terms"]`` is set the weights are scaled to
        sum to one. If the weights sum to zero (e.g. an empty document) the
        scaling is skipped, so the result is a zero vector instead of NaNs.
        """
        # use tfidf
        doc_tfidf = self.tfidf[new_doc_fv]
        # make it a dense array over all terms
        dense_doc = np.zeros(self.num_terms)
        sum_over_terms = 0
        for term_id, weight in doc_tfidf:
            dense_doc[int(term_id)] = weight
            sum_over_terms = sum_over_terms + weight
        # Guard the division: dividing the all-zero vector by 0 would yield NaN.
        if self.params["normalize_terms"] and sum_over_terms != 0:
            dense_doc = dense_doc / sum_over_terms
        return np.dot(dense_doc, self.term_f_mat)