-
Notifications
You must be signed in to change notification settings - Fork 9
Expand file tree
/
Copy pathtfidf.m
More file actions
24 lines (19 loc) · 715 Bytes
/
tfidf.m
File metadata and controls
24 lines (19 loc) · 715 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
function [X_train, X_test] = tfidf(X_train, X_test)
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%
% Calculates the idf on the train set and performs tf-idf normalization of
% both matrices. Also L2-normalizes every row of both results.
%
%     tf-idf = tf * log(|D| / n_occurrences)
%
% where |D| is the number of rows of X_train and n_occurrences is, for each
% column, the number of rows of X_train holding a strictly positive entry.
%
% Inputs:
%   X_train - matrix used to estimate the idf weights (one row per
%             document, one column per term).
%   X_test  - matrix with the same number of columns as X_train; it is
%             re-weighted with the idf estimated from X_train.
%
% Outputs:
%   X_train, X_test - the inputs with each column scaled by its idf and
%                     each row then divided by (its L2 norm + eps).
%
% Notes:
%   * eps is added to the occurrence counts to avoid a division by zero, so
%     a column with no positive entry in X_train gets idf = log(|D| / eps).
%   * eps is also added to the row norms, so all-zero rows stay all-zero.
%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    n_docs = size(X_train, 1);

    % Count along dimension 1 explicitly: without the dimension argument,
    % sum() collapses a single-row X_train into one scalar and the per-term
    % weights are lost.
    doc_freq = sum(X_train > 0, 1);

    idf = log(n_docs ./ (doc_freq + eps));

    % Right-multiplying by a diagonal matrix scales column j by idf(j).
    n_terms = numel(idf);
    IDF = spdiags(idf(:), 0, n_terms, n_terms);

    X_train = L2_norm_row(X_train * IDF);
    X_test = L2_norm_row(X_test * IDF);

    function Xnorm = L2_norm_row(X)
        % Divide every row of X by (its Euclidean norm + eps); implemented
        % as a left-multiplication by a diagonal matrix of the reciprocals.
        n_rows = size(X, 1);
        row_scale = 1 ./ (sqrt(sum(X .* X, 2)) + eps);
        Xnorm = spdiags(row_scale, 0, n_rows, n_rows) * X;
    end
end