-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtf.cpp
115 lines (91 loc) · 2.87 KB
/
tf.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
// g++ -std=c++11 -I /usr/lib/boost/include -L /usr/lib/boost/lib tfidf.cpp -lboost_regex
// If you've included boost libraries in LD_LIBRARY_PATH, just use $ g++ -std=c++11 tfidf.cpp -lboost_regex
#include <iostream>
#include <string>
#include <climits>
#include <map>
#include <unordered_map>
#include <fstream>
#include <algorithm>
#include <boost/regex.hpp>
#include <boost/algorithm/string.hpp>
#include <boost/algorithm/string/join.hpp>
#include "stop_words.cpp"
#include <set>
using namespace std;
set<string > getKeywords(string filename, int PERCENT) {
cout<<"\n\n Percent value = "<<PERCENT;
string line,input_text="";
ifstream myfile (filename);
//Getting all words into input_text
if (myfile.is_open())
{
while ( getline (myfile,line) )
{
line = boost::regex_replace(line, boost::regex {"[^'a-zA-Z]"}, std::string {" "});
line = boost::regex_replace(line, boost::regex {"[ \r\n\t]+"}, std::string {" "});
transform(line.begin(), line.end(), line.begin(), ::tolower);
input_text+=line+" ";
}
myfile.close();
}
//Storing input_text into separate words in input_text_words
vector<string> input_text_words;
boost::split(input_text_words, input_text, boost::is_any_of(" "));
map<string,int> found_keywords;
unordered_map<string, int> stop_words = get_stop_words();
int w_length,unique_keyword_count=0;
for(string& word : input_text_words)
{
//Remove extra spaces surrounding a word
boost::trim(word);
w_length = word.length();
//If the word is actually a word
if(w_length)
{
//If the word is not a stop word then proceed
if(stop_words[word] == 0)
{
//If not already present in keyword, Create the word
if(!found_keywords[word])
{
found_keywords[word] = 1;
unique_keyword_count++;
}
//If already present, increase the count
else
{
found_keywords[word] += 1;
}
}
}
} //End of for loop
vector<pair<string, int> > found_keywords_sort(found_keywords.begin(), found_keywords.end());
struct
{
bool operator() (pair<string, int> a, pair<string, int> b)
{
return a.second > b.second; // desc
}
} sort_keywords_desc;
sort(found_keywords_sort.begin(), found_keywords_sort.end(), sort_keywords_desc);
//Resize to top 100 keywords only
int resize_val = (PERCENT * found_keywords_sort.size())/100;
cout<<"\n Actual value = "<<found_keywords_sort.size()<<" resize value = "<<resize_val;
found_keywords_sort.resize(resize_val);
set<string> final_keywords;
set<string>::iterator it=final_keywords.begin();
transform(found_keywords_sort.begin(),found_keywords_sort.end(),
std::inserter(final_keywords,it),
[](const std::pair<string, int>& p) { return p.first; }
);
return final_keywords;
}
int get_similarity(set<string> file1, set<string> file2){
int similarity=0;
for(auto p: file1){
if(file2.find(p)!=file2.end())
similarity++;
}
return similarity;
}