-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvisualize.py
More file actions
111 lines (97 loc) · 3.23 KB
/
Copy pathvisualize.py
File metadata and controls
111 lines (97 loc) · 3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
import math
import pdb
import string
def averageLen(emailDict):
    '''
    Return the mean len() of the values stored in emailDict.

    The result is a float, except for an empty dictionary, for which the
    integer 0 is returned.
    '''
    if not emailDict:
        return 0
    total = 0
    for key in emailDict:
        total += len(emailDict[key])
    return float(total) / len(emailDict)
# Matches any single ASCII punctuation character; compiled once at import.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))


def getTokens(email):
    '''
    Return the list of tokens of email without stop words.

    The text is lower-cased, ASCII punctuation is removed, the remainder is
    split with nltk.word_tokenize, and every token that is an English stop
    word (NLTK stop-word corpus) or one of COMMON_WORDS is dropped.
    Token order and duplicates are preserved.
    '''
    COMMON_WORDS = ['meeting', 'paper', 'comments', 'conference', 'call']
    lowers = email.lower()
    # A regex substitution is used instead of translate(None, chars): the
    # two-argument form exists only on Python 2 byte strings and raises
    # TypeError for unicode text and on Python 3.
    clean_lowers = _PUNCTUATION_RE.sub('', lowers)
    tokens = nltk.word_tokenize(clean_lowers)
    # A set keeps the per-token membership test constant-time.
    stop = set(stopwords.words('english'))
    stop.update(COMMON_WORDS)
    return [w for w in tokens if w not in stop]
def stem_tokens(tokens, stemmer):
    '''
    Stem a list of tokens.

    Calls stemmer.stem on each token, in order, and returns the results as
    a new list.
    '''
    return [stemmer.stem(token) for token in tokens]
def getIDF(emails):
    '''
    Build an inverse-document-frequency table for a list of emails.

    Each email is tokenized with getTokens and stemmed with a Porter
    stemmer. For every stem, the number of emails containing it is counted.
    Stems found in more than 3 emails and in fewer than 90% of all emails
    are kept, which ignores typos and overly common words.

    Returns a dict mapping stem -> log(number of emails / emails containing
    the stem). The index of each email is printed as it is processed.
    '''
    documentFrequency = dict()
    emailNumber = len(emails)
    threshold = 0.9 * emailNumber
    # One stemmer instance is enough for the whole run.
    stemmer = PorterStemmer()
    for i, email in enumerate(emails):
        print(i)  # progress indicator; valid on both Python 2 and 3
        stemmed = stem_tokens(getTokens(email), stemmer)
        # set(): count each stem once per email, so the tally is the number
        # of emails the stem appears in rather than its total occurrences.
        for token in set(stemmed):
            documentFrequency[token] = documentFrequency.get(token, 0) + 1
    myIDF = dict()
    for word, occurrences in documentFrequency.items():
        if 3 < occurrences < threshold:
            myIDF[word] = math.log(float(emailNumber) / occurrences)
    return myIDF
def getWordFrequency(tokens, myIDF):
    '''
    Count the tokens that have an entry in myIDF.

    Returns a (frequencies, total) pair: frequencies maps each such token
    to the number of times it occurs in tokens, and total is the number of
    token occurrences that were counted. Tokens missing from myIDF are
    skipped entirely.
    '''
    counts = {}
    kept = 0
    for tok in tokens:
        if tok not in myIDF:
            continue
        kept += 1
        counts[tok] = counts.get(tok, 0) + 1
    return counts, kept
def processEmails(emailDict, idfDict, query):
    '''
    Rate a set of emails against a query based on the tf.idf of their content.

    emailDict maps a key to an email text, idfDict maps a stem to its IDF
    weight, and query is free text. Emails and query are tokenized with
    getTokens and stemmed with a Porter stemmer. For every query stem found
    in an email, the email's score grows by
        freq / (freq + damping) * idf
    where damping = k * docLength / averageDocLength and k = 2.

    Returns (docValues, importantWordsDict):
      docValues          -- key -> total score (0 when nothing matched)
      importantWordsDict -- key -> list of (stem, score contribution)
                            tuples, one per matched query stem
    '''
    docValues = dict()
    importantWordsDict = dict()
    # NOTE(review): averageLen measures len() of the raw email values
    # (characters, since they are strings), while docLength below counts
    # tokens present in idfDict -- confirm the unit mismatch is intended.
    averageDocLength = averageLen(emailDict)
    k = 2
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests in the inner loop.
    query_stems = set(stem_tokens(getTokens(query), stemmer))
    for originDestKey, email in emailDict.items():
        email_stems = stem_tokens(getTokens(email), stemmer)
        tokenFreqDict, docLength = getWordFrequency(email_stems, idfDict)
        value = 0
        importantWords = []
        matched = [token for token in tokenFreqDict if token in query_stems]
        if matched:
            # Constant for the whole email, so computed once; only evaluated
            # when there is a match, which implies a non-empty email.
            damping = (float(k) * docLength) / averageDocLength
            for token in matched:
                freq = tokenFreqDict[token]
                score = (float(freq) / (freq + damping)) * idfDict[token]
                # Record every matched stem with its own contribution
                # instead of overwriting the list with a single tuple.
                importantWords.append((token, score))
                value += score
        docValues[originDestKey] = value
        importantWordsDict[originDestKey] = importantWords
    return docValues, importantWordsDict