# nlp_enron/visualize.py at master - asantinc/nlp_enron - GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import *
import re
import math
import pdb
import string


def averageLen(emailDict):
    '''
    Return the mean len() of the values stored in emailDict.

    An empty emailDict yields the integer 0; otherwise the result is a float.
    '''
    sizes = []
    for key in emailDict:
        sizes.append(len(emailDict[key]))
    if not sizes:
        return 0
    return float(sum(sizes)) / len(sizes)

def getTokens(email):
    '''
    Return the list of tokens of an email without stop words.

    The text is lower-cased, every character listed in string.punctuation
    is removed, and the remainder is split with nltk.word_tokenize. Tokens
    found in NLTK's English stop word list or in COMMON_WORDS are dropped.

    email: the email text (byte string or unicode string).
    Returns the remaining tokens in their original order.
    '''
    COMMON_WORDS = ['meeting', 'paper', 'comments', 'conference', 'call']
    lowers = email.lower()
    # str.translate(None, chars) is only accepted by Python 2 byte strings;
    # filtering character by character gives the same result there and also
    # works for unicode input and on Python 3.
    punctuation = set(string.punctuation)
    clean_lowers = ''.join(ch for ch in lowers if ch not in punctuation)
    tokens = nltk.word_tokenize(clean_lowers)
    # A set keeps the stop-word test O(1) per token.
    stop = set(stopwords.words('english'))
    stop.update(COMMON_WORDS)
    return [w for w in tokens if w not in stop]

def stem_tokens(tokens, stemmer):
    '''
    Apply stemmer.stem to each token and return the results as a new list,
    in the same order as the input.
    '''
    return [stemmer.stem(word) for word in tokens]

def getIDF(emails):
    '''
    Build an inverse document frequency table for a collection of emails.

    Every email is tokenized with getTokens and stemmed with a Porter
    stemmer; for each stem the number of emails containing it is counted.
    Stems found in 3 or fewer emails (likely typos) or in at least 90% of
    the emails (overly common words) are left out of the result.

    emails: email texts, indexable from 0 to len(emails) - 1.
    Returns a dict mapping stem -> math.log(number of emails / number of
    emails containing the stem).
    '''
    myVocabulary = dict()
    myIDF = dict()
    emailNumber = len(emails)
    threshold = 0.9 * emailNumber
    # One stemmer instance is reused for every email.
    stemmer = PorterStemmer()
    for i in range(emailNumber):
        # Echo the index of the email being processed; the parenthesised
        # form prints the same text on Python 2 and Python 3.
        print(i)
        stemmed = stem_tokens(getTokens(emails[i]), stemmer)
        # Iterate over the distinct stems so each email contributes at most
        # once per stem: the counter holds a document frequency, which is
        # what the threshold (a fraction of the email count) is compared to.
        for token in set(stemmed):
            myVocabulary[token] = myVocabulary.get(token, 0) + 1
    for word, occurrences in myVocabulary.items():
        # Ignore typos and overly common words.
        if 3 < occurrences < threshold:
            myIDF[word] = math.log(float(emailNumber) / occurrences)
    return myIDF


def getWordFrequency(tokens, myIDF):
    '''
    Count the tokens that are known to myIDF.

    Returns a pair (counts, kept): counts maps each token present in myIDF
    to the number of times it occurs in tokens, and kept is the total number
    of such occurrences. Tokens missing from myIDF are ignored entirely.
    '''
    counts = {}
    kept = 0
    for tok in tokens:
        if tok not in myIDF:
            continue
        kept += 1
        counts[tok] = counts.get(tok, 0) + 1
    return counts, kept

def processEmails(emailDict, idfDict, query):
    '''
    Rate a set of emails against a query based on the tf.idf of their content.

    The query and every email are tokenized with getTokens and stemmed with a
    Porter stemmer. For each email, every stem that is present in idfDict and
    also occurs in the query contributes
        freq / (freq + damping) * idf
    to the email's score, where damping = k * docLength / averageDocLength
    with k = 2.

    emailDict: dict mapping a key to the text of an email.
    idfDict:   dict mapping stem -> idf weight (see getIDF).
    query:     query text.

    Returns (docValues, importantWordsDict):
      docValues          maps each key to the email's total score (0 when no
                         query stem matched).
      importantWordsDict maps each key to a list of (stem, contribution)
                         tuples, one per matched stem; empty when nothing
                         matched.
    '''
    docValues = dict()
    importantWordsDict = dict()
    # NOTE(review): averageLen takes len() of each email value (characters
    # for text), whereas docLength below is a count of tokens, so the two
    # sides of the damping ratio use different units -- confirm intended.
    averageDocLength = averageLen(emailDict)
    k = 2
    stemmer = PorterStemmer()
    # A set makes the per-token query membership test O(1).
    query_stems = set(stem_tokens(getTokens(query), stemmer))

    for originDestKey in emailDict:
        email_stems = stem_tokens(getTokens(emailDict[originDestKey]), stemmer)
        tokenFreqDict, docLength = getWordFrequency(email_stems, idfDict)
        matched = [token for token in tokenFreqDict if token in query_stems]

        value = 0
        importantWords = list()
        if matched:
            # The damping term is the same for every token of one email, so
            # it is computed once, and only when at least one stem matched.
            damping = (float(k) * docLength) / averageDocLength
            for token in matched:
                freq = tokenFreqDict[token]
                contribution = (float(freq) / (freq + damping)) * idfDict[token]
                # Record every matched stem with its own contribution rather
                # than overwriting the entry with a single stale value.
                importantWords.append((token, contribution))
                value += contribution
        importantWordsDict[originDestKey] = importantWords
        docValues[originDestKey] = value
    return docValues, importantWordsDict