-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
92 lines (70 loc) · 2.07 KB
/
main.py
File metadata and controls
92 lines (70 loc) · 2.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
import numpy as np
import codecs
def loadDataSet():
    """Return the training documents together with their class labels.

    Returns:
        A ``(documents, labels)`` pair. ``documents`` is whatever
        ``readFile`` produces for ``dataset/0.txt``; ``labels`` is a
        hard-coded list of 54 ints (0 or 1).
    """
    # Hand-assigned labels, positionally matched to the documents.
    # NOTE(review): presumably 1 marks an abusive line (trainNB0 calls the
    # class-1 prior ``pAbusive``) and one label exists per line of the data
    # file -- neither is checked here; confirm against dataset/0.txt.
    labels = [
        1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1,
        1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0,
        1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1,
    ]
    documents = readFile("dataset/0.txt")
    return documents, labels
def createVocabList(dataset):
    """Collect every distinct token that occurs anywhere in ``dataset``.

    Args:
        dataset: Iterable of documents, each an iterable of hashable tokens.

    Returns:
        A list holding each distinct token exactly once. The order is the
        iteration order of the underlying set and is therefore arbitrary.
    """
    vocabs = set()
    for document in dataset:
        # Grow the set in place; the previous ``vocabs | set(document)``
        # copied the whole accumulated vocabulary on every iteration.
        vocabs.update(document)
    return list(vocabs)
def readFile(fl):
    """Read a text file and tokenise every line with ``splitData``.

    Args:
        fl: Path of the file to read. Its content is decoded as ISO-8859-9.

    Returns:
        A list with one entry per line of the file, in file order; each
        entry is the result of ``splitData`` for that line with its line
        terminator removed.
    """
    # The context manager closes the handle even when tokenising raises;
    # previously the file was never closed.
    with codecs.open(fl, 'r', encoding='iso8859_9') as f:
        # Strip only the line terminator: otherwise the last word of each
        # line carries a trailing "\n" and never matches the same word seen
        # elsewhere. Blank lines are deliberately kept so that the result
        # stays positionally aligned with any per-line labels.
        return [splitData(line.rstrip('\r\n')) for line in f.readlines()]
def splitData(sentence):
    """Tokenise ``sentence`` on single spaces and encode each token.

    Args:
        sentence: Text to split. Splitting is on the literal ``" "`` only,
            so consecutive spaces yield empty tokens.

    Returns:
        A list with every piece encoded as ISO-8859-9.
    """
    return [piece.encode('iso8859_9') for piece in sentence.split(" ")]
def bagOfWords2Vec(vocabs, inputSet):
    """Count how often each vocabulary word occurs in ``inputSet``.

    Args:
        vocabs: Sequence of known (hashable) words; a word's position here
            is its position in the returned vector.
        inputSet: Iterable of tokens to count.

    Returns:
        A list of ints, ``len(vocabs)`` long, where element ``i`` is the
        number of times ``vocabs[i]`` appears in ``inputSet``. Tokens that
        are not in ``vocabs`` are reported on stdout and otherwise ignored.
    """
    # Build the word -> position map once instead of scanning the list with
    # ``in`` and ``.index`` for every token. ``setdefault`` keeps the first
    # position of a repeated word, exactly like ``list.index``.
    position = {}
    for idx, word in enumerate(vocabs):
        position.setdefault(word, idx)

    returnVec = [0] * len(vocabs)
    for inp in inputSet:
        idx = position.get(inp)
        if idx is None:
            # The message used to be printed with a literal, unsubstituted
            # "%s" (and a typo); interpolate the token properly. The
            # single-argument form behaves the same on Python 2 and 3.
            print("not found %s" % (inp,))
        else:
            returnVec[idx] += 1
    return returnVec
def trainNB0(trainMatrix, trainCategory):
    """Estimate naive Bayes parameters from labelled word-count vectors.

    Args:
        trainMatrix: Non-empty sequence of equal-length count vectors, one
            per training document.
        trainCategory: Labels indexed like ``trainMatrix``; a value equal to
            1 selects class 1, anything else class 0.

    Returns:
        ``(p0Vect, p1Vect, pAbusive)``: the natural log of the smoothed
        per-word frequencies for class 0 and class 1, and the class-1 prior
        computed as ``sum(trainCategory) / number of documents``.
    """
    doc_count = len(trainMatrix)
    vocab_size = len(trainMatrix[0])
    prior_class1 = sum(trainCategory) / float(doc_count)

    # Slot 0 / slot 1 hold the class-0 / class-1 accumulators. Counts start
    # at one and totals at two so that no word ends up with probability zero.
    word_counts = [np.ones(vocab_size), np.ones(vocab_size)]
    word_totals = [2.0, 2.0]

    for row_idx, row in enumerate(trainMatrix):
        cls = 1 if trainCategory[row_idx] == 1 else 0
        word_counts[cls] += row
        word_totals[cls] += sum(row)

    # Work in log space so later per-word products become sums.
    log_p0 = np.log(word_counts[0] / word_totals[0])
    log_p1 = np.log(word_counts[1] / word_totals[1])
    return log_p0, log_p1, prior_class1
def classifyNB(vec2Classify, p0Vec, p1Vec, pClass1):
    """Pick the more likely class for one word-count vector.

    Args:
        vec2Classify: Word-count vector of the document to classify.
        p0Vec: Per-word log probabilities for class 0 (numpy array).
        p1Vec: Per-word log probabilities for class 1 (numpy array).
        pClass1: Prior probability of class 1.

    Returns:
        1 when the class-1 log score is strictly greater than the class-0
        log score, otherwise 0 (so ties go to class 0).
    """
    # Log scores: count-weighted log likelihoods plus the log prior.
    score_class1 = sum(vec2Classify * p1Vec) + np.log(pClass1)
    score_class0 = sum(vec2Classify * p0Vec) + np.log(1.0 - pClass1)
    return 1 if score_class1 > score_class0 else 0
def testingNB(splitted):
    """Train on the bundled data set, then classify ``splitted`` and print it.

    Args:
        splitted: List of tokens to classify.

    The classifier is rebuilt from ``loadDataSet`` on every call. The result
    is written to stdout as ``<tokens> classified as <label>``; nothing is
    returned.
    """
    documents, labels = loadDataSet()
    vocabulary = createVocabList(documents)
    training_rows = [bagOfWords2Vec(vocabulary, doc) for doc in documents]
    p0_vec, p1_vec, prior_class1 = trainNB0(training_rows, labels)

    query_vec = bagOfWords2Vec(vocabulary, splitted)
    verdict = classifyNB(query_vec, p0_vec, p1_vec, prior_class1)
    # Single-argument print: emits the same space-separated line as the
    # three-item print statement did.
    print(" ".join([str(splitted), "classified as", str(verdict)]))
if __name__ == '__main__':
    # Script entry point: train on the bundled data set and classify one
    # hard-coded sample token list, printing the predicted label.
    sample_tokens = ['orospu','poker,','zaman']
    testingNB(sample_tokens)