-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnaive_bayesian.py
More file actions
160 lines (150 loc) · 5.77 KB
/
naive_bayesian.py
File metadata and controls
160 lines (150 loc) · 5.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
# -*- coding: utf-8 -*-
import csv
import random
import numpy
import math
import time
import sys
def main(argv):
    """Train and evaluate the naive Bayes classifier, then dump the full model.

    Every repeat reads ``data.csv``, randomly assigns 80% of the rows to a
    training set and the rest to a test set, estimates the per-position
    attribute-value probabilities from the training rows and records the
    accuracy obtained on the test rows.  The probabilities estimated from
    *all* rows are then written to ``result.txt``.  After the last repeat the
    mean of the recorded accuracies is printed.

    Args:
        argv: Command-line arguments; accepted for the entry point's
            signature but not used.
    """
    filename = "data.csv"
    ratio = 0.8
    repeats = 1
    result_set = []
    for _ in range(repeats):
        try:
            start = time.time()
            # Read the data.
            records = read_csv(filename)
            # Split the rows into a training set and a test set.
            trainset, testset = split_dataset(records, ratio)
            # Group the training records by job position (column 0).
            partitions = partition_trainset(trainset)
            # Count how often each attribute value occurs for each position.
            counter = count_attributes(partitions)
            # Turn the counts into per-position value probabilities.
            probabilities = calculate_probabilities(partitions, counter)
            # Predict the test records and record the accuracy.
            predict_testset(testset, probabilities, result_set)
            end = time.time()
            print("Cost: {0} s".format(end - start))
            print("**********")
            # Partition the full data set once and reuse it for both the
            # counts and the probabilities, instead of partitioning (and
            # printing the partition summary) twice.
            full_partitions = partition_trainset(records)
            result = calculate_probabilities(
                full_partitions, count_attributes(full_partitions)
            )
            with open("result.txt", "w") as result_file:
                for item in result:
                    # Position name on its own line ...
                    result_file.write(item)
                    result_file.write("\n")
                    # ... followed by one line per attribute listing its
                    # (value, probability) pairs.
                    for attr in result[item]:
                        for value in attr:
                            result_file.write("({0}, {1})".format(value, attr[value]))
                        result_file.write("\n")
        except (IOError, TypeError, ValueError) as identifier:
            # Report the failure and carry on with the next repeat.
            print(identifier)
    # numpy.mean([]) emits a RuntimeWarning before yielding nan; when every
    # repeat failed, print nan directly instead.
    print(numpy.mean(result_set) if result_set else float("nan"))
def read_csv(filename):
    """Load every row of a CSV file into memory and print a short summary.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one entry per CSV row, each row being a list of strings.
    """
    with open(filename, "r") as handle:
        records = list(csv.reader(handle))
    separator = "**********"
    print(separator)
    print("Data Loaded Successfully!")
    print("{0}: {1} rows".format(filename, len(records)))
    print(separator)
    return records
def split_dataset(dataset, ratio):
    """Randomly split a data set into a training part and a test part.

    ``int(len(dataset) * ratio)`` rows are drawn at random, without
    replacement, into the training set; whatever is left forms the test set.
    The input sequence itself is not modified.

    Args:
        dataset: Sequence of records to split.
        ratio: Fraction of the records that goes into the training set.

    Returns:
        A ``(trainset, testset)`` tuple of lists.
    """
    remaining = list(dataset)  # work on a copy so the caller's data survives
    trainset = []
    for _ in range(int(len(dataset) * ratio)):
        picked = random.randrange(0, len(remaining))
        trainset.append(remaining.pop(picked))
    print("Split the Dataset Successfully!")
    print("Train Set: {0} rows".format(len(trainset)))
    print("Test Set: {0} rows".format(len(remaining)))
    print("**********")
    return (trainset, remaining)
def partition_trainset(trainset):
    """Group records by their first column and print the size of each group.

    Args:
        trainset: Iterable of records; ``record[0]`` is used as the group key.

    Returns:
        A dict mapping each distinct ``record[0]`` to the list of records
        carrying that value, in their original order.
    """
    partitions = {}
    for record in trainset:
        partitions.setdefault(record[0], []).append(record)
    print("Partition the Train Set Successfully!")
    for label, rows in partitions.items():
        print("{0}: {1} rows".format(label, len(rows)))
    print("**********")
    return partitions
def count_attributes(partitions):
    """Count the occurrences of every attribute value within each partition.

    Columns 1 through 18 of each record are treated as attributes.

    Args:
        partitions: Dict mapping a key to the list of records in that group.

    Returns:
        A dict mapping each key to a list of 18 dicts; the dict at position
        ``k`` maps each value seen in column ``k + 1`` to the number of
        records of that group holding it.
    """
    counter = {}
    for label, rows in partitions.items():
        per_attribute = counter.setdefault(label, [])
        for column in range(1, 19):
            tally = {}
            for row in rows:
                value = row[column]
                tally[value] = tally.get(value, 0) + 1
            per_attribute.append(tally)
    return counter
def calculate_probabilities(partitions, counter):
    """Convert per-group value counts into relative frequencies.

    Args:
        partitions: Dict mapping a key to the list of records in that group;
            only the group sizes are used.
        counter: Dict mapping a key to a list of ``{value: count}`` dicts,
            one per attribute.

    Returns:
        A dict with the same layout as ``counter`` in which every count has
        been divided by the size of its group.
    """
    probabilities = {}
    for label, attribute_counts in counter.items():
        group_size = len(partitions[label])
        for counts in attribute_counts:
            shares = {
                value: float(occurrences) / group_size
                for value, occurrences in counts.items()
            }
            probabilities.setdefault(label, []).append(shares)
    print("Calculate Probabilities Successfully!")
    print("**********")
    return probabilities
def predict_testset(testset, probabilities, result_set):
    """Classify every test record and record the resulting accuracy.

    For each record the score of every candidate label is obtained from
    ``calculate_prediction`` and the label with the highest score is taken as
    the prediction.  The prediction is compared with ``record[0]``, the
    record's actual label.

    Args:
        testset: List of records to classify.
        probabilities: Model mapping each label to its list of per-attribute
            ``{value: probability}`` dicts.
        result_set: List to which the accuracy (fraction of correctly
            predicted records) is appended.

    Raises:
        ValueError: If ``testset`` is empty, since no accuracy can be
            computed (previously this surfaced as a ZeroDivisionError).
    """
    if not testset:
        raise ValueError("Cannot evaluate accuracy on an empty test set")

    known_labels = ("product", "technical", "sales", "operation", "finance", "HR")
    # Score the known labels in their established order, then any further
    # label the model contains.  Labels missing from the model (no training
    # record carried them) are skipped rather than raising KeyError.
    labels = [label for label in known_labels if label in probabilities]
    labels.extend(label for label in probabilities if label not in known_labels)

    correct = 0
    for record in testset:
        best_label = None
        best_score = None
        for label in labels:
            score = calculate_prediction(label, probabilities, record)
            # ">=" makes the label scored last win a tie, which is what the
            # earlier score-keyed lookup did implicitly.
            if best_score is None or score >= best_score:
                best_label = label
                best_score = score
        if best_label == record[0]:
            correct += 1

    # float() keeps this a true division under Python 2 as well.
    accuracy = float(correct) / len(testset)
    result_set.append(accuracy)
    print("Accuracy: {0}".format(accuracy))
    print("**********")
def calculate_prediction(item, probabilities, record):
    """Score a record against one label.

    The score is the product, over all attributes of the model, of the
    probability the model assigns to the record's value for that attribute.
    No class prior is included and no smoothing is applied: a value the
    model has never seen for this label makes the whole score zero.

    Args:
        item: Label whose model is used.
        probabilities: Model mapping each label to a list of
            ``{value: probability}`` dicts, one per attribute.
        record: Record to score; column 0 holds the label, so attribute
            ``k`` of the model corresponds to ``record[k + 1]``.

    Returns:
        The product of the matching probabilities, or 0 if any attribute
        value is unknown to the model.
    """
    probability = 1
    # Walk the model's attributes in order and pair each with its record
    # column explicitly, rather than relying on set iteration order and a
    # hard-coded attribute count.
    for column, value_probabilities in enumerate(probabilities[item], 1):
        value = record[column]
        if value not in value_probabilities:
            # Nothing can lift the product above zero again.
            return 0
        probability *= value_probabilities[value]
    return probability
# Run the experiment only when the file is executed as a script.
if __name__ == "__main__":
    main(sys.argv)