
Commit cbb97ae

finish logistic regression

1 parent d1d43a4 · commit cbb97ae

8 files changed: +484 / -187 lines

.idea/workspace.xml

Lines changed: 211 additions & 32 deletions
Some generated files are not rendered by default.
Lines changed: 94 additions & 0 deletions
# encoding=utf-8
# @Author: WenDesi
# @Date: 09-08-16

# @Last modified by: WenDesi
# @Last modified time: 08-11-16


import pandas as pd
import numpy as np
import cv2
import random
import time

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score


class Perceptron(object):

    def __init__(self):
        self.learning_step = 0.00001
        self.max_iteration = 5000

    def predict_(self, x):
        wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])
        return int(wx > 0)

    def train(self, features, labels):
        self.w = [0.0] * (len(features[0]) + 1)

        correct_count = 0
        time = 0

        while time < self.max_iteration:
            index = random.randint(0, len(labels) - 1)
            x = list(features[index])
            x.append(1.0)
            y = 2 * labels[index] - 1
            wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])

            if wx * y > 0:
                correct_count += 1
                if correct_count > self.max_iteration:
                    break
                continue

            for i in xrange(len(self.w)):
                self.w[i] += self.learning_step * (y * x[i])

    def predict(self, features):
        labels = []
        for feature in features:
            x = list(feature)
            x.append(1)
            labels.append(self.predict_(x))
        return labels


if __name__ == '__main__':

    print 'Start read data'

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values

    imgs = data[0::, 1::]
    labels = data[::, 0]

    # use 2/3 of the data as the training set and 1/3 as the test set
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)
    # print train_features.shape
    # print train_features.shape

    time_2 = time.time()
    print 'read data cost ', time_2 - time_1, ' second', '\n'

    print 'Start training'
    p = Perceptron()
    p.train(train_features, train_labels)

    time_3 = time.time()
    print 'training cost ', time_3 - time_2, ' second', '\n'

    print 'Start predicting'
    test_predict = p.predict(test_features)
    time_4 = time.time()
    print 'predicting cost ', time_4 - time_3, ' second', '\n'

    score = accuracy_score(test_labels, test_predict)
    print "The accruacy socre is ", score
Binary file (2.98 KB) not shown.

logistic_regression/competation.py

Lines changed: 50 additions & 0 deletions
# encoding=utf-8
# @Author: WenDesi
# @Date: 08-11-16

# @Last modified by: WenDesi
# @Last modified time: 08-11-16

import csv
import pandas as pd

from binary_perceptron import Perceptron
from logistic_regression import LogisticRegression

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

if __name__ == '__main__':

    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values

    imgs = data[0::, 1::]
    labels = data[::, 0]

    test_time = 10

    p = Perceptron()
    lr = LogisticRegression()

    writer = csv.writer(file('result.csv', 'wb'))

    for time in xrange(test_time):
        print 'iterater time %d' % time

        train_features, test_features, train_labels, test_labels = train_test_split(
            imgs, labels, test_size=0.33, random_state=23323)

        p.train(train_features, train_labels)
        lr.train(train_features, train_labels)

        p_predict = p.predict(test_features)
        lr_predict = lr.predict(test_features)

        p_score = accuracy_score(test_labels, p_predict)
        lr_score = accuracy_score(test_labels, lr_predict)

        print 'perceptron accruacy score ', p_score
        print 'logistic Regression accruacy score ', lr_score

        writer.writerow([time,p_score,lr_score])
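One portability note on the script above: file('result.csv', 'wb'), the bare print statements, and xrange are Python 2 only. Under Python 3 the same writer would be built with open() in text mode; a minimal hedged sketch (the example row simply reuses the first line of result.csv shown further below):

import csv

# Python 3 counterpart of csv.writer(file('result.csv', 'wb')):
# the file builtin is gone, and the csv module wants text mode with newline=''.
with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([0, 0.96681096681096679, 0.94834054834054837])  # [run, perceptron score, LR score]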

logistic_regression/logistic_regression.py

Lines changed: 67 additions & 75 deletions
@@ -5,111 +5,103 @@
 # @Last modified by: WenDesi
 # @Last modified time: 08-11-16

+import time
 import math
 import random

+import pandas as pd
+from sklearn.cross_validation import train_test_split
+from sklearn.metrics import accuracy_score

-def predict_(x, w):
-    wx = sum([w[j] * x[j] for j in xrange(len(w))])
-    exp_wx = math.exp(wx)

-    predict1 = exp_wx / (1 + exp_wx)
-    predict0 = 1 / (1 + exp_wx)
+class LogisticRegression(object):

-    if predict1 > predict0:
-        return 1
-    else:
-        return 0
+    def __init__(self):
+        self.learning_step = 0.00001
+        self.max_iteration = 5000

+    def predict_(self,x):
+        wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])
+        exp_wx = math.exp(wx)

-def train(features, labels):
-    w = [0.0] * (len(features[0]) + 1)
+        predict1 = exp_wx / (1 + exp_wx)
+        predict0 = 1 / (1 + exp_wx)

-    learning_step = 0.00001
-    max_iteration = 1000
-    correct_count = 0
-    time = 0
+        if predict1 > predict0:
+            return 1
+        else:
+            return 0

-    while time < max_iteration:
-        index = random.randint(0, len(labels) - 1)
-        x = features[index]
-        x.append(1.0)
-        y = labels[index]

-        if y == predict_(x, w):
-            correct_count += 1
-            if correct_count > max_iteration:
-                break
-            continue
+    def train(self,features, labels):
+        self.w = [0.0] * (len(features[0]) + 1)

-        print 'iterater times %d' % time
-        time += 1
         correct_count = 0
+        time = 0

-        wx = sum([w[i] * x[i] for i in xrange(len(w))])
-        exp_wx = math.exp(wx)
-
-        for i in xrange(len(w)):
-            w[i] -= learning_step * (-y * x[i] + float(x[i] * exp_wx) / float(1 + exp_wx))
+        while time < self.max_iteration:
+            index = random.randint(0, len(labels) - 1)
+            x = list(features[index])
+            x.append(1.0)
+            y = labels[index]

-    return w
+            if y == self.predict_(x):
+                correct_count += 1
+                if correct_count > self.max_iteration:
+                    break
+                continue

+            # print 'iterater times %d' % time
+            time += 1
+            correct_count = 0

-def predict(features, w):
-    labels = []
+            wx = sum([self.w[i] * x[i] for i in xrange(len(self.w))])
+            exp_wx = math.exp(wx)

-    for feature in features:
-        feature.append(1)
-        x = feature
+            for i in xrange(len(self.w)):
+                self.w[i] -= self.learning_step * \
+                    (-y * x[i] + float(x[i] * exp_wx) / float(1 + exp_wx))

-        labels.append(predict_(x,w))

-    return labels
+    def predict(self,features):
+        labels = []

+        for feature in features:
+            x = list(feature)
+            x.append(1)
+            labels.append(self.predict_(x))

-def build_dataset(label, original_posins, radius, size):
-    datasets = []
-    dim = len(original_posins)
+        return labels

-    for i in xrange(size):
-        dataset = [label]
-        for j in xrange(dim):
-            point = random.randint(0, 2 * radius) - radius + original_posins[j]
-            dataset.append(point)
-        datasets.append(dataset)
+if __name__ == "__main__":
+    print 'Start read data'

-    return datasets
+    time_1 = time.time()

-if __name__ == "__main__":
+    raw_data = pd.read_csv('../data/train_binary.csv',header=0)
+    data = raw_data.values

-    # build the training set
-    trainset1 = build_dataset(0, [0, 0], 10, 100)
-    trainset2 = build_dataset(1, [30, 30], 10, 100)
+    imgs = data[0::,1::]
+    labels = data[::,0]

-    trainset = trainset1
-    trainset.extend(trainset2)
-    random.shuffle(trainset)

-    trainset_features = map(lambda x: x[1:], trainset)
-    trainset_labels = map(lambda x: x[0], trainset)
+    # use 2/3 of the data as the training set and 1/3 as the test set
+    train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, random_state=23323)

-    # train
-    w = train(trainset_features, trainset_labels)
+    time_2 = time.time()
+    print 'read data cost ',time_2 - time_1,' second','\n'

-    # build the test set
-    testset1 = build_dataset(0, [0, 0], 10, 500)
-    testset2 = build_dataset(1, [30, 30], 10, 500)
+    print 'Start training'
+    lr = LogisticRegression()
+    lr.train(train_features, train_labels)

-    testset = testset1
-    testset.extend(testset2)
-    random.shuffle(testset)
+    time_3 = time.time()
+    print 'training cost ',time_3 - time_2,' second','\n'

-    testset_features = map(lambda x: x[1:], testset)
-    testset_labels = map(lambda x: x[0], testset)
+    print 'Start predicting'
+    test_predict = lr.predict(test_features)
+    time_4 = time.time()
+    print 'predicting cost ',time_4 - time_3,' second','\n'

-    # test
-    testset_predicts = predict(testset_features, w)
-    print 'asad'
-    accuracy_score = float(len(filter(lambda x: x == True, [testset_labels[i] == testset_predicts[
-        i] for i in xrange(len(testset_predicts))]))) / float(len(testset_predicts))
-    print "The accruacy socre is ", accuracy_score
+    score = accuracy_score(test_labels,test_predict)
+    print "The accruacy socre is ", score
Binary file (3.16 KB) not shown.

logistic_regression/result.csv

Lines changed: 10 additions & 0 deletions
0,0.96681096681096679,0.94834054834054837
1,0.98138528138528136,0.98484848484848486
2,0.98152958152958147,0.97554112554112549
3,0.98160173160173159,0.97128427128427131
4,0.98484848484848486,0.9878066378066378
5,0.98297258297258294,0.98823953823953825
6,0.98037518037518034,0.98730158730158735
7,0.98095238095238091,0.9878066378066378
8,0.96854256854256859,0.98535353535353531
9,0.97813852813852808,0.98672438672438667
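Each row corresponds to writer.writerow([time, p_score, lr_score]) in competation.py, i.e. [run index, perceptron accuracy, logistic regression accuracy]; logistic regression comes out ahead in 7 of the 10 runs. A short sketch for summarizing the file with pandas (the column names are mine, since the CSV has no header row):

import pandas as pd

results = pd.read_csv('result.csv', header=None,
                      names=['run', 'perceptron_acc', 'logistic_acc'])
print(results[['perceptron_acc', 'logistic_acc']].mean())  # average accuracy over the 10 runs
print((results['logistic_acc'] > results['perceptron_acc']).sum(), 'runs won by logistic regression')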
