Skip to content

Commit f26eece

Browse files
committed
before svm blog
1 parent e72226a commit f26eece

20 files changed

+33535
-506
lines changed

.idea/workspace.xml

Lines changed: 226 additions & 68 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

data/adult-census-income.zip

465 KB
Binary file not shown.

data/adult-census-income/adult.csv

Lines changed: 32562 additions & 0 deletions
Large diffs are not rendered by default.

data/iris-species.zip

3.63 KB
Binary file not shown.

data/iris-species/Iris.csv

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
2+
1,5.1,3.5,1.4,0.2,Iris-setosa
3+
2,4.9,3.0,1.4,0.2,Iris-setosa
4+
3,4.7,3.2,1.3,0.2,Iris-setosa
5+
4,4.6,3.1,1.5,0.2,Iris-setosa
6+
5,5.0,3.6,1.4,0.2,Iris-setosa
7+
6,5.4,3.9,1.7,0.4,Iris-setosa
8+
7,4.6,3.4,1.4,0.3,Iris-setosa
9+
8,5.0,3.4,1.5,0.2,Iris-setosa
10+
9,4.4,2.9,1.4,0.2,Iris-setosa
11+
10,4.9,3.1,1.5,0.1,Iris-setosa
12+
11,5.4,3.7,1.5,0.2,Iris-setosa
13+
12,4.8,3.4,1.6,0.2,Iris-setosa
14+
13,4.8,3.0,1.4,0.1,Iris-setosa
15+
14,4.3,3.0,1.1,0.1,Iris-setosa
16+
15,5.8,4.0,1.2,0.2,Iris-setosa
17+
16,5.7,4.4,1.5,0.4,Iris-setosa
18+
17,5.4,3.9,1.3,0.4,Iris-setosa
19+
18,5.1,3.5,1.4,0.3,Iris-setosa
20+
19,5.7,3.8,1.7,0.3,Iris-setosa
21+
20,5.1,3.8,1.5,0.3,Iris-setosa
22+
21,5.4,3.4,1.7,0.2,Iris-setosa
23+
22,5.1,3.7,1.5,0.4,Iris-setosa
24+
23,4.6,3.6,1.0,0.2,Iris-setosa
25+
24,5.1,3.3,1.7,0.5,Iris-setosa
26+
25,4.8,3.4,1.9,0.2,Iris-setosa
27+
26,5.0,3.0,1.6,0.2,Iris-setosa
28+
27,5.0,3.4,1.6,0.4,Iris-setosa
29+
28,5.2,3.5,1.5,0.2,Iris-setosa
30+
29,5.2,3.4,1.4,0.2,Iris-setosa
31+
30,4.7,3.2,1.6,0.2,Iris-setosa
32+
31,4.8,3.1,1.6,0.2,Iris-setosa
33+
32,5.4,3.4,1.5,0.4,Iris-setosa
34+
33,5.2,4.1,1.5,0.1,Iris-setosa
35+
34,5.5,4.2,1.4,0.2,Iris-setosa
36+
35,4.9,3.1,1.5,0.1,Iris-setosa
37+
36,5.0,3.2,1.2,0.2,Iris-setosa
38+
37,5.5,3.5,1.3,0.2,Iris-setosa
39+
38,4.9,3.1,1.5,0.1,Iris-setosa
40+
39,4.4,3.0,1.3,0.2,Iris-setosa
41+
40,5.1,3.4,1.5,0.2,Iris-setosa
42+
41,5.0,3.5,1.3,0.3,Iris-setosa
43+
42,4.5,2.3,1.3,0.3,Iris-setosa
44+
43,4.4,3.2,1.3,0.2,Iris-setosa
45+
44,5.0,3.5,1.6,0.6,Iris-setosa
46+
45,5.1,3.8,1.9,0.4,Iris-setosa
47+
46,4.8,3.0,1.4,0.3,Iris-setosa
48+
47,5.1,3.8,1.6,0.2,Iris-setosa
49+
48,4.6,3.2,1.4,0.2,Iris-setosa
50+
49,5.3,3.7,1.5,0.2,Iris-setosa
51+
50,5.0,3.3,1.4,0.2,Iris-setosa
52+
51,7.0,3.2,4.7,1.4,Iris-versicolor
53+
52,6.4,3.2,4.5,1.5,Iris-versicolor
54+
53,6.9,3.1,4.9,1.5,Iris-versicolor
55+
54,5.5,2.3,4.0,1.3,Iris-versicolor
56+
55,6.5,2.8,4.6,1.5,Iris-versicolor
57+
56,5.7,2.8,4.5,1.3,Iris-versicolor
58+
57,6.3,3.3,4.7,1.6,Iris-versicolor
59+
58,4.9,2.4,3.3,1.0,Iris-versicolor
60+
59,6.6,2.9,4.6,1.3,Iris-versicolor
61+
60,5.2,2.7,3.9,1.4,Iris-versicolor
62+
61,5.0,2.0,3.5,1.0,Iris-versicolor
63+
62,5.9,3.0,4.2,1.5,Iris-versicolor
64+
63,6.0,2.2,4.0,1.0,Iris-versicolor
65+
64,6.1,2.9,4.7,1.4,Iris-versicolor
66+
65,5.6,2.9,3.6,1.3,Iris-versicolor
67+
66,6.7,3.1,4.4,1.4,Iris-versicolor
68+
67,5.6,3.0,4.5,1.5,Iris-versicolor
69+
68,5.8,2.7,4.1,1.0,Iris-versicolor
70+
69,6.2,2.2,4.5,1.5,Iris-versicolor
71+
70,5.6,2.5,3.9,1.1,Iris-versicolor
72+
71,5.9,3.2,4.8,1.8,Iris-versicolor
73+
72,6.1,2.8,4.0,1.3,Iris-versicolor
74+
73,6.3,2.5,4.9,1.5,Iris-versicolor
75+
74,6.1,2.8,4.7,1.2,Iris-versicolor
76+
75,6.4,2.9,4.3,1.3,Iris-versicolor
77+
76,6.6,3.0,4.4,1.4,Iris-versicolor
78+
77,6.8,2.8,4.8,1.4,Iris-versicolor
79+
78,6.7,3.0,5.0,1.7,Iris-versicolor
80+
79,6.0,2.9,4.5,1.5,Iris-versicolor
81+
80,5.7,2.6,3.5,1.0,Iris-versicolor
82+
81,5.5,2.4,3.8,1.1,Iris-versicolor
83+
82,5.5,2.4,3.7,1.0,Iris-versicolor
84+
83,5.8,2.7,3.9,1.2,Iris-versicolor
85+
84,6.0,2.7,5.1,1.6,Iris-versicolor
86+
85,5.4,3.0,4.5,1.5,Iris-versicolor
87+
86,6.0,3.4,4.5,1.6,Iris-versicolor
88+
87,6.7,3.1,4.7,1.5,Iris-versicolor
89+
88,6.3,2.3,4.4,1.3,Iris-versicolor
90+
89,5.6,3.0,4.1,1.3,Iris-versicolor
91+
90,5.5,2.5,4.0,1.3,Iris-versicolor
92+
91,5.5,2.6,4.4,1.2,Iris-versicolor
93+
92,6.1,3.0,4.6,1.4,Iris-versicolor
94+
93,5.8,2.6,4.0,1.2,Iris-versicolor
95+
94,5.0,2.3,3.3,1.0,Iris-versicolor
96+
95,5.6,2.7,4.2,1.3,Iris-versicolor
97+
96,5.7,3.0,4.2,1.2,Iris-versicolor
98+
97,5.7,2.9,4.2,1.3,Iris-versicolor
99+
98,6.2,2.9,4.3,1.3,Iris-versicolor
100+
99,5.1,2.5,3.0,1.1,Iris-versicolor
101+
100,5.7,2.8,4.1,1.3,Iris-versicolor
102+
101,6.3,3.3,6.0,2.5,Iris-virginica
103+
102,5.8,2.7,5.1,1.9,Iris-virginica
104+
103,7.1,3.0,5.9,2.1,Iris-virginica
105+
104,6.3,2.9,5.6,1.8,Iris-virginica
106+
105,6.5,3.0,5.8,2.2,Iris-virginica
107+
106,7.6,3.0,6.6,2.1,Iris-virginica
108+
107,4.9,2.5,4.5,1.7,Iris-virginica
109+
108,7.3,2.9,6.3,1.8,Iris-virginica
110+
109,6.7,2.5,5.8,1.8,Iris-virginica
111+
110,7.2,3.6,6.1,2.5,Iris-virginica
112+
111,6.5,3.2,5.1,2.0,Iris-virginica
113+
112,6.4,2.7,5.3,1.9,Iris-virginica
114+
113,6.8,3.0,5.5,2.1,Iris-virginica
115+
114,5.7,2.5,5.0,2.0,Iris-virginica
116+
115,5.8,2.8,5.1,2.4,Iris-virginica
117+
116,6.4,3.2,5.3,2.3,Iris-virginica
118+
117,6.5,3.0,5.5,1.8,Iris-virginica
119+
118,7.7,3.8,6.7,2.2,Iris-virginica
120+
119,7.7,2.6,6.9,2.3,Iris-virginica
121+
120,6.0,2.2,5.0,1.5,Iris-virginica
122+
121,6.9,3.2,5.7,2.3,Iris-virginica
123+
122,5.6,2.8,4.9,2.0,Iris-virginica
124+
123,7.7,2.8,6.7,2.0,Iris-virginica
125+
124,6.3,2.7,4.9,1.8,Iris-virginica
126+
125,6.7,3.3,5.7,2.1,Iris-virginica
127+
126,7.2,3.2,6.0,1.8,Iris-virginica
128+
127,6.2,2.8,4.8,1.8,Iris-virginica
129+
128,6.1,3.0,4.9,1.8,Iris-virginica
130+
129,6.4,2.8,5.6,2.1,Iris-virginica
131+
130,7.2,3.0,5.8,1.6,Iris-virginica
132+
131,7.4,2.8,6.1,1.9,Iris-virginica
133+
132,7.9,3.8,6.4,2.0,Iris-virginica
134+
133,6.4,2.8,5.6,2.2,Iris-virginica
135+
134,6.3,2.8,5.1,1.5,Iris-virginica
136+
135,6.1,2.6,5.6,1.4,Iris-virginica
137+
136,7.7,3.0,6.1,2.3,Iris-virginica
138+
137,6.3,3.4,5.6,2.4,Iris-virginica
139+
138,6.4,3.1,5.5,1.8,Iris-virginica
140+
139,6.0,3.0,4.8,1.8,Iris-virginica
141+
140,6.9,3.1,5.4,2.1,Iris-virginica
142+
141,6.7,3.1,5.6,2.4,Iris-virginica
143+
142,6.9,3.1,5.1,2.3,Iris-virginica
144+
143,5.8,2.7,5.1,1.9,Iris-virginica
145+
144,6.8,3.2,5.9,2.3,Iris-virginica
146+
145,6.7,3.3,5.7,2.5,Iris-virginica
147+
146,6.7,3.0,5.2,2.3,Iris-virginica
148+
147,6.3,2.5,5.0,1.9,Iris-virginica
149+
148,6.5,3.0,5.2,2.0,Iris-virginica
150+
149,6.2,3.4,5.4,2.3,Iris-virginica
151+
150,5.9,3.0,5.1,1.8,Iris-virginica

data/iris-species/database.sqlite

10 KB
Binary file not shown.

svm/generate_dataset.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
# encoding=utf8
2+
import numpy as np
3+
import random
4+
import matplotlib
5+
import matplotlib.pyplot as plt
6+
7+
N = 10 #生成训练数据的个数
8+
9+
# Solve AX = 0; equivalent to MATLAB's null(a', 'r')
10+
def null(a, rtol=1e-5):
    """Return (rank, basis) where the columns of *basis* span the null space of *a*.

    Singular values below rtol * s_max are treated as zero, mirroring
    MATLAB's null(a', 'r').
    """
    _, singular_values, vt = np.linalg.svd(a)
    threshold = rtol * singular_values[0]
    rank = (singular_values > threshold).sum()
    return rank, vt[rank:].T.copy()
14+
15+
# Sign function; vectorized later with np.vectorize before use on arrays
16+
def sign(x):
    """Scalar sign: 1 for positive, -1 for negative, 0 for zero.

    Meant to be wrapped with np.vectorize before being applied
    element-wise to arrays.
    """
    if x > 0:
        return 1
    if x < 0:
        return -1
    if x == 0:
        return 0
23+
# noisy=False: generate N linearly separable dim-dimensional points X with labels y
# noisy=True: generate linearly inseparable data with labels y
25+
def mk_data(N, noisy=False):
    """Generate N random 2-D points labeled by a random linear separator.

    Returns (X, y, w):
      X -- (2, N) array of points drawn uniformly from [-10, 10]^2
      y -- (1, N) array of labels, the sign of w^T [1; X]
      w -- separating weight vector (bias component first), obtained from
           the null space of a randomly sampled affine system

    If noisy is True, roughly N // 10 randomly chosen labels are flipped,
    making the data linearly inseparable.

    NOTE(review): relies on the module-global `sign` having been
    vectorized (np.vectorize) before this is called -- confirm callers.
    """
    rang = [-10, 10]
    dim = 2

    X = np.random.rand(dim, N) * (rang[1] - rang[0]) + rang[0]

    # Re-draw the separator until no point lies exactly on it
    # (a zero sign would make np.all(y) false).
    while True:
        Xsample = np.concatenate((np.ones((1, dim)),
                                  np.random.rand(dim, dim) * (rang[1] - rang[0]) + rang[0]))
        k, w = null(Xsample.T)
        y = sign(np.dot(w.T, np.concatenate((np.ones((1, N)), X))))
        if np.all(y):
            break

    if noisy:
        # BUGFIX: was random.sample(range(1, N), N / 10) -- index 0 could
        # never be flipped, and N / 10 is a float under Python 3.
        idx = random.sample(range(N), N // 10)
        for flip in idx:
            y[0][flip] = -y[0][flip]

    return (X, y, w)
45+
46+
def data_visualization(X, y, title):
    """Scatter-plot 2-D samples: label 1 in red, everything else in green.

    X     -- pair of coordinate sequences (X[0] = x-coords, X[1] = y-coords)
    y     -- one label per sample
    title -- figure title
    """
    red_points = [[], []]
    green_points = [[], []]

    for i in range(len(y)):
        bucket = red_points if y[i] == 1 else green_points
        bucket[0].append(X[0][i])
        bucket[1].append(X[1][i])

    plt.figure(figsize=(8, 6), dpi=80)
    plt.title(title)

    axes = plt.subplot(111)
    axes.scatter(red_points[0], red_points[1], s=20, c='red')
    axes.scatter(green_points[0], green_points[1], s=20, c='green')

    plt.show()
74+
75+
def rebuild_features(features):
    """Convert column-major features [[x1, ...], [x2, ...]] into row-major
    per-sample pairs [[x1, x2], ...].

    Replaces a Python-2-only xrange index loop with an equivalent
    zip-based comprehension.
    """
    return [[first, second] for first, second in zip(features[0], features[1])]
83+
84+
def generate_dataset(size, noisy = False, visualization = True):
    """Build a random binary dataset and split it roughly 2:1 into
    train and test sets.

    Parameters:
      size          -- number of samples to generate
      noisy         -- if True, ~10% of labels are flipped (inseparable data)
      visualization -- if True, scatter-plot the full/test/train sets

    Returns (train_features, train_labels, test_features, test_labels),
    where the feature lists are row-major [[x1, x2], ...].
    """
    # mk_data applies sign element-wise to an array, so vectorize the
    # module-global scalar helper first.
    global sign
    sign = np.vectorize(sign)

    # BUGFIX: the noisy flag was previously ignored -- mk_data was always
    # called with noisy=False.
    X, y, w = mk_data(size, noisy)
    y = list(y[0])

    if visualization:
        data_visualization(X, y, 'all data')

    testset_size = int(len(y) * 0.333)

    indexes = list(range(len(y)))
    test_indexes = random.sample(indexes, testset_size)
    train_indexes = list(set(indexes) - set(test_indexes))

    trainset_features = [[], []]
    trainset_labels = []

    testset_features = [[], []]
    testset_labels = []

    for i in test_indexes:
        testset_features[0].append(X[0][i])
        testset_features[1].append(X[1][i])
        testset_labels.append(y[i])

    if visualization:
        data_visualization(testset_features, testset_labels, 'test set')

    for i in train_indexes:
        trainset_features[0].append(X[0][i])
        trainset_features[1].append(X[1][i])
        trainset_labels.append(y[i])

    if visualization:
        data_visualization(trainset_features, trainset_labels, 'train set')

    return rebuild_features(trainset_features), trainset_labels, \
        rebuild_features(testset_features), testset_labels
123+
124+
125+
126+
if __name__ == '__main__':
    # Demo: build and visualize a 1000-sample random dataset.
    sample_count = 1000
    generate_dataset(sample_count)

svm/generate_dataset.pyc

3.65 KB
Binary file not shown.

svm/logistic_regression.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# encoding=utf-8
2+
# @Author: WenDesi
3+
# @Date: 08-11-16
4+
5+
# @Last modified by: WenDesi
6+
# @Last modified time: 08-11-16
7+
8+
import time
9+
import math
10+
import random
11+
12+
import pandas as pd
13+
from sklearn.cross_validation import train_test_split
14+
from sklearn.metrics import accuracy_score
15+
16+
17+
class LogisticRegression(object):
    """Binary logistic-regression classifier trained by stochastic
    gradient descent on randomly sampled examples.

    Labels are expected to be 0 or 1.  A constant 1.0 is appended to each
    feature vector internally, so the last weight acts as the bias term.
    """

    def __init__(self):
        self.learning_step = 0.00001   # SGD step size
        self.max_iteration = 5000      # stop after this many corrective updates

    def predict_(self, x):
        """Predict the label (0 or 1) of one augmented feature vector x.

        P(y=1|x) = exp(wx) / (1 + exp(wx)) exceeds P(y=0|x) exactly when
        wx > 0, so the comparison is done on wx directly.
        BUGFIX: the previous math.exp(wx) raised OverflowError for large wx.
        """
        wx = sum([self.w[j] * x[j] for j in range(len(self.w))])
        return 1 if wx > 0 else 0

    def train(self, features, labels):
        """Fit the weight vector self.w via SGD.

        Randomly samples one example per step and updates the weights on
        each misclassified sample.  Stops after max_iteration updates, or
        earlier once max_iteration consecutive samples come out correct.
        """
        self.w = [0.0] * (len(features[0]) + 1)

        correct_count = 0
        updates = 0  # renamed from `time`, which shadowed the time module

        while updates < self.max_iteration:
            index = random.randint(0, len(labels) - 1)
            x = list(features[index])
            x.append(1.0)  # bias input
            y = labels[index]

            if y == self.predict_(x):
                correct_count += 1
                if correct_count > self.max_iteration:
                    break
                continue

            updates += 1
            correct_count = 0

            wx = sum([self.w[i] * x[i] for i in range(len(self.w))])

            # Numerically stable sigmoid (BUGFIX: exp(wx) overflowed for
            # wx > ~709; the gradient only needs exp(wx) / (1 + exp(wx))).
            if wx >= 0:
                sigmoid = 1.0 / (1.0 + math.exp(-wx))
            else:
                e = math.exp(wx)
                sigmoid = e / (1.0 + e)

            # Gradient step on the negative log-likelihood of this sample.
            for i in range(len(self.w)):
                self.w[i] -= self.learning_step * (-y * x[i] + x[i] * sigmoid)

    def predict(self, features):
        """Return a list of 0/1 predictions, one per feature vector."""
        labels = []
        for feature in features:
            x = list(feature)
            x.append(1)
            labels.append(self.predict_(x))
        return labels
75+
76+
if __name__ == "__main__":
77+
print 'Start read data'
78+
79+
time_1 = time.time()
80+
81+
raw_data = pd.read_csv('../data/train_binary.csv',header=0)
82+
data = raw_data.values
83+
84+
imgs = data[0::,1::]
85+
labels = data[::,0]
86+
87+
88+
# 选取 2/3 数据作为训练集, 1/3 数据作为测试集
89+
train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, random_state=23323)
90+
91+
time_2 = time.time()
92+
print 'read data cost ',time_2 - time_1,' second','\n'
93+
94+
print 'Start training'
95+
lr = LogisticRegression()
96+
lr.train(train_features, train_labels)
97+
98+
time_3 = time.time()
99+
print 'training cost ',time_3 - time_2,' second','\n'
100+
101+
print 'Start predicting'
102+
test_predict = lr.predict(test_features)
103+
time_4 = time.time()
104+
print 'predicting cost ',time_4 - time_3,' second','\n'
105+
106+
score = accuracy_score(test_labels,test_predict)
107+
print "The accruacy socre is ", score

svm/logistic_regression.pyc

3.06 KB
Binary file not shown.

0 commit comments

Comments
 (0)