
Commit cbb97ae

finish logistic regression

1 parent d1d43a4 · commit cbb97ae

8 files changed: +484 / -187 lines

.idea/workspace.xml

Lines changed: 211 additions & 32 deletions
Some generated files are not rendered by default.
Lines changed: 94 additions & 0 deletions
# encoding=utf-8
# @Author: WenDesi
# @Date: 09-08-16

# @Last modified by: WenDesi
# @Last modified time: 08-11-16


import pandas as pd
import numpy as np
import cv2
import random
import time

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score


class Perceptron(object):

    def __init__(self):
        self.learning_step = 0.00001
        self.max_iteration = 5000

    def predict_(self, x):
        wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])
        return int(wx > 0)

    def train(self, features, labels):
        self.w = [0.0] * (len(features[0]) + 1)

        correct_count = 0
        time = 0

        while time < self.max_iteration:
            index = random.randint(0, len(labels) - 1)
            x = list(features[index])
            x.append(1.0)
            y = 2 * labels[index] - 1
            wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])

            if wx * y > 0:
                correct_count += 1
                if correct_count > self.max_iteration:
                    break
                continue

            for i in xrange(len(self.w)):
                self.w[i] += self.learning_step * (y * x[i])

    def predict(self, features):
        labels = []
        for feature in features:
            x = list(feature)
            x.append(1)
            labels.append(self.predict_(x))
        return labels


if __name__ == '__main__':

    print 'Start read data'

    time_1 = time.time()

    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values

    imgs = data[0::, 1::]
    labels = data[::, 0]

    # use 2/3 of the data as the training set and 1/3 as the test set
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)
    # print train_features.shape
    # print train_features.shape

    time_2 = time.time()
    print 'read data cost ', time_2 - time_1, ' second', '\n'

    print 'Start training'
    p = Perceptron()
    p.train(train_features, train_labels)

    time_3 = time.time()
    print 'training cost ', time_3 - time_2, ' second', '\n'

    print 'Start predicting'
    test_predict = p.predict(test_features)
    time_4 = time.time()
    print 'predicting cost ', time_4 - time_3, ' second', '\n'

    score = accuracy_score(test_labels, test_predict)
    print "The accruacy socre is ", score
Binary file (2.98 KB) not shown.

logistic_regression/competation.py

Lines changed: 50 additions & 0 deletions
# encoding=utf-8
# @Author: WenDesi
# @Date: 08-11-16

# @Last modified by: WenDesi
# @Last modified time: 08-11-16

import csv
import pandas as pd

from binary_perceptron import Perceptron
from logistic_regression import LogisticRegression

from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score

if __name__ == '__main__':

    raw_data = pd.read_csv('../data/train_binary.csv', header=0)
    data = raw_data.values

    imgs = data[0::, 1::]
    labels = data[::, 0]

    test_time = 10

    p = Perceptron()
    lr = LogisticRegression()

    writer = csv.writer(file('result.csv', 'wb'))

    for time in xrange(test_time):
        print 'iterater time %d' % time

        train_features, test_features, train_labels, test_labels = train_test_split(
            imgs, labels, test_size=0.33, random_state=23323)

        p.train(train_features, train_labels)
        lr.train(train_features, train_labels)

        p_predict = p.predict(test_features)
        lr_predict = lr.predict(test_features)

        p_score = accuracy_score(test_labels, p_predict)
        lr_score = accuracy_score(test_labels, lr_predict)

        print 'perceptron accruacy score ', p_score
        print 'logistic Regression accruacy score ', lr_score

        writer.writerow([time,p_score,lr_score])
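One portability note on the script above: file('result.csv', 'wb'), the bare print statements, and xrange are Python 2 only. Under Python 3 the same writer would be built with open() in text mode; a minimal hedged sketch (the example row simply reuses the first line of result.csv shown further below):

import csv

# Python 3 counterpart of csv.writer(file('result.csv', 'wb')):
# the file builtin is gone, and the csv module wants text mode with newline=''.
with open('result.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow([0, 0.96681096681096679, 0.94834054834054837])  # [run, perceptron score, LR score]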

logistic_regression/logistic_regression.py

Lines changed: 67 additions & 75 deletions
@@ -5,111 +5,103 @@
 # @Last modified by: WenDesi
 # @Last modified time: 08-11-16

+import time
 import math
 import random

+import pandas as pd
+from sklearn.cross_validation import train_test_split
+from sklearn.metrics import accuracy_score

-def predict_(x, w):
-    wx = sum([w[j] * x[j] for j in xrange(len(w))])
-    exp_wx = math.exp(wx)

-    predict1 = exp_wx / (1 + exp_wx)
-    predict0 = 1 / (1 + exp_wx)
+class LogisticRegression(object):

-    if predict1 > predict0:
-        return 1
-    else:
-        return 0
+    def __init__(self):
+        self.learning_step = 0.00001
+        self.max_iteration = 5000

+    def predict_(self,x):
+        wx = sum([self.w[j] * x[j] for j in xrange(len(self.w))])
+        exp_wx = math.exp(wx)

-def train(features, labels):
-    w = [0.0] * (len(features[0]) + 1)
+        predict1 = exp_wx / (1 + exp_wx)
+        predict0 = 1 / (1 + exp_wx)

-    learning_step = 0.00001
-    max_iteration = 1000
-    correct_count = 0
-    time = 0
+        if predict1 > predict0:
+            return 1
+        else:
+            return 0

-    while time < max_iteration:
-        index = random.randint(0, len(labels) - 1)
-        x = features[index]
-        x.append(1.0)
-        y = labels[index]

-        if y == predict_(x, w):
-            correct_count += 1
-            if correct_count > max_iteration:
-                break
-            continue
+    def train(self,features, labels):
+        self.w = [0.0] * (len(features[0]) + 1)

-        print 'iterater times %d' % time
-        time += 1
         correct_count = 0
+        time = 0

-        wx = sum([w[i] * x[i] for i in xrange(len(w))])
-        exp_wx = math.exp(wx)
-
-        for i in xrange(len(w)):
-            w[i] -= learning_step * (-y * x[i] + float(x[i] * exp_wx) / float(1 + exp_wx))
+        while time < self.max_iteration:
+            index = random.randint(0, len(labels) - 1)
+            x = list(features[index])
+            x.append(1.0)
+            y = labels[index]

-    return w
+            if y == self.predict_(x):
+                correct_count += 1
+                if correct_count > self.max_iteration:
+                    break
+                continue

+            # print 'iterater times %d' % time
+            time += 1
+            correct_count = 0

-def predict(features, w):
-    labels = []
+            wx = sum([self.w[i] * x[i] for i in xrange(len(self.w))])
+            exp_wx = math.exp(wx)

-    for feature in features:
-        feature.append(1)
-        x = feature
+            for i in xrange(len(self.w)):
+                self.w[i] -= self.learning_step * \
+                    (-y * x[i] + float(x[i] * exp_wx) / float(1 + exp_wx))

-        labels.append(predict_(x,w))

-    return labels
+    def predict(self,features):
+        labels = []

+        for feature in features:
+            x = list(feature)
+            x.append(1)
+            labels.append(self.predict_(x))

-def build_dataset(label, original_posins, radius, size):
-    datasets = []
-    dim = len(original_posins)
+        return labels

-    for i in xrange(size):
-        dataset = [label]
-        for j in xrange(dim):
-            point = random.randint(0, 2 * radius) - radius + original_posins[j]
-            dataset.append(point)
-        datasets.append(dataset)
+if __name__ == "__main__":
+    print 'Start read data'

-    return datasets
+    time_1 = time.time()

-if __name__ == "__main__":
+    raw_data = pd.read_csv('../data/train_binary.csv',header=0)
+    data = raw_data.values

-    # build the training set
-    trainset1 = build_dataset(0, [0, 0], 10, 100)
-    trainset2 = build_dataset(1, [30, 30], 10, 100)
+    imgs = data[0::,1::]
+    labels = data[::,0]

-    trainset = trainset1
-    trainset.extend(trainset2)
-    random.shuffle(trainset)

-    trainset_features = map(lambda x: x[1:], trainset)
-    trainset_labels = map(lambda x: x[0], trainset)
+    # use 2/3 of the data as the training set and 1/3 as the test set
+    train_features, test_features, train_labels, test_labels = train_test_split(imgs, labels, test_size=0.33, random_state=23323)

-    # train
-    w = train(trainset_features, trainset_labels)
+    time_2 = time.time()
+    print 'read data cost ',time_2 - time_1,' second','\n'

-    # build the test set
-    testset1 = build_dataset(0, [0, 0], 10, 500)
-    testset2 = build_dataset(1, [30, 30], 10, 500)
+    print 'Start training'
+    lr = LogisticRegression()
+    lr.train(train_features, train_labels)

-    testset = testset1
-    testset.extend(testset2)
-    random.shuffle(testset)
+    time_3 = time.time()
+    print 'training cost ',time_3 - time_2,' second','\n'

-    testset_features = map(lambda x: x[1:], testset)
-    testset_labels = map(lambda x: x[0], testset)
+    print 'Start predicting'
+    test_predict = lr.predict(test_features)
+    time_4 = time.time()
+    print 'predicting cost ',time_4 - time_3,' second','\n'

-    # test
-    testset_predicts = predict(testset_features, w)
-    print 'asad'
-    accuracy_score = float(len(filter(lambda x: x == True, [testset_labels[i] == testset_predicts[
-        i] for i in xrange(len(testset_predicts))]))) / float(len(testset_predicts))
-    print "The accruacy socre is ", accuracy_score
+    score = accuracy_score(test_labels,test_predict)
+    print "The accruacy socre is ", score
Binary file (3.16 KB) not shown.

logistic_regression/result.csv

Lines changed: 10 additions & 0 deletions
0,0.96681096681096679,0.94834054834054837
1,0.98138528138528136,0.98484848484848486
2,0.98152958152958147,0.97554112554112549
3,0.98160173160173159,0.97128427128427131
4,0.98484848484848486,0.9878066378066378
5,0.98297258297258294,0.98823953823953825
6,0.98037518037518034,0.98730158730158735
7,0.98095238095238091,0.9878066378066378
8,0.96854256854256859,0.98535353535353531
9,0.97813852813852808,0.98672438672438667
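Each row corresponds to writer.writerow([time, p_score, lr_score]) in competation.py, i.e. [run index, perceptron accuracy, logistic regression accuracy]; logistic regression comes out ahead in 7 of the 10 runs. A short sketch for summarizing the file with pandas (the column names are mine, since the CSV has no header row):

import pandas as pd

results = pd.read_csv('result.csv', header=None,
                      names=['run', 'perceptron_acc', 'logistic_acc'])
print(results[['perceptron_acc', 'logistic_acc']].mean())  # average accuracy over the 10 runs
print((results['logistic_acc'] > results['perceptron_acc']).sum(), 'runs won by logistic regression')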
