diff --git a/499net.pth b/499net.pth new file mode 100644 index 0000000..a3966cc Binary files /dev/null and b/499net.pth differ diff --git a/HW2.md b/HW2.md new file mode 100644 index 0000000..73f4482 --- /dev/null +++ b/HW2.md @@ -0,0 +1,235 @@ + + + + + + + HW2.md (editing) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+
+
+ + +
+ + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/HW2.py b/HW2.py new file mode 100644 index 0000000..d62172f --- /dev/null +++ b/HW2.py @@ -0,0 +1,235 @@ + + + + + + + HW2.py (editing) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+ + +
+
+
+ + +
# File: MNIST_张靖.py
#
# Notes carried over from the accompanying MNIST_张靖.md (translated):
#   Hung-yi Lee Machine Learning, homework 2.
#   The model assumes the data are normally distributed and uses a two-class
#   generative model with a shared sigma; the per-class parameters are computed
#   from the dataset and plugged directly into the closed-form solution.
#   10% of the training data is held out as the validation set; the reported
#   validation accuracy is 0.847359 and the test accuracy is 0.843990.
#   Studied: Andrew Ng's week-2 course and the week 1-2 exercises.
#   Planned: Andrew Ng's week-3 course and exercises, Hung-yi Lee's homework 3,
#   and continuing to learn Python.
import pandas as pd
import numpy as np
from random import shuffle
from numpy.linalg import inv
from math import floor, log
import os
import argparse


# Directory the prediction CSV is written to (kept with its trailing slash).
output_dir = "output/"


def dataProcess_X(rawData):
    """Turn the raw table into a normalized, fully numeric feature frame.

    Steps:
      1. Drop ``sex`` (encoded by hand below) and, if present, the ``income``
         label column.
      2. Keep numeric columns as they are and one-hot encode every other
         column with ``pd.get_dummies``.
      3. Put a binary ``sex`` column first (`` Female`` -> 1, anything else 0).
      4. Cast everything to ``int64`` and standardize each column with its own
         mean and (sample) standard deviation.

    Args:
        rawData: DataFrame that contains a ``sex`` column and optionally an
            ``income`` column.

    Returns:
        DataFrame of standardized features; columns are ``sex``, the numeric
        columns in their original order, then the dummy columns.

    Raises:
        KeyError: if ``rawData`` has no ``sex`` column.
    """
    # "sex" has only two values, so it is removed here and re-added as a
    # single 0/1 column instead of two dummy columns.
    features = rawData.drop(["sex"], axis=1)
    if "income" in features.columns:
        features = features.drop(["income"], axis=1)

    # Split by "numeric or not" rather than by the literal "object" dtype so
    # string/category columns are still treated as categorical.
    numeric_cols = [
        col for col in features.columns
        if pd.api.types.is_numeric_dtype(features[col])
    ]
    categorical_cols = [col for col in features.columns if col not in numeric_cols]

    # .copy() so that insert() does not act on a slice of the caller's frame.
    numeric = features[numeric_cols].copy()
    # np.int was removed from NumPy; "int64" matches the cast applied below.
    numeric.insert(0, "sex", (rawData["sex"] == " Female").astype("int64"))

    parts = [numeric]
    if categorical_cols:
        # Every category value becomes its own 0/1 attribute column.
        parts.append(
            pd.get_dummies(features[categorical_cols], columns=categorical_cols)
        )

    data_x = pd.concat(parts, axis=1).astype("int64")

    # Standardize. A column with no spread (or a single-row frame, where the
    # sample std is NaN) would otherwise turn into NaN/inf; dividing by 1
    # leaves such columns at 0 after centering.
    std = data_x.std()
    std = std.where(std > 0, 1.0)
    return (data_x - data_x.mean()) / std


def dataProcess_Y(rawData):
    """Build the label frame: 1 where ``income`` equals `` >50K``, else 0.

    Args:
        rawData: DataFrame that contains an ``income`` column.

    Returns:
        Single-column ``int64`` DataFrame named ``income``.
    """
    labels = (rawData["income"] == " >50K").astype("int64")
    return labels.to_frame(name="income")
def sigmoid(z):
    """Logistic function, clipped to the open interval (1e-8, 1 - 1e-8).

    Args:
        z: scalar or array of logits.

    Returns:
        ``1 / (1 + exp(-z))`` element-wise, clipped so downstream logs never
        see exactly 0 or 1.
    """
    # exp() overflows float64 for arguments beyond ~709. Past +-500 the result
    # is pinned by the clip below anyway, so clamping changes nothing but
    # avoids the overflow warning.
    z = np.clip(z, -500, 500)
    res = 1 / (1.0 + np.exp(-z))
    return np.clip(res, 1e-8, (1 - (1e-8)))


def _shuffle(X, Y):
    """Return ``X`` and ``Y`` (both np.array) reordered by one shared random permutation."""
    randomize = np.arange(X.shape[0])
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])


def split_valid_set(X, Y, percentage):
    """Shuffle the data and split off the first ``percentage`` as validation.

    Args:
        X: feature array, one sample per row.
        Y: label array with the same number of rows as ``X``.
        percentage: fraction of the rows used for validation.

    Returns:
        ``(X_train, Y_train, X_valid, Y_valid)``.
    """
    all_size = X.shape[0]
    valid_size = int(floor(all_size * percentage))

    X, Y = _shuffle(X, Y)
    X_valid, Y_valid = X[:valid_size], Y[:valid_size]
    X_train, Y_train = X[valid_size:], Y[valid_size:]

    return X_train, Y_train, X_valid, Y_valid


def _posterior(X, mu1, mu2, shared_sigma, N1, N2):
    """Return P(class 1 | x) for every row of ``X`` under the shared-covariance model."""
    sigma_inv = inv(shared_sigma)
    # Linear form of the posterior: sigmoid(w . x + b) with
    #   w = (mu1 - mu2)^T Sigma^-1
    #   b = -1/2 mu1^T Sigma^-1 mu1 + 1/2 mu2^T Sigma^-1 mu2 + ln(N1 / N2)
    w = np.dot((mu1 - mu2), sigma_inv)
    b = (
        (-0.5) * np.dot(np.dot(mu1.T, sigma_inv), mu1)
        + (0.5) * np.dot(np.dot(mu2.T, sigma_inv), mu2)
        + np.log(float(N1) / N2)
    )
    return sigmoid(np.dot(w, X.T) + b)


def valid(X, Y, mu1, mu2, shared_sigma, N1, N2):
    """Print and return the accuracy of the model on ``(X, Y)``.

    Args:
        X: feature array, one sample per row.
        Y: 0/1 labels, shape ``(n,)`` or ``(n, 1)``.
        mu1, mu2: class means as returned by ``train``.
        shared_sigma: shared covariance matrix as returned by ``train``.
        N1, N2: class sample counts as returned by ``train``.

    Returns:
        The accuracy as a float (earlier versions returned ``None``; callers
        that ignore the result are unaffected).
    """
    y_ = np.around(_posterior(X, mu1, mu2, shared_sigma, N1, N2))
    result = (np.reshape(Y, -1) == y_)
    acc = float(result.sum()) / result.shape[0]
    print('Valid acc = %f' % acc)
    return acc


def train(X_train, Y_train):
    """Fit the two-class Gaussian model with a shared covariance matrix.

    Rows labelled 1 form class 1 (>50K); every other row forms class 2.
    The feature dimension is taken from ``X_train`` instead of being fixed.

    Args:
        X_train: feature array, one sample per row.
        Y_train: labels, shape ``(n,)`` or ``(n, 1)``.

    Returns:
        ``(mu1, mu2, shared_sigma, N1, N2)``: the class means, the covariance
        matrix weighted by class frequency, and the two class counts.

    Raises:
        ValueError: if the row counts differ or one class has no samples
            (its mean and covariance would be undefined).
    """
    samples = np.asarray(X_train, dtype=float)
    labels = np.asarray(Y_train).reshape(-1)
    train_data_size = samples.shape[0]
    if labels.shape[0] != train_data_size:
        raise ValueError("X_train and Y_train must have the same number of rows")

    is_class1 = labels == 1
    cnt1 = int(is_class1.sum())
    cnt2 = train_data_size - cnt1
    if cnt1 == 0 or cnt2 == 0:
        raise ValueError("both classes need at least one training sample")

    class1 = samples[is_class1]
    class2 = samples[~is_class1]

    # Gaussian distribution parameters: per-class mean and (biased)
    # covariance, computed with array operations instead of per-row loops.
    mu1 = class1.mean(axis=0)
    mu2 = class2.mean(axis=0)
    dev1 = class1 - mu1
    dev2 = class2 - mu2
    sigma1 = np.dot(dev1.T, dev1) / cnt1
    sigma2 = np.dot(dev2.T, dev2) / cnt2

    shared_sigma = (
        (float(cnt1) / train_data_size) * sigma1
        + (float(cnt2) / train_data_size) * sigma2
    )

    return mu1, mu2, shared_sigma, cnt1, cnt2
if __name__ == "__main__":
    # Script entry: fit the generative model on train.csv, report the accuracy
    # on a 10% hold-out, then refit on all training rows, score against
    # correct_answer.csv and write the predictions to output/gd_output.csv.

    # The directory is a raw string because "\U" and "\N" in a normal literal
    # start unicode escapes, which made the original paths a SyntaxError.
    # HW2_DATA_DIR lets another machine point at its own copy of the data.
    data_dir = os.environ.get(
        "HW2_DATA_DIR",
        r"C:\Users\zjnorton\Downloads\NTU_ML2017_Hung-yi-Lee_HW-master\HW2\data",
    )
    trainData = pd.read_csv(os.path.join(data_dir, "train.csv"))
    testData = pd.read_csv(os.path.join(data_dir, "test.csv"))
    ans = pd.read_csv(os.path.join(data_dir, "correct_answer.csv"))

    # The training data has one more attribute than the test data
    # ('native_country_ Holand-Netherlands'). Keeping only the columns both
    # frames share, in training order, removes it and keeps the two feature
    # matrices aligned column by column.
    train_features = dataProcess_X(trainData)
    test_features = dataProcess_X(testData)
    shared_columns = [
        col for col in train_features.columns if col in test_features.columns
    ]
    x_train = train_features[shared_columns].values
    x_test = test_features[shared_columns].values
    y_train = dataProcess_Y(trainData).values
    y_ans = ans['label'].values

    # Hold out 10% of the training rows to estimate the accuracy.
    vaild_set_percetange = 0.1
    X_train, Y_train, X_valid, Y_valid = split_valid_set(x_train, y_train, vaild_set_percetange)
    mu1, mu2, shared_sigma, N1, N2 = train(X_train, Y_train)
    valid(X_valid, Y_valid, mu1, mu2, shared_sigma, N1, N2)

    # Refit on every training row before predicting the test set.
    mu1, mu2, shared_sigma, N1, N2 = train(x_train, y_train)
    sigma_inv = inv(shared_sigma)
    w = np.dot((mu1 - mu2), sigma_inv)
    b = (
        (-0.5) * np.dot(np.dot(mu1.T, sigma_inv), mu1)
        + (0.5) * np.dot(np.dot(mu2.T, sigma_inv), mu2)
        + np.log(float(N1) / N2)
    )
    y = sigmoid(np.dot(w, x_test.T) + b)
    # np.int no longer exists in NumPy; the builtin int is the replacement.
    y_ = np.around(y).astype(int)

    result = (np.squeeze(y_ans) == y_)
    print('Test acc = %f' % (float(result.sum()) / result.shape[0]))

    # Ids are 1-based and follow the number of predictions instead of a
    # hard-coded row count.
    df = pd.DataFrame({"id": np.arange(1, len(y_) + 1), "label": y_})
    os.makedirs(output_dir, exist_ok=True)
    df.to_csv(os.path.join(output_dir, 'gd_output.csv'), sep='\t', index=False)
# File: mnist_test.py
import torchvision
import torch
from torch import nn, optim
import numpy as np
import cv2
import time
from torch.utils.data import DataLoader
import os

# Convert an image to a tensor, then shift/scale it with mean 0.5 and std 0.5.
transform = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor(),
     torchvision.transforms.Normalize([0.5], [0.5])])


def get_files(path):
    """List the files in ``path`` and read each label from its file name.

    The label is the integer before the first ``.`` of the file name
    (``7.png`` -> 7).

    Args:
        path: directory to list.

    Returns:
        ``(paths, labels)``: file paths (``path`` joined with each name) and
        the integer labels, in ``os.listdir`` order.

    Raises:
        ValueError: if a file name does not start with an integer.
    """
    names = os.listdir(path)
    # os.path.join also works when ``path`` has no trailing separator.
    paths = [os.path.join(path, name) for name in names]
    labels = [int(name.split('.')[0]) for name in names]
    return paths, labels


class my_mnist(nn.Module):
    """Small CNN: two (5x5 conv, ReLU, 2x2 max-pool) stages and a linear layer.

    The linear layer expects 32*7*7 features, i.e. 1x28x28 inputs, and
    produces 10 scores per sample.
    """

    def __init__(self):
        super(my_mnist, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer3 = nn.Linear(32*7*7, 10)

    def forward(self, x):
        """Return the 10 class scores for a batch ``x`` of 1x28x28 images."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = x.view(x.size(0), -1)  # flatten everything but the batch axis
        out = self.layer3(x)
        return out


def _load_model(checkpoint_path):
    """Load a ``my_mnist`` model onto the CPU from a module or state_dict file."""
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # you trust. Recent PyTorch releases default to weights_only=True, which
    # rejects whole-module pickles - confirm against the installed version.
    # map_location keeps a GPU-saved checkpoint loadable without CUDA.
    loaded = torch.load(checkpoint_path, map_location='cpu')
    if isinstance(loaded, nn.Module):
        model = loaded
    else:
        # A state_dict, as written by torch.save(net.state_dict(), ...).
        model = my_mnist()
        model.load_state_dict(loaded)
    model.eval()
    return model


net = _load_model('pkl/499net.pth')
path = 'test/'


def threshold_filter(threshold, image):
    """Invert a grayscale image and binarize it in place.

    Each pixel becomes ``255 - pixel``; inverted values below ``threshold``
    are set to 0 and all others to 255.

    Args:
        threshold: cut-off applied to the inverted pixel values.
        image: 2-D numpy array of 0-255 pixel values; modified in place.

    Returns:
        The same ``image`` array.
    """
    inverted = 255 - image
    # One array operation instead of a per-pixel loop fixed to 28x28.
    image[...] = np.where(inverted < threshold, 0, 255)
    return image


paths, labels = get_files(path)
threshold = 99  # original note: 80-141 (presumably the usable threshold range)

with torch.no_grad():  # inference only, no gradients needed
    for image_path, label in zip(paths, labels):
        image = cv2.imread(image_path)  # read the picture
        if image is None:
            # imread signals an unreadable or non-image file by returning None.
            print('skip unreadable image: {}'.format(image_path))
            continue
        # imread delivers channels in BGR order, so convert from BGR.
        img_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # to grayscale
        image = cv2.resize(img_gray, (28, 28))  # resize to the network input
        image = threshold_filter(threshold, image)  # threshold filtering
        image = transform(image)  # to tensor, pixels mapped into [-1, 1]
        image = image.reshape(-1, 1, 28, 28)

        out = net(image)  # run the model
        pred = torch.max(out, 1)[1]  # index of the highest score
        print(pred.item(), label)
# File: mnist_train.py
import os
import time

import numpy as np
import torch
import torchvision
from torch import nn, optim
from torch.autograd import Variable
from torch.utils.data import DataLoader

# Convert an image to a tensor, then shift/scale it with mean 0.5 and std 0.5.
data_tf = torchvision.transforms.Compose(
    [torchvision.transforms.ToTensor(),
     torchvision.transforms.Normalize([0.5], [0.5])]
)
train_set = torchvision.datasets.MNIST('./data', train=True, transform=data_tf, download=True)
test_set = torchvision.datasets.MNIST('./data', train=False, transform=data_tf, download=True)

train_data = DataLoader(train_set, batch_size=64, shuffle=True)
test_data = DataLoader(test_set, batch_size=128, shuffle=False)

# Use the GPU when there is one instead of failing on CPU-only machines.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class my_mnist(nn.Module):
    """Small CNN: two (5x5 conv, ReLU, 2x2 max-pool) stages and a linear layer.

    The linear layer expects 32*7*7 features, i.e. 1x28x28 inputs, and
    produces 10 scores per sample.
    """

    def __init__(self):
        super(my_mnist, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 16, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, 5, 1, 2),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )
        self.layer3 = nn.Linear(32*7*7, 10)

    def forward(self, x):
        """Return the 10 class scores for a batch ``x`` of 1x28x28 images."""
        x = self.layer1(x)
        x = self.layer2(x)
        x = x.view(x.size(0), -1)  # flatten everything but the batch axis
        out = self.layer3(x)
        return out


net = my_mnist()
net.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), 1e-2)

# torch.save does not create missing directories.
os.makedirs('pkl', exist_ok=True)

for e in range(500):
    start = time.time()

    train_loss = 0
    train_acc = 0
    for x, y in train_data:
        x_input = x.to(device)
        y_target = y.to(device)

        output = net(x_input)
        loss = loss_func(output, y_target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # The loss is a batch mean; weighting by batch size makes the epoch
        # figure a true per-sample mean even with a short last batch.
        train_loss += loss.item() * y_target.size(0)

        pred = torch.max(output, 1)[1]
        num_correct = (pred == y_target).sum()
        train_acc += num_correct.item()

    # Normalize by the number of samples seen; "batches * 128" matched neither
    # the batch size of 64 nor the partial last batch.
    train_loss /= len(train_set)
    train_acc /= len(train_set)

    print('epoch: {}, Train Loss: {:.6f}, Train Acc: {:.6f}'.format(e, train_loss, train_acc))
    torch.save(net.state_dict(), 'pkl/mnist.pkl')
    print('Have costed {}'.format(time.time()-start))


def test(checkpoint_path='pth/499net.pth'):
    """Evaluate a saved model on the MNIST test set and print its accuracy.

    Args:
        checkpoint_path: file written by ``torch.save``; may hold either a
            whole module or a state_dict such as the ``pkl/mnist.pkl`` file
            saved by the training loop.

    Returns:
        The accuracy as a float (earlier versions returned ``None``).
    """
    # NOTE(review): torch.load unpickles the file, so only load checkpoints
    # you trust. Recent PyTorch releases default to weights_only=True, which
    # rejects whole-module pickles - confirm against the installed version.
    loaded = torch.load(checkpoint_path, map_location=device)
    if isinstance(loaded, nn.Module):
        model = loaded
    else:
        model = my_mnist()
        model.load_state_dict(loaded)
    model.to(device)
    model.eval()

    test_acc = 0
    with torch.no_grad():  # evaluation only, no gradients needed
        for img, label in test_data:
            test_img = img.to(device)
            label = label.to(device)

            test_out = model(test_img)
            pred = torch.max(test_out, 1)[1]
            num_correct = (pred == label).sum()
            test_acc += num_correct.item()

    # Divide by the real sample count; the last batch is usually not full.
    test_acc /= len(test_set)

    print('Test Acc: {:.6f}'.format(test_acc))
    return test_acc
a/test.rar b/test.rar new file mode 100644 index 0000000..41e0687 Binary files /dev/null and b/test.rar differ