|
| 1 | +import pandas as pd |
| 2 | +import numpy as np |
| 3 | +import sklearn as sk |
| 4 | +from sklearn import tree |
| 5 | +import matplotlib.pyplot as plt |
| 6 | + |
# Read the train and test CSV files into two DataFrames.
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"

# The generator is consumed left to right: train is downloaded first, then test.
train, test = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))
| 13 | + |
| 14 | + |
#### Encode the categorical columns as integers and fill gaps in the training data
# Sex: male -> 0, female -> 1
for sex_label, sex_code in (("male", 0), ("female", 1)):
    train.loc[train["Sex"] == sex_label, "Sex"] = sex_code

# Rows with no port of embarkation are treated as "S" before encoding.
train["Embarked"] = train["Embarked"].fillna("S")

# Embarked: S -> 0, C -> 1, Q -> 2
for port_label, port_code in (("S", 0), ("C", 1), ("Q", 2)):
    train.loc[train["Embarked"] == port_label, "Embarked"] = port_code

# Missing ages take the median of the ages that are present.
train["Age"] = train["Age"].fillna(train["Age"].median())
| 26 | + |
## First tree: passenger class, sex, age and fare
target = np.array(train.Survived)
features_one = np.column_stack((train.Pclass, train.Sex, train.Age, train.Fare))
my_tree_one = tree.DecisionTreeClassifier().fit(features_one, target)

#### Second tree: adds SibSp, Parch and Embarked to the inputs
features_two = np.column_stack((
    train.Pclass,
    train.Age,
    train.Sex,
    train.Fare,
    train.SibSp,
    train.Parch,
    train.Embarked,
))
my_tree_two = tree.DecisionTreeClassifier().fit(features_two, target)

#### Third tree: same inputs as the second tree, growth limited to control overfitting
my_tree_three = tree.DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
).fit(features_two, target)
| 45 | + |
| 46 | + |
### Evaluating the models
from sklearn.metrics import confusion_matrix

# Predictions of each fitted tree on the rows it was trained on.
pred_vec_one, pred_vec_two, pred_vec_three = (
    fitted_tree.predict(training_rows)
    for fitted_tree, training_rows in (
        (my_tree_one, features_one),
        (my_tree_two, features_two),
        (my_tree_three, features_two),
    )
)
| 53 | + |
def pred_eval(pred_vec, target):
    """Summarise how well binary predictions match the true labels.

    Label 1 (survived) is the positive class and label 0 (died) is the
    negative class.

    Parameters
    ----------
    pred_vec : array-like of 0/1
        Predicted labels.
    target : array-like of 0/1
        True labels, same length as ``pred_vec``.

    Returns
    -------
    numpy.ndarray
        Object array of length 7 holding
        ``[cm, sensitivity, specificity, ppv, npv, fnr, fpr]``.
        ``cm`` is the 2x2 count matrix with rows indexed by the predicted
        label and columns by the true label.  A ratio whose denominator is
        zero is reported as NaN.
    """

    def _ratio(numerator, denominator):
        # float() forces true division on every Python version; an empty
        # denominator (class absent) gives NaN instead of a warning or error.
        return float(numerator) / float(denominator) if denominator else float("nan")

    # labels=[0, 1] pins the matrix to 2x2 even if one class never occurs.
    # The predictions are passed first, so cm[p][t] counts rows predicted as
    # p whose true label is t.
    cm = confusion_matrix(pred_vec, target, labels=[0, 1])
    true_negative = cm[0][0]   # predicted died, actually died
    true_positive = cm[1][1]   # predicted survived, actually survived
    false_positive = cm[1][0]  # predicted survived, actually died
    false_negative = cm[0][1]  # predicted died, actually survived

    positive = true_positive + false_negative  # all actual survivals
    negative = true_negative + false_positive  # all actual deaths

    sensitivity = _ratio(true_positive, positive)  # survivals correctly classified (maximize)
    specificity = _ratio(true_negative, negative)  # deaths correctly classified (maximize)
    ppv = _ratio(true_positive, true_positive + false_positive)
    npv = _ratio(true_negative, true_negative + false_negative)
    fnr = _ratio(false_negative, positive)  # 1 - sensitivity (minimize)
    fpr = _ratio(false_positive, negative)  # 1 - specificity (minimize)

    # The matrix and the scalars have different shapes, so they are stored in
    # an explicit object array; slices such as result[1:5] keep working.
    summary = np.empty(7, dtype=object)
    summary[0] = cm
    summary[1:] = [sensitivity, specificity, ppv, npv, fnr, fpr]
    return summary
| 71 | + |
# Mean accuracy of each tree on the rows it was fitted on.
for _fitted_tree, _training_rows in (
    (my_tree_one, features_one),
    (my_tree_two, features_two),
    (my_tree_three, features_two),
):
    _fitted_tree.score(_training_rows, target)
| 75 | + |
| 76 | +#### Graphing the Tree |
| 77 | + |
| 78 | + |
| 79 | +#from sklearn.externals.six import StringIO |
| 80 | +#import pydot |
| 81 | +#dot_data = StringIO() |
| 82 | +#tree.export_graphviz(my_tree_one, out_file = dot_data) |
| 83 | +#graph = pydot.graph_from_dot_data(dot_data.getvalue()) |
| 84 | +#graph.write_pdf("tree.pdf") |
| 85 | + |
| 86 | +#from sklearn.externals.six import StringIO |
| 87 | +#with open("tree.dot", 'w') as f: |
| 88 | +# f = tree.export_graphviz(my_tree_two, out_file=f) |
| 89 | + |
| 90 | +#from IPython.display import Image |
| 91 | +#dot_data = StringIO() |
| 92 | +#tree.export_graphviz(my_tree_two, out_file=dot_data, filled=True, rounded=True, special_characters=True) |
| 93 | +#graph = pydot.graph_from_dot_data(dot_data.getvalue()) |
| 94 | + |
| 95 | + |
| 96 | + |
#### Useful Attributes
# Bare expressions: they only display a value when run interactively.
my_tree_one.feature_importances_
my_tree_one.tree_
my_tree_one.n_classes_
# Newer scikit-learn exposes the input-column count as `n_features_in_` and
# has dropped `n_features_`; fall back to the old name on older releases.
(
    my_tree_one.n_features_in_
    if hasattr(my_tree_one, "n_features_in_")
    else my_tree_one.n_features_
)
my_tree_one.classes_
| 103 | + |
| 104 | + |
| 105 | + |
#### Clean the test data.
# Sex: male -> 0, female -> 1
test.loc[test["Sex"] == "male", "Sex"] = 0
test.loc[test["Sex"] == "female", "Sex"] = 1

# Rows with no port of embarkation are treated as "S" before encoding.
test["Embarked"] = test["Embarked"].fillna("S")

# Embarked: S -> 0, C -> 1, Q -> 2
test.loc[test["Embarked"] == "S", "Embarked"] = 0
test.loc[test["Embarked"] == "C", "Embarked"] = 1
test.loc[test["Embarked"] == "Q", "Embarked"] = 2

# Missing ages take the median of the ages that are present.
test["Age"] = test["Age"].fillna(test["Age"].median())

# Fill every missing fare with the median fare. Assigning the whole column
# avoids chained assignment (`test.Fare[152] = ...`), which may write to a
# temporary copy, and does not depend on which row happens to be missing.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
| 119 | + |
| 120 | + |
#### Prediction

# The first tree was fitted on Pclass, Sex, Age, Fare (in that order), so the
# test matrix must carry the same columns in the same order.
test_features_one = np.array([test.Pclass, test.Sex, test.Age, test.Fare]).transpose()
pred_one = my_tree_one.predict(test_features_one)

# The second and third trees share one column layout:
# Pclass, Age, Sex, Fare, SibSp, Parch, Embarked.
test_features_two = np.array([
    test.Pclass,
    test.Age,
    test.Sex,
    test.Fare,
    test.SibSp,
    test.Parch,
    test.Embarked,
]).transpose()
pred_two = my_tree_two.predict(test_features_two)

pred_three = my_tree_three.predict(test_features_two)
| 131 | + |
| 132 | + |
| 133 | +#### Feature Engineering |
| 134 | + |
| 135 | + |
#### https://plot.ly/matplotlib/bar-charts/

def _grouped_bars(series, tick_labels, width=0.25):
    """Draw one new figure with each (values, colour) series side by side."""
    plt.figure()
    slots = np.arange(len(tick_labels))
    for rank, (values, colour) in enumerate(series):
        # Shift each series by one bar width so that no bar hides another.
        plt.bar(slots + rank * width, np.asarray(values, dtype=float),
                width=width, color=colour)
    # Put each tick under the middle bar of its group.
    plt.xticks(slots + width * (len(series) - 1) / 2.0, tick_labels)


# Evaluation summaries of the three trees on the training data
# (layout: [cm, sensitivity, specificity, ppv, npv, fnr, fpr]).
cm1 = pred_eval(pred_vec_one, target)
cm2 = pred_eval(pred_vec_two, target)
cm3 = pred_eval(pred_vec_three, target)

# Chart 1: sensitivity, specificity, ppv, npv (blue = tree one, red = tree
# two, green = tree three).
y1 = cm1[1:5]
y2 = cm2[1:5]
y3 = cm3[1:5]
N = len(y1)
x = range(N)
_grouped_bars(
    [(y1, "blue"), (y2, "red"), (y3, "green")],
    ["sensitivity", "specificity", "ppv", "npv"],
)

# Chart 2: false negative rate and false positive rate, same colours.
g1 = cm1[5:7]
g2 = cm2[5:7]
g3 = cm3[5:7]
M = len(g1)
h = range(M)
_grouped_bars(
    [(g1, "blue"), (g2, "red"), (g3, "green")],
    ["fnr", "fpr"],
)
| 155 | + |
| 156 | + |
#### Building a Random Forest

try:
    from sklearn import cross_validation  # scikit-learn < 0.20
except ImportError:
    # The module was renamed to `model_selection` and the old name was later
    # removed; keep the old name bound so the script still imports.
    from sklearn import model_selection as cross_validation
from sklearn.ensemble import RandomForestClassifier

# Same seven columns, in the same order, as the second and third trees.
features_forest = np.array([
    train.Pclass,
    train.Age,
    train.Sex,
    train.Fare,
    train.SibSp,
    train.Parch,
    train.Embarked,
]).transpose()

forest = RandomForestClassifier(max_depth=10, n_estimators=100, min_samples_split=2)
my_forest = forest.fit(features_forest, target)
# Mean accuracy on the training rows (only displayed when run interactively).
my_forest.score(features_forest, target)

# Evaluate the forest on the training rows
pred_vec_forest = my_forest.predict(features_forest)
pred_eval(pred_vec_forest, target)

# Predict the test set; test_features_two has the same column layout as
# features_forest.
pred_forest = my_forest.predict(test_features_two)
| 174 | + |
| 175 | + |
| 176 | + |
0 commit comments