Skip to content

Commit 8845f9f

Browse files
committed
Added files via upload
1 parent 6c381fa commit 8845f9f

File tree

4 files changed

+1513
-0
lines changed

4 files changed

+1513
-0
lines changed

Python_Code__For_Decision_Trees

+176
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
import pandas as pd
2+
import numpy as np
3+
import sklearn as sk
4+
from sklearn import tree
5+
import matplotlib.pyplot as plt
6+
7+
# Read the training and test CSV files into the DataFrames `train` and `test`.
train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"

# A generator expression keeps the loop variable out of the module namespace.
train, test = (pd.read_csv(csv_url) for csv_url in (train_url, test_url))
13+
14+
15+
#### Convert the categorical variables to integer codes and fill missing values
# The codes are applied through lookup tables instead of masked ``.loc``
# writes, so no integer is ever written into a string-typed column and a fully
# encoded column comes back with a numeric dtype. Values that are not in a
# table (including already-encoded ones) pass through unchanged, which keeps
# this section safe to re-run.
sex_codes = {"male": 0, "female": 1}
embarked_codes = {"S": 0, "C": 1, "Q": 2}

train["Sex"] = train["Sex"].map(lambda value: sex_codes.get(value, value))

# Missing embarkation values default to "S" before encoding.
train["Embarked"] = train["Embarked"].fillna("S")
train["Embarked"] = train["Embarked"].map(lambda value: embarked_codes.get(value, value))

# Missing ages are replaced with the median age of the training data.
train["Age"] = train["Age"].fillna(train["Age"].median())
26+
27+
## Tree one: Pclass, Sex, Age and Fare as predictors of Survived
target = train["Survived"].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values

# fit() returns the fitted estimator, so construction and fitting are chained.
my_tree_one = tree.DecisionTreeClassifier().fit(features_one, target)

#### Tree two: adds SibSp, Parch and Embarked to the predictors
features_two = train[["Pclass", "Age", "Sex", "Fare", "SibSp", "Parch", "Embarked"]].values

my_tree_two = tree.DecisionTreeClassifier().fit(features_two, target)

#### Tree three: same predictors as tree two
# Depth and minimum split size are limited to control overfitting.
my_tree_three = tree.DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=5,
).fit(features_two, target)
45+
46+
47+
### Evaluate the models: predictions on the data each tree was fitted on
from sklearn.metrics import confusion_matrix

# Each model is paired with the feature matrix it was trained with.
pred_vec_one, pred_vec_two, pred_vec_three = (
    fitted_tree.predict(feature_matrix)
    for fitted_tree, feature_matrix in (
        (my_tree_one, features_one),
        (my_tree_two, features_two),
        (my_tree_three, features_two),
    )
)
53+
54+
def pred_eval(pred_vec, target):
    """Summarise how well binary predictions match the true labels.

    Class 1 is the positive class and class 0 the negative class (in this
    script the target is the ``Survived`` column, so presumably 1 = survived).

    Parameters
    ----------
    pred_vec : array-like of 0/1
        Predicted class labels.
    target : array-like of 0/1
        True class labels, same length as ``pred_vec``.

    Returns
    -------
    numpy.ndarray
        Object array of length 7:
        ``[cm, sensitivity, specificity, ppv, npv, fnr, fpr]``.
        ``cm`` is the 2x2 count matrix with rows indexed by the predicted
        class and columns by the true class. A rate whose denominator is
        zero is reported as ``nan``.
    """

    def ratio(numerator, denominator):
        # Every numerator here is part of its denominator, so an empty
        # denominator means 0/0; report it as nan instead of raising.
        if denominator == 0:
            return float("nan")
        return float(numerator) / float(denominator)

    # Argument order is kept from the original call so the returned matrix has
    # the same orientation (rows = predicted, columns = actual). ``labels``
    # forces a 2x2 result even when only one class occurs in the data.
    cm = confusion_matrix(pred_vec, target, labels=[0, 1])

    true_negative = cm[0][0]   # predicted 0, actually 0
    false_negative = cm[0][1]  # predicted 0, actually 1
    false_positive = cm[1][0]  # predicted 1, actually 0
    true_positive = cm[1][1]   # predicted 1, actually 1

    positive = true_positive + false_negative  # all actual positives
    negative = true_negative + false_positive  # all actual negatives

    sensitivity = ratio(true_positive, positive)  # share of positives correctly classified (maximise)
    specificity = ratio(true_negative, negative)  # share of negatives correctly classified (maximise)
    ppv = ratio(true_positive, true_positive + false_positive)
    npv = ratio(true_negative, true_negative + false_negative)
    fnr = ratio(false_negative, positive)  # equals 1 - sensitivity (minimise)
    fpr = ratio(false_positive, negative)  # equals 1 - specificity (minimise)

    # A matrix plus six scalars is a ragged sequence, which np.array() refuses
    # to build implicitly on current NumPy; fill an object array explicitly.
    summary = np.empty(7, dtype=object)
    summary[0] = cm
    summary[1:] = [sensitivity, specificity, ppv, npv, fnr, fpr]
    return summary
71+
72+
# Accuracy of each tree on the data it was fitted on. The values are printed
# because a bare expression only shows its result in an interactive session;
# when the file runs as a script the scores would otherwise be discarded.
print("Tree one training accuracy: %.4f" % my_tree_one.score(features_one, target))
print("Tree two training accuracy: %.4f" % my_tree_two.score(features_two, target))
print("Tree three training accuracy: %.4f" % my_tree_three.score(features_two, target))
75+
76+
#### Graphing the Tree
77+
78+
79+
#from sklearn.externals.six import StringIO
80+
#import pydot
81+
#dot_data = StringIO()
82+
#tree.export_graphviz(my_tree_one, out_file = dot_data)
83+
#graph = pydot.graph_from_dot_data(dot_data.getvalue())
84+
#graph.write_pdf("tree.pdf")
85+
86+
#from sklearn.externals.six import StringIO
87+
#with open("tree.dot", 'w') as f:
88+
# f = tree.export_graphviz(my_tree_two, out_file=f)
89+
90+
#from IPython.display import Image
91+
#dot_data = StringIO()
92+
#tree.export_graphviz(my_tree_two, out_file=dot_data, filled=True, rounded=True, special_characters=True)
93+
#graph = pydot.graph_from_dot_data(dot_data.getvalue())
94+
95+
96+
97+
#### Useful attributes of the fitted tree
# These bare expressions only display a value in an interactive session.
my_tree_one.feature_importances_
my_tree_one.tree_
my_tree_one.n_classes_
# Newer scikit-learn releases expose the feature count as ``n_features_in_``
# and no longer provide ``n_features_``; fall back to the old name only when
# the new one is missing.
if hasattr(my_tree_one, "n_features_in_"):
    my_tree_one.n_features_in_
else:
    my_tree_one.n_features_
my_tree_one.classes_
103+
104+
105+
106+
#### Clean the test data with the same encoding as the training data
# Lookup tables replace the masked ``.loc`` writes so that no integer is
# written into a string-typed column; values missing from a table pass
# through unchanged, which keeps this section safe to re-run.
sex_codes = {"male": 0, "female": 1}
embarked_codes = {"S": 0, "C": 1, "Q": 2}

test["Sex"] = test["Sex"].map(lambda value: sex_codes.get(value, value))

# Missing embarkation values default to "S" before encoding.
test["Embarked"] = test["Embarked"].fillna("S")
test["Embarked"] = test["Embarked"].map(lambda value: embarked_codes.get(value, value))

# Missing ages are replaced with the median age of the test data.
test["Age"] = test["Age"].fillna(test["Age"].median())

# Fill every missing fare with the median fare. The original wrote to
# ``test.Fare[152]`` (presumably the one row with a missing fare); that is a
# chained assignment which may modify a temporary copy instead of ``test``
# and depends on a hard-coded row position.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
119+
120+
121+
#### Prediction
# Each test matrix must list the same columns, in the same order, as the
# matrix its model was fitted on; otherwise the tree's split thresholds are
# applied to the wrong variables without any error being raised.

# my_tree_one was fitted on [Pclass, Sex, Age, Fare].
test_features_one = np.array([test.Pclass, test.Sex, test.Age, test.Fare]).transpose()
pred_one = my_tree_one.predict(test_features_one)

# my_tree_two and my_tree_three were fitted on
# [Pclass, Age, Sex, Fare, SibSp, Parch, Embarked].
test_features_two = np.array([test.Pclass, test.Age, test.Sex, test.Fare, test.SibSp, test.Parch, test.Embarked]).transpose()
pred_two = my_tree_two.predict(test_features_two)

pred_three = my_tree_three.predict(test_features_two)
131+
132+
133+
#### Feature Engineering
134+
135+
136+
#### https://plot.ly/matplotlib/bar-charts/
137+
138+
# Evaluation summaries of the three trees on their training data. pred_eval
# returns [confusion matrix, sensitivity, specificity, ppv, npv, fnr, fpr];
# these names were previously used below without ever being assigned.
cm1 = pred_eval(pred_vec_one, target)
cm2 = pred_eval(pred_vec_two, target)
cm3 = pred_eval(pred_vec_three, target)

# Width of one bar; three bars side by side form one group per metric, so the
# series no longer hide each other by being drawn at the same position.
bar_width = 0.25

# Chart 1: entries 1-4 (sensitivity, specificity, ppv, npv).
# The slices are converted to float because pred_eval returns an object array.
y1 = np.asarray(cm1[1:5], dtype=float)
y2 = np.asarray(cm2[1:5], dtype=float)
y3 = np.asarray(cm3[1:5], dtype=float)
N = len(y1)
x = range(N)
group_centres = np.arange(N)
plt.figure()  # separate figure so the two charts do not share one axes
plt.bar(group_centres - bar_width, y1, width=bar_width, color="blue", label="tree one")
plt.bar(group_centres, y2, width=bar_width, color="red", label="tree two")
plt.bar(group_centres + bar_width, y3, width=bar_width, color="green", label="tree three")
plt.xticks(group_centres, ["sensitivity", "specificity", "ppv", "npv"])
plt.legend()

# Chart 2: entries 5-6 (fnr, fpr).
g1 = np.asarray(cm1[5:7], dtype=float)
g2 = np.asarray(cm2[5:7], dtype=float)
g3 = np.asarray(cm3[5:7], dtype=float)
M = len(g1)
h = range(M)
rate_centres = np.arange(M)
plt.figure()
plt.bar(rate_centres - bar_width, g1, width=bar_width, color="blue", label="tree one")
plt.bar(rate_centres, g2, width=bar_width, color="red", label="tree two")
plt.bar(rate_centres + bar_width, g3, width=bar_width, color="green", label="tree three")
plt.xticks(rate_centres, ["fnr", "fpr"])
plt.legend()
155+
156+
157+
#### Building a Random Forest

# ``sklearn.cross_validation`` was replaced by ``sklearn.model_selection`` and
# later removed from scikit-learn. Import the current module under the old
# name and fall back to the old module only on installations that predate it.
try:
    from sklearn import model_selection as cross_validation
except ImportError:
    from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

# Same columns, in the same order, as the feature matrix of trees two and three.
features_forest = np.array([train.Pclass, train.Age, train.Sex, train.Fare, train.SibSp, train.Parch, train.Embarked]).transpose()

forest = RandomForestClassifier(max_depth=10, n_estimators=100, min_samples_split=2)
my_forest = forest.fit(features_forest, target)
my_forest.score(features_forest, target)

# Evaluate the forest on its training data; the summary is kept in a variable
# so it is not thrown away when the file runs as a script.
pred_vec_forest = my_forest.predict(features_forest)
forest_eval = pred_eval(pred_vec_forest, target)

# Predict on the test set. test_features_two has the same column order as
# features_forest.
pred_forest = my_forest.predict(test_features_two)
174+
175+
176+

0 commit comments

Comments
 (0)