datacamp
diff --git a/‎Python_Code__For_Decision_Trees
+176 b/‎Python_Code__For_Decision_Trees
+176
@@ -0,0 +1,176 @@
+import pandas as pd
+import numpy as np
+import sklearn as sk
+from sklearn import tree
+import matplotlib.pyplot as plt
+
+# Load the train and test datasets to create two DataFrames
+train_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
+train = pd.read_csv(train_url)
+
+test_url = "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
+test = pd.read_csv(test_url)
+
+
+#### converting variables and clean the data
+train.loc[train["Sex"] == "male", "Sex"] = 0
+train.loc[train["Sex"] == "female", "Sex"] = 1
+
+train["Embarked"] = train["Embarked"].fillna("S")
+
+train.loc[train["Embarked"] == "S", "Embarked"] = 0
+train.loc[train["Embarked"] == "C", "Embarked"] = 1
+train.loc[train["Embarked"] == "Q", "Embarked"] = 2
+
+train["Age"] = train["Age"].fillna(train["Age"].median())
+
+## building the first tree
+target = np.array(train.Survived).transpose()
+features_one = np.array([train.Pclass, train.Sex, train.Age,  train.Fare]).transpose()
+
+my_tree_one = tree.DecisionTreeClassifier()
+my_tree_one = my_tree_one.fit(features_one, target)
+
+#### second tree
+
+features_two = np.array([train.Pclass,train.Age,train.Sex, train.Fare, train.SibSp, train.Parch,train.Embarked]).transpose()
+
+my_tree_two = tree.DecisionTreeClassifier()
+my_tree_two = my_tree_two.fit(features_two, target)
+
+#### third tree
+# control overfitting
+my_tree_three = tree.DecisionTreeClassifier(max_depth = 10, min_samples_split = 5)
+my_tree_three = my_tree_three.fit(features_two, target)
+
+
+### evaluating the models
+from sklearn.metrics import confusion_matrix
+
+pred_vec_three = my_tree_three.predict(features_two)
+pred_vec_two = my_tree_two.predict(features_two)
+pred_vec_one = my_tree_one.predict(features_one)
+
+def pred_eval(pred_vec,target):
+    cm = confusion_matrix(pred_vec,target)
+    true_positive = cm[0][0]
+    true_negative = cm[1][1]
+    false_positive = cm[0][1]
+    false_negative = cm[1][0]
+    positive = true_positive + false_negative
+    negative = true_negative + false_positive
+    sensitivity = true_positive/positive #proportion of survivals correctly classified (want to maximize)
+    specificity = true_negative/negative #proportion of deaths correctly classified (want to maximize)
+    ppv = true_positive/(true_positive + false_positive)
+    npv = true_negative/(true_negative + false_negative)
+    fnr = false_negative/positive #accordingly minimize 1 - sensitivity
+    fpr = false_positive/negative #accordingly minimize 1 - specificity
+    
+    eval = np.array([cm,sensitivity,specificity,ppv,npv,fnr,fpr])
+    return(eval)
+
+my_tree_one.score(features_one, target)  # tree one scored on its own training rows; value is only echoed interactively
+my_tree_two.score(features_two, target)  # tree two scored on its own training rows
+my_tree_three.score(features_two, target)  # depth-limited tree three scored on the same training rows
+
+#### Graphing the Tree
+
+
+#from sklearn.externals.six import StringIO 
+#import pydot
+#dot_data = StringIO() 
+#tree.export_graphviz(my_tree_one, out_file = dot_data)
+#graph = pydot.graph_from_dot_data(dot_data.getvalue())
+#graph.write_pdf("tree.pdf")
+
+#from sklearn.externals.six import StringIO
+#with open("tree.dot", 'w') as f:
+#    f = tree.export_graphviz(my_tree_two, out_file=f)
+
+#from IPython.display import Image
+#dot_data = StringIO()
+#tree.export_graphviz(my_tree_two, out_file=dot_data,  filled=True, rounded=True,  special_characters=True)
+#graph = pydot.graph_from_dot_data(dot_data.getvalue())
+
+
+
+#### Useful Attributes
+my_tree_one.feature_importances_
+my_tree_one.tree_
+my_tree_one.n_classes_
+my_tree_one.n_features_
+my_tree_one.classes_
+
+
+
+####  Clean the test data.
+test.loc[test["Sex"] == "male", "Sex"] = 0
+test.loc[test["Sex"] == "female", "Sex"] = 1
+
+test["Embarked"] = test["Embarked"].fillna("S")
+
+test.loc[test["Embarked"] == "S", "Embarked"] = 0
+test.loc[test["Embarked"] == "C", "Embarked"] = 1
+test.loc[test["Embarked"] == "Q", "Embarked"] = 2
+
+test["Age"] = test["Age"].fillna(test["Age"].median())
+
+test.Fare[152] = test.Fare.median()
+
+
+#### Prediction
+
+test_features_one = np.array([test.Pclass, test.Fare, test.SibSp, test.Parch]).transpose()
+pred_one = my_tree_one.predict(test_features_one)
+
+
+test_features_two = np.array([test.Pclass,test.Age,test.Sex, test.Fare, test.SibSp, test.Parch,test.Embarked]).transpose()
+pred_two = my_tree_two.predict(test_features_two)
+
+pred_three = my_tree_three.predict(test_features_two)
+
+
+#### Feature Engineering
+
+
+#### https://plot.ly/matplotlib/bar-charts/
+
+y1 = cm1[1:5] 
+y2 = cm2[1:5]
+y3 = cm3[1:5]
+N = len(y1)
+x = range(N)
+plt.bar(x, y2, color="red")
+plt.bar(x, y3, color="green")
+plt.bar(x, y1, color="blue")
+
+g1 = cm1[5:7] 
+g2 = cm2[5:7]
+g3 = cm3[5:7]
+M = len(g1)
+h = range(M)
+plt.bar(h, g1, color="blue")
+plt.bar(h, g3, color="green")
+plt.bar(h, g2, color="red")
+
+
+#### Building a Random Forest
+
+from sklearn import cross_validation
+from sklearn.ensemble import RandomForestClassifier
+
+features_forest = np.array([train.Pclass,train.Age,train.Sex, train.Fare, train.SibSp, train.Parch,train.Embarked]).transpose()
+
+forest = RandomForestClassifier(max_depth = 10, n_estimators=100, min_samples_split=2)
+my_forest = forest.fit(features_forest, target)
+my_forest.score(features_forest, target)
+
+#Evaluate the forest
+pred_vec_forest = my_forest.predict(features_forest)
+pred_eval(pred_vec_forest,target)
+
+#predict using the forest
+pred_forest = my_forest.predict(test_features_two)
+
+
+