Skip to content

Commit 5c9bcb0

Browse files
committed
Adding titanic example that uses all features
1 parent 68987d1 commit 5c9bcb0

File tree

2 files changed

+199
-0
lines changed

2 files changed

+199
-0
lines changed

titanic_all_features.py

+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
import random
2+
import pandas
3+
from sklearn.cross_validation import train_test_split
4+
from sklearn.linear_model import LogisticRegression
5+
from sklearn.metrics import accuracy_score
6+
from sklearn.preprocessing import LabelEncoder
7+
from sklearn.utils import check_array
8+
9+
import tensorflow as tf
10+
from tensorflow.contrib import layers
11+
from tensorflow.contrib import learn
12+
13+
14+
# Load the Titanic training data and separate the label column.
train = pandas.read_csv('data/titanic_train.csv')
y = train.pop('Survived')
# Keep only the columns used as model inputs; every other column is left out.
# List all variables for future reference.
categorical_vars = ['Pclass', 'Sex', 'Embarked']
continues_vars = ['Age', 'SibSp', 'Parch', 'Fare']
X = train[categorical_vars + continues_vars].fillna(0)
# fillna(0) can leave a string column holding a mix of str and int values,
# which LabelEncoder rejects on Python 3 (str and int are not orderable).
# Casting the categorical columns to str keeps each of them uniformly typed;
# a filled-in missing value simply becomes the category '0'.
X[categorical_vars] = X[categorical_vars].astype(str)
# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
21+
22+
23+
# Pandas input function.
24+
def pandas_input_fn(x, y=None, batch_size=128, num_epochs=None):
    """Build an `input_fn` that feeds a pandas DataFrame through a queue.

    Args:
      x: pandas DataFrame of feature columns. It is never modified.
      y: optional target values. When given, they are attached as a
        'target' column to a private copy of `x`.
      batch_size: number of rows requested from the queue per batch.
      num_epochs: passed through to `enqueue_data`. When `None`, rows are
        shuffled and full batches are dequeued; otherwise rows are not
        shuffled and the last batch may be smaller (`dequeue_up_to`).

    Returns:
      A zero-argument function. It returns `(features, target)` when `y`
      was given and `features` alone otherwise, where `features` is a dict
      keyed by 'index' and the DataFrame column names.
    """
    def input_fn():
        data = x
        if y is not None:
            # Copy before adding the target so the caller's DataFrame does
            # not silently gain a 'target' column.
            data = x.copy()
            data['target'] = y
        queue = learn.dataframe.queues.feeding_functions.enqueue_data(
            data, 1000, shuffle=num_epochs is None, num_epochs=num_epochs)
        if num_epochs is None:
            batch = queue.dequeue_many(batch_size)
        else:
            batch = queue.dequeue_up_to(batch_size)
        # Assumes the first dequeued tensor is the row index, followed by
        # one tensor per column in order -- TODO confirm against enqueue_data.
        features = dict(zip(['index'] + list(data.columns), batch))
        if y is not None:
            target = features.pop('target')
            return features, target
        return features
    return input_fn
40+
41+
42+
# Process categorical variables into ids.
43+
# Work on copies so the frames returned by the split are left untouched.
X_train, X_test = X_train.copy(), X_test.copy()
categorical_var_encoders = {}
for var in categorical_vars:
    ids_column = var + '_ids'
    # The encoder is fitted on the training rows only and reused for test.
    le = LabelEncoder()
    le.fit(X_train[var])
    # Replace the raw column with its integer ids in both frames.
    for frame in (X_train, X_test):
        frame[ids_column] = le.transform(frame.pop(var))
    categorical_var_encoders[var] = le
53+
54+
55+
CATEGORICAL_EMBED_SIZE = 10 # Note, you can customize this per variable.
56+
57+
58+
# 3 layer neural network with hyperbolic tangent activation.
59+
def dnn_tanh(features, target):
    """Model function: tanh network over continuous and embedded inputs.

    Args:
      features: dict of tensors keyed by the names in `continues_vars` and
        by `<name>_ids` for each name in `categorical_vars`.
      target: tensor of class ids; it is one-hot encoded to depth 2.

    Returns:
      A tuple `(predicted_class, loss, train_op)` where `predicted_class`
      is the argmax of the prediction along dimension 1.
    """
    one_hot_target = tf.one_hot(target, 2, 1.0, 0.0)

    # Continuous inputs: cast to float32 and add a trailing dimension.
    columns = []
    for name in continues_vars:
        as_float = tf.cast(features[name], tf.float32)
        columns.append(tf.expand_dims(as_float, 1))

    # Categorical inputs: embed each id column; the number of classes
    # comes from the fitted label encoder for that variable.
    for name in categorical_vars:
        ids = features[name + '_ids']
        n_classes = len(categorical_var_encoders[name].classes_)
        columns.append(learn.ops.categorical_variable(
            ids, n_classes,
            embedding_size=CATEGORICAL_EMBED_SIZE, name=name))

    # Join everything along dimension 1 into a single input.
    net_input = tf.concat(1, columns)

    # Three fully connected tanh layers of 10, 20 and 10 units.
    logits = layers.stack(
        net_input, layers.fully_connected, [10, 20, 10],
        activation_fn=tf.tanh)
    prediction, loss = learn.models.logistic_regression(logits, one_hot_target)

    global_step = tf.contrib.framework.get_global_step()
    train_op = layers.optimize_loss(
        loss, global_step, optimizer='SGD', learning_rate=0.05)

    predicted_class = tf.argmax(prediction, dimension=1)
    return predicted_class, loss, train_op
78+
79+
random.seed(42)
classifier = learn.Estimator(model_fn=dnn_tanh)

# Note: only 100 steps are run, so the model is barely trained at all.
train_input_fn = pandas_input_fn(X_train, y_train)
classifier.fit(input_fn=train_input_fn, steps=100)

# A single pass over the test rows; predictions are collected into a list.
test_input_fn = pandas_input_fn(X_test, num_epochs=1)
preds = list(classifier.predict(input_fn=test_input_fn, as_iterable=True))
print(accuracy_score(y_test, preds))

titanic_all_features_with_fc.py

+115
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
import random
2+
import pandas
3+
from sklearn.cross_validation import train_test_split
4+
from sklearn.linear_model import LogisticRegression
5+
from sklearn.metrics import accuracy_score
6+
from sklearn.preprocessing import LabelEncoder
7+
from sklearn.utils import check_array
8+
9+
import tensorflow as tf
10+
from tensorflow.contrib import layers
11+
from tensorflow.contrib import learn
12+
13+
14+
# Load the Titanic training data and separate the label column.
train = pandas.read_csv('data/titanic_train.csv')
y = train.pop('Survived')
# Keep only the columns used as model inputs; every other column is left out.
# List all variables for future reference.
categorical_vars = ['Pclass', 'Sex', 'Embarked']
continues_vars = ['Age', 'SibSp', 'Parch', 'Fare']
X = train[categorical_vars + continues_vars].fillna(0)
# fillna(0) can leave a string column holding a mix of str and int values,
# which LabelEncoder rejects on Python 3 (str and int are not orderable).
# Casting the categorical columns to str keeps each of them uniformly typed;
# a filled-in missing value simply becomes the category '0'.
X[categorical_vars] = X[categorical_vars].astype(str)
# Hold out 20% of the rows for evaluation, with a fixed seed for the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
21+
22+
23+
# Pandas input function.
24+
def pandas_input_fn(x, y=None, batch_size=128, num_epochs=None):
    """Build an `input_fn` that feeds a pandas DataFrame through a queue.

    Args:
      x: pandas DataFrame of feature columns. It is never modified.
      y: optional target values. When given, they are attached as a
        'target' column to a private copy of `x`.
      batch_size: number of rows requested from the queue per batch.
      num_epochs: passed through to `enqueue_data`. When `None`, rows are
        shuffled and full batches are dequeued; otherwise rows are not
        shuffled and the last batch may be smaller (`dequeue_up_to`).

    Returns:
      A zero-argument function. It returns `(features, target)` when `y`
      was given and `features` alone otherwise, where `features` is a dict
      keyed by 'index' and the DataFrame column names.
    """
    def input_fn():
        data = x
        if y is not None:
            # Copy before adding the target so the caller's DataFrame does
            # not silently gain a 'target' column.
            data = x.copy()
            data['target'] = y
        queue = learn.dataframe.queues.feeding_functions.enqueue_data(
            data, 1000, shuffle=num_epochs is None, num_epochs=num_epochs)
        if num_epochs is None:
            batch = queue.dequeue_many(batch_size)
        else:
            batch = queue.dequeue_up_to(batch_size)
        # Assumes the first dequeued tensor is the row index, followed by
        # one tensor per column in order -- TODO confirm against enqueue_data.
        features = dict(zip(['index'] + list(data.columns), batch))
        if y is not None:
            target = features.pop('target')
            return features, target
        return features
    return input_fn
40+
41+
42+
# Process categorical variables into ids.
43+
# Work on copies so the frames returned by the split are left untouched.
X_train, X_test = X_train.copy(), X_test.copy()
categorical_var_encoders = {}
for var in categorical_vars:
    ids_column = var + '_ids'
    # The encoder is fitted on the training rows only and reused for test.
    le = LabelEncoder()
    le.fit(X_train[var])
    # Replace the raw column with its integer ids in both frames.
    for frame in (X_train, X_test):
        frame[ids_column] = le.transform(frame.pop(var))
    categorical_var_encoders[var] = le
53+
54+
### Note: Feature Columns currently (2016/10/22) not working, update is coming.
# Setup feature columns.
CATEGORICAL_EMBED_SIZE = 10  # Note, you can customize this per variable.

# One real-valued column per continuous variable.
real_valued_columns = [
    layers.real_valued_column(var) for var in continues_vars
]
# One embedding column per categorical variable, built over its integer ids;
# the bucket count is the number of classes seen by the label encoder.
embedding_columns = [
    layers.embedding_column(
        layers.sparse_column_with_integerized_feature(
            var + '_ids', len(categorical_var_encoders[var].classes_)),
        CATEGORICAL_EMBED_SIZE)
    for var in categorical_vars
]
feature_columns = real_valued_columns + embedding_columns
66+
67+
68+
# Linear classifier.
# Disabled: the example below is wrapped in a string literal, so it never runs.
# It refers to `train_input_fn` and `test_input_fn`, which are not defined
# anywhere in this file.
'''
random.seed(42)
tflr = learn.LinearClassifier(n_classes=2,
feature_columns=feature_columns,
optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
tflr.fit(input_fn=train_input_fn, steps=500)
print(list(tflr.predict(input_fn=test_input_fn, as_iterable=True)), y_test)
print(accuracy_score(y_test, list(tflr.predict(input_fn=test_input_fn, as_iterable=True))))
'''

# 3 layer neural network with rectified linear activation.
# Disabled: also wrapped in a string literal, so it never runs. It passes the
# DataFrames straight to fit/predict rather than going through an input_fn.
'''
random.seed(42)
classifier = learn.DNNClassifier(hidden_units=[10, 20, 10],
n_classes=2,
feature_columns=feature_columns,
optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
classifier.fit(X_train, y_train, batch_size=128, steps=500)
print(accuracy_score(y_test, classifier.predict(X_test)))
'''
89+
90+
# 3 layer neural network with hyperbolic tangent activation.
91+
def dnn_tanh(features, target):
    """Model function: tanh network over continuous and embedded inputs.

    Args:
      features: dict of tensors keyed by the names in `continues_vars` and
        by `<name>_ids` for each name in `categorical_vars`.
      target: tensor of class ids; it is one-hot encoded to depth 2.

    Returns:
      A tuple `(predicted_class, loss, train_op)` where `predicted_class`
      is the argmax of the prediction along dimension 1.
    """
    one_hot_target = tf.one_hot(target, 2, 1.0, 0.0)

    # Continuous inputs: cast to float32 and add a trailing dimension.
    columns = []
    for name in continues_vars:
        as_float = tf.cast(features[name], tf.float32)
        columns.append(tf.expand_dims(as_float, 1))

    # Categorical inputs: embed each id column; the number of classes
    # comes from the fitted label encoder for that variable.
    for name in categorical_vars:
        ids = features[name + '_ids']
        n_classes = len(categorical_var_encoders[name].classes_)
        columns.append(learn.ops.categorical_variable(
            ids, n_classes,
            embedding_size=CATEGORICAL_EMBED_SIZE, name=name))

    # Join everything along dimension 1 into a single input.
    net_input = tf.concat(1, columns)

    # Three fully connected tanh layers of 10, 20 and 10 units.
    logits = layers.stack(
        net_input, layers.fully_connected, [10, 20, 10],
        activation_fn=tf.tanh)
    prediction, loss = learn.models.logistic_regression(logits, one_hot_target)

    global_step = tf.contrib.framework.get_global_step()
    train_op = layers.optimize_loss(
        loss, global_step, optimizer='SGD', learning_rate=0.05)

    predicted_class = tf.argmax(prediction, dimension=1)
    return predicted_class, loss, train_op
110+
111+
random.seed(42)
classifier = learn.Estimator(model_fn=dnn_tanh)

# Train for 100 steps on the training rows.
train_fn = pandas_input_fn(X_train, y_train)
classifier.fit(input_fn=train_fn, steps=100)

# A single pass over the test rows; predictions are collected into a list.
test_fn = pandas_input_fn(X_test, num_epochs=1)
preds = list(classifier.predict(input_fn=test_fn, as_iterable=True))
print(accuracy_score(y_test, preds))

0 commit comments

Comments
 (0)