import random

import pandas
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.contrib import layers
from tensorflow.contrib import learn


train = pandas.read_csv('data/titanic_train.csv')
y = train.pop('Survived')
# Drop ID-like columns with (nearly) unique values; list the modeling variables explicitly.
categorical_vars = ['Pclass', 'Sex', 'Embarked']
continuous_vars = ['Age', 'SibSp', 'Parch', 'Fare']
X = train[categorical_vars + continuous_vars].fillna(0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Pandas input function: feeds DataFrame rows to the graph through a queue.
def pandas_input_fn(x, y=None, batch_size=128, num_epochs=None):
    def input_fn():
        if y is not None:
            x['target'] = y
        queue = learn.dataframe.queues.feeding_functions.enqueue_data(
            x, 1000, shuffle=num_epochs is None, num_epochs=num_epochs)
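        # num_epochs=None means "cycle indefinitely" (training), so every batch
        # can be a full dequeue_many; with a fixed epoch count (eval/predict)
        # the final batch may be short, hence dequeue_up_to.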
        if num_epochs is None:
            features = queue.dequeue_many(batch_size)
        else:
            features = queue.dequeue_up_to(batch_size)
        features = dict(zip(['index'] + list(x.columns), features))
        if y is not None:
            target = features.pop('target')
            return features, target
        return features
    return input_fn


# Process categorical variables into ids.
X_train = X_train.copy()  # Copy so adding id columns doesn't mutate slices of the original frame.
X_test = X_test.copy()
categorical_var_encoders = {}
for var in categorical_vars:
    le = LabelEncoder().fit(X_train[var])
    X_train[var + '_ids'] = le.transform(X_train[var])
    X_test[var + '_ids'] = le.transform(X_test[var])
    X_train.pop(var)
    X_test.pop(var)
    categorical_var_encoders[var] = le
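# e.g. for 'Sex' the fitted encoder maps {'female': 0, 'male': 1}. Note that
# transform() raises on labels unseen during fit, so this assumes the test
# split introduces no new levels.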

### Note: Feature Columns currently (2016/10/22) not working, update is coming.
# Set up feature columns.
CATEGORICAL_EMBED_SIZE = 10  # Note: this can be customized per variable.
feature_columns = [
    layers.real_valued_column(var) for var in continuous_vars
] + [
    layers.embedding_column(
        layers.sparse_column_with_integerized_feature(
            var + '_ids', len(categorical_var_encoders[var].classes_)),
        CATEGORICAL_EMBED_SIZE)
    for var in categorical_vars
]
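# Each id column becomes a learned 10-dimensional dense vector. One common
# heuristic picks an embedding size near cardinality ** 0.25; a fixed size is
# used here for simplicity.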


# Linear classifier.
'''
random.seed(42)
tflr = learn.LinearClassifier(n_classes=2,
                              feature_columns=feature_columns,
                              optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
tflr.fit(input_fn=pandas_input_fn(X_train, y_train), steps=500)
preds = list(tflr.predict(input_fn=pandas_input_fn(X_test, num_epochs=1), as_iterable=True))
print(accuracy_score(y_test, preds))
'''

# 3 layer neural network with rectified linear activation.
'''
random.seed(42)
classifier = learn.DNNClassifier(hidden_units=[10, 20, 10],
                                 n_classes=2,
                                 feature_columns=feature_columns,
                                 optimizer=tf.train.GradientDescentOptimizer(learning_rate=0.05))
classifier.fit(X_train, y_train, batch_size=128, steps=500)
print(accuracy_score(y_test, classifier.predict(X_test)))
'''

# 3 layer neural network with hyperbolic tangent activation.
def dnn_tanh(features, target):
    target = tf.one_hot(target, 2, 1.0, 0.0)
    # Organize continuous features into [batch_size, 1] float columns.
    final_features = [tf.expand_dims(tf.cast(features[var], tf.float32), 1)
                      for var in continuous_vars]
    # Embed categorical variables into distributed representation.
    for var in categorical_vars:
        feature = learn.ops.categorical_variable(
            features[var + '_ids'], len(categorical_var_encoders[var].classes_),
            embedding_size=CATEGORICAL_EMBED_SIZE, name=var)
        final_features.append(feature)
    # Concatenate all features into one vector.
    features = tf.concat(1, final_features)
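    # Note: tf.concat(axis, values) is the pre-1.0 argument order; TF >= 1.0
    # uses tf.concat(values, axis).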
    # Deep neural network: stacked fully connected layers of sizes 10, 20, 10.
    hidden = layers.stack(features, layers.fully_connected, [10, 20, 10],
                          activation_fn=tf.tanh)
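    # logistic_regression adds one more linear layer plus a softmax
    # cross-entropy loss on top of the stacked hidden layers.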
    prediction, loss = learn.models.logistic_regression(hidden, target)
    train_op = layers.optimize_loss(
        loss, tf.contrib.framework.get_global_step(),
        optimizer='SGD', learning_rate=0.05)
    return tf.argmax(prediction, dimension=1), loss, train_op

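# In this TF release, contrib.learn's Estimator accepts a bare model_fn that
# returns a (predictions, loss, train_op) tuple.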
# Note: random.seed only seeds Python's RNG; fully reproducible weight
# initialization would also need tf.set_random_seed.
random.seed(42)
classifier = learn.Estimator(model_fn=dnn_tanh)
classifier.fit(input_fn=pandas_input_fn(X_train, y_train), steps=100)
preds = list(classifier.predict(input_fn=pandas_input_fn(X_test, num_epochs=1),
                                as_iterable=True))
print(accuracy_score(y_test, preds))