Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
527 changes: 527 additions & 0 deletions data_exporation.ipynb

Large diffs are not rendered by default.

300 changes: 300 additions & 0 deletions model_iteration_1.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,300 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Import basic modules we will use and load in the train and test set"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import os\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import sklearn\n",
"# import statsmodels.api as sm\n",
"\n",
"%matplotlib inline\n",
"\n",
"cur_dir = os.path.dirname('__file__')\n",
"\n",
"train = pd.read_csv(os.path.join(cur_dir, \"train.csv\"))\n",
"test = pd.read_csv(os.path.join(cur_dir, \"test.csv\"))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Preprocess the data. In this case, we fill all `NaN` values in the `Sex` and `Age` columns. We also create dummy variables out of the categorical columns `Pclass` and `Embarked`"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"target = \"Survived\"\n",
"\n",
"def preprocess(df):\n",
"\n",
" df.Sex = (df.Sex == \"male\").astype(int)\n",
" df.Age = df.Age.fillna(df.Age.mean())\n",
" df.Embarked = df.Embarked.fillna(df.Embarked.mode())\n",
" df[\"oneSibSp\"] = (df.SibSp == 1).astype(int)\n",
" df[\"hasParch\"] = (df.Parch > 0).astype(int)\n",
" df[\"maleAge\"] = df.Sex * df.Age\n",
" df[\"is20\"] = df.Age >= 20\n",
"\n",
" dummy_class = pd.get_dummies(df['Pclass'], prefix='class')\n",
" dummy_emb = pd.get_dummies(df['Embarked'], prefix='emb')\n",
" df = pd.concat([df, dummy_class, dummy_emb], axis=1)\n",
" \n",
" return df\n",
"\n",
"train = preprocess(train)\n",
"test = preprocess(test)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"test.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### This is some `statsmodels` `LogisticRegression` code. I ended up using `sklearn` instead because of its easy to use `cross_validation` module."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false,
"scrolled": true
},
"outputs": [],
"source": [
"# from statsmodels.tools.sm_exceptions import PerfectSeparationError\n",
"\n",
"# # baseline_cols = [\"Sex\", \"Age\", \"class_2\", \"class_3\", \"intercept\"]\n",
"# baseline_cols = [\"Sex\", \"Age\", \"class_2\", \"class_3\", \"intercept\"]\n",
"# t = []\n",
"# for name in df.columns:\n",
"# if name == \"Survived\" or name in baseline_cols:\n",
"# continue\n",
"\n",
"# independents = baseline_cols + [name]\n",
"\n",
"# # for i in independents:\n",
"# # print df[i].dtype\n",
"\n",
"# logit = sm.Logit(df[\"Survived\"], df[baseline_cols + [name]], missing='drop')\n",
"# result = logit.fit()\n",
"\n",
"# print result.prsquared\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Let's do some data mining. We assume that age, sex, and class are all good predictors. Let's check if any of the other variables give us any additional insight beyond these three features."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Added predictiveness beyond Age, class, and sex:\n",
"\n",
"maleAge: 0.45% extra accuracy on predictions\n",
"emb_Q: 0.34% extra accuracy on predictions\n",
"is20: 0.22% extra accuracy on predictions\n",
"emb_S: 0.00% extra accuracy on predictions\n",
"oneSibSp: -0.11% extra accuracy on predictions\n",
"Fare: -0.45% extra accuracy on predictions\n",
"hasParch: -0.56% extra accuracy on predictions\n"
]
}
],
"source": [
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn import cross_validation\n",
"\n",
"alg = LogisticRegression(random_state=1)\n",
"\n",
"scores = []\n",
"\n",
"possible_predictors = [\"Sex\", \"Age\", \"is20\", \"Fare\", \"oneSibSp\", \"hasParch\", \"maleAge\", \"class_2\", \"class_3\",\n",
" \"emb_Q\", \"emb_S\"]\n",
"\n",
"baseline_cols = [\"Sex\", \"Age\", \"class_2\", \"class_3\"]\n",
"baseline_score = cross_validation.cross_val_score(alg, train[baseline_cols], train[target], cv=3).mean()\n",
"\n",
"for name in possible_predictors:\n",
" if name == \"Survived\" or name in baseline_cols:\n",
" continue\n",
"\n",
" score = cross_validation.cross_val_score(alg, train[baseline_cols + [name]], train[target], cv=3).mean()\n",
" scores.append((name, score))\n",
"\n",
" \n",
"scores = sorted(scores, key=lambda x: x[1], reverse=True) \n",
"\n",
"print \"Added predictiveness beyond Age, class, and sex:\"\n",
"print\n",
"for var, score in scores:\n",
" print \"%s: %0.2f%% extra accuracy on predictions\" % (var, (score - baseline_score) * 100)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## So adjusting for age's effects based on Sex seems to help a bit, and there might be something special about the age 20. I chose to omit all other features from the final model because they added very little benefit and I'm not convinced it was due to anything other than noise."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.79012345679012341"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"predictors = [\"Sex\", \"Age\", \"class_2\", \"class_3\", \"maleAge\", \"is20\"]\n",
"score = cross_validation.cross_val_score(alg, train[predictors], train[target], cv=3).mean(); score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 79% accuracy isn't bad! Let's check what the baseline accuracy is:"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"0.61616161616161613"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"1 - train['Survived'].mean()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Generate a submission"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"alg = LogisticRegression(random_state=1)\n",
"\n",
"# Train the algorithm using all the training data\n",
"alg.fit(train[predictors], train[target])\n",
"\n",
"# Make predictions using the test set.\n",
"predictions = alg.predict(test[predictors])\n",
"\n",
"# Create a new dataframe with only the columns Kaggle wants from the dataset.\n",
"submission = pd.DataFrame({\n",
" \"PassengerId\": test[\"PassengerId\"],\n",
" \"Survived\": predictions\n",
" })\n",
"\n",
"submission.to_csv(\"kaggle.csv\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Loading