diff --git a/data_exploration.ipynb b/data_exploration.ipynb new file mode 100644 index 0000000..d0abfa5 --- /dev/null +++ b/data_exploration.ipynb @@ -0,0 +1,284 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import Titanic training set as data" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "import sys\n", + "sys.path.append('../DataScience16/ThinkStats2')\n", + "import thinkstats2, thinkplot\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "data = pd.read_csv('./train.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check for null data" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "PassengerId 0\n", + "Survived 0\n", + "Pclass 0\n", + "Name 0\n", + "Sex 0\n", + "Age 177\n", + "SibSp 0\n", + "Parch 0\n", + "Ticket 0\n", + "Fare 0\n", + "Cabin 687\n", + "Embarked 2\n", + "dtype: int64" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.isnull().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEACAYAAACj0I2EAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGgBJREFUeJzt3X2QFfW95/H3FxQDJsAo60DAYWQ1XrVWAbPEG3VzgoqG\n3TIkW5XVGJNxWTRGSVgrtwRSO8xgUj7EGJPKJckSJcbV7N6bjYCWD6h4TMhGRT0IisiTwwSQ4Skg\nDAnDMN/94zTjMMw5fZ6np+fzqpqiT3efX384HL40v/71r83dERGRvm9AbwcQEZHSUEEXEYkJFXQR\nkZhQQRcRiQkVdBGRmFBBFxGJidCCbmanmNmrZpYyszVmNi9YP8/MtprZm8HPNeWPKyIimVgu49DN\nbIi7HzKzgcCfgG8DXwAOuPsDZc4oIiI5yKnLxd0PBYunACcBx/4VsHKEEhGR/OVU0M1sgJmlgB3A\n8+6+Mth0u5mtMrNfmdmwsqUUEZFQOXW5dO5sNhR4ApgJ7AJ2u7ub2feBUe4+vTwxRUQkTF4FHcDM\n/gfQ2rXv3MzGAk+6+4U97K/JYkRECuDueXVr5zLKZcSx7hQzGwxcBawzs5Fddvsy8HaWUJH/mTdv\nXq9nUE5lVE7lPPZTiJNy2GcU8IiZDSD9D8D/cfenzew3ZjYe6ACagFsKSiAiIiURWtDdfQ0wsYf1\nXy9LIhERKYjuFA0kEonejpAT5SydvpARlLPU+krOQuR9UTTvA5h5uY8hIhI3ZobneVE0lz50EREA\namtr2bJlS2/HiJWxY8fS1NRUkrZ0hi4iOQvOGns7Rqxk+kwLOUNXH7qISEyooIuIxIQKuohITKig\ni0hsrF+/ngkTJjBs2DB+9rOfVey4AwYMYPPmzRU7XiYa5dLP1N9dT3NLc9Z9aqprmD9nflnbkPio\nr3+Q5uZ9ZWu/pmY48+fPymnf++67j8mTJ5NKpcqWpydm0ZhJXAW9n2luaaZ2Wm3WfZoWN5W9DYmP\n5uZ91NY2lK39pqbc296yZQvXX3992bJkEpWRP+pyEZFYuOKKK3jppZe47bbbGDp0KBs2bOC73/0u\nY8eOZdSoUXzrW9/i8OHDALz88suceeaZ/PCHP6S6uprRo0ezZMkSnnnmGc4991xGjBjB3Xff3dn2\nypUr+exnP0tVVRWjR49m5syZtLe395ijra0t43HLTQVdRGLhxRdf5PLLL2fBggV8+OGHLFiwgI0b\nN7J69Wo2btzItm3bmD//o27AHTt20NbWxvbt22lsbGTGjBk89thjpFIp/vCHP3DXXXd13kQ1cOBA\nHnzwQfbu3cuf//xnli9fzoIFC3rMceedd2Y9bjmpoItIrBzr/li4cCE//vGPGTZsGKeeeiqzZ8/m\nt7/9bed+gwYNYu7cuQwcOJDrrruO3bt3M2vWLIYMGcL555/P+eefz1tvvQXAxIkTmTRpEmZGTU0N\nN998My+//HKPxw87bjmpD11EYmfXrl0cOnSIiy++uHNdR0fHcX3dp59+eufFzMGDBwNwxhlndG4f\nPHgwBw8eBGDDhg3ccccdvP766/ztb3+jvb39uLbzOW456QxdRGJnxIgRDBkyhHfeeYe9e/eyd+9e\n9u3bx/79+wtq79Zbb+W8885j06ZN7Nu3jx/84Ac9FulSHzdfKugiEjtmxowZM5g1axa7du0CYNu2\nbSxbtqyg9g4cOMDQoUMZMmQI69at4+c//3lFjpsvdbmISFFqaobnNbSwkPZz1XU8+D333MP8+fO5\n5JJL2LNnD6NHj+bWW29lypQpoe/t/vr+++/n5ptv5r777mPChAlcd911LF++vMd97733XhobG3M+\nbilptsV+pm5WXU5jyH/94K/L2ob0TZptsfQ026KIiJxABV1EJCZU0EVEYkIXRSMkl0mvNm/YzLhz\nxmXcrkmxRPovFfQIyWXSqxVzVzB52uSM2zUplkj/FdrlYmanmNmrZpYyszVmNi9YX2Vmy8zsPTN7\nzsyGlT+uiIhkElrQ3f0w8Hl3nwCMB75gZpOA2cAL7n4usByYU9akIiKSVU4XRd39ULB4CuluGge+\nCDwSrH8EmFbydCIikrOcCrqZDTCzFLADeN7dVwLV7t4C4O47gDOytSEi0hfcdNNN1NfX93aMguR0\nUdTdO4AJZjYUeMLMLiB9ln7cbpne39DQ0LmcSCRIJBJ5BxWRaMpldFYx+svIrWQySTKZLKqNvEa5\nuPuHZpYErgFazKza3VvMbCSwM9P7uhZ0EYmXXEZnFaO/jNzqfrLb2NiYdxu5jHIZcWwEi5kNBq4C\n3gWWAnXBbt8AluR9dBGREjnrrLO4//77ueiii/jEJz7BjBkz2LlzJ1OnTmXo0KFMmTKlcxrbr3zl\nK4waNYqqqioSiQRr167N2O5TTz3FhAkTqKqq4rLLLmPNmjWV+i3lLZc+9FHAS2a2CngVeM7dnwbu\nBa4ys/eAK4B7yhdTRCTc73//e1588UXWr1/P0qVLmTp1Kvfccw+7d+/m6NGj/PSnPwVg6tSpbNq0\niZ07dzJx4kRuuOGGHttLpVJMnz6dhQsXsnfvXm655RauvfZajhw5UsnfVs5yGba4xt0nuvt4d7/Q\n3X8QrN/r7le6+7nuPsXd95U/rohIZjNnzmTEiBGMGjWKyy+/nM985jNceOGFDBo0iC996UukUikA\n6urqGDJkCCeffDL19fW89dZbHDhw4IT2Fi5cyDe/+U0+/elPY2bceOONnHLKKbzyyiuV/q3lRHO5\niEhsVFdXdy4PHjz4hNcHDx6ko6OD2bNnc/bZZzN8+HDOOusszIzdu3ef0N6WLVv40Y9+xGmnncZp\np51GVVUVW7duZfv27RX5/eRLt/6LSL/y+OOPs3TpUpYvX05NTQ379++nqqqqxznJzzzzTL73ve8x\nZ07fuG9SZ+gi0q8cPHiQj33sY1RVVdHa2sqcOXNOeFrRMTNmzOAXv/gFr732GgCtra08/fTTtLa2\nVjJyznSGLiJFqamuKevQwprqmpz2y/YIua6+/vWv8+yzzzJ69GhOP/107rrrLn75y1/2uO/FF1/M\nwoULuf3229m4cSODBw/msssu43Of+1x+v4kKUUEXkaJE5aafzZs3H/f6N7/5zXGvp0+fzvTp0wFY\nvHjxcdu+9rWvdS4vWrTouG1TpkypyPNAS0FdLiIiMaGCLiISEyroIiIxoYIuIhITKugiIjGhgi4i\nEhMatigiORs7dmzG8d1SmLFjx5asLRV0EclZU1NTb0eQLNTlIiISEyroIiIxoYIuIhITKugiIjGh\ngi4iEhMq6CIiMaGCLiISEyroIiIxoRuLYia1KkXdrLrM21enqJ1WW7E8IlI5Kugx09rWmrVgr3ht\nReXCiEhFhXa5mNkYM1tuZu+Y2Rozmxmsn2dmW83szeDnmvLHFRGRTHI5Q28H7nD3VWb2ceANM3s+\n2PaAuz9QvngiIpKr0ILu7juAHcHyQTN7FxgdbNa0ayIiEZHXKBczqwXGA68Gq243s1Vm9iszG1bi\nbCIikoecL4oG3S2/A74TnKkvAOa7u5vZ94EHgOk9vbehoaFzOZFIkEgkisksZVaKkTJhbWzesJlx\n54zLuL2muob5c+aHJBWJj2QySTKZLKqNnAq6mZ1Eupg/6u5LANx9V5ddFgJPZnp/14Iu0VeKkTKh\nbcxdweRpkzNub1rcFHoMkTjpfrLb2NiYdxu5drk8DKx1958cW2FmI7ts/zLwdt5HFxGRkgk9Qzez\nS4EbgDVmlgIcmAt81czGAx1AE3BLGXOKiEiIXEa5/AkY2MOmZ0sfR0RECqW5XEREYkIFXUQkJlTQ\nRURiQgVdRCQmVNBFRGJCBV1EJCZU0EVEYkIFXUQkJlTQRURiQgVdRCQmVNBFRGJCBV1EJCZU0EVE\nYkIFXUQkJlTQRURiIudniopUUtgzSUHPHRXpTgVdIinsmaSg546KdKcuFxGRmFBBFxGJCRV0EZGY\nUEEXEYkJXRTtZ/bs2cfixcnQffqCsJEwGgUj/Y0Kej/T3t7B8OGJrPtsal9dmTBFChsJo1Ew0t+E\ndrmY2RgzW25m75jZGjP7drC+ysyWmdl7ZvacmQ0rf1wREckklz70duAOd78A+EfgNjP7B2A28IK7\nnwssB+aUL6aIiIQJLejuvsPdVwXLB4F3gTHAF4FHgt0eAaaVK6SIiITLa5SLmdUC44FXgGp3b4F0\n0QfOKHU4ERHJXc4XRc3s48DvgO+4+0Ez8267dH/dqaGhoXM5kUiQSCTyS9lPpFJrWUVT1n36yggU\nEclPMpkkmUwW1UZOBd3MTiJdzB919yXB6hYzq3b3FjMbCezM9P6uBV0ya21tY8zwqVn36SsjUEQk\nP91PdhsbG/NuI9cul4eBte7+ky7rlgJ1wfI3gCXd3yQiIpUTeoZuZpcCNwBrzCxFumtlLnAv8C9m\n9l+BLcBXyhlURESyCy3o7v4nYGCGzVeWNo6IiBRKc7mIiMSECrqISExoLhcpi7BJwDT8UqT0VNCl\nLMImAdPwS5HSU5eLiEhMqKCLiMSECrqISEyooIuIxIQuilZQ/d31NLc0Z9y+Z/8HjKlgHhGJFxX0\nCmpuac76yLT2J49ULoyIxI66XEREYkIFXUQkJlTQRURiQgVdRCQmdFE0ZsLmUDl8uK1yYUSkolTQ\nYyZsDpUOX1m5MCJSUepyERGJCRV0EZGYUEEXEYkJFXQRkZhQQRcRiQkVdBGRmFBBFxGJidCCbmYP\nmVmLma3usm6emW01szeDn2vKG1NERMLkcoa+CLi6h/UPuPvE4OfZEucSEZE8hRZ0d18B/LWHTVb6\nOCIiUqhi+tBvN7NVZvYrMxtWskQiIlKQQudyWQDMd3c3s+8DDwDTM+3c0NDQuZxIJEgkEgUeVkQk\nnpLJJMlksqg2Ciro7r6ry8uFwJPZ9u9a0EVE5ETdT3YbGxvzbiPXLhejS5+5mY3ssu3LwNt5H1lE\nREoq9AzdzB4HEsDpZtYMzAM+b2bjgQ6gCbiljBlFRCQHoQXd3b/aw+pFZcgiIiJF0J2iIiIxoYIu\nIhITegSd5C3suaUQjWeXplalqJtVl3F7TXUN8+fML+oY9XfX09zSXNZjiORKBV3yFvbcUojGs0tb\n21qpnVabcXvT4qaij9Hc0lz2Y4jkSl0uIiIxoYIuIhITKugiIjGhgi4iEhO6KCqSRdgoltTqVNaL\noiKVpIIukkXYKJYVr62oXBiREOpyERGJCRV0EZGYUEEXEYkJFXQRkZjQRdEKSqXWsoqmjNujMP9J\nVOQyX8yePfsqE0akj1BBr6DW1jbGDJ+acXsU5j+Jilzmi9nUvroyYUT6CHW5iIjEhAq6iEhMqKCL\niMSECrqISEyooIuIxIRGucgJDh8+nHXIYF8ZXhk29PGDZ9dnfUQdRGPyrbAJwkCPupM0FXQ5QYeT\ndchgXxleGTb0cdOR1aHFOgqTb4VNEAZ61J2khXa5mNlDZtZiZqu7rKsys2Vm9p6ZPWdmw8obU0RE\nwuTSh74IuLrbutnAC+5+LrAcmFPqYCIikp/Qgu7uK4C/dlv9ReCRYPkRYFqJc4mISJ4K7UM/w91b\nANx9h5mdkW3nP/7xjxm3jR49mnHjxhUYQ0REjinVRVHPtnHmXTM7l0f+25GMPHskAG1/b+OcQefQ\n+E+NJYohfUUURtKEZYDiJwBLrUqFjqTRCBUBSCaTJJPJotootKC3mFm1u7eY2UhgZ7adp83tuUfm\n4N6DsLHABNKnRWEkTVgGKH4CsNa2Vo1QkZwkEgkSiUTn68bG/E90c72xyIKfY5YCdcHyN4AleR9Z\nRERKKpdhi48D/w/4lJk1m9lNwD3AVWb2HnBF8FpERHpRaJeLu381w6YrS5xFRESKoLlcRERiok/c\n+l9f/yDNzZlHG9TUDGf+/FkVTNSzsJx79uxjTJHHiMLoEPlI2HwxuYySCRsJE4X5ZKRv6BMFvbl5\nH7W1DRm3NzVl3lZJYTnbX3y46GNEYXSIfCR0vpgcRsmEjYSJwnwy0jeoy0VEJCZU0EVEYkIFXUQk\nJlTQRURiQgVdRCQmen2Uy6JH/xeP/Uv2mQMO7G7j1rqGygSSPkNDOEWO1+sF/cDhg1z0n2/Mus8b\nDxU/3E/iR0M4RY6nLhcRkZhQQRcRiQkVdBGRmFBBFxGJiV6/KCoi5Vd/dz3NLc0Zt+sxePGggi7S\nDzS3NGedAEyPwYsHdbmIiMSECrqISEyooIuIxIQKuohITKigi4jERL8Y5fIfrr6a7Xtasu7zydOr\n+cNzz1UokchHwp5LunXrjqzbATx1qLShCqChkb2vXxT07XtaGPOfpmXdZ+tTiyuURuR4Yc8lbT+6\nMut2gK2tvf/91dDI3ldUQTezJmA/0AEccfdJpQglIiL5K/YMvQNIuPtfSxFGREQKV+xFUStBGyIi\nUgLFFmMHnjezlWY2oxSBRESkMMV2uVzq7h+Y2b8hXdjfdfcV3XdK/jrZuVw7vpba8bV5HeRw234W\nJ+sybve/vw805NWmSC70mLvKqsRImaiOxkkmkySTyaLaKKqgu/sHwa+7zOwJYBJwQkFP1CWKOQwd\nJ7UzPFGbcfvWp1YV1b5IJnrMXWVVYqRMVEfjJBIJEolE5+vGxsa82yi4y8XMhpjZx4PlU4EpwNuF\nticiIsUp5gy9GnjCzDxo5zF3X1aaWCIikq+CC7q7vw+ML2EWEREpQkXuFP3www97XN96sBV3r0QE\nEZHYq0hBfym5ocf1bQcORWaUwJ69O6ibVZdxu+ahkCgL+/4+s2wZo/hUxu0fPLs+6/sBNm/YzLhz\nxmXcnlqdynqxUcqvIgV92NCLe1x/8GgL7m9UIkKodmuL5JVvkVyEfX8PPfn3rKN1Nh1ZHVqMV8xd\nweRpkzNvf+2EAW5SYbrLU0QkJlTQRURiQgVdRCQmVNBFRGJCBV1EJCZi8cSirdve5+xPZ77HaeuO\n9xlTgRypdS+xqqkp4/bD7fsqkEL6o0pMIhb2qLw9e7J/v595dlnWv6cABw7t5NZpNxcST4hJQW8f\n0J71EXNNC39YkRyt7fsZk2USsY4NRyuSQ/qfSkwiFvaovE3tq7O+/9CRv3NeyKMg33j0p4VEk4C6\nXEREYkIFXUQkJlTQRURiQgVdRCQmYnFRtBTCRgl88MwW6vY1ZG1jz559FRlNI31H2PcqvU/vT1DX\nV3JKdirogbBRApsOPUxtbUPWNtpffLi0oaTPC/tepffp/cfY9ZWckp26XEREYkIFXUQkJlTQRURi\nQgVdRCQmdFE0R4fb9rM4WZd9H83VIv1YJeaTSa1KhT4qrxKPi6y/u57mluaCM4S9v1Aq6DnqOKmd\n4VnmaQHN1SL9WyXmk2ltaw19VF4lHhfZ3NJc1CMrw94PwE/yjlVcl4uZXWNm68xsvZndWUxbIiJS\nnIILupkNAH4GXA1cAFxvZv9QqmCV5of7xtm1cpZOX8gIyllqO7bu6O0IZVPMGfokYIO7b3H3I8D/\nBr5YmliV11e+jMpZOn0hIyhnqamg92w08Jcur7cG60REpBdU5KLo9leSPa4/euQIhlUigohI7Jm7\nF/ZGs0uABne/Jng9G3B3v7fbfoUdQESkn3P3vM54iynoA4H3gCuAD4DXgOvd/d2CGhQRkaIU3OXi\n7kfN7HZgGem++IdUzEVEek/BZ+giIhItZZvLJco3HZnZQ2bWYmaru6yrMrNlZvaemT1nZsN6OeMY\nM1tuZu+Y2Roz+3ZEc55iZq+aWSrIOS+KOYNMA8zsTTNbGtWMAGbWZGZvBZ/pa8G6SGU1s2Fm9q9m\n9m7wHf1MBDN+KvgM3wx+3W9m345aziDrfzezt81stZk9ZmaDCslZloLeB246WkQ6W1ezgRfc/Vxg\nOTCn4qmO1w7c4e4XAP8I3BZ8hpHK6e6Hgc+7+wRgPPAFM5tExHIGvgOs7fI6ihkBOoCEu09w90nB\nuqhl/QnwtLufB1wErCNiGd19ffAZTgQuBlqBJ4hYTjP7JDATmOjuF5LuCr+eQnK6e8l/gEuAZ7q8\nng3cWY5jFZFxLLC6y+t1QHWwPBJY19sZu+VdDFwZ5ZzAEOB14N9HLScwBngeSABLo/xnDrwPnN5t\nXWSyAkOBTT2sj0zGHrJNAf4YxZzAJ4EtQFVQzJcW+ne9XF0uffGmozPcvQXA3XcAZ/Rynk5mVkv6\n7PcV0n/AkcoZdGWkgB3A8+6+kujl/DHwT0DXi0ZRy3iMA8+b2Uoz+2/BuihlPQvYbWaLgu6M/2lm\nQyKWsbv/AjweLEcqp7tvB34ENAPbgP3u/gIF5NR86JlF4mqxmX0c+B3wHXc/yIm5ej2nu3d4ustl\nDDDJzC4gQjnN7D8CLe6+CrLeydbrn2XgUk93E0wl3dV2ORH6PEmfRU4E/jnI2Ur6f+FRytjJzE4G\nrgX+NVgVqZxmNpz0tCljSZ+tn2pmN/SQKzRnuQr6NqCmy+sxwbooazGzagAzGwns7OU8mNlJpIv5\no+6+JFgduZzHuPuHQBK4hmjlvBS41sw2A78FJpvZo8COCGXs5O4fBL/uIt3VNolofZ5bgb+4++vB\n6/9LusBHKWNXXwDecPfdweuo5bwS2Ozue939KOl+/s9SQM5yFfSVwNlmNtbMBgHXke4XihLj+LO1\npUBdsPwNYEn3N/SCh4G17t51ZuRI5TSzEceuvpvZYOAq4F0ilNPd57p7jbuPI/1dXO7uNwJPEpGM\nx5jZkOB/ZZjZqaT7ftcQrc+zBfiLmX0qWHUF8A4RytjN9aT/IT8majmbgUvM7GNmZqQ/z7UUkrOM\nHf3XkL6TdAMwuzcvOvSQ7XFgO3A4+DBvIn1B4oUg8zJgeC9nvBQ4CqwCUsCbwWd6WsRy/rsg2ypg\nNfC9YH2kcnbJ+zk+uigauYyk+6eP/ZmvOfZ3J2pZSY9sWRlk/T0wLGoZg5xDgF3AJ7qsi2LOeaRP\nhFYDjwAnF5JTNxaJiMSELoqKiMSECrqISEyooIuIxIQKuohITKigi4jEhAq6iEhMqKCLiMSECrqI\nSEz8f80ocpZuFnMIAAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(data.Age[data.Sex=='female'], bins=np.arange(0, 82, 2), alpha=.5)\n", + "plt.hist(data.Age[data.Sex=='male'], bins=np.arange(0, 82, 2), alpha=.5)\n", + "plt.legend(['female', 'male'])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Age distribution for men and women are fairly similar. If they were different, it could affect other investigations." + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEACAYAAACj0I2EAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAG2FJREFUeJzt3X+QVOW95/H3F1AWQflxWUAZhwHdcCmTCGjYGL2h/Rl1\njSEaE0CzDKuWpq4B19SWGFyYoUx5NYVX1iRVG6IWawXcmAiiKwEU2wq5V0FtAgiiUYYGhIkMomEI\nMwzz3T+6ZzIM0z+mf83pw+dVNUX3c04/50Mz8+XMc55+jrk7IiJS/nr1dAARESkMFXQRkZBQQRcR\nCQkVdBGRkFBBFxEJCRV0EZGQyFjQzayvmb1pZjEz22Jm85Pt881sj5m9k/y6tvhxRUQkFctmHrqZ\nneHuR8ysN/BHYBZwHfBXd3+syBlFRCQLWQ25uPuR5MO+QB+g7X8BK0YoERHpvqwKupn1MrMYsB9Y\n6+4bk5vuMbNNZvYrMxtYtJQiIpJRVkMu7TubnQUsB34IfAIccHc3s4eAs9399uLEFBGRTLpV0AHM\n7H8CjR3Hzs1sFPCiu3+5i/21WIyISA7cvVvD2tnMchnaNpxiZv2Aq4H3zGxEh91uAramCRX4r/nz\n5/d4BuVURuVUzravXPTJYp+zgSVm1ovEfwD/191fNrP/Y2bjgVagDrgrpwQiIlIQGQu6u28BJnbR\n/l+LkkhERHKiT4omRSKRno6QFeUsnHLICMpZaOWSMxfdvija7QOYebGPISISNmaGd/OiaDZj6CIS\nclVVVezataunY5ySRo0aRV1dXUH60hm6iLSdDfZ0jFNSqvc+lzN0jaGLiISECrqISEiooIuIhIQK\nuogIcP311/PMM88UvN+ZM2cyb968gvfbFc1yCZB5D88jXh9Pu0/l8EoWPLAg0MeQcJg373Hi8UNF\n67+ychALFtxbtP676+WXX+7pCHlTQQ+QeH2cqilVafepW1EX+GNIOMTjh6iqqila/3V1xeu7s+PH\nj9O7d++SHa+naMhFRALvkUceoaKigrPOOotx48bx2muvnTSU8frrr3Puuee2Px89ejSPPvooF154\nIQMGDODRRx/llltuOaHf2bNnc++9id8SLr/8cp566imam5sZPHgw27Zta9/vwIEDnHHGGRw4cACA\nl156iQkTJjB48GAuu+wytmzZ0r5vLBbjoosuYuDAgUydOpWjR48W5T3pigq6iATa+++/z89//nPe\nfvttPv/8c1avXs2oUaO63NfsxGnbzz77LKtWreLQoUNMnTqVVatW0djYCEBrayvPPfcct9566wmv\nOf3007n55ptZtmxZe9tvfvMbIpEIQ4cOJRaLcfvtt7N48WIOHjzIXXfdxY033sixY8c4duwY3/72\nt5kxYwYHDx7klltu4Xe/+12B35HUVNBFJNB69+5Nc3MzW7dupaWlhcrKSsaMGZPVa2fPns0555xD\n3759qaysZOLEiSxfvhyAV199lf79+/OVr3zlpNdNmzbthIK+dOnS9sK/ePFi7r77bi6++GLMjO9/\n//v07duXN954gzfeeIOWlhZmzZpF7969ufnmm7vsv1hU0EUk0M477zwef/xxampqGDZsGNOnT2ff\nvn1ZvbaiouKE5x0L9bJly5g+fXqXr7v88sv529/+xsaNG9m1axd/+tOfmDJlCgC7du1i4cKFDBky\nhCFDhjB48GD27NnDxx9/zMcff8zIkSNP6CvVbxPFoIIuIoE3depU/vCHPxCPJ2Zo3X///QwYMIAj\nR46079NVke88BHPLLbcQjUbZu3cvy5cvT1nQe/XqxXe/+12WLl3KsmXLuOGGG+jfvz8A5557LnPn\nzuXgwYMcPHiQTz/9lMOHD/O9732Ps88+m717957QV1vmUtAslzIT2xSj+t7qlNtLMeVQUx+llN5/\n/3327t3LpZdeyumnn06/fv1obW1l/PjxLFy4kLlz59LU1MSiRYsy9jV06FAmT57MzJkzGTNmDGPH\njk2577Rp05gyZQpDhw7lJz/5SXv7nXfeyU033cSVV17JpEmTaGxs5PXXX2fy5Mlccskl9OnThyee\neIIf/OAHrFy5kg0bNnDFFVcU5L3IRAW9zDQ2N6addliKKYea+nhqqKwcVNSphZWVg7Lar6mpiTlz\n5vDee+9x2mmn8bWvfY1f/vKXDB48mLVr11JVVcXo0aOZOXMmCxcubH9d57PzNtOnT2fGjBn89Kc/\nPaG98/6TJk2if//+7Nu3j+uuu669/aKLLmLx4sXcc889/PnPf6Zfv35cdtllTJ48mdNOO43nn3+e\nO+64gwcffJDrr7+em2++Odu3JG8q6CLSpaB86OdLX/oSb775Zpfbnn322ROez549u/3xRx991OVr\nbrvtNm677baT2tetW3dS2wcffNBlH9dccw3XXHNNl9smTpzIO++80+W2YtMYuohISKigi4iEhAq6\niEhIqKCLiISECrqISEhkLOhm1tfM3jSzmJltMbP5yfbBZrbGzHaY2WozG1j8uCIikkrGgu7uTcDl\n7j4BGA9cZ2aTgDnAK+4+FlgHPFDUpCIiklZWQy7u3vb52r4k5q478C1gSbJ9CTCl4OlERCRrWRV0\nM+tlZjFgP7DW3TcCw929HsDd9wPDihdTROTv2tZCX79+PePGjcupj87rp4dBVp8UdfdWYIKZnQUs\nN7MLSJyln7BbqtfX1NS0P45EIkQikW4HFZHSymbNnnwUYr2fyy67jO3bt+f8+lTLA/SEaDRKNBrN\nq49uffTf3T83syhwLVBvZsPdvd7MRgB/SfW6jgVdRMpDNmv25EPr/Zyo88lubW1tt/vIZpbL0LYZ\nLGbWD7ga2A6sBKqTu80AXuj20UVEspDqtm6dh0327dvHd77zHYYNG8Z5553HE0880b7t6NGjVFdX\nM2TIEL74xS+ycePGkv89ii2bMfSzgdfMbBPwJrDa3V8GHgGuNrMdwJXAvxQvpoicqjLd1q1t2MTd\n+eY3v8mECRPYt28fr776KosWLWLt2rVAYqRg586d7Ny5k9WrV7NkyZIuj1fOspm2uMXdJ7r7eHf/\nsrv/JNl+0N2vcvex7n6Nux8qflwROdVke1u3DRs2cODAAebOnUvv3r2pqqrijjvuaF+R8bnnnuPB\nBx9k4MCBjBw5klmzZpX6r1J0Wj5XRAIt29u6xeNx9u7dy5AhQ4DEGXtraytf//rX2/vpeEu6Ut4a\nrlRU0EUk0FLd1u38888/oe3cc89lzJgx7Nixo8t+zjnnHHbv3t0+zXHXrl3FCdyDtJaLiARax9u6\ntbS08Pzzz7Nhw4b27e6JGdOTJk3izDPP5NFHH+Xo0aMcP36cd999l7feegtI3E/04Ycf5tChQ+zZ\ns4ef/exnPfL3KSadoYtIlyqHVxZ1amHl8Mqs9st0W7e2i6K9evXipZde4r777mP06NE0NzczduxY\nHnroIQDmz5/P3XffzejRoxk5ciQzZ87M6j6k5UQFXUS6FKSbfKe7rVs8/vcPP40YMYKlS5d2uV+/\nfv1Omtnyox/9qHAhA0BDLiIiIaGCLiISEiroIiIhoYIuIhISKugiIiGhgi4iEhKatigijBo1KlBr\ng59KCrkEgQq6iFBXV9fTEaQANOQiIhISKugiIiGhIZeQiW2KUX1vdertm2NFva2YiPQcFfSQaWxu\nTFuw129YX7owIlJSGnIREQkJFXQRkZBQQRcRCQkVdBGRkNBFUSmKTLNtKodXBuoGCiJhoIIuRZFp\ntk0xb20mcqrKOORiZhVmts7M3jWzLWb2w2T7fDPbY2bvJL+uLX5cERFJJZsz9BbgPnffZGYDgLfN\nbG1y22Pu/ljx4omISLYyFnR33w/sTz4+bGbbgZHJzVqeTUQkILo1y8XMqoDxwJvJpnvMbJOZ/crM\nBhY4m4iIdEPWF0WTwy2/BWYnz9R/ASxwdzezh4DHgNu7em1NTU3740gkQiQSySeziEjoRKNRotFo\nXn1kVdDNrA+JYv6Mu78A4O6fdNhlMfBiqtd3LOgiInKyzie7tbW13e4j2yGXp4Bt7r6orcHMRnTY\nfhOwtdtHFxGRgsl4hm5mlwK3AlvMLAY48GNgupmNB1qBOuCuIuYUEZEMspnl8kegdxebfl/4OCIi\nkiut5SIiEhIq6CIiIaGCLiISEiroIiIhoYIuIhISKugiIiGhgi4iEhIq6CIiIaGCLiISEiroIiIh\noYIuIhISKugiIiGhgi4iEhIq6CIiIaGCLiISEiroIiIhoYIuIhISKugiIiGhgi4iEhIq6CIiIaGC\nLiISEn16OoCUVkPDIVasiKbdx2NHShNGRApKBf0U09LSyqBBkbT77GlcUZowIlJQGYdczKzCzNaZ\n2btmtsXMZiXbB5vZGjPbYWarzWxg8eOKiEgq2YyhtwD3ufsFwCXAP5vZPwJzgFfcfSywDnigeDFF\nRCSTjAXd3fe7+6bk48PAdqAC+BawJLnbEmBKsUKKiEhm3ZrlYmZVwHjgDWC4u9dDougDwwodTkRE\nspf1RVEzGwD8Fpjt7ofNzDvt0vl5u5qamvbHkUiESCTSvZQiIiEXjUaJRqN59ZFVQTezPiSK+TPu\n/kKyud7Mhrt7vZmNAP6S6vUdC7qIiJys88lubW1tt/vIdsjlKWCbuy/q0LYSqE4+ngG80PlFIiJS\nOhnP0M3sUuBWYIuZxUgMrfwYeAT4jZn9N2AX8N1iBhURkfQyFnR3/yPQO8XmqwobR0REcqW1XERE\nQkIFXUQkJLSWi5yk4eB+qu+tTrk9tjlG1ZSqvI4R2xRLe4zK4ZUseGBBXscQOdWooMtJWqw5bcFe\nv2F93sdobG5Me4y6FXV5H0PkVKMhFxGRkFBBFxEJCRV0EZGQUEEXEQkJXRQtoXkPzyNeH0+5fdWa\nNZzNF9L20dBwqNCxiiLTre7K5e8hUk5U0EsoXh9PO7PjyItHM94e7sOWzYUNVSSZbnVXLn8PkXKi\nIRcRkZBQQRcRCQkVdBGRkFBBFxEJCV0UlZM0NTX1+AyVVb9fw/kXj0+7T/PRv3LFVf+UcrvWg5FT\njQq6nKTV6fEZKkeOHWXcDVPS7vP2M/9L68GIdKAhFxGRkFBBFxEJCRV0EZGQUEEXEQkJFXQRkZDQ\nLJeQybQoVlNTc+nCpJFpamRQcoqUExX0kMm0KFarbyxdmDQyTY0MSk6RcpJxyMXMnjSzejPb3KFt\nvpntMbN3kl/XFjemiIhkks0Y+tPAN7pof8zdJya/fl/gXCIi0k0ZC7q7rwc+7WKTFT6OiIjkKp9Z\nLveY2SYz+5WZDSxYIhERyUmuF0V/ASxwdzezh4DHgNtT7VxTU9P+OBKJEIlEcjysBEGmGSqJfXp+\nlkpsU4zqe6tTbi/E4l2ZbiuoBcIkW9FolGg0mlcfORV0d/+kw9PFwIvp9u9Y0KX8ZZqhktin52ep\nNDY3Fn3xrky3FdQCYZKtzie7tbW13e4j2yEXo8OYuZmN6LDtJmBrt48sIiIFlfEM3cyWAhHgH8ws\nDswHLjez8UArUAfcVcSMIiKShYwF3d2nd9H8dBGyiIhIHrSWi4hISKigi4iEhAq6iEhIqKCLiISE\nCrqISEiooIuIhIQKuohISKigi4iEhO5YJKesTIt3gRbXkvKigi6nrEyLd4EW15LyoiEXEZGQUEEX\nEQkJFXQRkZAoyRj67t27U24bNGgQZ555ZiliiIiEWkkKem3tmi7bW1qaGTeuD/fff2cpYvS4WGwb\nm6hLuT0It20rJ5luhdfQcKh0YUQCoCQFvaKi69uNHj5cz5Ejz5ciQiA0NjZTMej6lNuDcNu2cpLp\nVngftmwuXRiRANAYuohISKigi4iEhAq6iEhIqKCLiISECrqISEiUxVou8+Y9TjyeegpaZeUgFiy4\nt4SJ5FSRaQGv2OZY2vVgtACYlFJZFPR4/BBVVTUpt9fVpd4mko9MC3it37A+r9eDFgCTwsk45GJm\nT5pZvZlt7tA22MzWmNkOM1ttZgOLG1NERDLJZgz9aeAbndrmAK+4+1hgHfBAoYOJiEj3ZCzo7r4e\n+LRT87eAJcnHS4ApBc4lIiLdlOssl2HuXg/g7vuBYYWLJCIiuSjURVFPtzEarWl/XFUVoaoqUqDD\niqRWLot3xWLbqK6uSbn9o33/zphxZ6ftQzNlyl80GiUajebVR64Fvd7Mhrt7vZmNAP6SbudIpCbH\nw4jkrlwW72psbE47i2v91vFcMeWStH1opkz5i0QiRCKR9ue1tbXd7iPbIRdLfrVZCVQnH88AXuj2\nkUVEpKCymba4FPg34AtmFjezmcC/AFeb2Q7gyuRzERHpQRmHXNx9eopNVxU4i4iI5EFruYiIhIQK\nuohISKigi4iEhAq6iEhIqKCLiISECrqISEiooIuIhIQKuohISPT4HYuefvYX/Pr//e+0+/z1QDM/\nSLN4USbzHp5HvD6edp9yWdwo04JTTU3NpQtT5jK9l5B5Aa+GhkNlsQCYnBp6vKD/tfkwF94wI+0+\nbz/5VF7HiNfHQ3MbsEwLTrX6xtKFKXOZ3kvIvIBXS0trWSwAJqcGDbmIiISECrqISEiooIuIhIQK\nuohISPT4RdEwmTfvceLx1LMaGhoOUVHCPJK/IMwqamg4mHE2jseOpN2eaaZXuczykvRU0AsoHj+U\n9lZiLa/mN1tHSi8Is4paWjLPxtnTuCLt9kwzvcpllpekpyEXEZGQUEEXEQkJFXQRkZBQQRcRCQkV\ndBGRkFBBFxEJCRV0EZGQyGseupnVAZ8BrcAxd59UiFAiItJ9+X6wqBWIuPunhQgjIiK5y3fIxQrQ\nh4iIFEC+xdiBtWa20czuLEQgERHJTb5DLpe6+z4z+48kCvt2d1/feadotKb9cVVVhKqqSJ6HLbzY\nphjV91an3J7N4kWx915jU11dyu1NLbodmZys4eB+VkSrU24vl++bQiwAVopFxIK6UFk0GiUajebV\nR14F3d33Jf/8xMyWA5OAkwp6JFKTz2FKorG5Me/FixpbPqMikrqP1g+Odz+YhF6LNTMoBN83hVgA\nrBSLiAV1obJIJEIkEml/Xltb2+0+ch5yMbMzzGxA8nF/4Bpga679iYhIfvI5Qx8OLDczT/bza3df\nU5hYIiLSXTkXdHffCYwvYBYREcmDphyKiITEKXHHolhsG5uoS7tPQ0N5zCQQ6UpDw0Gqq2tSbl/1\n769zdpqfgX2/fz/tLC+Ajz74iDH/aUzK7bHNsbQXG2OxbWkzAsR2bkvbh6R3ShT0xsZmKgZdn3af\nD1s2lyiNSOG1tJD29odHXn0q7W3sPjy2OWMhXf/j9Vwx5YrU2zecNMHtBI2NzWkzAqzfmv5WepKe\nhlxEREJCBV1EJCRU0EVEQkIFXUQkJFTQRURCoixmuTQ1f5Z28aIPdrzI+otTXx3fs38nFXlmyGbK\nVUPDobyPI5KLTD8jhVjgq6HhECtWRNNul55VFgW9tU9L2sWLmj44SsUNU1Jur1v807wzZDPlquXV\np/I+jkguMv2MFGKBr5aW1vRTHzX1t8dpyEVEJCRU0EVEQkIFXUQkJEoyhr5797912X7kyEFaW4+V\nIoKISOiVpKC/3by4y/am5s9paj1ciggikkZTU1PaGSyJfZrz6mPP3rq0M3EAGj7bl3a7pFeSgj7o\nvFFdth/+tB42lCKBiKTT6qSdwZLYZ2NefbT02ph2Jg7Ah8/oN/Z8aAxdRCQkVNBFREJCBV1EJCRU\n0EVEQqIsPvpfCpmu0GudCslFIWaPyN/FNsUy3iqvcnglCx5YUNQc8x6eR7w+nnOGefMeJx4vfE1R\nQU/KdIX+wxat0yLdV4jZI/J3jc2NGW+VV7eirug54vXxtDkyZYjHD2VcGwpquxsrvyEXM7vWzN4z\ns/fN7P58+hIRkfzkXNDNrBfwM+AbwAXANDP7x0IFKzVvyn81ulJQzsIph4ygnIW2f8/+no5QNPmc\noU8CPnD3Xe5+DHgW+FZhYpVeuXwzKmfhlENGUM5CU0Hv2khgd4fne5JtIiLSA0pyUfStXzzRZbu3\nOt6rPP5XFxEJOnP33F5o9lWgxt2vTT6fA7i7P9Jpv9wOICJyinN3687++RT03sAO4EpgH4lltqa5\n+/acOhQRkbzkPOTi7sfN7B5gDYmx+CdVzEVEek7OZ+giIhIsRVvLJcgfOjKzJ82s3sw2d2gbbGZr\nzGyHma02s4E9nLHCzNaZ2btmtsXMZgU0Z18ze9PMYsmc84OYM5mpl5m9Y2Yrg5oRwMzqzOxPyfd0\nQ7ItUFnNbKCZPWdm25Pfo/85gBm/kHwP30n++ZmZzQpazmTW/25mW81ss5n92sxOzyVnUQp6GXzo\n6GkS2TqaA7zi7mOBdcADJU91ohbgPne/ALgE+OfkexionO7eBFzu7hOA8cB1ZjaJgOVMmg1s6/A8\niBkBWoGIu09w90nJtqBlXQS87O7jgAuB9whYRnd/P/keTgQuAhqB5QQsp5mdA/wQmOjuXyYxFD6N\nXHK6e8G/gK8Cqzo8nwPcX4xj5ZFxFLC5w/P3gOHJxyOA93o6Y6e8K4CrgpwTOAN4C/hK0HICFcBa\nIAKsDPK/ObAT+IdObYHJCpwFfNhFe2AydpHtGuAPQcwJnAPsAgYni/nKXH/WizXkUo4fOhrm7vUA\n7r4fGNbDedqZWRWJs983SPwDBypncigjBuwH1rr7RoKX81+B/wF0vGgUtIxtHFhrZhvN7I5kW5Cy\njgYOmNnTyeGMX5rZGQHL2Nn3gKXJx4HK6e4fAwuBOLAX+MzdXyGHnFoPPbVAXC02swHAb4HZ7n6Y\nk3P1eE53b/XEkEsFMMnMLiBAOc3svwD17r4JSDevt8ffy6RLPTFMcD2JobZ/IkDvJ4mzyInAz5M5\nG0n8Fh6kjO3M7DTgRuC5ZFOgcprZIBLLpowicbbe38xu7SJXxpzFKuh7gcoOzyuSbUFWb2bDAcxs\nBPCXHs6DmfUhUcyfcfcXks2By9nG3T8HosC1BCvnpcCNZvYRsAy4wsyeAfYHKGM7d9+X/PMTEkNt\nkwjW+7kH2O3ubyWf/45EgQ9Sxo6uA9529wPJ50HLeRXwkbsfdPfjJMb5v0YOOYtV0DcC55vZKDM7\nHZhKYlwoSIwTz9ZWAtXJxzOAFzq/oAc8BWxz90Ud2gKV08yGtl19N7N+wNXAdgKU091/7O6V7j6G\nxPfiOnf/PvAiAcnYxszOSP5Whpn1JzH2u4VgvZ/1wG4z+0Ky6UrgXQKUsZNpJP4jbxO0nHHgq2b2\nH8zMSLyf28glZxEH+q8l8UnSD4A5PXnRoYtsS4GPgabkmzmTxAWJV5KZ1wCDejjjpcBxYBMQA95J\nvqdDApbzS8lsm4DNwNxke6Bydsg7mb9fFA1cRhLj023/5lvafnaClpXEzJaNyazPAwODljGZ8wzg\nE+DMDm1BzDmfxInQZmAJcFouOfXBIhGRkNBFURGRkFBBFxEJCRV0EZGQUEEXEQkJFXQRkZBQQRcR\nCQkVdBGRkFBBFxEJif8PyICZLq7GVloAAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "plt.hist(data[data.Survived==1].Age, bins=np.arange(0, 82, 2), alpha=.5)\n", + "plt.hist(data[data.Survived==0].Age, bins=np.arange(0, 82, 2), alpha=.5)\n", + "plt.hist(data[data.Survived==1].Age - data[data.Survived==0].Age, bins=np.arange(0, 82, 2))\n", + "plt.legend(['survived', 'died'])\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Investigate basic survival rates among males/females, and adults/children." + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "% of males who survived: 18.8908145581\n", + "% of females who survived: 74.2038216561\n" + ] + } + ], + "source": [ + "female_surv, male_surv = data.groupby('Sex').Survived.mean()\n", + "print '% of males who survived:', male_surv*100\n", + "print '% of females who survived:', female_surv*100" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "% of children under 16 who survived: 0.590361445783\n", + "% of adults 16 and older who survived: 0.362623762376\n" + ] + } + ], + "source": [ + "over_16_surv, under_16_surv = data.groupby(data.Age<16).Survived.mean()\n", + "print '% of children under 16 who survived:', under_16_surv\n", + "print '% of adults 16 and older who survived:', over_16_surv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Investigate how the intersection of age and sex affect survival chances." + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "A male child is 3.20369318182 more likely to surive the crash than a male adult.\n", + "A female child, however, is only 0.860805445264 more likely to survive the crash than an adult female.\n", + "An adult female is 4.61611036565 more likely to survive the crash than an adult male.\n" + ] + } + ], + "source": [ + "fem_adult, fem_child, male_adult, male_child = data.groupby(['Sex', data.Age<16]).mean().Survived\n", + "\n", + "print 'A male child is', male_child/male_adult, 'more likely to surive the crash than a male adult.' \n", + "print 'A female child, however, is only', fem_child/fem_adult, 'more likely to survive the crash than an adult female.'\n", + "print 'An adult female is', fem_adult/male_adult, 'more likely to survive the crash than an adult male.'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Investigate whether males with families were more likely to survive." + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Survival rate of males with / without families: 0.285714285714 / 0.451612903226\n", + "Survival rate of females with / without families: 0.583333333333 / 0.769230769231\n" + ] + } + ], + "source": [ + "# fem_fam, male_fam \n", + "fam = data[(data.SibSp != 0) & (data.Parch != 0)]\n", + "lone = data[(data.SibSp == 1) & (data.Parch == 1)]\n", + "fem_fam = fam[fam.Sex=='female'].Survived\n", + "male_fam = fam[fam.Sex=='male'].Survived\n", + "fem_lone = lone[fam.Sex=='female'].Survived\n", + "male_lone = lone[fam.Sex=='male'].Survived\n", + "\n", + "print 'Survival rate of males with / without families:', male_fam.mean(), '/', male_lone.mean()\n", + "print 'Survival rate of females with / without families:', fem_fam.mean(), '/', fem_lone.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Passengers traveling without families tended to survive more than those travelling with them." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/model_iteration_1.ipynb b/model_iteration_1.ipynb new file mode 100644 index 0000000..ff2a09d --- /dev/null +++ b/model_iteration_1.ipynb @@ -0,0 +1,169 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas, math\n", + "from sklearn.linear_model import LinearRegression, LogisticRegression\n", + "import sklearn.cross_validation as cv\n", + "import numpy as np\n", + "\n", + "titanic = pandas.read_csv(\"train.csv\")\n", + "titanic_test = pandas.read_csv('test.csv')\n", + "\n", + "titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())\n", + "titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())\n", + "\n", + "# titanic.loc[titanic[\"Sex\"] == \"male\", \"Sex\"] = 0\n", + "# titanic.loc[titanic['Sex'] == 'female', 'Sex'] = 1\n", + "titanic.Sex.replace(['male', 'female'], [0, 1], inplace=True)\n", + "titanic_test.Sex.replace(['male', 'female'], [0, 1], inplace=True)\n", + "# titanic_test.loc[titanic_test[\"Sex\"] == \"male\", \"Sex\"] = 0\n", + "# titanic_test.loc[titanic_test['Sex'] == 'female', 'Sex'] = 1\n", + "\n", + "\n", + "titanic.Embarked.fillna('S', inplace=True)\n", + "titanic_test.Embarked.fillna('S', inplace=True)\n", + "\n", + "# titanic.loc[titanic.Embarked == 'S', 'Embarked'] = 0\n", + "# titanic.loc[titanic.Embarked == 'C', 'Embarked'] = 1\n", + "# titanic.loc[titanic.Embarked == 'Q', 'Embarked'] = 2\n", + "titanic['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2], inplace=True)\n", + "titanic_test['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2], inplace=True)\n", + "# titanic_test.loc[titanic_test.Embarked == 'S', 'Embarked'] = 0\n", + "# titanic_test.loc[titanic_test.Embarked == 'C', 'Embarked'] = 1\n", + "# titanic_test.loc[titanic_test.Embarked == 'Q', 'Embarked'] = 2\n", + "\n", + "titanic_test['Fare'].fillna(titanic_test['Fare'].median(), inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Testing and training data has been cleaned. Time to make some changes to the model." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "titanic['LogAge'] = [math.log(val) for val in titanic['Age']]\n", + "titanic_test['LogAge'] = [math.log(val) for val in titanic_test['Age']]\n", + "\n", + "titanic['Male'] = titanic.Sex\n", + "titanic['Female'] = titanic.Male\n", + "titanic['Female'].replace([0, 1], [0,1], inplace=True)\n", + "\n", + "titanic_test['Male'] = titanic_test.Sex\n", + "titanic_test['Female'] = titanic_test.Male\n", + "titanic_test['Female'].replace([0, 1], [0,1], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Create a LogAge column which records the logarithm of the passenger's age, because a given change in age does not have the same effect across the entire distribution. \n", + "Ideally, I would create a transformation that gave more resolution/weight to ages 0-20, and also 50-80. Differences in ages in the middle probably have less of an effect on survival.\n", + "Remove \"ParCh\" from the list of predictors." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.802469135802\n" + ] + } + ], + "source": [ + "predictors = [\"Pclass\", \"Sex\", \"LogAge\", \"Fare\", \"SibSp\", \"Embarked\"]\n", + "alg = LogisticRegression(random_state=1)\n", + "scores = cv.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"], cv=3)\n", + "print scores.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "alg.fit(titanic[predictors], titanic[\"Survived\"])\n", + "predictions = alg.predict(titanic_test[predictors])\n", + "\n", + "submission = pandas.DataFrame({\n", + " \"PassengerId\": titanic_test[\"PassengerId\"],\n", + " \"Survived\": predictions\n", + " })\n", + " \n", + "submission.to_csv(\"kaggle.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The new kaggle score is .77033" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/model_iteration_2.ipynb b/model_iteration_2.ipynb new file mode 100644 index 0000000..9b29d46 --- /dev/null +++ b/model_iteration_2.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I read through both Dataquest tutorials, as well as some more tutorials on the kaggle forums. All of them seemed to revolve around using the same variables." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Instead of using age, like most of the tutorials, I've turned that continuous variable into a discrete 'Adult' variable, based on whether a passenger was 16 or older. I use a FamilySize variable that is the sum of SibSp and Parch, and I've turned the Embarked and Sex categories into integers instead of strings." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [], + "source": [ + "import pandas, math\n", + "import numpy as np\n", + "from sklearn import cross_validation\n", + "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "titanic = pandas.read_csv('train.csv')\n", + "titanic_test = pandas.read_csv('test.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Clean up data, replace strings with integers, create new variables" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#Create FamSize variable as sum of Parch and SibSp values\n", + "titanic['FamSize'] = titanic.SibSp + titanic.Parch\n", + "titanic_test['FamSize'] = titanic_test.SibSp + titanic_test.Parch\n", + "\n", + "#Replace male/female strings with 0/1\n", + "titanic.Sex.replace(['male', 'female'], [0, 1], inplace=True)\n", + "titanic_test.Sex.replace(['male', 'female'], [0, 1], inplace=True)\n", + "\n", + "#Fill empty values with the median age of the data\n", + "titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())\n", + "titanic_test['Age'] = titanic_test['Age'].fillna(titanic['Age'].median())\n", + "\n", + "#Create a variable which transforms age to log(age)\n", + "titanic['LogAge'] = titanic['Age'].apply(lambda x: math.log(x))\n", + "titanic_test['LogAge'] = titanic_test['Age'].apply(lambda x: math.log(x))\n", + "\n", + "#Create a variable that is the length of a passenger's name\n", + "titanic['NameLen'] = titanic['Name'].apply(lambda x: len(x))\n", + "titanic_test['NameLen'] = titanic['Name'].apply(lambda x: len(x))\n", + "\n", + "#Fill in missing Fare values with median value\n", + "titanic_test['Fare'].fillna(titanic_test['Fare'].median(), inplace=True)\n", + "\n", + "#Create Adult variable based on whether a passenger's age is greater/less than 16\n", + "titanic['Adult'] = titanic['Age'].apply(lambda(x): 0 if x<16.0 else 1)\n", + "titanic_test['Adult'] = titanic_test['Age'].apply(lambda(x): 0 if x<16.0 else 1)\n", + "\n", + "#Fill missing embarked values with most common part\n", + "titanic.Embarked.fillna('S', inplace=True)\n", + "titanic_test.Embarked.fillna('S', inplace=True)\n", + "\n", + "#Replace string values of ports with ints\n", + "titanic['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2], inplace=True)\n", + "titanic_test['Embarked'].replace(['S', 'C', 'Q'], [0, 1, 2], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use a logistic regression algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.79012345679\n" + ] + } + ], + "source": [ + "predictors = [\"Sex\", \"Adult\", \"Embarked\", \"FamSize\"]\n", + "alg = LogisticRegression(random_state=1)\n", + "#Test Logistic regression model with given predictors and print score\n", + "scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"], cv=3)\n", + "print scores.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Use a Random Forest algorithm" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.824915824916\n" + ] + } + ], + "source": [ + "alg = RandomForestClassifier(random_state=1, n_estimators=120, min_samples_split=3, min_samples_leaf=4)\n", + "#Test Random Forest model with given predictors and print score\n", + "scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"], cv=3)\n", + "print(scores.mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": false + }, + "source": [ + "Besides using different variables as parameters in the Random Forest, I also tried to modify the parameters of the Random Forest, to little effect. The most dramatic effect was in modifying the number of estimators, and it almost always worsened the performance." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "alg.fit(titanic[predictors], titanic[\"Survived\"])\n", + "predictions = alg.predict(titanic_test[predictors])\n", + "#Create a submission using alg and predictors, export to csv\n", + "submission = pandas.DataFrame({\n", + " \"PassengerId\": titanic_test[\"PassengerId\"],\n", + " \"Survived\": predictions\n", + " })\n", + "\n", + "submission.to_csv(\"kaggle.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Best score = .78469 using a Random Forest with Sex, Adult, Embarked, FamSize, 120 estimators, minimum split of 3, minimum sample of 4" + ] + }, + { + "cell_type": "code", + "execution_count": 380, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.79797979798\n" + ] + } + ], + "source": [ + "#Ensembling code pulled straight from tutorial\n", + "algorithms = [\n", + " [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],\n", + " [LogisticRegression(random_state=1), predictors]\n", + "]\n", + "\n", + "scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"], cv=3)\n", + "print(scores.mean())\n", + "\n", + "full_predictions = []\n", + "for alg, predictors in algorithms:\n", + " alg.fit(titanic[predictors], titanic[\"Survived\"])\n", + " predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:,1]\n", + " full_predictions.append(predictions)\n", + "\n", + "# The gradient boosting classifier generates better predictions, so we weight it higher.\n", + "predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4\n", + "predictions[predictions>.5] = 1\n", + "predictions[predictions<=.5] = 0\n", + "predictions = predictions.astype(int)\n", + "submission = pandas.DataFrame({'PassengerId': titanic_test['PassengerId'], 'Survived':predictions})\n", + "submission.to_csv(\"kaggle.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Best score = .78947 using an ensemble of Gradient Boosting and Logistic Regression. Parameters were Sex, Adult, Embarked, FamSize, and Pclass." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It was very difficult to correlate an improvement in training data score with a correlation in kaggle submission scores. When I was using the Random Forest, reducing the number of parameters almost always gave a better performance (up to a point). This makes sense because I think complex models tend to overfit to the training data. I also just read The Evolution of Cooperation by Robert Axelrod, and that book gives plenty of examples for why models should be as simple as possible." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}