diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8957135 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.pyc +/submit_to_kaggle diff --git a/.ipynb_checkpoints/exploration-checkpoint.ipynb b/.ipynb_checkpoints/exploration-checkpoint.ipynb new file mode 100644 index 0000000..c9157c9 --- /dev/null +++ b/.ipynb_checkpoints/exploration-checkpoint.ipynb @@ -0,0 +1,2516 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Emily Wang | Data Science Warmup Project | Spring 2016\n", + "\n", + "Includes stories/thoughts throughout the process, summary statistics, plots, and more!\n", + "\n", + "[Link to data and variable descriptions page on kaggle](https://www.kaggle.com/c/titanic/data?train.csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import pprint as pp\n", + "import thinkstats2\n", + "import thinkplot" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "data = pd.read_csv('./data/train.csv')\n", + "\n", + "# Variable descriptions \n", + "# survival Survival\n", + "# (0 = No; 1 = Yes)\n", + "# pclass Passenger Class\n", + "# (1 = 1st; 2 = 2nd; 3 = 3rd)\n", + "# name Name\n", + "# sex Sex\n", + "# age Age\n", + "# sibsp Number of Siblings/Spouses Aboard\n", + "# parch Number of Parents/Children Aboard\n", + "# ticket Ticket Number\n", + "# fare Passenger Fare\n", + "# cabin Cabin\n", + "# embarked Port of Embarkation\n", + "# (C = Cherbourg; Q = Queenstown; S = Southampton)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale2210A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female3810PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale2600STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female351011380353.1000C123S
4503Allen, Mr. William Henrymale35003734508.0500NaNS
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
6701McCarthy, Mr. Timothy Jmale54001746351.8625E46S
7803Palsson, Master. Gosta Leonardmale23134990921.0750NaNS
8913Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)female270234774211.1333NaNS
91012Nasser, Mrs. Nicholas (Adele Achem)female141023773630.0708NaNC
101113Sandstrom, Miss. Marguerite Rutfemale411PP 954916.7000G6S
111211Bonnell, Miss. Elizabethfemale580011378326.5500C103S
121303Saundercock, Mr. William Henrymale2000A/5. 21518.0500NaNS
131403Andersson, Mr. Anders Johanmale391534708231.2750NaNS
141503Vestrom, Miss. Hulda Amanda Adolfinafemale14003504067.8542NaNS
151612Hewlett, Mrs. (Mary D Kingcome)female550024870616.0000NaNS
161703Rice, Master. Eugenemale24138265229.1250NaNQ
171812Williams, Mr. Charles EugenemaleNaN0024437313.0000NaNS
181903Vander Planke, Mrs. Julius (Emelia Maria Vande...female311034576318.0000NaNS
192013Masselmani, Mrs. FatimafemaleNaN0026497.2250NaNC
202102Fynney, Mr. Joseph Jmale350023986526.0000NaNS
212212Beesley, Mr. Lawrencemale340024869813.0000D56S
222313McGowan, Miss. Anna \"Annie\"female15003309238.0292NaNQ
232411Sloper, Mr. William Thompsonmale280011378835.5000A6S
242503Palsson, Miss. Torborg Danirafemale83134990921.0750NaNS
252613Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...female381534707731.3875NaNS
262703Emir, Mr. Farred ChehabmaleNaN0026317.2250NaNC
272801Fortune, Mr. Charles Alexandermale193219950263.0000C23 C25 C27S
282913O'Dwyer, Miss. Ellen \"Nellie\"femaleNaN003309597.8792NaNQ
293003Todoroff, Mr. LaliomaleNaN003492167.8958NaNS
.......................................
86186202Giles, Mr. Frederick Edwardmale21102813411.5000NaNS
86286311Swift, Mrs. Frederick Joel (Margaret Welles Ba...female48001746625.9292D17S
86386403Sage, Miss. Dorothy Edith \"Dolly\"femaleNaN82CA. 234369.5500NaNS
86486502Gill, Mr. John Williammale240023386613.0000NaNS
86586612Bystrom, Mrs. (Karolina)female420023685213.0000NaNS
86686712Duran y More, Miss. Asuncionfemale2710SC/PARIS 214913.8583NaNC
86786801Roebling, Mr. Washington Augustus IImale3100PC 1759050.4958A24S
86886903van Melkebeke, Mr. PhilemonmaleNaN003457779.5000NaNS
86987013Johnson, Master. Harold Theodormale41134774211.1333NaNS
87087103Balkic, Mr. Cerinmale26003492487.8958NaNS
87187211Beckwith, Mrs. Richard Leonard (Sallie Monypeny)female47111175152.5542D35S
87287301Carlsson, Mr. Frans Olofmale33006955.0000B51 B53 B55S
87387403Vander Cruyssen, Mr. Victormale47003457659.0000NaNS
87487512Abelson, Mrs. Samuel (Hannah Wizosky)female2810P/PP 338124.0000NaNC
87587613Najib, Miss. Adele Kiamie \"Jane\"female150026677.2250NaNC
87687703Gustafsson, Mr. Alfred Ossianmale200075349.8458NaNS
87787803Petroff, Mr. Nedeliomale19003492127.8958NaNS
87887903Laleff, Mr. KristomaleNaN003492177.8958NaNS
87988011Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)female56011176783.1583C50C
88088112Shelley, Mrs. William (Imanita Parrish Hall)female250123043326.0000NaNS
88188203Markun, Mr. Johannmale33003492577.8958NaNS
88288303Dahlberg, Miss. Gerda Ulrikafemale2200755210.5167NaNS
88388402Banfield, Mr. Frederick Jamesmale2800C.A./SOTON 3406810.5000NaNS
88488503Sutehall, Mr. Henry Jrmale2500SOTON/OQ 3920767.0500NaNS
88588603Rice, Mrs. William (Margaret Norton)female390538265229.1250NaNQ
88688702Montvila, Rev. Juozasmale270021153613.0000NaNS
88788811Graham, Miss. Margaret Edithfemale190011205330.0000B42S
88888903Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN12W./C. 660723.4500NaNS
88989011Behr, Mr. Karl Howellmale260011136930.0000C148C
89089103Dooley, Mr. Patrickmale32003703767.7500NaNQ
\n", + "

891 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "5 6 0 3 \n", + "6 7 0 1 \n", + "7 8 0 3 \n", + "8 9 1 3 \n", + "9 10 1 2 \n", + "10 11 1 3 \n", + "11 12 1 1 \n", + "12 13 0 3 \n", + "13 14 0 3 \n", + "14 15 0 3 \n", + "15 16 1 2 \n", + "16 17 0 3 \n", + "17 18 1 2 \n", + "18 19 0 3 \n", + "19 20 1 3 \n", + "20 21 0 2 \n", + "21 22 1 2 \n", + "22 23 1 3 \n", + "23 24 1 1 \n", + "24 25 0 3 \n", + "25 26 1 3 \n", + "26 27 0 3 \n", + "27 28 0 1 \n", + "28 29 1 3 \n", + "29 30 0 3 \n", + ".. ... ... ... \n", + "861 862 0 2 \n", + "862 863 1 1 \n", + "863 864 0 3 \n", + "864 865 0 2 \n", + "865 866 1 2 \n", + "866 867 1 2 \n", + "867 868 0 1 \n", + "868 869 0 3 \n", + "869 870 1 3 \n", + "870 871 0 3 \n", + "871 872 1 1 \n", + "872 873 0 1 \n", + "873 874 0 3 \n", + "874 875 1 2 \n", + "875 876 1 3 \n", + "876 877 0 3 \n", + "877 878 0 3 \n", + "878 879 0 3 \n", + "879 880 1 1 \n", + "880 881 1 2 \n", + "881 882 0 3 \n", + "882 883 0 3 \n", + "883 884 0 2 \n", + "884 885 0 3 \n", + "885 886 0 3 \n", + "886 887 0 2 \n", + "887 888 1 1 \n", + "888 889 0 3 \n", + "889 890 1 1 \n", + "890 891 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 \n", + "2 Heikkinen, Miss. Laina female 26 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 \n", + "4 Allen, Mr. William Henry male 35 0 \n", + "5 Moran, Mr. James male NaN 0 \n", + "6 McCarthy, Mr. Timothy J male 54 0 \n", + "7 Palsson, Master. Gosta Leonard male 2 3 \n", + "8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27 0 \n", + "9 Nasser, Mrs. Nicholas (Adele Achem) female 14 1 \n", + "10 Sandstrom, Miss. Marguerite Rut female 4 1 \n", + "11 Bonnell, Miss. Elizabeth female 58 0 \n", + "12 Saundercock, Mr. William Henry male 20 0 \n", + "13 Andersson, Mr. 
Anders Johan male 39 1 \n", + "14 Vestrom, Miss. Hulda Amanda Adolfina female 14 0 \n", + "15 Hewlett, Mrs. (Mary D Kingcome) female 55 0 \n", + "16 Rice, Master. Eugene male 2 4 \n", + "17 Williams, Mr. Charles Eugene male NaN 0 \n", + "18 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31 1 \n", + "19 Masselmani, Mrs. Fatima female NaN 0 \n", + "20 Fynney, Mr. Joseph J male 35 0 \n", + "21 Beesley, Mr. Lawrence male 34 0 \n", + "22 McGowan, Miss. Anna \"Annie\" female 15 0 \n", + "23 Sloper, Mr. William Thompson male 28 0 \n", + "24 Palsson, Miss. Torborg Danira female 8 3 \n", + "25 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38 1 \n", + "26 Emir, Mr. Farred Chehab male NaN 0 \n", + "27 Fortune, Mr. Charles Alexander male 19 3 \n", + "28 O'Dwyer, Miss. Ellen \"Nellie\" female NaN 0 \n", + "29 Todoroff, Mr. Lalio male NaN 0 \n", + ".. ... ... ... ... \n", + "861 Giles, Mr. Frederick Edward male 21 1 \n", + "862 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48 0 \n", + "863 Sage, Miss. Dorothy Edith \"Dolly\" female NaN 8 \n", + "864 Gill, Mr. John William male 24 0 \n", + "865 Bystrom, Mrs. (Karolina) female 42 0 \n", + "866 Duran y More, Miss. Asuncion female 27 1 \n", + "867 Roebling, Mr. Washington Augustus II male 31 0 \n", + "868 van Melkebeke, Mr. Philemon male NaN 0 \n", + "869 Johnson, Master. Harold Theodor male 4 1 \n", + "870 Balkic, Mr. Cerin male 26 0 \n", + "871 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47 1 \n", + "872 Carlsson, Mr. Frans Olof male 33 0 \n", + "873 Vander Cruyssen, Mr. Victor male 47 0 \n", + "874 Abelson, Mrs. Samuel (Hannah Wizosky) female 28 1 \n", + "875 Najib, Miss. Adele Kiamie \"Jane\" female 15 0 \n", + "876 Gustafsson, Mr. Alfred Ossian male 20 0 \n", + "877 Petroff, Mr. Nedelio male 19 0 \n", + "878 Laleff, Mr. Kristo male NaN 0 \n", + "879 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56 0 \n", + "880 Shelley, Mrs. 
William (Imanita Parrish Hall) female 25 0 \n", + "881 Markun, Mr. Johann male 33 0 \n", + "882 Dahlberg, Miss. Gerda Ulrika female 22 0 \n", + "883 Banfield, Mr. Frederick James male 28 0 \n", + "884 Sutehall, Mr. Henry Jr male 25 0 \n", + "885 Rice, Mrs. William (Margaret Norton) female 39 0 \n", + "886 Montvila, Rev. Juozas male 27 0 \n", + "887 Graham, Miss. Margaret Edith female 19 0 \n", + "888 Johnston, Miss. Catherine Helen \"Carrie\" female NaN 1 \n", + "889 Behr, Mr. Karl Howell male 26 0 \n", + "890 Dooley, Mr. Patrick male 32 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S \n", + "5 0 330877 8.4583 NaN Q \n", + "6 0 17463 51.8625 E46 S \n", + "7 1 349909 21.0750 NaN S \n", + "8 2 347742 11.1333 NaN S \n", + "9 0 237736 30.0708 NaN C \n", + "10 1 PP 9549 16.7000 G6 S \n", + "11 0 113783 26.5500 C103 S \n", + "12 0 A/5. 2151 8.0500 NaN S \n", + "13 5 347082 31.2750 NaN S \n", + "14 0 350406 7.8542 NaN S \n", + "15 0 248706 16.0000 NaN S \n", + "16 1 382652 29.1250 NaN Q \n", + "17 0 244373 13.0000 NaN S \n", + "18 0 345763 18.0000 NaN S \n", + "19 0 2649 7.2250 NaN C \n", + "20 0 239865 26.0000 NaN S \n", + "21 0 248698 13.0000 D56 S \n", + "22 0 330923 8.0292 NaN Q \n", + "23 0 113788 35.5000 A6 S \n", + "24 1 349909 21.0750 NaN S \n", + "25 5 347077 31.3875 NaN S \n", + "26 0 2631 7.2250 NaN C \n", + "27 2 19950 263.0000 C23 C25 C27 S \n", + "28 0 330959 7.8792 NaN Q \n", + "29 0 349216 7.8958 NaN S \n", + ".. ... ... ... ... ... \n", + "861 0 28134 11.5000 NaN S \n", + "862 0 17466 25.9292 D17 S \n", + "863 2 CA. 
2343 69.5500 NaN S \n", + "864 0 233866 13.0000 NaN S \n", + "865 0 236852 13.0000 NaN S \n", + "866 0 SC/PARIS 2149 13.8583 NaN C \n", + "867 0 PC 17590 50.4958 A24 S \n", + "868 0 345777 9.5000 NaN S \n", + "869 1 347742 11.1333 NaN S \n", + "870 0 349248 7.8958 NaN S \n", + "871 1 11751 52.5542 D35 S \n", + "872 0 695 5.0000 B51 B53 B55 S \n", + "873 0 345765 9.0000 NaN S \n", + "874 0 P/PP 3381 24.0000 NaN C \n", + "875 0 2667 7.2250 NaN C \n", + "876 0 7534 9.8458 NaN S \n", + "877 0 349212 7.8958 NaN S \n", + "878 0 349217 7.8958 NaN S \n", + "879 1 11767 83.1583 C50 C \n", + "880 1 230433 26.0000 NaN S \n", + "881 0 349257 7.8958 NaN S \n", + "882 0 7552 10.5167 NaN S \n", + "883 0 C.A./SOTON 34068 10.5000 NaN S \n", + "884 0 SOTON/OQ 392076 7.0500 NaN S \n", + "885 5 382652 29.1250 NaN Q \n", + "886 0 211536 13.0000 NaN S \n", + "887 0 112053 30.0000 B42 S \n", + "888 2 W./C. 6607 23.4500 NaN S \n", + "889 0 111369 30.0000 C148 C \n", + "890 0 370376 7.7500 NaN Q \n", + "\n", + "[891 rows x 12 columns]" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data # look at the dataframe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Brain dump: Intuition, Plans, Feels --> Guesses to confirm or deny via plots\n", + "\n", + "#### There must be some trend\n", + "* Age and survival almost certainly have a relationship -- the oldest and youngest people probably had trouble, unless there was a healthy adult looking out for them and helping them out? Let's look into survival and age with all the data, age and survival with the youngest, oldest, teens, adults, etc.\n", + "* There must be some relationship between pclass and survival -- were the emergency resources better for certain classes more so than others? Some possibilities include higher quality resources, higher quantity of resources, closeness of the pclass seatings/cabin to the escape resources, etc. 
Needs more contextual knowledge to confirm.\n", + "* sibsp and parch could be useful features because family members tend to look out for each other and escape together. Perhaps there is strength in numbers when it comes to surviving the Titanic?\n", + "\n", + "#### Uncertain given current information and context\n", + "* Ticket number does not seem to be relevant for discovering trends (it's probably a hash function of some sort to ensure it's a unique number in the registration process...)\n", + "* Uncertain of the relationship between embarked and survival -- might be some hidden underpinnings in the health/human factors of people who went from the different ports (i.e. people from Southampton are more physically fit and able to survive for some non-obvious reason??)\n", + "* Uncertain of the relationship between name and survival -- maybe the model could incorporate some pattern between family members and survival. Did families stick together and survive or die together, or did they scatter? In the former case the last name might be useful in the predictive model. This might be tedious to investigate in a visualization but could provide another feature engineering idea for the modeling phase.\n", + "* Fare might contain some non-obvious information about the personality and human aspects of the passengers, but might not be useful in the long run for the model... Will confirm this to ensure that the model doesn't include extra variables (because more dimensionality means that more data is needed to train a good model... #curseofdimensionality).\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Finding some extremes and info in numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Youngest passengers? Did they survive?\n", + "\n", + "# Oldest passengers? 
Did they survive?\n", + "\n", + "# pclass that survived the most?\n", + "\n", + "# Investigating the families -- sibsp\n", + "\n", + "# Investigating the married couples) -- parch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary statistics using groupby and comments that inspired further exploration work\n" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedAgeSibSpParchFare
Pclass
1461.5972220.62963038.2334410.4166670.35648184.154687
2445.9565220.47282629.8776300.4021740.38043520.662183
3439.1547860.24236325.1406200.6150710.39307513.675550
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Age SibSp Parch Fare\n", + "Pclass \n", + "1 461.597222 0.629630 38.233441 0.416667 0.356481 84.154687\n", + "2 445.956522 0.472826 29.877630 0.402174 0.380435 20.662183\n", + "3 439.154786 0.242363 25.140620 0.615071 0.393075 13.675550" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('Pclass').mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "^ Looks like people in first class were the best at surviving. Reasonable guess is that they had the best emergency resources or access to them. There could also be other features that confirm this trend; will investigate more in feature engineering phase.\n", + "\n", + "===" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdPclassAgeSibSpParchFare
Survived
0447.0163932.53187630.6261790.5537340.32969022.117887
1444.3684211.95029228.3436900.4736840.46491248.395408
\n", + "
" + ], + "text/plain": [ + " PassengerId Pclass Age SibSp Parch Fare\n", + "Survived \n", + "0 447.016393 2.531876 30.626179 0.553734 0.329690 22.117887\n", + "1 444.368421 1.950292 28.343690 0.473684 0.464912 48.395408" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('Survived').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassAgeSibSpParchFare
Sex
female431.0286620.7420382.15923627.9157090.6942680.64968244.479818
male454.1473140.1889082.38994830.7266450.4298090.23570225.523893
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass Age SibSp Parch \\\n", + "Sex \n", + "female 431.028662 0.742038 2.159236 27.915709 0.694268 0.649682 \n", + "male 454.147314 0.188908 2.389948 30.726645 0.429809 0.235702 \n", + "\n", + " Fare \n", + "Sex \n", + "female 44.479818 \n", + "male 25.523893 " + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('Sex').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassAgeParchFare
SibSp
0455.3700660.3453952.35197431.3975580.18585525.692028
1439.7272730.5358852.05741630.0897270.65550244.147370
2412.4285710.4642862.35714322.6200000.64285751.753718
3321.5625000.2500002.56250013.9166671.31250068.908862
4381.6111110.1666673.0000007.0555561.50000031.855556
5336.8000000.0000003.00000010.2000002.00000046.900000
8481.7142860.0000003.000000NaN2.00000069.550000
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass Age Parch Fare\n", + "SibSp \n", + "0 455.370066 0.345395 2.351974 31.397558 0.185855 25.692028\n", + "1 439.727273 0.535885 2.057416 30.089727 0.655502 44.147370\n", + "2 412.428571 0.464286 2.357143 22.620000 0.642857 51.753718\n", + "3 321.562500 0.250000 2.562500 13.916667 1.312500 68.908862\n", + "4 381.611111 0.166667 3.000000 7.055556 1.500000 31.855556\n", + "5 336.800000 0.000000 3.000000 10.200000 2.000000 46.900000\n", + "8 481.714286 0.000000 3.000000 NaN 2.000000 69.550000" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('SibSp').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassAgeSibSpFare
Parch
0445.2551620.3436582.32153432.1785030.23746325.586774
1465.1101690.5508472.20339024.4220001.08474646.778180
2416.6625000.5000002.27500017.2169122.06250064.337604
3579.2000000.6000002.60000033.2000001.00000025.951660
4384.0000000.0000002.50000044.5000000.75000084.968750
5435.2000000.2000003.00000039.2000000.60000032.550000
6679.0000000.0000003.00000043.0000001.00000046.900000
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass Age SibSp Fare\n", + "Parch \n", + "0 445.255162 0.343658 2.321534 32.178503 0.237463 25.586774\n", + "1 465.110169 0.550847 2.203390 24.422000 1.084746 46.778180\n", + "2 416.662500 0.500000 2.275000 17.216912 2.062500 64.337604\n", + "3 579.200000 0.600000 2.600000 33.200000 1.000000 25.951660\n", + "4 384.000000 0.000000 2.500000 44.500000 0.750000 84.968750\n", + "5 435.200000 0.200000 3.000000 39.200000 0.600000 32.550000\n", + "6 679.000000 0.000000 3.000000 43.000000 1.000000 46.900000" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('Parch').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassSibSpParchFare
Age
0.42804.0000001.0000003.0000000.0000001.0000008.516700
0.67756.0000001.0000002.0000001.0000001.00000014.500000
0.75557.5000001.0000003.0000002.0000001.00000019.258300
0.83455.5000001.0000002.0000000.5000001.50000023.875000
0.92306.0000001.0000001.0000001.0000002.000000151.550000
1.00415.4285710.7142862.7142861.8571431.57142930.005957
2.00346.9000000.3000002.6000002.1000001.30000037.536250
3.00272.0000000.8333332.5000001.8333331.33333325.781950
4.00466.1000000.7000002.6000001.6000001.40000029.543330
5.00380.0000001.0000002.7500001.7500001.25000022.717700
6.00762.3333330.6666672.6666671.3333331.33333325.583333
7.00288.6666670.3333332.6666672.6666671.33333331.687500
8.00400.2500000.5000002.5000002.0000001.25000028.300000
9.00437.2500000.2500003.0000002.5000001.75000027.938538
10.00620.0000000.0000003.0000001.5000002.00000026.025000
11.00534.5000000.2500002.5000002.5000001.50000054.240625
12.00126.0000001.0000003.0000001.0000000.00000011.241700
13.00614.0000001.0000002.5000000.0000000.50000013.364600
14.00312.0000000.5000002.5000002.0000000.83333342.625700
14.50112.0000000.0000003.0000001.0000000.00000014.454200
15.00554.6000000.8000002.6000000.4000000.40000049.655020
16.00422.2941180.3529412.5294120.7647060.52941225.745100
17.00423.0000000.4615382.3846150.6153850.38461528.389423
18.00516.2692310.3461542.4615380.3846150.42307738.063462
19.00389.4000000.3600002.3600000.3200000.20000027.869496
20.00493.0666670.2000003.0000000.2000000.0666678.624173
20.50228.0000000.0000003.0000000.0000000.0000007.250000
21.00390.2083330.2083332.5833330.3333330.20833331.565621
22.00365.7407410.4074072.5555560.1481480.22222225.504781
23.00510.2666670.3333332.1333330.4000000.26666737.994720
.....................
44.00437.1111110.3333332.1111110.4444440.22222229.758333
45.00367.5000000.4166672.0000000.3333330.58333336.818408
45.50268.0000000.0000002.0000000.0000000.00000017.862500
46.00427.0000000.0000001.3333330.3333330.00000055.458333
47.00534.6666670.1111111.7777780.2222220.11111127.601389
48.00663.1111110.6666671.6666670.5555560.55555637.893067
49.00533.5000000.6666671.3333330.6666670.16666759.929183
50.00457.2000000.5000001.6000000.4000000.20000064.025830
51.00456.1428570.2857142.0000000.1428570.14285728.752386
52.00589.5000000.5000001.3333330.5000000.33333351.402783
53.00572.0000001.0000001.0000002.0000000.00000051.479200
54.00383.6250000.3750001.5000000.5000000.50000044.477087
55.00254.5000000.5000001.5000000.0000000.00000023.250000
55.50153.0000000.0000003.0000000.0000000.0000008.050000
56.00542.7500000.5000001.0000000.0000000.25000043.976025
57.00700.0000000.0000002.0000000.0000000.00000011.425000
58.00325.0000000.6000001.0000000.0000000.60000093.901660
59.00164.0000000.0000002.5000000.0000000.00000010.375000
60.00583.7500000.5000001.2500000.7500000.50000055.000000
61.00374.6666670.0000001.6666670.0000000.00000024.019433
62.00552.5000000.5000001.2500000.0000000.00000035.900000
63.00380.0000001.0000002.0000000.5000000.00000043.772900
64.00492.5000000.0000001.0000000.5000002.000000144.500000
65.00264.3333330.0000001.6666670.0000000.33333332.093067
66.0034.0000000.0000002.0000000.0000000.00000010.500000
70.00709.5000000.0000001.5000000.5000000.50000040.750000
70.50117.0000000.0000003.0000000.0000000.0000007.750000
71.00295.5000000.0000001.0000000.0000000.00000042.079200
74.00852.0000000.0000003.0000000.0000000.0000007.775000
80.00631.0000001.0000001.0000000.0000000.00000030.000000
\n", + "

88 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass SibSp Parch Fare\n", + "Age \n", + "0.42 804.000000 1.000000 3.000000 0.000000 1.000000 8.516700\n", + "0.67 756.000000 1.000000 2.000000 1.000000 1.000000 14.500000\n", + "0.75 557.500000 1.000000 3.000000 2.000000 1.000000 19.258300\n", + "0.83 455.500000 1.000000 2.000000 0.500000 1.500000 23.875000\n", + "0.92 306.000000 1.000000 1.000000 1.000000 2.000000 151.550000\n", + "1.00 415.428571 0.714286 2.714286 1.857143 1.571429 30.005957\n", + "2.00 346.900000 0.300000 2.600000 2.100000 1.300000 37.536250\n", + "3.00 272.000000 0.833333 2.500000 1.833333 1.333333 25.781950\n", + "4.00 466.100000 0.700000 2.600000 1.600000 1.400000 29.543330\n", + "5.00 380.000000 1.000000 2.750000 1.750000 1.250000 22.717700\n", + "6.00 762.333333 0.666667 2.666667 1.333333 1.333333 25.583333\n", + "7.00 288.666667 0.333333 2.666667 2.666667 1.333333 31.687500\n", + "8.00 400.250000 0.500000 2.500000 2.000000 1.250000 28.300000\n", + "9.00 437.250000 0.250000 3.000000 2.500000 1.750000 27.938538\n", + "10.00 620.000000 0.000000 3.000000 1.500000 2.000000 26.025000\n", + "11.00 534.500000 0.250000 2.500000 2.500000 1.500000 54.240625\n", + "12.00 126.000000 1.000000 3.000000 1.000000 0.000000 11.241700\n", + "13.00 614.000000 1.000000 2.500000 0.000000 0.500000 13.364600\n", + "14.00 312.000000 0.500000 2.500000 2.000000 0.833333 42.625700\n", + "14.50 112.000000 0.000000 3.000000 1.000000 0.000000 14.454200\n", + "15.00 554.600000 0.800000 2.600000 0.400000 0.400000 49.655020\n", + "16.00 422.294118 0.352941 2.529412 0.764706 0.529412 25.745100\n", + "17.00 423.000000 0.461538 2.384615 0.615385 0.384615 28.389423\n", + "18.00 516.269231 0.346154 2.461538 0.384615 0.423077 38.063462\n", + "19.00 389.400000 0.360000 2.360000 0.320000 0.200000 27.869496\n", + "20.00 493.066667 0.200000 3.000000 0.200000 0.066667 8.624173\n", + "20.50 228.000000 0.000000 3.000000 0.000000 0.000000 7.250000\n", + "21.00 390.208333 0.208333 
2.583333 0.333333 0.208333 31.565621\n", + "22.00 365.740741 0.407407 2.555556 0.148148 0.222222 25.504781\n", + "23.00 510.266667 0.333333 2.133333 0.400000 0.266667 37.994720\n", + "... ... ... ... ... ... ...\n", + "44.00 437.111111 0.333333 2.111111 0.444444 0.222222 29.758333\n", + "45.00 367.500000 0.416667 2.000000 0.333333 0.583333 36.818408\n", + "45.50 268.000000 0.000000 2.000000 0.000000 0.000000 17.862500\n", + "46.00 427.000000 0.000000 1.333333 0.333333 0.000000 55.458333\n", + "47.00 534.666667 0.111111 1.777778 0.222222 0.111111 27.601389\n", + "48.00 663.111111 0.666667 1.666667 0.555556 0.555556 37.893067\n", + "49.00 533.500000 0.666667 1.333333 0.666667 0.166667 59.929183\n", + "50.00 457.200000 0.500000 1.600000 0.400000 0.200000 64.025830\n", + "51.00 456.142857 0.285714 2.000000 0.142857 0.142857 28.752386\n", + "52.00 589.500000 0.500000 1.333333 0.500000 0.333333 51.402783\n", + "53.00 572.000000 1.000000 1.000000 2.000000 0.000000 51.479200\n", + "54.00 383.625000 0.375000 1.500000 0.500000 0.500000 44.477087\n", + "55.00 254.500000 0.500000 1.500000 0.000000 0.000000 23.250000\n", + "55.50 153.000000 0.000000 3.000000 0.000000 0.000000 8.050000\n", + "56.00 542.750000 0.500000 1.000000 0.000000 0.250000 43.976025\n", + "57.00 700.000000 0.000000 2.000000 0.000000 0.000000 11.425000\n", + "58.00 325.000000 0.600000 1.000000 0.000000 0.600000 93.901660\n", + "59.00 164.000000 0.000000 2.500000 0.000000 0.000000 10.375000\n", + "60.00 583.750000 0.500000 1.250000 0.750000 0.500000 55.000000\n", + "61.00 374.666667 0.000000 1.666667 0.000000 0.000000 24.019433\n", + "62.00 552.500000 0.500000 1.250000 0.000000 0.000000 35.900000\n", + "63.00 380.000000 1.000000 2.000000 0.500000 0.000000 43.772900\n", + "64.00 492.500000 0.000000 1.000000 0.500000 2.000000 144.500000\n", + "65.00 264.333333 0.000000 1.666667 0.000000 0.333333 32.093067\n", + "66.00 34.000000 0.000000 2.000000 0.000000 0.000000 10.500000\n", + "70.00 709.500000 0.000000 
1.500000 0.500000 0.500000 40.750000\n", + "70.50 117.000000 0.000000 3.000000 0.000000 0.000000 7.750000\n", + "71.00 295.500000 0.000000 1.000000 0.000000 0.000000 42.079200\n", + "74.00 852.000000 0.000000 3.000000 0.000000 0.000000 7.775000\n", + "80.00 631.000000 1.000000 1.000000 0.000000 0.000000 30.000000\n", + "\n", + "[88 rows x 6 columns]" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('Age').mean() " + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXIAAAEACAYAAACuzv3DAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADYtJREFUeJzt3G+MHHUdx/H3latCqaFWtK2AKamCiMZWjP+KMppCSqKI\nhqhNahoFNdEKYiIcxoQ9fWA1kRAl+sCjpKmIf0CaNgTTgkysISBgCy2lVmurVNurSqPVJ6KeD35z\nveXu6v6/me/d+5VsdmZud/Zzc7uf/d3vZg8kSZIkSZIkSZIkSZIkSZK66lTgUWAnsAf4arF9PrAN\n2AdsBeaVkk6S1JQ5xXU/8AhwMfB14IZi+43AuhJySZJaNAd4DLgQ2AssKLYvLNYlSRU1izS1cpw0\nEgc4Vvf1vnHrkqSKOoM0tfJuJhb3c1MfR5IEad67WX8D7gMuAoZJUypHgEXA0fE3XrJkycj+/fu7\nkVGSZpL9wKtbucOsBl8/k7EzUk4DLgV2AJuBNcX2NcCmCUn272dkZKRyl5tvvrn0DGYy00zMZabm\nLsCSVkocGo/IFwEbSIU/C9gIPFiU+Y+Aq4GDwIdafWBJUnc0KvJdwJsm2f4csKL7cSRJrWo0tTLt\nZFlWdoQJzNQcMzWvirnM1Dt9Pdz3SDHfI0lqUl9fH7TYzTNuRC5J041FLknBWeSSFJxFLknBWeSS\nFJxFLknBWeSSFJxFLknBWeSSFJxFLknBWeSSFJxFLknBWeSSFJxFLknBWeSSFJxFLknBWeSSFJxF\nLknBWeSSFJxFLknBWeSSFJxFLknBWeSSFJxFPgXWDgyVHUHSNGaRS1JwFrkkBdeoyM8BHgKeBnYD\n1xbba8AhYEdxWdmjfJKkBvobfP154HpgJzAXeALYBowAtxQXSVKJGhX5keIC8A/gGeCsYr2vV6Ek\nSc1rZY58MbAMeKRY/yzwJHA7MK+7sSRJzWq2yOcCdwPXkUbm3wHOBZYCh4Fv9CSdJKmhRlMrALOB\ne4DvAZuKbUfrvj4EbJnsjrVa7cRylmVkWdZORvXA2oEhblt3TdkxpBkvz3PyPO9oH42KvI80dbIH\nuLVu+yLSSBzgA8Cuye5cX+SSpInGD3IHBwdb3kejIl8OrAaeIp1mCPBFYBVpWmUEOAB8quVHliR1\nRaMi/wWTz6Pf34MskqQ2+MlOSQrOIpek4CxySQrOIq8Y/+WtpFZZ5JIUnEUuScFZ5JIUnEUuScFZ\n5JIUnEUuScFZ5JIUnEU+jXgOujQzWeSSFJxFLknBWeSSFJxFLknBWeSSFJxFLknBWeSaUp4iKXWf\nRS5JwVnkkhScRS5JwVnkkhScRS5J
wVnkkhScRS5JwVnkkhScRS5JwVnkkhRcoyI/B3gIeBrYDVxb\nbJ8PbAP2AVuBeb0KKEn6/xoV+fPA9cCFwNuAzwAXAAOkIj8PeLBYlySVoFGRHwF2Fsv/AJ4BzgKu\nADYU2zcAV/YknSSpoVbmyBcDy4BHgQXAcLF9uFiXJJWgv8nbzQXuAa4Djo/72khxmaBWq51YzrKM\nLMtaDihJ01me5+R53tE+miny2aQS3whsKrYNAwtJUy+LgKOT3bG+yCVJE40f5A4ODra8j0ZTK33A\n7cAe4Na67ZuBNcXyGsYKXpI0xRqNyJcDq4GngB3FtpuAdcCPgKuBg8CHepRPktRAoyL/BScfta/o\nchZJUhv8ZKckBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrlatnZg\nqOwIkupY5JIUnEUuScFZ5JIUnEUuScFZ5JIUnEUuScFZ5JIUnEUuScFZ5JIUnEUuScFZ5JIUnEUu\nScFZ5JIUnEUuScFZ5JIUnEUuScFZ5JIUXDNFvh4YBnbVbasBh4AdxWVl15NJkprSTJHfwcSiHgFu\nAZYVl592OZckqUnNFPl24Ngk2/u6nEWS1IZO5sg/CzwJ3A7M604cSVKr+tu833eALxfLXwG+AVw9\n/ka1Wu3EcpZlZFnW5sNJ0vSU5zl5nne0j3aL/Gjd8hCwZbIb1Re5JGmi8YPcwcHBlvfR7tTKorrl\nD/DCM1okSVOomRH5XcAlwJnAs8DNQAYsJZ29cgD4VI/ySZIaaKbIV02ybX23g0iS2uMnOyUpOItc\nkoKzyCUpOItclbV2YKjsCFIIFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkk\nBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeR\nS1JwFrkkBWeRS1JwzRT5emAY2FW3bT6wDdgHbAXmdT+aJKkZzRT5HcDKcdsGSEV+HvBgsS5JKkEz\nRb4dODZu2xXAhmJ5A3BlN0NJkprX7hz5AtJ0C8X1gu7EkSS1qr8L+xgpLhPUarUTy1mWkWVZFx5O\nqo61A0Pctu6asmMosDzPyfO8o320W+TDwELgCLAIODrZjeqLXJI00fhB7uDgYMv7aHdqZTOwplhe\nA2xqcz+SpA41U+R3AQ8D5wPPAh8D1gGXkk4/fE+xLkkqQTNTK6tOsn1FN4NIktrjJzslKTiLXJKC\ns8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8gl\nKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKbhKFPnagaGyIzQU\nIaOkmakSRS5Jap9FLknB9Xd4/4PA34H/AM8Db+k0kCSpNZ0W+QiQAc91HkWS1I5uTK30dWEfkqQ2\ndVrkI8ADwOPAJzqPI0lqVadTK8uBw8DLgW3AXmB7p6EkSc3rtMgPF9d/Bu4l/bHzRJHXarUTN8yy\njCzLOnw4qTlrB4a4bd01ZceQGsrznDzPO9pHJ0U+BzgFOA6cDlwGDNbfoL7IJUkTjR/kDg4OnvzG\nJ9FJkS8gjcJH93MnsLWD/UmS2tBJkR8AlnYriCSpPX6yU5KCs8glKTiLXJKCm9FF7r+mlTQdzOgi\nl6TpwCKXpOAsckkKziKXpOAsckkKziKXpODCF7mnEKpT0/05NN2/P02DIpekmc4il6TgLHJJCs4i\nl6TgLHJJCs4il6TgLHJJCm5KitzzWKvLn00MU/Vzmm6PM1M4Ipek4CxySQrOIpek4CxySQrOIpek\n4CxySQrOIm9RFU+bqmImTeTPSb1ikUtScBa5JAXXSZGvBPYCvwFu7E4cSVKr2i3yU4DbSGX+OmAV\n
cEG3QvVSnudlR5jATM2pYqY//uHXZUeYVBWPlZl6p90ifwvwW+Ag8DzwA+D9XcrUU1X8wZmpOVXM\nZJE3z0y9026RnwU8W7d+qNgmSZpi7Rb5SFdTSJLa1tfm/d4G1Ehz5AA3Af8FvlZ3m98CS9pOJkkz\n037g1VPxQP3Fgy0GXgTsJMgfOyVJYy4Hfk0aed9UchZJkiRJ9arwYaH1wDCwq27bfGAbsA/YCsyb\n4kznAA8BTwO7gWsrkOtU4FHS9Nge4KsVyDTqFGAHsKVCmQ4CTxW5flmRXPOAu4FnSD/Dt5ac6XzS\n8Rm9/I30XC/7ON1Eeu3tAr4PvLgCmQCuKzLtLpapQq5TSNMti4HZlDd//k5gGS8s8q8DNxTLNwLr\npjjTQmBpsTyXNDV1QQVyzSmu+4FHgIsrkAng88CdwOZivQqZDpBeZPXKzrUB+Hix3A+cUYFMo2YB\nh0mDmDIzLQZ+RypvgB8Ca0rOBPB6UkedSurObaSTRMrOxduBn9atDxSXMizmhUW+F1hQLC8s1su0\nCVhBdXLNAR4DLqxAprOBB4B3MzYiLzsTpCJ/2bhtZeY6g1RQ41XhWAFcBmwvlsvMNJ80cHop6c1u\nC3BpyZkArgLq/y3ml0gFXnYurgK+W7e+GvjWVIcoLOaFRX6sbrlv3PpUWwz8HngJ5eeaRfrN6Thp\nJEAFMv2Y9BvVJYwVedmZIJXmDuBx4BPFtjJzLSVNjd0B/Ir02ju95Ez11gOfLpbLzvRJ0nP8KLCx\nIpleS3qDmU8aSD0MfLPVXL3474dRPiw0QnlZ5wL3kObDjo/7Whm5/ksqhLOBd5FGwWVmei/pxbaD\nk3/Woayf33LSG8zlwGdIU3j1pjpXP/Am4NvF9T+Z+BtwWcfqRcD7SG/K4011piXA50gDqFeSXoOr\nS84EaaT9NdI8+P2kAdV/Ws3ViyL/I2k+bNQ5pI/wV8Ew6dcUgEWksphqs0klvpE0tVKVXJD+KHUf\ncFHJmd4BXEGaxrgLeA/peFXhOB0urv8M3Ev6v0Nl5jpUXB4r1u8mFfqREjONuhx4gnSsoNzj9GbS\naPevwL+Bn5CmgatwnNYX+S4hjbz30eKx6kWRPw68hrEPC32YsT9WlW0z6Q8cFNeb/s9te6EPuJ10\nZsGtFcl1JmN/ET+NNG+4o+RMXyQNAM4FPgL8DPhoyZkg/er7kmL5dNL8766Scx0h/d+j84r1FaQz\nM7aUmGnUKtIb8agyj9Ne0ifSTyO9DleQXodVOE6vKK5fBXyQdEZN2c91oBofFroL+BPwL9IT/WOk\neagHKO+UnotJ0xg7GTs1a2XJud5AmlvdSTqt7gvF9rKP1ahLGBsIlJ3pXNJx2kk6VWz0uV12rjeS\nRuRPkkaaZ1Qg0+nAXxh746MCmW5g7PTDDaTfjsvOBPDzItdOxqY1q5BLkiRJkiRJkiRJkiRJkiRJ\nkiRJUi/8D75zeGYODSEaAAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data_hist = thinkstats2.Hist(data.Age)\n", + "thinkplot.Hist(data_hist)\n", + "thinkplot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "^ Seems like it would be worth breaking down age a bit more... 
Let's look at the passengers one year old and under." + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW0AAAEACAYAAAB4ayemAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADOlJREFUeJzt3H2MHHUdx/H3tkcjD9KqJSBQhVRBNEZEqaiYjAYFDKBR\n/0AjIqTEPzwhxqeT+LBNfKjxCfXUP6ggQYU/QI0YUEEYpQFRoDSFAoEKAQERRVERIsj6x2/uene9\n6/z2bme23/b9Si6dvZ2b+dzO3edmv7tTkCRJkiRJkiRJkiRJkiRJO4lDgQ1TPh4DzhxqIklSlkXA\nQ8CKYQeRJNV7C7B+2CEkaVe1qM/1TwZ+1EQQSdJgLQEeAfYZdhBJ2lWN9LHu8cBNpOKetHLlyt6W\nLVsGGkqSdgFbgBf1+0WdPta9GLgCuGDG53u9Xq/f/bau2+3S7XaHHaOWOQfLnIPVVs7RsXXz/trf\nr/8Zq44+ad5fP7529by/th+dTgf662Agf6a9J3AM8ON+dyBJGpzc8cjjwPImg0iS6vX77pGwiqIY\ndoQs5hwscw5WhJwHvODQYUdoVN/zlFmEmGlLimMhM+2F2llm2pKkHYClLUmBWNqSFIilLUmBWNqS\nFIilLUmBWNqSFIilLUmBWNqSFIilLUmBWNqSFIilLUmBWNqSFIilLUmBWNqSFIilLUmBWNqSFIil\nLUmBWNqSFIilLUmBWNqSFEhOaS8DLgFuBzYDRzWaSJI0p5GMdb4BXA68q1p/z0YTSZLmVFfaS4E3\nAKdWt58GHms0kSRpTnXjkYOBR4DzgZuBc4E9mg4lSZpd3Zn2CHAEMAr8ATgHGAM+M3Wlbrc7uVwU\nBUVRDDKjJIVXliVlWS54O52a+/cDriedcQMcTSrtE6as0+v1egsOIkkTRsfWDW3f42tXt7KfTqcD\n9R28jbrxyJ+B+4FDqtvHALf1uxNJ0mDkvHvkQ8APgSXAFuC0RhNJkuaUU9obgSObDiJJqucVkZIU\niKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUt\nSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUyEjmevcC/wT+BzwFrGoq\nkCRpbrml3QMK4NHmokiS6vQzHuk0lkKSlCW3tHvAVcCNwBnNxZEkbU/ueOT1wEPAPsCVwB3AtRN3\ndrvdyRWLoqAoioEFlKSdQVmWlGW54O3MZ+TxWeDfwFer271er7fgIJI0YXRs3dD2Pb52dSv76XQ6\nMI8OzhmP7AE8u1reE3gLsKnfHUmSFi5nPLIv8JMp6/8Q+FVjiSRJc8op7XuAw5sOIkmq5xWRkhSI\npS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1J\ngVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhRIbmkvBjYAlzWYRZJUI7e0\nzwI2A70Gs0iSauSU9oHAW4F1QKfZOJKk7ckp7a8DHwOeaTiLJKnGSM39JwB/Ic2zi7lW6na7k8tF\nUVAUc64qSbuksiwpy3LB26kbd3wBOAV4GngWsDdwKfC+Kev0ej1H3ZIGZ3Rs3dD2Pb52dSv76XQ6\nMI+Rc9145GxgBXAwcDJwNdMLW5LUon7fp+0ptSQNUd1Me6rfVB+SpCHxikhJCsTSlqRALG1JCsTS\nl
qRALG1JCsTSlqRALG1JCsTSlqRALG1JCsTSlqRALG1JCsTSlqRALG1JCsTSlqRALG1JCsTSlqRA\nLG1JCsTSlqRALG1JCsTSlqRALG1JCsTSlqRALG1JCiSntJ8F3ADcAmwGvthoIknSnEYy1nkSeCPw\nn2r99cDR1b+SpBbljkf+U/27BFgMPNpMHEnS9uSW9iLSeORh4BrSmESS1LKc8QjAM8DhwFLgl0AB\nlBN3drvdyRWLoqAoigHFk6SdQ1mWlGW54O105vE1nwaeAL5S3e71er0FB5GkCaNj64a27/G1q1vZ\nT6fTgXl0cM54ZDmwrFreHXgzsKHfHUmSFi5nPPJ84AJSwS8CLgR+3WQoSdLsckp7E3BE00EkSfW8\nIlKSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uS\nArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSAskp7RXANcBtwK3A\nmY0mkiTNaSRjnaeADwO3AHsBNwFXArc3mEuSNIucM+0/kwob4N+kst6/sUSSpDn1O9M+CHglcMPg\no0iS6uSMRybsBVwCnEU6457U7XYnl4uioCiKAUSTmjM6tm4o+x1fu3qHyqD2lGVJWZYL3k5uae8G\nXAr8APjpzDunlrYkaVszT2jXrFkzr+3kjEc6wPeAzcA589qLJGkgckr79cB7gTcCG6qP45oMJUma\nXc54ZD1ehCNJOwTLWJICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBL\nW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5IC\nySnt84CHgU0NZ5Ek1cgp7fOB45oOIkmql1Pa1wJ/bzqIJKmeM21JCmRkEBvpdruTy0VRUBTFIDY7\nL6Nj64ay3/G1q4ey32g8Pjs+j1EzyrKkLMsFb2fgpS1J2tbME9o1a9bMazuORyQpkJzSvgi4DjgE\nuB84rdFEkqQ55YxH3t14CklSFscjkhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVja\nkhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSI\npS1JgeSU9nHAHcBdwCeajSNJ2p660l4MjJOK+6XAu4HDmg7VhAfuu3PYEbKUZTnsCFmi5Ixy3M05\nOBEyLkRdaa8C7gbuBZ4CLgbe1nCmRkQ5kFHKMErOKMfdnIMTIeNC1JX2AcD9U27/qfqcJGkI6kq7\n10oKSVKWTs39RwFd0kwb4JPAM8CXpqxzN7By4Mkkaee2BXjRoDc6Um34IGAJcAtBX4iUpF3F8cCd\npDPqTw45iyRJkrTzqbvI5m3ARmADcBPwpvaiTZN7MdCRwNPAO9oINYu6nAXwGOnx3AB8qrVk0+U8\nngUp461A2UqqbdXl/ChbH8tNpGO/rLV0SV3G5cAvSGPIW4H3t5ZsurqczwF+Qvp9vwF4WXvRJp0H\nPEw6lnP5Jul72Ai8so1Qs6jL+RLgeuBJ4COD3PFi0njkIGA3Zp9t7zll+eXV+m3LyTmx3tXAz4F3\nthVuxv7rchbAz1pNta2cnMuA24ADq9vL2wo3Re5xn3ACcFXzsabJydgFvlgtLwf+RnpdqU05Ob8M\nfLpaPpT2H0uAN5CKeK4yfCtwebX8GuB3bYSaRV3OfYBXA58js7Rz/++RnItsHp+yvBfw18xtD1Lu\nxUAfAi4BHmkt2XS5Oeve3dO0nJzvAS4lvYcfduzjPuE9wEXNx5omJ+NDwN7V8t6k0n66pXwTcnIe\nBlxTLd9JKvh92ok36Vrg79u5/yTggmr5BtLJxb5Nh5pFXc5HgBt
Jj3WW3NLOvcjm7cDtwBXAmbkh\nBign5wGkH8LvVreH8V70nJw94HWkp3aXk/4bgbbl5Hwx8FzSL/GNwCntRJumn4vA9gCOJf2haVNO\nxnNJo4YHScf9rHaiTZOTcyNbx4qrgBey9ZnWjmK272NHyzgvuaWdW2w/Jf0VPhG4cF6JFiYn5znA\nWLVuh+GczebkvBlYAbwC+BbpsW1bTs7dgCNIT0ePJT1tfnGToWbRzx/eE4H1wD8ayjKXnIxnk8YR\n+wOHA98Gnt1kqFnk5FxLOnPdAIxW//6vyVDzNPN3e6e4WDB3XvYAqUAmrGDr0+HZXFtt+3mkp3ht\nycn5KtJTPkhzw+NJT03anB/n5PzXlOUrgO+QzmgfbTbaNDk57yeNRJ6oPn5L+kNzVxsBK/38fJ5M\n+6MRyMv4OuDz1fIW4B7SzPjGxtNtlfuzefqU2/cAf2w4V79mfh8HVp/bZeRcZLOSrX/ZjqjWb1u/\nFwOdz3DePZKTc1+2Pp6rSDPGtuXkfAnphajFpNHDJtof5eQe96Wkk4jdW0u2VU7GrwGfrZb3JZXl\nc1vKNyEn59LqPoAzgO+3lG2mg8h7IfIohvdCJGw/54QuA373CMx+kc0Hqg+Aj5PeprSBdKZ95KAD\nZKrLOdWwShvqc36Q9HjeAlxH+sEbhpzH86Okd5BsYjivZUBezlOBH7Wca6q6jMuBy0gz402kF0yH\noS7na6v77yC9oL+07YCkZ0sPAv8lPds7nW2P9zjpe9hIOpEchrqc+1Wff4z0guV9pDdySJIkSZIk\nSZIkSZIkSZIkSZIkSZrp/4ZZbBO7NN0hAAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "oyo_data = data[data.Age <= 1.00]\n", + "oyo_data.groupby('Age').mean() # Confirm that this matches the top rows in the code cell above\n", + "\n", + "# Do more explorations with this oyo_data subset\n", + "oyo_data_hist = thinkstats2.Hist(oyo_data.Age)\n", + "thinkplot.Hist(oyo_data_hist)\n", + "thinkplot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git 
a/.ipynb_checkpoints/model_iteration_1-checkpoint.ipynb b/.ipynb_checkpoints/model_iteration_1-checkpoint.ipynb new file mode 100644 index 0000000..01e6120 --- /dev/null +++ b/.ipynb_checkpoints/model_iteration_1-checkpoint.ipynb @@ -0,0 +1,796 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we'll be building a predictive model for survival on the titanic based on training data provided by kaggle. This is part of the Warmup Project for Data Science 2016. \n", + "\n", + "#### A. import the training data." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "titanic = pandas.read_csv(\"./data/train.csv\")\n", + "\n", + "# Uncomment print statements below to take a look at the \n", + "# first 5 rows of the dataframe and the describing output.\n", + "# print(titanic.head(5))\n", + "# print(titanic.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### B. clean up the missing data. \n", + "\n", + "Occasionally a dataset contains missing values (null, not a number, NA, etc.) and we want to prevent these missing values from affecting our computations in unintended ways. In particular, this training data set has missing values for `Age`, so let's clean that up!" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "titanic[\"Age\"] = titanic[\"Age\"].fillna(titanic[\"Age\"].median())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### C. convert non-numeric (categorical) variables into usable numbers!\n", + "\n", + "In particular, `Sex` and `Embarked` should be converted into usable numbers. 
We'll find all the unique values for these non-numeric data points and replace them with numbers that can be used by the predictive model in a later step." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "unique genders are ['male' 'female']\n" + ] + } + ], + "source": [ + "# Find all the unique genders \n", + "print\"unique genders are\", titanic[\"Sex\"].unique()\n", + "\n", + "# From genders to numbers\n", + "titanic.loc[titanic[\"Sex\"] == \"male\", \"Sex\"] = 0\n", + "titanic.loc[titanic[\"Sex\"] == \"female\", \"Sex\"] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "unique embarked values are ['S' 'C' 'Q' nan]\n" + ] + } + ], + "source": [ + "# Find all the uniqued embarked values\n", + "print \"unique embarked values are\", titanic[\"Embarked\"].unique()\n", + "\n", + "# From embarked letters to numbers\n", + "titanic[\"Embarked\"] = titanic[\"Embarked\"].fillna(\"S\")\n", + "titanic.loc[titanic[\"Embarked\"] == \"S\", \"Embarked\"] = 0\n", + "titanic.loc[titanic[\"Embarked\"] == \"C\", \"Embarked\"] = 1\n", + "titanic.loc[titanic[\"Embarked\"] == \"Q\", \"Embarked\"] = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### D. cross validation, linear regression, first stab at predictions \n", + "\n", + "We want to make sure that we don't train our model on the same data that we'll make predictions on, so we're going to split the data into several folds. In each trial, one fold will be set aside for predictions, and the remaining folds will be used for training. Thus there's no overlap between the folds/partitions that were used for training and the one fold used for predictions. 
We'll run several trials with these fold combinations and eventually get predictions for the entire dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Code from dataquest mission 74, part 9.\n", + "\n", + "# Import the linear regression class\n", + "from sklearn.linear_model import LinearRegression\n", + "# Sklearn also has a helper that makes it easy to do cross validation\n", + "from sklearn.cross_validation import KFold\n", + "\n", + "# The columns we'll use to predict the target\n", + "predictors = [\"Pclass\", \"Sex\", \"Age\", \"SibSp\", \"Parch\", \"Fare\", \"Embarked\"]\n", + "\n", + "# Initialize our algorithm class\n", + "alg = LinearRegression()\n", + "# Generate cross validation folds for the titanic dataset. It return the row indices corresponding to train and test.\n", + "# We set random_state to ensure we get the same splits every time we run this.\n", + "kf = KFold(titanic.shape[0], n_folds=3, random_state=1)\n", + "\n", + "predictions = []\n", + "for train, test in kf:\n", + " # The predictors we're using the train the algorithm. 
Note how we only take the rows in the train folds.\n", + " train_predictors = (titanic[predictors].iloc[train,:])\n", + " # The target we're using to train the algorithm.\n", + " train_target = titanic[\"Survived\"].iloc[train]\n", + " # Training the algorithm using the predictors and target.\n", + " alg.fit(train_predictors, train_target)\n", + " # We can now make predictions on the test fold\n", + " test_predictions = alg.predict(titanic[predictors].iloc[test,:])\n", + " predictions.append(test_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[array([ 8.99877810e-02, 9.60756206e-01, 5.92676278e-01,\n", + " 9.31138728e-01, 5.29343071e-02, 1.70275685e-01,\n", + " 3.69943590e-01, 1.03474847e-01, 5.21597906e-01,\n", + " 8.74491050e-01, 6.48883611e-01, 8.29742769e-01,\n", + " 1.34797198e-01, -1.61126844e-01, 6.58141307e-01,\n", + " 6.39819748e-01, 1.51733875e-01, 2.95432718e-01,\n", + " 5.35377959e-01, 6.21007683e-01, 2.61872592e-01,\n", + " 2.62687561e-01, 7.31739160e-01, 5.05995897e-01,\n", + " 5.61398567e-01, 3.35039734e-01, 1.30338808e-01,\n", + " 4.68765767e-01, 6.60737753e-01, 9.10819218e-02,\n", + " 4.77223920e-01, 1.04220026e+00, 6.60691613e-01,\n", + " 8.71539273e-02, 5.28550732e-01, 4.01874338e-01,\n", + " 1.30340307e-01, 1.29339672e-01, 5.72717129e-01,\n", + " 6.65238822e-01, 4.83215779e-01, 7.60807408e-01,\n", + " 1.30578363e-01, 8.71867121e-01, 7.09855487e-01,\n", + " 9.11369897e-02, 1.39181745e-01, 6.60691613e-01,\n", + " 6.82833485e-02, 6.06254374e-01, 4.92254383e-02,\n", + " 1.29250392e-01, 9.02668258e-01, 7.51677954e-01,\n", + " 3.19636822e-01, 5.05995897e-01, 8.23411477e-01,\n", + " 1.27611544e-01, 8.16516947e-01, -3.70209060e-02,\n", + " 1.63085464e-01, 9.57981340e-01, 3.96742103e-01,\n", + " 6.16138409e-02, 5.42714233e-01, 6.62112275e-02,\n", + " 7.79751268e-01, 1.40293401e-01, 4.40592742e-01,\n", + " 
3.50534388e-02, 2.72709814e-01, 4.26360339e-01,\n", + " 3.55241143e-01, 1.10226880e-01, 8.66078358e-02,\n", + " 1.07366720e-01, 9.10819218e-02, 9.11369897e-02,\n", + " 3.82661024e-01, 5.72471068e-01, 1.24221410e-01,\n", + " 8.61972872e-02, 6.60705005e-01, 5.10138486e-01,\n", + " 8.45241581e-01, 4.56477760e-01, 3.22699204e-02,\n", + " 9.11369897e-02, 9.37604538e-01, 1.12967094e-01,\n", + " 8.56794636e-02, 1.34727274e-01, 3.83320807e-01,\n", + " 6.14970393e-03, -7.83320148e-02, 9.11369897e-02,\n", + " 3.10516665e-01, 5.49345421e-01, 7.23544338e-01,\n", + " 2.33721448e-01, 5.81750798e-01, 9.10819218e-02,\n", + " 5.25738424e-01, 6.40651310e-02, -2.52427240e-02,\n", + " 9.10819218e-02, 6.19865700e-01, 9.10387818e-02,\n", + " 3.65066610e-02, 6.32939707e-01, 4.08195377e-01,\n", + " 6.63657306e-01, 1.23882146e-01, 5.92491292e-01,\n", + " 6.83623624e-01, 1.29295032e-01, -6.19221217e-02,\n", + " 2.59223480e-01, 6.09655955e-01, 5.30794378e-01,\n", + " 2.88023805e-01, 9.11369897e-02, 2.82857942e-01,\n", + " 7.61542726e-01, 3.45640063e-01, 1.85484998e-01,\n", + " 1.70022737e-01, 1.12642722e-01, 5.59420117e-01,\n", + " -2.02485747e-03, 1.03290733e-01, 1.34440079e-01,\n", + " 4.46807623e-01, 7.51677954e-01, 3.11805296e-01,\n", + " 3.62947385e-01, 9.75724449e-01, 4.29554800e-01,\n", + " 1.57043954e-01, 5.82928575e-01, 5.57105476e-01,\n", + " 6.14443886e-01, 5.72812834e-01, 2.18783352e-01,\n", + " 3.49472299e-01, 2.86040080e-01, 9.65037360e-02,\n", + " 5.60916106e-01, 1.86919710e-01, 2.19027353e-01,\n", + " 1.69739986e-01, 1.00690768e+00, -5.89449777e-02,\n", + " -4.15452572e-02, 9.08736139e-02, 3.95827915e-01,\n", + " 7.26175962e-01, 8.02219375e-02, 9.13557255e-02,\n", + " -2.22536096e-01, -2.66919104e-02, 7.21593360e-01,\n", + " 1.01953834e-01, 1.51388512e-01, 8.19705948e-02,\n", + " 1.32518461e-01, 9.70245311e-01, 3.28974893e-01,\n", + " 5.02576476e-01, 1.08437940e-01, 3.25183297e-01,\n", + " 1.40818823e-01, 6.63268211e-01, 1.29295032e-01,\n", + " 3.90965934e-01, 7.86503606e-02, 
-3.68524682e-02,\n", + " 9.13671691e-01, 2.84517666e-01, 4.46019673e-02,\n", + " 2.68132779e-01, 3.35661255e-01, 1.96299597e-03,\n", + " 3.51470400e-01, 6.51010647e-01, 5.11174133e-01,\n", + " 6.29850621e-01, 4.10021732e-01, 4.03081359e-02,\n", + " 4.74217131e-02, 7.64271489e-01, 3.44550453e-01,\n", + " 5.97245007e-01, 3.69521460e-01, 9.46062691e-01,\n", + " 9.12083149e-01, 1.70022737e-01, -1.85251802e-02,\n", + " 6.60691613e-01, 8.07931698e-01, 9.16548133e-02,\n", + " -2.22536096e-01, 5.78367977e-02, 3.48321010e-02,\n", + " 1.45712251e-01, 6.91179799e-01, 3.84837497e-02,\n", + " 1.45383056e-01, 7.26181926e-01, 4.78394987e-01,\n", + " 1.12609974e-01, 7.50755869e-01, 1.23596450e-01,\n", + " 2.84517666e-01, 1.36414068e-01, 1.01395495e+00,\n", + " 5.87218752e-01, 1.90418359e-01, 1.02889863e+00,\n", + " 2.83624866e-01, 1.56627303e-01, 3.00890244e-01,\n", + " -3.43861103e-02, 9.10819218e-02, 4.37274991e-01,\n", + " 1.24346402e-01, 3.43657653e-01, 1.31782740e-01,\n", + " 3.50007979e-01, 4.53816408e-01, 9.41986239e-01,\n", + " 8.55812557e-02, 1.26427969e-01, 5.14461976e-01,\n", + " 3.16370023e-01, 5.81627306e-01, 1.79146187e-01,\n", + " 8.33217359e-01, 3.43657653e-01, 2.67886176e-01,\n", + " 5.89980704e-01, 6.29850621e-01, 2.89082393e-01,\n", + " 1.23551810e-01, 1.19423755e-01, 4.49914049e-01,\n", + " 5.98080236e-01, 7.41700785e-01, 3.95976588e-01,\n", + " 1.24570927e-01, 9.08512939e-02, 5.10217925e-01,\n", + " 3.17243789e-01, 4.94880818e-02, 4.48434902e-01,\n", + " 5.51647950e-01, 1.05176735e+00, 1.00396283e+00,\n", + " 1.16824364e+00, 6.37295280e-01, 1.70022737e-01,\n", + " 3.47081525e-02, 3.23790141e-01, 4.27827834e-01,\n", + " 6.60691613e-01, 2.50879710e-01, 1.07703504e-04,\n", + " 7.38026906e-02, 8.41682429e-01, 9.94221666e-01,\n", + " 5.04388858e-01, 1.04634754e-01, 6.84091736e-01,\n", + " 4.60920013e-01, 6.60691613e-01, 7.87205387e-01,\n", + " 4.88920786e-01, 2.90790162e-01, 1.24446245e-01,\n", + " 4.80968077e-01, -3.19057282e-02, 9.10670657e-02,\n", + " 
1.57145126e-01, 1.40254724e-01, 5.02603260e-01,\n", + " 1.03564537e-01, 8.07397611e-02, 1.23827078e-01,\n", + " 2.19027353e-01, 6.93436769e-01, 1.02306096e+00,\n", + " 1.07151871e+00, 2.91224311e-01, 6.03921666e-01,\n", + " 1.12912026e-01, 5.42714233e-01, 1.54899175e-01]), array([ 1.13774791, 0.44173212, 0.98551347, 0.66915371, 0.08254228,\n", + " 0.15142624, 0.83642014, 0.09704526, 0.64711481, 1.03845173,\n", + " 1.06064212, 0.24647842, 0.98364902, 1.04411609, 1.10195734,\n", + " 0.72596387, 0.09692709, 0.11388411, 0.60824987, 0.74905725,\n", + " 0.090424 , 1.00314273, 0.91588368, 0.13679886, 0.10365487,\n", + " 0.82296458, 0.755174 , -0.27746285, 1.0035964 , -0.12636043,\n", + " 0.70865678, 0.52438799, 1.06900476, 0.58044138, 0.32246331,\n", + " 0.45904751, 0.0848131 , 0.96838383, 0.09692709, 0.4123739 ,\n", + " 0.96908901, -0.01732698, 0.33119158, 0.38953146, 0.97455471,\n", + " 0.26457991, 0.28476325, 0.21075768, 0.78939013, 0.68174567,\n", + " 0.5508181 , 0.21132238, 0.00332574, 0.1315846 , 0.44518065,\n", + " 0.16116388, 0.07440511, 0.13363265, 0.09815645, 0.98913539,\n", + " 0.69520122, 0.66925272, 0.66925272, -0.05732283, 0.25605759,\n", + " 0.51306171, 0.04918447, 0.12689844, 0.08297663, 0.74556032,\n", + " 0.63153497, 0.66915371, 1.03349593, 0.46795359, 0.11283671,\n", + " 0.15759527, 0.5998862 , 0.6125967 , 0.96615292, 0.63469796,\n", + " 0.6051113 , 0.18499302, 0.15738453, 1.03364995, 0.80043282,\n", + " 0.07003835, 0.85871777, 0.09692709, 0.37822123, 0.03771546,\n", + " 0.70865678, 0.17123866, 0.87293786, 0.38692632, 0.14394491,\n", + " -0.00364112, 1.02362819, 0.60920867, 0.13721713, 0.57461098,\n", + " 0.1534423 , 0.29630296, 0.76221079, 0.0229439 , 0.11050082,\n", + " 0.59310377, 0.05272741, 0.64923598, 0.18004866, -0.05792355,\n", + " 0.37724772, 0.14392897, 0.44776777, 0.09692709, 0.17057126,\n", + " 0.97573347, 0.2546175 , -0.01069499, 0.59494436, 0.67712284,\n", + " 0.81048116, 0.25112435, 0.7091068 , 0.13414671, 0.21833626,\n", + " 0.09018337, 
0.5398775 , 0.11371054, 0.09643219, 0.72214613,\n", + " 0.83299143, 0.1712546 , 0.07013414, 0.43870508, 0.5508181 ,\n", + " 0.62795723, 0.17034196, 0.26289071, 1.03283656, 0.54234647,\n", + " 0.66429253, 0.2888594 , 0.24248073, 0.59832765, 0.15197868,\n", + " 0.06672256, 0.76247901, 0.09709316, 0.62328105, 0.85873908,\n", + " 0.39833841, 0.68526385, 0.28026543, 0.15249025, 0.0558822 ,\n", + " 0.46338875, 0.3322838 , 0.09704526, 0.12741893, 0.18977726,\n", + " 0.90570685, 0.61255203, 0.1712546 , 0.3041495 , 0.05667859,\n", + " 0.32003504, 0.13002433, 0.09704526, 0.02900113, 0.2546175 ,\n", + " 0.25032727, 0.17123545, 0.71385691, 0.09643219, 0.03023685,\n", + " 0.67057269, 0.83394424, 0.63668087, 0.45820842, 0.18004866,\n", + " 0.03925263, 0.13700639, 0.76347615, -0.01610677, 0.2546175 ,\n", + " -0.05096587, 0.36065035, 0.49526401, 0.44776777, 0.88783867,\n", + " 0.27650531, 0.0835897 , 0.17095571, 0.0558822 , 0.14352664,\n", + " 0.26008209, 0.20422092, 0.14413971, 0.13917582, 0.78823881,\n", + " 0.10244795, 0.983009 , 0.12376157, 0.17152021, 0.71624816,\n", + " 0.66906113, 0.5355726 , 1.06327957, 0.55601524, 0.71952689,\n", + " 0.43870508, 0.10813802, 0.14762674, 0.16452683, 0.09704526,\n", + " 0.38468169, 0.77378051, 0.12353167, 0.31660245, 0.72019649,\n", + " 0.18382257, 0.6683239 , 0.07001598, 0.97445504, 0.13729376,\n", + " 0.13363265, 0.88062695, 0.13363587, 0.08715737, 0.61255203,\n", + " 0.5883169 , 0.0229439 , 0.18684089, 0.88743056, 0.13363587,\n", + " 0.14770832, 0.62385335, 0.58195819, 0.89464072, 0.32433284,\n", + " 1.0215796 , 0.10198815, 1.01250232, 0.89757009, 0.52011358,\n", + " 0.50665802, 0.19733591, 0.33882963, 0.19608356, 0.78269614,\n", + " 0.3024605 , 0.01303333, 0.35740293, 0.59528255, 0.2812701 ,\n", + " 0.1713153 , 0.17399933, 0.63510029, 0.2099606 , 0.79897366,\n", + " 0.62993975, 0.84335812, 0.49799211, 0.1712546 , 0.01619374,\n", + " 0.26496308, 0.09704526, 0.59494436, 0.03570385, 0.1574771 ,\n", + " 0.55964686, 0.13363587, 0.0699841 , 
0.03391958, 0.68692335,\n", + " 0.38475832, 0.66915371, 0.17777861, 0.16253816, 0.72211234,\n", + " 0.83479538, 0.58677963, 0.07003835, 0.735757 , 0.90451305,\n", + " 0.09962007, 0.43250553, 0.13477258, 1.02529894, 0.13828479,\n", + " 0.24105043, 0.13741193, 0.09704526, 0.04924194, 0.80169436,\n", + " -0.03139561, 0.64987806]), array([ 1.72889219e-01, 1.70294715e-02, 7.82616935e-01,\n", + " -8.34788848e-03, 1.47022266e-01, 3.10888595e-01,\n", + " 7.28261340e-01, 1.01479914e-01, 4.24565622e-01,\n", + " 1.57316587e-02, 4.37708069e-01, 1.44204264e-02,\n", + " 9.07678482e-02, 4.33913871e-01, 8.26537251e-01,\n", + " 8.45262338e-01, 5.42776171e-01, 1.01763663e-01,\n", + " 6.70148479e-01, 1.92163452e-01, 6.39359534e-02,\n", + " 7.62650655e-01, 3.10124701e-02, 5.90024631e-01,\n", + " 8.31356231e-01, 2.78648916e-01, 1.08309653e-01,\n", + " 3.04531238e-01, 1.50864127e-01, 1.38986099e-01,\n", + " 1.36219795e-01, 2.51197915e-01, 2.02625887e-01,\n", + " 9.72357134e-01, 1.12191979e-01, 1.92169054e-01,\n", + " 1.50211875e-01, -2.14264992e-02, 4.52451020e-01,\n", + " 4.38789988e-01, 6.04820088e-01, 7.89326541e-01,\n", + " 8.00459867e-02, 2.10435721e-01, 5.70885269e-01,\n", + " 5.70841743e-02, 1.44342132e-01, 1.00451104e+00,\n", + " 6.42312317e-01, 8.51755703e-02, 7.33373007e-01,\n", + " 3.09602117e-01, 1.49684208e-01, 3.22228832e-01,\n", + " 1.01595923e-01, 6.50604478e-01, 1.01479914e-01,\n", + " 8.45026241e-01, 1.38791822e-01, 7.14365273e-01,\n", + " 7.68287651e-01, 1.84938938e-01, 1.01479914e-01,\n", + " 6.54218524e-01, 2.93878313e-01, 2.96413137e-01,\n", + " 1.92833539e-01, 8.27498735e-02, 3.28441263e-01,\n", + " 5.87658439e-02, 1.02674988e-01, 1.42090676e-01,\n", + " 2.83166248e-01, 1.01520440e-01, 2.10876914e-02,\n", + " 9.01930011e-01, 6.80182444e-01, 3.63633521e-01,\n", + " 4.29834748e-02, 2.51030051e-01, 2.71459394e-01,\n", + " 1.55080767e-01, 1.20174297e-01, 6.76615822e-01,\n", + " 5.21604336e-01, 2.74876851e-01, 7.14261845e-01,\n", + " 4.63722197e-01, 1.43882255e-01, 
-3.38493769e-02,\n", + " 5.08333972e-02, 2.88240761e-01, 4.71949096e-03,\n", + " 1.48920991e-01, 1.55073789e-01, 9.65241409e-01,\n", + " 3.61956120e-01, 8.01212426e-01, 8.51755703e-02,\n", + " 1.63090365e-01, 2.58489938e-01, 1.38385623e-01,\n", + " 1.57316587e-02, 7.14397446e-01, 2.98282232e-01,\n", + " 2.65779163e-02, 9.41922468e-01, 3.92478820e-01,\n", + " 7.25879907e-01, 2.08234335e-01, 7.05625434e-02,\n", + " 2.03820545e-01, 6.98106244e-01, 3.54986591e-01,\n", + " 9.42312534e-01, 1.08182230e-01, 1.01115214e+00,\n", + " 4.29882986e-01, 2.72580965e-01, 9.55913060e-02,\n", + " 1.38553363e-01, 1.49766670e-01, 8.76445205e-01,\n", + " 7.95521275e-01, 1.89563479e-01, 7.47402760e-02,\n", + " 9.05943831e-01, 1.19035222e-01, 2.34961953e-01,\n", + " 1.49265429e-01, 3.84688624e-01, 1.44070963e-01,\n", + " 6.51000458e-01, 7.14396037e-01, 2.37161612e-01,\n", + " 5.98123216e-01, 8.84762775e-01, 2.34195832e-01,\n", + " 2.71459394e-01, 2.93878313e-01, 2.93878313e-01,\n", + " 9.60495497e-02, 4.82543535e-01, 2.74738708e-01,\n", + " 1.01479914e-01, 1.01479914e-01, 4.28725578e-01,\n", + " 3.27845711e-01, 8.83507841e-01, 7.85083053e-02,\n", + " 8.54020195e-02, 1.53868294e-01, 1.25458500e-01,\n", + " 7.78614476e-01, 4.27536886e-01, 1.76095354e-01,\n", + " 8.78367308e-01, 2.23270579e-01, 7.41615725e-02,\n", + " 1.28260077e-01, 6.34105869e-01, 3.76826088e-01,\n", + " 1.01513462e-01, 3.21161697e-01, 6.92919862e-02,\n", + " 9.05219168e-01, 9.92643346e-02, 3.21100762e-02,\n", + " 1.89869119e-01, 8.47257439e-01, 1.65792833e-01,\n", + " 7.70032759e-01, 4.70822280e-01, 7.01001762e-01,\n", + " 1.45018183e-01, 7.98992141e-02, 1.22365867e-01,\n", + " -5.62678525e-03, 6.34840292e-01, 1.47022266e-01,\n", + " 6.21554022e-01, 1.55089154e-01, 1.92163452e-01,\n", + " 7.45360827e-01, 1.92167645e-01, 8.15272492e-01,\n", + " 7.49589740e-01, 9.59168970e-01, 4.23369546e-01,\n", + " 6.56067455e-02, 1.17831761e-01, 1.17764665e-01,\n", + " 6.77402825e-01, 1.31033823e-01, 2.11184136e-01,\n", + " 
3.61128670e-01, 1.92163452e-01, 3.27009298e-01,\n", + " 2.80865752e-01, 4.73809464e-01, 1.17548012e-01,\n", + " 2.08181789e-01, 8.39842956e-01, 6.07376016e-01,\n", + " 1.36308792e-01, 5.71394060e-01, 2.34961953e-01,\n", + " 7.32664113e-01, 4.58929866e-01, 2.99802486e-01,\n", + " 1.07144857e-01, 8.54523415e-02, 3.79873628e-01,\n", + " 6.77309159e-01, 2.08181789e-01, 8.74780819e-01,\n", + " 1.12194764e-01, 3.71105893e-02, 2.30444621e-01,\n", + " 5.78112549e-01, 8.80381008e-02, 4.38789988e-01,\n", + " 6.50478673e-01, 2.52145211e-01, 2.16244600e-02,\n", + " 7.72356638e-02, 7.64956968e-01, 1.06578734e-01,\n", + " 3.85229660e-01, 6.33022282e-01, 6.89918839e-02,\n", + " 1.92431836e-01, 8.51755703e-02, 4.59963761e-01,\n", + " 1.92163452e-01, 7.52074841e-01, 6.94810438e-01,\n", + " 3.74543331e-01, 1.47020857e-01, 1.28274033e-01,\n", + " 1.54904640e-01, 8.83372143e-01, 1.38714930e-01,\n", + " 1.01428183e-01, 6.37514393e-02, 4.74143535e-01,\n", + " 1.44318380e-01, 3.32209243e-01, 9.85223737e-01,\n", + " 1.12472244e-01, 1.60139061e-01, 2.66114644e-02,\n", + " -2.41362640e-01, 1.09304997e-01, 2.65882719e-01,\n", + " 9.34799595e-01, 6.65962224e-02, -1.44857067e-01,\n", + " 7.32175244e-01, 1.01756702e+00, 6.57625381e-01,\n", + " 6.82274953e-01, 7.78507074e-01, 3.06694232e-01,\n", + " 7.03120381e-01, 1.47020857e-01, -5.35194672e-02,\n", + " 2.63450207e-01, 8.45198988e-01, 2.80865752e-01,\n", + " 2.88522280e-01, 7.14342083e-01, 7.98068552e-01,\n", + " 4.05781543e-01, 1.00941736e-01, 1.92789366e-01,\n", + " 1.12191979e-01, 8.05473642e-01, 4.10332423e-01,\n", + " -6.55145848e-04, 7.89310178e-01, 7.38879084e-01,\n", + " 1.43673989e-01, 1.49684208e-01, 1.01479914e-01,\n", + " 8.33962978e-01, 8.06527571e-01, 7.46997500e-02,\n", + " 6.54965242e-01, 2.67936850e-01, 1.17831761e-01,\n", + " 6.75775470e-01, 2.72454182e-01, 9.99158265e-01,\n", + " 5.87835137e-01, 4.84754956e-01, 1.70739321e-01])]\n" + ] + } + ], + "source": [ + "print predictions" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {},
 +    "source": [
 +     "#### D. contninued: accuracy!\n",
second stab: logistic regression" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.787878787879\n" + ] + } + ], + "source": [ + "from sklearn import cross_validation\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "# Initialize our algorithm\n", + "alg = LogisticRegression(random_state=1)\n", + "# Compute the accuracy score for all the cross validation folds. (much simpler than what we did before!)\n", + "scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"], cv=3)\n", + "# Take the mean of the scores (because we have one for each fold)\n", + "print(scores.mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The accuracy of the logistic regression model is `0.792368125701` -- better, but not perfect. Let's go through making a submission to kaggle before continuing to tweak the model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### F. 
preparing a submission to kaggle; running the model on the test data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "titanic_test = pandas.read_csv(\"./data/test.csv\")\n", + "\n", + "# Age column\n", + "titanic_test[\"Age\"] = titanic_test[\"Age\"].fillna(titanic[\"Age\"].median())\n", + "\n", + "# Sex column\n", + "titanic_test.loc[titanic_test[\"Sex\"] == \"male\", \"Sex\"] = 0\n", + "titanic_test.loc[titanic_test[\"Sex\"] == \"female\", \"Sex\"] = 1\n", + "\n", + "# Embarked column\n", + "titanic_test[\"Embarked\"] = titanic_test[\"Embarked\"].fillna(\"S\")\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"S\", \"Embarked\"] = 0\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"C\", \"Embarked\"] = 1\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"Q\", \"Embarked\"] = 2\n", + "\n", + "# Fare column\n", + "titanic_test[\"Fare\"] = titanic_test[\"Fare\"].fillna(titanic[\"Fare\"].median())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Initialize the algorithm class\n", + "alg = LogisticRegression(random_state=1)\n", + "\n", + "# Train the algorithm using all the training data\n", + "alg.fit(titanic[predictors], titanic[\"Survived\"])\n", + "\n", + "# Make predictions using the test set.\n", + "predictions = alg.predict(titanic_test[predictors])\n", + "\n", + "# Create a new dataframe with only the columns Kaggle wants from the dataset.\n", + "submission = pandas.DataFrame({\n", + " \"PassengerId\": titanic_test[\"PassengerId\"],\n", + " \"Survived\": predictions\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# generate a submission file\n", + "# commented out to prevent unintentional file overwrite/creation\n", + "# 
submission.to_csv(\"dataquest_logistic_regression.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uploaded the submission file to kaggle; it resulted in a score of 0.75120 (rank 3393). This model did approximately 3% worse on the test dataset compared to the training dataset. 3% does \"feel\" like a big difference, however it doesn't seem like overfitting was the only issue. It seems more likely to me that there are nuanced differences in the passenger data that this current model did not capture. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### G. improving the dataquest code\n", + "\n", + "Brain dump of ideas:\n", + "* Not using every feature in the model, relevant to the curse of dimensionality -- see if using the same logistic regression with fewer features is helpful. Perhaps things like ticket number and fare are not as useful as sex and age. \n", + "* Try different models\n", + "* Combine features together: perhaps combining sex and age into one feature somehow (encoding it with one digit for sex and one digit for age)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Helper functions: Use logistic regression, try using different features\n", + "\n", + "def make_titanic_test_predictions(predictors):\n", + " # Initialize our algorithm\n", + " alg = LogisticRegression(random_state=1)\n", + " \n", + " # Compute the accuracy score for all the cross validation folds. 
(much simpler than what we did before!)\n", + " scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"], cv=3)\n", + " \n", + " # Take the mean of the scores (because we have one for each fold)\n", + " print \"accuracy\", scores.mean()\n", + " return \n", + "\n", + "def prepare_submission_file_different_predictors(predictors, filename):\n", + " # Initialize the algorithm class\n", + " alg = LogisticRegression(random_state=1)\n", + "\n", + " # Train the algorithm using all the training data\n", + " alg.fit(titanic[predictors], titanic[\"Survived\"])\n", + "\n", + " # Make predictions using the test set.\n", + " predictions = alg.predict(titanic_test[predictors])\n", + " \n", + " # Create a new dataframe with only the columns Kaggle wants from the dataset.\n", + " submission = pandas.DataFrame({\n", + " \"PassengerId\": titanic_test[\"PassengerId\"],\n", + " \"Survived\": predictions\n", + " }) \n", + " \n", + " # Save it\n", + " submission.to_csv(filename, index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In our first attempt, predictors included all of the provided features from the kaggle dataset: \n", + "`['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']`. \n", + "\n", + "Let's see what happens when we do something super bare bones with just `Sex` and `Age`. I expect that this will be less accurate because while these features do seem important, there is probably more to the relationship between people and survival than `Sex and Age`. \n", + "\n", + "The code in the next few cells somewhat resembles one of the data mining approaches in the reading (I believe the reading mentioned computing the correlation coefficient for each of the variables). We'll see which variables work well for predictions, and then proceeding onwards based on which variables seem to be helping the accuracy score. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Sex', 'Age']\n", + "accuracy 0.786756453423\n" + ] + } + ], + "source": [ + "predictors2 = ['Sex', 'Age'] \n", + "print predictors2\n", + "predictions2 = make_titanic_test_predictions(predictors2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It turns out that using just `Sex` and `Age` gives us a score comparable to using all of the features! This definitely makes me think that some of the features in the dataset are not helpful in this logistic regression model... this is not a surprise because we know that more variables is not necessarily better with a fixed amount of data (insert reference to the curse of dimensionality concept). \n", + "\n", + "Based on contextual knowledge about the Titanic story (DataQuest mission 74 also mentions this), we know that passenger class was relevant because the first class cabins were closer to the deck of the ship. A distance advantage to safety almost certainly would impact survival rate, so let's try including `Pclass` in addition to the bare-bones model based on just `Sex` and `Age`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# prepare_submission_file_different_predictors(predictors2, \"logistic_regression_SA.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This bare bones two-feature model also did better on the test set -- it received a score of 0.76555 (now at rank 3098; improvement compared to first submission score was 0.01435). 
" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Pclass', 'Sex', 'Age']\n", + "accuracy 0.789001122334\n" + ] + } + ], + "source": [ + "predictors3 = ['Pclass', 'Sex', 'Age']\n", + "print predictors3\n", + "predictions3 = make_titanic_test_predictions(predictors3)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# prepare_submission_file_different_predictors(predictors2, \"logistic_regression_PSA.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This three-feature model did (very slightly with an improvement of about 0.005; probably not \"significant\") better than the two-feature model on the training dataset, and it had the same performance as the two-feature model on the test dataset -- it received a score of 0.76555 (same place on the kaggle leaderboard). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### H. Other things to try (for model_iteration_2.ipynb) !\n", + "\n", + "Due to time constraints I didn't have a bunch of time to implement more ideas -- but these are some things I will explore more in future iterations and perhaps discuss in class soon:\n", + "\n", + "* Take another look at the data, see what the unique values themselves look like. For example, is there some pattern in the names of the passengers?\n", + "* Combine variables:\n", + " * In the brain dump cell earlier I mentioned combining `sex` and `age` somehow. Consider \"female child, male child, female adult, male adult, female senior, male senior\", and put these categories in one variable. Maybe this would help the curse of dimensionality problem? Or maybe it would prevent the model from learning nuances that need `sex` and `age` to be provided separately? 
\n", + "* Consider the tradeoff between doing a bunch of feature engineering myself and letting the model figure out the trends on its own. There must be a sweet spot between the data processing I do and what happens automatically in logistic regression.\n", + "* Revisit exploration.ipynb for more bottom-up data inspiration!\n", + "* Different models provided by scikit-learn (Random Forest?)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/.ipynb_checkpoints/model_iteration_2-checkpoint.ipynb b/.ipynb_checkpoints/model_iteration_2-checkpoint.ipynb new file mode 100644 index 0000000..f65b6ba --- /dev/null +++ b/.ipynb_checkpoints/model_iteration_2-checkpoint.ipynb @@ -0,0 +1,697 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 265, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import pandas\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Beginning thoughts\n", + "\n", + "I looked through several options for model_iteration_2: DataQuest Mission 75, the blog posts of several other kaggle competition participants, and scanning the forums for ideas. Afer completing DataQuest Mission 75 in depth I think that it provides the most comprehensive starting point for model_iteration_2, and I really appreciate the opportunity to both read code and implement ideas (a rare opportunity since the titanic dataset in particular is an educational dataset?). 
Hopefully the work done in this notebook becomes a resource of examples for projects in the near future; the lambda functions passed into .apply and the regular expression usage in particular are not things that I use commonly in python. \n", + "\n", + "In this notebook I will explain and use the code that was suggested in DataQuest Mission 75, and adapt it to implement additional ideas. The markdown cells also include some reflection on the process and how I reacted to the dataquest mission. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Overall Themes of DataQuest Mission 75:\n", + "\n", + "* Use a better machine learning algorithm.\n", + "* Generate better features.\n", + "* Combine multiple machine learning algorithms." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Import the datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 266, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "titanic = pandas.read_csv(\"./data/train.csv\")\n", + "titanic_test = pandas.read_csv(\"./data/test.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Clean up the training data (same as what we did in model_iteration_1)" + ] + }, + { + "cell_type": "code", + "execution_count": 267, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Replace the missing age values with the median age\n", + "titanic[\"Age\"] = titanic[\"Age\"].fillna(titanic[\"Age\"].median())\n", + "\n", + "# From genders to numbers\n", + "titanic.loc[titanic[\"Sex\"] == \"male\", \"Sex\"] = 0\n", + "titanic.loc[titanic[\"Sex\"] == \"female\", \"Sex\"] = 1\n", + "\n", + "# From embarked letters to numbers\n", + "titanic[\"Embarked\"] = titanic[\"Embarked\"].fillna(\"S\")\n", + "titanic.loc[titanic[\"Embarked\"] == \"S\", \"Embarked\"] = 0\n", + "titanic.loc[titanic[\"Embarked\"] == \"C\", \"Embarked\"] = 1\n", + "titanic.loc[titanic[\"Embarked\"] == 
\"Q\", \"Embarked\"] = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Clean up the test data (same as what we did in model_iteration_1)" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "titanic_test = pandas.read_csv(\"./data/test.csv\")\n", + "\n", + "# Age column\n", + "titanic_test[\"Age\"] = titanic_test[\"Age\"].fillna(titanic[\"Age\"].median())\n", + "\n", + "# Sex column\n", + "titanic_test.loc[titanic_test[\"Sex\"] == \"male\", \"Sex\"] = 0\n", + "titanic_test.loc[titanic_test[\"Sex\"] == \"female\", \"Sex\"] = 1\n", + "\n", + "# Embarked column\n", + "titanic_test[\"Embarked\"] = titanic_test[\"Embarked\"].fillna(\"S\")\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"S\", \"Embarked\"] = 0\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"C\", \"Embarked\"] = 1\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"Q\", \"Embarked\"] = 2\n", + "\n", + "# Fare column\n", + "titanic_test[\"Fare\"] = titanic_test[\"Fare\"].fillna(titanic[\"Fare\"].median())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Random forests!\n", + "\n", + "Random forests have the ability to capture many different \"layers\" of relationships between the features in our dataset. I use the word \"layers\" here to loosely describe the different branches of decision trees in the random forest. Random forests are random because each decision tree in the forest gets a random subset of the data. Taking the average of the results from the trees will then result in the model's prediction. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 269, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.801346801347\n" + ] + } + ], + "source": [ + "# Code from DataQuest mission 75\n", + "\n", + "from sklearn import cross_validation\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "predictors = [\"Pclass\", \"Sex\", \"Age\", \"SibSp\", \"Parch\", \"Fare\", \"Embarked\"]\n", + "\n", + "# Initialize our algorithm with the default parameters\n", + "# n_estimators is the number of trees we want to make\n", + "# min_samples_split is the minimum number of rows we need to make a split\n", + "# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)\n", + "alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)\n", + "scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"])\n", + "print(scores.mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This first stab with random forests resulted in 80% accuracy, which is better than the linear and logistic regression in model_iteration_1. Based on what I know about these three algorithms, I can see how random forests do a better job of capturing the complicated relationships between features in this dataset -- for example, the branches can capture how the same non-sex features can impact men and women differently." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A lot of people who use random forest models in their implementation are also a fan of visualizing a tree from their random forest, such as the picture included in [triangleinequality's tutorial](https://triangleinequality.wordpress.com/2013/09/05/a-complete-guide-to-getting-0-79903-in-kaggles-titanic-competition-with-python/) and shown immediately below. \n", + "\n", + "![a tree in the random forest](https://triangleinequality.files.wordpress.com/2013/09/sample_tree1.png?w=960&h=960)\n", + "\n", + "The blue nodes are the leaves that represent whether that path of the tree resulted in survival or death. The dataquest mission also explained some of the relevant \"high-level\" parameters of the random forest model, such as the amount of splits and how many samples are needed to create a leaf (try tweaking `min_samples_split` and `min_samples_leaf`). There is probably a fair amount of iteration involved in finding the sweet spot of branches/layers of the decision tree (too many branches will result in overfitting on the training dataset, and too little branches will result in a ineffective set of decision trees). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Making new features\n", + "\n", + "Bulletpoints of suggestions from the dataquest page:\n", + "* The length of the name -- this could pertain to how rich the \n", + "person was, and therefore their position in the Titanic.\n", + "* The total number of people in a family (SibSp + Parch).\n", + "\n", + "Emily's commentary: \n", + "* I'm not sure if the first bulletpoint would make a nontrivial improvement, since the wealth of the passengers could already be represented with the `Pclass` variable, and names have a ton of variation on their own. We can confirm this with a correlation coefficient or some other measure for relevance of a feature on the prediction. 
\n", + "* I think the second bulletpoint would result in an improvement to the model. One potential story behind the family feature is that people with families stick together and help each other escape. Another potential story is that large families may have a tough time trying to get everyone safe because of all the craziness of the event and the many people they are trying to take care of at once. It also makes more sense to put family as one feature rather than `SibSp` and `Parch` separately if they both help the model \"figure out the family situation\" -- maybe the curse of dimensionality is showing up here." + ] + }, + { + "cell_type": "code", + "execution_count": 270, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Code from DataQuest mission 75\n", + "\n", + "# Generating a familysize column\n", + "titanic[\"FamilySize\"] = titanic[\"SibSp\"] + titanic[\"Parch\"]\n", + "\n", + "# The .apply method generates a new series\n", + "titanic[\"NameLength\"] = titanic[\"Name\"].apply(lambda x: len(x))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I usually don't use lambda functions in python, so I super appreciated the quick example above with the `.apply` method. I'll be sure to remember that when I'm working with dataframes in the future!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataquest mission used regular expressions to extract the titles from the names of the passengers -- this idea was also mentioned in the class discussion in Data Science last week. Similarly to the lambda used above, I also don't have a ton of experience with regular expressions, and appreciate this example! Definitely makes this notebook a valuable resource for future projects if I still need the examples in the near future." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 271, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mr 517\n", + "Miss 182\n", + "Mrs 125\n", + "Master 40\n", + "Dr 7\n", + "Rev 6\n", + "Col 2\n", + "Major 2\n", + "Mlle 2\n", + "Countess 1\n", + "Ms 1\n", + "Lady 1\n", + "Jonkheer 1\n", + "Don 1\n", + "Mme 1\n", + "Capt 1\n", + "Sir 1\n", + "Name: Name, dtype: int64\n", + "1 517\n", + "2 183\n", + "3 125\n", + "4 40\n", + "5 7\n", + "6 6\n", + "7 5\n", + "10 3\n", + "8 3\n", + "9 2\n", + "Name: Name, dtype: int64\n" + ] + } + ], + "source": [ + "# Code from DataQuest mission 75\n", + "\n", + "import re\n", + "\n", + "# A function to get the title from a name.\n", + "def get_title(name):\n", + " # Use a regular expression to search for a title. Titles always consist of capital and lowercase letters, and end with a period.\n", + " title_search = re.search(' ([A-Za-z]+)\\.', name)\n", + " # If the title exists, extract and return it.\n", + " if title_search:\n", + " return title_search.group(1)\n", + " return \"\"\n", + "\n", + "# Get all the titles and print how often each one occurs.\n", + "titles = titanic[\"Name\"].apply(get_title)\n", + "print(pandas.value_counts(titles))\n", + "\n", + "# Map each title to an integer. 
Some titles are very rare, and are compressed into the same codes as other titles.\n", + "title_mapping = {\"Mr\": 1, \"Miss\": 2, \"Mrs\": 3, \"Master\": 4, \"Dr\": 5, \"Rev\": 6, \"Major\": 7, \"Col\": 7, \"Mlle\": 8, \"Mme\": 8, \"Don\": 9, \"Lady\": 10, \"Countess\": 10, \"Jonkheer\": 10, \"Sir\": 9, \"Capt\": 7, \"Ms\": 2}\n", + "for k,v in title_mapping.items():\n", + " titles[titles == k] = v\n", + "\n", + "# Verify that we converted everything.\n", + "print(pandas.value_counts(titles))\n", + "\n", + "# Add in the title column.\n", + "titanic[\"Title\"] = titles" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataquest mission dives deeper into the \"family\" storyline by making family groups. While this part of the dataquest mission didn't prompt me to write any additional code, I super appreciated the example of a working implementation to read through and understand at my own pace." + ] + }, + { + "cell_type": "code", + "execution_count": 272, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-1 800\n", + " 14 8\n", + " 149 7\n", + " 63 6\n", + " 50 6\n", + " 59 6\n", + " 17 5\n", + " 384 4\n", + " 27 4\n", + " 25 4\n", + " 162 4\n", + " 8 4\n", + " 84 4\n", + " 340 4\n", + " 43 3\n", + " 269 3\n", + " 58 3\n", + " 633 2\n", + " 167 2\n", + " 280 2\n", + " 510 2\n", + " 90 2\n", + " 83 1\n", + " 625 1\n", + " 376 1\n", + " 449 1\n", + " 498 1\n", + " 588 1\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# Code from DataQuest mission 75\n", + "\n", + "import operator\n", + "\n", + "# A dictionary mapping family name to id\n", + "family_id_mapping = {}\n", + "\n", + "# A function to get the id given a row\n", + "def get_family_id(row):\n", + " # Find the last name by splitting on a comma\n", + " last_name = row[\"Name\"].split(\",\")[0]\n", + " # Create the family id\n", + " family_id = \"{0}{1}\".format(last_name, row[\"FamilySize\"])\n", + " # 
Look up the id in the mapping\n", + " if family_id not in family_id_mapping:\n", + " if len(family_id_mapping) == 0:\n", + " current_id = 1\n", + " else:\n", + " # Get the maximum id from the mapping and add one to it if we don't have an id\n", + " current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1)\n", + " family_id_mapping[family_id] = current_id\n", + " return family_id_mapping[family_id]\n", + "\n", + "# Get the family ids with the apply method\n", + "family_ids = titanic.apply(get_family_id, axis=1)\n", + "\n", + "# There are a lot of family ids, so we'll compress all of the families under 3 members into one code.\n", + "family_ids[titanic[\"FamilySize\"] < 3] = -1\n", + "\n", + "# Print the count of each unique id.\n", + "print(pandas.value_counts(family_ids))\n", + "\n", + "titanic[\"FamilyId\"] = family_ids" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Finding the best features\n", + "\n", + "The mission provides an example of univariate feature selection -- figuring out which features are most relevant by calculating a \"feature score\" for each feature (column inn the dataframe)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 273, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXsAAAEpCAYAAAByeIL3AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHSNJREFUeJzt3Xu8XGV56PHf5AYhZBMCGFIFImhEURSsXCqVjRfUHowI\nAh/gaEStWg8FbweirbJ7Wrno6VGqFloUTC1FEBGBeiSRwwjeEEKACAYwmraiCSC3BLkEmPPHs8aZ\nPdmXmey93rWG9ft+PvOZtdae2evZyZpn3vWs930XSJIkSZIkSZIkSZIkSZIkqSReBKxsezwMnATM\nBZYDdwHLgDlFBShJmlxTgN8CuwCfAU7Jtp8KnFlUUJKkyXUocH22vBqYly3vnK1Lkp4Fzgc+mC0/\n2La91rEuSepTM4D7gJ2y9c7k/kDacCSpWqYl2s+bgRVEwgdYT5Rv1gHzgXs737DHHns01qxZkyg8\nSXrWuBV4RefGKYl2fixwUdv6FcDibHkxcHnnG9asWUOj0Sj0cdpppxUeQ1niKEMMZYmjDDGUJY4y\nxFCWOMoQQ6PRAHj5SEk4RbKfBbweuKxt25nAG4iul6/F3jiSlKsUyf5RYEdgQ9u2B4gvgIVEL52H\nEsTRszPOOItarZbkMTAwt+g/V9KzWKqafV968snHgUaSfW3YUBv1Z4ODg0liGEsZYoByxFGGGKAc\ncZQhBihHHGWIYSyjZ5jiNbL6U2FqtRqpkj3UKPrvldT/Im9tnttTXaCVJBXIZC9JFWCyl6QKMNlL\nUgWY7CWpAkz2klQBJntJqgCTvSRVgMlekirAZC9JFWCyl6QKMNlLUgWY7CWpAkz2klQBJntJqgCT\nvSRVgMlekirAZC9JFWCyl6QKMNlLUgWY7CWpAlIk+znApcDPgTuA/YG5wHLgLmBZ9hpJUk5SJPuz\nge8ALwb2BlYDS4hkvxC4JluXJOWklvPv3w5YCezesX01cDCwHtgZqAN7drym0Wg0cg5vbLVaDUgV\nQ42i/15J/S/y1ua5Pe+W/fOB+4ALgJuB84BZwDwi0ZM9z8s5DkmqtGkJfv++wInAjcDn2bxk02CU\n5vPQ0NAflgcHBxkcHMwjRknqW/V6nXq9Pu7r8i7j7Az8mGjhAxwEfJwo6xwCrAPmA9diGccyjqQJ\nK6qMsw74L+JCLMDrgduBK4HF2bbFwOU5xyFJlZZ3yx7g5cCXgRnAGuAEYCpwCbArsBY4Gnio4322\n7CWpR6O17FMk+y1lspekHhVVxpEklYDJXpIqwGQvSRVgspekCjDZS1IFmOwlqQJM9pJUASZ7SaoA\nk70kVYDJXpIqwGQvSRVgspekCjDZS1IFmOwlqQJM9pJUASZ7SaoAk70kVYDJXpIqwGQvSRVgspek\nCjDZS1IFmOwlqQJM9pJUAdMS7GMt8AjwNLAJ2A+YC1wM7Jb9/GjgoQSxSFIlpWjZN4BBYB8i0QMs\nAZYDC4FrsnVJUk5SlXFqHeuLgKXZ8lLg8ERxSFIlpWrZfw+4CfjzbNs8YH22vD5blyTlJEXN/tXA\nb4GdiNLN6o6fN7LHZoaGhv6wPDg4yODgYC4BSlK/qtfr1Ov1cV/XWV7J22nARqKFPwisA+YD1wJ7\ndry20WiM+B2QTK1WY5TvoTz2RtF/r6T+F3lr89yedxlnG2B2tjwLOBRYBVwBLM62LwYuzzkOSaq0\nvFv2zwe+lS1PAy4EziC6Xl4C7MroXS9t2UtSj0Zr2acu4/TCZC9JPSqqjCNJKgGTvSRVgMlekirA\nZC9JFWCyl6QKMNlLUgWY7CWpAkz2klQBJntJqoBukv3RwEC2/Eli+oN
9c4tIkjTpukn2nyRuK3gQ\n8DrgK8A5eQYlSZpc3ST7p7Pnw4DzgKuAGblFJEmadN0k+3uAfwaOAf4d2LrL90mSSqKbWS9nAW8k\n5qG/m7jZyMuAZTnGBc56KUk9m8isl48C9xE1e4CngF9MWmSSpNx107IfAl4JvAhYCDyXuPHIq/ML\nC7BlL0k9m0jL/m3AW4kWPkQNf/boL5cklU03yf4J4Jm29Vk5xSJJykk3yf4bwD8Bc4D3AdcAX84z\nKEnS5BqvZl8DdgH2BA7Ntl0NLM8zqIw1e0nq0ZbecLxGdLl8aQ4xjcdkL0k92tILtA1gBbBfDjFJ\nkhLppuvlncALgP+g1SOnAeydV1DNfRTd0rVlL6nfjNayn9bFe9+YPTczUTdfEO2mAjcBvwbeAswF\nLgZ2A9YSs2o+1OPvlCT1oJveOGuJnjiLiGS9XbatWycDd9D6slhCXOBdSPTsWdLD75IkbYFukv3J\nwL8COwHzsuWTuvz9zwP+jOiq2TwjWAQszZaXAod3G6wkact0U5JZBRxAq14/C/gJMRnaeL4BnE7c\n/ORjxJnBg8D2bft/oG29nTV7SerRRGr2MHwE7TOjvmq4w4B7gZXA4CivaTBGNh0aGvrD8uDgIIOD\no/0aSaqmer1OvV4f93XdtOw/ArwLuCx7/eHAV4HPjfO+04F3ELNkbk207i8DXkUk/3XEdMnXEoO2\nOtmyl6QebemgqqZXElMcN4DridZ6Lw6mVcb5DPA74Czi4uwcRr5Ia7KXpB5NpIxzANGbZkW2PgDs\nD9zQYwzNTHYmMUXye2h1vZQk5aiblv0twD60knWz3/w+eQWVsWUvST2ayHz2MDzjPU0kfElSn+gm\n2f+K6Fc/HZhB9Lv/ZZ5BSZImVzfJ/gPELQjvIaY8OICY116S1Cd6necmJWv2ktSjidTsP0v0wJlO\nzGVzP9F/XpLUJ7pJ9ocCjxAjYtcCewD/M8eYJEmTrJtk3+yLfxhwKfAw6WobkqRJ0M2gqiuB1cDj\nwF8Az8mWJUl9otsLtDsQNxh5mpj1cjYxt02evEArST2a6Nw4RTDZS1KPJjqCVpLUx0z2klQB3ST7\nKUS/+k9l67sC++UWkSRp0nVTsz+XuDvVa4mbjMwFlgF/nGNcYM1ekno2kfns9yemM27esOQBYjSt\nJKlPdFPGeZLhUxrvRPf3oZUklUA3yf4LwLeIwVSnAz8EzsgzKEnS5BqvZj8FOJAo3bwu23YN8PM8\ng8pYs5ekHk1kUNUtwCsmO6AumOwlqUcTGVT1PeDtI71ZktQfukngG4FtiHlxmhOgNYg57vNky16S\nejSRrpfbTno0kqSkukn2rxll+3WTGYgkKT/dlHGuolXL2JqYKmEFMaJ2LFsD3we2AmYA3wY+TozA\nvRjYjbjz1dHE9MmdLONIUo8mc4rjXYCzgSO6eO02wO+JM4gfAB8DFhH3sf0McCqwPbBkhPea7CWp\nR5M5xfGvgRd3+drfZ88ziFG4DxLJfmm2fSlw+BbEIEnqQTc1+y+0LU8h+tyv6PL3TwFuJm5Sfg5w\nOzAPWJ/9fH22LknKUTfJ/qa25aeAfyOmTOjGM8SXw3bA1cAhHT9vMEadZGho6A/Lg4ODDA4Odrlb\nSaqGer1OvV4f93Xd1Ow/BHy+Y9vJRN2+F58EHgPeCwwS97CdD1xLTJ3cyZq9JPVoIjX7xSNsO6GL\n9+0IzMmWZwJvIKZJvqLtdy4GLu/id0mSJmCsMs6xwHHA84Er27bPBn7Xxe+eT1yAnZI9vkZMorYS\nuAR4D62ul5KkHI1VxtmNSPRnEl0km6/dANxK1O/zZBlHkno0mf3sUzHZS1KPJlKzPxC4kZgQbRPR\nw+aRyQxOkpSvbpL9F4na/d3EFAjvAf4xz6AkSZOr2xG0dxMjYJ8GLgDelFtEkqRJ182gqkeJycxu\nJeazWUe5a/2SpA7dtOzfmb3uRGK
um+cBR+YZlCT1YmBgLrVaLffHwMDcov/ULdZtC30bYrbLO3OM\npZO9cSR1Jd1ntfyf04n0xllEDIS6OlvfhxgFK0nqE90k+yFgf2J6YojEv3teAUmSJl83yX4Tm99J\n6pkcYpEk5aSbZH87cDzRc+eFxPz2P8ozKEnS5Oom2Z8I7AU8AVxEjJ79UJ5BSZIm11i9cb4GvIOR\n57NPIdkl79mzt+eRRx7YbLu9caT+YG+cli2ZCO0O4PXAd4mbjXTaPDtOrkbRidZkL/UHk33LaMl+\nrBG05xLzz+/O5vecbWCPHEnqG90MqjoX+EDegYzAlr2krtiyb+nL+eyLTrQme6k/mOxbJjKCVpLU\n50z2klQBJntJqgCTvSRVgMlekirAZC9JFZB3st8FuJaYTO1nwEnZ9rnAcuAuYBkwJ+c4JKnS8u5n\nv3P2uAXYlhiJezhwAnA/cU/bU4HtgSUd77WfvaSu2M++pah+9uuIRA+wEfg58Fzi7ldLs+1LiS8A\nSVJOUtbsFxC3NLwBmAesz7avz9YlSTkZayK0ybQt8E3gZGBDx88ajHr+NdS2PMjIk29KUnXV63Xq\n9fq4r0sxN8504Crg/9KaF381kbnXAfOJi7h7drzPmr2krlizbymqZl8DvkLMjd9+A5QrgMXZ8mLg\n8pzjkKRKy7tlfxBwHXAbra/djwM/BS4BdgXWAkez+U3NbdlL6oot+xanOB6TyV7qZyb7Fqc4lqQK\nM9lLUgWY7CWpAkz2klQBJntJqgCTvSRVgMlekirAZC9JFWCyl6QKMNlLUgWY7CWpAkz2klQBJntJ\nqgCTvdSjgYG51Gq1JI+BgblF/7l6lnCKY8ApjtULj4vycYrjFqc4lqQKM9lLUgWY7CWpAkz2klQB\nJntJqgCTvSRVgMlekirAZC9JFZB3sj8fWA+sats2F1gO3AUsA+bkHIMkVV7eyf4C4E0d25YQyX4h\ncE22LknKUd7J/nrgwY5ti4Cl2fJS4PCcY5CkyiuiZj+PKO2QPc8rIAZJqpRpBe+/wZizFw21LQ9m\nD0lSU71ep16vj/u6FLNeLgCuBF6Wra8msvY6YD5wLbDnCO9z1kuVksdF+TjrZUuZZr28AlicLS8G\nLi8gBkmqlLxb9hcBBwM7EvX5TwHfBi4BdgXWAkcDD43wXlv2KiWPi/KxZd8yWsvem5cAJnv1wuOi\nfEz2LWUq40iSEjPZS1IFmOwlqQJM9pJUASZ7SaoAk70kVYDJXpIqwGQvSRVgspekCjDZS1IFmOwl\nqQJM9pJUASZ7SaoAk70kVYDJXpIqwGQvSRVgspekCjDZS1IFmOylPjUwMJdarZb7Y2BgbtF/qiaB\n96AFvAetelGW48L7rrb4b9HiPWglqcJM9uorli5UVqmOzS09Poss47wJ+DwwFfgycFbHzy3jaDNl\nOF0vy3FRhn+LsijDv0W5jovylHGmAl8kEv5LgGOBFxcUS+lts83swlsM9Xo97R+tvuBx0T+KSvb7\nAb8A1gKbgK8Dby0oltJ77LGNRIsh/8eGDQ+OGIMfao3E46J/FJXsnwv8V9v6r7NtKqkzzjir8LML\nSVuuqGRf7gKgNvPkk49T9NmFysdGQP+YVtB+7wF2aVvfhWjdt1sDtT1SBZRd1BjpJ6lCGCOGssRR\nhhjSxVGGGMoSx9gxpLFhw4N98G9RiuPi1mRBdGEasAZYAMwAbsELtJL0rPRm4E7iQu3HC45FkiRJ\nkqTq2aboANSb4q+6tLyAuEj7OHAI8DLgX4CHigyq4uYTYyKeAW4E1hUQw9bAkcT1nWaHggbwvxLH\n8afEMXoBsBOwLfCrRPs+kvibRxuieVmiOAD+hBjxPpvoWPEK4H3ABxPGAPAi4B+BnYG9gL2BRcDf\nJdj3F9qWm/8v7esnJYihZ2WaG+ebwFPEB+qfiAPp3xLH8J6O9WnAUOIYIA7grwDfzdZfwuax5e29\
nwA3AEcDbs+XUMQB8m/gQbwI2Zo9HE8cwBJxC69rSDOBfE+7/Ldnj3cRxcXz2+HK2LaXPEyPf78/W\nbwEOThwDwHnAJ4Ans/VVxEj8FFZkj62AfYG7gLuJL74ZiWLoayuz51OAv+zYlspFwHeAPwJeSrRm\n/z5xDBBJ/hjgtmx9OvCzxDHcBezQtr5Dti211H/3SG4lGkbtx+Nto7w2T8uJs62m+cCyxDH8NHtu\n/7cooqvfTSPEcUviGG4gPptN07NtpVSmlv2TwHHAO4GriFOj6WO+Y/IdS5SObgP+Hfgw8NHEMQDs\nCFwMPJ2tbyLOelK6n2hFN22k1ZpL6UfEKXqRniBKWU2zCopjF4aX0tYDuyaO4T+BV2fLM4CPAT9P\nHAPAfUQVoOntwG8TxzAHGGhbn51tK6WiBlWN5N3A+4FPE7XQ5wNfSxzDQqLedhnR7/+/Ey2H1GWD\njQxvVR8APJw4hjXAT4gyCsTcRbcRX34N4P/kvP9V2fNU4ATimHgi29Yg7RfAN4jS4hyiPv1uooSS\n2veAq4nyZo04+1ueOIa/AM4mpje5hziz+B+JYwA4EfhnYE/gN8TxcXziGM4Ebgbq2frBFFP27UqZ\nLtC2mws8j/SnyquJg+h7xFnPh4k69UsSx/FK4iLQXsDtxAXBt5P2dHkoe25eEOy8OPg3Oe9/wTg/\nX5vz/ptqRIt6T+DQbNvVpE+yzVjeRlwsBrgO+FbiGHZh+LxWENeYirh4D3GWNQXYUND+5wP7E5+N\nGyju32FcZUr23ycuQk0jLn7cB/yQSLipbMfmLeiFFFOrnk70OIAYfLapgBia5hK9op4Z74U5OAC4\nA3gkWx8gzrpS1UZrxFnGSxPtbzwLgBcSXzbbEGc+KRPdU8ClxNnN77NtK4F9Eu2/vaza3vhoNkby\nPuOEaIx17rs9npsTxNCzMpVxtiM+0O8l6uan0TqVT2UmcbA8l9Zc+weSPtk3u9o1LSS+hFYB9+a8\n79OAS4g67FbExeKXEx/y40nfoj2X6PHQ9Gi2LVVyaRCNj/1oXZwsyvuAPye+fPcgzn7PAV6XMIZV\nwPVEQ+woYgR8SrMpfiLFvx8nhkNSBdKvVtHqXbBfti11GacMvWAgLg4/QHRH/SbwOyLJ/oK4gJ2n\nO2i1VN5H1COnEq3pG3Pe90hG6mGR+ri4k7hY/kviOF1VQAwQZbytGN4DJXWDqLnvVxMNgreQvtcc\nwEFdblMJHUV8gM7J1vcgEl1KZejOBfGFN69tfV62bQeihp+n9r/9MuADo/wslW8RF82nE70/TgYu\nTxzDglEeqXV2e5xG+i+d9mNgPvAD4LHEMXTG0ZS6fLKCuDi9feL9ahLUiYTaPJAOIK4lpNbZla3W\nti3vhPsTYvTyTsTZxe5tP7sz532P5DlEN9R7s8dF2bYiPIfo6th8pPZZ4K+I/4c3EF+En04cw/yO\n9WnAaxLu/0Cibv9r4CPZ8keJDgWp+/u/EDidOOO+GHgj5boOOkyZavYzafV8mZlta5B2hOBHgSuJ\nBPcjor/7UQn333QtUcq5hDh4jiS+iGaR//QRHyIuwO0EfI4oXQD8N9K3nKZlMRyTeL+dFhF12j8i\nvnB2I75890ocx6nENa1VRDfl75CuC+g7iK7Qx43wswbRMyiFGUTdfmr23PQI0WMtpbuJUbx/DRwG\nnE90Yjif6J76QOJ4+salwN8SyWUxUaP+h0T73o9Wi2U6cWr2/4AvERfDUptCHLifI4anf4qYB6SK\nfkDUqYt0G/HF3zyrOoT4QKfWOR/QVNJNKfL+7HmIuIjffDTXU9utgH2O5OXEZ/ROIl8dQAw0K6L8\n2zea/zjtF0dTda9bSSupv4YYiXckManSpYli6LQvcdr+H0Sr/i/HfPXk25Ho67+SaNGfzfCBXql8\njbgw/Elap+wfSRzDiuz5ViLBQjEXaL9Ka36erYgBb0MFxFGkL
2bPV47wuCJxLCuIRuFxxIR97VKP\nfxhXmco4zQmNHiZqxuuIUkIKU2idch1DjJZs9oRJWQd8ETFlwzHEOINvEGWcwYQxNH2duF5xRBbD\ncURd8vWJ41iTPaYQM02ONvNjnh4kSgbXAxcSpZyNY74jH+/O9v8J4uziO8TZXwrNnll3Ef8H5xMN\norXAu0hX4ltMDHwcac6q1MfFUbTKnJ3eljKQftPsP3wwMfT5Pob3BMnTz2jNw3Mnw2fxy7v3S7tn\niNZJ+8W/VNPodhqpy2nqbn5Fa/4/zCJa9NOJxHYSac9yXkmc6e1LjNa8hSjrNbelcDutz8hxRHLf\ngfjyvz5RDFBMj7BOH2X4WWbneimVqWV/Xvb8fWJenJQuyvZ7PzEqsHnwvpC08+kfQbTsryP6/Ddb\n9kVYlsVycbZ+FOlnWIToAXMKm1+4f22CfX+bGLz1KHGWdyRRSkmtcxDPQ8S4h2brNsUgnk20RnEf\nRgx8/B0xtchnE+y/aScioY70uUg1gna0gV1FnHV2rQzdhEaaVbL9Rg0p/vMgunTtTCS05sRnC4nS\nQepeKNsSE48dS3yQ/4WoAaZIthtpHbCzaE2RMIX4d5k90ptytJz4wvkYcZHwXcRZ3ykJ9t0+DUDK\nKQFGMpW4aH/xeC/Myc1Ekn+AuI70Olpnf6uJuYNS+C0xgno0ec/Z1LfK0LIvw/BngB+PsK2IOXEg\nEu6F2WMu8SFfQppkv22CffRiB6J74UnE2df3aQ1+q5KniS+4opL9p4gL5dOIUmMz0Q8S11RSWUfx\nCf1U4CyG37GqqbR3qipDy17lsifRUhutFpz6LOcnRHe2ZUTXtt8Q5a09Euz7aVqTfc1k+EjRBsPn\nMk/hTKLUeDHDp91O1Z97OtEYeLBt2ywij6S6YF30GRbEFBFXEmeZnRrA0qTR9KGlDJ/4f3uK6ctc\ndc1rJ3VicFfnI7XDiOPiZVlMNxODnKpoLXHBvvORUtFTBBTR/VeTbKRBCA5MSK99gBlEV7criFPW\nlB+0mcT01l8iavVlKDlq+BQBX6fkUwTk7FXEtbSVFDtBXt+5leGjVedSva5+ZVCWAWaXEDf1fj/R\nK+bshPsus5cCRxOznzYfRZhCnGHdQ9zM5G8oZrR5ke4i/g12p9gJ8vrOO4k+7n9LJJY7Ke5ArrL2\nQWRfYvgIzZQDzNq/6KdRjv7VRRsiSmn3AhcQFyuLGOHtFAHhh0UH0M/2IqYFOJH0twJUKMsAs87k\nbrKP/5uptL505xH93FPqqykCcnYo8BWii/SR2eOIQiMaQxnqoDOJkbIvIOpd51LsLfiqriwDzPZm\n+O32ZratF9ETpgweI3oIPUXc2e1e4p6wKTlFQMtiYoqTaQy/ZedlxYQztjIk+6XEvDg/AP6MaNGf\nXGhE1fZpouXWHGDWPIhrpJ2Mber4L6mcG4leMOcRYw0eJabiTqHz3q+1jvVUgx/L5I+JrsplGCc0\nrjJcRV9FdKuD+PK5keL70Uplt4A4u0nV+2OIsacIKHqgUxEuAP43acubW6wMLfunRlmWNFyNqAkf\nRCTY60mb7DXcgcRF6V8BT2TbGkQJsnTK0LJvH6UIw0cqVrU2K43kHGLk8EXEZ/doon7+wQT77ssp\nAnK2YJTtaxPG0LUytOytzUrdOYS4ptW8jvJV4I5E+27uZ8UIP+uLmnUO1mbPz2HznkmStMWuYnhr\nckG2TcVYRNyH9lGilPMMJa7fl6FlL2lsV2bPs4kbnf+UaE3vR3RoSOlVxJ2yFtDKH6WtU+fs74i6\n/XKiU8khxI3ZS8lkL5XfSLfga0pdQrmQGC37M4b3La+iTcR4lClEOfpaSjyth8leKr96x/oAxX12\n7yP9jb3Lqiz3Ju5KGXrjSOrO+4n+7E/QalU3iIm4UjkUOIaYpuHJthhKOWo0J7sC/0nM5f840bI/\nnvgSvpC4XWPpmOyl/vELY
tKx+wuM4UJiioDbGV7GOaGYcArRfgOV5r2JS88yjtQ/fsnwu2UVoa+m\nCEgg5VnVhJjspf6xhLhX8o8ZXkJJOaDpR0Rf/9J2MdTILONI/eMm4DpiPqlnaM1Lk/Kep6uJUbx9\nMUVATsp2b+KumOyl/lGGm20vGGX72oQxaAs4VYHUP3Yj7vvwG2AG0arsbFnm7aHsMYPIH43s8XDC\nGCTpWW0tUT7pfKTUV1MESJK2zG3AjrRuE3kIcH5x4UjSs8cpbctHdfzs9JSB0Jr18lZaZeBUc+pr\nAqYUHYCkcR3btvyJjp+9OWUgbD5FwD9Q4ikC1GKyl9SNXbPntxLdDj8MfJcY1fuWooKSpGeTlaMs\nj7SeIoZvJtqnJpEjaKXy2xvYkC3PbFturqfWN1MEqMVkL5Wf42E0YY6gldSNvpwiQJIkSZIkSZIk\nSZIkSZIm3/8H8RXc2zrUvDcAAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of the random forest scores 0.811447811448\n" + ] + } + ], + "source": [ + "# Code from DataQuest mission 75\n", + "\n", + "import numpy as np\n", + "from sklearn.feature_selection import SelectKBest, f_classif\n", + "\n", + "predictors = [\"Pclass\", \"Sex\", \"Age\", \"SibSp\", \"Parch\", \"Fare\", \"Embarked\", \"FamilySize\", \"Title\", \"FamilyId\"]\n", + "\n", + "# Perform feature selection\n", + "selector = SelectKBest(f_classif, k=5)\n", + "selector.fit(titanic[predictors], titanic[\"Survived\"])\n", + "\n", + "# Get the raw p-values for each feature, and transform from p-values into scores\n", + "scores = -np.log10(selector.pvalues_)\n", + "\n", + "# Plot the scores. 
See how \"Pclass\", \"Sex\", \"Title\", and \"Fare\" are the best?\n", + "plt.bar(range(len(predictors)), scores)\n", + "plt.xticks(range(len(predictors)), predictors, rotation='vertical')\n", + "plt.ylabel(\"feature scores\")\n", + "\n", + "plt.show()\n", + "\n", + "# Pick only the four best features.\n", + "predictors = [\"Pclass\", \"Sex\", \"Fare\", \"Title\"]\n", + "\n", + "alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=8, min_samples_leaf=4)\n", + "scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"], cv=3)\n", + "print \"mean of the random forest scores\", scores.mean()" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### gradient boosting and ensembling!\n", + "\n", + "I had heard of gradient boosting in passing from ML enthusiast friends, but I hadn't tried it in person... at a high level, the dataquest page says that the errors from one tree will help the next tree learn the dataset more effectively. There are also some suggested parameters to prevent overfitting: limiting the tree count and tree depth. \n", + "\n", + "The dataquest then describes ensembling -- making predictions based on several different models and averaging their results to make a final decision on what the prediction is. My reaction: ensembling seems super useful! It sounds like ensembling presents the opportunity and challenge of balancing the strengths and weaknesses of many different models; it's another layer of algorithm design and ensemble parameter tweaking." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 274, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.819304152637\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:40: FutureWarning: in the future, boolean array-likes will be handled as a boolean array index\n" + ] + } + ], + "source": [ + "# Code from DataQuest Mission 75\n", + "\n", + "from sklearn.ensemble import GradientBoostingClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.cross_validation import KFold\n", + "\n", + "# The algorithms we want to ensemble.\n", + "# We're using the more linear predictors for the logistic regression, and everything with the gradient boosting classifier.\n", + "algorithms = [\n", + " [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), [\"Pclass\", \"Sex\", \"Age\", \"Fare\", \"Embarked\", \"FamilySize\", \"Title\", \"FamilyId\"]],\n", + " [LogisticRegression(random_state=1), [\"Pclass\", \"Sex\", \"Fare\", \"FamilySize\", \"Title\", \"Age\", \"Embarked\"]]\n", + "]\n", + "\n", + "# Initialize the cross validation folds\n", + "kf = KFold(titanic.shape[0], n_folds=3, random_state=1)\n", + "\n", + "predictions = []\n", + "for train, test in kf:\n", + " train_target = titanic[\"Survived\"].iloc[train]\n", + " full_test_predictions = []\n", + " # Make predictions for each algorithm on each fold\n", + " for alg, predictors in algorithms:\n", + " # Fit the algorithm on the training data.\n", + " alg.fit(titanic[predictors].iloc[train,:], train_target)\n", + " # Select and predict on the test fold. 
\n", + " # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.\n", + " test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]\n", + " full_test_predictions.append(test_predictions)\n", + " # Use a simple ensembling scheme -- just average the predictions to get the final classification.\n", + " test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2\n", + " # Any value over .5 is assumed to be a 1 prediction, and below .5 is a 0 prediction.\n", + " test_predictions[test_predictions <= .5] = 0\n", + " test_predictions[test_predictions > .5] = 1\n", + " predictions.append(test_predictions)\n", + "\n", + "# Put all the predictions together into one array.\n", + "predictions = np.concatenate(predictions, axis=0)\n", + "\n", + "# Compute accuracy by comparing to the training data.\n", + "accuracy = sum(predictions[predictions == titanic[\"Survived\"]]) / len(predictions)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Preparing and predicting on the test set" + ] + }, + { + "cell_type": "code", + "execution_count": 275, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Code from DataQuest Mission 75\n", + "\n", + "# First, we'll add titles to the test set.\n", + "titles = titanic_test[\"Name\"].apply(get_title)\n", + "# We're adding the Dona title to the mapping, because it's in the test set, but not the training set\n", + "title_mapping = {\"Mr\": 1, \"Miss\": 2, \"Mrs\": 3, \"Master\": 4, \"Dr\": 5, \"Rev\": 6, \"Major\": 7, \"Col\": 7, \"Mlle\": 8, \"Mme\": 8, \"Don\": 9, \"Lady\": 10, \"Countess\": 10, \"Jonkheer\": 10, \"Sir\": 9, \"Capt\": 7, \"Ms\": 2, \"Dona\": 10}\n", + "for k,v in title_mapping.items():\n", + " titles[titles == k] = v\n", + "titanic_test[\"Title\"] = titles\n", + "\n", + "# Check the counts of each unique title.\n", + "# 
print(pandas.value_counts(titanic_test[\"Title\"]))\n", + "\n", + "# Now, we add the family size column.\n", + "titanic_test[\"FamilySize\"] = titanic_test[\"SibSp\"] + titanic_test[\"Parch\"]\n", + "\n", + "# Now we can add family ids.\n", + "# We'll use the same ids that we did earlier.\n", + "# print(family_id_mapping)\n", + "\n", + "family_ids = titanic_test.apply(get_family_id, axis=1)\n", + "family_ids[titanic_test[\"FamilySize\"] < 3] = -1\n", + "titanic_test[\"FamilyId\"] = family_ids\n", + "\n", + "titanic_test[\"NameLength\"] = titanic_test[\"Name\"].apply(lambda x: len(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 276, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Code from DataQuest Mission 75\n", + "\n", + "predictors = [\"Pclass\", \"Sex\", \"Age\", \"Fare\", \"Embarked\", \"FamilySize\", \"Title\", \"FamilyId\"]\n", + "\n", + "algorithms = [\n", + " [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],\n", + " [LogisticRegression(random_state=1), [\"Pclass\", \"Sex\", \"Fare\", \"FamilySize\", \"Title\", \"Age\", \"Embarked\"]]\n", + "]\n", + "\n", + "full_predictions = []\n", + "for alg, predictors in algorithms:\n", + " # Fit the algorithm using the full training data.\n", + " alg.fit(titanic[predictors], titanic[\"Survived\"])\n", + " # Predict using the test dataset. 
We have to convert all the columns to floats to avoid an error.\n", + " predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:,1]\n", + " full_predictions.append(predictions)\n", + "\n", + "# The gradient boosting classifier generates better predictions, so we weight it higher.\n", + "predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4\n", + "\n", + "# turning the predictions into 0s and 1s\n", + "for i in range(predictions.shape[0]):\n", + " predictions[i] = (predictions[i] >= 0.5)\n", + "\n", + "predictions = predictions.astype(int)\n", + "\n", + "# Create a new dataframe with only the columns Kaggle wants from the dataset.\n", + "submission = pandas.DataFrame({\n", + " \"PassengerId\": titanic_test[\"PassengerId\"],\n", + " \"Survived\": predictions\n", + " }) " + ] + }, + { + "cell_type": "code", + "execution_count": 277, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Save it\n", + "submission.to_csv(\"dataquest75.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### How the model performed on kaggle\n", + "\n", + "The model had a score of 0.79904 and is at rank 1003 on the kaggle leaderboards. It is reassuring that the changes explored in the dataquest mission resulted in a nontrivial improvement on test performance over the logistic regression models from model_iteration_1! I have a feeling that Data Science projects are neverending -- there's always some room for improvement or new combination of ideas to try and see if those new ideas work just as well or better than the current iteration." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Ideas for future work\n", + "\n", + "If I had more time...\n", + "\n", + "* Getting more familiar with scikit learn and the many algorithms it has -- ensemble lots of things!\n", + "* Trying different ensembling techniques:\n", + " * instead of averaging, maybe voting (going with what the majority of the models say),\n", + " * the [wikipedia page on Ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) and the [scikit learn page on Ensemble methods](http://scikit-learn.org/stable/modules/ensemble.html) -- maybe trying things like AdaBoost, the Bayes optimal classifier, etc. and plotting their performance vs. ensemble learning method\n", + "* Following advice from this scikit learn cheat-sheet on the [Choosing the right estimator](http://scikit-learn.org/stable/tutorial/machine_learning_map/) page (the image on the webpage also includes links for the boxes in this figure):\n", + "![scikit-learn algorithm cheat-sheet](http://scikit-learn.org/stable/_static/ml_map.png)\n", + "* Reading more about the history of the Titanic and getting more inspiration from the details of the event (gathering more domain-specific knowledge than I currently have about the Titanic) \n", + "* Trying different numbers of folds for the experiments and plotting the model performance vs. number of folds. There must be a sweet spot between too many folds (too little training data for each sub-trial of the model to do well...) and too little folds." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Ending thoughts\n", + "\n", + "In regards to time management... Week 2 of the semester definitely was a \"recalibration\" week for me -- I've found that I'm most productive in Data Science when I have at least 1.5 hours to work. 
Here's a quick bulletpoint list of the process:\n", + "\n", + "* gathering resources and figuring out which tools to try using first, \n", + "* reading, \n", + "* implementing code, \n", + "* jotting down commentary in real time with the markdown cells\n", + "* debugging code, \n", + "* learning about functionality of the tools (or perhaps investigating other tools)\n", + "* getting creative with ideas on how to improve the model, \n", + "* implementing those creative ideas and debugging,\n", + "* continued...\n", + "\n", + "There are so many steps that need to happen before one can easily do creative changes to improve your model! Definitely worth thinking about this process with the future steps in mind. e.g. Once the first pass at implementing the data cleanup, feature engineering, and training the model is done, make functions that can be used for speedy feature engineering and testing in the creative phase. Also, take advantage of how easy it is to swap scikit-learn functions. \n", + "\n", + "At the beginning of this warmup project I felt like each step could be a timesink and was overwhelmed by the amount of work that goes into each notebook. I also felt like I could have done more at any step in time and tried to express these ideas in the markdown cells. I've done this process before in the machine learning co-curricular but that was with a smaller time commitment and longer timespan (1 credit for DoML vs. 4 credits for Data Science). In the future I'll do a better job of breaking down tasks into smaller chunks that I can do in my relatively frequent small chunks of free time (often I have 30-60 minutes free during the workday hours) and leaving notes to myself about where I left off. In retrospect this seems like common sense with Olin workload time management but this warmup project was a helpful \"recalibration\" phase for me before the next projects start. 
:) \n", + "\n", + "I definitely value the opportunity for creativity in the \"improving your model\" phase -- next time I will try some creative exercises to generate a large quantity of ideas, and how visualizations or results will either confirm those ideas or lead to new questions. In this project I didn't devote a lot of time to do divergent thinking exercises but will try refining the creative process in Data Science in the next projects." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/data/test.csv b/data/test.csv new file mode 100644 index 0000000..f705412 --- /dev/null +++ b/data/test.csv @@ -0,0 +1,419 @@ +PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked +892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q +893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47,1,0,363272,7,,S +894,2,"Myles, Mr. Thomas Francis",male,62,0,0,240276,9.6875,,Q +895,3,"Wirz, Mr. Albert",male,27,0,0,315154,8.6625,,S +896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22,1,1,3101298,12.2875,,S +897,3,"Svensson, Mr. Johan Cervin",male,14,0,0,7538,9.225,,S +898,3,"Connolly, Miss. Kate",female,30,0,0,330972,7.6292,,Q +899,2,"Caldwell, Mr. Albert Francis",male,26,1,1,248738,29,,S +900,3,"Abrahim, Mrs. Joseph (Sophie Halaut Easu)",female,18,0,0,2657,7.2292,,C +901,3,"Davies, Mr. John Samuel",male,21,2,0,A/4 48871,24.15,,S +902,3,"Ilieff, Mr. Ylio",male,,0,0,349220,7.8958,,S +903,1,"Jones, Mr. Charles Cresson",male,46,0,0,694,26,,S +904,1,"Snyder, Mrs. John Pillsbury (Nelle Stevenson)",female,23,1,0,21228,82.2667,B45,S +905,2,"Howard, Mr. 
Benjamin",male,63,1,0,24065,26,,S +906,1,"Chaffee, Mrs. Herbert Fuller (Carrie Constance Toogood)",female,47,1,0,W.E.P. 5734,61.175,E31,S +907,2,"del Carlo, Mrs. Sebastiano (Argenia Genovesi)",female,24,1,0,SC/PARIS 2167,27.7208,,C +908,2,"Keane, Mr. Daniel",male,35,0,0,233734,12.35,,Q +909,3,"Assaf, Mr. Gerios",male,21,0,0,2692,7.225,,C +910,3,"Ilmakangas, Miss. Ida Livija",female,27,1,0,STON/O2. 3101270,7.925,,S +911,3,"Assaf Khalil, Mrs. Mariana (Miriam"")""",female,45,0,0,2696,7.225,,C +912,1,"Rothschild, Mr. Martin",male,55,1,0,PC 17603,59.4,,C +913,3,"Olsen, Master. Artur Karl",male,9,0,1,C 17368,3.1708,,S +914,1,"Flegenheim, Mrs. Alfred (Antoinette)",female,,0,0,PC 17598,31.6833,,S +915,1,"Williams, Mr. Richard Norris II",male,21,0,1,PC 17597,61.3792,,C +916,1,"Ryerson, Mrs. Arthur Larned (Emily Maria Borie)",female,48,1,3,PC 17608,262.375,B57 B59 B63 B66,C +917,3,"Robins, Mr. Alexander A",male,50,1,0,A/5. 3337,14.5,,S +918,1,"Ostby, Miss. Helene Ragnhild",female,22,0,1,113509,61.9792,B36,C +919,3,"Daher, Mr. Shedid",male,22.5,0,0,2698,7.225,,C +920,1,"Brady, Mr. John Bertram",male,41,0,0,113054,30.5,A21,S +921,3,"Samaan, Mr. Elias",male,,2,0,2662,21.6792,,C +922,2,"Louch, Mr. Charles Alexander",male,50,1,0,SC/AH 3085,26,,S +923,2,"Jefferys, Mr. Clifford Thomas",male,24,2,0,C.A. 31029,31.5,,S +924,3,"Dean, Mrs. Bertram (Eva Georgetta Light)",female,33,1,2,C.A. 2315,20.575,,S +925,3,"Johnston, Mrs. Andrew G (Elizabeth Lily"" Watson)""",female,,1,2,W./C. 6607,23.45,,S +926,1,"Mock, Mr. Philipp Edmund",male,30,1,0,13236,57.75,C78,C +927,3,"Katavelas, Mr. Vassilios (Catavelas Vassilios"")""",male,18.5,0,0,2682,7.2292,,C +928,3,"Roth, Miss. Sarah A",female,,0,0,342712,8.05,,S +929,3,"Cacic, Miss. Manda",female,21,0,0,315087,8.6625,,S +930,3,"Sap, Mr. Julius",male,25,0,0,345768,9.5,,S +931,3,"Hee, Mr. Ling",male,,0,0,1601,56.4958,,S +932,3,"Karun, Mr. Franz",male,39,0,1,349256,13.4167,,C +933,1,"Franklin, Mr. 
Thomas Parham",male,,0,0,113778,26.55,D34,S +934,3,"Goldsmith, Mr. Nathan",male,41,0,0,SOTON/O.Q. 3101263,7.85,,S +935,2,"Corbett, Mrs. Walter H (Irene Colvin)",female,30,0,0,237249,13,,S +936,1,"Kimball, Mrs. Edwin Nelson Jr (Gertrude Parsons)",female,45,1,0,11753,52.5542,D19,S +937,3,"Peltomaki, Mr. Nikolai Johannes",male,25,0,0,STON/O 2. 3101291,7.925,,S +938,1,"Chevre, Mr. Paul Romaine",male,45,0,0,PC 17594,29.7,A9,C +939,3,"Shaughnessy, Mr. Patrick",male,,0,0,370374,7.75,,Q +940,1,"Bucknell, Mrs. William Robert (Emma Eliza Ward)",female,60,0,0,11813,76.2917,D15,C +941,3,"Coutts, Mrs. William (Winnie Minnie"" Treanor)""",female,36,0,2,C.A. 37671,15.9,,S +942,1,"Smith, Mr. Lucien Philip",male,24,1,0,13695,60,C31,S +943,2,"Pulbaum, Mr. Franz",male,27,0,0,SC/PARIS 2168,15.0333,,C +944,2,"Hocking, Miss. Ellen Nellie""""",female,20,2,1,29105,23,,S +945,1,"Fortune, Miss. Ethel Flora",female,28,3,2,19950,263,C23 C25 C27,S +946,2,"Mangiavacchi, Mr. Serafino Emilio",male,,0,0,SC/A.3 2861,15.5792,,C +947,3,"Rice, Master. Albert",male,10,4,1,382652,29.125,,Q +948,3,"Cor, Mr. Bartol",male,35,0,0,349230,7.8958,,S +949,3,"Abelseth, Mr. Olaus Jorgensen",male,25,0,0,348122,7.65,F G63,S +950,3,"Davison, Mr. Thomas Henry",male,,1,0,386525,16.1,,S +951,1,"Chaudanson, Miss. Victorine",female,36,0,0,PC 17608,262.375,B61,C +952,3,"Dika, Mr. Mirko",male,17,0,0,349232,7.8958,,S +953,2,"McCrae, Mr. Arthur Gordon",male,32,0,0,237216,13.5,,S +954,3,"Bjorklund, Mr. Ernst Herbert",male,18,0,0,347090,7.75,,S +955,3,"Bradley, Miss. Bridget Delia",female,22,0,0,334914,7.725,,Q +956,1,"Ryerson, Master. John Borie",male,13,2,2,PC 17608,262.375,B57 B59 B63 B66,C +957,2,"Corey, Mrs. Percy C (Mary Phyllis Elizabeth Miller)",female,,0,0,F.C.C. 13534,21,,S +958,3,"Burns, Miss. Mary Delia",female,18,0,0,330963,7.8792,,Q +959,1,"Moore, Mr. Clarence Bloomfield",male,47,0,0,113796,42.4,,S +960,1,"Tucker, Mr. Gilbert Milligan Jr",male,31,0,0,2543,28.5375,C53,C +961,1,"Fortune, Mrs. 
Mark (Mary McDougald)",female,60,1,4,19950,263,C23 C25 C27,S +962,3,"Mulvihill, Miss. Bertha E",female,24,0,0,382653,7.75,,Q +963,3,"Minkoff, Mr. Lazar",male,21,0,0,349211,7.8958,,S +964,3,"Nieminen, Miss. Manta Josefina",female,29,0,0,3101297,7.925,,S +965,1,"Ovies y Rodriguez, Mr. Servando",male,28.5,0,0,PC 17562,27.7208,D43,C +966,1,"Geiger, Miss. Amalie",female,35,0,0,113503,211.5,C130,C +967,1,"Keeping, Mr. Edwin",male,32.5,0,0,113503,211.5,C132,C +968,3,"Miles, Mr. Frank",male,,0,0,359306,8.05,,S +969,1,"Cornell, Mrs. Robert Clifford (Malvina Helen Lamson)",female,55,2,0,11770,25.7,C101,S +970,2,"Aldworth, Mr. Charles Augustus",male,30,0,0,248744,13,,S +971,3,"Doyle, Miss. Elizabeth",female,24,0,0,368702,7.75,,Q +972,3,"Boulos, Master. Akar",male,6,1,1,2678,15.2458,,C +973,1,"Straus, Mr. Isidor",male,67,1,0,PC 17483,221.7792,C55 C57,S +974,1,"Case, Mr. Howard Brown",male,49,0,0,19924,26,,S +975,3,"Demetri, Mr. Marinko",male,,0,0,349238,7.8958,,S +976,2,"Lamb, Mr. John Joseph",male,,0,0,240261,10.7083,,Q +977,3,"Khalil, Mr. Betros",male,,1,0,2660,14.4542,,C +978,3,"Barry, Miss. Julia",female,27,0,0,330844,7.8792,,Q +979,3,"Badman, Miss. Emily Louisa",female,18,0,0,A/4 31416,8.05,,S +980,3,"O'Donoghue, Ms. Bridget",female,,0,0,364856,7.75,,Q +981,2,"Wells, Master. Ralph Lester",male,2,1,1,29103,23,,S +982,3,"Dyker, Mrs. Adolf Fredrik (Anna Elisabeth Judith Andersson)",female,22,1,0,347072,13.9,,S +983,3,"Pedersen, Mr. Olaf",male,,0,0,345498,7.775,,S +984,1,"Davidson, Mrs. Thornton (Orian Hays)",female,27,1,2,F.C. 12750,52,B71,S +985,3,"Guest, Mr. Robert",male,,0,0,376563,8.05,,S +986,1,"Birnbaum, Mr. Jakob",male,25,0,0,13905,26,,C +987,3,"Tenglin, Mr. Gunnar Isidor",male,25,0,0,350033,7.7958,,S +988,1,"Cavendish, Mrs. Tyrell William (Julia Florence Siegel)",female,76,1,0,19877,78.85,C46,S +989,3,"Makinen, Mr. Kalle Edvard",male,29,0,0,STON/O 2. 3101268,7.925,,S +990,3,"Braf, Miss. Elin Ester Maria",female,20,0,0,347471,7.8542,,S +991,3,"Nancarrow, Mr. 
William Henry",male,33,0,0,A./5. 3338,8.05,,S +992,1,"Stengel, Mrs. Charles Emil Henry (Annie May Morris)",female,43,1,0,11778,55.4417,C116,C +993,2,"Weisz, Mr. Leopold",male,27,1,0,228414,26,,S +994,3,"Foley, Mr. William",male,,0,0,365235,7.75,,Q +995,3,"Johansson Palmquist, Mr. Oskar Leander",male,26,0,0,347070,7.775,,S +996,3,"Thomas, Mrs. Alexander (Thamine Thelma"")""",female,16,1,1,2625,8.5167,,C +997,3,"Holthen, Mr. Johan Martin",male,28,0,0,C 4001,22.525,,S +998,3,"Buckley, Mr. Daniel",male,21,0,0,330920,7.8208,,Q +999,3,"Ryan, Mr. Edward",male,,0,0,383162,7.75,,Q +1000,3,"Willer, Mr. Aaron (Abi Weller"")""",male,,0,0,3410,8.7125,,S +1001,2,"Swane, Mr. George",male,18.5,0,0,248734,13,F,S +1002,2,"Stanton, Mr. Samuel Ward",male,41,0,0,237734,15.0458,,C +1003,3,"Shine, Miss. Ellen Natalia",female,,0,0,330968,7.7792,,Q +1004,1,"Evans, Miss. Edith Corse",female,36,0,0,PC 17531,31.6792,A29,C +1005,3,"Buckley, Miss. Katherine",female,18.5,0,0,329944,7.2833,,Q +1006,1,"Straus, Mrs. Isidor (Rosalie Ida Blun)",female,63,1,0,PC 17483,221.7792,C55 C57,S +1007,3,"Chronopoulos, Mr. Demetrios",male,18,1,0,2680,14.4542,,C +1008,3,"Thomas, Mr. John",male,,0,0,2681,6.4375,,C +1009,3,"Sandstrom, Miss. Beatrice Irene",female,1,1,1,PP 9549,16.7,G6,S +1010,1,"Beattie, Mr. Thomson",male,36,0,0,13050,75.2417,C6,C +1011,2,"Chapman, Mrs. John Henry (Sara Elizabeth Lawry)",female,29,1,0,SC/AH 29037,26,,S +1012,2,"Watt, Miss. Bertha J",female,12,0,0,C.A. 33595,15.75,,S +1013,3,"Kiernan, Mr. John",male,,1,0,367227,7.75,,Q +1014,1,"Schabert, Mrs. Paul (Emma Mock)",female,35,1,0,13236,57.75,C28,C +1015,3,"Carver, Mr. Alfred John",male,28,0,0,392095,7.25,,S +1016,3,"Kennedy, Mr. John",male,,0,0,368783,7.75,,Q +1017,3,"Cribb, Miss. Laura Alice",female,17,0,1,371362,16.1,,S +1018,3,"Brobeck, Mr. Karl Rudolf",male,22,0,0,350045,7.7958,,S +1019,3,"McCoy, Miss. Alicia",female,,2,0,367226,23.25,,Q +1020,2,"Bowenur, Mr. Solomon",male,42,0,0,211535,13,,S +1021,3,"Petersen, Mr. 
Marius",male,24,0,0,342441,8.05,,S +1022,3,"Spinner, Mr. Henry John",male,32,0,0,STON/OQ. 369943,8.05,,S +1023,1,"Gracie, Col. Archibald IV",male,53,0,0,113780,28.5,C51,C +1024,3,"Lefebre, Mrs. Frank (Frances)",female,,0,4,4133,25.4667,,S +1025,3,"Thomas, Mr. Charles P",male,,1,0,2621,6.4375,,C +1026,3,"Dintcheff, Mr. Valtcho",male,43,0,0,349226,7.8958,,S +1027,3,"Carlsson, Mr. Carl Robert",male,24,0,0,350409,7.8542,,S +1028,3,"Zakarian, Mr. Mapriededer",male,26.5,0,0,2656,7.225,,C +1029,2,"Schmidt, Mr. August",male,26,0,0,248659,13,,S +1030,3,"Drapkin, Miss. Jennie",female,23,0,0,SOTON/OQ 392083,8.05,,S +1031,3,"Goodwin, Mr. Charles Frederick",male,40,1,6,CA 2144,46.9,,S +1032,3,"Goodwin, Miss. Jessie Allis",female,10,5,2,CA 2144,46.9,,S +1033,1,"Daniels, Miss. Sarah",female,33,0,0,113781,151.55,,S +1034,1,"Ryerson, Mr. Arthur Larned",male,61,1,3,PC 17608,262.375,B57 B59 B63 B66,C +1035,2,"Beauchamp, Mr. Henry James",male,28,0,0,244358,26,,S +1036,1,"Lindeberg-Lind, Mr. Erik Gustaf (Mr Edward Lingrey"")""",male,42,0,0,17475,26.55,,S +1037,3,"Vander Planke, Mr. Julius",male,31,3,0,345763,18,,S +1038,1,"Hilliard, Mr. Herbert Henry",male,,0,0,17463,51.8625,E46,S +1039,3,"Davies, Mr. Evan",male,22,0,0,SC/A4 23568,8.05,,S +1040,1,"Crafton, Mr. John Bertram",male,,0,0,113791,26.55,,S +1041,2,"Lahtinen, Rev. William",male,30,1,1,250651,26,,S +1042,1,"Earnshaw, Mrs. Boulton (Olive Potter)",female,23,0,1,11767,83.1583,C54,C +1043,3,"Matinoff, Mr. Nicola",male,,0,0,349255,7.8958,,C +1044,3,"Storey, Mr. Thomas",male,60.5,0,0,3701,,,S +1045,3,"Klasen, Mrs. (Hulda Kristina Eugenia Lofqvist)",female,36,0,2,350405,12.1833,,S +1046,3,"Asplund, Master. Filip Oscar",male,13,4,2,347077,31.3875,,S +1047,3,"Duquemin, Mr. Joseph",male,24,0,0,S.O./P.P. 752,7.55,,S +1048,1,"Bird, Miss. Ellen",female,29,0,0,PC 17483,221.7792,C97,S +1049,3,"Lundin, Miss. Olga Elida",female,23,0,0,347469,7.8542,,S +1050,1,"Borebank, Mr. John James",male,42,0,0,110489,26.55,D22,S +1051,3,"Peacock, Mrs. 
Benjamin (Edith Nile)",female,26,0,2,SOTON/O.Q. 3101315,13.775,,S +1052,3,"Smyth, Miss. Julia",female,,0,0,335432,7.7333,,Q +1053,3,"Touma, Master. Georges Youssef",male,7,1,1,2650,15.2458,,C +1054,2,"Wright, Miss. Marion",female,26,0,0,220844,13.5,,S +1055,3,"Pearce, Mr. Ernest",male,,0,0,343271,7,,S +1056,2,"Peruschitz, Rev. Joseph Maria",male,41,0,0,237393,13,,S +1057,3,"Kink-Heilmann, Mrs. Anton (Luise Heilmann)",female,26,1,1,315153,22.025,,S +1058,1,"Brandeis, Mr. Emil",male,48,0,0,PC 17591,50.4958,B10,C +1059,3,"Ford, Mr. Edward Watson",male,18,2,2,W./C. 6608,34.375,,S +1060,1,"Cassebeer, Mrs. Henry Arthur Jr (Eleanor Genevieve Fosdick)",female,,0,0,17770,27.7208,,C +1061,3,"Hellstrom, Miss. Hilda Maria",female,22,0,0,7548,8.9625,,S +1062,3,"Lithman, Mr. Simon",male,,0,0,S.O./P.P. 251,7.55,,S +1063,3,"Zakarian, Mr. Ortin",male,27,0,0,2670,7.225,,C +1064,3,"Dyker, Mr. Adolf Fredrik",male,23,1,0,347072,13.9,,S +1065,3,"Torfa, Mr. Assad",male,,0,0,2673,7.2292,,C +1066,3,"Asplund, Mr. Carl Oscar Vilhelm Gustafsson",male,40,1,5,347077,31.3875,,S +1067,2,"Brown, Miss. Edith Eileen",female,15,0,2,29750,39,,S +1068,2,"Sincock, Miss. Maude",female,20,0,0,C.A. 33112,36.75,,S +1069,1,"Stengel, Mr. Charles Emil Henry",male,54,1,0,11778,55.4417,C116,C +1070,2,"Becker, Mrs. Allen Oliver (Nellie E Baumgardner)",female,36,0,3,230136,39,F4,S +1071,1,"Compton, Mrs. Alexander Taylor (Mary Eliza Ingersoll)",female,64,0,2,PC 17756,83.1583,E45,C +1072,2,"McCrie, Mr. James Matthew",male,30,0,0,233478,13,,S +1073,1,"Compton, Mr. Alexander Taylor Jr",male,37,1,1,PC 17756,83.1583,E52,C +1074,1,"Marvin, Mrs. Daniel Warner (Mary Graham Carmichael Farquarson)",female,18,1,0,113773,53.1,D30,S +1075,3,"Lane, Mr. Patrick",male,,0,0,7935,7.75,,Q +1076,1,"Douglas, Mrs. Frederick Charles (Mary Helene Baxter)",female,27,1,1,PC 17558,247.5208,B58 B60,C +1077,2,"Maybery, Mr. Frank Hubert",male,40,0,0,239059,16,,S +1078,2,"Phillips, Miss. Alice Frances Louisa",female,21,0,1,S.O./P.P. 
2,21,,S +1079,3,"Davies, Mr. Joseph",male,17,2,0,A/4 48873,8.05,,S +1080,3,"Sage, Miss. Ada",female,,8,2,CA. 2343,69.55,,S +1081,2,"Veal, Mr. James",male,40,0,0,28221,13,,S +1082,2,"Angle, Mr. William A",male,34,1,0,226875,26,,S +1083,1,"Salomon, Mr. Abraham L",male,,0,0,111163,26,,S +1084,3,"van Billiard, Master. Walter John",male,11.5,1,1,A/5. 851,14.5,,S +1085,2,"Lingane, Mr. John",male,61,0,0,235509,12.35,,Q +1086,2,"Drew, Master. Marshall Brines",male,8,0,2,28220,32.5,,S +1087,3,"Karlsson, Mr. Julius Konrad Eugen",male,33,0,0,347465,7.8542,,S +1088,1,"Spedden, Master. Robert Douglas",male,6,0,2,16966,134.5,E34,C +1089,3,"Nilsson, Miss. Berta Olivia",female,18,0,0,347066,7.775,,S +1090,2,"Baimbrigge, Mr. Charles Robert",male,23,0,0,C.A. 31030,10.5,,S +1091,3,"Rasmussen, Mrs. (Lena Jacobsen Solvang)",female,,0,0,65305,8.1125,,S +1092,3,"Murphy, Miss. Nora",female,,0,0,36568,15.5,,Q +1093,3,"Danbom, Master. Gilbert Sigvard Emanuel",male,0.33,0,2,347080,14.4,,S +1094,1,"Astor, Col. John Jacob",male,47,1,0,PC 17757,227.525,C62 C64,C +1095,2,"Quick, Miss. Winifred Vera",female,8,1,1,26360,26,,S +1096,2,"Andrew, Mr. Frank Thomas",male,25,0,0,C.A. 34050,10.5,,S +1097,1,"Omont, Mr. Alfred Fernand",male,,0,0,F.C. 12998,25.7417,,C +1098,3,"McGowan, Miss. Katherine",female,35,0,0,9232,7.75,,Q +1099,2,"Collett, Mr. Sidney C Stuart",male,24,0,0,28034,10.5,,S +1100,1,"Rosenbaum, Miss. Edith Louise",female,33,0,0,PC 17613,27.7208,A11,C +1101,3,"Delalic, Mr. Redjo",male,25,0,0,349250,7.8958,,S +1102,3,"Andersen, Mr. Albert Karvin",male,32,0,0,C 4001,22.525,,S +1103,3,"Finoli, Mr. Luigi",male,,0,0,SOTON/O.Q. 3101308,7.05,,S +1104,2,"Deacon, Mr. Percy William",male,17,0,0,S.O.C. 14879,73.5,,S +1105,2,"Howard, Mrs. Benjamin (Ellen Truelove Arman)",female,60,1,0,24065,26,,S +1106,3,"Andersson, Miss. Ida Augusta Margareta",female,38,4,2,347091,7.775,,S +1107,1,"Head, Mr. Christopher",male,42,0,0,113038,42.5,B11,S +1108,3,"Mahon, Miss. 
Bridget Delia",female,,0,0,330924,7.8792,,Q +1109,1,"Wick, Mr. George Dennick",male,57,1,1,36928,164.8667,,S +1110,1,"Widener, Mrs. George Dunton (Eleanor Elkins)",female,50,1,1,113503,211.5,C80,C +1111,3,"Thomson, Mr. Alexander Morrison",male,,0,0,32302,8.05,,S +1112,2,"Duran y More, Miss. Florentina",female,30,1,0,SC/PARIS 2148,13.8583,,C +1113,3,"Reynolds, Mr. Harold J",male,21,0,0,342684,8.05,,S +1114,2,"Cook, Mrs. (Selena Rogers)",female,22,0,0,W./C. 14266,10.5,F33,S +1115,3,"Karlsson, Mr. Einar Gervasius",male,21,0,0,350053,7.7958,,S +1116,1,"Candee, Mrs. Edward (Helen Churchill Hungerford)",female,53,0,0,PC 17606,27.4458,,C +1117,3,"Moubarek, Mrs. George (Omine Amenia"" Alexander)""",female,,0,2,2661,15.2458,,C +1118,3,"Asplund, Mr. Johan Charles",male,23,0,0,350054,7.7958,,S +1119,3,"McNeill, Miss. Bridget",female,,0,0,370368,7.75,,Q +1120,3,"Everett, Mr. Thomas James",male,40.5,0,0,C.A. 6212,15.1,,S +1121,2,"Hocking, Mr. Samuel James Metcalfe",male,36,0,0,242963,13,,S +1122,2,"Sweet, Mr. George Frederick",male,14,0,0,220845,65,,S +1123,1,"Willard, Miss. Constance",female,21,0,0,113795,26.55,,S +1124,3,"Wiklund, Mr. Karl Johan",male,21,1,0,3101266,6.4958,,S +1125,3,"Linehan, Mr. Michael",male,,0,0,330971,7.8792,,Q +1126,1,"Cumings, Mr. John Bradley",male,39,1,0,PC 17599,71.2833,C85,C +1127,3,"Vendel, Mr. Olof Edvin",male,20,0,0,350416,7.8542,,S +1128,1,"Warren, Mr. Frank Manley",male,64,1,0,110813,75.25,D37,C +1129,3,"Baccos, Mr. Raffull",male,20,0,0,2679,7.225,,C +1130,2,"Hiltunen, Miss. Marta",female,18,1,1,250650,13,,S +1131,1,"Douglas, Mrs. Walter Donald (Mahala Dutton)",female,48,1,0,PC 17761,106.425,C86,C +1132,1,"Lindstrom, Mrs. Carl Johan (Sigrid Posse)",female,55,0,0,112377,27.7208,,C +1133,2,"Christy, Mrs. (Alice Frances)",female,45,0,2,237789,30,,S +1134,1,"Spedden, Mr. Frederic Oakley",male,45,1,1,16966,134.5,E34,C +1135,3,"Hyman, Mr. Abraham",male,,0,0,3470,7.8875,,S +1136,3,"Johnston, Master. William Arthur Willie""""",male,,1,2,W./C. 
6607,23.45,,S +1137,1,"Kenyon, Mr. Frederick R",male,41,1,0,17464,51.8625,D21,S +1138,2,"Karnes, Mrs. J Frank (Claire Bennett)",female,22,0,0,F.C.C. 13534,21,,S +1139,2,"Drew, Mr. James Vivian",male,42,1,1,28220,32.5,,S +1140,2,"Hold, Mrs. Stephen (Annie Margaret Hill)",female,29,1,0,26707,26,,S +1141,3,"Khalil, Mrs. Betros (Zahie Maria"" Elias)""",female,,1,0,2660,14.4542,,C +1142,2,"West, Miss. Barbara J",female,0.92,1,2,C.A. 34651,27.75,,S +1143,3,"Abrahamsson, Mr. Abraham August Johannes",male,20,0,0,SOTON/O2 3101284,7.925,,S +1144,1,"Clark, Mr. Walter Miller",male,27,1,0,13508,136.7792,C89,C +1145,3,"Salander, Mr. Karl Johan",male,24,0,0,7266,9.325,,S +1146,3,"Wenzel, Mr. Linhart",male,32.5,0,0,345775,9.5,,S +1147,3,"MacKay, Mr. George William",male,,0,0,C.A. 42795,7.55,,S +1148,3,"Mahon, Mr. John",male,,0,0,AQ/4 3130,7.75,,Q +1149,3,"Niklasson, Mr. Samuel",male,28,0,0,363611,8.05,,S +1150,2,"Bentham, Miss. Lilian W",female,19,0,0,28404,13,,S +1151,3,"Midtsjo, Mr. Karl Albert",male,21,0,0,345501,7.775,,S +1152,3,"de Messemaeker, Mr. Guillaume Joseph",male,36.5,1,0,345572,17.4,,S +1153,3,"Nilsson, Mr. August Ferdinand",male,21,0,0,350410,7.8542,,S +1154,2,"Wells, Mrs. Arthur Henry (Addie"" Dart Trevaskis)""",female,29,0,2,29103,23,,S +1155,3,"Klasen, Miss. Gertrud Emilia",female,1,1,1,350405,12.1833,,S +1156,2,"Portaluppi, Mr. Emilio Ilario Giuseppe",male,30,0,0,C.A. 34644,12.7375,,C +1157,3,"Lyntakoff, Mr. Stanko",male,,0,0,349235,7.8958,,S +1158,1,"Chisholm, Mr. Roderick Robert Crispin",male,,0,0,112051,0,,S +1159,3,"Warren, Mr. Charles William",male,,0,0,C.A. 49867,7.55,,S +1160,3,"Howard, Miss. May Elizabeth",female,,0,0,A. 2. 39186,8.05,,S +1161,3,"Pokrnic, Mr. Mate",male,17,0,0,315095,8.6625,,S +1162,1,"McCaffry, Mr. Thomas Francis",male,46,0,0,13050,75.2417,C6,C +1163,3,"Fox, Mr. Patrick",male,,0,0,368573,7.75,,Q +1164,1,"Clark, Mrs. Walter Miller (Virginia McDowell)",female,26,1,0,13508,136.7792,C89,C +1165,3,"Lennon, Miss. 
Mary",female,,1,0,370371,15.5,,Q +1166,3,"Saade, Mr. Jean Nassr",male,,0,0,2676,7.225,,C +1167,2,"Bryhl, Miss. Dagmar Jenny Ingeborg ",female,20,1,0,236853,26,,S +1168,2,"Parker, Mr. Clifford Richard",male,28,0,0,SC 14888,10.5,,S +1169,2,"Faunthorpe, Mr. Harry",male,40,1,0,2926,26,,S +1170,2,"Ware, Mr. John James",male,30,1,0,CA 31352,21,,S +1171,2,"Oxenham, Mr. Percy Thomas",male,22,0,0,W./C. 14260,10.5,,S +1172,3,"Oreskovic, Miss. Jelka",female,23,0,0,315085,8.6625,,S +1173,3,"Peacock, Master. Alfred Edward",male,0.75,1,1,SOTON/O.Q. 3101315,13.775,,S +1174,3,"Fleming, Miss. Honora",female,,0,0,364859,7.75,,Q +1175,3,"Touma, Miss. Maria Youssef",female,9,1,1,2650,15.2458,,C +1176,3,"Rosblom, Miss. Salli Helena",female,2,1,1,370129,20.2125,,S +1177,3,"Dennis, Mr. William",male,36,0,0,A/5 21175,7.25,,S +1178,3,"Franklin, Mr. Charles (Charles Fardon)",male,,0,0,SOTON/O.Q. 3101314,7.25,,S +1179,1,"Snyder, Mr. John Pillsbury",male,24,1,0,21228,82.2667,B45,S +1180,3,"Mardirosian, Mr. Sarkis",male,,0,0,2655,7.2292,F E46,C +1181,3,"Ford, Mr. Arthur",male,,0,0,A/5 1478,8.05,,S +1182,1,"Rheims, Mr. George Alexander Lucien",male,,0,0,PC 17607,39.6,,S +1183,3,"Daly, Miss. Margaret Marcella Maggie""""",female,30,0,0,382650,6.95,,Q +1184,3,"Nasr, Mr. Mustafa",male,,0,0,2652,7.2292,,C +1185,1,"Dodge, Dr. Washington",male,53,1,1,33638,81.8583,A34,S +1186,3,"Wittevrongel, Mr. Camille",male,36,0,0,345771,9.5,,S +1187,3,"Angheloff, Mr. Minko",male,26,0,0,349202,7.8958,,S +1188,2,"Laroche, Miss. Louise",female,1,1,2,SC/Paris 2123,41.5792,,C +1189,3,"Samaan, Mr. Hanna",male,,2,0,2662,21.6792,,C +1190,1,"Loring, Mr. Joseph Holland",male,30,0,0,113801,45.5,,S +1191,3,"Johansson, Mr. Nils",male,29,0,0,347467,7.8542,,S +1192,3,"Olsson, Mr. Oscar Wilhelm",male,32,0,0,347079,7.775,,S +1193,2,"Malachard, Mr. Noel",male,,0,0,237735,15.0458,D,C +1194,2,"Phillips, Mr. Escott Robert",male,43,0,1,S.O./P.P. 2,21,,S +1195,3,"Pokrnic, Mr. Tome",male,24,0,0,315092,8.6625,,S +1196,3,"McCarthy, Miss. 
Catherine Katie""""",female,,0,0,383123,7.75,,Q +1197,1,"Crosby, Mrs. Edward Gifford (Catherine Elizabeth Halstead)",female,64,1,1,112901,26.55,B26,S +1198,1,"Allison, Mr. Hudson Joshua Creighton",male,30,1,2,113781,151.55,C22 C26,S +1199,3,"Aks, Master. Philip Frank",male,0.83,0,1,392091,9.35,,S +1200,1,"Hays, Mr. Charles Melville",male,55,1,1,12749,93.5,B69,S +1201,3,"Hansen, Mrs. Claus Peter (Jennie L Howard)",female,45,1,0,350026,14.1083,,S +1202,3,"Cacic, Mr. Jego Grga",male,18,0,0,315091,8.6625,,S +1203,3,"Vartanian, Mr. David",male,22,0,0,2658,7.225,,C +1204,3,"Sadowitz, Mr. Harry",male,,0,0,LP 1588,7.575,,S +1205,3,"Carr, Miss. Jeannie",female,37,0,0,368364,7.75,,Q +1206,1,"White, Mrs. John Stuart (Ella Holmes)",female,55,0,0,PC 17760,135.6333,C32,C +1207,3,"Hagardon, Miss. Kate",female,17,0,0,AQ/3. 30631,7.7333,,Q +1208,1,"Spencer, Mr. William Augustus",male,57,1,0,PC 17569,146.5208,B78,C +1209,2,"Rogers, Mr. Reginald Harry",male,19,0,0,28004,10.5,,S +1210,3,"Jonsson, Mr. Nils Hilding",male,27,0,0,350408,7.8542,,S +1211,2,"Jefferys, Mr. Ernest Wilfred",male,22,2,0,C.A. 31029,31.5,,S +1212,3,"Andersson, Mr. Johan Samuel",male,26,0,0,347075,7.775,,S +1213,3,"Krekorian, Mr. Neshan",male,25,0,0,2654,7.2292,F E57,C +1214,2,"Nesson, Mr. Israel",male,26,0,0,244368,13,F2,S +1215,1,"Rowe, Mr. Alfred G",male,33,0,0,113790,26.55,,S +1216,1,"Kreuchen, Miss. Emilie",female,39,0,0,24160,211.3375,,S +1217,3,"Assam, Mr. Ali",male,23,0,0,SOTON/O.Q. 3101309,7.05,,S +1218,2,"Becker, Miss. Ruth Elizabeth",female,12,2,1,230136,39,F4,S +1219,1,"Rosenshine, Mr. George (Mr George Thorne"")""",male,46,0,0,PC 17585,79.2,,C +1220,2,"Clarke, Mr. Charles Valentine",male,29,1,0,2003,26,,S +1221,2,"Enander, Mr. Ingvar",male,21,0,0,236854,13,,S +1222,2,"Davies, Mrs. John Morgan (Elizabeth Agnes Mary White) ",female,48,0,2,C.A. 33112,36.75,,S +1223,1,"Dulles, Mr. William Crothers",male,39,0,0,PC 17580,29.7,A18,C +1224,3,"Thomas, Mr. Tannous",male,,0,0,2684,7.225,,C +1225,3,"Nakid, Mrs. 
Said (Waika Mary"" Mowad)""",female,19,1,1,2653,15.7417,,C +1226,3,"Cor, Mr. Ivan",male,27,0,0,349229,7.8958,,S +1227,1,"Maguire, Mr. John Edward",male,30,0,0,110469,26,C106,S +1228,2,"de Brito, Mr. Jose Joaquim",male,32,0,0,244360,13,,S +1229,3,"Elias, Mr. Joseph",male,39,0,2,2675,7.2292,,C +1230,2,"Denbury, Mr. Herbert",male,25,0,0,C.A. 31029,31.5,,S +1231,3,"Betros, Master. Seman",male,,0,0,2622,7.2292,,C +1232,2,"Fillbrook, Mr. Joseph Charles",male,18,0,0,C.A. 15185,10.5,,S +1233,3,"Lundstrom, Mr. Thure Edvin",male,32,0,0,350403,7.5792,,S +1234,3,"Sage, Mr. John George",male,,1,9,CA. 2343,69.55,,S +1235,1,"Cardeza, Mrs. James Warburton Martinez (Charlotte Wardle Drake)",female,58,0,1,PC 17755,512.3292,B51 B53 B55,C +1236,3,"van Billiard, Master. James William",male,,1,1,A/5. 851,14.5,,S +1237,3,"Abelseth, Miss. Karen Marie",female,16,0,0,348125,7.65,,S +1238,2,"Botsford, Mr. William Hull",male,26,0,0,237670,13,,S +1239,3,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38,0,0,2688,7.2292,,C +1240,2,"Giles, Mr. Ralph",male,24,0,0,248726,13.5,,S +1241,2,"Walcroft, Miss. Nellie",female,31,0,0,F.C.C. 13528,21,,S +1242,1,"Greenfield, Mrs. Leo David (Blanche Strouse)",female,45,0,1,PC 17759,63.3583,D10 D12,C +1243,2,"Stokes, Mr. Philip Joseph",male,25,0,0,F.C.C. 13540,10.5,,S +1244,2,"Dibden, Mr. William",male,18,0,0,S.O.C. 14879,73.5,,S +1245,2,"Herman, Mr. Samuel",male,49,1,2,220845,65,,S +1246,3,"Dean, Miss. Elizabeth Gladys Millvina""""",female,0.17,1,2,C.A. 2315,20.575,,S +1247,1,"Julian, Mr. Henry Forbes",male,50,0,0,113044,26,E60,S +1248,1,"Brown, Mrs. John Murray (Caroline Lane Lamson)",female,59,2,0,11769,51.4792,C101,S +1249,3,"Lockyer, Mr. Edward",male,,0,0,1222,7.8792,,S +1250,3,"O'Keefe, Mr. Patrick",male,,0,0,368402,7.75,,Q +1251,3,"Lindell, Mrs. Edvard Bengtsson (Elin Gerda Persson)",female,30,1,0,349910,15.55,,S +1252,3,"Sage, Master. William Henry",male,14.5,8,2,CA. 2343,69.55,,S +1253,2,"Mallet, Mrs. 
Albert (Antoinette Magnin)",female,24,1,1,S.C./PARIS 2079,37.0042,,C +1254,2,"Ware, Mrs. John James (Florence Louise Long)",female,31,0,0,CA 31352,21,,S +1255,3,"Strilic, Mr. Ivan",male,27,0,0,315083,8.6625,,S +1256,1,"Harder, Mrs. George Achilles (Dorothy Annan)",female,25,1,0,11765,55.4417,E50,C +1257,3,"Sage, Mrs. John (Annie Bullen)",female,,1,9,CA. 2343,69.55,,S +1258,3,"Caram, Mr. Joseph",male,,1,0,2689,14.4583,,C +1259,3,"Riihivouri, Miss. Susanna Juhantytar Sanni""""",female,22,0,0,3101295,39.6875,,S +1260,1,"Gibson, Mrs. Leonard (Pauline C Boeson)",female,45,0,1,112378,59.4,,C +1261,2,"Pallas y Castello, Mr. Emilio",male,29,0,0,SC/PARIS 2147,13.8583,,C +1262,2,"Giles, Mr. Edgar",male,21,1,0,28133,11.5,,S +1263,1,"Wilson, Miss. Helen Alice",female,31,0,0,16966,134.5,E39 E41,C +1264,1,"Ismay, Mr. Joseph Bruce",male,49,0,0,112058,0,B52 B54 B56,S +1265,2,"Harbeck, Mr. William H",male,44,0,0,248746,13,,S +1266,1,"Dodge, Mrs. Washington (Ruth Vidaver)",female,54,1,1,33638,81.8583,A34,S +1267,1,"Bowen, Miss. Grace Scott",female,45,0,0,PC 17608,262.375,,C +1268,3,"Kink, Miss. Maria",female,22,2,0,315152,8.6625,,S +1269,2,"Cotterill, Mr. Henry Harry""""",male,21,0,0,29107,11.5,,S +1270,1,"Hipkins, Mr. William Edward",male,55,0,0,680,50,C39,S +1271,3,"Asplund, Master. Carl Edgar",male,5,4,2,347077,31.3875,,S +1272,3,"O'Connor, Mr. Patrick",male,,0,0,366713,7.75,,Q +1273,3,"Foley, Mr. Joseph",male,26,0,0,330910,7.8792,,Q +1274,3,"Risien, Mrs. Samuel (Emma)",female,,0,0,364498,14.5,,S +1275,3,"McNamee, Mrs. Neal (Eileen O'Leary)",female,19,1,0,376566,16.1,,S +1276,2,"Wheeler, Mr. Edwin Frederick""""",male,,0,0,SC/PARIS 2159,12.875,,S +1277,2,"Herman, Miss. Kate",female,24,1,2,220845,65,,S +1278,3,"Aronsson, Mr. Ernst Axel Algot",male,24,0,0,349911,7.775,,S +1279,2,"Ashby, Mr. John",male,57,0,0,244346,13,,S +1280,3,"Canavan, Mr. Patrick",male,21,0,0,364858,7.75,,Q +1281,3,"Palsson, Master. Paul Folke",male,6,3,1,349909,21.075,,S +1282,1,"Payne, Mr. 
Vivian Ponsonby",male,23,0,0,12749,93.5,B24,S +1283,1,"Lines, Mrs. Ernest H (Elizabeth Lindsey James)",female,51,0,1,PC 17592,39.4,D28,S +1284,3,"Abbott, Master. Eugene Joseph",male,13,0,2,C.A. 2673,20.25,,S +1285,2,"Gilbert, Mr. William",male,47,0,0,C.A. 30769,10.5,,S +1286,3,"Kink-Heilmann, Mr. Anton",male,29,3,1,315153,22.025,,S +1287,1,"Smith, Mrs. Lucien Philip (Mary Eloise Hughes)",female,18,1,0,13695,60,C31,S +1288,3,"Colbert, Mr. Patrick",male,24,0,0,371109,7.25,,Q +1289,1,"Frolicher-Stehli, Mrs. Maxmillian (Margaretha Emerentia Stehli)",female,48,1,1,13567,79.2,B41,C +1290,3,"Larsson-Rondberg, Mr. Edvard A",male,22,0,0,347065,7.775,,S +1291,3,"Conlon, Mr. Thomas Henry",male,31,0,0,21332,7.7333,,Q +1292,1,"Bonnell, Miss. Caroline",female,30,0,0,36928,164.8667,C7,S +1293,2,"Gale, Mr. Harry",male,38,1,0,28664,21,,S +1294,1,"Gibson, Miss. Dorothy Winifred",female,22,0,1,112378,59.4,,C +1295,1,"Carrau, Mr. Jose Pedro",male,17,0,0,113059,47.1,,S +1296,1,"Frauenthal, Mr. Isaac Gerald",male,43,1,0,17765,27.7208,D40,C +1297,2,"Nourney, Mr. Alfred (Baron von Drachstedt"")""",male,20,0,0,SC/PARIS 2166,13.8625,D38,C +1298,2,"Ware, Mr. William Jeffery",male,23,1,0,28666,10.5,,S +1299,1,"Widener, Mr. George Dunton",male,50,1,1,113503,211.5,C80,C +1300,3,"Riordan, Miss. Johanna Hannah""""",female,,0,0,334915,7.7208,,Q +1301,3,"Peacock, Miss. Treasteall",female,3,1,1,SOTON/O.Q. 3101315,13.775,,S +1302,3,"Naughton, Miss. Hannah",female,,0,0,365237,7.75,,Q +1303,1,"Minahan, Mrs. William Edward (Lillian E Thorpe)",female,37,1,0,19928,90,C78,Q +1304,3,"Henriksson, Miss. Jenny Lovisa",female,28,0,0,347086,7.775,,S +1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.05,,S +1306,1,"Oliva y Ocana, Dona. Fermina",female,39,0,0,PC 17758,108.9,C105,C +1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.25,,S +1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.05,,S +1309,3,"Peter, Master. 
Michael J",male,,1,1,2668,22.3583,,C diff --git a/data/train.csv b/data/train.csv new file mode 100644 index 0000000..63b68ab --- /dev/null +++ b/data/train.csv @@ -0,0 +1,892 @@ +PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked +1,0,3,"Braund, Mr. Owen Harris",male,22,1,0,A/5 21171,7.25,,S +2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38,1,0,PC 17599,71.2833,C85,C +3,1,3,"Heikkinen, Miss. Laina",female,26,0,0,STON/O2. 3101282,7.925,,S +4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35,1,0,113803,53.1,C123,S +5,0,3,"Allen, Mr. William Henry",male,35,0,0,373450,8.05,,S +6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q +7,0,1,"McCarthy, Mr. Timothy J",male,54,0,0,17463,51.8625,E46,S +8,0,3,"Palsson, Master. Gosta Leonard",male,2,3,1,349909,21.075,,S +9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27,0,2,347742,11.1333,,S +10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14,1,0,237736,30.0708,,C +11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4,1,1,PP 9549,16.7,G6,S +12,1,1,"Bonnell, Miss. Elizabeth",female,58,0,0,113783,26.55,C103,S +13,0,3,"Saundercock, Mr. William Henry",male,20,0,0,A/5. 2151,8.05,,S +14,0,3,"Andersson, Mr. Anders Johan",male,39,1,5,347082,31.275,,S +15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14,0,0,350406,7.8542,,S +16,1,2,"Hewlett, Mrs. (Mary D Kingcome) ",female,55,0,0,248706,16,,S +17,0,3,"Rice, Master. Eugene",male,2,4,1,382652,29.125,,Q +18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13,,S +19,0,3,"Vander Planke, Mrs. Julius (Emelia Maria Vandemoortele)",female,31,1,0,345763,18,,S +20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C +21,0,2,"Fynney, Mr. Joseph J",male,35,0,0,239865,26,,S +22,1,2,"Beesley, Mr. Lawrence",male,34,0,0,248698,13,D56,S +23,1,3,"McGowan, Miss. Anna ""Annie""",female,15,0,0,330923,8.0292,,Q +24,1,1,"Sloper, Mr. William Thompson",male,28,0,0,113788,35.5,A6,S +25,0,3,"Palsson, Miss. 
Torborg Danira",female,8,3,1,349909,21.075,,S +26,1,3,"Asplund, Mrs. Carl Oscar (Selma Augusta Emilia Johansson)",female,38,1,5,347077,31.3875,,S +27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C +28,0,1,"Fortune, Mr. Charles Alexander",male,19,3,2,19950,263,C23 C25 C27,S +29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q +30,0,3,"Todoroff, Mr. Lalio",male,,0,0,349216,7.8958,,S +31,0,1,"Uruchurtu, Don. Manuel E",male,40,0,0,PC 17601,27.7208,,C +32,1,1,"Spencer, Mrs. William Augustus (Marie Eugenie)",female,,1,0,PC 17569,146.5208,B78,C +33,1,3,"Glynn, Miss. Mary Agatha",female,,0,0,335677,7.75,,Q +34,0,2,"Wheadon, Mr. Edward H",male,66,0,0,C.A. 24579,10.5,,S +35,0,1,"Meyer, Mr. Edgar Joseph",male,28,1,0,PC 17604,82.1708,,C +36,0,1,"Holverson, Mr. Alexander Oskar",male,42,1,0,113789,52,,S +37,1,3,"Mamee, Mr. Hanna",male,,0,0,2677,7.2292,,C +38,0,3,"Cann, Mr. Ernest Charles",male,21,0,0,A./5. 2152,8.05,,S +39,0,3,"Vander Planke, Miss. Augusta Maria",female,18,2,0,345764,18,,S +40,1,3,"Nicola-Yarred, Miss. Jamila",female,14,1,0,2651,11.2417,,C +41,0,3,"Ahlin, Mrs. Johan (Johanna Persdotter Larsson)",female,40,1,0,7546,9.475,,S +42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann Wonnacott)",female,27,1,0,11668,21,,S +43,0,3,"Kraeff, Mr. Theodor",male,,0,0,349253,7.8958,,C +44,1,2,"Laroche, Miss. Simonne Marie Anne Andree",female,3,1,2,SC/Paris 2123,41.5792,,C +45,1,3,"Devaney, Miss. Margaret Delia",female,19,0,0,330958,7.8792,,Q +46,0,3,"Rogers, Mr. William John",male,,0,0,S.C./A.4. 23567,8.05,,S +47,0,3,"Lennon, Mr. Denis",male,,1,0,370371,15.5,,Q +48,1,3,"O'Driscoll, Miss. Bridget",female,,0,0,14311,7.75,,Q +49,0,3,"Samaan, Mr. Youssef",male,,2,0,2662,21.6792,,C +50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18,1,0,349237,17.8,,S +51,0,3,"Panula, Master. Juha Niilo",male,7,4,1,3101295,39.6875,,S +52,0,3,"Nosworthy, Mr. Richard Cater",male,21,0,0,A/4. 39886,7.8,,S +53,1,1,"Harper, Mrs. 
Henry Sleeper (Myna Haxtun)",female,49,1,0,PC 17572,76.7292,D33,C +54,1,2,"Faunthorpe, Mrs. Lizzie (Elizabeth Anne Wilkinson)",female,29,1,0,2926,26,,S +55,0,1,"Ostby, Mr. Engelhart Cornelius",male,65,0,1,113509,61.9792,B30,C +56,1,1,"Woolner, Mr. Hugh",male,,0,0,19947,35.5,C52,S +57,1,2,"Rugg, Miss. Emily",female,21,0,0,C.A. 31026,10.5,,S +58,0,3,"Novel, Mr. Mansouer",male,28.5,0,0,2697,7.2292,,C +59,1,2,"West, Miss. Constance Mirium",female,5,1,2,C.A. 34651,27.75,,S +60,0,3,"Goodwin, Master. William Frederick",male,11,5,2,CA 2144,46.9,,S +61,0,3,"Sirayanian, Mr. Orsen",male,22,0,0,2669,7.2292,,C +62,1,1,"Icard, Miss. Amelie",female,38,0,0,113572,80,B28, +63,0,1,"Harris, Mr. Henry Birkhardt",male,45,1,0,36973,83.475,C83,S +64,0,3,"Skoog, Master. Harald",male,4,3,2,347088,27.9,,S +65,0,1,"Stewart, Mr. Albert A",male,,0,0,PC 17605,27.7208,,C +66,1,3,"Moubarek, Master. Gerios",male,,1,1,2661,15.2458,,C +67,1,2,"Nye, Mrs. (Elizabeth Ramell)",female,29,0,0,C.A. 29395,10.5,F33,S +68,0,3,"Crease, Mr. Ernest James",male,19,0,0,S.P. 3464,8.1583,,S +69,1,3,"Andersson, Miss. Erna Alexandra",female,17,4,2,3101281,7.925,,S +70,0,3,"Kink, Mr. Vincenz",male,26,2,0,315151,8.6625,,S +71,0,2,"Jenkin, Mr. Stephen Curnow",male,32,0,0,C.A. 33111,10.5,,S +72,0,3,"Goodwin, Miss. Lillian Amy",female,16,5,2,CA 2144,46.9,,S +73,0,2,"Hood, Mr. Ambrose Jr",male,21,0,0,S.O.C. 14879,73.5,,S +74,0,3,"Chronopoulos, Mr. Apostolos",male,26,1,0,2680,14.4542,,C +75,1,3,"Bing, Mr. Lee",male,32,0,0,1601,56.4958,,S +76,0,3,"Moen, Mr. Sigurd Hansen",male,25,0,0,348123,7.65,F G73,S +77,0,3,"Staneff, Mr. Ivan",male,,0,0,349208,7.8958,,S +78,0,3,"Moutal, Mr. Rahamin Haim",male,,0,0,374746,8.05,,S +79,1,2,"Caldwell, Master. Alden Gates",male,0.83,0,2,248738,29,,S +80,1,3,"Dowdell, Miss. Elizabeth",female,30,0,0,364516,12.475,,S +81,0,3,"Waelens, Mr. Achille",male,22,0,0,345767,9,,S +82,1,3,"Sheerlinck, Mr. Jan Baptist",male,29,0,0,345779,9.5,,S +83,1,3,"McDermott, Miss. 
Brigdet Delia",female,,0,0,330932,7.7875,,Q +84,0,1,"Carrau, Mr. Francisco M",male,28,0,0,113059,47.1,,S +85,1,2,"Ilett, Miss. Bertha",female,17,0,0,SO/C 14885,10.5,,S +86,1,3,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gustafsson)",female,33,3,0,3101278,15.85,,S +87,0,3,"Ford, Mr. William Neal",male,16,1,3,W./C. 6608,34.375,,S +88,0,3,"Slocovski, Mr. Selman Francis",male,,0,0,SOTON/OQ 392086,8.05,,S +89,1,1,"Fortune, Miss. Mabel Helen",female,23,3,2,19950,263,C23 C25 C27,S +90,0,3,"Celotti, Mr. Francesco",male,24,0,0,343275,8.05,,S +91,0,3,"Christmann, Mr. Emil",male,29,0,0,343276,8.05,,S +92,0,3,"Andreasson, Mr. Paul Edvin",male,20,0,0,347466,7.8542,,S +93,0,1,"Chaffee, Mr. Herbert Fuller",male,46,1,0,W.E.P. 5734,61.175,E31,S +94,0,3,"Dean, Mr. Bertram Frank",male,26,1,2,C.A. 2315,20.575,,S +95,0,3,"Coxon, Mr. Daniel",male,59,0,0,364500,7.25,,S +96,0,3,"Shorney, Mr. Charles Joseph",male,,0,0,374910,8.05,,S +97,0,1,"Goldschmidt, Mr. George B",male,71,0,0,PC 17754,34.6542,A5,C +98,1,1,"Greenfield, Mr. William Bertram",male,23,0,1,PC 17759,63.3583,D10 D12,C +99,1,2,"Doling, Mrs. John T (Ada Julia Bone)",female,34,0,1,231919,23,,S +100,0,2,"Kantor, Mr. Sinai",male,34,1,0,244367,26,,S +101,0,3,"Petranec, Miss. Matilda",female,28,0,0,349245,7.8958,,S +102,0,3,"Petroff, Mr. Pastcho (""Pentcho"")",male,,0,0,349215,7.8958,,S +103,0,1,"White, Mr. Richard Frasar",male,21,0,1,35281,77.2875,D26,S +104,0,3,"Johansson, Mr. Gustaf Joel",male,33,0,0,7540,8.6542,,S +105,0,3,"Gustafsson, Mr. Anders Vilhelm",male,37,2,0,3101276,7.925,,S +106,0,3,"Mionoff, Mr. Stoytcho",male,28,0,0,349207,7.8958,,S +107,1,3,"Salkjelsvik, Miss. Anna Kristine",female,21,0,0,343120,7.65,,S +108,1,3,"Moss, Mr. Albert Johan",male,,0,0,312991,7.775,,S +109,0,3,"Rekic, Mr. Tido",male,38,0,0,349249,7.8958,,S +110,1,3,"Moran, Miss. Bertha",female,,1,0,371110,24.15,,Q +111,0,1,"Porter, Mr. Walter Chamberlain",male,47,0,0,110465,52,C110,S +112,0,3,"Zabour, Miss. 
Hileni",female,14.5,1,0,2665,14.4542,,C +113,0,3,"Barton, Mr. David John",male,22,0,0,324669,8.05,,S +114,0,3,"Jussila, Miss. Katriina",female,20,1,0,4136,9.825,,S +115,0,3,"Attalah, Miss. Malake",female,17,0,0,2627,14.4583,,C +116,0,3,"Pekoniemi, Mr. Edvard",male,21,0,0,STON/O 2. 3101294,7.925,,S +117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q +118,0,2,"Turpin, Mr. William John Robert",male,29,1,0,11668,21,,S +119,0,1,"Baxter, Mr. Quigg Edmond",male,24,0,1,PC 17558,247.5208,B58 B60,C +120,0,3,"Andersson, Miss. Ellis Anna Maria",female,2,4,2,347082,31.275,,S +121,0,2,"Hickman, Mr. Stanley George",male,21,2,0,S.O.C. 14879,73.5,,S +122,0,3,"Moore, Mr. Leonard Charles",male,,0,0,A4. 54510,8.05,,S +123,0,2,"Nasser, Mr. Nicholas",male,32.5,1,0,237736,30.0708,,C +124,1,2,"Webber, Miss. Susan",female,32.5,0,0,27267,13,E101,S +125,0,1,"White, Mr. Percival Wayland",male,54,0,1,35281,77.2875,D26,S +126,1,3,"Nicola-Yarred, Master. Elias",male,12,1,0,2651,11.2417,,C +127,0,3,"McMahon, Mr. Martin",male,,0,0,370372,7.75,,Q +128,1,3,"Madsen, Mr. Fridtjof Arne",male,24,0,0,C 17369,7.1417,,S +129,1,3,"Peter, Miss. Anna",female,,1,1,2668,22.3583,F E69,C +130,0,3,"Ekstrom, Mr. Johan",male,45,0,0,347061,6.975,,S +131,0,3,"Drazenoic, Mr. Jozef",male,33,0,0,349241,7.8958,,C +132,0,3,"Coelho, Mr. Domingos Fernandeo",male,20,0,0,SOTON/O.Q. 3101307,7.05,,S +133,0,3,"Robins, Mrs. Alexander A (Grace Charity Laury)",female,47,1,0,A/5. 3337,14.5,,S +134,1,2,"Weisz, Mrs. Leopold (Mathilde Francoise Pede)",female,29,1,0,228414,26,,S +135,0,2,"Sobey, Mr. Samuel James Hayden",male,25,0,0,C.A. 29178,13,,S +136,0,2,"Richard, Mr. Emile",male,23,0,0,SC/PARIS 2133,15.0458,,C +137,1,1,"Newsom, Miss. Helen Monypeny",female,19,0,2,11752,26.2833,D47,S +138,0,1,"Futrelle, Mr. Jacques Heath",male,37,1,0,113803,53.1,C123,S +139,0,3,"Osen, Mr. Olaf Elon",male,16,0,0,7534,9.2167,,S +140,0,1,"Giglio, Mr. Victor",male,24,0,0,PC 17593,79.2,B86,C +141,0,3,"Boulos, Mrs. 
Joseph (Sultana)",female,,0,2,2678,15.2458,,C +142,1,3,"Nysten, Miss. Anna Sofia",female,22,0,0,347081,7.75,,S +143,1,3,"Hakkarainen, Mrs. Pekka Pietari (Elin Matilda Dolck)",female,24,1,0,STON/O2. 3101279,15.85,,S +144,0,3,"Burke, Mr. Jeremiah",male,19,0,0,365222,6.75,,Q +145,0,2,"Andrew, Mr. Edgardo Samuel",male,18,0,0,231945,11.5,,S +146,0,2,"Nicholls, Mr. Joseph Charles",male,19,1,1,C.A. 33112,36.75,,S +147,1,3,"Andersson, Mr. August Edvard (""Wennerstrom"")",male,27,0,0,350043,7.7958,,S +148,0,3,"Ford, Miss. Robina Maggie ""Ruby""",female,9,2,2,W./C. 6608,34.375,,S +149,0,2,"Navratil, Mr. Michel (""Louis M Hoffman"")",male,36.5,0,2,230080,26,F2,S +150,0,2,"Byles, Rev. Thomas Roussel Davids",male,42,0,0,244310,13,,S +151,0,2,"Bateman, Rev. Robert James",male,51,0,0,S.O.P. 1166,12.525,,S +152,1,1,"Pears, Mrs. Thomas (Edith Wearne)",female,22,1,0,113776,66.6,C2,S +153,0,3,"Meo, Mr. Alfonzo",male,55.5,0,0,A.5. 11206,8.05,,S +154,0,3,"van Billiard, Mr. Austin Blyler",male,40.5,0,2,A/5. 851,14.5,,S +155,0,3,"Olsen, Mr. Ole Martin",male,,0,0,Fa 265302,7.3125,,S +156,0,1,"Williams, Mr. Charles Duane",male,51,0,1,PC 17597,61.3792,,C +157,1,3,"Gilnagh, Miss. Katherine ""Katie""",female,16,0,0,35851,7.7333,,Q +158,0,3,"Corn, Mr. Harry",male,30,0,0,SOTON/OQ 392090,8.05,,S +159,0,3,"Smiljanic, Mr. Mile",male,,0,0,315037,8.6625,,S +160,0,3,"Sage, Master. Thomas Henry",male,,8,2,CA. 2343,69.55,,S +161,0,3,"Cribb, Mr. John Hatfield",male,44,0,1,371362,16.1,,S +162,1,2,"Watt, Mrs. James (Elizabeth ""Bessie"" Inglis Milne)",female,40,0,0,C.A. 33595,15.75,,S +163,0,3,"Bengtsson, Mr. John Viktor",male,26,0,0,347068,7.775,,S +164,0,3,"Calic, Mr. Jovo",male,17,0,0,315093,8.6625,,S +165,0,3,"Panula, Master. Eino Viljami",male,1,4,1,3101295,39.6875,,S +166,1,3,"Goldsmith, Master. Frank John William ""Frankie""",male,9,0,2,363291,20.525,,S +167,1,1,"Chibnall, Mrs. (Edith Martha Bowerman)",female,,0,1,113505,55,E33,S +168,0,3,"Skoog, Mrs. 
William (Anna Bernhardina Karlsson)",female,45,1,4,347088,27.9,,S +169,0,1,"Baumann, Mr. John D",male,,0,0,PC 17318,25.925,,S +170,0,3,"Ling, Mr. Lee",male,28,0,0,1601,56.4958,,S +171,0,1,"Van der hoef, Mr. Wyckoff",male,61,0,0,111240,33.5,B19,S +172,0,3,"Rice, Master. Arthur",male,4,4,1,382652,29.125,,Q +173,1,3,"Johnson, Miss. Eleanor Ileen",female,1,1,1,347742,11.1333,,S +174,0,3,"Sivola, Mr. Antti Wilhelm",male,21,0,0,STON/O 2. 3101280,7.925,,S +175,0,1,"Smith, Mr. James Clinch",male,56,0,0,17764,30.6958,A7,C +176,0,3,"Klasen, Mr. Klas Albin",male,18,1,1,350404,7.8542,,S +177,0,3,"Lefebre, Master. Henry Forbes",male,,3,1,4133,25.4667,,S +178,0,1,"Isham, Miss. Ann Elizabeth",female,50,0,0,PC 17595,28.7125,C49,C +179,0,2,"Hale, Mr. Reginald",male,30,0,0,250653,13,,S +180,0,3,"Leonard, Mr. Lionel",male,36,0,0,LINE,0,,S +181,0,3,"Sage, Miss. Constance Gladys",female,,8,2,CA. 2343,69.55,,S +182,0,2,"Pernot, Mr. Rene",male,,0,0,SC/PARIS 2131,15.05,,C +183,0,3,"Asplund, Master. Clarence Gustaf Hugo",male,9,4,2,347077,31.3875,,S +184,1,2,"Becker, Master. Richard F",male,1,2,1,230136,39,F4,S +185,1,3,"Kink-Heilmann, Miss. Luise Gretchen",female,4,0,2,315153,22.025,,S +186,0,1,"Rood, Mr. Hugh Roscoe",male,,0,0,113767,50,A32,S +187,1,3,"O'Brien, Mrs. Thomas (Johanna ""Hannah"" Godfrey)",female,,1,0,370365,15.5,,Q +188,1,1,"Romaine, Mr. Charles Hallace (""Mr C Rolmane"")",male,45,0,0,111428,26.55,,S +189,0,3,"Bourke, Mr. John",male,40,1,1,364849,15.5,,Q +190,0,3,"Turcin, Mr. Stjepan",male,36,0,0,349247,7.8958,,S +191,1,2,"Pinsky, Mrs. (Rosa)",female,32,0,0,234604,13,,S +192,0,2,"Carbines, Mr. William",male,19,0,0,28424,13,,S +193,1,3,"Andersen-Jensen, Miss. Carla Christine Nielsine",female,19,1,0,350046,7.8542,,S +194,1,2,"Navratil, Master. Michel M",male,3,1,1,230080,26,F2,S +195,1,1,"Brown, Mrs. James Joseph (Margaret Tobin)",female,44,0,0,PC 17610,27.7208,B4,C +196,1,1,"Lurette, Miss. Elise",female,58,0,0,PC 17569,146.5208,B80,C +197,0,3,"Mernagh, Mr. 
Robert",male,,0,0,368703,7.75,,Q +198,0,3,"Olsen, Mr. Karl Siegwart Andreas",male,42,0,1,4579,8.4042,,S +199,1,3,"Madigan, Miss. Margaret ""Maggie""",female,,0,0,370370,7.75,,Q +200,0,2,"Yrois, Miss. Henriette (""Mrs Harbeck"")",female,24,0,0,248747,13,,S +201,0,3,"Vande Walle, Mr. Nestor Cyriel",male,28,0,0,345770,9.5,,S +202,0,3,"Sage, Mr. Frederick",male,,8,2,CA. 2343,69.55,,S +203,0,3,"Johanson, Mr. Jakob Alfred",male,34,0,0,3101264,6.4958,,S +204,0,3,"Youseff, Mr. Gerious",male,45.5,0,0,2628,7.225,,C +205,1,3,"Cohen, Mr. Gurshon ""Gus""",male,18,0,0,A/5 3540,8.05,,S +206,0,3,"Strom, Miss. Telma Matilda",female,2,0,1,347054,10.4625,G6,S +207,0,3,"Backstrom, Mr. Karl Alfred",male,32,1,0,3101278,15.85,,S +208,1,3,"Albimona, Mr. Nassef Cassem",male,26,0,0,2699,18.7875,,C +209,1,3,"Carr, Miss. Helen ""Ellen""",female,16,0,0,367231,7.75,,Q +210,1,1,"Blank, Mr. Henry",male,40,0,0,112277,31,A31,C +211,0,3,"Ali, Mr. Ahmed",male,24,0,0,SOTON/O.Q. 3101311,7.05,,S +212,1,2,"Cameron, Miss. Clear Annie",female,35,0,0,F.C.C. 13528,21,,S +213,0,3,"Perkin, Mr. John Henry",male,22,0,0,A/5 21174,7.25,,S +214,0,2,"Givard, Mr. Hans Kristensen",male,30,0,0,250646,13,,S +215,0,3,"Kiernan, Mr. Philip",male,,1,0,367229,7.75,,Q +216,1,1,"Newell, Miss. Madeleine",female,31,1,0,35273,113.275,D36,C +217,1,3,"Honkanen, Miss. Eliina",female,27,0,0,STON/O2. 3101283,7.925,,S +218,0,2,"Jacobsohn, Mr. Sidney Samuel",male,42,1,0,243847,27,,S +219,1,1,"Bazzani, Miss. Albina",female,32,0,0,11813,76.2917,D15,C +220,0,2,"Harris, Mr. Walter",male,30,0,0,W/C 14208,10.5,,S +221,1,3,"Sunderland, Mr. Victor Francis",male,16,0,0,SOTON/OQ 392089,8.05,,S +222,0,2,"Bracken, Mr. James H",male,27,0,0,220367,13,,S +223,0,3,"Green, Mr. George Henry",male,51,0,0,21440,8.05,,S +224,0,3,"Nenkoff, Mr. Christo",male,,0,0,349234,7.8958,,S +225,1,1,"Hoyt, Mr. Frederick Maxfield",male,38,1,0,19943,90,C93,S +226,0,3,"Berglund, Mr. Karl Ivar Sven",male,22,0,0,PP 4348,9.35,,S +227,1,2,"Mellors, Mr. 
William John",male,19,0,0,SW/PP 751,10.5,,S +228,0,3,"Lovell, Mr. John Hall (""Henry"")",male,20.5,0,0,A/5 21173,7.25,,S +229,0,2,"Fahlstrom, Mr. Arne Jonas",male,18,0,0,236171,13,,S +230,0,3,"Lefebre, Miss. Mathilde",female,,3,1,4133,25.4667,,S +231,1,1,"Harris, Mrs. Henry Birkhardt (Irene Wallach)",female,35,1,0,36973,83.475,C83,S +232,0,3,"Larsson, Mr. Bengt Edvin",male,29,0,0,347067,7.775,,S +233,0,2,"Sjostedt, Mr. Ernst Adolf",male,59,0,0,237442,13.5,,S +234,1,3,"Asplund, Miss. Lillian Gertrud",female,5,4,2,347077,31.3875,,S +235,0,2,"Leyson, Mr. Robert William Norman",male,24,0,0,C.A. 29566,10.5,,S +236,0,3,"Harknett, Miss. Alice Phoebe",female,,0,0,W./C. 6609,7.55,,S +237,0,2,"Hold, Mr. Stephen",male,44,1,0,26707,26,,S +238,1,2,"Collyer, Miss. Marjorie ""Lottie""",female,8,0,2,C.A. 31921,26.25,,S +239,0,2,"Pengelly, Mr. Frederick William",male,19,0,0,28665,10.5,,S +240,0,2,"Hunt, Mr. George Henry",male,33,0,0,SCO/W 1585,12.275,,S +241,0,3,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C +242,1,3,"Murphy, Miss. Katherine ""Kate""",female,,1,0,367230,15.5,,Q +243,0,2,"Coleridge, Mr. Reginald Charles",male,29,0,0,W./C. 14263,10.5,,S +244,0,3,"Maenpaa, Mr. Matti Alexanteri",male,22,0,0,STON/O 2. 3101275,7.125,,S +245,0,3,"Attalah, Mr. Sleiman",male,30,0,0,2694,7.225,,C +246,0,1,"Minahan, Dr. William Edward",male,44,2,0,19928,90,C78,Q +247,0,3,"Lindahl, Miss. Agda Thorilda Viktoria",female,25,0,0,347071,7.775,,S +248,1,2,"Hamalainen, Mrs. William (Anna)",female,24,0,2,250649,14.5,,S +249,1,1,"Beckwith, Mr. Richard Leonard",male,37,1,1,11751,52.5542,D35,S +250,0,2,"Carter, Rev. Ernest Courtenay",male,54,1,0,244252,26,,S +251,0,3,"Reed, Mr. James George",male,,0,0,362316,7.25,,S +252,0,3,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29,1,1,347054,10.4625,G6,S +253,0,1,"Stead, Mr. William Thomas",male,62,0,0,113514,26.55,C87,S +254,0,3,"Lobb, Mr. William Arthur",male,30,1,0,A/5. 3336,16.1,,S +255,0,3,"Rosblom, Mrs. 
Viktor (Helena Wilhelmina)",female,41,0,2,370129,20.2125,,S +256,1,3,"Touma, Mrs. Darwis (Hanne Youssef Razi)",female,29,0,2,2650,15.2458,,C +257,1,1,"Thorne, Mrs. Gertrude Maybelle",female,,0,0,PC 17585,79.2,,C +258,1,1,"Cherry, Miss. Gladys",female,30,0,0,110152,86.5,B77,S +259,1,1,"Ward, Miss. Anna",female,35,0,0,PC 17755,512.3292,,C +260,1,2,"Parrish, Mrs. (Lutie Davis)",female,50,0,1,230433,26,,S +261,0,3,"Smith, Mr. Thomas",male,,0,0,384461,7.75,,Q +262,1,3,"Asplund, Master. Edvin Rojj Felix",male,3,4,2,347077,31.3875,,S +263,0,1,"Taussig, Mr. Emil",male,52,1,1,110413,79.65,E67,S +264,0,1,"Harrison, Mr. William",male,40,0,0,112059,0,B94,S +265,0,3,"Henry, Miss. Delia",female,,0,0,382649,7.75,,Q +266,0,2,"Reeves, Mr. David",male,36,0,0,C.A. 17248,10.5,,S +267,0,3,"Panula, Mr. Ernesti Arvid",male,16,4,1,3101295,39.6875,,S +268,1,3,"Persson, Mr. Ernst Ulrik",male,25,1,0,347083,7.775,,S +269,1,1,"Graham, Mrs. William Thompson (Edith Junkins)",female,58,0,1,PC 17582,153.4625,C125,S +270,1,1,"Bissette, Miss. Amelia",female,35,0,0,PC 17760,135.6333,C99,S +271,0,1,"Cairns, Mr. Alexander",male,,0,0,113798,31,,S +272,1,3,"Tornquist, Mr. William Henry",male,25,0,0,LINE,0,,S +273,1,2,"Mellinger, Mrs. (Elizabeth Anne Maidment)",female,41,0,1,250644,19.5,,S +274,0,1,"Natsch, Mr. Charles H",male,37,0,1,PC 17596,29.7,C118,C +275,1,3,"Healy, Miss. Hanora ""Nora""",female,,0,0,370375,7.75,,Q +276,1,1,"Andrews, Miss. Kornelia Theodosia",female,63,1,0,13502,77.9583,D7,S +277,0,3,"Lindblom, Miss. Augusta Charlotta",female,45,0,0,347073,7.75,,S +278,0,2,"Parkes, Mr. Francis ""Frank""",male,,0,0,239853,0,,S +279,0,3,"Rice, Master. Eric",male,7,4,1,382652,29.125,,Q +280,1,3,"Abbott, Mrs. Stanton (Rosa Hunt)",female,35,1,1,C.A. 2673,20.25,,S +281,0,3,"Duane, Mr. Frank",male,65,0,0,336439,7.75,,Q +282,0,3,"Olsson, Mr. Nils Johan Goransson",male,28,0,0,347464,7.8542,,S +283,0,3,"de Pelsmaeker, Mr. Alfons",male,16,0,0,345778,9.5,,S +284,1,3,"Dorking, Mr. Edward Arthur",male,19,0,0,A/5. 
10482,8.05,,S +285,0,1,"Smith, Mr. Richard William",male,,0,0,113056,26,A19,S +286,0,3,"Stankovic, Mr. Ivan",male,33,0,0,349239,8.6625,,C +287,1,3,"de Mulder, Mr. Theodore",male,30,0,0,345774,9.5,,S +288,0,3,"Naidenoff, Mr. Penko",male,22,0,0,349206,7.8958,,S +289,1,2,"Hosono, Mr. Masabumi",male,42,0,0,237798,13,,S +290,1,3,"Connolly, Miss. Kate",female,22,0,0,370373,7.75,,Q +291,1,1,"Barber, Miss. Ellen ""Nellie""",female,26,0,0,19877,78.85,,S +292,1,1,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19,1,0,11967,91.0792,B49,C +293,0,2,"Levy, Mr. Rene Jacques",male,36,0,0,SC/Paris 2163,12.875,D,C +294,0,3,"Haas, Miss. Aloisia",female,24,0,0,349236,8.85,,S +295,0,3,"Mineff, Mr. Ivan",male,24,0,0,349233,7.8958,,S +296,0,1,"Lewy, Mr. Ervin G",male,,0,0,PC 17612,27.7208,,C +297,0,3,"Hanna, Mr. Mansour",male,23.5,0,0,2693,7.2292,,C +298,0,1,"Allison, Miss. Helen Loraine",female,2,1,2,113781,151.55,C22 C26,S +299,1,1,"Saalfeld, Mr. Adolphe",male,,0,0,19988,30.5,C106,S +300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50,0,1,PC 17558,247.5208,B58 B60,C +301,1,3,"Kelly, Miss. Anna Katherine ""Annie Kate""",female,,0,0,9234,7.75,,Q +302,1,3,"McCoy, Mr. Bernard",male,,2,0,367226,23.25,,Q +303,0,3,"Johnson, Mr. William Cahoone Jr",male,19,0,0,LINE,0,,S +304,1,2,"Keane, Miss. Nora A",female,,0,0,226593,12.35,E101,Q +305,0,3,"Williams, Mr. Howard Hugh ""Harry""",male,,0,0,A/5 2466,8.05,,S +306,1,1,"Allison, Master. Hudson Trevor",male,0.92,1,2,113781,151.55,C22 C26,S +307,1,1,"Fleming, Miss. Margaret",female,,0,0,17421,110.8833,,C +308,1,1,"Penasco y Castellana, Mrs. Victor de Satode (Maria Josefa Perez de Soto y Vallejo)",female,17,1,0,PC 17758,108.9,C65,C +309,0,2,"Abelson, Mr. Samuel",male,30,1,0,P/PP 3381,24,,C +310,1,1,"Francatelli, Miss. Laura Mabel",female,30,0,0,PC 17485,56.9292,E36,C +311,1,1,"Hays, Miss. Margaret Bechstein",female,24,0,0,11767,83.1583,C54,C +312,1,1,"Ryerson, Miss. 
Emily Borie",female,18,2,2,PC 17608,262.375,B57 B59 B63 B66,C +313,0,2,"Lahtinen, Mrs. William (Anna Sylfven)",female,26,1,1,250651,26,,S +314,0,3,"Hendekovic, Mr. Ignjac",male,28,0,0,349243,7.8958,,S +315,0,2,"Hart, Mr. Benjamin",male,43,1,1,F.C.C. 13529,26.25,,S +316,1,3,"Nilsson, Miss. Helmina Josefina",female,26,0,0,347470,7.8542,,S +317,1,2,"Kantor, Mrs. Sinai (Miriam Sternin)",female,24,1,0,244367,26,,S +318,0,2,"Moraweck, Dr. Ernest",male,54,0,0,29011,14,,S +319,1,1,"Wick, Miss. Mary Natalie",female,31,0,2,36928,164.8667,C7,S +320,1,1,"Spedden, Mrs. Frederic Oakley (Margaretta Corning Stone)",female,40,1,1,16966,134.5,E34,C +321,0,3,"Dennis, Mr. Samuel",male,22,0,0,A/5 21172,7.25,,S +322,0,3,"Danoff, Mr. Yoto",male,27,0,0,349219,7.8958,,S +323,1,2,"Slayter, Miss. Hilda Mary",female,30,0,0,234818,12.35,,Q +324,1,2,"Caldwell, Mrs. Albert Francis (Sylvia Mae Harbaugh)",female,22,1,1,248738,29,,S +325,0,3,"Sage, Mr. George John Jr",male,,8,2,CA. 2343,69.55,,S +326,1,1,"Young, Miss. Marie Grice",female,36,0,0,PC 17760,135.6333,C32,C +327,0,3,"Nysveen, Mr. Johan Hansen",male,61,0,0,345364,6.2375,,S +328,1,2,"Ball, Mrs. (Ada E Hall)",female,36,0,0,28551,13,D,S +329,1,3,"Goldsmith, Mrs. Frank John (Emily Alice Brown)",female,31,1,1,363291,20.525,,S +330,1,1,"Hippach, Miss. Jean Gertrude",female,16,0,1,111361,57.9792,B18,C +331,1,3,"McCoy, Miss. Agnes",female,,2,0,367226,23.25,,Q +332,0,1,"Partner, Mr. Austen",male,45.5,0,0,113043,28.5,C124,S +333,0,1,"Graham, Mr. George Edward",male,38,0,1,PC 17582,153.4625,C91,S +334,0,3,"Vander Planke, Mr. Leo Edmondus",male,16,2,0,345764,18,,S +335,1,1,"Frauenthal, Mrs. Henry William (Clara Heinsheimer)",female,,1,0,PC 17611,133.65,,S +336,0,3,"Denkoff, Mr. Mitto",male,,0,0,349225,7.8958,,S +337,0,1,"Pears, Mr. Thomas Clinton",male,29,1,0,113776,66.6,C2,S +338,1,1,"Burns, Miss. Elizabeth Margaret",female,41,0,0,16966,134.5,E40,C +339,1,3,"Dahl, Mr. Karl Edwart",male,45,0,0,7598,8.05,,S +340,0,1,"Blackwell, Mr. 
Stephen Weart",male,45,0,0,113784,35.5,T,S +341,1,2,"Navratil, Master. Edmond Roger",male,2,1,1,230080,26,F2,S +342,1,1,"Fortune, Miss. Alice Elizabeth",female,24,3,2,19950,263,C23 C25 C27,S +343,0,2,"Collander, Mr. Erik Gustaf",male,28,0,0,248740,13,,S +344,0,2,"Sedgwick, Mr. Charles Frederick Waddington",male,25,0,0,244361,13,,S +345,0,2,"Fox, Mr. Stanley Hubert",male,36,0,0,229236,13,,S +346,1,2,"Brown, Miss. Amelia ""Mildred""",female,24,0,0,248733,13,F33,S +347,1,2,"Smith, Miss. Marion Elsie",female,40,0,0,31418,13,,S +348,1,3,"Davison, Mrs. Thomas Henry (Mary E Finck)",female,,1,0,386525,16.1,,S +349,1,3,"Coutts, Master. William Loch ""William""",male,3,1,1,C.A. 37671,15.9,,S +350,0,3,"Dimic, Mr. Jovan",male,42,0,0,315088,8.6625,,S +351,0,3,"Odahl, Mr. Nils Martin",male,23,0,0,7267,9.225,,S +352,0,1,"Williams-Lambert, Mr. Fletcher Fellows",male,,0,0,113510,35,C128,S +353,0,3,"Elias, Mr. Tannous",male,15,1,1,2695,7.2292,,C +354,0,3,"Arnold-Franchi, Mr. Josef",male,25,1,0,349237,17.8,,S +355,0,3,"Yousif, Mr. Wazli",male,,0,0,2647,7.225,,C +356,0,3,"Vanden Steen, Mr. Leo Peter",male,28,0,0,345783,9.5,,S +357,1,1,"Bowerman, Miss. Elsie Edith",female,22,0,1,113505,55,E33,S +358,0,2,"Funk, Miss. Annie Clemmer",female,38,0,0,237671,13,,S +359,1,3,"McGovern, Miss. Mary",female,,0,0,330931,7.8792,,Q +360,1,3,"Mockler, Miss. Helen Mary ""Ellie""",female,,0,0,330980,7.8792,,Q +361,0,3,"Skoog, Mr. Wilhelm",male,40,1,4,347088,27.9,,S +362,0,2,"del Carlo, Mr. Sebastiano",male,29,1,0,SC/PARIS 2167,27.7208,,C +363,0,3,"Barbara, Mrs. (Catherine David)",female,45,0,1,2691,14.4542,,C +364,0,3,"Asim, Mr. Adola",male,35,0,0,SOTON/O.Q. 3101310,7.05,,S +365,0,3,"O'Brien, Mr. Thomas",male,,1,0,370365,15.5,,Q +366,0,3,"Adahl, Mr. Mauritz Nils Martin",male,30,0,0,C 7076,7.25,,S +367,1,1,"Warren, Mrs. Frank Manley (Anna Sophia Atkinson)",female,60,1,0,110813,75.25,D37,C +368,1,3,"Moussa, Mrs. (Mantoura Boulos)",female,,0,0,2626,7.2292,,C +369,1,3,"Jermyn, Miss. 
Annie",female,,0,0,14313,7.75,,Q +370,1,1,"Aubart, Mme. Leontine Pauline",female,24,0,0,PC 17477,69.3,B35,C +371,1,1,"Harder, Mr. George Achilles",male,25,1,0,11765,55.4417,E50,C +372,0,3,"Wiklund, Mr. Jakob Alfred",male,18,1,0,3101267,6.4958,,S +373,0,3,"Beavan, Mr. William Thomas",male,19,0,0,323951,8.05,,S +374,0,1,"Ringhini, Mr. Sante",male,22,0,0,PC 17760,135.6333,,C +375,0,3,"Palsson, Miss. Stina Viola",female,3,3,1,349909,21.075,,S +376,1,1,"Meyer, Mrs. Edgar Joseph (Leila Saks)",female,,1,0,PC 17604,82.1708,,C +377,1,3,"Landergren, Miss. Aurora Adelia",female,22,0,0,C 7077,7.25,,S +378,0,1,"Widener, Mr. Harry Elkins",male,27,0,2,113503,211.5,C82,C +379,0,3,"Betros, Mr. Tannous",male,20,0,0,2648,4.0125,,C +380,0,3,"Gustafsson, Mr. Karl Gideon",male,19,0,0,347069,7.775,,S +381,1,1,"Bidois, Miss. Rosalie",female,42,0,0,PC 17757,227.525,,C +382,1,3,"Nakid, Miss. Maria (""Mary"")",female,1,0,2,2653,15.7417,,C +383,0,3,"Tikkanen, Mr. Juho",male,32,0,0,STON/O 2. 3101293,7.925,,S +384,1,1,"Holverson, Mrs. Alexander Oskar (Mary Aline Towner)",female,35,1,0,113789,52,,S +385,0,3,"Plotcharsky, Mr. Vasil",male,,0,0,349227,7.8958,,S +386,0,2,"Davies, Mr. Charles Henry",male,18,0,0,S.O.C. 14879,73.5,,S +387,0,3,"Goodwin, Master. Sidney Leonard",male,1,5,2,CA 2144,46.9,,S +388,1,2,"Buss, Miss. Kate",female,36,0,0,27849,13,,S +389,0,3,"Sadlier, Mr. Matthew",male,,0,0,367655,7.7292,,Q +390,1,2,"Lehmann, Miss. Bertha",female,17,0,0,SC 1748,12,,C +391,1,1,"Carter, Mr. William Ernest",male,36,1,2,113760,120,B96 B98,S +392,1,3,"Jansson, Mr. Carl Olof",male,21,0,0,350034,7.7958,,S +393,0,3,"Gustafsson, Mr. Johan Birger",male,28,2,0,3101277,7.925,,S +394,1,1,"Newell, Miss. Marjorie",female,23,1,0,35273,113.275,D36,C +395,1,3,"Sandstrom, Mrs. Hjalmar (Agnes Charlotta Bengtsson)",female,24,0,2,PP 9549,16.7,G6,S +396,0,3,"Johansson, Mr. Erik",male,22,0,0,350052,7.7958,,S +397,0,3,"Olsson, Miss. Elina",female,31,0,0,350407,7.8542,,S +398,0,2,"McKane, Mr. 
Peter David",male,46,0,0,28403,26,,S +399,0,2,"Pain, Dr. Alfred",male,23,0,0,244278,10.5,,S +400,1,2,"Trout, Mrs. William H (Jessie L)",female,28,0,0,240929,12.65,,S +401,1,3,"Niskanen, Mr. Juha",male,39,0,0,STON/O 2. 3101289,7.925,,S +402,0,3,"Adams, Mr. John",male,26,0,0,341826,8.05,,S +403,0,3,"Jussila, Miss. Mari Aina",female,21,1,0,4137,9.825,,S +404,0,3,"Hakkarainen, Mr. Pekka Pietari",male,28,1,0,STON/O2. 3101279,15.85,,S +405,0,3,"Oreskovic, Miss. Marija",female,20,0,0,315096,8.6625,,S +406,0,2,"Gale, Mr. Shadrach",male,34,1,0,28664,21,,S +407,0,3,"Widegren, Mr. Carl/Charles Peter",male,51,0,0,347064,7.75,,S +408,1,2,"Richards, Master. William Rowe",male,3,1,1,29106,18.75,,S +409,0,3,"Birkeland, Mr. Hans Martin Monsen",male,21,0,0,312992,7.775,,S +410,0,3,"Lefebre, Miss. Ida",female,,3,1,4133,25.4667,,S +411,0,3,"Sdycoff, Mr. Todor",male,,0,0,349222,7.8958,,S +412,0,3,"Hart, Mr. Henry",male,,0,0,394140,6.8583,,Q +413,1,1,"Minahan, Miss. Daisy E",female,33,1,0,19928,90,C78,Q +414,0,2,"Cunningham, Mr. Alfred Fleming",male,,0,0,239853,0,,S +415,1,3,"Sundman, Mr. Johan Julian",male,44,0,0,STON/O 2. 3101269,7.925,,S +416,0,3,"Meek, Mrs. Thomas (Annie Louise Rowley)",female,,0,0,343095,8.05,,S +417,1,2,"Drew, Mrs. James Vivian (Lulu Thorne Christian)",female,34,1,1,28220,32.5,,S +418,1,2,"Silven, Miss. Lyyli Karoliina",female,18,0,2,250652,13,,S +419,0,2,"Matthews, Mr. William John",male,30,0,0,28228,13,,S +420,0,3,"Van Impe, Miss. Catharina",female,10,0,2,345773,24.15,,S +421,0,3,"Gheorgheff, Mr. Stanio",male,,0,0,349254,7.8958,,C +422,0,3,"Charters, Mr. David",male,21,0,0,A/5. 13032,7.7333,,Q +423,0,3,"Zimmerman, Mr. Leo",male,29,0,0,315082,7.875,,S +424,0,3,"Danbom, Mrs. Ernst Gilbert (Anna Sigrid Maria Brogren)",female,28,1,1,347080,14.4,,S +425,0,3,"Rosblom, Mr. Viktor Richard",male,18,1,1,370129,20.2125,,S +426,0,3,"Wiseman, Mr. Phillippe",male,,0,0,A/4. 34244,7.25,,S +427,1,2,"Clarke, Mrs. 
Charles V (Ada Maria Winfield)",female,28,1,0,2003,26,,S +428,1,2,"Phillips, Miss. Kate Florence (""Mrs Kate Louise Phillips Marshall"")",female,19,0,0,250655,26,,S +429,0,3,"Flynn, Mr. James",male,,0,0,364851,7.75,,Q +430,1,3,"Pickard, Mr. Berk (Berk Trembisky)",male,32,0,0,SOTON/O.Q. 392078,8.05,E10,S +431,1,1,"Bjornstrom-Steffansson, Mr. Mauritz Hakan",male,28,0,0,110564,26.55,C52,S +432,1,3,"Thorneycroft, Mrs. Percival (Florence Kate White)",female,,1,0,376564,16.1,,S +433,1,2,"Louch, Mrs. Charles Alexander (Alice Adelaide Slow)",female,42,1,0,SC/AH 3085,26,,S +434,0,3,"Kallio, Mr. Nikolai Erland",male,17,0,0,STON/O 2. 3101274,7.125,,S +435,0,1,"Silvey, Mr. William Baird",male,50,1,0,13507,55.9,E44,S +436,1,1,"Carter, Miss. Lucile Polk",female,14,1,2,113760,120,B96 B98,S +437,0,3,"Ford, Miss. Doolina Margaret ""Daisy""",female,21,2,2,W./C. 6608,34.375,,S +438,1,2,"Richards, Mrs. Sidney (Emily Hocking)",female,24,2,3,29106,18.75,,S +439,0,1,"Fortune, Mr. Mark",male,64,1,4,19950,263,C23 C25 C27,S +440,0,2,"Kvillner, Mr. Johan Henrik Johannesson",male,31,0,0,C.A. 18723,10.5,,S +441,1,2,"Hart, Mrs. Benjamin (Esther Ada Bloomfield)",female,45,1,1,F.C.C. 13529,26.25,,S +442,0,3,"Hampe, Mr. Leon",male,20,0,0,345769,9.5,,S +443,0,3,"Petterson, Mr. Johan Emil",male,25,1,0,347076,7.775,,S +444,1,2,"Reynaldo, Ms. Encarnacion",female,28,0,0,230434,13,,S +445,1,3,"Johannesen-Bratthammer, Mr. Bernt",male,,0,0,65306,8.1125,,S +446,1,1,"Dodge, Master. Washington",male,4,0,2,33638,81.8583,A34,S +447,1,2,"Mellinger, Miss. Madeleine Violet",female,13,0,1,250644,19.5,,S +448,1,1,"Seward, Mr. Frederic Kimber",male,34,0,0,113794,26.55,,S +449,1,3,"Baclini, Miss. Marie Catherine",female,5,2,1,2666,19.2583,,C +450,1,1,"Peuchen, Major. Arthur Godfrey",male,52,0,0,113786,30.5,C104,S +451,0,2,"West, Mr. Edwy Arthur",male,36,1,2,C.A. 34651,27.75,,S +452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S +453,0,1,"Foreman, Mr. 
Benjamin Laventall",male,30,0,0,113051,27.75,C111,C +454,1,1,"Goldenberg, Mr. Samuel L",male,49,1,0,17453,89.1042,C92,C +455,0,3,"Peduzzi, Mr. Joseph",male,,0,0,A/5 2817,8.05,,S +456,1,3,"Jalsevac, Mr. Ivan",male,29,0,0,349240,7.8958,,C +457,0,1,"Millet, Mr. Francis Davis",male,65,0,0,13509,26.55,E38,S +458,1,1,"Kenyon, Mrs. Frederick R (Marion)",female,,1,0,17464,51.8625,D21,S +459,1,2,"Toomey, Miss. Ellen",female,50,0,0,F.C.C. 13531,10.5,,S +460,0,3,"O'Connor, Mr. Maurice",male,,0,0,371060,7.75,,Q +461,1,1,"Anderson, Mr. Harry",male,48,0,0,19952,26.55,E12,S +462,0,3,"Morley, Mr. William",male,34,0,0,364506,8.05,,S +463,0,1,"Gee, Mr. Arthur H",male,47,0,0,111320,38.5,E63,S +464,0,2,"Milling, Mr. Jacob Christian",male,48,0,0,234360,13,,S +465,0,3,"Maisner, Mr. Simon",male,,0,0,A/S 2816,8.05,,S +466,0,3,"Goncalves, Mr. Manuel Estanslas",male,38,0,0,SOTON/O.Q. 3101306,7.05,,S +467,0,2,"Campbell, Mr. William",male,,0,0,239853,0,,S +468,0,1,"Smart, Mr. John Montgomery",male,56,0,0,113792,26.55,,S +469,0,3,"Scanlan, Mr. James",male,,0,0,36209,7.725,,Q +470,1,3,"Baclini, Miss. Helene Barbara",female,0.75,2,1,2666,19.2583,,C +471,0,3,"Keefe, Mr. Arthur",male,,0,0,323592,7.25,,S +472,0,3,"Cacic, Mr. Luka",male,38,0,0,315089,8.6625,,S +473,1,2,"West, Mrs. Edwy Arthur (Ada Mary Worth)",female,33,1,2,C.A. 34651,27.75,,S +474,1,2,"Jerwan, Mrs. Amin S (Marie Marthe Thuillard)",female,23,0,0,SC/AH Basle 541,13.7917,D,C +475,0,3,"Strandberg, Miss. Ida Sofia",female,22,0,0,7553,9.8375,,S +476,0,1,"Clifford, Mr. George Quincy",male,,0,0,110465,52,A14,S +477,0,2,"Renouf, Mr. Peter Henry",male,34,1,0,31027,21,,S +478,0,3,"Braund, Mr. Lewis Richard",male,29,1,0,3460,7.0458,,S +479,0,3,"Karlsson, Mr. Nils August",male,22,0,0,350060,7.5208,,S +480,1,3,"Hirvonen, Miss. Hildur E",female,2,0,1,3101298,12.2875,,S +481,0,3,"Goodwin, Master. Harold Victor",male,9,5,2,CA 2144,46.9,,S +482,0,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0,,S +483,0,3,"Rouse, Mr. 
Richard Henry",male,50,0,0,A/5 3594,8.05,,S +484,1,3,"Turkula, Mrs. (Hedwig)",female,63,0,0,4134,9.5875,,S +485,1,1,"Bishop, Mr. Dickinson H",male,25,1,0,11967,91.0792,B49,C +486,0,3,"Lefebre, Miss. Jeannie",female,,3,1,4133,25.4667,,S +487,1,1,"Hoyt, Mrs. Frederick Maxfield (Jane Anne Forby)",female,35,1,0,19943,90,C93,S +488,0,1,"Kent, Mr. Edward Austin",male,58,0,0,11771,29.7,B37,C +489,0,3,"Somerton, Mr. Francis William",male,30,0,0,A.5. 18509,8.05,,S +490,1,3,"Coutts, Master. Eden Leslie ""Neville""",male,9,1,1,C.A. 37671,15.9,,S +491,0,3,"Hagland, Mr. Konrad Mathias Reiersen",male,,1,0,65304,19.9667,,S +492,0,3,"Windelov, Mr. Einar",male,21,0,0,SOTON/OQ 3101317,7.25,,S +493,0,1,"Molson, Mr. Harry Markland",male,55,0,0,113787,30.5,C30,S +494,0,1,"Artagaveytia, Mr. Ramon",male,71,0,0,PC 17609,49.5042,,C +495,0,3,"Stanley, Mr. Edward Roland",male,21,0,0,A/4 45380,8.05,,S +496,0,3,"Yousseff, Mr. Gerious",male,,0,0,2627,14.4583,,C +497,1,1,"Eustis, Miss. Elizabeth Mussey",female,54,1,0,36947,78.2667,D20,C +498,0,3,"Shellard, Mr. Frederick William",male,,0,0,C.A. 6212,15.1,,S +499,0,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25,1,2,113781,151.55,C22 C26,S +500,0,3,"Svensson, Mr. Olof",male,24,0,0,350035,7.7958,,S +501,0,3,"Calic, Mr. Petar",male,17,0,0,315086,8.6625,,S +502,0,3,"Canavan, Miss. Mary",female,21,0,0,364846,7.75,,Q +503,0,3,"O'Sullivan, Miss. Bridget Mary",female,,0,0,330909,7.6292,,Q +504,0,3,"Laitinen, Miss. Kristina Sofia",female,37,0,0,4135,9.5875,,S +505,1,1,"Maioni, Miss. Roberta",female,16,0,0,110152,86.5,B79,S +506,0,1,"Penasco y Castellana, Mr. Victor de Satode",male,18,1,0,PC 17758,108.9,C65,C +507,1,2,"Quick, Mrs. Frederick Charles (Jane Richards)",female,33,0,2,26360,26,,S +508,1,1,"Bradley, Mr. George (""George Arthur Brayton"")",male,,0,0,111427,26.55,,S +509,0,3,"Olsen, Mr. Henry Margido",male,28,0,0,C 4001,22.525,,S +510,1,3,"Lang, Mr. Fang",male,26,0,0,1601,56.4958,,S +511,1,3,"Daly, Mr. 
Eugene Patrick",male,29,0,0,382651,7.75,,Q +512,0,3,"Webber, Mr. James",male,,0,0,SOTON/OQ 3101316,8.05,,S +513,1,1,"McGough, Mr. James Robert",male,36,0,0,PC 17473,26.2875,E25,S +514,1,1,"Rothschild, Mrs. Martin (Elizabeth L. Barrett)",female,54,1,0,PC 17603,59.4,,C +515,0,3,"Coleff, Mr. Satio",male,24,0,0,349209,7.4958,,S +516,0,1,"Walker, Mr. William Anderson",male,47,0,0,36967,34.0208,D46,S +517,1,2,"Lemore, Mrs. (Amelia Milley)",female,34,0,0,C.A. 34260,10.5,F33,S +518,0,3,"Ryan, Mr. Patrick",male,,0,0,371110,24.15,,Q +519,1,2,"Angle, Mrs. William A (Florence ""Mary"" Agnes Hughes)",female,36,1,0,226875,26,,S +520,0,3,"Pavlovic, Mr. Stefo",male,32,0,0,349242,7.8958,,S +521,1,1,"Perreault, Miss. Anne",female,30,0,0,12749,93.5,B73,S +522,0,3,"Vovk, Mr. Janko",male,22,0,0,349252,7.8958,,S +523,0,3,"Lahoud, Mr. Sarkis",male,,0,0,2624,7.225,,C +524,1,1,"Hippach, Mrs. Louis Albert (Ida Sophia Fischer)",female,44,0,1,111361,57.9792,B18,C +525,0,3,"Kassem, Mr. Fared",male,,0,0,2700,7.2292,,C +526,0,3,"Farrell, Mr. James",male,40.5,0,0,367232,7.75,,Q +527,1,2,"Ridsdale, Miss. Lucy",female,50,0,0,W./C. 14258,10.5,,S +528,0,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S +529,0,3,"Salonen, Mr. Johan Werner",male,39,0,0,3101296,7.925,,S +530,0,2,"Hocking, Mr. Richard George",male,23,2,1,29104,11.5,,S +531,1,2,"Quick, Miss. Phyllis May",female,2,1,1,26360,26,,S +532,0,3,"Toufik, Mr. Nakli",male,,0,0,2641,7.2292,,C +533,0,3,"Elias, Mr. Joseph Jr",male,17,1,1,2690,7.2292,,C +534,1,3,"Peter, Mrs. Catherine (Catherine Rizk)",female,,0,2,2668,22.3583,,C +535,0,3,"Cacic, Miss. Marija",female,30,0,0,315084,8.6625,,S +536,1,2,"Hart, Miss. Eva Miriam",female,7,0,2,F.C.C. 13529,26.25,,S +537,0,1,"Butt, Major. Archibald Willingham",male,45,0,0,113050,26.55,B38,S +538,1,1,"LeRoy, Miss. Bertha",female,30,0,0,PC 17761,106.425,,C +539,0,3,"Risien, Mr. Samuel Beard",male,,0,0,364498,14.5,,S +540,1,1,"Frolicher, Miss. 
Hedwig Margaritha",female,22,0,2,13568,49.5,B39,C +541,1,1,"Crosby, Miss. Harriet R",female,36,0,2,WE/P 5735,71,B22,S +542,0,3,"Andersson, Miss. Ingeborg Constanzia",female,9,4,2,347082,31.275,,S +543,0,3,"Andersson, Miss. Sigrid Elisabeth",female,11,4,2,347082,31.275,,S +544,1,2,"Beane, Mr. Edward",male,32,1,0,2908,26,,S +545,0,1,"Douglas, Mr. Walter Donald",male,50,1,0,PC 17761,106.425,C86,C +546,0,1,"Nicholson, Mr. Arthur Ernest",male,64,0,0,693,26,,S +547,1,2,"Beane, Mrs. Edward (Ethel Clarke)",female,19,1,0,2908,26,,S +548,1,2,"Padro y Manent, Mr. Julian",male,,0,0,SC/PARIS 2146,13.8625,,C +549,0,3,"Goldsmith, Mr. Frank John",male,33,1,1,363291,20.525,,S +550,1,2,"Davies, Master. John Morgan Jr",male,8,1,1,C.A. 33112,36.75,,S +551,1,1,"Thayer, Mr. John Borland Jr",male,17,0,2,17421,110.8833,C70,C +552,0,2,"Sharp, Mr. Percival James R",male,27,0,0,244358,26,,S +553,0,3,"O'Brien, Mr. Timothy",male,,0,0,330979,7.8292,,Q +554,1,3,"Leeni, Mr. Fahim (""Philip Zenni"")",male,22,0,0,2620,7.225,,C +555,1,3,"Ohman, Miss. Velin",female,22,0,0,347085,7.775,,S +556,0,1,"Wright, Mr. George",male,62,0,0,113807,26.55,,S +557,1,1,"Duff Gordon, Lady. (Lucille Christiana Sutherland) (""Mrs Morgan"")",female,48,1,0,11755,39.6,A16,C +558,0,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C +559,1,1,"Taussig, Mrs. Emil (Tillie Mandelbaum)",female,39,1,1,110413,79.65,E67,S +560,1,3,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36,1,0,345572,17.4,,S +561,0,3,"Morrow, Mr. Thomas Rowan",male,,0,0,372622,7.75,,Q +562,0,3,"Sivic, Mr. Husein",male,40,0,0,349251,7.8958,,S +563,0,2,"Norman, Mr. Robert Douglas",male,28,0,0,218629,13.5,,S +564,0,3,"Simmons, Mr. John",male,,0,0,SOTON/OQ 392082,8.05,,S +565,0,3,"Meanwell, Miss. (Marion Ogden)",female,,0,0,SOTON/O.Q. 392087,8.05,,S +566,0,3,"Davies, Mr. Alfred J",male,24,2,0,A/4 48871,24.15,,S +567,0,3,"Stoytcheff, Mr. Ilia",male,19,0,0,349205,7.8958,,S +568,0,3,"Palsson, Mrs. 
Nils (Alma Cornelia Berglund)",female,29,0,4,349909,21.075,,S +569,0,3,"Doharr, Mr. Tannous",male,,0,0,2686,7.2292,,C +570,1,3,"Jonsson, Mr. Carl",male,32,0,0,350417,7.8542,,S +571,1,2,"Harris, Mr. George",male,62,0,0,S.W./PP 752,10.5,,S +572,1,1,"Appleton, Mrs. Edward Dale (Charlotte Lamson)",female,53,2,0,11769,51.4792,C101,S +573,1,1,"Flynn, Mr. John Irwin (""Irving"")",male,36,0,0,PC 17474,26.3875,E25,S +574,1,3,"Kelly, Miss. Mary",female,,0,0,14312,7.75,,Q +575,0,3,"Rush, Mr. Alfred George John",male,16,0,0,A/4. 20589,8.05,,S +576,0,3,"Patchett, Mr. George",male,19,0,0,358585,14.5,,S +577,1,2,"Garside, Miss. Ethel",female,34,0,0,243880,13,,S +578,1,1,"Silvey, Mrs. William Baird (Alice Munger)",female,39,1,0,13507,55.9,E44,S +579,0,3,"Caram, Mrs. Joseph (Maria Elias)",female,,1,0,2689,14.4583,,C +580,1,3,"Jussila, Mr. Eiriik",male,32,0,0,STON/O 2. 3101286,7.925,,S +581,1,2,"Christy, Miss. Julie Rachel",female,25,1,1,237789,30,,S +582,1,1,"Thayer, Mrs. John Borland (Marian Longstreth Morris)",female,39,1,1,17421,110.8833,C68,C +583,0,2,"Downton, Mr. William James",male,54,0,0,28403,26,,S +584,0,1,"Ross, Mr. John Hugo",male,36,0,0,13049,40.125,A10,C +585,0,3,"Paulner, Mr. Uscher",male,,0,0,3411,8.7125,,C +586,1,1,"Taussig, Miss. Ruth",female,18,0,2,110413,79.65,E68,S +587,0,2,"Jarvis, Mr. John Denzil",male,47,0,0,237565,15,,S +588,1,1,"Frolicher-Stehli, Mr. Maxmillian",male,60,1,1,13567,79.2,B41,C +589,0,3,"Gilinski, Mr. Eliezer",male,22,0,0,14973,8.05,,S +590,0,3,"Murdlin, Mr. Joseph",male,,0,0,A./5. 3235,8.05,,S +591,0,3,"Rintamaki, Mr. Matti",male,35,0,0,STON/O 2. 3101273,7.125,,S +592,1,1,"Stephenson, Mrs. Walter Bertram (Martha Eustis)",female,52,1,0,36947,78.2667,D20,C +593,0,3,"Elsbury, Mr. William James",male,47,0,0,A/5 3902,7.25,,S +594,0,3,"Bourke, Miss. Mary",female,,0,2,364848,7.75,,Q +595,0,2,"Chapman, Mr. John Henry",male,37,1,0,SC/AH 29037,26,,S +596,0,3,"Van Impe, Mr. Jean Baptiste",male,36,1,1,345773,24.15,,S +597,1,2,"Leitch, Miss. 
Jessie Wills",female,,0,0,248727,33,,S +598,0,3,"Johnson, Mr. Alfred",male,49,0,0,LINE,0,,S +599,0,3,"Boulos, Mr. Hanna",male,,0,0,2664,7.225,,C +600,1,1,"Duff Gordon, Sir. Cosmo Edmund (""Mr Morgan"")",male,49,1,0,PC 17485,56.9292,A20,C +601,1,2,"Jacobsohn, Mrs. Sidney Samuel (Amy Frances Christy)",female,24,2,1,243847,27,,S +602,0,3,"Slabenoff, Mr. Petco",male,,0,0,349214,7.8958,,S +603,0,1,"Harrington, Mr. Charles H",male,,0,0,113796,42.4,,S +604,0,3,"Torber, Mr. Ernst William",male,44,0,0,364511,8.05,,S +605,1,1,"Homer, Mr. Harry (""Mr E Haven"")",male,35,0,0,111426,26.55,,C +606,0,3,"Lindell, Mr. Edvard Bengtsson",male,36,1,0,349910,15.55,,S +607,0,3,"Karaic, Mr. Milan",male,30,0,0,349246,7.8958,,S +608,1,1,"Daniel, Mr. Robert Williams",male,27,0,0,113804,30.5,,S +609,1,2,"Laroche, Mrs. Joseph (Juliette Marie Louise Lafargue)",female,22,1,2,SC/Paris 2123,41.5792,,C +610,1,1,"Shutes, Miss. Elizabeth W",female,40,0,0,PC 17582,153.4625,C125,S +611,0,3,"Andersson, Mrs. Anders Johan (Alfrida Konstantia Brogren)",female,39,1,5,347082,31.275,,S +612,0,3,"Jardin, Mr. Jose Neto",male,,0,0,SOTON/O.Q. 3101305,7.05,,S +613,1,3,"Murphy, Miss. Margaret Jane",female,,1,0,367230,15.5,,Q +614,0,3,"Horgan, Mr. John",male,,0,0,370377,7.75,,Q +615,0,3,"Brocklebank, Mr. William Alfred",male,35,0,0,364512,8.05,,S +616,1,2,"Herman, Miss. Alice",female,24,1,2,220845,65,,S +617,0,3,"Danbom, Mr. Ernst Gilbert",male,34,1,1,347080,14.4,,S +618,0,3,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)",female,26,1,0,A/5. 3336,16.1,,S +619,1,2,"Becker, Miss. Marion Louise",female,4,2,1,230136,39,F4,S +620,0,2,"Gavey, Mr. Lawrence",male,26,0,0,31028,10.5,,S +621,0,3,"Yasbeck, Mr. Antoni",male,27,1,0,2659,14.4542,,C +622,1,1,"Kimball, Mr. Edwin Nelson Jr",male,42,1,0,11753,52.5542,D19,S +623,1,3,"Nakid, Mr. Sahid",male,20,1,1,2653,15.7417,,C +624,0,3,"Hansen, Mr. Henry Damsgaard",male,21,0,0,350029,7.8542,,S +625,0,3,"Bowen, Mr. David John ""Dai""",male,21,0,0,54636,16.1,,S +626,0,1,"Sutton, Mr. 
Frederick",male,61,0,0,36963,32.3208,D50,S +627,0,2,"Kirkland, Rev. Charles Leonard",male,57,0,0,219533,12.35,,Q +628,1,1,"Longley, Miss. Gretchen Fiske",female,21,0,0,13502,77.9583,D9,S +629,0,3,"Bostandyeff, Mr. Guentcho",male,26,0,0,349224,7.8958,,S +630,0,3,"O'Connell, Mr. Patrick D",male,,0,0,334912,7.7333,,Q +631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80,0,0,27042,30,A23,S +632,0,3,"Lundahl, Mr. Johan Svensson",male,51,0,0,347743,7.0542,,S +633,1,1,"Stahelin-Maeglin, Dr. Max",male,32,0,0,13214,30.5,B50,C +634,0,1,"Parr, Mr. William Henry Marsh",male,,0,0,112052,0,,S +635,0,3,"Skoog, Miss. Mabel",female,9,3,2,347088,27.9,,S +636,1,2,"Davis, Miss. Mary",female,28,0,0,237668,13,,S +637,0,3,"Leinonen, Mr. Antti Gustaf",male,32,0,0,STON/O 2. 3101292,7.925,,S +638,0,2,"Collyer, Mr. Harvey",male,31,1,1,C.A. 31921,26.25,,S +639,0,3,"Panula, Mrs. Juha (Maria Emilia Ojala)",female,41,0,5,3101295,39.6875,,S +640,0,3,"Thorneycroft, Mr. Percival",male,,1,0,376564,16.1,,S +641,0,3,"Jensen, Mr. Hans Peder",male,20,0,0,350050,7.8542,,S +642,1,1,"Sagesser, Mlle. Emma",female,24,0,0,PC 17477,69.3,B35,C +643,0,3,"Skoog, Miss. Margit Elizabeth",female,2,3,2,347088,27.9,,S +644,1,3,"Foo, Mr. Choong",male,,0,0,1601,56.4958,,S +645,1,3,"Baclini, Miss. Eugenie",female,0.75,2,1,2666,19.2583,,C +646,1,1,"Harper, Mr. Henry Sleeper",male,48,1,0,PC 17572,76.7292,D33,C +647,0,3,"Cor, Mr. Liudevit",male,19,0,0,349231,7.8958,,S +648,1,1,"Simonius-Blumer, Col. Oberst Alfons",male,56,0,0,13213,35.5,A26,C +649,0,3,"Willey, Mr. Edward",male,,0,0,S.O./P.P. 751,7.55,,S +650,1,3,"Stanley, Miss. Amy Zillah Elsie",female,23,0,0,CA. 2314,7.55,,S +651,0,3,"Mitkoff, Mr. Mito",male,,0,0,349221,7.8958,,S +652,1,2,"Doling, Miss. Elsie",female,18,0,1,231919,23,,S +653,0,3,"Kalvik, Mr. Johannes Halvorsen",male,21,0,0,8475,8.4333,,S +654,1,3,"O'Leary, Miss. Hanora ""Norah""",female,,0,0,330919,7.8292,,Q +655,0,3,"Hegarty, Miss. Hanora ""Nora""",female,18,0,0,365226,6.75,,Q +656,0,2,"Hickman, Mr. 
Leonard Mark",male,24,2,0,S.O.C. 14879,73.5,,S +657,0,3,"Radeff, Mr. Alexander",male,,0,0,349223,7.8958,,S +658,0,3,"Bourke, Mrs. John (Catherine)",female,32,1,1,364849,15.5,,Q +659,0,2,"Eitemiller, Mr. George Floyd",male,23,0,0,29751,13,,S +660,0,1,"Newell, Mr. Arthur Webster",male,58,0,2,35273,113.275,D48,C +661,1,1,"Frauenthal, Dr. Henry William",male,50,2,0,PC 17611,133.65,,S +662,0,3,"Badt, Mr. Mohamed",male,40,0,0,2623,7.225,,C +663,0,1,"Colley, Mr. Edward Pomeroy",male,47,0,0,5727,25.5875,E58,S +664,0,3,"Coleff, Mr. Peju",male,36,0,0,349210,7.4958,,S +665,1,3,"Lindqvist, Mr. Eino William",male,20,1,0,STON/O 2. 3101285,7.925,,S +666,0,2,"Hickman, Mr. Lewis",male,32,2,0,S.O.C. 14879,73.5,,S +667,0,2,"Butler, Mr. Reginald Fenton",male,25,0,0,234686,13,,S +668,0,3,"Rommetvedt, Mr. Knud Paust",male,,0,0,312993,7.775,,S +669,0,3,"Cook, Mr. Jacob",male,43,0,0,A/5 3536,8.05,,S +670,1,1,"Taylor, Mrs. Elmer Zebley (Juliet Cummins Wright)",female,,1,0,19996,52,C126,S +671,1,2,"Brown, Mrs. Thomas William Solomon (Elizabeth Catherine Ford)",female,40,1,1,29750,39,,S +672,0,1,"Davidson, Mr. Thornton",male,31,1,0,F.C. 12750,52,B71,S +673,0,2,"Mitchell, Mr. Henry Michael",male,70,0,0,C.A. 24580,10.5,,S +674,1,2,"Wilhelms, Mr. Charles",male,31,0,0,244270,13,,S +675,0,2,"Watson, Mr. Ennis Hastings",male,,0,0,239856,0,,S +676,0,3,"Edvardsson, Mr. Gustaf Hjalmar",male,18,0,0,349912,7.775,,S +677,0,3,"Sawyer, Mr. Frederick Charles",male,24.5,0,0,342826,8.05,,S +678,1,3,"Turja, Miss. Anna Sofia",female,18,0,0,4138,9.8417,,S +679,0,3,"Goodwin, Mrs. Frederick (Augusta Tyler)",female,43,1,6,CA 2144,46.9,,S +680,1,1,"Cardeza, Mr. Thomas Drake Martinez",male,36,0,1,PC 17755,512.3292,B51 B53 B55,C +681,0,3,"Peters, Miss. Katie",female,,0,0,330935,8.1375,,Q +682,1,1,"Hassab, Mr. Hammad",male,27,0,0,PC 17572,76.7292,D49,C +683,0,3,"Olsvigen, Mr. Thor Anderson",male,20,0,0,6563,9.225,,S +684,0,3,"Goodwin, Mr. Charles Edward",male,14,5,2,CA 2144,46.9,,S +685,0,2,"Brown, Mr. 
Thomas William Solomon",male,60,1,1,29750,39,,S +686,0,2,"Laroche, Mr. Joseph Philippe Lemercier",male,25,1,2,SC/Paris 2123,41.5792,,C +687,0,3,"Panula, Mr. Jaako Arnold",male,14,4,1,3101295,39.6875,,S +688,0,3,"Dakic, Mr. Branko",male,19,0,0,349228,10.1708,,S +689,0,3,"Fischer, Mr. Eberhard Thelander",male,18,0,0,350036,7.7958,,S +690,1,1,"Madill, Miss. Georgette Alexandra",female,15,0,1,24160,211.3375,B5,S +691,1,1,"Dick, Mr. Albert Adrian",male,31,1,0,17474,57,B20,S +692,1,3,"Karun, Miss. Manca",female,4,0,1,349256,13.4167,,C +693,1,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S +694,0,3,"Saad, Mr. Khalil",male,25,0,0,2672,7.225,,C +695,0,1,"Weir, Col. John",male,60,0,0,113800,26.55,,S +696,0,2,"Chapman, Mr. Charles Henry",male,52,0,0,248731,13.5,,S +697,0,3,"Kelly, Mr. James",male,44,0,0,363592,8.05,,S +698,1,3,"Mullens, Miss. Katherine ""Katie""",female,,0,0,35852,7.7333,,Q +699,0,1,"Thayer, Mr. John Borland",male,49,1,1,17421,110.8833,C68,C +700,0,3,"Humblen, Mr. Adolf Mathias Nicolai Olsen",male,42,0,0,348121,7.65,F G63,S +701,1,1,"Astor, Mrs. John Jacob (Madeleine Talmadge Force)",female,18,1,0,PC 17757,227.525,C62 C64,C +702,1,1,"Silverthorne, Mr. Spencer Victor",male,35,0,0,PC 17475,26.2875,E24,S +703,0,3,"Barbara, Miss. Saiide",female,18,0,1,2691,14.4542,,C +704,0,3,"Gallagher, Mr. Martin",male,25,0,0,36864,7.7417,,Q +705,0,3,"Hansen, Mr. Henrik Juul",male,26,1,0,350025,7.8542,,S +706,0,2,"Morley, Mr. Henry Samuel (""Mr Henry Marshall"")",male,39,0,0,250655,26,,S +707,1,2,"Kelly, Mrs. Florence ""Fannie""",female,45,0,0,223596,13.5,,S +708,1,1,"Calderhead, Mr. Edward Pennington",male,42,0,0,PC 17476,26.2875,E24,S +709,1,1,"Cleaver, Miss. Alice",female,22,0,0,113781,151.55,,S +710,1,3,"Moubarek, Master. Halim Gonios (""William George"")",male,,1,1,2661,15.2458,,C +711,1,1,"Mayne, Mlle. Berthe Antonine (""Mrs de Villiers"")",female,24,0,0,PC 17482,49.5042,C90,C +712,0,1,"Klaber, Mr. Herman",male,,0,0,113028,26.55,C124,S +713,1,1,"Taylor, Mr. 
Elmer Zebley",male,48,1,0,19996,52,C126,S +714,0,3,"Larsson, Mr. August Viktor",male,29,0,0,7545,9.4833,,S +715,0,2,"Greenberg, Mr. Samuel",male,52,0,0,250647,13,,S +716,0,3,"Soholt, Mr. Peter Andreas Lauritz Andersen",male,19,0,0,348124,7.65,F G73,S +717,1,1,"Endres, Miss. Caroline Louise",female,38,0,0,PC 17757,227.525,C45,C +718,1,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27,0,0,34218,10.5,E101,S +719,0,3,"McEvoy, Mr. Michael",male,,0,0,36568,15.5,,Q +720,0,3,"Johnson, Mr. Malkolm Joackim",male,33,0,0,347062,7.775,,S +721,1,2,"Harper, Miss. Annie Jessie ""Nina""",female,6,0,1,248727,33,,S +722,0,3,"Jensen, Mr. Svend Lauritz",male,17,1,0,350048,7.0542,,S +723,0,2,"Gillespie, Mr. William Henry",male,34,0,0,12233,13,,S +724,0,2,"Hodges, Mr. Henry Price",male,50,0,0,250643,13,,S +725,1,1,"Chambers, Mr. Norman Campbell",male,27,1,0,113806,53.1,E8,S +726,0,3,"Oreskovic, Mr. Luka",male,20,0,0,315094,8.6625,,S +727,1,2,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",female,30,3,0,31027,21,,S +728,1,3,"Mannion, Miss. Margareth",female,,0,0,36866,7.7375,,Q +729,0,2,"Bryhl, Mr. Kurt Arnold Gottfrid",male,25,1,0,236853,26,,S +730,0,3,"Ilmakangas, Miss. Pieta Sofia",female,25,1,0,STON/O2. 3101271,7.925,,S +731,1,1,"Allen, Miss. Elisabeth Walton",female,29,0,0,24160,211.3375,B5,S +732,0,3,"Hassan, Mr. Houssein G N",male,11,0,0,2699,18.7875,,C +733,0,2,"Knight, Mr. Robert J",male,,0,0,239855,0,,S +734,0,2,"Berriman, Mr. William John",male,23,0,0,28425,13,,S +735,0,2,"Troupiansky, Mr. Moses Aaron",male,23,0,0,233639,13,,S +736,0,3,"Williams, Mr. Leslie",male,28.5,0,0,54636,16.1,,S +737,0,3,"Ford, Mrs. Edward (Margaret Ann Watson)",female,48,1,3,W./C. 6608,34.375,,S +738,1,1,"Lesurer, Mr. Gustave J",male,35,0,0,PC 17755,512.3292,B101,C +739,0,3,"Ivanoff, Mr. Kanio",male,,0,0,349201,7.8958,,S +740,0,3,"Nankoff, Mr. Minko",male,,0,0,349218,7.8958,,S +741,1,1,"Hawksford, Mr. Walter James",male,,0,0,16988,30,D45,S +742,0,1,"Cavendish, Mr. 
Tyrell William",male,36,1,0,19877,78.85,C46,S +743,1,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21,2,2,PC 17608,262.375,B57 B59 B63 B66,C +744,0,3,"McNamee, Mr. Neal",male,24,1,0,376566,16.1,,S +745,1,3,"Stranden, Mr. Juho",male,31,0,0,STON/O 2. 3101288,7.925,,S +746,0,1,"Crosby, Capt. Edward Gifford",male,70,1,1,WE/P 5735,71,B22,S +747,0,3,"Abbott, Mr. Rossmore Edward",male,16,1,1,C.A. 2673,20.25,,S +748,1,2,"Sinkkonen, Miss. Anna",female,30,0,0,250648,13,,S +749,0,1,"Marvin, Mr. Daniel Warner",male,19,1,0,113773,53.1,D30,S +750,0,3,"Connaghton, Mr. Michael",male,31,0,0,335097,7.75,,Q +751,1,2,"Wells, Miss. Joan",female,4,1,1,29103,23,,S +752,1,3,"Moor, Master. Meier",male,6,0,1,392096,12.475,E121,S +753,0,3,"Vande Velde, Mr. Johannes Joseph",male,33,0,0,345780,9.5,,S +754,0,3,"Jonkoff, Mr. Lalio",male,23,0,0,349204,7.8958,,S +755,1,2,"Herman, Mrs. Samuel (Jane Laver)",female,48,1,2,220845,65,,S +756,1,2,"Hamalainen, Master. Viljo",male,0.67,1,1,250649,14.5,,S +757,0,3,"Carlsson, Mr. August Sigfrid",male,28,0,0,350042,7.7958,,S +758,0,2,"Bailey, Mr. Percy Andrew",male,18,0,0,29108,11.5,,S +759,0,3,"Theobald, Mr. Thomas Leonard",male,34,0,0,363294,8.05,,S +760,1,1,"Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)",female,33,0,0,110152,86.5,B77,S +761,0,3,"Garfirth, Mr. John",male,,0,0,358585,14.5,,S +762,0,3,"Nirva, Mr. Iisakki Antino Aijo",male,41,0,0,SOTON/O2 3101272,7.125,,S +763,1,3,"Barah, Mr. Hanna Assi",male,20,0,0,2663,7.2292,,C +764,1,1,"Carter, Mrs. William Ernest (Lucile Polk)",female,36,1,2,113760,120,B96 B98,S +765,0,3,"Eklund, Mr. Hans Linus",male,16,0,0,347074,7.775,,S +766,1,1,"Hogeboom, Mrs. John C (Anna Andrews)",female,51,1,0,13502,77.9583,D11,S +767,0,1,"Brewe, Dr. Arthur Jackson",male,,0,0,112379,39.6,,C +768,0,3,"Mangan, Miss. Mary",female,30.5,0,0,364850,7.75,,Q +769,0,3,"Moran, Mr. Daniel J",male,,1,0,371110,24.15,,Q +770,0,3,"Gronnestad, Mr. Daniel Danielsen",male,32,0,0,8471,8.3625,,S +771,0,3,"Lievens, Mr. 
Rene Aime",male,24,0,0,345781,9.5,,S +772,0,3,"Jensen, Mr. Niels Peder",male,48,0,0,350047,7.8542,,S +773,0,2,"Mack, Mrs. (Mary)",female,57,0,0,S.O./P.P. 3,10.5,E77,S +774,0,3,"Elias, Mr. Dibo",male,,0,0,2674,7.225,,C +775,1,2,"Hocking, Mrs. Elizabeth (Eliza Needs)",female,54,1,3,29105,23,,S +776,0,3,"Myhrman, Mr. Pehr Fabian Oliver Malkolm",male,18,0,0,347078,7.75,,S +777,0,3,"Tobin, Mr. Roger",male,,0,0,383121,7.75,F38,Q +778,1,3,"Emanuel, Miss. Virginia Ethel",female,5,0,0,364516,12.475,,S +779,0,3,"Kilgannon, Mr. Thomas J",male,,0,0,36865,7.7375,,Q +780,1,1,"Robert, Mrs. Edward Scott (Elisabeth Walton McMillan)",female,43,0,1,24160,211.3375,B3,S +781,1,3,"Ayoub, Miss. Banoura",female,13,0,0,2687,7.2292,,C +782,1,1,"Dick, Mrs. Albert Adrian (Vera Gillespie)",female,17,1,0,17474,57,B20,S +783,0,1,"Long, Mr. Milton Clyde",male,29,0,0,113501,30,D6,S +784,0,3,"Johnston, Mr. Andrew G",male,,1,2,W./C. 6607,23.45,,S +785,0,3,"Ali, Mr. William",male,25,0,0,SOTON/O.Q. 3101312,7.05,,S +786,0,3,"Harmer, Mr. Abraham (David Lishin)",male,25,0,0,374887,7.25,,S +787,1,3,"Sjoblom, Miss. Anna Sofia",female,18,0,0,3101265,7.4958,,S +788,0,3,"Rice, Master. George Hugh",male,8,4,1,382652,29.125,,Q +789,1,3,"Dean, Master. Bertram Vere",male,1,1,2,C.A. 2315,20.575,,S +790,0,1,"Guggenheim, Mr. Benjamin",male,46,0,0,PC 17593,79.2,B82 B84,C +791,0,3,"Keane, Mr. Andrew ""Andy""",male,,0,0,12460,7.75,,Q +792,0,2,"Gaskell, Mr. Alfred",male,16,0,0,239865,26,,S +793,0,3,"Sage, Miss. Stella Anna",female,,8,2,CA. 2343,69.55,,S +794,0,1,"Hoyt, Mr. William Fisher",male,,0,0,PC 17600,30.6958,,C +795,0,3,"Dantcheff, Mr. Ristiu",male,25,0,0,349203,7.8958,,S +796,0,2,"Otter, Mr. Richard",male,39,0,0,28213,13,,S +797,1,1,"Leader, Dr. Alice (Farnham)",female,49,0,0,17465,25.9292,D17,S +798,1,3,"Osman, Mrs. Mara",female,31,0,0,349244,8.6833,,S +799,0,3,"Ibrahim Shawah, Mr. Yousseff",male,30,0,0,2685,7.2292,,C +800,0,3,"Van Impe, Mrs. 
Jean Baptiste (Rosalie Paula Govaert)",female,30,1,1,345773,24.15,,S +801,0,2,"Ponesell, Mr. Martin",male,34,0,0,250647,13,,S +802,1,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31,1,1,C.A. 31921,26.25,,S +803,1,1,"Carter, Master. William Thornton II",male,11,1,2,113760,120,B96 B98,S +804,1,3,"Thomas, Master. Assad Alexander",male,0.42,0,1,2625,8.5167,,C +805,1,3,"Hedman, Mr. Oskar Arvid",male,27,0,0,347089,6.975,,S +806,0,3,"Johansson, Mr. Karl Johan",male,31,0,0,347063,7.775,,S +807,0,1,"Andrews, Mr. Thomas Jr",male,39,0,0,112050,0,A36,S +808,0,3,"Pettersson, Miss. Ellen Natalia",female,18,0,0,347087,7.775,,S +809,0,2,"Meyer, Mr. August",male,39,0,0,248723,13,,S +810,1,1,"Chambers, Mrs. Norman Campbell (Bertha Griggs)",female,33,1,0,113806,53.1,E8,S +811,0,3,"Alexander, Mr. William",male,26,0,0,3474,7.8875,,S +812,0,3,"Lester, Mr. James",male,39,0,0,A/4 48871,24.15,,S +813,0,2,"Slemen, Mr. Richard James",male,35,0,0,28206,10.5,,S +814,0,3,"Andersson, Miss. Ebba Iris Alfrida",female,6,4,2,347082,31.275,,S +815,0,3,"Tomlin, Mr. Ernest Portage",male,30.5,0,0,364499,8.05,,S +816,0,1,"Fry, Mr. Richard",male,,0,0,112058,0,B102,S +817,0,3,"Heininen, Miss. Wendla Maria",female,23,0,0,STON/O2. 3101290,7.925,,S +818,0,2,"Mallet, Mr. Albert",male,31,1,1,S.C./PARIS 2079,37.0042,,C +819,0,3,"Holm, Mr. John Fredrik Alexander",male,43,0,0,C 7075,6.45,,S +820,0,3,"Skoog, Master. Karl Thorsten",male,10,3,2,347088,27.9,,S +821,1,1,"Hays, Mrs. Charles Melville (Clara Jennings Gregg)",female,52,1,1,12749,93.5,B69,S +822,1,3,"Lulic, Mr. Nikola",male,27,0,0,315098,8.6625,,S +823,0,1,"Reuchlin, Jonkheer. John George",male,38,0,0,19972,0,,S +824,1,3,"Moor, Mrs. (Beila)",female,27,0,1,392096,12.475,E121,S +825,0,3,"Panula, Master. Urho Abraham",male,2,4,1,3101295,39.6875,,S +826,0,3,"Flynn, Mr. John",male,,0,0,368323,6.95,,Q +827,0,3,"Lam, Mr. Len",male,,0,0,1601,56.4958,,S +828,1,2,"Mallet, Master. Andre",male,1,0,2,S.C./PARIS 2079,37.0042,,C +829,1,3,"McCormack, Mr. 
Thomas Joseph",male,,0,0,367228,7.75,,Q +830,1,1,"Stone, Mrs. George Nelson (Martha Evelyn)",female,62,0,0,113572,80,B28, +831,1,3,"Yasbeck, Mrs. Antoni (Selini Alexander)",female,15,1,0,2659,14.4542,,C +832,1,2,"Richards, Master. George Sibley",male,0.83,1,1,29106,18.75,,S +833,0,3,"Saad, Mr. Amin",male,,0,0,2671,7.2292,,C +834,0,3,"Augustsson, Mr. Albert",male,23,0,0,347468,7.8542,,S +835,0,3,"Allum, Mr. Owen George",male,18,0,0,2223,8.3,,S +836,1,1,"Compton, Miss. Sara Rebecca",female,39,1,1,PC 17756,83.1583,E49,C +837,0,3,"Pasic, Mr. Jakob",male,21,0,0,315097,8.6625,,S +838,0,3,"Sirota, Mr. Maurice",male,,0,0,392092,8.05,,S +839,1,3,"Chip, Mr. Chang",male,32,0,0,1601,56.4958,,S +840,1,1,"Marechal, Mr. Pierre",male,,0,0,11774,29.7,C47,C +841,0,3,"Alhomaki, Mr. Ilmari Rudolf",male,20,0,0,SOTON/O2 3101287,7.925,,S +842,0,2,"Mudd, Mr. Thomas Charles",male,16,0,0,S.O./P.P. 3,10.5,,S +843,1,1,"Serepeca, Miss. Augusta",female,30,0,0,113798,31,,C +844,0,3,"Lemberopolous, Mr. Peter L",male,34.5,0,0,2683,6.4375,,C +845,0,3,"Culumovic, Mr. Jeso",male,17,0,0,315090,8.6625,,S +846,0,3,"Abbing, Mr. Anthony",male,42,0,0,C.A. 5547,7.55,,S +847,0,3,"Sage, Mr. Douglas Bullen",male,,8,2,CA. 2343,69.55,,S +848,0,3,"Markoff, Mr. Marin",male,35,0,0,349213,7.8958,,C +849,0,2,"Harper, Rev. John",male,28,0,1,248727,33,,S +850,1,1,"Goldenberg, Mrs. Samuel L (Edwiga Grabowska)",female,,1,0,17453,89.1042,C92,C +851,0,3,"Andersson, Master. Sigvard Harald Elias",male,4,4,2,347082,31.275,,S +852,0,3,"Svensson, Mr. Johan",male,74,0,0,347060,7.775,,S +853,0,3,"Boulos, Miss. Nourelain",female,9,1,1,2678,15.2458,,C +854,1,1,"Lines, Miss. Mary Conover",female,16,0,1,PC 17592,39.4,D28,S +855,0,2,"Carter, Mrs. Ernest Courtenay (Lilian Hughes)",female,44,1,0,244252,26,,S +856,1,3,"Aks, Mrs. Sam (Leah Rosen)",female,18,0,1,392091,9.35,,S +857,1,1,"Wick, Mrs. George Dennick (Mary Hitchcock)",female,45,1,1,36928,164.8667,,S +858,1,1,"Daly, Mr. 
Peter Denis ",male,51,0,0,113055,26.55,E17,S +859,1,3,"Baclini, Mrs. Solomon (Latifa Qurban)",female,24,0,3,2666,19.2583,,C +860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C +861,0,3,"Hansen, Mr. Claus Peter",male,41,2,0,350026,14.1083,,S +862,0,2,"Giles, Mr. Frederick Edward",male,21,1,0,28134,11.5,,S +863,1,1,"Swift, Mrs. Frederick Joel (Margaret Welles Barron)",female,48,0,0,17466,25.9292,D17,S +864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.55,,S +865,0,2,"Gill, Mr. John William",male,24,0,0,233866,13,,S +866,1,2,"Bystrom, Mrs. (Karolina)",female,42,0,0,236852,13,,S +867,1,2,"Duran y More, Miss. Asuncion",female,27,1,0,SC/PARIS 2149,13.8583,,C +868,0,1,"Roebling, Mr. Washington Augustus II",male,31,0,0,PC 17590,50.4958,A24,S +869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5,,S +870,1,3,"Johnson, Master. Harold Theodor",male,4,1,1,347742,11.1333,,S +871,0,3,"Balkic, Mr. Cerin",male,26,0,0,349248,7.8958,,S +872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47,1,1,11751,52.5542,D35,S +873,0,1,"Carlsson, Mr. Frans Olof",male,33,0,0,695,5,B51 B53 B55,S +874,0,3,"Vander Cruyssen, Mr. Victor",male,47,0,0,345765,9,,S +875,1,2,"Abelson, Mrs. Samuel (Hannah Wizosky)",female,28,1,0,P/PP 3381,24,,C +876,1,3,"Najib, Miss. Adele Kiamie ""Jane""",female,15,0,0,2667,7.225,,C +877,0,3,"Gustafsson, Mr. Alfred Ossian",male,20,0,0,7534,9.8458,,S +878,0,3,"Petroff, Mr. Nedelio",male,19,0,0,349212,7.8958,,S +879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S +880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56,0,1,11767,83.1583,C50,C +881,1,2,"Shelley, Mrs. William (Imanita Parrish Hall)",female,25,0,1,230433,26,,S +882,0,3,"Markun, Mr. Johann",male,33,0,0,349257,7.8958,,S +883,0,3,"Dahlberg, Miss. Gerda Ulrika",female,22,0,0,7552,10.5167,,S +884,0,2,"Banfield, Mr. Frederick James",male,28,0,0,C.A./SOTON 34068,10.5,,S +885,0,3,"Sutehall, Mr. Henry Jr",male,25,0,0,SOTON/OQ 392076,7.05,,S +886,0,3,"Rice, Mrs. 
William (Margaret Norton)",female,39,0,5,382652,29.125,,Q +887,0,2,"Montvila, Rev. Juozas",male,27,0,0,211536,13,,S +888,1,1,"Graham, Miss. Margaret Edith",female,19,0,0,112053,30,B42,S +889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S +890,1,1,"Behr, Mr. Karl Howell",male,26,0,0,111369,30,C148,C +891,0,3,"Dooley, Mr. Patrick",male,32,0,0,370376,7.75,,Q diff --git a/exploration.ipynb b/exploration.ipynb new file mode 100644 index 0000000..1566d26 --- /dev/null +++ b/exploration.ipynb @@ -0,0 +1,2520 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Emily Wang | Data Science Warmup Project | Spring 2016\n", + "\n", + "Includes stories/thoughts throughout the process, summary statistics, plots, and more!\n", + "\n", + "[Link to data and variable descriptions page on kaggle](https://www.kaggle.com/c/titanic/data?train.csv)" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import pprint as pp\n", + "import thinkstats2\n", + "import thinkplot" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "data = pd.read_csv('./data/train.csv')\n", + "\n", + "# Variable descriptions \n", + "# survival Survival\n", + "# (0 = No; 1 = Yes)\n", + "# pclass Passenger Class\n", + "# (1 = 1st; 2 = 2nd; 3 = 3rd)\n", + "# name Name\n", + "# sex Sex\n", + "# age Age\n", + "# sibsp Number of Siblings/Spouses Aboard\n", + "# parch Number of Parents/Children Aboard\n", + "# ticket Ticket Number\n", + "# fare Passenger Fare\n", + "# cabin Cabin\n", + "# embarked Port of Embarkation\n", + "# (C = Cherbourg; Q = Queenstown; S = Southampton)" + ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + 
"text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale2210A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female3810PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale2600STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female351011380353.1000C123S
4503Allen, Mr. William Henrymale35003734508.0500NaNS
5603Moran, Mr. JamesmaleNaN003308778.4583NaNQ
6701McCarthy, Mr. Timothy Jmale54001746351.8625E46S
7803Palsson, Master. Gosta Leonardmale23134990921.0750NaNS
8913Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)female270234774211.1333NaNS
91012Nasser, Mrs. Nicholas (Adele Achem)female141023773630.0708NaNC
101113Sandstrom, Miss. Marguerite Rutfemale411PP 954916.7000G6S
111211Bonnell, Miss. Elizabethfemale580011378326.5500C103S
121303Saundercock, Mr. William Henrymale2000A/5. 21518.0500NaNS
131403Andersson, Mr. Anders Johanmale391534708231.2750NaNS
141503Vestrom, Miss. Hulda Amanda Adolfinafemale14003504067.8542NaNS
151612Hewlett, Mrs. (Mary D Kingcome)female550024870616.0000NaNS
161703Rice, Master. Eugenemale24138265229.1250NaNQ
171812Williams, Mr. Charles EugenemaleNaN0024437313.0000NaNS
181903Vander Planke, Mrs. Julius (Emelia Maria Vande...female311034576318.0000NaNS
192013Masselmani, Mrs. FatimafemaleNaN0026497.2250NaNC
202102Fynney, Mr. Joseph Jmale350023986526.0000NaNS
212212Beesley, Mr. Lawrencemale340024869813.0000D56S
222313McGowan, Miss. Anna \"Annie\"female15003309238.0292NaNQ
232411Sloper, Mr. William Thompsonmale280011378835.5000A6S
242503Palsson, Miss. Torborg Danirafemale83134990921.0750NaNS
252613Asplund, Mrs. Carl Oscar (Selma Augusta Emilia...female381534707731.3875NaNS
262703Emir, Mr. Farred ChehabmaleNaN0026317.2250NaNC
272801Fortune, Mr. Charles Alexandermale193219950263.0000C23 C25 C27S
282913O'Dwyer, Miss. Ellen \"Nellie\"femaleNaN003309597.8792NaNQ
293003Todoroff, Mr. LaliomaleNaN003492167.8958NaNS
.......................................
86186202Giles, Mr. Frederick Edwardmale21102813411.5000NaNS
86286311Swift, Mrs. Frederick Joel (Margaret Welles Ba...female48001746625.9292D17S
86386403Sage, Miss. Dorothy Edith \"Dolly\"femaleNaN82CA. 234369.5500NaNS
86486502Gill, Mr. John Williammale240023386613.0000NaNS
86586612Bystrom, Mrs. (Karolina)female420023685213.0000NaNS
86686712Duran y More, Miss. Asuncionfemale2710SC/PARIS 214913.8583NaNC
86786801Roebling, Mr. Washington Augustus IImale3100PC 1759050.4958A24S
86886903van Melkebeke, Mr. PhilemonmaleNaN003457779.5000NaNS
86987013Johnson, Master. Harold Theodormale41134774211.1333NaNS
87087103Balkic, Mr. Cerinmale26003492487.8958NaNS
87187211Beckwith, Mrs. Richard Leonard (Sallie Monypeny)female47111175152.5542D35S
87287301Carlsson, Mr. Frans Olofmale33006955.0000B51 B53 B55S
87387403Vander Cruyssen, Mr. Victormale47003457659.0000NaNS
87487512Abelson, Mrs. Samuel (Hannah Wizosky)female2810P/PP 338124.0000NaNC
87587613Najib, Miss. Adele Kiamie \"Jane\"female150026677.2250NaNC
87687703Gustafsson, Mr. Alfred Ossianmale200075349.8458NaNS
87787803Petroff, Mr. Nedeliomale19003492127.8958NaNS
87887903Laleff, Mr. KristomaleNaN003492177.8958NaNS
87988011Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)female56011176783.1583C50C
88088112Shelley, Mrs. William (Imanita Parrish Hall)female250123043326.0000NaNS
88188203Markun, Mr. Johannmale33003492577.8958NaNS
88288303Dahlberg, Miss. Gerda Ulrikafemale2200755210.5167NaNS
88388402Banfield, Mr. Frederick Jamesmale2800C.A./SOTON 3406810.5000NaNS
88488503Sutehall, Mr. Henry Jrmale2500SOTON/OQ 3920767.0500NaNS
88588603Rice, Mrs. William (Margaret Norton)female390538265229.1250NaNQ
88688702Montvila, Rev. Juozasmale270021153613.0000NaNS
88788811Graham, Miss. Margaret Edithfemale190011205330.0000B42S
88888903Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN12W./C. 660723.4500NaNS
88989011Behr, Mr. Karl Howellmale260011136930.0000C148C
89089103Dooley, Mr. Patrickmale32003703767.7500NaNQ
\n", + "

891 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass \\\n", + "0 1 0 3 \n", + "1 2 1 1 \n", + "2 3 1 3 \n", + "3 4 1 1 \n", + "4 5 0 3 \n", + "5 6 0 3 \n", + "6 7 0 1 \n", + "7 8 0 3 \n", + "8 9 1 3 \n", + "9 10 1 2 \n", + "10 11 1 3 \n", + "11 12 1 1 \n", + "12 13 0 3 \n", + "13 14 0 3 \n", + "14 15 0 3 \n", + "15 16 1 2 \n", + "16 17 0 3 \n", + "17 18 1 2 \n", + "18 19 0 3 \n", + "19 20 1 3 \n", + "20 21 0 2 \n", + "21 22 1 2 \n", + "22 23 1 3 \n", + "23 24 1 1 \n", + "24 25 0 3 \n", + "25 26 1 3 \n", + "26 27 0 3 \n", + "27 28 0 1 \n", + "28 29 1 3 \n", + "29 30 0 3 \n", + ".. ... ... ... \n", + "861 862 0 2 \n", + "862 863 1 1 \n", + "863 864 0 3 \n", + "864 865 0 2 \n", + "865 866 1 2 \n", + "866 867 1 2 \n", + "867 868 0 1 \n", + "868 869 0 3 \n", + "869 870 1 3 \n", + "870 871 0 3 \n", + "871 872 1 1 \n", + "872 873 0 1 \n", + "873 874 0 3 \n", + "874 875 1 2 \n", + "875 876 1 3 \n", + "876 877 0 3 \n", + "877 878 0 3 \n", + "878 879 0 3 \n", + "879 880 1 1 \n", + "880 881 1 2 \n", + "881 882 0 3 \n", + "882 883 0 3 \n", + "883 884 0 2 \n", + "884 885 0 3 \n", + "885 886 0 3 \n", + "886 887 0 2 \n", + "887 888 1 1 \n", + "888 889 0 3 \n", + "889 890 1 1 \n", + "890 891 0 3 \n", + "\n", + " Name Sex Age SibSp \\\n", + "0 Braund, Mr. Owen Harris male 22 1 \n", + "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38 1 \n", + "2 Heikkinen, Miss. Laina female 26 0 \n", + "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 \n", + "4 Allen, Mr. William Henry male 35 0 \n", + "5 Moran, Mr. James male NaN 0 \n", + "6 McCarthy, Mr. Timothy J male 54 0 \n", + "7 Palsson, Master. Gosta Leonard male 2 3 \n", + "8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27 0 \n", + "9 Nasser, Mrs. Nicholas (Adele Achem) female 14 1 \n", + "10 Sandstrom, Miss. Marguerite Rut female 4 1 \n", + "11 Bonnell, Miss. Elizabeth female 58 0 \n", + "12 Saundercock, Mr. William Henry male 20 0 \n", + "13 Andersson, Mr. 
Anders Johan male 39 1 \n", + "14 Vestrom, Miss. Hulda Amanda Adolfina female 14 0 \n", + "15 Hewlett, Mrs. (Mary D Kingcome) female 55 0 \n", + "16 Rice, Master. Eugene male 2 4 \n", + "17 Williams, Mr. Charles Eugene male NaN 0 \n", + "18 Vander Planke, Mrs. Julius (Emelia Maria Vande... female 31 1 \n", + "19 Masselmani, Mrs. Fatima female NaN 0 \n", + "20 Fynney, Mr. Joseph J male 35 0 \n", + "21 Beesley, Mr. Lawrence male 34 0 \n", + "22 McGowan, Miss. Anna \"Annie\" female 15 0 \n", + "23 Sloper, Mr. William Thompson male 28 0 \n", + "24 Palsson, Miss. Torborg Danira female 8 3 \n", + "25 Asplund, Mrs. Carl Oscar (Selma Augusta Emilia... female 38 1 \n", + "26 Emir, Mr. Farred Chehab male NaN 0 \n", + "27 Fortune, Mr. Charles Alexander male 19 3 \n", + "28 O'Dwyer, Miss. Ellen \"Nellie\" female NaN 0 \n", + "29 Todoroff, Mr. Lalio male NaN 0 \n", + ".. ... ... ... ... \n", + "861 Giles, Mr. Frederick Edward male 21 1 \n", + "862 Swift, Mrs. Frederick Joel (Margaret Welles Ba... female 48 0 \n", + "863 Sage, Miss. Dorothy Edith \"Dolly\" female NaN 8 \n", + "864 Gill, Mr. John William male 24 0 \n", + "865 Bystrom, Mrs. (Karolina) female 42 0 \n", + "866 Duran y More, Miss. Asuncion female 27 1 \n", + "867 Roebling, Mr. Washington Augustus II male 31 0 \n", + "868 van Melkebeke, Mr. Philemon male NaN 0 \n", + "869 Johnson, Master. Harold Theodor male 4 1 \n", + "870 Balkic, Mr. Cerin male 26 0 \n", + "871 Beckwith, Mrs. Richard Leonard (Sallie Monypeny) female 47 1 \n", + "872 Carlsson, Mr. Frans Olof male 33 0 \n", + "873 Vander Cruyssen, Mr. Victor male 47 0 \n", + "874 Abelson, Mrs. Samuel (Hannah Wizosky) female 28 1 \n", + "875 Najib, Miss. Adele Kiamie \"Jane\" female 15 0 \n", + "876 Gustafsson, Mr. Alfred Ossian male 20 0 \n", + "877 Petroff, Mr. Nedelio male 19 0 \n", + "878 Laleff, Mr. Kristo male NaN 0 \n", + "879 Potter, Mrs. Thomas Jr (Lily Alexenia Wilson) female 56 0 \n", + "880 Shelley, Mrs. 
William (Imanita Parrish Hall) female 25 0 \n", + "881 Markun, Mr. Johann male 33 0 \n", + "882 Dahlberg, Miss. Gerda Ulrika female 22 0 \n", + "883 Banfield, Mr. Frederick James male 28 0 \n", + "884 Sutehall, Mr. Henry Jr male 25 0 \n", + "885 Rice, Mrs. William (Margaret Norton) female 39 0 \n", + "886 Montvila, Rev. Juozas male 27 0 \n", + "887 Graham, Miss. Margaret Edith female 19 0 \n", + "888 Johnston, Miss. Catherine Helen \"Carrie\" female NaN 1 \n", + "889 Behr, Mr. Karl Howell male 26 0 \n", + "890 Dooley, Mr. Patrick male 32 0 \n", + "\n", + " Parch Ticket Fare Cabin Embarked \n", + "0 0 A/5 21171 7.2500 NaN S \n", + "1 0 PC 17599 71.2833 C85 C \n", + "2 0 STON/O2. 3101282 7.9250 NaN S \n", + "3 0 113803 53.1000 C123 S \n", + "4 0 373450 8.0500 NaN S \n", + "5 0 330877 8.4583 NaN Q \n", + "6 0 17463 51.8625 E46 S \n", + "7 1 349909 21.0750 NaN S \n", + "8 2 347742 11.1333 NaN S \n", + "9 0 237736 30.0708 NaN C \n", + "10 1 PP 9549 16.7000 G6 S \n", + "11 0 113783 26.5500 C103 S \n", + "12 0 A/5. 2151 8.0500 NaN S \n", + "13 5 347082 31.2750 NaN S \n", + "14 0 350406 7.8542 NaN S \n", + "15 0 248706 16.0000 NaN S \n", + "16 1 382652 29.1250 NaN Q \n", + "17 0 244373 13.0000 NaN S \n", + "18 0 345763 18.0000 NaN S \n", + "19 0 2649 7.2250 NaN C \n", + "20 0 239865 26.0000 NaN S \n", + "21 0 248698 13.0000 D56 S \n", + "22 0 330923 8.0292 NaN Q \n", + "23 0 113788 35.5000 A6 S \n", + "24 1 349909 21.0750 NaN S \n", + "25 5 347077 31.3875 NaN S \n", + "26 0 2631 7.2250 NaN C \n", + "27 2 19950 263.0000 C23 C25 C27 S \n", + "28 0 330959 7.8792 NaN Q \n", + "29 0 349216 7.8958 NaN S \n", + ".. ... ... ... ... ... \n", + "861 0 28134 11.5000 NaN S \n", + "862 0 17466 25.9292 D17 S \n", + "863 2 CA. 
2343 69.5500 NaN S \n", + "864 0 233866 13.0000 NaN S \n", + "865 0 236852 13.0000 NaN S \n", + "866 0 SC/PARIS 2149 13.8583 NaN C \n", + "867 0 PC 17590 50.4958 A24 S \n", + "868 0 345777 9.5000 NaN S \n", + "869 1 347742 11.1333 NaN S \n", + "870 0 349248 7.8958 NaN S \n", + "871 1 11751 52.5542 D35 S \n", + "872 0 695 5.0000 B51 B53 B55 S \n", + "873 0 345765 9.0000 NaN S \n", + "874 0 P/PP 3381 24.0000 NaN C \n", + "875 0 2667 7.2250 NaN C \n", + "876 0 7534 9.8458 NaN S \n", + "877 0 349212 7.8958 NaN S \n", + "878 0 349217 7.8958 NaN S \n", + "879 1 11767 83.1583 C50 C \n", + "880 1 230433 26.0000 NaN S \n", + "881 0 349257 7.8958 NaN S \n", + "882 0 7552 10.5167 NaN S \n", + "883 0 C.A./SOTON 34068 10.5000 NaN S \n", + "884 0 SOTON/OQ 392076 7.0500 NaN S \n", + "885 5 382652 29.1250 NaN Q \n", + "886 0 211536 13.0000 NaN S \n", + "887 0 112053 30.0000 B42 S \n", + "888 2 W./C. 6607 23.4500 NaN S \n", + "889 0 111369 30.0000 C148 C \n", + "890 0 370376 7.7500 NaN Q \n", + "\n", + "[891 rows x 12 columns]" + ] + }, + "execution_count": 101, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data # look at the dataframe" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Brain dump: Intuition, Plans, Feels --> Guesses to confirm or deny via plots\n", + "\n", + "#### There must be some trend\n", + "* Age and survival almost certainly have a relationship -- the oldest and youngest people probably had trouble, unless there was a healthy adult looking out for them and helping them out? Let's look into survival and age with all the data, age and survival with the youngest, oldest, teens, adults, etc.\n", + "* There must be some relationship between pclass and survival -- were the emergency resources better for certain classes more so than others? Some possibilities include higher quality resources, higher quantity of resources, closeness of the pclass seatings/cabin to the escape resources, etc. 
Needs more contextual knowledge to confirm.\n", + "* sibsp and parch could be useful features because family members tend to look out for each other and escape together. Perhaps there is strength in numbers when it comes to surviving the Titanic?\n", + "\n", + "#### Uncertain given current information and context\n", + "* Ticket number does not seem to be relevant for discovering trends (it's probably a hash function of some sort to ensure it's a unique number in the registration process...)\n", + "* Uncertain of the relationship between embarked and survival -- might be some hidden underpinnings in the health/human factors of people who went from the different ports (i.e. people from Southampton are more physically fit and able to survive for some non-obvious reason??)\n", + "* Uncertain of the relationship between name and survival -- maybe the model could incorporate some pattern between family members and survival. Did families stick together and survive or die together, or did they scatter? In the former case the last name might be useful in the predictive model. This might be tedious to investigate in a visualization but could provide another feature engineering idea for the modeling phase.\n", + "* Fare might contain some non-obvious information about the personality and human aspects of the passengers, but might not be useful in the long run for the model... Will confirm this to ensure that the model doesn't include extra variables (because more dimensionality means that more data is needed to train a good model... #curseofdimensionality)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Finding some extremes and info in numbers" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Youngest passengers? Did they survive?\n", + "\n", + "# Oldest passengers? 
Did they survive?\n", + "\n", + "# pclass that survived the most?\n", + "\n", + "# Investigating the families -- sibsp\n", + "\n", + "# Investigating the married couples -- parch" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary statistics using groupby and comments that inspired further exploration work\n" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "<div>
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedAgeSibSpParchFare
Pclass
1461.5972220.62963038.2334410.4166670.35648184.154687
2445.9565220.47282629.8776300.4021740.38043520.662183
3439.1547860.24236325.1406200.6150710.39307513.675550
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Age SibSp Parch Fare\n", + "Pclass \n", + "1 461.597222 0.629630 38.233441 0.416667 0.356481 84.154687\n", + "2 445.956522 0.472826 29.877630 0.402174 0.380435 20.662183\n", + "3 439.154786 0.242363 25.140620 0.615071 0.393075 13.675550" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('Pclass').mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "^ Looks like people in first class were the best at surviving. Reasonable guess is that they had the best emergency resources or access to them. There could also be other features that confirm this trend; will investigate more in feature engineering phase.\n", + "\n", + "===" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdPclassAgeSibSpParchFare
Survived
0447.0163932.53187630.6261790.5537340.32969022.117887
1444.3684211.95029228.3436900.4736840.46491248.395408
\n", + "
" + ], + "text/plain": [ + " PassengerId Pclass Age SibSp Parch Fare\n", + "Survived \n", + "0 447.016393 2.531876 30.626179 0.553734 0.329690 22.117887\n", + "1 444.368421 1.950292 28.343690 0.473684 0.464912 48.395408" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('Survived').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassAgeSibSpParchFare
Sex
female431.0286620.7420382.15923627.9157090.6942680.64968244.479818
male454.1473140.1889082.38994830.7266450.4298090.23570225.523893
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass Age SibSp Parch \\\n", + "Sex \n", + "female 431.028662 0.742038 2.159236 27.915709 0.694268 0.649682 \n", + "male 454.147314 0.188908 2.389948 30.726645 0.429809 0.235702 \n", + "\n", + " Fare \n", + "Sex \n", + "female 44.479818 \n", + "male 25.523893 " + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('Sex').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassAgeParchFare
SibSp
0455.3700660.3453952.35197431.3975580.18585525.692028
1439.7272730.5358852.05741630.0897270.65550244.147370
2412.4285710.4642862.35714322.6200000.64285751.753718
3321.5625000.2500002.56250013.9166671.31250068.908862
4381.6111110.1666673.0000007.0555561.50000031.855556
5336.8000000.0000003.00000010.2000002.00000046.900000
8481.7142860.0000003.000000NaN2.00000069.550000
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass Age Parch Fare\n", + "SibSp \n", + "0 455.370066 0.345395 2.351974 31.397558 0.185855 25.692028\n", + "1 439.727273 0.535885 2.057416 30.089727 0.655502 44.147370\n", + "2 412.428571 0.464286 2.357143 22.620000 0.642857 51.753718\n", + "3 321.562500 0.250000 2.562500 13.916667 1.312500 68.908862\n", + "4 381.611111 0.166667 3.000000 7.055556 1.500000 31.855556\n", + "5 336.800000 0.000000 3.000000 10.200000 2.000000 46.900000\n", + "8 481.714286 0.000000 3.000000 NaN 2.000000 69.550000" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('SibSp').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassAgeSibSpFare
Parch
0445.2551620.3436582.32153432.1785030.23746325.586774
1465.1101690.5508472.20339024.4220001.08474646.778180
2416.6625000.5000002.27500017.2169122.06250064.337604
3579.2000000.6000002.60000033.2000001.00000025.951660
4384.0000000.0000002.50000044.5000000.75000084.968750
5435.2000000.2000003.00000039.2000000.60000032.550000
6679.0000000.0000003.00000043.0000001.00000046.900000
\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass Age SibSp Fare\n", + "Parch \n", + "0 445.255162 0.343658 2.321534 32.178503 0.237463 25.586774\n", + "1 465.110169 0.550847 2.203390 24.422000 1.084746 46.778180\n", + "2 416.662500 0.500000 2.275000 17.216912 2.062500 64.337604\n", + "3 579.200000 0.600000 2.600000 33.200000 1.000000 25.951660\n", + "4 384.000000 0.000000 2.500000 44.500000 0.750000 84.968750\n", + "5 435.200000 0.200000 3.000000 39.200000 0.600000 32.550000\n", + "6 679.000000 0.000000 3.000000 43.000000 1.000000 46.900000" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('Parch').mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
PassengerIdSurvivedPclassSibSpParchFare
Age
0.42804.0000001.0000003.0000000.0000001.0000008.516700
0.67756.0000001.0000002.0000001.0000001.00000014.500000
0.75557.5000001.0000003.0000002.0000001.00000019.258300
0.83455.5000001.0000002.0000000.5000001.50000023.875000
0.92306.0000001.0000001.0000001.0000002.000000151.550000
1.00415.4285710.7142862.7142861.8571431.57142930.005957
2.00346.9000000.3000002.6000002.1000001.30000037.536250
3.00272.0000000.8333332.5000001.8333331.33333325.781950
4.00466.1000000.7000002.6000001.6000001.40000029.543330
5.00380.0000001.0000002.7500001.7500001.25000022.717700
6.00762.3333330.6666672.6666671.3333331.33333325.583333
7.00288.6666670.3333332.6666672.6666671.33333331.687500
8.00400.2500000.5000002.5000002.0000001.25000028.300000
9.00437.2500000.2500003.0000002.5000001.75000027.938538
10.00620.0000000.0000003.0000001.5000002.00000026.025000
11.00534.5000000.2500002.5000002.5000001.50000054.240625
12.00126.0000001.0000003.0000001.0000000.00000011.241700
13.00614.0000001.0000002.5000000.0000000.50000013.364600
14.00312.0000000.5000002.5000002.0000000.83333342.625700
14.50112.0000000.0000003.0000001.0000000.00000014.454200
15.00554.6000000.8000002.6000000.4000000.40000049.655020
16.00422.2941180.3529412.5294120.7647060.52941225.745100
17.00423.0000000.4615382.3846150.6153850.38461528.389423
18.00516.2692310.3461542.4615380.3846150.42307738.063462
19.00389.4000000.3600002.3600000.3200000.20000027.869496
20.00493.0666670.2000003.0000000.2000000.0666678.624173
20.50228.0000000.0000003.0000000.0000000.0000007.250000
21.00390.2083330.2083332.5833330.3333330.20833331.565621
22.00365.7407410.4074072.5555560.1481480.22222225.504781
23.00510.2666670.3333332.1333330.4000000.26666737.994720
.....................
44.00437.1111110.3333332.1111110.4444440.22222229.758333
45.00367.5000000.4166672.0000000.3333330.58333336.818408
45.50268.0000000.0000002.0000000.0000000.00000017.862500
46.00427.0000000.0000001.3333330.3333330.00000055.458333
47.00534.6666670.1111111.7777780.2222220.11111127.601389
48.00663.1111110.6666671.6666670.5555560.55555637.893067
49.00533.5000000.6666671.3333330.6666670.16666759.929183
50.00457.2000000.5000001.6000000.4000000.20000064.025830
51.00456.1428570.2857142.0000000.1428570.14285728.752386
52.00589.5000000.5000001.3333330.5000000.33333351.402783
53.00572.0000001.0000001.0000002.0000000.00000051.479200
54.00383.6250000.3750001.5000000.5000000.50000044.477087
55.00254.5000000.5000001.5000000.0000000.00000023.250000
55.50153.0000000.0000003.0000000.0000000.0000008.050000
56.00542.7500000.5000001.0000000.0000000.25000043.976025
57.00700.0000000.0000002.0000000.0000000.00000011.425000
58.00325.0000000.6000001.0000000.0000000.60000093.901660
59.00164.0000000.0000002.5000000.0000000.00000010.375000
60.00583.7500000.5000001.2500000.7500000.50000055.000000
61.00374.6666670.0000001.6666670.0000000.00000024.019433
62.00552.5000000.5000001.2500000.0000000.00000035.900000
63.00380.0000001.0000002.0000000.5000000.00000043.772900
64.00492.5000000.0000001.0000000.5000002.000000144.500000
65.00264.3333330.0000001.6666670.0000000.33333332.093067
66.0034.0000000.0000002.0000000.0000000.00000010.500000
70.00709.5000000.0000001.5000000.5000000.50000040.750000
70.50117.0000000.0000003.0000000.0000000.0000007.750000
71.00295.5000000.0000001.0000000.0000000.00000042.079200
74.00852.0000000.0000003.0000000.0000000.0000007.775000
80.00631.0000001.0000001.0000000.0000000.00000030.000000
\n", + "

88 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " PassengerId Survived Pclass SibSp Parch Fare\n", + "Age \n", + "0.42 804.000000 1.000000 3.000000 0.000000 1.000000 8.516700\n", + "0.67 756.000000 1.000000 2.000000 1.000000 1.000000 14.500000\n", + "0.75 557.500000 1.000000 3.000000 2.000000 1.000000 19.258300\n", + "0.83 455.500000 1.000000 2.000000 0.500000 1.500000 23.875000\n", + "0.92 306.000000 1.000000 1.000000 1.000000 2.000000 151.550000\n", + "1.00 415.428571 0.714286 2.714286 1.857143 1.571429 30.005957\n", + "2.00 346.900000 0.300000 2.600000 2.100000 1.300000 37.536250\n", + "3.00 272.000000 0.833333 2.500000 1.833333 1.333333 25.781950\n", + "4.00 466.100000 0.700000 2.600000 1.600000 1.400000 29.543330\n", + "5.00 380.000000 1.000000 2.750000 1.750000 1.250000 22.717700\n", + "6.00 762.333333 0.666667 2.666667 1.333333 1.333333 25.583333\n", + "7.00 288.666667 0.333333 2.666667 2.666667 1.333333 31.687500\n", + "8.00 400.250000 0.500000 2.500000 2.000000 1.250000 28.300000\n", + "9.00 437.250000 0.250000 3.000000 2.500000 1.750000 27.938538\n", + "10.00 620.000000 0.000000 3.000000 1.500000 2.000000 26.025000\n", + "11.00 534.500000 0.250000 2.500000 2.500000 1.500000 54.240625\n", + "12.00 126.000000 1.000000 3.000000 1.000000 0.000000 11.241700\n", + "13.00 614.000000 1.000000 2.500000 0.000000 0.500000 13.364600\n", + "14.00 312.000000 0.500000 2.500000 2.000000 0.833333 42.625700\n", + "14.50 112.000000 0.000000 3.000000 1.000000 0.000000 14.454200\n", + "15.00 554.600000 0.800000 2.600000 0.400000 0.400000 49.655020\n", + "16.00 422.294118 0.352941 2.529412 0.764706 0.529412 25.745100\n", + "17.00 423.000000 0.461538 2.384615 0.615385 0.384615 28.389423\n", + "18.00 516.269231 0.346154 2.461538 0.384615 0.423077 38.063462\n", + "19.00 389.400000 0.360000 2.360000 0.320000 0.200000 27.869496\n", + "20.00 493.066667 0.200000 3.000000 0.200000 0.066667 8.624173\n", + "20.50 228.000000 0.000000 3.000000 0.000000 0.000000 7.250000\n", + "21.00 390.208333 0.208333 
2.583333 0.333333 0.208333 31.565621\n", + "22.00 365.740741 0.407407 2.555556 0.148148 0.222222 25.504781\n", + "23.00 510.266667 0.333333 2.133333 0.400000 0.266667 37.994720\n", + "... ... ... ... ... ... ...\n", + "44.00 437.111111 0.333333 2.111111 0.444444 0.222222 29.758333\n", + "45.00 367.500000 0.416667 2.000000 0.333333 0.583333 36.818408\n", + "45.50 268.000000 0.000000 2.000000 0.000000 0.000000 17.862500\n", + "46.00 427.000000 0.000000 1.333333 0.333333 0.000000 55.458333\n", + "47.00 534.666667 0.111111 1.777778 0.222222 0.111111 27.601389\n", + "48.00 663.111111 0.666667 1.666667 0.555556 0.555556 37.893067\n", + "49.00 533.500000 0.666667 1.333333 0.666667 0.166667 59.929183\n", + "50.00 457.200000 0.500000 1.600000 0.400000 0.200000 64.025830\n", + "51.00 456.142857 0.285714 2.000000 0.142857 0.142857 28.752386\n", + "52.00 589.500000 0.500000 1.333333 0.500000 0.333333 51.402783\n", + "53.00 572.000000 1.000000 1.000000 2.000000 0.000000 51.479200\n", + "54.00 383.625000 0.375000 1.500000 0.500000 0.500000 44.477087\n", + "55.00 254.500000 0.500000 1.500000 0.000000 0.000000 23.250000\n", + "55.50 153.000000 0.000000 3.000000 0.000000 0.000000 8.050000\n", + "56.00 542.750000 0.500000 1.000000 0.000000 0.250000 43.976025\n", + "57.00 700.000000 0.000000 2.000000 0.000000 0.000000 11.425000\n", + "58.00 325.000000 0.600000 1.000000 0.000000 0.600000 93.901660\n", + "59.00 164.000000 0.000000 2.500000 0.000000 0.000000 10.375000\n", + "60.00 583.750000 0.500000 1.250000 0.750000 0.500000 55.000000\n", + "61.00 374.666667 0.000000 1.666667 0.000000 0.000000 24.019433\n", + "62.00 552.500000 0.500000 1.250000 0.000000 0.000000 35.900000\n", + "63.00 380.000000 1.000000 2.000000 0.500000 0.000000 43.772900\n", + "64.00 492.500000 0.000000 1.000000 0.500000 2.000000 144.500000\n", + "65.00 264.333333 0.000000 1.666667 0.000000 0.333333 32.093067\n", + "66.00 34.000000 0.000000 2.000000 0.000000 0.000000 10.500000\n", + "70.00 709.500000 0.000000 
1.500000 0.500000 0.500000 40.750000\n", + "70.50 117.000000 0.000000 3.000000 0.000000 0.000000 7.750000\n", + "71.00 295.500000 0.000000 1.000000 0.000000 0.000000 42.079200\n", + "74.00 852.000000 0.000000 3.000000 0.000000 0.000000 7.775000\n", + "80.00 631.000000 1.000000 1.000000 0.000000 0.000000 30.000000\n", + "\n", + "[88 rows x 6 columns]" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data.groupby('Age').mean() " + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXIAAAEACAYAAACuzv3DAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADYtJREFUeJzt3G+MHHUdx/H3latCqaFWtK2AKamCiMZWjP+KMppCSqKI\nhqhNahoFNdEKYiIcxoQ9fWA1kRAl+sCjpKmIf0CaNgTTgkysISBgCy2lVmurVNurSqPVJ6KeD35z\nveXu6v6/me/d+5VsdmZud/Zzc7uf/d3vZg8kSZIkSZIkSZIkSZIkSZK66lTgUWAnsAf4arF9PrAN\n2AdsBeaVkk6S1JQ5xXU/8AhwMfB14IZi+43AuhJySZJaNAd4DLgQ2AssKLYvLNYlSRU1izS1cpw0\nEgc4Vvf1vnHrkqSKOoM0tfJuJhb3c1MfR5IEad67WX8D7gMuAoZJUypHgEXA0fE3XrJkycj+/fu7\nkVGSZpL9wKtbucOsBl8/k7EzUk4DLgV2AJuBNcX2NcCmCUn272dkZKRyl5tvvrn0DGYy00zMZabm\nLsCSVkocGo/IFwEbSIU/C9gIPFiU+Y+Aq4GDwIdafWBJUnc0KvJdwJsm2f4csKL7cSRJrWo0tTLt\nZFlWdoQJzNQcMzWvirnM1Dt9Pdz3SDHfI0lqUl9fH7TYzTNuRC5J041FLknBWeSSFJxFLknBWeSS\nFJxFLknBWeSSFJxFLknBWeSSFJxFLknBWeSSFJxFLknBWeSSFJxFLknBWeSSFJxFLknBWeSSFJxF\nLknBWeSSFJxFLknBWeSSFJxFLknBWeSSFJxFPgXWDgyVHUHSNGaRS1JwFrkkBdeoyM8BHgKeBnYD\n1xbba8AhYEdxWdmjfJKkBvobfP154HpgJzAXeALYBowAtxQXSVKJGhX5keIC8A/gGeCsYr2vV6Ek\nSc1rZY58MbAMeKRY/yzwJHA7MK+7sSRJzWq2yOcCdwPXkUbm3wHOBZYCh4Fv9CSdJKmhRlMrALOB\ne4DvAZuKbUfrvj4EbJnsjrVa7cRylmVkWdZORvXA2oEhblt3TdkxpBkvz3PyPO9oH42KvI80dbIH\nuLVu+yLSSBzgA8Cuye5cX+SSpInGD3IHBwdb3kejIl8OrAaeIp1mCPBFYBVpWmUEOAB8quVHliR1\nRaMi/wWTz6Pf34MskqQ2+MlOSQrOIpek4CxySQrOIq8Y/+WtpFZZ5JIUnEUuScFZ5JIUnEUuScFZ\n5JIUnEUuScFZ5JIUnEU+jXgOujQzWeSSFJxFLknBWeSSFJxFLknBWeSSFJxFLknBWeSaUp4iKXWf\nRS5JwVnkkhScRS5JwVnkkhScRS5J
wVnkkhScRS5JwVnkkhScRS5JwVnkkhRcoyI/B3gIeBrYDVxb\nbJ8PbAP2AVuBeb0KKEn6/xoV+fPA9cCFwNuAzwAXAAOkIj8PeLBYlySVoFGRHwF2Fsv/AJ4BzgKu\nADYU2zcAV/YknSSpoVbmyBcDy4BHgQXAcLF9uFiXJJWgv8nbzQXuAa4Djo/72khxmaBWq51YzrKM\nLMtaDihJ01me5+R53tE+miny2aQS3whsKrYNAwtJUy+LgKOT3bG+yCVJE40f5A4ODra8j0ZTK33A\n7cAe4Na67ZuBNcXyGsYKXpI0xRqNyJcDq4GngB3FtpuAdcCPgKuBg8CHepRPktRAoyL/BScfta/o\nchZJUhv8ZKckBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrlatnZg\nqOwIkupY5JIUnEUuScFZ5JIUnEUuScFZ5JIUnEUuScFZ5JIUnEUuScFZ5JIUnEUuScFZ5JIUnEUu\nScFZ5JIUnEUuScFZ5JIUnEUuScFZ5JIUXDNFvh4YBnbVbasBh4AdxWVl15NJkprSTJHfwcSiHgFu\nAZYVl592OZckqUnNFPl24Ngk2/u6nEWS1IZO5sg/CzwJ3A7M604cSVKr+tu833eALxfLXwG+AVw9\n/ka1Wu3EcpZlZFnW5sNJ0vSU5zl5nne0j3aL/Gjd8hCwZbIb1Re5JGmi8YPcwcHBlvfR7tTKorrl\nD/DCM1okSVOomRH5XcAlwJnAs8DNQAYsJZ29cgD4VI/ySZIaaKbIV02ybX23g0iS2uMnOyUpOItc\nkoKzyCUpOItclbV2YKjsCFIIFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkk\nBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeRS1JwFrkkBWeR\nS1JwFrkkBWeRS1JwzRT5emAY2FW3bT6wDdgHbAXmdT+aJKkZzRT5HcDKcdsGSEV+HvBgsS5JKkEz\nRb4dODZu2xXAhmJ5A3BlN0NJkprX7hz5AtJ0C8X1gu7EkSS1qr8L+xgpLhPUarUTy1mWkWVZFx5O\nqo61A0Pctu6asmMosDzPyfO8o320W+TDwELgCLAIODrZjeqLXJI00fhB7uDgYMv7aHdqZTOwplhe\nA2xqcz+SpA41U+R3AQ8D5wPPAh8D1gGXkk4/fE+xLkkqQTNTK6tOsn1FN4NIktrjJzslKTiLXJKC\ns8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8gl\nKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKTiLXJKCs8glKbhKFPnagaGyIzQU\nIaOkmakSRS5Jap9FLknB9Xd4/4PA34H/AM8Db+k0kCSpNZ0W+QiQAc91HkWS1I5uTK30dWEfkqQ2\ndVrkI8ADwOPAJzqPI0lqVadTK8uBw8DLgW3AXmB7p6EkSc3rtMgPF9d/Bu4l/bHzRJHXarUTN8yy\njCzLOnw4qTlrB4a4bd01ZceQGsrznDzPO9pHJ0U+BzgFOA6cDlwGDNbfoL7IJUkTjR/kDg4OnvzG\nJ9FJkS8gjcJH93MnsLWD/UmS2tBJkR8AlnYriCSpPX6yU5KCs8glKTiLXJKCm9FF7r+mlTQdzOgi\nl6TpwCKXpOAsckkKziKXpOAsckkKziKXpODCF7mnEKpT0/05NN2/P02DIpekmc4il6TgLHJJCs4i\nl6TgLHJJCs4il6TgLHJJCm5KitzzWKvLn00MU/Vzmm6PM1M4Ipek4CxySQrOIpek4CxySQrOIpek\n4CxySQrOIm9RFU+bqmImTeTPSb1ikUtScBa5JAXXSZGvBPYCvwFu7E4cSVKr2i3yU4DbSGX+OmAV\n
cEG3QvVSnudlR5jATM2pYqY//uHXZUeYVBWPlZl6p90ifwvwW+Ag8DzwA+D9XcrUU1X8wZmpOVXM\nZJE3z0y9026RnwU8W7d+qNgmSZpi7Rb5SFdTSJLa1tfm/d4G1Ehz5AA3Af8FvlZ3m98CS9pOJkkz\n037g1VPxQP3Fgy0GXgTsJMgfOyVJYy4Hfk0aed9UchZJkiRJ9arwYaH1wDCwq27bfGAbsA/YCsyb\n4kznAA8BTwO7gWsrkOtU4FHS9Nge4KsVyDTqFGAHsKVCmQ4CTxW5flmRXPOAu4FnSD/Dt5ac6XzS\n8Rm9/I30XC/7ON1Eeu3tAr4PvLgCmQCuKzLtLpapQq5TSNMti4HZlDd//k5gGS8s8q8DNxTLNwLr\npjjTQmBpsTyXNDV1QQVyzSmu+4FHgIsrkAng88CdwOZivQqZDpBeZPXKzrUB+Hix3A+cUYFMo2YB\nh0mDmDIzLQZ+RypvgB8Ca0rOBPB6UkedSurObaSTRMrOxduBn9atDxSXMizmhUW+F1hQLC8s1su0\nCVhBdXLNAR4DLqxAprOBB4B3MzYiLzsTpCJ/2bhtZeY6g1RQ41XhWAFcBmwvlsvMNJ80cHop6c1u\nC3BpyZkArgLq/y3ml0gFXnYurgK+W7e+GvjWVIcoLOaFRX6sbrlv3PpUWwz8HngJ5eeaRfrN6Thp\nJEAFMv2Y9BvVJYwVedmZIJXmDuBx4BPFtjJzLSVNjd0B/Ir02ju95Ez11gOfLpbLzvRJ0nP8KLCx\nIpleS3qDmU8aSD0MfLPVXL3474dRPiw0QnlZ5wL3kObDjo/7Whm5/ksqhLOBd5FGwWVmei/pxbaD\nk3/Woayf33LSG8zlwGdIU3j1pjpXP/Am4NvF9T+Z+BtwWcfqRcD7SG/K4011piXA50gDqFeSXoOr\nS84EaaT9NdI8+P2kAdV/Ws3ViyL/I2k+bNQ5pI/wV8Ew6dcUgEWksphqs0klvpE0tVKVXJD+KHUf\ncFHJmd4BXEGaxrgLeA/peFXhOB0urv8M3Ev6v0Nl5jpUXB4r1u8mFfqREjONuhx4gnSsoNzj9GbS\naPevwL+Bn5CmgatwnNYX+S4hjbz30eKx6kWRPw68hrEPC32YsT9WlW0z6Q8cFNeb/s9te6EPuJ10\nZsGtFcl1JmN/ET+NNG+4o+RMXyQNAM4FPgL8DPhoyZkg/er7kmL5dNL8766Scx0h/d+j84r1FaQz\nM7aUmGnUKtIb8agyj9Ne0ifSTyO9DleQXodVOE6vKK5fBXyQdEZN2c91oBofFroL+BPwL9IT/WOk\neagHKO+UnotJ0xg7GTs1a2XJud5AmlvdSTqt7gvF9rKP1ahLGBsIlJ3pXNJx2kk6VWz0uV12rjeS\nRuRPkkaaZ1Qg0+nAXxh746MCmW5g7PTDDaTfjsvOBPDzItdOxqY1q5BLkiRJkiRJkiRJkiRJkiRJ\nkiRJUi/8D75zeGYODSEaAAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data_hist = thinkstats2.Hist(data.Age)\n", + "thinkplot.Hist(data_hist)\n", + "thinkplot.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "^ Seems like it would be worth breaking down age a bit more... 
Let's look at the passengers one year old and under." + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW0AAAEACAYAAAB4ayemAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADOlJREFUeJzt3H2MHHUdx/H3tkcjD9KqJSBQhVRBNEZEqaiYjAYFDKBR\n/0AjIqTEPzwhxqeT+LBNfKjxCfXUP6ggQYU/QI0YUEEYpQFRoDSFAoEKAQERRVERIsj6x2/uene9\n6/z2bme23/b9Si6dvZ2b+dzO3edmv7tTkCRJkiRJkiRJkiRJkiRJO4lDgQ1TPh4DzhxqIklSlkXA\nQ8CKYQeRJNV7C7B+2CEkaVe1qM/1TwZ+1EQQSdJgLQEeAfYZdhBJ2lWN9LHu8cBNpOKetHLlyt6W\nLVsGGkqSdgFbgBf1+0WdPta9GLgCuGDG53u9Xq/f/bau2+3S7XaHHaOWOQfLnIPVVs7RsXXz/trf\nr/8Zq44+ad5fP7529by/th+dTgf662Agf6a9J3AM8ON+dyBJGpzc8cjjwPImg0iS6vX77pGwiqIY\ndoQs5hwscw5WhJwHvODQYUdoVN/zlFmEmGlLimMhM+2F2llm2pKkHYClLUmBWNqSFIilLUmBWNqS\nFIilLUmBWNqSFIilLUmBWNqSFIilLUmBWNqSFIilLUmBWNqSFIilLUmBWNqSFIilLUmBWNqSFIil\nLUmBWNqSFIilLUmBWNqSFEhOaS8DLgFuBzYDRzWaSJI0p5GMdb4BXA68q1p/z0YTSZLmVFfaS4E3\nAKdWt58GHms0kSRpTnXjkYOBR4DzgZuBc4E9mg4lSZpd3Zn2CHAEMAr8ATgHGAM+M3Wlbrc7uVwU\nBUVRDDKjJIVXliVlWS54O52a+/cDriedcQMcTSrtE6as0+v1egsOIkkTRsfWDW3f42tXt7KfTqcD\n9R28jbrxyJ+B+4FDqtvHALf1uxNJ0mDkvHvkQ8APgSXAFuC0RhNJkuaUU9obgSObDiJJqucVkZIU\niKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUt\nSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUiKUtSYFY2pIUyEjmevcC/wT+BzwFrGoq\nkCRpbrml3QMK4NHmokiS6vQzHuk0lkKSlCW3tHvAVcCNwBnNxZEkbU/ueOT1wEPAPsCVwB3AtRN3\ndrvdyRWLoqAoioEFlKSdQVmWlGW54O3MZ+TxWeDfwFer271er7fgIJI0YXRs3dD2Pb52dSv76XQ6\nMI8OzhmP7AE8u1reE3gLsKnfHUmSFi5nPLIv8JMp6/8Q+FVjiSRJc8op7XuAw5sOIkmq5xWRkhSI\npS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1J\ngVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhRIbmkvBjYAlzWYRZJUI7e0\nzwI2A70Gs0iSauSU9oHAW4F1QKfZOJKk7ckp7a8DHwOeaTiLJKnGSM39JwB/Ic2zi7lW6na7k8tF\nUVAUc64qSbuksiwpy3LB26kbd3wBOAV4GngWsDdwKfC+Kev0ej1H3ZIGZ3Rs3dD2Pb52dSv76XQ6\nMI+Rc9145GxgBXAwcDJwNdMLW5LUon7fp+0ptSQNUd1Me6rfVB+SpCHxikhJCsTSlqRALG1JCsTS\nl
qRALG1JCsTSlqRALG1JCsTSlqRALG1JCsTSlqRALG1JCsTSlqRALG1JCsTSlqRALG1JCsTSlqRA\nLG1JCsTSlqRALG1JCsTSlqRALG1JCsTSlqRALG1JCiSntJ8F3ADcAmwGvthoIknSnEYy1nkSeCPw\nn2r99cDR1b+SpBbljkf+U/27BFgMPNpMHEnS9uSW9iLSeORh4BrSmESS1LKc8QjAM8DhwFLgl0AB\nlBN3drvdyRWLoqAoigHFk6SdQ1mWlGW54O105vE1nwaeAL5S3e71er0FB5GkCaNj64a27/G1q1vZ\nT6fTgXl0cM54ZDmwrFreHXgzsKHfHUmSFi5nPPJ84AJSwS8CLgR+3WQoSdLsckp7E3BE00EkSfW8\nIlKSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uS\nArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSArG0JSkQS1uSAskp7RXANcBtwK3A\nmY0mkiTNaSRjnaeADwO3AHsBNwFXArc3mEuSNIucM+0/kwob4N+kst6/sUSSpDn1O9M+CHglcMPg\no0iS6uSMRybsBVwCnEU6457U7XYnl4uioCiKAUSTmjM6tm4o+x1fu3qHyqD2lGVJWZYL3k5uae8G\nXAr8APjpzDunlrYkaVszT2jXrFkzr+3kjEc6wPeAzcA589qLJGkgckr79cB7gTcCG6qP45oMJUma\nXc54ZD1ehCNJOwTLWJICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBL\nW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5ICsbQlKRBLW5IC\nySnt84CHgU0NZ5Ek1cgp7fOB45oOIkmql1Pa1wJ/bzqIJKmeM21JCmRkEBvpdruTy0VRUBTFIDY7\nL6Nj64ay3/G1q4ey32g8Pjs+j1EzyrKkLMsFb2fgpS1J2tbME9o1a9bMazuORyQpkJzSvgi4DjgE\nuB84rdFEkqQ55YxH3t14CklSFscjkhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVja\nkhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSIpS1JgVjakhSI\npS1JgeSU9nHAHcBdwCeajSNJ2p660l4MjJOK+6XAu4HDmg7VhAfuu3PYEbKUZTnsCFmi5Ixy3M05\nOBEyLkRdaa8C7gbuBZ4CLgbe1nCmRkQ5kFHKMErOKMfdnIMTIeNC1JX2AcD9U27/qfqcJGkI6kq7\n10oKSVKWTs39RwFd0kwb4JPAM8CXpqxzN7By4Mkkaee2BXjRoDc6Um34IGAJcAtBX4iUpF3F8cCd\npDPqTw45iyRJkrTzqbvI5m3ARmADcBPwpvaiTZN7MdCRwNPAO9oINYu6nAXwGOnx3AB8qrVk0+U8\nngUp461A2UqqbdXl/ChbH8tNpGO/rLV0SV3G5cAvSGPIW4H3t5ZsurqczwF+Qvp9vwF4WXvRJp0H\nPEw6lnP5Jul72Ai8so1Qs6jL+RLgeuBJ4COD3PFi0njkIGA3Zp9t7zll+eXV+m3LyTmx3tXAz4F3\nthVuxv7rchbAz1pNta2cnMuA24ADq9vL2wo3Re5xn3ACcFXzsabJydgFvlgtLwf+RnpdqU05Ob8M\nfLpaPpT2H0uAN5CKeK4yfCtwebX8GuB3bYSaRV3OfYBXA58js7Rz/++RnItsHp+yvBfw18xtD1Lu\nxUAfAi4BHmkt2XS5Oeve3dO0nJzvAS4lvYcfduzjPuE9wEXNx5omJ+NDwN7V8t6k0n66pXwTcnIe\nBlxTLd9JKvh92ok36Vrg79u5/yTggmr5BtLJxb5Nh5pFXc5HgBt
Jj3WW3NLOvcjm7cDtwBXAmbkh\nBign5wGkH8LvVreH8V70nJw94HWkp3aXk/4bgbbl5Hwx8FzSL/GNwCntRJumn4vA9gCOJf2haVNO\nxnNJo4YHScf9rHaiTZOTcyNbx4qrgBey9ZnWjmK272NHyzgvuaWdW2w/Jf0VPhG4cF6JFiYn5znA\nWLVuh+GczebkvBlYAbwC+BbpsW1bTs7dgCNIT0ePJT1tfnGToWbRzx/eE4H1wD8ayjKXnIxnk8YR\n+wOHA98Gnt1kqFnk5FxLOnPdAIxW//6vyVDzNPN3e6e4WDB3XvYAqUAmrGDr0+HZXFtt+3mkp3ht\nycn5KtJTPkhzw+NJT03anB/n5PzXlOUrgO+QzmgfbTbaNDk57yeNRJ6oPn5L+kNzVxsBK/38fJ5M\n+6MRyMv4OuDz1fIW4B7SzPjGxtNtlfuzefqU2/cAf2w4V79mfh8HVp/bZeRcZLOSrX/ZjqjWb1u/\nFwOdz3DePZKTc1+2Pp6rSDPGtuXkfAnphajFpNHDJtof5eQe96Wkk4jdW0u2VU7GrwGfrZb3JZXl\nc1vKNyEn59LqPoAzgO+3lG2mg8h7IfIohvdCJGw/54QuA373CMx+kc0Hqg+Aj5PeprSBdKZ95KAD\nZKrLOdWwShvqc36Q9HjeAlxH+sEbhpzH86Okd5BsYjivZUBezlOBH7Wca6q6jMuBy0gz402kF0yH\noS7na6v77yC9oL+07YCkZ0sPAv8lPds7nW2P9zjpe9hIOpEchrqc+1Wff4z0guV9pDdySJIkSZIk\nSZIkSZIkSZIkSZIkSZrp/4ZZbBO7NN0hAAAAAElFTkSuQmCC\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "oyo_data = data[data.Age <= 1.00]\n", + "oyo_data.groupby('Age').mean() # Confirm that this matches the top rows in the code cell above\n", + "\n", + "# Do more explorations with this oyo_data subset\n", + "oyo_data_hist = thinkstats2.Hist(oyo_data.Age)\n", + "thinkplot.Hist(oyo_data_hist)\n", + "thinkplot.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Normalizing \n", + "# Correlation != causation\n", + "# Try using different tools" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + 
"nbformat": 4, + "nbformat_minor": 0 +} diff --git a/model_iteration_1.ipynb b/model_iteration_1.ipynb new file mode 100644 index 0000000..01e6120 --- /dev/null +++ b/model_iteration_1.ipynb @@ -0,0 +1,796 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this notebook, we'll be building a predictive model for survival on the titanic based on training data provided by kaggle. This is part of the Warmup Project for Data Science 2016. \n", + "\n", + "#### A. import the training data." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "titanic = pandas.read_csv(\"./data/train.csv\")\n", + "\n", + "# Uncomment print statements below to take a look at the \n", + "# first 5 rows of the dataframe and the describing output.\n", + "# print(titanic.head(5))\n", + "# print(titanic.describe())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### B. clean up the missing data. \n", + "\n", + "Occasionally a dataset contains missing values (null, not a number, NA, etc.) and we want to prevent these missing values from affecting our computations in unintended ways. In particular, this training data set has missing values for `Age`, so let's clean that up!" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "titanic[\"Age\"] = titanic[\"Age\"].fillna(titanic[\"Age\"].median())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### C. convert non-numeric (categorical) variables into usable numbers!\n", + "\n", + "In particular, `Sex` and `Embarked` should be converted into usable numbers. 
We'll find all the unique values for these non-numeric data points and replace them with numbers that can be used by the predictive model in a later step." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "unique genders are ['male' 'female']\n" + ] + } + ], + "source": [ + "# Find all the unique genders \n", + "print\"unique genders are\", titanic[\"Sex\"].unique()\n", + "\n", + "# From genders to numbers\n", + "titanic.loc[titanic[\"Sex\"] == \"male\", \"Sex\"] = 0\n", + "titanic.loc[titanic[\"Sex\"] == \"female\", \"Sex\"] = 1" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "unique embarked values are ['S' 'C' 'Q' nan]\n" + ] + } + ], + "source": [ + "# Find all the uniqued embarked values\n", + "print \"unique embarked values are\", titanic[\"Embarked\"].unique()\n", + "\n", + "# From embarked letters to numbers\n", + "titanic[\"Embarked\"] = titanic[\"Embarked\"].fillna(\"S\")\n", + "titanic.loc[titanic[\"Embarked\"] == \"S\", \"Embarked\"] = 0\n", + "titanic.loc[titanic[\"Embarked\"] == \"C\", \"Embarked\"] = 1\n", + "titanic.loc[titanic[\"Embarked\"] == \"Q\", \"Embarked\"] = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### D. cross validation, linear regression, first stab at predictions \n", + "\n", + "We want to make sure that we don't train our model on the same data that we'll make predictions on, so we're going to split the data into several folds. In each trial, one fold will be set aside for predictions, and the remaining folds will be used for training. Thus there's no overlap between the folds/partitions that were used for training and the one fold used for predictions. 
We'll run several trials with these fold combinations and eventually get predictions for the entire dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Code from dataquest mission 74, part 9.\n", + "\n", + "# Import the linear regression class\n", + "from sklearn.linear_model import LinearRegression\n", + "# Sklearn also has a helper that makes it easy to do cross validation\n", + "from sklearn.cross_validation import KFold\n", + "\n", + "# The columns we'll use to predict the target\n", + "predictors = [\"Pclass\", \"Sex\", \"Age\", \"SibSp\", \"Parch\", \"Fare\", \"Embarked\"]\n", + "\n", + "# Initialize our algorithm class\n", + "alg = LinearRegression()\n", + "# Generate cross validation folds for the titanic dataset. It return the row indices corresponding to train and test.\n", + "# We set random_state to ensure we get the same splits every time we run this.\n", + "kf = KFold(titanic.shape[0], n_folds=3, random_state=1)\n", + "\n", + "predictions = []\n", + "for train, test in kf:\n", + " # The predictors we're using the train the algorithm. 
Note how we only take the rows in the train folds.\n", + " train_predictors = (titanic[predictors].iloc[train,:])\n", + " # The target we're using to train the algorithm.\n", + " train_target = titanic[\"Survived\"].iloc[train]\n", + " # Training the algorithm using the predictors and target.\n", + " alg.fit(train_predictors, train_target)\n", + " # We can now make predictions on the test fold\n", + " test_predictions = alg.predict(titanic[predictors].iloc[test,:])\n", + " predictions.append(test_predictions)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[array([ 8.99877810e-02, 9.60756206e-01, 5.92676278e-01,\n", + " 9.31138728e-01, 5.29343071e-02, 1.70275685e-01,\n", + " 3.69943590e-01, 1.03474847e-01, 5.21597906e-01,\n", + " 8.74491050e-01, 6.48883611e-01, 8.29742769e-01,\n", + " 1.34797198e-01, -1.61126844e-01, 6.58141307e-01,\n", + " 6.39819748e-01, 1.51733875e-01, 2.95432718e-01,\n", + " 5.35377959e-01, 6.21007683e-01, 2.61872592e-01,\n", + " 2.62687561e-01, 7.31739160e-01, 5.05995897e-01,\n", + " 5.61398567e-01, 3.35039734e-01, 1.30338808e-01,\n", + " 4.68765767e-01, 6.60737753e-01, 9.10819218e-02,\n", + " 4.77223920e-01, 1.04220026e+00, 6.60691613e-01,\n", + " 8.71539273e-02, 5.28550732e-01, 4.01874338e-01,\n", + " 1.30340307e-01, 1.29339672e-01, 5.72717129e-01,\n", + " 6.65238822e-01, 4.83215779e-01, 7.60807408e-01,\n", + " 1.30578363e-01, 8.71867121e-01, 7.09855487e-01,\n", + " 9.11369897e-02, 1.39181745e-01, 6.60691613e-01,\n", + " 6.82833485e-02, 6.06254374e-01, 4.92254383e-02,\n", + " 1.29250392e-01, 9.02668258e-01, 7.51677954e-01,\n", + " 3.19636822e-01, 5.05995897e-01, 8.23411477e-01,\n", + " 1.27611544e-01, 8.16516947e-01, -3.70209060e-02,\n", + " 1.63085464e-01, 9.57981340e-01, 3.96742103e-01,\n", + " 6.16138409e-02, 5.42714233e-01, 6.62112275e-02,\n", + " 7.79751268e-01, 1.40293401e-01, 4.40592742e-01,\n", + " 
3.50534388e-02, 2.72709814e-01, 4.26360339e-01,\n", + " 3.55241143e-01, 1.10226880e-01, 8.66078358e-02,\n", + " 1.07366720e-01, 9.10819218e-02, 9.11369897e-02,\n", + " 3.82661024e-01, 5.72471068e-01, 1.24221410e-01,\n", + " 8.61972872e-02, 6.60705005e-01, 5.10138486e-01,\n", + " 8.45241581e-01, 4.56477760e-01, 3.22699204e-02,\n", + " 9.11369897e-02, 9.37604538e-01, 1.12967094e-01,\n", + " 8.56794636e-02, 1.34727274e-01, 3.83320807e-01,\n", + " 6.14970393e-03, -7.83320148e-02, 9.11369897e-02,\n", + " 3.10516665e-01, 5.49345421e-01, 7.23544338e-01,\n", + " 2.33721448e-01, 5.81750798e-01, 9.10819218e-02,\n", + " 5.25738424e-01, 6.40651310e-02, -2.52427240e-02,\n", + " 9.10819218e-02, 6.19865700e-01, 9.10387818e-02,\n", + " 3.65066610e-02, 6.32939707e-01, 4.08195377e-01,\n", + " 6.63657306e-01, 1.23882146e-01, 5.92491292e-01,\n", + " 6.83623624e-01, 1.29295032e-01, -6.19221217e-02,\n", + " 2.59223480e-01, 6.09655955e-01, 5.30794378e-01,\n", + " 2.88023805e-01, 9.11369897e-02, 2.82857942e-01,\n", + " 7.61542726e-01, 3.45640063e-01, 1.85484998e-01,\n", + " 1.70022737e-01, 1.12642722e-01, 5.59420117e-01,\n", + " -2.02485747e-03, 1.03290733e-01, 1.34440079e-01,\n", + " 4.46807623e-01, 7.51677954e-01, 3.11805296e-01,\n", + " 3.62947385e-01, 9.75724449e-01, 4.29554800e-01,\n", + " 1.57043954e-01, 5.82928575e-01, 5.57105476e-01,\n", + " 6.14443886e-01, 5.72812834e-01, 2.18783352e-01,\n", + " 3.49472299e-01, 2.86040080e-01, 9.65037360e-02,\n", + " 5.60916106e-01, 1.86919710e-01, 2.19027353e-01,\n", + " 1.69739986e-01, 1.00690768e+00, -5.89449777e-02,\n", + " -4.15452572e-02, 9.08736139e-02, 3.95827915e-01,\n", + " 7.26175962e-01, 8.02219375e-02, 9.13557255e-02,\n", + " -2.22536096e-01, -2.66919104e-02, 7.21593360e-01,\n", + " 1.01953834e-01, 1.51388512e-01, 8.19705948e-02,\n", + " 1.32518461e-01, 9.70245311e-01, 3.28974893e-01,\n", + " 5.02576476e-01, 1.08437940e-01, 3.25183297e-01,\n", + " 1.40818823e-01, 6.63268211e-01, 1.29295032e-01,\n", + " 3.90965934e-01, 7.86503606e-02, 
-3.68524682e-02,\n", + " 9.13671691e-01, 2.84517666e-01, 4.46019673e-02,\n", + " 2.68132779e-01, 3.35661255e-01, 1.96299597e-03,\n", + " 3.51470400e-01, 6.51010647e-01, 5.11174133e-01,\n", + " 6.29850621e-01, 4.10021732e-01, 4.03081359e-02,\n", + " 4.74217131e-02, 7.64271489e-01, 3.44550453e-01,\n", + " 5.97245007e-01, 3.69521460e-01, 9.46062691e-01,\n", + " 9.12083149e-01, 1.70022737e-01, -1.85251802e-02,\n", + " 6.60691613e-01, 8.07931698e-01, 9.16548133e-02,\n", + " -2.22536096e-01, 5.78367977e-02, 3.48321010e-02,\n", + " 1.45712251e-01, 6.91179799e-01, 3.84837497e-02,\n", + " 1.45383056e-01, 7.26181926e-01, 4.78394987e-01,\n", + " 1.12609974e-01, 7.50755869e-01, 1.23596450e-01,\n", + " 2.84517666e-01, 1.36414068e-01, 1.01395495e+00,\n", + " 5.87218752e-01, 1.90418359e-01, 1.02889863e+00,\n", + " 2.83624866e-01, 1.56627303e-01, 3.00890244e-01,\n", + " -3.43861103e-02, 9.10819218e-02, 4.37274991e-01,\n", + " 1.24346402e-01, 3.43657653e-01, 1.31782740e-01,\n", + " 3.50007979e-01, 4.53816408e-01, 9.41986239e-01,\n", + " 8.55812557e-02, 1.26427969e-01, 5.14461976e-01,\n", + " 3.16370023e-01, 5.81627306e-01, 1.79146187e-01,\n", + " 8.33217359e-01, 3.43657653e-01, 2.67886176e-01,\n", + " 5.89980704e-01, 6.29850621e-01, 2.89082393e-01,\n", + " 1.23551810e-01, 1.19423755e-01, 4.49914049e-01,\n", + " 5.98080236e-01, 7.41700785e-01, 3.95976588e-01,\n", + " 1.24570927e-01, 9.08512939e-02, 5.10217925e-01,\n", + " 3.17243789e-01, 4.94880818e-02, 4.48434902e-01,\n", + " 5.51647950e-01, 1.05176735e+00, 1.00396283e+00,\n", + " 1.16824364e+00, 6.37295280e-01, 1.70022737e-01,\n", + " 3.47081525e-02, 3.23790141e-01, 4.27827834e-01,\n", + " 6.60691613e-01, 2.50879710e-01, 1.07703504e-04,\n", + " 7.38026906e-02, 8.41682429e-01, 9.94221666e-01,\n", + " 5.04388858e-01, 1.04634754e-01, 6.84091736e-01,\n", + " 4.60920013e-01, 6.60691613e-01, 7.87205387e-01,\n", + " 4.88920786e-01, 2.90790162e-01, 1.24446245e-01,\n", + " 4.80968077e-01, -3.19057282e-02, 9.10670657e-02,\n", + " 
1.57145126e-01, 1.40254724e-01, 5.02603260e-01,\n", + " 1.03564537e-01, 8.07397611e-02, 1.23827078e-01,\n", + " 2.19027353e-01, 6.93436769e-01, 1.02306096e+00,\n", + " 1.07151871e+00, 2.91224311e-01, 6.03921666e-01,\n", + " 1.12912026e-01, 5.42714233e-01, 1.54899175e-01]), array([ 1.13774791, 0.44173212, 0.98551347, 0.66915371, 0.08254228,\n", + " 0.15142624, 0.83642014, 0.09704526, 0.64711481, 1.03845173,\n", + " 1.06064212, 0.24647842, 0.98364902, 1.04411609, 1.10195734,\n", + " 0.72596387, 0.09692709, 0.11388411, 0.60824987, 0.74905725,\n", + " 0.090424 , 1.00314273, 0.91588368, 0.13679886, 0.10365487,\n", + " 0.82296458, 0.755174 , -0.27746285, 1.0035964 , -0.12636043,\n", + " 0.70865678, 0.52438799, 1.06900476, 0.58044138, 0.32246331,\n", + " 0.45904751, 0.0848131 , 0.96838383, 0.09692709, 0.4123739 ,\n", + " 0.96908901, -0.01732698, 0.33119158, 0.38953146, 0.97455471,\n", + " 0.26457991, 0.28476325, 0.21075768, 0.78939013, 0.68174567,\n", + " 0.5508181 , 0.21132238, 0.00332574, 0.1315846 , 0.44518065,\n", + " 0.16116388, 0.07440511, 0.13363265, 0.09815645, 0.98913539,\n", + " 0.69520122, 0.66925272, 0.66925272, -0.05732283, 0.25605759,\n", + " 0.51306171, 0.04918447, 0.12689844, 0.08297663, 0.74556032,\n", + " 0.63153497, 0.66915371, 1.03349593, 0.46795359, 0.11283671,\n", + " 0.15759527, 0.5998862 , 0.6125967 , 0.96615292, 0.63469796,\n", + " 0.6051113 , 0.18499302, 0.15738453, 1.03364995, 0.80043282,\n", + " 0.07003835, 0.85871777, 0.09692709, 0.37822123, 0.03771546,\n", + " 0.70865678, 0.17123866, 0.87293786, 0.38692632, 0.14394491,\n", + " -0.00364112, 1.02362819, 0.60920867, 0.13721713, 0.57461098,\n", + " 0.1534423 , 0.29630296, 0.76221079, 0.0229439 , 0.11050082,\n", + " 0.59310377, 0.05272741, 0.64923598, 0.18004866, -0.05792355,\n", + " 0.37724772, 0.14392897, 0.44776777, 0.09692709, 0.17057126,\n", + " 0.97573347, 0.2546175 , -0.01069499, 0.59494436, 0.67712284,\n", + " 0.81048116, 0.25112435, 0.7091068 , 0.13414671, 0.21833626,\n", + " 0.09018337, 
0.5398775 , 0.11371054, 0.09643219, 0.72214613,\n", + " 0.83299143, 0.1712546 , 0.07013414, 0.43870508, 0.5508181 ,\n", + " 0.62795723, 0.17034196, 0.26289071, 1.03283656, 0.54234647,\n", + " 0.66429253, 0.2888594 , 0.24248073, 0.59832765, 0.15197868,\n", + " 0.06672256, 0.76247901, 0.09709316, 0.62328105, 0.85873908,\n", + " 0.39833841, 0.68526385, 0.28026543, 0.15249025, 0.0558822 ,\n", + " 0.46338875, 0.3322838 , 0.09704526, 0.12741893, 0.18977726,\n", + " 0.90570685, 0.61255203, 0.1712546 , 0.3041495 , 0.05667859,\n", + " 0.32003504, 0.13002433, 0.09704526, 0.02900113, 0.2546175 ,\n", + " 0.25032727, 0.17123545, 0.71385691, 0.09643219, 0.03023685,\n", + " 0.67057269, 0.83394424, 0.63668087, 0.45820842, 0.18004866,\n", + " 0.03925263, 0.13700639, 0.76347615, -0.01610677, 0.2546175 ,\n", + " -0.05096587, 0.36065035, 0.49526401, 0.44776777, 0.88783867,\n", + " 0.27650531, 0.0835897 , 0.17095571, 0.0558822 , 0.14352664,\n", + " 0.26008209, 0.20422092, 0.14413971, 0.13917582, 0.78823881,\n", + " 0.10244795, 0.983009 , 0.12376157, 0.17152021, 0.71624816,\n", + " 0.66906113, 0.5355726 , 1.06327957, 0.55601524, 0.71952689,\n", + " 0.43870508, 0.10813802, 0.14762674, 0.16452683, 0.09704526,\n", + " 0.38468169, 0.77378051, 0.12353167, 0.31660245, 0.72019649,\n", + " 0.18382257, 0.6683239 , 0.07001598, 0.97445504, 0.13729376,\n", + " 0.13363265, 0.88062695, 0.13363587, 0.08715737, 0.61255203,\n", + " 0.5883169 , 0.0229439 , 0.18684089, 0.88743056, 0.13363587,\n", + " 0.14770832, 0.62385335, 0.58195819, 0.89464072, 0.32433284,\n", + " 1.0215796 , 0.10198815, 1.01250232, 0.89757009, 0.52011358,\n", + " 0.50665802, 0.19733591, 0.33882963, 0.19608356, 0.78269614,\n", + " 0.3024605 , 0.01303333, 0.35740293, 0.59528255, 0.2812701 ,\n", + " 0.1713153 , 0.17399933, 0.63510029, 0.2099606 , 0.79897366,\n", + " 0.62993975, 0.84335812, 0.49799211, 0.1712546 , 0.01619374,\n", + " 0.26496308, 0.09704526, 0.59494436, 0.03570385, 0.1574771 ,\n", + " 0.55964686, 0.13363587, 0.0699841 , 
0.03391958, 0.68692335,\n", + " 0.38475832, 0.66915371, 0.17777861, 0.16253816, 0.72211234,\n", + " 0.83479538, 0.58677963, 0.07003835, 0.735757 , 0.90451305,\n", + " 0.09962007, 0.43250553, 0.13477258, 1.02529894, 0.13828479,\n", + " 0.24105043, 0.13741193, 0.09704526, 0.04924194, 0.80169436,\n", + " -0.03139561, 0.64987806]), array([ 1.72889219e-01, 1.70294715e-02, 7.82616935e-01,\n", + " -8.34788848e-03, 1.47022266e-01, 3.10888595e-01,\n", + " 7.28261340e-01, 1.01479914e-01, 4.24565622e-01,\n", + " 1.57316587e-02, 4.37708069e-01, 1.44204264e-02,\n", + " 9.07678482e-02, 4.33913871e-01, 8.26537251e-01,\n", + " 8.45262338e-01, 5.42776171e-01, 1.01763663e-01,\n", + " 6.70148479e-01, 1.92163452e-01, 6.39359534e-02,\n", + " 7.62650655e-01, 3.10124701e-02, 5.90024631e-01,\n", + " 8.31356231e-01, 2.78648916e-01, 1.08309653e-01,\n", + " 3.04531238e-01, 1.50864127e-01, 1.38986099e-01,\n", + " 1.36219795e-01, 2.51197915e-01, 2.02625887e-01,\n", + " 9.72357134e-01, 1.12191979e-01, 1.92169054e-01,\n", + " 1.50211875e-01, -2.14264992e-02, 4.52451020e-01,\n", + " 4.38789988e-01, 6.04820088e-01, 7.89326541e-01,\n", + " 8.00459867e-02, 2.10435721e-01, 5.70885269e-01,\n", + " 5.70841743e-02, 1.44342132e-01, 1.00451104e+00,\n", + " 6.42312317e-01, 8.51755703e-02, 7.33373007e-01,\n", + " 3.09602117e-01, 1.49684208e-01, 3.22228832e-01,\n", + " 1.01595923e-01, 6.50604478e-01, 1.01479914e-01,\n", + " 8.45026241e-01, 1.38791822e-01, 7.14365273e-01,\n", + " 7.68287651e-01, 1.84938938e-01, 1.01479914e-01,\n", + " 6.54218524e-01, 2.93878313e-01, 2.96413137e-01,\n", + " 1.92833539e-01, 8.27498735e-02, 3.28441263e-01,\n", + " 5.87658439e-02, 1.02674988e-01, 1.42090676e-01,\n", + " 2.83166248e-01, 1.01520440e-01, 2.10876914e-02,\n", + " 9.01930011e-01, 6.80182444e-01, 3.63633521e-01,\n", + " 4.29834748e-02, 2.51030051e-01, 2.71459394e-01,\n", + " 1.55080767e-01, 1.20174297e-01, 6.76615822e-01,\n", + " 5.21604336e-01, 2.74876851e-01, 7.14261845e-01,\n", + " 4.63722197e-01, 1.43882255e-01, 
-3.38493769e-02,\n", + " 5.08333972e-02, 2.88240761e-01, 4.71949096e-03,\n", + " 1.48920991e-01, 1.55073789e-01, 9.65241409e-01,\n", + " 3.61956120e-01, 8.01212426e-01, 8.51755703e-02,\n", + " 1.63090365e-01, 2.58489938e-01, 1.38385623e-01,\n", + " 1.57316587e-02, 7.14397446e-01, 2.98282232e-01,\n", + " 2.65779163e-02, 9.41922468e-01, 3.92478820e-01,\n", + " 7.25879907e-01, 2.08234335e-01, 7.05625434e-02,\n", + " 2.03820545e-01, 6.98106244e-01, 3.54986591e-01,\n", + " 9.42312534e-01, 1.08182230e-01, 1.01115214e+00,\n", + " 4.29882986e-01, 2.72580965e-01, 9.55913060e-02,\n", + " 1.38553363e-01, 1.49766670e-01, 8.76445205e-01,\n", + " 7.95521275e-01, 1.89563479e-01, 7.47402760e-02,\n", + " 9.05943831e-01, 1.19035222e-01, 2.34961953e-01,\n", + " 1.49265429e-01, 3.84688624e-01, 1.44070963e-01,\n", + " 6.51000458e-01, 7.14396037e-01, 2.37161612e-01,\n", + " 5.98123216e-01, 8.84762775e-01, 2.34195832e-01,\n", + " 2.71459394e-01, 2.93878313e-01, 2.93878313e-01,\n", + " 9.60495497e-02, 4.82543535e-01, 2.74738708e-01,\n", + " 1.01479914e-01, 1.01479914e-01, 4.28725578e-01,\n", + " 3.27845711e-01, 8.83507841e-01, 7.85083053e-02,\n", + " 8.54020195e-02, 1.53868294e-01, 1.25458500e-01,\n", + " 7.78614476e-01, 4.27536886e-01, 1.76095354e-01,\n", + " 8.78367308e-01, 2.23270579e-01, 7.41615725e-02,\n", + " 1.28260077e-01, 6.34105869e-01, 3.76826088e-01,\n", + " 1.01513462e-01, 3.21161697e-01, 6.92919862e-02,\n", + " 9.05219168e-01, 9.92643346e-02, 3.21100762e-02,\n", + " 1.89869119e-01, 8.47257439e-01, 1.65792833e-01,\n", + " 7.70032759e-01, 4.70822280e-01, 7.01001762e-01,\n", + " 1.45018183e-01, 7.98992141e-02, 1.22365867e-01,\n", + " -5.62678525e-03, 6.34840292e-01, 1.47022266e-01,\n", + " 6.21554022e-01, 1.55089154e-01, 1.92163452e-01,\n", + " 7.45360827e-01, 1.92167645e-01, 8.15272492e-01,\n", + " 7.49589740e-01, 9.59168970e-01, 4.23369546e-01,\n", + " 6.56067455e-02, 1.17831761e-01, 1.17764665e-01,\n", + " 6.77402825e-01, 1.31033823e-01, 2.11184136e-01,\n", + " 
3.61128670e-01, 1.92163452e-01, 3.27009298e-01,\n", + " 2.80865752e-01, 4.73809464e-01, 1.17548012e-01,\n", + " 2.08181789e-01, 8.39842956e-01, 6.07376016e-01,\n", + " 1.36308792e-01, 5.71394060e-01, 2.34961953e-01,\n", + " 7.32664113e-01, 4.58929866e-01, 2.99802486e-01,\n", + " 1.07144857e-01, 8.54523415e-02, 3.79873628e-01,\n", + " 6.77309159e-01, 2.08181789e-01, 8.74780819e-01,\n", + " 1.12194764e-01, 3.71105893e-02, 2.30444621e-01,\n", + " 5.78112549e-01, 8.80381008e-02, 4.38789988e-01,\n", + " 6.50478673e-01, 2.52145211e-01, 2.16244600e-02,\n", + " 7.72356638e-02, 7.64956968e-01, 1.06578734e-01,\n", + " 3.85229660e-01, 6.33022282e-01, 6.89918839e-02,\n", + " 1.92431836e-01, 8.51755703e-02, 4.59963761e-01,\n", + " 1.92163452e-01, 7.52074841e-01, 6.94810438e-01,\n", + " 3.74543331e-01, 1.47020857e-01, 1.28274033e-01,\n", + " 1.54904640e-01, 8.83372143e-01, 1.38714930e-01,\n", + " 1.01428183e-01, 6.37514393e-02, 4.74143535e-01,\n", + " 1.44318380e-01, 3.32209243e-01, 9.85223737e-01,\n", + " 1.12472244e-01, 1.60139061e-01, 2.66114644e-02,\n", + " -2.41362640e-01, 1.09304997e-01, 2.65882719e-01,\n", + " 9.34799595e-01, 6.65962224e-02, -1.44857067e-01,\n", + " 7.32175244e-01, 1.01756702e+00, 6.57625381e-01,\n", + " 6.82274953e-01, 7.78507074e-01, 3.06694232e-01,\n", + " 7.03120381e-01, 1.47020857e-01, -5.35194672e-02,\n", + " 2.63450207e-01, 8.45198988e-01, 2.80865752e-01,\n", + " 2.88522280e-01, 7.14342083e-01, 7.98068552e-01,\n", + " 4.05781543e-01, 1.00941736e-01, 1.92789366e-01,\n", + " 1.12191979e-01, 8.05473642e-01, 4.10332423e-01,\n", + " -6.55145848e-04, 7.89310178e-01, 7.38879084e-01,\n", + " 1.43673989e-01, 1.49684208e-01, 1.01479914e-01,\n", + " 8.33962978e-01, 8.06527571e-01, 7.46997500e-02,\n", + " 6.54965242e-01, 2.67936850e-01, 1.17831761e-01,\n", + " 6.75775470e-01, 2.72454182e-01, 9.99158265e-01,\n", + " 5.87835137e-01, 4.84754956e-01, 1.70739321e-01])]\n" + ] + } + ], + "source": [ + "print predictions" + ] + }, + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "#### D. contninued: accuracy!\n", + "\n", + "How did this first stab of predictions go? The possible outcomes are 1 and 0 (survival is a binary thing), but the linear regression model output doesn't match this binary format. Thus we have to map our predictions to outcomes. We'll also compute the accuracy of these results by comparing our predictions to the `Survived` column of the training data. " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# The predictions are in three separate numpy arrays. Concatenate them into one. \n", + "# We concatenate them on axis 0, as they only have one axis.\n", + "predictions = np.concatenate(predictions, axis=0)\n", + "\n", + "# Map predictions to outcomes (only possible outcomes are 1 and 0)\n", + "predictions[predictions > .5] = 1\n", + "predictions[predictions <=.5] = 0\n", + "\n", + "# Take a look\n", + "# print(predictions.shape)\n", + "# print(titanic[\"Survived\"].shape)\n", + "\n", + "num_accurate_predictions = 0 # counter\n", + "\n", + "# Check whether the predictions are correct\n", + "for i in range(predictions.shape[0]):\n", + " if predictions[i] == titanic[\"Survived\"][i]:\n", + " num_accurate_predictions +=1\n", + "\n", + "accuracy = float(num_accurate_predictions) / predictions.shape[0]\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The accuracy of this linear regression model is `0.783389450056` -- definitely a lot of room for improvement! Perhaps using a different model or some feature engineering could help. :)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### E. 
second stab: logistic regression" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.787878787879\n" + ] + } + ], + "source": [ + "from sklearn import cross_validation\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "# Initialize our algorithm\n", + "alg = LogisticRegression(random_state=1)\n", + "# Compute the accuracy score for all the cross validation folds. (much simpler than what we did before!)\n", + "scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"], cv=3)\n", + "# Take the mean of the scores (because we have one for each fold)\n", + "print(scores.mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The accuracy of the logistic regression model is `0.792368125701` -- better, but not perfect. Let's go through making a submission to kaggle before continuing to tweak the model." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### F. 
preparing a submission to kaggle; running the model on the test data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "titanic_test = pandas.read_csv(\"./data/test.csv\")\n", + "\n", + "# Age column\n", + "titanic_test[\"Age\"] = titanic_test[\"Age\"].fillna(titanic[\"Age\"].median())\n", + "\n", + "# Sex column\n", + "titanic_test.loc[titanic_test[\"Sex\"] == \"male\", \"Sex\"] = 0\n", + "titanic_test.loc[titanic_test[\"Sex\"] == \"female\", \"Sex\"] = 1\n", + "\n", + "# Embarked column\n", + "titanic_test[\"Embarked\"] = titanic_test[\"Embarked\"].fillna(\"S\")\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"S\", \"Embarked\"] = 0\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"C\", \"Embarked\"] = 1\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"Q\", \"Embarked\"] = 2\n", + "\n", + "# Fare column\n", + "titanic_test[\"Fare\"] = titanic_test[\"Fare\"].fillna(titanic[\"Fare\"].median())" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Initialize the algorithm class\n", + "alg = LogisticRegression(random_state=1)\n", + "\n", + "# Train the algorithm using all the training data\n", + "alg.fit(titanic[predictors], titanic[\"Survived\"])\n", + "\n", + "# Make predictions using the test set.\n", + "predictions = alg.predict(titanic_test[predictors])\n", + "\n", + "# Create a new dataframe with only the columns Kaggle wants from the dataset.\n", + "submission = pandas.DataFrame({\n", + " \"PassengerId\": titanic_test[\"PassengerId\"],\n", + " \"Survived\": predictions\n", + " })" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# generate a submission file\n", + "# commented out to prevent unintentional file overwrite/creation\n", + "# 
submission.to_csv(\"dataquest_logistic_regression.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uploaded the submission file to kaggle; it resulted in an score of 0.75120 (rank 3393). This model did approximately 3% worse on the test dataset compared to the training dataset. 3% does \"feel\" like a big difference, however it doesn't seem like overfitting was the only issue. It seems more likely to me that there are nuanced differences in the passenger data that this current model did not capture. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### G. improving the dataquest code\n", + "\n", + "Brain dump of ideas:\n", + "* Not using every feature in the model, relevant to the curse of dimensionality -- see if using the same logistic regression with less features is helpful. Perhaps things like ticket number and fare are not as useful as sex and age. \n", + "* Try different models\n", + "* Combine features together: perhaps combining sex and age into one feature somehow (encoding it with one digit for sex and one digit for age)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Helper functions: Use logistic regression, try using different features\n", + "\n", + "def make_titanic_test_predictions(predictors):\n", + " # Initialize our algorithm\n", + " alg = LogisticRegression(random_state=1)\n", + " \n", + " # Compute the accuracy score for all the cross validation folds. 
(much simpler than what we did before!)\n", + " scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"], cv=3)\n", + " \n", + " # Take the mean of the scores (because we have one for each fold)\n", + " print \"accuracy\", scores.mean()\n", + " return \n", + "\n", + "def prepare_submission_file_different_predictors(predictors, filename):\n", + " # Initialize the algorithm class\n", + " alg = LogisticRegression(random_state=1)\n", + "\n", + " # Train the algorithm using all the training data\n", + " alg.fit(titanic[predictors], titanic[\"Survived\"])\n", + "\n", + " # Make predictions using the test set.\n", + " predictions = alg.predict(titanic_test[predictors])\n", + " \n", + " # Create a new dataframe with only the columns Kaggle wants from the dataset.\n", + " submission = pandas.DataFrame({\n", + " \"PassengerId\": titanic_test[\"PassengerId\"],\n", + " \"Survived\": predictions\n", + " }) \n", + " \n", + " # Save it\n", + " submission.to_csv(filename, index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In our first attempt, predictors included all of the provided features from the kaggle dataset: \n", + "`['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']`. \n", + "\n", + "Let's see what happens when we do something super bare bones with just `Sex` and `Age`. I expect that this will be less accurate because while these features do seem important, there is probably more to the relationship between people and survival than `Sex and Age`. \n", + "\n", + "The code in the next few cells somewhat resembles one of the data mining approaches in the reading (I believe the reading mentioned computing the correlation coefficient for each of the variables). We'll see which variables work well for predictions, and then proceeding onwards based on which variables seem to be helping the accuracy score. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Sex', 'Age']\n", + "accuracy 0.786756453423\n" + ] + } + ], + "source": [ + "predictors2 = ['Sex', 'Age'] \n", + "print predictors2\n", + "predictions2 = make_titanic_test_predictions(predictors2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It turns out that using just `Sex` and `Age` gives us a score comparable to using all of the features! This definitely makes me think that some of the features in the dataset are not helpful in this logistic regression model... this is not a surprise because we know that more variables is not necessarily better with a fixed amount of data (insert reference to the curse of dimensionality concept. \n", + "\n", + "Based on contextual knowledge about the Titanic story (DataQuest mission 74 also mentions this), we know that passenger class was relevant because the first class cabins were closer to the deck of the ship. A distance advantage to safety almost certainly would impact survival rate, so let's try including `Pclass` in addition to the bare-bones model based on just `Sex` and `Age`." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# prepare_submission_file_different_predictors(predictors2, \"logistic_regression_SA.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This bare bones two-feature model also did better on the test set -- it received a score of 0.76555 (now at rank 3098; improvement compared to first submission score was 0.01435). 
" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Pclass', 'Sex', 'Age']\n", + "accuracy 0.789001122334\n" + ] + } + ], + "source": [ + "predictors3 = ['Pclass', 'Sex', 'Age']\n", + "print predictors3\n", + "predictions3 = make_titanic_test_predictions(predictors3)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# prepare_submission_file_different_predictors(predictors2, \"logistic_regression_PSA.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This three-feature model did (very slightly with an improvement of about 0.005; probably not \"significant\") better than the two-feature model on the training dataset, and it had the same performance as the two-feature model on the test dataset -- it received a score of 0.76555 (same place on the kaggle leaderboard). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### H. Other things to try (for model_iteration_2.ipynb) !\n", + "\n", + "Due to time constraints I didn't have a bunch of time to implement more ideas -- but these are some things I will explore more in future iterations and perhaps discuss in class soon:\n", + "\n", + "* Take another look at the data, see what the unique values themselves look like. For example, is there some pattern in the names of the passengers?\n", + "* Combine variables:\n", + " * In the brain dump cell earlier I mentioned combining `sex` and `age` somehow. Consider \"female child, male child, female adult, male adult, female senior, male senior\", and put these categories in one variable. Maybe this would help the curse of dimensionality problem? Or maybe it would prevent the model from learning nuances that need `sex` and `age` to be provided separately? 
\n", + "* Consider the tradeoff between doing a bunch of feature engineering myself and letting the model figure out the trends on its own. There must be a sweet spot between the data processing I do and what happens automatically in logistic regression.\n", + "* Revisit exploration.ipynb for more bottom-up data inspiration!\n", + "* Different models provided by scikit-learn (Random Forest?)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/model_iteration_2.ipynb b/model_iteration_2.ipynb new file mode 100644 index 0000000..f65b6ba --- /dev/null +++ b/model_iteration_2.ipynb @@ -0,0 +1,697 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 265, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "%matplotlib inline\n", + "\n", + "import pandas\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Beginning thoughts\n", + "\n", + "I looked through several options for model_iteration_2: DataQuest Mission 75, the blog posts of several other kaggle competition participants, and scanning the forums for ideas. Afer completing DataQuest Mission 75 in depth I think that it provides the most comprehensive starting point for model_iteration_2, and I really appreciate the opportunity to both read code and implement ideas (a rare opportunity since the titanic dataset in particular is an educational dataset?). 
Hopefully the work done in this notebook becomes a resource of examples for projects in the near future; the lambda functions passed into .apply and the regular expression usage in particular are not things that I use commonly in python. \n", + "\n", + "In this notebook I will explain and use the code that was suggested in DataQuest Mission 75, and adapt it to implement additional ideas. The markdown cells also include some reflection on the process and how I reacted to the dataquest mission. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Overall Themes of DataQuest Mission 75:\n", + "\n", + "* Use a better machine learning algorithm.\n", + "* Generate better features.\n", + "* Combine multiple machine learning algorithms." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Import the datasets" + ] + }, + { + "cell_type": "code", + "execution_count": 266, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "titanic = pandas.read_csv(\"./data/train.csv\")\n", + "titanic_test = pandas.read_csv(\"./data/test.csv\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Clean up the training data (same as what we did in model_iteration_1)" + ] + }, + { + "cell_type": "code", + "execution_count": 267, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Replace the missing age values with the median age\n", + "titanic[\"Age\"] = titanic[\"Age\"].fillna(titanic[\"Age\"].median())\n", + "\n", + "# From genders to numbers\n", + "titanic.loc[titanic[\"Sex\"] == \"male\", \"Sex\"] = 0\n", + "titanic.loc[titanic[\"Sex\"] == \"female\", \"Sex\"] = 1\n", + "\n", + "# From embarked letters to numbers\n", + "titanic[\"Embarked\"] = titanic[\"Embarked\"].fillna(\"S\")\n", + "titanic.loc[titanic[\"Embarked\"] == \"S\", \"Embarked\"] = 0\n", + "titanic.loc[titanic[\"Embarked\"] == \"C\", \"Embarked\"] = 1\n", + "titanic.loc[titanic[\"Embarked\"] == 
\"Q\", \"Embarked\"] = 2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Clean up the test data (same as what we did in model_iteration_1)" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "titanic_test = pandas.read_csv(\"./data/test.csv\")\n", + "\n", + "# Age column\n", + "titanic_test[\"Age\"] = titanic_test[\"Age\"].fillna(titanic[\"Age\"].median())\n", + "\n", + "# Sex column\n", + "titanic_test.loc[titanic_test[\"Sex\"] == \"male\", \"Sex\"] = 0\n", + "titanic_test.loc[titanic_test[\"Sex\"] == \"female\", \"Sex\"] = 1\n", + "\n", + "# Embarked column\n", + "titanic_test[\"Embarked\"] = titanic_test[\"Embarked\"].fillna(\"S\")\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"S\", \"Embarked\"] = 0\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"C\", \"Embarked\"] = 1\n", + "titanic_test.loc[titanic_test[\"Embarked\"] == \"Q\", \"Embarked\"] = 2\n", + "\n", + "# Fare column\n", + "titanic_test[\"Fare\"] = titanic_test[\"Fare\"].fillna(titanic[\"Fare\"].median())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Random forests!\n", + "\n", + "Random forests have the ability to capture many different \"layers\" of relationships between the features in our dataset. I use the word \"layers\" here to loosely describe the different branches of decision trees in the random forest. Random forests are random because each decision tree in the forest gets a random subset of the data. Taking the average of the results from the trees will then result in the model's prediction. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 269, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.801346801347\n" + ] + } + ], + "source": [ + "# Code from DataQuest mission 75\n", + "\n", + "from sklearn import cross_validation\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "\n", + "predictors = [\"Pclass\", \"Sex\", \"Age\", \"SibSp\", \"Parch\", \"Fare\", \"Embarked\"]\n", + "\n", + "# Initialize our algorithm with the default parameters\n", + "# n_estimators is the number of trees we want to make\n", + "# min_samples_split is the minimum number of rows we need to make a split\n", + "# min_samples_leaf is the minimum number of samples we can have at the place where a tree branch ends (the bottom points of the tree)\n", + "alg = RandomForestClassifier(random_state=1, n_estimators=10, min_samples_split=2, min_samples_leaf=1)\n", + "scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"])\n", + "print(scores.mean())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This first stab with random forests resulted in 80% accuracy, which is better than the linear and logistic regression in model_iteration_1. Based on what I know about these three algorithms, I can see how random forests do a better job of capturing the complicated relationships between features in this dataset -- for example, the branches can capture how the same non-sex features can impact men and women differently." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A lot of people who use random forest models in their implementation are also a fan of visualizing a tree from their random forest, such as the picture included in [triangleinequality's tutorial](https://triangleinequality.wordpress.com/2013/09/05/a-complete-guide-to-getting-0-79903-in-kaggles-titanic-competition-with-python/) and shown immediately below. \n", + "\n", + "![a tree in the random forest](https://triangleinequality.files.wordpress.com/2013/09/sample_tree1.png?w=960&h=960)\n", + "\n", + "The blue nodes are the leaves that represent whether that path of the tree resulted in survival or death. The dataquest mission also explained some of the relevant \"high-level\" parameters of the random forest model, such as the amount of splits and how many samples are needed to create a leaf (try tweaking `min_samples_split` and `min_samples_leaf`). There is probably a fair amount of iteration involved in finding the sweet spot of branches/layers of the decision tree (too many branches will result in overfitting on the training dataset, and too little branches will result in a ineffective set of decision trees). " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Making new features\n", + "\n", + "Bulletpoints of suggestions from the dataquest page:\n", + "* The length of the name -- this could pertain to how rich the \n", + "person was, and therefore their position in the Titanic.\n", + "* The total number of people in a family (SibSp + Parch).\n", + "\n", + "Emily's commentary: \n", + "* I'm not sure if the first bulletpoint would make a nontrivial improvement, since the wealth of the passengers could already be represented with the `Pclass` variable, and names have a ton of variation on their own. We can confirm this with a correlation coefficient or some other measure for relevance of a feature on the prediction. 
\n", + "* I think the second bulletpoint would result in an improvement to the model. One potential story behind the family feature is that people with families stick together and help each other escape. Another potential story is that large families may have a tough time trying to get everyone safe because of all the craziness of the event and the many people they are trying to take care of at once. It also makes more sense to put family as one feature rather than `SibSp` and `Parch` separately if they both help the model \"figure out the family situation\" -- maybe the curse of dimensionality is showing up here." + ] + }, + { + "cell_type": "code", + "execution_count": 270, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Code from DataQuest mission 75\n", + "\n", + "# Generating a familysize column\n", + "titanic[\"FamilySize\"] = titanic[\"SibSp\"] + titanic[\"Parch\"]\n", + "\n", + "# The .apply method generates a new series\n", + "titanic[\"NameLength\"] = titanic[\"Name\"].apply(lambda x: len(x))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "I usually don't use lambda functions in python, so I super appreciated the quick example above with the `.apply` method. I'll be sure to remember that when I'm working with dataframes in the future!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataquest mission used regular expressions to extract the titles from the names of the passengers -- this idea was also mentioned in the class discussion in Data Science last week. Similarly to the lambda used above, I also don't have a ton of experience with regular expressions, and appreciate this example! Definitely makes this notebook a valuable resource for future projects if I still need the examples in the near future." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 271, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Mr 517\n", + "Miss 182\n", + "Mrs 125\n", + "Master 40\n", + "Dr 7\n", + "Rev 6\n", + "Col 2\n", + "Major 2\n", + "Mlle 2\n", + "Countess 1\n", + "Ms 1\n", + "Lady 1\n", + "Jonkheer 1\n", + "Don 1\n", + "Mme 1\n", + "Capt 1\n", + "Sir 1\n", + "Name: Name, dtype: int64\n", + "1 517\n", + "2 183\n", + "3 125\n", + "4 40\n", + "5 7\n", + "6 6\n", + "7 5\n", + "10 3\n", + "8 3\n", + "9 2\n", + "Name: Name, dtype: int64\n" + ] + } + ], + "source": [ + "# Code from DataQuest mission 75\n", + "\n", + "import re\n", + "\n", + "# A function to get the title from a name.\n", + "def get_title(name):\n", + " # Use a regular expression to search for a title. Titles always consist of capital and lowercase letters, and end with a period.\n", + " title_search = re.search(' ([A-Za-z]+)\\.', name)\n", + " # If the title exists, extract and return it.\n", + " if title_search:\n", + " return title_search.group(1)\n", + " return \"\"\n", + "\n", + "# Get all the titles and print how often each one occurs.\n", + "titles = titanic[\"Name\"].apply(get_title)\n", + "print(pandas.value_counts(titles))\n", + "\n", + "# Map each title to an integer. 
Some titles are very rare, and are compressed into the same codes as other titles.\n", + "title_mapping = {\"Mr\": 1, \"Miss\": 2, \"Mrs\": 3, \"Master\": 4, \"Dr\": 5, \"Rev\": 6, \"Major\": 7, \"Col\": 7, \"Mlle\": 8, \"Mme\": 8, \"Don\": 9, \"Lady\": 10, \"Countess\": 10, \"Jonkheer\": 10, \"Sir\": 9, \"Capt\": 7, \"Ms\": 2}\n", + "for k,v in title_mapping.items():\n", + " titles[titles == k] = v\n", + "\n", + "# Verify that we converted everything.\n", + "print(pandas.value_counts(titles))\n", + "\n", + "# Add in the title column.\n", + "titanic[\"Title\"] = titles" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataquest mission dives deeper into the \"family\" storyline by making family groups. While this part of the dataquest mission didn't prompt me to write any additional code, I super appreciated the example of a working implementation to read through and understand at my own pace." + ] + }, + { + "cell_type": "code", + "execution_count": 272, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-1 800\n", + " 14 8\n", + " 149 7\n", + " 63 6\n", + " 50 6\n", + " 59 6\n", + " 17 5\n", + " 384 4\n", + " 27 4\n", + " 25 4\n", + " 162 4\n", + " 8 4\n", + " 84 4\n", + " 340 4\n", + " 43 3\n", + " 269 3\n", + " 58 3\n", + " 633 2\n", + " 167 2\n", + " 280 2\n", + " 510 2\n", + " 90 2\n", + " 83 1\n", + " 625 1\n", + " 376 1\n", + " 449 1\n", + " 498 1\n", + " 588 1\n", + "dtype: int64\n" + ] + } + ], + "source": [ + "# Code from DataQuest mission 75\n", + "\n", + "import operator\n", + "\n", + "# A dictionary mapping family name to id\n", + "family_id_mapping = {}\n", + "\n", + "# A function to get the id given a row\n", + "def get_family_id(row):\n", + " # Find the last name by splitting on a comma\n", + " last_name = row[\"Name\"].split(\",\")[0]\n", + " # Create the family id\n", + " family_id = \"{0}{1}\".format(last_name, row[\"FamilySize\"])\n", + " # 
Look up the id in the mapping\n", + " if family_id not in family_id_mapping:\n", + " if len(family_id_mapping) == 0:\n", + " current_id = 1\n", + " else:\n", + " # Get the maximum id from the mapping and add one to it if we don't have an id\n", + " current_id = (max(family_id_mapping.items(), key=operator.itemgetter(1))[1] + 1)\n", + " family_id_mapping[family_id] = current_id\n", + " return family_id_mapping[family_id]\n", + "\n", + "# Get the family ids with the apply method\n", + "family_ids = titanic.apply(get_family_id, axis=1)\n", + "\n", + "# There are a lot of family ids, so we'll compress all of the families under 3 members into one code.\n", + "family_ids[titanic[\"FamilySize\"] < 3] = -1\n", + "\n", + "# Print the count of each unique id.\n", + "print(pandas.value_counts(family_ids))\n", + "\n", + "titanic[\"FamilyId\"] = family_ids" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Finding the best features\n", + "\n", + "The mission provides an example of univariate feature selection -- figuring out which features are most relevant by calculating a \"feature score\" for each feature (column inn the dataframe)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 273, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXsAAAEpCAYAAAByeIL3AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHSNJREFUeJzt3Xu8XGV56PHf5AYhZBMCGFIFImhEURSsXCqVjRfUHowI\nAh/gaEStWg8FbweirbJ7Wrno6VGqFloUTC1FEBGBeiSRwwjeEEKACAYwmraiCSC3BLkEmPPHs8aZ\nPdmXmey93rWG9ft+PvOZtdae2evZyZpn3vWs930XSJIkSZIkSZIkSZIkSZIkqSReBKxsezwMnATM\nBZYDdwHLgDlFBShJmlxTgN8CuwCfAU7Jtp8KnFlUUJKkyXUocH22vBqYly3vnK1Lkp4Fzgc+mC0/\n2La91rEuSepTM4D7gJ2y9c7k/kDacCSpWqYl2s+bgRVEwgdYT5Rv1gHzgXs737DHHns01qxZkyg8\nSXrWuBV4RefGKYl2fixwUdv6FcDibHkxcHnnG9asWUOj0Sj0cdpppxUeQ1niKEMMZYmjDDGUJY4y\nxFCWOMoQQ6PRAHj5SEk4RbKfBbweuKxt25nAG4iul6/F3jiSlKsUyf5RYEdgQ9u2B4gvgIVEL52H\nEsTRszPOOItarZbkMTAwt+g/V9KzWKqafV968snHgUaSfW3YUBv1Z4ODg0liGEsZYoByxFGGGKAc\ncZQhBihHHGWIYSyjZ5jiNbL6U2FqtRqpkj3UKPrvldT/Im9tnttTXaCVJBXIZC9JFWCyl6QKMNlL\nUgWY7CWpAkz2klQBJntJqgCTvSRVgMlekirAZC9JFWCyl6QKMNlLUgWY7CWpAkz2klQBJntJqgCT\nvSRVgMlekirAZC9JFWCyl6QKMNlLUgWY7CWpAlIk+znApcDPgTuA/YG5wHLgLmBZ9hpJUk5SJPuz\nge8ALwb2BlYDS4hkvxC4JluXJOWklvPv3w5YCezesX01cDCwHtgZqAN7drym0Wg0cg5vbLVaDUgV\nQ42i/15J/S/y1ua5Pe+W/fOB+4ALgJuB84BZwDwi0ZM9z8s5DkmqtGkJfv++wInAjcDn2bxk02CU\n5vPQ0NAflgcHBxkcHMwjRknqW/V6nXq9Pu7r8i7j7Az8mGjhAxwEfJwo6xwCrAPmA9diGccyjqQJ\nK6qMsw74L+JCLMDrgduBK4HF2bbFwOU5xyFJlZZ3yx7g5cCXgRnAGuAEYCpwCbArsBY4Gnio4322\n7CWpR6O17FMk+y1lspekHhVVxpEklYDJXpIqwGQvSRVgspekCjDZS1IFmOwlqQJM9pJUASZ7SaoA\nk70kVYDJXpIqwGQvSRVgspekCjDZS1IFmOwlqQJM9pJUASZ7SaoAk70kVYDJXpIqwGQvSRVgspek\nCjDZS1IFmOwlqQJM9pJUAdMS7GMt8AjwNLAJ2A+YC1wM7Jb9/GjgoQSxSFIlpWjZN4BBYB8i0QMs\nAZYDC4FrsnVJUk5SlXFqHeuLgKXZ8lLg8ERxSFIlpWrZfw+4CfjzbNs8YH22vD5blyTlJEXN/tXA\nb4GdiNLN6o6fN7LHZoaGhv6wPDg4yODgYC4BSlK/qtfr1Ov1cV/XWV7J22nARqKFPwisA+YD1wJ7\ndry20WiM+B2QTK1WY5TvoTz2RtF/r6T+F3lr89yedxlnG2B2tjwLOBRYBVwBLM62LwYuzzkOSaq0\nvFv2zwe+lS1PAy4EziC6Xl4C7MroXS9t2UtSj0Zr2acu4/TCZC9JPSqqjCNJKgGTvSRVgMlekirA\nZC9JFWCyl6QKMNlLUgWY7CWpAkz2klQBJntJqoBukv3RwEC2/Eli+oN
9c4tIkjTpukn2nyRuK3gQ\n8DrgK8A5eQYlSZpc3ST7p7Pnw4DzgKuAGblFJEmadN0k+3uAfwaOAf4d2LrL90mSSqKbWS9nAW8k\n5qG/m7jZyMuAZTnGBc56KUk9m8isl48C9xE1e4CngF9MWmSSpNx107IfAl4JvAhYCDyXuPHIq/ML\nC7BlL0k9m0jL/m3AW4kWPkQNf/boL5cklU03yf4J4Jm29Vk5xSJJykk3yf4bwD8Bc4D3AdcAX84z\nKEnS5BqvZl8DdgH2BA7Ntl0NLM8zqIw1e0nq0ZbecLxGdLl8aQ4xjcdkL0k92tILtA1gBbBfDjFJ\nkhLppuvlncALgP+g1SOnAeydV1DNfRTd0rVlL6nfjNayn9bFe9+YPTczUTdfEO2mAjcBvwbeAswF\nLgZ2A9YSs2o+1OPvlCT1oJveOGuJnjiLiGS9XbatWycDd9D6slhCXOBdSPTsWdLD75IkbYFukv3J\nwL8COwHzsuWTuvz9zwP+jOiq2TwjWAQszZaXAod3G6wkact0U5JZBRxAq14/C/gJMRnaeL4BnE7c\n/ORjxJnBg8D2bft/oG29nTV7SerRRGr2MHwE7TOjvmq4w4B7gZXA4CivaTBGNh0aGvrD8uDgIIOD\no/0aSaqmer1OvV4f93XdtOw/ArwLuCx7/eHAV4HPjfO+04F3ELNkbk207i8DXkUk/3XEdMnXEoO2\nOtmyl6QebemgqqZXElMcN4DridZ6Lw6mVcb5DPA74Czi4uwcRr5Ia7KXpB5NpIxzANGbZkW2PgDs\nD9zQYwzNTHYmMUXye2h1vZQk5aiblv0twD60knWz3/w+eQWVsWUvST2ayHz2MDzjPU0kfElSn+gm\n2f+K6Fc/HZhB9Lv/ZZ5BSZImVzfJ/gPELQjvIaY8OICY116S1Cd6necmJWv2ktSjidTsP0v0wJlO\nzGVzP9F/XpLUJ7pJ9ocCjxAjYtcCewD/M8eYJEmTrJtk3+yLfxhwKfAw6WobkqRJ0M2gqiuB1cDj\nwF8Az8mWJUl9otsLtDsQNxh5mpj1cjYxt02evEArST2a6Nw4RTDZS1KPJjqCVpLUx0z2klQB3ST7\nKUS/+k9l67sC++UWkSRp0nVTsz+XuDvVa4mbjMwFlgF/nGNcYM1ekno2kfns9yemM27esOQBYjSt\nJKlPdFPGeZLhUxrvRPf3oZUklUA3yf4LwLeIwVSnAz8EzsgzKEnS5BqvZj8FOJAo3bwu23YN8PM8\ng8pYs5ekHk1kUNUtwCsmO6AumOwlqUcTGVT1PeDtI71ZktQfukngG4FtiHlxmhOgNYg57vNky16S\nejSRrpfbTno0kqSkukn2rxll+3WTGYgkKT/dlHGuolXL2JqYKmEFMaJ2LFsD3we2AmYA3wY+TozA\nvRjYjbjz1dHE9MmdLONIUo8mc4rjXYCzgSO6eO02wO+JM4gfAB8DFhH3sf0McCqwPbBkhPea7CWp\nR5M5xfGvgRd3+drfZ88ziFG4DxLJfmm2fSlw+BbEIEnqQTc1+y+0LU8h+tyv6PL3TwFuJm5Sfg5w\nOzAPWJ/9fH22LknKUTfJ/qa25aeAfyOmTOjGM8SXw3bA1cAhHT9vMEadZGho6A/Lg4ODDA4Odrlb\nSaqGer1OvV4f93Xd1Ow/BHy+Y9vJRN2+F58EHgPeCwwS97CdD1xLTJ3cyZq9JPVoIjX7xSNsO6GL\n9+0IzMmWZwJvIKZJvqLtdy4GLu/id0mSJmCsMs6xwHHA84Er27bPBn7Xxe+eT1yAnZI9vkZMorYS\nuAR4D62ul5KkHI1VxtmNSPRnEl0km6/dANxK1O/zZBlHkno0mf3sUzHZS1KPJlKzPxC4kZgQbRPR\nw+aRyQxOkpSvbpL9F4na/d3EFAjvAf4xz6AkSZOr2xG0dxMjYJ8GLgDelFtEkqRJ182gqkeJycxu\nJeazWUe5a/2SpA7dtOzfmb3uRGK
um+cBR+YZlCT1YmBgLrVaLffHwMDcov/ULdZtC30bYrbLO3OM\npZO9cSR1Jd1ntfyf04n0xllEDIS6OlvfhxgFK0nqE90k+yFgf2J6YojEv3teAUmSJl83yX4Tm99J\n6pkcYpEk5aSbZH87cDzRc+eFxPz2P8ozKEnS5Oom2Z8I7AU8AVxEjJ79UJ5BSZIm11i9cb4GvIOR\n57NPIdkl79mzt+eRRx7YbLu9caT+YG+cli2ZCO0O4PXAd4mbjXTaPDtOrkbRidZkL/UHk33LaMl+\nrBG05xLzz+/O5vecbWCPHEnqG90MqjoX+EDegYzAlr2krtiyb+nL+eyLTrQme6k/mOxbJjKCVpLU\n50z2klQBJntJqgCTvSRVgMlekirAZC9JFZB3st8FuJaYTO1nwEnZ9rnAcuAuYBkwJ+c4JKnS8u5n\nv3P2uAXYlhiJezhwAnA/cU/bU4HtgSUd77WfvaSu2M++pah+9uuIRA+wEfg58Fzi7ldLs+1LiS8A\nSVJOUtbsFxC3NLwBmAesz7avz9YlSTkZayK0ybQt8E3gZGBDx88ajHr+NdS2PMjIk29KUnXV63Xq\n9fq4r0sxN8504Crg/9KaF381kbnXAfOJi7h7drzPmr2krlizbymqZl8DvkLMjd9+A5QrgMXZ8mLg\n8pzjkKRKy7tlfxBwHXAbra/djwM/BS4BdgXWAkez+U3NbdlL6oot+xanOB6TyV7qZyb7Fqc4lqQK\nM9lLUgWY7CWpAkz2klQBJntJqgCTvSRVgMlekirAZC9JFWCyl6QKMNlLUgWY7CWpAkz2klQBJntJ\nqgCTvdSjgYG51Gq1JI+BgblF/7l6lnCKY8ApjtULj4vycYrjFqc4lqQKM9lLUgWY7CWpAkz2klQB\nJntJqgCTvSRVgMlekirAZC9JFZB3sj8fWA+sats2F1gO3AUsA+bkHIMkVV7eyf4C4E0d25YQyX4h\ncE22LknKUd7J/nrgwY5ti4Cl2fJS4PCcY5CkyiuiZj+PKO2QPc8rIAZJqpRpBe+/wZizFw21LQ9m\nD0lSU71ep16vj/u6FLNeLgCuBF6Wra8msvY6YD5wLbDnCO9z1kuVksdF+TjrZUuZZr28AlicLS8G\nLi8gBkmqlLxb9hcBBwM7EvX5TwHfBi4BdgXWAkcDD43wXlv2KiWPi/KxZd8yWsvem5cAJnv1wuOi\nfEz2LWUq40iSEjPZS1IFmOwlqQJM9pJUASZ7SaoAk70kVYDJXpIqwGQvSRVgspekCjDZS1IFmOwl\nqQJM9pJUASZ7SaoAk70kVYDJXpIqwGQvSRVgspekCjDZS1IFmOylPjUwMJdarZb7Y2BgbtF/qiaB\n96AFvAetelGW48L7rrb4b9HiPWglqcJM9uorli5UVqmOzS09Poss47wJ+DwwFfgycFbHzy3jaDNl\nOF0vy3FRhn+LsijDv0W5jovylHGmAl8kEv5LgGOBFxcUS+lts83swlsM9Xo97R+tvuBx0T+KSvb7\nAb8A1gKbgK8Dby0oltJ77LGNRIsh/8eGDQ+OGIMfao3E46J/FJXsnwv8V9v6r7NtKqkzzjir8LML\nSVuuqGRf7gKgNvPkk49T9NmFysdGQP+YVtB+7wF2aVvfhWjdt1sDtT1SBZRd1BjpJ6lCGCOGssRR\nhhjSxVGGGMoSx9gxpLFhw4N98G9RiuPi1mRBdGEasAZYAMwAbsELtJL0rPRm4E7iQu3HC45FkiRJ\nkqTq2aboANSb4q+6tLyAuEj7OHAI8DLgX4CHigyq4uYTYyKeAW4E1hUQw9bAkcT1nWaHggbwvxLH\n8afEMXoBsBOwLfCrRPs+kvibRxuieVmiOAD+hBjxPpvoWPEK4H3ABxPGAPAi4B+BnYG9gL2BRcDf\nJdj3F9qWm/8v7esnJYihZ2WaG+ebwFPEB+qfiAPp3xLH8J6O9WnAUOIYIA7grwDfzdZfwuax5e29\
nwA3AEcDbs+XUMQB8m/gQbwI2Zo9HE8cwBJxC69rSDOBfE+7/Ldnj3cRxcXz2+HK2LaXPEyPf78/W\nbwEOThwDwHnAJ4Ans/VVxEj8FFZkj62AfYG7gLuJL74ZiWLoayuz51OAv+zYlspFwHeAPwJeSrRm\n/z5xDBBJ/hjgtmx9OvCzxDHcBezQtr5Dti211H/3SG4lGkbtx+Nto7w2T8uJs62m+cCyxDH8NHtu\n/7cooqvfTSPEcUviGG4gPptN07NtpVSmlv2TwHHAO4GriFOj6WO+Y/IdS5SObgP+Hfgw8NHEMQDs\nCFwMPJ2tbyLOelK6n2hFN22k1ZpL6UfEKXqRniBKWU2zCopjF4aX0tYDuyaO4T+BV2fLM4CPAT9P\nHAPAfUQVoOntwG8TxzAHGGhbn51tK6WiBlWN5N3A+4FPE7XQ5wNfSxzDQqLedhnR7/+/Ey2H1GWD\njQxvVR8APJw4hjXAT4gyCsTcRbcRX34N4P/kvP9V2fNU4ATimHgi29Yg7RfAN4jS4hyiPv1uooSS\n2veAq4nyZo04+1ueOIa/AM4mpje5hziz+B+JYwA4EfhnYE/gN8TxcXziGM4Ebgbq2frBFFP27UqZ\nLtC2mws8j/SnyquJg+h7xFnPh4k69UsSx/FK4iLQXsDtxAXBt5P2dHkoe25eEOy8OPg3Oe9/wTg/\nX5vz/ptqRIt6T+DQbNvVpE+yzVjeRlwsBrgO+FbiGHZh+LxWENeYirh4D3GWNQXYUND+5wP7E5+N\nGyju32FcZUr23ycuQk0jLn7cB/yQSLipbMfmLeiFFFOrnk70OIAYfLapgBia5hK9op4Z74U5OAC4\nA3gkWx8gzrpS1UZrxFnGSxPtbzwLgBcSXzbbEGc+KRPdU8ClxNnN77NtK4F9Eu2/vaza3vhoNkby\nPuOEaIx17rs9npsTxNCzMpVxtiM+0O8l6uan0TqVT2UmcbA8l9Zc+weSPtk3u9o1LSS+hFYB9+a8\n79OAS4g67FbExeKXEx/y40nfoj2X6PHQ9Gi2LVVyaRCNj/1oXZwsyvuAPye+fPcgzn7PAV6XMIZV\nwPVEQ+woYgR8SrMpfiLFvx8nhkNSBdKvVtHqXbBfti11GacMvWAgLg4/QHRH/SbwOyLJ/oK4gJ2n\nO2i1VN5H1COnEq3pG3Pe90hG6mGR+ri4k7hY/kviOF1VQAwQZbytGN4DJXWDqLnvVxMNgreQvtcc\nwEFdblMJHUV8gM7J1vcgEl1KZejOBfGFN69tfV62bQeihp+n9r/9MuADo/wslW8RF82nE70/TgYu\nTxzDglEeqXV2e5xG+i+d9mNgPvAD4LHEMXTG0ZS6fLKCuDi9feL9ahLUiYTaPJAOIK4lpNbZla3W\nti3vhPsTYvTyTsTZxe5tP7sz532P5DlEN9R7s8dF2bYiPIfo6th8pPZZ4K+I/4c3EF+En04cw/yO\n9WnAaxLu/0Cibv9r4CPZ8keJDgWp+/u/EDidOOO+GHgj5boOOkyZavYzafV8mZlta5B2hOBHgSuJ\nBPcjor/7UQn333QtUcq5hDh4jiS+iGaR//QRHyIuwO0EfI4oXQD8N9K3nKZlMRyTeL+dFhF12j8i\nvnB2I75890ocx6nENa1VRDfl75CuC+g7iK7Qx43wswbRMyiFGUTdfmr23PQI0WMtpbuJUbx/DRwG\nnE90Yjif6J76QOJ4+salwN8SyWUxUaP+h0T73o9Wi2U6cWr2/4AvERfDUptCHLifI4anf4qYB6SK\nfkDUqYt0G/HF3zyrOoT4QKfWOR/QVNJNKfL+7HmIuIjffDTXU9utgH2O5OXEZ/ROIl8dQAw0K6L8\n2zea/zjtF0dTda9bSSupv4YYiXckManSpYli6LQvcdr+H0Sr/i/HfPXk25Ho67+SaNGfzfCBXql8\njbgw/Elap+wfSRzDiuz5ViLBQjEXaL9Ka36erYgBb0MFxFGkL
2bPV47wuCJxLCuIRuFxxIR97VKP\nfxhXmco4zQmNHiZqxuuIUkIKU2idch1DjJZs9oRJWQd8ETFlwzHEOINvEGWcwYQxNH2duF5xRBbD\ncURd8vWJ41iTPaYQM02ONvNjnh4kSgbXAxcSpZyNY74jH+/O9v8J4uziO8TZXwrNnll3Ef8H5xMN\norXAu0hX4ltMDHwcac6q1MfFUbTKnJ3eljKQftPsP3wwMfT5Pob3BMnTz2jNw3Mnw2fxy7v3S7tn\niNZJ+8W/VNPodhqpy2nqbn5Fa/4/zCJa9NOJxHYSac9yXkmc6e1LjNa8hSjrNbelcDutz8hxRHLf\ngfjyvz5RDFBMj7BOH2X4WWbneimVqWV/Xvb8fWJenJQuyvZ7PzEqsHnwvpC08+kfQbTsryP6/Ddb\n9kVYlsVycbZ+FOlnWIToAXMKm1+4f22CfX+bGLz1KHGWdyRRSkmtcxDPQ8S4h2brNsUgnk20RnEf\nRgx8/B0xtchnE+y/aScioY70uUg1gna0gV1FnHV2rQzdhEaaVbL9Rg0p/vMgunTtTCS05sRnC4nS\nQepeKNsSE48dS3yQ/4WoAaZIthtpHbCzaE2RMIX4d5k90ptytJz4wvkYcZHwXcRZ3ykJ9t0+DUDK\nKQFGMpW4aH/xeC/Myc1Ekn+AuI70Olpnf6uJuYNS+C0xgno0ec/Z1LfK0LIvw/BngB+PsK2IOXEg\nEu6F2WMu8SFfQppkv22CffRiB6J74UnE2df3aQ1+q5KniS+4opL9p4gL5dOIUmMz0Q8S11RSWUfx\nCf1U4CyG37GqqbR3qipDy17lsifRUhutFpz6LOcnRHe2ZUTXtt8Q5a09Euz7aVqTfc1k+EjRBsPn\nMk/hTKLUeDHDp91O1Z97OtEYeLBt2ywij6S6YF30GRbEFBFXEmeZnRrA0qTR9KGlDJ/4f3uK6ctc\ndc1rJ3VicFfnI7XDiOPiZVlMNxODnKpoLXHBvvORUtFTBBTR/VeTbKRBCA5MSK99gBlEV7criFPW\nlB+0mcT01l8iavVlKDlq+BQBX6fkUwTk7FXEtbSVFDtBXt+5leGjVedSva5+ZVCWAWaXEDf1fj/R\nK+bshPsus5cCRxOznzYfRZhCnGHdQ9zM5G8oZrR5ke4i/g12p9gJ8vrOO4k+7n9LJJY7Ke5ArrL2\nQWRfYvgIzZQDzNq/6KdRjv7VRRsiSmn3AhcQFyuLGOHtFAHhh0UH0M/2IqYFOJH0twJUKMsAs87k\nbrKP/5uptL505xH93FPqqykCcnYo8BWii/SR2eOIQiMaQxnqoDOJkbIvIOpd51LsLfiqriwDzPZm\n+O32ZratF9ETpgweI3oIPUXc2e1e4p6wKTlFQMtiYoqTaQy/ZedlxYQztjIk+6XEvDg/AP6MaNGf\nXGhE1fZpouXWHGDWPIhrpJ2Mber4L6mcG4leMOcRYw0eJabiTqHz3q+1jvVUgx/L5I+JrsplGCc0\nrjJcRV9FdKuD+PK5keL70Uplt4A4u0nV+2OIsacIKHqgUxEuAP43acubW6wMLfunRlmWNFyNqAkf\nRCTY60mb7DXcgcRF6V8BT2TbGkQJsnTK0LJvH6UIw0cqVrU2K43kHGLk8EXEZ/doon7+wQT77ssp\nAnK2YJTtaxPG0LUytOytzUrdOYS4ptW8jvJV4I5E+27uZ8UIP+uLmnUO1mbPz2HznkmStMWuYnhr\nckG2TcVYRNyH9lGilPMMJa7fl6FlL2lsV2bPs4kbnf+UaE3vR3RoSOlVxJ2yFtDKH6WtU+fs74i6\n/XKiU8khxI3ZS8lkL5XfSLfga0pdQrmQGC37M4b3La+iTcR4lClEOfpaSjyth8leKr96x/oAxX12\n7yP9jb3Lqiz3Ju5KGXrjSOrO+4n+7E/QalU3iIm4UjkUOIaYpuHJthhKOWo0J7sC/0nM5f840bI/\nnvgSvpC4XWPpmOyl/vELY
tKx+wuM4UJiioDbGV7GOaGYcArRfgOV5r2JS88yjtQ/fsnwu2UVoa+m\nCEgg5VnVhJjspf6xhLhX8o8ZXkJJOaDpR0Rf/9J2MdTILONI/eMm4DpiPqlnaM1Lk/Kep6uJUbx9\nMUVATsp2b+KumOyl/lGGm20vGGX72oQxaAs4VYHUP3Yj7vvwG2AG0arsbFnm7aHsMYPIH43s8XDC\nGCTpWW0tUT7pfKTUV1MESJK2zG3AjrRuE3kIcH5x4UjSs8cpbctHdfzs9JSB0Jr18lZaZeBUc+pr\nAqYUHYCkcR3btvyJjp+9OWUgbD5FwD9Q4ikC1GKyl9SNXbPntxLdDj8MfJcY1fuWooKSpGeTlaMs\nj7SeIoZvJtqnJpEjaKXy2xvYkC3PbFturqfWN1MEqMVkL5Wf42E0YY6gldSNvpwiQJIkSZIkSZIk\nSZIkSZIm3/8H8RXc2zrUvDcAAAAASUVORK5CYII=\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mean of the random forest scores 0.811447811448\n" + ] + } + ], + "source": [ + "# Code from DataQuest mission 75\n", + "\n", + "import numpy as np\n", + "from sklearn.feature_selection import SelectKBest, f_classif\n", + "\n", + "predictors = [\"Pclass\", \"Sex\", \"Age\", \"SibSp\", \"Parch\", \"Fare\", \"Embarked\", \"FamilySize\", \"Title\", \"FamilyId\"]\n", + "\n", + "# Perform feature selection\n", + "selector = SelectKBest(f_classif, k=5)\n", + "selector.fit(titanic[predictors], titanic[\"Survived\"])\n", + "\n", + "# Get the raw p-values for each feature, and transform from p-values into scores\n", + "scores = -np.log10(selector.pvalues_)\n", + "\n", + "# Plot the scores. 
See how \"Pclass\", \"Sex\", \"Title\", and \"Fare\" are the best?\n", + "plt.bar(range(len(predictors)), scores)\n", + "plt.xticks(range(len(predictors)), predictors, rotation='vertical')\n", + "plt.ylabel(\"feature scores\")\n", + "\n", + "plt.show()\n", + "\n", + "# Pick only the four best features.\n", + "predictors = [\"Pclass\", \"Sex\", \"Fare\", \"Title\"]\n", + "\n", + "alg = RandomForestClassifier(random_state=1, n_estimators=150, min_samples_split=8, min_samples_leaf=4)\n", + "scores = cross_validation.cross_val_score(alg, titanic[predictors], titanic[\"Survived\"], cv=3)\n", + "print \"mean of the random forest scores\", scores.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### gradient boosting and ensembling!\n", + "\n", + "I had heard of gradient boosting in passing from ML enthusiast friends, but I hadn't tried it in person... at a high level, the dataquest page says that the errors from one tree will help the next tree learn the dataset more effectively. There are also some suggested parameters to prevennt overftting: limiting the tree count and tree depth. \n", + "\n", + "The dataquest then describes ensembling -- making predictions based on several different models and averaging their results to make a final decision on what the prediction is. My reaction: ensembling seems super useful! It sounds like ensembling presents the opportunity and challenge of balancing the strengths and weaknesses of many different models; it's another layer of algorithm design and ensemble parameter tweaking." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 274, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.819304152637\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py:40: FutureWarning: in the future, boolean array-likes will be handled as a boolean array index\n" + ] + } + ], + "source": [ + "# Code from DataQuest Mission 75\n", + "\n", + "from sklearn.ensemble import GradientBoostingClassifier\n", + "from sklearn.linear_model import LogisticRegression\n", + "from sklearn.cross_validation import KFold\n", + "\n", + "# The algorithms we want to ensemble.\n", + "# We're using the more linear predictors for the logistic regression, and everything with the gradient boosting classifier.\n", + "algorithms = [\n", + " [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), [\"Pclass\", \"Sex\", \"Age\", \"Fare\", \"Embarked\", \"FamilySize\", \"Title\", \"FamilyId\"]],\n", + " [LogisticRegression(random_state=1), [\"Pclass\", \"Sex\", \"Fare\", \"FamilySize\", \"Title\", \"Age\", \"Embarked\"]]\n", + "]\n", + "\n", + "# Initialize the cross validation folds\n", + "kf = KFold(titanic.shape[0], n_folds=3, random_state=1)\n", + "\n", + "predictions = []\n", + "for train, test in kf:\n", + " train_target = titanic[\"Survived\"].iloc[train]\n", + " full_test_predictions = []\n", + " # Make predictions for each algorithm on each fold\n", + " for alg, predictors in algorithms:\n", + " # Fit the algorithm on the training data.\n", + " alg.fit(titanic[predictors].iloc[train,:], train_target)\n", + " # Select and predict on the test fold. 
\n", + " # The .astype(float) is necessary to convert the dataframe to all floats and avoid an sklearn error.\n", + " test_predictions = alg.predict_proba(titanic[predictors].iloc[test,:].astype(float))[:,1]\n", + " full_test_predictions.append(test_predictions)\n", + " # Use a simple ensembling scheme -- just average the predictions to get the final classification.\n", + " test_predictions = (full_test_predictions[0] + full_test_predictions[1]) / 2\n", + " # Any value over .5 is assumed to be a 1 prediction, and below .5 is a 0 prediction.\n", + " test_predictions[test_predictions <= .5] = 0\n", + " test_predictions[test_predictions > .5] = 1\n", + " predictions.append(test_predictions)\n", + "\n", + "# Put all the predictions together into one array.\n", + "predictions = np.concatenate(predictions, axis=0)\n", + "\n", + "# Compute accuracy by comparing to the training data.\n", + "accuracy = sum(predictions[predictions == titanic[\"Survived\"]]) / len(predictions)\n", + "print(accuracy)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Preparing and predicting on the test set" + ] + }, + { + "cell_type": "code", + "execution_count": 275, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Code from DataQuest Mission 75\n", + "\n", + "# First, we'll add titles to the test set.\n", + "titles = titanic_test[\"Name\"].apply(get_title)\n", + "# We're adding the Dona title to the mapping, because it's in the test set, but not the training set\n", + "title_mapping = {\"Mr\": 1, \"Miss\": 2, \"Mrs\": 3, \"Master\": 4, \"Dr\": 5, \"Rev\": 6, \"Major\": 7, \"Col\": 7, \"Mlle\": 8, \"Mme\": 8, \"Don\": 9, \"Lady\": 10, \"Countess\": 10, \"Jonkheer\": 10, \"Sir\": 9, \"Capt\": 7, \"Ms\": 2, \"Dona\": 10}\n", + "for k,v in title_mapping.items():\n", + " titles[titles == k] = v\n", + "titanic_test[\"Title\"] = titles\n", + "\n", + "# Check the counts of each unique title.\n", + "# 
print(pandas.value_counts(titanic_test[\"Title\"]))\n", + "\n", + "# Now, we add the family size column.\n", + "titanic_test[\"FamilySize\"] = titanic_test[\"SibSp\"] + titanic_test[\"Parch\"]\n", + "\n", + "# Now we can add family ids.\n", + "# We'll use the same ids that we did earlier.\n", + "# print(family_id_mapping)\n", + "\n", + "family_ids = titanic_test.apply(get_family_id, axis=1)\n", + "family_ids[titanic_test[\"FamilySize\"] < 3] = -1\n", + "titanic_test[\"FamilyId\"] = family_ids\n", + "\n", + "titanic_test[\"NameLength\"] = titanic_test[\"Name\"].apply(lambda x: len(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 276, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# Code from DataQuest Mission 75\n", + "\n", + "predictors = [\"Pclass\", \"Sex\", \"Age\", \"Fare\", \"Embarked\", \"FamilySize\", \"Title\", \"FamilyId\"]\n", + "\n", + "algorithms = [\n", + " [GradientBoostingClassifier(random_state=1, n_estimators=25, max_depth=3), predictors],\n", + " [LogisticRegression(random_state=1), [\"Pclass\", \"Sex\", \"Fare\", \"FamilySize\", \"Title\", \"Age\", \"Embarked\"]]\n", + "]\n", + "\n", + "full_predictions = []\n", + "for alg, predictors in algorithms:\n", + " # Fit the algorithm using the full training data.\n", + " alg.fit(titanic[predictors], titanic[\"Survived\"])\n", + " # Predict using the test dataset. 
We have to convert all the columns to floats to avoid an error.\n", + " predictions = alg.predict_proba(titanic_test[predictors].astype(float))[:,1]\n", + " full_predictions.append(predictions)\n", + "\n", + "# The gradient boosting classifier generates better predictions, so we weight it higher.\n", + "predictions = (full_predictions[0] * 3 + full_predictions[1]) / 4\n", + "\n", + "# turning the predictions into 0s and 1s\n", + "for i in range(predictions.shape[0]):\n", + " predictions[i] = (predictions[i] >= 0.5)\n", + "\n", + "predictions = predictions.astype(int)\n", + "\n", + "# Create a new dataframe with only the columns Kaggle wants from the dataset.\n", + "submission = pandas.DataFrame({\n", + " \"PassengerId\": titanic_test[\"PassengerId\"],\n", + " \"Survived\": predictions\n", + " }) " + ] + }, + { + "cell_type": "code", + "execution_count": 277, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Save it\n", + "submission.to_csv(\"dataquest75.csv\", index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### How the model performed on kaggle\n", + "\n", + "The model had a score of 0.79904 and is at rank 1003 on the kaggle leaderboards. It is reassuring that the changes explored in the dataquest mission resulted in a nontrivial improvement on test performance over the logistic regression models from model_iteration_1! I have a feeling that Data Science projects are neverending -- there's always some room for improvement or new combination of ideas to try and see if those new ideas work just as well or better than the current iteration." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Ideas for future work\n", + "\n", + "If I had more time...\n", + "\n", + "* Getting more familiar with scikit learn and the many algorithms it has -- ensemble lots of things!\n", + "* Trying different ensembling techniques:\n", + " * instead of averaging, maybe voting (going with what the majority of the models say),\n", + " * the [wikipedia page on Ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) and the [scikit learn page on Ensemble methods](http://scikit-learn.org/stable/modules/ensemble.html) -- maybe trying things like AdaBoost, the Bayes optimal classifier, etc. and plotting their performance vs. ensemble learning method\n", + "* Following advice from this scikit learn cheat-sheet on the [Choosing the right estimator](http://scikit-learn.org/stable/tutorial/machine_learning_map/) page (the image on the webpage also includes links for the boxes in this figure):\n", + "![scikit-learn algorithm cheat-sheet](http://scikit-learn.org/stable/_static/ml_map.png)\n", + "* Reading more about the history of the Titanic and getting more inspiration from the details of the event (gathering more domain-specific knowledge than I currently have about the Titanic) \n", + "* Trying different numbers of folds for the experiments and plotting the model performance vs. number of folds. There must be a sweet spot between too many folds (too little training data for each sub-trial of the model to do well...) and too little folds." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Ending thoughts\n", + "\n", + "In regards to time management... Week 2 of the semester definitely was a \"recalibration\" week for me -- I've found that I'm most productive in Data Science when I have at least 1.5 hours to work. 
Here's a quick bulletpoint list of the proces:\n", + "\n", + "* gathering resources and figuring out which tools to try using first, \n", + "* reading, \n", + "* implementing code, \n", + "* jotting down commentary in real time with the markdown cells\n", + "* debugging code, \n", + "* learning about functionality of the tools (or perhaps investigating other tools)\n", + "* getting creative with ideas on how to improve the model, \n", + "* implementing those creative ideas and debugging,\n", + "* continued...\n", + "\n", + "There are so many steps that need to happen before one can easily do creative changes to improve your model! Definitely worth thinking about this process with the future steps in mind. e.g. Once the first pass at implementating the data cleanup, feature engineering, and training the model is done, make functions that can be used for speedy feature engineering and testing in the creative phase. Also, take advantage of how easy it is to swap scikit-learn functions. \n", + "\n", + "At the beginning of this warmup project I felt like each step could be a timesink and was overwhelmed by the amount of work goes into each notebook. I also felt like I could have done more at any step in time and tried to express these ideas in the markdown cells. I've done this process before in the machine learning co-curricular but that was with a smaller time commitment and longer timespan (1 credit for DoML vs. 4 credits for Data Science). In the future I'll do a better job of breaking down tasks into smaller chunks that I can do in my relatively frequent small chunks of free time (often I have 30-60 minutes free during the workday hours) and leaving notes to myself about where I left off. In retrospect this seems like common sense with Olin workload time management but this warmup project was a helpful \"recalibration\" phase for me before the next projects start. 
:) \n", + "\n", + "I definitely value the opportunity for creativity in the \"improving your model\" phase -- next time I will try some creative exercises to generate a large quantity of ideas, and how visualizations or results will either confirm those ideas or lead to new questions. In this project I didn't devote a lot of time to do divergent thinking exercises but will try refining the creative process in Data Science in the next projects." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}