Data-IA-exercices-/app.py at master · lydiahiba/Data-IA-exercices- · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import streamlit as st
import pandas as pd
import numpy as np
import glob
import re
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

st.title('Titanic survival')


@st.cache
def load_data():
    import pandas as pd
    import glob

    csv1= pd.read_csv('https://github.com/lydiahiba/Data-IA-exercices-/raw/master/test.csv', index_col=None, header=0)
    csv2=pd.read_csv('https://github.com/lydiahiba/Data-IA-exercices-/raw/master/train.csv', index_col=None, header=0)

    titanic = pd.concat([csv1,csv2], axis=0, ignore_index=True)
    return titanic

# Create a text element and let the reader know the data is loading.
data_load_state = st.text('Loading data...')
# Load 10,000 rows of data into the dataframe.
data = load_data()
# Notify the reader that the data was successfully loaded.
data_load_state.text("Done! (using st.cache)")


def get_title(name):
    title_search = re.search(' ([A-Za-z]+)\.', name)
    # If the title exists, extract and return it.
    if title_search:
        return title_search.group(1) ## to extract only the subgroup from the matching which is the mrs et miss ( to se the diference try without)
    return ""

def Preprocessing_data():

    csv1= pd.read_csv('https://github.com/lydiahiba/Data-IA-exercices-/raw/master/test.csv', index_col=None, header=0)
    csv2=pd.read_csv('https://github.com/lydiahiba/Data-IA-exercices-/raw/master/train.csv', index_col=None, header=0)

    titanic = pd.concat([csv1,csv2], axis=0, ignore_index=True)

    titanic['Sex']=titanic['Sex'].astype('category')

    titanic['Name']=titanic['Name'].astype('category')

    titanic['Embarked']=titanic['Embarked'].astype('category')
    titanic['Has_Cabin']=titanic.Cabin.apply(lambda x: 0 if pd.isnull(x) else 1)
    #drop the nan from age
    age_serie= titanic[['Age']]
    survived_serie=titanic.Survived
    imputer = KNNImputer()
    age_serie = imputer.fit_transform(age_serie)
    titanic['Age']=age_serie
    # convert age to categorical
    titanic['Age'] = titanic['Age'].astype(int)
    titanic['CategoricalAge'] = pd.cut(titanic['Age'], 5,labels=False)
    # Création une variable Name_length qui contient la longueur de la variable Name , askip plus le noms est long plus la perssone fais partie de la haute société et donc a plus de chance de survie
    titanic['Name_length']=titanic['Name'].apply(len)
    # create a family column
    titanic['FamilySize']=titanic.SibSp +titanic.Parch+1

    # Create new feature IsAlone from FamilySize
    titanic['IsAlone'] = 0
    titanic.loc[titanic['FamilySize'] == 1, 'IsAlone'] = 1

    # Remove all NULLS in the Embarked column
    titanic['Embarked'] = titanic['Embarked'].fillna('S')

    # Remove all NULLS in the Fare column and create a new feature CategoricalFare
    titanic['Fare'] = titanic['Fare'].fillna(titanic['Fare'].median())
    titanic['CategoricalFare'] = pd.qcut(titanic['Fare'], 4,labels=False)
    # Define function to extract titles from passenger names


    # Group all non-common titles into one single grouping "Rare"
    # Create a new feature Title, containing the titles of passenger names
    titanic['Title'] = titanic['Name'].apply(get_title)

    titanic['Title'] = titanic['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')
    titanic['Title'].unique()

    titanic['Title'] = titanic['Title'].replace('Mlle', 'Miss')
    titanic['Title'] = titanic['Title'].replace('Ms', 'Miss')
    titanic['Title'] = titanic['Title'].replace('Mme', 'Mrs')
    # Mapping Sex
    titanic['Sex'] = titanic['Sex'].map( {'female': 0, 'male': 1} ).astype(int)
    titanic.rename(columns={'Sex':'Male'},inplace=True)


    # Mapping titles
    titles= pd.get_dummies(titanic['Title'], drop_first=True)
    titanic = titanic.drop('Title', axis=1)
    titanic = titanic.join(titles)
    # Mapping Embarked
    titanic['Embarked'] = titanic['Embarked'].cat.codes
    titanic['Embarked'].unique()

    # Feature selection
    drop_elements = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp','Parch','Fare','Age']
    titanic = titanic.drop(drop_elements, axis = 1)
    return titanic


# Load 10,000 rows of data into the dataframe.
data_prepro = Preprocessing_data()

def main():
    df= Preprocessing_data()
    gender =  st.sidebar.selectbox("Choose Sex :",['Male','Female'])

    if gender == "Male":
        st.write(df[df['Male']==1])

    elif gender == "Female":
        female=df[df['Male']==0]
        st.write(female)

def  cabine():
    df= Preprocessing_data()
    cabineee =  st.sidebar.selectbox(" Does he have a Cabin:",['Yes','No'])

    if cabineee == "Yes":
        st.write(df[df['Has_Cabin']==1])

    elif cabineee == "No":
        st.write(df[df['Has_Cabin']==0])
cabine()

#df= Preprocessing_data()
#sex=
#gender =  st.sidebar.selectbox("Choose Sex :",df['Male'].unique().tolist())


if st.checkbox('Show raw data'):
    st.subheader('Raw data')
    st.write(data)

if st.checkbox('Show preproccessed data'):
    st.subheader('Preprocessed data')
    st.write(data_prepro)


test_set= titanic[titanic['Survived'].isnull()]
train_set= titanic[titanic['Survived'].notna()]


X = train_set.drop(['Survived'], axis=1).values
y= train_set.Survived.values.reshape(-1,1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


# Logistic Regression
reglog = LogisticRegression()
reglog.fit(X_train, y_train)

y_pred = reglog.predict_proba(X_test)
#attention à ne pas calculer le score sur des données modifiées par le SC
print(reglog.score(X_train,y_train))


if __name__ == "__main__":
    main()