-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathNESARCLog.py
38 lines (29 loc) · 1.44 KB
/
NESARCLog.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
##This code fits a logistic regression to the NESARC survey data
##Import the modules
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
##Location of the data set and the survey columns used for the response and explanatory variables
DATA_PATH = 'C:\\Users\\Vadim Katsemba\\Documents\\nesarc_pds.csv'
SURVEY_COLUMNS = ['S1Q7A9','AGE','CONSUMER','NUMPERS','ETHRACE2A']
##Quantitative predictors that are mean-centered before fitting
CENTERED_COLUMNS = ['AGE','DRINKSTATUS','HOUSE_PEOPLE']


def clean_codes(column, missing_codes=(), recode=None):
    """Return a survey column as floats with blanks and sentinel codes removed.

    Codes are compared numerically *after* the float conversion, so the result
    is the same whether pandas parsed the column as strings (because it
    contains blank cells) or as integers (because it does not).

    Args:
        column: pandas Series of raw survey codes (strings and/or numbers).
        missing_codes: numeric codes to turn into NaN.
        recode: optional ``{old: new}`` mapping of numeric codes to rewrite.

    Returns:
        A float Series on the same index as ``column``.

    Raises:
        ValueError: if a non-blank cell cannot be converted to a float.
    """
    # A single space is how the file marks an empty cell.
    values = column.replace(' ', np.nan).astype(float)
    # Keep every value that is not one of the sentinel codes.
    values = values.where(~values.isin(list(missing_codes)))
    if recode:
        values = values.replace(recode)
    return values


def prepare_frame(raw):
    """Build the modelling data frame from the raw survey columns.

    Args:
        raw: DataFrame holding the columns named in ``SURVEY_COLUMNS``.

    Returns:
        DataFrame with RETIRED, AGE, DRINKSTATUS, HOUSE_PEOPLE (floats, the
        last three centered on their means) and RACE (blanks set to NaN,
        otherwise left as read).
    """
    frame = pd.DataFrame()
    # Code 2 becomes 0 so the response is 0/1, as logit requires.
    # NOTE(review): presumably 1 = yes / 2 = no in the codebook - confirm.
    frame['RETIRED'] = clean_codes(raw['S1Q7A9'], recode={2.0: 0.0})
    # 98 and 999 are dropped as unknown values, matching the original script.
    frame['AGE'] = clean_codes(raw['AGE'], missing_codes=(98,))
    frame['DRINKSTATUS'] = clean_codes(raw['CONSUMER'], missing_codes=(999,))
    frame['HOUSE_PEOPLE'] = clean_codes(raw['NUMPERS'])
    # RACE is only used as a categorical term, so it is not converted to float.
    frame['RACE'] = raw['ETHRACE2A'].replace(' ', np.nan)
    ##Center the means (NaN values are skipped when the mean is computed)
    frame[CENTERED_COLUMNS] = frame[CENTERED_COLUMNS] - frame[CENTERED_COLUMNS].mean()
    return frame


def odds_ratio_table(result):
    """Tabulate odds ratios, their confidence intervals and p-values.

    Args:
        result: fitted model exposing ``conf_int()``, ``params`` and ``pvalues``.

    Returns:
        DataFrame with columns 'Lower CI', 'Upper CI', 'OR' and 'p-val'.
    """
    table = result.conf_int()
    table.columns = ['Lower CI','Upper CI']
    table['OR'] = result.params
    # Exponentiate the coefficients and their bounds to get odds ratios.
    table = np.exp(table)
    # p-values are added after the exponentiation so they stay untouched.
    table['p-val'] = result.pvalues.round(3)
    return table


if __name__ == '__main__':
    ##Read the data set and select the columns for the response and explanatory variables
    data = pd.read_csv(DATA_PATH, usecols=SURVEY_COLUMNS)
    ##Customize the data frame and remove missing and unknown values
    dataf = prepare_frame(data)
    print(dataf[CENTERED_COLUMNS].describe())
    ##Run the logistic regression with Race as a categorical variable
    logm = smf.logit(formula='RETIRED ~ AGE + DRINKSTATUS + HOUSE_PEOPLE + C(RACE)',data=dataf).fit()
    print(logm.summary())
    ##Generate a table of confidence intervals, odds ratios and p-values
    conf = odds_ratio_table(logm)
    print(conf)