friendly-potato/main.py at main · ofw24/friendly-potato · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""

"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
from scipy.stats import norm, uniform
from analysis.mcmc.mcmc_methods import gibbs_sampler
from analysis.visuals.plotting import cont_2_color, funky_legend, labels_titles, plot_life_exps
from analysis.data_manips.data_extraction import ale, load_data, load_slurm, get_continent, grab_life_exp_gain
from analysis.mcmc.not_mcmc_methods import country_beta_ols, data_spline, double_logistic, linear_fit_params_life_exps, double_logistic_function, residuals

def main():
    data = "./raw_data/gapminder.tsv.txt"
    output = "./raw_data/slurm-10481503.out"
    df = load_data(data)
    count = "Croatia"
    rawdata = pd.read_csv(data, sep="\t")
    countries = df["country"].unique()
    beta_ols, _, _, _ = country_beta_ols(df, count)
    actual = ale(df, count)
    pred  = residuals(df, count)

    # Plotting raw data
    plt.figure(figsize=(8,6))
    X = np.array([])
    mS = {"Africa": [], "Americas": [], "Asia": [], "Europe": [], "Oceania": []}
    bS = {"Africa": [], "Americas": [], "Asia": [], "Europe": [], "Oceania": []}
    rS = {"Africa": [], "Americas": [], "Asia": [], "Europe": [], "Oceania": []}
    for count in countries:
        continent = get_continent(rawdata, count)
        plot_life_exps(plt.gca(), df, count, color=cont_2_color(continent), ec=None, alpha=0.2)
        # Linear regression
        m, b, r = linear_fit_params_life_exps(df, count)
        mS[continent].append(m); bS[continent].append(b); rS[continent].append(r)
        X = np.append(X, ale(df, count, "year"))
    # Lines of best fit for each continent
    m_per_cont = [np.mean(series) for series in mS.values()]
    b_per_cont = [np.mean(series) for series in bS.values()]
    r_per_cont = [np.mean(series) for series in rS.values()]
    for cont, m, b, r in zip(["Africa", "Americas", "Asia", "Europe", "Oceania"], m_per_cont, b_per_cont, r_per_cont):
        f = lambda x: m*x + b
        print(f"Average correlation coefficient {cont} is {r:0.4f}")
        plt.plot(X, f(X), c=cont_2_color(cont), lw=1.5, path_effects=[pe.Stroke(linewidth=4, foreground="k"), pe.Normal()], label=f"Best fit for {cont}")
    labels_titles(plt.gca(), xlabel="Year", ylabel="Average Life Expectancy")
    funky_legend(plt.gca())
    plt.show()

    # KEEP THIS FOR SPLINING
    actuals, resids, continents = [], [], np.array([])
    for count in countries:
        actual = ale(df, count); actuals.append(actual)
        resid = residuals(df, count); resids.append(resid)
        continent = get_continent(rawdata, count); continents = np.append(continents, [cont_2_color(continent) for _ in range(len(actual))])
    actuals, resids = np.array(actuals).flatten(), np.array(resids).flatten()
    resids = np.array([x for _, x in sorted(zip(actuals, resids))]); actuals.sort() # Sorting to deal with spline shenanigans
    spl = data_spline(actuals, actuals-resids, order=1) # THIS CAN NEVER BE CHANGED
    omega = uniform(0, 10).rvs()

    # GIBBS SAMPLING FOR ALL COUNTRIES
    # CHANGE THIS STATEMENT TO TRUE ONLY IF RUNNING ON SUPERCOMPUTER (or just not your own desktop)
    if True:
        all_predicted = []
        colors = []
        slurm_res = load_slurm(data, output)
        for count in countries:
            actual = ale(df, count)
            continent = np.array(rawdata["continent"][rawdata["country"] == count])[0]
            _, actual_gain = grab_life_exp_gain(df, count)
            print(f"{count} is in {continent}")
            # res = gibbs_sampler(df, count, beta_ols, 5000, 1000)
            # # Get averages for each parameter out of the gibbs sampler
            # gibbs_beta = np.zeros(len(res[0]))
            # for r in res:
            #     for i, p in enumerate(r):
            #         gibbs_beta[i] += p / len(res)
            #
            # print(f"Gibbs Beta : {gibbs_beta}")
            gibbs_predicted = []
            gibbs_beta = slurm_res[count][1]
            for index, life_exp in enumerate(actual):
                pred = double_logistic(gibbs_beta, life_exp) + life_exp + norm(0, (omega*spl(life_exp)**2)).rvs()
                if np.absolute(pred - life_exp) <= 50:
                    gibbs_predicted.append(pred)
                else: # Impute data by drawing from target distribution
                    print(f"{count} is highly problematic")
                    print(f"Predicted life exp : {pred}")
                    print(f"Actual life exp    : {life_exp}")
                    gibbs_predicted.append( norm(loc=life_exp, scale=2**2).rvs() )
                colors.append(cont_2_color(continent))
            all_predicted.append(gibbs_predicted)
        all_predicted = np.array(all_predicted).flatten()
    else: # Read in data from slurm output
        all_predicted, colors = [], []
        slurm_res = load_slurm(data, output)
        instances = 12 # Number of times each country appears
        counter = 0
        for key, val in slurm_res.items():
            all_predicted.append(val[-1][(12*counter):(12*(counter+1))])
            counter += 1
            colors.append([cont_2_color(val[0]) for _ in range(12)])
        all_predicted = np.array(all_predicted).flatten()
        colors = np.array(colors).flatten()

    # Plotting raw life expectancy gains
    # actual_lexp = []
    plt.figure(figsize=(8,6))
    for count in countries:
        # continent = np.array(rawdata["continent"][rawdata["country"] == count])[0]
        le, leg = grab_life_exp_gain(df, count) # Actual life expectancy and actual gain to life expectancy
        continent = get_continent(rawdata, count)
        if max(np.absolute(leg)) < 5:
            plt.scatter(le, leg, c=cont_2_color(continent), alpha=0.2)
        # if count == "Croatia":
        #     actual_lexp.append(le)
    # # Superimposing gain function on top
    # ales = []
    # for count in countries:
    #     ales.append(ale(df, "Zimbabwe", "lifeExp"))
    # ales = np.array(ales).flatten()
    X = np.linspace(30, 90, 1000)
    F = double_logistic_function([100, 2.1, 14, 30, 4, 1.1])
    Y = F(X)
    plt.plot(X, Y, c="r", lw=2, label="Africa", path_effects=[pe.Stroke(linewidth=4, foreground="k"), pe.Normal()])
    F = double_logistic_function([100, -2.1, 14, 40, 7.1, 6])
    Y = F(X) + 2
    plt.plot(X, Y, c="m", lw=2, label="Europe", path_effects=[pe.Stroke(linewidth=4, foreground="k"), pe.Normal()])
    labels_titles(plt.gca(), xlabel="Life Expectancy", ylabel="Expected Gain")
    funky_legend(plt.gca())
    plt.show()

    # Plotting
    plt.figure(figsize=(8,6))
    # UNCOMMENT FOR GETTING RESIDUAL PLOT WITH SPLINES
    # plt.scatter(actuals, resids, c=continents, label="Residuals")
    # plt.plot(actuals, spl(actuals), c="k", lw=2, label="Spline")
    plt.scatter(actuals[1:], all_predicted[1:], c=colors[1:]) # Something happened with the first point
    plt.plot(np.linspace(10, 100, 100), np.linspace(10, 100, 100), "k--", lw=2, alpha=0.5)
    # plt.xlim([10,100]); plt.ylim([10,100])
    labels_titles(plt.gca(), xlabel="Actual Life Expectancy", ylabel="Predicted Life Expectancy")
    funky_legend(plt.gca())
    plt.show()

    return

if __name__ == "__main__":
    np.random.seed(222222)
    main()