# Page chrome captured together with the script (not part of the program):
#   Notifications - You must be signed in to change notification settings - Fork 1
#   Copy path: script-01-gen-mechanisms.py
#   111 lines (86 loc) · 3.81 KB
# File: script-01-gen-mechanisms.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import numpy as np
import pandas as pd
def generate_cross_sectional(seed=123):
    """Simulate a 100-row cross-sectional data set with layered missingness.

    Columns ``x1``, ``x2``, ``x3`` are independent standard-normal draws and
    ``y = 0.5*x1 + 0.3*x2 + 0.2*x3 + N(0, 1)`` noise. Missing values are then
    injected in four steps (completely-at-random holes in ``x1``, then in
    ``x2``/``x3``, a monotone block, and finally holes in ``x3`` whose
    probability rises with ``x1``).

    Args:
        seed: Value passed to ``np.random.seed``. Note that this seeds NumPy's
            *global* generator, so later unseeded draws elsewhere are affected.

    Returns:
        A ``DataFrame`` with columns ``y``, ``x1``, ``x2``, ``x3``, sorted by
        ``x1``. The index keeps the original row labels ``0..99``.
    """
    np.random.seed(seed)
    n = 100

    x1 = np.random.normal(0, 1, n)
    x2 = np.random.normal(0, 1, n)
    x3 = np.random.normal(0, 1, n)
    y = 0.5 * x1 + 0.3 * x2 + 0.2 * x3 + np.random.normal(0, 1, n)

    df = pd.DataFrame({
        'y': y,
        'x1': x1,
        'x2': x2,
        'x3': x3,
    })

    # Univariate missing (MCAR in x1)
    missing_indices = np.random.choice(n, 20, replace=False)
    df.loc[missing_indices, 'x1'] = np.nan

    # Multivariate missing (MCAR in x2 and x3)
    df.loc[np.random.choice(n, 15, replace=False), 'x2'] = np.nan
    df.loc[np.random.choice(n, 15, replace=False), 'x3'] = np.nan

    # Monotone missing pattern: after sorting by x1, the first 10 rows lose
    # x2 and the first 20 rows lose x3 (so "x2 missing" implies "x3 missing"
    # within this block).
    df = df.sort_values('x1')
    df.iloc[:10, df.columns.get_loc('x2')] = np.nan
    df.iloc[:20, df.columns.get_loc('x3')] = np.nan

    # MAR pattern (x3 missing depending on x1). The probabilities come from
    # the complete x1 draws (before any NaN was injected), standardised with
    # the sample standard deviation and pushed through a logistic curve.
    z_scores = (x1 - x1.mean()) / x1.std(ddof=1)
    mar_prob = 1 / (1 + np.exp(-z_scores))
    mar_missing = np.random.binomial(1, mar_prob)

    # mar_missing is in ORIGINAL row order, but df has just been re-ordered by
    # sort_values. A raw boolean array passed to .loc is matched by position,
    # which would hit the wrong rows; select by original index label instead.
    mar_labels = np.flatnonzero(mar_missing == 1)
    df.loc[mar_labels, 'x3'] = np.nan

    return df
def generate_panel_data(seed=None):
    """Simulate a 20-unit x 10-year panel (2015-2024) with missing values.

    Each unit gets a random effect shared by ``x1``, ``x2`` and ``y``; all
    three also carry a linear time trend scaled to ``[0, 1]`` plus noise.
    Two missingness patterns are then injected:

    * Unit attrition: units 3, 7 and 12 lose ``y``, ``x1`` and ``x2`` for every
      year after 2020.
    * Intermittent missing: 8 randomly chosen units each lose ``x1`` and ``x2``
      in 2 randomly chosen years.

    Args:
        seed: Optional value passed to ``np.random.seed`` before any draw.
            With the default ``None`` the function keeps drawing from NumPy's
            global generator in whatever state the caller left it, so results
            are reproducible only if the caller seeded that generator.

    Returns:
        A tuple ``(panel_long, panel_wide)``. ``panel_long`` has one row per
        unit-year with columns ``unit_id``, ``year``, ``y``, ``x1``, ``x2``.
        ``panel_wide`` has one row per unit with ``unit_id`` plus one
        ``<var>_<year>`` column per variable and year.
    """
    if seed is not None:
        np.random.seed(seed)

    n_units = 20
    years = np.arange(2015, 2025)
    # Derived from the year span so the two can never disagree.
    n_years = len(years)
    n_obs = n_units * n_years

    unit_ids = np.repeat(np.arange(1, n_units + 1), n_years)
    year = np.tile(years, n_units)

    # One effect per unit, repeated across that unit's years.
    unit_effect = np.repeat(np.random.normal(0, 1, n_units), n_years)
    first_year, last_year = year.min(), year.max()
    time_trend = (year - first_year) / (last_year - first_year)

    x1 = unit_effect + 0.5 * time_trend + np.random.normal(0, 0.5, n_obs)
    x2 = unit_effect + 0.3 * time_trend + np.random.normal(0, 0.5, n_obs)
    y = (0.4 * x1 + 0.3 * x2 + unit_effect + 0.2 * time_trend
         + np.random.normal(0, 0.3, n_obs))

    panel_long = pd.DataFrame({
        'unit_id': unit_ids,
        'year': year,
        'y': y,
        'x1': x1,
        'x2': x2,
    })

    # Unit attrition
    dropout_units = [3, 7, 12]
    dropout_mask = (
        panel_long['unit_id'].isin(dropout_units) & (panel_long['year'] > 2020)
    )
    panel_long.loc[dropout_mask, ['y', 'x1', 'x2']] = np.nan

    # Intermittent missing
    random_units = np.random.choice(np.arange(1, n_units + 1), 8, replace=False)
    for unit in random_units:
        missing_years = np.random.choice(years, 2, replace=False)
        mask = (
            (panel_long['unit_id'] == unit)
            & panel_long['year'].isin(missing_years)
        )
        panel_long.loc[mask, ['x1', 'x2']] = np.nan

    # Create wide format: pivot yields (variable, year) column pairs, which
    # are flattened to "<variable>_<year>" names.
    panel_wide = panel_long.pivot(index='unit_id',
                                  columns='year',
                                  values=['y', 'x1', 'x2'])
    panel_wide.columns = [f"{var}_{year}" for var, year in panel_wide.columns]
    panel_wide = panel_wide.reset_index()

    return panel_long, panel_wide
def save_all_data():
    """Generate all three data sets and write them as CSV files under ``data/``.

    Calls ``generate_cross_sectional()`` first and ``generate_panel_data()``
    second, then writes each frame without its index.

    Returns:
        A tuple ``(cross_sectional, panel_long, panel_wide)`` of the frames
        that were written.
    """
    import os  # local import: only this function needs it

    cross_sectional = generate_cross_sectional()
    panel_long, panel_wide = generate_panel_data()

    # to_csv does not create missing parent directories; make sure the
    # output folder exists so a fresh checkout does not fail with OSError.
    os.makedirs('data', exist_ok=True)

    cross_sectional.to_csv('data/data2-py-cross_sectional.csv', index=False)
    panel_long.to_csv('data/data2-py-panel_long.csv', index=False)
    panel_wide.to_csv('data/data2-py-panel_wide.csv', index=False)

    return cross_sectional, panel_long, panel_wide
if __name__ == "__main__":
    # Script entry point: build and save every data set, then print a short
    # missing-value report for each one.
    cross_sectional, panel_long, panel_wide = save_all_data()

    # Missing-value count per column of the cross-sectional frame.
    print("\nCross-sectional missing patterns:")
    cross_sectional_missing = cross_sectional.isnull().sum()
    print(cross_sectional_missing)

    # Missing-value count per year for each panel variable.
    print("\nPanel (long format) missing patterns by year:")
    missing_by_year = pd.DataFrame({
        f"{column}_missing": panel_long.groupby('year')[column].apply(
            lambda values: values.isnull().sum()
        )
        for column in ('y', 'x1', 'x2')
    })
    print(missing_by_year)

    print("\nPanel (wide format) first few rows:")
    wide_preview = panel_wide.head()
    print(wide_preview)