# baseline_load_data.py

import numpy as np
import pandas as pd
from constants import *
from typing import List
from utils.misc_utils import loso_test_idxs
from functools import reduce
import warnings


def data_loader(y_cols: List[str], x_cols_to_exclude: List[str]):
    '''
    Master DataFrame Load

    y_cols: columns for the Y variable
    x_cols_to_exclude: columns to exclude from X

    Returns: test_idxs (leave-one-site-out index sets), X, Y, pred_vars,
    the subject keys, and the merged master DataFrame.
    '''
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        # load family, site
        fam_df = pd.read_csv(path + "acspsw03.txt", delim_whitespace=True).drop(0)[merge_cols + ['rel_family_id']]
        site_df = pd.read_csv(path + "abcd_lt01.txt", delim_whitespace=True).drop(0)[merge_cols + ['site_id_l']]
        # motion
        motion_df = pd.read_csv(path + "meanfd.csv")
        motion_df['dofavg'] = motion_df.dof / motion_df.runs
        motion_df['censoredavg'] = motion_df.censored / motion_df.runs
        motion_df['subjectkey'] = motion_df['subjectkey'].str.replace('NDAR', 'NDAR_')
        motion_df['eventname'] = 'baseline_year_1_arm_1'
        # grab nuisance variables
        # todo: this file was generated by our team
        nuisance_df = pd.read_csv(path + "../ABCD_4.0_demo.csv")
        nuisance_df['HouseholdMaritalStatus'] = nuisance_df['HouseholdMaritalStatus'].str.replace('yes', 'married')
        nuisance_df['HouseholdMaritalStatus'] = nuisance_df['HouseholdMaritalStatus'].str.replace('no', 'not_married')
        # load phenos
        # todo: this file was generated by our team
        cbcl_df = pd.read_csv("./abcd_cbcls01_nosleep_scales.csv")
        cbcl_df = cbcl_df[merge_cols + cbcl_cols]
        cbcl_item_df = pd.read_csv(path + "abcd_cbcl01.txt", delim_whitespace=True).drop(0)
        cbcl_item_df = cbcl_item_df[merge_cols + cbcl_item_cols]
        bis_upps_df = pd.read_csv(path + "abcd_mhy02.txt", delim_whitespace=True).drop(0)
        bis_upps_df = bis_upps_df[merge_cols + bis_cols + upps_cols]
        upps_item_df = pd.read_csv(path + "abcd_upps01.txt", delim_whitespace=True).drop(0)
        upps_item_df = upps_item_df[merge_cols + upps_item_cols]
        nih_df = pd.read_csv(path + "abcd_tbss01.txt", delim_whitespace=True).drop(0)
        nih_df = nih_df[merge_cols + nih_cols]
        sscey_df = pd.read_csv(path + "abcd_sscey01.txt", delim_whitespace=True).drop(0)
        sscey_df = sscey_df[merge_cols + sscey_cols]
        sscep_df = pd.read_csv(path + "abcd_sscep01.txt", delim_whitespace=True).drop(0)
        sscep_df = sscep_df[merge_cols + sscep_cols]
        ses_df = pd.read_csv(path + "ABCD_ses.csv")  # todo: this file was generated by our team
        ses_df = ses_df[merge_cols + ses_cols]
        pdem_df = pd.read_csv(path + "pdem02.txt", delim_whitespace=True).drop(0)
        pdem_df[pdem_cols] = pdem_df[pdem_cols].astype(float).replace({777: np.nan, 999: np.nan})
        pdem_df = pdem_df[merge_cols + pdem_cols]
        pea_df = pd.read_csv(path + "abcd_ps01.txt", delim_whitespace=True).drop(0)
        pea_df = pea_df[merge_cols + pea_cols]
        lmt_df = pd.read_csv(path + "lmtp201.txt", delim_whitespace=True).drop(0)
        lmt_df = lmt_df[merge_cols + lmt_cols]
        neigh_parent_df = pd.read_csv(path + "abcd_pnsc01.txt", delim_whitespace=True).drop(0)
        neigh_parent_df = neigh_parent_df[merge_cols + neigh_crimes_par_cols]
        sleep_df = pd.read_csv("/nfs/corenfs/psych-freewill-data/Data/ABCD/NDAR/Release4.0/ABCDStudyNDA/abcd_ssphp01.txt", delim_whitespace=True).drop(0)
        pf10_df = pd.read_csv(latent_path + "ABCD_PF10_4.0.csv")
        pf10_df['eventname'] = 'baseline_year_1_arm_1'
        pf10_df = pf10_df[merge_cols + pf10_cols]
        ses_df2 = pd.read_csv(latent_path + "ABCD_ses_4.0.csv")
        ses_df2 = ses_df2[merge_cols + ses2_cols]
        g_df = pd.read_csv(latent_path + "ABCD_G_4.0.csv")
        g_df['eventname'] = 'baseline_year_1_arm_1'
        g_df = g_df[merge_cols + g_cols]
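        # anthropometrics: average the repeated height/weight measurements, compute
        # BMI, and keep only plausible values (13.5 < BMI < 60)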
        bmi_df = pd.read_csv(path + "abcd_ant01.txt", delim_whitespace=True).drop(0)
        bmi_df['height_in'] = np.nanmean(bmi_df[['anthro_1_height_in', 'anthro2heightin', 'anthro3heightin']].astype(float), axis=1)
        bmi_df['weight_lb'] = np.nanmean(bmi_df[['anthroweight1lb', 'anthroweight2lb', 'anthroweight3lb']].astype(float), axis=1)
        bmi_df['bmi'] = 703 * (bmi_df['weight_lb'] / bmi_df['height_in']**2)
        bmi_df = bmi_df[(bmi_df['bmi'] < 60) & (bmi_df['bmi'] > 13.5)]
        bmi_df = bmi_df[merge_cols + bmi_cols]
        screen_df = pd.read_csv(path + "abcd_ssmty01.txt", delim_whitespace=True).drop(0)
        screen_df = screen_df[merge_cols + screen_cols]
        # todo: this file was generated by our team
        family_df = pd.read_csv("ABCD_phenotypic_BPM.csv")
        family_df = family_df[merge_cols + family_cols]
        med_df = pd.read_csv(path + "abcd_medhxss01.txt", delim_whitespace=True).drop(0)
        med_df = med_df[merge_cols + med_cols]
        med_df2 = pd.read_csv(path + "abcd_mx01.txt", delim_whitespace=True).drop(0)
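        # impute 0 for medhx_2a/medhx_2d when they are missing solely because the
        # child had no doctor visits (medhx_1a == 0)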
        med_df_vals_1 = med_df2.medhx_2a.values.flatten()
        med_df_vals_2 = med_df2.medhx_2d.values.flatten()
        # cast to float so the == 0 comparison works on the string-typed NDA column
        med_df_doc_visits = med_df2.medhx_1a.values.flatten().astype(np.float32)
        no_doc_visit_idxs_1 = np.argwhere((med_df_doc_visits == 0) & np.isnan(med_df_vals_1.astype(np.float32))).flatten()
        no_doc_visit_idxs_2 = np.argwhere((med_df_doc_visits == 0) & np.isnan(med_df_vals_2.astype(np.float32))).flatten()
        med_df_vals_1[no_doc_visit_idxs_1] = 0
        med_df_vals_2[no_doc_visit_idxs_2] = 0
        med_df2['medhx_2a'] = med_df_vals_1
        med_df2['medhx_2d'] = med_df_vals_2
        med_df2 = med_df2[merge_cols + med_cols_2]
        comm_df = pd.read_csv(path + "abcd_pxccp01.txt", delim_whitespace=True).drop(0)
        comm_df[community_cohesion_cols] = comm_df[community_cohesion_cols].astype(float).replace({777: np.nan, 999: np.nan})
        comm_df = comm_df[merge_cols + community_cohesion_cols]
        comm_df['eventname'] = 'baseline_year_1_arm_1'
        fitbit_sleep_df = pd.read_csv('./fitbit_sleep_summary.csv')
        fitbit_sleep_df['eventname'] = 'baseline_year_1_arm_1'
        fitbit_sleep_df = fitbit_sleep_df[merge_cols + fitbit_sleep_cols]
        bpmt_df = pd.read_csv(f'{path}/abcd_bpmt01.txt', delim_whitespace=True).drop(0)
        bpmt_df = bpmt_df[merge_cols + bpmt_questions]
        asr_df = pd.read_csv(f'{path}/abcd_asrs01.txt', delim_whitespace=True).drop(0)
        asr_df = asr_df[merge_cols + asr_cols]
        # compile all dataframes into list
        df_list = [nuisance_df, bmi_df, cbcl_df, cbcl_item_df, nih_df, bis_upps_df, upps_item_df,
                   sscey_df, sscep_df, sleep_df, ses_df, pdem_df, pea_df, lmt_df, pf10_df, ses_df2,
                   g_df, screen_df, family_df, med_df, neigh_parent_df, comm_df, fitbit_sleep_df, bpmt_df, asr_df, med_df2]
        # merge all dataframes
        master_df = reduce(lambda left, right: pd.merge(left, right, on=merge_cols, how='outer'), df_list)
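        # the outer join keeps every subject/event present in at least one table;
        # missing rows become NaN and are dropped after the baseline filter below.
        # sanity-check that no expected column was lost in the merge: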
        for col in all_cols:
            if col not in master_df.columns:
                raise Exception(f'Missing Variable: {col}')
        '''
        Sleep, CBCL, UPPS, and NIH Load
        '''
        pred_vars = [x for x in all_cols if
                     (x not in latent_cols)
                     & (x not in y_cols)
                     & (x not in x_cols_to_exclude)]
        pred_vars += nuisance_cols[:1]
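        # predictors = every loaded column except the latent factors, the chosen Y
        # columns, and any explicit exclusions; the first nuisance column enters X
        # directly, the remaining nuisance columns are dummy-coded below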
        master_df = master_df[master_df.eventname == 'baseline_year_1_arm_1']
        master_df = master_df.dropna(axis=0, how='any', subset=pred_vars + y_cols)
        # define "predictor"/"predicted" feature sets for CCA
        Y = master_df[y_cols].values.astype(float)
        X = master_df[pred_vars].values.astype(float)
        # now add dummy variables
        for col in nuisance_cols[1:]:
            if col == 'RaceEthnicity':
                dummy = pd.get_dummies(master_df[col])
                dummy = dummy.drop('White', axis=1)
            else:
                dummy = pd.get_dummies(master_df[col], drop_first=True)
            pred_vars += dummy.columns.to_list()
            X = np.hstack((X, dummy.values))
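        # X now holds the numeric predictors followed by the one-hot nuisance columns
        # (one reference level dropped per variable: 'White' for RaceEthnicity, the
        # first level otherwise); pred_vars tracks the column order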
        # get sites for loso
        sites = master_df.site_id_l.values.flatten()
        test_idxs = loso_test_idxs(sites)
        return test_idxs, X, Y, pred_vars, master_df.subjectkey.values.flatten(), master_df
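

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only): it assumes constants.py defines the
    # column lists imported above (e.g. cbcl_cols) and that the ABCD source files
    # are available under `path`.
    test_idxs, X, Y, pred_vars, subject_keys, master_df = data_loader(
        y_cols=cbcl_cols,        # treat the CBCL scale scores as the Y block
        x_cols_to_exclude=[],    # keep every remaining predictor in X
    )
    print(f'X: {X.shape}, Y: {Y.shape}, '
          f'{len(pred_vars)} predictors, {len(subject_keys)} subjects')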