data_preprocessing_script.py
"""Builds the final modelling dataframe: merges recency features with per-patient
payment aggregates and event-frequency counts, then exports final_df.csv."""
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import gc

# Column names used throughout the pipeline.
time_var = 'event_time'
payment_var = 'patient_payment'
id_var = 'patient_id'
y_var = 'outcome_flag'


def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns to the smallest dtype that fits their value range."""
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
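

# Illustrative-only sketch (not part of the pipeline): a tiny synthetic frame showing
# how reduce_mem_usage downcasts int64/float64 columns whose values fit in smaller
# dtypes. The helper name _demo_reduce_mem_usage and the sample values are assumptions
# for demonstration, not project code.
def _demo_reduce_mem_usage():
    demo = pd.DataFrame({
        'small_int': pd.Series([0, 1, 2], dtype='int64'),             # fits in int8
        'small_float': pd.Series([0.5, 1.5, 2.5], dtype='float64'),   # fits in float16
    })
    demo = reduce_mem_usage(demo, verbose=True)
    print(demo.dtypes)  # expected: small_int -> int8, small_float -> float16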
def pay_feats_creator(new_df, df):
    """Per-patient min/max/mean/sum of payments, pivoted by event_name and specialty."""
    cat_list = ['event_name', 'specialty']
    for cat in cat_list:
        payment_df = df.groupby(['patient_id', cat]).agg({payment_var: ['min', 'max', 'mean', 'sum']}).unstack(cat)
        payment_df.columns = ["_".join(x) for x in payment_df.columns.ravel()]
        payment_df.reset_index(inplace=True)
        new_df = pd.merge(new_df, payment_df, on='patient_id', how='left')
    return new_df


def overall_freq_feats_creator(new_df, df):
    """Per-patient event counts, pivoted by event_name."""
    cat_list = ['event_name']
    for cat in cat_list:
        payment_df = df.groupby(['patient_id', cat]).agg({time_var: 'count'}).unstack(cat)
        payment_df.columns = ["_".join(x) for x in payment_df.columns.ravel()]
        payment_df.reset_index(inplace=True)
        new_df = pd.merge(new_df, payment_df, on='patient_id', how='left')
    return new_df
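

# Illustrative-only sketch (not part of the pipeline): the groupby -> unstack ->
# flatten-columns pattern used by the two feature creators above, run on a toy frame.
# The helper name _demo_unstack_pattern and the sample rows are assumptions.
def _demo_unstack_pattern():
    toy = pd.DataFrame({
        'patient_id': [1, 1, 2],
        'event_name': ['a', 'b', 'a'],
        'patient_payment': [10.0, 20.0, 30.0],
    })
    wide = toy.groupby(['patient_id', 'event_name']).agg({'patient_payment': ['sum']}).unstack('event_name')
    wide.columns = ["_".join(x) for x in wide.columns.ravel()]
    wide.reset_index(inplace=True)
    print(wide.columns.tolist())  # e.g. ['patient_id', 'patient_payment_sum_a', 'patient_payment_sum_b']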
def main():
    print("Reading datasets...\n")
    train = pd.read_csv('./train_data.csv')
    test = pd.read_csv('./test_data.csv')
    train_labels = pd.read_csv('./train_labels.csv')

    print("Concatenate train and test into new dataframe...\n")
    df = pd.concat([train, test], axis=0).reset_index(drop=True)
    del train
    del test

    recency_df = pd.read_csv('recency_feats_df.csv')
    print("Reducing Memory Usage of Recency Features Dataset...\n")
    recency_df = reduce_mem_usage(recency_df)
    gc.collect()

    print("Creating aggregate features from patient_payment column...\n")
    recency_df = pay_feats_creator(recency_df, df)

    print("Creating overall frequency feature over event_name column...\n")
    recency_df = overall_freq_feats_creator(recency_df, df)

    print("Creating aggregate features from patient_payment column over patient_id...\n")
    patient_payment_df = df.groupby(['patient_id'], as_index=False).agg({payment_var: ['min', 'max', 'mean', 'sum']})
    patient_payment_df.columns = ["_".join(x) for x in patient_payment_df.columns.ravel()]
    patient_payment_df.rename({"patient_id_": 'patient_id'}, axis=1, inplace=True)
    recency_df = pd.merge(recency_df, patient_payment_df, on='patient_id', how='left')

    print("Creating count column for patient_id...\n")
    patient_events_count = df.groupby(['patient_id'], as_index=False).agg({'event_name': 'count'}).rename(
        {'event_name': 'patient_event_count'}, axis=1)
    recency_df = pd.merge(recency_df, patient_events_count, on='patient_id', how='left')
    recency_df = pd.merge(recency_df, train_labels, on='patient_id', how='left')

    print("Dropping columns present in train but not in test set...\n")
    drop_cols = ['spec_111', 'spec_123', 'spec_139', 'spec_141', 'spec_143',
                 'spec_158', 'spec_165', 'spec_170', 'spec_172', 'spec_174',
                 'spec_181', 'spec_183', 'spec_188', 'spec_194', 'spec_195',
                 'spec_197', 'spec_200', 'spec_201', 'spec_202', 'spec_205',
                 'spec_209', 'spec_211', 'spec_213', 'spec_214', 'spec_215',
                 'spec_216', 'spec_222', 'spec_225', 'spec_227', 'spec_228',
                 'spec_230', 'spec_231', 'spec_232', 'spec_238', 'spec_239',
                 'spec_241']
    drop_cols = [col for col in recency_df.columns if any(d_c in col for d_c in drop_cols)]
    recency_df.drop(drop_cols, axis=1, inplace=True)

    print("Exporting final dataframe with {} features".format(recency_df.shape[1] - 1))
    recency_df.to_csv('final_df.csv', index=False)
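

# Usage sketch (assumption: paths are relative to the working directory, as the
# read_csv calls above expect). With train_data.csv, test_data.csv, train_labels.csv
# and recency_feats_df.csv in place, running `python data_preprocessing_script.py`
# writes final_df.csv containing the merged recency, payment-aggregate and
# event-count features.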
if __name__ == "__main__":
    main()