-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerge-normalize-dataframe.py
89 lines (74 loc) · 2.66 KB
/
merge-normalize-dataframe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.svm import l1_min_c
import time
import json
# --- Load the per-day HR and SpO2 sample tables and tag columns by source ---
hr = pd.read_csv('/data/HR_daily_samples.csv').add_prefix('HR_')
print('Read in HR')
sp = pd.read_csv('/data/SPO2_daily_samples.csv').add_prefix('SP_')
print('Read in SP')

# Epoch-second timestamps -> day index so both streams join per subject-day.
sp['day'] = np.ceil(sp['SP_time'] / 60 / 60 / 24)
hr['day'] = np.ceil(hr['HR_time'] / 60 / 60 / 24)
sp = sp.rename(columns={'SP_id': 'id'})
hr = hr.rename(columns={'HR_id': 'id'})
df = pd.merge(hr, sp, on=['id', 'day'])

# --- Merge in the MATLAB-extracted feature tables on exact (id, time) ---
mat_hr = (pd.read_csv('./randomDailySample_matlab_HR.csv')
          .add_prefix('HR_')
          .rename(columns={'HR_id': 'id', 'HR_time': 'time'}))
mat_sp = (pd.read_csv('./randomDailySample_matlab_SP.csv')
          .add_prefix('SP_')
          .rename(columns={'SP_id': 'id', 'SP_time': 'time'}))
# NOTE(review): after prefixing, df carries HR_time/SP_time but no bare
# 'time' column, so these merges look like they would raise a KeyError —
# confirm against the actual CSV headers.
df = pd.merge(df, mat_hr, on=['id', 'time'])
df = pd.merge(df, mat_sp, on=['id', 'time'])

# Optional: include if interested in performance on just older babies.
# grouped = df.groupby("id")
# too_short = list(grouped['day'].max().index[grouped['day'].max() < 7])
# df = df[~df.id.isin(too_short)]

# Clean-up: infinities -> NaN, numeric columns only, keep columns with
# < 3.5% missing values, then drop rows that still contain any NaN.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.select_dtypes(exclude=['object'])
df = df.loc[:, df.isnull().mean() < .035]
print('Building Cleaned Dataframe')
X = df.dropna()

# Set the identifiers aside before feature transformation (re-attached at
# save time), and discard constant columns that carry no signal.
ids = X['id']
time = X['time']
X = X.drop(['id', 'time'], axis=1)
X = X.loc[:, X.std() != 0]
def normalize(x):
    """Map a column onto the open interval (0, 1) with a robust sigmoid.

    Centers on the median and scales by 1.35 * IQR (an outlier-resistant
    stand-in for the standard deviation under normality), then applies a
    logistic squash. The median of *x* maps to exactly 0.5.
    """
    iqr = x.quantile(.75) - x.quantile(.25)
    z = (x - x.median()) / (1.35 * iqr)
    return 1 / (1 + np.exp(-z))
# Apply the robust sigmoid normalization column-wise, then drop any column
# that produced NaNs (e.g. zero-IQR columns, where the transform divides
# by zero).
X = X.transform(normalize)
X = X.loc[:, X.isnull().sum() == 0]

# Drop specific feature families known to be unfit for modelling.
bad_column = []
for column in X.columns:
    if ('MD.pNN.pnn' in column
            or ('PH.' in column and 'res.acl' in column)
            or 'EN.PermEn.3' in column
            or ('SY.LocalGlobal' in column
                and ('skew' in column or 'ac1' in column
                     or 'kurtosis' in column))
            or 'Time' in column
            or 'day' in column):
        bad_column.append(column)
# BUG FIX: the original called X.drop(norm.columns[bad_index], axis=1),
# but no name `norm` exists anywhere in this file -> NameError. Drop the
# collected labels from X itself instead.
X = X.drop(bad_column, axis=1)

# Re-attach the identifiers set aside before normalization, then persist.
X['time'] = time
X['id'] = ids
X.to_csv('/outputs/normalized_data.csv', index=False)