-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path220305_patientReel01.py
121 lines (102 loc) · 5.06 KB
/
220305_patientReel01.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import pandas as pd
import numpy as np
import statistics
import glob
import tqdm
from datetime import datetime, timedelta
def parseResultTime(resultTime):
    """Parse the date portion of a result timestamp string.

    Only the text before the first space is used; it must be formatted as
    ``DD-Mon-YYYY`` (``%d-%b-%Y``). Anything after the first space (e.g. a
    time of day) is ignored.

    Returns:
        A ``datetime`` at midnight of the parsed date.

    Raises:
        ValueError: if the date portion does not match ``%d-%b-%Y``.
    """
    datePart, _, _ = resultTime.partition(' ')
    return datetime.strptime(datePart, '%d-%b-%Y')
def calcReelFrame(resultDateString, referenceTime=datetime(2008, 9, 30)):
    """Return the time elapsed between a reference date and a result date.

    Args:
        resultDateString: result timestamp string, parsed by
            ``parseResultTime``.
        referenceTime: origin of the timeline; defaults to 2008-09-30.

    Returns:
        A ``timedelta`` (negative when the result predates ``referenceTime``).
    """
    resultDate = parseResultTime(resultDateString)
    elapsed = resultDate - referenceTime
    return elapsed
def normalizeHelper(ORD_VALUE, FINAL_REF_LOW, FINAL_REF_HIGH):
    """Scale a lab value relative to its reference range, centred on zero.

    Computes ``(value - low) / (high - low) - 0.5``, so a value at the low
    bound maps to -0.5, the midpoint to 0 and the high bound to +0.5.

    Args:
        ORD_VALUE: the lab value; anything ``float()`` accepts.
        FINAL_REF_LOW: lower reference bound; anything ``float()`` accepts.
        FINAL_REF_HIGH: upper reference bound; anything ``float()`` accepts.

    Returns:
        The normalized float, or ``0`` when any input cannot be converted to
        a float or the reference range has zero width.
    """
    try:
        value = float(ORD_VALUE)
        low = float(FINAL_REF_LOW)
        high = float(FINAL_REF_HIGH)
        return (value - low) / (high - low) - 0.5
    except (TypeError, ValueError, OverflowError, ZeroDivisionError):
        # Best-effort fallback for non-numeric text, missing values and
        # degenerate ranges. Deliberately not a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return 0
def convertToFloat(ORD_VALUE):
    """Convert a value to ``float``, falling back to ``0`` when impossible.

    Args:
        ORD_VALUE: anything ``float()`` may accept (number or numeric text).

    Returns:
        ``float(ORD_VALUE)``, or ``0`` if the conversion fails.
    """
    try:
        return float(ORD_VALUE)
    except (TypeError, ValueError, OverflowError):
        # Non-numeric text, None, or an integer too large for a float.
        # Deliberately not a bare ``except`` so that KeyboardInterrupt /
        # SystemExit still propagate.
        return 0
# NOTE: hospital encounters were meant to be excluded here through an inner
# join against enc['pat_enc_csn_id'] (enc from #2 in the table of contents).
# That merge is currently disabled, so no encounter-based filtering happens.
def lab_clean(df, use_ref_ranges=True):
    """Filter raw lab results and add reel-frame / normalized-value columns.

    Steps, in order:
      1. Add ``REEL_FRAME``: whole 14-day periods between the reference date
         used by ``calcReelFrame`` and ``RESULT_DATE``; keep rows with
         ``REEL_FRAME > 0`` only (frame 0 and earlier are dropped).
      2. Drop rows whose ``ORDER_CLASS_C`` is 30 or 143 (external
         orders/results).
      3. Drop point-of-care rows: ``PROC_NAME`` containing ABL, iStat, POC or
         POCT, or ``ORDER_CLASS`` equal to ``'Point of Care'``.
      4. If ``use_ref_ranges`` is true: derive ``FINAL_REF_LOW`` /
         ``FINAL_REF_HIGH`` from ``REF_NORMAL_VALS`` (text of the form
         ``low-high``), falling back to ``REFERENCE_LOW`` /
         ``REFERENCE_HIGH``; drop rows lacking either bound; add
         ``ORD_VALUE_NORM`` via ``normalizeHelper``.

    Args:
        df: DataFrame with the columns named above plus ``ORD_VALUE``.
        use_ref_ranges: when false, step 4 is skipped and no
            ``ORD_VALUE_NORM`` column is added.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    # Work on a copy so the caller's frame does not gain a REEL_FRAME column.
    df = df.copy()

    # Restrict to >9/2008. Series.apply (rather than row-wise DataFrame.apply)
    # also behaves on an empty frame.
    df['REEL_FRAME'] = df['RESULT_DATE'].apply(
        lambda resultDate: calcReelFrame(resultDate).days // 14)
    df = df[df['REEL_FRAME'] > 0]

    # Remove external orders/results
    df = df[~df['ORDER_CLASS_C'].isin([30, 143])]

    # Remove point of care. Plain alternation avoids the capture-group warning
    # from str.contains; na=False keeps rows with a missing PROC_NAME instead
    # of failing when the mask is negated.
    isPointOfCare = (
        df['PROC_NAME'].str.contains(r'ABL|iStat|POC|POCT', na=False)
        | (df['ORDER_CLASS'] == 'Point of Care')
    )
    # Explicit copy: columns are assigned on the filtered frame below.
    df = df[~isPointOfCare].copy()

    # Remove if no ref range if TRUE
    if use_ref_ranges:
        # Non-string entries (e.g. NaN) pass through unchanged and are
        # resolved by to_numeric below.
        df['REF_LOW_2'] = df['REF_NORMAL_VALS'].apply(
            lambda x: x.split('-')[0] if isinstance(x, str) else x)
        # The high bound is only taken when the text splits into exactly two
        # parts; otherwise the raw text is kept and coerced to NaN below.
        df['REF_HIGH_2'] = df['REF_NORMAL_VALS'].apply(
            lambda x: x.split('-')[1]
            if isinstance(x, str) and len(x.split('-')) == 2 else x)
        df['REF_LOW_2'] = pd.to_numeric(df['REF_LOW_2'], errors='coerce')
        df['REF_HIGH_2'] = pd.to_numeric(df['REF_HIGH_2'], errors='coerce')

        # Prefer the bound parsed from REF_NORMAL_VALS; fall back to the
        # dedicated reference columns where parsing produced NaN.
        df['FINAL_REF_LOW'] = df['REF_LOW_2'].where(
            df['REF_LOW_2'].notna(), df['REFERENCE_LOW'])
        df['FINAL_REF_HIGH'] = df['REF_HIGH_2'].where(
            df['REF_HIGH_2'].notna(), df['REFERENCE_HIGH'])

        hasBothBounds = df['FINAL_REF_LOW'].notna() & df['FINAL_REF_HIGH'].notna()
        df = df[hasBothBounds].copy()

        # A list comprehension (instead of DataFrame.apply(axis=1)) yields an
        # empty column cleanly when every row has been filtered out.
        df['ORD_VALUE_NORM'] = [
            normalizeHelper(ORD_VALUE=value, FINAL_REF_LOW=low, FINAL_REF_HIGH=high)
            for value, low, high in zip(
                df['ORD_VALUE'], df['FINAL_REF_LOW'], df['FINAL_REF_HIGH'])
        ]
    # NOTE(review): with use_ref_ranges false, ORD_VALUE_NORM is never
    # created; a raw-float fallback via convertToFloat looks to have been
    # intended at some point but is disabled — confirm before relying on it.
    return df
def singleLabComponentReel(df, labComp, numFrames=240):
    """Build a fixed-length per-frame series ("reel") for one lab component.

    For each frame index ``0 .. numFrames - 1`` the reel holds the mean of
    ``ORD_VALUE_NORM`` over the rows of ``labComp`` whose ``REEL_FRAME``
    equals that index. Frames without any result repeat the previous frame's
    value; frames before the first result stay at 0.

    Args:
        df: DataFrame with ``COMMON_NAME``, ``REEL_FRAME`` and
            ``ORD_VALUE_NORM`` columns.
        labComp: the ``COMMON_NAME`` value to select.
        numFrames: length of the reel (default 240, the original fixed size).
            Rows whose ``REEL_FRAME`` falls outside ``0 .. numFrames - 1``
            are ignored.

    Returns:
        A 1-D float ``numpy`` array of length ``numFrames``; all zeros when
        ``df`` has no rows for ``labComp``.
    """
    labREEL = np.zeros(numFrames)
    thisDF = df[df['COMMON_NAME'] == labComp]
    if len(thisDF) == 0:
        return labREEL

    # Group the values by frame in one pass instead of re-filtering the
    # DataFrame once per frame index.
    valuesByFrame = {}
    for frame, value in zip(thisDF['REEL_FRAME'].values,
                            thisDF['ORD_VALUE_NORM'].values):
        valuesByFrame.setdefault(frame, []).append(value)

    carriedValue = 0.0  # value repeated across frames that have no results
    for eachIndex in range(numFrames):
        frameValues = valuesByFrame.get(eachIndex)
        if frameValues:
            carriedValue = statistics.mean(
                [float(eachItem) for eachItem in frameValues])
        labREEL[eachIndex] = carriedValue
    return labREEL
if __name__ == "__main__":
    import os  # local import: only the script entry point needs path helpers

    # Loaded for the hospital-encounter exclusion mentioned in the note above
    # lab_clean; that merge is disabled, so enc is not otherwise used below.
    enc = pd.read_csv('Data/req1512_encounter.csv')

    # Columns read from each per-patient lab CSV.
    labColumns = [
        'MRN', 'PAT_ENC_CSN_ID', 'ORDER_PROC_ID', 'PROC_NAME', 'RESULT_DATE',
        'COMMON_NAME', 'COMPONENT_NAME', 'BASE_NAME', 'ORD_VALUE',
        'REFERENCE_LOW', 'REFERENCE_HIGH', 'REFERENCE_UNIT',
        'REF_NORMAL_VALS', 'ORDER_CLASS_C', 'ORDER_CLASS',
    ]
    maxPatients = 100  # only the first 100 IDs (in descending order) are processed

    # A patient ID is the file-name prefix before the first underscore.
    # os.path.basename keeps this independent of the path separator that
    # glob returns.
    listOfIDs = sorted(
        {os.path.basename(eachPath).split('_')[0]
         for eachPath in glob.glob('Data/lab_data_patientLvl/*.csv')},
        reverse=True,
    )

    # Lab components flagged include == 1 become the rows of each patient reel.
    labComponentsDF = pd.read_csv('Data/lab_stats_Yee_JW_edit.csv')
    labCompList = labComponentsDF[labComponentsDF['include'] == 1]['component name'].values

    for eachID in tqdm.tqdm(listOfIDs[:maxPatients]):
        # Paths are formatted directly rather than via str.replace('id', ...),
        # which would also rewrite any other "id"/"npy" substring in the path.
        thisPathList = glob.glob('Data/lab_data_patientLvl/{}_*.csv'.format(eachID))
        if not thisPathList:
            # e.g. a file name without an underscore: nothing to concatenate,
            # and pd.concat would raise on an empty list.
            continue

        idDataFrame = pd.concat(
            [pd.read_csv(eachPath, usecols=labColumns) for eachPath in thisPathList])
        idDataFrame = lab_clean(idDataFrame)
        print(idDataFrame)

        # One reel per included lab component, stacked into a 2-D array
        # (components x frames).
        patientReel = np.stack(
            [singleLabComponentReel(idDataFrame, labComp=eachItem)
             for eachItem in labCompList])

        # Save the same array twice: binary .npy and comma-separated text.
        npySavePath = 'Data/lab_data_patientReels/{}.npy'.format(eachID)
        np.save(npySavePath, patientReel)
        csvSavePath = 'Data/lab_data_patientReels/{}.csv'.format(eachID)
        print(csvSavePath)
        np.savetxt(csvSavePath, patientReel, delimiter=",")