forked from danikiyasseh/loading-physiological-data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathload_ptbxl_ecg_part1.py
142 lines (121 loc) · 5.84 KB
/
load_ptbxl_ecg_part1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 14 12:13:14 2020
@author: scro3517
"""
import numpy as np
import os
import pickle
import wfdb
from tqdm import tqdm
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
import ast
#%%
basepath = '/mnt/SecondaryHDD/PTB-XL'
fs = 500
""" File Names to Load """
# Record files sit two directory levels below each 'records500' folder found in basepath
folders1 = os.listdir(basepath)
pth_to_folders = [folder for folder in folders1 if 'records500' in folder] #records100 contains 100Hz data
pth_to_folders = [os.path.join(pth_to_folder,fldr) for pth_to_folder in pth_to_folders for fldr in os.listdir(os.path.join(basepath,pth_to_folder))]
files = [os.path.join(pth_to_folder,fldr) for pth_to_folder in pth_to_folders for fldr in os.listdir(os.path.join(basepath,pth_to_folder))]
# Keep one extension-less path per header file; matching on the suffix (rather than
# a substring) avoids picking up names that merely contain '.hea'
files = [file[:-len('.hea')] for file in files if file.endswith('.hea')]
""" Database with Patient-Specific Info """
df = pd.read_csv(os.path.join(basepath,'ptbxl_database.csv'))
""" Database with Label Information """
codes_df = pd.read_csv(os.path.join(basepath,'scp_statements.csv'))
""" Identify Codes of Interest """
code_of_interest = 'rhythm' # options: 'rhythm' | 'all'
if code_of_interest == 'rhythm':
    encoder = LabelBinarizer() #because we may still have multi-output rhythm classification #LabelEncoder()
    codes = codes_df[codes_df['rhythm']==1]['Unnamed: 0'].tolist()
elif code_of_interest == 'all':
    encoder = LabelBinarizer()
    # .tolist() matters: `label in codes` on a pandas Series tests the index labels,
    # not the values, so the codes must be a plain list for membership tests
    codes = codes_df['Unnamed: 0'].tolist()
else:
    # Fail here instead of with a NameError on `encoder` further down
    raise ValueError("code_of_interest must be 'rhythm' or 'all', got %r" % (code_of_interest,))
""" Fit Binarizer to Codes for Conversion Later """
encoder.fit(codes)
ncodes = len(encoder.classes_)
def generate_inputs_and_outputs(leads_of_interest,samples_to_take,trial):
    """ Split every listed record into fixed-length frames, grouped by patient

    Relies on the module-level names files, basepath, df, codes, code_of_interest,
    encoder and ncodes.

    Args:
        leads_of_interest (list of str): lead names to keep, matched against each record's 'sig_name'
        samples_to_take (int): number of samples per frame
        trial (str): for 'contrastive_ml' and 'contrastive_msml' each frame holds all
            selected leads (samples_to_take x nleads); for any other value each
            frame holds a single lead (samples_to_take,)

    Returns:
        inputs (dict): patient id -> np.ndarray of frames
        outputs (dict): patient id -> np.ndarray with one multi-hot label vector per frame

    Raises:
        ValueError: from `.item()` when a record does not match exactly one row of df
    """
    inputs = dict()
    outputs = dict()
    # Iterating yields the code strings whether codes is a list or a pandas Series,
    # whereas `label in codes` on a Series would test its index instead of its values
    codes_of_interest = set(codes)
    for file in tqdm(files):
        data = wfdb.rdsamp(os.path.join(basepath,file))
        leads_data = data[0].transpose()
        leads_names = data[1]['sig_name']
        #patient_name = int(file.split('/')[-1].split('_hr')[0]) #lr for 100 Hz setting
        """ Look Up the Metadata Row Once per Record """
        filedata = df[df['filename_hr']==file]
        patient_name = filedata.patient_id.astype(int).item()
        """ Convert string of dictionary to dictionary """
        labels_and_confidence = ast.literal_eval(filedata['scp_codes'].item())
        labels = list(labels_and_confidence.keys())
        """ Only Include Patient if Label Falls Within Set of Labels of Interest """
        labels_present = [label for label in labels if label in codes_of_interest]
        if not labels_present: #none of the labels are of interest
            continue
        """ Keep Labels of Interest """
        if code_of_interest in ['rhythm']:
            labels = labels_present
        """ Convert Labels into List of Lists of OHE Vectors """
        list_of_ohe = encoder.transform(labels)
        """ Generate Multi-Hot Vector """
        multi_label_output = np.zeros((ncodes))
        for entry in list_of_ohe:
            multi_label_output += entry
        """ Extend Frames Already Stored for This Patient (Several Records May Share a Patient) """
        current_inputs = list(inputs.get(patient_name,[]))
        current_outputs = list(outputs.get(patient_name,[]))
        nframes = data[1]['sig_len']//samples_to_take
        lead_indices = np.where(np.isin(leads_names,leads_of_interest))[0]
        leads_data = leads_data[lead_indices,:]
        if trial in ['contrastive_ml','contrastive_msml']:
            for i in range(nframes):
                lead_frames = leads_data[:,i*samples_to_take:(i+1)*samples_to_take]
                lead_frames = lead_frames.transpose() #samples_to_take x nleads
                current_inputs.append(lead_frames)
                current_outputs.append(multi_label_output)
        else:
            # Rows follow the record's own channel order, not the order of leads_of_interest
            for lead in leads_data:
                for i in range(nframes):
                    frame = lead[i*samples_to_take:(i+1)*samples_to_take]
                    current_inputs.append(frame)
                    current_outputs.append(multi_label_output)
        inputs[patient_name] = np.array(current_inputs)
        outputs[patient_name] = np.array(current_outputs)
    return inputs,outputs
#%%
def obtain_samples_to_take(trial):
    """ Return the frame length (in samples) used for a given trial

    Args:
        trial (str): name of the experiment setting

    Returns:
        samples_to_take (int): 5000 for 'contrastive_ms' and 'contrastive_msml', 2500 for everything else
    """
    if trial in ('contrastive_ms','contrastive_msml'):
        samples_to_take = 5000
    else: #'contrastive_ml', 'contrastive_ss' and the default setting
        samples_to_take = 2500
    return samples_to_take
#%%
def makepath_and_save_data(inputs,outputs,leads_of_interest,trial='',code_of_interest='all',root=None):
    """ Pickle the frames and labels into a directory named after the trial, leads and classes

    Args:
        inputs (dict): frames to store, written to 'ecg_signal_frames_ptbxl.pkl'
        outputs (dict): labels to store, written to 'ecg_signal_labels_ptbxl.pkl'
        leads_of_interest (list of str): leads used; their str() becomes part of the directory name
        trial (str): experiment setting, used as a directory level
        code_of_interest (str): label subset name, used as a directory level
        root (str): directory under which 'patient_data' is created; defaults to the
            module-level basepath when None
    """
    if root is None:
        root = basepath
    savepath = os.path.join(root,'patient_data',trial,'leads_%s' % str(leads_of_interest),'classes_%s' % code_of_interest)
    # exist_ok creates the directory only when it is missing, without probing via
    # os.chdir (which would change the working directory) or hiding other errors
    os.makedirs(savepath,exist_ok=True)
    with open(os.path.join(savepath,'ecg_signal_frames_ptbxl.pkl'),'wb') as f:
        pickle.dump(inputs,f)
    with open(os.path.join(savepath,'ecg_signal_labels_ptbxl.pkl'),'wb') as f:
        pickle.dump(outputs,f)
    print('Saved!')
#%%
leads_of_interest = [['II','V2','aVL','aVR']] #[['I','II','III','aVL','aVR','aVF','V1','V2','V3','V4','V5','V6']] #list of lists regardless of number of leads
trial = 'contrastive_ms' # 'contrastive_ms' | 'contrastive_ml' | 'contrastive_msml' | 'contrastive_ss' | '' #default
samples_to_take = obtain_samples_to_take(trial) #depends only on trial, so computed once
""" Generate and Save One Dataset per Group of Leads """
for leads in leads_of_interest:
    inputs,outputs = generate_inputs_and_outputs(leads,samples_to_take,trial)
    makepath_and_save_data(inputs,outputs,leads,trial=trial,code_of_interest=code_of_interest)