ABIDEParser.py
# Copyright (C) 2017 Sarah Parisot <[email protected]>, Sofia Ira Ktena <[email protected]>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import os
import csv

import numpy as np
import scipy.io as sio
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import RFE
from nilearn import connectome


# Reading and computing the input data

# Selected pipeline
pipeline = 'cpac'

# Input data variables
root_folder = '/path/to/data/'
data_folder = os.path.join(root_folder, 'ABIDE_pcp/cpac/filt_noglobal')
phenotype = os.path.join(root_folder, 'ABIDE_pcp/Phenotypic_V1_0b_preprocessed1.csv')

def fetch_filenames(subject_IDs, file_type):
    """
    subject_IDs : list of short subject IDs in string format
    file_type   : must be one of the available file types ('func_preproc' or 'rois_ho')

    returns:
        filenames : list of filenames (same length as subject_IDs)
    """

    import glob

    # Specify file mappings for the possible file types
    filemapping = {'func_preproc': '_func_preproc.nii.gz',
                   'rois_ho': '_rois_ho.1D'}

    # The list to be filled
    filenames = []

    # Fill list with requested file paths
    for i in range(len(subject_IDs)):
        os.chdir(data_folder)
        try:
            filenames.append(glob.glob('*' + subject_IDs[i] + filemapping[file_type])[0])
        except IndexError:
            # Return N/A if subject ID is not found
            filenames.append('N/A')

    return filenames
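
# Usage sketch (hypothetical subject IDs; assumes data_folder already holds the
# downloaded ABIDE files):
#   scan_files = fetch_filenames(['50003', '50004'], 'func_preproc')
#   roi_files = fetch_filenames(['50003', '50004'], 'rois_ho')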

# Get timeseries arrays for list of subjects
def get_timeseries(subject_list, atlas_name):
    """
    subject_list : list of short subject IDs in string format
    atlas_name   : the atlas based on which the timeseries are generated, e.g. aal, cc200

    returns:
        timeseries : list of timeseries arrays, each of shape (timepoints x regions)
    """

    timeseries = []
    for i in range(len(subject_list)):
        subject_folder = os.path.join(data_folder, subject_list[i])
        ro_file = [f for f in os.listdir(subject_folder) if f.endswith('_rois_' + atlas_name + '.1D')]
        fl = os.path.join(subject_folder, ro_file[0])
        print("Reading timeseries file %s" % fl)
        timeseries.append(np.loadtxt(fl, skiprows=0))

    return timeseries
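
# Usage sketch (hypothetical IDs): each entry of the returned list is a
# (timepoints x regions) array, one per subject:
#   timeseries = get_timeseries(['50003', '50004'], 'ho')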

# Compute connectivity matrices
def subject_connectivity(timeseries, subject, atlas_name, kind, save=True, save_path=data_folder):
    """
    timeseries : timeseries table for subject (timepoints x regions)
    subject    : the subject ID
    atlas_name : name of the parcellation atlas used
    kind       : the kind of connectivity to be used, e.g. tangent, partial correlation, correlation
    save       : save the connectivity matrix to a file
    save_path  : specify path to save the matrix if different from subject folder

    returns:
        connectivity : connectivity matrix (regions x regions)
    """

    print("Estimating %s matrix for subject %s" % (kind, subject))

    if kind in ['tangent', 'partial correlation', 'correlation']:
        conn_measure = connectome.ConnectivityMeasure(kind=kind)
        connectivity = conn_measure.fit_transform([timeseries])[0]
    else:
        raise ValueError("Unknown connectivity kind: %s" % kind)

    if save:
        subject_file = os.path.join(save_path, subject,
                                    subject + '_' + atlas_name + '_' + kind.replace(' ', '_') + '.mat')
        sio.savemat(subject_file, {'connectivity': connectivity})

    return connectivity
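
# Usage sketch (hypothetical ID): compute and save a correlation matrix from a
# subject's ROI timeseries:
#   ts = get_timeseries(['50003'], 'ho')[0]
#   conn = subject_connectivity(ts, '50003', 'ho', 'correlation')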

# Get the list of subject IDs
def get_ids(num_subjects=None):
    """
    num_subjects : number of subject IDs to return (all if None)

    return:
        subject_IDs : list of subject IDs
    """

    subject_IDs = np.genfromtxt(os.path.join(data_folder, 'subject_IDs.txt'), dtype=str)

    if num_subjects is not None:
        subject_IDs = subject_IDs[:num_subjects]

    return subject_IDs
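
# Usage sketch (assumes a subject_IDs.txt file in data_folder, one ID per line):
#   subject_IDs = get_ids()      # all subjects
#   first_ten = get_ids(10)      # first ten subjects only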

# Get phenotype values for a list of subjects
def get_subject_score(subject_list, score):
    scores_dict = {}

    with open(phenotype) as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            if row['SUB_ID'] in subject_list:
                scores_dict[row['SUB_ID']] = row[score]

    return scores_dict
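
# Usage sketch: score names follow the columns of the ABIDE phenotypic CSV,
# e.g. 'DX_GROUP', 'SEX', 'SITE_ID'; values are returned as strings:
#   labels = get_subject_score(subject_IDs, score='DX_GROUP')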

# Dimensionality reduction step for the feature vector using a ridge classifier
def feature_selection(matrix, labels, train_ind, fnum):
    """
    matrix    : feature matrix (num_subjects x num_features)
    labels    : ground truth labels (num_subjects x 1)
    train_ind : indices of the training samples
    fnum      : size of the feature vector after feature selection

    return:
        x_data : feature matrix of lower dimension (num_subjects x fnum)
    """

    estimator = RidgeClassifier()
    selector = RFE(estimator, n_features_to_select=fnum, step=100, verbose=1)

    # Fit the recursive feature eliminator on the training samples only,
    # then transform the full matrix
    featureX = matrix[train_ind, :]
    featureY = labels[train_ind]
    selector = selector.fit(featureX, featureY.ravel())
    x_data = selector.transform(matrix)

    print("Number of labeled samples %d" % len(train_ind))
    print("Number of features selected %d" % x_data.shape[1])

    return x_data
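
# Usage sketch (hypothetical shapes): reduce the vectorised networks to 2000
# features using only the training indices:
#   x_data = feature_selection(features, y, train_ind, fnum=2000)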

# Make sure each site is represented in the training set when selecting a subset of the training set
def site_percentage(train_ind, perc, subject_list):
    """
    train_ind    : indices of the training samples
    perc         : fraction of the training set to keep, e.g. 0.5
    subject_list : list of subject IDs

    return:
        labeled_indices : indices of the subset of training samples
    """

    train_list = subject_list[train_ind]
    sites = get_subject_score(train_list, score='SITE_ID')
    unique = np.unique(list(sites.values())).tolist()
    site = np.array([unique.index(sites[train_list[x]]) for x in range(len(train_list))])

    labeled_indices = []

    # Keep the same fraction of subjects from every acquisition site
    for i in np.unique(site):
        id_in_site = np.argwhere(site == i).flatten()

        num_nodes = len(id_in_site)
        labeled_num = int(round(perc * num_nodes))
        labeled_indices.extend(train_ind[id_in_site[:labeled_num]])

    return labeled_indices
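
# Usage sketch: keep half of the training set with per-site balance preserved
# (subject_list and train_ind are assumed to be numpy arrays):
#   labeled_ind = site_percentage(train_ind, 0.5, subject_IDs)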

# Load precomputed fMRI connectivity networks
def get_networks(subject_list, kind, atlas_name="aal", variable='connectivity'):
    """
    subject_list : list of subject IDs
    kind         : the kind of connectivity to be used, e.g. correlation, partial correlation
    atlas_name   : name of the parcellation atlas used
    variable     : variable name in the .mat file that has been used to save the precomputed networks

    return:
        matrix : feature matrix of connectivity networks (num_subjects x network_size)
    """

    all_networks = []
    for subject in subject_list:
        fl = os.path.join(data_folder, subject,
                          subject + "_" + atlas_name + "_" + kind + ".mat")
        matrix = sio.loadmat(fl)[variable]
        all_networks.append(matrix)

    # Fisher-transform each matrix and vectorise its upper triangle
    # (excluding the diagonal)
    idx = np.triu_indices_from(all_networks[0], 1)
    norm_networks = [np.arctanh(mat) for mat in all_networks]
    vec_networks = [mat[idx] for mat in norm_networks]
    matrix = np.vstack(vec_networks)

    return matrix
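
# Usage sketch: build the subject-by-edge feature matrix from the .mat files
# written by subject_connectivity:
#   features = get_networks(subject_IDs, kind='correlation', atlas_name='ho')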

# Construct the adjacency matrix of the population from phenotypic scores
def create_affinity_graph_from_scores(scores, subject_list):
    """
    scores       : list of phenotypic measures used to construct the affinity graph, e.g. ['SEX', 'SITE_ID']
    subject_list : list of subject IDs

    return:
        graph : adjacency matrix of the population graph (num_subjects x num_subjects)
    """

    num_nodes = len(subject_list)
    graph = np.zeros((num_nodes, num_nodes))

    for l in scores:
        label_dict = get_subject_score(subject_list, l)

        # Quantitative phenotypic scores: connect subjects whose values differ by less than 2
        if l in ['AGE_AT_SCAN', 'FIQ']:
            for k in range(num_nodes):
                for j in range(k + 1, num_nodes):
                    try:
                        val = abs(float(label_dict[subject_list[k]]) - float(label_dict[subject_list[j]]))
                        if val < 2:
                            graph[k, j] += 1
                            graph[j, k] += 1
                    except ValueError:  # missing label
                        pass

        # Categorical phenotypic scores: connect subjects with identical values
        else:
            for k in range(num_nodes):
                for j in range(k + 1, num_nodes):
                    if label_dict[subject_list[k]] == label_dict[subject_list[j]]:
                        graph[k, j] += 1
                        graph[j, k] += 1

    return graph
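
# Usage sketch: combine two categorical scores into a population graph; each
# matching score adds 1 to the corresponding edge weight, so here
# graph[i, j] is in {0, 1, 2}:
#   graph = create_affinity_graph_from_scores(['SEX', 'SITE_ID'], subject_IDs)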