forked from noxtoby/TADPOLE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
TADPOLE_SimpleForecastExampleLeaderboard.py
executable file
·338 lines (305 loc) · 15.8 KB
/
TADPOLE_SimpleForecastExampleLeaderboard.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import os
import sys
import datetime as dt
from dateutil.relativedelta import relativedelta
# TADPOLE_SimpleForecastExampleLeaderboard.py
# - Translated from TADPOLE_SimpleForecastExampleLeaderboard.m by Neil P. Oxtoby -
#
# Example code showing how to construct a forecast in the correct format
# for a leaderboard submission to TADPOLE Challenge 2017.
#
# Uses the LB2 prediction set generated by makeLeaderboardDataset.py (see
# github.com/noxtoby/TADPOLE/evaluation for the code).
#
# The simple forecast example here simply uses a set of predefined
# defaults:
# 1. Likelihoods of each diagnosis (CN, MCI, AD) that depend on the
# subject's most recent clinical status.
# 2. Forecasts of future ADAS13 score and Ventricles volume are just the
# same as the most recent measurement, or filled with defaults where
# the data is missing.
#
# ****** The purpose of the code is not to give a good forecast! ******
#
# It is simply to show how to read in and make sense of the TADPOLE data
# sets and to output a forecast in the right format.
#
# You may wish to use this code as a starting point for generating your own
# leaderboard submission (spreadsheet).
#
#============
# Authors:
# Daniel C. Alexander, Neil P. Oxtoby, and Razvan Valentin-Marinescu
# University College London
# Date:
# 9 August 2017
# Last updated:
# 13 September 2017
## Read in the TADPOLE data set and extract a few columns of salient information.
# Script requires that TADPOLE_D1_D2.csv is in the parent directory and that
# TADPOLE_LB1_LB2.csv is in the current directory. Change if necessary.
dataLocationD1D2 = '../' # parent directory
dataLocationLB1LB2 = './' # current directory

tadpoleD1D2File = os.path.join(dataLocationD1D2,'TADPOLE_D1_D2.csv')
tadpoleLB1LB2File = os.path.join(dataLocationLB1LB2,'TADPOLE_LB1_LB2.csv')
outputFile = 'TADPOLE_Submission_Leaderboard_TeamName1.csv'

# Both spreadsheets are read further down, so check both here and report
# every missing file before stopping (rather than failing later with a
# traceback from the CSV reader).
errorFlag = 0
if not os.path.exists(tadpoleD1D2File):
    print('File {0} does not exist! \nYou need to download it from ADNI\n and/or move it in the right directory'.format(tadpoleD1D2File))
    errorFlag = 1
if not os.path.exists(tadpoleLB1LB2File):
    print('File {0} does not exist! \nYou need to generate it with makeLeaderboardDataset.py\n and/or move it in the right directory'.format(tadpoleLB1LB2File))
    errorFlag = 1
if errorFlag:
    # Exit with a non-zero status so a calling shell or script can detect
    # the failure; a bare sys.exit() would report success.
    sys.exit(1)
# choose whether to display warning messages
verbose = 0

#* Read in the D1_D2 spreadsheet: may give a DtypeWarning, but the read/import works.
#* This file contains all the necessary data - the TADPOLE_LB1_LB2.csv spreadsheet contains
#* only the LB1 and LB2 indicators, aligned to TADPOLE_D1_D2.csv
TADPOLE_Table = pd.read_csv(tadpoleD1D2File,low_memory=False)
#* Read in the LB1_LB2 spreadsheet
LB_Table = pd.read_csv(tadpoleLB1LB2File,low_memory=False)

#* Target variables: convert strings to numeric if necessary
targetVariables = ['DX','ADAS13','Ventricles']
variablesToCheck = ['RID','ICV_bl'] + targetVariables # also check RosterID and IntraCranialVolume
for column in variablesToCheck:
    if column == 'DX':
        # DX holds text labels; it is parsed separately further down.
        continue
    if not pd.api.types.is_numeric_dtype(TADPOLE_Table[column]):
        # The column was read as text. pd.to_numeric converts the whole column
        # at once (the np.str / np.int aliases used previously no longer exist
        # in current NumPy, and np.int cannot convert a whole column anyway).
        # Entries that cannot be parsed become NaN, which the rest of the
        # script already treats as missing data.
        TADPOLE_Table[column] = pd.to_numeric(TADPOLE_Table[column], errors='coerce')
#* Numeric target variables as stand-alone arrays; missing data (NaN) is re-encoded as -1
def _nan_to_minus_one(column_values):
    """Return a copy of *column_values* in which every NaN is replaced by -1."""
    filled = np.array(column_values, copy=True)
    filled[np.isnan(filled)] = -1
    return filled

# ADAS13 scores
ADAS13_Col = _nan_to_minus_one(TADPOLE_Table.ADAS13.values)

# Ventricles volumes and baseline intracranial volumes (ICV)
Ventricles_Col = _nan_to_minus_one(TADPOLE_Table.Ventricles.values)
ICV_Col = _nan_to_minus_one(TADPOLE_Table.ICV_bl.values)

# Where the ventricles volume is missing, divide by 1 so that the normalised
# value keeps the missing-data code: -1/1 = -1
ICV_Col[Ventricles_Col == -1] = 1

# Ventricles volume normalised by intracranial volume
Ventricles_ICV_Col = Ventricles_Col / ICV_Col
#* Create an array containing the clinical status column from the spreadsheet
# DXCHANGE: current diagnosis (DX) and change since most recent visit, i.e., '[previous DX] to [current DX]'
DXCHANGE = TADPOLE_Table.DX.values.copy() # 'NL to MCI', 'MCI to Dementia', etc.

# Convert DXCHANGE to current DX: keep only the text after the final space,
# e.g. 'NL to MCI' -> 'MCI'; a label without spaces ('NL') is kept as is.
# Missing entries are read in as numeric NaN rather than text, so anything
# that is not a string is encoded as the empty string. An explicit isinstance
# test is used because np.isreal is not dependable on string input.
DX = np.array(
    [entry.rsplit(' ', 1)[-1] if isinstance(entry, str) else '' for entry in DXCHANGE],
    dtype=object,
)
CLIN_STAT_Col = DX.copy()
#* Subject IDs as a stand-alone array, with missing IDs encoded as -1
RID_Col = np.array(TADPOLE_Table.RID.values, copy=True)
missing_rid = np.isnan(RID_Col)
RID_Col[missing_rid] = -1

#* Exam dates expressed as (approximate) months elapsed since 1 Jan 2000:
#* days elapsed, divided by 365 and multiplied by 12
ref = dt.datetime(2000,1,1)
EXAMDATE = TADPOLE_Table.EXAMDATE.values.copy()
ExamMonth_Col = np.array(
    [(dt.datetime.strptime(date_string, '%Y-%m-%d') - ref).days/365*12
     for date_string in EXAMDATE],
    dtype=float,
)

#* Boolean indicator of LB2 membership, one entry per row of LB_Table
LB2_col = LB_Table['LB2'] == 1
## Generate the very simple forecast
print('Generating forecast ...')

#* Get the list of subjects to forecast from LB2. np.unique returns each
#* subject ID once, sorted in ascending order (the original notes state that
#* this ordering is the same as in the submission template).
lbInds = np.where(LB2_col)[0]
LB2_SubjList = np.unique(RID_Col[lbInds])
N_LB2 = len(LB2_SubjList)

# As opposed to the actual submission, we require 84 months of forecast
# data. This is because some ADNI2 subjects in LB4 have visits as long as
# 7 years after their last ADNI1 visit in LB2

#* Create arrays to contain the 84 monthly forecasts for each LB2 subject.
#* Each array is indexed [subject, month, value].
nForecasts = 7*12 # forecast 7 years (84 months).
# 1. Clinical status forecasts
#    i.e. relative likelihood of NL, MCI, and Dementia (3 numbers)
CLIN_STAT_forecast = np.zeros([N_LB2, nForecasts, 3])
# 2. ADAS13 forecasts
#    (index 0: best guess, 1: lower bound, 2: upper bound of the 50% confidence interval)
ADAS13_forecast = np.zeros([N_LB2, nForecasts, 3])
# 3. Ventricles volume forecasts
#    (index 0: best guess, 1: lower bound, 2: upper bound of the 50% confidence interval)
Ventricles_ICV_forecast = np.zeros([N_LB2, nForecasts, 3])

#* Our example forecast for each subject is based on the most recent
#* available (not missing) data for each target variable in LB2.
#* Extract most recent data.
# Initialise storage arrays with the missing-data codes ('' and -1)
most_recent_CLIN_STAT = N_LB2*['']
most_recent_ADAS13 = -1*np.ones([N_LB2, 1])
# np.ones, not np.zeros: -1*zeros would initialise to 0 rather than to the
# -1 missing-data code used everywhere else.
most_recent_Ventricles_ICV = -1*np.ones([N_LB2, 1])

display_info = 0 # Useful for checking and debugging (see below)
#*** Defaults - used when a subject has no data for a target variable
#* Ventricles
# Missing data = typical volume +/- broad interval = 25000 +/- 20000
Ventricles_typical = 25000
Ventricles_broad_50pcMargin = 20000 # +/- (broad 50% confidence interval)
# Margin used when a measurement is available
Ventricles_default_50pcMargin = 1000 # +/- (default 50% confidence interval)

# Map raw volumes onto Ventricles/ICV with a straight-line fit over the rows
# where both Ventricles and ICV are present (positive)
nm = (Ventricles_Col > 0) & (ICV_Col > 0) # not missing: Ventricles and ICV
x, y = Ventricles_Col[nm], Ventricles_ICV_Col[nm]
lm = np.polyfit(x, y, 1)
p = np.poly1d(lm)
#*** If you want to visualise the regression, you need to install matplotlib (terminal: pip3 install matplotlib), then use the following code:
# from matplotlib import pyplot as plt
# xp = np.linspace(np.min(Ventricles_Col[Ventricles_Col>0]), np.max(Ventricles_Col[Ventricles_Col>0]), 100)
# plt.plot(x,y,'.',xp,p(xp),'-')
# plt.show()
#***

def _margin_on_icv_scale(volume_margin):
    """Half the absolute difference between the fit at +margin and at -margin."""
    return np.abs(p(volume_margin) - p(-volume_margin))/2

Ventricles_ICV_typical = p(Ventricles_typical)
Ventricles_ICV_broad_50pcMargin = _margin_on_icv_scale(Ventricles_broad_50pcMargin)
Ventricles_ICV_default_50pcMargin = _margin_on_icv_scale(Ventricles_default_50pcMargin)

#* ADAS13: typical score with a +/-10 interval
ADAS13_typical = 12
ADAS13_typical_lower, ADAS13_typical_upper = ADAS13_typical - 10, ADAS13_typical + 10
def _latest_row_with_data(rows, exam_months, has_data):
    """Return the table row of a subject's most recent exam that has data.

    rows        -- row indices (into the full table) of one subject's exams
    exam_months -- exam time of each of those rows (same length as rows)
    has_data    -- boolean mask over rows, True where the value is present

    Returns None when no exam has data. When several exams share the latest
    exam time, the last of them in row order is returned.
    """
    has_data = np.asarray(has_data, dtype=bool)
    if not has_data.any():
        return None
    latest = exam_months[has_data].max()
    return rows[has_data & (exam_months == latest)][-1]


# Predefined relative likelihoods (CN, MCI, AD) per most recent clinical status
_DX_LIKELIHOODS = {
    'NL': (0.75, 0.15, 0.1),
    'MCI': (0.1, 0.5, 0.4),
    'Dementia': (0.1, 0.1, 0.8),
}
# Used when the status is missing or not one of the labels above
_UNKNOWN_DX_LIKELIHOODS = (0.33, 0.33, 0.34)

# LB2 membership as a plain boolean array (computed once, outside the loop)
_lb2_mask = np.asarray(LB2_col, dtype=bool)

for i in range(0,N_LB2): # Each subject in LB2
    #* Rows in LB2 corresponding to Subject LB2_SubjList[i]
    subj_rows = np.where((RID_Col == LB2_SubjList[i]) & _lb2_mask)[0]
    subj_exam_dates = ExamMonth_Col[subj_rows]

    # Non-empty data among these rows
    exams_with_CLIN_STAT = CLIN_STAT_Col[subj_rows] != ''
    # Missing ADAS13 is encoded as -1, so a score of 0 counts as data
    # (consistent with the >= 0 test used for the forecast below).
    exams_with_ADAS13 = ADAS13_Col[subj_rows] >= 0
    exams_with_ventsv = Ventricles_ICV_Col[subj_rows] > 0

    #* Extract most recent non-empty data; fall back to the missing-data
    #* codes ('' and -1) when the subject has none.
    # 1. Clinical status
    row = _latest_row_with_data(subj_rows, subj_exam_dates, exams_with_CLIN_STAT)
    latest_status = CLIN_STAT_Col[row] if row is not None else ''
    most_recent_CLIN_STAT[i] = latest_status
    # 2. ADAS13 score
    row = _latest_row_with_data(subj_rows, subj_exam_dates, exams_with_ADAS13)
    latest_ADAS13 = ADAS13_Col[row] if row is not None else -1
    most_recent_ADAS13[i] = latest_ADAS13
    # 3. Ventricles volume (normalised by ICV)
    row = _latest_row_with_data(subj_rows, subj_exam_dates, exams_with_ventsv)
    latest_Ventricles_ICV = Ventricles_ICV_Col[row] if row is not None else -1
    most_recent_Ventricles_ICV[i] = latest_Ventricles_ICV

    #* "Debug mode": prints out some stuff (set display_info=1 above)
    if display_info:
        # In a script a bare expression shows nothing, so print explicitly.
        print(ExamMonth_Col[subj_rows])
        print(CLIN_STAT_Col[subj_rows])
        print(Ventricles_ICV_Col[subj_rows])
        print(ADAS13_Col[subj_rows])
        print('{0} - CLIN_STAT {1} - ADAS13 {2} - Ventricles_ICV {3}'.format(i,most_recent_CLIN_STAT[i],most_recent_ADAS13[i],most_recent_Ventricles_ICV[i]))

    #*** Construct example forecasts
    #* Clinical status forecast: predefined likelihoods per current status
    if latest_status in _DX_LIKELIHOODS:
        CNp, MCIp, ADp = _DX_LIKELIHOODS[latest_status]
    else:
        CNp, MCIp, ADp = _UNKNOWN_DX_LIKELIHOODS
        if verbose:
            print('Unrecognised status ' + latest_status)
    # Use the same clinical status probabilities for all months
    CLIN_STAT_forecast[i,:,0] = CNp
    CLIN_STAT_forecast[i,:,1] = MCIp
    CLIN_STAT_forecast[i,:,2] = ADp

    #* ADAS13 forecast: = most recent score, default confidence interval +/-1
    if latest_ADAS13 >= 0:
        ADAS13_forecast[i,:,0] = latest_ADAS13
        ADAS13_forecast[i,:,1] = max(0, latest_ADAS13 - 1) # lower bound, never below zero
        ADAS13_forecast[i,:,2] = latest_ADAS13 + 1
    else:
        # Subject has no history of ADAS13 measurement, so we'll take a
        # typical score of 12 with wide confidence interval +/-10.
        ADAS13_forecast[i,:,0] = ADAS13_typical
        ADAS13_forecast[i,:,1] = ADAS13_typical_lower
        ADAS13_forecast[i,:,2] = ADAS13_typical_upper

    #* Ventricles volume forecast: = most recent measurement, default confidence interval
    if latest_Ventricles_ICV > 0:
        Ventricles_ICV_forecast[i,:,0] = latest_Ventricles_ICV
        Ventricles_ICV_forecast[i,:,1] = latest_Ventricles_ICV - Ventricles_ICV_default_50pcMargin
        Ventricles_ICV_forecast[i,:,2] = latest_Ventricles_ICV + Ventricles_ICV_default_50pcMargin
    else:
        # Subject has no imaging history, so we'll take a typical
        # ventricles volume of 25000 & wide confidence interval +/-20000
        # (both expressed on the Ventricles/ICV scale)
        Ventricles_ICV_forecast[i,:,0] = Ventricles_ICV_typical
        Ventricles_ICV_forecast[i,:,1] = Ventricles_ICV_typical - Ventricles_ICV_broad_50pcMargin
        Ventricles_ICV_forecast[i,:,2] = Ventricles_ICV_typical + Ventricles_ICV_broad_50pcMargin

# After all subjects are filled in:
Ventricles_ICV_forecast = np.around(1e9*Ventricles_ICV_forecast,0)/1e9 # round to 9 decimal places to match MATLAB equivalent
## Now construct the forecast spreadsheet and output it.
print('Constructing the output spreadsheet {0} ...'.format(outputFile))
submission_table = pd.DataFrame()

#* One row per (subject, forecast month) - compare with submission template
submission_table['RID'] = np.repeat(LB2_SubjList, nForecasts)
submission_table['ForecastMonth'] = np.tile(np.arange(1, nForecasts+1), N_LB2)

#* Submission dates: consecutive months starting May 2010 - compare with submission template
startDate = dt.datetime(2010,5,1)
endDate = startDate + relativedelta(months=+nForecasts-1)
ForecastDates = [startDate + relativedelta(months=offset) for offset in range(nForecasts)]
ForecastDatesStrings = [month.strftime('%Y-%m') for month in ForecastDates]
submission_table['ForecastDate'] = np.tile(ForecastDatesStrings, N_LB2)

#* Forecast columns, in output order: (column name, source array, index on the last axis)
forecast_columns = [
    # 1. Clinical status
    ('CNRelativeProbability', CLIN_STAT_forecast, 0),
    ('MCIRelativeProbability', CLIN_STAT_forecast, 1),
    ('ADRelativeProbability', CLIN_STAT_forecast, 2),
    # 2. ADAS13 score, then lower and upper bounds (50% confidence interval)
    ('ADAS13', ADAS13_forecast, 0),
    ('ADAS1350_CILower', ADAS13_forecast, 1),
    ('ADAS1350_CIUpper', ADAS13_forecast, 2),
    # 3. Ventricles volume (normalised by intracranial volume), then bounds
    ('Ventricles_ICV', Ventricles_ICV_forecast, 0),
    ('Ventricles_ICV50_CILower', Ventricles_ICV_forecast, 1),
    ('Ventricles_ICV50_CIUpper', Ventricles_ICV_forecast, 2),
]

#* Pre-fill forecast data, encoding missing data as NaN
nanColumn = np.full(submission_table.shape[0], np.nan)
for column_name, _, _ in forecast_columns:
    submission_table[column_name] = nanColumn

#*** Paste in month-by-month forecasts **
for column_name, forecast, layer in forecast_columns:
    submission_table[column_name] = forecast[:,:,layer].flatten()

#* Use column names that match the submission template
template_column_names = {
    'RID': 'RID',
    'ForecastMonth': 'Forecast Month',
    'ForecastDate': 'Forecast Date',
    'CNRelativeProbability': 'CN relative probability',
    'MCIRelativeProbability': 'MCI relative probability',
    'ADRelativeProbability': 'AD relative probability',
    'ADAS13': 'ADAS13',
    'ADAS1350_CILower': 'ADAS13 50% CI lower',
    'ADAS1350_CIUpper': 'ADAS13 50% CI upper',
    'Ventricles_ICV': 'Ventricles_ICV',
    'Ventricles_ICV50_CILower': 'Ventricles_ICV 50% CI lower',
    'Ventricles_ICV50_CIUpper': 'Ventricles_ICV 50% CI upper',
}
submission_table.rename(columns=template_column_names, inplace=True)

#* Write to file
submission_table.to_csv(outputFile,index=False)