forked from noxtoby/TADPOLE
-
Notifications
You must be signed in to change notification settings - Fork 0
/
makeDummyD4.py
100 lines (72 loc) · 3.73 KB
/
makeDummyD4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import time
import random
import numpy as np
from datetime import timedelta
from datetime import datetime
import argparse
# Command-line front end. The script takes no options; the parser exists so that
# `-h/--help` prints the description below and unexpected arguments are rejected.
# RawDescriptionHelpFormatter keeps the description's line breaks as written.
parser = argparse.ArgumentParser(
    usage='python3 makeDummyD4.py',
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=r'''
The program creates a dummy D4 dataset from an already generated forecast CSV spreadsheet:
TADPOLE_Submission_TeamName1.csv
This spreadsheet is assumed to be in the current folder.
This is done by randomly selecting some months as ADNI3 future visits
and making the D4 entries by adding noise to the estimated values.
Author: Razvan V. Marinescu, [email protected]
''')

# Only consume sys.argv when run as a script, so that importing this module
# does not try to parse the host program's command line.
if __name__ == '__main__':
    parser.parse_args()

# Date format used to parse the sampling-window bounds and to render the
# dates written to the output table.
stdDateFormat = '%Y-%m-%d'
def strTimeProp(start, end, format, prop):
    """Return the datetime lying a given proportion of the way through an interval.

    Args:
        start: Interval start, a string written in ``format``.
        end: Interval end, a string written in ``format``.
        format: ``strptime``-style format used to parse ``start`` and ``end``.
        prop: Fraction of the interval to advance past ``start``; 0 yields
            ``start`` and 1 yields ``end``.

    Returns:
        A ``datetime`` object (not a formatted string) equal to
        ``start + prop * (end - start)``.

    Raises:
        ValueError: If ``start`` or ``end`` does not match ``format``.
    """
    startTime = datetime.strptime(start, format)
    endTime = datetime.strptime(end, format)
    # Interpolate with timedelta arithmetic instead of round-tripping through
    # time.mktime()/datetime.fromtimestamp(): the epoch round trip depends on
    # the local timezone, so an interval spanning a DST change would yield a
    # point shifted away from the wall-clock proportion.
    return startTime + (endTime - startTime) * float(prop)
def randomDate(start, end, prop):
    """Return the point a fraction ``prop`` of the way from ``start`` to ``end``.

    ``start`` and ``end`` are date strings parsed with ``stdDateFormat``.
    No randomness is generated here: the caller supplies ``prop``, and the
    work is delegated to ``strTimeProp``.
    """
    interpolated = strTimeProp(start, end, stdDateFormat, prop)
    return interpolated
# Build a dummy D4 table: one simulated follow-up visit per subject, derived
# from the forecast closest in time to a randomly drawn scan date.

# Forecast spreadsheet, read from the current working directory.
df = pd.read_csv('TADPOLE_Submission_TeamName1.csv')

unqSubj = np.unique(df['RID'])
nrUnqSubj = unqSubj.shape[0]

# Window from which each subject's simulated scan date is drawn.
startDate = '2018-03-10'
endDate = '2019-02-28'

# perturb the cognitive assessment date from the scan date by max 10 days
tdeltaCog = timedelta(days=10)

# considers every month estimate ('YYYY-MM') to be the first day of that month
df['Forecast Date'] = [datetime.strptime(x, '%Y-%m') for x in df['Forecast Date']]

diagStr = ['CN', 'MCI', 'AD']
d4Columns = ['RID', 'CognitiveAssessmentDate', 'Diagnosis', 'ADAS13', 'ScanDate', 'Ventricles']

# Seed BOTH generators. The dates below come from the stdlib `random` module
# while the diagnosis and measurement noise come from `np.random`; seeding only
# NumPy left the generated dates different on every run.
np.random.seed(1)
random.seed(1)

# Rows are collected in a list and turned into a DataFrame once at the end,
# rather than written row by row into a NaN-filled float frame, which relies on
# pandas silently upcasting the columns when strings are assigned.
d4Rows = []
for rid in unqSubj:
    # select a random date for this subject as its ADNI3 follow-up
    randDateScanCurr = randomDate(startDate, endDate, random.random())

    currSubjData = df[df['RID'] == rid].reset_index(drop=True)

    # set the cognitive assessment date to be slightly different,
    # as this has always been the case in ADNI
    randDateCog = randomDate((randDateScanCurr - tdeltaCog).strftime(stdDateFormat),
                             (randDateScanCurr + tdeltaCog).strftime(stdDateFormat),
                             random.random())

    # find the forecast closest in time to the scan date; its estimates are
    # perturbed with noise and used as the "true" D4 values
    dateGaps = (currSubjData['Forecast Date'] - randDateScanCurr).abs()
    nearest = currSubjData.iloc[int(np.argmin(dateGaps.to_numpy()))]

    randDiag = diagStr[np.random.randint(0, 3)]

    # std=50%_CI/(2*0.67) .. taken from en.wikipedia.org/wiki/68%E2%80%9395%E2%80%9399.7_rule#Table_of_numerical_values
    adasStd = (nearest['ADAS13 50% CI upper'] - nearest['ADAS13 50% CI lower']) / (2 * 0.67)
    randADAS = round(np.random.normal(loc=nearest['ADAS13'], scale=adasStd))

    ventStd = (nearest['Ventricles_ICV 50% CI upper']
               - nearest['Ventricles_ICV 50% CI lower']) / (2 * 0.67)
    randVent = np.random.normal(loc=nearest['Ventricles_ICV'], scale=ventStd)

    # set these randomly generated values to be the true measurements (i.e. D4 dataset)
    d4Rows.append([rid,
                   randDateCog.strftime(stdDateFormat),
                   randDiag,
                   randADAS,
                   randDateScanCurr.strftime(stdDateFormat),
                   randVent])

trueDf = pd.DataFrame(d4Rows, columns=d4Columns)
trueDf.RID = trueDf.RID.astype(int)
trueDf.ADAS13 = trueDf.ADAS13.astype(int)
trueDf.Ventricles = trueDf.Ventricles.astype(float)

trueDf.to_csv('D4_dummy.csv', index=False)