-
Notifications
You must be signed in to change notification settings - Fork 0
/
probabel_workflow.wdl
executable file
·227 lines (189 loc) · 5.89 KB
/
probabel_workflow.wdl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# Convert the phenotype file into the layout used by the ProbABEL steps by
# running /format_probabel_phenos.py from the workflow's Docker image.
#
# Inputs (all forwarded to the script as positional arguments, in this order):
#   phenofile         phenotype file to convert
#   sample_id_header  column header of the sample ID
#   outcome           column header of the outcome
#   exposure          column header of the exposure
#   covar_names       covariate column headers, passed as ONE argument
#   delimiter         optional delimiter of the phenotype file
#   missing           optional missing-value key of the phenotype file
#   idfile            optional ID file; an empty string is passed when unset
#   ppmem             memory to request for this task, in GB (not passed to the script)
#
# Output:
#   pheno_fmt         "probabel_phenotypes.csv", expected in the task's working
#                     directory after the script finishes
task process_phenos {
    File phenofile
    File? idfile
    String sample_id_header
    String outcome
    String exposure
    String covar_names
    String? delimiter
    String? missing
    Int ppmem

    # Every interpolation is quoted so that an empty/undefined optional (which
    # renders as an empty string) or a value containing whitespace or shell
    # metacharacters still occupies exactly one positional slot. Unquoted, an
    # empty `missing` would silently shift `idfile` into its position.
    command {
        python3 /format_probabel_phenos.py "${phenofile}" "${sample_id_header}" "${outcome}" "${exposure}" "${covar_names}" "${delimiter}" "${missing}" "${idfile}"
    }

    runtime {
        docker: "quay.io/large-scale-gxe-methods/probabel-workflow"
        # Same "<n> GB" form as the other tasks in this file.
        memory: "${ppmem} GB"
    }

    output {
        File pheno_fmt = "probabel_phenotypes.csv"
    }
}
# Trim a variant info file to its first 7 tab-separated columns and replace
# dashes in columns 6 and 7 with "1", writing "<basename of infofile>.clean".
#
# Inputs:
#   infofile       variant info file; `cut -f` splits on tabs, so the file is
#                  assumed to be tab-delimited
#   infofile_base  basename of infofile, used to name the output
#
# Output:
#   sanitized      the cleaned info file
task sanitize_info {
    File infofile
    String infofile_base = basename(infofile)

    # The dash replacement is limited to fields 6 and 7. Running gsub over the
    # whole line ($0) also rewrote dashes in the leading columns (e.g. variant
    # IDs such as "1-12345", a "-" allele, or an exponent like "1e-05").
    # Lines that contain no tab are passed through unsplit by `cut`; for those
    # (fewer than 6 tab-separated fields) the previous whole-line replacement is
    # kept so that non-tab-delimited inputs are handled as before.
    command <<<
        cut -f 1-7 "${infofile}" \
        | awk 'BEGIN { FS = OFS = "\t" } { if (NF >= 6) { for (i = 6; i <= 7 && i <= NF; i++) gsub("-", "1", $i) } else { gsub("-", "1", $0) } print }' \
        > "${infofile_base}.clean"
    >>>

    runtime {
        docker: "quay.io/large-scale-gxe-methods/probabel-workflow"
        memory: "1 GB"
    }

    output {
        File sanitized = "${infofile_base}.clean"
    }
}
# Run one ProbABEL interaction analysis for a single genotype/info file pair.
#
# Inputs:
#   genofile         genotype file, passed to ProbABEL with -d
#   infofile         variant info file, passed with -i
#   phenofile        phenotype file, passed with -p
#   binary_outcome   true -> the "palogist" binary is run, false -> "palinear"
#   robust           true -> "--robust" is added to the command line
#   memory           memory request in GB
#   disk             local disk request in GB (HDD)
#   monitoring_freq  interval handed to dstat and atop
#
# Outputs:
#   res                     "probabel_res_add.out.txt" (presumably what ProbABEL
#                           writes for the "-o probabel_res" prefix; confirm)
#   system_resource_usage   dstat log
#   process_resource_usage  atop lines matching the ProbABEL binary name
task run_interaction {
    File genofile
    File infofile
    File phenofile

    Boolean binary_outcome
    Boolean robust

    Int memory
    Int disk
    Int monitoring_freq

    # Name of the ProbABEL executable; also used as the grep pattern for atop.
    String mode = if binary_outcome then "palogist" else "palinear"

    # The two monitors are started in the background; the ProbABEL call is the
    # last foreground command of the script.
    # NOTE(review): "--interaction 1" presumably selects the first covariate
    # column of the phenotype file as the interaction term; confirm against the
    # ProbABEL manual.
    command {
        dstat -c -d -m --nocolor ${monitoring_freq} > system_resource_usage.log &
        atop -x -P PRM ${monitoring_freq} | grep '(${mode})' > process_resource_usage.log &
        /ProbABEL/src/${mode} \
        -p ${phenofile} \
        -d ${genofile} \
        -i ${infofile} \
        --interaction 1 \
        ${true="--robust" false="" robust} \
        -o probabel_res
    }

    runtime {
        docker: "quay.io/large-scale-gxe-methods/probabel-workflow"
        memory: "${memory} GB"
        disks: "local-disk ${disk} HDD"
        gpu: false
        # NOTE(review): dx_timeout looks like a backend-specific attribute
        # (likely DNAnexus); confirm which execution engines honour it.
        dx_timeout: "7D0H00M"
    }

    output {
        File res = "probabel_res_add.out.txt"
        File system_resource_usage = "system_resource_usage.log"
        File process_resource_usage = "process_resource_usage.log"
    }
}
# Reformat one ProbABEL result file by running /format_probabel_output.py.
#
# Inputs:
#   resfile       result file to reformat
#   exposure      exposure name, passed as the script's second argument
#   outfile_base  basename of resfile
#   outfile       output file name: outfile_base with ".fmt" appended
#
# Output:
#   res_fmt       the file named by `outfile`
task standardize_output {
    File resfile
    String exposure

    String outfile_base = basename(resfile)
    String outfile = outfile_base + ".fmt"

    command {
        python3 /format_probabel_output.py ${resfile} ${exposure} ${outfile}
    }

    runtime {
        docker: "quay.io/large-scale-gxe-methods/probabel-workflow"
        memory: "2 GB"
    }

    output {
        # The String is coerced to a File path relative to the working directory.
        File res_fmt = outfile
    }
}
# Concatenate per-file result tables into a single "all_results.txt".
#
# Input:
#   results_array  result files to merge; element 0 supplies the header line
#
# Output:
#   all_results    the header of the first file followed by every file's
#                  contents from line 2 onward, in array order
task cat_results {
    Array[File] results_array

    # The header is written once from the first file, then each file is
    # appended without its own first line.
    command {
        head -1 ${results_array[0]} > all_results.txt && \
        for res in ${sep=" " results_array}; do tail -n +2 $res >> all_results.txt; done
    }

    runtime {
        docker: "quay.io/large-scale-gxe-methods/probabel-workflow"
        disks: "local-disk 5 HDD"
    }

    output {
        File all_results = "all_results.txt"
    }
}
# End-to-end ProbABEL interaction workflow:
#   1. process_phenos      reformat the phenotype file once
#   2. sanitize_info       clean every info file (scattered)
#   3. run_interaction     one ProbABEL run per genotype file (scattered)
#   4. standardize_output  reformat each result file (scattered)
#   5. cat_results         merge all reformatted results into one file
#
# genofiles and infofiles are parallel arrays: run_interaction pairs
# genofiles[i] with the sanitized version of infofiles[i], so both arrays must
# have the same length and the same ordering.
workflow run_probabel {
    Array[File] genofiles
    Array[File] infofiles
    File phenofile
    File? idfile
    String sample_id_header
    String outcome
    Boolean binary_outcome
    String exposure_names

    # Optional inputs with defaults.
    String? covar_names = ""
    String? delimiter = ","
    String? missing = "NA"
    Boolean? robust = true
    Int? memory = 10
    Int? disk = 20
    Int? monitoring_freq = 1

    # Memory for phenotype preprocessing: twice the phenotype file size in GB
    # (rounded up) plus 1 GB.
    Int ppmem = 2 * ceil(size(phenofile, "GB")) + 1

    call process_phenos {
        input:
            phenofile = phenofile,
            idfile = idfile,
            sample_id_header = sample_id_header,
            outcome = outcome,
            exposure = exposure_names,
            covar_names = covar_names,
            delimiter = delimiter,
            missing = missing,
            ppmem = ppmem
    }

    scatter (infofile in infofiles) {
        call sanitize_info {
            input:
                infofile = infofile
        }
    }

    # Index-based scatter so each genotype file is matched with the sanitized
    # info file at the same position.
    scatter (i in range(length(genofiles))) {
        call run_interaction {
            input:
                genofile = genofiles[i],
                infofile = sanitize_info.sanitized[i],
                phenofile = process_phenos.pheno_fmt,
                binary_outcome = binary_outcome,
                robust = robust,
                memory = memory,
                disk = disk,
                monitoring_freq = monitoring_freq
        }
    }

    scatter (resfile in run_interaction.res) {
        call standardize_output {
            input:
                resfile = resfile,
                exposure = exposure_names
        }
    }

    call cat_results {
        input:
            results_array = standardize_output.res_fmt
    }

    output {
        File results = cat_results.all_results
        Array[File] system_resource_usage = run_interaction.system_resource_usage
        Array[File] process_resource_usage = run_interaction.process_resource_usage
    }

    parameter_meta {
        genofiles: "Array of genotype filepaths in Minimac dosage format."
        infofiles: "Variant information files. NOTE: preprocessing step within this workflow will trim the info file to the first 7 columns and sanitize columns 6 & 7 (typically Quality and Rsq) by replacing dashes with a value of 1. Ideally, this input file contains only numeric values in columns 6 & 7."
        phenofile: "Phenotype filepath."
        idfile: "Optional list of IDs associated with the .dose file (one per line) for use in filtering and aligning the phenotype file."
        sample_id_header: "Column header name of sample ID in phenotype file."
        outcome: "Column header name of phenotype data in phenotype file."
        binary_outcome: "Boolean: is the outcome binary? Otherwise, quantitative is assumed."
        exposure_names: "Column header name(s) of the exposures for genotype interaction testing (space-delimited). Only one exposure is currently allowed."
        covar_names: "Column header name(s) of any covariates for which only main effects should be included (space-delimited). This set should not overlap with exposure_names."
        delimiter: "Delimiter used in the phenotype file."
        missing: "Missing value key of phenotype file."
        robust: "Boolean: should robust (a.k.a. sandwich/Huber-White) standard errors be used?"
        memory: "Requested memory for the interaction testing step (in GB)."
        disk: "Requested disk space for the interaction testing step (in GB)."
        monitoring_freq: "Delay between each output for process monitoring (in seconds). Default is 1 second."
    }

    meta {
        author: "Kenny Westerman"
        email: "[email protected]"
        description: "Run interaction tests using the ProbABEL package and return summary statistics for 1-DF and 2-DF tests."
    }
}