#!/usr/bin/env python
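# NOTE: this script is written for Python 2 (raw_input, 0700-style octal literals).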
import gzip
import os
import glob
import sys
import shutil
import subprocess
import re
import pprint
import yaml
import argparse
"""
This python script will:
1. Submit a job array over all chunks.
2. Submit a final job that will run when the chunks job array finishes.
The parameters should be specified in a YAML params file passed as the script's only argument; nothing else should need to be changed.
PARAMS
------
chunks
A glob pattern matching the chunk files to be computed. Use the wildcard character * to grab all of your files.
chr-regex
The regular expression to find the chromosome.
memory
How many Mb you want to use for the job array risk calculation.
weights
The path to the weights file.
working-dir
Where all of the params, scripts, o, e, and results are saved. See OUTPUT below.
sample-file
The path to a sample file.
weights-p-value-max
The maximum p-value of a weight to include in the risk calculation.
farm-group (OPTIONAL)
The farm-group to run the jobs against.
queue
Which farm queue to send the jobs to.
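EXAMPLE
-------
Run as:

    risky.py params.yaml

where params.yaml is a YAML file along these lines (all paths and values here
are illustrative placeholders, not real data):

    chunks: /path/to/chunks/chunk_*.gz
    memory: 2000
    weights: /path/to/weights.txt
    working-dir: /path/to/results/my-risk-run
    sample-file: /path/to/cohort.sample
    weights-p-value-max: 0.0005
    queue: normal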
OUTPUT
------
working dir/
    name/
        params
        o/
        e/
        scripts/
        results/
AUTHORS
-------
Daniel Rice
Mari Niemi
"""
def get_choice():
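    """Ask the user for a yes/no answer; an empty response counts as yes."""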
    yes = set(['yes', 'y', 'ye', ''])
    no = set(['no', 'n'])
    choice = raw_input('([y]/n)? ').lower()
    if choice in yes:
        return True
    else:
        return False
def get_script_path():
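    """Return the absolute path to this script."""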
    return os.path.realpath(__file__)
def get_script_directory():
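    """Return the directory containing this script."""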
    return os.path.dirname(os.path.realpath(__file__))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('params')
    args = parser.parse_args()
    with open(args.params) as f:
        p = yaml.safe_load(f)
    observed = set(p.keys())
    required = set(['working-dir', 'weights', 'memory', 'chunks', 'weights-p-value-max'])
    short = required - observed
    if short:
        print('Error, the params file does not contain all of the parameters! Missing the following:\n')
        for el in short:
            print('\t{}'.format(el))
        sys.exit()
    print('\n\nUsing the following parameters from file {}:\n'.format(args.params))
    pprint.pprint(p)
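    # Set up a fresh working directory (asking before deleting an existing one)
    # and record the parameters in it.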
    p['job-name'] = os.path.basename(p['working-dir'])
    if os.path.exists(p['working-dir']):
        print('\n\n{working-dir} already exists. Do you want me to delete it and proceed?'.format(**p))
        choice = get_choice()
        if not choice:
            sys.exit()
        else:
            shutil.rmtree(p['working-dir'])
    os.makedirs(p['working-dir'])
    params_file = os.path.join(p['working-dir'], 'params')
    with open(params_file, 'w') as f:
        f.write('Running python script:\n')
        f.write(get_script_path())
        f.write('\n\nWith parameters:\n')
        pprint.pprint(p, f)
    for directory in ['o', 'e', 'scripts', 'results']:
        path = os.path.join(p['working-dir'], directory)
        os.makedirs(path)
        p[directory] = path
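    # Count the chunks and work out the number of samples, either from the
    # sample file or from the first chunk's header.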
    chunks = glob.glob(p['chunks'])
    p['n-chunks'] = len(chunks)
    if 'sample-file' in p:
        with open(p['sample-file']) as f:
            n_samples = sum(1 for line in f)
        _, file_extension = os.path.splitext(p['sample-file'])
        if file_extension == '.sample':
            n_samples -= 2
        p['n-samples'] = n_samples
        p['sample-file'] = '--sample_file={sample-file}'.format(**p)
    else:
        _, file_extension = os.path.splitext(chunks[0])
        if file_extension.lower() == '.gz':
            f = gzip.open(chunks[0])
        else:
            f = open(chunks[0])
        header = f.readline()
        f.close()
        header = header.split()
        genotypes = header[5:]
        n_samples = len(genotypes) / 3.
        assert n_samples == int(n_samples)
        p['n-samples'] = int(n_samples)
        p['sample-file'] = ''
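    # Write the per-element script run by each LSF array job; $LSB_JOBINDEX
    # selects which chunk that element processes.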
    p['risky-chunk'] = os.path.join(get_script_directory(), 'risky_chunk.py')
    array_job_element = """
#!/usr/bin/env bash
CHUNKS=({chunks})
{risky-chunk} \\
--weights_path={weights} \\
--results_dir={results} \\
--n_samples={n-samples} \\
--weights_p_value_max={weights-p-value-max} \\
--chunk=${{CHUNKS[$LSB_JOBINDEX - 1]}}
""".format(**p)
    p['array-job-element'] = os.path.join(p['scripts'], 'array_job_element.sh')
    with open(p['array-job-element'], 'w') as f:
        f.write(array_job_element)
    os.chmod(p['array-job-element'], 0700)
    if 'farm-group' in p:
        p['farm-group'] = '-G {farm-group}'.format(**p)
    else:
        p['farm-group'] = ''
    if 'queue' in p:
        p['queue'] = '-q {queue}'.format(**p)
    else:
        p['queue'] = ''
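    # Write and submit the bsub script that launches the job array over all chunks.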
    run_array = """
#!/usr/bin/env bash
bsub \\
-J"{job-name}[1-{n-chunks}]" \\
{farm-group} \\
{queue} \\
-o {o}/chunk-%J-%I \\
-e {e}/chunk-%J-%I \\
-M{memory} \\
-R"select[mem>{memory}] rusage[mem={memory}]" \\
{array-job-element}
""".format(**p)
    fpath = os.path.join(p['scripts'], 'run_array.sh')
    with open(fpath, 'w') as f:
        f.write(run_array)
    os.chmod(fpath, 0700)
    print('\nRunning chunks job array from script at: ' + fpath)
    job_submit = subprocess.check_output(fpath, shell=True)
    print(job_submit)
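    # Extract the LSF job ID from the bsub output so the combine job can wait on it.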
    RE_JOBID = re.compile(r'Job \<(\d+)\>')
    m = RE_JOBID.search(job_submit)
    if not m:
        print('Could not find the Job ID for the chunks job array!')
        sys.exit()
    g = m.groups()
    p['job-id'] = g[0]
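    # Write and submit the combine job, held until the chunks job array has finished.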
    p['risky-combine'] = os.path.join(get_script_directory(), 'risky_combine.py')
    combine = """
#!/usr/bin/env bash
bsub \\
-w "done({job-id})" \\
{farm-group} \\
{queue} \\
-o {o}/combine-%J \\
-e {e}/combine-%J \\
-M{memory} \\
-R"select[mem>{memory}] rusage[mem={memory}]" \\
{risky-combine} \\
--results_dir={results} \\
{sample-file} \\
--n_chunks={n-chunks}
""".format(**p)
    print(combine)
    fpath = os.path.join(p['scripts'], 'combine.sh')
    with open(fpath, 'w') as f:
        f.write(combine)
    os.chmod(fpath, 0700)
    print('Queueing combine results job to run when {job-id} finishes'.format(**p))
    job_submit = subprocess.check_output(fpath, shell=True)
    print(job_submit)
if __name__ == '__main__':
    main()