#!/usr/bin/env python
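# spp.py -- SPP peak-calling steps for the ChIP-seq pipeline. Each function
# below builds shell commands and submits them as sjm jobs; queue, project,
# and binary locations come from conf.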
import sjm
import os
import chr_maps
import idr
import conf
BIN_DIR = conf.BIN_DIR
SPP_BINARY = conf.SPP_BINARY
QUEUE = conf.QUEUE
PROJECT = conf.SGE_PROJECT
NAME = 'spp'
USE_CONTROL_LOCK = True
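# Steps shared with the PeakSeq and MACS pipelines are imported here (and
# below, between function definitions), presumably so the pipeline driver can
# resolve them from this module.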
from peakseq import archive_results
from peakseq import check_control_inputs
from macs import check_sample_inputs
from macs import prep_control
from peakseq import prep_sample
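# Merge the control's mapped-read (eland) files and convert the merged set to
# tagAlign format, as a single sjm job.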
def form_control_files(name, control):
    cmds = []
    eland_merged = os.path.join(control.temp_dir, '%s_merged_eland.txt' % control.run_name)
    control.merged_file_location = os.path.join(control.results_dir, '%s_merged.tagAlign' % control.run_name)
    # Merge eland files
    cmd = os.path.join(BIN_DIR, 'merge_and_filter_reads.py')
    cmd += ' %s' % eland_merged
    for mr in control.mapped_read_files:
        cmd += ' %s' % mr
    cmds.append(cmd)
    # Convert to tagAlign
    cmd = os.path.join(BIN_DIR, 'eland2tagAlign.py')
    cmd += ' %s' % eland_merged
    cmd += ' %s' % control.merged_file_location
    cmd += ' %s' % control.genome
    cmds.append(cmd)
    j = sjm.Job(control.run_name, cmds, modules=["samtools/1.2"], queue=QUEUE, memory="12G", sched_options="-m e")
    control.add_jobs(name, [j,])
from peakseq import form_sample_files
from macs import form_sample_files_nodups
from macs import form_replicate_files
from peakseq import complete_control
from peakseq import archive_control
from peakseq import archive_sample
from peakseq import calc_pbc
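# Build SPP jobs for each replicate and for the combined (pooled) replicate:
# convert the merged eland reads to tagAlign, then run SPP against the merged
# control, once on the full replicate and once on each of its two
# pseudoreplicates (PR1/PR2) for the downstream IDR self-consistency checks.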
def run_peakcaller(name, control, sample, options=None):
    if not options:
        options = {}
    jobs = []
    control_file = os.path.join(control.results_dir, '%s_merged.tagAlign' % control.run_name)
    for r in sample.replicates + [sample.combined_replicate,]:
        # Regular Run
        r.merged_ta = os.path.join(r.temp_dir(sample), '%s.tagAlign' % r.rep_name(sample))
        convert_cmd = os.path.join(BIN_DIR, 'eland2tagAlign.py')
        convert_cmd += ' %s' % r.merged_file_location
        convert_cmd += ' %s' % r.merged_ta
        convert_cmd += ' %s' % sample.genome
        spp_cmd = SPP_BINARY
        spp_cmd += ' -rf'  # overwrite existing output files if present
        spp_cmd += ' -c=%s' % r.merged_ta  # sample
        spp_cmd += ' -i=%s' % control_file  # control
        spp_cmd += ' -npeak=300000'  # number of peaks to call
        spp_cmd += ' -odir=%s' % r.results_dir(sample)  # output directory
        spp_cmd += ' -savr'  # save regionPeak file
        spp_cmd += ' -savp'  # save cross-correlation plot
        spp_cmd += ' -x=-500:50'  # exclude strand shifts in this range (avoids the phantom-peak region)
        spp_cmd += ' -out=%s' % os.path.join(r.results_dir(sample), 'phantomPeakStatsRep%s.tab' % r.rep_name(sample))
        if 'filtchr' in options:
            for filtchr in options['filtchr']:
                spp_cmd += ' -filtchr=%s' % filtchr  # ignore reads from this chromosome
        print(spp_cmd)  # log the fully assembled SPP command
        jobs.append(sjm.Job('SPP_' + sample.run_name + '_' + r.rep_name(sample), [convert_cmd, spp_cmd,], queue=QUEUE, project=PROJECT, memory='8G', sched_options="-m e"))
        r.spp_results = os.path.join(r.results_dir(sample), r.rep_name(sample) + '_VS_' + control.run_name + '_merged.regionPeak.gz')
        # Pseudoreplicate Runs
        r.merged_ta_pr1 = os.path.join(r.temp_dir(sample), '%s_PR1.tagAlign' % r.rep_name(sample))
        convert_cmd = os.path.join(BIN_DIR, 'eland2tagAlign.py')
        convert_cmd += ' %s' % r.pr1_merged
        convert_cmd += ' %s' % r.merged_ta_pr1
        convert_cmd += ' %s' % sample.genome
        spp_cmd = SPP_BINARY
        spp_cmd += ' -rf'
        spp_cmd += ' -c=%s' % r.merged_ta_pr1  # sample (pseudoreplicate 1)
        spp_cmd += ' -i=%s' % control_file  # control
        spp_cmd += ' -npeak=300000'
        spp_cmd += ' -odir=%s' % r.pr1_results_dir
        spp_cmd += ' -savr'
        spp_cmd += ' -savp'
        spp_cmd += ' -x=-500:50'
        spp_cmd += ' -out=%s' % os.path.join(r.results_dir(sample), 'phantomPeakStatsRep%s_PR1.tab' % r.rep_name(sample))
        if 'filtchr' in options:
            for filtchr in options['filtchr']:
                spp_cmd += ' -filtchr=%s' % filtchr
        print(spp_cmd)
        jobs.append(sjm.Job('SPP_' + sample.run_name + '_' + r.rep_name(sample) + '_PR1', [convert_cmd, spp_cmd,], queue=QUEUE, project=PROJECT, memory='8G', sched_options="-m e"))
        r.spp_results_pr1 = os.path.join(r.pr1_results_dir, r.rep_name(sample) + '_PR1_VS_' + control.run_name + '_merged.regionPeak.gz')
        r.merged_ta_pr2 = os.path.join(r.temp_dir(sample), '%s_PR2.tagAlign' % r.rep_name(sample))
        convert_cmd = os.path.join(BIN_DIR, 'eland2tagAlign.py')
        convert_cmd += ' %s' % r.pr2_merged
        convert_cmd += ' %s' % r.merged_ta_pr2
        convert_cmd += ' %s' % sample.genome
        spp_cmd = SPP_BINARY
        spp_cmd += ' -rf'
        spp_cmd += ' -c=%s' % r.merged_ta_pr2  # sample (pseudoreplicate 2)
        spp_cmd += ' -i=%s' % control_file  # control
        spp_cmd += ' -npeak=300000'
        spp_cmd += ' -odir=%s' % r.pr2_results_dir
        spp_cmd += ' -savr'
        spp_cmd += ' -savp'
        spp_cmd += ' -x=-500:50'
        spp_cmd += ' -out=%s' % os.path.join(r.results_dir(sample), 'phantomPeakStatsRep%s_PR2.tab' % r.rep_name(sample))
        if 'filtchr' in options:
            for filtchr in options['filtchr']:
                spp_cmd += ' -filtchr=%s' % filtchr
        jobs.append(sjm.Job('SPP_' + sample.run_name + '_' + r.rep_name(sample) + '_PR2', [convert_cmd, spp_cmd,], queue=QUEUE, project=PROJECT, memory='8G', sched_options="-m e"))
        r.spp_results_pr2 = os.path.join(r.pr2_results_dir, r.rep_name(sample) + '_PR2_VS_' + control.run_name + '_merged.regionPeak.gz')
    sample.add_jobs(name, jobs)
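# Decompress each replicate's SPP regionPeak output (full run and both
# pseudoreplicates) into the uncompressed files the IDR steps expect.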
def merge_results(name, sample):
    jobs = []
    for r in sample.replicates + [sample.combined_replicate,]:
        r.unfiltered_results = os.path.join(r.results_dir(sample), '%s_peaks.regionPeak' % r.rep_name(sample))
        r.unfiltered_results_pr1 = os.path.join(r.pr1_results_dir, '%s_PR1_peaks.regionPeak' % r.rep_name(sample))
        r.unfiltered_results_pr2 = os.path.join(r.pr2_results_dir, '%s_PR2_peaks.regionPeak' % r.rep_name(sample))
        unpack_cmds = [
            'zcat %s > %s' % (r.spp_results, r.unfiltered_results),
            'zcat %s > %s' % (r.spp_results_pr1, r.unfiltered_results_pr1),
            'zcat %s > %s' % (r.spp_results_pr2, r.unfiltered_results_pr2),
        ]
        jobs.append(sjm.Job('merge_results_%s' % r.rep_name(sample), unpack_cmds, queue=QUEUE, project=PROJECT, sched_options="-m e"))
    sample.add_jobs(name, jobs)
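# Placeholder step: replicate scoring is not implemented for the SPP
# pipeline, so this job just echoes to keep the step sequence intact.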
def replicate_scoring(name, sample):
    j = sjm.Job('replicate_scoring', ['echo replicate_scoring',], host='localhost', queue=QUEUE, project=PROJECT, sched_options="-m e")
    sample.add_jobs(name, [j,])
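# Create the idr/ results directory and point each replicate's narrowPeak
# attributes at the unpacked regionPeak files; the attribute wiring happens
# here at job-graph construction time, so the submitted job is a local echo.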
def form_idr_inputs(name, sample):
    idr_dir = os.path.join(sample.results_dir, 'idr')
    if not os.path.exists(idr_dir):  # os.makedirs raises if the directory already exists
        os.makedirs(idr_dir)
    for rep in sample.replicates + [sample.combined_replicate,]:
        rep.narrowPeak = rep.unfiltered_results
        rep.narrowPeak_pr1 = rep.unfiltered_results_pr1
        rep.narrowPeak_pr2 = rep.unfiltered_results_pr2
    sample.add_jobs(name, [sjm.Job('form_idr_inputs', ['echo form_idr_inputs',], queue=QUEUE, project=PROJECT, host='localhost', sched_options="-m e"),])
from peakseq import mail_results
from macs import cleanup
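# Run IDR on every pair of true replicates, on each replicate's PR1/PR2
# pseudoreplicate pair (self-consistency), and on the pooled replicate's
# pseudoreplicate pair, ranking peaks by signal.value.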
def idr_analysis(name, sample):
    jobs = []
    modules = ["r/3.0.1"]
    for i, rep_a in enumerate(sample.replicates):
        # True replicate pairs
        for j in range(i + 1, len(sample.replicates)):
            rep_b = sample.replicates[j]
            idr_name = '%s_VS_%s' % (rep_a.rep_name(sample), rep_b.rep_name(sample))
            cmd = idr.idr_analysis_cmd(rep_a.narrowPeak, rep_b.narrowPeak, os.path.join(sample.idr_dir, idr_name), 'signal.value', sample.genome)
            jobs.append(sjm.Job('idr_analysis_' + idr_name, [cmd,], queue=QUEUE, modules=modules, project=PROJECT, sched_options="-m e"))
        # Pseudoreplicates (once per replicate)
        idr_name = '%s_PR1_VS_%s_PR2' % (rep_a.rep_name(sample), rep_a.rep_name(sample))
        cmd = idr.idr_analysis_cmd(rep_a.narrowPeak_pr1, rep_a.narrowPeak_pr2, os.path.join(sample.idr_dir, idr_name + '_PR'), 'signal.value', sample.genome)
        jobs.append(sjm.Job('idr_analysis_' + idr_name, [cmd,], queue=QUEUE, modules=modules, project=PROJECT, sched_options="-m e"))
    # Pooled Pseudoreplicates
    idr_name = '%s_PR1_VS_%s_PR2' % (sample.combined_replicate.rep_name(sample), sample.combined_replicate.rep_name(sample))
    cmd = idr.idr_analysis_cmd(sample.combined_replicate.narrowPeak_pr1, sample.combined_replicate.narrowPeak_pr2, os.path.join(sample.idr_dir, idr_name), 'signal.value', sample.genome)
    jobs.append(sjm.Job('idr_analysis_' + idr_name, [cmd,], queue=QUEUE, modules=modules, project=PROJECT, sched_options="-m e"))
    sample.add_jobs(name, jobs)
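# Filter the pooled-replicate peak calls down to the IDR-thresholded set,
# sorting on column 7 (signal.value).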
def idr_filter(name, sample):
    cmd = os.path.join(BIN_DIR, 'idr_filter.py')
    cmd += ' %s' % sample.run_name
    cmd += ' %s' % sample.genome
    cmd += ' %i' % len(sample.replicates)
    cmd += ' %s' % sample.idr_dir
    cmd += ' %s' % os.path.join(sample.results_dir, 'All', sample.combined_replicate.unfiltered_results)
    cmd += ' %s' % sample.results_dir
    cmd += ' 7'  # sort column (signal.value)
    sample.add_jobs(name, [sjm.Job('idr_filter_' + sample.run_name, [cmd,], queue=QUEUE, project=PROJECT, sched_options="-m e"),])