Skip to content

Commit 9a749d0

Browse files
author
gitliver
committed
expose Trinity mem in main script, move code from assembly.py to modules (reduce bloating)
1 parent 5dc7f61 commit 9a749d0

File tree

3 files changed

+118
-100
lines changed

3 files changed

+118
-100
lines changed

helpers/assembly_helpers.py

+107
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
#!/usr/bin/env python
2+
3+
"""
4+
Helper functions for the assembly step
5+
~~~~~~
6+
"""
7+
8+
# -------------------------------------
9+
10+
def computedistrib(infile, outfile):
    """Write the length distribution of the contigs in a FASTA file.

    Each output line describes one distinct contig length, longest first,
    as four tab-separated fields: the length, the number of contigs with
    that length, the running number of contigs seen so far over the total
    number of contigs, and that running fraction as a whole percentage.

    A contig's length is the sum of all sequence lines found between one
    '>' header and the next, so wrapped (multi-line) FASTA is measured per
    contig rather than per line. Blank lines are ignored and records that
    contain no sequence are not counted.

    Args:
        infile: path of the FASTA file to read.
        outfile: path of the report to write (overwritten if it exists).
    """

    from collections import Counter

    # one entry per contig
    lengths = []
    # length accumulated so far for the record being read
    current = 0

    with open(infile, 'r') as g:
        for line in g:
            line = line.rstrip()
            if line.startswith('>'):
                # a header closes the previous record, if it had any sequence
                if current:
                    lengths.append(current)
                current = 0
            else:
                current += len(line)

    # the last record has no header after it to close it
    if current:
        lengths.append(current)

    # count occurrences of each length
    xcount = Counter(lengths)

    # total number of contigs
    tot = len(lengths)

    # running number of contigs, accumulated from the longest length down
    runningsum = 0

    with open(outfile, 'w') as f:
        # an empty input never enters this loop, so tot is never zero here
        for length in sorted(xcount, reverse=True):
            runningsum += xcount[length]
            # floor division gives the same whole percentage on Python 2 and 3
            f.write('{}\t{}\t{}/{}\t{}%\n'.format(
                length,
                xcount[length],
                runningsum,
                tot,
                100 * runningsum // tot))
40+
41+
# -------------------------------------
42+
43+
def _write_zero_coverage(handle, contig, start, stop):
    """Write a zero-coverage line for each position in [start, stop)."""
    for i in range(start, stop):
        handle.write(contig + '\t' + str(i) + '\t0\n')


def formatpileup(infile, idxfile, outfile):
    """Format the pileup file for computing entropy.

    Rewrites the pileup as three tab-separated columns (contig id, position,
    value of the fourth pileup column) and inserts a line with a count of 0
    for every position the pileup skips: before the first reported position
    of a contig, inside gaps between reported positions, and after the last
    reported position up to the contig length given in the index file.

    Contigs that never appear in the pileup are not written at all. Blank
    lines in either input are ignored, and an empty pileup produces an empty
    output file.

    Args:
        infile: path of the pileup file; columns 1, 2 and 4 are read as
            contig id, position and number of reads.
        idxfile: path of a whitespace-delimited file whose first two columns
            are contig id and contig length (the original comment describes
            it as the output of samtools idxstats).
        outfile: path of the padded file to write (overwritten if it exists).

    Raises:
        KeyError: if a contig in the pileup is missing from the index file.
    """

    # contig id -> contig length, kept as read and converted on use
    idx = {}

    with open(idxfile, 'r') as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            idx[fields[0]] = fields[1]

    myid = ''   # id of the contig currently being written
    pos = 0     # last position written for that contig

    with open(outfile, 'w') as f:
        with open(infile, 'r') as g:
            for line in g:
                # split once per line instead of once per field access
                fields = line.split()
                if not fields:
                    continue

                contig = fields[0]
                posfield = fields[1]
                curpos = int(posfield)
                numrds = fields[3]

                if contig != myid:
                    # close the previous contig: pad from its last covered
                    # position to its full length
                    if myid:
                        _write_zero_coverage(f, myid, pos + 1, int(idx[myid]) + 1)

                    # if the new contig starts at position > 1, pad its head
                    _write_zero_coverage(f, contig, 1, curpos)

                    myid = contig
                else:
                    # fill any gap since the previous position (no-op when
                    # the positions are consecutive)
                    _write_zero_coverage(f, myid, pos + 1, curpos)

                # the position is written exactly as it appeared in the input
                f.write(myid + '\t' + posfield + '\t' + numrds + '\n')

                # becomes the previous position for the next iteration
                pos = curpos

            # pad the tail of the last contig; skipped for an empty pileup,
            # where there is no contig to look up
            if myid:
                _write_zero_coverage(f, myid, pos + 1, int(idx[myid]) + 1)
102+
103+
# -------------------------------------
104+
105+
# Script entry guard: the visible module only defines helper functions, so
# running it directly does nothing.
if __name__ == "__main__":
106+
107+
pass

pandora.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,8 @@ def get_arg():
6666
step 3: blast contigs, \
6767
step 4: orf discovery, \
6868
step 5: reporting (default: 12345 - i.e, steps 1 through 5).')
69+
parser_scan.add_argument('--trinitymem', default='50G', help='max memory for Trinity (default: 50G)')
70+
parser_scan.add_argument('--trinitycores', default='8', help='number of cores for Trinity (default: 8)')
6971
parser_scan.set_defaults(which='scan')
7072

7173
# create the parser for the 'aggregate' command
@@ -201,9 +203,11 @@ def scan_main(args):
201203
int(args.verbose),
202204
int(args.noclean),
203205
args.gtf),
204-
'2': '{}/scripts/assembly.py --scripts {} --verbose {} --noclean {}'.format(
206+
'2': '{}/scripts/assembly.py --scripts {} --trinitymem {} --trinitycores {} --verbose {} --noclean {}'.format(
205207
args.scripts,
206208
args.scripts,
209+
args.trinitymem,
210+
args.trinitycores,
207211
int(args.verbose),
208212
int(args.noclean)),
209213
'3': '{}/scripts/blast_wrapper.py --scripts {} --threshold {} --db {} --id {} --verbose {} --noclean {} --nosge {}'.format(

scripts/assembly.py

+6-99
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ def get_arg():
1818
parser.add_argument('-1', '--mate1', default='host_separation/unmapped_1.fastq.gz', help='mate1')
1919
parser.add_argument('-2', '--mate2', default='host_separation/unmapped_2.fastq.gz', help='mate2')
2020
parser.add_argument('-o', '--outputdir', default='assembly_trinity', help='the output directory')
21-
parser.add_argument('--trinitymem', default='50G', help='max memory for Trinity')
22-
parser.add_argument('--trinitycores', default='8', help='number of cores for Trinity')
21+
parser.add_argument('--trinitymem', required=True, help='max memory for Trinity')
22+
parser.add_argument('--trinitycores', required=True, help='number of cores for Trinity')
2323
parser.add_argument('-l', '--logsdir', help='the logs directory')
2424
parser.add_argument('-d', '--scripts', help='the git repository directory')
2525
parser.add_argument('--noclean', help='do not delete temporary intermediate files (default: off)')
@@ -32,7 +32,9 @@ def get_arg():
3232
# need this to get local modules
3333
sys.path.append(args.scripts)
3434
global hp
35+
global ahp
3536
from helpers import helpers as hp
37+
from helpers import assembly_helpers as ahp
3638

3739
# error checking: exit if previous step produced zero output
3840
for i in [args.mate1, args.mate2]:
@@ -83,7 +85,7 @@ def assembly(args):
8385

8486
# compute simple distribution
8587
# cat assembly/contigs_trinity.fasta | paste - - | awk '{print length($2)}' | sort -nr | ${d}/scripts/tablecount | awk -v tot=${num_contigs} 'BEGIN{x=0}{x+=$2; print $1"\t"$2"\t"x"/"tot"\t"int(100*x/tot)"%"}' > assembly/contigs.distrib.txt
86-
computedistrib(myoutput2, 'assembly/contigs.distrib.txt')
88+
ahp.computedistrib(myoutput2, 'assembly/contigs.distrib.txt')
8789

8890
if not int(args.noclean):
8991
cmd = 'rm -rf assembly_trinity'
@@ -96,39 +98,6 @@ def assembly(args):
9698

9799
# -------------------------------------
98100

99-
def computedistrib(infile, outfile):
100-
"""compute simple distribution of contigs"""
101-
102-
# a list of contig lengths
103-
x = []
104-
105-
with open(infile, 'r') as g:
106-
for line in g:
107-
if line[0] != '>':
108-
x.append(len(line.rstrip()))
109-
110-
from collections import Counter
111-
112-
# count occurences of each length
113-
xcount = Counter(x)
114-
115-
# tot length
116-
tot = sum(dict(xcount).values())
117-
118-
# running sum
119-
runningsum = 0
120-
121-
with open(outfile, 'w') as f:
122-
for i in sorted(dict(xcount), reverse=True):
123-
runningsum += xcount[i]
124-
f.write(str(i) + '\t')
125-
f.write(str(xcount[i]) + '\t')
126-
f.write(str(runningsum) + '/' + str(tot) + '\t')
127-
f.write(str(int(100*runningsum/tot)) + '%')
128-
f.write('\n')
129-
130-
# -------------------------------------
131-
132101
def remap(args, contigs):
133102
"""map contigs back onto assembly"""
134103

@@ -163,7 +132,7 @@ def remap(args, contigs):
163132
hp.run_cmd(cmd, args.verbose, 0)
164133

165134
# format pileup file - i.e., add zeros to uncovered positions
166-
formatpileup('assembly/reads2contigs.pileup', 'assembly/reads2contigs.stats.txt', 'assembly/reads2contigs.format.pileup')
135+
ahp.formatpileup('assembly/reads2contigs.pileup', 'assembly/reads2contigs.stats.txt', 'assembly/reads2contigs.format.pileup')
167136

168137
if not int(args.noclean):
169138
cmd = 'rm -r assembly/ref_remap'
@@ -175,68 +144,6 @@ def remap(args, contigs):
175144

176145
# -------------------------------------
177146

178-
def formatpileup(infile, idxfile, outfile):
179-
"""format the pileup file for computing entropy"""
180-
181-
# id 2 length dict (output of samtools idxstats)
182-
idx = {}
183-
184-
# load idx file
185-
with open(idxfile, 'r') as f:
186-
for line in f:
187-
# map id to length of contig
188-
idx[line.split()[0].strip()] = line.split()[1].strip()
189-
190-
myid = '' # contig id
191-
pos = '' # position
192-
193-
with open(outfile, 'w') as f:
194-
with open(infile, 'r') as g:
195-
for line in g:
196-
# get number reads
197-
numrds = line.split()[3]
198-
199-
# if beginning of a new contig (id != previous id)
200-
if line.split()[0] != myid:
201-
# if change (and not first contig), check if previous contig was covered until the end
202-
# if not covered, pad with zeros
203-
if myid:
204-
if int(idx[myid]) > int(pos):
205-
for i in range(int(pos) + 1, int(idx[myid]) + 1):
206-
f.write(myid + '\t' + str(i) + '\t0\n')
207-
208-
# if contig starts at postion > 1, pad with zeros
209-
if int(line.split()[1]) > 1:
210-
for i in range(1, int(line.split()[1])):
211-
f.write(line.split()[0] + '\t' + str(i) + '\t0\n')
212-
213-
# set new id
214-
myid = line.split()[0]
215-
216-
# write current line
217-
f.write(myid + '\t' + line.split()[1] + '\t' + numrds + '\n')
218-
219-
# if discontinuity (position - previous position > 1), pad with zeros
220-
elif (int(line.split()[1]) - int(pos)) > 1:
221-
for i in range(int(pos) + 1, int(line.split()[1])):
222-
f.write(myid + '\t' + str(i) + '\t0\n')
223-
224-
f.write(myid + '\t' + line.split()[1] + '\t' + numrds + '\n')
225-
226-
# otherwise, simply write line
227-
else:
228-
f.write(myid + '\t' + line.split()[1] + '\t' + numrds + '\n')
229-
230-
# get position (this will become previous position for next iteration)
231-
pos = line.split()[1]
232-
233-
# check if last contig covered until the end
234-
if int(idx[myid]) > int(pos):
235-
for i in range(int(pos) + 1, int(idx[myid]) + 1):
236-
f.write(myid + '\t' + str(i) + '\t0\n')
237-
238-
# -------------------------------------
239-
240147
def main():
241148
"""Main function"""
242149

0 commit comments

Comments
 (0)