@@ -18,8 +18,8 @@ def get_arg():
18
18
parser .add_argument ('-1' , '--mate1' , default = 'host_separation/unmapped_1.fastq.gz' , help = 'mate1' )
19
19
parser .add_argument ('-2' , '--mate2' , default = 'host_separation/unmapped_2.fastq.gz' , help = 'mate2' )
20
20
parser .add_argument ('-o' , '--outputdir' , default = 'assembly_trinity' , help = 'the output directory' )
21
- parser .add_argument ('--trinitymem' , default = '50G' , help = 'max memory for Trinity' )
22
- parser .add_argument ('--trinitycores' , default = '8' , help = 'number of cores for Trinity' )
21
+ parser .add_argument ('--trinitymem' , required = True , help = 'max memory for Trinity' )
22
+ parser .add_argument ('--trinitycores' , required = True , help = 'number of cores for Trinity' )
23
23
parser .add_argument ('-l' , '--logsdir' , help = 'the logs directory' )
24
24
parser .add_argument ('-d' , '--scripts' , help = 'the git repository directory' )
25
25
parser .add_argument ('--noclean' , help = 'do not delete temporary intermediate files (default: off)' )
@@ -32,7 +32,9 @@ def get_arg():
32
32
# need this to get local modules
33
33
sys .path .append (args .scripts )
34
34
global hp
35
+ global ahp
35
36
from helpers import helpers as hp
37
+ from helpers import assembly_helpers as ahp
36
38
37
39
# error checking: exit if previous step produced zero output
38
40
for i in [args .mate1 , args .mate2 ]:
@@ -83,7 +85,7 @@ def assembly(args):
83
85
84
86
# compute simple distribution
85
87
# cat assembly/contigs_trinity.fasta | paste - - | awk '{print length($2)}' | sort -nr | ${d}/scripts/tablecount | awk -v tot=${num_contigs} 'BEGIN{x=0}{x+=$2; print $1"\t"$2"\t"x"/"tot"\t"int(100*x/tot)"%"}' > assembly/contigs.distrib.txt
86
- computedistrib (myoutput2 , 'assembly/contigs.distrib.txt' )
88
+ ahp . computedistrib (myoutput2 , 'assembly/contigs.distrib.txt' )
87
89
88
90
if not int (args .noclean ):
89
91
cmd = 'rm -rf assembly_trinity'
@@ -96,39 +98,6 @@ def assembly(args):
96
98
97
99
# -------------------------------------
98
100
99
def computedistrib(infile, outfile):
    """Compute a simple length distribution of the contigs in a FASTA file.

    Each record's length is the total length of its sequence lines (trailing
    whitespace stripped), so both single-line and line-wrapped FASTA files
    give one length per contig. Blank lines are ignored.

    Args:
        infile: path of the FASTA file to read.
        outfile: path of the tab-separated table to write. One row per
            distinct contig length, longest first, with the columns:
            length, number of contigs of that length, running count over
            total count (e.g. ``2/3``), and the running count as an integer
            percentage (e.g. ``66%``).
    """

    # kept function-local, as in the original, so the block is self-contained
    from collections import Counter

    # a list of contig lengths, one entry per FASTA record
    lengths = []
    # length accumulated for the record being read; None until the first
    # sequence line of that record is seen
    current = None

    with open(infile, 'r') as g:
        for line in g:
            if line.startswith('>'):
                # a header closes the previous record (if it had any sequence)
                if current is not None:
                    lengths.append(current)
                current = None
                continue

            seq = line.rstrip()
            if not seq:
                # blank line: not a zero-length contig
                continue
            current = (current or 0) + len(seq)

    # close the last record in the file
    if current is not None:
        lengths.append(current)

    # count occurrences of each length
    xcount = Counter(lengths)

    # total number of contigs
    tot = len(lengths)

    # running count of contigs at least as long as the current length
    runningsum = 0

    with open(outfile, 'w') as f:
        for length in sorted(xcount, reverse=True):
            runningsum += xcount[length]
            f.write('\t'.join([
                str(length),
                str(xcount[length]),
                str(runningsum) + '/' + str(tot),
                str(int(100 * runningsum / tot)) + '%',
            ]))
            f.write('\n')
129
-
130
- # -------------------------------------
131
-
132
101
def remap (args , contigs ):
133
102
"""map contigs back onto assembly"""
134
103
@@ -163,7 +132,7 @@ def remap(args, contigs):
163
132
hp .run_cmd (cmd , args .verbose , 0 )
164
133
165
134
# format pileup file - i.e., add zeros to uncovered positions
166
- formatpileup ('assembly/reads2contigs.pileup' , 'assembly/reads2contigs.stats.txt' , 'assembly/reads2contigs.format.pileup' )
135
+ ahp . formatpileup ('assembly/reads2contigs.pileup' , 'assembly/reads2contigs.stats.txt' , 'assembly/reads2contigs.format.pileup' )
167
136
168
137
if not int (args .noclean ):
169
138
cmd = 'rm -r assembly/ref_remap'
@@ -175,68 +144,6 @@ def remap(args, contigs):
175
144
176
145
# -------------------------------------
177
146
178
def formatpileup(infile, idxfile, outfile):
    """Format the pileup file for computing entropy.

    Rewrites the pileup as three tab-separated columns (contig id, position,
    number of reads) and adds a zero-count row for every position the pileup
    skips: before the first reported position of a contig, in gaps between
    reported positions, and after the last reported position up to the
    contig length.

    Contigs that never appear in the pileup are not added to the output.
    An empty pileup produces an empty output file.

    Args:
        infile: path of the pileup file; column 1 is the contig id, column 2
            the position and column 4 the number of reads.
        idxfile: path of the index file mapping contig id (column 1) to
            contig length (column 2) - per the original comment, the output
            of samtools idxstats.
        outfile: path of the formatted pileup to write.

    Raises:
        KeyError: if a contig in the pileup is missing from the index file.
    """

    # id 2 length dict
    idx = {}

    # load idx file
    with open(idxfile, 'r') as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                # blank or truncated line: nothing to map
                continue
            # map id to length of contig
            idx[fields[0]] = fields[1]

    def write_zeros(out, contig, start, stop):
        """Write zero-count rows for positions start .. stop - 1 of contig."""
        for i in range(start, stop):
            out.write(contig + '\t' + str(i) + '\t0\n')

    myid = ''  # id of the contig currently being written ('' before the first)
    pos = 0    # last position written for that contig

    with open(outfile, 'w') as f:
        with open(infile, 'r') as g:
            for line in g:
                fields = line.split()
                if not fields:
                    # skip blank lines
                    continue

                # get number reads
                numrds = fields[3]
                contig = fields[0]
                curpos = int(fields[1])

                # if beginning of a new contig (id != previous id)
                if contig != myid:
                    # if not the first contig, pad the previous contig with
                    # zeros up to its full length
                    if myid:
                        write_zeros(f, myid, pos + 1, int(idx[myid]) + 1)

                    # if contig starts at position > 1, pad with zeros
                    write_zeros(f, contig, 1, curpos)

                    # set new id
                    myid = contig

                # if discontinuity (position - previous position > 1),
                # pad with zeros
                elif curpos - pos > 1:
                    write_zeros(f, myid, pos + 1, curpos)

                # write current line
                f.write(myid + '\t' + fields[1] + '\t' + numrds + '\n')

                # this becomes the previous position for the next iteration
                pos = curpos

        # check if last contig covered until the end; skipped when the pileup
        # was empty, since there is no contig to look up
        if myid:
            write_zeros(f, myid, pos + 1, int(idx[myid]) + 1)
237
-
238
- # -------------------------------------
239
-
240
147
def main ():
241
148
"""Main function"""
242
149
0 commit comments