expose Trinity mem in main script, move code from assembly.py to modules (reduce bloating)

gitliver · gitliver · commit 9a749d023e94 · 2016-05-04T13:48:25.000-04:00
diff --git a/helpers/assembly_helpers.py b/helpers/assembly_helpers.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+
+"""
+    Helper functions for the assembly step
+    ~~~~~~
+"""
+
+# -------------------------------------
+
+def computedistrib(infile, outfile):
+    """compute simple distribution of contigs"""
+
+    # a list of contig lengths
+    x = []
+
+    with open(infile, 'r') as g:
+        for line in g:
+            if line[0] != '>': 
+                x.append(len(line.rstrip()))
+
+    from collections import Counter
+
+    # count occurences of each length
+    xcount = Counter(x)
+
+    # tot length
+    tot = sum(dict(xcount).values())
+
+    # running sum
+    runningsum = 0
+
+    with open(outfile, 'w') as f:
+        for i in sorted(dict(xcount), reverse=True):
+            runningsum += xcount[i]
+            f.write(str(i) + '\t')
+            f.write(str(xcount[i]) + '\t')
+            f.write(str(runningsum) + '/' + str(tot) + '\t')
+            f.write(str(int(100*runningsum/tot)) + '%')
+            f.write('\n')
+        
+# -------------------------------------
+
+def formatpileup(infile, idxfile, outfile):
+    """format the pileup file for computing entropy"""
+
+    # id 2 length dict (output of samtools idxstats)
+    idx = {}
+
+    # load idx file
+    with open(idxfile, 'r') as f:
+        for line in f:
+            # map id to length of contig
+            idx[line.split()[0].strip()] = line.split()[1].strip()
+
+    myid = ''    # contig id
+    pos = ''    # position
+
+    with open(outfile, 'w') as f:
+        with open(infile, 'r') as g:
+            for line in g:
+                # get number reads 
+                numrds = line.split()[3]
+
+                # if beginning of a new contig (id != previous id)
+                if line.split()[0] != myid:
+                    # if change (and not first contig), check if previous contig was covered until the end
+                    # if not covered, pad with zeros
+                    if myid: 
+                        if int(idx[myid]) > int(pos): 
+                            for i in range(int(pos) + 1, int(idx[myid]) + 1): 
+                                f.write(myid + '\t' + str(i) + '\t0\n')
+ 
+                    # if contig starts at postion > 1, pad with zeros 
+                    if int(line.split()[1]) > 1: 
+                        for i in range(1, int(line.split()[1])): 
+                            f.write(line.split()[0] + '\t' + str(i) + '\t0\n')
+
+                    # set new id 
+                    myid = line.split()[0]
+
+                    # write current line
+                    f.write(myid + '\t' + line.split()[1] + '\t' + numrds + '\n')
+
+                # if discontinuity (position - previous position > 1), pad with zeros
+                elif (int(line.split()[1]) - int(pos)) > 1:
+                    for i in range(int(pos) + 1, int(line.split()[1])):
+                        f.write(myid + '\t' + str(i) + '\t0\n')
+
+                    f.write(myid + '\t' + line.split()[1] + '\t' + numrds + '\n')
+
+                # otherwise, simply write line
+                else:
+                    f.write(myid + '\t' + line.split()[1] + '\t' + numrds + '\n')
+
+                # get position (this will become previous position for next iteration)
+                pos = line.split()[1]
+
+    # check if last contig covered until the end
+    if int(idx[myid]) > int(pos):
+        for i in range(int(pos) + 1, int(idx[myid]) + 1):
+            f.write(myid + '\t' + str(i) + '\t0\n')
+
+# -------------------------------------
+
+if __name__ == "__main__":
+
+    pass
diff --git a/pandora.py b/pandora.py
@@ -66,6 +66,8 @@ def get_arg():
       step 3: blast contigs, \
       step 4: orf discovery, \
       step 5: reporting (default: 12345 - i.e, steps 1 through 5).')
+    parser_scan.add_argument('--trinitymem', default='50G', help='max memory for Trinity (default: 50G)')
+    parser_scan.add_argument('--trinitycores', default='8', help='number of cores for Trinity (default: 8)')
     parser_scan.set_defaults(which='scan')
 
     # create the parser for the 'aggregate' command
@@ -201,9 +203,11 @@ def scan_main(args):
                       int(args.verbose),
                       int(args.noclean),
                       args.gtf),
-             '2': '{}/scripts/assembly.py --scripts {} --verbose {} --noclean {}'.format(
+             '2': '{}/scripts/assembly.py --scripts {} --trinitymem {} --trinitycores {} --verbose {} --noclean {}'.format(
                       args.scripts,
                       args.scripts,
+                      args.trinitymem,
+                      args.trinitycores,
                       int(args.verbose),
                       int(args.noclean)),
              '3': '{}/scripts/blast_wrapper.py --scripts {} --threshold {} --db {} --id {} --verbose {} --noclean {} --nosge {}'.format(
diff --git a/scripts/assembly.py b/scripts/assembly.py
@@ -18,8 +18,8 @@ def get_arg():
     parser.add_argument('-1', '--mate1', default='host_separation/unmapped_1.fastq.gz', help='mate1')
     parser.add_argument('-2', '--mate2', default='host_separation/unmapped_2.fastq.gz', help='mate2')
     parser.add_argument('-o', '--outputdir', default='assembly_trinity', help='the output directory')
-    parser.add_argument('--trinitymem', default='50G', help='max memory for Trinity')
-    parser.add_argument('--trinitycores', default='8', help='number of cores for Trinity')
+    parser.add_argument('--trinitymem', required=True, help='max memory for Trinity')
+    parser.add_argument('--trinitycores', required=True, help='number of cores for Trinity')
     parser.add_argument('-l', '--logsdir', help='the logs directory')
     parser.add_argument('-d', '--scripts', help='the git repository directory')
     parser.add_argument('--noclean', help='do not delete temporary intermediate files (default: off)')
@@ -32,7 +32,9 @@ def get_arg():
     # need this to get local modules
     sys.path.append(args.scripts)
     global hp
+    global ahp
     from helpers import helpers as hp
+    from helpers import assembly_helpers as ahp
 
     # error checking: exit if previous step produced zero output
     for i in [args.mate1, args.mate2]:
@@ -83,7 +85,7 @@ def assembly(args):
 
     # compute simple distribution
     # cat assembly/contigs_trinity.fasta | paste - - | awk '{print length($2)}' | sort -nr | ${d}/scripts/tablecount | awk -v tot=${num_contigs} 'BEGIN{x=0}{x+=$2; print $1"\t"$2"\t"x"/"tot"\t"int(100*x/tot)"%"}' > assembly/contigs.distrib.txt
-    computedistrib(myoutput2, 'assembly/contigs.distrib.txt')
+    ahp.computedistrib(myoutput2, 'assembly/contigs.distrib.txt')
 
     if not int(args.noclean):
         cmd = 'rm -rf assembly_trinity'
@@ -96,39 +98,6 @@ def assembly(args):
 
 # -------------------------------------
 
-def computedistrib(infile, outfile):
-    """compute simple distribution of contigs"""
-
-    # a list of contig lengths
-    x = []
-
-    with open(infile, 'r') as g:
-        for line in g:
-            if line[0] != '>': 
-                x.append(len(line.rstrip()))
-
-    from collections import Counter
-
-    # count occurences of each length
-    xcount = Counter(x)
-
-    # tot length
-    tot = sum(dict(xcount).values())
-
-    # running sum
-    runningsum = 0
-
-    with open(outfile, 'w') as f:
-        for i in sorted(dict(xcount), reverse=True):
-            runningsum += xcount[i]
-            f.write(str(i) + '\t')
-            f.write(str(xcount[i]) + '\t')
-            f.write(str(runningsum) + '/' + str(tot) + '\t')
-            f.write(str(int(100*runningsum/tot)) + '%')
-            f.write('\n')
-        
-# -------------------------------------
-
 def remap(args, contigs):
     """map contigs back onto assembly"""
 
@@ -163,7 +132,7 @@ def remap(args, contigs):
     hp.run_cmd(cmd, args.verbose, 0)
 
     # format pileup file - i.e., add zeros to uncovered positions
-    formatpileup('assembly/reads2contigs.pileup', 'assembly/reads2contigs.stats.txt', 'assembly/reads2contigs.format.pileup')
+    ahp.formatpileup('assembly/reads2contigs.pileup', 'assembly/reads2contigs.stats.txt', 'assembly/reads2contigs.format.pileup')
 
     if not int(args.noclean):
         cmd = 'rm -r assembly/ref_remap'
@@ -175,68 +144,6 @@ def remap(args, contigs):
 
 # -------------------------------------
 
-def formatpileup(infile, idxfile, outfile):
-    """format the pileup file for computing entropy"""
-
-    # id 2 length dict (output of samtools idxstats)
-    idx = {}
-
-    # load idx file
-    with open(idxfile, 'r') as f:
-        for line in f:
-            # map id to length of contig
-            idx[line.split()[0].strip()] = line.split()[1].strip()
-
-    myid = ''    # contig id
-    pos = ''    # position
-
-    with open(outfile, 'w') as f:
-        with open(infile, 'r') as g:
-            for line in g:
-                # get number reads 
-                numrds = line.split()[3]
-
-                # if beginning of a new contig (id != previous id)
-                if line.split()[0] != myid:
-                    # if change (and not first contig), check if previous contig was covered until the end
-                    # if not covered, pad with zeros
-                    if myid: 
-                        if int(idx[myid]) > int(pos): 
-                            for i in range(int(pos) + 1, int(idx[myid]) + 1): 
-                                f.write(myid + '\t' + str(i) + '\t0\n')
- 
-                    # if contig starts at postion > 1, pad with zeros 
-                    if int(line.split()[1]) > 1: 
-                        for i in range(1, int(line.split()[1])): 
-                            f.write(line.split()[0] + '\t' + str(i) + '\t0\n')
-
-                    # set new id 
-                    myid = line.split()[0]
-
-                    # write current line
-                    f.write(myid + '\t' + line.split()[1] + '\t' + numrds + '\n')
-
-                # if discontinuity (position - previous position > 1), pad with zeros
-                elif (int(line.split()[1]) - int(pos)) > 1:
-                    for i in range(int(pos) + 1, int(line.split()[1])):
-                        f.write(myid + '\t' + str(i) + '\t0\n')
-
-                    f.write(myid + '\t' + line.split()[1] + '\t' + numrds + '\n')
-
-                # otherwise, simply write line
-                else:
-                    f.write(myid + '\t' + line.split()[1] + '\t' + numrds + '\n')
-
-                # get position (this will become previous position for next iteration)
-                pos = line.split()[1]
-
-    # check if last contig covered until the end
-    if int(idx[myid]) > int(pos):
-        for i in range(int(pos) + 1, int(idx[myid]) + 1):
-            f.write(myid + '\t' + str(i) + '\t0\n')
-
-# -------------------------------------
-
 def main():
     """Main function"""