v2.1.0

BSSeeker · Dec 9, 2016 · 4567ef5 · 4567ef5
1 parent ace12b6
commit 4567ef5
Show file tree

Hide file tree

Showing 7 changed files with 139 additions and 84 deletions.
diff --git a/RELEASE_NOTES b/RELEASE_NOTES
@@ -72,4 +72,10 @@ V2.0.9 - Dec 30, 2014
 
 V2.0.10 - Nov 5, 2015
 1. [bs_seeker2-align.py] : Fix bug with "-s" and "-e"
-1. [bs_seeker2-call_methylation.py] : Fix error with pysam version higher than 0.8.0
+2. [bs_seeker2-call_methylation.py] : Fix error with pysam version higher than 0.8.0
+
+------------------------------------------
+
+V2.1.0 - Dec 9, 2016
+1. Fixed bug related to pysam version
+2. Fixed bug related to output information
diff --git a/bs_align/bs_align_utils.py b/bs_align/bs_align_utils.py
@@ -16,8 +16,11 @@ def N_MIS(r,g):
         for i in xrange(len(r)):
             if r[i] != g[i] and r[i] != "N" and g[i] != "N" and not(r[i] == 'T' and g[i] == 'C'):
                 mismatches += 1
+            #
+        #
+    #
     return mismatches
-
+#
 
 #----------------------------------------------------------------
 
@@ -64,7 +67,7 @@ def RemoveAdapter ( read, adapter, no_mismatch, rm_back=0) :
     # not including the 'A' base in front of the adapter.
     if adapter[2:] == read[0:(la-1)] :
         return ""
-
+    #
     for i in xrange( lr - no_mismatch ) :
         read_pos = i
         adapter_pos = 0
@@ -80,6 +83,8 @@ def RemoveAdapter ( read, adapter, no_mismatch, rm_back=0) :
                 else :
                     read_pos = read_pos + 1
                     adapter_pos = adapter_pos + 1
+                #
+            #
         # while_end
 
         # Cut the extra bases before the adapter
@@ -90,6 +95,8 @@ def RemoveAdapter ( read, adapter, no_mismatch, rm_back=0) :
                 return ''
             else :
                 return read[:(i-rm_back)]
+            #
+        #
     # for_end
     return read
 
@@ -99,6 +106,7 @@ def Remove_5end_Adapter ( read, adapter, no_mismatch) :
     la = len(adapter)
     if la == 0 :
         return read
+    #
     for i in xrange (la - no_mismatch) :
         read_pos = 0
         adapter_pos = i
@@ -114,11 +122,14 @@ def Remove_5end_Adapter ( read, adapter, no_mismatch) :
                 else : 
                     read_pos = read_pos + 1
                     adapter_pos = adapter_pos + 1
+                #
+            #
         # while_end
         if adapter_pos == la :
             return read[(la-i):]
+        #
     return read
-
+#
 
 def next_nuc(seq, pos, n):
     """ Returns the nucleotide that is n places from pos in seq. Skips gap symbols.
@@ -133,21 +144,19 @@ def next_nuc(seq, pos, n):
         return seq[i]
     else :
         return 'N'
-
+    #
+#
 
 
 def methy_seq(read, genome):
     H = ['A', 'C', 'T']
     m_seq = []
     xx = "-"
     for i in xrange(len(read)):
-
         if genome[i] == '-':
             continue
-
         elif read[i] != 'C' and read[i] != 'T':
             xx = "-"
-
         elif read[i] == "T" and genome[i] == "C": #(unmethylated):
             nn1 = next_nuc(genome, i, 1)
             if nn1 == "G":
@@ -158,52 +167,60 @@ def methy_seq(read, genome):
                     xx = "y"
                 elif nn2 in H :
                     xx = "z"
-
+                #
+            #
         elif read[i] == "C" and genome[i] == "C": #(methylated):
             nn1 = next_nuc(genome, i, 1)
 
             if nn1 == "G":
                 xx = "X"
             elif nn1 in H :
                 nn2 = next_nuc(genome, i, 2)
-
+                #
                 if nn2 == "G":
                     xx = "Y"
                 elif nn2 in H:
                     xx = "Z"
+                #
+            #
         else:
             xx = "-"
+        #
         m_seq.append(xx)
-
+    #
     return ''.join(m_seq)
+#
 
 def mcounts(mseq, mlst, ulst):
     out_mlst=[mlst[0]+mseq.count("X"), mlst[1]+mseq.count("Y"), mlst[2]+mseq.count("Z")]
     out_ulst=[ulst[0]+mseq.count("x"), ulst[1]+mseq.count("y"), ulst[2]+mseq.count("z")]
     return out_mlst, out_ulst
-
+#
 
 
 def process_aligner_output(filename, pair_end = False):
-
     #m = re.search(r'-('+'|'.join(supported_aligners) +')-TMP', filename)
     m = re.search(r'-('+'|'.join(supported_aligners) +')-.*TMP', filename)
     if m is None:
         error('The temporary folder path should contain the name of one of the supported aligners: ' + filename)
-
+    #
     format = m.group(1)
     try :
         input = open(filename)
     except IOError:
         print "[Error] Cannot open file %s" % filename
         exit(-1)
-
+    #
     QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL = range(11)
     def parse_SAM(line):
-        buf = line.split()
-        # print buf
+        # fix error when reading file with lots of \x00 # date on 2016-12-09
+        line = line.replace('\x00', '').strip()
+        buf = line.split("\t")
+        if len(buf) < 11 :
+            sys.stderr.write("[warning] SAM input without enough columns\n")
+            return None, None, None, None, None, None
+        #
         flag = int(buf[FLAG])
-
         # skip reads that are not mapped
         # skip reads that have probability of being non-unique higher than 1/10
         if flag & 0x4 : # or int(buf[MAPQ]) < 10:
@@ -230,14 +247,15 @@ def parse_SAM(line):
             # chr16   75728107        75728147        read45  9       -
             # chr16   67934919        67934959        read45  9       -
             mismatches = buf[4]
-
+        #
         return (buf[QNAME], # read ID
                 buf[RNAME], # reference ID
                 int(buf[POS]) - 1, # position, 0 based (SAM is 1 based)
                 mismatches,    # number of mismatches
                 parse_cigar(buf[CIGAR]), # the parsed cigar string
                 flag & 0x40 # true if it is the first mate in a pair, false if it is the second mate
                 )
+    #
 
     SOAP_QNAME, SOAP_SEQ, SOAP_QUAL, SOAP_NHITS, SOAP_AB, SOAP_LEN, SOAP_STRAND, SOAP_CHR, SOAP_LOCATION, SOAP_MISMATCHES = range(10)
     def parse_SOAP(line):
@@ -250,8 +268,9 @@ def parse_SOAP(line):
                 buf[SOAP_STRAND],
                 parse_cigar(buf[SOAP_LEN]+'M')
             )
+    #
 
-        # chr16   75728107        75728147        read45  9       -
+    # chr16   75728107        75728147        read45  9       -
     RMAP_CHR, RMAP_START, RMAP_END, RMAP_QNAME, RMAP_MISMATCH, RMAP_STRAND = range(6)
     def parse_RMAP(line):
         buf = line.split()
@@ -262,47 +281,55 @@ def parse_RMAP(line):
                  int(buf[RMAP_MISMATCH]),
                  buf[RMAP_STRAND]
         )
+    #
 
     if format == BOWTIE or format == BOWTIE2:
         if pair_end:
             for line in input:
                 header1, chr1, location1, no_mismatch1, cigar1,        _ = parse_SAM(line)
                 header2,    _, location2, no_mismatch2, cigar2, mate_no2 = parse_SAM(input.next())
-
-
+                #
                 if header1 and header2:
                     # flip the location info if the second mate comes first in the alignment file
                     if mate_no2:
                         location1, location2 = location2, location1
                         cigar1, cigar2 = cigar2, cigar1
-
-
+                    #
                     yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2
+                #
+            #
         else:
             for line in input:
                 header, chr, location, no_mismatch, cigar, _ = parse_SAM(line)
                 if header is not None:
                     yield header, chr, location, no_mismatch, cigar
+                #
+            #
+        #
     elif format == SOAP:
         if pair_end:
             for line in input:
                 header1, chr1, location1, no_mismatch1, mate1, strand1, cigar1 = parse_SOAP(line)
                 header2, _   , location2, no_mismatch2,     _, strand2, cigar2 = parse_SOAP(input.next())
-
+                #
                 if mate1 == 'b':
                     location1, location2 = location2, location1
                     strand1, strand2 = strand2, strand1
                     ciga1, cigar2 = cigar2, cigar1
-
-
+                #
                 if header1 and header2 and strand1 == '+' and strand2 == '-':
                     yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2
-
+                #
+            #
+        #
         else:
             for line in input:
                 header, chr, location, no_mismatch, _, strand, cigar = parse_SOAP(line)
                 if header and strand == '+':
                     yield header, chr, location, no_mismatch, cigar
+                #
+            #
+        #
     elif format == RMAP :
         if pair_end :
             todo = 0
@@ -312,9 +339,11 @@ def parse_RMAP(line):
                 header, chr, location, read_len, no_mismatch, strand = parse_RMAP(line)
                 cigar = str(read_len) + "M"
                 yield header, chr, location, no_mismatch, cigar
-
+            #
+        #
+    #
     input.close()
-
+#
 
 def parse_cigar(cigar_string):
     i = 0
@@ -339,11 +368,13 @@ def get_read_start_end_and_genome_length(cigar):
             r_end += count
         elif edit_op == BAM_DEL:
             g_len += count
+        #
+    #
     return r_start, r_end, g_len # return the start and end in the read and the length of the genomic sequence
     # r_start : start position on the read
     # r_end   : end position on the read
     # g_len   : length of the mapped region on genome
-
+#
 
 def cigar_to_alignment(cigar, read_seq, genome_seq):
     """ Reconstruct the pairwise alignment based on the CIGAR string and the two sequences
@@ -368,9 +399,10 @@ def cigar_to_alignment(cigar, read_seq, genome_seq):
             r_aln += read_seq[r_pos : r_pos + count]
             g_aln += '-'*count
             r_pos += count
-
+        #
+    #
     return r_aln, g_aln
-
+#
 
 
 # return sequence is [start, end), not include 'end'
@@ -384,19 +416,21 @@ def get_genomic_sequence(genome, start, end, strand = '+'):
         prev = 'N'+genome[0]
     else:
         prev = 'NN'
-
+    #
     if end < len(genome) - 1:
         next = genome[end: end + 2]
     elif end == len(genome) - 1:
         next = genome[end] + 'N'
     else:
         next = 'NN'
+    #
     origin_genome = genome[start:end]
-
+    #
     if strand == '-':
         # reverse complement everything if strand is '-'
         revc = reverse_compl_seq('%s%s%s' % (prev, origin_genome, next))
         prev, origin_genome, next = revc[:2], revc[2:-2], revc[-2:]
-
+    #
     return origin_genome, next, '%s_%s_%s' % (prev, origin_genome, next)
-    # next : next two nucleotides
+    # next : next two nucleotides
+#
diff --git a/bs_align/bs_pair_end.py b/bs_align/bs_pair_end.py
@@ -989,7 +989,8 @@ def bs_pair_end(main_read_file_1,
     #print "AdapterA=", adapterA, "; AdapterB=", adapterB
     if adapterA != "" or adapterB != "" :
         logm("Number of reads having adapter removed: %d "% all_trimmed)
-        logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) )
+        trim_percent = (float(all_base_after_trim) / all_base_before_trim) if all_base_before_trim>0 else 0
+        logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, trim_percent)  )
 
     if all_raw_reads >0:
         logm("Number of reads rejected because of multiple hits: %d" % len(Multiple_hits) )
@@ -1011,8 +1012,8 @@ def bs_pair_end(main_read_file_1,
         elif asktag=="N":
             logm("  %7d FW-RC pairs mapped to Watson strand" % (numbers_mapped_lst[0]) )
             logm("  %7d FW-RC pairs mapped to Crick strand" % (numbers_mapped_lst[1]) )
-
-        logm("Mappability = %1.4f%%" % (100*float(all_mapped_passed)*2/all_raw_reads) )
+        Mappability = (100*float(all_mapped_passed)*2/all_raw_reads) if all_raw_reads > 0 else 0
+        logm("Mappability = %1.4f%%" % Mappability )
         logm("Total bases of uniquely mapped reads : %7d" % all_base_mapped )
         logm("Unmapped read pairs: %d" % all_unmapped+"\n")
         #

diff --git a/bs_align/bs_rrbs.py b/bs_align/bs_rrbs.py
@@ -1012,13 +1012,13 @@ def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines,
                                                     try_pos, FR)
                             try_pos += 1
                             try_count += 1
-
+                        #
                         #if my_region_serial == 0 :
                         #    print "[For debug]: chr=", mapped_chr
                         #    print "[For debug]: RC_C2A read still cannot find fragment serial"
-
+                    #
                     N_mismatch = N_MIS(r_aln, g_aln)
-#                    if N_mismatch <= int(max_mismatch_no) :
+                    #if N_mismatch <= int(max_mismatch_no) :
                     mm_no=float(max_mismatch_no)
                     if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
                         all_mapped_passed += 1
@@ -1059,19 +1059,23 @@ def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines,
     logm("----------------------------------------------")
     logm("Number of raw reads: %d " % all_raw_reads)
     if all_raw_reads>0:
-        logm("Number of raw reads with CGG/TGG at 5' end: %d (%1.3f)" % (all_tagged, float(all_tagged)/all_raw_reads))
+        tag_percent = (float(all_tagged)/all_raw_reads) if all_raw_reads>0 else 0
+        logm("Number of raw reads with CGG/TGG at 5' end: %d (%1.3f)" % (all_tagged, tag_percent) )
         for kk in range(len(n_cut_tag_lst)):
-            logm("Number of raw reads with tag %s: %d (%1.3f)" % (cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
+            tag_percent = (float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads) if all_raw_reads>0 else 0
+            logm("Number of raw reads with tag %s: %d (%1.3f)" % (cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]], tag_percent) )
         logm("Number of bases in total: %d " % all_base_before_trim)
         if adapter!="" :
             logm("Number of reads having adapter removed: %d " % all_tagged_trimmed)
-            logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, float(all_base_after_trim)/all_base_before_trim ) )
+            trim_percent = (float(all_base_after_trim)/all_base_before_trim ) if all_base_before_trim>0 else 0
+            logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, trim_percent ) )
         logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) )
         logm("Number of unique-hits reads (before post-filtering): %d" % all_mapped)
         logm("  %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
-        logm("Mappability = %1.4f%%" % (100*float(all_mapped_passed)/all_raw_reads))
+        Mappability = (100*float(all_mapped_passed)/all_raw_reads) if all_raw_reads>0 else 0
+        logm("Mappability = %1.4f%%" % Mappability )
         #logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
-        if asktag == "Y": # un-diretional
+        if asktag == "Y": # un-directional
             logm("  %7d FW reads mapped to Watson strand" % (num_mapped_FW_C2T) )
             logm("  %7d RC reads mapped to Watson strand" % (num_mapped_FW_G2A) )
             logm("  %7d FW reads mapped to Crick strand" % (num_mapped_RC_C2T) )