Skip to content

Commit

Permalink
v2.1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
BSSeeker committed Dec 9, 2016
1 parent ace12b6 commit 4567ef5
Show file tree
Hide file tree
Showing 7 changed files with 139 additions and 84 deletions.
8 changes: 7 additions & 1 deletion RELEASE_NOTES
Original file line number Diff line number Diff line change
Expand Up @@ -72,4 +72,10 @@ V2.0.9 - Dec 30, 2014

V2.0.10 - Nov 5, 2015
1. [bs_seeker2-align.py] : Fix bug with "-s" and "-e"
1. [bs_seeker2-call_methylation.py] : Fix error with pysam version higher than 0.8.0
2. [bs_seeker2-call_methylation.py] : Fix error with pysam version higher than 0.8.0

------------------------------------------

V2.1.0 - Dec 9, 2016
1. Fixed a bug related to the pysam version
2. Fixed a bug related to the output information
106 changes: 70 additions & 36 deletions bs_align/bs_align_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@ def N_MIS(r,g):
for i in xrange(len(r)):
if r[i] != g[i] and r[i] != "N" and g[i] != "N" and not(r[i] == 'T' and g[i] == 'C'):
mismatches += 1
#
#
#
return mismatches

#

#----------------------------------------------------------------

Expand Down Expand Up @@ -64,7 +67,7 @@ def RemoveAdapter ( read, adapter, no_mismatch, rm_back=0) :
# not including the 'A' base in front of the adapter.
if adapter[2:] == read[0:(la-1)] :
return ""

#
for i in xrange( lr - no_mismatch ) :
read_pos = i
adapter_pos = 0
Expand All @@ -80,6 +83,8 @@ def RemoveAdapter ( read, adapter, no_mismatch, rm_back=0) :
else :
read_pos = read_pos + 1
adapter_pos = adapter_pos + 1
#
#
# while_end

# Cut the extra bases before the adapter
Expand All @@ -90,6 +95,8 @@ def RemoveAdapter ( read, adapter, no_mismatch, rm_back=0) :
return ''
else :
return read[:(i-rm_back)]
#
#
# for_end
return read

Expand All @@ -99,6 +106,7 @@ def Remove_5end_Adapter ( read, adapter, no_mismatch) :
la = len(adapter)
if la == 0 :
return read
#
for i in xrange (la - no_mismatch) :
read_pos = 0
adapter_pos = i
Expand All @@ -114,11 +122,14 @@ def Remove_5end_Adapter ( read, adapter, no_mismatch) :
else :
read_pos = read_pos + 1
adapter_pos = adapter_pos + 1
#
#
# while_end
if adapter_pos == la :
return read[(la-i):]
#
return read

#

def next_nuc(seq, pos, n):
""" Returns the nucleotide that is n places from pos in seq. Skips gap symbols.
Expand All @@ -133,21 +144,19 @@ def next_nuc(seq, pos, n):
return seq[i]
else :
return 'N'

#
#


def methy_seq(read, genome):
H = ['A', 'C', 'T']
m_seq = []
xx = "-"
for i in xrange(len(read)):

if genome[i] == '-':
continue

elif read[i] != 'C' and read[i] != 'T':
xx = "-"

elif read[i] == "T" and genome[i] == "C": #(unmethylated):
nn1 = next_nuc(genome, i, 1)
if nn1 == "G":
Expand All @@ -158,52 +167,60 @@ def methy_seq(read, genome):
xx = "y"
elif nn2 in H :
xx = "z"

#
#
elif read[i] == "C" and genome[i] == "C": #(methylated):
nn1 = next_nuc(genome, i, 1)

if nn1 == "G":
xx = "X"
elif nn1 in H :
nn2 = next_nuc(genome, i, 2)

2#
if nn2 == "G":
xx = "Y"
elif nn2 in H:
xx = "Z"
#
#
else:
xx = "-"
#
m_seq.append(xx)

#
return ''.join(m_seq)
#

def mcounts(mseq, mlst, ulst):
    """Add the per-symbol counts found in ``mseq`` to two running totals.

    The upper-case symbols "X", "Y" and "Z" are counted in ``mseq`` and added
    to entries 0, 1 and 2 of ``mlst``; the lower-case symbols "x", "y" and "z"
    are counted and added to entries 0, 1 and 2 of ``ulst``.  (In this file
    ``methy_seq`` emits the upper-case symbols on its "methylated" branch and
    the lower-case ones on its "unmethylated" branch.)

    Neither ``mlst`` nor ``ulst`` is modified; new lists are built.

    Returns:
        A tuple ``(out_mlst, out_ulst)`` of two three-element lists holding
        the updated totals.
    """
    out_mlst = [
        mlst[0] + mseq.count("X"),
        mlst[1] + mseq.count("Y"),
        mlst[2] + mseq.count("Z"),
    ]
    out_ulst = [
        ulst[0] + mseq.count("x"),
        ulst[1] + mseq.count("y"),
        ulst[2] + mseq.count("z"),
    ]
    return out_mlst, out_ulst

#


def process_aligner_output(filename, pair_end = False):

#m = re.search(r'-('+'|'.join(supported_aligners) +')-TMP', filename)
m = re.search(r'-('+'|'.join(supported_aligners) +')-.*TMP', filename)
if m is None:
error('The temporary folder path should contain the name of one of the supported aligners: ' + filename)

#
format = m.group(1)
try :
input = open(filename)
except IOError:
print "[Error] Cannot open file %s" % filename
exit(-1)

#
QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL = range(11)
def parse_SAM(line):
buf = line.split()
# print buf
# fix error when reading file with lots of \x00 # date on 2016-12-09
line = line.replace('\x00', '').strip()
buf = line.split("\t")
if len(buf) < 11 :
sys.stderr.write("[warning] SAM input without enough columns\n")
return None, None, None, None, None, None
#
flag = int(buf[FLAG])

# skip reads that are not mapped
# skip reads that have probability of being non-unique higher than 1/10
if flag & 0x4 : # or int(buf[MAPQ]) < 10:
Expand All @@ -230,14 +247,15 @@ def parse_SAM(line):
# chr16 75728107 75728147 read45 9 -
# chr16 67934919 67934959 read45 9 -
mismatches = buf[4]

#
return (buf[QNAME], # read ID
buf[RNAME], # reference ID
int(buf[POS]) - 1, # position, 0 based (SAM is 1 based)
mismatches, # number of mismatches
parse_cigar(buf[CIGAR]), # the parsed cigar string
flag & 0x40 # true if it is the first mate in a pair, false if it is the second mate
)
#

SOAP_QNAME, SOAP_SEQ, SOAP_QUAL, SOAP_NHITS, SOAP_AB, SOAP_LEN, SOAP_STRAND, SOAP_CHR, SOAP_LOCATION, SOAP_MISMATCHES = range(10)
def parse_SOAP(line):
Expand All @@ -250,8 +268,9 @@ def parse_SOAP(line):
buf[SOAP_STRAND],
parse_cigar(buf[SOAP_LEN]+'M')
)
#

# chr16 75728107 75728147 read45 9 -
# chr16 75728107 75728147 read45 9 -
RMAP_CHR, RMAP_START, RMAP_END, RMAP_QNAME, RMAP_MISMATCH, RMAP_STRAND = range(6)
def parse_RMAP(line):
buf = line.split()
Expand All @@ -262,47 +281,55 @@ def parse_RMAP(line):
int(buf[RMAP_MISMATCH]),
buf[RMAP_STRAND]
)
#

if format == BOWTIE or format == BOWTIE2:
if pair_end:
for line in input:
header1, chr1, location1, no_mismatch1, cigar1, _ = parse_SAM(line)
header2, _, location2, no_mismatch2, cigar2, mate_no2 = parse_SAM(input.next())


#
if header1 and header2:
# flip the location info if the second mate comes first in the alignment file
if mate_no2:
location1, location2 = location2, location1
cigar1, cigar2 = cigar2, cigar1


#
yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2
#
#
else:
for line in input:
header, chr, location, no_mismatch, cigar, _ = parse_SAM(line)
if header is not None:
yield header, chr, location, no_mismatch, cigar
#
#
#
elif format == SOAP:
if pair_end:
for line in input:
header1, chr1, location1, no_mismatch1, mate1, strand1, cigar1 = parse_SOAP(line)
header2, _ , location2, no_mismatch2, _, strand2, cigar2 = parse_SOAP(input.next())

#
if mate1 == 'b':
location1, location2 = location2, location1
strand1, strand2 = strand2, strand1
ciga1, cigar2 = cigar2, cigar1


#
if header1 and header2 and strand1 == '+' and strand2 == '-':
yield header1, chr1, no_mismatch1 + no_mismatch2, location1, cigar1, location2, cigar2

#
#
#
else:
for line in input:
header, chr, location, no_mismatch, _, strand, cigar = parse_SOAP(line)
if header and strand == '+':
yield header, chr, location, no_mismatch, cigar
#
#
#
elif format == RMAP :
if pair_end :
todo = 0
Expand All @@ -312,9 +339,11 @@ def parse_RMAP(line):
header, chr, location, read_len, no_mismatch, strand = parse_RMAP(line)
cigar = str(read_len) + "M"
yield header, chr, location, no_mismatch, cigar

#
#
#
input.close()

#

def parse_cigar(cigar_string):
i = 0
Expand All @@ -339,11 +368,13 @@ def get_read_start_end_and_genome_length(cigar):
r_end += count
elif edit_op == BAM_DEL:
g_len += count
#
#
return r_start, r_end, g_len # return the start and end in the read and the length of the genomic sequence
# r_start : start position on the read
# r_end : end position on the read
# g_len : length of the mapped region on genome

#

def cigar_to_alignment(cigar, read_seq, genome_seq):
""" Reconstruct the pairwise alignment based on the CIGAR string and the two sequences
Expand All @@ -368,9 +399,10 @@ def cigar_to_alignment(cigar, read_seq, genome_seq):
r_aln += read_seq[r_pos : r_pos + count]
g_aln += '-'*count
r_pos += count

#
#
return r_aln, g_aln

#


# The returned sequence covers the half-open interval [start, end); 'end' itself is not included
Expand All @@ -384,19 +416,21 @@ def get_genomic_sequence(genome, start, end, strand = '+'):
prev = 'N'+genome[0]
else:
prev = 'NN'

#
if end < len(genome) - 1:
next = genome[end: end + 2]
elif end == len(genome) - 1:
next = genome[end] + 'N'
else:
next = 'NN'
#
origin_genome = genome[start:end]

#
if strand == '-':
# reverse complement everything if strand is '-'
revc = reverse_compl_seq('%s%s%s' % (prev, origin_genome, next))
prev, origin_genome, next = revc[:2], revc[2:-2], revc[-2:]

#
return origin_genome, next, '%s_%s_%s' % (prev, origin_genome, next)
# next : next two nucleotides
# next : next two nucleotides
#
7 changes: 4 additions & 3 deletions bs_align/bs_pair_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -989,7 +989,8 @@ def bs_pair_end(main_read_file_1,
#print "AdapterA=", adapterA, "; AdapterB=", adapterB
if adapterA != "" or adapterB != "" :
logm("Number of reads having adapter removed: %d "% all_trimmed)
logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, float(all_base_after_trim)/all_base_before_trim) )
trim_percent = (float(all_base_after_trim) / all_base_before_trim) if all_base_before_trim>0 else 0
logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, trim_percent) )

if all_raw_reads >0:
logm("Number of reads rejected because of multiple hits: %d" % len(Multiple_hits) )
Expand All @@ -1011,8 +1012,8 @@ def bs_pair_end(main_read_file_1,
elif asktag=="N":
logm(" %7d FW-RC pairs mapped to Watson strand" % (numbers_mapped_lst[0]) )
logm(" %7d FW-RC pairs mapped to Crick strand" % (numbers_mapped_lst[1]) )

logm("Mappability = %1.4f%%" % (100*float(all_mapped_passed)*2/all_raw_reads) )
Mappability = (100*float(all_mapped_passed)*2/all_raw_reads) if all_raw_reads > 0 else 0
logm("Mappability = %1.4f%%" % Mappability )
logm("Total bases of uniquely mapped reads : %7d" % all_base_mapped )
logm("Unmapped read pairs: %d" % all_unmapped+"\n")
#
Expand Down
20 changes: 12 additions & 8 deletions bs_align/bs_rrbs.py
Original file line number Diff line number Diff line change
Expand Up @@ -1012,13 +1012,13 @@ def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines,
try_pos, FR)
try_pos += 1
try_count += 1

#
#if my_region_serial == 0 :
# print "[For debug]: chr=", mapped_chr
# print "[For debug]: RC_C2A read still cannot find fragment serial"

#
N_mismatch = N_MIS(r_aln, g_aln)
# if N_mismatch <= int(max_mismatch_no) :
#if N_mismatch <= int(max_mismatch_no) :
mm_no=float(max_mismatch_no)
if (mm_no>=1 and N_mismatch<=mm_no) or (mm_no<1 and N_mismatch<=(mm_no*len(r_aln)) ):
all_mapped_passed += 1
Expand Down Expand Up @@ -1059,19 +1059,23 @@ def bs_rrbs(main_read_file, asktag, adapter_file, cut_s, cut_e, no_small_lines,
logm("----------------------------------------------")
logm("Number of raw reads: %d " % all_raw_reads)
if all_raw_reads>0:
logm("Number of raw reads with CGG/TGG at 5' end: %d (%1.3f)" % (all_tagged, float(all_tagged)/all_raw_reads))
tag_percent = (float(all_tagged)/all_raw_reads) if all_raw_reads>0 else 0
logm("Number of raw reads with CGG/TGG at 5' end: %d (%1.3f)" % (all_tagged, tag_percent) )
for kk in range(len(n_cut_tag_lst)):
logm("Number of raw reads with tag %s: %d (%1.3f)" % (cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]],float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads))
tag_percent = (float(n_cut_tag_lst[cut3_tag_lst[kk]])/all_raw_reads) if all_raw_reads>0 else 0
logm("Number of raw reads with tag %s: %d (%1.3f)" % (cut3_tag_lst[kk],n_cut_tag_lst[cut3_tag_lst[kk]], tag_percent) )
logm("Number of bases in total: %d " % all_base_before_trim)
if adapter!="" :
logm("Number of reads having adapter removed: %d " % all_tagged_trimmed)
logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, float(all_base_after_trim)/all_base_before_trim ) )
trim_percent = (float(all_base_after_trim)/all_base_before_trim ) if all_base_before_trim>0 else 0
logm("Number of bases after trimming the adapters: %d (%1.3f)" % (all_base_after_trim, trim_percent ) )
logm("Number of reads are rejected because of multiple hits: %d\n" % len(Multiple_hits) )
logm("Number of unique-hits reads (before post-filtering): %d" % all_mapped)
logm(" %d uniquely aligned reads, passed fragment check, with mismatches <= %s"%(all_mapped_passed, max_mismatch_no))
logm("Mappability = %1.4f%%" % (100*float(all_mapped_passed)/all_raw_reads))
Mappability = (100*float(all_mapped_passed)/all_raw_reads) if all_raw_reads>0 else 0
logm("Mappability = %1.4f%%" % Mappability )
#logm("Total bases of uniquely mapped reads %7d"% all_base_mapped )
if asktag == "Y": # un-diretional
if asktag == "Y": # un-directional
logm(" %7d FW reads mapped to Watson strand" % (num_mapped_FW_C2T) )
logm(" %7d RC reads mapped to Watson strand" % (num_mapped_FW_G2A) )
logm(" %7d FW reads mapped to Crick strand" % (num_mapped_RC_C2T) )
Expand Down
Loading

0 comments on commit 4567ef5

Please sign in to comment.