-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbam_rsub.py
executable file
·166 lines (147 loc) · 6.86 KB
/
bam_rsub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#! /usr/bin/env python
###############################################################################
# based on https://github.com/Niknafs/NGSTools/blob/master/baseParser.py
# Returns substitution counts for bam file relative to fasta reference
# Usage:
# bam_rsub.py sample.bam -r ref.fa    (or: bam_rsub.py sample.bam -f to only filter)
# Warning: pysam mpileup produces coverage reduced by 1 compared to samtools
# mpileup in some regions.
###############################################################################
import argparse
import pysam
import os
import sys
from collections import Counter, OrderedDict
def parse_command_line_arguments():
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``bam_file``, ``out``,
        ``verbose``, ``max_length``, ``max_edit_dist``, ``max_divergence``,
        ``filter_only`` and ``ref_fasta``.

    Exits (via argparse) when neither ``-f`` nor ``-r`` is given, or when
    both members of a mutually exclusive pair are supplied.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Returns substitution counts for bam file relative to fasta "
            "reference. "
            "Input file can be filtered by read length and edit distance. "
            "Requires pysam"
        )
    )
    parser.add_argument("bam_file", help="input bam file")
    parser.add_argument("-o", "--out",
                        help="output BAM filename")
    # The original help text here was a copy of the --out help.
    parser.add_argument("-v", "--verbose", action='store_true', default=False,
                        help="print read counts before and after filtering")
    parser.add_argument("-l", "--max_length", type=int, default=1000,
                        help="maximum read length filter")

    # Only one of the two edit-distance based filters may be requested.
    action1 = parser.add_mutually_exclusive_group(required=False)
    action1.add_argument("-e", "--max_edit_dist", type=int, default=1000,
                         help="maximum edit distance to reference for the read")
    action1.add_argument("-d", "--max_divergence", type=int,
                         help="maximum percent of derived positions in read "
                              "calculated from edit distance")

    # Exactly one mode must be chosen: filter only, or count substitutions
    # against a reference.
    action2 = parser.add_mutually_exclusive_group(required=True)
    action2.add_argument("-f", "--filter_only", action='store_true',
                         help="only filter bam")
    action2.add_argument("-r", "--ref_fasta", help="reference fasta file")
    return parser.parse_args()
class parseString(object):
    """Tally the read bases of one pileup column against its reference base.

    ``ref`` is the reference base of the column and ``string`` is the
    read-bases field of the same pileup line. Both are upper-cased, so
    strand information is discarded. After construction ``self.types`` is a
    ``Counter`` with these keys:

    * two-letter keys ``<ref><read base>`` (e.g. ``'AA'``, ``'CT'``) for
      matches and substitutions,
    * ``'ins'`` / ``'del'`` for each insertion / deletion record,
    * ``'un'`` for every other character (``*``, ``N``, ``<``, ``>`` ...).
    """

    def __init__(self, ref, string):
        self.ref = ref.upper()
        self.string = string.upper()
        self.types = Counter()
        self.process()

    def _consume_indel(self, pos):
        """Consume an indel record ``[+-]<length><sequence>`` at ``pos``.

        Counts it under ``'ins'`` or ``'del'`` and returns the index just
        past the indel sequence. Returns ``pos`` unchanged, without
        counting anything, when no length digits follow the sign.
        """
        text = self.string
        digits_end = pos + 1
        # The length may have several digits (e.g. "+12ACGT...").
        while digits_end < len(text) and text[digits_end].isdigit():
            digits_end += 1
        if digits_end == pos + 1:
            return pos
        length = int(text[pos + 1:digits_end])
        self.types['ins' if text[pos] == '+' else 'del'] += 1
        return digits_end + length

    def process(self):
        """Scan ``self.string`` once and fill ``self.types``."""
        text = self.string
        end = len(text)
        ref_pair = self.ref + self.ref
        pos = 0
        while pos < end:
            char = text[pos]
            if char == '^':
                # Read start marker followed by one mapping-quality
                # character. That character may itself be '$', '+', '.',
                # etc., so it is skipped here rather than interpreted.
                pos += 2
            elif char == '$':
                # End-of-read marker carries no base information.
                pos += 1
            elif char in ('.', ','):
                if text[pos + 1:pos + 2] in ('+', '-'):
                    after = self._consume_indel(pos + 1)
                    if after != pos + 1:
                        # NOTE: as in the original implementation, a
                        # matching base that precedes an indel is recorded
                        # only as the indel, not additionally as a match.
                        pos = after
                        continue
                # A reference base (or a match followed by a malformed sign,
                # which is then handled on the next iteration).
                self.types[ref_pair] += 1
                pos += 1
            elif char in ('+', '-'):
                # Indel following a mismatched base or a '*': count the
                # indel and skip its sequence so those bases are not
                # mistaken for substitutions.
                after = self._consume_indel(pos)
                if after == pos:
                    # Sign without a length: unrecognized.
                    self.types['un'] += 1
                    after = pos + 1
                pos = after
            elif char in ('A', 'C', 'G', 'T'):
                # One of the four bases: a substitution relative to ref.
                self.types[self.ref + char] += 1
                pos += 1
            else:
                # '*' placeholders, 'N', reference skips and anything else.
                self.types['un'] += 1
                pos += 1
        return

    def __repr__(self):
        # NOTE: returns the Counter itself rather than a string because
        # callers in this file invoke ``__repr__()`` directly to obtain the
        # counts. ``repr(obj)`` therefore raises TypeError; prefer reading
        # ``obj.types``.
        return self.types
def main():
    """Filter a BAM file and, unless ``--filter_only``, count substitutions.

    Steps:
      1. Reads are kept when their query length is at most ``--max_length``
         and they pass either the ``--max_divergence`` filter (when given)
         or the ``--max_edit_dist`` filter. The ``NM`` tag of each read is
         used as its edit distance. Kept reads are written to the output
         BAM.
      2. With ``--ref_fasta``, an mpileup of the filtered BAM is generated
         and every pileup column is tallied into substitution counts,
         which are printed as ``key<TAB>count`` lines sorted by key.

    Returns:
        dict mapping substitution type to count, or ``None`` when only
        filtering was requested.
    """
    args = parse_command_line_arguments()

    # --max_edit_dist always has a value (default 1000), so the presence of
    # --max_divergence is what selects the divergence filter.
    use_divergence = args.max_divergence is not None

    # defaults and naming
    if args.out:
        outbam = args.out
    elif use_divergence:
        outbam = '%slen%dder%d.bam' % (
            args.bam_file[:-3], args.max_length, args.max_divergence)
    else:
        outbam = '%slen%dNM%d.bam' % (
            args.bam_file[:-3], args.max_length, args.max_edit_dist)

    # fetch() needs an index; build one if "<bam>.bai" is not present.
    if not os.path.isfile(args.bam_file + '.bai'):
        pysam.index(args.bam_file)

    kept = 0
    with pysam.AlignmentFile(args.bam_file, "rb") as samfile:
        if args.verbose:
            print('%d reads before filtering' % samfile.count())
        with pysam.AlignmentFile(outbam, "wb", template=samfile) as tmpfile:
            for read in samfile.fetch():
                edit_dist = read.get_tag('NM')
                read_length = read.query_length
                if read_length > args.max_length:
                    continue
                if use_divergence:
                    # Percent of derived positions; reads without a stored
                    # sequence (length 0) have no defined divergence and
                    # are dropped instead of dividing by zero.
                    keep = (read_length > 0 and
                            100.0 * edit_dist / read_length
                            < args.max_divergence)
                else:
                    keep = edit_dist <= args.max_edit_dist
                if keep:
                    tmpfile.write(read)
                    kept += 1
    if args.verbose:
        print('%d reads after filtering' % kept)

    if args.filter_only:
        return None

    # pileup generation
    pileup = pysam.mpileup('-f', args.ref_fasta, outbam)
    # The filtered BAM is only an intermediate here; keep it when the user
    # explicitly named it with --out.
    if not args.out:
        os.remove(outbam)

    # pileup parsing
    subst = {'AA': 0, 'TT': 0, 'CC': 0, 'GG': 0,
             'AC': 0, 'AT': 0, 'AG': 0,
             'CA': 0, 'CT': 0, 'CG': 0,
             'TA': 0, 'TC': 0, 'TG': 0,
             'GA': 0, 'GC': 0, 'GT': 0,
             'ins': 0, 'del': 0, 'un': 0, 'un_ref': 0}
    known_bases = ('A', 'C', 'G', 'T')
    for line in pileup.split('\n'):
        if not line:
            continue
        # Columns used: [1] position, [2] reference base, [4] read bases.
        toks = line.split('\t')
        ref = toks[2].upper()
        counts = parseString(ref, toks[4]).types
        if ref not in known_bases:
            # Everything observed over an ambiguous reference base is
            # lumped together.
            subst['un_ref'] += sum(counts.values())
            continue
        for alt_type, count in counts.items():
            try:
                subst[alt_type] += count
            except KeyError:
                sys.exit('unexpected substitution type %r at position %s'
                         % (alt_type, toks[1]))

    # output
    print('\n'.join(['%s\t%d' % (k, v) for (k, v) in sorted(subst.items())]))
    return subst
# Run the tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()