-
Notifications
You must be signed in to change notification settings - Fork 11
/
Copy pathget_flipflop_config.py
163 lines (139 loc) · 6.65 KB
/
get_flipflop_config.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import parasail
import sys
import pysam
from csv import DictReader
from Bio import SeqIO
"""
Get ITR flip/flop configurations.
Must have already run `summarize_AAV_alignment.py` to get a .tagged.BAM file!
Required: the sorted tagged BAM file, the per-read CSV file, and an output prefix (-o).
"""
# Substitution matrix used for every Smith-Waterman alignment in this file:
# built over the alphabet "ACGT" with the values 2 and -5
# (match / mismatch scores in parasail's matrix_create).
SW_SCORE_MATRIX = parasail.matrix_create("ACGT", 2, -5)
# Default ITR reference sequences for the left/right flip and flop
# configurations (the command-line help describes these as the AAV2 default).
# read_flip_flop_fasta() rebinds all four when a custom FASTA is supplied.
# SEQ_LEFT_FLIP is written in lowercase and normalised with .upper().
SEQ_LEFT_FLIP='ttggccactccctctctgcgcgctcgctcgctcactgaggccgggcgaccaaaggtcgcccgacgcccgggctttgcccgggcggcctcagtgagcgagcgagcgcgcagagagggagtggccaactccatcactaggggttcct'.upper()
SEQ_LEFT_FLOP='TTGGCCACTCCCTCTCTGCGCGCTCGCTCGCTCACTGAGGCCGCCCGGGCAAAGCCCGGGCGTCGGGCGACCTTTGGTCGCCCGGCCTCAGTGAGCGAGCGAGCGCGCAGAGAGGGAGTGGCCAACTCCATCACTAGGGGTTCCT'
SEQ_RIGHT_FLIP='AGGAACCCCTAGTGATGGAGTTGGCCACTCCCTCTCTGCGCGCTCGCTCGCTCACTGAGGCCGCCCGGGCAAAGCCCGGGCGTCGGGCGACCTTTGGTCGCCCGGCCTCAGTGAGCGAGCGAGCGCGCAGAGAGGGAGTGGCCAA'
SEQ_RIGHT_FLOP='AGGAACCCCTAGTGATGGAGTTGGCCACTCCCTCTCTGCGCGCTCGCTCGCTCACTGAGGCCGGGCGACCAAAGGTCGCCCGACGCCCGGGCTTTGCCCGGGCGGCCTCAGTGAGCGAGCGAGCGCGCAGAGAGGGAGTGGCCAA'
# Disabled start-position constants; nothing in this file reads them.
#POS_LEFT_FLIP=0 # start position
#POS_RIGHT_FLIP=4603 # start position
def read_flip_flop_fasta(fasta_filename):
    """
    Replace the default ITR flip/flop reference sequences with those in a FASTA file.

    The FASTA must contain one record for each of the IDs SEQ_LEFT_FLIP,
    SEQ_LEFT_FLOP, SEQ_RIGHT_FLIP and SEQ_RIGHT_FLOP. Records with any other
    ID are reported with a warning and ignored. If an ID appears more than
    once, the last record with that ID is used.

    fasta_filename: path to the FASTA file to read.

    On success the four module-level SEQ_* globals are rebound to the
    sequences read. If any required ID is missing, an error naming the first
    missing ID is printed and the process exits with status -1; the globals
    are left untouched in that case.
    """
    global SEQ_LEFT_FLIP
    global SEQ_LEFT_FLOP
    global SEQ_RIGHT_FLIP
    global SEQ_RIGHT_FLOP

    # Order matters: it decides which missing ID is reported first.
    required_ids = ('SEQ_LEFT_FLIP', 'SEQ_LEFT_FLOP', 'SEQ_RIGHT_FLIP', 'SEQ_RIGHT_FLOP')

    # Track presence by ID rather than by summing bit flags: an arithmetic
    # flag would carry into a neighbouring bit when an ID is duplicated and
    # misreport which sequences were seen.
    found = {}
    print(f"Reading {fasta_filename}....")
    with open(fasta_filename) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            if r.id in required_ids:
                found[r.id] = str(r.seq)
            else:
                print("WARNING: Sequence IDs must be SEQ_LEFT_FLIP|SEQ_LEFT_FLOP|SEQ_RIGHT_FLIP|SEQ_RIGHT_FLOP. Is {0} instead. Ignoring!".format(r.id))

    # check that all 4 needed sequence IDs are seen
    for seq_id in required_ids:
        if seq_id not in found:
            print("ERROR: {0} is not given. Abort!".format(seq_id))
            sys.exit(-1)

    SEQ_LEFT_FLIP = found['SEQ_LEFT_FLIP']
    SEQ_LEFT_FLOP = found['SEQ_LEFT_FLOP']
    SEQ_RIGHT_FLIP = found['SEQ_RIGHT_FLIP']
    SEQ_RIGHT_FLOP = found['SEQ_RIGHT_FLOP']
def _sw_score(query, target):
    """Return the Smith-Waterman score of query vs target (gap open 3, gap extend 1)."""
    return parasail.sw_trace(query, target, 3, 1, SW_SCORE_MATRIX).score


def _pick_config(flip_score, flop_score, min_score=250):
    """Return 'flip' or 'flop' for the strictly higher score if it exceeds min_score, else 'unknown'."""
    if flip_score > flop_score and flip_score > min_score:
        return 'flip'
    if flop_score > flip_score and flop_score > min_score:
        return 'flop'
    return 'unknown'


def identify_flip_flop(r):
    """
    Determine the flip/flop configuration of the left and right ITR of a read.

    r: an alignment record exposing `tags` (iterable of (tag, value) pairs)
       and `query` (the read sequence used for alignment). The record must
       carry an `AX` tag with one of vector-full, vector-left-partial,
       vector-right-partial or vector-partial.

    Returns a tuple (config_left, config_right); each element is 'flip',
    'flop' or 'unknown'. A side is only evaluated when the AX subtype says
    that side is present: left for full/left-partial, right for
    full/right-partial. 'vector-partial' always yields ('unknown', 'unknown').

    If the `AX` tag is missing or has an unexpected value, an error is
    printed and the process exits with status -1.
    """
    t = dict(r.tags)
    subtype = t.get('AX')
    # Explicit check instead of `assert`: asserts are stripped under
    # `python -O`, and a missing tag must produce the same message as a bad one.
    if subtype not in ('vector-full', 'vector-left-partial', 'vector-right-partial', 'vector-partial'):
        print("Input BAM records must have a `AX` tag assigned by first running summarize_AAV_alignment.py. Abort!")
        sys.exit(-1)

    config_left, config_right = 'unknown', 'unknown'
    if subtype == 'vector-partial':  # ignore, since both sides are missing chunks of ITR
        return config_left, config_right

    if subtype in ('vector-full', 'vector-left-partial'):
        # The left ITR is searched for in the whole query sequence.
        config_left = _pick_config(_sw_score(r.query, SEQ_LEFT_FLIP),
                                   _sw_score(r.query, SEQ_LEFT_FLOP))

    if subtype in ('vector-full', 'vector-right-partial'):
        # The right ITR is searched for only in the query tail: the reference
        # length plus 10 extra bases.
        config_right = _pick_config(
            _sw_score(r.query[-len(SEQ_RIGHT_FLIP)-10:], SEQ_RIGHT_FLIP),
            _sw_score(r.query[-len(SEQ_RIGHT_FLOP)-10:], SEQ_RIGHT_FLOP))

    return config_left, config_right
def main(per_read_csv, tagged_bam, output_prefix):
    """
    Assign flip/flop ITR configurations to vector reads and write the results.

    per_read_csv:  tab-delimited per-read table with at least the columns
                   `read_id` and `assigned_type`.
    tagged_bam:    path to the tagged BAM file; records are expected to carry
                   `AT` and `AX` tags.
    output_prefix: prefix for all output files.

    Only records with AT == 'vector', an AX of vector-full /
    vector-left-partial / vector-right-partial, and an assigned_type of
    scAAV or ssAAV are processed. Each such record is written, with added
    tags AF (left-right configuration) and AG (assigned type), to the BAM
    matching its AX subtype:
        <prefix>.vector-full-flipflop.bam
        <prefix>.vector-leftpartial-flipflop.bam
        <prefix>.vector-rightpartial-flipflop.bam
    and summarised as one line in <prefix>.flipflop_assignments.txt.

    Raises KeyError if a processed read name is absent from per_read_csv.
    """
    with open(per_read_csv) as csv_handle:
        read_info = {row['read_id']: row for row in DictReader(csv_handle, delimiter='\t')}

    summary_path = output_prefix + '.flipflop_assignments.txt'
    # AX subtype -> label used in the output BAM file name. Keeping the two
    # side by side ensures each partial subtype lands in the file named for it.
    bam_labels = (
        ('vector-full', 'full'),
        ('vector-left-partial', 'leftpartial'),
        ('vector-right-partial', 'rightpartial'),
    )
    wanted_subtypes = tuple(subtype for subtype, _ in bam_labels)

    # Paths are handed to pysam directly so that it opens the BAM files itself.
    with pysam.AlignmentFile(tagged_bam, 'rb', check_sq=False) as reader, \
            open(summary_path, 'w') as fout:
        fout.write("name\ttype\tsubtype\tstart\tend\tleftITR\trightITR\n")
        writers = {}
        try:
            for subtype, label in bam_labels:
                writers[subtype] = pysam.AlignmentFile(
                    output_prefix + '.vector-' + label + '-flipflop.bam', 'wb',
                    header=reader.header)

            for r in reader:
                t = dict(r.tags)
                # .get(): records without AT/AX tags are skipped, not fatal.
                if t.get('AT') != 'vector' or t.get('AX') not in wanted_subtypes:
                    continue
                a_type = read_info[r.qname]['assigned_type']
                # Filter before aligning so no alignment work is spent on
                # reads that will not be reported.
                if a_type not in ('scAAV', 'ssAAV'):
                    continue
                c_l, c_r = identify_flip_flop(r)
                d = r.to_dict()
                d['tags'].append('AF:Z:' + c_l + '-' + c_r)
                d['tags'].append('AG:Z:' + a_type)
                writers[t['AX']].write(pysam.AlignedSegment.from_dict(d, r.header))
                fout.write(r.qname + '\t' + a_type + '\t' + t['AX'] + '\t' +
                           str(r.reference_start) + '\t' +
                           str(r.reference_end) + '\t' +
                           c_l + '\t' + c_r + '\n')
        finally:
            for writer in writers.values():
                writer.close()

    print("Output summmary: {0}".format(summary_path))
    print(f"Indidual BAM files written: {output_prefix}.vector- full,leftpartial,rightpartial -flipflop.bam")
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, optionally load custom
    # flip/flop reference sequences, then run the assignment.
    from argparse import ArgumentParser

    cli = ArgumentParser()
    cli.add_argument("sorted_tagged_bam", help="Sorted tagged BAM file")
    cli.add_argument("per_read_csv", help="Per read CSV file")
    cli.add_argument("-o", "--output_prefix", help="Output prefix", required=True)
    cli.add_argument(
        "--flipflop_fasta",
        default=None,
        help="(optional) flip flop fasta file (if not given, uses AAV2 default)",
    )
    opts = cli.parse_args()

    # The built-in reference sequences stay in effect unless a FASTA is given.
    custom_fasta = opts.flipflop_fasta
    if custom_fasta is not None:
        read_flip_flop_fasta(custom_fasta)

    # main() takes the CSV first, then the BAM, unlike the CLI argument order.
    main(opts.per_read_csv, opts.sorted_tagged_bam, opts.output_prefix)