-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathreads_to_remapped_variants.py
executable file
·352 lines (310 loc) · 15.8 KB
/
reads_to_remapped_variants.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
#! /usr/bin/env python3
import argparse
import copy
from argparse import RawTextHelpFormatter
from collections import Counter, defaultdict
import yaml
from Bio.Seq import Seq
from Bio.Alphabet import generic_dna
import pysam
nucleotide_alphabet = {'A', 'T', 'C', 'G'}
# Translation table covering the IUPAC nucleotide codes (upper and lower case),
# matching what Bio.Seq.reverse_complement produces for DNA.
_DNA_COMPLEMENT_TABLE = str.maketrans('ACGTUMRWSYKVHDBNacgtumrwsykvhdbn',
                                      'TGCAAKYWSRMBDHVNtgcaakywsrmbdhvn')


def reverse_complement(sequence):
    """
    Return the reverse complement of a DNA sequence.

    Implemented with a plain translation table instead of Bio.Seq because the
    previously used ``Bio.Alphabet.generic_dna`` was removed in Biopython 1.78,
    which makes the old implementation fail on modern Biopython installs.

    :param sequence: DNA sequence (IUPAC codes accepted, case preserved)
    :return: the reverse-complemented sequence as a plain string
    """
    return sequence.translate(_DNA_COMPLEMENT_TABLE)[::-1]
def calculate_new_variants_definition(left_read, right_read, ref_fasta, original_vcf_rec):
    """
    Resolve the variant definition from the flanking region alignment and old variant definition.

    Generator: yields one tuple ``(new_pos, new_ref, new_alts, operations, failure_reason)`` for
    the remapped record, and a second tuple when the old reference allele is novel in the new
    genome (the novel reference allele is then emitted as a separate record).

    TODO: Link to algorithm description once public

    :param left_read: the 5'-most aligned flank read
    :param right_read: the 3'-most aligned flank read
    :param ref_fasta: pysam FastaFile of the new genome
    :param original_vcf_rec: the original VCF record as a list of column strings (mutated nowhere here)
    """
    # Flag to highlight low confidence in an event detected
    failure_reason = None
    # Flag to store novel reference allele
    novel_reference_allele = None
    old_ref = original_vcf_rec[3]
    old_alts = original_vcf_rec[4].split(',')
    # 'operations' records what was done to the record; it is later serialised into the INFO field
    operations = {}
    contig = left_read.reference_name
    # Define new ref and new pos: the new reference allele is the sequence between the two flanks
    # (1-based position just after the left flank's end)
    new_ref = fetch_bases(ref_fasta, contig, left_read.reference_end + 1,
                          right_read.reference_start - left_read.reference_end).upper()
    new_pos = left_read.reference_end + 1
    # 1. Handle reference strand change
    if not left_read.is_reverse and not right_read.is_reverse:
        # Forward strand alignment: alleles carry over unchanged
        old_ref_conv = old_ref
        old_alt_conv = old_alts
        operations['st'] = '+'
    elif left_read.is_reverse and right_read.is_reverse:
        # Reverse strand alignment: both alleles must be reverse-complemented
        old_ref_conv = reverse_complement(old_ref)
        old_alt_conv = [reverse_complement(alt) for alt in old_alts]
        operations['st'] = '-'
    else:
        # This case should be handled by the filtering but raise just in case...
        error_msg = (f'Impossible read configuration: '
                     f'read1 is_reverse: {left_read.is_reverse}, '
                     f'read2 is_reverse: {right_read.is_reverse}, '
                     f'read1 position: {left_read.pos}, '
                     f'read2 position: {right_read.pos}')
        raise ValueError(error_msg)
    # 2. In the case of insertion or deletions we need to ensure the context base are updated correctly
    if any([len(old_ref) != len(alt) for alt in old_alts]):
        # On the negative strand the context base of the ref and alts needs to be removed
        # and replaced with the one upstream (shift the variant one base to the left)
        if new_ref and operations['st'] == '-':
            new_pos -= 1
            new_ref = fetch_bases(ref_fasta, contig, new_pos, len(new_ref)).upper()
            contextbase = new_ref[0]
            # Drop the (now trailing) context base and prepend the upstream one
            old_alt_conv = [contextbase + alt[:-1] for alt in old_alt_conv]
            # also change the old_ref_conv for consistency but it assumes that the base context base downstream
            # of the variant was the same in the old genome
            old_ref_conv = contextbase + old_ref_conv[:-1]
        # on the positive strand only modify the context base if it is different.
        elif new_ref and new_ref[0] != old_ref_conv[0]:
            contextbase = new_ref[0]
            old_alt_conv = [contextbase + alt[1:] for alt in old_alt_conv]
            old_ref_conv = contextbase + old_ref_conv[1:]
        # If new ref is empty then it's a reference allele length change that will be dealt with in next block
    # 3. Assign new allele sequences
    if new_ref == old_ref_conv:
        # Reference unchanged: keep the alternate alleles as they are
        new_alts = old_alt_conv
    elif new_ref in old_alt_conv:
        # Reference allele change ('rac'): the old ALT is now the reference, so swap the two —
        # the old REF becomes an ALT
        old_alt_conv.remove(new_ref)
        new_alts = old_alt_conv
        new_alts.append(old_ref_conv)
        operations['rac'] = f'{contig}|{new_pos}|{old_ref_conv}-{new_ref}'
        if len(old_ref_conv) != len(new_ref):
            failure_reason = 'Reference Allele length change'
    else:
        new_alts = old_alt_conv
        # new_alts.append(old_ref_conv)
        operations['rac'] = f'{contig}|{new_pos}|{old_ref_conv}-{new_ref}'
        # Instead of adding to the list of ALT, we'll produce the novel reference allele as a separate record
        novel_reference_allele = old_ref_conv
        if len(old_ref_conv) != len(new_ref):
            failure_reason = 'Novel Reference Allele length change'
    # 4. Correct zero-length reference sequence: pad with the upstream base (VCF-style context base)
    if len(new_ref) == 0:
        new_pos -= 1
        new_ref = fetch_bases(ref_fasta, contig, new_pos, 1).upper()
        new_alts = [new_ref + alt for alt in new_alts]
        operations['zlr'] = None
    # Flag reference alleles containing non-ACGT characters (e.g. N or ambiguity codes)
    if len(set(new_ref).difference(nucleotide_alphabet)) != 0:
        failure_reason = 'Reference Allele not in ACGT'
    yield new_pos, new_ref, new_alts, operations, failure_reason
    if novel_reference_allele:
        # Second record carrying the novel reference allele; shallow copy is enough because
        # the operation values are strings/None
        operations_new_rec = copy.copy(operations)
        operations_new_rec['nra'] = old_ref_conv
        yield new_pos, new_ref, [novel_reference_allele], operations_new_rec, failure_reason
def update_vcf_record(reference_name, varpos, new_ref, new_alts, operations, original_vcf_rec):
    """
    Update ``original_vcf_rec`` in place with the remapped coordinates and alleles.

    The ``operations`` mapping is serialised into the INFO column, and — when a reference
    allele change ('rac') occurred — the sample genotypes are adjusted accordingly.

    :param reference_name: contig name in the new genome
    :param varpos: 1-based position in the new genome
    :param new_ref: new reference allele
    :param new_alts: list of new alternate alleles
    :param operations: mapping of operation tag to value (None for flag-style operations)
    :param original_vcf_rec: VCF record as a list of column strings, mutated in place
    """
    original_vcf_rec[0] = reference_name
    original_vcf_rec[1] = str(varpos)
    original_vcf_rec[3] = new_ref
    original_vcf_rec[4] = ','.join(new_alts)
    # Serialise each operation as 'tag' (flag) or 'tag=value' and append to INFO
    op_strings = [key if value is None else f'{key}={value}' for key, value in operations.items()]
    existing_info = original_vcf_rec[7]
    if existing_info != '.':
        original_vcf_rec[7] = ';'.join(existing_info.strip(';').split(';') + op_strings)
    else:
        original_vcf_rec[7] = ';'.join(op_strings)
    # When the reference allele changed, the genotypes referring to it must be remapped too
    if 'rac' in operations and len(original_vcf_rec) > 8 and 'GT' in original_vcf_rec[8]:
        gt_index = original_vcf_rec[8].split(':').index('GT')
        for sample_index in range(9, len(original_vcf_rec)):
            sample_fields = original_vcf_rec[sample_index].split(':')
            if sample_fields[gt_index] == '1/1':
                # Homozygous for the old ALT which is now the reference
                sample_fields[gt_index] = '0/0'
            elif 'nra' in operations and sample_fields[gt_index] == '0/1':
                # With a novel reference allele, neither old allele matches the new reference
                sample_fields[gt_index] = '1/2'
            original_vcf_rec[sample_index] = ':'.join(sample_fields)
def fetch_bases(fasta, contig, start, length):
    """
    Return ``length`` bases of ``contig`` starting at the 1-based position ``start``.

    :param fasta: pysam FastaFile (or any object exposing the same ``fetch`` signature)
    :param contig: name of the contig to read from
    :param start: 1-based start coordinate
    :param length: number of bases to return
    """
    begin = start - 1  # pysam fetch is 0-based, half-open
    return fasta.fetch(reference=contig, start=begin, end=begin + length)
def group_reads(bam_file_path):
    """
    Iterate over a name-sorted BAM file and group consecutive reads that share a query name.

    Each group is split into three sub-lists: primary, supplementary and secondary alignments.

    :param bam_file_path: the name sorted bam file
    :return: iterator of tuples containing three lists (primary, supplementary, secondary)
    """
    with pysam.AlignmentFile(bam_file_path, 'rb') as inbam:
        grouped_name = None
        primaries = supplementaries = secondaries = None
        for read in inbam:
            if read.query_name != grouped_name:
                # A new query name starts a new group: emit the finished one first
                if grouped_name:
                    yield primaries, supplementaries, secondaries
                primaries, secondaries, supplementaries = [], [], []
                grouped_name = read.query_name
            if read.is_secondary:
                secondaries.append(read)
            elif read.is_supplementary:
                supplementaries.append(read)
            else:
                primaries.append(read)
        # Emit the last group (if the file was not empty)
        if primaries:
            yield primaries, supplementaries, secondaries
def order_reads(primary_group, primary_to_supplementary):
    """
    Return the two reads ordered with the most 5' (smallest coordinate) first.

    If a supplementary alignment exists and lies closer to the other read than its primary,
    it is used in place of that primary.

    :param primary_group: pair of primary reads
    :param primary_to_supplementary: mapping from primary read to its supplementary read list
    :return: tuple (left read, right read)
    """
    first, second = primary_group
    first_suppl = primary_to_supplementary[first][0] if first in primary_to_supplementary else None
    second_suppl = primary_to_supplementary[second][0] if second in primary_to_supplementary else None
    # Establish which primary is 5'-most, carrying its supplementary along
    if first.reference_start <= second.reference_start:
        left, left_suppl = first, first_suppl
        right, right_suppl = second, second_suppl
    else:
        left, left_suppl = second, second_suppl
        right, right_suppl = first, first_suppl
    # A supplementary downstream of the left primary (or upstream of the right one)
    # sits closer to the variant region, so prefer it
    if left_suppl and left_suppl.reference_start > left.reference_start:
        left = left_suppl
    if right_suppl and right_suppl.reference_start < right.reference_start:
        right = right_suppl
    return left, right
def pass_basic_filtering(primary_group, secondary_group, primary_to_supplementary, counter, filter_align_with_secondary):
    """
    Check the basic alignment filters: presence of secondary alignments (optional), any primary
    unmapped, primaries mapped to different chromosomes, or too many supplementary alignments.

    Increments the matching counter entry when a filter fails.

    :return: True if the group passes all filters, False otherwise
    """
    if filter_align_with_secondary and len(secondary_group):
        counter['Too many alignments'] += 1
        return False
    if len(primary_group) < 2 or any(read.is_unmapped for read in primary_group):
        counter['Flank unmapped'] += 1
        return False
    if len({read.reference_name for read in primary_group}) != 1:
        counter['Different chromosomes'] += 1
        return False
    if any(len(suppl) > 1 for suppl in primary_to_supplementary.values()):
        counter['Too many supplementary'] += 1
        return False
    return True
def pass_aligned_filtering(left_read, right_read, counter):
    """
    Check the post-ordering filters: soft-clipped ends next to the variant region, overlapping
    alignments, or inconsistent orientation between the two flanks.

    :param left_read: the left (or 5') most read
    :param right_read: the right (or 3') most read
    :param counter: Counter to report the number of reads filtered.
    :return: True or False
    """
    # In CIGAR tuples the operation is coded as an integer:
    # https://pysam.readthedocs.io/en/latest/api.html#pysam.AlignedSegment.cigartuples
    # A soft clip on the variant-facing end means the flank did not align up to the variant
    if left_read.cigartuples[-1][0] == pysam.CSOFT_CLIP or right_read.cigartuples[0][0] == pysam.CSOFT_CLIP:
        counter['Soft-clipped alignments'] += 1
        return False
    if left_read.reference_end > right_read.reference_start:
        counter['Overlapping alignment'] += 1
        return False
    if left_read.is_reverse != right_read.is_reverse:
        counter['Unexpected orientation'] += 1
        return False
    return True
def output_alignment(original_vcf_rec, outfile):
    """
    Write the (original or updated) VCF record to the provided output file as one tab-separated line.
    """
    outfile.write('\t'.join(original_vcf_rec) + '\n')
def link_supplementary(primary_group, supplementary_group):
    """
    Map each primary read to the supplementary reads referenced by its SA tag.

    :param primary_group: list of primary reads
    :param supplementary_group: list of supplementary reads belonging to the same query name
    :return: dict mapping primary read -> list of supplementary reads
    """
    if not supplementary_group:
        # No supplementary so no linking required
        return {}
    # Index supplementary reads by contig name + 1-based start, which is how the SA tag refers to them
    by_position = {
        suppl.reference_name + str(suppl.reference_start + 1): suppl
        for suppl in supplementary_group
    }
    links = defaultdict(list)
    for primary in primary_group:
        if primary.has_tag('SA'):
            # SA tag entries look like: chr2,808117,+,1211M790S,60,1;
            for entry in primary.get_tag('SA').split(';'):
                if entry:
                    rname, pos = entry.split(',')[:2]
                    links[primary].append(by_position[rname + pos])
    return dict(links)
def process_bam_file(bam_file_paths, output_file, out_failed_file, new_genome,
                     filter_align_with_secondary, flank_length, summary_file):
    """
    Drive the remapping of all variants stored in the provided BAM files.

    For each group of flanking-region reads, apply the alignment filters, compute the new
    variant definition and write the record either to the remapped output (updated) or to
    the failed output (unchanged). A YAML summary of per-category counts is written last.

    :param bam_file_paths: list of name-sorted BAM files containing remapped flanking regions
    :param output_file: path of the output receiving successfully remapped VCF records
    :param out_failed_file: path of the output receiving records that could not be remapped
    :param new_genome: FASTA file of the target genome
    :param filter_align_with_secondary: if True, discard groups that have secondary alignments
    :param flank_length: flank length used, only reported as the summary's top-level key
    :param summary_file: path of the YAML summary file
    """
    counter = Counter()
    # Target genome, used to fetch the new reference alleles.
    # NOTE(review): this FastaFile handle is never explicitly closed — presumably acceptable
    # for a one-shot script, but worth confirming.
    fasta = pysam.FastaFile(new_genome)
    with open(output_file, 'w') as outfile, open(out_failed_file, 'w') as out_failed:
        for bam_file_path in bam_file_paths:
            for primary_group, supplementary_group, secondary_group in group_reads(bam_file_path):
                counter['total'] += 1
                primary_to_supplementary = link_supplementary(primary_group, supplementary_group)
                # Retrieve the full VCF record from the bam vr tag ('|^' is the column separator)
                original_vcf_rec = primary_group[0].get_tag('vr').split('|^')
                if not pass_basic_filtering(primary_group, secondary_group, primary_to_supplementary, counter,
                                            filter_align_with_secondary):
                    output_alignment(original_vcf_rec, out_failed)
                    continue
                left_read, right_read = order_reads(primary_group, primary_to_supplementary)
                if not pass_aligned_filtering(left_read, right_read, counter):
                    output_alignment(original_vcf_rec, out_failed)
                    continue
                failure_reasons = set()
                # The generator may yield a second record when a novel reference allele is found
                for varpos, new_ref, new_alts, ops, failure_reason in calculate_new_variants_definition(
                        left_read, right_read, fasta, original_vcf_rec):
                    if not failure_reason:
                        counter['Remapped'] += 1
                        # NOTE(review): original_vcf_rec is mutated in place, so a second yielded
                        # record is applied on top of the already-updated one — confirm intended.
                        update_vcf_record(left_read.reference_name, varpos, new_ref, new_alts, ops, original_vcf_rec)
                        output_alignment(original_vcf_rec, outfile)
                    else:
                        failure_reasons.add(failure_reason)
                if failure_reasons:
                    # Currently the alignment is not precise enough to ensure that the allele change for INDEL and
                    # novel reference allele are correct. So we skip them.
                    # TODO: add realignment confirmation see #14 and EVA-2417
                    counter[','.join(failure_reasons)] += 1
                    output_alignment(original_vcf_rec, out_failed)
    with open(summary_file, 'w') as open_summary:
        yaml.safe_dump({f'Flank_{flank_length}': dict(counter)}, open_summary)
def main():
    """Command-line entry point: parse the arguments and launch the remapping."""
    description = ('Process alignment results in bam format to determine the location of the variant in the new genome.'
                   ' Each variant will be either output in the new genome VCF or the old VCF will be output in a '
                   'separate file.')
    argparser = argparse.ArgumentParser(description=description, formatter_class=RawTextHelpFormatter)
    argparser.add_argument('-i', '--bams', type=str, required=True, nargs='+',
                           help='Input BAM file with remapped flanking regions')
    argparser.add_argument('-o', '--outfile', type=str, required=True,
                           help='Output VCF file with remapped variants')
    argparser.add_argument('--out_failed_file', type=str, required=True,
                           help='Name of the file containing reads that did not align correctly')
    argparser.add_argument('--flank_length', type=int, required=True,
                           help='Length of the flanking region used.')
    argparser.add_argument('--summary', type=str, required=True,
                           help='YAML files containing the summary metrics')
    argparser.add_argument('-f', '--filter_align_with_secondary', action='store_true', default=False,
                           help='Filter out alignments that have one or several secondary alignments.')
    argparser.add_argument('-n', '--newgenome', required=True, help='FASTA file of the target genome')
    cli_args = argparser.parse_args()
    process_bam_file(
        bam_file_paths=cli_args.bams,
        output_file=cli_args.outfile,
        out_failed_file=cli_args.out_failed_file,
        new_genome=cli_args.newgenome,
        filter_align_with_secondary=cli_args.filter_align_with_secondary,
        flank_length=cli_args.flank_length,
        summary_file=cli_args.summary
    )


if __name__ == '__main__':
    main()