From da30d629d6a87f673a30321fa31a0081dd37fa3a Mon Sep 17 00:00:00 2001 From: Albert Tian Chen Date: Sun, 19 May 2024 14:11:57 -0400 Subject: [PATCH] Tolerate frameshifting indels as long as they are resolved (#646) * Tolerate frameshifting indels as long as they are resolved (no terminal frameshift) within 10 codons * Quick shortcut to speed up processing of synonymous mutations * Fix bug where index popping results in index overflow * fix typo * Ignore test files --- .gitignore | 1 + workflow_main/scripts/extract_aa_mutations.py | 199 ++++++++++++++---- 2 files changed, 163 insertions(+), 37 deletions(-) diff --git a/.gitignore b/.gitignore index bcf85cc3..860a92f7 100644 --- a/.gitignore +++ b/.gitignore @@ -145,6 +145,7 @@ data_flu_genbank # scratch notebooks workflow_main/notebooks/** +workflow_main/scripts/*.csv # SnapGene - temp files static_data/flu/alignments/.sglock/** diff --git a/workflow_main/scripts/extract_aa_mutations.py b/workflow_main/scripts/extract_aa_mutations.py index aab825a5..def7697e 100755 --- a/workflow_main/scripts/extract_aa_mutations.py +++ b/workflow_main/scripts/extract_aa_mutations.py @@ -12,6 +12,9 @@ from util import translate +# Number of codons to look ahead for resolving frameshifts +FS_LOOKAHEAD = 10 + def extract_aa_mutations( dna_mutation_file, @@ -23,7 +26,7 @@ def extract_aa_mutations( ): """ Extract AA mutations from NT mutations - + Parameters ---------- dna_mutation_file: str @@ -79,45 +82,48 @@ def extract_aa_mutations( feature_dfs[k] = v - dna_mutation_df = pd.read_csv(dna_mutation_file).fillna("") + dna_mutation_df = pd.read_csv(dna_mutation_file, index_col=False).fillna("") + # Filter out any big mutations in the 5' or 3' UTR # dna_mutation_df = dna_mutation_df.loc[ # (dna_mutation_df["pos"] < 29675) & (dna_mutation_df["pos"] > 265), : # ].reset_index(drop=True) + # Filter out any frameshifting indels - dna_mutation_df = dna_mutation_df.loc[ - ( - (dna_mutation_df["ref"].str.len() == 1) - & (dna_mutation_df["alt"].str.len() == 1) - ) - | ( - (dna_mutation_df["ref"].str.len() > 1) - & (dna_mutation_df["alt"].str.len() == 0) - & (dna_mutation_df["ref"].str.len() % 3 == 0) - ) - | ( - (dna_mutation_df["alt"].str.len() > 1) - & (dna_mutation_df["ref"].str.len() == 0) - & (dna_mutation_df["alt"].str.len() % 3 == 0) - ) - ].reset_index(drop=True) + # dna_mutation_df = dna_mutation_df.loc[ + # ( + # (dna_mutation_df["ref"].str.len() == 1) + # & (dna_mutation_df["alt"].str.len() == 1) + # ) + # | ( + # (dna_mutation_df["ref"].str.len() > 1) + # & (dna_mutation_df["alt"].str.len() == 0) + # & (dna_mutation_df["ref"].str.len() % 3 == 0) + # ) + # | ( + # (dna_mutation_df["alt"].str.len() > 1) + # & (dna_mutation_df["ref"].str.len() == 0) + # & (dna_mutation_df["alt"].str.len() % 3 == 0) + # ) + # ].reset_index(drop=True) aa_mutations = [] aa_seqs = {} + # For each reference for reference, feature_df in feature_dfs.items(): + # For each feature (gene/protein) for feature_name, feature_row in feature_df.iterrows(): # Only process genes in this segment/chromosome if feature_row["segment"] != active_segment: continue - # print(feature_name) - resi_counter = 0 aa_seqs[feature_name] = [] + # For each segment (ORF) in this feature: for segment in feature_row["segments"]: # Get the region in coordinates to translate/look for mutations in segment_start = segment[0] @@ -152,18 +158,91 @@ def extract_aa_mutations( - segment_start ) // 3 + # GROUP MUTATIONS + # Group together individual mutations to process them as a single mutation + # Criteria: + # - Same Accession ID + # - Mutations are within codon range + # For each NT mutation in this segment: + segment_mutations = [] i = 0 while i < len(segment_mutation_df): cur_mutation = segment_mutation_df.iloc[i, :] + # If the current mutation results in a frameshift, + # then seek to resolve the frameshift by looking ahead FS_LOOKAHEAD codons + # i.e., if this mutation is a single deletion, then + # look ahead for a single insertion to resolve the frame + + # Number of bases off from the frame + # Use a modulo of 3 later to get the # of bases off + frameshift = 0 + + if len(cur_mutation["alt"]) - len(cur_mutation["ref"]) % 3 != 0: + frameshift += len(cur_mutation["alt"]) - len( + cur_mutation["ref"] + ) + + # If we have a frameshift, then look ahead FS_LOOKAHEAD codons + # If no frameshift resolution is possible then toss this mutation and move on + # Keep track of when the indel resolves + resolve_codon_ind = None + if frameshift % 3 != 0 and i < len(segment_mutation_df) - 1: + j = i + 1 + new_mutation = segment_mutation_df.iloc[j] + while ( + new_mutation["codon_ind_start"] + <= cur_mutation["codon_ind_start"] + FS_LOOKAHEAD + and new_mutation["Accession ID"] + == cur_mutation["Accession ID"] + ): + # Adjust frameshift if the new mutation is itself a frameshifting indel + if ( + len(new_mutation["alt"]) - len(new_mutation["ref"]) % 3 + != 0 + ): + frameshift += len(new_mutation["alt"]) - len( + new_mutation["ref"] + ) + + # If the frameshift is resolved, then flag as resolved and break + if frameshift % 3 == 0: + resolve_codon_ind = new_mutation["codon_ind_end"] + break + + j += 1 + if j < len(segment_mutation_df): + new_mutation = segment_mutation_df.iloc[j] + else: + break + + # If there's no resolution to the frameshift within FS_LOOKAHEAD codons, + # then ignore this mutation and move onto the next mutation in the list + if resolve_codon_ind is None: + i += 1 + continue + + # If this was a singular frameshift mutation without a resolution, + # Then no possibility to resolve the frameshift + # Just ignore this mutation + # TODO: process nonsense frameshifts if near the end of the ORF? + if frameshift % 3 != 0 and resolve_codon_ind is None: + i += 1 + continue + # Look ahead in the mutation list (ordered by position) - # for any other mutations within this codon + # for any other mutations within the codon range j = i + 1 - mutations = [cur_mutation] + mutations = [cur_mutation.to_dict()] codon_ind_start = cur_mutation["codon_ind_start"] codon_ind_end = cur_mutation["codon_ind_end"] + # If we have a resolvable frameshift, then set the mutation combination window + # From here til where the indel resolves + if resolve_codon_ind is not None: + codon_ind_end = resolve_codon_ind + if j < len(segment_mutation_df): new_mutation = segment_mutation_df.iloc[j] while ( @@ -171,9 +250,20 @@ def extract_aa_mutations( and new_mutation["Accession ID"] == cur_mutation["Accession ID"] ): - mutations.append(new_mutation) - # Update end position - codon_ind_end = new_mutation["codon_ind_end"] + # If we weren't expecting a frameshift and this mutation + # produces a frameshift, then ignore this mutation + if resolve_codon_ind is None and ( + len(new_mutation["alt"]) - len(new_mutation["ref"]) % 3 + != 0 + ): + pass + else: + mutations.append(new_mutation.to_dict()) + + # Update end position if beyond current end position + if new_mutation["codon_ind_end"] > codon_ind_end: + codon_ind_end = new_mutation["codon_ind_end"] + j += 1 if j < len(segment_mutation_df): new_mutation = segment_mutation_df.iloc[j] @@ -183,6 +273,16 @@ def extract_aa_mutations( # Increment our counter so we don't reprocess any grouped mutations i = j + if mutations: + segment_mutations.append(mutations) + + # PROCESS GROUPED MUTATIONS + for mutations in segment_mutations: + + codon_ind_start = mutations[0]["codon_ind_start"] + codon_ind_end = mutations[-1]["codon_ind_end"] + cur_mutation = mutations[0] + # Get region start/end, 0-indexed region_start = segment_start + (codon_ind_start * 3) - 1 region_end = segment_start + (codon_ind_end * 3) + 2 @@ -197,13 +297,11 @@ def extract_aa_mutations( initial_region_seq = [s for s in region_seq] # Translate the reference region sequence ref_aa = list(translate("".join(region_seq))) - # print(region_seq, ref_aa) - - # print([mut['pos'] for mut in mutations]) - # print(region_seq, ref_aa) - - for k, mut in enumerate(mutations): + # Process mutations in reverse order to preserve indexing + for k, mut in zip( + range(len(mutations) - 1, -1, -1), mutations[::-1] + ): # Make sure the reference matches if len(mut["ref"]) > 0: ref_mutation_seq = "".join( @@ -215,8 +313,9 @@ def extract_aa_mutations( ) if not ref_mutation_seq == mut["ref"]: print( - "REF MISMATCH:\n\tReference sequence:\t{}\n\tMutation sequence\t\t{}\n".format( - ref_mutation_seq, mut["ref"], + "REF MISMATCH:\n\tReference sequence:\t{}\n\tMutation sequence:\t{}\n".format( + ref_mutation_seq, + mut["ref"], ) ) continue @@ -234,7 +333,10 @@ def extract_aa_mutations( # Translate the new region alt_aa = list(translate("".join(region_seq))) - # print(region_seq, alt_aa) + + # If synonymous mutation, then move on: + if ref_aa == alt_aa: + continue # Remove matching AAs from the start of ref_aa # i.e., if ref = 'FF', and alt = 'F', then: @@ -256,7 +358,27 @@ def extract_aa_mutations( for ind in remove_inds[::-1]: ref_aa.pop(ind) alt_aa.pop(ind) - # print(ref_aa, alt_aa) + + # Remove matching AAs from the end of ref_aa + # i.e., if ref = 'SRG', and alt = 'KG', then: + # ref = 'SR' and alt = 'K' + remove_inds = [] + for b in range(max(len(ref_aa), len(alt_aa))): + if b >= len(ref_aa) or b >= len(alt_aa): + break + + if ref_aa[-1 - b] == alt_aa[-1 - b]: + remove_inds.append(b) + # Increment the AA index end so that + # we end up on the correct position + # (This should only affect deletions) + codon_ind_end -= 1 + else: + break + + for ind in remove_inds[::-1]: + ref_aa.pop(-1 - ind) + alt_aa.pop(-1 - ind) # If there's no mutation, or synonymous mutation, # then move on @@ -265,7 +387,7 @@ def extract_aa_mutations( pos = resi_counter + codon_ind_start + 1 - # If the positiion is outside of the segment, then skip + # If the position is outside of the segment, then skip # (This happens sometimes for long deletions) if pos > (resi_counter + segment_len): continue @@ -369,7 +491,10 @@ def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--dna-mutation", type=str, required=True, help="Path to DNA mutation CSV", + "--dna-mutation", + type=str, + required=True, + help="Path to DNA mutation CSV", ) parser.add_argument( "--gene-protein-def",