diff --git a/src/utils/mutationUtils.js b/src/utils/mutationUtils.js index 37723f11..4877229f 100644 --- a/src/utils/mutationUtils.js +++ b/src/utils/mutationUtils.js @@ -20,9 +20,18 @@ export const formatMutation = (mutationStr, dnaOrAa) => { return `${chunks[2]}${chunks[1]}${chunks[3]}`; } } else if (dnaOrAa === DNA_OR_AA.AA) { + let feature = chunks[0]; + let pos = parseInt(chunks[1]); let ref = chunks[2]; let alt = chunks[3]; + // For mutations with multiple reference AAs affected, display + // the position as a range of affected positions + // e.g., LPPA24S --> LPPA24/27S + if (ref.length > 1) { + pos = `${pos}/${pos + ref.length - 1}`; + } + // Translate stop codon from "_" to more conventional "*" if (ref === '_') { ref = '*'; @@ -44,6 +53,6 @@ export const formatMutation = (mutationStr, dnaOrAa) => { ref = ''; } - return `${chunks[0]}:${ref}${chunks[1]}${alt}`; + return `${feature}:${ref}${pos}${alt}`; } }; diff --git a/workflow_main/scripts/build_full_dataframe.py b/workflow_main/scripts/build_full_dataframe.py index 4d5274fb..9a98cb5e 100644 --- a/workflow_main/scripts/build_full_dataframe.py +++ b/workflow_main/scripts/build_full_dataframe.py @@ -40,6 +40,34 @@ def main(): v: k for k, v in metadata_map["protein_aa_mutation"].items() } + # Affected positions map + gene_aa_affected_positions_map = {} + for k, v in metadata_map["gene_aa_mutation"].items(): + feature = k.split("|")[0] + pos = int(k.split("|")[1]) + ref = k.split("|")[2] + gene_aa_affected_positions_map[v] = ";".join( + [f"{feature}|{x}" for x in range(pos, pos + len(ref))] + ) + + protein_aa_affected_positions_map = {} + for k, v in metadata_map["protein_aa_mutation"].items(): + feature = k.split("|")[0] + pos = int(k.split("|")[1]) + ref = k.split("|")[2] + protein_aa_affected_positions_map[v] = ";".join( + [f"{feature}|{x}" for x in range(pos, pos + len(ref))] + ) + + # Add affected positions to dataframe + df.loc[:, "gene_aa_affected_positions"] = df["gene_aa_mutation"].apply( + lambda x: ";".join([gene_aa_affected_positions_map[i] for i in x]) + ) + df.loc[:, "protein_aa_affected_positions"] = df["protein_aa_mutation"].apply( + lambda x: ";".join([protein_aa_affected_positions_map[i] for i in x]) + ) + + # Convert mutation IDs back to mutation strings df.loc[:, "dna_mutation"] = df["dna_mutation"].apply( lambda x: ";".join([dna_mutation_map[i] for i in x]) ) @@ -50,7 +78,7 @@ def main(): lambda x: ";".join([protein_aa_mutation_map[i] for i in x]) ) - # Serialize coverage + # Serialize coverage data df.loc[:, "dna_range"] = df["dna_range"].apply( lambda rngs: ";".join([f"{rng[0]}-{rng[1]}" for rng in rngs]) ) diff --git a/workflow_main/scripts/extract_aa_mutations.py b/workflow_main/scripts/extract_aa_mutations.py index def7697e..7712cc9b 100755 --- a/workflow_main/scripts/extract_aa_mutations.py +++ b/workflow_main/scripts/extract_aa_mutations.py @@ -158,8 +158,9 @@ def extract_aa_mutations( - segment_start ) // 3 - # GROUP MUTATIONS - # Group together individual mutations to process them as a single mutation + # GROUP DNA MUTATIONS + # Group together individual DNA mutations to process them + # as a single DNA mutation # Criteria: # - Same Accession ID # - Mutations are within codon range @@ -425,6 +426,7 @@ def extract_aa_mutations( # - position is the same # - one mutation is an indel, other is a substitution + """ if len(aa_mutations) > 0: cur_mutation = aa_mutations[0] i = 1 @@ -432,7 +434,7 @@ def extract_aa_mutations( next_mutation = aa_mutations[i] - # (Accession ID, gene/protein, pos, ref, alt) + # (reference, Accession ID, gene/protein, pos, ref, alt) if ( cur_mutation[0] == next_mutation[0] and cur_mutation[1] == next_mutation[1] @@ -468,6 +470,86 @@ def extract_aa_mutations( # No merging, move on cur_mutation = aa_mutations[i] i += 1 + """ + + # SPLIT AA MUTATIONS + # ------------------ + # Split AA mutations: + # 1. Split deletions of multiple residues into individual deletions + # - e.g., ∆HV69 --> ∆H69, ∆V70 + # - Only applicable to pure deletions, i.e., no substitutions + # 2. Mutations will be “ungrouped” if possible + # - e.g., FR157SG --> F157S, R158G + # - Only applicable for substitutions of the same residue length + + # 1. SPLIT DELETIONS + old_aa_mutation_inds = [] + new_aa_mutations = [] # (new_mutation, insertion_index) + for i, aa_mutation in enumerate(aa_mutations): + # (reference, Accession ID, gene/protein, pos, ref, alt) + ref = aa_mutation[4] + alt = aa_mutation[5] + # Skip if the mutation is not a pure deletion of more than 1 residue + if len(alt) > 0 or len(ref) == 1: + continue + + # Split the deletion into individual deletions + for j, r in enumerate(ref): + new_aa_mutations.append( + ( + ( + aa_mutation[0], + aa_mutation[1], + aa_mutation[2], + aa_mutation[3] + j, + r, + "", + ), + i + j, + ) + ) + + old_aa_mutation_inds.append(i) + + # Remove old deletion mutations + for i in old_aa_mutation_inds[::-1]: + del aa_mutations[i] + # Add new mutations + for new_mutation, insertion_index in new_aa_mutations: + aa_mutations.insert(insertion_index, new_mutation) + + # 2. SPLIT SUBSTITUTIONS + old_aa_mutation_inds = [] + new_aa_mutations = [] # (new_mutation, insertion_index) + for i, aa_mutation in enumerate(aa_mutations): + # (reference, Accession ID, gene/protein, pos, ref, alt) + ref = aa_mutation[4] + alt = aa_mutation[5] + # Skip if the mutation is not a pure substitution of more than 1 residue + if len(alt) == 0 or len(ref) == 0 or len(ref) != len(alt) or len(ref) == 1: + continue + + # Split the substitution into individual substitutions + for j, (a, b) in enumerate(zip(ref, alt)): + new_aa_mutations.append(( + ( + aa_mutation[0], + aa_mutation[1], + aa_mutation[2], + aa_mutation[3] + j, + a, + b, + ), i + j + )) + + old_aa_mutation_inds.append(i) + + # Remove old deletion mutations + for i in old_aa_mutation_inds[::-1]: + del aa_mutations[i] + # Add new mutations + for new_mutation, insertion_index in new_aa_mutations: + aa_mutations.insert(insertion_index, new_mutation) aa_mutation_df = pd.DataFrame.from_records( aa_mutations, diff --git a/workflow_main/scripts/process_mutations.py b/workflow_main/scripts/process_mutations.py index 81ea4666..e5e05d63 100644 --- a/workflow_main/scripts/process_mutations.py +++ b/workflow_main/scripts/process_mutations.py @@ -19,7 +19,7 @@ def process_mutations( mode="dna", # dna, gene_aa, protein_aa ): """Process mutation data - + Parameters ---------- manifest: pandas.DataFrame @@ -30,7 +30,7 @@ def process_mutations( - Mutations must occur at least this many times to pass filters mode: string - dna, gene_aa, protein_aa - + Returns ------- out: tuple of pandas.DataFrames @@ -120,10 +120,12 @@ def process_mutations( ) # Map mutations to integer IDs - mutation_map = pd.Series(mutation_df["mutation_str"].unique()) + mutation_to_id_map = pd.Series(mutation_df["mutation_str"].unique()) # Flip index and values - mutation_map = pd.Series(mutation_map.index.values, index=mutation_map) - mutation_df["mutation_id"] = mutation_df["mutation_str"].map(mutation_map) + mutation_to_id_map = pd.Series( + mutation_to_id_map.index.values, index=mutation_to_id_map + ) + mutation_df["mutation_id"] = mutation_df["mutation_str"].map(mutation_to_id_map) mutation_group_df = mutation_df.groupby( ["Accession ID", "reference"], as_index=False @@ -141,11 +143,11 @@ def process_mutations( ) # Fill NaNs with empty arrays - mutation_group_df.loc[ - mutation_group_df["mutation_id"].isna(), "mutation_id" - ] = pd.Series( - [[]] * mutation_group_df["mutation_id"].isna().sum(), - index=mutation_group_df.index[mutation_group_df["mutation_id"].isna()], + mutation_group_df.loc[mutation_group_df["mutation_id"].isna(), "mutation_id"] = ( + pd.Series( + [[]] * mutation_group_df["mutation_id"].isna().sum(), + index=mutation_group_df.index[mutation_group_df["mutation_id"].isna()], + ) ) - return mutation_group_df, mutation_map + return mutation_group_df, mutation_to_id_map