Mutation separation (#651)

* Split AA mutations of multiple residues into individual mutations, where appropriate
* Cleanup
* Add affected positions column to complete dataframe only
* Format mutation range for AA mutations affecting multiple reference residues
vector-engineering · Jul 29, 2024 · cd329ec · cd329ec
1 parent 73fa89e
commit cd329ec
Show file tree

Hide file tree

Showing 4 changed files with 137 additions and 16 deletions.
diff --git a/src/utils/mutationUtils.js b/src/utils/mutationUtils.js
@@ -20,9 +20,18 @@ export const formatMutation = (mutationStr, dnaOrAa) => {
       return `${chunks[2]}${chunks[1]}${chunks[3]}`;
     }
   } else if (dnaOrAa === DNA_OR_AA.AA) {
+    let feature = chunks[0];
+    let pos = parseInt(chunks[1]);
     let ref = chunks[2];
     let alt = chunks[3];
 
+    // For mutations with multiple reference AAs affected, display
+    // the position as a range of affected positions
+    // e.g., LPPA24S --> LPPA24/27S
+    if (ref.length > 1) {
+      pos = `${pos}/${pos + ref.length - 1}`;
+    }
+
     // Translate stop codon from "_" to more conventional "*"
     if (ref === '_') {
       ref = '*';
@@ -44,6 +53,6 @@ export const formatMutation = (mutationStr, dnaOrAa) => {
       ref = '';
     }
 
-    return `${chunks[0]}:${ref}${chunks[1]}${alt}`;
+    return `${feature}:${ref}${pos}${alt}`;
   }
 };
diff --git a/workflow_main/scripts/build_full_dataframe.py b/workflow_main/scripts/build_full_dataframe.py
@@ -40,6 +40,34 @@ def main():
         v: k for k, v in metadata_map["protein_aa_mutation"].items()
     }
 
+    # Affected positions map
+    gene_aa_affected_positions_map = {}
+    for k, v in metadata_map["gene_aa_mutation"].items():
+        feature = k.split("|")[0]
+        pos = int(k.split("|")[1])
+        ref = k.split("|")[2]
+        gene_aa_affected_positions_map[v] = ";".join(
+            [f"{feature}|{x}" for x in range(pos, pos + len(ref))]
+        )
+
+    protein_aa_affected_positions_map = {}
+    for k, v in metadata_map["protein_aa_mutation"].items():
+        feature = k.split("|")[0]
+        pos = int(k.split("|")[1])
+        ref = k.split("|")[2]
+        protein_aa_affected_positions_map[v] = ";".join(
+            [f"{feature}|{x}" for x in range(pos, pos + len(ref))]
+        )
+
+    # Add affected positions to dataframe
+    df.loc[:, "gene_aa_affected_positions"] = df["gene_aa_mutation"].apply(
+        lambda x: ";".join([gene_aa_affected_positions_map[i] for i in x])
+    )
+    df.loc[:, "protein_aa_affected_positions"] = df["protein_aa_mutation"].apply(
+        lambda x: ";".join([protein_aa_affected_positions_map[i] for i in x])
+    )
+
+    # Convert mutation IDs back to mutation strings
     df.loc[:, "dna_mutation"] = df["dna_mutation"].apply(
         lambda x: ";".join([dna_mutation_map[i] for i in x])
     )
@@ -50,7 +78,7 @@ def main():
         lambda x: ";".join([protein_aa_mutation_map[i] for i in x])
     )
 
-    # Serialize coverage
+    # Serialize coverage data
     df.loc[:, "dna_range"] = df["dna_range"].apply(
         lambda rngs: ";".join([f"{rng[0]}-{rng[1]}" for rng in rngs])
     )

diff --git a/workflow_main/scripts/extract_aa_mutations.py b/workflow_main/scripts/extract_aa_mutations.py
@@ -158,8 +158,9 @@ def extract_aa_mutations(
                     - segment_start
                 ) // 3
 
-                # GROUP MUTATIONS
-                # Group together individual mutations to process them as a single mutation
+                # GROUP DNA MUTATIONS
+                # Group together individual DNA mutations to process them
+                # as a single DNA mutation
                 # Criteria:
                 # - Same Accession ID
                 # - Mutations are within codon range
@@ -425,14 +426,15 @@ def extract_aa_mutations(
     # - position is the same
     # - one mutation is an indel, other is a substitution
 
+    """
     if len(aa_mutations) > 0:
         cur_mutation = aa_mutations[0]
         i = 1
         while i < len(aa_mutations):
 
             next_mutation = aa_mutations[i]
 
-            # (Accession ID, gene/protein, pos, ref, alt)
+            # (reference, Accession ID, gene/protein, pos, ref, alt)
             if (
                 cur_mutation[0] == next_mutation[0]
                 and cur_mutation[1] == next_mutation[1]
@@ -468,6 +470,86 @@ def extract_aa_mutations(
                 # No merging, move on
                 cur_mutation = aa_mutations[i]
                 i += 1
+    """
+
+    # SPLIT AA MUTATIONS
+    # ------------------
+    # Split AA mutations:
+    # 1. Split deletions of multiple residues into individual deletions
+    #    - e.g., ∆HV69 --> ∆H69, ∆V70
+    #    - Only applicable to pure deletions, i.e., no substitutions
+    # 2. Mutations will be "ungrouped" if possible
+    #    - e.g., FR157SG --> F157S, R158G
+    #    - Only applicable for substitutions of the same residue length
+
+    # 1. SPLIT DELETIONS
+    old_aa_mutation_inds = []
+    new_aa_mutations = []  # (new_mutation, insertion_index)
+    for i, aa_mutation in enumerate(aa_mutations):
+        # (reference, Accession ID, gene/protein, pos, ref, alt)
+        ref = aa_mutation[4]
+        alt = aa_mutation[5]
+        # Skip if the mutation is not a pure deletion of more than 1 residue
+        if len(alt) > 0 or len(ref) == 1:
+            continue
+
+        # Split the deletion into individual deletions
+        for j, r in enumerate(ref):
+            new_aa_mutations.append(
+                (
+                    (
+                        aa_mutation[0],
+                        aa_mutation[1],
+                        aa_mutation[2],
+                        aa_mutation[3] + j,
+                        r,
+                        "",
+                    ),
+                    i + j,
+                )
+            )
+
+        old_aa_mutation_inds.append(i)
+
+    # Remove old deletion mutations
+    for i in old_aa_mutation_inds[::-1]:
+        del aa_mutations[i]
+    # Add new mutations
+    for new_mutation, insertion_index in new_aa_mutations:
+        aa_mutations.insert(insertion_index, new_mutation)
+
+    # 2. SPLIT SUBSTITUTIONS
+    old_aa_mutation_inds = []
+    new_aa_mutations = [] # (new_mutation, insertion_index)
+    for i, aa_mutation in enumerate(aa_mutations):
+        # (reference, Accession ID, gene/protein, pos, ref, alt)
+        ref = aa_mutation[4]
+        alt = aa_mutation[5]
+        # Skip if the mutation is not a pure substitution of more than 1 residue
+        if len(alt) == 0 or len(ref) == 0 or len(ref) != len(alt) or len(ref) == 1:
+            continue
+
+        # Split the substitution into individual substitutions
+        for j, (a, b) in enumerate(zip(ref, alt)):
+            new_aa_mutations.append((
+                (
+                    aa_mutation[0],
+                    aa_mutation[1],
+                    aa_mutation[2],
+                    aa_mutation[3] + j,
+                    a,
+                    b,
+                ), i + j
+            ))
+
+        old_aa_mutation_inds.append(i)
+
+    # Remove old substitution mutations
+    for i in old_aa_mutation_inds[::-1]:
+        del aa_mutations[i]
+    # Add new mutations
+    for new_mutation, insertion_index in new_aa_mutations:
+        aa_mutations.insert(insertion_index, new_mutation)
 
     aa_mutation_df = pd.DataFrame.from_records(
         aa_mutations,

diff --git a/workflow_main/scripts/process_mutations.py b/workflow_main/scripts/process_mutations.py
@@ -19,7 +19,7 @@ def process_mutations(
     mode="dna",  # dna, gene_aa, protein_aa
 ):
     """Process mutation data
-    
+
     Parameters
     ----------
     manifest: pandas.DataFrame
@@ -30,7 +30,7 @@ def process_mutations(
         - Mutations must occur at least this many times to pass filters
     mode: string
         - dna, gene_aa, protein_aa
-    
+
     Returns
     -------
     out: tuple of pandas.DataFrames
@@ -120,10 +120,12 @@ def process_mutations(
     )
 
     # Map mutations to integer IDs
-    mutation_map = pd.Series(mutation_df["mutation_str"].unique())
+    mutation_to_id_map = pd.Series(mutation_df["mutation_str"].unique())
     # Flip index and values
-    mutation_map = pd.Series(mutation_map.index.values, index=mutation_map)
-    mutation_df["mutation_id"] = mutation_df["mutation_str"].map(mutation_map)
+    mutation_to_id_map = pd.Series(
+        mutation_to_id_map.index.values, index=mutation_to_id_map
+    )
+    mutation_df["mutation_id"] = mutation_df["mutation_str"].map(mutation_to_id_map)
 
     mutation_group_df = mutation_df.groupby(
         ["Accession ID", "reference"], as_index=False
@@ -141,11 +143,11 @@ def process_mutations(
     )
 
     # Fill NaNs with empty arrays
-    mutation_group_df.loc[
-        mutation_group_df["mutation_id"].isna(), "mutation_id"
-    ] = pd.Series(
-        [[]] * mutation_group_df["mutation_id"].isna().sum(),
-        index=mutation_group_df.index[mutation_group_df["mutation_id"].isna()],
+    mutation_group_df.loc[mutation_group_df["mutation_id"].isna(), "mutation_id"] = (
+        pd.Series(
+            [[]] * mutation_group_df["mutation_id"].isna().sum(),
+            index=mutation_group_df.index[mutation_group_df["mutation_id"].isna()],
+        )
     )
 
-    return mutation_group_df, mutation_map
+    return mutation_group_df, mutation_to_id_map