Skip to content

Commit

Permalink
Mutation separation (#651)
Browse files Browse the repository at this point in the history
* Split AA mutations of multiple residues into individual mutations, where appropriate

* cleanup

* Add affected positions column to complete dataframe only

* Format mutation range for AA mutations affecting multiple reference residues
  • Loading branch information
atc3 authored Jul 29, 2024
1 parent 73fa89e commit cd329ec
Show file tree
Hide file tree
Showing 4 changed files with 137 additions and 16 deletions.
11 changes: 10 additions & 1 deletion src/utils/mutationUtils.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,18 @@ export const formatMutation = (mutationStr, dnaOrAa) => {
return `${chunks[2]}${chunks[1]}${chunks[3]}`;
}
} else if (dnaOrAa === DNA_OR_AA.AA) {
let feature = chunks[0];
let pos = parseInt(chunks[1]);
let ref = chunks[2];
let alt = chunks[3];

// For mutations with multiple reference AAs affected, display
// the position as a range of affected positions
// e.g., LPPA24S --> LPPA24/27S
if (ref.length > 1) {
pos = `${pos}/${pos + ref.length - 1}`;
}

// Translate stop codon from "_" to more conventional "*"
if (ref === '_') {
ref = '*';
Expand All @@ -44,6 +53,6 @@ export const formatMutation = (mutationStr, dnaOrAa) => {
ref = '';
}

return `${chunks[0]}:${ref}${chunks[1]}${alt}`;
return `${feature}:${ref}${pos}${alt}`;
}
};
30 changes: 29 additions & 1 deletion workflow_main/scripts/build_full_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,34 @@ def main():
v: k for k, v in metadata_map["protein_aa_mutation"].items()
}

# Affected positions map
gene_aa_affected_positions_map = {}
for k, v in metadata_map["gene_aa_mutation"].items():
feature = k.split("|")[0]
pos = int(k.split("|")[1])
ref = k.split("|")[2]
gene_aa_affected_positions_map[v] = ";".join(
[f"{feature}|{x}" for x in range(pos, pos + len(ref))]
)

protein_aa_affected_positions_map = {}
for k, v in metadata_map["protein_aa_mutation"].items():
feature = k.split("|")[0]
pos = int(k.split("|")[1])
ref = k.split("|")[2]
protein_aa_affected_positions_map[v] = ";".join(
[f"{feature}|{x}" for x in range(pos, pos + len(ref))]
)

# Add affected positions to dataframe
df.loc[:, "gene_aa_affected_positions"] = df["gene_aa_mutation"].apply(
lambda x: ";".join([gene_aa_affected_positions_map[i] for i in x])
)
df.loc[:, "protein_aa_affected_positions"] = df["protein_aa_mutation"].apply(
lambda x: ";".join([protein_aa_affected_positions_map[i] for i in x])
)

# Convert mutation IDs back to mutation strings
df.loc[:, "dna_mutation"] = df["dna_mutation"].apply(
lambda x: ";".join([dna_mutation_map[i] for i in x])
)
Expand All @@ -50,7 +78,7 @@ def main():
lambda x: ";".join([protein_aa_mutation_map[i] for i in x])
)

# Serialize coverage
# Serialize coverage data
df.loc[:, "dna_range"] = df["dna_range"].apply(
lambda rngs: ";".join([f"{rng[0]}-{rng[1]}" for rng in rngs])
)
Expand Down
88 changes: 85 additions & 3 deletions workflow_main/scripts/extract_aa_mutations.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,8 +158,9 @@ def extract_aa_mutations(
- segment_start
) // 3

# GROUP MUTATIONS
# Group together individual mutations to process them as a single mutation
# GROUP DNA MUTATIONS
# Group together individual DNA mutations to process them
# as a single DNA mutation
# Criteria:
# - Same Accession ID
# - Mutations are within codon range
Expand Down Expand Up @@ -425,14 +426,15 @@ def extract_aa_mutations(
# - position is the same
# - one mutation is an indel, other is a substitution

"""
if len(aa_mutations) > 0:
cur_mutation = aa_mutations[0]
i = 1
while i < len(aa_mutations):
next_mutation = aa_mutations[i]
# (Accession ID, gene/protein, pos, ref, alt)
# (reference, Accession ID, gene/protein, pos, ref, alt)
if (
cur_mutation[0] == next_mutation[0]
and cur_mutation[1] == next_mutation[1]
Expand Down Expand Up @@ -468,6 +470,86 @@ def extract_aa_mutations(
# No merging, move on
cur_mutation = aa_mutations[i]
i += 1
"""

# SPLIT AA MUTATIONS
# ------------------
# Split AA mutations:
# 1. Split deletions of multiple residues into individual deletions
# - e.g., ∆HV69 --> ∆H69, ∆V70
# - Only applicable to pure deletions, i.e., no substitutions
# 2. Split ("ungroup") multi-residue substitutions into individual substitutions
# - e.g., FR157SG --> F157S, R158G
# - Only applicable when ref and alt have the same number of residues

# 1. SPLIT DELETIONS
old_aa_mutation_inds = []
new_aa_mutations = [] # (new_mutation, insertion_index)
for i, aa_mutation in enumerate(aa_mutations):
# (reference, Accession ID, gene/protein, pos, ref, alt)
ref = aa_mutation[4]
alt = aa_mutation[5]
# Skip if the mutation is not a pure deletion of more than 1 residue
if len(alt) > 0 or len(ref) == 1:
continue

# Split the deletion into individual deletions
for j, r in enumerate(ref):
new_aa_mutations.append(
(
(
aa_mutation[0],
aa_mutation[1],
aa_mutation[2],
aa_mutation[3] + j,
r,
"",
),
i + j,
)
)

old_aa_mutation_inds.append(i)

# Remove old deletion mutations
for i in old_aa_mutation_inds[::-1]:
del aa_mutations[i]
# Add new mutations
for new_mutation, insertion_index in new_aa_mutations:
aa_mutations.insert(insertion_index, new_mutation)

# 2. SPLIT SUBSTITUTIONS
old_aa_mutation_inds = []
new_aa_mutations = [] # (new_mutation, insertion_index)
for i, aa_mutation in enumerate(aa_mutations):
# (reference, Accession ID, gene/protein, pos, ref, alt)
ref = aa_mutation[4]
alt = aa_mutation[5]
# Skip if the mutation is not a pure substitution of more than 1 residue
if len(alt) == 0 or len(ref) == 0 or len(ref) != len(alt) or len(ref) == 1:
continue

# Split the substitution into individual substitutions
for j, (a, b) in enumerate(zip(ref, alt)):
new_aa_mutations.append((
(
aa_mutation[0],
aa_mutation[1],
aa_mutation[2],
aa_mutation[3] + j,
a,
b,
), i + j
))

old_aa_mutation_inds.append(i)

# Remove old multi-residue substitution mutations
for i in old_aa_mutation_inds[::-1]:
del aa_mutations[i]
# Add new mutations
for new_mutation, insertion_index in new_aa_mutations:
aa_mutations.insert(insertion_index, new_mutation)

aa_mutation_df = pd.DataFrame.from_records(
aa_mutations,
Expand Down
24 changes: 13 additions & 11 deletions workflow_main/scripts/process_mutations.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def process_mutations(
mode="dna", # dna, gene_aa, protein_aa
):
"""Process mutation data
Parameters
----------
manifest: pandas.DataFrame
Expand All @@ -30,7 +30,7 @@ def process_mutations(
- Mutations must occur at least this many times to pass filters
mode: string
- dna, gene_aa, protein_aa
Returns
-------
out: tuple of pandas.DataFrames
Expand Down Expand Up @@ -120,10 +120,12 @@ def process_mutations(
)

# Map mutations to integer IDs
mutation_map = pd.Series(mutation_df["mutation_str"].unique())
mutation_to_id_map = pd.Series(mutation_df["mutation_str"].unique())
# Flip index and values
mutation_map = pd.Series(mutation_map.index.values, index=mutation_map)
mutation_df["mutation_id"] = mutation_df["mutation_str"].map(mutation_map)
mutation_to_id_map = pd.Series(
mutation_to_id_map.index.values, index=mutation_to_id_map
)
mutation_df["mutation_id"] = mutation_df["mutation_str"].map(mutation_to_id_map)

mutation_group_df = mutation_df.groupby(
["Accession ID", "reference"], as_index=False
Expand All @@ -141,11 +143,11 @@ def process_mutations(
)

# Fill NaNs with empty arrays
mutation_group_df.loc[
mutation_group_df["mutation_id"].isna(), "mutation_id"
] = pd.Series(
[[]] * mutation_group_df["mutation_id"].isna().sum(),
index=mutation_group_df.index[mutation_group_df["mutation_id"].isna()],
mutation_group_df.loc[mutation_group_df["mutation_id"].isna(), "mutation_id"] = (
pd.Series(
[[]] * mutation_group_df["mutation_id"].isna().sum(),
index=mutation_group_df.index[mutation_group_df["mutation_id"].isna()],
)
)

return mutation_group_df, mutation_map
return mutation_group_df, mutation_to_id_map

0 comments on commit cd329ec

Please sign in to comment.