Skip to content

Commit

Permalink
feat: protein stratification analysis for Davis#57
Browse files Browse the repository at this point in the history
  • Loading branch information
jyaacoub committed Nov 14, 2023
1 parent 5a93b85 commit 2d71560
Show file tree
Hide file tree
Showing 2 changed files with 127 additions and 16 deletions.
32 changes: 16 additions & 16 deletions playground.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
# # %%
# from src.data_processing.init_dataset import create_datasets
#%%
from src.data_analysis.stratify_protein import check_davis_names, kinbase_to_df

# create_datasets(
# data_opt=['davis'],
# feat_opt=['foldseek'],
# edge_opt=['binary']
# )
# # %%
# from src.utils.loader import Loader
# d2 = Loader.load_dataset(data='davis', pro_feature='nomsa',
# edge_opt='anm')
df = kinbase_to_df()
# %%
import json
prot_dict = json.load(open('/home/jyaacoub/projects/data/davis/proteins.txt', 'r'))
# %%
from src.data_analysis.figures import fig4_pro_feat_violin, prepare_df
# returns a dictionary of davis protein names (keys) and a tuple of the protein name, main family, and subgroup (values)
prots = check_davis_names(prot_dict, df)

# %% plot histogram of main families and their counts
import seaborn as sns
import pandas as pd

main_families = [v[1] for v in prots.values()]
main_families = pd.Series(main_families)
sns.histplot(main_families)

df = prepare_df('results/model_media/model_stats.csv')

#%%
fig4_pro_feat_violin(df, sel_dataset='davis', sel_col='mse',
add_stats=True, verbose=False)
# %%
111 changes: 111 additions & 0 deletions src/data_analysis/stratify_protein.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# To resolve https://github.com/jyaacoub/MutDTA/issues/57
# list of protein names for davis is in ../data/davis/proteins.txt
# or downloaded from https://staff.cs.utu.fi/~aatapa/data/DrugTarget/target_gene_names.txt

from typing import Iterable
from src.utils import config as cfg
import pandas as pd
import re

def kinbase_to_df(fasta_fp:str=f'{cfg.DATA_ROOT}/misc/Human_kinase_domain.fasta'):
    """
    Converts the KinBase fasta file containing all Human Kinase Domains into a
    dataframe for easy parsing.

    The Human_kinase_domain can be retrieved from:
    https://web.archive.org/web/20230517032418/http://kinase.com/kinbase/FastaFiles/Human_kinase_domain.fasta

    Sample of the FASTA headers:
        >TTBK2_Hsap (CK1/TTBK) *
        >TTBK1_Hsap (CK1/TTBK)
        >TSSK4_Hsap (CAMK/TSSK)
        >TSSK3_Hsap (CAMK/TSSK)

    - The first couple characters before the underscore are the protein names (*e.g.: TTBK2).
    - The characters after the underscore are the species (*e.g.: Hsap == homo sapiens).
    - Most importantly, the characters between the parenthesis are the protein family and
      subgroups in that order (*e.g.: CK1/TTBK).

    Every line following a header, up to the next header, is treated as part of
    that record's sequence; surrounding whitespace (including the trailing
    newline) is stripped and the pieces are concatenated. Blank lines are skipped.
    If the same protein name appears more than once, the last record wins.

    Parameters
    ----------
    `fasta_fp` : str, optional
        The path to the downloaded fasta file path, by default f'{cfg.DATA_ROOT}/misc/Human_kinase_domain.fasta'

    Returns
    -------
    pd.DataFrame
        Indexed by `protein_name`, with string columns `main_family`,
        `subgroup` and `seq`.

    Raises
    ------
    AttributeError
        If a header line has no `<name>_Hsap` prefix or no parenthesised
        family descriptor (the regex search returns None).
    ValueError
        If the family descriptor has fewer than two '/'-separated elements.
    """
    # name -> (main_family, subgroup, [sequence chunks])
    records = {}
    chunks = None  # chunk list of the record currently being read

    # iterate the file lazily instead of materialising it with readlines()
    with open(fasta_fp, 'r') as f:
        for raw in f:
            line = raw.strip()
            if not line:
                continue

            if line.startswith('>'): # header
                name = re.search(r'^>(.+?)_Hsap', line).group(1)
                # every header in the fasta has a protein family descriptor with at least 2 elements
                protein_family = re.search(r'\((.*)\)', line).group(1)
                main_family, subgroup = protein_family.split('/')[:2]

                chunks = []
                records[name] = (main_family, subgroup, chunks)
            elif chunks is not None:
                # sequence line; a record may be wrapped over several lines
                chunks.append(line)
            # any sequence text before the first header has no owner and is ignored

    prots = {name: (main_family, subgroup, ''.join(seq_chunks))
             for name, (main_family, subgroup, seq_chunks) in records.items()}

    # convert to dataframe
    df = pd.DataFrame.from_dict(prots, orient='index', columns=['main_family', 'subgroup', 'seq'])
    df.index.name = 'protein_name'
    return df

def check_davis_names(davis_prots:dict, df:pd.DataFrame=None) -> dict:
    """
    Checks davis protein names against fasta file containing human kinase domains.

    Returns the proteins in davis that are also found in the fasta; any that
    are not found are reported on stdout as `MISSING: <davis key>-<parsed name>`.

    NOTE: that for some in davis they have mutation information in brackets after
    the protein name, but we only need the protein name for this function.
    Example:
        ABL1(F317I)p -> ABL1

    There are also some with "-alpha?" (or "beta" etc..) where ? is a number, we
    can't ignore these and must include them by appending to the protein name "a?"
    (first letter of the greek word followed by the number). Any other suffix
    after the hyphen is dropped.

    Parameters
    ----------
    `davis_prots` : dict
        Dictionary of davis protein names (keys) and their associated sequence (values).
        Only the keys are used.
    `df` : pd.DataFrame, optional
        The dataframe containing the human kinase domains (see kinbase_to_df()).
        When None, it is built by calling kinbase_to_df() with its default path.

    Returns
    -------
    dict
        Maps each matched davis key to a tuple of
        (parsed protein name, main family, subgroup).
    """

    df = kinbase_to_df() if df is None else df

    greek = {'alpha', 'beta', 'gamma', 'delta'} # for checking if protein name has greek letter

    found_prots = {}
    for k in davis_prots:
        # drop the bracketed mutation/domain info and anything after it
        name = k.split('(')[0]
        # partition (unlike a 2-way unpack of split) tolerates zero or several hyphens;
        # alpha is '' when there is no hyphen
        name, _, alpha = name.partition('-')

        # removing any 'p' at the end of the name which indicates phosphorylation
        # (endswith is safe on an empty name, e.g. when the key starts with '(')
        if name.endswith('p'):
            name = name[:-1]

        # getting alpha info if it exists
        if alpha:
            # both groups are optional, so this search always matches (at position 0)
            alpha_name, alpha_num = re.search(r'([a-z]*)(\d*)', alpha).groups()
            if alpha_name in greek:
                name += f'{alpha_name[0]}{alpha_num}'

        # checking if name is in the dataframe
        if name in df.index:
            found_prots[k] = (name, df.loc[name, 'main_family'], df.loc[name, 'subgroup'])
        else:
            print(f'MISSING: {k}-{name}')

    return found_prots

if __name__ == '__main__':
    # Script entry point: load the davis protein dictionary and match its names
    # against the KinBase human kinase domains. kinbase_to_df and
    # check_davis_names are already defined in this module, so no self-import
    # is needed.
    import json

    # context manager so the file handle is closed once the JSON is parsed
    with open('/home/jyaacoub/projects/data/davis/proteins.txt', 'r') as f:
        prot_dict = json.load(f)

    df = kinbase_to_df()
    prots = check_davis_names(prot_dict, df)

1 comment on commit 2d71560

@jyaacoub
Copy link
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is related to #57

Please sign in to comment.