-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: protein stratification analysis for Davis#57
- Loading branch information
Showing
2 changed files
with
127 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,21 +1,21 @@ | ||
# # %% | ||
# from src.data_processing.init_dataset import create_datasets | ||
#%% | ||
from src.data_analysis.stratify_protein import check_davis_names, kinbase_to_df | ||
|
||
# create_datasets( | ||
# data_opt=['davis'], | ||
# feat_opt=['foldseek'], | ||
# edge_opt=['binary'] | ||
# ) | ||
# # %% | ||
# from src.utils.loader import Loader | ||
# d2 = Loader.load_dataset(data='davis', pro_feature='nomsa', | ||
# edge_opt='anm') | ||
df = kinbase_to_df() | ||
# %% | ||
import json | ||
prot_dict = json.load(open('/home/jyaacoub/projects/data/davis/proteins.txt', 'r')) | ||
# %% | ||
from src.data_analysis.figures import fig4_pro_feat_violin, prepare_df | ||
# returns a dictionary of davis protein names (keys) and a truple of the protein name, main family, and subgroup (values) | ||
prots = check_davis_names(prot_dict, df) | ||
|
||
# %% plot histogram of main families and their counts | ||
import seaborn as sns | ||
import pandas as pd | ||
|
||
main_families = [v[1] for v in prots.values()] | ||
main_families = pd.Series(main_families) | ||
sns.histplot(main_families) | ||
|
||
df = prepare_df('results/model_media/model_stats.csv') | ||
|
||
#%% | ||
fig4_pro_feat_violin(df, sel_dataset='davis', sel_col='mse', | ||
add_stats=True, verbose=False) | ||
# %% |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
# To resolve https://github.com/jyaacoub/MutDTA/issues/57 | ||
# list of protein names for davis is in ../data/davis/proteins.txt | ||
# or downloaded from https://staff.cs.utu.fi/~aatapa/data/DrugTarget/target_gene_names.txt | ||
|
||
from typing import Iterable | ||
from src.utils import config as cfg | ||
import pandas as pd | ||
import re | ||
|
||
def kinbase_to_df(fasta_fp:str=f'{cfg.DATA_ROOT}/misc/Human_kinase_domain.fasta'): | ||
""" | ||
Converts the KinBase fasta file containing all Human Kinase Domains into a | ||
dataframe for easy parsing. | ||
The Human_kinase_domain can be retrieved from: | ||
https://web.archive.org/web/20230517032418/http://kinase.com/kinbase/FastaFiles/Human_kinase_domain.fasta | ||
Sample of the FASTA headers: | ||
>TTBK2_Hsap (CK1/TTBK) * | ||
>TTBK1_Hsap (CK1/TTBK) | ||
>TSSK4_Hsap (CAMK/TSSK) | ||
>TSSK3_Hsap (CAMK/TSSK) | ||
- The first couple characters before the underscore are the protein names (*e.g.: TTBK2). | ||
- The characters after the underscore are the species (*e.g.: Hsap == homo sapiens). | ||
- Most importantly, the characters between the parenthesis are the protein family and | ||
subgroups in that order (*e.g.: CK1/TTBK). | ||
Parameters | ||
---------- | ||
`fasta_fp` : str, optional | ||
The path to the downloaded fasta file path, by default f'{cfg.DATA_ROOT}/misc/Human_kinase_domain.fasta' | ||
""" | ||
prots = {} | ||
with open(fasta_fp, 'r') as f: | ||
lines = f.readlines() | ||
for i in range(len(lines)): | ||
line = lines[i] | ||
if line[0] == '>': # header | ||
seq = lines[i+1] | ||
name = re.search(r'^>(.+?)_Hsap', line).group(1) | ||
# all in the fasta has a protein family discriptor with at least 2 elements | ||
protein_family = re.search(r'\((.*)\)', line).group(1) | ||
main_family, subgroup = protein_family.split('/')[:2] | ||
|
||
prots[name] = (main_family, subgroup, seq) | ||
|
||
i+=2 | ||
else: | ||
i+=1 | ||
# convert to dataframe | ||
df = pd.DataFrame.from_dict(prots, orient='index', columns=['main_family', 'subgroup', 'seq']) | ||
df.index.name = 'protein_name' | ||
return df | ||
|
||
def check_davis_names(davis_prots:dict, df:pd.DataFrame) -> list: | ||
""" | ||
Checks davis protein names against fasta file containing human kinase domains. | ||
Returns list of proteins in davis that are also found in the fasta. | ||
NOTE: that for some in davis they have mutation information in brackets after | ||
the protein name, but we only need the protein name for this function. | ||
Example: | ||
ABL1(F317I)p -> ABL1 | ||
There are also some with "-alpha?" (or "beta" etc..) where ? is a number, we | ||
can't ignore these and must include them by appending to the protein name "a?". | ||
Parameters | ||
---------- | ||
`davis_prots` : dict | ||
Dictionary of davis protein names (keys) and their associated sequence (values). | ||
`df` : pd.DataFrame | ||
The dataframe containing the human kinase domains (see kinbase_to_df()). | ||
""" | ||
|
||
df = kinbase_to_df() if df is None else df | ||
|
||
greek = {'alpha', 'beta', 'gamma', 'delta'} # for checking if protein name has greek letter | ||
|
||
found_prots = {} | ||
for k in davis_prots.keys(): | ||
name = k.split('(')[0] | ||
alpha = '' | ||
if '-' in name: | ||
name, alpha = name.split('-') | ||
|
||
# removing any 'p' at the end of the name which indicates phosphorylation | ||
if name[-1] == 'p': | ||
name = name[:-1] | ||
|
||
# getting alpha info if it exists | ||
if len(alpha) >= 1: | ||
alpha_name, alpha_num = re.search(r'([a-z]*)(\d*)', alpha).groups() | ||
if alpha_name in greek: | ||
name += f'{alpha_name[0]}{alpha_num}' | ||
|
||
# checking if name is in the dataframe | ||
if name in df.index: | ||
found_prots[k] = (name, df.loc[name, 'main_family'], df.loc[name, 'subgroup']) | ||
else: | ||
print(f'MISSING: {k}-{name}') | ||
|
||
return found_prots | ||
|
||
if __name__ == '__main__': | ||
import json | ||
from src.data_analysis.stratify_protein import check_davis_names, kinbase_to_df | ||
|
||
prot_dict = json.load(open('/home/jyaacoub/projects/data/davis/proteins.txt', 'r')) | ||
|
||
df = kinbase_to_df() | ||
prots = check_davis_names(prot_dict, df) |
2d71560
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is related to #57