Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Missing aflows #126

Merged
merged 6 commits into from
Jul 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
230 changes: 143 additions & 87 deletions playground.py
Original file line number Diff line number Diff line change
@@ -1,39 +1,154 @@
# %%
#%%
import os
import logging
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
train_df = pd.read_csv('/cluster/home/t122995uhn/projects/data/DavisKibaDataset/davis/nomsa_binary_original_binary/full/XY.csv')
test_df = pd.read_csv('/cluster/home/t122995uhn/projects/MutDTA/splits/davis/test.csv')
train_df = train_df[~train_df.prot_id.isin(set(test_df.prot_id))]

# %%
from src.utils.residue import Chain
from multiprocessing import Pool, cpu_count
from src.data_prep.datasets import BaseDataset

# df = pd.read_csv("/cluster/home/t122995uhn/projects/MutDTA/df_base.csv", index_col=0)
df = pd.read_csv("/cluster/home/t122995uhn/projects/MutDTA/df_base_filtered.csv", index_col=0)
logging.getLogger().setLevel(logging.DEBUG)

def process_protein_multiprocessing(args):
"""
Checks if protein has conf file and correct sequence, returns:
- None, None - if it has a conf file and is correct
- pid, None - is missing a conf file
- pid, seq - has a conf file but is not correct sequence.
"""
group_codes, code, pid, seq, af_conf_dir, is_pdbbind, files = args
MIN_MODEL_COUNT = 5

correct_seq = False
matching_code = None
af_confs = []
if is_pdbbind:
for c in group_codes:
af_fp = os.path.join(af_conf_dir, f'{c}.pdb')
if os.path.exists(af_fp):
af_confs = [af_fp]
matching_code = code
if Chain(af_fp).sequence == seq:
correct_seq = True
break

else:
af_confs = [os.path.join(af_conf_dir, f) for f in files if f.startswith(pid)]

if len(af_confs) == 0:
return pid, None

# either all models in one pdb file (alphaflow) or spread out across multiple files (AF2 msa subsampling)
model_count = len(af_confs) if len(af_confs) > 1 else 5# Chain.get_model_count(af_confs[0])

if model_count < MIN_MODEL_COUNT:
return pid, None
elif not correct_seq: # final check
af_seq = Chain(af_confs[0]).sequence
if seq != af_seq:
logging.debug(f'Mismatched sequence for {pid}')
# if matching_code == code: # something wrong here -> incorrect seq but for the right code?
# return pid, af_seq
return pid, matching_code

if matching_code != code:
return None, matching_code
return None, None

#%% check_missing_confs method
af_conf_dir:str = '/cluster/home/t122995uhn/projects/data/pdbbind/alphaflow_io/out_pdb_MD-distilled/'
is_pdbbind=True

df_unique:pd.DataFrame = df.drop_duplicates('prot_id')
df_pid_groups = df.groupby(['prot_id']).groups

missing = set()
mismatched = {}
# total of 3728 unique proteins with alphaflow confs (named by pdb ID)
files = None
if not is_pdbbind:
files = [f for f in os.listdir(af_conf_dir) if f.endswith('.pdb')]

with Pool(processes=cpu_count()) as pool:
tasks = [(df_pid_groups[pid], code, pid, seq, af_conf_dir, is_pdbbind, files) \
for code, (pid, seq) in df_unique[['prot_id', 'prot_seq']].iterrows()]

for pid, new_seq in tqdm(pool.imap_unordered(process_protein_multiprocessing, tasks),
desc='Filtering out proteins with missing PDB files for multiple confirmations',
total=len(tasks)):
if new_seq is not None:
mismatched[pid] = new_seq
elif pid is not None: # just pid -> missing af files
missing.add(pid)

print(len(missing),len(mismatched))

#%% make subsitutions for rows
df = pd.read_csv("/cluster/home/t122995uhn/projects/MutDTA/df_base.csv", index_col=0)
df_mismatched = pd.DataFrame.from_dict(mismatched, orient='index', columns=['code'])
df_mismatched_sub = df.loc[df_mismatched['code']][['prot_id', 'prot_seq']].reset_index()
df_mismatched = df_mismatched.merge(df_mismatched_sub, on='code')

df_mismatched = df_mismatched.merge(df_mismatched_sub, on='code')
dff = pd.read_csv("/cluster/home/t122995uhn/projects/MutDTA/df_base_filtered.csv")
dffm = dff.merge(df_mismatched, on='code')
#%%
from src.data_prep.datasets import BaseDataset
import pandas as pd
csv_p = "/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_binary_original_binary/full/XY.csv"

davis_test_df = pd.read_csv(f"/home/jean/projects/MutDTA/splits/davis/test.csv")
davis_test_df['gene'] = davis_test_df['prot_id'].str.split('(').str[0]

#%% ONCO KB MERGE
onco_df = pd.read_csv("../data/oncoKB_DrugGenePairList.csv")
davis_join_onco = davis_test_df.merge(onco_df.drop_duplicates("gene"), on="gene", how="inner")
df = pd.read_csv(csv_p, index_col=0)

# %%
onco_df = pd.read_csv("../data/oncoKB_DrugGenePairList.csv")
onco_df.merge(davis_test_df.drop_duplicates("gene"), on="gene", how="inner").value_counts("gene")








import os
from tqdm import tqdm
alphaflow_dir = "/cluster/home/t122995uhn/projects/data/pdbbind/alphaflow_io/out_pdb_MD-distilled/"
ln_dir = "/cluster/home/t122995uhn/projects/data/pdbbind/alphaflow_io/out_pid_ln/"

os.makedirs(ln_dir, exist_ok=True)

# First, check and remove any broken links in the destination directory
for link_file in tqdm(os.listdir(ln_dir), desc="Checking for broken links"):
ln_p = os.path.join(ln_dir, link_file)
if os.path.islink(ln_p) and not os.path.exists(ln_p):
print(f"Removing broken link: {ln_p}")
os.remove(ln_p)


# %% files are .pdb with 50 "models" in each
for file in tqdm(os.listdir(alphaflow_dir)):
if not file.endswith('.pdb'):
continue

code, _ = os.path.splitext(file)
pid = df.loc[code].prot_id
src, dst = f"{alphaflow_dir}/{file}", f"{ln_dir}/{pid}.pdb"
if not os.path.exists(dst):
os.symlink(src,dst)

# %%
from src.train_test.splitting import resplit
#%%
########################################################################
########################## BUILD DATASETS ##############################
########################################################################
import os
from src.data_prep.init_dataset import create_datasets
from src import cfg
import logging
cfg.logger.setLevel(logging.DEBUG)

db_p = lambda x: f'{cfg.DATA_ROOT}/DavisKibaDataset/davis/nomsa_{x}_gvp_binary'

db = resplit(dataset=db_p('binary'), split_files=db_p('aflow'), use_train_set=True)

splits = '/cluster/home/t122995uhn/projects/MutDTA/splits/davis/'
create_datasets(cfg.DATA_OPT.davis,
feat_opt=cfg.PRO_FEAT_OPT.nomsa,
edge_opt=[cfg.PRO_EDGE_OPT.aflow, cfg.PRO_EDGE_OPT.binary],
ligand_features=[cfg.LIG_FEAT_OPT.original, cfg.LIG_FEAT_OPT.gvp],
ligand_edges=cfg.LIG_EDGE_OPT.binary, overwrite=True,
k_folds=5,
test_prots_csv=f'{splits}/test.csv',
val_prots_csv=[f'{splits}/val{i}.csv' for i in range(5)],
data_root=os.path.abspath('../data/test/'))


# %%
Expand Down Expand Up @@ -69,62 +184,3 @@
models=models, metrics=['cindex', 'mse'],
fig_scale=(10,5), add_stats=True, title_postfix=" validation set performance")
plt.xticks(rotation=45)

#%%
########################################################################
########################## BUILD DATASETS ##############################
########################################################################
from src.data_prep.init_dataset import create_datasets
from src import cfg
import logging
cfg.logger.setLevel(logging.DEBUG)

splits = '/cluster/home/t122995uhn/projects/MutDTA/splits/pdbbind/'
create_datasets(cfg.DATA_OPT.PDBbind,
feat_opt=cfg.PRO_FEAT_OPT.nomsa,
edge_opt=[cfg.PRO_EDGE_OPT.binary, cfg.PRO_EDGE_OPT.aflow],
ligand_features=[cfg.LIG_FEAT_OPT.original, cfg.LIG_FEAT_OPT.gvp],
ligand_edges=cfg.LIG_EDGE_OPT.binary,
k_folds=5,
test_prots_csv=f'{splits}/test.csv',
val_prots_csv=[f'{splits}/val{i}.csv' for i in range(5)])

# %%
from src.utils.loader import Loader

db_aflow = Loader.load_dataset('../data/DavisKibaDataset/davis/nomsa_aflow_original_binary/full')
db = Loader.load_dataset('../data/DavisKibaDataset/davis/nomsa_binary_original_binary/full')

# %%
# 5-fold cross validation + test set
import pandas as pd
from src import cfg
from src.train_test.splitting import balanced_kfold_split
from src.utils.loader import Loader
test_df = pd.read_csv('/cluster/home/t122995uhn/projects/MutDTA/splits/pdbbind_test.csv')
test_prots = set(test_df.prot_id)
db = Loader.load_dataset(f'{cfg.DATA_ROOT}/PDBbindDataset/nomsa_binary_original_binary/full/')

train, val, test = balanced_kfold_split(db,
k_folds=5, test_split=0.1, val_split=0.1,
test_prots=test_prots, random_seed=0, verbose=True
)

#%%
db.save_subset_folds(train, 'train')
db.save_subset_folds(val, 'val')
db.save_subset(test, 'test')

#%%
import shutil, os

src = "/cluster/home/t122995uhn/projects/data/PDBbindDataset/nomsa_binary_original_binary/"
dst = "/cluster/home/t122995uhn/projects/MutDTA/splits/pdbbind"
os.makedirs(dst, exist_ok=True)

for i in range(5):
sfile = f"{src}/val{i}/XY.csv"
dfile = f"{dst}/val{i}.csv"
shutil.copyfile(sfile, dfile)

# %%
14 changes: 12 additions & 2 deletions results/v113/model_media/model_stats.csv
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ DGM_kiba3D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_o
DGM_kiba1D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_originalLF_binaryLE,0.6572547696170707,0.4910169881324348,0.4223030308489566,0.5056608650145961,0.4887140404038343,0.7110983511544631
DGM_kiba0D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_originalLF_binaryLE,0.7034265325632763,0.5925355722791278,0.5277625252545413,0.4342528236949791,0.4414575484107895,0.6589786215765874
DGM_kiba2D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_originalLF_binaryLE,0.7048145943290888,0.5949726563519002,0.5309972890766975,0.4319786589236721,0.4397173516202035,0.657250834098879
DGM_kiba4D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_originalLF_binaryLE,0.701385350951551,0.5879330884340543,0.5238489684293576,0.43711720760821254,0.44305434527103205,0.6611484005941575
DGM_kiba4D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_originalLF_binaryLE,0.701385350951551,0.5879330884340543,0.5238489684293576,0.4371172076082125,0.443054345271032,0.6611484005941575
DGM_PDBbind0D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6655853516013849,0.4884783169365537,0.4768266531852068,2.716448143336347,1.29524747639392,1.648165083763258
DGM_PDBbind1D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6616134720685233,0.4760514005042608,0.465349083935074,2.7772597147478297,1.309049959958126,1.6665112405104952
DGM_PDBbind2D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6547720831157129,0.4674109621848075,0.4473761042353416,2.911396055546887,1.350302681084595,1.7062813529857517
Expand All @@ -46,6 +46,16 @@ GVPLM_kiba4D_nomsaF_binaryE_32B_3.372637625954074e-05LR_0.09399264336737133D_200
GVPLM_kiba2D_nomsaF_binaryE_32B_3.372637625954074e-05LR_0.09399264336737133D_2000E_gvpLF_binaryLE,0.6999996243009051,0.5844167249718527,0.5172933123278608,1.1060115689114345,0.8743866055307609,1.051670846278166
GVPLM_kiba4D_nomsaF_aflowE_16B_0.00010990897170411903LR_0.03599877069828837D_2000E_gvpLF_binaryLE,0.7028775926753292,0.5511820968815552,0.5274587057439738,0.5191856014604727,0.464434987594827,0.7205453500373676
GVPLM_kiba3D_nomsaF_aflowE_16B_0.00010990897170411903LR_0.03599877069828837D_2000E_gvpLF_binaryLE,0.6974533040075589,0.5366909353583093,0.5092679820627712,0.5267452909346243,0.4756001094118965,0.7257722031978245
GVPLM_kiba2D_nomsaF_aflowE_16B_0.00010990897170411903LR_0.03599877069828837D_2000E_gvpLF_binaryLE,0.6914933128392398,0.5535320315157304,0.49916906416324947,0.5167043183823063,0.46841519355949945,0.7188214787986696
GVPLM_kiba2D_nomsaF_aflowE_16B_0.00010990897170411903LR_0.03599877069828837D_2000E_gvpLF_binaryLE,0.6914933128392398,0.5535320315157304,0.4991690641632494,0.5167043183823063,0.4684151935594994,0.7188214787986696
GVPLM_kiba0D_nomsaF_aflowE_16B_0.00010990897170411903LR_0.03599877069828837D_2000E_gvpLF_binaryLE,0.667835957156033,0.5118338017508158,0.4512406693094591,0.529732104745498,0.4987863530176246,0.7278269744558098
GVPLM_kiba1D_nomsaF_aflowE_16B_0.00010990897170411903LR_0.03599877069828837D_2000E_gvpLF_binaryLE,0.673244203521782,0.5584520336614822,0.4601543459719329,0.4634166140494674,0.4859944296833685,0.6807470999199831
DGM_PDBbind0D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.5,,,3.4182907626961336,1.4994075870215695,1.8488620182956144
DGM_PDBbind1D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.5007066821937181,0.038214548483191,0.0037893479451263,3.4747609352089994,1.5185222442497732,1.864071064956752
DGM_PDBbind2D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.5000473143129389,0.0065482511633214,0.0077853856938565,3.55397849703766,1.5411690941151477,1.8851998559934329
DGM_PDBbind3D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.5,,,3.6010645245915747,1.553675811370748,1.8976471022272752
DGM_PDBbind4D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.5,,,3.472537530411251,1.5178422226480432,1.8634745853945127
GVPLM_PDBbind0D_nomsaF_aflowE_128B_0.00020048122460779208LR_0.042268679447260635D_2000E_gvpLF_binaryLE,0.5675107531975828,0.2012549479389283,0.1972868580568649,3.521371768804649,1.5340581613048625,1.876531845934049
GVPLM_PDBbind1D_nomsaF_aflowE_128B_0.00020048122460779208LR_0.042268679447260635D_2000E_gvpLF_binaryLE,0.6351386534675363,0.4162576037033809,0.393776329694548,2.952582892735166,1.3614234164872832,1.7183081483643048
GVPLM_PDBbind2D_nomsaF_aflowE_128B_0.00020048122460779208LR_0.042268679447260635D_2000E_gvpLF_binaryLE,0.6073794252279185,0.300904510298725,0.3141960696982953,3.5527159664685373,1.497829202900039,1.8848649730069624
GVPLM_PDBbind3D_nomsaF_aflowE_128B_0.00020048122460779208LR_0.042268679447260635D_2000E_gvpLF_binaryLE,0.5544194125815396,0.1962498205943712,0.1602389373350445,3.559013089849939,1.5262607894024975,1.886534677616592
GVPLM_PDBbind4D_nomsaF_aflowE_128B_0.00020048122460779208LR_0.042268679447260635D_2000E_gvpLF_binaryLE,0.6222734472562935,0.38283753210340116,0.3603062880559139,3.0631035175351524,1.389417946315663,1.7501724250870692
12 changes: 11 additions & 1 deletion results/v113/model_media/model_stats_val.csv
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ DGM_kiba3D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_o
DGM_kiba1D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_originalLF_binaryLE,0.7169797416293802,0.6527045139380728,0.5592661699267665,0.3368578002681257,0.3921905937173818,0.5803945212251109
DGM_kiba0D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_originalLF_binaryLE,0.7128444726133566,0.6469405510839773,0.5484484334329713,0.336774733412883,0.4184534194805188,0.5803229561312244
DGM_kiba2D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_originalLF_binaryLE,0.7033453012745192,0.6072209463563496,0.5174763874796517,0.4390207180018329,0.4649270940615934,0.6625863853127628
DGM_kiba4D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_originalLF_binaryLE,0.7023830999958538,0.639949133525287,0.5151496892582399,0.40105347469958885,0.4409018537119612,0.6332878292684843
DGM_kiba4D_nomsaF_aflowE_64B_0.0001139464546302261LR_0.4321620419748407D_2000E_originalLF_binaryLE,0.7023830999958538,0.639949133525287,0.5151496892582399,0.4010534746995888,0.4409018537119612,0.6332878292684843
DGM_PDBbind0D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.667185125919345,0.4916330475527956,0.4742033800137094,2.573515761007047,1.247796092498501,1.604218115159858
DGM_PDBbind1D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6918743579083576,0.5545547118073524,0.5483030524092783,2.511534059876986,1.2495642605282011,1.5847820228274254
DGM_PDBbind2D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6664571269002308,0.5086157319827234,0.4766146235124788,2.7894558492182133,1.3283703922279295,1.6701664136301548
Expand All @@ -49,3 +49,13 @@ GVPLM_kiba3D_nomsaF_aflowE_16B_0.00010990897170411903LR_0.03599877069828837D_200
GVPLM_kiba2D_nomsaF_aflowE_16B_0.00010990897170411903LR_0.03599877069828837D_2000E_gvpLF_binaryLE,0.7926024955332317,0.7374112888096318,0.7063431040772125,0.32468252989801,0.3579667507508575,0.5698092048203591
GVPLM_kiba0D_nomsaF_aflowE_16B_0.00010990897170411903LR_0.03599877069828837D_2000E_gvpLF_binaryLE,0.6669854507035494,0.5525438589938947,0.4355859541059588,0.4115128362890195,0.4456888421443692,0.6414926626930503
GVPLM_kiba1D_nomsaF_aflowE_16B_0.00010990897170411903LR_0.03599877069828837D_2000E_gvpLF_binaryLE,0.6989377849970573,0.6478510815311954,0.515693387777559,0.3561253610107396,0.4216625264430977,0.5967623991261007
DGM_PDBbind0D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.5125715405607719,0.2793381742071084,0.1926536143404282,3.346598030093213,1.4805904418459894,1.829370938353732
DGM_PDBbind1D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.5107971942443016,0.0312432003865885,0.0308374960271485,3.5631682917172403,1.5458264870189105,1.887635635316636
DGM_PDBbind2D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.5277274707626068,0.358671071267003,0.2830406326274937,3.6322764505727583,1.5248606967243108,1.905853208033808
DGM_PDBbind3D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.51955000029021,0.3102068840467967,0.2386819219389774,3.410729951552425,1.478064934361954,1.8468161661498488
DGM_PDBbind4D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.5275270713061986,0.3631722071531414,0.2847957039355699,3.1869078443560386,1.4534702110049291,1.7851912626819677
GVPLM_PDBbind0D_nomsaF_aflowE_128B_0.00020048122460779208LR_0.042268679447260635D_2000E_gvpLF_binaryLE,0.5596497975350556,0.2543017795335043,0.1766358807164376,3.113015575178155,1.445382858916236,1.7643739896003214
GVPLM_PDBbind1D_nomsaF_aflowE_128B_0.00020048122460779208LR_0.042268679447260635D_2000E_gvpLF_binaryLE,0.6600679137627058,0.4711013339427787,0.4650543790596504,2.884138926557529,1.3550213658092014,1.698275279970102
GVPLM_PDBbind2D_nomsaF_aflowE_128B_0.00020048122460779208LR_0.042268679447260635D_2000E_gvpLF_binaryLE,0.6141731947137768,0.3644634107975009,0.3328003860926398,3.1989141378067982,1.408361079518094,1.788550848538223
GVPLM_PDBbind3D_nomsaF_aflowE_128B_0.00020048122460779208LR_0.042268679447260635D_2000E_gvpLF_binaryLE,0.5536881344717306,0.192581089791613,0.1606937190574807,3.4912679264408366,1.4810306765899366,1.8684934911422184
GVPLM_PDBbind4D_nomsaF_aflowE_128B_0.00020048122460779208LR_0.042268679447260635D_2000E_gvpLF_binaryLE,0.628428758706968,0.3807074065686098,0.3759327635419598,2.9551973425576046,1.3590244197523815,1.7190687428249065
Loading
Loading