From 17c58e57fb1b55a5388adc533a6ee589de362477 Mon Sep 17 00:00:00 2001 From: jyaacoub Date: Mon, 12 Aug 2024 14:32:50 -0400 Subject: [PATCH 1/3] feat: mutate_model.py with `run_modeller` fn #136 --- playground.py | 8 ++ src/utils/mutate_model.py | 163 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 src/utils/mutate_model.py diff --git a/playground.py b/playground.py index 1182518..757ddf5 100644 --- a/playground.py +++ b/playground.py @@ -1,3 +1,11 @@ +#%% +from src import cfg + +#%% +from src.utils.mutate_model import run_modeller + +run_modeller('/cluster/home/t122995uhn/projects/tmp/d1/O00141.pdb', 50, "MET", "A") + #%% import pandas as pd diff --git a/src/utils/mutate_model.py b/src/utils/mutate_model.py new file mode 100644 index 0000000..e8dd369 --- /dev/null +++ b/src/utils/mutate_model.py @@ -0,0 +1,163 @@ +import sys +import os + +from modeller import * +from modeller.optimizers import MolecularDynamics, ConjugateGradients +from modeller.automodel import autosched + + +def optimize(atmsel, sched): + #conjugate gradient + for step in sched: + step.optimize(atmsel, max_iterations=200, min_atom_shift=0.001) + #md + refine(atmsel) + cg = ConjugateGradients() + cg.optimize(atmsel, max_iterations=200, min_atom_shift=0.001) + + +#molecular dynamics +def refine(atmsel): + # at T=1000, max_atom_shift for 4fs is cca 0.15 A. + md = MolecularDynamics(cap_atom_shift=0.39, md_time_step=4.0, + md_return='FINAL') + init_vel = True + for (its, equil, temps) in ((200, 20, (150.0, 250.0, 400.0, 700.0, 1000.0)), + (200, 600, + (1000.0, 800.0, 600.0, 500.0, 400.0, 300.0))): + for temp in temps: + md.optimize(atmsel, init_velocities=init_vel, temperature=temp, + max_iterations=its, equilibrate=equil) + init_vel = False + + +#use homologs and dihedral library for dihedral angle restraints +def make_restraints(mdl1, aln): + rsr = mdl1.restraints + rsr.clear() + s = Selection(mdl1) + for typ in ('stereo', 'phi-psi_binormal'): + rsr.make(s, restraint_type=typ, aln=aln, spline_on_site=True) + for typ in ('omega', 'chi1', 'chi2', 'chi3', 'chi4'): + rsr.make(s, restraint_type=typ+'_dihedral', spline_range=4.0, + spline_dx=0.3, spline_min_points = 5, aln=aln, + spline_on_site=True) + + +def run_modeller(modelname:str, respos:int|str, restyp:str, chain:str, out_path:str=None): + """ + Takes in the model path (excluding .pdb extension) and the residue index to + change and the new residue. Outputs to same dir as "{modelname}-{respos}_restype.pdb" + + Args: + modelname (str): Model file path. + respos (int): Index position for target residue (1-indexed) + restyp (str): 3 letter residue name for what to change into + chain (str): Single character chain identifier. + out_path (str): output path for pdb file to override default of "{modelname}-{respos}_restype.pdb". + """ + modelname = modelname.split('.pdb')[0] + if out_path and modelname == out_path.split('.pdb')[0]: + raise FileExistsError('would overwrite existing file at "{modelname}.pdb"') + + respos = str(respos) + log.verbose() + + # Set a different value for rand_seed to get a different final model + env = Environ(rand_seed=-49837) + + env.io.hetatm = True + #soft sphere potential + env.edat.dynamic_sphere=False + #lennard-jones potential (more accurate) + env.edat.dynamic_lennard=True + env.edat.contact_shell = 4.0 + env.edat.update_dynamic = 0.39 + + # Read customized topology file with phosphoserines (or standard one) + env.libs.topology.read(file='$(LIB)/top_heav.lib') + + # Read customized CHARMM parameter library with phosphoserines (or standard one) + env.libs.parameters.read(file='$(LIB)/par.lib') + + + # Read the original PDB file and copy its sequence to the alignment array: + mdl1 = Model(env, file=modelname) + ali = Alignment(env) + ali.append_model(mdl1, atom_files=modelname, align_codes=modelname) + + #set up the mutate residue selection segment + s = Selection(mdl1.chains[chain].residues[respos]) + + #perform the mutate residue operation + s.mutate(residue_type=restyp) + #get two copies of the sequence. A modeller trick to get things set up + ali.append_model(mdl1, align_codes=modelname) + + # Generate molecular topology for mutant + mdl1.clear_topology() + mdl1.generate_topology(ali[-1]) + + + # Transfer all the coordinates you can from the template native structure + # to the mutant (this works even if the order of atoms in the native PDB + # file is not standard): + #here we are generating the model by reading the template coordinates + mdl1.transfer_xyz(ali) + + # Build the remaining unknown coordinates + mdl1.build(initialize_xyz=False, build_method='INTERNAL_COORDINATES') + + #yes model2 is the same file as model1. It's a modeller trick. + mdl2 = Model(env, file=modelname) + + #required to do a transfer_res_numb + #ali.append_model(mdl2, atom_files=modelname, align_codes=modelname) + #transfers from "model 2" to "model 1" + mdl1.res_num_from(mdl2,ali) + + #It is usually necessary to write the mutated sequence out and read it in + #before proceeding, because not all sequence related information about MODEL + #is changed by this command (e.g., internal coordinates, charges, and atom + #types and radii are not updated). + + mdl1.write(file=modelname+restyp+respos+'.tmp') + mdl1.read(file=modelname+restyp+respos+'.tmp') + + #set up restraints before computing energy + #we do this a second time because the model has been written out and read in, + #clearing the previously set restraints + make_restraints(mdl1, ali) + + #a non-bonded pair has to have at least as many selected atoms + mdl1.env.edat.nonbonded_sel_atoms=1 + + sched = autosched.loop.make_for_model(mdl1) + + #only optimize the selected residue (in first pass, just atoms in selected + #residue, in second pass, include nonbonded neighboring atoms) + #set up the mutate residue selection segment + s = Selection(mdl1.chains[chain].residues[respos]) + + mdl1.restraints.unpick_all() + mdl1.restraints.pick(s) + + s.energy() + + s.randomize_xyz(deviation=4.0) + + mdl1.env.edat.nonbonded_sel_atoms=2 + optimize(s, sched) + + #feels environment (energy computed on pairs that have at least one member + #in the selected) + mdl1.env.edat.nonbonded_sel_atoms=1 + optimize(s, sched) + + s.energy() + + #give a proper name + mdl1.write(file=f"{modelname}-{restyp}_{respos}.pdb") + + #delete the temporary file + os.remove(modelname+restyp+respos+'.tmp') \ No newline at end of file From 45702a215401ed962ce88f99f7b265c45ff52652 Mon Sep 17 00:00:00 2001 From: jyaacoub Date: Tue, 13 Aug 2024 14:27:02 -0400 Subject: [PATCH 2/3] feat(mutagenesis): sequence plotting and sample plot #136 --- playground.py | 117 ++++++++------------------- src/analysis/mutagenesis.py | 153 ++++++++++++++++++++++++++++++++++++ src/utils/config.py | 3 +- src/utils/mutate_model.py | 15 ++-- 4 files changed, 195 insertions(+), 93 deletions(-) create mode 100644 src/analysis/mutagenesis.py diff --git a/playground.py b/playground.py index 757ddf5..8669224 100644 --- a/playground.py +++ b/playground.py @@ -1,89 +1,34 @@ -#%% -from src import cfg - -#%% -from src.utils.mutate_model import run_modeller - -run_modeller('/cluster/home/t122995uhn/projects/tmp/d1/O00141.pdb', 50, "MET", "A") - -#%% -import pandas as pd - -def get_test_oncokbs(train_df=pd.read_csv('/cluster/home/t122995uhn/projects/data/test/PDBbindDataset/nomsa_binary_original_binary/full/cleaned_XY.csv'), - oncokb_fp='/cluster/home/t122995uhn/projects/data/tcga/mart_export.tsv', - biomart='/cluster/home/t122995uhn/projects/downloads/oncoKB_DrugGenePairList.csv'): - #Get gene names for PDBbind - dfbm = pd.read_csv(oncokb_fp, sep='\t') - dfbm['PDB ID'] = dfbm['PDB ID'].str.lower() - train_df.reset_index(names='idx',inplace=True) - - df_uni = train_df.merge(dfbm, how='inner', left_on='prot_id', right_on='UniProtKB/Swiss-Prot ID') - df_pdb = train_df.merge(dfbm, how='inner', left_on='code', right_on='PDB ID') - - # identifying ovelap with oncokb - # df_all will have duplicate entries for entries with multiple gene names... - df_all = pd.concat([df_uni, df_pdb]).drop_duplicates(['idx', 'Gene name'])[['idx', 'code', 'Gene name']] - - dfkb = pd.read_csv(biomart) - df_all_kb = df_all.merge(dfkb.drop_duplicates('gene'), left_on='Gene name', right_on='gene', how='inner') - - trained_genes = set(df_all_kb.gene) - - #Identify non-trained genes - return dfkb[~dfkb['gene'].isin(trained_genes)], dfkb[dfkb['gene'].isin(trained_genes)], dfkb - - -train_df = pd.read_csv('/cluster/home/t122995uhn/projects/data/test/PDBbindDataset/nomsa_binary_original_binary/train0/cleaned_XY.csv') -val_df = pd.read_csv('/cluster/home/t122995uhn/projects/data/test/PDBbindDataset/nomsa_binary_original_binary/val0/cleaned_XY.csv') - -train_df = pd.concat([train_df, val_df]) - -get_test_oncokbs(train_df=train_df) - - - - - -#%% +# %% ######################################################################## -########################## BUILD DATASETS ############################## +########################## VIOLIN PLOTTING ############################# ######################################################################## -import os -from src.data_prep.init_dataset import create_datasets -from src import cfg import logging -cfg.logger.setLevel(logging.DEBUG) - -splits = '/cluster/home/t122995uhn/projects/MutDTA/splits/davis/' -create_datasets([cfg.DATA_OPT.PDBbind, cfg.DATA_OPT.davis, cfg.DATA_OPT.kiba], - feat_opt=cfg.PRO_FEAT_OPT.nomsa, - edge_opt=[cfg.PRO_EDGE_OPT.binary, cfg.PRO_EDGE_OPT.aflow], - ligand_features=[cfg.LIG_FEAT_OPT.original, cfg.LIG_FEAT_OPT.gvp], - ligand_edges=cfg.LIG_EDGE_OPT.binary, overwrite=False, - k_folds=5, - test_prots_csv=f'{splits}/test.csv', - val_prots_csv=[f'{splits}/val{i}.csv' for i in range(5)],) - # data_root=os.path.abspath('../data/test/')) - -# %% Copy splits to commit them: -#from to: -import shutil -from_dir_p = '/cluster/home/t122995uhn/projects/data/v131/' -to_dir_p = '/cluster/home/t122995uhn/projects/MutDTA/splits/' -from_db = ['PDBbindDataset', 'DavisKibaDataset/kiba', 'DavisKibaDataset/davis'] -to_db = ['pdbbind', 'kiba', 'davis'] - -from_db = [f'{from_dir_p}/{f}/nomsa_binary_original_binary/' for f in from_db] -to_db = [f'{to_dir_p}/{f}' for f in to_db] - -for src, dst in zip(from_db, to_db): - for x in ['train', 'val']: - for i in range(5): - print(f"{src}/{x}{i}/XY.csv", f"{dst}/{x}{i}.csv") - shutil.copy(f"{src}/{x}{i}/XY.csv", f"{dst}/{x}{i}.csv") - - print(f"{src}/test/XY.csv", f"{dst}/test.csv") - shutil.copy(f"{src}/test/XY.csv", f"{dst}/test.csv") - - -# %% +from matplotlib import pyplot as plt + +from src.analysis.figures import prepare_df, fig_combined, custom_fig + +dft = prepare_df('./results/v115/model_media/model_stats.csv') +dfv = prepare_df('./results/v115/model_media/model_stats_val.csv') + +models = { + 'DG': ('nomsa', 'binary', 'original', 'binary'), + 'esm': ('ESM', 'binary', 'original', 'binary'), # esm model + 'aflow': ('nomsa', 'aflow', 'original', 'binary'), + # 'gvpP': ('gvp', 'binary', 'original', 'binary'), + 'gvpL': ('nomsa', 'binary', 'gvp', 'binary'), + # 'aflow_ring3': ('nomsa', 'aflow_ring3', 'original', 'binary'), + 'gvpL_aflow': ('nomsa', 'aflow', 'gvp', 'binary'), + # 'gvpL_aflow_rng3': ('nomsa', 'aflow_ring3', 'gvp', 'binary'), + #GVPL_ESMM_davis3D_nomsaF_aflowE_48B_0.00010636872718329864LR_0.23282479481785903D_2000E_gvpLF_binaryLE + # 'gvpl_esm_aflow': ('ESM', 'aflow', 'gvp', 'binary'), +} + +fig, axes = fig_combined(dft, datasets=['davis'], fig_callable=custom_fig, + models=models, metrics=['cindex', 'mse'], + fig_scale=(10,5), add_stats=True, title_postfix=" test set performance", box=True, fold_labels=True) +plt.xticks(rotation=45) + +fig, axes = fig_combined(dfv, datasets=['davis'], fig_callable=custom_fig, + models=models, metrics=['cindex', 'mse'], + fig_scale=(10,5), add_stats=True, title_postfix=" validation set performance", box=True, fold_labels=True) +plt.xticks(rotation=45) \ No newline at end of file diff --git a/src/analysis/mutagenesis.py b/src/analysis/mutagenesis.py new file mode 100644 index 0000000..2479952 --- /dev/null +++ b/src/analysis/mutagenesis.py @@ -0,0 +1,153 @@ +import numpy as np +import matplotlib.pyplot as plt +import matplotlib.patheffects as PathEffects + +from src.utils.residue import ResInfo, Chain +from src.utils.mutate_model import run_modeller + + +def plot_sequence(muta, pep_opts, pro_seq): + # Create the plot + plt.figure(figsize=(len(pro_seq), 20)) # Adjust the figure size as needed + plt.imshow(muta, aspect='auto', cmap='coolwarm', interpolation='none') + + # Set the x-ticks to correspond to positions in the protein sequence + plt.xticks(ticks=np.arange(len(pro_seq)), labels=[ResInfo.code_to_pep[p] for p in pro_seq], rotation=45, fontsize=16) + plt.yticks(ticks=np.arange(len(pep_opts)), labels=pep_opts, fontsize=16) + plt.xlabel('Protein Sequence Position', fontsize=75) + plt.ylabel('Peptide Options', fontsize=75) + + # Add text labels to each square + for i in range(len(pep_opts)): + for j in range(len(pro_seq)): + text = plt.text(j, i, f'{ResInfo.pep_to_code[pep_opts[i]]}', ha='center', va='center', color='black', fontsize=8) + # Add a white outline to the text + text.set_path_effects([ + PathEffects.Stroke(linewidth=1, foreground='white'), + PathEffects.Normal() + ]) + break + + + # Adjust gridlines to be off-center, forming cell boundaries + plt.gca().set_xticks(np.arange(-0.5, len(pro_seq), 1), minor=True) + plt.gca().set_yticks(np.arange(-0.5, len(pep_opts), 1), minor=True) + plt.grid(which='minor', color='gray', linestyle='-', linewidth=0.5) + + # Remove the major gridlines (optional, for clarity) + plt.grid(which='major', color='none') + + # Add a colorbar to show the scale of mutation values + plt.colorbar(label='Mutation Values') + plt.title('Mutation Array Visualization', fontsize=100) + plt.show() + + + +if __name__ == "__main__": + pro_id = "P67870" + pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb' + + + plot_sequence(np.load('muta-100_200.npy'), pep_opts=list(ResInfo.pep_to_code.keys()), + pro_seq=Chain(pdb_file).sequence) + + + # %% + import os + import logging + logging.getLogger().setLevel(logging.DEBUG) + + import numpy as np + import torch + import torch_geometric as torchg + from tqdm import tqdm + + from src import cfg + from src.utils.loader import Loader + from src.utils.residue import ResInfo, Chain + from src.data_prep.feature_extraction.ligand import smile_to_graph + from src.data_prep.feature_extraction.protein import target_to_graph + from src.utils.residue import ResInfo, Chain + + #%% + DATA = cfg.DATA_OPT.kiba + lig_feature = cfg.LIG_FEAT_OPT.original + lig_edge = cfg.LIG_EDGE_OPT.binary + pro_feature = cfg.PRO_FEAT_OPT.nomsa + pro_edge = cfg.PRO_EDGE_OPT.binary + + lig_seq = "COC1=C(C=CC(=C1)C2=CC3=C(C=C2)C(=CC4=CC=CN4)C(=O)N3)O" #CHEMBL202930 + + # %% build ligand graph + mol_feat, mol_edge = smile_to_graph(lig_seq, lig_feature=lig_feature, lig_edge=lig_edge) + lig = torchg.data.Data(x=torch.Tensor(mol_feat), edge_index=torch.LongTensor(mol_edge),lig_seq=lig_seq) + + # %% Get initial pkd value: + pro_id = "P67870" + pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb' + + def get_protein_features(pro_id, pdb_file, DATA=DATA): + pdb = Chain(pdb_file) + pro_seq = pdb.sequence + pro_cmap = pdb.get_contact_map() + + updated_seq, extra_feat, edge_idx = target_to_graph(target_sequence=pro_seq, + contact_map=pro_cmap, + threshold=8.0 if DATA is cfg.DATA_OPT.PDBbind else -0.5, + pro_feat=pro_feature) + pro_feat = torch.Tensor(extra_feat) + + pro = torchg.data.Data(x=torch.Tensor(pro_feat), + edge_index=torch.LongTensor(edge_idx), + pro_seq=updated_seq, # Protein sequence for downstream esm model + prot_id=pro_id, + edge_weight=None) + return pro, pro_seq + + pro, pro_seq = get_protein_features(pro_id, pdb_file) + + # %% Loading the model + m = Loader.load_tuned_model('davis_DG', fold=1) + m.eval() + original_pkd = m(pro, lig) + print(original_pkd) + + # %% mutate and regenerate graphs + muta = np.zeros(shape=(len(ResInfo.pep_to_code.keys()), len(pro_seq))) + + # zero indexed res range to mutate: + res_range = (100, 200) + res_range = (max(res_range[0], 0), + min(res_range[1], len(pro_seq))) + + # %% + from src.utils.mutate_model import run_modeller + + amino_acids = ResInfo.amino_acids[:-1] # not including "X" - unknown + + with tqdm(range(*res_range), ncols=80, total=(res_range[1]-res_range[0])) as t: + for j in t: + for i, AA in enumerate(amino_acids): + if i%2 == 0: + t.set_postfix(res=j, AA=i+1) + + if pro_seq[i] == AA: + muta[i,j] = original_pkd + continue + + pro_id = "P67870" + pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb' + out_pdb_fp = run_modeller(pdb_file, 1, ResInfo.code_to_pep[AA], "A") + + pro, _ = get_protein_features(pro_id, out_pdb_fp) + muta[i,j] = m(pro, lig) + + # delete after use + os.remove(out_pdb_fp) + + #%% + np.save(f"muta-{res_range[0]}_{res_range[1]}.npy", muta) + + + diff --git a/src/utils/config.py b/src/utils/config.py index e24465d..672ae18 100644 --- a/src/utils/config.py +++ b/src/utils/config.py @@ -89,7 +89,8 @@ class LIG_FEAT_OPT(StringEnum): from pathlib import Path # Model save paths -issue_number = 131 +issue_number = None + DATA_BASENAME = f'data/{f"v{issue_number}" if issue_number else ""}' RESULTS_PATH = os.path.abspath(f'results/{f"v{issue_number}/" if issue_number else ""}') MEDIA_SAVE_DIR = f'{RESULTS_PATH}/model_media/' diff --git a/src/utils/mutate_model.py b/src/utils/mutate_model.py index e8dd369..c6a2b6a 100644 --- a/src/utils/mutate_model.py +++ b/src/utils/mutate_model.py @@ -61,8 +61,11 @@ def run_modeller(modelname:str, respos:int|str, restyp:str, chain:str, out_path: raise FileExistsError('would overwrite existing file at "{modelname}.pdb"') respos = str(respos) - log.verbose() + log.none() + TMP_FILE_PATH = modelname+restyp+respos+'.tmp' + OUT_FILE_PATH = f"{modelname}-{restyp}_{respos}.pdb" + # Set a different value for rand_seed to get a different final model env = Environ(rand_seed=-49837) @@ -120,9 +123,8 @@ def run_modeller(modelname:str, respos:int|str, restyp:str, chain:str, out_path: #before proceeding, because not all sequence related information about MODEL #is changed by this command (e.g., internal coordinates, charges, and atom #types and radii are not updated). - - mdl1.write(file=modelname+restyp+respos+'.tmp') - mdl1.read(file=modelname+restyp+respos+'.tmp') + mdl1.write(file=TMP_FILE_PATH) + mdl1.read(file=TMP_FILE_PATH) #set up restraints before computing energy #we do this a second time because the model has been written out and read in, @@ -157,7 +159,8 @@ def run_modeller(modelname:str, respos:int|str, restyp:str, chain:str, out_path: s.energy() #give a proper name - mdl1.write(file=f"{modelname}-{restyp}_{respos}.pdb") + mdl1.write(file=OUT_FILE_PATH) #delete the temporary file - os.remove(modelname+restyp+respos+'.tmp') \ No newline at end of file + os.remove(TMP_FILE_PATH) + return OUT_FILE_PATH \ No newline at end of file From bb12c2840903194133d6e8edb7a5365dd5a2c8d1 Mon Sep 17 00:00:00 2001 From: jyaacoub Date: Thu, 15 Aug 2024 12:52:41 -0400 Subject: [PATCH 3/3] fix(mutagenesis): plot labels and size of text #136 --- src/analysis/figures.py | 35 +++++++++++++++++++ src/analysis/mutagenesis.py | 70 ++++++++++++++++++++++++++----------- src/utils/config.py | 2 +- 3 files changed, 85 insertions(+), 22 deletions(-) diff --git a/src/analysis/figures.py b/src/analysis/figures.py index f2c240f..c26f051 100644 --- a/src/analysis/figures.py +++ b/src/analysis/figures.py @@ -972,3 +972,38 @@ def generate_roc_curve(true_dpkd, pred_dpkd, thres_range=(0,5), step=0.1): # performance drop values: grp = plot_df.groupby(['data', 'overlap']).mean() grp[grp.index.get_level_values(1)].values - grp[~grp.index.get_level_values(1)].values + + #%% + ######################################################################## + ########################## VIOLIN PLOTTING ############################# + ######################################################################## + import logging + from matplotlib import pyplot as plt + + from src.analysis.figures import prepare_df, fig_combined, custom_fig + + dft = prepare_df('./results/v115/model_media/model_stats.csv') + dfv = prepare_df('./results/v115/model_media/model_stats_val.csv') + + models = { + 'DG': ('nomsa', 'binary', 'original', 'binary'), + # 'esm': ('ESM', 'binary', 'original', 'binary'), # esm model + 'aflow': ('nomsa', 'aflow', 'original', 'binary'), + # 'gvpP': ('gvp', 'binary', 'original', 'binary'), + # 'gvpL': ('nomsa', 'binary', 'gvp', 'binary'), + # 'aflow_ring3': ('nomsa', 'aflow_ring3', 'original', 'binary'), + # 'gvpL_aflow': ('nomsa', 'aflow', 'gvp', 'binary'), + # 'gvpL_aflow_rng3': ('nomsa', 'aflow_ring3', 'gvp', 'binary'), + #GVPL_ESMM_davis3D_nomsaF_aflowE_48B_0.00010636872718329864LR_0.23282479481785903D_2000E_gvpLF_binaryLE + # 'gvpl_esm_aflow': ('ESM', 'aflow', 'gvp', 'binary'), + } + + fig, axes = fig_combined(dft, datasets=['davis'], fig_callable=custom_fig, + models=models, metrics=['cindex', 'mse'], + fig_scale=(10,5), add_stats=True, title_postfix=" test set performance", box=True, fold_labels=True) + plt.xticks(rotation=45) + + fig, axes = fig_combined(dfv, datasets=['davis'], fig_callable=custom_fig, + models=models, metrics=['cindex', 'mse'], + fig_scale=(10,5), add_stats=True, title_postfix=" validation set performance", box=True, fold_labels=True) + plt.xticks(rotation=45) diff --git a/src/analysis/mutagenesis.py b/src/analysis/mutagenesis.py index 2479952..eb78c8a 100644 --- a/src/analysis/mutagenesis.py +++ b/src/analysis/mutagenesis.py @@ -4,29 +4,38 @@ from src.utils.residue import ResInfo, Chain from src.utils.mutate_model import run_modeller - - -def plot_sequence(muta, pep_opts, pro_seq): +import copy + +def plot_sequence(muta, pep_opts, pro_seq, delta=False): + if delta: + muta = copy.deepcopy(muta) + original_pkd = None + for i, AA in enumerate(pep_opts): + if AA == pro_seq[0]: + original_pkd = muta[i,0] + + muta -= original_pkd + + # Create the plot plt.figure(figsize=(len(pro_seq), 20)) # Adjust the figure size as needed plt.imshow(muta, aspect='auto', cmap='coolwarm', interpolation='none') # Set the x-ticks to correspond to positions in the protein sequence - plt.xticks(ticks=np.arange(len(pro_seq)), labels=[ResInfo.code_to_pep[p] for p in pro_seq], rotation=45, fontsize=16) + plt.xticks(ticks=np.arange(len(pro_seq)), labels=pro_seq, fontsize=16) plt.yticks(ticks=np.arange(len(pep_opts)), labels=pep_opts, fontsize=16) - plt.xlabel('Protein Sequence Position', fontsize=75) - plt.ylabel('Peptide Options', fontsize=75) + plt.xlabel('Original Protein Sequence', fontsize=75) + plt.ylabel('Mutated to Amino Acid code', fontsize=75) # Add text labels to each square for i in range(len(pep_opts)): for j in range(len(pro_seq)): - text = plt.text(j, i, f'{ResInfo.pep_to_code[pep_opts[i]]}', ha='center', va='center', color='black', fontsize=8) + text = plt.text(j, i, f'{pep_opts[i]}', ha='center', va='center', color='black', fontsize=12) # Add a white outline to the text text.set_path_effects([ PathEffects.Stroke(linewidth=1, foreground='white'), PathEffects.Normal() ]) - break # Adjust gridlines to be off-center, forming cell boundaries @@ -45,14 +54,6 @@ def plot_sequence(muta, pep_opts, pro_seq): if __name__ == "__main__": - pro_id = "P67870" - pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb' - - - plot_sequence(np.load('muta-100_200.npy'), pep_opts=list(ResInfo.pep_to_code.keys()), - pro_seq=Chain(pdb_file).sequence) - - # %% import os import logging @@ -114,10 +115,9 @@ def get_protein_features(pro_id, pdb_file, DATA=DATA): print(original_pkd) # %% mutate and regenerate graphs - muta = np.zeros(shape=(len(ResInfo.pep_to_code.keys()), len(pro_seq))) # zero indexed res range to mutate: - res_range = (100, 200) + res_range = (0,250) res_range = (max(res_range[0], 0), min(res_range[1], len(pro_seq))) @@ -125,29 +125,57 @@ def get_protein_features(pro_id, pdb_file, DATA=DATA): from src.utils.mutate_model import run_modeller amino_acids = ResInfo.amino_acids[:-1] # not including "X" - unknown + muta = np.zeros(shape=(len(amino_acids), len(pro_seq))) + n_continued = 0 with tqdm(range(*res_range), ncols=80, total=(res_range[1]-res_range[0])) as t: for j in t: for i, AA in enumerate(amino_acids): if i%2 == 0: t.set_postfix(res=j, AA=i+1) - if pro_seq[i] == AA: + if pro_seq[j] == AA: muta[i,j] = original_pkd + n_continued += 1 continue pro_id = "P67870" pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb' - out_pdb_fp = run_modeller(pdb_file, 1, ResInfo.code_to_pep[AA], "A") + out_pdb_fp = run_modeller(pdb_file, j+1, ResInfo.code_to_pep[AA], "A") pro, _ = get_protein_features(pro_id, out_pdb_fp) + assert pro.pro_seq != pro_seq and pro.pro_seq[j] == AA, "ERROR in modeller" muta[i,j] = m(pro, lig) # delete after use os.remove(out_pdb_fp) - + print('n_continued:', n_continued) #%% np.save(f"muta-{res_range[0]}_{res_range[1]}.npy", muta) + exit() + + # %% + import numpy as np + from src.analysis.mutagenesis import plot_sequence + from src.utils.residue import ResInfo, Chain + + pro_id = "P67870" + pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb' + res_range = (0,215) + muta = np.load(f'muta-{res_range[0]}_{res_range[1]}.npy') + + # pkd -> kd + # muta = 10**(-muta) + # %% + amino_acids = ResInfo.amino_acids[:-1] # not including "X" - unknown + + plot_sequence(muta[:,res_range[0]:res_range[1]], pep_opts=amino_acids, + pro_seq=Chain(pdb_file).sequence[res_range[0]:res_range[1]]) + + plot_sequence(muta[:,res_range[0]:res_range[1]], pep_opts=amino_acids, + pro_seq=Chain(pdb_file).sequence[res_range[0]:res_range[1]], delta=True) + + diff --git a/src/utils/config.py b/src/utils/config.py index 672ae18..8fa711f 100644 --- a/src/utils/config.py +++ b/src/utils/config.py @@ -111,7 +111,7 @@ class LIG_FEAT_OPT(StringEnum): SLURM_ACCOUNT = None SLURM_GPU_NAME = 'v100' -if 'uhnh4h' in DOMAIN_NAME: +if ('uhnh4h' in DOMAIN_NAME or 'h4h' in DOMAIN_NAME): CLUSTER = 'h4h' SLURM_PARTITION = 'gpu' SLURM_CONSTRAINT = 'gpu32g'