From 17c58e57fb1b55a5388adc533a6ee589de362477 Mon Sep 17 00:00:00 2001
From: jyaacoub <jyaacoub21@gmail.com>
Date: Mon, 12 Aug 2024 14:32:50 -0400
Subject: [PATCH 1/3] feat: mutate_model.py with `run_modeller` fn #136

---
 playground.py             |   8 ++
 src/utils/mutate_model.py | 163 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 src/utils/mutate_model.py

diff --git a/playground.py b/playground.py
index 1182518..757ddf5 100644
--- a/playground.py
+++ b/playground.py
@@ -1,3 +1,11 @@
+#%%
+from src import cfg
+
+#%%
+from src.utils.mutate_model import run_modeller
+
+run_modeller('/cluster/home/t122995uhn/projects/tmp/d1/O00141.pdb', 50, "MET", "A")
+
 #%%
 import pandas as pd
 
diff --git a/src/utils/mutate_model.py b/src/utils/mutate_model.py
new file mode 100644
index 0000000..e8dd369
--- /dev/null
+++ b/src/utils/mutate_model.py
@@ -0,0 +1,163 @@
+import sys
+import os
+
+from modeller import *
+from modeller.optimizers import MolecularDynamics, ConjugateGradients
+from modeller.automodel import autosched
+
+
+def optimize(atmsel, sched):
+    # Pass 1 (conjugate gradient): run every step of the schedule on the selection
+    for step in sched:
+        step.optimize(atmsel, max_iterations=200, min_atom_shift=0.001)
+    # Pass 2: molecular-dynamics refinement, then one more conjugate-gradient run
+    refine(atmsel)
+    cg = ConjugateGradients()
+    cg.optimize(atmsel, max_iterations=200, min_atom_shift=0.001)
+
+
+# Molecular dynamics: step the temperature up to 1000, then back down to 300
+def refine(atmsel):
+    # at T=1000, max_atom_shift for 4fs is approx. 0.15 A.
+    md = MolecularDynamics(cap_atom_shift=0.39, md_time_step=4.0,
+                           md_return='FINAL')
+    init_vel = True
+    for (its, equil, temps) in ((200, 20, (150.0, 250.0, 400.0, 700.0, 1000.0)),
+                                (200, 600,
+                                 (1000.0, 800.0, 600.0, 500.0, 400.0, 300.0))):
+        for temp in temps:
+            md.optimize(atmsel, init_velocities=init_vel, temperature=temp,
+                         max_iterations=its, equilibrate=equil)
+            init_vel = False  # velocities are initialised for the first run only
+
+
+# Use homologs and the dihedral library for dihedral angle restraints
+def make_restraints(mdl1, aln):
+   rsr = mdl1.restraints
+   rsr.clear()  # start from an empty restraint set
+   s = Selection(mdl1)
+   for typ in ('stereo', 'phi-psi_binormal'):
+       rsr.make(s, restraint_type=typ, aln=aln, spline_on_site=True)
+   for typ in ('omega', 'chi1', 'chi2', 'chi3', 'chi4'):
+       rsr.make(s, restraint_type=typ+'_dihedral', spline_range=4.0,
+                spline_dx=0.3, spline_min_points = 5, aln=aln,
+                spline_on_site=True)
+
+
+def run_modeller(modelname:str, respos:int|str, restyp:str, chain:str, out_path:str=None):
+    """
+    Mutate one residue of the model at `modelname` (text from ".pdb" on is stripped) and
+    write the result next to it as "{modelname}-{restyp}_{respos}.pdb".
+
+    Args:
+        modelname (str): Model file path.
+        respos (int | str): Position of the target residue (1-indexed); looked up as a str.
+        restyp (str): 3 letter residue name for what to change into
+        chain (str): Single character chain identifier.
+        out_path (str): intended output path for the pdb file, overriding the default of "{modelname}-{restyp}_{respos}.pdb".
+    """
+    modelname = modelname.split('.pdb')[0]
+    if out_path and modelname == out_path.split('.pdb')[0]:
+        raise FileExistsError('would overwrite existing file at "{modelname}.pdb"')
+    
+    respos = str(respos)
+    log.verbose()
+
+    # Set a different value for rand_seed to get a different final model
+    env = Environ(rand_seed=-49837)
+
+    env.io.hetatm = True
+    #turn the soft sphere potential off
+    env.edat.dynamic_sphere=False
+    #turn the lennard-jones potential on instead (more accurate)
+    env.edat.dynamic_lennard=True
+    env.edat.contact_shell = 4.0
+    env.edat.update_dynamic = 0.39
+
+    # Read customized topology file with phosphoserines (or standard one)
+    env.libs.topology.read(file='$(LIB)/top_heav.lib')
+
+    # Read customized CHARMM parameter library with phosphoserines (or standard one)
+    env.libs.parameters.read(file='$(LIB)/par.lib')
+
+
+    # Read the original PDB file and copy its sequence to the alignment array:
+    mdl1 = Model(env, file=modelname)
+    ali = Alignment(env)
+    ali.append_model(mdl1, atom_files=modelname, align_codes=modelname)
+
+    #set up the mutate residue selection segment
+    s = Selection(mdl1.chains[chain].residues[respos])
+
+    #perform the mutate residue operation
+    s.mutate(residue_type=restyp)
+    #get two copies of the sequence.  A modeller trick to get things set up
+    ali.append_model(mdl1, align_codes=modelname)
+
+    # Generate molecular topology for mutant
+    mdl1.clear_topology()
+    mdl1.generate_topology(ali[-1])
+
+
+    # Transfer all the coordinates you can from the template native structure
+    # to the mutant (this works even if the order of atoms in the native PDB
+    # file is not standard):
+    #here we are generating the model by reading the template coordinates
+    mdl1.transfer_xyz(ali)
+
+    # Build the remaining unknown coordinates
+    mdl1.build(initialize_xyz=False, build_method='INTERNAL_COORDINATES')
+
+    #yes model2 is the same file as model1.  It's a modeller trick.
+    mdl2 = Model(env, file=modelname)
+
+    #required to do a transfer_res_numb
+    #ali.append_model(mdl2, atom_files=modelname, align_codes=modelname)
+    #transfers from "model 2" to "model 1"
+    mdl1.res_num_from(mdl2,ali)
+
+    #It is usually necessary to write the mutated sequence out and read it in
+    #before proceeding, because not all sequence related information about MODEL
+    #is changed by this command (e.g., internal coordinates, charges, and atom
+    #types and radii are not updated).
+
+    mdl1.write(file=modelname+restyp+respos+'.tmp')
+    mdl1.read(file=modelname+restyp+respos+'.tmp')
+
+    #set up restraints before computing energy
+    #we do this a second time because the model has been written out and read in,
+    #clearing the previously set restraints
+    make_restraints(mdl1, ali)
+
+    #a non-bonded pair has to have at least as many selected atoms
+    mdl1.env.edat.nonbonded_sel_atoms=1
+
+    sched = autosched.loop.make_for_model(mdl1)
+
+    #only optimize the selected residue (in first pass, just atoms in selected
+    #residue, in second pass, include nonbonded neighboring atoms)
+    #select the target residue again, now on the re-read model
+    s = Selection(mdl1.chains[chain].residues[respos])
+
+    mdl1.restraints.unpick_all()
+    mdl1.restraints.pick(s)
+
+    s.energy()
+
+    s.randomize_xyz(deviation=4.0)
+
+    mdl1.env.edat.nonbonded_sel_atoms=2
+    optimize(s, sched)
+
+    #feels environment (energy computed on pairs that have at least one member
+    #in the selected)
+    mdl1.env.edat.nonbonded_sel_atoms=1
+    optimize(s, sched)
+
+    s.energy()
+
+    #give a proper name
+    mdl1.write(file=f"{modelname}-{restyp}_{respos}.pdb")
+
+    #delete the temporary file
+    os.remove(modelname+restyp+respos+'.tmp')
\ No newline at end of file

From 45702a215401ed962ce88f99f7b265c45ff52652 Mon Sep 17 00:00:00 2001
From: jyaacoub <jyaacoub21@gmail.com>
Date: Tue, 13 Aug 2024 14:27:02 -0400
Subject: [PATCH 2/3] feat(mutagenesis): sequence plotting and sample plot #136

---
 playground.py               | 117 ++++++++-------------------
 src/analysis/mutagenesis.py | 153 ++++++++++++++++++++++++++++++++++++
 src/utils/config.py         |   3 +-
 src/utils/mutate_model.py   |  15 ++--
 4 files changed, 195 insertions(+), 93 deletions(-)
 create mode 100644 src/analysis/mutagenesis.py

diff --git a/playground.py b/playground.py
index 757ddf5..8669224 100644
--- a/playground.py
+++ b/playground.py
@@ -1,89 +1,34 @@
-#%%
-from src import cfg
-
-#%%
-from src.utils.mutate_model import run_modeller
-
-run_modeller('/cluster/home/t122995uhn/projects/tmp/d1/O00141.pdb', 50, "MET", "A")
-
-#%%
-import pandas as pd
-
-def get_test_oncokbs(train_df=pd.read_csv('/cluster/home/t122995uhn/projects/data/test/PDBbindDataset/nomsa_binary_original_binary/full/cleaned_XY.csv'),
-                     oncokb_fp='/cluster/home/t122995uhn/projects/data/tcga/mart_export.tsv', 
-                     biomart='/cluster/home/t122995uhn/projects/downloads/oncoKB_DrugGenePairList.csv'):
-    #Get gene names for PDBbind
-    dfbm = pd.read_csv(oncokb_fp, sep='\t')
-    dfbm['PDB ID'] = dfbm['PDB ID'].str.lower()
-    train_df.reset_index(names='idx',inplace=True)
-
-    df_uni = train_df.merge(dfbm, how='inner', left_on='prot_id', right_on='UniProtKB/Swiss-Prot ID')
-    df_pdb = train_df.merge(dfbm, how='inner', left_on='code', right_on='PDB ID')
-
-    # identifying ovelap with oncokb
-    # df_all will have duplicate entries for entries with multiple gene names...
-    df_all = pd.concat([df_uni, df_pdb]).drop_duplicates(['idx', 'Gene name'])[['idx', 'code', 'Gene name']]
-
-    dfkb = pd.read_csv(biomart)
-    df_all_kb = df_all.merge(dfkb.drop_duplicates('gene'), left_on='Gene name', right_on='gene', how='inner')
-
-    trained_genes = set(df_all_kb.gene)
-
-    #Identify non-trained genes
-    return dfkb[~dfkb['gene'].isin(trained_genes)], dfkb[dfkb['gene'].isin(trained_genes)], dfkb
-
-
-train_df = pd.read_csv('/cluster/home/t122995uhn/projects/data/test/PDBbindDataset/nomsa_binary_original_binary/train0/cleaned_XY.csv')
-val_df = pd.read_csv('/cluster/home/t122995uhn/projects/data/test/PDBbindDataset/nomsa_binary_original_binary/val0/cleaned_XY.csv')
-
-train_df = pd.concat([train_df, val_df])
-
-get_test_oncokbs(train_df=train_df)
-
-
-
-
-
-#%%
+# %%
 ########################################################################
-########################## BUILD DATASETS ##############################
+########################## VIOLIN PLOTTING #############################
 ########################################################################
-import os
-from src.data_prep.init_dataset import create_datasets
-from src import cfg
 import logging
-cfg.logger.setLevel(logging.DEBUG)
-
-splits = '/cluster/home/t122995uhn/projects/MutDTA/splits/davis/'
-create_datasets([cfg.DATA_OPT.PDBbind, cfg.DATA_OPT.davis, cfg.DATA_OPT.kiba], 
-                feat_opt=cfg.PRO_FEAT_OPT.nomsa, 
-                edge_opt=[cfg.PRO_EDGE_OPT.binary, cfg.PRO_EDGE_OPT.aflow],
-                ligand_features=[cfg.LIG_FEAT_OPT.original, cfg.LIG_FEAT_OPT.gvp], 
-                ligand_edges=cfg.LIG_EDGE_OPT.binary, overwrite=False,
-                k_folds=5,
-                test_prots_csv=f'{splits}/test.csv',
-                val_prots_csv=[f'{splits}/val{i}.csv' for i in range(5)],)
-                # data_root=os.path.abspath('../data/test/'))
-
-# %% Copy splits to commit them:
-#from to:
-import shutil
-from_dir_p = '/cluster/home/t122995uhn/projects/data/v131/'
-to_dir_p = '/cluster/home/t122995uhn/projects/MutDTA/splits/'
-from_db = ['PDBbindDataset', 'DavisKibaDataset/kiba', 'DavisKibaDataset/davis']
-to_db = ['pdbbind', 'kiba', 'davis']
-
-from_db = [f'{from_dir_p}/{f}/nomsa_binary_original_binary/' for f in from_db]
-to_db = [f'{to_dir_p}/{f}' for f in to_db]
-
-for src, dst in zip(from_db, to_db):
-    for x in ['train', 'val']:
-        for i in range(5):
-            print(f"{src}/{x}{i}/XY.csv", f"{dst}/{x}{i}.csv")
-            shutil.copy(f"{src}/{x}{i}/XY.csv", f"{dst}/{x}{i}.csv")
-    
-    print(f"{src}/test/XY.csv", f"{dst}/test.csv")
-    shutil.copy(f"{src}/test/XY.csv", f"{dst}/test.csv")
-    
-    
-# %%
+from matplotlib import pyplot as plt
+
+from src.analysis.figures import prepare_df, fig_combined, custom_fig
+
+dft = prepare_df('./results/v115/model_media/model_stats.csv')
+dfv = prepare_df('./results/v115/model_media/model_stats_val.csv')
+
+models = {
+    'DG': ('nomsa', 'binary', 'original', 'binary'),
+    'esm': ('ESM', 'binary', 'original', 'binary'), # esm model
+    'aflow': ('nomsa', 'aflow', 'original', 'binary'),
+    # 'gvpP': ('gvp', 'binary', 'original', 'binary'),
+    'gvpL': ('nomsa', 'binary', 'gvp', 'binary'),
+    # 'aflow_ring3': ('nomsa', 'aflow_ring3', 'original', 'binary'),
+    'gvpL_aflow': ('nomsa', 'aflow', 'gvp', 'binary'),
+    # 'gvpL_aflow_rng3': ('nomsa', 'aflow_ring3', 'gvp', 'binary'),
+    #GVPL_ESMM_davis3D_nomsaF_aflowE_48B_0.00010636872718329864LR_0.23282479481785903D_2000E_gvpLF_binaryLE
+    # 'gvpl_esm_aflow': ('ESM', 'aflow', 'gvp', 'binary'),
+}
+
+fig, axes = fig_combined(dft, datasets=['davis'], fig_callable=custom_fig,
+             models=models, metrics=['cindex', 'mse'],
+             fig_scale=(10,5), add_stats=True, title_postfix=" test set performance", box=True, fold_labels=True)
+plt.xticks(rotation=45)
+
+fig, axes = fig_combined(dfv, datasets=['davis'], fig_callable=custom_fig,
+             models=models, metrics=['cindex', 'mse'],
+             fig_scale=(10,5), add_stats=True, title_postfix=" validation set performance", box=True, fold_labels=True)
+plt.xticks(rotation=45)
\ No newline at end of file
diff --git a/src/analysis/mutagenesis.py b/src/analysis/mutagenesis.py
new file mode 100644
index 0000000..2479952
--- /dev/null
+++ b/src/analysis/mutagenesis.py
@@ -0,0 +1,153 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import matplotlib.patheffects as PathEffects
+
+from src.utils.residue import ResInfo, Chain
+from src.utils.mutate_model import run_modeller
+
+
+def plot_sequence(muta, pep_opts, pro_seq):
+    # Create the plot
+    plt.figure(figsize=(len(pro_seq), 20))  # Adjust the figure size as needed
+    plt.imshow(muta, aspect='auto', cmap='coolwarm', interpolation='none')
+
+    # Set the x-ticks to correspond to positions in the protein sequence
+    plt.xticks(ticks=np.arange(len(pro_seq)), labels=[ResInfo.code_to_pep[p] for p in pro_seq], rotation=45, fontsize=16)
+    plt.yticks(ticks=np.arange(len(pep_opts)), labels=pep_opts, fontsize=16)
+    plt.xlabel('Protein Sequence Position', fontsize=75)
+    plt.ylabel('Peptide Options', fontsize=75)
+
+    # Add text labels to each square
+    for i in range(len(pep_opts)):
+        for j in range(len(pro_seq)):
+            text = plt.text(j, i, f'{ResInfo.pep_to_code[pep_opts[i]]}', ha='center', va='center', color='black', fontsize=8)
+            # Add a white outline to the text
+            text.set_path_effects([
+                PathEffects.Stroke(linewidth=1, foreground='white'),
+                PathEffects.Normal()
+            ])
+            break
+
+
+    # Adjust gridlines to be off-center, forming cell boundaries
+    plt.gca().set_xticks(np.arange(-0.5, len(pro_seq), 1), minor=True)
+    plt.gca().set_yticks(np.arange(-0.5, len(pep_opts), 1), minor=True)
+    plt.grid(which='minor', color='gray', linestyle='-', linewidth=0.5)
+
+    # Remove the major gridlines (optional, for clarity)
+    plt.grid(which='major', color='none')
+
+    # Add a colorbar to show the scale of mutation values
+    plt.colorbar(label='Mutation Values')
+    plt.title('Mutation Array Visualization', fontsize=100)
+    plt.show()
+
+
+
+if __name__ == "__main__":
+    pro_id = "P67870"
+    pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb'
+
+
+    plot_sequence(np.load('muta-100_200.npy'), pep_opts=list(ResInfo.pep_to_code.keys()),
+                pro_seq=Chain(pdb_file).sequence)
+    
+    
+    # %%
+    import os
+    import logging
+    logging.getLogger().setLevel(logging.DEBUG)
+
+    import numpy as np
+    import torch
+    import torch_geometric as torchg
+    from tqdm import tqdm
+
+    from src import cfg
+    from src.utils.loader import Loader
+    from src.utils.residue import ResInfo, Chain
+    from src.data_prep.feature_extraction.ligand import smile_to_graph
+    from src.data_prep.feature_extraction.protein import target_to_graph
+    from src.utils.residue import ResInfo, Chain
+
+    #%%
+    DATA = cfg.DATA_OPT.kiba
+    lig_feature = cfg.LIG_FEAT_OPT.original
+    lig_edge = cfg.LIG_EDGE_OPT.binary
+    pro_feature = cfg.PRO_FEAT_OPT.nomsa
+    pro_edge = cfg.PRO_EDGE_OPT.binary
+
+    lig_seq = "COC1=C(C=CC(=C1)C2=CC3=C(C=C2)C(=CC4=CC=CN4)C(=O)N3)O" #CHEMBL202930
+
+    # %% build ligand graph
+    mol_feat, mol_edge = smile_to_graph(lig_seq, lig_feature=lig_feature, lig_edge=lig_edge)
+    lig = torchg.data.Data(x=torch.Tensor(mol_feat), edge_index=torch.LongTensor(mol_edge),lig_seq=lig_seq)
+
+    # %% Get initial pkd value:
+    pro_id = "P67870"
+    pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb'
+
+    def get_protein_features(pro_id, pdb_file, DATA=DATA):
+        pdb = Chain(pdb_file)
+        pro_seq = pdb.sequence
+        pro_cmap = pdb.get_contact_map()
+
+        updated_seq, extra_feat, edge_idx = target_to_graph(target_sequence=pro_seq, 
+                                                            contact_map=pro_cmap,
+                                                            threshold=8.0 if DATA is cfg.DATA_OPT.PDBbind else -0.5, 
+                                                            pro_feat=pro_feature)
+        pro_feat = torch.Tensor(extra_feat)
+
+        pro = torchg.data.Data(x=torch.Tensor(pro_feat),
+                            edge_index=torch.LongTensor(edge_idx),
+                            pro_seq=updated_seq, # Protein sequence for downstream esm model
+                            prot_id=pro_id,
+                            edge_weight=None)
+        return pro, pro_seq
+
+    pro, pro_seq = get_protein_features(pro_id, pdb_file)
+
+    # %% Loading the model
+    m = Loader.load_tuned_model('davis_DG', fold=1)
+    m.eval()
+    original_pkd = m(pro, lig)
+    print(original_pkd)
+
+    # %% mutate and regenerate graphs
+    muta = np.zeros(shape=(len(ResInfo.pep_to_code.keys()), len(pro_seq)))
+
+    # zero indexed res range to mutate:
+    res_range = (100, 200)
+    res_range = (max(res_range[0], 0),
+                min(res_range[1], len(pro_seq)))
+
+    # %%
+    from src.utils.mutate_model import run_modeller
+
+    amino_acids = ResInfo.amino_acids[:-1] # not including "X" - unknown
+
+    with tqdm(range(*res_range), ncols=80, total=(res_range[1]-res_range[0])) as t:
+        for j in t:
+            for i, AA in enumerate(amino_acids):
+                if i%2 == 0:
+                    t.set_postfix(res=j, AA=i+1)
+                
+                if pro_seq[i] == AA:
+                    muta[i,j] = original_pkd
+                    continue
+                    
+                pro_id = "P67870"
+                pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb'
+                out_pdb_fp = run_modeller(pdb_file, 1, ResInfo.code_to_pep[AA], "A")
+                
+                pro, _ = get_protein_features(pro_id, out_pdb_fp)
+                muta[i,j] = m(pro, lig)
+                
+                # delete after use
+                os.remove(out_pdb_fp)
+                
+    #%%
+    np.save(f"muta-{res_range[0]}_{res_range[1]}.npy", muta)
+
+
+
diff --git a/src/utils/config.py b/src/utils/config.py
index e24465d..672ae18 100644
--- a/src/utils/config.py
+++ b/src/utils/config.py
@@ -89,7 +89,8 @@ class LIG_FEAT_OPT(StringEnum):
 from pathlib import Path
 
 # Model save paths
-issue_number = 131
+issue_number = None
+
 DATA_BASENAME = f'data/{f"v{issue_number}" if issue_number else ""}'
 RESULTS_PATH = os.path.abspath(f'results/{f"v{issue_number}/" if issue_number else ""}')
 MEDIA_SAVE_DIR      = f'{RESULTS_PATH}/model_media/'
diff --git a/src/utils/mutate_model.py b/src/utils/mutate_model.py
index e8dd369..c6a2b6a 100644
--- a/src/utils/mutate_model.py
+++ b/src/utils/mutate_model.py
@@ -60,9 +60,12 @@ def run_modeller(modelname:str, respos:int|str, restyp:str, chain:str, out_path:
     if out_path and modelname == out_path.split('.pdb')[0]:
-        raise FileExistsError('would overwrite existing file at "{modelname}.pdb"')
+        raise FileExistsError(f'would overwrite existing file at "{modelname}.pdb"')
     
     respos = str(respos)
-    log.verbose()
+    log.none()
 
+    TMP_FILE_PATH = modelname+restyp+respos+'.tmp'
+    OUT_FILE_PATH = out_path or f"{modelname}-{restyp}_{respos}.pdb"  # an explicit out_path wins over the default name
+    
     # Set a different value for rand_seed to get a different final model
     env = Environ(rand_seed=-49837)
 
@@ -120,9 +123,8 @@ def run_modeller(modelname:str, respos:int|str, restyp:str, chain:str, out_path:
     #before proceeding, because not all sequence related information about MODEL
     #is changed by this command (e.g., internal coordinates, charges, and atom
     #types and radii are not updated).
-
-    mdl1.write(file=modelname+restyp+respos+'.tmp')
-    mdl1.read(file=modelname+restyp+respos+'.tmp')
+    mdl1.write(file=TMP_FILE_PATH)
+    mdl1.read(file=TMP_FILE_PATH)
 
     #set up restraints before computing energy
     #we do this a second time because the model has been written out and read in,
@@ -157,7 +159,8 @@ def run_modeller(modelname:str, respos:int|str, restyp:str, chain:str, out_path:
     s.energy()
 
     #give a proper name
-    mdl1.write(file=f"{modelname}-{restyp}_{respos}.pdb")
+    mdl1.write(file=OUT_FILE_PATH)
 
     #delete the temporary file
-    os.remove(modelname+restyp+respos+'.tmp')
\ No newline at end of file
+    os.remove(TMP_FILE_PATH)
+    return OUT_FILE_PATH
\ No newline at end of file

From bb12c2840903194133d6e8edb7a5365dd5a2c8d1 Mon Sep 17 00:00:00 2001
From: jyaacoub <jyaacoub21@gmail.com>
Date: Thu, 15 Aug 2024 12:52:41 -0400
Subject: [PATCH 3/3] fix(mutagenesis): plot labels and size of text #136

---
 src/analysis/figures.py     | 35 +++++++++++++++++++
 src/analysis/mutagenesis.py | 70 ++++++++++++++++++++++++++-----------
 src/utils/config.py         |  2 +-
 3 files changed, 85 insertions(+), 22 deletions(-)

diff --git a/src/analysis/figures.py b/src/analysis/figures.py
index f2c240f..c26f051 100644
--- a/src/analysis/figures.py
+++ b/src/analysis/figures.py
@@ -972,3 +972,38 @@ def generate_roc_curve(true_dpkd, pred_dpkd, thres_range=(0,5), step=0.1):
     # performance drop values:
     grp = plot_df.groupby(['data', 'overlap']).mean()
     grp[grp.index.get_level_values(1)].values - grp[~grp.index.get_level_values(1)].values
+    
+    #%%
+    ########################################################################
+    ########################## VIOLIN PLOTTING #############################
+    ########################################################################
+    import logging
+    from matplotlib import pyplot as plt
+
+    from src.analysis.figures import prepare_df, fig_combined, custom_fig
+
+    dft = prepare_df('./results/v115/model_media/model_stats.csv')
+    dfv = prepare_df('./results/v115/model_media/model_stats_val.csv')
+
+    models = {
+        'DG': ('nomsa', 'binary', 'original', 'binary'),
+        # 'esm': ('ESM', 'binary', 'original', 'binary'), # esm model
+        'aflow': ('nomsa', 'aflow', 'original', 'binary'),
+        # 'gvpP': ('gvp', 'binary', 'original', 'binary'),
+        # 'gvpL': ('nomsa', 'binary', 'gvp', 'binary'),
+        # 'aflow_ring3': ('nomsa', 'aflow_ring3', 'original', 'binary'),
+        # 'gvpL_aflow': ('nomsa', 'aflow', 'gvp', 'binary'),
+        # 'gvpL_aflow_rng3': ('nomsa', 'aflow_ring3', 'gvp', 'binary'),
+        #GVPL_ESMM_davis3D_nomsaF_aflowE_48B_0.00010636872718329864LR_0.23282479481785903D_2000E_gvpLF_binaryLE
+        # 'gvpl_esm_aflow': ('ESM', 'aflow', 'gvp', 'binary'),
+    }
+
+    fig, axes = fig_combined(dft, datasets=['davis'], fig_callable=custom_fig,
+                models=models, metrics=['cindex', 'mse'],
+                fig_scale=(10,5), add_stats=True, title_postfix=" test set performance", box=True, fold_labels=True)
+    plt.xticks(rotation=45)
+
+    fig, axes = fig_combined(dfv, datasets=['davis'], fig_callable=custom_fig,
+                models=models, metrics=['cindex', 'mse'],
+                fig_scale=(10,5), add_stats=True, title_postfix=" validation set performance", box=True, fold_labels=True)
+    plt.xticks(rotation=45)
diff --git a/src/analysis/mutagenesis.py b/src/analysis/mutagenesis.py
index 2479952..eb78c8a 100644
--- a/src/analysis/mutagenesis.py
+++ b/src/analysis/mutagenesis.py
@@ -4,29 +4,38 @@
 
 from src.utils.residue import ResInfo, Chain
 from src.utils.mutate_model import run_modeller
-
-
-def plot_sequence(muta, pep_opts, pro_seq):
+import copy
+
+def plot_sequence(muta, pep_opts, pro_seq, delta=False):
+    if delta:
+        # plot the change vs. the cell where residue 0 is "mutated" to itself
+        muta = copy.deepcopy(muta)  # keep the caller's array untouched
+        wt_rows = [i for i, AA in enumerate(pep_opts) if AA == pro_seq[0]]
+        if not wt_rows:
+            raise ValueError(f'residue "{pro_seq[0]}" is not in pep_opts')
+        original_pkd = muta[wt_rows[-1], 0]
+        muta -= original_pkd
+
+    
     # Create the plot
     plt.figure(figsize=(len(pro_seq), 20))  # Adjust the figure size as needed
     plt.imshow(muta, aspect='auto', cmap='coolwarm', interpolation='none')
 
     # Set the x-ticks to correspond to positions in the protein sequence
-    plt.xticks(ticks=np.arange(len(pro_seq)), labels=[ResInfo.code_to_pep[p] for p in pro_seq], rotation=45, fontsize=16)
+    plt.xticks(ticks=np.arange(len(pro_seq)), labels=pro_seq, fontsize=16)
     plt.yticks(ticks=np.arange(len(pep_opts)), labels=pep_opts, fontsize=16)
-    plt.xlabel('Protein Sequence Position', fontsize=75)
-    plt.ylabel('Peptide Options', fontsize=75)
+    plt.xlabel('Original Protein Sequence', fontsize=75)
+    plt.ylabel('Mutated to Amino Acid code', fontsize=75)
 
     # Add text labels to each square
     for i in range(len(pep_opts)):
         for j in range(len(pro_seq)):
-            text = plt.text(j, i, f'{ResInfo.pep_to_code[pep_opts[i]]}', ha='center', va='center', color='black', fontsize=8)
+            text = plt.text(j, i, f'{pep_opts[i]}', ha='center', va='center', color='black', fontsize=12)
             # Add a white outline to the text
             text.set_path_effects([
                 PathEffects.Stroke(linewidth=1, foreground='white'),
                 PathEffects.Normal()
             ])
-            break
 
 
     # Adjust gridlines to be off-center, forming cell boundaries
@@ -45,14 +54,6 @@ def plot_sequence(muta, pep_opts, pro_seq):
 
 
 if __name__ == "__main__":
-    pro_id = "P67870"
-    pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb'
-
-
-    plot_sequence(np.load('muta-100_200.npy'), pep_opts=list(ResInfo.pep_to_code.keys()),
-                pro_seq=Chain(pdb_file).sequence)
-    
-    
     # %%
     import os
     import logging
@@ -114,10 +115,9 @@ def get_protein_features(pro_id, pdb_file, DATA=DATA):
     print(original_pkd)
 
     # %% mutate and regenerate graphs
-    muta = np.zeros(shape=(len(ResInfo.pep_to_code.keys()), len(pro_seq)))
 
     # zero indexed res range to mutate:
-    res_range = (100, 200)
+    res_range = (0,250)
     res_range = (max(res_range[0], 0),
                 min(res_range[1], len(pro_seq)))
 
@@ -125,29 +125,57 @@ def get_protein_features(pro_id, pdb_file, DATA=DATA):
     from src.utils.mutate_model import run_modeller
 
     amino_acids = ResInfo.amino_acids[:-1] # not including "X" - unknown
+    muta = np.zeros(shape=(len(amino_acids), len(pro_seq)))
 
+    n_continued = 0
     with tqdm(range(*res_range), ncols=80, total=(res_range[1]-res_range[0])) as t:
         for j in t:
             for i, AA in enumerate(amino_acids):
                 if i%2 == 0:
                     t.set_postfix(res=j, AA=i+1)
                 
-                if pro_seq[i] == AA:
+                if pro_seq[j] == AA:
                     muta[i,j] = original_pkd
+                    n_continued += 1
                     continue
                     
                 pro_id = "P67870"
                 pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb'
-                out_pdb_fp = run_modeller(pdb_file, 1, ResInfo.code_to_pep[AA], "A")
+                out_pdb_fp = run_modeller(pdb_file, j+1, ResInfo.code_to_pep[AA], "A")
                 
                 pro, _ = get_protein_features(pro_id, out_pdb_fp)
+                assert pro.pro_seq != pro_seq and pro.pro_seq[j] == AA, "ERROR in modeller"
                 muta[i,j] = m(pro, lig)
                 
                 # delete after use
                 os.remove(out_pdb_fp)
-                
+    print('n_continued:', n_continued)  
     #%%
     np.save(f"muta-{res_range[0]}_{res_range[1]}.npy", muta)
 
+    exit()
+
+    # %%
+    import numpy as np
+    from src.analysis.mutagenesis import plot_sequence
+    from src.utils.residue import ResInfo, Chain
+
+    pro_id = "P67870"
+    pdb_file = f'/cluster/home/t122995uhn/projects/tmp/kiba/{pro_id}.pdb'
+    res_range = (0,215)
+    muta = np.load(f'muta-{res_range[0]}_{res_range[1]}.npy')
+
+    # pkd -> kd
+    # muta = 10**(-muta)
+    # %%
+    amino_acids = ResInfo.amino_acids[:-1] # not including "X" - unknown
+
+    plot_sequence(muta[:,res_range[0]:res_range[1]], pep_opts=amino_acids,
+                pro_seq=Chain(pdb_file).sequence[res_range[0]:res_range[1]])
+
+    plot_sequence(muta[:,res_range[0]:res_range[1]], pep_opts=amino_acids,
+                pro_seq=Chain(pdb_file).sequence[res_range[0]:res_range[1]], delta=True)
+
+
 
 
diff --git a/src/utils/config.py b/src/utils/config.py
index 672ae18..8fa711f 100644
--- a/src/utils/config.py
+++ b/src/utils/config.py
@@ -111,7 +111,7 @@ class LIG_FEAT_OPT(StringEnum):
 SLURM_ACCOUNT = None
 SLURM_GPU_NAME = 'v100'
 
-if 'uhnh4h' in DOMAIN_NAME:
+if 'h4h' in DOMAIN_NAME:  # substring test, so 'uhnh4h' hosts match as well
     CLUSTER = 'h4h'
     SLURM_PARTITION = 'gpu'
     SLURM_CONSTRAINT = 'gpu32g'