Merge branch 'main' of https://github.com/jyaacoub/MutDTA into main
Conflicts:
	.gitignore
jyaacoub committed Sep 8, 2023
2 parents 995f4d9 + 0a59cbb commit 99d976e
Showing 202 changed files with 11,912 additions and 1,057 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-app.yml
@@ -31,7 +31,7 @@ jobs:
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
-flake8 --count --select=E9,F63,F7,F82 --show-source --statistics --extend-exclude .venv ./
+flake8 --count --select=E9,F63,F7,F82 --show-source --statistics --extend-exclude .venv,run.py ./
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics --extend-ignore W291,W293,W391,E701 --extend-exclude .venv,__init__.py,docking ./src
#TODO: write unit tests for src scripts
4 changes: 2 additions & 2 deletions .gitignore
@@ -205,6 +205,6 @@ lib/mgltools_x86_64Linux2_1.5.7/MGLToolsPckgs/AutoDockTools/Utilities24/*

# too large to commit tar file
lib/mgltools_x86_64Linux2_1.5.7p1.tar.gz
-slurm_tests/*
-log_test/*
+log_test/
+slurm_tests/
1 change: 0 additions & 1 deletion data/downloaded_codes.json

This file was deleted.

10 changes: 10 additions & 0 deletions docs/PLATINUM.md
@@ -0,0 +1,10 @@
## Important columns in the PLATINUM data:
* `mutation` - Mutation: The mutation string (e.g.: I84V).
* `affin.k_wt` - Affinity Reference (nM): wild-type k value in **nM**.
* `affin.k_mt` - Affinity Mutant (nM): mutant k value in **nM**.
* `affin.pdb_id` - Reference PDB ID: The PDB ID for the reference structure (wt).
* `affin.chain` - Chain: The target residue chain in the PDB structure (none missing).
* `affin.lig_id` - Ligand ID: The ligand ID in the PDB structure (none missing!).
* `mut.uniprot` - Uniprot: The uniprot ID for the protein (**5 are missing**).
* `lig.canonical_smiles` - Canonical SMILES: The canonical SMILES for the ligand.
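
A minimal loading sketch (not from the repo) using the column definitions above; the file name `platinum.csv` is a placeholder, and the real k-value columns may need cleaning (e.g. censored entries like `>1000`) before the log transform:

```python
import numpy as np
import pandas as pd

df = pd.read_csv('platinum.csv')  # placeholder path to the raw PLATINUM table

# pK = -log10(K in M); the k values above are given in nM, hence the 1e-9 factor
df['pk_wt'] = -np.log10(df['affin.k_wt'] * 1e-9)
df['pk_mt'] = -np.log10(df['affin.k_mt'] * 1e-9)
df['delta_pk'] = df['pk_mt'] - df['pk_wt']  # affinity shift caused by the mutation

# rows missing a uniprot ID (5 according to the notes above)
print(df['mut.uniprot'].isna().sum())
```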

9 changes: 8 additions & 1 deletion docs/VINA_CONFIG.md
@@ -92,5 +92,12 @@ increased energy range and search space.
# 2 - Flexible docking
Following https://autodock-vina.readthedocs.io/en/latest/docking_flexible.html, the next runs will use flexible docking.

-New parameters for flexibile docking:
+New parameters for flexible docking:
### run11
```conf
energy_range = 10
exhaustiveness = 20
num_modes = 9
out = /cluster/projects/kumargroup/jean/data/vina_out/run11
log = /cluster/projects/kumargroup/jean/data/vina_out/run11
```
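
A sketch of how a conf like run11 might be consumed, assuming the `vina` binary is on `$PATH` and accepts `--config` (standard AutoDock Vina behaviour); the receptor/ligand paths are placeholders, not files from this repo:

```python
import subprocess

subprocess.run([
    "vina",
    "--config", "run11_conf.txt",    # energy_range / exhaustiveness / num_modes / out / log
    "--receptor", "receptor.pdbqt",  # placeholder
    "--ligand", "ligand.pdbqt",      # placeholder
], check=True)
```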
19 changes: 19 additions & 0 deletions docs/VINA_PROCEDURE.md
@@ -213,3 +213,22 @@ Same as **2.2A** but change `conf_dir` in `src/docking/sbatch/multi_dock.sh` to

## 3. Data analysis
Follow the same steps as in [**5. Data analysis**](#5-data-analysis) above.


# Flexible Docking
For flexible docking we also need to run `prepare_flexreceptor4.py` during the preparation stage [**3. Prepare receptor**](#3-preparing-receptor-and-ligand-pdbqt-files).

For this we just pass the additional `-f` argument to enable flexible docking. This requires extra PDB files named `<code>_pocket.pdb`, from which the script selects the residues to treat as flexible.
```
Usage: ./src/docking/bash_scripts/PDBbind_prepare.sh path ADT template [OPTIONS]
path - path to PDBbind dir containing pdb for protein to convert to pdbqt.
ADT - path to MGL root (e.g.: '~/mgltools_x86_64Linux2_1.5.7/')
template - path to conf template file (create empty file if you want vina defaults).
Options:
-sl --shortlist: path to csv file containing a list of pdbcodes to process.
Doesn't matter what the file is as long as the first column contains the pdbcodes.
-cd --config-dir: path to store new configurations in.
Default is to store it with the prepared receptor as <PDBCode>_conf.txt
-f --flex: optional, runs prepare_flexreceptor4.py with all residues in <code>_pocket.pdb
as flexible.
```
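
For illustration, a hypothetical invocation with the flexible option enabled; all paths below are placeholders, only the flags come from the usage text above:

```python
import os
import subprocess

subprocess.run([
    "./src/docking/bash_scripts/PDBbind_prepare.sh",
    "/path/to/PDBbind",                                    # path: PDBbind dir with protein PDBs
    os.path.expanduser("~/mgltools_x86_64Linux2_1.5.7/"),  # ADT: MGLTools root
    "/path/to/conf_template.txt",                          # template (empty file for vina defaults)
    "-sl", "/path/to/shortlist.csv",                       # optional shortlist of pdbcodes
    "-f",                                                  # flexible: use <code>_pocket.pdb residues
], check=True)
```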
4 changes: 3 additions & 1 deletion docs/requirements.txt
@@ -11,10 +11,12 @@ lifelines # used for concordance index calc
# model building
torch
torch_geometric
transformers # huggingface needed for esm

# optional:
torchsummary
tabulate # for torch_geometric.nn.summary
ipykernel
plotly
-requests
+requests
+ray[tune]
53 changes: 53 additions & 0 deletions playground-ddp.py
@@ -0,0 +1,53 @@
#%%
from src.train_test.training import train_tune
from src.models.prior_work import DGraphDTA
from src.models.mut_dta import EsmDTA, EsmAttentionDTA
from src.utils.loader import Loader
from src.utils import config
import pickle


#%%
import ray
from ray import tune
ray.init(num_cpus=6, num_gpus=3)


# %%
MODEL = 'EDI'
PRO_FEATURE = 'nomsa'
DATA = 'davis'
data_root = "/cluster/home/t122995uhn/projects/data"  # WARNING: HARD-CODED PATH

search_space = {  # hyperparameter grid (named to avoid shadowing the imported src.utils.config)
    "edge": tune.grid_search(['simple', 'binary']),
    "dropout": tune.grid_search([0.1, 0.2, 0.4, 0.5]),

    "lr": tune.grid_search([1e-4, 1e-3, 1e-2]),
    "batch_size": tune.grid_search([4, 5, 6, 10, 12]),
}

#%%
train_dataset = Loader.load_dataset(DATA, PRO_FEATURE, subset='train', path=data_root)
val_dataset = Loader.load_dataset(DATA, PRO_FEATURE, subset='val', path=data_root)

tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train_tune, model=MODEL, pro_feature=PRO_FEATURE,
                             train_dataset=train_dataset,
                             val_dataset=val_dataset),
        resources={"cpu": 2, "gpu": 1}
    ),
    tune_config=tune.TuneConfig(
        metric="val_loss",
        mode="min",
    ),
    param_space=search_space,
)
# %%
results = tuner.fit()
print(results)
print(results.get_best_result())

with open(f'{MODEL}_{PRO_FEATURE}_{DATA}-rayTuneResults.pickle', 'wb') as handle:
pickle.dump(results, handle)
130 changes: 130 additions & 0 deletions playground.py
@@ -0,0 +1,130 @@
#%%
# from src.feature_extraction.process_msa import create_pfm_np_files

# create_pfm_np_files('../data/PDBbind_aln/', processes=4)

#%%
from src.utils.loader import Loader
from src.utils import config
import pickle

import numpy as np
import pandas as pd
from src.data_processing.processors import Processor
from prody import parsePDB, calcANM

# delNonstdAminoacid('SEP')


td = Loader.load_dataset('PDBbind', 'nomsa',
                         path="/cluster/home/t122995uhn/projects/data")

hv = parsePDB('../data/v2020-other-PL/5swg/5swg_protein.pdb', subset='calpha').getHierView()
for c in hv: print(c)

#%%
ch = Processor.pdb_get_chain('../data/v2020-other-PL/5swg/5swg_protein.pdb', model=1)
seq_mine = ch.getSequence()
seq_real = hv['A'].getSequence()
for i, c in enumerate(seq_real):
    if seq_real[i] != seq_mine[i]:
        print(f'{i}: MISMATCH')
        print(f'\t{seq_real[i-3:i+3]}')
        print(f'\t{seq_mine[i-3:i+3]}')
        break


#%%
from prody import parsePDB, calcANM
from src.feature_extraction.protein import calcCrossCorr

idx = 1
pdb_fp = td.pdb_p(td[idx]['code'])
target_seq = td[idx]['protein'].pro_seq
n_modes = 10

pdb = parsePDB(pdb_fp, subset='calpha').getHierView()

anm = None
for chain in pdb:
    if chain.getSequence() == target_seq:
        anm, _ = calcANM(chain, selstr='calpha', n_modes=n_modes)
        break

if anm is None:
    raise ValueError(f"No matching chain found in pdb file ({pdb_fp})")
else:
    # norm=True normalizes it from -1.0 to 1.0
    cc = calcCrossCorr(anm[:n_modes], n_cpu=2, norm=True)

###########################################################################################
#%% selecting chain A to run ANM on:
from prody import parsePDB, calcANM, calcCrossCorr
import matplotlib.pyplot as plt
import numpy as np

pdb = "10gs"
pdb_fp = f"/cluster/home/t122995uhn/projects/data/v2020-other-PL/{pdb}/{pdb}_protein.pdb"

pdb = parsePDB(pdb_fp, subset='calpha').getHierView()['A']
anm, atoms = calcANM(pdb, selstr='calpha', n_modes=20)

# %%
cc = calcCrossCorr(anm[:2], n_cpu=1, norm=True)

plt.matshow(cc)  # alternative: rescale first with (cc - np.min(cc)) / (np.max(cc) - np.min(cc))
plt.title('Cross Correlation Matrix:')
plt.show()

lap_m = anm.getKirchhoff()
adj = np.diag(np.diag(lap_m)) - lap_m  # adjacency A = D - L (degree matrix minus Kirchhoff/Laplacian)
plt.title('Overlapped with adjacency matrix')
plt.imshow(adj, alpha=0.5)
plt.imshow(cc, alpha=0.6)  # alternative: rescale cc to [0, 1] first

# %% TM SCORE:
# ================================================================================================
from prody import parsePDB, matchAlign, showProtein
from pylab import legend
import numpy as np

# %%
# tm score for A&B is 0.9835 (https://seq2fun.dcmb.med.umich.edu//TM-score/tmp/110056.html)
src_model = '/cluster/home/t122995uhn/projects/data/v2020-other-PL/1a1e/1a1e_protein.pdb'
pred_model = '/cluster/home/t122995uhn/projects/colabfold/out/1a1e.msa_unrelaxed_rank_001_alphafold2_ptm_model_1_seed_000.pdb'

sm = parsePDB(src_model, model=1, subset="ca", chain="A")
sm.setTitle('experimental')
pm = parsePDB(pred_model, model=1, subset="ca", chain="A")
pm.setTitle('alphafold')

showProtein(sm, pm)
legend()

#%% Performing alignment before TM-score
result = matchAlign(pm, sm)

showProtein(sm, pm)
legend()

# %%
def tm_score(xyz0, xyz1):  # TODO: check if TM-align uses all atoms!
    # Assumes a fixed one-to-one atom correspondence (e.g. after matchAlign above);
    # TM-align itself also optimizes the alignment.
    L = len(xyz0)
    # d0 is less than 0.5 for L < 22
    # and nan for L < 15 (cube root of a negative number)
    d0 = 1.24 * np.power(L - 15, 1/3) - 1.8
    d0 = max(0.5, d0)

    # squared distance between each pair of aligned atoms (row-wise sum)
    di2 = np.sum((xyz0 - xyz1) ** 2, axis=1)

    # TM-score = (1/L) * sum_i 1 / (1 + (d_i/d0)^2); di2 is already d_i^2,
    # so divide by d0^2 instead of squaring the ratio again
    return np.sum(1 / (1 + di2 / d0**2)) / L

# TM-score for predicted model 1 vs chain A of src should be 0.9681
# (https://seq2fun.dcmb.med.umich.edu//TM-score/tmp/987232.html)
tm_score(sm.getCoords(), pm.getCoords())
