From 73d9566406e294d1bb5384f8428610f2a0f0c41f Mon Sep 17 00:00:00 2001
From: Jackson Howe <jacksonhowe3@gmail.com>
Date: Wed, 24 Jul 2024 14:31:02 -0400
Subject: [PATCH 1/8] feat: download binding pockets from KLIFS (#123)

---
 ...pocket_alignment.py => pocket_alignment.py | 57 +++++++++++++++++++
 1 file changed, 57 insertions(+)
 rename src/utils/pocket_alignment.py => pocket_alignment.py (69%)

diff --git a/src/utils/pocket_alignment.py b/pocket_alignment.py
similarity index 69%
rename from src/utils/pocket_alignment.py
rename to pocket_alignment.py
index 550eb6e..e1f26c9 100644
--- a/src/utils/pocket_alignment.py
+++ b/pocket_alignment.py
@@ -3,10 +3,16 @@
 mask from a binding pocket sequence.
 """
 
+import json
+import os
+
 from Bio import Align
 from Bio.Align import substitution_matrices
+import pandas as pd
 import torch
 
+from src.data_prep.downloaders import Downloader
+
 
 def create_pocket_mask(target_seq: str, query_seq: str) -> list[bool]:
     """
@@ -92,6 +98,51 @@ def mask_graph(data, mask: list[bool]):
     return data
 
 
+def _parse_json(json_path: str) -> str:
+    """
+    Read the binding pocket sequence out of a KLIFS JSON download.
+
+    Parameters
+    ----------
+    json_path : str
+        Location of the JSON file on disk
+
+    Returns
+    -------
+    str
+        The 'pocket' field of the first record in the file
+    """
+    with open(json_path, 'r') as handle:
+        records = json.load(handle)
+    return records[0]['pocket']
+
+
+def get_dataset_binding_pockets(
+        dataset_path: str = 'data/DavisKibaDataset/kiba/',
+        pockets_path: str = 'data/DavisKibaDataset/kiba_pocket'
+    ) -> None:
+    """
+    Get all binding pocket sequences for a dataset
+
+    Parameters
+    ----------
+    dataset_path : str
+        The path to the directory containing the dataset (as of July 24, 2024,
+        only expecting Kiba dataset). Specify only the path to one of 'davis', 'kiba',
+        or 'PDBbind' (e.g., 'data/DavisKibaDataset/kiba')
+    pockets_path: str
+        The path to the new dataset directory after all the binding pockets have been found
+    """
+    csv_path = os.path.join(dataset_path, 'nomsa_binary_original_binary', 'full', 'cleaned_XY.csv')
+    df = pd.read_csv(csv_path, usecols=['prot_id'])
+    prot_ids = list(set(df['prot_id']))
+    dl = Downloader()
+    seq_save_dir = os.path.join(pockets_path, 'pockets')
+    os.makedirs(seq_save_dir, exist_ok=True)
+    download_check = dl.download_pocket_seq(prot_ids, seq_save_dir)
+    print(download_check)
+
+
 if __name__ == '__main__':
     graph_data = torch.load('sample_pro_data.torch')
     seq = graph_data.pro_seq
@@ -104,3 +155,9 @@ def mask_graph(data, mask: list[bool]):
         binding_pocket_sequence
     )
     masked_data = mask_graph(graph_data, mask)
+    # print(masked_data)
+    # dl = Downloader()
+    # seqs = Downloader.download_pocket_seq(['O00141', 'O14920', 'O15111'])
+    # print(seqs)
+    # get_dataset_binding_pockets()
+    _parse_json('data/DavisKibaDataset/kiba_pocket/pockets/O00141.json')

From 2388875a2dd6158c1544384fbcad27b9361ecd34 Mon Sep 17 00:00:00 2001
From: Jackson Howe <jacksonhowe3@gmail.com>
Date: Wed, 24 Jul 2024 15:44:38 -0400
Subject: [PATCH 2/8] feat: apply mask to all data observations (#123)

---
 pocket_alignment.py | 66 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 58 insertions(+), 8 deletions(-)

diff --git a/pocket_alignment.py b/pocket_alignment.py
index e1f26c9..0584b1a 100644
--- a/pocket_alignment.py
+++ b/pocket_alignment.py
@@ -31,6 +31,8 @@ def create_pocket_mask(target_seq: str, query_seq: str) -> list[bool]:
         A boolean list of indices that are True if the residue at that
         position is part of the binding pocket and false otherwise
     """
+    # Ensure that no '-' characters are present in the query sequence
+    query_seq = query_seq.replace('-', 'X')
     # Taken from tutorial https://biopython.org/docs/dev/Tutorial/chapter_pairwise.html
     aligner = Align.PairwiseAligner()
     # Pairwise alignment parameters as specified in paragraph 2
@@ -120,7 +122,7 @@ def _parse_json(json_path: str) -> str:
 def get_dataset_binding_pockets(
         dataset_path: str = 'data/DavisKibaDataset/kiba/',
         pockets_path: str = 'data/DavisKibaDataset/kiba_pocket'
-    ) -> None:
+    ) -> tuple[dict[str, str], set[str]]:
     """
     Get all binding pocket sequences for a dataset
 
@@ -132,6 +134,11 @@ def get_dataset_binding_pockets(
         or 'PDBbind' (e.g., 'data/DavisKibaDataset/kiba')
     pockets_path: str
         The path to the new dataset directory after all the binding pockets have been found
+    
+    Returns
+    -------
+    sequences : tuple[dict[str, str], set[str]]
+        A map of protein ID, binding pocket sequence pairs
     """
     csv_path = os.path.join(dataset_path, 'nomsa_binary_original_binary', 'full', 'cleaned_XY.csv')
     df = pd.read_csv(csv_path, usecols=['prot_id'])
@@ -140,7 +147,49 @@ def get_dataset_binding_pockets(
     seq_save_dir = os.path.join(pockets_path, 'pockets')
     os.makedirs(seq_save_dir, exist_ok=True)
     download_check = dl.download_pocket_seq(prot_ids, seq_save_dir)
-    print(download_check)
+    download_errors = set()
+    for key, val in download_check.items():
+        if val == 400:
+            download_errors.add(key)
+    sequences = {}
+    for file in os.listdir(seq_save_dir):
+        pocket_seq = _parse_json(os.path.join(seq_save_dir, file))
+        if len(pocket_seq) == 0:
+            download_errors.add(file.split('.')[0])
+        else:
+            sequences[file.split('.')[0]] = pocket_seq
+    return (sequences, download_errors)
+
+
+def create_binding_pocket_dataset(
+    dataset_path: str,
+    pocket_sequences: dict[str, str],
+    download_errors: set[str],
+    new_dataset_path: str
+) -> None:
+    """
+    Apply the binding-pocket graph mask to each Data object in a PyTorch dataset.
+
+    Parameters
+    ----------
+    dataset_path : str
+        The path to the PyTorch dataset object to be transformed
+    pocket_sequences : dict[str, str]
+        A map of protein ID, binding pocket sequence pairs
+    download_errors : set[str]
+        Protein IDs with no KLIFS binding pocket sequence; these are skipped
+    new_dataset_path : str
+        A path to where the new dataset should be saved
+    """
+    dataset = torch.load(dataset_path)
+    new_dataset = {}
+    for id, data in dataset.items():
+        if id not in download_errors:
+            mask = create_pocket_mask(data.pro_seq, pocket_sequences[id])
+            new_data = mask_graph(data, mask)
+            new_dataset[id] = new_data
+    os.makedirs(new_dataset_path, exist_ok=True)
+    torch.save(dataset, new_dataset_path)
 
 
 if __name__ == '__main__':
@@ -155,9 +204,10 @@ def get_dataset_binding_pockets(
         binding_pocket_sequence
     )
     masked_data = mask_graph(graph_data, mask)
-    # print(masked_data)
-    # dl = Downloader()
-    # seqs = Downloader.download_pocket_seq(['O00141', 'O14920', 'O15111'])
-    # print(seqs)
-    # get_dataset_binding_pockets()
-    _parse_json('data/DavisKibaDataset/kiba_pocket/pockets/O00141.json')
+    pocket_map, download_errors = get_dataset_binding_pockets()
+    create_binding_pocket_dataset(
+        'data/DavisKibaDataset/kiba/nomsa_binary_original_binary/full/data_pro.pt',
+        pocket_map,
+        download_errors,
+        'data/DavisKibaDataset/kiba_pocket/nomsa_binary_original_binary/full/data_pro.pt '
+    )

From 3306ec9277770c64215780e11bd3a5a52c3622c5 Mon Sep 17 00:00:00 2001
From: Jackson Howe <jacksonhowe3@gmail.com>
Date: Wed, 24 Jul 2024 15:51:41 -0400
Subject: [PATCH 3/8] fix: fix directory creation error in
 create_binding_pocket_dataset (#123)

---
 pocket_alignment.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pocket_alignment.py b/pocket_alignment.py
index 0584b1a..a3f4e70 100644
--- a/pocket_alignment.py
+++ b/pocket_alignment.py
@@ -14,7 +14,7 @@
 from src.data_prep.downloaders import Downloader
 
 
-def create_pocket_mask(target_seq: str, query_seq: str) -> list[bool]:
+def create_pocket_mask(target_seq: str, pocket_seq: str) -> list[bool]:
     """
     Return an index mask of a pocket on a protein sequence.
     
@@ -22,7 +22,7 @@ def create_pocket_mask(target_seq: str, query_seq: str) -> list[bool]:
     ----------
     target_seq : str
         The protein sequence you want to query in
-    query_seq : str
+    pocket_seq : str
         The binding pocket sequence for the protein
 
     Returns
@@ -32,7 +32,7 @@ def create_pocket_mask(target_seq: str, query_seq: str) -> list[bool]:
         position is part of the binding pocket and false otherwise
     """
     # Ensure that no '-' characters are present in the query sequence
-    query_seq = query_seq.replace('-', 'X')
+    query_seq = pocket_seq.replace('-', 'X')
     # Taken from tutorial https://biopython.org/docs/dev/Tutorial/chapter_pairwise.html
     aligner = Align.PairwiseAligner()
     # Pairwise alignment parameters as specified in paragraph 2
@@ -188,7 +188,7 @@ def create_binding_pocket_dataset(
             mask = create_pocket_mask(data.pro_seq, pocket_sequences[id])
             new_data = mask_graph(data, mask)
             new_dataset[id] = new_data
-    os.makedirs(new_dataset_path, exist_ok=True)
+    os.makedirs(os.path.dirname(new_dataset_path), exist_ok=True)
     torch.save(dataset, new_dataset_path)
 
 

From ffe740114fd349a88282e3aa79e36c6da59c47d3 Mon Sep 17 00:00:00 2001
From: Jackson Howe <jacksonhowe3@gmail.com>
Date: Wed, 24 Jul 2024 16:34:03 -0400
Subject: [PATCH 4/8] feat: remove proteins with no KLIFS pocket from CSV
 (#123)

---
 pocket_alignment.py | 82 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 66 insertions(+), 16 deletions(-)

diff --git a/pocket_alignment.py b/pocket_alignment.py
index a3f4e70..5870f46 100644
--- a/pocket_alignment.py
+++ b/pocket_alignment.py
@@ -137,8 +137,10 @@ def get_dataset_binding_pockets(
     
     Returns
     -------
-    sequences : tuple[dict[str, str], set[str]]
-        A map of protein ID, binding pocket sequence pairs
+    tuple[dict[str, str], set[str]]
+        A tuple consisting of:
+            -A map of protein ID, binding pocket sequence pairs
+            -A set of protein IDs with no KLIFS binding pockets
     """
     csv_path = os.path.join(dataset_path, 'nomsa_binary_original_binary', 'full', 'cleaned_XY.csv')
     df = pd.read_csv(csv_path, usecols=['prot_id'])
@@ -192,22 +194,70 @@ def create_binding_pocket_dataset(
     torch.save(dataset, new_dataset_path)
 
 
-if __name__ == '__main__':
-    graph_data = torch.load('sample_pro_data.torch')
-    seq = graph_data.pro_seq
-    seq = seq[:857] + 'R' + seq[858:]
-    graph_data.pro_seq = seq
-    torch.save(graph_data, 'sample_pro_data_unmutated.torch')
-    binding_pocket_sequence = 'KVLGSGAFGTVYKVAIKELEILDEAYVMASVDPHVCRLLGIQLITQLMPFGCLLDYVREYLEDRRLVHRDLAARNVLVITDFGLA'
-    mask = create_pocket_mask(
-        graph_data.pro_seq,
-        binding_pocket_sequence
-    )
-    masked_data = mask_graph(graph_data, mask)
+def binding_pocket_filter(dataset_csv_path: str, download_errors: set[str], csv_save_path: str):
+    """
+    Drop every row whose protein has no KLIFS binding pocket sequence.
+
+    Parameters
+    ----------
+    dataset_csv_path : str
+        Path to the cleaned CSV (e.g. 'cleaned_XY.csv'); needs a 'prot_id' column.
+    download_errors : set[str]
+        Base protein IDs (mutation / '-' tags stripped) with no KLIFS pocket.
+    csv_save_path : str
+        The path to save the new CSV file to.
+    """
+    df = pd.read_csv(dataset_csv_path)
+    # Compare on the base ID so tagged entries (mutations, '-alpha') are filtered too.
+    base_ids = df['prot_id'].str.split('(').str[0].str.split('-').str[0]
+    df = df[~base_ids.isin(download_errors)]
+    os.makedirs(os.path.dirname(csv_save_path), exist_ok=True)
+    df.to_csv(csv_save_path)
+
+
+def pocket_dataset_full(
+    dataset_dir: str,
+    save_dir: str
+) -> None:
+    """
+    Create all elements of a dataset that includes binding pockets. This
+    function assumes the PyTorch object holding the dataset is named 'data_pro.pt'
+    and the CSV holding the cleaned data is named 'cleaned_XY.csv'.
+
+    Parameters
+    ----------
+    dataset_dir : str
+        The path to the dataset to be transformed
+    save_dir : str
+        The path to where the new dataset is to be saved
+    """
     pocket_map, download_errors = get_dataset_binding_pockets()
     create_binding_pocket_dataset(
-        'data/DavisKibaDataset/kiba/nomsa_binary_original_binary/full/data_pro.pt',
+        os.path.join(dataset_dir, 'data_pro.pt'),
         pocket_map,
         download_errors,
-        'data/DavisKibaDataset/kiba_pocket/nomsa_binary_original_binary/full/data_pro.pt '
+        os.path.join(save_dir, 'data_pro.pt')
+    )
+    binding_pocket_filter(
+        os.path.join(dataset_dir, 'cleaned_XY.csv'),
+        download_errors,
+        os.path.join(save_dir, 'cleaned_XY.csv')
+    )
+
+
+if __name__ == '__main__':
+    # graph_data = torch.load('sample_pro_data.torch')
+    # seq = graph_data.pro_seq
+    # seq = seq[:857] + 'R' + seq[858:]
+    # graph_data.pro_seq = seq
+    # torch.save(graph_data, 'sample_pro_data_unmutated.torch')
+    # binding_pocket_sequence = 'KVLGSGAFGTVYKVAIKELEILDEAYVMASVDPHVCRLLGIQLITQLMPFGCLLDYVREYLEDRRLVHRDLAARNVLVITDFGLA'
+    # mask = create_pocket_mask(
+    #     graph_data.pro_seq,
+    #     binding_pocket_sequence
+    # )
+    # masked_data = mask_graph(graph_data, mask)
+    pocket_dataset_full(
+        'data/DavisKibaDataset/kiba/nomsa_binary_original_binary/full/',
+        'data/DavisKibaDataset/kiba_pocket/nomsa_binary_original_binary/full/'
     )

From 6a7f6fc109934cc6058c08f82d30c8fe1de8a474 Mon Sep 17 00:00:00 2001
From: Jackson Howe <jacksonhowe3@gmail.com>
Date: Tue, 30 Jul 2024 16:25:11 -0400
Subject: [PATCH 5/8] feat: add functionality to pocket_alignment for DAVIS
 dataset (#123)

---
 .../utils/pocket_alignment.py                 | 34 +++++++++----------
 1 file changed, 17 insertions(+), 17 deletions(-)
 rename pocket_alignment.py => src/utils/pocket_alignment.py (89%)

diff --git a/pocket_alignment.py b/src/utils/pocket_alignment.py
similarity index 89%
rename from pocket_alignment.py
rename to src/utils/pocket_alignment.py
index 5870f46..e5d8635 100644
--- a/pocket_alignment.py
+++ b/src/utils/pocket_alignment.py
@@ -120,7 +120,7 @@ def _parse_json(json_path: str) -> str:
 
 
 def get_dataset_binding_pockets(
-        dataset_path: str = 'data/DavisKibaDataset/kiba/',
+        dataset_path: str = 'data/DavisKibaDataset/kiba/nomsa_binary_original_binary/full',
         pockets_path: str = 'data/DavisKibaDataset/kiba_pocket'
     ) -> tuple[dict[str, str], set[str]]:
     """
@@ -142,9 +142,12 @@ def get_dataset_binding_pockets(
             -A map of protein ID, binding pocket sequence pairs
             -A set of protein IDs with no KLIFS binding pockets
     """
-    csv_path = os.path.join(dataset_path, 'nomsa_binary_original_binary', 'full', 'cleaned_XY.csv')
+    csv_path = os.path.join(dataset_path, 'cleaned_XY.csv')
     df = pd.read_csv(csv_path, usecols=['prot_id'])
     prot_ids = list(set(df['prot_id']))
+    # Strip out mutations and '-(alpha, beta, gamma)' tags if they are present,
+    # the binding pocket sequence will be the same for mutated and non-mutated genes
+    prot_ids = [id.split('(')[0].split('-')[0] for id in prot_ids]
     dl = Downloader()
     seq_save_dir = os.path.join(pockets_path, 'pockets')
     os.makedirs(seq_save_dir, exist_ok=True)
@@ -156,7 +159,7 @@ def get_dataset_binding_pockets(
     sequences = {}
     for file in os.listdir(seq_save_dir):
         pocket_seq = _parse_json(os.path.join(seq_save_dir, file))
-        if len(pocket_seq) == 0:
+        if pocket_seq == 0 or len(pocket_seq) == 0:
             download_errors.add(file.split('.')[0])
         else:
             sequences[file.split('.')[0]] = pocket_seq
@@ -186,8 +189,10 @@ def create_binding_pocket_dataset(
     dataset = torch.load(dataset_path)
     new_dataset = {}
     for id, data in dataset.items():
-        if id not in download_errors:
-            mask = create_pocket_mask(data.pro_seq, pocket_sequences[id])
+        # If there are any mutations or (-alpha,beta,gamma) tags, strip them
+        stripped_id = id.split('(')[0].split('-')[0]
+        if stripped_id not in download_errors:
+            mask = create_pocket_mask(data.pro_seq, pocket_sequences[stripped_id])
             new_data = mask_graph(data, mask)
             new_dataset[id] = new_data
     os.makedirs(os.path.dirname(new_dataset_path), exist_ok=True)
@@ -217,6 +222,7 @@ def binding_pocket_filter(dataset_csv_path: str, download_errors: set[str], csv_
 
 def pocket_dataset_full(
     dataset_dir: str,
+    pocket_dir: str,
     save_dir: str
 ) -> None:
     """
@@ -228,10 +234,14 @@ def pocket_dataset_full(
     ----------
     dataset_dir : str
         The path to the dataset to be transformed
+    pocket_dir : str
+        The path to where the dataset raw pocket sequences are to be saved
     save_dir : str
         The path to where the new dataset is to be saved
     """
-    pocket_map, download_errors = get_dataset_binding_pockets()
+    pocket_map, download_errors = get_dataset_binding_pockets(dataset_dir, pocket_dir)
+    print(f'Binding pocket sequences were not found for the following {len(download_errors)} protein IDs:')
+    print(','.join(list(download_errors)))
     create_binding_pocket_dataset(
         os.path.join(dataset_dir, 'data_pro.pt'),
         pocket_map,
@@ -246,18 +256,8 @@ def pocket_dataset_full(
 
 
 if __name__ == '__main__':
-    # graph_data = torch.load('sample_pro_data.torch')
-    # seq = graph_data.pro_seq
-    # seq = seq[:857] + 'R' + seq[858:]
-    # graph_data.pro_seq = seq
-    # torch.save(graph_data, 'sample_pro_data_unmutated.torch')
-    # binding_pocket_sequence = 'KVLGSGAFGTVYKVAIKELEILDEAYVMASVDPHVCRLLGIQLITQLMPFGCLLDYVREYLEDRRLVHRDLAARNVLVITDFGLA'
-    # mask = create_pocket_mask(
-    #     graph_data.pro_seq,
-    #     binding_pocket_sequence
-    # )
-    # masked_data = mask_graph(graph_data, mask)
     pocket_dataset_full(
         'data/DavisKibaDataset/kiba/nomsa_binary_original_binary/full/',
+        'data/DavisKibaDataset/kiba_pocket',
         'data/DavisKibaDataset/kiba_pocket/nomsa_binary_original_binary/full/'
     )

From 0072b0977da049a884b59cace4ae556673e61b97 Mon Sep 17 00:00:00 2001
From: jyaacoub <jyaacoub21@gmail.com>
Date: Wed, 31 Jul 2024 09:42:03 -0400
Subject: [PATCH 6/8] results(pdbbind): retrained DG and aflow on random split
 #128

---
 results/v128/model_media/model_stats.csv     | 11 +++++++++++
 results/v128/model_media/model_stats_val.csv | 11 +++++++++++
 src/utils/config.py                          |  4 ++--
 3 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 results/v128/model_media/model_stats.csv
 create mode 100644 results/v128/model_media/model_stats_val.csv

diff --git a/results/v128/model_media/model_stats.csv b/results/v128/model_media/model_stats.csv
new file mode 100644
index 0000000..cf15971
--- /dev/null
+++ b/results/v128/model_media/model_stats.csv
@@ -0,0 +1,11 @@
+run,cindex,pearson,spearman,mse,mae,rmse
+DGM_PDBbind1D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6700303728768234,0.5066764643691913,0.4877355421142711,2.8944340651943885,1.3393119536913358,1.7013036369779466
+DGM_PDBbind0D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6920256384153837,0.5521166195053873,0.5492833432385198,2.6901810056403943,1.2902274471429678,1.6401771263008134
+DGM_PDBbind2D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6860817317553821,0.5349021289716281,0.5292561754408733,2.783316931934153,1.3007997966913076,1.668327585318349
+DGM_PDBbind4D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6814616054321716,0.5209332896962908,0.5205006093028853,2.791164015918997,1.3129914098152748,1.6706777115646805
+DGM_PDBbind3D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6886407871725875,0.5377497788348156,0.536984917834154,2.7892612584388665,1.3068153142929078,1.6701081577068195
+DGM_PDBbind0D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.6791215578161572,0.5426811555689641,0.5087142971057145,2.7253219488991136,1.2940673363869717,1.6508549145515827
+DGM_PDBbind2D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.6935974149775556,0.5590003871276702,0.5527753938595599,2.613210370957274,1.2817467006748329,1.616542721661656
+DGM_PDBbind1D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.6776316790562161,0.5208748506408137,0.5056665537333501,2.79682269540488,1.3061289907536988,1.672370382243383
+DGM_PDBbind3D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.6772535031619085,0.5129702979170532,0.5079117445737773,2.876293932539623,1.343678560827124,1.69596401274898
+DGM_PDBbind4D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.6803930344028177,0.5261288742927207,0.5152003489955188,2.7888457682850523,1.3061685765992026,1.669983762880661
diff --git a/results/v128/model_media/model_stats_val.csv b/results/v128/model_media/model_stats_val.csv
new file mode 100644
index 0000000..d6661c5
--- /dev/null
+++ b/results/v128/model_media/model_stats_val.csv
@@ -0,0 +1,11 @@
+run,cindex,pearson,spearman,mse,mae,rmse
+DGM_PDBbind1D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6865570848103539,0.5217828035285198,0.5342788004080268,3.12362316791697,1.4128894441393012,1.7673774831418922
+DGM_PDBbind0D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6856114868115144,0.5126870149038275,0.5317248936648343,3.3695982839888123,1.4536717794904257,1.8356465574801737
+DGM_PDBbind2D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6727143444861741,0.5022869639192042,0.5008407950347942,3.421695656587546,1.4982888305121282,1.849782597114468
+DGM_PDBbind4D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.6923138709195878,0.5430234761140276,0.5501681361051465,2.9685919834473102,1.3589257854182952,1.7229602384986458
+DGM_PDBbind3D_nomsaF_binaryE_64B_0.0001LR_0.4D_2000E_originalLF_binaryLE,0.7221237704365324,0.5997226273753726,0.6282960957077218,3.071004525264972,1.3901743548518686,1.7524281797737025
+DGM_PDBbind0D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.6765066849227523,0.5001545914726073,0.5133105406519389,3.044526199847986,1.3962691384923704,1.7448570714668827
+DGM_PDBbind2D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.7083260415364561,0.5949781068309761,0.5895379695932953,2.563278684581813,1.274137923604572,1.601024261084701
+DGM_PDBbind1D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.6854165476207483,0.5218685304168357,0.535423559993022,2.837139616188374,1.326142185326017,1.6843810780783468
+DGM_PDBbind3D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.6752795639801628,0.5022118247529487,0.5077603820327848,3.2012135060951,1.425228337672172,1.7891935351143822
+DGM_PDBbind4D_nomsaF_aflowE_128B_0.0009185598967356679LR_0.22880989869337157D_2000E_originalLF_binaryLE,0.6843307790913993,0.5107948247428088,0.5298336871787266,2.9007170257795094,1.3341726068459485,1.7031491495989155
diff --git a/src/utils/config.py b/src/utils/config.py
index bc26d93..5a0ec37 100644
--- a/src/utils/config.py
+++ b/src/utils/config.py
@@ -87,7 +87,7 @@ class LIG_FEAT_OPT(StringEnum):
 from pathlib import Path
 
 # Model save paths
-issue_number = 115  # 113 is for unifying all splits for cross validation so that we are more confident 
+issue_number = 128  # 113 is for unifying all splits for cross validation so that we are more confident 
                     # when comparing results that they were trained in the same manner.
 RESULTS_PATH = os.path.abspath(f'results/v{issue_number}/')
 MEDIA_SAVE_DIR      = f'{RESULTS_PATH}/model_media/'
@@ -113,7 +113,7 @@ class LIG_FEAT_OPT(StringEnum):
     SLURM_PARTITION = 'gpu'
     SLURM_CONSTRAINT = 'gpu32g'
     SLURM_ACCOUNT = 'kumargroup_gpu'
-    DATA_ROOT = os.path.abspath('../data/')
+    DATA_ROOT = os.path.abspath('../data/test')
 elif 'graham' in DOMAIN_NAME:
     CLUSTER = 'graham'
     SLURM_CONSTRAINT = 'cascade,v100'

From ce27b6da443904ea9fc6721c4ce817d8e314b1bb Mon Sep 17 00:00:00 2001
From: Jackson Howe <jacksonhowe3@gmail.com>
Date: Wed, 31 Jul 2024 10:17:41 -0400
Subject: [PATCH 7/8] feat(pocket): copy data_mol.pt from dataset to pocket
 dataset (#123)

---
 src/utils/pocket_alignment.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/utils/pocket_alignment.py b/src/utils/pocket_alignment.py
index e5d8635..8ff30eb 100644
--- a/src/utils/pocket_alignment.py
+++ b/src/utils/pocket_alignment.py
@@ -5,6 +5,7 @@
 
 import json
 import os
+import shutil
 
 from Bio import Align
 from Bio.Align import substitution_matrices
@@ -253,6 +254,7 @@ def pocket_dataset_full(
         download_errors,
         os.path.join(save_dir, 'cleaned_XY.csv')
     )
+    shutil.copy2(os.path.join(dataset_dir, 'data_mol.pt'), os.path.join(save_dir, 'data_mol.pt'))
 
 
 if __name__ == '__main__':

From 5564537619912825c15b798e41bf11f99a70b8f4 Mon Sep 17 00:00:00 2001
From: Jean Charle Yaacoub <50300488+jyaacoub@users.noreply.github.com>
Date: Fri, 2 Aug 2024 10:38:44 -0400
Subject: [PATCH 8/8] revert(config): data_root param

---
 src/utils/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/config.py b/src/utils/config.py
index 5a0ec37..865a16d 100644
--- a/src/utils/config.py
+++ b/src/utils/config.py
@@ -113,7 +113,7 @@ class LIG_FEAT_OPT(StringEnum):
     SLURM_PARTITION = 'gpu'
     SLURM_CONSTRAINT = 'gpu32g'
     SLURM_ACCOUNT = 'kumargroup_gpu'
-    DATA_ROOT = os.path.abspath('../data/test')
+    DATA_ROOT = os.path.abspath('../data/')
 elif 'graham' in DOMAIN_NAME:
     CLUSTER = 'graham'
     SLURM_CONSTRAINT = 'cascade,v100'