From 5696a7aff4d0b1ed52a3a29ee77b6db258b62e23 Mon Sep 17 00:00:00 2001
From: jyaacoub <jyaacoub21@gmail.com>
Date: Mon, 13 May 2024 10:50:38 -0400
Subject: [PATCH] feat(analysis): combine_dataset_pids #95

---
 playground.py         | 12 +++++++++---
 src/analysis/utils.py | 21 +++++++++++++++++++++
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/playground.py b/playground.py
index e1d6565..4a24ff0 100644
--- a/playground.py
+++ b/playground.py
@@ -1,12 +1,18 @@
-#%% 1-2. download TCGA and gather proteins for dbs 
+#%%
+# steps for matching prots from TCGA to our dataset 
+# (see https://github.com/jyaacoub/MutDTA/issues/95#issuecomment-2107780527)
 import os
 import pandas as pd
+import matplotlib.pyplot as plt
+
+#%% 1. gather relevant proteins to match with tcga
+from src.analysis.utils import combine_dataset_pids
+df_prots = combine_dataset_pids(subset='test')
 
-df_prots = pd.read_csv('../downloads/all_prots.csv')
+#%% 2. Load ALL TCGA projects (downloaded beforehand as a single MAF)
 df_tcga = pd.read_csv('../downloads/TCGA_ALL.maf', sep='\t')
 
 #%% 3. Pre filtering
-import matplotlib.pyplot as plt
 df_tcga = df_tcga[df_tcga['Variant_Classification'] == 'Missense_Mutation']
 df_tcga['seq_len'] = pd.to_numeric(df_tcga['Protein_position'].str.split('/').str[1])
 df_tcga = df_tcga[df_tcga['seq_len'] < 5000]
diff --git a/src/analysis/utils.py b/src/analysis/utils.py
index 6ad8a4d..0dab921 100644
--- a/src/analysis/utils.py
+++ b/src/analysis/utils.py
@@ -3,6 +3,7 @@
 from scipy.stats import ttest_ind
 import pandas as pd
 import numpy as np
+from src import config as cfg
 
 
 def count_missing_res(pdb_file: str) -> Tuple[int,int]:
@@ -117,6 +118,26 @@ def generate_markdown(results, names=None, verbose=False, thresh_sig=False, cind
     if verbose: print(md_output)
     return md_table
 
+def combine_dataset_pids(data_dir=cfg.DATA_ROOT, target='nomsa_binary_original_binary', subset='full'):
+    """Return one row per unique prot_id from each dataset's `{target}/{subset}/XY.csv`, concatenated."""
+    df_all = None
+    dbs = ['PDBbindDataset', 'DavisKibaDataset/davis', 'DavisKibaDataset/kiba']
+    dbs = [f'{data_dir}/{d}' for d in dbs]
+    for root_dir in dbs:
+        print(root_dir)
+        DB = root_dir.split("/")[-1]  # last path component labels the rows in the 'db' column
+        # honour `target` instead of a hard-coded feature directory (default keeps the old path)
+        df = pd.read_csv(f"{root_dir}/{target}/{subset}/XY.csv", index_col=0)
+        df = df[['prot_id', 'prot_seq']].drop_duplicates(subset='prot_id')
+        df['seq_len'] = df['prot_seq'].str.len()
+        df['db'] = DB
+        df.reset_index(inplace=True)  # index -> column; presumably the CSV index is named 'code' (verify)
+        df = df[['db','code', 'prot_id', 'seq_len', 'prot_seq']] # reorder them.
+        df.index.name = 'db_idx'
+        df_all = df if df_all is None else pd.concat([df_all, df], axis=0)
+    
+    return df_all
+
 if __name__ == '__main__':
     #NOTE: the following is code for stratifying AutoDock Vina results by 
     # missing residues to identify if there is a correlation between missing