fix(tcga): focus on pdbbind davis #95 #99

Since we don't have a kiba GVPL dataset yet...
jyaacoub · May 16, 2024 · 985d684 · 985d684
1 parent 10ce1b2
commit 985d684
Show file tree

Hide file tree

Showing 2 changed files with 89 additions and 68 deletions.
diff --git a/playground.py b/playground.py
@@ -1,66 +1,81 @@
-# %%
+#%% 1. Gather data for the davis and pdbbind datasets (kiba is left out for now)
+import os
 import pandas as pd
-from src.utils.residue import Chain
-
-
-df = pd.read_csv('/cluster/home/t122995uhn/projects/MutDTA/PDBbind_all_geneNames.csv')
-
-#%% identify pocket location:
-def get_residue_range(pdb_filename):
-    with open(pdb_filename, 'r') as pdb_file:
-        chain_residues = {}
-        for line in pdb_file:
-            if line.startswith('ATOM'):
-                chain_id = line[21].strip()
-                residue_number = int(line[22:26].strip())
-                if chain_id not in chain_residues:
-                    chain_residues[chain_id] = set()
-                chain_residues[chain_id].add(residue_number)
-
-        chain_ranges = {}
-        for chain_id, residues in chain_residues.items():
-            min_residue = min(residues)
-            max_residue = max(residues)
-            chain_ranges[chain_id] = (min_residue, max_residue)
-
-        return chain_ranges
+import matplotlib.pyplot as plt
+from src.analysis.utils import combine_dataset_pids
+from src import config as cfg
+df_prots = combine_dataset_pids(dbs=[cfg.DATA_OPT.davis, cfg.DATA_OPT.PDBbind], # just davis and pdbbind for now
+                                subset='test')
 
-# %%
-from tqdm import tqdm
-dir_p = '/cluster/home/t122995uhn/projects/data/pdbbind/v2020-other-PL/'
-pocket_fp = lambda x: f'{dir_p}/{x}/{x}_pocket.pdb'
-prot_fp = lambda x: f'{dir_p}/{x}/{x}_protein.pdb'
-
-dfu = df.drop_duplicates(subset='code')
-
-mapped_prange = {}
-for idx, (code, seq) in tqdm(dfu[['code', 'prot_seq']].iterrows(), total=len(dfu)):
-    code = code.lower()
-    # find the seq range for all chains in pocket and in protein
-    prot = get_residue_range(prot_fp(code))
-    pocket = get_residue_range(pocket_fp(code))
-
-    # find the right change that matches seq_len
-    chainID = None
-    for cid, s in Chain(prot_fp(code)):
-        if s == seq: # +1 bc PDB starts at index 1
-            chainID = cid
-            break
-
-    # Find the protein residue range for the same chain
-    prot_ch = prot.get(chainID)
-    if not prot_ch or chainID not in pocket:
-        # print(f'No matching chain in protein file for {prot_fp(code)}')
-        continue
-
-    # Reset pocket range so that it matches exactly with proteins sequence that is 0-indexed:
-    prange = (pocket[chainID][0] - prot_ch[0], pocket[chainID][1] - prot_ch[0])  # Convert to 0-indexed
-    mapped_prange[code.upper()] = f"{prange[0]}-{prange[1]}"
 
+#%% 2. Load TCGA data
+df_tcga = pd.read_csv('../downloads/TCGA_ALL.maf', sep='\t')
 
-# %%
-# Add the pocket range to the dataframe
-df['pocket_range'] = df['code'].map(mapped_prange)
-df.to_csv("pdbbind_all_prange.csv")
+#%% 3. Pre filtering
+df_tcga = df_tcga[df_tcga['Variant_Classification'] == 'Missense_Mutation']
+df_tcga['seq_len'] = pd.to_numeric(df_tcga['Protein_position'].str.split('/').str[1])
+df_tcga = df_tcga[df_tcga['seq_len'] < 5000]
+df_tcga['seq_len'].plot.hist(bins=100, title="sequence length histogram capped at 5K")
+plt.show()
+df_tcga = df_tcga[df_tcga['seq_len'] < 1200]
+df_tcga['seq_len'].plot.hist(bins=100, title="sequence length after capped at 1.2K")
+
+#%% 4. Merging df_prots with TCGA
+df_tcga['uniprot'] = df_tcga['SWISSPROT'].str.split('.').str[0]
+
+dfm = df_tcga.merge(df_prots[df_prots.db != 'davis'], 
+                    left_on='uniprot', right_on='prot_id', how='inner')
+
+# for davis we have to merge on the TCGA 'Hugo_Symbol' column instead of the uniprot ID
+dfm_davis = df_tcga.merge(df_prots[df_prots.db == 'davis'], 
+                          left_on='Hugo_Symbol', right_on='prot_id', how='inner')
+
+dfm = pd.concat([dfm,dfm_davis], axis=0)
+
+del dfm_davis # to save mem
+
+# %% 5. Post filtering step
+# 5.1. Filter for only those sequences with matching sequence length (to get rid of nonmatched isoforms)
+# seq_len_x is from tcga, seq_len_y is from our dataset 
+tmp = len(dfm)
+# allow our sequence to be up to 50 residues shorter than TCGA's, since pdb files in the PDBbind dataset can be missing amino acids
+#   - assumption here is that mismatched isoforms will differ by more than 50 amino acids
+dfm = dfm[(dfm.seq_len_y <= dfm.seq_len_x) & (dfm.seq_len_x<= dfm.seq_len_y+50)]
+print(f"Filter #1 (seq_len)     : {tmp:5d} - {tmp-len(dfm):5d} = {len(dfm):5d}")
+
+# 5.2. Filter out rows whose reference residue disagrees with our dataset sequence, using the "Protein_position" and "Amino_acids" cols
+
+# 'Protein_position' is read as "<1-indexed mutation location>/<seq length>" and 'Amino_acids' as "<reference>/<mutant>"
+dfm['mt_loc'] = pd.to_numeric(dfm['Protein_position'].str.split('/').str[0])
+dfm = dfm[dfm['mt_loc'] <= dfm['seq_len_y']] # mt_loc is 1-indexed, so the last residue (mt_loc == seq_len_y) is still in range
+dfm[['ref_AA', 'mt_AA']] = dfm['Amino_acids'].str.split('/', expand=True)
+# residue found in our dataset's sequence at the mutation site (-1 converts mt_loc to a 0-indexed string position)
+dfm['db_AA'] = dfm.apply(lambda row: row['prot_seq'][row['mt_loc']-1], axis=1)
+
+# Filter #2: Match proteins with the same reference amino acid at the mutation location
+tmp = len(dfm)
+dfm = dfm[dfm['db_AA'] == dfm['ref_AA']]
+print(f"Filter #2 (ref_AA match): {tmp:5d} - {tmp-len(dfm):5d} = {len(dfm):5d}")
+print('\n',dfm.db.value_counts())
+
+# %% final seq len distribution
+
+n_bins = 25
+lengths = dfm.seq_len_x
+fig, ax = plt.subplots(1, 1, figsize=(10, 5))
+
+# Plot histogram
+n, bins, patches = ax.hist(lengths, bins=n_bins, color='blue', alpha=0.7)
+ax.set_title('TCGA final filtering for db matches')
+
+# Add counts to each bin, horizontally centred above its bar
+for count, patch in zip(n, patches):
+    ax.text(patch.get_x() + patch.get_width() / 2, count, str(int(count)), ha='center', va='bottom')
+
+ax.set_xlabel('Sequence Length')
+ax.set_ylabel('Frequency')
+
+plt.tight_layout()
+plt.show()
 
 # %%
diff --git a/src/analysis/utils.py b/src/analysis/utils.py
@@ -118,20 +118,26 @@ def generate_markdown(results, names=None, verbose=False, thresh_sig=False, cind
     if verbose: print(md_output)
     return md_table
 
-def combine_dataset_pids(data_dir=cfg.DATA_ROOT, target='nomsa_binary_original_binary', subset='full'):
+def combine_dataset_pids(data_dir=cfg.DATA_ROOT, 
+                         dbs=[cfg.DATA_OPT.davis, cfg.DATA_OPT.kiba, cfg.DATA_OPT.PDBbind],
+                         target='nomsa_aflow_gvp_binary', subset='full', 
+                         xy='cleaned_XY.csv'):
     df_all = None
-    dbs = ['PDBbindDataset', 'DavisKibaDataset/davis', 'DavisKibaDataset/kiba']
-    dbs = [f'{data_dir}/{d}' for d in dbs]
-    for root_dir in dbs:
-        print(root_dir)
-        DB = root_dir.split("/")[-1]
-
-        df = pd.read_csv(f"{root_dir}/nomsa_binary_original_binary/{subset}/XY.csv", index_col=0)
+    dir_p = {'davis':'DavisKibaDataset/davis', 
+           'kiba': 'DavisKibaDataset/kiba',
+           'PDBbind': 'PDBbindDataset'}
+    dbs = {d.value: f'{data_dir}/{dir_p[d]}/{target}/{subset}/{xy}' for d in dbs}
+
+    for DB, fp in dbs.items():
+        print(DB, fp)
+        df = pd.read_csv(fp, index_col=0)
         df['pdb_id'] = df.prot_id.str.split("_").str[0]
         df = df[['prot_id', 'prot_seq']].drop_duplicates(subset='prot_id')
+
         df['seq_len'] = df['prot_seq'].str.len()
         df['db'] = DB
         df.reset_index(inplace=True)
+
         df = df[['db','code', 'prot_id', 'seq_len', 'prot_seq']] # reorder them.
         df.index.name = 'db_idx'
         df_all = df if df_all is None else pd.concat([df_all, df], axis=0)