From 985d684d84b8f1aca820a05c5441a7e5626d469f Mon Sep 17 00:00:00 2001 From: jyaacoub Date: Thu, 16 May 2024 14:21:37 -0400 Subject: [PATCH] fix(tcga): focus on pdbbind davis #95 #99 Since we dont have a kiba GVPL dataset yet... --- playground.py | 135 +++++++++++++++++++++++------------------- src/analysis/utils.py | 22 ++++--- 2 files changed, 89 insertions(+), 68 deletions(-) diff --git a/playground.py b/playground.py index fbb4c0a..be89f5a 100644 --- a/playground.py +++ b/playground.py @@ -1,66 +1,81 @@ -# %% +#%% 1.Gather data for davis,kiba and pdbbind datasets +import os import pandas as pd -from src.utils.residue import Chain - - -df = pd.read_csv('/cluster/home/t122995uhn/projects/MutDTA/PDBbind_all_geneNames.csv') - -#%% identify pocket location: -def get_residue_range(pdb_filename): - with open(pdb_filename, 'r') as pdb_file: - chain_residues = {} - for line in pdb_file: - if line.startswith('ATOM'): - chain_id = line[21].strip() - residue_number = int(line[22:26].strip()) - if chain_id not in chain_residues: - chain_residues[chain_id] = set() - chain_residues[chain_id].add(residue_number) - - chain_ranges = {} - for chain_id, residues in chain_residues.items(): - min_residue = min(residues) - max_residue = max(residues) - chain_ranges[chain_id] = (min_residue, max_residue) - - return chain_ranges +import matplotlib.pyplot as plt +from src.analysis.utils import combine_dataset_pids +from src import config as cfg +df_prots = combine_dataset_pids(dbs=[cfg.DATA_OPT.davis, cfg.DATA_OPT.PDBbind], # just davis and pdbbind for now + subset='test') -# %% -from tqdm import tqdm -dir_p = '/cluster/home/t122995uhn/projects/data/pdbbind/v2020-other-PL/' -pocket_fp = lambda x: f'{dir_p}/{x}/{x}_pocket.pdb' -prot_fp = lambda x: f'{dir_p}/{x}/{x}_protein.pdb' - -dfu = df.drop_duplicates(subset='code') - -mapped_prange = {} -for idx, (code, seq) in tqdm(dfu[['code', 'prot_seq']].iterrows(), total=len(dfu)): - code = code.lower() - # find the seq range for all chains in pocket and in protein - prot = get_residue_range(prot_fp(code)) - pocket = get_residue_range(pocket_fp(code)) - - # find the right change that matches seq_len - chainID = None - for cid, s in Chain(prot_fp(code)): - if s == seq: # +1 bc PDB starts at index 1 - chainID = cid - break - - # Find the protein residue range for the same chain - prot_ch = prot.get(chainID) - if not prot_ch or chainID not in pocket: - # print(f'No matching chain in protein file for {prot_fp(code)}') - continue - - # Reset pocket range so that it matches exactly with proteins sequence that is 0-indexed: - prange = (pocket[chainID][0] - prot_ch[0], pocket[chainID][1] - prot_ch[0]) # Convert to 0-indexed - mapped_prange[code.upper()] = f"{prange[0]}-{prange[1]}" +#%% 2. Load TCGA data +df_tcga = pd.read_csv('../downloads/TCGA_ALL.maf', sep='\t') -# %% -# Add the pocket range to the dataframe -df['pocket_range'] = df['code'].map(mapped_prange) -df.to_csv("pdbbind_all_prange.csv") +#%% 3. Pre filtering +df_tcga = df_tcga[df_tcga['Variant_Classification'] == 'Missense_Mutation'] +df_tcga['seq_len'] = pd.to_numeric(df_tcga['Protein_position'].str.split('/').str[1]) +df_tcga = df_tcga[df_tcga['seq_len'] < 5000] +df_tcga['seq_len'].plot.hist(bins=100, title="sequence length histogram capped at 5K") +plt.show() +df_tcga = df_tcga[df_tcga['seq_len'] < 1200] +df_tcga['seq_len'].plot.hist(bins=100, title="sequence length after capped at 1.2K") + +#%% 4. Merging df_prots with TCGA +df_tcga['uniprot'] = df_tcga['SWISSPROT'].str.split('.').str[0] + +dfm = df_tcga.merge(df_prots[df_prots.db != 'davis'], + left_on='uniprot', right_on='prot_id', how='inner') + +# for davis we have to merge on HUGO_SYMBOLS +dfm_davis = df_tcga.merge(df_prots[df_prots.db == 'davis'], + left_on='Hugo_Symbol', right_on='prot_id', how='inner') + +dfm = pd.concat([dfm,dfm_davis], axis=0) + +del dfm_davis # to save mem + +# %% 5. Post filtering step +# 5.1. Filter for only those sequences with matching sequence length (to get rid of nonmatched isoforms) +# seq_len_x is from tcga, seq_len_y is from our dataset +tmp = len(dfm) +# allow for some error due to missing amino acids from pdb file in PDBbind dataset +# - assumption here is that isoforms will differ by more than 50 amino acids +dfm = dfm[(dfm.seq_len_y <= dfm.seq_len_x) & (dfm.seq_len_x<= dfm.seq_len_y+50)] +print(f"Filter #1 (seq_len) : {tmp:5d} - {tmp-len(dfm):5d} = {len(dfm):5d}") + +# 5.2. Filter out those that dont have the same reference seq according to the "Protein_position" and "Amino_acids" col + +# Extract mutation location and reference amino acid from 'Protein_position' and 'Amino_acids' columns +dfm['mt_loc'] = pd.to_numeric(dfm['Protein_position'].str.split('/').str[0]) +dfm = dfm[dfm['mt_loc'] < dfm['seq_len_y']] +dfm[['ref_AA', 'mt_AA']] = dfm['Amino_acids'].str.split('/', expand=True) + +dfm['db_AA'] = dfm.apply(lambda row: row['prot_seq'][row['mt_loc']-1], axis=1) + +# Filter #2: Match proteins with the same reference amino acid at the mutation location +tmp = len(dfm) +dfm = dfm[dfm['db_AA'] == dfm['ref_AA']] +print(f"Filter #2 (ref_AA match): {tmp:5d} - {tmp-len(dfm):5d} = {len(dfm):5d}") +print('\n',dfm.db.value_counts()) + +# %% final seq len distribution + +n_bins = 25 +lengths = dfm.seq_len_x +fig, ax = plt.subplots(1, 1, figsize=(10, 5)) + +# Plot histogram +n, bins, patches = ax.hist(lengths, bins=n_bins, color='blue', alpha=0.7) +ax.set_title('TCGA final filtering for db matches') + +# Add counts to each bin +for count, x, patch in zip(n, bins, patches): + ax.text(x + 0.5, count, str(int(count)), ha='center', va='bottom') + +ax.set_xlabel('Sequence Length') +ax.set_ylabel('Frequency') + +plt.tight_layout() +plt.show() # %% diff --git a/src/analysis/utils.py b/src/analysis/utils.py index 0dab921..34feb2e 100644 --- a/src/analysis/utils.py +++ b/src/analysis/utils.py @@ -118,20 +118,26 @@ def generate_markdown(results, names=None, verbose=False, thresh_sig=False, cind if verbose: print(md_output) return md_table -def combine_dataset_pids(data_dir=cfg.DATA_ROOT, target='nomsa_binary_original_binary', subset='full'): +def combine_dataset_pids(data_dir=cfg.DATA_ROOT, + dbs=[cfg.DATA_OPT.davis, cfg.DATA_OPT.kiba, cfg.DATA_OPT.PDBbind], + target='nomsa_aflow_gvp_binary', subset='full', + xy='cleaned_XY.csv'): df_all = None - dbs = ['PDBbindDataset', 'DavisKibaDataset/davis', 'DavisKibaDataset/kiba'] - dbs = [f'{data_dir}/{d}' for d in dbs] - for root_dir in dbs: - print(root_dir) - DB = root_dir.split("/")[-1] - - df = pd.read_csv(f"{root_dir}/nomsa_binary_original_binary/{subset}/XY.csv", index_col=0) + dir_p = {'davis':'DavisKibaDataset/davis', + 'kiba': 'DavisKibaDataset/kiba', + 'PDBbind': 'PDBbindDataset'} + dbs = {d.value: f'{data_dir}/{dir_p[d]}/{target}/{subset}/{xy}' for d in dbs} + + for DB, fp in dbs.items(): + print(DB, fp) + df = pd.read_csv(fp, index_col=0) df['pdb_id'] = df.prot_id.str.split("_").str[0] df = df[['prot_id', 'prot_seq']].drop_duplicates(subset='prot_id') + df['seq_len'] = df['prot_seq'].str.len() df['db'] = DB df.reset_index(inplace=True) + df = df[['db','code', 'prot_id', 'seq_len', 'prot_seq']] # reorder them. df.index.name = 'db_idx' df_all = df if df_all is None else pd.concat([df_all, df], axis=0)