From 5696a7aff4d0b1ed52a3a29ee77b6db258b62e23 Mon Sep 17 00:00:00 2001 From: jyaacoub Date: Mon, 13 May 2024 10:50:38 -0400 Subject: [PATCH] feat(analysis): combine_dataset_pids #95 --- playground.py | 12 +++++++++--- src/analysis/utils.py | 21 +++++++++++++++++++++ 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/playground.py b/playground.py index e1d6565..4a24ff0 100644 --- a/playground.py +++ b/playground.py @@ -1,12 +1,18 @@ -#%% 1-2. download TCGA and gather proteins for dbs +#%% +# steps for matching prots from TCGA to our dataset +# (see https://github.com/jyaacoub/MutDTA/issues/95#issuecomment-2107780527) import os import pandas as pd +import matplotlib.pyplot as plt + +#%% 1. gather relevant proteins to match with tcga +from src.analysis.utils import combine_dataset_pids +df_prots = combine_dataset_pids(subset='test') -df_prots = pd.read_csv('../downloads/all_prots.csv') +#%% 2. Download ALL TCGA projects as a single MAF df_tcga = pd.read_csv('../downloads/TCGA_ALL.maf', sep='\t') #%% 3. Pre filtering -import matplotlib.pyplot as plt df_tcga = df_tcga[df_tcga['Variant_Classification'] == 'Missense_Mutation'] df_tcga['seq_len'] = pd.to_numeric(df_tcga['Protein_position'].str.split('/').str[1]) df_tcga = df_tcga[df_tcga['seq_len'] < 5000] diff --git a/src/analysis/utils.py b/src/analysis/utils.py index 6ad8a4d..0dab921 100644 --- a/src/analysis/utils.py +++ b/src/analysis/utils.py @@ -3,6 +3,7 @@ from scipy.stats import ttest_ind import pandas as pd import numpy as np +from src import config as cfg def count_missing_res(pdb_file: str) -> Tuple[int,int]: @@ -117,6 +118,26 @@ def generate_markdown(results, names=None, verbose=False, thresh_sig=False, cind if verbose: print(md_output) return md_table +def combine_dataset_pids(data_dir=cfg.DATA_ROOT, target='nomsa_binary_original_binary', subset='full'): + df_all = None + dbs = ['PDBbindDataset', 'DavisKibaDataset/davis', 'DavisKibaDataset/kiba'] + dbs = [f'{data_dir}/{d}' for d in dbs] + for root_dir in dbs: + print(root_dir) + DB = root_dir.split("/")[-1] + + df = pd.read_csv(f"{root_dir}/nomsa_binary_original_binary/{subset}/XY.csv", index_col=0) + df['pdb_id'] = df.prot_id.str.split("_").str[0] + df = df[['prot_id', 'prot_seq']].drop_duplicates(subset='prot_id') + df['seq_len'] = df['prot_seq'].str.len() + df['db'] = DB + df.reset_index(inplace=True) + df = df[['db','code', 'prot_id', 'seq_len', 'prot_seq']] # reorder them. + df.index.name = 'db_idx' + df_all = df if df_all is None else pd.concat([df_all, df], axis=0) + + return df_all + if __name__ == '__main__': #NOTE: the following is code for stratifying AutoDock Vina results by # missing residues to identify if there is a correlation between missing