Skip to content

Commit

Permalink
feat(analysis): combine_dataset_pids #95
Browse files Browse the repository at this point in the history
  • Loading branch information
jyaacoub committed May 13, 2024
1 parent c54b67a commit 5696a7a
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 3 deletions.
12 changes: 9 additions & 3 deletions playground.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
#%% 1-2. download TCGA and gather proteins for dbs
#%%
# steps for matching prots from TCGA to our dataset
# (see https://github.com/jyaacoub/MutDTA/issues/95#issuecomment-2107780527)
import os
import pandas as pd
import matplotlib.pyplot as plt

#%% 1. gather relevant proteins to match with tcga
from src.analysis.utils import combine_dataset_pids
df_prots = combine_dataset_pids(subset='test')

df_prots = pd.read_csv('../downloads/all_prots.csv')
#%% 2. Download ALL TCGA projects as a single MAF
df_tcga = pd.read_csv('../downloads/TCGA_ALL.maf', sep='\t')

#%% 3. Pre filtering
import matplotlib.pyplot as plt
df_tcga = df_tcga[df_tcga['Variant_Classification'] == 'Missense_Mutation']
df_tcga['seq_len'] = pd.to_numeric(df_tcga['Protein_position'].str.split('/').str[1])
df_tcga = df_tcga[df_tcga['seq_len'] < 5000]
Expand Down
21 changes: 21 additions & 0 deletions src/analysis/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from scipy.stats import ttest_ind
import pandas as pd
import numpy as np
from src import config as cfg


def count_missing_res(pdb_file: str) -> Tuple[int,int]:
Expand Down Expand Up @@ -117,6 +118,26 @@ def generate_markdown(results, names=None, verbose=False, thresh_sig=False, cind
if verbose: print(md_output)
return md_table

def combine_dataset_pids(data_dir=None, target='nomsa_binary_original_binary', subset='full'):
    """
    Combine unique protein IDs from all datasets into a single DataFrame.

    Reads the ``XY.csv`` file for each of the PDBbind, davis, and kiba
    datasets, keeps one row per unique ``prot_id`` within each dataset, and
    stacks the results (used for matching proteins against external sources,
    e.g. TCGA).

    Args:
        data_dir (str, optional): Root directory containing the dataset
            folders. Defaults to ``cfg.DATA_ROOT``, resolved at call time
            (not import time) so later config changes are respected.
        target (str, optional): Feature/edge configuration subdirectory to
            read from. Defaults to ``'nomsa_binary_original_binary'``.
        subset (str, optional): Dataset split to read (e.g. ``'full'``,
            ``'test'``). Defaults to ``'full'``.

    Returns:
        pd.DataFrame: Columns ``['db', 'code', 'prot_id', 'seq_len',
        'prot_seq']`` with a per-dataset positional index named ``'db_idx'``.
    """
    if data_dir is None:
        data_dir = cfg.DATA_ROOT

    dbs = ['PDBbindDataset', 'DavisKibaDataset/davis', 'DavisKibaDataset/kiba']
    frames = []
    for root_dir in (f'{data_dir}/{d}' for d in dbs):
        print(root_dir)
        db_name = root_dir.split("/")[-1]  # e.g. 'PDBbindDataset', 'davis', 'kiba'

        # BUG FIX: the `target` argument was previously ignored — the path was
        # hard-coded to 'nomsa_binary_original_binary'.
        df = pd.read_csv(f"{root_dir}/{target}/{subset}/XY.csv", index_col=0)
        # One row per unique protein; keeps the first occurrence.
        df = df[['prot_id', 'prot_seq']].drop_duplicates(subset='prot_id')
        df['seq_len'] = df['prot_seq'].str.len()
        df['db'] = db_name
        df = df.reset_index()  # exposes the original index as the 'code' column
        df = df[['db', 'code', 'prot_id', 'seq_len', 'prot_seq']]  # reorder them.
        df.index.name = 'db_idx'
        frames.append(df)

    # Single concat at the end instead of growing a DataFrame in the loop.
    return pd.concat(frames, axis=0)

if __name__ == '__main__':
#NOTE: the following is code for stratifying AutoDock Vina results by
# missing residues to identify if there is a correlation between missing
Expand Down

0 comments on commit 5696a7a

Please sign in to comment.