fix(tcga): focus on pdbbind davis #95 #99
Since we don't have a kiba GVPL dataset yet...
jyaacoub committed May 16, 2024
1 parent 10ce1b2 commit 985d684
Showing 2 changed files with 89 additions and 68 deletions.
135 changes: 75 additions & 60 deletions playground.py
@@ -1,66 +1,81 @@
# %%
#%% 1. Gather data for davis, kiba, and pdbbind datasets
import os
import pandas as pd
from src.utils.residue import Chain


df = pd.read_csv('/cluster/home/t122995uhn/projects/MutDTA/PDBbind_all_geneNames.csv')

#%% identify pocket location:
def get_residue_range(pdb_filename):
    """Return {chain_id: (min_residue, max_residue)} from a PDB file's ATOM records."""
    with open(pdb_filename, 'r') as pdb_file:
        chain_residues = {}
        for line in pdb_file:
            if line.startswith('ATOM'):
                chain_id = line[21].strip()
                residue_number = int(line[22:26].strip())
                if chain_id not in chain_residues:
                    chain_residues[chain_id] = set()
                chain_residues[chain_id].add(residue_number)

    chain_ranges = {}
    for chain_id, residues in chain_residues.items():
        min_residue = min(residues)
        max_residue = max(residues)
        chain_ranges[chain_id] = (min_residue, max_residue)

    return chain_ranges
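
# Example (hypothetical file and values): for a two-chain entry this returns the
# author-assigned residue bounds per chain, e.g.
#   get_residue_range('/path/to/1a2b_protein.pdb')  # -> {'A': (1, 290), 'B': (3, 287)}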
import matplotlib.pyplot as plt
from src.analysis.utils import combine_dataset_pids
from src import config as cfg
df_prots = combine_dataset_pids(dbs=[cfg.DATA_OPT.davis, cfg.DATA_OPT.PDBbind], # just davis and pdbbind for now
                                subset='test')

# %%
from tqdm import tqdm
dir_p = '/cluster/home/t122995uhn/projects/data/pdbbind/v2020-other-PL/'
pocket_fp = lambda x: f'{dir_p}/{x}/{x}_pocket.pdb'
prot_fp = lambda x: f'{dir_p}/{x}/{x}_protein.pdb'

dfu = df.drop_duplicates(subset='code')

mapped_prange = {}
for idx, (code, seq) in tqdm(dfu[['code', 'prot_seq']].iterrows(), total=len(dfu)):
    code = code.lower()
    # find the residue range for all chains in the pocket and in the full protein
    prot = get_residue_range(prot_fp(code))
    pocket = get_residue_range(pocket_fp(code))

    # find the chain whose sequence matches this entry's prot_seq
    chainID = None
    for cid, s in Chain(prot_fp(code)):
        if s == seq:
            chainID = cid
            break

    # Find the protein residue range for the same chain
    prot_ch = prot.get(chainID)
    if not prot_ch or chainID not in pocket:
        # print(f'No matching chain in protein file for {prot_fp(code)}')
        continue

    # Shift the pocket range so it indexes into the protein's 0-indexed sequence
    # (PDB residue numbering starts at the chain's first observed residue, not at 0):
    prange = (pocket[chainID][0] - prot_ch[0], pocket[chainID][1] - prot_ch[0])
    mapped_prange[code.upper()] = f"{prange[0]}-{prange[1]}"
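
# Worked example (hypothetical numbers): if chain A spans residues 10-300 in the
# protein file and the pocket file lists residues 45-60 for chain A, we store
# "35-50", which indexes directly into the 0-indexed prot_seq.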

#%% 2. Load TCGA data
df_tcga = pd.read_csv('../downloads/TCGA_ALL.maf', sep='\t')

# %%
# Add the pocket range to the dataframe
df['pocket_range'] = df['code'].map(mapped_prange)
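# values look like "35-50"; codes without a mapped range are left as NaN by .map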
df.to_csv("pdbbind_all_prange.csv")
#%% 3. Pre-filtering
df_tcga = df_tcga[df_tcga['Variant_Classification'] == 'Missense_Mutation']
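# 'Protein_position' is a "pos/len" string (VEP-style annotation), e.g. "858/1210",
# so the part after '/' is the annotated protein's full length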
df_tcga['seq_len'] = pd.to_numeric(df_tcga['Protein_position'].str.split('/').str[1])
df_tcga = df_tcga[df_tcga['seq_len'] < 5000]
df_tcga['seq_len'].plot.hist(bins=100, title="sequence length histogram capped at 5K")
plt.show()
df_tcga = df_tcga[df_tcga['seq_len'] < 1200]
df_tcga['seq_len'].plot.hist(bins=100, title="sequence length after capping at 1.2K")

#%% 4. Merging df_prots with TCGA
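# SWISSPROT entries can carry a version suffix, e.g. "P00533.2";
# keep only the accession so it can match prot_id below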
df_tcga['uniprot'] = df_tcga['SWISSPROT'].str.split('.').str[0]

dfm = df_tcga.merge(df_prots[df_prots.db != 'davis'],
                    left_on='uniprot', right_on='prot_id', how='inner')

# for davis we have to merge on Hugo_Symbol since davis prot_ids are gene names
dfm_davis = df_tcga.merge(df_prots[df_prots.db == 'davis'],
                          left_on='Hugo_Symbol', right_on='prot_id', how='inner')

dfm = pd.concat([dfm,dfm_davis], axis=0)

del dfm_davis # to save mem

# %% 5. Post-filtering step
# 5.1. Keep only sequences with matching sequence length (to get rid of mismatched isoforms)
# seq_len_x is from tcga, seq_len_y is from our dataset
tmp = len(dfm)
# allow for some error due to missing amino acids from pdb file in PDBbind dataset
# - assumption here is that isoforms will differ by more than 50 amino acids
dfm = dfm[(dfm.seq_len_y <= dfm.seq_len_x) & (dfm.seq_len_x <= dfm.seq_len_y+50)]
print(f"Filter #1 (seq_len) : {tmp:5d} - {tmp-len(dfm):5d} = {len(dfm):5d}")

# 5.2. Filter out those that don't have the same reference seq according to the "Protein_position" and "Amino_acids" columns

# Extract mutation location and reference amino acid from 'Protein_position' and 'Amino_acids' columns
dfm['mt_loc'] = pd.to_numeric(dfm['Protein_position'].str.split('/').str[0])
dfm = dfm[dfm['mt_loc'] < dfm['seq_len_y']]
dfm[['ref_AA', 'mt_AA']] = dfm['Amino_acids'].str.split('/', expand=True)

dfm['db_AA'] = dfm.apply(lambda row: row['prot_seq'][row['mt_loc']-1], axis=1)
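# e.g. for an EGFR L858R row, Protein_position is "858/1210" and Amino_acids is "L/R",
# so db_AA = prot_seq[857] should be 'L' if the dataset sequence matches the reference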

# Filter #2: Match proteins with the same reference amino acid at the mutation location
tmp = len(dfm)
dfm = dfm[dfm['db_AA'] == dfm['ref_AA']]
print(f"Filter #2 (ref_AA match): {tmp:5d} - {tmp-len(dfm):5d} = {len(dfm):5d}")
print('\n',dfm.db.value_counts())

# %% final seq len distribution

n_bins = 25
lengths = dfm.seq_len_x
fig, ax = plt.subplots(1, 1, figsize=(10, 5))

# Plot histogram
n, bins, patches = ax.hist(lengths, bins=n_bins, color='blue', alpha=0.7)
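# ax.hist returns the bin counts (n), the n_bins+1 bin edges, and the bar patches;
# zip() below stops at the shortest input, pairing each count with its bin's left edge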
ax.set_title('TCGA final filtering for db matches')

# Add counts to each bin
for count, x, patch in zip(n, bins, patches):
    ax.text(x + 0.5, count, str(int(count)), ha='center', va='bottom')

ax.set_xlabel('Sequence Length')
ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

# %%
22 changes: 14 additions & 8 deletions src/analysis/utils.py
@@ -118,20 +118,26 @@ def generate_markdown(results, names=None, verbose=False, thresh_sig=False, cind
    if verbose: print(md_output)
    return md_table

def combine_dataset_pids(data_dir=cfg.DATA_ROOT, target='nomsa_binary_original_binary', subset='full'):
def combine_dataset_pids(data_dir=cfg.DATA_ROOT,
                         dbs=[cfg.DATA_OPT.davis, cfg.DATA_OPT.kiba, cfg.DATA_OPT.PDBbind],
                         target='nomsa_aflow_gvp_binary', subset='full',
                         xy='cleaned_XY.csv'):
    df_all = None
    dbs = ['PDBbindDataset', 'DavisKibaDataset/davis', 'DavisKibaDataset/kiba']
    dbs = [f'{data_dir}/{d}' for d in dbs]
    for root_dir in dbs:
        print(root_dir)
        DB = root_dir.split("/")[-1]

        df = pd.read_csv(f"{root_dir}/nomsa_binary_original_binary/{subset}/XY.csv", index_col=0)
    dir_p = {'davis': 'DavisKibaDataset/davis',
             'kiba': 'DavisKibaDataset/kiba',
             'PDBbind': 'PDBbindDataset'}
    dbs = {d.value: f'{data_dir}/{dir_p[d]}/{target}/{subset}/{xy}' for d in dbs}
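    # e.g. dbs['davis'] would resolve to something like (assuming DATA_OPT values
    # are the plain strings 'davis', 'kiba', 'PDBbind'):
    # "<DATA_ROOT>/DavisKibaDataset/davis/nomsa_aflow_gvp_binary/full/cleaned_XY.csv"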

    for DB, fp in dbs.items():
        print(DB, fp)
        df = pd.read_csv(fp, index_col=0)
        df['pdb_id'] = df.prot_id.str.split("_").str[0]
        df = df[['prot_id', 'prot_seq']].drop_duplicates(subset='prot_id')

        df['seq_len'] = df['prot_seq'].str.len()
        df['db'] = DB
        df.reset_index(inplace=True)

        df = df[['db', 'code', 'prot_id', 'seq_len', 'prot_seq']] # reorder them.
        df.index.name = 'db_idx'
        df_all = df if df_all is None else pd.concat([df_all, df], axis=0)
