feat(tcga): msa for tcga muts #99 #95
jyaacoub committed May 16, 2024
1 parent 985d684 commit 5340ec8
Showing 3 changed files with 98 additions and 12 deletions.
69 changes: 68 additions & 1 deletion playground.py
@@ -58,8 +58,8 @@
print(f"Filter #2 (ref_AA match): {tmp:5d} - {tmp-len(dfm):5d} = {len(dfm):5d}")
print('\n',dfm.db.value_counts())

# %% final seq len distribution
n_bins = 25
lengths = dfm.seq_len_x
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
@@ -78,4 +78,71 @@
plt.tight_layout()
plt.show()

# %% Getting updated sequences
def apply_mut(row):
    ref_seq = list(row['prot_seq'])
    ref_seq[row['mt_loc']-1] = row['mt_AA']  # mt_loc is 1-indexed
    return ''.join(ref_seq)

dfm['mt_seq'] = dfm.apply(apply_mut, axis=1)
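# optional sanity check, a minimal sketch assuming dfm still carries the
# 'ref_AA' column used by Filter #2 above: the residue being overwritten
# should equal the annotated reference residue.
bad = dfm.apply(lambda r: r['prot_seq'][r['mt_loc']-1] != r['ref_AA'], axis=1)
assert not bad.any(), f"{bad.sum()} rows fail the ref_AA position check"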


# %%
dfm.to_csv("/cluster/home/t122995uhn/projects/data/tcga/tcga_maf_davis_pdbbind.csv")
# %%
from src.utils.seq_alignment import MSARunner
from tqdm import tqdm
import pandas as pd
import os

DATA_DIR = '/cluster/home/t122995uhn/projects/data/tcga'
CSV = f'{DATA_DIR}/tcga_maf_davis_pdbbind.csv'
N_CPUS = 6
NUM_ARRAYS = 10
array_idx = 0  # ${SLURM_ARRAY_TASK_ID}

df = pd.read_csv(CSV, index_col=0)
df.sort_values(by='seq_len_y', inplace=True)
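# (presumably sorted so that sequences of similar length land in the same
#  array partition below, keeping hhblits runtimes comparable within a task)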


# %%
for DB in df.db.unique():
    print('DB', DB)
    RAW_DIR = f'{DATA_DIR}/{DB}'
    # should already be unique if these are proteins mapped from tcga!
    unique_df = df[df['db'] == DB]
    ########################## Get job partition
    partition_size = len(unique_df) / NUM_ARRAYS
    start, end = int(array_idx*partition_size), int((array_idx+1)*partition_size)

    unique_df = unique_df[start:end]
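    # worked example of the slicing above (values hypothetical): with
    # NUM_ARRAYS = 10 and 95 rows, array_idx 0 covers rows [0, 9) and
    # array_idx 9 covers rows [85, 95), so every row lands in exactly one task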

    #################################### create fastas
    fa_dir = os.path.join(RAW_DIR, f'{DB}_fa')
    fasta_fp = lambda idx, pid: os.path.join(fa_dir, f"{idx}-{pid}.fasta")
    os.makedirs(fa_dir, exist_ok=True)
    for idx, (prot_id, pro_seq) in tqdm(
            unique_df[['prot_id', 'prot_seq']].iterrows(),
            desc='Creating fastas',
            total=len(unique_df)):
        with open(fasta_fp(idx, prot_id), "w") as f:
            f.write(f">{prot_id},{idx},{DB}\n{pro_seq}")
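    # each fasta then holds a single record, e.g. (hypothetical values):
    #   >P12345,42,davis
    #   MKVL...EESA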

    ##################################### Run hhblits
    aln_dir = os.path.join(RAW_DIR, f'{DB}_aln')
    aln_fp = lambda idx, pid: os.path.join(aln_dir, f"{idx}-{pid}.a3m")
    os.makedirs(aln_dir, exist_ok=True)

    # finally running
    for idx, (prot_id, mt_seq) in tqdm(
            unique_df[['prot_id', 'mt_seq']].iterrows(),
            desc='Running hhblits',
            total=len(unique_df)):
        # note: mt_seq itself is unused here; only idx and prot_id are needed
        # to locate the fasta written above
        in_fp = fasta_fp(idx, prot_id)
        out_fp = aln_fp(idx, prot_id)

        if not os.path.isfile(out_fp):
            # return_cmd=True prints the hhblits command for the first pending
            # alignment instead of running it; the break then stops the loop
            print(MSARunner.hhblits(in_fp, out_fp, n_cpus=N_CPUS, return_cmd=True))
            break
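    # sketch of the non-dry-run variant, assuming that omitting return_cmd
    # makes MSARunner.hhblits execute the command instead of returning it:
    #   if not os.path.isfile(out_fp):
    #       MSARunner.hhblits(in_fp, out_fp, n_cpus=N_CPUS)
    # (and drop the break to process the whole partition)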

# %%
7 changes: 7 additions & 0 deletions src/utils/config.py
@@ -123,6 +123,13 @@ class LIG_FEAT_OPT(StringEnum):
MMSEQ2_BIN = f'{Path.home()}/lib/mmseqs/bin/mmseqs'
RING3_BIN = f'{Path.home()}/lib/ring-3.0.0/ring/bin/ring'

if 'uhnh4h' in DOMAIN_NAME:
    UniRef_dir = '/cluster/projects/kumargroup/sequence_databases/UniRef30_2020_06/UniRef30_2020_06'
    hhsuite_bin_dir = '/cluster/tools/software/centos7/hhsuite/3.3.0/bin'
else:
    UniRef_dir = ''
    hhsuite_bin_dir = ''
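# (DOMAIN_NAME is assumed to be set earlier in config.py, e.g. from the host's
#  FQDN; the empty defaults just leave MSA tooling unconfigured off-cluster)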


###########################
# LOGGING STUFF:
34 changes: 23 additions & 11 deletions src/utils/seq_alignment.py
@@ -9,10 +9,9 @@


class MSARunner(Processor):
    UniRef_dir = cfg.UniRef_dir
    bin_hhblits = f'{cfg.hhsuite_bin_dir}/hhblits'
    bin_hhfilter = f'{cfg.hhsuite_bin_dir}/hhfilter'

    @staticmethod
    def hhblits(f_in:str, f_out:str, n_cpus=6, n_iter:int=2,
@@ -216,20 +215,33 @@ def read_clusttsv_output(tsv_path:str):


if __name__ == '__main__':
    from src.data_prep.datasets import BaseDataset
    from src.utils.seq_alignment import MSARunner
    from tqdm import tqdm
    import pandas as pd
    import os

    csv = '/cluster/home/t122995uhn/projects/data/PlatinumDataset/nomsa_binary/full/XY.csv'
    df = pd.read_csv(csv, index_col=0)
    #################### Get unique proteins:
    # sort by sequence length before dropping duplicates so that we keep the
    # longest protein sequence instead of just the first one encountered.
    df['seq_len'] = df['prot_seq'].str.len()
    df = df.sort_values(by='seq_len', ascending=False)

    # create a new enumerated index column so the first unique uniprot ID is
    # fetched properly
    df.reset_index(drop=False, inplace=True)
    unique_pro = df[['prot_id']].drop_duplicates(keep='first')

    # revert to the code-based index
    df.set_index('code', inplace=True)
    unique_df = df.iloc[unique_pro.index]
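    # optional sanity check: every prot_id should now appear exactly once
    assert unique_df['prot_id'].is_unique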

    ########################## Get job partition
    NUM_ARRAYS = 100
    array_idx = 0  # ${SLURM_ARRAY_TASK_ID}
    partition_size = len(unique_df) / NUM_ARRAYS
    start, end = int(array_idx*partition_size), int((array_idx+1)*partition_size)

    unique_df = unique_df[start:end]

    raw_dir = '/cluster/home/t122995uhn/projects/data/PlatinumDataset/raw'

    #################################### create fastas
