feat(tcga): msa for tcga muts #99 #95
jyaacoub committed May 16, 2024
1 parent 985d684 commit 5340ec8
Showing 3 changed files with 98 additions and 12 deletions.
69 changes: 68 additions & 1 deletion playground.py
@@ -58,8 +58,8 @@
print(f"Filter #2 (ref_AA match): {tmp:5d} - {tmp-len(dfm):5d} = {len(dfm):5d}")
print('\n',dfm.db.value_counts())

# %% final seq len distribution
n_bins = 25
lengths = dfm.seq_len_x
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
@@ -78,4 +78,71 @@
plt.tight_layout()
plt.show()

# %% Getting updated sequences
def apply_mut(row):
    ref_seq = list(row['prot_seq'])
    ref_seq[row['mt_loc']-1] = row['mt_AA']  # mt_loc is 1-indexed
    return ''.join(ref_seq)

dfm['mt_seq'] = dfm.apply(apply_mut, axis=1)
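# optional sanity check, a minimal sketch assuming dfm still carries the
# 'ref_AA' column used by Filter #2 above: the residue being overwritten
# should equal the annotated reference residue.
bad = dfm.apply(lambda r: r['prot_seq'][r['mt_loc']-1] != r['ref_AA'], axis=1)
assert not bad.any(), f"{bad.sum()} rows fail the ref_AA position check"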


# %%
dfm.to_csv("/cluster/home/t122995uhn/projects/data/tcga/tcga_maf_davis_pdbbind.csv")
# %%
from src.utils.seq_alignment import MSARunner
from tqdm import tqdm
import pandas as pd
import os

DATA_DIR = '/cluster/home/t122995uhn/projects/data/tcga'
CSV = f'{DATA_DIR}/tcga_maf_davis_pdbbind.csv'
N_CPUS = 6
NUM_ARRAYS = 10
array_idx = 0  # ${SLURM_ARRAY_TASK_ID}

df = pd.read_csv(CSV, index_col=0)
df.sort_values(by='seq_len_y', inplace=True)
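# (presumably sorted so that sequences of similar length land in the same
#  array partition below, keeping hhblits runtimes comparable within a task)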


# %%
for DB in df.db.unique():
    print('DB', DB)
    RAW_DIR = f'{DATA_DIR}/{DB}'
    # should already be unique if these are proteins mapped from tcga!
    unique_df = df[df['db'] == DB]
    ########################## Get job partition
    partition_size = len(unique_df) / NUM_ARRAYS
    start, end = int(array_idx*partition_size), int((array_idx+1)*partition_size)

    unique_df = unique_df[start:end]
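    # worked example of the slicing above (values hypothetical): with
    # NUM_ARRAYS = 10 and 95 rows, array_idx 0 covers rows [0, 9) and
    # array_idx 9 covers rows [85, 95), so every row lands in exactly one task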

    #################################### create fastas
    fa_dir = os.path.join(RAW_DIR, f'{DB}_fa')
    fasta_fp = lambda idx, pid: os.path.join(fa_dir, f"{idx}-{pid}.fasta")
    os.makedirs(fa_dir, exist_ok=True)
    for idx, (prot_id, pro_seq) in tqdm(
            unique_df[['prot_id', 'prot_seq']].iterrows(),
            desc='Creating fastas',
            total=len(unique_df)):
        with open(fasta_fp(idx, prot_id), "w") as f:
            f.write(f">{prot_id},{idx},{DB}\n{pro_seq}")
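    # each fasta then holds a single record, e.g. (hypothetical values):
    #   >P12345,42,davis
    #   MKVL...EESA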

    ##################################### Run hhblits
    aln_dir = os.path.join(RAW_DIR, f'{DB}_aln')
    aln_fp = lambda idx, pid: os.path.join(aln_dir, f"{idx}-{pid}.a3m")
    os.makedirs(aln_dir, exist_ok=True)

    # finally running
    for idx, (prot_id, mt_seq) in tqdm(
            unique_df[['prot_id', 'mt_seq']].iterrows(),
            desc='Running hhblits',
            total=len(unique_df)):
        # note: mt_seq itself is unused here; only idx and prot_id are needed
        # to locate the fasta written above
        in_fp = fasta_fp(idx, prot_id)
        out_fp = aln_fp(idx, prot_id)

        if not os.path.isfile(out_fp):
            # return_cmd=True prints the hhblits command for the first pending
            # alignment instead of running it; the break then stops the loop
            print(MSARunner.hhblits(in_fp, out_fp, n_cpus=N_CPUS, return_cmd=True))
            break
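    # sketch of the non-dry-run variant, assuming that omitting return_cmd
    # makes MSARunner.hhblits execute the command instead of returning it:
    #   if not os.path.isfile(out_fp):
    #       MSARunner.hhblits(in_fp, out_fp, n_cpus=N_CPUS)
    # (and drop the break to process the whole partition)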

# %%
7 changes: 7 additions & 0 deletions src/utils/config.py
@@ -123,6 +123,13 @@ class LIG_FEAT_OPT(StringEnum):
MMSEQ2_BIN = f'{Path.home()}/lib/mmseqs/bin/mmseqs'
RING3_BIN = f'{Path.home()}/lib/ring-3.0.0/ring/bin/ring'

if 'uhnh4h' in DOMAIN_NAME:
    UniRef_dir = '/cluster/projects/kumargroup/sequence_databases/UniRef30_2020_06/UniRef30_2020_06'
    hhsuite_bin_dir = '/cluster/tools/software/centos7/hhsuite/3.3.0/bin'
else:
    UniRef_dir = ''
    hhsuite_bin_dir = ''
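# (DOMAIN_NAME is assumed to be set earlier in config.py, e.g. from the host's
#  FQDN; the empty defaults just leave MSA tooling unconfigured off-cluster)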


###########################
# LOGGING STUFF:
34 changes: 23 additions & 11 deletions src/utils/seq_alignment.py
@@ -9,10 +9,9 @@


class MSARunner(Processor):
    UniRef_dir = cfg.UniRef_dir
    bin_hhblits = f'{cfg.hhsuite_bin_dir}/hhblits'
    bin_hhfilter = f'{cfg.hhsuite_bin_dir}/hhfilter'

    @staticmethod
    def hhblits(f_in:str, f_out:str, n_cpus=6, n_iter:int=2,
@@ -216,20 +215,33 @@ def read_clusttsv_output(tsv_path:str):


if __name__ == '__main__':
    from src.data_prep.datasets import BaseDataset
    from src.utils.seq_alignment import MSARunner
    from tqdm import tqdm
    import pandas as pd
    import os

    csv = '/cluster/home/t122995uhn/projects/data/PlatinumDataset/nomsa_binary/full/XY.csv'
    df = pd.read_csv(csv, index_col=0)
    #################### Get unique proteins:
    # sort by sequence length before dropping duplicates so that we keep the
    # longest protein sequence instead of just the first one encountered.
    df['seq_len'] = df['prot_seq'].str.len()
    df = df.sort_values(by='seq_len', ascending=False)

    # create a new enumerated index column so the first unique uniprot ID is
    # fetched properly
    df.reset_index(drop=False, inplace=True)
    unique_pro = df[['prot_id']].drop_duplicates(keep='first')

    # revert to the code-based index
    df.set_index('code', inplace=True)
    unique_df = df.iloc[unique_pro.index]
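    # optional sanity check: every prot_id should now appear exactly once
    assert unique_df['prot_id'].is_unique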

    ########################## Get job partition
    NUM_ARRAYS = 100
    array_idx = 0  # ${SLURM_ARRAY_TASK_ID}
    partition_size = len(unique_df) / NUM_ARRAYS
    start, end = int(array_idx*partition_size), int((array_idx+1)*partition_size)

    unique_df = unique_df[start:end]

    raw_dir = '/cluster/home/t122995uhn/projects/data/PlatinumDataset/raw'

    #################################### create fastas
