feat(download): cids + CHEMBL for sdf download via pubchem #94 #27

ligand_name is only really used by platinum (#26, #27). Davis and kiba use CIDs and CHEMBL ids, respectively.
jyaacoub · May 8, 2024 · 126dc27 · 126dc27
1 parent 996df97
commit 126dc27
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 9 deletions.
diff --git a/playground.py b/playground.py
@@ -1,5 +1,9 @@
 #%%
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from src.data_prep.downloaders import Downloader
+
+Downloader.download_SDFs(['CHEMBL245769', 'CHEMBL55788'], save_dir='./')
+
+#%%
 import pandas as pd
 
 df = pd.read_csv("../data/TCGA_BRCA_Mutations.csv")

diff --git a/src/data_prep/datasets.py b/src/data_prep/datasets.py
@@ -1073,7 +1073,7 @@ def download(self):
         # (see https://github.com/jyaacoub/MutDTA/issues/27)
         os.makedirs(self.raw_paths[2], exist_ok=True)
         print('Downloading SDF files for ligands.')
-        Downloader.download_SDFs(ligand_names=df_raw['affin.lig_id'].unique(),
+        Downloader.download_SDFs(ligand_ids=df_raw['affin.lig_id'].unique(),
                                 save_dir=self.raw_paths[2])
 
         # Fixing smiles in csv file using downloaded sdf files        

diff --git a/src/data_prep/downloaders.py b/src/data_prep/downloaders.py
@@ -146,21 +146,44 @@ def download_predicted_PDBs(UniProtID: Iterable[str], save_dir='./') -> dict:
         return Downloader.download(UniProtID, save_path=save_path, url=url)
 
     @staticmethod
-    def download_SDFs(ligand_names: List[str], 
-                      save_dir='./data/structures/ligands/', **kwargs) -> dict:
+    def download_SDFs(ligand_ids: List[str],
+                      save_dir='./data/structures/ligands/',
+                      **kwargs) -> dict:
         """
         Wrapper of `Downloader.download` for downloading SDF files. 
         Fetches SDF files from 
         https://files.rcsb.org/ligands/download/{ligand_name}_ideal.sdf.
         
-        where ligand name is the name of the ligand.
+        Where ligand_id is either the CID, CHEMBL id, or simply the ligand name. Only the first
+        ligand_id in the list is inspected to determine the type; that type's url is used for all ids.
+        
+        ## Different urls for different databases
+        For CID we can use the following url (identifiable by the fact that it is a number)
+            - https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/11314340/record/SDF?record_type=3d
+        
+        For CHEMBL ids we can use the following url (identifiable by the fact that it starts with "CHEMBL")
+            - https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/xref/registryID/CHEMBL390156/record/sdf?record_type=3d
+        
+        For ligand names we can use the following url 
+            - https://files.rcsb.org/ligands/download/{ligand_name}_ideal.sdf. (e.g.: NLG_ideal.sdf)
+            OR
+            - https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{ligand_name}/record/SDF?record_type=3d
         """
-        save_path = lambda x: os.path.join(save_dir, f'{x}.sdf')
-        url = lambda x: f'https://files.rcsb.org/ligands/download/{x}_ideal.sdf'
+        urls = {'CID': lambda x: f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{x}/record/SDF?record_type=3d',
+                'CHEMBL': lambda x: f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/xref/registryID/{x}/record/sdf?record_type=3d',
+                'name': lambda x: f'https://files.rcsb.org/ligands/download/{x}_ideal.sdf'}
 
-        return Downloader.download(ligand_names, save_path=save_path, url=url, 
+        lid = ligand_ids[0]
+        if lid.isdigit():
+            url = urls['CID']
+        elif lid.startswith('CHEMBL'):
+            url = urls['CHEMBL']
+        else:
+            url = urls['name']
+
+        save_path = lambda x: os.path.join(save_dir, f'{x}.sdf')        
+        return Downloader.download(ligand_ids, save_path=save_path, url=url, 
                                    tqdm_desc='Downloading ligand sdfs', **kwargs)
-
 
 if __name__ == '__main__':
     # downloading pdbs from X.csv list