Skip to content

Commit

Permalink
feat(download): cids + CHEMBL for sdf download via pubchem #94 #27
Browse files Browse the repository at this point in the history
ligand_name is only really used by PLATINUM (#26, #27). Davis and KIBA use CIDs and CHEMBL ids, respectively.
  • Loading branch information
jyaacoub committed May 8, 2024
1 parent 996df97 commit 126dc27
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 9 deletions.
6 changes: 5 additions & 1 deletion playground.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
#%%
from concurrent.futures import ThreadPoolExecutor, as_completed
from src.data_prep.downloaders import Downloader

Downloader.download_SDFs(['CHEMBL245769', 'CHEMBL55788'], save_dir='./')

#%%
import pandas as pd

df = pd.read_csv("../data/TCGA_BRCA_Mutations.csv")
Expand Down
2 changes: 1 addition & 1 deletion src/data_prep/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -1073,7 +1073,7 @@ def download(self):
# (see https://github.com/jyaacoub/MutDTA/issues/27)
os.makedirs(self.raw_paths[2], exist_ok=True)
print('Downloading SDF files for ligands.')
Downloader.download_SDFs(ligand_names=df_raw['affin.lig_id'].unique(),
Downloader.download_SDFs(ligand_ids=df_raw['affin.lig_id'].unique(),
save_dir=self.raw_paths[2])

# Fixing smiles in csv file using downloaded sdf files
Expand Down
37 changes: 30 additions & 7 deletions src/data_prep/downloaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,21 +146,44 @@ def download_predicted_PDBs(UniProtID: Iterable[str], save_dir='./') -> dict:
return Downloader.download(UniProtID, save_path=save_path, url=url)

@staticmethod
def download_SDFs(ligand_ids: List[str],
                  save_dir='./data/structures/ligands/',
                  **kwargs) -> dict:
    """
    Wrapper of `Downloader.download` for downloading ligand SDF files.

    Each entry of `ligand_ids` may be a PubChem CID, a CHEMBL id, or a plain
    ligand name. The source URL is chosen *per id* (not from the first id
    only), so a list that mixes id types is routed correctly and an empty
    list no longer fails on `ligand_ids[0]`.

    ## Different urls for different databases
    CID (identifiable by the fact that it is all digits):
        - https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/11314340/record/SDF?record_type=3d
    CHEMBL id (identifiable by the fact that it starts with "CHEMBL"):
        - https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/xref/registryID/CHEMBL390156/record/sdf?record_type=3d
    Ligand name (anything else):
        - https://files.rcsb.org/ligands/download/{ligand_name}_ideal.sdf (e.g.: NLG_ideal.sdf)
        An alternative that is NOT used here is the PubChem name lookup:
        - https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{ligand_name}/record/SDF?record_type=3d

    Args:
        ligand_ids: Ids to download; forwarded unchanged to
            `Downloader.download`.
        save_dir: Directory in which each file is saved as `{id}.sdf`.
        **kwargs: Passed through to `Downloader.download`.

    Returns:
        Whatever `Downloader.download` returns (annotated as a dict).
        NOTE(review): behaviour for an empty `ligand_ids` is now delegated to
        `Downloader.download` -- confirm it handles an empty iterable.
    """
    urls = {'CID': lambda x: f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{x}/record/SDF?record_type=3d',
            'CHEMBL': lambda x: f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/xref/registryID/{x}/record/sdf?record_type=3d',
            'name': lambda x: f'https://files.rcsb.org/ligands/download/{x}_ideal.sdf'}

    def url(lid):
        # Coerce to str so integer CIDs (e.g. from a pandas/numpy column)
        # are classified instead of raising AttributeError on `.isdigit()`.
        lid = str(lid)
        if lid.isdigit():
            return urls['CID'](lid)
        if lid.startswith('CHEMBL'):
            return urls['CHEMBL'](lid)
        return urls['name'](lid)

    save_path = lambda x: os.path.join(save_dir, f'{x}.sdf')
    return Downloader.download(ligand_ids, save_path=save_path, url=url,
                               tqdm_desc='Downloading ligand sdfs', **kwargs)


if __name__ == '__main__':
# downloading pdbs from X.csv list
Expand Down

0 comments on commit 126dc27

Please sign in to comment.