From 65657f98a1da0c5c73627215f692e7bb1b4ea830 Mon Sep 17 00:00:00 2001 From: Adriano Rutz Date: Mon, 13 May 2024 10:00:54 +0200 Subject: [PATCH] WIP #50 --- update/config.py | 9 ++++++ update/generate_database_index.py | 4 +-- update/queries/urls_formatters.rq | 46 +++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 2 deletions(-) create mode 100644 update/queries/urls_formatters.rq diff --git a/update/config.py b/update/config.py index 97c918c..8404042 100644 --- a/update/config.py +++ b/update/config.py @@ -407,6 +407,15 @@ "output_file": "triplets.csv", }, ), + Task( + name="urls_formatters", + f=download_query_as_csv.run, + group=DownloadGroup, + params={ + "query_file": "update/queries/urls_formatters.rq", + "output_file": "urls_formatters.csv", + }, + ), Task( name="generate_database_chemo", f=generate_database_chemo.run, diff --git a/update/generate_database_index.py b/update/generate_database_index.py index f724b7b..16e41a3 100644 --- a/update/generate_database_index.py +++ b/update/generate_database_index.py @@ -111,7 +111,7 @@ def run(path: Path) -> None: # structures.append({"id": struct, "smiles": smiles}) logging.info(" Processed structures") - # TODO add all IDs and formatters (See #50) descriptors_dict = {} with open(path / "descriptors_rdkit.csv", "r") as f: @@ -158,7 +158,7 @@ def run(path: Path) -> None: # Eventually TODO add taxa_names_com - # TODO add all IDs and formatters (See #50) taxon_ranks_dict = {} with open(path / "ranks_names.csv", "r") as f: diff --git a/update/queries/urls_formatters.rq b/update/queries/urls_formatters.rq new file mode 100644 index 0000000..e86c016 --- /dev/null +++ b/update/queries/urls_formatters.rq @@ -0,0 +1,46 @@ +PREFIX wd: <http://www.wikidata.org/entity/> +PREFIX wdt: <http://www.wikidata.org/prop/direct/> +PREFIX hint: <http://www.bigdata.com/queryHints#> + +SELECT * WHERE { + # All properties we use with a formatter URL + VALUES ?property { + wd:P231 # STRUCTURE CAS + wd:P233 # STRUCTURE SMILES (canonical) + wd:P234 # 
STRUCTURE InChI + wd:P235 # STRUCTURE InChIKey + wd:P356 # REFERENCE DOI + wd:P592 # STRUCTURE ChEMBL + wd:P638 # STRUCTURE PDB structure + wd:P661 # STRUCTURE ChemSpider + wd:P662 # STRUCTURE PubChem CID + wd:P683 # STRUCTURE ChEBI + wd:P665 # STRUCTURE KEGG + wd:P685 # TAXON NCBI + wd:P815 # TAXON ITIS + wd:P830 # TAXON EOL + wd:P846 # TAXON GBIF + wd:P850 # TAXON WoRMS + wd:P960 # TAXON TROPICOS + wd:P961 # TAXON IPNI + wd:P2017 # STRUCTURE SMILES (isomeric) + wd:P2057 # STRUCTURE HMDB + wd:P2064 # STRUCTURE KNApSAcK + wd:P2084 # STRUCTURE ZINC + wd:P2877 # STRUCTURE SureChEMBL + wd:P3151 # TAXON iNat + wd:P3636 # STRUCTURE PDB ligand + wd:P4964 # STRUCTURE SPLASH + wd:P5037 # TAXON PoWO + wd:P5055 # TAXON IRMNG + wd:P6689 # STRUCTURE MassBank + wd:P7715 # TAXON WFO + wd:P7746 # STRUCTURE NPAtlas + wd:P8533 # STRUCTURE SMARTS + wd:P9157 # TAXON OTL + wd:P9405 # STRUCTURE NMRShiftDB + wd:P10718 # STRUCTURE CXSMILES + wd:P11375 # STRUCTURE CSD + } + ?property wdt:P1630 ?formatter. hint:Prior hint:rangeSafe TRUE. +}