Skip to content

Commit

Permalink
chore: refactor all_ontology_generator for cleanliness and modularity
Browse files Browse the repository at this point in the history
  • Loading branch information
nayib-jose-gloria committed Feb 12, 2024
1 parent 3e50769 commit 4384459
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 158 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/generate_all_ontology.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ name: Updates to Ontology Files
on:
push:
paths:
- '**/tools/ontology-builder/ontology-references/owl_info.yml'
- '**/tools/ontology-builder/ontology-references/ontology_info.yml'
branches-ignore:
- main

Expand Down
229 changes: 113 additions & 116 deletions tools/ontology-builder/all_ontology_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,21 @@
import yaml


def _download_owls(owl_info_yml: str = env.OWL_INFO_YAML, output_dir: str = env.ONTOLOGY_DIR):
def _download_ontologies(onto_info_yml: str = env.ONTO_INFO_YAML, output_dir: str = env.RAW_ONTOLOGY_DIR):
"""
Downloads the ontology owl files specified in 'owl_info_yml' into 'output_dir'
Downloads the ontology files specified in 'ontology_info.yml' into 'output_dir'
:param str owl_info_yml: path to yaml file wit OWL information
:param str output_dir: path to writable directory where owl files will be downloaded to
:param str onto_info_yml: path to yaml file with ontology information
:param str output_dir: path to writable directory where ontology files will be downloaded to
:rtype None
"""

with open(owl_info_yml, "r") as owl_info_handle:
owl_info = yaml.safe_load(owl_info_handle)
with open(onto_info_yml, "r") as onto_info_handle:
ontology_info = yaml.safe_load(onto_info_handle)

def download(_ontology, _url):
print(f"Start Downloading {_ontology}")
# Format of owl (handles cases where they are compressed)
# Format of ontology (handles cases where they are compressed)
download_format = _url.split(".")[-1]

output_file = os.path.join(output_dir, _ontology + ".owl")
Expand All @@ -39,10 +38,13 @@ def download(_ontology, _url):
urllib.request.urlretrieve(_url, output_file)
print(f"Finish Downloading {_ontology}")

def _build_url(_ontology: str):
onto_ref_data = ontology_info[_ontology]
return f"{onto_ref_data['source']}/{onto_ref_data['version']}/{ontology.lower()}.{onto_ref_data['filetype']}"

threads = []
for ontology, _ in owl_info.items():
latest_version = owl_info[ontology]["latest"]
url = owl_info[ontology]["urls"][latest_version]
for ontology, _ in ontology_info.items():
url = _build_url(ontology)
try:
urllib.request.urlopen(url)
except HTTPError as e:
Expand Down Expand Up @@ -72,13 +74,100 @@ def _decompress(infile: str, tofile: str):
tof.write(decom_str)


def _parse_owls(
working_dir: str = env.ONTOLOGY_DIR,
owl_info_yml: str = env.OWL_INFO_YAML,
def _load_ontology_object(onto_file: str) -> owlready2.namespace.Ontology:
    """
    Read ontology data from file and load it into an owlready2 ontology object
    :param str onto_file: filepath to ontology file
    :rtype owlready2.namespace.Ontology
    :return the loaded ontology
    """
    # a new World is created for every file that is loaded
    world = owlready2.World()
    onto = world.get_ontology(onto_file)
    onto.load()
    return onto


def _get_ancestors(onto_class: owlready2.entity.ThingClass, onto_name: str) -> List[str]:
"""
Returns a list of ancestors ids of the given onto class, only returns those belonging to ontology_name,
it will format the id from the form CL_xxxx to CL:xxxx
:param owlready2.entity.ThingClass onto_class: the class for which ancestors will be retrieved
:param str onto_name: only ancestors from this ontology will be kept
:rtype List[str]
:return list of ancestors (term ids), it could be empty
"""

ancestors = []

for ancestor in onto_class.ancestors():
if onto_class.name == ancestor.name:
continue
if ancestor.name.split("_")[0] == onto_name:
ancestors.append(ancestor.name.replace("_", ":"))

return ancestors


def _extract_ontology_term_metadata(onto: owlready2.entity.ThingClass) -> dict:
"""
Extract relevant metadata from ontology object and save into a dictionary following our JSON Schema
:param: onto: Ontology Object to Process
:return: Dict[str, str] map of ontology term IDs to pertinent metadata from ontology files
"""
term_dict = dict()
for onto_term in onto.classes():
term_id = onto_term.name.replace("_", ":")

# Skip terms that are not direct children from this ontology
if onto.name != term_id.split(":")[0]:
continue
# Gets ancestors
ancestors = _get_ancestors(onto_term, onto.name)

# Special Case: skip the current term if it is an NCBI Term, but not a descendant of 'NCBITaxon:33208'.
if onto.name == "NCBITaxon" and "NCBITaxon:33208" not in ancestors:
continue

term_dict[term_id] = dict()

# only write the ancestors if it's not NCBITaxon, as this saves a lot of disk space and there is
# no current use-case for NCBITaxon
term_dict[term_id]["ancestors"] = [] if onto.name == "NCBITaxon" else ancestors

# Gets label
term_dict[term_id]["label"] = onto_term.label[0] if onto_term.label else ""

# Add the "deprecated" status and associated metadata if True
term_dict[term_id]["deprecated"] = False
if onto_term.deprecated and onto_term.deprecated.first():
# if deprecated, include information to determine replacement term(s)
term_dict[term_id]["deprecated"] = True
if onto_term.comment:
term_dict[term_id]["comments"] = [str(c) for c in onto_term.comment]
# stores term tracking URL, such as a github issue discussing deprecated term
if hasattr(onto_term, "IAO_0000233") and onto_term.IAO_0000233:
term_dict[term_id]["term_tracker"] = str(onto_term.IAO_0000233[0])

# only need to record replaced_by OR considers
if onto_term.IAO_0100001 and onto_term.IAO_0100001.first():
# url --> term
ontology_term = re.findall(r"[^\W_]+", str(onto_term.IAO_0100001[0]))
term_dict[term_id]["replaced_by"] = f"{ontology_term[-2]}:{ontology_term[-1]}"
else:
if hasattr(onto_term, "consider") and onto_term.consider:
term_dict[term_id]["consider"] = [str(c) for c in onto_term.consider]
return term_dict


def _parse_ontologies(
working_dir: str = env.RAW_ONTOLOGY_DIR,
output_json_file: str = env.PARSED_ONTOLOGIES_FILE,
):
"""
Parser all owl files in working_dir. Extracts information from all classes in the owl file.
Parse all ontology files in working_dir. Extracts information from all classes in the ontology file.
The extracted information is written into a gzipped a json file with the following structure:
{
"ontology_name":
Expand All @@ -101,114 +190,22 @@ def _parse_owls(
}
}
:param str working_dir: path to folder with owl files
:param str owl_info_yml: path to writable directory where owl files will be downloaded to
:param str owl_info_yml: path to yaml file wit owl information
:param str output_json_file: path to output jsaon file
:param str working_dir: path to folder with ontology files
:param str output_json_file: path to output json file
:rtype None
"""

with open(owl_info_yml, "r") as owl_info_handle:
owl_info = yaml.safe_load(owl_info_handle)

owl_files = []
for owl_file in os.listdir(working_dir):
if owl_file.endswith(".owl"):
owl_files.append(os.path.join(working_dir, owl_file))

# Parse owl files
onto_dict = {}
for owl_file in owl_files:
world = owlready2.World()
onto = world.get_ontology(owl_file)
onto.load()
onto_dict[onto.name] = {}

onto_dict = dict()
for onto_file in os.listdir(working_dir):
onto = _load_ontology_object(os.path.join(working_dir, onto_file))
print(f"Processing {onto.name}")

for onto_class in onto.classes():
term_id = onto_class.name.replace("_", ":")

# Skip terms that are not direct children from this ontology
if onto.name != term_id.split(":")[0]:
continue

# If there are specified target terms then only work with them
if onto.name in owl_info and "only" in owl_info[onto.name] and term_id not in owl_info[onto.name]["only"]:
continue

# Gets label
onto_dict[onto.name][term_id] = dict()
try:
onto_dict[onto.name][term_id]["label"] = onto_class.label[0]
except IndexError:
onto_dict[onto.name][term_id]["label"] = ""

# Add the "deprecated" status
onto_dict[onto.name][term_id]["deprecated"] = False
if onto_class.deprecated and onto_class.deprecated.first():
# if deprecated, include information to determine replacement term(s)
onto_dict[onto.name][term_id]["deprecated"] = True
if onto_class.comment:
onto_dict[onto.name][term_id]["comments"] = [str(c) for c in onto_class.comment]
# stores term tracking URL, such as a github issue discussing deprecated term
if hasattr(onto_class, "IAO_0000233") and onto_class.IAO_0000233:
onto_dict[onto.name][term_id]["term_tracker"] = str(onto_class.IAO_0000233[0])

# only need to record replaced_by OR considers
if onto_class.IAO_0100001 and onto_class.IAO_0100001.first():
# url --> term
ontology_term = re.findall(r"[^\W_]+", str(onto_class.IAO_0100001[0]))
onto_dict[onto.name][term_id]["replaced_by"] = f"{ontology_term[-2]}:{ontology_term[-1]}"
else:
if hasattr(onto_class, "consider") and onto_class.consider:
onto_dict[onto.name][term_id]["consider"] = [str(c) for c in onto_class.consider]
# Gets ancestors
ancestors = _get_ancestors(onto_class, onto.name)

# If "children_of" specified in owl info then skip the current term if it is
# not a children of those indicated.
if (onto.name in owl_info and "children_of" in owl_info[onto.name]) and (
not list(set(ancestors) & set(owl_info[onto.name]["children_of"]))
):
onto_dict[onto.name].pop(term_id)
continue

# only add the ancestors if it's not NCBITaxon, as this saves a lot of disk space
if onto.name == "NCBITaxon":
onto_dict[onto.name][term_id]["ancestors"] = []
else:
onto_dict[onto.name][term_id]["ancestors"] = ancestors
onto_dict[onto.name] = _extract_ontology_term_metadata(onto)

with gzip.open(output_json_file, "wt") as output_json:
json.dump(onto_dict, output_json, indent=2)


def _get_ancestors(onto_class: owlready2.entity.ThingClass, ontololgy_name: str) -> List[str]:
"""
Returns a list of ancestors ids of the given onto class, only returns those belonging to ontology_name,
it will format the id from the form CL_xxxx to CL:xxxx
:param owlready2.entity.ThingClass onto_class: the class for which ancestors will be retrieved
:param str ontololgy_name: only ancestors from this ontology will be kept
:rtype List[str]
:return list of ancestors (term ids), it could be empty
"""

ancestors = []

for ancestor in onto_class.ancestors():
if onto_class.name == ancestor.name:
continue
if ancestor.name.split("_")[0] == ontololgy_name:
ancestors.append(ancestor.name.replace("_", ":"))

return ancestors


# Download and parse owls upon execution
# Download and parse ontology files upon execution
if __name__ == "__main__":
_download_owls()
_parse_owls()
_download_ontologies()
_parse_ontologies()
7 changes: 4 additions & 3 deletions tools/ontology-builder/env.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os

PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))
ONTOLOGY_DIR = os.path.join(PACKAGE_ROOT, "ontology-references")
OWL_INFO_YAML = os.path.join(ONTOLOGY_DIR, "owl_info.yml")
PARSED_ONTOLOGIES_FILE = os.path.join(ONTOLOGY_DIR, "all_ontology.json.gz")
ONTOLOGY_REF_DIR = os.path.join(PACKAGE_ROOT, "ontology-references")
RAW_ONTOLOGY_DIR = os.path.join(ONTOLOGY_REF_DIR, "raw-files")
ONTO_INFO_YAML = os.path.join(ONTOLOGY_REF_DIR, "ontology_info.yml")
PARSED_ONTOLOGIES_FILE = os.path.join(ONTOLOGY_REF_DIR, "all_ontology.json.gz")
36 changes: 36 additions & 0 deletions tools/ontology-builder/ontology-references/ontology_info.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
CL:
version: v2024-01-04
source: https://github.com/obophenotype/cell-ontology/releases/download
filetype: owl
EFO:
version: v3.62.0
source: https://github.com/EBISPOT/efo/releases/download
filetype: owl
HANCESTRO:
version: 3.0
source: https://github.com/EBISPOT/hancestro/raw/
filetype: owl
HsapDv:
version: 11
source: http://aber-owl.net/media/ontologies/HSAPDV/
filetype: owl
MONDO:
version: v2024-01-03
source: https://github.com/monarch-initiative/mondo/releases/download/
filetype: owl
MmusDv:
version: 9
source: http://aber-owl.net/media/ontologies/MMUSDV/
filetype: owl
NCBITaxon:
version: v2023-06-20
source: https://github.com/obophenotype/ncbitaxon/releases/download/
filetype: owl.gz
UBERON:
version: v2024-01-18
source: https://github.com/obophenotype/uberon/releases/download/
filetype: owl
PATO:
version: v2023-05-18
source: https://github.com/pato-ontology/pato/raw/
filetype: owl
38 changes: 0 additions & 38 deletions tools/ontology-builder/ontology-references/owl_info.yml

This file was deleted.

0 comments on commit 4384459

Please sign in to comment.