From 43844597e23aaabf7b3d7e0cd3593012b312ea05 Mon Sep 17 00:00:00 2001 From: nayib-jose-gloria Date: Mon, 12 Feb 2024 17:50:38 -0500 Subject: [PATCH] chore: refactor all_ontology_generator for cleanliness and modularity --- .github/workflows/generate_all_ontology.yml | 2 +- .../all_ontology_generator.py | 229 +++++++++--------- tools/ontology-builder/env.py | 7 +- .../ontology-references/ontology_info.yml | 36 +++ .../ontology-references/owl_info.yml | 38 --- 5 files changed, 154 insertions(+), 158 deletions(-) create mode 100644 tools/ontology-builder/ontology-references/ontology_info.yml delete mode 100644 tools/ontology-builder/ontology-references/owl_info.yml diff --git a/.github/workflows/generate_all_ontology.yml b/.github/workflows/generate_all_ontology.yml index 63efab8..2be9fec 100644 --- a/.github/workflows/generate_all_ontology.yml +++ b/.github/workflows/generate_all_ontology.yml @@ -3,7 +3,7 @@ name: Updates to Ontology Files on: push: paths: - - '**/tools/ontology-builder/ontology-references/owl_info.yml' + - '**/tools/ontology-builder/ontology-references/ontology_info.yml' branches-ignore: - main diff --git a/tools/ontology-builder/all_ontology_generator.py b/tools/ontology-builder/all_ontology_generator.py index c2c1886..0b5d122 100755 --- a/tools/ontology-builder/all_ontology_generator.py +++ b/tools/ontology-builder/all_ontology_generator.py @@ -12,22 +12,21 @@ import yaml -def _download_owls(owl_info_yml: str = env.OWL_INFO_YAML, output_dir: str = env.ONTOLOGY_DIR): +def _download_ontologies(onto_info_yml: str = env.ONTO_INFO_YAML, output_dir: str = env.RAW_ONTOLOGY_DIR): """ - Downloads the ontology owl files specified in 'owl_info_yml' into 'output_dir' + Downloads the ontology files specified in 'ontology_info.yml' into 'output_dir' - :param str owl_info_yml: path to yaml file wit OWL information - :param str output_dir: path to writable directory where owl files will be downloaded to + :param str onto_info_yml: path to yaml file with ontology information + :param str output_dir: path to writable directory where ontology files will be downloaded to :rtype None """ - - with open(owl_info_yml, "r") as owl_info_handle: - owl_info = yaml.safe_load(owl_info_handle) + with open(onto_info_yml, "r") as onto_info_handle: + ontology_info = yaml.safe_load(onto_info_handle) def download(_ontology, _url): print(f"Start Downloading {_ontology}") - # Format of owl (handles cases where they are compressed) + # Format of ontology (handles cases where they are compressed) download_format = _url.split(".")[-1] output_file = os.path.join(output_dir, _ontology + ".owl") @@ -39,10 +38,13 @@ def download(_ontology, _url): urllib.request.urlretrieve(_url, output_file) print(f"Finish Downloading {_ontology}") + def _build_url(_ontology: str): + onto_ref_data = ontology_info[_ontology] + return f"{onto_ref_data['source']}/{onto_ref_data['version']}/{ontology.lower()}.{onto_ref_data['filetype']}" + threads = [] - for ontology, _ in owl_info.items(): - latest_version = owl_info[ontology]["latest"] - url = owl_info[ontology]["urls"][latest_version] + for ontology, _ in ontology_info.items(): + url = _build_url(ontology) try: urllib.request.urlopen(url) except HTTPError as e: @@ -72,13 +74,100 @@ def _decompress(infile: str, tofile: str): tof.write(decom_str) -def _parse_owls( - working_dir: str = env.ONTOLOGY_DIR, - owl_info_yml: str = env.OWL_INFO_YAML, +def _load_ontology_object(onto_file: str) -> owlready2.entity.ThingClass: + """ + Read ontology data from file and write into a python object + + :param onto_file: filepath to ontology file + :return: + """ + world = owlready2.World() + onto = world.get_ontology(onto_file) + onto.load() + return onto + + +def _get_ancestors(onto_class: owlready2.entity.ThingClass, onto_name: str) -> List[str]: + """ + Returns a list of ancestors ids of the given onto class, only returns those belonging to ontology_name, + it will format the id from the form CL_xxxx to CL:xxxx + + :param owlready2.entity.ThingClass onto_class: the class for which ancestors will be retrieved + :param str onto_name: only ancestors from this ontology will be kept + + :rtype List[str] + :return list of ancestors (term ids), it could be empty + """ + + ancestors = [] + + for ancestor in onto_class.ancestors(): + if onto_class.name == ancestor.name: + continue + if ancestor.name.split("_")[0] == onto_name: + ancestors.append(ancestor.name.replace("_", ":")) + + return ancestors + + +def _extract_ontology_term_metadata(onto: owlready2.entity.ThingClass) -> dict: + """ + Extract relevant metadata from ontology object and save into a dictionary following our JSON Schema + + :param: onto: Ontology Object to Process + :return: Dict[str, str] map of ontology term IDs to pertinent metadata from ontology files + """ + term_dict = dict() + for onto_term in onto.classes(): + term_id = onto_term.name.replace("_", ":") + + # Skip terms that are not direct children from this ontology + if onto.name != term_id.split(":")[0]: + continue + # Gets ancestors + ancestors = _get_ancestors(onto_term, onto.name) + + # Special Case: skip the current term if it is an NCBI Term, but not a descendant of 'NCBITaxon:33208'. + if onto.name == "NCBITaxon" and "NCBITaxon:33208" not in ancestors: + continue + + term_dict[term_id] = dict() + + # only write the ancestors if it's not NCBITaxon, as this saves a lot of disk space and there is + # no current use-case for NCBITaxon + term_dict[term_id]["ancestors"] = [] if onto.name == "NCBITaxon" else ancestors + + # Gets label + term_dict[term_id]["label"] = onto_term.label[0] if onto_term.label else "" + + # Add the "deprecated" status and associated metadata if True + term_dict[term_id]["deprecated"] = False + if onto_term.deprecated and onto_term.deprecated.first(): + # if deprecated, include information to determine replacement term(s) + term_dict[term_id]["deprecated"] = True + if onto_term.comment: + term_dict[term_id]["comments"] = [str(c) for c in onto_term.comment] + # stores term tracking URL, such as a github issue discussing deprecated term + if hasattr(onto_term, "IAO_0000233") and onto_term.IAO_0000233: + term_dict[term_id]["term_tracker"] = str(onto_term.IAO_0000233[0]) + + # only need to record replaced_by OR considers + if onto_term.IAO_0100001 and onto_term.IAO_0100001.first(): + # url --> term + ontology_term = re.findall(r"[^\W_]+", str(onto_term.IAO_0100001[0])) + term_dict[term_id]["replaced_by"] = f"{ontology_term[-2]}:{ontology_term[-1]}" + else: + if hasattr(onto_term, "consider") and onto_term.consider: + term_dict[term_id]["consider"] = [str(c) for c in onto_term.consider] + return term_dict + + +def _parse_ontologies( + working_dir: str = env.RAW_ONTOLOGY_DIR, output_json_file: str = env.PARSED_ONTOLOGIES_FILE, ): """ - Parser all owl files in working_dir. Extracts information from all classes in the owl file. + Parse all ontology files in working_dir. Extracts information from all classes in the ontology file. The extracted information is written into a gzipped a json file with the following structure: { "ontology_name": @@ -101,114 +190,22 @@ def _parse_owls( } } - :param str working_dir: path to folder with owl files - :param str owl_info_yml: path to writable directory where owl files will be downloaded to - :param str owl_info_yml: path to yaml file wit owl information - :param str output_json_file: path to output jsaon file + :param str working_dir: path to folder with ontology files + :param str output_json_file: path to output json file :rtype None """ - - with open(owl_info_yml, "r") as owl_info_handle: - owl_info = yaml.safe_load(owl_info_handle) - - owl_files = [] - for owl_file in os.listdir(working_dir): - if owl_file.endswith(".owl"): - owl_files.append(os.path.join(working_dir, owl_file)) - - # Parse owl files - onto_dict = {} - for owl_file in owl_files: - world = owlready2.World() - onto = world.get_ontology(owl_file) - onto.load() - onto_dict[onto.name] = {} - + onto_dict = dict() + for onto_file in os.listdir(working_dir): + onto = _load_ontology_object(os.path.join(working_dir, onto_file)) print(f"Processing {onto.name}") - - for onto_class in onto.classes(): - term_id = onto_class.name.replace("_", ":") - - # Skip terms that are not direct children from this ontology - if onto.name != term_id.split(":")[0]: - continue - - # If there are specified target terms then only work with them - if onto.name in owl_info and "only" in owl_info[onto.name] and term_id not in owl_info[onto.name]["only"]: - continue - - # Gets label - onto_dict[onto.name][term_id] = dict() - try: - onto_dict[onto.name][term_id]["label"] = onto_class.label[0] - except IndexError: - onto_dict[onto.name][term_id]["label"] = "" - - # Add the "deprecated" status - onto_dict[onto.name][term_id]["deprecated"] = False - if onto_class.deprecated and onto_class.deprecated.first(): - # if deprecated, include information to determine replacement term(s) - onto_dict[onto.name][term_id]["deprecated"] = True - if onto_class.comment: - onto_dict[onto.name][term_id]["comments"] = [str(c) for c in onto_class.comment] - # stores term tracking URL, such as a github issue discussing deprecated term - if hasattr(onto_class, "IAO_0000233") and onto_class.IAO_0000233: - onto_dict[onto.name][term_id]["term_tracker"] = str(onto_class.IAO_0000233[0]) - - # only need to record replaced_by OR considers - if onto_class.IAO_0100001 and onto_class.IAO_0100001.first(): - # url --> term - ontology_term = re.findall(r"[^\W_]+", str(onto_class.IAO_0100001[0])) - onto_dict[onto.name][term_id]["replaced_by"] = f"{ontology_term[-2]}:{ontology_term[-1]}" - else: - if hasattr(onto_class, "consider") and onto_class.consider: - onto_dict[onto.name][term_id]["consider"] = [str(c) for c in onto_class.consider] - # Gets ancestors - ancestors = _get_ancestors(onto_class, onto.name) - - # If "children_of" specified in owl info then skip the current term if it is - # not a children of those indicated. - if (onto.name in owl_info and "children_of" in owl_info[onto.name]) and ( - not list(set(ancestors) & set(owl_info[onto.name]["children_of"])) - ): - onto_dict[onto.name].pop(term_id) - continue - - # only add the ancestors if it's not NCBITaxon, as this saves a lot of disk space - if onto.name == "NCBITaxon": - onto_dict[onto.name][term_id]["ancestors"] = [] - else: - onto_dict[onto.name][term_id]["ancestors"] = ancestors + onto_dict[onto.name] = _extract_ontology_term_metadata(onto) with gzip.open(output_json_file, "wt") as output_json: json.dump(onto_dict, output_json, indent=2) -def _get_ancestors(onto_class: owlready2.entity.ThingClass, ontololgy_name: str) -> List[str]: - """ - Returns a list of ancestors ids of the given onto class, only returns those belonging to ontology_name, - it will format the id from the form CL_xxxx to CL:xxxx - - :param owlready2.entity.ThingClass onto_class: the class for which ancestors will be retrieved - :param str ontololgy_name: only ancestors from this ontology will be kept - - :rtype List[str] - :return list of ancestors (term ids), it could be empty - """ - - ancestors = [] - - for ancestor in onto_class.ancestors(): - if onto_class.name == ancestor.name: - continue - if ancestor.name.split("_")[0] == ontololgy_name: - ancestors.append(ancestor.name.replace("_", ":")) - - return ancestors - - -# Download and parse owls upon execution +# Download and parse ontology files upon execution if __name__ == "__main__": - _download_owls() - _parse_owls() + _download_ontologies() + _parse_ontologies() diff --git a/tools/ontology-builder/env.py b/tools/ontology-builder/env.py index 6bd9f6a..53479d3 100644 --- a/tools/ontology-builder/env.py +++ b/tools/ontology-builder/env.py @@ -1,6 +1,7 @@ import os PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__)) -ONTOLOGY_DIR = os.path.join(PACKAGE_ROOT, "ontology-references") -OWL_INFO_YAML = os.path.join(ONTOLOGY_DIR, "owl_info.yml") -PARSED_ONTOLOGIES_FILE = os.path.join(ONTOLOGY_DIR, "all_ontology.json.gz") +ONTOLOGY_REF_DIR = os.path.join(PACKAGE_ROOT, "ontology-references") +RAW_ONTOLOGY_DIR = os.path.join(ONTOLOGY_REF_DIR, "raw-files") +ONTO_INFO_YAML = os.path.join(ONTOLOGY_REF_DIR, "ontology_info.yml") +PARSED_ONTOLOGIES_FILE = os.path.join(ONTOLOGY_REF_DIR, "all_ontology.json.gz") diff --git a/tools/ontology-builder/ontology-references/ontology_info.yml b/tools/ontology-builder/ontology-references/ontology_info.yml new file mode 100644 index 0000000..ee5fec7 --- /dev/null +++ b/tools/ontology-builder/ontology-references/ontology_info.yml @@ -0,0 +1,36 @@ +CL: + version: v2024-01-04 + source: https://github.com/obophenotype/cell-ontology/releases/download + filetype: owl +EFO: + version: v3.62.0 + source: https://github.com/EBISPOT/efo/releases/download + filetype: owl +HANCESTRO: + version: 3.0 + source: https://github.com/EBISPOT/hancestro/raw/ + filetype: owl +HsapDv: + version: 11 + source: http://aber-owl.net/media/ontologies/HSAPDV/ + filetype: owl +MONDO: + version: v2024-01-03 + source: https://github.com/monarch-initiative/mondo/releases/download/ + filetype: owl +MmusDv: + version: 9 + source: http://aber-owl.net/media/ontologies/MMUSDV/ + filetype: owl +NCBITaxon: + version: v2023-06-20 + source: https://github.com/obophenotype/ncbitaxon/releases/download/ + filetype: owl.gz +UBERON: + version: v2024-01-18 + source: https://github.com/obophenotype/uberon/releases/download/ + filetype: owl +PATO: + version: v2023-05-18 + source: https://github.com/pato-ontology/pato/raw/ + filetype: owl diff --git a/tools/ontology-builder/ontology-references/owl_info.yml b/tools/ontology-builder/ontology-references/owl_info.yml deleted file mode 100644 index 88e8eb0..0000000 --- a/tools/ontology-builder/ontology-references/owl_info.yml +++ /dev/null @@ -1,38 +0,0 @@ -CL: - latest: 2024-01-04 - urls: - 2024-01-04: https://github.com/obophenotype/cell-ontology/releases/download/v2024-01-04/cl.owl -EFO: - latest: 2024-01-15 EFO 3.62.0 - urls: - 2024-01-15 EFO 3.62.0: https://github.com/EBISPOT/efo/releases/download/v3.62.0/efo.owl -HANCESTRO: - latest: 3.0 - urls: - 3.0: https://github.com/EBISPOT/hancestro/raw/3.0/hancestro-base.owl -HsapDv: - latest: 2020-03-10 - urls: - 2020-03-10: http://aber-owl.net/media/ontologies/HSAPDV/11/hsapdv.owl -MONDO: - latest: 2024-01-03 - urls: - 2024-01-03: https://github.com/monarch-initiative/mondo/releases/download/v2024-01-03/mondo.owl -MmusDv: - latest: 2020-03-10 - urls: - 2020-03-10: http://aber-owl.net/media/ontologies/MMUSDV/9/mmusdv.owl -NCBITaxon: - latest: 2023-06-20 - urls: - 2023-06-20: https://github.com/obophenotype/ncbitaxon/releases/download/v2023-06-20/ncbitaxon.owl.gz - children_of: - - NCBITaxon:33208 -UBERON: - latest: 2024-01-18 - urls: - 2024-01-18: https://github.com/obophenotype/uberon/releases/download/v2024-01-18/uberon.owl -PATO: - latest: 2023-05-18 - urls: - 2023-05-18: https://github.com/pato-ontology/pato/raw/v2023-05-18/pato.owl