chore: refactor all_ontology_generator for cleanliness and modularity

chanzuckerberg · Feb 12, 2024 · 4384459 · 4384459
1 parent 3e50769
commit 4384459
Show file tree

Hide file tree

Showing 5 changed files with 154 additions and 158 deletions.
diff --git a/.github/workflows/generate_all_ontology.yml b/.github/workflows/generate_all_ontology.yml
@@ -3,7 +3,7 @@ name: Updates to Ontology Files
 on:
   push:
     paths:
-      - '**/tools/ontology-builder/ontology-references/owl_info.yml'
+      - '**/tools/ontology-builder/ontology-references/ontology_info.yml'
     branches-ignore:
       - main
 

diff --git a/tools/ontology-builder/all_ontology_generator.py b/tools/ontology-builder/all_ontology_generator.py
@@ -12,22 +12,21 @@
 import yaml
 
 
-def _download_owls(owl_info_yml: str = env.OWL_INFO_YAML, output_dir: str = env.ONTOLOGY_DIR):
+def _download_ontologies(onto_info_yml: str = env.ONTO_INFO_YAML, output_dir: str = env.RAW_ONTOLOGY_DIR):
     """
-    Downloads the ontology owl files specified in 'owl_info_yml' into 'output_dir'
+    Downloads the ontology files specified in 'ontology_info.yml' into 'output_dir'
 
-    :param str owl_info_yml: path to yaml file wit OWL information
-    :param str output_dir: path to writable directory where owl files will be downloaded to
+    :param str onto_info_yml: path to yaml file with ontology information
+    :param str output_dir: path to writable directory where ontology files will be downloaded to
 
     :rtype None
     """
-
-    with open(owl_info_yml, "r") as owl_info_handle:
-        owl_info = yaml.safe_load(owl_info_handle)
+    with open(onto_info_yml, "r") as onto_info_handle:
+        ontology_info = yaml.safe_load(onto_info_handle)
 
     def download(_ontology, _url):
         print(f"Start Downloading {_ontology}")
-        # Format of owl (handles cases where they are compressed)
+        # Format of ontology (handles cases where they are compressed)
         download_format = _url.split(".")[-1]
 
         output_file = os.path.join(output_dir, _ontology + ".owl")
@@ -39,10 +38,13 @@ def download(_ontology, _url):
             urllib.request.urlretrieve(_url, output_file)
         print(f"Finish Downloading {_ontology}")
 
+    def _build_url(_ontology: str):
+        onto_ref_data = ontology_info[_ontology]
+        return f"{onto_ref_data['source']}/{onto_ref_data['version']}/{ontology.lower()}.{onto_ref_data['filetype']}"
+
     threads = []
-    for ontology, _ in owl_info.items():
-        latest_version = owl_info[ontology]["latest"]
-        url = owl_info[ontology]["urls"][latest_version]
+    for ontology, _ in ontology_info.items():
+        url = _build_url(ontology)
         try:
             urllib.request.urlopen(url)
         except HTTPError as e:
@@ -72,13 +74,100 @@ def _decompress(infile: str, tofile: str):
         tof.write(decom_str)
 
 
-def _parse_owls(
-    working_dir: str = env.ONTOLOGY_DIR,
-    owl_info_yml: str = env.OWL_INFO_YAML,
+def _load_ontology_object(onto_file: str) -> owlready2.entity.ThingClass:
+    """
+    Read ontology data from file and load it into a python object
+
+    :param onto_file: filepath to ontology file
+    :return: the loaded ontology object
+    """
+    world = owlready2.World()
+    onto = world.get_ontology(onto_file)
+    onto.load()
+    return onto
+
+
+def _get_ancestors(onto_class: owlready2.entity.ThingClass, onto_name: str) -> List[str]:
+    """
+    Returns a list of ancestor ids of the given onto class, keeping only those belonging to onto_name;
+    each id is reformatted from the form CL_xxxx to CL:xxxx
+
+    :param owlready2.entity.ThingClass onto_class: the class for which ancestors will be retrieved
+    :param str onto_name: only ancestors from this ontology will be kept
+
+    :rtype List[str]
+    :return list of ancestors (term ids), it could be empty
+    """
+
+    ancestors = []
+
+    for ancestor in onto_class.ancestors():
+        if onto_class.name == ancestor.name:
+            continue
+        if ancestor.name.split("_")[0] == onto_name:
+            ancestors.append(ancestor.name.replace("_", ":"))
+
+    return ancestors
+
+
+def _extract_ontology_term_metadata(onto: owlready2.entity.ThingClass) -> dict:
+    """
+    Extract relevant metadata from ontology object and save into a dictionary following our JSON Schema
+
+    :param onto: Ontology Object to Process
+    :return: Dict[str, dict] map of ontology term IDs to pertinent metadata from ontology files
+    """
+    term_dict = dict()
+    for onto_term in onto.classes():
+        term_id = onto_term.name.replace("_", ":")
+
+        # Skip terms that are not direct children from this ontology
+        if onto.name != term_id.split(":")[0]:
+            continue
+        # Gets ancestors
+        ancestors = _get_ancestors(onto_term, onto.name)
+
+        # Special Case: skip the current term if it is an NCBI Term, but not a descendant of 'NCBITaxon:33208'.
+        if onto.name == "NCBITaxon" and "NCBITaxon:33208" not in ancestors:
+            continue
+
+        term_dict[term_id] = dict()
+
+        # only write the ancestors if it's not NCBITaxon, as this saves a lot of disk space and there is
+        # no current use-case for NCBITaxon ancestors
+        term_dict[term_id]["ancestors"] = [] if onto.name == "NCBITaxon" else ancestors
+
+        # Gets label
+        term_dict[term_id]["label"] = onto_term.label[0] if onto_term.label else ""
+
+        # Add the "deprecated" status and associated metadata if True
+        term_dict[term_id]["deprecated"] = False
+        if onto_term.deprecated and onto_term.deprecated.first():
+            # if deprecated, include information to determine replacement term(s)
+            term_dict[term_id]["deprecated"] = True
+            if onto_term.comment:
+                term_dict[term_id]["comments"] = [str(c) for c in onto_term.comment]
+            # stores term tracking URL, such as a github issue discussing deprecated term
+            if hasattr(onto_term, "IAO_0000233") and onto_term.IAO_0000233:
+                term_dict[term_id]["term_tracker"] = str(onto_term.IAO_0000233[0])
+
+            # only need to record replaced_by OR considers
+            if onto_term.IAO_0100001 and onto_term.IAO_0100001.first():
+                # url --> term
+                ontology_term = re.findall(r"[^\W_]+", str(onto_term.IAO_0100001[0]))
+                term_dict[term_id]["replaced_by"] = f"{ontology_term[-2]}:{ontology_term[-1]}"
+            else:
+                if hasattr(onto_term, "consider") and onto_term.consider:
+                    term_dict[term_id]["consider"] = [str(c) for c in onto_term.consider]
+    return term_dict
+
+
+def _parse_ontologies(
+    working_dir: str = env.RAW_ONTOLOGY_DIR,
     output_json_file: str = env.PARSED_ONTOLOGIES_FILE,
 ):
     """
-    Parser all owl files in working_dir. Extracts information from all classes in the owl file.
+    Parse all ontology files in working_dir. Extracts information from all classes in the ontology file.
     The extracted information is written into a gzipped json file with the following structure:
     {
         "ontology_name":
@@ -101,114 +190,22 @@ def _parse_owls(
             }
     }
 
-    :param str working_dir: path to folder with owl files
-    :param str owl_info_yml: path to writable directory where owl files will be downloaded to
-    :param str owl_info_yml: path to yaml file wit owl information
-    :param str output_json_file: path to output jsaon file
+    :param str working_dir: path to folder with ontology files
+    :param str output_json_file: path to output json file
 
     :rtype None
     """
-
-    with open(owl_info_yml, "r") as owl_info_handle:
-        owl_info = yaml.safe_load(owl_info_handle)
-
-    owl_files = []
-    for owl_file in os.listdir(working_dir):
-        if owl_file.endswith(".owl"):
-            owl_files.append(os.path.join(working_dir, owl_file))
-
-    # Parse owl files
-    onto_dict = {}
-    for owl_file in owl_files:
-        world = owlready2.World()
-        onto = world.get_ontology(owl_file)
-        onto.load()
-        onto_dict[onto.name] = {}
-
+    onto_dict = dict()
+    for onto_file in os.listdir(working_dir):
+        onto = _load_ontology_object(os.path.join(working_dir, onto_file))
         print(f"Processing {onto.name}")
-
-        for onto_class in onto.classes():
-            term_id = onto_class.name.replace("_", ":")
-
-            # Skip terms that are not direct children from this ontology
-            if onto.name != term_id.split(":")[0]:
-                continue
-
-            # If there are specified target terms then only work with them
-            if onto.name in owl_info and "only" in owl_info[onto.name] and term_id not in owl_info[onto.name]["only"]:
-                continue
-
-            # Gets label
-            onto_dict[onto.name][term_id] = dict()
-            try:
-                onto_dict[onto.name][term_id]["label"] = onto_class.label[0]
-            except IndexError:
-                onto_dict[onto.name][term_id]["label"] = ""
-
-            # Add the "deprecated" status
-            onto_dict[onto.name][term_id]["deprecated"] = False
-            if onto_class.deprecated and onto_class.deprecated.first():
-                # if deprecated, include information to determine replacement term(s)
-                onto_dict[onto.name][term_id]["deprecated"] = True
-                if onto_class.comment:
-                    onto_dict[onto.name][term_id]["comments"] = [str(c) for c in onto_class.comment]
-                # stores term tracking URL, such as a github issue discussing deprecated term
-                if hasattr(onto_class, "IAO_0000233") and onto_class.IAO_0000233:
-                    onto_dict[onto.name][term_id]["term_tracker"] = str(onto_class.IAO_0000233[0])
-
-                # only need to record replaced_by OR considers
-                if onto_class.IAO_0100001 and onto_class.IAO_0100001.first():
-                    # url --> term
-                    ontology_term = re.findall(r"[^\W_]+", str(onto_class.IAO_0100001[0]))
-                    onto_dict[onto.name][term_id]["replaced_by"] = f"{ontology_term[-2]}:{ontology_term[-1]}"
-                else:
-                    if hasattr(onto_class, "consider") and onto_class.consider:
-                        onto_dict[onto.name][term_id]["consider"] = [str(c) for c in onto_class.consider]
-            # Gets ancestors
-            ancestors = _get_ancestors(onto_class, onto.name)
-
-            # If "children_of" specified in owl info then skip the current term if it is
-            # not a children of those indicated.
-            if (onto.name in owl_info and "children_of" in owl_info[onto.name]) and (
-                not list(set(ancestors) & set(owl_info[onto.name]["children_of"]))
-            ):
-                onto_dict[onto.name].pop(term_id)
-                continue
-
-            # only add the ancestors if it's not NCBITaxon, as this saves a lot of disk space
-            if onto.name == "NCBITaxon":
-                onto_dict[onto.name][term_id]["ancestors"] = []
-            else:
-                onto_dict[onto.name][term_id]["ancestors"] = ancestors
+        onto_dict[onto.name] = _extract_ontology_term_metadata(onto)
 
     with gzip.open(output_json_file, "wt") as output_json:
         json.dump(onto_dict, output_json, indent=2)
 
 
-def _get_ancestors(onto_class: owlready2.entity.ThingClass, ontololgy_name: str) -> List[str]:
-    """
-    Returns a list of ancestors ids of the given onto class, only returns those belonging to ontology_name,
-    it will format the id from the form CL_xxxx to CL:xxxx
-
-    :param owlready2.entity.ThingClass onto_class: the class for which ancestors will be retrieved
-    :param str ontololgy_name: only ancestors from this ontology will be kept
-
-    :rtype List[str]
-    :return list of ancestors (term ids), it could be empty
-    """
-
-    ancestors = []
-
-    for ancestor in onto_class.ancestors():
-        if onto_class.name == ancestor.name:
-            continue
-        if ancestor.name.split("_")[0] == ontololgy_name:
-            ancestors.append(ancestor.name.replace("_", ":"))
-
-    return ancestors
-
-
-# Download and parse owls upon execution
+# Download and parse ontology files upon execution
 if __name__ == "__main__":
-    _download_owls()
-    _parse_owls()
+    _download_ontologies()
+    _parse_ontologies()
diff --git a/tools/ontology-builder/env.py b/tools/ontology-builder/env.py
@@ -1,6 +1,7 @@
 import os
 
 PACKAGE_ROOT = os.path.dirname(os.path.realpath(__file__))
-ONTOLOGY_DIR = os.path.join(PACKAGE_ROOT, "ontology-references")
-OWL_INFO_YAML = os.path.join(ONTOLOGY_DIR, "owl_info.yml")
-PARSED_ONTOLOGIES_FILE = os.path.join(ONTOLOGY_DIR, "all_ontology.json.gz")
+ONTOLOGY_REF_DIR = os.path.join(PACKAGE_ROOT, "ontology-references")
+RAW_ONTOLOGY_DIR = os.path.join(ONTOLOGY_REF_DIR, "raw-files")
+ONTO_INFO_YAML = os.path.join(ONTOLOGY_REF_DIR, "ontology_info.yml")
+PARSED_ONTOLOGIES_FILE = os.path.join(ONTOLOGY_REF_DIR, "all_ontology.json.gz")
diff --git a/tools/ontology-builder/ontology-references/ontology_info.yml b/tools/ontology-builder/ontology-references/ontology_info.yml
@@ -0,0 +1,36 @@
+CL:
+  version: v2024-01-04
+  source: https://github.com/obophenotype/cell-ontology/releases/download
+  filetype: owl
+EFO:
+  version: v3.62.0
+  source: https://github.com/EBISPOT/efo/releases/download
+  filetype: owl
+HANCESTRO:
+  version: 3.0
+  source: https://github.com/EBISPOT/hancestro/raw/
+  filetype: owl
+HsapDv:
+  version: 11
+  source: http://aber-owl.net/media/ontologies/HSAPDV/
+  filetype: owl
+MONDO:
+  version: v2024-01-03
+  source: https://github.com/monarch-initiative/mondo/releases/download/
+  filetype: owl
+MmusDv:
+  version: 9
+  source: http://aber-owl.net/media/ontologies/MMUSDV/
+  filetype: owl
+NCBITaxon:
+  version: v2023-06-20
+  source: https://github.com/obophenotype/ncbitaxon/releases/download/
+  filetype: owl.gz
+UBERON:
+  version: v2024-01-18
+  source: https://github.com/obophenotype/uberon/releases/download/
+  filetype: owl
+PATO:
+  version: v2023-05-18
+  source: https://github.com/pato-ontology/pato/raw/
+  filetype: owl
diff --git a/tools/ontology-builder/ontology-references/owl_info.yml b/tools/ontology-builder/ontology-references/owl_info.yml