From c8e0502634bd0eb5e8fc72cc20cc90db9ad0e7e6 Mon Sep 17 00:00:00 2001
From: Bartek Foltyn <62876443+bfoltyn@users.noreply.github.com>
Date: Thu, 9 Nov 2023 17:17:37 +0100
Subject: [PATCH] EFO: add xref details to node data

merges https://github.com/related-sciences/nxontology-data/pull/21
refs https://github.com/related-sciences/nxontology-data/issues/18

Co-authored-by: Bartek Foltyn <bartosz.foltyn@getindata.com>
---
 nxontology_data/efo/efo.py | 71 +++++++++++++++++++++++++++++++++-----
 1 file changed, 63 insertions(+), 8 deletions(-)

diff --git a/nxontology_data/efo/efo.py b/nxontology_data/efo/efo.py
index 36ac212..96370aa 100644
--- a/nxontology_data/efo/efo.py
+++ b/nxontology_data/efo/efo.py
@@ -253,6 +253,68 @@ def update_term(old_term: str) -> str:
         )
         return {k: sorted(v) for k, v in current_to_old.items()}
 
+    def get_xref_details(self) -> dict[str, list[dict[str, str | list[str] | None]]]:
+        """Map each EFO ID to its xref records (xref_id, relation, sources).
+
+        Rows with a null xref_id and duplicate efo_id/xref_id pairs are dropped.
+        """
+        keys = ["efo_id", "xref_id"]
+        xrefs = (
+            self.get_xrefs_df()[["efo_id", "xref_bioregistry"]]
+            .rename(columns={"xref_bioregistry": "xref_id"})
+            # mirrors the dropna/set handling of xref_bioregistry in get_nodes;
+            # otherwise null or repeated xref_ids pass through the outer merges
+            .dropna(subset=["xref_id"])
+            .drop_duplicates()
+        )
+
+        xref_sources = (
+            self.get_xref_sources_df()
+            .assign(
+                xref_id=lambda df: df["xref"]
+                .str.split(":", expand=True)
+                .apply(
+                    lambda row: normalize_parsed_curie(
+                        xref_prefix=row[0],
+                        xref_accession=row[1],
+                        collapse_orphanet=True,
+                    ),
+                    axis="columns",
+                )
+            )
+            .groupby(keys)["axiom_source"]
+            .apply(list)
+            .reset_index()
+            .rename(columns={"axiom_source": "sources"})
+        )
+
+        def get_relation(x: list[str]) -> str | None:
+            if "skos:exactMatch" in x or "mondo:exactMatch" in x:
+                return "skos:exactMatch"
+            if "skos:closeMatch" in x or "mondo:closeMatch" in x:
+                return "skos:closeMatch"
+            return None
+
+        mapping_properties = (
+            self.get_mapping_properties_df()
+            .groupby(keys)["mapping_property_id"]
+            .apply(list)
+            .reset_index()
+            .rename(columns={"mapping_property_id": "mapping_properties"})
+            .assign(relation=lambda df: df["mapping_properties"].apply(get_relation))
+        )
+
+        xref_details = (
+            xrefs.merge(mapping_properties, how="outer", on=keys)
+            .merge(xref_sources, how="outer", on=keys)
+            .query("efo_id != xref_id")
+        )
+
+        return {
+            k: v[["xref_id", "relation", "sources"]].to_dict(orient="records")
+            for k, v in xref_details.groupby("efo_id")
+        }
+
     def get_nodes(self) -> list[dict[str, Any]]:
         logger.info("Generating nodes")
         node_df = self.get_terms_df()
@@ -265,6 +327,7 @@ def get_nodes(self) -> list[dict[str, Any]]:
             .apply(lambda df: sorted(set(df.xref_bioregistry.dropna())))
         )
         node_df["subsets"] = node_df.efo_id.map(self.get_subsets())
+        node_df["xref_details"] = node_df.efo_id.map(self.get_xref_details())
         # Use .to_json and not .to_dict to convert NaN to None
         return json.loads(node_df.to_json(orient="records"))  # type: ignore [no-any-return]
 
@@ -319,14 +382,6 @@ def write_outputs(self) -> None:
         write_dataframe(
             self.get_obsolete_df(), output_dir.joinpath(f"{self.name}_obsolete.json.gz")
         )
-        write_dataframe(
-            self.get_mapping_properties_df(),
-            output_dir.joinpath(f"{self.name}_mapping_properties.json.gz"),
-        )
-        write_dataframe(
-            self.get_xref_sources_df(),
-            output_dir.joinpath(f"{self.name}_xref_sources.json.gz"),
-        )
         if nxo.name == "efo_otar_profile":
             nxo_slim = self.create_slim_nxo(nxo)
             # classify EFO node/disease precision using nxontology-ml