Merge pull request #274 from TranslatorSRI/add-set-id-endpoint

Adds a `/get_setid` GET endpoint, which can be used to calculate a set ID for a set of CURIEs (implementing the specification described in #256 (comment)). The CURIEs are normalized (note that we don't do any validation to ensure that they are valid CURIEs or even that they look like CURIEs), sorted, and then returned as an SHA-256 hash. This PR also modifies `/get_allowed_conflations` so that it returns `DrugChemical` in addition to `GeneProtein`.
TranslatorSRI · Jul 18, 2024 · 795a036 · 795a036
2 parents 13f67ce + ed943f6
commit 795a036
Show file tree

Hide file tree

Showing 6 changed files with 386 additions and 10 deletions.
diff --git a/node_normalizer/model/__init__.py b/node_normalizer/model/__init__.py
@@ -3,4 +3,4 @@
 """
 
 from .input import CurieList, SemanticTypesInput
-from .response import CuriePivot, SemanticTypes, ConflationList
+from .response import CuriePivot, SemanticTypes, ConflationList, SetIDResponse
diff --git a/node_normalizer/model/response.py b/node_normalizer/model/response.py
@@ -4,7 +4,7 @@
 
 from pydantic import BaseModel
 
-from typing import Dict, List
+from typing import Dict, List, Optional
 
 
 class SemanticTypes(BaseModel):
@@ -27,5 +27,18 @@ class Config:
 class CuriePivot(BaseModel):
     curie_prefix: Dict[str, str]
 
+
 class ConflationList(BaseModel):
-    conflations: List
+    conflations: List
+
+
+class SetIDResponse(BaseModel):
+    """
+    Response model for the `/get_setid` endpoint.
+
+    `curies` and `conflations` are always present. Every other field is optional and
+    defaults to None, so the model can be created from the request arguments alone and
+    filled in step by step.
+    """
+
+    # The CURIEs and conflations the set ID was requested for.
+    curies: List[str]
+    conflations: List[str]
+    # Explanation of why no set ID was generated (e.g. an unsupported conflation was requested).
+    error: Optional[str] = None
+    # The sorted identifiers the set ID is computed from.
+    normalized_curies: Optional[List[str]] = None
+    # `normalized_curies` joined with '||'; this is the string that gets hashed.
+    normalized_string: Optional[str] = None
+    # The generated set ID.
+    setid: Optional[str] = None
+    # Alternative encodings that were tried but are not currently returned:
+    # base64: Optional[str]
+    # base64zlib: Optional[str]
+    # sha224hash: Optional[str]
diff --git a/node_normalizer/server.py b/node_normalizer/server.py
@@ -23,8 +23,10 @@
     CurieList,
     SemanticTypesInput,
     ConflationList,
+    SetIDResponse,
 )
 from .normalizer import get_normalized_nodes, get_curie_prefixes, normalize_message
+from .set_id import generate_setid
 from .redis_adapter import RedisConnectionFactory
 from .util import LoggingUtil
 from .examples import EXAMPLE_QUERY_DRUG_TREATS_ESSENTIAL_HYPERTENSION
@@ -167,8 +169,7 @@ async def get_conflations() -> ConflationList:
     """
     Get implemented conflations
     """
-    # TODO: build from config instead of hard-coding.
-    conflations = ConflationList(conflations=["GeneProtein"])
+    conflations = ConflationList(conflations=["GeneProtein", "DrugChemical"])
 
     return conflations
 
@@ -222,6 +223,27 @@ async def get_normalized_node_handler(curies: CurieList):
     return normalized_nodes
 
 
+# GET /get_setid: thin HTTP wrapper that forwards the repeated `curie` and `conflation`
+# query parameters to generate_setid() and returns its SetIDResponse unchanged.
+@app.get(
+    "/get_setid",
+    response_model=SetIDResponse,
+    summary="Normalize and deduplicate a set of identifiers and return a single hash that represents this set."
+)
+async def get_setid(
+    # NOTE(review): the default is [] while min_items=1 -- confirm whether a request with no
+    # `curie` parameter is rejected or reaches generate_setid() with an empty list.
+    curie: List[str] = fastapi.Query(
+        [],
+        description="Set of curies to normalize",
+        example=["MESH:D014867", "NCIT:C34373", "UNII:63M8RYN44N", "RUBBISH:1234"],
+        min_items=1,
+    ),
+    # Conflation names are not validated here; generate_setid() reports unsupported
+    # values through the `error` field of the response.
+    conflation: List[str] = fastapi.Query(
+        [],
+        description="Set of conflations to apply",
+        example=["GeneProtein", "DrugChemical"],
+    )
+) -> SetIDResponse:
+    return await generate_setid(app, curie, conflation)
+
+
 @app.get(
     "/get_semantic_types",
     response_model=SemanticTypes,

diff --git a/node_normalizer/set_id.py b/node_normalizer/set_id.py
@@ -0,0 +1,101 @@
+# set_id.py
+# Code related to generating IDs for sets (as in https://github.com/TranslatorSRI/NodeNormalization/issues/256).
+import base64
+import gzip
+import hashlib
+import logging
+import uuid
+
+import zlib
+
+from .model import SetIDResponse
+from .normalizer import get_normalized_nodes
+
+# UUID namespace for SetIDs. This is the namespace passed to uuid.uuid5() when a set ID is
+# generated, so changing this value changes every set ID produced for the same input.
+uuid_namespace_setid = uuid.UUID('14ef168c-14cb-4979-8442-da6aaca55572')
+
+
+async def generate_setid(app, curies, conflations) -> SetIDResponse:
+    """
+    Generate a SetID for a set of curies.
+
+    Each CURIE is replaced by its preferred identifier when get_normalized_nodes() provides one and is
+    kept unchanged otherwise. The resulting identifiers are deduplicated, sorted, joined with '||' and
+    turned into a UUID v5 in the uuid_namespace_setid namespace.
+
+    :param app: The NodeNorm app (used to access the databases).
+    :param curies: A list of curies to generate a set ID for.
+    :param conflations: A list of conflations to apply. Must be one or both of 'GeneProtein' and 'DrugChemical'.
+    :return: A SetIDResponse with the Set ID. If an unsupported conflation is requested only `error` is
+        filled in; if there are no CURIEs, `normalized_curies` is empty and no set ID is generated.
+    """
+
+    # Step 0. Prepare the SetIDResponse by filling it with the arguments.
+    response = SetIDResponse(
+        curies=curies,
+        conflations=conflations
+    )
+
+    # Step 1. Reject unsupported conflations before doing any work.
+    if not all(item in ['GeneProtein', 'DrugChemical'] for item in conflations):
+        response.error = "Conflations provided to " + \
+            f"generate_setid() are {conflations}, but only 'GeneProtein' and 'DrugChemical' are allowed."
+        return response
+
+    gene_protein_conflation = "GeneProtein" in conflations
+    drug_chemical_conflation = "DrugChemical" in conflations
+
+    # Step 2. We use get_normalized_nodes() to normalize all the CURIEs for us.
+    normalization_results = await get_normalized_nodes(
+        app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False
+    )
+
+    # Step 3. Collect one identifier per input CURIE into a set, so that duplicates are removed no matter
+    # whether they come from CURIEs that normalized to the same preferred ID (['id1', 'id2', 'id3'] ->
+    # ['nr1', 'nr2', 'nr2'] -> {'nr1', 'nr2'}) or from CURIEs that could not be normalized and were
+    # simply repeated in the input.
+    unique_identifiers = set()
+    for curie in curies:
+        # CURIE must be a string.
+        curie = str(curie)
+
+        # Fall back to the CURIE itself unless normalization gives us a preferred identifier.
+        identifier = curie
+        if curie in normalization_results and normalization_results[curie] is not None:
+            result = normalization_results[curie]
+            if 'id' in result and 'identifier' in result['id']:
+                identifier = result['id']['identifier']
+            else:
+                # We got back a normalization response, but no preferred ID. This shouldn't happen.
+                logging.warning(
+                    f"Normalized CURIE {curie} returned a response but not a preferred identifier: {normalization_results[curie]}"
+                )
+
+        unique_identifiers.add(identifier)
+
+    # Sorting makes the set ID independent of the order in which the CURIEs were supplied.
+    sorted_normalized_curies = sorted(unique_identifiers)
+    response.normalized_curies = sorted_normalized_curies
+
+    # Do we have any normalized CURIEs? If not, return now.
+    if not sorted_normalized_curies:
+        return response
+
+    normalized_string = "||".join(sorted_normalized_curies)
+    response.normalized_string = normalized_string
+
+    # Step 4. Turn the normalized string into an identifier. Options that were tried and rejected as too long:
+    # - a SHA224 hash of the normalized string;
+    # - base64 (optionally with zlib compression), which would have been reversible
+    #   (https://github.com/TranslatorSRI/NodeNormalization/issues/256#issuecomment-2197465751).
+    # We therefore use UUID v5 identifiers with a custom namespace.
+    response.setid = 'uuid:' + str(uuid.uuid5(uuid_namespace_setid, normalized_string))
+
+    return response
diff --git a/redis_config.yaml b/redis_config.yaml
@@ -11,7 +11,7 @@
 "id_to_eqids_db":
   "ssl_enabled": false
   "is_cluster": false
-  "db": 1
+  "db": 0
   "hosts":
     - "host_name": "127.0.0.1"
       "port": "6379"
@@ -20,7 +20,7 @@
 "id_to_type_db":
   "ssl_enabled": false
   "is_cluster": false
-  "db": 2
+  "db": 0
   "hosts":
     - "host_name": "127.0.0.1"
       "port": "6379"
@@ -29,7 +29,7 @@
 "curie_to_bl_type_db":
   "ssl_enabled": false
   "is_cluster": false
-  "db": 3
+  "db": 0
   "hosts":
     - "host_name": "127.0.0.1"
       "port": "6379"
@@ -38,7 +38,7 @@
 "gene_protein_db":
   "ssl_enabled": false
   "is_cluster": false
-  "db": 4
+  "db": 0
   "hosts":
     - "host_name": "127.0.0.1"
       "port": "6379"
@@ -56,7 +56,7 @@
 "info_content_db":
   "ssl_enabled": false
   "is_cluster": false
-  "db": 5
+  "db": 0
   "hosts":
     - "host_name": "127.0.0.1"
       "port": "6379"