Skip to content

Commit

Permalink
Merge pull request #274 from TranslatorSRI/add-set-id-endpoint
Browse files Browse the repository at this point in the history
Adds a `/get_setid` GET endpoint, which can be used to calculate a set ID for a set of CURIEs (implementing the specification described in #256 (comment)). The CURIEs are normalized (note that we don't do any validation to ensure that they are valid CURIEs or even that they look like CURIEs), sorted, and then returned as an SHA-256 hash. This PR also modifies `/get_allowed_conflations` so that it returns `DrugChemical` in addition to `GeneProtein`.
  • Loading branch information
gaurav authored Jul 18, 2024
2 parents 13f67ce + ed943f6 commit 795a036
Show file tree
Hide file tree
Showing 6 changed files with 386 additions and 10 deletions.
2 changes: 1 addition & 1 deletion node_normalizer/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@
"""

from .input import CurieList, SemanticTypesInput
from .response import CuriePivot, SemanticTypes, ConflationList
from .response import CuriePivot, SemanticTypes, ConflationList, SetIDResponse
17 changes: 15 additions & 2 deletions node_normalizer/model/response.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from pydantic import BaseModel

from typing import Dict, List
from typing import Dict, List, Optional


class SemanticTypes(BaseModel):
Expand All @@ -27,5 +27,18 @@ class Config:
# Response model with a single `curie_prefix` field: a dictionary whose keys and values are both strings.
# NOTE(review): presumably keyed by CURIE prefix -- confirm against the endpoint that returns this model.
class CuriePivot(BaseModel):
    curie_prefix: Dict[str, str]


class ConflationList(BaseModel):
conflations: List
conflations: List


class SetIDResponse(BaseModel):
    # Response model for the `/get_setid` endpoint.
    #
    # `curies` and `conflations` echo the request and are always present. Every other field is filled in
    # step by step while the set ID is generated, so each one carries an explicit `None` default: the model
    # can then be constructed from the request arguments alone, without relying on Optional[...] fields
    # being implicitly optional (which is Pydantic v1 behaviour only).
    curies: List[str]
    conflations: List[str]
    # Set when the request could not be processed (e.g. an unsupported conflation was requested).
    error: Optional[str] = None
    # The sorted identifiers that were used to build the set ID.
    normalized_curies: Optional[List[str]] = None
    # The normalized identifiers joined into the single string from which the set ID is derived.
    normalized_string: Optional[str] = None
    # The resulting set ID.
    setid: Optional[str] = None
    # Alternative encodings that were tried and rejected as too long:
    # base64: Optional[str]
    # base64zlib: Optional[str]
    # sha224hash: Optional[str]
26 changes: 24 additions & 2 deletions node_normalizer/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@
CurieList,
SemanticTypesInput,
ConflationList,
SetIDResponse,
)
from .normalizer import get_normalized_nodes, get_curie_prefixes, normalize_message
from .set_id import generate_setid
from .redis_adapter import RedisConnectionFactory
from .util import LoggingUtil
from .examples import EXAMPLE_QUERY_DRUG_TREATS_ESSENTIAL_HYPERTENSION
Expand Down Expand Up @@ -167,8 +169,7 @@ async def get_conflations() -> ConflationList:
"""
Get implemented conflations
"""
# TODO: build from config instead of hard-coding.
conflations = ConflationList(conflations=["GeneProtein"])
conflations = ConflationList(conflations=["GeneProtein", "DrugChemical"])

return conflations

Expand Down Expand Up @@ -222,6 +223,27 @@ async def get_normalized_node_handler(curies: CurieList):
return normalized_nodes


@app.get(
    "/get_setid",
    response_model=SetIDResponse,
    summary="Normalize and deduplicate a set of identifiers and return a single hash that represents this set."
)
async def get_setid(
    # NOTE(review): the default is an empty list while min_items=1 is requested; whether a request that omits
    # `curie` entirely is rejected depends on whether the framework validates defaults -- confirm.
    curie: List[str] = fastapi.Query(
        [],
        description="Set of curies to normalize",
        example=["MESH:D014867", "NCIT:C34373", "UNII:63M8RYN44N", "RUBBISH:1234"],
        min_items=1,
    ),
    conflation: List[str] = fastapi.Query(
        [],
        description="Set of conflations to apply",
        example=["GeneProtein", "DrugChemical"],
    )
) -> SetIDResponse:
    """
    Calculate a set ID for a set of CURIEs.

    Both query parameters may be repeated. The work is delegated to generate_setid(), which receives the
    application object together with the `curie` and `conflation` lists; its SetIDResponse is returned as-is.
    """
    return await generate_setid(app, curie, conflation)


@app.get(
"/get_semantic_types",
response_model=SemanticTypes,
Expand Down
101 changes: 101 additions & 0 deletions node_normalizer/set_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# set_id.py
# Code related to generating IDs for sets (as in https://github.com/TranslatorSRI/NodeNormalization/issues/256).
import base64
import gzip
import hashlib
import logging
import uuid

import zlib

from .model import SetIDResponse
from .normalizer import get_normalized_nodes

# UUID namespace for SetIDs. generate_setid() passes it to uuid.uuid5() together with the normalized string,
# so it must stay fixed: changing this value changes every SetID that is generated.
uuid_namespace_setid = uuid.UUID('14ef168c-14cb-4979-8442-da6aaca55572')


async def generate_setid(app, curies, conflations) -> SetIDResponse:
    """
    Generate a SetID for a set of curies.

    Each curie is normalized with get_normalized_nodes() under the requested conflations and replaced by its
    preferred identifier; curies that cannot be normalized are kept unchanged. The resulting identifiers are
    deduplicated, sorted, joined with '||' and hashed into a UUID v5 within the SetID namespace.

    :param app: The NodeNorm app (used to access the databases).
    :param curies: A list of curies to generate a set ID for.
    :param conflations: A list of conflations to apply. Must be one or both of 'GeneProtein' and 'DrugChemical'.
    :return: A SetIDResponse with the Set ID. If an unsupported conflation is requested, only `error` is set
        (in addition to the echoed arguments). If no curies are provided, `normalized_curies` is an empty list
        and no Set ID is generated.
    """

    # Step 0. Prepare the SetIDResponse by filling it with the arguments.
    response = SetIDResponse(
        curies=curies,
        conflations=conflations
    )

    # Step 1. Reject unsupported conflations before doing any work.
    if not all(item in ['GeneProtein', 'DrugChemical'] for item in conflations):
        response.error = "Conflations provided to " + \
            f"generate_setid() are {conflations}, but only 'GeneProtein' and 'DrugChemical' are allowed."
        return response

    gene_protein_conflation = "GeneProtein" in conflations
    drug_chemical_conflation = "DrugChemical" in conflations

    # Step 2. Normalize the curies given the conflation settings.
    # We use get_normalized_nodes() to normalize all the CURIEs for us.
    normalization_results = await get_normalized_nodes(
        app, curies, gene_protein_conflation, drug_chemical_conflation, include_descriptions=False
    )

    # Step 3. Collect the identifiers into a set so that *every* identifier is deduplicated: both preferred
    # IDs shared by several input curies (['id1', 'id2', 'id3'] normalizing to ['nr1', 'nr2', 'nr2'] gives
    # {'nr1', 'nr2'}) and curies that could not be normalized but were provided more than once. Otherwise
    # the same set of curies could produce different Set IDs depending on how often a curie was repeated.
    unique_identifiers = set()
    for curie in curies:
        # CURIE must be a string.
        curie = str(curie)

        if curie not in normalization_results or normalization_results[curie] is None:
            # No normalized identifier: fall back to the curie as provided.
            unique_identifiers.add(curie)
            continue

        result = normalization_results[curie]
        if 'id' in result and 'identifier' in result['id']:
            unique_identifiers.add(result['id']['identifier'])
        else:
            # We got back a normalization response, but no preferred ID. This shouldn't happen.
            logging.warning(
                "Normalized CURIE %s returned a response but not a preferred identifier: %s",
                curie, result
            )
            unique_identifiers.add(curie)

    sorted_normalized_curies = sorted(unique_identifiers)
    response.normalized_curies = sorted_normalized_curies

    # Do we have any normalized CURIEs? If not, return now.
    if not sorted_normalized_curies:
        return response

    normalized_string = "||".join(sorted_normalized_curies)
    response.normalized_string = normalized_string

    # Step 4. Turn the normalized string into an identifier. Options that were tried and rejected:
    # - SHA224 hash -- but this is too long.
    # - base64+zip, so it would be reversible, which might be something we want at some point
    #   (https://github.com/TranslatorSRI/NodeNormalization/issues/256#issuecomment-2197465751),
    #   but that is also too long.
    # What we use instead: UUID v5 identifiers with a custom namespace.
    response.setid = 'uuid:' + str(uuid.uuid5(uuid_namespace_setid, normalized_string))

    return response
10 changes: 5 additions & 5 deletions redis_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"id_to_eqids_db":
"ssl_enabled": false
"is_cluster": false
"db": 1
"db": 0
"hosts":
- "host_name": "127.0.0.1"
"port": "6379"
Expand All @@ -20,7 +20,7 @@
"id_to_type_db":
"ssl_enabled": false
"is_cluster": false
"db": 2
"db": 0
"hosts":
- "host_name": "127.0.0.1"
"port": "6379"
Expand All @@ -29,7 +29,7 @@
"curie_to_bl_type_db":
"ssl_enabled": false
"is_cluster": false
"db": 3
"db": 0
"hosts":
- "host_name": "127.0.0.1"
"port": "6379"
Expand All @@ -38,7 +38,7 @@
"gene_protein_db":
"ssl_enabled": false
"is_cluster": false
"db": 4
"db": 0
"hosts":
- "host_name": "127.0.0.1"
"port": "6379"
Expand All @@ -56,7 +56,7 @@
"info_content_db":
"ssl_enabled": false
"is_cluster": false
"db": 5
"db": 0
"hosts":
- "host_name": "127.0.0.1"
"port": "6379"
Expand Down
Loading

0 comments on commit 795a036

Please sign in to comment.