feat!: use vrs 2.0-alpha core gene model in /normalize (#252)

* Bump ga4gh.vrs version to include core models
* Remove custom Gene Descriptor/Extension/Gene Value Object pydantic classes and use ga4gh.core models
* Remove references to VRSATILE/VODs
cancervariants · Sep 27, 2023 · d3c10ad · d3c10ad
1 parent b1b1aeb
commit d3c10ad
Show file tree

Hide file tree

Showing 15 changed files with 523 additions and 395 deletions.
diff --git a/Pipfile b/Pipfile
@@ -9,7 +9,7 @@ fastapi = "*"
 uvicorn = "*"
 click = "*"
 boto3 = "*"
-"ga4gh.vrs" = {version = "==2.0.0.dev0"}
+"ga4gh.vrs" = "~=2.0.0a1"
 
 [dev-packages]
 gene = {editable = true, path = "."}

diff --git a/README.md b/README.md
@@ -18,9 +18,9 @@ Call the `normalize()` method with a gene term. If available, a rich description
 
 ```
 >>> result = q.normalize("BRAF")
->>> result.gene_descriptor.gene
+>>> result.normalized_id
 "hgnc:1097"
->>> result.gene_descriptor.alternate_labels
+>>> result.gene.aliases
 ['NS7', 'RAFB1', 'B-raf', 'BRAF-1', 'BRAF1', 'B-RAF1']
 ```
 

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -20,9 +20,9 @@ The Gene Normalizer provides tools for resolving ambiguous human gene references
     >>> from gene.database import create_db
     >>> q = QueryHandler(create_db())
     >>> result = q.normalize("BRAF")
-    >>> result.gene_descriptor.gene
+    >>> result.normalized_id
     "hgnc:1097"
-    >>> result.gene_descriptor.alternate_labels
+    >>> result.gene.aliases
     ['NS7', 'RAFB1', 'B-raf', 'BRAF-1', 'BRAF1', 'B-RAF1']
 
 See the `public REST instance of the service <https://normalize.cancervariants.org/gene>`_ for a demonstration of all queryable endpoints.

diff --git a/docs/source/normalizing_data/normalization.rst b/docs/source/normalizing_data/normalization.rst
@@ -48,15 +48,15 @@ Normalized gene records are constructed by merging known data from all associate
 The normalized record
 ---------------------
 
-Normalized records are structured as `Gene Descriptors <https://vrsatile.readthedocs.io/en/latest/>`_ in conformance with the `GA4GH VRSATILE project <https://vrsatile.readthedocs.io/en/latest/>`_. The normalized gene concept is provided as a value object, and additional metadata is deposited as a label, xrefs, alternate labels, as well as Extensions for more complex information (such as loci and gene type). The following demonstrates this model for the BRAF gene:
+Normalized records are structured as `Genes <https://github.com/ga4gh/vrs/tree/2.0-alpha>`_, following the core gene model from the GA4GH VRS 2.0-alpha release. Each record carries an identifier and a label; cross-references to other sources are provided as mappings, alternate names as aliases, and more complex information (such as loci and gene type) as Extensions. The following demonstrates this model for the BRAF gene:
 
 .. admonition:: Example
 
   .. code-block:: json
 
     {
       "id": "normalize.gene.hgnc:1097",
-      "type": "GeneDescriptor",
+      "type": "Gene",
       "label": "BRAF",
       "extensions": [
         {
@@ -95,7 +95,7 @@ Normalized records are structured as `Gene Descriptors <https://vrsatile.readthe
           "name": "ensembl_locations",
           "value": [
             {
-              "id": "ga4gh:SL.iwWw9B3tkU3TCLF3d8xu4zSQBhpDZfJ6",
+              "id": "ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf",
               "type": "SequenceLocation",
               "sequenceReference": {
                 "type": "SequenceReference",
@@ -138,7 +138,7 @@ Normalized records are structured as `Gene Descriptors <https://vrsatile.readthe
               "end": "q34"
             },
             {
-              "id": "ga4gh:SL.rXzVqqlchBvUef98MNQA77FvwSJgiOf5",
+              "id": "ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE",
               "type": "SequenceLocation",
               "sequenceReference": {
                 "type": "SequenceReference",

diff --git a/docs/source/normalizing_data/sources.rst b/docs/source/normalizing_data/sources.rst
@@ -84,7 +84,7 @@ Ensembl
       "location_annotations": [],
       "locations": [
         {
-          "id": "ga4gh:SL.iwWw9B3tkU3TCLF3d8xu4zSQBhpDZfJ6",
+          "id": "ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf",
           "type": "SequenceLocation",
           "sequenceReference": {
             "type": "SequenceReference",
@@ -133,7 +133,7 @@ The `NCBI Gene Database <https://www.ncbi.nlm.nih.gov/gene/>`_ is a service prov
           }
         },
         {
-          "id": "ga4gh:SL.rXzVqqlchBvUef98MNQA77FvwSJgiOf5",
+          "id": "ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE",
           "type": "SequenceLocation",
           "sequenceReference": {
             "type": "SequenceReference",

diff --git a/docs/source/normalizing_data/vrs_compliance.rst b/docs/source/normalizing_data/vrs_compliance.rst
@@ -1,7 +1,7 @@
 VRS Compliance
 ==============
 
-As mentioned earlier in the documentation, the Gene Normalizer incorporates structures from the `GA4GH Variation Representation Specification (VRS) <https://vrs.ga4gh.org/en/stable/>`_ and from `GA4GH VRSATILE project <https://vrsatile.readthedocs.io/en/latest/>`_ to integrate more smoothly with other related libraries. The Gene Normalizer is currently released in two branches, corresponding to different VRS releases:
+As mentioned earlier in the documentation, the Gene Normalizer incorporates structures from the `GA4GH Variation Representation Specification (VRS) <https://vrs.ga4gh.org/en/stable/>`_ to integrate more smoothly with other related libraries. The Gene Normalizer is currently released in two branches, corresponding to different VRS releases:
 
 .. list-table::
    :widths: 25 25 25 25
@@ -10,12 +10,9 @@ As mentioned earlier in the documentation, the Gene Normalizer incorporates stru
    * - Gene Normalizer branch
      - Gene Normalizer version
      - VRS version
-     - VRSATILE version
    * - `main <https://github.com/cancervariants/gene-normalization>`_
      - 0.1.x
      - `1.X <https://github.com/ga4gh/vrs>`_
-     - `main <https://github.com/ga4gh/vrsatile/tree/main>`_
    * - `staging <https://github.com/cancervariants/gene-normalization/tree/staging>`_
-     - 0.2.x
-     - `metaschema-update <https://github.com/ga4gh/vrs/tree/metaschema-update>`_
-     - `metaschema-update <https://github.com/ga4gh/vrsatile/tree/metaschema-update>`_
+     - 0.3.x
+     - `2.0-alpha <https://github.com/ga4gh/vrs/tree/2.0-alpha>`_
diff --git a/docs/source/quick_install.rst b/docs/source/quick_install.rst
@@ -49,9 +49,9 @@ The beginning of the response to a GET request to http://localhost:5000/gene/nor
         "name": "gene-normalizer",
         "url": "https://github.com/cancervariants/gene-normalization"
       },
-      "gene_descriptor": {
-        "id": "normalize.gene:braf",
-        "type": "GeneDescriptor",
+      "gene": {
+        "id": "normalize.gene.hgnc:1097",
+        "type": "Gene",
         "label": "BRAF",
 
         ...

diff --git a/docs/source/usage.rst b/docs/source/usage.rst
@@ -35,7 +35,7 @@ Each search mode can be accessed directly within Python using the :ref:`query AP
     >>> normalized_response
     >>> normalized_response.match_type
     <MatchType.ALIAS: 60>
-    >>> normalized_response.gene_descriptor.label
+    >>> normalized_response.gene.label
     'ERBB2'
 
 Critically, the ``QueryHandler`` class must receive a database interface instance as its first argument. The most straightforward way to construct a database instance, as demonstrated above, is with the ``create_db`` method provided in the :py:mod:`gene.database` module. This method tries to build a database connection based on a number of conditions, which are resolved in the following order:

diff --git a/gene/query.py b/gene/query.py
@@ -2,19 +2,16 @@
 import re
 from datetime import datetime
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar
-from urllib.parse import quote
 
-from ga4gh.core import ga4gh_identify
+from ga4gh.core import core_models, ga4gh_identify
 from ga4gh.vrs import models
 
 from gene import ITEM_TYPES, NAMESPACE_LOOKUP, PREFIX_LOOKUP, logger
 from gene.database import AbstractDatabase, DatabaseReadException
 from gene.schemas import (
     BaseGene,
     BaseNormalizationService,
-    Extension,
     Gene,
-    GeneDescriptor,
     GeneTypeFieldName,
     MatchesNormalized,
     MatchType,
@@ -87,10 +84,10 @@ def _transform_sequence_location(loc: Dict) -> models.SequenceLocation:
         :param loc: GeneSequenceLocation represented as a dict
         :return: VRS sequence location
         """
-        sequence = loc["sequence_id"].split("ga4gh:")[-1]
+        refget_ac = loc["sequence_id"].split("ga4gh:")[-1]
 
         return models.SequenceLocation(
-            sequenceReference=models.SequenceReference(refgetAccession=sequence),
+            sequenceReference=models.SequenceReference(refgetAccession=refget_ac),
             start=int(loc["start"]),
             end=int(loc["end"]),
         )
@@ -362,15 +359,21 @@ def _add_merged_meta(self, response: NormalizeService) -> NormalizeService:
         :return: completed response object.
         """
         sources_meta = {}
-        gene_descr = response.gene_descriptor
-        xrefs = gene_descr.xrefs or []  # type: ignore
-        ids = [gene_descr.gene] + xrefs  # type: ignore
-        for concept_id in ids:
-            prefix = concept_id.split(":")[0]
-            src_name = PREFIX_LOOKUP[prefix.lower()]
-            if src_name not in sources_meta:
-                _source_meta = self.db.get_source_metadata(src_name)
-                sources_meta[SourceName(src_name)] = SourceMeta(**_source_meta)
+        gene = response.gene
+        sources = [response.normalized_id.split(":")[0]]
+        if gene.mappings:
+            sources += [m.coding.system for m in gene.mappings]
+
+        for src in sources:
+            try:
+                src_name = PREFIX_LOOKUP[src]
+            except KeyError:
+                # not an imported source
+                continue
+            else:
+                if src_name not in sources_meta:
+                    _source_meta = self.db.get_source_metadata(src_name)
+                    sources_meta[SourceName(src_name)] = SourceMeta(**_source_meta)
         response.source_meta_ = sources_meta
         return response
 
@@ -398,57 +401,69 @@ def _add_alt_matches(
             )
         return response
 
-    def _add_gene_descriptor(
+    def _add_gene(
         self,
         response: NormalizeService,
         record: Dict,
         match_type: MatchType,
         possible_concepts: Optional[List[str]] = None,
     ) -> NormalizeService:
-        """Add gene descriptor to response.
+        """Add core Gene object to response.
 
         :param response: Response object
         :param record: Gene record
         :param match_type: query's match type
         :param possible_concepts: List of other normalized concepts found
-        :return: Response with gene descriptor
+        :return: Response with core Gene
         """
-        params = {
-            "id": f"normalize.gene:{quote(response.query)}",
-            "label": record["symbol"],
-            "gene": record["concept_id"],
-        }
+        gene_obj = core_models.Gene(
+            id=f"normalize.gene.{record['concept_id']}",
+            label=record["symbol"],
+        )
 
-        # xrefs
-        if "xrefs" in record and record["xrefs"]:
-            params["xrefs"] = record["xrefs"]
+        # mappings
+        source_ids = record.get("xrefs", []) + record.get("associated_with", [])
+        mappings = []
+        for source_id in source_ids:
+            system, code = source_id.split(":")
+            mappings.append(
+                core_models.Mapping(
+                    coding=core_models.Coding(
+                        code=core_models.Code(code), system=system.lower()
+                    ),
+                    relation=core_models.Relation.RELATED_MATCH,
+                )
+            )
+        if mappings:
+            gene_obj.mappings = mappings
 
-        # alternate labels
-        alt_labels = set()
+        # aliases
+        aliases = set()
         for key in ["previous_symbols", "aliases"]:
             if key in record and record[key]:
                 val = record[key]
                 if isinstance(val, str):
                     val = [val]
-                alt_labels.update(val)
-        if alt_labels:
-            params["alternate_labels"] = list(alt_labels)
+                aliases.update(val)
+        if aliases:
+            gene_obj.aliases = list(aliases)
 
         # extensions
-        extensions = list()
+        extensions = []
         extension_and_record_labels = [
             ("symbol_status", "symbol_status"),
             ("approved_name", "label"),
-            ("associated_with", "associated_with"),
             ("previous_symbols", "previous_symbols"),
             ("location_annotations", "location_annotations"),
             ("strand", "strand"),
         ]
         for ext_label, record_label in extension_and_record_labels:
             if record_label in record and record[record_label]:
-                extensions.append(Extension(name=ext_label, value=record[record_label]))
+                extensions.append(
+                    core_models.Extension(name=ext_label, value=record[record_label])
+                )
 
-        record_locations = dict()
+        record_locations = {}
         if record["item_type"] == RecordType.IDENTITY:
             locs = record.get("locations")
             if locs:
@@ -459,18 +474,22 @@ def _add_gene_descriptor(
                     record_locations[k] = v
 
         for loc_name, locations in record_locations.items():
-            transformed_locs = list()
+            transformed_locs = []
             for loc in locations:
                 if loc["type"] == "SequenceLocation":
                     transformed_locs.append(self._transform_location(loc))
-            extensions.append(Extension(name=loc_name, value=transformed_locs))
+
+            if transformed_locs:
+                extensions.append(
+                    core_models.Extension(name=loc_name, value=transformed_locs)
+                )
 
         # handle gene types separately because they're wonky
         if record["item_type"] == RecordType.IDENTITY:
             gene_type = record.get("gene_type")
             if gene_type:
                 extensions.append(
-                    Extension(
+                    core_models.Extension(
                         name=GeneTypeFieldName[record["src_name"].upper()].value,
                         value=gene_type,
                     )
@@ -480,15 +499,18 @@ def _add_gene_descriptor(
                 field_name = f.value
                 values = record.get(field_name, [])
                 for value in values:
-                    extensions.append(Extension(name=field_name, value=value))
+                    extensions.append(
+                        core_models.Extension(name=field_name, value=value)
+                    )
         if extensions:
-            params["extensions"] = extensions
+            gene_obj.extensions = extensions
 
         # add warnings
         if possible_concepts:
             response = self._add_alt_matches(response, record, possible_concepts)
 
-        response.gene_descriptor = GeneDescriptor(**params)
+        response.normalized_id = record["concept_id"]
+        response.gene = gene_obj
         response = self._add_merged_meta(response)
         response.match_type = match_type
         return response
@@ -544,18 +566,16 @@ def normalize(self, query: str) -> NormalizeService:
         >>> from gene.database import create_db
         >>> q = QueryHandler(create_db())
         >>> result = q.normalize("BRAF")
-        >>> result.gene_descriptor.gene_id
+        >>> result.normalized_id
         'hgnc:1097'
-        >>> result.xrefs
-        ['ensembl:ENSG00000157764', 'ncbigene:673']
+        >>> result.gene.aliases
+        ['BRAF1', 'RAFB1', 'B-raf', 'NS7', 'B-RAF1']
 
         :param query: String to find normalized concept for
         :return: Normalized gene concept
         """
         response = NormalizeService(**self._prepare_normalized_response(query))
-        return self._perform_normalized_lookup(
-            response, query, self._add_gene_descriptor
-        )
+        return self._perform_normalized_lookup(response, query, self._add_gene)
 
     def _resolve_merge(
         self,