From d3c10ad9c5d5c2db2b34782777666479523dce88 Mon Sep 17 00:00:00 2001 From: Kori Kuzma Date: Wed, 27 Sep 2023 09:34:14 -0400 Subject: [PATCH] feat!: use vrs 2.0-alpha core gene model in /normalize (#252) * bump ga4gh.vrs version to include core models * Removes custom Gene Descriptor/Extension/Gene Value Object pydantic classes and uses ga4gh.core models * Remove references to VRSATILE/VODs --- Pipfile | 2 +- README.md | 4 +- docs/source/index.rst | 4 +- .../source/normalizing_data/normalization.rst | 8 +- docs/source/normalizing_data/sources.rst | 4 +- .../normalizing_data/vrs_compliance.rst | 9 +- docs/source/quick_install.rst | 6 +- docs/source/usage.rst | 2 +- gene/query.py | 114 ++-- gene/schemas.py | 147 +++-- setup.cfg | 2 +- tests/unit/test_ensembl_source.py | 10 +- tests/unit/test_ncbi_source.py | 20 +- tests/unit/test_query.py | 582 +++++++++++------- tests/unit/test_schemas.py | 4 +- 15 files changed, 523 insertions(+), 395 deletions(-) diff --git a/Pipfile b/Pipfile index f2bd36bb..0117c2d5 100644 --- a/Pipfile +++ b/Pipfile @@ -9,7 +9,7 @@ fastapi = "*" uvicorn = "*" click = "*" boto3 = "*" -"ga4gh.vrs" = {version = "==2.0.0.dev0"} +"ga4gh.vrs" = "~=2.0.0a1" [dev-packages] gene = {editable = true, path = "."} diff --git a/README.md b/README.md index bf1ddf34..4cbd41ab 100644 --- a/README.md +++ b/README.md @@ -18,9 +18,9 @@ Call the `normalize()` method with a gene term. If available, a rich description ``` >>> result = q.normalize("BRAF") ->>> result.gene_descriptor.gene +>>> result.normalized_id "hgnc:1097" ->>> result.gene_descriptor.alternate_labels +>>> result.gene.aliases ['NS7', 'RAFB1', 'B-raf', 'BRAF-1', 'BRAF1', 'B-RAF1'] ``` diff --git a/docs/source/index.rst b/docs/source/index.rst index 1c1fd49f..047af91b 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -20,9 +20,9 @@ The Gene Normalizer provides tools for resolving ambiguous human gene references >>> from gene.database import create_db >>> q = QueryHandler(create_db()) >>> result = q.normalize("BRAF") - >>> result.gene_descriptor.gene + >>> result.normalized_id "hgnc:1097" - >>> result.gene_descriptor.alternate_labels + >>> result.gene.aliases ['NS7', 'RAFB1', 'B-raf', 'BRAF-1', 'BRAF1', 'B-RAF1'] See the `public REST instance of the service `_ for a demonstration of all queryable endpoints. diff --git a/docs/source/normalizing_data/normalization.rst b/docs/source/normalizing_data/normalization.rst index d954df1f..b07d0e10 100644 --- a/docs/source/normalizing_data/normalization.rst +++ b/docs/source/normalizing_data/normalization.rst @@ -48,7 +48,7 @@ Normalized gene records are constructed by merging known data from all associate The normalized record --------------------- -Normalized records are structured as `Gene Descriptors `_ in conformance with the `GA4GH VRSATILE project `_. The normalized gene concept is provided as a value object, and additional metadata is deposited as a label, xrefs, alternate labels, as well as Extensions for more complex information (such as loci and gene type). The following demonstrates this model for the BRAF gene: +Normalized records are structured as `Genes `_. The normalized gene concept is provided and additional metadata is deposited as a label, xrefs, alternate labels, as well as Extensions for more complex information (such as loci and gene type). The following demonstrates this model for the BRAF gene: .. admonition:: Example @@ -56,7 +56,7 @@ Normalized records are structured as `Gene Descriptors `_ is a service prov } }, { - "id": "ga4gh:SL.rXzVqqlchBvUef98MNQA77FvwSJgiOf5", + "id": "ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", diff --git a/docs/source/normalizing_data/vrs_compliance.rst b/docs/source/normalizing_data/vrs_compliance.rst index ac2a5130..93debb69 100644 --- a/docs/source/normalizing_data/vrs_compliance.rst +++ b/docs/source/normalizing_data/vrs_compliance.rst @@ -1,7 +1,7 @@ VRS Compliance ============== -As mentioned earlier in the documentation, the Gene Normalizer incorporates structures from the `GA4GH Variation Representation Specification (VRS) `_ and from `GA4GH VRSATILE project `_ to integrate more smoothly with other related libraries. The Gene Normalizer is currently released in two branches, corresponding to different VRS releases: +As mentioned earlier in the documentation, the Gene Normalizer incorporates structures from the `GA4GH Variation Representation Specification (VRS) `_ to integrate more smoothly with other related libraries. The Gene Normalizer is currently released in two branches, corresponding to different VRS releases: .. list-table:: :widths: 25 25 25 25 @@ -10,12 +10,9 @@ As mentioned earlier in the documentation, the Gene Normalizer incorporates stru * - Gene Normalizer branch - Gene Normalizer version - VRS version - - VRSATILE version * - `main `_ - 0.1.x - `1.X `_ - - `main `_ * - `staging `_ - - 0.2.x - - `metaschema-update `_ - - `metaschema-update `_ + - 0.3.x + - `metaschema-update `_ diff --git a/docs/source/quick_install.rst b/docs/source/quick_install.rst index 2471d668..59154f2c 100644 --- a/docs/source/quick_install.rst +++ b/docs/source/quick_install.rst @@ -49,9 +49,9 @@ The beginning of the response to a GET request to http://localhost:5000/gene/nor "name": "gene-normalizer", "url": "https://github.com/cancervariants/gene-normalization" }, - "gene_descriptor": { - "id": "normalize.gene:braf", - "type": "GeneDescriptor", + "gene": { + "id": "normalize.gene.hgnc:107", + "type": "Gene", "label": "BRAF", ... diff --git a/docs/source/usage.rst b/docs/source/usage.rst index a0c174da..8d439948 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -35,7 +35,7 @@ Each search mode can be accessed directly within Python using the :ref:`query AP >>> normalized_response >>> normalized_response.match_type - >>> normalized_response.gene_descriptor.label + >>> normalized_response.gene.label 'ERBB2' Critically, the ``QueryHandler`` class must receive a database interface instance as its first argument. The most straightforward way to construct a database instance, as demonstrated above, is with the ``create_db`` method provided in the :py:mod:`gene.database` module. This method tries to build a database connection based on a number of conditions, which are resolved in the following order: diff --git a/gene/query.py b/gene/query.py index cc427ae5..1281780c 100644 --- a/gene/query.py +++ b/gene/query.py @@ -2,9 +2,8 @@ import re from datetime import datetime from typing import Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar -from urllib.parse import quote -from ga4gh.core import ga4gh_identify +from ga4gh.core import core_models, ga4gh_identify from ga4gh.vrs import models from gene import ITEM_TYPES, NAMESPACE_LOOKUP, PREFIX_LOOKUP, logger @@ -12,9 +11,7 @@ from gene.schemas import ( BaseGene, BaseNormalizationService, - Extension, Gene, - GeneDescriptor, GeneTypeFieldName, MatchesNormalized, MatchType, @@ -87,10 +84,10 @@ def _transform_sequence_location(loc: Dict) -> models.SequenceLocation: :param loc: GeneSequenceLocation represented as a dict :return: VRS sequence location """ - sequence = loc["sequence_id"].split("ga4gh:")[-1] + refget_ac = loc["sequence_id"].split("ga4gh:")[-1] return models.SequenceLocation( - sequenceReference=models.SequenceReference(refgetAccession=sequence), + sequenceReference=models.SequenceReference(refgetAccession=refget_ac), start=int(loc["start"]), end=int(loc["end"]), ) @@ -362,15 +359,21 @@ def _add_merged_meta(self, response: NormalizeService) -> NormalizeService: :return: completed response object. """ sources_meta = {} - gene_descr = response.gene_descriptor - xrefs = gene_descr.xrefs or [] # type: ignore - ids = [gene_descr.gene] + xrefs # type: ignore - for concept_id in ids: - prefix = concept_id.split(":")[0] - src_name = PREFIX_LOOKUP[prefix.lower()] - if src_name not in sources_meta: - _source_meta = self.db.get_source_metadata(src_name) - sources_meta[SourceName(src_name)] = SourceMeta(**_source_meta) + gene = response.gene + sources = [response.normalized_id.split(":")[0]] + if gene.mappings: + sources += [m.coding.system for m in gene.mappings] + + for src in sources: + try: + src_name = PREFIX_LOOKUP[src] + except KeyError: + # not an imported source + continue + else: + if src_name not in sources_meta: + _source_meta = self.db.get_source_metadata(src_name) + sources_meta[SourceName(src_name)] = SourceMeta(**_source_meta) response.source_meta_ = sources_meta return response @@ -398,57 +401,69 @@ def _add_alt_matches( ) return response - def _add_gene_descriptor( + def _add_gene( self, response: NormalizeService, record: Dict, match_type: MatchType, possible_concepts: Optional[List[str]] = None, ) -> NormalizeService: - """Add gene descriptor to response. + """Add core Gene object to response. :param response: Response object :param record: Gene record :param match_type: query's match type :param possible_concepts: List of other normalized concepts found - :return: Response with gene descriptor + :return: Response with core Gene """ - params = { - "id": f"normalize.gene:{quote(response.query)}", - "label": record["symbol"], - "gene": record["concept_id"], - } + gene_obj = core_models.Gene( + id=f"normalize.gene.{record['concept_id']}", + label=record["symbol"], + ) - # xrefs - if "xrefs" in record and record["xrefs"]: - params["xrefs"] = record["xrefs"] + # mappings + source_ids = record.get("xrefs", []) + record.get("associated_with", []) + mappings = [] + for source_id in source_ids: + system, code = source_id.split(":") + mappings.append( + core_models.Mapping( + coding=core_models.Coding( + code=core_models.Code(code), system=system.lower() + ), + relation=core_models.Relation.RELATED_MATCH, + ) + ) + if mappings: + gene_obj.mappings = mappings - # alternate labels - alt_labels = set() + # aliases + aliases = set() for key in ["previous_symbols", "aliases"]: if key in record and record[key]: val = record[key] if isinstance(val, str): val = [val] - alt_labels.update(val) - if alt_labels: - params["alternate_labels"] = list(alt_labels) + aliases.update(val) + if aliases: + gene_obj.aliases = list(aliases) # extensions - extensions = list() + extensions = [] extension_and_record_labels = [ ("symbol_status", "symbol_status"), ("approved_name", "label"), - ("associated_with", "associated_with"), ("previous_symbols", "previous_symbols"), ("location_annotations", "location_annotations"), ("strand", "strand"), ] for ext_label, record_label in extension_and_record_labels: if record_label in record and record[record_label]: - extensions.append(Extension(name=ext_label, value=record[record_label])) + extensions.append( + core_models.Extension(name=ext_label, value=record[record_label]) + ) - record_locations = dict() + record_locations = {} if record["item_type"] == RecordType.IDENTITY: locs = record.get("locations") if locs: @@ -459,18 +474,22 @@ def _add_gene_descriptor( record_locations[k] = v for loc_name, locations in record_locations.items(): - transformed_locs = list() + transformed_locs = [] for loc in locations: if loc["type"] == "SequenceLocation": transformed_locs.append(self._transform_location(loc)) - extensions.append(Extension(name=loc_name, value=transformed_locs)) + + if transformed_locs: + extensions.append( + core_models.Extension(name=loc_name, value=transformed_locs) + ) # handle gene types separately because they're wonky if record["item_type"] == RecordType.IDENTITY: gene_type = record.get("gene_type") if gene_type: extensions.append( - Extension( + core_models.Extension( name=GeneTypeFieldName[record["src_name"].upper()].value, value=gene_type, ) @@ -480,15 +499,18 @@ def _add_gene_descriptor( field_name = f.value values = record.get(field_name, []) for value in values: - extensions.append(Extension(name=field_name, value=value)) + extensions.append( + core_models.Extension(name=field_name, value=value) + ) if extensions: - params["extensions"] = extensions + gene_obj.extensions = extensions # add warnings if possible_concepts: response = self._add_alt_matches(response, record, possible_concepts) - response.gene_descriptor = GeneDescriptor(**params) + response.normalized_id = record["concept_id"] + response.gene = gene_obj response = self._add_merged_meta(response) response.match_type = match_type return response @@ -544,18 +566,16 @@ def normalize(self, query: str) -> NormalizeService: >>> from gene.database import create_db >>> q = QueryHandler(create_db()) >>> result = q.normalize("BRAF") - >>> result.gene_descriptor.gene_id + >>> result.normalized_id 'hgnc:1097' - >>> result.xrefs - ['ensembl:ENSG00000157764', 'ncbigene:673'] + >>> result.aliases + ['BRAF1', 'RAFB1', 'B-raf', 'NS7', 'B-RAF1'] :param query: String to find normalized concept for :return: Normalized gene concept """ response = NormalizeService(**self._prepare_normalized_response(query)) - return self._perform_normalized_lookup( - response, query, self._add_gene_descriptor - ) + return self._perform_normalized_lookup(response, query, self._add_gene) def _resolve_merge( self, diff --git a/gene/schemas.py b/gene/schemas.py index f965f020..89cfb6bf 100644 --- a/gene/schemas.py +++ b/gene/schemas.py @@ -1,7 +1,8 @@ """Contains data models for representing VICC normalized gene records.""" from enum import Enum, IntEnum -from typing import Any, Dict, List, Literal, Optional, Union +from typing import Dict, List, Literal, Optional, Union +from ga4gh.core import core_models from ga4gh.vrs import models from pydantic import ( BaseModel, @@ -10,7 +11,6 @@ StrictInt, StrictStr, constr, - field_validator, ) from gene.version import __version__ @@ -18,50 +18,6 @@ CURIE = constr(pattern=r"^\w[^:]*:.+$") -class Extension(BaseModel): - """The Extension class provides VODs with a means to extend descriptions with other - attributes unique to a content provider. These extensions are not expected to be - natively understood under VRSATILE, but may be used for pre-negotiated exchange of - message attributes when needed. - """ - - type: Literal["Extension"] = "Extension" - name: StrictStr - value: Optional[Any] = None - - -class GeneValueObject(BaseModel): - """A reference to a Gene as defined by an authority. For human genes, the use of - `hgnc ` as the gene authority is - RECOMMENDED. - """ - - id: CURIE - type: Literal["Gene"] = "Gene" - - -class GeneDescriptor(BaseModel, extra="forbid"): - """Reference VRS Gene value objects.""" - - id: Optional[StrictStr] = None - type: Literal["GeneDescriptor"] = "GeneDescriptor" - gene: Union[CURIE, GeneValueObject] - label: Optional[StrictStr] = None - description: Optional[StrictStr] = None - xrefs: List[CURIE] = [] - alternate_labels: List[StrictStr] = [] - extensions: List[Extension] = [] - - @field_validator("xrefs") - def check_count_value(cls, v): - """Check xrefs value""" - if v: - assert len(v) == len( - {xref for xref in v} - ), "xrefs must contain unique items" # noqa: E501 - return v - - class SymbolStatus(str, Enum): """Define string constraints for symbol status attribute.""" @@ -427,7 +383,8 @@ class BaseNormalizationService(BaseModel): class NormalizeService(BaseNormalizationService): """Define model for returning normalized concept.""" - gene_descriptor: Optional[GeneDescriptor] = None + normalized_id: Optional[str] = None + gene: Optional[core_models.Gene] = None source_meta_: Dict[SourceName, SourceMeta] = {} model_config = ConfigDict( @@ -436,13 +393,74 @@ class NormalizeService(BaseNormalizationService): "query": "BRAF", "warnings": [], "match_type": 100, - "gene_descriptor": { - "id": "normalize.gene:BRAF", - "type": "GeneDescriptor", - "gene": "hgnc:1097", + "normalized_id": "hgnc:1037", + "gene": { + "type": "Gene", + "id": "normalize.gene.hgnc:1097", "label": "BRAF", - "xrefs": ["ncbigene:673", "ensembl:ENSG00000157764"], - "alternate_labels": ["BRAF1", "RAFB1", "B-raf", "NS7", "B-RAF1"], + "mappings": [ + { + "coding": {"code": "673", "system": "ncbigene"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "ENSG00000157764", "system": "ensembl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS5863", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1943", "system": "iuphar"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "119066", "system": "orphanet"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "BRAF", "system": "cosmic"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "2284096", "system": "pubmed"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "uc003vwc.5", "system": "ucsc"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "164757", "system": "omim"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "NM_004333", "system": "refseq"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS87555", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "P15056", "system": "uniprot"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "M95712", "system": "ena.embl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "OTTHUMG00000157457", "system": "vega"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1565476", "system": "pubmed"}, + "relation": "relatedMatch", + }, + ], + "aliases": ["BRAF1", "RAFB1", "B-raf", "NS7", "B-RAF1"], "extensions": [ { "name": "approved_name", @@ -454,25 +472,6 @@ class NormalizeService(BaseNormalizationService): "value": "approved", "type": "Extension", }, - { - "name": "associated_with", - "value": [ - "ccds:CCDS5863", - "iuphar:1943", - "orphanet:119066", - "cosmic:BRAF", - "pubmed:2284096", - "ucsc:uc003vwc.5", - "omim:164757", - "refseq:NM_004333", - "ccds:CCDS87555", - "uniprot:P15056", - "ena.embl:M95712", - "vega:OTTHUMG00000157457", - "pubmed:1565476", - ], - "type": "Extension", - }, # { # "name": "chromosome_location", # "value": { @@ -632,7 +631,7 @@ class UnmergedNormalizationService(BaseNormalizationService): "location_annotations": [], "locations": [ { - "id": "ga4gh:SL.oyhehgtv3XV3iMTlul7XtMQ_5RSAvts6", # noqa: E501 + "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", # noqa: E501 "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -682,7 +681,7 @@ class UnmergedNormalizationService(BaseNormalizationService): # "end": "q22.1" }, { - "id": "ga4gh:SL.OuUQ-JYrkb92VioFp1P9JLGAbVQA1Wqs", # noqa: E501 + "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", # noqa: E501 "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", diff --git a/setup.cfg b/setup.cfg index ba6169a7..43d2a947 100644 --- a/setup.cfg +++ b/setup.cfg @@ -36,7 +36,7 @@ install_requires = uvicorn click boto3 - ga4gh.vrs == 2.0.0.dev0 + ga4gh.vrs ~= 2.0.0a1 tests_require = pytest diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index d649fa8e..77a73375 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -36,7 +36,7 @@ def ddx11l1(): "location_annotations": [], "locations": [ { - "id": "ga4gh:SL.qwRl0_yhw7TeR2YHOthUScmIHb3WxC5x", + "id": "ga4gh:SL.Ihi0T86UoFIEbH0DHttX2nIw_BdOkI5L", "end": 14409, "start": 11868, "sequenceReference": { @@ -68,7 +68,7 @@ def tp53(): "location_annotations": [], "locations": [ { - "id": "ga4gh:SL.LUROHdtf1GW2jSMsTgeEyRBLwCJhzqan", + "id": "ga4gh:SL.TlGoA-JmP3Xky3RhJ6_UU3eJKq8EpEp9", "end": 7687538, "start": 7661778, "sequenceReference": { @@ -100,7 +100,7 @@ def ATP6AP1_DT(): # noqa: N802 "location_annotations": [], "locations": [ { - "id": "ga4gh:SL.9frk77a5havUlYwgPZZIfv4ZeCb-5F7y", + "id": "ga4gh:SL.bPbeeEGSqjlZJ1Ddmg5T9ptJz9tKxYi3", "end": 154428526, "start": 154424377, "sequenceReference": { @@ -132,7 +132,7 @@ def hsa_mir_1253(): "location_annotations": [], "locations": [ { - "id": "ga4gh:SL.jewt2nx_zdMgfT-gZ8IzkXAeLYSTJj1g", + "id": "ga4gh:SL.x4kOE6ZXG-xY7nm6bu2W7lvm6ljaJXzR", "end": 2748182, "start": 2748077, "sequenceReference": { @@ -164,7 +164,7 @@ def spry3(): "location_annotations": [], "locations": [ { - "id": "ga4gh:SL.bOQPvz0yXBTTPtSHeddE5zcKbQuy_kc-", + "id": "ga4gh:SL.fxU7Axal2_GbyOfW8NQf0plM-SUWFCB0", "end": 155782459, "start": 155612571, "sequenceReference": { diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py index 1010cefc..d3a56415 100644 --- a/tests/unit/test_ncbi_source.py +++ b/tests/unit/test_ncbi_source.py @@ -66,7 +66,7 @@ def dpf1(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.oLxKdt9nWsqa0pXO81Pfpyx6rqIZc6t7", + "id": "ga4gh:SL.0bmpLh_dlBRrzfviiQY9Vg4iEH0XeR20", "end": 38229695, "start": 38211005, "sequenceReference": { @@ -106,7 +106,7 @@ def pdp1_symbol(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.ZcdRLYcMf2AvkpcpofRalXf1v7LMM1L1", + "id": "ga4gh:SL.-455M-S51D8nXPFoGH0dYNFVFAJxm5dG", "end": 93926068, "start": 93916922, "sequenceReference": { @@ -146,7 +146,7 @@ def pdp1_alias(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.SsroS4-L34LHf4Aeh950IZJz0HoUwno9", + "id": "ga4gh:SL.VI_0P0-ei90MDsLjAeUrDfeXBlZVJtJY", "end": 4665258, "start": 4662293, "sequenceReference": { @@ -195,7 +195,7 @@ def spry3(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.tSNZjm5jNs6yveiX4KsFkeWKI1dWL8_P", + "id": "ga4gh:SL.2N5aguRIvBdGemRgABZFutmLTV925dsV", "end": 155782459, "start": 155612585, "sequenceReference": { @@ -205,7 +205,7 @@ def spry3(): "type": "SequenceLocation", }, { - "id": "ga4gh:SL.CqpZwgas9C6armUezb4pKVi-GccILGOh", + "id": "ga4gh:SL.U9E9WtQdzFc4elR3t1qw48nueHgfWFWL", "end": 56968979, "start": 56954315, "sequenceReference": { @@ -290,7 +290,7 @@ def znf84(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.QCbFjOlA2uBs8WyLmeRlRYU5CewYqgeS", + "id": "ga4gh:SL.IRsls9vud2-CiA7Jq4L3ry2VVK7LoNud", "end": 133063299, "start": 133037508, "sequenceReference": { @@ -339,7 +339,7 @@ def slc25a6(): # "end": "p11.2" # }, { - "id": "ga4gh:SL.bFu_HjMueWqLR1aC1DiEtE5pxFI5SVgg", + "id": "ga4gh:SL.dvD-ZopQGZkVWx4Z-vFpP9ateicPHgQ6", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -349,7 +349,7 @@ def slc25a6(): "end": 1392113, }, { - "id": "ga4gh:SL.3j0Ns_ZkksT5_WubJ2Tw5WuA9IlHiPGz", + "id": "ga4gh:SL.bv3LobZZ-sERq5cIthyS4w_tmSwV2QSg", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -484,7 +484,7 @@ def prkrap1(): # "type": "ChromosomeLocation" # }, { - "id": "ga4gh:SL.mkpAMHHHoXgIPXGY3Mr1mF6YQzHftZ7g", + "id": "ga4gh:SL.LwWy5JYncZVnOM9hWiLWW_z0n2eY-peb", "end": 3941874, "start": 3940269, "sequenceReference": { @@ -494,7 +494,7 @@ def prkrap1(): "type": "SequenceLocation", }, { - "id": "ga4gh:SL.vPF_uudaVkiByZywvLxiDd1b5AYKG6Ea", + "id": "ga4gh:SL.q36ql_fX4HrZy_G2EXX_SGWl-7X5Bq6c", "end": 3932085, "start": 3930480, "sequenceReference": { diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index a8e99e87..f767ced1 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -1,10 +1,9 @@ """Module to test the query module.""" -import copy - import pytest +from ga4gh.core import core_models from gene.query import InvalidParameterException, QueryHandler -from gene.schemas import BaseGene, GeneDescriptor, MatchType, SourceName +from gene.schemas import BaseGene, MatchType, SourceName @pytest.fixture(scope="module") @@ -29,14 +28,66 @@ def normalize_unmerged(self, query_str): @pytest.fixture(scope="module") def normalized_ache(): - """Return normalized Gene Descriptor for ACHE.""" + """Return normalized core Gene object for ACHE.""" params = { - "id": "normalize.gene:ACHE", - "type": "GeneDescriptor", - "gene": "hgnc:108", + "type": "Gene", + "id": "normalize.gene.hgnc:108", "label": "ACHE", - "xrefs": {"ensembl:ENSG00000087085", "ncbigene:43"}, - "alternate_labels": ["3.1.1.7", "YT", "N-ACHE", "ARACHE", "ACEE"], + "mappings": [ + { + "coding": {"code": "ENSG00000087085", "system": "ensembl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "43", "system": "ncbigene"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "OTTHUMG00000157033", "system": "vega"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "uc003uxi.4", "system": "ucsc"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS5710", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS64736", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS5709", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "P22303", "system": "uniprot"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1380483", "system": "pubmed"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "100740", "system": "omim"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "S09.979", "system": "merops"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "2465", "system": "iuphar"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "NM_015831", "system": "refseq"}, + "relation": "relatedMatch", + }, + ], + "aliases": ["3.1.1.7", "YT", "N-ACHE", "ARACHE", "ACEE"], "extensions": [ {"name": "previous_symbols", "value": ["ACEE", "YT"], "type": "Extension"}, { @@ -45,23 +96,6 @@ def normalized_ache(): "type": "Extension", }, {"name": "symbol_status", "value": "approved", "type": "Extension"}, - { - "name": "associated_with", - "value": [ - "vega:OTTHUMG00000157033", - "ucsc:uc003uxi.4", - "ccds:CCDS5710", - "ccds:CCDS64736", - "ccds:CCDS5709", - "uniprot:P22303", - "pubmed:1380483", - "omim:100740", - "merops:S09.979", - "iuphar:2465", - "refseq:NM_015831", - ], - "type": "Extension", - }, { "name": "ncbi_locations", "value": [ @@ -74,7 +108,7 @@ def normalized_ache(): # "start": "q22.1" # }, { - "id": "ga4gh:SL.OuUQ-JYrkb92VioFp1P9JLGAbVQA1Wqs", + "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -104,7 +138,7 @@ def normalized_ache(): "name": "ensembl_locations", "value": [ { - "id": "ga4gh:SL.oyhehgtv3XV3iMTlul7XtMQ_5RSAvts6", + "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -126,47 +160,93 @@ def normalized_ache(): {"name": "strand", "type": "Extension", "value": "-"}, ], } - return GeneDescriptor(**params) + return core_models.Gene(**params) @pytest.fixture(scope="module") def normalized_braf(): - """Return normalized Gene Descriptor for BRAF.""" + """Return normalized core Gene object for BRAF.""" params = { - "id": "normalize.gene:BRAF", - "type": "GeneDescriptor", - "gene": "hgnc:1097", + "type": "Gene", + "id": "normalize.gene.hgnc:1097", "label": "BRAF", - "xrefs": {"ensembl:ENSG00000157764", "ncbigene:673"}, - "alternate_labels": ["BRAF1", "BRAF-1", "RAFB1", "NS7", "B-RAF1", "B-raf"], + "mappings": [ + { + "coding": {"code": "673", "system": "ncbigene"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "ENSG00000157764", "system": "ensembl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS5863", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1943", "system": "iuphar"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "119066", "system": "orphanet"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "BRAF", "system": "cosmic"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "2284096", "system": "pubmed"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "uc003vwc.5", "system": "ucsc"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "164757", "system": "omim"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "NM_004333", "system": "refseq"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS87555", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "P15056", "system": "uniprot"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "M95712", "system": "ena.embl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "OTTHUMG00000157457", "system": "vega"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1565476", "system": "pubmed"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS94219", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS94218", "system": "ccds"}, + "relation": "relatedMatch", + }, + ], + "aliases": ["BRAF1", "BRAF-1", "RAFB1", "NS7", "B-RAF1", "B-raf"], "extensions": [ { "name": "approved_name", "value": "B-Raf proto-oncogene, serine/threonine kinase", "type": "Extension", }, - {"name": "symbol_status", "value": "approved", "type": "Extension"}, - { - "name": "associated_with", - "value": [ - "vega:OTTHUMG00000157457", - "ucsc:uc003vwc.5", - "ccds:CCDS5863", - "ccds:CCDS87555", - "ccds:CCDS94218", - "ccds:CCDS94219", - "uniprot:P15056", - "pubmed:2284096", - "pubmed:1565476", - "cosmic:BRAF", - "omim:164757", - "orphanet:119066", - "iuphar:1943", - "ena.embl:M95712", - "refseq:NM_004333", - ], - "type": "Extension", - }, # { # "name": "hgnc_locations", # "value": [ @@ -185,7 +265,7 @@ def normalized_braf(): "name": "ensembl_locations", "value": [ { - "id": "ga4gh:SL.iwWw9B3tkU3TCLF3d8xu4zSQBhpDZfJ6", + "id": "ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -209,7 +289,7 @@ def normalized_braf(): # "end": "q34" # }, { - "id": "ga4gh:SL.rXzVqqlchBvUef98MNQA77FvwSJgiOf5", + "id": "ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -229,21 +309,82 @@ def normalized_braf(): }, {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"}, {"name": "strand", "type": "Extension", "value": "-"}, + {"name": "symbol_status", "type": "Extension", "value": "approved"}, ], } - return GeneDescriptor(**params) + return core_models.Gene(**params) @pytest.fixture(scope="module") def normalized_abl1(): - """Return normalized Gene Descriptor for ABL1.""" + """Return normalized core Gene object for ABL1.""" params = { - "id": "normalize.gene:ABL1", - "type": "GeneDescriptor", - "gene": "hgnc:76", + "type": "Gene", + "id": "normalize.gene.hgnc:76", "label": "ABL1", - "xrefs": {"ensembl:ENSG00000097007", "ncbigene:25"}, - "alternate_labels": [ + "mappings": [ + { + "coding": {"code": "ENSG00000097007", "system": "ensembl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "25", "system": "ncbigene"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "OTTHUMG00000020813", "system": "vega"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "uc004bzv.4", "system": "ucsc"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS35166", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS35165", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "P00519", "system": "uniprot"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1857987", "system": "pubmed"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "12626632", "system": "pubmed"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "ABL1", "system": "cosmic"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "189980", "system": "omim"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "117691", "system": "orphanet"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1923", "system": "iuphar"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "M14752", "system": "ena.embl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "NM_007313", "system": "refseq"}, + "relation": "relatedMatch", + }, + ], + "aliases": [ "c-ABL", "JTK7", "p150", @@ -267,26 +408,6 @@ def normalized_abl1(): "value": "ABL proto-oncogene 1, non-receptor tyrosine kinase", "type": "Extension", }, - {"name": "symbol_status", "value": "approved", "type": "Extension"}, - { - "name": "associated_with", - "value": [ - "vega:OTTHUMG00000020813", - "ucsc:uc004bzv.4", - "ccds:CCDS35166", - "ccds:CCDS35165", - "uniprot:P00519", - "pubmed:1857987", - "pubmed:12626632", - "cosmic:ABL1", - "omim:189980", - "orphanet:117691", - "iuphar:1923", - "ena.embl:M14752", - "refseq:NM_007313", - ], - "type": "Extension", - }, # { # "name": "hgnc_locations", # "value": [ @@ -313,7 +434,7 @@ def normalized_abl1(): # "end": "q34.12" # }, { - "id": "ga4gh:SL.qwMQXDwguWeHsOb5bd7qoLC8zyfxcHzC", + "id": "ga4gh:SL.F1QUtInXQaBEjAJNR1sYHXdp0XC000Qi", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -329,7 +450,7 @@ def normalized_abl1(): "name": "ensembl_locations", "value": [ { - "id": "ga4gh:SL.mL3bBgmOG_mOb3P68os_hfhlPzbqr1MS", + "id": "ga4gh:SL.P9Qu87GYxoWPYh1BdAQC5bTLorjvvye7", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -349,21 +470,62 @@ def normalized_abl1(): }, {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"}, {"name": "strand", "type": "Extension", "value": "+"}, + {"name": "symbol_status", "type": "Extension", "value": "approved"}, ], } - return GeneDescriptor(**params) + return core_models.Gene(**params) @pytest.fixture(scope="module") def normalized_p150(): - """Return normalized Gene Descriptor for p150.""" + """Return normalized core Gene object for p150.""" params = { - "id": "normalize.gene:P150", - "type": "GeneDescriptor", - "gene": "hgnc:1910", + "type": "Gene", + "id": "normalize.gene.hgnc:1910", "label": "CHAF1A", - "xrefs": {"ensembl:ENSG00000167670", "ncbigene:10036"}, - "alternate_labels": [ + "mappings": [ + { + "coding": {"code": "ENSG00000167670", "system": "ensembl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "10036", "system": "ncbigene"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "601246", "system": "omim"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "CCDS32875", "system": "ccds"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "7600578", "system": "pubmed"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "OTTHUMG00000181922", "system": "vega"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "Q13111", "system": "uniprot"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "NM_005483", "system": "refseq"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "U20979", "system": "ena.embl"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "uc002mal.4", "system": "ucsc"}, + "relation": "relatedMatch", + }, + ], + "aliases": [ "CAF1P150", "MGC71229", "CAF-1", @@ -378,21 +540,6 @@ def normalized_p150(): "value": "chromatin assembly factor 1 subunit A", "type": "Extension", }, - {"name": "symbol_status", "value": "approved", "type": "Extension"}, - { - "name": "associated_with", - "value": [ - "omim:601246", - "ccds:CCDS32875", - "pubmed:7600578", - "vega:OTTHUMG00000181922", - "uniprot:Q13111", - "refseq:NM_005483", - "ena.embl:U20979", - "ucsc:uc002mal.4", - ], - "type": "Extension", - }, # { # "name": "hgnc_locations", # "value": [ @@ -411,7 +558,7 @@ def normalized_p150(): "name": "ensembl_locations", "value": [ { - "id": "ga4gh:SL.4RCVIbLVXLWPxvhd3IkRA-yI4o99Uwuq", + "id": "ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -434,7 +581,7 @@ def normalized_p150(): # "end": "p13.3" # }, { - "id": "ga4gh:SL.-EYdfD5JkE4lqRwkCR_NNzaaT0uTYBg2", + "id": "ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -458,9 +605,10 @@ def normalized_p150(): "value": ["LOC107985297"], }, {"name": "strand", "type": "Extension", "value": "+"}, + {"name": "symbol_status", "type": "Extension", "value": "approved"}, ], } - return GeneDescriptor(**params) + return core_models.Gene(**params) @pytest.fixture(scope="module") @@ -469,10 +617,9 @@ def normalized_loc_653303(): normalized results that don't merge records. """ params = { - "id": "normalize.gene:LOC653303", - "type": "GeneDescriptor", + "type": "Gene", "label": "LOC653303", - "alternate_labels": ["LOC196266", "LOC654080", "LOC731196"], + "aliases": ["LOC196266", "LOC654080", "LOC731196"], "extensions": [ { "type": "Extension", @@ -491,7 +638,7 @@ def normalized_loc_653303(): # "end": "q23.3" # }, { - "id": "ga4gh:SL.Iumme4GSaXUPAo0ifaq85LLlA1nT7l5o", + "id": "ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -510,9 +657,9 @@ def normalized_loc_653303(): {"type": "Extension", "name": "ncbi_gene_type", "value": "pseudo"}, {"name": "strand", "type": "Extension", "value": "+"}, ], - "gene": "ncbigene:653303", + "id": "normalize.gene.ncbigene:653303", } - return GeneDescriptor(**params) + return core_models.Gene(**params) @pytest.fixture(scope="module") @@ -542,7 +689,7 @@ def normalize_unmerged_loc_653303(): # "end": "q23.3" # }, { - "id": "ga4gh:SL.Iumme4GSaXUPAo0ifaq85LLlA1nT7l5o", + "id": "ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -624,7 +771,7 @@ def normalize_unmerged_chaf1a(): "location_annotations": [], "locations": [ { - "id": "ga4gh:SL.4RCVIbLVXLWPxvhd3IkRA-yI4o99Uwuq", + "id": "ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -661,7 +808,7 @@ def normalize_unmerged_chaf1a(): # "end": "p13.3" # }, { - "id": "ga4gh:SL.-EYdfD5JkE4lqRwkCR_NNzaaT0uTYBg2", + "id": "ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -708,7 +855,7 @@ def normalize_unmerged_ache(): # "end": "q22.1" # }, { - "id": "ga4gh:SL.OuUQ-JYrkb92VioFp1P9JLGAbVQA1Wqs", + "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -737,7 +884,7 @@ def normalize_unmerged_ache(): "location_annotations": [], "locations": [ { - "id": "ga4gh:SL.oyhehgtv3XV3iMTlul7XtMQ_5RSAvts6", + "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", "type": "SequenceLocation", "sequenceReference": { "type": "SequenceReference", @@ -800,14 +947,30 @@ def normalize_unmerged_ache(): @pytest.fixture(scope="module") def normalized_ifnr(): - """Return normalized Gene Descriptor for IFNR.""" + """Return normalized core Gene object for IFNR.""" params = { - "id": "normalize.gene:IFNR", - "type": "GeneDescriptor", - "gene": "hgnc:5447", + "type": "Gene", + "id": "normalize.gene.hgnc:5447", "label": "IFNR", - "xrefs": {"ncbigene:3466"}, - "alternate_labels": ["IFNGM", "IFNGM2"], + "mappings": [ + { + "coding": {"code": "3466", "system": "ncbigene"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1906174", "system": "pubmed"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "147573", "system": "omim"}, + "relation": "relatedMatch", + }, + { + "coding": {"code": "1193239", "system": "pubmed"}, + "relation": "relatedMatch", + }, + ], + "aliases": ["IFNGM", "IFNGM2"], "extensions": [ { "name": "approved_name", @@ -815,17 +978,13 @@ def normalized_ifnr(): "type": "Extension", }, {"name": "symbol_status", "value": "approved", "type": "Extension"}, - { - "name": "associated_with", - "value": ["pubmed:1906174", "omim:147573", "pubmed:1193239"], - "type": "Extension", - }, + {"name": "symbol_status", "value": "approved", "type": "Extension"}, {"name": "ncbi_gene_type", "type": "Extension", "value": "unknown"}, {"name": "hgnc_locus_type", "type": "Extension", "value": "unknown"}, {"name": "location_annotations", "type": "Extension", "value": ["16"]}, ], } - return GeneDescriptor(**params) + return core_models.Gene(**params) @pytest.fixture(scope="module") @@ -861,7 +1020,7 @@ def compare_normalize_resp( resp, expected_query, expected_match_type, - expected_gene_descriptor, + expected_gene, expected_warnings=None, expected_source_meta=None, ): @@ -869,7 +1028,8 @@ def compare_normalize_resp( assert resp.query == expected_query compare_warnings(resp.warnings, expected_warnings) assert resp.match_type == expected_match_type - compare_gene_descriptor(expected_gene_descriptor, resp.gene_descriptor) + assert resp.normalized_id == expected_gene.id.split("normalize.gene.")[-1] + compare_gene(expected_gene, resp.gene) if not expected_source_meta: assert resp.source_meta_ == {} else: @@ -929,19 +1089,35 @@ def compare_service_meta(service_meta): assert service_meta.url == "https://github.com/cancervariants/gene-normalization" -def compare_gene_descriptor(test, actual): - """Test that actual and expected gene descriptors match.""" +def compare_gene(test, actual): + """Test that actual and expected core gene objects match.""" assert actual.id == test.id assert actual.type == test.type - assert actual.gene == test.gene assert actual.label == test.label - if actual.xrefs or test.xrefs: - assert set(actual.xrefs) == set(test.xrefs), "xrefs" - assert set(actual.alternate_labels) == set(test.alternate_labels), "alt labels" + + assert bool(actual.mappings) == bool(test.mappings) + if actual.mappings: + no_matches = [] + for actual_mapping in actual.mappings: + match = None + for fixture_mapping in test.mappings: + if actual_mapping == fixture_mapping: + match = actual_mapping + break + if not match: + no_matches.append(actual_mapping) + assert no_matches == [], no_matches + assert len(actual.mappings) == len(test.mappings) + + assert set(actual.aliases) == set(test.aliases), "aliases" extensions_present = "extensions" in test.model_fields.keys() assert ("extensions" in actual.model_fields.keys()) == extensions_present if extensions_present: - assert len(actual.extensions) == len(test.extensions), "len of extensions" + actual_ext_names = sorted([ext.name for ext in actual.extensions]) + unique_actual_ext_names = sorted(set(actual_ext_names)) + assert actual_ext_names == unique_actual_ext_names, "duplicate extension names" + test_ext_names = {ext.name for ext in test.extensions} + assert set(actual_ext_names) == test_ext_names, "extension names dont match" n_ext_correct = 0 for test_ext in test.extensions: for actual_ext in actual.extensions: @@ -1037,89 +1213,67 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): q = "ache" resp = query_handler.normalize(q) - cpy_normalized_ache = copy.deepcopy(normalized_ache) - cpy_normalized_ache.id = "normalize.gene:ache" compare_normalize_resp( - resp, q, MatchType.SYMBOL, cpy_normalized_ache, expected_source_meta=source_meta + resp, q, MatchType.SYMBOL, normalized_ache, expected_source_meta=source_meta ) q = "hgnc:108" resp = query_handler.normalize(q) - cpy_normalized_ache.id = "normalize.gene:hgnc%3A108" compare_normalize_resp( - resp, - q, - MatchType.CONCEPT_ID, - cpy_normalized_ache, - expected_source_meta=source_meta, + resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta ) q = "ensembl:ENSG00000087085" resp = query_handler.normalize(q) - cpy_normalized_ache.id = "normalize.gene:ensembl%3AENSG00000087085" compare_normalize_resp( - resp, - q, - MatchType.CONCEPT_ID, - cpy_normalized_ache, - expected_source_meta=source_meta, + resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta ) q = "ncbigene:43" resp = query_handler.normalize(q) - cpy_normalized_ache.id = "normalize.gene:ncbigene%3A43" compare_normalize_resp( - resp, - q, - MatchType.CONCEPT_ID, - cpy_normalized_ache, - expected_source_meta=source_meta, + resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta ) q = "3.1.1.7" resp = query_handler.normalize(q) - cpy_normalized_ache.id = "normalize.gene:3.1.1.7" compare_normalize_resp( - resp, q, MatchType.ALIAS, cpy_normalized_ache, expected_source_meta=source_meta + resp, q, MatchType.ALIAS, normalized_ache, expected_source_meta=source_meta ) q = "ARACHE" resp = query_handler.normalize(q) - cpy_normalized_ache.id = "normalize.gene:ARACHE" compare_normalize_resp( - resp, q, MatchType.ALIAS, cpy_normalized_ache, expected_source_meta=source_meta + resp, q, MatchType.ALIAS, normalized_ache, expected_source_meta=source_meta ) q = "YT" resp = query_handler.normalize(q) - cpy_normalized_ache.id = "normalize.gene:YT" compare_normalize_resp( resp, q, MatchType.PREV_SYMBOL, - cpy_normalized_ache, + normalized_ache, expected_source_meta=source_meta, ) q = "ACEE" resp = query_handler.normalize(q) - cpy_normalized_ache.id = "normalize.gene:ACEE" compare_normalize_resp( resp, q, MatchType.PREV_SYMBOL, - cpy_normalized_ache, + normalized_ache, expected_source_meta=source_meta, ) q = "omim:100740" resp = query_handler.normalize(q) - cpy_normalized_ache.id = "normalize.gene:omim%3A100740" compare_normalize_resp( resp, q, MatchType.ASSOCIATED_WITH, - cpy_normalized_ache, + normalized_ache, expected_source_meta=source_meta, ) @@ -1157,67 +1311,47 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): q = "braf" resp = query_handler.normalize(q) - cpy_normalized_braf = copy.deepcopy(normalized_braf) - cpy_normalized_braf.id = "normalize.gene:braf" compare_normalize_resp( - resp, q, MatchType.SYMBOL, cpy_normalized_braf, expected_source_meta=source_meta + resp, q, MatchType.SYMBOL, normalized_braf, expected_source_meta=source_meta ) q = "hgnc:1097" resp = query_handler.normalize(q) - cpy_normalized_braf.id = "normalize.gene:hgnc%3A1097" compare_normalize_resp( - resp, - q, - MatchType.CONCEPT_ID, - cpy_normalized_braf, - expected_source_meta=source_meta, + resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta ) q = "ensembl:ENSG00000157764" resp = query_handler.normalize(q) - cpy_normalized_braf.id = "normalize.gene:ensembl%3AENSG00000157764" compare_normalize_resp( - resp, - q, - MatchType.CONCEPT_ID, - cpy_normalized_braf, - expected_source_meta=source_meta, + resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta ) q = "ncbigene:673" resp = query_handler.normalize(q) - cpy_normalized_braf.id = "normalize.gene:ncbigene%3A673" compare_normalize_resp( - resp, - q, - MatchType.CONCEPT_ID, - cpy_normalized_braf, - expected_source_meta=source_meta, + resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta ) q = "NS7" resp = query_handler.normalize(q) - cpy_normalized_braf.id = "normalize.gene:NS7" compare_normalize_resp( - resp, q, MatchType.ALIAS, cpy_normalized_braf, expected_source_meta=source_meta + resp, q, MatchType.ALIAS, normalized_braf, expected_source_meta=source_meta ) q = "b-raf" resp = query_handler.normalize(q) - cpy_normalized_braf.id = "normalize.gene:b-raf" compare_normalize_resp( - resp, q, MatchType.ALIAS, cpy_normalized_braf, expected_source_meta=source_meta + resp, q, MatchType.ALIAS, normalized_braf, expected_source_meta=source_meta ) q = "omim:164757" resp = query_handler.normalize(q) - cpy_normalized_braf.id = "normalize.gene:omim%3A164757" compare_normalize_resp( resp, q, MatchType.ASSOCIATED_WITH, - cpy_normalized_braf, + normalized_braf, expected_source_meta=source_meta, ) @@ -1255,93 +1389,71 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): q = "abl1" resp = query_handler.normalize(q) - cpy_normalized_abl1 = copy.deepcopy(normalized_abl1) - cpy_normalized_abl1.id = "normalize.gene:abl1" compare_normalize_resp( - resp, q, MatchType.SYMBOL, cpy_normalized_abl1, expected_source_meta=source_meta + resp, q, MatchType.SYMBOL, normalized_abl1, expected_source_meta=source_meta ) q = "hgnc:76" resp = query_handler.normalize(q) - cpy_normalized_abl1.id = "normalize.gene:hgnc%3A76" compare_normalize_resp( - resp, - q, - MatchType.CONCEPT_ID, - cpy_normalized_abl1, - expected_source_meta=source_meta, + resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta ) q = "ensembl:ENSG00000097007" resp = query_handler.normalize(q) - cpy_normalized_abl1.id = "normalize.gene:ensembl%3AENSG00000097007" compare_normalize_resp( - resp, - q, - MatchType.CONCEPT_ID, - cpy_normalized_abl1, - expected_source_meta=source_meta, + resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta ) q = "ncbigene:25" resp = query_handler.normalize(q) - cpy_normalized_abl1.id = "normalize.gene:ncbigene%3A25" compare_normalize_resp( - resp, - q, - MatchType.CONCEPT_ID, - cpy_normalized_abl1, - expected_source_meta=source_meta, + resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta ) q = "v-abl" resp = query_handler.normalize(q) - cpy_normalized_abl1.id = "normalize.gene:v-abl" compare_normalize_resp( - resp, q, MatchType.ALIAS, cpy_normalized_abl1, expected_source_meta=source_meta + resp, q, MatchType.ALIAS, normalized_abl1, expected_source_meta=source_meta ) q = "LOC116063" resp = query_handler.normalize(q) - cpy_normalized_abl1.id = "normalize.gene:LOC116063" compare_normalize_resp( resp, q, MatchType.PREV_SYMBOL, - cpy_normalized_abl1, + normalized_abl1, expected_source_meta=source_meta, ) q = "LOC112779" resp = query_handler.normalize(q) - cpy_normalized_abl1.id = "normalize.gene:LOC112779" compare_normalize_resp( resp, q, MatchType.PREV_SYMBOL, - cpy_normalized_abl1, + normalized_abl1, expected_source_meta=source_meta, ) q = "ABL" resp = query_handler.normalize(q) - cpy_normalized_abl1.id = "normalize.gene:ABL" compare_normalize_resp( resp, q, MatchType.PREV_SYMBOL, - cpy_normalized_abl1, + normalized_abl1, expected_source_meta=source_meta, ) q = "refseq:NM_007313" resp = query_handler.normalize(q) - cpy_normalized_abl1.id = "normalize.gene:refseq%3ANM_007313" compare_normalize_resp( resp, q, MatchType.ASSOCIATED_WITH, - cpy_normalized_abl1, + normalized_abl1, expected_source_meta=source_meta, ) @@ -1373,7 +1485,7 @@ def test_multiple_norm_concepts(query_handler, normalized_p150, source_meta): def test_normalize_single_entry(query_handler, normalized_loc_653303): """Test that the normalized endpoint correctly shapes unmerged identity - records into gene descriptors. + records into core gene objects. """ q = "LOC653303" resp = query_handler.normalize(q) diff --git a/tests/unit/test_schemas.py b/tests/unit/test_schemas.py index 6e4d320b..3d5fceed 100644 --- a/tests/unit/test_schemas.py +++ b/tests/unit/test_schemas.py @@ -3,7 +3,7 @@ import pytest from ga4gh.vrs import models -from gene.schemas import Gene, GeneValueObject +from gene.schemas import Gene # @pytest.fixture(scope='module') # def chromosome_location(): @@ -102,7 +102,7 @@ def test_gene(gene, sequence_location): match_type=100, concept_id="hgnc:1096", symbol="BRAF", - locations=[GeneValueObject(id="hgnc:1")], + locations=["GRCh38:chr1"], ) # location not a list