diff --git a/gene/main.py b/gene/main.py index 4679b2b8..7c4e9f54 100644 --- a/gene/main.py +++ b/gene/main.py @@ -4,7 +4,8 @@ from typing import Optional from gene import __version__ from gene.query import QueryHandler, InvalidParameterException -from gene.schemas import SearchService, NormalizeService +from gene.schemas import SearchService, NormalizeService, \ + UnmergedNormalizationService import html @@ -96,7 +97,7 @@ def search(q: str = Query(..., description=q_descr), # noqa: D103 normalize_summary = "Given query, provide merged normalized record." normalize_response_descr = "A response to a validly-formed query." normalize_descr = "Return merged highest-match concept for query." -normalize_q_desecr = "Gene to normalize." +normalize_q_descr = "Gene to normalize." @app.get("/gene/normalize", @@ -104,7 +105,7 @@ def search(q: str = Query(..., description=q_descr), # noqa: D103 response_description=normalize_response_descr, response_model=NormalizeService, description=normalize_descr) -def normalize(q: str = Query(..., description=normalize_q_desecr)): +def normalize(q: str = Query(..., description=normalize_q_descr)): """Return strongest match concepts to query string provided by user. 
:param str q: gene search term @@ -115,3 +116,33 @@ def normalize(q: str = Query(..., description=normalize_q_desecr)): except InvalidParameterException as e: raise HTTPException(status_code=422, detail=str(e)) return resp + + +unmerged_matches_summary = ("Given query, provide source records corresponding to " + "normalized concept.") +unmerged_response_descr = ("Response containing source records contained within " + "normalized concept.") +unmerged_normalize_description = ("Return unmerged records associated with the " + "normalized result of the user-provided query " + "string.") + + +@app.get("/gene/normalize_unmerged", + summary=unmerged_matches_summary, + operation_id="getUnmergedRecords", + response_description=unmerged_response_descr, + response_model=UnmergedNormalizationService, + description=unmerged_normalize_description) +def normalize_unmerged( + q: str = Query(..., description=normalize_q_descr) +) -> UnmergedNormalizationService: + """Return all individual records associated with a normalized concept. 
+ + :param q: Gene search term + :returns: JSON response with matching normalized record and source metadata + """ + try: + response = query_handler.normalize_unmerged(html.unescape(q)) + except InvalidParameterException as e: + raise HTTPException(status_code=422, detail=str(e)) + return response diff --git a/gene/query.py b/gene/query.py index 8ed6ff77..32ec2e6c 100644 --- a/gene/query.py +++ b/gene/query.py @@ -1,13 +1,14 @@ """This module provides methods for handling queries.""" import re -from typing import List, Dict, Set +from typing import List, Dict, Set, Any, TypeVar, Callable, Optional from urllib.parse import quote from .version import __version__ from gene import NAMESPACE_LOOKUP, PREFIX_LOOKUP, ITEM_TYPES from gene.database import Database -from gene.schemas import Gene, SourceMeta, MatchType, SourceName, \ +from gene.schemas import BaseGene, Gene, SourceMeta, MatchType, SourceName, \ ServiceMeta, SourcePriority, NormalizeService, SearchService, \ - GeneTypeFieldName + GeneTypeFieldName, UnmergedNormalizationService, MatchesNormalized, \ + BaseNormalizationService from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor, Extension from botocore.exceptions import ClientError from boto3.dynamodb.conditions import Key @@ -15,6 +16,9 @@ from gene import logger +NormService = TypeVar("NormService", bound=BaseNormalizationService) + + class InvalidParameterException(Exception): """Exception for invalid parameter args provided by the user.""" @@ -73,6 +77,23 @@ def fetch_meta(self, src_name: str) -> SourceMeta: except ClientError as e: logger.error(e.response['Error']['Message']) + @staticmethod + def _cast_location_ints(record: Dict) -> Dict: + """Ensure Locations are formatted correctly -- interval start and end need to + be recast to ints from how they're structured in DynamoDB + + :param Dict record: original record + :return: record with corrected locations attributes, if applicable + """ + if 'locations' in record: + for loc in 
record['locations']: + if loc['interval']['type'] == "SequenceInterval": + loc['interval']['start']['value'] = \ + int(loc['interval']['start']['value']) + loc['interval']['end']['value'] = \ + int(loc['interval']['end']['value']) + return record + def add_record(self, response: Dict[str, Dict], item: Dict, @@ -88,13 +109,7 @@ def add_record(self, """ del item['label_and_type'] # DynamoDB Numbers get converted to Decimal - if 'locations' in item: - for loc in item['locations']: - if loc['interval']['type'] == "SequenceInterval": - loc['interval']['start']['value'] = \ - int(loc['interval']['start']['value']) - loc['interval']['end']['value'] = \ - int(loc['interval']['end']['value']) + item = self._cast_location_ints(item) item["match_type"] = match_type gene = Gene(**item) src_name = item['src_name'] @@ -311,36 +326,62 @@ def search(self, query_str: str, keyed: bool = False, resp['service_meta_'] = self._get_service_meta() return SearchService(**resp) - def _add_merged_meta(self, response: Dict) -> Dict: + def _add_merged_meta(self, response: NormalizeService) -> NormalizeService: """Add source metadata to response object. :param Dict response: in-progress response object :return: completed response object. 
""" sources_meta = {} - gene_descr = response['gene_descriptor'] - ids = [gene_descr['gene_id']] + gene_descr.get('xrefs', []) + gene_descr = response.gene_descriptor + xrefs = gene_descr.xrefs or [] # type: ignore + ids = [gene_descr.gene_id] + xrefs # type: ignore for concept_id in ids: prefix = concept_id.split(':')[0] src_name = PREFIX_LOOKUP[prefix.lower()] if src_name not in sources_meta: sources_meta[src_name] = self.fetch_meta(src_name) - response['source_meta_'] = sources_meta + response.source_meta_ = sources_meta return response - def add_gene_descriptor(self, response, record, match_type, - possible_concepts=[]) -> Dict: + def _add_alt_matches(self, response: NormService, record: Dict, + possible_concepts: List[str]) -> NormService: + """Add alternate matches warning to response object + + :param NormService response: in-progress response object + :param Dict record: normalized record + :param List[str] possible_concepts: other possible matches + :return: updated response object + """ + norm_concepts = set() + for concept_id in possible_concepts: + r = self.db.get_record_by_id(concept_id, True) + if r: + merge_ref = r.get("merge_ref") + if merge_ref: + norm_concepts.add(merge_ref) + norm_concepts = norm_concepts - {record["concept_id"]} + if norm_concepts: + response.warnings.append({ + "multiple_normalized_concepts_found": list(norm_concepts) + }) + return response + + def add_gene_descriptor( + self, response: NormalizeService, record: Dict, match_type: MatchType, + possible_concepts: Optional[List[str]] = None + ) -> NormalizeService: """Add gene descriptor to response. 
:param Dict response: Response object :param Dict record: Gene record :param MatchType match_type: query's match type - :param list possible_concepts: List of other normalized concepts - found + :param Optional[List[str]] possible_concepts: List of other normalized + concepts found :return: Response with gene descriptor """ params = { - "id": f"normalize.gene:{quote(response['query'])}", + "id": f"normalize.gene:{quote(response.query)}", "label": record["symbol"], "gene_id": record["concept_id"] } @@ -382,7 +423,7 @@ def add_gene_descriptor(self, response, record, match_type, gene_type = record.get("gene_type") if gene_type: extensions.append(Extension( - name=GeneTypeFieldName[record["src_name"].upper()], + name=GeneTypeFieldName[record["src_name"].upper()].value, value=gene_type )) else: @@ -399,25 +440,12 @@ def add_gene_descriptor(self, response, record, match_type, # add warnings if possible_concepts: - norm_concepts = set() - for concept_id in possible_concepts: - r = self.db.get_record_by_id(concept_id, True) - if r: - merge_ref = r.get("merge_ref") - if merge_ref: - norm_concepts.add(merge_ref) - norm_concepts = norm_concepts - {record["concept_id"]} - if norm_concepts: - response["warnings"].append( - { - "multiple_normalized_concepts_found": - list(norm_concepts) - } - ) - response["gene_descriptor"] = \ - GeneDescriptor(**params).dict(exclude_none=True) + response = self._add_alt_matches(response, record, + possible_concepts) + + response.gene_descriptor = GeneDescriptor(**params) response = self._add_merged_meta(response) - response["match_type"] = match_type + response.match_type = match_type return response @staticmethod @@ -445,94 +473,174 @@ def _handle_failed_merge_ref(record, response, query) -> Dict: response['match_type'] = MatchType.NO_MATCH return response - def normalize(self, query: str) -> NormalizeService: - """Return normalized concept for query. 
+ def _prepare_normalized_response(self, query: str) -> Dict[str, Any]: + """Provide base response object for normalize endpoints. - :param str query: String to find normalized concept for - :return: Normalized gene concept + :param str query: user-provided query + :return: basic normalization response boilerplate """ - response = { + return { "query": query, + "match_type": MatchType.NO_MATCH, "warnings": self.emit_warnings(query), - "service_meta_": self._get_service_meta() + "service_meta_": ServiceMeta( + version=__version__, + response_datetime=str(datetime.now())) } - if query == '': - response['match_type'] = MatchType.NO_MATCH - return NormalizeService(**response) + def normalize(self, query: str) -> NormalizeService: + """Return normalized concept for query. + + :param str query: String to find normalized concept for + :return: Normalized gene concept + """ + response = NormalizeService(**self._prepare_normalized_response(query)) + return self._perform_normalized_lookup(response, query, + self.add_gene_descriptor) + + def _resolve_merge( + self, response: NormService, record: Dict, match_type: MatchType, + callback: Callable, possible_concepts: Optional[List[str]] = None + ) -> NormService: + """Given a record, return the corresponding normalized record + + :param NormalizationService response: in-progress response object + :param Dict record: record to retrieve normalized concept for + :param MatchType match_type: type of match that returned these records + :param Callable callback: response constructor method + :param Optional[List[str]] possible_concepts: alternate possible matches + :return: Normalized response object + """ + merge_ref = record.get("merge_ref") + if merge_ref: + # follow merge_ref + merge = self.db.get_record_by_id(merge_ref, False, True) + if merge is None: + query = response.query + logger.error( + f"Merge ref lookup failed for ref {record['merge_ref']} " + f"in record {record['concept_id']} from query `{query}`" + ) + return 
response + else: + return callback(response, merge, match_type, possible_concepts) + else: + # record is sole member of concept group + return callback(response, record, match_type, possible_concepts) + + def _get_matches_by_type(self, query: str, match_type: str) -> List[Dict]: + """Get matches list for match tier. + :param str query: user-provided query + :param str match_type: keyword of match type to check + :return: List of records matching the query and match level + """ + matching_refs = self.db.get_records_by_type(query, match_type) + matching_records = [self.db.get_record_by_id(m["concept_id"], False) + for m in matching_refs] + return sorted(matching_records, key=self._record_order) # type: ignore + + def _perform_normalized_lookup( + self, response: NormService, query: str, response_builder: Callable + ) -> NormService: + """Retrieve normalized concept, for use in normalization endpoints + :param NormService response: in-progress response object + :param str query: user-provided query + :param Callable response_builder: response constructor callback method + :return: completed service response object + """ + if query == "": + return response query_str = query.lower().strip() # check merged concept ID match - record = self.db.get_record_by_id(query_str, case_sensitive=False, - merge=True) + record = self.db.get_record_by_id(query_str, case_sensitive=False, merge=True) if record: - response = self.add_gene_descriptor( - response, record, MatchType.CONCEPT_ID) - return NormalizeService(**response) + return response_builder(response, record, MatchType.CONCEPT_ID) # check concept ID match record = self.db.get_record_by_id(query_str, case_sensitive=False) if record: - merge_ref = record.get('merge_ref') - if not merge_ref: - response = self.add_gene_descriptor( - response, record, MatchType.CONCEPT_ID) - return NormalizeService(**response) - merge = self.db.get_record_by_id(merge_ref, - case_sensitive=False, - merge=True) - if merge is None: - response = 
self._handle_failed_merge_ref( - record, response, query_str) - return NormalizeService(**response) - else: - response = self.add_gene_descriptor( - response, merge, MatchType.CONCEPT_ID) - return NormalizeService(**response) + return self._resolve_merge(response, record, MatchType.CONCEPT_ID, + response_builder) - # check other match types - matching_records = None for match_type in ITEM_TYPES.values(): # get matches list for match tier matching_refs = self.db.get_records_by_type(query_str, match_type) matching_records = \ [self.db.get_record_by_id(m['concept_id'], False) for m in matching_refs] - matching_records.sort(key=self._record_order) + matching_records.sort(key=self._record_order) # type: ignore if len(matching_refs) > 1: possible_concepts = \ [ref["concept_id"] for ref in matching_refs] else: - possible_concepts = [] + possible_concepts = None # attempt merge ref resolution until successful for match in matching_records: - record = self.db.get_record_by_id(match['concept_id'], False) + assert match is not None + record = self.db.get_record_by_id(match["concept_id"], False) if record: - merge_ref = record.get('merge_ref') - if not merge_ref: - response = self.add_gene_descriptor( - response, record, - MatchType[match_type.upper()], - possible_concepts - ) - return NormalizeService(**response) - merge = self.db.get_record_by_id(record['merge_ref'], - case_sensitive=False, - merge=True) - if merge is None: - response = self._handle_failed_merge_ref( - record, response, query_str) - return NormalizeService(**response) - else: - response = self.add_gene_descriptor( - response, merge, - MatchType[match_type.upper()], - possible_concepts - ) - return NormalizeService(**response) - - if not matching_records: - response['match_type'] = MatchType.NO_MATCH - return NormalizeService(**response) + match_type_value = MatchType[match_type.upper()] + return self._resolve_merge( + response, record, match_type_value, + response_builder, possible_concepts + ) + return 
response + + def _add_normalized_records( + self, response: UnmergedNormalizationService, normalized_record: Dict, + match_type: MatchType, possible_concepts: Optional[List[str]] = None + ) -> UnmergedNormalizationService: + """Add individual records to unmerged normalize response. + + :param UnmergedNormalizationService response: in-progress response + :param Dict normalized_record: record associated with normalized concept, + either merged or single identity + :param MatchType match_type: type of match achieved + :param Optional[List[str]] possible_concepts: other possible results + :return: Completed response object + """ + response.match_type = match_type + response.normalized_concept_id = normalized_record["concept_id"] + if normalized_record["item_type"] == "identity": + record_source = SourceName[normalized_record["src_name"].upper()] + response.source_matches[record_source] = MatchesNormalized( + records=[BaseGene(**self._cast_location_ints(normalized_record))], + source_meta_=self.fetch_meta(record_source.value) + ) + else: + concept_ids = [normalized_record["concept_id"]] + \ + normalized_record.get("xrefs", []) + for concept_id in concept_ids: + record = self.db.get_record_by_id(concept_id, case_sensitive=False) + if not record: + continue + record_source = SourceName[record["src_name"].upper()] + gene = BaseGene(**self._cast_location_ints(record)) + if record_source in response.source_matches: + response.source_matches[record_source].records.append(gene) + else: + response.source_matches[record_source] = MatchesNormalized( + records=[gene], + source_meta_=self.fetch_meta(record_source.value) + ) + if possible_concepts: + response = self._add_alt_matches(response, normalized_record, + possible_concepts) + return response + + def normalize_unmerged(self, query: str) -> UnmergedNormalizationService: + """Return all source records under the normalized concept for the + provided query string. 
+ + :param str query: string to search against + :return: Normalized response object + """ + response = UnmergedNormalizationService( + source_matches={}, + **self._prepare_normalized_response(query) + ) + return self._perform_normalized_lookup(response, query, + self._add_normalized_records) diff --git a/gene/schemas.py b/gene/schemas.py index e174f9a2..41b86239 100644 --- a/gene/schemas.py +++ b/gene/schemas.py @@ -56,10 +56,11 @@ class MatchType(IntEnum): NO_MATCH = 0 -class Gene(BaseModel): - """Gene""" +class BaseGene(BaseModel): + """Base gene model. Provide shared resources for records produced by + /search and /normalize_unmerged. + """ - match_type: MatchType concept_id: CURIE symbol: StrictStr symbol_status: Optional[SymbolStatus] @@ -79,6 +80,12 @@ class Gene(BaseModel): _get_associated_with_val = \ validator('associated_with', allow_reuse=True)(return_value) + +class Gene(BaseGene): + """Gene""" + + match_type: MatchType + class Config: """Configure model example""" @@ -416,15 +423,20 @@ class GeneTypeFieldName(str, Enum): ENSEMBL = "ensembl_biotype" -class NormalizeService(BaseModel): - """Define model for returning normalized concept.""" +class BaseNormalizationService(BaseModel): + """Base method providing shared attributes to Normalization service classes.""" query: StrictStr warnings: Optional[List[Dict]] match_type: MatchType + service_meta_: ServiceMeta + + +class NormalizeService(BaseNormalizationService): + """Define model for returning normalized concept.""" + gene_descriptor: Optional[GeneDescriptor] source_meta_: Optional[Dict[SourceName, SourceMeta]] - service_meta_: ServiceMeta class Config: """Configure model example""" @@ -561,3 +573,245 @@ def schema_extra(schema: Dict[str, Any], 'url': 'https://github.com/cancervariants/gene-normalization' # noqa: E501 } } + + +class MatchesNormalized(BaseModel): + """Matches associated with normalized concept from a single source.""" + + records: List[BaseGene] + source_meta_: SourceMeta + + class 
Config: + """Configure OpenAPI schema""" + + @staticmethod + def schema_extra(schema: Dict[str, Any], + model: Type["MatchesNormalized"]) -> None: + """Configure OpenAPI schema""" + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + + +class UnmergedNormalizationService(BaseNormalizationService): + """Response providing source records corresponding to normalization of user query. + Enables retrieval of normalized concept while retaining sourcing for accompanying + attributes. + """ + + normalized_concept_id: Optional[CURIE] + source_matches: Dict[SourceName, MatchesNormalized] + + class Config: + """Configure OpenAPI schema""" + + @staticmethod + def schema_extra(schema: Dict[str, Any], + model: Type["UnmergedNormalizationService"]) -> None: + """Configure OpenAPI schema example""" + if "title" in schema.keys(): + schema.pop("title", None) + for prop in schema.get("properties", {}).values(): + prop.pop("title", None) + schema["example"] = { + "query": "hgnc:108", + "warnings": [], + "match_type": 100, + "service_meta_": { + "version": "0.1.27", + "response_datetime": "2022-04-26 14:20:54.180240", + "name": "gene-normalizer", + "url": "https://github.com/cancervariants/gene-normalization" + }, + "normalized_concept_id": "hgnc:108", + "source_matches": { + "HGNC": { + "records": [ + { + "concept_id": "hgnc:108", + "symbol": "ACHE", + "symbol_status": "approved", + "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 + "strand": None, + "location_annotations": [], + "locations": [ + { + "type": "ChromosomeLocation", + "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 + "species_id": "taxonomy:9606", + "chr": "7", + "interval": { + "type": "CytobandInterval", + "start": "q22.1", + "end": "q22.1" + } + } + ], + "aliases": [ + "3.1.1.7" + ], + "previous_symbols": [ + "YT" + ], + "xrefs": [ + "ncbigene:43", + "ensembl:ENSG00000087085" + ], + 
"associated_with": [ + "ucsc:uc003uxi.4", + "vega:OTTHUMG00000157033", + "merops:S09.979", + "ccds:CCDS5710", + "omim:100740", + "iuphar:2465", + "ccds:CCDS5709", + "refseq:NM_015831", + "pubmed:1380483", + "uniprot:P22303", + "ccds:CCDS64736" + ], + "gene_type": "gene with protein product" + } + ], + "source_meta_": { + "data_license": "custom", + "data_license_url": "https://www.genenames.org/about/", + "version": "20220407", + "data_url": "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json", # noqa: E501 + "rdp_url": None, + "data_license_attributes": { + "non_commercial": False, + "share_alike": False, + "attribution": False + }, + "genome_assemblies": [] + } + }, + "Ensembl": { + "records": [ + { + "concept_id": "ensembl:ENSG00000087085", + "symbol": "ACHE", + "symbol_status": None, + "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 + "strand": "-", + "location_annotations": [], + "locations": [ + { + "_id": "ga4gh:VSL.AF6wPZclBqTauGr3yx_CqmMndLKhq0Cm", # noqa: E501 + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 + "interval": { + "type": "SequenceInterval", + "start": { + "type": "Number", + "value": 100889993 + }, + "end": { + "type": "Number", + "value": 100896974 + } + } + } + ], + "aliases": [], + "previous_symbols": [], + "xrefs": [ + "hgnc:108" + ], + "associated_with": [], + "gene_type": "protein_coding" + } + ], + "source_meta_": { + "data_license": "custom", + "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", # noqa: E501 + "version": "104", + "data_url": "ftp://ftp.ensembl.org/pub/Homo_sapiens.GRCh38.104.gff3.gz", # noqa: E501 + "rdp_url": None, + "data_license_attributes": { + "non_commercial": False, + "share_alike": False, + "attribution": False + }, + "genome_assemblies": [ + "GRCh38" + ] + } + }, + "NCBI": { + "records": [ + { + "concept_id": "ncbigene:43", + "symbol": "ACHE", + "symbol_status": None, + 
"label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 + "strand": "-", + "location_annotations": [], + "locations": [ + { + "type": "ChromosomeLocation", + "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 + "species_id": "taxonomy:9606", + "chr": "7", + "interval": { + "type": "CytobandInterval", + "start": "q22.1", + "end": "q22.1" + } + }, + { + "_id": "ga4gh:VSL.EepkXho2doYcUT1DW54fT1a00_zkqrn0", # noqa: E501 + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 + "interval": { + "type": "SequenceInterval", + "start": { + "type": "Number", + "value": 100889993 + }, + "end": { + "type": "Number", + "value": 100896994 + } + } + } + ], + "aliases": [ + "YT", + "ARACHE", + "ACEE", + "N-ACHE" + ], + "previous_symbols": [ + "ACEE" + ], + "xrefs": [ + "hgnc:108", + "ensembl:ENSG00000087085" + ], + "associated_with": [ + "omim:100740" + ], + "gene_type": "protein-coding" + } + ], + "source_meta_": { + "data_license": "custom", + "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501 + "version": "20220407", + "data_url": "ftp://ftp.ncbi.nlm.nih.gov", + "rdp_url": "https://reusabledata.org/ncbi-gene.html", + "data_license_attributes": { + "non_commercial": False, + "share_alike": False, + "attribution": False + }, + "genome_assemblies": [ + "GRCh38.p13" + ] + } + } + } + } diff --git a/gene/version.py b/gene/version.py index 979f7214..3fd08450 100644 --- a/gene/version.py +++ b/gene/version.py @@ -1,2 +1,2 @@ """Gene normalizer version""" -__version__ = "0.1.26" +__version__ = "0.1.27" diff --git a/tests/unit/data/etl_data/ensembl_105.gff3 b/tests/unit/data/etl_data/ensembl_106.gff3 similarity index 100% rename from tests/unit/data/etl_data/ensembl_105.gff3 rename to tests/unit/data/etl_data/ensembl_106.gff3 diff --git a/tests/unit/data/etl_data/ncbi_GRCh38.p13.gff b/tests/unit/data/etl_data/ncbi_GRCh38.p14.gff similarity index 90% rename from 
tests/unit/data/etl_data/ncbi_GRCh38.p13.gff rename to tests/unit/data/etl_data/ncbi_GRCh38.p14.gff index 427d5ea5..cad9dcee 100644 --- a/tests/unit/data/etl_data/ncbi_GRCh38.p13.gff +++ b/tests/unit/data/etl_data/ncbi_GRCh38.p14.gff @@ -40,6 +40,12 @@ NC_000009.12 BestRefSeq CDS 4662376 4663263 . + 0 ID=cds-NP_982278.3;Parent=rna- ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 NC_000011.10 RefSeq region 1 135086622 . + . ID=NC_000011.10:1..135086622;Dbxref=taxon:9606;Name=11;chromosome=11;gbkey=Src;genome=chromosome;mol_type=genomic DNA NC_000011.10 BestRefSeq gene 10751246 10779746 . + . ID=gene-CTR9;Dbxref=GeneID:9646,HGNC:HGNC:16850,MIM:609366;Name=CTR9;description=CTR9 homolog%2C Paf1/RNA polymerase II complex component;gbkey=Gene;gene=CTR9;gene_biotype=protein_coding;gene_synonym=p150,p150TSP,SH2BP1,TSBP +NC_000011.10 Curated Genomic pseudogene 117135529 117138867 . + . ID=gene-LOC653303;Dbxref=GeneID:653303;Name=LOC653303;description=proprotein convertase subtilisin/kexin type 7 pseudogene;gbkey=Gene;gene=LOC653303;gene_biotype=pseudogene;pseudo=true +NC_000011.10 Curated Genomic exon 117135529 117135686 . + . ID=id-LOC653303;Parent=gene-LOC653303;Dbxref=GeneID:653303;gbkey=exon;gene=LOC653303;pseudo=true +NC_000011.10 Curated Genomic exon 117136519 117136613 . + . ID=id-LOC653303-2;Parent=gene-LOC653303;Dbxref=GeneID:653303;gbkey=exon;gene=LOC653303;pseudo=true +NC_000011.10 Curated Genomic exon 117137423 117137516 . + . ID=id-LOC653303-3;Parent=gene-LOC653303;Dbxref=GeneID:653303;gbkey=exon;gene=LOC653303;pseudo=true +NC_000011.10 Curated Genomic exon 117137785 117137902 . + . ID=id-LOC653303-4;Parent=gene-LOC653303;Dbxref=GeneID:653303;gbkey=exon;gene=LOC653303;pseudo=true +NC_000011.10 Curated Genomic exon 117138227 117138867 . + . 
ID=id-LOC653303-5;Parent=gene-LOC653303;Dbxref=GeneID:653303;gbkey=exon;gene=LOC653303;pseudo=true ##sequence-region NC_000012.12 1 133275309 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 NC_000012.12 RefSeq region 1 133275309 . + . ID=NC_000012.12:1..133275309;Dbxref=taxon:9606;Name=12;chromosome=12;gbkey=Src;genome=chromosome;mol_type=genomic DNA diff --git a/tests/unit/data/etl_data/ncbi_history_20210813.tsv b/tests/unit/data/etl_data/ncbi_history_20210813.tsv index a6d01a5d..06b23b48 100644 --- a/tests/unit/data/etl_data/ncbi_history_20210813.tsv +++ b/tests/unit/data/etl_data/ncbi_history_20210813.tsv @@ -6,6 +6,9 @@ 9606 25 116063 LOC116063 20050510 9606 25782 26114 DKFZP434D245 20050510 9606 293 8283 ANT3Y 20050510 +9606 653303 196266 LOC196266 20080725 +9606 653303 654080 LOC654080 20060323 +9606 653303 731196 LOC731196 20070619 9606 43 100187742 ACEE 20110915 9606 54704 5497 PPM2C 20050510 9606 54704 157663 LOC157663 20050507 diff --git a/tests/unit/data/etl_data/ncbi_info_20210813.tsv b/tests/unit/data/etl_data/ncbi_info_20210813.tsv index d2e458ef..46735b88 100644 --- a/tests/unit/data/etl_data/ncbi_info_20210813.tsv +++ b/tests/unit/data/etl_data/ncbi_info_20210813.tsv @@ -22,3 +22,4 @@ 9606 10251 SPRY3 - spry-3 MIM:300531|HGNC:HGNC:11271|Ensembl:ENSG00000168939 X|Y Xq28 and Yq12 sprouty RTK signaling antagonist 3 protein-coding SPRY3 sprouty RTK signaling antagonist 3 O protein sprouty homolog 3|antagonist of FGF signaling|sprouty homolog 3|sprouty3 20210807 - 9606 7637 ZNF84 - HPF2 MIM:618554|HGNC:HGNC:13159|Ensembl:ENSG00000198040 12 12q24.33|map from Rosati ref via FISH [AFS] zinc finger protein 84 protein-coding ZNF84 zinc finger protein 84 O zinc finger protein 84|zinc finger protein HPF2 20210611 - 9606 619538 OMS - COME/ROM MIM:166760 10|19|3 10q26.3;19q13.42-q13.43;3p25.3 otitis media, susceptibility to unknown - - - chronic/recurrent otitis media 20170408 - +9606 653303 LOC653303 - - - 11 11q23.3 proprotein 
convertase subtilisin/kexin type 7 pseudogene pseudo - - - - 20211123 - diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index 96dad07d..25584a29 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -91,7 +91,7 @@ def test_ensembl_etl(test_get_seqrepo, processed_ids, dynamodb, etl_data_path, shutil.rmtree(e.src_data_dir) e._sequence_location.get_aliases = _get_aliases - e._data_src = etl_data_path / 'ensembl_105.gff3' + e._data_src = etl_data_path / 'ensembl_106.gff3' e._transform_data() e._add_meta() processed_ids += e._processed_ids @@ -132,7 +132,7 @@ def test_ncbi_etl(test_get_seqrepo, processed_ids, dynamodb, etl_data_path, n._sequence_location.get_aliases = _get_aliases n._info_src = etl_data_path / 'ncbi_info_20210813.tsv' n._history_src = etl_data_path / 'ncbi_history_20210813.tsv' - n._gff_src = etl_data_path / 'ncbi_GRCh38.p13.gff' + n._gff_src = etl_data_path / 'ncbi_GRCh38.p14.gff' n._version = n._info_src.stem.split('_')[-1] n._transform_data() n._add_meta() diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index 99b76238..907d88e4 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -304,9 +304,9 @@ def test_meta_info(ensembl): assert resp.source_meta_.data_license == "custom" assert resp.source_meta_.data_license_url == \ "https://useast.ensembl.org/info/about/legal/disclaimer.html" - assert resp.source_meta_.version == "105" + assert resp.source_meta_.version == "106" assert resp.source_meta_.data_url == \ - "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.105.gff3.gz" # noqa: E501 + "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.106.gff3.gz" # noqa: E501 assert resp.source_meta_.rdp_url is None assert resp.source_meta_.genome_assemblies == ["GRCh38"] assert resp.source_meta_.data_license_attributes == { diff --git a/tests/unit/test_ncbi_source.py 
b/tests/unit/test_ncbi_source.py index b450b5c9..5f403401 100644 --- a/tests/unit/test_ncbi_source.py +++ b/tests/unit/test_ncbi_source.py @@ -892,7 +892,7 @@ def test_meta(ncbi): "ftp://ftp.ncbi.nlm.nih.gov" assert response.source_meta_.rdp_url == \ "https://reusabledata.org/ncbi-gene.html" - assert response.source_meta_.genome_assemblies == ["GRCh38.p13"] + assert response.source_meta_.genome_assemblies == ["GRCh38.p14"] assert response.source_meta_.data_license_attributes == { "non_commercial": False, "share_alike": False, diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index fa9ae836..88094094 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -1,7 +1,7 @@ """Module to test the query module.""" from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor from gene.query import QueryHandler, InvalidParameterException -from gene.schemas import SourceName, MatchType +from gene.schemas import BaseGene, SourceName, MatchType import copy import pytest @@ -21,6 +21,9 @@ def search(self, query_str, keyed=False, incl='', excl=''): def normalize(self, query_str): return self.query_handler.normalize(query_str) + def normalize_unmerged(self, query_str): + return self.query_handler.normalize_unmerged(query_str) + return QueryGetter() @@ -378,6 +381,428 @@ def normalized_p150(): return GeneDescriptor(**params) +@pytest.fixture(scope="module") +def normalized_loc_653303(): + """Provide test fixture for NCBI gene LOC653303. Used to validate + normalized results that don't merge records. 
+ """ + params = { + "id": "normalize.gene:LOC653303", + "type": "GeneDescriptor", + "label": "LOC653303", + "alternate_labels": [ + "LOC196266", + "LOC654080", + "LOC731196" + ], + "extensions": [ + { + "type": "Extension", + "name": "approved_name", + "value": "proprotein convertase subtilisin/kexin type 7 pseudogene" # noqa: E501 + }, + { + "type": "Extension", + "name": "chromosome_location", + "value": { + "_id": "ga4gh:VCL.WzURLvTklFI7K2GAP8gIw6vgWDWXMXuW", + "type": "ChromosomeLocation", + "species_id": "taxonomy:9606", + "chr": "11", + "interval": { + "type": "CytobandInterval", + "start": "q23.3", + "end": "q23.3" + } + } + }, + { + "type": "Extension", + "name": "previous_symbols", + "value": [ + "LOC196266", + "LOC731196", + "LOC654080" + ] + }, + { + "type": "Extension", + "name": "ncbi_gene_type", + "value": "pseudo" + } + ], + "gene_id": "ncbigene:653303" + } + return GeneDescriptor(**params) + + +@pytest.fixture(scope="module") +def normalize_unmerged_loc_653303(): + """Provide fixture for NCBI gene LOC653303. Used to validate normalized results + that don't merge records. 
+ """ + return { + "normalized_concept_id": "ncbigene:653303", + "source_matches": { + "NCBI": { + "records": [ + { + "concept_id": "ncbigene:653303", + "symbol": "LOC653303", + "symbol_status": None, + "label": "proprotein convertase subtilisin/kexin type 7 pseudogene", # noqa: E501 + "strand": "+", + "location_annotations": [], + "locations": [ + { + "type": "ChromosomeLocation", + "_id": "ga4gh:VCL.WzURLvTklFI7K2GAP8gIw6vgWDWXMXuW", + "species_id": "taxonomy:9606", + "chr": "11", + "interval": { + "type": "CytobandInterval", + "start": "q23.3", + "end": "q23.3" + } + }, + { + "_id": "ga4gh:VSL.dhj3ZilmW0bmmUjUvrG7zCWwsPn-7XyB", + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", # noqa: E501 + "interval": { + "type": "SequenceInterval", + "start": { + "type": "Number", + "value": 117135528 + }, + "end": { + "type": "Number", + "value": 117138867 + } + } + } + ], + "aliases": [], + "previous_symbols": [ + "LOC196266", + "LOC731196", + "LOC654080" + ], + "xrefs": [], + "associated_with": [], + "gene_type": "pseudo", + } + ] + } + } + } + + +@pytest.fixture(scope="module") +def normalize_unmerged_chaf1a(): + """Return expected results from /normalize_unmerged for CHAF1A.""" + return { + "normalized_concept_id": "hgnc:1910", + "source_matches": { + "HGNC": { + "records": [ + { + "concept_id": "hgnc:1910", + "symbol": "CHAF1A", + "symbol_status": "approved", + "label": "chromatin assembly factor 1 subunit A", + "strand": None, + "location_annotations": [], + "locations": [ + { + "type": "ChromosomeLocation", + "_id": "ga4gh:VCL.yF2TzeunqY92v3yhDsCR_t5X997mWriF", + "species_id": "taxonomy:9606", + "chr": "19", + "interval": { + "type": "CytobandInterval", + "start": "p13.3", + "end": "p13.3" + } + } + ], + "aliases": [ + "CAF1P150", + "P150", + "CAF1", + "CAF1B", + "MGC71229", + "CAF-1" + ], + "previous_symbols": [], + "xrefs": [ + "ensembl:ENSG00000167670", + "ncbigene:10036" + ], + "associated_with": [ + 
"vega:OTTHUMG00000181922", + "ccds:CCDS32875", + "ucsc:uc002mal.4", + "pubmed:7600578", + "uniprot:Q13111", + "omim:601246", + "ena.embl:U20979", + "refseq:NM_005483" + ], + "gene_type": "gene with protein product" + } + ], + }, + "Ensembl": { + "records": [ + { + "concept_id": "ensembl:ENSG00000167670", + "symbol": "CHAF1A", + "symbol_status": None, + "label": "chromatin assembly factor 1 subunit A", + "strand": "+", + "location_annotations": [], + "locations": [ + { + "_id": "ga4gh:VSL.VVxEanUPWWMy_IChkj_kPIpRnYAatqrq", + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", # noqa: E501 + "interval": { + "type": "SequenceInterval", + "start": { + "type": "Number", + "value": 4402639 + }, + "end": { + "type": "Number", + "value": 4445018 + } + } + } + ], + "aliases": [], + "previous_symbols": [], + "xrefs": [ + "hgnc:1910" + ], + "associated_with": [], + "gene_type": "protein_coding" + } + ], + }, + "NCBI": { + "records": [ + { + "concept_id": "ncbigene:10036", + "symbol": "CHAF1A", + "symbol_status": None, + "label": "chromatin assembly factor 1 subunit A", + "strand": "+", + "location_annotations": [], + "locations": [ + { + "type": "ChromosomeLocation", + "_id": "ga4gh:VCL.yF2TzeunqY92v3yhDsCR_t5X997mWriF", + "species_id": "taxonomy:9606", + "chr": "19", + "interval": { + "type": "CytobandInterval", + "start": "p13.3", + "end": "p13.3" + } + }, + { + "_id": "ga4gh:VSL.X4HEwp9RgFN5WpmJM4bWpcOcN9qHX-hj", + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", # noqa: E501 + "interval": { + "type": "SequenceInterval", + "start": { + "type": "Number", + "value": 4402595 + }, + "end": { + "type": "Number", + "value": 4448322 + } + } + } + ], + "aliases": [ + "CAF1P150", + "P150", + "CAF1", + "CAF1B", + "CAF-1" + ], + "previous_symbols": [], + "xrefs": [ + "ensembl:ENSG00000167670", + "hgnc:1910" + ], + "associated_with": [ + "omim:601246" + ], + "gene_type": "protein-coding" + } + ] + } + } + } 
+ + +@pytest.fixture(scope="module") +def normalize_unmerged_ache(): + """Provide ACHE fixture for unmerged normalize endpoint.""" + return { + "normalized_concept_id": "hgnc:108", + "source_matches": { + "NCBI": { + "records": [ + { + "concept_id": "ncbigene:43", + "symbol": "ACHE", + "symbol_status": None, + "label": "acetylcholinesterase (Cartwright blood group)", + "strand": "-", + "location_annotations": [], + "locations": [ + { + "type": "ChromosomeLocation", + "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", + "species_id": "taxonomy:9606", + "chr": "7", + "interval": { + "type": "CytobandInterval", + "start": "q22.1", + "end": "q22.1" + } + }, + { + "_id": "ga4gh:VSL.EepkXho2doYcUT1DW54fT1a00_zkqrn0", + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 + "interval": { + "type": "SequenceInterval", + "start": { + "type": "Number", + "value": 100889993 + }, + "end": { + "type": "Number", + "value": 100896994 + } + } + } + ], + "aliases": [ + "YT", + "ARACHE", + "ACEE", + "N-ACHE" + ], + "previous_symbols": [ + "ACEE" + ], + "xrefs": [ + "hgnc:108", + "ensembl:ENSG00000087085" + ], + "associated_with": [ + "omim:100740" + ], + "gene_type": "protein-coding" + } + ], + }, + "Ensembl": { + "records": [ + { + "concept_id": "ensembl:ENSG00000087085", + "symbol": "ACHE", + "symbol_status": None, + "label": "acetylcholinesterase (Cartwright blood group)", + "strand": "-", + "location_annotations": [], + "locations": [ + { + "_id": "ga4gh:VSL.AF6wPZclBqTauGr3yx_CqmMndLKhq0Cm", + "type": "SequenceLocation", + "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 + "interval": { + "type": "SequenceInterval", + "start": { + "type": "Number", + "value": 100889993 + }, + "end": { + "type": "Number", + "value": 100896974 + } + } + } + ], + "aliases": [], + "previous_symbols": [], + "xrefs": ["hgnc:108"], + "associated_with": [], + "gene_type": "protein_coding", + } + ] + }, + "HGNC": { + "records": 
[ + { + "concept_id": "hgnc:108", + "symbol": "ACHE", + "symbol_status": "approved", + "label": "acetylcholinesterase (Cartwright blood group)", + "strand": None, + "location_annotations": [], + "locations": [ + { + "type": "ChromosomeLocation", + "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", + "species_id": "taxonomy:9606", + "chr": "7", + "interval": { + "type": "CytobandInterval", + "start": "q22.1", + "end": "q22.1" + } + } + ], + "aliases": [ + "3.1.1.7" + ], + "previous_symbols": [ + "YT" + ], + "xrefs": [ + "ncbigene:43", + "ensembl:ENSG00000087085" + ], + "associated_with": [ + "ucsc:uc003uxi.4", + "vega:OTTHUMG00000157033", + "merops:S09.979", + "ccds:CCDS5710", + "omim:100740", + "iuphar:2465", + "ccds:CCDS5709", + "refseq:NM_015831", + "pubmed:1380483", + "uniprot:P22303", + "ccds:CCDS64736" + ], + "gene_type": "gene with protein product", + } + ] + } + } + } + + @pytest.fixture(scope='module') def num_sources(): """Get the number of sources.""" @@ -391,24 +816,29 @@ def source_meta(): SourceName.NCBI.value] -def compare_normalize_resp(resp, expected_query, expected_match_type, - expected_gene_descriptor, expected_warnings=None, - expected_source_meta=None): - """Check that normalize response is correct""" - assert resp.query == expected_query +def compare_warnings(actual_warnings, expected_warnings): + """Compare response warnings against expected results.""" if expected_warnings: - assert len(resp.warnings) == len(expected_warnings), "warnings len" + assert len(actual_warnings) == len(expected_warnings), "warnings len" for e_warnings in expected_warnings: - for r_warnings in resp.warnings: + for r_warnings in actual_warnings: for e_key, e_val in e_warnings.items(): - for r_key, r_val in r_warnings.items(): + for r_val in r_warnings.values(): if e_key == r_val: if isinstance(e_val, list): assert set(r_val) == set(e_val), "warnings val" else: assert r_val == e_val, "warnings val" else: - assert resp.warnings == [], "warnings != []" + assert 
actual_warnings == [], "warnings != []" + + +def compare_normalize_resp(resp, expected_query, expected_match_type, + expected_gene_descriptor, expected_warnings=None, + expected_source_meta=None): + """Check that normalize response is correct""" + assert resp.query == expected_query + compare_warnings(resp.warnings, expected_warnings) assert resp.match_type == expected_match_type compare_gene_descriptor(expected_gene_descriptor, resp.gene_descriptor) if not expected_source_meta: @@ -422,6 +852,49 @@ def compare_normalize_resp(resp, expected_query, expected_match_type, compare_service_meta(resp.service_meta_) +def compare_unmerged_record(gene, test_gene): + """Check that gene and test_gene are the same.""" + assert gene.label == test_gene.label + assert gene.concept_id == test_gene.concept_id + assert set(gene.aliases) == set(test_gene.aliases) + assert set(gene.xrefs) == \ + set(test_gene.xrefs) + assert gene.symbol_status == test_gene.symbol_status + assert set(gene.previous_symbols) == \ + set(test_gene.previous_symbols) + assert set(gene.associated_with) == \ + set(test_gene.associated_with) + assert gene.symbol == test_gene.symbol + assert len(gene.locations) == len(test_gene.locations) + for loc in gene.locations: + assert loc in test_gene.locations + assert set(gene.location_annotations) == \ + set(test_gene.location_annotations) + assert gene.strand == test_gene.strand + assert gene.gene_type == test_gene.gene_type + + +def compare_unmerged_response(actual, query, warnings, match_type, fixture): + """Compare response from normalize unmerged endpoint to fixture.""" + assert actual.query == query + compare_warnings(actual.warnings, warnings) + assert actual.match_type == match_type + assert actual.normalized_concept_id == fixture["normalized_concept_id"] + + for source, match in actual.source_matches.items(): + assert match.source_meta_ # check that it's there + for record in match.records: + concept_id = record.concept_id + fixture_gene = None + # get 
corresponding fixture record + for gene in fixture["source_matches"][source.value]["records"]: + if gene["concept_id"] == concept_id: + fixture_gene = BaseGene(**gene) + break + assert fixture_gene, f"Unable to find fixture for {concept_id}" + compare_unmerged_record(record, fixture_gene) + + def compare_service_meta(service_meta): """Check that service metadata is correct.""" assert service_meta.name == "gene-normalizer" @@ -437,7 +910,8 @@ def compare_gene_descriptor(test, actual): assert actual.type == test.type assert actual.gene_id == test.gene_id assert actual.label == test.label - assert set(actual.xrefs) == set(test.xrefs), "xrefs" + if actual.xrefs or test.xrefs: + assert set(actual.xrefs) == set(test.xrefs), "xrefs" assert set(actual.alternate_labels) == set(test.alternate_labels), \ "alt labels" extensions_present = "extensions" in test.__fields__.keys() @@ -797,6 +1271,82 @@ def test_multiple_norm_concepts(query_handler, normalized_p150, source_meta): expected_warnings=expected_warnings) +def test_normalize_single_entry(query_handler, normalized_loc_653303): + """Test that the normalized endpoint correctly shapes unmerged identity + records into gene descriptors. 
+ """ + q = "LOC653303" + resp = query_handler.normalize(q) + compare_normalize_resp(resp, q, MatchType.SYMBOL, normalized_loc_653303, + expected_source_meta=[SourceName.NCBI.value]) + + +def test_normalize_unmerged(query_handler, normalize_unmerged_loc_653303, + normalize_unmerged_chaf1a, normalize_unmerged_ache): + """Test that unmerged normalization produces correct results.""" + # concept ID + q = "ncbigene:653303" + resp = query_handler.normalize_unmerged(q) + compare_unmerged_response(resp, q, [], MatchType.CONCEPT_ID, + normalize_unmerged_loc_653303) + + q = "hgnc:1910" + resp = query_handler.normalize_unmerged(q) + compare_unmerged_response(resp, q, [], MatchType.CONCEPT_ID, + normalize_unmerged_chaf1a) + + q = "HGNC:108" + resp = query_handler.normalize_unmerged(q) + compare_unmerged_response(resp, q, [], MatchType.CONCEPT_ID, + normalize_unmerged_ache) + + # symbol + q = "LOC653303" + resp = query_handler.normalize_unmerged(q) + compare_unmerged_response(resp, q, [], MatchType.SYMBOL, + normalize_unmerged_loc_653303) + + # prev symbol + q = "ACEE" + resp = query_handler.normalize_unmerged(q) + compare_unmerged_response(resp, q, [], MatchType.PREV_SYMBOL, + normalize_unmerged_ache) + + q = "LOC196266" + resp = query_handler.normalize_unmerged(q) + compare_unmerged_response(resp, q, [], MatchType.PREV_SYMBOL, + normalize_unmerged_loc_653303) + + # alias + q = "P150" + resp = query_handler.normalize_unmerged(q) + expected_warnings = [{ + "multiple_normalized_concepts_found": + ['hgnc:500', 'hgnc:8982', 'hgnc:17168', 'hgnc:16850', 'hgnc:76'] + }] + compare_unmerged_response(resp, q, expected_warnings, MatchType.ALIAS, + normalize_unmerged_chaf1a) + + q = "ARACHE" + resp = query_handler.normalize_unmerged(q) + compare_unmerged_response(resp, q, [], MatchType.ALIAS, normalize_unmerged_ache) + + q = "MGC71229" + resp = query_handler.normalize_unmerged(q) + compare_unmerged_response(resp, q, [], MatchType.ALIAS, normalize_unmerged_chaf1a) + + # assoc with + q = 
"omim:100740" + resp = query_handler.normalize_unmerged(q) + compare_unmerged_response(resp, q, [], MatchType.ASSOCIATED_WITH, + normalize_unmerged_ache) + + q = "uniprot:Q13111" + resp = query_handler.normalize_unmerged(q) + compare_unmerged_response(resp, q, [], MatchType.ASSOCIATED_WITH, + normalize_unmerged_chaf1a) + + def test_invalid_queries(query_handler): """Test invalid queries""" resp = query_handler.normalize("B R A F")