From 7631ceb71aa6be0851d961bb5bda30641bbc3ef3 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 5 Sep 2023 09:28:20 -0400 Subject: [PATCH] feat!: remove search result formatting options --- gene/main.py | 9 +- gene/query.py | 36 +------ gene/schemas.py | 161 ++++++++++++------------------ tests/unit/test_ensembl_source.py | 2 +- tests/unit/test_hgnc_source.py | 2 +- tests/unit/test_ncbi_source.py | 2 +- tests/unit/test_query.py | 45 +++------ 7 files changed, 88 insertions(+), 169 deletions(-) diff --git a/gene/main.py b/gene/main.py index 6508cdd0..646b0812 100644 --- a/gene/main.py +++ b/gene/main.py @@ -42,8 +42,6 @@ read_query_summary = "Given query, provide best-matching source records." response_description = "A response to a validly-formed query" q_descr = "Gene to normalize." -keyed_descr = """Optional. If true, return response as key-value pairs of - sources to source matches. False by default.""" incl_descr = """Optional. Comma-separated list of source names to include in response. Will exclude all other sources. Returns HTTP status code 422: Unprocessable Entity if both 'incl' and 'excl' parameters @@ -68,15 +66,12 @@ ) def search( q: str = Query(..., description=q_descr), # noqa: D103 - keyed: Optional[bool] = Query(False, description=keyed_descr), incl: Optional[str] = Query(None, description=incl_descr), excl: Optional[str] = Query(None, description=excl_descr), ) -> SearchService: """Return strongest match concepts to query string provided by user. :param str q: gene search term - :param Optional[bool] keyed: if true, response is structured as key/value - pair of sources to source match lists. :param Optional[str] incl: comma-separated list of sources to include, with all others excluded. Raises HTTPException if both `incl` and `excl` are given. @@ -86,10 +81,10 @@ def search( :return: JSON response with matched records and source metadata """ try: - resp = query_handler.search(html.unescape(q), keyed=keyed, incl=incl, excl=excl) + resp = query_handler.search(html.unescape(q), incl=incl, excl=excl) except InvalidParameterException as e: raise HTTPException(status_code=422, detail=str(e)) - + breakpoint() return resp diff --git a/gene/query.py b/gene/query.py index 40c8a2fd..cc427ae5 100644 --- a/gene/query.py +++ b/gene/query.py @@ -223,9 +223,9 @@ def _post_process_resp(self, resp: Dict) -> Dict: records = sorted(records, key=lambda k: k.match_type, reverse=True) return resp - def _response_keyed(self, query: str, sources: Set[str]) -> Dict: - """Return response as dict where key is source name and value - is a list of records. Corresponds to `keyed=true` API parameter. + def _get_search_response(self, query: str, sources: Set[str]) -> Dict: + """Return response as dict where key is source name and value is a list of + records. :param query: string to match against :param sources: sources to match from @@ -275,28 +275,6 @@ def _response_keyed(self, query: str, sources: Set[str]) -> Dict: # remaining sources get no match return self._post_process_resp(resp) - def _response_list(self, query: str, sources: Set[str]) -> Dict: - """Return response as list, where the first key-value in each item - is the source name. Corresponds to `keyed=false` API parameter. - - :param query: string to match against - :param sources: sources to match from - :return: completed response object to return to client - """ - response_dict = self._response_keyed(query, sources) - source_list = [] - for src_name in response_dict["source_matches"].keys(): - src = { - "source": src_name, - } - to_merge = response_dict["source_matches"][src_name] - src.update(to_merge) - - source_list.append(src) - response_dict["source_matches"] = source_list - - return response_dict - @staticmethod def _get_service_meta() -> ServiceMeta: """Return metadata about gene-normalizer service. @@ -308,7 +286,6 @@ def _get_service_meta() -> ServiceMeta: def search( self, query_str: str, - keyed: bool = False, incl: str = "", excl: str = "", **params, @@ -323,8 +300,6 @@ def search( 'ncbigene:673' :param query_str: query, a string, to search for - :param keyed: if true, return response as dict keying source names to source - objects; otherwise, return list of source objects :param incl: str containing comma-separated names of sources to use. Will exclude all other sources. Case-insensitive. :param excl: str containing comma-separated names of source to exclude. Will @@ -375,10 +350,7 @@ def search( query_str = query_str.strip() - if keyed: - resp = self._response_keyed(query_str, query_sources) - else: - resp = self._response_list(query_str, query_sources) + resp = self._get_search_response(query_str, query_sources) resp["service_meta_"] = self._get_service_meta() return SearchService(**resp) diff --git a/gene/schemas.py b/gene/schemas.py index 6ce00545..f965f020 100644 --- a/gene/schemas.py +++ b/gene/schemas.py @@ -297,10 +297,8 @@ class SourceMeta(BaseModel): ) -class MatchesKeyed(BaseModel): - """Container for matching information from an individual source. - Used when matches are requested as an object, not an array. - """ +class SourceSearchMatches(BaseModel): + """Container for matching information from an individual source.""" records: List[Gene] = [] source_meta_: SourceMeta @@ -308,51 +306,66 @@ class MatchesKeyed(BaseModel): model_config = ConfigDict( json_schema_extra={ "example": { - "records": [], - "source_meta_": { - "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501 - "version": "20201215", - "data_url": "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/", - "rdp_url": "https://reusabledata.org/ncbi-gene.html", - "data_license_attributes": { - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, - "genome_assemblies": [], + "query": "ensembl:ENSG00000157764", + "warnings": [], + "source_matches": { + "Ensembl": { + "records": [ + { + "concept_id": "ensembl:ENSG00000157764", + "symbol": "BRAF", + "symbol_status": None, + "label": "B-Raf proto-oncogene, serine/threonine kinase", + "strand": "-", + "location_annotations": [], + "locations": [ + { + "id": "ga4gh:SL.iwWw9B3tkU3TCLF3d8xu4zSQBhpDZfJ6", + "label": None, + "extensions": None, + "type": "SequenceLocation", + "digest": None, + "sequenceReference": { + "id": None, + "label": None, + "extensions": None, + "type": "SequenceReference", + "digest": None, + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", + "residueAlphabet": None, + }, + "start": 140719326, + "end": 140924929, + } + ], + "aliases": [], + "previous_symbols": [], + "xrefs": ["hgnc:1097"], + "associated_with": [], + "gene_type": "protein_coding", + "match_type": 100, + } + ], + "source_meta_": { + "data_license": "custom", + "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", + "version": "110", + "data_url": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz", + "rdp_url": None, + "data_license_attributes": { + "non_commercial": False, + "attribution": False, + "share_alike": False, + }, + "genome_assemblies": ["GRCh38"], + }, + } }, - } - } - ) - - -class MatchesListed(BaseModel): - """Container for matching information from an individual source. - Used when matches are requested as an array, not an object. - """ - - source: SourceName - records: List[Gene] = [] - source_meta_: SourceMeta - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "source": "NCBI", - "records": [], - "source_meta_": { - "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501 - "version": "20201215", - "data_url": "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/", - "rdp_url": "https://reusabledata.org/ncbi-gene.html", - "data_license_attributes": { - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, - "genome_assemblies": [], + "service_meta_": { + "name": "gene-normalizer", + "version": "0.3.0-dev0", + "response_datetime": "2023-09-26 15:23:18.837074", + "url": "https://github.com/cancervariants/gene-normalization", }, } } @@ -386,58 +399,10 @@ class SearchService(BaseModel): query: StrictStr warnings: List[Dict] = [] - source_matches: Union[Dict[SourceName, MatchesKeyed], List[MatchesListed]] + source_matches: Dict[SourceName, SourceSearchMatches] service_meta_: ServiceMeta - model_config = ConfigDict( - json_schema_extra={ - "example": { - "query": "BRAF", - "warnings": [], - "source_matches": [ - { - "source": "Ensembl", - "records": [ - { - "label": None, - "concept_id": "ensembl:ENSG00000157764", - "symbol": "BRAF", - "previous_symbols": [], - "aliases": [], - "xrefs": [], - "symbol_status": None, - "strand": "-", - "locations": [], - "location_annotations": [], - "associated_with": [], - "gene_type": None, - "match_type": 100, - } - ], - "source_meta_": { - "data_license": "custom", - "data_license_url": "https://uswest.ensembl.org/info/about/legal/index.html", # noqa: E501 - "version": "102", - "data_url": "http://ftp.ensembl.org/pub/", - "rdp_url": None, - "data_license_attributes": { - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, - "genome_assemblies": ["GRCh38"], - }, - } - ], - "service_meta_": { - "name": "gene-normalizer", - "version": __version__, - "response_datetime": "2022-03-23 15:57:14.180908", - "url": "https://github.com/cancervariants/gene-normalization", - }, - } - } - ) + model_config = ConfigDict(json_schema_extra={"example": {}}) class GeneTypeFieldName(str, Enum): diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index 47b0305a..d649fa8e 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -14,7 +14,7 @@ def __init__(self): self.query_handler = QueryHandler(database) def search(self, query_str, incl="ensembl"): - resp = self.query_handler.search(query_str, keyed=True, incl=incl) + resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.ENSEMBL] e = QueryGetter() diff --git a/tests/unit/test_hgnc_source.py b/tests/unit/test_hgnc_source.py index 0b953a44..8f567bc4 100644 --- a/tests/unit/test_hgnc_source.py +++ b/tests/unit/test_hgnc_source.py @@ -16,7 +16,7 @@ def __init__(self): self.query_handler = QueryHandler(database) def search(self, query_str, incl="hgnc"): - resp = self.query_handler.search(query_str, keyed=True, incl=incl) + resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.HGNC] h = QueryGetter() diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py index b8cd1680..1010cefc 100644 --- a/tests/unit/test_ncbi_source.py +++ b/tests/unit/test_ncbi_source.py @@ -34,7 +34,7 @@ def __init__(self): self.query_handler = QueryHandler(database) def search(self, query_str, incl="ncbi"): - resp = self.query_handler.search(query_str, keyed=True, incl=incl) + resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.NCBI] n = QueryGetter() diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index a0b40ee7..a8e99e87 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -15,10 +15,8 @@ class QueryGetter: def __init__(self): self.query_handler = QueryHandler(database) - def search(self, query_str, keyed=False, incl="", excl=""): - return self.query_handler.search( - query_str=query_str, keyed=keyed, incl=incl, excl=excl - ) + def search(self, query_str, incl="", excl=""): + return self.query_handler.search(query_str=query_str, incl=incl, excl=excl) def normalize(self, query_str): return self.query_handler.normalize(query_str) @@ -971,15 +969,6 @@ def test_search_query(query_handler, num_sources): resp = query_handler.search(" BRAF ") assert resp.query == "BRAF" matches = resp.source_matches - assert isinstance(matches, list) - assert len(matches) == num_sources - - -def test_search_query_keyed(query_handler, num_sources): - """Test that query returns properly-structured response.""" - resp = query_handler.search(" BRAF ", keyed=True) - assert resp.query == "BRAF" - matches = resp.source_matches assert isinstance(matches, dict) assert len(matches) == num_sources @@ -992,14 +981,14 @@ def test_search_query_inc_exc(query_handler, num_sources): assert len(matches) == num_sources - len(sources.split()) sources = "Hgnc, NCBi" - resp = query_handler.search("BRAF", keyed=True, incl=sources) + resp = query_handler.search("BRAF", incl=sources) matches = resp.source_matches assert len(matches) == len(sources.split()) assert SourceName.HGNC in matches assert SourceName.NCBI in matches sources = "HGnC" - resp = query_handler.search("BRAF", keyed=True, excl=sources) + resp = query_handler.search("BRAF", excl=sources) matches = resp.source_matches assert len(matches) == num_sources - len(sources.split()) assert SourceName.ENSEMBL in matches @@ -1009,7 +998,7 @@ def test_search_query_inc_exc(query_handler, num_sources): def test_search_invalid_parameter_exception(query_handler): """Test that Invalid parameter exception works correctly.""" with pytest.raises(InvalidParameterException): - resp = query_handler.search("BRAF", keyed=True, incl="hgn") # noqa: F841 + _ = query_handler.search("BRAF", incl="hgn") # noqa: F841, E501 with pytest.raises(InvalidParameterException): resp = query_handler.search("BRAF", incl="hgnc", excl="hgnc") # noqa: F841 @@ -1018,21 +1007,21 @@ def test_search_invalid_parameter_exception(query_handler): def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): """Test that ACHE concept_id shows xref matches.""" # Search - resp = query_handler.search("ncbigene:43", keyed=True) + resp = query_handler.search("ncbigene:43") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL].records) == 0 assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID - resp = query_handler.search("hgnc:108", keyed=True) + resp = query_handler.search("hgnc:108") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF - resp = query_handler.search("ensembl:ENSG00000087085", keyed=True) + resp = query_handler.search("ensembl:ENSG00000087085") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF @@ -1138,21 +1127,21 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): """Test that BRAF concept_id shows xref matches.""" # Search - resp = query_handler.search("ncbigene:673", keyed=True) + resp = query_handler.search("ncbigene:673") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL].records) == 0 assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID - resp = query_handler.search("hgnc:1097", keyed=True) + resp = query_handler.search("hgnc:1097") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF - resp = query_handler.search("ensembl:ENSG00000157764", keyed=True) + resp = query_handler.search("ensembl:ENSG00000157764") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF @@ -1236,21 +1225,21 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): """Test that ABL1 concept_id shows xref matches.""" # Search - resp = query_handler.search("ncbigene:25", keyed=True) + resp = query_handler.search("ncbigene:25") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL].records) == 0 assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID - resp = query_handler.search("hgnc:76", keyed=True) + resp = query_handler.search("hgnc:76") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF - resp = query_handler.search("ensembl:ENSG00000097007", keyed=True) + resp = query_handler.search("ensembl:ENSG00000097007") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF @@ -1507,10 +1496,8 @@ def test_invalid_queries(query_handler): resp["match_type"] resp = query_handler.search("B R A F") - assert len(resp.source_matches[0].records) == 0 - - with pytest.raises(TypeError): - resp.source_matches[0].records["match_type"] + records = [r for matches in resp.source_matches.values() for r in matches.records] + assert len(records) == 0 def test_service_meta(query_handler):