From 8e328ed90e7e7651fe998e84c71cfaf1baca90f4 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Tue, 2 Jan 2024 08:58:04 -0500 Subject: [PATCH] pls commit --- src/gene/cli.py | 7 + src/gene/database/database.py | 35 +- src/gene/database/dynamodb.py | 143 ++-- src/gene/database/postgresql.py | 267 +++--- src/gene/database/postgresql/add_fkeys.sql | 2 - src/gene/database/postgresql/add_indexes.sql | 2 - .../postgresql/create_record_lookup_view.sql | 6 - .../database/postgresql/create_tables.sql | 6 - .../postgresql/delete_normalized_concepts.sql | 1 - src/gene/database/postgresql/drop_fkeys.sql | 1 - src/gene/database/schemas.py | 137 +++ src/gene/etl/base.py | 78 +- src/gene/etl/ensembl.py | 143 ++-- src/gene/etl/hgnc.py | 72 +- src/gene/etl/merge.py | 72 +- src/gene/etl/ncbi.py | 110 ++- src/gene/main.py | 38 +- src/gene/query.py | 779 +++++------------- src/gene/schemas.py | 740 ++++++----------- tests/__init__.py | 1 + tests/unit/test_emit_warnings.py | 10 +- tests/unit/test_query.py | 2 +- 22 files changed, 1090 insertions(+), 1562 deletions(-) create mode 100644 src/gene/database/schemas.py create mode 100644 tests/__init__.py diff --git a/src/gene/cli.py b/src/gene/cli.py index b3ef3ee5..63e3508f 100644 --- a/src/gene/cli.py +++ b/src/gene/cli.py @@ -1,4 +1,5 @@ """Provides a CLI util to make updates to normalizer database.""" +import logging import os from pathlib import Path from typing import Optional, Tuple @@ -10,6 +11,12 @@ from gene.etl.update import update_all_sources, update_normalized, update_source from gene.schemas import SourceName +# TODO testing this out +logging.basicConfig() +_logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) + + url_description = 'URL endpoint for the application database. Can either be a URL to a local DynamoDB server (e.g. "http://localhost:8001") or a libpq-compliant PostgreSQL connection description (e.g. "postgresql://postgres:password@localhost:5432/gene_normalizer").' diff --git a/src/gene/database/database.py b/src/gene/database/database.py index bfe4179a..73accb2e 100644 --- a/src/gene/database/database.py +++ b/src/gene/database/database.py @@ -4,11 +4,12 @@ from enum import Enum from os import environ from pathlib import Path -from typing import Any, Dict, Generator, List, Optional, Set, Union +from typing import Any, Generator, List, Optional, Set, Union import click -from gene.schemas import RecordType, RefType, SourceMeta, SourceName +from gene.database.schemas import StoredGene +from gene.schemas import Gene, RecordType, RefType, SourceMeta, SourceName class DatabaseError(Exception): @@ -107,28 +108,28 @@ def initialize_db(self) -> None: """ @abc.abstractmethod - def get_source_metadata(self, src_name: Union[str, SourceName]) -> Dict: + def get_source_metadata(self, src_name: Union[str, SourceName]) -> SourceMeta: """Get license, versioning, data lookup, etc information for a source. :param src_name: name of the source to get data for + :return: structured metadata object + :raise DatabaseReadError: if unable to find metadata for source """ @abc.abstractmethod def get_record_by_id( - self, concept_id: str, case_sensitive: bool = True, merge: bool = False - ) -> Optional[Dict]: + self, concept_id: str, case_sensitive: bool = True + ) -> Optional[Gene]: """Fetch record corresponding to provided concept ID :param concept_id: concept ID for gene record :param case_sensitive: if true, performs exact lookup, which may be quicker. Otherwise, performs filter operation, which doesn't require correct casing. - :param merge: if true, look for merged record; look for identity - record otherwise. :return: complete gene record, if match is found; None otherwise """ @abc.abstractmethod - def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: + def get_ids_by_ref(self, search_term: str, ref_type: RefType) -> List[str]: """Retrieve concept IDs for records matching the user's query. Other methods are responsible for actually retrieving full records. @@ -137,6 +138,10 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: :return: list of associated concept IDs. Empty if lookup fails. """ + @abc.abstractmethod + def get_normalized_record(self, concept_id: str) -> Optional[Gene]: + """TODO""" + @abc.abstractmethod def get_all_concept_ids(self) -> Set[str]: """Retrieve all available concept IDs for use in generating normalized records. @@ -145,7 +150,7 @@ def get_all_concept_ids(self) -> Set[str]: """ @abc.abstractmethod - def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None]: + def get_all_records(self, record_type: RecordType) -> Generator[Gene, None, None]: """Retrieve all source or normalized records. Either return all source records, or all records that qualify as "normalized" (i.e., merged groups + source records that are otherwise ungrouped). @@ -163,27 +168,27 @@ def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None """ @abc.abstractmethod - def add_source_metadata(self, src_name: SourceName, data: SourceMeta) -> None: + def add_source_metadata(self, src_name: SourceName, metadata: SourceMeta) -> None: """Add new source metadata entry. :param src_name: name of source - :param data: known source attributes + :param metadata: known source attributes :raise DatabaseWriteError: if write fails """ @abc.abstractmethod - def add_record(self, record: Dict, src_name: SourceName) -> None: + def add_record(self, gene: StoredGene, src_name: SourceName) -> None: """Add new record to database. - :param record: record to upload + :param record: source gene record to upload :param src_name: name of source for record. """ @abc.abstractmethod - def add_merged_record(self, record: Dict) -> None: + def add_merged_record(self, merged_gene: StoredGene) -> None: """Add merged record to database. - :param record: merged record to add + :param merged_gene: merged gene record to add """ @abc.abstractmethod diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py index 494d977a..cf9c1a37 100644 --- a/src/gene/database/dynamodb.py +++ b/src/gene/database/dynamodb.py @@ -22,9 +22,10 @@ DatabaseWriteError, confirm_aws_db_use, ) +from gene.database.schemas import StoredGene, convert_to_gene from gene.schemas import ( ITEM_TYPES, - PREFIX_LOOKUP, + Gene, RecordType, RefType, SourceMeta, @@ -95,7 +96,7 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None: self.genes = self.dynamodb.Table(self.gene_table) self.batch = self.genes.batch_writer() - self._cached_sources = {} + self._cached_sources: Dict[str, SourceMeta] = {} atexit.register(self.close_connection) def list_tables(self) -> List[str]: @@ -210,10 +211,12 @@ def initialize_db(self) -> None: if not self.check_schema_initialized(): self._create_genes_table() - def get_source_metadata(self, src_name: Union[str, SourceName]) -> Dict: + def get_source_metadata(self, src_name: Union[str, SourceName]) -> SourceMeta: """Get license, versioning, data lookup, etc information for a source. :param src_name: name of the source to get data for + :return: structured metadata object + :raise DatabaseReadError: if unable to find metadata for source """ if isinstance(src_name, SourceName): src_name = src_name.value @@ -225,51 +228,53 @@ def get_source_metadata(self, src_name: Union[str, SourceName]) -> Dict: metadata = self.genes.get_item( Key={"label_and_type": pk, "concept_id": concept_id} ).get("Item") + structured_metadata = SourceMeta(**metadata) if not metadata: raise DatabaseReadError( f"Unable to retrieve data for source {src_name}" ) - self._cached_sources[src_name] = metadata - return metadata + self._cached_sources[src_name] = structured_metadata + return structured_metadata + + @staticmethod + def _get_gene_from_record(record: Dict) -> Gene: + """TODO""" def get_record_by_id( - self, concept_id: str, case_sensitive: bool = True, merge: bool = False - ) -> Optional[Dict]: + self, concept_id: str, case_sensitive: bool = True + ) -> Optional[Gene]: """Fetch record corresponding to provided concept ID - :param str concept_id: concept ID for gene record - :param bool case_sensitive: if true, performs exact lookup, which is more + :param concept_id: concept ID for gene record + :param case_sensitive: if true, performs exact lookup, which is more efficient. Otherwise, performs filter operation, which doesn't require correct casing. - :param bool merge: if true, look for merged record; look for identity record - otherwise. :return: complete gene record, if match is found; None otherwise """ + pk = f"{concept_id.lower()}##{RecordType.IDENTITY.value}" try: - if merge: - pk = f"{concept_id.lower()}##{RecordType.MERGER.value}" - else: - pk = f"{concept_id.lower()}##{RecordType.IDENTITY.value}" if case_sensitive: match = self.genes.get_item( Key={"label_and_type": pk, "concept_id": concept_id} ) - return match["Item"] + result = match["Item"] else: exp = Key("label_and_type").eq(pk) response = self.genes.query(KeyConditionExpression=exp) - record = response["Items"][0] - del record["label_and_type"] - return record + result = response["Items"][0] + except (KeyError, IndexError): # record doesn't exist + return None except ClientError as e: _logger.error( f"boto3 client error on get_records_by_id for search term {concept_id}: {e.response['Error']['Message']}" ) return None - except (KeyError, IndexError): # record doesn't exist - return None - def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: + result_parsed = StoredGene(**result) + result_formatted = convert_to_gene(result_parsed) + return result_formatted + + def get_ids_by_ref(self, search_term: str, ref_type: RefType) -> List[str]: """Retrieve concept IDs for records matching the user's query. Other methods are responsible for actually retrieving full records. @@ -288,6 +293,31 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: ) return [] + def get_normalized_record(self, concept_id: str) -> Optional[Gene]: + """TODO""" + result = self.genes.get_item( + Key={ + "label_and_type": f"{concept_id.lower()}##{RecordType.IDENTITY.value}", + "concept_id": concept_id, + } + ) + if "Item" not in result: + return None + record = result["Item"] + if "normalized_id" in record: + normalized_id = record["normalized_id"] + result = self.genes.get_item( + Key={ + "label_and_type": f"normalize.gene.{normalized_id.lower()}##{RecordType.MERGER.value}", + "concept_id": normalized_id, + } + ) + if "Item" not in result: + _logger.error("Broken merge ref to % in %.", normalized_id, concept_id) + return None + record = result["Item"] + return record + def get_all_concept_ids(self) -> Set[str]: """Retrieve concept IDs for use in generating normalized records. @@ -313,7 +343,8 @@ def get_all_concept_ids(self) -> Set[str]: break return set(concept_ids) - def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None]: + # TODO not done + def get_all_records(self, record_type: RecordType) -> Generator[Gene, None, None]: """Retrieve all source or normalized records. Either return all source records, or all records that qualify as "normalized" (i.e., merged groups + source records that are otherwise ungrouped). @@ -361,7 +392,7 @@ def add_source_metadata(self, src_name: SourceName, metadata: SourceMeta) -> Non :raise DatabaseWriteError: if write fails """ src_name_value = src_name.value - metadata_item = metadata.model_dump() + metadata_item = metadata.model_dump(mode="json", exclude_none=True) metadata_item["src_name"] = src_name_value metadata_item["label_and_type"] = f"{str(src_name_value).lower()}##source" metadata_item["concept_id"] = f"source:{str(src_name_value).lower()}" @@ -371,72 +402,68 @@ def add_source_metadata(self, src_name: SourceName, metadata: SourceMeta) -> Non except ClientError as e: raise DatabaseWriteError(e) - def add_record(self, record: Dict, src_name: SourceName) -> None: + def add_record(self, gene: StoredGene, src_name: SourceName) -> None: """Add new record to database. - :param Dict record: record to upload - :param SourceName src_name: name of source for record + :param record: source gene record to upload + :param src_name: name of source for record. """ - concept_id = record["concept_id"] - record["src_name"] = src_name.value - label_and_type = f"{concept_id.lower()}##identity" - record["label_and_type"] = label_and_type - record["item_type"] = "identity" + db_record = gene.model_dump(mode="json", exclude_none=True) + concept_id = gene.concept_id + db_record["src_name"] = src_name.value + db_record["label_and_type"] = f"{concept_id.lower()}##identity" + db_record["item_type"] = RecordType.IDENTITY try: - self.batch.put_item(Item=record) + self.batch.put_item(Item=db_record) except ClientError as e: _logger.error( f"boto3 client error on add_record for {concept_id}: {e.response['Error']['Message']}" ) - for attr_type, item_type in ITEM_TYPES.items(): - if attr_type in record: - value = record.get(attr_type) + for attr_type in ITEM_TYPES.keys(): + if attr_type in db_record: + value = db_record[attr_type] if not value: continue + ref_type = RefType[attr_type.upper()] if isinstance(value, str): - items = [value.lower()] + self._add_ref_record(value, concept_id, ref_type, src_name) else: - items = {item.lower() for item in value} - for item in items: - self._add_ref_record( - item, record["concept_id"], item_type, src_name - ) + for item in {v.lower() for v in value}: + self._add_ref_record(item, concept_id, ref_type, src_name) - def add_merged_record(self, record: Dict) -> None: + def add_merged_record(self, merged_gene: StoredGene) -> None: """Add merged record to database. - :param record: merged record to add + :param merged_gene: merged gene record to add """ - concept_id = record["concept_id"] - id_prefix = concept_id.split(":")[0].lower() - record["src_name"] = PREFIX_LOOKUP[id_prefix] - label_and_type = f"{concept_id.lower()}##{RecordType.MERGER.value}" - record["label_and_type"] = label_and_type - record["item_type"] = RecordType.MERGER.value + db_record = merged_gene.model_dump(mode="json", exclude_none=True) + concept_id = db_record["concept_id"] + db_record["label_and_type"] = f"{concept_id.lower()}##{RecordType.MERGER.value}" + db_record["item_type"] = RecordType.MERGER.value + try: - self.batch.put_item(Item=record) + self.batch.put_item(Item=db_record) except ClientError as e: _logger.error( f"boto3 client error on add_record for {concept_id}: {e.response['Error']['Message']}" ) def _add_ref_record( - self, term: str, concept_id: str, ref_type: str, src_name: SourceName + self, term: str, concept_id: str, ref_type: RefType, src_name: SourceName ) -> None: """Add auxiliary/reference record to database. - :param str term: referent term - :param str concept_id: concept ID to refer to - :param str ref_type: one of {'alias', 'label', 'xref', - 'associated_with'} + :param term: referent term + :param concept_id: concept ID to refer to + :param ref_type: type of reference :param src_name: name of source for record """ - label_and_type = f"{term.lower()}##{ref_type}" + label_and_type = f"{term.lower()}##{ref_type.value}" record = { "label_and_type": label_and_type, "concept_id": concept_id.lower(), "src_name": src_name.value, - "item_type": ref_type, + "item_type": ref_type.value, } try: self.batch.put_item(Item=record) diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py index 9b5967d0..52a4e070 100644 --- a/src/gene/database/postgresql.py +++ b/src/gene/database/postgresql.py @@ -7,7 +7,7 @@ import tempfile from datetime import datetime from pathlib import Path -from typing import Any, Dict, Generator, List, Optional, Set, Tuple +from typing import Any, Dict, Generator, List, Optional, Set, Tuple, Union import click import psycopg @@ -25,7 +25,15 @@ DatabaseReadError, DatabaseWriteError, ) -from gene.schemas import RecordType, RefType, SourceMeta, SourceName +from gene.database.schemas import StoredGene +from gene.schemas import ( + DataLicenseAttributes, + Gene, + RecordType, + RefType, + SourceMeta, + SourceName, +) _logger = logging.getLogger(__name__) @@ -97,7 +105,6 @@ def list_tables(self) -> List[str]: _drop_db_query = b""" DROP MATERIALIZED VIEW IF EXISTS record_lookup_view; DROP TABLE IF EXISTS - gene_associations, gene_symbols, gene_previous_symbols, gene_aliases, @@ -272,7 +279,7 @@ def _create_tables(self) -> None: cur.execute(tables_query) self.conn.commit() - def get_source_metadata(self, src_name: SourceName) -> Dict: + def get_source_metadata(self, src_name: Union[str, SourceName]) -> SourceMeta: """Get license, versioning, data lookup, etc information for a source. :param src_name: name of the source to get data for @@ -289,19 +296,19 @@ def get_source_metadata(self, src_name: SourceName) -> Dict: metadata_result = cur.fetchone() if not metadata_result: raise DatabaseReadError(f"{src_name} metadata lookup failed") - metadata = { - "data_license": metadata_result[1], - "data_license_url": metadata_result[2], - "version": metadata_result[3], - "data_url": metadata_result[4], - "rdp_url": metadata_result[5], - "data_license_attributes": { - "non_commercial": metadata_result[6], - "attribution": metadata_result[7], - "share_alike": metadata_result[8], - }, - "genome_assemblies": metadata_result[9], - } + metadata = SourceMeta( + data_license=metadata_result[1], + data_license_url=metadata_result[2], + version=metadata_result[3], + data_url=metadata_result[4], + rdp_url=metadata_result[5], + data_license_attributes=DataLicenseAttributes( + non_commercial=metadata_result[6], + attribution=metadata_result[7], + share_alike=metadata_result[8], + ), + genome_assemblies=metadata_result[9], + ) self._cached_sources[src_name] = metadata return metadata @@ -309,48 +316,33 @@ def get_source_metadata(self, src_name: SourceName) -> Dict: b"SELECT * FROM record_lookup_view WHERE lower(concept_id) = %s;" # noqa: E501 ) - def _format_source_record(self, source_row: Tuple) -> Dict: + def _format_source_record(self, source_row: Tuple) -> Gene: """Restructure row from gene_concepts table as source record result object. :param source_row: result tuple from psycopg :return: reformatted dictionary keying gene properties to row values """ - gene_record = { - "concept_id": source_row[0], - "symbol_status": source_row[1], - "label": source_row[2], - "strand": source_row[3], - "location_annotations": source_row[4], - "locations": source_row[5], - "gene_type": source_row[6], - "aliases": source_row[7], - "associated_with": source_row[8], - "previous_symbols": source_row[9], - "symbol": source_row[10], - "xrefs": source_row[11], - "src_name": source_row[12], - "merge_ref": source_row[13], - "item_type": RecordType.IDENTITY.value, - } - return {k: v for k, v in gene_record.items() if v} - - def _get_record(self, concept_id: str, case_sensitive: bool) -> Optional[Dict]: - """Retrieve non-merged record. The query is pretty different, so this method - is broken out for PostgreSQL. - - :param concept_id: ID of concept to get - :param case_sensitive: record lookups are performed using a case-insensitive - index, so this parameter isn't used by Postgres - :return: complete record object if successful - """ - concept_id_param = concept_id.lower() - - with self.conn.cursor() as cur: - cur.execute(self._get_record_query, [concept_id_param]) - result = cur.fetchone() - if not result: - return None - return self._format_source_record(result) + # return Gene( + # id=source_row[0], + # # SymbolStatusExtension + # ) + # gene_record = { + # "concept_id": source_row[0], + # "symbol_status": source_row[1], + # "label": source_row[2], + # "strand": source_row[3], + # "location_annotations": source_row[4], + # "locations": source_row[5], + # "gene_type": source_row[6], + # "aliases": source_row[7], + # "previous_symbols": source_row[9], # TODO + # "symbol": source_row[10], + # "xrefs": source_row[11], + # "src_name": source_row[12], + # "merge_ref": source_row[13], + # "item_type": RecordType.IDENTITY.value, + # } + # return {k: v for k, v in gene_record.items() if v} def _format_merged_record(self, merged_row: Tuple) -> Dict: """Restructure row from gene_merged table as normalized result object. @@ -373,8 +365,7 @@ def _format_merged_record(self, merged_row: Tuple) -> Dict: "hgnc_locus_type": merged_row[11], "ncbi_gene_type": merged_row[12], "aliases": merged_row[13], - "associated_with": merged_row[14], - "xrefs": merged_row[15], + "xrefs": merged_row[15], # TODO "item_type": RecordType.MERGER.value, } return {k: v for k, v in merged_record.items() if v} @@ -402,29 +393,30 @@ def _get_merged_record( return self._format_merged_record(result) def get_record_by_id( - self, concept_id: str, case_sensitive: bool = True, merge: bool = False - ) -> Optional[Dict]: + self, concept_id: str, case_sensitive: bool = True + ) -> Optional[Gene]: """Fetch record corresponding to provided concept ID - :param str concept_id: concept ID for gene record - :param bool case_sensitive: - :param bool merge: if true, look for merged record; look for identity record - otherwise. + + :param concept_id: concept ID for gene record + :param case_sensitive: not used by postgres implementation :return: complete gene record, if match is found; None otherwise """ - if merge: - return self._get_merged_record(concept_id, case_sensitive) - else: - return self._get_record(concept_id, case_sensitive) + concept_id_param = concept_id.lower() + with self.conn.cursor() as cur: + cur.execute(self._get_record_query, [concept_id_param]) + result = cur.fetchone() + if not result: + return None + return self._format_source_record(result) _ref_types_query = { RefType.SYMBOL: b"SELECT concept_id FROM gene_symbols WHERE lower(symbol) = %s;", # noqa: E501 RefType.PREVIOUS_SYMBOLS: b"SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;", # noqa: E501 RefType.ALIASES: b"SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;", # noqa: E501 RefType.XREFS: b"SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;", - RefType.ASSOCIATED_WITH: b"SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;", # noqa: E501 } - def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: + def get_ids_by_ref(self, search_term: str, ref_type: RefType) -> List[str]: """Retrieve concept IDs for records matching the user's query. Other methods are responsible for actually retrieving full records. @@ -518,11 +510,11 @@ def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None VALUES ( %s, %s, %s, %s, %s, %s, %s, %s, %s, %s ); """ - def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None: + def add_source_metadata(self, src_name: SourceName, metadata: SourceMeta) -> None: """Add new source metadata entry. :param src_name: name of source - :param meta: known source attributes + :param metadata: known source attributes :raise DatabaseWriteError: if write fails """ with self.conn.cursor() as cur: @@ -530,15 +522,15 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None: self._add_source_metadata_query, [ src_name.value, - meta.data_license, - meta.data_license_url, - meta.version, - json.dumps(meta.data_url), - meta.rdp_url, - meta.data_license_attributes["non_commercial"], - meta.data_license_attributes["attribution"], - meta.data_license_attributes["share_alike"], - meta.genome_assemblies, + metadata.data_license, + metadata.data_license_url, + metadata.version, + json.dumps(metadata.data_url), + metadata.rdp_url, + metadata.data_license_attributes.non_commercial, + metadata.data_license_attributes.attribution, + metadata.data_license_attributes.share_alike, + metadata.genome_assemblies, ], ) self.conn.commit() @@ -558,19 +550,17 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None: ) _ins_alias_query = b"INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);" _ins_xref_query = b"INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);" - _ins_assoc_query = ( - b"INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);" - ) - def add_record(self, record: Dict, src_name: SourceName) -> None: + def add_record(self, gene: StoredGene, src_name: SourceName) -> None: """Add new record to database. - :param record: record to upload - :param src_name: name of source for record. Not used by PostgreSQL instance. + :param record: source gene record to upload + :param src_name: name of source for record. """ - concept_id = record["concept_id"] - locations = [json.dumps(loc) for loc in record.get("locations", [])] - if not locations: + concept_id = gene.concept_id + if gene.locations: + locations = [json.dumps(loc) for loc in gene.locations] + else: locations = None with self.conn.cursor() as cur: try: @@ -578,25 +568,23 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: self._add_record_query, [ concept_id, - record["src_name"], - record.get("symbol_status"), - record.get("label"), - record.get("strand"), - record.get("location_annotations"), + src_name.value, + gene.symbol_status, + gene.label, + gene.strand, + gene.location_annotations, locations, - record.get("gene_type"), + gene.gene_types, # TODO now its plural ], ) - for a in record.get("aliases", []): + for a in gene.get("aliases", []): cur.execute(self._ins_alias_query, [a, concept_id]) - for x in record.get("xrefs", []): + for x in gene.get("xrefs", []): cur.execute(self._ins_xref_query, [x, concept_id]) - for a in record.get("associated_with", []): - cur.execute(self._ins_assoc_query, [a, concept_id]) - for p in record.get("previous_symbols", []): + for p in gene.get("previous_symbols", []): cur.execute(self._ins_prev_symbol_query, [p, concept_id]) - if record.get("symbol"): - cur.execute(self._ins_symbol_query, [record["symbol"], concept_id]) + if gene.get("symbol"): + cur.execute(self._ins_symbol_query, [gene["symbol"], concept_id]) self.conn.commit() except UniqueViolation: _logger.error(f"Record with ID {concept_id} already exists") @@ -612,43 +600,42 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); """ - def add_merged_record(self, record: Dict) -> None: + def add_merged_record(self, merged_gene: StoredGene) -> None: """Add merged record to database. - :param record: merged record to add + :param merged_gene: merged gene record to add """ - ensembl_locations = record.get("ensembl_locations") - if ensembl_locations: - ensembl_locations = [json.dumps(i) for i in ensembl_locations] - ncbi_locations = record.get("ncbi_locations") - if ncbi_locations: - ncbi_locations = [json.dumps(i) for i in ncbi_locations] - hgnc_locations = record.get("hgnc_locations") - if hgnc_locations: - hgnc_locations = [json.dumps(i) for i in hgnc_locations] - with self.conn.cursor() as cur: - cur.execute( - self._add_merged_record_query, - [ - record["concept_id"], - record.get("symbol"), - record.get("symbol_status"), - record.get("previous_symbols"), - record.get("label"), - record.get("strand"), - record.get("location_annotations"), - ensembl_locations, - hgnc_locations, - ncbi_locations, - record.get("hgnc_locus_type"), - record.get("ensembl_biotype"), - record.get("ncbi_gene_type"), - record.get("aliases"), - record.get("associated_with"), - record.get("xrefs"), - ], - ) - self.conn.commit() + # ensembl_locations = record.get("ensembl_locations") + # if ensembl_locations: + # ensembl_locations = [json.dumps(i) for i in ensembl_locations] + # ncbi_locations = record.get("ncbi_locations") + # if ncbi_locations: + # ncbi_locations = [json.dumps(i) for i in ncbi_locations] + # hgnc_locations = record.get("hgnc_locations") + # if hgnc_locations: + # hgnc_locations = [json.dumps(i) for i in hgnc_locations] + # with self.conn.cursor() as cur: + # cur.execute( + # self._add_merged_record_query, + # [ + # record["concept_id"], + # record.get("symbol"), + # record.get("symbol_status"), + # record.get("previous_symbols"), + # record.get("label"), + # record.get("strand"), + # record.get("location_annotations"), + # ensembl_locations, + # hgnc_locations, + # ncbi_locations, + # record.get("hgnc_locus_type"), + # record.get("ensembl_biotype"), + # record.get("ncbi_gene_type"), + # record.get("aliases"), + # record.get("xrefs"), + # ], + # ) + # self.conn.commit() _update_merge_ref_query = b""" UPDATE gene_concepts @@ -780,7 +767,7 @@ def close_connection(self) -> None: self.conn.commit() self.conn.close() - def load_from_remote(self, url: Optional[str]) -> None: + def load_from_remote(self, url: Optional[str] = None) -> None: """Load DB from remote dump. Warning: Deletes all existing data. If not passed as an argument, will try to grab latest release from VICC S3 bucket. @@ -822,7 +809,7 @@ def load_from_remote(self, url: Optional[str]) -> None: if result != 0: raise DatabaseError(f"System call '{result}' returned failing exit code.") - def export_db(self, output_directory: Path) -> None: + def export_db(self, export_location: Path) -> None: """Dump DB to specified location. :param export_location: path to directory to save DB dump in @@ -831,12 +818,12 @@ def export_db(self, output_directory: Path) -> None: :raise ValueError: if output directory isn't a directory or doesn't exist :raise DatabaseError: if psql call fails """ - if not output_directory.is_dir() or not output_directory.exists(): + if not export_location.is_dir() or not export_location.exists(): raise ValueError( - f"Output location {output_directory} isn't a directory or doesn't exist" + f"Output location {export_location} isn't a directory or doesn't exist" ) # noqa: E501 now = datetime.now().strftime("%Y%m%d%H%M%S") - output_location = output_directory / f"gene_norm_{now}.sql" + output_location = export_location / f"gene_norm_{now}.sql" user = self.conn.info.user host = self.conn.info.host port = self.conn.info.port diff --git a/src/gene/database/postgresql/add_fkeys.sql b/src/gene/database/postgresql/add_fkeys.sql index f93459b3..28e1a88f 100644 --- a/src/gene/database/postgresql/add_fkeys.sql +++ b/src/gene/database/postgresql/add_fkeys.sql @@ -1,7 +1,5 @@ ALTER TABLE gene_aliases ADD CONSTRAINT gene_aliases_concept_id_fkey FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id); -ALTER TABLE gene_associations ADD CONSTRAINT gene_associations_concept_id_fkey - FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id); ALTER TABLE gene_previous_symbols ADD CONSTRAINT gene_previous_symbols_concept_id_fkey FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id); diff --git a/src/gene/database/postgresql/add_indexes.sql b/src/gene/database/postgresql/add_indexes.sql index b96df534..805ad71b 100644 --- a/src/gene/database/postgresql/add_indexes.sql +++ b/src/gene/database/postgresql/add_indexes.sql @@ -7,7 +7,5 @@ CREATE INDEX idx_gps_symbol_low ON gene_previous_symbols (lower(prev_symbol)); CREATE INDEX idx_ga_alias_low ON gene_aliases (lower(alias)); CREATE INDEX idx_gx_xref_low ON gene_xrefs (lower(xref)); -CREATE INDEX idx_g_as_association_low - ON gene_associations (lower(associated_with)); CREATE INDEX idx_rlv_concept_id_low ON record_lookup_view (lower(concept_id)); diff --git a/src/gene/database/postgresql/create_record_lookup_view.sql b/src/gene/database/postgresql/create_record_lookup_view.sql index 1e33977f..7474a07a 100644 --- a/src/gene/database/postgresql/create_record_lookup_view.sql +++ b/src/gene/database/postgresql/create_record_lookup_view.sql @@ -7,7 +7,6 @@ SELECT gc.concept_id, gc.locations, gc.gene_type, ga.aliases, - gas.associated_with, gps.previous_symbols, gs.symbol, gx.xrefs, @@ -20,11 +19,6 @@ FULL JOIN ( FROM gene_aliases ga_1 GROUP BY ga_1.concept_id ) ga ON gc.concept_id::text = ga.concept_id::text -FULL JOIN ( - SELECT gas_1.concept_id, array_agg(gas_1.associated_with) AS associated_with - FROM gene_associations gas_1 - GROUP BY gas_1.concept_id -) gas ON gc.concept_id::text = gas.concept_id::text FULL JOIN ( SELECT gps_1.concept_id, array_agg(gps_1.prev_symbol) AS previous_symbols FROM gene_previous_symbols gps_1 diff --git a/src/gene/database/postgresql/create_tables.sql b/src/gene/database/postgresql/create_tables.sql index 83198199..9100e553 100644 --- a/src/gene/database/postgresql/create_tables.sql +++ b/src/gene/database/postgresql/create_tables.sql @@ -26,7 +26,6 @@ CREATE TABLE gene_merged ( hgnc_locus_type TEXT [], ncbi_gene_type TEXT [], aliases TEXT [], - associated_with TEXT [], xrefs TEXT [] ); CREATE TABLE gene_concepts ( @@ -60,8 +59,3 @@ CREATE TABLE gene_xrefs ( xref TEXT NOT NULL, concept_id VARCHAR(127) NOT NULL REFERENCES gene_concepts (concept_id) ); -CREATE TABLE gene_associations ( - id SERIAL PRIMARY KEY, - associated_with TEXT NOT NULL, - concept_ID VARCHAR(127) NOT NULL REFERENCES gene_concepts (concept_id) -); diff --git a/src/gene/database/postgresql/delete_normalized_concepts.sql b/src/gene/database/postgresql/delete_normalized_concepts.sql index 5141c841..e5e1bdce 100644 --- a/src/gene/database/postgresql/delete_normalized_concepts.sql +++ b/src/gene/database/postgresql/delete_normalized_concepts.sql @@ -19,7 +19,6 @@ CREATE TABLE gene_merged ( hgnc_locus_type TEXT [], ncbi_gene_type TEXT [], aliases TEXT [], - associated_with TEXT [], xrefs TEXT [] ); ALTER TABLE gene_concepts ADD CONSTRAINT gene_concepts_merge_ref_fkey diff --git a/src/gene/database/postgresql/drop_fkeys.sql b/src/gene/database/postgresql/drop_fkeys.sql index f804ca1e..ba2aeef5 100644 --- a/src/gene/database/postgresql/drop_fkeys.sql +++ b/src/gene/database/postgresql/drop_fkeys.sql @@ -1,5 +1,4 @@ ALTER TABLE gene_aliases DROP CONSTRAINT gene_aliases_concept_id_fkey; -ALTER TABLE gene_associations DROP CONSTRAINT gene_associations_concept_id_fkey; ALTER TABLE gene_previous_symbols DROP CONSTRAINT gene_previous_symbols_concept_id_fkey; ALTER TABLE gene_symbols DROP CONSTRAINT gene_symbols_concept_id_fkey; diff --git a/src/gene/database/schemas.py b/src/gene/database/schemas.py new file mode 100644 index 00000000..3b119b70 --- /dev/null +++ b/src/gene/database/schemas.py @@ -0,0 +1,137 @@ +"""Provide schemas for database storage, as well as associated helper methods.""" +from typing import List, Optional + +from ga4gh.core._internal.models import Mapping +from ga4gh.vrs._internal.models import SequenceLocation, SequenceReference +from pydantic import BaseModel, ConfigDict, StrictInt, StrictStr, constr + +from gene.schemas import ( + ApprovedNameExtension, + Gene, + GeneTypeExtension, + GeneTypeExtensionName, + LocationAnnotationsExtension, + PreviousSymbolsExtension, + SequenceLocationExtension, + SequenceLocationExtensionName, + Strand, + SymbolStatus, + SymbolStatusExtension, +) + + +class StoredSequenceLocation(BaseModel): + """Flattened SequenceLocation object for easier storage.""" + + name: SequenceLocationExtensionName + start: StrictInt + end: StrictInt + sequence_id: constr(pattern=r"^ga4gh:SQ.[0-9A-Za-z_\-]{32}$") + + +# class GeneChromosomeLocation(BaseModel): +# """Chromosome Location model when storing in DynamDB.""" + +# type: Literal["ChromosomeLocation"] = "ChromosomeLocation" +# species_id: Literal["taxonomy:9606"] = "taxonomy:9606" +# chr: StrictStr +# start: StrictStr +# end: StrictStr + + +class StoredGeneType(BaseModel): + """Flattened gene type extension object for easier storage.""" + + name: GeneTypeExtensionName + value: StrictStr + + +class StoredGene(BaseModel): + """Flatted gene object for easier storage. + + The full GA4GH core Gene object is quite verbose and includes a lot of redundant + information. This is fine for machine-readable output, but unnecessary/unwieldy + when storing in a DB. This class represents the minimum information in a flatter + structure, and should be expanded out to a full gene object by database + implementation classes. + """ + + concept_id: StrictStr + label: Optional[StrictStr] = None # symbol + aliases: Optional[List[str]] = None + xrefs: Optional[List[str]] = None + + symbol_status: Optional[SymbolStatus] = None + approved_name: Optional[StrictStr] = None # full name + previous_symbols: Optional[List[StrictStr]] = None + strand: Optional[Strand] = None + location_annotations: Optional[List[StrictStr]] = None + locations: Optional[List[StoredSequenceLocation]] = None + gene_types: Optional[List[StoredGeneType]] = None + + normalized_id: Optional[str] = None + + model_config = ConfigDict(extra="forbid") + + +def convert_to_vrs_location(location: StoredSequenceLocation) -> SequenceLocation: + """Convert collapsed DB sequence location object to valid VRS SequenceLocation. + + :param location: stored location object + :return: Corresponding VRS location + """ + refget_ac = location.sequence_id.split("ga4gh:")[-1] + return SequenceLocation( + sequenceReference=SequenceReference(refgetAccession=refget_ac), + start=location.start, + end=location.end, + ) + + +def convert_to_gene(stored_gene: StoredGene) -> Gene: + """Convert gene from stored format to GA4GH core Gene object. + + :param stored_gene: gene record, as retrieved from database + :return: equivalent Gene object + """ + mappings = [] + if stored_gene.xrefs: + for xref in stored_gene.xrefs: + split = xref.split(":", maxsplit=1) + mappings.append( + Mapping( + relation="relatedMatch", + coding={"system": split[0], "code": split[1]}, + ) + ) + + extensions = [] + if stored_gene.symbol_status: + extensions.append(SymbolStatusExtension(value=stored_gene.symbol_status)) + if stored_gene.approved_name: + extensions.append(ApprovedNameExtension(value=stored_gene.approved_name)) + if stored_gene.previous_symbols: + extensions.append(PreviousSymbolsExtension(value=stored_gene.previous_symbols)) + if stored_gene.strand: + extensions.append(PreviousSymbolsExtension(value=stored_gene.strand)) + if stored_gene.location_annotations: + extensions += LocationAnnotationsExtension( + value=stored_gene.location_annotations + ) + if stored_gene.locations: + locations = [convert_to_vrs_location(loc) for loc in stored_gene.locations] + extensions += SequenceLocationExtension(value=locations) + if stored_gene.gene_types: + gene_types = [ + GeneTypeExtension(**gt.model_dump()) for gt in stored_gene.gene_types + ] + extensions += gene_types + + gene = Gene( + id=stored_gene.concept_id, + label=stored_gene.label, + aliases=stored_gene.aliases, + extensions=extensions, + mappings=mappings, + ) + return gene diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py index 93804f35..3e31f962 100644 --- a/src/gene/etl/base.py +++ b/src/gene/etl/base.py @@ -12,7 +12,12 @@ from wags_tails import EnsemblData, HgncData, NcbiGeneData from gene.database import AbstractDatabase -from gene.schemas import ITEM_TYPES, Gene, GeneSequenceLocation, MatchType, SourceName +from gene.database.schemas import StoredGene, StoredSequenceLocation +from gene.schemas import ( + ITEM_TYPES, + SequenceLocationExtensionName, + SourceName, +) _logger = logging.getLogger(__name__) @@ -117,27 +122,21 @@ def _load_gene(self, gene: Dict) -> None: :param gene: Gene record """ + for attr_type in ITEM_TYPES: + if attr_type in gene: + value = gene[attr_type] + if value is None or value == []: + del gene[attr_type] + elif isinstance(value, str): + continue + gene[attr_type] = list(set(value)) try: - assert Gene(match_type=MatchType.NO_MATCH, **gene) + stored_gene = StoredGene(**gene) except pydantic.ValidationError as e: - _logger.warning(f"Unable to load {gene} due to validation error: " f"{e}") + _logger.warning(f"Unable to load {gene} due to validation error: {e}") else: - concept_id = gene["concept_id"] - gene["label_and_type"] = f"{concept_id.lower()}##identity" - gene["src_name"] = self._src_name.value - gene["item_type"] = "identity" - - for attr_type in ITEM_TYPES: - if attr_type in gene: - value = gene[attr_type] - if value is None or value == []: - del gene[attr_type] - elif isinstance(value, str): - continue - gene[attr_type] = list(set(value)) - - self._database.add_record(gene, self._src_name) - self._processed_ids.append(concept_id) + self._database.add_record(stored_gene, self._src_name) + self._processed_ids.append(stored_gene.concept_id) def get_seqrepo(self, seqrepo_dir: Path) -> SeqRepo: """Return SeqRepo instance if seqrepo_dir exists. @@ -224,32 +223,41 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]: _logger.warning(f"SeqRepo raised KeyError: {e}") return aliases - def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict: + def _get_sequence_location( + self, seq_id: str, gene: Feature, concept_id: str + ) -> Optional[StoredSequenceLocation]: """Get a gene's GeneSequenceLocation. :param seq_id: The sequence ID. :param gene: A gene from the source file. - :param params: The transformed gene record. - :return: A dictionary of a GA4GH VRS SequenceLocation, if seq_id alias found. - Else, empty dictionary + :param concept_id: record ID from source + :return: Storeable representation of a GA4GH VRS SequenceLocation, if seq_id + alias found. Else, empty dictionary. """ - location = {} aliases = self._get_seq_id_aliases(seq_id) - if not aliases: - return location + if not aliases or gene.start is None or gene.end is None: + return None sequence = aliases[0] if gene.start != "." and gene.end != "." and sequence: - if 0 <= gene.start <= gene.end: # type: ignore - location = GeneSequenceLocation( - start=gene.start - 1, # type: ignore - end=gene.end, # type: ignore + if 0 <= gene.start <= gene.end: + if self._src_name == SourceName.NCBI: + name = SequenceLocationExtensionName.NCBI_LOCATIONS + elif self._src_name == SourceName.ENSEMBL: + name = SequenceLocationExtensionName.ENSEMBL_LOCATIONS + else: + raise ValueError( + f"Unrecognized source class for location extension: {self._src_name}" + ) + return StoredSequenceLocation( + name=name, + start=gene.start - 1, + end=gene.end, sequence_id=sequence, - ).model_dump() # type: ignore + ) else: _logger.warning( - f"{params['concept_id']} has invalid interval:" - f"start={gene.start - 1} end={gene.end}" - ) # type: ignore - return location + f"{concept_id} has invalid interval: start={gene.start - 1} end={gene.end}" + ) + return None diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py index c640b6ac..1563f961 100644 --- a/src/gene/etl/ensembl.py +++ b/src/gene/etl/ensembl.py @@ -6,8 +6,15 @@ import gffutils from gffutils.feature import Feature +from gene.database.schemas import StoredGeneType from gene.etl.base import Base, GeneNormalizerEtlError -from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand +from gene.schemas import ( + DataLicenseAttributes, + GeneTypeExtensionName, + NamespacePrefix, + SourceMeta, + Strand, +) _logger = logging.getLogger(__name__) @@ -66,22 +73,20 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict: :param accession_numbers: Accession numbers for each chromosome and scaffold :return: A gene dictionary containing data if the ID attribute exists. """ - gene = dict() - if f.strand == "-": - gene["strand"] = Strand.REVERSE.value - elif f.strand == "+": - gene["strand"] = Strand.FORWARD.value - gene["src_name"] = SourceName.ENSEMBL.value - - self._add_attributes(f, gene) - location = self._add_location(f, gene, accession_numbers) + gene_params = {} + try: + gene_params["strand"] = Strand(f.strand) + except ValueError: + pass + + self._add_attributes(f, gene_params) + location = self._get_sequence_location( + accession_numbers[f.seqid], f, gene_params["concept_id"] + ) if location: - gene["locations"] = [location] - - gene["label_and_type"] = f"{gene['concept_id'].lower()}##identity" - gene["item_type"] = "identity" + gene_params["locations"] = [location] - return gene + return gene_params def _add_attributes(self, f: Feature, gene: Dict) -> None: """Add concept_id, symbol, xrefs, and associated_with to a gene record. @@ -89,79 +94,41 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None: :param f: A gene from the data :param gene: A transformed gene record """ - attributes = { - "ID": "concept_id", - "Name": "symbol", - "description": "xrefs", - "biotype": "gene_type", - } - - for attribute in f.attributes.items(): - key = attribute[0] - - if key in attributes.keys(): - val = attribute[1] - - if len(val) == 1: - val = val[0] - if key == "ID": - if val.startswith("gene"): - val = ( - f"{NamespacePrefix.ENSEMBL.value}:" - f"{val.split(':')[1]}" - ) - - if key == "description": - gene["label"] = val.split("[")[0].strip() - if "Source:" in val: - src_name = ( - val.split("[")[-1] - .split("Source:")[-1] - .split("Acc")[0] - .split(";")[0] - ) - src_id = val.split("Acc:")[-1].split("]")[0] - if ":" in src_id: - src_id = src_id.split(":")[-1] - source = self._get_xref_associated_with(src_name, src_id) - if "xrefs" in source: - gene["xrefs"] = source["xrefs"] - elif "associated_with" in source: - gene["associated_with"] = source["associated_with"] - continue - - gene[attributes[key]] = val - - def _add_location(self, f: Feature, gene: Dict, accession_numbers: Dict) -> Dict: - """Add GA4GH SequenceLocation to a gene record. - https://vr-spec.readthedocs.io/en/1.1/terms_and_model.html#sequencelocation - - :param f: A gene from the data - :param gene: A transformed gene record - :param accession_numbers: Accession numbers for each chromosome and scaffold - :return: gene record dictionary with location added - """ - return self._get_sequence_location(accession_numbers[f.seqid], f, gene) - - def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: + for key, value in f.attributes.items(): + if key == "ID" and len(value) == 1 and value[0].startswith("gene:"): + gene["concept_id"] = f"{NamespacePrefix.ENSEMBL.value}:{value[0][5:]}" + elif key == "biotype": + gene["gene_types"] = [ + StoredGeneType(name=GeneTypeExtensionName.ENSEMBL, value=value[0]) + ] + elif key == "Name": + gene["label"] = value[0] + elif key == "description": + description = value[0] + gene["approved_name"] = description.split("[")[0].strip() + ref_pattern = r".*\[Source:(.*);Acc:(.*)\]" + match = re.findall(ref_pattern, description) + if match: + gene["xrefs"] = [self._process_xref(match[0][0], match[0][1])] + + def _process_xref(self, src_name: str, src_id: str) -> str: """Get xref or associated_with concept. :param src_name: Source name :param src_id: The source's accession number - :return: A dict containing an other identifier or xref + :return: Formatted xref + :raise ValueError: if `src_name` isn't a recognized source name """ - source = dict() - if src_name.startswith("HGNC"): - source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"] - elif src_name.startswith("NCBI"): - source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"] - elif src_name.startswith("UniProt"): - source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"] - elif src_name.startswith("miRBase"): - source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"] - elif src_name.startswith("RFAM"): - source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"] - return source + for pattern, prefix in ( + ("HGNC", NamespacePrefix.HGNC), + ("NCBI", NamespacePrefix.NCBI), + ("UniProt", NamespacePrefix.UNIPROT), + ("miRBase", NamespacePrefix.MIRBASE), + ("RFAM", NamespacePrefix.RFAM), + ): + if src_name.startswith(pattern): + return f"{prefix.value}:{src_id}" + raise ValueError(f"Unrecognized source name: {src_name}") def _add_meta(self) -> None: """Add Ensembl metadata. @@ -181,11 +148,11 @@ def _add_meta(self) -> None: "genome_annotations": f"ftp://ftp.ensembl.org/pub/release-{self._version}/gff3/homo_sapiens/Homo_sapiens.{self._assembly}.{self._version}.gff3.gz" }, rdp_url=None, - data_license_attributes={ - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, + data_license_attributes=DataLicenseAttributes( + non_commercial=False, + share_alike=False, + attribution=False, + ), genome_assemblies=[self._assembly], ) diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py index 39e20ffc..3110f56d 100644 --- a/src/gene/etl/hgnc.py +++ b/src/gene/etl/hgnc.py @@ -2,13 +2,15 @@ import json import logging import re -from typing import Dict +from typing import Dict, List +from gene.database.schemas import StoredGeneType from gene.etl.base import Base, GeneNormalizerEtlError from gene.schemas import ( - PREFIX_LOOKUP, Annotation, Chromosome, + DataLicenseAttributes, + GeneTypeExtensionName, NamespacePrefix, SourceMeta, SourceName, @@ -32,17 +34,13 @@ def _transform_data(self) -> None: for r in records: gene = dict() gene["concept_id"] = r["hgnc_id"].lower() - gene["label_and_type"] = f"{gene['concept_id']}##identity" - gene["item_type"] = "identity" - gene["symbol"] = r["symbol"] - gene["label"] = r["name"] - gene["src_name"] = SourceName.HGNC.value + gene["label"] = r["symbol"] + gene["approved_name"] = r["name"] if r["status"]: if r["status"] == "Approved": gene["symbol_status"] = SymbolStatus.APPROVED.value elif r["status"] == "Entry Withdrawn": gene["symbol_status"] = SymbolStatus.WITHDRAWN.value - gene["src_name"] = SourceName.HGNC.value # store alias, xref, associated_with, prev_symbols, location self._get_aliases(r, gene) @@ -52,8 +50,12 @@ def _transform_data(self) -> None: if "location" in r: self._get_location(r, gene) if "locus_type" in r: - gene["gene_type"] = r["locus_type"] - self._load_gene(gene) + gene["gene_types"] = [ + StoredGeneType( + name=GeneTypeExtensionName.HGNC, value=r["locus_type"] + ) + ] + self._load_gene(gene) _logger.info("HGNC data transform complete.") def _get_aliases(self, r: Dict, gene: Dict) -> None: @@ -82,15 +84,16 @@ def _get_previous_symbols(self, r: Dict, gene: Dict) -> None: prev_symbols = r["prev_symbol"] if prev_symbols: gene["previous_symbols"] = list(set(prev_symbols)) + else: # TODO + breakpoint() - def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: + def _get_xrefs_associated_with(self, hgnc_record: Dict, gene: Dict) -> None: """Store xrefs and/or associated_with refs in a gene record. - :param r: A gene record in the HGNC data file + :param record: A gene record the HGNC data file :param gene: A transformed gene record """ - xrefs = list() - associated_with = list() + xrefs = [] sources = [ "entrez_id", "ensembl_gene_id", @@ -119,7 +122,7 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: ] for src in sources: - if src in r: + if src in hgnc_record: if "-" in src: key = src.split("-")[0] elif "." in src: @@ -130,35 +133,28 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: key = src if key.upper() in NamespacePrefix.__members__: - if NamespacePrefix[key.upper()].value in PREFIX_LOOKUP.keys(): - self._get_xref_associated_with(key, src, r, xrefs) - else: - self._get_xref_associated_with(key, src, r, associated_with) + self._get_xref(key, src, hgnc_record, xrefs) else: _logger.warning(f"{key} not in schemas.py") if xrefs: gene["xrefs"] = xrefs - if associated_with: - gene["associated_with"] = associated_with - def _get_xref_associated_with( - self, key: str, src: str, r: Dict, src_type: Dict - ) -> None: + def _get_xref(self, key: str, src: str, hgnc_record: Dict, xrefs: List) -> None: """Add an xref or associated_with ref to a gene record. :param key: The source's name :param src: HGNC's source field - :param r: A gene record in the HGNC data file - :param src_type: Either xrefs or associated_with list + :param hgnc_record: A gene record in the HGNC data file + :param xrefs: in-progress list of xrefs from the HGNC record """ - if isinstance(r[src], list): - for xref in r[src]: - src_type.append(f"{NamespacePrefix[key.upper()].value}:{xref}") + if isinstance(hgnc_record[src], list): + for xref in hgnc_record[src]: + xrefs.append(f"{NamespacePrefix[key.upper()].value}:{xref}") else: - if isinstance(r[src], str) and ":" in r[src]: - r[src] = r[src].split(":")[-1].strip() - src_type.append(f"{NamespacePrefix[key.upper()].value}" f":{r[src]}") + if isinstance(hgnc_record[src], str) and ":" in hgnc_record[src]: + hgnc_record[src] = hgnc_record[src].split(":")[-1].strip() + xrefs.append(f"{NamespacePrefix[key.upper()].value}" f":{hgnc_record[src]}") def _get_location(self, r: Dict, gene: Dict) -> None: """Store GA4GH VRS ChromosomeLocation in a gene record. @@ -211,7 +207,7 @@ def _set_annotation(self, loc: str, gene: Dict) -> None: loc = loc.split(annotation)[0].strip() if not loc: return None - return loc + return loc # TODO ? def _set_location(self, loc: str, location: Dict, gene: Dict) -> None: """Set a gene's location. @@ -256,11 +252,11 @@ def _add_meta(self) -> None: "complete_set_archive": "ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" }, rdp_url=None, - data_license_attributes={ - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, + data_license_attributes=DataLicenseAttributes( + non_commercial=False, + share_alike=False, + attribution=False, + ), genome_assemblies=[], ) self._database.add_source_metadata(SourceName.HGNC, metadata) diff --git a/src/gene/etl/merge.py b/src/gene/etl/merge.py index d065be73..ea8b4e10 100644 --- a/src/gene/etl/merge.py +++ b/src/gene/etl/merge.py @@ -1,11 +1,18 @@ """Create concept groups and merged records.""" import logging from timeit import default_timer as timer -from typing import Dict, Optional, Set, Tuple +from typing import Dict, List, Optional, Set, Tuple + +from ga4gh.core._internal.models import Gene from gene.database import AbstractDatabase from gene.database.database import DatabaseWriteError -from gene.schemas import GeneTypeFieldName, RecordType, SourcePriority +from gene.schemas import ( + PREFIX_LOOKUP, + GeneTypeExtensionName, + RecordType, + SourcePriority, +) _logger = logging.getLogger(__name__) @@ -117,7 +124,7 @@ def _generate_merged_record(self, record_id_set: Set[str]) -> Dict: :param record_id_set: group of concept IDs :return: completed merged drug object to be stored in DB """ - records = [] + records: List[Gene] = [] for record_id in record_id_set: record = self._database.get_record_by_id(record_id) if record: @@ -128,32 +135,49 @@ def _generate_merged_record(self, record_id_set: Set[str]) -> Dict: f"record for {record_id} in {record_id_set}" ) - def record_order(record: Dict) -> Tuple: + def _record_order(record: Gene) -> Tuple[SourcePriority, str]: """Provide priority values of concepts for sort function.""" - src = record["src_name"].upper() - if src in SourcePriority.__members__: - source_rank = SourcePriority[src].value - else: - raise ValueError( - f"Prohibited source: {src} in concept_id " f"{record['concept_id']}" - ) - return source_rank, record["concept_id"] + concept_id: str = record.id # type: ignore + src_name = PREFIX_LOOKUP[concept_id.split(":")[0]] + priority = SourcePriority[src_name] + return (priority, concept_id) - records.sort(key=record_order) + records.sort(key=_record_order) # initialize merged record merged_attrs = { - "concept_id": records[0]["concept_id"], - "aliases": set(), - "associated_with": set(), - "previous_symbols": set(), - "hgnc_locus_type": set(), - "ncbi_gene_type": set(), - "ensembl_biotype": set(), - "strand": set(), + "concept_id": records[0].id, + "label": None, + "aliases": [], + "xrefs": [r.id for r in records[1:]], + "symbol_status": None, # TODO is this a weird way to represent this? + "approved_name": None, + "previous_symbols": [], + "strand": None, + "location_annotations": None, + "locations": [], + "gene_types": [], } - if len(records) > 1: - merged_attrs["xrefs"] = list({r["concept_id"] for r in records[1:]}) + + for record in records: + for field in ( + "aliases", + "xrefs", + "previous_symbols", + "gene_types", + "locations", + ): + attribute = record.__getattribute__(field) + if attribute: + merged_attrs[field] |= attribute + for field in ( + "label", + "symbol_status", + "approved_name", + "strand", + "location_annotations", + ): + pass # TODO # merge from constituent records set_fields = ["aliases", "associated_with", "previous_symbols", "strand"] @@ -172,7 +196,7 @@ def record_order(record: Dict) -> Tuple: gene_type = record.get("gene_type") if gene_type: - merged_field = GeneTypeFieldName[record["src_name"].upper()] + merged_field = GeneTypeExtensionName[record["src_name"].upper()] merged_attrs[merged_field] |= {gene_type} for field in set_fields + [ diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py index da5d97e8..ff86dd84 100644 --- a/src/gene/etl/ncbi.py +++ b/src/gene/etl/ncbi.py @@ -3,18 +3,21 @@ import logging import re from pathlib import Path -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union import gffutils from wags_tails import NcbiGenomeData from wags_tails.ncbi import NcbiGenePaths from gene.database import AbstractDatabase +from gene.database.schemas import StoredGeneType from gene.etl.base import SEQREPO_ROOT_DIR, Base, GeneNormalizerEtlError from gene.schemas import ( PREFIX_LOOKUP, Annotation, Chromosome, + DataLicenseAttributes, + GeneTypeExtensionName, NamespacePrefix, SourceMeta, SourceName, @@ -78,42 +81,42 @@ def _get_prev_symbols(self) -> Dict[str, str]: prev_symbols = {} for row in history: # Only interested in rows that have homo sapiens tax id - if row[0] == "9606": - if row[1] != "-": - gene_id = row[1] - if gene_id in prev_symbols.keys(): - prev_symbols[gene_id].append(row[3]) - else: - prev_symbols[gene_id] = [row[3]] + if row[0] != "9606": + continue + if row[1] != "-": + gene_id = row[1] + if gene_id in prev_symbols.keys(): + prev_symbols[gene_id].append(row[3]) else: - # Load discontinued genes - params = { - "concept_id": f"{NamespacePrefix.NCBI.value}:{row[2]}", - "symbol": row[3], - "symbol_status": SymbolStatus.DISCONTINUED.value, - } - self._load_gene(params) + prev_symbols[gene_id] = [row[3]] + else: + # Load discontinued genes + params = { + "concept_id": f"{NamespacePrefix.NCBI.value}:{row[2]}", + "label": row[3], + "symbol_status": SymbolStatus.DISCONTINUED, + } + self._load_gene(params) history_file.close() return prev_symbols - def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None: + def _add_xrefs(self, source_value: List[str], gene_params: Dict) -> None: """Add xrefs and associated_with refs to a transformed gene. - :param val: A list of source ids for a given gene - :param params: A transformed gene record + :param source_value: A list of source ids for a given gene + :param gene_params: A transformed gene record """ - params["xrefs"] = [] - params["associated_with"] = [] - for src in val: + gene_params["xrefs"] = [] + for src in source_value: src_name = src.split(":")[0].upper() src_id = src.split(":")[-1] if src_name == "GENEID": - params["concept_id"] = f"{NamespacePrefix.NCBI.value}:{src_id}" + gene_params["concept_id"] = f"{NamespacePrefix.NCBI.value}:{src_id}" elif ( src_name in NamespacePrefix.__members__ and NamespacePrefix[src_name].value in PREFIX_LOOKUP ): - params["xrefs"].append( + gene_params["xrefs"].append( f"{NamespacePrefix[src_name].value}" f":{src_id}" ) else: @@ -123,22 +126,16 @@ def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None: prefix = NamespacePrefix.IMGT_GENE_DB.value elif src_name.startswith("MIRBASE"): prefix = NamespacePrefix.MIRBASE.value - else: - prefix = None - if prefix: - params["associated_with"].append(f"{prefix}:{src_id}") else: _logger.info(f"{src_name} is not in NameSpacePrefix.") - if not params["xrefs"]: - del params["xrefs"] - if not params["associated_with"]: - del params["associated_with"] + continue + gene_params["xrefs"].append(f"{prefix}:{src_id}") - def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: + def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, Dict]: """Store genes from NCBI info file. :param prev_symbols: A dictionary of a gene's previous symbols - :return: A dictionary of gene's from the NCBI info file. + :return: A dictionary of genes from the NCBI info file. """ # open info file, skip headers info_file = open(self._info_src, "r") @@ -147,36 +144,35 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: info_genes = dict() for row in info: - params = dict() - params["concept_id"] = f"{NamespacePrefix.NCBI.value}:{row[1]}" - # get symbol - params["symbol"] = row[2] - # get aliases + params: Dict[str, Union[List, str]] = { + "concept_id": f"{NamespacePrefix.NCBI.value}:{row[1]}", + "label": row[2], + } if row[4] != "-": params["aliases"] = row[4].split("|") - else: - params["aliases"] = [] - # get associated_with + # get xrefs if row[5] != "-": - associated_with = row[5].split("|") - self._add_xrefs_associated_with(associated_with, params) + xrefs = row[5].split("|") + self._add_xrefs(xrefs, params) # get chromosome location vrs_chr_location = self._get_vrs_chr_location(row, params) if "exclude" in vrs_chr_location: # Exclude genes with multiple distinct locations (e.g. OMS) continue - if not vrs_chr_location: - vrs_chr_location = [] - params["locations"] = vrs_chr_location + if vrs_chr_location: + params["locations"] = vrs_chr_location # get label if row[8] != "-": - params["label"] = row[8] + params["approved_name"] = row[8] # add prev symbols if row[1] in prev_symbols.keys(): params["previous_symbols"] = prev_symbols[row[1]] - info_genes[params["symbol"]] = params # get type - params["gene_type"] = row[9] + params["gene_types"] = [ + StoredGeneType(name=GeneTypeExtensionName.NCBI, value=row[9]) + ] + + info_genes[params["label"]] = params return info_genes def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None: @@ -239,7 +235,7 @@ def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None: val = val[0] if key == "Dbxref": - self._add_xrefs_associated_with(val, gene) + self._add_xrefs(val, gene) elif key == "Name": gene["symbol"] = val @@ -256,7 +252,9 @@ def _get_vrs_sq_location( """ gene = db[f_id] params["strand"] = gene.strand - return self._get_sequence_location(gene.seqid, gene, params) + return self._get_sequence_location( + gene.seqid, gene, params["concept_id"] + ) # TODO type? def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: """Get xref or associated_with ref. @@ -493,11 +491,11 @@ def _add_meta(self) -> None: "assembly_file": self._assembly_url, }, rdp_url="https://reusabledata.org/ncbi-gene.html", - data_license_attributes={ - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, + data_license_attributes=DataLicenseAttributes( + non_commercial=False, + share_alike=False, + attribution=False, + ), genome_assemblies=[self._assembly], ) diff --git a/src/gene/main.py b/src/gene/main.py index 18ffa17f..b7d90568 100644 --- a/src/gene/main.py +++ b/src/gene/main.py @@ -1,5 +1,6 @@ """Main application for FastAPI""" import html +from datetime import datetime from typing import Optional from fastapi import FastAPI, HTTPException, Query @@ -10,14 +11,24 @@ from gene.schemas import ( SOURCES, NormalizeService, + NormalizeUnmergedService, SearchService, + ServiceMeta, SourceName, - UnmergedNormalizationService, ) db = create_db() query_handler = QueryHandler(db) + +def _get_service_meta() -> ServiceMeta: + """Create service metadata object + + :return: complete service metadata description + """ + return ServiceMeta(version=__version__, response_datetime=str(datetime.now())) + + description = """ The Gene Normalizer provides tools for resolving ambiguous gene references to consistently-structured, normalized terms. @@ -90,8 +101,13 @@ def search( detail=f"Unable to parse source name: {candidate_source}", ) parsed_sources.append(parsed_source) - resp = query_handler.search(html.unescape(q), sources=parsed_sources) - return resp + search_result = query_handler.search(html.unescape(q), sources=parsed_sources) + service_response = SearchService( + query=q, + results=search_result, + service_meta_=_get_service_meta(), + ) + return service_response normalize_summary = "Given query, provide merged normalized record." @@ -115,8 +131,11 @@ def normalize(q: str = Query(..., description=normalize_q_descr)) -> NormalizeSe :param str q: gene search term :return: JSON response with normalized gene concept """ - resp = query_handler.normalize(html.unescape(q)) - return resp + normalize_result = query_handler.normalize(html.unescape(q)) + normalize_response = NormalizeService( + query=q, result=normalize_result, service_meta_=_get_service_meta() + ) + return normalize_response unmerged_matches_summary = ( @@ -137,17 +156,20 @@ def normalize(q: str = Query(..., description=normalize_q_descr)) -> NormalizeSe summary=unmerged_matches_summary, operation_id="getUnmergedRecords", response_description=unmerged_response_descr, - response_model=UnmergedNormalizationService, + response_model=NormalizeUnmergedService, description=unmerged_normalize_description, tags=["Query"], ) def normalize_unmerged( q: str = Query(..., description=normalize_q_descr), -) -> UnmergedNormalizationService: +) -> NormalizeUnmergedService: """Return all individual records associated with a normalized concept. :param q: Gene search term :returns: JSON response with matching normalized record and source metadata """ - response = query_handler.normalize_unmerged(html.unescape(q)) + result = query_handler.normalize_unmerged(html.unescape(q)) + response = NormalizeUnmergedService( + query=q, service_meta_=_get_service_meta(), match=result + ) return response diff --git a/src/gene/query.py b/src/gene/query.py index 8c100446..d85594a1 100644 --- a/src/gene/query.py +++ b/src/gene/query.py @@ -1,38 +1,31 @@ """Provides methods for handling queries.""" import logging import re -from datetime import datetime -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, TypeVar +from typing import List, Optional, Tuple -from ga4gh.core import core_models, ga4gh_identify -from ga4gh.vrs import models - -from gene.database import AbstractDatabase, DatabaseReadError +from gene.database import AbstractDatabase from gene.schemas import ( ITEM_TYPES, NAMESPACE_LOOKUP, PREFIX_LOOKUP, - BaseGene, - BaseNormalizationService, + REF_TO_MATCH_MAP, Gene, - GeneTypeFieldName, - MatchesNormalized, + GeneMatch, MatchType, - NamespacePrefix, - NormalizeService, + NormalizeResult, + NormalizeUnmergedMatches, + NormalizeUnmergedResult, + QueryWarning, RecordType, RefType, - SearchService, - ServiceMeta, - SourceMeta, + ResultSourceMeta, + SearchResult, SourceName, SourcePriority, - UnmergedNormalizationService, + WarningType, ) -from gene.version import __version__ _logger = logging.getLogger(__name__) -NormService = TypeVar("NormService", bound=BaseNormalizationService) class QueryHandler: @@ -58,463 +51,169 @@ def __init__(self, database: AbstractDatabase) -> None: self.db = database @staticmethod - def _emit_warnings(query_str: str) -> List: - """Emit warnings if query contains non breaking space characters. + def _parse_query_input(raw_query: str) -> Tuple[str, List[QueryWarning]]: + """Preprocess user query: + * Strip white space + * Check for non-breaking spaces. + + Return any necessary warnings. - :param query_str: query string - :return: List of warnings + :param raw_str: raw query string + :return: updated query and list of warnings """ - warnings = [] - nbsp = re.search("\xa0| ", query_str) + warning_list = [] + parsed_query = raw_query.strip() + if raw_query != parsed_query: + warning_list.append( + QueryWarning( + type=WarningType.STRIPPED_QUERY, + description=f'Stripped query "{raw_query}" to "{parsed_query}"', + ) + ) + nbsp = re.search("\xa0| ", parsed_query) if nbsp: - warnings = [ - { - "non_breaking_space_characters": "Query contains non-breaking space characters" - } - ] + warning_list.append( + QueryWarning( + type=WarningType.NBSP, + description="Query contains non-breaking space characters", + ) + ) _logger.warning( - f"Query ({query_str}) contains non-breaking space characters." + f"Query ({parsed_query}) contains non-breaking space characters." ) - return warnings + return (parsed_query.lower(), warning_list) - @staticmethod - def _transform_sequence_location(loc: Dict) -> models.SequenceLocation: - """Transform a sequence location to VRS sequence location + def _get_sources_meta(self, sources: List[SourceName]) -> ResultSourceMeta: + """Fetch result source meta object. - :param loc: GeneSequenceLocation represented as a dict - :return: VRS sequence location + :param sources: List of requested sources + :return: structured source metadata for requested sources """ - refget_ac = loc["sequence_id"].split("ga4gh:")[-1] + params = {} + for name in sources: + meta = self.db.get_source_metadata(name) + params[name.value.lower()] = meta + return ResultSourceMeta(**params) - return models.SequenceLocation( - sequenceReference=models.SequenceReference(refgetAccession=refget_ac), - start=int(loc["start"]), - end=int(loc["end"]), - ) - - # @staticmethod - # def _transform_chromosome_location(loc: Dict) -> ChromosomeLocation: - # """Transform a chromosome location to VRS chromosome location - - # :param loc: Chromosome location - # :return: VRS chromosome location - # """ - # return ChromosomeLocation( - # species_id=loc["species_id"], - # chr=loc["chr"], - # start=loc["start"], - # end=loc["end"] - # ) - - def _transform_location(self, loc: Dict) -> Dict: - """Transform a sequence/chromosome location to VRS sequence/chromosome location - - :param loc: Sequence or Chromosome location - :return: VRS sequence or chromosome location represented as a dictionary - """ - # if loc["type"] == "SequenceLocation": - # transformed_loc = self._transform_sequence_location(loc) - # else: - # transformed_loc = self._transform_chromosome_location(loc) - # Only support sequence locations atm - transformed_loc = self._transform_sequence_location(loc) - transformed_loc.id = ga4gh_identify(transformed_loc) - return transformed_loc.model_dump(exclude_none=True) - - def _transform_locations(self, record: Dict) -> Dict: - """Transform gene locations to VRS Chromosome/Sequence Locations - - :param record: original record - :return: record with transformed locations attributes, if applicable - """ - record_locations = list() - if "locations" in record: - for loc in record["locations"]: - if loc["type"] == "SequenceLocation": - record_locations.append(self._transform_location(loc)) - record["locations"] = record_locations - return record - - def _get_src_name(self, concept_id: str) -> SourceName: - """Get source name enum from ID. - - :param concept_id: candidate concept ID string to check - :return: SourceName option - :raise: ValueError if unrecognized ID provided - """ - if concept_id.startswith(NamespacePrefix.ENSEMBL.value): - return SourceName.ENSEMBL - elif concept_id.startswith(NamespacePrefix.NCBI.value): - return SourceName.NCBI - elif concept_id.startswith(NamespacePrefix.HGNC.value): - return SourceName.HGNC - else: - raise ValueError("Invalid or unrecognized concept ID provided") - - def _add_record( - self, response: Dict[str, Dict], item: Dict, match_type: MatchType - ) -> None: - """Add individual record (i.e. Item in DynamoDB) to response object - - :param response: in-progress response object to return to client - :param item: Item retrieved from DynamoDB - :param match_type: match type for query - """ - item = self._transform_locations(item) - item["match_type"] = match_type - gene = Gene(**item) - src_name = item["src_name"] - - matches = response["source_matches"] - if src_name not in matches.keys(): - pass - elif matches[src_name] is None: - matches[src_name] = { - "records": [gene], - "source_meta_": self.db.get_source_metadata(src_name), - } - else: - matches[src_name]["records"].append(gene) - - def _fetch_record( - self, response: Dict[str, Dict], concept_id: str, match_type: MatchType - ) -> None: - """Add fetched record to response - - :param response: in-progress response object to return to client. - :param concept_id: Concept id to fetch record for. Should be all lower-case. - :param match_type: match type for record - """ - try: - match = self.db.get_record_by_id(concept_id, case_sensitive=False) - except DatabaseReadError as e: - _logger.error(f"Encountered DatabaseReadError looking up {concept_id}: {e}") - else: - if match: - self._add_record(response, match, match_type) - else: - _logger.error( - f"Unable to find expected record for {concept_id} matching as {match_type}" - ) # noqa: E501 - - def _post_process_resp(self, resp: Dict) -> Dict: - """Fill all empty source_matches slots with NO_MATCH results and - sort source records by descending `match_type`. - - :param resp: incoming response object - :return: response object with empty source slots filled with NO_MATCH results - and corresponding source metadata - """ - for src_name in resp["source_matches"].keys(): - if resp["source_matches"][src_name] is None: - resp["source_matches"][src_name] = { - "match_type": MatchType.NO_MATCH, - "records": [], - "source_meta_": self.db.get_source_metadata(src_name), - } - else: - records = resp["source_matches"][src_name]["records"] - if len(records) > 1: - records = sorted(records, key=lambda k: k.match_type, reverse=True) - return resp - - def _get_search_response(self, query: str, sources: Iterable[SourceName]) -> Dict: - """Return response as dict where key is source name and value is a list of - records. - - :param query: string to match against - :param sources: sources to match from - :return: completed response object to return to client + @staticmethod + def _get_search_queries(query: str) -> List[Tuple[str, MatchType]]: + """Construct list of individual queries and corresponding match types to + perform. + * Check if query is a CURIE from a stored source + * Check if a namespace can be inferred # TODO update warning somehow + * Check if the query is a name/reference for any other known item type + + :param query: formatted query from user + :return: List of queries to perform (search string and corresponding match type) """ - resp = { - "query": query, - "warnings": self._emit_warnings(query), - "source_matches": {source.value: None for source in sources}, - } + queries = [] if query == "": - return self._post_process_resp(resp) - query_l = query.lower() - - queries = list() - if [p for p in PREFIX_LOOKUP.keys() if query_l.startswith(p)]: - queries.append((query_l, RecordType.IDENTITY.value)) - - for prefix in [p for p in NAMESPACE_LOOKUP.keys() if query_l.startswith(p)]: - term = f"{NAMESPACE_LOOKUP[prefix].lower()}:{query_l}" + return queries + if [p for p in PREFIX_LOOKUP.keys() if query.startswith(str(p))]: + queries.append((query, RecordType.IDENTITY.value)) + for prefix in [p for p in NAMESPACE_LOOKUP.keys() if query.startswith(p)]: + term = f"{NAMESPACE_LOOKUP[prefix]}:{query}" queries.append((term, RecordType.IDENTITY.value)) - for match in ITEM_TYPES.values(): - queries.append((query_l, match)) - - matched_concept_ids = list() - for term, item_type in queries: - try: - if item_type == RecordType.IDENTITY.value: - record = self.db.get_record_by_id(term, False) - if record and record["concept_id"] not in matched_concept_ids: - self._add_record(resp, record, MatchType.CONCEPT_ID) - else: - refs = self.db.get_refs_by_type(term, RefType(item_type)) - for ref in refs: - if ref not in matched_concept_ids: - self._fetch_record(resp, ref, MatchType[item_type.upper()]) - matched_concept_ids.append(ref) - - except DatabaseReadError as e: - _logger.error( - f"Encountered DatabaseReadError looking up {item_type}" - f" {term}: {e}" - ) - continue + queries.append((query, match)) + return queries - # remaining sources get no match - return self._post_process_resp(resp) + def _perform_search_queries( + self, search_queries: List[Tuple[str, MatchType]] + ) -> List[Gene]: + """Run all prepared queries. - @staticmethod - def _get_service_meta() -> ServiceMeta: - """Return metadata about gene-normalizer service. - - :return: Service Meta + :param search_queries: list of queries (strings + match types) to perform + :return: list of all matching Genes. Should be non-redundant and ordered by + match type. """ - return ServiceMeta(version=__version__, response_datetime=str(datetime.now())) + matched_concept_ids = [] + matched_genes = [] + for term, item_type in search_queries: + if item_type == RecordType.IDENTITY.value: + record = self.db.get_record_by_id(term, False) + if record and record.id not in matched_concept_ids: + matched_concept_ids.append(record.id) + matched_genes.append(record) + else: + refs = self.db.get_ids_by_ref(term, RefType(item_type)) + for ref in refs: + if ref not in matched_concept_ids: + record = self.db.get_record_by_id(term, False) + if record and record.id not in matched_concept_ids: + matched_concept_ids.append(record.id) + matched_genes.append(record) + return matched_genes def search( - self, - query_str: str, - sources: Optional[List[SourceName]] = None, - ) -> SearchService: - """Return highest match for each source. + self, query: str, sources: Optional[List[SourceName]] = None + ) -> SearchResult: + """Return all matches for each source. >>> from gene.query import QueryHandler >>> from gene.database import create_db >>> q = QueryHandler(create_db()) >>> result = q.search("BRAF") - >>> result.source_matches[0].records[0].concept_id + >>> result.source_matches[0].records[0].concept_id # TODO update 'ncbigene:673' :param query_str: query, a string, to search for :param sources: If given, only return records from these sources - :return: SearchService class containing all matches found in sources. + :return: search response class containing all matches found in sources. """ if not sources: sources = list(SourceName.__members__.values()) - - query_str = query_str.strip() - resp = self._get_search_response(query_str, sources) - - resp["service_meta_"] = self._get_service_meta() - return SearchService(**resp) - - def _add_merged_meta(self, response: NormalizeService) -> NormalizeService: - """Add source metadata to response object. - - :param response: in-progress response object - :return: completed response object. - """ - sources_meta = {} - gene = response.gene - sources = [response.normalized_id.split(":")[0]] - if gene.mappings: - sources += [m.coding.system for m in gene.mappings] - - for src in sources: - try: - src_name = PREFIX_LOOKUP[src] - except KeyError: - # not an imported source - continue - else: - if src_name not in sources_meta: - _source_meta = self.db.get_source_metadata(src_name) - sources_meta[SourceName(src_name)] = SourceMeta(**_source_meta) - response.source_meta_ = sources_meta - return response - - def _add_alt_matches( - self, response: NormService, record: Dict, possible_concepts: List[str] - ) -> NormService: - """Add alternate matches warning to response object - - :param response: in-progress response object - :param record: normalized record - :param possible_concepts: other possible matches - :return: updated response object - """ - norm_concepts = set() - for concept_id in possible_concepts: - r = self.db.get_record_by_id(concept_id, True) - if r: - merge_ref = r.get("merge_ref") - if merge_ref: - norm_concepts.add(merge_ref) - norm_concepts = norm_concepts - {record["concept_id"]} - if norm_concepts: - response.warnings.append( - {"multiple_normalized_concepts_found": list(norm_concepts)} - ) - return response - - def _add_gene( - self, - response: NormalizeService, - record: Dict, - match_type: MatchType, - possible_concepts: Optional[List[str]] = None, - ) -> NormalizeService: - """Add core Gene object to response. - - :param response: Response object - :param record: Gene record - :param match_type: query's match type - :param possible_concepts: List of other normalized concepts found - :return: Response with core Gene - """ - gene_obj = core_models.Gene( - id=f"normalize.gene.{record['concept_id']}", - label=record["symbol"], + parsed_query, warnings = self._parse_query_input(query) + response = SearchResult( + warnings=warnings, source_meta=self._get_sources_meta(sources) ) + search_queries = self._get_search_queries(parsed_query) + matched_genes = self._perform_search_queries(search_queries) + + for gene in matched_genes: + field_name = f"{gene.id.split(':')[0]}_matches" # type: ignore + existing_matches = getattr(response, field_name) + if not existing_matches: + setattr(response, field_name, [gene]) + else: + existing_matches.append(gene) - # mappings - source_ids = record.get("xrefs", []) + record.get("associated_with", []) - mappings = [] - for source_id in source_ids: - system, code = source_id.split(":") - mappings.append( - core_models.Mapping( - coding=core_models.Coding( - code=core_models.Code(code), system=system.lower() - ), - relation=core_models.Relation.RELATED_MATCH, - ) - ) - if mappings: - gene_obj.mappings = mappings - - # aliases - aliases = set() - for key in ["previous_symbols", "aliases"]: - if key in record and record[key]: - val = record[key] - if isinstance(val, str): - val = [val] - aliases.update(val) - if aliases: - gene_obj.aliases = list(aliases) - - # extensions - extensions = [] - extension_and_record_labels = [ - ("symbol_status", "symbol_status"), - ("approved_name", "label"), - ("previous_symbols", "previous_symbols"), - ("location_annotations", "location_annotations"), - ("strand", "strand"), - ] - for ext_label, record_label in extension_and_record_labels: - if record_label in record and record[record_label]: - extensions.append( - core_models.Extension(name=ext_label, value=record[record_label]) - ) - - record_locations = {} - if record["item_type"] == RecordType.IDENTITY: - locs = record.get("locations") - if locs: - record_locations[f"{record['src_name'].lower()}_locations"] = locs - elif record["item_type"] == RecordType.MERGER: - for k, v in record.items(): - if k.endswith("locations") and v: - record_locations[k] = v - - for loc_name, locations in record_locations.items(): - transformed_locs = [] - for loc in locations: - if loc["type"] == "SequenceLocation": - transformed_locs.append(self._transform_location(loc)) - - if transformed_locs: - extensions.append( - core_models.Extension(name=loc_name, value=transformed_locs) - ) - - # handle gene types separately because they're wonky - if record["item_type"] == RecordType.IDENTITY: - gene_type = record.get("gene_type") - if gene_type: - extensions.append( - core_models.Extension( - name=GeneTypeFieldName[record["src_name"].upper()].value, - value=gene_type, - ) - ) - else: - for f in GeneTypeFieldName: - field_name = f.value - values = record.get(field_name, []) - for value in values: - extensions.append( - core_models.Extension(name=field_name, value=value) - ) - if extensions: - gene_obj.extensions = extensions - - # add warnings - if possible_concepts: - response = self._add_alt_matches(response, record, possible_concepts) - - response.normalized_id = record["concept_id"] - response.gene = gene_obj - response = self._add_merged_meta(response) - response.match_type = match_type return response - @staticmethod - def _record_order(record: Dict) -> Tuple[int, str]: - """Construct priority order for matching. Only called by sort(). + def _get_normalized_record( + self, query: str + ) -> Optional[Tuple[Gene, MatchType, List[str]]]: + """Get highest-priority available normalized record. - :param record: individual record item in iterable to sort - :return: tuple with rank value and concept ID + :param query: user query + :return: Tuple containing the normalized gene, the match type that produced it, + and a list of alternate normalized objects """ - src = record["src_name"].upper() - source_rank = SourcePriority[src] - return source_rank, record["concept_id"] - - @staticmethod - def _handle_failed_merge_ref(record: Dict, response: Dict, query: str) -> Dict: - """Log + fill out response for a failed merge reference lookup. + # check concept ID match + record = self.db.get_normalized_record(query) + if record: + return (record, MatchType.CONCEPT_ID, []) - :param record: record containing failed merge_ref - :param response: in-progress response object - :param query: original query value - :return: response with no match - """ - _logger.error( - f"Merge ref lookup failed for ref {record['merge_ref']} " - f"in record {record['concept_id']} from query {query}" - ) - response["match_type"] = MatchType.NO_MATCH - return response + # check each kind of match type + for match_type in RefType: + matching_concepts = self.db.get_ids_by_ref(query, match_type) + matching_concepts.sort( + key=lambda c: (SourcePriority[PREFIX_LOOKUP[c.split(":")[0]]], c) + ) + while matching_concepts: + record = self.db.get_normalized_record(matching_concepts[0]) + if record: + return (record, REF_TO_MATCH_MAP[match_type], matching_concepts[1:]) + matching_concepts = matching_concepts[1:] - def _prepare_normalized_response(self, query: str) -> Dict[str, Any]: - """Provide base response object for normalize endpoints. + return None - :param query: user-provided query - :return: basic normalization response boilerplate - """ - return { - "query": query, - "match_type": MatchType.NO_MATCH, - "warnings": self._emit_warnings(query), - "service_meta_": ServiceMeta( - version=__version__, response_datetime=str(datetime.now()) - ), - } - - def normalize(self, query: str) -> NormalizeService: + def normalize(self, query: str) -> NormalizeResult: """Return normalized concept for query. Use to retrieve normalized gene concept records: + # TODO update this >>> from gene.query import QueryHandler >>> from gene.database import create_db >>> q = QueryHandler(create_db()) @@ -527,150 +226,64 @@ def normalize(self, query: str) -> NormalizeService: :param query: String to find normalized concept for :return: Normalized gene concept """ - response = NormalizeService(**self._prepare_normalized_response(query)) - return self._perform_normalized_lookup(response, query, self._add_gene) - - def _resolve_merge( - self, - response: NormService, - record: Dict, - match_type: MatchType, - callback: Callable, - possible_concepts: Optional[List[str]] = None, - ) -> NormService: - """Given a record, return the corresponding normalized record - - :param response: in-progress response object - :param record: record to retrieve normalized concept for - :param match_type: type of match that returned these records - :param callback: response constructor method - :param possible_concepts: alternate possible matches - :return: Normalized response object - """ - merge_ref = record.get("merge_ref") - if merge_ref: - # follow merge_ref - merge = self.db.get_record_by_id(merge_ref, False, True) - if merge is None: - query = response.query - _logger.error( - f"Merge ref lookup failed for ref {record['merge_ref']} " - f"in record {record['concept_id']} from query `{query}`" - ) - return response - else: - return callback(response, merge, match_type, possible_concepts) - else: - # record is sole member of concept group - return callback(response, record, match_type, possible_concepts) - - def _perform_normalized_lookup( - self, response: NormService, query: str, response_builder: Callable - ) -> NormService: - """Retrieve normalized concept, for use in normalization endpoints - - :param response: in-progress response object - :param query: user-provided query - :param response_builder: response constructor callback method - :return: completed service response object - """ - if query == "": - return response - query_str = query.lower().strip() - - # check merged concept ID match - record = self.db.get_record_by_id(query_str, case_sensitive=False, merge=True) - if record: - return response_builder(response, record, MatchType.CONCEPT_ID) - - # check concept ID match - record = self.db.get_record_by_id(query_str, case_sensitive=False) - if record: - return self._resolve_merge( - response, record, MatchType.CONCEPT_ID, response_builder - ) - - for match_type in RefType: - # get matches list for match tier - matching_refs = self.db.get_refs_by_type(query_str, match_type) - matching_records = [ - self.db.get_record_by_id(ref, False) for ref in matching_refs - ] - matching_records.sort(key=self._record_order) # type: ignore - - if len(matching_refs) > 1: - possible_concepts = [ref for ref in matching_refs] - else: - possible_concepts = None - - # attempt merge ref resolution until successful - for match in matching_records: - assert match is not None - record = self.db.get_record_by_id(match["concept_id"], False) - if record: - match_type_value = MatchType[match_type.value.upper()] - return self._resolve_merge( - response, - record, - match_type_value, - response_builder, - possible_concepts, + parsed_query, warnings = self._parse_query_input(query) + result = NormalizeResult( + source_meta=ResultSourceMeta(), + warnings=warnings, + ) + normalized_gene = None + normalized_match = self._get_normalized_record(parsed_query) + if normalized_match: + normalized_gene, match_type, alt_matches = normalized_match + result.match = GeneMatch(gene=normalized_gene, match_type=match_type) + result.normalized_id = normalized_gene.id[15:] # type: ignore + if alt_matches: + result.warnings.append( # type: ignore + QueryWarning( + type=WarningType.MULTIPLE_NORMALIZED_CONCEPTS, + description=f"Alternative possible normalized matches: {alt_matches}", ) - return response - - def _add_normalized_records( - self, - response: UnmergedNormalizationService, - normalized_record: Dict, - match_type: MatchType, - possible_concepts: Optional[List[str]] = None, - ) -> UnmergedNormalizationService: - """Add individual records to unmerged normalize response. - - :param response: in-progress response - :param normalized_record: record associated with normalized concept, either - merged or single identity - :param match_type: type of match achieved - :param possible_concepts: other possible results - :return: Completed response object - """ - response.match_type = match_type - response.normalized_concept_id = normalized_record["concept_id"] - if normalized_record["item_type"] == RecordType.IDENTITY: - record_source = SourceName[normalized_record["src_name"].upper()] - meta = self.db.get_source_metadata(record_source.value) - response.source_matches[record_source] = MatchesNormalized( - records=[BaseGene(**self._transform_locations(normalized_record))], - source_meta_=meta, # type: ignore - ) - else: - concept_ids = [normalized_record["concept_id"]] + normalized_record.get( - "xrefs", [] - ) + ) + concept_ids: List[str] = [result.match.id] # type: ignore + if result.match.gene.mappings: + for mapping in result.match.gene.mappings: + concept_ids.append(f"{mapping.coding.system}:{mapping.coding.code}") + sources = set() for concept_id in concept_ids: - record = self.db.get_record_by_id(concept_id, case_sensitive=False) - if not record: - continue - record_source = SourceName[record["src_name"].upper()] - gene = BaseGene(**self._transform_locations(record)) - if record_source in response.source_matches: - response.source_matches[record_source].records.append(gene) + prefix = concept_id.split(":", 1)[0] + sources.add(SourceName(PREFIX_LOOKUP[prefix])) + result.source_meta = self._get_sources_meta(list(sources)) + return result + + def _get_unmerged_matches( + self, normalized_record: Gene + ) -> NormalizeUnmergedMatches: + """Acquire source records that make up provided normalized record. + + :param normalized_record: given normalized record + :return: unmerged matches object with gene records grouped by source + """ + grouped_genes = {} + concept_ids: List[str] = [normalized_record.id] # type: ignore + if normalized_record.mappings: + for mapping in normalized_record.mappings: + concept_ids.append(f"{mapping.coding.system}:{mapping.coding.code}") + for concept_id in concept_ids: + record = self.db.get_record_by_id(concept_id) + if record: + prefix = record.id.split(":", 1)[0] # type: ignore + key = f"{PREFIX_LOOKUP[prefix].lower()}_matches" + if key in grouped_genes: + grouped_genes[key].append(record) else: - meta = self.db.get_source_metadata(record_source.value) - response.source_matches[record_source] = MatchesNormalized( - records=[gene], - source_meta_=meta, # type: ignore - ) - if possible_concepts: - response = self._add_alt_matches( - response, normalized_record, possible_concepts - ) - return response + grouped_genes[key] = [record] + return NormalizeUnmergedMatches(**grouped_genes) - def normalize_unmerged(self, query: str) -> UnmergedNormalizationService: + def normalize_unmerged(self, query: str) -> NormalizeUnmergedResult: """Return all source records under the normalized concept for the provided query string. + # TODO update this >>> from gene.query import QueryHandler >>> from gene.database import create_db >>> from gene.schemas import SourceName @@ -684,9 +297,27 @@ def normalize_unmerged(self, query: str) -> UnmergedNormalizationService: :param query: string to search against :return: Normalized response object """ - response = UnmergedNormalizationService( - source_matches={}, **self._prepare_normalized_response(query) - ) - return self._perform_normalized_lookup( - response, query, self._add_normalized_records + parsed_query, warnings = self._parse_query_input(query) + result = NormalizeUnmergedResult( + source_meta=ResultSourceMeta(), + warnings=warnings, + source_genes=NormalizeUnmergedMatches(), ) + normalized_gene = None + normalized_match = self._get_normalized_record(parsed_query) + if normalized_match: + normalized_gene, match_type, alt_matches = normalized_match + base_match = GeneMatch(gene=normalized_gene, match_type=match_type) + result.normalized_id = normalized_gene.id[15:] # type: ignore + if alt_matches: + result.warnings.append( # type: ignore + QueryWarning( + type=WarningType.MULTIPLE_NORMALIZED_CONCEPTS, + description=f"Alternative possible normalized matches: {alt_matches}", + ) + ) + result.source_genes = self._get_unmerged_matches(base_match.gene) + sources = list(result.source_genes.get_matches_by_source().keys()) + result.source_meta = self._get_sources_meta(sources) + + return result diff --git a/src/gene/schemas.py b/src/gene/schemas.py index e6cb5183..a05ab3ff 100644 --- a/src/gene/schemas.py +++ b/src/gene/schemas.py @@ -1,142 +1,24 @@ """Contains data models for representing VICC normalized gene records.""" -from enum import Enum, IntEnum +from enum import Enum, IntEnum, StrEnum from typing import Dict, List, Literal, Optional, Union -from ga4gh.core import core_models +from ga4gh.core._internal.models import Extension, Gene from ga4gh.vrs import models from pydantic import ( BaseModel, ConfigDict, StrictBool, - StrictInt, StrictStr, constr, ) from gene.version import __version__ -CURIE = constr(pattern=r"^\w[^:]*:.+$") - - -class SymbolStatus(str, Enum): - """Define string constraints for symbol status attribute.""" - - WITHDRAWN = "withdrawn" - APPROVED = "approved" - DISCONTINUED = "discontinued" - - -class Strand(str, Enum): - """Define string constraints for strand attribute.""" - - FORWARD = "+" - REVERSE = "-" - - -class Annotation(str, Enum): - """Define string constraints for annotations when gene location - is absent. - """ - - NOT_FOUND_ON_REFERENCE = "not on reference assembly" - UNPLACED = "unplaced" - RESERVED = "reserved" - ALT_LOC = "alternate reference locus" - - -class Chromosome(str, Enum): - """Define string constraints for chromosomes.""" - - MITOCHONDRIA = "MT" - - -class MatchType(IntEnum): - """Define string constraints for use in Match Type attributes.""" - - CONCEPT_ID = 100 - SYMBOL = 100 - PREV_SYMBOL = 80 - ALIAS = 60 - XREF = 60 - ASSOCIATED_WITH = 60 - FUZZY_MATCH = 20 - NO_MATCH = 0 - - -class GeneSequenceLocation(BaseModel): - """Sequence Location model when storing in DynamoDB.""" - - type: Literal["SequenceLocation"] = "SequenceLocation" - start: StrictInt - end: StrictInt - sequence_id: constr(pattern=r"^ga4gh:SQ.[0-9A-Za-z_\-]{32}$") # noqa: F722 - - -# class GeneChromosomeLocation(BaseModel): -# """Chromosome Location model when storing in DynamDB.""" - -# type: Literal["ChromosomeLocation"] = "ChromosomeLocation" -# species_id: Literal["taxonomy:9606"] = "taxonomy:9606" -# chr: StrictStr -# start: StrictStr -# end: StrictStr - - -class BaseGene(BaseModel): - """Base gene model. Provide shared resources for records produced by - /search and /normalize_unmerged. - """ +############################################################################### +# namespace/identification +############################################################################### - concept_id: CURIE - symbol: StrictStr - symbol_status: Optional[SymbolStatus] = None - label: Optional[StrictStr] = None - strand: Optional[Strand] = None - location_annotations: List[StrictStr] = [] - locations: Union[ - List[models.SequenceLocation], List[GeneSequenceLocation] - # List[Union[SequenceLocation, ChromosomeLocation]], - # List[Union[GeneSequenceLocation, GeneChromosomeLocation]] # dynamodb - ] = [] - aliases: List[StrictStr] = [] - previous_symbols: List[StrictStr] = [] - xrefs: List[CURIE] = [] - associated_with: List[CURIE] = [] - gene_type: Optional[StrictStr] = None - - -class Gene(BaseGene): - """Gene""" - - match_type: MatchType - - model_config = ConfigDict( - json_schema_extra={ - "example": { - "label": None, - "concept_id": "ensembl:ENSG00000157764", - "symbol": "BRAF", - "previous_symbols": [], - "aliases": [], - "xrefs": [], - "symbol_status": None, - "strand": "-", - "locations": [], - "location_annotations": [], - "associated_with": [], - "gene_type": None, - "match_type": 100, - } - } - ) - - -class GeneGroup(Gene): - """A grouping of genes based on common attributes.""" - - description: StrictStr - type_identifier: StrictStr - genes: List[Gene] = [] +CURIE = constr(pattern=r"^\w[^:]*:.+$") class SourceName(Enum): @@ -218,19 +100,128 @@ class NamespacePrefix(Enum): if v.value != "" } +############################################################################### +# gene elements +############################################################################### -class DataLicenseAttributes(BaseModel): - """Define constraints for data license attributes.""" - non_commercial: StrictBool - share_alike: StrictBool - attribution: StrictBool +class SymbolStatus(str, Enum): + """Define string constraints for symbol status attribute.""" + + WITHDRAWN = "withdrawn" + APPROVED = "approved" + DISCONTINUED = "discontinued" + + +class SymbolStatusExtension(Extension): + """Define symbol status extension object structure.""" + + name: Literal["symbol_status"] = "symbol_status" + value: SymbolStatus + + +class Strand(str, Enum): + """Define string constraints for strand attribute.""" + + FORWARD = "+" + REVERSE = "-" + + +class StrandExtension(Extension): + """Define strand extension object structure.""" + + name: Literal["strand"] = "strand" + value: Strand + + +class ApprovedNameExtension(Extension): + """Define approved name object structure.""" + + name: Literal["approved_name"] = "approved_name" + value: StrictStr + + +class Annotation(str, Enum): + """Define string constraints for annotations when gene location + is absent. + """ + + NOT_FOUND_ON_REFERENCE = "not on reference assembly" + UNPLACED = "unplaced" + RESERVED = "reserved" + ALT_LOC = "alternate reference locus" + + +class Chromosome(str, Enum): + """Define string constraints for chromosomes.""" + + MITOCHONDRIA = "MT" + + +class LocationAnnotationsExtension(Extension): + """Define location annotation extension object structure. + + # TODO: what even is this? + """ + + name: Literal["location_annotations"] = "location_annotations" + value: List[Union[Annotation, Chromosome]] + + +class GeneTypeExtensionName(StrEnum): + """Designate source-specific gene type field names for Extensions and + internal records. + """ + + HGNC = "hgnc_locus_type" + NCBI = "ncbi_gene_type" + ENSEMBL = "ensembl_biotype" + + +class GeneTypeExtension(Extension): + """Define gene type extension object structure.""" + + name: GeneTypeExtensionName + value: StrictStr + + +class SequenceLocationExtensionName(StrEnum): + """Define name restrictions for source-provided sequence location extensions.""" + + NCBI_LOCATIONS = "ncbi_locations" + ENSEMBL_LOCATIONS = "ensembl_locations" + + +class SequenceLocationExtension(Extension): + """Define structure for sequence location extension.""" + + name: SequenceLocationExtensionName + value: List[models.SequenceLocation] + + +class PreviousSymbolsExtension(Extension): + """Define previous symbols extension object structure.""" + + name: Literal["previous_symbols"] = "previous_symbols" + value: List[StrictStr] + + +# class GeneGroup(Gene): +# """A grouping of genes based on common attributes.""" +# +# description: StrictStr +# type_identifier: StrictStr +# genes: List[Gene] = [] + +############################################################################### +# database and match components +############################################################################### class RecordType(str, Enum): """Record item types.""" - IDENTITY = "identity" + IDENTITY = "identity" # TODO i think this should change MERGER = "merger" @@ -242,12 +233,23 @@ class RefType(str, Enum): PREVIOUS_SYMBOLS = "prev_symbol" ALIASES = "alias" XREFS = "xref" - ASSOCIATED_WITH = "associated_with" # collective name to singular name, e.g. {"previous_symbols": "prev_symbol"} ITEM_TYPES = {k.lower(): v.value for k, v in RefType.__members__.items()} +############################################################################### +# response components +############################################################################### + + +class DataLicenseAttributes(BaseModel): + """Define constraints for data license attributes.""" + + non_commercial: StrictBool + share_alike: StrictBool + attribution: StrictBool + class SourceMeta(BaseModel): """Metadata for a given source to return in response object.""" @@ -255,9 +257,9 @@ class SourceMeta(BaseModel): data_license: StrictStr data_license_url: StrictStr version: StrictStr - data_url: Dict[StrictStr, StrictStr] # TODO strictness necessary? + data_url: Dict[StrictStr, StrictStr] rdp_url: Optional[StrictStr] = None - data_license_attributes: Dict[StrictStr, StrictBool] + data_license_attributes: DataLicenseAttributes genome_assemblies: List[StrictStr] = [] model_config = ConfigDict( @@ -283,15 +285,6 @@ class SourceMeta(BaseModel): ) -class SourceSearchMatches(BaseModel): - """Container for matching information from an individual source.""" - - records: List[Gene] = [] - source_meta_: SourceMeta - - model_config = ConfigDict(json_schema_extra={"example": {}}) # TODO - - class ServiceMeta(BaseModel): """Metadata regarding the gene-normalization service.""" @@ -300,7 +293,7 @@ class ServiceMeta(BaseModel): response_datetime: StrictStr url: Literal[ "https://github.com/cancervariants/gene-normalization" - ] = "https://github.com/cancervariants/gene-normalization" # noqa: E501 + ] = "https://github.com/cancervariants/gene-normalization" model_config = ConfigDict( json_schema_extra={ @@ -314,385 +307,128 @@ class ServiceMeta(BaseModel): ) -class SearchService(BaseModel): - """Define model for returning highest match typed concepts from sources.""" +class MatchType(IntEnum): + """Define string constraints for use in Match Type attributes.""" + + CONCEPT_ID = 100 + SYMBOL = 100 + PREV_SYMBOL = 80 + ALIAS = 60 + XREF = 60 + FUZZY_MATCH = 20 + NO_MATCH = 0 + + +REF_TO_MATCH_MAP = { + RefType.SYMBOL: MatchType.SYMBOL, + RefType.PREVIOUS_SYMBOLS: MatchType.PREV_SYMBOL, + RefType.ALIASES: MatchType.ALIAS, + RefType.XREFS: MatchType.XREF, +} + + +class WarningType(StrEnum): + """Define possible warning types.""" + + STRIPPED_QUERY = "stripped_query" + MULTIPLE_NORMALIZED_CONCEPTS = "multiple_normalized_concepts_found" + INFERRED_NAMESPACE = "inferred_namespace" # TODO implement + NBSP = "non_breaking_space_characters" + + +class QueryWarning(BaseModel): + """Define warning structure.""" + + type: WarningType + description: StrictStr + + +class _Service(BaseModel): + """Define base service response object.""" query: StrictStr - warnings: List[Dict] = [] - source_matches: Dict[SourceName, SourceSearchMatches] + additional_params: Optional[Dict] = None service_meta_: ServiceMeta - model_config = ConfigDict(json_schema_extra={}) # TODO +class ResultSourceMeta(BaseModel): + """Structure source metadata for all results objects.""" -class GeneTypeFieldName(str, Enum): - """Designate source-specific gene type field names for Extensions and - internal records. + hgnc: Optional[SourceMeta] = None + ensembl: Optional[SourceMeta] = None + ncbi: Optional[SourceMeta] = None + + +class _Result(BaseModel): + """Define base lookup result object. Returned by QueryHandler methods like + `normalize()`, and included in REST responses. """ - HGNC = "hgnc_locus_type" - NCBI = "ncbi_gene_type" - ENSEMBL = "ensembl_biotype" + warnings: Optional[List[QueryWarning]] = None + source_meta: ResultSourceMeta -class BaseNormalizationService(BaseModel): - """Base method providing shared attributes to Normalization service classes.""" +class GeneMatch(BaseModel): + """Structure individual gene match.""" - query: StrictStr - warnings: List[Dict] = [] match_type: MatchType - service_meta_: ServiceMeta + gene: Gene -class NormalizeService(BaseNormalizationService): - """Define model for returning normalized concept.""" +class SearchResult(_Result): + """Result object for `search()` endpoint.""" - normalized_id: Optional[str] = None - gene: Optional[core_models.Gene] = None - source_meta_: Dict[SourceName, SourceMeta] = {} + hgnc_matches: Optional[List[GeneMatch]] = None + ensembl_matches: Optional[List[GeneMatch]] = None + ncbi_matches: Optional[List[GeneMatch]] = None - model_config = ConfigDict( - json_schema_extra={ - "example": { - "query": "BRAF", - "warnings": [], - "match_type": 100, - "normalized_id": "hgnc:1037", - "gene": { - "type": "Gene", - "id": "normalize.gene.hgnc:1097", - "label": "BRAF", - "mappings": [ - { - "coding": {"code": "673", "system": "ncbigene"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "ENSG00000157764", "system": "ensembl"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "CCDS5863", "system": "ccds"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "1943", "system": "iuphar"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "119066", "system": "orphanet"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "BRAF", "system": "cosmic"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "2284096", "system": "pubmed"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "uc003vwc.5", "system": "ucsc"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "164757", "system": "omim"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "NM_004333", "system": "refseq"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "CCDS87555", "system": "ccds"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "P15056", "system": "uniprot"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "M95712", "system": "ena.embl"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "OTTHUMG00000157457", "system": "vega"}, - "relation": "relatedMatch", - }, - { - "coding": {"code": "1565476", "system": "pubmed"}, - "relation": "relatedMatch", - }, - ], - "aliases": ["BRAF1", "RAFB1", "B-raf", "NS7", "B-RAF1"], - "extensions": [ - { - "name": "approved_name", - "value": "B-Raf proto-oncogene, serine/threonine kinase", - "type": "Extension", - }, - { - "name": "symbol_status", - "value": "approved", - "type": "Extension", - }, - # { - # "name": "chromosome_location", - # "value": { - # "id": "ga4gh:CL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", # noqa: E501 - # "type": "ChromosomeLocation", - # "species_id": "taxonomy:9606", - # "chr": "7", - # "end": "q34", - # "start": "q34", - # }, - # "type": "Extension" - # } - ], - }, - "source_meta_": { - "HGNC": { - "data_license": "custom", - "data_license_url": "https://www.genenames.org/about/", - "version": "20210810", - "data_url": { - "complete_set_archive": "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" - }, - "rdp_url": None, - "data_license_attributes": { - "non_commercial": False, - "attribution": False, - "share_alike": False, - }, - "genome_assemblies": [], - }, - "Ensembl": { - "data_license": "custom", - "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", # noqa: E501 - "version": "104", - "data_url": { - "genome_annotations": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz" - }, - "rdp_url": None, - "data_license_attributes": { - "non_commercial": False, - "attribution": False, - "share_alike": False, - }, - "genome_assemblies": ["GRCh38"], - }, - "NCBI": { - "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501 - "version": "20210813", - "data_url": { - "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", - "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz", - "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/", - }, - "rdp_url": "https://reusabledata.org/ncbi-gene.html", - "data_license_attributes": { - "non_commercial": False, - "attribution": False, - "share_alike": False, - }, - "genome_assemblies": ["GRCh38.p13"], - }, - }, - "service_meta_": { - "name": "gene-normalizer", - "version": __version__, - "response_datetime": "2022-03-23 15:57:14.180908", - "url": "https://github.com/cancervariants/gene-normalization", - }, - } - } - ) +class SearchService(_Service): + """Define response object for /search endpoint.""" -class MatchesNormalized(BaseModel): - """Matches associated with normalized concept from a single source.""" + results: SearchResult - records: List[BaseGene] = [] - source_meta_: SourceMeta +class NormalizeResult(_Result): + """Result object for `normalize()` method.""" -class UnmergedNormalizationService(BaseNormalizationService): - """Response providing source records corresponding to normalization of user query. - Enables retrieval of normalized concept while retaining sourcing for accompanying - attributes. - """ + normalized_id: Optional[CURIE] = None + match: Optional[GeneMatch] = None - normalized_concept_id: Optional[CURIE] = None - source_matches: Dict[SourceName, MatchesNormalized] - model_config = ConfigDict( - json_schema_extra={ - "example": { - "query": "hgnc:108", - "warnings": [], - "match_type": 100, - "service_meta_": { - "version": __version__, - "response_datetime": "2022-04-26 14:20:54.180240", - "name": "gene-normalizer", - "url": "https://github.com/cancervariants/gene-normalization", - }, - "normalized_concept_id": "hgnc:108", - "source_matches": { - "HGNC": { - "records": [ - { - "concept_id": "hgnc:108", - "symbol": "ACHE", - "symbol_status": "approved", - "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 - "strand": None, - "location_annotations": [], - "locations": [ - # { - # "type": "ChromosomeLocation", - # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 - # "species_id": "taxonomy:9606", - # "chr": "7", - # "start": "q22.1", - # "end": "q22.1" - # } - ], - "aliases": ["3.1.1.7"], - "previous_symbols": ["YT"], - "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"], - "associated_with": [ - "ucsc:uc003uxi.4", - "vega:OTTHUMG00000157033", - "merops:S09.979", - "ccds:CCDS5710", - "omim:100740", - "iuphar:2465", - "ccds:CCDS5709", - "refseq:NM_015831", - "pubmed:1380483", - "uniprot:P22303", - "ccds:CCDS64736", - ], - "gene_type": "gene with protein product", - } - ], - "source_meta_": { - "data_license": "custom", - "data_license_url": "https://www.genenames.org/about/", - "version": "20220407", - "data_url": { - "complete_set_archive": "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" - }, - "rdp_url": None, - "data_license_attributes": { - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, - "genome_assemblies": [], - }, - }, - "Ensembl": { - "records": [ - { - "concept_id": "ensembl:ENSG00000087085", - "symbol": "ACHE", - "symbol_status": None, - "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 - "strand": "-", - "location_annotations": [], - "locations": [ - { - "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", # noqa: E501 - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 - }, - "start": 100889993, - "end": 100896974, - } - ], - "aliases": [], - "previous_symbols": [], - "xrefs": ["hgnc:108"], - "associated_with": [], - "gene_type": "protein_coding", - } - ], - "source_meta_": { - "data_license": "custom", - "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", # noqa: E501 - "version": "104", - "data_url": { - "genome_annotations": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz" - }, - "rdp_url": None, - "data_license_attributes": { - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, - "genome_assemblies": ["GRCh38"], - }, - }, - "NCBI": { - "records": [ - { - "concept_id": "ncbigene:43", - "symbol": "ACHE", - "symbol_status": None, - "label": "acetylcholinesterase (Cartwright blood group)", # noqa: E501 - "strand": "-", - "location_annotations": [], - "locations": [ - { - # "type": "ChromosomeLocation", - # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 - # "species_id": "taxonomy:9606", - # "chr": "7", - # "start": "q22.1", - # "end": "q22.1" - }, - { - "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", # noqa: E501 - "type": "SequenceLocation", - "sequenceReference": { - "type": "SequenceReference", - "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 - }, - "start": 100889993, - "end": 100896994, - }, - ], - "aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"], - "previous_symbols": ["ACEE"], - "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"], - "associated_with": ["omim:100740"], - "gene_type": "protein-coding", - } - ], - "source_meta_": { - "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501 - "version": "20220407", - "data_url": { - "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", - "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz", - "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/", - }, - "rdp_url": "https://reusabledata.org/ncbi-gene.html", - "data_license_attributes": { - "non_commercial": False, - "share_alike": False, - "attribution": False, - }, - "genome_assemblies": ["GRCh38.p13"], - }, - }, - }, - } - } - ) +class NormalizeService(_Service): + """Define response object for /normalize endpoint.""" + + result: NormalizeResult + + +class NormalizeUnmergedMatches(BaseModel): + """Structure individual source matches under the `unmerged()` method.""" + + hgnc_genes: Optional[List[GeneMatch]] = None + ensembl_genes: Optional[List[GeneMatch]] = None + ncbi_genes: Optional[List[GeneMatch]] = None + + def get_matches_by_source(self) -> Dict[SourceName, Optional[List[GeneMatch]]]: + """Get a more easily computable directory of matches by source + + :return: Dictionary mapping SourceName instances to gene match lists + """ + matches = {} + for field_name in self.model_fields: + key = SourceName(SOURCES[field_name.split("_")[0]]) + matches[key] = self.__getattribute__(field_name) + return matches + + +class NormalizeUnmergedResult(_Result): + """Match object for `normalize_unmerged()` method.""" + + normalized_id: Optional[CURIE] = None + source_genes: NormalizeUnmergedMatches + + +class NormalizeUnmergedService(_Service): + """Define response object for /normalize_unmerged endpoint.""" + + result: NormalizeUnmergedResult diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 00000000..f58158c6 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +"""Provide tests.""" diff --git a/tests/unit/test_emit_warnings.py b/tests/unit/test_emit_warnings.py index c8309aac..b2b7c5d0 100644 --- a/tests/unit/test_emit_warnings.py +++ b/tests/unit/test_emit_warnings.py @@ -14,18 +14,18 @@ def test_emit_warnings(): query_handler = QueryHandler(db) # Test emit no warnings - actual_warnings = query_handler._emit_warnings("spry3") + actual_warnings = query_handler._parse_query_input("spry3") assert actual_warnings == [] # Test emit warnings - actual_warnings = query_handler._emit_warnings("sp ry3") + actual_warnings = query_handler._parse_query_input("sp ry3") assert actual_warnings == actual_warnings - actual_warnings = query_handler._emit_warnings("sp\u00A0ry3") + actual_warnings = query_handler._parse_query_input("sp\u00A0ry3") assert expected_warnings == actual_warnings - actual_warnings = query_handler._emit_warnings("sp ry3") + actual_warnings = query_handler._parse_query_input("sp ry3") assert expected_warnings == actual_warnings - actual_warnings = query_handler._emit_warnings("sp\xa0ry3") + actual_warnings = query_handler._parse_query_input("sp\xa0ry3") assert expected_warnings == actual_warnings diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index bfb11460..315b94ce 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -15,7 +15,7 @@ def __init__(self): self.query_handler = QueryHandler(database) def search(self, query_str, sources=None): - return self.query_handler.search(query_str=query_str, sources=sources) + return self.query_handler.search(query=query_str, sources=sources) def normalize(self, query_str): return self.query_handler.normalize(query_str)