From 0e7a3351d4041c5f01694c630b26633121413764 Mon Sep 17 00:00:00 2001 From: James Stevenson Date: Wed, 3 Jan 2024 08:59:03 -0500 Subject: [PATCH] feat: remove associated_with as distinct group (merge w/ xrefs) --- docs/source/index.rst | 2 +- docs/source/normalizing_data/sources.rst | 9 +- src/gene/database/dynamodb.py | 3 +- src/gene/database/postgresql.py | 35 ++---- src/gene/database/postgresql/add_fkeys.sql | 2 - src/gene/database/postgresql/add_indexes.sql | 2 - .../postgresql/create_record_lookup_view.sql | 6 - .../database/postgresql/create_tables.sql | 6 - .../postgresql/delete_normalized_concepts.sql | 1 - src/gene/database/postgresql/drop_fkeys.sql | 1 - src/gene/database/postgresql/drop_indexes.sql | 1 - src/gene/etl/ensembl.py | 118 ++++++++++-------- src/gene/etl/hgnc.py | 31 ++--- src/gene/etl/merge.py | 14 ++- src/gene/etl/ncbi.py | 52 ++++---- src/gene/query.py | 3 +- src/gene/schemas.py | 10 +- tests/conftest.py | 1 - tests/unit/test_database_and_etl.py | 6 - tests/unit/test_ensembl_source.py | 11 +- tests/unit/test_hgnc_source.py | 73 ++++++----- tests/unit/test_ncbi_source.py | 59 ++++----- tests/unit/test_query.py | 40 +++--- tests/unit/test_schemas.py | 9 -- 24 files changed, 209 insertions(+), 286 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 7c6e5926..d64bbe4d 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -12,7 +12,7 @@ Gene Normalizer |version| :alt: citation :target: https://zenodo.org/badge/latestdoi/309797998 -The Gene Normalizer provides tools for resolving ambiguous human gene references to consistently-structured, normalized terms. For gene concepts extracted from `NCBI Gene `_, `Ensembl `_, and `HGNC `_, it designates a `CURIE `_, and provides additional metadata like current and previously-used symbols, aliases, database cross-references and associations, and coordinates. +The Gene Normalizer provides tools for resolving ambiguous human gene references to consistently-structured, normalized terms. For gene concepts extracted from `NCBI Gene `_, `Ensembl `_, and `HGNC `_, it designates a `CURIE `_, and provides additional metadata like current and previously-used symbols, aliases, database cross-references, and coordinates. A `public REST instance of the service `_ is available for programmatic queries: diff --git a/docs/source/normalizing_data/sources.rst b/docs/source/normalizing_data/sources.rst index 591e582c..0de9cf57 100644 --- a/docs/source/normalizing_data/sources.rst +++ b/docs/source/normalizing_data/sources.rst @@ -33,9 +33,7 @@ HGNC "previous_symbols": [], "xrefs": [ "ensembl:ENSG00000157764", - "ncbigene:673" - ], - "associated_with": [ + "ncbigene:673", "uniprot:P15056", "pubmed:2284096", "omim:164757", @@ -99,7 +97,6 @@ Ensembl "xrefs": [ "hgnc:1097" ], - "associated_with": [], "gene_type": "protein_coding", "match_type": 100 } @@ -143,9 +140,7 @@ The `NCBI Gene Database `_ is a service prov "previous_symbols": [], "xrefs": [ "ensembl:ENSG00000157764", - "hgnc:1097" - ], - "associated_with": [ + "hgnc:1097", "omim:164757" ], "gene_type": "protein-coding", diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py index 6f7b0ee7..629059c8 100644 --- a/src/gene/database/dynamodb.py +++ b/src/gene/database/dynamodb.py @@ -434,8 +434,7 @@ def _add_ref_record( :param str term: referent term :param str concept_id: concept ID to refer to - :param str ref_type: one of {'alias', 'label', 'xref', - 'associated_with'} + :param str ref_type: one of {'alias', 'label', 'xref'} :param src_name: name of source for record """ label_and_type = f"{term.lower()}##{ref_type}" diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py index 6638645c..66a43132 100644 --- a/src/gene/database/postgresql.py +++ b/src/gene/database/postgresql.py @@ -97,7 +97,6 @@ def list_tables(self) -> List[str]: _drop_db_query = b""" DROP MATERIALIZED VIEW IF EXISTS record_lookup_view; DROP TABLE IF EXISTS - gene_associations, gene_symbols, gene_previous_symbols, gene_aliases, @@ -324,12 +323,11 @@ def _format_source_record(self, source_row: Tuple) -> Dict: "locations": source_row[5], "gene_type": source_row[6], "aliases": source_row[7], - "associated_with": source_row[8], - "previous_symbols": source_row[9], - "symbol": source_row[10], - "xrefs": source_row[11], - "src_name": source_row[12], - "merge_ref": source_row[13], + "previous_symbols": source_row[8], + "symbol": source_row[9], + "xrefs": source_row[10], + "src_name": source_row[11], + "merge_ref": source_row[12], "item_type": RecordType.IDENTITY.value, } return {k: v for k, v in gene_record.items() if v} @@ -373,8 +371,7 @@ def _format_merged_record(self, merged_row: Tuple) -> Dict: "hgnc_locus_type": merged_row[11], "ncbi_gene_type": merged_row[12], "aliases": merged_row[13], - "associated_with": merged_row[14], - "xrefs": merged_row[15], + "xrefs": merged_row[14], "item_type": RecordType.MERGER.value, } return {k: v for k, v in merged_record.items() if v} @@ -421,7 +418,6 @@ def get_record_by_id( RefType.PREVIOUS_SYMBOLS: b"SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;", # noqa: E501 RefType.ALIASES: b"SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;", # noqa: E501 RefType.XREFS: b"SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;", - RefType.ASSOCIATED_WITH: b"SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;", # noqa: E501 } def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: @@ -558,9 +554,6 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None: ) _ins_alias_query = b"INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);" _ins_xref_query = b"INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);" - _ins_assoc_query = ( - b"INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);" - ) def add_record(self, record: Dict, src_name: SourceName) -> None: """Add new record to database. @@ -591,8 +584,6 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: cur.execute(self._ins_alias_query, [a, concept_id]) for x in record.get("xrefs", []): cur.execute(self._ins_xref_query, [x, concept_id]) - for a in record.get("associated_with", []): - cur.execute(self._ins_assoc_query, [a, concept_id]) for p in record.get("previous_symbols", []): cur.execute(self._ins_prev_symbol_query, [p, concept_id]) if record.get("symbol"): @@ -606,10 +597,9 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: INSERT INTO gene_merged ( concept_id, symbol, symbol_status, previous_symbols, label, strand, location_annotations, ensembl_locations, hgnc_locations, ncbi_locations, - hgnc_locus_type, ensembl_biotype, ncbi_gene_type, aliases, associated_with, - xrefs + hgnc_locus_type, ensembl_biotype, ncbi_gene_type, aliases, xrefs ) - VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); + VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s); """ def add_merged_record(self, record: Dict) -> None: @@ -644,7 +634,6 @@ def add_merged_record(self, record: Dict) -> None: record.get("ensembl_biotype"), record.get("ncbi_gene_type"), record.get("aliases"), - record.get("associated_with"), record.get("xrefs"), ], ) @@ -702,13 +691,6 @@ def delete_normalized_concepts(self) -> None: WHERE gc.source = %s ); """ - _drop_associations_query = b""" - DELETE FROM gene_associations WHERE id IN ( - SELECT ga.id FROM gene_associations ga LEFT JOIN gene_concepts gc - ON gc.concept_id = ga.concept_id - WHERE gc.source = %s - ); - """ _drop_prev_symbols_query = b""" DELETE FROM gene_previous_symbols WHERE id IN ( SELECT gps.id FROM gene_previous_symbols gps LEFT JOIN gene_concepts gc @@ -750,7 +732,6 @@ def delete_source(self, src_name: SourceName) -> None: """ with self.conn.cursor() as cur: cur.execute(self._drop_aliases_query, [src_name.value]) - cur.execute(self._drop_associations_query, [src_name.value]) cur.execute(self._drop_prev_symbols_query, [src_name.value]) cur.execute(self._drop_symbols_query, [src_name.value]) cur.execute(self._drop_xrefs_query, [src_name.value]) diff --git a/src/gene/database/postgresql/add_fkeys.sql b/src/gene/database/postgresql/add_fkeys.sql index f93459b3..28e1a88f 100644 --- a/src/gene/database/postgresql/add_fkeys.sql +++ b/src/gene/database/postgresql/add_fkeys.sql @@ -1,7 +1,5 @@ ALTER TABLE gene_aliases ADD CONSTRAINT gene_aliases_concept_id_fkey FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id); -ALTER TABLE gene_associations ADD CONSTRAINT gene_associations_concept_id_fkey - FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id); ALTER TABLE gene_previous_symbols ADD CONSTRAINT gene_previous_symbols_concept_id_fkey FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id); diff --git a/src/gene/database/postgresql/add_indexes.sql b/src/gene/database/postgresql/add_indexes.sql index b96df534..805ad71b 100644 --- a/src/gene/database/postgresql/add_indexes.sql +++ b/src/gene/database/postgresql/add_indexes.sql @@ -7,7 +7,5 @@ CREATE INDEX idx_gps_symbol_low ON gene_previous_symbols (lower(prev_symbol)); CREATE INDEX idx_ga_alias_low ON gene_aliases (lower(alias)); CREATE INDEX idx_gx_xref_low ON gene_xrefs (lower(xref)); -CREATE INDEX idx_g_as_association_low - ON gene_associations (lower(associated_with)); CREATE INDEX idx_rlv_concept_id_low ON record_lookup_view (lower(concept_id)); diff --git a/src/gene/database/postgresql/create_record_lookup_view.sql b/src/gene/database/postgresql/create_record_lookup_view.sql index 1e33977f..7474a07a 100644 --- a/src/gene/database/postgresql/create_record_lookup_view.sql +++ b/src/gene/database/postgresql/create_record_lookup_view.sql @@ -7,7 +7,6 @@ SELECT gc.concept_id, gc.locations, gc.gene_type, ga.aliases, - gas.associated_with, gps.previous_symbols, gs.symbol, gx.xrefs, @@ -20,11 +19,6 @@ FULL JOIN ( FROM gene_aliases ga_1 GROUP BY ga_1.concept_id ) ga ON gc.concept_id::text = ga.concept_id::text -FULL JOIN ( - SELECT gas_1.concept_id, array_agg(gas_1.associated_with) AS associated_with - FROM gene_associations gas_1 - GROUP BY gas_1.concept_id -) gas ON gc.concept_id::text = gas.concept_id::text FULL JOIN ( SELECT gps_1.concept_id, array_agg(gps_1.prev_symbol) AS previous_symbols FROM gene_previous_symbols gps_1 diff --git a/src/gene/database/postgresql/create_tables.sql b/src/gene/database/postgresql/create_tables.sql index 83198199..9100e553 100644 --- a/src/gene/database/postgresql/create_tables.sql +++ b/src/gene/database/postgresql/create_tables.sql @@ -26,7 +26,6 @@ CREATE TABLE gene_merged ( hgnc_locus_type TEXT [], ncbi_gene_type TEXT [], aliases TEXT [], - associated_with TEXT [], xrefs TEXT [] ); CREATE TABLE gene_concepts ( @@ -60,8 +59,3 @@ CREATE TABLE gene_xrefs ( xref TEXT NOT NULL, concept_id VARCHAR(127) NOT NULL REFERENCES gene_concepts (concept_id) ); -CREATE TABLE gene_associations ( - id SERIAL PRIMARY KEY, - associated_with TEXT NOT NULL, - concept_ID VARCHAR(127) NOT NULL REFERENCES gene_concepts (concept_id) -); diff --git a/src/gene/database/postgresql/delete_normalized_concepts.sql b/src/gene/database/postgresql/delete_normalized_concepts.sql index 5141c841..e5e1bdce 100644 --- a/src/gene/database/postgresql/delete_normalized_concepts.sql +++ b/src/gene/database/postgresql/delete_normalized_concepts.sql @@ -19,7 +19,6 @@ CREATE TABLE gene_merged ( hgnc_locus_type TEXT [], ncbi_gene_type TEXT [], aliases TEXT [], - associated_with TEXT [], xrefs TEXT [] ); ALTER TABLE gene_concepts ADD CONSTRAINT gene_concepts_merge_ref_fkey diff --git a/src/gene/database/postgresql/drop_fkeys.sql b/src/gene/database/postgresql/drop_fkeys.sql index f804ca1e..ba2aeef5 100644 --- a/src/gene/database/postgresql/drop_fkeys.sql +++ b/src/gene/database/postgresql/drop_fkeys.sql @@ -1,5 +1,4 @@ ALTER TABLE gene_aliases DROP CONSTRAINT gene_aliases_concept_id_fkey; -ALTER TABLE gene_associations DROP CONSTRAINT gene_associations_concept_id_fkey; ALTER TABLE gene_previous_symbols DROP CONSTRAINT gene_previous_symbols_concept_id_fkey; ALTER TABLE gene_symbols DROP CONSTRAINT gene_symbols_concept_id_fkey; diff --git a/src/gene/database/postgresql/drop_indexes.sql b/src/gene/database/postgresql/drop_indexes.sql index 7c9743d0..dd9156dc 100644 --- a/src/gene/database/postgresql/drop_indexes.sql +++ b/src/gene/database/postgresql/drop_indexes.sql @@ -4,5 +4,4 @@ DROP INDEX IF EXISTS idx_gs_symbol_low; DROP INDEX IF EXISTS idx_gps_symbol_low; DROP INDEX IF EXISTS idx_gx_xref_low; DROP INDEX IF EXISTS idx_ga_alias_low; -DROP INDEX IF EXISTS idx_g_as_association_low; DROP INDEX IF EXISTS idx_rlv_concept_id_low; diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py index bb590047..aef9d388 100644 --- a/src/gene/etl/ensembl.py +++ b/src/gene/etl/ensembl.py @@ -1,7 +1,7 @@ """Defines the Ensembl ETL methods.""" import logging import re -from typing import Dict +from typing import Dict, Optional import gffutils from gffutils.feature import Feature @@ -90,73 +90,83 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict: return gene_params def _add_attributes(self, f: Feature, gene: Dict) -> None: - """Add concept_id, symbol, xrefs, and associated_with to a gene record. + """Add concept_id, symbol, and xrefs to a gene record. :param f: A gene from the data :param gene: A transformed gene record """ - attributes = { + attributes_map = { "ID": "concept_id", "Name": "symbol", "description": "xrefs", "biotype": "gene_type", } - for attribute in f.attributes.items(): - key = attribute[0] - - if key in attributes.keys(): - val = attribute[1] - - if len(val) == 1: - val = val[0] - if key == "ID": - if val.startswith("gene"): - val = ( - f"{NamespacePrefix.ENSEMBL.value}:" - f"{val.split(':')[1]}" - ) - - if key == "description": - gene["label"] = val.split("[")[0].strip() - if "Source:" in val: - src_name = ( - val.split("[")[-1] - .split("Source:")[-1] - .split("Acc")[0] - .split(";")[0] - ) - src_id = val.split("Acc:")[-1].split("]")[0] - if ":" in src_id: - src_id = src_id.split(":")[-1] - source = self._get_xref_associated_with(src_name, src_id) - if "xrefs" in source: - gene["xrefs"] = source["xrefs"] - elif "associated_with" in source: - gene["associated_with"] = source["associated_with"] - continue - - gene[attributes[key]] = val - - def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: - """Get xref or associated_with concept. + for key, value in f.attributes.items(): + if key not in attributes_map: + continue + + if key == "ID" and value[0].startswith("gene"): + gene[ + "concept_id" + ] = f"{NamespacePrefix.ENSEMBL.value}:{value[0].split(':')[1]}" + elif key == "description": + pattern = "^(.*) \\[Source:.*;Acc:(.*):(.*)\\]$" + matches = re.findall(pattern, value[0]) + if matches: + gene["label"] = matches[0][0] + gene["xrefs"] = [self._get_xref(matches[0][1], matches[0][2])] + else: + gene[attributes_map[key]] = value + # key = attribute[0] + # + # if key in attributes_map.keys(): + # val = attribute[1] + # + # if len(val) == 1: + # val = val[0] + # if key == "ID": + # if val.startswith("gene"): + # val = ( + # f"{NamespacePrefix.ENSEMBL.value}:" + # f"{val.split(':')[1]}" + # ) + # + # if key == "description": + # gene["label"] = val.split("[")[0].strip() + # if "Source:" in val: + # src_name = ( + # val.split("[")[-1] + # .split("Source:")[-1] + # .split("Acc")[0] + # .split(";")[0] + # ) + # src_id = val.split("Acc:")[-1].split("]")[0] + # if ":" in src_id: + # src_id = src_id.split(":")[-1] + # gene["xrefs"] = self._get_xref(src_name, src_id) + # continue + # + # gene[attributes_map[key]] = val + + def _get_xref(self, src_name: str, src_id: str) -> Optional[str]: + """Get xref. :param src_name: Source name :param src_id: The source's accession number - :return: A dict containing an other identifier or xref + :return: xref, if successfully parsed """ - source = dict() - if src_name.startswith("HGNC"): - source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"] - elif src_name.startswith("NCBI"): - source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"] - elif src_name.startswith("UniProt"): - source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"] - elif src_name.startswith("miRBase"): - source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"] - elif src_name.startswith("RFAM"): - source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"] - return source + for prefix, constrained_prefix in ( + ("HGNC", NamespacePrefix.HGNC), + ("NCBI", NamespacePrefix.NCBI), + ("UniProt", NamespacePrefix.UNIPROT), + ("miRBase", NamespacePrefix.MIRBASE), + ("RFAM", NamespacePrefix.RFAM), + ): + if src_name.startswith(prefix): + return f"{constrained_prefix.value}:{src_id}" + _logger.warning("Unrecognized source name: %:%", src_name, src_id) + return None def _add_meta(self) -> None: """Add Ensembl metadata. diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py index 1f060935..805fbe37 100644 --- a/src/gene/etl/hgnc.py +++ b/src/gene/etl/hgnc.py @@ -6,7 +6,6 @@ from gene.etl.base import Base, GeneNormalizerEtlError from gene.schemas import ( - PREFIX_LOOKUP, Annotation, Chromosome, DataLicenseAttributes, @@ -42,9 +41,9 @@ def _transform_data(self) -> None: elif r["status"] == "Entry Withdrawn": gene["symbol_status"] = SymbolStatus.WITHDRAWN.value - # store alias, xref, associated_with, prev_symbols, location + # store alias, xref, prev_symbols, location self._get_aliases(r, gene) - self._get_xrefs_associated_with(r, gene) + self._get_xrefs(r, gene) if "prev_symbol" in r: self._get_previous_symbols(r, gene) if "location" in r: @@ -81,14 +80,13 @@ def _get_previous_symbols(self, r: Dict, gene: Dict) -> None: if prev_symbols: gene["previous_symbols"] = list(set(prev_symbols)) - def _get_xrefs_associated_with(self, record: Dict, gene: Dict) -> None: - """Store xrefs and/or associated_with refs in a gene record. + def _get_xrefs(self, record: Dict, gene: Dict) -> None: + """Store xrefs in a gene record. :param record: A gene record in the HGNC data file :param gene: A transformed gene record """ xrefs = list() - associated_with = list() sources = [ "entrez_id", "ensembl_gene_id", @@ -128,37 +126,28 @@ def _get_xrefs_associated_with(self, record: Dict, gene: Dict) -> None: key = src if key.upper() in NamespacePrefix.__members__: - if NamespacePrefix[key.upper()].value in PREFIX_LOOKUP.keys(): - self._get_xref_associated_with(key, src, record, xrefs) - else: - self._get_xref_associated_with( - key, src, record, associated_with - ) + self._get_xref(key, src, record, xrefs) else: _logger.warning(f"{key} not in schemas.py") if xrefs: gene["xrefs"] = xrefs - if associated_with: - gene["associated_with"] = associated_with - def _get_xref_associated_with( - self, key: str, src: str, r: Dict, src_type: List[str] - ) -> None: - """Add an xref or associated_with ref to a gene record. + def _get_xref(self, key: str, src: str, r: Dict, xrefs: List[str]) -> None: + """Add an xref to a gene record. :param key: The source's name :param src: HGNC's source field :param r: A gene record in the HGNC data file - :param src_type: Either xrefs or associated_with list + :param xrefs: xrefs list """ if isinstance(r[src], list): for xref in r[src]: - src_type.append(f"{NamespacePrefix[key.upper()].value}:{xref}") + xrefs.append(f"{NamespacePrefix[key.upper()].value}:{xref}") else: if isinstance(r[src], str) and ":" in r[src]: r[src] = r[src].split(":")[-1].strip() - src_type.append(f"{NamespacePrefix[key.upper()].value}" f":{r[src]}") + xrefs.append(f"{NamespacePrefix[key.upper()].value}" f":{r[src]}") def _get_location(self, r: Dict, gene: Dict) -> None: """Store GA4GH VRS ChromosomeLocation in a gene record. diff --git a/src/gene/etl/merge.py b/src/gene/etl/merge.py index d065be73..57ca439c 100644 --- a/src/gene/etl/merge.py +++ b/src/gene/etl/merge.py @@ -5,7 +5,7 @@ from gene.database import AbstractDatabase from gene.database.database import DatabaseWriteError -from gene.schemas import GeneTypeFieldName, RecordType, SourcePriority +from gene.schemas import GeneTypeFieldName, NamespacePrefix, RecordType, SourcePriority _logger = logging.getLogger(__name__) @@ -98,7 +98,14 @@ def _create_record_id_set( if not record_xrefs: return observed_id_set | {db_record["concept_id"]} else: - local_id_set = set(record_xrefs) + local_id_set = set() + for xref in record_xrefs: + if ( + xref.startswith(NamespacePrefix.NCBI.value) + or xref.startswith(NamespacePrefix.ENSEMBL.value) + or xref.startswith(NamespacePrefix.HGNC.value) + ): + local_id_set.add(xref) merged_id_set = {record_id} | observed_id_set for local_record_id in local_id_set - observed_id_set: merged_id_set |= self._create_record_id_set( @@ -145,7 +152,6 @@ def record_order(record: Dict) -> Tuple: merged_attrs = { "concept_id": records[0]["concept_id"], "aliases": set(), - "associated_with": set(), "previous_symbols": set(), "hgnc_locus_type": set(), "ncbi_gene_type": set(), @@ -156,7 +162,7 @@ def record_order(record: Dict) -> Tuple: merged_attrs["xrefs"] = list({r["concept_id"] for r in records[1:]}) # merge from constituent records - set_fields = ["aliases", "associated_with", "previous_symbols", "strand"] + set_fields = ["aliases", "previous_symbols", "strand"] scalar_fields = ["symbol", "symbol_status", "label", "location_annotations"] for record in records: for field in set_fields: diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py index 427d57e6..a5954bbe 100644 --- a/src/gene/etl/ncbi.py +++ b/src/gene/etl/ncbi.py @@ -98,14 +98,13 @@ def _get_prev_symbols(self) -> Dict[str, str]: history_file.close() return prev_symbols - def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None: - """Add xrefs and associated_with refs to a transformed gene. + def _add_xrefs(self, val: List[str], params: Dict) -> None: + """Add xrefs to a transformed gene. :param val: A list of source ids for a given gene :param params: A transformed gene record """ params["xrefs"] = [] - params["associated_with"] = [] for src in val: src_name = src.split(":")[0].upper() src_id = src.split(":")[-1] @@ -125,16 +124,12 @@ def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None: prefix = NamespacePrefix.IMGT_GENE_DB.value elif src_name.startswith("MIRBASE"): prefix = NamespacePrefix.MIRBASE.value - else: - prefix = None - if prefix: - params["associated_with"].append(f"{prefix}:{src_id}") else: _logger.info(f"{src_name} is not in NameSpacePrefix.") + continue + params["xrefs"].append(f"{prefix}:{src_id}") if not params["xrefs"]: del params["xrefs"] - if not params["associated_with"]: - del params["associated_with"] def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: """Store genes from NCBI info file. @@ -158,10 +153,10 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: params["aliases"] = row[4].split("|") else: params["aliases"] = [] - # get associated_with + # get xrefs if row[5] != "-": - associated_with = row[5].split("|") - self._add_xrefs_associated_with(associated_with, params) + xrefs = row[5].split("|") + self._add_xrefs(xrefs, params) # get chromosome location vrs_chr_location = self._get_vrs_chr_location(row, params) if "exclude" in vrs_chr_location: @@ -223,7 +218,7 @@ def _add_gff_gene( return params def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None: - """Add concept_id, symbol, and xrefs/associated_with to a gene record. + """Add concept_id, symbol, and xrefs to a gene record. :param gffutils.feature.Feature f: A gene from the data :param gene: A transformed gene record @@ -239,7 +234,7 @@ def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None: val = val[0] if key == "Dbxref": - self._add_xrefs_associated_with(val, gene) + self._add_xrefs(val, gene) elif key == "Name": gene["symbol"] = val @@ -258,25 +253,24 @@ def _get_vrs_sq_location( params["strand"] = gene.strand return self._build_sequence_location(gene.seqid, gene, params["concept_id"]) - def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: - """Get xref or associated_with ref. + def _get_xref(self, src_name: str, src_id: str) -> Dict: + """Get xref. :param src_name: Source name :param src_id: The source's accession number - :return: A dict containing an xref or associated_with ref + :return: A dict containing an xref """ - source = dict() - if src_name.startswith("HGNC"): - source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"] - elif src_name.startswith("NCBI"): - source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"] - elif src_name.startswith("UniProt"): - source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"] - elif src_name.startswith("miRBase"): - source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"] - elif src_name.startswith("RFAM"): - source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"] - return source + for prefix, constrained_prefix in ( + ("HGNC", NamespacePrefix.HGNC), + ("NCBI", NamespacePrefix.NCBI), # ? + ("UniProt", NamespacePrefix.UNIPROT), + ("miRBase", NamespacePrefix.MIRBASE), + ("RFAM", NamespacePrefix.RFAM), + ): + if src_name.startswith(prefix): + return {"xrefs": [f"{constrained_prefix.value}:{src_id}"]} + _logger.warning("Unrecognized source name: %:%", src_name, src_id) + return {} def _get_vrs_chr_location(self, row: List[str], params: Dict) -> List: """Store GA4GH VRS ChromosomeLocation in a gene record. diff --git a/src/gene/query.py b/src/gene/query.py index 8c100446..0a57be43 100644 --- a/src/gene/query.py +++ b/src/gene/query.py @@ -375,9 +375,8 @@ def _add_gene( ) # mappings - source_ids = record.get("xrefs", []) + record.get("associated_with", []) mappings = [] - for source_id in source_ids: + for source_id in record.get("xrefs", []): system, code = source_id.split(":") mappings.append( core_models.Mapping( diff --git a/src/gene/schemas.py b/src/gene/schemas.py index d2c17d84..96b6fc4b 100644 --- a/src/gene/schemas.py +++ b/src/gene/schemas.py @@ -58,7 +58,6 @@ class MatchType(IntEnum): PREV_SYMBOL = 80 ALIAS = 60 XREF = 60 - ASSOCIATED_WITH = 60 FUZZY_MATCH = 20 NO_MATCH = 0 @@ -102,7 +101,6 @@ class BaseGene(BaseModel): aliases: List[StrictStr] = [] previous_symbols: List[StrictStr] = [] xrefs: List[CURIE] = [] - associated_with: List[CURIE] = [] gene_type: Optional[StrictStr] = None @@ -242,7 +240,6 @@ class RefType(str, Enum): PREVIOUS_SYMBOLS = "prev_symbol" ALIASES = "alias" XREFS = "xref" - ASSOCIATED_WITH = "associated_with" # collective name to singular name, e.g. {"previous_symbols": "prev_symbol"} @@ -561,8 +558,9 @@ class UnmergedNormalizationService(BaseNormalizationService): ], "aliases": ["3.1.1.7"], "previous_symbols": ["YT"], - "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"], - "associated_with": [ + "xrefs": [ + "ncbigene:43", + "ensembl:ENSG00000087085", "ucsc:uc003uxi.4", "vega:OTTHUMG00000157033", "merops:S09.979", @@ -671,8 +669,6 @@ class UnmergedNormalizationService(BaseNormalizationService): "xrefs": [ "hgnc:108", "ensembl:ENSG00000087085", - ], - "associated_with": [ "omim:100740", ], "gene_type": "protein-coding", diff --git a/tests/conftest.py b/tests/conftest.py index ad1a14a2..923d71ac 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -41,7 +41,6 @@ def _compare_records(normalized_gene, test_gene, match_type): assert set(normalized_gene.xrefs) == set(test_gene.xrefs) assert normalized_gene.symbol_status == test_gene.symbol_status assert set(normalized_gene.previous_symbols) == set(test_gene.previous_symbols) - assert set(normalized_gene.associated_with) == set(test_gene.associated_with) assert normalized_gene.symbol == test_gene.symbol assert len(normalized_gene.locations) == len(test_gene.locations) for loc in normalized_gene.locations: diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index 092cc6c3..62a4154e 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -76,7 +76,6 @@ def test_tables_created(db_fixture): existing_tables = db_fixture.db.list_tables() if db_fixture.db_name == "PostgresDatabase": assert set(existing_tables) == { - "gene_associations", "gene_symbols", "gene_previous_symbols", "gene_aliases", @@ -150,11 +149,6 @@ def test_item_type(db_fixture): assert "item_type" in item assert item["item_type"] == "alias" - filter_exp = Key("label_and_type").eq("omim:606689##associated_with") - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] - assert "item_type" in item - assert item["item_type"] == "associated_with" - filter_exp = Key("label_and_type").eq("ensembl:ensg00000268895##xref") item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] assert "item_type" in item diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index 1ab55430..e9980579 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -47,7 +47,6 @@ def ddx11l1(): } ], "strand": "+", - "associated_with": [], "gene_type": "transcribed_unprocessed_pseudogene", } return Gene(**params) @@ -79,7 +78,6 @@ def tp53(): } ], "strand": "-", - "associated_with": [], "gene_type": "protein_coding", } return Gene(**params) @@ -111,7 +109,6 @@ def ATP6AP1_DT(): # noqa: N802 } ], "strand": "-", - "associated_with": [], "gene_type": "lncRNA", } return Gene(**params) @@ -127,7 +124,6 @@ def hsa_mir_1253(): "label": "hsa-mir-1253", "previous_symbols": [], "aliases": [], - "xrefs": [], "symbol_status": None, "location_annotations": [], "locations": [ @@ -143,7 +139,7 @@ def hsa_mir_1253(): } ], "strand": "+", - "associated_with": ["mirbase:MI0006387"], + "xrefs": ["mirbase:MI0006387"], "gene_type": "lncRNA", } return Gene(**params) @@ -175,7 +171,6 @@ def spry3(): } ], "strand": "+", - "associated_with": [], "gene_type": "protein_coding", } return Gene(**params) @@ -254,9 +249,9 @@ def test_hsa_mir_1253(check_resp_single_record, ensembl, hsa_mir_1253): resp = ensembl.search("hsa-mir-1253") check_resp_single_record(resp, hsa_mir_1253, MatchType.SYMBOL) - # associated_with + # xref resp = ensembl.search("mirbase:MI0006387") - check_resp_single_record(resp, hsa_mir_1253, MatchType.ASSOCIATED_WITH) + check_resp_single_record(resp, hsa_mir_1253, MatchType.XREF) def test_spry3(check_resp_single_record, ensembl, spry3): diff --git a/tests/unit/test_hgnc_source.py b/tests/unit/test_hgnc_source.py index 185809fe..2d35c028 100644 --- a/tests/unit/test_hgnc_source.py +++ b/tests/unit/test_hgnc_source.py @@ -49,7 +49,9 @@ def a1bg_as1(): "previous_symbols": ["NCRNA00181", "A1BGAS", "A1BG-AS"], "aliases": ["FLJ23569"], "symbol_status": "approved", - "associated_with": [ + "xrefs": [ + "ensembl:ENSG00000268895", + "ncbigene:503538", "vega:OTTHUMG00000183508", "ucsc:uc002qse.3", "refseq:NR_015380", @@ -57,7 +59,6 @@ def a1bg_as1(): "refseq:NR_015380", "ena.embl:BC040926", ], - "xrefs": ["ensembl:ENSG00000268895", "ncbigene:503538"], "gene_type": "RNA, long non-coding", } return Gene(**params) @@ -86,7 +87,7 @@ def tp53(): "previous_symbols": [], "aliases": ["p53", "LFS1"], "symbol_status": "approved", - "associated_with": [ + "xrefs": [ "vega:OTTHUMG00000162125", "refseq:NM_000546", "cosmic:TP53", @@ -110,8 +111,9 @@ def tp53(): "pubmed:6396087", "pubmed:3456488", "pubmed:2047879", + "ensembl:ENSG00000141510", + "ncbigene:7157", ], - "xrefs": ["ensembl:ENSG00000141510", "ncbigene:7157"], "gene_type": "gene with protein product", } return Gene(**params) @@ -140,8 +142,9 @@ def a3galt2(): "previous_symbols": ["A3GALT2P"], "aliases": ["IGBS3S", "IGB3S"], "symbol_status": "approved", - "xrefs": ["ensembl:ENSG00000184389", "ncbigene:127550"], - "associated_with": [ + "xrefs": [ + "ensembl:ENSG00000184389", + "ncbigene:127550", "vega:OTTHUMG00000004125", "vega:OTTHUMG00000004125", "ucsc:uc031plq.1", @@ -180,8 +183,9 @@ def wdhd1(): "previous_symbols": [], "aliases": ["AND-1", "CTF4", "CHTF4"], "symbol_status": "approved", - "xrefs": ["ensembl:ENSG00000198554", "ncbigene:11169"], - "associated_with": [ + "xrefs": [ + "ensembl:ENSG00000198554", + "ncbigene:11169", "vega:OTTHUMG00000140304", "refseq:NM_007086", "omim:608126", @@ -212,8 +216,12 @@ def g6pr(): "previous_symbols": [], "aliases": ["GSD1aSP"], "symbol_status": "approved", - "xrefs": ["ncbigene:2541"], - "associated_with": ["pubmed:2172641", "pubmed:7814621", "pubmed:2996501"], + "xrefs": [ + "ncbigene:2541", + "pubmed:2172641", + "pubmed:7814621", + "pubmed:2996501", + ], "gene_type": "unknown", } return Gene(**params) @@ -233,8 +241,7 @@ def pirc24(): "previous_symbols": [], "aliases": [], "symbol_status": "approved", - "xrefs": ["ncbigene:100313810"], - "associated_with": ["pubmed:17881367"], + "xrefs": ["ncbigene:100313810", "pubmed:17881367"], "gene_type": "RNA, cluster", } return Gene(**params) @@ -263,8 +270,8 @@ def gage4(): "previous_symbols": [], "aliases": ["CT4.4"], "symbol_status": "approved", - "xrefs": ["ncbigene:2576"], - "associated_with": [ + "xrefs": [ + "ncbigene:2576", "refseq:NM_001474", "omim:300597", "uniprot:P0DSO3", @@ -290,8 +297,9 @@ def mafip(): "previous_symbols": [], "aliases": ["FLJ35473", "FLJ00219", "FLJ39633", "MIP", "pp5644", "TEKT4P4"], "symbol_status": "approved", - "xrefs": ["ensembl:ENSG00000274847", "ncbigene:727764"], - "associated_with": [ + "xrefs": [ + "ensembl:ENSG00000274847", + "ncbigene:727764", "vega:OTTHUMG00000188065", "refseq:NR_046439", "uniprot:Q8WZ33", @@ -319,8 +327,7 @@ def mt_7sdna(): "previous_symbols": ["MT7SDNA"], "aliases": [], "symbol_status": "approved", - "xrefs": [], - "associated_with": ["pubmed:24709344", "pubmed:273237"], + "xrefs": ["pubmed:24709344", "pubmed:273237"], "gene_type": "region", } return Gene(**params) @@ -350,7 +357,6 @@ def cecr(): "aliases": [], "symbol_status": "approved", "xrefs": ["ncbigene:1055"], - "associated_with": [], "gene_type": "region", } return Gene(**params) @@ -387,8 +393,9 @@ def csf2ra(): "previous_symbols": ["CSF2R"], "aliases": ["CD116", "alphaGMR"], "symbol_status": "approved", - "xrefs": ["ensembl:ENSG00000198223", "ncbigene:1438"], - "associated_with": [ + "xrefs": [ + "ensembl:ENSG00000198223", + "ncbigene:1438", "vega:OTTHUMG00000012533", "refseq:NM_001161529", "orphanet:209477", @@ -435,8 +442,7 @@ def rps24p5(): "previous_symbols": [], "aliases": [], "symbol_status": "approved", - "xrefs": ["ncbigene:100271094"], - "associated_with": ["refseq:NG_011274", "pubmed:19123937"], + "xrefs": ["ncbigene:100271094", "refseq:NG_011274", "pubmed:19123937"], "gene_type": "pseudogene", } return Gene(**params) @@ -465,8 +471,7 @@ def trl_cag2_1(): "previous_symbols": ["TRNAL13"], "aliases": ["tRNA-Leu-CAG-2-1"], "symbol_status": "approved", - "xrefs": ["ncbigene:100189130"], - "associated_with": ["ena.embl:HG983896"], + "xrefs": ["ncbigene:100189130", "ena.embl:HG983896"], "gene_type": "RNA, transfer", } return Gene(**params) @@ -495,8 +500,9 @@ def myo5b(): "previous_symbols": [], "aliases": ["KIAA1119"], "symbol_status": "approved", - "xrefs": ["ensembl:ENSG00000167306", "ncbigene:4645"], - "associated_with": [ + "xrefs": [ + "ensembl:ENSG00000167306", + "ncbigene:4645", "vega:OTTHUMG00000179843", "refseq:NM_001080467", "omim:606540", @@ -539,7 +545,7 @@ def gstt1(): "previous_symbols": [], "aliases": ["2.5.1.18"], "symbol_status": "approved", - "associated_with": [ + "xrefs": [ "refseq:NM_000853", "omim:600436", "ucsc:uc002zze.4", @@ -547,8 +553,9 @@ def gstt1(): "orphanet:470418", "ena.embl:KI270879", "pubmed:8617495", + "ensembl:ENSG00000277656", + "ncbigene:2952", ], - "xrefs": ["ensembl:ENSG00000277656", "ncbigene:2952"], "gene_type": "gene with protein product", } return Gene(**params) @@ -772,9 +779,9 @@ def test_myo5b(check_resp_single_record, myo5b, hgnc): resp = hgnc.search("MYO5B") check_resp_single_record(resp, myo5b, MatchType.SYMBOL) - # associated_with + # xref resp = hgnc.search("refseq:NM_001080467") - check_resp_single_record(resp, myo5b, MatchType.ASSOCIATED_WITH) + check_resp_single_record(resp, myo5b, MatchType.XREF) def test_gstt1(check_resp_single_record, gstt1, hgnc): @@ -787,9 +794,9 @@ def test_gstt1(check_resp_single_record, gstt1, hgnc): resp = hgnc.search("GSTT1") check_resp_single_record(resp, gstt1, MatchType.SYMBOL) - # associated_with + # xref resp = hgnc.search("omim:600436") - check_resp_single_record(resp, gstt1, MatchType.ASSOCIATED_WITH) + check_resp_single_record(resp, gstt1, MatchType.XREF) def test_no_match(hgnc): diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py index f7b7508c..95504401 100644 --- a/tests/unit/test_ncbi_source.py +++ b/tests/unit/test_ncbi_source.py @@ -22,7 +22,6 @@ def check_ncbi_discontinued_gene(normalizer_response, concept_id, symbol, match_ assert resp.aliases == [] assert resp.previous_symbols == [] assert resp.xrefs == [] - assert resp.associated_with == [] @pytest.fixture(scope="module") @@ -50,9 +49,8 @@ def dpf1(): "concept_id": "ncbigene:8193", "symbol": "DPF1", "aliases": ["BAF45b", "NEUD4", "neuro-d4", "SMARCG1"], - "xrefs": ["hgnc:20225", "ensembl:ENSG00000011332"], "previous_symbols": [], - "associated_with": ["omim:601670"], + "xrefs": ["hgnc:20225", "ensembl:ENSG00000011332", "omim:601670"], "symbol_status": None, "location_annotations": [], "strand": "-", @@ -90,9 +88,8 @@ def pdp1_symbol(): "concept_id": "ncbigene:54704", "symbol": "PDP1", "aliases": ["PDH", "PDP", "PDPC", "PPM2A", "PPM2C"], - "xrefs": ["hgnc:9279", "ensembl:ENSG00000164951"], + "xrefs": ["hgnc:9279", "ensembl:ENSG00000164951", "omim:605993"], "previous_symbols": ["LOC157663", "PPM2C"], - "associated_with": ["omim:605993"], "symbol_status": None, "location_annotations": [], "strand": "+", @@ -130,9 +127,8 @@ def pdp1_alias(): "concept_id": "ncbigene:403313", "symbol": "PLPP6", "aliases": ["PDP1", "PSDP", "PPAPDC2", "bA6J24.6", "LPRP-B", "PA-PSP"], - "xrefs": ["hgnc:23682", "ensembl:ENSG00000205808"], + "xrefs": ["hgnc:23682", "ensembl:ENSG00000205808", "omim:611666"], "previous_symbols": [], - "associated_with": ["omim:611666"], "symbol_status": None, "location_annotations": [], "strand": "+", @@ -171,9 +167,8 @@ def spry3(): "concept_id": "ncbigene:10251", "symbol": "SPRY3", "aliases": ["spry-3"], - "xrefs": ["hgnc:11271", "ensembl:ENSG00000168939"], + "xrefs": ["hgnc:11271", "ensembl:ENSG00000168939", "omim:300531"], "previous_symbols": ["LOC170187", "LOC253479"], - "associated_with": ["omim:300531"], "symbol_status": None, "location_annotations": [], "strand": "+", @@ -232,7 +227,6 @@ def adcp1(): "aliases": [], "xrefs": ["hgnc:229"], "previous_symbols": [], - "associated_with": [], "symbol_status": None, "strand": None, "location_annotations": ["6"], @@ -252,9 +246,8 @@ def afa(): "concept_id": "ncbigene:170", "symbol": "AFA", "aliases": [], - "xrefs": [], "previous_symbols": [], - "associated_with": ["omim:106250"], + "xrefs": ["omim:106250"], "symbol_status": None, "strand": None, "location_annotations": [], @@ -274,9 +267,8 @@ def znf84(): "concept_id": "ncbigene:7637", "symbol": "ZNF84", "aliases": ["HPF2"], - "xrefs": ["hgnc:13159", "ensembl:ENSG00000198040"], + "xrefs": ["hgnc:13159", "ensembl:ENSG00000198040", "omim:618554"], "previous_symbols": ["LOC100287429"], - "associated_with": ["omim:618554"], "symbol_status": None, "location_annotations": ["map from Rosati ref via FISH [AFS]"], "strand": "+", @@ -315,9 +307,14 @@ def slc25a6(): "concept_id": "ncbigene:293", "symbol": "SLC25A6", "aliases": ["AAC3", "ANT", "ANT 2", "ANT 3", "ANT3", "ANT3Y"], - "xrefs": ["hgnc:10992", "ensembl:ENSG00000169100", "ensembl:ENSG00000292334"], + "xrefs": [ + "hgnc:10992", + "ensembl:ENSG00000169100", + "ensembl:ENSG00000292334", + "omim:300151", + "omim:403000", + ], "previous_symbols": ["ANT3Y"], - "associated_with": ["omim:300151", "omim:403000"], "symbol_status": None, "location_annotations": [], "strand": "-", @@ -376,7 +373,6 @@ def loc106783576(): "aliases": [], "xrefs": [], "previous_symbols": [], - "associated_with": [], "symbol_status": None, "location_annotations": [], "strand": None, @@ -405,9 +401,8 @@ def glc1b(): "concept_id": "ncbigene:2722", "symbol": "GLC1B", "aliases": [], - "xrefs": [], "previous_symbols": [], - "associated_with": ["omim:606689"], + "xrefs": ["omim:606689"], "symbol_status": None, "location_annotations": [], "strand": None, @@ -436,9 +431,8 @@ def hdpa(): "concept_id": "ncbigene:50829", "symbol": "HDPA", "aliases": [], - "xrefs": [], "previous_symbols": [], - "associated_with": ["omim:300221"], + "xrefs": ["omim:300221"], "symbol_status": None, "location_annotations": [], "strand": None, @@ -470,7 +464,6 @@ def prkrap1(): "aliases": [], "xrefs": ["hgnc:33447"], "previous_symbols": ["LOC100289695"], - "associated_with": [], "symbol_status": None, "location_annotations": ["alternate reference locus"], "strand": "+", @@ -519,9 +512,8 @@ def mhb(): "concept_id": "ncbigene:619511", "symbol": "MHB", "aliases": [], - "xrefs": [], "previous_symbols": [], - "associated_with": ["omim:255160"], + "xrefs": ["omim:255160"], "symbol_status": None, "location_annotations": [], "strand": None, @@ -550,9 +542,8 @@ def spg37(): "concept_id": "ncbigene:100049159", "symbol": "SPG37", "aliases": [], - "xrefs": [], "previous_symbols": [], - "associated_with": ["omim:611945"], + "xrefs": ["omim:611945"], "symbol_status": None, "location_annotations": [], "strand": None, @@ -607,9 +598,9 @@ def test_dpf1(check_resp_single_record, ncbi, dpf1): resp = ncbi.search("neuro-d4") check_resp_single_record(resp, dpf1, MatchType.ALIAS) - # associated_with + # xref resp = ncbi.search("omim:601670") - check_resp_single_record(resp, dpf1, MatchType.ASSOCIATED_WITH) + check_resp_single_record(resp, dpf1, MatchType.XREF) # No Match resp = ncbi.search("DPF 1") @@ -751,9 +742,9 @@ def test_glc1b(check_resp_single_record, ncbi, glc1b): resp = ncbi.search("GLC1B") check_resp_single_record(resp, glc1b, MatchType.SYMBOL) - # associated_with + # xref resp = ncbi.search("omim:606689") - check_resp_single_record(resp, glc1b, MatchType.ASSOCIATED_WITH) + check_resp_single_record(resp, glc1b, MatchType.XREF) def test_hdpa(check_resp_single_record, ncbi, hdpa): @@ -792,9 +783,9 @@ def test_mhb(check_resp_single_record, ncbi, mhb): resp = ncbi.search("MHB") check_resp_single_record(resp, mhb, MatchType.SYMBOL) - # associated_with + # xref resp = ncbi.search("OMIM:255160") - check_resp_single_record(resp, mhb, MatchType.ASSOCIATED_WITH) + check_resp_single_record(resp, mhb, MatchType.XREF) def test_spg37(check_resp_single_record, ncbi, spg37): @@ -807,9 +798,9 @@ def test_spg37(check_resp_single_record, ncbi, spg37): resp = ncbi.search("SPG37") check_resp_single_record(resp, spg37, MatchType.SYMBOL) - # associated_with + # xref resp = ncbi.search("omim:611945") - check_resp_single_record(resp, spg37, MatchType.ASSOCIATED_WITH) + check_resp_single_record(resp, spg37, MatchType.XREF) def test_discontinued_genes(ncbi): diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index bfb11460..f9b08927 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -702,7 +702,6 @@ def normalize_unmerged_loc_653303(): "aliases": [], "previous_symbols": ["LOC196266", "LOC731196", "LOC654080"], "xrefs": [], - "associated_with": [], "gene_type": "pseudo", } ] @@ -745,8 +744,9 @@ def normalize_unmerged_chaf1a(): "CAF-1", ], "previous_symbols": [], - "xrefs": ["ensembl:ENSG00000167670", "ncbigene:10036"], - "associated_with": [ + "xrefs": [ + "ensembl:ENSG00000167670", + "ncbigene:10036", "vega:OTTHUMG00000181922", "ccds:CCDS32875", "ucsc:uc002mal.4", @@ -784,7 +784,6 @@ def normalize_unmerged_chaf1a(): "aliases": [], "previous_symbols": [], "xrefs": ["hgnc:1910"], - "associated_with": [], "gene_type": "protein_coding", } ], @@ -820,8 +819,11 @@ def normalize_unmerged_chaf1a(): ], "aliases": ["CAF1P150", "P150", "CAF1", "CAF1B", "CAF-1"], "previous_symbols": ["LOC107985297"], - "xrefs": ["ensembl:ENSG00000167670", "hgnc:1910"], - "associated_with": ["omim:601246"], + "xrefs": [ + "ensembl:ENSG00000167670", + "hgnc:1910", + "omim:601246", + ], "gene_type": "protein-coding", } ] @@ -867,8 +869,7 @@ def normalize_unmerged_ache(): ], "aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"], "previous_symbols": ["ACEE"], - "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"], - "associated_with": ["omim:100740"], + "xrefs": ["hgnc:108", "ensembl:ENSG00000087085", "omim:100740"], "gene_type": "protein-coding", } ], @@ -897,7 +898,6 @@ def normalize_unmerged_ache(): "aliases": [], "previous_symbols": [], "xrefs": ["hgnc:108"], - "associated_with": [], "gene_type": "protein_coding", } ] @@ -923,8 +923,9 @@ def normalize_unmerged_ache(): ], "aliases": ["3.1.1.7"], "previous_symbols": ["YT"], - "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"], - "associated_with": [ + "xrefs": [ + "ncbigene:43", + "ensembl:ENSG00000087085", "ucsc:uc003uxi.4", "vega:OTTHUMG00000157033", "merops:S09.979", @@ -1050,7 +1051,6 @@ def compare_unmerged_record(gene, test_gene): assert set(gene.xrefs) == set(test_gene.xrefs) assert gene.symbol_status == test_gene.symbol_status assert set(gene.previous_symbols) == set(test_gene.previous_symbols) - assert set(gene.associated_with) == set(test_gene.associated_with) assert gene.symbol == test_gene.symbol assert len(gene.locations) == len(test_gene.locations) for loc in gene.locations: @@ -1259,7 +1259,7 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): compare_normalize_resp( resp, q, - MatchType.ASSOCIATED_WITH, + MatchType.XREF, normalized_ache, expected_source_meta=source_meta, ) @@ -1337,7 +1337,7 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): compare_normalize_resp( resp, q, - MatchType.ASSOCIATED_WITH, + MatchType.XREF, normalized_braf, expected_source_meta=source_meta, ) @@ -1439,7 +1439,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): compare_normalize_resp( resp, q, - MatchType.ASSOCIATED_WITH, + MatchType.XREF, normalized_abl1, expected_source_meta=source_meta, ) @@ -1572,18 +1572,14 @@ def test_normalize_unmerged( resp = query_handler.normalize_unmerged(q) compare_unmerged_response(resp, q, [], MatchType.ALIAS, normalize_unmerged_chaf1a) - # assoc with + # xref q = "omim:100740" resp = query_handler.normalize_unmerged(q) - compare_unmerged_response( - resp, q, [], MatchType.ASSOCIATED_WITH, normalize_unmerged_ache - ) + compare_unmerged_response(resp, q, [], MatchType.XREF, normalize_unmerged_ache) q = "uniprot:Q13111" resp = query_handler.normalize_unmerged(q) - compare_unmerged_response( - resp, q, [], MatchType.ASSOCIATED_WITH, normalize_unmerged_chaf1a - ) + compare_unmerged_response(resp, q, [], MatchType.XREF, normalize_unmerged_chaf1a) def test_invalid_queries(query_handler): diff --git a/tests/unit/test_schemas.py b/tests/unit/test_schemas.py index 3d5fceed..afe56b84 100644 --- a/tests/unit/test_schemas.py +++ b/tests/unit/test_schemas.py @@ -78,15 +78,6 @@ def test_gene(gene, sequence_location): xrefs=["hgnc", "hgnc:1"], ) - # associated_with not a valid curie - with pytest.raises(pydantic.ValidationError): - Gene( - match_type=100, - concept_id="hgnc:1096", - symbol="BRAF", - associated_with=["hgnc", "hgnc:1"], - ) - # symbol status invalid with pytest.raises(pydantic.ValidationError): Gene(