Skip to content

Commit

Permalink
feat: remove associated_with as distinct group (merge w/ xrefs)
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Jan 3, 2024
1 parent dc52928 commit 0e7a335
Show file tree
Hide file tree
Showing 24 changed files with 209 additions and 286 deletions.
2 changes: 1 addition & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ Gene Normalizer |version|
:alt: citation
:target: https://zenodo.org/badge/latestdoi/309797998

The Gene Normalizer provides tools for resolving ambiguous human gene references to consistently-structured, normalized terms. For gene concepts extracted from `NCBI Gene <https://www.ncbi.nlm.nih.gov/gene/>`_, `Ensembl <https://useast.ensembl.org/index.html>`_, and `HGNC <https://www.genenames.org/>`_, it designates a `CURIE <https://en.wikipedia.org/wiki/CURIE>`_, and provides additional metadata like current and previously-used symbols, aliases, database cross-references and associations, and coordinates.
The Gene Normalizer provides tools for resolving ambiguous human gene references to consistently-structured, normalized terms. For gene concepts extracted from `NCBI Gene <https://www.ncbi.nlm.nih.gov/gene/>`_, `Ensembl <https://useast.ensembl.org/index.html>`_, and `HGNC <https://www.genenames.org/>`_, it designates a `CURIE <https://en.wikipedia.org/wiki/CURIE>`_, and provides additional metadata like current and previously-used symbols, aliases, database cross-references, and coordinates.

A `public REST instance of the service <https://normalize.cancervariants.org/gene>`_ is available for programmatic queries:

Expand Down
9 changes: 2 additions & 7 deletions docs/source/normalizing_data/sources.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ HGNC
"previous_symbols": [],
"xrefs": [
"ensembl:ENSG00000157764",
"ncbigene:673"
],
"associated_with": [
"ncbigene:673",
"uniprot:P15056",
"pubmed:2284096",
"omim:164757",
Expand Down Expand Up @@ -99,7 +97,6 @@ Ensembl
"xrefs": [
"hgnc:1097"
],
"associated_with": [],
"gene_type": "protein_coding",
"match_type": 100
}
Expand Down Expand Up @@ -143,9 +140,7 @@ The `NCBI Gene Database <https://www.ncbi.nlm.nih.gov/gene/>`_ is a service prov
"previous_symbols": [],
"xrefs": [
"ensembl:ENSG00000157764",
"hgnc:1097"
],
"associated_with": [
"hgnc:1097",
"omim:164757"
],
"gene_type": "protein-coding",
Expand Down
3 changes: 1 addition & 2 deletions src/gene/database/dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,8 +434,7 @@ def _add_ref_record(
:param str term: referent term
:param str concept_id: concept ID to refer to
:param str ref_type: one of {'alias', 'label', 'xref',
'associated_with'}
:param str ref_type: one of {'alias', 'label', 'xref'}
:param src_name: name of source for record
"""
label_and_type = f"{term.lower()}##{ref_type}"
Expand Down
35 changes: 8 additions & 27 deletions src/gene/database/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ def list_tables(self) -> List[str]:
_drop_db_query = b"""
DROP MATERIALIZED VIEW IF EXISTS record_lookup_view;
DROP TABLE IF EXISTS
gene_associations,
gene_symbols,
gene_previous_symbols,
gene_aliases,
Expand Down Expand Up @@ -324,12 +323,11 @@ def _format_source_record(self, source_row: Tuple) -> Dict:
"locations": source_row[5],
"gene_type": source_row[6],
"aliases": source_row[7],
"associated_with": source_row[8],
"previous_symbols": source_row[9],
"symbol": source_row[10],
"xrefs": source_row[11],
"src_name": source_row[12],
"merge_ref": source_row[13],
"previous_symbols": source_row[8],
"symbol": source_row[9],
"xrefs": source_row[10],
"src_name": source_row[11],
"merge_ref": source_row[12],
"item_type": RecordType.IDENTITY.value,
}
return {k: v for k, v in gene_record.items() if v}
Expand Down Expand Up @@ -373,8 +371,7 @@ def _format_merged_record(self, merged_row: Tuple) -> Dict:
"hgnc_locus_type": merged_row[11],
"ncbi_gene_type": merged_row[12],
"aliases": merged_row[13],
"associated_with": merged_row[14],
"xrefs": merged_row[15],
"xrefs": merged_row[14],
"item_type": RecordType.MERGER.value,
}
return {k: v for k, v in merged_record.items() if v}
Expand Down Expand Up @@ -421,7 +418,6 @@ def get_record_by_id(
RefType.PREVIOUS_SYMBOLS: b"SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;", # noqa: E501
RefType.ALIASES: b"SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;", # noqa: E501
RefType.XREFS: b"SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;",
RefType.ASSOCIATED_WITH: b"SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;", # noqa: E501
}

def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
Expand Down Expand Up @@ -558,9 +554,6 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None:
)
_ins_alias_query = b"INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);"
_ins_xref_query = b"INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);"
_ins_assoc_query = (
b"INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);"
)

def add_record(self, record: Dict, src_name: SourceName) -> None:
"""Add new record to database.
Expand Down Expand Up @@ -591,8 +584,6 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
cur.execute(self._ins_alias_query, [a, concept_id])
for x in record.get("xrefs", []):
cur.execute(self._ins_xref_query, [x, concept_id])
for a in record.get("associated_with", []):
cur.execute(self._ins_assoc_query, [a, concept_id])
for p in record.get("previous_symbols", []):
cur.execute(self._ins_prev_symbol_query, [p, concept_id])
if record.get("symbol"):
Expand All @@ -606,10 +597,9 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
INSERT INTO gene_merged (
concept_id, symbol, symbol_status, previous_symbols, label, strand,
location_annotations, ensembl_locations, hgnc_locations, ncbi_locations,
hgnc_locus_type, ensembl_biotype, ncbi_gene_type, aliases, associated_with,
xrefs
hgnc_locus_type, ensembl_biotype, ncbi_gene_type, aliases, xrefs
)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
"""

def add_merged_record(self, record: Dict) -> None:
Expand Down Expand Up @@ -644,7 +634,6 @@ def add_merged_record(self, record: Dict) -> None:
record.get("ensembl_biotype"),
record.get("ncbi_gene_type"),
record.get("aliases"),
record.get("associated_with"),
record.get("xrefs"),
],
)
Expand Down Expand Up @@ -702,13 +691,6 @@ def delete_normalized_concepts(self) -> None:
WHERE gc.source = %s
);
"""
_drop_associations_query = b"""
DELETE FROM gene_associations WHERE id IN (
SELECT ga.id FROM gene_associations ga LEFT JOIN gene_concepts gc
ON gc.concept_id = ga.concept_id
WHERE gc.source = %s
);
"""
_drop_prev_symbols_query = b"""
DELETE FROM gene_previous_symbols WHERE id IN (
SELECT gps.id FROM gene_previous_symbols gps LEFT JOIN gene_concepts gc
Expand Down Expand Up @@ -750,7 +732,6 @@ def delete_source(self, src_name: SourceName) -> None:
"""
with self.conn.cursor() as cur:
cur.execute(self._drop_aliases_query, [src_name.value])
cur.execute(self._drop_associations_query, [src_name.value])
cur.execute(self._drop_prev_symbols_query, [src_name.value])
cur.execute(self._drop_symbols_query, [src_name.value])
cur.execute(self._drop_xrefs_query, [src_name.value])
Expand Down
2 changes: 0 additions & 2 deletions src/gene/database/postgresql/add_fkeys.sql
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
ALTER TABLE gene_aliases ADD CONSTRAINT gene_aliases_concept_id_fkey
FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id);
ALTER TABLE gene_associations ADD CONSTRAINT gene_associations_concept_id_fkey
FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id);
ALTER TABLE gene_previous_symbols
ADD CONSTRAINT gene_previous_symbols_concept_id_fkey
FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id);
Expand Down
2 changes: 0 additions & 2 deletions src/gene/database/postgresql/add_indexes.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,5 @@ CREATE INDEX idx_gps_symbol_low
ON gene_previous_symbols (lower(prev_symbol));
CREATE INDEX idx_ga_alias_low ON gene_aliases (lower(alias));
CREATE INDEX idx_gx_xref_low ON gene_xrefs (lower(xref));
CREATE INDEX idx_g_as_association_low
ON gene_associations (lower(associated_with));
CREATE INDEX idx_rlv_concept_id_low
ON record_lookup_view (lower(concept_id));
6 changes: 0 additions & 6 deletions src/gene/database/postgresql/create_record_lookup_view.sql
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ SELECT gc.concept_id,
gc.locations,
gc.gene_type,
ga.aliases,
gas.associated_with,
gps.previous_symbols,
gs.symbol,
gx.xrefs,
Expand All @@ -20,11 +19,6 @@ FULL JOIN (
FROM gene_aliases ga_1
GROUP BY ga_1.concept_id
) ga ON gc.concept_id::text = ga.concept_id::text
FULL JOIN (
SELECT gas_1.concept_id, array_agg(gas_1.associated_with) AS associated_with
FROM gene_associations gas_1
GROUP BY gas_1.concept_id
) gas ON gc.concept_id::text = gas.concept_id::text
FULL JOIN (
SELECT gps_1.concept_id, array_agg(gps_1.prev_symbol) AS previous_symbols
FROM gene_previous_symbols gps_1
Expand Down
6 changes: 0 additions & 6 deletions src/gene/database/postgresql/create_tables.sql
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ CREATE TABLE gene_merged (
hgnc_locus_type TEXT [],
ncbi_gene_type TEXT [],
aliases TEXT [],
associated_with TEXT [],
xrefs TEXT []
);
CREATE TABLE gene_concepts (
Expand Down Expand Up @@ -60,8 +59,3 @@ CREATE TABLE gene_xrefs (
xref TEXT NOT NULL,
concept_id VARCHAR(127) NOT NULL REFERENCES gene_concepts (concept_id)
);
CREATE TABLE gene_associations (
id SERIAL PRIMARY KEY,
associated_with TEXT NOT NULL,
concept_ID VARCHAR(127) NOT NULL REFERENCES gene_concepts (concept_id)
);
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ CREATE TABLE gene_merged (
hgnc_locus_type TEXT [],
ncbi_gene_type TEXT [],
aliases TEXT [],
associated_with TEXT [],
xrefs TEXT []
);
ALTER TABLE gene_concepts ADD CONSTRAINT gene_concepts_merge_ref_fkey
Expand Down
1 change: 0 additions & 1 deletion src/gene/database/postgresql/drop_fkeys.sql
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
ALTER TABLE gene_aliases DROP CONSTRAINT gene_aliases_concept_id_fkey;
ALTER TABLE gene_associations DROP CONSTRAINT gene_associations_concept_id_fkey;
ALTER TABLE gene_previous_symbols
DROP CONSTRAINT gene_previous_symbols_concept_id_fkey;
ALTER TABLE gene_symbols DROP CONSTRAINT gene_symbols_concept_id_fkey;
Expand Down
1 change: 0 additions & 1 deletion src/gene/database/postgresql/drop_indexes.sql
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,4 @@ DROP INDEX IF EXISTS idx_gs_symbol_low;
DROP INDEX IF EXISTS idx_gps_symbol_low;
DROP INDEX IF EXISTS idx_gx_xref_low;
DROP INDEX IF EXISTS idx_ga_alias_low;
DROP INDEX IF EXISTS idx_g_as_association_low;
DROP INDEX IF EXISTS idx_rlv_concept_id_low;
118 changes: 64 additions & 54 deletions src/gene/etl/ensembl.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Defines the Ensembl ETL methods."""
import logging
import re
from typing import Dict
from typing import Dict, Optional

import gffutils
from gffutils.feature import Feature
Expand Down Expand Up @@ -90,73 +90,83 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
return gene_params

def _add_attributes(self, f: Feature, gene: Dict) -> None:
    """Add concept_id, symbol, label, xrefs, and gene_type to a gene record.

    The ``description`` attribute is expected to look like
    ``"<label> [Source:<source name>;Acc:<accession>]"``. The label is stored
    under ``"label"`` and, when the source is recognized by ``_get_xref``, the
    accession is stored as a single-item ``"xrefs"`` list.

    :param f: A gene from the data
    :param gene: A transformed gene record; updated in place
    """
    attributes_map = {
        "ID": "concept_id",
        "Name": "symbol",
        "description": "xrefs",
        "biotype": "gene_type",
    }
    # The accession is captured whole and split afterwards, because it may or
    # may not carry its own namespace (e.g. "HGNC:1097" vs. a bare accession).
    description_pattern = re.compile(r"^(.*?)\s*\[Source:([^;\]]*);Acc:([^\]]*)\]$")

    for key, value in f.attributes.items():
        # NOTE(review): attribute values are presumably lists of strings (the
        # code indexes ``value[0]``) -- confirm against the GFF parser in use.
        if key not in attributes_map or not value:
            continue

        first = value[0]
        if key == "ID" and first.startswith("gene"):
            gene["concept_id"] = (
                f"{NamespacePrefix.ENSEMBL.value}:{first.split(':')[1]}"
            )
        elif key == "description":
            match = description_pattern.match(first)
            if match is None:
                # No source annotation: keep the text before any bracket as
                # the label instead of dropping the description entirely.
                gene["label"] = first.split("[")[0].strip()
                continue
            label, src_name, src_id = match.groups()
            gene["label"] = label.strip()
            # Drop an accession namespace ("HGNC:1097" -> "1097"); the
            # namespace prefix is re-added by ``_get_xref``.
            xref = self._get_xref(src_name, src_id.rpartition(":")[2])
            if xref is not None:  # never store ``[None]`` for unknown sources
                gene["xrefs"] = [xref]
        else:
            # Store single-valued attributes as plain values, not 1-item lists.
            gene[attributes_map[key]] = first if len(value) == 1 else value

def _get_xref(self, src_name: str, src_id: str) -> Optional[str]:
    """Build a namespaced xref for a recognized source.

    :param src_name: Source name; matched by prefix against the known sources
    :param src_id: The source's accession number
    :return: ``"<namespace prefix>:<src_id>"`` if ``src_name`` starts with a
        recognized source prefix, otherwise ``None`` (a warning is logged)
    """
    for prefix, constrained_prefix in (
        ("HGNC", NamespacePrefix.HGNC),
        ("NCBI", NamespacePrefix.NCBI),
        ("UniProt", NamespacePrefix.UNIPROT),
        ("miRBase", NamespacePrefix.MIRBASE),
        ("RFAM", NamespacePrefix.RFAM),
    ):
        if src_name.startswith(prefix):
            return f"{constrained_prefix.value}:{src_id}"
    # Each lazy logging placeholder needs a conversion type; a bare "%" makes
    # the message formatting fail instead of emitting the warning.
    _logger.warning("Unrecognized source name: %s:%s", src_name, src_id)
    return None

def _add_meta(self) -> None:
"""Add Ensembl metadata.
Expand Down
Loading

0 comments on commit 0e7a335

Please sign in to comment.