feat: remove associated_with as distinct group (merge w/ xrefs)

cancervariants · Jan 3, 2024 · 0e7a335 · 0e7a335
1 parent dc52928
commit 0e7a335
Show file tree

Hide file tree

Showing 24 changed files with 209 additions and 286 deletions.
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -12,7 +12,7 @@ Gene Normalizer |version|
      :alt: citation
      :target: https://zenodo.org/badge/latestdoi/309797998
 
-The Gene Normalizer provides tools for resolving ambiguous human gene references to consistently-structured, normalized terms. For gene concepts extracted from `NCBI Gene <https://www.ncbi.nlm.nih.gov/gene/>`_, `Ensembl <https://useast.ensembl.org/index.html>`_, and `HGNC <https://www.genenames.org/>`_, it designates a `CURIE <https://en.wikipedia.org/wiki/CURIE>`_, and provides additional metadata like current and previously-used symbols, aliases, database cross-references and associations, and coordinates.
+The Gene Normalizer provides tools for resolving ambiguous human gene references to consistently-structured, normalized terms. For gene concepts extracted from `NCBI Gene <https://www.ncbi.nlm.nih.gov/gene/>`_, `Ensembl <https://useast.ensembl.org/index.html>`_, and `HGNC <https://www.genenames.org/>`_, it designates a `CURIE <https://en.wikipedia.org/wiki/CURIE>`_, and provides additional metadata like current and previously-used symbols, aliases, database cross-references, and coordinates.
 
 A `public REST instance of the service <https://normalize.cancervariants.org/gene>`_ is available for programmatic queries:
 

diff --git a/docs/source/normalizing_data/sources.rst b/docs/source/normalizing_data/sources.rst
@@ -33,9 +33,7 @@ HGNC
      "previous_symbols": [],
      "xrefs": [
        "ensembl:ENSG00000157764",
-       "ncbigene:673"
-     ],
-     "associated_with": [
+       "ncbigene:673",
        "uniprot:P15056",
        "pubmed:2284096",
        "omim:164757",
@@ -99,7 +97,6 @@ Ensembl
      "xrefs": [
        "hgnc:1097"
      ],
-     "associated_with": [],
      "gene_type": "protein_coding",
      "match_type": 100
    }
@@ -143,9 +140,7 @@ The `NCBI Gene Database <https://www.ncbi.nlm.nih.gov/gene/>`_ is a service prov
       "previous_symbols": [],
       "xrefs": [
         "ensembl:ENSG00000157764",
-        "hgnc:1097"
-      ],
-      "associated_with": [
+        "hgnc:1097",
         "omim:164757"
       ],
       "gene_type": "protein-coding",

diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py
@@ -434,8 +434,7 @@ def _add_ref_record(
 
         :param str term: referent term
         :param str concept_id: concept ID to refer to
-        :param str ref_type: one of {'alias', 'label', 'xref',
-            'associated_with'}
+        :param str ref_type: one of {'alias', 'label', 'xref'}
         :param src_name: name of source for record
         """
         label_and_type = f"{term.lower()}##{ref_type}"

diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py
@@ -97,7 +97,6 @@ def list_tables(self) -> List[str]:
     _drop_db_query = b"""
     DROP MATERIALIZED VIEW IF EXISTS record_lookup_view;
     DROP TABLE IF EXISTS
-        gene_associations,
         gene_symbols,
         gene_previous_symbols,
         gene_aliases,
@@ -324,12 +323,11 @@ def _format_source_record(self, source_row: Tuple) -> Dict:
             "locations": source_row[5],
             "gene_type": source_row[6],
             "aliases": source_row[7],
-            "associated_with": source_row[8],
-            "previous_symbols": source_row[9],
-            "symbol": source_row[10],
-            "xrefs": source_row[11],
-            "src_name": source_row[12],
-            "merge_ref": source_row[13],
+            "previous_symbols": source_row[8],
+            "symbol": source_row[9],
+            "xrefs": source_row[10],
+            "src_name": source_row[11],
+            "merge_ref": source_row[12],
             "item_type": RecordType.IDENTITY.value,
         }
         return {k: v for k, v in gene_record.items() if v}
@@ -373,8 +371,7 @@ def _format_merged_record(self, merged_row: Tuple) -> Dict:
             "hgnc_locus_type": merged_row[11],
             "ncbi_gene_type": merged_row[12],
             "aliases": merged_row[13],
-            "associated_with": merged_row[14],
-            "xrefs": merged_row[15],
+            "xrefs": merged_row[14],
             "item_type": RecordType.MERGER.value,
         }
         return {k: v for k, v in merged_record.items() if v}
@@ -421,7 +418,6 @@ def get_record_by_id(
         RefType.PREVIOUS_SYMBOLS: b"SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;",  # noqa: E501
         RefType.ALIASES: b"SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;",  # noqa: E501
         RefType.XREFS: b"SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;",
-        RefType.ASSOCIATED_WITH: b"SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;",  # noqa: E501
     }
 
     def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
@@ -558,9 +554,6 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None:
     )
     _ins_alias_query = b"INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);"
     _ins_xref_query = b"INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);"
-    _ins_assoc_query = (
-        b"INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);"
-    )
 
     def add_record(self, record: Dict, src_name: SourceName) -> None:
         """Add new record to database.
@@ -591,8 +584,6 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
                     cur.execute(self._ins_alias_query, [a, concept_id])
                 for x in record.get("xrefs", []):
                     cur.execute(self._ins_xref_query, [x, concept_id])
-                for a in record.get("associated_with", []):
-                    cur.execute(self._ins_assoc_query, [a, concept_id])
                 for p in record.get("previous_symbols", []):
                     cur.execute(self._ins_prev_symbol_query, [p, concept_id])
                 if record.get("symbol"):
@@ -606,10 +597,9 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
     INSERT INTO gene_merged (
         concept_id, symbol, symbol_status, previous_symbols, label, strand,
         location_annotations, ensembl_locations, hgnc_locations, ncbi_locations,
-        hgnc_locus_type, ensembl_biotype, ncbi_gene_type, aliases, associated_with,
-        xrefs
+        hgnc_locus_type, ensembl_biotype, ncbi_gene_type, aliases, xrefs
     )
-    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
+    VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
     """
 
     def add_merged_record(self, record: Dict) -> None:
@@ -644,7 +634,6 @@ def add_merged_record(self, record: Dict) -> None:
                     record.get("ensembl_biotype"),
                     record.get("ncbi_gene_type"),
                     record.get("aliases"),
-                    record.get("associated_with"),
                     record.get("xrefs"),
                 ],
             )
@@ -702,13 +691,6 @@ def delete_normalized_concepts(self) -> None:
         WHERE gc.source = %s
     );
     """
-    _drop_associations_query = b"""
-    DELETE FROM gene_associations WHERE id IN (
-        SELECT ga.id FROM gene_associations ga LEFT JOIN gene_concepts gc
-            ON gc.concept_id = ga.concept_id
-        WHERE gc.source = %s
-    );
-    """
     _drop_prev_symbols_query = b"""
     DELETE FROM gene_previous_symbols WHERE id IN (
         SELECT gps.id FROM gene_previous_symbols gps LEFT JOIN gene_concepts gc
@@ -750,7 +732,6 @@ def delete_source(self, src_name: SourceName) -> None:
         """
         with self.conn.cursor() as cur:
             cur.execute(self._drop_aliases_query, [src_name.value])
-            cur.execute(self._drop_associations_query, [src_name.value])
             cur.execute(self._drop_prev_symbols_query, [src_name.value])
             cur.execute(self._drop_symbols_query, [src_name.value])
             cur.execute(self._drop_xrefs_query, [src_name.value])

diff --git a/src/gene/database/postgresql/add_fkeys.sql b/src/gene/database/postgresql/add_fkeys.sql
@@ -1,7 +1,5 @@
 ALTER TABLE gene_aliases ADD CONSTRAINT gene_aliases_concept_id_fkey
     FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id);
-ALTER TABLE gene_associations ADD CONSTRAINT gene_associations_concept_id_fkey
-    FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id);
 ALTER TABLE gene_previous_symbols
     ADD CONSTRAINT gene_previous_symbols_concept_id_fkey
     FOREIGN KEY (concept_id) REFERENCES gene_concepts (concept_id);

diff --git a/src/gene/database/postgresql/add_indexes.sql b/src/gene/database/postgresql/add_indexes.sql
@@ -7,7 +7,5 @@ CREATE INDEX idx_gps_symbol_low
     ON gene_previous_symbols (lower(prev_symbol));
 CREATE INDEX idx_ga_alias_low ON gene_aliases (lower(alias));
 CREATE INDEX idx_gx_xref_low ON gene_xrefs (lower(xref));
-CREATE INDEX idx_g_as_association_low
-    ON gene_associations (lower(associated_with));
 CREATE INDEX idx_rlv_concept_id_low
     ON record_lookup_view (lower(concept_id));
diff --git a/src/gene/database/postgresql/create_record_lookup_view.sql b/src/gene/database/postgresql/create_record_lookup_view.sql
@@ -7,7 +7,6 @@ SELECT gc.concept_id,
        gc.locations,
        gc.gene_type,
        ga.aliases,
-       gas.associated_with,
        gps.previous_symbols,
        gs.symbol,
        gx.xrefs,
@@ -20,11 +19,6 @@ FULL JOIN (
     FROM gene_aliases ga_1
     GROUP BY ga_1.concept_id
 ) ga ON gc.concept_id::text = ga.concept_id::text
-FULL JOIN (
-    SELECT gas_1.concept_id, array_agg(gas_1.associated_with) AS associated_with
-    FROM gene_associations gas_1
-    GROUP BY gas_1.concept_id
-) gas ON gc.concept_id::text = gas.concept_id::text
 FULL JOIN (
     SELECT gps_1.concept_id, array_agg(gps_1.prev_symbol) AS previous_symbols
     FROM gene_previous_symbols gps_1

diff --git a/src/gene/database/postgresql/create_tables.sql b/src/gene/database/postgresql/create_tables.sql
@@ -26,7 +26,6 @@ CREATE TABLE gene_merged (
     hgnc_locus_type TEXT [],
     ncbi_gene_type TEXT [],
     aliases TEXT [],
-    associated_with TEXT [],
     xrefs TEXT []
 );
 CREATE TABLE gene_concepts (
@@ -60,8 +59,3 @@ CREATE TABLE gene_xrefs (
     xref TEXT NOT NULL,
     concept_id VARCHAR(127) NOT NULL REFERENCES gene_concepts (concept_id)
 );
-CREATE TABLE gene_associations (
-    id SERIAL PRIMARY KEY,
-    associated_with TEXT NOT NULL,
-    concept_ID VARCHAR(127) NOT NULL REFERENCES gene_concepts (concept_id)
-);
diff --git a/src/gene/database/postgresql/delete_normalized_concepts.sql b/src/gene/database/postgresql/delete_normalized_concepts.sql
@@ -19,7 +19,6 @@ CREATE TABLE gene_merged (
     hgnc_locus_type TEXT [],
     ncbi_gene_type TEXT [],
     aliases TEXT [],
-    associated_with TEXT [],
     xrefs TEXT []
 );
 ALTER TABLE gene_concepts ADD CONSTRAINT gene_concepts_merge_ref_fkey

diff --git a/src/gene/database/postgresql/drop_fkeys.sql b/src/gene/database/postgresql/drop_fkeys.sql
@@ -1,5 +1,4 @@
 ALTER TABLE gene_aliases DROP CONSTRAINT gene_aliases_concept_id_fkey;
-ALTER TABLE gene_associations DROP CONSTRAINT gene_associations_concept_id_fkey;
 ALTER TABLE gene_previous_symbols
     DROP CONSTRAINT gene_previous_symbols_concept_id_fkey;
 ALTER TABLE gene_symbols DROP CONSTRAINT gene_symbols_concept_id_fkey;

diff --git a/src/gene/database/postgresql/drop_indexes.sql b/src/gene/database/postgresql/drop_indexes.sql
@@ -4,5 +4,4 @@ DROP INDEX IF EXISTS idx_gs_symbol_low;
 DROP INDEX IF EXISTS idx_gps_symbol_low;
 DROP INDEX IF EXISTS idx_gx_xref_low;
 DROP INDEX IF EXISTS idx_ga_alias_low;
-DROP INDEX IF EXISTS idx_g_as_association_low;
 DROP INDEX IF EXISTS idx_rlv_concept_id_low;
diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py
@@ -1,7 +1,7 @@
 """Defines the Ensembl ETL methods."""
 import logging
 import re
-from typing import Dict
+from typing import Dict, Optional
 
 import gffutils
 from gffutils.feature import Feature
@@ -90,73 +90,83 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
         return gene_params
 
     def _add_attributes(self, f: Feature, gene: Dict) -> None:
-        """Add concept_id, symbol, xrefs, and associated_with to a gene record.
+        """Add concept_id, symbol, gene_type, label, and xrefs to a gene record.
 
         :param f: A gene from the data
         :param gene: A transformed gene record
         """
-        attributes = {
+        attributes_map = {
             "ID": "concept_id",
             "Name": "symbol",
             "description": "xrefs",
             "biotype": "gene_type",
         }
 
-        for attribute in f.attributes.items():
-            key = attribute[0]
-
-            if key in attributes.keys():
-                val = attribute[1]
-
-                if len(val) == 1:
-                    val = val[0]
-                    if key == "ID":
-                        if val.startswith("gene"):
-                            val = (
-                                f"{NamespacePrefix.ENSEMBL.value}:"
-                                f"{val.split(':')[1]}"
-                            )
-
-                if key == "description":
-                    gene["label"] = val.split("[")[0].strip()
-                    if "Source:" in val:
-                        src_name = (
-                            val.split("[")[-1]
-                            .split("Source:")[-1]
-                            .split("Acc")[0]
-                            .split(";")[0]
-                        )
-                        src_id = val.split("Acc:")[-1].split("]")[0]
-                        if ":" in src_id:
-                            src_id = src_id.split(":")[-1]
-                        source = self._get_xref_associated_with(src_name, src_id)
-                        if "xrefs" in source:
-                            gene["xrefs"] = source["xrefs"]
-                        elif "associated_with" in source:
-                            gene["associated_with"] = source["associated_with"]
-                    continue
-
-                gene[attributes[key]] = val
-
-    def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict:
-        """Get xref or associated_with concept.
+        for key, value in f.attributes.items():
+            # ignore unmapped attributes, and guard against empty value lists
+            if key not in attributes_map or not value:
+                continue
+
+            first = value[0]
+            if key == "ID" and first.startswith("gene"):
+                # e.g. "gene:ENSG00000157764" -> "ensembl:ENSG00000157764"
+                ensembl_id = first.split(":")[1]
+                gene["concept_id"] = f"{NamespacePrefix.ENSEMBL.value}:{ensembl_id}"
+            elif key == "description":
+                # Descriptions look like
+                #   "<label> [Source:<source name>;Acc:<accession>]"
+                # The accession may or may not carry its own namespace prefix
+                # (the previous implementation handled both), and the bracketed
+                # suffix may be missing altogether.
+                match = re.search(r"\[Source:([^;\]]*);Acc:([^\]]*)\]\s*$", first)
+                label = (first[: match.start()] if match else first).strip()
+                if label:
+                    gene["label"] = label
+                if match:
+                    src_name, accession = match.groups()
+                    # resolve by source name; drop any namespace in the accession
+                    xref = self._get_xref(src_name, accession.split(":")[-1])
+                    # _get_xref returns None for unrecognized sources; don't
+                    # record a [None] xref list in that case
+                    if xref:
+                        gene["xrefs"] = [xref]
+            else:
+                # attribute values arrive as lists; store a lone value as a
+                # scalar, as the previous implementation did
+                gene[attributes_map[key]] = first if len(value) == 1 else value
+
+    def _get_xref(self, src_name: str, src_id: str) -> Optional[str]:
+        """Build a namespaced xref from a source name and accession number.
 
         :param src_name: Source name
         :param src_id: The source's accession number
-        :return: A dict containing an other identifier or xref
+        :return: prefixed xref, or None if ``src_name`` is not a recognized source
         """
-        source = dict()
-        if src_name.startswith("HGNC"):
-            source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"]
-        elif src_name.startswith("NCBI"):
-            source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"]
-        elif src_name.startswith("UniProt"):
-            source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"]
-        elif src_name.startswith("miRBase"):
-            source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"]
-        elif src_name.startswith("RFAM"):
-            source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"]
-        return source
+        for prefix, constrained_prefix in (
+            ("HGNC", NamespacePrefix.HGNC),
+            ("NCBI", NamespacePrefix.NCBI),
+            ("UniProt", NamespacePrefix.UNIPROT),
+            ("miRBase", NamespacePrefix.MIRBASE),
+            ("RFAM", NamespacePrefix.RFAM),
+        ):
+            if src_name.startswith(prefix):
+                return f"{constrained_prefix.value}:{src_id}"
+        _logger.warning("Unrecognized source name: %s:%s", src_name, src_id)
+        return None
 
     def _add_meta(self) -> None:
         """Add Ensembl metadata.