feat(IPVC-2265): associated_accessions (#15)

invitae · Apr 1, 2024 · c5c1918 · c5c1918
1 parent e0e4ff3
commit c5c1918
Show file tree

Hide file tree

Showing 11 changed files with 262 additions and 19 deletions.
diff --git a/README.md b/README.md
@@ -289,7 +289,7 @@ To develop UTA, follow these steps.
 4. Testing
 
         $ docker build --target uta-test -t uta-test .
-        $ docker run -it --rm -v $(pwd)/tests:/opt/repos/uta/tests -v $(pwd)/src:/opt/repos/uta/src uta-test bash
+        $ docker run -it --rm -v $(pwd):/opt/repos/uta uta-test bash
         $ python -m unittest
 
 ## UTA update procedure

diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh
@@ -47,6 +47,9 @@ sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gen
 sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$loading_dir/assocacs.gz" 2>&1 | \
   tee "$logs_dir/ncbi-fetch-assoc-acs"
 
+{ sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assocacs.cleaned.gz"; } 2>&1 | \
+  tee "$logs_dir/assoc-acs-merge"
+
 # parse transcript info from GBFF input files
 GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz)
 sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/gbff.txinfo.gz" 2>&1 | \
@@ -66,6 +69,9 @@ sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.exonsets.gz" | gzip -c > "$loa
 uta --conf=etc/global.conf --conf=etc/[email protected] load-geneinfo "$loading_dir/genes.geneinfo.gz" 2>&1 | \
   tee "$logs_dir/load-geneinfo.log"
 
+uta --conf=etc/global.conf --conf=etc/[email protected] load-assoc-ac "$loading_dir/assocacs.cleaned.gz" 2>&1 | \
+  tee "$logs_dir/load-assoc-ac.log"
+
 # transcript info
 uta --conf=etc/global.conf --conf=etc/[email protected] load-txinfo "$loading_dir/gbff.txinfo.gz" 2>&1 | \
   tee "$logs_dir/load-txinfo.log"

diff --git a/sbin/ncbi-parse-gene2refseq b/sbin/ncbi-parse-gene2refseq
@@ -10,8 +10,6 @@ ftp://ftp.ncbi.nih.gov/gene/DATA/gene2refseq.gz
 import io
 import sys
 
-from csv import DictReader
-
 from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter
 from uta.formats.ncbitsv import NCBITSVReader
 
@@ -47,7 +45,7 @@ if __name__ == "__main__":
 
         key = (ga.gene_symbol, ga.tx_ac, ga.gene_id, ga.pro_ac)
         if key in seen:
-           continue 
+            continue
 
         seen.add(key)
         gaw.write(ga)
diff --git a/src/uta/cli.py b/src/uta/cli.py
@@ -13,13 +13,14 @@
   uta (-C CONF ...) [options] load-geneinfo FILE
   uta (-C CONF ...) [options] load-txinfo FILE
   uta (-C CONF ...) [options] load-exonset FILE
+  uta (-C CONF ...) [options] load-assoc-ac FILE
   uta (-C CONF ...) [options] load-sequences
   uta (-C CONF ...) [options] align-exons [--sql SQL]
   uta (-C CONF ...) [options] load-ncbi-seqgene FILE
   uta (-C CONF ...) [options] grant-permissions
   uta (-C CONF ...) [options] refresh-matviews
   uta (-C CONF ...) [options] analyze
-  
+
 Options:
   -C CONF, --conf CONF	Configuration to read (required)
 
@@ -69,6 +70,7 @@ def main():
         ("create-schema",       ul.create_schema),
         ("drop-schema",         ul.drop_schema),
         ("grant-permissions",   ul.grant_permissions),
+        ("load-assoc-ac",       ul.load_assoc_ac),
         ("load-exonset",        ul.load_exonset),
         ("load-geneinfo",       ul.load_geneinfo),
         ("load-origin",         ul.load_origin),
@@ -118,13 +120,10 @@ def main():
         cmd=cmd, elapsed=time.time() - t0))
 
 
-
 if __name__ == "__main__":
     main()
 
 
-
-
 # <LICENSE>
 # Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta)
 ##

diff --git a/src/uta/exceptions.py b/src/uta/exceptions.py
@@ -21,6 +21,7 @@ class InvalidHGVSVariantError(UTAError):
 class EutilsDownloadError(Exception):
     pass
 
+
 # <LICENSE>
 # Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta)
 ##

diff --git a/src/uta/formats/geneinfo.py b/src/uta/formats/geneinfo.py
@@ -38,7 +38,6 @@ def __next__(self):
         return GeneInfo(**d)
 
 
-
 if __name__ == '__main__':
     tmpfn = '/tmp/exonset'
 

diff --git a/src/uta/loading.py b/src/uta/loading.py
@@ -7,12 +7,14 @@
 import itertools
 import logging
 import time
+from typing import Any
 
 from biocommons.seqrepo import SeqRepo
 from bioutils.coordinates import strand_pm_to_int, MINUS_STRAND
 from bioutils.digests import seq_md5
 from bioutils.sequences import reverse_complement
 from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import Session
 from sqlalchemy.orm.exc import NoResultFound
 from sqlalchemy import text
 import psycopg2.extras
@@ -240,6 +242,47 @@ def grant_permissions(session, opts, cf):
     session.commit()
 
 
+def load_assoc_ac(session, opts, cf):
+    """
+    Insert rows into `associated_accessions` table in the UTA database,
+    using data from a file written by sbin/assoc-acs-merge.
+
+    `session` is a sqlalchemy session; each newly inserted row is committed.
+    `opts["FILE"]` is the path of a gzipped TSV with origin, pro_ac and tx_ac columns.
+    `cf` is the configuration; `admin_role` in section `uta` is the role set before loading.
+    """
+    logger.info("load_assoc_ac")
+
+    admin_role = cf.get("uta", "admin_role")
+    session.execute(text(f"set role {admin_role};"))
+    session.execute(text(f"set search_path = {usam.schema_name};"))
+    fname = opts["FILE"]
+    # columns copied from each file row into the table
+    columns = ("origin", "pro_ac", "tx_ac")
+
+    with gzip.open(fname, "rt") as fhandle:
+        for file_row in csv.DictReader(fhandle, delimiter="\t"):
+            row = {col: file_row[col] for col in columns}
+            aa, created = _get_or_insert(
+                session=session,
+                table=usam.AssociatedAccessions,
+                row=row,
+                row_identifier=('origin', 'tx_ac', 'pro_ac'),
+            )
+            if created:
+                # If committing on every insert is too slow, we can
+                # look into committing in batches like load_txinfo does.
+                session.commit()
+                logger.info(f"Added: {aa.tx_ac}, {aa.pro_ac}, {aa.origin}")
+            else:
+                logger.info(f"Already exists: {file_row}")
+                # All fields should match when unique identifiers match; report discrepancies.
+                # (Every column is currently part of the identifier, so none are expected.)
+                existing_row = {col: getattr(aa, col) for col in columns}
+                if existing_row != row:
+                    logger.warning(f"Existing row {existing_row} differs from file row {row}")
+
+
 def load_exonset(session, opts, cf):
     # exonsets and associated exons are loaded together
 
@@ -265,7 +308,7 @@ def load_exonset(session, opts, cf):
             logger.exception(e)
             session.rollback()
             n_errors += 1
-        finally:        
+        finally:
             (no) = (n is not None, o is not None)
             if no == (True, False):
                 n_new += 1
@@ -305,7 +348,7 @@ def load_geneinfo(session, opts, cf):
 
 def load_ncbi_geneinfo(session, opts, cf):
     """
-    import data as downloaded (by you) from 
+    import data as downloaded (by you) from
     ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
     """
 
@@ -475,7 +518,7 @@ def _upsert_seq(si):
     for md5, si_iter in itertools.groupby(sorted(sir, key=lambda si: si.md5),
                                           key=lambda si: si.md5):
         sis = list(si_iter)
-    
+
         # if sequence doesn't exist in sequence table, make it
         # this is to satisfy a FK dependency, which should be reconsidered
         si = sis[0]
@@ -688,8 +731,6 @@ def _fetch_origin_by_name(name):
                 i_ti=i_ti, n_rows=n_rows,
                 n_new=n_new, n_unchanged=n_unchanged, n_cds_changed=n_cds_changed, n_exons_changed=n_exons_changed,
                 p=(i_ti + 1) / n_rows * 100))
-
-
 
 
 def refresh_matviews(session, opts, cf):
@@ -740,6 +781,34 @@ def _get_seqrepo(cf):
 _get_seqfetcher = _get_seqrepo
 
 
+def _get_or_insert(
+    session: Session,
+    table: type[usam.Base],
+    row: dict[str, Any],
+    row_identifier: str | tuple[str, ...],
+) -> tuple[usam.Base, bool]:
+    """
+    Returns (instance, created): the fetched or newly added sqlalchemy model, and True if it was added.
+
+    `session` is a sqlalchemy session; a new row is added to it but not committed here.
+    `table` is the model class of the database table in which to insert `row`.
+    `row` is a dict mapping column names to the values to insert into the table.
+    `row_identifier` is a column name, or tuple of column names, whose values in `row` must match an existing row.
+
+    sqlalchemy.orm.exc.MultipleResultsFound may be raised if `row_identifier` does not uniquely identify a row.
+    KeyError may be raised if `row_identifier` refers to columns not present as keys in `row`.
+    sqlalchemy.exc.IntegrityError (raised from psycopg2.errors.ForeignKeyViolation) may be raised if a foreign key reference does not exist
+    """
+    if isinstance(row_identifier, str):  # a bare str would otherwise be iterated per character
+        row_identifier = (row_identifier,)
+    row_filter = {ri: row[ri] for ri in row_identifier}
+    try:
+        return session.query(table).filter_by(**row_filter).one(), False
+    except NoResultFound:
+        row_instance = table(**row)
+        session.add(row_instance)
+        return row_instance, True
+
 
 def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess):
 
@@ -749,7 +818,7 @@ def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess):
     (new, None) -- no prior record; new was inserted
     (None, old) -- prior record and unchaged; nothing was inserted
     (new, old)  -- prior record existed and was changed
-    
+
     """
 
     key = (tx_ac, alt_ac, method)

diff --git a/src/uta/models.py b/src/uta/models.py
@@ -6,6 +6,8 @@
 
 import sqlalchemy as sa
 import sqlalchemy.orm as sao
+import sqlalchemy.types
+import sqlalchemy.sql.functions
 from sqlalchemy.ext.declarative import declarative_base
 
 
@@ -220,6 +222,26 @@ class ExonAln(Base):
     # methods:
 
 
+class AssociatedAccessions(Base):
+    __tablename__ = "associated_accessions"
+    __table_args__ = (
+        sa.UniqueConstraint("origin", "tx_ac", "pro_ac", name="unique_pair_in_origin"),
+        sa.Index("associated_accessions_pro_ac", "pro_ac"),
+        sa.Index("associated_accessions_tx_ac", "tx_ac"),
+    )
+    # Each (origin, tx_ac, pro_ac) combination is stored at most once; tx_ac and pro_ac are indexed.
+    # columns (`added` defaults to the database server's now() when a row is inserted):
+    associated_accession_id = sa.Column(sa.Integer, primary_key=True, autoincrement=True)
+    tx_ac = sa.Column(sa.Text, nullable=False)
+    pro_ac = sa.Column(sa.Text, nullable=False)
+    origin = sa.Column(sa.Text, nullable=False)
+    added = sa.Column(
+        sqlalchemy.types.TIMESTAMP,
+        server_default=sqlalchemy.sql.functions.now(),
+        nullable=False,
+    )
+
+
 # <LICENSE>
 # Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta)
 ##

diff --git a/tests/data/assocacs.gz b/tests/data/assocacs.gz
diff --git a/tests/test_uta_loading.py b/tests/test_uta_loading.py
@@ -0,0 +1,108 @@
+import configparser
+import signal
+import unittest
+
+import sqlalchemy as sa
+import testing.postgresql
+
+import uta
+import uta.loading as ul
+import uta.models as usam
+
+
+class TestUtaLoading(unittest.TestCase):
+    """Tests for uta.loading, run against a temporary testing.postgresql database."""
+
+    def setUp(self):
+        """Start a temporary database; create the UTA schema, uta_admin role, and tables."""
+        # Cleanups, unlike tearDown, still run (last registered first) if setUp fails part-way.
+        self.db = testing.postgresql.Postgresql()
+        self.addCleanup(self.db.cleanup)
+        self.addCleanup(self.db.stop, _signal=signal.SIGKILL)
+        self.session = uta.connect(self.db.url())
+        self.addCleanup(self.session.close)
+        schema = usam.schema_name
+        self.session.execute(sa.text(f'drop schema if exists {schema} cascade'))
+        self.session.execute(sa.text(f'create schema {schema}'))
+        self.session.execute(sa.text('create role uta_admin'))
+        self.session.execute(sa.text(f'grant all privileges on schema {schema} to uta_admin'))
+        self.session.commit()
+
+        # create all uta tables
+        usam.Base.metadata.create_all(self.session.bind.engine)
+        self.session.execute(sa.text(f'grant all privileges on all tables in schema {schema} to uta_admin'))
+        self.session.execute(sa.text(f'grant all privileges on all sequences in schema {schema} to uta_admin'))
+        self.session.commit()
+
+    def test_load_assoc_ac(self):
+        """
+        Loading file tests/data/assocacs.gz should create associated_accessions records in the database.
+        Row will be created in associated_accessions even when transcript or origin does not exist in database.
+        This is only the case until tx_ac and origin are converted to foreign keys.
+        """
+
+        # insert origins referenced in data file
+        o1 = usam.Origin(
+            name='NCBI',
+            url='http://bogus.com/ncbi',
+            url_ac_fmt='http://bogus.com/ncbi/{ac}',
+        )
+        self.session.add(o1)
+
+        # insert transcripts referenced in data file
+        t1 = usam.Transcript(
+            ac='NM_001097.3',
+            origin=o1,
+            hgnc='ACR',
+            cds_start_i=0,
+            cds_end_i=1,
+            cds_md5='a',
+        )
+        t2 = usam.Transcript(
+            ac='NM_001098.3',
+            origin=o1,
+            hgnc='ACO2',
+            cds_start_i=2,
+            cds_end_i=3,
+            cds_md5='b',
+        )
+        self.session.add_all([t1, t2])
+
+        # pre-add one of the associated_accessions from the test data file
+        # to demonstrate get-or-insert behavior
+        p = usam.AssociatedAccessions(
+            tx_ac='NM_001097.3',
+            pro_ac='NP_001088.2',
+            origin='NCBI',
+        )
+        self.session.add(p)
+
+        self.session.commit()
+
+        cf = configparser.ConfigParser()
+        cf.add_section('uta')
+        cf.set('uta', 'admin_role', 'uta_admin')
+
+        ul.load_assoc_ac(self.session, {'FILE': 'tests/data/assocacs.gz'}, cf)
+
+        # associated_accessions table should contain one record per line in file
+        records = self.session.query(usam.AssociatedAccessions).order_by(usam.AssociatedAccessions.tx_ac).all()
+        aa_list = [{'tx_ac': rec.tx_ac, 'pro_ac': rec.pro_ac, 'origin_name': rec.origin} for rec in records]
+        expected_aa_list = [
+            {
+                'tx_ac': 'DummyTx',
+                'pro_ac': 'DummyProtein',
+                'origin_name': 'DummyOrigin',
+            },
+            {
+                'tx_ac': 'NM_001097.3',
+                'pro_ac': 'NP_001088.2',
+                'origin_name': 'NCBI',
+            },
+            {
+                'tx_ac': 'NM_001098.3',
+                'pro_ac': 'NP_001089.1',
+                'origin_name': 'NCBI',
+            },
+        ]
+        self.assertEqual(aa_list, expected_aa_list)