From c5c1918b32ef2af16e92c068eddece960f64cf13 Mon Sep 17 00:00:00 2001 From: nvta1209 <162694616+nvta1209@users.noreply.github.com> Date: Mon, 1 Apr 2024 13:10:31 -0700 Subject: [PATCH] feat(IPVC-2265): associated_accessions (#15) --- README.md | 2 +- etc/scripts/run-uta-build.sh | 6 ++ sbin/ncbi-parse-gene2refseq | 4 +- src/uta/cli.py | 7 +-- src/uta/exceptions.py | 1 + src/uta/formats/geneinfo.py | 1 - src/uta/loading.py | 81 ++++++++++++++++++++++++-- src/uta/models.py | 22 +++++++ tests/data/assocacs.gz | Bin 0 -> 144 bytes tests/test_uta_loading.py | 108 +++++++++++++++++++++++++++++++++++ tests/test_uta_models.py | 49 ++++++++++++++-- 11 files changed, 262 insertions(+), 19 deletions(-) create mode 100644 tests/data/assocacs.gz create mode 100644 tests/test_uta_loading.py diff --git a/README.md b/README.md index e897474..8b95314 100644 --- a/README.md +++ b/README.md @@ -289,7 +289,7 @@ To develop UTA, follow these steps. 4. Testing $ docker build --target uta-test -t uta-test . - $ docker run -it --rm -v $(pwd)/tests:/opt/repos/uta/tests -v $(pwd)/src:/opt/repos/uta/src uta-test bash + $ docker run -it --rm -v $(pwd):/opt/repos/uta uta-test bash $ python -m unittest ## UTA update procedure diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh index 86175c8..166c6cb 100755 --- a/etc/scripts/run-uta-build.sh +++ b/etc/scripts/run-uta-build.sh @@ -47,6 +47,9 @@ sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gen sbin/ncbi-parse-gene2refseq $ncbi_dir/gene/DATA/gene2accession.gz | gzip -c > "$loading_dir/assocacs.gz" 2>&1 | \ tee "$logs_dir/ncbi-fetch-assoc-acs" +sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assocacs.cleaned.gz" 2>&1 | \ + tee "$logs_dir/assoc-acs-merge" + # parse transcript info from GBFF input files GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz) sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/gbff.txinfo.gz" 2>&1 | \ @@ -66,6 
+69,9 @@ sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.exonsets.gz" | gzip -c > "$loa uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$loading_dir/genes.geneinfo.gz" 2>&1 | \ tee "$logs_dir/load-geneinfo.log" +uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$loading_dir/assocacs.cleaned.gz" 2>&1 | \ + tee "$logs_dir/load-assoc-ac.log" + # transcript info uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loading_dir/gbff.txinfo.gz" 2>&1 | \ tee "$logs_dir/load-txinfo.log" diff --git a/sbin/ncbi-parse-gene2refseq b/sbin/ncbi-parse-gene2refseq index 7bfe70c..1f2f6cb 100755 --- a/sbin/ncbi-parse-gene2refseq +++ b/sbin/ncbi-parse-gene2refseq @@ -10,8 +10,6 @@ ftp://ftp.ncbi.nih.gov/gene/DATA/gene2refseq.gz import io import sys -from csv import DictReader - from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter from uta.formats.ncbitsv import NCBITSVReader @@ -47,7 +45,7 @@ if __name__ == "__main__": key = (ga.gene_symbol, ga.tx_ac, ga.gene_id, ga.pro_ac) if key in seen: - continue + continue seen.add(key) gaw.write(ga) diff --git a/src/uta/cli.py b/src/uta/cli.py index 4b85440..adc8e8b 100644 --- a/src/uta/cli.py +++ b/src/uta/cli.py @@ -13,13 +13,14 @@ uta (-C CONF ...) [options] load-geneinfo FILE uta (-C CONF ...) [options] load-txinfo FILE uta (-C CONF ...) [options] load-exonset FILE + uta (-C CONF ...) [options] load-assoc-ac FILE uta (-C CONF ...) [options] load-sequences uta (-C CONF ...) [options] align-exons [--sql SQL] uta (-C CONF ...) [options] load-ncbi-seqgene FILE uta (-C CONF ...) [options] grant-permissions uta (-C CONF ...) [options] refresh-matviews uta (-C CONF ...) 
def load_assoc_ac(session, opts, cf):
    """
    Insert rows into `associated_accessions` table in the UTA database,
    using data from a file written by sbin/assoc-acs-merge.

    `session` is a sqlalchemy session.
    `opts` is the parsed command-line options; `opts["FILE"]` is the path of a
    gzip-compressed, tab-delimited file whose header row includes the columns
    `origin`, `pro_ac` and `tx_ac`.
    `cf` is the configuration; the `admin_role` option of its `uta` section
    names the database role that is assumed for the load.

    A row whose (origin, tx_ac, pro_ac) triple is already present is left
    untouched. Each new row is committed as soon as it has been added.

    KeyError is raised if the file's header lacks one of the required columns.
    """
    logger.info("load_assoc_ac")

    admin_role = cf.get("uta", "admin_role")
    session.execute(text(f"set role {admin_role};"))
    session.execute(text(f"set search_path = {usam.schema_name};"))
    fname = opts["FILE"]

    with gzip.open(fname, "rt") as fhandle:
        # DictReader takes the column names from the first line of the file.
        for file_row in csv.DictReader(fhandle, delimiter="\t"):
            # Only these three columns are loaded; any others are ignored.
            row = {
                "origin": file_row["origin"],
                "pro_ac": file_row["pro_ac"],
                "tx_ac": file_row["tx_ac"],
            }
            aa, created = _get_or_insert(
                session=session,
                table=usam.AssociatedAccessions,
                row=row,
                row_identifier=("origin", "tx_ac", "pro_ac"),
            )
            if created:
                # If committing on every insert is too slow, we can
                # look into committing in batches like load_txinfo does.
                session.commit()
                logger.info(f"Added: {aa.tx_ac}, {aa.pro_ac}, {aa.origin}")
            else:
                # The row identifier covers every column loaded from the file,
                # so a matched row is necessarily identical to the incoming
                # one and there is nothing further to compare.
                logger.info(f"Already exists: {file_row}")
def _get_or_insert(
    session: Session,
    table: type[usam.Base],
    row: dict[str, Any],
    row_identifier: str | tuple[str, ...],
) -> tuple[usam.Base, bool]:
    """
    Returns a sqlalchemy model of the inserted or fetched row, and a flag that
    is True when the row was newly added and False when it already existed.

    `session` is a sqlalchemy session.
    `table` is the model class of the database table in which to insert `row`.
    `row` is a dict of column name to value to insert into the table.
    `row_identifier` is a column name, or a tuple of column names, whose values in
    `row` define a match between `row` and an existing row in the table.

    A newly created row is only added to the session; committing is left to the caller.

    sqlalchemy.orm.exc.MultipleResultsFound may be raised if `row_identifier` does not uniquely identify a row.
    KeyError may be raised if `row_identifier` refers to columns not present as keys in `row`.
    sqlalchemy.exc.IntegrityError (raised from psycopg2.errors.ForeignKeyViolation) may be raised if a foreign key reference does not exist
    """
    # A bare string names a single column. Without this normalization it would
    # be iterated character by character and fail with a misleading KeyError.
    if isinstance(row_identifier, str):
        row_identifier = (row_identifier,)

    row_filter = {ri: row[ri] for ri in row_identifier}
    try:
        row_instance = session.query(table).filter_by(**row_filter).one()
    except NoResultFound:
        # No match: build a new instance from the full row and stage it.
        row_instance = table(**row)
        session.add(row_instance)
        return row_instance, True
    return row_instance, False
class AssociatedAccessions(Base):
    """
    Model for the `associated_accessions` table.

    Each row holds a `tx_ac`, a `pro_ac` and the `origin` they were loaded
    with (presumably a transcript/protein accession pair and its data source;
    confirm against the loader). The same (origin, tx_ac, pro_ac) triple
    cannot be stored twice.
    """

    __tablename__ = "associated_accessions"

    # One uniqueness rule over the whole triple, plus a lookup index on each
    # accession column.
    __table_args__ = (
        sa.UniqueConstraint(
            "origin",
            "tx_ac",
            "pro_ac",
            name="unique_pair_in_origin",
        ),
        sa.Index("associated_accessions_pro_ac", "pro_ac"),
        sa.Index("associated_accessions_tx_ac", "tx_ac"),
    )

    # columns:
    # surrogate key, assigned automatically
    associated_accession_id = sa.Column(
        sa.Integer,
        primary_key=True,
        autoincrement=True,
    )
    tx_ac = sa.Column(sa.Text, nullable=False)
    pro_ac = sa.Column(sa.Text, nullable=False)
    origin = sa.Column(sa.Text, nullable=False)
    # filled in by the database server (now()) when the row is inserted
    added = sa.Column(
        sa.TIMESTAMP,
        server_default=sa.func.now(),
        nullable=False,
    )