Skip to content

Commit

Permalink
feat(IPVC-2265): associated_accessions (#15)
Browse files Browse the repository at this point in the history
  • Loading branch information
nvta1209 authored Apr 1, 2024
1 parent e0e4ff3 commit c5c1918
Show file tree
Hide file tree
Showing 11 changed files with 262 additions and 19 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,7 @@ To develop UTA, follow these steps.
4. Testing

$ docker build --target uta-test -t uta-test .
$ docker run -it --rm -v $(pwd)/tests:/opt/repos/uta/tests -v $(pwd)/src:/opt/repos/uta/src uta-test bash
$ docker run -it --rm -v $(pwd):/opt/repos/uta uta-test bash
$ python -m unittest

## UTA update procedure
Expand Down
6 changes: 6 additions & 0 deletions etc/scripts/run-uta-build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,9 @@ sbin/ncbi-parse-geneinfo $ncbi_dir/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gen
# For each step: stdout is gzip-compressed into the loading directory, while
# stderr of both pipeline stages is shown on the console and saved to the log.
# (Writing `gzip -c > file 2>&1 | tee log` instead would send gzip's stderr
# into the .gz file and leave tee with nothing to read.)
{ sbin/ncbi-parse-gene2refseq "$ncbi_dir/gene/DATA/gene2accession.gz" | gzip -c > "$loading_dir/assocacs.gz"; } 2>&1 | \
    tee "$logs_dir/ncbi-fetch-assoc-acs"

{ sbin/assoc-acs-merge "$loading_dir/assocacs.gz" | gzip -c > "$loading_dir/assocacs.cleaned.gz"; } 2>&1 | \
    tee "$logs_dir/assoc-acs-merge"

# parse transcript info from GBFF input files
GBFF_files=$(ls $ncbi_dir/refseq/H_sapiens/mRNA_Prot/human.*.rna.gbff.gz)
sbin/ncbi-parse-gbff "$GBFF_files" | gzip -c > "$loading_dir/gbff.txinfo.gz" 2>&1 | \
Expand All @@ -66,6 +69,9 @@ sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.exonsets.gz" | gzip -c > "$loa
# gene info
uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$loading_dir/genes.geneinfo.gz" 2>&1 | \
    tee "$logs_dir/load-geneinfo.log"

# associated accessions (output of sbin/assoc-acs-merge)
uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$loading_dir/assocacs.cleaned.gz" 2>&1 | \
    tee "$logs_dir/load-assoc-ac.log"

# transcript info
uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loading_dir/gbff.txinfo.gz" 2>&1 | \
    tee "$logs_dir/load-txinfo.log"
Expand Down
4 changes: 1 addition & 3 deletions sbin/ncbi-parse-gene2refseq
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ ftp://ftp.ncbi.nih.gov/gene/DATA/gene2refseq.gz
import io
import sys

from csv import DictReader

from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter
from uta.formats.ncbitsv import NCBITSVReader

Expand Down Expand Up @@ -47,7 +45,7 @@ if __name__ == "__main__":

key = (ga.gene_symbol, ga.tx_ac, ga.gene_id, ga.pro_ac)
if key in seen:
continue
continue

seen.add(key)
gaw.write(ga)
7 changes: 3 additions & 4 deletions src/uta/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@
uta (-C CONF ...) [options] load-geneinfo FILE
uta (-C CONF ...) [options] load-txinfo FILE
uta (-C CONF ...) [options] load-exonset FILE
uta (-C CONF ...) [options] load-assoc-ac FILE
uta (-C CONF ...) [options] load-sequences
uta (-C CONF ...) [options] align-exons [--sql SQL]
uta (-C CONF ...) [options] load-ncbi-seqgene FILE
uta (-C CONF ...) [options] grant-permissions
uta (-C CONF ...) [options] refresh-matviews
uta (-C CONF ...) [options] analyze
Options:
-C CONF, --conf CONF Configuration to read (required)
Expand Down Expand Up @@ -69,6 +70,7 @@ def main():
("create-schema", ul.create_schema),
("drop-schema", ul.drop_schema),
("grant-permissions", ul.grant_permissions),
("load-assoc-ac", ul.load_assoc_ac),
("load-exonset", ul.load_exonset),
("load-geneinfo", ul.load_geneinfo),
("load-origin", ul.load_origin),
Expand Down Expand Up @@ -118,13 +120,10 @@ def main():
cmd=cmd, elapsed=time.time() - t0))



if __name__ == "__main__":
main()




# <LICENSE>
# Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta)
##
Expand Down
1 change: 1 addition & 0 deletions src/uta/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ class InvalidHGVSVariantError(UTAError):
class EutilsDownloadError(Exception):
    """Error type for E-utilities download failures.

    NOTE(review): purpose inferred from the class name only; confirm against
    the raise sites. Derives directly from ``Exception``.
    """


# <LICENSE>
# Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta)
##
Expand Down
1 change: 0 additions & 1 deletion src/uta/formats/geneinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ def __next__(self):
return GeneInfo(**d)



if __name__ == '__main__':
tmpfn = '/tmp/exonset'

Expand Down
81 changes: 75 additions & 6 deletions src/uta/loading.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,14 @@
import itertools
import logging
import time
from typing import Any

from biocommons.seqrepo import SeqRepo
from bioutils.coordinates import strand_pm_to_int, MINUS_STRAND
from bioutils.digests import seq_md5
from bioutils.sequences import reverse_complement
from sqlalchemy.exc import IntegrityError
from sqlalchemy.orm import Session
from sqlalchemy.orm.exc import NoResultFound
from sqlalchemy import text
import psycopg2.extras
Expand Down Expand Up @@ -240,6 +242,47 @@ def grant_permissions(session, opts, cf):
session.commit()


def load_assoc_ac(session, opts, cf):
    """
    Insert rows into `associated_accessions` table in the UTA database,
    using data from a file written by sbin/assoc-acs-merge.

    `session` is a sqlalchemy session.
    `opts` is the parsed command-line options; `opts["FILE"]` is the path of a
    gzip-compressed, tab-delimited file whose header row names at least the
    columns `origin`, `pro_ac` and `tx_ac`.
    `cf` is the configuration; option `admin_role` of section `uta` names the
    database role that is assumed for the load.

    Rows that already exist (same origin, tx_ac and pro_ac) are not inserted
    again. Every newly inserted row is committed individually.
    KeyError is raised if one of the expected columns is missing from the file.
    """
    logger.info("load_assoc_ac")

    admin_role = cf.get("uta", "admin_role")
    session.execute(text(f"set role {admin_role};"))
    session.execute(text(f"set search_path = {usam.schema_name};"))
    fname = opts["FILE"]

    # Columns copied from the file into the table.
    columns = ("origin", "pro_ac", "tx_ac")

    with gzip.open(fname, "rt") as fhandle:
        for file_row in csv.DictReader(fhandle, delimiter="\t"):
            row = {column: file_row[column] for column in columns}
            aa, created = _get_or_insert(
                session=session,
                table=usam.AssociatedAccessions,
                row=row,
                row_identifier=("origin", "tx_ac", "pro_ac"),
            )
            if created:
                # If committing on every insert is too slow, we can
                # look into committing in batches like load_txinfo does.
                session.commit()
                logger.info(f"Added: {aa.tx_ac}, {aa.pro_ac}, {aa.origin}")
                continue

            logger.info(f"Already exists: {file_row}")
            # All fields should match when unique identifiers match.
            # Report discrepancies so that they can be investigated. With the
            # current identifier covering every loaded column this cannot
            # trigger; it guards against columns being added to `columns`
            # without also being added to the identifier.
            existing_row = {column: getattr(aa, column) for column in columns}
            if existing_row != row:
                logger.warning(f"Expected {existing_row} but got {row}")


def load_exonset(session, opts, cf):
# exonsets and associated exons are loaded together

Expand All @@ -265,7 +308,7 @@ def load_exonset(session, opts, cf):
logger.exception(e)
session.rollback()
n_errors += 1
finally:
finally:
(no) = (n is not None, o is not None)
if no == (True, False):
n_new += 1
Expand Down Expand Up @@ -305,7 +348,7 @@ def load_geneinfo(session, opts, cf):

def load_ncbi_geneinfo(session, opts, cf):
"""
import data as downloaded (by you) from
import data as downloaded (by you) from
ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/gene_info.gz
"""

Expand Down Expand Up @@ -475,7 +518,7 @@ def _upsert_seq(si):
for md5, si_iter in itertools.groupby(sorted(sir, key=lambda si: si.md5),
key=lambda si: si.md5):
sis = list(si_iter)

# if sequence doesn't exist in sequence table, make it
# this is to satisfy a FK dependency, which should be reconsidered
si = sis[0]
Expand Down Expand Up @@ -688,8 +731,6 @@ def _fetch_origin_by_name(name):
i_ti=i_ti, n_rows=n_rows,
n_new=n_new, n_unchanged=n_unchanged, n_cds_changed=n_cds_changed, n_exons_changed=n_exons_changed,
p=(i_ti + 1) / n_rows * 100))




def refresh_matviews(session, opts, cf):
Expand Down Expand Up @@ -740,6 +781,34 @@ def _get_seqrepo(cf):
_get_seqfetcher = _get_seqrepo


def _get_or_insert(
    session: Session,
    table: type[usam.Base],
    row: dict[str, Any],
    row_identifier: str | tuple[str, ...],
) -> tuple[usam.Base, bool]:
    """
    Returns a tuple `(instance, created)`: the sqlalchemy model of the fetched
    or newly added row, and whether it was newly added.

    `session` is a sqlalchemy session.
    `table` is the model class of the database table in which to insert `row`.
    `row` is a dict mapping column names to the values to insert.
    `row_identifier` is a column name, or a tuple of column names, whose values
    in `row` define a match between `row` and an existing row in the table.

    A new row is only added to the session here; it is not flushed or
    committed. Committing is left to the caller.

    sqlalchemy.orm.exc.MultipleResultsFound may be raised if `row_identifier` does not uniquely identify a row.
    KeyError may be raised if `row_identifier` refers to columns not present as keys in `row`.
    sqlalchemy.exc.IntegrityError (raised from psycopg2.errors.ForeignKeyViolation) may be raised
    once the pending insert is flushed, if a foreign key reference does not exist.
    """
    # A bare string names a single column. Wrap it so that the comprehension
    # below iterates over column names, not over the characters of the string.
    if isinstance(row_identifier, str):
        row_identifier = (row_identifier,)

    row_filter = {ri: row[ri] for ri in row_identifier}
    try:
        row_instance = session.query(table).filter_by(**row_filter).one()
        created = False
    except NoResultFound:
        row_instance = table(**row)
        session.add(row_instance)
        created = True
    return row_instance, created


def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess):

Expand All @@ -749,7 +818,7 @@ def _upsert_exon_set_record(session, tx_ac, alt_ac, strand, method, ess):
(new, None) -- no prior record; new was inserted
(None, old) -- prior record and unchaged; nothing was inserted
(new, old) -- prior record existed and was changed
"""

key = (tx_ac, alt_ac, method)
Expand Down
22 changes: 22 additions & 0 deletions src/uta/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

import sqlalchemy as sa
import sqlalchemy.orm as sao
import sqlalchemy.types
import sqlalchemy.sql.functions
from sqlalchemy.ext.declarative import declarative_base


Expand Down Expand Up @@ -220,6 +222,26 @@ class ExonAln(Base):
# methods:


class AssociatedAccessions(Base):
    """A (tx_ac, pro_ac) accession pair recorded for a given origin."""

    __tablename__ = "associated_accessions"
    __table_args__ = (
        # each pairing may appear only once per origin
        sa.UniqueConstraint("origin", "tx_ac", "pro_ac", name="unique_pair_in_origin"),
        # lookups by either accession
        sa.Index("associated_accessions_pro_ac", "pro_ac"),
        sa.Index("associated_accessions_tx_ac", "tx_ac"),
    )

    # columns:
    associated_accession_id = sa.Column(sa.Integer, primary_key=True, autoincrement=True)
    tx_ac = sa.Column(sa.Text, nullable=False)
    pro_ac = sa.Column(sa.Text, nullable=False)
    origin = sa.Column(sa.Text, nullable=False)
    # filled in by the database with now() when the row is inserted
    added = sa.Column(sa.TIMESTAMP, server_default=sa.func.now(), nullable=False)


# <LICENSE>
# Copyright 2014 UTA Contributors (https://bitbucket.org/biocommons/uta)
##
Expand Down
Binary file added tests/data/assocacs.gz
Binary file not shown.
108 changes: 108 additions & 0 deletions tests/test_uta_loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import configparser
import signal
import unittest

import sqlalchemy as sa
import testing.postgresql

import uta
import uta.loading as ul
import uta.models as usam


class TestUtaLoading(unittest.TestCase):
    """Tests for uta.loading, run against a temporary PostgreSQL instance."""

    def setUp(self):
        # Start a temporary database and connect to it.
        self.db = testing.postgresql.Postgresql()
        self.session = uta.connect(self.db.url())
        schema = usam.schema_name

        # Empty schema plus the admin role that the loaders switch to.
        schema_statements = (
            f'drop schema if exists {schema} cascade',
            f'create schema {schema}',
            'create role uta_admin',
            f'grant all privileges on schema {schema} to uta_admin',
        )
        for statement in schema_statements:
            self.session.execute(sa.text(statement))
        self.session.commit()

        # create all uta tables
        usam.Base.metadata.create_all(self.session.bind.engine)
        grant_statements = (
            f'grant all privileges on all tables in schema {schema} to uta_admin',
            f'grant all privileges on all sequences in schema {schema} to uta_admin',
        )
        for statement in grant_statements:
            self.session.execute(sa.text(statement))
        self.session.commit()

    def tearDown(self):
        self.session.close()
        self.db.stop(_signal=signal.SIGKILL)
        self.db.cleanup()

    def test_load_assoc_ac(self):
        """
        load_assoc_ac on tests/data/assocacs.gz should leave one
        associated_accessions record per row of the file.
        A record is created even when its transcript or origin is absent from
        the database; that holds only until tx_ac and origin are converted to
        foreign keys.
        """

        # origin referenced in the data file
        ncbi_origin = usam.Origin(
            name='NCBI',
            url='http://bogus.com/ncbi',
            url_ac_fmt='http://bogus.com/ncbi/{ac}',
        )

        # transcripts referenced in the data file
        transcript_acr = usam.Transcript(
            ac='NM_001097.3',
            origin=ncbi_origin,
            hgnc='ACR',
            cds_start_i=0,
            cds_end_i=1,
            cds_md5='a',
        )
        transcript_aco2 = usam.Transcript(
            ac='NM_001098.3',
            origin=ncbi_origin,
            hgnc='ACO2',
            cds_start_i=2,
            cds_end_i=3,
            cds_md5='b',
        )

        # One of the associated_accessions rows from the test data file is
        # added up front to demonstrate the get-or-insert behavior.
        preexisting = usam.AssociatedAccessions(
            tx_ac='NM_001097.3',
            pro_ac='NP_001088.2',
            origin='NCBI',
        )

        self.session.add_all([ncbi_origin, transcript_acr, transcript_aco2, preexisting])
        self.session.commit()

        cf = configparser.ConfigParser()
        cf.add_section('uta')
        cf.set('uta', 'admin_role', 'uta_admin')

        ul.load_assoc_ac(self.session, {'FILE': 'tests/data/assocacs.gz'}, cf)

        # associated_accessions table should contain one record per line in file
        records = (
            self.session.query(usam.AssociatedAccessions)
            .order_by(usam.AssociatedAccessions.tx_ac)
            .all()
        )
        actual = [
            {'tx_ac': record.tx_ac, 'pro_ac': record.pro_ac, 'origin_name': record.origin}
            for record in records
        ]
        expected = [
            {
                'tx_ac': 'DummyTx',
                'pro_ac': 'DummyProtein',
                'origin_name': 'DummyOrigin',
            },
            {
                'tx_ac': 'NM_001097.3',
                'pro_ac': 'NP_001088.2',
                'origin_name': 'NCBI',
            },
            {
                'tx_ac': 'NM_001098.3',
                'pro_ac': 'NP_001089.1',
                'origin_name': 'NCBI',
            },
        ]
        self.assertEqual(actual, expected)
Loading

0 comments on commit c5c1918

Please sign in to comment.