Skip to content

Commit

Permalink
Merge branch 'interface-updates-epic' into fix-error-names
Browse files Browse the repository at this point in the history
  • Loading branch information
jsstevenson committed Dec 14, 2023
2 parents 2bc13de + da1a121 commit 3aea4fc
Show file tree
Hide file tree
Showing 9 changed files with 67 additions and 79 deletions.
5 changes: 0 additions & 5 deletions src/gene/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Provides a CLI util to make updates to normalizer database."""
import logging
import os
from pathlib import Path
from typing import Optional, Tuple
Expand All @@ -11,10 +10,6 @@
from gene.etl.update import update_all_sources, update_normalized, update_source
from gene.schemas import SourceName

logger = logging.getLogger("gene")
logger.setLevel(logging.DEBUG)


url_description = 'URL endpoint for the application database. Can either be a URL to a local DynamoDB server (e.g. "http://localhost:8001") or a libpq-compliant PostgreSQL connection description (e.g. "postgresql://postgres:password@localhost:5432/gene_normalizer").'


Expand Down
43 changes: 17 additions & 26 deletions src/gene/database/dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
SourceName,
)

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)


class DynamoDbDatabase(AbstractDatabase):
Expand Down Expand Up @@ -165,7 +165,7 @@ def check_schema_initialized(self) -> bool:
existing_tables = self.list_tables()
exists = self.gene_table in existing_tables
if not exists:
logger.info(f"{self.gene_table} table is missing or unavailable.")
_logger.info(f"{self.gene_table} table is missing or unavailable.")
return exists

def check_tables_populated(self) -> bool:
Expand All @@ -182,7 +182,7 @@ def check_tables_populated(self) -> bool:
KeyConditionExpression=Key("item_type").eq("source"),
).get("Items", [])
if len(sources) < len(SourceName):
logger.info("Gene sources table is missing expected sources.")
_logger.info("Gene sources table is missing expected sources.")
return False

records = self.genes.query(
Expand All @@ -191,7 +191,7 @@ def check_tables_populated(self) -> bool:
Limit=1,
)
if len(records.get("Items", [])) < 1:
logger.info("Gene records index is empty.")
_logger.info("Gene records index is empty.")
return False

normalized_records = self.genes.query(
Expand All @@ -200,7 +200,7 @@ def check_tables_populated(self) -> bool:
Limit=1,
)
if len(normalized_records.get("Items", [])) < 1:
logger.info("Normalized gene records index is empty.")
_logger.info("Normalized gene records index is empty.")
return False

return True
Expand Down Expand Up @@ -262,10 +262,8 @@ def get_record_by_id(
del record["label_and_type"]
return record
except ClientError as e:
logger.error(
f"boto3 client error on get_records_by_id for "
f"search term {concept_id}: "
f"{e.response['Error']['Message']}"
_logger.error(
f"boto3 client error on get_records_by_id for search term {concept_id}: {e.response['Error']['Message']}"
)
return None
except (KeyError, IndexError): # record doesn't exist
Expand All @@ -285,10 +283,8 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
matches = self.genes.query(KeyConditionExpression=filter_exp)
return [m["concept_id"] for m in matches.get("Items", None)]
except ClientError as e:
logger.error(
f"boto3 client error on get_refs_by_type for "
f"search term {search_term}: "
f"{e.response['Error']['Message']}"
_logger.error(
f"boto3 client error on get_refs_by_type for search term {search_term}: {e.response['Error']['Message']}"
)
return []

Expand Down Expand Up @@ -389,9 +385,8 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
try:
self.batch.put_item(Item=record)
except ClientError as e:
logger.error(
"boto3 client error on add_record for "
f"{concept_id}: {e.response['Error']['Message']}"
_logger.error(
f"boto3 client error on add_record for {concept_id}: {e.response['Error']['Message']}"
)
for attr_type, item_type in ITEM_TYPES.items():
if attr_type in record:
Expand Down Expand Up @@ -421,9 +416,8 @@ def add_merged_record(self, record: Dict) -> None:
try:
self.batch.put_item(Item=record)
except ClientError as e:
logger.error(
"boto3 client error on add_record for "
f"{concept_id}: {e.response['Error']['Message']}"
_logger.error(
f"boto3 client error on add_record for {concept_id}: {e.response['Error']['Message']}"
)

def _add_ref_record(
Expand All @@ -447,10 +441,8 @@ def _add_ref_record(
try:
self.batch.put_item(Item=record)
except ClientError as e:
logger.error(
f"boto3 client error adding reference {term} for "
f"{concept_id} with match type {ref_type}: "
f"{e.response['Error']['Message']}"
_logger.error(
f"boto3 client error adding reference {term} for {concept_id} with match type {ref_type}: {e.response['Error']['Message']}"
)

def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: ANN401
Expand Down Expand Up @@ -479,9 +471,8 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN
f"No such record exists for keys {label_and_type}, {concept_id}"
)
else:
logger.error(
f"boto3 client error in `database.update_record()`: "
f"{e.response['Error']['Message']}"
_logger.error(
f"boto3 client error in `database.update_record()`: {e.response['Error']['Message']}"
)

def delete_normalized_concepts(self) -> None:
Expand Down
22 changes: 11 additions & 11 deletions src/gene/database/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
)
from gene.schemas import RecordType, RefType, SourceMeta, SourceName

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)


SCRIPTS_DIR = Path(__file__).parent / "postgresql"
Expand Down Expand Up @@ -124,7 +124,7 @@ def drop_db(self) -> None:
with self.conn.cursor() as cur:
cur.execute(self._drop_db_query)
self.conn.commit()
logger.info("Dropped all existing gene normalizer tables.")
_logger.info("Dropped all existing gene normalizer tables.")

def check_schema_initialized(self) -> bool:
"""Check if database schema is properly initialized.
Expand All @@ -137,7 +137,7 @@ def check_schema_initialized(self) -> bool:
except DuplicateTable:
self.conn.rollback()
else:
logger.info("Gene table existence check failed.")
_logger.info("Gene table existence check failed.")
self.conn.rollback()
return False
try:
Expand All @@ -146,7 +146,7 @@ def check_schema_initialized(self) -> bool:
except DuplicateObject:
self.conn.rollback()
else:
logger.info("Gene foreign key existence check failed.")
_logger.info("Gene foreign key existence check failed.")
self.conn.rollback()
return False
try:
Expand All @@ -157,7 +157,7 @@ def check_schema_initialized(self) -> bool:
except DuplicateTable:
self.conn.rollback()
else:
logger.info("Gene normalized view lookup failed.")
_logger.info("Gene normalized view lookup failed.")
self.conn.rollback()
return False
try:
Expand All @@ -166,7 +166,7 @@ def check_schema_initialized(self) -> bool:
except DuplicateTable:
self.conn.rollback()
else:
logger.info("Gene indexes check failed.")
_logger.info("Gene indexes check failed.")
self.conn.rollback()
return False

Expand All @@ -189,21 +189,21 @@ def check_tables_populated(self) -> bool:
cur.execute(self._check_sources_query)
results = cur.fetchall()
if len(results) < len(SourceName):
logger.info("Gene sources table is missing expected sources.")
_logger.info("Gene sources table is missing expected sources.")
return False

with self.conn.cursor() as cur:
cur.execute(self._check_concepts_query)
result = cur.fetchone()
if not result or result[0] < 1:
logger.info("Gene records table is empty.")
_logger.info("Gene records table is empty.")
return False

with self.conn.cursor() as cur:
cur.execute(self._check_merged_query)
result = cur.fetchone()
if not result or result[0] < 1:
logger.info("Normalized gene records table is empty.")
_logger.info("Normalized gene records table is empty.")
return False

return True
Expand Down Expand Up @@ -265,7 +265,7 @@ def _drop_indexes(self) -> None:

def _create_tables(self) -> None:
"""Create all tables, indexes, and views."""
logger.debug("Creating new gene normalizer tables.")
_logger.info("Creating new gene normalizer tables.")
tables_query = (SCRIPTS_DIR / "create_tables.sql").read_bytes()

with self.conn.cursor() as cur:
Expand Down Expand Up @@ -599,7 +599,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
cur.execute(self._ins_symbol_query, [record["symbol"], concept_id])
self.conn.commit()
except UniqueViolation:
logger.error(f"Record with ID {concept_id} already exists")
_logger.error(f"Record with ID {concept_id} already exists")
self.conn.rollback()

_add_merged_record_query = b"""
Expand Down
16 changes: 8 additions & 8 deletions src/gene/etl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from pathlib import Path
from typing import Dict, List, Optional, Union

import click
import pydantic
from biocommons.seqrepo import SeqRepo
from gffutils.feature import Feature
Expand All @@ -15,8 +14,7 @@
from gene.database import AbstractDatabase
from gene.schemas import ITEM_TYPES, Gene, GeneSequenceLocation, MatchType, SourceName

logger = logging.getLogger("gene")
logger.setLevel(logging.DEBUG)
_logger = logging.getLogger(__name__)


class GeneNormalizerEtlError(Exception):
Expand Down Expand Up @@ -80,11 +78,11 @@ def perform_etl(self, use_existing: bool = False) -> List[str]:
uploaded.
"""
self._extract_data(use_existing)
if not self._silent:
click.echo("Transforming and loading data to DB...")
_logger.info(f"Transforming and loading {self._src_name} data to DB...")
self._add_meta()
self._transform_data()
self._database.complete_write_transaction()
_logger.info(f"Data load complete for {self._src_name}.")
return self._processed_ids

def _extract_data(self, use_existing: bool) -> None:
Expand All @@ -95,9 +93,11 @@ def _extract_data(self, use_existing: bool) -> None:
:param bool use_existing: if True, don't try to fetch latest source data
"""
_logger.info(f"Gathering {self._src_name} data...")
self._data_file, self._version = self._data_source.get_latest(
from_local=use_existing
)
_logger.info(f"Acquired data for {self._src_name}: {self._data_file}")

@abstractmethod
def _transform_data(self) -> None:
Expand All @@ -120,7 +120,7 @@ def _load_gene(self, gene: Dict) -> None:
try:
assert Gene(match_type=MatchType.NO_MATCH, **gene)
except pydantic.ValidationError as e:
logger.warning(f"Unable to load {gene} due to validation error: " f"{e}")
_logger.warning(f"Unable to load {gene} due to validation error: " f"{e}")
else:
concept_id = gene["concept_id"]
gene["label_and_type"] = f"{concept_id.lower()}##identity"
Expand Down Expand Up @@ -221,7 +221,7 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]:
try:
aliases = self.seqrepo.translate_alias(seq_id, target_namespaces="ga4gh")
except KeyError as e:
logger.warning(f"SeqRepo raised KeyError: {e}")
_logger.warning(f"SeqRepo raised KeyError: {e}")
return aliases

def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict:
Expand All @@ -248,7 +248,7 @@ def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Di
sequence_id=sequence,
).model_dump() # type: ignore
else:
logger.warning(
_logger.warning(
f"{params['concept_id']} has invalid interval:"
f"start={gene.start - 1} end={gene.end}"
) # type: ignore
Expand Down
9 changes: 5 additions & 4 deletions src/gene/etl/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,7 @@
from gene.etl.base import Base, GeneNormalizerEtlError
from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand

logger = logging.getLogger("gene")
logger.setLevel(logging.DEBUG)
_logger = logging.getLogger(__name__)


class Ensembl(Base):
Expand All @@ -24,16 +23,18 @@ def _extract_data(self, use_existing: bool) -> None:
:param use_existing: if True, don't try to fetch latest source data
"""
_logger.info("Gathering Ensembl data...")
self._data_file, raw_version = self._data_source.get_latest(
from_local=use_existing
)
match = re.match(r"(GRCh\d+)_(\d+)", raw_version)
self._assembly = match.groups()[0]
self._version = match.groups()[1]
_logger.info(f"Acquired data for Ensembl: {self._data_file}")

def _transform_data(self) -> None:
"""Transform the Ensembl source."""
logger.info("Transforming Ensembl...")
_logger.info("Transforming Ensembl data...")
db = gffutils.create_db(
str(self._data_file),
dbfn=":memory:",
Expand All @@ -56,7 +57,7 @@ def _transform_data(self) -> None:
gene = self._add_gene(f, accession_numbers)
if gene:
self._load_gene(gene)
logger.info("Successfully transformed Ensembl.")
_logger.info("Ensembl data transform complete.")

def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
"""Create a transformed gene record.
Expand Down
9 changes: 4 additions & 5 deletions src/gene/etl/hgnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,15 @@
SymbolStatus,
)

logger = logging.getLogger("gene")
logger.setLevel(logging.DEBUG)
_logger = logging.getLogger(__name__)


class HGNC(Base):
"""ETL the HGNC source into the normalized database."""

def _transform_data(self) -> None:
"""Transform the HGNC source."""
logger.info("Transforming HGNC...")
_logger.info("Transforming HGNC data...")
with open(self._data_file, "r") as f: # type: ignore
data = json.load(f)

Expand Down Expand Up @@ -55,7 +54,7 @@ def _transform_data(self) -> None:
if "locus_type" in r:
gene["gene_type"] = r["locus_type"]
self._load_gene(gene)
logger.info("Successfully transformed HGNC.")
_logger.info("HGNC data transform complete.")

def _get_aliases(self, r: Dict, gene: Dict) -> None:
"""Store aliases in a gene record.
Expand Down Expand Up @@ -136,7 +135,7 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
else:
self._get_xref_associated_with(key, src, r, associated_with)
else:
logger.warning(f"{key} not in schemas.py")
_logger.warning(f"{key} not in schemas.py")

if xrefs:
gene["xrefs"] = xrefs
Expand Down
Loading

0 comments on commit 3aea4fc

Please sign in to comment.