Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: overhaul logging #311

Merged
merged 1 commit into from
Dec 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 0 additions & 5 deletions src/gene/cli.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
"""Provides a CLI util to make updates to normalizer database."""
import logging
import os
from pathlib import Path
from typing import Optional, Tuple
Expand All @@ -11,10 +10,6 @@
from gene.etl.update import update_all_sources, update_normalized, update_source
from gene.schemas import SourceName

logger = logging.getLogger("gene")
logger.setLevel(logging.DEBUG)


url_description = 'URL endpoint for the application database. Can either be a URL to a local DynamoDB server (e.g. "http://localhost:8001") or a libpq-compliant PostgreSQL connection description (e.g. "postgresql://postgres:password@localhost:5432/gene_normalizer").'


Expand Down
43 changes: 17 additions & 26 deletions src/gene/database/dynamodb.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
SourceName,
)

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)


class DynamoDbDatabase(AbstractDatabase):
Expand Down Expand Up @@ -165,7 +165,7 @@ def check_schema_initialized(self) -> bool:
existing_tables = self.list_tables()
exists = self.gene_table in existing_tables
if not exists:
logger.info(f"{self.gene_table} table is missing or unavailable.")
_logger.info(f"{self.gene_table} table is missing or unavailable.")
return exists

def check_tables_populated(self) -> bool:
Expand All @@ -182,7 +182,7 @@ def check_tables_populated(self) -> bool:
KeyConditionExpression=Key("item_type").eq("source"),
).get("Items", [])
if len(sources) < len(SourceName):
logger.info("Gene sources table is missing expected sources.")
_logger.info("Gene sources table is missing expected sources.")
return False

records = self.genes.query(
Expand All @@ -191,7 +191,7 @@ def check_tables_populated(self) -> bool:
Limit=1,
)
if len(records.get("Items", [])) < 1:
logger.info("Gene records index is empty.")
_logger.info("Gene records index is empty.")
return False

normalized_records = self.genes.query(
Expand All @@ -200,7 +200,7 @@ def check_tables_populated(self) -> bool:
Limit=1,
)
if len(normalized_records.get("Items", [])) < 1:
logger.info("Normalized gene records index is empty.")
_logger.info("Normalized gene records index is empty.")
return False

return True
Expand Down Expand Up @@ -262,10 +262,8 @@ def get_record_by_id(
del record["label_and_type"]
return record
except ClientError as e:
logger.error(
f"boto3 client error on get_records_by_id for "
f"search term {concept_id}: "
f"{e.response['Error']['Message']}"
_logger.error(
f"boto3 client error on get_records_by_id for search term {concept_id}: {e.response['Error']['Message']}"
)
return None
except (KeyError, IndexError): # record doesn't exist
Expand All @@ -285,10 +283,8 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]:
matches = self.genes.query(KeyConditionExpression=filter_exp)
return [m["concept_id"] for m in matches.get("Items", None)]
except ClientError as e:
logger.error(
f"boto3 client error on get_refs_by_type for "
f"search term {search_term}: "
f"{e.response['Error']['Message']}"
_logger.error(
f"boto3 client error on get_refs_by_type for search term {search_term}: {e.response['Error']['Message']}"
)
return []

Expand Down Expand Up @@ -389,9 +385,8 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
try:
self.batch.put_item(Item=record)
except ClientError as e:
logger.error(
"boto3 client error on add_record for "
f"{concept_id}: {e.response['Error']['Message']}"
_logger.error(
f"boto3 client error on add_record for {concept_id}: {e.response['Error']['Message']}"
)
for attr_type, item_type in ITEM_TYPES.items():
if attr_type in record:
Expand Down Expand Up @@ -421,9 +416,8 @@ def add_merged_record(self, record: Dict) -> None:
try:
self.batch.put_item(Item=record)
except ClientError as e:
logger.error(
"boto3 client error on add_record for "
f"{concept_id}: {e.response['Error']['Message']}"
_logger.error(
f"boto3 client error on add_record for {concept_id}: {e.response['Error']['Message']}"
)

def _add_ref_record(
Expand All @@ -447,10 +441,8 @@ def _add_ref_record(
try:
self.batch.put_item(Item=record)
except ClientError as e:
logger.error(
f"boto3 client error adding reference {term} for "
f"{concept_id} with match type {ref_type}: "
f"{e.response['Error']['Message']}"
_logger.error(
f"boto3 client error adding reference {term} for {concept_id} with match type {ref_type}: {e.response['Error']['Message']}"
)

def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: ANN401
Expand Down Expand Up @@ -479,9 +471,8 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN
f"No such record exists for keys {label_and_type}, {concept_id}"
)
else:
logger.error(
f"boto3 client error in `database.update_record()`: "
f"{e.response['Error']['Message']}"
_logger.error(
f"boto3 client error in `database.update_record()`: {e.response['Error']['Message']}"
)

def delete_normalized_concepts(self) -> None:
Expand Down
22 changes: 11 additions & 11 deletions src/gene/database/postgresql.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
)
from gene.schemas import RecordType, RefType, SourceMeta, SourceName

logger = logging.getLogger(__name__)
_logger = logging.getLogger(__name__)


SCRIPTS_DIR = Path(__file__).parent / "postgresql"
Expand Down Expand Up @@ -124,7 +124,7 @@ def drop_db(self) -> None:
with self.conn.cursor() as cur:
cur.execute(self._drop_db_query)
self.conn.commit()
logger.info("Dropped all existing gene normalizer tables.")
_logger.info("Dropped all existing gene normalizer tables.")

def check_schema_initialized(self) -> bool:
"""Check if database schema is properly initialized.
Expand All @@ -137,7 +137,7 @@ def check_schema_initialized(self) -> bool:
except DuplicateTable:
self.conn.rollback()
else:
logger.info("Gene table existence check failed.")
_logger.info("Gene table existence check failed.")
self.conn.rollback()
return False
try:
Expand All @@ -146,7 +146,7 @@ def check_schema_initialized(self) -> bool:
except DuplicateObject:
self.conn.rollback()
else:
logger.info("Gene foreign key existence check failed.")
_logger.info("Gene foreign key existence check failed.")
self.conn.rollback()
return False
try:
Expand All @@ -157,7 +157,7 @@ def check_schema_initialized(self) -> bool:
except DuplicateTable:
self.conn.rollback()
else:
logger.info("Gene normalized view lookup failed.")
_logger.info("Gene normalized view lookup failed.")
self.conn.rollback()
return False
try:
Expand All @@ -166,7 +166,7 @@ def check_schema_initialized(self) -> bool:
except DuplicateTable:
self.conn.rollback()
else:
logger.info("Gene indexes check failed.")
_logger.info("Gene indexes check failed.")
self.conn.rollback()
return False

Expand All @@ -189,21 +189,21 @@ def check_tables_populated(self) -> bool:
cur.execute(self._check_sources_query)
results = cur.fetchall()
if len(results) < len(SourceName):
logger.info("Gene sources table is missing expected sources.")
_logger.info("Gene sources table is missing expected sources.")
return False

with self.conn.cursor() as cur:
cur.execute(self._check_concepts_query)
result = cur.fetchone()
if not result or result[0] < 1:
logger.info("Gene records table is empty.")
_logger.info("Gene records table is empty.")
return False

with self.conn.cursor() as cur:
cur.execute(self._check_merged_query)
result = cur.fetchone()
if not result or result[0] < 1:
logger.info("Normalized gene records table is empty.")
_logger.info("Normalized gene records table is empty.")
return False

return True
Expand Down Expand Up @@ -265,7 +265,7 @@ def _drop_indexes(self) -> None:

def _create_tables(self) -> None:
"""Create all tables, indexes, and views."""
logger.debug("Creating new gene normalizer tables.")
_logger.info("Creating new gene normalizer tables.")
tables_query = (SCRIPTS_DIR / "create_tables.sql").read_bytes()

with self.conn.cursor() as cur:
Expand Down Expand Up @@ -599,7 +599,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None:
cur.execute(self._ins_symbol_query, [record["symbol"], concept_id])
self.conn.commit()
except UniqueViolation:
logger.error(f"Record with ID {concept_id} already exists")
_logger.error(f"Record with ID {concept_id} already exists")
self.conn.rollback()

_add_merged_record_query = b"""
Expand Down
16 changes: 8 additions & 8 deletions src/gene/etl/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from pathlib import Path
from typing import Dict, List, Optional, Union

import click
import pydantic
from biocommons.seqrepo import SeqRepo
from gffutils.feature import Feature
Expand All @@ -15,8 +14,7 @@
from gene.database import AbstractDatabase
from gene.schemas import ITEM_TYPES, Gene, GeneSequenceLocation, MatchType, SourceName

logger = logging.getLogger("gene")
logger.setLevel(logging.DEBUG)
_logger = logging.getLogger(__name__)


APP_ROOT = Path(__file__).resolve().parent
Expand Down Expand Up @@ -76,11 +74,11 @@ def perform_etl(self, use_existing: bool = False) -> List[str]:
uploaded.
"""
self._extract_data(use_existing)
if not self._silent:
click.echo("Transforming and loading data to DB...")
_logger.info(f"Transforming and loading {self._src_name} data to DB...")
self._add_meta()
self._transform_data()
self._database.complete_write_transaction()
_logger.info(f"Data load complete for {self._src_name}.")
return self._processed_ids

def _extract_data(self, use_existing: bool) -> None:
Expand All @@ -91,9 +89,11 @@ def _extract_data(self, use_existing: bool) -> None:

:param bool use_existing: if True, don't try to fetch latest source data
"""
_logger.info(f"Gathering {self._src_name} data...")
self._data_file, self._version = self._data_source.get_latest(
from_local=use_existing
)
_logger.info(f"Acquired data for {self._src_name}: {self._data_file}")

@abstractmethod
def _transform_data(self) -> None:
Expand All @@ -116,7 +116,7 @@ def _load_gene(self, gene: Dict) -> None:
try:
assert Gene(match_type=MatchType.NO_MATCH, **gene)
except pydantic.ValidationError as e:
logger.warning(f"Unable to load {gene} due to validation error: " f"{e}")
_logger.warning(f"Unable to load {gene} due to validation error: " f"{e}")
else:
concept_id = gene["concept_id"]
gene["label_and_type"] = f"{concept_id.lower()}##identity"
Expand Down Expand Up @@ -217,7 +217,7 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]:
try:
aliases = self.seqrepo.translate_alias(seq_id, target_namespaces="ga4gh")
except KeyError as e:
logger.warning(f"SeqRepo raised KeyError: {e}")
_logger.warning(f"SeqRepo raised KeyError: {e}")
return aliases

def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict:
Expand All @@ -244,7 +244,7 @@ def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Di
sequence_id=sequence,
).model_dump() # type: ignore
else:
logger.warning(
_logger.warning(
f"{params['concept_id']} has invalid interval:"
f"start={gene.start - 1} end={gene.end}"
) # type: ignore
Expand Down
9 changes: 5 additions & 4 deletions src/gene/etl/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@
)
from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand

logger = logging.getLogger("gene")
logger.setLevel(logging.DEBUG)
_logger = logging.getLogger(__name__)


class Ensembl(Base):
Expand All @@ -27,16 +26,18 @@ def _extract_data(self, use_existing: bool) -> None:

:param use_existing: if True, don't try to fetch latest source data
"""
_logger.info("Gathering Ensembl data...")
self._data_file, raw_version = self._data_source.get_latest(
from_local=use_existing
)
match = re.match(r"(GRCh\d+)_(\d+)", raw_version)
self._assembly = match.groups()[0]
self._version = match.groups()[1]
_logger.info(f"Acquired data for Ensembl: {self._data_file}")

def _transform_data(self) -> None:
"""Transform the Ensembl source."""
logger.info("Transforming Ensembl...")
_logger.info("Transforming Ensembl data...")
db = gffutils.create_db(
str(self._data_file),
dbfn=":memory:",
Expand All @@ -59,7 +60,7 @@ def _transform_data(self) -> None:
gene = self._add_gene(f, accession_numbers)
if gene:
self._load_gene(gene)
logger.info("Successfully transformed Ensembl.")
_logger.info("Ensembl data transform complete.")

def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict:
"""Create a transformed gene record.
Expand Down
9 changes: 4 additions & 5 deletions src/gene/etl/hgnc.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,15 @@
SymbolStatus,
)

logger = logging.getLogger("gene")
logger.setLevel(logging.DEBUG)
_logger = logging.getLogger(__name__)


class HGNC(Base):
"""ETL the HGNC source into the normalized database."""

def _transform_data(self) -> None:
"""Transform the HGNC source."""
logger.info("Transforming HGNC...")
_logger.info("Transforming HGNC data...")
with open(self._data_file, "r") as f: # type: ignore
data = json.load(f)

Expand Down Expand Up @@ -58,7 +57,7 @@ def _transform_data(self) -> None:
if "locus_type" in r:
gene["gene_type"] = r["locus_type"]
self._load_gene(gene)
logger.info("Successfully transformed HGNC.")
_logger.info("HGNC data transform complete.")

def _get_aliases(self, r: Dict, gene: Dict) -> None:
"""Store aliases in a gene record.
Expand Down Expand Up @@ -139,7 +138,7 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None:
else:
self._get_xref_associated_with(key, src, r, associated_with)
else:
logger.warning(f"{key} not in schemas.py")
_logger.warning(f"{key} not in schemas.py")

if xrefs:
gene["xrefs"] = xrefs
Expand Down
Loading