Skip to content

Commit

Permalink
Merge pull request #135 from cancervariants/staging
Browse files Browse the repository at this point in the history
Staging
  • Loading branch information
korikuzma authored Aug 24, 2022
2 parents b594e15 + fe9cef6 commit c4cf02a
Show file tree
Hide file tree
Showing 24 changed files with 676 additions and 897 deletions.
4 changes: 2 additions & 2 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@ beautifulsoup4 = "*"
gffutils = "*"
requests = "*"
"biocommons.seqrepo" = "*"
"ga4gh.vrs" = {version = ">=0.7.5.dev1", extras = ["extras"]}
"ga4gh.vrsatile.pydantic" = ">=0.0.10"
"ga4gh.vrs" = {version = ">=0.8.6dev0", extras = ["extras"]}
"ga4gh.vrsatile.pydantic" = ">=0.1.dev0"

[dev-packages]
gene = {editable = true, path = "."}
Expand Down
4 changes: 2 additions & 2 deletions gene/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
logging.getLogger("botocore").setLevel(logging.INFO)
logging.getLogger("urllib3").setLevel(logging.INFO)
logging.getLogger("python_jsonschema_objects").setLevel(logging.INFO)
logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO) # noqa: E501
logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO) # noqa: E501
logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO)
logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO)

if "GENE_NORM_EB_PROD" in environ:
ch = logging.StreamHandler()
Expand Down
7 changes: 6 additions & 1 deletion gene/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from gene.schemas import SourceName
from gene.etl.merge import Merge
from timeit import default_timer as timer
from gene.database import Database
from gene.database import Database, confirm_aws_db_use
from boto3.dynamodb.conditions import Key
from os import environ
import logging
Expand Down Expand Up @@ -45,6 +45,11 @@ class CLI:
def update_normalizer_db(normalizer, prod, db_url, update_all,
update_merged):
"""Update selected normalizer source(s) in the gene database."""
# GENE_NORM_EB_PROD is sometimes set by accident, so ask the user to confirm
# before the CLI uses the AWS production database.
if "GENE_NORM_EB_PROD" in environ:
confirm_aws_db_use("PROD")

if prod:
environ['GENE_NORM_PROD'] = "TRUE"
db: Database = Database()
Expand Down
44 changes: 33 additions & 11 deletions gene/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,16 @@
logger.setLevel(logging.DEBUG)


def confirm_aws_db_use(env_name: str) -> None:
    """Ask the user to confirm that the AWS database should be used.

    Prompts interactively with a default answer of "no". On confirmation a
    notice naming the environment is echoed; otherwise an exit message is
    echoed and the process is terminated via ``sys.exit()``.

    :param str env_name: label of the AWS environment shown to the user
        (e.g. "PROD" or "NONPROD")
    """
    prompt = f"Are you sure you want to use the AWS {env_name} database?"

    # Guard clause: anything other than an explicit "yes" stops the program.
    if not click.confirm(prompt, default=False):
        click.echo("Exiting.")
        sys.exit()

    click.echo(f"***GENE {env_name.upper()} DATABASE IN USE***")


class Database:
"""The database class."""

Expand All @@ -23,19 +33,30 @@ def __init__(self, db_url: str = '', region_name: str = 'us-east-2'):
:param str db_url: URL endpoint for DynamoDB source
:param str region_name: default AWS region
"""
gene_concepts_table = "gene_concepts" # default
gene_metadata_table = "gene_metadata" # default

if 'GENE_NORM_PROD' in environ or 'GENE_NORM_EB_PROD' in environ:
boto_params = {
'region_name': region_name
}
if 'GENE_NORM_EB_PROD' not in environ:
# EB Instance should not have to confirm.
# This is used only for updating production via CLI
if click.confirm("Are you sure you want to use the "
"production database?", default=False):
click.echo("***GENE PRODUCTION DATABASE IN USE***")
else:
click.echo("Exiting.")
sys.exit()
confirm_aws_db_use("PROD")
elif "GENE_NORM_NONPROD" in environ:
# These are the nonprod tables. They are only to be used for creating backups
# which prod will restore. They will need to be manually deleted / created
# on an as-needed basis.
gene_concepts_table = "gene_concepts_nonprod"
gene_metadata_table = "gene_metadata_nonprod"

boto_params = {
"region_name": region_name
}

# This is used only for updating nonprod via CLI
confirm_aws_db_use("NONPROD")
else:
if db_url:
endpoint_url = db_url
Expand All @@ -52,13 +73,14 @@ def __init__(self, db_url: str = '', region_name: str = 'us-east-2'):
self.dynamodb = boto3.resource('dynamodb', **boto_params)
self.dynamodb_client = boto3.client('dynamodb', **boto_params)

# Create tables if nonexistent if not connecting to production database
if 'GENE_NORM_PROD' not in environ and\
'GENE_NORM_EB_PROD' not in environ and 'TEST' not in environ:
# Only create tables for local instance
envs_do_not_create_tables = {"GENE_NORM_PROD", "GENE_NORM_EB_PROD",
"GENE_NORM_NONPROD", "TEST"}
if not set(envs_do_not_create_tables) & set(environ):
self.create_db_tables()

self.genes = self.dynamodb.Table('gene_concepts')
self.metadata = self.dynamodb.Table('gene_metadata')
self.genes = self.dynamodb.Table(gene_concepts_table)
self.metadata = self.dynamodb.Table(gene_metadata_table)
self.batch = self.genes.batch_writer()
self.cached_sources = {}

Expand Down
8 changes: 4 additions & 4 deletions gene/etl/ensembl.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,7 @@ def _add_attributes(self, f, gene):
src_id = val.split("Acc:")[-1].split("]")[0]
if ":" in src_id:
src_id = src_id.split(":")[-1]
source = self._get_xref_associated_with(src_name, src_id) # noqa: E501
source = self._get_xref_associated_with(src_name, src_id)
if "xrefs" in source:
gene["xrefs"] = source["xrefs"]
elif "associated_with" in source:
Expand Down Expand Up @@ -199,11 +199,11 @@ def _get_xref_associated_with(self, src_name, src_id):
source["xrefs"] = \
[f"{NamespacePrefix.NCBI.value}:{src_id}"]
elif src_name.startswith("UniProt"):
source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"] # noqa: E501
source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"]
elif src_name.startswith("miRBase"):
source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"] # noqa: E501
source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"]
elif src_name.startswith("RFAM"):
source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"] # noqa: E501
source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"]
return source

def perform_etl(self, *args, **kwargs):
Expand Down
11 changes: 5 additions & 6 deletions gene/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
ServiceMeta, SourcePriority, NormalizeService, SearchService, \
GeneTypeFieldName, UnmergedNormalizationService, MatchesNormalized, \
BaseNormalizationService
from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor, Extension
from ga4gh.vrsatile.pydantic.core_models import Extension
from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor
from botocore.exceptions import ClientError
from boto3.dynamodb.conditions import Key
from datetime import datetime
Expand Down Expand Up @@ -87,11 +88,9 @@ def _cast_location_ints(record: Dict) -> Dict:
"""
if 'locations' in record:
for loc in record['locations']:
if loc['interval']['type'] == "SequenceInterval":
loc['interval']['start']['value'] = \
int(loc['interval']['start']['value'])
loc['interval']['end']['value'] = \
int(loc['interval']['end']['value'])
if loc['type'] == 'SequenceLocation':
loc['start']['value'] = int(loc['start']['value'])
loc['end']['value'] = int(loc['end']['value'])
return record

def add_record(self,
Expand Down
82 changes: 32 additions & 50 deletions gene/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from pydantic import BaseModel, StrictBool, validator
from enum import Enum, IntEnum
from ga4gh.vrsatile.pydantic import return_value
from ga4gh.vrsatile.pydantic.vrs_models import SequenceLocation, \
ChromosomeLocation, CURIE
from ga4gh.vrsatile.pydantic.core_models import CURIE
from ga4gh.vrsatile.pydantic.vrs_models import SequenceLocation, ChromosomeLocation
from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor
from pydantic.types import StrictStr

Expand Down Expand Up @@ -222,7 +222,7 @@ def schema_extra(schema: Dict[str, Any],
prop.pop('title', None)
schema['example'] = {
"data_license": "custom",
"data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501
"data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/",
"version": "20201215",
"data_url": "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/",
"rdp_url": "https://reusabledata.org/ncbi-gene.html",
Expand Down Expand Up @@ -408,7 +408,7 @@ def schema_extra(schema: Dict[str, Any],
'name': 'gene-normalizer',
'version': '0.1.0',
'response_datetime': '2022-03-23 15:57:14.180908',
'url': 'https://github.com/cancervariants/gene-normalization' # noqa: E501
'url': 'https://github.com/cancervariants/gene-normalization'
}
}

Expand Down Expand Up @@ -458,10 +458,7 @@ def schema_extra(schema: Dict[str, Any],
"gene_descriptor": {
"id": "normalize.gene:BRAF",
"type": "GeneDescriptor",
"gene": {
"gene_id": "hgnc:1097",
"type": "Gene"
},
"gene_id": "hgnc:1097",
"label": "BRAF",
"xrefs": [
"ncbigene:673",
Expand All @@ -477,7 +474,7 @@ def schema_extra(schema: Dict[str, Any],
"extensions": [
{
"name": "approved_name",
"value": "B-Raf proto-oncogene, serine/threonine kinase", # noqa: E501
"value": "B-Raf proto-oncogene, serine/threonine kinase",
"type": "Extension"
},
{
Expand Down Expand Up @@ -507,15 +504,12 @@ def schema_extra(schema: Dict[str, Any],
{
"name": "chromosome_location",
"value": {
"_id": "ga4gh:VCL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", # noqa: E501
"id": "ga4gh:CL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", # noqa: E501
"type": "ChromosomeLocation",
"species_id": "taxonomy:9606",
"chr": "7",
"interval": {
"end": "q34",
"start": "q34",
"type": "CytobandInterval"
}
"end": "q34",
"start": "q34",
},
"type": "Extension"
}
Expand Down Expand Up @@ -570,7 +564,7 @@ def schema_extra(schema: Dict[str, Any],
'name': 'gene-normalizer',
'version': '0.1.19',
'response_datetime': '2022-03-23 15:57:14.180908',
'url': 'https://github.com/cancervariants/gene-normalization' # noqa: E501
'url': 'https://github.com/cancervariants/gene-normalization'
}
}

Expand Down Expand Up @@ -638,14 +632,11 @@ def schema_extra(schema: Dict[str, Any],
"locations": [
{
"type": "ChromosomeLocation",
"_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501
"id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501
"species_id": "taxonomy:9606",
"chr": "7",
"interval": {
"type": "CytobandInterval",
"start": "q22.1",
"end": "q22.1"
}
"start": "q22.1",
"end": "q22.1"
}
],
"aliases": [
Expand Down Expand Up @@ -699,19 +690,16 @@ def schema_extra(schema: Dict[str, Any],
"location_annotations": [],
"locations": [
{
"_id": "ga4gh:VSL.AF6wPZclBqTauGr3yx_CqmMndLKhq0Cm", # noqa: E501
"id": "ga4gh:SL.AF6wPZclBqTauGr3yx_CqmMndLKhq0Cm", # noqa: E501
"type": "SequenceLocation",
"sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501
"interval": {
"type": "SequenceInterval",
"start": {
"type": "Number",
"value": 100889993
},
"end": {
"type": "Number",
"value": 100896974
}
"start": {
"type": "Number",
"value": 100889993
},
"end": {
"type": "Number",
"value": 100896974
}
}
],
Expand Down Expand Up @@ -752,29 +740,23 @@ def schema_extra(schema: Dict[str, Any],
"locations": [
{
"type": "ChromosomeLocation",
"_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501
"id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501
"species_id": "taxonomy:9606",
"chr": "7",
"interval": {
"type": "CytobandInterval",
"start": "q22.1",
"end": "q22.1"
}
"start": "q22.1",
"end": "q22.1"
},
{
"_id": "ga4gh:VSL.EepkXho2doYcUT1DW54fT1a00_zkqrn0", # noqa: E501
"id": "ga4gh:SL.EepkXho2doYcUT1DW54fT1a00_zkqrn0", # noqa: E501
"type": "SequenceLocation",
"sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501
"interval": {
"type": "SequenceInterval",
"start": {
"type": "Number",
"value": 100889993
},
"end": {
"type": "Number",
"value": 100896994
}
"start": {
"type": "Number",
"value": 100889993
},
"end": {
"type": "Number",
"value": 100896994
}
}
],
Expand Down
2 changes: 1 addition & 1 deletion gene/version.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
"""Gene normalizer version"""
__version__ = "0.1.27"
__version__ = "0.2.0"
22 changes: 7 additions & 15 deletions gene/vrs_locations/chromosome_location.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,11 @@ def add_location(self, location):
chr_location = models.ChromosomeLocation(
species_id="taxonomy:9606",
chr=location['chr'],
interval=models.CytobandInterval(
start=location['start'],
end=location['end'],
type="CytobandInterval"
),
start=location['start'],
end=location['end'],
type="ChromosomeLocation"
)
chr_location._id = ga4gh_identify(chr_location)
chr_location.id = ga4gh_identify(chr_location)
return chr_location.as_dict()

def get_location(self, location, gene):
Expand All @@ -40,21 +37,16 @@ def get_location(self, location, gene):
dictionary containing the ChromosomeLocation.
Else, return None.
"""
if 'chr' in location and 'start' in location \
and 'end' in location:
if 'chr' in location and 'start' in location and 'end' in location:
if location['start'] == 'p' and location['end'] == 'p':
location['start'] = 'pter'
location['end'] = 'cen'
elif location['start'] == 'q' and \
location['end'] == 'q':
elif location['start'] == 'q' and location['end'] == 'q':
location['start'] = 'cen'
location['end'] = 'qter'
try:
chr_location = \
self.add_location(
location)
except python_jsonschema_objects.validators. \
ValidationError as e:
chr_location = self.add_location(location)
except python_jsonschema_objects.validators.ValidationError as e:
logger.info(f"{e} for {gene['symbol']}")
else:
return chr_location
Expand Down
Loading

0 comments on commit c4cf02a

Please sign in to comment.