Merge pull request #135 from cancervariants/staging

Staging
cancervariants · Aug 24, 2022 · c4cf02a
2 parents b594e15 + fe9cef6
commit c4cf02a
Show file tree

Hide file tree

Showing 24 changed files with 676 additions and 897 deletions.
diff --git a/Pipfile b/Pipfile
@@ -13,8 +13,8 @@ beautifulsoup4 = "*"
 gffutils = "*"
 requests = "*"
 "biocommons.seqrepo" = "*"
-"ga4gh.vrs" = {version = ">=0.7.5.dev1", extras = ["extras"]}
-"ga4gh.vrsatile.pydantic" = ">=0.0.10"
+"ga4gh.vrs" = {version = ">=0.8.6dev0", extras = ["extras"]}
+"ga4gh.vrsatile.pydantic" = ">=0.1.dev0"
 
 [dev-packages]
 gene = {editable = true, path = "."}

diff --git a/gene/__init__.py b/gene/__init__.py
@@ -22,8 +22,8 @@
 logging.getLogger("botocore").setLevel(logging.INFO)
 logging.getLogger("urllib3").setLevel(logging.INFO)
 logging.getLogger("python_jsonschema_objects").setLevel(logging.INFO)
-logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO)  # noqa: E501
-logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO)  # noqa: E501
+logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO)
+logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO)
 
 if "GENE_NORM_EB_PROD" in environ:
     ch = logging.StreamHandler()

diff --git a/gene/cli.py b/gene/cli.py
@@ -5,7 +5,7 @@
 from gene.schemas import SourceName
 from gene.etl.merge import Merge
 from timeit import default_timer as timer
-from gene.database import Database
+from gene.database import Database, confirm_aws_db_use
 from boto3.dynamodb.conditions import Key
 from os import environ
 import logging
@@ -45,6 +45,11 @@ class CLI:
     def update_normalizer_db(normalizer, prod, db_url, update_all,
                              update_merged):
         """Update selected normalizer source(s) in the gene database."""
+        # Sometimes GENE_NORM_EB_PROD is accidentally set. We should verify that
+        # it should actually be used in CLI
+        if "GENE_NORM_EB_PROD" in environ:
+            confirm_aws_db_use("PROD")
+
         if prod:
             environ['GENE_NORM_PROD'] = "TRUE"
             db: Database = Database()

diff --git a/gene/database.py b/gene/database.py
@@ -14,6 +14,16 @@
 logger.setLevel(logging.DEBUG)
 
 
+def confirm_aws_db_use(env_name: str) -> None:
+    """Check to ensure that AWS instance should actually be used."""
+    if click.confirm(f"Are you sure you want to use the AWS {env_name} database?",
+                     default=False):
+        click.echo(f"***GENE {env_name.upper()} DATABASE IN USE***")
+    else:
+        click.echo("Exiting.")
+        sys.exit()
+
+
 class Database:
     """The database class."""
 
@@ -23,19 +33,30 @@ def __init__(self, db_url: str = '', region_name: str = 'us-east-2'):
         :param str db_url: URL endpoint for DynamoDB source
         :param str region_name: default AWS region
         """
+        gene_concepts_table = "gene_concepts"  # default
+        gene_metadata_table = "gene_metadata"  # default
+
         if 'GENE_NORM_PROD' in environ or 'GENE_NORM_EB_PROD' in environ:
             boto_params = {
                 'region_name': region_name
             }
             if 'GENE_NORM_EB_PROD' not in environ:
                 # EB Instance should not have to confirm.
                 # This is used only for updating production via CLI
-                if click.confirm("Are you sure you want to use the "
-                                 "production database?", default=False):
-                    click.echo("***GENE PRODUCTION DATABASE IN USE***")
-                else:
-                    click.echo("Exiting.")
-                    sys.exit()
+                confirm_aws_db_use("PROD")
+        elif "GENE_NORM_NONPROD" in environ:
+            # This is a nonprod table. Only to be used for creating backups which
+            # prod will restore. Will need to manually delete / create this table
+            # on an as needed basis.
+            gene_concepts_table = "gene_concepts_nonprod"
+            gene_metadata_table = "gene_metadata_nonprod"
+
+            boto_params = {
+                "region_name": region_name
+            }
+
+            # This is used only for updating nonprod via CLI
+            confirm_aws_db_use("NONPROD")
         else:
             if db_url:
                 endpoint_url = db_url
@@ -52,13 +73,14 @@ def __init__(self, db_url: str = '', region_name: str = 'us-east-2'):
         self.dynamodb = boto3.resource('dynamodb', **boto_params)
         self.dynamodb_client = boto3.client('dynamodb', **boto_params)
 
-        # Create tables if nonexistent if not connecting to production database
-        if 'GENE_NORM_PROD' not in environ and\
-                'GENE_NORM_EB_PROD' not in environ and 'TEST' not in environ:
+        # Only create tables for local instance
+        envs_do_not_create_tables = {"GENE_NORM_PROD", "GENE_NORM_EB_PROD",
+                                     "GENE_NORM_NONPROD", "TEST"}
+        if not set(envs_do_not_create_tables) & set(environ):
             self.create_db_tables()
 
-        self.genes = self.dynamodb.Table('gene_concepts')
-        self.metadata = self.dynamodb.Table('gene_metadata')
+        self.genes = self.dynamodb.Table(gene_concepts_table)
+        self.metadata = self.dynamodb.Table(gene_metadata_table)
         self.batch = self.genes.batch_writer()
         self.cached_sources = {}
 

diff --git a/gene/etl/ensembl.py b/gene/etl/ensembl.py
@@ -162,7 +162,7 @@ def _add_attributes(self, f, gene):
                         src_id = val.split("Acc:")[-1].split("]")[0]
                         if ":" in src_id:
                             src_id = src_id.split(":")[-1]
-                        source = self._get_xref_associated_with(src_name, src_id)  # noqa: E501
+                        source = self._get_xref_associated_with(src_name, src_id)
                         if "xrefs" in source:
                             gene["xrefs"] = source["xrefs"]
                         elif "associated_with" in source:
@@ -199,11 +199,11 @@ def _get_xref_associated_with(self, src_name, src_id):
             source["xrefs"] = \
                 [f"{NamespacePrefix.NCBI.value}:{src_id}"]
         elif src_name.startswith("UniProt"):
-            source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"]  # noqa: E501
+            source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"]
         elif src_name.startswith("miRBase"):
-            source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"]  # noqa: E501
+            source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"]
         elif src_name.startswith("RFAM"):
-            source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"]  # noqa: E501
+            source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"]
         return source
 
     def perform_etl(self, *args, **kwargs):

diff --git a/gene/query.py b/gene/query.py
@@ -9,7 +9,8 @@
     ServiceMeta, SourcePriority, NormalizeService, SearchService, \
     GeneTypeFieldName, UnmergedNormalizationService, MatchesNormalized, \
     BaseNormalizationService
-from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor, Extension
+from ga4gh.vrsatile.pydantic.core_models import Extension
+from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor
 from botocore.exceptions import ClientError
 from boto3.dynamodb.conditions import Key
 from datetime import datetime
@@ -87,11 +88,9 @@ def _cast_location_ints(record: Dict) -> Dict:
         """
         if 'locations' in record:
             for loc in record['locations']:
-                if loc['interval']['type'] == "SequenceInterval":
-                    loc['interval']['start']['value'] = \
-                        int(loc['interval']['start']['value'])
-                    loc['interval']['end']['value'] = \
-                        int(loc['interval']['end']['value'])
+                if loc['type'] == 'SequenceLocation':
+                    loc['start']['value'] = int(loc['start']['value'])
+                    loc['end']['value'] = int(loc['end']['value'])
         return record
 
     def add_record(self,

diff --git a/gene/schemas.py b/gene/schemas.py
@@ -5,8 +5,8 @@
 from pydantic import BaseModel, StrictBool, validator
 from enum import Enum, IntEnum
 from ga4gh.vrsatile.pydantic import return_value
-from ga4gh.vrsatile.pydantic.vrs_models import SequenceLocation, \
-    ChromosomeLocation, CURIE
+from ga4gh.vrsatile.pydantic.core_models import CURIE
+from ga4gh.vrsatile.pydantic.vrs_models import SequenceLocation, ChromosomeLocation
 from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor
 from pydantic.types import StrictStr
 
@@ -222,7 +222,7 @@ def schema_extra(schema: Dict[str, Any],
                 prop.pop('title', None)
             schema['example'] = {
                 "data_license": "custom",
-                "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/",  # noqa: E501
+                "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/",
                 "version": "20201215",
                 "data_url": "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/",
                 "rdp_url": "https://reusabledata.org/ncbi-gene.html",
@@ -408,7 +408,7 @@ def schema_extra(schema: Dict[str, Any],
                     'name': 'gene-normalizer',
                     'version': '0.1.0',
                     'response_datetime': '2022-03-23 15:57:14.180908',
-                    'url': 'https://github.com/cancervariants/gene-normalization'  # noqa: E501
+                    'url': 'https://github.com/cancervariants/gene-normalization'
                 }
             }
 
@@ -458,10 +458,7 @@ def schema_extra(schema: Dict[str, Any],
                 "gene_descriptor": {
                     "id": "normalize.gene:BRAF",
                     "type": "GeneDescriptor",
-                    "gene": {
-                        "gene_id": "hgnc:1097",
-                        "type": "Gene"
-                    },
+                    "gene_id": "hgnc:1097",
                     "label": "BRAF",
                     "xrefs": [
                         "ncbigene:673",
@@ -477,7 +474,7 @@ def schema_extra(schema: Dict[str, Any],
                     "extensions": [
                         {
                             "name": "approved_name",
-                            "value": "B-Raf proto-oncogene, serine/threonine kinase",  # noqa: E501
+                            "value": "B-Raf proto-oncogene, serine/threonine kinase",
                             "type": "Extension"
                         },
                         {
@@ -507,15 +504,12 @@ def schema_extra(schema: Dict[str, Any],
                         {
                             "name": "chromosome_location",
                             "value": {
-                                "_id": "ga4gh:VCL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw",  # noqa: E501
+                                "id": "ga4gh:CL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw",  # noqa: E501
                                 "type": "ChromosomeLocation",
                                 "species_id": "taxonomy:9606",
                                 "chr": "7",
-                                "interval": {
-                                    "end": "q34",
-                                    "start": "q34",
-                                    "type": "CytobandInterval"
-                                }
+                                "end": "q34",
+                                "start": "q34",
                             },
                             "type": "Extension"
                         }
@@ -570,7 +564,7 @@ def schema_extra(schema: Dict[str, Any],
                     'name': 'gene-normalizer',
                     'version': '0.1.19',
                     'response_datetime': '2022-03-23 15:57:14.180908',
-                    'url': 'https://github.com/cancervariants/gene-normalization'  # noqa: E501
+                    'url': 'https://github.com/cancervariants/gene-normalization'
                 }
             }
 
@@ -638,14 +632,11 @@ def schema_extra(schema: Dict[str, Any],
                                 "locations": [
                                     {
                                         "type": "ChromosomeLocation",
-                                        "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD",  # noqa: E501
+                                        "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD",  # noqa: E501
                                         "species_id": "taxonomy:9606",
                                         "chr": "7",
-                                        "interval": {
-                                            "type": "CytobandInterval",
-                                            "start": "q22.1",
-                                            "end": "q22.1"
-                                        }
+                                        "start": "q22.1",
+                                        "end": "q22.1"
                                     }
                                 ],
                                 "aliases": [
@@ -699,19 +690,16 @@ def schema_extra(schema: Dict[str, Any],
                                 "location_annotations": [],
                                 "locations": [
                                     {
-                                        "_id": "ga4gh:VSL.AF6wPZclBqTauGr3yx_CqmMndLKhq0Cm",  # noqa: E501
+                                        "id": "ga4gh:SL.AF6wPZclBqTauGr3yx_CqmMndLKhq0Cm",  # noqa: E501
                                         "type": "SequenceLocation",
                                         "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",  # noqa: E501
-                                        "interval": {
-                                            "type": "SequenceInterval",
-                                            "start": {
-                                                "type": "Number",
-                                                "value": 100889993
-                                            },
-                                            "end": {
-                                                "type": "Number",
-                                                "value": 100896974
-                                            }
+                                        "start": {
+                                            "type": "Number",
+                                            "value": 100889993
+                                        },
+                                        "end": {
+                                            "type": "Number",
+                                            "value": 100896974
                                         }
                                     }
                                 ],
@@ -752,29 +740,23 @@ def schema_extra(schema: Dict[str, Any],
                                 "locations": [
                                     {
                                         "type": "ChromosomeLocation",
-                                        "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD",  # noqa: E501
+                                        "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD",  # noqa: E501
                                         "species_id": "taxonomy:9606",
                                         "chr": "7",
-                                        "interval": {
-                                            "type": "CytobandInterval",
-                                            "start": "q22.1",
-                                            "end": "q22.1"
-                                        }
+                                        "start": "q22.1",
+                                        "end": "q22.1"
                                     },
                                     {
-                                        "_id": "ga4gh:VSL.EepkXho2doYcUT1DW54fT1a00_zkqrn0",  # noqa: E501
+                                        "id": "ga4gh:SL.EepkXho2doYcUT1DW54fT1a00_zkqrn0",  # noqa: E501
                                         "type": "SequenceLocation",
                                         "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul",  # noqa: E501
-                                        "interval": {
-                                            "type": "SequenceInterval",
-                                            "start": {
-                                                "type": "Number",
-                                                "value": 100889993
-                                            },
-                                            "end": {
-                                                "type": "Number",
-                                                "value": 100896994
-                                            }
+                                        "start": {
+                                            "type": "Number",
+                                            "value": 100889993
+                                        },
+                                        "end": {
+                                            "type": "Number",
+                                            "value": 100896994
                                         }
                                     }
                                 ],

diff --git a/gene/version.py b/gene/version.py
@@ -1,2 +1,2 @@
 """Gene normalizer version"""
-__version__ = "0.1.27"
+__version__ = "0.2.0"
diff --git a/gene/vrs_locations/chromosome_location.py b/gene/vrs_locations/chromosome_location.py
@@ -21,14 +21,11 @@ def add_location(self, location):
         chr_location = models.ChromosomeLocation(
             species_id="taxonomy:9606",
             chr=location['chr'],
-            interval=models.CytobandInterval(
-                start=location['start'],
-                end=location['end'],
-                type="CytobandInterval"
-            ),
+            start=location['start'],
+            end=location['end'],
             type="ChromosomeLocation"
         )
-        chr_location._id = ga4gh_identify(chr_location)
+        chr_location.id = ga4gh_identify(chr_location)
         return chr_location.as_dict()
 
     def get_location(self, location, gene):
@@ -40,21 +37,16 @@ def get_location(self, location, gene):
          dictionary containing the ChromosomeLocation.
          Else, return None.
         """
-        if 'chr' in location and 'start' in location \
-                and 'end' in location:
+        if 'chr' in location and 'start' in location and 'end' in location:
             if location['start'] == 'p' and location['end'] == 'p':
                 location['start'] = 'pter'
                 location['end'] = 'cen'
-            elif location['start'] == 'q' and \
-                    location['end'] == 'q':
+            elif location['start'] == 'q' and location['end'] == 'q':
                 location['start'] = 'cen'
                 location['end'] = 'qter'
             try:
-                chr_location = \
-                    self.add_location(
-                        location)
-            except python_jsonschema_objects.validators. \
-                    ValidationError as e:
+                chr_location = self.add_location(location)
+            except python_jsonschema_objects.validators.ValidationError as e:
                 logger.info(f"{e} for {gene['symbol']}")
             else:
                 return chr_location