diff --git a/Pipfile b/Pipfile index 829cc80e..f0ed2d5d 100644 --- a/Pipfile +++ b/Pipfile @@ -13,8 +13,8 @@ beautifulsoup4 = "*" gffutils = "*" requests = "*" "biocommons.seqrepo" = "*" -"ga4gh.vrs" = {version = ">=0.7.5.dev1", extras = ["extras"]} -"ga4gh.vrsatile.pydantic" = ">=0.0.10" +"ga4gh.vrs" = {version = ">=0.8.6dev0", extras = ["extras"]} +"ga4gh.vrsatile.pydantic" = ">=0.1.dev0" [dev-packages] gene = {editable = true, path = "."} diff --git a/gene/__init__.py b/gene/__init__.py index d4100b89..3146f381 100644 --- a/gene/__init__.py +++ b/gene/__init__.py @@ -22,8 +22,8 @@ logging.getLogger("botocore").setLevel(logging.INFO) logging.getLogger("urllib3").setLevel(logging.INFO) logging.getLogger("python_jsonschema_objects").setLevel(logging.INFO) -logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO) # noqa: E501 -logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO) # noqa: E501 +logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO) +logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO) if "GENE_NORM_EB_PROD" in environ: ch = logging.StreamHandler() diff --git a/gene/cli.py b/gene/cli.py index 47e06ce2..64ae37e3 100644 --- a/gene/cli.py +++ b/gene/cli.py @@ -5,7 +5,7 @@ from gene.schemas import SourceName from gene.etl.merge import Merge from timeit import default_timer as timer -from gene.database import Database +from gene.database import Database, confirm_aws_db_use from boto3.dynamodb.conditions import Key from os import environ import logging @@ -45,6 +45,11 @@ class CLI: def update_normalizer_db(normalizer, prod, db_url, update_all, update_merged): """Update selected normalizer source(s) in the gene database.""" + # Sometimes GENE_NORM_EB_PROD is accidentally set. We should verify that + # it should actually be used in CLI + if "GENE_NORM_EB_PROD" in environ: + confirm_aws_db_use("PROD") + if prod: environ['GENE_NORM_PROD'] = "TRUE" db: Database = Database() diff --git a/gene/database.py b/gene/database.py index c1efdac6..d7085dab 100644 --- a/gene/database.py +++ b/gene/database.py @@ -14,6 +14,16 @@ logger.setLevel(logging.DEBUG) +def confirm_aws_db_use(env_name: str) -> None: + """Check to ensure that AWS instance should actually be used.""" + if click.confirm(f"Are you sure you want to use the AWS {env_name} database?", + default=False): + click.echo(f"***GENE {env_name.upper()} DATABASE IN USE***") + else: + click.echo("Exiting.") + sys.exit() + + class Database: """The database class.""" @@ -23,6 +33,9 @@ def __init__(self, db_url: str = '', region_name: str = 'us-east-2'): :param str db_url: URL endpoint for DynamoDB source :param str region_name: default AWS region """ + gene_concepts_table = "gene_concepts" # default + gene_metadata_table = "gene_metadata" # default + if 'GENE_NORM_PROD' in environ or 'GENE_NORM_EB_PROD' in environ: boto_params = { 'region_name': region_name @@ -30,12 +43,20 @@ def __init__(self, db_url: str = '', region_name: str = 'us-east-2'): if 'GENE_NORM_EB_PROD' not in environ: # EB Instance should not have to confirm. # This is used only for updating production via CLI - if click.confirm("Are you sure you want to use the " - "production database?", default=False): - click.echo("***GENE PRODUCTION DATABASE IN USE***") - else: - click.echo("Exiting.") - sys.exit() + confirm_aws_db_use("PROD") + elif "GENE_NORM_NONPROD" in environ: + # This is a nonprod table. Only to be used for creating backups which + # prod will restore. Will need to manually delete / create this table + # on an as needed basis. + gene_concepts_table = "gene_concepts_nonprod" + gene_metadata_table = "gene_metadata_nonprod" + + boto_params = { + "region_name": region_name + } + + # This is used only for updating nonprod via CLI + confirm_aws_db_use("NONPROD") else: if db_url: endpoint_url = db_url @@ -52,13 +73,14 @@ def __init__(self, db_url: str = '', region_name: str = 'us-east-2'): self.dynamodb = boto3.resource('dynamodb', **boto_params) self.dynamodb_client = boto3.client('dynamodb', **boto_params) - # Create tables if nonexistent if not connecting to production database - if 'GENE_NORM_PROD' not in environ and\ - 'GENE_NORM_EB_PROD' not in environ and 'TEST' not in environ: + # Only create tables for local instance + envs_do_not_create_tables = {"GENE_NORM_PROD", "GENE_NORM_EB_PROD", + "GENE_NORM_NONPROD", "TEST"} + if not set(envs_do_not_create_tables) & set(environ): self.create_db_tables() - self.genes = self.dynamodb.Table('gene_concepts') - self.metadata = self.dynamodb.Table('gene_metadata') + self.genes = self.dynamodb.Table(gene_concepts_table) + self.metadata = self.dynamodb.Table(gene_metadata_table) self.batch = self.genes.batch_writer() self.cached_sources = {} diff --git a/gene/etl/ensembl.py b/gene/etl/ensembl.py index 2ea25642..5d150c10 100644 --- a/gene/etl/ensembl.py +++ b/gene/etl/ensembl.py @@ -162,7 +162,7 @@ def _add_attributes(self, f, gene): src_id = val.split("Acc:")[-1].split("]")[0] if ":" in src_id: src_id = src_id.split(":")[-1] - source = self._get_xref_associated_with(src_name, src_id) # noqa: E501 + source = self._get_xref_associated_with(src_name, src_id) if "xrefs" in source: gene["xrefs"] = source["xrefs"] elif "associated_with" in source: @@ -199,11 +199,11 @@ def _get_xref_associated_with(self, src_name, src_id): source["xrefs"] = \ [f"{NamespacePrefix.NCBI.value}:{src_id}"] elif src_name.startswith("UniProt"): - source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"] # noqa: E501 + source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"] elif src_name.startswith("miRBase"): - source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"] # noqa: E501 + source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"] elif src_name.startswith("RFAM"): - source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"] # noqa: E501 + source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"] return source def perform_etl(self, *args, **kwargs): diff --git a/gene/query.py b/gene/query.py index 32ec2e6c..08d13f29 100644 --- a/gene/query.py +++ b/gene/query.py @@ -9,7 +9,8 @@ ServiceMeta, SourcePriority, NormalizeService, SearchService, \ GeneTypeFieldName, UnmergedNormalizationService, MatchesNormalized, \ BaseNormalizationService -from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor, Extension +from ga4gh.vrsatile.pydantic.core_models import Extension +from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor from botocore.exceptions import ClientError from boto3.dynamodb.conditions import Key from datetime import datetime @@ -87,11 +88,9 @@ def _cast_location_ints(record: Dict) -> Dict: """ if 'locations' in record: for loc in record['locations']: - if loc['interval']['type'] == "SequenceInterval": - loc['interval']['start']['value'] = \ - int(loc['interval']['start']['value']) - loc['interval']['end']['value'] = \ - int(loc['interval']['end']['value']) + if loc['type'] == 'SequenceLocation': + loc['start']['value'] = int(loc['start']['value']) + loc['end']['value'] = int(loc['end']['value']) return record def add_record(self, diff --git a/gene/schemas.py b/gene/schemas.py index 41b86239..90ff58b4 100644 --- a/gene/schemas.py +++ b/gene/schemas.py @@ -5,8 +5,8 @@ from pydantic import BaseModel, StrictBool, validator from enum import Enum, IntEnum from ga4gh.vrsatile.pydantic import return_value -from ga4gh.vrsatile.pydantic.vrs_models import SequenceLocation, \ - ChromosomeLocation, CURIE +from ga4gh.vrsatile.pydantic.core_models import CURIE +from ga4gh.vrsatile.pydantic.vrs_models import SequenceLocation, ChromosomeLocation from ga4gh.vrsatile.pydantic.vrsatile_models import GeneDescriptor from pydantic.types import StrictStr @@ -222,7 +222,7 @@ def schema_extra(schema: Dict[str, Any], prop.pop('title', None) schema['example'] = { "data_license": "custom", - "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", # noqa: E501 + "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", "version": "20201215", "data_url": "ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/", "rdp_url": "https://reusabledata.org/ncbi-gene.html", @@ -408,7 +408,7 @@ def schema_extra(schema: Dict[str, Any], 'name': 'gene-normalizer', 'version': '0.1.0', 'response_datetime': '2022-03-23 15:57:14.180908', - 'url': 'https://github.com/cancervariants/gene-normalization' # noqa: E501 + 'url': 'https://github.com/cancervariants/gene-normalization' } } @@ -458,10 +458,7 @@ def schema_extra(schema: Dict[str, Any], "gene_descriptor": { "id": "normalize.gene:BRAF", "type": "GeneDescriptor", - "gene": { - "gene_id": "hgnc:1097", - "type": "Gene" - }, + "gene_id": "hgnc:1097", "label": "BRAF", "xrefs": [ "ncbigene:673", @@ -477,7 +474,7 @@ def schema_extra(schema: Dict[str, Any], "extensions": [ { "name": "approved_name", - "value": "B-Raf proto-oncogene, serine/threonine kinase", # noqa: E501 + "value": "B-Raf proto-oncogene, serine/threonine kinase", "type": "Extension" }, { @@ -507,15 +504,12 @@ def schema_extra(schema: Dict[str, Any], { "name": "chromosome_location", "value": { - "_id": "ga4gh:VCL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", # noqa: E501 + "id": "ga4gh:CL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", # noqa: E501 "type": "ChromosomeLocation", "species_id": "taxonomy:9606", "chr": "7", - "interval": { - "end": "q34", - "start": "q34", - "type": "CytobandInterval" - } + "end": "q34", + "start": "q34", }, "type": "Extension" } @@ -570,7 +564,7 @@ def schema_extra(schema: Dict[str, Any], 'name': 'gene-normalizer', 'version': '0.1.19', 'response_datetime': '2022-03-23 15:57:14.180908', - 'url': 'https://github.com/cancervariants/gene-normalization' # noqa: E501 + 'url': 'https://github.com/cancervariants/gene-normalization' } } @@ -638,14 +632,11 @@ def schema_extra(schema: Dict[str, Any], "locations": [ { "type": "ChromosomeLocation", - "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 + "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 "species_id": "taxonomy:9606", "chr": "7", - "interval": { - "type": "CytobandInterval", - "start": "q22.1", - "end": "q22.1" - } + "start": "q22.1", + "end": "q22.1" } ], "aliases": [ @@ -699,19 +690,16 @@ def schema_extra(schema: Dict[str, Any], "location_annotations": [], "locations": [ { - "_id": "ga4gh:VSL.AF6wPZclBqTauGr3yx_CqmMndLKhq0Cm", # noqa: E501 + "id": "ga4gh:SL.AF6wPZclBqTauGr3yx_CqmMndLKhq0Cm", # noqa: E501 "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 100889993 - }, - "end": { - "type": "Number", - "value": 100896974 - } + "start": { + "type": "Number", + "value": 100889993 + }, + "end": { + "type": "Number", + "value": 100896974 } } ], @@ -752,29 +740,23 @@ def schema_extra(schema: Dict[str, Any], "locations": [ { "type": "ChromosomeLocation", - "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 + "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", # noqa: E501 "species_id": "taxonomy:9606", "chr": "7", - "interval": { - "type": "CytobandInterval", - "start": "q22.1", - "end": "q22.1" - } + "start": "q22.1", + "end": "q22.1" }, { - "_id": "ga4gh:VSL.EepkXho2doYcUT1DW54fT1a00_zkqrn0", # noqa: E501 + "id": "ga4gh:SL.EepkXho2doYcUT1DW54fT1a00_zkqrn0", # noqa: E501 "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 100889993 - }, - "end": { - "type": "Number", - "value": 100896994 - } + "start": { + "type": "Number", + "value": 100889993 + }, + "end": { + "type": "Number", + "value": 100896994 } } ], diff --git a/gene/version.py b/gene/version.py index 3fd08450..ba53979f 100644 --- a/gene/version.py +++ b/gene/version.py @@ -1,2 +1,2 @@ """Gene normalizer version""" -__version__ = "0.1.27" +__version__ = "0.2.0" diff --git a/gene/vrs_locations/chromosome_location.py b/gene/vrs_locations/chromosome_location.py index e2c36e02..b568a3df 100644 --- a/gene/vrs_locations/chromosome_location.py +++ b/gene/vrs_locations/chromosome_location.py @@ -21,14 +21,11 @@ def add_location(self, location): chr_location = models.ChromosomeLocation( species_id="taxonomy:9606", chr=location['chr'], - interval=models.CytobandInterval( - start=location['start'], - end=location['end'], - type="CytobandInterval" - ), + start=location['start'], + end=location['end'], type="ChromosomeLocation" ) - chr_location._id = ga4gh_identify(chr_location) + chr_location.id = ga4gh_identify(chr_location) return chr_location.as_dict() def get_location(self, location, gene): @@ -40,21 +37,16 @@ def get_location(self, location, gene): dictionary containing the ChromosomeLocation. Else, return None. """ - if 'chr' in location and 'start' in location \ - and 'end' in location: + if 'chr' in location and 'start' in location and 'end' in location: if location['start'] == 'p' and location['end'] == 'p': location['start'] = 'pter' location['end'] = 'cen' - elif location['start'] == 'q' and \ - location['end'] == 'q': + elif location['start'] == 'q' and location['end'] == 'q': location['start'] = 'cen' location['end'] = 'qter' try: - chr_location = \ - self.add_location( - location) - except python_jsonschema_objects.validators. \ - ValidationError as e: + chr_location = self.add_location(location) + except python_jsonschema_objects.validators.ValidationError as e: logger.info(f"{e} for {gene['symbol']}") else: return chr_location diff --git a/gene/vrs_locations/sequence_location.py b/gene/vrs_locations/sequence_location.py index 2dcc8f3e..0e2a8f1a 100644 --- a/gene/vrs_locations/sequence_location.py +++ b/gene/vrs_locations/sequence_location.py @@ -18,7 +18,12 @@ def get_aliases(self, sr, seqid) -> List[str]: :param str seqid: Sequence ID accession :return: List of aliases for seqid """ - return sr.translate_alias(seqid) + aliases = [] + try: + aliases = sr.translate_alias(seqid) + except KeyError as e: + logger.warning(f"SeqRepo raised KeyError: {e}") + return aliases def add_location(self, seqid, gene, params, sr): """Get a gene's Sequence Location. @@ -31,21 +36,20 @@ def add_location(self, seqid, gene, params, sr): """ location = dict() aliases = self.get_aliases(sr, seqid) + if not aliases: + return location + sequence_id = [a for a in aliases if a.startswith('ga4gh')][0] if gene.start != '.' and gene.end != '.' and sequence_id: if 0 <= gene.start <= gene.end: seq_location = models.SequenceLocation( sequence_id=sequence_id, - interval=models.SequenceInterval( - start=models.Number(value=gene.start - 1, - type="Number"), - end=models.Number(value=gene.end, type="Number"), - type="SequenceInterval" - ), + start=models.Number(value=gene.start - 1, type="Number"), + end=models.Number(value=gene.end, type="Number"), type="SequenceLocation" ) - seq_location._id = ga4gh_identify(seq_location) + seq_location.id = ga4gh_identify(seq_location) location = seq_location.as_dict() else: logger.info(f"{params['concept_id']} has invalid interval:" diff --git a/requirements-dev.txt b/requirements-dev.txt index 93f0cabf..162bdd2d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,164 +1,155 @@ -# -# These requirements were autogenerated by pipenv -# To regenerate from the project's Pipfile, run: -# -# pipenv lock --requirements --dev -# - -# Note: in pipenv 2020.x, "--dev" changed to emit both default and development -# requirements. To emit only development requirements, pass "--dev-only". - -i https://pypi.org/simple --e . -anyio==3.5.0; python_full_version >= '3.6.2' +anyio==3.6.1 appdirs==1.4.4 -appnope==0.1.2; sys_platform == 'darwin' -argcomplete==2.0.0; python_version >= '3.6' +appnope==0.1.3 +argcomplete==2.0.0 argh==0.26.2 -argon2-cffi-bindings==21.2.0; python_version >= '3.6' -argon2-cffi==21.3.0; python_version >= '3.6' -asgiref==3.5.0; python_version >= '3.7' -asttokens==2.0.5 -attrs==21.4.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -babel==2.9.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +asttokens==2.0.8 +attrs==22.1.0 backcall==0.2.0 -beautifulsoup4==4.10.0 +beautifulsoup4==4.11.1 biocommons.seqrepo==0.6.5 -bioutils==0.5.5; python_version >= '3.6' -bleach==4.1.0; python_version >= '3.6' -boto3==1.21.26 -botocore==1.24.26; python_version >= '3.6' +bioutils==0.5.7 +boto3==1.24.55 +botocore==1.27.55 bs4==0.0.1 -canonicaljson==1.6.0; python_version ~= '3.7' -certifi==2021.10.8 -cffi==1.15.0 -cfgv==3.3.1; python_full_version >= '3.6.1' -charset-normalizer==2.0.12; python_version >= '3' -click==8.0.4 -coloredlogs==15.0.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -configparser==5.2.0; python_version >= '3.6' -coverage==6.3.2 -coveralls==3.3.1 -cssselect==1.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -debugpy==1.6.0; python_version >= '3.7' -decorator==5.1.1; python_version >= '3.5' -defusedxml==0.7.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -distlib==0.3.4 -docopt==0.6.2 -entrypoints==0.4; python_version >= '3.6' -executing==0.8.3 +canonicaljson==1.6.2 +certifi==2022.6.15 +charset-normalizer==2.1.0 +click==8.1.3 +coloredlogs==15.0.1 +configparser==5.2.0 +cssselect==1.1.0 +decorator==5.1.1 +executing==0.10.0 fake-useragent==0.1.11 -fastapi==0.75.0 -filelock==3.6.0; python_version >= '3.7' -flake8-docstrings==1.6.0 -flake8==4.0.1 -ga4gh.vrs[extras]==0.8.0.dev0 -ga4gh.vrsatile.pydantic==0.0.10 -gffutils==0.10.1 -h11==0.13.0; python_version >= '3.6' +fastapi==0.79.1 +ga4gh.vrs[extras]==0.8.6.dev0 +ga4gh.vrsatile.pydantic==0.1.dev0 +gffutils==0.11.0 +h11==0.13.0 hgvs==1.5.2 -humanfriendly==10.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -identify==2.4.12; python_version >= '3.7' -idna==3.3; python_version >= '3' -importlib-metadata==4.11.3; python_version < '3.10' -inflection==0.5.1; python_version >= '3.5' -iniconfig==1.1.1 -ipykernel==6.9.2; python_version >= '3.7' -ipython-genutils==0.2.0 -ipython==8.1.1; python_version >= '3.8' -jedi==0.18.1; python_version >= '3.6' -jinja2==3.1.0; python_version >= '3.7' -jmespath==1.0.0; python_version >= '3.7' -json5==0.9.6 +humanfriendly==10.0 +idna==3.3 +importlib-metadata==4.12.0 +inflection==0.5.1 +ipython==8.4.0 +jedi==0.18.1 +jmespath==1.0.1 jsonschema==3.2.0 -jupyter-client==7.1.2; python_full_version >= '3.6.1' -jupyter-core==4.9.2; python_version >= '3.6' -jupyter-server==1.15.6; python_version >= '3.7' -jupyterlab-pygments==0.1.2 -jupyterlab-server==2.11.2; python_version >= '3.7' -jupyterlab==3.3.2 -lxml==4.8.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -markdown==3.3.6; python_version >= '3.6' -markupsafe==2.1.1; python_version >= '3.7' -matplotlib-inline==0.1.3; python_version >= '3.5' -mccabe==0.6.1 -mistune==0.8.4 -mock==4.0.3 -nbclassic==0.3.7; python_version >= '3.7' -nbclient==0.5.13; python_version >= '3.7' -nbconvert==6.4.4; python_version >= '3.7' -nbformat==5.2.0; python_version >= '3.7' -nest-asyncio==1.5.4; python_version >= '3.5' -nodeenv==1.6.0 -notebook-shim==0.1.0; python_version >= '3.7' -notebook==6.4.10; python_version >= '3.6' -numpy==1.22.3; python_version >= '3.8' -packaging==21.3; python_version >= '3.6' -pandocfilters==1.5.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +lxml==4.9.1 +markdown==3.4.1 +matplotlib-inline==0.1.6 +numpy==1.23.2 parse==1.19.0 parsley==1.3 -parso==0.8.3; python_version >= '3.6' -pexpect==4.8.0; sys_platform != 'win32' +parso==0.8.3 +pexpect==4.8.0 pickleshare==0.7.5 -platformdirs==2.5.1; python_version >= '3.7' -pluggy==1.0.0; python_version >= '3.6' -pre-commit==2.17.0 -prometheus-client==0.13.1; python_version >= '3.6' -prompt-toolkit==3.0.28; python_full_version >= '3.6.2' -psutil==5.9.0; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' -psycopg2==2.9.3; python_version >= '3.6' +prompt-toolkit==3.0.30 +psycopg2==2.9.3 +psycopg2-binary==2.9.3 ptyprocess==0.7.0 pure-eval==0.2.2 -py==1.11.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -pycodestyle==2.8.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -pycparser==2.21 -pydantic==1.9.0 -pydocstyle==6.1.1; python_version >= '3.6' +pydantic==1.9.2 pyee==8.2.2 -pyfaidx==0.6.4 -pyflakes==2.4.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -pygments==2.11.2; python_version >= '3.5' -pyparsing==3.0.7; python_version >= '3.6' -pyppeteer==1.0.2; python_version >= '3.7' and python_version < '4.0' +pyfaidx==0.7.1 +pygments==2.13.0 +pyppeteer==1.0.2 pyquery==1.4.3 -pyrsistent==0.18.1; python_version >= '3.7' -pysam==0.18.0 -pytest-cov==3.0.0 -pytest==7.1.1 -python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +pyrsistent==0.18.1 +pysam==0.19.1 +python-dateutil==2.8.2 python-jsonschema-objects==0.4.1 -pytz==2022.1 -pyyaml==6.0; python_version >= '3.6' -pyzmq==22.3.0; python_version >= '3.6' -requests-html==0.10.0; python_version >= '3.6' -requests==2.27.1 -s3transfer==0.5.2; python_version >= '3.6' +pyyaml==6.0 +requests==2.28.1 +requests-html==0.10.0 +s3transfer==0.6.0 +setuptools==65.1.0 +simplejson==3.17.6 +six==1.16.0 +sniffio==1.2.0 +soupsieve==2.3.2.post1 +sqlparse==0.4.2 +stack-data==0.4.0 +starlette==0.19.1 +tabulate==0.8.10 +tqdm==4.64.0 +traitlets==5.3.0 +typing-extensions==4.3.0 +urllib3==1.26.11 +uvicorn==0.18.2 +w3lib==2.0.1 +wcwidth==0.2.5 +websockets==10.3 +yoyo-migrations==7.3.2 +zipp==3.8.1 +argon2-cffi==21.3.0 +argon2-cffi-bindings==21.2.0 +babel==2.10.3 +bleach==5.0.1 +cffi==1.15.1 +cfgv==3.3.1 +coverage==6.4.4 +coveralls==3.3.1 +debugpy==1.6.3 +defusedxml==0.7.1 +distlib==0.3.5 +docopt==0.6.2 +entrypoints==0.4 +fastjsonschema==2.16.1 +filelock==3.8.0 +flake8==5.0.4 +flake8-docstrings==1.6.0 +-e . +identify==2.5.3 +iniconfig==1.1.1 +ipykernel==6.15.1 +ipython-genutils==0.2.0 +jinja2==3.1.2 +json5==0.9.10 +jupyter-client==7.3.4 +jupyter-core==4.11.1 +jupyter-server==1.18.1 +jupyterlab==3.4.5 +jupyterlab-pygments==0.2.2 +jupyterlab-server==2.15.0 +markupsafe==2.1.1 +mccabe==0.7.0 +mistune==0.8.4 +mock==4.0.3 +nbclassic==0.4.3 +nbclient==0.6.6 +nbconvert==6.5.3 +nbformat==5.4.0 +nest-asyncio==1.5.5 +nodeenv==1.7.0 +notebook==6.4.12 +notebook-shim==0.1.0 +packaging==21.3 +pandocfilters==1.5.0 +platformdirs==2.5.2 +pluggy==1.0.0 +pre-commit==2.20.0 +prometheus-client==0.14.1 +psutil==5.9.1 +py==1.11.0 +pycodestyle==2.9.1 +pycparser==2.21 +pydocstyle==6.1.1 +pyflakes==2.5.0 +pyparsing==3.0.9 +pytest==7.1.2 +pytest-cov==3.0.0 +pytz==2022.2.1 +pyzmq==23.2.1 send2trash==1.8.0 -setuptools==61.0.0; python_version >= '3.7' -simplejson==3.17.6; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' -six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -sniffio==1.2.0; python_version >= '3.5' snowballstemmer==2.2.0 -soupsieve==2.3.1; python_version >= '3.6' -sqlparse==0.4.2; python_version >= '3.5' -stack-data==0.2.0 -starlette==0.17.1; python_version >= '3.6' -tabulate==0.8.9 -terminado==0.13.3; python_version >= '3.7' -testpath==0.6.0; python_version >= '3.5' -toml==0.10.2; python_version >= '2.6' and python_version not in '3.0, 3.1, 3.2, 3.3' -tomli==2.0.1; python_version >= '3.7' -tornado==6.1; python_version >= '3.5' -tqdm==4.63.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -traitlets==5.1.1; python_version >= '3.7' -typing-extensions==4.1.1; python_version >= '3.6' -urllib3==1.26.9; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0' -uvicorn==0.17.6 -virtualenv==20.13.4; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -w3lib==1.22.0 -wcwidth==0.2.5 +terminado==0.15.0 +tinycss2==1.1.1 +toml==0.10.2 +tomli==2.0.1 +tornado==6.2 +virtualenv==20.16.3 webencodings==0.5.1 -websocket-client==1.3.1; python_version >= '3.6' -websockets==10.2; python_version >= '3.7' -yoyo-migrations==7.3.2 -zipp==3.7.0; python_version >= '3.7' +websocket-client==1.3.3 diff --git a/requirements.txt b/requirements.txt index 7386be8f..ba7879b7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,93 +1,86 @@ -# -# These requirements were autogenerated by pipenv -# To regenerate from the project's Pipfile, run: -# -# pipenv lock --requirements -# - -i https://pypi.org/simple -anyio==3.5.0; python_full_version >= '3.6.2' +anyio==3.6.1 appdirs==1.4.4 -appnope==0.1.2; sys_platform == 'darwin' -argcomplete==2.0.0; python_version >= '3.6' +appnope==0.1.3 +argcomplete==2.0.0 argh==0.26.2 -asgiref==3.5.0; python_version >= '3.7' -asttokens==2.0.5 -attrs==21.4.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' +asttokens==2.0.8 +attrs==22.1.0 backcall==0.2.0 -beautifulsoup4==4.10.0 +beautifulsoup4==4.11.1 biocommons.seqrepo==0.6.5 -bioutils==0.5.5; python_version >= '3.6' -boto3==1.21.26 -botocore==1.24.26; python_version >= '3.6' +bioutils==0.5.7 +boto3==1.24.55 +botocore==1.27.55 bs4==0.0.1 -canonicaljson==1.6.0; python_version ~= '3.7' -certifi==2021.10.8 -charset-normalizer==2.0.12; python_version >= '3' -click==8.0.4 -coloredlogs==15.0.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -configparser==5.2.0; python_version >= '3.6' -cssselect==1.1.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -decorator==5.1.1; python_version >= '3.5' -executing==0.8.3 +canonicaljson==1.6.2 +certifi==2022.6.15 +charset-normalizer==2.1.0 +click==8.1.3 +coloredlogs==15.0.1 +configparser==5.2.0 +cssselect==1.1.0 +decorator==5.1.1 +executing==0.10.0 fake-useragent==0.1.11 -fastapi==0.75.0 -ga4gh.vrs[extras]==0.8.0.dev0 -ga4gh.vrsatile.pydantic==0.0.10 -gffutils==0.10.1 -h11==0.13.0; python_version >= '3.6' +fastapi==0.79.1 +ga4gh.vrs[extras]==0.8.6.dev0 +ga4gh.vrsatile.pydantic==0.1.dev0 +gffutils==0.11.0 +h11==0.13.0 hgvs==1.5.2 -humanfriendly==10.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -idna==3.3; python_version >= '3' -importlib-metadata==4.11.3; python_version < '3.10' -inflection==0.5.1; python_version >= '3.5' -ipython==8.1.1; python_version >= '3.8' -jedi==0.18.1; python_version >= '3.6' -jmespath==1.0.0; python_version >= '3.7' +humanfriendly==10.0 +idna==3.3 +importlib-metadata==4.12.0 +inflection==0.5.1 +ipython==8.4.0 +jedi==0.18.1 +jmespath==1.0.1 jsonschema==3.2.0 -lxml==4.8.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' -markdown==3.3.6; python_version >= '3.6' -matplotlib-inline==0.1.3; python_version >= '3.5' -numpy==1.22.3; python_version >= '3.8' +lxml==4.9.1 +markdown==3.4.1 +matplotlib-inline==0.1.6 +numpy==1.23.2 parse==1.19.0 parsley==1.3 -parso==0.8.3; python_version >= '3.6' -pexpect==4.8.0; sys_platform != 'win32' +parso==0.8.3 +pexpect==4.8.0 pickleshare==0.7.5 -prompt-toolkit==3.0.28; python_full_version >= '3.6.2' -psycopg2==2.9.3; python_version >= '3.6' +prompt-toolkit==3.0.30 +psycopg2==2.9.3 +psycopg2-binary==2.9.3 ptyprocess==0.7.0 pure-eval==0.2.2 -pydantic==1.9.0 +pydantic==1.9.2 pyee==8.2.2 -pyfaidx==0.6.4 -pygments==2.11.2; python_version >= '3.5' -pyppeteer==1.0.2; python_version >= '3.7' and python_version < '4.0' +pyfaidx==0.7.1 +pygments==2.13.0 +pyppeteer==1.0.2 pyquery==1.4.3 -pyrsistent==0.18.1; python_version >= '3.7' -pysam==0.18.0 -python-dateutil==2.8.2; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' +pyrsistent==0.18.1 +pysam==0.19.1 +python-dateutil==2.8.2 python-jsonschema-objects==0.4.1 -pyyaml==6.0; python_version >= '3.6' -requests-html==0.10.0; python_version >= '3.6' -requests==2.27.1 -s3transfer==0.5.2; python_version >= '3.6' -setuptools==61.0.0; python_version >= '3.7' -simplejson==3.17.6; python_version >= '2.5' and python_version not in '3.0, 3.1, 3.2, 3.3' -six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -sniffio==1.2.0; python_version >= '3.5' -soupsieve==2.3.1; python_version >= '3.6' -sqlparse==0.4.2; python_version >= '3.5' -stack-data==0.2.0 -starlette==0.17.1; python_version >= '3.6' -tabulate==0.8.9 -tqdm==4.63.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' -traitlets==5.1.1; python_version >= '3.7' -typing-extensions==4.1.1; python_version >= '3.6' -urllib3==1.26.9; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' and python_version < '4.0' -uvicorn==0.17.6 -w3lib==1.22.0 +pyyaml==6.0 +requests==2.28.1 +requests-html==0.10.0 +s3transfer==0.6.0 +setuptools==65.1.0 +simplejson==3.17.6 +six==1.16.0 +sniffio==1.2.0 +soupsieve==2.3.2.post1 +sqlparse==0.4.2 +stack-data==0.4.0 +starlette==0.19.1 +tabulate==0.8.10 +tqdm==4.64.0 +traitlets==5.3.0 +typing-extensions==4.3.0 +urllib3==1.26.11 +uvicorn==0.18.2 +w3lib==2.0.1 wcwidth==0.2.5 -websockets==10.2; python_version >= '3.7' +websockets==10.3 yoyo-migrations==7.3.2 -zipp==3.7.0; python_version >= '3.7' +zipp==3.8.1 diff --git a/setup.cfg b/setup.cfg index 1dfcc3c0..1916d4e1 100644 --- a/setup.cfg +++ b/setup.cfg @@ -33,8 +33,8 @@ install_requires = gffutils requests biocommons.seqrepo - ga4gh.vrs[extras] >= 0.7.5.dev1 - ga4gh.vrsatile.pydantic >= 0.0.10 + ga4gh.vrs[extras] >= 0.8.6dev0 + ga4gh.vrsatile.pydantic >= 0.1.dev0 tests_require = diff --git a/tests/unit/data/etl_data/ensembl_106.gff3 b/tests/unit/data/etl_data/ensembl_107.gff3 similarity index 96% rename from tests/unit/data/etl_data/ensembl_106.gff3 rename to tests/unit/data/etl_data/ensembl_107.gff3 index 5256b224..78687c6e 100644 --- a/tests/unit/data/etl_data/ensembl_106.gff3 +++ b/tests/unit/data/etl_data/ensembl_107.gff3 @@ -197,12 +197,12 @@ #!genome-version GRCh38 #!genome-date 2013-12 #!genome-build-accession GCA_000001405.28 -#!genebuild-last-updated 2021-03 +#!genebuild-last-updated 2022-04 1 GRCh38 chromosome 1 248956422 . . . ID=chromosome:1;Alias=CM000663.2,chr1,NC_000001.11 ### 1 havana pseudogene 11869 14409 . + . ID=gene:ENSG00000223972;Name=DDX11L1;biotype=transcribed_unprocessed_pseudogene;description=DEAD/H-box helicase 11 like 1 (pseudogene) [Source:HGNC Symbol%3BAcc:HGNC:37102];gene_id=ENSG00000223972;logic_name=havana_homo_sapiens;version=5 ### -1 ensembl_havana gene 220148293 220272453 . - . ID=gene:ENSG00000118873;Name=RAB3GAP2;biotype=protein_coding;description=RAB3 GTPase activating non-catalytic protein subunit 2 [Source:HGNC Symbol%3BAcc:HGNC:17168];gene_id=ENSG00000118873;logic_name=ensembl_havana_gene_homo_sapiens;version=16 +1 ensembl_havana gene 220148293 220272529 . - . ID=gene:ENSG00000118873;Name=RAB3GAP2;biotype=protein_coding;description=RAB3 GTPase activating non-catalytic protein subunit 2 [Source:HGNC Symbol%3BAcc:HGNC:17168];gene_id=ENSG00000118873;logic_name=ensembl_havana_gene_homo_sapiens;version=17 ### 2 GRCh38 chromosome 1 242193529 . . . ID=chromosome:2;Alias=CM000664.2,chr2,NC_000002.12 ### @@ -227,7 +227,6 @@ 11 ensembl_havana gene 10751246 10801625 . + . ID=gene:ENSG00000198730;Name=CTR9;biotype=protein_coding;description=CTR9 homolog%2C Paf1/RNA polymerase II complex component [Source:HGNC Symbol%3BAcc:HGNC:16850];gene_id=ENSG00000198730;logic_name=ensembl_havana_gene_homo_sapiens;version=9 ### 15 GRCh38 chromosome 1 101991189 . . . ID=chromosome:15;Alias=CM000677.2,chr15,NC_000015.10 -### 15 ensembl_havana gene 89784895 89815401 . - . ID=gene:ENSG00000166825;Name=ANPEP;biotype=protein_coding;description=alanyl aminopeptidase%2C membrane [Source:HGNC Symbol%3BAcc:HGNC:500];gene_id=ENSG00000166825;logic_name=ensembl_havana_gene_homo_sapiens;version=15 ### 17 GRCh38 chromosome 1 83257441 . . . ID=chromosome:17;Alias=CM000679.2,chr17,NC_000017.11 @@ -242,7 +241,7 @@ ### X GRCh38 chromosome 1 156040895 . . . ID=chromosome:X;Alias=CM000685.2,chrX,NC_000023.11 ### -X havana ncRNA_gene 154424378 154428512 . - . ID=gene:ENSG00000197180;Name=CH17-340M24.3;biotype=lncRNA;description=uncharacterized protein BC009467 [Source:NCBI gene (formerly Entrezgene)%3BAcc:158960];gene_id=ENSG00000197180;logic_name=havana_homo_sapiens;version=3 +X havana ncRNA_gene 154424378 154428526 . - . ID=gene:ENSG00000197180;Name=ATP6AP1-DT;biotype=lncRNA;description=ATP6AP1 divergent transcript [Source:HGNC Symbol%3BAcc:HGNC:25138];gene_id=ENSG00000197180;logic_name=havana_homo_sapiens;version=4 ### -X ensembl_havana gene 155612572 155782459 . + . ID=gene:ENSG00000168939;Name=SPRY3;biotype=protein_coding;description=sprouty RTK signaling antagonist 3 [Source:HGNC Symbol%3BAcc:HGNC:11271];gene_id=ENSG00000168939;logic_name=ensembl_havana_gene_homo_sapiens;version=12 +X ensembl_havana gene 155612572 155782459 . + . ID=gene:ENSG00000168939;Name=SPRY3;biotype=protein_coding;description=sprouty RTK signaling antagonist 3 [Source:HGNC Symbol%3BAcc:HGNC:11271];gene_id=ENSG00000168939;logic_name=ensembl_havana_gene_homo_sapiens;version=13 ### diff --git a/tests/unit/data/etl_data/hgnc_20210810.json b/tests/unit/data/etl_data/hgnc_20210810.json index bdb65134..666e01d7 100644 --- a/tests/unit/data/etl_data/hgnc_20210810.json +++ b/tests/unit/data/etl_data/hgnc_20210810.json @@ -251,66 +251,66 @@ "location_sortable": "22pter-q11" }, { - "date_approved_reserved": "1999-09-29", - "alias_name": [ - "chromatin assembly factor I (150 kDa)" - ], - "vega_id": "OTTHUMG00000181922", - "locus_group": "protein-coding gene", "mane_select": [ "ENST00000301280.10", "NM_005483.3" - ], - "status": "Approved", - "alias_symbol": [ + ], + "locus_group": "protein-coding gene", + "alias_name": [ + "chromatin assembly factor I (150 kDa)" + ], + "status": "Approved", + "vega_id": "OTTHUMG00000181922", + "hgnc_id": "HGNC:1910", + "location_sortable": "19p13.3", + "uuid": "cacf6159-9c08-4ca4-a648-cb1895edee54", + "date_name_changed": "2015-11-23", + "entrez_id": "10036", + "pubmed_id": [ + 7600578 + ], + "ccds_id": [ + "CCDS32875" + ], + "ucsc_id": "uc002mal.4", + "name": "chromatin assembly factor 1 subunit A", + "ena": [ + "U20979" + ], + "mgd_id": [ + "MGI:1351331" + ], + "date_modified": "2019-08-21", + "rgd_id": [ + "RGD:1590865" + ], + "alias_symbol": [ "CAF1P150", "CAF1B", "CAF-1", "CAF1", "P150", "MGC71229" - ], - "_version_": 1707696198253543425, - "uuid": "cbaac19b-6e86-4b58-9053-e34c3aa5d99e", - "prev_name": [ + ], + "symbol": "CHAF1A", + "date_approved_reserved": "1999-09-29", + "prev_name": [ "chromatin assembly factor 1, subunit A (p150)" - ], - "refseq_accession": [ - "NM_005483" - ], - "locus_type": "gene with protein product", - "agr": "HGNC:1910", - "hgnc_id": "HGNC:1910", - "rgd_id": [ - "RGD:1590865" - ], - "ensembl_gene_id": "ENSG00000167670", - "entrez_id": "10036", - "omim_id": [ + ], + "omim_id": [ "601246" - ], - "symbol": "CHAF1A", - "date_name_changed": "2015-11-23", - "location": "19p13.3", - "name": "chromatin assembly factor 1 subunit A", - "date_modified": "2019-08-21", - "mgd_id": [ - "MGI:1351331" - ], - "ucsc_id": "uc002mal.4", - "uniprot_ids": [ + ], + "refseq_accession": [ + "NM_005483" + ], + "location": "19p13.3", + "locus_type": "gene with protein product", + "_version_": 1741469986589769728, + "agr": "HGNC:1910", + "ensembl_gene_id": "ENSG00000167670", + "uniprot_ids": [ "Q13111" - ], - "ccds_id": [ - "CCDS32875" - ], - "ena": [ - "U20979" - ], - "pubmed_id": [ - 7600578 - ], - "location_sortable": "19p13.3" + ] }, { "date_approved_reserved": "2003-11-13", @@ -459,68 +459,71 @@ "location_sortable": "Xp22.32 and Yp11.3" }, { - "date_approved_reserved": "2005-05-06", + "prev_symbol": [ + "A3GALT2P" + ], + "date_symbol_changed": "2013-03-11", + "entrez_id": "127550", + "date_modified": "2018-02-08", + "mgd_id": [ + "MGI:2685279" + ], + "location_sortable": "01p35.1", + "status": "Approved", + "vega_id": "OTTHUMG00000004125", + "date_name_changed": "2013-03-11", + "uuid": "68885c2c-4b8a-4001-a77e-41db23685bfe", "alias_name": [ "iGb3 synthase", "isoglobotriaosylceramide synthase" ], - "vega_id": "OTTHUMG00000004125", "locus_group": "protein-coding gene", - "mane_select": [ - "ENST00000442999.3", - "NM_001080438.1" - ], - "status": "Approved", - "alias_symbol": [ - "IGBS3S", - "IGB3S" - ], - "_version_": 1707696195380445184, - "uuid": "ec929101-693b-4afc-ae1b-bbe1d38f9c62", - "prev_name": [ - "alpha 1,3-galactosyltransferase 2, pseudogene" - ], - "refseq_accession": [ - "NM_001080438" - ], + "ensembl_gene_id": "ENSG00000184389", + "_version_": 1741469984037535744, "locus_type": "gene with protein product", - "agr": "HGNC:30005", - "hgnc_id": "HGNC:30005", "rgd_id": [ "RGD:727913" ], - "ensembl_gene_id": "ENSG00000184389", - "entrez_id": "127550", + "date_approved_reserved": "2005-05-06", + "ccds_id": [ + "CCDS60080" + ], + "ucsc_id": "uc031plq.1", + "pubmed_id": [ + 10854427, + 18630988 + ], + "name": "alpha 1,3-galactosyltransferase 2", + "hgnc_id": "HGNC:30005", "gene_group": [ "Glycosyltransferase family 6" ], - "symbol": "A3GALT2", - "date_name_changed": "2013-03-11", - "location": "1p35.1", - "name": "alpha 1,3-galactosyltransferase 2", - "date_modified": "2018-02-08", - "mgd_id": [ - "MGI:2685279" + "mane_select": [ + "ENST00000442999.3", + "NM_001080438.1" ], - "ucsc_id": "uc031plq.1", - "prev_symbol": [ - "A3GALT2P" + "gene_group_id": [ + 429 ], "uniprot_ids": [ "U3KPV4" ], - "ccds_id": [ - "CCDS60080" + "agr": "HGNC:30005", + "location": "1p35.1", + "refseq_accession": [ + "NM_001080438" ], - "gene_group_id": [ - 429 + "omim_id": [ + "619850" ], - "date_symbol_changed": "2013-03-11", - "pubmed_id": [ - 10854427, - 18630988 + "symbol": "A3GALT2", + "alias_symbol": [ + "IGBS3S", + "IGB3S" ], - "location_sortable": "01p35.1" + "prev_name": [ + "alpha 1,3-galactosyltransferase 2, pseudogene" + ] }, { "date_approved_reserved": "2009-02-18", @@ -1173,4 +1176,4 @@ ], "start": 0 } -} +} \ No newline at end of file diff --git a/tests/unit/data/etl_data/ncbi_GRCh38.p14.gff b/tests/unit/data/etl_data/ncbi_GRCh38.p14.gff index cad9dcee..46599b07 100644 --- a/tests/unit/data/etl_data/ncbi_GRCh38.p14.gff +++ b/tests/unit/data/etl_data/ncbi_GRCh38.p14.gff @@ -49,7 +49,7 @@ NC_000011.10 Curated Genomic exon 117138227 117138867 . ##sequence-region NC_000012.12 1 133275309 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 NC_000012.12 RefSeq region 1 133275309 . + . ID=NC_000012.12:1..133275309;Dbxref=taxon:9606;Name=12;chromosome=12;gbkey=Src;genome=chromosome;mol_type=genomic DNA -NC_000012.12 BestRefSeq%2CGnomon gene 133037301 133063299 . + . ID=gene-ZNF84;Dbxref=GeneID:7637,HGNC:HGNC:13159,MIM:618554;Name=ZNF84;description=zinc finger protein 84;gbkey=Gene;gene=ZNF84;gene_biotype=protein_coding;gene_synonym=HPF2 +NC_000012.12 BestRefSeq%2CGnomon gene 133037509 133063299 . + . ID=gene-ZNF84;Dbxref=GeneID:7637,HGNC:HGNC:13159,MIM:618554;Name=ZNF84;description=zinc finger protein 84;gbkey=Gene;gene=ZNF84;gene_biotype=protein_coding;gene_synonym=HPF2 ##sequence-region NC_000015.10 1 101991189 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 NC_000015.10 RefSeq region 1 101991189 . + . ID=NC_000015.10:1..101991189;Dbxref=taxon:9606;Name=15;chromosome=15;gbkey=Src;genome=chromosome;mol_type=genomic DNA @@ -58,7 +58,7 @@ NC_000015.10 BestRefSeq gene 89784895 89814852 . - . ID=gene-ANPEP;Dbxref=GeneID ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 NC_000019.10 RefSeq region 1 58617616 . + . ID=NC_000019.10:1..58617616;Dbxref=taxon:9606;Name=19;chromosome=19;gbkey=Src;genome=chromosome;mol_type=genomic DNA NC_000019.10 BestRefSeq%2CGnomon gene 38211006 38229695 . - . ID=gene-DPF1;Dbxref=GeneID:8193,HGNC:HGNC:20225,MIM:601670;Name=DPF1;description=double PHD fingers 1;gbkey=Gene;gene=DPF1;gene_biotype=protein_coding;gene_synonym=BAF45b,NEUD4,neuro-d4 -NC_000019.10 BestRefSeq%2CGnomon gene 4402596 4448322 . + . ID=gene-CHAF1A;Dbxref=GeneID:10036,HGNC:HGNC:1910,MIM:601246;Name=CHAF1A;description=chromatin assembly factor 1 subunit A;gbkey=Gene;gene=CHAF1A;gene_biotype=protein_coding;gene_synonym=CAF-1,CAF1,CAF1B,CAF1P150,P150 +NC_000019.10 BestRefSeq%2CGnomon gene 4402640 4450830 . + . ID=gene-CHAF1A;Dbxref=GeneID:10036,HGNC:HGNC:1910,MIM:601246;Name=CHAF1A;description=chromatin assembly factor 1 subunit A;gbkey=Gene;gene=CHAF1A;gene_biotype=protein_coding;gene_synonym=CAF-1,CAF1,CAF1B,CAF1P150,P150 ##sequence-region NT_187390.1 1 42811 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 NT_187390.1 RefSeq region 1 42811 . + . ID=NT_187390.1:1..42811;Dbxref=taxon:9606;Name=22;chromosome=22;gbkey=Src;genome=genomic;map=unlocalized;mol_type=genomic DNA @@ -83,7 +83,7 @@ NC_000023.11 BestRefSeq gene 155612586 155782459 . + . ID=gene-SPRY3;Dbxref=Gene ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 NC_000024.10 RefSeq region 1 57227415 . + . ID=NC_000024.10:1..57227415;Dbxref=taxon:9606;Name=Y;chromosome=Y;gbkey=Src;genome=chromosome;mol_type=genomic DNA NC_000024.10 BestRefSeq gene 1386152 1392113 . - . ID=gene-SLC25A6-2;Dbxref=GeneID:293,HGNC:HGNC:10992,MIM:403000;Name=SLC25A6;description=solute carrier family 25 member 6;gbkey=Gene;gene=SLC25A6;gene_biotype=protein_coding;gene_synonym=AAC3,ANT,ANT 2,ANT 3,ANT3,ANT3Y -NC_000024.10 BestRefSeq gene 56923423 56968979 . + . ID=gene-SPRY3-2;Dbxref=GeneID:10251,HGNC:HGNC:11271,MIM:300531;Name=SPRY3;description=sprouty RTK signaling antagonist 3;gbkey=Gene;gene=SPRY3;gene_biotype=protein_coding;gene_synonym=spry-3;partial=true;start_range=.,56923423 +NC_000024.10 BestRefSeq gene 56954316 56968979 . + . ID=gene-SPRY3-2;Dbxref=GeneID:10251,HGNC:HGNC:11271,MIM:300531;Name=SPRY3;description=sprouty RTK signaling antagonist 3;gbkey=Gene;gene=SPRY3;gene_biotype=protein_coding;gene_synonym=spry-3 ##sequence-region NT_167246.2 1 4677643 ##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=9606 NT_167246.2 RefSeq region 1 4677643 . + . ID=NT_167246.2:1..4677643;Dbxref=taxon:9606;Name=6;chromosome=6;gbkey=Src;genome=genomic;map=6p22.1-21.32;mol_type=genomic DNA diff --git a/tests/unit/data/etl_data/ncbi_history_20210813.tsv b/tests/unit/data/etl_data/ncbi_history_20210813.tsv index 06b23b48..b7559bc0 100644 --- a/tests/unit/data/etl_data/ncbi_history_20210813.tsv +++ b/tests/unit/data/etl_data/ncbi_history_20210813.tsv @@ -16,3 +16,4 @@ 9606 7637 100287429 LOC100287429 20110803 9606 - 103344718 HOTS 20200620 9606 - 544580 AASTH23 20190503 +9606 10036 107985297 LOC107985297 20220408 \ No newline at end of file diff --git a/tests/unit/data/etl_data/ncbi_info_20210813.tsv b/tests/unit/data/etl_data/ncbi_info_20210813.tsv index 46735b88..bfc0e5ea 100644 --- a/tests/unit/data/etl_data/ncbi_info_20210813.tsv +++ b/tests/unit/data/etl_data/ncbi_info_20210813.tsv @@ -19,7 +19,7 @@ 9606 25782 RAB3GAP2 - MARTS1|RAB3-GAP150|RAB3GAP150|SPG69|WARBM2|p150 MIM:609275|HGNC:HGNC:17168|Ensembl:ENSG00000118873 1 1q41 RAB3 GTPase activating non-catalytic protein subunit 2 protein-coding RAB3GAP2 RAB3 GTPase activating non-catalytic protein subunit 2 O rab3 GTPase-activating protein non-catalytic subunit|RAB3 GTPase activating protein subunit 2 (non-catalytic)|RGAP-iso|rab3 GTPase-activating protein 150 kDa subunit|rab3-GAP p150|rab3-GAP regulatory subunit 20210709 - 9606 293 SLC25A6 - AAC3|ANT|ANT 2|ANT 3|ANT3|ANT3Y MIM:300151|MIM:403000|HGNC:HGNC:10992|Ensembl:ENSG00000169100 X|Y X;Y solute carrier family 25 member 6 protein-coding SLC25A6 solute carrier family 25 member 6 O ADP/ATP translocase 3|ADP,ATP carrier protein 3|ADP,ATP carrier protein, liver|ADP/ATP translocator of liver|adenine nucleotide translocator 3|epididymis secretory sperm binding protein|solute carrier family 25 (mitochondrial carrier; adenine nucleotide translocator), member 6 20210708 - 9606 100049159 SPG37 - - MIM:611945 8 8p21.2-q13.3 spastic paraplegia 37 (autosomal dominant) unknown - - - - 20191002 - -9606 10251 SPRY3 - spry-3 MIM:300531|HGNC:HGNC:11271|Ensembl:ENSG00000168939 X|Y Xq28 and Yq12 sprouty RTK signaling antagonist 3 protein-coding SPRY3 sprouty RTK signaling antagonist 3 O protein sprouty homolog 3|antagonist of FGF signaling|sprouty homolog 3|sprouty3 20210807 - +9606 10251 SPRY3 - spry-3 MIM:300531|HGNC:HGNC:11271|Ensembl:ENSG00000168939|AllianceGenome:HGNC:11271 X|Y Xq28 and Yq12 sprouty RTK signaling antagonist 3 protein-coding SPRY3 sprouty RTK signaling antagonist 3 O protein sprouty homolog 3|antagonist of FGF signaling|sprouty homolog 3|sprouty3 20220805 - 9606 7637 ZNF84 - HPF2 MIM:618554|HGNC:HGNC:13159|Ensembl:ENSG00000198040 12 12q24.33|map from Rosati ref via FISH [AFS] zinc finger protein 84 protein-coding ZNF84 zinc finger protein 84 O zinc finger protein 84|zinc finger protein HPF2 20210611 - 9606 619538 OMS - COME/ROM MIM:166760 10|19|3 10q26.3;19q13.42-q13.43;3p25.3 otitis media, susceptibility to unknown - - - chronic/recurrent otitis media 20170408 - 9606 653303 LOC653303 - - - 11 11q23.3 proprotein convertase subtilisin/kexin type 7 pseudogene pseudo - - - - 20211123 - diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index 25584a29..e9160591 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -91,7 +91,7 @@ def test_ensembl_etl(test_get_seqrepo, processed_ids, dynamodb, etl_data_path, shutil.rmtree(e.src_data_dir) e._sequence_location.get_aliases = _get_aliases - e._data_src = etl_data_path / 'ensembl_106.gff3' + e._data_src = etl_data_path / 'ensembl_107.gff3' e._transform_data() e._add_meta() processed_ids += e._processed_ids @@ -148,37 +148,31 @@ def test_merged_conecpts(processed_ids, dynamodb, is_test_env): def test_item_type(dynamodb): """Check that items are tagged with item_type attribute.""" filter_exp = Key('label_and_type').eq('ncbigene:8193##identity') - item = \ - dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + item = dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] assert 'item_type' in item assert item['item_type'] == 'identity' filter_exp = Key('label_and_type').eq('prkrap1##symbol') - item = \ - dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + item = dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] assert 'item_type' in item assert item['item_type'] == 'symbol' filter_exp = Key('label_and_type').eq('loc157663##prev_symbol') - item = \ - dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + item = dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] assert 'item_type' in item assert item['item_type'] == 'prev_symbol' filter_exp = Key('label_and_type').eq('flj23569##alias') - item = \ - dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + item = dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] assert 'item_type' in item assert item['item_type'] == 'alias' filter_exp = Key('label_and_type').eq('omim:606689##associated_with') - item = \ - dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + item = dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] assert 'item_type' in item assert item['item_type'] == 'associated_with' filter_exp = Key('label_and_type').eq('ensembl:ensg00000268895##xref') - item = \ - dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] + item = dynamodb.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] assert 'item_type' in item assert item['item_type'] == 'xref' diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index 907d88e4..a502aba4 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -35,12 +35,9 @@ def ddx11l1(): "location_annotations": [], "locations": [ { - "_id": "ga4gh:VSL.naD2_Q0JKCEKkGj8FvMzerePKnNNcF5N", - "interval": { - "end": {"value": 14409, "type": "Number"}, - "start": {"value": 11868, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.RjgWBC-z7VwJBq_PsWvHEtYXa8CYsN1m", + "end": {"value": 14409, "type": "Number"}, + "start": {"value": 11868, "type": "Number"}, "sequence_id": "ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", "type": "SequenceLocation" } @@ -67,12 +64,9 @@ def tp53(): "location_annotations": [], "locations": [ { - "_id": "ga4gh:VSL.7q-vAjxSYARaPbbUjhDng2oay795NfbE", - "interval": { - "end": {"value": 7687538, "type": "Number"}, - "start": {"value": 7661778, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.fACfpMomyTUpCf5dm5NY3_V9Y-eI3RNJ", + "end": {"value": 7687538, "type": "Number"}, + "start": {"value": 7661778, "type": "Number"}, "sequence_id": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", "type": "SequenceLocation" } @@ -85,26 +79,23 @@ def tp53(): @pytest.fixture(scope="module") -def CH17_340M24_3(): - """Create a CH17-340M24.3 test fixture.""" +def ATP6AP1_DT(): + """Create a ATP6AP1-DT test fixture.""" params = { "match_type": MatchType.NO_MATCH, "concept_id": "ensembl:ENSG00000197180", - "symbol": "CH17-340M24.3", - "label": "uncharacterized protein BC009467", + "symbol": "ATP6AP1-DT", + "label": "ATP6AP1 divergent transcript", "previous_symbols": [], "aliases": [], - "xrefs": ["ncbigene:158960"], + "xrefs": ["hgnc:25138"], "symbol_status": None, "location_annotations": [], "locations": [ { - "_id": "ga4gh:VSL.Qgt1dnZLg46y-lkbsk2lCnlfose0VsFt", - "interval": { - "end": {"value": 154428512, "type": "Number"}, - "start": {"value": 154424377, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.xFs-A2mSsCuLVdaRikxNgVnR4W3IUMom", + "end": {"value": 154428526, "type": "Number"}, + "start": {"value": 154424377, "type": "Number"}, "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", "type": "SequenceLocation" } @@ -131,12 +122,9 @@ def hsa_mir_1253(): "location_annotations": [], "locations": [ { - "_id": "ga4gh:VSL.goBvYPYef2mQildG6AiiRNVhTo-g4-1E", - "interval": { - "end": {"value": 2748182, "type": "Number"}, - "start": {"value": 2748077, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.L55GGleEpudsf12fVDRqwG1X3R3mVTQW", + "end": {"value": 2748182, "type": "Number"}, + "start": {"value": 2748077, "type": "Number"}, "sequence_id": "ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", "type": "SequenceLocation" } @@ -163,12 +151,9 @@ def spry3(): "location_annotations": [], "locations": [ { - "_id": "ga4gh:VSL.7Jax3UNlW_EZrZ44U-R1eLe_OeCC71IR", - "interval": { - "end": {"value": 155782459, "type": "Number"}, - "start": {"value": 155612571, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.w_6vBYiRzkypkRjnNKPKHs_g_NPiGT8n", + "end": {"value": 155782459, "type": "Number"}, + "start": {"value": 155612571, "type": "Number"}, "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", "type": "SequenceLocation" } @@ -220,21 +205,21 @@ def test_tp53(ensembl, tp53): check_resp_single_record(resp, tp53, MatchType.SYMBOL) -def test_CH17_340M24_3(ensembl, CH17_340M24_3): - """Test that CH17-340M24.3 normalizes to correct gene concept.""" +def test_ATP6AP1_DT(ensembl, ATP6AP1_DT): + """Test that ATP6AP1-DT normalizes to correct gene concept.""" # Concept ID resp = ensembl.search("ensembl:ENSG00000197180") - check_resp_single_record(resp, CH17_340M24_3, MatchType.CONCEPT_ID) + check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID) resp = ensembl.search("ENSEMBL:ENSG00000197180") - check_resp_single_record(resp, CH17_340M24_3, MatchType.CONCEPT_ID) + check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID) resp = ensembl.search("ENSG00000197180") - check_resp_single_record(resp, CH17_340M24_3, MatchType.CONCEPT_ID) + check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search("CH17-340M24.3") - check_resp_single_record(resp, CH17_340M24_3, MatchType.SYMBOL) + resp = ensembl.search("ATP6AP1-DT") + check_resp_single_record(resp, ATP6AP1_DT, MatchType.SYMBOL) def test_hsa_mir_1253(ensembl, hsa_mir_1253): @@ -304,9 +289,9 @@ def test_meta_info(ensembl): assert resp.source_meta_.data_license == "custom" assert resp.source_meta_.data_license_url == \ "https://useast.ensembl.org/info/about/legal/disclaimer.html" - assert resp.source_meta_.version == "106" + assert resp.source_meta_.version == "107" assert resp.source_meta_.data_url == \ - "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.106.gff3.gz" # noqa: E501 + "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.107.gff3.gz" # noqa: E501 assert resp.source_meta_.rdp_url is None assert resp.source_meta_.genome_assemblies == ["GRCh38"] assert resp.source_meta_.data_license_attributes == { diff --git a/tests/unit/test_hgnc_source.py b/tests/unit/test_hgnc_source.py index 2408e0e3..1af2140e 100644 --- a/tests/unit/test_hgnc_source.py +++ b/tests/unit/test_hgnc_source.py @@ -36,13 +36,10 @@ def a1bg_as1(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.3Zdz1Stgx8HdWf1cT1KaUHFUQjoKTTcD", + "id": "ga4gh:CL.Rz-M5wA0_bIhQYLKi2ZPqlqW3nBPfAx5", "chr": "19", - "interval": { - "end": "q13.43", - "start": "q13.43", - "type": "CytobandInterval" - }, + "end": "q13.43", + "start": "q13.43", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -83,13 +80,10 @@ def tp53(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL._Cl_XG2bfBUVG6uwi-jHtCHavOAyfPXN", + "id": "ga4gh:CL.BPk3okUhv4BBatjkyC7eQQsyXL6YwmeF", "chr": "17", - "interval": { - "end": "p13.1", - "start": "p13.1", - "type": "CytobandInterval" - }, + "end": "p13.1", + "start": "p13.1", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -146,13 +140,10 @@ def a3galt2(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.Rs8bogwClWoTYjhY9vI9J3wnPEXlao-U", + "id": "ga4gh:CL.iiwv6oaDfVVkjMZ_OH6XEQmM0daVft4u", "chr": "1", - "interval": { - "end": "p35.1", - "start": "p35.1", - "type": "CytobandInterval" - }, + "end": "p35.1", + "start": "p35.1", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -177,7 +168,8 @@ def a3galt2(): "ccds:CCDS60080", "pubmed:10854427", "pubmed:18630988", - "refseq:NM_001080438" + "refseq:NM_001080438", + "omim:619850" ], "gene_type": "gene with protein product" } @@ -196,13 +188,10 @@ def wdhd1(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.R_izmPbRVtPQ2HwflIVh1XLXvRtVi-a7", + "id": "ga4gh:CL.sNe5mpPbxivH2KE6HdaDA3U29BkCQXc3", "chr": "14", - "interval": { - "end": "q22.2", - "start": "q22.3", - "type": "CytobandInterval" - }, + "end": "q22.2", + "start": "q22.3", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -308,13 +297,10 @@ def gage4(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.AlwtARlUTZiNX3NEEKab-X5eeayXd8v8", + "id": "ga4gh:CL.6KzwrFm2WeSXqwIIiNbAu-pKQQHt2q5Q", "chr": "X", - "interval": { - "end": "p11.2", - "start": "p11.4", - "type": "CytobandInterval" - }, + "end": "p11.2", + "start": "p11.4", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -416,13 +402,10 @@ def cecr(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.-hT6Cp6B32GmZTD8BXh1xf6SJeLM1uN7", + "id": "ga4gh:CL.AgASk5sB6LCeaB6rcqOwmrm16ise3pof", "chr": "22", - "interval": { - "end": "q11", - "start": "pter", - "type": "CytobandInterval" - }, + "end": "q11", + "start": "pter", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -452,24 +435,18 @@ def csf2ra(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.B5KOWxL8BQRpM2MOHP-RUmGlmm4ZtMAC", + "id": "ga4gh:CL.cITg67iNn_QNZTKpJd0I-1JMMhW_yHGU", "chr": "X", - "interval": { - "end": "p22.32", - "start": "p22.32", - "type": "CytobandInterval" - }, + "end": "p22.32", + "start": "p22.32", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" }, { - "_id": "ga4gh:VCL.QzbpRniVtZz8V-7B7vKhGeX3A3huKacK", + "id": "ga4gh:CL.2uc0CuKSdD7pkb4jKUqg2eusTXro99wM", "chr": "Y", - "interval": { - "end": "p11.3", - "start": "p11.3", - "type": "CytobandInterval" - }, + "end": "p11.3", + "start": "p11.3", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -523,13 +500,10 @@ def rps24p5(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.dLLTOKtFTnVd3ope5gTii1Gbdj7FxSfa", + "id": "ga4gh:CL.Ri0ddtMpe6DGzrC9_QGbL35gYAtU2bh_", "chr": "1", - "interval": { - "end": "q41", - "start": "p36.13", - "type": "CytobandInterval" - }, + "end": "q41", + "start": "p36.13", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -563,13 +537,10 @@ def trl_cag2_1(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.r_iXu-FjXuJjmeNhmDEputf6tgjXRQIr", + "id": "ga4gh:CL.aZ5aYHaC3GhDWgwhKkAcd9GBvkEo034v", "chr": "16", - "interval": { - "end": "q13", - "start": "q21", - "type": "CytobandInterval" - }, + "end": "q13", + "start": "q21", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -604,13 +575,10 @@ def myo5b(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.1vd9qlPiSSaDZC5X4jIKpapokxvKrITd", + "id": "ga4gh:CL.hFukVqPVLD70cshAz1Gtmd6EC1imobpO", "chr": "18", - "interval": { - "end": "qter", - "start": "cen", - "type": "CytobandInterval" - }, + "end": "qter", + "start": "cen", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -656,13 +624,10 @@ def gstt1(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.EfA-UFrmtjncDxutoiP6PWxu32UtH1Zu", + "id": "ga4gh:CL.g74mxFvAzPoenOlyMjY32j-UFMvjjas_", "chr": "22", - "interval": { - "end": "q11.23", - "start": "q11.23", - "type": "CytobandInterval" - }, + "end": "q11.23", + "start": "q11.23", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -949,7 +914,7 @@ def test_meta_info(hgnc): assert resp.source_meta_.data_license == "custom" assert resp.source_meta_.data_license_url == \ "https://www.genenames.org/about/" - assert datetime.strptime(resp.source_meta_.version, "%Y%m%d") # noqa: E501 + assert datetime.strptime(resp.source_meta_.version, "%Y%m%d") assert resp.source_meta_.data_url == \ "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" # noqa: E501 assert resp.source_meta_.rdp_url is None diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py index 5f403401..4cf81cb8 100644 --- a/tests/unit/test_ncbi_source.py +++ b/tests/unit/test_ncbi_source.py @@ -39,23 +39,17 @@ def dpf1(): "strand": "-", "locations": [ { - "_id": "ga4gh:VCL.nEPKXzyfglrOMMFySOTQ8Om_f6xmr-pP", + "id": "ga4gh:CL.bzgLv8gt3KHK00OWTAEUNZcdgUjbHU8i", "chr": "19", - "interval": { - "end": "q13.2", - "start": "q13.2", - "type": "CytobandInterval" - }, + "end": "q13.2", + "start": "q13.2", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" }, { - "_id": "ga4gh:VSL.MbzGuoGI9MRB8oPe6eE-ULk3FIBdpMF8", - "interval": { - "end": {"value": 38229695, "type": "Number"}, - "start": {"value": 38211005, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.edxKC-S5GBhbpASqBt7BEg6cFr6CcbY3", + "end": {"value": 38229695, "type": "Number"}, + "start": {"value": 38211005, "type": "Number"}, "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", "type": "SequenceLocation" } @@ -82,23 +76,17 @@ def pdp1_symbol(): "strand": "+", "locations": [ { - "_id": "ga4gh:VCL.n9W_wjDCStQf29yPcjhkMnFmESG8wN9A", + "id": "ga4gh:CL.cJsZWKrEtzpFn5psdCtgofb6NaEDVPfB", "chr": "8", - "interval": { - "end": "q22.1", - "start": "q22.1", - "type": "CytobandInterval" - }, + "end": "q22.1", + "start": "q22.1", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" }, { - "_id": "ga4gh:VSL.KmLM61Mm2jxuep7cdgg7lvOOXaIxSW0Y", - "interval": { - "end": {"value": 93926068, "type": "Number"}, - "start": {"value": 93916922, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.HJJGMEo7TZQIwsq64prLwLhYzCkaGaDt", + "end": {"value": 93926068, "type": "Number"}, + "start": {"value": 93916922, "type": "Number"}, "sequence_id": "ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs", "type": "SequenceLocation" } @@ -126,23 +114,17 @@ def pdp1_alias(): "strand": "+", "locations": [ { - "_id": "ga4gh:VCL.MOV-2FXnbeP1BrCwBQLN0aRpVLGFs-Mb", + "id": "ga4gh:CL.7ivmMgKAqiFiRh1qsbA909w2kUcPabr_", "chr": "9", - "interval": { - "end": "p24.1", - "start": "p24.1", - "type": "CytobandInterval" - }, + "end": "p24.1", + "start": "p24.1", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" }, { - "_id": "ga4gh:VSL.X_fA-Sjsw2ZreE9rykzPNzCFctpFGG-u", - "interval": { - "end": {"value": 4665258, "type": "Number"}, - "start": {"value": 4662293, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.jduggMCta37Xu9RgW251jNd7tmEuUOtw", + "end": {"value": 4665258, "type": "Number"}, + "start": {"value": 4662293, "type": "Number"}, "sequence_id": "ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", "type": "SequenceLocation" } @@ -170,44 +152,32 @@ def spry3(): "strand": "+", "locations": [ { - "_id": "ga4gh:VCL.A1s9hZY1tgmRi1WuXM1ETZOqJcpo4Ftx", + "id": "ga4gh:CL.r8Qv_b-B3SoguReqdunL3GCkt1RH-es1", "chr": "Y", - "interval": { - "end": "q12", - "start": "q12", - "type": "CytobandInterval" - }, + "end": "q12", + "start": "q12", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" }, { - "_id": "ga4gh:VCL.fEBeCyej0jVKsvjw4vxyW6j1h8UVLb5S", + "id": "ga4gh:CL.p5Va-YpCTrSTYWyJrpR-rvnxO1YWPIDY", "chr": "X", - "interval": { - "end": "q28", - "start": "q28", - "type": "CytobandInterval" - }, + "end": "q28", + "start": "q28", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" }, { - "_id": "ga4gh:VSL.r6_z0hmAdPdufX0g1ciRj_zPU6poQviA", - "interval": { - "end": {"value": 155782459, "type": "Number"}, - "start": {"value": 155612585, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.kaGMGXo0NREqH_gsgDwqfQMnBSem3OP1", + "end": {"value": 155782459, "type": "Number"}, + "start": {"value": 155612585, "type": "Number"}, "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", "type": "SequenceLocation" }, { - "_id": "ga4gh:VSL.Cr_HtUTpUe6KB37Y7zOTDbx9JglIzE1O", - "interval": { - "end": {"value": 56968979, "type": "Number"}, - "start": {"value": 56923422, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.I51347TrFRIHMT8Bg2iFbKP22_yFxXQb", + "end": {"value": 56968979, "type": "Number"}, + "start": {"value": 56954315, "type": "Number"}, "sequence_id": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", "type": "SequenceLocation" } @@ -279,23 +249,17 @@ def znf84(): "strand": "+", "locations": [ { - "_id": "ga4gh:VCL.CusjBE-q66vf4v8VSHRhMxjR_4G688Ve", + "id": "ga4gh:CL.6YvQEs6MuHuNvt0Vlv8r4hMKIOK5Ktq4", "chr": "12", - "interval": { - "end": "q24.33", - "start": "q24.33", - "type": "CytobandInterval" - }, + "end": "q24.33", + "start": "q24.33", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" }, { - "_id": "ga4gh:VSL.w5FE3al-0SUkARxk_RdCD5ypYIh_WtSM", - "interval": { - "end": {"value": 133063299, "type": "Number"}, - "start": {"value": 133037300, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.M67HYj0B8ocTAWgZ9zUEourxxna6ZleJ", + "end": {"value": 133063299, "type": "Number"}, + "start": {"value": 133037508, "type": "Number"}, "sequence_id": "ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", "type": "SequenceLocation" } @@ -323,22 +287,16 @@ def slc25a6(): "strand": "-", "locations": [ { - "_id": "ga4gh:VSL.HG0bXHwmZoxZzU2ckz4T6lvxIswXhLQZ", - "interval": { - "end": {"value": 1392113, "type": "Number"}, - "start": {"value": 1386151, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.bJtaHxlESQXto-K8WCE4jUS_uncnJP3l", + "end": {"value": 1392113, "type": "Number"}, + "start": {"value": 1386151, "type": "Number"}, "sequence_id": "ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", "type": "SequenceLocation" }, { - "_id": "ga4gh:VSL.1J-MNAWJ9hvZtIM_90lqLbxEt707zL_A", - "interval": { - "end": {"value": 1392113, "type": "Number"}, - "start": {"value": 1386151, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.fkr3SI-mzvw2IJgPm3ck9k5pQtbJ8BvX", + "end": {"value": 1392113, "type": "Number"}, + "start": {"value": 1386151, "type": "Number"}, "sequence_id": "ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", "type": "SequenceLocation" } @@ -366,13 +324,10 @@ def loc106783576(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.RFN35KQMhqzhmo4QP7AxKAzlPtnh7slL", + "id": "ga4gh:CL.YYGQrLtmKwKgp38asAkHT8AydAidnui8", "chr": "10", - "interval": { - "end": "cen", - "start": "pter", - "type": "CytobandInterval" - }, + "end": "cen", + "start": "pter", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -400,13 +355,10 @@ def glc1b(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.HStPIl_6UkNQmbjZW1TeUmHFMptbIj6t", + "id": "ga4gh:CL.8D0hLCktRxyPrx4Etgabq10vEq6TtU43", "chr": "2", - "interval": { - "end": "q13", - "start": "cen", - "type": "CytobandInterval" - }, + "end": "q13", + "start": "cen", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -434,13 +386,10 @@ def hdpa(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.faRHNO_VJMssbjYQ628mfdRgLqg9qK2b", + "id": "ga4gh:CL.kl9HXvnUCE6Z1ktXibt83NBdXvxnT2RA", "chr": "X", - "interval": { - "end": "p22.32", - "start": "pter", - "type": "CytobandInterval" - }, + "end": "p22.32", + "start": "pter", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -469,33 +418,24 @@ def prkrap1(): "strand": "+", "locations": [ { - "_id": "ga4gh:VCL.HeTd-jABCr22v4rUfVWJbkz2NkPyGScK", + "id": "ga4gh:CL.FYt7UkCHZVLpkYe7zhNdMk1K6lxl_k7I", "chr": "6", - "interval": { - "end": "p21.3", - "start": "p21.3", - "type": "CytobandInterval" - }, + "end": "p21.3", + "start": "p21.3", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" }, { - "_id": "ga4gh:VSL.WB_2IFcms7VmbkPBXUgUaH-R1EdKRs4s", - "interval": { - "end": {"value": 3941874, "type": "Number"}, - "start": {"value": 3940269, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.4rniEdEGTHBg9ZkvZLEVPR_MSBUlU2ih", + "end": {"value": 3941874, "type": "Number"}, + "start": {"value": 3940269, "type": "Number"}, "sequence_id": "ga4gh:SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1", "type": "SequenceLocation" }, { - "_id": "ga4gh:VSL.PIeADExe9_iSJkTLQbSvhxAJ8PM19R6r", - "interval": { - "end": {"value": 3932085, "type": "Number"}, - "start": {"value": 3930480, "type": "Number"}, - "type": "SequenceInterval" - }, + "id": "ga4gh:SL.qlH29_Ijp2JDyb29kxyCrtLOBa0NNx9j", + "end": {"value": 3932085, "type": "Number"}, + "start": {"value": 3930480, "type": "Number"}, "sequence_id": "ga4gh:SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-", "type": "SequenceLocation" } @@ -523,13 +463,10 @@ def mhb(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.2WDJu032Gc_9BN4qiNELb577XomiZv8z", + "id": "ga4gh:CL.6vlmdqdXYxSAGsJI9no7kLN5iLKpvr5X", "chr": "3", - "interval": { - "end": "p21.32", - "start": "p22.2", - "type": "CytobandInterval" - }, + "end": "p21.32", + "start": "p22.2", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -557,13 +494,10 @@ def spg37(): "strand": None, "locations": [ { - "_id": "ga4gh:VCL.P5jAIluXneqHZMV9FBEQ2ZqOpO-8fqbP", + "id": "ga4gh:CL.XWbwTwmJ95KD-aCuXfJcD8cNIvXbiXRh", "chr": "8", - "interval": { - "end": "q13.3", - "start": "p21.2", - "type": "CytobandInterval" - }, + "end": "q13.3", + "start": "p21.2", "species_id": "taxonomy:9606", "type": "ChromosomeLocation" } @@ -842,11 +776,9 @@ def test_no_match(ncbi): assert response.source_meta_.data_license_url == \ "https://www.ncbi.nlm.nih.gov/home/about/policies/" assert datetime.strptime(response.source_meta_.version, "%Y%m%d") - assert response.source_meta_.data_url == \ - "ftp://ftp.ncbi.nlm.nih.gov" - assert response.source_meta_.rdp_url == \ - "https://reusabledata.org/ncbi-gene.html" - assert not response.source_meta_.data_license_attributes["non_commercial"] # noqa: E501 + assert response.source_meta_.data_url == "ftp://ftp.ncbi.nlm.nih.gov" + assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html" + assert not response.source_meta_.data_license_attributes["non_commercial"] assert not response.source_meta_.data_license_attributes["share_alike"] assert not response.source_meta_.data_license_attributes["attribution"] @@ -888,10 +820,8 @@ def test_meta(ncbi): assert response.source_meta_.data_license_url == \ "https://www.ncbi.nlm.nih.gov/home/about/policies/" assert datetime.strptime(response.source_meta_.version, "%Y%m%d") - assert response.source_meta_.data_url == \ - "ftp://ftp.ncbi.nlm.nih.gov" - assert response.source_meta_.rdp_url == \ - "https://reusabledata.org/ncbi-gene.html" + assert response.source_meta_.data_url == "ftp://ftp.ncbi.nlm.nih.gov" + assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html" assert response.source_meta_.genome_assemblies == ["GRCh38.p14"] assert response.source_meta_.data_license_attributes == { "non_commercial": False, diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 88094094..65ded58b 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -85,15 +85,12 @@ def normalized_ache(): { "name": "chromosome_location", "value": { - "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", + "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5", "type": "ChromosomeLocation", "species_id": "taxonomy:9606", "chr": "7", - "interval": { - "end": "q22.1", - "start": "q22.1", - "type": "CytobandInterval" - } + "end": "q22.1", + "start": "q22.1" }, "type": "Extension" }, @@ -170,15 +167,12 @@ def normalized_braf(): { "name": "chromosome_location", "value": { - "_id": "ga4gh:VCL.O6yCQ1cnThOrTfK9YUgMlTfM6HTqbrKw", + "id": "ga4gh:CL.ZZZYpOwuW1BLLJXc_Dm4eVZ5E0smVYCc", "type": "ChromosomeLocation", "species_id": "taxonomy:9606", "chr": "7", - "interval": { - "end": "q34", - "start": "q34", - "type": "CytobandInterval" - } + "end": "q34", + "start": "q34", }, "type": "Extension" }, @@ -269,15 +263,12 @@ def normalized_abl1(): { "name": "chromosome_location", "value": { - "_id": "ga4gh:VCL.WvMfE67KxSDAV8JaK593TI74yyJWIsMQ", + "id": "ga4gh:CL.1vsxettosueUHyFIOoTPzwIFD1DodLuT", "type": "ChromosomeLocation", "species_id": "taxonomy:9606", "chr": "9", - "interval": { - "end": "q34.12", - "start": "q34.12", - "type": "CytobandInterval" - } + "end": "q34.12", + "start": "q34.12" }, "type": "Extension" }, @@ -319,7 +310,8 @@ def normalized_p150(): "CAF-1", "P150", "CAF1B", - "CAF1" + "CAF1", + "LOC107985297" ], "extensions": [ { @@ -349,15 +341,12 @@ def normalized_p150(): { "name": "chromosome_location", "value": { - "_id": "ga4gh:VCL.yF2TzeunqY92v3yhDsCR_t5X997mWriF", + "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ", "type": "ChromosomeLocation", "species_id": "taxonomy:9606", "chr": "19", - "interval": { - "end": "p13.3", - "start": "p13.3", - "type": "CytobandInterval" - } + "end": "p13.3", + "start": "p13.3" }, "type": "Extension" }, @@ -375,6 +364,11 @@ def normalized_p150(): "name": "ensembl_biotype", "type": "Extension", "value": "protein_coding" + }, + { + "name": "previous_symbols", + "type": "Extension", + "value": ["LOC107985297"] } ] } @@ -399,21 +393,18 @@ def normalized_loc_653303(): { "type": "Extension", "name": "approved_name", - "value": "proprotein convertase subtilisin/kexin type 7 pseudogene" # noqa: E501 + "value": "proprotein convertase subtilisin/kexin type 7 pseudogene" }, { "type": "Extension", "name": "chromosome_location", "value": { - "_id": "ga4gh:VCL.WzURLvTklFI7K2GAP8gIw6vgWDWXMXuW", + "id": "ga4gh:CL.82tL1yxucvwp5U2Yo4jNYX06pru8zZYl", "type": "ChromosomeLocation", "species_id": "taxonomy:9606", "chr": "11", - "interval": { - "type": "CytobandInterval", - "start": "q23.3", - "end": "q23.3" - } + "start": "q23.3", + "end": "q23.3" } }, { @@ -456,29 +447,23 @@ def normalize_unmerged_loc_653303(): "locations": [ { "type": "ChromosomeLocation", - "_id": "ga4gh:VCL.WzURLvTklFI7K2GAP8gIw6vgWDWXMXuW", + "id": "ga4gh:CL.82tL1yxucvwp5U2Yo4jNYX06pru8zZYl", "species_id": "taxonomy:9606", "chr": "11", - "interval": { - "type": "CytobandInterval", - "start": "q23.3", - "end": "q23.3" - } + "start": "q23.3", + "end": "q23.3" }, { - "_id": "ga4gh:VSL.dhj3ZilmW0bmmUjUvrG7zCWwsPn-7XyB", + "id": "ga4gh:SL.zfPjFTCoZIj-gz5tEBDQwKEOfGBxF9kF", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", # noqa: E501 - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 117135528 - }, - "end": { - "type": "Number", - "value": 117138867 - } + "start": { + "type": "Number", + "value": 117135528 + }, + "end": { + "type": "Number", + "value": 117138867 } } ], @@ -516,14 +501,11 @@ def normalize_unmerged_chaf1a(): "locations": [ { "type": "ChromosomeLocation", - "_id": "ga4gh:VCL.yF2TzeunqY92v3yhDsCR_t5X997mWriF", + "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ", "species_id": "taxonomy:9606", "chr": "19", - "interval": { - "type": "CytobandInterval", - "start": "p13.3", - "end": "p13.3" - } + "start": "p13.3", + "end": "p13.3" } ], "aliases": [ @@ -564,19 +546,16 @@ def normalize_unmerged_chaf1a(): "location_annotations": [], "locations": [ { - "_id": "ga4gh:VSL.VVxEanUPWWMy_IChkj_kPIpRnYAatqrq", + "id": "ga4gh:SL.kL2Hr6VSSiPQyjE_pJ1Ta5viB-yuJchF", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", # noqa: E501 - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 4402639 - }, - "end": { - "type": "Number", - "value": 4445018 - } + "start": { + "type": "Number", + "value": 4402639 + }, + "end": { + "type": "Number", + "value": 4445018 } } ], @@ -602,29 +581,23 @@ def normalize_unmerged_chaf1a(): "locations": [ { "type": "ChromosomeLocation", - "_id": "ga4gh:VCL.yF2TzeunqY92v3yhDsCR_t5X997mWriF", + "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ", "species_id": "taxonomy:9606", "chr": "19", - "interval": { - "type": "CytobandInterval", - "start": "p13.3", - "end": "p13.3" - } + "start": "p13.3", + "end": "p13.3" }, { - "_id": "ga4gh:VSL.X4HEwp9RgFN5WpmJM4bWpcOcN9qHX-hj", + "id": "ga4gh:SL.LTvvCAh9DqsH8G0A_rNgPafqvPmWk0tn", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", # noqa: E501 - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 4402595 - }, - "end": { - "type": "Number", - "value": 4448322 - } + "start": { + "type": "Number", + "value": 4402639 + }, + "end": { + "type": "Number", + "value": 4450830 } } ], @@ -635,7 +608,7 @@ def normalize_unmerged_chaf1a(): "CAF1B", "CAF-1" ], - "previous_symbols": [], + "previous_symbols": ["LOC107985297"], "xrefs": [ "ensembl:ENSG00000167670", "hgnc:1910" @@ -669,29 +642,23 @@ def normalize_unmerged_ache(): "locations": [ { "type": "ChromosomeLocation", - "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", + "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5", "species_id": "taxonomy:9606", "chr": "7", - "interval": { - "type": "CytobandInterval", - "start": "q22.1", - "end": "q22.1" - } + "start": "q22.1", + "end": "q22.1" }, { - "_id": "ga4gh:VSL.EepkXho2doYcUT1DW54fT1a00_zkqrn0", + "id": "ga4gh:SL.UZ4q9hFsiwhOMyB4nDftMupd-i0OVu4w", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 100889993 - }, - "end": { - "type": "Number", - "value": 100896994 - } + "start": { + "type": "Number", + "value": 100889993 + }, + "end": { + "type": "Number", + "value": 100896994 } } ], @@ -726,19 +693,16 @@ def normalize_unmerged_ache(): "location_annotations": [], "locations": [ { - "_id": "ga4gh:VSL.AF6wPZclBqTauGr3yx_CqmMndLKhq0Cm", + "id": "ga4gh:SL.K5fLruGxiT-ls_GjUrNpNm3tV01hhlgQ", "type": "SequenceLocation", "sequence_id": "ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", # noqa: E501 - "interval": { - "type": "SequenceInterval", - "start": { - "type": "Number", - "value": 100889993 - }, - "end": { - "type": "Number", - "value": 100896974 - } + "start": { + "type": "Number", + "value": 100889993 + }, + "end": { + "type": "Number", + "value": 100896974 } } ], @@ -762,14 +726,11 @@ def normalize_unmerged_ache(): "locations": [ { "type": "ChromosomeLocation", - "_id": "ga4gh:VCL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", + "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5", "species_id": "taxonomy:9606", "chr": "7", - "interval": { - "type": "CytobandInterval", - "start": "q22.1", - "end": "q22.1" - } + "start": "q22.1", + "end": "q22.1" } ], "aliases": [ @@ -857,19 +818,15 @@ def compare_unmerged_record(gene, test_gene): assert gene.label == test_gene.label assert gene.concept_id == test_gene.concept_id assert set(gene.aliases) == set(test_gene.aliases) - assert set(gene.xrefs) == \ - set(test_gene.xrefs) + assert set(gene.xrefs) == set(test_gene.xrefs) assert gene.symbol_status == test_gene.symbol_status - assert set(gene.previous_symbols) == \ - set(test_gene.previous_symbols) - assert set(gene.associated_with) == \ - set(test_gene.associated_with) + assert set(gene.previous_symbols) == set(test_gene.previous_symbols) + assert set(gene.associated_with) == set(test_gene.associated_with) assert gene.symbol == test_gene.symbol assert len(gene.locations) == len(test_gene.locations) for loc in gene.locations: assert loc in test_gene.locations - assert set(gene.location_annotations) == \ - set(test_gene.location_annotations) + assert set(gene.location_annotations) == set(test_gene.location_annotations) assert gene.strand == test_gene.strand assert gene.gene_type == test_gene.gene_type @@ -900,8 +857,7 @@ def compare_service_meta(service_meta): assert service_meta.name == "gene-normalizer" assert service_meta.version >= "0.1.0" assert isinstance(service_meta.response_datetime, str) - assert service_meta.url == \ - 'https://github.com/cancervariants/gene-normalization' + assert service_meta.url == 'https://github.com/cancervariants/gene-normalization' def compare_gene_descriptor(test, actual): @@ -912,28 +868,24 @@ def compare_gene_descriptor(test, actual): assert actual.label == test.label if actual.xrefs or test.xrefs: assert set(actual.xrefs) == set(test.xrefs), "xrefs" - assert set(actual.alternate_labels) == set(test.alternate_labels), \ - "alt labels" + assert set(actual.alternate_labels) == set(test.alternate_labels), "alt labels" extensions_present = "extensions" in test.__fields__.keys() assert ("extensions" in actual.__fields__.keys()) == extensions_present if extensions_present: - assert len(actual.extensions) == len(test.extensions), \ - "len of extensions" + assert len(actual.extensions) == len(test.extensions), "len of extensions" n_ext_correct = 0 for test_ext in test.extensions: for actual_ext in actual.extensions: if actual_ext.name == test_ext.name: - assert isinstance(actual_ext.value, - type(test_ext.value)) + assert isinstance(actual_ext.value, type(test_ext.value)) if isinstance(test_ext.value, list): assert set(actual_ext.value) == \ - set(test_ext.value), f"{test_ext.value} value" # noqa: E501 + set(test_ext.value), f"{test_ext.value} value" else: assert actual_ext.value == test_ext.value assert actual_ext.type == test_ext.type n_ext_correct += 1 - assert n_ext_correct == len(test.extensions), \ - "number of correct extensions" + assert n_ext_correct == len(test.extensions), "number of correct extensions" def test_search_query(query_handler, num_sources): @@ -979,10 +931,10 @@ def test_search_query_inc_exc(query_handler, num_sources): def test_search_invalid_parameter_exception(query_handler): """Test that Invalid parameter exception works correctly.""" with pytest.raises(InvalidParameterException): - resp = query_handler.search('BRAF', keyed=True, incl='hgn') # noqa: F841, E501 + resp = query_handler.search('BRAF', keyed=True, incl='hgn') # noqa: F841 with pytest.raises(InvalidParameterException): - resp = query_handler.search('BRAF', incl='hgnc', excl='hgnc') # noqa: F841, E501 + resp = query_handler.search('BRAF', incl='hgnc', excl='hgnc') # noqa: F841 def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): @@ -991,31 +943,24 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): resp = query_handler.search('ncbigene:43', keyed=True) matches = resp.source_matches assert len(matches) == num_sources - assert matches[SourceName.HGNC.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.HGNC.value].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL.value].records) == 0 - assert matches[SourceName.NCBI.value].records[0].match_type == \ - MatchType.CONCEPT_ID + assert matches[SourceName.NCBI.value].records[0].match_type == MatchType.CONCEPT_ID resp = query_handler.search('hgnc:108', keyed=True) matches = resp.source_matches assert len(matches) == num_sources - assert matches[SourceName.HGNC.value].records[0].match_type == \ - MatchType.CONCEPT_ID - assert matches[SourceName.ENSEMBL.value].records[0].match_type == \ - MatchType.XREF - assert matches[SourceName.NCBI.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.HGNC.value].records[0].match_type == MatchType.CONCEPT_ID + assert matches[SourceName.ENSEMBL.value].records[0].match_type == MatchType.XREF + assert matches[SourceName.NCBI.value].records[0].match_type == MatchType.XREF resp = query_handler.search('ensembl:ENSG00000087085', keyed=True) matches = resp.source_matches assert len(matches) == num_sources - assert matches[SourceName.HGNC.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.HGNC.value].records[0].match_type == MatchType.XREF assert matches[SourceName.ENSEMBL.value].records[0].match_type == \ MatchType.CONCEPT_ID - assert matches[SourceName.NCBI.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.NCBI.value].records[0].match_type == MatchType.XREF # Normalize q = "ACHE" @@ -1086,31 +1031,24 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): resp = query_handler.search('ncbigene:673', keyed=True) matches = resp.source_matches assert len(matches) == num_sources - assert matches[SourceName.HGNC.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.HGNC.value].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL.value].records) == 0 - assert matches[SourceName.NCBI.value].records[0].match_type == \ - MatchType.CONCEPT_ID + assert matches[SourceName.NCBI.value].records[0].match_type == MatchType.CONCEPT_ID resp = query_handler.search('hgnc:1097', keyed=True) matches = resp.source_matches assert len(matches) == num_sources - assert matches[SourceName.HGNC.value].records[0].match_type ==\ - MatchType.CONCEPT_ID - assert matches[SourceName.ENSEMBL.value].records[0].match_type == \ - MatchType.XREF - assert matches[SourceName.NCBI.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.HGNC.value].records[0].match_type == MatchType.CONCEPT_ID + assert matches[SourceName.ENSEMBL.value].records[0].match_type == MatchType.XREF + assert matches[SourceName.NCBI.value].records[0].match_type == MatchType.XREF resp = query_handler.search('ensembl:ENSG00000157764', keyed=True) matches = resp.source_matches assert len(matches) == num_sources - assert matches[SourceName.HGNC.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.HGNC.value].records[0].match_type == MatchType.XREF assert matches[SourceName.ENSEMBL.value].records[0].match_type == \ MatchType.CONCEPT_ID - assert matches[SourceName.NCBI.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.NCBI.value].records[0].match_type == MatchType.XREF # Normalize q = "BRAF" @@ -1169,31 +1107,24 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): resp = query_handler.search('ncbigene:25', keyed=True) matches = resp.source_matches assert len(matches) == num_sources - assert matches[SourceName.HGNC.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.HGNC.value].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL.value].records) == 0 - assert matches[SourceName.NCBI.value].records[0].match_type == \ - MatchType.CONCEPT_ID + assert matches[SourceName.NCBI.value].records[0].match_type == MatchType.CONCEPT_ID resp = query_handler.search('hgnc:76', keyed=True) matches = resp.source_matches assert len(matches) == num_sources - assert matches[SourceName.HGNC.value].records[0].match_type == \ - MatchType.CONCEPT_ID - assert matches[SourceName.ENSEMBL.value].records[0].match_type == \ - MatchType.XREF - assert matches[SourceName.NCBI.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.HGNC.value].records[0].match_type == MatchType.CONCEPT_ID + assert matches[SourceName.ENSEMBL.value].records[0].match_type == MatchType.XREF + assert matches[SourceName.NCBI.value].records[0].match_type == MatchType.XREF resp = query_handler.search('ensembl:ENSG00000097007', keyed=True) matches = resp.source_matches assert len(matches) == num_sources - assert matches[SourceName.HGNC.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.HGNC.value].records[0].match_type == MatchType.XREF assert matches[SourceName.ENSEMBL.value].records[0].match_type == \ MatchType.CONCEPT_ID - assert matches[SourceName.NCBI.value].records[0].match_type == \ - MatchType.XREF + assert matches[SourceName.NCBI.value].records[0].match_type == MatchType.XREF # Normalize q = "ABL1" diff --git a/tests/unit/test_schemas.py b/tests/unit/test_schemas.py index e91bf004..a96ee616 100644 --- a/tests/unit/test_schemas.py +++ b/tests/unit/test_schemas.py @@ -3,44 +3,27 @@ import pydantic from gene.schemas import Gene from ga4gh.vrsatile.pydantic.vrs_models import ChromosomeLocation, \ - SequenceLocation, Gene as GeneValueObject, CytobandInterval, \ - SequenceInterval, Number + SequenceLocation, Gene as GeneValueObject, Number @pytest.fixture(scope='module') -def cytoband_interval(): - """Create a valid cytoband interval test fixture.""" - return CytobandInterval( - start='q34', - end='q34' - ) - - -@pytest.fixture(scope='module') -def chromosome_location(cytoband_interval): +def chromosome_location(): """Create a valid chromosome location test fixture.""" return ChromosomeLocation( species_id='taxonomy:9606', chr='7', - interval=cytoband_interval - ) - - -@pytest.fixture(scope='module') -def sequence_interval(): - """Create a valid simple interval test fixture.""" - return SequenceInterval( - start=Number(value=140719327), - end=Number(value=140924929) + start='q34', + end='q34' ) @pytest.fixture(scope='module') -def sequence_location(sequence_interval): +def sequence_location(): """Create a valid sequence location test fixture.""" return SequenceLocation( sequence_id='ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', - interval=sequence_interval + start=Number(value=140719327), + end=Number(value=140924929) )