diff --git a/docs/scripts/generate_normalize_figure.py b/docs/scripts/generate_normalize_figure.py index d5863a38..aac3dcae 100644 --- a/docs/scripts/generate_normalize_figure.py +++ b/docs/scripts/generate_normalize_figure.py @@ -18,9 +18,9 @@ from gene.schemas import UnmergedNormalizationService COLORS = [ - '#F8766D', - '#00BA38', - '#00B9E3', + "#F8766D", + "#00BA38", + "#00B9E3", ] @@ -30,50 +30,50 @@ def create_gjgf(result: UnmergedNormalizationService) -> Dict: :param result: result from Unmerged Normalization search """ graph = { - 'graph': { - 'label': 'tmp', - 'nodes': {}, - 'edges': [], - 'metadata': { - 'arrow_size': 15, - 'node_size': 15, - 'node_label_size': 20, - 'edge_size': 2, + "graph": { + "label": "tmp", + "nodes": {}, + "edges": [], + "metadata": { + "arrow_size": 15, + "node_size": 15, + "node_label_size": 20, + "edge_size": 2, }, } } for i, (_, matches) in enumerate(result.source_matches.items()): for match in matches.records: - graph['graph']['nodes'][match.concept_id] = { - 'metadata': { - 'color': COLORS[i], - 'hover': f'{match.concept_id}\n{match.symbol}\n{match.label}', - 'click': f"

{json.dumps(match.model_dump(), indent=2)}

", + graph["graph"]["nodes"][match.concept_id] = { + "metadata": { + "color": COLORS[i], + "hover": f"{match.concept_id}\n{match.symbol}\n{match.label}", + "click": f"

{json.dumps(match.model_dump(), indent=2)}

", } } for xref in match.xrefs: - graph['graph']['edges'].append( - {'source': match.concept_id, 'target': xref} + graph["graph"]["edges"].append( + {"source": match.concept_id, "target": xref} ) included_edges = [] - for edge in graph['graph']['edges']: + for edge in graph["graph"]["edges"]: if ( - edge['target'] in graph['graph']['nodes'] - and edge['source'] in graph['graph']['nodes'] + edge["target"] in graph["graph"]["nodes"] + and edge["source"] in graph["graph"]["nodes"] ): included_edges.append(edge) - graph['graph']['edges'] = included_edges + graph["graph"]["edges"] = included_edges - included_nodes = {k['source'] for k in graph['graph']['edges']}.union( - {k['target'] for k in graph['graph']['edges']} + included_nodes = {k["source"] for k in graph["graph"]["edges"]}.union( + {k["target"] for k in graph["graph"]["edges"]} ) new_nodes = {} - for key, value in graph['graph']['nodes'].items(): + for key, value in graph["graph"]["nodes"].items(): if key in included_nodes: new_nodes[key] = value - graph['graph']['nodes'] = new_nodes + graph["graph"]["nodes"] = new_nodes return graph @@ -82,8 +82,8 @@ def gen_norm_figure() -> None: """Generate normalized graph figure for docs.""" q = QueryHandler(create_db()) - otx2p1 = 'OTX2P1' - otx2p2 = 'OTX2P2' + otx2p1 = "OTX2P1" + otx2p2 = "OTX2P2" otx2p1_result = q.normalize_unmerged(otx2p1) otx2p2_result = q.normalize_unmerged(otx2p2) @@ -91,15 +91,15 @@ def gen_norm_figure() -> None: otx2p1_graph = create_gjgf(otx2p1_result) otx2p2_graph = create_gjgf(otx2p2_result) - nodes = otx2p1_graph['graph']['nodes'] - nodes.update(otx2p2_graph['graph']['nodes']) + nodes = otx2p1_graph["graph"]["nodes"] + nodes.update(otx2p2_graph["graph"]["nodes"]) graph = { - 'graph': { - 'label': f'Reference network for {otx2p1} and {otx2p2}', - 'metadata': otx2p1_graph['graph']['metadata'], - 'nodes': nodes, - 'edges': otx2p1_graph['graph']['edges'] + otx2p2_graph['graph']['edges'], + "graph": { + "label": f"Reference network for {otx2p1} and {otx2p2}", + "metadata": otx2p1_graph["graph"]["metadata"], + "nodes": nodes, + "edges": otx2p1_graph["graph"]["edges"] + otx2p2_graph["graph"]["edges"], } } @@ -107,20 +107,20 @@ def gen_norm_figure() -> None: data=graph, graph_height=250, node_hover_neighborhood=True, - node_label_font='arial', + node_label_font="arial", ) fig.export_html( ( APP_ROOT.parents[0] - / 'docs' - / 'source' - / '_static' - / 'html' - / 'normalize_example.html' + / "docs" + / "source" + / "_static" + / "html" + / "normalize_example.html" ).absolute(), overwrite=True, ) -if __name__ == '__main__': +if __name__ == "__main__": gen_norm_figure() diff --git a/src/gene/__init__.py b/src/gene/__init__.py index 9dd82081..36b6fb46 100644 --- a/src/gene/__init__.py +++ b/src/gene/__init__.py @@ -6,22 +6,22 @@ APP_ROOT = Path(__file__).resolve().parent logging.basicConfig( - filename='gene.log', format='[%(asctime)s] - %(name)s - %(levelname)s : %(message)s' + filename="gene.log", format="[%(asctime)s] - %(name)s - %(levelname)s : %(message)s" ) -logger = logging.getLogger('gene') +logger = logging.getLogger("gene") logger.setLevel(logging.DEBUG) logger.handlers = [] -logging.getLogger('boto3').setLevel(logging.INFO) -logging.getLogger('botocore').setLevel(logging.INFO) -logging.getLogger('urllib3').setLevel(logging.INFO) -logging.getLogger('python_jsonschema_objects').setLevel(logging.INFO) -logging.getLogger('biocommons.seqrepo.seqaliasdb.seqaliasdb').setLevel(logging.INFO) -logging.getLogger('biocommons.seqrepo.fastadir.fastadir').setLevel(logging.INFO) +logging.getLogger("boto3").setLevel(logging.INFO) +logging.getLogger("botocore").setLevel(logging.INFO) +logging.getLogger("urllib3").setLevel(logging.INFO) +logging.getLogger("python_jsonschema_objects").setLevel(logging.INFO) +logging.getLogger("biocommons.seqrepo.seqaliasdb.seqaliasdb").setLevel(logging.INFO) +logging.getLogger("biocommons.seqrepo.fastadir.fastadir").setLevel(logging.INFO) SEQREPO_ROOT_DIR = Path( - environ.get('SEQREPO_ROOT_DIR', '/usr/local/share/seqrepo/latest') + environ.get("SEQREPO_ROOT_DIR", "/usr/local/share/seqrepo/latest") ) @@ -59,5 +59,5 @@ class DownloadException(Exception): # noqa: N818 NAMESPACE_LOOKUP = { v.value.lower(): NamespacePrefix[k].value for k, v in SourceIDAfterNamespace.__members__.items() - if v.value != '' + if v.value != "" } diff --git a/src/gene/cli.py b/src/gene/cli.py index f19a6537..26af3380 100644 --- a/src/gene/cli.py +++ b/src/gene/cli.py @@ -17,13 +17,13 @@ from gene.database.database import DatabaseException from gene.schemas import SourceName -logger = logging.getLogger('gene') +logger = logging.getLogger("gene") logger.setLevel(logging.DEBUG) @click.command() -@click.option('--db_url', help='URL endpoint for the application database.') -@click.option('--verbose', '-v', is_flag=True, help='Print result to console if set.') +@click.option("--db_url", help="URL endpoint for the application database.") +@click.option("--verbose", "-v", is_flag=True, help="Print result to console if set.") def check_db(db_url: str, verbose: bool = False) -> None: """Perform basic checks on DB health and population. Exits with status code 1 if DB schema is uninitialized or if critical tables appear to be empty. @@ -35,21 +35,21 @@ def check_db(db_url: str, verbose: bool = False) -> None: db = create_db(db_url, False) if not db.check_schema_initialized(): if verbose: - click.echo('Health check failed: DB schema uninitialized.') + click.echo("Health check failed: DB schema uninitialized.") click.get_current_context().exit(1) if not db.check_tables_populated(): if verbose: - click.echo('Health check failed: DB is incompletely populated.') + click.echo("Health check failed: DB is incompletely populated.") click.get_current_context().exit(1) if verbose: - click.echo('DB health check successful: tables appear complete.') + click.echo("DB health check successful: tables appear complete.") @click.command() -@click.option('--data_url', help='URL to data dump') -@click.option('--db_url', help='URL endpoint for the application database.') +@click.option("--data_url", help="URL to data dump") +@click.option("--db_url", help="URL endpoint for the application database.") def update_from_remote(data_url: Optional[str], db_url: str) -> None: """Update data from remotely-hosted DB dump. By default, fetches from latest available dump on VICC S3 bucket; specific URLs can be provided instead by @@ -59,31 +59,31 @@ def update_from_remote(data_url: Optional[str], db_url: str) -> None: :param data_url: user-specified location to pull DB dump from :param db_url: URL to normalizer database """ - if not click.confirm('Are you sure you want to overwrite existing data?'): + if not click.confirm("Are you sure you want to overwrite existing data?"): click.get_current_context().exit() if not data_url: - data_url = os.environ.get('GENE_NORM_REMOTE_DB_URL') + data_url = os.environ.get("GENE_NORM_REMOTE_DB_URL") db = create_db(db_url, False) try: db.load_from_remote(data_url) except NotImplementedError: click.echo( - f'Error: Fetching remote data dump not supported for {db.__class__.__name__}' + f"Error: Fetching remote data dump not supported for {db.__class__.__name__}" ) click.get_current_context().exit(1) except DatabaseException as e: - click.echo(f'Encountered exception during update: {str(e)}') + click.echo(f"Encountered exception during update: {str(e)}") click.get_current_context().exit(1) @click.command() @click.option( - '--output_directory', - '-o', - help='Output location to write to', + "--output_directory", + "-o", + help="Output location to write to", type=click.Path(exists=True, path_type=Path), ) -@click.option('--db_url', help='URL endpoint for the application database.') +@click.option("--db_url", help="URL endpoint for the application database.") def dump_database(output_directory: Path, db_url: str) -> None: """Dump data from database into file. @@ -92,18 +92,18 @@ def dump_database(output_directory: Path, db_url: str) -> None: :param db_url: URL to normalizer database """ if not output_directory: - output_directory = Path('.') + output_directory = Path(".") db = create_db(db_url, False) try: db.export_db(output_directory) except NotImplementedError: click.echo( - f'Error: Dumping data to file not supported for {db.__class__.__name__}' + f"Error: Dumping data to file not supported for {db.__class__.__name__}" ) click.get_current_context().exit(1) except DatabaseException as e: - click.echo(f'Encountered exception during update: {str(e)}') + click.echo(f"Encountered exception during update: {str(e)}") click.get_current_context().exit(1) @@ -138,20 +138,20 @@ def _delete_source(n: SourceName, db: AbstractDatabase) -> float: :param db: database instance :return: time taken (in seconds) to delete """ - msg = f'Deleting {n.value}...' - click.echo(f'\n{msg}') + msg = f"Deleting {n.value}..." + click.echo(f"\n{msg}") logger.info(msg) start_delete = timer() db.delete_source(n) end_delete = timer() delete_time = end_delete - start_delete - msg = f'Deleted {n.value} in {delete_time:.5f} seconds.' - click.echo(f'{msg}\n') + msg = f"Deleted {n.value} in {delete_time:.5f} seconds." + click.echo(f"{msg}\n") logger.info(msg) return delete_time -_etl_dependency_help = 'Are ETL dependencies installed? See the Installation page in the documentation for more info.' +_etl_dependency_help = "Are ETL dependencies installed? See the Installation page in the documentation for more info." def _load_source( @@ -170,7 +170,7 @@ def _load_source( :param use_existing: if True, use most recent local data files instead of fetching from remote """ - msg = f'Loading {n.value}...' + msg = f"Loading {n.value}..." click.echo(msg) logger.info(msg) start_load = timer() @@ -181,7 +181,7 @@ def _load_source( from gene.etl.exceptions import GeneNormalizerEtlError except ModuleNotFoundError as e: click.echo( - f'Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}' + f"Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}" ) click.get_current_context().exit() SourceClass = eval(n.value) # noqa: N806 @@ -191,14 +191,14 @@ def _load_source( processed_ids += source.perform_etl(use_existing) except GeneNormalizerEtlError as e: logger.error(e) - click.echo(f'Encountered error while loading {n}: {e}.') + click.echo(f"Encountered error while loading {n}: {e}.") click.get_current_context().exit() end_load = timer() load_time = end_load - start_load - msg = f'Loaded {n.value} in {load_time:.5f} seconds.' + msg = f"Loaded {n.value} in {load_time:.5f} seconds." click.echo(msg) logger.info(msg) - msg = f'Total time for {n.value}: {(delete_time + load_time):.5f} seconds.' + msg = f"Total time for {n.value}: {(delete_time + load_time):.5f} seconds." click.echo(msg) logger.info(msg) @@ -208,15 +208,15 @@ def _delete_normalized_data(database: AbstractDatabase) -> None: :param database: DB instance """ - click.echo('\nDeleting normalized records...') + click.echo("\nDeleting normalized records...") start_delete = timer() try: database.delete_normalized_concepts() except (DatabaseReadException, DatabaseWriteException) as e: - click.echo(f'Encountered exception during normalized data deletion: {e}') + click.echo(f"Encountered exception during normalized data deletion: {e}") end_delete = timer() delete_time = end_delete - start_delete - click.echo(f'Deleted normalized records in {delete_time:.5f} seconds.') + click.echo(f"Deleted normalized records in {delete_time:.5f} seconds.") def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: @@ -234,34 +234,34 @@ def _load_merge(db: AbstractDatabase, processed_ids: Set[str]) -> None: from gene.etl.merge import Merge except ModuleNotFoundError as e: click.echo( - f'Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}' + f"Encountered ModuleNotFoundError attempting to import {e.name}. {_etl_dependency_help}" ) click.get_current_context().exit() merge = Merge(database=db) - click.echo('Constructing normalized records...') + click.echo("Constructing normalized records...") merge.create_merged_concepts(processed_ids) end = timer() click.echo( - f'Merged concept generation completed in ' f'{(end - start):.5f} seconds' + f"Merged concept generation completed in " f"{(end - start):.5f} seconds" ) @click.command() -@click.option('--sources', help='The source(s) you wish to update separated by spaces.') -@click.option('--aws_instance', is_flag=True, help='Using AWS DynamodDB instance.') -@click.option('--db_url', help='URL endpoint for the application database.') -@click.option('--update_all', is_flag=True, help='Update all normalizer sources.') +@click.option("--sources", help="The source(s) you wish to update separated by spaces.") +@click.option("--aws_instance", is_flag=True, help="Using AWS DynamodDB instance.") +@click.option("--db_url", help="URL endpoint for the application database.") +@click.option("--update_all", is_flag=True, help="Update all normalizer sources.") @click.option( - '--update_merged', + "--update_merged", is_flag=True, - help='Update concepts for normalize endpoint from accepted sources.', + help="Update concepts for normalize endpoint from accepted sources.", ) @click.option( - '--use_existing', + "--use_existing", is_flag=True, default=False, - help='Use most recent local source data instead of fetching latest version', + help="Use most recent local source data instead of fetching latest version", ) def update_normalizer_db( sources: str, @@ -296,7 +296,7 @@ def update_normalizer_db( else: ctx = click.get_current_context() click.echo( - 'Must either enter 1 or more sources, or use `--update_all` parameter' + "Must either enter 1 or more sources, or use `--update_all` parameter" ) click.echo(ctx.get_help()) ctx.exit() @@ -304,16 +304,16 @@ def update_normalizer_db( sources_split = sources.lower().split() if len(sources_split) == 0: - raise Exception('Must enter 1 or more source names to update') + raise Exception("Must enter 1 or more source names to update") non_sources = set(sources_split) - set(SOURCES) if len(non_sources) != 0: - raise Exception(f'Not valid source(s): {non_sources}') + raise Exception(f"Not valid source(s): {non_sources}") parsed_source_names = {SourceName(SOURCES[s]) for s in sources_split} _update_normalizer(parsed_source_names, db, update_merged, use_existing) -if __name__ == '__main__': +if __name__ == "__main__": update_normalizer_db() diff --git a/src/gene/database/database.py b/src/gene/database/database.py index 93ef3cdf..be62d177 100644 --- a/src/gene/database/database.py +++ b/src/gene/database/database.py @@ -61,12 +61,12 @@ def _check_delete_okay() -> bool: :raise DatabaseWriteException: if skip confirmation variable is set -- manual approval is required. """ - if environ.get(AWS_ENV_VAR_NAME, '') == AwsEnvName.PRODUCTION: - if environ.get(SKIP_AWS_DB_ENV_NAME, '') == 'true': + if environ.get(AWS_ENV_VAR_NAME, "") == AwsEnvName.PRODUCTION: + if environ.get(SKIP_AWS_DB_ENV_NAME, "") == "true": raise DatabaseWriteException( - f'Must unset {SKIP_AWS_DB_ENV_NAME} env variable to enable drop_db()' + f"Must unset {SKIP_AWS_DB_ENV_NAME} env variable to enable drop_db()" ) - return click.confirm('Are you sure you want to delete existing data?') + return click.confirm("Are you sure you want to delete existing data?") else: return True @@ -242,19 +242,19 @@ def export_db(self, export_location: Path) -> None: # can be set to either `Dev`, `Staging`, or `Prod` # ONLY set when wanting to access aws instance -AWS_ENV_VAR_NAME = 'GENE_NORM_ENV' +AWS_ENV_VAR_NAME = "GENE_NORM_ENV" # Set to "true" if want to skip db confirmation check. Should ONLY be used for # deployment needs -SKIP_AWS_DB_ENV_NAME = 'SKIP_AWS_CONFIRMATION' +SKIP_AWS_DB_ENV_NAME = "SKIP_AWS_CONFIRMATION" class AwsEnvName(str, Enum): """AWS environment name that is being used""" - DEVELOPMENT = 'Dev' - STAGING = 'Staging' - PRODUCTION = 'Prod' + DEVELOPMENT = "Dev" + STAGING = "Staging" + PRODUCTION = "Prod" VALID_AWS_ENV_NAMES = {v.value for v in AwsEnvName.__members__.values()} @@ -263,11 +263,11 @@ class AwsEnvName(str, Enum): def confirm_aws_db_use(env_name: str) -> None: """Check to ensure that AWS instance should actually be used.""" if click.confirm( - f'Are you sure you want to use the AWS {env_name} database?', default=False + f"Are you sure you want to use the AWS {env_name} database?", default=False ): - click.echo(f'***GENE AWS {env_name.upper()} DATABASE IN USE***') + click.echo(f"***GENE AWS {env_name.upper()} DATABASE IN USE***") else: - click.echo('Exiting.') + click.echo("Exiting.") sys.exit() @@ -324,13 +324,13 @@ def create_db( else: if db_url: endpoint_url = db_url - elif 'GENE_NORM_DB_URL' in environ.keys(): - endpoint_url = environ['GENE_NORM_DB_URL'] + elif "GENE_NORM_DB_URL" in environ.keys(): + endpoint_url = environ["GENE_NORM_DB_URL"] else: - endpoint_url = 'http://localhost:8000' + endpoint_url = "http://localhost:8000" # prefer DynamoDB unless connection explicitly reads like a libpq URI - if endpoint_url.startswith('postgres'): + if endpoint_url.startswith("postgres"): from gene.database.postgresql import PostgresDatabase db = PostgresDatabase(endpoint_url) diff --git a/src/gene/database/dynamodb.py b/src/gene/database/dynamodb.py index b7658aa2..88df0646 100644 --- a/src/gene/database/dynamodb.py +++ b/src/gene/database/dynamodb.py @@ -39,48 +39,48 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None: * region_name: AWS region (defaults to "us-east-2") :raise DatabaseInitializationException: if initial setup fails """ - self.gene_table = environ.get('GENE_DYNAMO_TABLE', 'gene_normalizer') - region_name = db_args.get('region_name', 'us-east-2') + self.gene_table = environ.get("GENE_DYNAMO_TABLE", "gene_normalizer") + region_name = db_args.get("region_name", "us-east-2") if AWS_ENV_VAR_NAME in environ: - if 'GENE_TEST' in environ: + if "GENE_TEST" in environ: raise DatabaseInitializationException( - f'Cannot have both GENE_TEST and {AWS_ENV_VAR_NAME} set.' + f"Cannot have both GENE_TEST and {AWS_ENV_VAR_NAME} set." ) aws_env = environ[AWS_ENV_VAR_NAME] if aws_env not in VALID_AWS_ENV_NAMES: raise DatabaseInitializationException( - f'{AWS_ENV_VAR_NAME} must be one of {VALID_AWS_ENV_NAMES}' + f"{AWS_ENV_VAR_NAME} must be one of {VALID_AWS_ENV_NAMES}" ) skip_confirmation = environ.get(SKIP_AWS_DB_ENV_NAME) if (not skip_confirmation) or ( - skip_confirmation and skip_confirmation != 'true' + skip_confirmation and skip_confirmation != "true" ): confirm_aws_db_use(environ[AWS_ENV_VAR_NAME]) - boto_params = {'region_name': region_name} + boto_params = {"region_name": region_name} if aws_env == AwsEnvName.DEVELOPMENT: self.gene_table = environ.get( - 'GENE_DYNAMO_TABLE', 'gene_normalizer_nonprod' + "GENE_DYNAMO_TABLE", "gene_normalizer_nonprod" ) else: if db_url: endpoint_url = db_url - elif 'GENE_NORM_DB_URL' in environ: - endpoint_url = environ['GENE_NORM_DB_URL'] + elif "GENE_NORM_DB_URL" in environ: + endpoint_url = environ["GENE_NORM_DB_URL"] else: - endpoint_url = 'http://localhost:8000' - click.echo(f'***Using Gene Database Endpoint: {endpoint_url}***') - boto_params = {'region_name': region_name, 'endpoint_url': endpoint_url} + endpoint_url = "http://localhost:8000" + click.echo(f"***Using Gene Database Endpoint: {endpoint_url}***") + boto_params = {"region_name": region_name, "endpoint_url": endpoint_url} - self.dynamodb = boto3.resource('dynamodb', **boto_params) - self.dynamodb_client = boto3.client('dynamodb', **boto_params) + self.dynamodb = boto3.resource("dynamodb", **boto_params) + self.dynamodb_client = boto3.client("dynamodb", **boto_params) # Only create tables for local instance - envs_do_not_create_tables = {AWS_ENV_VAR_NAME, 'GENE_TEST'} + envs_do_not_create_tables = {AWS_ENV_VAR_NAME, "GENE_TEST"} if not set(envs_do_not_create_tables) & set(environ): self.initialize_db() @@ -94,7 +94,7 @@ def list_tables(self) -> List[str]: :return: Table names in DynamoDB """ - return self.dynamodb_client.list_tables()['TableNames'] + return self.dynamodb_client.list_tables()["TableNames"] def drop_db(self) -> None: """Delete all tables from database. Requires manual confirmation. @@ -116,36 +116,36 @@ def _create_genes_table(self) -> None: self.dynamodb.create_table( TableName=self.gene_table, KeySchema=[ - {'AttributeName': 'label_and_type', 'KeyType': 'HASH'}, # Partition key - {'AttributeName': 'concept_id', 'KeyType': 'RANGE'}, # Sort key + {"AttributeName": "label_and_type", "KeyType": "HASH"}, # Partition key + {"AttributeName": "concept_id", "KeyType": "RANGE"}, # Sort key ], AttributeDefinitions=[ - {'AttributeName': 'label_and_type', 'AttributeType': 'S'}, - {'AttributeName': 'concept_id', 'AttributeType': 'S'}, - {'AttributeName': 'src_name', 'AttributeType': 'S'}, - {'AttributeName': 'item_type', 'AttributeType': 'S'}, + {"AttributeName": "label_and_type", "AttributeType": "S"}, + {"AttributeName": "concept_id", "AttributeType": "S"}, + {"AttributeName": "src_name", "AttributeType": "S"}, + {"AttributeName": "item_type", "AttributeType": "S"}, ], GlobalSecondaryIndexes=[ { - 'IndexName': 'src_index', - 'KeySchema': [{'AttributeName': 'src_name', 'KeyType': 'HASH'}], - 'Projection': {'ProjectionType': 'KEYS_ONLY'}, - 'ProvisionedThroughput': { - 'ReadCapacityUnits': 10, - 'WriteCapacityUnits': 10, + "IndexName": "src_index", + "KeySchema": [{"AttributeName": "src_name", "KeyType": "HASH"}], + "Projection": {"ProjectionType": "KEYS_ONLY"}, + "ProvisionedThroughput": { + "ReadCapacityUnits": 10, + "WriteCapacityUnits": 10, }, }, { - 'IndexName': 'item_type_index', - 'KeySchema': [{'AttributeName': 'item_type', 'KeyType': 'HASH'}], - 'Projection': {'ProjectionType': 'KEYS_ONLY'}, - 'ProvisionedThroughput': { - 'ReadCapacityUnits': 10, - 'WriteCapacityUnits': 10, + "IndexName": "item_type_index", + "KeySchema": [{"AttributeName": "item_type", "KeyType": "HASH"}], + "Projection": {"ProjectionType": "KEYS_ONLY"}, + "ProvisionedThroughput": { + "ReadCapacityUnits": 10, + "WriteCapacityUnits": 10, }, }, ], - ProvisionedThroughput={'ReadCapacityUnits': 10, 'WriteCapacityUnits': 10}, + ProvisionedThroughput={"ReadCapacityUnits": 10, "WriteCapacityUnits": 10}, ) def check_schema_initialized(self) -> bool: @@ -156,7 +156,7 @@ def check_schema_initialized(self) -> bool: existing_tables = self.list_tables() exists = self.gene_table in existing_tables if not exists: - logger.info(f'{self.gene_table} table is missing or unavailable.') + logger.info(f"{self.gene_table} table is missing or unavailable.") return exists def check_tables_populated(self) -> bool: @@ -169,29 +169,29 @@ def check_tables_populated(self) -> bool: :return: True if queries successful, false if DB appears empty """ sources = self.genes.query( - IndexName='item_type_index', - KeyConditionExpression=Key('item_type').eq('source'), - ).get('Items', []) + IndexName="item_type_index", + KeyConditionExpression=Key("item_type").eq("source"), + ).get("Items", []) if len(sources) < len(SourceName): - logger.info('Gene sources table is missing expected sources.') + logger.info("Gene sources table is missing expected sources.") return False records = self.genes.query( - IndexName='item_type_index', - KeyConditionExpression=Key('item_type').eq('identity'), + IndexName="item_type_index", + KeyConditionExpression=Key("item_type").eq("identity"), Limit=1, ) - if len(records.get('Items', [])) < 1: - logger.info('Gene records index is empty.') + if len(records.get("Items", [])) < 1: + logger.info("Gene records index is empty.") return False normalized_records = self.genes.query( - IndexName='item_type_index', - KeyConditionExpression=Key('item_type').eq(RecordType.MERGER.value), + IndexName="item_type_index", + KeyConditionExpression=Key("item_type").eq(RecordType.MERGER.value), Limit=1, ) - if len(normalized_records.get('Items', [])) < 1: - logger.info('Normalized gene records index is empty.') + if len(normalized_records.get("Items", [])) < 1: + logger.info("Normalized gene records index is empty.") return False return True @@ -211,14 +211,14 @@ def get_source_metadata(self, src_name: Union[str, SourceName]) -> Dict: if src_name in self._cached_sources: return self._cached_sources[src_name] else: - pk = f'{src_name.lower()}##source' - concept_id = f'source:{src_name.lower()}' + pk = f"{src_name.lower()}##source" + concept_id = f"source:{src_name.lower()}" metadata = self.genes.get_item( - Key={'label_and_type': pk, 'concept_id': concept_id} - ).get('Item') + Key={"label_and_type": pk, "concept_id": concept_id} + ).get("Item") if not metadata: raise DatabaseReadException( - f'Unable to retrieve data for source {src_name}' + f"Unable to retrieve data for source {src_name}" ) self._cached_sources[src_name] = metadata return metadata @@ -238,19 +238,19 @@ def get_record_by_id( """ try: if merge: - pk = f'{concept_id.lower()}##{RecordType.MERGER.value}' + pk = f"{concept_id.lower()}##{RecordType.MERGER.value}" else: - pk = f'{concept_id.lower()}##{RecordType.IDENTITY.value}' + pk = f"{concept_id.lower()}##{RecordType.IDENTITY.value}" if case_sensitive: match = self.genes.get_item( - Key={'label_and_type': pk, 'concept_id': concept_id} + Key={"label_and_type": pk, "concept_id": concept_id} ) - return match['Item'] + return match["Item"] else: - exp = Key('label_and_type').eq(pk) + exp = Key("label_and_type").eq(pk) response = self.genes.query(KeyConditionExpression=exp) - record = response['Items'][0] - del record['label_and_type'] + record = response["Items"][0] + del record["label_and_type"] return record except ClientError as e: logger.error( @@ -270,11 +270,11 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: :param ref_type: type of match to look for. :return: list of associated concept IDs. Empty if lookup fails. """ - pk = f'{search_term}##{ref_type.value.lower()}' - filter_exp = Key('label_and_type').eq(pk) + pk = f"{search_term}##{ref_type.value.lower()}" + filter_exp = Key("label_and_type").eq(pk) try: matches = self.genes.query(KeyConditionExpression=filter_exp) - return [m['concept_id'] for m in matches.get('Items', None)] + return [m["concept_id"] for m in matches.get("Items", None)] except ClientError as e: logger.error( f"boto3 client error on get_refs_by_type for " @@ -291,7 +291,7 @@ def get_all_concept_ids(self) -> Set[str]: last_evaluated_key = None concept_ids = [] params = { - 'ProjectionExpression': 'concept_id', + "ProjectionExpression": "concept_id", } while True: if last_evaluated_key: @@ -300,10 +300,10 @@ def get_all_concept_ids(self) -> Set[str]: ) else: response = self.genes.scan(**params) - records = response['Items'] + records = response["Items"] for record in records: - concept_ids.append(record['concept_id']) - last_evaluated_key = response.get('LastEvaluatedKey') + concept_ids.append(record["concept_id"]) + last_evaluated_key = response.get("LastEvaluatedKey") if not last_evaluated_key: break return set(concept_ids) @@ -332,19 +332,19 @@ def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None ) else: response = self.genes.scan() - records = response.get('Items', []) + records = response.get("Items", []) for record in records: - incoming_record_type = record.get('item_type') + incoming_record_type = record.get("item_type") if record_type == RecordType.IDENTITY: if incoming_record_type == record_type: yield record else: if ( incoming_record_type == RecordType.IDENTITY - and not record.get('merge_ref') + and not record.get("merge_ref") ) or incoming_record_type == RecordType.MERGER: yield record - last_evaluated_key = response.get('LastEvaluatedKey') + last_evaluated_key = response.get("LastEvaluatedKey") if not last_evaluated_key: break @@ -357,10 +357,10 @@ def add_source_metadata(self, src_name: SourceName, metadata: SourceMeta) -> Non """ src_name_value = src_name.value metadata_item = metadata.model_dump() - metadata_item['src_name'] = src_name_value - metadata_item['label_and_type'] = f'{str(src_name_value).lower()}##source' - metadata_item['concept_id'] = f'source:{str(src_name_value).lower()}' - metadata_item['item_type'] = 'source' + metadata_item["src_name"] = src_name_value + metadata_item["label_and_type"] = f"{str(src_name_value).lower()}##source" + metadata_item["concept_id"] = f"source:{str(src_name_value).lower()}" + metadata_item["item_type"] = "source" try: self.genes.put_item(Item=metadata_item) except ClientError as e: @@ -372,11 +372,11 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: :param Dict record: record to upload :param SourceName src_name: name of source for record """ - concept_id = record['concept_id'] - record['src_name'] = src_name.value - label_and_type = f'{concept_id.lower()}##identity' - record['label_and_type'] = label_and_type - record['item_type'] = 'identity' + concept_id = record["concept_id"] + record["src_name"] = src_name.value + label_and_type = f"{concept_id.lower()}##identity" + record["label_and_type"] = label_and_type + record["item_type"] = "identity" try: self.batch.put_item(Item=record) except ClientError as e: @@ -395,7 +395,7 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: items = {item.lower() for item in value} for item in items: self._add_ref_record( - item, record['concept_id'], item_type, src_name + item, record["concept_id"], item_type, src_name ) def add_merged_record(self, record: Dict) -> None: @@ -403,12 +403,12 @@ def add_merged_record(self, record: Dict) -> None: :param record: merged record to add """ - concept_id = record['concept_id'] - id_prefix = concept_id.split(':')[0].lower() - record['src_name'] = PREFIX_LOOKUP[id_prefix] - label_and_type = f'{concept_id.lower()}##{RecordType.MERGER.value}' - record['label_and_type'] = label_and_type - record['item_type'] = RecordType.MERGER.value + concept_id = record["concept_id"] + id_prefix = concept_id.split(":")[0].lower() + record["src_name"] = PREFIX_LOOKUP[id_prefix] + label_and_type = f"{concept_id.lower()}##{RecordType.MERGER.value}" + record["label_and_type"] = label_and_type + record["item_type"] = RecordType.MERGER.value try: self.batch.put_item(Item=record) except ClientError as e: @@ -428,12 +428,12 @@ def _add_ref_record( 'associated_with'} :param src_name: name of source for record """ - label_and_type = f'{term.lower()}##{ref_type}' + label_and_type = f"{term.lower()}##{ref_type}" record = { - 'label_and_type': label_and_type, - 'concept_id': concept_id.lower(), - 'src_name': src_name.value, - 'item_type': ref_type, + "label_and_type": label_and_type, + "concept_id": concept_id.lower(), + "src_name": src_name.value, + "item_type": ref_type, } try: self.batch.put_item(Item=record) @@ -451,11 +451,11 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN :param merge_ref: new ref value :raise DatabaseWriteException: if attempting to update non-existent record """ - label_and_type = f'{concept_id.lower()}##identity' - key = {'label_and_type': label_and_type, 'concept_id': concept_id} - update_expression = 'set merge_ref=:r' - update_values = {':r': merge_ref.lower()} - condition_expression = 'attribute_exists(label_and_type)' + label_and_type = f"{concept_id.lower()}##identity" + key = {"label_and_type": label_and_type, "concept_id": concept_id} + update_expression = "set merge_ref=:r" + update_values = {":r": merge_ref.lower()} + condition_expression = "attribute_exists(label_and_type)" try: self.genes.update_item( Key=key, @@ -464,10 +464,10 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN ConditionExpression=condition_expression, ) except ClientError as e: - code = e.response.get('Error', {}).get('Code') - if code == 'ConditionalCheckFailedException': + code = e.response.get("Error", {}).get("Code") + if code == "ConditionalCheckFailedException": raise DatabaseWriteException( - f'No such record exists for keys {label_and_type}, {concept_id}' + f"No such record exists for keys {label_and_type}, {concept_id}" ) else: logger.error( @@ -485,25 +485,25 @@ def delete_normalized_concepts(self) -> None: """ while True: with self.genes.batch_writer( - overwrite_by_pkeys=['label_and_type', 'concept_id'] + overwrite_by_pkeys=["label_and_type", "concept_id"] ) as batch: try: response = self.genes.query( - IndexName='item_type_index', - KeyConditionExpression=Key('item_type').eq( + IndexName="item_type_index", + KeyConditionExpression=Key("item_type").eq( RecordType.MERGER.value ), ) except ClientError as e: raise DatabaseReadException(e) - records = response['Items'] + records = response["Items"] if not records: break for record in records: batch.delete_item( Key={ - 'label_and_type': record['label_and_type'], - 'concept_id': record['concept_id'], + "label_and_type": record["label_and_type"], + "concept_id": record["concept_id"], } ) @@ -518,23 +518,23 @@ def delete_source(self, src_name: SourceName) -> None: while True: try: response = self.genes.query( - IndexName='src_index', - KeyConditionExpression=Key('src_name').eq(src_name.value), + IndexName="src_index", + KeyConditionExpression=Key("src_name").eq(src_name.value), ) except ClientError as e: raise DatabaseReadException(e) - records = response['Items'] + records = response["Items"] if not records: break with self.genes.batch_writer( - overwrite_by_pkeys=['label_and_type', 'concept_id'] + overwrite_by_pkeys=["label_and_type", "concept_id"] ) as batch: for record in records: try: batch.delete_item( Key={ - 'label_and_type': record['label_and_type'], - 'concept_id': record['concept_id'], + "label_and_type": record["label_and_type"], + "concept_id": record["concept_id"], } ) except ClientError as e: diff --git a/src/gene/database/postgresql.py b/src/gene/database/postgresql.py index 90924c8c..83e4f56a 100644 --- a/src/gene/database/postgresql.py +++ b/src/gene/database/postgresql.py @@ -29,7 +29,7 @@ logger = logging.getLogger(__name__) -SCRIPTS_DIR = Path(__file__).parent / 'postgresql' +SCRIPTS_DIR = Path(__file__).parent / "postgresql" class PostgresDatabase(AbstractDatabase): @@ -56,16 +56,16 @@ def __init__(self, db_url: Optional[str] = None, **db_args) -> None: """ if db_url: conninfo = db_url - elif 'GENE_NORM_DB_URL' in os.environ: - conninfo = os.environ['GENE_NORM_DB_URL'] + elif "GENE_NORM_DB_URL" in os.environ: + conninfo = os.environ["GENE_NORM_DB_URL"] else: - user = db_args.get('user', 'postgres') - password = db_args.get('password', '') - db_name = db_args.get('db_name', 'gene_normalizer') + user = db_args.get("user", "postgres") + password = db_args.get("password", "") + db_name = db_args.get("db_name", "gene_normalizer") if password: - conninfo = f'dbname={db_name} user={user} password={password}' + conninfo = f"dbname={db_name} user={user} password={password}" else: - conninfo = f'dbname={db_name} user={user}' + conninfo = f"dbname={db_name} user={user}" self.conn = psycopg.connect(conninfo) self.initialize_db() @@ -119,7 +119,7 @@ def drop_db(self) -> None: with self.conn.cursor() as cur: cur.execute(self._drop_db_query) self.conn.commit() - logger.info('Dropped all existing gene normalizer tables.') + logger.info("Dropped all existing gene normalizer tables.") def check_schema_initialized(self) -> bool: """Check if database schema is properly initialized. @@ -128,48 +128,48 @@ def check_schema_initialized(self) -> bool: """ try: with self.conn.cursor() as cur: - cur.execute((SCRIPTS_DIR / 'create_tables.sql').read_bytes()) + cur.execute((SCRIPTS_DIR / "create_tables.sql").read_bytes()) except DuplicateTable: self.conn.rollback() else: - logger.info('Gene table existence check failed.') + logger.info("Gene table existence check failed.") self.conn.rollback() return False try: with self.conn.cursor() as cur: - cur.execute((SCRIPTS_DIR / 'add_fkeys.sql').read_bytes()) + cur.execute((SCRIPTS_DIR / "add_fkeys.sql").read_bytes()) except DuplicateObject: self.conn.rollback() else: - logger.info('Gene foreign key existence check failed.') + logger.info("Gene foreign key existence check failed.") self.conn.rollback() return False try: with self.conn.cursor() as cur: cur.execute( - (SCRIPTS_DIR / 'create_record_lookup_view.sql').read_bytes() + (SCRIPTS_DIR / "create_record_lookup_view.sql").read_bytes() ) except DuplicateTable: self.conn.rollback() else: - logger.info('Gene normalized view lookup failed.') + logger.info("Gene normalized view lookup failed.") self.conn.rollback() return False try: with self.conn.cursor() as cur: - cur.execute((SCRIPTS_DIR / 'add_indexes.sql').read_bytes()) + cur.execute((SCRIPTS_DIR / "add_indexes.sql").read_bytes()) except DuplicateTable: self.conn.rollback() else: - logger.info('Gene indexes check failed.') + logger.info("Gene indexes check failed.") self.conn.rollback() return False return True - _check_sources_query = b'SELECT name FROM gene_sources;' - _check_concepts_query = b'SELECT COUNT(1) FROM gene_concepts LIMIT 1;' - _check_merged_query = b'SELECT COUNT(1) FROM gene_merged LIMIT 1;' + _check_sources_query = b"SELECT name FROM gene_sources;" + _check_concepts_query = b"SELECT COUNT(1) FROM gene_concepts LIMIT 1;" + _check_merged_query = b"SELECT COUNT(1) FROM gene_merged LIMIT 1;" def check_tables_populated(self) -> bool: """Perform rudimentary checks to see if tables are populated. @@ -184,21 +184,21 @@ def check_tables_populated(self) -> bool: cur.execute(self._check_sources_query) results = cur.fetchall() if len(results) < len(SourceName): - logger.info('Gene sources table is missing expected sources.') + logger.info("Gene sources table is missing expected sources.") return False with self.conn.cursor() as cur: cur.execute(self._check_concepts_query) result = cur.fetchone() if not result or result[0] < 1: - logger.info('Gene records table is empty.') + logger.info("Gene records table is empty.") return False with self.conn.cursor() as cur: cur.execute(self._check_merged_query) result = cur.fetchone() if not result or result[0] < 1: - logger.info('Normalized gene records table is empty.') + logger.info("Normalized gene records table is empty.") return False return True @@ -213,12 +213,12 @@ def initialize_db(self) -> None: def _create_views(self) -> None: """Create materialized views.""" - create_view_query = (SCRIPTS_DIR / 'create_record_lookup_view.sql').read_bytes() + create_view_query = (SCRIPTS_DIR / "create_record_lookup_view.sql").read_bytes() with self.conn.cursor() as cur: cur.execute(create_view_query) self.conn.commit() - _refresh_views_query = b'REFRESH MATERIALIZED VIEW record_lookup_view;' + _refresh_views_query = b"REFRESH MATERIALIZED VIEW record_lookup_view;" def _refresh_views(self) -> None: """Update materialized views. @@ -232,36 +232,36 @@ def _refresh_views(self) -> None: def _add_fkeys(self) -> None: """Add fkey relationships.""" - add_fkey_query = (SCRIPTS_DIR / 'add_fkeys.sql').read_bytes() + add_fkey_query = (SCRIPTS_DIR / "add_fkeys.sql").read_bytes() with self.conn.cursor() as cur: cur.execute(add_fkey_query) self.conn.commit() def _drop_fkeys(self) -> None: """Drop fkey relationships.""" - drop_fkey_query = (SCRIPTS_DIR / 'drop_fkeys.sql').read_bytes() + drop_fkey_query = (SCRIPTS_DIR / "drop_fkeys.sql").read_bytes() with self.conn.cursor() as cur: cur.execute(drop_fkey_query) self.conn.commit() def _add_indexes(self) -> None: """Create core search indexes.""" - add_indexes_query = (SCRIPTS_DIR / 'add_indexes.sql').read_bytes() + add_indexes_query = (SCRIPTS_DIR / "add_indexes.sql").read_bytes() with self.conn.cursor() as cur: cur.execute(add_indexes_query) self.conn.commit() def _drop_indexes(self) -> None: """Drop all custom indexes.""" - drop_indexes_query = (SCRIPTS_DIR / 'drop_indexes.sql').read_bytes() + drop_indexes_query = (SCRIPTS_DIR / "drop_indexes.sql").read_bytes() with self.conn.cursor() as cur: cur.execute(drop_indexes_query) self.conn.commit() def _create_tables(self) -> None: """Create all tables, indexes, and views.""" - logger.debug('Creating new gene normalizer tables.') - tables_query = (SCRIPTS_DIR / 'create_tables.sql').read_bytes() + logger.debug("Creating new gene normalizer tables.") + tables_query = (SCRIPTS_DIR / "create_tables.sql").read_bytes() with self.conn.cursor() as cur: cur.execute(tables_query) @@ -278,30 +278,30 @@ def get_source_metadata(self, src_name: SourceName) -> Dict: if src_name in self._cached_sources: return self._cached_sources[src_name] - metadata_query = 'SELECT * FROM gene_sources WHERE name = %s;' + metadata_query = "SELECT * FROM gene_sources WHERE name = %s;" with self.conn.cursor() as cur: cur.execute(metadata_query, [src_name]) metadata_result = cur.fetchone() if not metadata_result: - raise DatabaseReadException(f'{src_name} metadata lookup failed') + raise DatabaseReadException(f"{src_name} metadata lookup failed") metadata = { - 'data_license': metadata_result[1], - 'data_license_url': metadata_result[2], - 'version': metadata_result[3], - 'data_url': metadata_result[4], - 'rdp_url': metadata_result[5], - 'data_license_attributes': { - 'non_commercial': metadata_result[6], - 'attribution': metadata_result[7], - 'share_alike': metadata_result[8], + "data_license": metadata_result[1], + "data_license_url": metadata_result[2], + "version": metadata_result[3], + "data_url": metadata_result[4], + "rdp_url": metadata_result[5], + "data_license_attributes": { + "non_commercial": metadata_result[6], + "attribution": metadata_result[7], + "share_alike": metadata_result[8], }, - 'genome_assemblies': metadata_result[9], + "genome_assemblies": metadata_result[9], } self._cached_sources[src_name] = metadata return metadata _get_record_query = ( - b'SELECT * FROM record_lookup_view WHERE lower(concept_id) = %s;' + b"SELECT * FROM record_lookup_view WHERE lower(concept_id) = %s;" ) def _format_source_record(self, source_row: Tuple) -> Dict: @@ -311,21 +311,21 @@ def _format_source_record(self, source_row: Tuple) -> Dict: :return: reformatted dictionary keying gene properties to row values """ gene_record = { - 'concept_id': source_row[0], - 'symbol_status': source_row[1], - 'label': source_row[2], - 'strand': source_row[3], - 'location_annotations': source_row[4], - 'locations': source_row[5], - 'gene_type': source_row[6], - 'aliases': source_row[7], - 'associated_with': source_row[8], - 'previous_symbols': source_row[9], - 'symbol': source_row[10], - 'xrefs': source_row[11], - 'src_name': source_row[12], - 'merge_ref': source_row[13], - 'item_type': RecordType.IDENTITY.value, + "concept_id": source_row[0], + "symbol_status": source_row[1], + "label": source_row[2], + "strand": source_row[3], + "location_annotations": source_row[4], + "locations": source_row[5], + "gene_type": source_row[6], + "aliases": source_row[7], + "associated_with": source_row[8], + "previous_symbols": source_row[9], + "symbol": source_row[10], + "xrefs": source_row[11], + "src_name": source_row[12], + "merge_ref": source_row[13], + "item_type": RecordType.IDENTITY.value, } return {k: v for k, v in gene_record.items() if v} @@ -354,28 +354,28 @@ def _format_merged_record(self, merged_row: Tuple) -> Dict: :return: reformatted dictionary keying normalized gene properties to row values """ merged_record = { - 'concept_id': merged_row[0], - 'symbol': merged_row[1], - 'symbol_status': merged_row[2], - 'previous_symbols': merged_row[3], - 'label': merged_row[4], - 'strand': merged_row[5], - 'ensembl_locations': merged_row[6], - 'hgnc_locations': merged_row[7], - 'ncbi_locations': merged_row[8], - 'location_annotations': merged_row[9], - 'ensembl_biotype': merged_row[10], - 'hgnc_locus_type': merged_row[11], - 'ncbi_gene_type': merged_row[12], - 'aliases': merged_row[13], - 'associated_with': merged_row[14], - 'xrefs': merged_row[15], - 'item_type': RecordType.MERGER.value, + "concept_id": merged_row[0], + "symbol": merged_row[1], + "symbol_status": merged_row[2], + "previous_symbols": merged_row[3], + "label": merged_row[4], + "strand": merged_row[5], + "ensembl_locations": merged_row[6], + "hgnc_locations": merged_row[7], + "ncbi_locations": merged_row[8], + "location_annotations": merged_row[9], + "ensembl_biotype": merged_row[10], + "hgnc_locus_type": merged_row[11], + "ncbi_gene_type": merged_row[12], + "aliases": merged_row[13], + "associated_with": merged_row[14], + "xrefs": merged_row[15], + "item_type": RecordType.MERGER.value, } return {k: v for k, v in merged_record.items() if v} _get_merged_record_query = ( - b'SELECT * FROM gene_merged WHERE lower(concept_id) = %s;' + b"SELECT * FROM gene_merged WHERE lower(concept_id) = %s;" ) def _get_merged_record( @@ -412,11 +412,11 @@ def get_record_by_id( return self._get_record(concept_id, case_sensitive) _ref_types_query = { - RefType.SYMBOL: b'SELECT concept_id FROM gene_symbols WHERE lower(symbol) = %s;', - RefType.PREVIOUS_SYMBOLS: b'SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;', - RefType.ALIASES: b'SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;', - RefType.XREFS: b'SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;', - RefType.ASSOCIATED_WITH: b'SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;', + RefType.SYMBOL: b"SELECT concept_id FROM gene_symbols WHERE lower(symbol) = %s;", + RefType.PREVIOUS_SYMBOLS: b"SELECT concept_id FROM gene_previous_symbols WHERE lower(prev_symbol) = %s;", + RefType.ALIASES: b"SELECT concept_id FROM gene_aliases WHERE lower(alias) = %s;", + RefType.XREFS: b"SELECT concept_id FROM gene_xrefs WHERE lower(xref) = %s;", + RefType.ASSOCIATED_WITH: b"SELECT concept_id FROM gene_associations WHERE lower(associated_with) = %s;", } def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: @@ -429,7 +429,7 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: """ query = self._ref_types_query.get(ref_type) if not query: - raise ValueError('invalid reference type') + raise ValueError("invalid reference type") with self.conn.cursor() as cur: cur.execute(query, (search_term.lower(),)) @@ -439,7 +439,7 @@ def get_refs_by_type(self, search_term: str, ref_type: RefType) -> List[str]: else: return [] - _ids_query = b'SELECT concept_id FROM gene_concepts;' + _ids_query = b"SELECT concept_id FROM gene_concepts;" def get_all_concept_ids(self) -> Set[str]: """Retrieve concept IDs for use in generating normalized records. @@ -451,11 +451,11 @@ def get_all_concept_ids(self) -> Set[str]: ids_tuple = cur.fetchall() return {i[0] for i in ids_tuple} - _get_all_normalized_records_query = b'SELECT * FROM gene_merged;' + _get_all_normalized_records_query = b"SELECT * FROM gene_merged;" _get_all_unmerged_source_records_query = ( - b'SELECT * FROM record_lookup_view WHERE merge_ref IS NULL;' + b"SELECT * FROM record_lookup_view WHERE merge_ref IS NULL;" ) - _get_all_source_records_query = b'SELECT * FROM record_lookup_view;' + _get_all_source_records_query = b"SELECT * FROM record_lookup_view;" def get_all_records(self, record_type: RecordType) -> Generator[Dict, None, None]: """Retrieve all source or normalized records. Either return all source records, @@ -530,9 +530,9 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None: meta.version, json.dumps(meta.data_url), meta.rdp_url, - meta.data_license_attributes['non_commercial'], - meta.data_license_attributes['attribution'], - meta.data_license_attributes['share_alike'], + meta.data_license_attributes["non_commercial"], + meta.data_license_attributes["attribution"], + meta.data_license_attributes["share_alike"], meta.genome_assemblies, ], ) @@ -546,15 +546,15 @@ def add_source_metadata(self, src_name: SourceName, meta: SourceMeta) -> None: VALUES (%s, %s, %s, %s, %s, %s, %s, %s); """ _ins_symbol_query = ( - b'INSERT INTO gene_symbols (symbol, concept_id) VALUES (%s, %s);' + b"INSERT INTO gene_symbols (symbol, concept_id) VALUES (%s, %s);" ) _ins_prev_symbol_query = ( - b'INSERT INTO gene_previous_symbols (prev_symbol, concept_id) VALUES (%s, %s);' + b"INSERT INTO gene_previous_symbols (prev_symbol, concept_id) VALUES (%s, %s);" ) - _ins_alias_query = b'INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);' - _ins_xref_query = b'INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);' + _ins_alias_query = b"INSERT INTO gene_aliases (alias, concept_id) VALUES (%s, %s);" + _ins_xref_query = b"INSERT INTO gene_xrefs (xref, concept_id) VALUES (%s, %s);" _ins_assoc_query = ( - b'INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);' + b"INSERT INTO gene_associations (associated_with, concept_id) VALUES (%s, %s);" ) def add_record(self, record: Dict, src_name: SourceName) -> None: @@ -563,8 +563,8 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: :param record: record to upload :param src_name: name of source for record. Not used by PostgreSQL instance. """ - concept_id = record['concept_id'] - locations = [json.dumps(loc) for loc in record.get('locations', [])] + concept_id = record["concept_id"] + locations = [json.dumps(loc) for loc in record.get("locations", [])] if not locations: locations = None with self.conn.cursor() as cur: @@ -573,28 +573,28 @@ def add_record(self, record: Dict, src_name: SourceName) -> None: self._add_record_query, [ concept_id, - record['src_name'], - record.get('symbol_status'), - record.get('label'), - record.get('strand'), - record.get('location_annotations'), + record["src_name"], + record.get("symbol_status"), + record.get("label"), + record.get("strand"), + record.get("location_annotations"), locations, - record.get('gene_type'), + record.get("gene_type"), ], ) - for a in record.get('aliases', []): + for a in record.get("aliases", []): cur.execute(self._ins_alias_query, [a, concept_id]) - for x in record.get('xrefs', []): + for x in record.get("xrefs", []): cur.execute(self._ins_xref_query, [x, concept_id]) - for a in record.get('associated_with', []): + for a in record.get("associated_with", []): cur.execute(self._ins_assoc_query, [a, concept_id]) - for p in record.get('previous_symbols', []): + for p in record.get("previous_symbols", []): cur.execute(self._ins_prev_symbol_query, [p, concept_id]) - if record.get('symbol'): - cur.execute(self._ins_symbol_query, [record['symbol'], concept_id]) + if record.get("symbol"): + cur.execute(self._ins_symbol_query, [record["symbol"], concept_id]) self.conn.commit() except UniqueViolation: - logger.error(f'Record with ID {concept_id} already exists') + logger.error(f"Record with ID {concept_id} already exists") self.conn.rollback() _add_merged_record_query = b""" @@ -612,35 +612,35 @@ def add_merged_record(self, record: Dict) -> None: :param record: merged record to add """ - ensembl_locations = record.get('ensembl_locations') + ensembl_locations = record.get("ensembl_locations") if ensembl_locations: ensembl_locations = [json.dumps(i) for i in ensembl_locations] - ncbi_locations = record.get('ncbi_locations') + ncbi_locations = record.get("ncbi_locations") if ncbi_locations: ncbi_locations = [json.dumps(i) for i in ncbi_locations] - hgnc_locations = record.get('hgnc_locations') + hgnc_locations = record.get("hgnc_locations") if hgnc_locations: hgnc_locations = [json.dumps(i) for i in hgnc_locations] with self.conn.cursor() as cur: cur.execute( self._add_merged_record_query, [ - record['concept_id'], - record.get('symbol'), - record.get('symbol_status'), - record.get('previous_symbols'), - record.get('label'), - record.get('strand'), - record.get('location_annotations'), + record["concept_id"], + record.get("symbol"), + record.get("symbol_status"), + record.get("previous_symbols"), + record.get("label"), + record.get("strand"), + record.get("location_annotations"), ensembl_locations, hgnc_locations, ncbi_locations, - record.get('hgnc_locus_type'), - record.get('ensembl_biotype'), - record.get('ncbi_gene_type'), - record.get('aliases'), - record.get('associated_with'), - record.get('xrefs'), + record.get("hgnc_locus_type"), + record.get("ensembl_biotype"), + record.get("ncbi_gene_type"), + record.get("aliases"), + record.get("associated_with"), + record.get("xrefs"), ], ) self.conn.commit() @@ -661,7 +661,7 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN with self.conn.cursor() as cur: cur.execute( self._update_merge_ref_query, - {'merge_ref': merge_ref, 'concept_id': concept_id}, + {"merge_ref": merge_ref, "concept_id": concept_id}, ) row_count = cur.rowcount self.conn.commit() @@ -669,7 +669,7 @@ def update_merge_ref(self, concept_id: str, merge_ref: Any) -> None: # noqa: AN # UPDATE will fail silently unless we check the # of affected rows if row_count < 1: raise DatabaseWriteException( - f'No such record exists for primary key {concept_id}' + f"No such record exists for primary key {concept_id}" ) def delete_normalized_concepts(self) -> None: @@ -687,7 +687,7 @@ def delete_normalized_concepts(self) -> None: :raise DatabaseWriteException: if deletion call fails """ with self.conn.cursor() as cur: - cur.execute((SCRIPTS_DIR / 'delete_normalized_concepts.sql').read_bytes()) + cur.execute((SCRIPTS_DIR / "delete_normalized_concepts.sql").read_bytes()) self.conn.commit() _drop_aliases_query = b""" @@ -725,8 +725,8 @@ def delete_normalized_concepts(self) -> None: WHERE gc.source = %s ); """ - _drop_concepts_query = b'DELETE FROM gene_concepts WHERE source = %s;' - _drop_source_query = b'DELETE FROM gene_sources gs WHERE gs.name = %s;' + _drop_concepts_query = b"DELETE FROM gene_concepts WHERE source = %s;" + _drop_source_query = b"DELETE FROM gene_sources gs WHERE gs.name = %s;" def delete_source(self, src_name: SourceName) -> None: """Delete all data for a source. Use when updating source data. @@ -784,35 +784,35 @@ def load_from_remote(self, url: Optional[str]) -> None: command fails """ if not url: - url = 'https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_latest.sql.tar.gz' + url = "https://vicc-normalizers.s3.us-east-2.amazonaws.com/gene_normalization/postgresql/gene_norm_latest.sql.tar.gz" with tempfile.TemporaryDirectory() as tempdir: tempdir_path = Path(tempdir) - temp_tarfile = tempdir_path / 'gene_norm_latest.tar.gz' + temp_tarfile = tempdir_path / "gene_norm_latest.tar.gz" with requests.get(url, stream=True) as r: try: r.raise_for_status() except requests.HTTPError: raise DatabaseException( - f'Unable to retrieve PostgreSQL dump file from {url}' + f"Unable to retrieve PostgreSQL dump file from {url}" ) - with open(temp_tarfile, 'wb') as h: + with open(temp_tarfile, "wb") as h: for chunk in r.iter_content(chunk_size=8192): if chunk: h.write(chunk) - tar = tarfile.open(temp_tarfile, 'r:gz') + tar = tarfile.open(temp_tarfile, "r:gz") tar_dump_file = [ - f for f in tar.getmembers() if f.name.startswith('gene_norm_') + f for f in tar.getmembers() if f.name.startswith("gene_norm_") ][0] tar.extractall(path=tempdir_path, members=[tar_dump_file]) dump_file = tempdir_path / tar_dump_file.name if self.conn.info.password: - pw_param = f'-W {self.conn.info.password}' + pw_param = f"-W {self.conn.info.password}" else: - pw_param = '-w' + pw_param = "-w" self.drop_db() - system_call = f'psql -d {self.conn.info.dbname} -U {self.conn.info.user} {pw_param} -f {dump_file.absolute()}' + system_call = f"psql -d {self.conn.info.dbname} -U {self.conn.info.user} {pw_param} -f {dump_file.absolute()}" result = os.system(system_call) if result != 0: raise DatabaseException( @@ -832,18 +832,18 @@ def export_db(self, output_directory: Path) -> None: raise ValueError( f"Output location {output_directory} isn't a directory or doesn't exist" ) - now = datetime.now().strftime('%Y%m%d%H%M%S') - output_location = output_directory / f'gene_norm_{now}.sql' + now = datetime.now().strftime("%Y%m%d%H%M%S") + output_location = output_directory / f"gene_norm_{now}.sql" user = self.conn.info.user host = self.conn.info.host port = self.conn.info.port database_name = self.conn.info.dbname if self.conn.info.password: - pw_param = f'-W {self.conn.info.password}' + pw_param = f"-W {self.conn.info.password}" else: - pw_param = '-w' + pw_param = "-w" - system_call = f'pg_dump -E UTF8 -f {output_location} -U {user} {pw_param} -h {host} -p {port} {database_name}' + system_call = f"pg_dump -E UTF8 -f {output_location} -U {user} {pw_param} -h {host} -p {port} {database_name}" result = os.system(system_call) if result != 0: raise DatabaseException( diff --git a/src/gene/etl/__init__.py b/src/gene/etl/__init__.py index 1d7020b3..569df1d7 100644 --- a/src/gene/etl/__init__.py +++ b/src/gene/etl/__init__.py @@ -9,10 +9,10 @@ from .ncbi import NCBI __all__ = [ - 'Ensembl', - 'HGNC', - 'NCBI', - 'GeneNormalizerEtlError', - 'GeneFileVersionError', - 'GeneSourceFetchError', + "Ensembl", + "HGNC", + "NCBI", + "GeneNormalizerEtlError", + "GeneFileVersionError", + "GeneSourceFetchError", ] diff --git a/src/gene/etl/base.py b/src/gene/etl/base.py index 771e2294..77e9eee1 100644 --- a/src/gene/etl/base.py +++ b/src/gene/etl/base.py @@ -15,7 +15,7 @@ from gene.database import AbstractDatabase from gene.schemas import Gene, GeneSequenceLocation, MatchType, SourceName -logger = logging.getLogger('gene') +logger = logging.getLogger("gene") logger.setLevel(logging.DEBUG) @@ -71,7 +71,7 @@ def perform_etl(self, use_existing: bool = False) -> List[str]: """ self._extract_data(use_existing) if not self._silent: - click.echo('Transforming and loading data to DB...') + click.echo("Transforming and loading data to DB...") self._add_meta() self._transform_data() self._database.complete_write_transaction() @@ -110,12 +110,12 @@ def _load_gene(self, gene: Dict) -> None: try: assert Gene(match_type=MatchType.NO_MATCH, **gene) except pydantic.ValidationError as e: - logger.warning(f'Unable to load {gene} due to validation error: ' f'{e}') + logger.warning(f"Unable to load {gene} due to validation error: " f"{e}") else: - concept_id = gene['concept_id'] - gene['label_and_type'] = f'{concept_id.lower()}##identity' - gene['src_name'] = self._src_name.value - gene['item_type'] = 'identity' + concept_id = gene["concept_id"] + gene["label_and_type"] = f"{concept_id.lower()}##identity" + gene["src_name"] = self._src_name.value + gene["item_type"] = "identity" for attr_type in ITEM_TYPES: if attr_type in gene: @@ -136,7 +136,7 @@ def get_seqrepo(self, seqrepo_dir: Path) -> SeqRepo: :return: SeqRepo instance """ if not Path(seqrepo_dir).exists(): - raise NotADirectoryError(f'Could not find {seqrepo_dir}') + raise NotADirectoryError(f"Could not find {seqrepo_dir}") return SeqRepo(seqrepo_dir) def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None: @@ -146,33 +146,33 @@ def _set_cl_interval_range(self, loc: str, arm_ix: int, location: Dict) -> None: :param arm_ix: The index of the q or p arm for a given location :param location: VRS chromosome location. This will be mutated. """ - range_ix = re.search('-', loc).start() # type: ignore + range_ix = re.search("-", loc).start() # type: ignore start = loc[arm_ix:range_ix] - start_arm_ix = re.search('[pq]', start).start() # type: ignore + start_arm_ix = re.search("[pq]", start).start() # type: ignore start_arm = start[start_arm_ix] end = loc[range_ix + 1 :] - end_arm_match = re.search('[pq]', end) + end_arm_match = re.search("[pq]", end) if not end_arm_match: # Does not specify the arm, so use the same as start"s - end = f'{start[0]}{end}' - end_arm_match = re.search('[pq]', end) + end = f"{start[0]}{end}" + end_arm_match = re.search("[pq]", end) end_arm_ix = end_arm_match.start() # type: ignore end_arm = end[end_arm_ix] if (start_arm == end_arm and start > end) or ( - start_arm != end_arm and start_arm == 'p' and end_arm == 'q' + start_arm != end_arm and start_arm == "p" and end_arm == "q" ): - location['start'] = start - location['end'] = end + location["start"] = start + location["end"] = end elif (start_arm == end_arm and start < end) or ( - start_arm != end_arm and start_arm == 'q' and end_arm == 'p' + start_arm != end_arm and start_arm == "q" and end_arm == "p" ): - location['start'] = end - location['end'] = start + location["start"] = end + location["end"] = start # Add back once VRS Chromosome Location is supported in 2.0-alpha # def _get_chromosome_location(self, location: Dict, gene: Dict) -> Optional[Dict]: @@ -209,9 +209,9 @@ def _get_seq_id_aliases(self, seq_id: str) -> List[str]: """ aliases = [] try: - aliases = self.seqrepo.translate_alias(seq_id, target_namespaces='ga4gh') + aliases = self.seqrepo.translate_alias(seq_id, target_namespaces="ga4gh") except KeyError as e: - logger.warning(f'SeqRepo raised KeyError: {e}') + logger.warning(f"SeqRepo raised KeyError: {e}") return aliases def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Dict: @@ -230,7 +230,7 @@ def _get_sequence_location(self, seq_id: str, gene: Feature, params: Dict) -> Di sequence = aliases[0] - if gene.start != '.' and gene.end != '.' and sequence: + if gene.start != "." and gene.end != "." and sequence: if 0 <= gene.start <= gene.end: # type: ignore location = GeneSequenceLocation( start=gene.start - 1, # type: ignore diff --git a/src/gene/etl/ensembl.py b/src/gene/etl/ensembl.py index 4e775afd..4a52975a 100644 --- a/src/gene/etl/ensembl.py +++ b/src/gene/etl/ensembl.py @@ -12,7 +12,7 @@ ) from gene.schemas import NamespacePrefix, SourceMeta, SourceName, Strand -logger = logging.getLogger('gene') +logger = logging.getLogger("gene") logger.setLevel(logging.DEBUG) @@ -30,36 +30,36 @@ def _extract_data(self, use_existing: bool) -> None: self._data_file, raw_version = self._data_source.get_latest( from_local=use_existing ) - match = re.match(r'(GRCh\d+)_(\d+)', raw_version) + match = re.match(r"(GRCh\d+)_(\d+)", raw_version) self._assembly = match.groups()[0] self._version = match.groups()[1] def _transform_data(self) -> None: """Transform the Ensembl source.""" - logger.info('Transforming Ensembl...') + logger.info("Transforming Ensembl...") db = gffutils.create_db( str(self._data_file), - dbfn=':memory:', + dbfn=":memory:", force=True, - merge_strategy='create_unique', + merge_strategy="create_unique", keep_order=True, ) # Get accession numbers accession_numbers = dict() - for item in db.features_of_type('scaffold'): - accession_numbers[item[0]] = item[8]['Alias'][-1] - for item in db.features_of_type('chromosome'): - accession_numbers[item[0]] = item[8]['Alias'][-1] + for item in db.features_of_type("scaffold"): + accession_numbers[item[0]] = item[8]["Alias"][-1] + for item in db.features_of_type("chromosome"): + accession_numbers[item[0]] = item[8]["Alias"][-1] for f in db.all_features(): - if f.attributes.get('ID'): - f_id = f.attributes.get('ID')[0].split(':')[0] - if f_id == 'gene': + if f.attributes.get("ID"): + f_id = f.attributes.get("ID")[0].split(":")[0] + if f_id == "gene": gene = self._add_gene(f, accession_numbers) if gene: self._load_gene(gene) - logger.info('Successfully transformed Ensembl.') + logger.info("Successfully transformed Ensembl.") def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict: """Create a transformed gene record. @@ -69,19 +69,19 @@ def _add_gene(self, f: Feature, accession_numbers: Dict) -> Dict: :return: A gene dictionary containing data if the ID attribute exists. """ gene = dict() - if f.strand == '-': - gene['strand'] = Strand.REVERSE.value - elif f.strand == '+': - gene['strand'] = Strand.FORWARD.value - gene['src_name'] = SourceName.ENSEMBL.value + if f.strand == "-": + gene["strand"] = Strand.REVERSE.value + elif f.strand == "+": + gene["strand"] = Strand.FORWARD.value + gene["src_name"] = SourceName.ENSEMBL.value self._add_attributes(f, gene) location = self._add_location(f, gene, accession_numbers) if location: - gene['locations'] = [location] + gene["locations"] = [location] - gene['label_and_type'] = f"{gene['concept_id'].lower()}##identity" - gene['item_type'] = 'identity' + gene["label_and_type"] = f"{gene['concept_id'].lower()}##identity" + gene["item_type"] = "identity" return gene @@ -92,10 +92,10 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None: :param gene: A transformed gene record """ attributes = { - 'ID': 'concept_id', - 'Name': 'symbol', - 'description': 'xrefs', - 'biotype': 'gene_type', + "ID": "concept_id", + "Name": "symbol", + "description": "xrefs", + "biotype": "gene_type", } for attribute in f.attributes.items(): @@ -106,30 +106,30 @@ def _add_attributes(self, f: Feature, gene: Dict) -> None: if len(val) == 1: val = val[0] - if key == 'ID': - if val.startswith('gene'): + if key == "ID": + if val.startswith("gene"): val = ( f"{NamespacePrefix.ENSEMBL.value}:" f"{val.split(':')[1]}" ) - if key == 'description': - gene['label'] = val.split('[')[0].strip() - if 'Source:' in val: + if key == "description": + gene["label"] = val.split("[")[0].strip() + if "Source:" in val: src_name = ( - val.split('[')[-1] - .split('Source:')[-1] - .split('Acc')[0] - .split(';')[0] + val.split("[")[-1] + .split("Source:")[-1] + .split("Acc")[0] + .split(";")[0] ) - src_id = val.split('Acc:')[-1].split(']')[0] - if ':' in src_id: - src_id = src_id.split(':')[-1] + src_id = val.split("Acc:")[-1].split("]")[0] + if ":" in src_id: + src_id = src_id.split(":")[-1] source = self._get_xref_associated_with(src_name, src_id) - if 'xrefs' in source: - gene['xrefs'] = source['xrefs'] - elif 'associated_with' in source: - gene['associated_with'] = source['associated_with'] + if "xrefs" in source: + gene["xrefs"] = source["xrefs"] + elif "associated_with" in source: + gene["associated_with"] = source["associated_with"] continue gene[attributes[key]] = val @@ -153,16 +153,16 @@ def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: :return: A dict containing an other identifier or xref """ source = dict() - if src_name.startswith('HGNC'): - source['xrefs'] = [f'{NamespacePrefix.HGNC.value}:{src_id}'] - elif src_name.startswith('NCBI'): - source['xrefs'] = [f'{NamespacePrefix.NCBI.value}:{src_id}'] - elif src_name.startswith('UniProt'): - source['associated_with'] = [f'{NamespacePrefix.UNIPROT.value}:{src_id}'] - elif src_name.startswith('miRBase'): - source['associated_with'] = [f'{NamespacePrefix.MIRBASE.value}:{src_id}'] - elif src_name.startswith('RFAM'): - source['associated_with'] = [f'{NamespacePrefix.RFAM.value}:{src_id}'] + if src_name.startswith("HGNC"): + source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"] + elif src_name.startswith("NCBI"): + source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"] + elif src_name.startswith("UniProt"): + source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"] + elif src_name.startswith("miRBase"): + source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"] + elif src_name.startswith("RFAM"): + source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"] return source def _add_meta(self) -> None: @@ -172,21 +172,21 @@ def _add_meta(self) -> None: """ if not self._version or not self._assembly: raise GeneNormalizerEtlError( - 'Source metadata unavailable -- was data properly acquired before attempting to load DB?' + "Source metadata unavailable -- was data properly acquired before attempting to load DB?" ) metadata = SourceMeta( - data_license='custom', - data_license_url='https://useast.ensembl.org/info/about' - '/legal/disclaimer.html', + data_license="custom", + data_license_url="https://useast.ensembl.org/info/about" + "/legal/disclaimer.html", version=self._version, data_url={ - 'genome_annotations': f'ftp://ftp.ensembl.org/pub/release-{self._version}/gff3/homo_sapiens/Homo_sapiens.{self._assembly}.{self._version}.gff3.gz' + "genome_annotations": f"ftp://ftp.ensembl.org/pub/release-{self._version}/gff3/homo_sapiens/Homo_sapiens.{self._assembly}.{self._version}.gff3.gz" }, rdp_url=None, data_license_attributes={ - 'non_commercial': False, - 'share_alike': False, - 'attribution': False, + "non_commercial": False, + "share_alike": False, + "attribution": False, }, genome_assemblies=[self._assembly], ) diff --git a/src/gene/etl/hgnc.py b/src/gene/etl/hgnc.py index 5e4f7c2a..2fee6117 100644 --- a/src/gene/etl/hgnc.py +++ b/src/gene/etl/hgnc.py @@ -18,7 +18,7 @@ SymbolStatus, ) -logger = logging.getLogger('gene') +logger = logging.getLogger("gene") logger.setLevel(logging.DEBUG) @@ -27,38 +27,38 @@ class HGNC(Base): def _transform_data(self) -> None: """Transform the HGNC source.""" - logger.info('Transforming HGNC...') - with open(self._data_file, 'r') as f: # type: ignore + logger.info("Transforming HGNC...") + with open(self._data_file, "r") as f: # type: ignore data = json.load(f) - records = data['response']['docs'] + records = data["response"]["docs"] for r in records: gene = dict() - gene['concept_id'] = r['hgnc_id'].lower() - gene['label_and_type'] = f"{gene['concept_id']}##identity" - gene['item_type'] = 'identity' - gene['symbol'] = r['symbol'] - gene['label'] = r['name'] - gene['src_name'] = SourceName.HGNC.value - if r['status']: - if r['status'] == 'Approved': - gene['symbol_status'] = SymbolStatus.APPROVED.value - elif r['status'] == 'Entry Withdrawn': - gene['symbol_status'] = SymbolStatus.WITHDRAWN.value - gene['src_name'] = SourceName.HGNC.value + gene["concept_id"] = r["hgnc_id"].lower() + gene["label_and_type"] = f"{gene['concept_id']}##identity" + gene["item_type"] = "identity" + gene["symbol"] = r["symbol"] + gene["label"] = r["name"] + gene["src_name"] = SourceName.HGNC.value + if r["status"]: + if r["status"] == "Approved": + gene["symbol_status"] = SymbolStatus.APPROVED.value + elif r["status"] == "Entry Withdrawn": + gene["symbol_status"] = SymbolStatus.WITHDRAWN.value + gene["src_name"] = SourceName.HGNC.value # store alias, xref, associated_with, prev_symbols, location self._get_aliases(r, gene) self._get_xrefs_associated_with(r, gene) - if 'prev_symbol' in r: + if "prev_symbol" in r: self._get_previous_symbols(r, gene) - if 'location' in r: + if "location" in r: self._get_location(r, gene) - if 'locus_type' in r: - gene['gene_type'] = r['locus_type'] + if "locus_type" in r: + gene["gene_type"] = r["locus_type"] self._load_gene(gene) - logger.info('Successfully transformed HGNC.') + logger.info("Successfully transformed HGNC.") def _get_aliases(self, r: Dict, gene: Dict) -> None: """Store aliases in a gene record. @@ -68,14 +68,14 @@ def _get_aliases(self, r: Dict, gene: Dict) -> None: """ alias_symbol = list() enzyme_id = list() - if 'alias_symbol' in r: - alias_symbol = r['alias_symbol'] + if "alias_symbol" in r: + alias_symbol = r["alias_symbol"] - if 'enzyme_id' in r: - enzyme_id = r['enzyme_id'] + if "enzyme_id" in r: + enzyme_id = r["enzyme_id"] if alias_symbol or enzyme_id: - gene['aliases'] = list(set(alias_symbol + enzyme_id)) + gene["aliases"] = list(set(alias_symbol + enzyme_id)) def _get_previous_symbols(self, r: Dict, gene: Dict) -> None: """Store previous symbols in a gene record. @@ -83,9 +83,9 @@ def _get_previous_symbols(self, r: Dict, gene: Dict) -> None: :param r: A gene record in the HGNC data file :param gene: A transformed gene record """ - prev_symbols = r['prev_symbol'] + prev_symbols = r["prev_symbol"] if prev_symbols: - gene['previous_symbols'] = list(set(prev_symbols)) + gene["previous_symbols"] = list(set(prev_symbols)) def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: """Store xrefs and/or associated_with refs in a gene record. @@ -96,40 +96,40 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: xrefs = list() associated_with = list() sources = [ - 'entrez_id', - 'ensembl_gene_id', - 'vega_id', - 'ucsc_id', - 'ccds_id', - 'uniprot_ids', - 'pubmed_id', - 'cosmic', - 'omim_id', - 'mirbase', - 'homeodb', - 'snornabase', - 'orphanet', - 'horde_id', - 'merops', - 'imgt', - 'iuphar', - 'kznf_gene_catalog', - 'mamit-trnadb', - 'cd', - 'lncrnadb', - 'ena', - 'pseudogene.org', - 'refseq_accession', + "entrez_id", + "ensembl_gene_id", + "vega_id", + "ucsc_id", + "ccds_id", + "uniprot_ids", + "pubmed_id", + "cosmic", + "omim_id", + "mirbase", + "homeodb", + "snornabase", + "orphanet", + "horde_id", + "merops", + "imgt", + "iuphar", + "kznf_gene_catalog", + "mamit-trnadb", + "cd", + "lncrnadb", + "ena", + "pseudogene.org", + "refseq_accession", ] for src in sources: if src in r: - if '-' in src: - key = src.split('-')[0] - elif '.' in src: - key = src.split('.')[0] - elif '_' in src: - key = src.split('_')[0] + if "-" in src: + key = src.split("-")[0] + elif "." in src: + key = src.split(".")[0] + elif "_" in src: + key = src.split("_")[0] else: key = src @@ -139,12 +139,12 @@ def _get_xrefs_associated_with(self, r: Dict, gene: Dict) -> None: else: self._get_xref_associated_with(key, src, r, associated_with) else: - logger.warning(f'{key} not in schemas.py') + logger.warning(f"{key} not in schemas.py") if xrefs: - gene['xrefs'] = xrefs + gene["xrefs"] = xrefs if associated_with: - gene['associated_with'] = associated_with + gene["associated_with"] = associated_with def _get_xref_associated_with( self, key: str, src: str, r: Dict, src_type: Dict @@ -158,11 +158,11 @@ def _get_xref_associated_with( """ if isinstance(r[src], list): for xref in r[src]: - src_type.append(f'{NamespacePrefix[key.upper()].value}:{xref}') + src_type.append(f"{NamespacePrefix[key.upper()].value}:{xref}") else: - if isinstance(r[src], str) and ':' in r[src]: - r[src] = r[src].split(':')[-1].strip() - src_type.append(f'{NamespacePrefix[key.upper()].value}' f':{r[src]}') + if isinstance(r[src], str) and ":" in r[src]: + r[src] = r[src].split(":")[-1].strip() + src_type.append(f"{NamespacePrefix[key.upper()].value}" f":{r[src]}") def _get_location(self, r: Dict, gene: Dict) -> None: """Store GA4GH VRS ChromosomeLocation in a gene record. @@ -172,20 +172,20 @@ def _get_location(self, r: Dict, gene: Dict) -> None: :param gene: A transformed gene record """ # Get list of a gene's map locations - if 'and' in r['location']: - locations = r['location'].split('and') + if "and" in r["location"]: + locations = r["location"].split("and") else: - locations = [r['location']] + locations = [r["location"]] location_list = list() - gene['location_annotations'] = list() + gene["location_annotations"] = list() for loc in locations: loc = loc.strip() loc = self._set_annotation(loc, gene) if loc: - if loc == 'mitochondria': - gene['location_annotations'].append(Chromosome.MITOCHONDRIA.value) + if loc == "mitochondria": + gene["location_annotations"].append(Chromosome.MITOCHONDRIA.value) else: location = dict() self._set_location(loc, location, gene) @@ -194,9 +194,9 @@ def _get_location(self, r: Dict, gene: Dict) -> None: # location_list.append(chr_location) if location_list: - gene['locations'] = location_list - if not gene['location_annotations']: - del gene['location_annotations'] + gene["locations"] = location_list + if not gene["location_annotations"]: + del gene["location_annotations"] def _set_annotation(self, loc: str, gene: Dict) -> None: """Set the annotations attribute if one is provided. @@ -210,7 +210,7 @@ def _set_annotation(self, loc: str, gene: Dict) -> None: for annotation in annotations: if annotation in loc: - gene['location_annotations'].append(annotation) + gene["location_annotations"].append(annotation) # Check if location is also included loc = loc.split(annotation)[0].strip() if not loc: @@ -224,24 +224,24 @@ def _set_location(self, loc: str, location: Dict, gene: Dict) -> None: :param location: GA4GH location :param gene: A transformed gene record """ - arm_match = re.search('[pq]', loc) + arm_match = re.search("[pq]", loc) if arm_match: # Location gives arm and sub / sub band arm_ix = arm_match.start() - location['chr'] = loc[:arm_ix] + location["chr"] = loc[:arm_ix] - if '-' in loc: + if "-" in loc: # Location gives both start and end self._set_cl_interval_range(loc, arm_ix, location) else: # Location only gives start start = loc[arm_ix:] - location['start'] = start - location['end'] = start + location["start"] = start + location["end"] = start else: # Only gives chromosome - gene['location_annotations'].append(loc) + gene["location_annotations"].append(loc) def _add_meta(self) -> None: """Add HGNC metadata. @@ -250,20 +250,20 @@ def _add_meta(self) -> None: """ if not self._version: raise GeneNormalizerEtlError( - 'Source metadata unavailable -- was data properly acquired before attempting to load DB?' + "Source metadata unavailable -- was data properly acquired before attempting to load DB?" ) metadata = SourceMeta( - data_license='CC0', - data_license_url='https://www.genenames.org/about/license/', + data_license="CC0", + data_license_url="https://www.genenames.org/about/license/", version=self._version, data_url={ - 'complete_set_archive': 'ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json' + "complete_set_archive": "ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" }, rdp_url=None, data_license_attributes={ - 'non_commercial': False, - 'share_alike': False, - 'attribution': False, + "non_commercial": False, + "share_alike": False, + "attribution": False, }, genome_assemblies=[], ) diff --git a/src/gene/etl/merge.py b/src/gene/etl/merge.py index 9121e498..8124d294 100644 --- a/src/gene/etl/merge.py +++ b/src/gene/etl/merge.py @@ -7,7 +7,7 @@ from gene.database.database import DatabaseWriteException from gene.schemas import GeneTypeFieldName, RecordType, SourcePriority -logger = logging.getLogger('gene') +logger = logging.getLogger("gene") logger.setLevel(logging.DEBUG) @@ -28,7 +28,7 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: :param record_ids: concept identifiers from which groups should be generated. Should *not* include any records from excluded sources. """ - logger.info('Generating record ID sets...') + logger.info("Generating record ID sets...") start = timer() for record_id in record_ids: new_group = self._create_record_id_set(record_id) @@ -36,11 +36,11 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: for concept_id in new_group: self._groups[concept_id] = new_group end = timer() - logger.debug(f'Built record ID sets in {end - start} seconds') + logger.debug(f"Built record ID sets in {end - start} seconds") self._groups = {k: v for k, v in self._groups.items() if len(v) > 1} - logger.info('Creating merged records and updating database...') + logger.info("Creating merged records and updating database...") uploaded_ids = set() start = timer() for record_id, group in self._groups.items(): @@ -53,22 +53,22 @@ def create_merged_concepts(self, record_ids: Set[str]) -> None: # add updated references for concept_id in group: - merge_ref = merged_record['concept_id'] + merge_ref = merged_record["concept_id"] try: self._database.update_merge_ref(concept_id, merge_ref) except DatabaseWriteException as dw: - if str(dw).startswith('No such record exists'): + if str(dw).startswith("No such record exists"): logger.error( - f'Updating nonexistent record: {concept_id} ' - f'for merge ref to {merge_ref}' + f"Updating nonexistent record: {concept_id} " + f"for merge ref to {merge_ref}" ) else: logger.error(str(dw)) uploaded_ids |= group self._database.complete_write_transaction() - logger.info('Merged concept generation successful.') + logger.info("Merged concept generation successful.") end = timer() - logger.debug(f'Generated and added concepts in {end - start} seconds') + logger.debug(f"Generated and added concepts in {end - start} seconds") def _create_record_id_set( self, record_id: str, observed_id_set: Optional[Set] = None @@ -89,15 +89,15 @@ def _create_record_id_set( db_record = self._database.get_record_by_id(record_id) if not db_record: logger.warning( - f'Record ID set creator could not resolve ' - f'lookup for {record_id} in ID set: ' - f'{observed_id_set}' + f"Record ID set creator could not resolve " + f"lookup for {record_id} in ID set: " + f"{observed_id_set}" ) return observed_id_set - {record_id} - record_xrefs = db_record.get('xrefs') + record_xrefs = db_record.get("xrefs") if not record_xrefs: - return observed_id_set | {db_record['concept_id']} + return observed_id_set | {db_record["concept_id"]} else: local_id_set = set(record_xrefs) merged_id_set = {record_id} | observed_id_set @@ -125,40 +125,40 @@ def _generate_merged_record(self, record_id_set: Set[str]) -> Dict: records.append(record) else: logger.error( - f'Merge record generator could not retrieve ' - f'record for {record_id} in {record_id_set}' + f"Merge record generator could not retrieve " + f"record for {record_id} in {record_id_set}" ) def record_order(record: Dict) -> Tuple: """Provide priority values of concepts for sort function.""" - src = record['src_name'].upper() + src = record["src_name"].upper() if src in SourcePriority.__members__: source_rank = SourcePriority[src].value else: raise Exception( f"Prohibited source: {src} in concept_id " f"{record['concept_id']}" ) - return source_rank, record['concept_id'] + return source_rank, record["concept_id"] records.sort(key=record_order) # initialize merged record merged_attrs = { - 'concept_id': records[0]['concept_id'], - 'aliases': set(), - 'associated_with': set(), - 'previous_symbols': set(), - 'hgnc_locus_type': set(), - 'ncbi_gene_type': set(), - 'ensembl_biotype': set(), - 'strand': set(), + "concept_id": records[0]["concept_id"], + "aliases": set(), + "associated_with": set(), + "previous_symbols": set(), + "hgnc_locus_type": set(), + "ncbi_gene_type": set(), + "ensembl_biotype": set(), + "strand": set(), } if len(records) > 1: - merged_attrs['xrefs'] = list({r['concept_id'] for r in records[1:]}) + merged_attrs["xrefs"] = list({r["concept_id"] for r in records[1:]}) # merge from constituent records - set_fields = ['aliases', 'associated_with', 'previous_symbols', 'strand'] - scalar_fields = ['symbol', 'symbol_status', 'label', 'location_annotations'] + set_fields = ["aliases", "associated_with", "previous_symbols", "strand"] + scalar_fields = ["symbol", "symbol_status", "label", "location_annotations"] for record in records: for field in set_fields: merged_attrs[field] |= set(record.get(field, set())) @@ -167,19 +167,19 @@ def record_order(record: Dict) -> Tuple: if field not in merged_attrs and field in record: merged_attrs[field] = record[field] - locations = record.get('locations') + locations = record.get("locations") if locations: merged_attrs[f"{record['src_name'].lower()}_locations"] = locations - gene_type = record.get('gene_type') + gene_type = record.get("gene_type") if gene_type: - merged_field = GeneTypeFieldName[record['src_name'].upper()] + merged_field = GeneTypeFieldName[record["src_name"].upper()] merged_attrs[merged_field] |= {gene_type} for field in set_fields + [ - 'hgnc_locus_type', - 'ncbi_gene_type', - 'ensembl_biotype', + "hgnc_locus_type", + "ncbi_gene_type", + "ensembl_biotype", ]: field_value = merged_attrs[field] if field_value: @@ -188,12 +188,12 @@ def record_order(record: Dict) -> Tuple: del merged_attrs[field] # ensure no conflicting strands - unique_strand_values = set(merged_attrs.get('strand', [])) + unique_strand_values = set(merged_attrs.get("strand", [])) num_unique_strand_values = len(unique_strand_values) if num_unique_strand_values > 1: - del merged_attrs['strand'] + del merged_attrs["strand"] elif num_unique_strand_values == 1: - merged_attrs['strand'] = list(unique_strand_values)[0] + merged_attrs["strand"] = list(unique_strand_values)[0] - merged_attrs['item_type'] = RecordType.MERGER.value + merged_attrs["item_type"] = RecordType.MERGER.value return merged_attrs diff --git a/src/gene/etl/ncbi.py b/src/gene/etl/ncbi.py index ba675dc3..a3b2e706 100644 --- a/src/gene/etl/ncbi.py +++ b/src/gene/etl/ncbi.py @@ -24,7 +24,7 @@ SymbolStatus, ) -logger = logging.getLogger('gene') +logger = logging.getLogger("gene") logger.setLevel(logging.DEBUG) @@ -63,10 +63,10 @@ def _extract_data(self, use_existing: bool) -> None: self._info_src = gene_paths.gene_info self._history_src = gene_paths.gene_history self._gene_url = ( - 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz' + "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz" ) - self._history_url = 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz' - self._assembly_url = 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/' + self._history_url = "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz" + self._assembly_url = "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/" def _get_prev_symbols(self) -> Dict[str, str]: """Store a gene's symbol history. @@ -74,14 +74,14 @@ def _get_prev_symbols(self) -> Dict[str, str]: :return: A dictionary of a gene's previous symbols """ # get symbol history - history_file = open(self._history_src, 'r') - history = csv.reader(history_file, delimiter='\t') + history_file = open(self._history_src, "r") + history = csv.reader(history_file, delimiter="\t") next(history) prev_symbols = {} for row in history: # Only interested in rows that have homo sapiens tax id - if row[0] == '9606': - if row[1] != '-': + if row[0] == "9606": + if row[1] != "-": gene_id = row[1] if gene_id in prev_symbols.keys(): prev_symbols[gene_id].append(row[3]) @@ -90,9 +90,9 @@ def _get_prev_symbols(self) -> Dict[str, str]: else: # Load discontinued genes params = { - 'concept_id': f'{NamespacePrefix.NCBI.value}:{row[2]}', - 'symbol': row[3], - 'symbol_status': SymbolStatus.DISCONTINUED.value, + "concept_id": f"{NamespacePrefix.NCBI.value}:{row[2]}", + "symbol": row[3], + "symbol_status": SymbolStatus.DISCONTINUED.value, } self._load_gene(params) history_file.close() @@ -104,37 +104,37 @@ def _add_xrefs_associated_with(self, val: List[str], params: Dict) -> None: :param val: A list of source ids for a given gene :param params: A transformed gene record """ - params['xrefs'] = [] - params['associated_with'] = [] + params["xrefs"] = [] + params["associated_with"] = [] for src in val: - src_name = src.split(':')[0].upper() - src_id = src.split(':')[-1] - if src_name == 'GENEID': - params['concept_id'] = f'{NamespacePrefix.NCBI.value}:{src_id}' + src_name = src.split(":")[0].upper() + src_id = src.split(":")[-1] + if src_name == "GENEID": + params["concept_id"] = f"{NamespacePrefix.NCBI.value}:{src_id}" elif ( src_name in NamespacePrefix.__members__ and NamespacePrefix[src_name].value in PREFIX_LOOKUP ): - params['xrefs'].append( - f'{NamespacePrefix[src_name].value}' f':{src_id}' + params["xrefs"].append( + f"{NamespacePrefix[src_name].value}" f":{src_id}" ) else: - if src_name.startswith('MIM'): + if src_name.startswith("MIM"): prefix = NamespacePrefix.OMIM.value - elif src_name.startswith('IMGT/GENE-DB'): + elif src_name.startswith("IMGT/GENE-DB"): prefix = NamespacePrefix.IMGT_GENE_DB.value - elif src_name.startswith('MIRBASE'): + elif src_name.startswith("MIRBASE"): prefix = NamespacePrefix.MIRBASE.value else: prefix = None if prefix: - params['associated_with'].append(f'{prefix}:{src_id}') + params["associated_with"].append(f"{prefix}:{src_id}") else: - logger.info(f'{src_name} is not in NameSpacePrefix.') - if not params['xrefs']: - del params['xrefs'] - if not params['associated_with']: - del params['associated_with'] + logger.info(f"{src_name} is not in NameSpacePrefix.") + if not params["xrefs"]: + del params["xrefs"] + if not params["associated_with"]: + del params["associated_with"] def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: """Store genes from NCBI info file. @@ -143,42 +143,42 @@ def _get_gene_info(self, prev_symbols: Dict[str, str]) -> Dict[str, str]: :return: A dictionary of gene's from the NCBI info file. """ # open info file, skip headers - info_file = open(self._info_src, 'r') - info = csv.reader(info_file, delimiter='\t') + info_file = open(self._info_src, "r") + info = csv.reader(info_file, delimiter="\t") next(info) info_genes = dict() for row in info: params = dict() - params['concept_id'] = f'{NamespacePrefix.NCBI.value}:{row[1]}' + params["concept_id"] = f"{NamespacePrefix.NCBI.value}:{row[1]}" # get symbol - params['symbol'] = row[2] + params["symbol"] = row[2] # get aliases - if row[4] != '-': - params['aliases'] = row[4].split('|') + if row[4] != "-": + params["aliases"] = row[4].split("|") else: - params['aliases'] = [] + params["aliases"] = [] # get associated_with - if row[5] != '-': - associated_with = row[5].split('|') + if row[5] != "-": + associated_with = row[5].split("|") self._add_xrefs_associated_with(associated_with, params) # get chromosome location vrs_chr_location = self._get_vrs_chr_location(row, params) - if 'exclude' in vrs_chr_location: + if "exclude" in vrs_chr_location: # Exclude genes with multiple distinct locations (e.g. OMS) continue if not vrs_chr_location: vrs_chr_location = [] - params['locations'] = vrs_chr_location + params["locations"] = vrs_chr_location # get label - if row[8] != '-': - params['label'] = row[8] + if row[8] != "-": + params["label"] = row[8] # add prev symbols if row[1] in prev_symbols.keys(): - params['previous_symbols'] = prev_symbols[row[1]] - info_genes[params['symbol']] = params + params["previous_symbols"] = prev_symbols[row[1]] + info_genes[params["symbol"]] = params # get type - params['gene_type'] = row[9] + params["gene_type"] = row[9] return info_genes def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None: @@ -188,20 +188,20 @@ def _get_gene_gff(self, db: gffutils.FeatureDB, info_genes: Dict) -> None: :param info_genes: A dictionary of gene's from the NCBI info file. """ for f in db.all_features(): - if f.attributes.get('ID'): - f_id = f.attributes.get('ID')[0] - if f_id.startswith('gene'): - symbol = f.attributes['Name'][0] + if f.attributes.get("ID"): + f_id = f.attributes.get("ID")[0] + if f_id.startswith("gene"): + symbol = f.attributes["Name"][0] if symbol in info_genes: # Just need to add SequenceLocation params = info_genes.get(symbol) vrs_sq_location = self._get_vrs_sq_location(db, params, f_id) if vrs_sq_location: - params['locations'].append(vrs_sq_location) # type: ignore + params["locations"].append(vrs_sq_location) # type: ignore else: # Need to add entire gene gene = self._add_gff_gene(db, f, f_id) - info_genes[gene['symbol']] = gene + info_genes[gene["symbol"]] = gene def _add_gff_gene( self, db: gffutils.FeatureDB, f: gffutils.Feature, f_id: str @@ -214,14 +214,14 @@ def _add_gff_gene( :return: A gene dictionary if the ID attribute exists. Else return None. """ params = dict() - params['src_name'] = SourceName.NCBI.value + params["src_name"] = SourceName.NCBI.value self._add_attributes(f, params) sq_loc = self._get_vrs_sq_location(db, params, f_id) if sq_loc: - params['locations'] = [sq_loc] + params["locations"] = [sq_loc] else: - params['locations'] = list() - params['label_and_type'] = f"{params['concept_id'].lower()}##identity" + params["locations"] = list() + params["label_and_type"] = f"{params['concept_id'].lower()}##identity" return params def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None: @@ -230,20 +230,20 @@ def _add_attributes(self, f: gffutils.feature.Feature, gene: Dict) -> None: :param gffutils.feature.Feature f: A gene from the data :param gene: A transformed gene record """ - attributes = ['ID', 'Name', 'description', 'Dbxref'] + attributes = ["ID", "Name", "description", "Dbxref"] for attribute in f.attributes.items(): key = attribute[0] if key in attributes: val = attribute[1] - if len(val) == 1 and key != 'Dbxref': + if len(val) == 1 and key != "Dbxref": val = val[0] - if key == 'Dbxref': + if key == "Dbxref": self._add_xrefs_associated_with(val, gene) - elif key == 'Name': - gene['symbol'] = val + elif key == "Name": + gene["symbol"] = val def _get_vrs_sq_location( self, db: gffutils.FeatureDB, params: Dict, f_id: str @@ -257,7 +257,7 @@ def _get_vrs_sq_location( :return: A GA4GH VRS SequenceLocation """ gene = db[f_id] - params['strand'] = gene.strand + params["strand"] = gene.strand return self._get_sequence_location(gene.seqid, gene, params) def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: @@ -268,16 +268,16 @@ def _get_xref_associated_with(self, src_name: str, src_id: str) -> Dict: :return: A dict containing an xref or associated_with ref """ source = dict() - if src_name.startswith('HGNC'): - source['xrefs'] = [f'{NamespacePrefix.HGNC.value}:{src_id}'] - elif src_name.startswith('NCBI'): - source['xrefs'] = [f'{NamespacePrefix.NCBI.value}:{src_id}'] - elif src_name.startswith('UniProt'): - source['associated_with'] = [f'{NamespacePrefix.UNIPROT.value}:{src_id}'] - elif src_name.startswith('miRBase'): - source['associated_with'] = [f'{NamespacePrefix.MIRBASE.value}:{src_id}'] - elif src_name.startswith('RFAM'): - source['associated_with'] = [f'{NamespacePrefix.RFAM.value}:{src_id}'] + if src_name.startswith("HGNC"): + source["xrefs"] = [f"{NamespacePrefix.HGNC.value}:{src_id}"] + elif src_name.startswith("NCBI"): + source["xrefs"] = [f"{NamespacePrefix.NCBI.value}:{src_id}"] + elif src_name.startswith("UniProt"): + source["associated_with"] = [f"{NamespacePrefix.UNIPROT.value}:{src_id}"] + elif src_name.startswith("miRBase"): + source["associated_with"] = [f"{NamespacePrefix.MIRBASE.value}:{src_id}"] + elif src_name.startswith("RFAM"): + source["associated_with"] = [f"{NamespacePrefix.RFAM.value}:{src_id}"] return source def _get_vrs_chr_location(self, row: List[str], params: Dict) -> List: @@ -288,24 +288,24 @@ def _get_vrs_chr_location(self, row: List[str], params: Dict) -> List: :param params: A transformed gene record :return: A list of GA4GH VRS ChromosomeLocations """ - params['location_annotations'] = list() + params["location_annotations"] = list() chromosomes_locations = self._set_chromsomes_locations(row, params) - locations = chromosomes_locations['locations'] - chromosomes = chromosomes_locations['chromosomes'] - if chromosomes_locations['exclude']: - return ['exclude'] + locations = chromosomes_locations["locations"] + chromosomes = chromosomes_locations["chromosomes"] + if chromosomes_locations["exclude"]: + return ["exclude"] location_list = list() if chromosomes and not locations: for chromosome in chromosomes: - if chromosome == 'MT': - params['location_annotations'].append(Chromosome.MITOCHONDRIA.value) + if chromosome == "MT": + params["location_annotations"].append(Chromosome.MITOCHONDRIA.value) else: - params['location_annotations'].append(chromosome.strip()) + params["location_annotations"].append(chromosome.strip()) elif locations: self._add_chromosome_location(locations, location_list, params) - if not params['location_annotations']: - del params['location_annotations'] + if not params["location_annotations"]: + del params["location_annotations"] return location_list def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: @@ -316,29 +316,29 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: :return: A dictionary containing a gene's chromosomes and locations """ chromosomes = None - if row[6] != '-': - if '|' in row[6]: - chromosomes = row[6].split('|') + if row[6] != "-": + if "|" in row[6]: + chromosomes = row[6].split("|") else: chromosomes = [row[6]] if len(chromosomes) >= 2: - if chromosomes and 'X' not in chromosomes and 'Y' not in chromosomes: + if chromosomes and "X" not in chromosomes and "Y" not in chromosomes: logger.info( - f'{row[2]} contains multiple distinct ' - f'chromosomes: {chromosomes}.' + f"{row[2]} contains multiple distinct " + f"chromosomes: {chromosomes}." ) chromosomes = None locations = None exclude = False - if row[7] != '-': - if '|' in row[7]: - locations = row[7].split('|') - elif ';' in row[7]: - locations = row[7].split(';') - elif 'and' in row[7]: - locations = row[7].split('and') + if row[7] != "-": + if "|" in row[7]: + locations = row[7].split("|") + elif ";" in row[7]: + locations = row[7].split(";") + elif "and" in row[7]: + locations = row[7].split("and") else: locations = [row[7]] @@ -351,7 +351,7 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: # i.e. OMS: '10q26.3', '19q13.42-q13.43', '3p25.3' if len(locations) > 2: logger.info( - f'{row[2]} contains multiple distinct ' f'locations: {locations}.' + f"{row[2]} contains multiple distinct " f"locations: {locations}." ) locations = None exclude = True @@ -360,13 +360,13 @@ def _set_chromsomes_locations(self, row: List[str], params: Dict) -> Dict: if locations: for i in range(len(locations)): loc = locations[i].strip() - if not re.match('^([1-9][0-9]?|X[pq]?|Y[pq]?)', loc): + if not re.match("^([1-9][0-9]?|X[pq]?|Y[pq]?)", loc): logger.info( - f'{row[2]} contains invalid map location:' f'{loc}.' + f"{row[2]} contains invalid map location:" f"{loc}." ) - params['location_annotations'].append(loc) + params["location_annotations"].append(loc) del locations[i] - return {'locations': locations, 'chromosomes': chromosomes, 'exclude': exclude} + return {"locations": locations, "chromosomes": chromosomes, "exclude": exclude} def _add_chromosome_location( self, locations: List, location_list: List, params: Dict @@ -382,42 +382,42 @@ def _add_chromosome_location( location = dict() if Annotation.ALT_LOC.value in loc: - loc = loc.split(f'{Annotation.ALT_LOC.value}')[0].strip() - params['location_annotations'].append(Annotation.ALT_LOC.value) + loc = loc.split(f"{Annotation.ALT_LOC.value}")[0].strip() + params["location_annotations"].append(Annotation.ALT_LOC.value) contains_centromere = False - if 'cen' in loc: + if "cen" in loc: contains_centromere = True - arm_match = re.search('[pq]', loc) + arm_match = re.search("[pq]", loc) if arm_match and not contains_centromere: arm_ix = arm_match.start() chromosome = loc[:arm_ix].strip() # NCBI sometimes stores invalid map locations # i.e. 7637 stores 'map from Rosati ref via FISH [AFS]' - if not re.match('^([1-9][0-9]?|X|Y|MT)$', chromosome): + if not re.match("^([1-9][0-9]?|X|Y|MT)$", chromosome): continue - location['chr'] = chromosome + location["chr"] = chromosome # Check to see if there is a band / sub band included if arm_ix != len(loc) - 1: - if '-' in loc: + if "-" in loc: self._set_cl_interval_range(loc, arm_ix, location) else: # Location only gives start start = loc[arm_ix:] - location['start'] = start - location['end'] = start + location["start"] = start + location["end"] = start else: # Only arm is included - location['start'] = loc[arm_ix] - location['end'] = loc[arm_ix] + location["start"] = loc[arm_ix] + location["end"] = loc[arm_ix] elif contains_centromere: self._set_centromere_location(loc, location) else: # Location only gives chr - params['location_annotations'].append(loc) + params["location_annotations"].append(loc) # chr_location = self._get_chromosome_location(location, params) # if chr_location: @@ -429,36 +429,36 @@ def _set_centromere_location(self, loc: str, location: Dict) -> None: :param loc: A gene location :param location: GA4GH location """ - centromere_ix = re.search('cen', loc).start() # type: ignore - if '-' in loc: + centromere_ix = re.search("cen", loc).start() # type: ignore + if "-" in loc: # Location gives both start and end - range_ix = re.search('-', loc).start() # type: ignore - if 'q' in loc: - location['chr'] = loc[:centromere_ix].strip() - location['start'] = 'cen' - location['end'] = loc[range_ix + 1 :] - elif 'p' in loc: - p_ix = re.search('p', loc).start() # type: ignore - location['chr'] = loc[:p_ix].strip() - location['end'] = 'cen' - location['start'] = loc[:range_ix] + range_ix = re.search("-", loc).start() # type: ignore + if "q" in loc: + location["chr"] = loc[:centromere_ix].strip() + location["start"] = "cen" + location["end"] = loc[range_ix + 1 :] + elif "p" in loc: + p_ix = re.search("p", loc).start() # type: ignore + location["chr"] = loc[:p_ix].strip() + location["end"] = "cen" + location["start"] = loc[:range_ix] else: - location['chr'] = loc[:centromere_ix].strip() - location['start'] = 'cen' - location['end'] = 'cen' + location["chr"] = loc[:centromere_ix].strip() + location["start"] = "cen" + location["end"] = "cen" def _transform_data(self) -> None: """Modify data and pass to loading functions.""" - logger.info('Transforming NCBI...') + logger.info("Transforming NCBI...") prev_symbols = self._get_prev_symbols() info_genes = self._get_gene_info(prev_symbols) # create db for gff file db = gffutils.create_db( str(self._gff_src), - dbfn=':memory:', + dbfn=":memory:", force=True, - merge_strategy='create_unique', + merge_strategy="create_unique", keep_order=True, ) @@ -466,7 +466,7 @@ def _transform_data(self) -> None: for gene in info_genes.keys(): self._load_gene(info_genes[gene]) - logger.info('Successfully transformed NCBI.') + logger.info("Successfully transformed NCBI.") def _add_meta(self) -> None: """Add Ensembl metadata. @@ -483,22 +483,22 @@ def _add_meta(self) -> None: ] ): raise GeneNormalizerEtlError( - 'Source metadata unavailable -- was data properly acquired before attempting to load DB?' + "Source metadata unavailable -- was data properly acquired before attempting to load DB?" ) metadata = SourceMeta( - data_license='custom', - data_license_url='https://www.ncbi.nlm.nih.gov/home/about/policies/', + data_license="custom", + data_license_url="https://www.ncbi.nlm.nih.gov/home/about/policies/", version=self._version, data_url={ - 'info_file': self._gene_url, - 'history_file': self._history_url, - 'assembly_file': self._assembly_url, + "info_file": self._gene_url, + "history_file": self._history_url, + "assembly_file": self._assembly_url, }, - rdp_url='https://reusabledata.org/ncbi-gene.html', + rdp_url="https://reusabledata.org/ncbi-gene.html", data_license_attributes={ - 'non_commercial': False, - 'share_alike': False, - 'attribution': False, + "non_commercial": False, + "share_alike": False, + "attribution": False, }, genome_assemblies=[self._assembly], ) diff --git a/src/gene/main.py b/src/gene/main.py index 31db7076..7023471f 100644 --- a/src/gene/main.py +++ b/src/gene/main.py @@ -21,27 +21,27 @@ """ app = FastAPI( - title='Gene Normalizer', + title="Gene Normalizer", description=description, version=__version__, contact={ - 'name': 'Alex H. Wagner', - 'email': 'Alex.Wagner@nationwidechildrens.org', - 'url': 'https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab', + "name": "Alex H. Wagner", + "email": "Alex.Wagner@nationwidechildrens.org", + "url": "https://www.nationwidechildrens.org/specialties/institute-for-genomic-medicine/research-labs/wagner-lab", }, license={ - 'name': 'MIT', - 'url': 'https://github.com/cancervariants/gene-normalization/blob/main/LICENSE', + "name": "MIT", + "url": "https://github.com/cancervariants/gene-normalization/blob/main/LICENSE", }, - docs_url='/gene', - openapi_url='/gene/openapi.json', - swagger_ui_parameters={'tryItOutEnabled': True}, + docs_url="/gene", + openapi_url="/gene/openapi.json", + swagger_ui_parameters={"tryItOutEnabled": True}, ) -read_query_summary = 'Given query, provide best-matching source records.' -response_description = 'A response to a validly-formed query' -q_descr = 'Gene to normalize.' +read_query_summary = "Given query, provide best-matching source records." +response_description = "A response to a validly-formed query" +q_descr = "Gene to normalize." incl_descr = """Optional. Comma-separated list of source names to include in response. Will exclude all other sources. Returns HTTP status code 422: Unprocessable Entity if both 'incl' and 'excl' parameters @@ -51,18 +51,18 @@ code 422: Unprocessable Entity if both 'incl' and 'excl' parameters are given.""" search_description = ( - 'For each source, return strongest-match concepts ' - 'for query string provided by user' + "For each source, return strongest-match concepts " + "for query string provided by user" ) @app.get( - '/gene/search', + "/gene/search", summary=read_query_summary, response_description=response_description, response_model=SearchService, description=search_description, - tags=['Query'], + tags=["Query"], ) def search( q: str = Query(..., description=q_descr), @@ -87,20 +87,20 @@ def search( return resp -normalize_summary = 'Given query, provide merged normalized record.' -normalize_response_descr = 'A response to a validly-formed query.' -normalize_descr = 'Return merged highest-match concept for query.' -normalize_q_descr = 'Gene to normalize.' +normalize_summary = "Given query, provide merged normalized record." +normalize_response_descr = "A response to a validly-formed query." +normalize_descr = "Return merged highest-match concept for query." +normalize_q_descr = "Gene to normalize." @app.get( - '/gene/normalize', + "/gene/normalize", summary=normalize_summary, response_description=normalize_response_descr, response_model=NormalizeService, response_model_exclude_none=True, description=normalize_descr, - tags=['Query'], + tags=["Query"], ) def normalize(q: str = Query(..., description=normalize_q_descr)) -> NormalizeService: """Return strongest match concepts to query string provided by user. @@ -113,26 +113,26 @@ def normalize(q: str = Query(..., description=normalize_q_descr)) -> NormalizeSe unmerged_matches_summary = ( - 'Given query, provide source records corresponding to ' 'normalized concept.' + "Given query, provide source records corresponding to " "normalized concept." ) unmerged_response_descr = ( - 'Response containing source records contained within ' 'normalized concept.' + "Response containing source records contained within " "normalized concept." ) unmerged_normalize_description = ( - 'Return unmerged records associated with the ' - 'normalized result of the user-provided query ' - 'string.' + "Return unmerged records associated with the " + "normalized result of the user-provided query " + "string." ) @app.get( - '/gene/normalize_unmerged', + "/gene/normalize_unmerged", summary=unmerged_matches_summary, - operation_id='getUnmergedRecords', + operation_id="getUnmergedRecords", response_description=unmerged_response_descr, response_model=UnmergedNormalizationService, description=unmerged_normalize_description, - tags=['Query'], + tags=["Query"], ) def normalize_unmerged( q: str = Query(..., description=normalize_q_descr), diff --git a/src/gene/query.py b/src/gene/query.py index 59a5bd5b..504877bc 100644 --- a/src/gene/query.py +++ b/src/gene/query.py @@ -28,7 +28,7 @@ ) from gene.version import __version__ -NormService = TypeVar('NormService', bound=BaseNormalizationService) +NormService = TypeVar("NormService", bound=BaseNormalizationService) class InvalidParameterException(Exception): # noqa: N818 @@ -65,15 +65,15 @@ def _emit_warnings(query_str: str) -> List: :return: List of warnings """ warnings = [] - nbsp = re.search('\xa0| ', query_str) + nbsp = re.search("\xa0| ", query_str) if nbsp: warnings = [ { - 'non_breaking_space_characters': 'Query contains non-breaking space characters' + "non_breaking_space_characters": "Query contains non-breaking space characters" } ] logger.warning( - f'Query ({query_str}) contains non-breaking space characters.' + f"Query ({query_str}) contains non-breaking space characters." ) return warnings @@ -84,12 +84,12 @@ def _transform_sequence_location(loc: Dict) -> models.SequenceLocation: :param loc: GeneSequenceLocation represented as a dict :return: VRS sequence location """ - refget_ac = loc['sequence_id'].split('ga4gh:')[-1] + refget_ac = loc["sequence_id"].split("ga4gh:")[-1] return models.SequenceLocation( sequenceReference=models.SequenceReference(refgetAccession=refget_ac), - start=int(loc['start']), - end=int(loc['end']), + start=int(loc["start"]), + end=int(loc["end"]), ) # @staticmethod @@ -128,11 +128,11 @@ def _transform_locations(self, record: Dict) -> Dict: :return: record with transformed locations attributes, if applicable """ record_locations = list() - if 'locations' in record: - for loc in record['locations']: - if loc['type'] == 'SequenceLocation': + if "locations" in record: + for loc in record["locations"]: + if loc["type"] == "SequenceLocation": record_locations.append(self._transform_location(loc)) - record['locations'] = record_locations + record["locations"] = record_locations return record def _get_src_name(self, concept_id: str) -> SourceName: @@ -149,7 +149,7 @@ def _get_src_name(self, concept_id: str) -> SourceName: elif concept_id.startswith(NamespacePrefix.HGNC.value): return SourceName.HGNC else: - raise ValueError('Invalid or unrecognized concept ID provided') + raise ValueError("Invalid or unrecognized concept ID provided") def _add_record( self, response: Dict[str, Dict], item: Dict, match_type: MatchType @@ -161,20 +161,20 @@ def _add_record( :param match_type: match type for query """ item = self._transform_locations(item) - item['match_type'] = match_type + item["match_type"] = match_type gene = Gene(**item) - src_name = item['src_name'] + src_name = item["src_name"] - matches = response['source_matches'] + matches = response["source_matches"] if src_name not in matches.keys(): pass elif matches[src_name] is None: matches[src_name] = { - 'records': [gene], - 'source_meta_': self.db.get_source_metadata(src_name), + "records": [gene], + "source_meta_": self.db.get_source_metadata(src_name), } else: - matches[src_name]['records'].append(gene) + matches[src_name]["records"].append(gene) def _fetch_record( self, response: Dict[str, Dict], concept_id: str, match_type: MatchType @@ -189,14 +189,14 @@ def _fetch_record( match = self.db.get_record_by_id(concept_id, case_sensitive=False) except DatabaseReadException as e: logger.error( - f'Encountered DatabaseReadException looking up {concept_id}: {e}' + f"Encountered DatabaseReadException looking up {concept_id}: {e}" ) else: if match: self._add_record(response, match, match_type) else: logger.error( - f'Unable to find expected record for {concept_id} matching as {match_type}' + f"Unable to find expected record for {concept_id} matching as {match_type}" ) def _post_process_resp(self, resp: Dict) -> Dict: @@ -207,15 +207,15 @@ def _post_process_resp(self, resp: Dict) -> Dict: :return: response object with empty source slots filled with NO_MATCH results and corresponding source metadata """ - for src_name in resp['source_matches'].keys(): - if resp['source_matches'][src_name] is None: - resp['source_matches'][src_name] = { - 'match_type': MatchType.NO_MATCH, - 'records': [], - 'source_meta_': self.db.get_source_metadata(src_name), + for src_name in resp["source_matches"].keys(): + if resp["source_matches"][src_name] is None: + resp["source_matches"][src_name] = { + "match_type": MatchType.NO_MATCH, + "records": [], + "source_meta_": self.db.get_source_metadata(src_name), } else: - records = resp['source_matches'][src_name]['records'] + records = resp["source_matches"][src_name]["records"] if len(records) > 1: records = sorted(records, key=lambda k: k.match_type, reverse=True) return resp @@ -229,11 +229,11 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict: :return: completed response object to return to client """ resp = { - 'query': query, - 'warnings': self._emit_warnings(query), - 'source_matches': {source: None for source in sources}, + "query": query, + "warnings": self._emit_warnings(query), + "source_matches": {source: None for source in sources}, } - if query == '': + if query == "": return self._post_process_resp(resp) query_l = query.lower() @@ -242,7 +242,7 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict: queries.append((query_l, RecordType.IDENTITY.value)) for prefix in [p for p in NAMESPACE_LOOKUP.keys() if query_l.startswith(p)]: - term = f'{NAMESPACE_LOOKUP[prefix].lower()}:{query_l}' + term = f"{NAMESPACE_LOOKUP[prefix].lower()}:{query_l}" queries.append((term, RecordType.IDENTITY.value)) for match in ITEM_TYPES.values(): @@ -253,7 +253,7 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict: try: if item_type == RecordType.IDENTITY.value: record = self.db.get_record_by_id(term, False) - if record and record['concept_id'] not in matched_concept_ids: + if record and record["concept_id"] not in matched_concept_ids: self._add_record(resp, record, MatchType.CONCEPT_ID) else: refs = self.db.get_refs_by_type(term, RefType(item_type)) @@ -264,8 +264,8 @@ def _get_search_response(self, query: str, sources: Set[str]) -> Dict: except DatabaseReadException as e: logger.error( - f'Encountered DatabaseReadException looking up {item_type}' - f' {term}: {e}' + f"Encountered DatabaseReadException looking up {item_type}" + f" {term}: {e}" ) continue @@ -283,8 +283,8 @@ def _get_service_meta() -> ServiceMeta: def search( self, query_str: str, - incl: str = '', - excl: str = '', + incl: str = "", + excl: str = "", **params, ) -> SearchService: """Return highest match for each source. @@ -316,10 +316,10 @@ def search( if not incl and not excl: query_sources = set(sources.values()) elif incl and excl: - detail = 'Cannot request both source inclusions and exclusions.' + detail = "Cannot request both source inclusions and exclusions." raise InvalidParameterException(detail) elif incl: - req_sources = [n.strip() for n in incl.split(',')] + req_sources = [n.strip() for n in incl.split(",")] invalid_sources = [] query_sources = set() for source in req_sources: @@ -328,10 +328,10 @@ def search( else: invalid_sources.append(source) if invalid_sources: - detail = f'Invalid source name(s): {invalid_sources}' + detail = f"Invalid source name(s): {invalid_sources}" raise InvalidParameterException(detail) else: - req_exclusions = [n.strip() for n in excl.lower().split(',')] + req_exclusions = [n.strip() for n in excl.lower().split(",")] req_excl_dict = {r.lower(): r for r in req_exclusions} invalid_sources = [] query_sources = set() @@ -342,14 +342,14 @@ def search( if src_l not in req_excl_dict.keys(): query_sources.add(src) if invalid_sources: - detail = f'Invalid source name(s): {invalid_sources}' + detail = f"Invalid source name(s): {invalid_sources}" raise InvalidParameterException(detail) query_str = query_str.strip() resp = self._get_search_response(query_str, query_sources) - resp['service_meta_'] = self._get_service_meta() + resp["service_meta_"] = self._get_service_meta() return SearchService(**resp) def _add_merged_meta(self, response: NormalizeService) -> NormalizeService: @@ -360,7 +360,7 @@ def _add_merged_meta(self, response: NormalizeService) -> NormalizeService: """ sources_meta = {} gene = response.gene - sources = [response.normalized_id.split(':')[0]] + sources = [response.normalized_id.split(":")[0]] if gene.mappings: sources += [m.coding.system for m in gene.mappings] @@ -391,13 +391,13 @@ def _add_alt_matches( for concept_id in possible_concepts: r = self.db.get_record_by_id(concept_id, True) if r: - merge_ref = r.get('merge_ref') + merge_ref = r.get("merge_ref") if merge_ref: norm_concepts.add(merge_ref) - norm_concepts = norm_concepts - {record['concept_id']} + norm_concepts = norm_concepts - {record["concept_id"]} if norm_concepts: response.warnings.append( - {'multiple_normalized_concepts_found': list(norm_concepts)} + {"multiple_normalized_concepts_found": list(norm_concepts)} ) return response @@ -418,14 +418,14 @@ def _add_gene( """ gene_obj = core_models.Gene( id=f"normalize.gene.{record['concept_id']}", - label=record['symbol'], + label=record["symbol"], ) # mappings - source_ids = record.get('xrefs', []) + record.get('associated_with', []) + source_ids = record.get("xrefs", []) + record.get("associated_with", []) mappings = [] for source_id in source_ids: - system, code = source_id.split(':') + system, code = source_id.split(":") mappings.append( core_models.Mapping( coding=core_models.Coding( @@ -439,7 +439,7 @@ def _add_gene( # aliases aliases = set() - for key in ['previous_symbols', 'aliases']: + for key in ["previous_symbols", "aliases"]: if key in record and record[key]: val = record[key] if isinstance(val, str): @@ -451,11 +451,11 @@ def _add_gene( # extensions extensions = [] extension_and_record_labels = [ - ('symbol_status', 'symbol_status'), - ('approved_name', 'label'), - ('previous_symbols', 'previous_symbols'), - ('location_annotations', 'location_annotations'), - ('strand', 'strand'), + ("symbol_status", "symbol_status"), + ("approved_name", "label"), + ("previous_symbols", "previous_symbols"), + ("location_annotations", "location_annotations"), + ("strand", "strand"), ] for ext_label, record_label in extension_and_record_labels: if record_label in record and record[record_label]: @@ -464,19 +464,19 @@ def _add_gene( ) record_locations = {} - if record['item_type'] == RecordType.IDENTITY: - locs = record.get('locations') + if record["item_type"] == RecordType.IDENTITY: + locs = record.get("locations") if locs: record_locations[f"{record['src_name'].lower()}_locations"] = locs - elif record['item_type'] == RecordType.MERGER: + elif record["item_type"] == RecordType.MERGER: for k, v in record.items(): - if k.endswith('locations') and v: + if k.endswith("locations") and v: record_locations[k] = v for loc_name, locations in record_locations.items(): transformed_locs = [] for loc in locations: - if loc['type'] == 'SequenceLocation': + if loc["type"] == "SequenceLocation": transformed_locs.append(self._transform_location(loc)) if transformed_locs: @@ -485,12 +485,12 @@ def _add_gene( ) # handle gene types separately because they're wonky - if record['item_type'] == RecordType.IDENTITY: - gene_type = record.get('gene_type') + if record["item_type"] == RecordType.IDENTITY: + gene_type = record.get("gene_type") if gene_type: extensions.append( core_models.Extension( - name=GeneTypeFieldName[record['src_name'].upper()].value, + name=GeneTypeFieldName[record["src_name"].upper()].value, value=gene_type, ) ) @@ -509,7 +509,7 @@ def _add_gene( if possible_concepts: response = self._add_alt_matches(response, record, possible_concepts) - response.normalized_id = record['concept_id'] + response.normalized_id = record["concept_id"] response.gene = gene_obj response = self._add_merged_meta(response) response.match_type = match_type @@ -522,9 +522,9 @@ def _record_order(record: Dict) -> Tuple[int, str]: :param record: individual record item in iterable to sort :return: tuple with rank value and concept ID """ - src = record['src_name'].upper() + src = record["src_name"].upper() source_rank = SourcePriority[src] - return source_rank, record['concept_id'] + return source_rank, record["concept_id"] @staticmethod def _handle_failed_merge_ref(record: Dict, response: Dict, query: str) -> Dict: @@ -539,7 +539,7 @@ def _handle_failed_merge_ref(record: Dict, response: Dict, query: str) -> Dict: f"Merge ref lookup failed for ref {record['merge_ref']} " f"in record {record['concept_id']} from query {query}" ) - response['match_type'] = MatchType.NO_MATCH + response["match_type"] = MatchType.NO_MATCH return response def _prepare_normalized_response(self, query: str) -> Dict[str, Any]: @@ -549,10 +549,10 @@ def _prepare_normalized_response(self, query: str) -> Dict[str, Any]: :return: basic normalization response boilerplate """ return { - 'query': query, - 'match_type': MatchType.NO_MATCH, - 'warnings': self._emit_warnings(query), - 'service_meta_': ServiceMeta( + "query": query, + "match_type": MatchType.NO_MATCH, + "warnings": self._emit_warnings(query), + "service_meta_": ServiceMeta( version=__version__, response_datetime=str(datetime.now()) ), } @@ -594,7 +594,7 @@ def _resolve_merge( :param possible_concepts: alternate possible matches :return: Normalized response object """ - merge_ref = record.get('merge_ref') + merge_ref = record.get("merge_ref") if merge_ref: # follow merge_ref merge = self.db.get_record_by_id(merge_ref, False, True) @@ -621,7 +621,7 @@ def _perform_normalized_lookup( :param response_builder: response constructor callback method :return: completed service response object """ - if query == '': + if query == "": return response query_str = query.lower().strip() @@ -653,7 +653,7 @@ def _perform_normalized_lookup( # attempt merge ref resolution until successful for match in matching_records: assert match is not None - record = self.db.get_record_by_id(match['concept_id'], False) + record = self.db.get_record_by_id(match["concept_id"], False) if record: match_type_value = MatchType[match_type.value.upper()] return self._resolve_merge( @@ -682,23 +682,23 @@ def _add_normalized_records( :return: Completed response object """ response.match_type = match_type - response.normalized_concept_id = normalized_record['concept_id'] - if normalized_record['item_type'] == RecordType.IDENTITY: - record_source = SourceName[normalized_record['src_name'].upper()] + response.normalized_concept_id = normalized_record["concept_id"] + if normalized_record["item_type"] == RecordType.IDENTITY: + record_source = SourceName[normalized_record["src_name"].upper()] meta = self.db.get_source_metadata(record_source.value) response.source_matches[record_source] = MatchesNormalized( records=[BaseGene(**self._transform_locations(normalized_record))], source_meta_=meta, # type: ignore ) else: - concept_ids = [normalized_record['concept_id']] + normalized_record.get( - 'xrefs', [] + concept_ids = [normalized_record["concept_id"]] + normalized_record.get( + "xrefs", [] ) for concept_id in concept_ids: record = self.db.get_record_by_id(concept_id, case_sensitive=False) if not record: continue - record_source = SourceName[record['src_name'].upper()] + record_source = SourceName[record["src_name"].upper()] gene = BaseGene(**self._transform_locations(record)) if record_source in response.source_matches: response.source_matches[record_source].records.append(gene) diff --git a/src/gene/schemas.py b/src/gene/schemas.py index 602c9abb..cd89951a 100644 --- a/src/gene/schemas.py +++ b/src/gene/schemas.py @@ -15,22 +15,22 @@ from gene.version import __version__ -CURIE = constr(pattern=r'^\w[^:]*:.+$') +CURIE = constr(pattern=r"^\w[^:]*:.+$") class SymbolStatus(str, Enum): """Define string constraints for symbol status attribute.""" - WITHDRAWN = 'withdrawn' - APPROVED = 'approved' - DISCONTINUED = 'discontinued' + WITHDRAWN = "withdrawn" + APPROVED = "approved" + DISCONTINUED = "discontinued" class Strand(str, Enum): """Define string constraints for strand attribute.""" - FORWARD = '+' - REVERSE = '-' + FORWARD = "+" + REVERSE = "-" class Annotation(str, Enum): @@ -38,16 +38,16 @@ class Annotation(str, Enum): is absent. """ - NOT_FOUND_ON_REFERENCE = 'not on reference assembly' - UNPLACED = 'unplaced' - RESERVED = 'reserved' - ALT_LOC = 'alternate reference locus' + NOT_FOUND_ON_REFERENCE = "not on reference assembly" + UNPLACED = "unplaced" + RESERVED = "reserved" + ALT_LOC = "alternate reference locus" class Chromosome(str, Enum): """Define string constraints for chromosomes.""" - MITOCHONDRIA = 'MT' + MITOCHONDRIA = "MT" class MatchType(IntEnum): @@ -66,10 +66,10 @@ class MatchType(IntEnum): class GeneSequenceLocation(BaseModel): """Sequence Location model when storing in DynamoDB.""" - type: Literal['SequenceLocation'] = 'SequenceLocation' + type: Literal["SequenceLocation"] = "SequenceLocation" start: StrictInt end: StrictInt - sequence_id: constr(pattern=r'^ga4gh:SQ.[0-9A-Za-z_\-]{32}$') # noqa: F722 + sequence_id: constr(pattern=r"^ga4gh:SQ.[0-9A-Za-z_\-]{32}$") # noqa: F722 # class GeneChromosomeLocation(BaseModel): @@ -112,20 +112,20 @@ class Gene(BaseGene): model_config = ConfigDict( json_schema_extra={ - 'example': { - 'label': None, - 'concept_id': 'ensembl:ENSG00000157764', - 'symbol': 'BRAF', - 'previous_symbols': [], - 'aliases': [], - 'xrefs': [], - 'symbol_status': None, - 'strand': '-', - 'locations': [], - 'location_annotations': [], - 'associated_with': [], - 'gene_type': None, - 'match_type': 100, + "example": { + "label": None, + "concept_id": "ensembl:ENSG00000157764", + "symbol": "BRAF", + "previous_symbols": [], + "aliases": [], + "xrefs": [], + "symbol_status": None, + "strand": "-", + "locations": [], + "location_annotations": [], + "associated_with": [], + "gene_type": None, + "match_type": 100, } } ) @@ -142,9 +142,9 @@ class GeneGroup(Gene): class SourceName(Enum): """Define string constraints to ensure consistent capitalization.""" - HGNC = 'HGNC' - ENSEMBL = 'Ensembl' - NCBI = 'NCBI' + HGNC = "HGNC" + ENSEMBL = "Ensembl" + NCBI = "NCBI" class SourcePriority(IntEnum): @@ -158,42 +158,42 @@ class SourcePriority(IntEnum): class SourceIDAfterNamespace(Enum): """Define string constraints after namespace.""" - HGNC = '' - ENSEMBL = 'ENSG' - NCBI = '' + HGNC = "" + ENSEMBL = "ENSG" + NCBI = "" class NamespacePrefix(Enum): """Define string constraints for namespace prefixes on concept IDs.""" - HGNC = 'hgnc' - ENSEMBL = 'ensembl' - NCBI = 'ncbigene' + HGNC = "hgnc" + ENSEMBL = "ensembl" + NCBI = "ncbigene" ENTREZ = NCBI - VEGA = 'vega' - UCSC = 'ucsc' - ENA = 'ena.embl' - REFSEQ = 'refseq' - CCDS = 'ccds' - UNIPROT = 'uniprot' - PUBMED = 'pubmed' - COSMIC = 'cosmic' - OMIM = 'omim' - MIRBASE = 'mirbase' - HOMEODB = 'homeodb' - SNORNABASE = 'snornabase' - ORPHANET = 'orphanet' - PSEUDOGENE = 'pseudogene.org' - HORDE = 'hordedb' - MEROPS = 'merops' - IUPHAR = 'iuphar' - KZNF = 'knzfgc' - MAMIT = 'mamittrnadb' - CD = 'hcdmdb' - LNCRNADB = 'lncrnadb' - IMGT = 'imgt' # .hla? .ligm? leave as is? - IMGT_GENE_DB = 'imgt/gene-db' # redundant w/ above? - RFAM = 'rfam' + VEGA = "vega" + UCSC = "ucsc" + ENA = "ena.embl" + REFSEQ = "refseq" + CCDS = "ccds" + UNIPROT = "uniprot" + PUBMED = "pubmed" + COSMIC = "cosmic" + OMIM = "omim" + MIRBASE = "mirbase" + HOMEODB = "homeodb" + SNORNABASE = "snornabase" + ORPHANET = "orphanet" + PSEUDOGENE = "pseudogene.org" + HORDE = "hordedb" + MEROPS = "merops" + IUPHAR = "iuphar" + KZNF = "knzfgc" + MAMIT = "mamittrnadb" + CD = "hcdmdb" + LNCRNADB = "lncrnadb" + IMGT = "imgt" # .hla? .ligm? leave as is? + IMGT_GENE_DB = "imgt/gene-db" # redundant w/ above? + RFAM = "rfam" class DataLicenseAttributes(BaseModel): @@ -207,19 +207,19 @@ class DataLicenseAttributes(BaseModel): class RecordType(str, Enum): """Record item types.""" - IDENTITY = 'identity' - MERGER = 'merger' + IDENTITY = "identity" + MERGER = "merger" class RefType(str, Enum): """Reference item types.""" # Must be in descending MatchType order. - SYMBOL = 'symbol' - PREVIOUS_SYMBOLS = 'prev_symbol' - ALIASES = 'alias' - XREFS = 'xref' - ASSOCIATED_WITH = 'associated_with' + SYMBOL = "symbol" + PREVIOUS_SYMBOLS = "prev_symbol" + ALIASES = "alias" + XREFS = "xref" + ASSOCIATED_WITH = "associated_with" class SourceMeta(BaseModel): @@ -235,22 +235,22 @@ class SourceMeta(BaseModel): model_config = ConfigDict( json_schema_extra={ - 'example': { - 'data_license': 'custom', - 'data_license_url': 'https://www.ncbi.nlm.nih.gov/home/about/policies/', - 'version': '20201215', - 'data_url': { - 'info_file': 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', - 'history_file': 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz', - 'assembly_file': 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/', + "example": { + "data_license": "custom", + "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", + "version": "20201215", + "data_url": { + "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", + "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz", + "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/", }, - 'rdp_url': 'https://reusabledata.org/ncbi-gene.html', - 'data_license_attributes': { - 'non_commercial': False, - 'share_alike': False, - 'attribution': False, + "rdp_url": "https://reusabledata.org/ncbi-gene.html", + "data_license_attributes": { + "non_commercial": False, + "share_alike": False, + "attribution": False, }, - 'genome_assemblies': [], + "genome_assemblies": [], } } ) @@ -262,26 +262,26 @@ class SourceSearchMatches(BaseModel): records: List[Gene] = [] source_meta_: SourceMeta - model_config = ConfigDict(json_schema_extra={'example': {}}) # TODO + model_config = ConfigDict(json_schema_extra={"example": {}}) # TODO class ServiceMeta(BaseModel): """Metadata regarding the gene-normalization service.""" - name: Literal['gene-normalizer'] = 'gene-normalizer' + name: Literal["gene-normalizer"] = "gene-normalizer" version: StrictStr response_datetime: StrictStr url: Literal[ - 'https://github.com/cancervariants/gene-normalization' - ] = 'https://github.com/cancervariants/gene-normalization' + "https://github.com/cancervariants/gene-normalization" + ] = "https://github.com/cancervariants/gene-normalization" model_config = ConfigDict( json_schema_extra={ - 'example': { - 'name': 'gene-normalizer', - 'version': __version__, - 'response_datetime': '2022-03-23 15:57:14.180908', - 'url': 'https://github.com/cancervariants/gene-normalization', + "example": { + "name": "gene-normalizer", + "version": __version__, + "response_datetime": "2022-03-23 15:57:14.180908", + "url": "https://github.com/cancervariants/gene-normalization", } } ) @@ -303,9 +303,9 @@ class GeneTypeFieldName(str, Enum): internal records. """ - HGNC = 'hgnc_locus_type' - NCBI = 'ncbi_gene_type' - ENSEMBL = 'ensembl_biotype' + HGNC = "hgnc_locus_type" + NCBI = "ncbi_gene_type" + ENSEMBL = "ensembl_biotype" class BaseNormalizationService(BaseModel): @@ -326,88 +326,88 @@ class NormalizeService(BaseNormalizationService): model_config = ConfigDict( json_schema_extra={ - 'example': { - 'query': 'BRAF', - 'warnings': [], - 'match_type': 100, - 'normalized_id': 'hgnc:1037', - 'gene': { - 'type': 'Gene', - 'id': 'normalize.gene.hgnc:1097', - 'label': 'BRAF', - 'mappings': [ + "example": { + "query": "BRAF", + "warnings": [], + "match_type": 100, + "normalized_id": "hgnc:1037", + "gene": { + "type": "Gene", + "id": "normalize.gene.hgnc:1097", + "label": "BRAF", + "mappings": [ { - 'coding': {'code': '673', 'system': 'ncbigene'}, - 'relation': 'relatedMatch', + "coding": {"code": "673", "system": "ncbigene"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'ENSG00000157764', 'system': 'ensembl'}, - 'relation': 'relatedMatch', + "coding": {"code": "ENSG00000157764", "system": "ensembl"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS5863', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS5863", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '1943', 'system': 'iuphar'}, - 'relation': 'relatedMatch', + "coding": {"code": "1943", "system": "iuphar"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '119066', 'system': 'orphanet'}, - 'relation': 'relatedMatch', + "coding": {"code": "119066", "system": "orphanet"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'BRAF', 'system': 'cosmic'}, - 'relation': 'relatedMatch', + "coding": {"code": "BRAF", "system": "cosmic"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '2284096', 'system': 'pubmed'}, - 'relation': 'relatedMatch', + "coding": {"code": "2284096", "system": "pubmed"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'uc003vwc.5', 'system': 'ucsc'}, - 'relation': 'relatedMatch', + "coding": {"code": "uc003vwc.5", "system": "ucsc"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '164757', 'system': 'omim'}, - 'relation': 'relatedMatch', + "coding": {"code": "164757", "system": "omim"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'NM_004333', 'system': 'refseq'}, - 'relation': 'relatedMatch', + "coding": {"code": "NM_004333", "system": "refseq"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS87555', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS87555", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'P15056', 'system': 'uniprot'}, - 'relation': 'relatedMatch', + "coding": {"code": "P15056", "system": "uniprot"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'M95712', 'system': 'ena.embl'}, - 'relation': 'relatedMatch', + "coding": {"code": "M95712", "system": "ena.embl"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'OTTHUMG00000157457', 'system': 'vega'}, - 'relation': 'relatedMatch', + "coding": {"code": "OTTHUMG00000157457", "system": "vega"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '1565476', 'system': 'pubmed'}, - 'relation': 'relatedMatch', + "coding": {"code": "1565476", "system": "pubmed"}, + "relation": "relatedMatch", }, ], - 'aliases': ['BRAF1', 'RAFB1', 'B-raf', 'NS7', 'B-RAF1'], - 'extensions': [ + "aliases": ["BRAF1", "RAFB1", "B-raf", "NS7", "B-RAF1"], + "extensions": [ { - 'name': 'approved_name', - 'value': 'B-Raf proto-oncogene, serine/threonine kinase', - 'type': 'Extension', + "name": "approved_name", + "value": "B-Raf proto-oncogene, serine/threonine kinase", + "type": "Extension", }, { - 'name': 'symbol_status', - 'value': 'approved', - 'type': 'Extension', + "name": "symbol_status", + "value": "approved", + "type": "Extension", }, # { # "name": "chromosome_location", @@ -423,60 +423,60 @@ class NormalizeService(BaseNormalizationService): # } ], }, - 'source_meta_': { - 'HGNC': { - 'data_license': 'custom', - 'data_license_url': 'https://www.genenames.org/about/', - 'version': '20210810', - 'data_url': { - 'complete_set_archive': 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json' + "source_meta_": { + "HGNC": { + "data_license": "custom", + "data_license_url": "https://www.genenames.org/about/", + "version": "20210810", + "data_url": { + "complete_set_archive": "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" }, - 'rdp_url': None, - 'data_license_attributes': { - 'non_commercial': False, - 'attribution': False, - 'share_alike': False, + "rdp_url": None, + "data_license_attributes": { + "non_commercial": False, + "attribution": False, + "share_alike": False, }, - 'genome_assemblies': [], + "genome_assemblies": [], }, - 'Ensembl': { - 'data_license': 'custom', - 'data_license_url': 'https://useast.ensembl.org/info/about/legal/disclaimer.html', - 'version': '104', - 'data_url': { - 'genome_annotations': 'ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz' + "Ensembl": { + "data_license": "custom", + "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", + "version": "104", + "data_url": { + "genome_annotations": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz" }, - 'rdp_url': None, - 'data_license_attributes': { - 'non_commercial': False, - 'attribution': False, - 'share_alike': False, + "rdp_url": None, + "data_license_attributes": { + "non_commercial": False, + "attribution": False, + "share_alike": False, }, - 'genome_assemblies': ['GRCh38'], + "genome_assemblies": ["GRCh38"], }, - 'NCBI': { - 'data_license': 'custom', - 'data_license_url': 'https://www.ncbi.nlm.nih.gov/home/about/policies/', - 'version': '20210813', - 'data_url': { - 'info_file': 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', - 'history_file': 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz', - 'assembly_file': 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/', + "NCBI": { + "data_license": "custom", + "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", + "version": "20210813", + "data_url": { + "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", + "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz", + "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/", }, - 'rdp_url': 'https://reusabledata.org/ncbi-gene.html', - 'data_license_attributes': { - 'non_commercial': False, - 'attribution': False, - 'share_alike': False, + "rdp_url": "https://reusabledata.org/ncbi-gene.html", + "data_license_attributes": { + "non_commercial": False, + "attribution": False, + "share_alike": False, }, - 'genome_assemblies': ['GRCh38.p13'], + "genome_assemblies": ["GRCh38.p13"], }, }, - 'service_meta_': { - 'name': 'gene-normalizer', - 'version': __version__, - 'response_datetime': '2022-03-23 15:57:14.180908', - 'url': 'https://github.com/cancervariants/gene-normalization', + "service_meta_": { + "name": "gene-normalizer", + "version": __version__, + "response_datetime": "2022-03-23 15:57:14.180908", + "url": "https://github.com/cancervariants/gene-normalization", }, } } @@ -501,28 +501,28 @@ class UnmergedNormalizationService(BaseNormalizationService): model_config = ConfigDict( json_schema_extra={ - 'example': { - 'query': 'hgnc:108', - 'warnings': [], - 'match_type': 100, - 'service_meta_': { - 'version': __version__, - 'response_datetime': '2022-04-26 14:20:54.180240', - 'name': 'gene-normalizer', - 'url': 'https://github.com/cancervariants/gene-normalization', + "example": { + "query": "hgnc:108", + "warnings": [], + "match_type": 100, + "service_meta_": { + "version": __version__, + "response_datetime": "2022-04-26 14:20:54.180240", + "name": "gene-normalizer", + "url": "https://github.com/cancervariants/gene-normalization", }, - 'normalized_concept_id': 'hgnc:108', - 'source_matches': { - 'HGNC': { - 'records': [ + "normalized_concept_id": "hgnc:108", + "source_matches": { + "HGNC": { + "records": [ { - 'concept_id': 'hgnc:108', - 'symbol': 'ACHE', - 'symbol_status': 'approved', - 'label': 'acetylcholinesterase (Cartwright blood group)', - 'strand': None, - 'location_annotations': [], - 'locations': [ + "concept_id": "hgnc:108", + "symbol": "ACHE", + "symbol_status": "approved", + "label": "acetylcholinesterase (Cartwright blood group)", + "strand": None, + "location_annotations": [], + "locations": [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", @@ -532,95 +532,95 @@ class UnmergedNormalizationService(BaseNormalizationService): # "end": "q22.1" # } ], - 'aliases': ['3.1.1.7'], - 'previous_symbols': ['YT'], - 'xrefs': ['ncbigene:43', 'ensembl:ENSG00000087085'], - 'associated_with': [ - 'ucsc:uc003uxi.4', - 'vega:OTTHUMG00000157033', - 'merops:S09.979', - 'ccds:CCDS5710', - 'omim:100740', - 'iuphar:2465', - 'ccds:CCDS5709', - 'refseq:NM_015831', - 'pubmed:1380483', - 'uniprot:P22303', - 'ccds:CCDS64736', + "aliases": ["3.1.1.7"], + "previous_symbols": ["YT"], + "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"], + "associated_with": [ + "ucsc:uc003uxi.4", + "vega:OTTHUMG00000157033", + "merops:S09.979", + "ccds:CCDS5710", + "omim:100740", + "iuphar:2465", + "ccds:CCDS5709", + "refseq:NM_015831", + "pubmed:1380483", + "uniprot:P22303", + "ccds:CCDS64736", ], - 'gene_type': 'gene with protein product', + "gene_type": "gene with protein product", } ], - 'source_meta_': { - 'data_license': 'custom', - 'data_license_url': 'https://www.genenames.org/about/', - 'version': '20220407', - 'data_url': { - 'complete_set_archive': 'ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json' + "source_meta_": { + "data_license": "custom", + "data_license_url": "https://www.genenames.org/about/", + "version": "20220407", + "data_url": { + "complete_set_archive": "ftp://ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" }, - 'rdp_url': None, - 'data_license_attributes': { - 'non_commercial': False, - 'share_alike': False, - 'attribution': False, + "rdp_url": None, + "data_license_attributes": { + "non_commercial": False, + "share_alike": False, + "attribution": False, }, - 'genome_assemblies': [], + "genome_assemblies": [], }, }, - 'Ensembl': { - 'records': [ + "Ensembl": { + "records": [ { - 'concept_id': 'ensembl:ENSG00000087085', - 'symbol': 'ACHE', - 'symbol_status': None, - 'label': 'acetylcholinesterase (Cartwright blood group)', - 'strand': '-', - 'location_annotations': [], - 'locations': [ + "concept_id": "ensembl:ENSG00000087085", + "symbol": "ACHE", + "symbol_status": None, + "label": "acetylcholinesterase (Cartwright blood group)", + "strand": "-", + "location_annotations": [], + "locations": [ { - 'id': 'ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', + "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, - 'start': 100889993, - 'end': 100896974, + "start": 100889993, + "end": 100896974, } ], - 'aliases': [], - 'previous_symbols': [], - 'xrefs': ['hgnc:108'], - 'associated_with': [], - 'gene_type': 'protein_coding', + "aliases": [], + "previous_symbols": [], + "xrefs": ["hgnc:108"], + "associated_with": [], + "gene_type": "protein_coding", } ], - 'source_meta_': { - 'data_license': 'custom', - 'data_license_url': 'https://useast.ensembl.org/info/about/legal/disclaimer.html', - 'version': '104', - 'data_url': { - 'genome_annotations': 'ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz' + "source_meta_": { + "data_license": "custom", + "data_license_url": "https://useast.ensembl.org/info/about/legal/disclaimer.html", + "version": "104", + "data_url": { + "genome_annotations": "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz" }, - 'rdp_url': None, - 'data_license_attributes': { - 'non_commercial': False, - 'share_alike': False, - 'attribution': False, + "rdp_url": None, + "data_license_attributes": { + "non_commercial": False, + "share_alike": False, + "attribution": False, }, - 'genome_assemblies': ['GRCh38'], + "genome_assemblies": ["GRCh38"], }, }, - 'NCBI': { - 'records': [ + "NCBI": { + "records": [ { - 'concept_id': 'ncbigene:43', - 'symbol': 'ACHE', - 'symbol_status': None, - 'label': 'acetylcholinesterase (Cartwright blood group)', - 'strand': '-', - 'location_annotations': [], - 'locations': [ + "concept_id": "ncbigene:43", + "symbol": "ACHE", + "symbol_status": None, + "label": "acetylcholinesterase (Cartwright blood group)", + "strand": "-", + "location_annotations": [], + "locations": [ { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.VtdU_0lYXL_o95lXRUfhv-NDJVVpmKoD", @@ -630,39 +630,39 @@ class UnmergedNormalizationService(BaseNormalizationService): # "end": "q22.1" }, { - 'id': 'ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', + "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, - 'start': 100889993, - 'end': 100896994, + "start": 100889993, + "end": 100896994, }, ], - 'aliases': ['YT', 'ARACHE', 'ACEE', 'N-ACHE'], - 'previous_symbols': ['ACEE'], - 'xrefs': ['hgnc:108', 'ensembl:ENSG00000087085'], - 'associated_with': ['omim:100740'], - 'gene_type': 'protein-coding', + "aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"], + "previous_symbols": ["ACEE"], + "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"], + "associated_with": ["omim:100740"], + "gene_type": "protein-coding", } ], - 'source_meta_': { - 'data_license': 'custom', - 'data_license_url': 'https://www.ncbi.nlm.nih.gov/home/about/policies/', - 'version': '20220407', - 'data_url': { - 'info_file': 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', - 'history_file': 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz', - 'assembly_file': 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/', + "source_meta_": { + "data_license": "custom", + "data_license_url": "https://www.ncbi.nlm.nih.gov/home/about/policies/", + "version": "20220407", + "data_url": { + "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", + "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz", + "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/", }, - 'rdp_url': 'https://reusabledata.org/ncbi-gene.html', - 'data_license_attributes': { - 'non_commercial': False, - 'share_alike': False, - 'attribution': False, + "rdp_url": "https://reusabledata.org/ncbi-gene.html", + "data_license_attributes": { + "non_commercial": False, + "share_alike": False, + "attribution": False, }, - 'genome_assemblies': ['GRCh38.p13'], + "genome_assemblies": ["GRCh38.p13"], }, }, }, diff --git a/src/gene/version.py b/src/gene/version.py index b4913868..75c5d6c1 100644 --- a/src/gene/version.py +++ b/src/gene/version.py @@ -1,2 +1,2 @@ """Gene normalizer version""" -__version__ = '0.3.0-dev1' +__version__ = "0.3.0-dev1" diff --git a/tests/conftest.py b/tests/conftest.py index ba941b0a..ad1a14a2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,7 +6,7 @@ from gene.database import AbstractDatabase, create_db -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def database() -> AbstractDatabase: """Create database instance.""" return create_db() @@ -17,19 +17,19 @@ def pytest_addoption(parser): See https://docs.pytest.org/en/7.1.x/reference/reference.html#parser """ parser.addoption( - '--verbose-logs', - action='store_true', + "--verbose-logs", + action="store_true", default=False, - help='show noisy module logs', + help="show noisy module logs", ) def pytest_configure(config): """Configure pytest setup.""" - if not config.getoption('--verbose-logs'): - logging.getLogger('botocore').setLevel(logging.ERROR) - logging.getLogger('boto3').setLevel(logging.ERROR) - logging.getLogger('urllib3.connectionpool').setLevel(logging.ERROR) + if not config.getoption("--verbose-logs"): + logging.getLogger("botocore").setLevel(logging.ERROR) + logging.getLogger("boto3").setLevel(logging.ERROR) + logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR) def _compare_records(normalized_gene, test_gene, match_type): @@ -53,7 +53,7 @@ def _compare_records(normalized_gene, test_gene, match_type): assert normalized_gene.gene_type == test_gene.gene_type -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def compare_records(): """Provide record(s) comparison function""" return _compare_records @@ -65,7 +65,7 @@ def _check_resp_single_record(resp, test_gene, match_type): _compare_records(resp.records[0], test_gene, match_type) -@pytest.fixture(scope='session') +@pytest.fixture(scope="session") def check_resp_single_record(): """Provide record comparison function for single record""" return _check_resp_single_record diff --git a/tests/unit/test_database_and_etl.py b/tests/unit/test_database_and_etl.py index 58ef6461..092cc6c3 100644 --- a/tests/unit/test_database_and_etl.py +++ b/tests/unit/test_database_and_etl.py @@ -12,28 +12,28 @@ from gene.schemas import RecordType ALIASES = { - 'NC_000001.11': ['ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO'], - 'NC_000002.12': ['ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g'], - 'NC_000003.12': ['ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX'], - 'NC_000007.14': ['ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul'], - 'NC_000009.12': ['ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI'], - 'NC_000011.10': ['ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1'], - 'NC_000015.10': ['ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6'], - 'NC_000017.11': ['ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7'], - 'NC_000019.10': ['ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl'], - 'NC_000023.11': ['ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP'], - 'NC_000008.11': ['ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs'], - 'NC_000012.12': ['ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl'], - 'NC_000024.10': ['ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5'], - 'NT_167246.2': ['ga4gh:SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1'], - 'NT_167249.2': ['ga4gh:SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-'], + "NC_000001.11": ["ga4gh:SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO"], + "NC_000002.12": ["ga4gh:SQ.pnAqCRBrTsUoBghSD1yp_jXWSmlbdh4g"], + "NC_000003.12": ["ga4gh:SQ.Zu7h9AggXxhTaGVsy7h_EZSChSZGcmgX"], + "NC_000007.14": ["ga4gh:SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul"], + "NC_000009.12": ["ga4gh:SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI"], + "NC_000011.10": ["ga4gh:SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1"], + "NC_000015.10": ["ga4gh:SQ.AsXvWL1-2i5U_buw6_niVIxD6zTbAuS6"], + "NC_000017.11": ["ga4gh:SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7"], + "NC_000019.10": ["ga4gh:SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl"], + "NC_000023.11": ["ga4gh:SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP"], + "NC_000008.11": ["ga4gh:SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs"], + "NC_000012.12": ["ga4gh:SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl"], + "NC_000024.10": ["ga4gh:SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5"], + "NT_167246.2": ["ga4gh:SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1"], + "NT_167249.2": ["ga4gh:SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-"], } -IS_TEST_ENV = environ.get('GENE_TEST', '').lower() == 'true' -IS_DDB_TEST = not environ.get('GENE_NORM_DB_URL', '').lower().startswith('postgres') +IS_TEST_ENV = environ.get("GENE_TEST", "").lower() == "true" +IS_DDB_TEST = not environ.get("GENE_NORM_DB_URL", "").lower().startswith("postgres") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def db_fixture(database): """Create a database test fixture.""" @@ -49,7 +49,7 @@ def __init__(self): return DB() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def processed_ids(): """Create a test fixture to store processed ids for merged concepts.""" return list() @@ -64,33 +64,33 @@ def _get_aliases(seqid): return ALIASES[seqid] -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def etl_data_path(): """Create a test fixture to return etl data path.""" test_root = Path(__file__).resolve().parents[2] - return test_root / 'tests' / 'unit' / 'data' / 'etl_data' + return test_root / "tests" / "unit" / "data" / "etl_data" def test_tables_created(db_fixture): """Check that requisite tables are created.""" existing_tables = db_fixture.db.list_tables() - if db_fixture.db_name == 'PostgresDatabase': + if db_fixture.db_name == "PostgresDatabase": assert set(existing_tables) == { - 'gene_associations', - 'gene_symbols', - 'gene_previous_symbols', - 'gene_aliases', - 'gene_xrefs', - 'gene_concepts', - 'gene_merged', - 'gene_sources', + "gene_associations", + "gene_symbols", + "gene_previous_symbols", + "gene_aliases", + "gene_xrefs", + "gene_concepts", + "gene_merged", + "gene_sources", } else: assert db_fixture.db.gene_table in existing_tables -@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment') -@patch.object(Ensembl, 'get_seqrepo') +@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment") +@patch.object(Ensembl, "get_seqrepo") def test_ensembl_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): """Test that ensembl etl methods work correctly.""" test_get_seqrepo.return_value = None @@ -100,8 +100,8 @@ def test_ensembl_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path) processed_ids += ensembl_ids -@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment') -@patch.object(HGNC, 'get_seqrepo') +@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment") +@patch.object(HGNC, "get_seqrepo") def test_hgnc_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): """Test that hgnc etl methods work correctly.""" test_get_seqrepo.return_value = None @@ -110,8 +110,8 @@ def test_hgnc_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): processed_ids += hgnc_ids -@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment') -@patch.object(NCBI, 'get_seqrepo') +@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment") +@patch.object(NCBI, "get_seqrepo") def test_ncbi_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): """Test that ncbi etl methods work correctly.""" test_get_seqrepo.return_value = None @@ -121,47 +121,47 @@ def test_ncbi_etl(test_get_seqrepo, processed_ids, db_fixture, etl_data_path): processed_ids += ncbi_ids -@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment') +@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment") def test_merged_concepts(processed_ids, db_fixture): """Create merged concepts and load to db.""" db_fixture.merge.create_merged_concepts(processed_ids) -@pytest.mark.skipif(not IS_DDB_TEST, reason='only applies to DynamoDB in test env') +@pytest.mark.skipif(not IS_DDB_TEST, reason="only applies to DynamoDB in test env") def test_item_type(db_fixture): """Check that items are tagged with item_type attribute.""" - filter_exp = Key('label_and_type').eq('ncbigene:8193##identity') - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'identity' - - filter_exp = Key('label_and_type').eq('prkrap1##symbol') - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'symbol' - - filter_exp = Key('label_and_type').eq('loc157663##prev_symbol') - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'prev_symbol' - - filter_exp = Key('label_and_type').eq('flj23569##alias') - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'alias' - - filter_exp = Key('label_and_type').eq('omim:606689##associated_with') - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'associated_with' - - filter_exp = Key('label_and_type').eq('ensembl:ensg00000268895##xref') - item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)['Items'][0] - assert 'item_type' in item - assert item['item_type'] == 'xref' - - -@pytest.mark.skipif(not IS_TEST_ENV, reason='not in test environment') + filter_exp = Key("label_and_type").eq("ncbigene:8193##identity") + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] + assert "item_type" in item + assert item["item_type"] == "identity" + + filter_exp = Key("label_and_type").eq("prkrap1##symbol") + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] + assert "item_type" in item + assert item["item_type"] == "symbol" + + filter_exp = Key("label_and_type").eq("loc157663##prev_symbol") + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] + assert "item_type" in item + assert item["item_type"] == "prev_symbol" + + filter_exp = Key("label_and_type").eq("flj23569##alias") + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] + assert "item_type" in item + assert item["item_type"] == "alias" + + filter_exp = Key("label_and_type").eq("omim:606689##associated_with") + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] + assert "item_type" in item + assert item["item_type"] == "associated_with" + + filter_exp = Key("label_and_type").eq("ensembl:ensg00000268895##xref") + item = db_fixture.db.genes.query(KeyConditionExpression=filter_exp)["Items"][0] + assert "item_type" in item + assert item["item_type"] == "xref" + + +@pytest.mark.skipif(not IS_TEST_ENV, reason="not in test environment") def test_get_all_records(db_fixture): """Basic test of get_all_records method. @@ -171,10 +171,10 @@ def test_get_all_records(db_fixture): """ source_records = list(db_fixture.db.get_all_records(RecordType.IDENTITY)) assert len(source_records) == 63 - source_ids = {r['concept_id'] for r in source_records} + source_ids = {r["concept_id"] for r in source_records} assert len(source_ids) == 63 normalized_records = list(db_fixture.db.get_all_records(RecordType.MERGER)) assert len(normalized_records) == 46 - normalized_ids = {r['concept_id'] for r in normalized_records} + normalized_ids = {r["concept_id"] for r in normalized_records} assert len(normalized_ids) == 46 diff --git a/tests/unit/test_emit_warnings.py b/tests/unit/test_emit_warnings.py index c28e7ae5..c8309aac 100644 --- a/tests/unit/test_emit_warnings.py +++ b/tests/unit/test_emit_warnings.py @@ -7,25 +7,25 @@ def test_emit_warnings(): """Test that emit_warnings works correctly.""" expected_warnings = [ { - 'non_breaking_space_characters': 'Query contains non-breaking space characters' + "non_breaking_space_characters": "Query contains non-breaking space characters" } ] db = create_db() query_handler = QueryHandler(db) # Test emit no warnings - actual_warnings = query_handler._emit_warnings('spry3') + actual_warnings = query_handler._emit_warnings("spry3") assert actual_warnings == [] # Test emit warnings - actual_warnings = query_handler._emit_warnings('sp ry3') + actual_warnings = query_handler._emit_warnings("sp ry3") assert actual_warnings == actual_warnings - actual_warnings = query_handler._emit_warnings('sp\u00A0ry3') + actual_warnings = query_handler._emit_warnings("sp\u00A0ry3") assert expected_warnings == actual_warnings - actual_warnings = query_handler._emit_warnings('sp ry3') + actual_warnings = query_handler._emit_warnings("sp ry3") assert expected_warnings == actual_warnings - actual_warnings = query_handler._emit_warnings('sp\xa0ry3') + actual_warnings = query_handler._emit_warnings("sp\xa0ry3") assert expected_warnings == actual_warnings diff --git a/tests/unit/test_endpoints.py b/tests/unit/test_endpoints.py index 25e3aa05..0639e6a0 100644 --- a/tests/unit/test_endpoints.py +++ b/tests/unit/test_endpoints.py @@ -10,7 +10,7 @@ from gene.main import app -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def api_client(): """Provide test client fixture.""" return TestClient(app) @@ -18,26 +18,26 @@ def api_client(): def test_search(api_client): """Test /search endpoint.""" - response = api_client.get('/gene/search?q=braf') + response = api_client.get("/gene/search?q=braf") assert response.status_code == 200 assert ( - response.json()['source_matches']['HGNC']['records'][0]['concept_id'] - == 'hgnc:1097' + response.json()["source_matches"]["HGNC"]["records"][0]["concept_id"] + == "hgnc:1097" ) - response = api_client.get('/gene/search?q=braf&incl=sdkl') + response = api_client.get("/gene/search?q=braf&incl=sdkl") assert response.status_code == 422 def test_normalize(api_client): """Test /normalize endpoint.""" - response = api_client.get('/gene/normalize?q=braf') + response = api_client.get("/gene/normalize?q=braf") assert response.status_code == 200 - assert response.json()['normalized_id'] == 'hgnc:1097' + assert response.json()["normalized_id"] == "hgnc:1097" def test_normalize_unmerged(api_client): """Test /normalize_unmerged endpoint.""" - response = api_client.get('/gene/normalize_unmerged?q=braf') + response = api_client.get("/gene/normalize_unmerged?q=braf") assert response.status_code == 200 - assert response.json()['normalized_concept_id'] == 'hgnc:1097' + assert response.json()["normalized_concept_id"] == "hgnc:1097" diff --git a/tests/unit/test_ensembl_source.py b/tests/unit/test_ensembl_source.py index 0e012a78..7660be3e 100644 --- a/tests/unit/test_ensembl_source.py +++ b/tests/unit/test_ensembl_source.py @@ -5,7 +5,7 @@ from gene.schemas import Gene, MatchType, SourceName -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def ensembl(database): """Build ensembl test fixture.""" @@ -13,7 +13,7 @@ class QueryGetter: def __init__(self): self.query_handler = QueryHandler(database) - def search(self, query_str, incl='ensembl'): + def search(self, query_str, incl="ensembl"): resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.ENSEMBL] @@ -21,162 +21,162 @@ def search(self, query_str, incl='ensembl'): return e -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def ddx11l1(): """Create a DDX11L1 fixutre.""" params = { - 'match_type': MatchType.NO_MATCH, - 'concept_id': 'ensembl:ENSG00000223972', - 'symbol': 'DDX11L1', - 'label': 'DEAD/H-box helicase 11 like 1 (pseudogene)', - 'previous_symbols': [], - 'aliases': [], - 'xrefs': ['hgnc:37102'], - 'symbol_status': None, - 'location_annotations': [], - 'locations': [ + "match_type": MatchType.NO_MATCH, + "concept_id": "ensembl:ENSG00000223972", + "symbol": "DDX11L1", + "label": "DEAD/H-box helicase 11 like 1 (pseudogene)", + "previous_symbols": [], + "aliases": [], + "xrefs": ["hgnc:37102"], + "symbol_status": None, + "location_annotations": [], + "locations": [ { - 'id': 'ga4gh:SL.Ihi0T86UoFIEbH0DHttX2nIw_BdOkI5L', - 'end': 14409, - 'start': 11868, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO', + "id": "ga4gh:SL.Ihi0T86UoFIEbH0DHttX2nIw_BdOkI5L", + "end": 14409, + "start": 11868, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Ya6Rs7DHhDeg7YaOSg1EoNi3U_nQ9SvO", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", } ], - 'strand': '+', - 'associated_with': [], - 'gene_type': 'transcribed_unprocessed_pseudogene', + "strand": "+", + "associated_with": [], + "gene_type": "transcribed_unprocessed_pseudogene", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def tp53(): """Create a TP53 fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'concept_id': 'ensembl:ENSG00000141510', - 'symbol': 'TP53', - 'label': 'tumor protein p53', - 'previous_symbols': [], - 'aliases': [], - 'xrefs': ['hgnc:11998'], - 'symbol_status': None, - 'location_annotations': [], - 'locations': [ + "match_type": MatchType.NO_MATCH, + "concept_id": "ensembl:ENSG00000141510", + "symbol": "TP53", + "label": "tumor protein p53", + "previous_symbols": [], + "aliases": [], + "xrefs": ["hgnc:11998"], + "symbol_status": None, + "location_annotations": [], + "locations": [ { - 'id': 'ga4gh:SL.TlGoA-JmP3Xky3RhJ6_UU3eJKq8EpEp9', - 'end': 7687538, - 'start': 7661778, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', + "id": "ga4gh:SL.TlGoA-JmP3Xky3RhJ6_UU3eJKq8EpEp9", + "end": 7687538, + "start": 7661778, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", } ], - 'strand': '-', - 'associated_with': [], - 'gene_type': 'protein_coding', + "strand": "-", + "associated_with": [], + "gene_type": "protein_coding", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def ATP6AP1_DT(): # noqa: N802 """Create a ATP6AP1-DT test fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'concept_id': 'ensembl:ENSG00000197180', - 'symbol': 'ATP6AP1-DT', - 'label': 'ATP6AP1 divergent transcript', - 'previous_symbols': [], - 'aliases': [], - 'xrefs': ['hgnc:25138'], - 'symbol_status': None, - 'location_annotations': [], - 'locations': [ + "match_type": MatchType.NO_MATCH, + "concept_id": "ensembl:ENSG00000197180", + "symbol": "ATP6AP1-DT", + "label": "ATP6AP1 divergent transcript", + "previous_symbols": [], + "aliases": [], + "xrefs": ["hgnc:25138"], + "symbol_status": None, + "location_annotations": [], + "locations": [ { - 'id': 'ga4gh:SL.bPbeeEGSqjlZJ1Ddmg5T9ptJz9tKxYi3', - 'end': 154428526, - 'start': 154424377, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP', + "id": "ga4gh:SL.bPbeeEGSqjlZJ1Ddmg5T9ptJz9tKxYi3", + "end": 154428526, + "start": 154424377, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", } ], - 'strand': '-', - 'associated_with': [], - 'gene_type': 'lncRNA', + "strand": "-", + "associated_with": [], + "gene_type": "lncRNA", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def hsa_mir_1253(): """Create a hsa-miR-1253 test fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'concept_id': 'ensembl:ENSG00000272920', - 'symbol': 'hsa-mir-1253', - 'label': 'hsa-mir-1253', - 'previous_symbols': [], - 'aliases': [], - 'xrefs': [], - 'symbol_status': None, - 'location_annotations': [], - 'locations': [ + "match_type": MatchType.NO_MATCH, + "concept_id": "ensembl:ENSG00000272920", + "symbol": "hsa-mir-1253", + "label": "hsa-mir-1253", + "previous_symbols": [], + "aliases": [], + "xrefs": [], + "symbol_status": None, + "location_annotations": [], + "locations": [ { - 'id': 'ga4gh:SL.x4kOE6ZXG-xY7nm6bu2W7lvm6ljaJXzR', - 'end': 2748182, - 'start': 2748077, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7', + "id": "ga4gh:SL.x4kOE6ZXG-xY7nm6bu2W7lvm6ljaJXzR", + "end": 2748182, + "start": 2748077, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.dLZ15tNO1Ur0IcGjwc3Sdi_0A6Yf4zm7", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", } ], - 'strand': '+', - 'associated_with': ['mirbase:MI0006387'], - 'gene_type': 'lncRNA', + "strand": "+", + "associated_with": ["mirbase:MI0006387"], + "gene_type": "lncRNA", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def spry3(): """Create a SPRY3 test fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'concept_id': 'ensembl:ENSG00000168939', - 'symbol': 'SPRY3', - 'label': 'sprouty RTK signaling antagonist 3', - 'previous_symbols': [], - 'aliases': [], - 'xrefs': ['hgnc:11271'], - 'symbol_status': None, - 'location_annotations': [], - 'locations': [ + "match_type": MatchType.NO_MATCH, + "concept_id": "ensembl:ENSG00000168939", + "symbol": "SPRY3", + "label": "sprouty RTK signaling antagonist 3", + "previous_symbols": [], + "aliases": [], + "xrefs": ["hgnc:11271"], + "symbol_status": None, + "location_annotations": [], + "locations": [ { - 'id': 'ga4gh:SL.fxU7Axal2_GbyOfW8NQf0plM-SUWFCB0', - 'end': 155782459, - 'start': 155612571, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP', + "id": "ga4gh:SL.fxU7Axal2_GbyOfW8NQf0plM-SUWFCB0", + "end": 155782459, + "start": 155612571, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", } ], - 'strand': '+', - 'associated_with': [], - 'gene_type': 'protein_coding', + "strand": "+", + "associated_with": [], + "gene_type": "protein_coding", } return Gene(**params) @@ -184,137 +184,137 @@ def spry3(): def test_ddx11l1(check_resp_single_record, ensembl, ddx11l1): """Test that DDX11L1 normalizes to correct gene concept.""" # Concept ID - resp = ensembl.search('ensembl:ENSG00000223972') + resp = ensembl.search("ensembl:ENSG00000223972") check_resp_single_record(resp, ddx11l1, MatchType.CONCEPT_ID) - resp = ensembl.search('ENSEMBL:ENSG00000223972') + resp = ensembl.search("ENSEMBL:ENSG00000223972") check_resp_single_record(resp, ddx11l1, MatchType.CONCEPT_ID) - resp = ensembl.search('ENSG00000223972') + resp = ensembl.search("ENSG00000223972") check_resp_single_record(resp, ddx11l1, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search('ddx11l1') + resp = ensembl.search("ddx11l1") check_resp_single_record(resp, ddx11l1, MatchType.SYMBOL) - resp = ensembl.search('DDX11L1') + resp = ensembl.search("DDX11L1") check_resp_single_record(resp, ddx11l1, MatchType.SYMBOL) def test_tp53(check_resp_single_record, ensembl, tp53): """Test that tp53 normalizes to correct gene concept.""" # Concept ID - resp = ensembl.search('ensembl:ENSG00000141510') + resp = ensembl.search("ensembl:ENSG00000141510") check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) - resp = ensembl.search('ENSEMBL:ENSG00000141510') + resp = ensembl.search("ENSEMBL:ENSG00000141510") check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) - resp = ensembl.search('ENSG00000141510') + resp = ensembl.search("ENSG00000141510") check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search('tp53') + resp = ensembl.search("tp53") check_resp_single_record(resp, tp53, MatchType.SYMBOL) - resp = ensembl.search('TP53') + resp = ensembl.search("TP53") check_resp_single_record(resp, tp53, MatchType.SYMBOL) def test_ATP6AP1_DT(check_resp_single_record, ensembl, ATP6AP1_DT): # noqa: N802 N803 """Test that ATP6AP1-DT normalizes to correct gene concept.""" # Concept ID - resp = ensembl.search('ensembl:ENSG00000197180') + resp = ensembl.search("ensembl:ENSG00000197180") check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID) - resp = ensembl.search('ENSEMBL:ENSG00000197180') + resp = ensembl.search("ENSEMBL:ENSG00000197180") check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID) - resp = ensembl.search('ENSG00000197180') + resp = ensembl.search("ENSG00000197180") check_resp_single_record(resp, ATP6AP1_DT, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search('ATP6AP1-DT') + resp = ensembl.search("ATP6AP1-DT") check_resp_single_record(resp, ATP6AP1_DT, MatchType.SYMBOL) def test_hsa_mir_1253(check_resp_single_record, ensembl, hsa_mir_1253): """Test that hsa-mir-1253 normalizes to correct gene concept.""" # Concept ID - resp = ensembl.search('ensembl:ENSG00000272920') + resp = ensembl.search("ensembl:ENSG00000272920") check_resp_single_record(resp, hsa_mir_1253, MatchType.CONCEPT_ID) - resp = ensembl.search('ENSEMBL:ENSG00000272920') + resp = ensembl.search("ENSEMBL:ENSG00000272920") check_resp_single_record(resp, hsa_mir_1253, MatchType.CONCEPT_ID) - resp = ensembl.search('ENSG00000272920') + resp = ensembl.search("ENSG00000272920") check_resp_single_record(resp, hsa_mir_1253, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search('hsa-mir-1253') + resp = ensembl.search("hsa-mir-1253") check_resp_single_record(resp, hsa_mir_1253, MatchType.SYMBOL) # associated_with - resp = ensembl.search('mirbase:MI0006387') + resp = ensembl.search("mirbase:MI0006387") check_resp_single_record(resp, hsa_mir_1253, MatchType.ASSOCIATED_WITH) def test_spry3(check_resp_single_record, ensembl, spry3): """Test that spry3 normalizes to correct gene concept.""" # Concept ID - resp = ensembl.search('ensembl:EnSG00000168939') + resp = ensembl.search("ensembl:EnSG00000168939") check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID) - resp = ensembl.search('ENSEMBL:EnSG00000168939') + resp = ensembl.search("ENSEMBL:EnSG00000168939") check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID) - resp = ensembl.search('EnSG00000168939') + resp = ensembl.search("EnSG00000168939") check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID) # Symbol - resp = ensembl.search('spry3') + resp = ensembl.search("spry3") check_resp_single_record(resp, spry3, MatchType.SYMBOL) def test_no_match(ensembl): """Test that a term normalizes to correct gene concept as a NO match.""" - resp = ensembl.search('A1BG - AS1') + resp = ensembl.search("A1BG - AS1") assert len(resp.records) == 0 - resp = ensembl.search('hnc:5') + resp = ensembl.search("hnc:5") assert len(resp.records) == 0 # Test empty query - resp = ensembl.search('') + resp = ensembl.search("") assert len(resp.records) == 0 # Do not search on label - resp = ensembl.search('A1BG antisense RNA 1') + resp = ensembl.search("A1BG antisense RNA 1") assert len(resp.records) == 0 - resp = ensembl.search('ensembl:ENSG00000278704') + resp = ensembl.search("ensembl:ENSG00000278704") assert len(resp.records) == 0 - resp = ensembl.search('ensembl:ENSG00000284906') + resp = ensembl.search("ensembl:ENSG00000284906") assert len(resp.records) == 0 def test_meta_info(ensembl): """Test that the meta field is correct.""" - resp = ensembl.search('chromosome:1') - assert resp.source_meta_.data_license == 'custom' + resp = ensembl.search("chromosome:1") + assert resp.source_meta_.data_license == "custom" assert ( resp.source_meta_.data_license_url - == 'https://useast.ensembl.org/info/about/legal/disclaimer.html' + == "https://useast.ensembl.org/info/about/legal/disclaimer.html" ) - assert resp.source_meta_.version == '110' + assert resp.source_meta_.version == "110" assert resp.source_meta_.data_url == { - 'genome_annotations': 'ftp://ftp.ensembl.org/pub/release-110/gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz' + "genome_annotations": "ftp://ftp.ensembl.org/pub/release-110/gff3/homo_sapiens/Homo_sapiens.GRCh38.110.gff3.gz" } assert resp.source_meta_.rdp_url is None - assert resp.source_meta_.genome_assemblies == ['GRCh38'] + assert resp.source_meta_.genome_assemblies == ["GRCh38"] assert resp.source_meta_.data_license_attributes == { - 'non_commercial': False, - 'share_alike': False, - 'attribution': False, + "non_commercial": False, + "share_alike": False, + "attribution": False, } diff --git a/tests/unit/test_hgnc_source.py b/tests/unit/test_hgnc_source.py index 1673c2ba..54d0aff0 100644 --- a/tests/unit/test_hgnc_source.py +++ b/tests/unit/test_hgnc_source.py @@ -7,7 +7,7 @@ from gene.schemas import Gene, MatchType, SourceName -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def hgnc(database): """Build hgnc test fixture.""" @@ -15,7 +15,7 @@ class QueryGetter: def __init__(self): self.query_handler = QueryHandler(database) - def search(self, query_str, incl='hgnc'): + def search(self, query_str, incl="hgnc"): resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.HGNC] @@ -26,17 +26,17 @@ def search(self, query_str, incl='hgnc'): # Test Non Alt Loci Set -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def a1bg_as1(): """Create an A1BG-AS1 gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'A1BG antisense RNA 1', - 'concept_id': 'hgnc:37133', - 'symbol': 'A1BG-AS1', - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "A1BG antisense RNA 1", + "concept_id": "hgnc:37133", + "symbol": "A1BG-AS1", + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.Rz-M5wA0_bIhQYLKi2ZPqlqW3nBPfAx5", # "chr": "19", @@ -46,34 +46,34 @@ def a1bg_as1(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': ['NCRNA00181', 'A1BGAS', 'A1BG-AS'], - 'aliases': ['FLJ23569'], - 'symbol_status': 'approved', - 'associated_with': [ - 'vega:OTTHUMG00000183508', - 'ucsc:uc002qse.3', - 'refseq:NR_015380', - 'ena.embl:BC040926', - 'refseq:NR_015380', - 'ena.embl:BC040926', + "previous_symbols": ["NCRNA00181", "A1BGAS", "A1BG-AS"], + "aliases": ["FLJ23569"], + "symbol_status": "approved", + "associated_with": [ + "vega:OTTHUMG00000183508", + "ucsc:uc002qse.3", + "refseq:NR_015380", + "ena.embl:BC040926", + "refseq:NR_015380", + "ena.embl:BC040926", ], - 'xrefs': ['ensembl:ENSG00000268895', 'ncbigene:503538'], - 'gene_type': 'RNA, long non-coding', + "xrefs": ["ensembl:ENSG00000268895", "ncbigene:503538"], + "gene_type": "RNA, long non-coding", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def tp53(): """Create a TP53 gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'tumor protein p53', - 'concept_id': 'hgnc:11998', - 'symbol': 'TP53', - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "tumor protein p53", + "concept_id": "hgnc:11998", + "symbol": "TP53", + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.BPk3okUhv4BBatjkyC7eQQsyXL6YwmeF", # "chr": "17", @@ -83,51 +83,51 @@ def tp53(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': [], - 'aliases': ['p53', 'LFS1'], - 'symbol_status': 'approved', - 'associated_with': [ - 'vega:OTTHUMG00000162125', - 'refseq:NM_000546', - 'cosmic:TP53', - 'omim:191170', - 'ucsc:uc060aur.1', - 'uniprot:P04637', - 'orphanet:120204', - 'ccds:CCDS73968', - 'ccds:CCDS73971', - 'ccds:CCDS73970', - 'ccds:CCDS73969', - 'ccds:CCDS73967', - 'ccds:CCDS73966', - 'ccds:CCDS73965', - 'ccds:CCDS73964', - 'ccds:CCDS73963', - 'ccds:CCDS11118', - 'ccds:CCDS45605', - 'ccds:CCDS45606', - 'ena.embl:AF307851', - 'pubmed:6396087', - 'pubmed:3456488', - 'pubmed:2047879', + "previous_symbols": [], + "aliases": ["p53", "LFS1"], + "symbol_status": "approved", + "associated_with": [ + "vega:OTTHUMG00000162125", + "refseq:NM_000546", + "cosmic:TP53", + "omim:191170", + "ucsc:uc060aur.1", + "uniprot:P04637", + "orphanet:120204", + "ccds:CCDS73968", + "ccds:CCDS73971", + "ccds:CCDS73970", + "ccds:CCDS73969", + "ccds:CCDS73967", + "ccds:CCDS73966", + "ccds:CCDS73965", + "ccds:CCDS73964", + "ccds:CCDS73963", + "ccds:CCDS11118", + "ccds:CCDS45605", + "ccds:CCDS45606", + "ena.embl:AF307851", + "pubmed:6396087", + "pubmed:3456488", + "pubmed:2047879", ], - 'xrefs': ['ensembl:ENSG00000141510', 'ncbigene:7157'], - 'gene_type': 'gene with protein product', + "xrefs": ["ensembl:ENSG00000141510", "ncbigene:7157"], + "gene_type": "gene with protein product", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def a3galt2(): """Create an A3GALT2 gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'alpha 1,3-galactosyltransferase 2', - 'concept_id': 'hgnc:30005', - 'symbol': 'A3GALT2', - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "alpha 1,3-galactosyltransferase 2", + "concept_id": "hgnc:30005", + "symbol": "A3GALT2", + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.iiwv6oaDfVVkjMZ_OH6XEQmM0daVft4u", # "chr": "1", @@ -137,37 +137,37 @@ def a3galt2(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': ['A3GALT2P'], - 'aliases': ['IGBS3S', 'IGB3S'], - 'symbol_status': 'approved', - 'xrefs': ['ensembl:ENSG00000184389', 'ncbigene:127550'], - 'associated_with': [ - 'vega:OTTHUMG00000004125', - 'vega:OTTHUMG00000004125', - 'ucsc:uc031plq.1', - 'uniprot:U3KPV4', - 'ccds:CCDS60080', - 'pubmed:10854427', - 'pubmed:18630988', - 'refseq:NM_001080438', - 'omim:619850', + "previous_symbols": ["A3GALT2P"], + "aliases": ["IGBS3S", "IGB3S"], + "symbol_status": "approved", + "xrefs": ["ensembl:ENSG00000184389", "ncbigene:127550"], + "associated_with": [ + "vega:OTTHUMG00000004125", + "vega:OTTHUMG00000004125", + "ucsc:uc031plq.1", + "uniprot:U3KPV4", + "ccds:CCDS60080", + "pubmed:10854427", + "pubmed:18630988", + "refseq:NM_001080438", + "omim:619850", ], - 'gene_type': 'gene with protein product', + "gene_type": "gene with protein product", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def wdhd1(): """Create a WDHD1 gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'WD repeat and HMG-box DNA binding protein 1', - 'concept_id': 'hgnc:23170', - 'symbol': 'WDHD1', - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "WD repeat and HMG-box DNA binding protein 1", + "concept_id": "hgnc:23170", + "symbol": "WDHD1", + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.sNe5mpPbxivH2KE6HdaDA3U29BkCQXc3", # "chr": "14", @@ -177,80 +177,80 @@ def wdhd1(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': [], - 'aliases': ['AND-1', 'CTF4', 'CHTF4'], - 'symbol_status': 'approved', - 'xrefs': ['ensembl:ENSG00000198554', 'ncbigene:11169'], - 'associated_with': [ - 'vega:OTTHUMG00000140304', - 'refseq:NM_007086', - 'omim:608126', - 'ucsc:uc001xbm.3', - 'uniprot:O75717', - 'ccds:CCDS41955', - 'ccds:CCDS9721', - 'ena.embl:AJ006266', - 'pubmed:9175701', - 'pubmed:20028748', + "previous_symbols": [], + "aliases": ["AND-1", "CTF4", "CHTF4"], + "symbol_status": "approved", + "xrefs": ["ensembl:ENSG00000198554", "ncbigene:11169"], + "associated_with": [ + "vega:OTTHUMG00000140304", + "refseq:NM_007086", + "omim:608126", + "ucsc:uc001xbm.3", + "uniprot:O75717", + "ccds:CCDS41955", + "ccds:CCDS9721", + "ena.embl:AJ006266", + "pubmed:9175701", + "pubmed:20028748", ], - 'gene_type': 'gene with protein product', + "gene_type": "gene with protein product", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def g6pr(): """Create a G6PR gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'glucose-6-phosphatase regulator', - 'concept_id': 'hgnc:4059', - 'symbol': 'G6PR', - 'location_annotations': ['reserved'], - 'locations': [], - 'strand': None, - 'previous_symbols': [], - 'aliases': ['GSD1aSP'], - 'symbol_status': 'approved', - 'xrefs': ['ncbigene:2541'], - 'associated_with': ['pubmed:2172641', 'pubmed:7814621', 'pubmed:2996501'], - 'gene_type': 'unknown', + "match_type": MatchType.NO_MATCH, + "label": "glucose-6-phosphatase regulator", + "concept_id": "hgnc:4059", + "symbol": "G6PR", + "location_annotations": ["reserved"], + "locations": [], + "strand": None, + "previous_symbols": [], + "aliases": ["GSD1aSP"], + "symbol_status": "approved", + "xrefs": ["ncbigene:2541"], + "associated_with": ["pubmed:2172641", "pubmed:7814621", "pubmed:2996501"], + "gene_type": "unknown", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def pirc24(): """Create a PIRC24 gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'piwi-interacting RNA cluster 24', - 'concept_id': 'hgnc:37528', - 'symbol': 'PIRC24', - 'location_annotations': ['6'], - 'locations': [], - 'strand': None, - 'previous_symbols': [], - 'aliases': [], - 'symbol_status': 'approved', - 'xrefs': ['ncbigene:100313810'], - 'associated_with': ['pubmed:17881367'], - 'gene_type': 'RNA, cluster', + "match_type": MatchType.NO_MATCH, + "label": "piwi-interacting RNA cluster 24", + "concept_id": "hgnc:37528", + "symbol": "PIRC24", + "location_annotations": ["6"], + "locations": [], + "strand": None, + "previous_symbols": [], + "aliases": [], + "symbol_status": "approved", + "xrefs": ["ncbigene:100313810"], + "associated_with": ["pubmed:17881367"], + "gene_type": "RNA, cluster", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def gage4(): """Create a GAGE4 gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'G antigen 4', - 'concept_id': 'hgnc:4101', - 'symbol': 'GAGE4', - 'location_annotations': ['not on reference assembly'], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "G antigen 4", + "concept_id": "hgnc:4101", + "symbol": "GAGE4", + "location_annotations": ["not on reference assembly"], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.6KzwrFm2WeSXqwIIiNbAu-pKQQHt2q5Q", # "chr": "X", @@ -260,83 +260,83 @@ def gage4(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': [], - 'aliases': ['CT4.4'], - 'symbol_status': 'approved', - 'xrefs': ['ncbigene:2576'], - 'associated_with': [ - 'refseq:NM_001474', - 'omim:300597', - 'uniprot:P0DSO3', - 'ena.embl:U19145', - 'pubmed:7544395', + "previous_symbols": [], + "aliases": ["CT4.4"], + "symbol_status": "approved", + "xrefs": ["ncbigene:2576"], + "associated_with": [ + "refseq:NM_001474", + "omim:300597", + "uniprot:P0DSO3", + "ena.embl:U19145", + "pubmed:7544395", ], - 'gene_type': 'gene with protein product', + "gene_type": "gene with protein product", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def mafip(): """Create a MAFIP gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'MAFF interacting protein', - 'concept_id': 'hgnc:31102', - 'symbol': 'MAFIP', - 'location_annotations': ['unplaced', '14'], - 'locations': [], - 'strand': None, - 'previous_symbols': [], - 'aliases': ['FLJ35473', 'FLJ00219', 'FLJ39633', 'MIP', 'pp5644', 'TEKT4P4'], - 'symbol_status': 'approved', - 'xrefs': ['ensembl:ENSG00000274847', 'ncbigene:727764'], - 'associated_with': [ - 'vega:OTTHUMG00000188065', - 'refseq:NR_046439', - 'uniprot:Q8WZ33', - 'ena.embl:AK074146', - 'ena.embl:AF289559', - 'pubmed:16549056', - 'pubmed:15881666', + "match_type": MatchType.NO_MATCH, + "label": "MAFF interacting protein", + "concept_id": "hgnc:31102", + "symbol": "MAFIP", + "location_annotations": ["unplaced", "14"], + "locations": [], + "strand": None, + "previous_symbols": [], + "aliases": ["FLJ35473", "FLJ00219", "FLJ39633", "MIP", "pp5644", "TEKT4P4"], + "symbol_status": "approved", + "xrefs": ["ensembl:ENSG00000274847", "ncbigene:727764"], + "associated_with": [ + "vega:OTTHUMG00000188065", + "refseq:NR_046439", + "uniprot:Q8WZ33", + "ena.embl:AK074146", + "ena.embl:AF289559", + "pubmed:16549056", + "pubmed:15881666", ], - 'gene_type': 'unknown', + "gene_type": "unknown", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def mt_7sdna(): """Create a MT-7SDNA gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'mitochondrially encoded 7S DNA', - 'concept_id': 'hgnc:7409', - 'symbol': 'MT-7SDNA', - 'location_annotations': ['MT'], - 'locations': [], - 'strand': None, - 'previous_symbols': ['MT7SDNA'], - 'aliases': [], - 'symbol_status': 'approved', - 'xrefs': [], - 'associated_with': ['pubmed:24709344', 'pubmed:273237'], - 'gene_type': 'region', + "match_type": MatchType.NO_MATCH, + "label": "mitochondrially encoded 7S DNA", + "concept_id": "hgnc:7409", + "symbol": "MT-7SDNA", + "location_annotations": ["MT"], + "locations": [], + "strand": None, + "previous_symbols": ["MT7SDNA"], + "aliases": [], + "symbol_status": "approved", + "xrefs": [], + "associated_with": ["pubmed:24709344", "pubmed:273237"], + "gene_type": "region", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def cecr(): """Create a CECR gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'cat eye syndrome chromosome region', - 'concept_id': 'hgnc:1838', - 'symbol': 'CECR', - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "cat eye syndrome chromosome region", + "concept_id": "hgnc:1838", + "symbol": "CECR", + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.AgASk5sB6LCeaB6rcqOwmrm16ise3pof", # "chr": "22", @@ -346,27 +346,27 @@ def cecr(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': [], - 'aliases': [], - 'symbol_status': 'approved', - 'xrefs': ['ncbigene:1055'], - 'associated_with': [], - 'gene_type': 'region', + "previous_symbols": [], + "aliases": [], + "symbol_status": "approved", + "xrefs": ["ncbigene:1055"], + "associated_with": [], + "gene_type": "region", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def csf2ra(): """Create a CSF2RA gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'colony stimulating factor 2 receptor subunit alpha', - 'concept_id': 'hgnc:2435', - 'symbol': 'CSF2RA', - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "colony stimulating factor 2 receptor subunit alpha", + "concept_id": "hgnc:2435", + "symbol": "CSF2RA", + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.cITg67iNn_QNZTKpJd0I-1JMMhW_yHGU", # "chr": "X", @@ -384,45 +384,45 @@ def csf2ra(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': ['CSF2R'], - 'aliases': ['CD116', 'alphaGMR'], - 'symbol_status': 'approved', - 'xrefs': ['ensembl:ENSG00000198223', 'ncbigene:1438'], - 'associated_with': [ - 'vega:OTTHUMG00000012533', - 'refseq:NM_001161529', - 'orphanet:209477', - 'iuphar:1707', - 'hcdmdb:CD116', - 'omim:306250', - 'omim:425000', - 'ucsc:uc010nvv.3', - 'uniprot:P15509', - 'ena.embl:M64445', - 'ccds:CCDS35190', - 'ccds:CCDS55360', - 'ccds:CCDS35191', - 'ccds:CCDS55359', - 'ccds:CCDS35192', - 'ccds:CCDS35193', - 'pubmed:1702217', + "previous_symbols": ["CSF2R"], + "aliases": ["CD116", "alphaGMR"], + "symbol_status": "approved", + "xrefs": ["ensembl:ENSG00000198223", "ncbigene:1438"], + "associated_with": [ + "vega:OTTHUMG00000012533", + "refseq:NM_001161529", + "orphanet:209477", + "iuphar:1707", + "hcdmdb:CD116", + "omim:306250", + "omim:425000", + "ucsc:uc010nvv.3", + "uniprot:P15509", + "ena.embl:M64445", + "ccds:CCDS35190", + "ccds:CCDS55360", + "ccds:CCDS35191", + "ccds:CCDS55359", + "ccds:CCDS35192", + "ccds:CCDS35193", + "pubmed:1702217", ], - 'gene_type': 'gene with protein product', + "gene_type": "gene with protein product", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def rps24p5(): """Create a RPS24P5 gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'ribosomal protein S24 pseudogene 5', - 'concept_id': 'hgnc:36026', - 'symbol': 'RPS24P5', - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "ribosomal protein S24 pseudogene 5", + "concept_id": "hgnc:36026", + "symbol": "RPS24P5", + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.Ri0ddtMpe6DGzrC9_QGbL35gYAtU2bh_", # "chr": "1", @@ -432,27 +432,27 @@ def rps24p5(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': [], - 'aliases': [], - 'symbol_status': 'approved', - 'xrefs': ['ncbigene:100271094'], - 'associated_with': ['refseq:NG_011274', 'pubmed:19123937'], - 'gene_type': 'pseudogene', + "previous_symbols": [], + "aliases": [], + "symbol_status": "approved", + "xrefs": ["ncbigene:100271094"], + "associated_with": ["refseq:NG_011274", "pubmed:19123937"], + "gene_type": "pseudogene", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def trl_cag2_1(): """Create a TRL-CAG2-1 gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'tRNA-Leu (anticodon CAG) 2-1', - 'concept_id': 'hgnc:34692', - 'symbol': 'TRL-CAG2-1', - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "tRNA-Leu (anticodon CAG) 2-1", + "concept_id": "hgnc:34692", + "symbol": "TRL-CAG2-1", + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.aZ5aYHaC3GhDWgwhKkAcd9GBvkEo034v", # "chr": "16", @@ -462,27 +462,27 @@ def trl_cag2_1(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': ['TRNAL13'], - 'aliases': ['tRNA-Leu-CAG-2-1'], - 'symbol_status': 'approved', - 'xrefs': ['ncbigene:100189130'], - 'associated_with': ['ena.embl:HG983896'], - 'gene_type': 'RNA, transfer', + "previous_symbols": ["TRNAL13"], + "aliases": ["tRNA-Leu-CAG-2-1"], + "symbol_status": "approved", + "xrefs": ["ncbigene:100189130"], + "associated_with": ["ena.embl:HG983896"], + "gene_type": "RNA, transfer", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def myo5b(): """Create a MYO5B gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'myosin VB', - 'concept_id': 'hgnc:7603', - 'symbol': 'MYO5B', - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "myosin VB", + "concept_id": "hgnc:7603", + "symbol": "MYO5B", + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.hFukVqPVLD70cshAz1Gtmd6EC1imobpO", # "chr": "18", @@ -492,23 +492,23 @@ def myo5b(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': [], - 'aliases': ['KIAA1119'], - 'symbol_status': 'approved', - 'xrefs': ['ensembl:ENSG00000167306', 'ncbigene:4645'], - 'associated_with': [ - 'vega:OTTHUMG00000179843', - 'refseq:NM_001080467', - 'omim:606540', - 'ucsc:uc002leb.3', - 'uniprot:Q9ULV0', - 'orphanet:171089', - 'ccds:CCDS42436', - 'ena.embl:AB032945', - 'pubmed:8884266', - 'pubmed:17462998', + "previous_symbols": [], + "aliases": ["KIAA1119"], + "symbol_status": "approved", + "xrefs": ["ensembl:ENSG00000167306", "ncbigene:4645"], + "associated_with": [ + "vega:OTTHUMG00000179843", + "refseq:NM_001080467", + "omim:606540", + "ucsc:uc002leb.3", + "uniprot:Q9ULV0", + "orphanet:171089", + "ccds:CCDS42436", + "ena.embl:AB032945", + "pubmed:8884266", + "pubmed:17462998", ], - 'gene_type': 'gene with protein product', + "gene_type": "gene with protein product", } return Gene(**params) @@ -516,17 +516,17 @@ def myo5b(): # Test Alt Loci Set -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def gstt1(): """Create an GSTT1 gene fixture.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'glutathione S-transferase theta 1', - 'concept_id': 'hgnc:4641', - 'symbol': 'GSTT1', - 'location_annotations': ['alternate reference locus'], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "glutathione S-transferase theta 1", + "concept_id": "hgnc:4641", + "symbol": "GSTT1", + "location_annotations": ["alternate reference locus"], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.g74mxFvAzPoenOlyMjY32j-UFMvjjas_", # "chr": "22", @@ -536,20 +536,20 @@ def gstt1(): # "type": "ChromosomeLocation" # } ], - 'previous_symbols': [], - 'aliases': ['2.5.1.18'], - 'symbol_status': 'approved', - 'associated_with': [ - 'refseq:NM_000853', - 'omim:600436', - 'ucsc:uc002zze.4', - 'uniprot:P30711', - 'orphanet:470418', - 'ena.embl:KI270879', - 'pubmed:8617495', + "previous_symbols": [], + "aliases": ["2.5.1.18"], + "symbol_status": "approved", + "associated_with": [ + "refseq:NM_000853", + "omim:600436", + "ucsc:uc002zze.4", + "uniprot:P30711", + "orphanet:470418", + "ena.embl:KI270879", + "pubmed:8617495", ], - 'xrefs': ['ensembl:ENSG00000277656', 'ncbigene:2952'], - 'gene_type': 'gene with protein product', + "xrefs": ["ensembl:ENSG00000277656", "ncbigene:2952"], + "gene_type": "gene with protein product", } return Gene(**params) @@ -557,273 +557,273 @@ def gstt1(): def test_a1bg_as1(check_resp_single_record, a1bg_as1, hgnc): """Test that a1bg_as1 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:37133') + resp = hgnc.search("hgnc:37133") check_resp_single_record(resp, a1bg_as1, MatchType.CONCEPT_ID) - resp = hgnc.search('HGNC:37133') + resp = hgnc.search("HGNC:37133") check_resp_single_record(resp, a1bg_as1, MatchType.CONCEPT_ID) - resp = hgnc.search('Hgnc:37133') + resp = hgnc.search("Hgnc:37133") check_resp_single_record(resp, a1bg_as1, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('A1BG-AS1') + resp = hgnc.search("A1BG-AS1") check_resp_single_record(resp, a1bg_as1, MatchType.SYMBOL) - resp = hgnc.search('A1BG-as1') + resp = hgnc.search("A1BG-as1") check_resp_single_record(resp, a1bg_as1, MatchType.SYMBOL) # Previous Symbol - resp = hgnc.search('NCRNA00181') + resp = hgnc.search("NCRNA00181") check_resp_single_record(resp, a1bg_as1, MatchType.PREV_SYMBOL) - resp = hgnc.search('A1BGAS') + resp = hgnc.search("A1BGAS") check_resp_single_record(resp, a1bg_as1, MatchType.PREV_SYMBOL) - resp = hgnc.search('A1BG-AS') + resp = hgnc.search("A1BG-AS") check_resp_single_record(resp, a1bg_as1, MatchType.PREV_SYMBOL) # Alias - resp = hgnc.search('FLJ23569') + resp = hgnc.search("FLJ23569") check_resp_single_record(resp, a1bg_as1, MatchType.ALIAS) - resp = hgnc.search('flj23569') + resp = hgnc.search("flj23569") check_resp_single_record(resp, a1bg_as1, MatchType.ALIAS) def test_a3galt2(check_resp_single_record, a3galt2, hgnc): """Test that a3galt2 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:30005') + resp = hgnc.search("hgnc:30005") check_resp_single_record(resp, a3galt2, MatchType.CONCEPT_ID) - resp = hgnc.search('HGNC:30005') + resp = hgnc.search("HGNC:30005") check_resp_single_record(resp, a3galt2, MatchType.CONCEPT_ID) - resp = hgnc.search('Hgnc:30005') + resp = hgnc.search("Hgnc:30005") check_resp_single_record(resp, a3galt2, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('A3GALT2') + resp = hgnc.search("A3GALT2") check_resp_single_record(resp, a3galt2, MatchType.SYMBOL) - resp = hgnc.search('a3galt2') + resp = hgnc.search("a3galt2") check_resp_single_record(resp, a3galt2, MatchType.SYMBOL) # Previous Symbol - resp = hgnc.search('A3GALT2P') + resp = hgnc.search("A3GALT2P") check_resp_single_record(resp, a3galt2, MatchType.PREV_SYMBOL) - resp = hgnc.search('A3GALT2p') + resp = hgnc.search("A3GALT2p") check_resp_single_record(resp, a3galt2, MatchType.PREV_SYMBOL) # Alias - resp = hgnc.search('IGBS3S') + resp = hgnc.search("IGBS3S") check_resp_single_record(resp, a3galt2, MatchType.ALIAS) - resp = hgnc.search('igB3s') + resp = hgnc.search("igB3s") check_resp_single_record(resp, a3galt2, MatchType.ALIAS) def test_tp53(check_resp_single_record, tp53, hgnc): """Test that tp53 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:11998') + resp = hgnc.search("hgnc:11998") check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) - resp = hgnc.search('HGNC:11998') + resp = hgnc.search("HGNC:11998") check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) - resp = hgnc.search('Hgnc:11998') + resp = hgnc.search("Hgnc:11998") check_resp_single_record(resp, tp53, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('tp53') + resp = hgnc.search("tp53") check_resp_single_record(resp, tp53, MatchType.SYMBOL) - resp = hgnc.search('TP53') + resp = hgnc.search("TP53") check_resp_single_record(resp, tp53, MatchType.SYMBOL) # Alias - resp = hgnc.search('LFS1') + resp = hgnc.search("LFS1") check_resp_single_record(resp, tp53, MatchType.ALIAS) - resp = hgnc.search('p53') + resp = hgnc.search("p53") check_resp_single_record(resp, tp53, MatchType.ALIAS) def test_wdhd1(check_resp_single_record, wdhd1, hgnc): """Test that a1bg_as1 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:23170') + resp = hgnc.search("hgnc:23170") check_resp_single_record(resp, wdhd1, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('WDHD1') + resp = hgnc.search("WDHD1") check_resp_single_record(resp, wdhd1, MatchType.SYMBOL) def test_g6pr(check_resp_single_record, g6pr, hgnc): """Test that g6pr normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:4059') + resp = hgnc.search("hgnc:4059") check_resp_single_record(resp, g6pr, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('G6PR') + resp = hgnc.search("G6PR") check_resp_single_record(resp, g6pr, MatchType.SYMBOL) def test_pirc24(check_resp_single_record, pirc24, hgnc): """Test that pirc24 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:37528') + resp = hgnc.search("hgnc:37528") check_resp_single_record(resp, pirc24, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('PIRC24') + resp = hgnc.search("PIRC24") check_resp_single_record(resp, pirc24, MatchType.SYMBOL) def test_gage4(check_resp_single_record, gage4, hgnc): """Test that gage4 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:4101') + resp = hgnc.search("hgnc:4101") check_resp_single_record(resp, gage4, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('GAGE4') + resp = hgnc.search("GAGE4") check_resp_single_record(resp, gage4, MatchType.SYMBOL) def test_mafip(check_resp_single_record, mafip, hgnc): """Test that mafip normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:31102') + resp = hgnc.search("hgnc:31102") check_resp_single_record(resp, mafip, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('MAFIP') + resp = hgnc.search("MAFIP") check_resp_single_record(resp, mafip, MatchType.SYMBOL) def test_mt_7sdna(check_resp_single_record, mt_7sdna, hgnc): """Test that mt_7sdna normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:7409') + resp = hgnc.search("hgnc:7409") check_resp_single_record(resp, mt_7sdna, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('MT-7SDNA') + resp = hgnc.search("MT-7SDNA") check_resp_single_record(resp, mt_7sdna, MatchType.SYMBOL) def test_cecr(check_resp_single_record, cecr, hgnc): """Test that cecr normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:1838') + resp = hgnc.search("hgnc:1838") check_resp_single_record(resp, cecr, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('CECR') + resp = hgnc.search("CECR") check_resp_single_record(resp, cecr, MatchType.SYMBOL) def test_csf2ra(check_resp_single_record, csf2ra, hgnc): """Test that csf2ra normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:2435') + resp = hgnc.search("hgnc:2435") check_resp_single_record(resp, csf2ra, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('CSF2RA') + resp = hgnc.search("CSF2RA") check_resp_single_record(resp, csf2ra, MatchType.SYMBOL) def test_rps24p5(check_resp_single_record, rps24p5, hgnc): """Test that rps24p5 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:36026') + resp = hgnc.search("hgnc:36026") check_resp_single_record(resp, rps24p5, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('rpS24P5') + resp = hgnc.search("rpS24P5") check_resp_single_record(resp, rps24p5, MatchType.SYMBOL) def test_trl_cag2_1(check_resp_single_record, trl_cag2_1, hgnc): """Test that trl_cag2_1 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:34692') + resp = hgnc.search("hgnc:34692") check_resp_single_record(resp, trl_cag2_1, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('TRL-CAG2-1') + resp = hgnc.search("TRL-CAG2-1") check_resp_single_record(resp, trl_cag2_1, MatchType.SYMBOL) def test_myo5b(check_resp_single_record, myo5b, hgnc): """Test that myo5b normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:7603') + resp = hgnc.search("hgnc:7603") check_resp_single_record(resp, myo5b, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('MYO5B') + resp = hgnc.search("MYO5B") check_resp_single_record(resp, myo5b, MatchType.SYMBOL) # associated_with - resp = hgnc.search('refseq:NM_001080467') + resp = hgnc.search("refseq:NM_001080467") check_resp_single_record(resp, myo5b, MatchType.ASSOCIATED_WITH) def test_gstt1(check_resp_single_record, gstt1, hgnc): """Test that gstt1 normalizes to correct gene concept.""" # Concept ID - resp = hgnc.search('hgnc:4641') + resp = hgnc.search("hgnc:4641") check_resp_single_record(resp, gstt1, MatchType.CONCEPT_ID) # Symbol - resp = hgnc.search('GSTT1') + resp = hgnc.search("GSTT1") check_resp_single_record(resp, gstt1, MatchType.SYMBOL) # associated_with - resp = hgnc.search('omim:600436') + resp = hgnc.search("omim:600436") check_resp_single_record(resp, gstt1, MatchType.ASSOCIATED_WITH) def test_no_match(hgnc): """Test that a term normalizes to correct gene concept as a NO match.""" - resp = hgnc.search('A1BG - AS1') + resp = hgnc.search("A1BG - AS1") assert len(resp.records) == 0 - resp = hgnc.search('hnc:5') + resp = hgnc.search("hnc:5") assert len(resp.records) == 0 # Test empty query - resp = hgnc.search('') + resp = hgnc.search("") assert len(resp.records) == 0 # Do not search on label - resp = hgnc.search('A1BG antisense RNA 1') + resp = hgnc.search("A1BG antisense RNA 1") assert len(resp.records) == 0 def test_meta_info(hgnc): """Test that the meta field is correct.""" - resp = hgnc.search('HGNC:37133') - assert resp.source_meta_.data_license == 'CC0' + resp = hgnc.search("HGNC:37133") + assert resp.source_meta_.data_license == "CC0" assert ( - resp.source_meta_.data_license_url == 'https://www.genenames.org/about/license/' + resp.source_meta_.data_license_url == "https://www.genenames.org/about/license/" ) - assert datetime.strptime(resp.source_meta_.version, '%Y%m%d') + assert datetime.strptime(resp.source_meta_.version, "%Y%m%d") assert resp.source_meta_.data_url == { - 'complete_set_archive': 'ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json' + "complete_set_archive": "ftp.ebi.ac.uk/pub/databases/genenames/hgnc/json/hgnc_complete_set.json" } assert resp.source_meta_.rdp_url is None assert resp.source_meta_.genome_assemblies == [] assert resp.source_meta_.data_license_attributes == { - 'non_commercial': False, - 'share_alike': False, - 'attribution': False, + "non_commercial": False, + "share_alike": False, + "attribution": False, } diff --git a/tests/unit/test_ncbi_source.py b/tests/unit/test_ncbi_source.py index 2476a725..d0083a43 100644 --- a/tests/unit/test_ncbi_source.py +++ b/tests/unit/test_ncbi_source.py @@ -25,7 +25,7 @@ def check_ncbi_discontinued_gene(normalizer_response, concept_id, symbol, match_ assert resp.associated_with == [] -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def ncbi(database): """Build ncbi test fixture.""" @@ -33,7 +33,7 @@ class QueryGetter: def __init__(self): self.query_handler = QueryHandler(database) - def search(self, query_str, incl='ncbi'): + def search(self, query_str, incl="ncbi"): resp = self.query_handler.search(query_str, incl=incl) return resp.source_matches[SourceName.NCBI] @@ -41,22 +41,22 @@ def search(self, query_str, incl='ncbi'): return n -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def dpf1(): """Create gene fixture for DPF1.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'double PHD fingers 1', - 'concept_id': 'ncbigene:8193', - 'symbol': 'DPF1', - 'aliases': ['BAF45b', 'NEUD4', 'neuro-d4', 'SMARCG1'], - 'xrefs': ['hgnc:20225', 'ensembl:ENSG00000011332'], - 'previous_symbols': [], - 'associated_with': ['omim:601670'], - 'symbol_status': None, - 'location_annotations': [], - 'strand': '-', - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "double PHD fingers 1", + "concept_id": "ncbigene:8193", + "symbol": "DPF1", + "aliases": ["BAF45b", "NEUD4", "neuro-d4", "SMARCG1"], + "xrefs": ["hgnc:20225", "ensembl:ENSG00000011332"], + "previous_symbols": [], + "associated_with": ["omim:601670"], + "symbol_status": None, + "location_annotations": [], + "strand": "-", + "locations": [ # { # "id": "ga4gh:CL.bzgLv8gt3KHK00OWTAEUNZcdgUjbHU8i", # "chr": "19", @@ -66,37 +66,37 @@ def dpf1(): # "type": "ChromosomeLocation" # }, { - 'id': 'ga4gh:SL.0bmpLh_dlBRrzfviiQY9Vg4iEH0XeR20', - 'end': 38229695, - 'start': 38211005, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', + "id": "ga4gh:SL.0bmpLh_dlBRrzfviiQY9Vg4iEH0XeR20", + "end": 38229695, + "start": 38211005, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", } ], - 'gene_type': 'protein-coding', + "gene_type": "protein-coding", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def pdp1_symbol(): """Create gene fixture for PDP1 (ncbigene:54704).""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'pyruvate dehydrogenase phosphatase catalytic subunit 1', - 'concept_id': 'ncbigene:54704', - 'symbol': 'PDP1', - 'aliases': ['PDH', 'PDP', 'PDPC', 'PPM2A', 'PPM2C'], - 'xrefs': ['hgnc:9279', 'ensembl:ENSG00000164951'], - 'previous_symbols': ['LOC157663', 'PPM2C'], - 'associated_with': ['omim:605993'], - 'symbol_status': None, - 'location_annotations': [], - 'strand': '+', - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "pyruvate dehydrogenase phosphatase catalytic subunit 1", + "concept_id": "ncbigene:54704", + "symbol": "PDP1", + "aliases": ["PDH", "PDP", "PDPC", "PPM2A", "PPM2C"], + "xrefs": ["hgnc:9279", "ensembl:ENSG00000164951"], + "previous_symbols": ["LOC157663", "PPM2C"], + "associated_with": ["omim:605993"], + "symbol_status": None, + "location_annotations": [], + "strand": "+", + "locations": [ # { # "id": "ga4gh:CL.cJsZWKrEtzpFn5psdCtgofb6NaEDVPfB", # "chr": "8", @@ -106,37 +106,37 @@ def pdp1_symbol(): # "type": "ChromosomeLocation" # }, { - 'id': 'ga4gh:SL.-455M-S51D8nXPFoGH0dYNFVFAJxm5dG', - 'end': 93926068, - 'start': 93916922, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs', + "id": "ga4gh:SL.-455M-S51D8nXPFoGH0dYNFVFAJxm5dG", + "end": 93926068, + "start": 93916922, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.209Z7zJ-mFypBEWLk4rNC6S_OxY5p7bs", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", } ], - 'gene_type': 'protein-coding', + "gene_type": "protein-coding", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def pdp1_alias(): """Create gene fixture for PDP1 (ncbigene:403313).""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'phospholipid phosphatase 6', - 'concept_id': 'ncbigene:403313', - 'symbol': 'PLPP6', - 'aliases': ['PDP1', 'PSDP', 'PPAPDC2', 'bA6J24.6', 'LPRP-B', 'PA-PSP'], - 'xrefs': ['hgnc:23682', 'ensembl:ENSG00000205808'], - 'previous_symbols': [], - 'associated_with': ['omim:611666'], - 'symbol_status': None, - 'location_annotations': [], - 'strand': '+', - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "phospholipid phosphatase 6", + "concept_id": "ncbigene:403313", + "symbol": "PLPP6", + "aliases": ["PDP1", "PSDP", "PPAPDC2", "bA6J24.6", "LPRP-B", "PA-PSP"], + "xrefs": ["hgnc:23682", "ensembl:ENSG00000205808"], + "previous_symbols": [], + "associated_with": ["omim:611666"], + "symbol_status": None, + "location_annotations": [], + "strand": "+", + "locations": [ # { # "id": "ga4gh:CL.7ivmMgKAqiFiRh1qsbA909w2kUcPabr_", # "chr": "9", @@ -146,38 +146,38 @@ def pdp1_alias(): # "type": "ChromosomeLocation" # }, { - 'id': 'ga4gh:SL.VI_0P0-ei90MDsLjAeUrDfeXBlZVJtJY', - 'end': 4665258, - 'start': 4662293, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI', + "id": "ga4gh:SL.VI_0P0-ei90MDsLjAeUrDfeXBlZVJtJY", + "end": 4665258, + "start": 4662293, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", } ], - 'gene_type': 'protein-coding', + "gene_type": "protein-coding", } return Gene(**params) # X and Y chromosomes -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def spry3(): """Create gene fixture for SPRY3.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'sprouty RTK signaling antagonist 3', - 'concept_id': 'ncbigene:10251', - 'symbol': 'SPRY3', - 'aliases': ['spry-3'], - 'xrefs': ['hgnc:11271', 'ensembl:ENSG00000168939'], - 'previous_symbols': ['LOC170187', 'LOC253479'], - 'associated_with': ['omim:300531'], - 'symbol_status': None, - 'location_annotations': [], - 'strand': '+', - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "sprouty RTK signaling antagonist 3", + "concept_id": "ncbigene:10251", + "symbol": "SPRY3", + "aliases": ["spry-3"], + "xrefs": ["hgnc:11271", "ensembl:ENSG00000168939"], + "previous_symbols": ["LOC170187", "LOC253479"], + "associated_with": ["omim:300531"], + "symbol_status": None, + "location_annotations": [], + "strand": "+", + "locations": [ # { # "id": "ga4gh:CL.r8Qv_b-B3SoguReqdunL3GCkt1RH-es1", # "chr": "Y", @@ -195,92 +195,92 @@ def spry3(): # "type": "ChromosomeLocation" # }, { - 'id': 'ga4gh:SL.2N5aguRIvBdGemRgABZFutmLTV925dsV', - 'end': 155782459, - 'start': 155612585, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP', + "id": "ga4gh:SL.2N5aguRIvBdGemRgABZFutmLTV925dsV", + "end": 155782459, + "start": 155612585, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", }, { - 'id': 'ga4gh:SL.U9E9WtQdzFc4elR3t1qw48nueHgfWFWL', - 'end': 56968979, - 'start': 56954315, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5', + "id": "ga4gh:SL.U9E9WtQdzFc4elR3t1qw48nueHgfWFWL", + "end": 56968979, + "start": 56954315, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", }, ], - 'gene_type': 'protein-coding', + "gene_type": "protein-coding", } return Gene(**params) # chromosome but no map locations -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def adcp1(): """Create gene fixture for ADCP1.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'adenosine deaminase complexing protein 1', - 'concept_id': 'ncbigene:106', - 'symbol': 'ADCP1', - 'aliases': [], - 'xrefs': ['hgnc:229'], - 'previous_symbols': [], - 'associated_with': [], - 'symbol_status': None, - 'strand': None, - 'location_annotations': ['6'], - 'locations': [], - 'gene_type': 'unknown', + "match_type": MatchType.NO_MATCH, + "label": "adenosine deaminase complexing protein 1", + "concept_id": "ncbigene:106", + "symbol": "ADCP1", + "aliases": [], + "xrefs": ["hgnc:229"], + "previous_symbols": [], + "associated_with": [], + "symbol_status": None, + "strand": None, + "location_annotations": ["6"], + "locations": [], + "gene_type": "unknown", } return Gene(**params) # no chromosome or map locations -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def afa(): """Create gene fixture for AFA.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'ankyloblepharon filiforme adnatum', - 'concept_id': 'ncbigene:170', - 'symbol': 'AFA', - 'aliases': [], - 'xrefs': [], - 'previous_symbols': [], - 'associated_with': ['omim:106250'], - 'symbol_status': None, - 'strand': None, - 'location_annotations': [], - 'locations': [], - 'gene_type': 'unknown', + "match_type": MatchType.NO_MATCH, + "label": "ankyloblepharon filiforme adnatum", + "concept_id": "ncbigene:170", + "symbol": "AFA", + "aliases": [], + "xrefs": [], + "previous_symbols": [], + "associated_with": ["omim:106250"], + "symbol_status": None, + "strand": None, + "location_annotations": [], + "locations": [], + "gene_type": "unknown", } return Gene(**params) # Contains non cytogenic locations (i.e. "map from Rosati....") -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def znf84(): """Create gene fixture for ZNF84.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'zinc finger protein 84', - 'concept_id': 'ncbigene:7637', - 'symbol': 'ZNF84', - 'aliases': ['HPF2'], - 'xrefs': ['hgnc:13159', 'ensembl:ENSG00000198040'], - 'previous_symbols': ['LOC100287429'], - 'associated_with': ['omim:618554'], - 'symbol_status': None, - 'location_annotations': ['map from Rosati ref via FISH [AFS]'], - 'strand': '+', - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "zinc finger protein 84", + "concept_id": "ncbigene:7637", + "symbol": "ZNF84", + "aliases": ["HPF2"], + "xrefs": ["hgnc:13159", "ensembl:ENSG00000198040"], + "previous_symbols": ["LOC100287429"], + "associated_with": ["omim:618554"], + "symbol_status": None, + "location_annotations": ["map from Rosati ref via FISH [AFS]"], + "strand": "+", + "locations": [ # { # "id": "ga4gh:CL.6YvQEs6MuHuNvt0Vlv8r4hMKIOK5Ktq4", # "chr": "12", @@ -290,38 +290,38 @@ def znf84(): # "type": "ChromosomeLocation" # }, { - 'id': 'ga4gh:SL.IRsls9vud2-CiA7Jq4L3ry2VVK7LoNud', - 'end': 133063299, - 'start': 133037508, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl', + "id": "ga4gh:SL.IRsls9vud2-CiA7Jq4L3ry2VVK7LoNud", + "end": 133063299, + "start": 133037508, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.6wlJpONE3oNb4D69ULmEXhqyDZ4vwNfl", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", } ], - 'gene_type': 'protein-coding', + "gene_type": "protein-coding", } return Gene(**params) # No arm or sub band -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def slc25a6(): """Create gene fixture for SLC25A6.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'solute carrier family 25 member 6', - 'concept_id': 'ncbigene:293', - 'symbol': 'SLC25A6', - 'aliases': ['AAC3', 'ANT', 'ANT 2', 'ANT 3', 'ANT3', 'ANT3Y'], - 'xrefs': ['hgnc:10992', 'ensembl:ENSG00000169100', 'ensembl:ENSG00000292334'], - 'previous_symbols': ['ANT3Y'], - 'associated_with': ['omim:300151', 'omim:403000'], - 'symbol_status': None, - 'location_annotations': [], - 'strand': '-', - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "solute carrier family 25 member 6", + "concept_id": "ncbigene:293", + "symbol": "SLC25A6", + "aliases": ["AAC3", "ANT", "ANT 2", "ANT 3", "ANT3", "ANT3Y"], + "xrefs": ["hgnc:10992", "ensembl:ENSG00000169100", "ensembl:ENSG00000292334"], + "previous_symbols": ["ANT3Y"], + "associated_with": ["omim:300151", "omim:403000"], + "symbol_status": None, + "location_annotations": [], + "strand": "-", + "locations": [ # { # "id": "ga4gh:CL.Z5pOXNI2Bt8L2NpypNYsbbtgC9L1uyl4", # "type": "ChromosomeLocation", @@ -339,48 +339,48 @@ def slc25a6(): # "end": "p11.2" # }, { - 'id': 'ga4gh:SL.dvD-ZopQGZkVWx4Z-vFpP9ateicPHgQ6', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP', + "id": "ga4gh:SL.dvD-ZopQGZkVWx4Z-vFpP9ateicPHgQ6", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.w0WZEvgJF0zf_P4yyTzjjv9oW1z61HHP", }, - 'start': 1386151, - 'end': 1392113, + "start": 1386151, + "end": 1392113, }, { - 'id': 'ga4gh:SL.bv3LobZZ-sERq5cIthyS4w_tmSwV2QSg', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5', + "id": "ga4gh:SL.bv3LobZZ-sERq5cIthyS4w_tmSwV2QSg", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.8_liLu1aycC0tPQPFmUaGXJLDs5SbPZ5", }, - 'start': 1386151, - 'end': 1392113, + "start": 1386151, + "end": 1392113, }, ], - 'gene_type': 'protein-coding', + "gene_type": "protein-coding", } return Gene(**params) # Contains arm but no sub band -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def loc106783576(): """Create gene fixture for .""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'nonconserved acetylation island sequence 68 enhancer', - 'concept_id': 'ncbigene:106783576', - 'symbol': 'LOC106783576', - 'aliases': [], - 'xrefs': [], - 'previous_symbols': [], - 'associated_with': [], - 'symbol_status': None, - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "nonconserved acetylation island sequence 68 enhancer", + "concept_id": "ncbigene:106783576", + "symbol": "LOC106783576", + "aliases": [], + "xrefs": [], + "previous_symbols": [], + "associated_with": [], + "symbol_status": None, + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.YYGQrLtmKwKgp38asAkHT8AydAidnui8", # "chr": "10", @@ -390,28 +390,28 @@ def loc106783576(): # "type": "ChromosomeLocation" # } ], - 'gene_type': 'biological-region', + "gene_type": "biological-region", } return Gene(**params) # Testing for cen -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def glc1b(): """Create gene fixture for GLC1B.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'glaucoma 1, open angle, B (adult-onset)', - 'concept_id': 'ncbigene:2722', - 'symbol': 'GLC1B', - 'aliases': [], - 'xrefs': [], - 'previous_symbols': [], - 'associated_with': ['omim:606689'], - 'symbol_status': None, - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "glaucoma 1, open angle, B (adult-onset)", + "concept_id": "ncbigene:2722", + "symbol": "GLC1B", + "aliases": [], + "xrefs": [], + "previous_symbols": [], + "associated_with": ["omim:606689"], + "symbol_status": None, + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.8D0hLCktRxyPrx4Etgabq10vEq6TtU43", # "chr": "2", @@ -421,28 +421,28 @@ def glc1b(): # "type": "ChromosomeLocation" # } ], - 'gene_type': 'unknown', + "gene_type": "unknown", } return Gene(**params) # Testing for ter ranges -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def hdpa(): """Create gene fixture for HDPA.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'Hodgkin disease, susceptibility, pseudoautosomal', - 'concept_id': 'ncbigene:50829', - 'symbol': 'HDPA', - 'aliases': [], - 'xrefs': [], - 'previous_symbols': [], - 'associated_with': ['omim:300221'], - 'symbol_status': None, - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "Hodgkin disease, susceptibility, pseudoautosomal", + "concept_id": "ncbigene:50829", + "symbol": "HDPA", + "aliases": [], + "xrefs": [], + "previous_symbols": [], + "associated_with": ["omim:300221"], + "symbol_status": None, + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.kl9HXvnUCE6Z1ktXibt83NBdXvxnT2RA", # "chr": "X", @@ -452,29 +452,29 @@ def hdpa(): # "type": "ChromosomeLocation" # } ], - 'gene_type': 'unknown', + "gene_type": "unknown", } return Gene(**params) # Testing for annotation -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def prkrap1(): """Create gene fixture for PRKRAP1.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'protein activator of interferon induced protein kinase ' - 'EIF2AK2 pseudogene 1', - 'concept_id': 'ncbigene:731716', - 'symbol': 'PRKRAP1', - 'aliases': [], - 'xrefs': ['hgnc:33447'], - 'previous_symbols': ['LOC100289695'], - 'associated_with': [], - 'symbol_status': None, - 'location_annotations': ['alternate reference locus'], - 'strand': '+', - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "protein activator of interferon induced protein kinase " + "EIF2AK2 pseudogene 1", + "concept_id": "ncbigene:731716", + "symbol": "PRKRAP1", + "aliases": [], + "xrefs": ["hgnc:33447"], + "previous_symbols": ["LOC100289695"], + "associated_with": [], + "symbol_status": None, + "location_annotations": ["alternate reference locus"], + "strand": "+", + "locations": [ # { # "id": "ga4gh:CL.FYt7UkCHZVLpkYe7zhNdMk1K6lxl_k7I", # "chr": "6", @@ -484,48 +484,48 @@ def prkrap1(): # "type": "ChromosomeLocation" # }, { - 'id': 'ga4gh:SL.LwWy5JYncZVnOM9hWiLWW_z0n2eY-peb', - 'end': 3941874, - 'start': 3940269, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1', + "id": "ga4gh:SL.LwWy5JYncZVnOM9hWiLWW_z0n2eY-peb", + "end": 3941874, + "start": 3940269, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.MjujHSAsgNWRTX4w3ysM7b5OVhZpdXu1", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", }, { - 'id': 'ga4gh:SL.q36ql_fX4HrZy_G2EXX_SGWl-7X5Bq6c', - 'end': 3932085, - 'start': 3930480, - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-', + "id": "ga4gh:SL.q36ql_fX4HrZy_G2EXX_SGWl-7X5Bq6c", + "end": 3932085, + "start": 3930480, + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.Q8IworEhpLeXwpz1CHM7C3luysh-ltx-", }, - 'type': 'SequenceLocation', + "type": "SequenceLocation", }, ], - 'gene_type': 'pseudo', + "gene_type": "pseudo", } return Gene(**params) # start > end -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def mhb(): """Create gene fixture for MHB.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'myopathy, hyaline body, autosomal recessive', - 'concept_id': 'ncbigene:619511', - 'symbol': 'MHB', - 'aliases': [], - 'xrefs': [], - 'previous_symbols': [], - 'associated_with': ['omim:255160'], - 'symbol_status': None, - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "myopathy, hyaline body, autosomal recessive", + "concept_id": "ncbigene:619511", + "symbol": "MHB", + "aliases": [], + "xrefs": [], + "previous_symbols": [], + "associated_with": ["omim:255160"], + "symbol_status": None, + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.6vlmdqdXYxSAGsJI9no7kLN5iLKpvr5X", # "chr": "3", @@ -535,28 +535,28 @@ def mhb(): # "type": "ChromosomeLocation" # } ], - 'gene_type': 'unknown', + "gene_type": "unknown", } return Gene(**params) # Different arms -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def spg37(): """Create gene fixture for SPG37.""" params = { - 'match_type': MatchType.NO_MATCH, - 'label': 'spastic paraplegia 37 (autosomal dominant)', - 'concept_id': 'ncbigene:100049159', - 'symbol': 'SPG37', - 'aliases': [], - 'xrefs': [], - 'previous_symbols': [], - 'associated_with': ['omim:611945'], - 'symbol_status': None, - 'location_annotations': [], - 'strand': None, - 'locations': [ + "match_type": MatchType.NO_MATCH, + "label": "spastic paraplegia 37 (autosomal dominant)", + "concept_id": "ncbigene:100049159", + "symbol": "SPG37", + "aliases": [], + "xrefs": [], + "previous_symbols": [], + "associated_with": ["omim:611945"], + "symbol_status": None, + "location_annotations": [], + "strand": None, + "locations": [ # { # "id": "ga4gh:CL.XWbwTwmJ95KD-aCuXfJcD8cNIvXbiXRh", # "chr": "8", @@ -566,349 +566,349 @@ def spg37(): # "type": "ChromosomeLocation" # } ], - 'gene_type': 'unknown', + "gene_type": "unknown", } return Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def source_urls(): """Provide source data URLs fixture.""" return { - 'info_file': 'ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz', - 'history_file': 'ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz', - 'assembly_file': 'ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/', + "info_file": "ftp.ncbi.nlm.nih.govgene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz", + "history_file": "ftp.ncbi.nlm.nih.govgene/DATA/gene_history.gz", + "assembly_file": "ftp.ncbi.nlm.nih.govgenomes/refseq/vertebrate_mammalian/Homo_sapiens/latest_assembly_versions/", } def test_dpf1(check_resp_single_record, ncbi, dpf1): """Test that DPF1 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('ncbigene:8193') + resp = ncbi.search("ncbigene:8193") check_resp_single_record(resp, dpf1, MatchType.CONCEPT_ID) - resp = ncbi.search('ncbIgene:8193') + resp = ncbi.search("ncbIgene:8193") check_resp_single_record(resp, dpf1, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('DPF1') + resp = ncbi.search("DPF1") check_resp_single_record(resp, dpf1, MatchType.SYMBOL) - resp = ncbi.search('DpF1') + resp = ncbi.search("DpF1") check_resp_single_record(resp, dpf1, MatchType.SYMBOL) # Alias - resp = ncbi.search('BAF45b') + resp = ncbi.search("BAF45b") check_resp_single_record(resp, dpf1, MatchType.ALIAS) - resp = ncbi.search('NEUD4') + resp = ncbi.search("NEUD4") check_resp_single_record(resp, dpf1, MatchType.ALIAS) - resp = ncbi.search('neuro-d4') + resp = ncbi.search("neuro-d4") check_resp_single_record(resp, dpf1, MatchType.ALIAS) # associated_with - resp = ncbi.search('omim:601670') + resp = ncbi.search("omim:601670") check_resp_single_record(resp, dpf1, MatchType.ASSOCIATED_WITH) # No Match - resp = ncbi.search('DPF 1') + resp = ncbi.search("DPF 1") assert len(resp.records) == 0 - resp = ncbi.search('DPG1') + resp = ncbi.search("DPG1") assert len(resp.records) == 0 def test_pdp1(compare_records, check_resp_single_record, ncbi, pdp1_symbol, pdp1_alias): """Test that PDP1 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('ncbigene:54704') + resp = ncbi.search("ncbigene:54704") check_resp_single_record(resp, pdp1_symbol, MatchType.CONCEPT_ID) - resp = ncbi.search('NCBIGENE:54704') + resp = ncbi.search("NCBIGENE:54704") check_resp_single_record(resp, pdp1_symbol, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('PDP1') + resp = ncbi.search("PDP1") assert len(resp.records) == 2 # first record check (should always be symbol) compare_records(resp.records[0], pdp1_symbol, MatchType.SYMBOL) compare_records(resp.records[1], pdp1_alias, MatchType.ALIAS) - resp = ncbi.search('pdp1') + resp = ncbi.search("pdp1") assert len(resp.records) == 2 # first record check (should always be symbol) compare_records(resp.records[0], pdp1_symbol, MatchType.SYMBOL) compare_records(resp.records[1], pdp1_alias, MatchType.ALIAS) # Previous Symbol - resp = ncbi.search('LOC157663') + resp = ncbi.search("LOC157663") check_resp_single_record(resp, pdp1_symbol, MatchType.PREV_SYMBOL) - resp = ncbi.search('PPM2C') + resp = ncbi.search("PPM2C") check_resp_single_record(resp, pdp1_symbol, MatchType.PREV_SYMBOL) - resp = ncbi.search('loc157663') + resp = ncbi.search("loc157663") check_resp_single_record(resp, pdp1_symbol, MatchType.PREV_SYMBOL) # Alias - resp = ncbi.search('pdh') + resp = ncbi.search("pdh") check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS) - resp = ncbi.search('PDP') + resp = ncbi.search("PDP") check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS) - resp = ncbi.search('PDPC') + resp = ncbi.search("PDPC") check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS) - resp = ncbi.search('PPM2A') + resp = ncbi.search("PPM2A") check_resp_single_record(resp, pdp1_symbol, MatchType.ALIAS) def test_spry3(check_resp_single_record, ncbi, spry3): """Test that SPRY3 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:10251') + resp = ncbi.search("NCBIgene:10251") check_resp_single_record(resp, spry3, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('sprY3') + resp = ncbi.search("sprY3") check_resp_single_record(resp, spry3, MatchType.SYMBOL) # Alias - resp = ncbi.search('SPRY-3') + resp = ncbi.search("SPRY-3") check_resp_single_record(resp, spry3, MatchType.ALIAS) def test_adcp1(check_resp_single_record, ncbi, adcp1): """Test that ADCP1 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:106') + resp = ncbi.search("NCBIgene:106") check_resp_single_record(resp, adcp1, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('ADCP1') + resp = ncbi.search("ADCP1") check_resp_single_record(resp, adcp1, MatchType.SYMBOL) def test_afa(check_resp_single_record, ncbi, afa): """Test that AFA normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:170') + resp = ncbi.search("NCBIgene:170") check_resp_single_record(resp, afa, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('AFA') + resp = ncbi.search("AFA") check_resp_single_record(resp, afa, MatchType.SYMBOL) def test_znf84(check_resp_single_record, ncbi, znf84): """Test that ZNF84 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:7637') + resp = ncbi.search("NCBIgene:7637") check_resp_single_record(resp, znf84, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('ZNF84') + resp = ncbi.search("ZNF84") check_resp_single_record(resp, znf84, MatchType.SYMBOL) def test_slc25a6(check_resp_single_record, ncbi, slc25a6): """Test that SLC25A6 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:293') + resp = ncbi.search("NCBIgene:293") check_resp_single_record(resp, slc25a6, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('SLC25A6') + resp = ncbi.search("SLC25A6") check_resp_single_record(resp, slc25a6, MatchType.SYMBOL) def test_loc106783576(check_resp_single_record, ncbi, loc106783576): """Test that LOC106783576 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:106783576') + resp = ncbi.search("NCBIgene:106783576") check_resp_single_record(resp, loc106783576, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('LOC106783576') + resp = ncbi.search("LOC106783576") check_resp_single_record(resp, loc106783576, MatchType.SYMBOL) def test_oms(ncbi): """Test that OMS matches to correct gene concept.""" - resp = ncbi.search('NCBIgene:619538') + resp = ncbi.search("NCBIgene:619538") assert len(resp.records) == 0 def test_glc1b(check_resp_single_record, ncbi, glc1b): """Test that GLC1B normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:2722') + resp = ncbi.search("NCBIgene:2722") check_resp_single_record(resp, glc1b, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('GLC1B') + resp = ncbi.search("GLC1B") check_resp_single_record(resp, glc1b, MatchType.SYMBOL) # associated_with - resp = ncbi.search('omim:606689') + resp = ncbi.search("omim:606689") check_resp_single_record(resp, glc1b, MatchType.ASSOCIATED_WITH) def test_hdpa(check_resp_single_record, ncbi, hdpa): """Test that HDPA normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:50829') + resp = ncbi.search("NCBIgene:50829") check_resp_single_record(resp, hdpa, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('HDPA') + resp = ncbi.search("HDPA") check_resp_single_record(resp, hdpa, MatchType.SYMBOL) def test_prkrap1(check_resp_single_record, ncbi, prkrap1): """Test that PRKRAP1 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:731716') + resp = ncbi.search("NCBIgene:731716") check_resp_single_record(resp, prkrap1, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('PRKRAP1') + resp = ncbi.search("PRKRAP1") check_resp_single_record(resp, prkrap1, MatchType.SYMBOL) # xref - resp = ncbi.search('hgnc:33447') + resp = ncbi.search("hgnc:33447") check_resp_single_record(resp, prkrap1, MatchType.XREF) def test_mhb(check_resp_single_record, ncbi, mhb): """Test that MHB normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:619511') + resp = ncbi.search("NCBIgene:619511") check_resp_single_record(resp, mhb, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('MHB') + resp = ncbi.search("MHB") check_resp_single_record(resp, mhb, MatchType.SYMBOL) # associated_with - resp = ncbi.search('OMIM:255160') + resp = ncbi.search("OMIM:255160") check_resp_single_record(resp, mhb, MatchType.ASSOCIATED_WITH) def test_spg37(check_resp_single_record, ncbi, spg37): """Test that SPG37 normalizes to correct gene concept.""" # Concept ID - resp = ncbi.search('NCBIgene:100049159') + resp = ncbi.search("NCBIgene:100049159") check_resp_single_record(resp, spg37, MatchType.CONCEPT_ID) # Symbol - resp = ncbi.search('SPG37') + resp = ncbi.search("SPG37") check_resp_single_record(resp, spg37, MatchType.SYMBOL) # associated_with - resp = ncbi.search('omim:611945') + resp = ncbi.search("omim:611945") check_resp_single_record(resp, spg37, MatchType.ASSOCIATED_WITH) def test_discontinued_genes(ncbi): """Test searches for discontinued genes.""" # HOTS - resp = ncbi.search('ncbigene:103344718') + resp = ncbi.search("ncbigene:103344718") check_ncbi_discontinued_gene( - resp, 'ncbigene:103344718', 'HOTS', MatchType.CONCEPT_ID + resp, "ncbigene:103344718", "HOTS", MatchType.CONCEPT_ID ) - resp = ncbi.search('HOTS') + resp = ncbi.search("HOTS") check_ncbi_discontinued_gene( - resp, 'ncbigene:103344718', 'HOTS', MatchType.CONCEPT_ID + resp, "ncbigene:103344718", "HOTS", MatchType.CONCEPT_ID ) - resp = ncbi.search('hots') + resp = ncbi.search("hots") check_ncbi_discontinued_gene( - resp, 'ncbigene:103344718', 'HOTS', MatchType.CONCEPT_ID + resp, "ncbigene:103344718", "HOTS", MatchType.CONCEPT_ID ) # AASTH23 - resp = ncbi.search('ncbigene:544580') + resp = ncbi.search("ncbigene:544580") check_ncbi_discontinued_gene( - resp, 'ncbigene:544580', 'AASTH23', MatchType.CONCEPT_ID + resp, "ncbigene:544580", "AASTH23", MatchType.CONCEPT_ID ) - resp = ncbi.search('AASTH23') + resp = ncbi.search("AASTH23") check_ncbi_discontinued_gene( - resp, 'ncbigene:544580', 'AASTH23', MatchType.CONCEPT_ID + resp, "ncbigene:544580", "AASTH23", MatchType.CONCEPT_ID ) - resp = ncbi.search('aastH23') + resp = ncbi.search("aastH23") check_ncbi_discontinued_gene( - resp, 'ncbigene:544580', 'AASTH23', MatchType.CONCEPT_ID + resp, "ncbigene:544580", "AASTH23", MatchType.CONCEPT_ID ) def test_no_match(ncbi, source_urls): """Test that nonexistent query doesn"t normalize to a match.""" - response = ncbi.search('cisplatin') + response = ncbi.search("cisplatin") assert len(response.records) == 0 # double-check that meta still populates - assert response.source_meta_.data_license == 'custom' + assert response.source_meta_.data_license == "custom" assert ( response.source_meta_.data_license_url - == 'https://www.ncbi.nlm.nih.gov/home/about/policies/' + == "https://www.ncbi.nlm.nih.gov/home/about/policies/" ) - assert datetime.strptime(response.source_meta_.version, '%Y%m%d') + assert datetime.strptime(response.source_meta_.version, "%Y%m%d") assert response.source_meta_.data_url == source_urls - assert response.source_meta_.rdp_url == 'https://reusabledata.org/ncbi-gene.html' - assert not response.source_meta_.data_license_attributes['non_commercial'] - assert not response.source_meta_.data_license_attributes['share_alike'] - assert not response.source_meta_.data_license_attributes['attribution'] + assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html" + assert not response.source_meta_.data_license_attributes["non_commercial"] + assert not response.source_meta_.data_license_attributes["share_alike"] + assert not response.source_meta_.data_license_attributes["attribution"] # check blank - response = ncbi.search('') + response = ncbi.search("") assert len(response.records) == 0 # check some strange characters - response = ncbi.search('----') + response = ncbi.search("----") assert len(response.records) == 0 response = ncbi.search("''") assert len(response.records) == 0 - response = ncbi.search('~~~') + response = ncbi.search("~~~") assert len(response.records) == 0 - response = ncbi.search(' ') + response = ncbi.search(" ") assert len(response.records) == 0 # Incorrect Concept IDs - response = ncbi.search('ncblgene:8193') + response = ncbi.search("ncblgene:8193") assert len(response.records) == 0 - response = ncbi.search('NCBIGENE54704') + response = ncbi.search("NCBIGENE54704") assert len(response.records) == 0 - response = ncbi.search('54704') + response = ncbi.search("54704") assert len(response.records) == 0 - response = ncbi.search('ncbigene;54704') + response = ncbi.search("ncbigene;54704") assert len(response.records) == 0 def test_meta(ncbi, source_urls): """Test NCBI source metadata.""" - response = ncbi.search('PDP1') - assert response.source_meta_.data_license == 'custom' + response = ncbi.search("PDP1") + assert response.source_meta_.data_license == "custom" assert ( response.source_meta_.data_license_url - == 'https://www.ncbi.nlm.nih.gov/home/about/policies/' + == "https://www.ncbi.nlm.nih.gov/home/about/policies/" ) - assert datetime.strptime(response.source_meta_.version, '%Y%m%d') + assert datetime.strptime(response.source_meta_.version, "%Y%m%d") assert response.source_meta_.data_url == source_urls - assert response.source_meta_.rdp_url == 'https://reusabledata.org/ncbi-gene.html' - assert response.source_meta_.genome_assemblies == ['GRCh38.p14'] + assert response.source_meta_.rdp_url == "https://reusabledata.org/ncbi-gene.html" + assert response.source_meta_.genome_assemblies == ["GRCh38.p14"] assert response.source_meta_.data_license_attributes == { - 'non_commercial': False, - 'share_alike': False, - 'attribution': False, + "non_commercial": False, + "share_alike": False, + "attribution": False, } diff --git a/tests/unit/test_query.py b/tests/unit/test_query.py index 31da2878..4521e5e8 100644 --- a/tests/unit/test_query.py +++ b/tests/unit/test_query.py @@ -6,7 +6,7 @@ from gene.schemas import BaseGene, MatchType, SourceName -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def query_handler(database): """Build query_handler test fixture.""" @@ -14,7 +14,7 @@ class QueryGetter: def __init__(self): self.query_handler = QueryHandler(database) - def search(self, query_str, incl='', excl=''): + def search(self, query_str, incl="", excl=""): return self.query_handler.search(query_str=query_str, incl=incl, excl=excl) def normalize(self, query_str): @@ -26,79 +26,79 @@ def normalize_unmerged(self, query_str): return QueryGetter() -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def normalized_ache(): """Return normalized core Gene object for ACHE.""" params = { - 'type': 'Gene', - 'id': 'normalize.gene.hgnc:108', - 'label': 'ACHE', - 'mappings': [ + "type": "Gene", + "id": "normalize.gene.hgnc:108", + "label": "ACHE", + "mappings": [ { - 'coding': {'code': 'ENSG00000087085', 'system': 'ensembl'}, - 'relation': 'relatedMatch', + "coding": {"code": "ENSG00000087085", "system": "ensembl"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '43', 'system': 'ncbigene'}, - 'relation': 'relatedMatch', + "coding": {"code": "43", "system": "ncbigene"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'OTTHUMG00000157033', 'system': 'vega'}, - 'relation': 'relatedMatch', + "coding": {"code": "OTTHUMG00000157033", "system": "vega"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'uc003uxi.4', 'system': 'ucsc'}, - 'relation': 'relatedMatch', + "coding": {"code": "uc003uxi.4", "system": "ucsc"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS5710', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS5710", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS64736', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS64736", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS5709', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS5709", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'P22303', 'system': 'uniprot'}, - 'relation': 'relatedMatch', + "coding": {"code": "P22303", "system": "uniprot"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '1380483', 'system': 'pubmed'}, - 'relation': 'relatedMatch', + "coding": {"code": "1380483", "system": "pubmed"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '100740', 'system': 'omim'}, - 'relation': 'relatedMatch', + "coding": {"code": "100740", "system": "omim"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'S09.979', 'system': 'merops'}, - 'relation': 'relatedMatch', + "coding": {"code": "S09.979", "system": "merops"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '2465', 'system': 'iuphar'}, - 'relation': 'relatedMatch', + "coding": {"code": "2465", "system": "iuphar"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'NM_015831', 'system': 'refseq'}, - 'relation': 'relatedMatch', + "coding": {"code": "NM_015831", "system": "refseq"}, + "relation": "relatedMatch", }, ], - 'aliases': ['3.1.1.7', 'YT', 'N-ACHE', 'ARACHE', 'ACEE'], - 'extensions': [ - {'name': 'previous_symbols', 'value': ['ACEE', 'YT'], 'type': 'Extension'}, + "aliases": ["3.1.1.7", "YT", "N-ACHE", "ARACHE", "ACEE"], + "extensions": [ + {"name": "previous_symbols", "value": ["ACEE", "YT"], "type": "Extension"}, { - 'name': 'approved_name', - 'value': 'acetylcholinesterase (Cartwright blood group)', - 'type': 'Extension', + "name": "approved_name", + "value": "acetylcholinesterase (Cartwright blood group)", + "type": "Extension", }, - {'name': 'symbol_status', 'value': 'approved', 'type': 'Extension'}, + {"name": "symbol_status", "value": "approved", "type": "Extension"}, { - 'name': 'ncbi_locations', - 'value': [ + "name": "ncbi_locations", + "value": [ # { # "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5", # "type": "ChromosomeLocation", @@ -108,17 +108,17 @@ def normalized_ache(): # "start": "q22.1" # }, { - 'id': 'ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', + "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, - 'start': 100889993, - 'end': 100896994, + "start": 100889993, + "end": 100896994, } ], - 'type': 'Extension', + "type": "Extension", }, # { # "name": "hgnc_locations", @@ -135,117 +135,117 @@ def normalized_ache(): # "type": "Extension" # }, { - 'name': 'ensembl_locations', - 'value': [ + "name": "ensembl_locations", + "value": [ { - 'id': 'ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', + "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, - 'start': 100889993, - 'end': 100896974, + "start": 100889993, + "end": 100896974, } ], - 'type': 'Extension', + "type": "Extension", }, - {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'}, + {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"}, { - 'name': 'hgnc_locus_type', - 'type': 'Extension', - 'value': 'gene with protein product', + "name": "hgnc_locus_type", + "type": "Extension", + "value": "gene with protein product", }, - {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'}, - {'name': 'strand', 'type': 'Extension', 'value': '-'}, + {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"}, + {"name": "strand", "type": "Extension", "value": "-"}, ], } return core_models.Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def normalized_braf(): """Return normalized core Gene object for BRAF.""" params = { - 'type': 'Gene', - 'id': 'normalize.gene.hgnc:1097', - 'label': 'BRAF', - 'mappings': [ + "type": "Gene", + "id": "normalize.gene.hgnc:1097", + "label": "BRAF", + "mappings": [ { - 'coding': {'code': '673', 'system': 'ncbigene'}, - 'relation': 'relatedMatch', + "coding": {"code": "673", "system": "ncbigene"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'ENSG00000157764', 'system': 'ensembl'}, - 'relation': 'relatedMatch', + "coding": {"code": "ENSG00000157764", "system": "ensembl"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS5863', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS5863", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '1943', 'system': 'iuphar'}, - 'relation': 'relatedMatch', + "coding": {"code": "1943", "system": "iuphar"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '119066', 'system': 'orphanet'}, - 'relation': 'relatedMatch', + "coding": {"code": "119066", "system": "orphanet"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'BRAF', 'system': 'cosmic'}, - 'relation': 'relatedMatch', + "coding": {"code": "BRAF", "system": "cosmic"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '2284096', 'system': 'pubmed'}, - 'relation': 'relatedMatch', + "coding": {"code": "2284096", "system": "pubmed"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'uc003vwc.5', 'system': 'ucsc'}, - 'relation': 'relatedMatch', + "coding": {"code": "uc003vwc.5", "system": "ucsc"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '164757', 'system': 'omim'}, - 'relation': 'relatedMatch', + "coding": {"code": "164757", "system": "omim"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'NM_004333', 'system': 'refseq'}, - 'relation': 'relatedMatch', + "coding": {"code": "NM_004333", "system": "refseq"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS87555', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS87555", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'P15056', 'system': 'uniprot'}, - 'relation': 'relatedMatch', + "coding": {"code": "P15056", "system": "uniprot"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'M95712', 'system': 'ena.embl'}, - 'relation': 'relatedMatch', + "coding": {"code": "M95712", "system": "ena.embl"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'OTTHUMG00000157457', 'system': 'vega'}, - 'relation': 'relatedMatch', + "coding": {"code": "OTTHUMG00000157457", "system": "vega"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '1565476', 'system': 'pubmed'}, - 'relation': 'relatedMatch', + "coding": {"code": "1565476", "system": "pubmed"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS94219', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS94219", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS94218', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS94218", "system": "ccds"}, + "relation": "relatedMatch", }, ], - 'aliases': ['BRAF1', 'BRAF-1', 'RAFB1', 'NS7', 'B-RAF1', 'B-raf'], - 'extensions': [ + "aliases": ["BRAF1", "BRAF-1", "RAFB1", "NS7", "B-RAF1", "B-raf"], + "extensions": [ { - 'name': 'approved_name', - 'value': 'B-Raf proto-oncogene, serine/threonine kinase', - 'type': 'Extension', + "name": "approved_name", + "value": "B-Raf proto-oncogene, serine/threonine kinase", + "type": "Extension", }, # { # "name": "hgnc_locations", @@ -262,24 +262,24 @@ def normalized_braf(): # "type": "Extension" # }, { - 'name': 'ensembl_locations', - 'value': [ + "name": "ensembl_locations", + "value": [ { - 'id': 'ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', + "id": "ga4gh:SL.WJ0hsPzXuK54mQyVysTqUNV5jaCATnRf", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, - 'start': 140719326, - 'end': 140924929, + "start": 140719326, + "end": 140924929, } ], - 'type': 'Extension', + "type": "Extension", }, { - 'name': 'ncbi_locations', - 'value': [ + "name": "ncbi_locations", + "value": [ # { # "id": "ga4gh:CL.ZZZYpOwuW1BLLJXc_Dm4eVZ5E0smVYCc", # "type": "ChromosomeLocation", @@ -289,124 +289,124 @@ def normalized_braf(): # "end": "q34" # }, { - 'id': 'ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', + "id": "ga4gh:SL.uNBZoxhjhohl24VlIut-JxPJAGfJ7EQE", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, - 'start': 140713327, - 'end': 140924929, + "start": 140713327, + "end": 140924929, } ], - 'type': 'Extension', + "type": "Extension", }, - {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'}, + {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"}, { - 'name': 'hgnc_locus_type', - 'type': 'Extension', - 'value': 'gene with protein product', + "name": "hgnc_locus_type", + "type": "Extension", + "value": "gene with protein product", }, - {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'}, - {'name': 'strand', 'type': 'Extension', 'value': '-'}, - {'name': 'symbol_status', 'type': 'Extension', 'value': 'approved'}, + {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"}, + {"name": "strand", "type": "Extension", "value": "-"}, + {"name": "symbol_status", "type": "Extension", "value": "approved"}, ], } return core_models.Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def normalized_abl1(): """Return normalized core Gene object for ABL1.""" params = { - 'type': 'Gene', - 'id': 'normalize.gene.hgnc:76', - 'label': 'ABL1', - 'mappings': [ + "type": "Gene", + "id": "normalize.gene.hgnc:76", + "label": "ABL1", + "mappings": [ { - 'coding': {'code': 'ENSG00000097007', 'system': 'ensembl'}, - 'relation': 'relatedMatch', + "coding": {"code": "ENSG00000097007", "system": "ensembl"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '25', 'system': 'ncbigene'}, - 'relation': 'relatedMatch', + "coding": {"code": "25", "system": "ncbigene"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'OTTHUMG00000020813', 'system': 'vega'}, - 'relation': 'relatedMatch', + "coding": {"code": "OTTHUMG00000020813", "system": "vega"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'uc004bzv.4', 'system': 'ucsc'}, - 'relation': 'relatedMatch', + "coding": {"code": "uc004bzv.4", "system": "ucsc"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS35166', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS35166", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS35165', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS35165", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'P00519', 'system': 'uniprot'}, - 'relation': 'relatedMatch', + "coding": {"code": "P00519", "system": "uniprot"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '1857987', 'system': 'pubmed'}, - 'relation': 'relatedMatch', + "coding": {"code": "1857987", "system": "pubmed"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '12626632', 'system': 'pubmed'}, - 'relation': 'relatedMatch', + "coding": {"code": "12626632", "system": "pubmed"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'ABL1', 'system': 'cosmic'}, - 'relation': 'relatedMatch', + "coding": {"code": "ABL1", "system": "cosmic"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '189980', 'system': 'omim'}, - 'relation': 'relatedMatch', + "coding": {"code": "189980", "system": "omim"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '117691', 'system': 'orphanet'}, - 'relation': 'relatedMatch', + "coding": {"code": "117691", "system": "orphanet"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '1923', 'system': 'iuphar'}, - 'relation': 'relatedMatch', + "coding": {"code": "1923", "system": "iuphar"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'M14752', 'system': 'ena.embl'}, - 'relation': 'relatedMatch', + "coding": {"code": "M14752", "system": "ena.embl"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'NM_007313', 'system': 'refseq'}, - 'relation': 'relatedMatch', + "coding": {"code": "NM_007313", "system": "refseq"}, + "relation": "relatedMatch", }, ], - 'aliases': [ - 'c-ABL', - 'JTK7', - 'p150', - 'CHDSKM', - 'BCR-ABL', - 'v-abl', - 'c-ABL1', - 'bcr/abl', - 'LOC116063', - 'LOC112779', - 'ABL', + "aliases": [ + "c-ABL", + "JTK7", + "p150", + "CHDSKM", + "BCR-ABL", + "v-abl", + "c-ABL1", + "bcr/abl", + "LOC116063", + "LOC112779", + "ABL", ], - 'extensions': [ + "extensions": [ { - 'name': 'previous_symbols', - 'value': ['LOC116063', 'LOC112779', 'ABL'], - 'type': 'Extension', + "name": "previous_symbols", + "value": ["LOC116063", "LOC112779", "ABL"], + "type": "Extension", }, { - 'name': 'approved_name', - 'value': 'ABL proto-oncogene 1, non-receptor tyrosine kinase', - 'type': 'Extension', + "name": "approved_name", + "value": "ABL proto-oncogene 1, non-receptor tyrosine kinase", + "type": "Extension", }, # { # "name": "hgnc_locations", @@ -423,8 +423,8 @@ def normalized_abl1(): # "type": "Extension" # }, { - 'name': 'ncbi_locations', - 'value': [ + "name": "ncbi_locations", + "value": [ # { # "id": "ga4gh:CL.1vsxettosueUHyFIOoTPzwIFD1DodLuT", # "type": "ChromosomeLocation", @@ -434,111 +434,111 @@ def normalized_abl1(): # "end": "q34.12" # }, { - 'id': 'ga4gh:SL.F1QUtInXQaBEjAJNR1sYHXdp0XC000Qi', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI', + "id": "ga4gh:SL.F1QUtInXQaBEjAJNR1sYHXdp0XC000Qi", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", }, - 'start': 130713042, - 'end': 130887675, + "start": 130713042, + "end": 130887675, } ], - 'type': 'Extension', + "type": "Extension", }, { - 'name': 'ensembl_locations', - 'value': [ + "name": "ensembl_locations", + "value": [ { - 'id': 'ga4gh:SL.P9Qu87GYxoWPYh1BdAQC5bTLorjvvye7', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI', + "id": "ga4gh:SL.P9Qu87GYxoWPYh1BdAQC5bTLorjvvye7", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.KEO-4XBcm1cxeo_DIQ8_ofqGUkp4iZhI", }, - 'start': 130713015, - 'end': 130887675, + "start": 130713015, + "end": 130887675, } ], - 'type': 'Extension', + "type": "Extension", }, - {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'}, + {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"}, { - 'name': 'hgnc_locus_type', - 'type': 'Extension', - 'value': 'gene with protein product', + "name": "hgnc_locus_type", + "type": "Extension", + "value": "gene with protein product", }, - {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'}, - {'name': 'strand', 'type': 'Extension', 'value': '+'}, - {'name': 'symbol_status', 'type': 'Extension', 'value': 'approved'}, + {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"}, + {"name": "strand", "type": "Extension", "value": "+"}, + {"name": "symbol_status", "type": "Extension", "value": "approved"}, ], } return core_models.Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def normalized_p150(): """Return normalized core Gene object for p150.""" params = { - 'type': 'Gene', - 'id': 'normalize.gene.hgnc:1910', - 'label': 'CHAF1A', - 'mappings': [ + "type": "Gene", + "id": "normalize.gene.hgnc:1910", + "label": "CHAF1A", + "mappings": [ { - 'coding': {'code': 'ENSG00000167670', 'system': 'ensembl'}, - 'relation': 'relatedMatch', + "coding": {"code": "ENSG00000167670", "system": "ensembl"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '10036', 'system': 'ncbigene'}, - 'relation': 'relatedMatch', + "coding": {"code": "10036", "system": "ncbigene"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '601246', 'system': 'omim'}, - 'relation': 'relatedMatch', + "coding": {"code": "601246", "system": "omim"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'CCDS32875', 'system': 'ccds'}, - 'relation': 'relatedMatch', + "coding": {"code": "CCDS32875", "system": "ccds"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '7600578', 'system': 'pubmed'}, - 'relation': 'relatedMatch', + "coding": {"code": "7600578", "system": "pubmed"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'OTTHUMG00000181922', 'system': 'vega'}, - 'relation': 'relatedMatch', + "coding": {"code": "OTTHUMG00000181922", "system": "vega"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'Q13111', 'system': 'uniprot'}, - 'relation': 'relatedMatch', + "coding": {"code": "Q13111", "system": "uniprot"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'NM_005483', 'system': 'refseq'}, - 'relation': 'relatedMatch', + "coding": {"code": "NM_005483", "system": "refseq"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'U20979', 'system': 'ena.embl'}, - 'relation': 'relatedMatch', + "coding": {"code": "U20979", "system": "ena.embl"}, + "relation": "relatedMatch", }, { - 'coding': {'code': 'uc002mal.4', 'system': 'ucsc'}, - 'relation': 'relatedMatch', + "coding": {"code": "uc002mal.4", "system": "ucsc"}, + "relation": "relatedMatch", }, ], - 'aliases': [ - 'CAF1P150', - 'MGC71229', - 'CAF-1', - 'P150', - 'CAF1B', - 'CAF1', - 'LOC107985297', + "aliases": [ + "CAF1P150", + "MGC71229", + "CAF-1", + "P150", + "CAF1B", + "CAF1", + "LOC107985297", ], - 'extensions': [ + "extensions": [ { - 'name': 'approved_name', - 'value': 'chromatin assembly factor 1 subunit A', - 'type': 'Extension', + "name": "approved_name", + "value": "chromatin assembly factor 1 subunit A", + "type": "Extension", }, # { # "name": "hgnc_locations", @@ -555,23 +555,23 @@ def normalized_p150(): # "type": "Extension" # }, { - 'name': 'ensembl_locations', - 'value': [ + "name": "ensembl_locations", + "value": [ { - 'id': 'ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', + "id": "ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", }, - 'start': 4402639, - 'end': 4445018, + "start": 4402639, + "end": 4445018, } ], }, { - 'name': 'ncbi_locations', - 'value': [ + "name": "ncbi_locations", + "value": [ # { # "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ", # "type": "ChromosomeLocation", @@ -581,54 +581,54 @@ def normalized_p150(): # "end": "p13.3" # }, { - 'id': 'ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', + "id": "ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", }, - 'start': 4402639, - 'end': 4450830, + "start": 4402639, + "end": 4450830, } ], }, - {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'protein-coding'}, + {"name": "ncbi_gene_type", "type": "Extension", "value": "protein-coding"}, { - 'name': 'hgnc_locus_type', - 'type': 'Extension', - 'value': 'gene with protein product', + "name": "hgnc_locus_type", + "type": "Extension", + "value": "gene with protein product", }, - {'name': 'ensembl_biotype', 'type': 'Extension', 'value': 'protein_coding'}, + {"name": "ensembl_biotype", "type": "Extension", "value": "protein_coding"}, { - 'name': 'previous_symbols', - 'type': 'Extension', - 'value': ['LOC107985297'], + "name": "previous_symbols", + "type": "Extension", + "value": ["LOC107985297"], }, - {'name': 'strand', 'type': 'Extension', 'value': '+'}, - {'name': 'symbol_status', 'type': 'Extension', 'value': 'approved'}, + {"name": "strand", "type": "Extension", "value": "+"}, + {"name": "symbol_status", "type": "Extension", "value": "approved"}, ], } return core_models.Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def normalized_loc_653303(): """Provide test fixture for NCBI gene LOC653303. Used to validate normalized results that don't merge records. """ params = { - 'type': 'Gene', - 'label': 'LOC653303', - 'aliases': ['LOC196266', 'LOC654080', 'LOC731196'], - 'extensions': [ + "type": "Gene", + "label": "LOC653303", + "aliases": ["LOC196266", "LOC654080", "LOC731196"], + "extensions": [ { - 'type': 'Extension', - 'name': 'approved_name', - 'value': 'proprotein convertase subtilisin/kexin type 7 pseudogene', + "type": "Extension", + "name": "approved_name", + "value": "proprotein convertase subtilisin/kexin type 7 pseudogene", }, { - 'name': 'ncbi_locations', - 'value': [ + "name": "ncbi_locations", + "value": [ # { # "id": "ga4gh:CL.82tL1yxucvwp5U2Yo4jNYX06pru8zZYl", # "type": "ChromosomeLocation", @@ -638,48 +638,48 @@ def normalized_loc_653303(): # "end": "q23.3" # }, { - 'id': 'ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1', + "id": "ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", }, - 'start': 117135528, - 'end': 117138867, + "start": 117135528, + "end": 117138867, } ], }, { - 'type': 'Extension', - 'name': 'previous_symbols', - 'value': ['LOC196266', 'LOC731196', 'LOC654080'], + "type": "Extension", + "name": "previous_symbols", + "value": ["LOC196266", "LOC731196", "LOC654080"], }, - {'type': 'Extension', 'name': 'ncbi_gene_type', 'value': 'pseudo'}, - {'name': 'strand', 'type': 'Extension', 'value': '+'}, + {"type": "Extension", "name": "ncbi_gene_type", "value": "pseudo"}, + {"name": "strand", "type": "Extension", "value": "+"}, ], - 'id': 'normalize.gene.ncbigene:653303', + "id": "normalize.gene.ncbigene:653303", } return core_models.Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def normalize_unmerged_loc_653303(): """Provide fixture for NCBI gene LOC655303. Used to validate normalized results that don't merge records. """ return { - 'normalized_concept_id': 'ncbigene:653303', - 'source_matches': { - 'NCBI': { - 'records': [ + "normalized_concept_id": "ncbigene:653303", + "source_matches": { + "NCBI": { + "records": [ { - 'concept_id': 'ncbigene:653303', - 'symbol': 'LOC653303', - 'symbol_status': None, - 'label': 'proprotein convertase subtilisin/kexin type 7 pseudogene', - 'strand': '+', - 'location_annotations': [], - 'locations': [ + "concept_id": "ncbigene:653303", + "symbol": "LOC653303", + "symbol_status": None, + "label": "proprotein convertase subtilisin/kexin type 7 pseudogene", + "strand": "+", + "location_annotations": [], + "locations": [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.82tL1yxucvwp5U2Yo4jNYX06pru8zZYl", @@ -689,21 +689,21 @@ def normalize_unmerged_loc_653303(): # "end": "q23.3" # }, { - 'id': 'ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1', + "id": "ga4gh:SL.hgpw5EH5q6_PFX1CTcOx5od0LKUQRuDH", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.2NkFm8HK88MqeNkCgj78KidCAXgnsfV1", }, - 'start': 117135528, - 'end': 117138867, + "start": 117135528, + "end": 117138867, } ], - 'aliases': [], - 'previous_symbols': ['LOC196266', 'LOC731196', 'LOC654080'], - 'xrefs': [], - 'associated_with': [], - 'gene_type': 'pseudo', + "aliases": [], + "previous_symbols": ["LOC196266", "LOC731196", "LOC654080"], + "xrefs": [], + "associated_with": [], + "gene_type": "pseudo", } ] } @@ -711,22 +711,22 @@ def normalize_unmerged_loc_653303(): } -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def normalize_unmerged_chaf1a(): """Return expected results from /normalize_unmerged for CHAF1A.""" return { - 'normalized_concept_id': 'hgnc:1910', - 'source_matches': { - 'HGNC': { - 'records': [ + "normalized_concept_id": "hgnc:1910", + "source_matches": { + "HGNC": { + "records": [ { - 'concept_id': 'hgnc:1910', - 'symbol': 'CHAF1A', - 'symbol_status': 'approved', - 'label': 'chromatin assembly factor 1 subunit A', - 'strand': None, - 'location_annotations': [], - 'locations': [ + "concept_id": "hgnc:1910", + "symbol": "CHAF1A", + "symbol_status": "approved", + "label": "chromatin assembly factor 1 subunit A", + "strand": None, + "location_annotations": [], + "locations": [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ", @@ -736,69 +736,69 @@ def normalize_unmerged_chaf1a(): # "end": "p13.3" # } ], - 'aliases': [ - 'CAF1P150', - 'P150', - 'CAF1', - 'CAF1B', - 'MGC71229', - 'CAF-1', + "aliases": [ + "CAF1P150", + "P150", + "CAF1", + "CAF1B", + "MGC71229", + "CAF-1", ], - 'previous_symbols': [], - 'xrefs': ['ensembl:ENSG00000167670', 'ncbigene:10036'], - 'associated_with': [ - 'vega:OTTHUMG00000181922', - 'ccds:CCDS32875', - 'ucsc:uc002mal.4', - 'pubmed:7600578', - 'uniprot:Q13111', - 'omim:601246', - 'ena.embl:U20979', - 'refseq:NM_005483', + "previous_symbols": [], + "xrefs": ["ensembl:ENSG00000167670", "ncbigene:10036"], + "associated_with": [ + "vega:OTTHUMG00000181922", + "ccds:CCDS32875", + "ucsc:uc002mal.4", + "pubmed:7600578", + "uniprot:Q13111", + "omim:601246", + "ena.embl:U20979", + "refseq:NM_005483", ], - 'gene_type': 'gene with protein product', + "gene_type": "gene with protein product", } ], }, - 'Ensembl': { - 'records': [ + "Ensembl": { + "records": [ { - 'concept_id': 'ensembl:ENSG00000167670', - 'symbol': 'CHAF1A', - 'symbol_status': None, - 'label': 'chromatin assembly factor 1 subunit A', - 'strand': '+', - 'location_annotations': [], - 'locations': [ + "concept_id": "ensembl:ENSG00000167670", + "symbol": "CHAF1A", + "symbol_status": None, + "label": "chromatin assembly factor 1 subunit A", + "strand": "+", + "location_annotations": [], + "locations": [ { - 'id': 'ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', + "id": "ga4gh:SL.tLUFh2LAYq-bsMi0Vob_TIWrz-sE4HgE", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", }, - 'start': 4402639, - 'end': 4445018, + "start": 4402639, + "end": 4445018, } ], - 'aliases': [], - 'previous_symbols': [], - 'xrefs': ['hgnc:1910'], - 'associated_with': [], - 'gene_type': 'protein_coding', + "aliases": [], + "previous_symbols": [], + "xrefs": ["hgnc:1910"], + "associated_with": [], + "gene_type": "protein_coding", } ], }, - 'NCBI': { - 'records': [ + "NCBI": { + "records": [ { - 'concept_id': 'ncbigene:10036', - 'symbol': 'CHAF1A', - 'symbol_status': None, - 'label': 'chromatin assembly factor 1 subunit A', - 'strand': '+', - 'location_annotations': [], - 'locations': [ + "concept_id": "ncbigene:10036", + "symbol": "CHAF1A", + "symbol_status": None, + "label": "chromatin assembly factor 1 subunit A", + "strand": "+", + "location_annotations": [], + "locations": [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.kPEG2TGUPOAsAYK6HY0ukprQ-DR_IuMZ", @@ -808,21 +808,21 @@ def normalize_unmerged_chaf1a(): # "end": "p13.3" # }, { - 'id': 'ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl', + "id": "ga4gh:SL.-3T7UXNk6nIkMKB9YGEb0RTYxbVY2TUy", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.IIB53T8CNeJJdUqzn9V_JnRtQadwWCbl", }, - 'start': 4402639, - 'end': 4450830, + "start": 4402639, + "end": 4450830, } ], - 'aliases': ['CAF1P150', 'P150', 'CAF1', 'CAF1B', 'CAF-1'], - 'previous_symbols': ['LOC107985297'], - 'xrefs': ['ensembl:ENSG00000167670', 'hgnc:1910'], - 'associated_with': ['omim:601246'], - 'gene_type': 'protein-coding', + "aliases": ["CAF1P150", "P150", "CAF1", "CAF1B", "CAF-1"], + "previous_symbols": ["LOC107985297"], + "xrefs": ["ensembl:ENSG00000167670", "hgnc:1910"], + "associated_with": ["omim:601246"], + "gene_type": "protein-coding", } ] }, @@ -830,22 +830,22 @@ def normalize_unmerged_chaf1a(): } -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def normalize_unmerged_ache(): """Provide ACHE fixture for unmerged normalize endpoint.""" return { - 'normalized_concept_id': 'hgnc:108', - 'source_matches': { - 'NCBI': { - 'records': [ + "normalized_concept_id": "hgnc:108", + "source_matches": { + "NCBI": { + "records": [ { - 'concept_id': 'ncbigene:43', - 'symbol': 'ACHE', - 'symbol_status': None, - 'label': 'acetylcholinesterase (Cartwright blood group)', - 'strand': '-', - 'location_annotations': [], - 'locations': [ + "concept_id": "ncbigene:43", + "symbol": "ACHE", + "symbol_status": None, + "label": "acetylcholinesterase (Cartwright blood group)", + "strand": "-", + "location_annotations": [], + "locations": [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5", @@ -855,63 +855,63 @@ def normalize_unmerged_ache(): # "end": "q22.1" # }, { - 'id': 'ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', + "id": "ga4gh:SL.U7vPSlX8eyCKdFSiROIsc9om0Y7pCm2g", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, - 'start': 100889993, - 'end': 100896994, + "start": 100889993, + "end": 100896994, } ], - 'aliases': ['YT', 'ARACHE', 'ACEE', 'N-ACHE'], - 'previous_symbols': ['ACEE'], - 'xrefs': ['hgnc:108', 'ensembl:ENSG00000087085'], - 'associated_with': ['omim:100740'], - 'gene_type': 'protein-coding', + "aliases": ["YT", "ARACHE", "ACEE", "N-ACHE"], + "previous_symbols": ["ACEE"], + "xrefs": ["hgnc:108", "ensembl:ENSG00000087085"], + "associated_with": ["omim:100740"], + "gene_type": "protein-coding", } ], }, - 'Ensembl': { - 'records': [ + "Ensembl": { + "records": [ { - 'concept_id': 'ensembl:ENSG00000087085', - 'symbol': 'ACHE', - 'symbol_status': None, - 'label': 'acetylcholinesterase (Cartwright blood group)', - 'strand': '-', - 'location_annotations': [], - 'locations': [ + "concept_id": "ensembl:ENSG00000087085", + "symbol": "ACHE", + "symbol_status": None, + "label": "acetylcholinesterase (Cartwright blood group)", + "strand": "-", + "location_annotations": [], + "locations": [ { - 'id': 'ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1', - 'type': 'SequenceLocation', - 'sequenceReference': { - 'type': 'SequenceReference', - 'refgetAccession': 'SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul', + "id": "ga4gh:SL.dnydHb2Bnv5pwXjI4MpJmrZUADf5QLe1", + "type": "SequenceLocation", + "sequenceReference": { + "type": "SequenceReference", + "refgetAccession": "SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul", }, - 'start': 100889993, - 'end': 100896974, + "start": 100889993, + "end": 100896974, } ], - 'aliases': [], - 'previous_symbols': [], - 'xrefs': ['hgnc:108'], - 'associated_with': [], - 'gene_type': 'protein_coding', + "aliases": [], + "previous_symbols": [], + "xrefs": ["hgnc:108"], + "associated_with": [], + "gene_type": "protein_coding", } ] }, - 'HGNC': { - 'records': [ + "HGNC": { + "records": [ { - 'concept_id': 'hgnc:108', - 'symbol': 'ACHE', - 'symbol_status': 'approved', - 'label': 'acetylcholinesterase (Cartwright blood group)', - 'strand': None, - 'location_annotations': [], - 'locations': [ + "concept_id": "hgnc:108", + "symbol": "ACHE", + "symbol_status": "approved", + "label": "acetylcholinesterase (Cartwright blood group)", + "strand": None, + "location_annotations": [], + "locations": [ # { # "type": "ChromosomeLocation", # "id": "ga4gh:CL.JSw-08GkF-7M-OQR-33MLLKQHSi7QJb5", @@ -921,23 +921,23 @@ def normalize_unmerged_ache(): # "end": "q22.1" # } ], - 'aliases': ['3.1.1.7'], - 'previous_symbols': ['YT'], - 'xrefs': ['ncbigene:43', 'ensembl:ENSG00000087085'], - 'associated_with': [ - 'ucsc:uc003uxi.4', - 'vega:OTTHUMG00000157033', - 'merops:S09.979', - 'ccds:CCDS5710', - 'omim:100740', - 'iuphar:2465', - 'ccds:CCDS5709', - 'refseq:NM_015831', - 'pubmed:1380483', - 'uniprot:P22303', - 'ccds:CCDS64736', + "aliases": ["3.1.1.7"], + "previous_symbols": ["YT"], + "xrefs": ["ncbigene:43", "ensembl:ENSG00000087085"], + "associated_with": [ + "ucsc:uc003uxi.4", + "vega:OTTHUMG00000157033", + "merops:S09.979", + "ccds:CCDS5710", + "omim:100740", + "iuphar:2465", + "ccds:CCDS5709", + "refseq:NM_015831", + "pubmed:1380483", + "uniprot:P22303", + "ccds:CCDS64736", ], - 'gene_type': 'gene with protein product', + "gene_type": "gene with protein product", } ] }, @@ -945,55 +945,55 @@ def normalize_unmerged_ache(): } -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def normalized_ifnr(): """Return normalized core Gene object for IFNR.""" params = { - 'type': 'Gene', - 'id': 'normalize.gene.hgnc:5447', - 'label': 'IFNR', - 'mappings': [ + "type": "Gene", + "id": "normalize.gene.hgnc:5447", + "label": "IFNR", + "mappings": [ { - 'coding': {'code': '3466', 'system': 'ncbigene'}, - 'relation': 'relatedMatch', + "coding": {"code": "3466", "system": "ncbigene"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '1906174', 'system': 'pubmed'}, - 'relation': 'relatedMatch', + "coding": {"code": "1906174", "system": "pubmed"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '147573', 'system': 'omim'}, - 'relation': 'relatedMatch', + "coding": {"code": "147573", "system": "omim"}, + "relation": "relatedMatch", }, { - 'coding': {'code': '1193239', 'system': 'pubmed'}, - 'relation': 'relatedMatch', + "coding": {"code": "1193239", "system": "pubmed"}, + "relation": "relatedMatch", }, ], - 'aliases': ['IFNGM', 'IFNGM2'], - 'extensions': [ - { - 'name': 'approved_name', - 'value': 'interferon production regulator', - 'type': 'Extension', - }, - {'name': 'symbol_status', 'value': 'approved', 'type': 'Extension'}, - {'name': 'symbol_status', 'value': 'approved', 'type': 'Extension'}, - {'name': 'ncbi_gene_type', 'type': 'Extension', 'value': 'unknown'}, - {'name': 'hgnc_locus_type', 'type': 'Extension', 'value': 'unknown'}, - {'name': 'location_annotations', 'type': 'Extension', 'value': ['16']}, + "aliases": ["IFNGM", "IFNGM2"], + "extensions": [ + { + "name": "approved_name", + "value": "interferon production regulator", + "type": "Extension", + }, + {"name": "symbol_status", "value": "approved", "type": "Extension"}, + {"name": "symbol_status", "value": "approved", "type": "Extension"}, + {"name": "ncbi_gene_type", "type": "Extension", "value": "unknown"}, + {"name": "hgnc_locus_type", "type": "Extension", "value": "unknown"}, + {"name": "location_annotations", "type": "Extension", "value": ["16"]}, ], } return core_models.Gene(**params) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def num_sources(): """Get the number of sources.""" return len({s for s in SourceName}) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def source_meta(): """Create test fixture for source meta""" return [SourceName.HGNC, SourceName.ENSEMBL, SourceName.NCBI] @@ -1002,18 +1002,18 @@ def source_meta(): def compare_warnings(actual_warnings, expected_warnings): """Compare response warnings against expected results.""" if expected_warnings: - assert len(actual_warnings) == len(expected_warnings), 'warnings len' + assert len(actual_warnings) == len(expected_warnings), "warnings len" for e_warnings in expected_warnings: for r_warnings in actual_warnings: for e_key, e_val in e_warnings.items(): for r_val in r_warnings.values(): if e_key == r_val: if isinstance(e_val, list): - assert set(r_val) == set(e_val), 'warnings val' + assert set(r_val) == set(e_val), "warnings val" else: - assert r_val == e_val, 'warnings val' + assert r_val == e_val, "warnings val" else: - assert actual_warnings == [], 'warnings != []' + assert actual_warnings == [], "warnings != []" def compare_normalize_resp( @@ -1028,7 +1028,7 @@ def compare_normalize_resp( assert resp.query == expected_query compare_warnings(resp.warnings, expected_warnings) assert resp.match_type == expected_match_type - assert resp.normalized_id == expected_gene.id.split('normalize.gene.')[-1] + assert resp.normalized_id == expected_gene.id.split("normalize.gene.")[-1] compare_gene(expected_gene, resp.gene) if not expected_source_meta: assert resp.source_meta_ == {} @@ -1036,7 +1036,7 @@ def compare_normalize_resp( resp_source_meta_keys = resp.source_meta_.keys() assert len(resp_source_meta_keys) == len( expected_source_meta - ), 'source_meta_keys' + ), "source_meta_keys" for src in expected_source_meta: assert src in resp_source_meta_keys compare_service_meta(resp.service_meta_) @@ -1065,7 +1065,7 @@ def compare_unmerged_response(actual, query, warnings, match_type, fixture): assert actual.query == query compare_warnings(actual.warnings, warnings) assert actual.match_type == match_type - assert actual.normalized_concept_id == fixture['normalized_concept_id'] + assert actual.normalized_concept_id == fixture["normalized_concept_id"] for source, match in actual.source_matches.items(): assert match.source_meta_ # check that it's there @@ -1073,20 +1073,20 @@ def compare_unmerged_response(actual, query, warnings, match_type, fixture): concept_id = record.concept_id fixture_gene = None # get corresponding fixture record - for gene in fixture['source_matches'][source.value]['records']: - if gene['concept_id'] == concept_id: + for gene in fixture["source_matches"][source.value]["records"]: + if gene["concept_id"] == concept_id: fixture_gene = BaseGene(**gene) break - assert fixture_gene, f'Unable to find fixture for {concept_id}' + assert fixture_gene, f"Unable to find fixture for {concept_id}" compare_unmerged_record(record, fixture_gene) def compare_service_meta(service_meta): """Check that service metadata is correct.""" - assert service_meta.name == 'gene-normalizer' - assert service_meta.version >= '0.1.0' + assert service_meta.name == "gene-normalizer" + assert service_meta.version >= "0.1.0" assert isinstance(service_meta.response_datetime, str) - assert service_meta.url == 'https://github.com/cancervariants/gene-normalization' + assert service_meta.url == "https://github.com/cancervariants/gene-normalization" def compare_gene(test, actual): @@ -1109,15 +1109,15 @@ def compare_gene(test, actual): assert no_matches == [], no_matches assert len(actual.mappings) == len(test.mappings) - assert set(actual.aliases) == set(test.aliases), 'aliases' - extensions_present = 'extensions' in test.model_fields.keys() - assert ('extensions' in actual.model_fields.keys()) == extensions_present + assert set(actual.aliases) == set(test.aliases), "aliases" + extensions_present = "extensions" in test.model_fields.keys() + assert ("extensions" in actual.model_fields.keys()) == extensions_present if extensions_present: actual_ext_names = sorted([ext.name for ext in actual.extensions]) unique_actual_ext_names = sorted(set(actual_ext_names)) - assert actual_ext_names == unique_actual_ext_names, 'duplicate extension names' + assert actual_ext_names == unique_actual_ext_names, "duplicate extension names" test_ext_names = {ext.name for ext in test.extensions} - assert set(actual_ext_names) == test_ext_names, 'extension names dont match' + assert set(actual_ext_names) == test_ext_names, "extension names dont match" n_ext_correct = 0 for test_ext in test.extensions: for actual_ext in actual.extensions: @@ -1130,20 +1130,20 @@ def compare_gene(test, actual): else: assert set(actual_ext.value) == set( test_ext.value - ), f'{test_ext.value} value' + ), f"{test_ext.value} value" else: assert actual_ext.value == test_ext.value else: assert actual_ext.value == test_ext.value assert actual_ext.type == test_ext.type n_ext_correct += 1 - assert n_ext_correct == len(test.extensions), 'number of correct extensions' + assert n_ext_correct == len(test.extensions), "number of correct extensions" def test_search_query(query_handler, num_sources): """Test that query returns properly-structured response.""" - resp = query_handler.search(' BRAF ') - assert resp.query == 'BRAF' + resp = query_handler.search(" BRAF ") + assert resp.query == "BRAF" matches = resp.source_matches assert isinstance(matches, dict) assert len(matches) == num_sources @@ -1151,20 +1151,20 @@ def test_search_query(query_handler, num_sources): def test_search_query_inc_exc(query_handler, num_sources): """Test that query incl and excl work correctly.""" - sources = 'hgnc, ensembl, ncbi' - resp = query_handler.search('BRAF', excl=sources) + sources = "hgnc, ensembl, ncbi" + resp = query_handler.search("BRAF", excl=sources) matches = resp.source_matches assert len(matches) == num_sources - len(sources.split()) - sources = 'Hgnc, NCBi' - resp = query_handler.search('BRAF', incl=sources) + sources = "Hgnc, NCBi" + resp = query_handler.search("BRAF", incl=sources) matches = resp.source_matches assert len(matches) == len(sources.split()) assert SourceName.HGNC in matches assert SourceName.NCBI in matches - sources = 'HGnC' - resp = query_handler.search('BRAF', excl=sources) + sources = "HGnC" + resp = query_handler.search("BRAF", excl=sources) matches = resp.source_matches assert len(matches) == num_sources - len(sources.split()) assert SourceName.ENSEMBL in matches @@ -1174,30 +1174,30 @@ def test_search_query_inc_exc(query_handler, num_sources): def test_search_invalid_parameter_exception(query_handler): """Test that Invalid parameter exception works correctly.""" with pytest.raises(InvalidParameterException): - _ = query_handler.search('BRAF', incl='hgn') # noqa: F841 + _ = query_handler.search("BRAF", incl="hgn") # noqa: F841 with pytest.raises(InvalidParameterException): - resp = query_handler.search('BRAF', incl='hgnc', excl='hgnc') # noqa: F841 + resp = query_handler.search("BRAF", incl="hgnc", excl="hgnc") # noqa: F841 def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): """Test that ACHE concept_id shows xref matches.""" # Search - resp = query_handler.search('ncbigene:43') + resp = query_handler.search("ncbigene:43") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL].records) == 0 assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID - resp = query_handler.search('hgnc:108') + resp = query_handler.search("hgnc:108") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF - resp = query_handler.search('ensembl:ENSG00000087085') + resp = query_handler.search("ensembl:ENSG00000087085") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF @@ -1205,49 +1205,49 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF # Normalize - q = 'ACHE' + q = "ACHE" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_ache, expected_source_meta=source_meta ) - q = 'ache' + q = "ache" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_ache, expected_source_meta=source_meta ) - q = 'hgnc:108' + q = "hgnc:108" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta ) - q = 'ensembl:ENSG00000087085' + q = "ensembl:ENSG00000087085" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta ) - q = 'ncbigene:43' + q = "ncbigene:43" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_ache, expected_source_meta=source_meta ) - q = '3.1.1.7' + q = "3.1.1.7" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.ALIAS, normalized_ache, expected_source_meta=source_meta ) - q = 'ARACHE' + q = "ARACHE" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.ALIAS, normalized_ache, expected_source_meta=source_meta ) - q = 'YT' + q = "YT" resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1257,7 +1257,7 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): expected_source_meta=source_meta, ) - q = 'ACEE' + q = "ACEE" resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1267,7 +1267,7 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): expected_source_meta=source_meta, ) - q = 'omim:100740' + q = "omim:100740" resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1281,21 +1281,21 @@ def test_ache_query(query_handler, num_sources, normalized_ache, source_meta): def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): """Test that BRAF concept_id shows xref matches.""" # Search - resp = query_handler.search('ncbigene:673') + resp = query_handler.search("ncbigene:673") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL].records) == 0 assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID - resp = query_handler.search('hgnc:1097') + resp = query_handler.search("hgnc:1097") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF - resp = query_handler.search('ensembl:ENSG00000157764') + resp = query_handler.search("ensembl:ENSG00000157764") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF @@ -1303,49 +1303,49 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF # Normalize - q = 'BRAF' + q = "BRAF" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_braf, expected_source_meta=source_meta ) - q = 'braf' + q = "braf" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_braf, expected_source_meta=source_meta ) - q = 'hgnc:1097' + q = "hgnc:1097" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta ) - q = 'ensembl:ENSG00000157764' + q = "ensembl:ENSG00000157764" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta ) - q = 'ncbigene:673' + q = "ncbigene:673" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_braf, expected_source_meta=source_meta ) - q = 'NS7' + q = "NS7" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.ALIAS, normalized_braf, expected_source_meta=source_meta ) - q = 'b-raf' + q = "b-raf" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.ALIAS, normalized_braf, expected_source_meta=source_meta ) - q = 'omim:164757' + q = "omim:164757" resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1359,21 +1359,21 @@ def test_braf_query(query_handler, num_sources, normalized_braf, source_meta): def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): """Test that ABL1 concept_id shows xref matches.""" # Search - resp = query_handler.search('ncbigene:25') + resp = query_handler.search("ncbigene:25") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF assert len(matches[SourceName.ENSEMBL].records) == 0 assert matches[SourceName.NCBI].records[0].match_type == MatchType.CONCEPT_ID - resp = query_handler.search('hgnc:76') + resp = query_handler.search("hgnc:76") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.CONCEPT_ID assert matches[SourceName.ENSEMBL].records[0].match_type == MatchType.XREF assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF - resp = query_handler.search('ensembl:ENSG00000097007') + resp = query_handler.search("ensembl:ENSG00000097007") matches = resp.source_matches assert len(matches) == num_sources assert matches[SourceName.HGNC].records[0].match_type == MatchType.XREF @@ -1381,43 +1381,43 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): assert matches[SourceName.NCBI].records[0].match_type == MatchType.XREF # Normalize - q = 'ABL1' + q = "ABL1" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_abl1, expected_source_meta=source_meta ) - q = 'abl1' + q = "abl1" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.SYMBOL, normalized_abl1, expected_source_meta=source_meta ) - q = 'hgnc:76' + q = "hgnc:76" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta ) - q = 'ensembl:ENSG00000097007' + q = "ensembl:ENSG00000097007" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta ) - q = 'ncbigene:25' + q = "ncbigene:25" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.CONCEPT_ID, normalized_abl1, expected_source_meta=source_meta ) - q = 'v-abl' + q = "v-abl" resp = query_handler.normalize(q) compare_normalize_resp( resp, q, MatchType.ALIAS, normalized_abl1, expected_source_meta=source_meta ) - q = 'LOC116063' + q = "LOC116063" resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1427,7 +1427,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): expected_source_meta=source_meta, ) - q = 'LOC112779' + q = "LOC112779" resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1437,7 +1437,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): expected_source_meta=source_meta, ) - q = 'ABL' + q = "ABL" resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1447,7 +1447,7 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): expected_source_meta=source_meta, ) - q = 'refseq:NM_007313' + q = "refseq:NM_007313" resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1460,16 +1460,16 @@ def test_abl1_query(query_handler, num_sources, normalized_abl1, source_meta): def test_multiple_norm_concepts(query_handler, normalized_p150, source_meta): """Tests where more than one normalized concept is found.""" - q = 'P150' + q = "P150" resp = query_handler.normalize(q) expected_warnings = [ { - 'multiple_normalized_concepts_found': [ - 'hgnc:16850', - 'hgnc:76', - 'hgnc:17168', - 'hgnc:500', - 'hgnc:8982', + "multiple_normalized_concepts_found": [ + "hgnc:16850", + "hgnc:76", + "hgnc:17168", + "hgnc:500", + "hgnc:8982", ] } ] @@ -1487,7 +1487,7 @@ def test_normalize_single_entry(query_handler, normalized_loc_653303): """Test that the normalized endpoint correctly shapes unmerged identity records into core gene objects. """ - q = 'LOC653303' + q = "LOC653303" resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1502,7 +1502,7 @@ def test_normalize_no_locations(query_handler, normalized_ifnr): """Test that the normalized endpoint correcly shapes merged entity with no locations """ - q = 'IFNR' + q = "IFNR" resp = query_handler.normalize(q) compare_normalize_resp( resp, @@ -1521,55 +1521,55 @@ def test_normalize_unmerged( ): """Test that unmerged normalization produces correct results.""" # concept ID - q = 'ncbigene:653303' + q = "ncbigene:653303" resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.CONCEPT_ID, normalize_unmerged_loc_653303 ) - q = 'hgnc:1910' + q = "hgnc:1910" resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.CONCEPT_ID, normalize_unmerged_chaf1a ) - q = 'HGNC:108' + q = "HGNC:108" resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.CONCEPT_ID, normalize_unmerged_ache ) # symbol - q = 'LOC653303' + q = "LOC653303" resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.SYMBOL, normalize_unmerged_loc_653303 ) # prev symbol - q = 'ACEE' + q = "ACEE" resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.PREV_SYMBOL, normalize_unmerged_ache ) - q = 'LOC196266' + q = "LOC196266" resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.PREV_SYMBOL, normalize_unmerged_loc_653303 ) # alias - q = 'P150' + q = "P150" resp = query_handler.normalize_unmerged(q) expected_warnings = [ { - 'multiple_normalized_concepts_found': [ - 'hgnc:500', - 'hgnc:8982', - 'hgnc:17168', - 'hgnc:16850', - 'hgnc:76', + "multiple_normalized_concepts_found": [ + "hgnc:500", + "hgnc:8982", + "hgnc:17168", + "hgnc:16850", + "hgnc:76", ] } ] @@ -1577,22 +1577,22 @@ def test_normalize_unmerged( resp, q, expected_warnings, MatchType.ALIAS, normalize_unmerged_chaf1a ) - q = 'ARACHE' + q = "ARACHE" resp = query_handler.normalize_unmerged(q) compare_unmerged_response(resp, q, [], MatchType.ALIAS, normalize_unmerged_ache) - q = 'MGC71229' + q = "MGC71229" resp = query_handler.normalize_unmerged(q) compare_unmerged_response(resp, q, [], MatchType.ALIAS, normalize_unmerged_chaf1a) # assoc with - q = 'omim:100740' + q = "omim:100740" resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.ASSOCIATED_WITH, normalize_unmerged_ache ) - q = 'uniprot:Q13111' + q = "uniprot:Q13111" resp = query_handler.normalize_unmerged(q) compare_unmerged_response( resp, q, [], MatchType.ASSOCIATED_WITH, normalize_unmerged_chaf1a @@ -1601,18 +1601,18 @@ def test_normalize_unmerged( def test_invalid_queries(query_handler): """Test invalid queries""" - resp = query_handler.normalize('B R A F') + resp = query_handler.normalize("B R A F") assert resp.match_type is MatchType.NO_MATCH with pytest.raises(TypeError): - resp['match_type'] + resp["match_type"] - resp = query_handler.search('B R A F') + resp = query_handler.search("B R A F") records = [r for matches in resp.source_matches.values() for r in matches.records] assert len(records) == 0 def test_service_meta(query_handler): """Test service meta info in response.""" - resp = query_handler.search('pheno') + resp = query_handler.search("pheno") compare_service_meta(resp.service_meta_) diff --git a/tests/unit/test_schemas.py b/tests/unit/test_schemas.py index fbf6f339..3d5fceed 100644 --- a/tests/unit/test_schemas.py +++ b/tests/unit/test_schemas.py @@ -16,22 +16,22 @@ # ) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def sequence_location(): """Create a valid sequence location test fixture.""" return models.SequenceLocation( sequence=models.SequenceReference( - refgetAccession='SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul' + refgetAccession="SQ.F-LrLMe1SRpfUZHkQmvkVKFEGaoDeHul" ), start=140719327, end=140924929, ) -@pytest.fixture(scope='module') +@pytest.fixture(scope="module") def gene(): """Create a valid gene test fixture.""" - return Gene(match_type=100, concept_id='hgnc:1097', symbol='BRAF') + return Gene(match_type=100, concept_id="hgnc:1097", symbol="BRAF") def test_gene(gene, sequence_location): @@ -39,77 +39,77 @@ def test_gene(gene, sequence_location): assert gene assert Gene( match_type=100, - concept_id='ensembl:1', - symbol='GENE', + concept_id="ensembl:1", + symbol="GENE", # locations=[chromosome_location, sequence_location] locations=[sequence_location], ) assert Gene( match_type=100, - concept_id='ensembl:1', - symbol='GENE', + concept_id="ensembl:1", + symbol="GENE", locations=[sequence_location], ) assert Gene( match_type=100, - concept_id='ensembl:1', - symbol='GENE', + concept_id="ensembl:1", + symbol="GENE", locations=[sequence_location], ) # id not a valid curie with pytest.raises(pydantic.ValidationError): - Gene(match_type=100, concept_id='hgnc1096', symbol='BRAF') + Gene(match_type=100, concept_id="hgnc1096", symbol="BRAF") # symbol not a str with pytest.raises(pydantic.ValidationError): - Gene(match_type=100, concept_id='hgnc:1096', symbol=1) + Gene(match_type=100, concept_id="hgnc:1096", symbol=1) # strand not -/+ with pytest.raises(pydantic.ValidationError): - Gene(match_type=100, concept_id='hgnc:1096', symbol='BRAF', strand='positive') + Gene(match_type=100, concept_id="hgnc:1096", symbol="BRAF", strand="positive") # xrefs not a valid curie with pytest.raises(pydantic.ValidationError): Gene( match_type=100, - concept_id='hgnc:1096', - symbol='BRAF', - xrefs=['hgnc', 'hgnc:1'], + concept_id="hgnc:1096", + symbol="BRAF", + xrefs=["hgnc", "hgnc:1"], ) # associated_with not a valid curie with pytest.raises(pydantic.ValidationError): Gene( match_type=100, - concept_id='hgnc:1096', - symbol='BRAF', - associated_with=['hgnc', 'hgnc:1'], + concept_id="hgnc:1096", + symbol="BRAF", + associated_with=["hgnc", "hgnc:1"], ) # symbol status invalid with pytest.raises(pydantic.ValidationError): Gene( match_type=100, - concept_id='hgnc:1096', - symbol='BRAF', - symbol_status='nothing', + concept_id="hgnc:1096", + symbol="BRAF", + symbol_status="nothing", ) # locations not a sequence or chromosome location with pytest.raises(pydantic.ValidationError): Gene( match_type=100, - concept_id='hgnc:1096', - symbol='BRAF', - locations=['GRCh38:chr1'], + concept_id="hgnc:1096", + symbol="BRAF", + locations=["GRCh38:chr1"], ) # location not a list with pytest.raises(pydantic.ValidationError): Gene( match_type=100, - concept_id='hgnc:1096', - symbol='BRAF', + concept_id="hgnc:1096", + symbol="BRAF", locations=sequence_location, )