From 8b4d5e3c27ec5c45cd2696f435f3c746107cf635 Mon Sep 17 00:00:00 2001
From: Jorge Alvarez Jarreta
Date: Wed, 6 Nov 2024 10:59:40 +0000
Subject: [PATCH] add automatic fixes provided by ruff

---
 .../genomio/annotation/update_description.py  |  26 +--
 .../ensembl/io/genomio/assembly/download.py   |  39 +++--
 .../ensembl/io/genomio/assembly/status.py     |  29 ++--
 .../io/genomio/database/core_server.py        |  14 +-
 .../io/genomio/database/dbconnection_lite.py  |  12 +-
 .../ensembl/io/genomio/database/factory.py    |   5 +-
 src/python/ensembl/io/genomio/events/dump.py  |  66 ++++----
 .../ensembl/io/genomio/events/format.py       |  11 +-
 src/python/ensembl/io/genomio/events/load.py  |  31 ++--
 .../ensembl/io/genomio/external_db/db_map.py  |   1 +
 src/python/ensembl/io/genomio/fasta/chunk.py  |  37 +++--
 .../ensembl/io/genomio/fasta/process.py       |  25 ++-
 .../ensembl/io/genomio/genbank/download.py    |   2 +-
 .../io/genomio/genbank/extract_data.py        | 149 +++++++++---------
 .../io/genomio/genome_metadata/dump.py        |  24 +--
 .../io/genomio/genome_metadata/extend.py      |  31 ++--
 .../io/genomio/genome_metadata/prepare.py     |  20 ++-
 .../io/genomio/genome_stats/compare.py        |  18 +--
 .../ensembl/io/genomio/genome_stats/dump.py   |  19 +--
 .../ensembl/io/genomio/gff3/exceptions.py     |   4 +
 .../io/genomio/gff3/extract_annotation.py     |  31 ++--
 .../ensembl/io/genomio/gff3/features.py       |   1 +
 .../ensembl/io/genomio/gff3/gene_merger.py    |  12 +-
 .../ensembl/io/genomio/gff3/id_allocator.py   |  21 ++-
 .../ensembl/io/genomio/gff3/overlaps.py       |  22 +--
 src/python/ensembl/io/genomio/gff3/process.py |   4 +-
 .../ensembl/io/genomio/gff3/restructure.py    |  14 +-
 .../ensembl/io/genomio/gff3/simplifier.py     |  40 +++--
 .../io/genomio/manifest/check_integrity.py    |  33 ++--
 .../io/genomio/manifest/compute_stats.py      |  92 ++++++-----
 .../ensembl/io/genomio/manifest/generate.py   |   4 +-
 .../ensembl/io/genomio/manifest/manifest.py   |   8 +-
 .../io/genomio/manifest/manifest_stats.py     |  21 +--
 .../io/genomio/schemas/json/factory.py        |  11 +-
 .../io/genomio/schemas/json/validate.py       |   6 +-
 .../io/genomio/seq_region/collection.py       |   3 +-
 .../ensembl/io/genomio/seq_region/dump.py     |  10 +-
 .../io/genomio/seq_region/exceptions.py       |   1 +
 .../ensembl/io/genomio/seq_region/gbff.py     |   1 +
 .../ensembl/io/genomio/seq_region/mappings.py |   4 +-
 .../ensembl/io/genomio/seq_region/prepare.py  |   6 +-
 .../ensembl/io/genomio/seq_region/report.py   |   4 +-
 .../annotation/test_update_description.py     |   1 +
 src/python/tests/assembly/test_download.py    |  24 +--
 src/python/tests/assembly/test_status.py      |  26 +--
 src/python/tests/conftest.py                  |   2 +
 src/python/tests/database/test_core_server.py |  26 +--
 .../tests/database/test_dbconnection_lite.py  |   9 +-
 src/python/tests/database/test_factory.py     |  10 +-
 .../tests/external_db/test_external_db_map.py |   2 +-
 src/python/tests/fasta/test_chunk.py          |  32 ++--
 src/python/tests/fasta/test_process.py        |   8 +-
 src/python/tests/genbank/test_download.py     |   5 +-
 src/python/tests/genbank/test_extract_data.py |  15 +-
 .../tests/genbank/test_extract_data_seq.py    |  14 +-
 src/python/tests/genome_metadata/test_dump.py |  16 +-
 .../tests/genome_metadata/test_extend.py      |  23 +--
 .../tests/genome_metadata/test_prepare.py     |  19 ++-
 src/python/tests/genome_stats/test_compare.py |  12 +-
 src/python/tests/genome_stats/test_dump.py    |  21 +--
 .../tests/gff3/test_extract_annotation.py     |  29 ++--
 src/python/tests/gff3/test_id_allocator.py    |  24 +--
 src/python/tests/gff3/test_records.py         |   6 +-
 src/python/tests/gff3/test_restructure.py     |  38 ++---
 src/python/tests/gff3/test_simplifier.py      |  16 +-
 .../tests/manifest/test_check_integrity.py    |   2 +-
 src/python/tests/manifest/test_manifest.py    |  16 +-
 .../tests/manifest/test_manifest_stats.py     |  16 +-
 src/python/tests/schemas/test_json.py         |   4 +-
 .../tests/seq_region/test_collection.py       |  14 +-
 src/python/tests/seq_region/test_dump.py      |   8 +-
 src/python/tests/seq_region/test_gbff.py      |   2 +-
 src/python/tests/seq_region/test_prepare.py   |   1 +
 73 files changed, 739 insertions(+), 614 deletions(-)

diff --git a/src/python/ensembl/io/genomio/annotation/update_description.py b/src/python/ensembl/io/genomio/annotation/update_description.py
index 61b5fce8f..c850de027 100644
--- a/src/python/ensembl/io/genomio/annotation/update_description.py
+++ b/src/python/ensembl/io/genomio/annotation/update_description.py
@@ -21,7 +21,7 @@

 import logging
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any

 from sqlalchemy.orm import Session
 from sqlalchemy import and_, select
@@ -39,18 +39,18 @@
     "transcript": "transcript",
 }

-FeatStruct = Tuple[str, str, str]
+FeatStruct = tuple[str, str, str]


-def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> Dict[str, FeatStruct]:
+def get_core_data(session: Session, table: str, match_xrefs: bool = False) -> dict[str, FeatStruct]:
     """Returns the table descriptions from a core database.

     Args:
         session: Session open on a core database.
         table: "gene" or "transcript" table from the core database.
         match_xrefs: If the IDs do not match, try to match an Xref ID instead.
-    """
+
+    """
     if table == "gene":
         stmt = (
             select(Gene.gene_id, Gene.stable_id, Gene.description, Xref.dbprimary_acc)
@@ -103,6 +103,7 @@ def load_descriptions(
         report: Print the mapping of changes to perform in the standard output.
         do_update: Actually update the core database.
         match_xrefs: If the IDs do not match, try to match an Xref ID instead.
+
     """
     func = get_json(func_file)
     logging.info(f"{len(func)} annotations from {func_file}")
@@ -125,7 +126,7 @@ def load_descriptions(
         }
         # Compare, only keep the descriptions that have changed
         features_to_update = _get_features_to_update(
-            table, feat_func, feat_data, stats, report=report, do_update=do_update, match_xrefs=match_xrefs
+            table, feat_func, feat_data, stats, report=report, do_update=do_update, match_xrefs=match_xrefs,
         )

         # Show stats for this feature type
@@ -141,8 +142,8 @@ def load_descriptions(


 def _get_cur_feat(
-    feat_data: Dict[str, FeatStruct], new_feat: Dict[str, Any], match_xrefs: bool = False
-) -> Optional[FeatStruct]:
+    feat_data: dict[str, FeatStruct], new_feat: dict[str, Any], match_xrefs: bool = False,
+) -> FeatStruct | None:
     """Match a feature ID, synonyms or xrefs to a core stable ID and return the matching core feature.

     Returns None if no match.
@@ -169,14 +170,14 @@

 def _get_features_to_update(
     table: str,
-    feat_func: List[Dict[str, Any]],
-    feat_data: Dict[str, FeatStruct],
-    stats: Dict[str, int],
+    feat_func: list[dict[str, Any]],
+    feat_data: dict[str, FeatStruct],
+    stats: dict[str, int],
     *,
     report: bool = False,
     do_update: bool = False,
     match_xrefs: bool = True,
-) -> List[Dict[str, Any]]:
+) -> list[dict[str, Any]]:
     """Checks a list of features and returns those whose description we want to update.

     Args:
@@ -190,6 +191,7 @@ def _get_features_to_update(

     Returns:
         The list of features with their operation changed to update or insert.
- """ + """ core_accn_meta = {} database_count = 0 count_accn_found = 0 @@ -179,7 +182,7 @@ def fetch_accessions_from_core_dbs(src_file: StrPath, server_url: URL) -> dict[s db_connection = DBConnection(db_connection_url) with db_connection.begin() as conn: query_result = conn.execute( - text('SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";') + text('SELECT meta_value FROM meta WHERE meta_key = "assembly.accession";'), ).fetchall() if not query_result: @@ -193,14 +196,14 @@ def fetch_accessions_from_core_dbs(src_file: StrPath, server_url: URL) -> dict[s logging.warning(f"Core {core_db} has {len(query_result)} assembly.accessions") logging.info( - f"From initial input core databases ({database_count}), obtained ({count_accn_found}) accessions" + f"From initial input core databases ({database_count}), obtained ({count_accn_found}) accessions", ) return core_accn_meta def fetch_datasets_reports( - sif_image: Client, assembly_accessions: dict[str, str], download_directory: StrPath, batch_size: int + sif_image: Client, assembly_accessions: dict[str, str], download_directory: StrPath, batch_size: int, ) -> dict[str, dict]: """Obtain assembly reports in JSON format for each assembly accession via `datasets` CLI. @@ -229,7 +232,7 @@ def fetch_datasets_reports( for accessions in accn_subsample: # Make call to singularity datasets providing a multi-accession query client_return = Client.execute( - image=sif_image, command=datasets_command + accessions, return_result=True, quiet=True + image=sif_image, command=datasets_command + accessions, return_result=True, quiet=True, ) raw_result = client_return["message"] @@ -272,6 +275,7 @@ def extract_assembly_metadata(assembly_reports: dict[str, dict]) -> dict[str, Re Returns: Parsed assembly report meta (source, meta). + """ parsed_meta = {} @@ -329,6 +333,7 @@ def generate_report_tsv( query_type: Type of query (either core databases or accessions). output_directory: Directory to store report TSV file. outfile_name: Name to give to the output TSV file. 
+ """ tsv_outfile = Path(output_directory, f"{outfile_name}.tsv") @@ -384,7 +389,7 @@ def main() -> None: subparsers = parser.add_subparsers(title="report assembly status from", required=True, dest="src") # Specific arguments required when using Ensembl core database names as source core_db_parser = subparsers.add_parser( - "core_db", parents=[base_parser], help="list of Ensembl core databases" + "core_db", parents=[base_parser], help="list of Ensembl core databases", ) core_db_parser.add_argument_src_path( "--input", @@ -394,10 +399,10 @@ def main() -> None: core_db_parser.add_server_arguments() # Specific arguments required when using assembly accessions as source accessions_parser = subparsers.add_parser( - "accession", parents=[base_parser], help="list of INSDC accessions" + "accession", parents=[base_parser], help="list of INSDC accessions", ) accessions_parser.add_argument_src_path( - "--input", required=True, help="file path with list of assembly INSDC query accessions" + "--input", required=True, help="file path with list of assembly INSDC query accessions", ) args = parser.parse_args() @@ -414,7 +419,7 @@ def main() -> None: # Datasets query implementation for one or more batched accessions assembly_reports = fetch_datasets_reports( - datasets_image, query_accessions, args.reports_dir, args.datasets_batch_size + datasets_image, query_accessions, args.reports_dir, args.datasets_batch_size, ) # Extract the key assembly report meta information for reporting status diff --git a/src/python/ensembl/io/genomio/database/core_server.py b/src/python/ensembl/io/genomio/database/core_server.py index c20c80868..18f85fb9f 100644 --- a/src/python/ensembl/io/genomio/database/core_server.py +++ b/src/python/ensembl/io/genomio/database/core_server.py @@ -17,7 +17,6 @@ __all__ = ["CoreServer"] import re -from typing import List, Optional import logging import sqlalchemy @@ -26,6 +25,7 @@ class CoreServer: + """Basic interface to a MySQL server with core databases. Allows to get a filtered list of databases. @@ -35,9 +35,8 @@ def __init__(self, server_url: URL) -> None: logging.debug(f"Connect to {server_url}") self.engine = sqlalchemy.create_engine(server_url) - def get_all_core_names(self) -> List[str]: + def get_all_core_names(self) -> list[str]: """Query the server and retrieve all database names that look like Ensembl cores.""" - with self.engine.connect() as connection: all_query = connection.execute(text(r"SHOW DATABASES LIKE '%%_core_%%'")) dbs = [row[0] for row in all_query.fetchall()] @@ -48,11 +47,11 @@ def get_cores( self, *, prefix: str = "", - build: Optional[int] = None, - version: Optional[int] = None, + build: int | None = None, + version: int | None = None, dbname_re: str = "", - db_list: Optional[List[str]] = None, - ) -> List[str]: + db_list: list[str] | None = None, + ) -> list[str]: """Returns a list of core databases, filtered if requested. Args: @@ -61,6 +60,7 @@ def get_cores( version: Filter by Ensembl version. dbname_re: Filter by dbname regular expression. db_list: Explicit list of database names. 
+ """ dbs = [] diff --git a/src/python/ensembl/io/genomio/database/dbconnection_lite.py b/src/python/ensembl/io/genomio/database/dbconnection_lite.py index 5ce46e6bd..a37f036cd 100644 --- a/src/python/ensembl/io/genomio/database/dbconnection_lite.py +++ b/src/python/ensembl/io/genomio/database/dbconnection_lite.py @@ -18,7 +18,7 @@ import logging import re -from typing import Any, Dict, List, Optional +from typing import Any from sqlalchemy import select from sqlalchemy.orm import Session @@ -30,13 +30,14 @@ class DBConnectionLite(DBConnection): + """Extension to get metadata directly from a database, assuming it has a metadata table.""" def __init__(self, url: StrURL, reflect: bool = False, **kwargs: Any) -> None: super().__init__(url, reflect, **kwargs) - self._metadata: Dict[str, List] = {} + self._metadata: dict[str, list] = {} - def get_metadata(self) -> Dict[str, List]: + def get_metadata(self) -> dict[str, list]: """Retrieves all metadata from the `meta` table in the database. Returns: @@ -48,7 +49,6 @@ def get_metadata(self) -> Dict[str, List]: def _load_metadata(self) -> None: """Caches the metadata values.""" - if self._metadata: return @@ -63,9 +63,8 @@ def _load_metadata(self) -> None: else: self._metadata[meta_key] = [meta_value] - def get_meta_value(self, meta_key: str) -> Optional[str]: + def get_meta_value(self, meta_key: str) -> str | None: """Returns the first meta_value for a given meta_key.""" - self._load_metadata() try: return self._metadata[meta_key][0] @@ -75,7 +74,6 @@ def get_meta_value(self, meta_key: str) -> Optional[str]: def get_project_release(self) -> str: """Returns the project release number from the database name. Returns empty string if not found.""" - match = re.search(_DB_PATTERN_RELEASE, self.db_name) if match: return match.group(1) diff --git a/src/python/ensembl/io/genomio/database/factory.py b/src/python/ensembl/io/genomio/database/factory.py index 9d47501f2..24841fe3b 100644 --- a/src/python/ensembl/io/genomio/database/factory.py +++ b/src/python/ensembl/io/genomio/database/factory.py @@ -41,6 +41,7 @@ def format_db_data(server_url: URL, dbs: list[str], brc_mode: bool = False) -> l Returns: List of dictionaries with 3 keys: "database", "species" and "division". + """ databases_data = [] for db_name in dbs: @@ -108,6 +109,7 @@ def get_core_dbs_metadata( Returns: List of dictionaries with 3 keys: "database", "species" and "division". 
+ """ db_list_file = None if db_list: @@ -117,7 +119,7 @@ def get_core_dbs_metadata( server = CoreServer(server_url) logging.debug("Fetching databases...") databases = server.get_cores( - prefix=prefix, build=build, version=version, dbname_re=db_regex, db_list=db_list_file + prefix=prefix, build=build, version=version, dbname_re=db_regex, db_list=db_list_file, ) logging.info(f"Got {len(databases)} databases") logging.debug("\n".join(databases)) @@ -129,6 +131,7 @@ def parse_args(arg_list: list[str] | None) -> argparse.Namespace: Args: arg_list: TODO + """ parser = ArgumentParser(description=__doc__) parser.add_server_arguments() diff --git a/src/python/ensembl/io/genomio/events/dump.py b/src/python/ensembl/io/genomio/events/dump.py index c502335c1..0b2ab3e2e 100644 --- a/src/python/ensembl/io/genomio/events/dump.py +++ b/src/python/ensembl/io/genomio/events/dump.py @@ -26,7 +26,6 @@ from datetime import datetime from pathlib import Path -from typing import List, Dict, Optional, Set, Tuple import logging from sqlalchemy import select, and_, or_ @@ -39,16 +38,16 @@ BRC4_START_DATE = datetime(2020, 5, 1) -IdsSet = Set[str] -DictToIdsSet = Dict[str, IdsSet] +IdsSet = set[str] +DictToIdsSet = dict[str, IdsSet] class Pair: + """Simple old_id - new_id pair representation""" - def __init__(self, old_id: Optional[str], new_id: Optional[str]) -> None: + def __init__(self, old_id: str | None, new_id: str | None) -> None: """Create a pair with an old_id and a new_id if provided""" - self.old_id = old_id if old_id is not None else "" if new_id is not None: self.new_id = new_id @@ -65,15 +64,16 @@ def has_new_id(self) -> bool: def is_empty(self) -> bool: """Test if the current pair has no id.""" - return not (self.has_old_id() or self.has_new_id()) class UnsupportedEvent(ValueError): + """If an event is not supported""" class Event: + """Represents a stable id event from one gene set version to another one. Various events: - new genes - deleted genes @@ -95,13 +95,12 @@ class Event: def __init__( self, - from_list: Optional[Set[str]] = None, - to_list: Optional[Set[str]] = None, - release: Optional[str] = None, - date: Optional[datetime] = None, + from_list: set[str] | None = None, + to_list: set[str] | None = None, + release: str | None = None, + date: datetime | None = None, ) -> None: """Create a stable id event from a set of old_ids to a set of new_ids""" - if from_list is None: from_list = set() if to_list is None: @@ -111,16 +110,15 @@ def __init__( self.release = release self.date = date self.name = "" - self.pairs: List[Pair] = [] + self.pairs: list[Pair] = [] def __str__(self) -> str: """String representation of the stable id event""" - from_str = ",".join(self.from_set) to_str = ",".join(self.to_set) return f"From {from_str} to {to_str} = {self.get_name()} in release {self.release}" - def brc_format_1(self) -> List[str]: + def brc_format_1(self) -> list[str]: """Returns a list events, one line per initial ID, in the following TSV format: - old gene id - event name @@ -159,7 +157,7 @@ def brc_format_1(self) -> List[str]: line_list.append("\t".join(line)) return line_list - def brc_format_2(self) -> List[str]: + def brc_format_2(self) -> list[str]: """Returns a list of combination of genes, one line per combination of old_id - new_ids, in the following TSV format: - old gene id @@ -189,7 +187,7 @@ def brc_format_2(self) -> List[str]: return line_list @staticmethod - def clean_set(this_list: Set) -> Set: + def clean_set(this_list: set) -> set: """Removes any empty elements from a list. 
Args: @@ -280,7 +278,7 @@ def get_name(self) -> str: self._name_event() return self.name - def add_pairs(self, pairs: List[Pair]) -> None: + def add_pairs(self, pairs: list[Pair]) -> None: """Provided all the pairs, keep those that are used by this event. Args: @@ -300,6 +298,7 @@ def add_pairs(self, pairs: List[Pair]) -> None: class DumpStableIDs: + """An processor that create events from pairs of ids and can print those events out. Attributes: @@ -311,14 +310,13 @@ def __init__(self, session: Session) -> None: """Create a processor for events""" self.session = session - def get_history(self) -> List: + def get_history(self) -> list: """Retrieve all events from a database. Returns: A list of all events. """ - sessions = self.get_mapping_sessions() events = [] @@ -334,7 +332,7 @@ def get_history(self) -> List: # Then analyse the pairs to make events return events - def print_events(self, events: List[Event], output_file: Path) -> None: + def print_events(self, events: list[Event], output_file: Path) -> None: """Print events in a format for BRC. Args: @@ -351,7 +349,7 @@ def print_events(self, events: List[Event], output_file: Path) -> None: for line in event_lines: out_fh.write(line + "\n") - def get_mapping_sessions(self) -> List[MappingSession]: + def get_mapping_sessions(self) -> list[MappingSession]: """Retrieve the mapping sessions from the connected database. Returns: @@ -362,7 +360,7 @@ def get_mapping_sessions(self) -> List[MappingSession]: map_sessions = list(self.session.scalars(map_sessions_stmt).unique().all()) return map_sessions - def get_pairs(self, session_id: int) -> List[Pair]: + def get_pairs(self, session_id: int) -> list[Pair]: """Retrieve all pair of ids for a given session. Args: @@ -372,7 +370,6 @@ def get_pairs(self, session_id: int) -> List[Pair]: All pairs of IDs. """ - id_events_stmt = ( select(StableIdEvent) .where( @@ -386,19 +383,19 @@ def get_pairs(self, session_id: int) -> List[Pair]: (StableIdEvent.old_stable_id != StableIdEvent.new_stable_id), ) ), - ) + ), ) .group_by( - StableIdEvent.old_stable_id, StableIdEvent.new_stable_id, StableIdEvent.mapping_session_id + StableIdEvent.old_stable_id, StableIdEvent.new_stable_id, StableIdEvent.mapping_session_id, ) ) - pairs: List[Pair] = [] + pairs: list[Pair] = [] for row in self.session.scalars(id_events_stmt).unique().all(): pair = Pair(row.old_stable_id, row.new_stable_id) pairs.append(pair) return pairs - def make_events(self, pairs: List[Pair]) -> List: + def make_events(self, pairs: list[Pair]) -> list: """Given a list of pairs, create events. Args: @@ -408,11 +405,10 @@ def make_events(self, pairs: List[Pair]) -> List: A list of events. """ - from_list, to_list = self.get_pairs_from_to(pairs) # Create events with those 2 dicts - events: List[Event] = [] + events: list[Event] = [] for old_id, from_old_list in from_list.items(): if not old_id or old_id not in from_list: continue @@ -444,9 +440,8 @@ def make_events(self, pairs: List[Pair]) -> List: return events @staticmethod - def get_pairs_from_to(pairs: List[Pair]) -> Tuple[DictToIdsSet, DictToIdsSet]: - """ - From a list of Pairs, extract a mapping of all ids from a given old id (from_list), + def get_pairs_from_to(pairs: list[Pair]) -> tuple[DictToIdsSet, DictToIdsSet]: + """From a list of Pairs, extract a mapping of all ids from a given old id (from_list), and a mapping of all ids to a given new id (to_list). 
@@ -280,7 +278,7 @@ def get_name(self) -> str:
             self._name_event()
         return self.name

-    def add_pairs(self, pairs: List[Pair]) -> None:
+    def add_pairs(self, pairs: list[Pair]) -> None:
         """Provided all the pairs, keep those that are used by this event.

         Args:
@@ -300,6 +298,7 @@ def add_pairs(self, pairs: List[Pair]) -> None:


 class DumpStableIDs:
+
     """An processor that create events from pairs of ids and can print those events out.

     Attributes:
@@ -311,14 +310,13 @@ def __init__(self, session: Session) -> None:
         """Create a processor for events"""
         self.session = session

-    def get_history(self) -> List:
+    def get_history(self) -> list:
         """Retrieve all events from a database.

         Returns:
             A list of all events.

         """
-
         sessions = self.get_mapping_sessions()

         events = []
@@ -334,7 +332,7 @@ def get_history(self) -> List:
         # Then analyse the pairs to make events
         return events

-    def print_events(self, events: List[Event], output_file: Path) -> None:
+    def print_events(self, events: list[Event], output_file: Path) -> None:
         """Print events in a format for BRC.

         Args:
@@ -351,7 +349,7 @@ def print_events(self, events: List[Event], output_file: Path) -> None:
             for line in event_lines:
                 out_fh.write(line + "\n")

-    def get_mapping_sessions(self) -> List[MappingSession]:
+    def get_mapping_sessions(self) -> list[MappingSession]:
         """Retrieve the mapping sessions from the connected database.

         Returns:
@@ -362,7 +360,7 @@ def get_mapping_sessions(self) -> List[MappingSession]:
         map_sessions = list(self.session.scalars(map_sessions_stmt).unique().all())
         return map_sessions

-    def get_pairs(self, session_id: int) -> List[Pair]:
+    def get_pairs(self, session_id: int) -> list[Pair]:
         """Retrieve all pair of ids for a given session.

         Args:
@@ -372,7 +370,6 @@ def get_pairs(self, session_id: int) -> List[Pair]:
             All pairs of IDs.

         """
-
         id_events_stmt = (
             select(StableIdEvent)
             .where(
@@ -386,19 +383,19 @@ def get_pairs(self, session_id: int) -> List[Pair]:
                         (StableIdEvent.old_stable_id != StableIdEvent.new_stable_id),
                     )
                 ),
-            )
+            ),
         )
             .group_by(
-                StableIdEvent.old_stable_id, StableIdEvent.new_stable_id, StableIdEvent.mapping_session_id
+                StableIdEvent.old_stable_id, StableIdEvent.new_stable_id, StableIdEvent.mapping_session_id,
             )
         )
-        pairs: List[Pair] = []
+        pairs: list[Pair] = []
         for row in self.session.scalars(id_events_stmt).unique().all():
             pair = Pair(row.old_stable_id, row.new_stable_id)
             pairs.append(pair)
         return pairs

-    def make_events(self, pairs: List[Pair]) -> List:
+    def make_events(self, pairs: list[Pair]) -> list:
         """Given a list of pairs, create events.

         Args:
@@ -408,11 +405,10 @@ def make_events(self, pairs: List[Pair]) -> List:
             A list of events.

         """
-
         from_list, to_list = self.get_pairs_from_to(pairs)

         # Create events with those 2 dicts
-        events: List[Event] = []
+        events: list[Event] = []
         for old_id, from_old_list in from_list.items():
             if not old_id or old_id not in from_list:
                 continue
@@ -444,9 +440,8 @@ def make_events(self, pairs: List[Pair]) -> List:
         return events

     @staticmethod
-    def get_pairs_from_to(pairs: List[Pair]) -> Tuple[DictToIdsSet, DictToIdsSet]:
-        """
-        From a list of Pairs, extract a mapping of all ids from a given old id (from_list),
+    def get_pairs_from_to(pairs: list[Pair]) -> tuple[DictToIdsSet, DictToIdsSet]:
+        """From a list of Pairs, extract a mapping of all ids from a given old id (from_list),
         and a mapping of all ids to a given new id (to_list).

         Args:
@@ -487,8 +482,8 @@ def get_pairs_from_to(pairs: List[Pair]) -> Tuple[DictToIdsSet, DictToIdsSet]:
         return from_list, to_list

     def extend_event(
-        self, event: Event, from_list: DictToIdsSet, to_list: DictToIdsSet
-    ) -> Tuple[Event, DictToIdsSet, DictToIdsSet]:
+        self, event: Event, from_list: DictToIdsSet, to_list: DictToIdsSet,
+    ) -> tuple[Event, DictToIdsSet, DictToIdsSet]:
         """Given an event, aggregate ids in the 'from' and 'to' sets, to connect the whole group.

         Args:
@@ -501,7 +496,6 @@ def extend_event(
             have been added to the event have been removed.

         """
-
         extended = True

         while extended:
@@ -537,7 +531,7 @@ def extend_event(
 def main() -> None:
     """Main entrypoint"""
     parser = ArgumentParser(
-        description="Dump the stable ID events from the information available in a core database."
+        description="Dump the stable ID events from the information available in a core database.",
     )
     parser.add_server_arguments(include_database=True)
     parser.add_argument_dst_path("--output_file", required=True, help="Output file")
diff --git a/src/python/ensembl/io/genomio/events/format.py b/src/python/ensembl/io/genomio/events/format.py
index 8580ec721..99f1f1d50 100644
--- a/src/python/ensembl/io/genomio/events/format.py
+++ b/src/python/ensembl/io/genomio/events/format.py
@@ -19,7 +19,6 @@
 from os import PathLike
 from pathlib import Path
 import re
-from typing import Dict, List

 from ensembl.io.genomio.events.load import EventCollection
 from ensembl.utils.argparse import ArgumentParser
@@ -27,16 +26,18 @@


 class IdsMapper:
+
     """Simple mapper object, to cleanly get a mapping dict."""

     def __init__(self, map_file: PathLike) -> None:
         self.map = self._load_mapping(Path(map_file))

-    def _load_mapping(self, map_file: Path) -> Dict[str, str]:
+    def _load_mapping(self, map_file: Path) -> dict[str, str]:
         """Return a mapping in a simple dict from a tab file with 2 columns: from_id, to_id.

         Args:
             map_file: Tab file path.
+
         """
         mapping = {}
         with map_file.open("r") as map_fh:
@@ -52,7 +53,7 @@ def _load_mapping(self, map_file: Path) -> Dict[str, str]:
         return mapping


-def load_list(list_file: Path) -> List[str]:
+def load_list(list_file: Path) -> list[str]:
     """Return a simple list from a file."""
     items = set()
     empty_spaces = re.compile(r"\s+")
@@ -71,10 +72,10 @@ def main() -> None:
     parser = ArgumentParser(description="Map stable IDs in a file and produce an events file.")
     parser.add_argument_src_path("--input_file", required=True, help="Input file from gene_diff")
     parser.add_argument_src_path(
-        "--deletes_file", required=True, help="Deleted genes file (apart from the deletes from the gene diff)"
+        "--deletes_file", required=True, help="Deleted genes file (apart from the deletes from the gene diff)",
     )
     parser.add_argument_src_path(
-        "--map_file", required=True, help="Mapping tab file with 2 columns: old_id, new_id"
+        "--map_file", required=True, help="Mapping tab file with 2 columns: old_id, new_id",
     )
     parser.add_argument("--release_name", required=True, metavar="NAME", help="Release name for all events")
     parser.add_argument("--release_date", required=True, metavar="DATE", help="Release date for all events")
diff --git a/src/python/ensembl/io/genomio/events/load.py b/src/python/ensembl/io/genomio/events/load.py
index 371d607bd..c4c9c2f9c 100644
--- a/src/python/ensembl/io/genomio/events/load.py
+++ b/src/python/ensembl/io/genomio/events/load.py
@@ -24,7 +24,7 @@
 from pathlib import Path
 import re
 import logging
-from typing import Dict, Generator, List, Optional, Tuple
+from typing import Generator

 from sqlalchemy.orm import Session

@@ -36,6 +36,7 @@

 @dataclass
 class IdEvent:
+
     """Simple representation for the events from the input file"""

     from_id: str
@@ -55,12 +56,13 @@ def is_change(self) -> bool:


 class MapSession:
+
     """Simple mapping_sessions representation from the input file"""

     def __init__(self, release: str, release_date: str) -> None:
         self.release = release
         self.release_date = release_date
-        self.events: List[IdEvent] = []
+        self.events: list[IdEvent] = []

     def add_event(self, event: IdEvent) -> None:
         """Add an event to this mapping_session"""
@@ -68,17 +70,18 @@ def add_event(self, event: IdEvent) -> None:


 class EventCollection:
+
     """Collection of events with loader/writer in various formats."""

     def __init__(self) -> None:
-        self.events: List[IdEvent] = []
+        self.events: list[IdEvent] = []

     def load_events(self, input_file: PathLike) -> None:
         """Load events from input file.

         Expected tab file columns: old_id, new_id, event_name, release, release_date
         """
-        events: List[IdEvent] = []
+        events: list[IdEvent] = []
         with Path(input_file).open("r") as events_fh:
             for line in events_fh:
@@ -87,23 +90,23 @@ def load_events(self, input_file: PathLike) -> None:
                     continue
                 (from_id, to_id, event_name, release, release_date) = line.split("\t")
                 event = IdEvent(
-                    from_id=from_id, to_id=to_id, event=event_name, release=release, release_date=release_date
+                    from_id=from_id, to_id=to_id, event=event_name, release=release, release_date=release_date,
                 )
                 events.append(event)
         self.events = events

     def add_deletes(
-        self, genes: List[str], release_name: str = "release_name", release_date: str = "release_date"
+        self, genes: list[str], release_name: str = "release_name", release_date: str = "release_date",
     ) -> None:
         """Add deletion events from a list of deleted genes."""
         for gene_id in genes:
             event = IdEvent(
-                from_id=gene_id, to_id="", event="deletion", release=release_name, release_date=release_date
+                from_id=gene_id, to_id="", event="deletion", release=release_name, release_date=release_date,
             )
             self.events.append(event)

     def load_events_from_gene_diff(
-        self, input_file: PathLike, release_name: str = "release_name", release_date: str = "release_date"
+        self, input_file: PathLike, release_name: str = "release_name", release_date: str = "release_date",
     ) -> None:
         """Load events from input file from gene_diff."""
         loaded_event = set()
@@ -131,7 +134,7 @@ def load_events_from_gene_diff(
                     )
                     self.events.append(event)

-    def _parse_gene_diff_event(self, event_string: str) -> Generator[Tuple[str, str, str], None, None]:
+    def _parse_gene_diff_event(self, event_string: str) -> Generator[tuple[str, str, str], None, None]:
         """Gets all the pairs of IDs from an event string from gene diff."""
         event_symbol = {
             "~": "identical",
@@ -157,13 +160,13 @@ def _parse_gene_diff_event(self, event_string: str) -> Generator[Tuple[str, str,
             for to_id in to_ids.split(":"):
                 yield (from_id, to_id, event_name)

-    def remap_to_ids(self, map_dict: Dict[str, str]) -> None:
+    def remap_to_ids(self, map_dict: dict[str, str]) -> None:
         """Using a mapping dict, remap the to_id of all events.

         Raises:
             ValueError: If there are events without map information.
-        """
+
+        """
         no_map = 0
         for event in self.events:
             if not event.to_id:
@@ -192,7 +195,7 @@ def write_events_to_db(self, session: Session, update: bool = False) -> None:

         """
         # First, create mapping_sessions based on the release
-        mappings: Dict[str, MapSession] = {}
+        mappings: dict[str, MapSession] = {}
         for event in self.events:
             release = event.release
             if release not in mappings:
@@ -208,10 +211,10 @@ def write_events_to_db(self, session: Session, update: bool = False) -> None:
             session.flush()
             session.refresh(map_session)
             for event in mapping.events:
-                from_id: Optional[str] = event.from_id
+                from_id: str | None = event.from_id
                 if from_id == "":
                     from_id = None
-                to_id: Optional[str] = event.to_id
+                to_id: str | None = event.to_id
                 if to_id == "":
                     to_id = None
                 id_event = StableIdEvent(
diff --git a/src/python/ensembl/io/genomio/external_db/db_map.py b/src/python/ensembl/io/genomio/external_db/db_map.py
index 9b07d5b73..9332003f1 100644
--- a/src/python/ensembl/io/genomio/external_db/db_map.py
+++ b/src/python/ensembl/io/genomio/external_db/db_map.py
@@ -30,6 +30,7 @@


 class MapFormatError(ValueError):
+
     """Error when parsing the db map file."""


diff --git a/src/python/ensembl/io/genomio/fasta/chunk.py b/src/python/ensembl/io/genomio/fasta/chunk.py
index 21c95141e..ad1a63c10 100644
--- a/src/python/ensembl/io/genomio/fasta/chunk.py
+++ b/src/python/ensembl/io/genomio/fasta/chunk.py
@@ -32,7 +32,7 @@
 import logging
 from pathlib import Path
 import re
-from typing import Any, Callable, ContextManager, Optional
+from typing import Any, Callable, ContextManager

 from Bio import SeqIO
 from Bio.Seq import Seq
@@ -48,6 +48,7 @@ def _on_value_error(msg: str) -> None:

     Args:
         msg: A message to raise ValueError with.
+
     """
     raise ValueError(msg)

@@ -65,6 +66,7 @@ def check_chunk_size_and_tolerance(

     Dies:
         If checks failed dies with `parser.error`
+
     """
     if chunk_size < 50_000:
         error_f(f"wrong '--chunk_size' value: '{chunk_size}'. should be greater then 50_000. exiting...")
@@ -72,7 +74,7 @@ def check_chunk_size_and_tolerance(
         error_f(f"wrong '--chunk_tolerance' value: '{chunk_tolerance}'. can't be less then 0. exiting...")


-def split_seq_by_n(seq: str, split_pattern: Optional[re.Pattern]) -> list[int]:
+def split_seq_by_n(seq: str, split_pattern: re.Pattern | None) -> list[int]:
     """Split a string into chunks at the positions where the pattern is found. `N`s (pattern) are appended
     to the chunk on the left.

@@ -85,6 +87,7 @@ def split_seq_by_n(seq: str, split_pattern: Optional[re.Pattern]) -> list[int]:

     Returns:
         List with open coordinates of the chunks ends (or with only a single sequence length).
+
     """
     seq_len = len(seq)
     if not split_pattern:
@@ -96,7 +99,7 @@ def split_seq_by_n(seq: str, split_pattern: Optional[re.Pattern]) -> list[int]:


 def split_seq_by_chunk_size(
-    ends: list[int], chunk_size: int, tolerated_size: Optional[int] = None
+    ends: list[int], chunk_size: int, tolerated_size: int | None = None,
 ) -> list[int]:
     """Split list of end coordinates, to form chunks not longer then chunk_size.

@@ -108,6 +111,7 @@ def split_seq_by_chunk_size(

     Returns:
         List with open coordinates of the chunks ends (or with only a single sequence length).
+
     """
     if not ends or chunk_size < 1:
         return ends
@@ -131,20 +135,21 @@ def _individual_file_opener(name: str) -> TextIOWrapper:

     Args:
         name: Name of the file to open
+
     """
-    return open(name, "wt", encoding="utf-8")
+    return open(name, "w", encoding="utf-8")


 def chunk_fasta_stream(
     input_fasta: TextIOWrapper,
     chunk_size: int,
     chunk_size_tolerated: int,
-    output_fasta: Optional[TextIOWrapper] | nullcontext[Any],
-    individual_file_prefix: Optional[Path],
+    output_fasta: TextIOWrapper | None | nullcontext[Any],
+    individual_file_prefix: Path | None,
     *,
     n_sequence_len: int = 0,
     chunk_sfx: str = "ens_chunk",
-    append_offset_to_chunk_name: Optional[bool] = None,
+    append_offset_to_chunk_name: bool | None = None,
     open_individual: Callable[[str], ContextManager[Any]] = _individual_file_opener,
 ) -> list[str]:
     """Split input TextIOWrapper stream with fasta into a smaller chunks based on
@@ -165,8 +170,8 @@ def chunk_fasta_stream(
         append_offset_to_chunk_name: A flag to append 0-based offset (`_off_{offset}`) to the chunk name.
         open_individual: A callable taking filename as an input to generate the output file for
             individual contig if out_put FASTA is `false` of `None`, folders should be preexisting.
-    """
+
+    """
     chunk_size_tolerated = max(chunk_size, chunk_size_tolerated)
     # output agp_lines list
     agp_lines = []
@@ -233,6 +238,7 @@ def get_tolerated_size(size: int, tolerance: int) -> int:

     Returns:
         Maximum tolerated chunk size.
+
     """
     tolerance = max(tolerance, 0)

@@ -246,12 +252,12 @@ def chunk_fasta(
     chunk_size: int,
     chunk_size_tolerated: int,
     out_file_name: str,
-    individual_file_prefix: Optional[Path],
+    individual_file_prefix: Path | None,
     *,
-    agp_output_file: Optional[str] = None,
+    agp_output_file: str | None = None,
     n_sequence_len: int = 0,
     chunk_sfx: str = "ens_chunk",
-    append_offset_to_chunk_name: Optional[bool] = None,
+    append_offset_to_chunk_name: bool | None = None,
 ) -> None:
     """Open `input_fasta_file` and split into a smaller chunks based on
     stretches of "N"s and then based on chunk_size_tolerated and store either to
@@ -269,19 +275,19 @@ def chunk_fasta(
         n_sequence_len: Length of the stretch of `N`s to split at; not slitting on `N`s if 0.
         chunk_sfx: A string to put between the original sequence name and the chunk suffix.
         append_offset_to_chunk_name: Append 0-based offset in the form of `_off_{offset}` to the chunk name.
-    """
+
+    """
     # process input fasta
     with open_gz_file(input_fasta_file) as fasta:
         logging.info(
             f"splitting sequences from '{input_fasta_file}', chunk size {chunk_size:_}, \
-splitting on {n_sequence_len} Ns (0 -- disabled)"
+splitting on {n_sequence_len} Ns (0 -- disabled)",
         )
         # do not open a joined file if you plan to open many individual ones
         with (
             individual_file_prefix
             and nullcontext(None)
-            or open(out_file_name, "wt", encoding="utf-8") as out_file_joined
+            or open(out_file_name, "w", encoding="utf-8") as out_file_joined
         ):
             agp_lines = chunk_fasta_stream(
                 fasta,
@@ -300,7 +306,7 @@ def chunk_fasta(
             agp_out.write("\n".join(agp_lines) + "\n")


-def prepare_out_dir_for_individuals(dir_name: Path, file_part: str) -> Optional[Path]:
+def prepare_out_dir_for_individuals(dir_name: Path, file_part: str) -> Path | None:
     """Creates `dir_name` (including upstream dirs) and returns its paths with the `file_part` appended.

     Args:
@@ -312,6 +318,7 @@ def prepare_out_dir_for_individuals(dir_name: Path, file_part: str) -> Optional[

     Throws:
         exception if not able to create directory.
+ """ file_prefix = None if dir_name: diff --git a/src/python/ensembl/io/genomio/fasta/process.py b/src/python/ensembl/io/genomio/fasta/process.py index 2c83a2844..4c89af260 100644 --- a/src/python/ensembl/io/genomio/fasta/process.py +++ b/src/python/ensembl/io/genomio/fasta/process.py @@ -19,7 +19,6 @@ import logging from pathlib import Path from os import PathLike -from typing import List, Optional, Set from Bio import SeqIO @@ -28,18 +27,18 @@ from ensembl.utils.logging import init_logging_with_args -exclude_seq_regions: List[str] = [] +exclude_seq_regions: list[str] = [] class FastaParserError(Exception): + """Error while parsing a FASTA file.""" -def get_peptides_to_exclude(genbank_path: PathLike, seqr_to_exclude: Set[str]) -> Set[str]: - """ - Extract peptide IDs from a genbank file that are in a given list of seq regions +def get_peptides_to_exclude(genbank_path: PathLike, seqr_to_exclude: set[str]) -> set[str]: + """Extract peptide IDs from a genbank file that are in a given list of seq regions """ - peptides_to_exclude: Set[str] = set() + peptides_to_exclude: set[str] = set() with open_gz_file(genbank_path) as in_genbank: for record in SeqIO.parse(in_genbank, "genbank"): if record.id in seqr_to_exclude: @@ -56,16 +55,16 @@ def get_peptides_to_exclude(genbank_path: PathLike, seqr_to_exclude: Set[str]) - def prep_fasta_data( fasta_infile: PathLike, - genbank_infile: Optional[PathLike], + genbank_infile: PathLike | None, fasta_outfile: PathLike, peptide_mode: bool = False, ) -> None: - """ - Args: - fasta_file: Input FASTA file - DNA / Protein - genbank_infile: Input GenBank GBFF file (Optional) - fasta_outfile: Output FASTA sequence file. - peptide_mode: Process proteins instead of DNA + """Args: + fasta_file: Input FASTA file - DNA / Protein + genbank_infile: Input GenBank GBFF file (Optional) + fasta_outfile: Output FASTA sequence file. + peptide_mode: Process proteins instead of DNA + """ file_path = Path(fasta_infile) diff --git a/src/python/ensembl/io/genomio/genbank/download.py b/src/python/ensembl/io/genomio/genbank/download.py index 5110e75d8..ea21876df 100644 --- a/src/python/ensembl/io/genomio/genbank/download.py +++ b/src/python/ensembl/io/genomio/genbank/download.py @@ -27,6 +27,7 @@ class DownloadError(Exception): + """In case a download failed.""" def __init__(self, msg: str) -> None: @@ -46,7 +47,6 @@ def download_genbank(accession: str, output_file: PathLike) -> None: DownloadError: If the download fails. """ - # Get the list of assemblies for this accession entrez_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" entrez_params = { diff --git a/src/python/ensembl/io/genomio/genbank/extract_data.py b/src/python/ensembl/io/genomio/genbank/extract_data.py index 59c001fa7..36de9b819 100644 --- a/src/python/ensembl/io/genomio/genbank/extract_data.py +++ b/src/python/ensembl/io/genomio/genbank/extract_data.py @@ -25,6 +25,7 @@ Returns: json_output: json file with a dict that contains all genome files created. 
+ """ __all__ = ["GBParseError", "UnsupportedData", "GenomeFiles", "FormattedFilesGenerator"] @@ -34,7 +35,7 @@ import logging from os import PathLike from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any from BCBio import GFF from Bio import GenBank, SeqIO @@ -47,16 +48,18 @@ class GBParseError(Exception): + """Error when parsing the Genbank file.""" class UnsupportedData(Exception): + """When an expected data is not supported by the current parser.""" class GenomeFiles(dict): - """ - Store the representation of the genome files created. + + """Store the representation of the genome files created. """ def __init__(self, out_dir: PathLike) -> None: @@ -70,8 +73,8 @@ def __init__(self, out_dir: PathLike) -> None: class FormattedFilesGenerator: - """ - Contains a parser to load data from a file, and output a set of files that follow our schema + + """Contains a parser to load data from a file, and output a set of files that follow our schema for input into a core database """ @@ -94,7 +97,7 @@ class FormattedFilesGenerator: def __init__(self, prod_name: str, gb_file: PathLike, prefix: str, out_dir: PathLike) -> None: self.prefix = prefix - self.seq_records: List[SeqRecord] = [] + self.seq_records: list[SeqRecord] = [] self.prod_name = prod_name self.gb_file = gb_file @@ -102,16 +105,16 @@ def __init__(self, prod_name: str, gb_file: PathLike, prefix: str, out_dir: Path self.files = GenomeFiles(Path(out_dir)) def parse_genbank(self, gb_file: PathLike) -> None: - """ - Load records metadata from a Genbank file + """Load records metadata from a Genbank file Args: gb_file: Path to downloaded genbank file + """ organella = self._get_organella(gb_file) logging.debug(f"Organella loaded: {organella}") - with open(gb_file, "r") as gbh: + with open(gb_file) as gbh: for record in SeqIO.parse(gbh, "genbank"): # We don't want the record description (especially for the fasta file) record.description = "" @@ -126,24 +129,23 @@ def parse_genbank(self, gb_file: PathLike) -> None: logging.warning("No records are found in gb_file") def format_write_record(self) -> None: - """ - Generate the prepared files from genbank record + """Generate the prepared files from genbank record """ self._format_genome_data() self._format_write_genes_gff() self._format_write_seq_json() self._write_fasta_dna() - def _get_organella(self, gb_file: PathLike) -> Dict[str, str]: - """ - Retrieve the organelle from the genbank file, using the specific GenBank object, + def _get_organella(self, gb_file: PathLike) -> dict[str, str]: + """Retrieve the organelle from the genbank file, using the specific GenBank object, because SeqIO does not support this field Args: gb_file: path to genbank file + """ organella = {} - with open(gb_file, "r") as gbh: + with open(gb_file) as gbh: for record in GenBank.parse(gbh): accession = record.version for q in record.features[0].qualifiers: @@ -153,21 +155,19 @@ def _get_organella(self, gb_file: PathLike) -> Dict[str, str]: return organella def _write_fasta_dna(self) -> None: - """ - Generate a DNA fasta file with all the sequences in the record + """Generate a DNA fasta file with all the sequences in the record """ logging.debug(f"Write {len(self.seq_records)} DNA sequences to {self.files['fasta_dna']}") with open(self.files["fasta_dna"], "w") as fasta_fh: SeqIO.write(self.seq_records, fasta_fh, "fasta") def _format_write_genes_gff(self) -> None: - """ - Extract gene models from the record, and write a GFF and peptide fasta file. 
+ """Extract gene models from the record, and write a GFF and peptide fasta file. Raise GBParseError If the IDs in all the records are not unique. """ - peptides: List[SeqRecord] = [] - gff_records: List[SeqRecord] = [] - all_ids: List[str] = [] + peptides: list[SeqRecord] = [] + gff_records: list[SeqRecord] = [] + all_ids: list[str] = [] for record in self.seq_records: new_record, rec_ids, rec_peptides = self._parse_record(record) @@ -192,38 +192,37 @@ def _format_write_genes_gff(self) -> None: if num_duplicates > 0: raise GBParseError(f"Some {num_duplicates} IDs are duplicated") - def _write_genes_gff(self, gff_records: List[SeqRecord]) -> None: - """ - Generate gene_models.gff file with the parsed gff_features + def _write_genes_gff(self, gff_records: list[SeqRecord]) -> None: + """Generate gene_models.gff file with the parsed gff_features Args: gff_records: List of records with features extracted from the record + """ logging.debug(f"Write {len(gff_records)} gene records to {self.files['gene_models']}") with self.files["gene_models"].open("w") as gff_fh: GFF.write(gff_records, gff_fh) - def _write_pep_fasta(self, peptides: List[SeqRecord]) -> None: - """ - Generate a peptide fasta file with the protein ids and sequence + def _write_pep_fasta(self, peptides: list[SeqRecord]) -> None: + """Generate a peptide fasta file with the protein ids and sequence Args: peptides: List of extracted peptide features as records + """ logging.debug(f"Write {len(peptides)} peptide sequences to {self.files['fasta_pep']}") with self.files["fasta_pep"].open("w") as fasta_fh: SeqIO.write(peptides, fasta_fh, "fasta") - def _parse_record(self, record: SeqRecord) -> Tuple[SeqRecord, List[str], List[SeqRecord]]: - """ - Parse a gene feature from the genbank file + def _parse_record(self, record: SeqRecord) -> tuple[SeqRecord, list[str], list[SeqRecord]]: + """Parse a gene feature from the genbank file Args: gene_feat: Gene feature to parse gene_name: Gene name associated with the gene feature """ - all_ids: List[str] = [] - peptides: List[SeqRecord] = [] - feats: Dict[str, SeqFeature] = {} + all_ids: list[str] = [] + peptides: list[SeqRecord] = [] + feats: dict[str, SeqFeature] = {} for feat in record.features: # Silently skip any unsupported feature type @@ -264,21 +263,20 @@ def _parse_record(self, record: SeqRecord) -> Tuple[SeqRecord, List[str], List[S return new_record, all_ids, peptides def _parse_gene_feat( - self, gene_feat: SeqFeature, gene_name: str - ) -> Tuple[Dict[str, SeqFeature], List[str], List[SeqRecord]]: - """ - Parse a gene feature from the genbank file + self, gene_feat: SeqFeature, gene_name: str, + ) -> tuple[dict[str, SeqFeature], list[str], list[SeqRecord]]: + """Parse a gene feature from the genbank file Args: gene_feat: Gene feature to parse gene_name: Gene name associated with the gene feature - """ + """ gene_id = self.prefix + gene_name gene_qualifiers = gene_feat.qualifiers - new_feats: Dict[str, Any] = {} - peptides: List[SeqRecord] = [] - all_ids: List[str] = [] + new_feats: dict[str, Any] = {} + peptides: list[SeqRecord] = [] + all_ids: list[str] = [] if gene_feat.type == "gene": if "pseudo" in gene_qualifiers: @@ -334,15 +332,15 @@ def _parse_gene_feat( return new_feats, all_ids, peptides - def _parse_rna_feat(self, rna_feat: SeqFeature) -> Tuple[Dict[str, SeqFeature], List[str]]: - """ - Parse an RNA feature + def _parse_rna_feat(self, rna_feat: SeqFeature) -> tuple[dict[str, SeqFeature], list[str]]: + """Parse an RNA feature Args: gene_feat: list of RNA features found in the 
record + """ - new_feats: Dict[str, Any] = {} - all_ids: List[str] = [] + new_feats: dict[str, Any] = {} + all_ids: list[str] = [] gff_qualifiers = rna_feat.qualifiers feat_name = gff_qualifiers["product"][0] @@ -376,16 +374,15 @@ def _parse_rna_feat(self, rna_feat: SeqFeature) -> Tuple[Dict[str, SeqFeature], return new_feats, all_ids - def _uniquify_id(self, gene_id: str, all_ids: List[str]) -> str: - """ - Ensure the gene id used is unique, + def _uniquify_id(self, gene_id: str, all_ids: list[str]) -> str: + """Ensure the gene id used is unique, and append a number otherwise, starting at 2 Args: all_ids: list of all the feature ids gene_id: ids assigned to gene - """ + """ new_id = gene_id num = 1 while new_id in all_ids: @@ -397,24 +394,23 @@ def _uniquify_id(self, gene_id: str, all_ids: List[str]) -> str: return new_id def _format_write_seq_json(self) -> None: - """ - Add the sequence metadata to seq_json based on ensembl requirements + """Add the sequence metadata to seq_json based on ensembl requirements """ json_array = [] for seq in self.seq_records: codon_table = self._get_codon_table(seq) if codon_table is None: logging.warning( - ( + "No codon table found. Make sure to change the codon table number in " - f"{self.files['seq_region']} manually if it is not the standard codon table." - ) + f"{self.files['seq_region']} manually if it is not the standard codon table.", + ) codon_table = 1 else: codon_table = int(codon_table) - seq_obj: Dict[str, Any] = { + seq_obj: dict[str, Any] = { "name": seq.id, "coord_system_level": "chromosome", "circular": (seq.annotations["topology"] == "circular"), @@ -425,11 +421,11 @@ def _format_write_seq_json(self) -> None: seq_obj["location"] = self._prepare_location(str(seq.annotations["organelle"])) if not codon_table: logging.warning( - ( + f"'{seq.annotations['organelle']}' is an organelle: " "make sure to change the codon table number " - f"in {self.files['seq_region']} manually if it is not the standard codon table" - ) + f"in {self.files['seq_region']} manually if it is not the standard codon table", + ) # Additional attributes for Ensembl @@ -443,23 +439,23 @@ def _format_write_seq_json(self) -> None: json_array.append(seq_obj) self._write_seq_region_json(json_array) - def _write_seq_region_json(self, json_array: List[Dict[str, Any]]) -> None: - """ - Generate seq_region.json file with metadata for the sequence + def _write_seq_region_json(self, json_array: list[dict[str, Any]]) -> None: + """Generate seq_region.json file with metadata for the sequence Args: json_array: List of extracted sequence with metadata + """ logging.debug(f"Write {len(json_array)} seq_region to {self.files['seq_region']}") with open(self.files["seq_region"], "w") as seq_fh: seq_fh.write(json.dumps(json_array, indent=4)) - def _get_codon_table(self, seq: SeqRecord) -> Optional[int]: - """ - Look at the CDS features to see if they have a codon table + def _get_codon_table(self, seq: SeqRecord) -> int | None: + """Look at the CDS features to see if they have a codon table Args: seq: SeqRecord in the genbank file + """ for feat in seq.features: if feat.type == "CDS": @@ -470,24 +466,23 @@ def _get_codon_table(self, seq: SeqRecord) -> Optional[int]: return None def _prepare_location(self, organelle: str) -> str: - """ - Given an organelle name, returns the SO term corresponding to its location + """Given an organelle name, returns the SO term corresponding to its location Args: organelle: SeqRecord with organelle + """ if organelle in self.locations: return 
self.locations[organelle] raise UnsupportedData(f"Unknown organelle: {organelle}") def _format_genome_data(self) -> None: - """ - Write a draft for the genome json file + """Write a draft for the genome json file Only the production_name is needed, but the rest of the fields need to be given for the validation of the json file """ prod_name = self.prod_name - genome_data: Dict[str, Dict[str, Any]] = { + genome_data: dict[str, dict[str, Any]] = { "species": { "production_name": prod_name, "taxonomy_id": 0, @@ -498,19 +493,19 @@ def _format_genome_data(self) -> None: if not genome_data["species"]["production_name"]: logging.warning( - f"Please add the relevant production_name for this genome in {self.files['genome']}" + f"Please add the relevant production_name for this genome in {self.files['genome']}", ) ids = [seq.id for seq in self.seq_records] genome_data["added_seq"]["region_name"] = ids self._write_genome_json(genome_data) - def _write_genome_json(self, genome_data: Dict[str, Any]) -> None: - """ - Generate genome.json file with metadata for the assembly + def _write_genome_json(self, genome_data: dict[str, Any]) -> None: + """Generate genome.json file with metadata for the assembly Args: genome_data: Dict of metadata for assembly + """ logging.debug(f"Write assembly metadata to {self.files['genome']}") with open(self.files["genome"], "w") as genome_fh: @@ -524,13 +519,13 @@ def main() -> None: parser.add_argument("--prefix", required=True, help="prefix to add to every feature ID") parser.add_argument("--prod_name", required=True, help="production name for the species") parser.add_argument_dst_path( - "--out_dir", default=Path.cwd(), help="output folder where the generated files will be stored" + "--out_dir", default=Path.cwd(), help="output folder where the generated files will be stored", ) parser.add_log_arguments(add_log_file=True) args = parser.parse_args() init_logging_with_args(args) gb_extractor = FormattedFilesGenerator( - prefix=args.prefix, prod_name=args.prod_name, gb_file=args.gb_file, out_dir=args.out_dir + prefix=args.prefix, prod_name=args.prod_name, gb_file=args.gb_file, out_dir=args.out_dir, ) gb_extractor.parse_genbank(args.gb_file) diff --git a/src/python/ensembl/io/genomio/genome_metadata/dump.py b/src/python/ensembl/io/genomio/genome_metadata/dump.py index 7cb7702b5..edbc50988 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/dump.py +++ b/src/python/ensembl/io/genomio/genome_metadata/dump.py @@ -22,7 +22,7 @@ ] import json -from typing import Any, Dict, Type +from typing import Any import logging from sqlalchemy import select @@ -34,7 +34,7 @@ from ensembl.utils.logging import init_logging_with_args -METADATA_FILTER: Dict[str, Dict[str, Type]] = { +METADATA_FILTER: dict[str, dict[str, type]] = { "added_seq": {"region_name": str}, "annotation": {"provider_name": str, "provider_url": str}, "assembly": { @@ -60,14 +60,14 @@ } -def get_genome_metadata(session: Session) -> Dict[str, Any]: +def get_genome_metadata(session: Session) -> dict[str, Any]: """Returns the meta table content from the core database in a nested dictionary. Args: session: Session for the current core. 
""" - genome_metadata: Dict[str, Any] = {} + genome_metadata: dict[str, Any] = {} meta_statement = select(Meta) for row in session.execute(meta_statement).unique().all(): meta_key = row[0].meta_key @@ -96,15 +96,16 @@ def get_genome_metadata(session: Session) -> Dict[str, Any]: return genome_metadata -def filter_genome_meta(genome_metadata: Dict[str, Any]) -> Dict[str, Any]: +def filter_genome_meta(genome_metadata: dict[str, Any]) -> dict[str, Any]: """Returns a filtered metadata dictionary with only the predefined keys in METADATA_FILTER. Also converts to expected data types (to follow the genome JSON schema). Args: genome_metadata: Nested metadata key values from the core metadata table. + """ - filtered_metadata: Dict[str, Any] = {} + filtered_metadata: dict[str, Any] = {} for key, subfilter in METADATA_FILTER.items(): if key in genome_metadata: filtered_metadata[key] = {} @@ -123,18 +124,19 @@ def filter_genome_meta(genome_metadata: Dict[str, Any]) -> Dict[str, Any]: return filtered_metadata -def check_assembly_refseq(gmeta_out: Dict[str, Any]) -> None: +def check_assembly_refseq(gmeta_out: dict[str, Any]) -> None: """Update the GCA accession to use GCF if it is from RefSeq. Args: genome_metadata: Nested metadata key values from the core metadata table. + """ assembly = gmeta_out.get("assembly", {}) if assembly.get("provider_name", "") == "RefSeq": assembly["accession"] = assembly["accession"].replace("GCA", "GCF") -def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: +def check_assembly_version(genome_metadata: dict[str, Any]) -> None: """Updates the assembly version of the genome metadata provided. If `version` meta key is not and integer or it is not available, the assembly accession's version @@ -145,6 +147,7 @@ def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: Raises: ValueError: If both `version` and the assembly accession's version are not integers or are missing. + """ assembly = genome_metadata["assembly"] version = assembly.get("version") @@ -164,7 +167,7 @@ def check_assembly_version(genome_metadata: Dict[str, Any]) -> None: logging.info(f'Located version [v{assembly["version"]}] info from meta data.') -def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: +def check_genebuild_version(genome_metadata: dict[str, Any]) -> None: """Updates the genebuild version (if not present) from the genebuild ID, removing the latter. Args: @@ -172,6 +175,7 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: Raises: ValueError: If there is no genebuild version or ID available. + """ try: genebuild = genome_metadata["genebuild"] @@ -191,7 +195,7 @@ def check_genebuild_version(genome_metadata: Dict[str, Any]) -> None: def main() -> None: """Main script entry-point.""" parser = ArgumentParser( - description="Fetch the genome metadata from a core database and print it in JSON format." 
+ description="Fetch the genome metadata from a core database and print it in JSON format.", ) parser.add_server_arguments(include_database=True) parser.add_log_arguments(add_log_file=True) diff --git a/src/python/ensembl/io/genomio/genome_metadata/extend.py b/src/python/ensembl/io/genomio/genome_metadata/extend.py index 3b3532996..1d09ab449 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/extend.py +++ b/src/python/ensembl/io/genomio/genome_metadata/extend.py @@ -25,7 +25,6 @@ from os import PathLike from pathlib import Path import re -from typing import Dict, List, Tuple, Optional from Bio import SeqIO @@ -38,12 +37,13 @@ _VERSION_END = re.compile(r"\.\d+$") -def get_additions(report_path: PathLike, gbff_path: Optional[PathLike]) -> List[str]: +def get_additions(report_path: PathLike, gbff_path: PathLike | None) -> list[str]: """Returns all `seq_regions` that are mentioned in the report but that are not in the data. Args: report_path: Path to the report file. gbff_path: Path to the GBFF file. + """ gbff_regions = set(get_gbff_regions(gbff_path)) report_regions = get_report_regions_names(report_path) @@ -59,11 +59,12 @@ def get_additions(report_path: PathLike, gbff_path: Optional[PathLike]) -> List[ return additions -def get_gbff_regions(gbff_path: Optional[PathLike]) -> List[str]: +def get_gbff_regions(gbff_path: PathLike | None) -> list[str]: """Returns the `seq_region` data from a GBFF file. Args: gbff_path: GBFF file path to use. + """ seq_regions = [] if gbff_path: @@ -74,11 +75,12 @@ def get_gbff_regions(gbff_path: Optional[PathLike]) -> List[str]: return seq_regions -def _report_to_csv(report_path: PathLike) -> Tuple[str, Dict]: +def _report_to_csv(report_path: PathLike) -> tuple[str, dict]: """Returns the assembly report as a CSV string, and its metadata as a dictionary. Args: report_path: Path to the assembly report file from INSDC/RefSeq. + """ data = "" metadata = {} @@ -100,11 +102,12 @@ def _report_to_csv(report_path: PathLike) -> Tuple[str, Dict]: return data, metadata -def get_report_regions_names(report_path: PathLike) -> List[Tuple[str, str]]: +def get_report_regions_names(report_path: PathLike) -> list[tuple[str, str]]: """Returns a list of GenBank-RefSeq `seq_region` names from the assembly report file. Args: report_path: Path to the assembly report file from INSDC/RefSeq. + """ # Get the report in a CSV format, easier to manipulate report_csv, _ = _report_to_csv(report_path) @@ -128,15 +131,15 @@ def get_report_regions_names(report_path: PathLike) -> List[Tuple[str, str]]: def amend_genome_metadata( genome_infile: PathLike, genome_outfile: PathLike, - report_file: Optional[PathLike] = None, - genbank_file: Optional[PathLike] = None, + report_file: PathLike | None = None, + genbank_file: PathLike | None = None, ) -> None: - """ - Args: - genome_infile: Genome metadata following the `src/python/ensembl/io/genomio/data/schemas/genome.json`. - genome_outfile: Amended genome metadata file. - report_file: INSDC/RefSeq sequences report file. - genbank_file: INSDC/RefSeq GBFF file. + """Args: + genome_infile: Genome metadata following the `src/python/ensembl/io/genomio/data/schemas/genome.json`. + genome_outfile: Amended genome metadata file. + report_file: INSDC/RefSeq sequences report file. + genbank_file: INSDC/RefSeq GBFF file. 
+ """ genome_metadata = get_json(genome_infile) # Get additional sequences in the assembly but not in the data @@ -159,7 +162,7 @@ def main() -> None: help="Input genome metadata file (following src/python/ensembl/io/genomio/data/schemas/genome.json)", ) parser.add_argument_dst_path( - "--genome_outfile", required=True, help="Path to the new amended genome metadata file" + "--genome_outfile", required=True, help="Path to the new amended genome metadata file", ) parser.add_argument_src_path("--report_file", help="INSDC/RefSeq sequences report file") parser.add_argument_src_path("--genbank_file", help="INSDC/RefSeq GBFF file") diff --git a/src/python/ensembl/io/genomio/genome_metadata/prepare.py b/src/python/ensembl/io/genomio/genome_metadata/prepare.py index c7eaca1fd..f356ba088 100644 --- a/src/python/ensembl/io/genomio/genome_metadata/prepare.py +++ b/src/python/ensembl/io/genomio/genome_metadata/prepare.py @@ -29,7 +29,6 @@ import datetime from os import PathLike -from typing import Dict from ensembl.io.genomio.utils import get_json, print_json from ensembl.utils.argparse import ArgumentParser @@ -61,14 +60,16 @@ class MissingNodeError(Exception): + """When a taxon XML node cannot be found.""" class MetadataError(Exception): + """When a metadata value is not expected.""" -def add_provider(genome_metadata: Dict, ncbi_data: Dict) -> None: +def add_provider(genome_metadata: dict, ncbi_data: dict) -> None: """Updates the genome metadata adding provider information for assembly and gene models. Assembly provider metadata will only be added if it is missing, i.e. neither `"provider_name"` or @@ -80,6 +81,7 @@ def add_provider(genome_metadata: Dict, ncbi_data: Dict) -> None: Raises: MetadataError: If accession's format in genome metadata does not match with a known provider. + """ # Get accession provider accession = genome_metadata["assembly"]["accession"] @@ -104,11 +106,12 @@ def add_provider(genome_metadata: Dict, ncbi_data: Dict) -> None: annotation["provider_url"] = f'{provider["annotation"]["provider_url"]}/{accession}' -def add_assembly_version(genome_data: Dict) -> None: +def add_assembly_version(genome_data: dict) -> None: """Adds version number to the genome's assembly information if one is not present already. Args: genome_data: Genome information of assembly, accession and annotation. + """ assembly = genome_data["assembly"] if "version" not in assembly: @@ -118,13 +121,14 @@ def add_assembly_version(genome_data: Dict) -> None: assembly["version"] = int(version) -def add_genebuild_metadata(genome_data: Dict) -> None: +def add_genebuild_metadata(genome_data: dict) -> None: """Adds genebuild metadata to genome information if not present already. The default convention is to use the current date as `"version"` and `"start_date"`. Args: genome_data: Genome information of assembly, accession and annotation. + """ genebuild = genome_data.setdefault("genebuild", {}) current_date = datetime.date.today().isoformat() @@ -134,7 +138,7 @@ def add_genebuild_metadata(genome_data: Dict) -> None: genebuild["start_date"] = current_date -def add_species_metadata(genome_metadata: Dict, ncbi_data: Dict) -> None: +def add_species_metadata(genome_metadata: dict, ncbi_data: dict) -> None: """Adds taxonomy ID, scientific name and strain (if present) from the NCBI dataset report. 
Args: @@ -194,15 +198,15 @@ def main() -> None: parser = ArgumentParser(description=__doc__) parser.add_argument_src_path("--input_file", required=True, help="Genome metadata JSON file") parser.add_argument_dst_path( - "--output_file", required=True, help="Output path for the new genome metadata file" + "--output_file", required=True, help="Output path for the new genome metadata file", ) parser.add_argument_src_path( - "--ncbi_meta", required=True, help="JSON file from NCBI datasets for this genome." + "--ncbi_meta", required=True, help="JSON file from NCBI datasets for this genome.", ) parser.add_log_arguments() args = parser.parse_args() init_logging_with_args(args) prepare_genome_metadata( - input_file=args.input_file, output_file=args.output_file, ncbi_meta=args.ncbi_meta + input_file=args.input_file, output_file=args.output_file, ncbi_meta=args.ncbi_meta, ) diff --git a/src/python/ensembl/io/genomio/genome_stats/compare.py b/src/python/ensembl/io/genomio/genome_stats/compare.py index 3f66b84fb..f6506e479 100644 --- a/src/python/ensembl/io/genomio/genome_stats/compare.py +++ b/src/python/ensembl/io/genomio/genome_stats/compare.py @@ -19,14 +19,14 @@ import json from os import PathLike import re -from typing import Any, Dict +from typing import Any from ensembl.io.genomio.utils import get_json from ensembl.utils.argparse import ArgumentParser from ensembl.utils.logging import init_logging_with_args -def stats_dict_cmp(ncbi: Dict[str, int], core: Dict[str, int]) -> Dict[str, Dict]: +def stats_dict_cmp(ncbi: dict[str, int], core: dict[str, int]) -> dict[str, dict]: """Compares both dictionaries and returns the similar and different elements between both. The method assumes both dictionaries have the same set of keys. A key would be considered the @@ -53,7 +53,7 @@ def stats_dict_cmp(ncbi: Dict[str, int], core: Dict[str, int]) -> Dict[str, Dict same[key] = ncbi_count else: diff[key] = {"ncbi": ncbi_count, "core": core_count, "diff": core_count - ncbi_count} - comparison: Dict[str, Dict] = {} + comparison: dict[str, dict] = {} if same: comparison["same"] = same if diff: @@ -61,7 +61,7 @@ def stats_dict_cmp(ncbi: Dict[str, int], core: Dict[str, int]) -> Dict[str, Dict return comparison -def compare_assembly(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]: +def compare_assembly(ncbi: dict[str, Any], core: dict[str, Any]) -> dict[str, dict]: """Extracts the assembly statistics and returns the comparison between both sources. The assembly statistics compared are the number of: organella, chromosomes, scaffolds and contigs. @@ -121,7 +121,7 @@ def compare_assembly(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Di return stats_dict_cmp(ncbi_counts, core_counts) -def compare_annotation(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]: +def compare_annotation(ncbi: dict[str, Any], core: dict[str, Any]) -> dict[str, dict]: """Extracts the annotation statistics and returns the comparison between both sources. Annotation statistics compared: @@ -168,7 +168,7 @@ def compare_annotation(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, return stats_dict_cmp(ncbi_counts, core_counts) -def compare_stats(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict]: +def compare_stats(ncbi: dict[str, Any], core: dict[str, Any]) -> dict[str, dict]: """Compares the genome statistics between an NCBI dataset and a core database. 
Args: @@ -186,7 +186,7 @@ def compare_stats(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict] core_assembly_stats = core.get("assembly_stats", {}) core_annotation_stats = core.get("annotation_stats", {}) - comp: Dict[str, Dict] = { + comp: dict[str, dict] = { "assembly_diff": compare_assembly(ncbi, core_assembly_stats), } if core_annotation_stats or ncbi_annotation_stats: @@ -194,7 +194,7 @@ def compare_stats(ncbi: Dict[str, Any], core: Dict[str, Any]) -> Dict[str, Dict] return comp -def compare_stats_files(ncbi_file: PathLike, core_file: PathLike) -> Dict[str, Dict]: +def compare_stats_files(ncbi_file: PathLike, core_file: PathLike) -> dict[str, dict]: """Compares the genome statistics between an NCBI dataset and a core database. Args: @@ -218,7 +218,7 @@ def compare_stats_files(ncbi_file: PathLike, core_file: PathLike) -> Dict[str, D def main() -> None: """Main script entry-point.""" parser = ArgumentParser( - description="Compares the genome statistics between an NCBI dataset and a core database." + description="Compares the genome statistics between an NCBI dataset and a core database.", ) parser.add_argument_src_path("--ncbi_stats", required=True, help="NCBI dataset stats JSON file") parser.add_argument_src_path("--core_stats", required=True, help="core database stats JSON file") diff --git a/src/python/ensembl/io/genomio/genome_stats/dump.py b/src/python/ensembl/io/genomio/genome_stats/dump.py index d56f05f1c..13997b552 100644 --- a/src/python/ensembl/io/genomio/genome_stats/dump.py +++ b/src/python/ensembl/io/genomio/genome_stats/dump.py @@ -18,7 +18,7 @@ from dataclasses import dataclass import json -from typing import Any, Dict +from typing import Any from sqlalchemy import select, func from sqlalchemy.orm import Session @@ -32,11 +32,12 @@ @dataclass class StatsGenerator: + """Interface to extract genome stats from a core database.""" session: Session - def get_assembly_stats(self) -> Dict[str, Any]: + def get_assembly_stats(self) -> dict[str, Any]: """Returns a dict of stats about the assembly.""" stats = { "coord_system": self.get_attrib_counts("coord_system_tag"), @@ -48,7 +49,7 @@ def get_assembly_stats(self) -> Dict[str, Any]: return stats @staticmethod - def _fix_scaffolds(stats: Dict[str, Any]) -> None: + def _fix_scaffolds(stats: dict[str, Any]) -> None: """Renames supercontigs to scaffolds in the provided stats. If scaffolds are present already, nothing is done. @@ -62,7 +63,7 @@ def _fix_scaffolds(stats: Dict[str, Any]) -> None: coords["scaffold"] = coords["supercontig"] del coords["supercontig"] - def get_attrib_counts(self, code: str) -> Dict[str, Any]: + def get_attrib_counts(self, code: str) -> dict[str, Any]: """Returns a dict of count for each value counted with the attrib_type code provided. 
Args: @@ -81,7 +82,7 @@ def get_attrib_counts(self, code: str) -> Dict[str, Any]: attributes[attribute_name] = count return attributes - def get_annotation_stats(self) -> Dict[str, Any]: + def get_annotation_stats(self) -> dict[str, Any]: """Returns a dict of stats about the coordinate systems (number of biotypes, etc.).""" stats = { "genes": self.get_feature_stats(Gene), @@ -89,7 +90,7 @@ def get_annotation_stats(self) -> Dict[str, Any]: } return stats - def get_biotypes(self, table: Any) -> Dict[str, int]: + def get_biotypes(self, table: Any) -> dict[str, int]: """Returns a dict of stats about the feature biotypes.""" # pylint: disable-next=not-callable seqs_st = select(table.biotype, func.count()).group_by(table.biotype) @@ -99,7 +100,7 @@ def get_biotypes(self, table: Any) -> Dict[str, int]: biotypes[biotype] = count return biotypes - def get_feature_stats(self, table: Any) -> Dict[str, int]: + def get_feature_stats(self, table: Any) -> dict[str, int]: """Returns a dict of stats about a given feature.""" session = self.session totals_st = select(func.count()).select_from(table) # pylint: disable=not-callable @@ -122,7 +123,7 @@ def get_feature_stats(self, table: Any) -> Dict[str, int]: } return feat_stats - def get_genome_stats(self) -> Dict[str, Any]: + def get_genome_stats(self) -> dict[str, Any]: """Returns a dict of stats about the assembly and annotation.""" genome_stats = { "assembly_stats": self.get_assembly_stats(), @@ -131,7 +132,7 @@ def get_genome_stats(self) -> Dict[str, Any]: return genome_stats -def dump_genome_stats(url: StrURL) -> Dict[str, Any]: +def dump_genome_stats(url: StrURL) -> dict[str, Any]: """Returns JSON object containing the genome stats (assembly and annotation) of the given core database. Args: diff --git a/src/python/ensembl/io/genomio/gff3/exceptions.py b/src/python/ensembl/io/genomio/gff3/exceptions.py index d218da268..1d4a3b177 100644 --- a/src/python/ensembl/io/genomio/gff3/exceptions.py +++ b/src/python/ensembl/io/genomio/gff3/exceptions.py @@ -23,6 +23,7 @@ class GFFParserError(Exception): + """Error when parsing a GFF3 file.""" def __init__(self, message: str) -> None: @@ -31,12 +32,15 @@ def __init__(self, message: str) -> None: class GeneSegmentError(GFFParserError): + """GFF3 gene segment parsing error.""" class IgnoredFeatureError(GFFParserError): + """GFF3 feature can be ignored.""" class UnsupportedFeatureError(GFFParserError): + """GFF3 feature is not supported.""" diff --git a/src/python/ensembl/io/genomio/gff3/extract_annotation.py b/src/python/ensembl/io/genomio/gff3/extract_annotation.py index c52af22b8..6404e555c 100644 --- a/src/python/ensembl/io/genomio/gff3/extract_annotation.py +++ b/src/python/ensembl/io/genomio/gff3/extract_annotation.py @@ -26,13 +26,13 @@ import logging from pathlib import Path import re -from typing import Any, Dict, List, Optional +from typing import Any from ensembl.io.genomio.utils.json_utils import print_json from .features import GFFSeqFeature -Annotation = Dict[str, Any] +Annotation = dict[str, Any] _PARENTS = { "transcript": "gene", @@ -41,42 +41,46 @@ class DuplicateIdError(Exception): + """Trying to add a feature with an ID already in use.""" class MissingParentError(Exception): + """Trying to add a feature without an expected parent.""" class AnnotationError(Exception): + """If anything wrong happens when recording annotations.""" class FunctionalAnnotations: + """List of annotations extracted from a GFF3 file.""" ignored_xrefs = {"go", "interpro", "uniprot"} def __init__(self, provider_name: str = 
"") -> None: - self.annotations: List[Annotation] = [] + self.annotations: list[Annotation] = [] self.provider_name = provider_name # Annotated features # Under each feature, each dict's key is a feature ID - self.features: Dict[str, Dict[str, Annotation]] = { + self.features: dict[str, dict[str, Annotation]] = { "gene": {}, "transcript": {}, "translation": {}, "transposable_element": {}, } # Keep parent info: key is the feature ID, value is the parent ID - self.parents: Dict[str, Dict[str, str]] = { + self.parents: dict[str, dict[str, str]] = { "gene": {}, "transcript": {}, } - def get_xrefs(self, feature: GFFSeqFeature) -> List[Dict[str, Any]]: + def get_xrefs(self, feature: GFFSeqFeature) -> list[dict[str, Any]]: """Get the xrefs from the Dbxref field.""" - all_xref: List[Dict[str, str]] = [] + all_xref: list[dict[str, str]] = [] if "Dbxref" in feature.qualifiers: for xref in feature.qualifiers["Dbxref"]: @@ -99,7 +103,7 @@ def get_xrefs(self, feature: GFFSeqFeature) -> List[Dict[str, Any]]: return all_xref - def get_features(self, feat_type: str) -> Dict[str, Annotation]: + def get_features(self, feat_type: str) -> dict[str, Annotation]: """Get all feature annotations for the requested type.""" try: return self.features[feat_type] @@ -129,8 +133,8 @@ def add_feature( self, feature: GFFSeqFeature, feat_type: str, - parent_id: Optional[str] = None, - all_parent_ids: Optional[List[str]] = None, + parent_id: str | None = None, + all_parent_ids: list[str] | None = None, ) -> None: """Add annotation for a feature of a given type. If a parent_id is provided, record the relationship. @@ -139,6 +143,7 @@ def add_feature( feat_type: Type of the feature to annotate. parent_id: Parent ID of this feature to keep it linked. all_parent_ids: All parent IDs to remove from non-informative descriptions. + """ if all_parent_ids is None: all_parent_ids = [] @@ -157,8 +162,8 @@ def add_feature( raise AnnotationError(f"No parent possible for {feat_type} {feature.id}") def _generic_feature( - self, feature: GFFSeqFeature, feat_type: str, parent_ids: Optional[List[str]] = None - ) -> Dict[str, Any]: + self, feature: GFFSeqFeature, feat_type: str, parent_ids: list[str] | None = None, + ) -> dict[str, Any]: """Create a feature object following the specifications. Args: @@ -245,7 +250,7 @@ def _clean_description(description: str) -> str: return description @staticmethod - def product_is_informative(product: str, feat_ids: Optional[List[str]] = None) -> bool: + def product_is_informative(product: str, feat_ids: list[str] | None = None) -> bool: """Returns True if the product name contains informative words, False otherwise. 
     It is considered uninformative when the description contains words such as "hypothetical" or
diff --git a/src/python/ensembl/io/genomio/gff3/features.py b/src/python/ensembl/io/genomio/gff3/features.py
index 4198d304a..0c17594ed 100644
--- a/src/python/ensembl/io/genomio/gff3/features.py
+++ b/src/python/ensembl/io/genomio/gff3/features.py
@@ -24,6 +24,7 @@

 class GFFSeqFeature(SeqFeature):
+
     """Extends `Bio.SeqFeature.SeqFeature` with sub_features, to be used for typing."""

     def __init__(
diff --git a/src/python/ensembl/io/genomio/gff3/gene_merger.py b/src/python/ensembl/io/genomio/gff3/gene_merger.py
index 2ade281a3..a4b552400 100644
--- a/src/python/ensembl/io/genomio/gff3/gene_merger.py
+++ b/src/python/ensembl/io/genomio/gff3/gene_merger.py
@@ -23,13 +23,13 @@
 from os import PathLike
 from pathlib import Path
 import re
-from typing import List

 import ensembl.io.genomio.data.gff3
 from ensembl.io.genomio.utils.json_utils import get_json


 class GFFGeneMerger:
+
     """Specialized class to merge split genes in a GFF3 file, prior to further parsing."""

     def __init__(self) -> None:
@@ -37,9 +37,8 @@ def __init__(self) -> None:
         with as_file(source) as biotypes_json:
             self._biotypes = get_json(biotypes_json)

-    def merge(self, in_gff_path: PathLike, out_gff_path: PathLike) -> List[str]:
-        """
-        Merge genes in a gff that are split in multiple lines.
+    def merge(self, in_gff_path: PathLike, out_gff_path: PathLike) -> list[str]:
+        """Merge genes in a GFF3 file that are split into multiple lines.

         Args:
             in_gff_path: Input GFF3 that may have split genes.
             out_gff_path: Output GFF3 with those genes merged.

         Returns:
             List of all merged genes, each represented as a string of the GFF3 lines of all their parts.
+
         """
         to_merge = []
-        merged: List[str] = []
+        merged: list[str] = []

         with Path(in_gff_path).open("r") as in_gff_fh, Path(out_gff_path).open("w") as out_gff_fh:
             for line in in_gff_fh:
@@ -101,7 +101,7 @@ def merge(self, in_gff_path: PathLike, out_gff_path: PathLike) -> list[str]:
         logging.debug(f"Merged lines: {len(merged)}")
         return merged

-    def _merge_genes(self, to_merge: List) -> str:
+    def _merge_genes(self, to_merge: list) -> str:
         """Returns a single gene gff3 line merged from separate parts.

         Args:
diff --git a/src/python/ensembl/io/genomio/gff3/id_allocator.py b/src/python/ensembl/io/genomio/gff3/id_allocator.py
index 07a190ead..ca8265fa3 100644
--- a/src/python/ensembl/io/genomio/gff3/id_allocator.py
+++ b/src/python/ensembl/io/genomio/gff3/id_allocator.py
@@ -19,17 +19,18 @@
 from dataclasses import dataclass, field
 import logging
 import re
-from typing import Dict, List, Optional, Set

 from .features import GFFSeqFeature


 class InvalidStableID(ValueError):
+
     """Raised when there is a problem with a stable ID."""


 @dataclass
 class StableIDAllocator:
+
     """Set of tools to check and allocate stable IDs."""

     # Multiple parameters to automate various fixes
@@ -38,9 +39,9 @@ class StableIDAllocator:
     current_id_number: int = 0
     make_missing_stable_ids: bool = True
     prefix: str = "TMP_"
-    _loaded_ids: Set = field(default_factory=set)
+    _loaded_ids: set = field(default_factory=set)

-    def set_prefix(self, genome: Dict) -> None:
+    def set_prefix(self, genome: dict) -> None:
         """Sets the ID prefix using the organism abbrev if it exists in the genome metadata."""
         try:
             org = genome["BRC4"]["organism_abbrev"]
@@ -62,10 +63,11 @@ def generate_gene_id(self) -> str:
     def is_valid(self, stable_id: str) -> bool:
         """Checks that the format of a stable ID is valid.
+ Args: stable_id: Stable ID to validate. - """ + """ if self.skip_gene_id_validation: logging.debug(f"Validation deactivated by user: '{stable_id}' not checked") return True @@ -93,7 +95,7 @@ def is_valid(self, stable_id: str) -> bool: return True @staticmethod - def remove_prefix(stable_id: str, prefixes: List[str]) -> str: + def remove_prefix(stable_id: str, prefixes: list[str]) -> str: """Returns the stable ID after removing its prefix (if any). If more than one prefix may be found, only the first one is removed. @@ -101,8 +103,8 @@ def remove_prefix(stable_id: str, prefixes: List[str]) -> str: Args: stable_id: Stable ID to process. prefixes: List of prefixes to search for. - """ + """ for prefix in prefixes: if stable_id.startswith(prefix): return stable_id[len(prefix) :] @@ -111,9 +113,11 @@ def remove_prefix(stable_id: str, prefixes: List[str]) -> str: @staticmethod def generate_transcript_id(gene_id: str, number: int) -> str: """Returns a formatted transcript ID generated from a gene ID and number. + Args: gene_id: Gene stable ID. number: Positive number. + Raises: ValueError: If the number provided is not greater than zero. @@ -134,7 +138,6 @@ def normalize_cds_id(self, cds_id: str) -> str: cds_id: CDS ID to normalize. """ - prefixes = ["cds-", "cds:"] normalized_cds_id = StableIDAllocator.remove_prefix(cds_id, prefixes) @@ -152,6 +155,7 @@ def normalize_pseudogene_cds_id(self, pseudogene: GFFSeqFeature) -> None: Args: pseudogene: Pseudogene feature. + """ for transcript in pseudogene.sub_features: for feat in transcript.sub_features: @@ -161,7 +165,7 @@ def normalize_pseudogene_cds_id(self, pseudogene: GFFSeqFeature) -> None: feat.id = f"{transcript.id}_cds" feat.qualifiers["ID"] = feat.id - def normalize_gene_id(self, gene: GFFSeqFeature, refseq: Optional[bool] = False) -> str: + def normalize_gene_id(self, gene: GFFSeqFeature, refseq: bool | None = False) -> str: """Returns a normalized gene stable ID. Removes any unnecessary prefixes, but will generate a new stable ID if the normalized one is @@ -169,6 +173,7 @@ def normalize_gene_id(self, gene: GFFSeqFeature, refseq: Optional[bool] = False) Args: gene: Gene feature to normalize. + """ prefixes = ["gene-", "gene:"] new_gene_id = StableIDAllocator.remove_prefix(gene.id, prefixes) diff --git a/src/python/ensembl/io/genomio/gff3/overlaps.py b/src/python/ensembl/io/genomio/gff3/overlaps.py index 571dfd243..285054fa2 100644 --- a/src/python/ensembl/io/genomio/gff3/overlaps.py +++ b/src/python/ensembl/io/genomio/gff3/overlaps.py @@ -42,8 +42,8 @@ def summarize_feature_stats(gff_in: Path) -> None: Args: gff_in: User supplied GFF3 input file. - """ + """ logging.info("Alt processing: Not parsing the GFF3, producing summary feature stats instead!") examiner = GFFExaminer() @@ -59,9 +59,10 @@ def identify_feature_overlaps(gff_in: Path, output_file: Path, isolate_feature: gff_in: User supplied GFF3 input file. output_file: Output file to write feature overlaps. isolate_feature: Sequence feature type to filter by. + """ logging.info("Processing sequence feature overlaps!") - logging.info(f"Output file = {str(output_file)}") + logging.info(f"Output file = {output_file!s}") logging.info(f"Features filtered by type: {isolate_feature}") gff_type_filter: dict = {"gff_type": [isolate_feature]} @@ -98,8 +99,8 @@ def scan_tree(feature_intervals: list) -> set: Return: Set of intervals identified in the input GFF3 file that overlap with 2 or more intervals. 
- """ + """ interval_sets = set() traversed_tree = IntervalTree(Interval(*iv) for iv in feature_intervals) @@ -123,6 +124,7 @@ def _write_report(out_file: Path, seq_dict: dict, genes_dict: dict) -> int: Returns: Count of overlaps detected + """ overlap_count = 0 overlap_features = [] @@ -156,8 +158,8 @@ def get_intervals(record: SeqRecord, genes_dict: dict, seq_dict: dict, seq_name: genes_dict: Genes. seq_dict: Sequences. seq_name: Feature sequence name. - """ + """ for feature in record.features: genes_dict[str(feature.id)] = { "sequence": f"{record.id}", @@ -169,11 +171,11 @@ def get_intervals(record: SeqRecord, genes_dict: dict, seq_dict: dict, seq_name: if feature.location.strand == 1: seq_dict[seq_name]["plus"].append( - (int(feature.location.start), int(feature.location.end), str(feature.id)) + (int(feature.location.start), int(feature.location.end), str(feature.id)), ) elif feature.location.strand == -1: seq_dict[seq_name]["minus"].append( - (int(feature.location.start), int(feature.location.end), str(feature.id)) + (int(feature.location.start), int(feature.location.end), str(feature.id)), ) else: logging.critical("Something went wrong with the strand processing!") @@ -189,21 +191,21 @@ def main() -> None: # Add subparsers with their parent being the base parser with the common arguments subparsers = parser.add_subparsers(title="Parse GFF3 and ", required=True, dest="subcommand") gff3_stats_parser = subparsers.add_parser( # pylint: disable=unused-variable - "stats", parents=[base_parser], help="Provide summary of feature types" + "stats", parents=[base_parser], help="Provide summary of feature types", ) overlaps_parser = subparsers.add_parser("overlaps", parents=[base_parser], help="Find feature overlaps") overlaps_parser.add_argument_dst_path( - "--output_file", default="feature_overlaps.txt", help="path of output file" + "--output_file", default="feature_overlaps.txt", help="path of output file", ) overlaps_parser.add_argument( - "--filter_type", default="gene", help="sequence feature type used for overlap isolation" + "--filter_type", default="gene", help="sequence feature type used for overlap isolation", ) args = parser.parse_args() init_logging_with_args(args) logging.info("Starting processing...") - logging.info(f"GFF input file = {str(args.input_gff)}") + logging.info(f"GFF input file = {args.input_gff!s}") # Check optional processing param if args.subcommand == "stats": diff --git a/src/python/ensembl/io/genomio/gff3/process.py b/src/python/ensembl/io/genomio/gff3/process.py index 6285c3003..917b0032f 100644 --- a/src/python/ensembl/io/genomio/gff3/process.py +++ b/src/python/ensembl/io/genomio/gff3/process.py @@ -31,12 +31,12 @@ def main() -> None: description=( "Standardize the gene model representation of a GFF3 file, and extract the functional " "annotation in a separate file." 
-        )
+        ),
     )
     parser.add_argument_src_path("--in_gff_path", required=True, help="Input GFF3 file")
     parser.add_argument_src_path("--genome_data", required=True, help="Genome JSON file")
     parser.add_argument(
-        "--fail_missing_stable_ids", action="store_true", help="Do not generate IDs when missing/invalid"
+        "--fail_missing_stable_ids", action="store_true", help="Do not generate IDs when missing/invalid",
     )
     parser.add_argument_dst_path("--out_gff_path", default=Path("gene_models.gff3"), help="Output GFF3 file")
     parser.add_argument_dst_path(
diff --git a/src/python/ensembl/io/genomio/gff3/restructure.py b/src/python/ensembl/io/genomio/gff3/restructure.py
index ad7c73cc0..81cf8721d 100644
--- a/src/python/ensembl/io/genomio/gff3/restructure.py
+++ b/src/python/ensembl/io/genomio/gff3/restructure.py
@@ -27,7 +27,6 @@

 from collections import Counter
 import logging
-from typing import List

 from .exceptions import GFFParserError
 from .features import GFFSeqFeature
@@ -47,6 +46,7 @@ def restructure_gene(gene: GFFSeqFeature) -> None:

     Raises:
         GFFParserError: If there are CDSs/exons remaining under the gene after applying the fixes.
+
     """
     # Skip if the children of the gene look ok
     counts = _get_feat_counts(gene)
@@ -70,7 +70,6 @@ def restructure_gene(gene: GFFSeqFeature) -> None:

 def add_transcript_to_naked_gene(gene: GFFSeqFeature) -> None:
     """Add an unspecific transcript to a gene without any sub-features."""
-
     if (len(gene.sub_features) > 0) or (gene.type != "gene"):
         return
@@ -84,7 +83,6 @@ def move_only_cdss_to_new_mrna(gene: GFFSeqFeature) -> None:
     """Add intermediate mRNAs to a gene with only CDS children.

     Do nothing if some sub-features are not CDS.
     """
-
     counts = _get_feat_counts(gene)
     if (len(counts) != 1) or not counts.get("CDS"):
         return
@@ -117,7 +115,6 @@ def move_only_exons_to_new_mrna(gene: GFFSeqFeature) -> None:
     """Add an mRNA for a gene that only has exons and move the exons under the mRNA.

     No change if the gene has other sub_features than exon.
     """
-
     counts = _get_feat_counts(gene)
     if (len(counts) != 1) or not counts.get("exon"):
         return
@@ -146,13 +143,14 @@ def move_cds_to_existing_mrna(gene: GFFSeqFeature) -> None:

     Raises:
         GFFParserError: If the feature structure is not recognized.
+
     """
     counts = _get_feat_counts(gene)
     if not counts.get("mRNA") or not counts.get("CDS"):
         return
     if counts["mRNA"] > 1:
         raise GFFParserError(
-            f"Can't fix gene {gene.id}: contains several mRNAs and CDSs, all children of the gene"
+            f"Can't fix gene {gene.id}: contains several mRNAs and CDSs, all children of the gene",
         )

     # First, count the types
@@ -194,11 +192,10 @@ def move_cds_to_existing_mrna(gene: GFFSeqFeature) -> None:
     logging.debug(f"Gene {gene.id}: moved {len(cdss)} CDSs to the mRNA")

-def _check_sub_exons(mrna: GFFSeqFeature, cdss: List[GFFSeqFeature], sub_exons: List[GFFSeqFeature]) -> None:
+def _check_sub_exons(mrna: GFFSeqFeature, cdss: list[GFFSeqFeature], sub_exons: list[GFFSeqFeature]) -> None:
     """Check that the exons of the mRNA and the CDSs match.

     If there are no exons, create them from the CDSs.
     """
-
     new_sub_exons = []
     if sub_exons:
         # Check that they match the CDS outside
@@ -211,7 +208,7 @@ def _check_sub_exons(mrna: GFFSeqFeature, cdss: list[GFFSeqFeature], sub_exons:
                 raise GFFParserError(f"Gene CDSs and exons under the mRNA {mrna.id} do not match")
         else:
             raise GFFParserError(
-                f"Gene CDSs and exons under the mRNA {mrna.id} do not match (different count)"
+                f"Gene CDSs and exons under the mRNA {mrna.id} do not match (different count)",
             )
     else:
         # No exons in the mRNA? Create them with the CDS coordinates
@@ -232,6 +229,7 @@ def remove_extra_exons(gene: GFFSeqFeature) -> None:

     Raises:
         GFFParserError: If not all exons of this gene start with "id-".
+
     """
     counts = _get_feat_counts(gene)
     if not counts.get("mRNA") and not counts.get("exon"):
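
The restructuring functions in restructure.py above, and GFFSimplifier below, all branch on a tally of each gene's child feature types. A toy sketch of that dispatch pattern, using a hypothetical Feature class rather than this repo's GFFSeqFeature:

    from collections import Counter

    class Feature:
        def __init__(self, type_: str, sub_features: list | None = None) -> None:
            self.type = type_
            self.sub_features = sub_features or []

    def child_counts(gene: Feature) -> Counter:
        # Plays the role of _get_feat_counts(): tally the sub-feature types.
        return Counter(feat.type for feat in gene.sub_features)

    gene = Feature("gene", [Feature("CDS"), Feature("CDS")])
    assert child_counts(gene) == Counter({"CDS": 2})
    # A CDS-only tally like this is the case move_only_cdss_to_new_mrna() handles:
    # it adds intermediate mRNAs between the gene and its CDSs.
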
diff --git a/src/python/ensembl/io/genomio/gff3/simplifier.py b/src/python/ensembl/io/genomio/gff3/simplifier.py
index 9f649220e..16a9b5176 100644
--- a/src/python/ensembl/io/genomio/gff3/simplifier.py
+++ b/src/python/ensembl/io/genomio/gff3/simplifier.py
@@ -27,7 +27,6 @@
 from os import PathLike
 from pathlib import Path
 import re
-from typing import List, Optional, Set

 from BCBio import GFF
 from Bio.SeqRecord import SeqRecord
@@ -42,14 +41,16 @@

 class Records(list):
+
     """List of GFF3 SeqRecords."""

-    def from_gff(self, in_gff_path: PathLike, excluded: Optional[List[str]] = None) -> None:
+    def from_gff(self, in_gff_path: PathLike, excluded: list[str] | None = None) -> None:
         """Loads records from a GFF3 file.

         Args:
             in_gff_path: Input GFF3 file path.
             excluded: Record IDs to not load from the GFF3 file.
+
         """
         if excluded is None:
             excluded = []
@@ -67,21 +68,24 @@ def to_gff(self, out_gff_path: PathLike) -> None:

         Args:
             out_gff_path: Path to GFF3 file where to write the records.
+
         """
         with Path(out_gff_path).open("w") as out_gff_fh:
             GFF.write(self, out_gff_fh)


 class GFFSimplifier:
+
     """Parse a GFF3 file and output a cleaned up GFF3 + annotation JSON file.

     Raises:
         GFFParserError: If an error cannot be automatically fixed.
+
     """

     def __init__(
         self,
-        genome_path: Optional[PathLike] = None,
+        genome_path: PathLike | None = None,
         skip_unrecognized: bool = False,
         allow_pseudogene_with_cds: bool = False,
     ):
@@ -94,6 +98,7 @@

         Raises:
             GFFParserError: If a biotype is unknown and `skip_unrecognized` is False.
+
         """
         self.skip_unrecognized = skip_unrecognized
         self.allow_pseudogene_with_cds = allow_pseudogene_with_cds
@@ -116,8 +121,8 @@ def __init__(
         # Other preparations
         self.stable_ids = StableIDAllocator()
         self.stable_ids.set_prefix(self.genome)
-        self.exclude_seq_regions: List[str] = []
-        self.fail_types: Set = set()
+        self.exclude_seq_regions: list[str] = []
+        self.fail_types: set = set()

         # Init the actual data we will store
         self.records = Records()
@@ -171,6 +176,7 @@ def simpler_gff3_feature(self, gene: GFFSeqFeature) -> GFFSeqFeature:
         Raises:
             IgnoredFeatureError: If the feature type is ignored.
             UnsupportedFeatureError: If the feature type is not supported.
+
         """
         # Special cases
         non_gene = self.normalize_non_gene(gene)
@@ -202,6 +208,7 @@ def create_gene_for_lone_transcript(self, feat: GFFSeqFeature) -> GFFSeqFeature:

         Args:
             feat: The transcript for which we want to create a gene.
+
         """
         transcript_types = self._biotypes["transcript"]["supported"]
         if feat.type not in transcript_types:
@@ -244,6 +251,7 @@ def create_gene_for_lone_cds(self, feat: GFFSeqFeature) -> GFFSeqFeature:

         Args:
             feat: The CDS for which we want to create a gene.
+
         """
         if feat.type != "CDS":
             return feat
@@ -275,7 +283,7 @@ def create_gene_for_lone_cds(self, feat: GFFSeqFeature) -> GFFSeqFeature:

         return new_gene

-    def normalize_non_gene(self, feat: GFFSeqFeature) -> Optional[GFFSeqFeature]:
+    def normalize_non_gene(self, feat: GFFSeqFeature) -> GFFSeqFeature | None:
         """Returns a normalised "non-gene" or `None` if not applicable.

         Only transposable elements supported at the moment.

         Raises:
             NotImplementedError: If the feature is an unsupported non-gene.
- """ + """ if feat.type not in self._biotypes["non_gene"]["supported"]: return None if feat.type in ("mobile_genetic_element", "transposable_element"): @@ -324,7 +332,6 @@ def _normalize_mobile_genetic_element(self, feat: GFFSeqFeature) -> GFFSeqFeatur def clean_gene(self, gene: GFFSeqFeature) -> GFFSeqFeature: """Return the same gene without qualifiers unrelated to the gene structure.""" - old_gene_qualifiers = gene.qualifiers gene.qualifiers = {"ID": gene.id, "source": old_gene_qualifiers["source"]} for transcript in gene.sub_features: @@ -356,7 +363,6 @@ def normalize_gene(self, gene: GFFSeqFeature) -> GFFSeqFeature: functional_annotation: List of feature annotations (appended by this method). """ - gene.id = self.stable_ids.normalize_gene_id(gene, refseq=self.refseq) restructure_gene(gene) self.normalize_transcripts(gene) @@ -376,7 +382,6 @@ def normalize_pseudogene(self, gene: GFFSeqFeature) -> None: def normalize_transcripts(self, gene: GFFSeqFeature) -> None: """Normalizes a transcript.""" - allowed_transcript_types = self._biotypes["transcript"]["supported"] ignored_transcript_types = self._biotypes["transcript"]["ignored"] @@ -388,7 +393,7 @@ def normalize_transcripts(self, gene: GFFSeqFeature) -> None: ): self.fail_types.add(f"transcript={transcript.type}") logging.warning( - f"Unrecognized transcript type: {transcript.type}" f" for {transcript.id} ({gene.id})" + f"Unrecognized transcript type: {transcript.type} for {transcript.id} ({gene.id})", ) transcripts_to_delete.append(count) continue @@ -416,6 +421,7 @@ def format_gene_segments(self, transcript: GFFSeqFeature) -> GFFSeqFeature: Raises: GeneSegmentError: Unable to get the segment type information from the feature. + """ if transcript.type not in ("C_gene_segment", "V_gene_segment"): return transcript @@ -424,8 +430,8 @@ def format_gene_segments(self, transcript: GFFSeqFeature) -> GFFSeqFeature: seg_type = self._get_segment_type(transcript) if not seg_type: # Get the information from a CDS instead - sub_feats: List[GFFSeqFeature] = transcript.sub_features - cdss: List[GFFSeqFeature] = list(filter(lambda x: x.type == "CDS", sub_feats)) + sub_feats: list[GFFSeqFeature] = transcript.sub_features + cdss: list[GFFSeqFeature] = list(filter(lambda x: x.type == "CDS", sub_feats)) if cdss: seg_type = self._get_segment_type(cdss[0]) if not seg_type: @@ -440,7 +446,6 @@ def _get_segment_type(self, feature: GFFSeqFeature) -> str: Returns an empty string if no segment type info was found. 
""" - product = feature.qualifiers.get("standard_name", [""])[0] if not product: product = feature.qualifiers.get("product", [""])[0] @@ -454,7 +459,7 @@ def _get_segment_type(self, feature: GFFSeqFeature) -> str: return "" def _normalize_transcript_subfeatures( - self, gene: GFFSeqFeature, transcript: GFFSeqFeature + self, gene: GFFSeqFeature, transcript: GFFSeqFeature, ) -> GFFSeqFeature: """Returns a transcript with normalized sub-features.""" exons_to_delete = [] @@ -481,7 +486,7 @@ def _normalize_transcript_subfeatures( self.fail_types.add(f"sub_transcript={feat.type}") logging.warning( f"Unrecognized exon type for {feat.type}: {feat.id}" - f" (for transcript {transcript.id} of type {transcript.type})" + f" (for transcript {transcript.id} of type {transcript.type})", ) exons_to_delete.append(tcount) continue @@ -491,7 +496,7 @@ def _normalize_transcript_subfeatures( transcript.sub_features.pop(elt) return transcript - def normalize_mirna(self, gene: GFFSeqFeature) -> List[GFFSeqFeature]: + def normalize_mirna(self, gene: GFFSeqFeature) -> list[GFFSeqFeature]: """Returns gene representations from a miRNA gene that can be loaded in an Ensembl database. Change the representation from the form `gene[ primary_transcript[ exon, miRNA[ exon ] ] ]` @@ -500,6 +505,7 @@ def normalize_mirna(self, gene: GFFSeqFeature) -> List[GFFSeqFeature]: Raises: GFFParserError: If gene has more than 1 transcript, the transcript was not formatted correctly or there are unknown sub-features. + """ base_id = gene.id transcripts = gene.sub_features diff --git a/src/python/ensembl/io/genomio/manifest/check_integrity.py b/src/python/ensembl/io/genomio/manifest/check_integrity.py index b9fdde66c..008973e32 100644 --- a/src/python/ensembl/io/genomio/manifest/check_integrity.py +++ b/src/python/ensembl/io/genomio/manifest/check_integrity.py @@ -29,6 +29,7 @@ class IntegrityTool: + """Check the integrity of sequence and annotation files in the genome""" def __init__( @@ -57,7 +58,6 @@ def check_integrity(self) -> None: and lengths are consistent with the information in gff. Compare sequence length from fasta_dna file to seq_region.json metadata. 
""" - # Load the manifest integrity counts manifest = self.manifest manifest.prepare_integrity_data() @@ -96,7 +96,7 @@ def check_integrity(self) -> None: # We do not compare the peptide lengths because of sequence edits if pep: tr_errors = self.check_lengths( - pep, gff_translations, "Fasta translations vs gff", special_diff=True + pep, gff_translations, "Fasta translations vs gff", special_diff=True, ) if len(tr_errors) > 0: # The pseudo CDSs are included in this check @@ -118,7 +118,7 @@ def check_integrity(self) -> None: if ann_genes: self.add_errors(self.check_ids(ann_genes, gff_genes, "Gene ids metadata vs gff")) tr_id_errors = self.check_ids( - ann_translations, gff_translations, "Translation ids metadata vs gff" + ann_translations, gff_translations, "Translation ids metadata vs gff", ) if tr_id_errors: tr_id_errors_all = self.check_ids( @@ -134,11 +134,11 @@ def check_integrity(self) -> None: ann_transposable_elements, gff_transposable_elements, "TE ids metadata vs gff", - ) + ), ) self.check_seq_region_lengths( - seq_lengths, gff_seq_regions, "seq_regions JSON vs GFF3 lengths", seq_circular + seq_lengths, gff_seq_regions, "seq_regions JSON vs GFF3 lengths", seq_circular, ) self.check_seq_region_lengths(seq_lengths, dna, "seq_regions JSON vs DNA lengths") @@ -175,8 +175,8 @@ def check_ids(self, list1: dict[str, Any], list2: dict[str, Any], name: str) -> Return: List of message errors of sequence IDs found only in one of the lists provided. - """ + """ only1 = [] only2 = [] common = [] @@ -228,8 +228,8 @@ def check_lengths( Returns: Error if there is a difference in length or ids between the lists. - """ + """ # check list differences, checks if abs(values diff) < allowed_len_diff set1 = frozenset(list1) @@ -263,17 +263,17 @@ def check_lengths( _dlist.append(f"{e}: {list1[e]}, {list2[e]}") if diff_len_special_list: errors.append( - ( + f"{len(diff_len_special_list)} common elements with one BP/AA length diff for {name}" - f"(e.g. {diff_len_special_list[0]})" - ) + f"(e.g. {diff_len_special_list[0]})", + ) if diff_len_list: errors.append( - ( + f"{len(diff_len_list)} common elements with length diff for {name}" - f"(e.g. {diff_len_list[0]})" - ) + f"(e.g. {diff_len_list[0]})", + ) if common_len > 0: logging.warning(f"{common_len} common elements between lists for {name}") @@ -303,6 +303,7 @@ def check_seq_region_lengths( Returns: Error if there are common sequences with difference in ids and if the sequences are not consistent in the files. + """ if not seqrs or not feats: return @@ -328,7 +329,7 @@ def check_seq_region_lengths( self.add_errors(f"{len(only_feat)} only in second list in {name} (first: {only_feat[0]})") def _compare_seqs( - self, seqrs: dict[str, Any], feats: dict[str, Any], circular: dict[str, Any] | None = None + self, seqrs: dict[str, Any], feats: dict[str, Any], circular: dict[str, Any] | None = None, ) -> dict[str, list[str]]: """Give the intersection and other comparison between two groups of sequences. @@ -384,10 +385,10 @@ def main() -> None: parser = ArgumentParser(description=__doc__) parser.add_argument_src_path("--manifest_file", required=True, help="Manifest file for the data to check") parser.add_argument( - "--ignore_final_stops", action="store_true", help="Ignore final stop when calculating peptide length" + "--ignore_final_stops", action="store_true", help="Ignore final stop when calculating peptide length", ) parser.add_argument( - "--no_fail", action="store_true", help="In case of errors, don't fail but print errors to stdout." 
+ "--no_fail", action="store_true", help="In case of errors, don't fail but print errors to stdout.", ) parser.add_log_arguments(add_log_file=True) args = parser.parse_args() diff --git a/src/python/ensembl/io/genomio/manifest/compute_stats.py b/src/python/ensembl/io/genomio/manifest/compute_stats.py index f65300e9d..ea98bbbb0 100644 --- a/src/python/ensembl/io/genomio/manifest/compute_stats.py +++ b/src/python/ensembl/io/genomio/manifest/compute_stats.py @@ -25,7 +25,6 @@ from shutil import which from statistics import mean import subprocess -from typing import Dict, List, Optional, Set, Union from BCBio import GFF @@ -36,13 +35,14 @@ class BiotypeCounter: + """A counter for a given biotype, given a list of features.""" - def __init__(self, count: int = 0, ids: Optional[Set[str]] = None, example: Optional[str] = None) -> None: + def __init__(self, count: int = 0, ids: set[str] | None = None, example: str | None = None) -> None: self.count: int = count if ids is None: ids = set() - self.ids: Set[str] = ids + self.ids: set[str] = ids if example is None: example = "" self.example: str = example @@ -52,6 +52,7 @@ def add_id(self, feature_id: str) -> None: Args: feature_id (str): Feature id to add. + """ self.count += 1 self.ids.add(feature_id) @@ -61,21 +62,24 @@ def unique_count(self) -> int: Returns: int: number of features in the counter. + """ return len(self.ids) class StatsError(Exception): + """Raised when stats could not be computed.""" class manifest_stats: + """Representation of the statistics of the set of files listed in the manifest file provided.""" - def __init__(self, manifest_dir: str, accession: Optional[str], datasets_bin: Optional[str]): + def __init__(self, manifest_dir: str, accession: str | None, datasets_bin: str | None): self.manifest = f"{manifest_dir}/manifest.json" - self.accession: Optional[str] = accession - self.errors: List[str] = [] + self.accession: str | None = accession + self.errors: list[str] = [] self.errors_file = Path(manifest_dir) / "stats_diff.log" if datasets_bin is None: datasets_bin = "datasets" @@ -88,6 +92,7 @@ def run(self, stats_path: StrPath) -> None: Raises: StatsError: Could not compute some stats. + """ manifest = self.get_manifest() @@ -113,11 +118,12 @@ def run(self, stats_path: StrPath) -> None: for error_line in self.errors: errors_fh.write(error_line) - def get_manifest(self) -> Dict: + def get_manifest(self) -> dict: """Get the files metadata from the manifest json file. Returns: Dict: A representation of the manifest json data. + """ with open(self.manifest) as f_json: manifest = json.load(f_json) @@ -138,7 +144,7 @@ def get_manifest(self) -> Dict: return manifest - def get_seq_region_stats(self, seq_region_path: Path) -> List[str]: + def get_seq_region_stats(self, seq_region_path: Path) -> list[str]: """Compute stats from the seq_region json file. Args: @@ -146,12 +152,13 @@ def get_seq_region_stats(self, seq_region_path: Path) -> List[str]: Returns: List[str]: Stats from the seq_regions. 
+ """ with seq_region_path.open("r") as json_file: seq_regions = json.load(json_file) # Get basic data - coord_systems: Dict[str, List[int]] = {} + coord_systems: dict[str, list[int]] = {} circular = 0 locations = [] codon_tables = [] @@ -176,14 +183,14 @@ def get_seq_region_stats(self, seq_region_path: Path) -> List[str]: locations.append(f"{seqr_name} = {seqr['location']}") # Stats - stats: List[str] = [] + stats: list[str] = [] stats.append(seq_region_path.name) stats += self.coord_systems_stats(coord_systems) stats += self.seq_region_special_stats(circular, locations, codon_tables) stats.append("\n") return stats - def coord_systems_stats(self, coord_systems: Dict[str, List[int]]) -> List[str]: + def coord_systems_stats(self, coord_systems: dict[str, list[int]]) -> list[str]: """For each coord_system compute various stats: - number of sequences - sequence length sum, minimum, maximum, mean @@ -193,13 +200,14 @@ def coord_systems_stats(self, coord_systems: Dict[str, List[int]]) -> List[str]: Returns: A list with the computed statistics in a printable format. + """ - stats: List[str] = [] + stats: list[str] = [] stats.append(f"Total coord_systems {len(coord_systems)}") for coord_name, lengths in coord_systems.items(): stats.append(f"\nCoord_system: {coord_name}") - stat_counts: Dict[str, Union[int, float]] = { + stat_counts: dict[str, int | float] = { "Number of sequences": len(lengths), "Sequence length sum": sum(lengths), "Sequence length minimum": min(lengths), @@ -217,9 +225,9 @@ def coord_systems_stats(self, coord_systems: Dict[str, List[int]]) -> List[str]: def seq_region_special_stats( self, circular: int = 0, - locations: Optional[List[str]] = None, - codon_tables: Optional[List[str]] = None, - ) -> List[str]: + locations: list[str] | None = None, + codon_tables: list[str] | None = None, + ) -> list[str]: """Prepare stats in case there are circular regions, specific locations and codon_tables. stats.append(f"{count: 9f}\t{name}") @@ -230,8 +238,9 @@ def seq_region_special_stats( Returns: A list with the computed statistics in a printable format. + """ - stats: List[str] = [] + stats: list[str] = [] if circular or locations or codon_tables: stats.append("\nSpecial") if circular: @@ -246,7 +255,7 @@ def seq_region_special_stats( stats.append(f"\t\t\t{table}") return stats - def get_gff3_stats(self, gff3_path: Path) -> List[str]: + def get_gff3_stats(self, gff3_path: Path) -> list[str]: """Extract the gene models from the GFF3 file and compute stats. Args: @@ -254,15 +263,15 @@ def get_gff3_stats(self, gff3_path: Path) -> List[str]: Returns: List: Stats from the gene model. - """ + """ biotypes = self.count_biotypes(gff3_path) # Compile final stats stats = self.biotypes_stats(biotypes) stats += self.check_ncbi_stats(biotypes) return stats - def count_biotypes(self, gff3_path: Path) -> Dict[str, BiotypeCounter]: + def count_biotypes(self, gff3_path: Path) -> dict[str, BiotypeCounter]: """Count the biotypes in a GFF3 file. Args: @@ -270,9 +279,9 @@ def count_biotypes(self, gff3_path: Path) -> Dict[str, BiotypeCounter]: Returns: Dictionary of biotype counters. 
- """ - biotypes: Dict[str, BiotypeCounter] = {} + """ + biotypes: dict[str, BiotypeCounter] = {} with open_gz_file(gff3_path) as gff3_handle: for rec in GFF.parse(gff3_handle): @@ -290,7 +299,7 @@ def count_biotypes(self, gff3_path: Path) -> Dict[str, BiotypeCounter]: if feat3.type == "exon": continue manifest_stats.increment_biotype( - biotypes, feat3.id, f"{feat1.type}-{feat2.type}-{feat3.type}" + biotypes, feat3.id, f"{feat1.type}-{feat2.type}-{feat3.type}", ) # Main categories counts @@ -298,23 +307,22 @@ def count_biotypes(self, gff3_path: Path) -> Dict[str, BiotypeCounter]: manifest_stats.increment_biotype(biotypes, feat1.id, "pseudogene") elif is_protein: manifest_stats.increment_biotype(biotypes, feat1.id, f"PROT_{feat1.type}") + # Special case, undefined gene-transcript + elif ( + feat1.type == "gene" + and feat1.sub_features + and feat1.sub_features[0].type == "transcript" + ): + manifest_stats.increment_biotype(biotypes, feat1.id, "OTHER") else: - # Special case, undefined gene-transcript - if ( - feat1.type == "gene" - and feat1.sub_features - and feat1.sub_features[0].type == "transcript" - ): - manifest_stats.increment_biotype(biotypes, feat1.id, "OTHER") - else: - manifest_stats.increment_biotype(biotypes, feat1.id, f"NONPROT_{feat1.type}") + manifest_stats.increment_biotype(biotypes, feat1.id, f"NONPROT_{feat1.type}") # Total if feat1.type in ("gene", "pseudogene"): manifest_stats.increment_biotype(biotypes, feat1.id, "ALL_GENES") return biotypes - def biotypes_stats(self, biotypes: Dict[str, BiotypeCounter]) -> List[str]: + def biotypes_stats(self, biotypes: dict[str, BiotypeCounter]) -> list[str]: """Prepare biotype stats in order of their name. Args: @@ -322,6 +330,7 @@ def biotypes_stats(self, biotypes: Dict[str, BiotypeCounter]) -> List[str]: Returns: A list with the computed statistics in a printable format. 
+ """ sorted_biotypes = {} for name in sorted(biotypes.keys()): @@ -334,9 +343,9 @@ def biotypes_stats(self, biotypes: Dict[str, BiotypeCounter]) -> List[str]: ] return stats - def check_ncbi_stats(self, biotypes: Dict[str, BiotypeCounter]) -> List[str]: + def check_ncbi_stats(self, biotypes: dict[str, BiotypeCounter]) -> list[str]: """Use the dataset tool from NCBI to get stats and compare with what we have""" - stats: List[str] = [] + stats: list[str] = [] if not self.check_ncbi: return stats @@ -365,9 +374,9 @@ def check_ncbi_stats(self, biotypes: Dict[str, BiotypeCounter]) -> List[str]: stats = self.compare_ncbi_counts(biotypes, counts) return stats - def compare_ncbi_counts(self, biotypes: Dict[str, BiotypeCounter], ncbi: Dict) -> List[str]: + def compare_ncbi_counts(self, biotypes: dict[str, BiotypeCounter], ncbi: dict) -> list[str]: """Compare specific gene stats from NCBI""" - stats: List[str] = [] + stats: list[str] = [] maps = [ ["total", "ALL_GENES"], @@ -380,7 +389,7 @@ def compare_ncbi_counts(self, biotypes: Dict[str, BiotypeCounter], ncbi: Dict) - for count_map in maps: ncbi_name, prep_name = count_map ncbi_count = ncbi.get(ncbi_name, 0) - prepped: Optional[BiotypeCounter] = biotypes.get(prep_name) + prepped: BiotypeCounter | None = biotypes.get(prep_name) prep_count = 0 if prepped is not None: prep_count = prepped.count @@ -394,13 +403,14 @@ def compare_ncbi_counts(self, biotypes: Dict[str, BiotypeCounter], ncbi: Dict) - return stats @staticmethod - def increment_biotype(biotypes: Dict[str, BiotypeCounter], feature_id: str, feature_biotype: str) -> None: + def increment_biotype(biotypes: dict[str, BiotypeCounter], feature_id: str, feature_biotype: str) -> None: """Add the feature to their respective biotype counter. Args: biotypes (Dict[str, BiotypeCounter]): All current biotypes, with their counter. feature_id (str): Feature id to be counted. feature_biotype (str): The biotype of the feature. + """ if feature_biotype not in biotypes: biotypes[feature_biotype] = BiotypeCounter(example=feature_id) @@ -410,10 +420,10 @@ def increment_biotype(biotypes: Dict[str, BiotypeCounter], feature_id: str, feat def main() -> None: """Main entrypoint.""" parser = ArgumentParser( - description="Compute stats from the current genome files associated with the manifest." + description="Compute stats from the current genome files associated with the manifest.", ) parser.add_argument_src_path( - "--manifest_dir", required=True, help="Manifest directory where 'manifest.json' file is located" + "--manifest_dir", required=True, help="Manifest directory where 'manifest.json' file is located", ) parser.add_argument("--accession", help="Sequence accession ID to compare stats with NCBI") parser.add_argument("--datasets_bin", help="Datasets bin status") diff --git a/src/python/ensembl/io/genomio/manifest/generate.py b/src/python/ensembl/io/genomio/manifest/generate.py index 91cfb154d..2e916bbc7 100644 --- a/src/python/ensembl/io/genomio/manifest/generate.py +++ b/src/python/ensembl/io/genomio/manifest/generate.py @@ -22,10 +22,10 @@ def main() -> None: """Main entrypoint.""" parser = ArgumentParser( - description="Compare the genomic data between the files present in a manifest file." 
+ description="Compare the genomic data between the files present in a manifest file.", ) parser.add_argument_dst_path( - "--manifest_dir", required=True, help="Folder where to create a manifest file" + "--manifest_dir", required=True, help="Folder where to create a manifest file", ) parser.add_log_arguments() args = parser.parse_args() diff --git a/src/python/ensembl/io/genomio/manifest/manifest.py b/src/python/ensembl/io/genomio/manifest/manifest.py index 4c3fe25cd..61a98299d 100644 --- a/src/python/ensembl/io/genomio/manifest/manifest.py +++ b/src/python/ensembl/io/genomio/manifest/manifest.py @@ -27,10 +27,12 @@ class ManifestError(Exception): + """Could not load a manifest file.""" class Manifest: + """Records of a manifest file and its files and md5 checksums.""" _same_names = { @@ -58,6 +60,7 @@ def __init__(self, manifest_dir: Path) -> None: Args: manifest_dir: directory where the files are contained. + """ self.root_dir = manifest_dir self.file_path = manifest_dir / "manifest.json" @@ -110,7 +113,7 @@ def get_files_checksums(self) -> ManifestDict: return self.files def _prepare_object_name( - self, subfile: Path, name: str, manifest_file_dict: dict[str, dict[str, str]] + self, subfile: Path, name: str, manifest_file_dict: dict[str, dict[str, str]], ) -> str: # Prepare object name try: @@ -126,7 +129,7 @@ def _prepare_object_name( # Add number if duplicate name obj_name_base = obj_name count = 1 - while obj_name in manifest_file_dict.keys(): + while obj_name in manifest_file_dict: obj_name = f"{obj_name_base}.{count}" count += 1 if count >= 10: @@ -171,6 +174,7 @@ def _check_md5sum(self, file_path: Path, md5sum: str) -> None: Args: file_path: Path to a genome file. md5sum: MD5 hash for the files. + """ file_md5sum = self._get_md5sum(file_path) if file_md5sum != md5sum: diff --git a/src/python/ensembl/io/genomio/manifest/manifest_stats.py b/src/python/ensembl/io/genomio/manifest/manifest_stats.py index 58ceb123d..a3404b571 100644 --- a/src/python/ensembl/io/genomio/manifest/manifest_stats.py +++ b/src/python/ensembl/io/genomio/manifest/manifest_stats.py @@ -38,10 +38,12 @@ class InvalidIntegrityError(Exception): + """When a file integrity check fails""" class ManifestStats: + """Representation of the main stats of the files in a manifest for comparison. The stats in question are: @@ -84,6 +86,7 @@ def _get_manifest(self, manifest_path: PathLike) -> dict[str, Any]: Returns: Dict: Content of the manifest file. + """ manifest = Manifest(Path(manifest_path).parent) manifest_files = manifest.load() @@ -103,8 +106,7 @@ def add_error(self, error: str) -> None: def load_seq_regions(self) -> None: """Retrieve seq_regions lengths and circular information from the seq_region JSON file.""" - - if not "seq_region" in self.manifest_files: + if "seq_region" not in self.manifest_files: return logging.info("Manifest contains seq_region JSON") seq_regions = get_json(Path(self.manifest_files["seq_region"])) @@ -129,7 +131,7 @@ def load_peptides_fasta_lengths(self) -> None: if "fasta_pep" not in self.manifest_files: return self.lengths["peptide_sequences"] = self._get_fasta_lengths( - self.manifest_files["fasta_pep"], ignore_final_stops=self.ignore_final_stops + self.manifest_files["fasta_pep"], ignore_final_stops=self.ignore_final_stops, ) def load_dna_fasta_lengths(self) -> None: @@ -148,7 +150,6 @@ def _get_fasta_lengths(self, fasta_path: StrPath, ignore_final_stops: bool = Fal ignore_final_stops: Do not include final stop in the total length. 
""" - data = {} non_unique = {} non_unique_count = 0 @@ -169,9 +170,7 @@ def _get_fasta_lengths(self, fasta_path: StrPath, ignore_final_stops: bool = Fal # Store sequence id and length data[rec.id] = len(rec.seq) stops = rec.seq.count("*") - if stops >= 1 and not rec.seq.endswith("*"): - contains_stop_codon += 1 - elif rec.seq.endswith("*") and not ignore_final_stops: + if stops >= 1 and not rec.seq.endswith("*") or rec.seq.endswith("*") and not ignore_final_stops: contains_stop_codon += 1 if empty_id_count > 0: @@ -257,7 +256,7 @@ def load_gff3(self) -> None: self.lengths = {**self.lengths, **stats} def _retrieve_gff_gene_lengths( - self, feat: GFFSeqFeature, genes: StatsLengths, peps: StatsLengths, all_peps: StatsLengths + self, feat: GFFSeqFeature, genes: StatsLengths, peps: StatsLengths, all_peps: StatsLengths, ) -> None: """Record genes and peptides lengths from a feature. @@ -311,6 +310,7 @@ def load_agp_seq_regions(self, agp_dict: dict | None) -> None: Note: AGP file is only used in the older builds, not used for current processing. + """ if not agp_dict: return @@ -318,7 +318,7 @@ def load_agp_seq_regions(self, agp_dict: dict | None) -> None: seqr: StatsLengths = {} for agp_path in agp_dict.values(): - with open(agp_path, "r") as agph: + with open(agp_path) as agph: for line in agph: ( asm_id, @@ -370,6 +370,7 @@ def has_lengths(self, name: str) -> bool: Raises: KeyError: If the name is not supported. + """ try: return bool(self.lengths[name]) @@ -384,6 +385,7 @@ def get_lengths(self, name: str) -> dict[str, Any]: Raises: KeyError: If the name is not supported. + """ try: return self.lengths[name] @@ -398,6 +400,7 @@ def get_circular(self, name: str) -> dict[str, Any]: Raises: KeyError: If the name is not supported. + """ try: return self.circular[name] diff --git a/src/python/ensembl/io/genomio/schemas/json/factory.py b/src/python/ensembl/io/genomio/schemas/json/factory.py index 96132416a..679de00d0 100644 --- a/src/python/ensembl/io/genomio/schemas/json/factory.py +++ b/src/python/ensembl/io/genomio/schemas/json/factory.py @@ -20,13 +20,12 @@ from os import PathLike from pathlib import Path import shutil -from typing import List from ensembl.utils.argparse import ArgumentParser from ensembl.utils.logging import init_logging_with_args -def schema_factory(manifest_dir: PathLike, metadata_types: List[str], output_dir: PathLike) -> None: +def schema_factory(manifest_dir: PathLike, metadata_types: list[str], output_dir: PathLike) -> None: """Generates one JSON file per metadata type inside `manifest`, including "manifest.json" itself. Each JSON file will have the file name of the metadata type, e.g. "seq_region.json". @@ -65,16 +64,16 @@ def schema_factory(manifest_dir: PathLike, metadata_types: List[str], output_dir def main() -> None: """Main script entry-point.""" parser = ArgumentParser( - description="Generates one JSON file per metadata type in the provided manifest, including itself." 
+ description="Generates one JSON file per metadata type in the provided manifest, including itself.", ) parser.add_argument_src_path( - "--manifest_dir", required=True, help="Folder containing the 'manifest.json' file to check" + "--manifest_dir", required=True, help="Folder containing the 'manifest.json' file to check", ) parser.add_argument( - "--metadata_types", required=True, nargs="+", metavar="TYPE", help="Metadata types to extract" + "--metadata_types", required=True, nargs="+", metavar="TYPE", help="Metadata types to extract", ) parser.add_argument_dst_path( - "--output_dir", default=Path.cwd(), help="Folder to store the produced files" + "--output_dir", default=Path.cwd(), help="Folder to store the produced files", ) parser.add_log_arguments() args = parser.parse_args() diff --git a/src/python/ensembl/io/genomio/schemas/json/validate.py b/src/python/ensembl/io/genomio/schemas/json/validate.py index abc9e2add..7cb3bf88a 100644 --- a/src/python/ensembl/io/genomio/schemas/json/validate.py +++ b/src/python/ensembl/io/genomio/schemas/json/validate.py @@ -15,7 +15,6 @@ """Validates a JSON file with the provided JSON schema. Examples: - >>> from ensembl.io.genomio.schemas import json >>> json.schema_validator(json_file="functional_annotation.json", json_schema="functional_annotation") >>> json.schema_validator(json_file="functional_annotation.json", json_schema="genome") @@ -36,7 +35,6 @@ import json from os import PathLike from pathlib import Path -from typing import Union import jsonschema @@ -50,7 +48,7 @@ _JSON_SCHEMAS[file_path.stem] = file_path -def schema_validator(json_file: PathLike, json_schema: Union[str, PathLike]) -> None: +def schema_validator(json_file: PathLike, json_schema: str | PathLike) -> None: """Validates a JSON file with the provided JSON schema. Args: @@ -75,7 +73,7 @@ def main() -> None: parser = ArgumentParser(description="Validates a JSON file against a JSON schema.") parser.add_argument_src_path("--json_file", required=True, help="JSON file to check") parser.add_argument( - "--json_schema", required=True, choices=_JSON_SCHEMAS.keys(), help="JSON schema to validate against" + "--json_schema", required=True, choices=_JSON_SCHEMAS.keys(), help="JSON schema to validate against", ) args = parser.parse_args() diff --git a/src/python/ensembl/io/genomio/seq_region/collection.py b/src/python/ensembl/io/genomio/seq_region/collection.py index 43b46d8b3..0741fd215 100644 --- a/src/python/ensembl/io/genomio/seq_region/collection.py +++ b/src/python/ensembl/io/genomio/seq_region/collection.py @@ -36,6 +36,7 @@ class SeqCollection: + """Represent a collection of seq_regions metadata.""" mock: bool @@ -213,7 +214,7 @@ def add_mitochondrial_codon_table(self, taxon_id: int) -> None: logging.info("Skip mitochondrial codon table: no taxon_id to use") return - url = f"https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{str(taxon_id)}" + url = f"https://www.ebi.ac.uk/ena/taxonomy/rest/tax-id/{taxon_id!s}" response = requests.get(url, headers={"Content-Type": "application/json"}, timeout=60) response.raise_for_status() # In case we have been redirected, check for HTML opening tag diff --git a/src/python/ensembl/io/genomio/seq_region/dump.py b/src/python/ensembl/io/genomio/seq_region/dump.py index 9b2e80f52..eb148f47e 100644 --- a/src/python/ensembl/io/genomio/seq_region/dump.py +++ b/src/python/ensembl/io/genomio/seq_region/dump.py @@ -48,6 +48,7 @@ def fetch_coord_systems(session: Session) -> Iterator[CoordSystem]: Yields: All default coord_systems in the core database. 
+ """ coord_system_select = select(CoordSystem).filter(CoordSystem.attrib.like(r"%default_version%")) for row in session.execute(coord_system_select).unique().all(): @@ -64,6 +65,7 @@ def fetch_seq_regions(session: Session, coord_system: CoordSystem) -> Iterator[S Yields: All seq_regions for the coord_system. + """ seq_region_select = ( select(SeqRegion) @@ -90,6 +92,7 @@ def add_attribs(seq_region: dict, seq_region_attrib: dict) -> None: Args: seq_region: A seq_region dict to modify. seq_region_attrib: The attribs for this seq_region. + """ bool_attribs = { "circular_seq": "circular", @@ -131,6 +134,7 @@ def get_synonyms(seq_region: SeqRegion, external_db_map: dict[str, str]) -> list Returns: List of all synonyms as a dict with 'name' and 'source' keys. + """ synonyms = seq_region.seq_region_synonym syns = [] @@ -157,6 +161,7 @@ def get_karyotype(seq_region: SeqRegion) -> list[dict[str, str]]: Returns: List of all karyotype bands as a dict with values 'start', 'end', 'name' 'stain', 'structure'. + """ bands = seq_region.karyotype kars = [] @@ -184,6 +189,7 @@ def get_added_sequence(seq_region: SeqRegion) -> dict[str, str | dict[str, str]] Returns: Accession as well as assembly and annotation provider information of the added sequence. + """ attribs = get_attribs_dict(seq_region) accession = attribs.get("added_seq_accession") @@ -260,11 +266,11 @@ def get_seq_regions(session: Session, external_db_map: dict) -> list[SeqRegion]: def main() -> None: """Main script entry-point.""" parser = ArgumentParser( - description="Fetch all the sequence regions from a core database and print them in JSON format." + description="Fetch all the sequence regions from a core database and print them in JSON format.", ) parser.add_server_arguments(include_database=True) parser.add_argument_src_path( - "--external_db_map", default=DEFAULT_EXTERNAL_DB_MAP.resolve(), help="File with external_db mapping" + "--external_db_map", default=DEFAULT_EXTERNAL_DB_MAP.resolve(), help="File with external_db mapping", ) parser.add_log_arguments(add_log_file=True) args = parser.parse_args() diff --git a/src/python/ensembl/io/genomio/seq_region/exceptions.py b/src/python/ensembl/io/genomio/seq_region/exceptions.py index 7453dd1b1..8ddcfe1c3 100644 --- a/src/python/ensembl/io/genomio/seq_region/exceptions.py +++ b/src/python/ensembl/io/genomio/seq_region/exceptions.py @@ -20,4 +20,5 @@ class UnknownMetadata(Exception): + """If a metadata is not supported or recognized.""" diff --git a/src/python/ensembl/io/genomio/seq_region/gbff.py b/src/python/ensembl/io/genomio/seq_region/gbff.py index 198a64b4c..f979671cc 100644 --- a/src/python/ensembl/io/genomio/seq_region/gbff.py +++ b/src/python/ensembl/io/genomio/seq_region/gbff.py @@ -30,6 +30,7 @@ @dataclass class GBFFRecord: + """Wrapper around a `SeqRecord` object to extract specific data.""" record: SeqRecord diff --git a/src/python/ensembl/io/genomio/seq_region/mappings.py b/src/python/ensembl/io/genomio/seq_region/mappings.py index 35f3ac841..b6601ece2 100644 --- a/src/python/ensembl/io/genomio/seq_region/mappings.py +++ b/src/python/ensembl/io/genomio/seq_region/mappings.py @@ -30,7 +30,7 @@ "GenBank-Accn": "GenBank", "RefSeq-Accn": "RefSeq", "Sequence-Name": "INSDC_submitted_name", - } + }, ) MOLECULE_LOCATION: Mapping[str, str] = MappingProxyType( { @@ -40,6 +40,6 @@ "linkage group": "linkage_group", "mitochondrion": "mitochondrial_chromosome", "plasmid": "plasmid", - } + }, ) LOCATION_CODON: Mapping[str, int] = MappingProxyType({"apicoplast_chromosome": 4}) diff --git 
a/src/python/ensembl/io/genomio/seq_region/prepare.py b/src/python/ensembl/io/genomio/seq_region/prepare.py index d5ae84147..803f1324e 100644 --- a/src/python/ensembl/io/genomio/seq_region/prepare.py +++ b/src/python/ensembl/io/genomio/seq_region/prepare.py @@ -73,14 +73,14 @@ def main() -> None: parser = ArgumentParser(description="Construct a sequence region metadata file from INSDC files.") parser.add_argument_src_path("--genome_file", required=True, help="Genome metadata JSON file") parser.add_argument_src_path( - "--report_file", required=True, help="INSDC/RefSeq sequences report file to parse" + "--report_file", required=True, help="INSDC/RefSeq sequences report file to parse", ) parser.add_argument_src_path("--gbff_file", help="INSDC/RefSeq GBFF file to parse") parser.add_argument_dst_path( - "--dst_file", default="seq_region.json", help="Output JSON file for the processed sequence regions" + "--dst_file", default="seq_region.json", help="Output JSON file for the processed sequence regions", ) parser.add_argument( - "--to_exclude", nargs="*", metavar="SEQ_REGION_NAME", help="Sequence region names to exclude" + "--to_exclude", nargs="*", metavar="SEQ_REGION_NAME", help="Sequence region names to exclude", ) parser.add_argument("--mock_run", action="store_true", help="Do not call external APIs") parser.add_log_arguments() diff --git a/src/python/ensembl/io/genomio/seq_region/report.py b/src/python/ensembl/io/genomio/seq_region/report.py index 587247fa8..89d1d63a4 100644 --- a/src/python/ensembl/io/genomio/seq_region/report.py +++ b/src/python/ensembl/io/genomio/seq_region/report.py @@ -22,12 +22,12 @@ from os import PathLike from pathlib import Path import re -from typing import Tuple from ensembl.utils.archive import open_gz_file class ReportRecord: + """Represent an assembly report file. Exposes 2 things: - Metadata as a dict from the comments. - A DictReader that yields all the seq_region lines of the report as dicts. @@ -39,7 +39,7 @@ def __init__(self, report_path: Path) -> None: self.reader = csv.DictReader(report_csv.splitlines(), delimiter="\t", quoting=csv.QUOTE_NONE) @staticmethod - def report_to_csv(report_path: PathLike) -> Tuple[str, dict]: + def report_to_csv(report_path: PathLike) -> tuple[str, dict]: """Returns an assembly report as a CSV string. 
Args: diff --git a/src/python/tests/annotation/test_update_description.py b/src/python/tests/annotation/test_update_description.py index 4c4c0daca..c16d5c3de 100644 --- a/src/python/tests/annotation/test_update_description.py +++ b/src/python/tests/annotation/test_update_description.py @@ -41,6 +41,7 @@ def add_gene(dialect: str, session: sqlalchemy.orm.Session, gene_data: dict[str, "tr_name" -> transcript.stable_id "tr_desc" -> transcript.description "gene_xref" -> xref display_name attached to the gene + """ gene_name = gene_data.get("gene_name", "gene1") gene_description = gene_data.get("gene_desc", "") diff --git a/src/python/tests/assembly/test_download.py b/src/python/tests/assembly/test_download.py index abd534bd6..f6fa0b5ad 100644 --- a/src/python/tests/assembly/test_download.py +++ b/src/python/tests/assembly/test_download.py @@ -19,7 +19,7 @@ import filecmp import logging from pathlib import Path -from typing import Callable, ContextManager, Optional +from typing import Callable, ContextManager from unittest.mock import Mock, patch, MagicMock from ftplib import error_reply as ftp_error_reply @@ -39,10 +39,10 @@ "ftp_url, accession, expectation", [ pytest.param( - "ftp.ncbi.nlm.nih.gov", "GCA_017607445.1", does_not_raise(), id="Successful ftp connection" + "ftp.ncbi.nlm.nih.gov", "GCA_017607445.1", does_not_raise(), id="Successful ftp connection", ), pytest.param( - "", "GCA_017607445.1", pytest.raises(FTPConnectionError), id="Failed connection case bad url" + "", "GCA_017607445.1", pytest.raises(FTPConnectionError), id="Failed connection case bad url", ), pytest.param( "ftp.ncbi.nlm.nih.gov", @@ -66,6 +66,7 @@ def test_ftp_connection( ftp_url: FTP URL. sub_dir: Subdirectory path. expectation: Context manager expected raise exception. + """ def side_eff_conn(url: str) -> None: @@ -87,7 +88,7 @@ def side_eff_conn(url: str) -> None: "checksum_file, checksum, expectation", [ pytest.param( - Path("md5checksums.txt"), "40df91d5c40cb55621c4c92201da6834", does_not_raise(), id="Normal case" + Path("md5checksums.txt"), "40df91d5c40cb55621c4c92201da6834", does_not_raise(), id="Normal case", ), pytest.param( Path("malformed_md5_checksums.txt"), @@ -99,7 +100,7 @@ def side_eff_conn(url: str) -> None: ], ) def test_checksums( - data_dir: Path, checksum_file: Path, checksum: Optional[str], expectation: ContextManager + data_dir: Path, checksum_file: Path, checksum: str | None, expectation: ContextManager, ) -> None: """Tests the `download.get_checksums()` function. 
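Nearly every test touched in these hunks uses the same parametrization idiom: an `expectation` parameter that carries a context manager, either `does_not_raise()` (the tests' alias for `contextlib.nullcontext`) for success cases or `pytest.raises(...)` for error cases, so a single test body covers both outcomes. A minimal, self-contained sketch of the idiom, assuming a made-up `parse_positive()` helper that is not part of ensembl-genomio:

from contextlib import nullcontext as does_not_raise
from typing import ContextManager

import pytest


def parse_positive(value: str) -> int:
    """Toy function: parse a string into a strictly positive integer."""
    number = int(value)  # raises ValueError for non-numeric input
    if number <= 0:
        raise ValueError(f"not a positive integer: {value}")
    return number


@pytest.mark.parametrize(
    "value, expectation",
    [
        pytest.param("3", does_not_raise(), id="Valid input"),
        pytest.param("-1", pytest.raises(ValueError), id="Negative input"),
        pytest.param("abc", pytest.raises(ValueError), id="Non-numeric input"),
    ],
)
def test_parse_positive(value: str, expectation: ContextManager) -> None:
    """One body serves both the passing and the raising cases."""
    with expectation:
        assert parse_positive(value) > 0

With this shape, adding a new edge case is a single `pytest.param` line, and the fixes in these hunks only have to touch annotations and trailing commas, never the test logic.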
@@ -108,6 +109,7 @@ def test_checksums( checksum_file: File name containing checksums checksum: Test MD5 checksum expectation: Context manager expected raise exception + """ with expectation: md5_input_path = data_dir / checksum_file @@ -127,7 +129,7 @@ def test_checksums( pytest.param("missing_file_md5.txt", None, False, id="md5 checksum with ref of missing file"), ], ) -def test_md5_files(data_dir: Path, md5_file: str, md5_path: Optional[Path], checksum_bool: bool) -> None: +def test_md5_files(data_dir: Path, md5_file: str, md5_path: Path | None, checksum_bool: bool) -> None: """Tests the md5_files() function Args: data_dir: Path to test data root dir @@ -197,8 +199,8 @@ def test_download_single_file( ftp_file: FTP file which to mock download md5_sums: FTP file and md5_sum value pair expectation: Context manager expected raise exception - """ + """ data_file = data_dir / ftp_file retr_file = tmp_path / ftp_file @@ -263,8 +265,8 @@ def test_download_all_files( compare_accession: Defines test of expected accession md5: Source file for md5 checksums to inspect expectation: Context manager expected raise exception - """ + """ data_file = data_dir / md5 def side_eff_ftp_mlsd() -> list[tuple[str, list[str]]]: @@ -330,7 +332,7 @@ def mock_retr_binary(command: str, callback: Callable) -> None: ], ) def test_get_files_selection( - data_dir: Path, has_download_dir: bool, files_expected: dict, expectation: ContextManager + data_dir: Path, has_download_dir: bool, files_expected: dict, expectation: ContextManager, ) -> None: """Tests the `download.get_files_selection()` function. @@ -338,8 +340,8 @@ def test_get_files_selection( download_dir: Path to specific location of downloaded files. files_expected: Defines contents of test files downloaded expectation: Context manager expected raise exception - """ + """ if has_download_dir: download_dir = data_dir else: @@ -416,8 +418,8 @@ def test_retrieve_assembly_data( is_dir: Param to define state of result output dir files_downloaded: Defines contents of test files marked as downloaded expectation: Context manager expected raise exception - """ + """ if is_dir: download_dir = data_dir else: diff --git a/src/python/tests/assembly/test_status.py b/src/python/tests/assembly/test_status.py index f77036a0f..6ca31d483 100644 --- a/src/python/tests/assembly/test_status.py +++ b/src/python/tests/assembly/test_status.py @@ -56,7 +56,7 @@ assembly_type="haploid", accession="GCF_001194135.2", assembly_status="current", - ) + ), } STRAIN_METADATA = { @@ -68,7 +68,7 @@ assembly_type="haploid", accession="GCF_001194135.2", assembly_status="current", - ) + ), } COMPLETE_METADATA = { @@ -83,7 +83,7 @@ last_updated="2015-06-29T09:51:41.073", assembly_status="current", assembly_notes="RefSeq", - ) + ), } @@ -91,6 +91,7 @@ class Meta(Base): + """Meta class mirroring the Ensembl core database meta table without any foreign keys""" __tablename__ = "meta" @@ -105,7 +106,7 @@ class Meta(Base): meta_value: Column = Column(String(255), nullable=False) -@pytest.mark.dependency() +@pytest.mark.dependency def test_report_structure() -> None: """Tests the `ReportStructure` class.""" assert ReportStructure() @@ -194,6 +195,7 @@ def test_singularity_image_setter( datasets_version: URL of singularity container (custom `datasets` version if desired). nextflow_cachedir: Value to assign to environment variable NXF_SINGULARITY_CACHEDIR. singularity_cachedir: Value to assign to environment variable SINGULARITY_CACHEDIR. 
+ """ mock_client.pull.return_value = True # Define SIF cache path and expected path used to pull the container @@ -223,7 +225,7 @@ def test_singularity_image_setter( assert singularity_image_setter(sif_cache_path, datasets_version) # Check that the spython pull method was called with the right arguments mock_client.pull.assert_called_with( - expected_container_url, stream=False, pull_folder=expected_cache_path, quiet=True + expected_container_url, stream=False, pull_folder=expected_cache_path, quiet=True, ) @@ -235,7 +237,7 @@ def test_singularity_image_setter( ], ) def test_get_assembly_accessions( - data_dir: Path, file_name: str, expected_output: list[str], expectation: ContextManager + data_dir: Path, file_name: str, expected_output: list[str], expectation: ContextManager, ) -> None: """Tests the `get_assembly_accessions()` function. @@ -246,6 +248,7 @@ def test_get_assembly_accessions( file_name: File with one line per INSDC assembly accession. expected_output: Expected assembly accessions returned. expectation: Context manager of expected raise exception. + """ file_path = data_dir / file_name with expectation: @@ -265,7 +268,7 @@ def test_get_assembly_accessions( indirect=True, ) def test_fetch_accessions_from_core_dbs( - request: FixtureRequest, tmp_path: Path, test_dbs: dict[str, UnitTestDB] + request: FixtureRequest, tmp_path: Path, test_dbs: dict[str, UnitTestDB], ) -> None: """Tests the `fetch_accessions_from_core_dbs()` function. @@ -283,7 +286,7 @@ def test_fetch_accessions_from_core_dbs( @patch("ensembl.io.genomio.assembly.status.Client") def test_fetch_datasets_reports( - mock_client: Mock, tmp_path: Path, data_dir: Path, assert_files: Callable[[StrPath, StrPath], None] + mock_client: Mock, tmp_path: Path, data_dir: Path, assert_files: Callable[[StrPath, StrPath], None], ) -> None: """Tests the `fetch_datasets_reports()` function. @@ -292,7 +295,7 @@ def test_fetch_datasets_reports( """ def execute_return( - command: list[str], **kwargs: Any # pylint: disable=unused-argument + command: list[str], **kwargs: Any, # pylint: disable=unused-argument ) -> dict[str, str]: report_path = data_dir / f"{command[-1]}.asm_report.json" if report_path.exists(): @@ -340,7 +343,7 @@ def test_fetch_datasets_reports_runtime_error(mock_client: Mock) -> None: ], ) def test_extract_assembly_metadata( - data_dir: Path, file_name: str, expected_metadata: dict[str, ReportStructure] + data_dir: Path, file_name: str, expected_metadata: dict[str, ReportStructure], ) -> None: """Tests the `extract_assembly_metadata()` function. @@ -349,6 +352,7 @@ def test_extract_assembly_metadata( Args: file_name: Test data file to extract the assembly metadata from. expected_metadata: Expected key value pairs of source name <> assembly report. + """ report_path = data_dir / file_name report = {"my_core": get_json(report_path)} @@ -358,7 +362,7 @@ def test_extract_assembly_metadata( @pytest.mark.dependency(depends=["test_report_structure"]) def test_generate_report_tsv( - tmp_path: Path, data_dir: Path, assert_files: Callable[[StrPath, StrPath], None] + tmp_path: Path, data_dir: Path, assert_files: Callable[[StrPath, StrPath], None], ) -> None: """Tests the `generate_report_tsv()` function. 
diff --git a/src/python/tests/conftest.py b/src/python/tests/conftest.py index 354f5982f..8e83cba32 100644 --- a/src/python/tests/conftest.py +++ b/src/python/tests/conftest.py @@ -57,6 +57,7 @@ def _json_data(file_name: str) -> Any: class MockResponse: + """Mock a `requests` response.""" def __init__(self, json_str: str) -> None: @@ -64,6 +65,7 @@ def __init__(self, json_str: str) -> None: Args: json_str: Expected JSON test response. + """ self.text = json_str diff --git a/src/python/tests/database/test_core_server.py b/src/python/tests/database/test_core_server.py index cd04fab76..47f95725b 100644 --- a/src/python/tests/database/test_core_server.py +++ b/src/python/tests/database/test_core_server.py @@ -17,7 +17,7 @@ from __future__ import annotations -from typing import Any, List, Optional +from typing import Any import pytest from pytest_mock import MockerFixture @@ -37,17 +37,19 @@ class MockResult: + """Mocker of `sqlalchemy.engine.Result` class.""" - def __init__(self, core_dbs: List[str]): + def __init__(self, core_dbs: list[str]): self.core_dbs = core_dbs - def fetchall(self) -> List[List[str]]: + def fetchall(self) -> list[list[str]]: """Return a list of lists, each one containing a single core db.""" return [[x] for x in self.core_dbs] class MockConnection: + """Mock a SQLAlchemy connection.""" def __init__(self, result: MockResult) -> None: @@ -60,14 +62,15 @@ def execute(self, *args: Any, **kwargs: Any) -> MockResult: # pylint: disable=u def __enter__(self, *args: Any, **kwargs: Any) -> MockConnection: # pylint: disable=unused-argument return self - def __exit__(self, *args: Any, **kwargs: Any) -> None: # pylint: disable=unused-argument + def __exit__(self, *args: object, **kwargs: Any) -> None: # pylint: disable=unused-argument pass class MockEngine: + """Mocker of `sqlalchemy.engine.Engine` class.""" - def __init__(self, core_dbs: List[str]) -> None: + def __init__(self, core_dbs: list[str]) -> None: self.result = MockResult(core_dbs) def connect(self) -> MockConnection: @@ -76,6 +79,7 @@ def connect(self) -> MockConnection: class TestCoreServer: + """Tests for the `CoreServer` class.""" @pytest.mark.parametrize( @@ -95,13 +99,13 @@ class TestCoreServer: def test_get_cores( self, mocker: MockerFixture, - dbs: List[str], + dbs: list[str], prefix: str, - build: Optional[int], - version: Optional[int], + build: int | None, + version: int | None, dbname_re: str, - db_list: List[str], - output: List[str], + db_list: list[str], + output: list[str], ) -> None: """Tests the `CoreServer.get_cores()` method. @@ -126,6 +130,6 @@ def test_get_cores( # Checks the filters from get_cores all_cores = server.get_cores( - prefix=prefix, build=build, version=version, dbname_re=dbname_re, db_list=db_list + prefix=prefix, build=build, version=version, dbname_re=dbname_re, db_list=db_list, ) assert set(all_cores) == set(output) diff --git a/src/python/tests/database/test_dbconnection_lite.py b/src/python/tests/database/test_dbconnection_lite.py index 29ec89fcd..7e6d03d3d 100644 --- a/src/python/tests/database/test_dbconnection_lite.py +++ b/src/python/tests/database/test_dbconnection_lite.py @@ -15,7 +15,7 @@ """Unit testing of `ensembl.io.genomio.database.dbconnection_lite` module. 
""" -from typing import Callable, Optional +from typing import Callable import pytest @@ -50,7 +50,6 @@ def fixture_meta_test_db(db_factory: Callable) -> UnitTestDB: # Use ensembl-utils UnitTestDB def test_get_metadata(meta_test_db: UnitTestDB) -> None: """Tests the method get_metadata()""" - # Check the new connection lite dblite = DBConnectionLite(meta_test_db.dbc.url) assert dblite.get_metadata() == _METADATA_CONTENT @@ -60,16 +59,16 @@ def test_get_metadata(meta_test_db: UnitTestDB) -> None: "meta_key, meta_value", [ pytest.param( - "species.scientific_name", _METADATA_CONTENT["species.scientific_name"][0], id="Unique key exists" + "species.scientific_name", _METADATA_CONTENT["species.scientific_name"][0], id="Unique key exists", ), pytest.param( - "species.classification", _METADATA_CONTENT["species.classification"][0], id="First key exists" + "species.classification", _METADATA_CONTENT["species.classification"][0], id="First key exists", ), pytest.param("lorem.ipsum", None, id="Non-existing key, 2 parts"), pytest.param("lorem_ipsum", None, id="Non-existing key, 1 part"), ], ) -def test_get_meta_value(meta_test_db: UnitTestDB, meta_key: str, meta_value: Optional[str]) -> None: +def test_get_meta_value(meta_test_db: UnitTestDB, meta_key: str, meta_value: str | None) -> None: """Tests the method get_meta_value()""" dblite = DBConnectionLite(meta_test_db.dbc.url) assert dblite.get_meta_value(meta_key) == meta_value diff --git a/src/python/tests/database/test_factory.py b/src/python/tests/database/test_factory.py index 59d17df45..b069c664b 100644 --- a/src/python/tests/database/test_factory.py +++ b/src/python/tests/database/test_factory.py @@ -54,7 +54,7 @@ "division": "metazoa", "accession": "GCA_000111222.3", "release": "110", - } + }, ], id="Ensembl core database", ), @@ -111,7 +111,7 @@ ], ) def test_format_db_data( - mock_dbconn: Mock, server_url: URL, dbs: list[str], brc_mode: bool, skip_keys: bool, output: list[dict] + mock_dbconn: Mock, server_url: URL, dbs: list[str], brc_mode: bool, skip_keys: bool, output: list[dict], ) -> None: """Tests the `factory.format_db_data()` function. @@ -122,6 +122,7 @@ def test_format_db_data( brc_mode: BRC mode? skip_keys: Return `None` instead of the assigned value for "BRC4.*" meta keys. output: Expected list of dictionaries with metadata per database. + """ def _get_meta_value(meta_key: str) -> str | None: @@ -175,6 +176,7 @@ def test_get_core_dbs_metadata( data_dir: Module's test data directory fixture. use_db_file: Use database file to filter databases. output: Expected list of dictionaries with some metadata for each selected database. 
+ """ def _format_db_data(server_url: URL, dbs: list[str], brc_mode: bool = False) -> list[dict]: @@ -260,7 +262,7 @@ def test_parse_args(arg_list: list[str], expected: dict) -> None: args = factory.parse_args(arg_list) if args.db_list: # DeepDiff is not able to compare two objects of Path type, so convert it to string - setattr(args, "db_list", str(args.db_list)) + args.db_list = str(args.db_list) assert not DeepDiff(vars(args), expected) @@ -290,7 +292,7 @@ def test_main( factory.main(arg_list) # Check that we have called the mocked function once with the expected parameters mock_get_core_dbs_metadata.assert_called_once_with( - server_url=server_url, prefix="", build=None, version=None, db_regex="", db_list=None, brc_mode=False + server_url=server_url, prefix="", build=None, version=None, db_regex="", db_list=None, brc_mode=False, ) # Check that the stdout is as expected captured = capsys.readouterr() diff --git a/src/python/tests/external_db/test_external_db_map.py b/src/python/tests/external_db/test_external_db_map.py index de5cc70f6..616ae3a3c 100644 --- a/src/python/tests/external_db/test_external_db_map.py +++ b/src/python/tests/external_db/test_external_db_map.py @@ -39,7 +39,7 @@ ], ) def test_get_external_db_map( - tmp_path: Path, file_content: str, expected_output: dict, expected: ContextManager + tmp_path: Path, file_content: str, expected_output: dict, expected: ContextManager, ) -> None: """Tests the `get_external_db_map` method. diff --git a/src/python/tests/fasta/test_chunk.py b/src/python/tests/fasta/test_chunk.py index 5a4b200a2..7581b62ae 100644 --- a/src/python/tests/fasta/test_chunk.py +++ b/src/python/tests/fasta/test_chunk.py @@ -19,7 +19,7 @@ from io import StringIO, TextIOWrapper from pathlib import Path import re -from typing import Any, Callable, ContextManager, Generator, Optional +from typing import Any, Callable, ContextManager, Generator import pytest @@ -41,8 +41,8 @@ def test__on_value_error(msg: str, expectation: ContextManager) -> None: Args: msg: Msg to raise. expectation: A context manager with expected exception (`pytest.raises` or nullcontext) - """ + """ with expectation: FastaChunking._on_value_error(msg) # pylint: disable=protected-access @@ -69,8 +69,8 @@ def test_check_chunk_size_and_tolerance( chunk_size: Chunk size to check chunk_tolerance: Chunk tolerance to check expectation: A context manager with expected exception (`pytest.raises` or nullcontext) - """ + """ with expectation: FastaChunking.check_chunk_size_and_tolerance(chunk_size, chunk_tolerance) @@ -89,13 +89,14 @@ def test_check_chunk_size_and_tolerance( ("NAAAAN", re.compile("NN"), [6]), ], ) -def test_split_seq_by_n(seq: str, pattern: Optional[re.Pattern], expectation: list[int]) -> None: +def test_split_seq_by_n(seq: str, pattern: re.Pattern | None, expectation: list[int]) -> None: """Tests the `chunk.split_seq_by_n` function. Args: seq: A sequence to split pattern: A pattern to split on expectation: A list of open chunk ends (like for python list slices) + """ assert FastaChunking.split_seq_by_n(seq, pattern) == expectation @@ -118,7 +119,7 @@ def test_split_seq_by_n(seq: str, pattern: Optional[re.Pattern], expectation: li ], ) def test_split_seq_by_chunk_size( - chunk_ends: list[int], chunk_size: int, tolerated_size: Optional[int], expectation: list[int] + chunk_ends: list[int], chunk_size: int, tolerated_size: int | None, expectation: list[int], ) -> None: """Tests the `chunk.split_seq_by_chunk_size` function. 
@@ -127,6 +128,7 @@ def test_split_seq_by_chunk_size( chunk_size: Chunk size tolerated_size: A more relaxed value of the chunk size expectation: A list of open chunk ends (python slices coordinates) + """ assert FastaChunking.split_seq_by_chunk_size(chunk_ends, chunk_size, tolerated_size) == expectation @@ -136,6 +138,7 @@ def test_individual_file_opener(tmp_path: Path) -> None: Args: tmp_path: Where temporary files will be created. + """ test_dir = Path(tmp_path, "file_opener_test") test_dir.mkdir() @@ -158,6 +161,7 @@ def test_prepare_out_dir_for_individuals(tmp_path: Path) -> None: Args: tmp_path: Where temporary files will be created. + """ test_dir = Path(tmp_path, "prepare_out_dir_test") test_file = Path(test_dir, "test.file") @@ -191,6 +195,7 @@ def test_get_tolerated_size(size: int, tolerance: int, expectation: int) -> None size: Base size tolerance: Percent of allowed deviance as integer. expectation: An expected tolerated size + """ assert FastaChunking.get_tolerated_size(size, tolerance) == expectation @@ -264,7 +269,7 @@ def test_chunk_fasta_stream( chunk_size_tolerated: int, n_sequence_len: int, chunk_sfx: str, - append_offset_to_chunk_name: Optional[bool], + append_offset_to_chunk_name: bool | None, expected_chunked_fasta_text: str, expected_agp_list: list[str], expected_individual_files_count: int, @@ -283,6 +288,7 @@ def test_chunk_fasta_stream( expected_agp_list: A list with expected AGP entries. expected_individual_files_count: A number of individually created entities/chunks with files. expected_raised: A context manager with expected exception (`pytest.raises` or nullcontext) + """ # a workaround for storing individual chunks @@ -355,8 +361,8 @@ def _individual_opener(name: str) -> ContextManager: def test_chunk_fasta( monkeypatch: Any, tmp_path: Path, - individual_file_prefix: Optional[str], - agp_output_file_name: Optional[str], + individual_file_prefix: str | None, + agp_output_file_name: str | None, expected_missing_joined: ContextManager, ) -> None: """Tests the `chunk.chunk_fasta` function. @@ -367,6 +373,7 @@ def test_chunk_fasta( first part of the chunk file name or None. agp_output_file_name: Output AGP file name or None. expected_missing_joined: A context manager with expected exception (`pytest.raises` or nullcontext). + """ # a helper mock function @@ -374,13 +381,13 @@ def _chunk_fasta_stream_mock( input_fasta: TextIOWrapper, chunk_size: int, # pylint: disable=unused-argument chunk_size_tolerated: int, # pylint: disable=unused-argument - output_fasta: Optional[TextIOWrapper] | nullcontext[Any], - individual_file_prefix: Optional[str], + output_fasta: TextIOWrapper | None | nullcontext[Any], + individual_file_prefix: str | None, n_sequence_len: int, # pylint: disable=unused-argument chunk_sfx: str, # pylint: disable=unused-argument - append_offset_to_chunk_name: Optional[bool], # pylint: disable=unused-argument + append_offset_to_chunk_name: bool | None, # pylint: disable=unused-argument open_individual: Callable[ - [str], TextIOWrapper + [str], TextIOWrapper, ] = FastaChunking._individual_file_opener, # pylint: disable=protected-access ) -> list[str]: """Mock the `chunk.chunk_fasta_stream` function. @@ -388,6 +395,7 @@ def _chunk_fasta_stream_mock( Args: *args: Positional arguments. **kwargs: Keyword arguments. 
+ """ chunk_file_name = "" if individual_file_prefix: diff --git a/src/python/tests/fasta/test_process.py b/src/python/tests/fasta/test_process.py index c8781a9c2..b511709aa 100644 --- a/src/python/tests/fasta/test_process.py +++ b/src/python/tests/fasta/test_process.py @@ -18,7 +18,7 @@ from contextlib import nullcontext as does_not_raise import filecmp from pathlib import Path -from typing import ContextManager, Set +from typing import ContextManager import pytest @@ -66,8 +66,8 @@ def test_fasta_prep( input_gbff: Name of the input GBFF example input, in the test folder. pep_mode: Boolean flag to set processing in peptide mode. expected_output_fasta: Name of the output fasta file with expected output, in the test folder. - """ + """ fasta_input_path = data_dir / input_fasta if input_gbff is not None: gbff_input_path = data_dir / input_gbff @@ -107,8 +107,8 @@ def test_fasta_prep( def test_exclude_seq_regions( data_dir: Path, input_gbff: str, - excluded_seq_regions: Set[str], - output: Set[str], + excluded_seq_regions: set[str], + output: set[str], expectation: ContextManager, ) -> None: """Tests the `process.get_peptides_to_exclude()` function. diff --git a/src/python/tests/genbank/test_download.py b/src/python/tests/genbank/test_download.py index 8a1695fe7..442f2fc4e 100644 --- a/src/python/tests/genbank/test_download.py +++ b/src/python/tests/genbank/test_download.py @@ -39,6 +39,7 @@ ) @patch("ensembl.io.genomio.genbank.download.requests.get") class TestDownloadGenbank: + """Tests for the `download_genbank` class""" def test_successful_download(self, mock_requests_get: Mock, tmp_path: Path, accession: str) -> None: @@ -48,8 +49,8 @@ def test_successful_download(self, mock_requests_get: Mock, tmp_path: Path, acce mock_requests_get: A mock of `request.get()` method. tmp_path: Function-scoped temporary directory fixture. accession: Genbank accession to be downloaded. - """ + """ # Set success_code and content as an attribute to the mock object mock_requests_get.return_value.status_code = 200 mock_content = b"The genbank download for the following accession" @@ -78,8 +79,8 @@ def test_failed_download(self, mock_requests_failed: Mock, tmp_path: Path, acces mock_requests_failed: A mock of `request.get()` method. tmp_path: Function-scoped temporary directory fixture. accession: Genbank accession to be downloaded. - """ + """ output_file = tmp_path / f"{accession}.gb" # Set the mock status code to 404 for request not found mock_requests_failed.return_value.status_code = 404 diff --git a/src/python/tests/genbank/test_extract_data.py b/src/python/tests/genbank/test_extract_data.py index 906191cc6..3c90eee93 100644 --- a/src/python/tests/genbank/test_extract_data.py +++ b/src/python/tests/genbank/test_extract_data.py @@ -16,7 +16,6 @@ # pylint: disable=too-many-positional-arguments from pathlib import Path -from typing import List from unittest.mock import Mock, patch from BCBio import GFF @@ -32,6 +31,7 @@ class TestWriteFormattedFiles: + """Test if all the expected output files are generated and formatted correctly""" prod_name = "TEST_prod" @@ -40,7 +40,7 @@ class TestWriteFormattedFiles: @pytest.fixture(scope="class", autouse=True) def formatted_files_generator( - self, data_dir: Path, tmp_path_factory: TempPathFactory + self, data_dir: Path, tmp_path_factory: TempPathFactory, ) -> FormattedFilesGenerator: """Call the function `FormattedFilesGenerator` with set parameters. 
Fixture that returns the class of the module that we are testing @@ -140,7 +140,7 @@ def test_format_seq_region_json( """Check seq_region.json file contains the correct metadata""" record = SeqRecord(Seq("ATGC"), id="record", annotations={"topology": "circular"}) CDS_feature = SeqFeature( - FeatureLocation(10, 20), type="CDS", qualifiers={"gene": ["GlyrA"], "transl_table": "2"} + FeatureLocation(10, 20), type="CDS", qualifiers={"gene": ["GlyrA"], "transl_table": "2"}, ) record.features.append(CDS_feature) record.annotations["organelle"] = "mitochondrion" @@ -166,8 +166,8 @@ def test_format_write_genes_gff( mock_write_pep: Mock, mock_write_genes: Mock, mock_parse_record: Mock, - all_ids: List[str], - peptides: List[str], + all_ids: list[str], + peptides: list[str], formatted_files_generator: FormattedFilesGenerator, ) -> None: """Check gene features in GFF3 format are generated as expected.""" @@ -211,11 +211,10 @@ def test_write_genes_gff( formatted_files_generator: FormattedFilesGenerator, ) -> None: """Test if GFF3 file is generated when there are SeqFeatures present""" - record = SeqRecord(Seq("ATGC"), id="record") gene_feature = SeqFeature(FeatureLocation(10, 20), type="gene", qualifiers={"gene": ["GlyrA"]}) CDS_feature = SeqFeature( - FeatureLocation(10, 15), type="CDS", qualifiers={"gene": ["GlyrA"], "transl_table": "2"} + FeatureLocation(10, 15), type="CDS", qualifiers={"gene": ["GlyrA"], "transl_table": "2"}, ) record.features = [gene_feature, CDS_feature] formatted_files_generator.seq_records = [record] @@ -237,7 +236,7 @@ def test_write_pep_fasta( """Test if peptides FASTA file is generated when peptides are identified""" record = SeqRecord(Seq("MFLRTQARFFHATTKKM"), id="cds-record") CDS_feature = SeqFeature( - FeatureLocation(10, 20), type="CDS", qualifiers={"gene": ["GlyrA"], "transl_table": "2"} + FeatureLocation(10, 20), type="CDS", qualifiers={"gene": ["GlyrA"], "transl_table": "2"}, ) record.features.append(CDS_feature) formatted_files_generator.files["fasta_pep"] = tmp_path / "pep.fasta" diff --git a/src/python/tests/genbank/test_extract_data_seq.py b/src/python/tests/genbank/test_extract_data_seq.py index 1a1aca97a..ce35cba68 100644 --- a/src/python/tests/genbank/test_extract_data_seq.py +++ b/src/python/tests/genbank/test_extract_data_seq.py @@ -16,7 +16,6 @@ # pylint: disable=too-many-positional-arguments from pathlib import Path -from typing import List from unittest.mock import Mock, patch from Bio.Seq import Seq @@ -29,6 +28,7 @@ class TestFormattedFilesGenerator: + """Test if all the internal methods of `FormattedFilesGenerator` are giving the correct output""" prod_name = "TEST_prod" @@ -37,7 +37,7 @@ class TestFormattedFilesGenerator: @pytest.fixture(scope="class", autouse=True) def formatted_files_generator( - self, data_dir: Path, tmp_path_factory: TempPathFactory + self, data_dir: Path, tmp_path_factory: TempPathFactory, ) -> FormattedFilesGenerator: """Call the function `FormattedFilesGenerator` with set parameters""" gb_file = self.gb_file @@ -71,10 +71,10 @@ def test_parse_record( gene_feature = SeqFeature(FeatureLocation(10, 20), type="gene", qualifiers={gene_name: expected_name}) rna_feature = SeqFeature(FeatureLocation(10, 15), type=type_feature) cds_feature = SeqFeature( - FeatureLocation(10, 20), type="CDS", qualifiers={gene_name: "GlyrA", "transl_table": "2"} + FeatureLocation(10, 20), type="CDS", qualifiers={gene_name: "GlyrA", "transl_table": "2"}, ) record.features = [gene_feature, rna_feature, cds_feature] - mock_peptides: List = [] + mock_peptides: 
list = [] gene_feature_feat = {expected_id: gene_feature} mock_parse_gene_feat.return_value = ( @@ -132,7 +132,7 @@ def test_parse_gene_feat( # Check the returned feature is as expected # pylint: disable=protected-access result_seq_feature, result_seq_id, result_peptide = formatted_files_generator._parse_gene_feat( - seq_feature, gene_name + seq_feature, gene_name, ) gene_id = self.prefix + gene_name @@ -196,7 +196,7 @@ def test_parse_rna_feat( ) def test_uniquify_id( self, - all_ids: List[str], + all_ids: list[str], expected_id: str, gene_id: str, formatted_files_generator: FormattedFilesGenerator, @@ -231,7 +231,7 @@ def test_prepare_location_with_unsupported_organelle( formatted_files_generator._prepare_location(organelle) @pytest.mark.parametrize( - "type_feature, expected_value", [("gene", None), ("mRNA", None), ("CDS", 2), ("CDS", 5)] + "type_feature, expected_value", [("gene", None), ("mRNA", None), ("CDS", 2), ("CDS", 5)], ) def test_get_codon_table( self, diff --git a/src/python/tests/genome_metadata/test_dump.py b/src/python/tests/genome_metadata/test_dump.py index 75c4a4494..219ae9d05 100644 --- a/src/python/tests/genome_metadata/test_dump.py +++ b/src/python/tests/genome_metadata/test_dump.py @@ -21,7 +21,7 @@ from collections import namedtuple from contextlib import nullcontext as does_not_raise -from typing import Any, ContextManager, Dict, List +from typing import Any, ContextManager from unittest.mock import Mock, patch from deepdiff import DeepDiff @@ -58,7 +58,7 @@ ], ) def test_check_assembly_version( - genome_metadata: Dict[str, Any], output: int, expectation: ContextManager + genome_metadata: dict[str, Any], output: int, expectation: ContextManager, ) -> None: """Tests the `dump.check_assembly_version()` method. @@ -66,6 +66,7 @@ def test_check_assembly_version( genome_metadata: Nested genome metadata key values. output: Expected assembly version. expectation: Context manager for the expected exception (if any). + """ with expectation: dump.check_assembly_version(genome_metadata) @@ -98,7 +99,7 @@ def test_check_assembly_version( ], ) def test_check_genebuild_version( - genome_metadata: Dict[str, Any], output: Dict[str, Any], expectation: ContextManager + genome_metadata: dict[str, Any], output: dict[str, Any], expectation: ContextManager, ) -> None: """Tests the `dump.check_genebuild_version()` method. @@ -106,6 +107,7 @@ def test_check_genebuild_version( genome_metadata: Nested genome metadata key values. output: Expected change in the genome metadata dictionary. expectation: Context manager for the expected exception (if any). + """ with expectation: dump.check_genebuild_version(genome_metadata) @@ -125,12 +127,13 @@ def test_check_genebuild_version( ({"added_seq": {"region_name": [1, 2]}}, {"added_seq": {"region_name": ["1", "2"]}}), ], ) -def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, Any]) -> None: +def test_filter_genome_meta(genome_metadata: dict[str, Any], output: dict[str, Any]) -> None: """Tests the `dump.filter_genome_meta()` method. Args: genome_metadata: Nested genome metadata key values. output: Expected change in the genome metadata dictionary. 
+ """ result = dump.filter_genome_meta(genome_metadata) assert not DeepDiff(result, output) @@ -174,8 +177,8 @@ def test_filter_genome_meta(genome_metadata: Dict[str, Any], output: Dict[str, A def test_get_genome_metadata( mock_session: Mock, mock_result: Mock, - meta_data: List[MetaRow], - output: Dict[str, Any], + meta_data: list[MetaRow], + output: dict[str, Any], expectation: ContextManager, ) -> None: """Tests the `dump.get_genome_metadata()` method. @@ -185,6 +188,7 @@ def test_get_genome_metadata( meta_data: `meta` table content in a list of named tuples. output: Expected genome metadata dictionary. expectation: Context manager for the expected exception (if any). + """ mock_result.unique.return_value = mock_result mock_result.all.return_value = meta_data diff --git a/src/python/tests/genome_metadata/test_extend.py b/src/python/tests/genome_metadata/test_extend.py index f4fec3589..b0c2718c8 100644 --- a/src/python/tests/genome_metadata/test_extend.py +++ b/src/python/tests/genome_metadata/test_extend.py @@ -16,7 +16,7 @@ # pylint: disable=too-many-positional-arguments from pathlib import Path -from typing import Callable, Dict, List, Tuple +from typing import Callable from deepdiff import DeepDiff import pytest @@ -32,13 +32,14 @@ pytest.param("sequences.gbff", ["CP089274", "CP089275", "RefChr0002"], id="sequences.gbff"), ], ) -def test_get_gbff_regions(data_dir: Path, gbff_file: str, output: List[str]) -> None: +def test_get_gbff_regions(data_dir: Path, gbff_file: str, output: list[str]) -> None: """Tests the `extend.get_gbff_regions()` method. Args: data_dir: Module's test data directory fixture. gbff_file: GBFF file name. output: Expected list of sequence region IDs. + """ if gbff_file: gbff_path = data_dir / gbff_file @@ -71,13 +72,14 @@ def test_get_gbff_regions(data_dir: Path, gbff_file: str, output: List[str]) -> ), ], ) -def test_report_to_csv(data_dir: Path, report_file: str, output: Tuple[str, Dict]) -> None: +def test_report_to_csv(data_dir: Path, report_file: str, output: tuple[str, dict]) -> None: """Tests the `extend._report_to_csv()` method. Args: data_dir: Module's test data directory fixture. report_file: Assembly report file name. output: Expected returned value for the given assembly report file. + """ report_path = data_dir / report_file # pylint: disable=protected-access @@ -97,13 +99,14 @@ def test_report_to_csv(data_dir: Path, report_file: str, output: Tuple[str, Dict ), ], ) -def test_get_report_regions_names(data_dir: Path, report_file: str, output: List[Tuple[str, str]]) -> None: +def test_get_report_regions_names(data_dir: Path, report_file: str, output: list[tuple[str, str]]) -> None: """Tests the `extend.get_report_regions_names()` method. Args: data_dir: Module's test data directory fixture. report_file: Assembly report file name. output: Expected returned value for the given assembly report file. 
+ """ report_path = data_dir / report_file result = extend.get_report_regions_names(report_path) @@ -111,18 +114,18 @@ def test_get_report_regions_names(data_dir: Path, report_file: str, output: List @pytest.mark.dependency( - name="test_get_additions", depends=["test_get_gbff_regions", "test_get_report_regions_names"] + name="test_get_additions", depends=["test_get_gbff_regions", "test_get_report_regions_names"], ) @pytest.mark.parametrize( "report_file, gbff_file, output", [ pytest.param( - "assembly_report.txt", "", ["CP089275", "RefChr0001", "RefChr0002"], id="Additional regions found" + "assembly_report.txt", "", ["CP089275", "RefChr0001", "RefChr0002"], id="Additional regions found", ), pytest.param("assembly_report.txt", "sequences.gbff", [], id="No additional regions"), ], ) -def test_get_additions(data_dir: Path, report_file: str, gbff_file: str, output: List[str]) -> None: +def test_get_additions(data_dir: Path, report_file: str, gbff_file: str, output: list[str]) -> None: """Tests the `extend.get_additions()` method. Args: @@ -130,6 +133,7 @@ def test_get_additions(data_dir: Path, report_file: str, gbff_file: str, output: report_file: Assembly report file name. gbff_path: GBFF file name. output: Expected sequence regions names that need to be added. + """ report_path = data_dir / report_file gbff_path = data_dir / gbff_file if gbff_file else None @@ -143,10 +147,10 @@ def test_get_additions(data_dir: Path, report_file: str, gbff_file: str, output: [ pytest.param("genome.json", "", "", "genome.json", id="No report file"), pytest.param( - "genome.json", "assembly_report.txt", "", "updated_genome.json", id="Additional seq regions" + "genome.json", "assembly_report.txt", "", "updated_genome.json", id="Additional seq regions", ), pytest.param( - "genome.json", "assembly_report.txt", "sequences.gbff", "genome.json", id="No additional regions" + "genome.json", "assembly_report.txt", "sequences.gbff", "genome.json", id="No additional regions", ), ], ) @@ -169,6 +173,7 @@ def test_amend_genome_metadata( report_file: INSDC/RefSeq sequences report file. genbank_file: INSDC/RefSeq GBFF file. output_file: Expected amended genome metadata file. + """ genome_inpath = data_dir / genome_infile report_path = data_dir / report_file if report_file else None diff --git a/src/python/tests/genome_metadata/test_prepare.py b/src/python/tests/genome_metadata/test_prepare.py index b42aa4c17..5797e3a6a 100644 --- a/src/python/tests/genome_metadata/test_prepare.py +++ b/src/python/tests/genome_metadata/test_prepare.py @@ -17,7 +17,7 @@ from contextlib import nullcontext as does_not_raise from pathlib import Path -from typing import Any, Callable, ContextManager, Dict, Optional +from typing import Any, Callable, ContextManager from unittest.mock import Mock, patch from deepdiff import DeepDiff @@ -68,15 +68,15 @@ id="Provider information already present", ), pytest.param( - "cncb_genome.json", {}, {}, pytest.raises(prepare.MetadataError), id="Unexpected provider" + "cncb_genome.json", {}, {}, pytest.raises(prepare.MetadataError), id="Unexpected provider", ), ], ) def test_add_provider( json_data: Callable[[str], Any], genome_file: str, - ncbi_data: Dict, - output: Dict[str, Dict[str, Optional[str]]], + ncbi_data: dict, + output: dict[str, dict[str, str | None]], expectation: ContextManager, ) -> None: """Tests the `prepare.add_provider()` method. @@ -87,6 +87,7 @@ def test_add_provider( ncbi_data: Report from NCBI datasets. output: Expected elements present in the updated genome metadata. 
expectation: Context manager for the expected exception (if any). + """ genome_metadata = json_data(genome_file) with expectation: @@ -111,6 +112,7 @@ def test_add_assembly_version(json_data: Callable[[str], Any], genome_file: str, json_data: JSON test file parsing fixture. genome_file: Genome metadata JSON file. output: Assembly version expected in the updated genome metadata. + """ genome_metadata = json_data(genome_file) prepare.add_assembly_version(genome_metadata) @@ -126,7 +128,7 @@ def test_add_assembly_version(json_data: Callable[[str], Any], genome_file: str, ], ) def test_add_genebuild_metadata( - mock_date: Mock, json_data: Callable[[str], Any], genome_file: str, output: str + mock_date: Mock, json_data: Callable[[str], Any], genome_file: str, output: str, ) -> None: """Tests the `prepare.add_genebuild_metadata()` method. @@ -135,6 +137,7 @@ def test_add_genebuild_metadata( json_data: JSON test file parsing fixture. genome_file: Genome metadata JSON file. output: Expected date for genebuild's `"start_date"` and `"version"` in the updated genome metadata. + """ mock_date.today.return_value = mock_date mock_date.isoformat.return_value = output @@ -174,8 +177,8 @@ def test_add_genebuild_metadata( def test_add_species_metadata( json_data: Callable[[str], Any], genome_file: str, - ncbi_data_organism: Dict, - output: Dict[str, Any], + ncbi_data_organism: dict, + output: dict[str, Any], ) -> None: """Tests the `prepare.add_species_metadata()` method. @@ -184,6 +187,7 @@ def test_add_species_metadata( genome_file: Genome metadata JSON file. ncbi_data_organism: NCBI dataset organism report. output: Expected `"species"` genome metadata content. + """ ncbi_data = {"organism": ncbi_data_organism} genome_metadata = json_data(genome_file) @@ -224,6 +228,7 @@ def test_prepare_genome_metadata( input_filename: Input genome JSON file. ncbi_filename: NCBI dataset report JSON file. expected_filename: Expected output genome JSON file. + """ mock_date.today.return_value = mock_date mock_date.isoformat.return_value = "2024-03-19" diff --git a/src/python/tests/genome_stats/test_compare.py b/src/python/tests/genome_stats/test_compare.py index 24878f1f7..8998458e8 100644 --- a/src/python/tests/genome_stats/test_compare.py +++ b/src/python/tests/genome_stats/test_compare.py @@ -20,7 +20,7 @@ """ from pathlib import Path -from typing import Callable, Dict +from typing import Callable from deepdiff import DeepDiff import pytest @@ -36,7 +36,7 @@ pytest.param({"a": 0}, {"a": 0}, {}, id="same_dicts_zero_values"), pytest.param({"a": 3}, {"a": 3}, {"same": {"a": 3}}, id="same_dicts_non_zero_values"), pytest.param( - {"a": 3}, {"a": 5}, {"different": {"a": {"ncbi": 3, "core": 5, "diff": 2}}}, id="different_dicts" + {"a": 3}, {"a": 5}, {"different": {"a": {"ncbi": 3, "core": 5, "diff": 2}}}, id="different_dicts", ), pytest.param( {"a": 3, "b": 5}, @@ -46,7 +46,7 @@ ), ], ) -def test_compare_dicts(ncbi: Dict[str, int], core: Dict[str, int], output: Dict[str, Dict]) -> None: +def test_compare_dicts(ncbi: dict[str, int], core: dict[str, int], output: dict[str, dict]) -> None: """Tests the `compare._compare_dicts()` method. 
     Args:
@@ -67,7 +67,7 @@
         ("ncbi_annotated.json", "core_annotated.json", "output_annotated.json"),
     ],
 )
-def test_compare_assembly(json_data: Callable, ncbi_file: Dict, core_file: Dict, output_file: Dict) -> None:
+def test_compare_assembly(json_data: Callable, ncbi_file: dict, core_file: dict, output_file: dict) -> None:
     """Tests the `compare.compare_assembly()` method.

     Args:
@@ -112,7 +112,7 @@ def test_compare_annotation(json_data: Callable, ncbi_file: str, core_file: str,


 @pytest.mark.dependency(
-    name="test_compare_stats", depends=["test_compare_assembly", "test_compare_annotation"]
+    name="test_compare_stats", depends=["test_compare_assembly", "test_compare_annotation"],
 )
 @pytest.mark.parametrize(
     "ncbi_file, core_file, output_file",
     [
@@ -146,7 +146,7 @@ def test_compare_stats(json_data: Callable, ncbi_file: str, core_file: str, outp
     ],
 )
 def test_compare_stats_files(
-    data_dir: Path, json_data: Callable, ncbi_file: str, core_file: str, output_file: str
+    data_dir: Path, json_data: Callable, ncbi_file: str, core_file: str, output_file: str,
 ) -> None:
     """Tests the `compare.compare_stats_files()` method.
diff --git a/src/python/tests/genome_stats/test_dump.py b/src/python/tests/genome_stats/test_dump.py
index 748af5b66..ffd2dfe1e 100644
--- a/src/python/tests/genome_stats/test_dump.py
+++ b/src/python/tests/genome_stats/test_dump.py
@@ -22,7 +22,7 @@
 from dataclasses import dataclass
 from pathlib import Path
 from string import Template
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Any, Callable
 from unittest.mock import MagicMock, patch

 import pytest
@@ -36,15 +36,16 @@

 @dataclass
 class MockResult:
+    """Mocker of `sqlalchemy.engine.Result` class."""

-    rows: List
+    rows: list

     def __iter__(self) -> Any:
         """Iterates over the elements in `rows` attribute."""
         yield from self.rows

-    def one(self) -> Tuple:
+    def one(self) -> tuple:
         """Returns the first element in `rows` attribute."""
         return self.rows[0]

@@ -53,21 +54,22 @@ def one(self) -> Tuple:
 ATTRIB_COUNTS_QUERY = Template(
     "SELECT seq_region_attrib.value, count(*) AS count_1 "
     "FROM seq_region_attrib JOIN attrib_type ON attrib_type.attrib_type_id = seq_region_attrib.attrib_type_id"
-    " WHERE attrib_type.code = '${code}' GROUP BY seq_region_attrib.value"
+    " WHERE attrib_type.code = '${code}' GROUP BY seq_region_attrib.value",
 )
 BIOTYPES_QUERY = Template(
-    "SELECT ${table}.biotype, count(*) AS count_1 FROM ${table} GROUP BY ${table}.biotype"
+    "SELECT ${table}.biotype, count(*) AS count_1 FROM ${table} GROUP BY ${table}.biotype",
 )
 FEATURE_STATS_TOTAL_QUERY = Template("SELECT count(*) AS count_1 FROM ${table}")
 FEATURE_STATS_NULL_QUERY = Template(
-    "SELECT count(*) AS count_1 FROM ${table} WHERE ${table}.description IS NULL"
+    "SELECT count(*) AS count_1 FROM ${table} WHERE ${table}.description IS NULL",
 )
 FEATURE_STATS_SOURCE_QUERY = Template(
-    "SELECT count(*) AS count_1 FROM ${table} WHERE ${table}.description LIKE '%[Source:%'"
+    "SELECT count(*) AS count_1 FROM ${table} WHERE ${table}.description LIKE '%[Source:%'",
 )


 class MockSession(Session):
+    """Mocker of `sqlalchemy.orm.Session` class that replaces its `execute()` method for testing."""

     # pylint: disable-next=too-many-return-statements
@@ -111,10 +113,11 @@ def execute(self, statement: ClauseElement) -> MockResult:  # type: ignore[overr


 class TestStatsGenerator:
+    """Tests for the `StatsGenerator` class."""

     stats_gen: dump.StatsGenerator
-    genome_stats: Dict[str, Any]
+    genome_stats: dict[str, Any]

     @pytest.fixture(scope="class", autouse=True)
     def setup(self, data_dir: Path) -> None:
@@ -139,7 +142,7 @@ def setup(self, data_dir: Path) -> None:
         ],
     )
     @pytest.mark.dependency(name="fix_scaffolds")
-    def test_fix_scaffolds(self, stats: Dict, output: Dict) -> None:
+    def test_fix_scaffolds(self, stats: dict, output: dict) -> None:
         """Tests the `StatsGenerator._fix_scaffolds()` static method.

         Args:
diff --git a/src/python/tests/gff3/test_extract_annotation.py b/src/python/tests/gff3/test_extract_annotation.py
index ffe122c8e..8e926a321 100644
--- a/src/python/tests/gff3/test_extract_annotation.py
+++ b/src/python/tests/gff3/test_extract_annotation.py
@@ -17,7 +17,7 @@

 from contextlib import nullcontext as does_not_raise
 from pathlib import Path
-from typing import Callable, ContextManager, Dict, List, Optional
+from typing import Callable, ContextManager

 import pytest
 from pytest import raises, param
@@ -66,7 +66,7 @@
         ("LOW QUALITY PROTEIN: uncharacterized protein PROTID12345", ["PROTID12345"], False),
     ],
 )
-def test_product_is_informative(description: str, feature_id: Optional[List[str]], output: bool) -> None:
+def test_product_is_informative(description: str, feature_id: list[str] | None, output: bool) -> None:
     """Tests the `FunctionalAnnotations.product_is_informative()` method."""
     assert FunctionalAnnotations.product_is_informative(description, feature_id) == output
@@ -106,7 +106,7 @@ def test_add_feature(seq_feat_type: str, feat_type: str, expected: ContextManage
         pytest.param("featA", "featA_name", ["featA_name"], id="Diff name and ID"),
     ],
 )
-def test_add_feature_name(feat_id: str, feat_name: str, expected_synonyms: List[str]) -> None:
+def test_add_feature_name(feat_id: str, feat_name: str, expected_synonyms: list[str]) -> None:
     """Tests the `FunctionalAnnotations.add_feature()` method with a feature name."""
     annot = FunctionalAnnotations()
@@ -180,7 +180,7 @@ def test_get_parent(
     parent = GFFSeqFeature(type=in_parent_type, id=in_parent_id)
     annot.add_feature(parent, "gene")
     annot.add_feature(
-        GFFSeqFeature(type="mRNA", id=in_child_id), feat_type="transcript", parent_id=in_parent_id
+        GFFSeqFeature(type="mRNA", id=in_child_id), feat_type="transcript", parent_id=in_parent_id,
     )

     with expected:
@@ -195,13 +195,13 @@ def test_get_parent(
         pytest.param("bad_type", "mrna_A", "gene_A", raises(KeyError), id="Child type does not exist"),
         pytest.param("gene", "gene_A", None, raises(AnnotationError), id="Feature ID already loaded"),
         pytest.param(
-            "gene", "gene_B", "gene_A", raises(AnnotationError), id="Cannot add a gene child of a gene"
+            "gene", "gene_B", "gene_A", raises(AnnotationError), id="Cannot add a gene child of a gene",
         ),
     ],
 )
 @pytest.mark.dependency(name="add_feature_fail", depends=["add_feature", "get_parent"])
 def test_add_feature_fail(
-    child_type: str, child_id: str, out_parent_id: Optional[str], expected: ContextManager
+    child_type: str, child_id: str, out_parent_id: str | None, expected: ContextManager,
 ) -> None:
     """Tests the `FunctionalAnnotation.add_feature()` method failures.
@@ -269,7 +269,7 @@ def test_add_feature_fail(
     ],
 )
 def test_get_xrefs(
-    in_id: str, in_xrefs: Optional[List[str]], provider_name: str, expected_xrefs: List[Dict[str, str]]
+    in_id: str, in_xrefs: list[str] | None, provider_name: str, expected_xrefs: list[dict[str, str]],
 ) -> None:
     """Tests the `FunctionalAnnotation.get_xrefs()` method."""
     annot = FunctionalAnnotations(provider_name=provider_name)
@@ -335,11 +335,11 @@ def test_get_features(feat_type: str, expected_number: int, expected: ContextMan
 )
 @pytest.mark.dependency(depends=["get_features"])
 def test_transfer_descriptions(
-    gene_desc: Optional[str],
-    transc_desc: Optional[str],
-    transl_desc: Optional[str],
-    out_gene_desc: Optional[str],
-    out_transc_desc: Optional[str],
+    gene_desc: str | None,
+    transc_desc: str | None,
+    transl_desc: str | None,
+    out_gene_desc: str | None,
+    out_transc_desc: str | None,
 ) -> None:
     """Tests the `FunctionalAnnotation.transfer_descriptions()` method.

@@ -388,7 +388,7 @@ def test_transfer_descriptions(
     ],
 )
 def test_store_gene(
-    cds_parts: int, num_cds: int, expected_num_genes: int, expected_num_tr: int, expected_num_cds: int
+    cds_parts: int, num_cds: int, expected_num_genes: int, expected_num_tr: int, expected_num_cds: int,
 ) -> None:
     """Test store_gene given a gene Feature with a transcript and optional translation.

@@ -398,6 +398,7 @@ def test_store_gene(
         expected_num_genes: Number of genes stored as expected
         expected_num_tr: Number of transcripts stored as expected
         expected_num_cds: Number of CDSs stored as expected
+
     """
     annot = FunctionalAnnotations()
     gene_name = "gene_A"
@@ -439,7 +440,7 @@ def test_store_gene(
         ),
         pytest.param(
             GFFSeqFeature(
-                type="gene", id="gene_A", qualifiers={"description": ["Gene description"], "Name": ["GeneA"]}
+                type="gene", id="gene_A", qualifiers={"description": ["Gene description"], "Name": ["GeneA"]},
             ),
             GFFSeqFeature(type="mRNA", id="tran_A"),
             GFFSeqFeature(type="CDS", id="cds_A"),
diff --git a/src/python/tests/gff3/test_id_allocator.py b/src/python/tests/gff3/test_id_allocator.py
index ea26104fc..be223ee7b 100644
--- a/src/python/tests/gff3/test_id_allocator.py
+++ b/src/python/tests/gff3/test_id_allocator.py
@@ -18,7 +18,7 @@
 from difflib import unified_diff
 import filecmp
 from pathlib import Path
-from typing import ContextManager, Dict, List, Optional
+from typing import ContextManager

 import pytest
@@ -48,9 +48,9 @@ def _write_record(out_record: SeqRecord, out_gff: Path) -> None:

 def _show_diff(result_path: Path, expected_path: Path) -> str:
     """Create a useful diff between 2 files."""
-    with open(result_path, "r") as result_fh:
+    with open(result_path) as result_fh:
         results = result_fh.readlines()
-    with open(expected_path, "r") as expected_fh:
+    with open(expected_path) as expected_fh:
         expected = expected_fh.readlines()
     diff = list(unified_diff(expected, results))
     return "".join(diff)
@@ -63,7 +63,7 @@ def _show_diff(result_path: Path, expected_path: Path) -> str:
         pytest.param({"BRC4": {"organism_abbrev": "LOREM"}}, "TMP_LOREM_", id="Prefix from genome meta"),
     ],
 )
-def test_set_prefix(genome: Dict, expected_prefix: str) -> None:
+def test_set_prefix(genome: dict, expected_prefix: str) -> None:
     """Test prefix setting from genome metadata."""
     ids = StableIDAllocator()
     ids.set_prefix(genome)
@@ -78,7 +78,7 @@ def test_set_prefix(genome: Dict, expected_prefix: str) -> None:
         pytest.param("MYPREF_", ["MYPREF_1", "MYPREF_2"], id="Prefix MYPREF_"),
     ],
 )
-def test_generate_id(prefix: str, expected_ids: List[str]) -> None:
+def test_generate_id(prefix: str, expected_ids: list[str]) -> None:
     """Test IDs generation."""
     ids = StableIDAllocator()
     if prefix is not None:
@@ -106,7 +106,7 @@ def test_generate_id(prefix: str, expected_ids: List[str]) -> None:
         pytest.param(None, "TRNAA-UAA", False, id="Trna ID, upper case"),
     ],
 )
-def test_valid_id(min_id_length: Optional[int], test_id: str, outcome: bool) -> None:
+def test_valid_id(min_id_length: int | None, test_id: str, outcome: bool) -> None:
     """Test ID validity check."""
     ids = StableIDAllocator()
     if min_id_length is not None:
@@ -139,7 +139,7 @@ def test_valid_id_skip(test_id: str, skip_flag: bool, outcome: bool) -> None:
         pytest.param("LOREM-IPSUM1", ["LOREM-", "IPSUM"], "IPSUM1", id="Only 1 prefix is removed"),
     ],
 )
-def test_remove_prefixes(test_id: str, prefixes: List[str], outcome: str) -> None:
+def test_remove_prefixes(test_id: str, prefixes: list[str], outcome: str) -> None:
     """Test prefix removal."""
     assert StableIDAllocator.remove_prefix(test_id, prefixes) == outcome
@@ -168,7 +168,7 @@ def test_normalize_cds_id(test_id: str, outcome: str) -> None:
         pytest.param("LOREM-IPSUM1", [1, 1], ["LOREM-IPSUM1_t1", "LOREM-IPSUM1_t1"], id="Same number (!)"),
     ],
 )
-def test_normalize_transcript_id(test_id: str, numbers: List[int], outcomes: List[str]) -> None:
+def test_normalize_transcript_id(test_id: str, numbers: list[int], outcomes: list[str]) -> None:
     """Test transcript id normalization."""
     new_ids = []
     for number in numbers:
@@ -187,7 +187,7 @@ def test_normalize_transcript_id(test_id: str, numbers: List[int], outcomes: Lis
     ],
 )
 def test_normalize_pseudogene_cds_id(
-    tmp_path: Path, data_dir: Path, input_gff: str, expected_gff: str
+    tmp_path: Path, data_dir: Path, input_gff: str, expected_gff: str,
 ) -> None:
     """Test pseudogene CDS ID normalization."""
     ids = StableIDAllocator()
@@ -219,7 +219,7 @@ def test_normalize_pseudogene_cds_id(
     ],
 )
 def test_normalize_gene_id(
-    data_dir: Path, input_gff: str, expected_id: str, make_id: Optional[bool], expected: ContextManager
+    data_dir: Path, input_gff: str, expected_id: str, make_id: bool | None, expected: ContextManager,
 ) -> None:
     """Test gene ID normalization."""
     ids = StableIDAllocator()
@@ -241,12 +241,12 @@ def test_normalize_gene_id(
     "input_gff, expected_ids, expected",
     [
         pytest.param(
-            "geneid_GeneID2.gff3", ["GeneID_000001", "GeneID_000001_2"], does_not_raise(), id="Same GeneIDs"
+            "geneid_GeneID2.gff3", ["GeneID_000001", "GeneID_000001_2"], does_not_raise(), id="Same GeneIDs",
         ),
     ],
 )
 def test_normalize_gene_id_duplicate(
-    data_dir: Path, input_gff: str, expected_ids: List[str], expected: ContextManager
+    data_dir: Path, input_gff: str, expected_ids: list[str], expected: ContextManager,
 ) -> None:
     """Test gene ID normalization with duplicate Gene ID features."""
     ids = StableIDAllocator()
diff --git a/src/python/tests/gff3/test_records.py b/src/python/tests/gff3/test_records.py
index 0f282fe47..209ae0796 100644
--- a/src/python/tests/gff3/test_records.py
+++ b/src/python/tests/gff3/test_records.py
@@ -17,7 +17,7 @@
 from contextlib import nullcontext as no_raise
 from os import PathLike
 from pathlib import Path
-from typing import Callable, ContextManager, List, Optional
+from typing import Callable, ContextManager

 import pytest
 from pytest import param, raises
@@ -37,8 +37,8 @@
 def test_from_gff(
     data_dir: Path,
     in_gff: PathLike,
-    excluded: Optional[List[str]],
-    expected_loaded: List[str],
+    excluded: list[str] | None,
+    expected_loaded: list[str],
     expectation: ContextManager,
 ) -> None:
     """Test loading GFF records from file."""
diff --git a/src/python/tests/gff3/test_restructure.py b/src/python/tests/gff3/test_restructure.py
index 89355f621..139e85a60 100644
--- a/src/python/tests/gff3/test_restructure.py
+++ b/src/python/tests/gff3/test_restructure.py
@@ -15,7 +15,7 @@
 """Unit testing of `ensembl.io.genomio.gff3.restructure` module."""

 from contextlib import nullcontext as does_not_raise
-from typing import Any, ContextManager, Dict, List, Union
+from typing import Any, ContextManager

 from Bio.SeqFeature import SimpleLocation
 import pytest
@@ -27,6 +27,7 @@


 class FeatGenerator:
+    """Generates features and structures for testing."""

     start = 1
@@ -35,17 +36,17 @@ class FeatGenerator:
     region = "LOREM"
     source = "Foo"

-    def make(self, ftype: str, number: int = 1) -> List[GFFSeqFeature]:
+    def make(self, ftype: str, number: int = 1) -> list[GFFSeqFeature]:
         """Returns a list with a defined number of features of a given type."""
         feats = []
-        for _ in range(0, number):
+        for _ in range(number):
             loc = SimpleLocation(self.start, self.end, self.strand)
             feat = GFFSeqFeature(loc, type=ftype)
             feat.qualifiers["source"] = self.source
             feats.append(feat)
         return feats

-    def make_structure(self, children: List[Any]) -> List[GFFSeqFeature]:
+    def make_structure(self, children: list[Any]) -> list[GFFSeqFeature]:
         """Returns a list of SeqFeature children built from a structure of the form:
         struct = ["mRNA"]
         struct = [{"mRNA": ["CDS", "exon"]}, "exon", "exon"]
@@ -64,7 +65,7 @@ def make_structure(self, children: List[Any]) -> List[GFFSeqFeature]:

         return output

-    def get_sub_structure(self, feat: GFFSeqFeature) -> Union[Dict, str]:
+    def get_sub_structure(self, feat: GFFSeqFeature) -> dict | str:
         """Create a children structure from a SeqFeature."""
         if feat.sub_features:
             feat_subs = []
@@ -82,11 +83,11 @@ def get_sub_structure(self, feat: GFFSeqFeature) -> Union[Dict, str]:
         param([{"gene": ["mRNA", "mRNA"]}], [{"gene": ["mRNA", "mRNA"]}], id="gene + 2 mRNA"),
         param(["ncRNA_gene"], ["ncRNA_gene"], id="1 ncRNA_gene, no transcript"),
         param(
-            [{"ncRNA_gene": ["transcript"]}], [{"ncRNA_gene": ["transcript"]}], id="1 ncRNA_gene + transcript"
+            [{"ncRNA_gene": ["transcript"]}], [{"ncRNA_gene": ["transcript"]}], id="1 ncRNA_gene + transcript",
         ),
     ],
 )
-def test_add_transcript_to_naked_gene(children: List[Any], expected_children: List[Any]) -> None:
+def test_add_transcript_to_naked_gene(children: list[Any], expected_children: list[Any]) -> None:
     """Test the creation of a transcript for a gene without one."""
     gen = FeatGenerator()
     genes = gen.make_structure(children)
@@ -109,8 +110,8 @@ def test_add_transcript_to_naked_gene(children: List[Any], expected_children: Li
     ],
 )
 def test_move_only_cdss_to_new_mrna(
-    children: List[str],
-    expected_children: Dict[str, int],
+    children: list[str],
+    expected_children: dict[str, int],
 ) -> None:
     """Test the creation of a new mRNA for CDSs under a gene."""
     gen = FeatGenerator()
@@ -131,8 +132,8 @@
     ],
 )
 def test_move_only_exons_to_new_mrna(
-    children: List[str],
-    expected_children: Dict[str, int],
+    children: list[str],
+    expected_children: dict[str, int],
 ) -> None:
     """Test the creation of a new mRNA for exons under a gene."""
     gen = FeatGenerator()
@@ -216,15 +217,16 @@ def test_move_only_exons_to_new_mrna(
     ],
 )
 def test_move_cds_to_existing_mrna(
-    children: List[str],
+    children: list[str],
     diff_exon: bool,
-    expected_children: Dict[str, int],
+    expected_children: dict[str, int],
     expectation: ContextManager,
 ) -> None:
     """Test moving CDSs under a gene into the gene's existing mRNA.
     Args:
         diff_exon: Use exons with different coordinates than the CDSs.
+
     """
     gen = FeatGenerator()
     gene = gen.make("gene", 1)[0]
@@ -279,12 +281,13 @@
     ],
 )
 def test_remove_extra_exons(
-    children: List[Any], has_id: int, expected_children: List[Any], expectation: ContextManager
+    children: list[Any], has_id: int, expected_children: list[Any], expectation: ContextManager,
 ) -> None:
     """Test removing extra unneeded exons.

     Args:
         has_id: Add an ID starting with 'id-' for this number of exons (if any).
+
     """
     gen = FeatGenerator()
     gene = gen.make("gene", 1)[0]
@@ -324,10 +327,9 @@
     ],
 )
 def test_restructure_gene(
-    children: List[Any], expected_children: List[Any], expectation: ContextManager
+    children: list[Any], expected_children: list[Any], expectation: ContextManager,
 ) -> None:
     """Test the `restructure_gene()` main function."""
-
     gen = FeatGenerator()
     gene = gen.make("gene", 1)[0]
     gene.sub_features += gen.make_structure(children)
@@ -343,7 +345,7 @@
         param("pseudogene", "pseudogene", id="pseudogene"),
         param({"pseudogene": ["mRNA"]}, {"pseudogene": ["mRNA"]}, id="pseudogene mRNA"),
         param(
-            {"pseudogene": [{"mRNA": ["CDS", "CDS"]}]}, {"pseudogene": ["mRNA"]}, id="pseudogene mRNA CDSs"
+            {"pseudogene": [{"mRNA": ["CDS", "CDS"]}]}, {"pseudogene": ["mRNA"]}, id="pseudogene mRNA CDSs",
         ),
         param(
             {"pseudogene": [{"mRNA": ["CDS", "exon"]}]},
@@ -353,7 +355,7 @@
         param({"pseudogene": ["CDS", "CDS"]}, "pseudogene", id="pseudogene CDSs"),
     ],
 )
-def test_remove_cds_from_pseudogene(children: List[Any], expected_children: List[Any]) -> None:
+def test_remove_cds_from_pseudogene(children: list[Any], expected_children: list[Any]) -> None:
     """Test CDS removal from pseudogene."""
     gen = FeatGenerator()
     gene = gen.make_structure([children])[0]
diff --git a/src/python/tests/gff3/test_simplifier.py b/src/python/tests/gff3/test_simplifier.py
index 2c660fda3..580b12627 100644
--- a/src/python/tests/gff3/test_simplifier.py
+++ b/src/python/tests/gff3/test_simplifier.py
@@ -18,7 +18,7 @@
 from contextlib import nullcontext as does_not_raise
 from os import PathLike
 from pathlib import Path
-from typing import Callable, ContextManager, Dict, Optional
+from typing import Callable, ContextManager

 import pytest
 from pytest import param, raises
@@ -53,7 +53,7 @@
         ),
     ],
 )
-def test_get_provider_name(tmp_path: Path, genome_meta: Dict, expected_provider_name: str) -> None:
+def test_get_provider_name(tmp_path: Path, genome_meta: dict, expected_provider_name: str) -> None:
     """Tests `GFFSimplifier.get_provider_name()`."""
     # Write metadata file
     meta_path = tmp_path / "meta.json"
@@ -73,7 +73,7 @@
         ),
     ],
 )
-def test_init_provider_name(tmp_path: Path, genome_meta: Dict, expected_provider_name: str) -> None:
+def test_init_provider_name(tmp_path: Path, genome_meta: dict, expected_provider_name: str) -> None:
     """Tests `GFFSimplifier.__init__` to set the `provider_name` to its `FunctionalAnnotations` attrib."""
     # Write metadata file
     meta_path = tmp_path / "meta.json"
@@ -202,10 +202,10 @@
 )
 def test_normalize_non_gene(
     in_type: str,
-    in_mobile_type: Optional[str],
-    in_product: Optional[str],
+    in_mobile_type: str | None,
+    in_product: str | None,
     out_type: str,
-    out_description: Optional[str],
+    out_description: str | None,
     expectation: ContextManager,
 ) -> None:
     """Test non-gene normalization."""
@@ -330,7 +330,7 @@ def test_simpler_gff3_feature(
     tmp_path: Path,
     assert_files: Callable,
     in_gff: PathLike,
-    expected_gff: Optional[PathLike],
+    expected_gff: PathLike | None,
     expectation: ContextManager,
 ) -> None:
     """Test simplifying one gene (from a GFF3 file)."""
@@ -460,7 +460,7 @@ def test_gffsimplifier_with_genome(
     data_dir: Path,
     tmp_path: Path,
     assert_files: Callable,
-    genome_file: Optional[PathLike],
+    genome_file: PathLike | None,
     in_gff: PathLike,
     expected_gff: PathLike,
 ) -> None:
diff --git a/src/python/tests/manifest/test_check_integrity.py b/src/python/tests/manifest/test_check_integrity.py
index 85d3024fc..ed30cfb96 100644
--- a/src/python/tests/manifest/test_check_integrity.py
+++ b/src/python/tests/manifest/test_check_integrity.py
@@ -31,7 +31,7 @@
     ],
 )
 def test_check_integrity(
-    data_dir: Path, manifest_file: str, ignore_false_stops: bool, expected: ContextManager
+    data_dir: Path, manifest_file: str, ignore_false_stops: bool, expected: ContextManager,
 ) -> None:
     """Tests the `IntegrityTool.check_integrity()` method.
diff --git a/src/python/tests/manifest/test_manifest.py b/src/python/tests/manifest/test_manifest.py
index c0a75b19e..a32d2c242 100644
--- a/src/python/tests/manifest/test_manifest.py
+++ b/src/python/tests/manifest/test_manifest.py
@@ -29,7 +29,7 @@
 _CONTENT_MD5 = "45685e95985e20822fb2538a522a5ccf"


-@pytest.mark.dependency()
+@pytest.mark.dependency
 def test_init(tmp_path: Path) -> None:
     """Tests `Manifest.__init__()`."""
     _ = Manifest(tmp_path)
@@ -77,7 +77,7 @@ def test_get_files_checksum(tmp_path: Path, file_name: str, expected_name: str)
                     "link1": {"file": "link1.agp", "md5sum": _CONTENT_MD5},
                     "link2": {"file": "link2.agp", "md5sum": _CONTENT_MD5},
                     "link3": {"file": "link3.agp", "md5sum": _CONTENT_MD5},
-                }
+                },
             },
             no_raise(),
             id="3 agp files, different names",
@@ -88,7 +88,7 @@ def test_get_files_checksum(tmp_path: Path, file_name: str, expected_name: str)
                 "agp": {
                     "agp": {"file": "a_agp.agp", "md5sum": _CONTENT_MD5},
                     "agp.1": {"file": "b_agp.agp", "md5sum": _CONTENT_MD5},
-                }
+                },
             },
             no_raise(),
             id="2 agp files with same name",
@@ -103,7 +103,7 @@ def test_get_files_checksum(tmp_path: Path, file_name: str, expected_name: str)
 )
 @pytest.mark.dependency(depends=["test_init"])
 def test_get_files_checksum_multifiles(
-    tmp_path: Path, file_names: list[str], expected_content: dict, expected: ContextManager
+    tmp_path: Path, file_names: list[str], expected_content: dict, expected: ContextManager,
 ) -> None:
     """Tests `Manifest.get_files_checksum()` with several files for the same name.

@@ -113,6 +113,7 @@ def test_get_files_checksum_multifiles(
         file_names: List of files to create.
         expected_content: Expected checksum dict.
         expected: Expected exception.
+
     """
     for file_name in file_names:
         with Path(tmp_path / file_name).open("w") as fh:
@@ -158,7 +159,7 @@ def test_get_files_checksum_warning_empty(tmp_path: Path, caplog: LogCaptureFixt
     ],
 )
 @pytest.mark.dependency(depends=["test_init"])
 def test_create_manifest(
-    tmp_path: Path, assert_files: Callable, files: list[str], expected_content: dict
+    tmp_path: Path, assert_files: Callable, files: list[str], expected_content: dict,
 ) -> None:
     """Tests `Manifest.create()`.
@@ -187,7 +188,7 @@ def test_create_manifest(
     "files_dir, expected_files, expected",
     [
         param(
-            "full_data", {"functional_annotation", "seq_region"}, no_raise(), id="OK manifest with OK files"
+            "full_data", {"functional_annotation", "seq_region"}, no_raise(), id="OK manifest with OK files",
         ),
         param("duplicates", {"agp"}, no_raise(), id="Several files for key"),
         param("", {}, raises(ManifestError), id="No manifest to load"),
@@ -197,7 +198,7 @@
     ],
 )
 @pytest.mark.dependency(depends=["test_init"])
 def test_load(
-    tmp_path: Path, data_dir: Path, files_dir: str, expected_files: set, expected: ContextManager
+    tmp_path: Path, data_dir: Path, files_dir: str, expected_files: set, expected: ContextManager,
 ) -> None:
     """Tests `Manifest.load()`.

@@ -207,6 +208,7 @@ def test_load(
         files_dir: Directory where test data files are copied from.
         expected_files: Set of main files expected to be loaded.
         expected: Expected exception.
+
     """
     # Copy the files to the tmp folder
     if files_dir:
diff --git a/src/python/tests/manifest/test_manifest_stats.py b/src/python/tests/manifest/test_manifest_stats.py
index 46f0b0fdb..ed52f8ba6 100644
--- a/src/python/tests/manifest/test_manifest_stats.py
+++ b/src/python/tests/manifest/test_manifest_stats.py
@@ -73,6 +73,7 @@ def test_load_seq_regions(
         manifest_dir: Directory name with the manifest file in it.
         expected_lengths: Expected length data from the files.
         expected_circular: Expected circular data from the files.
+
     """
     if manifest_dir:
         seq_manifest = data_dir / manifest_dir / "manifest.json"
@@ -110,6 +111,7 @@ def test_load_dna_fasta_lengths(
         fasta_str: Content of a test input FASTA DNA file.
         expected_lengths: Expected length data from the files.
         expected_error: Expected errors while loading.
+
     """
     fasta_path = tmp_path / "fasta_dna.fasta"
     with fasta_path.open("w") as fasta_fh:
@@ -161,6 +163,7 @@ def test_load_peptides_fasta_lengths(
         ignore_final_stops: Ignore final stops for the protein sequences.
         expected_lengths: Expected length data from the files.
         expected_error: Expected errors while loading.
+
     """
     fasta_path = tmp_path / "fasta_pep.fasta"
     with fasta_path.open("w") as fasta_fh:
@@ -184,7 +187,7 @@ def test_load_peptides_fasta_lengths(
         param({}, "", {}, id="Empty JSON"),
         param({"id": "gene1", "object_type": "gene"}, "ann_genes", {"gene1": 1}, id="1 gene"),
         param(
-            {"id": "pep1", "object_type": "translation"}, "ann_translations", {"pep1": 1}, id="1 translation"
+            {"id": "pep1", "object_type": "translation"}, "ann_translations", {"pep1": 1}, id="1 translation",
         ),
         param(
             {"id": "te1", "object_type": "transposable_element"},
@@ -209,6 +212,7 @@ def test_load_functional_annotations(
         ignore_final_stops: Ignore final stops for the protein sequences.
         expected_lengths: Expected length data from the files.
         expected_error: Expected errors while loading.
+
     """
     func_path = tmp_path / "functional_annotation.json"
     if json_data is not None:
@@ -269,6 +273,7 @@ def test_load_gff3(
     Args:
         gff3_path: Path to a test GFF3 file.
         expected_data: Expected length data extracted from the GFF3 file.
+
     """
     if gff3_path:
         copy(data_dir / gff3_path, tmp_path / "test.gff3")
@@ -325,7 +330,7 @@ def test_prepare_integrity_data(tmp_path: Path) -> None:
     ],
 )
 def test_has_lengths(
-    manifest_path: Path, key: str, expected_data: dict[str, bool], expectation: ContextManager
+    manifest_path: Path, key: str, expected_data: dict[str, bool], expectation: ContextManager,
 ) -> None:
     """Tests `ManifestStats.has_lengths()`.
@@ -335,6 +340,7 @@ def test_has_lengths(
         key: Key for the length dict.
         expected_data: Whether lengths exist for that key.
         expectation: Expected exception.
+
     """
     stats = ManifestStats(manifest_path)
     stats.load_functional_annotation()
@@ -350,7 +356,7 @@ def test_has_lengths(
     ],
 )
 def test_get_lengths(
-    manifest_path: Path, key: str, expected_data: dict[str, bool], expectation: ContextManager
+    manifest_path: Path, key: str, expected_data: dict[str, bool], expectation: ContextManager,
 ) -> None:
     """Tests `ManifestStats.get_lengths()`.

@@ -360,6 +366,7 @@ def test_get_lengths(
         key: Key for the length dict.
         expected_data: Expected length information for that key.
         expectation: Expected exception.
+
     """
     stats = ManifestStats(manifest_path)
     stats.load_functional_annotation()
@@ -375,7 +382,7 @@ def test_get_lengths(
     ],
 )
 def test_get_circular(
-    manifest_path: Path, key: str, expected_data: dict[str, bool], expectation: ContextManager
+    manifest_path: Path, key: str, expected_data: dict[str, bool], expectation: ContextManager,
 ) -> None:
     """Tests `ManifestStats.get_circular()`.

@@ -385,6 +392,7 @@ def test_get_circular(
         key: Key for the circular dict.
         expected_data: Expected circular information for that key.
         expectation: Expected exception.
+
     """
     stats = ManifestStats(manifest_path)
     stats.load_seq_regions()
diff --git a/src/python/tests/schemas/test_json.py b/src/python/tests/schemas/test_json.py
index 7315a8f2d..f7c95d8fa 100644
--- a/src/python/tests/schemas/test_json.py
+++ b/src/python/tests/schemas/test_json.py
@@ -25,7 +25,7 @@
 from contextlib import nullcontext as does_not_raise
 from os import PathLike
 from pathlib import Path
-from typing import ContextManager, List
+from typing import ContextManager

 from jsonschema.exceptions import ValidationError
 import pytest
@@ -45,7 +45,7 @@
     ],
 )
 def test_schema_factory(
-    tmp_path: Path, data_dir: Path, metadata_types: List[str], output: List[PathLike]
+    tmp_path: Path, data_dir: Path, metadata_types: list[str], output: list[PathLike],
 ) -> None:
     """Tests the `schema_factory()` method.
diff --git a/src/python/tests/seq_region/test_collection.py b/src/python/tests/seq_region/test_collection.py
index 4cb072400..8ddde5dde 100644
--- a/src/python/tests/seq_region/test_collection.py
+++ b/src/python/tests/seq_region/test_collection.py
@@ -102,7 +102,7 @@ def test_from_gbff(data_dir: Path) -> None:
             "length": 60,
             "location": "apicoplast_chromosome",
             "synonyms": [{"name": "U87145", "source": "INSDC"}],
-        }
+        },
     }
     collection = SeqCollection()
     collection.from_gbff(data_dir / gb_file)
@@ -169,7 +169,7 @@ def test_from_gbff(data_dir: Path) -> None:
     ],
 )
 def test_make_seqregion_from_report(
-    seq_data: dict, is_refseq: bool, expected_key: str, expected_value: Any | None, expected: ContextManager
+    seq_data: dict, is_refseq: bool, expected_key: str, expected_value: Any | None, expected: ContextManager,
 ) -> None:
     """Test `SeqCollection.make_seq_region_from_report()`.

     Args:
@@ -179,6 +179,7 @@ def test_make_seqregion_from_report(
         expected_key: Check this key.
         expected_value: Check this value for the key.
         expected: Context manager to catch expected exceptions.
+ """ collection = SeqCollection() input_data = {} @@ -203,7 +204,7 @@ def test_make_seqregion_from_report_custom() -> None: custom_locations = {"chromosome": "custom_chromosome"} custom_synonyms = {"Sequence-Name": "custom_name"} seq_dict = collection.make_seq_region_from_report( - input_data, is_refseq=True, molecule_location=custom_locations, synonym_map=custom_synonyms + input_data, is_refseq=True, molecule_location=custom_locations, synonym_map=custom_synonyms, ) assert seq_dict["location"] == "custom_chromosome" assert seq_dict["synonyms"] == [{"source": "custom_name", "name": "seq_name"}] @@ -224,7 +225,7 @@ def test_from_report(data_dir: Path) -> None: {"name": "TGME49_chrIa", "source": "INSDC_submitted_name"}, {"name": "NC_031467.1", "source": "RefSeq"}, ], - } + }, } collection = SeqCollection() collection.from_report(data_dir / report_file) @@ -253,7 +254,7 @@ def test_remove() -> None: ], ) def test_add_translation_table( - input_seq: dict[str, str], code_map: dict[str, int] | None, expected_codon_table: int | None + input_seq: dict[str, str], code_map: dict[str, int] | None, expected_codon_table: int | None, ) -> None: """Test `SeqCollection.add_translation_table()`. @@ -261,6 +262,7 @@ def test_add_translation_table( input_seq: Sequence dict with usable values (`codon_table`, `location`). code_map: A custom map location -> codon table number. expected_codon_table: Expected codon table number. + """ collection = SeqCollection() seq_name = "foobar" @@ -322,8 +324,8 @@ def test_add_mitochondrial_codon_table( response_data: Return data from the request. expected_codon_table: Expected codon table after update. expected: Context manager to catch expected exceptions. - """ + """ mock_requests_get.return_value = mock_response(response_data) collection = SeqCollection() seq_name = "foobar" diff --git a/src/python/tests/seq_region/test_dump.py b/src/python/tests/seq_region/test_dump.py index 7728e2041..ca196cc49 100644 --- a/src/python/tests/seq_region/test_dump.py +++ b/src/python/tests/seq_region/test_dump.py @@ -114,7 +114,7 @@ def _add_test_synonym(session: Session, dialect: str, synonym: str, db_name: str def _add_test_attrib( - session: Session, dialect: str, logic_name: str, value: str | int, attrib_id: int + session: Session, dialect: str, logic_name: str, value: str | int, attrib_id: int, ) -> None: """Add a seq_region attrib to the test seq_region. @@ -138,7 +138,7 @@ def _add_test_attrib( def _add_test_karyotype( - session: Session, dialect: str, start: int, end: int, band: str | None = None, stain: str | None = None + session: Session, dialect: str, start: int, end: int, band: str | None = None, stain: str | None = None, ) -> None: """Add a seq_region karyotype band to the test seq_region. @@ -240,7 +240,7 @@ def test_get_karyotype(seq_test_db: UnitTestDB, bands: list, expected_kar: dict) param({"toplevel": 1, "circular_seq": 1}, {"circular": 1}, id="circular"), param({"toplevel": 1, "circular_seq": 0}, {}, id="not circular"), param( - {"toplevel": 1, "coord_system_tag": "contig"}, {"coord_system_level": "contig"}, id="contig level" + {"toplevel": 1, "coord_system_tag": "contig"}, {"coord_system_level": "contig"}, id="contig level", ), ], ) @@ -306,7 +306,7 @@ def test_get_seq_regions_attribs( ], ) def test_get_added_sequence( - seq_test_db: UnitTestDB, attribs: dict[str, str], expected_output: dict[str, str | dict] + seq_test_db: UnitTestDB, attribs: dict[str, str], expected_output: dict[str, str | dict], ) -> None: """Tests the `get_added_sequences` method. 
diff --git a/src/python/tests/seq_region/test_gbff.py b/src/python/tests/seq_region/test_gbff.py
index fadcdb3fc..5a601dbe5 100644
--- a/src/python/tests/seq_region/test_gbff.py
+++ b/src/python/tests/seq_region/test_gbff.py
@@ -93,7 +93,7 @@ def test_get_codon_table(data_dir: Path, input_gb: str, expected_table: str | No
     ],
 )
 def test_get_organelle(
-    data_dir: Path, input_gb: str, expected_location: str | None, expectation: ContextManager
+    data_dir: Path, input_gb: str, expected_location: str | None, expectation: ContextManager,
 ) -> None:
     """Test for `gbff.get_organelle()`.
diff --git a/src/python/tests/seq_region/test_prepare.py b/src/python/tests/seq_region/test_prepare.py
index 101d3ad96..e4a533e9d 100644
--- a/src/python/tests/seq_region/test_prepare.py
+++ b/src/python/tests/seq_region/test_prepare.py
@@ -50,6 +50,7 @@ def test_prepare_seq_region_metadata(
         gbff_path: Input GBFF file, if any.
         expected_path: Expected JSON output.
         to_exclude: List of sequences to exclude.
+
     """
     gbff_file = None
     if gbff_path: