diff --git a/src/api/endpoints.py b/src/api/endpoints.py
index f7a853c..04f4bbc 100644
--- a/src/api/endpoints.py
+++ b/src/api/endpoints.py
@@ -1,3 +1,5 @@
+import asyncio
+import json
 import os
 from typing import Dict, List
@@ -7,8 +9,6 @@ from pydantic import BaseModel
 
 from api.sessions import session_manager
-from core.input import InputData
-from core.results import analyse


 class InputSchema(BaseModel):
@@ -21,11 +21,26 @@ class InputSchema(BaseModel):
 router = APIRouter()


+async def run_cli_command(command: list):
+    process = await asyncio.create_subprocess_exec(
+        *command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+    )
+
+    stdout, stderr = await process.communicate()
+
+    stdout = stdout.decode().strip()
+    stderr = stderr.decode().strip()
+
+    if process.returncode != 0:
+        raise RuntimeError(
+            f"CLI command failed with return code {process.returncode}: {stderr}"
+        )
+
+    return stdout
+
+
 @router.post("/init")
-async def initialize(
-    input_data: InputSchema,
-    background_tasks: BackgroundTasks,
-) -> JSONResponse:
+async def initialize(input_data: InputSchema) -> JSONResponse:
     """
     Initialize the analysis process.
@@ -53,20 +68,29 @@ async def initialize(
         )

     session_id, result_dir = session_manager.new()
-    data = InputData(
-        nodesdb_f=session_manager.nodesdb_f,
-        go_mapping_f=session_manager.go_mapping_f,
-        pfam_mapping_f=session_manager.pfam_mapping_f,
-        sequence_ids_file=session_manager.sequence_ids_f,
-        ipr_mapping_f=session_manager.ipr_mapping_f,
-        cluster_file=session_manager.cluster_f,
-        config_data=input_data.config,
-        taxon_idx_mapping_file=session_manager.taxon_idx_mapping_file,
-        output_path=result_dir,
-        plot_format="png",  # as we require images
-    )
-
-    background_tasks.add_task(analyse, data)
+    os.makedirs(result_dir, exist_ok=True)
+    config_f = os.path.join(result_dir, "config.json")
+
+    with open(config_f, "w") as file:
+        json.dump(input_data.config, file)
+
+    command = [
+        "python",
+        "src/main.py",
+        "analyse",
+        "-g",
+        session_manager.cluster_f,
+        "-c",
+        config_f,
+        "-s",
+        session_manager.sequence_ids_f,
+        "-m",
+        session_manager.taxon_idx_mapping_file,
+        "-o",
+        result_dir,
+    ]
+
+    asyncio.create_task(run_cli_command(command))

     return JSONResponse(
         content={"detail": "Analysis task has been queued."},
diff --git a/src/cli/commands.py b/src/cli/commands.py
index ac55f2e..6a8d176 100644
--- a/src/cli/commands.py
+++ b/src/cli/commands.py
@@ -1,5 +1,6 @@
 import argparse
 import sys
+from typing import Union

 from cli.validate import validate_cli_args
 from core.config import SUPPORTED_PLOT_FORMATS, SUPPORTED_TAXRANKS, SUPPORTED_TESTS
@@ -13,7 +14,7 @@ def parse_args(
     pfam_mapping_f: str,
     ipr_mapping_f: str,
     go_mapping_f: str,
-) -> ServeArgs | InputData:
+) -> Union[ServeArgs, InputData]:
     """Parse command-line arguments.

     Args:
@@ -68,6 +69,9 @@ def parse_args(
     other_files_group.add_argument(
         "-p", "--species_ids_file", help="SpeciesIDs.txt used in OrthoFinder"
     )
+    other_files_group.add_argument(
+        "-m", "--taxon_idx_mapping", help="TAXON IDX Mapping File"
+    )
     other_files_group.add_argument(
         "-f",
         "--functional_annotation",
@@ -179,7 +183,7 @@ def parse_args(
         return InputData(
             cluster_file=args.cluster_file,
-            config_data=args.config_file,
+            config_f=args.config_file,
             sequence_ids_file=args.sequence_ids_file,
             species_ids_file=args.species_ids_file,
             functional_annotation_f=args.functional_annotation,
@@ -202,6 +206,7 @@ def parse_args(
             pfam_mapping_f=pfam_mapping_f,
             ipr_mapping_f=ipr_mapping_f,
             go_mapping_f=go_mapping_f,
+            taxon_idx_mapping_file=args.taxon_idx_mapping,
         )
     else:
         sys.exit()
diff --git a/src/core/alo.py b/src/core/alo.py
index e61846a..378c83c 100644
--- a/src/core/alo.py
+++ b/src/core/alo.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Literal, Optional, Set
+from typing import Dict, List, Literal, Optional, Set, Union

 from core.clusters import Cluster
@@ -33,7 +33,7 @@ def __init__(self, attribute: str, level: str, proteomes: Set[str]) -> None:
             "shared": [],
         }
-        self.protein_span_by_cluster_type: Dict[str, List[int | float]] = {
+        self.protein_span_by_cluster_type: Dict[str, List[Union[int, float]]] = {
             "singleton": [],
             "specific": [],
             "shared": [],
         }
@@ -59,7 +59,9 @@ def __init__(self, attribute: str, level: str, proteomes: Set[str]) -> None:
         self.domain_counter_by_domain_source_by_cluster_type = None
         self.protein_with_domain_count_by_domain_source_by_cluster_type = None

-        self.protein_length_stats_by_cluster_id: Dict[str, Dict[str, int | float]] = {}
+        self.protein_length_stats_by_cluster_id: Dict[
+            str, Dict[str, Union[int, float]]
+        ] = {}
         self.protein_count_by_cluster_id: Dict[str, int] = {}

     def add_cluster(
@@ -67,7 +69,7 @@ def add_cluster(
         cluster: Cluster,
         attribute_cluster_type: Literal["singleton", "shared", "specific"],
         ALO_cluster_status: Literal["absent", "present"],
-        ALO_protein_length_stats: Dict[str, int | float],
+        ALO_protein_length_stats: Dict[str, Union[int, float]],
         ALO_protein_ids_in_cluster: List[str],
         ALO_cluster_cardinality: Optional[str],
         mwu_pvalue: Optional[float],
@@ -84,7 +86,7 @@ def add_cluster(
                 Type of the cluster as either 'singleton', 'shared', or 'specific'.
             ALO_cluster_status (Literal["absent", "present"]):
                 Status of the cluster, either 'absent' or 'present'.
-            ALO_protein_length_stats (Dict[str, int | float]):
+            ALO_protein_length_stats (Dict[str, Union[int, float]]):
                 Length statistics of proteins in the cluster.
             ALO_protein_ids_in_cluster (List[str]):
                 List of protein IDs present in the cluster.
@@ -192,7 +194,7 @@ def get_cluster_count_by_cluster_status_by_cluster_type(
             ]
         )

-    def get_protein_span_by_cluster_type(self, cluster_type: str) -> int | float:
+    def get_protein_span_by_cluster_type(self, cluster_type: str) -> Union[int, float]:
         """
         Get the total span of proteins for a specific cluster type.
@@ -201,7 +203,7 @@ def get_protein_span_by_cluster_type(self, cluster_type: str) -> int | float:
                 Use "total" to get the total span across all cluster types.

         Returns:
-            int | float: Total span of proteins in the specified cluster type.
+            Union[int, float]: Total span of proteins in the specified cluster type.
                 If 'cluster_type' is "total", returns the sum of spans across all cluster types.
""" diff --git a/src/core/alo_collections.py b/src/core/alo_collections.py index 05b7022..b7712fb 100644 --- a/src/core/alo_collections.py +++ b/src/core/alo_collections.py @@ -186,7 +186,7 @@ def generate_header_for_node(self, node: ete3.TreeNode, dirs: Dict[str, str]): table.scale(2, 1) for key, cell in list(table.get_celld().items()): row, col = key - cell._text.set_color("grey") + cell._text.set_color("grey") # type:ignore cell.set_edgecolor("darkgrey") cell.visible_edges = "T" if row > 0 else "B" if row == len(data) - 2: diff --git a/src/core/build.py b/src/core/build.py index 7a56fe0..d35530c 100644 --- a/src/core/build.py +++ b/src/core/build.py @@ -1,5 +1,7 @@ -from collections import Counter -from typing import Any, Dict, List, Optional +from collections import Counter, OrderedDict, defaultdict +import json +import os +from typing import Any, Dict, List, Optional, Set, Union from ete3 import Tree @@ -7,8 +9,7 @@ from core.clusters import Cluster, ClusterCollection from core.logic import ( add_taxid_attributes, - parse_attributes_from_config_file, - parse_attributes_from_json, + parse_attributes_from_config_data, parse_fasta_dir, parse_go_mapping, parse_ipr_mapping, @@ -56,33 +57,107 @@ def get_singletons( def parse_cluster_file( + output_dir: str, cluster_f: str, proteinCollection: ProteinCollection, + available_proteomes: Set[str], ) -> List[Cluster]: """ Parses a cluster file to create Cluster objects and updates protein information. + Saves the filtered clustering data and stats to files. Args: + output_dir (str): Base directory path for saving files. cluster_f (str): Path to the cluster file. proteinCollection (ProteinCollection): Collection of Protein objects. + available_proteomes (Set[str]): Set of all available proteomes. Returns: - List[Cluster]: List of Cluster objects created from the file. + Tuple[List[Cluster], Dict[str, any]]: List of Cluster objects and stats. Raises: FileNotFoundError: If the cluster file `cluster_f` does not exist. 
""" cluster_list: List[Cluster] = [] - with open(cluster_f) as fh: + stats = { + "total_clusters": 0, + "total_proteins": 0, + "total_proteomes": len(available_proteomes), + "filtered_clusters": 0, + "filtered_proteins": 0, + "included_proteins": [], + "excluded_proteins": [], + "included_proteomes": defaultdict(int), + "excluded_proteomes": defaultdict(int), + } + + output_filtered_file = os.path.join(output_dir, "filtered_orthogroups.txt") + stats_file = os.path.join(output_dir, "filtering_summary.json") + + with open(cluster_f) as fh, open(output_filtered_file, "w") as ofh: for line in fh: + stats["total_clusters"] += 1 temp: List[str] = line.rstrip("\n").split(" ") cluster_id, protein_ids = temp[0].replace(":", ""), temp[1:] protein_ids = [protein_id for protein_id in protein_ids if protein_id] - cluster = Cluster(cluster_id, protein_ids, proteinCollection) + + filtered_protein_ids = [] for protein_id in protein_ids: - protein = proteinCollection.proteins_by_protein_id[protein_id] - protein.clustered = True - cluster_list.append(cluster) + proteome_id = protein_id.split(".")[0] # Extract proteome ID + if proteome_id in available_proteomes: + filtered_protein_ids.append(protein_id) + stats["included_proteins"].append(protein_id) + stats["included_proteomes"][proteome_id] += 1 + else: + stats["excluded_proteins"].append(protein_id) + stats["excluded_proteomes"][proteome_id] += 1 + + stats["total_proteins"] += len(protein_ids) + stats["filtered_proteins"] += len(filtered_protein_ids) + + if filtered_protein_ids: + # Only create a cluster if there are proteins left after filtering + cluster = Cluster(cluster_id, filtered_protein_ids, proteinCollection) + for protein_id in filtered_protein_ids: + protein = proteinCollection.proteins_by_protein_id[protein_id] + protein.clustered = True + cluster_list.append(cluster) + + ofh.write(f"{cluster_id}: {', '.join(filtered_protein_ids)}\n") + stats["filtered_clusters"] += 1 + + stats["included_proteins_count"] = len(set(stats["included_proteins"])) + stats["excluded_proteins_count"] = len(set(stats["excluded_proteins"])) + + # Convert proteome counts to lists of counts for JSON serialization + stats["included_proteomes"] = dict(stats["included_proteomes"]) + stats["excluded_proteomes"] = dict(stats["excluded_proteomes"]) + + # Reorder stats + ordered_stats = OrderedDict( + [ + ("total_clusters", stats["total_clusters"]), + ("total_proteins", stats["total_proteins"]), + ("total_proteomes", stats["total_proteomes"]), + ("filtered_clusters", stats["filtered_clusters"]), + ("filtered_proteins", stats["filtered_proteins"]), + ("included_proteins_count", stats["included_proteins_count"]), + ("excluded_proteins_count", stats["excluded_proteins_count"]), + ("included_proteomes", stats["included_proteomes"]), + ("excluded_proteomes", stats["excluded_proteomes"]), + ("included_proteins", stats["included_proteins"]), + ("excluded_proteins", stats["excluded_proteins"]), + ] + ) + + with open(stats_file, "w") as mf: + json.dump( + ordered_stats, + mf, + separators=(", ", ": "), + indent=4, + ) + return cluster_list @@ -155,12 +230,13 @@ def parse_domains_from_functional_annotations_file( proteinCollection.functional_annotation_parsed = True -# cli +# common def build_AloCollection( config_f: str, nodesdb_f: str, taxranks: List[str], tree_f: Optional[str], + taxon_idx_mapping_file: Optional[str], ) -> AloCollection: """ Builds an AloCollection object from command-line interface (CLI) inputs. 
@@ -179,8 +255,7 @@ def build_AloCollection(
         proteome_id_by_species_id,
         attributes,
         level_by_attribute_by_proteome_id,
-    ) = parse_attributes_from_config_file(config_f)
-
+    ) = parse_attributes_from_config_data(config_f, taxon_idx_mapping_file)
     # Add taxonomy if needed
     if "TAXID" in set(attributes):
         logger.info(
@@ -212,71 +287,9 @@ def build_AloCollection(
     )


-# api
-def build_AloCollection_from_json(
-    nodesdb_f: str,
-    taxranks: List[str],
-    json_list: List[Dict[str, str]],
-    taxon_idx_mapping_file: str,
-    tree_f: Optional[str],
-):
-    """
-    Builds an AloCollection object from API input.
-
-    Args:
-        json_list List[Dict[str,str]]: JSON list of attributes.
-        taxon_idx_mapping_file str: The path to the taxon-idx mapping file
-        nodesdb_f (str): Path to the nodes database file for inferring taxonomic ranks.
-        taxranks (List[str]): List of taxonomic ranks to be inferred.
-        tree_f (Optional[str]): Path to the tree file. If provided, ALOs are added from the tree.
-
-    Returns:
-        AloCollection: An instance of the AloCollection class containing parsed data.
-    """
-    (
-        proteomes,
-        proteome_id_by_species_id,
-        attributes,
-        level_by_attribute_by_proteome_id,
-    ) = parse_attributes_from_json(
-        json_list=json_list,
-        taxon_idx_mapping_file=taxon_idx_mapping_file,
-    )
-
-    # Add taxonomy if needed
-    if "TAXID" in set(attributes):
-        logger.info(
-            "[STATUS] - Attribute 'TAXID' found, inferring taxonomic ranks from nodesDB"
-        )
-        attributes, level_by_attribute_by_proteome_id = add_taxid_attributes(
-            attributes=attributes,
-            level_by_attribute_by_proteome_id=level_by_attribute_by_proteome_id,
-            nodesdb_f=nodesdb_f,
-            taxranks=taxranks,
-        )
-    # Add ALOs from tree if provided
-    tree_ete: Optional[Tree] = None
-    node_idx_by_proteome_ids: Optional[Dict[Any, Any]] = None
-    tree_ete, node_idx_by_proteome_ids = parse_tree_from_file(
-        tree_f,
-        attributes,
-        level_by_attribute_by_proteome_id,
-        proteomes,
-    )
-
-    logger.info("[STATUS] - Building AloCollection ...")
-    return AloCollection(
-        proteomes=proteomes,
-        attributes=attributes,
-        proteome_id_by_species_id=proteome_id_by_species_id,
-        level_by_attribute_by_proteome_id=level_by_attribute_by_proteome_id,
-        node_idx_by_proteome_ids=node_idx_by_proteome_ids,
-        tree_ete=tree_ete,
-    )
-
-
 def get_protein_list_from_seq_f(sequence_ids_f: str, aloCollection: AloCollection):
     logger.info(f"[STATUS] - Parsing sequence IDs: {sequence_ids_f} ...")
+    proteins_list: List[Protein] = []
     for line in yield_file_lines(sequence_ids_f):
         temp = line.split(": ")
@@ -293,9 +306,6 @@ def get_protein_list_from_seq_f(sequence_ids_f: str, aloCollectio
         if proteome_id := aloCollection.proteome_id_by_species_id.get(species_id, None):
             protein = Protein(protein_id, proteome_id, species_id, sequence_id)
             proteins_list.append(protein)
-        # else:
-        #     error_msg = f"[ERROR] - Offending SequenceID : {line} (unknown species_id {species_id})"
-        #     raise ValueError(error_msg)
     return proteins_list
@@ -359,12 +369,19 @@ def build_ProteinCollection(


 def build_ClusterCollection(
+    output_dir: str,
     cluster_f: str,
     proteinCollection: ProteinCollection,
     infer_singletons: Optional[bool],
+    available_proteomes: Set[str],
 ) -> ClusterCollection:
     logger.info(f"[STATUS] - Parsing {cluster_f} ... this may take a while")
-    cluster_list: List[Cluster] = parse_cluster_file(cluster_f, proteinCollection)
+    cluster_list: List[Cluster] = parse_cluster_file(
+        output_dir,
+        cluster_f,
+        proteinCollection,
+        available_proteomes,
+    )

     inferred_singletons_count = 0
     if infer_singletons:
diff --git a/src/core/clusters.py b/src/core/clusters.py
index eaac9d4..79cff89 100644
--- a/src/core/clusters.py
+++ b/src/core/clusters.py
@@ -86,7 +86,7 @@ def compute_protein_length_stats(
        (standard deviation) of protein lengths, if all lengths are available and
        at least one protein ID is provided. Returns None if no valid protein lengths are found.
    """
-    protein_lengths: List[int | None] = [
+    protein_lengths: List[Optional[int]] = [
        proteinCollection.proteins_by_protein_id[protein_id].length
        for protein_id in protein_ids
    ]
diff --git a/src/core/datastore.py b/src/core/datastore.py
index 6006e09..abb4c10 100644
--- a/src/core/datastore.py
+++ b/src/core/datastore.py
@@ -2,7 +2,7 @@
 import shutil
 import time
 from collections import Counter, defaultdict
-from typing import Any, Dict, FrozenSet, Generator, List, Optional, Set, Tuple
+from typing import Any, Dict, FrozenSet, Generator, List, Optional, Set, Tuple, Union

 import matplotlib as mat
 import matplotlib.pyplot as plt
@@ -14,7 +14,6 @@
 from core.alo_collections import AloCollection
 from core.build import (
     build_AloCollection,
-    build_AloCollection_from_json,
     build_ClusterCollection,
     build_ProteinCollection,
 )
@@ -39,24 +38,13 @@ class DataFactory:
     def __init__(self, inputData: InputData) -> None:
         self.dirs = {}
         self.inputData: InputData = inputData
-        if isinstance(self.inputData.config_data, str):
-            self.aloCollection: AloCollection = build_AloCollection(
-                config_f=self.inputData.config_data,
-                nodesdb_f=self.inputData.nodesdb_f,
-                tree_f=self.inputData.tree_f,
-                taxranks=self.inputData.taxranks,
-            )
-        elif self.inputData.taxon_idx_mapping_file is not None:
-            self.aloCollection: AloCollection = build_AloCollection_from_json(
-                nodesdb_f=self.inputData.nodesdb_f,
-                tree_f=self.inputData.tree_f,
-                taxranks=self.inputData.taxranks,
-                json_list=self.inputData.config_data,
-                taxon_idx_mapping_file=self.inputData.taxon_idx_mapping_file,
-            )
-        else:
-            raise ValueError("[ERROR] - Either provide config file or json")
-
+        self.aloCollection: AloCollection = build_AloCollection(
+            config_f=self.inputData.config_f,
+            nodesdb_f=self.inputData.nodesdb_f,
+            tree_f=self.inputData.tree_f,
+            taxranks=self.inputData.taxranks,
+            taxon_idx_mapping_file=self.inputData.taxon_idx_mapping_file,
+        )
         self.proteinCollection: ProteinCollection = build_ProteinCollection(
             aloCollection=self.aloCollection,
             fasta_dir=self.inputData.fasta_dir,
@@ -71,37 +59,39 @@ def __init__(self, inputData: InputData) -> None:
         )
         self.clusterCollection: ClusterCollection = build_ClusterCollection(
             cluster_f=self.inputData.cluster_f,
+            output_dir=self.inputData.output_path,
             proteinCollection=self.proteinCollection,
             infer_singletons=self.inputData.infer_singletons,
+            available_proteomes=self.aloCollection.proteomes,
         )

     def setup_dirs(self) -> None:
         """
         Set up output directories for storing results and attributes.
""" - output_path: Optional[str] = self.inputData.output_path - - if output_path: - if not os.path.isabs(output_path): - output_path = os.path.abspath(output_path) - else: - output_path = os.path.join(os.getcwd(), "kinfin_results") + output_path: str = self.inputData.output_path self.dirs["main"] = output_path logger.info("[STATUS] - Output directories in") logger.info(f"\t{output_path}") - if os.path.exists(output_path): - logger.info("[STATUS] - Directory exists. Deleting directory ...") - shutil.rmtree(output_path) + log_file_path = ( + os.path.join(output_path, "kinfin.log") + if os.path.exists(output_path) + else None + ) + if not os.path.exists(output_path): + logger.info("[STATUS] - Creating main output directory...") + os.makedirs(output_path) logger.info("[STATUS] - Creating directories ...") - os.mkdir(output_path) for attribute in self.aloCollection.attributes: attribute_path = os.path.join(output_path, attribute) self.dirs[attribute] = attribute_path if not os.path.exists(attribute_path): - logger.info(f"\t{attribute_path}") - os.mkdir(attribute_path) + logger.info( + f"[STATUS] - Creating directory for attribute: {attribute_path}" + ) + os.makedirs(attribute_path) if self.aloCollection.tree_ete is not None: tree_path = os.path.join(output_path, "tree") @@ -109,17 +99,23 @@ def setup_dirs(self) -> None: node_header_path = os.path.join(tree_path, "headers") if not os.path.exists(tree_path): - logger.info(f"\t{tree_path}") - os.mkdir(tree_path) + logger.info(f"[STATUS] - Creating tree directory: {tree_path}") + os.makedirs(tree_path) self.dirs["tree"] = tree_path - logger.info(f"\t{node_chart_path}") - os.mkdir(node_chart_path) + if not os.path.exists(node_chart_path): + logger.info( + f"[STATUS] - Creating node charts directory: {node_chart_path}" + ) + os.makedirs(node_chart_path) self.dirs["tree_charts"] = node_chart_path - if self.inputData.plot_tree: - logger.info(f"\t{node_header_path}") - os.mkdir(node_header_path) + if self.inputData.plot_tree: + if not os.path.exists(node_header_path): + logger.info( + f"[STATUS] - Creating node headers directory: {node_header_path}" + ) + os.makedirs(node_header_path) self.dirs["tree_headers"] = node_header_path def analyse_clusters(self) -> None: @@ -213,10 +209,18 @@ def plot_rarefaction_data( y_mins_array = np.array(y_mins) y_maxs_array = np.array(y_maxs) ax.plot( - median_x_values, median_y_values, "-", color=colour, label=level + median_x_values, + median_y_values, + "-", + color=colour, + label=level, ) ax.fill_between( - x_array, y_mins_array, y_maxs_array, color=colour, alpha=0.5 + x_array, + y_mins_array, # type:ignore + y_maxs_array, # type:ignore + color=colour, + alpha=0.5, ) ax.set_xlim([0, max_number_of_samples + 1]) ax.set_ylabel("Count of non-singleton clusters", fontsize=fontsize) @@ -387,7 +391,7 @@ def __process_level( attribute: str, level: str, protein_ids_by_level: Dict[str, List[str]], - protein_length_stats_by_level: Dict[str, Dict[str, int | float]], + protein_length_stats_by_level: Dict[str, Dict[str, Union[int, float]]], explicit_protein_count_by_proteome_id_by_level: Dict[str, Dict[str, int]], ) -> None: """ @@ -447,7 +451,7 @@ def __update_ALO_data( cluster: Cluster, attribute: str, protein_ids_by_level: Dict[str, List[str]], - protein_length_stats_by_level: Dict[str, Dict[str, int | float]], + protein_length_stats_by_level: Dict[str, Dict[str, Union[int, float]]], explicit_protein_count_by_proteome_id_by_level: Dict[str, Dict[str, int]], ) -> None: """ @@ -558,7 +562,7 @@ def 
__process_single_attribute(self, cluster: Cluster, attribute: str) -> None: None """ protein_ids_by_level: Dict[str, List[str]] = {} - protein_length_stats_by_level: Dict[str, Dict[str, int | float]] = {} + protein_length_stats_by_level: Dict[str, Dict[str, Union[int, float]]] = {} explicit_protein_count_by_proteome_id_by_level: Dict[str, Dict[str, int]] = {} cluster.protein_counts_of_proteomes_by_level_by_attribute[attribute] = {} @@ -849,7 +853,7 @@ def __plot_cluster_sizes(self) -> None: y_values.append(count) x_array = np.array(x_values) # type: ignore y_array = np.array(y_values) - ax.scatter(x_array, y_array, marker="o", alpha=0.8, s=100) + ax.scatter(x_array, y_array, marker="o", alpha=0.8, s=100) # type: ignore ax.set_xlabel("Cluster size", fontsize=self.inputData.fontsize) ax.set_ylabel("Count", fontsize=self.inputData.fontsize) ax.set_yscale("log") @@ -1888,7 +1892,7 @@ def __plot_data( """ # Plot histogram binwidth = 0.05 - xymax = np.max(np.fabs(log2fc_array)) + xymax = np.max(np.fabs(log2fc_array)) # type: ignore lim = (int(xymax / binwidth) + 1) * binwidth bins = np.arange(-lim, lim + binwidth, binwidth) axHistx.hist( diff --git a/src/core/input.py b/src/core/input.py index 38b9fd9..3562942 100644 --- a/src/core/input.py +++ b/src/core/input.py @@ -1,4 +1,5 @@ -from typing import Dict, List, Optional, Set, Tuple +import os +from typing import Dict, List, Optional, Set, Tuple, Union class ServeArgs: @@ -14,7 +15,7 @@ def __init__( ipr_mapping_f: str, go_mapping_f: str, cluster_file: str, - config_data: List[Dict[str, str]] | str, + config_f: str, sequence_ids_file: str, species_ids_file: Optional[str] = None, functional_annotation_f: Optional[str] = None, @@ -25,7 +26,7 @@ def __init__( plot_tree: bool = False, min_proteomes: int = 2, test: str = "mannwhitneyu", - taxranks: List[str] = None, + taxranks: List[str] = ["phylum", "order", "genus"], repetitions: int = 30, fuzzy_count: int = 1, fuzzy_fraction: float = 0.75, @@ -35,16 +36,22 @@ def __init__( plot_format: str = "pdf", taxon_idx_mapping_file: Optional[str] = None, ): - if taxranks is None: - taxranks = ["phylum", "order", "genus"] + if output_path: + if not os.path.isabs(output_path): + output_path = os.path.abspath(output_path) + else: + output_path = os.path.join(os.getcwd(), "kinfin_results") + self.cluster_f = cluster_file - self.config_data = config_data + self.config_f = config_f self.sequence_ids_f = sequence_ids_file self.species_ids_f = species_ids_file self.tree_f = tree_file self.functional_annotation_f = functional_annotation_f + if config_f.endswith(".json"): + if not taxon_idx_mapping_file: + raise ValueError("[ERROR] - taxon_idx_mapping not present") self.taxon_idx_mapping_file = taxon_idx_mapping_file - self.nodesdb_f = nodesdb_f self.pfam_mapping_f = pfam_mapping_f self.ipr_mapping_f = ipr_mapping_f diff --git a/src/core/logic.py b/src/core/logic.py index f9ce5bd..6b09bfa 100644 --- a/src/core/logic.py +++ b/src/core/logic.py @@ -1,12 +1,17 @@ import json import os from collections import defaultdict -from typing import DefaultDict, Dict, List, Literal, Optional, Set, Tuple +from typing import DefaultDict, Dict, List, Literal, Optional, Set, Tuple, Union import ete3 from ete3 import Tree, TreeNode -from core.utils import logger, progress, read_fasta_len, yield_file_lines +from core.utils import ( + progress, + read_fasta_len, + yield_config_lines, + yield_file_lines, +) import logging @@ -82,8 +87,9 @@ def get_lineage( # cli -def parse_attributes_from_config_file( +def 
parse_attributes_from_config_data( config_f: str, + taxon_idx_mapping_file: Optional[str], ) -> Tuple[Set[str], Dict[str, str], List[str], Dict[str, Dict[str, str]]]: """ Parses attributes from a configuration file. @@ -111,13 +117,13 @@ def parse_attributes_from_config_file( - The 'TAXON' attribute is expected to be unique for each line. """ - logger.info(f"[STATUS] - Parsing config file: {config_f} ...") + logger.info(f"[STATUS] - Parsing config data ...") attributes: List[str] = [] level_by_attribute_by_proteome_id: Dict[str, Dict[str, str]] = {} proteomes: Set[str] = set() proteome_id_by_species_id: Dict[str, str] = {} - for line in yield_file_lines(config_f): + for line in yield_config_lines(config_f, taxon_idx_mapping_file): if line.startswith("#"): if not attributes: attributes = [x.strip() for x in line.lstrip("#").split(",")] @@ -274,72 +280,6 @@ def parse_tree_from_file( return tree_ete, node_idx_by_proteome_ids -# api -def parse_attributes_from_json( - json_list: List[Dict[str, str]], - taxon_idx_mapping_file: str, -) -> Tuple[Set[str], Dict[str, str], List[str], Dict[str, Dict[str, str]]]: - """ - Parses attributes from a JSON list. - - Args: - json_list List[Dict[str,str]]: JSON list of attributes. - taxon_idx_mapping_file str: The path to the taxon-idx mapping file - - Returns: - Tuple[Set[str], Dict[str, str], List[str], Dict[str, Dict[str, str]]]: A tuple containing: - - A set of proteome IDs. - - A dictionary mapping species IDs to proteome IDs. - - A list of attributes. - - A dictionary mapping proteome IDs to dictionaries, where each inner dictionary - maps attributes to their corresponding levels. - - Raises: - FileNotFoundError: If the specified configuration file is not found. - ValueError: If there are errors in the configuration file format or content. - - Note: - - The configuration file is expected to have a header line starting with '#', - where the first element is 'IDX' and the second element is 'TAXON'. - - Each subsequent non-empty line in the configuration file should contain - comma-separated values corresponding to the attributes defined in the header line. - - The 'TAXON' attribute is expected to be unique for each line. 
- """ - - logger.info("[STATUS] - Parsing JSON list...") - attributes: List[str] = [] - level_by_attribute_by_proteome_id: Dict[str, Dict[str, str]] = {} - proteomes: Set[str] = set() - proteome_id_by_species_id: Dict[str, str] = {} - - attributes = list(json_list[0].keys()) - attributes.insert(0, "IDX") - - with open(taxon_idx_mapping_file, "r") as f: - taxon_idx_mapping = json.load(f) - - attributes.insert(0, "all") - - for entry in json_list: - proteome_id = entry["TAXON"] - species_id = taxon_idx_mapping[proteome_id] - proteomes.add(proteome_id) - proteome_id_by_species_id[species_id] = proteome_id - - level_by_attribute_by_proteome_id[proteome_id] = { - attribute: entry.get(attribute, "") for attribute in attributes[1:] - } - level_by_attribute_by_proteome_id[proteome_id]["IDX"] = proteome_id - level_by_attribute_by_proteome_id[proteome_id]["all"] = "all" - attributes.insert(0, "all") - return ( - proteomes, - proteome_id_by_species_id, - attributes, - level_by_attribute_by_proteome_id, - ) - - def parse_fasta_dir(species_ids_f: str, fasta_dir: str) -> Dict[str, int]: """ Parse a species IDs file to retrieve fasta file names and then calculate diff --git a/src/core/proteins.py b/src/core/proteins.py index ac0a41c..c47439d 100644 --- a/src/core/proteins.py +++ b/src/core/proteins.py @@ -1,5 +1,5 @@ from collections import Counter -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union from core.utils import mean, median, sd @@ -72,7 +72,7 @@ def add_annotation_to_protein( def get_protein_length_stats( self, protein_ids: List[str] - ) -> Dict[str, int | float]: + ) -> Dict[str, Union[int, float]]: """ Calculate statistics (sum, mean, median, standard deviation) of protein lengths. @@ -80,7 +80,7 @@ def get_protein_length_stats( protein_ids (List[str]): List of protein IDs for which to calculate statistics. Returns: - Dict[str, int | float]: A dictionary containing the calculated statistics: + Dict[str, Union[int, float]): A dictionary containing the calculated statistics: - 'sum': Sum of lengths of proteins in the input list. - 'mean': Mean length of proteins in the input list. - 'median': Median length of proteins in the input list. diff --git a/src/core/utils.py b/src/core/utils.py index 93c4a4d..f7974fd 100644 --- a/src/core/utils.py +++ b/src/core/utils.py @@ -1,9 +1,10 @@ import gzip +import json import logging import os import sys from math import log, sqrt -from typing import Any, Generator, List, Optional, Tuple +from typing import Any, Dict, Generator, List, Optional, Tuple, Union import scipy import logging @@ -11,7 +12,7 @@ logger = logging.getLogger("kinfin_logger") -def progress(iteration: int, steps: int | float, max_value: int) -> None: +def progress(iteration: int, steps: Union[int, float], max_value: int) -> None: """ Print progress in percentage based on the current iteration, steps, and maximum value. @@ -36,7 +37,7 @@ def progress(iteration: int, steps: int | float, max_value: int) -> None: sys.stdout.flush() -def check_file(filepath: str | None, install_kinfin: bool = False) -> None: +def check_file(filepath: Optional[str], install_kinfin: bool = False) -> None: """ Check if a file exists. 
@@ -76,6 +77,32 @@ def yield_file_lines(filepath: str) -> Generator[str, Any, None]:
                 yield line.rstrip("\n")


+def yield_config_lines(
+    config_f: str,
+    taxon_idx_mapping_file: Optional[str],
+):
+    if config_f.endswith(".json"):
+        if not taxon_idx_mapping_file:
+            raise ValueError("[ERROR] - taxon_idx_mapping not present")
+
+        with open(taxon_idx_mapping_file, "r") as f_mapping, open(
+            config_f, "r"
+        ) as f_config:
+            taxon_idx_mapping = json.load(f_mapping)
+            config_data = json.load(f_config)
+            headers = ["IDX"] + list(config_data[0].keys())
+            yield "#" + ",".join(headers)
+
+            for item in config_data:
+                idx = taxon_idx_mapping[item["TAXON"]]
+                row = [idx] + [item[key] for key in headers[1:]]
+                yield ",".join(row)
+            return
+    else:
+        yield from yield_file_lines(config_f)
+        return
+
+
 def read_fasta_len(fasta_file: str) -> Generator[Tuple[str, int], Any, None]:
     """
     Generator function to parse a FASTA file and yield tuples of header and sequence length.
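
Below is a minimal sketch of how the new run_cli_command helper from src/api/endpoints.py behaves when awaited directly, mirroring the command list the /init endpoint assembles; the file paths are placeholders, and importing api.endpoints outside the running FastAPI app is an assumption of the example, not something the patch guarantees.

import asyncio

from api.endpoints import run_cli_command

# Placeholder paths; in the endpoint the real values come from session_manager.
command = [
    "python",
    "src/main.py",
    "analyse",
    "-g", "Orthogroups.txt",
    "-c", "config.json",
    "-s", "SequenceIDs.txt",
    "-m", "taxon_idx_mapping.json",
    "-o", "results/",
]

# Awaiting returns the captured stdout; a non-zero exit code raises RuntimeError.
# The endpoint itself schedules the same coroutine fire-and-forget via asyncio.create_task.
stdout = asyncio.run(run_cli_command(command))
print(stdout)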
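For reference, a sketch of the two JSON inputs that yield_config_lines in src/core/utils.py stitches back into the classic CSV-style config lines; the taxon labels, attribute names, and index values are invented for the example, and the mapping values are assumed to already be strings so that ",".join(row) works.

import json

config = [
    {"TAXON": "taxon_A", "clade": "clade_1"},
    {"TAXON": "taxon_B", "clade": "clade_2"},
]
taxon_idx_mapping = {"taxon_A": "0", "taxon_B": "1"}

with open("config.json", "w") as fh:
    json.dump(config, fh)
with open("taxon_idx_mapping.json", "w") as fh:
    json.dump(taxon_idx_mapping, fh)

# yield_config_lines("config.json", "taxon_idx_mapping.json") would then yield:
#   #IDX,TAXON,clade
#   0,taxon_A,clade_1
#   1,taxon_B,clade_2
# which parse_attributes_from_config_data consumes exactly like a plain-text config file.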