diff --git a/src/cli/commands.py b/src/cli/commands.py
index 62199e0..ac55f2e 100644
--- a/src/cli/commands.py
+++ b/src/cli/commands.py
@@ -33,8 +33,7 @@ def parse_args(
         description="Kinfin proteome cluster analysis tool"
     )
 
-    subparsers = parser.add_subparsers(
-        title="command", required=True, dest="command")
+    subparsers = parser.add_subparsers(title="command", required=True, dest="command")
     api_parser = subparsers.add_parser("serve", help="Start the server")
     api_parser.add_argument(
         "-p",
@@ -74,8 +73,7 @@ def parse_args(
         "--functional_annotation",
         help="Mapping of ProteinIDs to GO/IPRS/SignalP/Pfam (can be generated through 'iprs_to_table.py')",
     )
-    other_files_group.add_argument(
-        "-a", "--fasta_dir", help="Directory of FASTA files")
+    other_files_group.add_argument("-a", "--fasta_dir", help="Directory of FASTA files")
     other_files_group.add_argument(
         "-t",
         "--tree_file",
diff --git a/src/core/alo.py b/src/core/alo.py
index dc43e79..e61846a 100644
--- a/src/core/alo.py
+++ b/src/core/alo.py
@@ -46,9 +46,7 @@ def __init__(self, attribute: str, level: str, proteomes: Set[str]) -> None:
             "specific": {"true": [], "fuzzy": []},
         }
 
-        self.cluster_status_by_cluster_id: Dict[
-            str, Literal["absent", "present"]
-        ] = {}
+        self.cluster_status_by_cluster_id: Dict[str, Literal["absent", "present"]] = {}
         self.cluster_type_by_cluster_id: Dict[
             str, Literal["singleton", "shared", "specific"]
         ] = {}
@@ -61,9 +59,7 @@ def __init__(self, attribute: str, level: str, proteomes: Set[str]) -> None:
         self.domain_counter_by_domain_source_by_cluster_type = None
         self.protein_with_domain_count_by_domain_source_by_cluster_type = None
 
-        self.protein_length_stats_by_cluster_id: Dict[
-            str, Dict[str, int | float]
-        ] = {}
+        self.protein_length_stats_by_cluster_id: Dict[str, Dict[str, int | float]] = {}
         self.protein_count_by_cluster_id: Dict[str, int] = {}
 
     def add_cluster(
@@ -134,9 +130,7 @@ def add_cluster(
 
         self.cluster_mwu_pvalue_by_cluster_id[cluster.cluster_id] = mwu_pvalue
         self.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id] = mwu_log2_mean
-        self.cluster_mean_ALO_count_by_cluster_id[cluster.cluster_id] = (
-            mean_ALO_count
-        )
+        self.cluster_mean_ALO_count_by_cluster_id[cluster.cluster_id] = mean_ALO_count
         self.cluster_mean_non_ALO_count_by_cluster_id[cluster.cluster_id] = (
             mean_non_ALO_count
         )
@@ -251,6 +245,4 @@ def get_proteomes(self) -> str:
         Returns:
             str: Comma-separated and sorted list of proteome IDs.
         """
-        return ", ".join(
-            sorted([str(proteome_id) for proteome_id in self.proteomes])
-        )
+        return ", ".join(sorted([str(proteome_id) for proteome_id in self.proteomes]))
diff --git a/src/core/alo_collections.py b/src/core/alo_collections.py
index 2034632..91df5d2 100644
--- a/src/core/alo_collections.py
+++ b/src/core/alo_collections.py
@@ -72,9 +72,7 @@ def compute_proteomes_by_level_by_attribute(
         }
         for proteome_id in self.level_by_attribute_by_proteome_id:
             for attribute in self.attributes:
-                level = self.level_by_attribute_by_proteome_id[proteome_id][
-                    attribute
-                ]
+                level = self.level_by_attribute_by_proteome_id[proteome_id][attribute]
                 if level not in proteomes_by_level_by_attribute[attribute]:
                     proteomes_by_level_by_attribute[attribute][level] = set()
                 proteomes_by_level_by_attribute[attribute][level].add(proteome_id)
@@ -96,9 +94,7 @@ def create_ALOs(self) -> Dict[str, Dict[str, Optional[AttributeLevel]]]:
         }
         for attribute in self.proteome_ids_by_level_by_attribute:
             for level in self.proteome_ids_by_level_by_attribute[attribute]:
-                proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][
-                    level
-                ]
+                proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][level]
                 ALO = AttributeLevel(
                     #
                     attribute=attribute,
@@ -312,15 +308,11 @@ def plot_tree(
             node.set_style(style)
             if header_f_by_node_name[node.name]:
                 # must be PNG! (ETE can't do PDF Faces)
-                node_header_face = ete3.faces.ImgFace(
-                    header_f_by_node_name[node.name]
-                )
+                node_header_face = ete3.faces.ImgFace(header_f_by_node_name[node.name])
                 node.add_face(node_header_face, column=0, position="branch-top")
             if charts_f_by_node_name[node.name]:
                 # must be PNG! (ETE can't do PDF Faces)
-                node_chart_face = ete3.faces.ImgFace(
-                    charts_f_by_node_name[node.name]
-                )
+                node_chart_face = ete3.faces.ImgFace(charts_f_by_node_name[node.name])
                 node.add_face(node_chart_face, column=0, position="branch-bottom")
             node_name_face = ete3.TextFace(node.name, fsize=64)
             node.img_style["size"] = 10
@@ -396,10 +388,7 @@ def write_tree(
             for synapomorphic_cluster_string in node.synapomorphic_cluster_strings:  # type: ignore
                 node_clusters.append(
                     "\t".join(
-                        [
-                            str(string)
-                            for string in list(synapomorphic_cluster_string)
-                        ]
+                        [str(string) for string in list(synapomorphic_cluster_string)]
                     )
                 )
             node_stats_line = [
@@ -463,13 +452,13 @@ compute_repetition_for_rarefaction_curve(
                 sample_size = idx + 1
                 if (
                     sample_size
-                    not in rarefaction_by_samplesize_by_level_by_attribute[
-                        attribute
-                    ][level]
-                ):
-                    rarefaction_by_samplesize_by_level_by_attribute[attribute][
+                    not in rarefaction_by_samplesize_by_level_by_attribute[attribute][
                         level
-                    ][sample_size] = []
+                    ]
+                ):
+                    rarefaction_by_samplesize_by_level_by_attribute[attribute][level][
+                        sample_size
+                    ] = []
                 rarefaction_by_samplesize_by_level_by_attribute[attribute][level][
                     sample_size
                 ].append(len(seen_cluster_ids))
@@ -496,9 +485,7 @@ def compute_rarefaction_data(
         logger.info("[STATUS] - Generating rarefaction data ...")
         for attribute in self.attributes:
             for level in self.proteome_ids_by_level_by_attribute[attribute]:
-                proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][
-                    level
-                ]
+                proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][level]
                 if len(proteome_ids) == 1:
                     continue
 
diff --git a/src/core/build.py b/src/core/build.py
index dde287e..ce1a9f2 100644
--- a/src/core/build.py
+++ b/src/core/build.py
@@ -137,14 +137,10 @@ def parse_domains_from_functional_annotations_file(
                     if domain_source == "GO":
                         domain_id = domain_id_count
                     else:
-                        domain_id, domain_count_str = domain_id_count.rsplit(
-                            ":", 2
-                        )
+                        domain_id, domain_count_str = domain_id_count.rsplit(":", 2)
                         domain_count = int(domain_count_str)
                     domain_counts_by_domain_id[domain_id] = domain_count
-                domain_counter: Counter[str] = Counter(
-                    domain_counts_by_domain_id
-                )
+                domain_counter: Counter[str] = Counter(domain_counts_by_domain_id)
                 domain_counter_by_domain_source[domain_source] = domain_counter
             proteinCollection.add_annotation_to_protein(
                 domain_protein_id=domain_protein_id,
@@ -290,9 +286,7 @@ def get_protein_list_from_seq_f(sequence_ids_f: str, aloCollection: AloCollectio
             .replace(")", "_")
         )  # orthofinder replaces characters
         species_id = sequence_id.split("_")[0]
-        if proteome_id := aloCollection.proteome_id_by_species_id.get(
-            species_id, None
-        ):
+        if proteome_id := aloCollection.proteome_id_by_species_id.get(species_id, None):
            protein = Protein(protein_id, proteome_id, species_id, sequence_id)
            proteins_list.append(protein)
        # else:
diff --git a/src/core/clusters.py b/src/core/clusters.py
index c518c22..eaac9d4 100644
--- a/src/core/clusters.py
+++ b/src/core/clusters.py
@@ -25,12 +25,12 @@ def __init__(
             }
         except KeyError as e:
             error_msg = f"[ERROR] - Protein {e.args[0]} in clustering belongs to proteomes that are not present in the config-file."
-            error_msg += "Please add those proteomes or recluster by omitting these proteomes."
+            error_msg += (
+                "Please add those proteomes or recluster by omitting these proteomes."
+            )
             raise KeyError(error_msg) from e
 
-        self.proteome_ids_list: List[str] = list(
-            self.proteomes_by_protein_id.values()
-        )
+        self.proteome_ids_list: List[str] = list(self.proteomes_by_protein_id.values())
         self.protein_count_by_proteome_id: Counter[str] = Counter(
             self.proteome_ids_list
         )
@@ -45,9 +45,7 @@ def __init__(
         self.protein_counts_of_proteomes_by_level_by_attribute: Dict[
             str, Dict[str, List[int]]
         ] = {}
-        self.proteome_coverage_by_level_by_attribute: Dict[str, Dict[str, float]] = (
-            {}
-        )
+        self.proteome_coverage_by_level_by_attribute: Dict[str, Dict[str, float]] = {}
         self.implicit_protein_ids_by_proteome_id_by_level_by_attribute: Dict[
             str, Dict[str, Dict[str, List[str]]]
         ] = {}
@@ -59,10 +57,8 @@ def __init__(
         self.protein_length_stats: Optional[Dict[str, float]] = (
             self.compute_protein_length_stats(proteinCollection, self.protein_ids)
         )
-        self.secreted_cluster_coverage: float = (
-            self.compute_secreted_cluster_coverage(
-                proteinCollection, self.protein_ids, self.protein_count
-            )
+        self.secreted_cluster_coverage: float = self.compute_secreted_cluster_coverage(
+            proteinCollection, self.protein_ids, self.protein_count
         )
         self.domain_counter_by_domain_source: Dict[str, Counter[str]] = (
             self.compute_domain_counter_by_domain_source(
diff --git a/src/core/logic.py b/src/core/logic.py
index 20b6ff9..9984ae8 100644
--- a/src/core/logic.py
+++ b/src/core/logic.py
@@ -136,9 +136,7 @@ def parse_attributes_from_config_file(
         proteomes.add(proteome_id)
         proteome_id_by_species_id[species_id] = proteome_id
 
-        level_by_attribute_by_proteome_id[proteome_id] = dict(
-            zip(attributes, temp)
-        )
+        level_by_attribute_by_proteome_id[proteome_id] = dict(zip(attributes, temp))
         level_by_attribute_by_proteome_id[proteome_id]["all"] = "all"
     attributes.insert(0, "all")  # append to front
     return (
@@ -181,9 +179,7 @@ def add_taxid_attributes(
 
         # add lineage attribute/levels
         for taxrank in taxranks:
-            level_by_attribute_by_proteome_id[proteome_id][taxrank] = lineage[
-                taxrank
-            ]
+            level_by_attribute_by_proteome_id[proteome_id][taxrank] = lineage[taxrank]
 
         # remove taxid-levels
         del level_by_attribute_by_proteome_id[proteome_id]["TAXID"]
@@ -452,9 +448,7 @@ def parse_go_mapping(go_mapping_f: str) -> Dict[str, str]:
        if not line.startswith("!"):
            temp: List[str] = line.replace(" > ", "|").split("|")
            go_string: List[str] = temp[1].split(";")
-            go_desc, go_id = go_string[0].replace("GO:", ""), go_string[1].lstrip(
-                " "
-            )
+            go_desc, go_id = go_string[0].replace("GO:", ""), go_string[1].lstrip(" ")
 
            if go_id not in go_mapping_dict:
                go_mapping_dict[go_id] = go_desc
diff --git a/src/core/utils.py b/src/core/utils.py
index 5027a34..b572335 100644
--- a/src/core/utils.py
+++ b/src/core/utils.py
@@ -217,10 +217,7 @@ def statistic(
     implicit_count_1: List[float] = [count for count in count_1 if count > 0]
     implicit_count_2: List[float] = [count for count in count_2 if count > 0]
 
-    if (
-        len(implicit_count_1) < min_proteomes
-        or len(implicit_count_2) < min_proteomes
-    ):
+    if len(implicit_count_1) < min_proteomes or len(implicit_count_2) < min_proteomes:
         return None, None, None, None
 
     mean_count_1 = mean(implicit_count_1)
@@ -255,9 +252,7 @@ def statistic(
         pvalue = 1.0
     elif test == "ttest":
         # try:
-        pvalue = scipy.stats.ttest_ind(implicit_count_1, implicit_count_2)[
-            1
-        ]  # t-test
+        pvalue = scipy.stats.ttest_ind(implicit_count_1, implicit_count_2)[1]  # t-test
         if pvalue != pvalue:  # testing for "nan"
             pvalue = 1.0
     elif test == "ks":
diff --git a/src/main.py b/src/main.py
index 99e729c..d432f6e 100755
--- a/src/main.py
+++ b/src/main.py
@@ -41,13 +41,12 @@
             logger.error("[ERROR] CLUSTER_FILE_PATH should be an absolute path.")
             sys.exit(1)
         if sequence_ids_f is None or not os.path.isabs(sequence_ids_f):
-            logger.error(
-                "[ERROR] SEQUENCE_IDS_FILE_PATH should be an absolute path.")
+            logger.error("[ERROR] SEQUENCE_IDS_FILE_PATH should be an absolute path.")
             sys.exit(1)
-        if taxon_idx_mapping_file is None or not os.path.isabs(
-                taxon_idx_mapping_file):
+        if taxon_idx_mapping_file is None or not os.path.isabs(taxon_idx_mapping_file):
             logger.error(
-                "[ERROR] TAXON_IDX_MAPPING_FILE_PATH should be an absolute path.")
+                "[ERROR] TAXON_IDX_MAPPING_FILE_PATH should be an absolute path."
+            )
             sys.exit(1)
         if results_base_dir is None or not os.path.isabs(results_base_dir):
             logger.error("[ERROR] RESULTS_BASE_DIR should be an absolute path.")