diff --git a/.eggs/README.txt b/.eggs/README.txt deleted file mode 100644 index 5d01668..0000000 --- a/.eggs/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -This directory contains eggs that were downloaded by setuptools to build, test, and run plug-ins. - -This directory caches those eggs to prevent repeated downloads. - -However, it is safe to delete this directory. - diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..7aca9f0 --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +CLUSTER_FILE_PATH=/absolute/path/to/Orthogroups.txt +SEQUENCE_IDS_FILE_PATH=/absolute/path/to/SequenceIDs.txt +TAXON_IDX_MAPPING_FILE_PATH=/absolute/path/to/taxon_idx_mapping.json +RESULTS_BASE_DIR=/absolute/path/where/all/results/should/be/stored/ +SESSION_INACTIVITY_THRESHOLD=24 \ No newline at end of file diff --git a/.gitignore b/.gitignore index dde7931..3f89431 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,9 @@ example/test.* build/ dist/ +venv +.test_data +result +.DS_Store +.env +data \ No newline at end of file diff --git a/build/lib/kinfin/kinfin.py b/build/lib/kinfin/kinfin.py deleted file mode 100644 index 68ecdbc..0000000 --- a/build/lib/kinfin/kinfin.py +++ /dev/null @@ -1,2195 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -""" -usage: kinfin-d.py -g -c -s [-t ] [-o ] - [--infer_singletons] [--plot_tree] - [-p ] [-a ] - [--functional_annotation ] - [--nodesdb ] [--taxranks ] - [-f ] [-n ] [--min ] [--max ] - [-r ] [--min_proteomes ] - [--fontsize ] [--plotsize INT,INT] - [--plotfmt ] - [-h|--help] - - Options: - -h --help show this - - Input files - -g, --cluster_file OrthologousGroups.txt produced by OrthoFinder - -c, --config_file Config file (in CSV format) - -s, --sequence_ids_file SequenceIDs.txt used in OrthoFinder - - -p, --species_ids_file SpeciesIDs.txt used in OrthoFinder - --functional_annotation Mapping of ProteinIDs to GO/IPRS/SignalP/Pfam/... (can be generated through 'iprs_to_table.py') - -a, --fasta_dir Directory of FASTA files - --nodesdb nodesdb file (in data/ folder, has to be uncompressed) - -t, --tree_file Tree file (on which ALOs are defined) - General options - -o, --outprefix Output prefix - --infer_singletons Absence of proteins in clustering is interpreted as singleton (based on SequenceIDs.txt) - --plot_tree Plot annotated phylogenetic tree (requires full ETE3 installation and X-server/xvfb-run) - --min_proteomes Required number of proteomes in a taxon-set to be used - in rarefaction/representation-test computations [default: 2] - --taxranks Taxonomic ranks to be inferred from TaxID [default: phylum,order,genus] - -r, --repetitions Number of repetitions for rarefaction curves [default: 30] - "Fuzzy"-Orthology-groups - -f, --target_fraction Minimum proportion of proteomes with target protein count [default: 0.75]. 
- -n, --target_count Target protein count by proteome in (100*F)% of cluster [default: 1] - --min Min count of proteins by proteome in (100*(1-F))% of cluster [default: 0] - --max Max count of proteins by proteome in (100*(1-F))% of cluster [default: 100] - Plotting - --fontsize Fontsize for plots [default: 18] - --plotsize Size (WIDTH,HEIGHT) for plots [default: 24,12] - --plotfmt Plot formats [default: pdf] - -""" - - -######################################################################## -# Imports -######################################################################## - -from __future__ import division -import sys -from os.path import isfile, join, exists, realpath, dirname -from os import getcwd, mkdir, remove, environ -import shutil -import random -import time -import urllib -from decimal import Decimal - -from collections import Counter, defaultdict -from math import sqrt, log - -import_errors = [] -try: - from docopt import docopt -except ImportError: - import_errors.append("[ERROR] : Module \'Docopt\' was not found. Please install \'Docopt\' using \'pip install docopt\'") -try: - import matplotlib as mat - mat.use("agg") -except ImportError: - import_errors.append("[ERROR] : Module \'Matplotlib\' was not found. Please install \'Matplotlob\' using \'pip install matplotlib\'") -try: - import scipy -except ImportError: - import_errors.append("[ERROR] : Module \'SciPy\' was not found. Please install \'SciPy\' using \'pip install scipy\'") - -if import_errors: - sys.exit("\n".join(import_errors)) - -import numpy as np -from matplotlib.ticker import FormatStrFormatter -import matplotlib.pyplot as plt -plt.style.use('ggplot') -mat.rc('ytick', labelsize=20) -mat.rc('xtick', labelsize=20) -axis_font = {'size': '20'} -mat.rcParams.update({'font.size': 22}) - -######################################################################## -# General functions -######################################################################## - - -def retrieve_ftp(remote_f, local_f): - try: - print "[STATUS] - Downloading '%s' to '%s'." % (remote_f, local_f) - urllib.urlretrieve(remote_f, local_f) - except IOError: - sys.exit("[ERROR] : '%s' could not be downloaded." % (remote_f)) - - -def check_file(infile): - if infile: - if not isfile(infile): - sys.exit("[ERROR] : %s does not exist." 
% (infile)) - - -def get_attribute_cluster_type(singleton, implicit_protein_ids_by_proteome_id_by_level): - if singleton: - return 'singleton' - else: - if len(implicit_protein_ids_by_proteome_id_by_level) > 1: - return 'shared' - else: - return 'specific' - - -def get_ALO_cluster_cardinality(ALO_proteome_counts_in_cluster): - if len(ALO_proteome_counts_in_cluster) > 2: - ALO_proteome_counts_in_cluster_length = len(ALO_proteome_counts_in_cluster) - if all(count == 1 for count in ALO_proteome_counts_in_cluster): - return 'true' - else: - ALO_proteome_counts_in_cluster_at_fuzzycount_count = len([ALO_proteome_counts for ALO_proteome_counts in ALO_proteome_counts_in_cluster if ALO_proteome_counts == inputObj.fuzzy_count]) - ALO_proteome_counts_in_cluster_in_fuzzyrange_count = len([ALO_proteome_counts for ALO_proteome_counts in ALO_proteome_counts_in_cluster if ALO_proteome_counts in inputObj.fuzzy_range]) - fuzzy_fraction = ALO_proteome_counts_in_cluster_at_fuzzycount_count / ALO_proteome_counts_in_cluster_length - if fuzzy_fraction >= inputObj.fuzzy_fraction: - if ALO_proteome_counts_in_cluster_at_fuzzycount_count + ALO_proteome_counts_in_cluster_in_fuzzyrange_count == ALO_proteome_counts_in_cluster_length: - return 'fuzzy' - return None - - -def mannwhitneyu(count_1, count_2): - pvalue, log2_mean, mean_count_1, mean_count_2 = None, None, None, None - implicit_count_1 = [count for count in count_1 if count > 0] - implicit_count_2 = [count for count in count_2 if count > 0] - if len(implicit_count_1) >= inputObj.min_proteomes and len(implicit_count_2) >= inputObj.min_proteomes: - try: - pvalue = scipy.stats.mannwhitneyu(implicit_count_1, implicit_count_2, alternative="two-sided")[1] - except: - pvalue = 1.0 - mean_count_1 = mean(implicit_count_1) - mean_count_2 = mean(implicit_count_2) - log2_mean = log((mean(implicit_count_1)/mean(implicit_count_2)), 2) - return pvalue, log2_mean, mean_count_1, mean_count_2 - - -def get_lineage(taxid, nodesdb): - lineage = {taxrank: 'undef' for taxrank in inputObj.taxranks} - parent = '' - node = taxid - while parent != "1": - taxrank = nodesdb[node]['rank'] - name = nodesdb[node]['name'] - parent = nodesdb[node]['parent'] - if taxrank in inputObj.taxranks: - lineage[taxrank] = name - node = parent - return lineage - - -def parse_nodesdb(nodesdb_f): - nodesdb = {} - nodesdb_count = 0 - nodes_count = 0 - for line in read_file(nodesdb_f): - if line.startswith("#"): - nodesdb_count = int(line.lstrip("# nodes_count = ").rstrip("\n")) - else: - nodes_count += 1 - node, rank, name, parent = line.rstrip("\n").split("\t") - nodesdb[node] = {'rank': rank, 'name': name, 'parent': parent} - if nodesdb_count: - progress(nodes_count, 1000, nodesdb_count) - return nodesdb - - -def parse_mapping(mapping_file_by_domain_source): - domain_description_by_domain_id_by_domain_source = {} - if mapping_file_by_domain_source: - for domain_source, mapping_f in mapping_file_by_domain_source.items(): - if domain_source == 'Pfam': - domain_description_by_domain_id_by_domain_source[domain_source] = {} - print "[STATUS] - Parsing %s ... 
" % (mapping_f) - for line in read_file(mapping_f): - temp = line.split("\t") - domain_id = temp[0] - domain_desc = temp[4] - if domain_id not in domain_description_by_domain_id_by_domain_source[domain_source]: - domain_description_by_domain_id_by_domain_source[domain_source][domain_id] = domain_desc - else: - if not domain_desc == domain_description_by_domain_id_by_domain_source[domain_source][domain_id]: - sys.exit("[ERROR] : Conflicting descriptions for %s" % (domain_id)) - elif domain_source == 'GO': - domain_description_by_domain_id_by_domain_source['GO'] = {} - print "[STATUS] - Parsing %s ... " % (mapping_f) - for line in read_file(mapping_f): - if not line.startswith("!"): - temp = line.replace(" > ", "|").split("|") - go_string = temp[1].split(";") - go_desc, go_id = go_string[0].replace("GO:", ""), go_string[1].lstrip(" ") - if go_id not in domain_description_by_domain_id_by_domain_source['GO']: - domain_description_by_domain_id_by_domain_source['GO'][go_id] = go_desc - else: - if not go_desc == domain_description_by_domain_id_by_domain_source['GO'][go_id]: - sys.exit("[ERROR] : Conflicting descriptions for %s" % (go_id)) - elif domain_source == 'IPR': - domain_description_by_domain_id_by_domain_source['IPR'] = {} - print "[STATUS] - Parsing %s ... " % (mapping_f) - for line in read_file(mapping_f): - if not line.startswith("Active_site"): - temp = line.split() - ipr_id = temp[0] - ipr_desc = " ".join(temp[1:]) - if ipr_id not in domain_description_by_domain_id_by_domain_source['IPR']: - domain_description_by_domain_id_by_domain_source['IPR'][ipr_id] = ipr_desc - else: - if not ipr_desc == domain_description_by_domain_id_by_domain_source['IPR'][ipr_id]: - sys.exit("[ERROR] : Conflicting descriptions for %s" % (ipr_id)) - return domain_description_by_domain_id_by_domain_source - - -def parse_tree(tree_f, outgroups): - check_file(tree_f) - print "[STATUS] - Parsing Tree file : %s ..." % (tree_f) - tree_ete = ete3.Tree(tree_f) - if len(outgroups) > 1: - outgroup_node = tree_ete.get_common_ancestor(outgroups) - try: - tree_ete.set_outgroup(outgroup_node) - print "[STATUS] - Setting LCA of %s as outgroup : ..." % (",".join(outgroups)) - except ete3.coretype.tree.TreeError: - print "[STATUS] - Tree seems to be rooted already : ..." - else: - print "[STATUS] - Setting %s as outgroup : ..." 
% (",".join(outgroups)) - tree_ete.set_outgroup(outgroups[0]) - print tree_ete - node_idx_by_proteome_ids = {} - for idx, node in enumerate(tree_ete.traverse("levelorder")): - proteome_ids = frozenset([leaf.name for leaf in node]) - if not node.name: - node.add_features( - name="n%s" % (idx), - nodetype="node", - proteome_ids=proteome_ids, - apomorphic_cluster_counts={'singletons': 0, 'non_singletons': 0}, - synapomorphic_cluster_counts={'complete_presence': 0, 'stochastic_absence': 0}, - synapomorphic_cluster_strings=[], - counts={'specific': 0, 'shared': 0, "absent": 0, "singleton": 0}) - else: - node.add_features( - nodetype="tip", - proteome_ids=proteome_ids, - apomorphic_cluster_counts={'singletons': 0, 'non_singletons': 0}, - synapomorphic_cluster_counts={'complete_presence': 0, 'stochastic_absence': 0}, - synapomorphic_cluster_strings=[], - counts={'specific': 0, 'shared': 0, "absent": 0, "singleton": 0}) - node_idx_by_proteome_ids[proteome_ids] = node.name - return tree_ete, node_idx_by_proteome_ids - - -def readFastaLen(infile): - with open(infile) as fh: - header, seqs = '', [] - for l in fh: - if l[0] == '>': - if header: - header = header.replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") # orthofinder replaces chars - yield header, len(''.join(seqs)) - header, seqs = l[1:-1].split()[0], [] # Header is split at first whitespace - else: - seqs.append(l[:-1]) - header = header.replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") # orthofinder replaces chars - yield header, len(''.join(seqs)) - - -def median(lst): - list_sorted = sorted(lst) - list_length = len(lst) - index = (list_length - 1) // 2 - if list_length % 2: - return list_sorted[index]/1.0 - else: - return (list_sorted[index] + list_sorted[index + 1])/2.0 - - -def mean(lst): - if lst: - return float(sum(lst)) / len(lst) - else: - return 0.0 - - -def sd(lst, population=True): - n = len(lst) - differences = [x_ - mean(lst) for x_ in lst] - sq_differences = [d ** 2 for d in differences] - ssd = sum(sq_differences) - if population is True: - variance = ssd / n - else: - variance = ssd / (n - 1) - sd_result = sqrt(variance) - return sd_result - - -def progress(iteration, steps, max_value): - if int(iteration) == int(max_value): - sys.stdout.write('\r') - print "[PROGRESS] \t- %d%%" % (100) - elif int(iteration) % int(steps + 1) == 0: - sys.stdout.write('\r') - print "[PROGRESS] \t- %d%%" % (float(int(iteration) / int(max_value)) * 100), - sys.stdout.flush() - else: - pass - - -def read_file(infile): - if not infile or not exists(infile): - sys.exit("[ERROR] - File '%s' does not exist." 
% (infile)) - if infile.endswith(".gz"): - import gzip - with gzip.open(infile) as fh: - for line in fh: - yield line.rstrip("\n") - else: - with open(infile) as fh: - for line in fh: - yield line.rstrip("\n") - -######################################################################## -# CLASS : DataFactory -######################################################################## - - -class DataFactory(): - def __init__(self): - self.dirs = None - - ############################### - ### build_AloCollection - ############################### - - def build_AloCollection(self): - config_f = inputObj.config_f - nodesdb_f = inputObj.nodesdb_f - tree_f = inputObj.tree_f - proteomes, proteome_id_by_species_id, attributes, level_by_attribute_by_proteome_id = self.parse_attributes(config_f) - # Add taxonomy if needed - if 'TAXID' in set(attributes): - print "[STATUS] - Attribute 'TAXID' found, inferring taxonomic ranks from nodesDB..." - attributes, level_by_attribute_by_proteome_id = self.add_taxid_attributes(nodesdb_f, attributes, level_by_attribute_by_proteome_id) - # Add ALOs from tree if provided - tree_ete = None - node_idx_by_proteome_ids = None - if tree_f: - outgroups = [] - if not "OUT" in attributes: - sys.exit("[ERROR] - Please specify one of more outgroup taxa in the config file.") - outgroups = [proteome_id for proteome_id in proteomes if level_by_attribute_by_proteome_id[proteome_id]["OUT"] == "1"] - tree_ete, node_idx_by_proteome_ids = parse_tree(tree_f, outgroups) - print "[STATUS] - Building AloCollection ..." - return AloCollection(proteomes, proteome_id_by_species_id, attributes, level_by_attribute_by_proteome_id, tree_ete, node_idx_by_proteome_ids) - - ############################### - ### build_AloCollection parse_attributes - ############################### - - def parse_attributes(self, config_f): - print "[STATUS] - Parsing SpeciesClassification file: %s ..." % (config_f) - attributes = [] - level_by_attribute_by_proteome_id = {} - proteomes = set() - proteome_id_by_species_id = {} - for line in read_file(config_f): - if line.startswith("#"): - if not attributes: - attributes = [x.strip() for x in line.lstrip("#").split(",")] - if not 'IDX' == attributes[0] or not 'TAXON' == attributes[1]: - sys.exit("[ERROR] - First/second element have to be IDX/TAXON.\n\t%s" % (attributes)) - else: - pass # accounts for SpeciesIDs that are commented out for Orthofinder - elif line.strip(): - temp = line.split(",") - if not len(temp) == len(attributes): - sys.exit("[ERROR] - number of columns in line differs from header\n\t%s\n\t%s" % (attributes, temp)) - if temp[1] in proteomes: - sys.exit("[ERROR] - 'TAXON' should be unique. 
%s was encountered multiple times" % (temp[0])) - species_id = temp[0] - proteome_id = temp[1] - proteomes.add(proteome_id) - proteome_id_by_species_id[species_id] = proteome_id - level_by_attribute_by_proteome_id[proteome_id] = {x : '' for x in attributes} - for idx, level in enumerate(temp): - attribute = attributes[idx] - level_by_attribute_by_proteome_id[proteome_id][attribute] = level - level_by_attribute_by_proteome_id[proteome_id]['all'] = 'all' - else: - pass - attributes.insert(0, "all") # append to front - return proteomes, proteome_id_by_species_id, attributes, level_by_attribute_by_proteome_id - - ############################### - ### build_AloCollection add_taxid_attributes - ############################### - - def add_taxid_attributes(self, nodesdb_f, attributes, level_by_attribute_by_proteome_id): - if nodesdb_f: - check_file(nodesdb_f) - else: - sys.exit("[ERROR] - Please provide a nodesDB file or remove the 'TAXID' attribute") - print "[STATUS] - Parsing nodesDB %s" % (nodesdb_f) - NODESDB = parse_nodesdb(nodesdb_f) - for proteome_id in level_by_attribute_by_proteome_id: - taxid = level_by_attribute_by_proteome_id[proteome_id]['TAXID'] - lineage = get_lineage(taxid, NODESDB) - # add lineage attribute/levels - for taxrank in inputObj.taxranks: - level_by_attribute_by_proteome_id[proteome_id][taxrank] = lineage[taxrank].replace(" ", "_") - # remove taxid-levels - del level_by_attribute_by_proteome_id[proteome_id]['TAXID'] - # remove taxid-attribute - attributes.remove('TAXID') - # add taxranks to rank - for taxrank in inputObj.taxranks: - attributes.append(taxrank) - self.nodesdb_file = nodesdb_f - return attributes, level_by_attribute_by_proteome_id - - ############################### - ### setup_dirs - ############################### - - def setup_dirs(self, inputObj): - outprefix = inputObj.outprefix - self.dirs = {} - if outprefix: - result_path = join(getcwd(), "%s.kinfin_results" % (outprefix)) - else: - result_path = join(getcwd(), "kinfin_results") - self.dirs['main'] = result_path - print "[STATUS] - Output directories in \n\t%s" % (result_path) - if exists(result_path): - print "[STATUS] - Directory exists. Deleting directory ..." - shutil.rmtree(result_path) - print "[STATUS] - Creating directories ..." - mkdir(result_path) - for attribute in aloCollection.attributes: - attribute_path = join(result_path, attribute) - self.dirs[attribute] = attribute_path - if not exists(attribute_path): - print "\t%s" % (attribute_path) - mkdir(attribute_path) - if aloCollection.tree_ete: - tree_path = join(result_path, "tree") - node_chart_path = join(tree_path, "charts") - node_header_path = join(tree_path, "headers") - self.dirs["tree"] = tree_path - self.dirs["tree_charts"] = node_chart_path - self.dirs["tree_headers"] = node_header_path - if not exists(tree_path): - print "\t%s" % (tree_path) - mkdir(tree_path) - print "\t%s" % (node_chart_path) - mkdir(node_chart_path) - print "\t%s" % (node_header_path) - mkdir(node_header_path) - - ############################### - ### build_ProteinCollection - ############################### - - def build_ProteinCollection(self, inputObj): - # PARSE PROTEINS - proteinObjs = [] - sequence_ids_f = inputObj.sequence_ids_f - print "[STATUS] - Parsing sequence IDs: %s ..." 
% sequence_ids_f - for line in read_file(sequence_ids_f): - temp = line.split(": ") - sequence_id = temp[0] - protein_id = temp[1].split(" ")[0].replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") # orthofinder replaces characters - species_id = sequence_id.split("_")[0] - proteome_id = aloCollection.proteome_id_by_species_id.get(species_id, None) - if proteome_id: - proteinObj = ProteinObj(protein_id, proteome_id, species_id, sequence_id) - proteinObjs.append(proteinObj) - #else: - # sys.exit("[ERROR] - Offending SequenceID : %s (unknown species_id %s)" % (line, species_id)) - proteinCollection = ProteinCollection(proteinObjs) - print "[STATUS]\t - Proteins found = %s" % (proteinCollection.protein_count) - - # PARSE FASTA DIR - fasta_dir = inputObj.fasta_dir - species_ids_f = inputObj.species_ids_f - if fasta_dir: - print "[STATUS] - Parsing FASTAs ..." - fasta_file_by_species_id = self.parse_species_ids(species_ids_f) - fasta_len_by_protein_id = self.parse_fasta_dir(fasta_dir, fasta_file_by_species_id) - print "[STATUS] - Adding FASTAs to ProteinCollection ..." - parse_steps = proteinCollection.protein_count/100 - for idx, proteinObj in enumerate(proteinCollection.proteinObjs): - proteinObj.add_length(fasta_len_by_protein_id[proteinObj.protein_id]) - progress(idx+1, parse_steps, proteinCollection.protein_count) - aloCollection.fastas_parsed = True - proteinCollection.fastas_parsed = True - else: - print "[STATUS] - No Fasta-Dir given, no AA-span information will be reported ..." - - # PARSE DOMAINS - functional_annotation_f = inputObj.functional_annotation_f - if functional_annotation_f: - # PARSE DOMAINS - print "[STATUS] - Parsing %s ... this may take a while" % (functional_annotation_f) - for line in read_file(functional_annotation_f): - temp = line.split() - if temp[0].startswith("#"): - proteinCollection.domain_sources = temp[1:] - else: - if not proteinCollection.domain_sources: - sys.exit("[ERROR] - %s does not seem to have a header." 
% (functional_annotation_f)) - domain_protein_id = temp.pop(0) - go_terms = [] - domain_counter_by_domain_source = {} - for idx, field in enumerate(temp): - if not field == "None": - domain_source = proteinCollection.domain_sources[idx] - domain_string = field.split(";") - domain_counts_by_domain_id = {} - for domain_id_count in domain_string: - domain_id, domain_count = '', 1 - if domain_source == "GO": - domain_id = domain_id_count - else: - domain_id, domain_count = domain_id_count.split(":") - domain_counts_by_domain_id[domain_id] = int(domain_count) - domain_counter = Counter(domain_counts_by_domain_id) - domain_counter_by_domain_source[domain_source] = domain_counter - proteinCollection.add_annotation_to_proteinObj(domain_protein_id, domain_counter_by_domain_source, go_terms) - proteinCollection.functional_annotation_parsed = True - mapping_file_by_domain_source = {} - if inputObj.pfam_mapping and "Pfam" in proteinCollection.domain_sources: - mapping_file_by_domain_source["Pfam"] = inputObj.pfam_mapping_f - if inputObj.ipr_mapping and "IPR" in proteinCollection.domain_sources: - mapping_file_by_domain_source["IPR"] = inputObj.ipr_mapping_f - if inputObj.go_mapping_f: - mapping_file_by_domain_source["GO"] = inputObj.go_mapping_f - proteinCollection.domain_description_by_domain_id_by_domain_source = parse_mapping(mapping_file_by_domain_source) - - return proteinCollection - - ############################### - ### build_ProteinCollection : parse_species_ids - ############################### - - def parse_species_ids(self, species_ids_f): - fasta_by_ortho_id = {} - for line in read_file(species_ids_f): - if not line.startswith("#"): - idx, fasta = line.split(": ") - fasta_by_ortho_id[idx] = fasta - self.species_ids_file = species_ids_f - return fasta_by_ortho_id - - ############################### - ### build_ProteinCollection : parse_fasta_dir - ############################### - - def parse_fasta_dir(self, fasta_dir, fasta_file_by_species_id): - fasta_len_by_protein_id = {} - for species_id, fasta_f in fasta_file_by_species_id.items(): - fasta_path = join(fasta_dir, fasta_f) - if not isfile(fasta_path): - sys.exit("[ERROR] - %s does not exist." % (fasta_path)) - print "[STATUS]\t - Parsing FASTA %s" % (fasta_path) - for header, length in readFastaLen(fasta_path): - fasta_len_by_protein_id[header] = length - self.fasta_dir = fasta_dir - return fasta_len_by_protein_id - - ############################### - ### build_ClusterCollection - ############################### - - def build_ClusterCollection(self, inputObj): - cluster_f = inputObj.cluster_f - print "[STATUS] - Parsing %s ... this may take a while" % (cluster_f) - clusterObjs = [] - with open(cluster_f) as fh: - for line in fh: - temp = line.rstrip("\n").split(" ") - cluster_id, protein_string = temp[0].replace(":", ""), temp[1:] - protein_string = [protein_id for protein_id in protein_string if protein_id] - clusterObj = ClusterObj(cluster_id, protein_string) - for protein_id in protein_string: - proteinObj = proteinCollection.proteinObjs_by_protein_id[protein_id] - proteinObj.clustered = True - clusterObjs.append(clusterObj) - inferred_singletons_count = 0 - if inputObj.infer_singletons: - print "[STATUS] - Inferring singletons ..." 
- singleton_idx = 0 - for proteinObj in proteinCollection.proteinObjs: - if proteinObj.clustered == False: - cluster_id = "singleton_%s" % singleton_idx - clusterObj = ClusterObj(cluster_id, [proteinObj.protein_id]) - clusterObjs.append(clusterObj) - singleton_idx += 1 - inferred_singletons_count = singleton_idx - return ClusterCollection(clusterObjs, inferred_singletons_count, proteinCollection.functional_annotation_parsed, proteinCollection.fastas_parsed, proteinCollection.domain_sources) - - ############################### - ### write_output - ############################### - - def write_output(self): - self.plot_cluster_sizes() - self.write_cluster_metrics() - - ############################### - ### write_output : write_ALO_stats - ############################### - - def plot_cluster_sizes(self): - cluster_protein_count = [] - for clusterObj in clusterCollection.clusterObjs: - cluster_protein_count.append(clusterObj.protein_count) - cluster_protein_counter = Counter(cluster_protein_count) - count_plot_f = join(self.dirs['main'], "cluster_size_distribution.%s" % (inputObj.plot_format)) - f, ax = plt.subplots(figsize=inputObj.plot_size) - ax.set_facecolor('white') - x_values = [] - y_values = [] - for value, count in cluster_protein_counter.items(): - x_values.append(value) - y_values.append(count) - x_array = np.array(x_values) - y_array = np.array(y_values) - ax.scatter(x_array, y_array, marker='o', alpha=0.8, s=100) - ax.set_xlabel('Cluster size', fontsize=inputObj.plot_font_size) - ax.set_ylabel('Count', fontsize=inputObj.plot_font_size) - ax.set_yscale('log') - ax.set_xscale('log') - plt.margins(0.8) - plt.gca().set_ylim(bottom=0.8) - plt.gca().set_xlim(left=0.8) - ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f')) - ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f')) - f.tight_layout() - - ax.grid(True, linewidth=1, which="major", color="lightgrey") - print "[STATUS] - Plotting %s" % (count_plot_f) - f.savefig(count_plot_f, format=inputObj.plot_format) - plt.close() - - def get_header_line(self, filetype, attribute): - if filetype == "attribute_metrics": - attribute_metrics_header = [] - attribute_metrics_header.append("#attribute") - attribute_metrics_header.append("taxon_set") - attribute_metrics_header.append("cluster_total_count") - attribute_metrics_header.append("protein_total_count") - attribute_metrics_header.append("protein_total_span") - attribute_metrics_header.append("singleton_cluster_count") - attribute_metrics_header.append("singleton_protein_count") - attribute_metrics_header.append("singleton_protein_span") - attribute_metrics_header.append("specific_cluster_count") - attribute_metrics_header.append("specific_protein_count") - attribute_metrics_header.append("specific_protein_span") - attribute_metrics_header.append("shared_cluster_count") - attribute_metrics_header.append("shared_protein_count") - attribute_metrics_header.append("shared_protein_span") - attribute_metrics_header.append("specific_cluster_true_1to1_count") - attribute_metrics_header.append("specific_cluster_fuzzy_count") - attribute_metrics_header.append("shared_cluster_true_1to1_count") - attribute_metrics_header.append("shared_cluster_fuzzy_count") - attribute_metrics_header.append("absent_cluster_total_count") - attribute_metrics_header.append("absent_cluster_singleton_count") - attribute_metrics_header.append("absent_cluster_specific_count") - attribute_metrics_header.append("absent_cluster_shared_count") - attribute_metrics_header.append("TAXON_count") - 
attribute_metrics_header.append("TAXA") - return "\t".join(attribute_metrics_header) - elif filetype == "cluster_metrics_ALO": - cluster_metrics_ALO_header = [] - cluster_metrics_ALO_header.append("#cluster_id") - cluster_metrics_ALO_header.append("cluster_status") - cluster_metrics_ALO_header.append("cluster_type") - cluster_metrics_ALO_header.append("cluster_protein_count") - cluster_metrics_ALO_header.append("cluster_proteome_count") - cluster_metrics_ALO_header.append("taxon_protein_count") - cluster_metrics_ALO_header.append("taxon_mean_count") - cluster_metrics_ALO_header.append("non_taxon_mean_count") - cluster_metrics_ALO_header.append("representation") - cluster_metrics_ALO_header.append("log2_mean(TAXON/others)") - cluster_metrics_ALO_header.append("mwu_pvalue(TAXON vs. others)") - cluster_metrics_ALO_header.append("taxon_proteome_coverage") - cluster_metrics_ALO_header.append("taxon_proteomes_present_count") - cluster_metrics_ALO_header.append("taxon_proteomes_present") - #for domain_source in clusterCollection.domain_sources: - # cluster_metrics_ALO_header.append(domain_source) - return "\t".join(cluster_metrics_ALO_header) - elif filetype == "cluster_metrics": - cluster_metrics_header = [] - cluster_metrics_header.append("#cluster_id") - cluster_metrics_header.append("cluster_protein_count") - cluster_metrics_header.append("protein_median_count") - cluster_metrics_header.append("TAXON_count") - cluster_metrics_header.append("attribute") - cluster_metrics_header.append("attribute_cluster_type") - cluster_metrics_header.append("protein_span_mean") - cluster_metrics_header.append("protein_span_sd") - cluster_metrics_header += ["%s_count" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])] - if not attribute == "TAXON": - cluster_metrics_header += ["%s_median" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])] - cluster_metrics_header += ["%s_cov" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])] - return "\t".join(cluster_metrics_header) - elif filetype == "cluster_metrics_domains": - cluster_metrics_domains_header = [] - cluster_metrics_domains_header.append("#cluster_id") - cluster_metrics_domains_header.append("cluster_protein_count") - cluster_metrics_domains_header.append("TAXON_count") - cluster_metrics_domains_header.append("protein_span_mean") - cluster_metrics_domains_header.append("protein_span_sd") - cluster_metrics_domains_header.append("fraction_secreted") - for domain_source in clusterCollection.domain_sources: - cluster_metrics_domains_header.append(domain_source) - cluster_metrics_domains_header.append("%s_entropy" % (domain_source)) - return "\t".join(cluster_metrics_domains_header) - elif filetype == "cluster_metrics_domains_detailed": - cluster_metrics_domains_detailed_header = [] - cluster_metrics_domains_detailed_header.append("#cluster_id") - cluster_metrics_domains_detailed_header.append("domain_source") - cluster_metrics_domains_detailed_header.append("domain_id") - cluster_metrics_domains_detailed_header.append("domain_description") - cluster_metrics_domains_detailed_header.append("protein_count") - cluster_metrics_domains_detailed_header.append("protein_count_with_domain") - cluster_metrics_domains_detailed_header.append("TAXA_with_domain_fraction") - cluster_metrics_domains_detailed_header.append("TAXA_with_domain") - cluster_metrics_domains_detailed_header.append("TAXA_without_domain") - return "\t".join(cluster_metrics_domains_detailed_header) - elif filetype == 
"cafe": - cafe_header = [] - cafe_header.append("ID") - for level in sorted(aloCollection.ALO_by_level_by_attribute['TAXON']): - cafe_header.append(level) - return "\t".join(cafe_header) - elif filetype == "pairwise_representation_test": - pairwise_representation_test_header = [] - pairwise_representation_test_header.append("#cluster_id") - pairwise_representation_test_header.append("TAXON_1") - pairwise_representation_test_header.append("TAXON_1_mean") - pairwise_representation_test_header.append("TAXON_2") - pairwise_representation_test_header.append("TAXON_2_mean") - pairwise_representation_test_header.append("log2_mean(TAXON_1/TAXON_2)") - pairwise_representation_test_header.append("mwu_pvalue(TAXON_1 vs. TAXON_2)") - #pairwise_representation_test_header.append("go_terms") - #for domain_source in clusterCollection.domain_sources: - # pairwise_representation_test_header.append(domain_source) - return "\t".join(pairwise_representation_test_header) - elif filetype == 'cluster_1to1s_ALO': - cluster_1to1s_ALO_header = [] - cluster_1to1s_ALO_header.append("#cluster_id") - cluster_1to1s_ALO_header.append("cluster_type") - cluster_1to1s_ALO_header.append("cardinality") - cluster_1to1s_ALO_header.append("proteome_count") - cluster_1to1s_ALO_header.append("percentage_at_target_count") - return "\t".join(cluster_1to1s_ALO_header) - else: - sys.exit("[ERROR] %s is not a valid header 'filetype'" % (filetype)) - - def get_attribute_metrics(self, ALO): - attribute_metrics = [] - attribute_metrics.append(ALO.attribute) - attribute_metrics.append(ALO.level) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('present', 'total')) - attribute_metrics.append(ALO.get_protein_count_by_cluster_type('total')) - attribute_metrics.append(ALO.get_protein_span_by_cluster_type('total')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('present', 'singleton')) - attribute_metrics.append(ALO.get_protein_count_by_cluster_type('singleton')) - attribute_metrics.append(ALO.get_protein_span_by_cluster_type('singleton')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('present', 'specific')) - attribute_metrics.append(ALO.get_protein_count_by_cluster_type('specific')) - attribute_metrics.append(ALO.get_protein_span_by_cluster_type('specific')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('present', 'shared')) - attribute_metrics.append(ALO.get_protein_count_by_cluster_type('shared')) - attribute_metrics.append(ALO.get_protein_span_by_cluster_type('shared')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type('specific', 'true')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type('specific', 'fuzzy')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type('shared', 'true')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type('shared', 'fuzzy')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('absent', 'total')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('absent', 'singleton')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('absent', 'specific')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('absent', 'shared')) - attribute_metrics.append(ALO.proteome_count) - 
attribute_metrics.append(ALO.get_proteomes()) - return "\t".join([str(field) for field in attribute_metrics]) - - def write_cluster_metrics(self): - cafe_f = join(self.dirs['main'], "clusters_counts_by_taxon.txt") - cafe_output = [] - cafe_output.append(self.get_header_line('cafe', "TAXON")) - - cluster_metrics_domains_f = join(self.dirs['main'], "cluster_metrics_domains.txt") - cluster_metrics_domains_output = [] - cluster_metrics_domains_output.append(self.get_header_line('cluster_metrics_domains', "TAXON")) - - cluster_metrics_domains_detailed_output_by_domain_source = {} - cluster_metrics_domains_detailed_f_by_domain_source = {} - for domain_source in clusterCollection.domain_sources: - cluster_metrics_domains_detailed_output_by_domain_source[domain_source] = [] - cluster_metrics_domains_detailed_output_by_domain_source[domain_source].append(self.get_header_line('cluster_metrics_domains_detailed', "TAXON")) - cluster_metrics_domains_detailed_f_by_domain_source[domain_source] = join(self.dirs['main'], "cluster_metrics_domains.%s.txt" % (domain_source)) - - for attribute in aloCollection.attributes: - - attribute_metrics_f = join(self.dirs[attribute], "%s.attribute_metrics.txt" % (attribute)) - attribute_metrics_output = [] - attribute_metrics_output.append(self.get_header_line('attribute_metrics', attribute)) - - pairwise_representation_test_f = join(self.dirs[attribute], "%s.pairwise_representation_test.txt" % (attribute)) - pairwise_representation_test_output = [] - pairwise_representation_test_output.append(self.get_header_line('pairwise_representation_test', attribute)) - - pairwise_representation_test_by_pair_by_attribute = {} - - ########################### - # cluster_metrics - ########################### - - cluster_metrics_f = join(self.dirs[attribute], "%s.cluster_metrics.txt" % (attribute)) - cluster_metrics_output = [] - cluster_metrics_output.append(self.get_header_line('cluster_metrics', attribute)) - - levels = sorted([x for x in aloCollection.ALO_by_level_by_attribute[attribute]]) - levels_seen = set() - - for level in levels: - ALO = aloCollection.ALO_by_level_by_attribute[attribute][level] - - ########################### - # attribute_metrics - ########################### - - attribute_metrics_output.append(self.get_attribute_metrics(ALO)) - - ########################### - # cluster_metrics_ALO : setup - ########################### - - cluster_metrics_ALO_f = join(self.dirs[attribute], "%s.%s.cluster_metrics.txt" % (attribute, level)) - cluster_metrics_ALO_output = [] - cluster_metrics_ALO_output.append(self.get_header_line('cluster_metrics_ALO', attribute)) - - background_representation_test_by_pair_by_attribute = {} - - ########################### - # cluster_1to1s - ########################### - - cluster_1to1_ALO_f = join(self.dirs[attribute], "%s.%s.cluster_1to1s.txt" % (attribute, level)) - cluster_1to1_ALO_output = [] - cluster_1to1_ALO_output.append(self.get_header_line('cluster_1to1s_ALO', attribute)) - if not attribute == "TAXON": - for cluster_type in ALO.clusters_by_cluster_cardinality_by_cluster_type: - for cluster_cardinality in ALO.clusters_by_cluster_cardinality_by_cluster_type[cluster_type]: - for cluster_id in ALO.clusters_by_cluster_cardinality_by_cluster_type[cluster_type][cluster_cardinality]: - cluster_1to1_ALO_line = [] - cluster_1to1_ALO_line.append(cluster_id) - cluster_1to1_ALO_line.append(cluster_type) - cluster_1to1_ALO_line.append(cluster_cardinality) - 
cluster_1to1_ALO_line.append(clusterCollection.clusterObjs_by_cluster_id[cluster_id].proteome_count) - cluster_1to1_ALO_line.append("{0:.2f}".format( - len([protein_count for proteome_id, protein_count in clusterCollection.clusterObjs_by_cluster_id[cluster_id].protein_count_by_proteome_id.items() if protein_count == inputObj.fuzzy_count]) / clusterCollection.clusterObjs_by_cluster_id[cluster_id].proteome_count) - ) - cluster_1to1_ALO_output.append("\t".join([str(field) for field in cluster_1to1_ALO_line])) - - for clusterObj in clusterCollection.clusterObjs: - - ########################### - # cluster_metrics (only done once for each attribute) - ########################### - - if not levels_seen: - cluster_metrics_line = [] - cluster_metrics_line.append(clusterObj.cluster_id) - cluster_metrics_line.append(clusterObj.protein_count) - cluster_metrics_line.append(clusterObj.protein_median) - cluster_metrics_line.append(clusterObj.proteome_count) - cluster_metrics_line.append(attribute) - cluster_metrics_line.append(clusterObj.cluster_type_by_attribute[attribute]) - if clusterCollection.fastas_parsed: - cluster_metrics_line.append(clusterObj.protein_length_stats['mean']) - cluster_metrics_line.append(clusterObj.protein_length_stats['sd']) - else: - cluster_metrics_line.append("N/A") - cluster_metrics_line.append("N/A") - for _level in levels: - cluster_metrics_line.append(sum(clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][_level])) - if not attribute == "TAXON": - for _level in levels: - cluster_metrics_line.append(median(clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][_level])) - for _level in levels: - cluster_metrics_line.append("{0:.2f}".format(clusterObj.proteome_coverage_by_level_by_attribute[attribute][_level])) - cluster_metrics_output.append("\t".join([str(field) for field in cluster_metrics_line])) - - ########################### - # cafe (only done for attribute "TAXON") - ########################### - - if not levels_seen and attribute == "TAXON": - cafe_line = [] - #cafe_line.append("None") - cafe_line.append(str(clusterObj.cluster_id)) - for _level in levels: - cafe_line.append(sum(clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][_level])) - cafe_output.append("\t".join([str(field) for field in cafe_line])) - - ########################### - # cluster_metrics_domains (only done for attribute "TAXON") - # - now different: - # - has line for each domain_id for each domain_source - ########################### - - if not levels_seen and attribute == "TAXON": - if clusterCollection.functional_annotation_parsed: - # cluster_metrics_domain_line - cluster_metrics_domains_line = [] - cluster_metrics_domains_line.append(clusterObj.cluster_id) - cluster_metrics_domains_line.append(clusterObj.protein_count) - cluster_metrics_domains_line.append(clusterObj.proteome_count) - if clusterCollection.fastas_parsed: - cluster_metrics_domains_line.append(clusterObj.protein_length_stats['mean']) - cluster_metrics_domains_line.append(clusterObj.protein_length_stats['sd']) - else: - cluster_metrics_domains_line.append("N/A") - cluster_metrics_domains_line.append("N/A") - if "SignalP_EUK" in clusterCollection.domain_sources: - cluster_metrics_domains_line.append("{0:.2f}".format(clusterObj.secreted_cluster_coverage)) - else: - cluster_metrics_domains_line.append("N/A") - for domain_source in clusterCollection.domain_sources: - # cluster_metrics_domains - if domain_source in clusterObj.domain_counter_by_domain_source: - 
cluster_metrics_domains_line.append(";".join(["%s:%s" % (domain_id, count) for domain_id, count in clusterObj.domain_counter_by_domain_source[domain_source].most_common()])) - cluster_metrics_domains_line.append("{0:.3f}".format(clusterObj.domain_entropy_by_domain_source[domain_source])) - else: - cluster_metrics_domains_line.append("N/A") - cluster_metrics_domains_line.append("N/A") - cluster_metrics_domains_output.append("\t".join([str(field) for field in cluster_metrics_domains_line])) - for domain_source in clusterObj.domain_counter_by_domain_source: - for domain_id, count in clusterObj.domain_counter_by_domain_source[domain_source].most_common(): - cluster_metrics_domains_detailed_output_line = [] - cluster_metrics_domains_detailed_output_line.append(clusterObj.cluster_id) - cluster_metrics_domains_detailed_output_line.append(domain_source) - cluster_metrics_domains_detailed_output_line.append(domain_id) - if domain_source == 'SignalP_EUK': - cluster_metrics_domains_detailed_output_line.append(domain_id) - else: - if domain_source in proteinCollection.domain_description_by_domain_id_by_domain_source: - cluster_metrics_domains_detailed_output_line.append(proteinCollection.domain_description_by_domain_id_by_domain_source[domain_source].get(domain_id, "N/A")) - else: - cluster_metrics_domains_detailed_output_line.append("N/A") - cluster_metrics_domains_detailed_output_line.append(clusterObj.protein_count) - protein_with_domain_count_by_proteome_id = {} - proteome_count_with_domain = 0 - protein_without_domain_count_by_proteome_id = {} - for proteome_id, protein_ids in clusterObj.protein_ids_by_proteome_id.items(): - proteome_seen = False - for protein_id in protein_ids: - if domain_source in proteinCollection.proteinObjs_by_protein_id[protein_id].domain_counter_by_domain_source and domain_id in proteinCollection.proteinObjs_by_protein_id[protein_id].domain_counter_by_domain_source[domain_source]: - protein_with_domain_count_by_proteome_id[proteome_id] = protein_with_domain_count_by_proteome_id.get(proteome_id, 0) + 1 - if not proteome_seen: - proteome_count_with_domain += 1 - proteome_seen = True - else: - protein_without_domain_count_by_proteome_id[proteome_id] = protein_without_domain_count_by_proteome_id.get(proteome_id, 0) + 1 - proteomes_with_domain_count_string = ",".join(sorted(["%s:%s/%s" % (proteome_id, count, len(clusterObj.protein_ids_by_proteome_id[proteome_id])) for proteome_id, count in protein_with_domain_count_by_proteome_id.items()])) - proteomes_without_domain_count_string = ",".join(sorted(["%s:%s/%s" % (proteome_id, count, len(clusterObj.protein_ids_by_proteome_id[proteome_id])) for proteome_id, count in protein_without_domain_count_by_proteome_id.items()])) - cluster_metrics_domains_detailed_output_line.append(sum(protein_with_domain_count_by_proteome_id.values())) - cluster_metrics_domains_detailed_output_line.append("{0:.3f}".format(proteome_count_with_domain / clusterObj.proteome_count)) - if proteomes_with_domain_count_string: - cluster_metrics_domains_detailed_output_line.append(proteomes_with_domain_count_string) - else: - cluster_metrics_domains_detailed_output_line.append("N/A") - if proteomes_without_domain_count_string: - cluster_metrics_domains_detailed_output_line.append(proteomes_without_domain_count_string) - else: - cluster_metrics_domains_detailed_output_line.append("N/A") - cluster_metrics_domains_detailed_output_by_domain_source[domain_source].append("\t".join([str(field) for field in cluster_metrics_domains_detailed_output_line])) - - 
########################### - # cluster_metrics_ALO : populate - ########################### - - cluster_metrics_ALO_line = [] - cluster_metrics_ALO_line.append(clusterObj.cluster_id) - cluster_metrics_ALO_line.append(ALO.cluster_status_by_cluster_id[clusterObj.cluster_id]) - cluster_metrics_ALO_line.append(ALO.cluster_type_by_cluster_id[clusterObj.cluster_id]) - cluster_metrics_ALO_line.append(clusterObj.protein_count) - cluster_metrics_ALO_line.append(clusterObj.proteome_count) - cluster_metrics_ALO_line.append(sum(clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][level])) - if ALO.cluster_mean_ALO_count_by_cluster_id[clusterObj.cluster_id]: - cluster_metrics_ALO_line.append(ALO.cluster_mean_ALO_count_by_cluster_id[clusterObj.cluster_id]) - else: - cluster_metrics_ALO_line.append("N/A") - if ALO.cluster_mean_non_ALO_count_by_cluster_id[clusterObj.cluster_id]: - cluster_metrics_ALO_line.append(ALO.cluster_mean_non_ALO_count_by_cluster_id[clusterObj.cluster_id]) - else: - cluster_metrics_ALO_line.append("N/A") - if ALO.cluster_type_by_cluster_id[clusterObj.cluster_id] == 'shared': - if ALO.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id]: - background_pair = (level, "background") - if attribute not in background_representation_test_by_pair_by_attribute: - background_representation_test_by_pair_by_attribute[attribute] = {} - if background_pair not in background_representation_test_by_pair_by_attribute[attribute]: - background_representation_test_by_pair_by_attribute[attribute][background_pair] = [] - background_representation_test = [] - background_representation_test.append(clusterObj.cluster_id) - background_representation_test.append(level) - background_representation_test.append("background") - background_representation_test.append(ALO.cluster_mean_ALO_count_by_cluster_id[clusterObj.cluster_id]) - background_representation_test.append(ALO.cluster_mean_non_ALO_count_by_cluster_id[clusterObj.cluster_id]) - background_representation_test.append(ALO.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id]) - background_representation_test.append(ALO.cluster_mwu_pvalue_by_cluster_id[clusterObj.cluster_id]) - background_representation_test_by_pair_by_attribute[attribute][background_pair].append(background_representation_test) - - if ALO.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id] > 0: - cluster_metrics_ALO_line.append("enriched") - elif ALO.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id] < 0: - cluster_metrics_ALO_line.append("depleted") - else: - cluster_metrics_ALO_line.append("equal") - cluster_metrics_ALO_line.append(ALO.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id]) - cluster_metrics_ALO_line.append(ALO.cluster_mwu_pvalue_by_cluster_id[clusterObj.cluster_id]) - else: - cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_line.append("N/A") - else: - cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_line.append("{0:.2f}".format(clusterObj.proteome_coverage_by_level_by_attribute[attribute][level])) - ALO_proteomes_present = [] - if ALO.cluster_status_by_cluster_id[clusterObj.cluster_id] == 'present': - ALO_proteomes_present = clusterObj.proteome_ids.intersection(ALO.proteomes) - cluster_metrics_ALO_line.append(len(ALO_proteomes_present)) - if ALO_proteomes_present: - cluster_metrics_ALO_line.append(",".join(sorted(list(ALO_proteomes_present)))) - else: - 
cluster_metrics_ALO_line.append("N/A") - if clusterObj.go_terms: - cluster_metrics_ALO_line.append(";".join(sorted(list(clusterObj.go_terms)))) - else: - cluster_metrics_ALO_line.append("N/A") - #for domain_source in clusterCollection.domain_sources: - # if domain_source in clusterObj.domain_counter_by_domain_source: - # cluster_metrics_ALO_line.append(";".join(["%s:%s" % (domain, count) for domain, count in clusterObj.domain_counter_by_domain_source[domain_source].most_common()])) - # else: - # cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_output.append("\t".join([str(field) for field in cluster_metrics_ALO_line])) - - if len(levels) > 1 and len(ALO_proteomes_present) >= inputObj.min_proteomes: - for result in self.pairwise_representation_test(clusterObj, attribute, level, levels_seen, levels): - # [clusterObj.cluster_id, level, other_level, mean_level, mean_other_level, log2fc_mean, pvalue] - if attribute not in pairwise_representation_test_by_pair_by_attribute: - pairwise_representation_test_by_pair_by_attribute[attribute] = {} - pair = (result[1], result[2]) - if pair not in pairwise_representation_test_by_pair_by_attribute[attribute]: - pairwise_representation_test_by_pair_by_attribute[attribute][pair] = [] - pairwise_representation_test_by_pair_by_attribute[attribute][pair].append(result) - - pairwise_representation_test_line = [] - pairwise_representation_test_line.append(result[0]) - pairwise_representation_test_line.append(result[1]) - pairwise_representation_test_line.append(result[3]) - pairwise_representation_test_line.append(result[2]) - pairwise_representation_test_line.append(result[4]) - pairwise_representation_test_line.append(result[5]) - pairwise_representation_test_line.append(result[6]) - #if clusterObj.go_terms: - # pairwise_representation_test_line.append(";".join(sorted(list(clusterObj.go_terms)))) - #else: - # pairwise_representation_test_line.append("N/A") - #for domain_source in clusterCollection.domain_sources: - # if domain_source in clusterObj.domain_counter_by_domain_source: - # pairwise_representation_test_line.append(";".join(["%s:%s" % (domain, count) for domain, count in clusterObj.domain_counter_by_domain_source[domain_source].most_common()])) - # else: - # pairwise_representation_test_line.append("N/A") - pairwise_representation_test_output.append("\t".join([str(field) for field in pairwise_representation_test_line])) - - levels_seen.add(level) - # END of cluster loop - - if len(cafe_output) > 1: - with open(cafe_f, 'w') as cafe_fh: - print "[STATUS] - Writing %s" % (cafe_f) - cafe_fh.write("\n".join(cafe_output) + "\n") - cafe_output = [] - if len(cluster_metrics_output) > 1: - with open(cluster_metrics_f, 'w') as cluster_metrics_fh: - print "[STATUS] - Writing %s" % (cluster_metrics_f) - cluster_metrics_fh.write("\n".join(cluster_metrics_output) + "\n") - cluster_metrics_output = [] - if len(cluster_metrics_domains_output) > 1: - with open(cluster_metrics_domains_f, 'w') as cluster_metrics_domains_fh: - print "[STATUS] - Writing %s" % (cluster_metrics_domains_f) - cluster_metrics_domains_fh.write("\n".join(cluster_metrics_domains_output) + "\n") - cluster_metrics_domains_output = [] - for domain_source in cluster_metrics_domains_detailed_output_by_domain_source: - if len(cluster_metrics_domains_detailed_output_by_domain_source[domain_source]) > 1: - cluster_metrics_domains_detailed_f = cluster_metrics_domains_detailed_f_by_domain_source[domain_source] - with open(cluster_metrics_domains_detailed_f, 'w') as 
cluster_metrics_domains_detailed_fh: - print "[STATUS] - Writing %s" % (cluster_metrics_domains_detailed_f) - cluster_metrics_domains_detailed_fh.write("\n".join(cluster_metrics_domains_detailed_output_by_domain_source[domain_source]) + "\n") - cluster_metrics_domains_detailed_output_by_domain_source[domain_source] = [] - if len(cluster_metrics_ALO_output) > 1: - with open(cluster_metrics_ALO_f, 'w') as cluster_metrics_ALO_fh: - print "[STATUS] - Writing %s" % (cluster_metrics_ALO_f) - cluster_metrics_ALO_fh.write("\n".join(cluster_metrics_ALO_output) + "\n") - cluster_metrics_ALO_output = [] - if len(cluster_1to1_ALO_output) > 1: - with open(cluster_1to1_ALO_f, 'w') as cluster_1to1_ALO_fh: - print "[STATUS] - Writing %s" % (cluster_1to1_ALO_f) - cluster_1to1_ALO_fh.write("\n".join(cluster_1to1_ALO_output) + "\n") - cluster_1to1_ALO_output = [] - if background_representation_test_by_pair_by_attribute: - self.plot_count_comparisons_vulcano(background_representation_test_by_pair_by_attribute) - - - if len(attribute_metrics_output) > 1: - with open(attribute_metrics_f, 'w') as attribute_metrics_fh: - print "[STATUS] - Writing %s" % (attribute_metrics_f) - attribute_metrics_fh.write("\n".join(attribute_metrics_output) + "\n") - if len(pairwise_representation_test_output) > 1: - with open(pairwise_representation_test_f, 'w') as pairwise_representation_test_fh: - print "[STATUS] - Writing %s" % (pairwise_representation_test_f) - pairwise_representation_test_fh.write("\n".join(pairwise_representation_test_output) + "\n") - if pairwise_representation_test_by_pair_by_attribute: - self.plot_count_comparisons_vulcano(pairwise_representation_test_by_pair_by_attribute) - - def plot_count_comparisons_vulcano(self, pairwise_representation_test_by_pair_by_attribute): - # [clusterObj.cluster_id, level, other_level, mean_level, mean_other_level, log2fc_mean, pvalue] - for attribute in pairwise_representation_test_by_pair_by_attribute: - for pair in pairwise_representation_test_by_pair_by_attribute[attribute]: - pair_list = list(pair) - x_label = pair_list[0] - y_label = pair_list[1] - pair_data = pairwise_representation_test_by_pair_by_attribute[attribute][pair] - pair_data_count = len(pair_data) - p_values = [] - log2fc_values = [] - for data in pair_data: - log2fc_values.append(data[5]) - p_values.append(data[6]) - if p_values: - pairwise_representation_test_f = join(self.dirs[attribute], "%s.pairwise_representation_test.%s.%s" % (attribute, "_".join(pair_list), inputObj.plot_format)) - f, ax = plt.subplots(figsize=inputObj.plot_size) - ax.set_facecolor('white') - p_array = np.array(p_values) - log2fc_array = np.array(log2fc_values) - ax.scatter(log2fc_array, p_array, alpha=0.8, edgecolors='none', s=25, c='grey') - - ooFive = 0.05 - ooOne = 0.01 - ooFive_corrected = 0.05 / pair_data_count - ooOne_corrected = 0.01 / pair_data_count - - ax.axhline(y=ooFive, linewidth=2, color='orange', linestyle="--") - ooFive_artist = plt.Line2D((0, 1), (0, 0), color='orange', linestyle='--') - ax.axhline(y=ooOne, linewidth=2, color='red', linestyle="--") - ooOne_artist = plt.Line2D((0, 1), (0, 0), color='red', linestyle='--') - ax.axhline(y=ooFive_corrected, linewidth=2, color='grey', linestyle="--") - ooFive_corrected_artist = plt.Line2D((0, 1), (0, 0), color='grey', linestyle='--') - ax.axhline(y=ooOne_corrected, linewidth=2, color='black', linestyle="--") - ooOne_corrected_artist = plt.Line2D((0, 1), (0, 0), color='black', linestyle='--') - - # Create legend from custom artist/label lists - legend = 
ax.legend([ooFive_artist, ooOne_artist, ooFive_corrected_artist, ooOne_corrected_artist], - [ooFive, ooOne, "%s (0.05 corrected)" % '%.2E' % Decimal(ooFive_corrected), "%s (0.01 corrected)" % '%.2E' % Decimal(ooOne_corrected)], - fontsize=inputObj.plot_font_size, frameon=True) - legend.get_frame().set_facecolor('white') - if abs(np.min(log2fc_array)) < abs(np.max(log2fc_array)): - x_min = 0.0 - abs(np.max(log2fc_array)) - x_max = 0.0 + abs(np.max(log2fc_array)) - ax.set_xlim(x_min - 1, x_max + 1) - else: - x_min = 0.0 - abs(np.min(log2fc_array)) - x_max = 0.0 + abs(np.min(log2fc_array)) - ax.set_xlim(x_min - 1, x_max + 1) - - ax.grid(True, linewidth=1, which="major", color="lightgrey") - ax.set_ylim(np.min(p_array) * 0.1, 1.1) - ax.set_xlabel("log2(mean(%s)/mean(%s))" % (x_label, y_label), fontsize=inputObj.plot_font_size) - ax.set_ylabel("p-value", fontsize=inputObj.plot_font_size) - plt.gca().invert_yaxis() - ax.set_yscale('log') - print "[STATUS] - Plotting %s" % (pairwise_representation_test_f) - f.savefig(pairwise_representation_test_f, format=inputObj.plot_format) - plt.close() - - def pairwise_representation_test(self, clusterObj, attribute, level, levels_seen, levels): - for other_level in set(levels).difference(levels_seen): - if not other_level == level: - other_ALO = aloCollection.ALO_by_level_by_attribute[attribute][other_level] - if len(clusterObj.proteome_ids.intersection(other_ALO.proteomes)) >= 2: - protein_counts_level = [count for count in clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][level] if count > 0] - protein_counts_other_level = [count for count in clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][other_level] if count > 0] - if protein_counts_level and protein_counts_other_level: - pvalue = None - try: - pvalue = scipy.stats.mannwhitneyu(protein_counts_level, protein_counts_other_level, alternative="two-sided")[1] - except: - pvalue = 1.0 - mean_level = mean(protein_counts_level) - mean_other_level = mean(protein_counts_other_level) - log2fc_mean = log((mean_level/mean_other_level), 2) - yield [clusterObj.cluster_id, level, other_level, mean_level, mean_other_level, log2fc_mean, pvalue] - -######################################################################## -# CLASS : AloCollection -######################################################################## - -class AloCollection(): - def __init__(self, proteomes, proteome_id_by_species_id, attributes, level_by_attribute_by_proteome_id, tree_ete, node_idx_by_proteome_ids): - self.attributes_verbose = attributes - self.attributes = [attribute for attribute in attributes if attribute not in inputObj.ATTRIBUTE_RESERVED] # list of attributes - self.proteome_id_by_species_id = proteome_id_by_species_id - self.tree_ete = tree_ete - - self.node_idx_by_proteome_ids = node_idx_by_proteome_ids - self.level_by_attribute_by_proteome_id = level_by_attribute_by_proteome_id - self.proteome_ids_by_level_by_attribute = self.compute_proteomes_by_level_by_attribute() - - self.counts_of_all_proteome_subsets = {} - self.cluster_ids_of_all_proteome_subsets = {} - - self.ALO_by_level_by_attribute = self.create_ALOs() - - self.fastas_parsed = False - self.rarefaction_by_samplesize_by_level_by_attribute = {} - - ############################### - ### create_ALOs - ############################### - - def create_ALOs(self): - ALO_by_level_by_attribute = {attribute: {} for attribute in self.attributes} - for attribute in self.proteome_ids_by_level_by_attribute: - for level in 
self.proteome_ids_by_level_by_attribute[attribute]: - proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][level] - ALO = AttributeLevelObj(attribute, level, proteome_ids) - if not level in ALO_by_level_by_attribute[attribute]: - ALO_by_level_by_attribute[attribute][level] = {} - ALO_by_level_by_attribute[attribute][level] = ALO - return ALO_by_level_by_attribute - - ############################### - ### compute_proteomes_by_level_by_attribute - ############################### - - def compute_proteomes_by_level_by_attribute(self): - proteomes_by_level_by_attribute = {attribute : {} for attribute in self.attributes} - for proteome_id in self.level_by_attribute_by_proteome_id: - for attribute in self.attributes: - level = self.level_by_attribute_by_proteome_id[proteome_id][attribute] - if not level in proteomes_by_level_by_attribute[attribute]: - proteomes_by_level_by_attribute[attribute][level] = set() - proteomes_by_level_by_attribute[attribute][level].add(proteome_id) - return proteomes_by_level_by_attribute - ############################### - ### compute_levels_by_attribute - ############################### - - def compute_levels_by_attribute(self): - levels_by_attribute = {attribute : set() for attribute in self.attributes} - for proteome in self.level_by_attribute_by_proteome_id: - for attribute in self.attributes: - level = self.level_by_attribute_by_proteome_id[proteome][attribute] - levels_by_attribute[attribute].add(level) - return levels_by_attribute - - def analyse_domains(self): - if proteinCollection.functional_annotation_parsed: - for attribute in self.ALO_by_level_by_attribute: - for level in self.ALO_by_level_by_attribute[attribute]: - ALO = self.ALO_by_level_by_attribute[attribute][level] - ALO.analyse_domains() - ############################### - ### analyse_clusters - ############################### - - def analyse_clusters(self): - if clusterCollection.inferred_singletons_count: - print "[STATUS]\t - Clusters found = %s (of which %s were inferred singletons)" % (clusterCollection.cluster_count, clusterCollection.inferred_singletons_count) - else: - print "[STATUS]\t - Clusters found = %s" % (clusterCollection.cluster_count) - parse_steps = clusterCollection.cluster_count/100 - print "[STATUS] - Analysing clusters ..." - analyse_clusters_start = time.time() - for idx, clusterObj in enumerate(clusterCollection.clusterObjs): - self.analyse_cluster(clusterObj) - progress(idx+1, parse_steps, clusterCollection.cluster_count) - analyse_clusters_end = time.time() - analyse_clusters_elapsed = analyse_clusters_end - analyse_clusters_start - print "[STATUS] - Took %ss to analyse clusters" % (analyse_clusters_elapsed) - - ############################### - ### analyse_clusters : analyse_cluster - ############################### - - def analyse_cluster(self, clusterObj): - '''This function selects the ALOs to which the cluster has to be added''' - # avoiding dots - protein_get_by_proteome_id = clusterObj.protein_ids_by_proteome_id.get - - implicit_protein_ids_by_proteome_id_by_level_by_attribute = {} - cluster_type_by_attribute = {} - protein_counts_of_proteomes_by_level_by_attribute = {} - proteome_coverage_by_level_by_attribute = {} - if self.tree_ete: - for node in self.tree_ete.traverse("levelorder"): - intersection = clusterObj.proteome_ids.intersection(node.proteome_ids) - difference = clusterObj.proteome_ids.difference(node.proteome_ids) - if len(intersection) == 0: - # Nothing to see here ... 
- node.counts['absent'] += 1 - else: - if clusterObj.singleton == True: - # This is a singleton - node.counts['singleton'] += 1 - node.apomorphic_cluster_counts['singletons'] += 1 - elif len(difference) > 0: - # This is a 'shared' cluster - node.counts['shared'] +=1 - elif len(difference) == 0: - # This is a node 'specific' cluster - node.counts['specific'] += 1 - if clusterObj.proteome_count == 1: - # But it only belongs to one proteome - node.apomorphic_cluster_counts['non_singletons'] += 1 - else: - # It has more than one proteome - child_nodes_covered = [] - child_node_proteome_coverage_strings = [] - child_node_proteome_ids_covered_count = 0 - for child_node in node.get_children(): - if child_node.proteome_ids.isdisjoint(clusterObj.proteome_ids): - # No child node proteomes are not in cluster - child_nodes_covered.append(False) - else: - # At least on child node proteome in cluster - child_nodes_covered.append(True) - child_node_proteome_ids_covered_count = len(clusterObj.proteome_ids.intersection(child_node.proteome_ids)) - child_node_proteome_coverage_strings.append(\ - "%s=(%s/%s)" % (child_node.name, child_node_proteome_ids_covered_count, len(child_node.proteome_ids))) - if all(child_nodes_covered): - # At least one proteome of each child node in cluster - # => SYNAPOMORPHY - node_proteome_coverage = len(intersection)/len(node.proteome_ids) - node_cluster_type = '' - if node_proteome_coverage == 1.0: - node_cluster_type = 'complete_presence' - else: - node_cluster_type = 'stochastic_absence' - node.synapomorphic_cluster_counts[node_cluster_type] += 1 - node.synapomorphic_cluster_strings.append(\ - (clusterObj.cluster_id, \ - node.name, \ - node_cluster_type, \ - '{0:.3}'.format(node_proteome_coverage), \ - ";".join(child_node_proteome_coverage_strings), \ - ",".join(sorted(intersection))) \ - #",".join(sorted(clusterObj.proteome_ids))) \ - ) - else: - sys.exit("[ERROR] You broke my program ...") - - for attribute in self.attributes: - protein_counts_of_proteomes_by_level_by_attribute[attribute] = {} - proteome_coverage_by_level_by_attribute[attribute] = {} - implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute] = {} - protein_ids_by_level = {} - protein_length_stats_by_level = {} - explicit_protein_count_by_proteome_id_by_level = {} - - for level in self.ALO_by_level_by_attribute[attribute]: - protein_ids_by_proteome_id = {} - protein_count_by_proteome_id = {} - protein_ids_by_level[level] = [] - for proteome_id in self.ALO_by_level_by_attribute[attribute][level].proteomes_list: - protein_ids = protein_get_by_proteome_id(proteome_id, []) - protein_ids_by_level[level] += protein_ids - protein_count_by_proteome_id[proteome_id] = len(protein_ids) - if not protein_count_by_proteome_id[proteome_id] == 0: - protein_ids_by_proteome_id[proteome_id] = protein_ids - if protein_ids_by_proteome_id: - implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute][level] = protein_ids_by_proteome_id - explicit_protein_count_by_proteome_id_by_level[level] = protein_count_by_proteome_id - protein_length_stats_by_level[level] = proteinCollection.get_protein_length_stats(protein_ids_by_level[level]) - protein_counts_of_proteomes_by_level_by_attribute[attribute][level] = [protein_count for proteome_id, protein_count in protein_count_by_proteome_id.items()] - cluster_type_by_attribute[attribute] = get_attribute_cluster_type(clusterObj.singleton, implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute]) - - for level in self.ALO_by_level_by_attribute[attribute]: - ALO 
= self.ALO_by_level_by_attribute[attribute][level] - proteome_coverage_by_level_by_attribute[attribute][level] = len(implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute].get(level, [])) / ALO.proteome_count - ALO_cluster_status = None - ALO_cluster_cardinality = None - mwu_pvalue = None - mwu_log2_mean = None - mean_ALO_count = None - mean_non_ALO_count = None - if level not in implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute]: - ALO_cluster_status = 'absent' - else: - ALO_cluster_status = 'present' - if not cluster_type_by_attribute[attribute] == 'singleton': - ALO_proteome_counts_in_cluster = [count for proteome_id, count in explicit_protein_count_by_proteome_id_by_level[level].items()] - ALO_cluster_cardinality = get_ALO_cluster_cardinality(ALO_proteome_counts_in_cluster) - if cluster_type_by_attribute[attribute] == 'shared': - non_ALO_levels = [non_ALO_level for non_ALO_level in explicit_protein_count_by_proteome_id_by_level if not non_ALO_level == level] - non_ALO_proteome_counts_in_cluster = [] - for non_ALO_level in non_ALO_levels: - for proteome_id in explicit_protein_count_by_proteome_id_by_level[non_ALO_level]: - non_ALO_proteome_counts_in_cluster.append(explicit_protein_count_by_proteome_id_by_level[non_ALO_level][proteome_id]) - mwu_pvalue, mwu_log2_mean, mean_ALO_count, mean_non_ALO_count = mannwhitneyu(ALO_proteome_counts_in_cluster, non_ALO_proteome_counts_in_cluster) - - ALO.add_clusterObj( - clusterObj, - cluster_type_by_attribute[attribute], - ALO_cluster_status, - ALO_cluster_cardinality, - protein_ids_by_level[level], - protein_length_stats_by_level[level], - mwu_pvalue, - mwu_log2_mean, - mean_ALO_count, - mean_non_ALO_count - ) - clusterObj.protein_counts_of_proteomes_by_level_by_attribute = protein_counts_of_proteomes_by_level_by_attribute - clusterObj.protein_median = median([count for count in protein_counts_of_proteomes_by_level_by_attribute['all']['all'] if not count == 0]) - clusterObj.proteome_coverage_by_level_by_attribute = proteome_coverage_by_level_by_attribute - clusterObj.implicit_protein_ids_by_proteome_id_by_level_by_attribute = implicit_protein_ids_by_proteome_id_by_level_by_attribute - clusterObj.cluster_type_by_attribute = cluster_type_by_attribute - - def write_tree(self): - if self.tree_ete: - print "[STATUS] - Writing data for tree ... 
" - # Node stats - node_stats_f = join(dataFactory.dirs['tree'], "tree.node_metrics.txt") - node_stats_header = [] - node_stats_header.append('nodeID') - node_stats_header.append('taxon_specific_apomorphies_[singletons]') - node_stats_header.append('taxon_specific_apomorphies (non-singletons)') - node_stats_header.append('node_specific_synapomorphies_total') - node_stats_header.append('node_specific_synapomorphies_all') - node_stats_header.append('node_specific_synapomorphies_stochastic_absence') - node_stats_header.append('proteome_count') - node_stats = [] - node_stats.append("\t".join(node_stats_header)) - # Cluster node stats - node_clusters_f = join(dataFactory.dirs['tree'], "tree.cluster_metrics.txt") - node_clusters_header = [] - node_clusters_header.append('clusterID') - node_clusters_header.append('nodeID') - node_clusters_header.append('synapomorphy_type') - node_clusters_header.append('node_proteomes_coverage') - node_clusters_header.append('children_coverage') - node_clusters_header.append('node_proteomes_present') - node_clusters = [] - node_clusters.append("\t".join(node_clusters_header)) - # header_f_by_node_name - header_f_by_node_name = {} - charts_f_by_node_name = {} - for node in self.tree_ete.traverse("levelorder"): - for synapomorphic_cluster_string in node.synapomorphic_cluster_strings: - node_clusters.append("\t".join([str(string) for string in list(synapomorphic_cluster_string)])) - node_stats_line = [] - node_stats_line.append(node.name) - node_stats_line.append(node.apomorphic_cluster_counts['singletons']) - node_stats_line.append(node.apomorphic_cluster_counts['non_singletons']) - node_stats_line.append(node.synapomorphic_cluster_counts['complete_presence'] + node.synapomorphic_cluster_counts['stochastic_absence']) - node_stats_line.append(node.synapomorphic_cluster_counts['complete_presence']) - node_stats_line.append(node.synapomorphic_cluster_counts['stochastic_absence']) - node_stats_line.append(len(node.proteome_ids)) - node_stats.append("\t".join([str(string) for string in node_stats_line])) - if inputObj.render_tree: - header_f_by_node_name[node.name] = self.generate_header_for_node(node) - charts_f_by_node_name[node.name] = self.generate_chart_for_node(node) - print "[STATUS] - Writing %s ... " % node_stats_f - with open(node_stats_f, 'w') as node_stats_fh: - node_stats_fh.write("\n".join(node_stats) + "\n") - print "[STATUS] - Writing %s ... " % node_clusters_f - with open(node_clusters_f, 'w') as node_clusters_fh: - node_clusters_fh.write("\n".join(node_clusters) + "\n") - if inputObj.render_tree: - self.plot_tree(header_f_by_node_name, charts_f_by_node_name) - - def plot_tree(self, header_f_by_node_name, charts_f_by_node_name): - tree_f = join(dataFactory.dirs['tree'], "tree.%s" % ('pdf')) # must be PDF! (otherwise it breaks) - style = ete3.NodeStyle() - style["vt_line_width"] = 5 - style["hz_line_width"] = 5 - style["fgcolor"] = "darkgrey" - for node in self.tree_ete.traverse("levelorder"): - node.set_style(style) - if header_f_by_node_name[node.name]: - node_header_face = ete3.faces.ImgFace(header_f_by_node_name[node.name]) # must be PNG! (ETE can't do PDF Faces) - node.add_face(node_header_face, column=0, position="branch-top") - if charts_f_by_node_name[node.name]: - node_chart_face = ete3.faces.ImgFace(charts_f_by_node_name[node.name]) # must be PNG! 
(ETE can't do PDF Faces) - node.add_face(node_chart_face, column=0, position="branch-bottom") - node_name_face = ete3.TextFace(node.name, fsize=64) - node.img_style["size"] = 10 - node.img_style["shape"] = "sphere" - node.img_style["fgcolor"] = "black" - if not node.is_leaf(): - node.add_face(node_name_face, column=0, position="branch-right") - node.add_face(node_name_face, column=0, position="aligned") - ts = ete3.TreeStyle() - ts.draw_guiding_lines = True - ts.show_scale = False - ts.show_leaf_name = False - ts.allow_face_overlap = True - ts.guiding_lines_color = "lightgrey" - print "[STATUS] - Writing tree %s ... " % (tree_f) - self.tree_ete.render(tree_f, dpi=600, h=1189, units="mm", tree_style=ts) - - def generate_header_for_node(self, node): - node_header_f = join(dataFactory.dirs['tree_headers'], "%s.header.png" % (node.name)) - data = [] - data.append(("Apomorphies (size=1)", "{:,}".format(node.apomorphic_cluster_counts['singletons']))) - data.append(("Apomorphies (size>1)", "{:,}".format(node.apomorphic_cluster_counts['non_singletons']))) - data.append(("Synapomorphies (all)", "{:,}".format(node.synapomorphic_cluster_counts['complete_presence'] + node.synapomorphic_cluster_counts['stochastic_absence']))) - data.append(("Synapomorphies (cov=100%)", "{:,}".format(node.synapomorphic_cluster_counts['complete_presence']))) - data.append(("Synapomorphies (cov<100%)", "{:,}".format(node.synapomorphic_cluster_counts['stochastic_absence']))) - col_labels = ('Type', 'Count') - fig, ax = plt.subplots(figsize=(2, 0.5)) - ax.set_facecolor('white') - table = ax.table( - cellText=data, - colLabels=col_labels, - loc='bottom', fontsize=24, colLoc='center', rowLoc='right', edges='' - ) - table.set_fontsize(24) - table.scale(2, 1) - for key, cell in table.get_celld().items(): - row, col = key - cell._text.set_color('grey') - if row > 0: - cell.set_edgecolor("darkgrey") - cell.visible_edges = "T" - else: - cell.set_edgecolor("darkgrey") - cell.visible_edges = "B" - if row == len(data) - 2: - cell.set_edgecolor("darkgrey") - cell.visible_edges = "T" - ax.axis('tight') - ax.axis("off") - print "[STATUS]\t- Plotting %s" % (node_header_f) - fig.savefig(node_header_f, pad=0, bbox_inches='tight', format='png') - plt.close() - return node_header_f - - def generate_chart_for_node(self, node): - proteome_coverages = [] - for synapomorphic_cluster_string in node.synapomorphic_cluster_strings: - proteome_coverages.append(float(synapomorphic_cluster_string[3])) - if proteome_coverages: - chart_f = join(dataFactory.dirs['tree_charts'], "%s.barchart.png" % (node.name)) - f, ax = plt.subplots(figsize=(3.0, 3.0)) - ax.set_facecolor('white') - x_values = np.array(proteome_coverages) - ax.hist(x_values, histtype='stepfilled', align='mid', bins=np.arange(0.0, 1.0 + 0.1, 0.1)) - ax.set_xlim(-0.1, 1.1) - for tick in ax.xaxis.get_major_ticks(): - tick.label.set_fontsize(inputObj.plot_font_size - 2) - tick.label.set_rotation('vertical') - for tick in ax.yaxis.get_major_ticks(): - tick.label.set_fontsize(inputObj.plot_font_size - 2) - ax.set_frame_on(False) - ax.xaxis.grid(True, linewidth=1, which="major", color="lightgrey") - ax.yaxis.grid(True, linewidth=1, which="major", color="lightgrey") - f.suptitle("Synapomorphies", y=1.1) - ax.set_ylabel("Count", fontsize=inputObj.plot_font_size) - ax.set_xlabel("Proteome coverage", fontsize=inputObj.plot_font_size) - print "[STATUS]\t- Plotting %s" % (chart_f) - f.savefig(chart_f, bbox_inches='tight', format='png') - if inputObj.plot_format == 'pdf': - pdf_chart_f = 
join(dataFactory.dirs['tree_charts'], "%s.barchart.pdf" % (node.name)) - print "[STATUS]\t- Plotting %s" % (pdf_chart_f) - f.savefig(pdf_chart_f, bbox_inches='tight', format='pdf') - plt.close() - return chart_f - - def compute_rarefaction_data(self): - rarefaction_by_samplesize_by_level_by_attribute = {} - print "[STATUS] - Generating rarefaction data ..." - for attribute in self.attributes: - for level in self.proteome_ids_by_level_by_attribute[attribute]: - proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][level] - if not len(proteome_ids) == 1: - ALO = self.ALO_by_level_by_attribute[attribute][level] - if not attribute in rarefaction_by_samplesize_by_level_by_attribute: - rarefaction_by_samplesize_by_level_by_attribute[attribute] = {} - if not level in rarefaction_by_samplesize_by_level_by_attribute[attribute]: - rarefaction_by_samplesize_by_level_by_attribute[attribute][level] = {} - for repetition in xrange(0, inputObj.repetitions): - seen_cluster_ids = set() - random_list_of_proteome_ids = [x for x in ALO.proteomes] - random.shuffle(random_list_of_proteome_ids) - for idx, proteome_id in enumerate(random_list_of_proteome_ids): - proteome_ALO = self.ALO_by_level_by_attribute['TAXON'][proteome_id] - seen_cluster_ids.update(proteome_ALO.cluster_ids_by_cluster_type_by_cluster_status['present']['specific']) - seen_cluster_ids.update(proteome_ALO.cluster_ids_by_cluster_type_by_cluster_status['present']['shared']) - sample_size = idx + 1 - if not sample_size in rarefaction_by_samplesize_by_level_by_attribute[attribute][level]: - rarefaction_by_samplesize_by_level_by_attribute[attribute][level][sample_size] = [] - rarefaction_by_samplesize_by_level_by_attribute[attribute][level][sample_size].append(len(seen_cluster_ids)) - - for attribute in rarefaction_by_samplesize_by_level_by_attribute: - rarefaction_plot_f = join(dataFactory.dirs[attribute], "%s.rarefaction_curve.%s" % (attribute, inputObj.plot_format)) - rarefaction_by_samplesize_by_level = rarefaction_by_samplesize_by_level_by_attribute[attribute] - f, ax = plt.subplots(figsize=inputObj.plot_size) - ax.set_facecolor('white') - max_number_of_samples = 0 - for idx, level in enumerate(rarefaction_by_samplesize_by_level): - number_of_samples = len(rarefaction_by_samplesize_by_level[level]) - if number_of_samples > max_number_of_samples: - max_number_of_samples = number_of_samples - colour = plt.cm.Paired(idx/len(rarefaction_by_samplesize_by_level)) - x_values = [] - y_mins = [] - y_maxs = [] - median_y_values = [] - median_x_values = [] - for x, y_reps in rarefaction_by_samplesize_by_level[level].items(): - x_values.append(x) - y_mins.append(min(y_reps)) - y_maxs.append(max(y_reps)) - median_y_values.append(median(y_reps)) - median_x_values.append(x) - x_array = np.array(x_values) - y_mins_array = np.array(y_mins) - y_maxs_array = np.array(y_maxs) - ax.plot(median_x_values, median_y_values, '-', color=colour, label=level) - ax.fill_between(x_array, y_mins_array, y_maxs_array, color=colour, alpha=0.5) - ax.set_xlim([0, max_number_of_samples + 1]) - ax.set_ylabel("Count of non-singleton clusters", fontsize=inputObj.plot_font_size) - ax.set_xlabel("Sampled proteomes", fontsize=inputObj.plot_font_size) - - ax.grid(True, linewidth=1, which="major", color="lightgrey") - legend = ax.legend(ncol=1, numpoints=1, loc="lower right", frameon=True, fontsize=inputObj.plot_font_size) - legend.get_frame().set_facecolor('white') - print "[STATUS]\t- Plotting %s" % (rarefaction_plot_f) - f.savefig(rarefaction_plot_f, 
format=inputObj.plot_format) - plt.close() - -######################################################################## -# CLASS : AttributeLevelObj -######################################################################## - -class AttributeLevelObj(): - ''' - Definitions: - 'shared' : shared between one ALO and others - 'singleton' : cardinality of 1 ('specific', but separate) - 'specific' : only present within one ALO - ''' - def __init__(self, attribute, level, proteomes): - self.attribute = attribute # string - self.level = level # string - self.proteomes_list = list(proteomes) # - self.proteomes = set(proteomes) # frozenset(), used for checking whether cluster and ALO intersect - self.proteome_count = len(proteomes) # int - - self.cluster_ids_by_cluster_type_by_cluster_status = {'present' : {'singleton' : [], 'specific' : [], 'shared' : []}, - 'absent' : {'singleton' : [], 'specific' : [], 'shared' : []}} # sums up to cluster_count - self.protein_ids_by_cluster_type = {'singleton' : [], 'specific' : [], 'shared' : []} # list of lists - self.protein_span_by_cluster_type = {'singleton' : [], 'specific' : [], 'shared' : []} - self.clusters_by_cluster_cardinality_by_cluster_type = {'shared' : {'true' : [], 'fuzzy' : []}, 'specific' : {'true' : [], 'fuzzy' : []}} - - self.cluster_status_by_cluster_id = {} - self.cluster_type_by_cluster_id = {} - - self.cluster_mwu_pvalue_by_cluster_id = {} - self.cluster_mwu_log2_mean_by_cluster_id = {} - self.cluster_mean_ALO_count_by_cluster_id = {} - self.cluster_mean_non_ALO_count_by_cluster_id = {} - - self.domain_counter_by_domain_source_by_cluster_type = None - self.protein_with_domain_count_by_domain_source_by_cluster_type = None - - self.protein_length_stats_by_cluster_id = {} - self.protein_count_by_cluster_id = {} - - self.rarefaction_data = {} # repetition : number of clusters - - ############################### - ### add_clusterObj - ############################### - - def analyse_domains(self): - print "[STATUS] - Analysing domains (this may take a while) ... 
" - domain_counter_by_domain_source_by_cluster_type = {'singleton' : {}, 'specific' : {}, 'shared' : {}} - protein_with_domain_count_by_domain_source_by_cluster_type = {'singleton' : {}, 'specific' : {}, 'shared' : {}} - get_proteinObj_by_protein_id = proteinCollection.proteinObjs_by_protein_id.get - for cluster_type in self.protein_ids_by_cluster_type: - for domain_source in proteinCollection.domain_sources: - if not domain_source in domain_counter_by_domain_source_by_cluster_type[cluster_type]: - domain_counter_by_domain_source_by_cluster_type[cluster_type][domain_source] = Counter() - protein_with_domain_count_by_domain_source_by_cluster_type[cluster_type][domain_source] = 0 - for protein_id in self.protein_ids_by_cluster_type[cluster_type]: - proteinObj = get_proteinObj_by_protein_id(protein_id) - if domain_source in proteinObj.domain_counter_by_domain_source: - domain_counter = proteinObj.domain_counter_by_domain_source[domain_source] - if domain_counter: - domain_counter_by_domain_source_by_cluster_type[cluster_type][domain_source] += domain_counter - protein_with_domain_count_by_domain_source_by_cluster_type[cluster_type][domain_source] += 1 - if proteinObj.go_terms: - domain_counter = Counter(list(proteinObj.go_terms)) - domain_counter_by_domain_source_by_cluster_type[cluster_type]["GO"] += domain_counter - protein_with_domain_count_by_domain_source_by_cluster_type[cluster_type]["GO"] += 1 - - domain_counter_by_domain_source_by_cluster_type['total'] = {} - protein_with_domain_count_by_domain_source_by_cluster_type['total'] = {} - for domain_source in proteinCollection.domain_sources: - domain_counter_by_domain_source_by_cluster_type['total'][domain_source] = Counter() - protein_with_domain_count_by_domain_source_by_cluster_type['total'][domain_source] = 0 - - for cluster_type in domain_counter_by_domain_source_by_cluster_type: - for domain_source in proteinCollection.domain_sources: - domain_counter_by_domain_source_by_cluster_type['total'][domain_source] += domain_counter_by_domain_source_by_cluster_type[cluster_type][domain_source] - protein_with_domain_count_by_domain_source_by_cluster_type['total'][domain_source] += protein_with_domain_count_by_domain_source_by_cluster_type[cluster_type][domain_source] - self.domain_counter_by_domain_source_by_cluster_type = domain_counter_by_domain_source_by_cluster_type - self.protein_with_domain_count_by_domain_source_by_cluster_type = protein_with_domain_count_by_domain_source_by_cluster_type - #print self.level - #for cluster_type in self.domain_counter_by_domain_source_by_cluster_type: - # print cluster_type - # print self.domain_counter_by_domain_source_by_cluster_type[cluster_type] - # print self.protein_with_domain_count_by_domain_source_by_cluster_type[cluster_type] - - def add_clusterObj(self, clusterObj, attribute_cluster_type, ALO_cluster_status, ALO_cluster_cardinality, ALO_protein_ids_in_cluster, ALO_protein_length_stats, mwu_pvalue, mwu_log2_mean, mean_ALO_count, mean_non_ALO_count): - self.cluster_ids_by_cluster_type_by_cluster_status[ALO_cluster_status][attribute_cluster_type].append(clusterObj.cluster_id) - self.cluster_status_by_cluster_id[clusterObj.cluster_id] = ALO_cluster_status - self.cluster_type_by_cluster_id[clusterObj.cluster_id] = attribute_cluster_type - self.protein_length_stats_by_cluster_id[clusterObj.cluster_id] = ALO_protein_length_stats - - self.protein_count_by_cluster_id[clusterObj.cluster_id] = len(ALO_protein_ids_in_cluster) - if ALO_cluster_status == 'present': - for ALO_protein_id in 
ALO_protein_ids_in_cluster: - self.protein_ids_by_cluster_type[attribute_cluster_type].append(ALO_protein_id) - self.protein_span_by_cluster_type[attribute_cluster_type].append(ALO_protein_length_stats['sum']) - if not attribute_cluster_type == 'singleton': - if ALO_cluster_cardinality: - self.clusters_by_cluster_cardinality_by_cluster_type[attribute_cluster_type][ALO_cluster_cardinality].append(clusterObj.cluster_id) - - self.cluster_mwu_pvalue_by_cluster_id[clusterObj.cluster_id] = mwu_pvalue - self.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id] = mwu_log2_mean - self.cluster_mean_ALO_count_by_cluster_id[clusterObj.cluster_id] = mean_ALO_count - self.cluster_mean_non_ALO_count_by_cluster_id[clusterObj.cluster_id] = mean_non_ALO_count - - ############################### - ### get_protein_count_by_cluster_type - ############################### - - def get_protein_count_by_cluster_type(self, cluster_type): - if cluster_type == 'total': - return sum([len(protein_ids) for cluster_type, protein_ids in self.protein_ids_by_cluster_type.items()]) - else: - return len(self.protein_ids_by_cluster_type[cluster_type]) - - ############################### - ### get_protein_span_by_cluster_type - ############################### - - def get_protein_span_by_cluster_type(self, cluster_type): - span = 0 - if cluster_type == 'total': - span = sum([sum(protein_ids) for cluster_type, protein_ids in self.protein_span_by_cluster_type.items()]) - else: - span = sum(self.protein_span_by_cluster_type[cluster_type]) - return span - - ############################### - ### get_cluster_count_by_cluster_status_by_cluster_type - ############################### - - def get_cluster_count_by_cluster_status_by_cluster_type(self, cluster_status, cluster_type): - if cluster_type == 'total': - return sum([len(cluster_ids) for cluster_type, cluster_ids in self.cluster_ids_by_cluster_type_by_cluster_status[cluster_status].items()]) - else: - return len(self.cluster_ids_by_cluster_type_by_cluster_status[cluster_status][cluster_type]) - - def get_cluster_count_by_cluster_cardinality_by_cluster_type(self, cluster_type, cluster_cardinality): - return len(self.clusters_by_cluster_cardinality_by_cluster_type[cluster_type][cluster_cardinality]) - - def get_proteomes(self): - return ", ".join(sorted([str(proteome_id) for proteome_id in self.proteomes])) - -######################################################################## -# CLASS : ProteinCollection -######################################################################## - -class ProteinCollection(): - def __init__(self, proteinObjs): - self.proteinObjs = proteinObjs - self.proteinObjs_by_protein_id = {proteinObj.protein_id : proteinObj for proteinObj in proteinObjs} - self.protein_count = len(proteinObjs) - self.domain_sources = [] - self.fastas_parsed = False - self.functional_annotation_parsed = False - self.domain_description_by_domain_id_by_domain_source = None - - ############################### - ### add_domainObjs_to_proteinObjs - ############################### - - def add_annotation_to_proteinObj(self, domain_protein_id, domain_counter_by_domain_source, go_terms): - proteinObj = self.proteinObjs_by_protein_id.get(domain_protein_id, None) - if proteinObj: - proteinObj.domain_counter_by_domain_source = domain_counter_by_domain_source - signalp_notm = proteinObj.domain_counter_by_domain_source.get("SignalP_EUK", None) - if signalp_notm and "SignalP-noTM" in signalp_notm: - proteinObj.secreted = True - proteinObj.go_terms = go_terms - - def 
get_protein_length_stats(self, protein_ids): - protein_length_stats = {'sum' : 0, 'mean' : 0.0, 'median' : 0, 'sd': 0.0} - if protein_ids and self.fastas_parsed: - protein_lengths = [self.proteinObjs_by_protein_id[protein_id].length for protein_id in protein_ids] - protein_length_stats['sum'] = sum(protein_lengths) - protein_length_stats['mean'] = mean(protein_lengths) - protein_length_stats['median'] = median(protein_lengths) - protein_length_stats['sd'] = sd(protein_lengths) - return protein_length_stats -######################################################################## -# CLASS : ProteinObj -######################################################################## - -class ProteinObj(): - def __init__(self, protein_id, proteome_id, species_id, sequence_id): - self.protein_id = protein_id - self.proteome_id = proteome_id - self.species_id = species_id - self.sequence_id = sequence_id - self.length = None - self.clustered = False - - self.secreted = False - - self.domain_counter_by_domain_source = {} - self.go_terms = [] - - - ############################### - ### add_length - ############################### - - def add_length(self, length): - self.length = length - - ############################### - ### get_domain_list - ############################### - - def get_domain_list(self): - return sorted(self.domain_list, key=lambda x: x.domain_start, reverse=False) - - - ############################### - ### compute_domain_count_by_domain_id_by_domain_source - ############################### - - def compute_domain_count_by_domain_id_by_domain_source(self): - if self.domain_list: - domain_ids_by_domain_source = {domainObj.domain_source : [] for domainObj in self.domain_list} - for domainObj in self.domain_list: - domain_ids_by_domain_source[domainObj.domain_source].append(domainObj.domain_id) - self.domain_count_by_domain_id_by_domain_source = {domain_source : Counter(domain_ids_by_domain_source[domain_source]) for domain_source in domain_ids_by_domain_source} - else: - self.domain_count_by_domain_id_by_domain_source = Counter() - -######################################################################## -# CLASS : ClusterCollection -######################################################################## - -class ClusterCollection(): - def __init__(self, clusterObjs, inferred_singletons_count, functional_annotation_parsed, fastas_parsed, domain_sources): - self.clusterObjs = clusterObjs - self.clusterObjs_by_cluster_id = {clusterObj.cluster_id: clusterObj for clusterObj in clusterObjs} # only for testing - self.cluster_count = len(clusterObjs) - self.inferred_singletons_count = inferred_singletons_count - self.functional_annotation_parsed = functional_annotation_parsed - self.fastas_parsed = fastas_parsed - #self.domain_sources = [domain_source for domain_source in domain_sources if not domain_source == "GO"] - self.domain_sources = domain_sources -######################################################################## -# CLASS : ClusterObj -######################################################################## - -class ClusterObj(): - def __init__(self, cluster_id, protein_ids): - self.cluster_id = cluster_id - self.protein_ids = set(protein_ids) - self.protein_count = len(protein_ids) - try: - self.proteomes_by_protein_id = {protein_id : proteinCollection.proteinObjs_by_protein_id[protein_id].proteome_id for protein_id in protein_ids} - except KeyError as e: - sys.exit("[ERROR] - Protein %s in clustering belongs to proteomes that are not present in the SpeciesClassification-file. 
Please add those proteoemes or recluster by omitting these proteomes." % (e.args[0])) - - self.proteome_ids_list = self.proteomes_by_protein_id.values() - self.protein_count_by_proteome_id = Counter(self.proteome_ids_list) - self.proteome_ids = frozenset(self.proteome_ids_list) - self.proteome_count = len(self.proteome_ids) - self.singleton = False if self.protein_count > 1 else True - self.apomorphy = False if self.proteome_count > 1 else True - self.protein_ids_by_proteome_id = self.compute_protein_ids_by_proteome() - - # DOMAINS - self.go_terms = self.compute_go_terms() - self.domain_counter_by_domain_source = self.compute_domain_counter_by_domain_source() - self.secreted_cluster_coverage = self.compute_secreted_cluster_coverage() - self.domain_entropy_by_domain_source = self.compute_domain_entropy_by_domain_source() - self.protein_length_stats = self.compute_protein_length_stats() - - self.implicit_protein_ids_by_proteome_id_by_level_by_attribute = None - self.proteome_ids_by_level_by_attribute = None # used for checking status - self.proteome_coverage_by_level_by_attribute = None - self.protein_counts_of_proteomes_by_level_by_attribute = None # non-zero-counts - self.protein_median = None - self.cluster_type_by_attribute = None - - ############################### - ### compute_protein_ids_by_proteome - ############################### - - def compute_protein_ids_by_proteome(self): - protein_ids_by_proteome_id = defaultdict(set) - for protein_id, proteome_id in self.proteomes_by_protein_id.items(): - protein_ids_by_proteome_id[proteome_id].add(protein_id) - return protein_ids_by_proteome_id - - def compute_secreted_cluster_coverage(self): - secreted = 0 - for protein_id in self.protein_ids: - if proteinCollection.proteinObjs_by_protein_id[protein_id].secreted: - secreted += 1 - return secreted/self.protein_count - - def compute_protein_length_stats(self): - protein_lengths = [proteinCollection.proteinObjs_by_protein_id[protein_id].length for protein_id in self.protein_ids] - if all(protein_lengths): - protein_length_stats = {} - protein_length_stats['mean'] = mean(protein_lengths) - protein_length_stats['median'] = median(protein_lengths) - protein_length_stats['sd'] = sd(protein_lengths) - return protein_length_stats - - def compute_domain_counter_by_domain_source(self): - cluster_domain_counter_by_domain_source = {} - for protein_id in self.protein_ids: - protein_domain_counter_by_domain_source = proteinCollection.proteinObjs_by_protein_id[protein_id].domain_counter_by_domain_source - if protein_domain_counter_by_domain_source: - for domain_source, protein_domain_counter in protein_domain_counter_by_domain_source.items(): - if not domain_source in cluster_domain_counter_by_domain_source: - cluster_domain_counter_by_domain_source[domain_source] = Counter() - cluster_domain_counter_by_domain_source[domain_source] += protein_domain_counter - return cluster_domain_counter_by_domain_source - - def compute_domain_entropy_by_domain_source(self): - domain_entropy_by_domain_source = {} - for domain_source, domain_counter in self.domain_counter_by_domain_source.items(): - total_count = len([domain for domain in domain_counter.elements()]) - domain_entropy = -sum([i/total_count * log(i/total_count, 2) for i in domain_counter.values()]) - if str(domain_entropy) == "-0.0": - domain_entropy_by_domain_source[domain_source] = 0.0 - else: - domain_entropy_by_domain_source[domain_source] = domain_entropy - return domain_entropy_by_domain_source - - def compute_go_terms(self): - go_terms = set() - for 
protein_id in self.protein_ids: - if proteinCollection.proteinObjs_by_protein_id[protein_id].go_terms: - for go_term in proteinCollection.proteinObjs_by_protein_id[protein_id].go_terms: - go_terms.add(go_term) - return go_terms - - -class InputObj(): - def __init__(self, args): - # reserved attributes - self.ATTRIBUTE_RESERVED = ['IDX', 'OUT', "TAXID"] - # input files - self.cluster_f = args['--cluster_file'] - self.config_f = args['--config_file'] - self.sequence_ids_f = args['--sequence_ids_file'] - self.species_ids_f = args['--species_ids_file'] - self.tree_f = args['--tree_file'] - self.render_tree = args['--plot_tree'] - self.nodesdb_f = args['--nodesdb'] - self.functional_annotation_f = args['--functional_annotation'] - self.pfam_mapping = True - self.pfam_mapping_f = None - self.ipr_mapping = True - self.ipr_mapping_f = None - self.go_mapping_f = None - self.check_input_files() - self.check_that_ete_can_plot() - # FASTA files - self.fasta_dir = args['--fasta_dir'] - self.check_if_fasta_dir_and_species_ids_f() - # outprefix - self.outprefix = args['--outprefix'] - # proteins - self.infer_singletons = args['--infer_singletons'] - # values: fuzzyness - self.fuzzy_count = None - self.check_fuzzy_count(args['--target_count']) - self.fuzzy_fraction = None - self.check_fuzzy_fraction(args['--target_fraction']) - self.fuzzy_min = None - self.fuzzy_max = None - self.check_fuzzy_min_max(args['--min'], args['--max']) - self.fuzzy_range = set([x for x in xrange(self.fuzzy_min, self.fuzzy_max+1) if not x == self.fuzzy_count]) - # values: rarefaction - self.repetitions = int(args['--repetitions']) + 1 - self.check_repetitions() - self.min_proteomes = int(args['--min_proteomes']) - self.check_min_proteomes() - # values: plots - self.plot_format = args['--plotfmt'] - self.check_plot_format() - self.plot_size = tuple(int(x) for x in args['--plotsize'].split(",")) - self.plot_font_size = int(args['--fontsize']) - # taxrank - self.taxranks = [taxrank.replace(" ","") for taxrank in args['--taxranks'].split(",")] - self.check_taxranks() - - def check_plot_format(self): - SUPPORTED_PLOT_FORMATS = set(['png', 'pdf', 'svg']) - if self.plot_format not in SUPPORTED_PLOT_FORMATS: - sys.exit("[ERROR] : Plot format %s not part of supported plot formats (%s)" % (self.plot_format, SUPPORTED_PLOT_FORMATS)) - - def check_repetitions(self): - if not self.repetitions > 0: - sys.exit("[ERROR] : Please specify a positive integer for the number of repetitions for the rarefaction curves") - - def check_min_proteomes(self): - if not self.min_proteomes > 0: - sys.exit("[ERROR] : Please specify a positive integer for the minimum number of proteomes to consider for computations") - - def check_taxranks(self): - SUPPORTED_TAXRANKS = set(['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'superfamily', 'family', 'subfamily', 'genus', 'species']) - unsupported_taxranks = [] - for taxrank in self.taxranks: - if not taxrank in SUPPORTED_TAXRANKS: - unsupported_taxranks.append(taxrank) - if unsupported_taxranks: - sys.exit("[ERROR] : Taxrank(s) %s not part of supported Taxranks (%s)" % (",".join(sorted(unsupported_taxranks)), ",".join(sorted(SUPPORTED_TAXRANKS)))) - - def check_if_fasta_dir_and_species_ids_f(self): - if self.fasta_dir: - if not self.species_ids_f: - sys.exit("[ERROR] : You have provided a FASTA-dir using '--fasta-dir'. 
Please provide a Species-ID file using ('--species_ids_file').") - - def check_input_files(self): - check_file(self.sequence_ids_f) - check_file(self.species_ids_f) - check_file(self.config_f) - check_file(self.functional_annotation_f) - check_file(self.sequence_ids_f) - check_file(self.tree_f) - check_file(self.nodesdb_f) - if self.pfam_mapping: - pfam_mapping_f = join(dirname(realpath(__file__)), "../data/Pfam-A.clans.tsv.gz") - if not isfile(pfam_mapping_f): - print "[WARN] - PFAM-ID file 'data/Pfam-A.clans.tsv.gz' not found." - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz" - retrieve_ftp(remote_f, pfam_mapping_f) - self.pfam_mapping_f = pfam_mapping_f - if self.ipr_mapping: - ipr_mapping_f = join(dirname(realpath(__file__)), "../data/entry.list") - if not isfile(ipr_mapping_f): - print "[WARN] - IPR-ID file 'data/entry.list' not found." - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/interpro/entry.list" - retrieve_ftp(remote_f, ipr_mapping_f) - self.ipr_mapping_f = ipr_mapping_f - go_mapping_f = join(dirname(realpath(__file__)), "../data/interpro2go") - if not isfile(go_mapping_f): - print "[WARN] - GO-ID file, but 'data/interpro2go' not found." - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro2go" - retrieve_ftp(remote_f, go_mapping_f) - self.go_mapping_f = go_mapping_f - - def check_that_ete_can_plot(self): - if self.render_tree: - try: - import PyQt4 - except ImportError: - sys.exit("[ERROR] : Plotting of trees requires additional ETE3 dependencies. PyQt4 is not installed. Please install PyQt4") - if 'DISPLAY' in environ: - print "[STATUS] - X server seems to be present..." - test_tree_f = join(getcwd(), "this_is_a_test_tree.pdf") - t = ete3.Tree( "((a,b),c);" ) - try: - a = t.render(test_tree_f, w=40, units="mm") - print "[STATUS] - ETE can connect to X server (X11). Tree will be rendered." - except: - self.render_tree = False - print "[WARN] - ETE cannot connect to X server (X11). No tree will be rendered." - remove(test_tree_f) - else: - print "[STATUS] - No X server found. ETE can't render the tree. Consider using \'xvfb-run\' ..." - self.render_tree = False - - def check_fuzzy_count(self, target_count): - if int(target_count) > 0: - self.fuzzy_count = int(target_count) - else: - sys.exit("[ERROR] : --target_count %s must be greater than 0" % (target_count)) - - def check_fuzzy_fraction(self, fuzzyness): - if 0 <= float(fuzzyness) <= 1: - self.fuzzy_fraction = float(fuzzyness) - else: - sys.exit("[ERROR] : --target_fraction %s is not between 0.0 and 1.0" (fuzzyness)) - - def check_fuzzy_min_max(self, fuzzy_min, fuzzy_max): - if int(fuzzy_min) <= int(fuzzy_max): - self.fuzzy_min = int(args['--min']) - self.fuzzy_max = int(args['--max']) - else: - sys.exit("[ERROR] : --min %s is greater than --max %s" (fuzzy_min, fuzzy_max)) - - -def welcome_screen(): - screen = "\ - _ _ _ _______ _ \n\ - | | / |_) (_______|_) \n\ - | | / / _ ____ _____ _ ____ \n\ - | |< < | | _ \| ___) | | _ \ \n\ - | | \ \| | | | | | | | | | | \n\ - |_| \_)_|_| |_|_| |_|_| |_| v%s\n\ - " % (__version__) - print screen - - -if __name__ == "__main__": - __version__ = "0.9" - welcome_screen() - args = docopt(__doc__) - inputObj = InputObj(args) - if inputObj.tree_f: - try: - import ete3 - except ImportError: - sys.exit("[ERROR] : Module \'ete3\' was not found. Please install \'ete3\' using \'pip install ete3\'\n/tPlotting of trees requires additional dependencies:\n\t- PyQt4\n\t") - # Input sane ... now we start - print "[STATUS] - Starting analysis ..." 
- overall_start = time.time() - # Initialise - aloCollection = None - proteinCollection = None - domainCollection = None - clusterCollection = None - # Build dataFactory - dataFactory = DataFactory() - # Build Collections - aloCollection = dataFactory.build_AloCollection() - proteinCollection = dataFactory.build_ProteinCollection(inputObj) - clusterCollection = dataFactory.build_ClusterCollection(inputObj) - dataFactory.setup_dirs(inputObj) - aloCollection.analyse_clusters() - # aloCollection.analyse_domains() # takes prohibitely long, implement faster! - aloCollection.write_tree() - aloCollection.compute_rarefaction_data() - dataFactory.write_output() - - overall_end = time.time() - overall_elapsed = overall_end - overall_start - print "[STATUS] - Took %ss to run kinfin." % (overall_elapsed) - del aloCollection - del proteinCollection - del domainCollection - del clusterCollection diff --git a/dist/kinfin-0.9-py2.7.egg b/dist/kinfin-0.9-py2.7.egg deleted file mode 100644 index a4cb260..0000000 Binary files a/dist/kinfin-0.9-py2.7.egg and /dev/null differ diff --git a/example/curl_examples.md b/example/curl_examples.md new file mode 100644 index 0000000..ac7fb70 --- /dev/null +++ b/example/curl_examples.md @@ -0,0 +1,70 @@ +### 1. Initialize the Analysis Process + +```bash +curl -X POST "http://127.0.0.1:8000/kinfin/init" \ +-H "Content-Type: application/json" \ +-d '{"config": [{ "taxon": "BGLAB", "label1": "red" },{ "taxon": "CVIRG", "label1": "red" },{ "taxon": "DPOLY", "label1": "red" },{ "taxon": "GAEGI", "label1": "red" },{ "taxon": "LJAPO", "label1": "red" },{ "taxon": "LSAXA", "label1": "red" },{ "taxon": "MANGU", "label1": "red" },{ "taxon": "MAREN", "label1": "red" },{ "taxon": "MGIGA", "label1": "red" },{ "taxon": "MMERC", "label1": "red" },{ "taxon": "MTROS", "label1": "blue" },{ "taxon": "OBIMA", "label1": "blue" },{ "taxon": "OEDUL", "label1": "blue" },{ "taxon": "OSINE", "label1": "blue" },{ "taxon": "OVULG", "label1": "blue" },{ "taxon": "PCANA", "label1": "blue" },{ "taxon": "PMAXI", "label1": "blue" },{ "taxon": "PVULG", "label1": "blue" },{ "taxon": "TGRAN", "label1": "blue" }]}' | jq +``` + +### 2. Get Run Status + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/status" \ +-H "x-session-id: " | jq +``` + +### 3. Get Run Summary + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/run-summary" \ +-H "x-session-id: " | jq +``` + +### 4. Get Available Attributes and Taxon Sets + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/available-attributes-taxonsets" \ +-H "x-session-id: " | jq +``` + +### 5. Get Counts by Taxon + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/counts-by-taxon" \ +-H "x-session-id: " | jq +``` + +### 6. Get Cluster Summary + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/cluster-summary/label1" \ +-H "x-session-id: " | jq +``` + +### 7. Get Attribute Summary + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/attribute-summary/label1" \ +-H "x-session-id: " | jq +``` + +### 8. Get Cluster Metrics + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/cluster-metrics/label1/red" \ +-H "x-session-id: " | jq +``` + +### 9. Get Pairwise Analysis + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/pairwise-analysis/label1" \ +-H "x-session-id: " | jq +``` + +### 10. 
Get Plot + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/plot/" \ +-H "x-session-id: " -o ".png" +``` diff --git a/example/taxon_idx_mapping.json b/example/taxon_idx_mapping.json new file mode 100644 index 0000000..0812d0b --- /dev/null +++ b/example/taxon_idx_mapping.json @@ -0,0 +1,8 @@ +{ + "A": "0", + "B": "1", + "C": "2", + "D": "3", + "E": "4", + "F": "5" +} diff --git a/install b/install deleted file mode 100755 index b4ebe09..0000000 --- a/install +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env bash -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -echo "[+] Checking dependencies..." -wget=$(which wget) -if [ -x "$wget" ] ; then - echo " [+] [wget] $wget"; else - echo " [X] [wget] ... please install wget"; -fi -gunzip=$(which gunzip) -if [ -x "$gunzip" ] ; then - echo " [+] [gunzip] $gunzip"; else - echo " [X] [gunzip] ... please install gunzip"; -fi - -# Download files -echo "[+] Checking data files..." -pfam_dest=$DIR/data/Pfam-A.clans.tsv.gz -ipr_dest=$DIR/data/entry.list -go_dest=$DIR/data/interpro2go -nodesdbgz=$DIR/data/nodesdb.gz -nodesdb=$DIR/data/nodesdb.txt -if [ -f "$nodesdb" ]; then - echo " [+] $nodesdb" -else - if [ -f "$nodesdbgz" ]; then - echo -n " [+] Extracting $nodesdbgz ..." - $gunzip -c $nodesdbgz > $nodesdb - if [ $? -eq 0 ]; then - echo "done." - else - echo "FAIL... Please download kinfin again." - exit 1 - fi - else - echo "[-] $nodesdbgz not found... Please download kinfin again." - exit 1 - fi -fi - -if [ -f "$pfam_dest" ]; then - echo " [+] $pfam_dest" -else - echo -n " [-] $pfam_dest. Downloading ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz ..." - $wget -qN ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz -P $DIR/data/ - if [ $? -eq 0 ]; then - echo "done." - else - echo "FAIL." - fi -fi - -if [ -f "$ipr_dest" ]; then - echo " [+] $ipr_dest" -else - echo -n " [-] $ipr_dest. Downloading ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list ..." - $wget -qN ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list -P $DIR/data/ - if [ $? -eq 0 ]; then - echo "done." - else - echo "FAIL." - fi -fi - -if [ -f "$go_dest" ]; then - echo " [+] $go_dest" -else - echo -n " [-] $go_dest. Downloading ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/interpro2go ... " - $wget -qN ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/interpro2go -P $DIR/data/ - if [ $? -eq 0 ]; then - echo "done." - else - echo "FAIL." - fi -fi - -# Create executable -echo "[+] Creating executable..." -echo '#!/usr/bin/env bash -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -$DIR/src/kinfin.py "$@"' > $DIR/kinfin && chmod +x $DIR/kinfin - -# Done -echo "[+] Kinfin was installed. Please run ./kinfin" diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..58c2b9c --- /dev/null +++ b/install.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +# logging function +log() { + local GREEN='\033[0;32m' + local YELLOW='\033[0;33m' + local RED='\033[0;31m' + local NO_COLOR='\033[0m' + + local level=$1 + local message=$2 + + case $level in + INFO) + echo -e "[${NO_COLOR}INFO${NO_COLOR}] - $message" + ;; + SUCCESS) + echo -e "[${GREEN}SUCCESS${NO_COLOR}] - $message" + ;; + ERROR) + echo -e "[${RED}ERROR${NO_COLOR}] - $message" >&2 + ;; + *) + echo "Invalid log level: $level" + ;; + esac +} + +# Check dependencies exist +check_dependencies() { + log INFO "Checking dependencies..." 
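+    # wget fetches the Pfam/InterPro data files and gunzip extracts nodesdb.gz; both must be on PATH.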
+ + local dependencies=("wget" "gunzip") + local missing_dependencies=() + + for dependency in "${dependencies[@]}"; do + local item=$(command -v "$dependency") + if [ ! -x "$item" ]; then + missing_dependencies+=("$dependency") + fi + done + + if [ ${#missing_dependencies[@]} -gt 0 ]; then + log ERROR "Missing dependencies: ${missing_dependencies[*]}. Please install them." + exit 1 + else + for dependency in "${dependencies[@]}"; do + log SUCCESS "$dependency is installed." + done + log SUCCESS "All dependencies are installed." + return 0 + fi +} + +# Function to download a file +download_file() { + local url=$1 + local filename=$2 + + log INFO "Downloading $filename from $url" + $(which wget) -np -nd -qN --show-progress "$url" -P "$DIR/data/" + + if [ $? -eq 0 ]; then + log SUCCESS "Downloaded $filename" + else + log ERROR "Failed to download $filename from $url" + exit 1 + fi +} + +# Extract .gz files +extract_gzip() { + local gz_file=$1 + local dest=$2 + + log INFO "Extracting $gz_file..." + + $(which gunzip) -c "$gz_file" > "$dest" + + if [ $? -eq 0 ]; then + log SUCCESS "Extracted $gz_file at $dest" + else + log ERROR "Failed to extract $gz_file. Please download kinfin again." + exit 1 + fi +} + + + +main() { + # Set working directory + DIR="$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + + check_dependencies + + log INFO "Checking input data files..." + + local pfam_dest="$DIR/data/Pfam-A.clans.tsv.gz" + local ipr_dest="$DIR/data/entry.list" + local go_dest="$DIR/data/interpro2go" + local nodesdbgz="$DIR/data/nodesdb.gz" + local nodesdb="$DIR/data/nodesdb.txt" + + if [ ! -f "$nodesdb" ]; then + if [ -f "$nodesdbgz" ]; then + extract_gzip "$nodesdbgz" "$nodesdb" + else + log ERROR "$nodesdbgz not found. Please download kinfin again." + exit 1 + fi + else + log SUCCESS "$nodesdb is already present." + fi + + if [ ! -f "$pfam_dest" ]; then + download_file "ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz" "Pfam-A.clans.tsv.gz" + else + log SUCCESS "Pfam-A.clans.tsv.gz is already present." + fi + + if [ ! -f "$ipr_dest" ]; then + download_file "ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list" "entry.list" + else + log SUCCESS "entry.list is already present." + fi + + if [ ! -f "$go_dest" ]; then + download_file "ftp.ebi.ac.uk/pub/databases/interpro/current_release/interpro2go" "interpro2go" + else + log SUCCESS "interpro2go is already present." + fi + + log SUCCESS "All required files downloaded." + + # Create executable + log INFO "Creating executable..." + echo -e '#!/usr/bin/env bash\nDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"\n$DIR/src/kinfin.py "$@"' > $DIR/kinfin && chmod +x $DIR/kinfin + + # Done + log SUCCESS "Kinfin was installed. 
Please run ./kinfin --help" +} + +main \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..14bfacd --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +-r requirements.txt +fastapi==0.111.0 +pytest==8.2.2 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5a8fe76..7bcfdc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ -scipy==1.11.1 -matplotlib==2.0.2 +scipy==1.13.1 +matplotlib==3.9.0 docopt==0.6.2 -networkx==1.11 -powerlaw==1.4.1 -ete3==3.0.0b35 +networkx==3.3 +powerlaw==1.5 +ete3==3.1.3 +fastapi==0.111.0 \ No newline at end of file diff --git a/scripts/get_protein_ids_from_cluster.py b/scripts/get_protein_ids_from_cluster.py index 4aca41f..17a006c 100755 --- a/scripts/get_protein_ids_from_cluster.py +++ b/scripts/get_protein_ids_from_cluster.py @@ -71,8 +71,8 @@ def parse_groups(group_f): def write_output(output, outprefix): - headers_found = set([k for k, v in headers.iteritems() if v]) - clusters_found = set([k for k, v in clusters.iteritems() if v]) + headers_found = set([k for k, v in headers.items() if v]) + clusters_found = set([k for k, v in clusters.items() if v]) if headers: print("[+] Found %s of headers ..." % "{:.0%}".format(len(headers_found) / len(headers))) if clusters: diff --git a/setup.py b/setup.py index 077cbed..29f9f71 100644 --- a/setup.py +++ b/setup.py @@ -1,37 +1,37 @@ import pip from setuptools import setup, find_packages -__version__ = '1.1' +__version__ = "1.1" # Get the long description from the README file -with open('README.md', 'r') as readme: +with open("README.md", "r") as readme: long_description = readme.read() # get the dependencies and installs -with open('requirements.txt', 'r') as requirements: +with open("requirements.txt", "r") as requirements: reqs = requirements.read().splitlines() setup( - name='kinfin', + name="kinfin", version=__version__, - description='Taxon-aware analysis of clustered protein data', + description="Taxon-aware analysis of clustered protein data", long_description=long_description, - url='https://github.com/DRL/kinfin', - download_url='https://github.com/DRL/kinfin/tarball/' + __version__, - license='GnuGPL3', + url="https://github.com/DRL/kinfin", + download_url="https://github.com/DRL/kinfin/tarball/" + __version__, + license="GnuGPL3", classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Programming Language :: Python :: 3', + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", ], - keywords='Comparative genomics', - packages=find_packages(exclude=['docs', 'tests*']), + keywords="Comparative genomics", + packages=find_packages(exclude=["docs", "tests*"]), include_package_data=True, - author='Dominik R Laetsch', + author="Dominik R Laetsch", entry_points={ - 'console_scripts': [ + "console_scripts": [ "kinfin=src.kinfin:main", - ], - }, - author_email='dominik.laetsch@gmail.com' + ], + }, + author_email="dominik.laetsch@gmail.com", ) diff --git a/src/api/__init__.py b/src/api/__init__.py new file mode 100644 index 0000000..e8891de --- /dev/null +++ b/src/api/__init__.py @@ -0,0 +1,49 @@ +from core.input import ServeArgs + + +def run_server( + args: ServeArgs, + nodesdb_f: str, + pfam_mapping_f: str, + ipr_mapping_f: str, + go_mapping_f: str, + cluster_f: str, + taxon_idx_mapping_file: str, + sequence_ids_f: str, +) -> None: + """ + Starts the uvicorn server + + Parameters: + - args [ServeArgs] : An 
object containing server configuration arguments, such as the port. + - nodesdb_f [str] : File path to the nodesDB file. + - pfam_mapping_f [str] : File path to the PFAM mapping file. + - ipr_mapping_f [str] : File path to the InterPro mapping file. + - go_mapping_f [str] : File path to the Gene Ontology mapping file. + - cluster_f [str] : File path to the clustering data file. + - taxon_idx_mapping_file [str] : File path to the taxon index mapping file. + - sequence_ids_f [str] : File path to the sequence IDs file. + """ + import uvicorn + from fastapi import FastAPI + + from api.endpoints import router + from api.sessions import query_manager + + query_manager.cluster_f = cluster_f + query_manager.sequence_ids_f = sequence_ids_f + query_manager.taxon_idx_mapping_file = taxon_idx_mapping_file + query_manager.nodesdb_f = nodesdb_f + query_manager.pfam_mapping_f = pfam_mapping_f + query_manager.ipr_mapping_f = ipr_mapping_f + query_manager.go_mapping_f = go_mapping_f + + app = FastAPI() + + @app.get("/") + def hello(): + return {"hi": "hello"} + + app.include_router(router) + + uvicorn.run(app=app, port=args.port) diff --git a/src/api/endpoints.py b/src/api/endpoints.py new file mode 100644 index 0000000..9b338ce --- /dev/null +++ b/src/api/endpoints.py @@ -0,0 +1,836 @@ +import asyncio +import json +import os +from datetime import datetime +from functools import wraps +from typing import Any, Dict, List, Optional + +from fastapi import APIRouter, Depends, HTTPException, Query, Request +from fastapi.responses import FileResponse, JSONResponse +from fastapi.security import APIKeyHeader +from pydantic import BaseModel + +from api.fileparsers import ( + parse_attribute_summary_file, + parse_cluster_metrics_file, + parse_cluster_summary_file, + parse_pairwise_file, + parse_taxon_counts_file, +) +from api.sessions import query_manager +from api.utils import ( + extract_attributes_and_taxon_sets, + read_status, + run_cli_command, + sort_and_paginate_result, +) + +RUN_SUMMARY_FILEPATH = "summary.json" +COUNTS_FILEPATH = "cluster_counts_by_taxon.txt" +CLUSTER_SUMMARY_FILENAME = "cluster_summary.txt" +ATTRIBUTE_METRICS_FILENAME = "attribute_metrics.txt" +CLUSTER_METRICS_FILENAME = "cluster_metrics.txt" +PAIRWISE_ANALYSIS_FILE = "pairwise_representation_test.txt" + + +class InputSchema(BaseModel): + config: List[Dict[str, str]] + + +class ResponseSchema(BaseModel): + status: str + message: str + query: Optional[str] = None + data: Optional[Any] = None + timestamp: str = datetime.now().isoformat() + error: Optional[str] = None + total_pages: Optional[int] = None + current_page: Optional[int] = None + entries_per_page: Optional[int] = None + + +# X-Session-ID header will be required to access plots/files later +header_scheme = APIKeyHeader(name="x-session-id") + +router = APIRouter() + + +def check_kinfin_session(func): + @wraps(func) + async def wrapper(request: Request, session_id: str, *args, **kwargs): + try: + result_dir = query_manager.get_session_dir(session_id) + if not result_dir: + return JSONResponse( + content=ResponseSchema( + status="error", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + status_code=428, + ) + + status_file = os.path.join(result_dir, f"{session_id}.status") + if not os.path.exists(status_file): + return JSONResponse( + content=ResponseSchema( + status="success", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + 
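+                    # 428 (Precondition Required): the client has to POST /kinfin/init and wait for the analysis before querying this endpoint.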
status_code=428, + ) + + run_status = read_status(status_file) + status = run_status.get("status") + + if status in ["running", "pending"]: + return JSONResponse( + content=ResponseSchema( + status="success", + message="Kinfin analysis is still running. Please wait for analysis to complete", + data={"is_complete": False}, + query=str(request.url), + ).model_dump(), + status_code=202, + ) + elif status == "error": + return JSONResponse( + content=ResponseSchema( + status="error", + message="Some error occurred during Kinfin analysis.", + error=run_status, + data={"session_terminated_due_to_error"}, + query=str(request.url), + ).model_dump(), + status_code=400, + ) + + return await func(request, session_id=session_id, *args, **kwargs) + + except Exception as e: + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + error=str(e), + query=str(request.url), + ).model_dump(), + status_code=500, + ) + + return wrapper + + +@router.post("/kinfin/init", response_model=ResponseSchema) +async def initialize( + input_data: InputSchema, + request: Request, +): + """ + Initialize the analysis process. + + Args: + input_data (InputSchema): The input data for analysis. + background_tasks (BackgroundTasks): FastAPI's BackgroundTasks for running analysis asynchronously. + + Returns: + JSONResponse: A response indicating that the analysis task has been queued. + + Raises: + HTTPException: If there's an error in the input data or during processing. + """ + try: + if not isinstance(input_data.config, list): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Data must be a list of dictionaries.", + error="Invalid input data format", + query=str(request.url), + ).model_dump(), + status_code=400, + ) + + if not all(isinstance(item, dict) for item in input_data.config): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Each item in data must be a dictionary.", + error="Invalid data format", + query=str(request.url), + ).model_dump(), + status_code=400, + ) + + session_id, result_dir = query_manager.get_or_create_session(input_data.config) + config_f = os.path.join(result_dir, "config.json") + + with open(config_f, "w") as file: + json.dump(input_data.config, file) + + command = [ + "python", + "src/main.py", + "analyse", + "-g", + query_manager.cluster_f, + "-c", + config_f, + "-s", + query_manager.sequence_ids_f, + "-m", + query_manager.taxon_idx_mapping_file, + "-o", + result_dir, + "--plot_format", + "png", + ] + + status_file = os.path.join(result_dir, f"{session_id}.status") + asyncio.create_task(run_cli_command(command, status_file)) + + response = ResponseSchema( + status="success", + message="Analysis task has been queued.", + data={"session_id": session_id}, + query=str(request.url), + ) + return JSONResponse( + content=response.model_dump(), + status_code=202, + ) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/status", response_model=ResponseSchema) +@check_kinfin_session +async def get_run_status(request: Request, session_id: str = Depends(header_scheme)): + try: + return JSONResponse( + content=ResponseSchema( + status="success", + message="Kinfin analysis is complete.", + data={"is_complete": True}, + query=str(request.url), + ).model_dump(), + status_code=200, + ) + + except Exception as e: + print(e) 
+ return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/run-summary", response_model=ResponseSchema) +@check_kinfin_session +async def get_run_summary( + request: Request, + session_id: str = Depends(header_scheme), + detailed: Optional[bool] = Query(False), +): + try: + result_dir = query_manager.get_session_dir(session_id) + filepath = os.path.join(result_dir, RUN_SUMMARY_FILEPATH) + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{RUN_SUMMARY_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + with open(filepath, "r") as f: + data = json.load(f) + + if not detailed: + data = { + k: v + for k, v in data.items() + if k not in ["included_proteins", "excluded_proteins"] + } + + response = ResponseSchema( + status="success", + message="Run summary retrieved successfully.", + query=str(request.url), + data=data, + ) + return JSONResponse(content=response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/counts-by-taxon", response_model=ResponseSchema) +@check_kinfin_session +async def get_counts_by_taxon( + request: Request, + session_id: str = Depends(header_scheme), + include_clusters: Optional[str] = Query(None), + exclude_clusters: Optional[str] = Query(None), + min_count: Optional[int] = Query(None), + max_count: Optional[int] = Query(None), + include_taxons: Optional[str] = Query(None), + exclude_taxons: Optional[str] = Query(None), +): + try: + result_dir = query_manager.get_session_dir(session_id) + filepath = os.path.join(result_dir, COUNTS_FILEPATH) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{COUNTS_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result = parse_taxon_counts_file( + filepath, + include_clusters=include_clusters, + exclude_clusters=exclude_clusters, + include_taxons=include_taxons, + exclude_taxons=exclude_taxons, + min_count=min_count, + max_count=max_count, + ) + + response = ResponseSchema( + status="success", + message="Cluster counts by Taxon retrieved successfully", + data=result, + query=str(request.url), + ) + return JSONResponse(response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/cluster-summary/{attribute}", response_model=ResponseSchema) +@check_kinfin_session +async def get_cluster_summary( + request: Request, + attribute: str, + session_id: str = Depends(header_scheme), + include_clusters: Optional[str] = Query(None), + exclude_clusters: Optional[str] = Query(None), + include_properties: Optional[str] = Query(None), + exclude_properties: Optional[str] = Query(None), + min_cluster_protein_count: Optional[int] = Query(None), + max_cluster_protein_count: Optional[int] = Query(None), + min_protein_median_count: Optional[float] = Query(None), + max_protein_median_count: Optional[float] = Query(None), + sort_by: Optional[str] = Query(None), + sort_order: Optional[str] =
Query("asc"), + page: Optional[int] = Query(1), + size: Optional[int] = Query(10), +) -> JSONResponse: + try: + result_dir = query_manager.get_session_dir(session_id) + config_f = os.path.join(result_dir, "config.json") + if not os.path.exists(config_f): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + status_code=428, + ) + + valid_endpoints = extract_attributes_and_taxon_sets(config_f) + valid_attributes = valid_endpoints["attributes"] + + if attribute and attribute not in valid_attributes: + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"Invalid attribute: {attribute}. Must be one of {valid_attributes}.", + error="Invalid Input", + ).model_dump(), + status_code=400, + ) + + filename = f"{attribute}/{attribute}.{CLUSTER_SUMMARY_FILENAME}" + filepath = os.path.join(result_dir, filename) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{COUNTS_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result = parse_cluster_summary_file( + filepath=filepath, + include_clusters=include_clusters, + exclude_clusters=exclude_clusters, + include_properties=include_properties, + exclude_properties=exclude_properties, + min_cluster_protein_count=min_cluster_protein_count, + max_cluster_protein_count=max_cluster_protein_count, + min_protein_median_count=min_protein_median_count, + max_protein_median_count=max_protein_median_count, + ) + + paginated_result, total_pages = sort_and_paginate_result( + result, + sort_by, + sort_order, + page, + size, + ) + + response = ResponseSchema( + status="success", + message="Cluster summary retrieved successfully", + data=paginated_result, + query=str(request.url), + current_page=page, + entries_per_page=size, + total_pages=total_pages, + ) + return JSONResponse(response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/available-attributes-taxonsets") +@check_kinfin_session +async def get_available_attributes_and_taxon_sets( + request: Request, + session_id: str = Depends(header_scheme), +): + try: + result_dir = query_manager.get_session_dir(session_id) + result = extract_attributes_and_taxon_sets(result_dir) + return JSONResponse( + content=ResponseSchema( + status="success", + message="List of available attributes and taxon sets fetched", + data=result, + query=str(request.url), + ).model_dump(), + status_code=200, + ) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/attribute-summary/{attribute}", response_model=ResponseSchema) +@check_kinfin_session +async def get_attribute_summary( + request: Request, + attribute: str, + session_id: str = Depends(header_scheme), + sort_by: Optional[str] = Query(None), + sort_order: Optional[str] = Query("asc"), + page: Optional[int] = Query(1), + size: Optional[int] = Query(10), +): + try: + result_dir = query_manager.get_session_dir(session_id) + config_f = os.path.join(result_dir, "config.json") + if not 
os.path.exists(config_f): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + status_code=428, + ) + + valid_endpoints = extract_attributes_and_taxon_sets(config_f) + valid_attributes = valid_endpoints["attributes"] + + if attribute and attribute not in valid_attributes: + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"Invalid attribute: {attribute}. Must be one of {valid_attributes}.", + error="Invalid Input", + ).model_dump(), + status_code=400, + ) + + filename = f"{attribute}/{attribute}.{ATTRIBUTE_METRICS_FILENAME}" + filepath = os.path.join(result_dir, filename) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{COUNTS_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result = parse_attribute_summary_file(filepath=filepath) + paginated_result, total_pages = sort_and_paginate_result( + result, + sort_by, + sort_order, + page, + size, + ) + response = ResponseSchema( + status="success", + message="Cluster summary retrieved successfully", + data=paginated_result, + query=str(request.url), + current_page=page, + entries_per_page=size, + total_pages=total_pages, + ) + return JSONResponse(response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get( + "/kinfin/cluster-metrics/{attribute}/{taxon_set}", + response_model=ResponseSchema, +) +@check_kinfin_session +async def get_cluster_metrics( + request: Request, + attribute: str, + taxon_set: str, + session_id: str = Depends(header_scheme), + cluster_status: Optional[str] = Query(None), + cluster_type: Optional[str] = Query(None), + sort_by: Optional[str] = Query(None), + sort_order: Optional[str] = Query("asc"), + page: Optional[int] = Query(1), + size: Optional[int] = Query(10), +): + try: + result_dir = query_manager.get_session_dir(session_id) + config_f = os.path.join(result_dir, "config.json") + if not os.path.exists(config_f): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + status_code=428, + ) + + valid_endpoints = extract_attributes_and_taxon_sets(config_f) + valid_attributes = valid_endpoints["attributes"] + + if attribute and attribute not in valid_attributes: + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"Invalid attribute: {attribute}. Must be one of {valid_attributes}.", + error="Invalid Input", + ).model_dump(), + status_code=400, + ) + + valid_taxon_sets = valid_endpoints["taxon_sets"] + + if taxon_set and taxon_set not in valid_taxon_sets: + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"Invalid taxon set: {taxon_set}. 
Must be one of {valid_taxon_sets}.", + error="Invalid Input", + ).model_dump(), + status_code=400, + ) + + filename = f"{attribute}/{attribute}.{taxon_set}.{CLUSTER_METRICS_FILENAME}" + filepath = os.path.join(result_dir, filename) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{COUNTS_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result = parse_cluster_metrics_file(filepath, cluster_status, cluster_type) + paginated_result, total_pages = sort_and_paginate_result( + result, + sort_by, + sort_order, + page, + size, + ) + response = ResponseSchema( + status="success", + message="Cluster summary retrieved successfully", + data=paginated_result, + query=str(request.url), + current_page=page, + entries_per_page=size, + total_pages=total_pages, + ) + + return JSONResponse(response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get( + "/kinfin/pairwise-analysis/{attribute}", + response_model=ResponseSchema, +) +@check_kinfin_session +async def get_pairwise_analysis( + request: Request, + attribute: str, + session_id: str = Depends(header_scheme), + taxon_1: Optional[str] = Query(None), + taxon_2: Optional[str] = Query(None), +): + try: + result_dir = query_manager.get_session_dir(session_id) + config_f = os.path.join(result_dir, "config.json") + if not os.path.exists(config_f): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + status_code=428, + ) + + valid_endpoints = extract_attributes_and_taxon_sets(config_f) + valid_attributes = valid_endpoints["attributes"] + + if attribute and attribute not in valid_attributes: + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"Invalid attribute: {attribute}. Must be one of {valid_attributes}.", + error="Invalid Input", + ).model_dump(), + status_code=400, + ) + + filename = f"{attribute}/{attribute}.{PAIRWISE_ANALYSIS_FILE}" + filepath = os.path.join(result_dir, filename) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{COUNTS_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result = parse_pairwise_file(filepath, taxon_1, taxon_2) + + response = ResponseSchema( + status="success", + message="Cluster summary retrieved successfully", + data=result, + query=str(request.url), + ) + + return JSONResponse(response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/plot/{plot_type}") +@check_kinfin_session +async def get_plot( + request: Request, + plot_type: str, + session_id: str = Depends(header_scheme), +) -> FileResponse: + """ + Retrieve a specific plot type for a given session. + + Args: + plot_type (str): The type of plot to retrieve. + session_id (str): The session ID for authentication. + + Returns: + FileResponse: The requested plot file. 
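+ + Example (illustrative only; assumes a completed analysis session and the default port 8000): + curl -H "x-session-id: <session_id>" http://localhost:8000/kinfin/plot/cluster-size-distribution -o cluster_size_distribution.png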
+ + Raises: + HTTPException: If the plot type is invalid, session ID is invalid, or the file is not found. + """ + try: + if plot_type not in ["cluster-size-distribution", "all-rarefaction-curve"]: + return JSONResponse( + content=ResponseSchema( + status="error", + message="Invalid Plot Type", + error="invalid_plot_type", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result_dir = query_manager.get_session_dir(session_id) + filepath: str = "" + match plot_type: + case "cluster-size-distribution": + filepath = "cluster_size_distribution.png" + case "all-rarefaction-curve": + filepath = "all/all.rarefaction_curve.png" + case _: + return JSONResponse( + content=ResponseSchema( + status="error", + message="Invalid Plot Type", + error="invalid_plot_type", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + filepath = os.path.join(result_dir, filepath) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Plot not found", + error="plot_not_found", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + return FileResponse( + filepath, + media_type="image/png", + headers={"Content-Disposition": "inline"}, + ) + except HTTPException as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message=e.detail, + query=str(request.url), + ).model_dump(), + status_code=e.status_code, + ) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + error=str(e), + query=str(request.url), + ).model_dump(), + status_code=500, + ) diff --git a/src/api/fileparsers.py b/src/api/fileparsers.py new file mode 100644 index 0000000..7e4e99b --- /dev/null +++ b/src/api/fileparsers.py @@ -0,0 +1,247 @@ +import csv +from typing import Optional, Set, Union + + +def read_tsv_file(filepath: str, delimiter: str = "\t"): + try: + with open(filepath, "r", newline="") as file: + yield from csv.DictReader(file, delimiter=delimiter) + except csv.Error as e: + raise ValueError(f"Error reading CSV file: {e}") from e + + +def split_to_set(value: Optional[str]) -> Optional[Set[str]]: + return set(value.split(",")) if value else None + + +def filter_include_exclude( + item: str, + include_set: Optional[Set[str]] = None, + exclude_set: Optional[Set[str]] = None, +) -> bool: + if include_set and item not in include_set: + return False + return not exclude_set or item not in exclude_set + + +def filter_min_max( + value: Union[int, float], + min_value: Optional[Union[int, float]] = None, + max_value: Optional[Union[int, float]] = None, +) -> bool: + if min_value is not None: + min_value = float(min_value) + if max_value is not None: + max_value = float(max_value) + + return (min_value is None or value >= min_value) and ( + max_value is None or value <= max_value + ) + + +def parse_taxon_counts_file( + filepath: str, + include_clusters: Optional[str], + exclude_clusters: Optional[str], + include_taxons: Optional[str], + exclude_taxons: Optional[str], + min_count: Optional[int], + max_count: Optional[int], +): + included_clusters = split_to_set(include_clusters) + excluded_clusters = split_to_set(exclude_clusters) + included_taxons = split_to_set(include_taxons) + excluded_taxons = split_to_set(exclude_taxons) + + result = {} + + for row in read_tsv_file(filepath): + cluster_id = row["#ID"] + + if not filter_include_exclude(cluster_id, included_clusters, excluded_clusters): + continue + + if filtered_values := { 
+ taxon: int(count) + for taxon, count in row.items() + if taxon != "#ID" + and filter_min_max(int(count), min_count, max_count) + and filter_include_exclude(taxon, included_taxons, excluded_taxons) + }: + result[cluster_id] = filtered_values + + return result + + +def parse_cluster_summary_file( + filepath: str, + include_clusters: Optional[str], + exclude_clusters: Optional[str], + include_properties: Optional[str], + exclude_properties: Optional[str], + min_cluster_protein_count: Optional[int], + max_cluster_protein_count: Optional[int], + min_protein_median_count: Optional[float], + max_protein_median_count: Optional[float], +): + included_clusters = split_to_set(include_clusters) + excluded_clusters = split_to_set(exclude_clusters) + included_properties = split_to_set(include_properties) + excluded_properties = split_to_set(exclude_properties) + + rows = read_tsv_file(filepath) + result = {} + for row in rows: + cluster_id = row["#cluster_id"] + if not filter_include_exclude(cluster_id, included_clusters, excluded_clusters): + continue + + summary = { + "cluster_id": cluster_id, + "cluster_protein_count": int(row["cluster_protein_count"]), + "protein_median_count": float(row["protein_median_count"]), + "TAXON_count": int(row["TAXON_count"]), + "attribute": row["attribute"], + "attribute_cluster_type": row["attribute_cluster_type"], + "protein_span_mean": ( + None + if row["protein_span_mean"] == "N/A" + else float(row["protein_span_mean"]) + ), + "protein_span_sd": ( + None + if row["protein_span_sd"] == "N/A" + else float(row["protein_span_sd"]) + ), + } + + if not filter_min_max( + summary["cluster_protein_count"], + min_cluster_protein_count, + max_cluster_protein_count, + ) or not filter_min_max( + summary["protein_median_count"], + min_protein_median_count, + max_protein_median_count, + ): + continue + protein_counts = { + k: v + for k, v in row.items() + if k not in summary + and filter_include_exclude(k, included_properties, excluded_properties) + } + + result[cluster_id] = {**summary, "protein_counts": protein_counts} + return result + + +def parse_attribute_summary_file(filepath: str): + result = {} + + for row in read_tsv_file(filepath): + taxon_set = row["taxon_set"] + result[taxon_set] = { + "taxon_set": taxon_set, + "cluster_total_count": row["cluster_total_count"], + "protein_total_count": row["protein_total_count"], + "protein_total_span": row["protein_total_span"], + "singleton": { + "cluster_count": row["singleton_cluster_count"], + "protein_count": row["singleton_protein_count"], + "protein_span": row["singleton_protein_span"], + }, + "specific": { + "cluster_count": row["specific_cluster_count"], + "protein_count": row["specific_protein_count"], + "protein_span": row["specific_protein_span"], + "cluster_true_1to1_count": row["specific_cluster_true_1to1_count"], + "cluster_fuzzy_count": row["specific_cluster_fuzzy_count"], + }, + "shared": { + "cluster_count": row["shared_cluster_count"], + "protein_count": row["shared_protein_count"], + "protein_span": row["shared_protein_span"], + "cluster_true_1to1_count": row["shared_cluster_true_1to1_count"], + "cluster_fuzzy_count": row["shared_cluster_fuzzy_count"], + }, + "absent": { + "cluster_total_count": row["absent_cluster_total_count"], + "cluster_singleton_count": row["absent_cluster_singleton_count"], + "cluster_specific_count": row["absent_cluster_specific_count"], + "cluster_shared_count": row["absent_cluster_shared_count"], + }, + "TAXON_count": row["TAXON_count"], + "TAXON_taxa": row["TAXON_taxa"].split(", "), + } 
+ return result + + +def parse_cluster_metrics_file( + filepath: str, + cluster_status: Optional[str], + cluster_type: Optional[str], +): + result = {} + valid_status = split_to_set(cluster_status) + valid_types = split_to_set(cluster_type) + rows = read_tsv_file(filepath) + + for row in rows: + cluster_id = row["#cluster_id"] + if valid_types and row["cluster_type"] not in valid_types: + continue + + if not filter_include_exclude(row["cluster_status"], valid_status): + continue + + if not filter_include_exclude(row["cluster_type"], valid_types): + continue + + result[cluster_id] = { + "cluster_id": cluster_id, + "cluster_status": row["cluster_status"], + "cluster_type": row["cluster_type"], + "present_in_cluster": row["cluster_status"] == "present", + "is_singleton": row["cluster_type"] == "singleton", + "is_specific": row["cluster_type"] == "specific", + "counts": { + "cluster_protein_count": row["cluster_protein_count"], + "cluster_proteome_count": row["cluster_proteome_count"], + "TAXON_protein_count": row["TAXON_protein_count"], + "TAXON_mean_count": row["TAXON_mean_count"], + "non_taxon_mean_count": row["non_taxon_mean_count"], + }, + "representation": row["representation"], + "log2_mean(TAXON/others)": row["log2_mean(TAXON/others)"], + "pvalue(TAXON vs. others)": row["pvalue(TAXON vs. others)"], + "coverage": { + "taxon_coverage": row["TAXON_coverage"], + "TAXON_count": row["TAXON_count"], + "non_TAXON_count": row["non_TAXON_count"], + }, + "TAXON_taxa": ( + row["TAXON_taxa"].split(",") if row["TAXON_taxa"] != "N/A" else "N/A" + ), + "non_TAXON_taxa": ( + row["non_TAXON_taxa"].split(",") + if row["non_TAXON_taxa"] != "N/A" + else "N/A" + ), + } + + return result + + +def parse_pairwise_file(filepath: str, taxon_1: Optional[str], taxon_2: Optional[str]): + result = [] + for row in read_tsv_file(filepath): + if taxon_1 and row["TAXON_1"] != taxon_1 and row["TAXON_2"] != taxon_1: + continue + + if taxon_2 and row["TAXON_1"] != taxon_2 and row["TAXON_2"] != taxon_2: + continue + + result.append(row) + + return result diff --git a/src/api/sessions.py b/src/api/sessions.py new file mode 100644 index 0000000..66b5d8b --- /dev/null +++ b/src/api/sessions.py @@ -0,0 +1,114 @@ +import hashlib +import json +import logging +import os +import shutil +import signal +import sys +import threading +import time +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Tuple + +logger = logging.getLogger("kinfin_logger") + + +class QueryManager: + """ + A class to manage query sessions, including creation, retrieval, and cleanup of session directories. + """ + + def __init__(self, expiration_hours: int = 24) -> None: + """Initializes the QueryManager with the specified expiration time for sessions.""" + self.results_base_dir = os.environ.get("RESULTS_BASE_DIR") + self.cluster_f = "" + self.sequence_ids_f = "" + self.taxon_idx_mapping_file = "" + self.nodesdb_f = "" + self.pfam_mapping_f = "" + self.ipr_mapping_f = "" + self.go_mapping_f = "" + if self.results_base_dir is None or not os.path.isabs(self.results_base_dir): + sys.exit("[ERROR] RESULTS_BASE_DIR should be an absolute path.") + + self.expiration_hours = expiration_hours + os.makedirs(self.results_base_dir, exist_ok=True) + + self.cleanup_thread = threading.Thread(target=self.cleanup_loop, daemon=True) + self.cleanup_thread.start() + + def get_session_id(self, query: List[Dict[str, str]]) -> str: + """ + Generate a unique session ID based on the query. 
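+ + The ID is the MD5 hex digest of the JSON-serialized query (keys sorted), so identical configurations always resolve to the same session directory.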
+ + Args: + query (List[Dict[str, str]]): The query for which to generate a session ID. + + Returns: + str: The generated session ID. + """ + query_json = json.dumps(query, sort_keys=True) + return hashlib.md5(query_json.encode()).hexdigest() + + def get_or_create_session(self, query: List[Dict[str, str]]) -> Tuple[str, str]: + """ + Get or create a session directory based on the query. + + Args: + query (List[Dict[str, str]]): The query for which to get or create a session. + + Returns: + tuple: The session ID and the session directory path. + """ + session_id = self.get_session_id(query) + session_dir = os.path.join(self.results_base_dir, session_id) + + if not os.path.exists(session_dir): + os.makedirs(session_dir) + else: + os.utime(session_dir, None) + + return session_id, session_dir + + def get_session_dir(self, session_id: str) -> Optional[str]: + """ + Get the directory path of an existing session. + + Args: + session_id (str): The session ID for which to get the directory path. + + Returns: + str: The session directory path, or None if the session does not exist. + """ + session_dir = os.path.join(self.results_base_dir, session_id) + if os.path.exists(session_dir): + os.utime(session_dir, None) + return session_dir + return None + + def cleanup_loop(self) -> None: + """The main loop for periodically cleaning up expired sessions.""" + while True: + self.cleanup_expired_sessions() + time.sleep(3600) + + def cleanup_expired_sessions(self) -> None: + """Clean up sessions that have expired based on the expiration time.""" + now = datetime.now() + for session_id in os.listdir(self.results_base_dir): + session_dir = os.path.join(self.results_base_dir, session_id) + mod_time = datetime.fromtimestamp(os.path.getmtime(session_dir)) + + if now - mod_time > timedelta(hours=self.expiration_hours): + shutil.rmtree(session_dir) + + def __exit__(self, _, __) -> None: + """Cleanup all sessions when exiting due to signal""" + shutil.rmtree(self.results_base_dir) + exit(0) + + +query_manager = QueryManager() + +signal.signal(signal.SIGINT, query_manager.__exit__) +signal.signal(signal.SIGTERM, query_manager.__exit__) diff --git a/src/api/utils.py b/src/api/utils.py new file mode 100644 index 0000000..aa2f6ca --- /dev/null +++ b/src/api/utils.py @@ -0,0 +1,117 @@ +import asyncio +import glob +from collections import defaultdict + + +def read_status(status_file): + status_info = {} + with open(status_file, "r") as file: + for line in file: + key, value = line.strip().split("=", 1) + status_info[key] = value + + return status_info + + +def write_status( + status_file: str, + status: str, + exit_code: int = None, + error: str = None, +): + with open(status_file, "w") as file: + file.write(f"status={status}\n") + if exit_code is not None: + file.write(f"exit_code={exit_code}\n") + if error: + file.write(f"error={error}\n") + + +def extract_error_message(stderr: str) -> str: + lines = stderr.strip().splitlines() + error_message_lines = [] + error_found = False + + for line in lines: + if "[ERROR] -" in line: + error_found = True + error_message_lines.append(line.split("[ERROR] -")[1]) + continue + if error_found: + error_message_lines.append(line) + + return ( + " ".join(error_message_lines) + if error_message_lines + else "An unknown error occurred." 
+ ) + + +async def run_cli_command(command: list, status_file: str): + write_status(status_file, "running") + + try: + process = await asyncio.create_subprocess_exec( + *command, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + stdout, stderr = await process.communicate() + stdout = stdout.decode().strip() + stderr = stderr.decode().strip() + + if process.returncode == 0: + write_status(status_file, "completed") + return stdout + else: + error_message = extract_error_message(stderr) + write_status( + status_file, + "error", + exit_code=process.returncode, + error=error_message, + ) + return None + + except Exception as e: + write_status(status_file, "error", error=str(e)) + return None + + +def extract_attributes_and_taxon_sets(filepath: str): + files = glob.glob(f"{filepath}/**/*.cluster_metrics.txt") + files = [file.split(filepath)[1] for file in files] + attributes = set() + result = {"attributes": [], "taxon_sets": defaultdict(list)} + for file in files: + filename = file.split("/")[-1] + attribute = filename.split(".")[0] + taxon_set = filename.split(".")[1] + attributes.add(attribute) + result["taxon_sets"][attribute].append(taxon_set) + result["attributes"] = sorted(attributes) + return result + + +def sort_and_paginate_result( + result: dict, + sort_by: str, + sort_order: str = "asc", + page: int = 1, + size: int = 20, +) -> tuple: + if sort_by: + sort_keys = sort_by.split(",") + items = list(result.items()) + items.sort( + key=lambda item: tuple(item[1].get(key, float("inf")) for key in sort_keys), + reverse=(sort_order != "asc"), + ) + result = dict(items) + start_index = (page - 1) * size + end_index = start_index + size + paginated_result = dict(list(result.items())[start_index:end_index]) + total_pages = -(-len(result) // size) + + return paginated_result, total_pages diff --git a/src/cli/__init__.py b/src/cli/__init__.py new file mode 100644 index 0000000..5b52ebe --- /dev/null +++ b/src/cli/__init__.py @@ -0,0 +1,20 @@ +import os + +from core.input import InputData +from core.logger import setup_logger +from core.results import analyse + + +def run_cli(args: InputData) -> None: + """ + Run the command-line interface to perform analysis based on the provided input data. + + Args: + args (InputData): An instance of InputData containing input parameters and data. + + Returns: + None + """ + log_path = os.path.join(args.output_path, "kinfin.log") + setup_logger(log_path) + analyse(args) diff --git a/src/cli/commands.py b/src/cli/commands.py new file mode 100644 index 0000000..6a8d176 --- /dev/null +++ b/src/cli/commands.py @@ -0,0 +1,212 @@ +import argparse +import sys +from typing import Union + +from cli.validate import validate_cli_args +from core.config import SUPPORTED_PLOT_FORMATS, SUPPORTED_TAXRANKS, SUPPORTED_TESTS +from core.input import InputData, ServeArgs + + +# TODO : --plotsize should take a tuple +# TODO : --taxranks should take multiple inputs +def parse_args( + nodesdb_f: str, + pfam_mapping_f: str, + ipr_mapping_f: str, + go_mapping_f: str, +) -> Union[ServeArgs, InputData]: + """Parse command-line arguments. + + Args: + nodesdb_f (str): Path to the nodesDB file. + pfam_mapping_f (str): Path to the PFAM mapping file. + ipr_mapping_f (str): Path to the InterPro mapping file. + go_mapping_f (str): Path to the Gene Ontology mapping file. + + Returns: + ServeArgs or InputData: Parsed arguments based on the command. + + Raises: + SystemExit: If an invalid command is provided.
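+ + Example invocations (file names are illustrative): + python src/main.py serve --port 8000 + python src/main.py analyse -g OrthologousGroups.txt -c config.csv -s SequenceIDs.txt -o results/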
+ """ + + parser = argparse.ArgumentParser( + description="Kinfin proteome cluster analysis tool" + ) + + subparsers = parser.add_subparsers(title="command", required=True, dest="command") + api_parser = subparsers.add_parser("serve", help="Start the server") + api_parser.add_argument( + "-p", + "--port", + type=int, + default=8000, + help="Port number for the server (default: 8000)", + ) + + cli_parser = subparsers.add_parser("analyse", help="Perform analysis") + + # Required Arguments + required_group = cli_parser.add_argument_group("Required Arguments") + required_group.add_argument( + "-g", + "--cluster_file", + help="OrthologousGroups.txt produced by OrthoFinder", + required=True, + ) + required_group.add_argument( + "-c", "--config_file", help="Config file (in CSV format)", required=True + ) + required_group.add_argument( + "-s", + "--sequence_ids_file", + help="SequenceIDs.txt used in OrthoFinder", + required=True, + ) + + # Other Files + other_files_group = cli_parser.add_argument_group("Other Files") + other_files_group.add_argument( + "-p", "--species_ids_file", help="SpeciesIDs.txt used in OrthoFinder" + ) + other_files_group.add_argument( + "-m", "--taxon_idx_mapping", help="TAXON IDX Mapping File" + ) + other_files_group.add_argument( + "-f", + "--functional_annotation", + help="Mapping of ProteinIDs to GO/IPRS/SignalP/Pfam (can be generated through 'iprs_to_table.py')", + ) + other_files_group.add_argument("-a", "--fasta_dir", help="Directory of FASTA files") + other_files_group.add_argument( + "-t", + "--tree_file", + help="Tree file in Newick format (taxon names must be the same as TAXON in config file)", + ) + + # General Options + general_group = cli_parser.add_argument_group("General Options") + general_group.add_argument("-o", "--output_path", help="Output prefix") + general_group.add_argument( + "--infer_singletons", + help="Absence of proteins in clustering is interpreted as singleton (based on SequenceIDs.txt)", + action="store_true", + ) + general_group.add_argument( + "--plot_tree", + help="Plot PDF of annotated phylogenetic tree (requires -t, full ETE3 installation and X-server/xvfb-run)", + action="store_true", + ) + general_group.add_argument( + "--min_proteomes", + help="Required number of proteomes in a taxon-set to be used in rarefaction/representation-test computations [default: 2]", + default=2, + type=int, + ) + general_group.add_argument( + "--test", + help="Test to be used in representation-test computations [default: mannwhitneyu]. 
Options: ttest, welch, mannwhitneyu, ks, kruskal", + default="mannwhitneyu", + choices=SUPPORTED_TESTS, + ) + general_group.add_argument( + "-r", + "--taxranks", + help="Taxonomic ranks to be inferred from TaxIDs in config file [default: phylum,order,genus]", + # TODO : Add SUPPORTED_TAXRANKS here + default=["phylum", "order", "genus"], + nargs="+", + choices=SUPPORTED_TAXRANKS, + ) + general_group.add_argument( + "--repetitions", + help="Number of repetitions for rarefaction curves [default: 30]", + default=30, + type=int, + ) + + # Fuzzy Orthology Groups + fuzzy_group = cli_parser.add_argument_group("Fuzzy Orthology Groups") + fuzzy_group.add_argument( + "-n", + "--target_count", + help="Target number of copies per proteome [default: 1]", + default=1, + type=int, + ) + fuzzy_group.add_argument( + "-x", + "--target_fraction", + help="Min proportion of proteomes at target_count [default: 0.75]", + default=0.75, + type=float, + ) + fuzzy_group.add_argument( + "--min", + help="Min count of proteins for proteomes outside of target_fraction [default: 0]", + default=0, + type=int, + ) + fuzzy_group.add_argument( + "--max", + help="Max count of proteins for proteomes outside of target_fraction [default: 20]", + default=20, + type=int, + ) + + plotting_group = cli_parser.add_argument_group("Plotting Options") + plotting_group.add_argument( + "--fontsize", help="Fontsize for plots [default: 18]", default=18, type=int + ) + plotting_group.add_argument( + "--plotsize", + help="Size (WIDTH,HEIGHT) for plots [default: 24,12]", + default=(24, 12), + nargs=2, + ) + plotting_group.add_argument( + "--plot_format", + help="Plot formats [default: pdf]", + default="pdf", + choices=SUPPORTED_PLOT_FORMATS, + ) + + args = parser.parse_args() + + if args.command == "serve": + return ServeArgs(port=args.port) + elif args.command == "analyse": + validate_cli_args(args=args) + fuzzy_range = { + x for x in range(args.min, args.max + 1) if x != args.target_count + } + + return InputData( + cluster_file=args.cluster_file, + config_f=args.config_file, + sequence_ids_file=args.sequence_ids_file, + species_ids_file=args.species_ids_file, + functional_annotation_f=args.functional_annotation, + fasta_dir=args.fasta_dir, + tree_file=args.tree_file, + output_path=args.output_path, + infer_singletons=args.infer_singletons, + plot_tree=args.plot_tree, + min_proteomes=args.min_proteomes, + test=args.test, + taxranks=args.taxranks, + repetitions=args.repetitions + 1, + fuzzy_count=args.target_count, + fuzzy_fraction=args.target_fraction, + fuzzy_range=fuzzy_range, + fontsize=args.fontsize, + plotsize=args.plotsize, + plot_format=args.plot_format, + nodesdb_f=nodesdb_f, + pfam_mapping_f=pfam_mapping_f, + ipr_mapping_f=ipr_mapping_f, + go_mapping_f=go_mapping_f, + taxon_idx_mapping_file=args.taxon_idx_mapping, + ) + else: + sys.exit() diff --git a/src/cli/validate.py b/src/cli/validate.py new file mode 100644 index 0000000..cc9bbe0 --- /dev/null +++ b/src/cli/validate.py @@ -0,0 +1,83 @@ +import logging +import sys + +from core.utils import check_file + +logger = logging.getLogger("kinfin_logger") + + +def validate_cli_args(args) -> None: + """Validate cli input arguments. + + This function checks if all required files exist and if the arguments meet specific conditions. + + Args: + args (InputData): Input arguments as a named tuple. + + Raises: + SystemExit: If there are any validation errors, exits the program with error messages. 
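+ + Note: all validation failures are collected and reported together before exiting, rather than aborting on the first error.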
+ """ + + error_msgs = [] + + try: + check_file(args.cluster_file) + except FileNotFoundError as e: + error_msgs.append(str(e)) + try: + if not isinstance(args.config_file, str): + raise ValueError("[ERROR] - Invalid config file data") + + check_file(args.config_file) + except (FileNotFoundError, ValueError) as e: + error_msgs.append(str(e)) + try: + check_file(args.sequence_ids_file) + except FileNotFoundError as e: + error_msgs.append(str(e)) + try: + check_file(args.species_ids_file) + except FileNotFoundError as e: + error_msgs.append(str(e)) + try: + check_file(args.tree_file) + except FileNotFoundError as e: + error_msgs.append(str(e)) + try: + check_file(args.functional_annotation) + except FileNotFoundError as e: + error_msgs.append(str(e)) + + if args.fasta_dir and not args.species_ids_file: + error_msgs.append( + "[ERROR] : You have provided a FASTA-dir using '--fasta-dir'. Please also provide a Species-ID file using ('--species_ids_file')." + ) + + if args.target_count < 0: + error_msgs.append( + f"[ERROR] : --target_count {args.target_count} must be greater than 0" + ) + + if args.target_fraction < 0 or args.target_fraction > 1: + error_msgs.append( + f"[ERROR] : --target_fraction {args.target_fraction} is not between 0.0 and 1.0" + ) + + if args.min > args.max: + error_msgs.append( + f"[ERROR] : --min {args.min} is greater than --max {args.max}" + ) + + if args.repetitions <= 0: + error_msgs.append( + "[ERROR] : Please specify a positive integer for the number of repetitions for the rarefaction curves" + ) + + if args.min_proteomes <= 0: + error_msgs.append( + "[ERROR] : Please specify a positive integer for the minimum number of proteomes to consider for computations" + ) + + if error_msgs: + logger.error("\n".join(error_msgs)) + sys.exit(1) diff --git a/build/lib/kinfin/__init__.py b/src/core/__init__.py similarity index 100% rename from build/lib/kinfin/__init__.py rename to src/core/__init__.py diff --git a/src/core/alo.py b/src/core/alo.py new file mode 100644 index 0000000..378c83c --- /dev/null +++ b/src/core/alo.py @@ -0,0 +1,250 @@ +from typing import Dict, List, Literal, Optional, Set, Union + +from core.clusters import Cluster + + +class AttributeLevel: + """ + Definitions: + 'shared' : shared between one ALO and others + 'singleton' : cardinality of 1 ('specific', but separate) + 'specific' : only present within one ALO + """ + + def __init__(self, attribute: str, level: str, proteomes: Set[str]) -> None: + self.attribute: str = attribute + self.level: str = level + self.proteomes: Set[str] = set(proteomes) + self.proteomes_list: List[str] = list(proteomes) + self.proteome_count: int = len(proteomes) + + self.cluster_ids_by_cluster_type_by_cluster_status: Dict[ + str, Dict[str, List[str]] + ] = { + # sums up to cluster_count + "present": {"singleton": [], "specific": [], "shared": []}, + "absent": {"singleton": [], "specific": [], "shared": []}, + } + + self.protein_ids_by_cluster_type: Dict[str, List[str]] = { + # list of lists + "singleton": [], + "specific": [], + "shared": [], + } + + self.protein_span_by_cluster_type: Dict[str, List[Union[int, float]]] = { + "singleton": [], + "specific": [], + "shared": [], + } + + self.clusters_by_cluster_cardinality_by_cluster_type: Dict[ + str, Dict[str, List[str]] + ] = { + "shared": {"true": [], "fuzzy": []}, + "specific": {"true": [], "fuzzy": []}, + } + + self.cluster_status_by_cluster_id: Dict[str, Literal["absent", "present"]] = {} + self.cluster_type_by_cluster_id: Dict[ + str, Literal["singleton", "shared", 
"specific"] + ] = {} + + self.cluster_mwu_pvalue_by_cluster_id = {} + self.cluster_mwu_log2_mean_by_cluster_id = {} + self.cluster_mean_ALO_count_by_cluster_id = {} + self.cluster_mean_non_ALO_count_by_cluster_id = {} + + self.domain_counter_by_domain_source_by_cluster_type = None + self.protein_with_domain_count_by_domain_source_by_cluster_type = None + + self.protein_length_stats_by_cluster_id: Dict[ + str, Dict[str, Union[int, float]] + ] = {} + self.protein_count_by_cluster_id: Dict[str, int] = {} + + def add_cluster( + self, + cluster: Cluster, + attribute_cluster_type: Literal["singleton", "shared", "specific"], + ALO_cluster_status: Literal["absent", "present"], + ALO_protein_length_stats: Dict[str, Union[int, float]], + ALO_protein_ids_in_cluster: List[str], + ALO_cluster_cardinality: Optional[str], + mwu_pvalue: Optional[float], + mwu_log2_mean: Optional[float], + mean_ALO_count: Optional[float], + mean_non_ALO_count: Optional[float], + ) -> None: + """ + Adds a cluster to various data structures maintained by the class. + + Args: + cluster (Cluster): The cluster object to add. + attribute_cluster_type (Literal["singleton", "shared", "specific"]): + Type of the cluster as either 'singleton', 'shared', or 'specific'. + ALO_cluster_status (Literal["absent", "present"]): + Status of the cluster, either 'absent' or 'present'. + ALO_protein_length_stats (Dict[str, Union[int, float]]): + Length statistics of proteins in the cluster. + ALO_protein_ids_in_cluster (List[str]): + List of protein IDs present in the cluster. + ALO_cluster_cardinality (Optional[str]): + Cardinality of the cluster (if applicable). + mwu_pvalue (Optional[float]): + P-value from Mann-Whitney U test (if applicable). + mwu_log2_mean (Optional[float]): + Log2 transformed mean (if applicable). + mean_ALO_count (Optional[float]): + Mean count of ALO (if applicable). + mean_non_ALO_count (Optional[float]): + Mean count of non-ALO (if applicable). + + Returns: + None + """ + self.cluster_ids_by_cluster_type_by_cluster_status[ALO_cluster_status][ + attribute_cluster_type + ].append(cluster.cluster_id) + self.cluster_status_by_cluster_id[cluster.cluster_id] = ALO_cluster_status + self.cluster_type_by_cluster_id[cluster.cluster_id] = attribute_cluster_type + self.protein_length_stats_by_cluster_id[cluster.cluster_id] = ( + ALO_protein_length_stats + ) + + self.protein_count_by_cluster_id[cluster.cluster_id] = len( + ALO_protein_ids_in_cluster + ) + + if ALO_cluster_status == "present": + for ALO_protein_id in ALO_protein_ids_in_cluster: + self.protein_ids_by_cluster_type[attribute_cluster_type].append( + ALO_protein_id + ) + self.protein_span_by_cluster_type[attribute_cluster_type].append( + ALO_protein_length_stats["sum"] + ) + if attribute_cluster_type != "singleton" and ALO_cluster_cardinality: + self.clusters_by_cluster_cardinality_by_cluster_type[ + attribute_cluster_type + ][ALO_cluster_cardinality].append(cluster.cluster_id) + + self.cluster_mwu_pvalue_by_cluster_id[cluster.cluster_id] = mwu_pvalue + self.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id] = mwu_log2_mean + self.cluster_mean_ALO_count_by_cluster_id[cluster.cluster_id] = mean_ALO_count + self.cluster_mean_non_ALO_count_by_cluster_id[cluster.cluster_id] = ( + mean_non_ALO_count + ) + + def get_protein_count_by_cluster_type(self, cluster_type: str) -> int: + """ + Return the count of proteins for a specific cluster type. + + Args: + cluster_type (str): Type of the cluster. Use "total" for the total count across all types. 
+ + Returns: + int: Number of proteins in the specified cluster type. + + Raises: + KeyError: If 'cluster_type' is not found in self.protein_ids_by_cluster_type. + """ + if cluster_type == "total": + return sum( + len(protein_ids) + for _, protein_ids in list(self.protein_ids_by_cluster_type.items()) + ) + else: + return len(self.protein_ids_by_cluster_type[cluster_type]) + + def get_cluster_count_by_cluster_status_by_cluster_type( + self, + cluster_status: str, + cluster_type: str, + ) -> int: + """ + Get the count of clusters of a specific status and type. + + Args: + cluster_status (str): The status of clusters to count. + cluster_type (str): The type of cluster to count. Use "total" to get + the total count across all cluster types for the given status. + + Returns: + int: Number of clusters with the specified status and type. + + Raises: + KeyError: If 'cluster_status' or 'cluster_type' is not found in + self.cluster_ids_by_cluster_type_by_cluster_status. + """ + if cluster_type == "total": + return sum( + len(cluster_ids) + for _, cluster_ids in list( + self.cluster_ids_by_cluster_type_by_cluster_status[ + cluster_status + ].items() + ) + ) + else: + return len( + self.cluster_ids_by_cluster_type_by_cluster_status[cluster_status][ + cluster_type + ] + ) + + def get_protein_span_by_cluster_type(self, cluster_type: str) -> Union[int, float]: + """ + Get the total span of proteins for a specific cluster type. + + Args: + cluster_type (str): The type of cluster for which to retrieve protein span. + Use "total" to get the total span across all cluster types. + + Returns: + Union[int, float]: Total span of proteins in the specified cluster type. + If 'cluster_type' is "total", returns the sum of spans across all + cluster types. + """ + return ( + sum( + sum(protein_ids) + for _, protein_ids in list(self.protein_span_by_cluster_type.items()) + ) + if cluster_type == "total" + else sum(self.protein_span_by_cluster_type[cluster_type]) + ) + + def get_cluster_count_by_cluster_cardinality_by_cluster_type( + self, + cluster_type: str, + cluster_cardinality: str, + ) -> int: + """ + Return the count of clusters of a specific type and cardinality. + + Args: + cluster_type (str): Type of the cluster. + cluster_cardinality (str): Cardinality of the clusters. + + Returns: + int: Number of clusters with the specified type and cardinality. + + Raises: + KeyError: If 'cluster_type' or 'cluster_cardinality' is not found. + """ + return len( + self.clusters_by_cluster_cardinality_by_cluster_type[cluster_type][ + cluster_cardinality + ] + ) + + def get_proteomes(self) -> str: + """ + Get a sorted string representation of proteome IDs. + + Returns: + str: Comma-separated and sorted list of proteome IDs. 
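+ + Example return value (taxon names are illustrative): + "TAXON_A, TAXON_B, TAXON_C"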
+ """ + return ", ".join(sorted([str(proteome_id) for proteome_id in self.proteomes])) diff --git a/src/core/alo_collections.py b/src/core/alo_collections.py new file mode 100644 index 0000000..232bfc7 --- /dev/null +++ b/src/core/alo_collections.py @@ -0,0 +1,513 @@ +import logging +import os +import random +from typing import Any, Dict, List, Optional, Set + +import ete3 +import matplotlib as mat +import matplotlib.pyplot as plt +import numpy as np +from ete3 import Tree + +from core.alo import AttributeLevel +from core.config import ATTRIBUTE_RESERVED + +logger = logging.getLogger("kinfin_logger") + +mat.use("agg") + + +plt.style.use("ggplot") +mat.rc("ytick", labelsize=20) +mat.rc("xtick", labelsize=20) +axis_font = {"size": "20"} +mat.rcParams.update({"font.size": 22}) + + +class AloCollection: + def __init__( + self, + proteomes: Set[str], + attributes: List[str], + proteome_id_by_species_id: Dict[str, str], + level_by_attribute_by_proteome_id: Dict[str, Dict[str, str]], + node_idx_by_proteome_ids: Optional[Dict[Any, Any]], + tree_ete: Optional[Tree], + ) -> None: + self.proteomes = proteomes + self.attributes_verbose = attributes + self.attributes = [ + # list of attributes + attribute + for attribute in attributes + if attribute not in ATTRIBUTE_RESERVED + ] + self.proteome_id_by_species_id = proteome_id_by_species_id + self.level_by_attribute_by_proteome_id = level_by_attribute_by_proteome_id + self.node_idx_by_proteome_ids = node_idx_by_proteome_ids + self.tree_ete = tree_ete + self.proteome_ids_by_level_by_attribute = ( + self.compute_proteomes_by_level_by_attribute() + ) + self.fastas_parsed: bool = False + self.ALO_by_level_by_attribute = self.create_ALOs() + + def compute_proteomes_by_level_by_attribute( + self, + ) -> Dict[str, Dict[str, Set[str]]]: + """ + Compute proteomes grouped by levels for each attribute. + + Args: + attributes (List[str]): A list of strings representing attributes. + level_by_attribute_by_proteome_id (Dict[str, Dict[str, str]]): A dictionary where keys + are proteome IDs (strings), and values are dictionaries with keys representing + attributes (strings) and values representing levels (strings). + + Returns: + Dict[str, Dict[str, Set[str]]]: A dictionary where keys are attributes (strings), + and values are dictionaries. The inner dictionaries have keys representing + levels (strings) and values representing sets of proteome IDs (strings). + """ + proteomes_by_level_by_attribute: Dict[str, Dict[str, Set[str]]] = { + attribute: {} for attribute in self.attributes + } + for proteome_id in self.level_by_attribute_by_proteome_id: + for attribute in self.attributes: + level = self.level_by_attribute_by_proteome_id[proteome_id][attribute] + if level not in proteomes_by_level_by_attribute[attribute]: + proteomes_by_level_by_attribute[attribute][level] = set() + proteomes_by_level_by_attribute[attribute][level].add(proteome_id) + return proteomes_by_level_by_attribute + + def create_ALOs(self) -> Dict[str, Dict[str, Optional[AttributeLevel]]]: + """ + Creates Attribute Level Objects (ALOs) for each attribute and level based on + proteome IDs. + + Returns: + Dict[str, Dict[str, Optional[AttributeLevel]]]: + A dictionary where each key is an attribute name (str), + and the corresponding value is a dictionary mapping level (str) + to an AttributeLevel instance or None. 
+ """ + ALO_by_level_by_attribute: Dict[str, Dict[str, Optional[AttributeLevel]]] = { + attribute: {} for attribute in self.attributes + } + for attribute in self.proteome_ids_by_level_by_attribute: + for level in self.proteome_ids_by_level_by_attribute[attribute]: + proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][level] + ALO = AttributeLevel( + # + attribute=attribute, + level=level, + proteomes=proteome_ids, + ) + if level not in ALO_by_level_by_attribute[attribute]: + ALO_by_level_by_attribute[attribute][level] = None + ALO_by_level_by_attribute[attribute][level] = ALO + return ALO_by_level_by_attribute + + def generate_header_for_node(self, node: ete3.TreeNode, dirs: Dict[str, str]): + """ + Generates a header image for a given node of a tree with specified statistics. + + Args: + node (ete3.TreeNode): The TreeNode object representing the node for which the header is generated. + dirs (Dict[str, str]): A dictionary containing directory paths, including 'tree_headers' where the header image will be saved. + + Returns: + str: File path to the generated header image. + + Notes: + - The method generates a header image in PNG format displaying various statistics (apomorphies and synapomorphies) for the given tree node. + - The statistics include counts of singletons, non-singletons, complete presence synapomorphies, and partial absence synapomorphies. + - The generated image is saved in the specified directory under 'tree_headers' with the node's name as the filename. + + Raises: + Any exceptions that might occur during file saving or table rendering. + """ + + node_header_f = os.path.join(dirs["tree_headers"], f"{node.name}.header.png") + data = [ + ( + "Apomorphies (size=1)", + "{:,}".format( + node.apomorphic_cluster_counts["singletons"] # type:ignore + ), + ), + ( + "Apomorphies (size>1)", + "{:,}".format( + node.apomorphic_cluster_counts["non_singletons"] # type:ignore + ), + ), + ( + "Synapomorphies (all)", + "{:,}".format( + node.synapomorphic_cluster_counts[ # type:ignore + "complete_presence" + ] + + node.synapomorphic_cluster_counts[ # type:ignore + "partial_absence" + ] + ), + ), + ( + "Synapomorphies (cov=100%)", + "{:,}".format( + node.synapomorphic_cluster_counts[ # type:ignore + "complete_presence" + ] + ), + ), + ( + "Synapomorphies (cov<100%)", + "{:,}".format( + node.synapomorphic_cluster_counts[ # type: ignore + "partial_absence" + ] # type:ignore + ), + ), + ] + col_labels = ("Type", "Count") + fig, ax = plt.subplots(figsize=(2, 0.5)) + ax.set_facecolor("white") + table = ax.table( + cellText=data, + colLabels=col_labels, + loc="bottom", + fontsize=24, + colLoc="center", + rowLoc="right", + edges="", + ) + table.set_fontsize(24) + table.scale(2, 1) + for key, cell in list(table.get_celld().items()): + row, col = key + cell._text.set_color("grey") # type:ignore + cell.set_edgecolor("darkgrey") + cell.visible_edges = "T" if row > 0 else "B" + if row == len(data) - 2: + cell.set_edgecolor("darkgrey") + cell.visible_edges = "T" + ax.axis("tight") + ax.axis("off") + logger.info(f"[STATUS]\t- Plotting {node_header_f}") + fig.savefig(node_header_f, pad=0, bbox_inches="tight", format="png") + plt.close() + return node_header_f + + def generate_chart_for_node( + self, + node, + dirs: Dict[str, str], + plot_format: str, + fontsize: int, + ) -> Optional[str]: + """ + Generate and save a histogram chart for a given node's synapomorphies. + + Args: + - node: The node object containing synapomorphic cluster strings. 
+ - dirs: A dictionary containing directory paths, specifically 'tree_charts' for saving charts. + - plot_format: The format in which to save the chart ('png' or 'pdf'). + - fontsize: Font size for axis labels and ticks. + + Returns: + - Optional[str]: Path to the saved chart file if successful, None otherwise. + """ + + if proteome_coverages := [ + float(synapomorphic_cluster_string[3]) + for synapomorphic_cluster_string in node.synapomorphic_cluster_strings + ]: + chart_f = os.path.join(dirs["tree_charts"], f"{node.name}.barchart.png") + f, ax = plt.subplots(figsize=(3.0, 3.0)) + ax.set_facecolor("white") + x_values = np.array(proteome_coverages) + ax.hist( + x_values, + histtype="stepfilled", + align="mid", + bins=np.arange(0.0, 1.0 + 0.1, 0.1), + ) + ax.set_xlim(-0.1, 1.1) + for tick in ax.xaxis.get_majorticklabels(): + tick.set_fontsize(fontsize - 2) + tick.set_rotation("vertical") + for tick in ax.yaxis.get_majorticklabels(): + tick.set_fontsize(fontsize - 2) + ax.set_frame_on(False) + ax.xaxis.grid(True, linewidth=1, which="major", color="lightgrey") + ax.yaxis.grid(True, linewidth=1, which="major", color="lightgrey") + f.suptitle("Synapomorphies", y=1.1) + ax.set_ylabel("Count", fontsize=fontsize) + ax.set_xlabel("Proteome coverage", fontsize=fontsize) + logger.info(f"[STATUS]\t- Plotting {chart_f}") + f.savefig(chart_f, bbox_inches="tight", format="png") + if plot_format == "pdf": + pdf_chart_f = os.path.join( + dirs["tree_charts"], + f"{node.name}.barchart.pdf", + ) + logger.info(f"[STATUS]\t- Plotting {pdf_chart_f}") + f.savefig(pdf_chart_f, bbox_inches="tight", format="pdf") + plt.close() + return chart_f + + def plot_text_tree(self, dirs: Dict[str, str]) -> None: + """ + Plot and save the textual representation of the tree. + + This method uses the `tree_ete` attribute of the class to generate and save + both a Newick format (.nwk) and a text format (.txt) representation of the tree. + + Args: + - dirs: A dictionary containing directory paths, specifically 'tree' for saving tree files. + + Returns: + - None + """ + if self.tree_ete: + tree_nwk_f = os.path.join(dirs["tree"], "tree.nwk") + self.tree_ete.write(format=1, outfile=tree_nwk_f) + tree_txt_f = os.path.join(dirs["tree"], "tree.txt") + with open(tree_txt_f, "w") as tree_txt_fh: + tree_txt_fh.write( + f"{self.tree_ete.get_ascii(show_internal=True, compact=False)}\n" + ) + + def plot_tree( + self, + header_f_by_node_name, + charts_f_by_node_name, + dirs: Dict[str, str], + ) -> None: + """ + Plot and save a tree visualization with custom header and chart images for nodes. + + This method uses the `self.tree_ete` attribute of the class to visualize the tree + in a hierarchical manner, with customized header and chart images for each node. + + Args: + - header_f_by_node_name: Dictionary mapping node names to header image file paths (must be PNG). + - charts_f_by_node_name: Dictionary mapping node names to chart image file paths (must be PNG). + - dirs: A dictionary containing directory paths, specifically 'tree' for saving the tree PDF. + + Returns: + - None + """ + tree_f = os.path.join( + dirs["tree"], "tree.pdf" + ) # must be PDF! (otherwise it breaks) + style = ete3.NodeStyle() + style["vt_line_width"] = 5 + style["hz_line_width"] = 5 + style["fgcolor"] = "darkgrey" + for node in self.tree_ete.traverse("levelorder"): # type: ignore + node.set_style(style) + if header_f_by_node_name[node.name]: + # must be PNG! 
(ETE can't do PDF Faces) + node_header_face = ete3.faces.ImgFace(header_f_by_node_name[node.name]) + node.add_face(node_header_face, column=0, position="branch-top") + if charts_f_by_node_name[node.name]: + # must be PNG! (ETE can't do PDF Faces) + node_chart_face = ete3.faces.ImgFace(charts_f_by_node_name[node.name]) + node.add_face(node_chart_face, column=0, position="branch-bottom") + node_name_face = ete3.TextFace(node.name, fsize=64) + node.img_style["size"] = 10 + node.img_style["shape"] = "sphere" + node.img_style["fgcolor"] = "black" + if not node.is_leaf(): + node.add_face(node_name_face, column=0, position="branch-right") + node.add_face(node_name_face, column=0, position="aligned") + ts = ete3.TreeStyle() + ts.draw_guiding_lines = True + ts.show_scale = False + ts.show_leaf_name = False + ts.allow_face_overlap = True + ts.guiding_lines_color = "lightgrey" + logger.info(f"[STATUS] - Writing tree {tree_f}... ") + self.tree_ete.render( # type: ignore + tree_f, dpi=600, h=1189, units="mm", tree_style=ts + ) + + def write_tree( + self, + dirs: Dict[str, str], + render_tree: bool, + plot_format: str, + fontsize: int, + ) -> None: + """ + Write tree data to files and optionally render a graphical tree representation. + + This method generates and saves various metrics and data related to the tree structure, + including node statistics and cluster metrics. It can also render a graphical tree + representation if specified. + + Args: + - dirs: A dictionary containing directory paths, including 'tree' for saving tree-related files. + - render_tree: Boolean flag indicating whether to render a graphical tree representation. + - plot_format: Format for saving plots ('png', 'pdf', etc.). + - fontsize: Font size used for plotting. + + Returns: + - None + """ + if not self.tree_ete: + return + logger.info("[STATUS] - Writing data for tree ... 
") + # Node stats + node_stats_f = os.path.join(dirs["tree"], "tree.node_metrics.txt") + node_stats_header: List[str] = [ + "#nodeID", + "taxon_specific_apomorphies_singletons", + "taxon_specific_apomorphies_non_singletons", + "node_specific_synapomorphies_total", + "node_specific_synapomorphies_complete_presence", + "node_specific_synapomorphies_partial_absence", + "proteome_count", + ] + node_stats: List[str] = ["\t".join(node_stats_header)] + # Cluster node stats + node_clusters_f = os.path.join(dirs["tree"], "tree.cluster_metrics.txt") + node_clusters_header = [ + "#clusterID", + "nodeID", + "synapomorphy_type", + "node_taxon_coverage", + "children_coverage", + "node_taxa_present", + ] + node_clusters = ["\t".join(node_clusters_header)] + # header_f_by_node_name + header_f_by_node_name = {} + charts_f_by_node_name = {} + for node in self.tree_ete.traverse("levelorder"): # type: ignore + for synapomorphic_cluster_string in node.synapomorphic_cluster_strings: # type: ignore + node_clusters.append( + "\t".join( + [str(string) for string in list(synapomorphic_cluster_string)] + ) + ) + node_stats_line = [ + node.name, + node.apomorphic_cluster_counts["singletons"], # type: ignore + node.apomorphic_cluster_counts["non_singletons"], # type: ignore + ( + # type: ignore + node.synapomorphic_cluster_counts["complete_presence"] # type: ignore + # type: ignore + + node.synapomorphic_cluster_counts["partial_absence"] # type: ignore + ), + # type: ignore + node.synapomorphic_cluster_counts["complete_presence"], # type: ignore + node.synapomorphic_cluster_counts["partial_absence"], # type: ignore + len(node.proteome_ids), # type: ignore + ] + node_stats.append("\t".join([str(string) for string in node_stats_line])) + if render_tree: + header_f_by_node_name[node.name] = self.generate_header_for_node( + node, dirs + ) + charts_f_by_node_name[node.name] = self.generate_chart_for_node( + node, dirs, plot_format, fontsize + ) + logger.info(f"[STATUS] - Writing {node_stats_f} ... ") + with open(node_stats_f, "w") as node_stats_fh: + node_stats_fh.write("\n".join(node_stats) + "\n") + logger.info(f"[STATUS] - Writing {node_clusters_f} ... 
") + with open(node_clusters_f, "w") as node_clusters_fh: + node_clusters_fh.write("\n".join(node_clusters) + "\n") + if render_tree: + self.plot_tree(header_f_by_node_name, charts_f_by_node_name, dirs) + else: + self.plot_text_tree(dirs) + + def compute_repetition_for_rarefaction_curve( + self, + ALO: AttributeLevel, + attribute: str, + level: str, + rarefaction_by_samplesize_by_level_by_attribute: Dict[ + str, Dict[str, Dict[int, List[int]]] + ], + ): + seen_cluster_ids = set() + random_list_of_proteome_ids = list(ALO.proteomes) + random.shuffle(random_list_of_proteome_ids) + for idx, proteome_id in enumerate(random_list_of_proteome_ids): + if proteome_ALO := self.ALO_by_level_by_attribute["taxon"][proteome_id]: + seen_cluster_ids.update( + proteome_ALO.cluster_ids_by_cluster_type_by_cluster_status[ + "present" + ]["specific"] + ) + seen_cluster_ids.update( + proteome_ALO.cluster_ids_by_cluster_type_by_cluster_status[ + "present" + ]["shared"] + ) + sample_size = idx + 1 + if ( + sample_size + not in rarefaction_by_samplesize_by_level_by_attribute[attribute][ + level + ] + ): + rarefaction_by_samplesize_by_level_by_attribute[attribute][level][ + sample_size + ] = [] + rarefaction_by_samplesize_by_level_by_attribute[attribute][level][ + sample_size + ].append(len(seen_cluster_ids)) + + def compute_rarefaction_data( + self, repetitions: int + ) -> Dict[str, Dict[str, Dict[int, List[int]]]]: + """ + Compute rarefaction data and generate rarefaction curves for proteome clusters. + + This method computes rarefaction curves to analyze the accumulation of non-singleton + clusters as proteome samples increase. It generates plots for each attribute based on + the specified parameters. + + Args: + - repetitions: Number of repetitions to shuffle proteome lists for random sampling. 
+ + Returns: + - Dict[str, Dict[str, Dict[int, List[int]]]] + """ + rarefaction_by_samplesize_by_level_by_attribute: Dict[ + str, Dict[str, Dict[int, List[int]]] + ] = {} + logger.info("[STATUS] - Generating rarefaction data ...") + for attribute in self.attributes: + for level in self.proteome_ids_by_level_by_attribute[attribute]: + proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][level] + if len(proteome_ids) == 1: + continue + + if attribute not in rarefaction_by_samplesize_by_level_by_attribute: + rarefaction_by_samplesize_by_level_by_attribute[attribute] = {} + if ( + level + not in rarefaction_by_samplesize_by_level_by_attribute[attribute] + ): + rarefaction_by_samplesize_by_level_by_attribute[attribute][ + level + ] = {} + ALO = self.ALO_by_level_by_attribute[attribute][level] + if ALO is None: + continue + for _ in range(repetitions): + self.compute_repetition_for_rarefaction_curve( + ALO=ALO, + attribute=attribute, + level=level, + rarefaction_by_samplesize_by_level_by_attribute=rarefaction_by_samplesize_by_level_by_attribute, + ) + return rarefaction_by_samplesize_by_level_by_attribute diff --git a/src/core/build.py b/src/core/build.py new file mode 100644 index 0000000..772fd0d --- /dev/null +++ b/src/core/build.py @@ -0,0 +1,393 @@ +import json +import logging +import os +from collections import Counter, OrderedDict, defaultdict +from typing import Dict, List, Optional, Set + +from core.alo_collections import AloCollection +from core.clusters import Cluster, ClusterCollection +from core.logic import ( + add_taxid_attributes, + parse_attributes_from_config_data, + parse_fasta_dir, + parse_go_mapping, + parse_ipr_mapping, + parse_pfam_mapping, + parse_tree_from_file, +) +from core.proteins import Protein, ProteinCollection +from core.utils import progress, yield_file_lines + +logger = logging.getLogger("kinfin_logger") + + +def get_singletons( + proteinCollection: ProteinCollection, + cluster_list: List[Cluster], +) -> int: + """ + Identify and create singleton clusters for unclustered proteins in a protein collection. + + Args: + - proteinCollection (ProteinCollection): An instance of ProteinCollection class. + - cluster_list (List[Cluster]): A list to which new singleton Cluster objects will be appended. + + Returns: + - int: Number of singleton clusters created and appended to cluster_list. + + This function iterates through proteins in the given protein collection that are not yet clustered. + For each unclustered protein, it creates a new singleton cluster and appends it to cluster_list. + """ + logger.info("[STATUS] - Inferring singletons ...") + singleton_idx = 0 + for protein in proteinCollection.proteins_list: + if protein.clustered is False: + cluster_id = f"singleton_{singleton_idx}" + cluster = Cluster( + cluster_id, + [protein.protein_id], + proteinCollection, + ) + cluster_list.append(cluster) + singleton_idx += 1 + return singleton_idx + + +def parse_cluster_file( + output_dir: str, + cluster_f: str, + proteinCollection: ProteinCollection, + available_proteomes: Set[str], +) -> List[Cluster]: + """ + Parses a cluster file to create Cluster objects and updates protein information. + Saves the filtered clustering data and stats to files. + + Args: + output_dir (str): Base directory path for saving files. + cluster_f (str): Path to the cluster file. + proteinCollection (ProteinCollection): Collection of Protein objects. + available_proteomes (Set[str]): Set of all available proteomes. 
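# Illustrative sketch of the Orthogroups.txt line format handled by parse_cluster_file
# below: "<cluster_id>: <protein_id> <protein_id> ...", where the proteome ID is taken
# as the part of the protein ID before the first ".". The line and the set of
# available proteomes here are hypothetical.
line = "OG0000001: 0.protA 0.protB 1.protC 2.protD"
available_proteomes = {"0", "1"}

parts = line.rstrip("\n").split(" ")
cluster_id, protein_ids = parts[0].replace(":", ""), [p for p in parts[1:] if p]
filtered_protein_ids = [p for p in protein_ids if p.split(".")[0] in available_proteomes]

print(cluster_id, filtered_protein_ids)  # OG0000001 ['0.protA', '0.protB', '1.protC']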
+ + Returns: + Tuple[List[Cluster], Dict[str, any]]: List of Cluster objects and stats. + + Raises: + FileNotFoundError: If the cluster file `cluster_f` does not exist. + """ + cluster_list: List[Cluster] = [] + stats = { + "total_clusters": 0, + "total_proteins": 0, + "total_proteomes": len(available_proteomes), + "filtered_clusters": 0, + "filtered_proteins": 0, + "included_proteins": [], + "excluded_proteins": [], + "included_proteomes": defaultdict(int), + "excluded_proteomes": defaultdict(int), + } + + output_filtered_file = os.path.join(output_dir, "orthogroups.filtered.txt") + stats_file = os.path.join(output_dir, "summary.json") + + with open(cluster_f) as fh, open(output_filtered_file, "w") as ofh: + for line in fh: + stats["total_clusters"] += 1 + temp: List[str] = line.rstrip("\n").split(" ") + cluster_id, protein_ids = temp[0].replace(":", ""), temp[1:] + protein_ids = [protein_id for protein_id in protein_ids if protein_id] + + filtered_protein_ids = [] + for protein_id in protein_ids: + proteome_id = protein_id.split(".")[0] # Extract proteome ID + if proteome_id in available_proteomes: + filtered_protein_ids.append(protein_id) + stats["included_proteins"].append(protein_id) + stats["included_proteomes"][proteome_id] += 1 + else: + stats["excluded_proteins"].append(protein_id) + stats["excluded_proteomes"][proteome_id] += 1 + + stats["total_proteins"] += len(protein_ids) + stats["filtered_proteins"] += len(filtered_protein_ids) + + if filtered_protein_ids: + # Only create a cluster if there are proteins left after filtering + cluster = Cluster(cluster_id, filtered_protein_ids, proteinCollection) + for protein_id in filtered_protein_ids: + protein = proteinCollection.proteins_by_protein_id[protein_id] + protein.clustered = True + cluster_list.append(cluster) + filtered_protein_ids.sort() + ofh.write(f"{cluster_id}: {', '.join(filtered_protein_ids)}\n") + stats["filtered_clusters"] += 1 + + stats["included_proteins_count"] = len(set(stats["included_proteins"])) + stats["excluded_proteins_count"] = len(set(stats["excluded_proteins"])) + + # Convert proteome counts to lists of counts for JSON serialization + stats["included_proteomes"] = dict(stats["included_proteomes"]) + stats["excluded_proteomes"] = dict(stats["excluded_proteomes"]) + + # Reorder stats + ordered_stats = OrderedDict( + [ + ("total_clusters", stats["total_clusters"]), + ("total_proteins", stats["total_proteins"]), + ("total_proteomes", stats["total_proteomes"]), + ("filtered_clusters", stats["filtered_clusters"]), + ("filtered_proteins", stats["filtered_proteins"]), + ("included_proteins_count", stats["included_proteins_count"]), + ("excluded_proteins_count", stats["excluded_proteins_count"]), + ("included_proteomes", stats["included_proteomes"]), + ("excluded_proteomes", stats["excluded_proteomes"]), + ("included_proteins", stats["included_proteins"]), + ("excluded_proteins", stats["excluded_proteins"]), + ] + ) + + with open(stats_file, "w") as mf: + json.dump( + ordered_stats, + mf, + separators=(", ", ": "), + indent=4, + ) + + return cluster_list + + +# cli +def parse_domains_from_functional_annotations_file( + functional_annotation_f: str, + proteinCollection: ProteinCollection, +) -> None: + """ + Parse functional annotations from a file and populate ProteinCollection with parsed data. + + Parameters: + - functional_annotation_f (str): Path to the functional annotation file. + - proteinCollection (ProteinCollection): Instance of ProteinCollection class to store parsed data. 
+ - pfam_mapping (bool): Flag indicating whether to parse Pfam mappings. + - ipr_mapping (bool): Flag indicating whether to parse InterPro mappings. + - pfam_mapping_f (str): File path to the Pfam mapping file. + - ipr_mapping_f (str): File path to the InterPro mapping file. + - go_mapping_f (str): File path to the GO mapping file. + + Raises: + - ValueError: If the functional annotation file lacks a header. + + Notes: + - The function reads each line of the functional annotation file, parses relevant data, + and populates the proteinCollection with domain annotations and GO terms. + - It also optionally parses additional mappings (Pfam, InterPro, GO) based on provided flags. + - Updates proteinCollection.functional_annotation_parsed and proteinCollection.domain_desc_by_id_by_source. + """ + + logger.info( + f"[STATUS] - Parsing {functional_annotation_f} ... this may take a while" + ) + + for line in yield_file_lines(functional_annotation_f): + temp: List[str] = line.split() + if temp[0].startswith("#"): + proteinCollection.domain_sources = temp[1:] + + else: + if not proteinCollection.domain_sources: + error_msg = f"[ERROR] - {functional_annotation_f} does not seem to have a header." + raise ValueError(error_msg) + + domain_protein_id: str = temp.pop(0) + go_terms: List[str] = [] + domain_counter_by_domain_source: Dict[str, Counter[str]] = {} + for idx, field in enumerate(temp): + if field != "None": + domain_source: str = proteinCollection.domain_sources[idx] + domain_string: List[str] = field.split(";") + domain_counts_by_domain_id: Dict[str, int] = {} + for domain_id_count in domain_string: + domain_id: str + domain_count: int = 1 + if domain_source == "GO": + domain_id = domain_id_count + else: + domain_id, domain_count_str = domain_id_count.rsplit(":", 2) + domain_count = int(domain_count_str) + domain_counts_by_domain_id[domain_id] = domain_count + domain_counter: Counter[str] = Counter(domain_counts_by_domain_id) + domain_counter_by_domain_source[domain_source] = domain_counter + proteinCollection.add_annotation_to_protein( + domain_protein_id=domain_protein_id, + domain_counter_by_domain_source=domain_counter_by_domain_source, + go_terms=go_terms, + ) + + proteinCollection.functional_annotation_parsed = True + + +# common +def build_AloCollection( + config_f: str, + nodesdb_f: str, + taxranks: List[str], + tree_f: Optional[str], + taxon_idx_mapping_file: Optional[str], +) -> AloCollection: + """ + Builds an AloCollection object from command-line interface (CLI) inputs. + + Args: + config_f (str): Path to the configuration file containing proteome attributes. + nodesdb_f (str): Path to the nodes database file for inferring taxonomic ranks. + taxranks (List[str]): List of taxonomic ranks to be inferred. + tree_f (Optional[str]): Path to the tree file. If provided, ALOs are added from the tree. + + Returns: + AloCollection: An instance of the AloCollection class containing parsed data. 
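# Illustrative sketch of how one functional-annotation row becomes per-source domain
# Counters, simplified from parse_domains_from_functional_annotations_file above.
# The header sources and field values are hypothetical.
from collections import Counter

domain_sources = ["Pfam", "IPR", "GO"]                   # from the "#"-header line
fields = ["PF00069:2;PF07714:1", "IPR000719:2", "None"]  # one protein's columns

domain_counter_by_domain_source = {}
for idx, field in enumerate(fields):
    if field == "None":
        continue
    source = domain_sources[idx]
    counts = {}
    for entry in field.split(";"):
        if source == "GO":
            counts[entry] = 1                            # GO terms carry no count
        else:
            domain_id, count_str = entry.rsplit(":", 1)
            counts[domain_id] = int(count_str)
    domain_counter_by_domain_source[source] = Counter(counts)

# {'Pfam': Counter({'PF00069': 2, 'PF07714': 1}), 'IPR': Counter({'IPR000719': 2})}
print(domain_counter_by_domain_source)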
+ """ + ( + proteomes, + proteome_id_by_species_id, + attributes, + level_by_attribute_by_proteome_id, + ) = parse_attributes_from_config_data(config_f, taxon_idx_mapping_file) + # Add taxonomy if needed + if "TAXID" in set(attributes): + logger.info( + "[STATUS] - Attribute 'TAXID' found, inferring taxonomic ranks from nodesDB" + ) + attributes, level_by_attribute_by_proteome_id = add_taxid_attributes( + attributes=attributes, + level_by_attribute_by_proteome_id=level_by_attribute_by_proteome_id, + nodesdb_f=nodesdb_f, + taxranks=taxranks, + ) + + # Add ALOs from tree if provided + tree_ete, node_idx_by_proteome_ids = parse_tree_from_file( + tree_f, + attributes, + level_by_attribute_by_proteome_id, + proteomes, + ) + + logger.info("[STATUS] - Building AloCollection ...") + return AloCollection( + proteomes=proteomes, + attributes=attributes, + proteome_id_by_species_id=proteome_id_by_species_id, + level_by_attribute_by_proteome_id=level_by_attribute_by_proteome_id, + node_idx_by_proteome_ids=node_idx_by_proteome_ids, + tree_ete=tree_ete, + ) + + +def get_protein_list_from_seq_f(sequence_ids_f: str, aloCollection: AloCollection): + logger.info(f"[STATUS] - Parsing sequence IDs: {sequence_ids_f} ...") + + proteins_list: List[Protein] = [] + for line in yield_file_lines(sequence_ids_f): + temp = line.split(": ") + sequence_id = temp[0] + protein_id = ( + temp[1] + .split(" ")[0] + .replace(":", "_") + .replace(",", "_") + .replace("(", "_") + .replace(")", "_") + ) # orthofinder replaces characters + species_id = sequence_id.split("_")[0] + if proteome_id := aloCollection.proteome_id_by_species_id.get(species_id, None): + protein = Protein(protein_id, proteome_id, species_id, sequence_id) + proteins_list.append(protein) + return proteins_list + + +# common +def build_ProteinCollection( + sequence_ids_f: str, + aloCollection: AloCollection, + fasta_dir: Optional[str], + species_ids_f: Optional[str], + functional_annotation_f: Optional[str], + pfam_mapping: bool, + ipr_mapping: bool, + pfam_mapping_f: str, + go_mapping_f: str, + ipr_mapping_f: str, +) -> ProteinCollection: + proteins_list = get_protein_list_from_seq_f( + sequence_ids_f=sequence_ids_f, + aloCollection=aloCollection, + ) + proteinCollection = ProteinCollection(proteins_list) + + logger.info(f"[STATUS]\t - Proteins found = {proteinCollection.protein_count}") + + if fasta_dir is not None and species_ids_f is not None: + fasta_len_by_protein_id = parse_fasta_dir( + fasta_dir=fasta_dir, + species_ids_f=species_ids_f, + ) + logger.info("[STATUS] - Adding FASTAs to ProteinCollection ...") + parse_steps: float = proteinCollection.protein_count / 100 + for idx, protein in enumerate(proteinCollection.proteins_list): + protein.update_length(fasta_len_by_protein_id[protein.protein_id]) + progress(idx + 1, parse_steps, proteinCollection.protein_count) + aloCollection.fastas_parsed = True + proteinCollection.fastas_parsed = True + else: + logger.info( + "[STATUS] - No Fasta-Dir given, no AA-span information will be reported ..." 
+ ) + + if functional_annotation_f is not None: + parse_domains_from_functional_annotations_file( + functional_annotation_f=functional_annotation_f, + proteinCollection=proteinCollection, + ) + domain_desc_by_id_by_source = {} + + if pfam_mapping and "Pfam" in proteinCollection.domain_sources: + domain_desc_by_id_by_source["Pfam"] = parse_pfam_mapping(pfam_mapping_f) + + if ipr_mapping and "IPR" in proteinCollection.domain_sources: + domain_desc_by_id_by_source["IPR"] = parse_ipr_mapping(ipr_mapping_f) + + if go_mapping_f: + domain_desc_by_id_by_source["GO"] = parse_go_mapping(go_mapping_f) + + proteinCollection.domain_desc_by_id_by_source = domain_desc_by_id_by_source + + return proteinCollection + + +def build_ClusterCollection( + output_dir: str, + cluster_f: str, + proteinCollection: ProteinCollection, + infer_singletons: Optional[bool], + available_proteomes: Set[str], +) -> ClusterCollection: + logger.info(f"[STATUS] - Parsing {cluster_f} ... this may take a while") + cluster_list: List[Cluster] = parse_cluster_file( + output_dir, + cluster_f, + proteinCollection, + available_proteomes, + ) + + inferred_singletons_count = 0 + if infer_singletons: + inferred_singletons_count = get_singletons(proteinCollection, cluster_list) + + return ClusterCollection( + cluster_list, + inferred_singletons_count, + proteinCollection.functional_annotation_parsed, + proteinCollection.fastas_parsed, + proteinCollection.domain_sources, + ) diff --git a/src/core/clusters.py b/src/core/clusters.py new file mode 100644 index 0000000..79cff89 --- /dev/null +++ b/src/core/clusters.py @@ -0,0 +1,196 @@ +from collections import Counter +from math import log +from typing import DefaultDict, Dict, FrozenSet, List, Literal, Optional, Set + +from core.logic import compute_protein_ids_by_proteome +from core.proteins import ProteinCollection +from core.utils import mean, median, sd + + +class Cluster: + def __init__( + self, + cluster_id: str, + protein_ids: List[str], + proteinCollection: ProteinCollection, + ) -> None: + self.cluster_id: str = cluster_id + self.protein_ids = set(protein_ids) + self.protein_count: int = len(protein_ids) + try: + + self.proteomes_by_protein_id: Dict[str, str] = { + _id: proteinCollection.proteins_by_protein_id[_id].proteome_id + for _id in protein_ids + } + except KeyError as e: + error_msg = f"[ERROR] - Protein {e.args[0]} in clustering belongs to proteomes that are not present in the config-file." + error_msg += ( + "Please add those proteomes or recluster by omitting these proteomes." 
+ ) + raise KeyError(error_msg) from e + + self.proteome_ids_list: List[str] = list(self.proteomes_by_protein_id.values()) + self.protein_count_by_proteome_id: Counter[str] = Counter( + self.proteome_ids_list + ) + self.proteome_ids: FrozenSet[str] = frozenset(self.proteome_ids_list) + self.proteome_count: int = len(self.proteome_ids) + self.singleton: bool = self.protein_count <= 1 + self.apomorphy: bool = self.proteome_count <= 1 + + self.protein_ids_by_proteome_id: DefaultDict[str, Set[str]] = ( + compute_protein_ids_by_proteome(self.proteomes_by_protein_id) + ) + self.protein_counts_of_proteomes_by_level_by_attribute: Dict[ + str, Dict[str, List[int]] + ] = {} + self.proteome_coverage_by_level_by_attribute: Dict[str, Dict[str, float]] = {} + self.implicit_protein_ids_by_proteome_id_by_level_by_attribute: Dict[ + str, Dict[str, Dict[str, List[str]]] + ] = {} + self.cluster_type_by_attribute: Dict[ + str, + Literal["singleton", "shared", "specific"], + ] = {} + self.protein_median: Optional[float] = None + self.protein_length_stats: Optional[Dict[str, float]] = ( + self.compute_protein_length_stats(proteinCollection, self.protein_ids) + ) + self.secreted_cluster_coverage: float = self.compute_secreted_cluster_coverage( + proteinCollection, self.protein_ids, self.protein_count + ) + self.domain_counter_by_domain_source: Dict[str, Counter[str]] = ( + self.compute_domain_counter_by_domain_source( + proteinCollection, self.protein_ids + ) + ) + self.domain_entropy_by_domain_source: Dict[str, float] = ( + self.compute_domain_entropy_by_domain_source() + ) + + def compute_protein_length_stats( + self, + proteinCollection: ProteinCollection, + protein_ids: Set[str], + ) -> Optional[Dict[str, float]]: + """ + Computes statistics (mean, median, standard deviation) of protein lengths. + + Parameters: + - proteinCollection: A ProteinCollection object containing protein data. + - protein_ids: A set of protein IDs for which lengths are to be computed. + + Returns: + - Optional[Dict[str, float]]: A dictionary containing 'mean', 'median', and 'sd' + (standard deviation) of protein lengths, if all lengths are available and at least + one protein ID is provided. Returns None if no valid protein lengths are found. + """ + protein_lengths: List[Optional[int]] = [ + proteinCollection.proteins_by_protein_id[protein_id].length + for protein_id in protein_ids + ] + if all(protein_lengths): + protein_length_stats: Dict[str, float] = {"mean": mean(protein_lengths)} + protein_length_stats["median"] = median(protein_lengths) + protein_length_stats["sd"] = sd(protein_lengths) + return protein_length_stats + + def compute_secreted_cluster_coverage( + self, + proteinCollection: ProteinCollection, + protein_ids: Set[str], + protein_count: int, + ) -> float: + """ + Computes the fraction of secreted proteins in a given set of protein IDs. + + Parameters: + - proteinCollection: A ProteinCollection object containing protein data. + - protein_ids: A set of protein IDs to compute secreted protein coverage. + - protein_count: Total count of proteins in the cluster. + + Returns: + - float: Fraction of secreted proteins in the provided set of protein IDs. 
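# Illustrative sketch of the per-cluster bookkeeping done in Cluster.__init__ above.
# The protein-to-proteome assignments are hypothetical.
from collections import Counter

proteomes_by_protein_id = {"0.protA": "0", "0.protB": "0", "1.protC": "1"}

proteome_ids_list = list(proteomes_by_protein_id.values())
protein_count_by_proteome_id = Counter(proteome_ids_list)  # Counter({'0': 2, '1': 1})
proteome_ids = frozenset(proteome_ids_list)                # frozenset({'0', '1'})
singleton = len(proteomes_by_protein_id) <= 1              # False: more than one protein
apomorphy = len(proteome_ids) <= 1                         # False: spans two proteomes

print(protein_count_by_proteome_id, singleton, apomorphy)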
+ """ + secreted = sum( + bool(proteinCollection.proteins_by_protein_id[protein_id].secreted) + for protein_id in protein_ids + ) + return secreted / protein_count + + def compute_domain_counter_by_domain_source( + self, + proteinCollection: ProteinCollection, + protein_ids: Set[str], + ) -> Dict[str, Counter[str]]: + """ + Computes the aggregated domain counts by domain source for a set of protein IDs. + + Parameters: + - proteinCollection: A ProteinCollection object containing protein data. + - protein_ids: A set of protein IDs for which domain counts are computed. + + Returns: + - Dict[str, Counter[str]]: A dictionary where keys are domain sources and values are + Counters mapping domain IDs to their respective counts. + """ + cluster_domain_counter_by_domain_source: Dict[str, Counter[str]] = {} + for protein_id in protein_ids: + if protein_domain_counter_by_domain_source := proteinCollection.proteins_by_protein_id[ + protein_id + ].domain_counter_by_domain_source: + for domain_source, protein_domain_counter in list( + protein_domain_counter_by_domain_source.items() + ): + if domain_source not in cluster_domain_counter_by_domain_source: + cluster_domain_counter_by_domain_source[domain_source] = ( + Counter() + ) + cluster_domain_counter_by_domain_source[ + domain_source + ] += protein_domain_counter + return cluster_domain_counter_by_domain_source + + def compute_domain_entropy_by_domain_source(self) -> Dict[str, float]: + """ + Computes entropy for domains grouped by different sources. + + Returns: + - Dict[str, float]: Dictionary where keys are domain sources and values are computed entropy values. + """ + self.domain_entropy_by_domain_source: Dict[str, float] = {} + for domain_source, domain_counter in list( + self.domain_counter_by_domain_source.items() + ): + total_count: int = len(list(domain_counter.elements())) + domain_entropy: float = -sum( + i / total_count * log(i / total_count, 2) + for i in list(domain_counter.values()) + ) + if str(domain_entropy) == "-0.0": + self.domain_entropy_by_domain_source[domain_source] = 0.0 + else: + self.domain_entropy_by_domain_source[domain_source] = domain_entropy + return self.domain_entropy_by_domain_source + + +class ClusterCollection: + def __init__( + self, + cluster_list: List[Cluster], + inferred_singletons_count: int, + functional_annotation_parsed: bool, + fastas_parsed: bool, + domain_sources: List[str], + ): + self.cluster_list: List[Cluster] = cluster_list + self.cluster_list_by_cluster_id: Dict[str, Cluster] = { + cluster.cluster_id: cluster for cluster in cluster_list + } # only for testing + self.cluster_count: int = len(cluster_list) + self.inferred_singletons_count: int = inferred_singletons_count + self.functional_annotation_parsed: bool = functional_annotation_parsed + self.fastas_parsed: bool = fastas_parsed + # self.domain_sources = [domain_source for domain_source in domain_sources if not domain_source == "GO"] + self.domain_sources: List[str] = domain_sources diff --git a/src/core/config.py b/src/core/config.py new file mode 100644 index 0000000..098c545 --- /dev/null +++ b/src/core/config.py @@ -0,0 +1,15 @@ +ATTRIBUTE_RESERVED = ["IDX", "OUT", "TAXID"] +SUPPORTED_TESTS = {"welch", "mannwhitneyu", "ttest", "ks", "kruskal"} +SUPPORTED_PLOT_FORMATS = {"png", "pdf", "svg"} +SUPPORTED_TAXRANKS = { + "superkingdom", + "kingdom", + "phylum", + "class", + "order", + "superfamily", + "family", + "subfamily", + "genus", + "species", +} diff --git a/src/core/datastore.py b/src/core/datastore.py new file mode 100644 index 
0000000..58a4a73 --- /dev/null +++ b/src/core/datastore.py @@ -0,0 +1,2105 @@ +import logging +import os +import time +from collections import Counter, defaultdict +from typing import Any, Dict, FrozenSet, Generator, List, Set, Tuple, Union + +import matplotlib as mat +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.lines import Line2D +from matplotlib.ticker import FormatStrFormatter, NullFormatter + +from core.alo import AttributeLevel +from core.alo_collections import AloCollection +from core.build import ( + build_AloCollection, + build_ClusterCollection, + build_ProteinCollection, +) +from core.clusters import Cluster, ClusterCollection +from core.input import InputData +from core.logic import get_ALO_cluster_cardinality, get_attribute_cluster_type +from core.proteins import ProteinCollection +from core.utils import median, progress, statistic + +logger = logging.getLogger("kinfin_logger") +mat.use("agg") + +plt.style.use("ggplot") +mat.rc("ytick", labelsize=20) +mat.rc("xtick", labelsize=20) +axis_font = {"size": "20"} +mat.rcParams.update({"font.size": 22}) + + +class DataFactory: + def __init__(self, inputData: InputData) -> None: + self.dirs = {} + self.inputData: InputData = inputData + self.aloCollection: AloCollection = build_AloCollection( + config_f=self.inputData.config_f, + nodesdb_f=self.inputData.nodesdb_f, + tree_f=self.inputData.tree_f, + taxranks=self.inputData.taxranks, + taxon_idx_mapping_file=self.inputData.taxon_idx_mapping_file, + ) + self.proteinCollection: ProteinCollection = build_ProteinCollection( + aloCollection=self.aloCollection, + fasta_dir=self.inputData.fasta_dir, + go_mapping_f=self.inputData.go_mapping_f, + functional_annotation_f=self.inputData.functional_annotation_f, + ipr_mapping=self.inputData.ipr_mapping, + ipr_mapping_f=self.inputData.ipr_mapping_f, + pfam_mapping=self.inputData.pfam_mapping, + pfam_mapping_f=self.inputData.pfam_mapping_f, + sequence_ids_f=self.inputData.sequence_ids_f, + species_ids_f=self.inputData.species_ids_f, + ) + self.clusterCollection: ClusterCollection = build_ClusterCollection( + cluster_f=self.inputData.cluster_f, + output_dir=self.inputData.output_path, + proteinCollection=self.proteinCollection, + infer_singletons=self.inputData.infer_singletons, + available_proteomes=self.aloCollection.proteomes, + ) + + def setup_dirs(self) -> None: + """ + Set up output directories for storing results and attributes. 
+ """ + output_path: str = self.inputData.output_path + + self.dirs["main"] = output_path + logger.info("[STATUS] - Output directories in") + logger.info(f"\t{output_path}") + if not os.path.exists(output_path): + logger.info("[STATUS] - Creating main output directory...") + os.makedirs(output_path) + + logger.info("[STATUS] - Creating directories ...") + for attribute in self.aloCollection.attributes: + attribute_path = os.path.join(output_path, attribute) + self.dirs[attribute] = attribute_path + if not os.path.exists(attribute_path): + logger.info( + f"[STATUS] - Creating directory for attribute: {attribute_path}" + ) + os.makedirs(attribute_path) + + if self.aloCollection.tree_ete is not None: + tree_path = os.path.join(output_path, "tree") + node_chart_path = os.path.join(tree_path, "charts") + node_header_path = os.path.join(tree_path, "headers") + + if not os.path.exists(tree_path): + logger.info(f"[STATUS] - Creating tree directory: {tree_path}") + os.makedirs(tree_path) + self.dirs["tree"] = tree_path + + if not os.path.exists(node_chart_path): + logger.info( + f"[STATUS] - Creating node charts directory: {node_chart_path}" + ) + os.makedirs(node_chart_path) + self.dirs["tree_charts"] = node_chart_path + + if self.inputData.plot_tree and not os.path.exists(node_header_path): + logger.info( + f"[STATUS] - Creating node headers directory: {node_header_path}" + ) + os.makedirs(node_header_path) + self.dirs["tree_headers"] = node_header_path + + def analyse_clusters(self) -> None: + """ + Analyses clusters within the cluster collection. + + Then proceeds to analyse each cluster individually, + logging progress and timing information. + + Returns: + None + """ + if self.clusterCollection.inferred_singletons_count: + logger.info( + f"[STATUS]\t - Clusters found = {self.clusterCollection.cluster_count} (of which {self.clusterCollection.inferred_singletons_count} were inferred singletons)") # fmt:skip + + else: + logger.info( + f"[STATUS]\t - Clusters found = {self.clusterCollection.cluster_count}" + ) + + parse_steps = self.clusterCollection.cluster_count / 100 + + logger.info("[STATUS] - Analysing clusters ...") + analyse_clusters_start = time.time() + for idx, cluster in enumerate(self.clusterCollection.cluster_list): + self.__analyse_cluster(cluster) + progress(idx + 1, parse_steps, self.clusterCollection.cluster_count) + analyse_clusters_end = time.time() + analyse_clusters_elapsed = analyse_clusters_end - analyse_clusters_start + logger.info(f"[STATUS] - Took {analyse_clusters_elapsed}s to analyse clusters") + + def plot_rarefaction_data( + self, + rarefaction_by_samplesize_by_level_by_attribute: Dict[ + str, Dict[str, Dict[int, List[int]]] + ], + dirs: Dict[str, str], + plotsize: Tuple[float, float], + plot_format: str, + fontsize: int, + ) -> None: + """ + Plot rarefaction curves based on provided data. + + Args: + rarefaction_by_samplesize_by_level_by_attribute (dict): A nested dictionary + where keys are attribute names, and values are dictionaries where keys + are level names and values are dictionaries mapping sample sizes to + lists of non-singleton cluster counts. + dirs (dict): A dictionary mapping attribute names to directory paths where + plots will be saved. + plotsize (tuple): A tuple specifying the size of the plot (width, height) in inches. + plot_format (str): The format of the plot to save (e.g., 'png', 'pdf'). + fontsize (int): Font size for plot labels and legend. 
+ + Returns: + None + """ + for ( + attribute, + rarefaction_by_samplesize_by_level, + ) in rarefaction_by_samplesize_by_level_by_attribute.items(): + rarefaction_plot_f = os.path.join( + dirs[attribute], f"{attribute}.rarefaction_curve.{plot_format}" + ) + f, ax = plt.subplots(figsize=plotsize) + ax.set_facecolor("white") + max_number_of_samples = 0 + for idx, level in enumerate(rarefaction_by_samplesize_by_level): + number_of_samples = len(rarefaction_by_samplesize_by_level[level]) + if number_of_samples > max_number_of_samples: + max_number_of_samples = number_of_samples + colour = plt.cm.Paired( # type: ignore + idx / len(rarefaction_by_samplesize_by_level) + ) # type: ignore + x_values = [] + y_mins = [] + y_maxs = [] + median_y_values = [] + median_x_values = [] + for x, y_reps in list( + rarefaction_by_samplesize_by_level[level].items() + ): + x_values.append(x) + y_mins.append(min(y_reps)) + y_maxs.append(max(y_reps)) + median_y_values.append(median(y_reps)) + median_x_values.append(x) + x_array = np.array(x_values) + y_mins_array = np.array(y_mins) + y_maxs_array = np.array(y_maxs) + ax.plot( + median_x_values, + median_y_values, + "-", + color=colour, + label=level, + ) + ax.fill_between( + x_array, + y_mins_array, # type:ignore + y_maxs_array, # type:ignore + color=colour, + alpha=0.5, + ) + ax.set_xlim([0, max_number_of_samples + 1]) + ax.set_ylabel("Count of non-singleton clusters", fontsize=fontsize) + ax.set_xlabel("Sampled proteomes", fontsize=fontsize) + + ax.grid(True, linewidth=1, which="major", color="lightgrey") + legend = ax.legend( + ncol=1, + numpoints=1, + loc="lower right", + frameon=True, + fontsize=fontsize, + ) + legend.get_frame().set_facecolor("white") + logger.info(f"[STATUS]\t- Plotting {rarefaction_plot_f}") + f.savefig(rarefaction_plot_f, format=plot_format) + plt.close() + + def write_output(self) -> None: + """ + Executes various methods to generate and write output files related to cluster analysis. + + This method sequentially calls private methods to: + - Plot cluster sizes. + - Write cluster counts by taxon. + - Write cluster metrics related to domains. + - Write detailed cluster metrics related to domains. + - Write attribute metrics. + - Write a summary of cluster metrics. + - Write cluster metrics related to ALO (Additive Log Ratio) transformation. + - Write cluster 1-to-1 ALO metrics. + - Write pairwise representation metrics. + + Each private method is responsible for generating specific outputs based on internal data. + + Returns: + None + """ + self.__plot_cluster_sizes() + self.__write_cluster_counts_by_taxon() + self.__write_cluster_metrics_domains() + self.__write_cluster_metrics_domains_detailed() + self.__write_attribute_metrics() + self.__write_cluster_summary() + self.__write_cluster_metrics_ALO() + self.__write_cluster_1to1_ALO() + self.__write_pairwise_representation() + + # analyse cluster + def __analyse_ete_for_specific_cluster( + self, + cluster: Cluster, + intersection: FrozenSet[str], + node, + ) -> None: + """ + Analyzes a specific cluster within an evolutionary tree node. + + Updates various counts and attributes of the node based on the characteristics + of the given cluster and its intersection with proteome IDs. + + Args: + cluster (Cluster): The cluster to analyze. + intersection (FrozenSet[str]): The intersection of proteome IDs between + the cluster and the current node. + node: The evolutionary tree node to update. 
+ + Returns: + None + """ + node.counts["specific"] += 1 # type: ignore + if cluster.proteome_count == 1: + # But it only belongs to one proteome + node.apomorphic_cluster_counts["non_singletons"] += 1 # type: ignore + else: + # It has more than one proteome + child_nodes_covered = [] + child_node_proteome_coverage_strings = [] + child_node_proteome_ids_covered_count = 0 + for child_node in node.get_children(): + if child_node.proteome_ids.isdisjoint(cluster.proteome_ids): + # No child node proteomes are not in cluster + child_nodes_covered.append(False) + else: + # At least on child node proteome in cluster + child_nodes_covered.append(True) + child_node_proteome_ids_covered_count = len( + cluster.proteome_ids.intersection(child_node.proteome_ids) + ) + child_node_proteome_coverage_strings.append( + f"{child_node.name}=({child_node_proteome_ids_covered_count}/{len(child_node.proteome_ids)})" + ) + if all(child_nodes_covered): + # At least one proteome of each child node in cluster + # => SYNAPOMORPHY + node_proteome_coverage = len(intersection) / len( + node.proteome_ids + ) # type: ignore + node_cluster_type = "" + node_cluster_type = ( + "complete_presence" + if node_proteome_coverage == 1.0 + else "partial_absence" + ) + # type: ignore + node.synapomorphic_cluster_counts[node_cluster_type] += 1 + + node.synapomorphic_cluster_strings.append( # type: ignore + ( + cluster.cluster_id, + node.name, + node_cluster_type, + "{0:.3}".format(node_proteome_coverage), + ";".join(child_node_proteome_coverage_strings), + ",".join(sorted(intersection)), + ) + ) + + def __analyse_tree_ete(self, cluster: Cluster) -> None: + """ + Analyzes a cluster within an ETE Tree if available in the ALO collection. + + Traverses the ETE Tree in level order, comparing proteome IDs of each node + with the cluster's proteome IDs. Updates counts and attributes of nodes + based on the analysis results. + + Args: + cluster (Cluster): The cluster to analyze. + + Returns: + None + """ + if not self.aloCollection.tree_ete: + return + + for node in self.aloCollection.tree_ete.traverse("levelorder"): # type: ignore + intersection = cluster.proteome_ids.intersection( + node.proteome_ids # type: ignore + ) # type: ignore + difference = cluster.proteome_ids.difference( + node.proteome_ids # type: ignore + ) # type: ignore + + if len(intersection) == 0: + # Nothing to see here ... + node.counts["absent"] += 1 # type: ignore + + elif cluster.singleton is True: + # This is a singleton + node.counts["singleton"] += 1 # type: ignore + node.apomorphic_cluster_counts["singletons"] += 1 # type: ignore + + elif len(difference) > 0: + # This is a 'shared' cluster + node.counts["shared"] += 1 # type: ignore + + elif len(difference) == 0: + # This is a node 'specific' cluster + self.__analyse_ete_for_specific_cluster( + cluster=cluster, + intersection=intersection, + node=node, + ) + + def __process_level( + self, + cluster: Cluster, + attribute: str, + level: str, + protein_ids_by_level: Dict[str, List[str]], + protein_length_stats_by_level: Dict[str, Dict[str, Union[int, float]]], + explicit_protein_count_by_proteome_id_by_level: Dict[str, Dict[str, int]], + ) -> None: + """ + Processes a specific level within an attribute for a given cluster. + + Retrieves protein IDs and their counts associated with the specified level + from the ALO collection and updates various attributes and collections within + the cluster and the class instance. + + Args: + cluster (Cluster): The cluster for which to process the level. 
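# Illustrative sketch of how a cluster is classified against one tree node in
# __analyse_tree_ete above. Proteome sets are hypothetical, and the full code
# additionally requires every child node to be covered before a "specific"
# cluster is counted as a synapomorphy.
cluster_proteome_ids = frozenset({"P1", "P2"})
node_proteome_ids = frozenset({"P1", "P2", "P3"})
cluster_is_singleton = False

intersection = cluster_proteome_ids & node_proteome_ids
difference = cluster_proteome_ids - node_proteome_ids

if not intersection:
    status = "absent"        # cluster shares no proteomes with this node
elif cluster_is_singleton:
    status = "singleton"     # single-protein cluster
elif difference:
    status = "shared"        # cluster extends beyond this node's proteomes
else:
    status = "specific"      # all cluster proteomes sit under this node
    coverage = len(intersection) / len(node_proteome_ids)
    synapomorphy_type = "complete_presence" if coverage == 1.0 else "partial_absence"

print(status)  # 'specific' here; coverage = 2/3, so synapomorphy_type == 'partial_absence'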
+ attribute (str): The attribute associated with the level. + level (str): The specific level to process. + protein_ids_by_level (dict): A dictionary to store protein IDs by level. + protein_length_stats_by_level (dict): A dictionary to store protein length statistics + by level. + explicit_protein_count_by_proteome_id_by_level (dict): A dictionary to store explicit + protein counts by proteome ID for each level. + + Returns: + None + """ + ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][level] + if ALO is None: + return + + protein_ids_by_proteome_id = {} + protein_count_by_proteome_id = {} + protein_ids_by_level[level] = [] + + for proteome_id in ALO.proteomes_list: + protein_ids = list(cluster.protein_ids_by_proteome_id.get(proteome_id, [])) + protein_ids_by_level[level].extend(protein_ids) + protein_count_by_proteome_id[proteome_id] = len(protein_ids) + if protein_count_by_proteome_id[proteome_id] != 0: + protein_ids_by_proteome_id[proteome_id] = protein_ids + + if protein_ids_by_proteome_id: + cluster.implicit_protein_ids_by_proteome_id_by_level_by_attribute[ + attribute + ][level] = protein_ids_by_proteome_id + + explicit_protein_count_by_proteome_id_by_level[level] = ( + protein_count_by_proteome_id + ) + + protein_length_stats_by_level[level] = ( + self.proteinCollection.get_protein_length_stats(protein_ids_by_level[level]) + ) + + cluster.protein_counts_of_proteomes_by_level_by_attribute[attribute][level] = ( + list(protein_count_by_proteome_id.values()) + ) + + def __update_ALO_data( + self, + cluster: Cluster, + attribute: str, + protein_ids_by_level: Dict[str, List[str]], + protein_length_stats_by_level: Dict[str, Dict[str, Union[int, float]]], + explicit_protein_count_by_proteome_id_by_level: Dict[str, Dict[str, int]], + ) -> None: + """ + Updates ALO (Additive Log Ratio) data for a given cluster and attribute. + + Iterates through each level of the ALO collection corresponding to the attribute, + calculates various metrics based on the cluster's protein IDs and attributes, and + updates the ALO object with this information. + + Args: + cluster (Cluster): The cluster to update ALO data for. + attribute (str): The attribute associated with the ALO data. + protein_ids_by_level (dict): A dictionary mapping level names to lists of protein IDs. + protein_length_stats_by_level (dict): A dictionary mapping level names to dictionaries + containing protein length statistics. + explicit_protein_count_by_proteome_id_by_level (dict): A dictionary mapping level names + to dictionaries where keys are proteome IDs and values are explicit protein counts. 
+ + Returns: + None + """ + for level in self.aloCollection.ALO_by_level_by_attribute[attribute]: + ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][level] + if ALO is None: + continue + + cluster.proteome_coverage_by_level_by_attribute[attribute][level] = ( + len( + cluster.implicit_protein_ids_by_proteome_id_by_level_by_attribute[ + attribute + ].get(level, []) + ) + / ALO.proteome_count + ) + + ALO_cluster_status = ( + "present" + if level + in cluster.implicit_protein_ids_by_proteome_id_by_level_by_attribute[ + attribute + ] + else "absent" + ) + + ALO_cluster_cardinality = None + mwu_pvalue = None + mwu_log2_mean = None + mean_ALO_count = None + mean_non_ALO_count = None + + if ( + ALO_cluster_status == "present" + and cluster.cluster_type_by_attribute[attribute] != "singleton" + ): + ALO_proteome_counts_in_cluster = list( + explicit_protein_count_by_proteome_id_by_level[level].values() + ) + ALO_cluster_cardinality = get_ALO_cluster_cardinality( + ALO_proteome_counts_in_cluster=ALO_proteome_counts_in_cluster, + fuzzy_count=self.inputData.fuzzy_count, + fuzzy_fraction=self.inputData.fuzzy_fraction, + fuzzy_range=self.inputData.fuzzy_range, + ) + + if cluster.cluster_type_by_attribute[attribute] == "shared": + non_ALO_proteome_counts_in_cluster = [ + count + for non_ALO_level in explicit_protein_count_by_proteome_id_by_level + if non_ALO_level != level + for count in explicit_protein_count_by_proteome_id_by_level[ + non_ALO_level + ].values() + ] + mwu_pvalue, mwu_log2_mean, mean_ALO_count, mean_non_ALO_count = ( + statistic( + count_1=ALO_proteome_counts_in_cluster, + count_2=non_ALO_proteome_counts_in_cluster, + test=self.inputData.test, + min_proteomes=self.inputData.min_proteomes, + ) + ) + + ALO.add_cluster( + cluster=cluster, + attribute_cluster_type=cluster.cluster_type_by_attribute[attribute], + ALO_cluster_status=ALO_cluster_status, + ALO_protein_length_stats=protein_length_stats_by_level[level], + ALO_protein_ids_in_cluster=protein_ids_by_level[level], + ALO_cluster_cardinality=ALO_cluster_cardinality, + mwu_pvalue=mwu_pvalue, + mwu_log2_mean=mwu_log2_mean, + mean_ALO_count=mean_ALO_count, + mean_non_ALO_count=mean_non_ALO_count, + ) + + def __process_single_attribute(self, cluster: Cluster, attribute: str) -> None: + """ + Processes a single attribute for a given cluster. + + Retrieves and processes each level associated with the attribute from the ALO + collection, updating various protein and cluster metrics within the cluster object. + + Args: + cluster (Cluster): The cluster to process the attribute for. + attribute (str): The attribute to process. 
+ + Returns: + None + """ + protein_ids_by_level: Dict[str, List[str]] = {} + protein_length_stats_by_level: Dict[str, Dict[str, Union[int, float]]] = {} + explicit_protein_count_by_proteome_id_by_level: Dict[str, Dict[str, int]] = {} + + cluster.protein_counts_of_proteomes_by_level_by_attribute[attribute] = {} + cluster.proteome_coverage_by_level_by_attribute[attribute] = {} + cluster.implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute] = ( + {} + ) + + for level in self.aloCollection.ALO_by_level_by_attribute[attribute]: + self.__process_level( + cluster, + attribute, + level, + protein_ids_by_level, + protein_length_stats_by_level, + explicit_protein_count_by_proteome_id_by_level, + ) + + cluster.cluster_type_by_attribute[attribute] = get_attribute_cluster_type( + cluster.singleton, + cluster.implicit_protein_ids_by_proteome_id_by_level_by_attribute[ + attribute + ], + ) + + self.__update_ALO_data( + cluster, + attribute, + protein_ids_by_level, + protein_length_stats_by_level, + explicit_protein_count_by_proteome_id_by_level, + ) + + def __process_attributes(self, cluster: Cluster) -> None: + """ + Processes all attributes in the ALO collection for a given cluster. + + Iterates through each attribute in the ALO collection and processes it + using the __process_single_attribute method. + + Args: + cluster (Cluster): The cluster to process attributes for. + + Returns: + None + """ + for attribute in self.aloCollection.attributes: + self.__process_single_attribute(cluster, attribute) + + def __finalize_cluster_analysis(self, cluster: Cluster) -> None: + """ + Finalizes the cluster analysis by calculating the median protein count. + + Calculates the median protein count for the given cluster using the protein + counts of proteomes from specific levels and attributes. + + Args: + cluster (Cluster): The cluster for which to finalize the analysis. + + Returns: + None + """ + cluster.protein_median = median( + [ + count + for count in cluster.protein_counts_of_proteomes_by_level_by_attribute[ + "all" + ]["all"] + if count != 0 + ] + ) + + def __analyse_cluster(self, cluster: Cluster) -> None: + """ + Analyzes a cluster by performing various analysis steps. + + Executes the analysis steps for the given cluster: + 1. If an ETE tree is available in aloCollection, analyzes the tree structure. + 2. Processes attributes associated with the cluster. + 3. Finalizes the cluster analysis by calculating median protein counts. + + Args: + cluster (Cluster): The cluster to be analyzed. + + Returns: + None + """ + if self.aloCollection.tree_ete: + self.__analyse_tree_ete(cluster=cluster) + + self.__process_attributes(cluster) + self.__finalize_cluster_analysis(cluster) + + # write output + # 0. __get_header_line + def __get_header_line(self, filetype: str, attribute: str) -> str: + """ + Generates a header line for different types of file formats based on the provided + `filetype` and `attribute`. + + Args: + filetype (str): The type of file for which the header line is generated. Valid values: + - "attribute_metrics": Header line for attribute metrics. + - "cafe": Header line for CAFE analysis. + - "cluster_1to1s_ALO": Header line for cluster 1-to-1 relationships with ALO. + - "cluster_metrics": Header line for cluster metrics. + - "cluster_metrics_ALO": Header line for cluster metrics with ALO. + - "cluster_metrics_domains": Header line for cluster metrics with domains. + - "cluster_metrics_domains_detailed": Header line for detailed cluster metrics with domains. 
+ - "pairwise_representation_test": Header line for pairwise representation test. + + attribute (str): The attribute associated with the cluster, used in certain file types. + + Returns: + str: The generated header line as a tab-separated string. + + Raises: + ValueError: If `filetype` is not recognized. + """ + if filetype == "attribute_metrics": + attribute_metrics_header = [ + "#attribute", + "taxon_set", + "cluster_total_count", + "protein_total_count", + "protein_total_span", + "singleton_cluster_count", + "singleton_protein_count", + "singleton_protein_span", + "specific_cluster_count", + "specific_protein_count", + "specific_protein_span", + "shared_cluster_count", + "shared_protein_count", + "shared_protein_span", + "specific_cluster_true_1to1_count", + "specific_cluster_fuzzy_count", + "shared_cluster_true_1to1_count", + "shared_cluster_fuzzy_count", + "absent_cluster_total_count", + "absent_cluster_singleton_count", + "absent_cluster_specific_count", + "absent_cluster_shared_count", + "TAXON_count", + "TAXON_taxa", + ] + return "\t".join(attribute_metrics_header) + elif filetype == "cafe": + cafe_header = ["#ID"] + cafe_header.extend( + iter(sorted(self.aloCollection.ALO_by_level_by_attribute["taxon"])) + ) + return "\t".join(cafe_header) + elif filetype == "cluster_1to1s_ALO": + cluster_1to1s_ALO_header = [ + "#cluster_id", + "cluster_type", + "1to1_type", + "proteome_count", + "percentage_at_target_count", + ] + return "\t".join(cluster_1to1s_ALO_header) + elif filetype == "cluster_metrics": + cluster_metrics_header = [ + "#cluster_id", + "cluster_protein_count", + "protein_median_count", + "TAXON_count", + "attribute", + "attribute_cluster_type", + "protein_span_mean", + "protein_span_sd", + ] + cluster_metrics_header += [ + f"{level}_count" + for level in sorted( + self.aloCollection.ALO_by_level_by_attribute[attribute] + ) + ] + if attribute != "taxon": + cluster_metrics_header += [ + f"{level}_median" + for level in sorted( + self.aloCollection.ALO_by_level_by_attribute[attribute] + ) + ] + cluster_metrics_header += [ + f"{level}_cov" + for level in sorted( + self.aloCollection.ALO_by_level_by_attribute[attribute] + ) + ] + return "\t".join(cluster_metrics_header) + elif filetype == "cluster_metrics_ALO": + cluster_metrics_ALO_header = [ + "#cluster_id", + "cluster_status", + "cluster_type", + "cluster_protein_count", + "cluster_proteome_count", + "TAXON_protein_count", + "TAXON_mean_count", + "non_taxon_mean_count", + "representation", + "log2_mean(TAXON/others)", + "pvalue(TAXON vs. 
others)", + "TAXON_coverage", + "TAXON_count", + "non_TAXON_count", + "TAXON_taxa", + "non_TAXON_taxa", + ] + # for domain_source in clusterCollection.domain_sources: + # cluster_metrics_ALO_header.append(domain_source) + return "\t".join(cluster_metrics_ALO_header) + elif filetype == "cluster_metrics_domains": + cluster_metrics_domains_header = [ + "#cluster_id", + "cluster_protein_count", + "TAXON_count", + "protein_span_mean", + "protein_span_sd", + "fraction_secreted", + ] + for domain_source in self.clusterCollection.domain_sources: + cluster_metrics_domains_header.extend( + (domain_source, f"{domain_source}_entropy") + ) + return "\t".join(cluster_metrics_domains_header) + elif filetype == "cluster_metrics_domains_detailed": + cluster_metrics_domains_detailed_header = [ + "#cluster_id", + "domain_source", + "domain_id", + "domain_description", + "protein_count", + "protein_count_with_domain", + "TAXA_with_domain_fraction", + "TAXA_with_domain", + "TAXA_without_domain", + ] + return "\t".join(cluster_metrics_domains_detailed_header) + elif filetype == "pairwise_representation_test": + pairwise_representation_test_header = [ + "#cluster_id", + "TAXON_1", + "TAXON_1_mean", + "TAXON_2", + "TAXON_2_mean", + "log2_mean(TAXON_1/TAXON_2)", + "mwu_pvalue(TAXON_1 vs. TAXON_2)", + ] + # pairwise_representation_test_header.append("go_terms") + # for domain_source in clusterCollection.domain_sources: + # pairwise_representation_test_header.append(domain_source) + return "\t".join(pairwise_representation_test_header) + else: + error_msg = f"[ERROR] {filetype} is not a valid header 'filetype'" + raise ValueError(error_msg) + + # 1. plot_cluster_sizes + def __plot_cluster_sizes(self) -> None: + """ + Plot the distribution of cluster sizes based on the protein counts in each cluster. + + Saves the plot as a figure in the directory specified by self.dirs["main"]. + + Returns: + None + + Raises: + ValueError: If self.inputData.plot_format is not a valid file format. + """ + cluster_protein_count = [ + cluster.protein_count for cluster in self.clusterCollection.cluster_list + ] + cluster_protein_counter = Counter(cluster_protein_count) + count_plot_f = os.path.join( + self.dirs["main"], + f"cluster_size_distribution.{self.inputData.plot_format}", + ) + f, ax = plt.subplots(figsize=self.inputData.plotsize) + ax.set_facecolor("white") + x_values = [] + y_values = [] + for value, count in list(cluster_protein_counter.items()): + x_values.append(value) + y_values.append(count) + x_array = np.array(x_values) # type: ignore + y_array = np.array(y_values) + ax.scatter(x_array, y_array, marker="o", alpha=0.8, s=100) # type: ignore + ax.set_xlabel("Cluster size", fontsize=self.inputData.fontsize) + ax.set_ylabel("Count", fontsize=self.inputData.fontsize) + ax.set_yscale("log") + ax.set_xscale("log") + plt.margins(0.8) + plt.gca().set_ylim(bottom=0.8) + plt.gca().set_xlim(left=0.8) + ax.xaxis.set_major_formatter(FormatStrFormatter("%.0f")) + ax.yaxis.set_major_formatter(FormatStrFormatter("%.0f")) + f.tight_layout() + + ax.grid(True, linewidth=1, which="major", color="lightgrey") + ax.grid(True, linewidth=0.5, which="minor", color="lightgrey") + logger.info(f"[STATUS] - Plotting {count_plot_f}") + f.savefig(count_plot_f, format=self.inputData.plot_format) + plt.close() + + # 2. write_cluster_counts_by_taxon + def __write_cluster_counts_by_taxon(self) -> None: + """ + Write cluster counts by taxon attribute to a text file. 
+ + This method iterates through attributes in self.aloCollection.attributes, + retrieves protein counts by level for clusters in self.clusterCollection.cluster_list + that match the attribute "taxon", and writes the data to a text file named + 'cluster_counts_by_taxon.txt' in the directory specified by self.dirs["main"]. + + Raises: + ValueError: If the header type 'cafe' is not recognized. + """ + cafe_f = os.path.join(self.dirs["main"], "cluster_counts_by_taxon.txt") + for attribute in self.aloCollection.attributes: + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + cafe_output = [] + for cluster in self.clusterCollection.cluster_list: + if attribute == "taxon": + cafe_line = f"{cluster.cluster_id}" + # cafe_line.append("None") + for _level in levels: + total_proteins = sum( + cluster.protein_counts_of_proteomes_by_level_by_attribute[ + attribute + ][_level] + ) + cafe_line += f"\t{total_proteins}" + cafe_output.append(cafe_line) + if cafe_output: + with open(cafe_f, "w") as cafe_fh: + logger.info(f"[STATUS] - Writing {cafe_f}") + cafe_output.sort() + cafe_output.insert(0, self.__get_header_line("cafe", "taxon")) + cafe_fh.write("\n".join(cafe_output) + "\n") + cafe_output = [] + + # 3. write_cluster_metrics_domains + def __write_cluster_metrics_domains(self) -> None: + """ + Write cluster metrics to a file 'cluster_metrics_domains.txt'. + + This method constructs and writes cluster metrics data to a text file, + including cluster IDs, protein counts, taxon counts, domain statistics, + and entropy for each domain source present in the cluster collection. + + Raises: + IOError: If there is an issue writing to the output file. + + """ + cluster_metrics_domains_f = os.path.join( + self.dirs["main"], "cluster_metrics_domains.txt" + ) + header = self.__get_header_line("cluster_metrics_domains", "taxon").split("\t") + cluster_metrics_domains_output = [] + + if self.clusterCollection.functional_annotation_parsed: + for cluster in self.clusterCollection.cluster_list: + line_parts = { + "#cluster_id": cluster.cluster_id, + "cluster_protein_count": str(cluster.protein_count), + "TAXON_count": str(cluster.proteome_count), + "protein_span_mean": "N/A", + "protein_span_sd": "N/A", + "fraction_secreted": "N/A", + } + + if ( + self.clusterCollection.fastas_parsed + and cluster.protein_length_stats + ): + line_parts["protein_span_mean"] = str( + cluster.protein_length_stats["mean"] + ) + line_parts["protein_span_sd"] = str( + cluster.protein_length_stats["sd"] + ) + + if "SignalP_EUK" in self.clusterCollection.domain_sources: + line_parts["fraction_secreted"] = "{0:.2f}".format( + cluster.secreted_cluster_coverage + ) + + for domain_source in self.clusterCollection.domain_sources: + if domain_source in cluster.domain_counter_by_domain_source: + sorted_counts = sorted( + [ + f"{domain_id}:{count}" + for domain_id, count in cluster.domain_counter_by_domain_source[ + domain_source + ].most_common() + ], + key=lambda x: (x.split(":")[-1], x.split(":")[-2]), + ) + line_parts[domain_source] = ";".join(sorted_counts) + line_parts[f"{domain_source}_entropy"] = "{0:.3f}".format( + cluster.domain_entropy_by_domain_source[domain_source] + ) + else: + line_parts[domain_source] = "N/A" + line_parts[f"{domain_source}_entropy"] = "N/A" + + # Ensure we're following the correct order from the header + ordered_line = [line_parts.get(col, "N/A") for col in header] + cluster_metrics_domains_output.append("\t".join(ordered_line)) + + if cluster_metrics_domains_output: + with 
open(cluster_metrics_domains_f, "w") as cluster_metrics_domains_fh: + logger.info(f"[STATUS] - Writing {cluster_metrics_domains_f}") + cluster_metrics_domains_output.sort() + cluster_metrics_domains_output.insert(0, "\t".join(header)) + cluster_metrics_domains_fh.write( + "\n".join(cluster_metrics_domains_output) + "\n" + ) + + # 4. write_cluster_metrics_domains_detailed + def __count_proteins_with_domain( + self, cluster: Cluster, domain_source: str, domain_id: str + ) -> Tuple[Dict[str, int], Dict[str, int]]: + """ + Count proteins with and without a specific domain in each proteome of a cluster. + + Args: + cluster (Cluster): The cluster object containing proteins to be analyzed. + domain_source (str): The source of the domain to be counted (e.g., "Pfam", "InterPro"). + domain_id (str): The ID of the specific domain to be counted. + + Returns: + Tuple[Dict[str, int], Dict[str, int]]: A tuple containing: + - A dictionary where keys are proteome IDs and values are counts of proteins + in the proteome that have the specified domain (`with_domain`). + - A dictionary where keys are proteome IDs and values are counts of proteins + in the proteome that do not have the specified domain (`without_domain`). + + """ + with_domain = defaultdict(int) + without_domain = defaultdict(int) + + for proteome_id, protein_ids in cluster.protein_ids_by_proteome_id.items(): + for protein_id in protein_ids: + protein = self.proteinCollection.proteins_by_protein_id[protein_id] + if ( + domain_source in protein.domain_counter_by_domain_source + and domain_id + in protein.domain_counter_by_domain_source[domain_source] + ): + with_domain[proteome_id] += 1 + else: + without_domain[proteome_id] += 1 + + return with_domain, without_domain + + def __format_proteome_counts( + self, count_dict: Dict[str, int], cluster: Cluster + ) -> str: + """ + Format proteome counts into a string representation. + + Args: + count_dict (Dict[str, int]): A dictionary where keys are proteome IDs and values are counts. + cluster (Cluster): The cluster object associated with the counts. + + Returns: + str: A string representation of proteome counts formatted as "proteome_id:count/total" + for each proteome ID in sorted order, separated by commas. If count_dict is empty, + returns "N/A". + + """ + return ( + ",".join( + f"{proteome_id}:{count}/{len(cluster.protein_ids_by_proteome_id[proteome_id])}" + for proteome_id, count in sorted(count_dict.items()) + ) + or "N/A" + ) + + def __get_domain_description(self, domain_source: str, domain_id: str) -> str: + """ + Get the description of a domain based on its source and ID. + + Args: + domain_source (str): The source of the domain (e.g., "SignalP_EUK", "Pfam"). + domain_id (str): The ID of the domain whose description is to be retrieved. + + Returns: + str: The description of the domain if found in `self.proteinCollection.domain_desc_by_id_by_source`, + otherwise returns "N/A". + + """ + if domain_source == "SignalP_EUK": + return domain_id + return self.proteinCollection.domain_desc_by_id_by_source.get( + domain_source, {} + ).get(domain_id, "N/A") + + def __process_cluster_domains( + self, cluster: Cluster, output_by_domain_source: Dict[str, List[str]] + ) -> None: + """ + Process domain statistics for a cluster and populate the output dictionary. + + Args: + cluster (Cluster): The cluster object containing domain statistics to process. 
+ output_by_domain_source (Dict[str, List[str]]): A dictionary where keys are domain sources + and values are lists of output lines to be populated with processed domain statistics. + + Returns: + None + + """ + for ( + domain_source, + domain_counter, + ) in cluster.domain_counter_by_domain_source.items(): + for domain_id, count in domain_counter.most_common(): + with_domain, without_domain = self.__count_proteins_with_domain( + cluster, domain_source, domain_id + ) + proteome_count_with_domain = sum( + count > 0 for count in with_domain.values() + ) + + with_domain_str = self.__format_proteome_counts(with_domain, cluster) + without_domain_str = self.__format_proteome_counts( + without_domain, cluster + ) + + domain_description = self.__get_domain_description( + domain_source, domain_id + ) + + output_line = ( + f"{cluster.cluster_id}\t{domain_source}\t{domain_id}\t" + f"{domain_description}\t{cluster.protein_count}\t" + f"{sum(with_domain.values())}\t" + f"{proteome_count_with_domain / cluster.proteome_count:.3f}\t" + f"{with_domain_str}\t{without_domain_str}" + ) + + output_by_domain_source[domain_source].append(output_line) + + def __write_domain_outputs( + self, + output_by_domain_source: Dict[str, List[str]], + output_files: Dict[str, str], + ) -> None: + """ + Write domain outputs to respective output files. + + Args: + output_by_domain_source (Dict[str, List[str]]): A dictionary where keys are domain sources + and values are lists of output lines to be written to output files. + output_files (Dict[str, str]): A dictionary where keys are domain sources and values are + corresponding output file paths. + + Returns: + None + + """ + for domain_source, output_lines in output_by_domain_source.items(): + if len(output_lines) > 1: + output_file = output_files[domain_source] + logger.info(f"[STATUS] - Writing {output_file}") + with open(output_file, "w") as fh: + fh.write("\n".join(output_lines) + "\n") + + def __write_cluster_metrics_domains_detailed(self) -> None: + """ + Write detailed cluster metrics for domain annotations to respective output files. + + This method constructs detailed cluster metrics for domain annotations and writes + them to individual output files for each domain source specified in the cluster + collection. + + Returns: + None + """ + output_by_domain_source: Dict[str, List[str]] = { + source: [] for source in self.clusterCollection.domain_sources + } + + output_files: Dict[str, str] = { + source: os.path.join( + self.dirs["main"], f"cluster_domain_annotation.{source}.txt" + ) + for source in self.clusterCollection.domain_sources + } + + if self.clusterCollection.functional_annotation_parsed: + for cluster in self.clusterCollection.cluster_list: + self.__process_cluster_domains(cluster, output_by_domain_source) + + self.__write_domain_outputs(output_by_domain_source, output_files) + + # 5. write attribute metrics + def __get_attribute_metrics(self, ALO: AttributeLevel) -> str: + """ + Retrieve attribute metrics as a formatted string. + + Args: + ALO (AttributeLevel): An instance of AttributeLevel containing the attribute metrics. + + Returns: + str: A tab-separated string containing various attribute metrics: + - Attribute name + - Attribute level + - Cluster counts and protein counts/span for different cluster types and statuses. + - Proteome count and other relevant metrics. 
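+
+            The fields are joined with tabs in the same order as the
+            "attribute_metrics" header built by __get_header_line, e.g.
+            (illustrative values only): "all\tall\t8000\t120000\t...".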
+ + """ + attribute_metrics = [ + ALO.attribute, + ALO.level, + ALO.get_cluster_count_by_cluster_status_by_cluster_type("present", "total"), + ALO.get_protein_count_by_cluster_type("total"), + ALO.get_protein_span_by_cluster_type("total"), + ALO.get_cluster_count_by_cluster_status_by_cluster_type( + "present", "singleton" + ), + ALO.get_protein_count_by_cluster_type("singleton"), + ALO.get_protein_span_by_cluster_type("singleton"), + ALO.get_cluster_count_by_cluster_status_by_cluster_type( + "present", "specific" + ), + ALO.get_protein_count_by_cluster_type("specific"), + ALO.get_protein_span_by_cluster_type("specific"), + ALO.get_cluster_count_by_cluster_status_by_cluster_type( + "present", "shared" + ), + ALO.get_protein_count_by_cluster_type("shared"), + ALO.get_protein_span_by_cluster_type("shared"), + ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type( + "specific", "true" + ), + ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type( + "specific", "fuzzy" + ), + ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type( + "shared", "true" + ), + ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type( + "shared", "fuzzy" + ), + ALO.get_cluster_count_by_cluster_status_by_cluster_type("absent", "total"), + ALO.get_cluster_count_by_cluster_status_by_cluster_type( + "absent", "singleton" + ), + ALO.get_cluster_count_by_cluster_status_by_cluster_type( + "absent", "specific" + ), + ALO.get_cluster_count_by_cluster_status_by_cluster_type("absent", "shared"), + ALO.proteome_count, + ALO.get_proteomes(), + ] + + return "\t".join(map(str, attribute_metrics)) + + def __write_attribute_metrics(self) -> None: + """ + Write attribute metrics for each attribute to respective output files. + + This method iterates over each attribute in self.aloCollection.attributes, + retrieves attribute metrics for each level of the attribute, and writes them + to individual output files named after the attribute. + + Returns: + None + + """ + for attribute in self.aloCollection.attributes: + attribute_metrics_f = os.path.join( + self.dirs[attribute], f"{attribute}.attribute_metrics.txt" + ) + attribute_metrics_output = [] + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + for level in levels: + if ALO := self.aloCollection.ALO_by_level_by_attribute[attribute][ + level + ]: + attribute_metrics_output.append(self.__get_attribute_metrics(ALO)) + + if attribute_metrics_output: + with open(attribute_metrics_f, "w") as attribute_metrics_fh: + logger.info(f"[STATUS] - Writing {attribute_metrics_f}") + attribute_metrics_output.sort() + header_line = self.__get_header_line("attribute_metrics", attribute) + attribute_metrics_output.insert(0, header_line) + attribute_metrics_fh.write( + "\n".join(attribute_metrics_output) + "\n" + ) + + # 6. write cluster summary + def __write_cluster_summary(self) -> None: + """ + Write cluster summary metrics for each attribute to respective output files. + + This method iterates over each attribute in self.aloCollection.attributes, + retrieves cluster summary metrics for each cluster in self.clusterCollection.cluster_list, + and writes them to individual output files named after the attribute. 
+ + Returns: + None + + """ + for attribute in self.aloCollection.attributes: + cluster_metrics_f = os.path.join( + self.dirs[attribute], f"{attribute}.cluster_summary.txt" + ) + + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + cluster_metrics_output = [] + for cluster in self.clusterCollection.cluster_list: + cluster_metrics_line = [ + str(cluster.cluster_id), + str(cluster.protein_count), + str(cluster.protein_median), + str(cluster.proteome_count), + str(attribute), + str(cluster.cluster_type_by_attribute[attribute]), + ] + if ( + self.clusterCollection.fastas_parsed + and cluster.protein_length_stats + ): + cluster_metrics_line.extend( + [ + str(cluster.protein_length_stats.get("mean", "N/A")), + str(cluster.protein_length_stats.get("sd", "N/A")), + ] + ) + else: + cluster_metrics_line.extend(["N/A", "N/A"]) + + cluster_metrics_line.extend( + str( + sum( + cluster.protein_counts_of_proteomes_by_level_by_attribute[ + attribute + ][_level] + ) + ) + for _level in levels + ) + + if attribute != "taxon": + cluster_metrics_line.extend( + [ + str( + median( + cluster.protein_counts_of_proteomes_by_level_by_attribute[ + attribute + ][ + _level + ] + ) + ) + for _level in levels + ] + ) + cluster_metrics_line.extend( + [ + "{0:.2f}".format( + cluster.proteome_coverage_by_level_by_attribute[ + attribute + ][_level] + ) + for _level in levels + ] + ) + + cluster_metrics_output.append("\t".join(cluster_metrics_line)) + + if cluster_metrics_output: + with open(cluster_metrics_f, "w") as cluster_metrics_fh: + logger.info(f"[STATUS] - Writing {cluster_metrics_f}") + cluster_metrics_output.sort() + header_line = self.__get_header_line("cluster_metrics", attribute) + cluster_metrics_output.insert(0, header_line) + cluster_metrics_fh.write("\n".join(cluster_metrics_output) + "\n") + cluster_metrics_output = [] + + # 7. Write cluster ALO metrics + def __get_enrichment_data(self, ALO: AttributeLevel, cluster: Cluster) -> List[str]: + """ + Retrieve enrichment data for a given AttributeLevel and Cluster. + + Args: + ALO (AttributeLevel): An instance of AttributeLevel containing enrichment data. + cluster (Cluster): An instance of Cluster for which enrichment data is retrieved. + + Returns: + List[str]: A list containing enrichment data: + - Enrichment status ("enriched", "depleted", "equal" or "N/A" if unavailable) + - Log2 mean value + - p-value + + """ + if ( + ALO + and ALO.cluster_type_by_cluster_id[cluster.cluster_id] == "shared" + and ALO.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id] + ): + log2_mean = ALO.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id] + enrichment = ( + "enriched" + if log2_mean > 0 + else "depleted" if log2_mean < 0 else "equal" + ) + return [ + enrichment, + f"{log2_mean}", + f"{ALO.cluster_mwu_pvalue_by_cluster_id[cluster.cluster_id]}", + ] + return ["N/A", "N/A", "N/A"] + + def __get_proteome_data(self, ALO: AttributeLevel, cluster: Cluster) -> List[str]: + """ + Retrieve proteome data for a given AttributeLevel and Cluster. + + Args: + ALO (AttributeLevel): An instance of AttributeLevel containing proteome data. + cluster (Cluster): An instance of Cluster for which proteome data is retrieved. 
+ + Returns: + List[str]: A list containing proteome data: + - Number of proteomes present in both ALO and cluster + - Number of proteomes present only in cluster + - Sorted list of proteome IDs present in both ALO and cluster, or "N/A" if none + - Sorted list of proteome IDs present only in cluster, or "N/A" if none + + """ + ALO_proteomes_present = cluster.proteome_ids.intersection( + ALO.proteomes if ALO else set() + ) + non_ALO_proteomes_present = cluster.proteome_ids.difference( + ALO.proteomes if ALO else set() + ) + return [ + f"{len(ALO_proteomes_present)}", + f"{len(non_ALO_proteomes_present)}", + ( + f"{','.join(sorted(list(ALO_proteomes_present)))}" + if ALO_proteomes_present + else "N/A" + ), + ( + f"{','.join(sorted(list(non_ALO_proteomes_present)))}" + if non_ALO_proteomes_present + else "N/A" + ), + ] + + def __write_cluster_metrics_ALO(self) -> None: + """ + Write cluster metrics for each attribute level object (ALO) to separate files. + + For each attribute in self.aloCollection.attributes, this method writes cluster metrics + to a file named '{attribute}.{level}.cluster_metrics.txt' in the corresponding directory + under self.dirs[attribute]. + + Metrics include cluster ID, status, type, protein count, proteome count, counts by level, + mean ALO counts, mean non-ALO counts, enrichment data, and proteome coverage. + + Returns: + None + """ + for attribute in self.aloCollection.attributes: + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + + for level in levels: + ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][level] + cluster_metrics_ALO_f = os.path.join( + self.dirs[attribute], f"{attribute}.{level}.cluster_metrics.txt" + ) + if ALO is None: + continue + cluster_metrics_ALO_output = [ + "\t".join( + [ + f"{cluster.cluster_id}", + ( + f"{ALO.cluster_status_by_cluster_id[cluster.cluster_id]}" + if ALO + else "N/A" + ), + ( + f"{ALO.cluster_type_by_cluster_id[cluster.cluster_id]}" + if ALO + else "N/A" + ), + f"{cluster.protein_count}", + f"{cluster.proteome_count}", + f"{sum(cluster.protein_counts_of_proteomes_by_level_by_attribute[attribute][level])}", + ( + f"{ALO.cluster_mean_ALO_count_by_cluster_id[cluster.cluster_id]}" + if ALO + and ALO.cluster_mean_ALO_count_by_cluster_id[ + cluster.cluster_id + ] + else "N/A" + ), + ( + f"{ALO.cluster_mean_non_ALO_count_by_cluster_id[cluster.cluster_id]}" + if ALO + and ALO.cluster_mean_non_ALO_count_by_cluster_id[ + cluster.cluster_id + ] + else "N/A" + ), + *self.__get_enrichment_data(ALO, cluster), + "{0:.2f}".format( + cluster.proteome_coverage_by_level_by_attribute[ + attribute + ][level] + ), + *self.__get_proteome_data(ALO, cluster), + ] + ) + for cluster in self.clusterCollection.cluster_list + ] + if cluster_metrics_ALO_output: + with open(cluster_metrics_ALO_f, "w") as cluster_metrics_ALO_fh: + logger.info(f"[STATUS] - Writing {cluster_metrics_ALO_f}") + cluster_metrics_ALO_output.sort() + + header_line = self.__get_header_line( + "cluster_metrics_ALO", attribute + ) + cluster_metrics_ALO_output.insert(0, header_line) + cluster_metrics_ALO_fh.write( + "\n".join(cluster_metrics_ALO_output) + "\n" + ) + + # 8. write cluster 1to1 ALO + def __write_cluster_1to1_ALO(self) -> None: + """ + Write cluster 1-to-1 relationships for each attribute level object (ALO) to separate files. 
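+
+        Illustrative output line (tab-separated; IDs and values are made up):
+
+            OG0000123   shared  fuzzy   10  0.80
+
+        where the last column is the fraction of proteomes at the target
+        ("fuzzy") protein count.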
+ + For each attribute in self.aloCollection.attributes, this method writes cluster 1-to-1 + relationships to a file named '{attribute}.{level}.cluster_1to1s.txt' in the corresponding + directory under self.dirs[attribute]. + + Relationships include cluster ID, type, cardinality, proteome count, and fuzzy count ratio. + + Returns: + None + """ + for attribute in self.aloCollection.attributes: + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + for level in levels: + cluster_1to1_ALO_f = os.path.join( + self.dirs[attribute], f"{attribute}.{level}.cluster_1to1s.txt" + ) + cluster_1to1_ALO_output = [] + + ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][level] + + if attribute != "taxon" and ALO: + for ( + cluster_type + ) in ALO.clusters_by_cluster_cardinality_by_cluster_type: + for ( + cluster_cardinality + ) in ALO.clusters_by_cluster_cardinality_by_cluster_type[ + cluster_type + ]: + for ( + cluster_id + ) in ALO.clusters_by_cluster_cardinality_by_cluster_type[ + cluster_type + ][ + cluster_cardinality + ]: + cluster = ( + self.clusterCollection.cluster_list_by_cluster_id[ + cluster_id + ] + ) + protein_count_by_proteome = ( + cluster.protein_count_by_proteome_id + ) + proteome_count = cluster.proteome_count + + fuzzy_proteome_ratio = ( + len( + [ + protein_count + for _, protein_count in protein_count_by_proteome.items() + if protein_count + == self.inputData.fuzzy_count + ] + ) + / proteome_count + ) + + cluster_1to1_ALO_line = "\t".join( + [ + str(cluster_id), + str(cluster_type), + str(cluster_cardinality), + str(proteome_count), + "{0:.2f}".format(fuzzy_proteome_ratio), + ] + ) + + cluster_1to1_ALO_output.append(cluster_1to1_ALO_line) + + if cluster_1to1_ALO_output: + with open(cluster_1to1_ALO_f, "w") as cluster_1to1_ALO_fh: + logger.info(f"[STATUS] - Writing {cluster_1to1_ALO_f}") + cluster_1to1_ALO_output.sort() + header_line = self.__get_header_line( + "cluster_1to1s_ALO", attribute + ) + cluster_1to1_ALO_output.insert(0, header_line) + cluster_1to1_ALO_fh.write( + "\n".join(cluster_1to1_ALO_output) + "\n" + ) + cluster_1to1_ALO_output = [] + + # 9. write_pairwise_representation + def __process_background_representation( + self, + attribute: str, + level: str, + ALO: AttributeLevel, + cluster: Cluster, + background_representation_test_by_pair_by_attribute, + ) -> None: + """ + Process and append background representation test results for a cluster and attribute level. + + Args: + attribute (str): The attribute name. + level (str): The attribute level. + ALO (AttributeLevel): The AttributeLevel object for the attribute and level. + cluster (Cluster): The Cluster object representing the cluster. + background_representation_test_by_pair_by_attribute (Dict[str, Dict[str, Any]]): + A nested dictionary to store background representation test results, + structured as [attribute][background_pair] = list of test results. 
+ + Returns: + None + """ + background_pair = (level, "background") + if attribute not in background_representation_test_by_pair_by_attribute: + background_representation_test_by_pair_by_attribute[attribute] = {} + if ( + background_pair + not in background_representation_test_by_pair_by_attribute[attribute] + ): + background_representation_test_by_pair_by_attribute[attribute][ + background_pair + ] = [] + + background_representation_test = [ + cluster.cluster_id, + level, + "background", + ALO.cluster_mean_ALO_count_by_cluster_id[cluster.cluster_id], + ALO.cluster_mean_non_ALO_count_by_cluster_id[cluster.cluster_id], + ALO.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id], + ALO.cluster_mwu_pvalue_by_cluster_id[cluster.cluster_id], + ] + background_representation_test_by_pair_by_attribute[attribute][ + background_pair + ].append(background_representation_test) + + def __get_pairwise_representation_test( + self, + cluster: Cluster, + attribute: str, + level: str, + levels_seen: Set[str], + levels: List[str], + ) -> Generator[List[Any], None, None]: + """ + Generate pairwise representation test results for a cluster and attribute level. + + Args: + cluster (Cluster): The Cluster object representing the cluster. + attribute (str): The attribute name. + level (str): The current attribute level. + levels_seen (Set[str]): A set of attribute levels already processed. + levels (List[str]): A list of all attribute levels. + + Yields: + Generator[List[Any], None, None]: A generator yielding lists containing pairwise representation test results. + Each list includes: + - cluster.cluster_id: ID of the cluster. + - level: Current attribute level. + - other_level: Another attribute level being compared with `level`. + - mean_ALO_count: Mean count of ALOs in the cluster at `level`. + - mean_non_ALO_count: Mean count of non-ALOs in the cluster at `level`. + - mwu_log2_mean: Log2 mean of the Mann-Whitney U test results between `level` and `other_level`. + - mwu_pvalue: P-value of the Mann-Whitney U test results between `level` and `other_level`. 
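+
+            An illustrative yielded row (all values made up):
+            ["OG0000001", "levelA", "levelB", 2.0, 1.0, 1.0, 0.03]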
+ """ + for other_level in set(levels).difference(levels_seen): + if other_level != level: + other_ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][ + other_level + ] + if ( + other_ALO + and len(cluster.proteome_ids.intersection(other_ALO.proteomes)) >= 2 + ): + protein_counts_level = [ + count + for count in cluster.protein_counts_of_proteomes_by_level_by_attribute[ + attribute + ][ + level + ] + if count > 0 + ] + protein_counts_other_level = [ + count + for count in cluster.protein_counts_of_proteomes_by_level_by_attribute[ + attribute + ][ + other_level + ] + if count > 0 + ] + if protein_counts_level and protein_counts_other_level: + ( + mwu_pvalue, + mwu_log2_mean, + mean_ALO_count, + mean_non_ALO_count, + ) = statistic( + protein_counts_level, + protein_counts_other_level, + self.inputData.test, + self.inputData.min_proteomes, + ) + yield [ + cluster.cluster_id, + level, + other_level, + mean_ALO_count, + mean_non_ALO_count, + mwu_log2_mean, + mwu_pvalue, + ] + # pvalue = None + # try: + # pvalue = scipy.stats.mannwhitneyu(protein_counts_level, protein_counts_other_level, alternative="two-sided")[1] + # except: + # pvalue = 1.0 + # mean_level = mean(protein_counts_level) + # mean_other_level = mean(protein_counts_other_level) + # log2fc_mean = log((mean_level/mean_other_level), 2) + # yield [cluster.cluster_id, level, other_level, mean_level, + # mean_other_level, log2fc_mean, pvalue] + + def __process_pairwise_representation( + self, + attribute: str, + level: str, + levels_seen: Set[str], + levels: List[str], + cluster: Cluster, + pairwise_representation_test_by_pair_by_attribute, + pairwise_representation_test_output: List[str], + ) -> None: + """ + Process pairwise representation tests for a specific attribute level and cluster. + + Args: + attribute (str): The attribute name. + level (str): The current attribute level. + levels_seen (Set[str]): A set of attribute levels already processed. + levels (List[str]): A list of all attribute levels. + cluster (Cluster): The Cluster object representing the cluster. + pairwise_representation_test_by_pair_by_attribute (Dict[str, Dict[Tuple[str, str], List[List[Any]]]]): + Dictionary storing pairwise representation test results by attribute and pair of levels. + pairwise_representation_test_output (List[str]): List to store formatted output lines of pairwise tests. + + Returns: + None + """ + for result in self.__get_pairwise_representation_test( + cluster, attribute, level, levels_seen, levels + ): + if attribute not in pairwise_representation_test_by_pair_by_attribute: + pairwise_representation_test_by_pair_by_attribute[attribute] = {} + pair = (result[1], result[2]) + if pair not in pairwise_representation_test_by_pair_by_attribute[attribute]: + pairwise_representation_test_by_pair_by_attribute[attribute][pair] = [] + pairwise_representation_test_by_pair_by_attribute[attribute][pair].append( + result + ) + + pairwise_representation_test_output.append( + f"{result[0]}\t{result[1]}\t{result[3]}\t{result[2]}\t{result[4]}\t{result[5]}\t{result[6]}" + ) + + # 9.5 __plot_count_comparisons_volcano + def __prepare_data(self, pair_data: List[str]) -> Tuple[List[float], List[float]]: + """ + Prepare data from pair_data into lists of p-values and log2 fold change (log2fc) values. + + Args: + pair_data (List[str]): List of strings containing data for each pair. + + Returns: + Tuple[List[float], List[float]]: Tuple containing: + - List[float]: p-values extracted from pair_data. 
+ - List[float]: log2 fold change (log2fc) values extracted from pair_data. + """ + pair_data_count = len(pair_data) + p_values: List[float] = [] + log2fc_values: List[float] = [] + + for data in pair_data: + log2fc_values.append(float(data[5])) + pvalue = data[6] if data[6] != 0.0 else 0.01 / (pair_data_count + 1) + p_values.append(float(pvalue)) + + return p_values, log2fc_values + + def __get_output_filename(self, attribute: str, pair_list: List[str]) -> str: + """ + Generate an output filename based on attribute, pair_list, and plot_format. + + Args: + attribute (str): Attribute name used in the filename. + pair_list (List[str]): List of strings used to form part of the filename. + + Returns: + str: Generated output filename. + """ + return os.path.join( + self.dirs[attribute], + f"{attribute}.pairwise_representation_test.{'_'.join(pair_list)}.{self.inputData.plot_format}", + ) + + def __create_volcano_plot( + self, + p_values: List[float], + log2fc_values: List[float], + pair_list: List[str], + output_file: str, + ) -> None: + """ + Create a volcano plot to visualize differential expression analysis results. + + Parameters: + - p_values (List[float]): List of p-values for each comparison. + - log2fc_values (List[float]): List of log2 fold change values for each comparison. + - pair_list (List[str]): List of pairs or labels corresponding to each comparison. + - output_file (str): Filepath where the plot will be saved. + + Returns: + - None + """ + plt.figure(1, figsize=self.inputData.plotsize) + + axScatter, axHistx = self.__setup_plot_axes() + + p_array = np.array(p_values) + log2fc_array = np.array(log2fc_values) + + log2fc_percentile = self.__plot_data(axScatter, axHistx, log2fc_array, p_array) + self.__set_plot_properties( + axScatter, axHistx, log2fc_array, p_array, pair_list, log2fc_percentile + ) + + logger.info(f"[STATUS] - Plotting {output_file}") + plt.savefig(output_file, format=self.inputData.plot_format) + plt.close() + + def __setup_plot_axes(self) -> Tuple[Any, Any]: + """ + Set up the axes for a combined scatter plot and histogram. + + Returns: + - Tuple of matplotlib.axes.Axes: Tuple containing the scatter plot axes (`axScatter`) + and the histogram axes (`axHistx`). + """ + left, width = 0.1, 0.65 + bottom, height = 0.1, 0.65 + bottom_h = left + width + 0.02 + rect_scatter = (left, bottom, width, height) + rect_histx = (left, bottom_h, width, 0.2) + + axScatter = plt.axes(rect_scatter) + axScatter.set_facecolor("white") + axHistx = plt.axes(rect_histx) + axHistx.set_facecolor("white") + axHistx.xaxis.set_major_formatter(NullFormatter()) + axHistx.yaxis.set_major_formatter(NullFormatter()) + + return axScatter, axHistx + + def __plot_data( + self, + axScatter: Any, + axHistx: Any, + log2fc_array: np.ndarray, + p_array: np.ndarray, + ) -> Any: + """ + Plot data on scatter and histogram axes. + + Parameters: + - axScatter (Any): Axes for the scatter plot. + - axHistx (Any): Axes for the histogram plot. + - log2fc_array (np.ndarray): Array of log2 fold change values. + - p_array (np.ndarray): Array of p-values. + + Returns: + - float: 95th percentile of log2 fold change values. 
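+
+        Note: the dashed reference lines added to the scatter are p = 0.05 and
+        p = 0.01 (horizontal) and |log2FC| = 1 plus the 95th percentile of the
+        log2FC values, mirrored around zero (vertical).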
+ """ + # Plot histogram + binwidth = 0.05 + xymax = np.max(np.fabs(log2fc_array)) # type: ignore + lim = (int(xymax / binwidth) + 1) * binwidth + bins = np.arange(-lim, lim + binwidth, binwidth) + axHistx.hist( + log2fc_array, bins=bins, histtype="stepfilled", color="grey", align="mid" + ) + + # Plot scatter + axScatter.scatter( + log2fc_array, p_array, alpha=0.8, edgecolors="none", s=25, c="grey" + ) + + # Add reference lines + ooFive, ooOne = 0.05, 0.01 + log2fc_percentile = np.percentile(log2fc_array, 95) + + axScatter.axhline(y=ooFive, linewidth=2, color="orange", linestyle="--") + axScatter.axhline(y=ooOne, linewidth=2, color="red", linestyle="--") + axScatter.axvline(x=1.0, linewidth=2, color="purple", linestyle="--") + axScatter.axvline( + x=log2fc_percentile, linewidth=2, color="blue", linestyle="--" + ) + axScatter.axvline(x=-1.0, linewidth=2, color="purple", linestyle="--") + axScatter.axvline( + x=-log2fc_percentile, linewidth=2, color="blue", linestyle="--" + ) + + return log2fc_percentile + + def __set_plot_properties( + self, + axScatter: Any, + axHistx: Any, + log2fc_array: np.ndarray, + p_array: np.ndarray, + pair_list: List[str], + log2fc_percentile: Any, + ) -> None: + """ + Set properties and customize the appearance of the volcano plot. + + Parameters: + - axScatter (Any): Axes for the scatter plot. + - axHistx (Any): Axes for the histogram plot. + - log2fc_array (np.ndarray): Array of log2 fold change values. + - p_array (np.ndarray): Array of p-values. + - pair_list (List[str]): List of pairs or labels corresponding to each comparison. + - log2fc_percentile (Any): 95th percentile of log2 fold change values. + + Returns: + - None + """ + # Set axis limits and properties + x_min = -max(abs(np.min(log2fc_array)), abs(np.max(log2fc_array))) + x_max = -x_min + axScatter.set_xlim(x_min - 1, x_max + 1) + axScatter.grid(True, linewidth=1, which="major", color="lightgrey") + axScatter.grid(True, linewidth=0.5, which="minor", color="lightgrey") + axScatter.set_ylim(1.1, np.min(p_array) * 0.1) + axScatter.set_xlabel( + f"log2(mean({pair_list[0]})/mean({pair_list[1]}))", + fontsize=self.inputData.fontsize, + ) + axScatter.set_ylabel("p-value", fontsize=self.inputData.fontsize) + axScatter.set_yscale("log") + axHistx.set_xlim(axScatter.get_xlim()) + + # Add legend + legend_elements = [ + Line2D([0], [0], color="orange", linestyle="--", label="p-value = 0.05"), + Line2D([0], [0], color="red", linestyle="--", label="p-value = 0.01"), + Line2D([0], [0], color="purple", linestyle="--", label="|log2FC| = 1"), + Line2D( + [0], + [0], + color="blue", + linestyle="--", + label=f"|log2FC-95%ile| = {log2fc_percentile:.2f}", + ), + ] + legend = axScatter.legend( + handles=legend_elements, fontsize=self.inputData.fontsize, frameon=True + ) + legend.get_frame().set_facecolor("white") + + def __plot_count_comparisons_volcano( + self, + pairwise_representation_test_by_pair_by_attribute, + ) -> None: + """ + Generate volcano plots for count comparisons based on pairwise representation test results. + + Parameters: + - pairwise_representation_test_by_pair_by_attribute (Dict[str, Dict[Tuple[str, str], Any]]): + Dictionary containing test results organized by attribute and pair. 
+ + Returns: + - None + """ + for attribute in pairwise_representation_test_by_pair_by_attribute: + for pair in pairwise_representation_test_by_pair_by_attribute[attribute]: + pair_list = list(pair) + pair_data = pairwise_representation_test_by_pair_by_attribute[ + attribute + ][pair] + + p_values, log2fc_values = self.__prepare_data(pair_data) + + if p_values: + output_file = self.__get_output_filename(attribute, pair_list) + self.__create_volcano_plot( + p_values, log2fc_values, pair_list, output_file + ) + + def __write_pairwise_representation(self) -> None: + """ + Process pairwise representation tests, write results, and generate volcano plots. + + Iterates through attributes in `self.aloCollection.attributes` and performs the + following steps for each attribute: + 1. Initializes dictionaries `pairwise_representation_test_by_pair_by_attribute` + and `background_representation_test_by_pair_by_attribute`. + 2. Prepares output file path (`pairwise_representation_test_f`) and header line + (`pairwise_representation_test_output`) for pairwise representation test results. + 3. Retrieves sorted levels from `self.aloCollection.ALO_by_level_by_attribute[attribute]`. + 4. Iterates through each level and processes pairwise and background representation + tests for each cluster in `self.clusterCollection.cluster_list`. + 5. Generates volcano plots using `__plot_count_comparisons_volcano` for + `background_representation_test_by_pair_by_attribute` if available. + 6. Writes pairwise representation test results to `pairwise_representation_test_f` + if data is available. + 7. Generates volcano plots using `__plot_count_comparisons_volcano` for + `pairwise_representation_test_by_pair_by_attribute` if data is available. + + Returns: + - None + """ + for attribute in self.aloCollection.attributes: + pairwise_representation_test_by_pair_by_attribute: Dict[ + str, Dict[str, str] + ] = {} + background_representation_test_by_pair_by_attribute = {} + pairwise_representation_test_output = [] + pairwise_representation_test_f = os.path.join( + self.dirs[attribute], f"{attribute}.pairwise_representation_test.txt" + ) + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + levels_seen: Set[str] = set() + + for level in levels: + ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][level] + + for cluster in self.clusterCollection.cluster_list: + if ( + ALO + and ALO.cluster_type_by_cluster_id[cluster.cluster_id] + == "shared" + and ALO.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id] + ): + self.__process_background_representation( + attribute, + level, + ALO, + cluster, + background_representation_test_by_pair_by_attribute, + ) + + ALO_proteomes_present = cluster.proteome_ids.intersection( + ALO.proteomes if ALO else set("") + ) + + if ( + len(levels) > 1 + and len(ALO_proteomes_present) >= self.inputData.min_proteomes + ): + self.__process_pairwise_representation( + attribute, + level, + levels_seen, + levels, + cluster, + pairwise_representation_test_by_pair_by_attribute, + pairwise_representation_test_output, + ) + + levels_seen.add(level) + + if background_representation_test_by_pair_by_attribute: + self.__plot_count_comparisons_volcano( + background_representation_test_by_pair_by_attribute + ) + + if pairwise_representation_test_output: + with open( + pairwise_representation_test_f, "w" + ) as pairwise_representation_test_fh: + logger.info(f"[STATUS] - Writing {pairwise_representation_test_f}") + pairwise_representation_test_output.sort() + header_line = 
self.__get_header_line( + "pairwise_representation_test", attribute + ) + pairwise_representation_test_output.insert(0, header_line) + pairwise_representation_test_fh.write( + "\n".join(pairwise_representation_test_output) + "\n" + ) + + if pairwise_representation_test_by_pair_by_attribute: + self.__plot_count_comparisons_volcano( + pairwise_representation_test_by_pair_by_attribute + ) diff --git a/src/core/input.py b/src/core/input.py new file mode 100644 index 0000000..1ab4636 --- /dev/null +++ b/src/core/input.py @@ -0,0 +1,77 @@ +import os +from typing import List, Optional, Set, Tuple + + +class ServeArgs: + def __init__(self, port: int = 8000): + self.port = port + + +class InputData: + def __init__( + self, + nodesdb_f: str, + pfam_mapping_f: str, + ipr_mapping_f: str, + go_mapping_f: str, + cluster_file: str, + config_f: str, + sequence_ids_file: str, + species_ids_file: Optional[str] = None, + functional_annotation_f: Optional[str] = None, + fasta_dir: Optional[str] = None, + tree_file: Optional[str] = None, + output_path: Optional[str] = None, + infer_singletons: Optional[bool] = False, + plot_tree: bool = False, + min_proteomes: int = 2, + test: str = "mannwhitneyu", + taxranks: List[str] = None, + repetitions: int = 30, + fuzzy_count: int = 1, + fuzzy_fraction: float = 0.75, + fuzzy_range: Set[int] = {x for x in range(20 + 1) if x != 1}, + fontsize: int = 18, + plotsize: Tuple[float, float] = (24, 12), + plot_format: str = "pdf", + taxon_idx_mapping_file: Optional[str] = None, + ) -> None: + if taxranks is None: + taxranks = ["phylum", "order", "genus"] + if output_path: + if not os.path.isabs(output_path): + output_path = os.path.abspath(output_path) + else: + output_path = os.path.join(os.getcwd(), "kinfin_results") + + self.cluster_f = cluster_file + self.config_f = config_f + self.sequence_ids_f = sequence_ids_file + self.species_ids_f = species_ids_file + self.tree_f = tree_file + self.functional_annotation_f = functional_annotation_f + if config_f.endswith(".json") and not taxon_idx_mapping_file: + raise ValueError("[ERROR] - taxon_idx_mapping not present") + self.taxon_idx_mapping_file = taxon_idx_mapping_file + self.nodesdb_f = nodesdb_f + self.pfam_mapping_f = pfam_mapping_f + self.ipr_mapping_f = ipr_mapping_f + self.go_mapping_f = go_mapping_f + + self.test = test + self.plot_tree = plot_tree + self.fasta_dir = fasta_dir + self.output_path = output_path + self.infer_singletons = infer_singletons + self.fuzzy_count = fuzzy_count + self.fuzzy_fraction = fuzzy_fraction + self.fuzzy_range = fuzzy_range + self.repetitions = repetitions + self.min_proteomes = min_proteomes + self.plot_format = plot_format + self.fontsize = fontsize + self.taxranks = taxranks + self.plotsize = plotsize + + self.pfam_mapping = True + self.ipr_mapping = True diff --git a/src/core/logger.py b/src/core/logger.py new file mode 100644 index 0000000..322e0af --- /dev/null +++ b/src/core/logger.py @@ -0,0 +1,29 @@ +import logging +import os + + +def setup_logger(log_path: str) -> logging.Logger: + """ + Sets up a logger that logs messages to both the console and a file. + + Args: + log_path (str): Path to the log file. + + Returns: + logging.Logger: Configured logger instance. 
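+
+    Example (illustrative log path):
+        >>> logger = setup_logger("results/kinfin.log")
+        >>> logger.info("starting analysis")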
+ """ + os.makedirs(os.path.dirname(log_path), exist_ok=True) + + logger = logging.getLogger("kinfin_logger") + logger.setLevel(logging.DEBUG) + + console_handler = logging.StreamHandler() + formatter = logging.Formatter("%(asctime)s - %(message)s", "%Y-%m-%d %H:%M:%S") + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + + file_handler = logging.FileHandler(log_path, mode="w") + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + return logger diff --git a/src/core/logic.py b/src/core/logic.py new file mode 100644 index 0000000..84adbb1 --- /dev/null +++ b/src/core/logic.py @@ -0,0 +1,490 @@ +import logging +import os +from collections import defaultdict +from typing import DefaultDict, Dict, List, Literal, Optional, Set, Tuple + +import ete3 +from ete3 import Tree, TreeNode + +from core.utils import progress, read_fasta_len, yield_config_lines, yield_file_lines + +logger = logging.getLogger("kinfin_logger") + + +# common +def parse_nodesdb(filepath: str) -> Dict[str, Dict[str, str]]: + """ + Parses the nodes database file. + + Args: + filepath (str): The path to the nodes database file. + + Returns: + Dict[str, Dict[str, str]]: A dictionary containing node information. + Keys are node identifiers, and values are dictionaries with keys: + + - 'rank': The rank of the node. + - 'name': The name of the node. + - 'parent': The parent of the node. + """ + logger.info(f"[STATUS] - Parsing nodesDB {filepath}") + + nodesdb: Dict[str, Dict[str, str]] = {} + nodesdb_count = 0 + nodes_count = 0 + + for line in yield_file_lines(filepath): + if line.startswith("#"): + nodesdb_count = int(line.lstrip("# nodes_count = ").rstrip("\n")) + elif line.strip(): + nodes_count += 1 + try: + node, rank, name, parent = line.rstrip("\n").split("\t") + nodesdb[node] = {"rank": rank, "name": name, "parent": parent} + except Exception: + pass + if nodesdb_count: + progress(nodes_count, 1000, nodesdb_count) + return nodesdb + + +# cli +def get_lineage( + taxid: str, + nodesdb: Dict[str, Dict[str, str]], + taxranks: List[str], +) -> Dict[str, str]: + """ + Get the lineage of a taxonomic identifier. + + Args: + taxid (str): The taxonomic identifier. + nodesdb (Dict[str, Dict[str, str]]): A dictionary containing information about nodes. + taxranks (List[str]): A list of taxonomic ranks to include in the lineage. + + Returns: + Dict[str, str]: A dictionary containing the lineage information, with taxonomic ranks as keys + and corresponding names as values. + """ + lineage = {taxrank: "undef" for taxrank in taxranks} + parent = "" + node = taxid + while parent != "1": + taxrank = nodesdb[node]["rank"] + parent = nodesdb[node]["parent"] + if taxrank in taxranks: + name = nodesdb[node]["name"] + lineage[taxrank] = name + node = parent + return lineage + + +# cli +def parse_attributes_from_config_data( + config_f: str, + taxon_idx_mapping_file: Optional[str], +) -> Tuple[Set[str], Dict[str, str], List[str], Dict[str, Dict[str, str]]]: + """ + Parses attributes from a configuration file. + + Args: + config_f (str): The path to the configuration file. + + Returns: + Tuple[Set[str], Dict[str, str], List[str], Dict[str, Dict[str, str]]]: A tuple containing: + - A set of proteome IDs. + - A dictionary mapping species IDs to proteome IDs. + - A list of attributes. + - A dictionary mapping proteome IDs to dictionaries, where each inner dictionary + maps attributes to their corresponding levels. + + Raises: + FileNotFoundError: If the specified configuration file is not found. 
+ ValueError: If there are errors in the configuration file format or content. + + Note: + - The configuration file is expected to have a header line starting with '#', + where the first element is 'IDX' and the second element is 'TAXON'. + - Each subsequent non-empty line in the configuration file should contain + comma-separated values corresponding to the attributes defined in the header line. + - The 'TAXON' attribute is expected to be unique for each line. + """ + + logger.info("[STATUS] - Parsing config data ...") + attributes: List[str] = [] + level_by_attribute_by_proteome_id: Dict[str, Dict[str, str]] = {} + proteomes: Set[str] = set() + proteome_id_by_species_id: Dict[str, str] = {} + + for line in yield_config_lines(config_f, taxon_idx_mapping_file): + if line.startswith("#"): + if not attributes: + attributes = [x.strip() for x in line.lstrip("#").split(",")] + if attributes[0] != "IDX" or attributes[1] != "taxon": + error_msg = f"[ERROR] - First/second element have to be IDX/TAXON.\n\t{attributes}" + raise ValueError(error_msg) + elif line.strip(): + temp = line.split(",") + + if len(temp) != len(attributes): + error_msg = f"[ERROR] - number of columns in line differs from header\n\t{attributes}\n\t{temp}" + raise ValueError(error_msg) + + if temp[1] in proteomes: + error_msg = f"[ERROR] - 'TAXON' should be unique. {temp[0]} was encountered multiple times" # fmt:skip + raise ValueError(error_msg) + + species_id = temp[0] + proteome_id = temp[1] + proteomes.add(proteome_id) + proteome_id_by_species_id[species_id] = proteome_id + + level_by_attribute_by_proteome_id[proteome_id] = dict(zip(attributes, temp)) + level_by_attribute_by_proteome_id[proteome_id]["all"] = "all" + attributes.insert(0, "all") # append to front + return ( + proteomes, + proteome_id_by_species_id, + attributes, + level_by_attribute_by_proteome_id, + ) + + +# common +def add_taxid_attributes( + nodesdb_f: str, + taxranks: List[str], + attributes: List[str], + level_by_attribute_by_proteome_id: Dict[str, Dict[str, str]], +) -> Tuple[List[str], Dict[str, Dict[str, str]]]: + """ + Adds taxonomic attributes to the dictionary of attributes indexed by proteome ID. + + Parameters: + + - nodesdb_f (str): File path to the nodes database. + - taxranks (List[str]): List of taxonomic ranks to be included as attributes. + - attributes (List[str]): List of existing attributes. + - level_by_attribute_by_proteome_id (Dict[str, Dict[str, str]]): Dictionary where keys + are proteome IDs and values are dictionaries of attributes for each proteome ID, + including at least the "TAXID" attribute. + + Returns: + Tuple[List[str], Dict[str, Dict[str, str]]]: A tuple containing: + + - Updated list of attributes with taxonomic ranks added and "TAXID" removed. + - Updated dictionary of attributes indexed by proteome ID, with taxonomic attributes added and "TAXID" removed. 
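+
+    For example (illustrative; the actual names are taken from the nodesdb file),
+    a proteome with "TAXID" = "6239" would gain the levels phylum="Nematoda",
+    order="Rhabditida" and genus="Caenorhabditis", and its "TAXID" entry would
+    be removed.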
+    """
+    NODESDB = parse_nodesdb(nodesdb_f)
+    for proteome_id in level_by_attribute_by_proteome_id:
+        taxid = level_by_attribute_by_proteome_id[proteome_id]["TAXID"]
+        lineage = get_lineage(taxid=taxid, nodesdb=NODESDB, taxranks=taxranks)
+
+        # add lineage attribute/levels
+        for taxrank in taxranks:
+            level_by_attribute_by_proteome_id[proteome_id][taxrank] = lineage[taxrank]
+
+        # remove taxid-levels
+        del level_by_attribute_by_proteome_id[proteome_id]["TAXID"]
+
+    # remove taxid-attribute
+    attributes.remove("TAXID")
+
+    # add taxranks to attributes
+    attributes.extend(iter(taxranks))
+    return attributes, level_by_attribute_by_proteome_id
+
+
+# cli
+def parse_tree_from_file(
+    tree_f: Optional[str],
+    attributes: List[str],
+    level_by_attribute_by_proteome_id: Dict[str, Dict[str, str]],
+    proteomes: Set[str],
+) -> Tuple[Optional[Tree], Optional[Dict[frozenset[str], str]]]:
+    """
+    Parse a phylogenetic tree from a Newick (nwk) file and set the specified outgroups.
+
+    Args:
+        tree_f (Optional[str]): Path to the Newick tree file (or None if no tree is used).
+        attributes (List[str]): Attributes parsed from the config data; must contain "OUT".
+        level_by_attribute_by_proteome_id (Dict[str, Dict[str, str]]): Attribute levels per proteome ID.
+        proteomes (Set[str]): Set of proteome IDs.
+
+    Returns:
+        Tuple[Optional[Tree], Optional[Dict[frozenset[str], str]]]: The parsed (and rooted)
+        phylogenetic tree and a dictionary mapping frozensets of proteome IDs to node names.
+    """
+    if not tree_f:
+        return None, None
+    outgroups: List[str] = []
+    if "OUT" not in attributes:
+        error_msg = "[ERROR] - Please specify one or more outgroup taxa"
+        raise ValueError(error_msg)
+    outgroups = [
+        proteome_id
+        for proteome_id in proteomes
+        if level_by_attribute_by_proteome_id[proteome_id]["OUT"] == "1"
+    ]
+    logger.info(f"[STATUS] - Parsing Tree file : {tree_f} ...")
+    tree_ete: TreeNode = ete3.Tree(tree_f)
+    if len(outgroups) > 1:
+        outgroup_node: TreeNode = tree_ete.get_common_ancestor(
+            outgroups
+        )  # type: ignore
+        try:
+            logger.info(
+                f"[STATUS] - Setting LCA of {', '.join(outgroups)} as outgroup : ..."
+            )
+            tree_ete.set_outgroup(outgroup_node)  # type: ignore
+        except ete3.coretype.tree.TreeError:  # type: ignore
+            logger.info("[STATUS] - Tree seems to be rooted already : ...")
+    else:
+        logger.info(f"[STATUS] - Setting {','.join(outgroups)} as outgroup : ...")
+        tree_ete.set_outgroup(outgroups[0])  # type: ignore
+    logger.info(tree_ete)
+    node_idx_by_proteome_ids: Dict[frozenset[str], str] = {}
+    for idx, node in enumerate(tree_ete.traverse("levelorder")):  # type: ignore
+        proteome_ids = frozenset(leaf.name for leaf in node)
+        if not node.name:
+            node.add_features(
+                name=f"n{idx}",
+                nodetype="node",
+                proteome_ids=proteome_ids,
+                apomorphic_cluster_counts={"singletons": 0, "non_singletons": 0},
+                synapomorphic_cluster_counts={
+                    "complete_presence": 0,
+                    "partial_absence": 0,
+                },
+                synapomorphic_cluster_strings=[],
+                counts={"specific": 0, "shared": 0, "absent": 0, "singleton": 0},
+            )
+        else:
+            node.add_features(
+                nodetype="tip",
+                proteome_ids=proteome_ids,
+                apomorphic_cluster_counts={"singletons": 0, "non_singletons": 0},
+                synapomorphic_cluster_counts={
+                    "complete_presence": 0,
+                    "partial_absence": 0,
+                },
+                synapomorphic_cluster_strings=[],
+                counts={"specific": 0, "shared": 0, "absent": 0, "singleton": 0},
+            )
+        node_idx_by_proteome_ids[proteome_ids] = node.name
+    return tree_ete, node_idx_by_proteome_ids
+
+
+def parse_fasta_dir(species_ids_f: str, fasta_dir: str) -> Dict[str, int]:
+    """
+    Parse a species IDs file to retrieve FASTA file names and then calculate
+    lengths of sequences from the corresponding FASTA files.
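+
+    An illustrative (made-up) species IDs line is "0: speciesA.protein.faa",
+    i.e. an index and a FASTA file name separated by ': '.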
+ + Args: + - species_ids_f (str): Path to the species IDs file, where each line contains + an index and a corresponding FASTA file name separated by ': '. + - fasta_dir (str): Directory path where the FASTA files are located. + + Returns: + - Dict[str, int]: A dictionary mapping header strings (protein IDs) to their + corresponding sequence lengths extracted from the FASTA files. + """ + logger.info("[STATUS] - Parsing FASTAs ...") + fasta_file_by_species_id: Dict[str, str] = {} + + for line in yield_file_lines(species_ids_f): + if not line.startswith("#"): + idx, fasta = line.split(": ") + fasta_file_by_species_id[idx] = fasta + + fasta_len_by_protein_id: Dict[str, int] = {} + for _, fasta_f in list(fasta_file_by_species_id.items()): + fasta_f = os.path.join(fasta_dir, fasta_f) + + for header, length in read_fasta_len(fasta_f): + fasta_len_by_protein_id[header] = length + + return fasta_len_by_protein_id + + +def parse_pfam_mapping(pfam_mapping_f: str) -> Dict[str, str]: + """ + Parse a PFAM mapping file to create a dictionary mapping PFAM domain IDs to their descriptions. + + Args: + - pfam_mapping_f (str): Path to the PFAM mapping file, where each line contains tab-separated values + with the domain ID in the first column and its description in the fifth column. + + Returns: + - Dict[str, str]: A dictionary mapping PFAM domain IDs to their corresponding descriptions. + + Raises: + - ValueError: If conflicting descriptions are found for the same domain ID. + """ + logger.info(f"[STATUS] - Parsing {pfam_mapping_f} ... ") + + pfam_mapping_dict: Dict[str, str] = {} + for line in yield_file_lines(pfam_mapping_f): + temp: List[str] = line.split("\t") + domain_id: str = temp[0] + domain_desc: str = temp[4] + if domain_id not in pfam_mapping_dict: + pfam_mapping_dict[domain_id] = domain_desc + elif domain_desc != pfam_mapping_dict[domain_id]: + error_msg = f"[ERROR] : Conflicting descriptions for {domain_id}" + raise ValueError(error_msg) + + return pfam_mapping_dict + + +def parse_ipr_mapping(ipr_mapping_f: str) -> Dict[str, str]: + """ + Parse an InterPro (IPR) mapping file to create a dictionary mapping InterPro IDs to their descriptions. + + Args: + - ipr_mapping_f (str): Path to the InterPro mapping file, where each line contains an InterPro ID and its description. + Lines starting with "Active_site" are skipped as they are not relevant to mapping. + + Returns: + - Dict[str, str]: A dictionary mapping InterPro IDs to their corresponding descriptions. + + Raises: + - ValueError: If conflicting descriptions are found for the same InterPro ID. + """ + logger.info(f"[STATUS] - Parsing {ipr_mapping_f} ... ") + + ipr_mapping_dict: Dict[str, str] = {} + for line in yield_file_lines(ipr_mapping_f): + if not line.startswith("Active_site"): + temp: List[str] = line.split() + ipr_id: str = temp[0] + ipr_desc: str = " ".join(temp[1:]) + if ipr_id not in ipr_mapping_dict: + ipr_mapping_dict[ipr_id] = ipr_desc + elif ipr_desc != ipr_mapping_dict[ipr_id]: + error_msg = f"[ERROR] : Conflicting descriptions for {ipr_id}" + raise ValueError(error_msg) + return ipr_mapping_dict + + +def parse_go_mapping(go_mapping_f: str) -> Dict[str, str]: + """ + Parse a Gene Ontology (GO) mapping file to create a dictionary mapping GO IDs to their descriptions. + + Args: + - go_mapping_f (str): Path to the GO mapping file, where each line contains a GO ID and its description. + Lines starting with '!' are skipped as they are comments. 
+ + Returns: + - Dict[str, str]: A dictionary mapping GO IDs (without 'GO:' prefix) to their corresponding descriptions. + + Raises: + - ValueError: If conflicting descriptions are found for the same GO ID. + """ + logger.info(f"[STATUS] - Parsing {go_mapping_f} ... ") + go_mapping_dict: Dict[str, str] = {} + for line in yield_file_lines(go_mapping_f): + if not line.startswith("!"): + temp: List[str] = line.replace(" > ", "|").split("|") + go_string: List[str] = temp[1].split(";") + go_desc, go_id = go_string[0].replace("GO:", ""), go_string[1].lstrip(" ") + + if go_id not in go_mapping_dict: + go_mapping_dict[go_id] = go_desc + elif go_desc != go_mapping_dict[go_id]: + error_msg = f"[ERROR] : Conflicting descriptions for {go_id}" + raise ValueError(error_msg) + return go_mapping_dict + + +def compute_protein_ids_by_proteome( + proteomes_by_protein_id: Dict[str, str] +) -> DefaultDict[str, Set[str]]: + """ + Compute protein IDs grouped by proteome IDs. + + Args: + proteomes_by_protein_id (Dict[str, str]): A dictionary mapping protein IDs to proteome IDs. + + Returns: + DefaultDict[str, Set[str]]: A defaultdict where keys are proteome IDs and values are sets + of protein IDs belonging to each proteome ID. + """ + protein_ids_by_proteome_id: DefaultDict[str, Set[str]] = defaultdict(set) + for protein_id, proteome_id in list(proteomes_by_protein_id.items()): + protein_ids_by_proteome_id[proteome_id].add(protein_id) + return protein_ids_by_proteome_id + + +# common +def get_attribute_cluster_type( + singleton, + implicit_protein_ids_by_proteome_id_by_level, +) -> Literal["singleton", "shared", "specific"]: + """ + Determines the type of cluster based on the parameters. + + Parameters: + - singleton: A boolean indicating whether the cluster is a singleton. + - implicit_protein_ids_by_proteome_id_by_level: A dictionary representing protein ids + grouped by proteome id at different levels. + + Returns: + - One of the following strings: + - "singleton": If `singleton` is True. + - "shared": If there are protein ids grouped under multiple proteome ids. + - "specific": If there is only one proteome id with protein ids. + + """ + if singleton: + return "singleton" + if len(implicit_protein_ids_by_proteome_id_by_level) > 1: + return "shared" + else: + return "specific" + + +def get_ALO_cluster_cardinality( + ALO_proteome_counts_in_cluster: List[int], + fuzzy_range: Set[int], + fuzzy_count: int = 1, + fuzzy_fraction: float = 0.75, +) -> Optional[str]: + """ + Determine the cardinality type of a cluster based on ALO proteome counts. + + Args: + ALO_proteome_counts_in_cluster (List[int]): List of ALO proteome counts in the cluster. + fuzzy_range (Set[int]): Set of integers representing the range of fuzzy counts. + fuzzy_count (int, optional): Specific count considered as fuzzy. Default is 1. + fuzzy_fraction (float, optional): Fraction threshold for considering a cluster as 'fuzzy'. Default is 0.75. + + Returns: + Optional[str]: Returns "true" (str) if all counts are 1, "fuzzy" (str) if the cluster meets fuzzy criteria, + and None otherwise. 
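+
+    Example (illustrative counts; lists with fewer than three counts always
+    return None):
+        >>> get_ALO_cluster_cardinality([1, 1, 1, 1], fuzzy_range={0, 2, 3})
+        'true'
+        >>> get_ALO_cluster_cardinality([1, 1, 1, 2], fuzzy_range={0, 2, 3})
+        'fuzzy'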
+ """ + if len(ALO_proteome_counts_in_cluster) > 2: + length = len(ALO_proteome_counts_in_cluster) + if all(count == 1 for count in ALO_proteome_counts_in_cluster): + return "true" + fuzzycount_count = len( + [ + ALO_proteome_counts + for ALO_proteome_counts in ALO_proteome_counts_in_cluster + if ALO_proteome_counts == fuzzy_count + ] + ) + + fuzzyrange_count = len( + [ + ALO_proteome_counts + for ALO_proteome_counts in ALO_proteome_counts_in_cluster + if ALO_proteome_counts in fuzzy_range + ] + ) + + if fuzzycount_count + fuzzyrange_count == length: + fuzzy_fr = fuzzycount_count / length + + if fuzzy_fr >= fuzzy_fraction: + return "fuzzy" + + return None diff --git a/src/core/proteins.py b/src/core/proteins.py new file mode 100644 index 0000000..c47439d --- /dev/null +++ b/src/core/proteins.py @@ -0,0 +1,108 @@ +from collections import Counter +from typing import Dict, List, Optional, Union + +from core.utils import mean, median, sd + + +class Protein: + def __init__( + self, + protein_id: str, + proteome_id: str, + species_id: str, + sequence_id: str, + ) -> None: + + self.protein_id = protein_id + self.proteome_id = proteome_id + self.species_id = species_id + self.sequence_id = sequence_id + self.length: Optional[int] = None + self.clustered: bool = False + self.secreted: bool = False + self.domain_counter_by_domain_source: Dict[str, Counter[str]] = {} + self.go_terms: List[str] = [] + + def update_length(self, length: int) -> None: + self.length = length + + +class ProteinCollection: + def __init__(self, proteins_list: List[Protein]) -> None: + self.proteins_list: List[Protein] = proteins_list + self.proteins_by_protein_id: Dict[str, Protein] = { + protein.protein_id: protein for protein in proteins_list + } + self.protein_count: int = len(proteins_list) + self.domain_sources: List[str] = [] + self.fastas_parsed: bool = False + self.functional_annotation_parsed: bool = False + self.domain_desc_by_id_by_source: Dict[str, Dict[str, str]] = {} + + def add_annotation_to_protein( + self, + domain_protein_id: str, + domain_counter_by_domain_source: Dict[str, Counter], + go_terms: List[str], + ): + """ + Updates a protein object with domain counters and GO terms. + + Args: + - domain_protein_id (str): Identifier of the protein to annotate. + - domain_counter_by_domain_source (Dict[str, Counter]): Domain sources mapped to counters of domains. + - go_terms (List[str]): Gene Ontology (GO) terms associated with the protein. + + This method sets domain counters, assigns GO terms, and checks if the protein is secreted + based on domain information ('SignalP_EUK' source). + + Note: If 'SignalP_EUK' indicates 'SignalP-noTM', sets protein.secreted = True. + """ + protein: Optional[Protein] = self.proteins_by_protein_id.get( + domain_protein_id, None + ) + if protein is not None: + protein.domain_counter_by_domain_source = domain_counter_by_domain_source + signalp_notm = protein.domain_counter_by_domain_source.get( + "SignalP_EUK", None + ) + if signalp_notm and "SignalP-noTM" in signalp_notm: + protein.secreted = True + protein.go_terms = go_terms + + def get_protein_length_stats( + self, protein_ids: List[str] + ) -> Dict[str, Union[int, float]]: + """ + Calculate statistics (sum, mean, median, standard deviation) of protein lengths. + + Args: + protein_ids (List[str]): List of protein IDs for which to calculate statistics. + + Returns: + Dict[str, Union[int, float]): A dictionary containing the calculated statistics: + - 'sum': Sum of lengths of proteins in the input list. 
+ - 'mean': Mean length of proteins in the input list. + - 'median': Median length of proteins in the input list. + - 'sd': Standard deviation of lengths of proteins in the input list. + + If no valid protein lengths could be calculated (e.g., if protein_ids is empty or no lengths + are available for the provided protein IDs), the values in the dictionary will default to 0 or 0.0. + """ + protein_length_stats = {"sum": 0, "mean": 0.0, "median": 0, "sd": 0.0} + if protein_ids and self.fastas_parsed: + protein_lengths: List[int] = [ + length + for length in [ + self.proteins_by_protein_id[protein_id].length + for protein_id in protein_ids + if protein_id in self.proteins_by_protein_id + ] + if length is not None + ] + protein_length_stats["sum"] = sum(protein_lengths) + protein_length_stats["mean"] = mean(protein_lengths) + protein_length_stats["median"] = median(protein_lengths) + protein_length_stats["sd"] = sd(protein_lengths) + + return protein_length_stats diff --git a/src/core/results.py b/src/core/results.py new file mode 100644 index 0000000..9aa0f60 --- /dev/null +++ b/src/core/results.py @@ -0,0 +1,46 @@ +import logging +import time + +from core.datastore import DataFactory +from core.input import InputData + +logger = logging.getLogger("kinfin_logger") + + +def analyse(input_data: InputData) -> None: + """ + Performs KinFin analysis based on the provided input data using DataFactory. + + Args: + input_data (InputData): An instance of InputData containing input parameters and data. + + Returns: + None + + Raises: + Any exceptions raised by DataFactory methods. + """ + overall_start = time.time() + dataFactory = DataFactory(input_data) + dataFactory.setup_dirs() + dataFactory.analyse_clusters() + dataFactory.aloCollection.write_tree( + dataFactory.dirs, + dataFactory.inputData.plot_tree, + dataFactory.inputData.plot_format, + dataFactory.inputData.fontsize, + ) + rarefaction_data = dataFactory.aloCollection.compute_rarefaction_data( + repetitions=dataFactory.inputData.repetitions + ) + dataFactory.plot_rarefaction_data( + dirs=dataFactory.dirs, + plotsize=dataFactory.inputData.plotsize, + plot_format=dataFactory.inputData.plot_format, + fontsize=dataFactory.inputData.fontsize, + rarefaction_by_samplesize_by_level_by_attribute=rarefaction_data, + ) + dataFactory.write_output() + overall_end = time.time() + overall_elapsed = overall_end - overall_start + logger.info(f"[STATUS] - Took {overall_elapsed}s to run kinfin.") diff --git a/src/core/utils.py b/src/core/utils.py new file mode 100644 index 0000000..7a2713e --- /dev/null +++ b/src/core/utils.py @@ -0,0 +1,285 @@ +import gzip +import json +import logging +import os +import sys +from math import log, sqrt +from typing import Any, Generator, List, Optional, Tuple, Union + +import scipy + +logger = logging.getLogger("kinfin_logger") + + +def progress(iteration: int, steps: Union[int, float], max_value: int) -> None: + """ + Print progress in percentage based on the current iteration, steps, and maximum value. + + Parameters: + - iteration (int): Current iteration or step number. + - steps (int | float): Number of steps or intervals after which progress is updated. + - max_value (int): Maximum value or total number of iterations. 
+ + Returns: + - None + + Example: + >>> progress(5, 2, 10) + [PROGRESS] - 50% + """ + if iteration == max_value: + sys.stdout.write("\r") + print("[PROGRESS]\t- %d%%" % (100)) + elif iteration % int(steps + 1) == 0: + sys.stdout.write("\r") + print("[PROGRESS]\t- %d%%" % (float(iteration / max_value) * 100), end=" ") + sys.stdout.flush() + + +def check_file(filepath: Optional[str], install_kinfin: bool = False) -> None: + """ + Check if a file exists. + + Args: + filepath (str): Path to the file to be checked. + + Raises: + FileNotFoundError: If the file does not exist. + """ + + if filepath is not None and not os.path.isfile(filepath): + error_msg = f"[ERROR] - file {filepath} not found." + if install_kinfin: + error_msg += " Please run the install script to download kinfin." + raise FileNotFoundError(error_msg) + + +def yield_file_lines(filepath: str) -> Generator[str, Any, None]: + """ + Args: + filepath (str): Path to the file. + + Yields: + str: Each line from the file. + """ + check_file(filepath) + if filepath.endswith(".gz"): + with gzip.open(filepath, "rb") as fh: + for line in fh: + line = line.decode("utf-8") + if line.startswith("nodesDB.txt"): + line = f'#{line.split("#")[1]}' + yield line.rstrip("\n") + else: + with open(filepath) as fh: + for line in fh: + yield line.rstrip("\n") + + +def yield_config_lines( + config_f: str, + taxon_idx_mapping_file: Optional[str], +): + if config_f.endswith(".json"): + if not taxon_idx_mapping_file: + raise ValueError("[ERROR] - taxon_idx_mapping not present") + + with ( + open(taxon_idx_mapping_file, "r") as f_mapping, + open(config_f, "r") as f_config, + ): + taxon_idx_mapping = json.load(f_mapping) + config_data = json.load(f_config) + headers = ["IDX"] + list(config_data[0].keys()) + yield "#" + ",".join(headers) + + for item in config_data: + idx = taxon_idx_mapping[item["taxon"]] + row = [idx] + [item[key] for key in headers[1:]] + yield ",".join(row) + else: + yield from yield_file_lines(config_f) + + return + + +def read_fasta_len(fasta_file: str) -> Generator[Tuple[str, int], Any, None]: + """ + Generator function to parse a FASTA file and yield tuples of header and sequence length. + + Args: + - fasta_file (str): Path to the FASTA file to be parsed. + + Yields: + Tuple[str, int]: A tuple containing the header and the length of the sequence. + + Raises: + FileNotFoundError: If the specified FASTA file does not exist. + """ + check_file(fasta_file) + with open(fasta_file) as fh: + logger.info(f"[STATUS]\t - Parsing FASTA {fasta_file}") + header: str = "" + seqs: List[str] = [] + for line in fh: + if line[0] == ">": + if header: + header = ( + header.replace(":", "_") + .replace(",", "_") + .replace("(", "_") + .replace(")", "_") + ) # orthofinder replaces chars + yield header, len("".join(seqs)) + header, seqs = ( + line[1:-1].split()[0], + [], + ) # Header is split at first whitespace + else: + seqs.append(line[:-1]) + header = ( + header.replace(":", "_") + .replace(",", "_") + .replace("(", "_") + .replace(")", "_") + ) # orthofinder replaces chars + yield header, len("".join(seqs)) + + +def median(lst) -> float: + """ + Calculate the median of a list of numbers. + + Args: + - lst (list): List of numerical values. + + Returns: + - float: Median of the list. 
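+
+    Example (illustrative):
+        >>> median([3, 1, 2])
+        2.0
+        >>> median([4, 1, 3, 2])
+        2.5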
+ """ + list_sorted = sorted(lst) + list_length = len(lst) + index = (list_length - 1) // 2 + if list_length % 2: + return list_sorted[index] / 1.0 + else: + return (list_sorted[index] + list_sorted[index + 1]) / 2.0 + + +def mean(lst) -> float: + """ + Calculate the mean (average) of a list of numbers. + + Args: + - lst (list): List of numerical values. + + Returns: + - float: Mean of the list. + """ + return float(sum(lst)) / len(lst) if lst else 0.0 + + +def sd(lst, population=True) -> float: + """ + Calculate the standard deviation of a list of numbers. + + Args: + - lst (list): List of numerical values. + - population (bool, optional): If True, calculates population standard deviation, + otherwise calculates sample standard deviation. Default is True. + + Returns: + - float: Standard deviation of the list. + """ + n = len(lst) + differences = [x_ - mean(lst) for x_ in lst] + sq_differences = [d**2 for d in differences] + ssd = sum(sq_differences) + variance = ssd / n if population is True else ssd / (n - 1) + return sqrt(variance) + + +def statistic( + count_1: List[int], + count_2: List[int], + test: str, + min_proteomes: int, +) -> Tuple[ + Optional[float], + Optional[float], + Optional[float], + Optional[float], +]: + """ + Perform statistical tests and calculate relevant statistics between two lists of counts. + + Args: + - count_1 (list): List of counts (integers). + - count_2 (list): Another list of counts (integers). + - test (str): Type of statistical test to perform, one of "welch", "mannwhitneyu", "ttest", "ks", "kruskal". + - min_proteomes (int): Minimum number of proteomes required for valid analysis. + + Returns: + - Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]: + Tuple containing: + - pvalue: p-value of the statistical test (or None if test is not applicable). + - log2_mean: Logarithm base 2 of the mean of count_1 divided by count_2. + - mean_count_1: Mean of count_1. + - mean_count_2: Mean of count_2. 
+ """ + pvalue: Optional[float] = None + log2_mean: Optional[float] = None + mean_count_1: Optional[float] = None + mean_count_2: Optional[float] = None + + implicit_count_1: List[float] = [count for count in count_1 if count > 0] + implicit_count_2: List[float] = [count for count in count_2 if count > 0] + + if len(implicit_count_1) < min_proteomes or len(implicit_count_2) < min_proteomes: + return None, None, None, None + + mean_count_1 = mean(implicit_count_1) + mean_count_2 = mean(implicit_count_2) + log2_mean = log(mean_count_1 / mean_count_2, 2) + + if ( + len(set(implicit_count_1)) == 1 + and len(set(implicit_count_2)) == 1 + and set(implicit_count_1) == set(implicit_count_2) + ): # equal + pvalue = 1.0 + elif test == "welch": + # try: + # Welch's t-test + pvalue = scipy.stats.ttest_ind( + implicit_count_1, + implicit_count_2, + equal_var=False, + )[1] + + if pvalue != pvalue: # testing for "nan" + pvalue = 1.0 + elif test == "mannwhitneyu": + try: + pvalue = scipy.stats.mannwhitneyu( + implicit_count_1, + implicit_count_2, + alternative="two-sided", + )[1] + except ValueError: # throws ValueError when all numbers are equal + pvalue = 1.0 + elif test == "ttest": + # try: + pvalue = scipy.stats.ttest_ind(implicit_count_1, implicit_count_2)[1] # t-test + if pvalue != pvalue: # testing for "nan" + pvalue = 1.0 + elif test == "ks": + # H0 that they are drawn from the same distribution + pvalue = scipy.stats.ks_2samp(implicit_count_1, implicit_count_2)[1] + if pvalue != pvalue: # testing for "nan" + pvalue = 1.0 + elif test == "kruskal": + # H0 is that population median is equal + pvalue = scipy.stats.kruskal(implicit_count_1, implicit_count_2)[1] + if pvalue != pvalue: # testing for "nan" + pvalue = 1.0 + return pvalue, log2_mean, mean_count_1, mean_count_2 diff --git a/src/kinfin.py b/src/kinfin.py index a88b82f..377db8f 100755 --- a/src/kinfin.py +++ b/src/kinfin.py @@ -26,7 +26,7 @@ -t, --tree_file Tree file in Newick format (taxon names must be the same as TAXON in config file) General options - -o, --outprefix Output prefix + -o, --output_path Output prefix --infer_singletons Absence of proteins in clustering is interpreted as singleton (based on SequenceIDs.txt) --plot_tree Plot PDF of annotated phylogenetic tree (requires -t, full ETE3 installation and X-server/xvfb-run) --min_proteomes Required number of proteomes in a taxon-set to be used @@ -60,8 +60,8 @@ import sys -from os.path import isfile, join, exists, realpath, dirname -from os import getcwd, mkdir, remove, environ +from os.path import isfile, join, exists, realpath, dirname, isabs, abspath +from os import getcwd, mkdir, remove, environ, makedirs import shutil import random import time @@ -108,17 +108,6 @@ ######################################################################## -def retrieve_ftp(remote_f, local_f): - try: - print("[STATUS] - Downloading '%s' to '%s'." % (remote_f, local_f)) - req = urlopen(remote_f) - with open(local_f, 'wb') as local_fh: - shutil.copyfileobj(req, local_fh) - req.close() - except IOError: - sys.exit("[ERROR] : '%s' could not be downloaded." 
% (remote_f)) - - def check_file(infile): if infile: if not isfile(infile): @@ -472,7 +461,6 @@ def add_taxid_attributes(self, nodesdb_f, attributes, level_by_attribute_by_prot # add taxranks to rank for taxrank in inputObj.taxranks: attributes.append(taxrank) - self.nodesdb_file = nodesdb_f return attributes, level_by_attribute_by_proteome_id ############################### @@ -480,30 +468,29 @@ def add_taxid_attributes(self, nodesdb_f, attributes, level_by_attribute_by_prot ############################### def setup_dirs(self, inputObj): - outprefix = inputObj.outprefix + output_path = inputObj.output_path self.dirs = {} - if outprefix: - if outprefix.endswith("/"): - result_path = "%skinfin_results" % (outprefix) - else: - result_path = "%s.kinfin_results" % (outprefix) + if output_path: + if not isabs(output_path): + output_path = abspath(output_path) else: - result_path = join(getcwd(), "kinfin_results") - self.dirs['main'] = result_path - print("[STATUS] - Output directories in \n\t%s" % (result_path)) - if exists(result_path): + output_path = join(getcwd(), "kinfin_results") + + self.dirs['main'] = output_path + print("[STATUS] - Output directories in \n\t%s" % (output_path)) + if exists(output_path): print("[STATUS] - Directory exists. Deleting directory ...") - shutil.rmtree(result_path) + shutil.rmtree(output_path) print("[STATUS] - Creating directories ...") - mkdir(result_path) + makedirs(output_path) for attribute in aloCollection.attributes: - attribute_path = join(result_path, attribute) + attribute_path = join(output_path, attribute) self.dirs[attribute] = attribute_path if not exists(attribute_path): print("\t%s" % (attribute_path)) mkdir(attribute_path) if aloCollection.tree_ete: - tree_path = join(result_path, "tree") + tree_path = join(output_path, "tree") node_chart_path = join(tree_path, "charts") node_header_path = join(tree_path, "headers") if not exists(tree_path): @@ -610,7 +597,6 @@ def parse_species_ids(self, species_ids_f): if not line.startswith("#"): idx, fasta = line.split(": ") fasta_by_ortho_id[idx] = fasta - self.species_ids_file = species_ids_f return fasta_by_ortho_id ############################### @@ -626,7 +612,6 @@ def parse_fasta_dir(self, fasta_dir, fasta_file_by_species_id): print("[STATUS]\t - Parsing FASTA %s" % (fasta_path)) for header, length in readFastaLen(fasta_path): fasta_len_by_protein_id[header] = length - self.fasta_dir = fasta_dir return fasta_len_by_protein_id ############################### @@ -997,7 +982,20 @@ def write_cluster_metrics(self): for domain_source in clusterCollection.domain_sources: # cluster_metrics_domains if domain_source in clusterObj.domain_counter_by_domain_source: - cluster_metrics_domains_line.append(";".join(["%s:%s" % (domain_id, count) for domain_id, count in clusterObj.domain_counter_by_domain_source[domain_source].most_common()])) + sorted_counts = sorted( + [ + f"{domain_id}:{count}" + for domain_id, count in clusterObj.domain_counter_by_domain_source[ + domain_source + ].most_common() + ], + key=lambda x: ( + x.split(":")[-1], + x.split(":")[-2], + ), + ) + sorted_counts_str = ";".join(sorted_counts) + cluster_metrics_domains_line.append(sorted_counts_str) cluster_metrics_domains_line.append("{0:.3f}".format(clusterObj.domain_entropy_by_domain_source[domain_source])) else: cluster_metrics_domains_line.append("N/A") @@ -1703,11 +1701,11 @@ def generate_chart_for_node(self, node): x_values = np.array(proteome_coverages) ax.hist(x_values, histtype='stepfilled', align='mid', bins=np.arange(0.0, 1.0 + 0.1, 
0.1)) ax.set_xlim(-0.1, 1.1) - for tick in ax.xaxis.get_major_ticks(): - tick.label.set_fontsize(inputObj.plot_font_size - 2) - tick.label.set_rotation('vertical') - for tick in ax.yaxis.get_major_ticks(): - tick.label.set_fontsize(inputObj.plot_font_size - 2) + for tick in ax.xaxis.get_majorticklabels(): + tick.set_fontsize(inputObj.plot_font_size - 2) + tick.set_rotation('vertical') + for tick in ax.yaxis.get_majorticklabels(): + tick.set_fontsize(inputObj.plot_font_size - 2) ax.set_frame_on(False) ax.xaxis.grid(True, linewidth=1, which="major", color="lightgrey") ax.yaxis.grid(True, linewidth=1, which="major", color="lightgrey") @@ -2145,8 +2143,8 @@ def __init__(self, args): # FASTA files self.fasta_dir = args['--fasta_dir'] self.check_if_fasta_dir_and_species_ids_f() - # outprefix - self.outprefix = args['--outprefix'] + # output_path + self.output_path = args['--output_path'] # proteins self.infer_singletons = args['--infer_singletons'] # values: fuzzyness @@ -2227,28 +2225,25 @@ def check_input_files(self): if self.pfam_mapping: pfam_mapping_f = join(dirname(realpath(__file__)), "../data/Pfam-A.clans.tsv.gz") if not isfile(pfam_mapping_f): - print("[WARN] - PFAM-ID file 'data/Pfam-A.clans.tsv.gz' not found. Will be downloaded from ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz") - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz" - retrieve_ftp(remote_f, pfam_mapping_f) + print("[ERROR] - PFAM-ID file 'data/Pfam-A.clans.tsv.gz' not found. Please run the install script to download") + sys.exit() self.pfam_mapping_f = pfam_mapping_f if self.ipr_mapping: ipr_mapping_f = join(dirname(realpath(__file__)), "../data/entry.list") if not isfile(ipr_mapping_f): - print("[WARN] - IPR-ID file 'data/entry.list' not found. Will be downloaded from ftp://ftp.ebi.ac.uk/pub/databases/interpro/entry.list") - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/interpro/entry.list" - retrieve_ftp(remote_f, ipr_mapping_f) + print("[ERROR] - IPR-ID file 'data/entry.list' not found. Please run the install script to download") + sys.exit() self.ipr_mapping_f = ipr_mapping_f go_mapping_f = join(dirname(realpath(__file__)), "../data/interpro2go") if not isfile(go_mapping_f): - print("[WARN] - GO-ID file, but 'data/interpro2go' not found. Will be downloaded from ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro2go") - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro2go" - retrieve_ftp(remote_f, go_mapping_f) + print("[ERROR] - GO-ID file, but 'data/interpro2go' not found. Please run the install script to download") + sys.exit() self.go_mapping_f = go_mapping_f def check_that_ete_can_plot(self): if self.render_tree: try: - import PyQt4 + import PyQt4 # type: ignore except ImportError: sys.exit("[ERROR] : Plotting of trees requires additional ETE3 dependencies. PyQt4 is not installed. 
Please install PyQt4") if 'DISPLAY' in environ: diff --git a/src/main.py b/src/main.py new file mode 100755 index 0000000..8c20009 --- /dev/null +++ b/src/main.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import os +import sys + +from api import run_server +from cli import run_cli +from cli.commands import parse_args +from core.input import InputData, ServeArgs +from core.utils import check_file + +if __name__ == "__main__": + + # Without these files, application won't start + base_dir = os.getcwd() + nodesdb_f = os.path.join(base_dir, "data/nodesdb.txt") + pfam_mapping_f = os.path.join(base_dir, "data/Pfam-A.clans.tsv.gz") + ipr_mapping_f = os.path.join(base_dir, "data/entry.list") + go_mapping_f = os.path.join(base_dir, "data/interpro2go") + + try: + check_file(nodesdb_f, install_kinfin=True) + check_file(pfam_mapping_f, install_kinfin=True) + check_file(ipr_mapping_f, install_kinfin=True) + check_file(go_mapping_f, install_kinfin=True) + except FileNotFoundError as e: + sys.exit(str(e)) + + args = parse_args(nodesdb_f, pfam_mapping_f, ipr_mapping_f, go_mapping_f) + + if isinstance(args, ServeArgs): + # run the api server + cluster_f = os.environ.get("CLUSTER_FILE_PATH") + sequence_ids_f = os.environ.get("SEQUENCE_IDS_FILE_PATH") + taxon_idx_mapping_file = os.environ.get("TAXON_IDX_MAPPING_FILE_PATH") + + # Without env variables being absolute paths, application won't start + if cluster_f is None or not os.path.isabs(cluster_f): + sys.exit("[ERROR] CLUSTER_FILE_PATH should be an absolute path.") + if sequence_ids_f is None or not os.path.isabs(sequence_ids_f): + sys.exit("[ERROR] SEQUENCE_IDS_FILE_PATH should be an absolute path.") + if taxon_idx_mapping_file is None or not os.path.isabs(taxon_idx_mapping_file): + sys.exit("[ERROR] TAXON_IDX_MAPPING_FILE_PATH should be an absolute path.") + + try: + check_file(cluster_f, install_kinfin=True) + check_file(sequence_ids_f, install_kinfin=True) + check_file(taxon_idx_mapping_file, install_kinfin=True) + except FileNotFoundError as e: + sys.exit(str(e)) + + run_server( + args=args, + nodesdb_f=nodesdb_f, + go_mapping_f=go_mapping_f, + ipr_mapping_f=ipr_mapping_f, + pfam_mapping_f=pfam_mapping_f, + cluster_f=cluster_f, + sequence_ids_f=sequence_ids_f, + taxon_idx_mapping_file=taxon_idx_mapping_file, + ) + elif isinstance(args, InputData): + run_cli(args) + + else: + sys.exit("[ERROR] - invalid input provided.") diff --git a/test b/test deleted file mode 100755 index 20ea493..0000000 --- a/test +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -$DIR/src/kinfin.py -g $DIR/example/OrthologousGroups.txt -c $DIR/example/config.txt -s $DIR/example/SequenceIDs.txt -t $DIR/example/tree.nwk -o $DIR/example/test -p $DIR/example/SpeciesIDs.txt -a $DIR/example/fasta/ -f $DIR/example/functional_annotation.txt --min_proteomes 2 -#$DIR/src/kinfin.py -g $DIR/example/OrthologousGroups.txt -c $DIR/example/config.txt -s $DIR/example/SequenceIDs.txt -t $DIR/example/tree.nwk -o $DIR/example/test -p $DIR/example/SpeciesIDs.txt -a $DIR/example/fasta/ -f $DIR/example/functional_annotation.txt --min_proteomes 2 --test kruskal -#$DIR/src/kinfin.py -g $DIR/example/OrthologousGroups.txt -c $DIR/example/config.txt -s $DIR/example/SequenceIDs.txt -t $DIR/example/tree.nwk -o $DIR/example/test -p $DIR/example/SpeciesIDs.txt -a $DIR/example/fasta/ -f $DIR/example/functional_annotation.txt --min_proteomes 2 --test ks diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..6bf2d3c --- 
/dev/null +++ b/tests/conftest.py @@ -0,0 +1,61 @@ +import os +from typing import List, Tuple + + +def pytest_addoption(parser) -> None: + """Add argument to take path to generated and expected output directories""" + parser.addoption( + "--generated", + action="store", + help="Path to the generated output directory", + ) + parser.addoption( + "--expected", + action="store", + help="Path to the expected output directory", + ) + + +def pytest_generate_tests(metafunc) -> None: + """Generates test for each file""" + if "gen_file" in metafunc.fixturenames and "exp_file" in metafunc.fixturenames: + file_pairs = get_file_pairs(metafunc.config) + metafunc.parametrize("gen_file,exp_file", file_pairs) + + +def get_file_pairs(config) -> List[Tuple[str, str]]: + """Get tuple of generate result file vs expected result file to compare""" + generated = config.getoption("generated") + expected = config.getoption("expected") + + assert os.path.exists(generated), f"Directory '{generated}' does not exist" + assert os.path.exists(expected), f"Directory '{expected}' does not exist" + + files1: List[str] = get_files(generated) + files2: List[str] = get_files(expected) + + set1 = set(files1) + set2 = set(files2) + + missing_files = set1.symmetric_difference(set2) + + assert not missing_files, f"files missing: {', '.join(list(missing_files))}" + + file_pairs: List[Tuple[str, str]] = [ + (os.path.join(generated, gen_file), os.path.join(expected, exp_file)) + for gen_file, exp_file in zip(files1, files2) + if gen_file.endswith(".txt") + ] + return file_pairs + + +def get_files(directory) -> List[str]: + """ + Recursively get all files in a directory + """ + file_list = [] + for root, _, files in os.walk(directory): + for file in files: + relative_path = os.path.relpath(os.path.join(root, file), directory) + file_list.append(relative_path) + return file_list diff --git a/tests/run_dev_tests.sh b/tests/run_dev_tests.sh new file mode 100755 index 0000000..9876cde --- /dev/null +++ b/tests/run_dev_tests.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash + +# DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +# src/kinfin.py -g example/OrthologousGroups.txt -c example/config.txt -s example/SequenceIDs.txt -t example/tree.nwk -o example/test -p example/SpeciesIDs.txt -a example/fasta/ -f example/functional_annotation.txt --min_proteomes 2 +# #src/kinfin.py -g example/OrthologousGroups.txt -c example/config.txt -s example/SequenceIDs.txt -t example/tree.nwk -o example/test -p example/SpeciesIDs.txt -a example/fasta/ -f example/functional_annotation.txt --min_proteomes 2 --test kruskal +# #src/kinfin.py -g example/OrthologousGroups.txt -c example/config.txt -s example/SequenceIDs.txt -t example/tree.nwk -o example/test -p example/SpeciesIDs.txt -a example/fasta/ -f example/functional_annotation.txt --min_proteomes 2 --test ks + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +cd "../$DIR" + +# To exit on error +set -e + +handle_error() { + echo "Error: $1" >&2 + exit 1 +} + +src/main.py analyse -g "example/OrthologousGroups.txt" -c "example/config.txt" -s "example/SequenceIDs.txt" -o "result/example" || handle_error "Failed to run basic analysis with new tool." + +# # Function to check if a directory exists and is not empty +# function is_directory_not_empty { +# local dir="$1" +# if [ -d "$dir" ] && [ "$(ls -A $dir)" ]; then +# return 0 # Directory exists and is not empty +# else +# return 1 # Directory does not exist or is empty +# fi +# } + + +# if ! 
is_directory_not_empty ".test_data"; then +# echo "Extracting test data..." +# tar -xzvf ./tests/test_data.tar.gz -C "./" || handle_error "Failed to extract test data." +# else +# echo "Test data is already extracted and present." +# fi + +# # echo "Running basic analysis with old tool (kinfin.py)..." +# # src/kinfin.py -g ".test_data/basic/input/Orthogroups.txt" -c ".test_data/basic/input/kinfin.config.basic.txt" -s ".test_data/basic/input/kinfin.SequenceIDs.txt" -o "result/basic.cli.old" || handle_error "Failed to run basic analysis with old tool." + +# echo "Running basic analysis with new tool (main.py)..." +# src/main.py analyse -g ".test_data/basic/input/Orthogroups.txt" -c ".test_data/basic/input/kinfin.config.basic.txt" -s ".test_data/basic/input/kinfin.SequenceIDs.txt" -o "result/basic.cli.new" || handle_error "Failed to run basic analysis with new tool." + +# # echo "Comparing output of old and new tools for basic analysis..." +# # pytest -v ./tests/test_output_match.py --expected result/basic.cli.old --generated result/basic.cli.new + +# # Check pytest exit status +# if [ $? -ne 0 ]; then +# echo "Basic test failed. Stopping execution." +# exit 1 +# fi + +# # If we get here, the basic test passed, so continue with advanced analysis +# # echo "Running advanced analysis with old tool (kinfin.py)..." +# # src/kinfin.py -g ".test_data/advanced/input/Orthogroups.txt" -c ".test_data/advanced/input/kinfin.config.advanced.txt" -s ".test_data/advanced/input/kinfin.SequenceIDs.txt" -o "result/advanced.cli.old" -p ".test_data/advanced/input/kinfin.SpeciesIDs.txt" -a ".test_data/advanced/input/fastas/" -t ".test_data/advanced/input/kinfin.tree.nwk" -f ".test_data/advanced/input/kinfin.functional_annotation.txt" || handle_error "Failed to run advanced analysis with old tool." + +# echo "Running advanced analysis with new tool (main.py)..." +# src/main.py analyse -g ".test_data/advanced/input/Orthogroups.txt" -c ".test_data/advanced/input/kinfin.config.advanced.txt" -s ".test_data/advanced/input/kinfin.SequenceIDs.txt" -o "result/advanced.cli.new" -p ".test_data/advanced/input/kinfin.SpeciesIDs.txt" -a ".test_data/advanced/input/fastas/" -t ".test_data/advanced/input/kinfin.tree.nwk" -f ".test_data/advanced/input/kinfin.functional_annotation.txt" || handle_error "Failed to run advanced analysis with new tool." + +# # echo "Comparing output of old and new tools for advanced analysis..." 
+# # pytest -v ./tests/test_output_match.py --expected result/advanced.cli.old --generated result/advanced.cli.new
\ No newline at end of file
diff --git a/tests/test_output_match.py b/tests/test_output_match.py
new file mode 100644
index 0000000..f3c9905
--- /dev/null
+++ b/tests/test_output_match.py
@@ -0,0 +1,30 @@
+def compare_files(gen_file, exp_file):
+    """
+    Dispatch the comparison based on file type: .txt files are compared with
+    check_is_mismatch (True means the files differ); other files are skipped
+    and always return False.
+    """
+    if gen_file.endswith(".txt"):
+        return check_is_mismatch(gen_file, exp_file)
+    else:
+        return False
+
+
+def check_is_mismatch(gen_file, exp_file):
+    """
+    Compare two text files line by line, ignoring blank lines, surrounding
+    whitespace and line order. Returns True if the files differ.
+    """
+    with open(gen_file, "r") as f1, open(exp_file, "r") as f2:
+        gen_lines = f1.readlines()
+        exp_lines = f2.readlines()
+        # Remove empty lines and strip whitespace
+        gen_lines = [line.strip() for line in gen_lines if line.strip()]
+        exp_lines = [line.strip() for line in exp_lines if line.strip()]
+        # Sort lines
+        gen_lines.sort()
+        exp_lines.sort()
+        # Compare sorted lines
+        return gen_lines != exp_lines
+
+
+def test_compare_files(gen_file, exp_file):
+    mismatch = compare_files(gen_file, exp_file)
+    assert not mismatch, f"Files '{gen_file}' and '{exp_file}' have mismatches"
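+
+
+# Usage sketch (paths are illustrative): these tests are parametrised via the
+# --generated/--expected options defined in tests/conftest.py, so a typical
+# invocation would be:
+#
+#   pytest -v tests/test_output_match.py \
+#       --generated result/basic.cli.new \
+#       --expected result/basic.cli.old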