diff --git a/.eggs/README.txt b/.eggs/README.txt deleted file mode 100644 index 5d01668..0000000 --- a/.eggs/README.txt +++ /dev/null @@ -1,6 +0,0 @@ -This directory contains eggs that were downloaded by setuptools to build, test, and run plug-ins. - -This directory caches those eggs to prevent repeated downloads. - -However, it is safe to delete this directory. - diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..7aca9f0 --- /dev/null +++ b/.env.example @@ -0,0 +1,5 @@ +CLUSTER_FILE_PATH=/absolute/path/to/Orthogroups.txt +SEQUENCE_IDS_FILE_PATH=/absolute/path/to/SequenceIDs.txt +TAXON_IDX_MAPPING_FILE_PATH=/absolute/path/to/taxon_idx_mapping.json +RESULTS_BASE_DIR=/absolute/path/where/all/results/should/be/stored/ +SESSION_INACTIVITY_THRESHOLD=24 \ No newline at end of file diff --git a/.gitignore b/.gitignore index dde7931..3f89431 100644 --- a/.gitignore +++ b/.gitignore @@ -9,3 +9,9 @@ example/test.* build/ dist/ +venv +.test_data +result +.DS_Store +.env +data \ No newline at end of file diff --git a/build/lib/kinfin/kinfin.py b/build/lib/kinfin/kinfin.py deleted file mode 100644 index 68ecdbc..0000000 --- a/build/lib/kinfin/kinfin.py +++ /dev/null @@ -1,2195 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -""" -usage: kinfin-d.py -g -c -s [-t ] [-o ] - [--infer_singletons] [--plot_tree] - [-p ] [-a ] - [--functional_annotation ] - [--nodesdb ] [--taxranks ] - [-f ] [-n ] [--min ] [--max ] - [-r ] [--min_proteomes ] - [--fontsize ] [--plotsize INT,INT] - [--plotfmt ] - [-h|--help] - - Options: - -h --help show this - - Input files - -g, --cluster_file OrthologousGroups.txt produced by OrthoFinder - -c, --config_file Config file (in CSV format) - -s, --sequence_ids_file SequenceIDs.txt used in OrthoFinder - - -p, --species_ids_file SpeciesIDs.txt used in OrthoFinder - --functional_annotation Mapping of ProteinIDs to GO/IPRS/SignalP/Pfam/... (can be generated through 'iprs_to_table.py') - -a, --fasta_dir Directory of FASTA files - --nodesdb nodesdb file (in data/ folder, has to be uncompressed) - -t, --tree_file Tree file (on which ALOs are defined) - General options - -o, --outprefix Output prefix - --infer_singletons Absence of proteins in clustering is interpreted as singleton (based on SequenceIDs.txt) - --plot_tree Plot annotated phylogenetic tree (requires full ETE3 installation and X-server/xvfb-run) - --min_proteomes Required number of proteomes in a taxon-set to be used - in rarefaction/representation-test computations [default: 2] - --taxranks Taxonomic ranks to be inferred from TaxID [default: phylum,order,genus] - -r, --repetitions Number of repetitions for rarefaction curves [default: 30] - "Fuzzy"-Orthology-groups - -f, --target_fraction Minimum proportion of proteomes with target protein count [default: 0.75]. 
- -n, --target_count Target protein count by proteome in (100*F)% of cluster [default: 1] - --min Min count of proteins by proteome in (100*(1-F))% of cluster [default: 0] - --max Max count of proteins by proteome in (100*(1-F))% of cluster [default: 100] - Plotting - --fontsize Fontsize for plots [default: 18] - --plotsize Size (WIDTH,HEIGHT) for plots [default: 24,12] - --plotfmt Plot formats [default: pdf] - -""" - - -######################################################################## -# Imports -######################################################################## - -from __future__ import division -import sys -from os.path import isfile, join, exists, realpath, dirname -from os import getcwd, mkdir, remove, environ -import shutil -import random -import time -import urllib -from decimal import Decimal - -from collections import Counter, defaultdict -from math import sqrt, log - -import_errors = [] -try: - from docopt import docopt -except ImportError: - import_errors.append("[ERROR] : Module \'Docopt\' was not found. Please install \'Docopt\' using \'pip install docopt\'") -try: - import matplotlib as mat - mat.use("agg") -except ImportError: - import_errors.append("[ERROR] : Module \'Matplotlib\' was not found. Please install \'Matplotlob\' using \'pip install matplotlib\'") -try: - import scipy -except ImportError: - import_errors.append("[ERROR] : Module \'SciPy\' was not found. Please install \'SciPy\' using \'pip install scipy\'") - -if import_errors: - sys.exit("\n".join(import_errors)) - -import numpy as np -from matplotlib.ticker import FormatStrFormatter -import matplotlib.pyplot as plt -plt.style.use('ggplot') -mat.rc('ytick', labelsize=20) -mat.rc('xtick', labelsize=20) -axis_font = {'size': '20'} -mat.rcParams.update({'font.size': 22}) - -######################################################################## -# General functions -######################################################################## - - -def retrieve_ftp(remote_f, local_f): - try: - print "[STATUS] - Downloading '%s' to '%s'." % (remote_f, local_f) - urllib.urlretrieve(remote_f, local_f) - except IOError: - sys.exit("[ERROR] : '%s' could not be downloaded." % (remote_f)) - - -def check_file(infile): - if infile: - if not isfile(infile): - sys.exit("[ERROR] : %s does not exist." 
% (infile)) - - -def get_attribute_cluster_type(singleton, implicit_protein_ids_by_proteome_id_by_level): - if singleton: - return 'singleton' - else: - if len(implicit_protein_ids_by_proteome_id_by_level) > 1: - return 'shared' - else: - return 'specific' - - -def get_ALO_cluster_cardinality(ALO_proteome_counts_in_cluster): - if len(ALO_proteome_counts_in_cluster) > 2: - ALO_proteome_counts_in_cluster_length = len(ALO_proteome_counts_in_cluster) - if all(count == 1 for count in ALO_proteome_counts_in_cluster): - return 'true' - else: - ALO_proteome_counts_in_cluster_at_fuzzycount_count = len([ALO_proteome_counts for ALO_proteome_counts in ALO_proteome_counts_in_cluster if ALO_proteome_counts == inputObj.fuzzy_count]) - ALO_proteome_counts_in_cluster_in_fuzzyrange_count = len([ALO_proteome_counts for ALO_proteome_counts in ALO_proteome_counts_in_cluster if ALO_proteome_counts in inputObj.fuzzy_range]) - fuzzy_fraction = ALO_proteome_counts_in_cluster_at_fuzzycount_count / ALO_proteome_counts_in_cluster_length - if fuzzy_fraction >= inputObj.fuzzy_fraction: - if ALO_proteome_counts_in_cluster_at_fuzzycount_count + ALO_proteome_counts_in_cluster_in_fuzzyrange_count == ALO_proteome_counts_in_cluster_length: - return 'fuzzy' - return None - - -def mannwhitneyu(count_1, count_2): - pvalue, log2_mean, mean_count_1, mean_count_2 = None, None, None, None - implicit_count_1 = [count for count in count_1 if count > 0] - implicit_count_2 = [count for count in count_2 if count > 0] - if len(implicit_count_1) >= inputObj.min_proteomes and len(implicit_count_2) >= inputObj.min_proteomes: - try: - pvalue = scipy.stats.mannwhitneyu(implicit_count_1, implicit_count_2, alternative="two-sided")[1] - except: - pvalue = 1.0 - mean_count_1 = mean(implicit_count_1) - mean_count_2 = mean(implicit_count_2) - log2_mean = log((mean(implicit_count_1)/mean(implicit_count_2)), 2) - return pvalue, log2_mean, mean_count_1, mean_count_2 - - -def get_lineage(taxid, nodesdb): - lineage = {taxrank: 'undef' for taxrank in inputObj.taxranks} - parent = '' - node = taxid - while parent != "1": - taxrank = nodesdb[node]['rank'] - name = nodesdb[node]['name'] - parent = nodesdb[node]['parent'] - if taxrank in inputObj.taxranks: - lineage[taxrank] = name - node = parent - return lineage - - -def parse_nodesdb(nodesdb_f): - nodesdb = {} - nodesdb_count = 0 - nodes_count = 0 - for line in read_file(nodesdb_f): - if line.startswith("#"): - nodesdb_count = int(line.lstrip("# nodes_count = ").rstrip("\n")) - else: - nodes_count += 1 - node, rank, name, parent = line.rstrip("\n").split("\t") - nodesdb[node] = {'rank': rank, 'name': name, 'parent': parent} - if nodesdb_count: - progress(nodes_count, 1000, nodesdb_count) - return nodesdb - - -def parse_mapping(mapping_file_by_domain_source): - domain_description_by_domain_id_by_domain_source = {} - if mapping_file_by_domain_source: - for domain_source, mapping_f in mapping_file_by_domain_source.items(): - if domain_source == 'Pfam': - domain_description_by_domain_id_by_domain_source[domain_source] = {} - print "[STATUS] - Parsing %s ... 
" % (mapping_f) - for line in read_file(mapping_f): - temp = line.split("\t") - domain_id = temp[0] - domain_desc = temp[4] - if domain_id not in domain_description_by_domain_id_by_domain_source[domain_source]: - domain_description_by_domain_id_by_domain_source[domain_source][domain_id] = domain_desc - else: - if not domain_desc == domain_description_by_domain_id_by_domain_source[domain_source][domain_id]: - sys.exit("[ERROR] : Conflicting descriptions for %s" % (domain_id)) - elif domain_source == 'GO': - domain_description_by_domain_id_by_domain_source['GO'] = {} - print "[STATUS] - Parsing %s ... " % (mapping_f) - for line in read_file(mapping_f): - if not line.startswith("!"): - temp = line.replace(" > ", "|").split("|") - go_string = temp[1].split(";") - go_desc, go_id = go_string[0].replace("GO:", ""), go_string[1].lstrip(" ") - if go_id not in domain_description_by_domain_id_by_domain_source['GO']: - domain_description_by_domain_id_by_domain_source['GO'][go_id] = go_desc - else: - if not go_desc == domain_description_by_domain_id_by_domain_source['GO'][go_id]: - sys.exit("[ERROR] : Conflicting descriptions for %s" % (go_id)) - elif domain_source == 'IPR': - domain_description_by_domain_id_by_domain_source['IPR'] = {} - print "[STATUS] - Parsing %s ... " % (mapping_f) - for line in read_file(mapping_f): - if not line.startswith("Active_site"): - temp = line.split() - ipr_id = temp[0] - ipr_desc = " ".join(temp[1:]) - if ipr_id not in domain_description_by_domain_id_by_domain_source['IPR']: - domain_description_by_domain_id_by_domain_source['IPR'][ipr_id] = ipr_desc - else: - if not ipr_desc == domain_description_by_domain_id_by_domain_source['IPR'][ipr_id]: - sys.exit("[ERROR] : Conflicting descriptions for %s" % (ipr_id)) - return domain_description_by_domain_id_by_domain_source - - -def parse_tree(tree_f, outgroups): - check_file(tree_f) - print "[STATUS] - Parsing Tree file : %s ..." % (tree_f) - tree_ete = ete3.Tree(tree_f) - if len(outgroups) > 1: - outgroup_node = tree_ete.get_common_ancestor(outgroups) - try: - tree_ete.set_outgroup(outgroup_node) - print "[STATUS] - Setting LCA of %s as outgroup : ..." % (",".join(outgroups)) - except ete3.coretype.tree.TreeError: - print "[STATUS] - Tree seems to be rooted already : ..." - else: - print "[STATUS] - Setting %s as outgroup : ..." 
% (",".join(outgroups)) - tree_ete.set_outgroup(outgroups[0]) - print tree_ete - node_idx_by_proteome_ids = {} - for idx, node in enumerate(tree_ete.traverse("levelorder")): - proteome_ids = frozenset([leaf.name for leaf in node]) - if not node.name: - node.add_features( - name="n%s" % (idx), - nodetype="node", - proteome_ids=proteome_ids, - apomorphic_cluster_counts={'singletons': 0, 'non_singletons': 0}, - synapomorphic_cluster_counts={'complete_presence': 0, 'stochastic_absence': 0}, - synapomorphic_cluster_strings=[], - counts={'specific': 0, 'shared': 0, "absent": 0, "singleton": 0}) - else: - node.add_features( - nodetype="tip", - proteome_ids=proteome_ids, - apomorphic_cluster_counts={'singletons': 0, 'non_singletons': 0}, - synapomorphic_cluster_counts={'complete_presence': 0, 'stochastic_absence': 0}, - synapomorphic_cluster_strings=[], - counts={'specific': 0, 'shared': 0, "absent": 0, "singleton": 0}) - node_idx_by_proteome_ids[proteome_ids] = node.name - return tree_ete, node_idx_by_proteome_ids - - -def readFastaLen(infile): - with open(infile) as fh: - header, seqs = '', [] - for l in fh: - if l[0] == '>': - if header: - header = header.replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") # orthofinder replaces chars - yield header, len(''.join(seqs)) - header, seqs = l[1:-1].split()[0], [] # Header is split at first whitespace - else: - seqs.append(l[:-1]) - header = header.replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") # orthofinder replaces chars - yield header, len(''.join(seqs)) - - -def median(lst): - list_sorted = sorted(lst) - list_length = len(lst) - index = (list_length - 1) // 2 - if list_length % 2: - return list_sorted[index]/1.0 - else: - return (list_sorted[index] + list_sorted[index + 1])/2.0 - - -def mean(lst): - if lst: - return float(sum(lst)) / len(lst) - else: - return 0.0 - - -def sd(lst, population=True): - n = len(lst) - differences = [x_ - mean(lst) for x_ in lst] - sq_differences = [d ** 2 for d in differences] - ssd = sum(sq_differences) - if population is True: - variance = ssd / n - else: - variance = ssd / (n - 1) - sd_result = sqrt(variance) - return sd_result - - -def progress(iteration, steps, max_value): - if int(iteration) == int(max_value): - sys.stdout.write('\r') - print "[PROGRESS] \t- %d%%" % (100) - elif int(iteration) % int(steps + 1) == 0: - sys.stdout.write('\r') - print "[PROGRESS] \t- %d%%" % (float(int(iteration) / int(max_value)) * 100), - sys.stdout.flush() - else: - pass - - -def read_file(infile): - if not infile or not exists(infile): - sys.exit("[ERROR] - File '%s' does not exist." 
% (infile)) - if infile.endswith(".gz"): - import gzip - with gzip.open(infile) as fh: - for line in fh: - yield line.rstrip("\n") - else: - with open(infile) as fh: - for line in fh: - yield line.rstrip("\n") - -######################################################################## -# CLASS : DataFactory -######################################################################## - - -class DataFactory(): - def __init__(self): - self.dirs = None - - ############################### - ### build_AloCollection - ############################### - - def build_AloCollection(self): - config_f = inputObj.config_f - nodesdb_f = inputObj.nodesdb_f - tree_f = inputObj.tree_f - proteomes, proteome_id_by_species_id, attributes, level_by_attribute_by_proteome_id = self.parse_attributes(config_f) - # Add taxonomy if needed - if 'TAXID' in set(attributes): - print "[STATUS] - Attribute 'TAXID' found, inferring taxonomic ranks from nodesDB..." - attributes, level_by_attribute_by_proteome_id = self.add_taxid_attributes(nodesdb_f, attributes, level_by_attribute_by_proteome_id) - # Add ALOs from tree if provided - tree_ete = None - node_idx_by_proteome_ids = None - if tree_f: - outgroups = [] - if not "OUT" in attributes: - sys.exit("[ERROR] - Please specify one of more outgroup taxa in the config file.") - outgroups = [proteome_id for proteome_id in proteomes if level_by_attribute_by_proteome_id[proteome_id]["OUT"] == "1"] - tree_ete, node_idx_by_proteome_ids = parse_tree(tree_f, outgroups) - print "[STATUS] - Building AloCollection ..." - return AloCollection(proteomes, proteome_id_by_species_id, attributes, level_by_attribute_by_proteome_id, tree_ete, node_idx_by_proteome_ids) - - ############################### - ### build_AloCollection parse_attributes - ############################### - - def parse_attributes(self, config_f): - print "[STATUS] - Parsing SpeciesClassification file: %s ..." % (config_f) - attributes = [] - level_by_attribute_by_proteome_id = {} - proteomes = set() - proteome_id_by_species_id = {} - for line in read_file(config_f): - if line.startswith("#"): - if not attributes: - attributes = [x.strip() for x in line.lstrip("#").split(",")] - if not 'IDX' == attributes[0] or not 'TAXON' == attributes[1]: - sys.exit("[ERROR] - First/second element have to be IDX/TAXON.\n\t%s" % (attributes)) - else: - pass # accounts for SpeciesIDs that are commented out for Orthofinder - elif line.strip(): - temp = line.split(",") - if not len(temp) == len(attributes): - sys.exit("[ERROR] - number of columns in line differs from header\n\t%s\n\t%s" % (attributes, temp)) - if temp[1] in proteomes: - sys.exit("[ERROR] - 'TAXON' should be unique. 
%s was encountered multiple times" % (temp[0])) - species_id = temp[0] - proteome_id = temp[1] - proteomes.add(proteome_id) - proteome_id_by_species_id[species_id] = proteome_id - level_by_attribute_by_proteome_id[proteome_id] = {x : '' for x in attributes} - for idx, level in enumerate(temp): - attribute = attributes[idx] - level_by_attribute_by_proteome_id[proteome_id][attribute] = level - level_by_attribute_by_proteome_id[proteome_id]['all'] = 'all' - else: - pass - attributes.insert(0, "all") # append to front - return proteomes, proteome_id_by_species_id, attributes, level_by_attribute_by_proteome_id - - ############################### - ### build_AloCollection add_taxid_attributes - ############################### - - def add_taxid_attributes(self, nodesdb_f, attributes, level_by_attribute_by_proteome_id): - if nodesdb_f: - check_file(nodesdb_f) - else: - sys.exit("[ERROR] - Please provide a nodesDB file or remove the 'TAXID' attribute") - print "[STATUS] - Parsing nodesDB %s" % (nodesdb_f) - NODESDB = parse_nodesdb(nodesdb_f) - for proteome_id in level_by_attribute_by_proteome_id: - taxid = level_by_attribute_by_proteome_id[proteome_id]['TAXID'] - lineage = get_lineage(taxid, NODESDB) - # add lineage attribute/levels - for taxrank in inputObj.taxranks: - level_by_attribute_by_proteome_id[proteome_id][taxrank] = lineage[taxrank].replace(" ", "_") - # remove taxid-levels - del level_by_attribute_by_proteome_id[proteome_id]['TAXID'] - # remove taxid-attribute - attributes.remove('TAXID') - # add taxranks to rank - for taxrank in inputObj.taxranks: - attributes.append(taxrank) - self.nodesdb_file = nodesdb_f - return attributes, level_by_attribute_by_proteome_id - - ############################### - ### setup_dirs - ############################### - - def setup_dirs(self, inputObj): - outprefix = inputObj.outprefix - self.dirs = {} - if outprefix: - result_path = join(getcwd(), "%s.kinfin_results" % (outprefix)) - else: - result_path = join(getcwd(), "kinfin_results") - self.dirs['main'] = result_path - print "[STATUS] - Output directories in \n\t%s" % (result_path) - if exists(result_path): - print "[STATUS] - Directory exists. Deleting directory ..." - shutil.rmtree(result_path) - print "[STATUS] - Creating directories ..." - mkdir(result_path) - for attribute in aloCollection.attributes: - attribute_path = join(result_path, attribute) - self.dirs[attribute] = attribute_path - if not exists(attribute_path): - print "\t%s" % (attribute_path) - mkdir(attribute_path) - if aloCollection.tree_ete: - tree_path = join(result_path, "tree") - node_chart_path = join(tree_path, "charts") - node_header_path = join(tree_path, "headers") - self.dirs["tree"] = tree_path - self.dirs["tree_charts"] = node_chart_path - self.dirs["tree_headers"] = node_header_path - if not exists(tree_path): - print "\t%s" % (tree_path) - mkdir(tree_path) - print "\t%s" % (node_chart_path) - mkdir(node_chart_path) - print "\t%s" % (node_header_path) - mkdir(node_header_path) - - ############################### - ### build_ProteinCollection - ############################### - - def build_ProteinCollection(self, inputObj): - # PARSE PROTEINS - proteinObjs = [] - sequence_ids_f = inputObj.sequence_ids_f - print "[STATUS] - Parsing sequence IDs: %s ..." 
% sequence_ids_f - for line in read_file(sequence_ids_f): - temp = line.split(": ") - sequence_id = temp[0] - protein_id = temp[1].split(" ")[0].replace(":", "_").replace(",", "_").replace("(", "_").replace(")", "_") # orthofinder replaces characters - species_id = sequence_id.split("_")[0] - proteome_id = aloCollection.proteome_id_by_species_id.get(species_id, None) - if proteome_id: - proteinObj = ProteinObj(protein_id, proteome_id, species_id, sequence_id) - proteinObjs.append(proteinObj) - #else: - # sys.exit("[ERROR] - Offending SequenceID : %s (unknown species_id %s)" % (line, species_id)) - proteinCollection = ProteinCollection(proteinObjs) - print "[STATUS]\t - Proteins found = %s" % (proteinCollection.protein_count) - - # PARSE FASTA DIR - fasta_dir = inputObj.fasta_dir - species_ids_f = inputObj.species_ids_f - if fasta_dir: - print "[STATUS] - Parsing FASTAs ..." - fasta_file_by_species_id = self.parse_species_ids(species_ids_f) - fasta_len_by_protein_id = self.parse_fasta_dir(fasta_dir, fasta_file_by_species_id) - print "[STATUS] - Adding FASTAs to ProteinCollection ..." - parse_steps = proteinCollection.protein_count/100 - for idx, proteinObj in enumerate(proteinCollection.proteinObjs): - proteinObj.add_length(fasta_len_by_protein_id[proteinObj.protein_id]) - progress(idx+1, parse_steps, proteinCollection.protein_count) - aloCollection.fastas_parsed = True - proteinCollection.fastas_parsed = True - else: - print "[STATUS] - No Fasta-Dir given, no AA-span information will be reported ..." - - # PARSE DOMAINS - functional_annotation_f = inputObj.functional_annotation_f - if functional_annotation_f: - # PARSE DOMAINS - print "[STATUS] - Parsing %s ... this may take a while" % (functional_annotation_f) - for line in read_file(functional_annotation_f): - temp = line.split() - if temp[0].startswith("#"): - proteinCollection.domain_sources = temp[1:] - else: - if not proteinCollection.domain_sources: - sys.exit("[ERROR] - %s does not seem to have a header." 
% (functional_annotation_f)) - domain_protein_id = temp.pop(0) - go_terms = [] - domain_counter_by_domain_source = {} - for idx, field in enumerate(temp): - if not field == "None": - domain_source = proteinCollection.domain_sources[idx] - domain_string = field.split(";") - domain_counts_by_domain_id = {} - for domain_id_count in domain_string: - domain_id, domain_count = '', 1 - if domain_source == "GO": - domain_id = domain_id_count - else: - domain_id, domain_count = domain_id_count.split(":") - domain_counts_by_domain_id[domain_id] = int(domain_count) - domain_counter = Counter(domain_counts_by_domain_id) - domain_counter_by_domain_source[domain_source] = domain_counter - proteinCollection.add_annotation_to_proteinObj(domain_protein_id, domain_counter_by_domain_source, go_terms) - proteinCollection.functional_annotation_parsed = True - mapping_file_by_domain_source = {} - if inputObj.pfam_mapping and "Pfam" in proteinCollection.domain_sources: - mapping_file_by_domain_source["Pfam"] = inputObj.pfam_mapping_f - if inputObj.ipr_mapping and "IPR" in proteinCollection.domain_sources: - mapping_file_by_domain_source["IPR"] = inputObj.ipr_mapping_f - if inputObj.go_mapping_f: - mapping_file_by_domain_source["GO"] = inputObj.go_mapping_f - proteinCollection.domain_description_by_domain_id_by_domain_source = parse_mapping(mapping_file_by_domain_source) - - return proteinCollection - - ############################### - ### build_ProteinCollection : parse_species_ids - ############################### - - def parse_species_ids(self, species_ids_f): - fasta_by_ortho_id = {} - for line in read_file(species_ids_f): - if not line.startswith("#"): - idx, fasta = line.split(": ") - fasta_by_ortho_id[idx] = fasta - self.species_ids_file = species_ids_f - return fasta_by_ortho_id - - ############################### - ### build_ProteinCollection : parse_fasta_dir - ############################### - - def parse_fasta_dir(self, fasta_dir, fasta_file_by_species_id): - fasta_len_by_protein_id = {} - for species_id, fasta_f in fasta_file_by_species_id.items(): - fasta_path = join(fasta_dir, fasta_f) - if not isfile(fasta_path): - sys.exit("[ERROR] - %s does not exist." % (fasta_path)) - print "[STATUS]\t - Parsing FASTA %s" % (fasta_path) - for header, length in readFastaLen(fasta_path): - fasta_len_by_protein_id[header] = length - self.fasta_dir = fasta_dir - return fasta_len_by_protein_id - - ############################### - ### build_ClusterCollection - ############################### - - def build_ClusterCollection(self, inputObj): - cluster_f = inputObj.cluster_f - print "[STATUS] - Parsing %s ... this may take a while" % (cluster_f) - clusterObjs = [] - with open(cluster_f) as fh: - for line in fh: - temp = line.rstrip("\n").split(" ") - cluster_id, protein_string = temp[0].replace(":", ""), temp[1:] - protein_string = [protein_id for protein_id in protein_string if protein_id] - clusterObj = ClusterObj(cluster_id, protein_string) - for protein_id in protein_string: - proteinObj = proteinCollection.proteinObjs_by_protein_id[protein_id] - proteinObj.clustered = True - clusterObjs.append(clusterObj) - inferred_singletons_count = 0 - if inputObj.infer_singletons: - print "[STATUS] - Inferring singletons ..." 
- singleton_idx = 0 - for proteinObj in proteinCollection.proteinObjs: - if proteinObj.clustered == False: - cluster_id = "singleton_%s" % singleton_idx - clusterObj = ClusterObj(cluster_id, [proteinObj.protein_id]) - clusterObjs.append(clusterObj) - singleton_idx += 1 - inferred_singletons_count = singleton_idx - return ClusterCollection(clusterObjs, inferred_singletons_count, proteinCollection.functional_annotation_parsed, proteinCollection.fastas_parsed, proteinCollection.domain_sources) - - ############################### - ### write_output - ############################### - - def write_output(self): - self.plot_cluster_sizes() - self.write_cluster_metrics() - - ############################### - ### write_output : write_ALO_stats - ############################### - - def plot_cluster_sizes(self): - cluster_protein_count = [] - for clusterObj in clusterCollection.clusterObjs: - cluster_protein_count.append(clusterObj.protein_count) - cluster_protein_counter = Counter(cluster_protein_count) - count_plot_f = join(self.dirs['main'], "cluster_size_distribution.%s" % (inputObj.plot_format)) - f, ax = plt.subplots(figsize=inputObj.plot_size) - ax.set_facecolor('white') - x_values = [] - y_values = [] - for value, count in cluster_protein_counter.items(): - x_values.append(value) - y_values.append(count) - x_array = np.array(x_values) - y_array = np.array(y_values) - ax.scatter(x_array, y_array, marker='o', alpha=0.8, s=100) - ax.set_xlabel('Cluster size', fontsize=inputObj.plot_font_size) - ax.set_ylabel('Count', fontsize=inputObj.plot_font_size) - ax.set_yscale('log') - ax.set_xscale('log') - plt.margins(0.8) - plt.gca().set_ylim(bottom=0.8) - plt.gca().set_xlim(left=0.8) - ax.xaxis.set_major_formatter(FormatStrFormatter('%.0f')) - ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f')) - f.tight_layout() - - ax.grid(True, linewidth=1, which="major", color="lightgrey") - print "[STATUS] - Plotting %s" % (count_plot_f) - f.savefig(count_plot_f, format=inputObj.plot_format) - plt.close() - - def get_header_line(self, filetype, attribute): - if filetype == "attribute_metrics": - attribute_metrics_header = [] - attribute_metrics_header.append("#attribute") - attribute_metrics_header.append("taxon_set") - attribute_metrics_header.append("cluster_total_count") - attribute_metrics_header.append("protein_total_count") - attribute_metrics_header.append("protein_total_span") - attribute_metrics_header.append("singleton_cluster_count") - attribute_metrics_header.append("singleton_protein_count") - attribute_metrics_header.append("singleton_protein_span") - attribute_metrics_header.append("specific_cluster_count") - attribute_metrics_header.append("specific_protein_count") - attribute_metrics_header.append("specific_protein_span") - attribute_metrics_header.append("shared_cluster_count") - attribute_metrics_header.append("shared_protein_count") - attribute_metrics_header.append("shared_protein_span") - attribute_metrics_header.append("specific_cluster_true_1to1_count") - attribute_metrics_header.append("specific_cluster_fuzzy_count") - attribute_metrics_header.append("shared_cluster_true_1to1_count") - attribute_metrics_header.append("shared_cluster_fuzzy_count") - attribute_metrics_header.append("absent_cluster_total_count") - attribute_metrics_header.append("absent_cluster_singleton_count") - attribute_metrics_header.append("absent_cluster_specific_count") - attribute_metrics_header.append("absent_cluster_shared_count") - attribute_metrics_header.append("TAXON_count") - 
attribute_metrics_header.append("TAXA") - return "\t".join(attribute_metrics_header) - elif filetype == "cluster_metrics_ALO": - cluster_metrics_ALO_header = [] - cluster_metrics_ALO_header.append("#cluster_id") - cluster_metrics_ALO_header.append("cluster_status") - cluster_metrics_ALO_header.append("cluster_type") - cluster_metrics_ALO_header.append("cluster_protein_count") - cluster_metrics_ALO_header.append("cluster_proteome_count") - cluster_metrics_ALO_header.append("taxon_protein_count") - cluster_metrics_ALO_header.append("taxon_mean_count") - cluster_metrics_ALO_header.append("non_taxon_mean_count") - cluster_metrics_ALO_header.append("representation") - cluster_metrics_ALO_header.append("log2_mean(TAXON/others)") - cluster_metrics_ALO_header.append("mwu_pvalue(TAXON vs. others)") - cluster_metrics_ALO_header.append("taxon_proteome_coverage") - cluster_metrics_ALO_header.append("taxon_proteomes_present_count") - cluster_metrics_ALO_header.append("taxon_proteomes_present") - #for domain_source in clusterCollection.domain_sources: - # cluster_metrics_ALO_header.append(domain_source) - return "\t".join(cluster_metrics_ALO_header) - elif filetype == "cluster_metrics": - cluster_metrics_header = [] - cluster_metrics_header.append("#cluster_id") - cluster_metrics_header.append("cluster_protein_count") - cluster_metrics_header.append("protein_median_count") - cluster_metrics_header.append("TAXON_count") - cluster_metrics_header.append("attribute") - cluster_metrics_header.append("attribute_cluster_type") - cluster_metrics_header.append("protein_span_mean") - cluster_metrics_header.append("protein_span_sd") - cluster_metrics_header += ["%s_count" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])] - if not attribute == "TAXON": - cluster_metrics_header += ["%s_median" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])] - cluster_metrics_header += ["%s_cov" % level for level in sorted(aloCollection.ALO_by_level_by_attribute[attribute])] - return "\t".join(cluster_metrics_header) - elif filetype == "cluster_metrics_domains": - cluster_metrics_domains_header = [] - cluster_metrics_domains_header.append("#cluster_id") - cluster_metrics_domains_header.append("cluster_protein_count") - cluster_metrics_domains_header.append("TAXON_count") - cluster_metrics_domains_header.append("protein_span_mean") - cluster_metrics_domains_header.append("protein_span_sd") - cluster_metrics_domains_header.append("fraction_secreted") - for domain_source in clusterCollection.domain_sources: - cluster_metrics_domains_header.append(domain_source) - cluster_metrics_domains_header.append("%s_entropy" % (domain_source)) - return "\t".join(cluster_metrics_domains_header) - elif filetype == "cluster_metrics_domains_detailed": - cluster_metrics_domains_detailed_header = [] - cluster_metrics_domains_detailed_header.append("#cluster_id") - cluster_metrics_domains_detailed_header.append("domain_source") - cluster_metrics_domains_detailed_header.append("domain_id") - cluster_metrics_domains_detailed_header.append("domain_description") - cluster_metrics_domains_detailed_header.append("protein_count") - cluster_metrics_domains_detailed_header.append("protein_count_with_domain") - cluster_metrics_domains_detailed_header.append("TAXA_with_domain_fraction") - cluster_metrics_domains_detailed_header.append("TAXA_with_domain") - cluster_metrics_domains_detailed_header.append("TAXA_without_domain") - return "\t".join(cluster_metrics_domains_detailed_header) - elif filetype == 
"cafe": - cafe_header = [] - cafe_header.append("ID") - for level in sorted(aloCollection.ALO_by_level_by_attribute['TAXON']): - cafe_header.append(level) - return "\t".join(cafe_header) - elif filetype == "pairwise_representation_test": - pairwise_representation_test_header = [] - pairwise_representation_test_header.append("#cluster_id") - pairwise_representation_test_header.append("TAXON_1") - pairwise_representation_test_header.append("TAXON_1_mean") - pairwise_representation_test_header.append("TAXON_2") - pairwise_representation_test_header.append("TAXON_2_mean") - pairwise_representation_test_header.append("log2_mean(TAXON_1/TAXON_2)") - pairwise_representation_test_header.append("mwu_pvalue(TAXON_1 vs. TAXON_2)") - #pairwise_representation_test_header.append("go_terms") - #for domain_source in clusterCollection.domain_sources: - # pairwise_representation_test_header.append(domain_source) - return "\t".join(pairwise_representation_test_header) - elif filetype == 'cluster_1to1s_ALO': - cluster_1to1s_ALO_header = [] - cluster_1to1s_ALO_header.append("#cluster_id") - cluster_1to1s_ALO_header.append("cluster_type") - cluster_1to1s_ALO_header.append("cardinality") - cluster_1to1s_ALO_header.append("proteome_count") - cluster_1to1s_ALO_header.append("percentage_at_target_count") - return "\t".join(cluster_1to1s_ALO_header) - else: - sys.exit("[ERROR] %s is not a valid header 'filetype'" % (filetype)) - - def get_attribute_metrics(self, ALO): - attribute_metrics = [] - attribute_metrics.append(ALO.attribute) - attribute_metrics.append(ALO.level) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('present', 'total')) - attribute_metrics.append(ALO.get_protein_count_by_cluster_type('total')) - attribute_metrics.append(ALO.get_protein_span_by_cluster_type('total')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('present', 'singleton')) - attribute_metrics.append(ALO.get_protein_count_by_cluster_type('singleton')) - attribute_metrics.append(ALO.get_protein_span_by_cluster_type('singleton')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('present', 'specific')) - attribute_metrics.append(ALO.get_protein_count_by_cluster_type('specific')) - attribute_metrics.append(ALO.get_protein_span_by_cluster_type('specific')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('present', 'shared')) - attribute_metrics.append(ALO.get_protein_count_by_cluster_type('shared')) - attribute_metrics.append(ALO.get_protein_span_by_cluster_type('shared')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type('specific', 'true')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type('specific', 'fuzzy')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type('shared', 'true')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type('shared', 'fuzzy')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('absent', 'total')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('absent', 'singleton')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('absent', 'specific')) - attribute_metrics.append(ALO.get_cluster_count_by_cluster_status_by_cluster_type('absent', 'shared')) - attribute_metrics.append(ALO.proteome_count) - 
attribute_metrics.append(ALO.get_proteomes()) - return "\t".join([str(field) for field in attribute_metrics]) - - def write_cluster_metrics(self): - cafe_f = join(self.dirs['main'], "clusters_counts_by_taxon.txt") - cafe_output = [] - cafe_output.append(self.get_header_line('cafe', "TAXON")) - - cluster_metrics_domains_f = join(self.dirs['main'], "cluster_metrics_domains.txt") - cluster_metrics_domains_output = [] - cluster_metrics_domains_output.append(self.get_header_line('cluster_metrics_domains', "TAXON")) - - cluster_metrics_domains_detailed_output_by_domain_source = {} - cluster_metrics_domains_detailed_f_by_domain_source = {} - for domain_source in clusterCollection.domain_sources: - cluster_metrics_domains_detailed_output_by_domain_source[domain_source] = [] - cluster_metrics_domains_detailed_output_by_domain_source[domain_source].append(self.get_header_line('cluster_metrics_domains_detailed', "TAXON")) - cluster_metrics_domains_detailed_f_by_domain_source[domain_source] = join(self.dirs['main'], "cluster_metrics_domains.%s.txt" % (domain_source)) - - for attribute in aloCollection.attributes: - - attribute_metrics_f = join(self.dirs[attribute], "%s.attribute_metrics.txt" % (attribute)) - attribute_metrics_output = [] - attribute_metrics_output.append(self.get_header_line('attribute_metrics', attribute)) - - pairwise_representation_test_f = join(self.dirs[attribute], "%s.pairwise_representation_test.txt" % (attribute)) - pairwise_representation_test_output = [] - pairwise_representation_test_output.append(self.get_header_line('pairwise_representation_test', attribute)) - - pairwise_representation_test_by_pair_by_attribute = {} - - ########################### - # cluster_metrics - ########################### - - cluster_metrics_f = join(self.dirs[attribute], "%s.cluster_metrics.txt" % (attribute)) - cluster_metrics_output = [] - cluster_metrics_output.append(self.get_header_line('cluster_metrics', attribute)) - - levels = sorted([x for x in aloCollection.ALO_by_level_by_attribute[attribute]]) - levels_seen = set() - - for level in levels: - ALO = aloCollection.ALO_by_level_by_attribute[attribute][level] - - ########################### - # attribute_metrics - ########################### - - attribute_metrics_output.append(self.get_attribute_metrics(ALO)) - - ########################### - # cluster_metrics_ALO : setup - ########################### - - cluster_metrics_ALO_f = join(self.dirs[attribute], "%s.%s.cluster_metrics.txt" % (attribute, level)) - cluster_metrics_ALO_output = [] - cluster_metrics_ALO_output.append(self.get_header_line('cluster_metrics_ALO', attribute)) - - background_representation_test_by_pair_by_attribute = {} - - ########################### - # cluster_1to1s - ########################### - - cluster_1to1_ALO_f = join(self.dirs[attribute], "%s.%s.cluster_1to1s.txt" % (attribute, level)) - cluster_1to1_ALO_output = [] - cluster_1to1_ALO_output.append(self.get_header_line('cluster_1to1s_ALO', attribute)) - if not attribute == "TAXON": - for cluster_type in ALO.clusters_by_cluster_cardinality_by_cluster_type: - for cluster_cardinality in ALO.clusters_by_cluster_cardinality_by_cluster_type[cluster_type]: - for cluster_id in ALO.clusters_by_cluster_cardinality_by_cluster_type[cluster_type][cluster_cardinality]: - cluster_1to1_ALO_line = [] - cluster_1to1_ALO_line.append(cluster_id) - cluster_1to1_ALO_line.append(cluster_type) - cluster_1to1_ALO_line.append(cluster_cardinality) - 
cluster_1to1_ALO_line.append(clusterCollection.clusterObjs_by_cluster_id[cluster_id].proteome_count) - cluster_1to1_ALO_line.append("{0:.2f}".format( - len([protein_count for proteome_id, protein_count in clusterCollection.clusterObjs_by_cluster_id[cluster_id].protein_count_by_proteome_id.items() if protein_count == inputObj.fuzzy_count]) / clusterCollection.clusterObjs_by_cluster_id[cluster_id].proteome_count) - ) - cluster_1to1_ALO_output.append("\t".join([str(field) for field in cluster_1to1_ALO_line])) - - for clusterObj in clusterCollection.clusterObjs: - - ########################### - # cluster_metrics (only done once for each attribute) - ########################### - - if not levels_seen: - cluster_metrics_line = [] - cluster_metrics_line.append(clusterObj.cluster_id) - cluster_metrics_line.append(clusterObj.protein_count) - cluster_metrics_line.append(clusterObj.protein_median) - cluster_metrics_line.append(clusterObj.proteome_count) - cluster_metrics_line.append(attribute) - cluster_metrics_line.append(clusterObj.cluster_type_by_attribute[attribute]) - if clusterCollection.fastas_parsed: - cluster_metrics_line.append(clusterObj.protein_length_stats['mean']) - cluster_metrics_line.append(clusterObj.protein_length_stats['sd']) - else: - cluster_metrics_line.append("N/A") - cluster_metrics_line.append("N/A") - for _level in levels: - cluster_metrics_line.append(sum(clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][_level])) - if not attribute == "TAXON": - for _level in levels: - cluster_metrics_line.append(median(clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][_level])) - for _level in levels: - cluster_metrics_line.append("{0:.2f}".format(clusterObj.proteome_coverage_by_level_by_attribute[attribute][_level])) - cluster_metrics_output.append("\t".join([str(field) for field in cluster_metrics_line])) - - ########################### - # cafe (only done for attribute "TAXON") - ########################### - - if not levels_seen and attribute == "TAXON": - cafe_line = [] - #cafe_line.append("None") - cafe_line.append(str(clusterObj.cluster_id)) - for _level in levels: - cafe_line.append(sum(clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][_level])) - cafe_output.append("\t".join([str(field) for field in cafe_line])) - - ########################### - # cluster_metrics_domains (only done for attribute "TAXON") - # - now different: - # - has line for each domain_id for each domain_source - ########################### - - if not levels_seen and attribute == "TAXON": - if clusterCollection.functional_annotation_parsed: - # cluster_metrics_domain_line - cluster_metrics_domains_line = [] - cluster_metrics_domains_line.append(clusterObj.cluster_id) - cluster_metrics_domains_line.append(clusterObj.protein_count) - cluster_metrics_domains_line.append(clusterObj.proteome_count) - if clusterCollection.fastas_parsed: - cluster_metrics_domains_line.append(clusterObj.protein_length_stats['mean']) - cluster_metrics_domains_line.append(clusterObj.protein_length_stats['sd']) - else: - cluster_metrics_domains_line.append("N/A") - cluster_metrics_domains_line.append("N/A") - if "SignalP_EUK" in clusterCollection.domain_sources: - cluster_metrics_domains_line.append("{0:.2f}".format(clusterObj.secreted_cluster_coverage)) - else: - cluster_metrics_domains_line.append("N/A") - for domain_source in clusterCollection.domain_sources: - # cluster_metrics_domains - if domain_source in clusterObj.domain_counter_by_domain_source: - 
cluster_metrics_domains_line.append(";".join(["%s:%s" % (domain_id, count) for domain_id, count in clusterObj.domain_counter_by_domain_source[domain_source].most_common()])) - cluster_metrics_domains_line.append("{0:.3f}".format(clusterObj.domain_entropy_by_domain_source[domain_source])) - else: - cluster_metrics_domains_line.append("N/A") - cluster_metrics_domains_line.append("N/A") - cluster_metrics_domains_output.append("\t".join([str(field) for field in cluster_metrics_domains_line])) - for domain_source in clusterObj.domain_counter_by_domain_source: - for domain_id, count in clusterObj.domain_counter_by_domain_source[domain_source].most_common(): - cluster_metrics_domains_detailed_output_line = [] - cluster_metrics_domains_detailed_output_line.append(clusterObj.cluster_id) - cluster_metrics_domains_detailed_output_line.append(domain_source) - cluster_metrics_domains_detailed_output_line.append(domain_id) - if domain_source == 'SignalP_EUK': - cluster_metrics_domains_detailed_output_line.append(domain_id) - else: - if domain_source in proteinCollection.domain_description_by_domain_id_by_domain_source: - cluster_metrics_domains_detailed_output_line.append(proteinCollection.domain_description_by_domain_id_by_domain_source[domain_source].get(domain_id, "N/A")) - else: - cluster_metrics_domains_detailed_output_line.append("N/A") - cluster_metrics_domains_detailed_output_line.append(clusterObj.protein_count) - protein_with_domain_count_by_proteome_id = {} - proteome_count_with_domain = 0 - protein_without_domain_count_by_proteome_id = {} - for proteome_id, protein_ids in clusterObj.protein_ids_by_proteome_id.items(): - proteome_seen = False - for protein_id in protein_ids: - if domain_source in proteinCollection.proteinObjs_by_protein_id[protein_id].domain_counter_by_domain_source and domain_id in proteinCollection.proteinObjs_by_protein_id[protein_id].domain_counter_by_domain_source[domain_source]: - protein_with_domain_count_by_proteome_id[proteome_id] = protein_with_domain_count_by_proteome_id.get(proteome_id, 0) + 1 - if not proteome_seen: - proteome_count_with_domain += 1 - proteome_seen = True - else: - protein_without_domain_count_by_proteome_id[proteome_id] = protein_without_domain_count_by_proteome_id.get(proteome_id, 0) + 1 - proteomes_with_domain_count_string = ",".join(sorted(["%s:%s/%s" % (proteome_id, count, len(clusterObj.protein_ids_by_proteome_id[proteome_id])) for proteome_id, count in protein_with_domain_count_by_proteome_id.items()])) - proteomes_without_domain_count_string = ",".join(sorted(["%s:%s/%s" % (proteome_id, count, len(clusterObj.protein_ids_by_proteome_id[proteome_id])) for proteome_id, count in protein_without_domain_count_by_proteome_id.items()])) - cluster_metrics_domains_detailed_output_line.append(sum(protein_with_domain_count_by_proteome_id.values())) - cluster_metrics_domains_detailed_output_line.append("{0:.3f}".format(proteome_count_with_domain / clusterObj.proteome_count)) - if proteomes_with_domain_count_string: - cluster_metrics_domains_detailed_output_line.append(proteomes_with_domain_count_string) - else: - cluster_metrics_domains_detailed_output_line.append("N/A") - if proteomes_without_domain_count_string: - cluster_metrics_domains_detailed_output_line.append(proteomes_without_domain_count_string) - else: - cluster_metrics_domains_detailed_output_line.append("N/A") - cluster_metrics_domains_detailed_output_by_domain_source[domain_source].append("\t".join([str(field) for field in cluster_metrics_domains_detailed_output_line])) - - 
########################### - # cluster_metrics_ALO : populate - ########################### - - cluster_metrics_ALO_line = [] - cluster_metrics_ALO_line.append(clusterObj.cluster_id) - cluster_metrics_ALO_line.append(ALO.cluster_status_by_cluster_id[clusterObj.cluster_id]) - cluster_metrics_ALO_line.append(ALO.cluster_type_by_cluster_id[clusterObj.cluster_id]) - cluster_metrics_ALO_line.append(clusterObj.protein_count) - cluster_metrics_ALO_line.append(clusterObj.proteome_count) - cluster_metrics_ALO_line.append(sum(clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][level])) - if ALO.cluster_mean_ALO_count_by_cluster_id[clusterObj.cluster_id]: - cluster_metrics_ALO_line.append(ALO.cluster_mean_ALO_count_by_cluster_id[clusterObj.cluster_id]) - else: - cluster_metrics_ALO_line.append("N/A") - if ALO.cluster_mean_non_ALO_count_by_cluster_id[clusterObj.cluster_id]: - cluster_metrics_ALO_line.append(ALO.cluster_mean_non_ALO_count_by_cluster_id[clusterObj.cluster_id]) - else: - cluster_metrics_ALO_line.append("N/A") - if ALO.cluster_type_by_cluster_id[clusterObj.cluster_id] == 'shared': - if ALO.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id]: - background_pair = (level, "background") - if attribute not in background_representation_test_by_pair_by_attribute: - background_representation_test_by_pair_by_attribute[attribute] = {} - if background_pair not in background_representation_test_by_pair_by_attribute[attribute]: - background_representation_test_by_pair_by_attribute[attribute][background_pair] = [] - background_representation_test = [] - background_representation_test.append(clusterObj.cluster_id) - background_representation_test.append(level) - background_representation_test.append("background") - background_representation_test.append(ALO.cluster_mean_ALO_count_by_cluster_id[clusterObj.cluster_id]) - background_representation_test.append(ALO.cluster_mean_non_ALO_count_by_cluster_id[clusterObj.cluster_id]) - background_representation_test.append(ALO.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id]) - background_representation_test.append(ALO.cluster_mwu_pvalue_by_cluster_id[clusterObj.cluster_id]) - background_representation_test_by_pair_by_attribute[attribute][background_pair].append(background_representation_test) - - if ALO.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id] > 0: - cluster_metrics_ALO_line.append("enriched") - elif ALO.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id] < 0: - cluster_metrics_ALO_line.append("depleted") - else: - cluster_metrics_ALO_line.append("equal") - cluster_metrics_ALO_line.append(ALO.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id]) - cluster_metrics_ALO_line.append(ALO.cluster_mwu_pvalue_by_cluster_id[clusterObj.cluster_id]) - else: - cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_line.append("N/A") - else: - cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_line.append("{0:.2f}".format(clusterObj.proteome_coverage_by_level_by_attribute[attribute][level])) - ALO_proteomes_present = [] - if ALO.cluster_status_by_cluster_id[clusterObj.cluster_id] == 'present': - ALO_proteomes_present = clusterObj.proteome_ids.intersection(ALO.proteomes) - cluster_metrics_ALO_line.append(len(ALO_proteomes_present)) - if ALO_proteomes_present: - cluster_metrics_ALO_line.append(",".join(sorted(list(ALO_proteomes_present)))) - else: - 
cluster_metrics_ALO_line.append("N/A") - if clusterObj.go_terms: - cluster_metrics_ALO_line.append(";".join(sorted(list(clusterObj.go_terms)))) - else: - cluster_metrics_ALO_line.append("N/A") - #for domain_source in clusterCollection.domain_sources: - # if domain_source in clusterObj.domain_counter_by_domain_source: - # cluster_metrics_ALO_line.append(";".join(["%s:%s" % (domain, count) for domain, count in clusterObj.domain_counter_by_domain_source[domain_source].most_common()])) - # else: - # cluster_metrics_ALO_line.append("N/A") - cluster_metrics_ALO_output.append("\t".join([str(field) for field in cluster_metrics_ALO_line])) - - if len(levels) > 1 and len(ALO_proteomes_present) >= inputObj.min_proteomes: - for result in self.pairwise_representation_test(clusterObj, attribute, level, levels_seen, levels): - # [clusterObj.cluster_id, level, other_level, mean_level, mean_other_level, log2fc_mean, pvalue] - if attribute not in pairwise_representation_test_by_pair_by_attribute: - pairwise_representation_test_by_pair_by_attribute[attribute] = {} - pair = (result[1], result[2]) - if pair not in pairwise_representation_test_by_pair_by_attribute[attribute]: - pairwise_representation_test_by_pair_by_attribute[attribute][pair] = [] - pairwise_representation_test_by_pair_by_attribute[attribute][pair].append(result) - - pairwise_representation_test_line = [] - pairwise_representation_test_line.append(result[0]) - pairwise_representation_test_line.append(result[1]) - pairwise_representation_test_line.append(result[3]) - pairwise_representation_test_line.append(result[2]) - pairwise_representation_test_line.append(result[4]) - pairwise_representation_test_line.append(result[5]) - pairwise_representation_test_line.append(result[6]) - #if clusterObj.go_terms: - # pairwise_representation_test_line.append(";".join(sorted(list(clusterObj.go_terms)))) - #else: - # pairwise_representation_test_line.append("N/A") - #for domain_source in clusterCollection.domain_sources: - # if domain_source in clusterObj.domain_counter_by_domain_source: - # pairwise_representation_test_line.append(";".join(["%s:%s" % (domain, count) for domain, count in clusterObj.domain_counter_by_domain_source[domain_source].most_common()])) - # else: - # pairwise_representation_test_line.append("N/A") - pairwise_representation_test_output.append("\t".join([str(field) for field in pairwise_representation_test_line])) - - levels_seen.add(level) - # END of cluster loop - - if len(cafe_output) > 1: - with open(cafe_f, 'w') as cafe_fh: - print "[STATUS] - Writing %s" % (cafe_f) - cafe_fh.write("\n".join(cafe_output) + "\n") - cafe_output = [] - if len(cluster_metrics_output) > 1: - with open(cluster_metrics_f, 'w') as cluster_metrics_fh: - print "[STATUS] - Writing %s" % (cluster_metrics_f) - cluster_metrics_fh.write("\n".join(cluster_metrics_output) + "\n") - cluster_metrics_output = [] - if len(cluster_metrics_domains_output) > 1: - with open(cluster_metrics_domains_f, 'w') as cluster_metrics_domains_fh: - print "[STATUS] - Writing %s" % (cluster_metrics_domains_f) - cluster_metrics_domains_fh.write("\n".join(cluster_metrics_domains_output) + "\n") - cluster_metrics_domains_output = [] - for domain_source in cluster_metrics_domains_detailed_output_by_domain_source: - if len(cluster_metrics_domains_detailed_output_by_domain_source[domain_source]) > 1: - cluster_metrics_domains_detailed_f = cluster_metrics_domains_detailed_f_by_domain_source[domain_source] - with open(cluster_metrics_domains_detailed_f, 'w') as 
cluster_metrics_domains_detailed_fh: - print "[STATUS] - Writing %s" % (cluster_metrics_domains_detailed_f) - cluster_metrics_domains_detailed_fh.write("\n".join(cluster_metrics_domains_detailed_output_by_domain_source[domain_source]) + "\n") - cluster_metrics_domains_detailed_output_by_domain_source[domain_source] = [] - if len(cluster_metrics_ALO_output) > 1: - with open(cluster_metrics_ALO_f, 'w') as cluster_metrics_ALO_fh: - print "[STATUS] - Writing %s" % (cluster_metrics_ALO_f) - cluster_metrics_ALO_fh.write("\n".join(cluster_metrics_ALO_output) + "\n") - cluster_metrics_ALO_output = [] - if len(cluster_1to1_ALO_output) > 1: - with open(cluster_1to1_ALO_f, 'w') as cluster_1to1_ALO_fh: - print "[STATUS] - Writing %s" % (cluster_1to1_ALO_f) - cluster_1to1_ALO_fh.write("\n".join(cluster_1to1_ALO_output) + "\n") - cluster_1to1_ALO_output = [] - if background_representation_test_by_pair_by_attribute: - self.plot_count_comparisons_vulcano(background_representation_test_by_pair_by_attribute) - - - if len(attribute_metrics_output) > 1: - with open(attribute_metrics_f, 'w') as attribute_metrics_fh: - print "[STATUS] - Writing %s" % (attribute_metrics_f) - attribute_metrics_fh.write("\n".join(attribute_metrics_output) + "\n") - if len(pairwise_representation_test_output) > 1: - with open(pairwise_representation_test_f, 'w') as pairwise_representation_test_fh: - print "[STATUS] - Writing %s" % (pairwise_representation_test_f) - pairwise_representation_test_fh.write("\n".join(pairwise_representation_test_output) + "\n") - if pairwise_representation_test_by_pair_by_attribute: - self.plot_count_comparisons_vulcano(pairwise_representation_test_by_pair_by_attribute) - - def plot_count_comparisons_vulcano(self, pairwise_representation_test_by_pair_by_attribute): - # [clusterObj.cluster_id, level, other_level, mean_level, mean_other_level, log2fc_mean, pvalue] - for attribute in pairwise_representation_test_by_pair_by_attribute: - for pair in pairwise_representation_test_by_pair_by_attribute[attribute]: - pair_list = list(pair) - x_label = pair_list[0] - y_label = pair_list[1] - pair_data = pairwise_representation_test_by_pair_by_attribute[attribute][pair] - pair_data_count = len(pair_data) - p_values = [] - log2fc_values = [] - for data in pair_data: - log2fc_values.append(data[5]) - p_values.append(data[6]) - if p_values: - pairwise_representation_test_f = join(self.dirs[attribute], "%s.pairwise_representation_test.%s.%s" % (attribute, "_".join(pair_list), inputObj.plot_format)) - f, ax = plt.subplots(figsize=inputObj.plot_size) - ax.set_facecolor('white') - p_array = np.array(p_values) - log2fc_array = np.array(log2fc_values) - ax.scatter(log2fc_array, p_array, alpha=0.8, edgecolors='none', s=25, c='grey') - - ooFive = 0.05 - ooOne = 0.01 - ooFive_corrected = 0.05 / pair_data_count - ooOne_corrected = 0.01 / pair_data_count - - ax.axhline(y=ooFive, linewidth=2, color='orange', linestyle="--") - ooFive_artist = plt.Line2D((0, 1), (0, 0), color='orange', linestyle='--') - ax.axhline(y=ooOne, linewidth=2, color='red', linestyle="--") - ooOne_artist = plt.Line2D((0, 1), (0, 0), color='red', linestyle='--') - ax.axhline(y=ooFive_corrected, linewidth=2, color='grey', linestyle="--") - ooFive_corrected_artist = plt.Line2D((0, 1), (0, 0), color='grey', linestyle='--') - ax.axhline(y=ooOne_corrected, linewidth=2, color='black', linestyle="--") - ooOne_corrected_artist = plt.Line2D((0, 1), (0, 0), color='black', linestyle='--') - - # Create legend from custom artist/label lists - legend = 
ax.legend([ooFive_artist, ooOne_artist, ooFive_corrected_artist, ooOne_corrected_artist], - [ooFive, ooOne, "%s (0.05 corrected)" % '%.2E' % Decimal(ooFive_corrected), "%s (0.01 corrected)" % '%.2E' % Decimal(ooOne_corrected)], - fontsize=inputObj.plot_font_size, frameon=True) - legend.get_frame().set_facecolor('white') - if abs(np.min(log2fc_array)) < abs(np.max(log2fc_array)): - x_min = 0.0 - abs(np.max(log2fc_array)) - x_max = 0.0 + abs(np.max(log2fc_array)) - ax.set_xlim(x_min - 1, x_max + 1) - else: - x_min = 0.0 - abs(np.min(log2fc_array)) - x_max = 0.0 + abs(np.min(log2fc_array)) - ax.set_xlim(x_min - 1, x_max + 1) - - ax.grid(True, linewidth=1, which="major", color="lightgrey") - ax.set_ylim(np.min(p_array) * 0.1, 1.1) - ax.set_xlabel("log2(mean(%s)/mean(%s))" % (x_label, y_label), fontsize=inputObj.plot_font_size) - ax.set_ylabel("p-value", fontsize=inputObj.plot_font_size) - plt.gca().invert_yaxis() - ax.set_yscale('log') - print "[STATUS] - Plotting %s" % (pairwise_representation_test_f) - f.savefig(pairwise_representation_test_f, format=inputObj.plot_format) - plt.close() - - def pairwise_representation_test(self, clusterObj, attribute, level, levels_seen, levels): - for other_level in set(levels).difference(levels_seen): - if not other_level == level: - other_ALO = aloCollection.ALO_by_level_by_attribute[attribute][other_level] - if len(clusterObj.proteome_ids.intersection(other_ALO.proteomes)) >= 2: - protein_counts_level = [count for count in clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][level] if count > 0] - protein_counts_other_level = [count for count in clusterObj.protein_counts_of_proteomes_by_level_by_attribute[attribute][other_level] if count > 0] - if protein_counts_level and protein_counts_other_level: - pvalue = None - try: - pvalue = scipy.stats.mannwhitneyu(protein_counts_level, protein_counts_other_level, alternative="two-sided")[1] - except: - pvalue = 1.0 - mean_level = mean(protein_counts_level) - mean_other_level = mean(protein_counts_other_level) - log2fc_mean = log((mean_level/mean_other_level), 2) - yield [clusterObj.cluster_id, level, other_level, mean_level, mean_other_level, log2fc_mean, pvalue] - -######################################################################## -# CLASS : AloCollection -######################################################################## - -class AloCollection(): - def __init__(self, proteomes, proteome_id_by_species_id, attributes, level_by_attribute_by_proteome_id, tree_ete, node_idx_by_proteome_ids): - self.attributes_verbose = attributes - self.attributes = [attribute for attribute in attributes if attribute not in inputObj.ATTRIBUTE_RESERVED] # list of attributes - self.proteome_id_by_species_id = proteome_id_by_species_id - self.tree_ete = tree_ete - - self.node_idx_by_proteome_ids = node_idx_by_proteome_ids - self.level_by_attribute_by_proteome_id = level_by_attribute_by_proteome_id - self.proteome_ids_by_level_by_attribute = self.compute_proteomes_by_level_by_attribute() - - self.counts_of_all_proteome_subsets = {} - self.cluster_ids_of_all_proteome_subsets = {} - - self.ALO_by_level_by_attribute = self.create_ALOs() - - self.fastas_parsed = False - self.rarefaction_by_samplesize_by_level_by_attribute = {} - - ############################### - ### create_ALOs - ############################### - - def create_ALOs(self): - ALO_by_level_by_attribute = {attribute: {} for attribute in self.attributes} - for attribute in self.proteome_ids_by_level_by_attribute: - for level in 
self.proteome_ids_by_level_by_attribute[attribute]: - proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][level] - ALO = AttributeLevelObj(attribute, level, proteome_ids) - if not level in ALO_by_level_by_attribute[attribute]: - ALO_by_level_by_attribute[attribute][level] = {} - ALO_by_level_by_attribute[attribute][level] = ALO - return ALO_by_level_by_attribute - - ############################### - ### compute_proteomes_by_level_by_attribute - ############################### - - def compute_proteomes_by_level_by_attribute(self): - proteomes_by_level_by_attribute = {attribute : {} for attribute in self.attributes} - for proteome_id in self.level_by_attribute_by_proteome_id: - for attribute in self.attributes: - level = self.level_by_attribute_by_proteome_id[proteome_id][attribute] - if not level in proteomes_by_level_by_attribute[attribute]: - proteomes_by_level_by_attribute[attribute][level] = set() - proteomes_by_level_by_attribute[attribute][level].add(proteome_id) - return proteomes_by_level_by_attribute - ############################### - ### compute_levels_by_attribute - ############################### - - def compute_levels_by_attribute(self): - levels_by_attribute = {attribute : set() for attribute in self.attributes} - for proteome in self.level_by_attribute_by_proteome_id: - for attribute in self.attributes: - level = self.level_by_attribute_by_proteome_id[proteome][attribute] - levels_by_attribute[attribute].add(level) - return levels_by_attribute - - def analyse_domains(self): - if proteinCollection.functional_annotation_parsed: - for attribute in self.ALO_by_level_by_attribute: - for level in self.ALO_by_level_by_attribute[attribute]: - ALO = self.ALO_by_level_by_attribute[attribute][level] - ALO.analyse_domains() - ############################### - ### analyse_clusters - ############################### - - def analyse_clusters(self): - if clusterCollection.inferred_singletons_count: - print "[STATUS]\t - Clusters found = %s (of which %s were inferred singletons)" % (clusterCollection.cluster_count, clusterCollection.inferred_singletons_count) - else: - print "[STATUS]\t - Clusters found = %s" % (clusterCollection.cluster_count) - parse_steps = clusterCollection.cluster_count/100 - print "[STATUS] - Analysing clusters ..." - analyse_clusters_start = time.time() - for idx, clusterObj in enumerate(clusterCollection.clusterObjs): - self.analyse_cluster(clusterObj) - progress(idx+1, parse_steps, clusterCollection.cluster_count) - analyse_clusters_end = time.time() - analyse_clusters_elapsed = analyse_clusters_end - analyse_clusters_start - print "[STATUS] - Took %ss to analyse clusters" % (analyse_clusters_elapsed) - - ############################### - ### analyse_clusters : analyse_cluster - ############################### - - def analyse_cluster(self, clusterObj): - '''This function selects the ALOs to which the cluster has to be added''' - # avoiding dots - protein_get_by_proteome_id = clusterObj.protein_ids_by_proteome_id.get - - implicit_protein_ids_by_proteome_id_by_level_by_attribute = {} - cluster_type_by_attribute = {} - protein_counts_of_proteomes_by_level_by_attribute = {} - proteome_coverage_by_level_by_attribute = {} - if self.tree_ete: - for node in self.tree_ete.traverse("levelorder"): - intersection = clusterObj.proteome_ids.intersection(node.proteome_ids) - difference = clusterObj.proteome_ids.difference(node.proteome_ids) - if len(intersection) == 0: - # Nothing to see here ... 
- node.counts['absent'] += 1 - else: - if clusterObj.singleton == True: - # This is a singleton - node.counts['singleton'] += 1 - node.apomorphic_cluster_counts['singletons'] += 1 - elif len(difference) > 0: - # This is a 'shared' cluster - node.counts['shared'] +=1 - elif len(difference) == 0: - # This is a node 'specific' cluster - node.counts['specific'] += 1 - if clusterObj.proteome_count == 1: - # But it only belongs to one proteome - node.apomorphic_cluster_counts['non_singletons'] += 1 - else: - # It has more than one proteome - child_nodes_covered = [] - child_node_proteome_coverage_strings = [] - child_node_proteome_ids_covered_count = 0 - for child_node in node.get_children(): - if child_node.proteome_ids.isdisjoint(clusterObj.proteome_ids): - # No child node proteomes are not in cluster - child_nodes_covered.append(False) - else: - # At least on child node proteome in cluster - child_nodes_covered.append(True) - child_node_proteome_ids_covered_count = len(clusterObj.proteome_ids.intersection(child_node.proteome_ids)) - child_node_proteome_coverage_strings.append(\ - "%s=(%s/%s)" % (child_node.name, child_node_proteome_ids_covered_count, len(child_node.proteome_ids))) - if all(child_nodes_covered): - # At least one proteome of each child node in cluster - # => SYNAPOMORPHY - node_proteome_coverage = len(intersection)/len(node.proteome_ids) - node_cluster_type = '' - if node_proteome_coverage == 1.0: - node_cluster_type = 'complete_presence' - else: - node_cluster_type = 'stochastic_absence' - node.synapomorphic_cluster_counts[node_cluster_type] += 1 - node.synapomorphic_cluster_strings.append(\ - (clusterObj.cluster_id, \ - node.name, \ - node_cluster_type, \ - '{0:.3}'.format(node_proteome_coverage), \ - ";".join(child_node_proteome_coverage_strings), \ - ",".join(sorted(intersection))) \ - #",".join(sorted(clusterObj.proteome_ids))) \ - ) - else: - sys.exit("[ERROR] You broke my program ...") - - for attribute in self.attributes: - protein_counts_of_proteomes_by_level_by_attribute[attribute] = {} - proteome_coverage_by_level_by_attribute[attribute] = {} - implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute] = {} - protein_ids_by_level = {} - protein_length_stats_by_level = {} - explicit_protein_count_by_proteome_id_by_level = {} - - for level in self.ALO_by_level_by_attribute[attribute]: - protein_ids_by_proteome_id = {} - protein_count_by_proteome_id = {} - protein_ids_by_level[level] = [] - for proteome_id in self.ALO_by_level_by_attribute[attribute][level].proteomes_list: - protein_ids = protein_get_by_proteome_id(proteome_id, []) - protein_ids_by_level[level] += protein_ids - protein_count_by_proteome_id[proteome_id] = len(protein_ids) - if not protein_count_by_proteome_id[proteome_id] == 0: - protein_ids_by_proteome_id[proteome_id] = protein_ids - if protein_ids_by_proteome_id: - implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute][level] = protein_ids_by_proteome_id - explicit_protein_count_by_proteome_id_by_level[level] = protein_count_by_proteome_id - protein_length_stats_by_level[level] = proteinCollection.get_protein_length_stats(protein_ids_by_level[level]) - protein_counts_of_proteomes_by_level_by_attribute[attribute][level] = [protein_count for proteome_id, protein_count in protein_count_by_proteome_id.items()] - cluster_type_by_attribute[attribute] = get_attribute_cluster_type(clusterObj.singleton, implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute]) - - for level in self.ALO_by_level_by_attribute[attribute]: - ALO 
= self.ALO_by_level_by_attribute[attribute][level] - proteome_coverage_by_level_by_attribute[attribute][level] = len(implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute].get(level, [])) / ALO.proteome_count - ALO_cluster_status = None - ALO_cluster_cardinality = None - mwu_pvalue = None - mwu_log2_mean = None - mean_ALO_count = None - mean_non_ALO_count = None - if level not in implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute]: - ALO_cluster_status = 'absent' - else: - ALO_cluster_status = 'present' - if not cluster_type_by_attribute[attribute] == 'singleton': - ALO_proteome_counts_in_cluster = [count for proteome_id, count in explicit_protein_count_by_proteome_id_by_level[level].items()] - ALO_cluster_cardinality = get_ALO_cluster_cardinality(ALO_proteome_counts_in_cluster) - if cluster_type_by_attribute[attribute] == 'shared': - non_ALO_levels = [non_ALO_level for non_ALO_level in explicit_protein_count_by_proteome_id_by_level if not non_ALO_level == level] - non_ALO_proteome_counts_in_cluster = [] - for non_ALO_level in non_ALO_levels: - for proteome_id in explicit_protein_count_by_proteome_id_by_level[non_ALO_level]: - non_ALO_proteome_counts_in_cluster.append(explicit_protein_count_by_proteome_id_by_level[non_ALO_level][proteome_id]) - mwu_pvalue, mwu_log2_mean, mean_ALO_count, mean_non_ALO_count = mannwhitneyu(ALO_proteome_counts_in_cluster, non_ALO_proteome_counts_in_cluster) - - ALO.add_clusterObj( - clusterObj, - cluster_type_by_attribute[attribute], - ALO_cluster_status, - ALO_cluster_cardinality, - protein_ids_by_level[level], - protein_length_stats_by_level[level], - mwu_pvalue, - mwu_log2_mean, - mean_ALO_count, - mean_non_ALO_count - ) - clusterObj.protein_counts_of_proteomes_by_level_by_attribute = protein_counts_of_proteomes_by_level_by_attribute - clusterObj.protein_median = median([count for count in protein_counts_of_proteomes_by_level_by_attribute['all']['all'] if not count == 0]) - clusterObj.proteome_coverage_by_level_by_attribute = proteome_coverage_by_level_by_attribute - clusterObj.implicit_protein_ids_by_proteome_id_by_level_by_attribute = implicit_protein_ids_by_proteome_id_by_level_by_attribute - clusterObj.cluster_type_by_attribute = cluster_type_by_attribute - - def write_tree(self): - if self.tree_ete: - print "[STATUS] - Writing data for tree ... 
" - # Node stats - node_stats_f = join(dataFactory.dirs['tree'], "tree.node_metrics.txt") - node_stats_header = [] - node_stats_header.append('nodeID') - node_stats_header.append('taxon_specific_apomorphies_[singletons]') - node_stats_header.append('taxon_specific_apomorphies (non-singletons)') - node_stats_header.append('node_specific_synapomorphies_total') - node_stats_header.append('node_specific_synapomorphies_all') - node_stats_header.append('node_specific_synapomorphies_stochastic_absence') - node_stats_header.append('proteome_count') - node_stats = [] - node_stats.append("\t".join(node_stats_header)) - # Cluster node stats - node_clusters_f = join(dataFactory.dirs['tree'], "tree.cluster_metrics.txt") - node_clusters_header = [] - node_clusters_header.append('clusterID') - node_clusters_header.append('nodeID') - node_clusters_header.append('synapomorphy_type') - node_clusters_header.append('node_proteomes_coverage') - node_clusters_header.append('children_coverage') - node_clusters_header.append('node_proteomes_present') - node_clusters = [] - node_clusters.append("\t".join(node_clusters_header)) - # header_f_by_node_name - header_f_by_node_name = {} - charts_f_by_node_name = {} - for node in self.tree_ete.traverse("levelorder"): - for synapomorphic_cluster_string in node.synapomorphic_cluster_strings: - node_clusters.append("\t".join([str(string) for string in list(synapomorphic_cluster_string)])) - node_stats_line = [] - node_stats_line.append(node.name) - node_stats_line.append(node.apomorphic_cluster_counts['singletons']) - node_stats_line.append(node.apomorphic_cluster_counts['non_singletons']) - node_stats_line.append(node.synapomorphic_cluster_counts['complete_presence'] + node.synapomorphic_cluster_counts['stochastic_absence']) - node_stats_line.append(node.synapomorphic_cluster_counts['complete_presence']) - node_stats_line.append(node.synapomorphic_cluster_counts['stochastic_absence']) - node_stats_line.append(len(node.proteome_ids)) - node_stats.append("\t".join([str(string) for string in node_stats_line])) - if inputObj.render_tree: - header_f_by_node_name[node.name] = self.generate_header_for_node(node) - charts_f_by_node_name[node.name] = self.generate_chart_for_node(node) - print "[STATUS] - Writing %s ... " % node_stats_f - with open(node_stats_f, 'w') as node_stats_fh: - node_stats_fh.write("\n".join(node_stats) + "\n") - print "[STATUS] - Writing %s ... " % node_clusters_f - with open(node_clusters_f, 'w') as node_clusters_fh: - node_clusters_fh.write("\n".join(node_clusters) + "\n") - if inputObj.render_tree: - self.plot_tree(header_f_by_node_name, charts_f_by_node_name) - - def plot_tree(self, header_f_by_node_name, charts_f_by_node_name): - tree_f = join(dataFactory.dirs['tree'], "tree.%s" % ('pdf')) # must be PDF! (otherwise it breaks) - style = ete3.NodeStyle() - style["vt_line_width"] = 5 - style["hz_line_width"] = 5 - style["fgcolor"] = "darkgrey" - for node in self.tree_ete.traverse("levelorder"): - node.set_style(style) - if header_f_by_node_name[node.name]: - node_header_face = ete3.faces.ImgFace(header_f_by_node_name[node.name]) # must be PNG! (ETE can't do PDF Faces) - node.add_face(node_header_face, column=0, position="branch-top") - if charts_f_by_node_name[node.name]: - node_chart_face = ete3.faces.ImgFace(charts_f_by_node_name[node.name]) # must be PNG! 
(ETE can't do PDF Faces) - node.add_face(node_chart_face, column=0, position="branch-bottom") - node_name_face = ete3.TextFace(node.name, fsize=64) - node.img_style["size"] = 10 - node.img_style["shape"] = "sphere" - node.img_style["fgcolor"] = "black" - if not node.is_leaf(): - node.add_face(node_name_face, column=0, position="branch-right") - node.add_face(node_name_face, column=0, position="aligned") - ts = ete3.TreeStyle() - ts.draw_guiding_lines = True - ts.show_scale = False - ts.show_leaf_name = False - ts.allow_face_overlap = True - ts.guiding_lines_color = "lightgrey" - print "[STATUS] - Writing tree %s ... " % (tree_f) - self.tree_ete.render(tree_f, dpi=600, h=1189, units="mm", tree_style=ts) - - def generate_header_for_node(self, node): - node_header_f = join(dataFactory.dirs['tree_headers'], "%s.header.png" % (node.name)) - data = [] - data.append(("Apomorphies (size=1)", "{:,}".format(node.apomorphic_cluster_counts['singletons']))) - data.append(("Apomorphies (size>1)", "{:,}".format(node.apomorphic_cluster_counts['non_singletons']))) - data.append(("Synapomorphies (all)", "{:,}".format(node.synapomorphic_cluster_counts['complete_presence'] + node.synapomorphic_cluster_counts['stochastic_absence']))) - data.append(("Synapomorphies (cov=100%)", "{:,}".format(node.synapomorphic_cluster_counts['complete_presence']))) - data.append(("Synapomorphies (cov<100%)", "{:,}".format(node.synapomorphic_cluster_counts['stochastic_absence']))) - col_labels = ('Type', 'Count') - fig, ax = plt.subplots(figsize=(2, 0.5)) - ax.set_facecolor('white') - table = ax.table( - cellText=data, - colLabels=col_labels, - loc='bottom', fontsize=24, colLoc='center', rowLoc='right', edges='' - ) - table.set_fontsize(24) - table.scale(2, 1) - for key, cell in table.get_celld().items(): - row, col = key - cell._text.set_color('grey') - if row > 0: - cell.set_edgecolor("darkgrey") - cell.visible_edges = "T" - else: - cell.set_edgecolor("darkgrey") - cell.visible_edges = "B" - if row == len(data) - 2: - cell.set_edgecolor("darkgrey") - cell.visible_edges = "T" - ax.axis('tight') - ax.axis("off") - print "[STATUS]\t- Plotting %s" % (node_header_f) - fig.savefig(node_header_f, pad=0, bbox_inches='tight', format='png') - plt.close() - return node_header_f - - def generate_chart_for_node(self, node): - proteome_coverages = [] - for synapomorphic_cluster_string in node.synapomorphic_cluster_strings: - proteome_coverages.append(float(synapomorphic_cluster_string[3])) - if proteome_coverages: - chart_f = join(dataFactory.dirs['tree_charts'], "%s.barchart.png" % (node.name)) - f, ax = plt.subplots(figsize=(3.0, 3.0)) - ax.set_facecolor('white') - x_values = np.array(proteome_coverages) - ax.hist(x_values, histtype='stepfilled', align='mid', bins=np.arange(0.0, 1.0 + 0.1, 0.1)) - ax.set_xlim(-0.1, 1.1) - for tick in ax.xaxis.get_major_ticks(): - tick.label.set_fontsize(inputObj.plot_font_size - 2) - tick.label.set_rotation('vertical') - for tick in ax.yaxis.get_major_ticks(): - tick.label.set_fontsize(inputObj.plot_font_size - 2) - ax.set_frame_on(False) - ax.xaxis.grid(True, linewidth=1, which="major", color="lightgrey") - ax.yaxis.grid(True, linewidth=1, which="major", color="lightgrey") - f.suptitle("Synapomorphies", y=1.1) - ax.set_ylabel("Count", fontsize=inputObj.plot_font_size) - ax.set_xlabel("Proteome coverage", fontsize=inputObj.plot_font_size) - print "[STATUS]\t- Plotting %s" % (chart_f) - f.savefig(chart_f, bbox_inches='tight', format='png') - if inputObj.plot_format == 'pdf': - pdf_chart_f = 
join(dataFactory.dirs['tree_charts'], "%s.barchart.pdf" % (node.name)) - print "[STATUS]\t- Plotting %s" % (pdf_chart_f) - f.savefig(pdf_chart_f, bbox_inches='tight', format='pdf') - plt.close() - return chart_f - - def compute_rarefaction_data(self): - rarefaction_by_samplesize_by_level_by_attribute = {} - print "[STATUS] - Generating rarefaction data ..." - for attribute in self.attributes: - for level in self.proteome_ids_by_level_by_attribute[attribute]: - proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][level] - if not len(proteome_ids) == 1: - ALO = self.ALO_by_level_by_attribute[attribute][level] - if not attribute in rarefaction_by_samplesize_by_level_by_attribute: - rarefaction_by_samplesize_by_level_by_attribute[attribute] = {} - if not level in rarefaction_by_samplesize_by_level_by_attribute[attribute]: - rarefaction_by_samplesize_by_level_by_attribute[attribute][level] = {} - for repetition in xrange(0, inputObj.repetitions): - seen_cluster_ids = set() - random_list_of_proteome_ids = [x for x in ALO.proteomes] - random.shuffle(random_list_of_proteome_ids) - for idx, proteome_id in enumerate(random_list_of_proteome_ids): - proteome_ALO = self.ALO_by_level_by_attribute['TAXON'][proteome_id] - seen_cluster_ids.update(proteome_ALO.cluster_ids_by_cluster_type_by_cluster_status['present']['specific']) - seen_cluster_ids.update(proteome_ALO.cluster_ids_by_cluster_type_by_cluster_status['present']['shared']) - sample_size = idx + 1 - if not sample_size in rarefaction_by_samplesize_by_level_by_attribute[attribute][level]: - rarefaction_by_samplesize_by_level_by_attribute[attribute][level][sample_size] = [] - rarefaction_by_samplesize_by_level_by_attribute[attribute][level][sample_size].append(len(seen_cluster_ids)) - - for attribute in rarefaction_by_samplesize_by_level_by_attribute: - rarefaction_plot_f = join(dataFactory.dirs[attribute], "%s.rarefaction_curve.%s" % (attribute, inputObj.plot_format)) - rarefaction_by_samplesize_by_level = rarefaction_by_samplesize_by_level_by_attribute[attribute] - f, ax = plt.subplots(figsize=inputObj.plot_size) - ax.set_facecolor('white') - max_number_of_samples = 0 - for idx, level in enumerate(rarefaction_by_samplesize_by_level): - number_of_samples = len(rarefaction_by_samplesize_by_level[level]) - if number_of_samples > max_number_of_samples: - max_number_of_samples = number_of_samples - colour = plt.cm.Paired(idx/len(rarefaction_by_samplesize_by_level)) - x_values = [] - y_mins = [] - y_maxs = [] - median_y_values = [] - median_x_values = [] - for x, y_reps in rarefaction_by_samplesize_by_level[level].items(): - x_values.append(x) - y_mins.append(min(y_reps)) - y_maxs.append(max(y_reps)) - median_y_values.append(median(y_reps)) - median_x_values.append(x) - x_array = np.array(x_values) - y_mins_array = np.array(y_mins) - y_maxs_array = np.array(y_maxs) - ax.plot(median_x_values, median_y_values, '-', color=colour, label=level) - ax.fill_between(x_array, y_mins_array, y_maxs_array, color=colour, alpha=0.5) - ax.set_xlim([0, max_number_of_samples + 1]) - ax.set_ylabel("Count of non-singleton clusters", fontsize=inputObj.plot_font_size) - ax.set_xlabel("Sampled proteomes", fontsize=inputObj.plot_font_size) - - ax.grid(True, linewidth=1, which="major", color="lightgrey") - legend = ax.legend(ncol=1, numpoints=1, loc="lower right", frameon=True, fontsize=inputObj.plot_font_size) - legend.get_frame().set_facecolor('white') - print "[STATUS]\t- Plotting %s" % (rarefaction_plot_f) - f.savefig(rarefaction_plot_f, 
format=inputObj.plot_format) - plt.close() - -######################################################################## -# CLASS : AttributeLevelObj -######################################################################## - -class AttributeLevelObj(): - ''' - Definitions: - 'shared' : shared between one ALO and others - 'singleton' : cardinality of 1 ('specific', but separate) - 'specific' : only present within one ALO - ''' - def __init__(self, attribute, level, proteomes): - self.attribute = attribute # string - self.level = level # string - self.proteomes_list = list(proteomes) # - self.proteomes = set(proteomes) # frozenset(), used for checking whether cluster and ALO intersect - self.proteome_count = len(proteomes) # int - - self.cluster_ids_by_cluster_type_by_cluster_status = {'present' : {'singleton' : [], 'specific' : [], 'shared' : []}, - 'absent' : {'singleton' : [], 'specific' : [], 'shared' : []}} # sums up to cluster_count - self.protein_ids_by_cluster_type = {'singleton' : [], 'specific' : [], 'shared' : []} # list of lists - self.protein_span_by_cluster_type = {'singleton' : [], 'specific' : [], 'shared' : []} - self.clusters_by_cluster_cardinality_by_cluster_type = {'shared' : {'true' : [], 'fuzzy' : []}, 'specific' : {'true' : [], 'fuzzy' : []}} - - self.cluster_status_by_cluster_id = {} - self.cluster_type_by_cluster_id = {} - - self.cluster_mwu_pvalue_by_cluster_id = {} - self.cluster_mwu_log2_mean_by_cluster_id = {} - self.cluster_mean_ALO_count_by_cluster_id = {} - self.cluster_mean_non_ALO_count_by_cluster_id = {} - - self.domain_counter_by_domain_source_by_cluster_type = None - self.protein_with_domain_count_by_domain_source_by_cluster_type = None - - self.protein_length_stats_by_cluster_id = {} - self.protein_count_by_cluster_id = {} - - self.rarefaction_data = {} # repetition : number of clusters - - ############################### - ### add_clusterObj - ############################### - - def analyse_domains(self): - print "[STATUS] - Analysing domains (this may take a while) ... 
" - domain_counter_by_domain_source_by_cluster_type = {'singleton' : {}, 'specific' : {}, 'shared' : {}} - protein_with_domain_count_by_domain_source_by_cluster_type = {'singleton' : {}, 'specific' : {}, 'shared' : {}} - get_proteinObj_by_protein_id = proteinCollection.proteinObjs_by_protein_id.get - for cluster_type in self.protein_ids_by_cluster_type: - for domain_source in proteinCollection.domain_sources: - if not domain_source in domain_counter_by_domain_source_by_cluster_type[cluster_type]: - domain_counter_by_domain_source_by_cluster_type[cluster_type][domain_source] = Counter() - protein_with_domain_count_by_domain_source_by_cluster_type[cluster_type][domain_source] = 0 - for protein_id in self.protein_ids_by_cluster_type[cluster_type]: - proteinObj = get_proteinObj_by_protein_id(protein_id) - if domain_source in proteinObj.domain_counter_by_domain_source: - domain_counter = proteinObj.domain_counter_by_domain_source[domain_source] - if domain_counter: - domain_counter_by_domain_source_by_cluster_type[cluster_type][domain_source] += domain_counter - protein_with_domain_count_by_domain_source_by_cluster_type[cluster_type][domain_source] += 1 - if proteinObj.go_terms: - domain_counter = Counter(list(proteinObj.go_terms)) - domain_counter_by_domain_source_by_cluster_type[cluster_type]["GO"] += domain_counter - protein_with_domain_count_by_domain_source_by_cluster_type[cluster_type]["GO"] += 1 - - domain_counter_by_domain_source_by_cluster_type['total'] = {} - protein_with_domain_count_by_domain_source_by_cluster_type['total'] = {} - for domain_source in proteinCollection.domain_sources: - domain_counter_by_domain_source_by_cluster_type['total'][domain_source] = Counter() - protein_with_domain_count_by_domain_source_by_cluster_type['total'][domain_source] = 0 - - for cluster_type in domain_counter_by_domain_source_by_cluster_type: - for domain_source in proteinCollection.domain_sources: - domain_counter_by_domain_source_by_cluster_type['total'][domain_source] += domain_counter_by_domain_source_by_cluster_type[cluster_type][domain_source] - protein_with_domain_count_by_domain_source_by_cluster_type['total'][domain_source] += protein_with_domain_count_by_domain_source_by_cluster_type[cluster_type][domain_source] - self.domain_counter_by_domain_source_by_cluster_type = domain_counter_by_domain_source_by_cluster_type - self.protein_with_domain_count_by_domain_source_by_cluster_type = protein_with_domain_count_by_domain_source_by_cluster_type - #print self.level - #for cluster_type in self.domain_counter_by_domain_source_by_cluster_type: - # print cluster_type - # print self.domain_counter_by_domain_source_by_cluster_type[cluster_type] - # print self.protein_with_domain_count_by_domain_source_by_cluster_type[cluster_type] - - def add_clusterObj(self, clusterObj, attribute_cluster_type, ALO_cluster_status, ALO_cluster_cardinality, ALO_protein_ids_in_cluster, ALO_protein_length_stats, mwu_pvalue, mwu_log2_mean, mean_ALO_count, mean_non_ALO_count): - self.cluster_ids_by_cluster_type_by_cluster_status[ALO_cluster_status][attribute_cluster_type].append(clusterObj.cluster_id) - self.cluster_status_by_cluster_id[clusterObj.cluster_id] = ALO_cluster_status - self.cluster_type_by_cluster_id[clusterObj.cluster_id] = attribute_cluster_type - self.protein_length_stats_by_cluster_id[clusterObj.cluster_id] = ALO_protein_length_stats - - self.protein_count_by_cluster_id[clusterObj.cluster_id] = len(ALO_protein_ids_in_cluster) - if ALO_cluster_status == 'present': - for ALO_protein_id in 
ALO_protein_ids_in_cluster: - self.protein_ids_by_cluster_type[attribute_cluster_type].append(ALO_protein_id) - self.protein_span_by_cluster_type[attribute_cluster_type].append(ALO_protein_length_stats['sum']) - if not attribute_cluster_type == 'singleton': - if ALO_cluster_cardinality: - self.clusters_by_cluster_cardinality_by_cluster_type[attribute_cluster_type][ALO_cluster_cardinality].append(clusterObj.cluster_id) - - self.cluster_mwu_pvalue_by_cluster_id[clusterObj.cluster_id] = mwu_pvalue - self.cluster_mwu_log2_mean_by_cluster_id[clusterObj.cluster_id] = mwu_log2_mean - self.cluster_mean_ALO_count_by_cluster_id[clusterObj.cluster_id] = mean_ALO_count - self.cluster_mean_non_ALO_count_by_cluster_id[clusterObj.cluster_id] = mean_non_ALO_count - - ############################### - ### get_protein_count_by_cluster_type - ############################### - - def get_protein_count_by_cluster_type(self, cluster_type): - if cluster_type == 'total': - return sum([len(protein_ids) for cluster_type, protein_ids in self.protein_ids_by_cluster_type.items()]) - else: - return len(self.protein_ids_by_cluster_type[cluster_type]) - - ############################### - ### get_protein_span_by_cluster_type - ############################### - - def get_protein_span_by_cluster_type(self, cluster_type): - span = 0 - if cluster_type == 'total': - span = sum([sum(protein_ids) for cluster_type, protein_ids in self.protein_span_by_cluster_type.items()]) - else: - span = sum(self.protein_span_by_cluster_type[cluster_type]) - return span - - ############################### - ### get_cluster_count_by_cluster_status_by_cluster_type - ############################### - - def get_cluster_count_by_cluster_status_by_cluster_type(self, cluster_status, cluster_type): - if cluster_type == 'total': - return sum([len(cluster_ids) for cluster_type, cluster_ids in self.cluster_ids_by_cluster_type_by_cluster_status[cluster_status].items()]) - else: - return len(self.cluster_ids_by_cluster_type_by_cluster_status[cluster_status][cluster_type]) - - def get_cluster_count_by_cluster_cardinality_by_cluster_type(self, cluster_type, cluster_cardinality): - return len(self.clusters_by_cluster_cardinality_by_cluster_type[cluster_type][cluster_cardinality]) - - def get_proteomes(self): - return ", ".join(sorted([str(proteome_id) for proteome_id in self.proteomes])) - -######################################################################## -# CLASS : ProteinCollection -######################################################################## - -class ProteinCollection(): - def __init__(self, proteinObjs): - self.proteinObjs = proteinObjs - self.proteinObjs_by_protein_id = {proteinObj.protein_id : proteinObj for proteinObj in proteinObjs} - self.protein_count = len(proteinObjs) - self.domain_sources = [] - self.fastas_parsed = False - self.functional_annotation_parsed = False - self.domain_description_by_domain_id_by_domain_source = None - - ############################### - ### add_domainObjs_to_proteinObjs - ############################### - - def add_annotation_to_proteinObj(self, domain_protein_id, domain_counter_by_domain_source, go_terms): - proteinObj = self.proteinObjs_by_protein_id.get(domain_protein_id, None) - if proteinObj: - proteinObj.domain_counter_by_domain_source = domain_counter_by_domain_source - signalp_notm = proteinObj.domain_counter_by_domain_source.get("SignalP_EUK", None) - if signalp_notm and "SignalP-noTM" in signalp_notm: - proteinObj.secreted = True - proteinObj.go_terms = go_terms - - def 
get_protein_length_stats(self, protein_ids): - protein_length_stats = {'sum' : 0, 'mean' : 0.0, 'median' : 0, 'sd': 0.0} - if protein_ids and self.fastas_parsed: - protein_lengths = [self.proteinObjs_by_protein_id[protein_id].length for protein_id in protein_ids] - protein_length_stats['sum'] = sum(protein_lengths) - protein_length_stats['mean'] = mean(protein_lengths) - protein_length_stats['median'] = median(protein_lengths) - protein_length_stats['sd'] = sd(protein_lengths) - return protein_length_stats -######################################################################## -# CLASS : ProteinObj -######################################################################## - -class ProteinObj(): - def __init__(self, protein_id, proteome_id, species_id, sequence_id): - self.protein_id = protein_id - self.proteome_id = proteome_id - self.species_id = species_id - self.sequence_id = sequence_id - self.length = None - self.clustered = False - - self.secreted = False - - self.domain_counter_by_domain_source = {} - self.go_terms = [] - - - ############################### - ### add_length - ############################### - - def add_length(self, length): - self.length = length - - ############################### - ### get_domain_list - ############################### - - def get_domain_list(self): - return sorted(self.domain_list, key=lambda x: x.domain_start, reverse=False) - - - ############################### - ### compute_domain_count_by_domain_id_by_domain_source - ############################### - - def compute_domain_count_by_domain_id_by_domain_source(self): - if self.domain_list: - domain_ids_by_domain_source = {domainObj.domain_source : [] for domainObj in self.domain_list} - for domainObj in self.domain_list: - domain_ids_by_domain_source[domainObj.domain_source].append(domainObj.domain_id) - self.domain_count_by_domain_id_by_domain_source = {domain_source : Counter(domain_ids_by_domain_source[domain_source]) for domain_source in domain_ids_by_domain_source} - else: - self.domain_count_by_domain_id_by_domain_source = Counter() - -######################################################################## -# CLASS : ClusterCollection -######################################################################## - -class ClusterCollection(): - def __init__(self, clusterObjs, inferred_singletons_count, functional_annotation_parsed, fastas_parsed, domain_sources): - self.clusterObjs = clusterObjs - self.clusterObjs_by_cluster_id = {clusterObj.cluster_id: clusterObj for clusterObj in clusterObjs} # only for testing - self.cluster_count = len(clusterObjs) - self.inferred_singletons_count = inferred_singletons_count - self.functional_annotation_parsed = functional_annotation_parsed - self.fastas_parsed = fastas_parsed - #self.domain_sources = [domain_source for domain_source in domain_sources if not domain_source == "GO"] - self.domain_sources = domain_sources -######################################################################## -# CLASS : ClusterObj -######################################################################## - -class ClusterObj(): - def __init__(self, cluster_id, protein_ids): - self.cluster_id = cluster_id - self.protein_ids = set(protein_ids) - self.protein_count = len(protein_ids) - try: - self.proteomes_by_protein_id = {protein_id : proteinCollection.proteinObjs_by_protein_id[protein_id].proteome_id for protein_id in protein_ids} - except KeyError as e: - sys.exit("[ERROR] - Protein %s in clustering belongs to proteomes that are not present in the SpeciesClassification-file. 
Please add those proteoemes or recluster by omitting these proteomes." % (e.args[0])) - - self.proteome_ids_list = self.proteomes_by_protein_id.values() - self.protein_count_by_proteome_id = Counter(self.proteome_ids_list) - self.proteome_ids = frozenset(self.proteome_ids_list) - self.proteome_count = len(self.proteome_ids) - self.singleton = False if self.protein_count > 1 else True - self.apomorphy = False if self.proteome_count > 1 else True - self.protein_ids_by_proteome_id = self.compute_protein_ids_by_proteome() - - # DOMAINS - self.go_terms = self.compute_go_terms() - self.domain_counter_by_domain_source = self.compute_domain_counter_by_domain_source() - self.secreted_cluster_coverage = self.compute_secreted_cluster_coverage() - self.domain_entropy_by_domain_source = self.compute_domain_entropy_by_domain_source() - self.protein_length_stats = self.compute_protein_length_stats() - - self.implicit_protein_ids_by_proteome_id_by_level_by_attribute = None - self.proteome_ids_by_level_by_attribute = None # used for checking status - self.proteome_coverage_by_level_by_attribute = None - self.protein_counts_of_proteomes_by_level_by_attribute = None # non-zero-counts - self.protein_median = None - self.cluster_type_by_attribute = None - - ############################### - ### compute_protein_ids_by_proteome - ############################### - - def compute_protein_ids_by_proteome(self): - protein_ids_by_proteome_id = defaultdict(set) - for protein_id, proteome_id in self.proteomes_by_protein_id.items(): - protein_ids_by_proteome_id[proteome_id].add(protein_id) - return protein_ids_by_proteome_id - - def compute_secreted_cluster_coverage(self): - secreted = 0 - for protein_id in self.protein_ids: - if proteinCollection.proteinObjs_by_protein_id[protein_id].secreted: - secreted += 1 - return secreted/self.protein_count - - def compute_protein_length_stats(self): - protein_lengths = [proteinCollection.proteinObjs_by_protein_id[protein_id].length for protein_id in self.protein_ids] - if all(protein_lengths): - protein_length_stats = {} - protein_length_stats['mean'] = mean(protein_lengths) - protein_length_stats['median'] = median(protein_lengths) - protein_length_stats['sd'] = sd(protein_lengths) - return protein_length_stats - - def compute_domain_counter_by_domain_source(self): - cluster_domain_counter_by_domain_source = {} - for protein_id in self.protein_ids: - protein_domain_counter_by_domain_source = proteinCollection.proteinObjs_by_protein_id[protein_id].domain_counter_by_domain_source - if protein_domain_counter_by_domain_source: - for domain_source, protein_domain_counter in protein_domain_counter_by_domain_source.items(): - if not domain_source in cluster_domain_counter_by_domain_source: - cluster_domain_counter_by_domain_source[domain_source] = Counter() - cluster_domain_counter_by_domain_source[domain_source] += protein_domain_counter - return cluster_domain_counter_by_domain_source - - def compute_domain_entropy_by_domain_source(self): - domain_entropy_by_domain_source = {} - for domain_source, domain_counter in self.domain_counter_by_domain_source.items(): - total_count = len([domain for domain in domain_counter.elements()]) - domain_entropy = -sum([i/total_count * log(i/total_count, 2) for i in domain_counter.values()]) - if str(domain_entropy) == "-0.0": - domain_entropy_by_domain_source[domain_source] = 0.0 - else: - domain_entropy_by_domain_source[domain_source] = domain_entropy - return domain_entropy_by_domain_source - - def compute_go_terms(self): - go_terms = set() - for 
protein_id in self.protein_ids: - if proteinCollection.proteinObjs_by_protein_id[protein_id].go_terms: - for go_term in proteinCollection.proteinObjs_by_protein_id[protein_id].go_terms: - go_terms.add(go_term) - return go_terms - - -class InputObj(): - def __init__(self, args): - # reserved attributes - self.ATTRIBUTE_RESERVED = ['IDX', 'OUT', "TAXID"] - # input files - self.cluster_f = args['--cluster_file'] - self.config_f = args['--config_file'] - self.sequence_ids_f = args['--sequence_ids_file'] - self.species_ids_f = args['--species_ids_file'] - self.tree_f = args['--tree_file'] - self.render_tree = args['--plot_tree'] - self.nodesdb_f = args['--nodesdb'] - self.functional_annotation_f = args['--functional_annotation'] - self.pfam_mapping = True - self.pfam_mapping_f = None - self.ipr_mapping = True - self.ipr_mapping_f = None - self.go_mapping_f = None - self.check_input_files() - self.check_that_ete_can_plot() - # FASTA files - self.fasta_dir = args['--fasta_dir'] - self.check_if_fasta_dir_and_species_ids_f() - # outprefix - self.outprefix = args['--outprefix'] - # proteins - self.infer_singletons = args['--infer_singletons'] - # values: fuzzyness - self.fuzzy_count = None - self.check_fuzzy_count(args['--target_count']) - self.fuzzy_fraction = None - self.check_fuzzy_fraction(args['--target_fraction']) - self.fuzzy_min = None - self.fuzzy_max = None - self.check_fuzzy_min_max(args['--min'], args['--max']) - self.fuzzy_range = set([x for x in xrange(self.fuzzy_min, self.fuzzy_max+1) if not x == self.fuzzy_count]) - # values: rarefaction - self.repetitions = int(args['--repetitions']) + 1 - self.check_repetitions() - self.min_proteomes = int(args['--min_proteomes']) - self.check_min_proteomes() - # values: plots - self.plot_format = args['--plotfmt'] - self.check_plot_format() - self.plot_size = tuple(int(x) for x in args['--plotsize'].split(",")) - self.plot_font_size = int(args['--fontsize']) - # taxrank - self.taxranks = [taxrank.replace(" ","") for taxrank in args['--taxranks'].split(",")] - self.check_taxranks() - - def check_plot_format(self): - SUPPORTED_PLOT_FORMATS = set(['png', 'pdf', 'svg']) - if self.plot_format not in SUPPORTED_PLOT_FORMATS: - sys.exit("[ERROR] : Plot format %s not part of supported plot formats (%s)" % (self.plot_format, SUPPORTED_PLOT_FORMATS)) - - def check_repetitions(self): - if not self.repetitions > 0: - sys.exit("[ERROR] : Please specify a positive integer for the number of repetitions for the rarefaction curves") - - def check_min_proteomes(self): - if not self.min_proteomes > 0: - sys.exit("[ERROR] : Please specify a positive integer for the minimum number of proteomes to consider for computations") - - def check_taxranks(self): - SUPPORTED_TAXRANKS = set(['superkingdom', 'kingdom', 'phylum', 'class', 'order', 'superfamily', 'family', 'subfamily', 'genus', 'species']) - unsupported_taxranks = [] - for taxrank in self.taxranks: - if not taxrank in SUPPORTED_TAXRANKS: - unsupported_taxranks.append(taxrank) - if unsupported_taxranks: - sys.exit("[ERROR] : Taxrank(s) %s not part of supported Taxranks (%s)" % (",".join(sorted(unsupported_taxranks)), ",".join(sorted(SUPPORTED_TAXRANKS)))) - - def check_if_fasta_dir_and_species_ids_f(self): - if self.fasta_dir: - if not self.species_ids_f: - sys.exit("[ERROR] : You have provided a FASTA-dir using '--fasta-dir'. 
Please provide a Species-ID file using ('--species_ids_file').") - - def check_input_files(self): - check_file(self.sequence_ids_f) - check_file(self.species_ids_f) - check_file(self.config_f) - check_file(self.functional_annotation_f) - check_file(self.sequence_ids_f) - check_file(self.tree_f) - check_file(self.nodesdb_f) - if self.pfam_mapping: - pfam_mapping_f = join(dirname(realpath(__file__)), "../data/Pfam-A.clans.tsv.gz") - if not isfile(pfam_mapping_f): - print "[WARN] - PFAM-ID file 'data/Pfam-A.clans.tsv.gz' not found." - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz" - retrieve_ftp(remote_f, pfam_mapping_f) - self.pfam_mapping_f = pfam_mapping_f - if self.ipr_mapping: - ipr_mapping_f = join(dirname(realpath(__file__)), "../data/entry.list") - if not isfile(ipr_mapping_f): - print "[WARN] - IPR-ID file 'data/entry.list' not found." - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/interpro/entry.list" - retrieve_ftp(remote_f, ipr_mapping_f) - self.ipr_mapping_f = ipr_mapping_f - go_mapping_f = join(dirname(realpath(__file__)), "../data/interpro2go") - if not isfile(go_mapping_f): - print "[WARN] - GO-ID file, but 'data/interpro2go' not found." - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro2go" - retrieve_ftp(remote_f, go_mapping_f) - self.go_mapping_f = go_mapping_f - - def check_that_ete_can_plot(self): - if self.render_tree: - try: - import PyQt4 - except ImportError: - sys.exit("[ERROR] : Plotting of trees requires additional ETE3 dependencies. PyQt4 is not installed. Please install PyQt4") - if 'DISPLAY' in environ: - print "[STATUS] - X server seems to be present..." - test_tree_f = join(getcwd(), "this_is_a_test_tree.pdf") - t = ete3.Tree( "((a,b),c);" ) - try: - a = t.render(test_tree_f, w=40, units="mm") - print "[STATUS] - ETE can connect to X server (X11). Tree will be rendered." - except: - self.render_tree = False - print "[WARN] - ETE cannot connect to X server (X11). No tree will be rendered." - remove(test_tree_f) - else: - print "[STATUS] - No X server found. ETE can't render the tree. Consider using \'xvfb-run\' ..." - self.render_tree = False - - def check_fuzzy_count(self, target_count): - if int(target_count) > 0: - self.fuzzy_count = int(target_count) - else: - sys.exit("[ERROR] : --target_count %s must be greater than 0" % (target_count)) - - def check_fuzzy_fraction(self, fuzzyness): - if 0 <= float(fuzzyness) <= 1: - self.fuzzy_fraction = float(fuzzyness) - else: - sys.exit("[ERROR] : --target_fraction %s is not between 0.0 and 1.0" (fuzzyness)) - - def check_fuzzy_min_max(self, fuzzy_min, fuzzy_max): - if int(fuzzy_min) <= int(fuzzy_max): - self.fuzzy_min = int(args['--min']) - self.fuzzy_max = int(args['--max']) - else: - sys.exit("[ERROR] : --min %s is greater than --max %s" (fuzzy_min, fuzzy_max)) - - -def welcome_screen(): - screen = "\ - _ _ _ _______ _ \n\ - | | / |_) (_______|_) \n\ - | | / / _ ____ _____ _ ____ \n\ - | |< < | | _ \| ___) | | _ \ \n\ - | | \ \| | | | | | | | | | | \n\ - |_| \_)_|_| |_|_| |_|_| |_| v%s\n\ - " % (__version__) - print screen - - -if __name__ == "__main__": - __version__ = "0.9" - welcome_screen() - args = docopt(__doc__) - inputObj = InputObj(args) - if inputObj.tree_f: - try: - import ete3 - except ImportError: - sys.exit("[ERROR] : Module \'ete3\' was not found. Please install \'ete3\' using \'pip install ete3\'\n/tPlotting of trees requires additional dependencies:\n\t- PyQt4\n\t") - # Input sane ... now we start - print "[STATUS] - Starting analysis ..." 
- overall_start = time.time() - # Initialise - aloCollection = None - proteinCollection = None - domainCollection = None - clusterCollection = None - # Build dataFactory - dataFactory = DataFactory() - # Build Collections - aloCollection = dataFactory.build_AloCollection() - proteinCollection = dataFactory.build_ProteinCollection(inputObj) - clusterCollection = dataFactory.build_ClusterCollection(inputObj) - dataFactory.setup_dirs(inputObj) - aloCollection.analyse_clusters() - # aloCollection.analyse_domains() # takes prohibitely long, implement faster! - aloCollection.write_tree() - aloCollection.compute_rarefaction_data() - dataFactory.write_output() - - overall_end = time.time() - overall_elapsed = overall_end - overall_start - print "[STATUS] - Took %ss to run kinfin." % (overall_elapsed) - del aloCollection - del proteinCollection - del domainCollection - del clusterCollection diff --git a/dist/kinfin-0.9-py2.7.egg b/dist/kinfin-0.9-py2.7.egg deleted file mode 100644 index a4cb260..0000000 Binary files a/dist/kinfin-0.9-py2.7.egg and /dev/null differ diff --git a/example/curl_examples.md b/example/curl_examples.md new file mode 100644 index 0000000..ac7fb70 --- /dev/null +++ b/example/curl_examples.md @@ -0,0 +1,70 @@ +### 1. Initialize the Analysis Process + +```bash +curl -X POST "http://127.0.0.1:8000/kinfin/init" \ +-H "Content-Type: application/json" \ +-d '{"config": [{ "taxon": "BGLAB", "label1": "red" },{ "taxon": "CVIRG", "label1": "red" },{ "taxon": "DPOLY", "label1": "red" },{ "taxon": "GAEGI", "label1": "red" },{ "taxon": "LJAPO", "label1": "red" },{ "taxon": "LSAXA", "label1": "red" },{ "taxon": "MANGU", "label1": "red" },{ "taxon": "MAREN", "label1": "red" },{ "taxon": "MGIGA", "label1": "red" },{ "taxon": "MMERC", "label1": "red" },{ "taxon": "MTROS", "label1": "blue" },{ "taxon": "OBIMA", "label1": "blue" },{ "taxon": "OEDUL", "label1": "blue" },{ "taxon": "OSINE", "label1": "blue" },{ "taxon": "OVULG", "label1": "blue" },{ "taxon": "PCANA", "label1": "blue" },{ "taxon": "PMAXI", "label1": "blue" },{ "taxon": "PVULG", "label1": "blue" },{ "taxon": "TGRAN", "label1": "blue" }]}' | jq +``` + +### 2. Get Run Status + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/status" \ +-H "x-session-id: " | jq +``` + +### 3. Get Run Summary + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/run-summary" \ +-H "x-session-id: " | jq +``` + +### 4. Get Available Attributes and Taxon Sets + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/available-attributes-taxonsets" \ +-H "x-session-id: " | jq +``` + +### 5. Get Counts by Taxon + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/counts-by-taxon" \ +-H "x-session-id: " | jq +``` + +### 6. Get Cluster Summary + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/cluster-summary/label1" \ +-H "x-session-id: " | jq +``` + +### 7. Get Attribute Summary + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/attribute-summary/label1" \ +-H "x-session-id: " | jq +``` + +### 8. Get Cluster Metrics + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/cluster-metrics/label1/red" \ +-H "x-session-id: " | jq +``` + +### 9. Get Pairwise Analysis + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/pairwise-analysis/label1" \ +-H "x-session-id: " | jq +``` + +### 10. 
Get Plot + +```bash +curl -X GET "http://127.0.0.1:8000/kinfin/plot/" \ +-H "x-session-id: " -o ".png" +``` diff --git a/example/taxon_idx_mapping.json b/example/taxon_idx_mapping.json new file mode 100644 index 0000000..0812d0b --- /dev/null +++ b/example/taxon_idx_mapping.json @@ -0,0 +1,8 @@ +{ + "A": "0", + "B": "1", + "C": "2", + "D": "3", + "E": "4", + "F": "5" +} diff --git a/install b/install deleted file mode 100755 index b4ebe09..0000000 --- a/install +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env bash -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" - -echo "[+] Checking dependencies..." -wget=$(which wget) -if [ -x "$wget" ] ; then - echo " [+] [wget] $wget"; else - echo " [X] [wget] ... please install wget"; -fi -gunzip=$(which gunzip) -if [ -x "$gunzip" ] ; then - echo " [+] [gunzip] $gunzip"; else - echo " [X] [gunzip] ... please install gunzip"; -fi - -# Download files -echo "[+] Checking data files..." -pfam_dest=$DIR/data/Pfam-A.clans.tsv.gz -ipr_dest=$DIR/data/entry.list -go_dest=$DIR/data/interpro2go -nodesdbgz=$DIR/data/nodesdb.gz -nodesdb=$DIR/data/nodesdb.txt -if [ -f "$nodesdb" ]; then - echo " [+] $nodesdb" -else - if [ -f "$nodesdbgz" ]; then - echo -n " [+] Extracting $nodesdbgz ..." - $gunzip -c $nodesdbgz > $nodesdb - if [ $? -eq 0 ]; then - echo "done." - else - echo "FAIL... Please download kinfin again." - exit 1 - fi - else - echo "[-] $nodesdbgz not found... Please download kinfin again." - exit 1 - fi -fi - -if [ -f "$pfam_dest" ]; then - echo " [+] $pfam_dest" -else - echo -n " [-] $pfam_dest. Downloading ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz ..." - $wget -qN ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz -P $DIR/data/ - if [ $? -eq 0 ]; then - echo "done." - else - echo "FAIL." - fi -fi - -if [ -f "$ipr_dest" ]; then - echo " [+] $ipr_dest" -else - echo -n " [-] $ipr_dest. Downloading ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list ..." - $wget -qN ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list -P $DIR/data/ - if [ $? -eq 0 ]; then - echo "done." - else - echo "FAIL." - fi -fi - -if [ -f "$go_dest" ]; then - echo " [+] $go_dest" -else - echo -n " [-] $go_dest. Downloading ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/interpro2go ... " - $wget -qN ftp://ftp.ebi.ac.uk/pub/databases/interpro/current_release/interpro2go -P $DIR/data/ - if [ $? -eq 0 ]; then - echo "done." - else - echo "FAIL." - fi -fi - -# Create executable -echo "[+] Creating executable..." -echo '#!/usr/bin/env bash -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -$DIR/src/kinfin.py "$@"' > $DIR/kinfin && chmod +x $DIR/kinfin - -# Done -echo "[+] Kinfin was installed. Please run ./kinfin" diff --git a/install.sh b/install.sh new file mode 100755 index 0000000..58c2b9c --- /dev/null +++ b/install.sh @@ -0,0 +1,143 @@ +#!/usr/bin/env bash + +# logging function +log() { + local GREEN='\033[0;32m' + local YELLOW='\033[0;33m' + local RED='\033[0;31m' + local NO_COLOR='\033[0m' + + local level=$1 + local message=$2 + + case $level in + INFO) + echo -e "[${NO_COLOR}INFO${NO_COLOR}] - $message" + ;; + SUCCESS) + echo -e "[${GREEN}SUCCESS${NO_COLOR}] - $message" + ;; + ERROR) + echo -e "[${RED}ERROR${NO_COLOR}] - $message" >&2 + ;; + *) + echo "Invalid log level: $level" + ;; + esac +} + +# Check dependencies exist +check_dependencies() { + log INFO "Checking dependencies..." 
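+    # wget fetches the Pfam/InterPro data files and gunzip extracts nodesdb.gz; both must be on PATH.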
+ + local dependencies=("wget" "gunzip") + local missing_dependencies=() + + for dependency in "${dependencies[@]}"; do + local item=$(command -v "$dependency") + if [ ! -x "$item" ]; then + missing_dependencies+=("$dependency") + fi + done + + if [ ${#missing_dependencies[@]} -gt 0 ]; then + log ERROR "Missing dependencies: ${missing_dependencies[*]}. Please install them." + exit 1 + else + for dependency in "${dependencies[@]}"; do + log SUCCESS "$dependency is installed." + done + log SUCCESS "All dependencies are installed." + return 0 + fi +} + +# Function to download a file +download_file() { + local url=$1 + local filename=$2 + + log INFO "Downloading $filename from $url" + $(which wget) -np -nd -qN --show-progress "$url" -P "$DIR/data/" + + if [ $? -eq 0 ]; then + log SUCCESS "Downloaded $filename" + else + log ERROR "Failed to download $filename from $url" + exit 1 + fi +} + +# Extract .gz files +extract_gzip() { + local gz_file=$1 + local dest=$2 + + log INFO "Extracting $gz_file..." + + $(which gunzip) -c "$gz_file" > "$dest" + + if [ $? -eq 0 ]; then + log SUCCESS "Extracted $gz_file at $dest" + else + log ERROR "Failed to extract $gz_file. Please download kinfin again." + exit 1 + fi +} + + + +main() { + # Set working directory + DIR="$(cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + + check_dependencies + + log INFO "Checking input data files..." + + local pfam_dest="$DIR/data/Pfam-A.clans.tsv.gz" + local ipr_dest="$DIR/data/entry.list" + local go_dest="$DIR/data/interpro2go" + local nodesdbgz="$DIR/data/nodesdb.gz" + local nodesdb="$DIR/data/nodesdb.txt" + + if [ ! -f "$nodesdb" ]; then + if [ -f "$nodesdbgz" ]; then + extract_gzip "$nodesdbgz" "$nodesdb" + else + log ERROR "$nodesdbgz not found. Please download kinfin again." + exit 1 + fi + else + log SUCCESS "$nodesdb is already present." + fi + + if [ ! -f "$pfam_dest" ]; then + download_file "ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz" "Pfam-A.clans.tsv.gz" + else + log SUCCESS "Pfam-A.clans.tsv.gz is already present." + fi + + if [ ! -f "$ipr_dest" ]; then + download_file "ftp.ebi.ac.uk/pub/databases/interpro/current_release/entry.list" "entry.list" + else + log SUCCESS "entry.list is already present." + fi + + if [ ! -f "$go_dest" ]; then + download_file "ftp.ebi.ac.uk/pub/databases/interpro/current_release/interpro2go" "interpro2go" + else + log SUCCESS "interpro2go is already present." + fi + + log SUCCESS "All required files downloaded." + + # Create executable + log INFO "Creating executable..." + echo -e '#!/usr/bin/env bash\nDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"\n$DIR/src/kinfin.py "$@"' > $DIR/kinfin && chmod +x $DIR/kinfin + + # Done + log SUCCESS "Kinfin was installed. 
Please run ./kinfin --help" +} + +main \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..14bfacd --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,3 @@ +-r requirements.txt +fastapi==0.111.0 +pytest==8.2.2 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5a8fe76..7bcfdc2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ -scipy==1.11.1 -matplotlib==2.0.2 +scipy==1.13.1 +matplotlib==3.9.0 docopt==0.6.2 -networkx==1.11 -powerlaw==1.4.1 -ete3==3.0.0b35 +networkx==3.3 +powerlaw==1.5 +ete3==3.1.3 +fastapi==0.111.0 \ No newline at end of file diff --git a/scripts/get_protein_ids_from_cluster.py b/scripts/get_protein_ids_from_cluster.py index 4aca41f..17a006c 100755 --- a/scripts/get_protein_ids_from_cluster.py +++ b/scripts/get_protein_ids_from_cluster.py @@ -71,8 +71,8 @@ def parse_groups(group_f): def write_output(output, outprefix): - headers_found = set([k for k, v in headers.iteritems() if v]) - clusters_found = set([k for k, v in clusters.iteritems() if v]) + headers_found = set([k for k, v in headers.items() if v]) + clusters_found = set([k for k, v in clusters.items() if v]) if headers: print("[+] Found %s of headers ..." % "{:.0%}".format(len(headers_found) / len(headers))) if clusters: diff --git a/setup.py b/setup.py index 077cbed..29f9f71 100644 --- a/setup.py +++ b/setup.py @@ -1,37 +1,37 @@ import pip from setuptools import setup, find_packages -__version__ = '1.1' +__version__ = "1.1" # Get the long description from the README file -with open('README.md', 'r') as readme: +with open("README.md", "r") as readme: long_description = readme.read() # get the dependencies and installs -with open('requirements.txt', 'r') as requirements: +with open("requirements.txt", "r") as requirements: reqs = requirements.read().splitlines() setup( - name='kinfin', + name="kinfin", version=__version__, - description='Taxon-aware analysis of clustered protein data', + description="Taxon-aware analysis of clustered protein data", long_description=long_description, - url='https://github.com/DRL/kinfin', - download_url='https://github.com/DRL/kinfin/tarball/' + __version__, - license='GnuGPL3', + url="https://github.com/DRL/kinfin", + download_url="https://github.com/DRL/kinfin/tarball/" + __version__, + license="GnuGPL3", classifiers=[ - 'Development Status :: 3 - Alpha', - 'Intended Audience :: Developers', - 'Programming Language :: Python :: 3', + "Development Status :: 3 - Alpha", + "Intended Audience :: Developers", + "Programming Language :: Python :: 3", ], - keywords='Comparative genomics', - packages=find_packages(exclude=['docs', 'tests*']), + keywords="Comparative genomics", + packages=find_packages(exclude=["docs", "tests*"]), include_package_data=True, - author='Dominik R Laetsch', + author="Dominik R Laetsch", entry_points={ - 'console_scripts': [ + "console_scripts": [ "kinfin=src.kinfin:main", - ], - }, - author_email='dominik.laetsch@gmail.com' + ], + }, + author_email="dominik.laetsch@gmail.com", ) diff --git a/src/api/__init__.py b/src/api/__init__.py new file mode 100644 index 0000000..e8891de --- /dev/null +++ b/src/api/__init__.py @@ -0,0 +1,49 @@ +from core.input import ServeArgs + + +def run_server( + args: ServeArgs, + nodesdb_f: str, + pfam_mapping_f: str, + ipr_mapping_f: str, + go_mapping_f: str, + cluster_f: str, + taxon_idx_mapping_file: str, + sequence_ids_f: str, +) -> None: + """ + Starts the uvicorn server + + Parameters: + - args [ServeArgs] : An 
object containing server configuration arguments, such as the port. + - nodesdb_f [str] : File path to the nodesDB file. + - pfam_mapping_f [str] : File path to the PFAM mapping file. + - ipr_mapping_f [str] : File path to the InterPro mapping file. + - go_mapping_f [str] : File path to the Gene Ontology mapping file. + - cluster_f [str] : File path to the clustering data file. + - taxon_idx_mapping_file [str] : File path to the taxon index mapping file. + - sequence_ids_f [str] : File path to the sequence IDs file. + """ + import uvicorn + from fastapi import FastAPI + + from api.endpoints import router + from api.sessions import query_manager + + query_manager.cluster_f = cluster_f + query_manager.sequence_ids_f = sequence_ids_f + query_manager.taxon_idx_mapping_file = taxon_idx_mapping_file + query_manager.nodesdb_f = nodesdb_f + query_manager.pfam_mapping_f = pfam_mapping_f + query_manager.ipr_mapping_f = ipr_mapping_f + query_manager.go_mapping_f = go_mapping_f + + app = FastAPI() + + @app.get("/") + def hello(): + return {"hi": "hello"} + + app.include_router(router) + + uvicorn.run(app=app, port=args.port) diff --git a/src/api/endpoints.py b/src/api/endpoints.py new file mode 100644 index 0000000..9b338ce --- /dev/null +++ b/src/api/endpoints.py @@ -0,0 +1,836 @@ +import asyncio +import json +import os +from datetime import datetime +from functools import wraps +from typing import Any, Dict, List, Optional + +from fastapi import APIRouter, Depends, HTTPException, Query, Request +from fastapi.responses import FileResponse, JSONResponse +from fastapi.security import APIKeyHeader +from pydantic import BaseModel + +from api.fileparsers import ( + parse_attribute_summary_file, + parse_cluster_metrics_file, + parse_cluster_summary_file, + parse_pairwise_file, + parse_taxon_counts_file, +) +from api.sessions import query_manager +from api.utils import ( + extract_attributes_and_taxon_sets, + read_status, + run_cli_command, + sort_and_paginate_result, +) + +RUN_SUMMARY_FILEPATH = "summary.json" +COUNTS_FILEPATH = "cluster_counts_by_taxon.txt" +CLUSTER_SUMMARY_FILENAME = "cluster_summary.txt" +ATTRIBUTE_METRICS_FILENAME = "attribute_metrics.txt" +CLUSTER_METRICS_FILENAME = "cluster_metrics.txt" +PAIRWISE_ANALYSIS_FILE = "pairwise_representation_test.txt" + + +class InputSchema(BaseModel): + config: List[Dict[str, str]] + + +class ResponseSchema(BaseModel): + status: str + message: str + query: Optional[str] = None + data: Optional[Any] = None + timestamp: str = datetime.now().isoformat() + error: Optional[str] = None + total_pages: Optional[int] = None + current_page: Optional[int] = None + entries_per_page: Optional[int] = None + + +# X-Session-ID header will be required to access plots/files later +header_scheme = APIKeyHeader(name="x-session-id") + +router = APIRouter() + + +def check_kinfin_session(func): + @wraps(func) + async def wrapper(request: Request, session_id: str, *args, **kwargs): + try: + result_dir = query_manager.get_session_dir(session_id) + if not result_dir: + return JSONResponse( + content=ResponseSchema( + status="error", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + status_code=428, + ) + + status_file = os.path.join(result_dir, f"{session_id}.status") + if not os.path.exists(status_file): + return JSONResponse( + content=ResponseSchema( + status="success", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + 
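+                    # 428 (Precondition Required): the client has to POST /kinfin/init and wait for the analysis before querying this endpoint.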
status_code=428, + ) + + run_status = read_status(status_file) + status = run_status.get("status") + + if status in ["running", "pending"]: + return JSONResponse( + content=ResponseSchema( + status="success", + message="Kinfin analysis is still running. Please wait for analysis to complete", + data={"is_complete": False}, + query=str(request.url), + ).model_dump(), + status_code=202, + ) + elif status == "error": + return JSONResponse( + content=ResponseSchema( + status="error", + message="Some error occurred during Kinfin analysis.", + error=run_status, + data={"session_terminated_due_to_error"}, + query=str(request.url), + ).model_dump(), + status_code=400, + ) + + return await func(request, session_id=session_id, *args, **kwargs) + + except Exception as e: + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + error=str(e), + query=str(request.url), + ).model_dump(), + status_code=500, + ) + + return wrapper + + +@router.post("/kinfin/init", response_model=ResponseSchema) +async def initialize( + input_data: InputSchema, + request: Request, +): + """ + Initialize the analysis process. + + Args: + input_data (InputSchema): The input data for analysis. + background_tasks (BackgroundTasks): FastAPI's BackgroundTasks for running analysis asynchronously. + + Returns: + JSONResponse: A response indicating that the analysis task has been queued. + + Raises: + HTTPException: If there's an error in the input data or during processing. + """ + try: + if not isinstance(input_data.config, list): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Data must be a list of dictionaries.", + error="Invalid input data format", + query=str(request.url), + ).model_dump(), + status_code=400, + ) + + if not all(isinstance(item, dict) for item in input_data.config): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Each item in data must be a dictionary.", + error="Invalid data format", + query=str(request.url), + ).model_dump(), + status_code=400, + ) + + session_id, result_dir = query_manager.get_or_create_session(input_data.config) + config_f = os.path.join(result_dir, "config.json") + + with open(config_f, "w") as file: + json.dump(input_data.config, file) + + command = [ + "python", + "src/main.py", + "analyse", + "-g", + query_manager.cluster_f, + "-c", + config_f, + "-s", + query_manager.sequence_ids_f, + "-m", + query_manager.taxon_idx_mapping_file, + "-o", + result_dir, + "--plot_format", + "png", + ] + + status_file = os.path.join(result_dir, f"{session_id}.status") + asyncio.create_task(run_cli_command(command, status_file)) + + response = ResponseSchema( + status="success", + message="Analysis task has been queued.", + data={"session_id": session_id}, + query=str(request.url), + ) + return JSONResponse( + content=response.model_dump(), + status_code=202, + ) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/status", response_model=ResponseSchema) +@check_kinfin_session +async def get_run_status(request: Request, session_id: str = Depends(header_scheme)): + try: + return JSONResponse( + content=ResponseSchema( + status="success", + message="Kinfin analysis is complete.", + data={"is_complete": True}, + query=str(request.url), + ).model_dump(), + status_code=200, + ) + + except Exception as e: + print(e) 
+ return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/run-summary", response_model=ResponseSchema) +@check_kinfin_session +async def get_run_summary( + request: Request, + session_id: str = Depends(header_scheme), + detailed: Optional[bool] = Query(False), +): + try: + result_dir = query_manager.get_session_dir(session_id) + filepath = os.path.join(result_dir, RUN_SUMMARY_FILEPATH) + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{RUN_SUMMARY_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + with open(filepath, "r") as f: + data = json.load(f) + + if not detailed: + data = { + k: v + for k, v in data.items() + if k not in ["included_proteins", "excluded_proteins"] + } + + response = ResponseSchema( + status="success", + message="Run summary retrieved successfully.", + query=str(request.url), + data=data, + ) + return JSONResponse(content=response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/counts-by-taxon", response_model=ResponseSchema) +@check_kinfin_session +async def get_counts_by_taxon( + request: Request, + session_id: str = Depends(header_scheme), + include_clusters: Optional[str] = Query(None), + exclude_clusters: Optional[str] = Query(None), + min_count: Optional[int] = Query(None), + max_count: Optional[int] = Query(None), + include_taxons: Optional[str] = Query(None), + exclude_taxons: Optional[str] = Query(None), +): + try: + result_dir = query_manager.get_session_dir(session_id) + filepath = os.path.join(result_dir, COUNTS_FILEPATH) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{COUNTS_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result = parse_taxon_counts_file( + filepath, + include_clusters=include_clusters, + exclude_clusters=exclude_clusters, + include_taxons=include_taxons, + exclude_taxons=exclude_taxons, + min_count=min_count, + max_count=max_count, + ) + + response = ResponseSchema( + status="success", + message="Cluster counts by Taxon retrieved successfully", + data=result, + query=str(request.url), + ) + return JSONResponse(response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/cluster-summary/{attribute}", response_model=ResponseSchema) +@check_kinfin_session +async def get_cluster_summary( + request: Request, + attribute: str, + session_id: str = Depends(header_scheme), + include_clusters: Optional[str] = Query(None), + exclude_clusters: Optional[str] = Query(None), + include_properties: Optional[str] = Query(None), + exclude_properties: Optional[str] = Query(None), + min_cluster_protein_count: Optional[int] = Query(None), + max_cluster_protein_count: Optional[int] = Query(None), + min_protein_median_count: Optional[float] = Query(None), + max_protein_median_count: Optional[float] = Query(None), + sort_by: Optional[str] = Query(None), + sort_order: Optional[str] =
Query("asc"), + page: Optional[int] = Query(1), + size: Optional[int] = Query(10), +) -> JSONResponse: + try: + result_dir = query_manager.get_session_dir(session_id) + config_f = os.path.join(result_dir, "config.json") + if not os.path.exists(config_f): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + status_code=428, + ) + + valid_endpoints = extract_attributes_and_taxon_sets(config_f) + valid_attributes = valid_endpoints["attributes"] + + if attribute and attribute not in valid_attributes: + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"Invalid attribute: {attribute}. Must be one of {valid_attributes}.", + error="Invalid Input", + ).model_dump(), + status_code=400, + ) + + filename = f"{attribute}/{attribute}.{CLUSTER_SUMMARY_FILENAME}" + filepath = os.path.join(result_dir, filename) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{COUNTS_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result = parse_cluster_summary_file( + filepath=filepath, + include_clusters=include_clusters, + exclude_clusters=exclude_clusters, + include_properties=include_properties, + exclude_properties=exclude_properties, + min_cluster_protein_count=min_cluster_protein_count, + max_cluster_protein_count=max_cluster_protein_count, + min_protein_median_count=min_protein_median_count, + max_protein_median_count=max_protein_median_count, + ) + + paginated_result, total_pages = sort_and_paginate_result( + result, + sort_by, + sort_order, + page, + size, + ) + + response = ResponseSchema( + status="success", + message="Cluster summary retrieved successfully", + data=paginated_result, + query=str(request.url), + current_page=page, + entries_per_page=size, + total_pages=total_pages, + ) + return JSONResponse(response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/available-attributes-taxonsets") +@check_kinfin_session +async def get_available_attributes_and_taxon_sets( + request: Request, + session_id: str = Depends(header_scheme), +): + try: + result_dir = query_manager.get_session_dir(session_id) + result = extract_attributes_and_taxon_sets(result_dir) + return JSONResponse( + content=ResponseSchema( + status="success", + message="List of available attributes and taxon sets fetched", + data=result, + query=str(request.url), + ).model_dump(), + status_code=200, + ) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/attribute-summary/{attribute}", response_model=ResponseSchema) +@check_kinfin_session +async def get_attribute_summary( + request: Request, + attribute: str, + session_id: str = Depends(header_scheme), + sort_by: Optional[str] = Query(None), + sort_order: Optional[str] = Query("asc"), + page: Optional[int] = Query(1), + size: Optional[int] = Query(10), +): + try: + result_dir = query_manager.get_session_dir(session_id) + config_f = os.path.join(result_dir, "config.json") + if not 
os.path.exists(config_f): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + status_code=428, + ) + + valid_endpoints = extract_attributes_and_taxon_sets(config_f) + valid_attributes = valid_endpoints["attributes"] + + if attribute and attribute not in valid_attributes: + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"Invalid attribute: {attribute}. Must be one of {valid_attributes}.", + error="Invalid Input", + ).model_dump(), + status_code=400, + ) + + filename = f"{attribute}/{attribute}.{ATTRIBUTE_METRICS_FILENAME}" + filepath = os.path.join(result_dir, filename) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{COUNTS_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result = parse_attribute_summary_file(filepath=filepath) + paginated_result, total_pages = sort_and_paginate_result( + result, + sort_by, + sort_order, + page, + size, + ) + response = ResponseSchema( + status="success", + message="Cluster summary retrieved successfully", + data=paginated_result, + query=str(request.url), + current_page=page, + entries_per_page=size, + total_pages=total_pages, + ) + return JSONResponse(response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get( + "/kinfin/cluster-metrics/{attribute}/{taxon_set}", + response_model=ResponseSchema, +) +@check_kinfin_session +async def get_cluster_metrics( + request: Request, + attribute: str, + taxon_set: str, + session_id: str = Depends(header_scheme), + cluster_status: Optional[str] = Query(None), + cluster_type: Optional[str] = Query(None), + sort_by: Optional[str] = Query(None), + sort_order: Optional[str] = Query("asc"), + page: Optional[int] = Query(1), + size: Optional[int] = Query(10), +): + try: + result_dir = query_manager.get_session_dir(session_id) + config_f = os.path.join(result_dir, "config.json") + if not os.path.exists(config_f): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + status_code=428, + ) + + valid_endpoints = extract_attributes_and_taxon_sets(config_f) + valid_attributes = valid_endpoints["attributes"] + + if attribute and attribute not in valid_attributes: + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"Invalid attribute: {attribute}. Must be one of {valid_attributes}.", + error="Invalid Input", + ).model_dump(), + status_code=400, + ) + + valid_taxon_sets = valid_endpoints["taxon_sets"] + + if taxon_set and taxon_set not in valid_taxon_sets: + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"Invalid taxon set: {taxon_set}. 
Must be one of {valid_taxon_sets}.", + error="Invalid Input", + ).model_dump(), + status_code=400, + ) + + filename = f"{attribute}/{attribute}.{taxon_set}.{CLUSTER_METRICS_FILENAME}" + filepath = os.path.join(result_dir, filename) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{COUNTS_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result = parse_cluster_metrics_file(filepath, cluster_status, cluster_type) + paginated_result, total_pages = sort_and_paginate_result( + result, + sort_by, + sort_order, + page, + size, + ) + response = ResponseSchema( + status="success", + message="Cluster summary retrieved successfully", + data=paginated_result, + query=str(request.url), + current_page=page, + entries_per_page=size, + total_pages=total_pages, + ) + + return JSONResponse(response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get( + "/kinfin/pairwise-analysis/{attribute}", + response_model=ResponseSchema, +) +@check_kinfin_session +async def get_pairwise_analysis( + request: Request, + attribute: str, + session_id: str = Depends(header_scheme), + taxon_1: Optional[str] = Query(None), + taxon_2: Optional[str] = Query(None), +): + try: + result_dir = query_manager.get_session_dir(session_id) + config_f = os.path.join(result_dir, "config.json") + if not os.path.exists(config_f): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Kinfin analysis not initialized", + error="session_not_initialized", + query=str(request.url), + ).model_dump(), + status_code=428, + ) + + valid_endpoints = extract_attributes_and_taxon_sets(config_f) + valid_attributes = valid_endpoints["attributes"] + + if attribute and attribute not in valid_attributes: + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"Invalid attribute: {attribute}. Must be one of {valid_attributes}.", + error="Invalid Input", + ).model_dump(), + status_code=400, + ) + + filename = f"{attribute}/{attribute}.{PAIRWISE_ANALYSIS_FILE}" + filepath = os.path.join(result_dir, filename) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message=f"{COUNTS_FILEPATH} File Not Found", + error="File does not exist", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result = parse_pairwise_file(filepath, taxon_1, taxon_2) + + response = ResponseSchema( + status="success", + message="Cluster summary retrieved successfully", + data=result, + query=str(request.url), + ) + + return JSONResponse(response.model_dump()) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + query=str(request.url), + error=str(e), + ).model_dump(), + status_code=500, + ) + + +@router.get("/kinfin/plot/{plot_type}") +@check_kinfin_session +async def get_plot( + request: Request, + plot_type: str, + session_id: str = Depends(header_scheme), +) -> FileResponse: + """ + Retrieve a specific plot type for a given session. + + Args: + plot_type (str): The type of plot to retrieve. + session_id (str): The session ID for authentication. + + Returns: + FileResponse: The requested plot file. 
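+ + Example (illustrative only; assumes a completed analysis session and the default port 8000): + curl -H "x-session-id: <session_id>" http://localhost:8000/kinfin/plot/cluster-size-distribution -o cluster_size_distribution.png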
+ + Raises: + HTTPException: If the plot type is invalid, session ID is invalid, or the file is not found. + """ + try: + if plot_type not in ["cluster-size-distribution", "all-rarefaction-curve"]: + return JSONResponse( + content=ResponseSchema( + status="error", + message="Invalid Plot Type", + error="invalid_plot_type", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + result_dir = query_manager.get_session_dir(session_id) + filepath: str = "" + match plot_type: + case "cluster-size-distribution": + filepath = "cluster_size_distribution.png" + case "all-rarefaction-curve": + filepath = "all/all.rarefaction_curve.png" + case _: + return JSONResponse( + content=ResponseSchema( + status="error", + message="Invalid Plot Type", + error="invalid_plot_type", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + filepath = os.path.join(result_dir, filepath) + + if not os.path.exists(filepath): + return JSONResponse( + content=ResponseSchema( + status="error", + message="Plot not found", + error="plot_not_found", + query=str(request.url), + ).model_dump(), + status_code=404, + ) + + return FileResponse( + filepath, + media_type="image/png", + headers={"Content-Disposition": "inline"}, + ) + except HTTPException as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message=e.detail, + query=str(request.url), + ).model_dump(), + status_code=e.status_code, + ) + except Exception as e: + print(e) + return JSONResponse( + content=ResponseSchema( + status="error", + message="Internal Server Error", + error=str(e), + query=str(request.url), + ).model_dump(), + status_code=500, + ) diff --git a/src/api/fileparsers.py b/src/api/fileparsers.py new file mode 100644 index 0000000..7e4e99b --- /dev/null +++ b/src/api/fileparsers.py @@ -0,0 +1,247 @@ +import csv +from typing import Optional, Set, Union + + +def read_tsv_file(filepath: str, delimiter: str = "\t"): + try: + with open(filepath, "r", newline="") as file: + yield from csv.DictReader(file, delimiter=delimiter) + except csv.Error as e: + raise ValueError(f"Error reading CSV file: {e}") from e + + +def split_to_set(value: Optional[str]) -> Optional[Set[str]]: + return set(value.split(",")) if value else None + + +def filter_include_exclude( + item: str, + include_set: Optional[Set[str]] = None, + exclude_set: Optional[Set[str]] = None, +) -> bool: + if include_set and item not in include_set: + return False + return not exclude_set or item not in exclude_set + + +def filter_min_max( + value: Union[int, float], + min_value: Optional[Union[int, float]] = None, + max_value: Optional[Union[int, float]] = None, +) -> bool: + if min_value is not None: + min_value = float(min_value) + if max_value is not None: + max_value = float(max_value) + + return (min_value is None or value >= min_value) and ( + max_value is None or value <= max_value + ) + + +def parse_taxon_counts_file( + filepath: str, + include_clusters: Optional[str], + exclude_clusters: Optional[str], + include_taxons: Optional[str], + exclude_taxons: Optional[str], + min_count: Optional[int], + max_count: Optional[int], +): + included_clusters = split_to_set(include_clusters) + excluded_clusters = split_to_set(exclude_clusters) + included_taxons = split_to_set(include_taxons) + excluded_taxons = split_to_set(exclude_taxons) + + result = {} + + for row in read_tsv_file(filepath): + cluster_id = row["#ID"] + + if not filter_include_exclude(cluster_id, included_clusters, excluded_clusters): + continue + + if filtered_values := { 
+ taxon: int(count) + for taxon, count in row.items() + if taxon != "#ID" + and filter_min_max(int(count), min_count, max_count) + and filter_include_exclude(taxon, included_taxons, excluded_taxons) + }: + result[cluster_id] = filtered_values + + return result + + +def parse_cluster_summary_file( + filepath: str, + include_clusters: Optional[str], + exclude_clusters: Optional[str], + include_properties: Optional[str], + exclude_properties: Optional[str], + min_cluster_protein_count: Optional[int], + max_cluster_protein_count: Optional[int], + min_protein_median_count: Optional[float], + max_protein_median_count: Optional[float], +): + included_clusters = split_to_set(include_clusters) + excluded_clusters = split_to_set(exclude_clusters) + included_properties = split_to_set(include_properties) + excluded_properties = split_to_set(exclude_properties) + + rows = read_tsv_file(filepath) + result = {} + for row in rows: + cluster_id = row["#cluster_id"] + if not filter_include_exclude(cluster_id, included_clusters, excluded_clusters): + continue + + summary = { + "cluster_id": cluster_id, + "cluster_protein_count": int(row["cluster_protein_count"]), + "protein_median_count": float(row["protein_median_count"]), + "TAXON_count": int(row["TAXON_count"]), + "attribute": row["attribute"], + "attribute_cluster_type": row["attribute_cluster_type"], + "protein_span_mean": ( + None + if row["protein_span_mean"] == "N/A" + else float(row["protein_span_mean"]) + ), + "protein_span_sd": ( + None + if row["protein_span_sd"] == "N/A" + else float(row["protein_span_sd"]) + ), + } + + if not filter_min_max( + summary["cluster_protein_count"], + min_cluster_protein_count, + max_cluster_protein_count, + ) or not filter_min_max( + summary["protein_median_count"], + min_protein_median_count, + max_protein_median_count, + ): + continue + protein_counts = { + k: v + for k, v in row.items() + if k not in summary + and filter_include_exclude(k, included_properties, excluded_properties) + } + + result[cluster_id] = {**summary, "protein_counts": protein_counts} + return result + + +def parse_attribute_summary_file(filepath: str): + result = {} + + for row in read_tsv_file(filepath): + taxon_set = row["taxon_set"] + result[taxon_set] = { + "taxon_set": taxon_set, + "cluster_total_count": row["cluster_total_count"], + "protein_total_count": row["protein_total_count"], + "protein_total_span": row["protein_total_span"], + "singleton": { + "cluster_count": row["singleton_cluster_count"], + "protein_count": row["singleton_protein_count"], + "protein_span": row["singleton_protein_span"], + }, + "specific": { + "cluster_count": row["specific_cluster_count"], + "protein_count": row["specific_protein_count"], + "protein_span": row["specific_protein_span"], + "cluster_true_1to1_count": row["specific_cluster_true_1to1_count"], + "cluster_fuzzy_count": row["specific_cluster_fuzzy_count"], + }, + "shared": { + "cluster_count": row["shared_cluster_count"], + "protein_count": row["shared_protein_count"], + "protein_span": row["shared_protein_span"], + "cluster_true_1to1_count": row["shared_cluster_true_1to1_count"], + "cluster_fuzzy_count": row["shared_cluster_fuzzy_count"], + }, + "absent": { + "cluster_total_count": row["absent_cluster_total_count"], + "cluster_singleton_count": row["absent_cluster_singleton_count"], + "cluster_specific_count": row["absent_cluster_specific_count"], + "cluster_shared_count": row["absent_cluster_shared_count"], + }, + "TAXON_count": row["TAXON_count"], + "TAXON_taxa": row["TAXON_taxa"].split(", "), + } 
+ return result + + +def parse_cluster_metrics_file( + filepath: str, + cluster_status: Optional[str], + cluster_type: Optional[str], +): + result = {} + valid_status = split_to_set(cluster_status) + valid_types = split_to_set(cluster_type) + rows = read_tsv_file(filepath) + + for row in rows: + cluster_id = row["#cluster_id"] + if valid_types and row["cluster_type"] not in valid_types: + continue + + if not filter_include_exclude(row["cluster_status"], valid_status): + continue + + if not filter_include_exclude(row["cluster_type"], valid_types): + continue + + result[cluster_id] = { + "cluster_id": cluster_id, + "cluster_status": row["cluster_status"], + "cluster_type": row["cluster_type"], + "present_in_cluster": row["cluster_status"] == "present", + "is_singleton": row["cluster_type"] == "singleton", + "is_specific": row["cluster_type"] == "specific", + "counts": { + "cluster_protein_count": row["cluster_protein_count"], + "cluster_proteome_count": row["cluster_proteome_count"], + "TAXON_protein_count": row["TAXON_protein_count"], + "TAXON_mean_count": row["TAXON_mean_count"], + "non_taxon_mean_count": row["non_taxon_mean_count"], + }, + "representation": row["representation"], + "log2_mean(TAXON/others)": row["log2_mean(TAXON/others)"], + "pvalue(TAXON vs. others)": row["pvalue(TAXON vs. others)"], + "coverage": { + "taxon_coverage": row["TAXON_coverage"], + "TAXON_count": row["TAXON_count"], + "non_TAXON_count": row["non_TAXON_count"], + }, + "TAXON_taxa": ( + row["TAXON_taxa"].split(",") if row["TAXON_taxa"] != "N/A" else "N/A" + ), + "non_TAXON_taxa": ( + row["non_TAXON_taxa"].split(",") + if row["non_TAXON_taxa"] != "N/A" + else "N/A" + ), + } + + return result + + +def parse_pairwise_file(filepath: str, taxon_1: Optional[str], taxon_2: Optional[str]): + result = [] + for row in read_tsv_file(filepath): + if taxon_1 and row["TAXON_1"] != taxon_1 and row["TAXON_2"] != taxon_1: + continue + + if taxon_2 and row["TAXON_1"] != taxon_2 and row["TAXON_2"] != taxon_2: + continue + + result.append(row) + + return result diff --git a/src/api/sessions.py b/src/api/sessions.py new file mode 100644 index 0000000..66b5d8b --- /dev/null +++ b/src/api/sessions.py @@ -0,0 +1,114 @@ +import hashlib +import json +import logging +import os +import shutil +import signal +import sys +import threading +import time +from datetime import datetime, timedelta +from typing import Dict, List, Optional, Tuple + +logger = logging.getLogger("kinfin_logger") + + +class QueryManager: + """ + A class to manage query sessions, including creation, retrieval, and cleanup of session directories. + """ + + def __init__(self, expiration_hours: int = 24) -> None: + """Initializes the QueryManager with the specified expiration time for sessions.""" + self.results_base_dir = os.environ.get("RESULTS_BASE_DIR") + self.cluster_f = "" + self.sequence_ids_f = "" + self.taxon_idx_mapping_file = "" + self.nodesdb_f = "" + self.pfam_mapping_f = "" + self.ipr_mapping_f = "" + self.go_mapping_f = "" + if self.results_base_dir is None or not os.path.isabs(self.results_base_dir): + sys.exit("[ERROR] RESULTS_BASE_DIR should be an absolute path.") + + self.expiration_hours = expiration_hours + os.makedirs(self.results_base_dir, exist_ok=True) + + self.cleanup_thread = threading.Thread(target=self.cleanup_loop, daemon=True) + self.cleanup_thread.start() + + def get_session_id(self, query: List[Dict[str, str]]) -> str: + """ + Generate a unique session ID based on the query. 
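+ + The ID is the MD5 hex digest of the JSON-serialized query (keys sorted), so identical configurations always resolve to the same session directory.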
+ + Args: + query (List[Dict[str, str]]): The query for which to generate a session ID. + + Returns: + str: The generated session ID. + """ + query_json = json.dumps(query, sort_keys=True) + return hashlib.md5(query_json.encode()).hexdigest() + + def get_or_create_session(self, query: List[Dict[str, str]]) -> Tuple[str, str]: + """ + Get or create a session directory based on the query. + + Args: + query (List[Dict[str, str]]): The query for which to get or create a session. + + Returns: + tuple: The session ID and the session directory path. + """ + session_id = self.get_session_id(query) + session_dir = os.path.join(self.results_base_dir, session_id) + + if not os.path.exists(session_dir): + os.makedirs(session_dir) + else: + os.utime(session_dir, None) + + return session_id, session_dir + + def get_session_dir(self, session_id: str) -> Optional[str]: + """ + Get the directory path of an existing session. + + Args: + session_id (str): The session ID for which to get the directory path. + + Returns: + str: The session directory path, or None if the session does not exist. + """ + session_dir = os.path.join(self.results_base_dir, session_id) + if os.path.exists(session_dir): + os.utime(session_dir, None) + return session_dir + return None + + def cleanup_loop(self) -> None: + """The main loop for periodically cleaning up expired sessions.""" + while True: + self.cleanup_expired_sessions() + time.sleep(3600) + + def cleanup_expired_sessions(self) -> None: + """Clean up sessions that have expired based on the expiration time.""" + now = datetime.now() + for session_id in os.listdir(self.results_base_dir): + session_dir = os.path.join(self.results_base_dir, session_id) + mod_time = datetime.fromtimestamp(os.path.getmtime(session_dir)) + + if now - mod_time > timedelta(hours=self.expiration_hours): + shutil.rmtree(session_dir) + + def __exit__(self, _, __) -> None: + """Cleanup all sessions when exiting due to signal""" + shutil.rmtree(self.results_base_dir) + exit(0) + + +query_manager = QueryManager() + +signal.signal(signal.SIGINT, query_manager.__exit__) +signal.signal(signal.SIGTERM, query_manager.__exit__) diff --git a/src/api/utils.py b/src/api/utils.py new file mode 100644 index 0000000..aa2f6ca --- /dev/null +++ b/src/api/utils.py @@ -0,0 +1,117 @@ +import asyncio +import glob +from collections import defaultdict + + +def read_status(status_file): + status_info = {} + with open(status_file, "r") as file: + for line in file: + key, value = line.strip().split("=", 1) + status_info[key] = value + + return status_info + + +def write_status( + status_file: str, + status: str, + exit_code: int = None, + error: str = None, +): + with open(status_file, "w") as file: + file.write(f"status={status}\n") + if exit_code is not None: + file.write(f"exit_code={exit_code}\n") + if error: + file.write(f"error={error}\n") + + +def extract_error_message(stderr: str) -> str: + lines = stderr.strip().splitlines() + error_message_lines = [] + error_found = False + + for line in lines: + if "[ERROR] -" in line: + error_found = True + error_message_lines.append(line.split("[ERROR] -")[1]) + continue + if error_found: + error_message_lines.append(line) + + return ( + " ".join(error_message_lines) + if error_message_lines + else "An unknown error occurred." 
+ ) + + +async def run_cli_command(command: list, status_file: str): + write_status(status_file, "running") + + try: + process = await asyncio.create_subprocess_exec( + *command, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + stdout, stderr = await process.communicate() + stdout = stdout.decode().strip() + stderr = stderr.decode().strip() + + if process.returncode == 0: + write_status(status_file, "completed") + return stdout + else: + error_message = extract_error_message(stderr) + write_status( + status_file, + "error", + exit_code=process.returncode, + error=error_message, + ) + return None + + except Exception as e: + write_status(status_file, "error", error=str(e)) + return None + + +def extract_attributes_and_taxon_sets(filepath: str): + files = glob.glob(f"{filepath}/**/*.cluster_metrics.txt") + files = [file.split(filepath)[1] for file in files] + attributes = set() + result = {"attributes": [], "taxon_sets": defaultdict(list)} + for file in files: + filename = file.split("/")[-1] + attribute = filename.split(".")[0] + taxon_set = filename.split(".")[1] + attributes.add(attribute) + result["taxon_sets"][attribute].append(taxon_set) + result["attributes"] = sorted(attributes) + return result + + +def sort_and_paginate_result( + result: dict, + sort_by: str, + sort_order: str = "asc", + page: int = 1, + size: int = 20, +) -> tuple: + if sort_by: + sort_keys = sort_by.split(",") + items = list(result.items()) + items.sort( + key=lambda item: tuple(item[1].get(key, float("inf")) for key in sort_keys), + reverse=(sort_order != "asc"), + ) + result = dict(items) + start_index = (page - 1) * size + end_index = start_index + size + paginated_result = dict(list(result.items())[start_index:end_index]) + total_pages = -(-len(result) // size) + + return paginated_result, total_pages diff --git a/src/cli/__init__.py b/src/cli/__init__.py new file mode 100644 index 0000000..5b52ebe --- /dev/null +++ b/src/cli/__init__.py @@ -0,0 +1,20 @@ +import os + +from core.input import InputData +from core.logger import setup_logger +from core.results import analyse + + +def run_cli(args: InputData) -> None: + """ + Run the command-line interface to perform analysis based on the provided input data. + + Args: + args (InputData): An instance of InputData containing input parameters and data. + + Returns: + None + """ + log_path = os.path.join(args.output_path, "kinfin.log") + setup_logger(log_path) + analyse(args) diff --git a/src/cli/commands.py b/src/cli/commands.py new file mode 100644 index 0000000..6a8d176 --- /dev/null +++ b/src/cli/commands.py @@ -0,0 +1,212 @@ +import argparse +import sys +from typing import Union + +from cli.validate import validate_cli_args +from core.config import SUPPORTED_PLOT_FORMATS, SUPPORTED_TAXRANKS, SUPPORTED_TESTS +from core.input import InputData, ServeArgs + + +# TODO : --plotsize should take a tuple +# TODO : --taxranks should take multiple inputs +def parse_args( + nodesdb_f: str, + pfam_mapping_f: str, + ipr_mapping_f: str, + go_mapping_f: str, +) -> Union[ServeArgs, InputData]: + """Parse command-line arguments. + + Args: + nodesdb_f (str): Path to the nodesDB file. + pfam_mapping_f (str): Path to the PFAM mapping file. + ipr_mapping_f (str): Path to the InterPro mapping file. + go_mapping_f (str): Path to the Gene Ontology mapping file. + + Returns: + ServeArgs or InputData: Parsed arguments based on the command. + + Raises: + SystemExit: If an invalid command is provided.
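+ + Example invocations (file names are illustrative): + python src/main.py serve --port 8000 + python src/main.py analyse -g OrthologousGroups.txt -c config.csv -s SequenceIDs.txt -o results/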
+ """ + + parser = argparse.ArgumentParser( + description="Kinfin proteome cluster analysis tool" + ) + + subparsers = parser.add_subparsers(title="command", required=True, dest="command") + api_parser = subparsers.add_parser("serve", help="Start the server") + api_parser.add_argument( + "-p", + "--port", + type=int, + default=8000, + help="Port number for the server (default: 8000)", + ) + + cli_parser = subparsers.add_parser("analyse", help="Perform analysis") + + # Required Arguments + required_group = cli_parser.add_argument_group("Required Arguments") + required_group.add_argument( + "-g", + "--cluster_file", + help="OrthologousGroups.txt produced by OrthoFinder", + required=True, + ) + required_group.add_argument( + "-c", "--config_file", help="Config file (in CSV format)", required=True + ) + required_group.add_argument( + "-s", + "--sequence_ids_file", + help="SequenceIDs.txt used in OrthoFinder", + required=True, + ) + + # Other Files + other_files_group = cli_parser.add_argument_group("Other Files") + other_files_group.add_argument( + "-p", "--species_ids_file", help="SpeciesIDs.txt used in OrthoFinder" + ) + other_files_group.add_argument( + "-m", "--taxon_idx_mapping", help="TAXON IDX Mapping File" + ) + other_files_group.add_argument( + "-f", + "--functional_annotation", + help="Mapping of ProteinIDs to GO/IPRS/SignalP/Pfam (can be generated through 'iprs_to_table.py')", + ) + other_files_group.add_argument("-a", "--fasta_dir", help="Directory of FASTA files") + other_files_group.add_argument( + "-t", + "--tree_file", + help="Tree file in Newick format (taxon names must be the same as TAXON in config file)", + ) + + # General Options + general_group = cli_parser.add_argument_group("General Options") + general_group.add_argument("-o", "--output_path", help="Output prefix") + general_group.add_argument( + "--infer_singletons", + help="Absence of proteins in clustering is interpreted as singleton (based on SequenceIDs.txt)", + action="store_true", + ) + general_group.add_argument( + "--plot_tree", + help="Plot PDF of annotated phylogenetic tree (requires -t, full ETE3 installation and X-server/xvfb-run)", + action="store_true", + ) + general_group.add_argument( + "--min_proteomes", + help="Required number of proteomes in a taxon-set to be used in rarefaction/representation-test computations [default: 2]", + default=2, + type=int, + ) + general_group.add_argument( + "--test", + help="Test to be used in representation-test computations [default: mannwhitneyu]. 
Options: ttest, welch, mannwhitneyu, ks, kruskal", + default="mannwhitneyu", + choices=SUPPORTED_TESTS, + ) + general_group.add_argument( + "-r", + "--taxranks", + help="Taxonomic ranks to be inferred from TaxIDs in config file [default: phylum,order,genus]", + # TODO : Add SUPPORTED_TAXRANKS here + default=["phylum", "order", "genus"], + nargs="+", + choices=SUPPORTED_TAXRANKS, + ) + general_group.add_argument( + "--repetitions", + help="Number of repetitions for rarefaction curves [default: 30]", + default=30, + type=int, + ) + + # Fuzzy Orthology Groups + fuzzy_group = cli_parser.add_argument_group("Fuzzy Orthology Groups") + fuzzy_group.add_argument( + "-n", + "--target_count", + help="Target number of copies per proteome [default: 1]", + default=1, + type=int, + ) + fuzzy_group.add_argument( + "-x", + "--target_fraction", + help="Min proportion of proteomes at target_count [default: 0.75]", + default=0.75, + type=float, + ) + fuzzy_group.add_argument( + "--min", + help="Min count of proteins for proteomes outside of target_fraction [default: 0]", + default=0, + type=int, + ) + fuzzy_group.add_argument( + "--max", + help="Max count of proteins for proteomes outside of target_fraction [default: 20]", + default=20, + type=int, + ) + + plotting_group = cli_parser.add_argument_group("Plotting Options") + plotting_group.add_argument( + "--fontsize", help="Fontsize for plots [default: 18]", default=18, type=int + ) + plotting_group.add_argument( + "--plotsize", + help="Size (WIDTH,HEIGHT) for plots [default: 24,12]", + default=(24, 12), + nargs=2, + ) + plotting_group.add_argument( + "--plot_format", + help="Plot formats [default: pdf]", + default="pdf", + choices=SUPPORTED_PLOT_FORMATS, + ) + + args = parser.parse_args() + + if args.command == "serve": + return ServeArgs(port=args.port) + elif args.command == "analyse": + validate_cli_args(args=args) + fuzzy_range = { + x for x in range(args.min, args.max + 1) if x != args.target_count + } + + return InputData( + cluster_file=args.cluster_file, + config_f=args.config_file, + sequence_ids_file=args.sequence_ids_file, + species_ids_file=args.species_ids_file, + functional_annotation_f=args.functional_annotation, + fasta_dir=args.fasta_dir, + tree_file=args.tree_file, + output_path=args.output_path, + infer_singletons=args.infer_singletons, + plot_tree=args.plot_tree, + min_proteomes=args.min_proteomes, + test=args.test, + taxranks=args.taxranks, + repetitions=args.repetitions + 1, + fuzzy_count=args.target_count, + fuzzy_fraction=args.target_fraction, + fuzzy_range=fuzzy_range, + fontsize=args.fontsize, + plotsize=args.plotsize, + plot_format=args.plot_format, + nodesdb_f=nodesdb_f, + pfam_mapping_f=pfam_mapping_f, + ipr_mapping_f=ipr_mapping_f, + go_mapping_f=go_mapping_f, + taxon_idx_mapping_file=args.taxon_idx_mapping, + ) + else: + sys.exit() diff --git a/src/cli/validate.py b/src/cli/validate.py new file mode 100644 index 0000000..cc9bbe0 --- /dev/null +++ b/src/cli/validate.py @@ -0,0 +1,83 @@ +import logging +import sys + +from core.utils import check_file + +logger = logging.getLogger("kinfin_logger") + + +def validate_cli_args(args) -> None: + """Validate cli input arguments. + + This function checks if all required files exist and if the arguments meet specific conditions. + + Args: + args (InputData): Input arguments as a named tuple. + + Raises: + SystemExit: If there are any validation errors, exits the program with error messages. 
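+ + Note: all validation failures are collected and reported together before exiting, rather than aborting on the first error.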
+ """ + + error_msgs = [] + + try: + check_file(args.cluster_file) + except FileNotFoundError as e: + error_msgs.append(str(e)) + try: + if not isinstance(args.config_file, str): + raise ValueError("[ERROR] - Invalid config file data") + + check_file(args.config_file) + except (FileNotFoundError, ValueError) as e: + error_msgs.append(str(e)) + try: + check_file(args.sequence_ids_file) + except FileNotFoundError as e: + error_msgs.append(str(e)) + try: + check_file(args.species_ids_file) + except FileNotFoundError as e: + error_msgs.append(str(e)) + try: + check_file(args.tree_file) + except FileNotFoundError as e: + error_msgs.append(str(e)) + try: + check_file(args.functional_annotation) + except FileNotFoundError as e: + error_msgs.append(str(e)) + + if args.fasta_dir and not args.species_ids_file: + error_msgs.append( + "[ERROR] : You have provided a FASTA-dir using '--fasta-dir'. Please also provide a Species-ID file using ('--species_ids_file')." + ) + + if args.target_count < 0: + error_msgs.append( + f"[ERROR] : --target_count {args.target_count} must be greater than 0" + ) + + if args.target_fraction < 0 or args.target_fraction > 1: + error_msgs.append( + f"[ERROR] : --target_fraction {args.target_fraction} is not between 0.0 and 1.0" + ) + + if args.min > args.max: + error_msgs.append( + f"[ERROR] : --min {args.min} is greater than --max {args.max}" + ) + + if args.repetitions <= 0: + error_msgs.append( + "[ERROR] : Please specify a positive integer for the number of repetitions for the rarefaction curves" + ) + + if args.min_proteomes <= 0: + error_msgs.append( + "[ERROR] : Please specify a positive integer for the minimum number of proteomes to consider for computations" + ) + + if error_msgs: + logger.error("\n".join(error_msgs)) + sys.exit(1) diff --git a/build/lib/kinfin/__init__.py b/src/core/__init__.py similarity index 100% rename from build/lib/kinfin/__init__.py rename to src/core/__init__.py diff --git a/src/core/alo.py b/src/core/alo.py new file mode 100644 index 0000000..378c83c --- /dev/null +++ b/src/core/alo.py @@ -0,0 +1,250 @@ +from typing import Dict, List, Literal, Optional, Set, Union + +from core.clusters import Cluster + + +class AttributeLevel: + """ + Definitions: + 'shared' : shared between one ALO and others + 'singleton' : cardinality of 1 ('specific', but separate) + 'specific' : only present within one ALO + """ + + def __init__(self, attribute: str, level: str, proteomes: Set[str]) -> None: + self.attribute: str = attribute + self.level: str = level + self.proteomes: Set[str] = set(proteomes) + self.proteomes_list: List[str] = list(proteomes) + self.proteome_count: int = len(proteomes) + + self.cluster_ids_by_cluster_type_by_cluster_status: Dict[ + str, Dict[str, List[str]] + ] = { + # sums up to cluster_count + "present": {"singleton": [], "specific": [], "shared": []}, + "absent": {"singleton": [], "specific": [], "shared": []}, + } + + self.protein_ids_by_cluster_type: Dict[str, List[str]] = { + # list of lists + "singleton": [], + "specific": [], + "shared": [], + } + + self.protein_span_by_cluster_type: Dict[str, List[Union[int, float]]] = { + "singleton": [], + "specific": [], + "shared": [], + } + + self.clusters_by_cluster_cardinality_by_cluster_type: Dict[ + str, Dict[str, List[str]] + ] = { + "shared": {"true": [], "fuzzy": []}, + "specific": {"true": [], "fuzzy": []}, + } + + self.cluster_status_by_cluster_id: Dict[str, Literal["absent", "present"]] = {} + self.cluster_type_by_cluster_id: Dict[ + str, Literal["singleton", "shared", 
"specific"] + ] = {} + + self.cluster_mwu_pvalue_by_cluster_id = {} + self.cluster_mwu_log2_mean_by_cluster_id = {} + self.cluster_mean_ALO_count_by_cluster_id = {} + self.cluster_mean_non_ALO_count_by_cluster_id = {} + + self.domain_counter_by_domain_source_by_cluster_type = None + self.protein_with_domain_count_by_domain_source_by_cluster_type = None + + self.protein_length_stats_by_cluster_id: Dict[ + str, Dict[str, Union[int, float]] + ] = {} + self.protein_count_by_cluster_id: Dict[str, int] = {} + + def add_cluster( + self, + cluster: Cluster, + attribute_cluster_type: Literal["singleton", "shared", "specific"], + ALO_cluster_status: Literal["absent", "present"], + ALO_protein_length_stats: Dict[str, Union[int, float]], + ALO_protein_ids_in_cluster: List[str], + ALO_cluster_cardinality: Optional[str], + mwu_pvalue: Optional[float], + mwu_log2_mean: Optional[float], + mean_ALO_count: Optional[float], + mean_non_ALO_count: Optional[float], + ) -> None: + """ + Adds a cluster to various data structures maintained by the class. + + Args: + cluster (Cluster): The cluster object to add. + attribute_cluster_type (Literal["singleton", "shared", "specific"]): + Type of the cluster as either 'singleton', 'shared', or 'specific'. + ALO_cluster_status (Literal["absent", "present"]): + Status of the cluster, either 'absent' or 'present'. + ALO_protein_length_stats (Dict[str, Union[int, float]]): + Length statistics of proteins in the cluster. + ALO_protein_ids_in_cluster (List[str]): + List of protein IDs present in the cluster. + ALO_cluster_cardinality (Optional[str]): + Cardinality of the cluster (if applicable). + mwu_pvalue (Optional[float]): + P-value from Mann-Whitney U test (if applicable). + mwu_log2_mean (Optional[float]): + Log2 transformed mean (if applicable). + mean_ALO_count (Optional[float]): + Mean count of ALO (if applicable). + mean_non_ALO_count (Optional[float]): + Mean count of non-ALO (if applicable). + + Returns: + None + """ + self.cluster_ids_by_cluster_type_by_cluster_status[ALO_cluster_status][ + attribute_cluster_type + ].append(cluster.cluster_id) + self.cluster_status_by_cluster_id[cluster.cluster_id] = ALO_cluster_status + self.cluster_type_by_cluster_id[cluster.cluster_id] = attribute_cluster_type + self.protein_length_stats_by_cluster_id[cluster.cluster_id] = ( + ALO_protein_length_stats + ) + + self.protein_count_by_cluster_id[cluster.cluster_id] = len( + ALO_protein_ids_in_cluster + ) + + if ALO_cluster_status == "present": + for ALO_protein_id in ALO_protein_ids_in_cluster: + self.protein_ids_by_cluster_type[attribute_cluster_type].append( + ALO_protein_id + ) + self.protein_span_by_cluster_type[attribute_cluster_type].append( + ALO_protein_length_stats["sum"] + ) + if attribute_cluster_type != "singleton" and ALO_cluster_cardinality: + self.clusters_by_cluster_cardinality_by_cluster_type[ + attribute_cluster_type + ][ALO_cluster_cardinality].append(cluster.cluster_id) + + self.cluster_mwu_pvalue_by_cluster_id[cluster.cluster_id] = mwu_pvalue + self.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id] = mwu_log2_mean + self.cluster_mean_ALO_count_by_cluster_id[cluster.cluster_id] = mean_ALO_count + self.cluster_mean_non_ALO_count_by_cluster_id[cluster.cluster_id] = ( + mean_non_ALO_count + ) + + def get_protein_count_by_cluster_type(self, cluster_type: str) -> int: + """ + Return the count of proteins for a specific cluster type. + + Args: + cluster_type (str): Type of the cluster. Use "total" for the total count across all types. 
+ + Returns: + int: Number of proteins in the specified cluster type. + + Raises: + KeyError: If 'cluster_type' is not found in self.protein_ids_by_cluster_type. + """ + if cluster_type == "total": + return sum( + len(protein_ids) + for _, protein_ids in list(self.protein_ids_by_cluster_type.items()) + ) + else: + return len(self.protein_ids_by_cluster_type[cluster_type]) + + def get_cluster_count_by_cluster_status_by_cluster_type( + self, + cluster_status: str, + cluster_type: str, + ) -> int: + """ + Get the count of clusters of a specific status and type. + + Args: + cluster_status (str): The status of clusters to count. + cluster_type (str): The type of cluster to count. Use "total" to get + the total count across all cluster types for the given status. + + Returns: + int: Number of clusters with the specified status and type. + + Raises: + KeyError: If 'cluster_status' or 'cluster_type' is not found in + self.cluster_ids_by_cluster_type_by_cluster_status. + """ + if cluster_type == "total": + return sum( + len(cluster_ids) + for _, cluster_ids in list( + self.cluster_ids_by_cluster_type_by_cluster_status[ + cluster_status + ].items() + ) + ) + else: + return len( + self.cluster_ids_by_cluster_type_by_cluster_status[cluster_status][ + cluster_type + ] + ) + + def get_protein_span_by_cluster_type(self, cluster_type: str) -> Union[int, float]: + """ + Get the total span of proteins for a specific cluster type. + + Args: + cluster_type (str): The type of cluster for which to retrieve protein span. + Use "total" to get the total span across all cluster types. + + Returns: + Union[int, float]: Total span of proteins in the specified cluster type. + If 'cluster_type' is "total", returns the sum of spans across all + cluster types. + """ + return ( + sum( + sum(protein_ids) + for _, protein_ids in list(self.protein_span_by_cluster_type.items()) + ) + if cluster_type == "total" + else sum(self.protein_span_by_cluster_type[cluster_type]) + ) + + def get_cluster_count_by_cluster_cardinality_by_cluster_type( + self, + cluster_type: str, + cluster_cardinality: str, + ) -> int: + """ + Return the count of clusters of a specific type and cardinality. + + Args: + cluster_type (str): Type of the cluster. + cluster_cardinality (str): Cardinality of the clusters. + + Returns: + int: Number of clusters with the specified type and cardinality. + + Raises: + KeyError: If 'cluster_type' or 'cluster_cardinality' is not found. + """ + return len( + self.clusters_by_cluster_cardinality_by_cluster_type[cluster_type][ + cluster_cardinality + ] + ) + + def get_proteomes(self) -> str: + """ + Get a sorted string representation of proteome IDs. + + Returns: + str: Comma-separated and sorted list of proteome IDs. 
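+ + Example return value (taxon names are illustrative): + "TAXON_A, TAXON_B, TAXON_C"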
+ """ + return ", ".join(sorted([str(proteome_id) for proteome_id in self.proteomes])) diff --git a/src/core/alo_collections.py b/src/core/alo_collections.py new file mode 100644 index 0000000..232bfc7 --- /dev/null +++ b/src/core/alo_collections.py @@ -0,0 +1,513 @@ +import logging +import os +import random +from typing import Any, Dict, List, Optional, Set + +import ete3 +import matplotlib as mat +import matplotlib.pyplot as plt +import numpy as np +from ete3 import Tree + +from core.alo import AttributeLevel +from core.config import ATTRIBUTE_RESERVED + +logger = logging.getLogger("kinfin_logger") + +mat.use("agg") + + +plt.style.use("ggplot") +mat.rc("ytick", labelsize=20) +mat.rc("xtick", labelsize=20) +axis_font = {"size": "20"} +mat.rcParams.update({"font.size": 22}) + + +class AloCollection: + def __init__( + self, + proteomes: Set[str], + attributes: List[str], + proteome_id_by_species_id: Dict[str, str], + level_by_attribute_by_proteome_id: Dict[str, Dict[str, str]], + node_idx_by_proteome_ids: Optional[Dict[Any, Any]], + tree_ete: Optional[Tree], + ) -> None: + self.proteomes = proteomes + self.attributes_verbose = attributes + self.attributes = [ + # list of attributes + attribute + for attribute in attributes + if attribute not in ATTRIBUTE_RESERVED + ] + self.proteome_id_by_species_id = proteome_id_by_species_id + self.level_by_attribute_by_proteome_id = level_by_attribute_by_proteome_id + self.node_idx_by_proteome_ids = node_idx_by_proteome_ids + self.tree_ete = tree_ete + self.proteome_ids_by_level_by_attribute = ( + self.compute_proteomes_by_level_by_attribute() + ) + self.fastas_parsed: bool = False + self.ALO_by_level_by_attribute = self.create_ALOs() + + def compute_proteomes_by_level_by_attribute( + self, + ) -> Dict[str, Dict[str, Set[str]]]: + """ + Compute proteomes grouped by levels for each attribute. + + Args: + attributes (List[str]): A list of strings representing attributes. + level_by_attribute_by_proteome_id (Dict[str, Dict[str, str]]): A dictionary where keys + are proteome IDs (strings), and values are dictionaries with keys representing + attributes (strings) and values representing levels (strings). + + Returns: + Dict[str, Dict[str, Set[str]]]: A dictionary where keys are attributes (strings), + and values are dictionaries. The inner dictionaries have keys representing + levels (strings) and values representing sets of proteome IDs (strings). + """ + proteomes_by_level_by_attribute: Dict[str, Dict[str, Set[str]]] = { + attribute: {} for attribute in self.attributes + } + for proteome_id in self.level_by_attribute_by_proteome_id: + for attribute in self.attributes: + level = self.level_by_attribute_by_proteome_id[proteome_id][attribute] + if level not in proteomes_by_level_by_attribute[attribute]: + proteomes_by_level_by_attribute[attribute][level] = set() + proteomes_by_level_by_attribute[attribute][level].add(proteome_id) + return proteomes_by_level_by_attribute + + def create_ALOs(self) -> Dict[str, Dict[str, Optional[AttributeLevel]]]: + """ + Creates Attribute Level Objects (ALOs) for each attribute and level based on + proteome IDs. + + Returns: + Dict[str, Dict[str, Optional[AttributeLevel]]]: + A dictionary where each key is an attribute name (str), + and the corresponding value is a dictionary mapping level (str) + to an AttributeLevel instance or None. 
+ """ + ALO_by_level_by_attribute: Dict[str, Dict[str, Optional[AttributeLevel]]] = { + attribute: {} for attribute in self.attributes + } + for attribute in self.proteome_ids_by_level_by_attribute: + for level in self.proteome_ids_by_level_by_attribute[attribute]: + proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][level] + ALO = AttributeLevel( + # + attribute=attribute, + level=level, + proteomes=proteome_ids, + ) + if level not in ALO_by_level_by_attribute[attribute]: + ALO_by_level_by_attribute[attribute][level] = None + ALO_by_level_by_attribute[attribute][level] = ALO + return ALO_by_level_by_attribute + + def generate_header_for_node(self, node: ete3.TreeNode, dirs: Dict[str, str]): + """ + Generates a header image for a given node of a tree with specified statistics. + + Args: + node (ete3.TreeNode): The TreeNode object representing the node for which the header is generated. + dirs (Dict[str, str]): A dictionary containing directory paths, including 'tree_headers' where the header image will be saved. + + Returns: + str: File path to the generated header image. + + Notes: + - The method generates a header image in PNG format displaying various statistics (apomorphies and synapomorphies) for the given tree node. + - The statistics include counts of singletons, non-singletons, complete presence synapomorphies, and partial absence synapomorphies. + - The generated image is saved in the specified directory under 'tree_headers' with the node's name as the filename. + + Raises: + Any exceptions that might occur during file saving or table rendering. + """ + + node_header_f = os.path.join(dirs["tree_headers"], f"{node.name}.header.png") + data = [ + ( + "Apomorphies (size=1)", + "{:,}".format( + node.apomorphic_cluster_counts["singletons"] # type:ignore + ), + ), + ( + "Apomorphies (size>1)", + "{:,}".format( + node.apomorphic_cluster_counts["non_singletons"] # type:ignore + ), + ), + ( + "Synapomorphies (all)", + "{:,}".format( + node.synapomorphic_cluster_counts[ # type:ignore + "complete_presence" + ] + + node.synapomorphic_cluster_counts[ # type:ignore + "partial_absence" + ] + ), + ), + ( + "Synapomorphies (cov=100%)", + "{:,}".format( + node.synapomorphic_cluster_counts[ # type:ignore + "complete_presence" + ] + ), + ), + ( + "Synapomorphies (cov<100%)", + "{:,}".format( + node.synapomorphic_cluster_counts[ # type: ignore + "partial_absence" + ] # type:ignore + ), + ), + ] + col_labels = ("Type", "Count") + fig, ax = plt.subplots(figsize=(2, 0.5)) + ax.set_facecolor("white") + table = ax.table( + cellText=data, + colLabels=col_labels, + loc="bottom", + fontsize=24, + colLoc="center", + rowLoc="right", + edges="", + ) + table.set_fontsize(24) + table.scale(2, 1) + for key, cell in list(table.get_celld().items()): + row, col = key + cell._text.set_color("grey") # type:ignore + cell.set_edgecolor("darkgrey") + cell.visible_edges = "T" if row > 0 else "B" + if row == len(data) - 2: + cell.set_edgecolor("darkgrey") + cell.visible_edges = "T" + ax.axis("tight") + ax.axis("off") + logger.info(f"[STATUS]\t- Plotting {node_header_f}") + fig.savefig(node_header_f, pad=0, bbox_inches="tight", format="png") + plt.close() + return node_header_f + + def generate_chart_for_node( + self, + node, + dirs: Dict[str, str], + plot_format: str, + fontsize: int, + ) -> Optional[str]: + """ + Generate and save a histogram chart for a given node's synapomorphies. + + Args: + - node: The node object containing synapomorphic cluster strings. 
+ - dirs: A dictionary containing directory paths, specifically 'tree_charts' for saving charts. + - plot_format: The format in which to save the chart ('png' or 'pdf'). + - fontsize: Font size for axis labels and ticks. + + Returns: + - Optional[str]: Path to the saved chart file if successful, None otherwise. + """ + + if proteome_coverages := [ + float(synapomorphic_cluster_string[3]) + for synapomorphic_cluster_string in node.synapomorphic_cluster_strings + ]: + chart_f = os.path.join(dirs["tree_charts"], f"{node.name}.barchart.png") + f, ax = plt.subplots(figsize=(3.0, 3.0)) + ax.set_facecolor("white") + x_values = np.array(proteome_coverages) + ax.hist( + x_values, + histtype="stepfilled", + align="mid", + bins=np.arange(0.0, 1.0 + 0.1, 0.1), + ) + ax.set_xlim(-0.1, 1.1) + for tick in ax.xaxis.get_majorticklabels(): + tick.set_fontsize(fontsize - 2) + tick.set_rotation("vertical") + for tick in ax.yaxis.get_majorticklabels(): + tick.set_fontsize(fontsize - 2) + ax.set_frame_on(False) + ax.xaxis.grid(True, linewidth=1, which="major", color="lightgrey") + ax.yaxis.grid(True, linewidth=1, which="major", color="lightgrey") + f.suptitle("Synapomorphies", y=1.1) + ax.set_ylabel("Count", fontsize=fontsize) + ax.set_xlabel("Proteome coverage", fontsize=fontsize) + logger.info(f"[STATUS]\t- Plotting {chart_f}") + f.savefig(chart_f, bbox_inches="tight", format="png") + if plot_format == "pdf": + pdf_chart_f = os.path.join( + dirs["tree_charts"], + f"{node.name}.barchart.pdf", + ) + logger.info(f"[STATUS]\t- Plotting {pdf_chart_f}") + f.savefig(pdf_chart_f, bbox_inches="tight", format="pdf") + plt.close() + return chart_f + + def plot_text_tree(self, dirs: Dict[str, str]) -> None: + """ + Plot and save the textual representation of the tree. + + This method uses the `tree_ete` attribute of the class to generate and save + both a Newick format (.nwk) and a text format (.txt) representation of the tree. + + Args: + - dirs: A dictionary containing directory paths, specifically 'tree' for saving tree files. + + Returns: + - None + """ + if self.tree_ete: + tree_nwk_f = os.path.join(dirs["tree"], "tree.nwk") + self.tree_ete.write(format=1, outfile=tree_nwk_f) + tree_txt_f = os.path.join(dirs["tree"], "tree.txt") + with open(tree_txt_f, "w") as tree_txt_fh: + tree_txt_fh.write( + f"{self.tree_ete.get_ascii(show_internal=True, compact=False)}\n" + ) + + def plot_tree( + self, + header_f_by_node_name, + charts_f_by_node_name, + dirs: Dict[str, str], + ) -> None: + """ + Plot and save a tree visualization with custom header and chart images for nodes. + + This method uses the `self.tree_ete` attribute of the class to visualize the tree + in a hierarchical manner, with customized header and chart images for each node. + + Args: + - header_f_by_node_name: Dictionary mapping node names to header image file paths (must be PNG). + - charts_f_by_node_name: Dictionary mapping node names to chart image file paths (must be PNG). + - dirs: A dictionary containing directory paths, specifically 'tree' for saving the tree PDF. + + Returns: + - None + """ + tree_f = os.path.join( + dirs["tree"], "tree.pdf" + ) # must be PDF! (otherwise it breaks) + style = ete3.NodeStyle() + style["vt_line_width"] = 5 + style["hz_line_width"] = 5 + style["fgcolor"] = "darkgrey" + for node in self.tree_ete.traverse("levelorder"): # type: ignore + node.set_style(style) + if header_f_by_node_name[node.name]: + # must be PNG! 
(ETE can't do PDF Faces) + node_header_face = ete3.faces.ImgFace(header_f_by_node_name[node.name]) + node.add_face(node_header_face, column=0, position="branch-top") + if charts_f_by_node_name[node.name]: + # must be PNG! (ETE can't do PDF Faces) + node_chart_face = ete3.faces.ImgFace(charts_f_by_node_name[node.name]) + node.add_face(node_chart_face, column=0, position="branch-bottom") + node_name_face = ete3.TextFace(node.name, fsize=64) + node.img_style["size"] = 10 + node.img_style["shape"] = "sphere" + node.img_style["fgcolor"] = "black" + if not node.is_leaf(): + node.add_face(node_name_face, column=0, position="branch-right") + node.add_face(node_name_face, column=0, position="aligned") + ts = ete3.TreeStyle() + ts.draw_guiding_lines = True + ts.show_scale = False + ts.show_leaf_name = False + ts.allow_face_overlap = True + ts.guiding_lines_color = "lightgrey" + logger.info(f"[STATUS] - Writing tree {tree_f}... ") + self.tree_ete.render( # type: ignore + tree_f, dpi=600, h=1189, units="mm", tree_style=ts + ) + + def write_tree( + self, + dirs: Dict[str, str], + render_tree: bool, + plot_format: str, + fontsize: int, + ) -> None: + """ + Write tree data to files and optionally render a graphical tree representation. + + This method generates and saves various metrics and data related to the tree structure, + including node statistics and cluster metrics. It can also render a graphical tree + representation if specified. + + Args: + - dirs: A dictionary containing directory paths, including 'tree' for saving tree-related files. + - render_tree: Boolean flag indicating whether to render a graphical tree representation. + - plot_format: Format for saving plots ('png', 'pdf', etc.). + - fontsize: Font size used for plotting. + + Returns: + - None + """ + if not self.tree_ete: + return + logger.info("[STATUS] - Writing data for tree ... 
") + # Node stats + node_stats_f = os.path.join(dirs["tree"], "tree.node_metrics.txt") + node_stats_header: List[str] = [ + "#nodeID", + "taxon_specific_apomorphies_singletons", + "taxon_specific_apomorphies_non_singletons", + "node_specific_synapomorphies_total", + "node_specific_synapomorphies_complete_presence", + "node_specific_synapomorphies_partial_absence", + "proteome_count", + ] + node_stats: List[str] = ["\t".join(node_stats_header)] + # Cluster node stats + node_clusters_f = os.path.join(dirs["tree"], "tree.cluster_metrics.txt") + node_clusters_header = [ + "#clusterID", + "nodeID", + "synapomorphy_type", + "node_taxon_coverage", + "children_coverage", + "node_taxa_present", + ] + node_clusters = ["\t".join(node_clusters_header)] + # header_f_by_node_name + header_f_by_node_name = {} + charts_f_by_node_name = {} + for node in self.tree_ete.traverse("levelorder"): # type: ignore + for synapomorphic_cluster_string in node.synapomorphic_cluster_strings: # type: ignore + node_clusters.append( + "\t".join( + [str(string) for string in list(synapomorphic_cluster_string)] + ) + ) + node_stats_line = [ + node.name, + node.apomorphic_cluster_counts["singletons"], # type: ignore + node.apomorphic_cluster_counts["non_singletons"], # type: ignore + ( + # type: ignore + node.synapomorphic_cluster_counts["complete_presence"] # type: ignore + # type: ignore + + node.synapomorphic_cluster_counts["partial_absence"] # type: ignore + ), + # type: ignore + node.synapomorphic_cluster_counts["complete_presence"], # type: ignore + node.synapomorphic_cluster_counts["partial_absence"], # type: ignore + len(node.proteome_ids), # type: ignore + ] + node_stats.append("\t".join([str(string) for string in node_stats_line])) + if render_tree: + header_f_by_node_name[node.name] = self.generate_header_for_node( + node, dirs + ) + charts_f_by_node_name[node.name] = self.generate_chart_for_node( + node, dirs, plot_format, fontsize + ) + logger.info(f"[STATUS] - Writing {node_stats_f} ... ") + with open(node_stats_f, "w") as node_stats_fh: + node_stats_fh.write("\n".join(node_stats) + "\n") + logger.info(f"[STATUS] - Writing {node_clusters_f} ... 
") + with open(node_clusters_f, "w") as node_clusters_fh: + node_clusters_fh.write("\n".join(node_clusters) + "\n") + if render_tree: + self.plot_tree(header_f_by_node_name, charts_f_by_node_name, dirs) + else: + self.plot_text_tree(dirs) + + def compute_repetition_for_rarefaction_curve( + self, + ALO: AttributeLevel, + attribute: str, + level: str, + rarefaction_by_samplesize_by_level_by_attribute: Dict[ + str, Dict[str, Dict[int, List[int]]] + ], + ): + seen_cluster_ids = set() + random_list_of_proteome_ids = list(ALO.proteomes) + random.shuffle(random_list_of_proteome_ids) + for idx, proteome_id in enumerate(random_list_of_proteome_ids): + if proteome_ALO := self.ALO_by_level_by_attribute["taxon"][proteome_id]: + seen_cluster_ids.update( + proteome_ALO.cluster_ids_by_cluster_type_by_cluster_status[ + "present" + ]["specific"] + ) + seen_cluster_ids.update( + proteome_ALO.cluster_ids_by_cluster_type_by_cluster_status[ + "present" + ]["shared"] + ) + sample_size = idx + 1 + if ( + sample_size + not in rarefaction_by_samplesize_by_level_by_attribute[attribute][ + level + ] + ): + rarefaction_by_samplesize_by_level_by_attribute[attribute][level][ + sample_size + ] = [] + rarefaction_by_samplesize_by_level_by_attribute[attribute][level][ + sample_size + ].append(len(seen_cluster_ids)) + + def compute_rarefaction_data( + self, repetitions: int + ) -> Dict[str, Dict[str, Dict[int, List[int]]]]: + """ + Compute rarefaction data and generate rarefaction curves for proteome clusters. + + This method computes rarefaction curves to analyze the accumulation of non-singleton + clusters as proteome samples increase. It generates plots for each attribute based on + the specified parameters. + + Args: + - repetitions: Number of repetitions to shuffle proteome lists for random sampling. 
+ + Returns: + - Dict[str, Dict[str, Dict[int, List[int]]]] + """ + rarefaction_by_samplesize_by_level_by_attribute: Dict[ + str, Dict[str, Dict[int, List[int]]] + ] = {} + logger.info("[STATUS] - Generating rarefaction data ...") + for attribute in self.attributes: + for level in self.proteome_ids_by_level_by_attribute[attribute]: + proteome_ids = self.proteome_ids_by_level_by_attribute[attribute][level] + if len(proteome_ids) == 1: + continue + + if attribute not in rarefaction_by_samplesize_by_level_by_attribute: + rarefaction_by_samplesize_by_level_by_attribute[attribute] = {} + if ( + level + not in rarefaction_by_samplesize_by_level_by_attribute[attribute] + ): + rarefaction_by_samplesize_by_level_by_attribute[attribute][ + level + ] = {} + ALO = self.ALO_by_level_by_attribute[attribute][level] + if ALO is None: + continue + for _ in range(repetitions): + self.compute_repetition_for_rarefaction_curve( + ALO=ALO, + attribute=attribute, + level=level, + rarefaction_by_samplesize_by_level_by_attribute=rarefaction_by_samplesize_by_level_by_attribute, + ) + return rarefaction_by_samplesize_by_level_by_attribute diff --git a/src/core/build.py b/src/core/build.py new file mode 100644 index 0000000..772fd0d --- /dev/null +++ b/src/core/build.py @@ -0,0 +1,393 @@ +import json +import logging +import os +from collections import Counter, OrderedDict, defaultdict +from typing import Dict, List, Optional, Set + +from core.alo_collections import AloCollection +from core.clusters import Cluster, ClusterCollection +from core.logic import ( + add_taxid_attributes, + parse_attributes_from_config_data, + parse_fasta_dir, + parse_go_mapping, + parse_ipr_mapping, + parse_pfam_mapping, + parse_tree_from_file, +) +from core.proteins import Protein, ProteinCollection +from core.utils import progress, yield_file_lines + +logger = logging.getLogger("kinfin_logger") + + +def get_singletons( + proteinCollection: ProteinCollection, + cluster_list: List[Cluster], +) -> int: + """ + Identify and create singleton clusters for unclustered proteins in a protein collection. + + Args: + - proteinCollection (ProteinCollection): An instance of ProteinCollection class. + - cluster_list (List[Cluster]): A list to which new singleton Cluster objects will be appended. + + Returns: + - int: Number of singleton clusters created and appended to cluster_list. + + This function iterates through proteins in the given protein collection that are not yet clustered. + For each unclustered protein, it creates a new singleton cluster and appends it to cluster_list. + """ + logger.info("[STATUS] - Inferring singletons ...") + singleton_idx = 0 + for protein in proteinCollection.proteins_list: + if protein.clustered is False: + cluster_id = f"singleton_{singleton_idx}" + cluster = Cluster( + cluster_id, + [protein.protein_id], + proteinCollection, + ) + cluster_list.append(cluster) + singleton_idx += 1 + return singleton_idx + + +def parse_cluster_file( + output_dir: str, + cluster_f: str, + proteinCollection: ProteinCollection, + available_proteomes: Set[str], +) -> List[Cluster]: + """ + Parses a cluster file to create Cluster objects and updates protein information. + Saves the filtered clustering data and stats to files. + + Args: + output_dir (str): Base directory path for saving files. + cluster_f (str): Path to the cluster file. + proteinCollection (ProteinCollection): Collection of Protein objects. + available_proteomes (Set[str]): Set of all available proteomes. 
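# Illustrative sketch of the Orthogroups.txt line format handled by parse_cluster_file
# below: "<cluster_id>: <protein_id> <protein_id> ...", where the proteome ID is taken
# as the part of the protein ID before the first ".". The line and the set of
# available proteomes here are hypothetical.
line = "OG0000001: 0.protA 0.protB 1.protC 2.protD"
available_proteomes = {"0", "1"}

parts = line.rstrip("\n").split(" ")
cluster_id, protein_ids = parts[0].replace(":", ""), [p for p in parts[1:] if p]
filtered_protein_ids = [p for p in protein_ids if p.split(".")[0] in available_proteomes]

print(cluster_id, filtered_protein_ids)  # OG0000001 ['0.protA', '0.protB', '1.protC']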
+ + Returns: + Tuple[List[Cluster], Dict[str, any]]: List of Cluster objects and stats. + + Raises: + FileNotFoundError: If the cluster file `cluster_f` does not exist. + """ + cluster_list: List[Cluster] = [] + stats = { + "total_clusters": 0, + "total_proteins": 0, + "total_proteomes": len(available_proteomes), + "filtered_clusters": 0, + "filtered_proteins": 0, + "included_proteins": [], + "excluded_proteins": [], + "included_proteomes": defaultdict(int), + "excluded_proteomes": defaultdict(int), + } + + output_filtered_file = os.path.join(output_dir, "orthogroups.filtered.txt") + stats_file = os.path.join(output_dir, "summary.json") + + with open(cluster_f) as fh, open(output_filtered_file, "w") as ofh: + for line in fh: + stats["total_clusters"] += 1 + temp: List[str] = line.rstrip("\n").split(" ") + cluster_id, protein_ids = temp[0].replace(":", ""), temp[1:] + protein_ids = [protein_id for protein_id in protein_ids if protein_id] + + filtered_protein_ids = [] + for protein_id in protein_ids: + proteome_id = protein_id.split(".")[0] # Extract proteome ID + if proteome_id in available_proteomes: + filtered_protein_ids.append(protein_id) + stats["included_proteins"].append(protein_id) + stats["included_proteomes"][proteome_id] += 1 + else: + stats["excluded_proteins"].append(protein_id) + stats["excluded_proteomes"][proteome_id] += 1 + + stats["total_proteins"] += len(protein_ids) + stats["filtered_proteins"] += len(filtered_protein_ids) + + if filtered_protein_ids: + # Only create a cluster if there are proteins left after filtering + cluster = Cluster(cluster_id, filtered_protein_ids, proteinCollection) + for protein_id in filtered_protein_ids: + protein = proteinCollection.proteins_by_protein_id[protein_id] + protein.clustered = True + cluster_list.append(cluster) + filtered_protein_ids.sort() + ofh.write(f"{cluster_id}: {', '.join(filtered_protein_ids)}\n") + stats["filtered_clusters"] += 1 + + stats["included_proteins_count"] = len(set(stats["included_proteins"])) + stats["excluded_proteins_count"] = len(set(stats["excluded_proteins"])) + + # Convert proteome counts to lists of counts for JSON serialization + stats["included_proteomes"] = dict(stats["included_proteomes"]) + stats["excluded_proteomes"] = dict(stats["excluded_proteomes"]) + + # Reorder stats + ordered_stats = OrderedDict( + [ + ("total_clusters", stats["total_clusters"]), + ("total_proteins", stats["total_proteins"]), + ("total_proteomes", stats["total_proteomes"]), + ("filtered_clusters", stats["filtered_clusters"]), + ("filtered_proteins", stats["filtered_proteins"]), + ("included_proteins_count", stats["included_proteins_count"]), + ("excluded_proteins_count", stats["excluded_proteins_count"]), + ("included_proteomes", stats["included_proteomes"]), + ("excluded_proteomes", stats["excluded_proteomes"]), + ("included_proteins", stats["included_proteins"]), + ("excluded_proteins", stats["excluded_proteins"]), + ] + ) + + with open(stats_file, "w") as mf: + json.dump( + ordered_stats, + mf, + separators=(", ", ": "), + indent=4, + ) + + return cluster_list + + +# cli +def parse_domains_from_functional_annotations_file( + functional_annotation_f: str, + proteinCollection: ProteinCollection, +) -> None: + """ + Parse functional annotations from a file and populate ProteinCollection with parsed data. + + Parameters: + - functional_annotation_f (str): Path to the functional annotation file. + - proteinCollection (ProteinCollection): Instance of ProteinCollection class to store parsed data. 
+ - pfam_mapping (bool): Flag indicating whether to parse Pfam mappings. + - ipr_mapping (bool): Flag indicating whether to parse InterPro mappings. + - pfam_mapping_f (str): File path to the Pfam mapping file. + - ipr_mapping_f (str): File path to the InterPro mapping file. + - go_mapping_f (str): File path to the GO mapping file. + + Raises: + - ValueError: If the functional annotation file lacks a header. + + Notes: + - The function reads each line of the functional annotation file, parses relevant data, + and populates the proteinCollection with domain annotations and GO terms. + - It also optionally parses additional mappings (Pfam, InterPro, GO) based on provided flags. + - Updates proteinCollection.functional_annotation_parsed and proteinCollection.domain_desc_by_id_by_source. + """ + + logger.info( + f"[STATUS] - Parsing {functional_annotation_f} ... this may take a while" + ) + + for line in yield_file_lines(functional_annotation_f): + temp: List[str] = line.split() + if temp[0].startswith("#"): + proteinCollection.domain_sources = temp[1:] + + else: + if not proteinCollection.domain_sources: + error_msg = f"[ERROR] - {functional_annotation_f} does not seem to have a header." + raise ValueError(error_msg) + + domain_protein_id: str = temp.pop(0) + go_terms: List[str] = [] + domain_counter_by_domain_source: Dict[str, Counter[str]] = {} + for idx, field in enumerate(temp): + if field != "None": + domain_source: str = proteinCollection.domain_sources[idx] + domain_string: List[str] = field.split(";") + domain_counts_by_domain_id: Dict[str, int] = {} + for domain_id_count in domain_string: + domain_id: str + domain_count: int = 1 + if domain_source == "GO": + domain_id = domain_id_count + else: + domain_id, domain_count_str = domain_id_count.rsplit(":", 2) + domain_count = int(domain_count_str) + domain_counts_by_domain_id[domain_id] = domain_count + domain_counter: Counter[str] = Counter(domain_counts_by_domain_id) + domain_counter_by_domain_source[domain_source] = domain_counter + proteinCollection.add_annotation_to_protein( + domain_protein_id=domain_protein_id, + domain_counter_by_domain_source=domain_counter_by_domain_source, + go_terms=go_terms, + ) + + proteinCollection.functional_annotation_parsed = True + + +# common +def build_AloCollection( + config_f: str, + nodesdb_f: str, + taxranks: List[str], + tree_f: Optional[str], + taxon_idx_mapping_file: Optional[str], +) -> AloCollection: + """ + Builds an AloCollection object from command-line interface (CLI) inputs. + + Args: + config_f (str): Path to the configuration file containing proteome attributes. + nodesdb_f (str): Path to the nodes database file for inferring taxonomic ranks. + taxranks (List[str]): List of taxonomic ranks to be inferred. + tree_f (Optional[str]): Path to the tree file. If provided, ALOs are added from the tree. + + Returns: + AloCollection: An instance of the AloCollection class containing parsed data. 
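# Illustrative sketch of how one functional-annotation row becomes per-source domain
# Counters, simplified from parse_domains_from_functional_annotations_file above.
# The header sources and field values are hypothetical.
from collections import Counter

domain_sources = ["Pfam", "IPR", "GO"]                   # from the "#"-header line
fields = ["PF00069:2;PF07714:1", "IPR000719:2", "None"]  # one protein's columns

domain_counter_by_domain_source = {}
for idx, field in enumerate(fields):
    if field == "None":
        continue
    source = domain_sources[idx]
    counts = {}
    for entry in field.split(";"):
        if source == "GO":
            counts[entry] = 1                            # GO terms carry no count
        else:
            domain_id, count_str = entry.rsplit(":", 1)
            counts[domain_id] = int(count_str)
    domain_counter_by_domain_source[source] = Counter(counts)

# {'Pfam': Counter({'PF00069': 2, 'PF07714': 1}), 'IPR': Counter({'IPR000719': 2})}
print(domain_counter_by_domain_source)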
+ """ + ( + proteomes, + proteome_id_by_species_id, + attributes, + level_by_attribute_by_proteome_id, + ) = parse_attributes_from_config_data(config_f, taxon_idx_mapping_file) + # Add taxonomy if needed + if "TAXID" in set(attributes): + logger.info( + "[STATUS] - Attribute 'TAXID' found, inferring taxonomic ranks from nodesDB" + ) + attributes, level_by_attribute_by_proteome_id = add_taxid_attributes( + attributes=attributes, + level_by_attribute_by_proteome_id=level_by_attribute_by_proteome_id, + nodesdb_f=nodesdb_f, + taxranks=taxranks, + ) + + # Add ALOs from tree if provided + tree_ete, node_idx_by_proteome_ids = parse_tree_from_file( + tree_f, + attributes, + level_by_attribute_by_proteome_id, + proteomes, + ) + + logger.info("[STATUS] - Building AloCollection ...") + return AloCollection( + proteomes=proteomes, + attributes=attributes, + proteome_id_by_species_id=proteome_id_by_species_id, + level_by_attribute_by_proteome_id=level_by_attribute_by_proteome_id, + node_idx_by_proteome_ids=node_idx_by_proteome_ids, + tree_ete=tree_ete, + ) + + +def get_protein_list_from_seq_f(sequence_ids_f: str, aloCollection: AloCollection): + logger.info(f"[STATUS] - Parsing sequence IDs: {sequence_ids_f} ...") + + proteins_list: List[Protein] = [] + for line in yield_file_lines(sequence_ids_f): + temp = line.split(": ") + sequence_id = temp[0] + protein_id = ( + temp[1] + .split(" ")[0] + .replace(":", "_") + .replace(",", "_") + .replace("(", "_") + .replace(")", "_") + ) # orthofinder replaces characters + species_id = sequence_id.split("_")[0] + if proteome_id := aloCollection.proteome_id_by_species_id.get(species_id, None): + protein = Protein(protein_id, proteome_id, species_id, sequence_id) + proteins_list.append(protein) + return proteins_list + + +# common +def build_ProteinCollection( + sequence_ids_f: str, + aloCollection: AloCollection, + fasta_dir: Optional[str], + species_ids_f: Optional[str], + functional_annotation_f: Optional[str], + pfam_mapping: bool, + ipr_mapping: bool, + pfam_mapping_f: str, + go_mapping_f: str, + ipr_mapping_f: str, +) -> ProteinCollection: + proteins_list = get_protein_list_from_seq_f( + sequence_ids_f=sequence_ids_f, + aloCollection=aloCollection, + ) + proteinCollection = ProteinCollection(proteins_list) + + logger.info(f"[STATUS]\t - Proteins found = {proteinCollection.protein_count}") + + if fasta_dir is not None and species_ids_f is not None: + fasta_len_by_protein_id = parse_fasta_dir( + fasta_dir=fasta_dir, + species_ids_f=species_ids_f, + ) + logger.info("[STATUS] - Adding FASTAs to ProteinCollection ...") + parse_steps: float = proteinCollection.protein_count / 100 + for idx, protein in enumerate(proteinCollection.proteins_list): + protein.update_length(fasta_len_by_protein_id[protein.protein_id]) + progress(idx + 1, parse_steps, proteinCollection.protein_count) + aloCollection.fastas_parsed = True + proteinCollection.fastas_parsed = True + else: + logger.info( + "[STATUS] - No Fasta-Dir given, no AA-span information will be reported ..." 
+ ) + + if functional_annotation_f is not None: + parse_domains_from_functional_annotations_file( + functional_annotation_f=functional_annotation_f, + proteinCollection=proteinCollection, + ) + domain_desc_by_id_by_source = {} + + if pfam_mapping and "Pfam" in proteinCollection.domain_sources: + domain_desc_by_id_by_source["Pfam"] = parse_pfam_mapping(pfam_mapping_f) + + if ipr_mapping and "IPR" in proteinCollection.domain_sources: + domain_desc_by_id_by_source["IPR"] = parse_ipr_mapping(ipr_mapping_f) + + if go_mapping_f: + domain_desc_by_id_by_source["GO"] = parse_go_mapping(go_mapping_f) + + proteinCollection.domain_desc_by_id_by_source = domain_desc_by_id_by_source + + return proteinCollection + + +def build_ClusterCollection( + output_dir: str, + cluster_f: str, + proteinCollection: ProteinCollection, + infer_singletons: Optional[bool], + available_proteomes: Set[str], +) -> ClusterCollection: + logger.info(f"[STATUS] - Parsing {cluster_f} ... this may take a while") + cluster_list: List[Cluster] = parse_cluster_file( + output_dir, + cluster_f, + proteinCollection, + available_proteomes, + ) + + inferred_singletons_count = 0 + if infer_singletons: + inferred_singletons_count = get_singletons(proteinCollection, cluster_list) + + return ClusterCollection( + cluster_list, + inferred_singletons_count, + proteinCollection.functional_annotation_parsed, + proteinCollection.fastas_parsed, + proteinCollection.domain_sources, + ) diff --git a/src/core/clusters.py b/src/core/clusters.py new file mode 100644 index 0000000..79cff89 --- /dev/null +++ b/src/core/clusters.py @@ -0,0 +1,196 @@ +from collections import Counter +from math import log +from typing import DefaultDict, Dict, FrozenSet, List, Literal, Optional, Set + +from core.logic import compute_protein_ids_by_proteome +from core.proteins import ProteinCollection +from core.utils import mean, median, sd + + +class Cluster: + def __init__( + self, + cluster_id: str, + protein_ids: List[str], + proteinCollection: ProteinCollection, + ) -> None: + self.cluster_id: str = cluster_id + self.protein_ids = set(protein_ids) + self.protein_count: int = len(protein_ids) + try: + + self.proteomes_by_protein_id: Dict[str, str] = { + _id: proteinCollection.proteins_by_protein_id[_id].proteome_id + for _id in protein_ids + } + except KeyError as e: + error_msg = f"[ERROR] - Protein {e.args[0]} in clustering belongs to proteomes that are not present in the config-file." + error_msg += ( + "Please add those proteomes or recluster by omitting these proteomes." 
+ ) + raise KeyError(error_msg) from e + + self.proteome_ids_list: List[str] = list(self.proteomes_by_protein_id.values()) + self.protein_count_by_proteome_id: Counter[str] = Counter( + self.proteome_ids_list + ) + self.proteome_ids: FrozenSet[str] = frozenset(self.proteome_ids_list) + self.proteome_count: int = len(self.proteome_ids) + self.singleton: bool = self.protein_count <= 1 + self.apomorphy: bool = self.proteome_count <= 1 + + self.protein_ids_by_proteome_id: DefaultDict[str, Set[str]] = ( + compute_protein_ids_by_proteome(self.proteomes_by_protein_id) + ) + self.protein_counts_of_proteomes_by_level_by_attribute: Dict[ + str, Dict[str, List[int]] + ] = {} + self.proteome_coverage_by_level_by_attribute: Dict[str, Dict[str, float]] = {} + self.implicit_protein_ids_by_proteome_id_by_level_by_attribute: Dict[ + str, Dict[str, Dict[str, List[str]]] + ] = {} + self.cluster_type_by_attribute: Dict[ + str, + Literal["singleton", "shared", "specific"], + ] = {} + self.protein_median: Optional[float] = None + self.protein_length_stats: Optional[Dict[str, float]] = ( + self.compute_protein_length_stats(proteinCollection, self.protein_ids) + ) + self.secreted_cluster_coverage: float = self.compute_secreted_cluster_coverage( + proteinCollection, self.protein_ids, self.protein_count + ) + self.domain_counter_by_domain_source: Dict[str, Counter[str]] = ( + self.compute_domain_counter_by_domain_source( + proteinCollection, self.protein_ids + ) + ) + self.domain_entropy_by_domain_source: Dict[str, float] = ( + self.compute_domain_entropy_by_domain_source() + ) + + def compute_protein_length_stats( + self, + proteinCollection: ProteinCollection, + protein_ids: Set[str], + ) -> Optional[Dict[str, float]]: + """ + Computes statistics (mean, median, standard deviation) of protein lengths. + + Parameters: + - proteinCollection: A ProteinCollection object containing protein data. + - protein_ids: A set of protein IDs for which lengths are to be computed. + + Returns: + - Optional[Dict[str, float]]: A dictionary containing 'mean', 'median', and 'sd' + (standard deviation) of protein lengths, if all lengths are available and at least + one protein ID is provided. Returns None if no valid protein lengths are found. + """ + protein_lengths: List[Optional[int]] = [ + proteinCollection.proteins_by_protein_id[protein_id].length + for protein_id in protein_ids + ] + if all(protein_lengths): + protein_length_stats: Dict[str, float] = {"mean": mean(protein_lengths)} + protein_length_stats["median"] = median(protein_lengths) + protein_length_stats["sd"] = sd(protein_lengths) + return protein_length_stats + + def compute_secreted_cluster_coverage( + self, + proteinCollection: ProteinCollection, + protein_ids: Set[str], + protein_count: int, + ) -> float: + """ + Computes the fraction of secreted proteins in a given set of protein IDs. + + Parameters: + - proteinCollection: A ProteinCollection object containing protein data. + - protein_ids: A set of protein IDs to compute secreted protein coverage. + - protein_count: Total count of proteins in the cluster. + + Returns: + - float: Fraction of secreted proteins in the provided set of protein IDs. 
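# Illustrative sketch of the per-cluster bookkeeping done in Cluster.__init__ above.
# The protein-to-proteome assignments are hypothetical.
from collections import Counter

proteomes_by_protein_id = {"0.protA": "0", "0.protB": "0", "1.protC": "1"}

proteome_ids_list = list(proteomes_by_protein_id.values())
protein_count_by_proteome_id = Counter(proteome_ids_list)  # Counter({'0': 2, '1': 1})
proteome_ids = frozenset(proteome_ids_list)                # frozenset({'0', '1'})
singleton = len(proteomes_by_protein_id) <= 1              # False: more than one protein
apomorphy = len(proteome_ids) <= 1                         # False: spans two proteomes

print(protein_count_by_proteome_id, singleton, apomorphy)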
+ """ + secreted = sum( + bool(proteinCollection.proteins_by_protein_id[protein_id].secreted) + for protein_id in protein_ids + ) + return secreted / protein_count + + def compute_domain_counter_by_domain_source( + self, + proteinCollection: ProteinCollection, + protein_ids: Set[str], + ) -> Dict[str, Counter[str]]: + """ + Computes the aggregated domain counts by domain source for a set of protein IDs. + + Parameters: + - proteinCollection: A ProteinCollection object containing protein data. + - protein_ids: A set of protein IDs for which domain counts are computed. + + Returns: + - Dict[str, Counter[str]]: A dictionary where keys are domain sources and values are + Counters mapping domain IDs to their respective counts. + """ + cluster_domain_counter_by_domain_source: Dict[str, Counter[str]] = {} + for protein_id in protein_ids: + if protein_domain_counter_by_domain_source := proteinCollection.proteins_by_protein_id[ + protein_id + ].domain_counter_by_domain_source: + for domain_source, protein_domain_counter in list( + protein_domain_counter_by_domain_source.items() + ): + if domain_source not in cluster_domain_counter_by_domain_source: + cluster_domain_counter_by_domain_source[domain_source] = ( + Counter() + ) + cluster_domain_counter_by_domain_source[ + domain_source + ] += protein_domain_counter + return cluster_domain_counter_by_domain_source + + def compute_domain_entropy_by_domain_source(self) -> Dict[str, float]: + """ + Computes entropy for domains grouped by different sources. + + Returns: + - Dict[str, float]: Dictionary where keys are domain sources and values are computed entropy values. + """ + self.domain_entropy_by_domain_source: Dict[str, float] = {} + for domain_source, domain_counter in list( + self.domain_counter_by_domain_source.items() + ): + total_count: int = len(list(domain_counter.elements())) + domain_entropy: float = -sum( + i / total_count * log(i / total_count, 2) + for i in list(domain_counter.values()) + ) + if str(domain_entropy) == "-0.0": + self.domain_entropy_by_domain_source[domain_source] = 0.0 + else: + self.domain_entropy_by_domain_source[domain_source] = domain_entropy + return self.domain_entropy_by_domain_source + + +class ClusterCollection: + def __init__( + self, + cluster_list: List[Cluster], + inferred_singletons_count: int, + functional_annotation_parsed: bool, + fastas_parsed: bool, + domain_sources: List[str], + ): + self.cluster_list: List[Cluster] = cluster_list + self.cluster_list_by_cluster_id: Dict[str, Cluster] = { + cluster.cluster_id: cluster for cluster in cluster_list + } # only for testing + self.cluster_count: int = len(cluster_list) + self.inferred_singletons_count: int = inferred_singletons_count + self.functional_annotation_parsed: bool = functional_annotation_parsed + self.fastas_parsed: bool = fastas_parsed + # self.domain_sources = [domain_source for domain_source in domain_sources if not domain_source == "GO"] + self.domain_sources: List[str] = domain_sources diff --git a/src/core/config.py b/src/core/config.py new file mode 100644 index 0000000..098c545 --- /dev/null +++ b/src/core/config.py @@ -0,0 +1,15 @@ +ATTRIBUTE_RESERVED = ["IDX", "OUT", "TAXID"] +SUPPORTED_TESTS = {"welch", "mannwhitneyu", "ttest", "ks", "kruskal"} +SUPPORTED_PLOT_FORMATS = {"png", "pdf", "svg"} +SUPPORTED_TAXRANKS = { + "superkingdom", + "kingdom", + "phylum", + "class", + "order", + "superfamily", + "family", + "subfamily", + "genus", + "species", +} diff --git a/src/core/datastore.py b/src/core/datastore.py new file mode 100644 index 
0000000..58a4a73 --- /dev/null +++ b/src/core/datastore.py @@ -0,0 +1,2105 @@ +import logging +import os +import time +from collections import Counter, defaultdict +from typing import Any, Dict, FrozenSet, Generator, List, Set, Tuple, Union + +import matplotlib as mat +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.lines import Line2D +from matplotlib.ticker import FormatStrFormatter, NullFormatter + +from core.alo import AttributeLevel +from core.alo_collections import AloCollection +from core.build import ( + build_AloCollection, + build_ClusterCollection, + build_ProteinCollection, +) +from core.clusters import Cluster, ClusterCollection +from core.input import InputData +from core.logic import get_ALO_cluster_cardinality, get_attribute_cluster_type +from core.proteins import ProteinCollection +from core.utils import median, progress, statistic + +logger = logging.getLogger("kinfin_logger") +mat.use("agg") + +plt.style.use("ggplot") +mat.rc("ytick", labelsize=20) +mat.rc("xtick", labelsize=20) +axis_font = {"size": "20"} +mat.rcParams.update({"font.size": 22}) + + +class DataFactory: + def __init__(self, inputData: InputData) -> None: + self.dirs = {} + self.inputData: InputData = inputData + self.aloCollection: AloCollection = build_AloCollection( + config_f=self.inputData.config_f, + nodesdb_f=self.inputData.nodesdb_f, + tree_f=self.inputData.tree_f, + taxranks=self.inputData.taxranks, + taxon_idx_mapping_file=self.inputData.taxon_idx_mapping_file, + ) + self.proteinCollection: ProteinCollection = build_ProteinCollection( + aloCollection=self.aloCollection, + fasta_dir=self.inputData.fasta_dir, + go_mapping_f=self.inputData.go_mapping_f, + functional_annotation_f=self.inputData.functional_annotation_f, + ipr_mapping=self.inputData.ipr_mapping, + ipr_mapping_f=self.inputData.ipr_mapping_f, + pfam_mapping=self.inputData.pfam_mapping, + pfam_mapping_f=self.inputData.pfam_mapping_f, + sequence_ids_f=self.inputData.sequence_ids_f, + species_ids_f=self.inputData.species_ids_f, + ) + self.clusterCollection: ClusterCollection = build_ClusterCollection( + cluster_f=self.inputData.cluster_f, + output_dir=self.inputData.output_path, + proteinCollection=self.proteinCollection, + infer_singletons=self.inputData.infer_singletons, + available_proteomes=self.aloCollection.proteomes, + ) + + def setup_dirs(self) -> None: + """ + Set up output directories for storing results and attributes. 
+ """ + output_path: str = self.inputData.output_path + + self.dirs["main"] = output_path + logger.info("[STATUS] - Output directories in") + logger.info(f"\t{output_path}") + if not os.path.exists(output_path): + logger.info("[STATUS] - Creating main output directory...") + os.makedirs(output_path) + + logger.info("[STATUS] - Creating directories ...") + for attribute in self.aloCollection.attributes: + attribute_path = os.path.join(output_path, attribute) + self.dirs[attribute] = attribute_path + if not os.path.exists(attribute_path): + logger.info( + f"[STATUS] - Creating directory for attribute: {attribute_path}" + ) + os.makedirs(attribute_path) + + if self.aloCollection.tree_ete is not None: + tree_path = os.path.join(output_path, "tree") + node_chart_path = os.path.join(tree_path, "charts") + node_header_path = os.path.join(tree_path, "headers") + + if not os.path.exists(tree_path): + logger.info(f"[STATUS] - Creating tree directory: {tree_path}") + os.makedirs(tree_path) + self.dirs["tree"] = tree_path + + if not os.path.exists(node_chart_path): + logger.info( + f"[STATUS] - Creating node charts directory: {node_chart_path}" + ) + os.makedirs(node_chart_path) + self.dirs["tree_charts"] = node_chart_path + + if self.inputData.plot_tree and not os.path.exists(node_header_path): + logger.info( + f"[STATUS] - Creating node headers directory: {node_header_path}" + ) + os.makedirs(node_header_path) + self.dirs["tree_headers"] = node_header_path + + def analyse_clusters(self) -> None: + """ + Analyses clusters within the cluster collection. + + Then proceeds to analyse each cluster individually, + logging progress and timing information. + + Returns: + None + """ + if self.clusterCollection.inferred_singletons_count: + logger.info( + f"[STATUS]\t - Clusters found = {self.clusterCollection.cluster_count} (of which {self.clusterCollection.inferred_singletons_count} were inferred singletons)") # fmt:skip + + else: + logger.info( + f"[STATUS]\t - Clusters found = {self.clusterCollection.cluster_count}" + ) + + parse_steps = self.clusterCollection.cluster_count / 100 + + logger.info("[STATUS] - Analysing clusters ...") + analyse_clusters_start = time.time() + for idx, cluster in enumerate(self.clusterCollection.cluster_list): + self.__analyse_cluster(cluster) + progress(idx + 1, parse_steps, self.clusterCollection.cluster_count) + analyse_clusters_end = time.time() + analyse_clusters_elapsed = analyse_clusters_end - analyse_clusters_start + logger.info(f"[STATUS] - Took {analyse_clusters_elapsed}s to analyse clusters") + + def plot_rarefaction_data( + self, + rarefaction_by_samplesize_by_level_by_attribute: Dict[ + str, Dict[str, Dict[int, List[int]]] + ], + dirs: Dict[str, str], + plotsize: Tuple[float, float], + plot_format: str, + fontsize: int, + ) -> None: + """ + Plot rarefaction curves based on provided data. + + Args: + rarefaction_by_samplesize_by_level_by_attribute (dict): A nested dictionary + where keys are attribute names, and values are dictionaries where keys + are level names and values are dictionaries mapping sample sizes to + lists of non-singleton cluster counts. + dirs (dict): A dictionary mapping attribute names to directory paths where + plots will be saved. + plotsize (tuple): A tuple specifying the size of the plot (width, height) in inches. + plot_format (str): The format of the plot to save (e.g., 'png', 'pdf'). + fontsize (int): Font size for plot labels and legend. 
+ + Returns: + None + """ + for ( + attribute, + rarefaction_by_samplesize_by_level, + ) in rarefaction_by_samplesize_by_level_by_attribute.items(): + rarefaction_plot_f = os.path.join( + dirs[attribute], f"{attribute}.rarefaction_curve.{plot_format}" + ) + f, ax = plt.subplots(figsize=plotsize) + ax.set_facecolor("white") + max_number_of_samples = 0 + for idx, level in enumerate(rarefaction_by_samplesize_by_level): + number_of_samples = len(rarefaction_by_samplesize_by_level[level]) + if number_of_samples > max_number_of_samples: + max_number_of_samples = number_of_samples + colour = plt.cm.Paired( # type: ignore + idx / len(rarefaction_by_samplesize_by_level) + ) # type: ignore + x_values = [] + y_mins = [] + y_maxs = [] + median_y_values = [] + median_x_values = [] + for x, y_reps in list( + rarefaction_by_samplesize_by_level[level].items() + ): + x_values.append(x) + y_mins.append(min(y_reps)) + y_maxs.append(max(y_reps)) + median_y_values.append(median(y_reps)) + median_x_values.append(x) + x_array = np.array(x_values) + y_mins_array = np.array(y_mins) + y_maxs_array = np.array(y_maxs) + ax.plot( + median_x_values, + median_y_values, + "-", + color=colour, + label=level, + ) + ax.fill_between( + x_array, + y_mins_array, # type:ignore + y_maxs_array, # type:ignore + color=colour, + alpha=0.5, + ) + ax.set_xlim([0, max_number_of_samples + 1]) + ax.set_ylabel("Count of non-singleton clusters", fontsize=fontsize) + ax.set_xlabel("Sampled proteomes", fontsize=fontsize) + + ax.grid(True, linewidth=1, which="major", color="lightgrey") + legend = ax.legend( + ncol=1, + numpoints=1, + loc="lower right", + frameon=True, + fontsize=fontsize, + ) + legend.get_frame().set_facecolor("white") + logger.info(f"[STATUS]\t- Plotting {rarefaction_plot_f}") + f.savefig(rarefaction_plot_f, format=plot_format) + plt.close() + + def write_output(self) -> None: + """ + Executes various methods to generate and write output files related to cluster analysis. + + This method sequentially calls private methods to: + - Plot cluster sizes. + - Write cluster counts by taxon. + - Write cluster metrics related to domains. + - Write detailed cluster metrics related to domains. + - Write attribute metrics. + - Write a summary of cluster metrics. + - Write cluster metrics related to ALO (Additive Log Ratio) transformation. + - Write cluster 1-to-1 ALO metrics. + - Write pairwise representation metrics. + + Each private method is responsible for generating specific outputs based on internal data. + + Returns: + None + """ + self.__plot_cluster_sizes() + self.__write_cluster_counts_by_taxon() + self.__write_cluster_metrics_domains() + self.__write_cluster_metrics_domains_detailed() + self.__write_attribute_metrics() + self.__write_cluster_summary() + self.__write_cluster_metrics_ALO() + self.__write_cluster_1to1_ALO() + self.__write_pairwise_representation() + + # analyse cluster + def __analyse_ete_for_specific_cluster( + self, + cluster: Cluster, + intersection: FrozenSet[str], + node, + ) -> None: + """ + Analyzes a specific cluster within an evolutionary tree node. + + Updates various counts and attributes of the node based on the characteristics + of the given cluster and its intersection with proteome IDs. + + Args: + cluster (Cluster): The cluster to analyze. + intersection (FrozenSet[str]): The intersection of proteome IDs between + the cluster and the current node. + node: The evolutionary tree node to update. 
+ + Returns: + None + """ + node.counts["specific"] += 1 # type: ignore + if cluster.proteome_count == 1: + # But it only belongs to one proteome + node.apomorphic_cluster_counts["non_singletons"] += 1 # type: ignore + else: + # It has more than one proteome + child_nodes_covered = [] + child_node_proteome_coverage_strings = [] + child_node_proteome_ids_covered_count = 0 + for child_node in node.get_children(): + if child_node.proteome_ids.isdisjoint(cluster.proteome_ids): + # No child node proteomes are not in cluster + child_nodes_covered.append(False) + else: + # At least on child node proteome in cluster + child_nodes_covered.append(True) + child_node_proteome_ids_covered_count = len( + cluster.proteome_ids.intersection(child_node.proteome_ids) + ) + child_node_proteome_coverage_strings.append( + f"{child_node.name}=({child_node_proteome_ids_covered_count}/{len(child_node.proteome_ids)})" + ) + if all(child_nodes_covered): + # At least one proteome of each child node in cluster + # => SYNAPOMORPHY + node_proteome_coverage = len(intersection) / len( + node.proteome_ids + ) # type: ignore + node_cluster_type = "" + node_cluster_type = ( + "complete_presence" + if node_proteome_coverage == 1.0 + else "partial_absence" + ) + # type: ignore + node.synapomorphic_cluster_counts[node_cluster_type] += 1 + + node.synapomorphic_cluster_strings.append( # type: ignore + ( + cluster.cluster_id, + node.name, + node_cluster_type, + "{0:.3}".format(node_proteome_coverage), + ";".join(child_node_proteome_coverage_strings), + ",".join(sorted(intersection)), + ) + ) + + def __analyse_tree_ete(self, cluster: Cluster) -> None: + """ + Analyzes a cluster within an ETE Tree if available in the ALO collection. + + Traverses the ETE Tree in level order, comparing proteome IDs of each node + with the cluster's proteome IDs. Updates counts and attributes of nodes + based on the analysis results. + + Args: + cluster (Cluster): The cluster to analyze. + + Returns: + None + """ + if not self.aloCollection.tree_ete: + return + + for node in self.aloCollection.tree_ete.traverse("levelorder"): # type: ignore + intersection = cluster.proteome_ids.intersection( + node.proteome_ids # type: ignore + ) # type: ignore + difference = cluster.proteome_ids.difference( + node.proteome_ids # type: ignore + ) # type: ignore + + if len(intersection) == 0: + # Nothing to see here ... + node.counts["absent"] += 1 # type: ignore + + elif cluster.singleton is True: + # This is a singleton + node.counts["singleton"] += 1 # type: ignore + node.apomorphic_cluster_counts["singletons"] += 1 # type: ignore + + elif len(difference) > 0: + # This is a 'shared' cluster + node.counts["shared"] += 1 # type: ignore + + elif len(difference) == 0: + # This is a node 'specific' cluster + self.__analyse_ete_for_specific_cluster( + cluster=cluster, + intersection=intersection, + node=node, + ) + + def __process_level( + self, + cluster: Cluster, + attribute: str, + level: str, + protein_ids_by_level: Dict[str, List[str]], + protein_length_stats_by_level: Dict[str, Dict[str, Union[int, float]]], + explicit_protein_count_by_proteome_id_by_level: Dict[str, Dict[str, int]], + ) -> None: + """ + Processes a specific level within an attribute for a given cluster. + + Retrieves protein IDs and their counts associated with the specified level + from the ALO collection and updates various attributes and collections within + the cluster and the class instance. + + Args: + cluster (Cluster): The cluster for which to process the level. 
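# Illustrative sketch of how a cluster is classified against one tree node in
# __analyse_tree_ete above. Proteome sets are hypothetical, and the full code
# additionally requires every child node to be covered before a "specific"
# cluster is counted as a synapomorphy.
cluster_proteome_ids = frozenset({"P1", "P2"})
node_proteome_ids = frozenset({"P1", "P2", "P3"})
cluster_is_singleton = False

intersection = cluster_proteome_ids & node_proteome_ids
difference = cluster_proteome_ids - node_proteome_ids

if not intersection:
    status = "absent"        # cluster shares no proteomes with this node
elif cluster_is_singleton:
    status = "singleton"     # single-protein cluster
elif difference:
    status = "shared"        # cluster extends beyond this node's proteomes
else:
    status = "specific"      # all cluster proteomes sit under this node
    coverage = len(intersection) / len(node_proteome_ids)
    synapomorphy_type = "complete_presence" if coverage == 1.0 else "partial_absence"

print(status)  # 'specific' here; coverage = 2/3, so synapomorphy_type == 'partial_absence'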
+ attribute (str): The attribute associated with the level. + level (str): The specific level to process. + protein_ids_by_level (dict): A dictionary to store protein IDs by level. + protein_length_stats_by_level (dict): A dictionary to store protein length statistics + by level. + explicit_protein_count_by_proteome_id_by_level (dict): A dictionary to store explicit + protein counts by proteome ID for each level. + + Returns: + None + """ + ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][level] + if ALO is None: + return + + protein_ids_by_proteome_id = {} + protein_count_by_proteome_id = {} + protein_ids_by_level[level] = [] + + for proteome_id in ALO.proteomes_list: + protein_ids = list(cluster.protein_ids_by_proteome_id.get(proteome_id, [])) + protein_ids_by_level[level].extend(protein_ids) + protein_count_by_proteome_id[proteome_id] = len(protein_ids) + if protein_count_by_proteome_id[proteome_id] != 0: + protein_ids_by_proteome_id[proteome_id] = protein_ids + + if protein_ids_by_proteome_id: + cluster.implicit_protein_ids_by_proteome_id_by_level_by_attribute[ + attribute + ][level] = protein_ids_by_proteome_id + + explicit_protein_count_by_proteome_id_by_level[level] = ( + protein_count_by_proteome_id + ) + + protein_length_stats_by_level[level] = ( + self.proteinCollection.get_protein_length_stats(protein_ids_by_level[level]) + ) + + cluster.protein_counts_of_proteomes_by_level_by_attribute[attribute][level] = ( + list(protein_count_by_proteome_id.values()) + ) + + def __update_ALO_data( + self, + cluster: Cluster, + attribute: str, + protein_ids_by_level: Dict[str, List[str]], + protein_length_stats_by_level: Dict[str, Dict[str, Union[int, float]]], + explicit_protein_count_by_proteome_id_by_level: Dict[str, Dict[str, int]], + ) -> None: + """ + Updates ALO (Additive Log Ratio) data for a given cluster and attribute. + + Iterates through each level of the ALO collection corresponding to the attribute, + calculates various metrics based on the cluster's protein IDs and attributes, and + updates the ALO object with this information. + + Args: + cluster (Cluster): The cluster to update ALO data for. + attribute (str): The attribute associated with the ALO data. + protein_ids_by_level (dict): A dictionary mapping level names to lists of protein IDs. + protein_length_stats_by_level (dict): A dictionary mapping level names to dictionaries + containing protein length statistics. + explicit_protein_count_by_proteome_id_by_level (dict): A dictionary mapping level names + to dictionaries where keys are proteome IDs and values are explicit protein counts. 
+ + Returns: + None + """ + for level in self.aloCollection.ALO_by_level_by_attribute[attribute]: + ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][level] + if ALO is None: + continue + + cluster.proteome_coverage_by_level_by_attribute[attribute][level] = ( + len( + cluster.implicit_protein_ids_by_proteome_id_by_level_by_attribute[ + attribute + ].get(level, []) + ) + / ALO.proteome_count + ) + + ALO_cluster_status = ( + "present" + if level + in cluster.implicit_protein_ids_by_proteome_id_by_level_by_attribute[ + attribute + ] + else "absent" + ) + + ALO_cluster_cardinality = None + mwu_pvalue = None + mwu_log2_mean = None + mean_ALO_count = None + mean_non_ALO_count = None + + if ( + ALO_cluster_status == "present" + and cluster.cluster_type_by_attribute[attribute] != "singleton" + ): + ALO_proteome_counts_in_cluster = list( + explicit_protein_count_by_proteome_id_by_level[level].values() + ) + ALO_cluster_cardinality = get_ALO_cluster_cardinality( + ALO_proteome_counts_in_cluster=ALO_proteome_counts_in_cluster, + fuzzy_count=self.inputData.fuzzy_count, + fuzzy_fraction=self.inputData.fuzzy_fraction, + fuzzy_range=self.inputData.fuzzy_range, + ) + + if cluster.cluster_type_by_attribute[attribute] == "shared": + non_ALO_proteome_counts_in_cluster = [ + count + for non_ALO_level in explicit_protein_count_by_proteome_id_by_level + if non_ALO_level != level + for count in explicit_protein_count_by_proteome_id_by_level[ + non_ALO_level + ].values() + ] + mwu_pvalue, mwu_log2_mean, mean_ALO_count, mean_non_ALO_count = ( + statistic( + count_1=ALO_proteome_counts_in_cluster, + count_2=non_ALO_proteome_counts_in_cluster, + test=self.inputData.test, + min_proteomes=self.inputData.min_proteomes, + ) + ) + + ALO.add_cluster( + cluster=cluster, + attribute_cluster_type=cluster.cluster_type_by_attribute[attribute], + ALO_cluster_status=ALO_cluster_status, + ALO_protein_length_stats=protein_length_stats_by_level[level], + ALO_protein_ids_in_cluster=protein_ids_by_level[level], + ALO_cluster_cardinality=ALO_cluster_cardinality, + mwu_pvalue=mwu_pvalue, + mwu_log2_mean=mwu_log2_mean, + mean_ALO_count=mean_ALO_count, + mean_non_ALO_count=mean_non_ALO_count, + ) + + def __process_single_attribute(self, cluster: Cluster, attribute: str) -> None: + """ + Processes a single attribute for a given cluster. + + Retrieves and processes each level associated with the attribute from the ALO + collection, updating various protein and cluster metrics within the cluster object. + + Args: + cluster (Cluster): The cluster to process the attribute for. + attribute (str): The attribute to process. 
+ + Returns: + None + """ + protein_ids_by_level: Dict[str, List[str]] = {} + protein_length_stats_by_level: Dict[str, Dict[str, Union[int, float]]] = {} + explicit_protein_count_by_proteome_id_by_level: Dict[str, Dict[str, int]] = {} + + cluster.protein_counts_of_proteomes_by_level_by_attribute[attribute] = {} + cluster.proteome_coverage_by_level_by_attribute[attribute] = {} + cluster.implicit_protein_ids_by_proteome_id_by_level_by_attribute[attribute] = ( + {} + ) + + for level in self.aloCollection.ALO_by_level_by_attribute[attribute]: + self.__process_level( + cluster, + attribute, + level, + protein_ids_by_level, + protein_length_stats_by_level, + explicit_protein_count_by_proteome_id_by_level, + ) + + cluster.cluster_type_by_attribute[attribute] = get_attribute_cluster_type( + cluster.singleton, + cluster.implicit_protein_ids_by_proteome_id_by_level_by_attribute[ + attribute + ], + ) + + self.__update_ALO_data( + cluster, + attribute, + protein_ids_by_level, + protein_length_stats_by_level, + explicit_protein_count_by_proteome_id_by_level, + ) + + def __process_attributes(self, cluster: Cluster) -> None: + """ + Processes all attributes in the ALO collection for a given cluster. + + Iterates through each attribute in the ALO collection and processes it + using the __process_single_attribute method. + + Args: + cluster (Cluster): The cluster to process attributes for. + + Returns: + None + """ + for attribute in self.aloCollection.attributes: + self.__process_single_attribute(cluster, attribute) + + def __finalize_cluster_analysis(self, cluster: Cluster) -> None: + """ + Finalizes the cluster analysis by calculating the median protein count. + + Calculates the median protein count for the given cluster using the protein + counts of proteomes from specific levels and attributes. + + Args: + cluster (Cluster): The cluster for which to finalize the analysis. + + Returns: + None + """ + cluster.protein_median = median( + [ + count + for count in cluster.protein_counts_of_proteomes_by_level_by_attribute[ + "all" + ]["all"] + if count != 0 + ] + ) + + def __analyse_cluster(self, cluster: Cluster) -> None: + """ + Analyzes a cluster by performing various analysis steps. + + Executes the analysis steps for the given cluster: + 1. If an ETE tree is available in aloCollection, analyzes the tree structure. + 2. Processes attributes associated with the cluster. + 3. Finalizes the cluster analysis by calculating median protein counts. + + Args: + cluster (Cluster): The cluster to be analyzed. + + Returns: + None + """ + if self.aloCollection.tree_ete: + self.__analyse_tree_ete(cluster=cluster) + + self.__process_attributes(cluster) + self.__finalize_cluster_analysis(cluster) + + # write output + # 0. __get_header_line + def __get_header_line(self, filetype: str, attribute: str) -> str: + """ + Generates a header line for different types of file formats based on the provided + `filetype` and `attribute`. + + Args: + filetype (str): The type of file for which the header line is generated. Valid values: + - "attribute_metrics": Header line for attribute metrics. + - "cafe": Header line for CAFE analysis. + - "cluster_1to1s_ALO": Header line for cluster 1-to-1 relationships with ALO. + - "cluster_metrics": Header line for cluster metrics. + - "cluster_metrics_ALO": Header line for cluster metrics with ALO. + - "cluster_metrics_domains": Header line for cluster metrics with domains. + - "cluster_metrics_domains_detailed": Header line for detailed cluster metrics with domains. 
+ - "pairwise_representation_test": Header line for pairwise representation test. + + attribute (str): The attribute associated with the cluster, used in certain file types. + + Returns: + str: The generated header line as a tab-separated string. + + Raises: + ValueError: If `filetype` is not recognized. + """ + if filetype == "attribute_metrics": + attribute_metrics_header = [ + "#attribute", + "taxon_set", + "cluster_total_count", + "protein_total_count", + "protein_total_span", + "singleton_cluster_count", + "singleton_protein_count", + "singleton_protein_span", + "specific_cluster_count", + "specific_protein_count", + "specific_protein_span", + "shared_cluster_count", + "shared_protein_count", + "shared_protein_span", + "specific_cluster_true_1to1_count", + "specific_cluster_fuzzy_count", + "shared_cluster_true_1to1_count", + "shared_cluster_fuzzy_count", + "absent_cluster_total_count", + "absent_cluster_singleton_count", + "absent_cluster_specific_count", + "absent_cluster_shared_count", + "TAXON_count", + "TAXON_taxa", + ] + return "\t".join(attribute_metrics_header) + elif filetype == "cafe": + cafe_header = ["#ID"] + cafe_header.extend( + iter(sorted(self.aloCollection.ALO_by_level_by_attribute["taxon"])) + ) + return "\t".join(cafe_header) + elif filetype == "cluster_1to1s_ALO": + cluster_1to1s_ALO_header = [ + "#cluster_id", + "cluster_type", + "1to1_type", + "proteome_count", + "percentage_at_target_count", + ] + return "\t".join(cluster_1to1s_ALO_header) + elif filetype == "cluster_metrics": + cluster_metrics_header = [ + "#cluster_id", + "cluster_protein_count", + "protein_median_count", + "TAXON_count", + "attribute", + "attribute_cluster_type", + "protein_span_mean", + "protein_span_sd", + ] + cluster_metrics_header += [ + f"{level}_count" + for level in sorted( + self.aloCollection.ALO_by_level_by_attribute[attribute] + ) + ] + if attribute != "taxon": + cluster_metrics_header += [ + f"{level}_median" + for level in sorted( + self.aloCollection.ALO_by_level_by_attribute[attribute] + ) + ] + cluster_metrics_header += [ + f"{level}_cov" + for level in sorted( + self.aloCollection.ALO_by_level_by_attribute[attribute] + ) + ] + return "\t".join(cluster_metrics_header) + elif filetype == "cluster_metrics_ALO": + cluster_metrics_ALO_header = [ + "#cluster_id", + "cluster_status", + "cluster_type", + "cluster_protein_count", + "cluster_proteome_count", + "TAXON_protein_count", + "TAXON_mean_count", + "non_taxon_mean_count", + "representation", + "log2_mean(TAXON/others)", + "pvalue(TAXON vs. 
others)", + "TAXON_coverage", + "TAXON_count", + "non_TAXON_count", + "TAXON_taxa", + "non_TAXON_taxa", + ] + # for domain_source in clusterCollection.domain_sources: + # cluster_metrics_ALO_header.append(domain_source) + return "\t".join(cluster_metrics_ALO_header) + elif filetype == "cluster_metrics_domains": + cluster_metrics_domains_header = [ + "#cluster_id", + "cluster_protein_count", + "TAXON_count", + "protein_span_mean", + "protein_span_sd", + "fraction_secreted", + ] + for domain_source in self.clusterCollection.domain_sources: + cluster_metrics_domains_header.extend( + (domain_source, f"{domain_source}_entropy") + ) + return "\t".join(cluster_metrics_domains_header) + elif filetype == "cluster_metrics_domains_detailed": + cluster_metrics_domains_detailed_header = [ + "#cluster_id", + "domain_source", + "domain_id", + "domain_description", + "protein_count", + "protein_count_with_domain", + "TAXA_with_domain_fraction", + "TAXA_with_domain", + "TAXA_without_domain", + ] + return "\t".join(cluster_metrics_domains_detailed_header) + elif filetype == "pairwise_representation_test": + pairwise_representation_test_header = [ + "#cluster_id", + "TAXON_1", + "TAXON_1_mean", + "TAXON_2", + "TAXON_2_mean", + "log2_mean(TAXON_1/TAXON_2)", + "mwu_pvalue(TAXON_1 vs. TAXON_2)", + ] + # pairwise_representation_test_header.append("go_terms") + # for domain_source in clusterCollection.domain_sources: + # pairwise_representation_test_header.append(domain_source) + return "\t".join(pairwise_representation_test_header) + else: + error_msg = f"[ERROR] {filetype} is not a valid header 'filetype'" + raise ValueError(error_msg) + + # 1. plot_cluster_sizes + def __plot_cluster_sizes(self) -> None: + """ + Plot the distribution of cluster sizes based on the protein counts in each cluster. + + Saves the plot as a figure in the directory specified by self.dirs["main"]. + + Returns: + None + + Raises: + ValueError: If self.inputData.plot_format is not a valid file format. + """ + cluster_protein_count = [ + cluster.protein_count for cluster in self.clusterCollection.cluster_list + ] + cluster_protein_counter = Counter(cluster_protein_count) + count_plot_f = os.path.join( + self.dirs["main"], + f"cluster_size_distribution.{self.inputData.plot_format}", + ) + f, ax = plt.subplots(figsize=self.inputData.plotsize) + ax.set_facecolor("white") + x_values = [] + y_values = [] + for value, count in list(cluster_protein_counter.items()): + x_values.append(value) + y_values.append(count) + x_array = np.array(x_values) # type: ignore + y_array = np.array(y_values) + ax.scatter(x_array, y_array, marker="o", alpha=0.8, s=100) # type: ignore + ax.set_xlabel("Cluster size", fontsize=self.inputData.fontsize) + ax.set_ylabel("Count", fontsize=self.inputData.fontsize) + ax.set_yscale("log") + ax.set_xscale("log") + plt.margins(0.8) + plt.gca().set_ylim(bottom=0.8) + plt.gca().set_xlim(left=0.8) + ax.xaxis.set_major_formatter(FormatStrFormatter("%.0f")) + ax.yaxis.set_major_formatter(FormatStrFormatter("%.0f")) + f.tight_layout() + + ax.grid(True, linewidth=1, which="major", color="lightgrey") + ax.grid(True, linewidth=0.5, which="minor", color="lightgrey") + logger.info(f"[STATUS] - Plotting {count_plot_f}") + f.savefig(count_plot_f, format=self.inputData.plot_format) + plt.close() + + # 2. write_cluster_counts_by_taxon + def __write_cluster_counts_by_taxon(self) -> None: + """ + Write cluster counts by taxon attribute to a text file. 
+ + This method iterates through attributes in self.aloCollection.attributes, + retrieves protein counts by level for clusters in self.clusterCollection.cluster_list + that match the attribute "taxon", and writes the data to a text file named + 'cluster_counts_by_taxon.txt' in the directory specified by self.dirs["main"]. + + Raises: + ValueError: If the header type 'cafe' is not recognized. + """ + cafe_f = os.path.join(self.dirs["main"], "cluster_counts_by_taxon.txt") + for attribute in self.aloCollection.attributes: + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + cafe_output = [] + for cluster in self.clusterCollection.cluster_list: + if attribute == "taxon": + cafe_line = f"{cluster.cluster_id}" + # cafe_line.append("None") + for _level in levels: + total_proteins = sum( + cluster.protein_counts_of_proteomes_by_level_by_attribute[ + attribute + ][_level] + ) + cafe_line += f"\t{total_proteins}" + cafe_output.append(cafe_line) + if cafe_output: + with open(cafe_f, "w") as cafe_fh: + logger.info(f"[STATUS] - Writing {cafe_f}") + cafe_output.sort() + cafe_output.insert(0, self.__get_header_line("cafe", "taxon")) + cafe_fh.write("\n".join(cafe_output) + "\n") + cafe_output = [] + + # 3. write_cluster_metrics_domains + def __write_cluster_metrics_domains(self) -> None: + """ + Write cluster metrics to a file 'cluster_metrics_domains.txt'. + + This method constructs and writes cluster metrics data to a text file, + including cluster IDs, protein counts, taxon counts, domain statistics, + and entropy for each domain source present in the cluster collection. + + Raises: + IOError: If there is an issue writing to the output file. + + """ + cluster_metrics_domains_f = os.path.join( + self.dirs["main"], "cluster_metrics_domains.txt" + ) + header = self.__get_header_line("cluster_metrics_domains", "taxon").split("\t") + cluster_metrics_domains_output = [] + + if self.clusterCollection.functional_annotation_parsed: + for cluster in self.clusterCollection.cluster_list: + line_parts = { + "#cluster_id": cluster.cluster_id, + "cluster_protein_count": str(cluster.protein_count), + "TAXON_count": str(cluster.proteome_count), + "protein_span_mean": "N/A", + "protein_span_sd": "N/A", + "fraction_secreted": "N/A", + } + + if ( + self.clusterCollection.fastas_parsed + and cluster.protein_length_stats + ): + line_parts["protein_span_mean"] = str( + cluster.protein_length_stats["mean"] + ) + line_parts["protein_span_sd"] = str( + cluster.protein_length_stats["sd"] + ) + + if "SignalP_EUK" in self.clusterCollection.domain_sources: + line_parts["fraction_secreted"] = "{0:.2f}".format( + cluster.secreted_cluster_coverage + ) + + for domain_source in self.clusterCollection.domain_sources: + if domain_source in cluster.domain_counter_by_domain_source: + sorted_counts = sorted( + [ + f"{domain_id}:{count}" + for domain_id, count in cluster.domain_counter_by_domain_source[ + domain_source + ].most_common() + ], + key=lambda x: (x.split(":")[-1], x.split(":")[-2]), + ) + line_parts[domain_source] = ";".join(sorted_counts) + line_parts[f"{domain_source}_entropy"] = "{0:.3f}".format( + cluster.domain_entropy_by_domain_source[domain_source] + ) + else: + line_parts[domain_source] = "N/A" + line_parts[f"{domain_source}_entropy"] = "N/A" + + # Ensure we're following the correct order from the header + ordered_line = [line_parts.get(col, "N/A") for col in header] + cluster_metrics_domains_output.append("\t".join(ordered_line)) + + if cluster_metrics_domains_output: + with 
open(cluster_metrics_domains_f, "w") as cluster_metrics_domains_fh: + logger.info(f"[STATUS] - Writing {cluster_metrics_domains_f}") + cluster_metrics_domains_output.sort() + cluster_metrics_domains_output.insert(0, "\t".join(header)) + cluster_metrics_domains_fh.write( + "\n".join(cluster_metrics_domains_output) + "\n" + ) + + # 4. write_cluster_metrics_domains_detailed + def __count_proteins_with_domain( + self, cluster: Cluster, domain_source: str, domain_id: str + ) -> Tuple[Dict[str, int], Dict[str, int]]: + """ + Count proteins with and without a specific domain in each proteome of a cluster. + + Args: + cluster (Cluster): The cluster object containing proteins to be analyzed. + domain_source (str): The source of the domain to be counted (e.g., "Pfam", "InterPro"). + domain_id (str): The ID of the specific domain to be counted. + + Returns: + Tuple[Dict[str, int], Dict[str, int]]: A tuple containing: + - A dictionary where keys are proteome IDs and values are counts of proteins + in the proteome that have the specified domain (`with_domain`). + - A dictionary where keys are proteome IDs and values are counts of proteins + in the proteome that do not have the specified domain (`without_domain`). + + """ + with_domain = defaultdict(int) + without_domain = defaultdict(int) + + for proteome_id, protein_ids in cluster.protein_ids_by_proteome_id.items(): + for protein_id in protein_ids: + protein = self.proteinCollection.proteins_by_protein_id[protein_id] + if ( + domain_source in protein.domain_counter_by_domain_source + and domain_id + in protein.domain_counter_by_domain_source[domain_source] + ): + with_domain[proteome_id] += 1 + else: + without_domain[proteome_id] += 1 + + return with_domain, without_domain + + def __format_proteome_counts( + self, count_dict: Dict[str, int], cluster: Cluster + ) -> str: + """ + Format proteome counts into a string representation. + + Args: + count_dict (Dict[str, int]): A dictionary where keys are proteome IDs and values are counts. + cluster (Cluster): The cluster object associated with the counts. + + Returns: + str: A string representation of proteome counts formatted as "proteome_id:count/total" + for each proteome ID in sorted order, separated by commas. If count_dict is empty, + returns "N/A". + + """ + return ( + ",".join( + f"{proteome_id}:{count}/{len(cluster.protein_ids_by_proteome_id[proteome_id])}" + for proteome_id, count in sorted(count_dict.items()) + ) + or "N/A" + ) + + def __get_domain_description(self, domain_source: str, domain_id: str) -> str: + """ + Get the description of a domain based on its source and ID. + + Args: + domain_source (str): The source of the domain (e.g., "SignalP_EUK", "Pfam"). + domain_id (str): The ID of the domain whose description is to be retrieved. + + Returns: + str: The description of the domain if found in `self.proteinCollection.domain_desc_by_id_by_source`, + otherwise returns "N/A". + + """ + if domain_source == "SignalP_EUK": + return domain_id + return self.proteinCollection.domain_desc_by_id_by_source.get( + domain_source, {} + ).get(domain_id, "N/A") + + def __process_cluster_domains( + self, cluster: Cluster, output_by_domain_source: Dict[str, List[str]] + ) -> None: + """ + Process domain statistics for a cluster and populate the output dictionary. + + Args: + cluster (Cluster): The cluster object containing domain statistics to process. 
+ output_by_domain_source (Dict[str, List[str]]): A dictionary where keys are domain sources + and values are lists of output lines to be populated with processed domain statistics. + + Returns: + None + + """ + for ( + domain_source, + domain_counter, + ) in cluster.domain_counter_by_domain_source.items(): + for domain_id, count in domain_counter.most_common(): + with_domain, without_domain = self.__count_proteins_with_domain( + cluster, domain_source, domain_id + ) + proteome_count_with_domain = sum( + count > 0 for count in with_domain.values() + ) + + with_domain_str = self.__format_proteome_counts(with_domain, cluster) + without_domain_str = self.__format_proteome_counts( + without_domain, cluster + ) + + domain_description = self.__get_domain_description( + domain_source, domain_id + ) + + output_line = ( + f"{cluster.cluster_id}\t{domain_source}\t{domain_id}\t" + f"{domain_description}\t{cluster.protein_count}\t" + f"{sum(with_domain.values())}\t" + f"{proteome_count_with_domain / cluster.proteome_count:.3f}\t" + f"{with_domain_str}\t{without_domain_str}" + ) + + output_by_domain_source[domain_source].append(output_line) + + def __write_domain_outputs( + self, + output_by_domain_source: Dict[str, List[str]], + output_files: Dict[str, str], + ) -> None: + """ + Write domain outputs to respective output files. + + Args: + output_by_domain_source (Dict[str, List[str]]): A dictionary where keys are domain sources + and values are lists of output lines to be written to output files. + output_files (Dict[str, str]): A dictionary where keys are domain sources and values are + corresponding output file paths. + + Returns: + None + + """ + for domain_source, output_lines in output_by_domain_source.items(): + if len(output_lines) > 1: + output_file = output_files[domain_source] + logger.info(f"[STATUS] - Writing {output_file}") + with open(output_file, "w") as fh: + fh.write("\n".join(output_lines) + "\n") + + def __write_cluster_metrics_domains_detailed(self) -> None: + """ + Write detailed cluster metrics for domain annotations to respective output files. + + This method constructs detailed cluster metrics for domain annotations and writes + them to individual output files for each domain source specified in the cluster + collection. + + Returns: + None + """ + output_by_domain_source: Dict[str, List[str]] = { + source: [] for source in self.clusterCollection.domain_sources + } + + output_files: Dict[str, str] = { + source: os.path.join( + self.dirs["main"], f"cluster_domain_annotation.{source}.txt" + ) + for source in self.clusterCollection.domain_sources + } + + if self.clusterCollection.functional_annotation_parsed: + for cluster in self.clusterCollection.cluster_list: + self.__process_cluster_domains(cluster, output_by_domain_source) + + self.__write_domain_outputs(output_by_domain_source, output_files) + + # 5. write attribute metrics + def __get_attribute_metrics(self, ALO: AttributeLevel) -> str: + """ + Retrieve attribute metrics as a formatted string. + + Args: + ALO (AttributeLevel): An instance of AttributeLevel containing the attribute metrics. + + Returns: + str: A tab-separated string containing various attribute metrics: + - Attribute name + - Attribute level + - Cluster counts and protein counts/span for different cluster types and statuses. + - Proteome count and other relevant metrics. 
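+
+            The fields are joined with tabs in the same order as the
+            "attribute_metrics" header built by __get_header_line, e.g.
+            (illustrative values only): "all\tall\t8000\t120000\t...".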
+ + """ + attribute_metrics = [ + ALO.attribute, + ALO.level, + ALO.get_cluster_count_by_cluster_status_by_cluster_type("present", "total"), + ALO.get_protein_count_by_cluster_type("total"), + ALO.get_protein_span_by_cluster_type("total"), + ALO.get_cluster_count_by_cluster_status_by_cluster_type( + "present", "singleton" + ), + ALO.get_protein_count_by_cluster_type("singleton"), + ALO.get_protein_span_by_cluster_type("singleton"), + ALO.get_cluster_count_by_cluster_status_by_cluster_type( + "present", "specific" + ), + ALO.get_protein_count_by_cluster_type("specific"), + ALO.get_protein_span_by_cluster_type("specific"), + ALO.get_cluster_count_by_cluster_status_by_cluster_type( + "present", "shared" + ), + ALO.get_protein_count_by_cluster_type("shared"), + ALO.get_protein_span_by_cluster_type("shared"), + ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type( + "specific", "true" + ), + ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type( + "specific", "fuzzy" + ), + ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type( + "shared", "true" + ), + ALO.get_cluster_count_by_cluster_cardinality_by_cluster_type( + "shared", "fuzzy" + ), + ALO.get_cluster_count_by_cluster_status_by_cluster_type("absent", "total"), + ALO.get_cluster_count_by_cluster_status_by_cluster_type( + "absent", "singleton" + ), + ALO.get_cluster_count_by_cluster_status_by_cluster_type( + "absent", "specific" + ), + ALO.get_cluster_count_by_cluster_status_by_cluster_type("absent", "shared"), + ALO.proteome_count, + ALO.get_proteomes(), + ] + + return "\t".join(map(str, attribute_metrics)) + + def __write_attribute_metrics(self) -> None: + """ + Write attribute metrics for each attribute to respective output files. + + This method iterates over each attribute in self.aloCollection.attributes, + retrieves attribute metrics for each level of the attribute, and writes them + to individual output files named after the attribute. + + Returns: + None + + """ + for attribute in self.aloCollection.attributes: + attribute_metrics_f = os.path.join( + self.dirs[attribute], f"{attribute}.attribute_metrics.txt" + ) + attribute_metrics_output = [] + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + for level in levels: + if ALO := self.aloCollection.ALO_by_level_by_attribute[attribute][ + level + ]: + attribute_metrics_output.append(self.__get_attribute_metrics(ALO)) + + if attribute_metrics_output: + with open(attribute_metrics_f, "w") as attribute_metrics_fh: + logger.info(f"[STATUS] - Writing {attribute_metrics_f}") + attribute_metrics_output.sort() + header_line = self.__get_header_line("attribute_metrics", attribute) + attribute_metrics_output.insert(0, header_line) + attribute_metrics_fh.write( + "\n".join(attribute_metrics_output) + "\n" + ) + + # 6. write cluster summary + def __write_cluster_summary(self) -> None: + """ + Write cluster summary metrics for each attribute to respective output files. + + This method iterates over each attribute in self.aloCollection.attributes, + retrieves cluster summary metrics for each cluster in self.clusterCollection.cluster_list, + and writes them to individual output files named after the attribute. 
+ + Returns: + None + + """ + for attribute in self.aloCollection.attributes: + cluster_metrics_f = os.path.join( + self.dirs[attribute], f"{attribute}.cluster_summary.txt" + ) + + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + cluster_metrics_output = [] + for cluster in self.clusterCollection.cluster_list: + cluster_metrics_line = [ + str(cluster.cluster_id), + str(cluster.protein_count), + str(cluster.protein_median), + str(cluster.proteome_count), + str(attribute), + str(cluster.cluster_type_by_attribute[attribute]), + ] + if ( + self.clusterCollection.fastas_parsed + and cluster.protein_length_stats + ): + cluster_metrics_line.extend( + [ + str(cluster.protein_length_stats.get("mean", "N/A")), + str(cluster.protein_length_stats.get("sd", "N/A")), + ] + ) + else: + cluster_metrics_line.extend(["N/A", "N/A"]) + + cluster_metrics_line.extend( + str( + sum( + cluster.protein_counts_of_proteomes_by_level_by_attribute[ + attribute + ][_level] + ) + ) + for _level in levels + ) + + if attribute != "taxon": + cluster_metrics_line.extend( + [ + str( + median( + cluster.protein_counts_of_proteomes_by_level_by_attribute[ + attribute + ][ + _level + ] + ) + ) + for _level in levels + ] + ) + cluster_metrics_line.extend( + [ + "{0:.2f}".format( + cluster.proteome_coverage_by_level_by_attribute[ + attribute + ][_level] + ) + for _level in levels + ] + ) + + cluster_metrics_output.append("\t".join(cluster_metrics_line)) + + if cluster_metrics_output: + with open(cluster_metrics_f, "w") as cluster_metrics_fh: + logger.info(f"[STATUS] - Writing {cluster_metrics_f}") + cluster_metrics_output.sort() + header_line = self.__get_header_line("cluster_metrics", attribute) + cluster_metrics_output.insert(0, header_line) + cluster_metrics_fh.write("\n".join(cluster_metrics_output) + "\n") + cluster_metrics_output = [] + + # 7. Write cluster ALO metrics + def __get_enrichment_data(self, ALO: AttributeLevel, cluster: Cluster) -> List[str]: + """ + Retrieve enrichment data for a given AttributeLevel and Cluster. + + Args: + ALO (AttributeLevel): An instance of AttributeLevel containing enrichment data. + cluster (Cluster): An instance of Cluster for which enrichment data is retrieved. + + Returns: + List[str]: A list containing enrichment data: + - Enrichment status ("enriched", "depleted", "equal" or "N/A" if unavailable) + - Log2 mean value + - p-value + + """ + if ( + ALO + and ALO.cluster_type_by_cluster_id[cluster.cluster_id] == "shared" + and ALO.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id] + ): + log2_mean = ALO.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id] + enrichment = ( + "enriched" + if log2_mean > 0 + else "depleted" if log2_mean < 0 else "equal" + ) + return [ + enrichment, + f"{log2_mean}", + f"{ALO.cluster_mwu_pvalue_by_cluster_id[cluster.cluster_id]}", + ] + return ["N/A", "N/A", "N/A"] + + def __get_proteome_data(self, ALO: AttributeLevel, cluster: Cluster) -> List[str]: + """ + Retrieve proteome data for a given AttributeLevel and Cluster. + + Args: + ALO (AttributeLevel): An instance of AttributeLevel containing proteome data. + cluster (Cluster): An instance of Cluster for which proteome data is retrieved. 
+ + Returns: + List[str]: A list containing proteome data: + - Number of proteomes present in both ALO and cluster + - Number of proteomes present only in cluster + - Sorted list of proteome IDs present in both ALO and cluster, or "N/A" if none + - Sorted list of proteome IDs present only in cluster, or "N/A" if none + + """ + ALO_proteomes_present = cluster.proteome_ids.intersection( + ALO.proteomes if ALO else set() + ) + non_ALO_proteomes_present = cluster.proteome_ids.difference( + ALO.proteomes if ALO else set() + ) + return [ + f"{len(ALO_proteomes_present)}", + f"{len(non_ALO_proteomes_present)}", + ( + f"{','.join(sorted(list(ALO_proteomes_present)))}" + if ALO_proteomes_present + else "N/A" + ), + ( + f"{','.join(sorted(list(non_ALO_proteomes_present)))}" + if non_ALO_proteomes_present + else "N/A" + ), + ] + + def __write_cluster_metrics_ALO(self) -> None: + """ + Write cluster metrics for each attribute level object (ALO) to separate files. + + For each attribute in self.aloCollection.attributes, this method writes cluster metrics + to a file named '{attribute}.{level}.cluster_metrics.txt' in the corresponding directory + under self.dirs[attribute]. + + Metrics include cluster ID, status, type, protein count, proteome count, counts by level, + mean ALO counts, mean non-ALO counts, enrichment data, and proteome coverage. + + Returns: + None + """ + for attribute in self.aloCollection.attributes: + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + + for level in levels: + ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][level] + cluster_metrics_ALO_f = os.path.join( + self.dirs[attribute], f"{attribute}.{level}.cluster_metrics.txt" + ) + if ALO is None: + continue + cluster_metrics_ALO_output = [ + "\t".join( + [ + f"{cluster.cluster_id}", + ( + f"{ALO.cluster_status_by_cluster_id[cluster.cluster_id]}" + if ALO + else "N/A" + ), + ( + f"{ALO.cluster_type_by_cluster_id[cluster.cluster_id]}" + if ALO + else "N/A" + ), + f"{cluster.protein_count}", + f"{cluster.proteome_count}", + f"{sum(cluster.protein_counts_of_proteomes_by_level_by_attribute[attribute][level])}", + ( + f"{ALO.cluster_mean_ALO_count_by_cluster_id[cluster.cluster_id]}" + if ALO + and ALO.cluster_mean_ALO_count_by_cluster_id[ + cluster.cluster_id + ] + else "N/A" + ), + ( + f"{ALO.cluster_mean_non_ALO_count_by_cluster_id[cluster.cluster_id]}" + if ALO + and ALO.cluster_mean_non_ALO_count_by_cluster_id[ + cluster.cluster_id + ] + else "N/A" + ), + *self.__get_enrichment_data(ALO, cluster), + "{0:.2f}".format( + cluster.proteome_coverage_by_level_by_attribute[ + attribute + ][level] + ), + *self.__get_proteome_data(ALO, cluster), + ] + ) + for cluster in self.clusterCollection.cluster_list + ] + if cluster_metrics_ALO_output: + with open(cluster_metrics_ALO_f, "w") as cluster_metrics_ALO_fh: + logger.info(f"[STATUS] - Writing {cluster_metrics_ALO_f}") + cluster_metrics_ALO_output.sort() + + header_line = self.__get_header_line( + "cluster_metrics_ALO", attribute + ) + cluster_metrics_ALO_output.insert(0, header_line) + cluster_metrics_ALO_fh.write( + "\n".join(cluster_metrics_ALO_output) + "\n" + ) + + # 8. write cluster 1to1 ALO + def __write_cluster_1to1_ALO(self) -> None: + """ + Write cluster 1-to-1 relationships for each attribute level object (ALO) to separate files. 
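+
+        Illustrative output line (tab-separated; IDs and values are made up):
+
+            OG0000123   shared  fuzzy   10  0.80
+
+        where the last column is the fraction of proteomes at the target
+        ("fuzzy") protein count.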
+ + For each attribute in self.aloCollection.attributes, this method writes cluster 1-to-1 + relationships to a file named '{attribute}.{level}.cluster_1to1s.txt' in the corresponding + directory under self.dirs[attribute]. + + Relationships include cluster ID, type, cardinality, proteome count, and fuzzy count ratio. + + Returns: + None + """ + for attribute in self.aloCollection.attributes: + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + for level in levels: + cluster_1to1_ALO_f = os.path.join( + self.dirs[attribute], f"{attribute}.{level}.cluster_1to1s.txt" + ) + cluster_1to1_ALO_output = [] + + ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][level] + + if attribute != "taxon" and ALO: + for ( + cluster_type + ) in ALO.clusters_by_cluster_cardinality_by_cluster_type: + for ( + cluster_cardinality + ) in ALO.clusters_by_cluster_cardinality_by_cluster_type[ + cluster_type + ]: + for ( + cluster_id + ) in ALO.clusters_by_cluster_cardinality_by_cluster_type[ + cluster_type + ][ + cluster_cardinality + ]: + cluster = ( + self.clusterCollection.cluster_list_by_cluster_id[ + cluster_id + ] + ) + protein_count_by_proteome = ( + cluster.protein_count_by_proteome_id + ) + proteome_count = cluster.proteome_count + + fuzzy_proteome_ratio = ( + len( + [ + protein_count + for _, protein_count in protein_count_by_proteome.items() + if protein_count + == self.inputData.fuzzy_count + ] + ) + / proteome_count + ) + + cluster_1to1_ALO_line = "\t".join( + [ + str(cluster_id), + str(cluster_type), + str(cluster_cardinality), + str(proteome_count), + "{0:.2f}".format(fuzzy_proteome_ratio), + ] + ) + + cluster_1to1_ALO_output.append(cluster_1to1_ALO_line) + + if cluster_1to1_ALO_output: + with open(cluster_1to1_ALO_f, "w") as cluster_1to1_ALO_fh: + logger.info(f"[STATUS] - Writing {cluster_1to1_ALO_f}") + cluster_1to1_ALO_output.sort() + header_line = self.__get_header_line( + "cluster_1to1s_ALO", attribute + ) + cluster_1to1_ALO_output.insert(0, header_line) + cluster_1to1_ALO_fh.write( + "\n".join(cluster_1to1_ALO_output) + "\n" + ) + cluster_1to1_ALO_output = [] + + # 9. write_pairwise_representation + def __process_background_representation( + self, + attribute: str, + level: str, + ALO: AttributeLevel, + cluster: Cluster, + background_representation_test_by_pair_by_attribute, + ) -> None: + """ + Process and append background representation test results for a cluster and attribute level. + + Args: + attribute (str): The attribute name. + level (str): The attribute level. + ALO (AttributeLevel): The AttributeLevel object for the attribute and level. + cluster (Cluster): The Cluster object representing the cluster. + background_representation_test_by_pair_by_attribute (Dict[str, Dict[str, Any]]): + A nested dictionary to store background representation test results, + structured as [attribute][background_pair] = list of test results. 
+ + Returns: + None + """ + background_pair = (level, "background") + if attribute not in background_representation_test_by_pair_by_attribute: + background_representation_test_by_pair_by_attribute[attribute] = {} + if ( + background_pair + not in background_representation_test_by_pair_by_attribute[attribute] + ): + background_representation_test_by_pair_by_attribute[attribute][ + background_pair + ] = [] + + background_representation_test = [ + cluster.cluster_id, + level, + "background", + ALO.cluster_mean_ALO_count_by_cluster_id[cluster.cluster_id], + ALO.cluster_mean_non_ALO_count_by_cluster_id[cluster.cluster_id], + ALO.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id], + ALO.cluster_mwu_pvalue_by_cluster_id[cluster.cluster_id], + ] + background_representation_test_by_pair_by_attribute[attribute][ + background_pair + ].append(background_representation_test) + + def __get_pairwise_representation_test( + self, + cluster: Cluster, + attribute: str, + level: str, + levels_seen: Set[str], + levels: List[str], + ) -> Generator[List[Any], None, None]: + """ + Generate pairwise representation test results for a cluster and attribute level. + + Args: + cluster (Cluster): The Cluster object representing the cluster. + attribute (str): The attribute name. + level (str): The current attribute level. + levels_seen (Set[str]): A set of attribute levels already processed. + levels (List[str]): A list of all attribute levels. + + Yields: + Generator[List[Any], None, None]: A generator yielding lists containing pairwise representation test results. + Each list includes: + - cluster.cluster_id: ID of the cluster. + - level: Current attribute level. + - other_level: Another attribute level being compared with `level`. + - mean_ALO_count: Mean count of ALOs in the cluster at `level`. + - mean_non_ALO_count: Mean count of non-ALOs in the cluster at `level`. + - mwu_log2_mean: Log2 mean of the Mann-Whitney U test results between `level` and `other_level`. + - mwu_pvalue: P-value of the Mann-Whitney U test results between `level` and `other_level`. 
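+
+            An illustrative yielded row (all values made up):
+            ["OG0000001", "levelA", "levelB", 2.0, 1.0, 1.0, 0.03]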
+ """ + for other_level in set(levels).difference(levels_seen): + if other_level != level: + other_ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][ + other_level + ] + if ( + other_ALO + and len(cluster.proteome_ids.intersection(other_ALO.proteomes)) >= 2 + ): + protein_counts_level = [ + count + for count in cluster.protein_counts_of_proteomes_by_level_by_attribute[ + attribute + ][ + level + ] + if count > 0 + ] + protein_counts_other_level = [ + count + for count in cluster.protein_counts_of_proteomes_by_level_by_attribute[ + attribute + ][ + other_level + ] + if count > 0 + ] + if protein_counts_level and protein_counts_other_level: + ( + mwu_pvalue, + mwu_log2_mean, + mean_ALO_count, + mean_non_ALO_count, + ) = statistic( + protein_counts_level, + protein_counts_other_level, + self.inputData.test, + self.inputData.min_proteomes, + ) + yield [ + cluster.cluster_id, + level, + other_level, + mean_ALO_count, + mean_non_ALO_count, + mwu_log2_mean, + mwu_pvalue, + ] + # pvalue = None + # try: + # pvalue = scipy.stats.mannwhitneyu(protein_counts_level, protein_counts_other_level, alternative="two-sided")[1] + # except: + # pvalue = 1.0 + # mean_level = mean(protein_counts_level) + # mean_other_level = mean(protein_counts_other_level) + # log2fc_mean = log((mean_level/mean_other_level), 2) + # yield [cluster.cluster_id, level, other_level, mean_level, + # mean_other_level, log2fc_mean, pvalue] + + def __process_pairwise_representation( + self, + attribute: str, + level: str, + levels_seen: Set[str], + levels: List[str], + cluster: Cluster, + pairwise_representation_test_by_pair_by_attribute, + pairwise_representation_test_output: List[str], + ) -> None: + """ + Process pairwise representation tests for a specific attribute level and cluster. + + Args: + attribute (str): The attribute name. + level (str): The current attribute level. + levels_seen (Set[str]): A set of attribute levels already processed. + levels (List[str]): A list of all attribute levels. + cluster (Cluster): The Cluster object representing the cluster. + pairwise_representation_test_by_pair_by_attribute (Dict[str, Dict[Tuple[str, str], List[List[Any]]]]): + Dictionary storing pairwise representation test results by attribute and pair of levels. + pairwise_representation_test_output (List[str]): List to store formatted output lines of pairwise tests. + + Returns: + None + """ + for result in self.__get_pairwise_representation_test( + cluster, attribute, level, levels_seen, levels + ): + if attribute not in pairwise_representation_test_by_pair_by_attribute: + pairwise_representation_test_by_pair_by_attribute[attribute] = {} + pair = (result[1], result[2]) + if pair not in pairwise_representation_test_by_pair_by_attribute[attribute]: + pairwise_representation_test_by_pair_by_attribute[attribute][pair] = [] + pairwise_representation_test_by_pair_by_attribute[attribute][pair].append( + result + ) + + pairwise_representation_test_output.append( + f"{result[0]}\t{result[1]}\t{result[3]}\t{result[2]}\t{result[4]}\t{result[5]}\t{result[6]}" + ) + + # 9.5 __plot_count_comparisons_volcano + def __prepare_data(self, pair_data: List[str]) -> Tuple[List[float], List[float]]: + """ + Prepare data from pair_data into lists of p-values and log2 fold change (log2fc) values. + + Args: + pair_data (List[str]): List of strings containing data for each pair. + + Returns: + Tuple[List[float], List[float]]: Tuple containing: + - List[float]: p-values extracted from pair_data. 
+ - List[float]: log2 fold change (log2fc) values extracted from pair_data. + """ + pair_data_count = len(pair_data) + p_values: List[float] = [] + log2fc_values: List[float] = [] + + for data in pair_data: + log2fc_values.append(float(data[5])) + pvalue = data[6] if data[6] != 0.0 else 0.01 / (pair_data_count + 1) + p_values.append(float(pvalue)) + + return p_values, log2fc_values + + def __get_output_filename(self, attribute: str, pair_list: List[str]) -> str: + """ + Generate an output filename based on attribute, pair_list, and plot_format. + + Args: + attribute (str): Attribute name used in the filename. + pair_list (List[str]): List of strings used to form part of the filename. + + Returns: + str: Generated output filename. + """ + return os.path.join( + self.dirs[attribute], + f"{attribute}.pairwise_representation_test.{'_'.join(pair_list)}.{self.inputData.plot_format}", + ) + + def __create_volcano_plot( + self, + p_values: List[float], + log2fc_values: List[float], + pair_list: List[str], + output_file: str, + ) -> None: + """ + Create a volcano plot to visualize differential expression analysis results. + + Parameters: + - p_values (List[float]): List of p-values for each comparison. + - log2fc_values (List[float]): List of log2 fold change values for each comparison. + - pair_list (List[str]): List of pairs or labels corresponding to each comparison. + - output_file (str): Filepath where the plot will be saved. + + Returns: + - None + """ + plt.figure(1, figsize=self.inputData.plotsize) + + axScatter, axHistx = self.__setup_plot_axes() + + p_array = np.array(p_values) + log2fc_array = np.array(log2fc_values) + + log2fc_percentile = self.__plot_data(axScatter, axHistx, log2fc_array, p_array) + self.__set_plot_properties( + axScatter, axHistx, log2fc_array, p_array, pair_list, log2fc_percentile + ) + + logger.info(f"[STATUS] - Plotting {output_file}") + plt.savefig(output_file, format=self.inputData.plot_format) + plt.close() + + def __setup_plot_axes(self) -> Tuple[Any, Any]: + """ + Set up the axes for a combined scatter plot and histogram. + + Returns: + - Tuple of matplotlib.axes.Axes: Tuple containing the scatter plot axes (`axScatter`) + and the histogram axes (`axHistx`). + """ + left, width = 0.1, 0.65 + bottom, height = 0.1, 0.65 + bottom_h = left + width + 0.02 + rect_scatter = (left, bottom, width, height) + rect_histx = (left, bottom_h, width, 0.2) + + axScatter = plt.axes(rect_scatter) + axScatter.set_facecolor("white") + axHistx = plt.axes(rect_histx) + axHistx.set_facecolor("white") + axHistx.xaxis.set_major_formatter(NullFormatter()) + axHistx.yaxis.set_major_formatter(NullFormatter()) + + return axScatter, axHistx + + def __plot_data( + self, + axScatter: Any, + axHistx: Any, + log2fc_array: np.ndarray, + p_array: np.ndarray, + ) -> Any: + """ + Plot data on scatter and histogram axes. + + Parameters: + - axScatter (Any): Axes for the scatter plot. + - axHistx (Any): Axes for the histogram plot. + - log2fc_array (np.ndarray): Array of log2 fold change values. + - p_array (np.ndarray): Array of p-values. + + Returns: + - float: 95th percentile of log2 fold change values. 
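+
+        Note: the dashed reference lines added to the scatter are p = 0.05 and
+        p = 0.01 (horizontal) and |log2FC| = 1 plus the 95th percentile of the
+        log2FC values, mirrored around zero (vertical).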
+ """ + # Plot histogram + binwidth = 0.05 + xymax = np.max(np.fabs(log2fc_array)) # type: ignore + lim = (int(xymax / binwidth) + 1) * binwidth + bins = np.arange(-lim, lim + binwidth, binwidth) + axHistx.hist( + log2fc_array, bins=bins, histtype="stepfilled", color="grey", align="mid" + ) + + # Plot scatter + axScatter.scatter( + log2fc_array, p_array, alpha=0.8, edgecolors="none", s=25, c="grey" + ) + + # Add reference lines + ooFive, ooOne = 0.05, 0.01 + log2fc_percentile = np.percentile(log2fc_array, 95) + + axScatter.axhline(y=ooFive, linewidth=2, color="orange", linestyle="--") + axScatter.axhline(y=ooOne, linewidth=2, color="red", linestyle="--") + axScatter.axvline(x=1.0, linewidth=2, color="purple", linestyle="--") + axScatter.axvline( + x=log2fc_percentile, linewidth=2, color="blue", linestyle="--" + ) + axScatter.axvline(x=-1.0, linewidth=2, color="purple", linestyle="--") + axScatter.axvline( + x=-log2fc_percentile, linewidth=2, color="blue", linestyle="--" + ) + + return log2fc_percentile + + def __set_plot_properties( + self, + axScatter: Any, + axHistx: Any, + log2fc_array: np.ndarray, + p_array: np.ndarray, + pair_list: List[str], + log2fc_percentile: Any, + ) -> None: + """ + Set properties and customize the appearance of the volcano plot. + + Parameters: + - axScatter (Any): Axes for the scatter plot. + - axHistx (Any): Axes for the histogram plot. + - log2fc_array (np.ndarray): Array of log2 fold change values. + - p_array (np.ndarray): Array of p-values. + - pair_list (List[str]): List of pairs or labels corresponding to each comparison. + - log2fc_percentile (Any): 95th percentile of log2 fold change values. + + Returns: + - None + """ + # Set axis limits and properties + x_min = -max(abs(np.min(log2fc_array)), abs(np.max(log2fc_array))) + x_max = -x_min + axScatter.set_xlim(x_min - 1, x_max + 1) + axScatter.grid(True, linewidth=1, which="major", color="lightgrey") + axScatter.grid(True, linewidth=0.5, which="minor", color="lightgrey") + axScatter.set_ylim(1.1, np.min(p_array) * 0.1) + axScatter.set_xlabel( + f"log2(mean({pair_list[0]})/mean({pair_list[1]}))", + fontsize=self.inputData.fontsize, + ) + axScatter.set_ylabel("p-value", fontsize=self.inputData.fontsize) + axScatter.set_yscale("log") + axHistx.set_xlim(axScatter.get_xlim()) + + # Add legend + legend_elements = [ + Line2D([0], [0], color="orange", linestyle="--", label="p-value = 0.05"), + Line2D([0], [0], color="red", linestyle="--", label="p-value = 0.01"), + Line2D([0], [0], color="purple", linestyle="--", label="|log2FC| = 1"), + Line2D( + [0], + [0], + color="blue", + linestyle="--", + label=f"|log2FC-95%ile| = {log2fc_percentile:.2f}", + ), + ] + legend = axScatter.legend( + handles=legend_elements, fontsize=self.inputData.fontsize, frameon=True + ) + legend.get_frame().set_facecolor("white") + + def __plot_count_comparisons_volcano( + self, + pairwise_representation_test_by_pair_by_attribute, + ) -> None: + """ + Generate volcano plots for count comparisons based on pairwise representation test results. + + Parameters: + - pairwise_representation_test_by_pair_by_attribute (Dict[str, Dict[Tuple[str, str], Any]]): + Dictionary containing test results organized by attribute and pair. 
+ + Returns: + - None + """ + for attribute in pairwise_representation_test_by_pair_by_attribute: + for pair in pairwise_representation_test_by_pair_by_attribute[attribute]: + pair_list = list(pair) + pair_data = pairwise_representation_test_by_pair_by_attribute[ + attribute + ][pair] + + p_values, log2fc_values = self.__prepare_data(pair_data) + + if p_values: + output_file = self.__get_output_filename(attribute, pair_list) + self.__create_volcano_plot( + p_values, log2fc_values, pair_list, output_file + ) + + def __write_pairwise_representation(self) -> None: + """ + Process pairwise representation tests, write results, and generate volcano plots. + + Iterates through attributes in `self.aloCollection.attributes` and performs the + following steps for each attribute: + 1. Initializes dictionaries `pairwise_representation_test_by_pair_by_attribute` + and `background_representation_test_by_pair_by_attribute`. + 2. Prepares output file path (`pairwise_representation_test_f`) and header line + (`pairwise_representation_test_output`) for pairwise representation test results. + 3. Retrieves sorted levels from `self.aloCollection.ALO_by_level_by_attribute[attribute]`. + 4. Iterates through each level and processes pairwise and background representation + tests for each cluster in `self.clusterCollection.cluster_list`. + 5. Generates volcano plots using `__plot_count_comparisons_volcano` for + `background_representation_test_by_pair_by_attribute` if available. + 6. Writes pairwise representation test results to `pairwise_representation_test_f` + if data is available. + 7. Generates volcano plots using `__plot_count_comparisons_volcano` for + `pairwise_representation_test_by_pair_by_attribute` if data is available. + + Returns: + - None + """ + for attribute in self.aloCollection.attributes: + pairwise_representation_test_by_pair_by_attribute: Dict[ + str, Dict[str, str] + ] = {} + background_representation_test_by_pair_by_attribute = {} + pairwise_representation_test_output = [] + pairwise_representation_test_f = os.path.join( + self.dirs[attribute], f"{attribute}.pairwise_representation_test.txt" + ) + levels = sorted( + list(self.aloCollection.ALO_by_level_by_attribute[attribute]) + ) + levels_seen: Set[str] = set() + + for level in levels: + ALO = self.aloCollection.ALO_by_level_by_attribute[attribute][level] + + for cluster in self.clusterCollection.cluster_list: + if ( + ALO + and ALO.cluster_type_by_cluster_id[cluster.cluster_id] + == "shared" + and ALO.cluster_mwu_log2_mean_by_cluster_id[cluster.cluster_id] + ): + self.__process_background_representation( + attribute, + level, + ALO, + cluster, + background_representation_test_by_pair_by_attribute, + ) + + ALO_proteomes_present = cluster.proteome_ids.intersection( + ALO.proteomes if ALO else set("") + ) + + if ( + len(levels) > 1 + and len(ALO_proteomes_present) >= self.inputData.min_proteomes + ): + self.__process_pairwise_representation( + attribute, + level, + levels_seen, + levels, + cluster, + pairwise_representation_test_by_pair_by_attribute, + pairwise_representation_test_output, + ) + + levels_seen.add(level) + + if background_representation_test_by_pair_by_attribute: + self.__plot_count_comparisons_volcano( + background_representation_test_by_pair_by_attribute + ) + + if pairwise_representation_test_output: + with open( + pairwise_representation_test_f, "w" + ) as pairwise_representation_test_fh: + logger.info(f"[STATUS] - Writing {pairwise_representation_test_f}") + pairwise_representation_test_output.sort() + header_line = 
self.__get_header_line( + "pairwise_representation_test", attribute + ) + pairwise_representation_test_output.insert(0, header_line) + pairwise_representation_test_fh.write( + "\n".join(pairwise_representation_test_output) + "\n" + ) + + if pairwise_representation_test_by_pair_by_attribute: + self.__plot_count_comparisons_volcano( + pairwise_representation_test_by_pair_by_attribute + ) diff --git a/src/core/input.py b/src/core/input.py new file mode 100644 index 0000000..1ab4636 --- /dev/null +++ b/src/core/input.py @@ -0,0 +1,77 @@ +import os +from typing import List, Optional, Set, Tuple + + +class ServeArgs: + def __init__(self, port: int = 8000): + self.port = port + + +class InputData: + def __init__( + self, + nodesdb_f: str, + pfam_mapping_f: str, + ipr_mapping_f: str, + go_mapping_f: str, + cluster_file: str, + config_f: str, + sequence_ids_file: str, + species_ids_file: Optional[str] = None, + functional_annotation_f: Optional[str] = None, + fasta_dir: Optional[str] = None, + tree_file: Optional[str] = None, + output_path: Optional[str] = None, + infer_singletons: Optional[bool] = False, + plot_tree: bool = False, + min_proteomes: int = 2, + test: str = "mannwhitneyu", + taxranks: List[str] = None, + repetitions: int = 30, + fuzzy_count: int = 1, + fuzzy_fraction: float = 0.75, + fuzzy_range: Set[int] = {x for x in range(20 + 1) if x != 1}, + fontsize: int = 18, + plotsize: Tuple[float, float] = (24, 12), + plot_format: str = "pdf", + taxon_idx_mapping_file: Optional[str] = None, + ) -> None: + if taxranks is None: + taxranks = ["phylum", "order", "genus"] + if output_path: + if not os.path.isabs(output_path): + output_path = os.path.abspath(output_path) + else: + output_path = os.path.join(os.getcwd(), "kinfin_results") + + self.cluster_f = cluster_file + self.config_f = config_f + self.sequence_ids_f = sequence_ids_file + self.species_ids_f = species_ids_file + self.tree_f = tree_file + self.functional_annotation_f = functional_annotation_f + if config_f.endswith(".json") and not taxon_idx_mapping_file: + raise ValueError("[ERROR] - taxon_idx_mapping not present") + self.taxon_idx_mapping_file = taxon_idx_mapping_file + self.nodesdb_f = nodesdb_f + self.pfam_mapping_f = pfam_mapping_f + self.ipr_mapping_f = ipr_mapping_f + self.go_mapping_f = go_mapping_f + + self.test = test + self.plot_tree = plot_tree + self.fasta_dir = fasta_dir + self.output_path = output_path + self.infer_singletons = infer_singletons + self.fuzzy_count = fuzzy_count + self.fuzzy_fraction = fuzzy_fraction + self.fuzzy_range = fuzzy_range + self.repetitions = repetitions + self.min_proteomes = min_proteomes + self.plot_format = plot_format + self.fontsize = fontsize + self.taxranks = taxranks + self.plotsize = plotsize + + self.pfam_mapping = True + self.ipr_mapping = True diff --git a/src/core/logger.py b/src/core/logger.py new file mode 100644 index 0000000..322e0af --- /dev/null +++ b/src/core/logger.py @@ -0,0 +1,29 @@ +import logging +import os + + +def setup_logger(log_path: str) -> logging.Logger: + """ + Sets up a logger that logs messages to both the console and a file. + + Args: + log_path (str): Path to the log file. + + Returns: + logging.Logger: Configured logger instance. 
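+
+    Example (illustrative log path):
+        >>> logger = setup_logger("results/kinfin.log")
+        >>> logger.info("starting analysis")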
+ """ + os.makedirs(os.path.dirname(log_path), exist_ok=True) + + logger = logging.getLogger("kinfin_logger") + logger.setLevel(logging.DEBUG) + + console_handler = logging.StreamHandler() + formatter = logging.Formatter("%(asctime)s - %(message)s", "%Y-%m-%d %H:%M:%S") + console_handler.setFormatter(formatter) + logger.addHandler(console_handler) + + file_handler = logging.FileHandler(log_path, mode="w") + file_handler.setFormatter(formatter) + logger.addHandler(file_handler) + + return logger diff --git a/src/core/logic.py b/src/core/logic.py new file mode 100644 index 0000000..84adbb1 --- /dev/null +++ b/src/core/logic.py @@ -0,0 +1,490 @@ +import logging +import os +from collections import defaultdict +from typing import DefaultDict, Dict, List, Literal, Optional, Set, Tuple + +import ete3 +from ete3 import Tree, TreeNode + +from core.utils import progress, read_fasta_len, yield_config_lines, yield_file_lines + +logger = logging.getLogger("kinfin_logger") + + +# common +def parse_nodesdb(filepath: str) -> Dict[str, Dict[str, str]]: + """ + Parses the nodes database file. + + Args: + filepath (str): The path to the nodes database file. + + Returns: + Dict[str, Dict[str, str]]: A dictionary containing node information. + Keys are node identifiers, and values are dictionaries with keys: + + - 'rank': The rank of the node. + - 'name': The name of the node. + - 'parent': The parent of the node. + """ + logger.info(f"[STATUS] - Parsing nodesDB {filepath}") + + nodesdb: Dict[str, Dict[str, str]] = {} + nodesdb_count = 0 + nodes_count = 0 + + for line in yield_file_lines(filepath): + if line.startswith("#"): + nodesdb_count = int(line.lstrip("# nodes_count = ").rstrip("\n")) + elif line.strip(): + nodes_count += 1 + try: + node, rank, name, parent = line.rstrip("\n").split("\t") + nodesdb[node] = {"rank": rank, "name": name, "parent": parent} + except Exception: + pass + if nodesdb_count: + progress(nodes_count, 1000, nodesdb_count) + return nodesdb + + +# cli +def get_lineage( + taxid: str, + nodesdb: Dict[str, Dict[str, str]], + taxranks: List[str], +) -> Dict[str, str]: + """ + Get the lineage of a taxonomic identifier. + + Args: + taxid (str): The taxonomic identifier. + nodesdb (Dict[str, Dict[str, str]]): A dictionary containing information about nodes. + taxranks (List[str]): A list of taxonomic ranks to include in the lineage. + + Returns: + Dict[str, str]: A dictionary containing the lineage information, with taxonomic ranks as keys + and corresponding names as values. + """ + lineage = {taxrank: "undef" for taxrank in taxranks} + parent = "" + node = taxid + while parent != "1": + taxrank = nodesdb[node]["rank"] + parent = nodesdb[node]["parent"] + if taxrank in taxranks: + name = nodesdb[node]["name"] + lineage[taxrank] = name + node = parent + return lineage + + +# cli +def parse_attributes_from_config_data( + config_f: str, + taxon_idx_mapping_file: Optional[str], +) -> Tuple[Set[str], Dict[str, str], List[str], Dict[str, Dict[str, str]]]: + """ + Parses attributes from a configuration file. + + Args: + config_f (str): The path to the configuration file. + + Returns: + Tuple[Set[str], Dict[str, str], List[str], Dict[str, Dict[str, str]]]: A tuple containing: + - A set of proteome IDs. + - A dictionary mapping species IDs to proteome IDs. + - A list of attributes. + - A dictionary mapping proteome IDs to dictionaries, where each inner dictionary + maps attributes to their corresponding levels. + + Raises: + FileNotFoundError: If the specified configuration file is not found. 
+ ValueError: If there are errors in the configuration file format or content. + + Note: + - The configuration file is expected to have a header line starting with '#', + where the first element is 'IDX' and the second element is 'TAXON'. + - Each subsequent non-empty line in the configuration file should contain + comma-separated values corresponding to the attributes defined in the header line. + - The 'TAXON' attribute is expected to be unique for each line. + """ + + logger.info("[STATUS] - Parsing config data ...") + attributes: List[str] = [] + level_by_attribute_by_proteome_id: Dict[str, Dict[str, str]] = {} + proteomes: Set[str] = set() + proteome_id_by_species_id: Dict[str, str] = {} + + for line in yield_config_lines(config_f, taxon_idx_mapping_file): + if line.startswith("#"): + if not attributes: + attributes = [x.strip() for x in line.lstrip("#").split(",")] + if attributes[0] != "IDX" or attributes[1] != "taxon": + error_msg = f"[ERROR] - First/second element have to be IDX/TAXON.\n\t{attributes}" + raise ValueError(error_msg) + elif line.strip(): + temp = line.split(",") + + if len(temp) != len(attributes): + error_msg = f"[ERROR] - number of columns in line differs from header\n\t{attributes}\n\t{temp}" + raise ValueError(error_msg) + + if temp[1] in proteomes: + error_msg = f"[ERROR] - 'TAXON' should be unique. {temp[0]} was encountered multiple times" # fmt:skip + raise ValueError(error_msg) + + species_id = temp[0] + proteome_id = temp[1] + proteomes.add(proteome_id) + proteome_id_by_species_id[species_id] = proteome_id + + level_by_attribute_by_proteome_id[proteome_id] = dict(zip(attributes, temp)) + level_by_attribute_by_proteome_id[proteome_id]["all"] = "all" + attributes.insert(0, "all") # append to front + return ( + proteomes, + proteome_id_by_species_id, + attributes, + level_by_attribute_by_proteome_id, + ) + + +# common +def add_taxid_attributes( + nodesdb_f: str, + taxranks: List[str], + attributes: List[str], + level_by_attribute_by_proteome_id: Dict[str, Dict[str, str]], +) -> Tuple[List[str], Dict[str, Dict[str, str]]]: + """ + Adds taxonomic attributes to the dictionary of attributes indexed by proteome ID. + + Parameters: + + - nodesdb_f (str): File path to the nodes database. + - taxranks (List[str]): List of taxonomic ranks to be included as attributes. + - attributes (List[str]): List of existing attributes. + - level_by_attribute_by_proteome_id (Dict[str, Dict[str, str]]): Dictionary where keys + are proteome IDs and values are dictionaries of attributes for each proteome ID, + including at least the "TAXID" attribute. + + Returns: + Tuple[List[str], Dict[str, Dict[str, str]]]: A tuple containing: + + - Updated list of attributes with taxonomic ranks added and "TAXID" removed. + - Updated dictionary of attributes indexed by proteome ID, with taxonomic attributes added and "TAXID" removed. 
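+
+    For example (illustrative; the actual names are taken from the nodesdb file),
+    a proteome with "TAXID" = "6239" would gain the levels phylum="Nematoda",
+    order="Rhabditida" and genus="Caenorhabditis", and its "TAXID" entry would
+    be removed.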
+    """
+    NODESDB = parse_nodesdb(nodesdb_f)
+    for proteome_id in level_by_attribute_by_proteome_id:
+        taxid = level_by_attribute_by_proteome_id[proteome_id]["TAXID"]
+        lineage = get_lineage(taxid=taxid, nodesdb=NODESDB, taxranks=taxranks)
+
+        # add lineage attribute/levels
+        for taxrank in taxranks:
+            level_by_attribute_by_proteome_id[proteome_id][taxrank] = lineage[taxrank]
+
+        # remove taxid-levels
+        del level_by_attribute_by_proteome_id[proteome_id]["TAXID"]
+
+    # remove taxid-attribute
+    attributes.remove("TAXID")
+
+    # add taxranks to attributes
+    attributes.extend(iter(taxranks))
+    return attributes, level_by_attribute_by_proteome_id
+
+
+# cli
+def parse_tree_from_file(
+    tree_f: Optional[str],
+    attributes: List[str],
+    level_by_attribute_by_proteome_id: Dict[str, Dict[str, str]],
+    proteomes: Set[str],
+) -> Tuple[Optional[Tree], Optional[Dict[frozenset[str], str]]]:
+    """
+    Parse a phylogenetic tree from a Newick (nwk) file and set the specified outgroups.
+
+    Args:
+        tree_f (Optional[str]): Path to the Newick tree file (or None if no tree is used).
+        attributes (List[str]): Attributes parsed from the config data; must contain "OUT".
+        level_by_attribute_by_proteome_id (Dict[str, Dict[str, str]]): Attribute levels per proteome ID.
+        proteomes (Set[str]): Set of proteome IDs.
+
+    Returns:
+        Tuple[Optional[Tree], Optional[Dict[frozenset[str], str]]]: The parsed (and rooted)
+        phylogenetic tree and a dictionary mapping frozensets of proteome IDs to node names.
+    """
+    if not tree_f:
+        return None, None
+    outgroups: List[str] = []
+    if "OUT" not in attributes:
+        error_msg = "[ERROR] - Please specify one or more outgroup taxa"
+        raise ValueError(error_msg)
+    outgroups = [
+        proteome_id
+        for proteome_id in proteomes
+        if level_by_attribute_by_proteome_id[proteome_id]["OUT"] == "1"
+    ]
+    logger.info(f"[STATUS] - Parsing Tree file : {tree_f} ...")
+    tree_ete: TreeNode = ete3.Tree(tree_f)
+    if len(outgroups) > 1:
+        outgroup_node: TreeNode = tree_ete.get_common_ancestor(
+            outgroups
+        )  # type: ignore
+        try:
+            logger.info(
+                f"[STATUS] - Setting LCA of {', '.join(outgroups)} as outgroup : ..."
+            )
+            tree_ete.set_outgroup(outgroup_node)  # type: ignore
+        except ete3.coretype.tree.TreeError:  # type: ignore
+            logger.info("[STATUS] - Tree seems to be rooted already : ...")
+    else:
+        logger.info(f"[STATUS] - Setting {','.join(outgroups)} as outgroup : ...")
+        tree_ete.set_outgroup(outgroups[0])  # type: ignore
+    logger.info(tree_ete)
+    node_idx_by_proteome_ids: Dict[frozenset[str], str] = {}
+    for idx, node in enumerate(tree_ete.traverse("levelorder")):  # type: ignore
+        proteome_ids = frozenset(leaf.name for leaf in node)
+        if not node.name:
+            node.add_features(
+                name=f"n{idx}",
+                nodetype="node",
+                proteome_ids=proteome_ids,
+                apomorphic_cluster_counts={"singletons": 0, "non_singletons": 0},
+                synapomorphic_cluster_counts={
+                    "complete_presence": 0,
+                    "partial_absence": 0,
+                },
+                synapomorphic_cluster_strings=[],
+                counts={"specific": 0, "shared": 0, "absent": 0, "singleton": 0},
+            )
+        else:
+            node.add_features(
+                nodetype="tip",
+                proteome_ids=proteome_ids,
+                apomorphic_cluster_counts={"singletons": 0, "non_singletons": 0},
+                synapomorphic_cluster_counts={
+                    "complete_presence": 0,
+                    "partial_absence": 0,
+                },
+                synapomorphic_cluster_strings=[],
+                counts={"specific": 0, "shared": 0, "absent": 0, "singleton": 0},
+            )
+        node_idx_by_proteome_ids[proteome_ids] = node.name
+    return tree_ete, node_idx_by_proteome_ids
+
+
+def parse_fasta_dir(species_ids_f: str, fasta_dir: str) -> Dict[str, int]:
+    """
+    Parse a species IDs file to retrieve FASTA file names and then calculate
+    lengths of sequences from the corresponding FASTA files.
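+
+    An illustrative (made-up) species IDs line is "0: speciesA.protein.faa",
+    i.e. an index and a FASTA file name separated by ': '.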
+ + Args: + - species_ids_f (str): Path to the species IDs file, where each line contains + an index and a corresponding FASTA file name separated by ': '. + - fasta_dir (str): Directory path where the FASTA files are located. + + Returns: + - Dict[str, int]: A dictionary mapping header strings (protein IDs) to their + corresponding sequence lengths extracted from the FASTA files. + """ + logger.info("[STATUS] - Parsing FASTAs ...") + fasta_file_by_species_id: Dict[str, str] = {} + + for line in yield_file_lines(species_ids_f): + if not line.startswith("#"): + idx, fasta = line.split(": ") + fasta_file_by_species_id[idx] = fasta + + fasta_len_by_protein_id: Dict[str, int] = {} + for _, fasta_f in list(fasta_file_by_species_id.items()): + fasta_f = os.path.join(fasta_dir, fasta_f) + + for header, length in read_fasta_len(fasta_f): + fasta_len_by_protein_id[header] = length + + return fasta_len_by_protein_id + + +def parse_pfam_mapping(pfam_mapping_f: str) -> Dict[str, str]: + """ + Parse a PFAM mapping file to create a dictionary mapping PFAM domain IDs to their descriptions. + + Args: + - pfam_mapping_f (str): Path to the PFAM mapping file, where each line contains tab-separated values + with the domain ID in the first column and its description in the fifth column. + + Returns: + - Dict[str, str]: A dictionary mapping PFAM domain IDs to their corresponding descriptions. + + Raises: + - ValueError: If conflicting descriptions are found for the same domain ID. + """ + logger.info(f"[STATUS] - Parsing {pfam_mapping_f} ... ") + + pfam_mapping_dict: Dict[str, str] = {} + for line in yield_file_lines(pfam_mapping_f): + temp: List[str] = line.split("\t") + domain_id: str = temp[0] + domain_desc: str = temp[4] + if domain_id not in pfam_mapping_dict: + pfam_mapping_dict[domain_id] = domain_desc + elif domain_desc != pfam_mapping_dict[domain_id]: + error_msg = f"[ERROR] : Conflicting descriptions for {domain_id}" + raise ValueError(error_msg) + + return pfam_mapping_dict + + +def parse_ipr_mapping(ipr_mapping_f: str) -> Dict[str, str]: + """ + Parse an InterPro (IPR) mapping file to create a dictionary mapping InterPro IDs to their descriptions. + + Args: + - ipr_mapping_f (str): Path to the InterPro mapping file, where each line contains an InterPro ID and its description. + Lines starting with "Active_site" are skipped as they are not relevant to mapping. + + Returns: + - Dict[str, str]: A dictionary mapping InterPro IDs to their corresponding descriptions. + + Raises: + - ValueError: If conflicting descriptions are found for the same InterPro ID. + """ + logger.info(f"[STATUS] - Parsing {ipr_mapping_f} ... ") + + ipr_mapping_dict: Dict[str, str] = {} + for line in yield_file_lines(ipr_mapping_f): + if not line.startswith("Active_site"): + temp: List[str] = line.split() + ipr_id: str = temp[0] + ipr_desc: str = " ".join(temp[1:]) + if ipr_id not in ipr_mapping_dict: + ipr_mapping_dict[ipr_id] = ipr_desc + elif ipr_desc != ipr_mapping_dict[ipr_id]: + error_msg = f"[ERROR] : Conflicting descriptions for {ipr_id}" + raise ValueError(error_msg) + return ipr_mapping_dict + + +def parse_go_mapping(go_mapping_f: str) -> Dict[str, str]: + """ + Parse a Gene Ontology (GO) mapping file to create a dictionary mapping GO IDs to their descriptions. + + Args: + - go_mapping_f (str): Path to the GO mapping file, where each line contains a GO ID and its description. + Lines starting with '!' are skipped as they are comments. 
+ + Returns: + - Dict[str, str]: A dictionary mapping GO IDs (without 'GO:' prefix) to their corresponding descriptions. + + Raises: + - ValueError: If conflicting descriptions are found for the same GO ID. + """ + logger.info(f"[STATUS] - Parsing {go_mapping_f} ... ") + go_mapping_dict: Dict[str, str] = {} + for line in yield_file_lines(go_mapping_f): + if not line.startswith("!"): + temp: List[str] = line.replace(" > ", "|").split("|") + go_string: List[str] = temp[1].split(";") + go_desc, go_id = go_string[0].replace("GO:", ""), go_string[1].lstrip(" ") + + if go_id not in go_mapping_dict: + go_mapping_dict[go_id] = go_desc + elif go_desc != go_mapping_dict[go_id]: + error_msg = f"[ERROR] : Conflicting descriptions for {go_id}" + raise ValueError(error_msg) + return go_mapping_dict + + +def compute_protein_ids_by_proteome( + proteomes_by_protein_id: Dict[str, str] +) -> DefaultDict[str, Set[str]]: + """ + Compute protein IDs grouped by proteome IDs. + + Args: + proteomes_by_protein_id (Dict[str, str]): A dictionary mapping protein IDs to proteome IDs. + + Returns: + DefaultDict[str, Set[str]]: A defaultdict where keys are proteome IDs and values are sets + of protein IDs belonging to each proteome ID. + """ + protein_ids_by_proteome_id: DefaultDict[str, Set[str]] = defaultdict(set) + for protein_id, proteome_id in list(proteomes_by_protein_id.items()): + protein_ids_by_proteome_id[proteome_id].add(protein_id) + return protein_ids_by_proteome_id + + +# common +def get_attribute_cluster_type( + singleton, + implicit_protein_ids_by_proteome_id_by_level, +) -> Literal["singleton", "shared", "specific"]: + """ + Determines the type of cluster based on the parameters. + + Parameters: + - singleton: A boolean indicating whether the cluster is a singleton. + - implicit_protein_ids_by_proteome_id_by_level: A dictionary representing protein ids + grouped by proteome id at different levels. + + Returns: + - One of the following strings: + - "singleton": If `singleton` is True. + - "shared": If there are protein ids grouped under multiple proteome ids. + - "specific": If there is only one proteome id with protein ids. + + """ + if singleton: + return "singleton" + if len(implicit_protein_ids_by_proteome_id_by_level) > 1: + return "shared" + else: + return "specific" + + +def get_ALO_cluster_cardinality( + ALO_proteome_counts_in_cluster: List[int], + fuzzy_range: Set[int], + fuzzy_count: int = 1, + fuzzy_fraction: float = 0.75, +) -> Optional[str]: + """ + Determine the cardinality type of a cluster based on ALO proteome counts. + + Args: + ALO_proteome_counts_in_cluster (List[int]): List of ALO proteome counts in the cluster. + fuzzy_range (Set[int]): Set of integers representing the range of fuzzy counts. + fuzzy_count (int, optional): Specific count considered as fuzzy. Default is 1. + fuzzy_fraction (float, optional): Fraction threshold for considering a cluster as 'fuzzy'. Default is 0.75. + + Returns: + Optional[str]: Returns "true" (str) if all counts are 1, "fuzzy" (str) if the cluster meets fuzzy criteria, + and None otherwise. 
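+
+    Example (illustrative counts; lists with fewer than three counts always
+    return None):
+        >>> get_ALO_cluster_cardinality([1, 1, 1, 1], fuzzy_range={0, 2, 3})
+        'true'
+        >>> get_ALO_cluster_cardinality([1, 1, 1, 2], fuzzy_range={0, 2, 3})
+        'fuzzy'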
+ """ + if len(ALO_proteome_counts_in_cluster) > 2: + length = len(ALO_proteome_counts_in_cluster) + if all(count == 1 for count in ALO_proteome_counts_in_cluster): + return "true" + fuzzycount_count = len( + [ + ALO_proteome_counts + for ALO_proteome_counts in ALO_proteome_counts_in_cluster + if ALO_proteome_counts == fuzzy_count + ] + ) + + fuzzyrange_count = len( + [ + ALO_proteome_counts + for ALO_proteome_counts in ALO_proteome_counts_in_cluster + if ALO_proteome_counts in fuzzy_range + ] + ) + + if fuzzycount_count + fuzzyrange_count == length: + fuzzy_fr = fuzzycount_count / length + + if fuzzy_fr >= fuzzy_fraction: + return "fuzzy" + + return None diff --git a/src/core/proteins.py b/src/core/proteins.py new file mode 100644 index 0000000..c47439d --- /dev/null +++ b/src/core/proteins.py @@ -0,0 +1,108 @@ +from collections import Counter +from typing import Dict, List, Optional, Union + +from core.utils import mean, median, sd + + +class Protein: + def __init__( + self, + protein_id: str, + proteome_id: str, + species_id: str, + sequence_id: str, + ) -> None: + + self.protein_id = protein_id + self.proteome_id = proteome_id + self.species_id = species_id + self.sequence_id = sequence_id + self.length: Optional[int] = None + self.clustered: bool = False + self.secreted: bool = False + self.domain_counter_by_domain_source: Dict[str, Counter[str]] = {} + self.go_terms: List[str] = [] + + def update_length(self, length: int) -> None: + self.length = length + + +class ProteinCollection: + def __init__(self, proteins_list: List[Protein]) -> None: + self.proteins_list: List[Protein] = proteins_list + self.proteins_by_protein_id: Dict[str, Protein] = { + protein.protein_id: protein for protein in proteins_list + } + self.protein_count: int = len(proteins_list) + self.domain_sources: List[str] = [] + self.fastas_parsed: bool = False + self.functional_annotation_parsed: bool = False + self.domain_desc_by_id_by_source: Dict[str, Dict[str, str]] = {} + + def add_annotation_to_protein( + self, + domain_protein_id: str, + domain_counter_by_domain_source: Dict[str, Counter], + go_terms: List[str], + ): + """ + Updates a protein object with domain counters and GO terms. + + Args: + - domain_protein_id (str): Identifier of the protein to annotate. + - domain_counter_by_domain_source (Dict[str, Counter]): Domain sources mapped to counters of domains. + - go_terms (List[str]): Gene Ontology (GO) terms associated with the protein. + + This method sets domain counters, assigns GO terms, and checks if the protein is secreted + based on domain information ('SignalP_EUK' source). + + Note: If 'SignalP_EUK' indicates 'SignalP-noTM', sets protein.secreted = True. + """ + protein: Optional[Protein] = self.proteins_by_protein_id.get( + domain_protein_id, None + ) + if protein is not None: + protein.domain_counter_by_domain_source = domain_counter_by_domain_source + signalp_notm = protein.domain_counter_by_domain_source.get( + "SignalP_EUK", None + ) + if signalp_notm and "SignalP-noTM" in signalp_notm: + protein.secreted = True + protein.go_terms = go_terms + + def get_protein_length_stats( + self, protein_ids: List[str] + ) -> Dict[str, Union[int, float]]: + """ + Calculate statistics (sum, mean, median, standard deviation) of protein lengths. + + Args: + protein_ids (List[str]): List of protein IDs for which to calculate statistics. + + Returns: + Dict[str, Union[int, float]): A dictionary containing the calculated statistics: + - 'sum': Sum of lengths of proteins in the input list. 
+ - 'mean': Mean length of proteins in the input list. + - 'median': Median length of proteins in the input list. + - 'sd': Standard deviation of lengths of proteins in the input list. + + If no valid protein lengths could be calculated (e.g., if protein_ids is empty or no lengths + are available for the provided protein IDs), the values in the dictionary will default to 0 or 0.0. + """ + protein_length_stats = {"sum": 0, "mean": 0.0, "median": 0, "sd": 0.0} + if protein_ids and self.fastas_parsed: + protein_lengths: List[int] = [ + length + for length in [ + self.proteins_by_protein_id[protein_id].length + for protein_id in protein_ids + if protein_id in self.proteins_by_protein_id + ] + if length is not None + ] + protein_length_stats["sum"] = sum(protein_lengths) + protein_length_stats["mean"] = mean(protein_lengths) + protein_length_stats["median"] = median(protein_lengths) + protein_length_stats["sd"] = sd(protein_lengths) + + return protein_length_stats diff --git a/src/core/results.py b/src/core/results.py new file mode 100644 index 0000000..9aa0f60 --- /dev/null +++ b/src/core/results.py @@ -0,0 +1,46 @@ +import logging +import time + +from core.datastore import DataFactory +from core.input import InputData + +logger = logging.getLogger("kinfin_logger") + + +def analyse(input_data: InputData) -> None: + """ + Performs KinFin analysis based on the provided input data using DataFactory. + + Args: + input_data (InputData): An instance of InputData containing input parameters and data. + + Returns: + None + + Raises: + Any exceptions raised by DataFactory methods. + """ + overall_start = time.time() + dataFactory = DataFactory(input_data) + dataFactory.setup_dirs() + dataFactory.analyse_clusters() + dataFactory.aloCollection.write_tree( + dataFactory.dirs, + dataFactory.inputData.plot_tree, + dataFactory.inputData.plot_format, + dataFactory.inputData.fontsize, + ) + rarefaction_data = dataFactory.aloCollection.compute_rarefaction_data( + repetitions=dataFactory.inputData.repetitions + ) + dataFactory.plot_rarefaction_data( + dirs=dataFactory.dirs, + plotsize=dataFactory.inputData.plotsize, + plot_format=dataFactory.inputData.plot_format, + fontsize=dataFactory.inputData.fontsize, + rarefaction_by_samplesize_by_level_by_attribute=rarefaction_data, + ) + dataFactory.write_output() + overall_end = time.time() + overall_elapsed = overall_end - overall_start + logger.info(f"[STATUS] - Took {overall_elapsed}s to run kinfin.") diff --git a/src/core/utils.py b/src/core/utils.py new file mode 100644 index 0000000..7a2713e --- /dev/null +++ b/src/core/utils.py @@ -0,0 +1,285 @@ +import gzip +import json +import logging +import os +import sys +from math import log, sqrt +from typing import Any, Generator, List, Optional, Tuple, Union + +import scipy + +logger = logging.getLogger("kinfin_logger") + + +def progress(iteration: int, steps: Union[int, float], max_value: int) -> None: + """ + Print progress in percentage based on the current iteration, steps, and maximum value. + + Parameters: + - iteration (int): Current iteration or step number. + - steps (int | float): Number of steps or intervals after which progress is updated. + - max_value (int): Maximum value or total number of iterations. 
+ + Returns: + - None + + Example: + >>> progress(5, 2, 10) + [PROGRESS] - 50% + """ + if iteration == max_value: + sys.stdout.write("\r") + print("[PROGRESS]\t- %d%%" % (100)) + elif iteration % int(steps + 1) == 0: + sys.stdout.write("\r") + print("[PROGRESS]\t- %d%%" % (float(iteration / max_value) * 100), end=" ") + sys.stdout.flush() + + +def check_file(filepath: Optional[str], install_kinfin: bool = False) -> None: + """ + Check if a file exists. + + Args: + filepath (str): Path to the file to be checked. + + Raises: + FileNotFoundError: If the file does not exist. + """ + + if filepath is not None and not os.path.isfile(filepath): + error_msg = f"[ERROR] - file {filepath} not found." + if install_kinfin: + error_msg += " Please run the install script to download kinfin." + raise FileNotFoundError(error_msg) + + +def yield_file_lines(filepath: str) -> Generator[str, Any, None]: + """ + Args: + filepath (str): Path to the file. + + Yields: + str: Each line from the file. + """ + check_file(filepath) + if filepath.endswith(".gz"): + with gzip.open(filepath, "rb") as fh: + for line in fh: + line = line.decode("utf-8") + if line.startswith("nodesDB.txt"): + line = f'#{line.split("#")[1]}' + yield line.rstrip("\n") + else: + with open(filepath) as fh: + for line in fh: + yield line.rstrip("\n") + + +def yield_config_lines( + config_f: str, + taxon_idx_mapping_file: Optional[str], +): + if config_f.endswith(".json"): + if not taxon_idx_mapping_file: + raise ValueError("[ERROR] - taxon_idx_mapping not present") + + with ( + open(taxon_idx_mapping_file, "r") as f_mapping, + open(config_f, "r") as f_config, + ): + taxon_idx_mapping = json.load(f_mapping) + config_data = json.load(f_config) + headers = ["IDX"] + list(config_data[0].keys()) + yield "#" + ",".join(headers) + + for item in config_data: + idx = taxon_idx_mapping[item["taxon"]] + row = [idx] + [item[key] for key in headers[1:]] + yield ",".join(row) + else: + yield from yield_file_lines(config_f) + + return + + +def read_fasta_len(fasta_file: str) -> Generator[Tuple[str, int], Any, None]: + """ + Generator function to parse a FASTA file and yield tuples of header and sequence length. + + Args: + - fasta_file (str): Path to the FASTA file to be parsed. + + Yields: + Tuple[str, int]: A tuple containing the header and the length of the sequence. + + Raises: + FileNotFoundError: If the specified FASTA file does not exist. + """ + check_file(fasta_file) + with open(fasta_file) as fh: + logger.info(f"[STATUS]\t - Parsing FASTA {fasta_file}") + header: str = "" + seqs: List[str] = [] + for line in fh: + if line[0] == ">": + if header: + header = ( + header.replace(":", "_") + .replace(",", "_") + .replace("(", "_") + .replace(")", "_") + ) # orthofinder replaces chars + yield header, len("".join(seqs)) + header, seqs = ( + line[1:-1].split()[0], + [], + ) # Header is split at first whitespace + else: + seqs.append(line[:-1]) + header = ( + header.replace(":", "_") + .replace(",", "_") + .replace("(", "_") + .replace(")", "_") + ) # orthofinder replaces chars + yield header, len("".join(seqs)) + + +def median(lst) -> float: + """ + Calculate the median of a list of numbers. + + Args: + - lst (list): List of numerical values. + + Returns: + - float: Median of the list. 
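+
+    Example (illustrative):
+        >>> median([3, 1, 2])
+        2.0
+        >>> median([4, 1, 3, 2])
+        2.5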
+ """ + list_sorted = sorted(lst) + list_length = len(lst) + index = (list_length - 1) // 2 + if list_length % 2: + return list_sorted[index] / 1.0 + else: + return (list_sorted[index] + list_sorted[index + 1]) / 2.0 + + +def mean(lst) -> float: + """ + Calculate the mean (average) of a list of numbers. + + Args: + - lst (list): List of numerical values. + + Returns: + - float: Mean of the list. + """ + return float(sum(lst)) / len(lst) if lst else 0.0 + + +def sd(lst, population=True) -> float: + """ + Calculate the standard deviation of a list of numbers. + + Args: + - lst (list): List of numerical values. + - population (bool, optional): If True, calculates population standard deviation, + otherwise calculates sample standard deviation. Default is True. + + Returns: + - float: Standard deviation of the list. + """ + n = len(lst) + differences = [x_ - mean(lst) for x_ in lst] + sq_differences = [d**2 for d in differences] + ssd = sum(sq_differences) + variance = ssd / n if population is True else ssd / (n - 1) + return sqrt(variance) + + +def statistic( + count_1: List[int], + count_2: List[int], + test: str, + min_proteomes: int, +) -> Tuple[ + Optional[float], + Optional[float], + Optional[float], + Optional[float], +]: + """ + Perform statistical tests and calculate relevant statistics between two lists of counts. + + Args: + - count_1 (list): List of counts (integers). + - count_2 (list): Another list of counts (integers). + - test (str): Type of statistical test to perform, one of "welch", "mannwhitneyu", "ttest", "ks", "kruskal". + - min_proteomes (int): Minimum number of proteomes required for valid analysis. + + Returns: + - Tuple[Optional[float], Optional[float], Optional[float], Optional[float]]: + Tuple containing: + - pvalue: p-value of the statistical test (or None if test is not applicable). + - log2_mean: Logarithm base 2 of the mean of count_1 divided by count_2. + - mean_count_1: Mean of count_1. + - mean_count_2: Mean of count_2. 
+ """ + pvalue: Optional[float] = None + log2_mean: Optional[float] = None + mean_count_1: Optional[float] = None + mean_count_2: Optional[float] = None + + implicit_count_1: List[float] = [count for count in count_1 if count > 0] + implicit_count_2: List[float] = [count for count in count_2 if count > 0] + + if len(implicit_count_1) < min_proteomes or len(implicit_count_2) < min_proteomes: + return None, None, None, None + + mean_count_1 = mean(implicit_count_1) + mean_count_2 = mean(implicit_count_2) + log2_mean = log(mean_count_1 / mean_count_2, 2) + + if ( + len(set(implicit_count_1)) == 1 + and len(set(implicit_count_2)) == 1 + and set(implicit_count_1) == set(implicit_count_2) + ): # equal + pvalue = 1.0 + elif test == "welch": + # try: + # Welch's t-test + pvalue = scipy.stats.ttest_ind( + implicit_count_1, + implicit_count_2, + equal_var=False, + )[1] + + if pvalue != pvalue: # testing for "nan" + pvalue = 1.0 + elif test == "mannwhitneyu": + try: + pvalue = scipy.stats.mannwhitneyu( + implicit_count_1, + implicit_count_2, + alternative="two-sided", + )[1] + except ValueError: # throws ValueError when all numbers are equal + pvalue = 1.0 + elif test == "ttest": + # try: + pvalue = scipy.stats.ttest_ind(implicit_count_1, implicit_count_2)[1] # t-test + if pvalue != pvalue: # testing for "nan" + pvalue = 1.0 + elif test == "ks": + # H0 that they are drawn from the same distribution + pvalue = scipy.stats.ks_2samp(implicit_count_1, implicit_count_2)[1] + if pvalue != pvalue: # testing for "nan" + pvalue = 1.0 + elif test == "kruskal": + # H0 is that population median is equal + pvalue = scipy.stats.kruskal(implicit_count_1, implicit_count_2)[1] + if pvalue != pvalue: # testing for "nan" + pvalue = 1.0 + return pvalue, log2_mean, mean_count_1, mean_count_2 diff --git a/src/kinfin.py b/src/kinfin.py index a88b82f..377db8f 100755 --- a/src/kinfin.py +++ b/src/kinfin.py @@ -26,7 +26,7 @@ -t, --tree_file Tree file in Newick format (taxon names must be the same as TAXON in config file) General options - -o, --outprefix Output prefix + -o, --output_path Output prefix --infer_singletons Absence of proteins in clustering is interpreted as singleton (based on SequenceIDs.txt) --plot_tree Plot PDF of annotated phylogenetic tree (requires -t, full ETE3 installation and X-server/xvfb-run) --min_proteomes Required number of proteomes in a taxon-set to be used @@ -60,8 +60,8 @@ import sys -from os.path import isfile, join, exists, realpath, dirname -from os import getcwd, mkdir, remove, environ +from os.path import isfile, join, exists, realpath, dirname, isabs, abspath +from os import getcwd, mkdir, remove, environ, makedirs import shutil import random import time @@ -108,17 +108,6 @@ ######################################################################## -def retrieve_ftp(remote_f, local_f): - try: - print("[STATUS] - Downloading '%s' to '%s'." % (remote_f, local_f)) - req = urlopen(remote_f) - with open(local_f, 'wb') as local_fh: - shutil.copyfileobj(req, local_fh) - req.close() - except IOError: - sys.exit("[ERROR] : '%s' could not be downloaded." 
% (remote_f)) - - def check_file(infile): if infile: if not isfile(infile): @@ -472,7 +461,6 @@ def add_taxid_attributes(self, nodesdb_f, attributes, level_by_attribute_by_prot # add taxranks to rank for taxrank in inputObj.taxranks: attributes.append(taxrank) - self.nodesdb_file = nodesdb_f return attributes, level_by_attribute_by_proteome_id ############################### @@ -480,30 +468,29 @@ def add_taxid_attributes(self, nodesdb_f, attributes, level_by_attribute_by_prot ############################### def setup_dirs(self, inputObj): - outprefix = inputObj.outprefix + output_path = inputObj.output_path self.dirs = {} - if outprefix: - if outprefix.endswith("/"): - result_path = "%skinfin_results" % (outprefix) - else: - result_path = "%s.kinfin_results" % (outprefix) + if output_path: + if not isabs(output_path): + output_path = abspath(output_path) else: - result_path = join(getcwd(), "kinfin_results") - self.dirs['main'] = result_path - print("[STATUS] - Output directories in \n\t%s" % (result_path)) - if exists(result_path): + output_path = join(getcwd(), "kinfin_results") + + self.dirs['main'] = output_path + print("[STATUS] - Output directories in \n\t%s" % (output_path)) + if exists(output_path): print("[STATUS] - Directory exists. Deleting directory ...") - shutil.rmtree(result_path) + shutil.rmtree(output_path) print("[STATUS] - Creating directories ...") - mkdir(result_path) + makedirs(output_path) for attribute in aloCollection.attributes: - attribute_path = join(result_path, attribute) + attribute_path = join(output_path, attribute) self.dirs[attribute] = attribute_path if not exists(attribute_path): print("\t%s" % (attribute_path)) mkdir(attribute_path) if aloCollection.tree_ete: - tree_path = join(result_path, "tree") + tree_path = join(output_path, "tree") node_chart_path = join(tree_path, "charts") node_header_path = join(tree_path, "headers") if not exists(tree_path): @@ -610,7 +597,6 @@ def parse_species_ids(self, species_ids_f): if not line.startswith("#"): idx, fasta = line.split(": ") fasta_by_ortho_id[idx] = fasta - self.species_ids_file = species_ids_f return fasta_by_ortho_id ############################### @@ -626,7 +612,6 @@ def parse_fasta_dir(self, fasta_dir, fasta_file_by_species_id): print("[STATUS]\t - Parsing FASTA %s" % (fasta_path)) for header, length in readFastaLen(fasta_path): fasta_len_by_protein_id[header] = length - self.fasta_dir = fasta_dir return fasta_len_by_protein_id ############################### @@ -997,7 +982,20 @@ def write_cluster_metrics(self): for domain_source in clusterCollection.domain_sources: # cluster_metrics_domains if domain_source in clusterObj.domain_counter_by_domain_source: - cluster_metrics_domains_line.append(";".join(["%s:%s" % (domain_id, count) for domain_id, count in clusterObj.domain_counter_by_domain_source[domain_source].most_common()])) + sorted_counts = sorted( + [ + f"{domain_id}:{count}" + for domain_id, count in clusterObj.domain_counter_by_domain_source[ + domain_source + ].most_common() + ], + key=lambda x: ( + x.split(":")[-1], + x.split(":")[-2], + ), + ) + sorted_counts_str = ";".join(sorted_counts) + cluster_metrics_domains_line.append(sorted_counts_str) cluster_metrics_domains_line.append("{0:.3f}".format(clusterObj.domain_entropy_by_domain_source[domain_source])) else: cluster_metrics_domains_line.append("N/A") @@ -1703,11 +1701,11 @@ def generate_chart_for_node(self, node): x_values = np.array(proteome_coverages) ax.hist(x_values, histtype='stepfilled', align='mid', bins=np.arange(0.0, 1.0 + 0.1, 
0.1)) ax.set_xlim(-0.1, 1.1) - for tick in ax.xaxis.get_major_ticks(): - tick.label.set_fontsize(inputObj.plot_font_size - 2) - tick.label.set_rotation('vertical') - for tick in ax.yaxis.get_major_ticks(): - tick.label.set_fontsize(inputObj.plot_font_size - 2) + for tick in ax.xaxis.get_majorticklabels(): + tick.set_fontsize(inputObj.plot_font_size - 2) + tick.set_rotation('vertical') + for tick in ax.yaxis.get_majorticklabels(): + tick.set_fontsize(inputObj.plot_font_size - 2) ax.set_frame_on(False) ax.xaxis.grid(True, linewidth=1, which="major", color="lightgrey") ax.yaxis.grid(True, linewidth=1, which="major", color="lightgrey") @@ -2145,8 +2143,8 @@ def __init__(self, args): # FASTA files self.fasta_dir = args['--fasta_dir'] self.check_if_fasta_dir_and_species_ids_f() - # outprefix - self.outprefix = args['--outprefix'] + # output_path + self.output_path = args['--output_path'] # proteins self.infer_singletons = args['--infer_singletons'] # values: fuzzyness @@ -2227,28 +2225,25 @@ def check_input_files(self): if self.pfam_mapping: pfam_mapping_f = join(dirname(realpath(__file__)), "../data/Pfam-A.clans.tsv.gz") if not isfile(pfam_mapping_f): - print("[WARN] - PFAM-ID file 'data/Pfam-A.clans.tsv.gz' not found. Will be downloaded from ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz") - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/Pfam/current_release/Pfam-A.clans.tsv.gz" - retrieve_ftp(remote_f, pfam_mapping_f) + print("[ERROR] - PFAM-ID file 'data/Pfam-A.clans.tsv.gz' not found. Please run the install script to download") + sys.exit() self.pfam_mapping_f = pfam_mapping_f if self.ipr_mapping: ipr_mapping_f = join(dirname(realpath(__file__)), "../data/entry.list") if not isfile(ipr_mapping_f): - print("[WARN] - IPR-ID file 'data/entry.list' not found. Will be downloaded from ftp://ftp.ebi.ac.uk/pub/databases/interpro/entry.list") - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/interpro/entry.list" - retrieve_ftp(remote_f, ipr_mapping_f) + print("[ERROR] - IPR-ID file 'data/entry.list' not found. Please run the install script to download") + sys.exit() self.ipr_mapping_f = ipr_mapping_f go_mapping_f = join(dirname(realpath(__file__)), "../data/interpro2go") if not isfile(go_mapping_f): - print("[WARN] - GO-ID file, but 'data/interpro2go' not found. Will be downloaded from ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro2go") - remote_f = "ftp://ftp.ebi.ac.uk/pub/databases/interpro/interpro2go" - retrieve_ftp(remote_f, go_mapping_f) + print("[ERROR] - GO-ID file, but 'data/interpro2go' not found. Please run the install script to download") + sys.exit() self.go_mapping_f = go_mapping_f def check_that_ete_can_plot(self): if self.render_tree: try: - import PyQt4 + import PyQt4 # type: ignore except ImportError: sys.exit("[ERROR] : Plotting of trees requires additional ETE3 dependencies. PyQt4 is not installed. 
Please install PyQt4") if 'DISPLAY' in environ: diff --git a/src/main.py b/src/main.py new file mode 100755 index 0000000..8c20009 --- /dev/null +++ b/src/main.py @@ -0,0 +1,66 @@ +#!/usr/bin/env python3 + +import os +import sys + +from api import run_server +from cli import run_cli +from cli.commands import parse_args +from core.input import InputData, ServeArgs +from core.utils import check_file + +if __name__ == "__main__": + + # Without these files, application won't start + base_dir = os.getcwd() + nodesdb_f = os.path.join(base_dir, "data/nodesdb.txt") + pfam_mapping_f = os.path.join(base_dir, "data/Pfam-A.clans.tsv.gz") + ipr_mapping_f = os.path.join(base_dir, "data/entry.list") + go_mapping_f = os.path.join(base_dir, "data/interpro2go") + + try: + check_file(nodesdb_f, install_kinfin=True) + check_file(pfam_mapping_f, install_kinfin=True) + check_file(ipr_mapping_f, install_kinfin=True) + check_file(go_mapping_f, install_kinfin=True) + except FileNotFoundError as e: + sys.exit(str(e)) + + args = parse_args(nodesdb_f, pfam_mapping_f, ipr_mapping_f, go_mapping_f) + + if isinstance(args, ServeArgs): + # run the api server + cluster_f = os.environ.get("CLUSTER_FILE_PATH") + sequence_ids_f = os.environ.get("SEQUENCE_IDS_FILE_PATH") + taxon_idx_mapping_file = os.environ.get("TAXON_IDX_MAPPING_FILE_PATH") + + # Without env variables being absolute paths, application won't start + if cluster_f is None or not os.path.isabs(cluster_f): + sys.exit("[ERROR] CLUSTER_FILE_PATH should be an absolute path.") + if sequence_ids_f is None or not os.path.isabs(sequence_ids_f): + sys.exit("[ERROR] SEQUENCE_IDS_FILE_PATH should be an absolute path.") + if taxon_idx_mapping_file is None or not os.path.isabs(taxon_idx_mapping_file): + sys.exit("[ERROR] TAXON_IDX_MAPPING_FILE_PATH should be an absolute path.") + + try: + check_file(cluster_f, install_kinfin=True) + check_file(sequence_ids_f, install_kinfin=True) + check_file(taxon_idx_mapping_file, install_kinfin=True) + except FileNotFoundError as e: + sys.exit(str(e)) + + run_server( + args=args, + nodesdb_f=nodesdb_f, + go_mapping_f=go_mapping_f, + ipr_mapping_f=ipr_mapping_f, + pfam_mapping_f=pfam_mapping_f, + cluster_f=cluster_f, + sequence_ids_f=sequence_ids_f, + taxon_idx_mapping_file=taxon_idx_mapping_file, + ) + elif isinstance(args, InputData): + run_cli(args) + + else: + sys.exit("[ERROR] - invalid input provided.") diff --git a/test b/test deleted file mode 100755 index 20ea493..0000000 --- a/test +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env bash -DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" -$DIR/src/kinfin.py -g $DIR/example/OrthologousGroups.txt -c $DIR/example/config.txt -s $DIR/example/SequenceIDs.txt -t $DIR/example/tree.nwk -o $DIR/example/test -p $DIR/example/SpeciesIDs.txt -a $DIR/example/fasta/ -f $DIR/example/functional_annotation.txt --min_proteomes 2 -#$DIR/src/kinfin.py -g $DIR/example/OrthologousGroups.txt -c $DIR/example/config.txt -s $DIR/example/SequenceIDs.txt -t $DIR/example/tree.nwk -o $DIR/example/test -p $DIR/example/SpeciesIDs.txt -a $DIR/example/fasta/ -f $DIR/example/functional_annotation.txt --min_proteomes 2 --test kruskal -#$DIR/src/kinfin.py -g $DIR/example/OrthologousGroups.txt -c $DIR/example/config.txt -s $DIR/example/SequenceIDs.txt -t $DIR/example/tree.nwk -o $DIR/example/test -p $DIR/example/SpeciesIDs.txt -a $DIR/example/fasta/ -f $DIR/example/functional_annotation.txt --min_proteomes 2 --test ks diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..6bf2d3c --- 
/dev/null +++ b/tests/conftest.py @@ -0,0 +1,61 @@ +import os +from typing import List, Tuple + + +def pytest_addoption(parser) -> None: + """Add argument to take path to generated and expected output directories""" + parser.addoption( + "--generated", + action="store", + help="Path to the generated output directory", + ) + parser.addoption( + "--expected", + action="store", + help="Path to the expected output directory", + ) + + +def pytest_generate_tests(metafunc) -> None: + """Generates test for each file""" + if "gen_file" in metafunc.fixturenames and "exp_file" in metafunc.fixturenames: + file_pairs = get_file_pairs(metafunc.config) + metafunc.parametrize("gen_file,exp_file", file_pairs) + + +def get_file_pairs(config) -> List[Tuple[str, str]]: + """Get tuple of generate result file vs expected result file to compare""" + generated = config.getoption("generated") + expected = config.getoption("expected") + + assert os.path.exists(generated), f"Directory '{generated}' does not exist" + assert os.path.exists(expected), f"Directory '{expected}' does not exist" + + files1: List[str] = get_files(generated) + files2: List[str] = get_files(expected) + + set1 = set(files1) + set2 = set(files2) + + missing_files = set1.symmetric_difference(set2) + + assert not missing_files, f"files missing: {', '.join(list(missing_files))}" + + file_pairs: List[Tuple[str, str]] = [ + (os.path.join(generated, gen_file), os.path.join(expected, exp_file)) + for gen_file, exp_file in zip(files1, files2) + if gen_file.endswith(".txt") + ] + return file_pairs + + +def get_files(directory) -> List[str]: + """ + Recursively get all files in a directory + """ + file_list = [] + for root, _, files in os.walk(directory): + for file in files: + relative_path = os.path.relpath(os.path.join(root, file), directory) + file_list.append(relative_path) + return file_list diff --git a/tests/run_dev_tests.sh b/tests/run_dev_tests.sh new file mode 100755 index 0000000..9876cde --- /dev/null +++ b/tests/run_dev_tests.sh @@ -0,0 +1,62 @@ +#!/usr/bin/env bash + +# DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +# src/kinfin.py -g example/OrthologousGroups.txt -c example/config.txt -s example/SequenceIDs.txt -t example/tree.nwk -o example/test -p example/SpeciesIDs.txt -a example/fasta/ -f example/functional_annotation.txt --min_proteomes 2 +# #src/kinfin.py -g example/OrthologousGroups.txt -c example/config.txt -s example/SequenceIDs.txt -t example/tree.nwk -o example/test -p example/SpeciesIDs.txt -a example/fasta/ -f example/functional_annotation.txt --min_proteomes 2 --test kruskal +# #src/kinfin.py -g example/OrthologousGroups.txt -c example/config.txt -s example/SequenceIDs.txt -t example/tree.nwk -o example/test -p example/SpeciesIDs.txt -a example/fasta/ -f example/functional_annotation.txt --min_proteomes 2 --test ks + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +cd "../$DIR" + +# To exit on error +set -e + +handle_error() { + echo "Error: $1" >&2 + exit 1 +} + +src/main.py analyse -g "example/OrthologousGroups.txt" -c "example/config.txt" -s "example/SequenceIDs.txt" -o "result/example" || handle_error "Failed to run basic analysis with new tool." + +# # Function to check if a directory exists and is not empty +# function is_directory_not_empty { +# local dir="$1" +# if [ -d "$dir" ] && [ "$(ls -A $dir)" ]; then +# return 0 # Directory exists and is not empty +# else +# return 1 # Directory does not exist or is empty +# fi +# } + + +# if ! 
is_directory_not_empty ".test_data"; then +# echo "Extracting test data..." +# tar -xzvf ./tests/test_data.tar.gz -C "./" || handle_error "Failed to extract test data." +# else +# echo "Test data is already extracted and present." +# fi + +# # echo "Running basic analysis with old tool (kinfin.py)..." +# # src/kinfin.py -g ".test_data/basic/input/Orthogroups.txt" -c ".test_data/basic/input/kinfin.config.basic.txt" -s ".test_data/basic/input/kinfin.SequenceIDs.txt" -o "result/basic.cli.old" || handle_error "Failed to run basic analysis with old tool." + +# echo "Running basic analysis with new tool (main.py)..." +# src/main.py analyse -g ".test_data/basic/input/Orthogroups.txt" -c ".test_data/basic/input/kinfin.config.basic.txt" -s ".test_data/basic/input/kinfin.SequenceIDs.txt" -o "result/basic.cli.new" || handle_error "Failed to run basic analysis with new tool." + +# # echo "Comparing output of old and new tools for basic analysis..." +# # pytest -v ./tests/test_output_match.py --expected result/basic.cli.old --generated result/basic.cli.new + +# # Check pytest exit status +# if [ $? -ne 0 ]; then +# echo "Basic test failed. Stopping execution." +# exit 1 +# fi + +# # If we get here, the basic test passed, so continue with advanced analysis +# # echo "Running advanced analysis with old tool (kinfin.py)..." +# # src/kinfin.py -g ".test_data/advanced/input/Orthogroups.txt" -c ".test_data/advanced/input/kinfin.config.advanced.txt" -s ".test_data/advanced/input/kinfin.SequenceIDs.txt" -o "result/advanced.cli.old" -p ".test_data/advanced/input/kinfin.SpeciesIDs.txt" -a ".test_data/advanced/input/fastas/" -t ".test_data/advanced/input/kinfin.tree.nwk" -f ".test_data/advanced/input/kinfin.functional_annotation.txt" || handle_error "Failed to run advanced analysis with old tool." + +# echo "Running advanced analysis with new tool (main.py)..." +# src/main.py analyse -g ".test_data/advanced/input/Orthogroups.txt" -c ".test_data/advanced/input/kinfin.config.advanced.txt" -s ".test_data/advanced/input/kinfin.SequenceIDs.txt" -o "result/advanced.cli.new" -p ".test_data/advanced/input/kinfin.SpeciesIDs.txt" -a ".test_data/advanced/input/fastas/" -t ".test_data/advanced/input/kinfin.tree.nwk" -f ".test_data/advanced/input/kinfin.functional_annotation.txt" || handle_error "Failed to run advanced analysis with new tool." + +# # echo "Comparing output of old and new tools for advanced analysis..." 
+# # pytest -v ./tests/test_output_match.py --expected result/advanced.cli.old --generated result/advanced.cli.new
\ No newline at end of file
diff --git a/tests/test_output_match.py b/tests/test_output_match.py
new file mode 100644
index 0000000..f3c9905
--- /dev/null
+++ b/tests/test_output_match.py
@@ -0,0 +1,30 @@
+def compare_files(gen_file, exp_file):
+    """
+    Dispatch the comparison based on file type: .txt files are compared with
+    check_is_mismatch (True means the files differ); other files are skipped
+    and always return False.
+    """
+    if gen_file.endswith(".txt"):
+        return check_is_mismatch(gen_file, exp_file)
+    else:
+        return False
+
+
+def check_is_mismatch(gen_file, exp_file):
+    """
+    Compare two text files line by line, ignoring blank lines, surrounding
+    whitespace and line order. Returns True if the files differ.
+    """
+    with open(gen_file, "r") as f1, open(exp_file, "r") as f2:
+        gen_lines = f1.readlines()
+        exp_lines = f2.readlines()
+        # Remove empty lines and strip whitespace
+        gen_lines = [line.strip() for line in gen_lines if line.strip()]
+        exp_lines = [line.strip() for line in exp_lines if line.strip()]
+        # Sort lines
+        gen_lines.sort()
+        exp_lines.sort()
+        # Compare sorted lines
+        return gen_lines != exp_lines
+
+
+def test_compare_files(gen_file, exp_file):
+    mismatch = compare_files(gen_file, exp_file)
+    assert not mismatch, f"Files '{gen_file}' and '{exp_file}' have mismatches"
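+
+
+# Usage sketch (paths are illustrative): these tests are parametrised via the
+# --generated/--expected options defined in tests/conftest.py, so a typical
+# invocation would be:
+#
+#   pytest -v tests/test_output_match.py \
+#       --generated result/basic.cli.new \
+#       --expected result/basic.cli.old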