# step4_OG_communities_to_blocks_graph_check.py

#!/usr/bin/env python3

import argparse
import csv
import networkx as nx
import textwrap
import synphoni.utils as su
import synphoni.graph_analysis as sg
from synphoni.logo import logo_ASCII
import pickle

# Command-line interface for step 4.
# RawDescriptionHelpFormatter keeps the ASCII logo and the line breaks of the
# description exactly as written instead of re-wrapping them.
parser = argparse.ArgumentParser(formatter_class = argparse.RawDescriptionHelpFormatter,
                                 description = textwrap.dedent(f"""\
            
    {logo_ASCII()}
    Step 4 of the SYNPHONI (detection of ancestral SYNteny based on PHylogeny and Ortholog Network Inference) pipeline: 
    Extract syntenic blocks from orthogroup communities
    min_len, min_shared and clique size can be customized, but defaults are sensible
    """))

# Positional input: the orthogroup communities written by step 3.
parser.add_argument("og_communities", help = "csv of orthogroup communities, output step 3.", type = str)

# Required input files.
parser.add_argument("-g", "--filtered_graph",
                    help = "filtered graph, gpickle output from step3",
                    type = str,
                    required = True)
parser.add_argument("-c", "--chrom_data",
                    help = "pickle file of the chromdata, generated by step 1",
                    type = str,
                    required = True)

# Tunable thresholds.
# `type = int` is required here: without it a value given on the command line
# is kept as a str while the default stays an int, so downstream code would
# receive inconsistent types for the same option.
parser.add_argument("-l", "--min_len",
                    help = "minimum number of ortholog occurences (e.g. 3 paralogs, or 3 orthologs) two scaffold should share to be considered in the same multi_species_block",
                    default = 3,
                    type = int)
parser.add_argument("-s", "--min_shared",
                    help = "minimum overlap coefficient of ortholog occurences two scaffold should share to be considered in the same multi_species_block",
                    default = .5,
                    type = float)
parser.add_argument("-k", "--clique_size",
                    help = "how many blocks a multi species block should have to be retained (use to percolate k cliques of extant blocks). Default is k = 3",
                    default = 3,
                    type = int)
parser.add_argument("-r", "--min_community_coverage",
                    help = "percentage of orthogroups of the original OG community a block should posess, default is .3, i.e. 30 percent",
                    default = .3,
                    type = float)

# A tuple (rather than a set) keeps the order of the choices stable in the
# usage text and in "invalid choice" error messages across runs.
parser.add_argument("-m", "--chrom_clustering_method",
                    help = "Scaffold are grouped together to verify that they are homologs. Clique checking step of the synphoni algorithm.",
                    default = "k_clique",
                    choices = ("k_clique", "leiden"),
                    type = str)

# Required output prefix; the .synt and .clusters paths are derived from it.
parser.add_argument("-o", "--output",
                    help = "Prefix of the synt and clusters output files",
                    type = str,
                    required = True)

# Parses sys.argv; exits with a usage message if a required option is missing.
args = parser.parse_args()


# Load the filtered orthogroup graph produced by step 3.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# this path must only ever point at a trusted, locally generated pickle.
with open(args.filtered_graph, "rb") as graph_handle:
    G_og = pickle.load(graph_handle)

chrom_dict = su.load_chrom_data(filepath = args.chrom_data)

# Build an accession -> orthogroup lookup by walking the three nesting levels
# of chrom_dict (orthogroup -> species -> chromosome -> accession, going by the
# loop variable names). An accession seen more than once keeps the last
# orthogroup encountered.
ortho = {}
for og, per_species in chrom_dict.items():
    for per_chromosome in per_species.values():
        for accessions in per_chromosome.values():
            ortho.update({acc: og for acc in accessions.keys()})

# De-duplicated list of whatever su.flatten yields from the top-level values of
# chrom_dict (presumably the species names -- confirm against su.flatten).
second_level_entries = list(chrom_dict.values())
species_ls = list(set(su.flatten(second_level_entries)))

# Read one orthogroup community per csv row, then order them largest first.
with open(args.og_communities, "r") as communities_handle:
    og_commus = [set(fields) for fields in csv.reader(communities_handle)]
og_commus.sort(key = len, reverse = True)

# Output file names embed the min_len / min_shared settings used for the run.
output_prefix = f"{args.output}.len{args.min_len}.ol{args.min_shared}"
synt_path = f"{output_prefix}.synt"
multi_sp_path = f"{output_prefix}.clusters"

# Accumulates what sg.write_blocks returns across communities and is handed
# back to it as `known_dict` on every call (presumably so block identifiers
# stay unique across communities -- confirm in sg.write_blocks).
block_ids = {}

# newline = "" is what the csv module documents for files passed to
# csv.writer: the writer emits its own line terminator, and letting the text
# layer translate it as well produces "\r\r\n" on Windows.
with open(synt_path, "w", newline = "") as synt_h, \
     open(multi_sp_path, "w", newline = "") as multi_sp_h:
    synt_w = csv.writer(synt_h, delimiter = "\t")
    m_sp_w = csv.writer(multi_sp_h, delimiter = "\t")
    # og_commus was sorted largest-first when it was loaded.
    for current_commu in og_commus:
        current_commu_scaffolds = sg.genome_location_ogs(og_community = current_commu,
                                                         chrom_data = chrom_dict,
                                                         species_list = species_ls,
                                                         orthology = ortho,
                                                         min_og_commu = args.min_community_coverage)
        protoblock_graph = sg.og_info_to_graph(genome_location_orthogroups = current_commu_scaffolds,
                                               fullgraph_ogs_filt = G_og,
                                               min_len = args.min_len,
                                               min_shared = args.min_shared)
        # og_info_to_graph may return None; such communities are skipped.
        # Identity test, so a graph object with custom equality semantics can
        # never be mistaken for (or compared element-wise against) None.
        if protoblock_graph is None:
            continue
        block_ids |= sg.write_blocks(blocks_writer = synt_w,
                                     multi_sp_writer = m_sp_w,
                                     genome_location_ogs_dict = current_commu_scaffolds,
                                     og_info_graph = protoblock_graph,
                                     k_perco = args.clique_size,
                                     known_dict = block_ids,
                                     method = args.chrom_clustering_method)