forked from nsmro/synphoni
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstep4_OG_communities_to_blocks_graph_check.py
98 lines (87 loc) · 4.89 KB
/
step4_OG_communities_to_blocks_graph_check.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python3
import argparse
import csv
import networkx as nx
import textwrap
import synphoni.utils as su
import synphoni.graph_analysis as sg
from synphoni.logo import logo_ASCII
import pickle
# Command-line interface for step 4 of the SYNPHONI pipeline.
# The description lines are kept flush-left on purpose: the logo is
# interpolated before textwrap.dedent runs, so the text handed to dedent
# must stay exactly as it was.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description=textwrap.dedent(f"""\
{logo_ASCII()}
Step 4 of the SYNPHONI (detection of ancestral SYNteny based on PHylogeny and Ortholog Network Inference) pipeline:
Extract syntenic blocks from orthogroup communities
min_len, min_shared and clique size can be customized, but defaults are sensible
"""),
)
parser.add_argument(
    "og_communities",
    help="csv of orthogroup communities, output step 3.",
    type=str,
)
parser.add_argument(
    "-g", "--filtered_graph",
    help="filtered graph, gpickle output from step3",
    type=str,
    required=True,
)
parser.add_argument(
    "-c", "--chrom_data",
    help="pickle file of the chromdata, generated by step 1",
    type=str,
    required=True,
)
# type=int is required here: without it a value given on the command line
# stays a str while the default is the int 3, so downstream code would see
# two different types depending on whether -l was passed.
parser.add_argument(
    "-l", "--min_len",
    help="minimum number of ortholog occurences (e.g. 3 paralogs, or 3 orthologs) two scaffold should share to be considered in the same multi_species_block",
    default=3,
    type=int,
)
parser.add_argument(
    "-s", "--min_shared",
    help="minimum overlap coefficient of ortholog occurences two scaffold should share to be considered in the same multi_species_block",
    default=.5,
    type=float,
)
parser.add_argument(
    "-k", "--clique_size",
    help="how many blocks a multi species block should have to be retained (use to percolate k cliques of extant blocks). Default is k = 3",
    default=3,
    type=int,
)
parser.add_argument(
    "-r", "--min_community_coverage",
    help="percentage of orthogroups of the original OG community a block should posess, default is .3, i.e. 30 percent",
    default=.3,
    type=float,
)
# A tuple (rather than a set) keeps the order of the choices shown in the
# usage/help text stable from one run to the next.
parser.add_argument(
    "-m", "--chrom_clustering_method",
    help="Scaffold are grouped together to verify that they are homologs. Clique checking step of the synphoni algorithm.",
    default="k_clique",
    choices=("k_clique", "leiden"),
    type=str,
)
parser.add_argument(
    "-o", "--output",
    help="Prefix of the synt and clusters output files",
    type=str,
    required=True,
)
args = parser.parse_args()
# Load the filtered orthogroup graph written by step 3.
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file; only feed it a graph pickle produced by a trusted run of step 3.
with open(args.filtered_graph, "rb") as fhin:
    G_og = pickle.load(fhin)

# Nested mapping indexed as chrom_dict[og][species][chromosome][accession].
chrom_dict = su.load_chrom_data(filepath=args.chrom_data)

# Reverse lookup: accession -> orthogroup it is filed under in chrom_dict.
ortho = {}
for og, og_species in chrom_dict.items():
    for species_chroms in og_species.values():
        for chrom_accessions in species_chroms.values():
            ortho.update({acc: og for acc in chrom_accessions.keys()})

# De-duplicated flattening of the per-orthogroup entries of chrom_dict.
# NOTE(review): presumably su.flatten yields the species keys of each
# per-orthogroup mapping - confirm against synphoni.utils. The resulting
# order is whatever set iteration produces.
species_ls = list(set(su.flatten(list(chrom_dict.values()))))

# One orthogroup community per CSV row, largest communities first.
# newline="" is what the csv module expects of files handed to csv.reader.
with open(args.og_communities, "r", newline="") as f:
    inputcsv = csv.reader(f)
    # Blank lines come back as empty rows; they carry no orthogroups, so
    # they are dropped instead of being processed as empty communities.
    og_commus = sorted(
        (set(row) for row in inputcsv if row),
        key=len,
        reverse=True,
    )
# Both output files share a prefix that records the min_len / min_shared
# settings used for this run.
output_prefix = f"{args.output}.len{args.min_len}.ol{args.min_shared}"
synt_path = f"{output_prefix}.synt"
multi_sp_path = f"{output_prefix}.clusters"

# Accumulates the mappings returned by sg.write_blocks; it is also handed
# back to each call as known_dict.
block_ids = {}

# newline="" lets csv.writer control line endings itself; without it,
# platforms that translate "\n" on write would emit "\r\r\n" row endings.
with open(synt_path, "w", newline="") as synt_h, \
        open(multi_sp_path, "w", newline="") as multi_sp_h:
    synt_w = csv.writer(synt_h, delimiter="\t")
    m_sp_w = csv.writer(multi_sp_h, delimiter="\t")
    for current_commu in og_commus:
        # Locate the orthogroups of this community on the genomes.
        current_commu_scaffolds = sg.genome_location_ogs(
            og_community=current_commu,
            chrom_data=chrom_dict,
            species_list=species_ls,
            orthology=ortho,
            min_og_commu=args.min_community_coverage,
        )
        # Build the graph from those locations and the filtered OG graph.
        protoblock_graph = sg.og_info_to_graph(
            genome_location_orthogroups=current_commu_scaffolds,
            fullgraph_ogs_filt=G_og,
            min_len=args.min_len,
            min_shared=args.min_shared,
        )
        # No graph for this community: nothing to write.
        if protoblock_graph is None:
            continue
        block_ids |= sg.write_blocks(
            blocks_writer=synt_w,
            multi_sp_writer=m_sp_w,
            genome_location_ogs_dict=current_commu_scaffolds,
            og_info_graph=protoblock_graph,
            k_perco=args.clique_size,
            known_dict=block_ids,
            method=args.chrom_clustering_method,
        )