# analysis_intervening_genes.py

#!/usr/bin/env python3

import argparse
import csv
import textwrap
import synphoni.utils as su
from synphoni.logo import logo_ASCII

# Command-line interface. All four options are required; the first three are
# paths to files produced by earlier pipeline steps and --output is the prefix
# of the TSV file this script writes.
parser = argparse.ArgumentParser(
    formatter_class=argparse.RawDescriptionHelpFormatter,
    # NOTE(review): the logo is interpolated into the f-string *before*
    # textwrap.dedent runs, so the margin dedent strips depends on how the
    # lines returned by logo_ASCII() are indented -- confirm the rendered
    # --help output looks as intended.
    description=textwrap.dedent(f"""\

    {logo_ASCII()}
    Analysis script of the SYNPHONI (detection of ancestral SYNteny based on PHylogeny and Ortholog Network Inference) pipeline: 
    Summarizes the number of intervening genes between consecutive block genes
    """),
)
parser.add_argument(
    "-c", "--chrom_data",
    help="pickle file of the chromdata, generated by step 1",
    type=str,
    required=True,
)
parser.add_argument(
    "-sy", "--synt_file",
    help="synt file, generated by step 4",
    required=True,
    type=str,
)
parser.add_argument(
    "-ms", "--multi_sp_file",
    help="multi species file, generated by step 4",
    required=True,
    type=str,
)
parser.add_argument(
    "-o", "--output",
    help="Prefix of the output tsv file",
    type=str,
    required=True,
)
args = parser.parse_args()

# Chromosome data loaded from the file given with --chrom_data; it is indexed
# further down as chrom_dict[og][species][chrom][accession][0].
chrom_dict = su.load_chrom_data(filepath=args.chrom_data)

# Map every block id to its multi-species id. Each row of the tab-separated
# multi-species file holds the multi-species id in the first column followed
# by the ids of the blocks that belong to it. A block id that appears in
# several rows keeps the id of the last row it appears in.
# newline="" is what the csv module documentation requires for files handed
# to csv.reader, so that line endings are interpreted by the reader itself.
with open(args.multi_sp_file, "r", newline="") as f:
    msp_reader = csv.reader(f, delimiter="\t")
    multi_sp_dict = {
        block: row[0]
        for row in msp_reader
        for block in row[1:]
    }


# 0-based columns of the tab-separated synt file that this script reads.
_BLOCK_ID_COL = 0      # block identifier, key into multi_sp_dict
_SPECIES_COL = 1       # species identifier, key into chrom_dict[og]
_LOCATION_COL = 7      # text whose part before the first ":" is the chromosome
_ACCESSIONS_COL = 9    # comma-separated gene accessions of the block
_ORTHOGROUPS_COL = 10  # comma-separated ids paired with the accessions
                       # (presumably orthogroups -- confirm against step 4)

# Write one output row per pair of block genes yielded by su.window, giving
# the number of genes lying between the two members of the pair.
output_file = f"{args.output}.tsv"
# newline="" on both files, as the csv module documentation requires: without
# it the writer's "\r\n" row terminator is turned into "\r\r\n" on platforms
# that translate "\n" on output.
with open(args.synt_file, "r", newline="") as synt, \
        open(output_file, "w", newline="") as out:
    synt_reader = csv.reader(synt, delimiter="\t")
    tsv_writer = csv.writer(out, delimiter="\t")
    tsv_writer.writerow(("multi_sp",
                         "block_id",
                         "chromosome",
                         "species",
                         "acc_pair",
                         "para",
                         "intervening_genes"))
    for row in synt_reader:
        if not row:
            # csv.reader yields an empty list for a blank line (e.g. a
            # trailing newline at the end of the file); indexing it would
            # raise IndexError, so skip it.
            continue
        block_id = row[_BLOCK_ID_COL]
        # Raises KeyError if the block is absent from the multi-species file.
        multi_sp = multi_sp_dict[block_id]
        species = row[_SPECIES_COL]
        chrom = row[_LOCATION_COL].split(":")[0]
        acc_ls = row[_ACCESSIONS_COL].split(",")
        og_ls = row[_ORTHOGROUPS_COL].split(",")
        # NOTE(review): su.window is assumed to yield consecutive pairs
        # (a sliding window of size 2) -- confirm in synphoni.utils.
        # zip stops at the shorter of the two sequences, so a length mismatch
        # between the accession and og lists is silently truncated.
        for (acc1, acc2), (og1, og2) in zip(su.window(acc_ls), su.window(og_ls)):
            # First element of the chrom_dict entry is used as the gene's
            # position (presumably its ordinal index on the chromosome --
            # confirm against step 1).
            pos1 = chrom_dict[og1][species][chrom][acc1][0]
            pos2 = chrom_dict[og2][species][chrom][acc2][0]
            # Genes strictly between the two positions, whatever their order.
            nb_intervening = abs(pos2 - pos1) - 1
            # Both genes carrying the same og id are flagged as "para".
            para = "para" if og1 == og2 else "not_para"
            # Note: the header says chromosome, species but the data rows are
            # written as chrom, species as well -- order kept as in the header.
            tsv_writer.writerow((multi_sp,
                                 block_id,
                                 chrom,
                                 species,
                                 f"{acc1},{acc2}",
                                 para,
                                 nb_intervening))