From 20da10f399addeeb0cd31e406e6fada9e7c21869 Mon Sep 17 00:00:00 2001 From: blackrim Date: Fri, 2 Aug 2024 13:20:36 -0400 Subject: [PATCH] more --- src/add_genera_to_paftol_join.py | 3 +- src/add_outgroup_to_matrix.py | 69 +++++--- src/change_ncbi_to_name_list.py | 45 ++++++ src/combine_datasets.py | 42 +++-- src/get_ncbi_tax_tree_no_species.py | 3 + src/get_ncbi_tax_tree_no_species_genus.py | 15 +- src/get_subset_genbank.py | 14 +- src/join_paftol_tax3.py | 186 ++++++++++++++++++++++ src/join_paftol_tax4.py | 177 ++++++++++++++++++++ src/resolve_tree.py | 102 ++++++++++++ src/run_all_all_tips.py | 58 +++---- src/run_all_polys.py | 133 ++++++++++++++++ 12 files changed, 781 insertions(+), 66 deletions(-) create mode 100644 src/change_ncbi_to_name_list.py create mode 100644 src/join_paftol_tax3.py create mode 100644 src/join_paftol_tax4.py create mode 100644 src/resolve_tree.py create mode 100644 src/run_all_polys.py diff --git a/src/add_genera_to_paftol_join.py b/src/add_genera_to_paftol_join.py index b91ced8..5b0a570 100644 --- a/src/add_genera_to_paftol_join.py +++ b/src/add_genera_to_paftol_join.py @@ -18,7 +18,8 @@ for i in t1.iternodes(): if i.label.split("_")[-1] == gn: par = i.parent + t2.label = i.label par.remove_child(i) par.add_child(t2) break - print(t1.get_newick_repr(False)+";") + print(t1.get_newick_repr(True)+";") diff --git a/src/add_outgroup_to_matrix.py b/src/add_outgroup_to_matrix.py index 47b28f6..de4e1f8 100755 --- a/src/add_outgroup_to_matrix.py +++ b/src/add_outgroup_to_matrix.py @@ -39,27 +39,54 @@ def construct_db_of_parts(infile,infileparts,outprefix): genesfn = {} #key name, value temp name genesf = {} #key name, value open file dbfn = outprefix+".tempdbfa" - tempoutf = open(dbfn,"w") - for i in infileparts: - spls = i.strip().split(" ") - name = spls[1] - shortname = name.split("/")[-1] - rnges = spls[-1].split("-") - beg = int(rnges[0]) - end = int(rnges[1]) - genes[name] = [beg,end] - genesn.append(name) - genesfn[name] = outprefix+"."+shortname #open file should be the shortname - genesf[name] = open(genesfn[name],"w") - for i in seq.read_fasta_file_iter(infile): - for j in genesn: - b,e = genes[j] - if len(i.seq[b-1:e].replace("-","")) > 100: - tempoutf.write(">"+j+"___"+i.name+"\n"+i.seq[b-1:e].replace("-","")+"\n") - genesf[j].write(">"+i.name+"\n"+i.seq[b-1:e]+"\n") - tempoutf.close() - for i in genesn: - genesf[name].close() + lf = 0 # how many lines + for _ in infileparts: + lf += 1 + infileparts.seek(0) + if lf < 1000: + tempoutf = open(dbfn,"w") + for i in infileparts: + spls = i.strip().split(" ") + name = spls[1] + shortname = name.split("/")[-1] + rnges = spls[-1].split("-") + beg = int(rnges[0]) + end = int(rnges[1]) + genes[name] = [beg,end] + genesn.append(name) + genesfn[name] = outprefix+"."+shortname #open file should be the shortname + genesf[name] = open(genesfn[name],"w") + for i in seq.read_fasta_file_iter(infile): + for j in genesn: + b,e = genes[j] + if len(i.seq[b-1:e].replace("-","")) > 100: + tempoutf.write(">"+j+"___"+i.name+"\n"+i.seq[b-1:e].replace("-","")+"\n") + genesf[j].write(">"+i.name+"\n"+i.seq[b-1:e]+"\n") + tempoutf.close() + for i in genesn: + genesf[name].close() + else: + tempoutf = open(dbfn,"w") + for i in infileparts: + spls = i.strip().split(" ") + name = spls[1] + shortname = name.split("/")[-1] + rnges = spls[-1].split("-") + beg = int(rnges[0]) + end = int(rnges[1]) + genes[name] = [beg,end] + genesn.append(name) + genesfn[name] = outprefix+"."+shortname #open file should be the shortname + genesf[name] = genesfn[name] + for i in seq.read_fasta_file_iter(infile): + for j in genesn: + gfo = open(genesf[j],"a") + b,e = genes[j] + if len(i.seq[b-1:e].replace("-","")) > 100: + tempoutf.write(">"+j+"___"+i.name+"\n"+i.seq[b-1:e].replace("-","")+"\n") + gfo.write(">"+i.name+"\n"+i.seq[b-1:e]+"\n") + gfo.close() + tempoutf.close() cmd = "makeblastdb -in "+dbfn+" -out "+dbfn+".db -dbtype nucl"# > /dev/null 2>&1" os.system(cmd) os.remove(dbfn) diff --git a/src/change_ncbi_to_name_list.py b/src/change_ncbi_to_name_list.py new file mode 100644 index 0000000..3366a8a --- /dev/null +++ b/src/change_ncbi_to_name_list.py @@ -0,0 +1,45 @@ +import sys +import sqlite3 +import argparse as ap + + +def generate_argparser(): + parser = ap.ArgumentParser(prog="change_ncbi_to_name_tre.py", + formatter_class=ap.ArgumentDefaultsHelpFormatter) + parser = ap.ArgumentParser() + parser.add_argument("-d", "--db", type=str, help="NCBI database", required=True) + parser.add_argument("-i", "--infile", type=str, help="Input list", required=True) + parser.add_argument("-o", "--outfile", type=str, help="Output list", required=True) + return parser + +if __name__ == "__main__": + parser = generate_argparser() + if len(sys.argv[1:]) == 0: + sys.argv.append("-h") + args = parser.parse_args(sys.argv[1:]) + + conn = sqlite3.connect(args.db) + c = conn.cursor() + of = open(args.infile,"r") + oof = open(args.outfile,"w") + for i in of: + i = i.strip() + c.execute("select name_class,edited_name,left_value,right_value from taxonomy where ncbi_id = ?", (i, )) + nm = "" + lf = "" + rt = "" + for k in c: + if str(k[0]) == "scientific name": + nm = str(k[1]) + lf = str(k[2]) + rt = str(k[3]) + c.execute("select edited_name from taxonomy where name_class = ? and left_value < ? and right_value > ? and node_rank = ?",("scientific name",lf,rt,"family")) + fam = "" + ft = c.fetchone() + if len(ft) > 0: + fam = ft[0] + #for k in c: + # print(k) + oof.write(fam+"\t"+nm+"\n") + of.close() + oof.close() diff --git a/src/combine_datasets.py b/src/combine_datasets.py index c52b0e5..8c8f2c3 100644 --- a/src/combine_datasets.py +++ b/src/combine_datasets.py @@ -7,6 +7,8 @@ import seq from conf import perc_identity,evalue_limit,nthread +USEORIGINALNAMES = True + print(perc_identity,evalue_limit) def run_blast(blastdb,filen): cmd = "blastn -task blastn -db "+blastdb+".db -query "+filen+" -perc_identity "+str(perc_identity)+" -evalue "+str(evalue_limit)+" -num_threads "+str(nthread)+" -max_target_seqs 10000000 -out "+filen+".rawblastn -outfmt '6 qseqid qlen sseqid slen frames pident nident length mismatch gapopen qstart qend sstart send evalue bitscore' 2> NOPE" @@ -43,7 +45,7 @@ def construct_db_of_parts(infile,infileparts,outprefix): tempoutf.close() for i in genesn: genesf[name].close() - cmd = "makeblastdb -in "+dbfn+" -out "+dbfn+".db -dbtype nucl"# > /dev/null 2>&1" + cmd = "makeblastdb -in "+dbfn+" -out "+dbfn+".db -dbtype nucl > /dev/null 2>&1" os.system(cmd) os.remove(dbfn) return dbfn,genes,genesfn @@ -87,9 +89,14 @@ class NewGene: def __init__(self): self.geneset = set([]) self.genes = {} + self.altnames = {} - def addgene(self,la,fl): + def addgene(self,la,fl,altname = ""): self.genes[la] = fl + if altname == "": + self.altnames[la] = la + else: + self.altnames[la] = altname def issame(self,gene1,gene2): if gene1 in self.geneset: @@ -103,7 +110,7 @@ def issame(self,gene1,gene2): def writefile(self,fn): fn = open(fn,"w") for i in self.genes: - fn.write(">"+i+"\n"+get_seq_from_file(i,self.genes[i])+"\n") + fn.write(">"+self.altnames[i]+"\n"+get_seq_from_file(i,self.genes[i])+"\n") fn.close() def __str__(self): @@ -118,20 +125,29 @@ def generate_dataset(tips,files,outf): n = NewGene() n.geneset.add(lj[0]) n.geneset.add(lj[1]) - n.addgene(i,j['lg'][i]) + if USEORIGINALNAMES == False: + n.addgene(i,j['lg'][i]) + else: + n.addgene(i,j['lg'][i],altname=j['lg'][i].split("_")[0]) ngs.append(n) else: t = False for k in ngs: if k.issame(lj[0],lj[1]): - k.addgene(i,j['lg'][i]) + if USEORIGINALNAMES == False: + k.addgene(i,j['lg'][i]) + else: + k.addgene(i,j['lg'][i],altname=j['lg'][i].split("_")[0]) t = True break if t == False: n = NewGene() n.geneset.add(lj[0]) n.geneset.add(lj[1]) - n.addgene(i,j['lg'][i]) + if USEORIGINALNAMES == False: + n.addgene(i,j['lg'][i]) + else: + n.addgene(i,j['lg'][i],altname=j['lg'][i].split("_")[0]) ngs.append(n) count = 0 ffs = [] @@ -145,6 +161,8 @@ def generate_dataset(tips,files,outf): count += 1 cmd = "pxcat -s "+" ".join(ffs)+" -o TEMPTEMPCAT -p TEMPTEMPPART" os.system(cmd) + for i in ffs: + os.remove(i) return if __name__ == "__main__": @@ -157,16 +175,18 @@ def generate_dataset(tips,files,outf): flp = [] for i in fls: flt.append(get_gene_ids(i)) - flp.append(open(i.replace("outaln","outpart"),"r")) + flp.append(i.replace("outaln","outpart")) bdbs = [] prts = [] gens = [] count = 0 for i,j in zip(fls,flp): - blastdb,parts,genesfn = construct_db_of_parts(i,j,"TEST"+str(count)) # this will be based on names + jf = open(j,"r") + blastdb,parts,genesfn = construct_db_of_parts(i,jf,"TEST"+str(count)) # this will be based on names bdbs.append(blastdb) prts.append(parts) gens.append(genesfn) + jf.close() count += 1 biggraph = nx.MultiGraph() for i in flt: @@ -213,8 +233,12 @@ def generate_dataset(tips,files,outf): tips = [] vals = {} for i in potential_tips: + if len(i) == 0: + continue x = random.choice(list(i.keys())) - #print(x,i[x]) + print(x,i[x]) tips.append(x) vals[x]=i[x] generate_dataset(tips,vals,"TEMPTEMP.fa") + #cleanup + os.system("rm TEST*.rn TEST*.rawblastn TEST*.tempdbfa.* TEMPTEMP.fa*") diff --git a/src/get_ncbi_tax_tree_no_species.py b/src/get_ncbi_tax_tree_no_species.py index dec242c..3605c40 100644 --- a/src/get_ncbi_tax_tree_no_species.py +++ b/src/get_ncbi_tax_tree_no_species.py @@ -63,6 +63,9 @@ def construct_tree(taxon, db, taxalist = None): id = stack.pop() if id in done: continue + # added, if some error, remove but add on line 81 + if id not in nodes: + continue done.add(id) c.execute("select ncbi_id,name,name_class,edited_name from taxonomy where parent_ncbi_id = ? and node_rank != 'species'",(id,)) childs = [] diff --git a/src/get_ncbi_tax_tree_no_species_genus.py b/src/get_ncbi_tax_tree_no_species_genus.py index 7e28787..10531bc 100644 --- a/src/get_ncbi_tax_tree_no_species_genus.py +++ b/src/get_ncbi_tax_tree_no_species_genus.py @@ -6,6 +6,11 @@ else: import node +""" +this is specifically for use with the paftol like trees because +of their focus on genera. +""" + exclude_un_en = True def clean_name(name): @@ -66,12 +71,18 @@ def construct_tree(taxon, db, taxalist = None): done.add(id) c.execute("select ncbi_id,name,name_class,edited_name,node_rank from taxonomy where parent_ncbi_id = ? and node_rank != 'species'",(id,)) childs = [] - for j in c: + testc = [j for j in c] + for j in testc: nr = str(j[4]) tid = str(j[0]) if includelist != None and tid not in includelist: continue - if "nom. ined" in str(j[1]) or "x " == str(j[1][0:2]) or " x " in str(j[1]) or "unclassified" in str(j[1]) or "environmental" in str(j[1]) or "incertae" in str(j[1]): + if "nom. ined" in str(j[1]) or "x " == str(j[1][0:2]) or " x " in str(j[1]) or "unclassified" in str(j[1]) or "environmental" in str(j[1]):# or "incertae" in str(j[1]): + continue + # check to see if it has children. if not. skip it. + c.execute("select ncbi_id from taxonomy where parent_ncbi_id=?",(tid,)) + tttest = [k for k in c] + if len(tttest) == 0: continue childs.append(tid) if nr != "genus": diff --git a/src/get_subset_genbank.py b/src/get_subset_genbank.py index 6727475..8eba37c 100644 --- a/src/get_subset_genbank.py +++ b/src/get_subset_genbank.py @@ -12,6 +12,10 @@ """ +def clean_name(nm): + nm = nm.replace(","," ") + return nm + def get_seq_from_gz(gzdir, filename, idtoget): fl = gzip.open(gzdir+"/"+filename,"r") for i in fl: @@ -102,7 +106,7 @@ def make_files_with_id(taxonid, DB,outfilen,outfile_tbln, gzfileloc, if tfilen not in files_ids: files_ids[tfilen] = [] files_ids[tfilen].append(str(j[2])) - ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(tname),str(j[5])] + ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(clean_name(tname)),str(j[5])] c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(id,)) childs = [] l = c.fetchall() @@ -205,7 +209,7 @@ def make_files_with_id_internal(taxonid, DB,outfilen,outfile_tbln,gzfileloc, if tfilen not in files_ids: files_ids[tfilen] = [] files_ids[tfilen].append(str(j[2])) - ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(tname),str(j[5])] + ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(clean_name(tname)),str(j[5])] # get the children of the taxon that have no children (and so the sequences would go here) keepers = [] c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(str(taxonid),)) @@ -266,7 +270,7 @@ def make_files_with_id_internal(taxonid, DB,outfilen,outfile_tbln,gzfileloc, files_ids[tfilen] = [] if str(j[1]) in keepers: files_ids[tfilen].append(str(j[2])) - ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(tname),str(j[5])] + ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(clean_name(tname)),str(j[5])] tblst = "\t".join(ids_props[str(j[2])]) outfile_tbl.write(tblst+"\n") c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(id,)) @@ -347,7 +351,7 @@ def make_files_with_id_justtable(taxonid, DB,outfile_tbln): c.execute("select * from sequence where ncbi_id = ?",(id,)) l = c.fetchall() for j in l: - tbls = str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(tname)+"\t"+str(j[5])+"\t"+str(j[6]) + tbls = str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(clean_name(tname))+"\t"+str(j[5])+"\t"+str(j[6]) if outfile_tbln != None: outfile_tbl.write(tbls+"\n") else: @@ -384,7 +388,7 @@ def make_files(taxon, DB,outfilen,outfile_tbln): for j in l: outfile.write(">"+str(j[3])+"\n") outfile.write(str(j[7])+"\n") - outfile_tbl.write(str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(tname)+"\t"+str(j[4])+"\n") + outfile_tbl.write(str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(clean_name(tname))+"\t"+str(j[4])+"\n") c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(id,)) childs = [] l = c.fetchall() diff --git a/src/join_paftol_tax3.py b/src/join_paftol_tax3.py new file mode 100644 index 0000000..226c7f4 --- /dev/null +++ b/src/join_paftol_tax3.py @@ -0,0 +1,186 @@ +import sys + +import node +import tree_reader +import tree_utils + +""" +TODO +- need to do the mrca using names that includes internal nodes +- need to go back to the node that is the MRCA but includes the clade of unsampled +so if it is +(a,b,c),(d,e,f) and i have (((e,f),d),a) -> ((a,b,c),(d,(e,f))) + +""" + +def intersect_taxa(n,t): + return len(set(n).intersection(t)) + +def get_mrca_wnms(n,t): + if len(n) == 1: + for i in t.iternodes(): + if i.label == n[0]: + return i + else: + return tree_utils.get_mrca_wnms(n,t) + +# n = names that we wanted to get mrca +# nd = the mrca in the tax +# ond = the mrca in the original tax tree incase there is non-monophyly +# paflvsnms = the paf tree lvs nms +def walk_back_mrca(nd,otax,paflvsnms): + rnd = nd + intn = intersect_taxa(rnd.lvsnms(),paflvsnms) + going = True + if rnd.parent == None: + going = False + while going: + nintn = intersect_taxa(rnd.parent.lvsnms(),paflvsnms) + if nintn != intn: + break + else: + # monophyly check + lab = rnd.parent.label + tnd = None + for i in otax.iternodes(): + if lab == i.label: + tnd = i + break + if tnd != None: + ntint2 = intersect_taxa(tnd.lvsnms(),paflvsnms) + if ntint2 != intn: + break + # end monophyly check + if rnd.parent == None: + break + rnd = rnd.parent + if nd != rnd: + print(nd.get_newick_repr(),rnd.label,file=sys.stderr) + return rnd + +class Bipart: + def __init__ (self,lf,rt): + self.left = lf + self.right = rt + self.union = lf.union(rt) + + def __str__(self): + x = ",".join(list(self.left)) + y = ",".join(list(self.right)) + return x+" | "+y + + def conflict(self, inbp): + if len(inbp.right.intersection(self.right)) > 0 and len(inbp.right.intersection(self.left)) > 0: + if len(inbp.left.intersection(self.right)) > 0 and len(inbp.left.intersection(self.left)) > 0 : + return True + if len(inbp.left.intersection(self.left)) > 0 and len(inbp.left.intersection(self.right)) > 0: + if len(inbp.right.intersection(self.left)) > 0 and len(inbp.right.intersection(self.right)) > 0: + return True + return False + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("python",sys.argv[0],"paf tax") + sys.exit(0) + + paf = tree_reader.read_tree_file_iter(sys.argv[1]).__next__() + tax = tree_reader.read_tree_file_iter(sys.argv[2]).__next__() + taxoriginal = tree_reader.read_tree_file_iter(sys.argv[2]).__next__() + + for i in paf.leaves(): + i.data["original_name"] = i.label + i.label = i.label.split("_")[-1] + taxnodestodo = {}# label, node + for i in tax.iternodes(): + i.data["original_name"] = i.label + i.label = "_".join(i.label.split("_")[0:-1]) + taxnodestodo[i.label] = i + for i in taxoriginal.iternodes(): + i.data["original_name"] = i.label + i.label = "_".join(i.label.split("_")[0:-1]) + + totaltaxset = set([i.label for i in tax.iternodes()]) + paftaxset = set(paf.lvsnms()) + for i in tax.iternodes(): + i.data["bp"] = Bipart(set(i.lvsnms()),totaltaxset-set(i.lvsnms())) + for i in paf.iternodes(): + i.data["bp"] = Bipart(set(i.lvsnms()),paftaxset-set(i.lvsnms())) + i.data["skip"] = False + + for j in paf.iternodes(): + for i in tax.iternodes(): + if j.data["skip"]: + continue + if i.data["bp"].conflict(j.data["bp"]): + print("NONMONO",i.label,file=sys.stderr) + j.data["skip"] = True + + count= 0 + for i in paf.iternodes(order="postorder"): + if i == paf: + continue + if i.data["skip"]: + continue + l = i.lvsnms() + l = list(set(l).intersection(totaltaxset)) + if len(i.children) < 2 or len(l) < 2: + continue + p = get_mrca_wnms(l,tax) + chds = set([]) + for j in i.children: + s = set(j.lvsnms()).intersection(totaltaxset) + if len(s) == 0: + continue + for k in p.children: + ss = set(j.lvsnms()).intersection(s) + if len(ss) > 0: + chds.add(k) + if len(chds) == 1: + continue + break + n = node.Node() + jep = False + ppp = None + potpr = [] + for j in list(chds): + pp = j.parent # need to add here if it is non-monophyletic so that things get sunk as a result + # basically need to get the old parent to the new parent and see if the old parent is in the clade of new parent, if not, remove the names + if j == p: + jep = True + ppp = pp + pp.remove_child(j) + n.add_child(j) + potpr.append(pp) + if jep == False: + p.add_child(n) + else: + ppp.add_child(n) + for pp in potpr: #if they are off on their own remove + if len(pp.children) == 0 and pp.parent != None: + tp = pp.parent + while tp != None: + tp.remove_child(pp) + if len(tp.children) == 0: + tp = tp.parent + else: + break + count += 1 + #if count == 100: + # break + for i in taxoriginal.iternodes(): + if len(i.children) == 0: + continue + for j in tax.iternodes(): + if len(j.children) == 0: + continue + if i.label == j.label: + s1 = set(i.lvsnms()) + s2 = set(j.lvsnms()) + if s1 != s2: + print("WOULD DELETE",j.label,file=sys.stderr) + j.label = "" + j.data["original_name"] = "" + for i in tax.iternodes(): + if "original_name" in i.data: + i.label = i.data["original_name"] + print(tax.get_newick_repr(False)+";") diff --git a/src/join_paftol_tax4.py b/src/join_paftol_tax4.py new file mode 100644 index 0000000..a9501b2 --- /dev/null +++ b/src/join_paftol_tax4.py @@ -0,0 +1,177 @@ +import sys + +import node +import tree_reader +import tree_utils + +""" +TODO +- need to do the mrca using names that includes internal nodes +- need to go back to the node that is the MRCA but includes the clade of unsampled +so if it is +(a,b,c),(d,e,f) and i have (((e,f),d),a) -> ((a,b,c),(d,(e,f))) + +""" + +def intersect_taxa(n,t): + return len(set(n).intersection(t)) + +def get_mrca_wnms(n,t): + if len(n) == 1: + for i in t.iternodes(): + if i.label == n[0]: + return i + else: + return tree_utils.get_mrca_wnms(n,t) + +# n = names that we wanted to get mrca +# nd = the mrca in the tax +# ond = the mrca in the original tax tree incase there is non-monophyly +# paflvsnms = the paf tree lvs nms +def walk_back_mrca(nd,otax,paflvsnms): + rnd = nd + intn = intersect_taxa(rnd.lvsnms(),paflvsnms) + going = True + if rnd.parent == None: + going = False + while going: + nintn = intersect_taxa(rnd.parent.lvsnms(),paflvsnms) + if nintn != intn: + break + else: + # monophyly check + lab = rnd.parent.label + tnd = None + for i in otax.iternodes(): + if lab == i.label: + tnd = i + break + if tnd != None: + ntint2 = intersect_taxa(tnd.lvsnms(),paflvsnms) + if ntint2 != intn: + break + # end monophyly check + if rnd.parent == None: + break + rnd = rnd.parent + if nd != rnd: + print(nd.get_newick_repr(),rnd.label,file=sys.stderr) + return rnd + +class Bipart: + def __init__ (self,lf,rt): + self.left = lf + self.right = rt + self.union = lf.union(rt) + + def __str__(self): + x = ",".join(list(self.left)) + y = ",".join(list(self.right)) + return x+" | "+y + + def conflict(self, inbp): + if len(inbp.right.intersection(self.right)) > 0 and len(inbp.right.intersection(self.left)) > 0: + if len(inbp.left.intersection(self.right)) > 0 and len(inbp.left.intersection(self.left)) > 0 : + return True + if len(inbp.left.intersection(self.left)) > 0 and len(inbp.left.intersection(self.right)) > 0: + if len(inbp.right.intersection(self.left)) > 0 and len(inbp.right.intersection(self.right)) > 0: + return True + return False + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("python",sys.argv[0],"paf tax") + sys.exit(0) + + paf = tree_reader.read_tree_file_iter(sys.argv[1]).__next__() + tax = tree_reader.read_tree_file_iter(sys.argv[2]).__next__() + taxoriginal = tree_reader.read_tree_file_iter(sys.argv[2]).__next__() + + for i in paf.leaves(): + i.data["original_name"] = i.label + i.label = i.label.split("_")[-1] + taxnodestodo = {}# label, node + for i in tax.iternodes(): + i.data["original_name"] = i.label + i.label = "_".join(i.label.split("_")[0:-1]) + taxnodestodo[i.label] = i + for i in taxoriginal.iternodes(): + i.data["original_name"] = i.label + i.label = "_".join(i.label.split("_")[0:-1]) + + totaltaxset = set([i.label for i in tax.iternodes()]) + totaltaxorigset = set([i.data["original_name"] for i in tax.iternodes()]) + totaltaxsetnodes = {} + for i in tax.iternodes(): + totaltaxsetnodes[i.label] = i + paftaxset = set(paf.lvsnms()) + for i in tax.iternodes(): + i.data["bp"] = Bipart(set(i.lvsnms()),totaltaxset-set(i.lvsnms())) + for i in paf.iternodes(): + i.data["bp"] = Bipart(set(i.lvsnms()),paftaxset-set(i.lvsnms())) + i.data["skip"] = False + + for j in paf.iternodes(): + for i in tax.iternodes(): + if j.data["skip"]: + continue + if i.data["bp"].conflict(j.data["bp"]): + print("NONMONO",i.label,file=sys.stderr) + j.data["skip"] = True + + count= 0 + for i in paf.iternodes(order="postorder"): + if i == paf: + continue + if i.data["skip"]: + continue + + l = i.lvsnms() + l = list(set(l).intersection(totaltaxset)) + if len(i.children) < 2 or len(l) < 2: + continue + p = get_mrca_wnms(l,tax) #might want to walk back + lp = i.parent.lvsnms() + lp = list(set(lp).intersection(totaltaxset)) + pp = get_mrca_wnms(lp,tax) + if p == pp: + #print(l) + skip = False + for j in i.children: + if j.data["skip"] == True: + skip = True + break + if skip == True: + continue + chds = set([]) + for j in i.children: + s = set(j.lvsnms()).intersection(totaltaxset) + if len(s) == 0: + continue + for k in p.children: + ss = set(k.lvsnms()).intersection(s) + if len(ss) > 0: + chds.add(k) + break + if len(chds) > 1: + n = node.Node() + for j in chds: + p.remove_child(j) + n.add_child(j) + j.parent = n + p.add_child(n) + n.parent = p + #print(p.get_newick_repr()) + #k = walk_back_mrca(p,taxoriginal,paf.lvsnms()) + #if k.parent == None: + # continue + count += 1 + #if count == 20: + # break + for i in tax.iternodes(): + if len(i.children) == 0: + if "original_name" in i.data: + i.label = i.data["original_name"] + else: + i.label = "" + print(tax.get_newick_repr(False)+";") diff --git a/src/resolve_tree.py b/src/resolve_tree.py new file mode 100644 index 0000000..17fdedf --- /dev/null +++ b/src/resolve_tree.py @@ -0,0 +1,102 @@ +import sys + +import tree_reader +import tree_utils +import node + +def get_mrca_int(nms1,nms2,tr,verb): + nds1 = [] + for i in tr.iternodes(): + if i.label in nms1: + nds1.append(i) + nds2 = [] + for i in tr.iternodes(): + if i.label in nms2: + nds2.append(i) + ndsa = [i for i in nds1] + ndsa.extend(nds2) + if len(ndsa) == 0: + for i in tr.iternodes(): + print(i.label,file=sys.stderr) + print(nms1,nms2,file=sys.stderr) + sys.exit(0) + if len(ndsa) == 1: + pa = ndsa[0] + else: + pa = tree_utils.get_mrca(ndsa,tr) + x1 = None + x2 = None + for i in pa.children: + si = set([nd for nd in i.iternodes()]) + if set(nds1).intersection(si) == set(nds1): + x1 = i + if set(nds2).intersection(si) == set(nds2): + x2 = i + return x1,x2,pa + +def remove_knee(tr): + going = True + while going: + going = False + for i in tr.iternodes(): + if len(i.children) == 1: + going = True + p = i.parent + #print(p.get_newick_repr(False),file=sys.stderr) + c = i.children[0] + p.remove_child(i) + p.add_child(c) + if i.label != "": + if c.label == "": + c.label = i.label + elif p.label == "": + p.label = i.label + #print(p.get_newick_repr(False),file=sys.stderr) + +if __name__ == "__main__": + if len(sys.argv) != 3: + print("python",sys.argv[0],"bigtree smalltree") + sys.exit(0) + bt = tree_reader.read_tree_file_iter(sys.argv[1]).__next__() + st = tree_reader.read_tree_file_iter(sys.argv[2]).__next__() + + for i in bt.iternodes(): + i.data["original_name"] = i.label + i.label = i.label.split("_")[-1] + + #remove root st + nt = st.children[0] + if len(nt.leaves()) == 1: + nt = st.children[1] + + for i in nt.iternodes("postorder"): + verb = False + if len(i.leaves()) < 2: + continue + lvs1 = i.children[0].lvsnms() + lvs2 = i.children[1].lvsnms() + #print(lvs1,file=sys.stderr) + #print(lvs2,file=sys.stderr) + m1,m2,pa = get_mrca_int(lvs1,lvs2,bt,verb) + #print(m1.get_newick_repr(False),file=sys.stderr) + #print(m2.get_newick_repr(False),file=sys.stderr) + #print(pa.get_newick_repr(False),file=sys.stderr) + if m1 == m2: + continue + if m1 == None or m2 == None: + continue + nd = node.Node() + pa.remove_child(m1) + pa.remove_child(m2) + nd.add_child(m1) + nd.add_child(m2) + nd.length = i.length + m1.length = i.children[0].length + m2.length = i.children[1].length + pa.add_child(nd) + remove_knee(bt) + for i in bt.iternodes(): + if "original_name" in i.data: + i.label = i.data["original_name"] + print(bt.get_newick_repr(True)+";") + diff --git a/src/run_all_all_tips.py b/src/run_all_all_tips.py index 0ac5d2f..13f6aa8 100644 --- a/src/run_all_all_tips.py +++ b/src/run_all_all_tips.py @@ -4,18 +4,6 @@ import tree_reader import get_ncbi_tax_tree as gntt -db = "../poa.db" -f = "../gzzsseqs/poagzseqs/" - -cmd1 = "python ~/apps/PyPHLAWD/src/setup_clade_ap.py -b "+db+" -s "+f+" -t TAX -o . -l log" -cmd2 = "python ~/apps/PyPHLAWD/src/find_good_clusters_for_concat_batch.py -d DIR -b "+db -cmd3 = "python ~/apps/PyPHLAWD/src/add_outgroup_to_matrix.py -b "+db+" -m MATRIX -p PART -t TAX -o OUT -e EXT -s"+f -cmd4 = "iqtree -m GTR+G -s ALN -q PART -nt 4" - -def get_outgroup(leaf): - otax = "4613" - return otax - def get_table_names(fl): f =open(fl,"r") d = {} @@ -49,10 +37,18 @@ def clean_name(s): return s if __name__ == "__main__": - if len(sys.argv) != 2: - print("python",sys.argv[0],"tree") + if len(sys.argv) != 5: + print("python",sys.argv[0],"tree db seqdir ogroupnum") # like corncorn ../../corn.db ../../gzzsseqs/corngzseqs 41934 sys.exit(0) + db = sys.argv[2] + seqdir = sys.argv[3] + ogroup = sys.argv[4] + cmd1 = "python ~/apps/PyPHLAWD/src/setup_clade_ap.py -b "+db+" -s "+seqdir+" -t TAX -o . -l log" + cmd2 = "python ~/apps/PyPHLAWD/src/find_good_clusters_for_concat_batch.py -i -d DIR -b "+db + cmd3 = "python ~/apps/PyPHLAWD/src/add_outgroup_to_matrix.py -b "+db+" -m MATRIX -p PART -t TAX -o OUT -e EXT -s"+seqdir + cmd4 = "iqtree -m GTR+G -s ALN -q PART -nt 4" + t = tree_reader.read_tree_file_iter(sys.argv[1]).__next__() for i in t.leaves(): lf = i @@ -63,7 +59,7 @@ def clean_name(s): else: ftax = i.label tax = ftax.split("_")[-1] - #if tax != "3618": + #if tax != "2961870" and tax != "122249" and tax != "179090" and tax != "19933": # continue dir = tax+"_"+tax print(cmd1.replace("TAX",tax)) @@ -76,7 +72,7 @@ def clean_name(s): print("too small") os.system("rm -r TEMPDIR*") continue - otax = get_outgroup(lf) + otax = ogroup outm = mat+".outg" outp = mat+".outg.outpart" cc = cmd3.replace("MATRIX",mat).replace("PART",part).replace("OUT",outm).replace("TAX",otax).replace("EXT",tax) @@ -90,16 +86,22 @@ def clean_name(s): f.close() os.system(cmd4.replace("ALN",outm+".outaln").replace("PART",outp)) tf = outm+".outpart.treefile" - os.system("pxrr -t "+tf+" -g "+og+" > "+tf+".rr") - tf = tf+".rr" - ttf = tree_reader.read_tree_file_iter(tf).__next__() - rr = ttf.children[0] - if og in rr.lvsnms(): - rr = ttf.children[1] - tablenms = get_table_names(dir+"/"+dir+".table") - for j in rr.leaves(): - j.label = clean_name(tablenms[j.label]) - ft = open(tf+".final","w") - ft.write(rr.get_newick_repr(True)+";") - ft.close() + if og != None: + os.system("pxrr -t "+tf+" -g "+og+" > "+tf+".rr") + tf = tf+".rr" + ttf = tree_reader.read_tree_file_iter(tf).__next__() + rr = ttf.children[0] + if og in rr.lvsnms(): + rr = ttf.children[1] + tablenms = get_table_names(dir+"/"+dir+".table") + for j in rr.leaves(): + j.label = clean_name(tablenms[j.label]) + ft = open(tf+".final","w") + ft.write(rr.get_newick_repr(True)+";") + ft.close() + else: + rr = tree_reader.read_tree_file_iter(tf).__next__() + ft = open(tf+".final","w") + ft.write(rr.get_newick_repr(True)+";") + ft.close() os.system("rm -r TEMPDIR*") diff --git a/src/run_all_polys.py b/src/run_all_polys.py new file mode 100644 index 0000000..88fa367 --- /dev/null +++ b/src/run_all_polys.py @@ -0,0 +1,133 @@ +import sys +import os + +import tree_reader +import node +import seq + +ogdb = "../../api.db" +ogf = "../../gzzsseqs/apigzseqs/" + +cmd0 = "python ~/apps/PyPHLAWD/src/combine_datasets.py " +cmd3 = "python ~/apps/PyPHLAWD/src/add_outgroup_to_matrix.py -b "+ogdb+" -m MATRIX -p PART -t TAX -o OUT -s"+ogf +cmd4 = "iqtree -m GTR+G -s ALN -q PART -nt 4 -g CONS -redo " +cmd5 = "raxml -m GTRCAT -T 2 -g CONS -s ALN -p 12345 -n RAXRUN" +fn = "_outaln" +cmb1 = "TEMPTEMPCAT" +cmb2 = "TEMPTEMPPART" + +def get_constraint(nms,tre): + keep = [] + todel= [] + for i in tre.iternodes(): + if i.label in nms: + keep.append(i) + continue + if len(set([k.label for k in i.iternodes()]).intersection(set(nms))) == 0: + todel.append(i) + for i in todel: + p = i.parent + p.remove_child(i) + # remove knuckles + going = True + while going: + going = False + for i in tre.iternodes(): + if len(i.children) == 1: + going = True + if i.parent == None: + tre = i.children[0] + break + else: + p = i.parent + p.add_child(i.children[0]) + p.remove_child(i) + return tre + +def add_outgroup_constraint_write(cont,og): + nd2 = node.Node() + nd2.label = og + cont.add_child(nd2) + for i in cont.leaves(): + i.label=i.label.split("_")[-1] + fl = open("CONSTRAINT","w") + fl.write(cont.get_newick_repr(False)+";") + fl.close() + return "CONSTRAINT" + +if __name__ == "__main__": + if len(sys.argv) != 4: + print("python",sys.argv[0],"treefile maindir outgroup") + sys.exit(0) + tf = sys.argv[1] + md = sys.argv[2] + og = sys.argv[3] + if md[-1] != "/": + md += "/" + t = tree_reader.read_tree_file_iter(tf).__next__() + count = 0 + for i in t.iternodes("preorder"): + if len(i.children) >= 3: + tff = "poly."+str(count)+".treefile" + if os.path.isfile(tff): + count += 1 + continue + t2 = tree_reader.read_tree_file_iter(tf).__next__() + chds = [] #intended to have the ids of what we will look for in the directories + stack = [j for j in i.children] + while len(stack) > 0: + p = stack.pop() + if p.label != "": + chds.append(p) + continue + else: + for j in p.children: + stack.append(j) + labs = [j.label for j in chds] + print("- "+" ".join(labs)) + dirs = [] + for j in labs: + jl = j.split("_")[-1] + mdjl = md+jl+"_"+jl + if os.path.exists(mdjl): + dirs.append(mdjl+"/"+jl+"_"+jl+fn) + cmd = cmd0+" ".join(dirs) + print(cmd) + os.system(cmd) + tips = set(seq.read_fasta_file_return_dict("TEMPTEMPCAT").keys()) + if len(tips) == 0: + continue + labs = [j for j in labs if j.split("_")[-1] in tips] + if len(labs) < 3: + continue + cont = get_constraint(labs,t2) + cmd = cmd3.replace("MATRIX",cmb1).replace("PART",cmb2).replace("TAX",og).replace("OUT",cmb1+".og") + print(cmd) + os.system(cmd) + tb = "TEMPTEMPCAT.og.table" + og = "" + f = open(tb,"r") + for j in f: + og = j.strip().split("\t")[1] + break + f.close() + fln = add_outgroup_constraint_write(cont,og) + mv1 = "TEMPTEMPCAT.og.outaln" + mv2 = "TEMPTEMPCAT.og.outpart" + #cmd = cmd4.replace("ALN",mv1).replace("PART",mv2).replace("CONS",fln) + cmd = cmd5.replace("ALN",mv1).replace("CONS",fln) + print(cmd) + os.system(cmd) + tff = "RAxML_bestTree.RAXRUN" + os.system("mv "+mv1+" poly."+str(count)+".outaln") + os.system("mv "+mv2+" poly."+str(count)+".outpart") + os.system("mv "+tff+" poly."+str(count)+".treefile") + tff = "poly."+str(count)+".treefile" + os.system("pxrr -t "+tff+" -g "+og+" > "+tff+".rr") + rm1 = "TEMPTEMPCAT.og.tempdbfa*" + rm2 = "RAxML*" + #rm2 = "TEMPTEMPCAT.og.outpart.*" + os.system("rm "+rm1) + os.system("rm "+ rm2) + count += 1 +