more

FePhyFoFum · Aug 2, 2024 · 20da10f · 20da10f
1 parent 3d0e5c1
commit 20da10f
Show file tree

Hide file tree

Showing 12 changed files with 781 additions and 66 deletions.
diff --git a/src/add_genera_to_paftol_join.py b/src/add_genera_to_paftol_join.py
@@ -18,7 +18,8 @@
         for i in t1.iternodes():
             if i.label.split("_")[-1] == gn:
                 par = i.parent
+                t2.label = i.label
                 par.remove_child(i)
                 par.add_child(t2)
                 break
-    print(t1.get_newick_repr(False)+";")
+    print(t1.get_newick_repr(True)+";")
diff --git a/src/add_outgroup_to_matrix.py b/src/add_outgroup_to_matrix.py
@@ -39,27 +39,54 @@ def construct_db_of_parts(infile,infileparts,outprefix):
     genesfn = {} #key name, value temp name
     genesf = {} #key name, value open file
     dbfn = outprefix+".tempdbfa"
-    tempoutf = open(dbfn,"w")
-    for i in infileparts:
-        spls = i.strip().split(" ")
-        name = spls[1]
-        shortname = name.split("/")[-1]
-        rnges = spls[-1].split("-")
-        beg = int(rnges[0])
-        end = int(rnges[1])
-        genes[name] = [beg,end]
-        genesn.append(name)
-        genesfn[name] = outprefix+"."+shortname #open file should be the shortname
-        genesf[name] = open(genesfn[name],"w")
-    for i in seq.read_fasta_file_iter(infile):
-        for j in genesn:
-            b,e = genes[j]
-            if len(i.seq[b-1:e].replace("-","")) > 100:
-                tempoutf.write(">"+j+"___"+i.name+"\n"+i.seq[b-1:e].replace("-","")+"\n")
-                genesf[j].write(">"+i.name+"\n"+i.seq[b-1:e]+"\n")
-    tempoutf.close()
-    for i in genesn:
-        genesf[name].close()
+    lf = 0 # how many lines
+    for _ in infileparts:
+        lf += 1
+    infileparts.seek(0)
+    if lf < 1000:
+        tempoutf = open(dbfn,"w")
+        for i in infileparts:
+            spls = i.strip().split(" ")
+            name = spls[1]
+            shortname = name.split("/")[-1]
+            rnges = spls[-1].split("-")
+            beg = int(rnges[0])
+            end = int(rnges[1])
+            genes[name] = [beg,end]
+            genesn.append(name)
+            genesfn[name] = outprefix+"."+shortname #open file should be the shortname
+            genesf[name] = open(genesfn[name],"w")
+        for i in seq.read_fasta_file_iter(infile):
+            for j in genesn:
+                b,e = genes[j]
+                if len(i.seq[b-1:e].replace("-","")) > 100:
+                    tempoutf.write(">"+j+"___"+i.name+"\n"+i.seq[b-1:e].replace("-","")+"\n")
+                    genesf[j].write(">"+i.name+"\n"+i.seq[b-1:e]+"\n")
+        tempoutf.close()
+        for i in genesn:
+            genesf[name].close()
+    else:
+        tempoutf = open(dbfn,"w")
+        for i in infileparts:
+            spls = i.strip().split(" ")
+            name = spls[1]
+            shortname = name.split("/")[-1]
+            rnges = spls[-1].split("-")
+            beg = int(rnges[0])
+            end = int(rnges[1])
+            genes[name] = [beg,end]
+            genesn.append(name)
+            genesfn[name] = outprefix+"."+shortname #open file should be the shortname
+            genesf[name] = genesfn[name]
+        for i in seq.read_fasta_file_iter(infile):
+            for j in genesn:
+                gfo = open(genesf[j],"a")
+                b,e = genes[j]
+                if len(i.seq[b-1:e].replace("-","")) > 100:
+                    tempoutf.write(">"+j+"___"+i.name+"\n"+i.seq[b-1:e].replace("-","")+"\n")
+                    gfo.write(">"+i.name+"\n"+i.seq[b-1:e]+"\n")
+                gfo.close()
+        tempoutf.close()
     cmd = "makeblastdb -in "+dbfn+" -out "+dbfn+".db -dbtype nucl"# > /dev/null 2>&1"
     os.system(cmd)
     os.remove(dbfn)

diff --git a/src/change_ncbi_to_name_list.py b/src/change_ncbi_to_name_list.py
@@ -0,0 +1,45 @@
+import sys
+import sqlite3
+import argparse as ap
+
+
+def generate_argparser():
+    """Return the command-line parser for this script; all three of
+    -d/--db, -i/--infile and -o/--outfile are required."""
+    parser = ap.ArgumentParser(prog="change_ncbi_to_name_list.py")
+    parser.add_argument("-d", "--db", type=str, help="NCBI database", required=True)
+    parser.add_argument("-i", "--infile", type=str, help="Input list", required=True)
+    parser.add_argument("-o", "--outfile", type=str, help="Output list", required=True)
+    return parser
+
+if __name__ == "__main__":
+    parser = generate_argparser()
+    if len(sys.argv[1:]) == 0:
+        sys.argv.append("-h")
+    args = parser.parse_args(sys.argv[1:])
+
+    # For each NCBI id in the input list (one per line), write one line
+    # "<family>\t<scientific name>" to the output list.
+    conn = sqlite3.connect(args.db)
+    c = conn.cursor()
+    with open(args.infile,"r") as of, open(args.outfile,"w") as oof:
+        for i in of:
+            i = i.strip()
+            c.execute("select name_class,edited_name,left_value,right_value from taxonomy where ncbi_id = ?", (i, ))
+            nm = ""
+            lf = ""
+            rt = ""
+            for k in c:
+                if str(k[0]) == "scientific name":
+                    nm = str(k[1])
+                    lf = str(k[2])
+                    rt = str(k[3])
+            # look for a family-rank row whose left/right values bracket this taxon's
+            c.execute("select edited_name from taxonomy where name_class = ? and left_value < ? and right_value > ? and node_rank = ?",("scientific name",lf,rt,"family"))
+            fam = ""
+            ft = c.fetchone()
+            # fetchone() gives None when no row matched; leave the family blank then
+            if ft is not None:
+                fam = ft[0]
+            oof.write(fam+"\t"+nm+"\n")
+    conn.close()
diff --git a/src/combine_datasets.py b/src/combine_datasets.py
@@ -7,6 +7,8 @@
 import seq
 from conf import perc_identity,evalue_limit,nthread
 
+USEORIGINALNAMES = True
+
 print(perc_identity,evalue_limit)
 def run_blast(blastdb,filen):
     cmd = "blastn -task blastn -db "+blastdb+".db -query "+filen+" -perc_identity "+str(perc_identity)+" -evalue "+str(evalue_limit)+" -num_threads "+str(nthread)+" -max_target_seqs 10000000 -out "+filen+".rawblastn -outfmt '6 qseqid qlen sseqid slen frames pident nident length mismatch gapopen qstart qend sstart send evalue bitscore' 2> NOPE"
@@ -43,7 +45,7 @@ def construct_db_of_parts(infile,infileparts,outprefix):
     tempoutf.close()
     for i in genesn:
         genesf[name].close()
-    cmd = "makeblastdb -in "+dbfn+" -out "+dbfn+".db -dbtype nucl"# > /dev/null 2>&1"
+    cmd = "makeblastdb -in "+dbfn+" -out "+dbfn+".db -dbtype nucl > /dev/null 2>&1"
     os.system(cmd)
     os.remove(dbfn)
     return dbfn,genes,genesfn
@@ -87,9 +89,14 @@ class NewGene:
     def __init__(self):
         self.geneset = set([])
         self.genes = {}
+        self.altnames = {} 
 
-    def addgene(self,la,fl):
+    def addgene(self,la,fl,altname = ""):
         self.genes[la] = fl
+        if altname == "":
+            self.altnames[la] = la
+        else:
+            self.altnames[la] = altname
 
     def issame(self,gene1,gene2):
         if gene1 in self.geneset:
@@ -103,7 +110,7 @@ def issame(self,gene1,gene2):
     def writefile(self,fn):
         fn = open(fn,"w")
         for i in self.genes:
-            fn.write(">"+i+"\n"+get_seq_from_file(i,self.genes[i])+"\n")
+            fn.write(">"+self.altnames[i]+"\n"+get_seq_from_file(i,self.genes[i])+"\n")
         fn.close()
 
     def __str__(self):
@@ -118,20 +125,29 @@ def generate_dataset(tips,files,outf):
                 n = NewGene()
                 n.geneset.add(lj[0])
                 n.geneset.add(lj[1])
-                n.addgene(i,j['lg'][i])
+                if USEORIGINALNAMES == False:
+                    n.addgene(i,j['lg'][i])
+                else:
+                    n.addgene(i,j['lg'][i],altname=j['lg'][i].split("_")[0])
                 ngs.append(n)
             else:
                 t = False
                 for k in ngs:
                     if k.issame(lj[0],lj[1]):
-                        k.addgene(i,j['lg'][i])
+                        if USEORIGINALNAMES == False:
+                            k.addgene(i,j['lg'][i])
+                        else:
+                            k.addgene(i,j['lg'][i],altname=j['lg'][i].split("_")[0])
                         t = True
                         break
                 if t == False:
                     n = NewGene()
                     n.geneset.add(lj[0])
                     n.geneset.add(lj[1])
-                    n.addgene(i,j['lg'][i])
+                    if USEORIGINALNAMES == False:
+                        n.addgene(i,j['lg'][i])
+                    else:
+                        n.addgene(i,j['lg'][i],altname=j['lg'][i].split("_")[0])
                     ngs.append(n)
     count = 0 
     ffs = []
@@ -145,6 +161,8 @@ def generate_dataset(tips,files,outf):
         count += 1
     cmd = "pxcat -s "+" ".join(ffs)+" -o TEMPTEMPCAT -p TEMPTEMPPART"
     os.system(cmd)
+    for i in ffs:
+        os.remove(i)
     return
 
 if __name__ == "__main__":
@@ -157,16 +175,18 @@ def generate_dataset(tips,files,outf):
     flp = []
     for i in fls:
         flt.append(get_gene_ids(i))
-        flp.append(open(i.replace("outaln","outpart"),"r"))
+        flp.append(i.replace("outaln","outpart"))
     bdbs = []
     prts = []
     gens = []
     count = 0
     for i,j in zip(fls,flp):
-        blastdb,parts,genesfn = construct_db_of_parts(i,j,"TEST"+str(count)) # this will be based on names
+        jf = open(j,"r")
+        blastdb,parts,genesfn = construct_db_of_parts(i,jf,"TEST"+str(count)) # this will be based on names
         bdbs.append(blastdb)
         prts.append(parts)
         gens.append(genesfn)
+        jf.close()
         count += 1
     biggraph = nx.MultiGraph()
     for i in flt:
@@ -213,8 +233,12 @@ def generate_dataset(tips,files,outf):
     tips = []
     vals = {}
     for i in potential_tips:
+        if len(i) == 0:
+            continue
         x = random.choice(list(i.keys()))
-        #print(x,i[x])
+        print(x,i[x])
         tips.append(x)
         vals[x]=i[x]
     generate_dataset(tips,vals,"TEMPTEMP.fa")
+    #cleanup
+    os.system("rm TEST*.rn TEST*.rawblastn TEST*.tempdbfa.* TEMPTEMP.fa*")
diff --git a/src/get_ncbi_tax_tree_no_species.py b/src/get_ncbi_tax_tree_no_species.py
@@ -63,6 +63,9 @@ def construct_tree(taxon, db, taxalist = None):
         id = stack.pop()
         if id in done:
             continue
+        # added, if some error, remove but add on line 81
+        if id not in nodes:
+            continue
         done.add(id)
         c.execute("select ncbi_id,name,name_class,edited_name from taxonomy where parent_ncbi_id = ? and node_rank != 'species'",(id,))
         childs = []

diff --git a/src/get_ncbi_tax_tree_no_species_genus.py b/src/get_ncbi_tax_tree_no_species_genus.py
@@ -6,6 +6,11 @@
 else:
     import node
 
+"""
+this is specifically for use with the paftol like trees because
+of their focus on genera. 
+"""
+
 exclude_un_en = True
 
 def clean_name(name):
@@ -66,12 +71,18 @@ def construct_tree(taxon, db, taxalist = None):
         done.add(id)
         c.execute("select ncbi_id,name,name_class,edited_name,node_rank from taxonomy where parent_ncbi_id = ? and node_rank != 'species'",(id,))
         childs = []
-        for j in c:
+        testc = [j for j in c]
+        for j in testc:
             nr = str(j[4])
             tid = str(j[0])
             if includelist != None and tid not in includelist:
                 continue
-            if "nom. ined" in str(j[1]) or "x "  == str(j[1][0:2]) or " x " in str(j[1]) or "unclassified" in str(j[1]) or "environmental" in str(j[1]) or "incertae" in str(j[1]):
+            if "nom. ined" in str(j[1]) or "x "  == str(j[1][0:2]) or " x " in str(j[1]) or "unclassified" in str(j[1]) or "environmental" in str(j[1]):# or "incertae" in str(j[1]):
+                continue
+            # check to see if it has children. if not. skip it.
+            c.execute("select ncbi_id from taxonomy where parent_ncbi_id=?",(tid,))
+            tttest = [k for k in c]
+            if len(tttest) == 0:
                 continue
             childs.append(tid)
             if nr != "genus":

diff --git a/src/get_subset_genbank.py b/src/get_subset_genbank.py
@@ -12,6 +12,10 @@
 
 """
 
+def clean_name(nm):
+    """Return nm with every comma replaced by a single space."""
+    return nm.replace(",", " ")
+
 def get_seq_from_gz(gzdir, filename, idtoget):
     fl = gzip.open(gzdir+"/"+filename,"r")
     for i in fl:
@@ -102,7 +106,7 @@ def make_files_with_id(taxonid, DB,outfilen,outfile_tbln, gzfileloc,
             if tfilen not in files_ids:
                 files_ids[tfilen] = []
             files_ids[tfilen].append(str(j[2]))
-            ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(tname),str(j[5])]
+            ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(clean_name(tname)),str(j[5])]
         c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(id,))
         childs = []
         l = c.fetchall()
@@ -205,7 +209,7 @@ def make_files_with_id_internal(taxonid, DB,outfilen,outfile_tbln,gzfileloc,
         if tfilen not in files_ids:
             files_ids[tfilen] = []
         files_ids[tfilen].append(str(j[2]))
-        ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(tname),str(j[5])]
+        ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(clean_name(tname)),str(j[5])]
     # get the children of the taxon that have no children (and so the sequences would go here)
     keepers = []
     c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(str(taxonid),))
@@ -266,7 +270,7 @@ def make_files_with_id_internal(taxonid, DB,outfilen,outfile_tbln,gzfileloc,
                 files_ids[tfilen] = []
             if str(j[1]) in keepers:
                 files_ids[tfilen].append(str(j[2]))
-            ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(tname),str(j[5])]
+            ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(clean_name(tname)),str(j[5])]
             tblst = "\t".join(ids_props[str(j[2])])
             outfile_tbl.write(tblst+"\n")
         c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(id,))
@@ -347,7 +351,7 @@ def make_files_with_id_justtable(taxonid, DB,outfile_tbln):
         c.execute("select * from sequence where ncbi_id = ?",(id,))
         l = c.fetchall()
         for j in l:
-            tbls = str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(tname)+"\t"+str(j[5])+"\t"+str(j[6])
+            tbls = str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(clean_name(tname))+"\t"+str(j[5])+"\t"+str(j[6])
             if outfile_tbln != None:
                 outfile_tbl.write(tbls+"\n")
             else:
@@ -384,7 +388,7 @@ def make_files(taxon, DB,outfilen,outfile_tbln):
         for j in l:
             outfile.write(">"+str(j[3])+"\n")
             outfile.write(str(j[7])+"\n")
-            outfile_tbl.write(str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(tname)+"\t"+str(j[4])+"\n")
+            outfile_tbl.write(str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(clean_name(tname))+"\t"+str(j[4])+"\n")
         c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(id,))
         childs = []
         l = c.fetchall()