Skip to content

Commit

Permalink
more
Browse files Browse the repository at this point in the history
  • Loading branch information
blackrim committed Aug 2, 2024
1 parent 3d0e5c1 commit 20da10f
Show file tree
Hide file tree
Showing 12 changed files with 781 additions and 66 deletions.
3 changes: 2 additions & 1 deletion src/add_genera_to_paftol_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@
for i in t1.iternodes():
if i.label.split("_")[-1] == gn:
par = i.parent
t2.label = i.label
par.remove_child(i)
par.add_child(t2)
break
print(t1.get_newick_repr(False)+";")
print(t1.get_newick_repr(True)+";")
69 changes: 48 additions & 21 deletions src/add_outgroup_to_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,27 +39,54 @@ def construct_db_of_parts(infile,infileparts,outprefix):
genesfn = {} #key name, value temp name
genesf = {} #key name, value open file
dbfn = outprefix+".tempdbfa"
tempoutf = open(dbfn,"w")
for i in infileparts:
spls = i.strip().split(" ")
name = spls[1]
shortname = name.split("/")[-1]
rnges = spls[-1].split("-")
beg = int(rnges[0])
end = int(rnges[1])
genes[name] = [beg,end]
genesn.append(name)
genesfn[name] = outprefix+"."+shortname #open file should be the shortname
genesf[name] = open(genesfn[name],"w")
for i in seq.read_fasta_file_iter(infile):
for j in genesn:
b,e = genes[j]
if len(i.seq[b-1:e].replace("-","")) > 100:
tempoutf.write(">"+j+"___"+i.name+"\n"+i.seq[b-1:e].replace("-","")+"\n")
genesf[j].write(">"+i.name+"\n"+i.seq[b-1:e]+"\n")
tempoutf.close()
for i in genesn:
genesf[name].close()
lf = 0 # how many lines
for _ in infileparts:
lf += 1
infileparts.seek(0)
if lf < 1000:
tempoutf = open(dbfn,"w")
for i in infileparts:
spls = i.strip().split(" ")
name = spls[1]
shortname = name.split("/")[-1]
rnges = spls[-1].split("-")
beg = int(rnges[0])
end = int(rnges[1])
genes[name] = [beg,end]
genesn.append(name)
genesfn[name] = outprefix+"."+shortname #open file should be the shortname
genesf[name] = open(genesfn[name],"w")
for i in seq.read_fasta_file_iter(infile):
for j in genesn:
b,e = genes[j]
if len(i.seq[b-1:e].replace("-","")) > 100:
tempoutf.write(">"+j+"___"+i.name+"\n"+i.seq[b-1:e].replace("-","")+"\n")
genesf[j].write(">"+i.name+"\n"+i.seq[b-1:e]+"\n")
tempoutf.close()
for i in genesn:
genesf[name].close()
else:
tempoutf = open(dbfn,"w")
for i in infileparts:
spls = i.strip().split(" ")
name = spls[1]
shortname = name.split("/")[-1]
rnges = spls[-1].split("-")
beg = int(rnges[0])
end = int(rnges[1])
genes[name] = [beg,end]
genesn.append(name)
genesfn[name] = outprefix+"."+shortname #open file should be the shortname
genesf[name] = genesfn[name]
for i in seq.read_fasta_file_iter(infile):
for j in genesn:
gfo = open(genesf[j],"a")
b,e = genes[j]
if len(i.seq[b-1:e].replace("-","")) > 100:
tempoutf.write(">"+j+"___"+i.name+"\n"+i.seq[b-1:e].replace("-","")+"\n")
gfo.write(">"+i.name+"\n"+i.seq[b-1:e]+"\n")
gfo.close()
tempoutf.close()
cmd = "makeblastdb -in "+dbfn+" -out "+dbfn+".db -dbtype nucl"# > /dev/null 2>&1"
os.system(cmd)
os.remove(dbfn)
Expand Down
45 changes: 45 additions & 0 deletions src/change_ncbi_to_name_list.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import sys
import sqlite3
import argparse as ap


def generate_argparser():
    """Build the command-line parser for this script.

    Returns:
        The configured ``argparse.ArgumentParser``. It defines three
        required string options: ``-d/--db``, ``-i/--infile`` and
        ``-o/--outfile``.
    """
    # Build the parser exactly once. Previously the configured parser was
    # immediately replaced by a bare ArgumentParser(), which silently dropped
    # the prog name and the defaults-showing help formatter. The prog name is
    # also corrected to this script's own file name.
    parser = ap.ArgumentParser(
        prog="change_ncbi_to_name_list.py",
        formatter_class=ap.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("-d", "--db", type=str, help="NCBI database", required=True)
    parser.add_argument("-i", "--infile", type=str, help="Input list", required=True)
    parser.add_argument("-o", "--outfile", type=str, help="Output list", required=True)
    return parser

def lookup_family_and_name(c, ncbi_id):
    """Look up the scientific name and enclosing family of one taxon.

    Args:
        c: an open ``sqlite3`` cursor on a database with a ``taxonomy`` table
            (columns used: ncbi_id, name_class, edited_name, left_value,
            right_value, node_rank).
        ncbi_id: the NCBI taxon id to look up, as read from the input list.

    Returns:
        A ``(family, name)`` tuple of strings. ``name`` is the
        ``edited_name`` of the taxon's "scientific name" row. ``family`` is
        the ``edited_name`` of a "scientific name" row of rank "family" whose
        left/right values enclose the taxon's. Either is ``""`` when no such
        row exists.
    """
    c.execute("select name_class,edited_name,left_value,right_value from taxonomy where ncbi_id = ?", (ncbi_id, ))
    nm = ""
    lf = ""
    rt = ""
    found = False
    for k in c.fetchall():
        if str(k[0]) == "scientific name":
            nm = str(k[1])
            lf = str(k[2])
            rt = str(k[3])
            found = True
    if not found:
        # Unknown id (or no scientific-name row): there are no bounds to
        # search with, so skip the family query instead of running it with
        # empty strings.
        return "", ""
    c.execute("select edited_name from taxonomy where name_class = ? and left_value < ? and right_value > ? and node_rank = ?",("scientific name",lf,rt,"family"))
    ft = c.fetchone()
    # fetchone() returns None (not an empty tuple) when there is no enclosing
    # family; the previous len(ft) check raised TypeError in that case.
    fam = ""
    if ft is not None and ft[0] is not None:
        fam = str(ft[0])
    return fam, nm


if __name__ == "__main__":
    parser = generate_argparser()
    # With no arguments at all, show the help text instead of an error.
    if len(sys.argv[1:]) == 0:
        sys.argv.append("-h")
    args = parser.parse_args(sys.argv[1:])

    conn = sqlite3.connect(args.db)
    try:
        c = conn.cursor()
        # One output line ("family<TAB>name") per input line, in input order.
        with open(args.infile,"r") as of, open(args.outfile,"w") as oof:
            for i in of:
                fam, nm = lookup_family_and_name(c, i.strip())
                oof.write(fam+"\t"+nm+"\n")
    finally:
        # Release the database handle even if a lookup or write fails.
        conn.close()
42 changes: 33 additions & 9 deletions src/combine_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
import seq
from conf import perc_identity,evalue_limit,nthread

USEORIGINALNAMES = True

print(perc_identity,evalue_limit)
def run_blast(blastdb,filen):
cmd = "blastn -task blastn -db "+blastdb+".db -query "+filen+" -perc_identity "+str(perc_identity)+" -evalue "+str(evalue_limit)+" -num_threads "+str(nthread)+" -max_target_seqs 10000000 -out "+filen+".rawblastn -outfmt '6 qseqid qlen sseqid slen frames pident nident length mismatch gapopen qstart qend sstart send evalue bitscore' 2> NOPE"
Expand Down Expand Up @@ -43,7 +45,7 @@ def construct_db_of_parts(infile,infileparts,outprefix):
tempoutf.close()
for i in genesn:
genesf[name].close()
cmd = "makeblastdb -in "+dbfn+" -out "+dbfn+".db -dbtype nucl"# > /dev/null 2>&1"
cmd = "makeblastdb -in "+dbfn+" -out "+dbfn+".db -dbtype nucl > /dev/null 2>&1"
os.system(cmd)
os.remove(dbfn)
return dbfn,genes,genesfn
Expand Down Expand Up @@ -87,9 +89,14 @@ class NewGene:
def __init__(self):
self.geneset = set([])
self.genes = {}
self.altnames = {}

def addgene(self,la,fl):
def addgene(self,la,fl,altname = ""):
self.genes[la] = fl
if altname == "":
self.altnames[la] = la
else:
self.altnames[la] = altname

def issame(self,gene1,gene2):
if gene1 in self.geneset:
Expand All @@ -103,7 +110,7 @@ def issame(self,gene1,gene2):
def writefile(self,fn):
fn = open(fn,"w")
for i in self.genes:
fn.write(">"+i+"\n"+get_seq_from_file(i,self.genes[i])+"\n")
fn.write(">"+self.altnames[i]+"\n"+get_seq_from_file(i,self.genes[i])+"\n")
fn.close()

def __str__(self):
Expand All @@ -118,20 +125,29 @@ def generate_dataset(tips,files,outf):
n = NewGene()
n.geneset.add(lj[0])
n.geneset.add(lj[1])
n.addgene(i,j['lg'][i])
if USEORIGINALNAMES == False:
n.addgene(i,j['lg'][i])
else:
n.addgene(i,j['lg'][i],altname=j['lg'][i].split("_")[0])
ngs.append(n)
else:
t = False
for k in ngs:
if k.issame(lj[0],lj[1]):
k.addgene(i,j['lg'][i])
if USEORIGINALNAMES == False:
k.addgene(i,j['lg'][i])
else:
k.addgene(i,j['lg'][i],altname=j['lg'][i].split("_")[0])
t = True
break
if t == False:
n = NewGene()
n.geneset.add(lj[0])
n.geneset.add(lj[1])
n.addgene(i,j['lg'][i])
if USEORIGINALNAMES == False:
n.addgene(i,j['lg'][i])
else:
n.addgene(i,j['lg'][i],altname=j['lg'][i].split("_")[0])
ngs.append(n)
count = 0
ffs = []
Expand All @@ -145,6 +161,8 @@ def generate_dataset(tips,files,outf):
count += 1
cmd = "pxcat -s "+" ".join(ffs)+" -o TEMPTEMPCAT -p TEMPTEMPPART"
os.system(cmd)
for i in ffs:
os.remove(i)
return

if __name__ == "__main__":
Expand All @@ -157,16 +175,18 @@ def generate_dataset(tips,files,outf):
flp = []
for i in fls:
flt.append(get_gene_ids(i))
flp.append(open(i.replace("outaln","outpart"),"r"))
flp.append(i.replace("outaln","outpart"))
bdbs = []
prts = []
gens = []
count = 0
for i,j in zip(fls,flp):
blastdb,parts,genesfn = construct_db_of_parts(i,j,"TEST"+str(count)) # this will be based on names
jf = open(j,"r")
blastdb,parts,genesfn = construct_db_of_parts(i,jf,"TEST"+str(count)) # this will be based on names
bdbs.append(blastdb)
prts.append(parts)
gens.append(genesfn)
jf.close()
count += 1
biggraph = nx.MultiGraph()
for i in flt:
Expand Down Expand Up @@ -213,8 +233,12 @@ def generate_dataset(tips,files,outf):
tips = []
vals = {}
for i in potential_tips:
if len(i) == 0:
continue
x = random.choice(list(i.keys()))
#print(x,i[x])
print(x,i[x])
tips.append(x)
vals[x]=i[x]
generate_dataset(tips,vals,"TEMPTEMP.fa")
#cleanup
os.system("rm TEST*.rn TEST*.rawblastn TEST*.tempdbfa.* TEMPTEMP.fa*")
3 changes: 3 additions & 0 deletions src/get_ncbi_tax_tree_no_species.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,9 @@ def construct_tree(taxon, db, taxalist = None):
id = stack.pop()
if id in done:
continue
# added, if some error, remove but add on line 81
if id not in nodes:
continue
done.add(id)
c.execute("select ncbi_id,name,name_class,edited_name from taxonomy where parent_ncbi_id = ? and node_rank != 'species'",(id,))
childs = []
Expand Down
15 changes: 13 additions & 2 deletions src/get_ncbi_tax_tree_no_species_genus.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,11 @@
else:
import node

"""
This is specifically for use with PAFTOL-like trees because
of their focus on genera.
"""

exclude_un_en = True

def clean_name(name):
Expand Down Expand Up @@ -66,12 +71,18 @@ def construct_tree(taxon, db, taxalist = None):
done.add(id)
c.execute("select ncbi_id,name,name_class,edited_name,node_rank from taxonomy where parent_ncbi_id = ? and node_rank != 'species'",(id,))
childs = []
for j in c:
testc = [j for j in c]
for j in testc:
nr = str(j[4])
tid = str(j[0])
if includelist != None and tid not in includelist:
continue
if "nom. ined" in str(j[1]) or "x " == str(j[1][0:2]) or " x " in str(j[1]) or "unclassified" in str(j[1]) or "environmental" in str(j[1]) or "incertae" in str(j[1]):
if "nom. ined" in str(j[1]) or "x " == str(j[1][0:2]) or " x " in str(j[1]) or "unclassified" in str(j[1]) or "environmental" in str(j[1]):# or "incertae" in str(j[1]):
continue
# check to see if it has children. if not. skip it.
c.execute("select ncbi_id from taxonomy where parent_ncbi_id=?",(tid,))
tttest = [k for k in c]
if len(tttest) == 0:
continue
childs.append(tid)
if nr != "genus":
Expand Down
14 changes: 9 additions & 5 deletions src/get_subset_genbank.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@
"""

def clean_name(nm):
    """Return *nm* with every comma replaced by a single space."""
    return nm.replace(",", " ")

def get_seq_from_gz(gzdir, filename, idtoget):
fl = gzip.open(gzdir+"/"+filename,"r")
for i in fl:
Expand Down Expand Up @@ -102,7 +106,7 @@ def make_files_with_id(taxonid, DB,outfilen,outfile_tbln, gzfileloc,
if tfilen not in files_ids:
files_ids[tfilen] = []
files_ids[tfilen].append(str(j[2]))
ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(tname),str(j[5])]
ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(clean_name(tname)),str(j[5])]
c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(id,))
childs = []
l = c.fetchall()
Expand Down Expand Up @@ -205,7 +209,7 @@ def make_files_with_id_internal(taxonid, DB,outfilen,outfile_tbln,gzfileloc,
if tfilen not in files_ids:
files_ids[tfilen] = []
files_ids[tfilen].append(str(j[2]))
ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(tname),str(j[5])]
ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(clean_name(tname)),str(j[5])]
# get the children of the taxon that have no children (and so the sequences would go here)
keepers = []
c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(str(taxonid),))
Expand Down Expand Up @@ -266,7 +270,7 @@ def make_files_with_id_internal(taxonid, DB,outfilen,outfile_tbln,gzfileloc,
files_ids[tfilen] = []
if str(j[1]) in keepers:
files_ids[tfilen].append(str(j[2]))
ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(tname),str(j[5])]
ids_props[str(j[2])] = [str(j[0]),str(j[1]),str(j[2]),str(j[3]),str(clean_name(tname)),str(j[5])]
tblst = "\t".join(ids_props[str(j[2])])
outfile_tbl.write(tblst+"\n")
c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(id,))
Expand Down Expand Up @@ -347,7 +351,7 @@ def make_files_with_id_justtable(taxonid, DB,outfile_tbln):
c.execute("select * from sequence where ncbi_id = ?",(id,))
l = c.fetchall()
for j in l:
tbls = str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(tname)+"\t"+str(j[5])+"\t"+str(j[6])
tbls = str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(clean_name(tname))+"\t"+str(j[5])+"\t"+str(j[6])
if outfile_tbln != None:
outfile_tbl.write(tbls+"\n")
else:
Expand Down Expand Up @@ -384,7 +388,7 @@ def make_files(taxon, DB,outfilen,outfile_tbln):
for j in l:
outfile.write(">"+str(j[3])+"\n")
outfile.write(str(j[7])+"\n")
outfile_tbl.write(str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(tname)+"\t"+str(j[4])+"\n")
outfile_tbl.write(str(j[0])+"\t"+str(j[1])+"\t"+str(j[2])+"\t"+str(j[3])+"\t"+str(clean_name(tname))+"\t"+str(j[4])+"\n")
c.execute("select ncbi_id from taxonomy where parent_ncbi_id = ?",(id,))
childs = []
l = c.fetchall()
Expand Down
Loading

0 comments on commit 20da10f

Please sign in to comment.