-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMake_job_mmseqs2.py
154 lines (130 loc) · 8.27 KB
/
Make_job_mmseqs2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/python
import pandas as pd
import numpy as np
import sys
import argparse
import os
import subprocess
# Generate one Slurm batch script per species that runs MMseqs2
# (createdb -> search -> convertalis) for that species' genome.
#
# Example usage:
#   python3 Make_job_mmseqs2.py \
#       -i /beegfs/data/bguinet/these/Species_genome_names.txt \
#       -t virus \
#       -db /beegfs/data/bguinet/these/NCBI_protein_viruses/mmseqs2_viral_db \
#       -o /beegfs/home/bguinet/these_scripts/Mmseqs2_jobs \
#       -p <path of the Project directory>

# MMseqs2 executable assigned to $mmseqs2 inside every generated job.
MMSEQS_BIN = "/beegfs/home/bguinet/TOOLS/MMseqs2/build/bin/mmseqs"

# Column list passed to `mmseqs convertalis --format-output`.
FORMAT_OUTPUT = ("query,tlen,target,pident,alnlen,mismatch,gapopen,"
                 "qstart,qend,tstart,tend,evalue,bits")

# Accepted (case-insensitive) spellings of the species type and the one-letter
# code used in the run directory name (run_mmseqs2_V / run_mmseqs2_H).
_SPECIES_CODES = {"virus": "V", "v": "V", "hymenoptera": "H", "h": "H"}

# Prefix of the job file, job name and log file for each species code.
_JOB_PREFIXES = {"V": "Viral", "H": "Busco"}

# NOTE(review): the text after "#SBATCH" looks like an e-mail-obfuscation
# placeholder rather than a real directive; it is kept unchanged here.
# Confirm the intended `--mail-user=<address>` value before submitting jobs.
_MAIL_USER_DIRECTIVE = "#SBATCH [email protected]"

# Bash removes a backslash-newline pair without inserting whitespace, so the
# space before the backslash is what keeps consecutive arguments separated.
_CONTINUATION = " \\\n"


def _build_parser():
    """Return the argument parser for the command line interface."""
    parser = argparse.ArgumentParser(
        description='Allow to make mmseqs jobs for slurm')
    parser.add_argument("-i", "--species_name_file", required=True,
                        help="introduce the txt file with species names")
    parser.add_argument("-t", "--species_type", required=True,
                        help="introduce the name of db species types, "
                             "ex: Virus or Hymenoptera")
    parser.add_argument("-db", "--data_base",
                        help="path of the MMseqs2 target database to search "
                             "against (required for virus jobs)")
    parser.add_argument("-o", "--out", required=True,
                        help="The output path where to create the slurm files")
    parser.add_argument("-p", "--path", required=True,
                        help="The path where to find the file Project")
    return parser


def _species_code(species_type):
    """Return "V" or "H" for a recognised species type, otherwise None."""
    if species_type is None:
        return None
    return _SPECIES_CODES.get(species_type.strip().lower())


def _read_species_names(species_name_file):
    """Return the non-blank lines of the species file, whitespace-stripped."""
    with open(species_name_file, "r") as handle:
        stripped = (line.strip() for line in handle)
        return [name for name in stripped if name]


def _shell_command(words):
    """Join the words into one shell command spread over continued lines."""
    return _CONTINUATION.join(words) + "\n"


def _build_job_script(name, type_code, path, data_base):
    """Return the text of the Slurm script for one species.

    name      -- species name, used in directory, file and job names.
    type_code -- "H" (search against the species' own BUSCO protein db, which
                 the job builds first) or "V" (search against data_base).
    path      -- project root; all job paths are built below path/Genome/name.
    data_base -- target database for "V" jobs; unused for "H" jobs.
    """
    prefix = _JOB_PREFIXES[type_code]
    genome_dir = path + "/Genome/" + name
    run_dir = genome_dir + "/run_mmseqs2_" + type_code
    log_stem = (run_dir + "/" + prefix + "_mmseqs2_job.log/"
                + prefix + "_mmseqs2_job")
    query_db = run_dir + "/" + name + "_mmseqs2_viral_db"
    result = run_dir + "/result_mmseqs2"

    parts = [
        "#!/bin/bash\n",
        "#SBATCH --cpus-per-task=10\n",
        "#SBATCH --mem 5G\n",
        "#SBATCH --constraint=haswell\n",
        "#SBATCH -t 24:00:00\n",
        "#SBATCH -e" + log_stem + ".error\n",
        "#SBATCH -o" + log_stem + ".out\n",
        "#SBATCH -J " + prefix + "_mmseqs2_job_" + name + "\n",
        "#SBATCH --mail-type=ALL\n",
        _MAIL_USER_DIRECTIVE + "\n",
        "date;hostname;pwd\n",
        "mmseqs2=" + MMSEQS_BIN + "\n",
    ]

    if type_code == "H":
        # The target db is created by the job itself from the BUSCO proteins.
        busco_dir = genome_dir + "/run_busco/run_BUSCO_v3"
        target_db = busco_dir + "/" + name + "_mmseqs2_busco_db"
        parts.append(_shell_command([
            "$mmseqs2 createdb",
            busco_dir + "/compiled_busco_aa",
            target_db,
        ]))
    else:
        target_db = data_base

    parts.append(_shell_command([
        "$mmseqs2 createdb",
        genome_dir + "/" + name + ".fa",
        query_db,
    ]))
    parts.append(_shell_command([
        "$mmseqs2 search",
        query_db,
        target_db,
        result,
        run_dir + "/tpm -a -s 7.5 -e 0.01 --threads 10",
    ]))
    parts.append(_shell_command([
        '$mmseqs2 convertalis --format-output "' + FORMAT_OUTPUT + '"',
        query_db,
        target_db,
        result,
        result + ".m8",
    ]))
    parts.append("date")
    return "".join(parts)


def _progress_line(done, total):
    """Return a 50-character progress bar line; total must be at least 1."""
    fraction = done / float(total)
    filled_len = int(round(50 * fraction))
    percents = round(100.0 * fraction, 1)
    bar = "=" * filled_len + "-" * (50 - filled_len)
    return "[%s] %s%% ...\r" % (bar, percents)


def main(argv=None):
    """Write one Slurm job script per species and return the exit status.

    argv -- argument list to parse; None means use sys.argv[1:].

    Returns 0 on success and 1 when the species type is not recognised.
    Missing required options make argparse exit with its usual usage error.
    """
    # Print out a message when the program is initiated.
    print('----------------------------------------------------------------\n')
    print(' MMseqs2_job_maker.\n')
    print('----------------------------------------------------------------\n')

    parser = _build_parser()
    args = parser.parse_args(argv)

    # Validate the species type before touching any file.
    type_code = _species_code(args.species_type)
    if type_code is None:
        print("Wrong type of species-type given")
        print("Please choose one of these one:")
        print("virus, Virus, Hymenoptera or hymenoptera")
        print("if the species type changes, please edit the Make_job_mmseqs2.py file in order to fit your own data")
        return 1
    if type_code == "V" and args.data_base is None:
        parser.error("-db/--data_base is required when the species type is virus")

    names = _read_species_names(args.species_name_file)
    total = len(names)
    job_prefix = _JOB_PREFIXES[type_code] + "_mmseqs2_job_"

    for done, name in enumerate(names, 1):
        job_path = os.path.join(args.out, job_prefix + name + ".sh")
        with open(job_path, "w") as job:
            job.write(_build_job_script(name, type_code, args.path,
                                        args.data_base))
        sys.stdout.write(_progress_line(done, total))
        sys.stdout.flush()

    if names:
        # Move past the progress bar, which ends with a carriage return.
        sys.stdout.write("\n")
    return 0


if __name__ == "__main__":
    sys.exit(main())