-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMake_job_busco.py
94 lines (79 loc) · 4.43 KB
/
Make_job_busco.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/python
import pandas as pd
import numpy as np
import sys
import argparse
import os
# Make_job_busco: write one Slurm submission script (Busco_job_<species>.sh)
# per species listed in a text file, so that BUSCO can be run on each genome.
#
# Ex usage: python3 Make_job_busco.py -i /beegfs/home/bguinet/Species_genome_names.txt -o $DIR/M2_script/Busco_jobs -p /beegfs/data/bguinet/

# Width, in characters, of the textual progress bar.
PROGRESS_BAR_WIDTH = 50


def read_species_names(species_name_file):
    """Return the species names listed in *species_name_file*, one per line.

    Surrounding whitespace (including a stray carriage return) is removed and
    blank lines are skipped, so an empty or trailing line never produces a
    job script with an empty species name.
    """
    with open(species_name_file, "r") as handle:
        stripped = (line.strip() for line in handle)
        return [name for name in stripped if name]


def build_job_script(name, path, mail_user=None):
    """Return the full text of the Slurm/BUSCO job script for one species.

    Args:
        name: species name; used for the genome directory, the assembly file
            name (``<name>.fa``), the job name and the BUSCO run name.
        path: project root that contains the ``Genomes`` directory. A missing
            trailing slash is tolerated.
        mail_user: optional e-mail address. When given, a
            ``#SBATCH --mail-user=<address>`` line is written; otherwise the
            script's historical literal line is kept unchanged.
    """
    name = str(name)
    # The generated file is a bash script for the cluster, so paths are always
    # joined with "/" regardless of the OS this generator runs on.
    base = path if (not path or path.endswith("/")) else path + "/"
    genome_dir = base + "Genomes/" + name

    if mail_user:
        mail_line = "#SBATCH --mail-user=" + mail_user
    else:
        # NOTE(review): this placeholder does not look like a valid sbatch
        # option; pass -m/--mail_user to emit a proper directive instead.
        mail_line = "#SBATCH [email protected]"

    lines = [
        "#!/bin/bash",
        "#SBATCH -t 64:00:00",
        "#SBATCH --cpus-per-task=3",
        "#SBATCH -e " + genome_dir + "/run_busco/busco_job.log/busco_job.error",
        "#SBATCH -o " + genome_dir + "/run_busco/busco_job.log/busco_job.out",
        "#SBATCH -J Genome_busco_job_" + name,
        "#SBATCH --mail-type=ALL",
        mail_line,
        "date;hostname;pwd",
        "ASSEMBLY=" + genome_dir + "/" + name + ".fa",
        # Path where to find the lineage file.
        "LINEAGE=/beegfs/data/bguinet/these/Busco_Hymenoptera_database/",
        "SAMP=" + name,
        "NAME=$SAMP'_BUSCO_v3'",
        "#########################################",
        "# define PATH to sofwtare used by BUSCO #",
        "#########################################",
        "#Augustus",
        # Path where to find the Augustus program.
        "export PATH=/bin:/usr/bin:/usr/remote/bin:/beegfs/data/bguinet/TOOLS/Augustus3.3/bin:/beegfs/data/bguinet/TOOLS/Augustus3.3/scripts",
        "# hmmer",
        # Path where to find the hmmer program.
        "PATH=$PATH:/beegfs/data/bguinet/TOOLS/hmmer-3.2.1/bin",
        "# blast et python",
        # Path where to find the ncbi-blast program.
        "PATH=$PATH:/beegfs/data/bguinet/TOOLS/ncbi-blast-2.8.1+/bin",
        "PATH=$PATH:/usr/bin",
        "# augustus",
        "export AUGUSTUS_CONFIG_PATH=/beegfs/data/bguinet/TOOLS/Augustus3.3/config",
        "#Busco software path",
        # Path where to find the Busco program.
        "BUSCO='/beegfs/data/bguinet/myconda/bin/run_BUSCO.py'",
        "################",
        "# Command line #",
        "################",
        # Path where to find the Python program.
        "export PATH=/usr/remote/Python-3.6.5/bin:$PATH",
        "PATH=$PATH:/usr/bin",
        "cd " + genome_dir + "/run_busco",
        "export PYTHONPATH=$PYTHONPATH:/beegfs/data/bguinet/myconda/lib/python3.7/site-packages/",
        "python3 $BUSCO -i $ASSEMBLY -o $NAME -l $LINEAGE -m geno -f",
    ]
    return "\n".join(lines) + "\n"


def progress_line(done, total):
    """Return the progress-bar text for *done* files out of *total*.

    Both the bar and the percentage are computed against *total*, so the bar
    is exactly PROGRESS_BAR_WIDTH characters wide at every step and a single
    species does not cause a division by zero. *total* must be positive.
    The trailing carriage return lets the next call overwrite the line.
    """
    filled_len = int(round(PROGRESS_BAR_WIDTH * done / float(total)))
    percents = round(100.0 * done / float(total), 1)
    bar = "=" * filled_len + "-" * (PROGRESS_BAR_WIDTH - filled_len)
    return "[%s] %s%% finish\r" % (bar, percents)


def write_job_scripts(names, output_path, path, mail_user=None):
    """Write ``Busco_job_<name>.sh`` into *output_path* for every name.

    Files are addressed through *output_path* directly rather than by
    changing the working directory. A progress bar is written to stdout
    after each file.

    Returns:
        The number of job scripts written.
    """
    total = len(names)
    for done, name in enumerate(names, start=1):
        target = os.path.join(output_path, "Busco_job_" + str(name) + ".sh")
        # The context manager closes the file even if a write fails.
        with open(target, "w") as job_file:
            job_file.write(build_job_script(name, path, mail_user))
        sys.stdout.write(progress_line(done, total))
        sys.stdout.flush()
    return total


def main(argv=None):
    """Parse the command line and create the Busco job scripts.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    # Print out a message when the program is initiated.
    print('----------------------------------------------------------------\n')
    print(' Busco_job_maker.\n')
    print('----------------------------------------------------------------\n')

    parser = argparse.ArgumentParser(description='Allow to make busco jobs for slurm')
    # The three paths are mandatory: without them the script cannot do
    # anything, so fail early with a usage message instead of a TypeError.
    parser.add_argument("-i", "--species_name_file", required=True, help="introduce the .txt file with all the species names")
    parser.add_argument("-o", "--out", required=True, help="The ouptut path where to create the slurm files")
    parser.add_argument("-p", "--path", required=True, help="The path where to find the file Project")
    parser.add_argument("-m", "--mail_user", default=None, help="Optional e-mail address written as '#SBATCH --mail-user=<address>'")
    args = parser.parse_args(argv)

    names = read_species_names(args.species_name_file)
    created = write_job_scripts(names, args.out, args.path, args.mail_user)

    print("\n")
    print(created, " files created at :", args.out)


if __name__ == "__main__":
    main()