-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtemplate_RM_slurmC8.sh
79 lines (66 loc) · 3.53 KB
/
template_RM_slurmC8.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/bin/bash
#SBATCH --job-name=<NAME>_RM
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --partition=nocona
#SBATCH --mem-per-cpu=100G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mail-user [email protected]
###########################3
## Usage:
## This script will run the Centos-8 version of RepeatMasker on nocona.
## You will need to replace <NAME> with the genome assembly identifier as appropriate.
## DIR may need modifying.
## sed "s|DIRPATH|path to your directory|g" template_RM_slurmC8.sh ><new file name>
## You will need to include the path to your library.
## sed "s|LIBFILE|path to your library file|g" <new file name> ><new new file name>
## You will need to include the path to your genome assembly.
## sed "s|ASSEMBLYPATH|path to your assembly file|g" <new file name> ><new new file name>
## If running lots of genomes, here's a quick way to do it using a 'list.txt' file that has all of your taxon names in it.
## Sed the DIRPATH, LIBFILE, and ASSEMBLYPATH, then:
## cat list.txt | while read i; do sed "s/<NAME>/$i/g" <new new file name> >${i}_RM_C8.sh; done
## The final script to run ${GENOME}_rm2bed.sh should be prepared before running this
## Then, submit the scripts using a similar loop.
#This script uses a python program. You need to make sure python is loaded and ready to go by activating conda.
#Your conda installation must have biopython and pyfaidx installed.
. ~/conda/etc/profile.d/conda.sh
conda activate
#makes variables used in this script
GENOME=<NAME>
RUNTYPE=${GENOME}_RM
DIR=DIRPATH/$RUNTYPE
LIBRARYPATH=LIBFILE
GENOMEPATH=ASSEMBLYPATH/${GENOME}.fa
# If GenomeAbbreviation is greater than 10 characters, will cause trouble
# with queueing of doLift.sh.
# Shorten GENOME and to 10 characters and call it SUBNAME to resolve this problem.
# Note that the qstat queries in the bottom few lines look for SUBNAME instead of GENOME.
CHARLEN=${#GENOME} #Finds length of GenomeAbbreviation
if test $CHARLEN -gt 10; then SHORTNAME=${GENOME:0:10}; else SHORTNAME=$GENOME; fi
#Creates a working directory and then goes into it to make all the files
mkdir $DIR
cd $DIR
#Creates a link to the genome assembly and unzip it. Then create a .2bit version for RepeatMasker.
ln -s $GENOMEPATH/${GENOME}.fa
#gunzip -c $GENOME".fa.gz" > $GENOME".fa"
/lustre/work/daray/software/faToTwoBit ${GENOME}.fa ${GENOME}.2bit
#creates a symbolic link to the 200 mammals TE library file so it can be used.
#cp /lustre/scratch/aosmansk/250mammals_RM_v2/final_mammal_library.fa .
#Use sge_clusterrun.py to generate all batches needed to run RepeatMasker and the doLift.sh to complile the results.
python /home/daray/gitrepositories/bioinfo_tools/slurm_clusterrunC8.py \
-i ${GENOME}.fa \
-b 50 \
-lib $LIBRARYPATH \
-dir . \
-xsmall
#Submits the RepeatMasker jobs created by sge_clusterrun.py
sh qsub.sh
#Creates a list of jobIDs to keep track of the RepeatMasker batches being run.
jobIDs=""; for i in `squeue | grep $SHORTNAME | awk '{print $1}'`; do jobIDs=$jobIDs,$i; done; jobIDs="${jobIDs:1}"
#Submits the doLift script to the queue but holds it until all jobs in the jobIDs list (the RepeatMasker batches) have cleared.
sbatch --dependency=afterok:$jobIDs doLift.sh
#Creates a list of job IDs to keep track of the doLift script.
jobIDs=""; for i in `squeue | grep $SHORTNAME | awk '{print $1}'`; do jobIDs=$jobIDs,$i; done; jobIDs="${jobIDs:1}"
#Submits the rm2bed script to the queue but holds it until all jobs in the jobIDs list (the doLift job) have cleared.
sbatch --dependency=afterok:$jobIDs ../${GENOME}_rm2bed.sh