-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathtemplate_extend_align.sh
191 lines (167 loc) · 7.54 KB
/
template_extend_align.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/bin/bash
#SBATCH --job-name=<NAME>
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --partition=nocona
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mem-per-cpu=60G
#SBATCH --time=02:00:00
#SBATCH --mail-user [email protected]
#### BASICS
# This script will take in queries generated by RepeatModeler and generate
# extended consensus sequences for visualization and evaluation using a combination of
# the Ray lab's extract_align python script and Robert Hubley's extension perl script. It is
# designed to allow for a triage of extended RepeatModeler output when generating de novo curated
# TE libraries from RepeatModeler output.
#
# The output you're going to be most interested in will be in the images_and_alignments directory.
#### USAGE and OUTPUT
#
# Replace <NAME> in the "#SBATCH --job-name=<NAME>" line near the top with your chosen job ID.
#
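# sbatch scriptname.sh <FULL path to genome.gz> <FULL path to output directory> <FULL path to repeatmodeler queries>
# NOTE: this template does not read positional arguments. The genome, working directory and queries file
# are taken from the GENOME, WORKDIR and CONSENSUSFILE variables set further down, so edit those before submitting.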
#
# Input #1 = path to a genome file, zipped. The first field of the filename, as divided by ".", will end up as SUBNAME as a designation
# Input #2 = the path to the output directory
# Input #3 = path to a file with repeatmodeler consensuses with headers modified to remove "#" and "/"
#
# Example - sbatch bin/cPer_bat1k_extend_align_svaca.sh /lustre/scratch/daray/test2/assembly/cPer_bat1k.masked.fa.gz /lustre/scratch/daray/test2/cPer /lustre/scratch/daray/test2/repeatmodeler/carollia_short.fa
#
## Output by directory (extend_align/SUBNAME...):
# blastfiles = blast output, database, and queries file
# extendlogs = log files from Hubley extension tool
# extensionwork = directory for each TE evaluated with output from tool
# extract_align = output from extract_align.py, contains catTEfiles to be used by extension tool
# genomefiles = genome assembly in .fa and .2bit format
# images_and_alignments = .png files and aligned files for visual validation
# rejects = alignments filtered for too few hits
#### CONDA OPERATING ENVIRONMENT
#Set up conda environment if necessary before starting
#Note: Before using this script, I set up this working environment within conda using:
# $ conda create --name extend_env
# $ conda activate extend_env
# $ conda install -c bioconda rmblast
# $ conda install -c bioconda bedtools
# $ conda install biopython
# $ conda install pandas
# $ conda install -c bioconda pyfaidx
# $ conda install --channel bioconda pybedtools
#### STUFF THAT MAY NEED CHANGING -- CHECK ALL PATHS, ETC
# Load conda's shell integration into this batch shell, then switch to the
# environment prepared as described in the CONDA OPERATING ENVIRONMENT notes.
source ~/conda/etc/profile.d/conda.sh
conda activate extend_env
# Installation roots of the external tools this pipeline calls:
#   SOFTWARE   - directory holding faToTwoBit
#   EXTENDPATH - directory holding davidExtendConsRAM.pl
#   GITPATH    - directory holding extract_align.py
SOFTWARE="/lustre/work/daray/software"
EXTENDPATH="${SOFTWARE}/RepeatModeler-dev/util"
GITPATH="/home/daray/gitrepositories/bioinfo_tools"
# Settings handed to extract_align.py: SEQBUFFER is passed as both -lb and
# -rb, SEQNUMBER as -n. FLANK is not referenced anywhere else in this script.
SEQBUFFER=100
SEQNUMBER=50
FLANK=100
######END STUFF THAT MAY NEED CHANGING######
##
# Template placeholders: replace each <...> token with a real value before
# submitting. CONSENSUSFILE is expected at $WORKDIR/<TAXON>_novel.fa.
TAXON=<NAME>
GENOME=<path.to.NAME.fa.gz>
WORKDIR=<path.to.main.working.directory>
CONSENSUSFILE="${WORKDIR}/${TAXON}_novel.fa"
##Set paths and variables
# Every expansion is quoted so that paths containing spaces or glob
# characters are passed to basename/echo as a single, literal word.
echo "Genome file is $GENOME"
# BASENAME: genome file name with a trailing ".gz" removed.
BASENAME=$(basename -- "$GENOME" .gz)
# SUBNAME: everything before the first "." of the genome file name; it is
# used below to name the per-genome directory and the derived files.
SUBNAME=${BASENAME%%.*}
echo "Your working directory is $WORKDIR"
##Get RepeatModeler consensus file info
echo "Queries file is $CONSENSUSFILE"
# CONSENSUSSEQS: file name (no directory) of the queries file, i.e. the name
# it will have after being copied into a working directory.
CONSENSUSSEQS=$(basename -- "$CONSENSUSFILE")
#Set up directory structure
# Stop immediately if either component of the output path is empty;
# otherwise the directories below would be created relative to "/".
: "${WORKDIR:?WORKDIR must be set to the main working directory}"
: "${SUBNAME:?SUBNAME is empty - check the GENOME file name}"
THISGENOME="$WORKDIR/${SUBNAME}_N"
#directory for all extension tool work
EXTENSIONWORK="$THISGENOME/extensionwork"
#directory for extension log files
EXTENDLOGS="$THISGENOME/extendlogs"
#directory for the assembly
GENOMEFILES="$THISGENOME/genomefiles"
#folder to store the .png files and MSAs for evaluation
IMAGES="$THISGENOME/images_and_alignments"
#folder for potential segmental duplications
SD="$IMAGES/possible_SD"
#folder for likely TEs
TE="$IMAGES/likely_TEs"
#folder to filter TEs with very few hits
REJECTS="$IMAGES/rejects"
#folder to store potential final consensus sequences
FINAL_CONSENSUSES="$THISGENOME/final_consensuses"
# Create the whole tree in one call; -p makes this safe to repeat on re-runs.
if ! mkdir -p -- "$WORKDIR" "$EXTENSIONWORK" "$EXTENDLOGS" "$GENOMEFILES" \
    "$IMAGES" "$SD" "$TE" "$REJECTS" "$FINAL_CONSENSUSES"; then
  echo "ERROR: could not create the directory structure under $THISGENOME" >&2
  exit 1
fi
#Get genome fasta and unzip
echo "Checking genome files"
genome_fa="$GENOMEFILES/$SUBNAME.fa"
genome_2bit="$GENOMEFILES/$SUBNAME.2bit"
#if assembly does not exist in this directory, create it
if [ ! -f "$genome_fa" ]; then
  # Decompress to a temporary name and rename only on success, so that an
  # interrupted run cannot leave a truncated .fa that a later run would
  # treat as the finished assembly.
  if gunzip -c -- "$GENOME" > "$genome_fa.tmp" && mv -- "$genome_fa.tmp" "$genome_fa"; then
    :
  else
    rm -f -- "$genome_fa.tmp"
    echo "ERROR: could not decompress $GENOME to $genome_fa" >&2
    exit 1
  fi
fi
#if .2bit version of the assembly does not exist, create it.
if [ ! -f "$genome_2bit" ]; then
  if ! "$SOFTWARE/faToTwoBit" "$genome_fa" "$genome_2bit"; then
    # Remove any partial output so the conversion is retried next time.
    rm -f -- "$genome_2bit"
    echo "ERROR: faToTwoBit failed for $genome_fa" >&2
    exit 1
  fi
fi
#Run blast on queries
echo "Checking blast files"
#create the blast files directory if it does not exist, then work inside it
blast_dir="$THISGENOME/blastfiles"
mkdir -p -- "$blast_dir"
cd "$blast_dir" || { echo "ERROR: cannot enter $blast_dir" >&2; exit 1; }
# -f replaces a link left behind by an earlier run instead of failing.
ln -sf -- "$GENOMEFILES/$SUBNAME.fa" .
cp -- "$CONSENSUSFILE" .
#if blast database doesn't exist, create it
# compgen -G succeeds when the pattern matches at least one file. Unlike
# `[ -f *.nsq ]` it keeps working when more than one .nsq file is present.
if ! compgen -G '*.nsq' > /dev/null; then
  if ! makeblastdb -in "$SUBNAME.fa" -dbtype nucl; then
    echo "ERROR: makeblastdb failed for $SUBNAME.fa" >&2
    exit 1
  fi
fi
#if blast output doesn't exist, run blast.
blast_out="${SUBNAME}_blastn.out"
if [ ! -f "$blast_out" ]; then
  # Write to a temporary name and rename on success so that a job killed
  # mid-search does not leave a partial table that later runs would reuse.
  if blastn -query "$CONSENSUSSEQS" -db "$SUBNAME.fa" -outfmt 6 -out "$blast_out.tmp" \
      && mv -- "$blast_out.tmp" "$blast_out"; then
    :
  else
    rm -f -- "$blast_out.tmp"
    echo "ERROR: blastn failed for $CONSENSUSSEQS" >&2
    exit 1
  fi
fi
#Run extract_align
echo "Running extract_align"
#create the extract_align directory if it does not exist, then work inside it
extract_dir="$THISGENOME/extract_align"
mkdir -p -- "$extract_dir"
cd "$extract_dir" || { echo "ERROR: cannot enter $extract_dir" >&2; exit 1; }
# -f replaces a link left behind by an earlier run instead of failing.
ln -sf -- "$GENOMEFILES/$SUBNAME.fa" .
cp -- "$CONSENSUSFILE" .
#run extract_align.py to pull up to $SEQNUMBER of the best hits per query from
#the blast output out of the genome assembly, with $SEQBUFFER passed as both the
#-lb and -rb values. The extension step reads its per-TE input from the
#catTEfiles directory under this one.
#NOTE(review): the meaning of "-a n -e n -t n" is defined by extract_align.py
#and is not visible here - confirm against that script.
if ! python "$GITPATH/extract_align.py" \
    -g "$SUBNAME.fa" \
    -b "$THISGENOME/blastfiles/${SUBNAME}_blastn.out" \
    -l "$CONSENSUSSEQS" \
    -lb "$SEQBUFFER" -rb "$SEQBUFFER" \
    -n "$SEQNUMBER" \
    -a n -e n -t n; then
  # Keep going (as before) but make the failure visible in the job's .err file.
  echo "WARNING: extract_align.py exited with a non-zero status" >&2
fi
#Run extend tool
echo "Running extension tool"

# Print the number of lines containing ">" (FASTA headers) in file $1.
# Prints 0 when the file has no such line or cannot be read.
_count_fasta_records() {
  local n
  n=$(grep -c '>' -- "$1")
  printf '%s\n' "${n:-0}"
}

# Print the number of sequence characters in FASTA file $1. Header lines are
# dropped and all whitespace is removed first, so line breaks are not counted
# as part of the sequence.
_fasta_sequence_length() {
  grep -v '>' -- "$1" | tr -d '[:space:]' | wc -m | tr -d '[:space:]'
}

# Print the category for a repeat with $1 hits and a consensus of $2 bp:
#   rejects      fewer than 10 hits
#   possible_SD  at least 10 hits and a consensus longer than 15000 bp
#   likely_TEs   at least 10 hits and a consensus of 15000 bp or less
_classify_repeat() {
  local -r min_hits=10 max_te_length=15000
  local hits=$1 length=$2
  if [ "$hits" -lt "$min_hits" ]; then
    echo "rejects"
  elif [ "$length" -gt "$max_te_length" ]; then
    echo "possible_SD"
  else
    echo "likely_TEs"
  fi
}

#for every file in the catTEfiles directory
for FILE in "$THISGENOME"/extract_align/catTEfiles/*.fa; do
  # When nothing matches, the pattern itself is iterated once; skip it.
  [ -e "$FILE" ] || continue
  # get the name of the TE being examined from the filename
  TEID=$(basename -- "$FILE" .fa)
  echo "TEID = $TEID"
  #create a directory for it if it doesn't already exist
  mkdir -p -- "$EXTENSIONWORK/$TEID"
  if ! cd "$EXTENSIONWORK/$TEID"; then
    # Never run the steps below in the wrong directory.
    echo "WARNING: cannot enter $EXTENSIONWORK/$TEID; skipping $TEID" >&2
    continue
  fi
  #run Robert Hubley's extension tool. Note: original version of this script had option to set '-div 5'. New version has default -div as 18 (see e-mail from Robert, August 1, 2020)
  "$EXTENDPATH/davidExtendConsRAM.pl" \
    -genome "$GENOMEFILES/$SUBNAME.2bit" \
    -family "$FILE" \
    -outdir . \
    > "$EXTENDLOGS/$TEID.extend.log"
  #rename the MSA files and image files with TEID
  # Both substitutions are applied in one pass, in the same order as before.
  sed -e "s/repam-newrep/$TEID/g" -e "s/CORECONS/CONSENSUS-$TEID/g" \
    MSA-extended_with_rmod_cons.fa > "${TEID}_MSA_extended.fa"
  sed "s/repam-newrep/$TEID/g" rep > "${TEID}_rep.fa"
  cp -- img.png "$TEID.png"
  #sort elements into categories
  COUNT=$(_count_fasta_records repseq.unextended)
  LENGTH=$(_fasta_sequence_length "${TEID}_rep.fa")
  echo "Hit count for $TEID = $COUNT"
  echo "Length of this repeat is = $LENGTH"
  case "$(_classify_repeat "$COUNT" "$LENGTH")" in
    rejects)
      #rejects: repeats with fewer than 10 hits (image and alignment only)
      cp -- "$TEID.png" "${TEID}_MSA_extended.fa" "$REJECTS"
      ;;
    possible_SD)
      #possible segmental duplications: consensus longer than 15,000 bp
      cp -- "${TEID}_rep.fa" "$FINAL_CONSENSUSES"
      cp -- "$TEID.png" "${TEID}_MSA_extended.fa" "${TEID}_rep.fa" "$SD"
      ;;
    likely_TEs)
      #all other possible TEs, including a consensus of exactly 15,000 bp
      cp -- "${TEID}_rep.fa" "$FINAL_CONSENSUSES"
      cp -- "$TEID.png" "${TEID}_MSA_extended.fa" "${TEID}_rep.fa" "$TE"
      ;;
  esac
done