template_extend_align.sh

#!/bin/bash
#SBATCH --job-name=<NAME>
#SBATCH --output=%x.%j.out
#SBATCH --error=%x.%j.err
#SBATCH --partition=nocona
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --mem-per-cpu=60G
#SBATCH --time=02:00:00
#SBATCH --mail-user david.a.ray@ttu.edu


#### BASICS
# This script will take in queries generated by RepeatModeler and generate
# extended consensus sequences for visualization and evaluation using a combination of
# the Ray lab's extract_align python script and Robert Hubley's extension perl script. It is
# designed to allow for a triage of extended RepeatModeler output when generating de novo curated
# TE libraries from RepeatModeler output.
# 
# The output you're going to be most interested in will be in the images_and_alignments directory.

#### USAGE and OUTPUT
#
# Replace <NAME> in the "#SBATCH --job-name" line with your chosen job ID.
#
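# sbatch scriptname.sh <FULL path to genome.gz> <FULL path to output directory> <FULL path to repeatmodeler queries>
# NOTE: as written, the script body does not read these positional arguments. TAXON, GENOME and WORKDIR
# are set by filling in the <...> placeholders below, and the queries file is WORKDIR/TAXON_novel.fa.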
#
# Input #1 = path to a genome file, zipped. The first field of the filename, as divided by ".", becomes SUBNAME, which is used to name the output directory and files.
# Input #2 = the path to the output directory 
# Input #3 = path to a file with repeatmodeler consensuses with headers modified to remove "#" and "/" 
# 
# Example - sbatch bin/cPer_bat1k_extend_align_svaca.sh /lustre/scratch/daray/test2/assembly/cPer_bat1k.masked.fa.gz /lustre/scratch/daray/test2/cPer /lustre/scratch/daray/test2/repeatmodeler/carollia_short.fa
#
## Output by directory (WORKDIR/SUBNAME_N/...):
# blastfiles = blast output, database, and queries file
# extendlogs = log files from Hubley extension tool
# extensionwork = directory for each TE evaluated with output from tool
# extract_align = output from extract_align.py, contains catTEfiles to be used by extension tool
# genomefiles = genome assembly in .fa and .2bit format
# images_and_alignments = .png files and aligned files for visual validation 
# images_and_alignments/rejects = images and alignments for repeats filtered out for having too few hits (fewer than 10)

#### CONDA OPERATING ENVIRONMENT
#Set up conda environment if necessary before starting
#Note: Before using this script, I set up this working environment within conda using:
# $ conda create --name extend_env
# $ conda activate extend_env
# $ conda install -c bioconda rmblast
# $ conda install -c bioconda bedtools
# $ conda install biopython
# $ conda install pandas
# $ conda install -c bioconda pyfaidx
# $ conda install --channel bioconda pybedtools

#### STUFF THAT MAY NEED CHANGING -- CHECK ALL PATHS, ETC

# Make the "conda" shell function available, then switch to the environment
# that was prepared with rmblast, bedtools and the Python packages listed above.
source ~/conda/etc/profile.d/conda.sh
conda activate extend_env

# Install locations of the external tools used further down.
SOFTWARE="/lustre/work/daray/software"
EXTENDPATH="${SOFTWARE}/RepeatModeler-dev/util"
GITPATH="/home/daray/gitrepositories/bioinfo_tools"

# Settings for extract_align.py
SEQBUFFER="100"   # handed to extract_align.py as both -lb and -rb
SEQNUMBER="50"    # handed to extract_align.py as -n
FLANK="100"       # not referenced anywhere else in the visible script
######END STUFF THAT MAY NEED CHANGING######

##
# Template placeholders: replace every <...> token on the next three lines
# with a real value before submitting the job.
TAXON=<NAME>
GENOME=<path.to.NAME.fa.gz>
WORKDIR=<path.to.main.working.directory>
# Queries (RepeatModeler consensus) file, expected in WORKDIR as <TAXON>_novel.fa
CONSENSUSFILE="$WORKDIR/${TAXON}_novel.fa"

##Set paths and variables
echo "Genome file is $GENOME"
# Genome file name without its directory and without the .gz suffix
BASENAME=$(basename "$GENOME" .gz)
# First "."-separated field of the genome file name; names the output files and directories
SUBNAME=$(basename "$GENOME")
SUBNAME=${SUBNAME%%.*}

echo "Your working directory is $WORKDIR"

##Get RepeatModeler consensus file info
echo "Queries file is $CONSENSUSFILE"
# Queries file name without its directory
CONSENSUSSEQS=$(basename "$CONSENSUSFILE")

#Set up directory structure
# Everything produced for this genome lives under $WORKDIR/<SUBNAME>_N.
THISGENOME="$WORKDIR/${SUBNAME}_N"
# directory for all extension tool work (one subdirectory per TE)
EXTENSIONWORK="$THISGENOME/extensionwork"
# directory for extension log files
EXTENDLOGS="$THISGENOME/extendlogs"
# directory for the assembly (.fa and .2bit)
GENOMEFILES="$THISGENOME/genomefiles"
# folder to store the .png files and MSAs for evaluation
IMAGES="$THISGENOME/images_and_alignments"
# folder for potential segmental duplications
SD="$IMAGES/possible_SD"
# folder for likely TEs
TE="$IMAGES/likely_TEs"
# folder for TEs filtered out for having very few hits
REJECTS="$IMAGES/rejects"
# folder to store potential final consensus sequences
FINAL_CONSENSUSES="$THISGENOME/final_consensuses"

# Create the whole tree in one call and stop immediately if any part of it
# cannot be created: every later step writes into these directories.
if ! mkdir -p -- "$WORKDIR" "$EXTENSIONWORK" "$EXTENDLOGS" "$GENOMEFILES" \
	"$IMAGES" "$SD" "$TE" "$REJECTS" "$FINAL_CONSENSUSES"; then
	echo "ERROR: could not create the output directory tree under $WORKDIR" >&2
	exit 1
fi


#Get genome fasta and unzip
echo "Checking genome files"
#if assembly does not exist in this directory, create it
if [ ! -f "$GENOMEFILES/$SUBNAME.fa" ]; then
	# Decompress to a temporary name and rename only on success, so a failed or
	# interrupted gunzip never leaves a truncated .fa that later runs would reuse.
	if gunzip -c "$GENOME" > "$GENOMEFILES/$SUBNAME.fa.tmp"; then
		mv "$GENOMEFILES/$SUBNAME.fa.tmp" "$GENOMEFILES/$SUBNAME.fa"
	else
		rm -f "$GENOMEFILES/$SUBNAME.fa.tmp"
		echo "ERROR: could not decompress $GENOME" >&2
		exit 1
	fi
fi
#if .2bit version of the assembly does not exist, create it.
if [ ! -f "$GENOMEFILES/$SUBNAME.2bit" ]; then
	if ! "$SOFTWARE/faToTwoBit" "$GENOMEFILES/$SUBNAME.fa" "$GENOMEFILES/$SUBNAME.2bit"; then
		# Remove whatever was written so the next run retries the conversion.
		rm -f "$GENOMEFILES/$SUBNAME.2bit"
		echo "ERROR: faToTwoBit failed for $GENOMEFILES/$SUBNAME.fa" >&2
		exit 1
	fi
fi

#Run blast on queries
echo "Checking blast files"
#if the blast files directory does not exist, create it
mkdir -p "$THISGENOME/blastfiles"
# Stop if the directory cannot be entered; otherwise the commands below would
# run in whatever directory the job happens to be in.
cd "$THISGENOME/blastfiles" || { echo "ERROR: cannot enter $THISGENOME/blastfiles" >&2; exit 1; }
# -f replaces the link left behind by an earlier run instead of failing on it
ln -sf "$GENOMEFILES/$SUBNAME.fa" .
cp "$CONSENSUSFILE" .
#if blast database doesn't exist, create it
# Expand the glob into an array first: testing "-f *.nsq" directly breaks as
# soon as more than one .nsq file is present.
NSQFILES=( *.nsq )
if [ ! -e "${NSQFILES[0]}" ]; then
	makeblastdb -in "$SUBNAME.fa" -dbtype nucl \
		|| { echo "ERROR: makeblastdb failed for $SUBNAME.fa" >&2; exit 1; }
fi
#if blast output doesn't exist, run blast.
if [ ! -f "${SUBNAME}_blastn.out" ]; then
	# Write to a temporary name and rename on success so an aborted search is
	# not mistaken for a finished one on the next run.
	if blastn -query "$CONSENSUSSEQS" -db "$SUBNAME.fa" -outfmt 6 -out "${SUBNAME}_blastn.out.tmp"; then
		mv "${SUBNAME}_blastn.out.tmp" "${SUBNAME}_blastn.out"
	else
		rm -f "${SUBNAME}_blastn.out.tmp"
		echo "ERROR: blastn failed for $CONSENSUSSEQS" >&2
		exit 1
	fi
fi

#Run extract_align
echo "Running extract_align"
#create the extract_align directory if it does not exist
mkdir -p "$THISGENOME/extract_align"
# Stop if the directory cannot be entered; extract_align.py writes relative to
# the current directory.
cd "$THISGENOME/extract_align" || { echo "ERROR: cannot enter $THISGENOME/extract_align" >&2; exit 1; }
# -f replaces the link left behind by an earlier run instead of failing on it
ln -sf "$GENOMEFILES/$SUBNAME.fa" .
cp "$CONSENSUSFILE" .
#run extract_align.py to pull as many as $SEQNUMBER of the best hits from the blast output out of the genome assembly, with $SEQBUFFER passed as both -lb and -rb. Those hits will go into catTEfiles directory.
python "$GITPATH/extract_align.py" \
	-g "$SUBNAME.fa" \
	-b "$THISGENOME/blastfiles/${SUBNAME}_blastn.out" \
	-l "$CONSENSUSSEQS" \
	-lb "$SEQBUFFER" -rb "$SEQBUFFER" \
	-n "$SEQNUMBER" \
	-a n -e n -t n

#Run extend tool
echo "Running extension tool"

# Sorting thresholds
MIN_HITS=10          # repeats with fewer hits than this are filed as rejects
SD_MIN_LENGTH=15000  # consensuses longer than this (bp) are filed as possible segmental duplications

# Print the number of sequence characters in FASTA file $1. Lines containing
# ">" are skipped, and spaces, tabs, carriage returns and line breaks are not
# counted. Prints 0 for an empty file.
fasta_sequence_length() {
	awk '!/>/ { gsub(/[ \t\r]/, ""); total += length($0) } END { print total + 0 }' "$1"
}

# Print the category for a repeat with hit count $1 and consensus length $2:
#   reject       fewer than MIN_HITS hits
#   possible_SD  enough hits and longer than SD_MIN_LENGTH
#   likely_TE    enough hits and SD_MIN_LENGTH or shorter
classify_repeat() {
	if [ "$1" -lt "$MIN_HITS" ]; then
		echo "reject"
	elif [ "$2" -gt "$SD_MIN_LENGTH" ]; then
		echo "possible_SD"
	else
		echo "likely_TE"
	fi
}

#for every file in the catTEfiles directory
for FILE in "$THISGENOME"/extract_align/catTEfiles/*.fa; do
	# When nothing matches, the glob is left as the literal pattern; skip it
	# rather than running the tool on a file that does not exist.
	if [ ! -e "$FILE" ]; then
		echo "WARNING: no input file found for $FILE" >&2
		continue
	fi
	# get the name of the TE being examined from the filename
	TEID=$(basename "$FILE" .fa)
	echo "TEID = $TEID"
	#create a directory for it if it doesn't already exist
	mkdir -p "$EXTENSIONWORK/$TEID"
	# Never run the tool from the wrong directory: with "-outdir ." it would
	# overwrite the results of the previously processed TE.
	cd "$EXTENSIONWORK/$TEID" || { echo "WARNING: cannot enter $EXTENSIONWORK/$TEID, skipping $TEID" >&2; continue; }
	#run Robert Hubley's extension tool. Note: original version of this script had option to set '-div 5'. New version has default -div as 18 (see e-mail from Robert, August 1, 2020)
	"$EXTENDPATH/davidExtendConsRAM.pl" \
		-genome "$GENOMEFILES/$SUBNAME.2bit" \
		-family "$FILE" \
		-outdir . \
		>"$EXTENDLOGS/$TEID.extend.log"
	# Without these two files there is nothing to count or measure, so skip
	# the TE instead of sorting empty results.
	if [ ! -f rep ] || [ ! -f repseq.unextended ]; then
		echo "WARNING: extension output missing for $TEID, see $EXTENDLOGS/$TEID.extend.log" >&2
		continue
	fi
	#rename the MSA files and image files with TEID
	sed "s/repam-newrep/$TEID/g" MSA-extended_with_rmod_cons.fa >"${TEID}_MSA_extended.fa"
	sed -i "s/CORECONS/CONSENSUS-$TEID/g" "${TEID}_MSA_extended.fa"
	sed "s/repam-newrep/$TEID/g" rep >"${TEID}_rep.fa"
	cp img.png "$TEID.png"
	#sort elements into categories
	COUNT=$(grep -c ">" repseq.unextended)
	COUNT=${COUNT:-0}
	LENGTH=$(fasta_sequence_length "${TEID}_rep.fa")
	echo "Hit count for $TEID = $COUNT"
	echo "Length of this repeat is = $LENGTH"
	case "$(classify_repeat "$COUNT" "$LENGTH")" in
		reject)
			#rejects, repeats with fewer than MIN_HITS hits
			cp "$TEID.png" "${TEID}_MSA_extended.fa" "$REJECTS"
			;;
		possible_SD)
			#possible segmental duplications, consensus longer than SD_MIN_LENGTH bp
			cp "${TEID}_rep.fa" "$FINAL_CONSENSUSES"
			cp "$TEID.png" "${TEID}_MSA_extended.fa" "${TEID}_rep.fa" "$SD"
			;;
		likely_TE)
			#all other possible TEs, including a consensus of exactly SD_MIN_LENGTH bp
			cp "${TEID}_rep.fa" "$FINAL_CONSENSUSES"
			cp "$TEID.png" "${TEID}_MSA_extended.fa" "${TEID}_rep.fa" "$TE"
			;;
	esac
done