Skip to content

Latest commit

 

History

History
192 lines (173 loc) · 5.67 KB

13c_GenePrediction_IntialMikadoGenePrediction.md

File metadata and controls

192 lines (173 loc) · 5.67 KB

Need to predict genes for the SCN TN10 pseudomolecule assembly

Mikado Setup

#/work/GIF/remkv6/Baum/04_Dovetail2Restart/23_Mikado
To get blast annotations I downloaded all Tylenchina sequences from Trembl (179,064 sequences), as there were only ~5000 that have been manually reviewed for Nematoda in uniprot.
cat uniprot-reviewed\:* >uniprotCombined.fasta

ln -s /work/GIF/remkv6/Baum/04_Dovetail2Restart/12_Trinity/trinity_out_dir/SCNgenome.Trinity-GG.gff3
ln -s /work/GIF/remkv6/Baum/04_Dovetail2Restart/15_class2/AllRNASEQClass2.gtf
ln -s /work/GIF/remkv6/Baum/04_Dovetail2Restart/16_Stringtie/NonRrnaRNASEQ_stringtie.gtf
ln -s /work/GIF/remkv6/Baum/04_Dovetail2Restart/17_Portcullis/portcullis_out/3-filt/portcullis_filtered.pass.junctions.bed
ln -s /work/GIF/remkv6/Baum/04_Dovetail2Restart/22_spadesTranscripts/SpadesAssemblyFat/01_GmapAlign/SCNgenome.transcripts.gff3
ln -s ../10_RepeatModeler/SCNgenome.fasta
ln -s ../15_class2/AllRNASEQ_sorted.bam

vi list.txt
###############################################################################
AllRNASEQClass2.gtf     cl      True
NonRrnaRNASEQ_stringtie.gtf     st      True
SCNgenome.transcripts.gff3      sp      False
SCNgenome.Trinity-GG.gff3       tr      False
###############################################################################


###############################################################################
#!/bin/bash
#SBATCH -A its-hpc-condo-las-free
#SBATCH -N 1
#SBATCH -p freecompute
#SBATCH --ntasks-per-node=16
#SBATCH -t 96:00:00
#SBATCH -J MikadoScript_0
#SBATCH -o MikadoScript_0.o%j
#SBATCH -e MikadoScript_0.e%j
#SBATCH [email protected]
#SBATCH --mail-type=begin
#SBATCH --mail-type=end
ulimit -s unlimited
cd /work/GIF/remkv6/Baum/04_Dovetail2Restart/23_Mikado
conda activate mikado
#!/bin/bash
#setup variables
genome=SCNgenome.fasta
bam="AllRNASEQ_sorted.bam"
list="list.txt"
#run splice junction prediction
junctions="portcullis_filtered.pass.junctions.bed"
#configure
mikado configure \
   --list $list \
   --reference $genome \
   --mode permissive \
   --scoring worm.yaml \
   --copy-scoring worm.yaml \
   --junctions $junctions configuration.yaml
#prepare
mikado prepare \
   --json-conf configuration.yaml
#blast db
makeblastdb \
   -in uniprotCombined.fasta \
   -dbtype prot \
   -parse_seqids
#blast
blastx \
   -max_target_seqs 5 \
   -num_threads 16 \
   -query mikado_prepared.fasta \
   -outfmt 5 \
   -db uniprotCombined.fasta \
   -evalue 0.000001 2> blast.log | sed '/^$/d' > mikado.blast.xml
blastxml=mikado.blast.xml
#transdecoder
TransDecoder.LongOrfs \
   -t mikado_prepared.fasta
TransDecoder.Predict \
   -t mikado_prepared.fasta \
   --cpu 16
orfs=$(find $(pwd) -name "mikado_prepared.fasta.transdecoder.bed")
#serialise
mikado serialise \
   --start-method spawn \
   --procs 16 \
   --blast_targets ${genome} \
   --json-conf configuration.yaml \
   --xml ${blastxml} \
   --orfs ${orfs}
#pick
mikado pick \
   --start-method spawn \
   --procs 16 \
   --json-conf configuration.yaml \
   --subloci_out mikado.subloci.gff3
scontrol show job $SLURM_JOB_ID
###############################################################################

Second mikado run

#/work/GIF/remkv6/Baum/04_Dovetail2Restart/23_Mikado/02_Round02

ln -s ../SCNgenome.fasta
ln -s ../SCNgenome.DovetailSCNMaker4.all.maker.transcripts.gff3
ln -s ../brakermasked.gff
ln -s ../uniprotCombined.fasta
ln -s ../portcullis_filtered.pass.junctions.bed

vi list2.txt
###############################################################################
SCNgenome.DovetailSCNMaker4.all.maker.transcripts.gff3  MA      True
brakermasked.gff        BR      True
../01_Round1/mikado.loci.gff3   MI      True
###############################################################################

###############################################################################
cd /work/GIF/remkv6/Baum/04_Dovetail2Restart/23_Mikado/02_Round02
conda init bash
conda activate mikado
#!/bin/bash
#setup variables
genome=SCNgenome.fasta
bam="AllRNASEQ_sorted.bam"
list="list2.txt"
#run splice junction prediction
junctions="portcullis_filtered.pass.junctions.bed"
#configure
mikado configure \
   --list $list \
   --reference $genome \
   --mode stringent \
   --scoring worm.yaml \
   --copy-scoring worm.yaml \
   --junctions $junctions configuration.yaml
#prepare
mikado prepare \
   --json-conf configuration.yaml
#blast db
#makeblastdb \
   -in uniprotCombined.fasta \
   -dbtype prot \
   -parse_seqids
#blast
blastx \
   -max_target_seqs 5 \
   -num_threads 16 \
   -query mikado_prepared.fasta \
   -outfmt 5 \
   -db uniprotCombined.fasta \
   -evalue 0.000001 2> blast.log | sed '/^$/d' > mikado.blast.xml
blastxml=mikado.blast.xml
#transdecoder
TransDecoder.LongOrfs \
   -t mikado_prepared.fasta
TransDecoder.Predict \
   -t mikado_prepared.fasta \
   --cpu 16
orfs=$(find $(pwd) -name "mikado_prepared.fasta.transdecoder.bed")
#serialise
mikado serialise \
   --start-method spawn \
   --procs 16 \
   --blast_targets ${genome} \
   --json-conf configuration.yaml \
   --xml ${blastxml} \
   --orfs ${orfs}
#pick
mikado pick \
   --start-method spawn \
   --procs 16 \
   --json-conf configuration.yaml \
   --subloci_out mikado.subloci.gff3
###############################################################################

Collapse mikado loci into cufflinks transcripts

Found through manual annotation that this annotation is the most reliable.  removes single exon genes and those with early stop codons.  
/work/GIF/remkv6/Baum/04_Dovetail2Restart/23_Mikado

ml cufflinks
#2.2.1"


gffread  -U  -K -M -Q -VHE -g SCNgenome.fasta  mikado.loci.gff3  -o mikado.loci.ancestral.gff3  -x  mikado.loci.ancestralVHEJ_transcripts.fasta -y  mikado.loci.ancestralVHEJ_proteins.fasta