You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/16_Stringtie
#Tried to run Class2 with all of the RNA-seq, but it timed out without finishing, so I used the rRNA-removed bam instead.
ln -s ../10_RepeatModeler/SCNgenome.fasta
ln -s ../20_ribosomalArrayID/NonRiboReads.bam
module load stringtie
stringtie NonRiboReads.bam -j 5 -p 16 -v -o NonRrnaRNASEQ_stringtie.gtf
#how many transcripts did we generate?
less NonRrnaRNASEQ_stringtie.gtf |awk '$3=="transcript"' |wc
41504 747072 6605478
#get transcript and exon stats since there are no genes predicted here
awk '$3=="transcript"{if($4>$5) {print $4-$5} else {print $5-$4}}' NonRrnaRNASEQ_stringtie.gtf |summary.sh
Total: 233,044,848
Count: 41,504
Mean: 5,614
Median: 2,173
Min: 199
Max: 523,622
awk '$3=="exon"{if($4>$5) {print $4-$5} else {print $5-$4}}' NonRrnaRNASEQ_stringtie.gtf |summary.sh
Total: 70,045,713
Count: 277,955
Mean: 252
Median: 153
Min: 3
Max: 15,070
Set up portcullis to analyze splice junctions
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/17_Portcullis
#This was initially run on the bam file that still contained the rDNA reads. Even after removing the rRNA reads, I had to run this on a high-memory node; it used ~180GB of RAM.
ln -s ../10_RepeatModeler/SCNgenome.fasta
ln -s ../20_ribosomalArrayID/NonRiboReads.bam
module load portcullis
portcullis full --threads 9 --verbose --use_csi --output portcullis_out --orientation FR SCNgenome.fasta NonRiboReads.bam
Set up braker on a masked genome, excluding the simple repeats
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/19_brakerMasked
ln -s ../11_AlignRNA/AllRNASEQ_sorted.bam
ln -s ../13_braker/SCNgenome.consensus_isoforms_sorted.bam
ln -s ../13_braker/SCNgenome.H.glycinesEST_sorted.bam
ln -s ../10_RepeatModeler/SCNgenome.fasta.masked
module load GIF/braker/2.1.0
braker.pl --species=Hglycines3 --genome=SCNgenome.fasta.masked --bam=SCNgenome.consensus_isoforms_sorted.bam,SCNgenome.H.glycinesEST_sorted.bam,AllRNASEQ_sorted.bam
#how many genes?
awk '$3=="gene"' augustus.hints.gff |grep -v "#" |wc
22408 201672 1272920
#how many transcripts
awk '$3=="transcript"' augustus.hints.gff |grep -v "#" |wc
24481 220329 1610884
#translate the gff to fasta for use with spades below
~/common_scripts/gff2fasta.pl ../../../10_RepeatModeler/SCNgenome.fasta augustus.hints.gff 4SPADES
Since cufflinks has been deprecated in favor of stringtie, I decided to use another assembler for transcripts
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/22_spadesTranscripts
ln -s ../10_RepeatModeler/SCNgenome.fasta
ln -s ../20_ribosomalArrayID/NonRiboReads.bam
ln -s ../19_brakerMasked/braker/Hglycines3/4SPADES.cdna.fasta
#extract only those reads that aligned to the genome
module load picard/2.17.0-ft5qztz; java -Xmx120G -Xms50G -jar /opt/rit/spack-app/linux-rhel7-x86_64/gcc-4.8.5/picard-2.17.0-ft5qztzntoymuxiqt3b6yi6uqcmgzmds/bin/picard.jar SamToFastq I=NonRiboReads.bam FASTQ=NonRiboReads_R1.fq F2=NonRiboReads_R2.fq FU=NonRiboReads_unpaired.fq
#Picard failed on some of the reads because their mates had been removed in the ribosomal regions. I ignored this error.
#run spades with extracted reads and using trusted contigs from the masked braker gene prediction
module load spades; spades.py --trusted-contigs 4SPADES.cdna.fasta --rna -m 120 -t 16 -1 NonRiboReads_R1.fq -2 NonRiboReads_R2.fq -o SpadesAssembly
Spades Assembly
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/22_spadesTranscripts
SCNgenome.fasta -> ../10_RepeatModeler/SCNgenome.fasta
NonRiboReads.bam -> ../20_ribosomalArrayID/NonRiboReads.bam
4SPADES.cdna.fasta -> ../19_brakerMasked/braker/Hglycines3/4SPADES.cdna.fasta
#Used the cDNAs predicted by braker from the IsoSeq, EST, and RNA-seq alignments against a masked genome.
SPAdes-3.13.1-Linux/bin/spades.py --trusted-contigs 4SPADES.cdna.fasta --rna -m 300 -t 32 -1 NonRiboReads_R1.fq -2 NonRiboReads_R2.fq -o SpadesAssemblyFat
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/22_spadesTranscripts/SpadesAssemblyFat
grep -c ">" transcripts.fasta
100402
#need to align these so they are available as a gff for mikado
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/22_spadesTranscripts/SpadesAssemblyFat/01_GmapAlign
sh runGmap.sh SCNgenome /work/GIF/remkv6/Baum/04_Dovetail2Restart/22_spadesTranscripts/SpadesAssemblyFat/01_GmapAlign/ SCNgenome.fasta transcripts.fasta
less SCNgenome.transcripts.gff3 |awk '$3=="gene"' |grep "path1"|wc
70737 636633 10925032
#get transcript and exon stats since no genes are predicted -- aligned with gmap so only counts for primary alignments here
[remkv6@condo075 23_Mikado]$ less SCNgenome.transcripts.gff3 |grep "path1" |awk '$3=="mRNA"{if($4>$5) {print $4-$5} else {print $5-$4}}' |summary.sh
Total: 858,591,281
Count: 70,737
Mean: 12,137
Median: 795
Min: 2
Max: 1,781,465
less SCNgenome.transcripts.gff3 |grep "\.mrna1\." |awk '$3=="exon"{if($4>$5) {print $4-$5} else {print $5-$4}}' |summary.sh
Total: 63,693,622
Count: 389,188
Mean: 163
Median: 114
Min: 1
Max: 9,438