# /work/GIF/remkv6/Baum/04_Dovetail2Restart/10_RepeatModeler
module use /work/GIF/software/modules
sh runRepeatModeler.sh SCNgenome.fasta
################################################################################
#!/bin/bash
# runRepeatModeler.sh
# runs repeat masking for the genome after constructing custom repeat library
# uses repeat modeler for building custom db and RepeatMasking for masking
# run it as:
# runRepeatModeler.sh Genome.fasta
# based on Rick's guide https://intranet.gif.biotech.iastate.edu/doku.php/people:remkv6:genome738polished_repeatmodeler_--de_novo_repeat_identification
#
# Arguments: $1 - genome FASTA file
# Exit code: non-zero on a usage error, an unreadable genome, or when a
#            database/modelling step fails.
# NOTE: kept POSIX-sh compatible because it is invoked as "sh runRepeatModeler.sh".
if [ "$#" -lt 1 ]; then
  # A missing argument is an error: report on stderr and exit non-zero so
  # that calling scripts and schedulers can detect the failure.
  {
    echo "usage: runRepeatModeler <genome.fasta>"
    echo ""
    echo "To build custom repeat library and mask the repeats of the genome"
    echo ""
  } >&2
  exit 1
fi
GENOME="$1"
if [ ! -r "${GENOME}" ]; then
  echo "error: cannot read genome file: ${GENOME}" >&2
  exit 1
fi
module use /shared/software/GIF/modules/
module purge
module load parallel
module load GIF2/repeatmasker/4.0.6
module load GIF2/repeatmodeler/1.0.8
module load GIF2/perl/5.22.1
# Database name is the genome file name without its directory and last extension.
DATABASE="$(basename "${GENOME%.*}").DB"
# Each step consumes the output of the previous one, so stop at the first failure.
BuildDatabase -name "${DATABASE}" -engine ncbi "${GENOME}" || exit 1
RepeatModeler -database "${DATABASE}" -engine ncbi -pa 16 || exit 1
# Locate the classified consensus library below the working directory.
# -type f skips a symlink left behind by an earlier run, and head -n 1 keeps
# ln from receiving several paths when more than one result directory exists.
CLASSIFIED_LIB="$(find "$(pwd)" -type f -name "consensi.fa.classified" | head -n 1)"
if [ -z "${CLASSIFIED_LIB}" ]; then
  echo "error: no consensi.fa.classified found under $(pwd)" >&2
  exit 1
fi
# Drop a dangling link from a previous run, then link the library into the
# working directory unless a usable file or link is already there.
if [ -L consensi.fa.classified ] && [ ! -e consensi.fa.classified ]; then
  rm -f consensi.fa.classified
fi
if [ ! -e consensi.fa.classified ]; then
  ln -s "${CLASSIFIED_LIB}" consensi.fa.classified || exit 1
fi
RepeatMasker -pa 16 -gff -lib consensi.fa.classified "${GENOME}"
################################################################################
#Results of repeatmodeler/masker
==================================================
file name: SCNgenome.fasta
sequences: 9
total length: 157982452 bp (156301001 bp excl N/X-runs)
GC level: 36.66 %
bases masked: 61438776 bp ( 38.89 %)
==================================================
number of length percentage
elements* occupied of sequence
--------------------------------------------------
SINEs: 67 13915 bp 0.01 %
ALUs 0 0 bp 0.00 %
MIRs 0 0 bp 0.00 %
LINEs: 5375 1795726 bp 1.14 %
LINE1 422 69365 bp 0.04 %
LINE2 0 0 bp 0.00 %
L3/CR1 2654 1265055 bp 0.80 %
LTR elements: 10759 6004426 bp 3.80 %
ERVL 0 0 bp 0.00 %
ERVL-MaLRs 0 0 bp 0.00 %
ERV_classI 35 49865 bp 0.03 %
ERV_classII 757 149391 bp 0.09 %
DNA elements: 70403 14691523 bp 9.30 %
hAT-Charlie 0 0 bp 0.00 %
TcMar-Tigger 23 23125 bp 0.01 %
Unclassified: 154224 35100826 bp 22.22 %
Total interspersed repeats: 57606416 bp 36.46 %
Small RNA: 295 107683 bp 0.07 %
Satellites: 775 200791 bp 0.13 %
Simple repeats: 53062 2878802 bp 1.82 %
Low complexity: 17729 1257503 bp 0.80 %
==================================================
* most repeats fragmented by insertions or deletions
have been counted as one element
The query species was assumed to be homo
RepeatMasker version open-4.0.6 , default mode
run with rmblastn version 2.2.27+
The query was compared to classified sequences in "consensi.fa.classified"
RepBase Update 20160829, RM database version 20160829
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/11_AlignRNA
for f in /work/GIF/remkv6/Baum/03_GlandRepeat738/01_Alignment2Genome/*val_*.fq.gz;do ln -s $f;done
for f in /work/GIF/remkv6/Baum/03_GlandRepeat738/02_AlignmentWholeWorm/*val_*.fq.gz;do ln -s $f;done
ln -s ../10_RepeatModeler/SCNgenome.fasta
paste <(ls -1 *val_1*) <(ls -1 *val_2*) |while read a b ; do echo "sh runFeatureCounts.sh "$a" "$b" /work/GIF/remkv6/Baum/04_Dovetail2Restart/11_AlignRNA SCNgenome.fasta";done >align.sh
ls -l *val_1* |awk '{print $11}' |sed 's|/|\t|g' |cut -f 8 |sed 's/\.gz/_sorted.bam/g' |tr "\n" " " |awk '{print "samtools merge AllRNASEQ_sorted.bam "$0}' |less
samtools merge AllRNASEQ_sorted.bam 1703FL-02-01_S1_L001_R1_001_val_1.fq_sorted.bam 1703FL-02-02_S2_L001_R1_001_val_1.fq_sorted.bam 1703FL-02-03_S3_L001_R1_001_val_1.fq_sorted.bam 1703FL-02-04_S4_L001_R1_001_val_1.fq_sorted.bam 1703-TM101_S0_L003_R1_001_val_1.fq_sorted.bam 1703-TM102_S0_L003_R1_001_val_1.fq_sorted.bam SRR6230579_1_val_1.fq_sorted.bam SRR6230580_1_val_1.fq_sorted.bam SRR6230581_1_val_1.fq_sorted.bam SRR6230582_1_val_1.fq_sorted.bam SRR6230583_1_val_1.fq_sorted.bam SRR6230584_1_val_1.fq_sorted.bam SRR6230585_1_val_1.fq_sorted.bam SRR6230586_1_val_1.fq_sorted.bam SRR6230587_1_val_1.fq_sorted.bam
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/11_AlignRNA
ln -s /work/GIF/remkv6/Baum/CamTechGenomeComparison/58_Renamatorium/6_isoseq/consensus_isoforms.fasta
ln -s /work/GIF/remkv6/Baum/01_SCNDovetailScaffolding/09_Maker/H.glycinesEST.fasta
sh runGmap.sh SCNgenome /work/GIF/remkv6/Baum/04_Dovetail2Restart/11_AlignRNA SCNgenome.fasta consensus_isoforms.fasta
sh runGmap.sh SCNgenome /work/GIF/remkv6/Baum/04_Dovetail2Restart/11_AlignRNA SCNgenome.fasta H.glycinesEST.fasta
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/20_ribosomalArrayID
#ribosomal sequences from ncbi nucleotide (tylenchida) and fastas from the old genome
cat ../../01_SCNDovetailScaffolding/08_ribosomalArrays/RDNANucleotide.fasta ../../01_SCNDovetailScaffolding/08_ribosomalArrays/SCNSpecificRibosomalSeq.fasta >RibosomalNCBINOldGenome.fasta
#get prospective coordinates for rDNA arrays
module load blast-plus
makeblastdb -in SCNgenome.fasta -dbtype nucl -out SCNgenome.blastdb
blastn -db SCNgenome.blastdb -query RibosomalNCBINOldGenome.fasta -num_threads 16 -outfmt 6 -out ribo2scn.blastout
module load bedtools2
less ribo2scn.blastout |awk '$12>200 {print $2,$9,$10}'|awk '{if($2>$3) {print $1,$3,$2} else {print $0}}' |sort|uniq|tr " " "\t" |bedtools merge -d 2000 -i - >RiboCoords.bed
module load samtools
samtools view ../19_brakerMasked/AllRNASEQ_sorted.bam -b -h -o rdnaReads.bam -U NonRiboReads.bam -L RiboCoords.bed
#Trinity will not run in a satisfactory time with the rDNA reads, so running with them removed.
ln -s ../11_AlignRNA/SCNgenome.fasta
ln -s ../20_ribosomalArrayID/NonRiboReads.bam
sh runTrinity.sh NonRiboReads.bam
#runTrinity.sh
################################################################################
#!/bin/bash
# runTrinity.sh
# Runs a genome-guided Trinity assembly on an aligned-reads BAM file.
# usage: sh runTrinity.sh <aligned_reads.bam>
#
# Arguments: $1 - BAM file passed to --genome_guided_bam
# No --output is passed, so Trinity writes to its default output directory.
# NOTE: kept POSIX-sh compatible because it is invoked as "sh runTrinity.sh".
if [ "$#" -lt 1 ]; then
  echo "usage: runTrinity.sh <aligned_reads.bam>" >&2
  exit 1
fi
module load GIF2/trinity
bam="$1"
Trinity \
  --genome_guided_bam "${bam}" \
  --max_memory 110G \
  --genome_guided_max_intron 30000 \
  --full_cleanup \
  --CPU 16
################################################################################
#finished in 32hrs
#how many transcripts did we get?
grep -c ">" Trinity-GG.fasta
110683
#map trinity transcripts back to genome
sh runGmap.sh SCNgenome /work/GIF/remkv6/Baum/04_Dovetail2Restart/12_Trinity/trinity_out_dir/ ../SCNgenome.fasta Trinity-GG.fasta
How many gene entries did we get in the gff?
less SCNgenome.Trinity-GG.gff3 |awk '$3=="gene"' |wc
188834 1699506 21760750
#note this was run with all rRNA included
ln -s ../11_AlignRNA/AllRNASEQ_sorted.bam
ln -s ../11_AlignRNA/SCNgenome.fasta
ln -s ../11_AlignRNA/SCNgenome.consensus_isoforms_sorted.bam
ln -s ../11_AlignRNA/SCNgenome.H.glycinesEST_sorted.bam
module use /work/GIF/software/modules
module load GIF/braker/2.1.0
braker.pl --species=Hglycines2 --genome=SCNgenome.fasta --bam=SCNgenome.consensus_isoforms_sorted.bam,SCNgenome.H.glycinesEST_sorted.bam,AllRNASEQ_sorted.bam
#how many genes?
grep -v "#" augustus.hints.gff |awk '$3=="gene"' |wc
35514 319626 2025377
/work/GIF/remkv6/Baum/04_Dovetail2Restart/15_class2
ln -s ../11_AlignRNA/AllRNASEQ_sorted.bam
ln -s ../10_RepeatModeler/SCNgenome.fasta
module load GIF/class2
run_class.pl -a AllRNASEQ_sorted.bam -o AllRNASEQClass2.gtf -p 16 --verbose --clean
#how many transcripts did we generate?
less transcripts.gff |awk '$3=="transcript"' |wc
60967 853538 8375727
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/16_Stringtie
#Tried to run class2 with all of the rnaseq, but times out without finishing, so used rrna removed bam.
ln -s ../10_RepeatModeler/SCNgenome.fasta
ln -s ../20_ribosomalArrayID/NonRiboReads.bam
module load stringtie
stringtie NonRiboReads.bam -j 5 -p 16 -v -o NonRrnaRNASEQ_stringtie.gtf
#how many transcripts did we generate?
less NonRrnaRNASEQ_stringtie.gtf |awk '$3=="transcript"' |wc
41504 747072 6605478
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/17_Portcullis
#This was initially run on the bam file with the rDNA reads. Even after removing the rRNA reads, I had to run this on a high mem node; it used ~180GB ram.
ln -s ../10_RepeatModeler/SCNgenome.fasta
ln -s ../20_ribosomalArrayID/NonRiboReads.bam
module load portcullis
portcullis full --threads 9 --verbose --use_csi --output portcullis_out --orientation FR SCNgenome.fasta NonRiboReads.bam
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/19_brakerMasked
ln -s ../11_AlignRNA/AllRNASEQ_sorted.bam
ln -s ../13_braker/SCNgenome.consensus_isoforms_sorted.bam
ln -s ../13_braker/SCNgenome.H.glycinesEST_sorted.bam
ln -s ../10_RepeatModeler/SCNgenome.fasta.masked
module load GIF/braker/2.1.0
braker.pl --species=Hglycines3 --genome=SCNgenome.fasta.masked --bam=SCNgenome.consensus_isoforms_sorted.bam,SCNgenome.H.glycinesEST_sorted.bam,AllRNASEQ_sorted.bam
#how many genes?
awk '$3=="gene"' augustus.hints.gff |grep -v "#" |wc
22408 201672 1272920
#translate the gff to fasta for use with spades below
~/common_scripts/gff2fasta.pl ../../../10_RepeatModeler/SCNgenome.fasta augustus.hints.gff 4SPADES
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/22_spadesTranscripts
ln -s ../10_RepeatModeler/SCNgenome.fasta
ln -s ../20_ribosomalArrayID/NonRiboReads.bam
ln -s ../19_brakerMasked/braker/Hglycines3/4SPADES.cdna.fasta
#extract only those reads that aligned to the genome
module load picard/2.17.0-ft5qztz; java -Xmx120G -Xms50G -jar /opt/rit/spack-app/linux-rhel7-x86_64/gcc-4.8.5/picard-2.17.0-ft5qztzntoymuxiqt3b6yi6uqcmgzmds/bin/picard.jar SamToFastq I=NonRiboReads.bam FASTQ=NonRiboReads_R1.fq F2=NonRiboReads_R2.fq FU=NonRiboReads_unpaired.fq
#Picard failed on some of the reads, due to a mate being removed in ribosomal areas. ignored this error.
#run spades with extracted reads and using trusted contigs from the masked braker gene prediction
module load spades; spades.py --trusted-contigs 4SPADES.cdna.fasta --rna -m 120 -t 16 -1 NonRiboReads_R1.fq -2 NonRiboReads_R2.fq -o SpadesAssembly
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/22_spadesTranscripts
SCNgenome.fasta -> ../10_RepeatModeler/SCNgenome.fasta
NonRiboReads.bam -> ../20_ribosomalArrayID/NonRiboReads.bam
4SPADES.cdna.fasta -> ../19_brakerMasked/braker/Hglycines3/4SPADES.cdna.fasta
#used the cdna's predicted from braker using isoseq, est's, and rnaseq with a masked genome.
SPAdes-3.13.1-Linux/bin/spades.py --trusted-contigs 4SPADES.cdna.fasta --rna -m 300 -t 32 -1 NonRiboReads_R1.fq -2 NonRiboReads_R2.fq -o SpadesAssemblyFat
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/22_spadesTranscripts/SpadesAssemblyFat
grep -c ">" transcripts.fasta
100402
#need to align these so they are available as a gff for mikado
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/23_Mikado
To get blast annotations I downloaded all Tylenchina sequences from Trembl (179,064 sequences), as there were only ~5000 that have been manually reviewed for Nematoda in uniprot.
cat uniprot-reviewed\:* >uniprotCombined.fasta
ln -s /work/GIF/remkv6/Baum/04_Dovetail2Restart/12_Trinity/trinity_out_dir/SCNgenome.Trinity-GG.gff3
ln -s /work/GIF/remkv6/Baum/04_Dovetail2Restart/15_class2/AllRNASEQClass2.gtf
ln -s /work/GIF/remkv6/Baum/04_Dovetail2Restart/16_Stringtie/NonRrnaRNASEQ_stringtie.gtf
ln -s /work/GIF/remkv6/Baum/04_Dovetail2Restart/17_Portcullis/portcullis_out/3-filt/portcullis_filtered.pass.junctions.bed
ln -s /work/GIF/remkv6/Baum/04_Dovetail2Restart/22_spadesTranscripts/SpadesAssemblyFat/01_GmapAlign/SCNgenome.transcripts.gff3
ln -s ../10_RepeatModeler/SCNgenome.fasta
ln -s ../15_class2/AllRNASEQ_sorted.bam
vi list.txt
###############################################################################
AllRNASEQClass2.gtf cl True
NonRrnaRNASEQ_stringtie.gtf st True
SCNgenome.transcripts.gff3 sp False
SCNgenome.Trinity-GG.gff3 tr False
###############################################################################
###############################################################################
#!/bin/bash
#SBATCH -A its-hpc-condo-las-free
#SBATCH -N 1
#SBATCH -p freecompute
#SBATCH --ntasks-per-node=16
#SBATCH -t 96:00:00
#SBATCH -J MikadoScript_0
#SBATCH -o MikadoScript_0.o%j
#SBATCH -e MikadoScript_0.e%j
#SBATCH [email protected]
#SBATCH --mail-type=begin
#SBATCH --mail-type=end
# Mikado round 1: configure -> prepare -> blastx against the protein set ->
# TransDecoder ORF calling -> serialise -> pick, using the assemblies in list.txt.
ulimit -s unlimited
# Every later path is relative to this directory, so abort if it is missing.
cd /work/GIF/remkv6/Baum/04_Dovetail2Restart/23_Mikado || exit 1
conda activate mikado
#setup variables
genome=SCNgenome.fasta
list="list.txt"
# Protein FASTA used both to build the BLAST database and as the serialise targets.
proteins=uniprotCombined.fasta
#run splice junction prediction
junctions="portcullis_filtered.pass.junctions.bed"
#configure
mikado configure \
  --list "${list}" \
  --reference "${genome}" \
  --mode permissive \
  --scoring worm.yaml \
  --copy-scoring worm.yaml \
  --junctions "${junctions}" configuration.yaml
#prepare
mikado prepare \
  --json-conf configuration.yaml
#blast db
makeblastdb \
  -in "${proteins}" \
  -dbtype prot \
  -parse_seqids
#blast
# stderr goes to blast.log; blank lines are stripped from the XML before saving.
blastx \
  -max_target_seqs 5 \
  -num_threads 16 \
  -query mikado_prepared.fasta \
  -outfmt 5 \
  -db "${proteins}" \
  -evalue 0.000001 2> blast.log | sed '/^$/d' > mikado.blast.xml
blastxml=mikado.blast.xml
#transdecoder
TransDecoder.LongOrfs \
  -t mikado_prepared.fasta
TransDecoder.Predict \
  -t mikado_prepared.fasta \
  --cpu 16
# Take a single BED path so that --orfs always receives exactly one argument.
orfs=$(find "$(pwd)" -name "mikado_prepared.fasta.transdecoder.bed" | head -n 1)
#serialise
# --blast_targets has to be the FASTA the BLAST database was built from (the
# protein set), not the genome; with the genome, hit accessions cannot be
# resolved and serialise reports "<accession> not found in the targets!".
mikado serialise \
  --start-method spawn \
  --procs 16 \
  --blast_targets "${proteins}" \
  --json-conf configuration.yaml \
  --xml "${blastxml}" \
  --orfs "${orfs}"
#pick
mikado pick \
  --start-method spawn \
  --procs 16 \
  --json-conf configuration.yaml \
  --subloci_out mikado.subloci.gff3
# Record the job's scheduler details in the job log.
scontrol show job "${SLURM_JOB_ID}"
###############################################################################
#/work/GIF/remkv6/Baum/04_Dovetail2Restart/23_Mikado/02_Round02
ln -s ../SCNgenome.fasta
ln -s ../SCNgenome.DovetailSCNMaker4.all.maker.transcripts.gff3
ln -s ../brakermasked.gff
ln -s ../uniprotCombined.fasta
ln -s ../portcullis_filtered.pass.junctions.bed
vi list2.txt
###############################################################################
SCNgenome.DovetailSCNMaker4.all.maker.transcripts.gff3 MA True
brakermasked.gff BR True
../01_Round1/mikado.loci.gff3 MI True
###############################################################################
###############################################################################
# Mikado round 2: merges the Maker, Braker and round-1 Mikado models from
# list2.txt in stringent mode (configure -> prepare -> blastx -> TransDecoder
# -> serialise -> pick).
# Every later path is relative to this directory, so abort if it is missing.
cd /work/GIF/remkv6/Baum/04_Dovetail2Restart/23_Mikado/02_Round02 || exit 1
conda init bash
conda activate mikado
#setup variables
genome=SCNgenome.fasta
list="list2.txt"
# Protein FASTA the BLAST database is built from; also the serialise targets.
proteins=uniprotCombined.fasta
#run splice junction prediction
junctions="portcullis_filtered.pass.junctions.bed"
#configure
mikado configure \
  --list "${list}" \
  --reference "${genome}" \
  --mode stringent \
  --scoring worm.yaml \
  --copy-scoring worm.yaml \
  --junctions "${junctions}" configuration.yaml
#prepare
mikado prepare \
  --json-conf configuration.yaml
#blast db
# makeblastdb is disabled for this round. All four lines are commented out:
# leaving the continuation lines active would execute "-in" as a command.
# Re-enable the step if the BLAST index files for uniprotCombined.fasta are
# not present in this directory.
#makeblastdb \
#  -in uniprotCombined.fasta \
#  -dbtype prot \
#  -parse_seqids
#blast
# stderr goes to blast.log; blank lines are stripped from the XML before saving.
blastx \
  -max_target_seqs 5 \
  -num_threads 16 \
  -query mikado_prepared.fasta \
  -outfmt 5 \
  -db "${proteins}" \
  -evalue 0.000001 2> blast.log | sed '/^$/d' > mikado.blast.xml
blastxml=mikado.blast.xml
#transdecoder
TransDecoder.LongOrfs \
  -t mikado_prepared.fasta
TransDecoder.Predict \
  -t mikado_prepared.fasta \
  --cpu 16
# Take a single BED path so that --orfs always receives exactly one argument.
orfs=$(find "$(pwd)" -name "mikado_prepared.fasta.transdecoder.bed" | head -n 1)
#serialise
# --blast_targets has to be the FASTA the BLAST database was built from (the
# protein set), not the genome; with the genome, hit accessions cannot be
# resolved and serialise reports "<accession> not found in the targets!".
mikado serialise \
  --start-method spawn \
  --procs 16 \
  --blast_targets "${proteins}" \
  --json-conf configuration.yaml \
  --xml "${blastxml}" \
  --orfs "${orfs}"
#pick
mikado pick \
  --start-method spawn \
  --procs 16 \
  --json-conf configuration.yaml \
  --subloci_out mikado.subloci.gff3
###############################################################################
#gene length
less ../../../12_SchachtiiSynteny/H_sch_gene_calls_v1_CP.gff |awk '$3=="gene"' |grep -v "#"|awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 78,025,824
Count: 26,739
Mean: 2,918
Median: 2,129
Min: 252
Max: 56,948
#CDS length
(mikado) [remkv6@condofree032 02_Round02]$ less ../../../12_SchachtiiSynteny/H_sch_gene_calls_v1_CP.gff |awk '$3=="CDS"' |grep -v "#"|awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 36,075,582
Count: 222,897
Mean: 161
Median: 125
Min: 1
Max: 13,080
#transcript length
less ../../../12_SchachtiiSynteny/H_sch_gene_calls_v1_CP.gff |awk '$3=="transcript"' |grep -v "#"|awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 101,553,992
Count: 32,624
Mean: 3,112
Median: 2,275
Min: 252
Max: 56,948
#exons per transcript
less ../../../12_SchachtiiSynteny/H_sch_gene_calls_v1_CP.gff |awk '$3=="CDS" ' |cut -f 9 |awk '{print $1}' |sort |uniq -c |awk '{print $1}' |summary.sh
Total: 222,897
Count: 32,624
Mean: 6
Median: 5
Min: 1
Max: 113
#gene length
less mikado.loci.gff3 |awk '$3=="gene"' |grep -v "#"|awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 120,922,982
Count: 38,646
Mean: 3,128
Median: 821
Min: 199
Max: 465,216
#transcript length
less mikado.loci.gff3 |awk '$3=="mRNA"' |grep -v "#"|awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 142,902,933
Count: 45,286
Mean: 3,155
Median: 1,034
Min: 199
Max: 465,216
#exons per transcript
less mikado.loci.gff3 |awk '$3=="CDS" ' |cut -f 9 |sed 's/\./\t/3' |awk '{print $1}' |sort|uniq -c |awk '{print $1}' |summary.sh
Total: 258,802
Count: 45,286
Mean: 5
Median: 3
Min: 1
Max: 194
#exons per transcript, counting only CDS that do not overlap a RepeatMasker repeat
less mikado.loci.gff3 |awk '$3=="CDS"' |bedtools intersect -v -wo -a - -b ../../10_RepeatModeler/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/\./\t/3' |awk '{print $1}' |sort|uniq -c |awk '{print $1}' |summary.sh
Total: 228,567
Count: 31,566
Mean: 7
Median: 5
Min: 1
Max: 194
#how many genes are not repetitive
less mikado.loci.gff3 |awk '$3=="CDS"' |bedtools intersect -v -wo -a - -b ../../10_RepeatModeler/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/\./\t/2' |awk '{print $1}' |sort|uniq|wc
25180 25180 650224
#how many are repetitive then?
By subtraction that is: 13,466 genes
#gene sizes that are not repetitive
less NonRepetitiveGenes.gff3 |awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 104,488,696
Count: 25,180
Mean: 4,149
Median: 1,726
Min: 199
Max: 465,216
less augustus.hints.gff |awk '$3=="gene"' |grep -v "#"|awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 67,011,405
Count: 22,408
Mean: 2,990
Median: 2,015
Min: 200
Max: 52,212
less augustus.hints.gff |awk '$3=="gene"' |grep -v "#"|awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 83,144,932
Count: 35,514
Mean: 2,341
Median: 1,546
Min: 200
Max: 43,263
/work/GIF/remkv6/Baum/01_SCNDovetailScaffolding
less 12_MakerGenesOrthofinder/DovetailSCNMaker4.all.NOFASTA.gff|awk '$3=="gene"' |grep -v "#"|awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 67,168,685
Count: 22,856
Mean: 2,938
Median: 1,811
Min: 5
Max: 65,204
less mikado.loci.gff3| awk '$3=="gene"' |grep -v "#"|awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 131,514,741
Count: 39,516
Mean: 3,328
Median: 920
Min: 199
Max: 465,216
less ../CamTechGenomeComparison/58_Renamatorium/1_genomeNgff/fixed.augustus.gff3|awk '$3=="gene"' |grep -v "#"|awk '{if($4>$5){print $4-$5} else {print $5-$4}}' |summary.sh
Total: 71,832,760
Count: 29,769
Mean: 2,413
Median: 1,603
Min: 79
Max: 65,717
#interproscan
less 01_Interpro/interproAnnot.tsv |awk '{print $1}' |sort|uniq|wc
25779 25779 639842
#proteins to uniprot
less 04_ProtsUniprot/mikado_proteins.vs.uniprot_sprot.cul5.1e5.blastp.out |grep -v "hypothetical" |grep -v "uncharacterized" |awk '{print $1 }' |sort|uniq|wc
1604 1604 39727
#transcripts to uniprot
less 05_TransUniprot/mikado_transcripts.vs.uniprot_sprot.cul5.1e5.blastx.out |grep -v "hypothetical" |grep -v "uncharacterized" |awk '{print $1 }' |sort|uniq|wc
12796 12796 317987
#prots to nr
# Counts distinct query IDs with an informative nr hit (hits described as
# "hypothetical" or "uncharacterized" are excluded). The command is rejoined
# onto one line; it had been split mid-token by terminal wrapping.
less 02_Prots2Nr/mikado_proteinsFixed.vs.nr.cul5.1e5.blastp.out |grep -v "hypothetical" |grep -v "uncharacterized" |awk '{print $1}' |sort|uniq|wc
3056 3056 75814
#transcripts to nt
# Counts distinct query IDs with an informative nt hit (hits described as
# "hypothetical" or "uncharacterized" are excluded).
less 03_Transcrips2Nt/mikado_transcripts.vs.nt.cul5.1e5.blastn.out |grep -v "hypothetical" |grep -v "uncharacterized" |awk '{print $1}' |sort|uniq|wc
2266 2266 56298
#All databases together
cat <(less 01_Interpro/interproAnnot.tsv |awk '{print $1}' |sort|uniq) <(less 02_Prots2Nr/mikado_proteinsFixed.vs.nr.cul5.1e5.blastp.out |grep -v "hypothetical" |grep -v "uncharacterized" |awk '{print $1}' |sort|uniq) <( less 03_Transcrips2Nt/mikado_transcripts.vs.nt.cul5.1e5.blastn.out |grep -v "hypothetical" |grep -v "uncharacterized" |awk '{print $1}' |sort|uniq) <(less 04_ProtsUniprot/mikado_proteins.vs.uniprot_sprot.cul5.1e5.blastp.out |grep -v "hypothetical" |grep -v "uncharacterized" |awk '{print $1 }' |sort|uniq) <(less 05_TransUniprot/mikado_transcripts.vs.uniprot_sprot.cul5.1e5.blastx.out |grep -v "hypothetical" |grep -v "uncharacterized" |awk '{print $1 }' |sort|uniq) |sort|uniq|wc
26951 26951 668844
# Every genes name
awk '$3=="gene"' mikado.loci.gff3 |cut -f 9 |sort|uniq|sed 's/ID=//g' |sed 's/;/\t/1' |cut -f 1 >AllGenes.list
wc AllGenes.list
39516 39516 900462 AllGenes.list
# Genes that have a 50% overlap with a repeat from EDTA
bedtools intersect -f .5 -wo -a <(awk '$3=="gene"' mikado.loci.gff3) -b ../33_EDTA/EDTA/SCNgenome.fasta.EDTA.TEanno.gff |cut -f 9 |sort|uniq|sed 's/ID=//g' |sed 's/;/\t/1' |cut -f 1 >RepeatGenes.list
wc RepeatGenes.list
8316 8316 189033 RepeatGenes.list
less ../38_Expression/GeneCounts |awk '$7<2' |cut -f 1 |sed 's/\./\t/2' |cut -f 1 |sort|uniq >GenesNotExpressed.list
wc GenesNotExpressed.list
19278 19280 439092 GenesNotExpressed.list
#How many genes are expressed and have less than 50% repeat overlap with the gene space?
cat AllGenes.list RepeatGenes.list GenesNotExpressed.list |sort|uniq -c |awk '$1==1 {print $2}' >NonRepeatExpressedGenes.list
wc NonRepeatExpressedGenes.list
17431 17431 34862 NonRepeatExpressedGenes.list
#How many of these 17,431 genes have a functional annotation?
less 06_Combine/SCNgenomeFunctionalGeneAnnotations.gff3 |awk '$3=="mRNA"' |sed 's/;/\t/1' |sed 's/ID=//g' |cut -f 9- |sed 's/\./\t/2' |grep -w -f NonRepeatExpressedGenes.list - |grep -c "Note"
15968
# Separate high confidence genes from others (repetitive and not expressed)
less 06_Combine/SCNgenomeFunctionalGeneAnnotations.gff3 |awk '$3=="gene"' |sed 's/ID=//g' |sed 's/;/\t;/g' |grep -w -f NonRepeatExpressedGenes.list - |awk '{print $1,$2,$3,$4,$5,$6,$7,$8,"ID="$9$10$11}' |tr " " "\t" >SCNgenomeFunctionalGeneAnnotationsHighConfidenceGenesOnly.gff3 &
awk '$3=="mRNA"' mikado.loci.gff3 |sed 's/ID=//g' |sed 's/;/\t;/1' |cut -f 9 |sed 's/\./\t\./2' >AllmRNA.mikado.loci.list
less 06_Combine/SCNgenomeFunctionalGeneAnnotations.gff3 |sed 's/\.CDS/\t\.CDS/1' |sed 's/\.exon/\t\.exon/1' |sed 's/\.three/\t\.three/1' |sed 's/;/\t/1' |sed 's/\.five/\t\.five/1' |sed 's/ID=/ID=\t/1' >mikado.loci.Grepmod.gff3
less NonRepeatExpressedmRNAs.list |while read line; do echo "awk '\$10==\""$line"\"' mikado.loci.Grepmod.gff3 >>SCNgenomeFunctionalGeneAnnotationsHighConfidenceRemainingFeatures.gff3" ;done >GetRemainingFeatures.sh
sh GetRemainingFeatures.sh &
less SCNgenomeFunctionalGeneAnnotationsHighConfidenceRemainingFeatures.gff3 |awk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9$10$11";"$12$13}' |tr " " "\t" |cat - SCNgenomeFunctionalGeneAnnotationsHighConfidenceGenesOnly.gff3 |sort -k1,1V -k4,5n >SCNgenomeFuctionalGeneAnnotationsHighConfidenceAllGenesUnsorted.gff3
perl gff3sort/gff3sort.pl --precise --chr_order natural SCNgenomeFuctionalGeneAnnotationsHighConfidenceAllGenesUnsorted.gff3 >SCNgenomeFunctionalGeneAnnotationsHighConfidenceAllFeatures.gff3
Gene models were found to not have start codons about 50% of the time, Anju says the splicing is not right in some of the effectors.
Need to troubleshoot.
#Move data from condo to nova,
from condo
/work/GIF/remkv6/Baum/04_Dovetail2Restart/23_Mikado/04_Round2Redo
to nova
/work/gif/remkv6/Baum/04_DovetailSCNGenome/01_mikadoRerurn
#had to delete all old mikado files for the rerun.
#kept getting this error
File "/home/remkv6/.conda/envs/mikado/lib/python3.6/site-packages/Mikado/serializers/blast_serializer/xml_serialiser.py", line 161, in run
for pickled in self._pickler(filename):
File "/home/remkv6/.conda/envs/mikado/lib/python3.6/site-packages/Mikado/serializers/blast_serializer/xml_serialiser.py", line 105, in _pickler
max_target_seqs=self.__max_target_seqs)
File "/home/remkv6/.conda/envs/mikado/lib/python3.6/site-packages/Mikado/serializers/blast_serializer/xml_serialiser.py", line 817, in objectify_record
current_target = _get_target_for_blast(self, alignment)
File "/home/remkv6/.conda/envs/mikado/lib/python3.6/site-packages/Mikado/serializers/blast_serializer/xml_serialiser.py", line 780, in _get_target_for_blast
raise KeyError("{} not found in the targets!".format(alignment.accession))
KeyError: '47529 not found in the targets!'
#tried lots of attempts
checked correctness of list2.txt
47529 appears to be a blast target from the Tylenchida EST sequences.
serialize.log says some braker genes failed to be indexed
tried adding --transcripts mikado_prepared.fasta to the mikado file, but it did not have an effect
redownloaded original augustus.hints.gtf from braker masked and converted to GFF3 via genometools gtf_2_gff3
ml genometools
gt gtf_to_gff3 -tidy -o augustus.hints.gff3 augustus.hints.gtf
version 1.5.9
Checked the overlap between portcullis and the other transcript files. It only seems to match up well with the braker predictions and the Round1mikado.loci.gff3.
bedtools intersect -wo -a <(awk '$3=="CDS"' SCNgenome.DovetailSCNMaker4.all.maker.transcripts.gff3) -b portcullis_filtered.pass.junctions.bed |cut -f 1,4,5,10,16,17 |less
bedtools intersect -wo -a <(awk '$3=="CDS"' augustus.hints.gff3) -b portcullis_filtered.pass.junctions.bed |cut -f 1,4,5,10,16,17 |less
bedtools intersect -wo -a <(awk '$3=="CDS"' Round1mikado.loci.gff3) -b portcullis_filtered.pass.junctions.bed |cut -f 1,4,5,10,16,17|less
sh runGmap.sh SCNgenome /work/gif/remkv6/Baum/04_DovetailSCNGenome/01_mikadoRerurn/ SCNgenome.fasta TylenchidaESTNotH.glycines.fasta
#runGmap.sh
###################################################################################
#!/bin/bash
#Makes a database and searches your sequences.
#sh runGmap.sh <database name> <folder of database file ending with a "/"> <Fasta file> <query file>
#examples
#sh runGmap.sh red_abalone_02Jun2017_5fUJu /work/GIF/remkv6/Serb/03_DavideGMAP/ red_abalone_02Jun2017_5fUJu.fasta DavideQuerydna.fasta
#sh runGmap.sh m.yessoensisGenome /work/GIF/remkv6/Serb/03_DavideGMAP DavideQuerydna.fasta   (incomplete example: the <Fasta file> argument is missing)
#sh runGmap.sh Crassostreagigasgenome /work/GIF/remkv6/Serb/03_DavideGMAP Crassostreagigasgenome.fa DavideQuerydna.fasta
#
# Arguments: $1 - GMAP database name
#            $2 - directory that holds (or will hold) the database
#            $3 - genome FASTA used to build the database
#            $4 - query FASTA to align
# Output:    GFF3 (gff3_gene format) written to
#            <database name minus last extension>.<query minus last extension>.gff3
# NOTE: kept POSIX-sh compatible because it is invoked as "sh runGmap.sh".
#module load gsnap
if [ "$#" -lt 4 ]; then
  echo "usage: runGmap.sh <database name> <database folder> <genome fasta> <query fasta>" >&2
  exit 1
fi
dbname="$1"
dbloc="$2"
dbfasta="$3"
query="$4"
# Alignment is pointless without a database, so stop if the build fails.
gmap_build -d "${dbname}" -D "${dbloc}" "${dbfasta}" || exit 1
gmap -D "${dbloc}" -d "${dbname}" -B 5 -t 16 --input-buffer-size=1000000 --output-buffer-size=1000000 -f gff3_gene "${query}" >"${dbname%.*}.${query%.*}.gff3"
###################################################################################
#How many EST's were there?
grep -c ">" TylenchidaESTNotH.glycines.fasta
591134
How many mapped?
awk '$3=="gene"' SCNgenome.TylenchidaESTNotH.glycines.gff3 |wc
61094 549846 5384817
#These are the counts from the related species gene predictions I used in the tylenchida.
awk '$3=="gene"' SCNgenome.TylenchidaESTNotH.glycines.gff3 |cut -f 9 |sed 's/ID=//g' |sed 's/_/\t/1' |cut -f 1 |sort|uniq -c |sort -k1,1nr |less
8305 GPLIN
8248 GROS
372 MhA1
248 BXY
181 augustus
150 snap
134 Dd
...
#Another 43000 that were actual ESTs from NCBI's library
awk '$3=="gene"' SCNgenome.TylenchidaESTNotH.glycines.gff3 |cut -f 9 |sed 's/ID=//g' |sed 's/_/\t/1' |cut -f 1 |sort|uniq -c |sort -k1,1nr |wc
43187
Running this one step at a time to ensure I catch errors
#list3.txt
#################################################################################
SCNgenome.DovetailSCNMaker4.all.maker.transcripts.gff3 MA True -1
Round1mikado.loci.gff3 MI True
SCNgenome.TylenchidaESTNotH.glycines.gff3 RE False -1
augustus.hints.gff3 BR True 1
#################################################################################
#MikadoScript_0.sub
################################################################################
#!/bin/bash
#SBATCH -N 1
#SBATCH --ntasks-per-node=16
#SBATCH -A its-hpc-condo-las-free
#SBATCH -p freecompute
#SBATCH -t 96:00:00
#SBATCH -J MikadoScript_0
#SBATCH -o MikadoScript_0.o%j
#SBATCH -e MikadoScript_0.e%j
#SBATCH [email protected]
#SBATCH --mail-type=begin
#SBATCH --mail-type=end
# Mikado rerun driven by list3.txt. In this state only the BLAST database
# build and the blastx search are active; configure, prepare, TransDecoder,
# serialise and pick are commented out so the pipeline can be run one step
# at a time.
ulimit -s unlimited
cd /work/gif/remkv6/Baum/04_DovetailSCNGenome/01_mikadoRerurn
source activate mikado
#!/bin/bash
#setup variables
# genome, list and junctions are referenced only by the commented-out steps
# below; bam is not referenced anywhere in this script.
genome=SCNgenome.fasta
bam="AllRNASEQ_sorted.bam"
list="list3.txt"
#run splice junction prediction
junctions="portcullis_filtered.pass.junctions.bed"
#configure
#mikado configure \
# --list $list \
# --reference $genome \
# --mode stringent \
# --scoring worm.yaml \
# --copy-scoring worm.yaml \
# --junctions $junctions configuration.yaml
#prepare
#mikado prepare \
# --json-conf configuration.yaml
#blast db
# Builds a protein BLAST database from uniprotCombined.fasta.
makeblastdb \
-in uniprotCombined.fasta \
-dbtype prot \
-parse_seqids
#blast
# Searches mikado_prepared.fasta against the database; XML output (-outfmt 5)
# has blank lines removed and is saved as mikado.blast.xml, stderr goes to
# blast.log.
# NOTE(review): -num_threads 35 is higher than the 16 tasks requested with
# --ntasks-per-node above - confirm which of the two values is intended.
blastx \
-max_target_seqs 5 \
-num_threads 35 \
-query mikado_prepared.fasta \
-outfmt 5 \
-db uniprotCombined.fasta \
-evalue 0.000001 2> blast.log | sed '/^$/d' > mikado.blast.xml
blastxml=mikado.blast.xml
#transdecoder
#TransDecoder.LongOrfs \
# -t mikado_prepared.fasta
#TransDecoder.Predict \
# -t mikado_prepared.fasta \
# --cpu 16
# Looks for an existing TransDecoder BED file below the working directory;
# with the TransDecoder steps disabled it must come from an earlier run.
orfs=$(find $(pwd) -name "mikado_prepared.fasta.transdecoder.bed")
#serialise
# NOTE(review): --blast_targets is given the genome here, while the BLAST
# database is built from uniprotCombined.fasta - presumably the protein FASTA
# is what serialise needs; confirm against the Mikado documentation before
# re-enabling this step.
#mikado serialise \
# --start-method spawn \
# --procs 16 \
# --blast_targets ${genome} \
# --json-conf configuration.yaml \
# --xml ${blastxml} \
# --orfs ${orfs} \
# -mr .5
#pick
#mikado pick \
# --start-method spawn \
# --procs 16 \
# --json-conf configuration.yaml \
# --subloci_out mikado.subloci.gff3 \
# --pad
# Prints the scheduler's record for this job into the job log.
scontrol show job $SLURM_JOB_ID
less mikado.loci.gff3 |sort -k1,1V -k4,5nr |uniq |grep -v "#" |sed 's/transcript/mRNA/1'>UniquedRound2mikado.loci.gff3
gt gff3 -sortlines -checkids -fixregionboundaries -tidy UniquedRound2mikado.loci.gff3 >tidiedUniquedRound2mikado.loci.gff3
#this was still loaded with non-methionine starts, internal stops, and missing terminal stops
Since mikado fails to produce proteins with reliable splicing, start codons, and stop codons, filter out all proteins that do not meet those criteria
#/work/gif/remkv6/Baum/04_DovetailSCNGenome/01_mikadoRerurn/03_MethionineProteinsOnly
#braker genes, masked
gffread ../augustus.hints.gff3 -g SCNgenome.fasta -t mRNA -VHEJ -x BrakerMasked_transcriptsVHEJ.fasta -y BrakerMasked_proteinsVHEJ.fasta
grep -c ">" BrakerMasked_proteinsVHEJ.fasta
24113
#maker genes on 368 scaffold genome
gffread ../SCNgenome.DovetailSCNMaker4.all.maker.transcripts.gff3 -g SCNgenome.fasta -t mRNA -VHEJ -x SCNgenome.DovetailSCNMaker4.all.maker.transcripts_transcriptsVHEJ.fasta -y SCNgenome.DovetailSCNMaker4.all.maker.transcripts_proteinsVHEJ.fasta
grep -c ">" SCNgenome.DovetailSCNMaker4.all.maker.transcripts_proteinsVHEJ.fasta
39896
#spades transcripts
gffread ../SCNgenome.transcripts.gff3 -g SCNgenome.fasta -t mRNA -VHEJ -x Spades_transcriptsVHEJ.fasta -y Spades_proteinsVHEJ.fasta
grep -c ">" Spades_proteinsVHEJ.fasta
23598
#Trinity transcripts
gffread ../SCNgenome.Trinity-GG.gff3 -g SCNgenome.fasta -t mRNA -VHEJ -x Trinity_transcriptsVHEJ.fasta -y Trinity_proteinsVHEJ.fasta
grep -c ">" Trinity_proteinsVHEJ.fasta
41366
# NCBI EST proteins
gffread ../SCNgenome.TylenchidaESTNotH.glycines.gff3 -g SCNgenome.fasta -t mRNA -VHEJ -x NCBIEST_transcriptsVHEJ.fasta -y NCBIEST_proteinsVHEJ.fasta
[remkv6@nova005 03_MethionineProteinsOnly]$ grep -c ">" NCBIEST_proteinsVHEJ.fasta
1750
# Class2 transcripts
gt gtf_to_gff3 -tidy <(sort -k1,1V -k4,5nr AllRNASEQClass2.gtf|grep -v "#" |uniq) >AllRNASEQClass2.gff3
gt gff3 -sortlines -tidy -fixregionboundaries AllRNASEQClass2.gff3 >tidiedAllRNASEQClass2.gff3
gt cds -startcodon -finalstopcodon -seqfile SCNgenome.fasta -matchdesc tidiedAllRNASEQClass2.gff3 >FixedClass2.gff3
gffread ../FixedClass2.gff3 -g SCNgenome.fasta -t mRNA -VHEJ -x AllRNASEQClass2_transcriptsVHEJ.fasta -y AllRNASEQClass2_proteinsVHEJ.fasta
grep -c ">" AllRNASEQClass2_proteinsVHEJ.fasta
36212
#stringtie conversion
gt gtf_to_gff3 -tidy <(sort -k1,1V -k4,5nr NonRrnaRNASEQ_stringtie.gtf|grep -v "#" |uniq) >NonRrnaRNASEQ_stringtie.gff3
gt gff3 -sortlines -tidy -fixregionboundaries NonRrnaRNASEQ_stringtie.gff3 >tidiedNonRrnaRNASEQ_stringtie.gff3
gt cds -startcodon -finalstopcodon -seqfile SCNgenome.fasta -matchdesc tidiedNonRrnaRNASEQ_stringtie.gff3 >FixedNonRrnaRNASEQ_stringtie.gff3
gffread ../FixedNonRrnaRNASEQ_stringtie.gff3 -g SCNgenome.fasta -VHEJ -t mRNA -x Stringtie_transcriptsVHEJ.fasta -y Stringtie_proteinsVHEJ.fasta
grep -c ">" Stringtie_proteinsVHEJ.fasta
29063
#braker unmasked
# Convert the BRAKER (unmasked) predictions to GFF3, tidy/sort them, run "gt cds" against the
# genome, then extract transcript and protein FASTA with gffread (-VHEJ filtering).
gt gtf_to_gff3 -tidy <(sort -k1,1V -k4,5nr BrakerUnmasked.gff|grep -v "#" |uniq) >BrakerUnmasked.gff3
gt gff3 -sortlines -tidy -fixregionboundaries BrakerUnmasked.gff3 >tidiedBrakerUnmasked.gff3
# Output name corrected from "FixedBrqakerUnmasked.gff3": the gffread call below and the later
# bedtools/mikado steps all read FixedBrakerUnmasked.gff3.
gt cds -startcodon -finalstopcodon -seqfile SCNgenome.fasta -matchdesc tidiedBrakerUnmasked.gff3 >FixedBrakerUnmasked.gff3
# NOTE(review): gffread reads the file from ../ while gt cds writes it to the current directory —
# presumably gffread is run from a subdirectory; confirm.
gffread ../FixedBrakerUnmasked.gff3 -g SCNgenome.fasta -t mRNA -VHEJ -x BrakerUnmasked_transcriptsVHEJ.fasta -y BrakerUnmasked_proteinsVHEJ.fasta
grep -c ">" BrakerUnmasked_proteinsVHEJ.fasta
37257
#run genomethreader on all these datasets
# Writes one command line per protein FASTA into gth.sh. Each generated line creates a
# per-dataset directory, links the genome and the protein file into it, activates the
# genomethreader conda environment and runs gth there.
# Only *_proteinsVHEJ.fasta is globbed: the earlier "*fasta" pattern also matched SCNgenome.fasta
# and the *_transcriptsVHEJ.fasta nucleotide files, which are not protein input for "gth -protein".
# NOTE(review): every generated line does "cd" without returning, so the lines are presumably
# meant to be run independently (one job per line) — confirm.
for f in *_proteinsVHEJ.fasta; do
  # If the glob matched nothing the literal pattern is left in $f; emit no command for it.
  [ -e "$f" ] || continue
  echo "mkdir ${f}dir; cd ${f}dir; ln -s ../SCNgenome.fasta; ln -s ../${f} ; ml miniconda3; source activate genomethreader;gth -genomic SCNgenome.fasta -protein ${f} -gff3out -species nematode -skipalignmentout -o ${f%.*}aln -force"
done >gth.sh
#/work/gif/remkv6/Baum/04_DovetailSCNGenome/01_mikadoRerurn
less mikado_proteins.fasta |tr "\n" "\t" |sed 's/>/\n>/g' |awk '{print substr($3,1,1)}' |sort|uniq -c |less
#Results
#########################
1
177 .
374 A
297 C
290 D
239 E
324 F
331 G
212 H
440 I
477 K
1877 L
34562 M
525 N
253 P
230 Q
360 R
1414 S
285 T
343 V
197 W
31 X
181 Y
#########################
#So how many are good?
less mikado_proteinsVHEJ.fasta |tr "\n" "\t" |sed 's/>/\n>/g' |awk '{print substr($3,1,1)}' |sort|uniq -c |less
1
29959 M
#Create a format of 'mrna_name\tgene_name'
grep ">" mikado_proteinsVHEJ.fasta|awk '{print $1}' |sed 's/>//g' |cat - AllProteins.list |sort|uniq -c |awk '$1==1{print $2,$2}' |sed 's/\./\t/4' |cut -f 1 |tr " " "\t" >BadGenes.list
#How many were bad?
wc -l ../BadGenes.list
13460 ../BadGenes.list
#remove them from the gff
mikado util grep -v BadGenes.list mikado.loci.gff3 RemoveBadGenesmikado.loci.gff3
#how many genes are there?
less RemoveBadGenesmikado.loci.gff3 |awk '$3=="gene"' |wc
26016 234144 4691582
#how many mrnas/proteins are there?
less RemoveBadGenesmikado.loci.gff3 |awk '$3=="mRNA"' |wc
29959 269631 5155985
#get the bad ones for assessment
mikado util grep BadGenes.list mikado.loci.gff3 BadGenesmikado.loci.gff3
#Get the bad genes and compare them to the braker unmasked prediction to obtain appropriate CDS for those genes.
#how many will I get in the exchange?
bedtools intersect -wo -a ../BadGenesmikado.loci.gff3 -b ../FixedBrakerUnmasked.gff3 |awk '$3=="gene"' |awk '$12=="gene"' |cut -f 18 |sort|uniq|sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq|wc
8893 8893 86151
#Make a tabular file of mrna name, gene name
awk '$3=="mRNA"' ../FixedBrakerUnmasked.gff3 |cut -f 9 |sed 's/ID=//g' |sed 's/Parent=//g' |sed 's/;/\t/g' |cut -f 1,2 >brakerunmaskedGrepMod.list
bedtools intersect -wo -a ../BadGenesmikado.loci.gff3 -b ../FixedBrakerUnmasked.gff3 |awk '$3=="gene"' |awk '$12=="gene"' |cut -f 18 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq|grep -w -f - brakerunmaskedGrepMod.list >List2GrabFromBraker.list
mikado util grep List2GrabFromBraker.list ../FixedBrakerUnmasked.gff3 AugustusGenes4Concat.gff3
less AugustusGenes4Concat.gff3 |grep -v "^#" |awk '{if($3=="gene" || $3=="mRNA") {print $1,"AUGUSTUS",$3,$4,$5,$6,$7,$8,$9,$9} else {print $1,"AUGUSTUS",$3,$4,$5,$6,$7,$8,$9}}' |tr " " "\t" |sed 's/;/\t/2' |cut -f 1-10 |sed 's/\tID=/;Name=/2' >augustusreformatName.gff3
cat augustusreformatName.gff3 ../RemoveBadGenesmikado.loci.gff3|grep -v "^#" >NeedsRenamedFinalGenes.gff3
gt gff3 -tidy -sortlines -checkids -fixregionboundaries NeedsRenamedFinalGenes.gff3 >SCNGenePredictions.gff3
#get the proteins and transcript sequences
gffread SCNGenePredictions.gff3 -g ../SCNgenome.fasta -t mRNA -x SCNGenePredictions_VHEJtranscripts.fasta -y SCNGenePredictionsVHEJ_proteins.fasta
#Verify that there are the right number of proteins and mrnas in the gff
(mikado) [remkv6@nova023 05_FinalGenePrediction]$ grep -c ">" FinishedGenePredictionVHEJ_proteins.fasta
39689
(mikado) [remkv6@nova023 05_FinalGenePrediction]$ awk '$3=="mRNA"' ReformattedNeedsRenamedFinalGenes.gff3 |wc
39689 357201 5205040
#################################################################################
#Maker fails with some transcripts, so abandoned for now
#ml maker maker/2.31.10_3.1
#maker maker_map_ids --suffix .t --prefix HetGly. --iterate 1 --justify 8 ReformattedNeedsRenamedFinalGenesNcrnaRemoved.gff3 >ReformattedNeedsRenamedFinalGenesNcrnaRemoved.map
maker map_gff_ids ReformattedNeedsRenamedFinalGenesNcrnaRemoved.map ReformattedNeedsRenamedFinalGenesNcrnaRemoved.gff3
#swaps around the Name= to match ID=, ditches alias, adds augustus to col2 where missing
#less ReformattedNeedsRenamedFinalGenesNcrnaRemoved.gff3 |awk '{if($3=="mRNA" || $3=="gene" ) {print $0} else {gsub(/;/,"\t",$9);print $0}}' |tr " " "\t" |cut -f 1-9 |sed 's/;Name=.*//g' |grep -v "^#" |awk '{if($3=="gene" || $3=="mRNA") {print $1,$2,$3,$4,$5,$6,$7,$8,$9,$9} else {print $0}}' |tr " " "\t" |sed 's/;/\t/2' |cut -f 1-10 |sed 's/\tID=/;Name=/2' |awk '{if ($2==".") {print $1,"AUGUSTUS",$3,$4,$5,$6,$7,$8,$9} else {print $1,$2,$3,$4,$5,$6,$7,$8,$9}}' |tr " " "\t" >FinishedGenePrediction.gff3
#How many mRNA's in the gene prediction overlap with repetitive elements?
#60% overlap
(mikado) [remkv6@nova023 05_FinalGenePrediction]$ bedtools intersect -f .6 -wo -a <(awk '$3=="mRNA"' SCNGenePredictions.gff3) -b ../04_Expression/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq|wc
5150 5150 49819
#50% overlap
(mikado) [remkv6@nova023 05_FinalGenePrediction]$ bedtools intersect -f .5 -wo -a <(awk '$3=="mRNA"' SCNGenePredictions.gff3) -b ../04_Expression/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq|wc
6133 6133 59294
#30% overlap
(mikado) [remkv6@nova023 05_FinalGenePrediction]$ bedtools intersect -f .3 -wo -a <(awk '$3=="mRNA"' SCNGenePredictions.gff3) -b ../04_Expression/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq|wc
8700 8700 84152
#20% overlap
(mikado) [remkv6@nova023 05_FinalGenePrediction]$ bedtools intersect -f .2 -wo -a <(awk '$3=="mRNA"' SCNGenePredictions.gff3) -b ../04_Expression/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq|wc
10813 10813 104663
#Any intersect
(mikado) [remkv6@nova023 05_FinalGenePrediction]$ bedtools intersect -wo -a <(awk '$3=="mRNA"' SCNGenePredictions.gff3) -b ../04_Expression/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq|wc
25929 25929 251739
#What if we consider only CDS overlaps with repeats
#Any overlap between any cds and any repeat
# Counts distinct Parent IDs of CDS features that overlap a repeat annotation at all.
# No -f fraction is given here: the command was previously recorded with "-f .5", identical to the
# 50% run below, yet the two recorded counts differ (14159 here vs 11791), so this count was taken
# without a minimum-overlap filter.
bedtools intersect -wo -a <(awk '$3=="CDS"' SCNGenePredictions.gff3) -b ../04_Expression/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/Parent=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq|wc
14159 14159 137011
# 50% of the CDS length, only one CDS required
bedtools intersect -f .5 -wo -a <(awk '$3=="CDS"' SCNGenePredictions.gff3) -b ../04_Expression/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/Parent=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq|wc
11791 11791 114040
# requires at least 2 CDS to overlap any part of repeat
# Counts Parent IDs that appear in more than one CDS/repeat overlap record.
# No -f fraction is given here ("any part"): the command was previously recorded with "-f .5", but
# its recorded count (9775) is larger than the -f .3 count below (8313), which a stricter 50%
# filter could not produce.
bedtools intersect -wo -a <(awk '$3=="CDS"' SCNGenePredictions.gff3) -b ../04_Expression/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/Parent=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq -c |awk '$1>1' |wc
9775 19550 172686
#requires at least 30% overlap of cds by the repeat, and two CDS's
bedtools intersect -f .3 -wo -a <(awk '$3=="CDS"' SCNGenePredictions.gff3) -b ../04_Expression/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/Parent=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq -c |awk '$1>1' |wc
8313 16626 146776
#Probably the most conservative measure, same as above
# Writes to RepetitiveBy30PercOverlap2CDS.list the Parent IDs (column 9 with "Parent=" removed,
# text before the first ";") that occur in more than one overlap record when CDS features are
# intersected with the repeat GFF at -f .3.
# NOTE(review): uniq -c counts overlap records, not distinct CDS features; with -wo a single CDS
# that hits two repeat features may also reach a count of 2 — confirm this matches the
# "two CDS's" intent.
bedtools intersect -f .3 -wo -a <(awk '$3=="CDS"' SCNGenePredictions.gff3) -b ../04_Expression/SCNgenome.fasta.out.gff |cut -f 9 |sed 's/Parent=//g' |sed 's/;/\t/g' |cut -f 1 |sort|uniq -c |awk '$1>1 {print $2}' >RepetitiveBy30PercOverlap2CDS.list
#How many of the mRNAs have an annotation that is transposon, helitron, or transposase?
bedtools intersect -wo -f .5 -r -a <(awk '$3=="mRNA"' SCNGenePredictions.gff3) -b <(awk '$3=="mRNA"' ../OldPredictionSCNgenomeFunctionalGeneAnnotations.gff3 ) |grep -i "transposon" |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort |uniq >TransposonGenes.list
bedtools intersect -wo -f .5 -r -a <(awk '$3=="mRNA"' SCNGenePredictions.gff3) -b <(awk '$3=="mRNA"' ../OldPredictionSCNgenomeFunctionalGeneAnnotations.gff3 ) |grep -i "transposase" |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort |uniq >TransposaseGenes.list
bedtools intersect -wo -f .5 -r -a <(awk '$3=="mRNA"' SCNGenePredictions.gff3) -b <(awk '$3=="mRNA"' ../OldPredictionSCNgenomeFunctionalGeneAnnotations.gff3 ) |grep -i "helitron" |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort |uniq >HelitronGenes.list
cat TransposonGenes.list TransposaseGenes.list HelitronGenes.list |sort|uniq >GenesWithTransposonAnnotations.list
less GenesWithTransposonAnnotations.list|wc
279 279 2707
#identify by name only "gland" and "effector"
# For each keyword, intersect the new mRNA features with the old functional-annotation mRNAs
# (-f .5 -r), keep the records whose text matches the keyword case-insensitively, and write the
# new mRNA IDs (column 9, "ID=" removed, text before the first ";") to a list.
bedtools intersect -wo -f .5 -r -a <(awk '$3=="mRNA"' SCNGenePredictions.gff3) -b <(awk '$3=="mRNA"' ../OldPredictionSCNgenomeFunctionalGeneAnnotations.gff3 ) |grep -i "effector" |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort |uniq >effector.list
# Overwrite (">") rather than append (">>"), matching effector.list above, so a re-run or a stale
# glandGenes.list cannot carry old IDs into EffectorGenes.list.
bedtools intersect -wo -f .5 -r -a <(awk '$3=="mRNA"' SCNGenePredictions.gff3) -b <(awk '$3=="mRNA"' ../OldPredictionSCNgenomeFunctionalGeneAnnotations.gff3 ) |grep -i "gland" |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 |sort |uniq >glandGenes.list
# Union of both keyword lists.
cat glandGenes.list effector.list |sort|uniq >EffectorGenes.list
wc EffectorGenes.list
396 396 3780 EffectorGenes.list
#how many of these effectors are repetitive as called by repeatmasker
cat <(sort EffectorGenes.list |uniq) <(sort RepetitiveBy30PercOverlap2CDS.list |uniq) |sort|uniq -c |awk '$1==2' |wc
176 352 3031
awk '$3=="mRNA" ' SCNGenePredictions.gff3 |cut -f 9 |sed 's/ID=//g' |sed 's/;/\t/g' |cut -f 1 >AllmRNAs.list
#Remove the overlap with repeats, and annotated transposon proteins
#Figure in non expressed transcripts later when featurecounts is available
# Stage 1: concatenate all mRNA IDs with the transposon-annotated and repeat-overlapping lists and
# keep IDs seen exactly once, i.e. mRNAs that are in neither removal list.
# Stage 2: add the effector mRNAs back as a union (sort -u). The earlier form repeated
# "uniq -c | awk '$1==1'" here, which is a symmetric difference: it dropped every effector that had
# survived stage 1 and kept only the effectors that stage 1 had removed.
# NOTE(review): assumes the intent is to retain effector genes regardless of repeat overlap —
# confirm. The count recorded below was produced by the earlier form.
cat AllmRNAs.list GenesWithTransposonAnnotations.list RepetitiveBy30PercOverlap2CDS.list |sort|uniq -c |awk '$1==1{print $2}' |cat - EffectorGenes.list |sort -u >NoRepeatFinalGene.List
wc NoRepeatFinalGene.List
31233 31233 304022 NoRepeatFinalGene.List