Skip to content

Latest commit

 

History

History
232 lines (147 loc) · 11.6 KB

virome_analysis_pipeline.md

File metadata and controls

232 lines (147 loc) · 11.6 KB

virome analysis pipeline for the global supraglacial viruses

#main scripts for the analysis of the GSgVG catalog

#by Pengfei Liu, Lin Zang, Zhihao Zhang, Huang Xinyu, Tao Ye

#1, assembly of microbial and viral metagenomics by megahit

#
#trimming reads 

for samplename in $(cat trim_sample.tsv)
do trimmomatic PE -phred33 ${samplename}.R1.fastq.gz ${samplename}.R2.fastq.gz ${samplename}_trimmed_R1.fastq.gz ${samplename}_trimmed_U_R1.fastq.gz ${samplename}_trimmed_R2.fastq.gz ${samplename}_trimmed_U_R2.fastq.gz ILLUMINACLIP:TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 -threads 20
done

#
#assembly with megahit

megahit -1 "${read1}" -2 "${read2}" --k-min 21 --k-max 141 --k-step 10 --mem-flag 2 -m 0.5 --out-prefix "${line}" -t 40 --out-dir ./"${line}" &> "${line}"_megahit.log

#2, viral contigs identification from metagenics derived assemblies

#1, virsorter2

virsorter run -w glacier_10k—vs2.out -i glacier_10k.fasta -j 120 --prep-for-dramv --min-length 10000 --include-groups dsDNAphage,NCLDV,RNA,ssDNA,lavidaviridae -d /mnt/nfs/software/VirSorter2/db  classify  &> glacier_10k—vs2_reclassify.log

#2, VIBRANT
python3 VIBRANT_run.py -i glacier_metaG10.fasta -t 20 -l 10000


#3, virfinder
R
library(VirFinder)
predResult <- VF.pred("./glacier_metaG10.fasta")
predResult[order(predResult$pvalue),]
predResult$qvalue <- VF.qvalue(predResult$pvalue)
predResult_sorter<-predResult[order(predResult$qvalue),]
write.table(predResult_sorter, file="virfinder-metaG10.txt", quote=F, sep="\t")


#4, DeepVirFinder
python /home/ptpe/software/DeepVirFinder/dvf.py -i glacier_10k.fasta -o glacier_10k_Deepvirfinder -l 10000 -c 200 &>glacier130.deepvirfinder.log


#5, viralrecall
python /home/PTPE2/Software/viralrecall/viralrecall/viralrecall.py -i /home/PTPE2/User/liupf/Projects_liupf/glacier_virome/mapping_glacier_vOTU/glacier166_bowtie2_mapping/reference/glacier_13475vOTUs.fna -p glacier_13475vOTUs_outdir -t 20 -f &> log

#3, quality checking of viral contigs by CheckV

checkv end_to_end glacier_total_virus_Can.fasta checkv_output_directory -t 30 -d /home/ptpe/biodatabase/checkv_db/checkv-db-v1.0

#discarded list: (virsorter2 SOP: https://www.protocols.io/view/viral-sequence-identification-sop-with-virsorter2-5qpvoyqebg4o/v2)
#virus gene==0 and host gene>=1

awk '$6==0&$7>0' quality_summary.tsv | cut -f1 -d$'\t' > glacier_total_virus_discarded_seq.txt

#kept
seqkit grep -n -v -f glacier_total_virus_discarded_seq.txt  ../virus_tmp.fasta -o glacier_total_virus_final1.fasta

#4, clustering of viral contigs to generate vOTUs, and the taxnomic classifcation and annotation of vOTUs

#Roux et al., 2020-Nucleic Acids Research-IMG/VR v3: an integrated ecological and evolutionary framework for interrogating genomes of uncultivated viruses
#Clustered into vOTUs following the MIUViG guidelines (14) (95% ANI - Average Nucleotide Identity and 85% AF – Aligned Fraction).

First, create a blast+ database: makeblastdb -in <my_seqs.fna> -dbtype nucl -out <my_db>
makeblastdb -in global_virus_contigs_partI5k.fna -dbtype nucl -out global_virus_contigs_partI5k


Next, use megablast from blast+ package to perform all-vs-all blastn of sequences: 
blastn -query global_virus_contigs_partI5k.fna -db global_virus_contigs_partI5k -outfmt '6 std qlen slen' -max_target_seqs 15000 -o global_virus_contigs_partI5k_blast.tsv -num_threads 32

# blastn -query <my_seqs.fna> -db <my_db> -outfmt '6 std qlen slen' -max_target_seqs 10000 -o <my_blast.tsv> -num_threads 32

Note: using the -perc_identity flag will speed up the search at the cost of sensitivity: blastn -query <my_seqs.fna> -db <my_db> -outfmt '6 std qlen slen' -max_target_seqs 10000 -perc_identity 90 -o <my_blast.tsv> -num_threads 32

Next, calculate pairwise ANI by combining local alignments between sequence pairs: anicalc.py -i <my_blast.tsv> -o <my_ani.tsv>

conda activate vibrant
python ~/User/liupf/Projects_liupf/common_files/anicalc.py -i global_virus_contigs_partI5k_blast.tsv -o global_virus_contigs_partI5k_ani.tsv

global_virus_contigs_partI5k_blast.tsv

Finally, perform UCLUST-like clustering using the MIUVIG recommended-parameters (95% ANI + 85% AF): aniclust.py --fna global_virus_contigs_partI5k.fna --ani global_virus_contigs_partI5k_ani.tsv --out global_virus_contigs_partI5k_clusters.tsv --min_ani 95 --min_tcov 85 --min_qcov 0

python ~/User/liupf/Projects_liupf/common_files/aniclust.py --fna global_virus_contigs_partI5k.fna --ani global_virus_contigs_partI5k_ani.tsv --out global_virus_contigs_partI5k_clusters.tsv --min_ani 95 --min_tcov 85 --min_qcov 0


################taxnomy classifiction of vOTUs
#vContact2
prodigal -i glacier_total_virus_final123_5k_95-85.fna  -a glacier_total_virus_final123_95-85.faa -p meta
vcontact2_gene2genome -p glacier_total_virus_final123_95-85.faa -o glacier_total_virus_final123_95-85.csv -s Prodigal-FAA
vcontact2 --raw-proteins glacier_total_virus_final123_95-85.faa --rel-mode Diamond --proteins-fp glacier_total_virus_final123_95-85.csv --db 'ProkaryoticViralRefSeq201-Merged' --pcs-mode MCL --vcs-mode ClusterONE --c1-bin /home/ptpe/software/cluster_one-1.0.jar  --output-dir glacier_virome_vOTU_vc2  -t 60


#blastn
makeblastdb -in IMGVR_all_nucleotides_replace.fna -dbtype nucl -out IMGVR_all_nucleotides -parse_seqids
blastn -task megablast -query glacier_total_virus_final123_5k_95-85.fna -db /home/ptpe/biodatabase/IMG_VR_2020-10-12_5.1/IMGVR_all_nucleotides -evalue 1e-5 -perc_identity 90 -max_target_seqs 25000  -num_threads 20 -out glacier_total_virus_final123_5k_95-85-IMGvr.txt -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"

#blastp
blastp -query glacier_total_vOTU_prot_fixheader.faa -db /home/PTPE2/User/liupf/Projects_liupf/glacier_virome/NCBI-ref206-virus/viral.123.protein -evalue 1e-3 -max_target_seqs 577642 -num_threads 40 -out glacier_total_vOTU_prot-NCBIvprotein.txt -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"

#VPF-classs
screen -r vpf-class
/home/ptpe/software/vpfclass/vpf-class-x86_64 --data-index /home/ptpe/software/vpfclass/data/data-index.yaml -i ./glacier_total_virus_final123_5k_95-85.fna -o vpfclass-glacier-vOTU --workers 20 --chunk-size 20

#Cau tree (see https://github.com/liupfskygre/GSgVG_global-supraglacial_virome/blob/main/scripts/Cau_tree.md file for details)


#virrecall for NCLDVs
python viralrecall.py -i /home/PTPE2/User/liupf/Projects_liupf/glacier_virome/mapping_glacier_vOTU/glacier166_bowtie2_mapping/reference/glacier_13475vOTUs.fna -p glacier_13475vOTUs_outdir -t 60 -f &> liupf_log

#5, in silico prediction of the life style of supraglacial viruses

#checkV, see above output

#VIBRANT, see above output

#bacphlip, 95% cutoff
bacphlip -i glacier_April1_final5k_vOTUs_Cryo21_Anta1.fna --multi_fasta

#6, marcordiversity and microdiverstiy analysis of glacial viral communities

#####marcordiversity 
##mapping with bowtie2
./bowtie2_2_vgenome.sh mapping_vOTUs_TP85_metadata.txt 24  #see scripts for sh files

#quality filtering

#coverage  ==coverM
conda activate coverm
coverm contig --methods tpm --bam-files *.sorted --min-covered-fraction 10 --output-file
#input into R, microeco package for alpha and beta diversity analysis 

####microdiverstiy
##mapping with bowtie2, same as above for
./bowtie2_2_vgenome.sh mapping_vOTUs_TP85_metadata.txt 60 #see scripts for sh files


#metapop
metapop --input_samples ./bamfile --reference ./reference --norm tp-notp-187-metapop_ctfile.txt --threads 50 --whole_genomes -o all187_metapop_out_whole &> all187_metapop_out_whole.log

#7, in silico host prediction of viral vOTUs

###1, CRISPR spacer to TG2G genomes databases
#get spacers by CRT

for file in /home/liupf/Projects/Zanglin_virus/MAGs_final/*.fa
do
java -cp /home/ptpe/software/CRT1.2-CLI.jar crt -minNR 4 -maxRL 55 -maxSL 70 $file "${file%.*}".txt
done

####custome provided spacers
##Parsing spacer files

conda activate spacepharer
spacepharer parsespacer *.txt MAGs_1020_queryDB --threads 5

spacepharer createsetdb MAGs_1020_queryDB MAGs_1020_querySetDB tmpFolder --extractorf-spacer 1


###
cd /home/liupf/Projects/glacier_virome/MAGs-TP-2661ana/MAGs_1020_drep099
cd /home/liupf/Projects/glacier_virome/MAGs-TP-2661ana/MAGs_1020_drep099/MAGs_1020_drep099_CRT

spacepharer predictmatch MAGs_1020_querySetDB /home/liupf/Projects/glacier_virome/SpacePharer_MAGs203/glacier_total_virus_final123_5k_95-85.fna.split/targetSetDB /home/liupf/Projects/glacier_virome/SpacePharer_MAGs203/glacier_total_virus_final123_5k_95-85.fna.split/controlSetDB MAGS-1020-outputFileName.tsv tmpFolder


####2, blastn based
blastn -task megablast -query /home/liupf/Projects/glacier_virome/vContact2/glacier_total_virus_final123_5k_95-85.fna -db glacier1020_nucleotides -evalue 1e-5 -perc_identity 90 -max_target_seqs 25000  -num_threads 20 -out glacier_total_vOTUs_MAGs1020.txt -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"
#Host predictions were then based on matches of ≥90% nucleotide identity covering ≥2 kb of the virus and (putative) host sequences.


##virus host matcher, VHM
python /home/ptpe/software/VirHostMatcher/vhm.py -v /home/liupf/Projects/glacier_virome/VirHostMatcher/virus -b /home/liupf/Projects/glacier_virome/VirHostMatcher/host -o /home/liupf/Projects/glacier_virome/VirHostMatcher/output-test2 &>v.log2

##tRNA blast match
#tRNA from host
tRNAscan-SE -G -Q --thread 50 -a "${file%.*}"_trna.fasta scaffolds.fna

#tRNA from virus
tRNAscan-SE -G -Q --thread 10 -a vOTUs13839_trna.fasta glacier_total_virus_final123_5k_95-85.fna

makeblastdb -in TP-nonTP-tRNA_replace.fna -dbtype nucl -out TP-nonTP-tRNA_MAGs -parse_seqids
#blastn
blastn -task blastn -query vOTUs13839_trna.fna -db TP-nonTP-tRNA_MAGs -evalue 1e-5 -perc_identity 100 -max_target_seqs 50000  -num_threads 30 -out tRNA_vOTUs_blastn_MAGs1364.txt -outfmt "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore qlen slen"

#100% perfect match between the tRNA sequences of viruses and host


#8, Auxiliary metabolic genes (AMGs) of viral vOTUs

#run virsorter with --min-score 0
virsorter run --seqname-suffix-off --viral-gene-enrich-off -w glacier_total_virus_final123_5k_95-85—vs2.out -i glacier_total_virus_final123_5k_95-85.fna --prep-for-dramv --min-length 5000 --min-score 0 -j 40 --include-groups dsDNAphage,NCLDV,RNA,ssDNA,lavidaviridae -d /home/ptpe/biodatabase/vs2_db  all  &> glacier_total_virus_final123_5k_95-85—vs2_all.log

#DRAM-v 
DRAM-v.py annotate -i final-viral-combined-for-dramv.part_001.fa -v final-viral-combined-for-dramv.part_001_for-dramv.tab -o final-viral-combined-for-dramv.part_001_annotation --use_uniref --threads 2 --min_contig_size 5000

#9, metabolic profile of MAGs

#
DRAM.py annotate -i './*.fa' -o Glacier_CRISPR_linked_MAGs --threads 40 --min_contig_size 500 --checkm_quality ~/User/liupf/Projects_liupf/glacier_virome/MAGs-Non-TP-405ana/NonTP-MAGs_344_drep099_checkm_summary.txt --checkm_quality ~/User/liupf/Projects_liupf/glacier_virome/MAGs-TP-2661ana/TP-MAGs_1020_drep099_checkm_summary.txt  --gtdb_taxonomy ~/User/liupf/Projects_liupf/glacier_virome/MAGs-Non-TP-405ana/gtdbtk_out/NonTP-MAGs_344_drep099.bac120.summary.tsv --gtdb_taxonomy ~/User/liupf/Projects_liupf/glacier_virome/MAGs-TP-2661ana/MAGs_1020_drep099/gtdbtk_out/TP-MAGs_1020_drep099.bac120.summary.tsv --gtdb_taxonomy ~/User/liupf/Projects_liupf/glacier_virome/MAGs-TP-2661ana/MAGs_1020_drep099/gtdbtk_out/TP-MAGs_1020_drep099.ar122.summary.tsv --verbose --use_uniref

#10, mapping to NCBI ref genome to assess the potential risk to public health of glacier viruses

#bowtie2
https://github.com/liupfskygre/GSgVG_global-supraglacial_virome/blob/main/scripts/mapping_coverM_NCBI_ref.md