-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path01Preprocessing_and_mapping.sh
65 lines (54 loc) · 2.4 KB
/
01Preprocessing_and_mapping.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env bash
# Abort on the first failing command, on use of an unset variable, or when any
# stage of a pipeline fails: every step in this script consumes the output of
# an earlier step, so continuing after an error only produces broken files.
set -euo pipefail
# ----------------------------------------------------------------------------------------------------------------------
# Genome file preparation
# Stop if the directory is missing; otherwise the commands below (and the
# closing `cd ../..`) would run relative to the wrong directory.
cd soybean_data/genome_file || exit 1
# Merge the soybean genome, the mt/chlp (organelle) sequences and the rhizobia
# (USDA110) genome into a single FASTA file.
cat Gmax_275_v2.0.fa mtchlp.fa USDA110.fa > Gmax_USDA_org.fa
# Build the HISAT2 indexes: one for the combined genome and one for the rRNA
# sequences. Options are placed before the positional arguments, as in the
# documented usage `hisat2-build [options]* <reference_in> <ht2_base>`.
hisat2-build -p 20 Gmax_USDA_org.fa Gmax_USDA110
hisat2-build -p 20 Gmax_rRNA.fa Gmax_rRNA
# The R script that annotates TSSs on the genome needs the GFF3 annotation
# rewritten by gffread.
gffread Gmax_275_Wm82.a2.v1.gene.gff3 -o Gmax_v2.0_changed.gff3
# Return to the directory the script was started from.
cd ../.. || exit 1
# ----------------------------------------------------------------------------------------------------------------------
# Preprocess the original sequencing data
# Stop if the directory is missing; otherwise every command below (and the
# closing `cd ../..`) would run relative to the wrong directory.
cd soybean_data/stripe-seq_data || exit 1
# The steps below use the nodule sample as an example.
# Remove the 3' adapter (-a) and trailing runs of G with cutadapt; reads shorter
# than 50 nt after trimming are discarded.
cutadapt \
  -a AGATCGGAAGAGCACACG \
  -a "G{100}" -n 2 -j 20 --minimum-length 50 \
  -o Nod.1.rmadaptor3.fastq \
  LCS8044_WXT_Nodule-RT01_R1.fastq.gz
# Remove the 5' adapter (-g) from the reads produced by the previous step.
cutadapt \
  -g GACGCTCTTCCGATCT \
  -j 20 --minimum-length 50 -n 4 \
  -o Nod.1.rm5.fastq \
  Nod.1.rmadaptor3.fastq
# Keep only the reads that contain the NNNNNNNNTATAGGG pattern; the script comes
# from https://github.com/zentnerlab/STRIPE-seq. The next step reads its output
# from Nod.1.pattern.fq.
perl ../../selectReadsByPattern.pl -m NNNNNNNNTATAGGG -o Nod.1.pattern -r Nod.1.rm5.fastq
# Collapse the selected reads with fastx_collapser; from here on the reads are
# in FASTA format.
fastx_collapser -i Nod.1.pattern.fq -o Nod.1.uniq.fa
# Remove the TATAGGG adapter. The input is FASTA, so the output is named *.fa,
# and it is the exact file the rRNA filtering step below reads.
cutadapt -g TATAGGG -j 20 --minimum-length 50 -n 4 -o Nod.1.clean.fa Nod.1.uniq.fa
# Remove rRNA reads: align against the rRNA index (-f: input is FASTA) and keep
# only the reads that fail to align (--un).
hisat2 \
  -t -p 20 \
  --max-intronlen 10000 \
  -f -x ../genome_file/Gmax_rRNA \
  -U Nod.1.clean.fa \
  -S nod.rRNA.sam \
  --un Nod.keep.fasta
# Map the rRNA-free reads to the combined genome index.
hisat2 \
  -t -p 20 \
  --max-intronlen 10000 \
  -f -x ../genome_file/Gmax_USDA110 \
  -U Nod.keep.fasta \
  -S Nod.hisat.sam
# Select the high-quality reads (MAPQ >= 30) mapped to the genome rather than
# to the organelles or the rhizobium; reads mapped to genome scaffolds are
# ignored as well.
# NOTE(review): the perl filter tests the whole SAM line (header lines
# included), case-insensitively, not only the reference-name column -- confirm
# that JX/NC/scaff cannot match read names, sequences or tags of wanted reads.
samtools view -@ 20 -h -q 30 Nod.hisat.sam \
  | perl -nlae 'print if !(/JX|NC|scaff/i)' \
  | samtools sort - -@ 20 -o Nod.soy.hisat.bam
samtools index Nod.soy.hisat.bam -@ 20
# Return to the directory the script was started from.
cd ../.. || exit 1
# ----------------------------------------------------------------------------------------------------------------------