-
Notifications
You must be signed in to change notification settings - Fork 0
/
install.sh
79 lines (60 loc) · 2.62 KB
/
install.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env bash
#
# install.sh - fetch the RNA-seq practical data and run the first QC steps.
#
# Steps, in order: download and unpack the reads, count the reads per file,
# run FastQC on the raw reads, trim them with Trimmomatic, run FastQC on the
# trimmed reads, then download and decompress the reference sequence and its
# annotation.
# Needs wget, tar, grep, gunzip, fastqc and trimmomatic on PATH. Every path
# below is relative to the directory the script is started from.

# Abort on the first failing command, unset variable or failed pipeline stage,
# so that a failed download or tool run never feeds the following steps.
set -euo pipefail

# Directory parameters
reads='Data/Reads'                    # downloaded archive and extracted FASTQ files
trimming='Data/Trimming'              # Trimmomatic output
genome='Data/Genome'                  # reference sequence and annotation
figures_reads='Figures/Reads'         # FastQC reports for the raw reads
figures_trimming='Figures/Trimming'   # FastQC reports for the trimmed reads

# Url parameters
reads_url='http://rssf.i2bc.paris-saclay.fr/X-fer/AtelierNGS/TPrnaseq.tar.gz'
genome_url='http://hgdownload.soe.ucsc.edu/goldenPath/hg19/chromosomes/chr18.fa.gz'
annotation_url='ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_24/GRCh37_mapping/gencode.v24lift37.basic.annotation.gtf.gz'

# Color parameters
# Stored as backslash escapes; they only become terminal colors once printed
# through `echo -e`.
RED='\033[1;31m'     # bold red
GREEN='\033[1;32m'   # bold green
BLUE='\033[1;34m'    # bold blue
NC='\033[0m'         # reset to the default color
# Step 1: Download reads
# The archive path is derived from the URL so that the downloaded file and the
# file handed to tar can never drift apart.
reads_archive="${reads}/${reads_url##*/}"
mkdir -p "${reads}"
echo -e "\n${BLUE}Downloading reads...${NC}"
# -O always writes to the same file, so a re-run replaces the archive instead
# of saving a numbered copy next to a stale one.
# Stop on failure: every later step needs these files.
if ! wget -O "${reads_archive}" "${reads_url}"; then
  echo -e "${RED}Could not download ${reads_url}${NC}" >&2
  exit 1
fi
echo -e "${GREEN}Done.${NC}"
echo -e "\n${BLUE}Unarchiving reads...${NC}"
if ! tar -zxf "${reads_archive}" -C "${reads}"; then
  echo -e "${RED}Could not extract ${reads_archive}${NC}" >&2
  exit 1
fi
echo -e "${GREEN}Done.${NC}"
echo -e "\n${BLUE}---------------Number of reads per file---------------${NC}"

#######################################
# Count the reads of a FASTQ file.
# One read is counted for every line that consists of a single "+", i.e. the
# separator line of a FASTQ record. Separator lines that carry extra text
# after the "+" are not counted.
# Arguments: $1 - path to the FASTQ file
# Outputs:   the number of reads on stdout ("0" when no line matches)
# Returns:   always 0
#######################################
count_reads() {
  # grep -c exits with status 1 when it counts zero lines; that is a valid
  # result here, not an error.
  grep -c '^+$' -- "$1" || true
}

for file in "${reads}"/*.fastq; do
  # Without a match the glob stays literal; skip it rather than report a
  # file that does not exist.
  [[ -e "${file}" ]] || continue
  echo "${file}: $(count_reads "${file}") reads"
done
# Step 2: Quality control + Reads cleaning
# FastQC is run on every extracted FASTQ file; its reports are written to the
# raw-reads figures directory.
mkdir -p "${figures_reads}"
printf '%b\n' "\n${BLUE}Creation of the fastqc files on raw reads...${NC}"
fastqc -q -f fastq -o "${figures_reads}" "${reads}"/*.fastq
printf '%b\n' "${GREEN}Done.${NC}"
# Trimming procedure (elimination of low quality sequences at the ends of reads)
# conda install -c bioconda trimmomatic
mkdir -p "${trimming}"
for read1_file in "${reads}"/*.R1.fastq; do
  # Without a match the glob stays literal; there is nothing to trim then.
  [[ -e "${read1_file}" ]] || continue
  # Path shared by both mates: <dir>/<sample> for <dir>/<sample>.R1.fastq.
  pair_prefix=${read1_file%.R1.fastq}
  # Same prefix without its directory; used to name the output files.
  pair_name=${pair_prefix##*/}
  echo -e "\n${BLUE}Trimming ${pair_name%.sampled}...${NC}"
  # Paired-end mode: the R1 file and its R2 mate are trimmed together and the
  # output file names are derived from the -baseout name.
  trimmomatic PE "${pair_prefix}.R1.fastq" "${pair_prefix}.R2.fastq" \
    -baseout "${trimming}/${pair_name}.fastq" \
    LEADING:20 TRAILING:20 MINLEN:50 -quiet
  echo -e "${GREEN}Done.${NC}"
done
# Trimming removes the bases at both ends of a read whose quality is below 20 (LEADING:20, TRAILING:20). A read that ends up shorter than 50 bases is discarded (MINLEN:50). Output files marked "P" (paired) hold the reads whose mate was kept as well; files marked "U" (unpaired) hold the reads whose mate was discarded.
# Remark: the 1U and 2U files contain only a small number of sequences (around 10,000), while the 1P and 2P files contain a large number of sequences (slightly fewer than the raw reads).
# Second FastQC pass, this time on every FASTQ file written by the trimming
# step; the reports go to the trimmed-reads figures directory.
mkdir -p "${figures_trimming}"
printf '%b\n' "\n${BLUE}Creation of the fastqc files on trimmed reads...${NC}"
fastqc -q -f fastq -o "${figures_trimming}" "${trimming}"/*.fastq
printf '%b\n' "${GREEN}Done.${NC}"
# Step 3: Download reference genome and annotation
# Options are placed before the operands: not every mkdir/wget accepts them
# after.
mkdir -p "${genome}"
echo -e "\n${BLUE}Downloading genome...${NC}"
# Stop on failure so that gunzip never runs on a missing or partial download.
if ! wget -q -P "${genome}" "${genome_url}"; then
  echo -e "${RED}Could not download ${genome_url}${NC}" >&2
  exit 1
fi
echo -e "${GREEN}Done.${NC}"
echo -e "\n${BLUE}Downloading annotations...${NC}"
if ! wget -q -P "${genome}" "${annotation_url}"; then
  echo -e "${RED}Could not download ${annotation_url}${NC}" >&2
  exit 1
fi
echo -e "${GREEN}Done.${NC}"
# -f: overwrite the files left by an earlier run instead of prompting or
# failing because they already exist.
gunzip -f "${genome}"/*.gz