-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathSimRead
executable file
·142 lines (114 loc) · 5.17 KB
/
SimRead
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/bin/bash
#####################################################################################################################
# WORK SPACE AND DOWNLOAD GENOME REFERENCE #
# BY: FRANCISCO ASCUE #
# email: [email protected] #
#####################################################################################################################
# Terminal color palette used to decorate the progress messages of this script.
# Every capability is looked up through term_cap so that a missing tput binary
# or an unset/unknown $TERM (cron jobs, CI, "dumb" terminals) yields empty
# strings instead of one tput error message per lookup on stderr.

# Print the escape sequence for a terminfo capability, or nothing on failure.
term_cap() { tput "$@" 2>/dev/null || true; }

bold=$(term_cap bold)                           # make colors bold/bright
red="$bold$(term_cap setaf 1)"                  # bright red text
green=$(term_cap setaf 2)                       # dim green text
fawn=$(term_cap setaf 3); beige="$fawn"         # dark yellow text
yellow="$bold$fawn"                             # bright yellow text
darkblue=$(term_cap setaf 4)                    # dim blue text
blue="$bold$darkblue"                           # bright blue text
purple=$(term_cap setaf 5); magenta="$purple"   # magenta text
pink="$bold$purple"                             # bright magenta text
darkcyan=$(term_cap setaf 6)                    # dim cyan text
cyan="$bold$darkcyan"                           # bright cyan text
gray=$(term_cap setaf 7)                        # dim white text
darkgray="$bold$(term_cap setaf 0)"             # bold black = dark gray text
white="$bold$gray"                              # bright white text

# Attribute reset. It is hard-coded rather than read from terminfo, so only
# emit it when at least one capability above resolved; otherwise a raw escape
# sequence would be written to an output that understands none of them.
if [[ -n "$bold$green" ]]; then
  normal=$'\e[0m'
else
  normal=''
fi
# Help text: printed for -h and echoed to stderr after an option error.
# The program name is derived from the path the script was invoked with.
# Command substitution strips the here-document's trailing newline, so the
# stored text ends right after the last option line.
usage=$(cat <<EOF
$(basename "$0") -- Program to create simulation reads for illumina sequencing in GNU/Linux.
program maintended at https://github.com/FranciscoAscue/Minicurso_transcriptomica/blob/master/SimRead
usage: $(basename "$0") -n <project> -g <N.A. NCBI> -r <20000> -f <fastqfile>
where:
-h Show this help text
-n <name> Name of project
-g <N.A. NCBI> N.A. of reference genome
-r <number> Number of reads simulated
-f <str> Name of fastq files
-s <number> set the seed value (default: 42)
-m <float> Rate of mutation of reference genome
-e <float> Rate of error sequencing simulation
EOF
)
## defaults
seed=42           # random seed, overridable with -s
PJT="Project"     # project directory name, overridable with -n
GF=""             # reference accession, set with -g
reads=3000        # number of reads to simulate, overridable with -r
name="file"       # base name of the fastq files, overridable with -f
rmutation=0       # mutation rate of the reference genome, overridable with -m
rerror=0.001      # sequencing error rate, overridable with -e

#######################################
# Parse the command-line options into the globals defined above.
# Globals:   seed, PJT, GF, reads, name, rmutation, rerror (written);
#            usage, cyan, normal (read); OPTIND (reset, then left pointing
#            at the first non-option argument)
# Arguments: the script's command-line arguments ("$@")
# Outputs:   help text on stdout for -h; diagnostics and help on stderr
#            for a bad option or value
# Returns:   0 on success; exits 0 after -h, exits 1 on any error
#######################################
parse_args() {
  local option
  OPTIND=1
  # -h takes no argument (the previous "h:" made a bare -h fail with
  # "missing argument"), and -s is accepted as the usage text promises.
  while getopts ':hn:g:r:f:s:m:e:' option; do
    case "$option" in
      h)
        # Reset the color afterwards so the terminal is not left cyan.
        echo "${cyan}$usage${normal}"
        exit 0
        ;;
      n) PJT="$OPTARG" ;;
      g) GF="$OPTARG" ;;
      r) reads="$OPTARG" ;;
      f) name="$OPTARG" ;;
      s) seed="$OPTARG" ;;
      m) rmutation="$OPTARG" ;;
      e) rerror="$OPTARG" ;;
      :)
        printf "missing argument for -%s\n" "$OPTARG" >&2
        echo "$usage" >&2
        exit 1
        ;;
      \?)
        printf "illegal option: -%s\n" "$OPTARG" >&2
        echo "$usage" >&2
        exit 1
        ;;
    esac
  done

  # Reject values that cannot be counts or seeds before any work is done.
  if [[ ! "$reads" =~ ^[0-9]+$ ]]; then
    printf 'invalid number of reads for -r: %s\n' "$reads" >&2
    echo "$usage" >&2
    exit 1
  fi
  if [[ ! "$seed" =~ ^[0-9]+$ ]]; then
    printf 'invalid seed for -s: %s\n' "$seed" >&2
    echo "$usage" >&2
    exit 1
  fi
}

## getopts
parse_args "$@"
shift $((OPTIND - 1))
# Announce the start time, print the directory layout that is about to be
# created, then create it.

# An empty project name would make mkdir build the tree directly under "/",
# so fall back to the default project name in that case.
: "${PJT:=Project}"

echo "${green}started at $(date)${normal}"
cat <<EOF
${cyan}-------------------------------------------------------------
--------------CREATE WORKING DIRECTORY------------------
-------------------------------------------------------------

── $PJT/
│ └── data/ <- Folder to store reads and references files
│ ├── reads/ <- Reads illumina simulated
│ ├──file_1.fastq <- Forward read
│ ├──file_2.fastq <- Reverse read
│
│ ├── reference/ <- Host genomes files (.fasta)
│ ├──NA_NCBI.fasta <- NCBI download fasta file
│
│ └── results/ <- Folder to store data generated during processing steps
│
│ └── scripts/ <- Folder to store scripts for data processing
├── logs/ <- Results logs during processing steps
${normal}

EOF

# Only the variable is quoted: the braces must stay unquoted to expand, while
# the project name must not be word-split. "--" protects names starting with "-".
if ! mkdir -p -- "$PJT"/{data/{reads,reference},results,scripts/logs}; then
  printf 'error: cannot create project directories under %s\n' "$PJT" >&2
  exit 1
fi
### Download reference genome
# Looks up the accession given with -g in the NCBI nucleotide database using
# the Entrez Direct tools, prints part of its summary record and stores the
# FASTA sequence in the project's reference folder.
echo "${blue}-------------------------------------------------------------"
echo "--------------DOWNLOAD GENOME REFERENCE------------------"
echo "-------------------------------------------------------------"
echo "${normal}"

# Without an accession the query would be empty and the result would be
# written to a file literally named ".fasta"; stop before touching the network.
if [[ -z "$GF" ]]; then
  printf 'error: no reference accession given (use -g <N.A. NCBI>)\n' >&2
  exit 1
fi

for tool in esearch esummary efetch; do
  if ! command -v "$tool" >/dev/null 2>&1; then
    printf 'error: required tool not found in PATH: %s\n' "$tool" >&2
    exit 1
  fi
done

reference_fasta="$PJT/data/reference/${GF}.fasta"

echo "${fawn}"
# Put each XML element of the summary on its own line, turn the remaining
# angle brackets into tabs, and show only lines 6-41 of the result.
# (Uses \n and \t in the sed replacement, as the original commands did.)
esearch -db nucleotide -query "$GF" \
  | esummary \
  | sed -e 's/></\n/g' -e 's/>/\t/g' -e 's/</\t/g' \
  | awk 'NR>=6&&NR<=41'
echo ""

# The pipeline status is efetch's; the size check also catches the case where
# the search matched nothing and an empty file was written.
if ! esearch -db nucleotide -query "$GF" | efetch -format fasta > "$reference_fasta" \
  || [[ ! -s "$reference_fasta" ]]; then
  printf '%serror: could not download reference %s into %s\n' \
    "${normal}" "$GF" "$reference_fasta" >&2
  exit 1
fi
### Reads simulation
# Simulates paired-end reads from the downloaded reference with wgsim: one run
# with the user's mutation/error rates, then three extra runs at fixed error
# rates whose reads are appended to the same pair of fastq files.
echo "${red}-------------------------------------------------------------"
echo "--------------READS ILLUMINA SIMULATION------------------"
echo "-------------------------------------------------------------${normal}"
echo "${green}"

reference_fasta="$PJT/data/reference/${GF}.fasta"
forward_fastq="$PJT/data/reads/${name}_1.fastq"
reverse_fastq="$PJT/data/reads/${name}_2.fastq"
sim_seed="${seed:-42}"

if [[ ! "$sim_seed" =~ ^[0-9]+$ ]]; then
  printf '%serror: seed must be a non-negative integer, got: %s\n' "${normal}" "$sim_seed" >&2
  exit 1
fi
if [[ ! -s "$reference_fasta" ]]; then
  printf '%serror: reference genome missing or empty: %s\n' "${normal}" "$reference_fasta" >&2
  exit 1
fi

# Main run. The seed (documented as -s, default 42) is passed with -S so the
# simulation is reproducible; 10# keeps a value such as "08" from being read
# as octal.
main_cmd=(
  wgsim -1151 -2151 -d300 -r"${rmutation}" -e"${rerror}" -N"${reads}" -R0 -X0
  -S"$((10#$sim_seed))"
  "$reference_fasta" "$forward_fastq" "$reverse_fastq"
)
# Log the command that is actually executed, with the real file names.
echo "${main_cmd[*]}"
if ! "${main_cmd[@]}"; then
  printf '%serror: wgsim failed for %s\n' "${normal}" "$reference_fasta" >&2
  exit 1
fi

# Extra runs at fixed error rates.
# NOTE(review): as in the original, these runs do not pass -r, so wgsim's own
# default mutation rate applies instead of $rmutation - confirm this is intended.
run=0
for i in 0.01 0.005 0.001; do
  run=$((run + 1))
  # Names that cannot collide with the final "${name}_N.fastq" outputs.
  extra_forward="$PJT/data/reads/${name}.e${i}.tmp_1.fastq"
  extra_reverse="$PJT/data/reads/${name}.e${i}.tmp_2.fastq"
  # Each run gets its own seed so the runs do not sample the same fragments.
  if ! wgsim -1151 -2151 -d300 -e"${i}" -N"${reads}" -R0 -X0 \
    -S"$((10#$sim_seed + run))" \
    "$reference_fasta" "$extra_forward" "$extra_reverse"; then
    printf '%serror: wgsim failed for error rate %s\n' "${normal}" "$i" >&2
    rm -f -- "$extra_forward" "$extra_reverse"
    exit 1
  fi
  cat -- "$extra_forward" >> "$forward_fastq"
  cat -- "$extra_reverse" >> "$reverse_fastq"
  rm -f -- "$extra_forward" "$extra_reverse"
done

echo "total reads simulated"
echo "(forward and reverse)"
# A fastq record is four lines, so the read count is the line count / 4.
total_lines=$(cat -- "$forward_fastq" "$reverse_fastq" | wc -l)
echo "$((total_lines / 4))"
echo "${purple}"
# Reset the color so the terminal is not left purple after the script ends.
echo "Finished at $(date)${normal}"