-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstart_annotation.sh
326 lines (292 loc) · 13.3 KB
/
start_annotation.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
#!/bin/bash
###############################################################################################################################################
### Annotation pipeline ###
### Author: Alejandra Hernandez-Segura ###
### Organization: Rijksinstituut voor Volksgezondheid en Milieu (RIVM) ###
### Department: Infektieziekteonderzoek, Diagnostiek en Laboratorium Surveillance (IDS), Bacteriologie (BPD) ###
### Date: 03-11-2020 ###
### ###
### Documentation: https://github.com/AleSR13/AMR_annotation.git ###
### ###
### ###
### Snakemake rules (in order of execution): ###
### 1 Circlator # correct the start of the chromosomes/plasmids if necessary ###
### 2 PGAP # Annotation using the NCBI tool, PGAP. ###
### 3 Prokka # Annotation using Prokka and the PLSD database ###
### ###
###############################################################################################################################################
# Abort on the first failing command (-e) and on use of an unset variable (-u).
set -e -u

# Pull in the helper functions and the pipeline configuration.
# While allexport (-a) is on, every variable/function defined here is exported
# automatically, so child processes can see them.
set -a
. bin/include/functions.sh
eval "$(parse_yaml config/config.yaml "configuration_")"
set +a

# Uncomment for a command trace while debugging:
#set -x

# Values produced by the helper scripts; both are written to config/variables.yaml later on.
UNIQUE_ID=$(bin/include/generate_id.sh)
SET_HOSTNAME=$(bin/include/gethostname.sh)

### Conda environment
PATH_MASTER_YAML="envs/master_env.yaml"
# The environment name is the second space-separated field on the first line of the yaml file.
MASTER_NAME=$(head -n 1 "${PATH_MASTER_YAML}" | cut -d ' ' -f 2)
### Default values for parameters
INPUT_DIR="raw_data"
OUTPUT_DIR="out"
GENUS="NotProvided"
SPECIES="NotProvided"
MAKE_METADATA="FALSE"
METADATA_FILE="X" # "X" is the sentinel meaning "no metadata file was given"
PROTEIN_DB="/mnt/db/amr_annotation_db/refseq_plasmids/db_wgenenames_refseq.fasta"
SNAKEMAKE_UNLOCK="FALSE"
CLEAN="FALSE"
HELP="FALSE"
SHEET_SUCCESS="FALSE"
# These two flags previously had no default. A value already present in the
# environment is kept so that existing callers behave as before.
SNAKEMAKE_HELP="${SNAKEMAKE_HELP:-FALSE}"
SKIP_CONFIRMATION="${SKIP_CONFIRMATION:-FALSE}"

#######################################
# Abort with a readable message when an option that needs a value is the last
# word on the command line.
# Arguments: $1 - the option as typed by the user
#            $2 - number of words still left to parse (option included)
# Outputs:   error message on stderr when the value is missing
# Returns:   0 when a value is present; exits the script with status 1 otherwise
#######################################
_annotation_require_value() {
  if (( $2 < 2 )); then
    echo "ERROR: option '$1' requires a value" >&2
    exit 1
  fi
}

#######################################
# Parse the command line. Options known to the pipeline set the corresponding
# global variable; every other word is collected, in order, in POSITIONAL so
# it can be handed to Snakemake.
# Globals:   INPUT_DIR, OUTPUT_DIR, GENUS, SPECIES, METADATA_FILE, PROTEIN_DB,
#            MAKE_METADATA, HELP, SNAKEMAKE_HELP, CLEAN, SKIP_CONFIRMATION,
#            SNAKEMAKE_UNLOCK, POSITIONAL (all written)
# Arguments: the script's command-line words
#######################################
_annotation_parse_args() {
  POSITIONAL=()
  while (( $# > 0 )); do
    case "$1" in
      -i|--input)
        _annotation_require_value "$1" "$#"
        INPUT_DIR="${2%/}" # drop a single trailing slash
        shift 2
        ;;
      -o|--output)
        _annotation_require_value "$1" "$#"
        OUTPUT_DIR="${2%/}" # drop a single trailing slash
        shift 2
        ;;
      --genus)
        _annotation_require_value "$1" "$#"
        GENUS="$2"
        shift 2
        ;;
      --species)
        _annotation_require_value "$1" "$#"
        SPECIES="$2"
        shift 2
        ;;
      --metadata)
        _annotation_require_value "$1" "$#"
        METADATA_FILE="$2"
        shift 2
        ;;
      --proteins)
        _annotation_require_value "$1" "$#"
        PROTEIN_DB="$2"
        shift 2
        ;;
      --make-metadata)
        MAKE_METADATA="TRUE"
        shift
        ;;
      -h|--help)
        HELP="TRUE"
        shift
        ;;
      -sh|--snakemake-help)
        SNAKEMAKE_HELP="TRUE"
        shift
        ;;
      --clean)
        CLEAN="TRUE"
        shift
        ;;
      -y)
        SKIP_CONFIRMATION="TRUE"
        shift
        ;;
      -u|--unlock)
        SNAKEMAKE_UNLOCK="TRUE"
        shift
        ;;
      *) # Anything else is kept for Snakemake
        POSITIONAL+=("$1")
        shift
        ;;
    esac
  done
}

### Parse the command-line arguments; those that are not part of the pipeline are sent to Snakemake
_annotation_parse_args "$@"

# Restore the positional parameters so that only the words meant for Snakemake
# remain in "$@". The length test avoids expanding an empty array (an error
# under `set -u` on older bash) and avoids leaving a phantom empty argument.
if (( ${#POSITIONAL[@]} > 0 )); then
  set -- "${POSITIONAL[@]}"
else
  set --
fi
### Show the pipeline's own help text and stop
case "${HELP:-}" in
  TRUE)
    line
    cat bin/include/help.txt
    exit 0
    ;;
esac

### Wipe previous output and stop
# OUTPUT_DIR is exported so that the Clean script, which runs in a child bash,
# can read it.
if [[ "${CLEAN:-}" == "TRUE" ]]; then
  export OUTPUT_DIR
  bash bin/include/Clean
  exit 0
fi
###############################################################################################################
##### Installation block #####
###############################################################################################################
### Pre-flight check: the master environment definition must exist before any conda work is attempted
if [ ! -e "${PATH_MASTER_YAML}" ]; then
  line
  spacer
  echo -e "ERROR: Missing file \"${PATH_MASTER_YAML}\""
  exit 1
fi

## Activate mamba
set +ue # Turn bash strict mode off because that breaks conda
conda env update -f envs/mamba.yaml
source activate mamba

# The master environment counts as active when its name occurs somewhere in PATH.
if [[ $PATH != *${MASTER_NAME}* ]]; then
  line
  spacer
  source activate "${MASTER_NAME}" # Try to activate this env
  if [ $? -ne 0 ]; then
    # Activation failed, i.e. the environment has presumably not been installed yet.
    # Ask for permission unless -y was given on the command line.
    if [ "${SKIP_CONFIRMATION:-FALSE}" != "TRUE" ]; then
      while true; do
        # A failing read means stdin is closed (e.g. a non-interactive run).
        # Stop here instead of silently continuing without the environment.
        if ! read -r -p "The master environment hasn't been installed yet, do you want to install this environment now? [y/n] " envanswer; then
          echo -e "\nERROR: No answer received and the master environment is not installed. Re-run with -y to install it without confirmation." >&2
          exit 1
        fi
        envanswer=${envanswer,,} # lower-case the answer
        if [[ "${envanswer}" =~ ^(yes|y)$ ]]; then
          break
        elif [[ "${envanswer}" =~ ^(no|n)$ ]]; then
          echo -e "The master environment is a requirement. Exiting because the AMR_annotation pipeline cannot continue without this environment"
          exit 1
        else
          echo -e "Please answer with 'yes' or 'no'"
        fi
      done
    fi

    echo -e "\tInstalling master environment..."
    mamba env update -f "${PATH_MASTER_YAML}"
    source activate "${MASTER_NAME}"
    # Strict mode is off here, so a failed install/activation must be caught by hand;
    # otherwise the success message below would be printed for a broken setup.
    if [ $? -ne 0 ]; then
      echo -e "ERROR: The master environment \"${MASTER_NAME}\" could not be installed or activated" >&2
      exit 1
    fi
    echo -e "DONE"
  fi
  echo -e "Succesfully activated master environment"
fi
set -ue # Turn bash strict mode on again
###############################################################################################################
##### Snakemake-only parameters #####
###############################################################################################################
### Unlock the working directory and stop
if [ "${SNAKEMAKE_UNLOCK}" == "TRUE" ]; then
  printf "\nUnlocking working directory...\n"
  # The config values are quoted so that a path or name containing spaces or
  # glob characters reaches snakemake as one word.
  # ${@} stays unquoted on purpose: an empty placeholder argument must vanish
  # instead of being handed to snakemake as an empty word.
  # shellcheck disable=SC2068
  snakemake \
    --config out="${OUTPUT_DIR}" genus="${GENUS}" species="${SPECIES}" protein_db="${PROTEIN_DB}" \
    --profile config --unlock ${@}
  printf "\nDone.\n"
  exit 0
fi

### Print Snakemake help
if [ "${SNAKEMAKE_HELP:-}" == "TRUE" ]; then
  line
  snakemake --help
  exit 0
fi
###############################################################################################################
##### Check input is correct #####
###############################################################################################################
# Every expansion below is quoted: unquoted, a path containing spaces breaks
# the `[` tests, and an empty value makes `[ -f ]` evaluate to true.

# The input directory must exist
if [ ! -d "${INPUT_DIR}" ]; then
  minispacer
  echo -e "The input directory specified (${INPUT_DIR}) does not exist"
  echo -e "Please specify an existing input directory"
  minispacer
  exit 1
fi

# Make metadata if asked. From here on METADATA_FILE points at ./metadata.csv;
# the existence check below reports the case where the file was not produced.
if [ "${MAKE_METADATA}" == "TRUE" ]; then
  echo -e "\n\nMaking metadata..."
  rm -f "metadata.csv"
  python bin/guess_species.py "${INPUT_DIR}"
  METADATA_FILE="./metadata.csv"
  echo -e "\n\nSuccessfully created metadata.csv file"
fi

# Check that the metadata file (given or generated) exists; "X" means none was requested
if [ "${METADATA_FILE}" != "X" ] && [ ! -f "${METADATA_FILE}" ]; then
  minispacer
  echo -e "The provided species file ${METADATA_FILE} does not exist. Please provide an existing file"
  echo -e "If you used the option --make-metadata, please check that all the fasta files contain the .fasta\nextension and that the file names have the right abbreviations for genus/species"
  minispacer
  exit 1
fi

# At least one source of genus information is required
if [ "${GENUS}" == "NotProvided" ] && [ "${MAKE_METADATA}" == "FALSE" ] && [ "${METADATA_FILE}" == "X" ]; then
  echo "ERROR! You need to provide either the --genus, a --metadata file or choose the --make-metadata option (if your files have the right abbreviations for it)."
  exit 1
fi

# The protein database must be an existing file ...
if [ ! -f "${PROTEIN_DB}" ]; then
  minispacer
  echo -e "The provided database file ${PROTEIN_DB} does not exist. Please provide an existing file"
  minispacer
  exit 1
fi
# ... whose name ends in "fasta" or "gbk"
if [[ ! "${PROTEIN_DB}" =~ (fasta|gbk)$ ]]; then
  minispacer
  echo -e "${PROTEIN_DB} file not accepted. Only .fasta or .gbk files are accepted in --proteins. Please provide a file with a supported format."
  minispacer
  exit 1
fi
### Generate sample sheet
# Count the .fasta files below the input directory. The pattern is quoted so
# that find receives it literally; unquoted, the shell would expand *.fasta
# against the current directory first and break (or change) the search.
fasta_count=$(find "${INPUT_DIR}" -type f -name '*.fasta' | wc -l)

if (( fasta_count > 0 )); then
  minispacer
  echo -e "Files in input directory (${INPUT_DIR}) are present"
  echo -e "Generating sample sheet..."
  # Add genus and species info to sample_sheet when a metadata file is available ("X" means none)
  if [ "${METADATA_FILE}" != "X" ]; then
    python bin/generate_sample_sheet.py "${INPUT_DIR}" --metadata "$(realpath "${METADATA_FILE}")" > sample_sheet.yaml
  else
    python bin/generate_sample_sheet.py "${INPUT_DIR}" > sample_sheet.yaml
  fi
  # The sheet only counts as generated when it holds more than two lines.
  sheet_lines=$(wc -l < sample_sheet.yaml)
  if (( sheet_lines > 2 )); then
    SHEET_SUCCESS="TRUE"
  fi
else
  minispacer
  echo -e "The input directory you specified (${INPUT_DIR}) exists but is empty or does not contain the expected input files...\nPlease specify a directory with input-data."
  minispacer
  exit 1
fi

### Check that the sample sheet was created successfully
if [ "${SHEET_SUCCESS}" == "TRUE" ]; then
  echo -e "Succesfully generated the sample sheet"
  echo -e "\nReady for start"
else
  echo -e "Couldn't find files in the input directory that ended up being in a .fasta format"
  echo -e "Please inspect the input directory (${INPUT_DIR}) and make sure the files are in .fasta format"
  exit 1
fi
###############################################################################################################
##### Run AMR_annotation pipeline #####
###############################################################################################################
### Actual snakemake command with checkers for required files. N.B. here the UNIQUE_ID and SET_HOSTNAME variables are set!
if [ -e sample_sheet.yaml ]; then
  echo -e "Starting snakemake"
  set +ue # turn off bash strict mode because snakemake and conda can't work with it properly

  # Write the per-run variables and load them into the shell with the "config_" prefix.
  echo -e "pipeline_run:\n identifier: ${UNIQUE_ID}" > config/variables.yaml
  echo -e "Server_host:\n hostname: http://${SET_HOSTNAME}" >> config/variables.yaml
  eval "$(parse_yaml config/variables.yaml "config_")"

  # Record of the call for this run.
  # NOTE(review): the recorded --drmaa string is shorter than the one actually
  # used below (no -o/-e/rusage); kept as it was - confirm whether anything reads it.
  echo -e "start_annotation call:\n" > config/amr_annotation_call.txt
  call_record="snakemake --config out=$OUTPUT_DIR genus=$GENUS species=$SPECIES protein_db=$PROTEIN_DB --profile config"
  call_record+=" --drmaa ' -q bio -n {threads} -R \'span[hosts=1]\'' --drmaa-log-dir ${OUTPUT_DIR}/log/drmaa $*"
  echo -e "${call_record}" >> config/amr_annotation_call.txt

  # Native specification passed to the cluster through --drmaa; the {…}
  # placeholders are left for snakemake to fill in per job.
  drmaa_log_dir="${OUTPUT_DIR}/log/drmaa"
  drmaa_spec=" -q bio -n {threads}"
  drmaa_spec+=" -o ${drmaa_log_dir}/{name}_{wildcards}_{jobid}.out"
  drmaa_spec+=" -e ${drmaa_log_dir}/{name}_{wildcards}_{jobid}.err"
  drmaa_spec+=" -R \"span[hosts=1] rusage[mem={resources.mem_mb}]\" "

  # ${@} stays unquoted on purpose: an empty placeholder argument must vanish
  # instead of being handed to snakemake as an empty word.
  # shellcheck disable=SC2068
  snakemake --profile config \
    --config out="${OUTPUT_DIR}" genus="${GENUS}" species="${SPECIES}" protein_db="${PROTEIN_DB}" \
    --drmaa "${drmaa_spec}" \
    --drmaa-log-dir "${drmaa_log_dir}" ${@}
  RESULT=$?

  # Report the outcome only after snakemake has finished, and only claim
  # completion when it exited successfully.
  if [ "${RESULT}" -eq 0 ]; then
    echo -e "AMR_annotation pipeline run complete"
  else
    echo -e "ERROR: snakemake exited with status ${RESULT}; the AMR_annotation pipeline run did not complete" >&2
  fi
  set -ue # turn bash strict mode back on
else
  echo -e "Sample_sheet.yaml could not be found"
  echo -e "This also means that the pipeline was unable to generate a new sample sheet for you"
  echo -e "Please inspect the input directory (${INPUT_DIR}) and make sure the right files are present"
  exit 1
fi

# Clean up for future runs
rm -f config/amr_annotation_call.txt
rm -f config/variables.yaml

# Propagate snakemake's exit status to the caller
exit "${RESULT}"