-
Notifications
You must be signed in to change notification settings - Fork 2
/
constax.sh
599 lines (561 loc) · 27.6 KB
/
constax.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
#!/bin/bash -login
# Default settings for the CONSTAX driver. Each variable below holds the value
# used when the matching command-line option is not supplied.
# NOTE(review): the one-line VERSION/BUILD/PREFIX assignment looks like a
# packaging placeholder (PREFIX=placehold) - confirm nothing rewrites this line
# before reformatting it.
VERSION=2.0.16; BUILD=0; PREFIX=placehold
# Lower-case true/false switches are later executed as commands (`if $TRAIN`),
# so they must only ever hold the words true or false.
TRAIN=false
BLAST=false
HELP=false
SHOW_VERSION=false
# "null" is the sentinel for "no keyword given"; it is compared as a string.
KEYWORD=null
MSU_HPCC=false
# Capitalised True/False values are forwarded verbatim to CombineTaxonomy.py
# (-s and -n). No option handled in this script changes CONSISTENT.
CONSERVATIVE=False
CONSISTENT=False
# Classifier and BLAST tuning values.
CONF=0.8
NTHREADS=1
MAX_HITS=10
EVALUE=1
P_IDEN=0.8
# Input FASTA plus the output and taxonomy-assignment directories.
INPUT=otus.fasta
OUTPUT=./outputs
TAX=./taxonomy_assignments
# "false" means the user did not supply this path on the command line.
SINTAXPATH_USER=false
UTAXPATH_USER=false
RDPPATH_USER=false
CONSTAXPATH_USER=false
MAKE_PLOT=false
CHECK=false
COMBINE_ONLY=false
PATHFILE=pathfile.txt
# Passed to the RDP classifier as -Xmx<MEM>m (megabytes).
MEM=32000
# Optional isolate / high-level databases; used only when they name an
# existing, non-empty file.
ISOLATES=null
HL_DB=null
# Forwarded to CombineTaxonomy.py as --hl; overwritten with the detected
# format when a high-level database is used.
HL_FMT=null
HL_QC=75
HL_ID=1
USE_ISOS=False # Used as python bool
ISO_QC=75
ISO_ID=1
# NOTE(review): DB and TFILES get no default here; they are only assigned by
# -d/--db and -f/--trainfile (TFILES later falls back to "training_files").
# Startup banner: release identifiers, license, project links and the citation
# users are asked to include. One printf call emits one line per argument.
printf '%s\n' \
  "Welcome to CONSTAX version $VERSION build $BUILD - The CONSensus TAXonomy classifier" \
  "This software is distributed under MIT License" \
  "© Copyright 2021, Julian A. Liber, Gian M. N. Benucci & Gregory M. Bonito" \
  "https://github.com/liberjul/CONSTAXv2" \
  "https://constax.readthedocs.io/" \
  "" \
  "Please cite us as:" \
  "CONSTAX2: Improved taxonomic classification of environmental DNA markers" \
  "Julian Aaron Liber, Gregory Bonito, Gian Maria Niccolò Benucci" \
  "Bioinformatics, Volume 37, Issue 21, 1 November 2021, Pages 3941–3943; doi: https://doi.org/10.1093/bioinformatics/btab347"
### Parse variable inputs
# Normalise the command line with getopt so that the option loop below sees
# one option (and its value) per position, terminated by "--".
# The long-option list now declares combine_only: the option is documented in
# the usage text and handled by the option loop, but getopt previously rejected
# it as unrecognised because it was missing from this list.
# On a parse failure the usage summary is printed and the script exits 1.
if ! TEMP=$(getopt -o c:n:m:e:p:d:i:o:x:tbhvf: --long conf:,num_threads:,max_hits:,evalue:,p_iden:,db:,input:,output:,tax:,train,blast,select_by_keyword:,msu_hpcc,help,version,conservative,make_plot,check,trainfile:,mem:,sintax_path:,utax_path:,rdp_path:,constax_path:,pathfile:,isolates:,isolates_query_coverage:,isolates_percent_identity:,high_level_db:,high_level_query_coverage:,high_level_percent_identity:,combine_only \
  -n 'constax' -- "$@")
then
  echo "Terminating..." >&2
  echo ""
  echo "Usage: constax [OPTION] ..."
  echo "Classify input OTU sequences by CONSTAX consensus taxonomy algorithm"
  echo "Example constax -t --db sh_general_release_fungi_35077_RepS_04.02.2020.fasta"
  echo ""
  echo "-c, --conf=0.8 Classification confidence threshold"
  echo "-n, --num_threads=1 Number of threads to use"
  echo "-m, --max_hits=10 Maximum number of BLAST hits to use, for use with -b option"
  echo "-e, --evalue=1 Maximum expect value of BLAST hits to use, for use with -b option"
  echo "-p, --p_iden=0.8 Minimum proportion identity of BLAST hits to use, for use with -b option"
  echo "-d, --db Database to train classifiers, in FASTA format"
  echo "-f, --trainfile=./training_files Path to which training files will be written"
  echo "-i, --input=otus.fasta Input file in FASTA format containing sequence records to classify"
  echo "-o, --output=./outputs Output directory for classifications"
  echo "-x, --tax=./taxonomy_assignments Directory for taxonomy assignments"
  echo "-t, --train Complete training if specified"
  echo "-b, --blast Use BLAST instead of UTAX if specified"
  echo "--select_by_keyword Takes a keyword argument and --input FASTA file to produce a filtered database with headers containing the keyword with name --output"
  echo "--msu_hpcc If specified, use executable paths on Michigan State University HPCC. Overrides other path arguments"
  echo "--conservative If specified, use conservative consensus rule (2 null = null winner)"
  echo "--make_plot If specified, run R script to make plot of classified taxa"
  echo "--check If specified, runs checks but stops before training or classifying"
  echo "--mem=32000 Memory available to use for RDP, in MB. 32000MB recommended for UNITE, 128000MB for SILVA"
  echo "--sintax_path Path to USEARCH/VSEARCH executable for SINTAX classification"
  echo "--utax_path Path to USEARCH executable for UTAX classification"
  echo "--rdp_path Path to RDP classifier.jar file"
  echo "--constax_path Path to CONSTAX scripts"
  echo "--pathfile File with paths to SINTAX, UTAX, RDP, and CONSTAX executables"
  echo "--isolates FASTA formatted file of isolates to use BLAST against"
  echo "--isolates_query_coverage=75 Threshold of sequence query coverage to report isolate matches"
  echo "--isolates_percent_identity=1 Threshold of aligned sequence percent identity to report isolate matches"
  echo "--high_level_db FASTA database file of representative sequences for assignment of high level taxonomy"
  echo "--high_level_query_coverage=75 Threshold of sequence query coverage to report high-level taxonomy matches"
  echo "--high_level_percent_identity=1 Threshold of aligned sequence percent identity to report high-level taxonomy matches"
  echo "--combine_only Only combine taxonomy without rerunning classifiers"
  echo "-h, --help Display this help and exit"
  echo "-v, --version Display version and exit"
  exit 1
fi
# Replace the positional parameters with getopt's normalised list, then walk
# it: value-taking options consume two positions, switches consume one, and
# "--" (or anything unrecognised) ends the loop.
eval set -- "$TEMP"
while :
do
  case "$1" in
    # --- switches (no value) ---
    -t | --train )
      TRAIN=true; shift ;;
    -b | --blast )
      BLAST=true; shift ;;
    -h | --help )
      HELP=true; shift ;;
    -v | --version )
      SHOW_VERSION=true; shift ;;
    --msu_hpcc )
      MSU_HPCC=true; shift ;;
    --conservative )
      CONSERVATIVE=True; shift ;;
    --make_plot )
      MAKE_PLOT=true; shift ;;
    --check )
      CHECK=true; shift ;;
    --combine_only )
      COMBINE_ONLY=true; shift ;;

    # --- numeric / tuning values ---
    -c | --conf )
      CONF=$2; shift 2 ;;
    -p | --p_iden )
      P_IDEN=$2; shift 2 ;;
    -n | --num_threads )
      NTHREADS=$2; shift 2 ;;
    -m | --max_hits )
      MAX_HITS=$2; shift 2 ;;
    -e | --evalue )
      EVALUE=$2; shift 2 ;;
    --mem )
      MEM=$2; shift 2 ;;
    --isolates_query_coverage )
      ISO_QC=$2; shift 2 ;;
    --isolates_percent_identity )
      ISO_ID=$2; shift 2 ;;
    --high_level_query_coverage )
      HL_QC=$2; shift 2 ;;
    --high_level_percent_identity )
      HL_ID=$2; shift 2 ;;

    # --- files and keyword ---
    -d | --db )
      DB=$2; shift 2 ;;
    -i | --input )
      INPUT=$2; shift 2 ;;
    --pathfile )
      PATHFILE=$2; shift 2 ;;
    --isolates )
      ISOLATES=$2; shift 2 ;;
    --high_level_db )
      HL_DB=$2; shift 2 ;;
    --select_by_keyword )
      KEYWORD=$2; shift 2 ;;

    # --- directories: one trailing slash is stripped ---
    -o | --output )
      OUTPUT=${2%/}; shift 2 ;;
    -x | --tax )
      TAX=${2%/}; shift 2 ;;
    -f | --trainfile )
      TFILES=${2%/}; shift 2 ;;
    --constax_path )
      CONSTAXPATH_USER=${2%/}; shift 2 ;;

    # --- executables ---
    --rdp_path )
      RDPPATH_USER=$2; shift 2 ;;
    --sintax_path )
      SINTAXPATH_USER=$2; shift 2 ;;
    --utax_path )
      UTAXPATH_USER=$2; shift 2 ;;

    # --- end of options ---
    -- )
      shift; break ;;
    * )
      break ;;
  esac
done
# -h/--help: print the usage summary from a literal here-document (the quoted
# delimiter disables all expansion) and exit with status 1.
if $HELP; then
  cat <<'USAGE'
Usage: constax [OPTION] ...
Classify input OTU sequences by CONSTAX consensus taxonomy algorithm
Example constax -t --db sh_general_release_fungi_35077_RepS_04.02.2020.fasta

-c, --conf=0.8 Classification confidence threshold
-n, --num_threads=1 Number of threads to use
-m, --max_hits=10 Maximum number of BLAST hits to use, for use with -b option
-e, --evalue=1 Maximum expect value of BLAST hits to use, for use with -b option
-p, --p_iden=0.8 Minimum proportion identity of BLAST hits to use, for use with -b option
-d, --db Database to train classifiers
-f, --trainfile=./training_files Path to which training files will be written
-i, --input=otus.fasta Input file in FASTA format containing sequence records to classify
-o, --output=./outputs Output directory for classifications
-x, --tax=./taxonomy_assignments Directory for taxonomy assignments
-t, --train Complete training if specified
-b, --blast Use BLAST instead of UTAX if specified
--select_by_keyword Takes a keyword argument and --input FASTA file to produce a filtered database with headers containing the keyword with name --output
--msu_hpcc If specified, use executable paths on Michigan State University HPCC. Overrides other path arguments
--conservative If specified, use conservative consensus rule (2 null = null winner)
--make_plot If specified, run R script to make plot of classified taxa
--check If specified, runs checks but stops before training or classifying
--mem Memory available to use for RDP, in MB. 32000MB recommended for UNITE, 128000MB for SILVA
--sintax_path Path to USEARCH/VSEARCH executable for SINTAX classification
--utax_path Path to USEARCH executable for UTAX classification
--rdp_path Path to RDP classifier.jar file
--constax_path Path to CONSTAX scripts
--pathfile File with paths to SINTAX, UTAX, RDP, and CONSTAX executables
--isolates FASTA formatted file of isolates to use BLAST against
--isolates_query_coverage=75 Threshold of sequence query coverage to report isolate matches
--isolates_percent_identity=1 Threshold of aligned sequence percent identity to report isolate matches
--high_level_db FASTA database file of representative sequences for assignment of high level taxonomy
--high_level_query_coverage=75 Threshold of sequence query coverage to report high-level taxonomy matches
--high_level_percent_identity=1 Threshold of aligned sequence percent identity to report high-level taxonomy matches
--combine_only Only combine taxonomy without rerunning classifiers
-h, --help Display this help and exit
-v, --version Display version and exit
USAGE
  exit 1
fi
# -v/--version: report the release identifiers, then stop with status 1.
if $SHOW_VERSION; then
  printf 'CONSTAX version %s build %s\n' "$VERSION" "$BUILD"
  exit 1
fi
# Check Python version: stop with status 2 when the `python` on PATH reports a
# 2.x release. The version banner is captured in a variable instead of being
# written to ver_python.txt in the working directory, and the reason for the
# exit is now reported (previously the script stopped silently).
python_version_banner=$(python -V 2>&1)
if [[ $python_version_banner == *"Python 2"* ]]; then
  echo "CONSTAX cannot run with Python 2; 'python' reports: $python_version_banner" >&2
  exit 2
fi
# Validate the numeric options before any work is done; the first failing
# check prints its hint and exits 1.
#
# Changes from the earlier checks:
#  * -c/--conf and -p/--p_iden are now really required to be plain decimals in
#    [0, 1]. The earlier `=~` test used a quoted pattern (matched literally)
#    and was not negated, so words such as "abc" passed.
#  * -m/--max_hits must be a positive integer; only the exact value 0 used to
#    be rejected.
#  * The range test is done in pure bash, so a missing `bc` can no longer
#    disable it.
#  * The high-level messages now state the bound that is actually enforced
#    (<= 100) and the thread message names the real option (--num_threads).

# Succeeds when $1 is an unsigned decimal integer >= $2 and, if $3 is given,
# <= $3. The 10# prefix keeps values with leading zeros from being read as octal.
_constax_int_between() {
  [[ $1 =~ ^[0-9]+$ ]] || return 1
  (( 10#$1 >= $2 )) || return 1
  [[ -z ${3:-} ]] || (( 10#$1 <= $3 ))
}

# Succeeds when $1 is an unsigned decimal number (N, N.N or .N) within [0, 1].
_constax_unit_float() {
  local unit_re='^([0-9]+([.][0-9]+)?|[.][0-9]+)$'
  [[ $1 =~ $unit_re ]] || return 1
  local whole=${1%%.*} frac=
  if [[ $1 == *.* ]]; then frac=${1#*.}; fi
  # Integer part of zero (or absent): any fraction keeps the value below 1.
  if [[ $whole =~ ^0*$ ]]; then return 0; fi
  # Otherwise only exactly 1 (1, 1.0, 1.00, ...) is still within range.
  [[ $whole =~ ^0*1$ && $frac =~ ^0*$ ]]
}

if ! _constax_int_between "$MAX_HITS" 1
then
  echo "Set -m/--max_hits to an integer greater than zero."
  exit 1
elif ! _constax_unit_float "$CONF"
then
  echo "Set -c/--conf to a float between 0 and 1"
  exit 1
elif ! _constax_int_between "$NTHREADS" 1
then
  echo "Set -n/--num_threads to an integer greater than 0"
  exit 1
elif ! _constax_int_between "$ISO_QC" 1 100
then
  echo "Set --isolates_query_coverage to an integer greater than 0 and less than or equal to 100"
  exit 1
elif ! _constax_int_between "$ISO_ID" 1 100
then
  echo "Set --isolates_percent_identity to an integer greater than 0 and less than or equal to 100"
  exit 1
elif ! _constax_int_between "$HL_QC" 1 100
then
  echo "Set --high_level_query_coverage to an integer greater than 0 and less than or equal to 100"
  exit 1
elif ! _constax_int_between "$HL_ID" 1 100
then
  echo "Set --high_level_percent_identity to an integer greater than 0 and less than or equal to 100"
  exit 1
elif ! _constax_unit_float "$P_IDEN"
then
  echo "Set -p/--p_iden to a float between 0 and 1"
  exit 1
fi
# Guard clauses for the --input file: it must be a regular file, must not be
# empty, and its name must end in .fasta, .fa or .fna. Each failure prints its
# reason and exits 1.
if [ ! -f "$INPUT" ]; then
  echo "Input file $INPUT does not exist, exiting..."
  exit 1
fi
if [ ! -s "$INPUT" ]; then
  echo "Input file $INPUT is empty, exiting..."
  exit 1
fi
case "$INPUT" in
  *.fasta | *.fa | *.fna)
    ;;
  *)
    echo "Input file $INPUT must end with .fasta, .fa, or .fna; exiting..."
    exit 1
    ;;
esac
# Preflight for classification runs (skipped entirely when a keyword was given
# with --select_by_keyword): validate the database file, prepare the output,
# taxonomy and training directories, and - when not training - confirm that
# usable training files already exist.
#
# Changes from the earlier version:
#  * directory and file paths are quoted everywhere, so paths containing
#    spaces or glob characters work;
#  * directories are created with `mkdir -p` and a failure stops the run
#    instead of surfacing later as unrelated errors;
#  * a missing training_check.txt no longer prints a grep error (-s);
#  * the BLAST version comparison only runs for BLAST classifications;
#  * the SINTAX comparison is skipped while SINTAXPATH is still empty.
#    NOTE(review): this script only assigns SINTAXPATH further down (pathfile
#    / --sintax_path / --msu_hpcc), so at this point it is set only if it came
#    from the environment; comparing an empty name always produced a warning.
if [[ "$KEYWORD" == "null" ]]
then
  if ! [ -s "$DB" ]
  then
    echo "Database file $DB is non-existent or empty, exiting..."
    exit 1
  elif [[ "$DB" != *.fasta ]] && [[ "$DB" != *.fa ]] && [[ "$DB" != *.fna ]]
  then
    echo "Database file $DB must end with .fasta, .fa, or .fna; exiting..."
    exit 1
  fi

  # Tell the user when existing results are about to be replaced.
  if [ -d "$OUTPUT" ] && [ -n "$(ls -A -- "$OUTPUT")" ]
  then
    echo "Overwritting previous classification..."
  fi
  if [ -d "$TAX" ] && [ -n "$(ls -A -- "$TAX")" ]
  then
    echo "Overwritting previous taxonomy assignments..."
  fi

  if ! [ -d "$OUTPUT" ] # Output directory doesn't exist
  then
    mkdir -p -- "$OUTPUT" || { echo "Cannot create output directory $OUTPUT, exiting..." >&2; exit 1; }
  fi
  if ! [ -d "$TAX" ] # Taxonomic assignments directory does not exist
  then
    mkdir -p -- "$TAX" || { echo "Cannot create taxonomy directory $TAX, exiting..." >&2; exit 1; }
  fi

  # Both the training and the non-training path fall back to the same default.
  if [ -z "$TFILES" ]
  then
    TFILES="training_files"
  fi

  if $TRAIN
  then
    if ! [ -d "$TFILES" ] # training is true and path does not exist
    then
      mkdir -p -- "$TFILES" || { echo "Cannot create training directory $TFILES, exiting..." >&2; exit 1; }
    fi
    if [ -z "$(ls -A -- "$TFILES")" ] # training true and training file path empty
    then
      echo "Training, with output to $TFILES..."
    else # training true and trainfile path is not empty
      echo "Performing training and overwritting training files..."
    fi
  else # Training not true
    training_check="${TFILES}/training_check.txt"
    # A previous training run records one line per mode; require an exact line
    # for the mode requested now.
    if grep -Fxqs "Classifier training complete using BLAST: $BLAST" "$training_check"
    then
      echo "Classifying without training..."
      if $BLAST && [[ "$(blastn -version | grep -o "blastn: 2[.].*" | head -n1 | cut -d' ' -f2)" != "$(grep -o 'BLAST version 2[.].*' "$training_check" | tail -n1 | cut -d' ' -f3)" ]]
      then
        echo "BLAST executable version does not match the version used to generate the training files, "
        echo "if BLAST Database error occurs, change your executable or use -t flag."
      elif [ -n "$SINTAXPATH" ] && ! grep -Fxq "SINTAX executable ${SINTAXPATH##*/}" "$training_check"
      then
        echo "SINTAX executable does not match the executable used to generate the training files, "
        echo "if SINTAX error occurs, change your executable or use -t flag."
      fi
    else
      echo "Cannot classify without existing training files, please specify -t"
      exit 1
    fi
  fi
fi
# Load tool locations from a pathfile. Search order: the --pathfile value
# (default pathfile.txt), then ./pathfile.txt, then three locations under the
# conda prefix reported by the first line of `conda list`. Every conda
# candidate that exists is sourced, so a later one overrides an earlier one,
# and PATHFILE ends up holding the last candidate tried.
#
# Changes from the earlier version: sourced paths are quoted, and a name
# without a slash is sourced as ./name - the `source` builtin searches PATH
# before the current directory for slash-less names, so a different
# pathfile.txt could otherwise be picked up.
# NOTE(review): BUILD_STRING is not assigned anywhere in this script;
# presumably it comes from the environment or an earlier-sourced pathfile.
if [ -f "$PATHFILE" ] # First try user-supplied pathfile
then
  echo "Using the user-supplied pathfile at $PATHFILE"
  case "$PATHFILE" in
    */*) source "$PATHFILE" ;;
    *) source "./$PATHFILE" ;;
  esac
elif [ -f "pathfile.txt" ] # Next try in local directory
then
  echo "Using local pathfile.txt"
  source ./pathfile.txt
else # Then try in package directory.
  echo "Pathfile input not found in local directory ..."
  # `conda list` starts with "# packages in environment at <prefix>:"; keep
  # the last space-separated field and drop the trailing colon.
  DIR=$(conda list | head -n 1 | rev | cut -d' ' -f1 | rev | cut -d: -f1)
  for PATHFILE in \
    "$DIR/pkgs/constax-$VERSION-$BUILD/opt/constax-$VERSION/pathfile.txt" \
    "$DIR/pkgs/constax-$VERSION-$BUILD_STRING/opt/constax-$VERSION/pathfile.txt" \
    "$DIR/opt/constax-$VERSION/pathfile.txt"
  do
    if [ -f "$PATHFILE" ]
    then
      source "$PATHFILE"
      echo "Pathfile input found at $PATHFILE ..."
    else
      echo "Pathfile input not found at $PATHFILE ..."
    fi
  done
fi
# Check for user input paths
# Command-line paths override whatever the pathfile provided. "false" is the
# "not supplied" default and is tested first, because `command -v false`
# itself succeeds (false is a shell builtin).
#
# Changes from the earlier version: `command -v` is tested by exit status
# rather than through an unquoted `[ $(...) ]` (which errors on paths that
# contain spaces), a supplied path that cannot be used is reported on stderr
# instead of being ignored silently, and --constax_path is no longer compared
# against a directory literally named "false".
if [[ "$SINTAXPATH_USER" != false ]]
then
  if command -v "$SINTAXPATH_USER" > /dev/null 2>&1
  then
    SINTAXPATH="$SINTAXPATH_USER"
  else
    echo "Warning: --sintax_path '$SINTAXPATH_USER' is not an executable, ignoring it" >&2
  fi
fi
if [[ "$UTAXPATH_USER" != false ]]
then
  if command -v "$UTAXPATH_USER" > /dev/null 2>&1
  then
    UTAXPATH="$UTAXPATH_USER"
  else
    echo "Warning: --utax_path '$UTAXPATH_USER' is not an executable, ignoring it" >&2
  fi
fi
# The RDP path is accepted as given: it may be a .jar that is run through
# java rather than an executable, so it is checked later.
if [[ "$RDPPATH_USER" != false ]]
then
  RDPPATH="$RDPPATH_USER"
fi
if [[ "$CONSTAXPATH_USER" != false ]]
then
  if [ -d "$CONSTAXPATH_USER" ]
  then
    CONSTAXPATH="$CONSTAXPATH_USER"
  else
    echo "Warning: --constax_path '$CONSTAXPATH_USER' is not a directory, ignoring it" >&2
  fi
fi
# Resolve and verify the external tools.
#  * --msu_hpcc: use the fixed MSU HPCC locations without further checks.
#  * otherwise SINTAX, RDP and the CONSTAX script directory are always
#    required, plus blastn (with -b) or UTAX (without -b).
# On failure each missing piece is named and the script exits 1.
#
# Changes from the earlier version:
#  * `[ [ ... ] || [ ... ] ]` is not grouping in shell; the old chain could
#    report "All needed executables exist." while blastn or SINTAX was
#    missing. The RDP alternative is now a real grouped test.
#  * RDP counts as usable when it is itself a command, or when java is
#    available and the path is an existing file (the same split the training
#    and classification steps use to choose between a direct call and
#    `java -jar`). The jar is no longer launched just to probe it.
#  * "CONSTAX directory not found" was printed when the directory DID exist;
#    the test is now negated.

# Succeeds when $1 resolves to a runnable command.
_constax_have() {
  command -v "$1" > /dev/null 2>&1
}

# Succeeds when RDPPATH can be run directly or through `java -jar`.
_constax_rdp_usable() {
  _constax_have "$RDPPATH" || { _constax_have java && [ -f "$RDPPATH" ]; }
}

if $MSU_HPCC
then
  echo "Using paths for the MSU HPCC ..."
  SINTAXPATH=/mnt/research/rdp/public/thirdParty/usearch10.0.240_i86linux64
  UTAXPATH=/mnt/research/rdp/public/thirdParty/usearch8.1.1831_i86linux64
  RDPPATH=/mnt/research/rdp/public/RDPTools/classifier.jar
  CONSTAXPATH=/mnt/ufs18/rs-022/bonito_lab/CONSTAX_May2020
elif $BLAST && _constax_have blastn && _constax_have "$SINTAXPATH" && _constax_rdp_usable && [ -d "$CONSTAXPATH" ]
then
  echo "All needed executables exist."
  echo "SINTAX: $SINTAXPATH"
  echo "RDP: $RDPPATH"
  echo "CONSTAX: $CONSTAXPATH"
elif ! $BLAST && _constax_have "$SINTAXPATH" && _constax_rdp_usable && [ -d "$CONSTAXPATH" ] && _constax_have "$UTAXPATH"
then
  echo "All needed executables exist."
  echo "SINTAX: $SINTAXPATH"
  echo "RDP: $RDPPATH"
  echo "UTAX: $UTAXPATH"
  echo "CONSTAX: $CONSTAXPATH"
else
  echo "Please specify --msu_hpcc if using it, otherwise specify paths for --sintax_path,"
  echo "--rdp_path, --utax_path (if not using BLAST), and --constax_path"
  echo "SINTAX: $SINTAXPATH"
  if ! _constax_have "$SINTAXPATH" ; then echo "SINTAX not executable" ; fi
  echo "RDP: $RDPPATH"
  if ! _constax_rdp_usable ; then echo "RDP not executable alone or by java -jar" ; fi
  echo "UTAX: $UTAXPATH"
  if ! $BLAST && ! _constax_have "$UTAXPATH" ; then echo "UTAX not executable. Did you mean to use -b/--blast flag?" ; fi
  if $BLAST && ! _constax_have blastn ; then echo "BLAST not executable" ; fi
  echo "CONSTAX: $CONSTAXPATH"
  if ! [ -d "$CONSTAXPATH" ] ; then echo "CONSTAX directory not found" ; fi
  exit 1
fi
# UTAX is only run for non-BLAST classifications; refuse a USEARCH executable
# whose file name advertises a major version above 9.
#
# The version is taken from the last "usearch<digits>" in the path. When the
# path carries no such marker the check is skipped; previously sed returned
# the whole path and `[ ... -gt 9 ]` failed with "integer expression expected".

# Prints the major version found after the last "usearch" in $1; fails (and
# prints nothing) when there is none.
_constax_usearch_major() {
  local marker_re='.*usearch([0-9]+)'
  [[ $1 =~ $marker_re ]] && printf '%s\n' "${BASH_REMATCH[1]}"
}

if ! $BLAST && utax_major=$(_constax_usearch_major "$UTAXPATH") && (( 10#$utax_major > 9 ))
then
  echo "USEARCH executable must be version 9.X or lower to use UTAX"
  exit 1
fi
# --select_by_keyword mode: filter the --input FASTA by keyword into --output
# and stop; no classification is performed.
# The keyword is now passed as a single quoted argument (a keyword containing
# spaces used to be split into several arguments), and the success message is
# only printed when the filter script succeeded.
# NOTE(review): this mode exits with status 1 even on success; kept as-is
# because callers may depend on it - confirm whether 0 is intended.
if [[ "$KEYWORD" != "null" ]]
then
  if python "$CONSTAXPATH"/fasta_select_by_keyword.py -i "$INPUT" -o "$OUTPUT" -k "$KEYWORD"
  then
    echo "Filtered file output to $OUTPUT"
  else
    echo "Keyword filtering of $INPUT failed" >&2
  fi
  exit 1
fi
# Derive the database base name (file name without directory or final
# extension) and ask detect_format.py for the database format. Stops on an
# INVALID or null result, and stops successfully after all checks when
# --check was given.
# The expansion handed to basename is now quoted so database paths containing
# spaces work, and the "null" exit now says why it stops (it was silent).
base=$(basename -- "${DB%.*}")
# Both stdout and stderr of the detector are captured into FORMAT.
FORMAT=$(python "$CONSTAXPATH"/detect_format.py -d "$DB" 2>&1)
if [[ $FORMAT == "INVALID" ]]
then
  echo "Database file $DB must be in UNITE or SILVA format, exiting..."
  exit 1
fi
echo "Memory size: ${MEM}mb"
if [[ "$FORMAT" == "null" ]]
then
  echo "Format detection for $DB returned null, exiting..." >&2
  exit 1
fi
if $CHECK
then
  echo "All checks passed, rerun without --check flag."
  exit 0
fi
# Train (with -t) and run the classifiers, unless --combine_only was given.
# Steps: optional training of SINTAX, BLAST or UTAX, and RDP; classification
# of the input with the same three tools; optional BLAST comparison against an
# isolates file and a high-level taxonomy database.
#
# Changes from the earlier version (command order and options are unchanged):
#  * every path and value expansion is quoted, so training, taxonomy and input
#    paths containing spaces or glob characters work;
#  * "is RDPPATH a command" is tested by exit status instead of an unquoted
#    `[ $(command -v ...) ]`;
#  * the per-chunk BLAST loop skips an unmatched glob instead of handing the
#    literal pattern to blastn and rm;
#  * the isolate / high-level BLAST database prefixes are computed once.
if ! $COMBINE_ONLY
then
  if $TRAIN
  then
    # Reformat the reference database into the per-classifier training files.
    python "$CONSTAXPATH"/FormatRefDB.py -d "$DB" -t "$TFILES" -f "$FORMAT" -p "$CONSTAXPATH"
    echo "__________________________________________________________________________"
    echo "Training SINTAX Classifier"
    # A path naming usearch<N> with N < 11 uses -makeudb_sintax; any other
    # path (including ones without a usearch<N> marker, where the numeric
    # test fails quietly) uses -makeudb_usearch.
    if [ "$(echo "$SINTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/')" -lt 11 ] 2> /dev/null
    then
      "$SINTAXPATH" -makeudb_sintax "${TFILES}/${base}"__UTAX.fasta -output "${TFILES}/sintax.db"
    else
      "$SINTAXPATH" -makeudb_usearch "${TFILES}/${base}"__UTAX.fasta -output "${TFILES}/sintax.db"
    fi
    if $BLAST
    then
      echo "__________________________________________________________________________"
      echo "Training BLAST Classifier"
      if $MSU_HPCC
      then
        module load BLAST
      fi
      makeblastdb -in "${TFILES}/${base}"__RDP_trained.fasta -dbtype nucl -out "${TFILES}/${base}"__BLAST
    else
      echo "__________________________________________________________________________"
      echo "Training UTAX Classifier"
      "$UTAXPATH" -utax_train "${TFILES}/${base}"__UTAX.fasta -report "${TFILES}/utax_db_report.txt" -taxconfsout "${TFILES}/utax.tc" \
        -utax_splitlevels NVpcofgs -utax_trainlevels kpcofgs -log "${TFILES}/utax_train.log" -report "${TFILES}/utax_report.txt"
      "$UTAXPATH" -makeudb_utax "${TFILES}/${base}"__UTAX.fasta -taxconfsin "${TFILES}/utax.tc" -output "${TFILES}/utax.db" \
        -log "${TFILES}/make_udb.log" -report "${TFILES}/utax_report.txt"
    fi
    echo "__________________________________________________________________________"
    echo "Training RDP Classifier"
    # RDP is either run directly (RDPPATH is a command) or through java -jar;
    # -Xmx sets the memory in MB to use.
    if command -v "$RDPPATH" > /dev/null 2>&1
    then
      "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt -Xmx"$MEM"m > rdp_train.out 2>&1
    else
      java -Xmx"$MEM"m -jar "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt > rdp_train.out 2>&1
    fi
    cat rdp_train.out
    if grep -Fq "duplicate taxon name" rdp_train.out
    then
      # Re-run the formatter with --dup and train RDP once more.
      echo "RDP training error, redoing with duplicate taxa"
      python "$CONSTAXPATH"/FormatRefDB.py -d "$DB" -t "$TFILES" -f "$FORMAT" -p "$CONSTAXPATH" --dup
      if command -v "$RDPPATH" > /dev/null 2>&1
      then
        "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt -Xmx"$MEM"m > rdp_train.out 2>&1
      else
        java -Xmx"$MEM"m -jar "$RDPPATH" train -o "${TFILES}/." -s "${TFILES}/${base}"__RDP_trained.fasta -t "${TFILES}/${base}"__RDP_taxonomy_trained.txt > rdp_train.out 2>&1
      fi
      # Any output at all from the second attempt is treated as a failure.
      if [ -s rdp_train.out ]
      then
        cat rdp_train.out
        exit 1
      else
        echo "RDP training error overcome, continuing with classification after SINTAX is retrained"
        if [ "$(echo "$SINTAXPATH" | sed -e 's/.*usearch\([0-9]*\).*/\1/')" -lt 11 ] 2> /dev/null
        then
          "$SINTAXPATH" -makeudb_sintax "${TFILES}/${base}"__UTAX.fasta -output "${TFILES}/sintax.db"
        else
          "$SINTAXPATH" -makeudb_usearch "${TFILES}/${base}"__UTAX.fasta -output "${TFILES}/sintax.db"
        fi
      fi
      if [ -f rdp_train.out ]
      then
        rm rdp_train.out
      fi
    fi
    # The rRNAClassifier.properties file should be in the CONSTAX directory or
    # in a samplefiles directory derived from the RDP path.
    if [ -f "$CONSTAXPATH"/rRNAClassifier.properties ]
    then
      cp "$CONSTAXPATH"/rRNAClassifier.properties "${TFILES}"/
    elif [ -f "${RDPPATH%dist/classifier.jar}"/samplefiles/rRNAClassifier.properties ]
    then
      cp "${RDPPATH%dist/classifier.jar}"/samplefiles/rRNAClassifier.properties "${TFILES}"/
    elif [ -f "${RDPPATH%.jar}"/samplefiles/rRNAClassifier.properties ]
    then
      cp "${RDPPATH%.jar}"/samplefiles/rRNAClassifier.properties "${TFILES}"/
    else
      echo "Cannot locate rRNAClassifier.properties file, please place in $CONSTAXPATH or RDPTools/classifier/samplefiles"
    fi
    # Record how training was done; a later run without -t reads these lines.
    echo "Classifier training complete using BLAST: $BLAST" >> "${TFILES}"/training_check.txt
    if $BLAST; then echo "BLAST version $(blastn -version | grep -o "blastn: 2[.].*" | head -n1 | cut -d' ' -f2)" >> "${TFILES}"/training_check.txt; fi
    echo "SINTAX executable ${SINTAXPATH##*/}" >> "${TFILES}"/training_check.txt
  fi
  echo "__________________________________________________________________________"
  echo "Assigning taxonomy to OTU's representative sequences"
  # check_input_names.py prints the path of the FASTA file the classifiers
  # should read; that file is removed at the end of this block.
  FRM_INPUT=$(python "$CONSTAXPATH"/check_input_names.py -i "$INPUT")
  "$SINTAXPATH" -sintax "$FRM_INPUT" -db "${TFILES}"/sintax.db -tabbedout "$TAX"/otu_taxonomy.sintax -strand both -sintax_cutoff "$CONF" -threads "$NTHREADS"
  if [[ ${SINTAXPATH##*/} == "vsearch" ]]
  then
    # For vsearch output, append "00" after every "(d.dd" confidence value.
    sed -i'' -e 's|([0-1][.][0-9]\{2\}|&00|g' "$TAX"/otu_taxonomy.sintax
  fi
  if $BLAST
  then
    if $MSU_HPCC && ! $TRAIN
    then
      module load BLAST
    fi
    # workaround code for blast getting stuck: BLAST the input in the chunks
    # written by split_inputs.py and append every result to one file
    python "$CONSTAXPATH"/split_inputs.py -i "$FRM_INPUT"
    echo > "$TAX"/blast.out
    for i in "${FRM_INPUT%.fasta}"_*.fasta
    do
      [ -e "$i" ] || continue # unmatched glob: no chunk files were produced
      blastn -query "$i" -db "$TFILES"/"$base"__BLAST -num_threads "$NTHREADS" -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs "$MAX_HITS" >> "$TAX"/blast.out
      rm -- "$i"
    done
    python "$CONSTAXPATH"/blast_to_df.py -i "$TAX"/blast.out -o "$TAX"/otu_taxonomy.blast -d "$DB" -t "$TFILES" -f "$FORMAT"
  else
    "$UTAXPATH" -utax "$FRM_INPUT" -db "${TFILES}"/utax.db -strand both -utaxout "$TAX"/otu_taxonomy.utax -utax_cutoff "$CONF" -threads "$NTHREADS"
  fi
  if command -v "$RDPPATH" > /dev/null 2>&1
  then
    "$RDPPATH" classify --conf "$CONF" --format allrank --train_propfile "${TFILES}"/rRNAClassifier.properties -o "$TAX"/otu_taxonomy.rdp "$FRM_INPUT" -Xmx"$MEM"m
  else
    java -Xmx"$MEM"m -jar "$RDPPATH" classify --conf "$CONF" --format allrank --train_propfile "${TFILES}"/rRNAClassifier.properties -o "$TAX"/otu_taxonomy.rdp "$FRM_INPUT"
  fi
  echo "__________________________________________________________________________"
  if [ -f "$ISOLATES" ] && [ -s "$ISOLATES" ]
  then
    echo "Comparing to Isolates"
    USE_ISOS=True
    if $MSU_HPCC && ! $BLAST
    then
      module load BLAST
    fi
    # Temporary BLAST database built from the isolates file inside $TAX.
    iso_blast_db="$TAX/$(basename -- "${ISOLATES%.*}")__BLAST"
    python "$CONSTAXPATH"/check_input_names.py -i "$ISOLATES" -n "$TAX/"isolates_formatted.fasta
    makeblastdb -in "$TAX/"isolates_formatted.fasta -dbtype nucl -out "$iso_blast_db"
    rm "$TAX/"isolates_formatted.fasta
    blastn -query "$FRM_INPUT" -db "$iso_blast_db" -num_threads "$NTHREADS" -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs 1 -evalue 0.00001 > "$TAX"/isolates_blast.out
    rm "$iso_blast_db".n*
  fi
  if [ -f "$HL_DB" ] && [ -s "$HL_DB" ]
  then
    echo "High Level Taxonomy Assignment"
    HL_FMT=$(python "$CONSTAXPATH"/detect_format.py -d "$HL_DB" 2>&1)
    if [[ $HL_FMT == "INVALID" ]]
    then
      echo "High-level taxonomy database file $HL_DB must be in UNITE or SILVA format, exiting..."
      exit 1
    fi
    if $MSU_HPCC && ! $BLAST
    then
      module load BLAST
    fi
    # Temporary BLAST database built from the high-level file inside $TAX.
    hl_blast_db="$TAX/$(basename -- "${HL_DB%.*}")__BLAST"
    python "$CONSTAXPATH"/check_input_names.py -i "$HL_DB" -n "$TAX/"hl_formatted.fasta --filter
    makeblastdb -in "$TAX/"hl_formatted.fasta -dbtype nucl -out "$hl_blast_db"
    rm "$TAX/"hl_formatted.fasta
    blastn -query "$FRM_INPUT" -db "$hl_blast_db" -num_threads "$NTHREADS" -outfmt "7 qacc sacc evalue bitscore pident qcovs" -max_target_seqs 1 -evalue 0.001 > "$TAX"/hl_blast.out
    rm "$hl_blast_db".n*
  else
    echo ""
  fi
  rm "$FRM_INPUT"
fi
# Merge the per-classifier assignments with CombineTaxonomy.py. The argument
# list is assembled once; the BLAST-only flags (-b -e -m -p) are spliced in
# after the directory arguments when -b/--blast is active, giving the same
# argument order as two separate invocations would.
echo "Combining Taxonomies"
combine_args=(-c $CONF -o "$OUTPUT/" -x "$TAX/")
if $BLAST
then
  combine_args+=(-b -e $EVALUE -m $MAX_HITS -p $P_IDEN)
fi
combine_args+=(-f $FORMAT -d "$DB" -t "$TFILES" -i $USE_ISOS --hl $HL_FMT --iso_qc $ISO_QC --iso_id $ISO_ID --hl_qc $HL_QC --hl_id $HL_ID -s $CONSERVATIVE -n $CONSISTENT)
python "$CONSTAXPATH"/CombineTaxonomy.py "${combine_args[@]}"
# On the MSU HPCC, load the toolchain and R modules.
if $MSU_HPCC; then
  module load GCC/8.3.0 OpenMPI/3.1.4
  module load R
fi
# plot R: with --make_plot, run ComparisonBars.R on the output directory; the
# second argument tells the script whether BLAST (TRUE) or UTAX (FALSE) was used.
if $MAKE_PLOT; then
  if $BLAST; then
    plot_used_blast=TRUE
  else
    plot_used_blast=FALSE
  fi
  Rscript "$CONSTAXPATH"/ComparisonBars.R "$OUTPUT/" $plot_used_blast $FORMAT
fi