-
Notifications
You must be signed in to change notification settings - Fork 8
/
run_test_suite.sh
executable file
·120 lines (88 loc) · 5.38 KB
/
run_test_suite.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env bash
#: PROGRAM: run_test_suite.sh
#: AUTHORS: Pablo Vinuesa, Center for Genomic Sciences, UNAM, Mexico; @pvinmex
#: AIM: used for functional testing of the following scripts of the GET_PHYLOMARKERS package:
#: - run_get_phylomarkers_pipeline.sh
#: - estimate_pangenome_phylogenies.sh
#: - hcluster_pangenome_matrix.sh # <<< is officially distributed through the get_homologues GitHub repo
# Script identity; both values are interpolated into the usage message.
version='2024-04-13'
progname=${0##*/}   # invoked script name with any leading directory stripped

# Bash "unofficial strict mode": abort on the first failing command (errexit),
# on expansion of an unset variable (nounset), and when any stage of a
# pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail
#######################################
# Print the help text and terminate the script.
# Globals:
#   progname, version (read) - script name and version shown in the synopsis
#   HOME (read, optional)    - shown in the host example when set
# Arguments:
#   $1 - optional exit status (default: 0)
# Outputs:
#   Help text on stdout; on stderr instead when the exit status is non-zero.
# Returns:
#   Never returns: exits the shell with the requested status.
#######################################
function usage()
{
  local exit_status=${1:-0}

  # The script runs under 'set -u', so expanding an unset HOME inside the
  # here-document would abort before any help is printed. Fall back to the
  # literal text '$HOME' in that case.
  # shellcheck disable=SC2016  # the single-quoted $HOME is intentionally literal
  local home_dir='$HOME'
  if [[ -n "${HOME:-}" ]]; then
    home_dir=$HOME
  fi

  # Help requested as part of an error report belongs on stderr.
  local out_fd=1
  if [[ "$exit_status" != 0 ]]; then
    out_fd=2
  fi

  cat >&"$out_fd" <<USAGE
1. Run the Makefile with:
make
make clean
2. As standard script
${progname} v.${version} <full path to BASE_DIRECTORY holding core_genome and pan_genome test data>
EXAMPLES:
# if running from container
${progname} /home/you/data
# if running from your host
${progname} ${home_dir}/data/genomes/test_sequences
AIM: used for functional testing of the following scripts of the GET_PHYLOMARKERS package:
- run_get_phylomarkers_pipeline.sh
- estimate_pangenome_phylogenies.sh
- hcluster_pangenome_matrix.sh
NOTES:
Assumes that you have your test core_genome/ and pan_genome/ sequences availabe on your host machine under
~/data/genomes/test_sequences, or another base_dir provided as single argument to the script
To let the container having acces to these data, bind mount that host directory on the container instance
under /home/you/data with the following command to lauch the container
LAUNCHING THE CONTAINER AND BIND MOUNTING THE ~/data/genomes/test_sequences dir on /home/you/data
$ docker run -it --rm -v ~/data/genomes/test_sequences:/home/you/data vinuesa/get_phylomarkers:latest /bin/bash
USAGE

  exit "$exit_status"
}
# ---------------------------------------------------------------------------
# Main: validate the base directory given as $1, then run the functional
# tests in order. The script runs under 'set -e', so the first failing test
# command aborts the whole suite with that command's exit status.
# ---------------------------------------------------------------------------

# Print an error message on stderr and abort with a non-zero exit status.
die() { printf '%s\n' "$*" >&2; exit 1; }

# With no argument, or an explicit help flag, print the help; usage() exits.
if (( $# == 0 )); then usage; fi
case "$1" in
  -h|--help) usage ;;
esac

base_dir=$1
[[ -d "$base_dir" ]] \
  || die "ERROR: base directory '$base_dir' does not exist or is not a directory"

# Resolve to an absolute path: the script changes directory twice, so a
# relative base_dir would no longer resolve once we are inside core_genome.
base_dir=$(CDPATH='' cd -- "$base_dir" && pwd) \
  || die "ERROR could not cd into $base_dir"

# Fail fast: check both data directories before starting the long-running tests.
for sub_dir in core_genome pan_genome; do
  [[ -d "${base_dir}/${sub_dir}" ]] \
    || die "ERROR: missing test data directory ${base_dir}/${sub_dir}"
done

# run basic test suite for GET_PHYLOMARKERS
cd "${base_dir}"/core_genome || die "ERROR could not cd into $base_dir/core_genome"

# Remove output directories left by previous runs. With -f, rm silently
# ignores the literal pattern when the glob matches nothing.
rm -rf -- get_phylomarkers_run*

## NOTE: use -I 2 (but not higher, or a core may be dumped in some occasions by iqtree -T IQT_threads) instead of AUTO to speed-up tests

# 1. default on DNA sequences (uses IQ-TREE evaluating a subset of models specified in the detailed help)
echo ">>> Test #1: default run on DNA sequence ..."
run_get_phylomarkers_pipeline.sh -R 1 -t DNA -I 2
echo

# 2. thorough FastTree searching and molecular clock analysis on DNA sequences using 10 cores and increasing k stringency
echo ">>> Test #2 thorough FastTree searching and molecular clock analysis on DNA sequences using 10 cores and increasing k stringency ..."
run_get_phylomarkers_pipeline.sh -R 1 -t DNA -A F -k 1.2 -m 0.7 -s 20 -l 12 -T high -K -M HKY -q 0.95
echo

# 3. test multiple models with high kdetree stringency (k=1.0) and thorough IQT searches, using 2 seed trees
echo ">>> Test #3: multiple models with high kdetree stringency (k=1.0) and thorogh IQT searches, using 2 seed trees"
run_get_phylomarkers_pipeline.sh -R 1 -t DNA -S 'TrN,TVMe,GTR' -k 1.0 -m 0.75 -T high -N 2 -I 2
echo

# 4. IQT with proteins and moderate average bipartition support
echo ">>> Test #4: IQT with proteins and moderate average bipartition support"
run_get_phylomarkers_pipeline.sh -R 1 -t PROT -m 0.2 -I 2
echo

# 5. FastTree thorough searching on a protein dataset with thorough search
echo ">>> Test # 5. FastTree thorough searching on a protein dataset with thorough search"
run_get_phylomarkers_pipeline.sh -R 1 -t PROT -A F -T high -m 0.2
echo

# 6. Run in population-genetics mode with K2P model, estimating the population tree with IQ-TREE
echo ">>> Test # 6. Run in population-genetics mode with K2P model, estimating the population tree on SNP matrix with IQ-TREE under best-fit model"
run_get_phylomarkers_pipeline.sh -R 2 -t DNA -S 'K2P'
echo

# 7. Run in population-genetics mode, estimating the population tree with FastTree
echo ">>> Test # 7. Run in population-genetics, estimating the population tree on SNP matrix with FastTree"
run_get_phylomarkers_pipeline.sh -R 2 -t DNA -A F
echo

# 8. estimate a ML pan-genome tree from the pan-genome matrix, using 2 independent IQT runs and UFBoot
echo ">>> Test # 8. estimate a ML pan-genome tree from the pan-genome matrix, using 2 independent IQT runs and UFBoot"
cd "${base_dir}"/pan_genome || die "ERROR could not cd into $base_dir/pan_genome"
# Start from a clean slate: drop the output directory of a previous run.
if [[ -d iqtree_PGM_2_runs ]]; then
  rm -rf -- iqtree_PGM_2_runs
fi
estimate_pangenome_phylogenies.sh -f pangenome_matrix_t0.fasta -r 2 -S UFBoot -I 2
echo

# 9. estimate a PARS pan-genome tree with bootstrapping; 100 bootstrap replicates divided on 10 core (10 reps / core)
echo ">>> Test # 9. estimate a PARS pan-genome tree with bootstrapping; 100 bootstrap replicates divided on 10 core (10 reps / core)"
if [[ -d boot_pars ]]; then
  rm -rf -- boot_pars
fi
estimate_pangenome_phylogenies.sh -c PARS -R 3 -i pangenome_matrix_t0.phylip -n 10 -b 10 -j 1 -t 1
echo

# 10. (disabled) cluster the pan-genome matrix and run the silhouette statistic to define the optimal number of clusters
# hcluster_pangenome_matrix.sh is officially distributed through the get_homologues GitHub repo; should not test here
#echo ">>> Test # 10. cluster the pan-genome matrix and run silhoute statistic to define the optimal number of clusters"
#hcluster_pangenome_matrix.sh -i pangenome_matrix_t0.tab -a ward.D2 -d gower -O pdf -A 'NULL,45' -X 0.8 -T "Pangenome tree"
#echo