diff --git a/CMakeLists.txt b/CMakeLists.txt
index 939820d7..8af85050 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@
# =============================================================================
cmake_minimum_required( VERSION 3.10 FATAL_ERROR )
-project( ganon VERSION 1.6.0 LANGUAGES CXX )
+project( ganon VERSION 1.7.0 LANGUAGES CXX )
# -----------------------------------------------------------------------------
# build setup
diff --git a/docs/classification.md b/docs/classification.md
index 5946b5d8..67600467 100644
--- a/docs/classification.md
+++ b/docs/classification.md
@@ -1,33 +1,46 @@
# Classification
-`ganon classify` will match single and/or paired-end reads against one or [more databases](#multiple-and-hierarchical-classification), for example:
+`ganon classify` will match single and/or paired-end sets of reads against one or [more databases](#multiple-and-hierarchical-classification).
+By default, parameters are optimized for **taxonomic profiling**.
+
+Example:
```bash
ganon classify --db-prefix my_db --paired-reads reads.1.fq.gz reads.2.fq.gz --output-prefix results --threads 32
```
-`ganon report` will be automatically executed after classification and a report will be created (`.tre`).
+`ganon report` will be automatically executed after `ganon classify` and a [report will be created `.tre`](../outputfiles/#ganon-report).
-ganon can generate both taxonomic profiling and binning results with `ganon classify` + `ganon report`. Please choose the parameters according to your application.
+ganon can perform **taxonomic profiling** and/or **binning** (one tax. assignment for each read) at a taxonomic, strain or sequence level with `ganon classify` + `ganon report`. Some guidelines are listed below, please choose the parameters according to your application:
### Profiling
`ganon classify` is set-up by default to perform taxonomic profiling. It uses:
- - strict `--rel-cutoff` and `--rel-filter` values (`0.75` and `0`, respectively)
- - `--min-count 0.0001` (0.01%) on `ganon report` to exclude low abundant groups
- - `--report-type abundance` on `ganon report` to generate taxonomic abundances, re-distributing read counts and correcting for genome sizes
+ - strict thresholds: `--rel-cutoff 0.75` and `--rel-filter 0`
+
+`ganon report` will automatically run after classification with:
+
+ - `--min-count 0.005` (0.5%) to exclude low abundant taxa
+ - `--report-type abundance` to generate taxonomic abundances, re-distributing read counts and correcting for genome sizes
+
+!!! Note
+ `ganon report` can be used independently from `ganon classify` with the output file `.rep`
### Binning
-To achieve better results for binning reads to specific references, ganon can be configured with:
+To achieve better results for taxonomic binning or sequence classification, ganon can be configured with:
- `--output-all` and `--output-lca` to write `.all` `.lca` files for binning results
- less strict `--rel-cutoff` and `--rel-filter` values (e.g. `0.25` and `0.1`, respectively)
- - activate the `--reassign` on `ganon classify` (or use the `ganon reassign` procedure) to apply a EM algorithm, re-assigning reads with LCA matches to most probable target (`--level` the database was built)
+ - activate the `--reassign` to apply an EM algorithm, re-assigning reads with LCA matches to one most probable target (defined by `--level` in the build procedure). In this case, the `.all` file will be re-generated with one assignment per read.
+
+!!! Note
+ `ganon reassign` can be used independently from `ganon classify` with the output file `.rep` and `.all`
!!! tip
- Higher `--kmer-size` values on `ganon build` can also improve read binning sensitivity
+    Database parameters can also influence your results. Lower `--max-fp` (e.g. 0.1, 0.001) and higher `--kmer-size` (e.g. `23`, `27`) will improve sensitivity of your results at the cost of a larger database and memory usage
+
## Multiple and Hierarchical classification
@@ -128,4 +141,13 @@ For databases built with `--window-size`, the relative values are not based on t
A different `cutoff` can be set for every database in a multiple or hierarchical database classification. A different `filter` can be set for every level of a hierarchical database classification.
-Note that reads that remain with only one reference match (after `cutoff` and `filter` are applied) are considered a unique match.
\ No newline at end of file
+Note that reads that remain with only one reference match (after `cutoff` and `filter` are applied) are considered a unique match.
+
+### False positive of a query (--fpr-query)
+
+ganon uses Bloom Filters, probabilistic data structures that may return false positive results. The base false positive of a ganon index is controlled by `--max-fp` when building the database. However, this value is the expected false positive for each k-mer. In practice, a sequence (several k-mers) will have a much smaller false positive. ganon calculates the false positive rate of a query as suggested by (Solomon and Kingsford, 2016). The `--fpr-query` will control the max. value accepted to consider a match between a sequence and a reference, avoiding false positives that may be introduced by the properties of the data structure.
+
+By default, `--fpr-query 1e-5` is used and it is applied after the `--rel-cutoff` and `--rel-filter`. Values between `1e-3` and `1e-10` are recommended. This threshold becomes more important when building smaller databases with higher `--max-fp`, assuring that the false positive is under control. In this case, however, you may notice a decrease in sensitivity of your results.
+
+!!! Note
+ The false positive of a query was first proposed in: Solomon, Brad, and Carl Kingsford. “Fast Search of Thousands of Short-Read Sequencing Experiments.” Nature Biotechnology 34, no. 3 (2016): 1–6. https://doi.org/10.1038/nbt.3442.
\ No newline at end of file
diff --git a/docs/custom_databases.md b/docs/custom_databases.md
index 6d8aad94..7795c4fe 100644
--- a/docs/custom_databases.md
+++ b/docs/custom_databases.md
@@ -182,7 +182,7 @@ ganon build-custom --input download/ --input-recursive --db-prefix fdaargos --nc
!!! note
The example above uses [genome_updater](https://github.com/pirovc/genome_updater) to download files
-### BLAST databases (nt, env_nt, nt_prok, ...)
+### BLAST databases (nt env_nt nt_prok ...)
BLAST databases. [Website](https://blast.ncbi.nlm.nih.gov/Blast.cgi)/[FTP](https://ftp.ncbi.nlm.nih.gov/blast/db/).
@@ -199,24 +199,36 @@ The example below extracts sequences and information from a BLAST db to build a
```bash
# Define BLAST db
db="16S_ribosomal_RNA"
+threads=8
-# Download BLAST db
-wget -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/blast/db/${db}*.tar.gz"
-# Using 12 threads: curl --silent --list-only ftp://ftp.ncbi.nlm.nih.gov/blast/db/ | grep "${db}.*.tar.gz$" | xargs -P 12 -I{} wget -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/blast/db/{}"
+# Download BLAST db - re-run this command many times until all finish (no more output)
+curl --silent --list-only ftp://ftp.ncbi.nlm.nih.gov/blast/db/ | grep "^${db}\..*tar.gz$" | xargs -P ${threads:-1} -I{} wget --continue -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/blast/db/{}"
-# Merge and extract BLAST db files
-cat "${db}"*.tar.gz | tar xvfz - > ex_files.txt
-ex_file=$(head -n 1 ex_files.txt)
-dbprefix="${ex_file%.*}"
+# OPTIONAL Download and check MD5
+wget -O - -nd --quiet --show-progress "ftp://ftp.ncbi.nlm.nih.gov/blast/db/${db}\.*tar.gz.md5" > "${db}.md5"
+md5sum "${db}".*tar.gz > "${db}_downloaded.md5"
+diff -s <(sort -k 2,2 "${db}.md5") <(sort -k 2,2 "${db}_downloaded.md5") # Should print "Files /dev/fd/xx and /dev/fd/xx are identical"
-# Generate sequences from BLAST db
-blastdbcmd -entry all -db "${dbprefix}" -out "${db}.fna"
+# Extract BLAST db files, if successful, remove .tar.gz
+ls "${db}"*.tar.gz | xargs -P ${threads} -I{} sh -c 'gzip -dc {} | tar --overwrite -vxf - && rm {}' > "${db}_extracted_files.txt"
-# Generate ganon input file
-blastdbcmd -entry all -db "${dbprefix}" -outfmt "%a %X" | awk -v file="${db}.fna" '{print file"\t"$1"\t"$2 }' > "${db}_ganon_input_file.tsv"
+# Create folder to write sequence files (split into 10 sub-folders)
+seq 0 9 | xargs -i mkdir -p "${db}"/{}
+
+# This command extracts sequences from the blastdb and writes them into taxid specific files
+# It also generates the --input-file for ganon
+blastdbcmd -entry all -db "${db}" -outfmt "%a %T %s" | \
+awk -v db="$(realpath ${db})" '{file=db"/"substr($2,1,1)"/"$2".fna"; print ">"$1"\n"$3 >> file; print file"\t"$2"\t"$2}' | \
+sort | uniq > "${db}_ganon_input_file.tsv"
# Build ganon database
-ganon build-custom --input-file "${db}_ganon_input_file.tsv" --db-prefix "${db}" --input-target sequence --level leaves --threads 32
+ganon build-custom --input-file "${db}_ganon_input_file.tsv" --db-prefix "${db}" --level species --threads 12
+
+# Delete extracted files and auxiliary files
+cat "${db}_extracted_files.txt" | xargs rm
+rm "${db}_extracted_files.txt" "${db}.md5" "${db}_downloaded.md5"
+# Delete sequences
+rm -rf "${db}" "${db}_ganon_input_file.tsv"
```
!!! note
@@ -234,7 +246,7 @@ ganon build-custom --input output_folder_genome_updater/version/ --input-recursi
### False positive and size (--max-fp, --filter-size)
-ganon indices are based on bloom filters and can have false positive matches. This can be controlled with `--max-fp` parameter. The lower the `--max-fp`, the less chances of false positives matches on classification, but the larger the database size will be. For example, with `--max-fp 0.01` the database will be build so any target (defined by `--level`) will have 1 in a 100 change of reporting a false k-mer match. The false positive of the query (all k-mers of the reads) is higher but directly affected.
+ganon indices are based on bloom filters and can have false positive matches. This can be controlled with `--max-fp` parameter. The lower the `--max-fp`, the lower the chance of false positive matches on classification, but the larger the database size will be. For example, with `--max-fp 0.01` the database will be built so any target (defined by `--level`) will have a 1 in a 100 chance of reporting a false k-mer match. [The false positive of the query](../classification/#false-positive-of-a-query-fpr-query) (all k-mers of a read) will be much lower, but directly affected by this value.
Alternatively, one can set a specific size for the final index with `--filter-size`. When using this option, please observe the theoretic false positive of the index reported at the end of the building process.
diff --git a/docs/default_databases.md b/docs/default_databases.md
index b2ec6518..da241df3 100644
--- a/docs/default_databases.md
+++ b/docs/default_databases.md
@@ -31,15 +31,15 @@ NCBI RefSeq and GenBank repositories are common resources to obtain reference se
| RefSeq | # assemblies | Size (GB) * | |
|---|---|---|---|
-| Complete | 295219 | 350 - 500 | cmd
`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --db-prefix abfv_rs` |
+| Complete | 295219 | 160 - 500 | cmd
`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --db-prefix abfv_rs` |
| One assembly per species | 52779 | 40 - 98 | cmd
`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --genome-updater "-A 'species:1'" --db-prefix abfv_rs_t1s` |
| Complete genomes (higher quality) | 44121 | 19 - 64 | cmd
`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes --db-prefix abfv_rs_cg` |
| One assembly per species of complete genomes | 19713 | 8 - 27 | cmd
`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes "-A 'species:1'" --db-prefix abfv_rs_cg_t1s` |
-| One representative assembly per species | 18073 | 21 - 35 | cmd
`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --genome-updater "-c 'representative genome'" --db-prefix abfv_rs_rg` |
+| One representative assembly per species | 18073 | 21 - 35 | cmd
`ganon build --source refseq --organism-group archaea bacteria fungi viral --threads 48 --representative-genomes --db-prefix abfv_rs_rg` |
| GenBank | # assemblies | Size (GB) * | |
|---|---|---|---|
-| Complete | 1595845 | | cmd
`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --db-prefix abfv_gb` |
+| Complete | 1595845 | - | cmd
`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --db-prefix abfv_gb` |
| One assembly per species | 99505 | 91 - 420 | cmd
`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --genome-updater "-A 'species:1'" --db-prefix abfv_gb_t1s` |
| Complete genomes (higher quality) | 92917 | 24 - 132 | cmd
`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes --db-prefix abfv_gb_cg` |
| One assembly per species of complete genomes | 34497 | 10 - 34 | cmd
`ganon build --source genbank --organism-group archaea bacteria fungi viral --threads 48 --complete-genomes "-A 'species:1'" --db-prefix abfv_gb_cg_t1s` |
@@ -124,18 +124,27 @@ genome_updater.sh -e assembly_summary.txt -f "genomic.fna.gz" -o recovered_files
## Reducing database size
+### False positive
+
+A higher `--max-fp` value will generate a smaller database but with a higher number of false positive matches on classification. [More details](../custom_databases/#false-positive-and-size-max-fp-filter-size). Values between `0.001` (0.1%) and `0.3` (30%) are generally used.
+
+!!! hint
+ When using higher `--max-fp` values, more false positive results may be generated. This can be filtered with the `--fpr-query` parameter in `ganon classify`
+
+### Fixed size
+
+A fixed size for the database filter can be defined with `--filter-size`. The smaller the filter size, the higher the false positive chances on classification. When using a fixed filter size, ganon will report the max. and avg. false positive rate at the end of the build. [More details](../custom_databases/#false-positive-and-size-max-fp-filter-size).
+
### HIBF
-The Hierarchical Interleaved Bloom Filter (HIBF) is an improvement over the Interleaved Bloom Filter (IBF) and provides *smaller* databases with *faster* query times ([pre-print](https://www.biorxiv.org/content/10.1101/2022.08.01.502266v1)). However, the HIBF takes longer to build and has less flexibility regarding size control. The HIBF can be generated in `ganon build` and `ganon build-custom` with the `--hibf` parameter.
+The Hierarchical Interleaved Bloom Filter (HIBF) is an improvement over the default Interleaved Bloom Filter (IBF) and generates *smaller* databases with *faster* query times ([article](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-023-02971-4)). However, the HIBF takes longer to build and has less flexibility regarding size and further options in ganon. The HIBF can be generated in `ganon build` and `ganon build-custom` with the `--hibf` parameter.
-**This is the best method to reduce large database sizes and to achieve faster classification speed.**
+Due to differences between the default IBF used in ganon and the HIBF, it is recommended to lower the false positive when using the HIBF. A recommended value for high sensitivity is 0.1% (`--hibf --max-fp 0.001`).
!!! hint
- - For larger reference sets, huge amount of reads to query or production level analysis -> HIBF
- - For quick build and analysis with smaller data -> IBF (default)
+ - For large unbalanced reference sets, lots of reads to query -> HIBF
+ - For quick build and more flexibility -> IBF (default)
-!!! warning
- [raptor (v3.0.0)](https://github.com/seqan/raptor/releases/tag/raptor-v3.0.0) has to be installed to build databases with `--hibf`
### Top assemblies
@@ -146,14 +155,6 @@ RefSeq and GenBank are highly biased toward some few organisms. This means that
- `ganon build --genome-updater "-A 'species:1'"` will select one assembly for each species
- `ganon build --genome-updater "-A 'genus:3'"` will select three assemblies for each genus
-### False positive
-
-A higher `--max-fp` value will generate a smaller database but with a higher number of false positive matches on classification. [More details](../custom_databases/#false-positive-and-size-max-fp-filter-size). Values between `0.01` and `0.3` are generally used.
-
-### Fixed size
-
-A fixed size for the database filter can be defined with `--filter-size`. The smaller the filter size, the higher the false positive chances on classification. When using a fixed filter size, ganon will report the max. and avg. false positive rate at the end of the build. [More details](../custom_databases/#false-positive-and-size-max-fp-filter-size).
-
### Mode
`--mode` offers 5 different categories to build a database controlling the trade-off between size and classification speed.
@@ -163,7 +164,7 @@ A fixed size for the database filter can be defined with `--filter-size`. The sm
- `fast` or `fastest`: create bigger databases with faster classification speed
!!! Warning
- If `--filter-size` is used, `smaller` and `smallest` to the false positive and not to the database size which is fixed.
+ If `--filter-size` is used, `smaller` and `smallest` refers to the false positive and not to the database size (which is fixed).
### Split databases
@@ -183,7 +184,7 @@ Define how much unique information is stored in the database. [More details](../
### Example
-Besides the huge benefits of using [HIBF](#hibf) and specific sub-sets of big repositories shown on the [default databases table](#commonly-used-sub-sets), examples of other reduction strategies (without `--hibf`) can be seen below:
+Besides the benefits of using [HIBF](#hibf) and specific sub-sets of big repositories shown on the [default databases table](#commonly-used-sub-sets), examples of other reduction strategies (without `--hibf`) can be seen below:
*RefSeq archaeal complete genomes from 20230505*
diff --git a/docs/index.md b/docs/index.md
index 0d542e38..02eb049e 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -33,11 +33,8 @@ However, there are possible performance benefits compiling ganon from source in
## Installation from source
-### Dependencies
+### Dependencies Python
-- gcc >=7
-- cmake >=3.10
-- zlib
- python >=3.6
- pandas >=1.1.0
- [multitax](https://github.com/pirovc/multitax) >=1.3.1
@@ -46,6 +43,11 @@ However, there are possible performance benefits compiling ganon from source in
python3 -V # >=3.6
python3 -m pip install "pandas>=1.1.0" "multitax>=1.3.1"
```
+### Dependencies C++
+
+- GCC >=7
+- CMake >=3.10
+- zlib
### Downloading and building ganon + submodules
@@ -54,18 +56,20 @@ git clone --recurse-submodules https://github.com/pirovc/ganon.git
```
```bash
+# Install Python side
cd ganon
python3 setup.py install --record files.txt # optional
-mkdir build_cpp
-cd build_cpp
+# Compile and install C++ side
+mkdir -p build
+cd build
cmake -DCMAKE_BUILD_TYPE=Release -DVERBOSE_CONFIG=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCONDA=OFF -DLONGREADS=OFF ..
-make
+make -j 4
sudo make install # optional
```
- to change install location (e.g. `/myprefix/bin/`), set the installation prefix in the cmake command with `-DCMAKE_INSTALL_PREFIX=/myprefix/ `
- use `-DINCLUDE_DIRS` to set alternative paths to cxxopts and Catch2 libs.
-- to classify extremely large reads (>65000bp) use `-DLONGREADS=ÒN`
+- to classify extremely large reads or contigs that would need more than 65000 k-mers, use `-DLONGREADS=ON`
If everything was properly installed, the following commands should show the help pages without errors:
@@ -78,25 +82,36 @@ ganon -h
```bash
python3 -m unittest discover -s tests/ganon/integration/
python3 -m unittest discover -s tests/ganon/integration_online/ # optional - downloads large files
-cd build_cpp/
+cd build/
ctest -VV .
```
+
+### Installing raptor (optional)
+
+If you want to use the Hierarchical Interleaved Bloom Filter `--hibf` in `ganon build` you will need to install [raptor](https://github.com/seqan/raptor) either via conda `conda install -c bioconda -c conda-forge raptor` (already included in the installation if you installed ganon via conda) or from source:
+
+#### Dependencies
-## Important parameters
+ - CMake >= 3.18
+ - GCC 11, 12 or 13 (most recent minor version)
+ - git
-The most important parameters and trade-offs are:
+#### Downloading and building raptor + submodules
-- `ganon build` `--hibf`: build smaller databases that can be queried faster. Building will take longer.
-- `ganon build` `--window-size --kmer-size`: the *window* value should always be the same or larger than the *kmer* value. The larger the difference between them, the smaller the database will be. However, some sensitivity/precision loss in classification is expected with small *kmer* and/or large *window*. Larger *kmer* values (e.g. `31`) will improve classification, specially read binning, at a cost of way bigger databases.
----
-- `ganon classify` `--rel-cutoff`: this value defines the threshold for matches between reads and database. Higher `--rel-cutoff` values will improve precision and decrease sensitivity with expected less unique matches but an increase in overall matches. For taxonomic profiling, a higher value between `0.4` and `0.8` may provide better results. For read binning, lower values between `0.2` and `0.4` are recommended.
-- `ganon classify` `--rel-filter`: further filter top matches after cutoff is applied. Usually set between `0` and `0.2`.
-- `ganon classify` `--reassign`: runs an EM-algorithm to reassign reads that received multiple matches. It provides a unique match for each read at the level the database was built (e.g. assembly or species). Mostly useful for read binning, with little overall impact on taxonomic profiling. Can be used independently with `ganon reassign`.
----
-- `ganon report` `--report-type`: reports either taxonomic, sequence or matches abundances. Use `corr` or `abundance` for taxonomic profiling, `reads` or `dist` for sequence profiling and `matches` to report a summary of all matches.
-- `ganon report` `--min-count`: cutoff to discard underrepresented taxa. Useful to remove the common long tail of spurious matches and false positives when performing classification. Values between `0.0001` (0.01%) and `0.001` (0.1%) improved sensitivity and precision in our evaluations. The higher the value, the more precise the outcome, with a sensitivity loss. Alternatively `--top-percentile` can be used to keep a relative amount of taxa instead a hard cutoff.
+```bash
+git clone --branch raptor-v3.0.0 --recurse-submodules https://github.com/seqan/raptor
+```
+
+```bash
+cd raptor
+mkdir -p build
+cd build
+cmake ..
+make -j 4
+```
-The numeric values above are averages from several experiments with different sample types and database contents. They may not work as expected for your data. If you are not sure which values to use or see something unexpected, please open an [issue](https://github.com/pirovc/ganon/issues).
+- binaries will be located in the `bin` directory
+- you may have to inform `ganon build` where to find the binaries with `--raptor-path raptor/build/bin`
## Parameters
@@ -107,7 +122,7 @@ usage: ganon [-h] [-v]
- - - - - - - - - -
_ _ _ _ _
(_|(_|| |(_)| |
- _| v. 1.6.0
+ _| v. 1.7.0
- - - - - - - - - -
positional arguments:
@@ -131,8 +146,9 @@ options:
ganon build
```
-usage: ganon build [-h] [-g [...]] [-a [...]] [-b [...]] [-o] [-c] [-u] [-m [...]] [-z [...]] -d DB_PREFIX [-x] [-t]
- [-p] [-f] [-k] [-w] [-s] [-j] [-y] [--hibf] [--restart] [--verbose] [--quiet] [--write-info-file]
+usage: ganon build [-h] [-g [...]] [-a [...]] [-b [...]] [-o] [-c] [-r] [-u] [-m [...]] [-z [...]] [--skip-genome-size]
+ -d DB_PREFIX [-x] [-t] [-p] [-f] [-k] [-w] [-s] [-j] [-y] [--hibf] [--restart] [--verbose] [--quiet]
+ [--write-info-file]
options:
-h, --help show this help message and exit
@@ -154,12 +170,16 @@ download arguments:
-o , --top Download limited assemblies for each taxa. 0 for all. (default: 0)
-c, --complete-genomes
Download only sub-set of complete genomes (default: False)
+ -r, --representative-genomes
+ Download only sub-set of representative genomes (default: False)
-u , --genome-updater
Additional genome_updater parameters (https://github.com/pirovc/genome_updater) (default: None)
-m [ ...], --taxonomy-files [ ...]
Specific files for taxonomy - otherwise files will be downloaded (default: None)
-z [ ...], --genome-size-files [ ...]
Specific files for genome size estimation - otherwise files will be downloaded (default: None)
+ --skip-genome-size Do not attempt to get genome sizes. Activate this option when using sequences not representing
+ full genomes. (default: False)
important arguments:
-x , --taxonomy Set taxonomy to enable taxonomic classification, lca and reports [ncbi, gtdb, skip] (default:
@@ -197,9 +217,9 @@ optional arguments:
ganon build-custom
```
-usage: ganon build-custom [-h] [-i [...]] [-e] [-c] [-n] [-a] [-l] [-m [...]] [-z [...]] [-r [...]] [-q [...]] -d
- DB_PREFIX [-x] [-t] [-p] [-f] [-k] [-w] [-s] [-j] [-y] [--hibf] [--restart] [--verbose]
- [--quiet] [--write-info-file]
+usage: ganon build-custom [-h] [-i [...]] [-e] [-c] [-n] [-a] [-l] [-m [...]] [-z [...]] [--skip-genome-size] [-r [...]]
+ [-q [...]] -d DB_PREFIX [-x] [-t] [-p] [-f] [-k] [-w] [-s] [-j] [-y] [--hibf] [--restart]
+ [--verbose] [--quiet] [--write-info-file]
options:
-h, --help show this help message and exit
@@ -232,6 +252,8 @@ custom arguments:
Specific files for taxonomy - otherwise files will be downloaded (default: None)
-z [ ...], --genome-size-files [ ...]
Specific files for genome size estimation - otherwise files will be downloaded (default: None)
+ --skip-genome-size Do not attempt to get genome sizes. Activate this option when using sequences not representing
+ full genomes. (default: False)
ncbi arguments:
-r [ ...], --ncbi-sequence-info [ ...]
@@ -312,8 +334,8 @@ optional arguments:
```
usage: ganon classify [-h] -d [DB_PREFIX ...] [-s [reads.fq[.gz] ...]] [-p [reads.1.fq[.gz] reads.2.fq[.gz] ...]]
- [-c [...]] [-e [...]] [-o] [--output-lca] [--output-all] [--output-unclassified] [--output-single]
- [-t] [-l [...]] [-r [...]] [-a] [--verbose] [--quiet]
+ [-c [...]] [-e [...]] [-f [...]] [-o] [--output-lca] [--output-all] [--output-unclassified]
+ [--output-single] [-t] [-b] [-a] [-l [...]] [-r [...]] [--verbose] [--quiet]
options:
-h, --help show this help message and exit
@@ -335,6 +357,10 @@ cutoff/filter arguments:
Additional relative percentage of minimizers (relative to the best match) to keep a match.
Generally used to select best matches above cutoff. Single value or one per hierarchy (e.g. 0.1
0). 1 for no filter (default: [0.0])
+ -f [ ...], --fpr-query [ ...]
+ Max. false positive of a query to accept a match. Applied after --rel-cutoff and --rel-filter.
+ Generally used to remove false positives matches querying a database build with large --max-fp.
+ Single value or one per hierarchy (e.g. 0.1 0). 1 for no filter (default: [1e-05])
output arguments:
-o , --output-prefix
@@ -349,6 +375,10 @@ output arguments:
other arguments:
-t , --threads Number of sub-processes/threads to use (default: 1)
+ -b, --binning Optimized parameters for binning (--rel-cutoff 0.25 --reassign). Will report (.tre) sequence
+ abundances. This file can be re-generated with 'ganon report'. (default: False)
+ -a, --reassign Reassign reads with multiple matches with an EM algorithm. Will enforce --output-all. This file
+ can be re-generated with 'ganon reassign'. (default: False)
-l [ ...], --hierarchy-labels [ ...]
Hierarchy definition of --db-prefix files to be classified. Can also be a string, but input will
be sorted to define order (e.g. 1 1 2 3). The default value reported without hierarchy is 'H1'
@@ -358,8 +388,6 @@ other arguments:
phylum, class, order, family, genus, species, assembly]. This file can be re-generated with the
'ganon report' command for other types of abundances (reads, matches) with further filtration
and output options (default: [])
- -a, --reassign Reassign reads with multiple matches with an EM algorithm. Will enforce --output-all. This file
- can be re-generated with the 'ganon reassign'. (default: False)
--verbose Verbose output mode (default: False)
--quiet Quiet output mode (default: False)
```
@@ -398,9 +426,9 @@ other arguments:
ganon report
```
-usage: ganon report [-h] -i [...] [-e INPUT_EXTENSION] -o OUTPUT_PREFIX [-d [...]] [-x] [-m [...]] [-z [...]] [-f] [-t]
- [-r [...]] [-s] [-a] [-y] [-p [...]] [-k [...]] [-c] [--verbose] [--quiet] [--min-count]
- [--max-count] [--names [...]] [--names-with [...]] [--taxids [...]]
+usage: ganon report [-h] -i [...] [-e INPUT_EXTENSION] -o OUTPUT_PREFIX [-d [...]] [-x] [-m [...]] [-z [...]]
+ [--skip-genome-size] [-f] [-t] [-r [...]] [-s] [-a] [-y] [-p [...]] [-k [...]] [-c] [--verbose]
+ [--quiet] [--min-count] [--max-count] [--names [...]] [--names-with [...]] [--taxids [...]]
options:
-h, --help show this help message and exit
@@ -426,6 +454,8 @@ db/tax arguments:
Specific files for taxonomy - otherwise files will be downloaded (default: None)
-z [ ...], --genome-size-files [ ...]
Specific files for genome size estimation - otherwise files will be downloaded (default: None)
+ --skip-genome-size Do not attempt to get genome sizes. Valid only without --db-prefix. Activate this option when
+ using sequences not representing full genomes. (default: False)
output arguments:
-f , --output-format
diff --git a/docs/outputfiles.md b/docs/outputfiles.md
index 6b05b20a..045785d8 100644
--- a/docs/outputfiles.md
+++ b/docs/outputfiles.md
@@ -50,7 +50,7 @@ Each line in this report is a taxonomic entry (including the root node), with th
- The first line of the report file will show the number of unclassified reads (not for `--report-type matches`)
-- The CAMI challenge [bioboxes profiling format](https://github.com/bioboxes/rfc/blob/master/data-format/profiling.mkd) is supported using `--output-format bioboxes`. In this format, only values for the percentage/abundance (col. 9) are reported. The root node and unclassified entries are ommited.
+- The CAMI challenge [bioboxes profiling format](https://github.com/bioboxes/rfc/blob/master/data-format/profiling.mkd) is supported using `--output-format bioboxes`. In this format, only values for the percentage/abundance (col. 9) are reported. The root node and unclassified entries are omitted.
- The sum of cumulative assignments for the unclassified and root lines is 100%. The final cumulative sum of reads/matches may be under 100% if any filter is successfully applied and/or hierarchical selection is selected (keep/skip/split).
@@ -60,6 +60,8 @@ Each line in this report is a taxonomic entry (including the root node), with th
- {output_file}: a tab-separated file with counts/percentages of taxa for multiple samples
+---
+
Examples of output files
diff --git a/docs/reports.md b/docs/reports.md
index 6229f634..9c03b8e4 100644
--- a/docs/reports.md
+++ b/docs/reports.md
@@ -1,11 +1,42 @@
# Reports
+`ganon report` filters and generates several reports and summaries from the results obtained with `ganon classify`. It is possible to summarize the results in terms of taxonomic and sequence abundances as well as total number of matches.
+
+## Examples
+
+Given the output `.rep` from `ganon classify` and the database used (`--db-prefix`):
+
+### Taxonomic profile with abundance estimation (default)
+
+```bash
+ganon report --db-prefix mydb --input results.rep --output-prefix tax_profile --report-type abundance
+```
+
+### Sequence profile
+
+```bash
+ganon report --db-prefix mydb --input results.rep --output-prefix seq_profile --report-type reads
+```
+
+### Matches profile
+
+```bash
+ganon report --db-prefix mydb --input results.rep --output-prefix matches --report-type matches
+```
+
+### Filtering results
+
+```bash
+ganon report --db-prefix mydb --input results.rep --output-prefix filtered --min-count 0.0005 --top-percentile 0.8
+```
+
+This will keep only results with a min. abundance of `0.05%` and only the top `80%` most abundant.
## Parameter details
### report type (--report-type)
-Several reports are availble with `--report-type`: `reads`, `abundance`, `dist`, `corr`, `matches`:
+Several reports are available with `--report-type`: `reads`, `abundance`, `dist`, `corr`, `matches`:
`reads` reports **sequence abundances** which are the basic proportion of reads classified in the sample.
@@ -15,4 +46,4 @@ Several reports are availble with `--report-type`: `reads`, `abundance`, `dist`,
`corr` is the same of `reads` with correction by genome size
-`matches` will report the total number of matches classified, either unique or shared. *This report will output the total number of matches instead the total number of reads reported in all other reports.*
+`matches` will report the total number of matches classified, either unique or shared. *This option will output the total number of matches instead of the total number of reads*
diff --git a/docs/start.md b/docs/start.md
new file mode 100644
index 00000000..d7643e75
--- /dev/null
+++ b/docs/start.md
@@ -0,0 +1,53 @@
+# Quick Start Guide
+
+## Install
+
+```sh
+conda install -c bioconda -c conda-forge ganon
+```
+
+## Download and Build a database
+
+- Bacteria - NCBI RefSeq - representative genomes
+
+```bash
+ganon build --db-prefix bac_rs_rg --source refseq --organism-group bacteria --representative-genomes --threads 24
+```
+
+- If you want to test ganon functionalities with a smaller database, use `archaea` instead of `bacteria` in the example above.
+
+## Classify and generate a tax. profile
+
+- [Download test reads](https://github.com/pirovc/ganon_benchmark/raw/master/files/reads/cami/toy/H01_1M_0.1.fq.gz)
+
+```bash
+ganon classify --db-prefix bac_rs_rg --output-prefix classify_results --single-reads H01_1M_0.1.fq.gz --threads 24
+```
+
+- `classify_results.tre` -> taxonomic profile
+
+---
+
+
+## Important parameters
+
+The most important parameters and trade-offs to be aware of when using ganon:
+
+### ganon build
+
+- `--max-fp --filter-size`: controls the false positive of the bloom filters and the size of the filter (which is the same as the amount of memory needed). The higher the `--max-fp`, the smaller the databases at a cost of sensitivity in classification. `--filter-size` can be used instead of `--max-fp` to define a specific size for your database. In this case, the false positive will be reported at the end of the build.
+- `--window-size --kmer-size`: the *window* value should always be the same or larger than the *k-mer* value. The larger the difference between them, the smaller the database will be. However, some sensitivity/precision loss in classification is expected with small *k-mer* and/or large *window*. Larger *k-mer* values (e.g. `31`) will improve classification, specially read binning, at a cost of larger databases.
+- `--hibf`: build smaller databases that can be queried faster. Building will take longer.
+
+### ganon classify
+
+- `--rel-cutoff`: this value defines the threshold for matches between reads and database. Higher values will improve precision and decrease sensitivity, with fewer unique matches expected but an increase in overall matches. For taxonomic profiling, a higher value between `0.4` and `0.8` may provide better results. For read binning, lower values between `0.2` and `0.4` are recommended.
+- `--rel-filter`: further filter matches in relation to the best match after the cutoff is applied. Usually set between `0` and `0.2`. `0` means only matches with same score (# of *k-mers*) as the best match will be kept.
+- `--reassign`: runs an EM-algorithm to reassign reads that received multiple matches. It provides a unique match for each read at the level the database was built (e.g. assembly or species). Mostly useful for read binning, with little overall impact on taxonomic profiling. Can be used independently with `ganon reassign`.
+
+### ganon report
+
+- `--report-type`: reports either taxonomic, sequence or matches abundances. Use `corr` or `abundance` for taxonomic profiling, `reads` or `dist` for sequence profiling and `matches` to report a summary of all matches.
+- `--min-count`: cutoff to discard underrepresented taxa. Useful to remove the common long tail of spurious matches and false positives when performing classification. Values between `0.0001` (0.01%) and `0.001` (0.1%) improved sensitivity and precision in our evaluations. The higher the value, the more precise the outcome, with a sensitivity loss. Alternatively `--top-percentile` can be used to keep a relative amount of taxa instead of a hard cutoff.
+
+The numeric values above are averages from several experiments with different sample types and database contents. They may not work as expected for your data. If you are not sure which values to use or see something unexpected, please open an [issue](https://github.com/pirovc/ganon/issues).
\ No newline at end of file
diff --git a/docs/table.md b/docs/table.md
new file mode 100644
index 00000000..d0ef8eda
--- /dev/null
+++ b/docs/table.md
@@ -0,0 +1,39 @@
+# Table
+
+`ganon table` filters and summarizes several reports obtained with `ganon report` into a table. Filters for each sample or for averages among all samples can also be applied.
+
+## Examples
+
+Given several `.tre` from `ganon report`:
+
+### Counts of species
+
+```bash
+ganon table --input *.tre --output-file table.tsv --rank species
+```
+
+### Abundance of species
+
+```bash
+ganon table --input *.tre --output-file table.tsv --output-value percentage --rank species
+```
+
+### Top 10 species (among all samples)
+
+```bash
+ganon table --input *.tre --output-file table.tsv --output-value percentage --rank species --top-all 10
+```
+
+### Top 10 species (from each sample)
+
+```bash
+ganon table --input *.tre --output-file table.tsv --output-value percentage --rank species --top-sample 10
+```
+
+### Filtering results
+
+```bash
+ganon table --input *.tre --output-file table.tsv --output-value percentage --rank species --min-count 0.0005
+```
+
+This will keep only results with a min. abundance of `0.05%`.
diff --git a/libs/genome_updater b/libs/genome_updater
index 56f610b3..5478e931 160000
--- a/libs/genome_updater
+++ b/libs/genome_updater
@@ -1 +1 @@
-Subproject commit 56f610b3b4d7a280e418809370212c605c8ff17f
+Subproject commit 5478e931af0e3028fc32b46e20be379b9443fd68
diff --git a/mkdocs.yml b/mkdocs.yml
index 043fd727..7b2cc012 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -2,10 +2,12 @@ site_name: ganon
theme: readthedocs
nav:
- ganon: index.md
+ - Quick Start: start.md
- Databases (ganon build): default_databases.md
- Custom databases (ganon build-custom): custom_databases.md
- Classification (ganon classify): classification.md
- Reports (ganon report): reports.md
+ - Table (ganon table): table.md
- Output files: outputfiles.md
- Tutorials: tutorials.md
markdown_extensions:
diff --git a/setup.py b/setup.py
index e2b63570..27138513 100755
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@ def read(filename):
setup(
name="ganon",
- version="1.6.0",
+ version="1.7.0",
url="https://www.github.com/pirovc/ganon",
license='MIT',
author="Vitor C. Piro",
diff --git a/src/ganon-classify/CommandLineParser.cpp b/src/ganon-classify/CommandLineParser.cpp
index 2b0bb3c9..344c23ca 100644
--- a/src/ganon-classify/CommandLineParser.cpp
+++ b/src/ganon-classify/CommandLineParser.cpp
@@ -23,6 +23,7 @@ std::optional< Config > CommandLineParser::parse( int argc, char** argv )
( "c,rel-cutoff", "Relative cutoff (i.e. percentage of minimizers). 0 for no cutoff. One or one per filter (comma-separated). Default: 0.2", cxxopts::value< std::vector< double > >() )
( "d,rel-filter", "Relative filter. Additional percentage of matches allowed (relative to the best match). 1 for no filtering. one or one per hierarchy label (comma-separated). Default: 0.0", cxxopts::value< std::vector< double > >() )
+ ( "f,fpr-query", "Min. False positive for a query. 1 for no filtering. one or one per hierarchy label (comma-separated). Default: 1.0", cxxopts::value< std::vector< double > >() )
( "o,output-prefix", "Output prefix (prefix.rep, [prefix.lca, prefix.all, prefix.unc]). If multi-level --hierarchy-labels is provided, files are generated accordingly (prefix.hierarchy.lca and prefix.hierarchy.all). Omit to output to STDOUT (only .rep will be printed)", cxxopts::value< std::string >() )
( "l,output-lca", "Runs and outputs file with lca classification (prefix.lca)", cxxopts::value< bool >() )
@@ -80,6 +81,8 @@ std::optional< Config > CommandLineParser::parse( int argc, char** argv )
config.rel_cutoff = args["rel-cutoff"].as< std::vector< double > >();
if ( args.count( "rel-filter" ) )
config.rel_filter = args["rel-filter"].as< std::vector< double > >();
+ if ( args.count( "fpr-query" ) )
+ config.fpr_query = args["fpr-query"].as< std::vector< double > >();
if ( args.count( "output-prefix" ) )
config.output_prefix = args["output-prefix"].as< std::string >();
diff --git a/src/ganon-classify/GanonClassify.cpp b/src/ganon-classify/GanonClassify.cpp
index 3132f295..a9bbeed3 100644
--- a/src/ganon-classify/GanonClassify.cpp
+++ b/src/ganon-classify/GanonClassify.cpp
@@ -48,9 +48,10 @@ typedef uint16_t TIntCount;
typedef raptor::hierarchical_interleaved_bloom_filter< seqan3::data_layout::uncompressed > THIBF;
typedef seqan3::interleaved_bloom_filter< seqan3::data_layout::uncompressed > TIBF;
-typedef robin_hood::unordered_map< std::string, size_t > TMatches;
+typedef robin_hood::unordered_map< std::string, std::tuple< size_t, double > > TMatches;
typedef std::vector< std::tuple< size_t, std::string > > TBinMap;
typedef robin_hood::unordered_map< std::string, std::vector< size_t > > TMap;
+typedef robin_hood::unordered_map< std::string, double > TTargetFpr;
struct Node
{
@@ -98,6 +99,16 @@ struct ReadBatches
struct ReadMatch
{
+ ReadMatch()
+ {
+ }
+
+ ReadMatch( std::string _target, size_t _kmer_count )
+ {
+ target = _target;
+ kmer_count = _kmer_count;
+ }
+
std::string target;
size_t kmer_count;
};
@@ -188,6 +199,7 @@ struct FilterConfig
std::string tax_file = "";
double rel_cutoff;
IBFConfig ibf_config;
+ TTargetFpr target_fpr;
};
struct HierarchyConfig
@@ -196,6 +208,7 @@ struct HierarchyConfig
uint8_t kmer_size;
uint32_t window_size;
double rel_filter;
+ double fpr_query;
std::string output_file_lca;
std::string output_file_all;
};
@@ -247,8 +260,13 @@ std::map< std::string, HierarchyConfig > parse_hierarchy( Config& config )
output_file_all = config.output_prefix + ".all";
}
- parsed_hierarchy[config.hierarchy_labels[h]] =
- HierarchyConfig{ fc, 0, 0, config.rel_filter[hierarchy_count], output_file_lca, output_file_all };
+ parsed_hierarchy[config.hierarchy_labels[h]] = HierarchyConfig{ fc,
+ 0,
+ 0,
+ config.rel_filter[hierarchy_count],
+ config.fpr_query[hierarchy_count],
+ output_file_lca,
+ output_file_all };
++hierarchy_count;
}
else
@@ -268,6 +286,7 @@ void print_hierarchy( Config const& config, auto const& parsed_hierarchy )
{
std::cerr << hierarchy_config.first << newl;
std::cerr << "--rel-filter " << hierarchy_config.second.rel_filter << newl;
+ std::cerr << "--fpr-query " << hierarchy_config.second.fpr_query << newl;
for ( auto const& filter_config : hierarchy_config.second.filters )
{
std::cerr << " " << filter_config.ibf_file;
@@ -311,31 +330,42 @@ inline size_t threshold_rel( size_t n_hashes, double p )
return std::ceil( n_hashes * p );
}
+// https://stackoverflow.com/questions/44718971/calculate-binomial-coffeficient-very-reliably
+inline double binom( double n, double k ) noexcept
+{
+ return std::exp( std::lgamma( n + 1 ) - std::lgamma( n - k + 1 ) - std::lgamma( k + 1 ) );
+}
+
+
void select_matches( Filter< TIBF >& filter,
TMatches& matches,
std::vector< size_t >& hashes,
auto& agent,
size_t threshold_cutoff,
- size_t& max_kmer_count_read )
+ size_t& max_kmer_count_read,
+ size_t n_hashes )
{
- // Count every occurance on IBF
+ // Count every occurrence on IBF
seqan3::counting_vector< detail::TIntCount > counts = agent.bulk_count( hashes );
for ( auto const& [target, bins] : filter.map )
{
- // Sum counts among bins (split target (user bins) into several tecnical bins)
+ // Sum counts among bins (split target (user bins) into several technical bins)
size_t summed_count = 0;
for ( auto const& binno : bins )
{
summed_count += counts[binno];
}
+ // summed_count can be higher than n_hashes for matches in several technical bins
+ if ( summed_count > n_hashes )
+ summed_count = n_hashes;
if ( summed_count >= threshold_cutoff )
{
// ensure that count was not already found for target with higher count
// can happen in case of ambiguos targets in multiple filters
- if ( summed_count > matches[target] )
+ if ( summed_count > std::get< 0 >( matches[target] ) )
{
- matches[target] = summed_count;
+ matches[target] = std::make_tuple( summed_count, filter.filter_config.target_fpr[target] );
if ( summed_count > max_kmer_count_read )
max_kmer_count_read = summed_count;
}
@@ -348,39 +378,59 @@ void select_matches( Filter< THIBF >& filter,
std::vector< size_t >& hashes,
auto& agent,
size_t threshold_cutoff,
- size_t& max_kmer_count_read )
+ size_t& max_kmer_count_read,
+ size_t n_hashes )
{
// Count only matches above threhsold
seqan3::counting_vector< detail::TIntCount > counts = agent.bulk_count( hashes, threshold_cutoff );
-
// Only one bin per target
for ( auto const& [target, bins] : filter.map )
{
if ( counts[bins[0]] > 0 )
{
- const size_t count = counts[bins[0]];
+ // Sum counts among bins (split target (user bins) into several technical bins)
+ size_t summed_count = counts[bins[0]];
+ // summed_count can be higher than n_hashes for matches in several technical bins
+ if ( summed_count > n_hashes )
+ summed_count = n_hashes;
// ensure that count was not already found for target with higher count
- // can happen in case of ambiguos targets in multiple filters
- if ( count > matches[target] )
+ // can happen in case of ambiguous targets in multiple filters
+ if ( summed_count > std::get< 0 >( matches[target] ) )
{
- matches[target] = count;
- if ( count > max_kmer_count_read )
- max_kmer_count_read = count;
+ matches[target] = std::make_tuple( summed_count, filter.filter_config.target_fpr[target] );
+ if ( summed_count > max_kmer_count_read )
+ max_kmer_count_read = summed_count;
}
}
}
}
-size_t filter_matches( ReadOut& read_out, TMatches& matches, TRep& rep, size_t threshold_filter )
+size_t filter_matches(
+ ReadOut& read_out, TMatches& matches, TRep& rep, size_t threshold_filter, size_t n_hashes, double min_fpr_query )
{
- for ( auto const& [target, kmer_count] : matches )
+ for ( auto const& [target, count_fpr] : matches )
{
- if ( kmer_count >= threshold_filter )
+ if ( std::get< 0 >( count_fpr ) >= threshold_filter )
{
+ // Filter by fpr-query
+ if ( min_fpr_query < 1.0 )
+ {
+ double q = 1;
+ for ( size_t i = 0; i <= std::get< 0 >( count_fpr ); i++ )
+ {
+ q -= binom( n_hashes, i ) * pow( std::get< 1 >( count_fpr ), i )
+ * pow( 1 - std::get< 1 >( count_fpr ), n_hashes - i );
+ }
+ if ( q > min_fpr_query )
+ {
+ continue;
+ }
+ }
+
rep[target].matches++;
- read_out.matches.push_back( ReadMatch{ target, kmer_count } );
+ read_out.matches.push_back( ReadMatch{ target, std::get< 0 >( count_fpr ) } );
}
}
@@ -459,7 +509,7 @@ void classify( std::vector< Filter< TFilter > >& filters,
// Best scoring kmer count
size_t max_kmer_count_read = 0;
-
+ size_t n_hashes = 0;
// if length is smaller than window, skip read
if ( read1_len >= hierarchy_config.window_size )
{
@@ -473,7 +523,7 @@ void classify( std::vector< Filter< TFilter > >& filters,
hashes.insert( hashes.end(), h2.begin(), h2.end() );
}
- const size_t n_hashes = hashes.size();
+ n_hashes = hashes.size();
// if n_hashes are bigger than int limit, skip read
if ( n_hashes <= hashes_limit )
{
@@ -495,7 +545,8 @@ void classify( std::vector< Filter< TFilter > >& filters,
threshold_cutoff = 1;
// count and select matches
- select_matches( filters[i], matches, hashes, agents[i], threshold_cutoff, max_kmer_count_read );
+ select_matches(
+ filters[i], matches, hashes, agents[i], threshold_cutoff, max_kmer_count_read, n_hashes );
}
}
}
@@ -506,52 +557,58 @@ void classify( std::vector< Filter< TFilter > >& filters,
// if read got valid matches (above cutoff)
if ( max_kmer_count_read > 0 )
{
- total.reads_classified++;
// Calculate threshold for filtering (keep matches above)
const size_t threshold_filter =
max_kmer_count_read - threshold_rel( max_kmer_count_read, hierarchy_config.rel_filter );
// Filter matches
- const size_t count_filtered_matches = filter_matches( read_out, matches, rep, threshold_filter );
+ const size_t count_filtered_matches =
+ filter_matches( read_out, matches, rep, threshold_filter, n_hashes, hierarchy_config.fpr_query );
- if ( !config.skip_lca )
+ if ( count_filtered_matches > 0 )
{
- ReadOut read_out_lca( rb.ids[readID] );
- if ( count_filtered_matches == 1 )
- {
- // just one match, copy read read_out and set as unique
- read_out_lca = read_out;
- rep[read_out.matches[0].target].unique_reads++;
- }
- else
- {
- lca_matches( read_out, read_out_lca, max_kmer_count_read, lca, rep );
- }
- if ( config.output_lca )
- classified_lca_queue.push( read_out_lca );
- }
- else
- {
- // Not running lca and has unique match
- if ( count_filtered_matches == 1 )
+ total.reads_classified++;
+
+ if ( !config.skip_lca )
{
- rep[read_out.matches[0].target].unique_reads++;
+ ReadOut read_out_lca( rb.ids[readID] );
+ if ( count_filtered_matches == 1 )
+ {
+ // just one match, copy read read_out and set as unique
+ read_out_lca = read_out;
+ rep[read_out.matches[0].target].unique_reads++;
+ }
+ else
+ {
+ lca_matches( read_out, read_out_lca, max_kmer_count_read, lca, rep );
+ }
+
+ if ( config.output_lca )
+ classified_lca_queue.push( read_out_lca );
}
else
{
- // without tax, no lca, count multi-matches to a root node
- // to keep consistency among reports (no. of classified reads)
- rep[config.tax_root_node].unique_reads++;
+ // Not running lca and has unique match
+ if ( count_filtered_matches == 1 )
+ {
+ rep[read_out.matches[0].target].unique_reads++;
+ }
+ else
+ {
+ // without tax, no lca, count multi-matches to a root node
+ // to keep consistency among reports (no. of classified reads)
+ rep[config.tax_root_node].lca_reads++;
+ }
}
- }
- if ( config.output_all )
- classified_all_queue.push( read_out );
+ if ( config.output_all )
+ classified_all_queue.push( read_out );
- // read classified, continue to the next
- continue;
+ // read classified, continue to the next
+ continue;
+ }
}
// not classified
@@ -597,7 +654,11 @@ void write_report( TRep& rep, TTax& tax, std::ofstream& out_rep, std::string hie
}
}
-size_t load_filter( THIBF& filter, IBFConfig& ibf_config, TBinMap& bin_map, std::string const& input_filter_file )
+size_t load_filter( THIBF& filter,
+ IBFConfig& ibf_config,
+ TBinMap& bin_map,
+ std::string const& input_filter_file,
+ TTargetFpr& target_fpr )
{
std::ifstream is( input_filter_file, std::ios::binary );
cereal::BinaryInputArchive archive( is );
@@ -624,6 +685,7 @@ size_t load_filter( THIBF& filter, IBFConfig& ibf_config, TBinMap& bin_map, std:
// load ibf_config from raptor params
ibf_config.window_size = window_size;
ibf_config.kmer_size = shape.count();
+ ibf_config.max_fp = fpr;
// Create map from paths
size_t binno{};
@@ -645,6 +707,8 @@ size_t load_filter( THIBF& filter, IBFConfig& ibf_config, TBinMap& bin_map, std:
}
bin_map.push_back( std::make_tuple( binno, f ) );
+ // same fpr for all
+ target_fpr[f] = fpr;
}
++binno;
}
@@ -652,7 +716,20 @@ size_t load_filter( THIBF& filter, IBFConfig& ibf_config, TBinMap& bin_map, std:
return filter.user_bins.num_user_bins();
}
-size_t load_filter( TIBF& filter, IBFConfig& ibf_config, TBinMap& bin_map, std::string const& input_filter_file )
+inline double false_positive( uint64_t bin_size_bits, uint8_t hash_functions, uint64_t n_hashes )
+{
+ /*
+ * calculates the theoretical false positive of a bin (bf) based on parameters
+ */
+ return std::pow( 1 - std::exp( -hash_functions / ( bin_size_bits / static_cast< double >( n_hashes ) ) ),
+ hash_functions );
+}
+
+size_t load_filter( TIBF& filter,
+ IBFConfig& ibf_config,
+ TBinMap& bin_map,
+ std::string const& input_filter_file,
+ TTargetFpr& target_fpr )
{
std::ifstream is( input_filter_file, std::ios::binary );
cereal::BinaryInputArchive archive( is );
@@ -665,6 +742,25 @@ size_t load_filter( TIBF& filter, IBFConfig& ibf_config, TBinMap& bin_map, std::
archive( hashes_count_std );
archive( bin_map );
archive( filter );
+
+
+ // generate fpr for each bin
+ for ( auto const& [target, count] : hashes_count_std )
+ {
+ // Use average number of hashes for each bin to calculate fp
+ uint64_t n_bins_target = std::ceil( count / static_cast< double >( ibf_config.max_hashes_bin ) );
+ // this can be off by a very small number (rounding ceil on multiple bins)
+ uint64_t n_hashes_bin = std::ceil( count / static_cast< double >( n_bins_target ) );
+
+ // false positive for the current target, considering split bins
+ target_fpr[target] =
+ 1.0
+ - std::pow( 1.0 - false_positive( ibf_config.bin_size_bits, ibf_config.hash_functions, n_hashes_bin ),
+ n_bins_target );
+ ;
+ }
+
+
return filter.bin_count();
}
@@ -693,11 +789,12 @@ bool load_files( std::vector< Filter< TFilter > >& filters, std::vector< FilterC
size_t filter_cnt = 0;
for ( auto& filter_config : fconf )
{
- TTax tax;
- IBFConfig ibf_config;
- TBinMap bin_map;
- TFilter filter;
- auto bin_count = load_filter( filter, ibf_config, bin_map, filter_config.ibf_file );
+ TTax tax;
+ IBFConfig ibf_config;
+ TBinMap bin_map;
+ TFilter filter;
+ TTargetFpr target_fpr;
+ auto bin_count = load_filter( filter, ibf_config, bin_map, filter_config.ibf_file, target_fpr );
// Parse vector with bin_map to the old map
TMap map;
@@ -707,6 +804,7 @@ bool load_files( std::vector< Filter< TFilter > >& filters, std::vector< FilterC
}
filter_config.ibf_config = ibf_config;
+ filter_config.target_fpr = target_fpr;
if ( filter_config.tax_file != "" )
tax = load_tax( filter_config.tax_file );
diff --git a/src/ganon-classify/include/ganon-classify/Config.hpp b/src/ganon-classify/include/ganon-classify/Config.hpp
index a4ab2087..c2896006 100644
--- a/src/ganon-classify/include/ganon-classify/Config.hpp
+++ b/src/ganon-classify/include/ganon-classify/Config.hpp
@@ -29,6 +29,7 @@ struct Config
std::vector< double > rel_cutoff{ 0.2 };
std::vector< double > rel_filter{ 0.0 };
+ std::vector< double > fpr_query{ 1.0 };
std::string output_prefix = "";
bool output_lca = false;
@@ -118,6 +119,21 @@ struct Config
return false;
}
+ valid_val = true;
+ for ( uint16_t i = 0; i < fpr_query.size(); ++i )
+ {
+ if ( fpr_query[i] < 0 || fpr_query[i] > 1 )
+ {
+ valid_val = false;
+ break;
+ }
+ }
+ if ( !valid_val )
+ {
+ std::cerr << "--fpr-query values should be set between 0 and 1 (1 to disable)" << std::endl;
+ return false;
+ }
+
if ( n_batches < 1 )
n_batches = 1;
@@ -163,6 +179,19 @@ struct Config
return false;
}
+ if ( fpr_query.size() == 1 && unique_hierarchy > 1 )
+ {
+ for ( uint16_t b = 1; b < unique_hierarchy; ++b )
+ {
+ fpr_query.push_back( fpr_query[0] );
+ }
+ }
+ else if ( fpr_query.size() != unique_hierarchy )
+ {
+ std::cerr << "Please provide a single or one-per-hierarchy --fpr-query value[s]" << std::endl;
+ return false;
+ }
+
if ( tax.size() > 0 && ibf.size() != tax.size() )
{
std::cerr << "The number of files provided with --ibf and --tax should match" << std::endl;
@@ -228,6 +257,7 @@ inline std::ostream& operator<<( std::ostream& stream, const Config& config )
stream << "--threads " << config.threads << newl;
stream << "--n-batches " << config.n_batches << newl;
stream << "--n-reads " << config.n_reads << newl;
+ stream << "--skip-lca " << config.skip_lca << newl;
stream << "--verbose " << config.verbose << newl;
stream << "--quiet " << config.quiet << newl;
stream << separator << newl;
diff --git a/src/ganon-classify/include/ganon-classify/LICENSE_hierarchical_interleaved_bloom_filter.md b/src/ganon-classify/include/ganon-classify/LICENSE_hierarchical_interleaved_bloom_filter.md
new file mode 100644
index 00000000..4a3fcde7
--- /dev/null
+++ b/src/ganon-classify/include/ganon-classify/LICENSE_hierarchical_interleaved_bloom_filter.md
@@ -0,0 +1,29 @@
+BSD 3-Clause License
+
+Copyright (c) 2023, Enrico Seiler
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this
+ list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+ contributors may be used to endorse or promote products derived from
+ this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/src/ganon-classify/include/ganon-classify/hierarchical_interleaved_bloom_filter.hpp b/src/ganon-classify/include/ganon-classify/hierarchical_interleaved_bloom_filter.hpp
index 681ee993..d75b4380 100644
--- a/src/ganon-classify/include/ganon-classify/hierarchical_interleaved_bloom_filter.hpp
+++ b/src/ganon-classify/include/ganon-classify/hierarchical_interleaved_bloom_filter.hpp
@@ -1,13 +1,18 @@
-// -----------------------------------------------------------------------------------------------------
-// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
-// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
+// --------------------------------------------------------------------------------------------------
+// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin
+// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
-// shipped with this file and also available at: https://github.com/seqan/raptor/blob/master/LICENSE.md
-// -----------------------------------------------------------------------------------------------------
+// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md
+// --------------------------------------------------------------------------------------------------
+
+/*!\file
+ * \brief Provides raptor::hierarchical_interleaved_bloom_filter.
+ * \author Enrico Seiler
+ */
#pragma once
-#include
+#include
#include
@@ -34,37 +39,42 @@ namespace raptor
*
* # Terminology
*
+ * ## User Bin
+ * The user may impose a structure on his sequence data in the form of logical groups (e.g. species). When querying the
+ * (H)IBF, the user is interested in an answer that differentiates between these groups.
+ *
* ## Technical Bin
* A Technical Bin represents an actual bin in the binning directory. In the IBF, it stores its kmers in a single Bloom
- * Filter (which is interleaved with all the other BFs).
- * Filter (which is interleaved with all the other BFs). In the HIBF each of these bins could be merged or split,
+ * thus it differs from the original user bins.
*
- * ## User Bin
- * The user may impose a structure on his sequence data in the form of logical groups (e.g. species). When querying the
- * IBF, the user is interested in an answer that differentiates between these groups.
+ * ## Layout of the HIBF
+ * The relationship between user bins and technical bins in the HIBF, i.e. which are split or merged, how substructures
+ * of merged bins (lower-level IBFs) look like is called layout.
*
* # Hierarchical Interleaved Bloom Filter (HIBF)
*
- * In constrast to the [seqan3::interleaved_bloom_filter][1], the user bins may be split across multiple technical bins
- * , or multiple user bins may be merged into one technical bin. When merging multiple user bins, the HIBF stores
- * another IBF that is built over the user bins constituting the merged bin. This lower-level IBF can then be used
- * to further distinguish between merged bins.
- * In contrast to the [seqan3::interleaved_bloom_filter][1], the user bins may be split across multiple technical bins,
+ * or multiple user bins may be merged into one technical bin. When merging multiple user bins, the HIBF stores another
+ * IBF that is built over the user bins constituting the merged bin. This lower-level IBF can then be used to further
+ * distinguish between merged bins.
*
- * In this example, user bin 1 was split into two technical bins. Bins 3, 4, and 5 were merged into a single technical
- * bin, and another IBF was added for the merged bin.
- * \image html hibf.svg
+ * In this example layout, user bin 1 was split into two technical bins. Bins 3, 4, and 5 were merged into a single
+ * technical bin, and another IBF was added for the merged bin.
+ * \image html hibf.svg width=40%
*
* The individual IBFs may have a different number of technical bins and differ in their sizes, allowing an efficient
* distribution of the user bins.
*
* ## Querying
* To query the Hierarchical Interleaved Bloom Filter for values, call
- * hibf::hierarchical_interleaved_bloom_filter::membership_agent() and use the returned
- * hibf::hierarchical_interleaved_bloom_filter::membership_agent.
+ * raptor::hierarchical_interleaved_bloom_filter::membership_agent() and use the returned
+ * raptor::hierarchical_interleaved_bloom_filter::membership_agent.
* In contrast to the [seqan3::interleaved_bloom_filter][1], the result will consist of indices of user bins.
*
* To count the occurrences in each user bin of a range of values in the Hierarchical Interleaved Bloom Filter, call
- * hibf::hierarchical_interleaved_bloom_filter::counting_agent() and use
- * the returned hibf::hierarchical_interleaved_bloom_filter::counting_agent_type.
+ * raptor::hierarchical_interleaved_bloom_filter::counting_agent() and use
+ * the returned raptor::hierarchical_interleaved_bloom_filter::counting_agent_type.
*
* ## Thread safety
*
@@ -147,7 +157,8 @@ class hierarchical_interleaved_bloom_filter
* \tparam archive_t Type of `archive`; must satisfy seqan3::cereal_archive.
* \param[in] archive The archive being serialised from/to.
*
- * \attention These functions are never called directly, see \ref serialisation for more details.
+ * \attention These functions are never called directly.
+ * \sa https://docs.seqan.de/seqan/3.2.0/group__io.html#serialisation
*/
template < seqan3::cereal_archive archive_t >
void CEREAL_SERIALIZE_FUNCTION_NAME( archive_t& archive )
@@ -195,13 +206,29 @@ class hierarchical_interleaved_bloom_filter< data_layout_mode >::user_bins
user_bin_filenames.resize( size );
}
- //!\brief Returns a vector containing user bin indices for each bin in the `idx`th IBF.
+ /*!\brief Returns a vector containing user bin indices for each bin in the `idx`th IBF.
+ * \param idx The id of the x-th IBF.
+ *
+ * \details
+ *
+ * ### Example
+ *
+ * \include test/snippet/hibf/bin_indices_of_ibf.cpp
+ */
std::vector< int64_t >& bin_indices_of_ibf( size_t const idx )
{
return ibf_bin_to_filename_position[idx];
}
- //!\brief Returns the filename of the `idx`th user bin.
+ /*!\brief Returns the filename of the `idx`th user bin.
+ * \param idx The id of the x-th user bin.
+ *
+ * \details
+ *
+ * ### Example
+ *
+ * \include test/snippet/hibf/filename_of_user_bin.cpp
+ */
std::string& filename_of_user_bin( size_t const idx )
{
return user_bin_filenames[idx];
@@ -260,7 +287,8 @@ class hierarchical_interleaved_bloom_filter< data_layout_mode >::user_bins
* \tparam archive_t Type of `archive`; must satisfy seqan3::cereal_archive.
* \param[in] archive The archive being serialised from/to.
*
- * \attention These functions are never called directly, see \ref serialisation for more details.
+ * \attention These functions are never called directly.
+ * \sa https://docs.seqan.de/seqan/3.2.0/group__io.html#serialisation
*/
template < typename archive_t >
void serialize( archive_t& archive )
@@ -271,8 +299,8 @@ class hierarchical_interleaved_bloom_filter< data_layout_mode >::user_bins
//!\endcond
};
-/*!\brief Manages membership queries for the hibf::hierarchical_interleaved_bloom_filter.
- * \see hibf::hierarchical_interleaved_bloom_filter::user_bins::filename_of_user_bin
+/*!\brief Manages membership queries for the raptor::hierarchical_interleaved_bloom_filter.
+ * \see raptor::hierarchical_interleaved_bloom_filter::user_bins::filename_of_user_bin
* \details
* In contrast to the [seqan3::interleaved_bloom_filter][1], the result will consist of indices of user bins.
*/
@@ -357,7 +385,7 @@ class hierarchical_interleaved_bloom_filter< data_layout_mode >::membership_agen
* ### Thread safety
*
* Concurrent invocations of this function are not thread safe, please create a
- * hibf::hierarchical_interleaved_bloom_filter::membership_agent for each thread.
+ * raptor::hierarchical_interleaved_bloom_filter::membership_agent for each thread.
*/
template < std::ranges::forward_range value_range_t >
[[nodiscard]] std::vector< int64_t > const& bulk_contains( value_range_t&& values,
@@ -387,7 +415,7 @@ class hierarchical_interleaved_bloom_filter< data_layout_mode >::membership_agen
};
#if RAPTOR_HIBF_HAS_COUNT
-/*!\brief Manages counting ranges of values for the hibf::hierarchical_interleaved_bloom_filter.
+/*!\brief Manages counting ranges of values for the raptor::hierarchical_interleaved_bloom_filter.
*/
template < seqan3::data_layout data_layout_mode >
template < std::integral value_t >
@@ -473,7 +501,7 @@ class hierarchical_interleaved_bloom_filter< data_layout_mode >::counting_agent_
* ### Thread safety
*
* Concurrent invocations of this function are not thread safe, please create a
- * hibf::hierarchical_interleaved_bloom_filter::counting_agent_type for each thread.
+ * raptor::hierarchical_interleaved_bloom_filter::counting_agent_type for each thread.
*/
template < std::ranges::forward_range value_range_t >
[[nodiscard]] seqan3::counting_vector< value_t > const& bulk_count( value_range_t&& values,
diff --git a/src/ganon/build_update.py b/src/ganon/build_update.py
index b292a7a1..f940e6ef 100644
--- a/src/ganon/build_update.py
+++ b/src/ganon/build_update.py
@@ -55,6 +55,7 @@ def build(cfg):
"-T '" + ",".join(cfg.taxid) + "'" if cfg.taxid else "",
"-A " + str(cfg.top) if cfg.top else "",
"-l 'complete genome'" if cfg.complete_genomes else "",
+ "-c 'representative genome'" if cfg.representative_genomes else "",
"-f 'genomic.fna.gz'",
"-t " + str(cfg.threads),
"-o " + files_output_folder,
@@ -325,9 +326,7 @@ def build_custom(cfg, which_call: str="build_custom"):
"--num-hash-functions " + str(cfg.hash_functions),
"--false-positive-rate " + str(cfg.max_fp),
"--output-filename '" + files_output_folder + "raptor_layout.binning.out'",
- "--threads " + str(cfg.threads),
- "--estimate-union",
- "--rearrange-user-bins"])
+ "--threads " + str(cfg.threads)])
run(run_raptor_layout_cmd, quiet=cfg.quiet)
print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)
@@ -341,7 +340,7 @@ def build_custom(cfg, which_call: str="build_custom"):
"--output '" + cfg.db_prefix + ".hibf" + "'",
"--threads " + str(cfg.threads),
"--verbose" if cfg.verbose else "",
- "'" + files_output_folder + "raptor_layout.binning.out'"])
+ "--input '" + files_output_folder + "raptor_layout.binning.out'"])
run(run_raptor_build_cmd, quiet=cfg.quiet)
print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)
@@ -656,4 +655,6 @@ def load_config(config_file):
"""
load configuration
"""
- return pickle.load(open(config_file, "rb"))
+ with open(config_file, "rb") as file:
+ cfg = pickle.load(file)
+ return cfg
diff --git a/src/ganon/classify.py b/src/ganon/classify.py
index f6556694..ee0bcb35 100644
--- a/src/ganon/classify.py
+++ b/src/ganon/classify.py
@@ -35,6 +35,7 @@ def classify(cfg):
"--hierarchy-labels " + ",".join(cfg.hierarchy_labels) if cfg.hierarchy_labels else "",
"--rel-cutoff " + ",".join([str(rc) for rc in cfg.rel_cutoff]) if cfg.rel_cutoff else "",
"--rel-filter " + ",".join([str(rf) for rf in cfg.rel_filter]) if cfg.rel_filter else "",
+ "--fpr-query " + ",".join([str(fq) for fq in cfg.fpr_query]) if cfg.fpr_query else "",
"--output-prefix " + cfg.output_prefix if cfg.output_prefix else "",
"--skip-lca" if cfg.reassign and not cfg.output_lca else "",
"--output-lca" if cfg.output_lca else "",
@@ -67,10 +68,11 @@ def classify(cfg):
report_params = {"db_prefix": cfg.db_prefix,
"input": cfg.output_prefix + ".rep",
"output_prefix": cfg.output_prefix,
- "min_count": 0.0001,
+ "min_count": 0 if cfg.binning else 0.005,
"ranks": cfg.ranks,
"output_format": "tsv",
"verbose": cfg.verbose,
+ "report_type": "reads" if cfg.binning else "abundance",
"quiet": cfg.quiet}
report_cfg = Config("report", **report_params)
print_log("- - - - - - - - - -", cfg.quiet)
diff --git a/src/ganon/config.py b/src/ganon/config.py
index c0c974ab..04f62032 100644
--- a/src/ganon/config.py
+++ b/src/ganon/config.py
@@ -8,7 +8,7 @@
class Config:
- version = "1.6.0"
+ version = "1.7.0"
path_exec = {"build": "", "classify": "", "get_seq_info": "", "genome_updater": ""}
empty = False
@@ -52,7 +52,7 @@ def __init__(self, which: str=None, **kwargs):
build_default_advanced_args.add_argument("-s", "--hash-functions", type=unsigned_int(minval=0, maxval=5), metavar="", default=4, help="The number of hash functions for the interleaved bloom filter [0-5]. 0 to detect optimal value.", choices=range(6))
build_default_advanced_args.add_argument("-j", "--mode", type=str, metavar="", default="avg", help="Create smaller or faster filters at the cost of classification speed or database size, respectively [" + ", ".join(self.choices_mode) + "]. If --filter-size is used, smaller/smallest refers to the false positive rate. By default, an average value is calculated to balance classification speed and database size.", choices=self.choices_mode)
build_default_advanced_args.add_argument("-y", "--min-length", type=unsigned_int(minval=0), metavar="", default=0, help="Skip sequences smaller then value defined. 0 to not skip any sequence.")
- build_default_advanced_args.add_argument("--hibf", action="store_true", help="Builds an HIBF with raptor/chopper (v3). --mode, --filter-size and --min-length will be ignored.")
+ build_default_advanced_args.add_argument("--hibf", action="store_true", help="Builds an HIBF with raptor/chopper (v3). --mode, --filter-size and --min-length will be ignored.")
####################################################################################################
@@ -65,10 +65,12 @@ def __init__(self, which: str=None, **kwargs):
build_download_args = build_parser.add_argument_group("download arguments")
build_download_args.add_argument("-b", "--source", type=str, nargs="*", default=["refseq"], metavar="", help="Source to download [" + ", ".join(self.choices_db_source) + "]", choices=self.choices_db_source)
build_download_args.add_argument("-o", "--top", type=unsigned_int(minval=0), default=0, metavar="", help="Download limited assemblies for each taxa. 0 for all.")
- build_download_args.add_argument("-c", "--complete-genomes", action="store_true", help="Download only sub-set of complete genomes")
+ build_download_args.add_argument("-c", "--complete-genomes", action="store_true", help="Download only sub-set of complete genomes")
+ build_download_args.add_argument("-r", "--representative-genomes", action="store_true", help="Download only sub-set of representative genomes")
build_download_args.add_argument("-u", "--genome-updater", type=str, metavar="", help="Additional genome_updater parameters (https://github.com/pirovc/genome_updater)")
build_download_args.add_argument("-m", "--taxonomy-files", type=file_exists, nargs="*", metavar="", help="Specific files for taxonomy - otherwise files will be downloaded")
build_download_args.add_argument("-z", "--genome-size-files", type=file_exists, nargs="*", metavar="", help="Specific files for genome size estimation - otherwise files will be downloaded")
+ build_download_args.add_argument("--skip-genome-size", action="store_true", help="Do not attempt to get genome sizes. Activate this option when using sequences not representing full genomes.")
####################################################################################################
@@ -85,6 +87,7 @@ def __init__(self, which: str=None, **kwargs):
build_custom_args.add_argument("-l", "--level", type=str, metavar="", help="Use a specialized target to build the database. By default, --level is the --input-target. Options: any available taxonomic rank [species, genus, ...] or 'leaves' (requires --taxonomy). Further specialization options [" + ", ".join(self.choices_level) + "]. assembly will retrieve and use the assembly accession and name. custom requires and uses the specialization field in the --input-file.")
build_custom_args.add_argument("-m", "--taxonomy-files", type=file_exists, nargs="*", metavar="", help="Specific files for taxonomy - otherwise files will be downloaded")
build_custom_args.add_argument("-z", "--genome-size-files", type=file_exists, nargs="*", metavar="", help="Specific files for genome size estimation - otherwise files will be downloaded")
+ build_custom_args.add_argument("--skip-genome-size", action="store_true", help="Do not attempt to get genome sizes. Activate this option when using sequences not representing full genomes.")
ncbi_args = build_custom_parser.add_argument_group("ncbi arguments")
ncbi_args.add_argument("-r", "--ncbi-sequence-info", type=str, nargs="*", default=[], metavar="", help="Uses NCBI e-utils webservices or downloads accession2taxid files to extract target information. [" + ", ".join(self.choices_ncbi_sequence_info) + " or one or more accession2taxid files from https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/accession2taxid/]. By default uses e-utils up-to 50000 sequences or downloads nucl_gb nucl_wgs otherwise.")
@@ -132,7 +135,8 @@ def __init__(self, which: str=None, **kwargs):
classify_group_cutoff_filter = classify_parser.add_argument_group("cutoff/filter arguments")
classify_group_cutoff_filter.add_argument("-c", "--rel-cutoff", type=int_or_float(minval=0, maxval=1), nargs="*", metavar="", default=[0.75], help="Min. percentage of a read (set of minimizers) shared with the a reference necessary to consider a match. Generally used to cutoff low similarity matches. Single value or one per database (e.g. 0.7 1 0.25). 0 for no cutoff")
classify_group_cutoff_filter.add_argument("-e", "--rel-filter", type=int_or_float(minval=0, maxval=1), nargs="*", metavar="", default=[0.0], help="Additional relative percentage of minimizers (relative to the best match) to keep a match. Generally used to select best matches above cutoff. Single value or one per hierarchy (e.g. 0.1 0). 1 for no filter")
-
+ classify_group_cutoff_filter.add_argument("-f", "--fpr-query", type=int_or_float(minval=0, maxval=1), nargs="*", metavar="", default=[1e-5], help="Max. false positive of a query to accept a match. Applied after --rel-cutoff and --rel-filter. Generally used to remove false positives matches querying a database build with large --max-fp. Single value or one per hierarchy (e.g. 0.1 0). 1 for no filter")
+
classify_group_output = classify_parser.add_argument_group("output arguments")
classify_group_output.add_argument("-o", "--output-prefix", type=str, metavar="", help="Output prefix for output (.rep) and report (.tre). Empty to output to STDOUT (only .rep)")
classify_group_output.add_argument("--output-lca", action="store_true", help="Output an additional file with one lca match for each read (.lca)")
@@ -142,10 +146,10 @@ def __init__(self, which: str=None, **kwargs):
classify_group_other = classify_parser.add_argument_group("other arguments")
classify_group_other.add_argument("-t", "--threads", type=unsigned_int(minval=1), metavar="", default=1, help="Number of sub-processes/threads to use")
+ classify_group_other.add_argument("-b", "--binning", action="store_true", help="Optimized parameters for binning (--rel-cutoff 0.25 --reassign). Will report (.tre) sequence abundances. This file can be re-generated with 'ganon report'.")
+ classify_group_other.add_argument("-a", "--reassign", action="store_true", help="Reassign reads with multiple matches with an EM algorithm. Will enforce --output-all. This file can be re-generated with 'ganon reassign'.")
classify_group_other.add_argument("-l", "--hierarchy-labels", type=str, nargs="*", metavar="", help="Hierarchy definition of --db-prefix files to be classified. Can also be a string, but input will be sorted to define order (e.g. 1 1 2 3). The default value reported without hierarchy is 'H1'")
classify_group_other.add_argument("-r", "--ranks", type=str, nargs="*", metavar="", default=[], help="Ranks to report taxonomic abundances (.tre). empty will report default ranks [" + ", ".join(self.choices_default_ranks) + "]. This file can be re-generated with the 'ganon report' command for other types of abundances (reads, matches) with further filtration and output options")
- classify_group_other.add_argument("-a", "--reassign", action="store_true", help="Reassign reads with multiple matches with an EM algorithm. Will enforce --output-all. This file can be re-generated with the 'ganon reassign'.")
-
classify_group_other.add_argument("--verbose", action="store_true", help="Verbose output mode")
classify_group_other.add_argument("--quiet", action="store_true", help="Quiet output mode")
classify_group_other.add_argument("--hibf", action="store_true", help=argparse.SUPPRESS)
@@ -184,6 +188,7 @@ def __init__(self, which: str=None, **kwargs):
report_group_dbtax.add_argument("-x", "--taxonomy", type=str, metavar="", default="ncbi", help="Taxonomy database to use [" + ", ".join(self.choices_taxonomy) + "]. Mutually exclusive with --db-prefix.", choices=self.choices_taxonomy)
report_group_dbtax.add_argument("-m", "--taxonomy-files", type=file_exists, nargs="*", metavar="", help="Specific files for taxonomy - otherwise files will be downloaded")
report_group_dbtax.add_argument("-z", "--genome-size-files", type=file_exists, nargs="*", metavar="", help="Specific files for genome size estimation - otherwise files will be downloaded")
+ report_group_dbtax.add_argument("--skip-genome-size", action="store_true", help="Do not attempt to get genome sizes. Valid only without --db-prefix. Activate this option when using sequences not representing full genomes.")
report_group_output = report_parser.add_argument_group("output arguments")
report_group_output.add_argument("-f", "--output-format", type=str, metavar="", default="tsv", help="Output format [" + ", ".join(self.choices_report_output) + "]. text outputs a tabulated formatted text file for better visualization. bioboxes is the the CAMI challenge profiling format (only percentage/abundances are reported).", choices=self.choices_report_output)
@@ -444,6 +449,10 @@ def validate(self):
print_log("--output-all / --output-lca / --output-unclassified / --reassign requires --output-prefix to be set")
return False
+ if self.binning:
+ self.rel_cutoff = [0.25]
+ self.reassign = True
+
elif self.which == "report":
if self.skip_hierarchy and self.keep_hierarchy:
diff --git a/src/ganon/reassign.py b/src/ganon/reassign.py
index ba52a1ec..f65aa974 100644
--- a/src/ganon/reassign.py
+++ b/src/ganon/reassign.py
@@ -21,7 +21,7 @@ def reassign(cfg):
if check_file(rep_file):
print_log(".rep file found: " + rep_file, cfg.quiet)
- # look for hiearchies
+ # look for hierarchies
with open(rep_file) as rep:
for line in rep:
if line[0] != "#":
@@ -179,6 +179,7 @@ def get_top_match(matches, prob):
# set first match as target (also the case for no unique matches)
target = matches[0][0]
kcount = matches[0][1]
+
max_p = 0
for m, k in matches:
if prob[m] > max_p:
diff --git a/src/ganon/report.py b/src/ganon/report.py
index 51cac422..c93d4686 100644
--- a/src/ganon/report.py
+++ b/src/ganon/report.py
@@ -344,7 +344,7 @@ def build_report(reports, counts, full_tax, genome_sizes, output_file, fixed_ran
tre_file.close()
if orphan_nodes and not cfg.no_orphan:
- print_log(" - " + str(len(orphan_nodes)) + " not found in the taxonomy (orphan nodes). " +
+ print_log(" - WARNING: " + str(len(orphan_nodes)) + " not found in the taxonomy (orphan nodes). " +
"\n Orphan nodes are reported with 'na' rank with root as a direct parent node. " +
"\n Too show them, use 'na' in --ranks or set --ranks all"
"\n Too ommit them, use --no-orphan", cfg.quiet)
@@ -450,6 +450,7 @@ def correct_genome_size(target_counts, genome_sizes, tax, default_ranks):
"""
ranked_counts = {}
lost_targets = {}
+ no_genome_size_cnt = 0
total_rank_ratio = {r: 0 for r in default_ranks}
total_rank_count = {r: 0 for r in default_ranks}
for target, count in target_counts.items():
@@ -466,10 +467,17 @@ def correct_genome_size(target_counts, genome_sizes, tax, default_ranks):
# Sum total counts for each default rank
gs = genome_sizes[closest_parent] if closest_parent in genome_sizes else genome_sizes[tax.root_node]
+ # Keep track of genome sizes = 1 (no genome size available)
+ if gs==1:
+ no_genome_size_cnt+=1
closest_rank = tax.rank(closest_parent)
total_rank_ratio[closest_rank] += count/gs
total_rank_count[closest_rank] += count
+ # Warning, some genomes have no proper size
+ if no_genome_size_cnt > 0 and len(target_counts) != no_genome_size_cnt:
+ print_log(" - WARNING: " + str(no_genome_size_cnt) + " genomes without proper genome size, abundance estimation may be biased. Use a --report-type without genome size correction or omit --db-prefix on ganon report to re-generate genome sizes.")
+
# Correct counts by the genome sizes (only default ranks)
corr_counts = {t: 0 for t in ranked_counts.keys()}
for node in ranked_counts.keys():
diff --git a/src/ganon/tax_util.py b/src/ganon/tax_util.py
index 622f338d..81707e2b 100644
--- a/src/ganon/tax_util.py
+++ b/src/ganon/tax_util.py
@@ -118,54 +118,61 @@ def get_genome_size(cfg, nodes, tax, build_output_folder):
Only used nodes and lineage are calculated, based on the full set of values provided
If information of a certain node is not provided, uses the closest estimate of parent nodes
"""
+ genome_sizes = {}
+ if cfg.skip_genome_size:
+ # Skipping genome sizes, all set to 1
+ for node in nodes:
+ for t in tax.lineage(node):
+ genome_sizes[t] = 1
+ else:
+ # Download and parse auxiliary files containing genome sizes
+ leaves_sizes = parse_genome_size_files(cfg, build_output_folder)
- # Download and parse auxiliary files containing genome sizes
- leaves_sizes = parse_genome_size_files(cfg, build_output_folder)
-
- tx = time.time()
- print_log("Estimating genome sizes", cfg.quiet)
-
- # Check if entries are on tax and distribute values to available tax. leaves
- for t in list(leaves_sizes.keys()):
- if not tax.latest(t):
- del leaves_sizes[t]
- else:
- # Store genome size estimation for all leaf nodes available in the taxonomy
- for leaf in tax.leaves(t):
- leaves_sizes[leaf] = leaves_sizes[t]
+ tx = time.time()
+ print_log("Estimating genome sizes", cfg.quiet)
- # Calculate genome size estimates for used nodes (and their lineage)
- # using the complete content of leaves_sizes (keeping approx. the same estimates between different dbs)
- genome_sizes = {}
- for node in nodes:
- # For the lineage of each target node
- for t in tax.lineage(node):
- # Skip if already calculated
- if t not in genome_sizes:
- cnt = 0
- avg = 0
- # Make average of available genome sizes in children leaves
+ # Check if entries are on tax and distribute values to available tax. leaves
+ for t in list(leaves_sizes.keys()):
+ if not tax.latest(t):
+ del leaves_sizes[t]
+ else:
+ # Store genome size estimation for all leaf nodes available in the taxonomy
for leaf in tax.leaves(t):
- if leaf in leaves_sizes:
- cnt += 1
- avg += leaves_sizes[leaf]
- genome_sizes[t] = int(avg / cnt) if cnt else 0
-
- # If there is no matching between taxonomy and leaves, average the whole and save to root to be redistributed in the next step
- if sum(genome_sizes.values())==0:
- if leaves_sizes:
- genome_sizes[tax.root_node] = int(sum(leaves_sizes.values())/len(leaves_sizes))
- else:
- genome_sizes[tax.root_node] = 1
- # Check nodes without genome size info (0) and use closest value from parent lineage
- for node in nodes:
- if genome_sizes[node] == 0:
- # Fill lineage of zeros with latest genome size estimation
+ leaves_sizes[leaf] = leaves_sizes[t]
+
+ # Calculate genome size estimates for used nodes (and their lineage)
+ # using the complete content of leaves_sizes (keeping approx. the same estimates between different dbs)
+
+ for node in nodes:
+ # For the lineage of each target node
for t in tax.lineage(node):
- if genome_sizes[t] == 0:
- genome_sizes[t] = genome_sizes[tax.parent(t)]
+ # Skip if already calculated
+ if t not in genome_sizes:
+ cnt = 0
+ avg = 0
+ # Make average of available genome sizes in children leaves
+ for leaf in tax.leaves(t):
+ if leaf in leaves_sizes:
+ cnt += 1
+ avg += leaves_sizes[leaf]
+ genome_sizes[t] = int(avg / cnt) if cnt else 0
+
+ # If there is no matching between taxonomy and leaves, average the whole and save to root to be redistributed in the next step
+ if sum(genome_sizes.values())==0:
+ if leaves_sizes:
+ genome_sizes[tax.root_node] = int(sum(leaves_sizes.values())/len(leaves_sizes))
+ else:
+ genome_sizes[tax.root_node] = 1
+ # Check nodes without genome size info (0) and use closest value from parent lineage
+ for node in nodes:
+ if genome_sizes[node] == 0:
+ # Fill lineage of zeros with latest genome size estimation
+ for t in tax.lineage(node):
+ if genome_sizes[t] == 0:
+ genome_sizes[t] = genome_sizes[tax.parent(t)]
+
+ print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)
- print_log(" - done in " + str("%.2f" % (time.time() - tx)) + "s.\n", cfg.quiet)
return genome_sizes
diff --git a/tests/ganon-classify/GanonClassify.test.cpp b/tests/ganon-classify/GanonClassify.test.cpp
index 3ea94373..7624e4a6 100644
--- a/tests/ganon-classify/GanonClassify.test.cpp
+++ b/tests/ganon-classify/GanonClassify.test.cpp
@@ -782,6 +782,42 @@ SCENARIO( "classifying reads with errors", "[ganon-classify][with-errors]" )
}
}
+ SECTION( "--rel-cutoff 0.2 --rel-filter 0.5 --fpr-query 1e-10" )
+ {
+ std::string prefix{ folder_prefix + "rel_cutoff_0.2_rel_filter_0.5_fpr_query_1e-10" };
+ auto cfg = config_classify::defaultConfig( prefix );
+ cfg.ibf = { base_prefix + ".ibf" };
+ cfg.single_reads = { folder_prefix + "readF.fasta" };
+ cfg.rel_cutoff = { 0.2 };
+ cfg.rel_filter = { 0.5 };
+ cfg.fpr_query = { 1e-10 };
+ REQUIRE( GanonClassify::run( cfg ) );
+ config_classify::Res res{ cfg };
+ config_classify::sanity_check( cfg, res );
+
+ REQUIRE( res.all["readF"].size() == 2 );
+ REQUIRE( res.all["readF"]["e0"] == 9 );
+ REQUIRE( res.all["readF"]["e1F_e2R"] == 5 );
+
+ SECTION( "--paired-reads" )
+ {
+ prefix = prefix + "_paired";
+ cfg.output_prefix = prefix;
+ cfg.single_reads = {};
+ cfg.paired_reads = { folder_prefix + "readF.fasta", folder_prefix + "readR.fasta" };
+
+ REQUIRE( GanonClassify::run( cfg ) );
+ config_classify::Res res{ cfg };
+ config_classify::sanity_check( cfg, res );
+
+ REQUIRE( res.all["readF"].size() == 3 );
+ REQUIRE( res.all["readF"]["e0"] == 18 );
+ REQUIRE( res.all["readF"]["e1F"] == 14 );
+ REQUIRE( res.all["readF"]["e1F_e1R"] == 10 );
+ }
+ }
+
+
SECTION( "--rel-cutoff 0.6 --rel-filter 1 (OFF)" )
{
std::string prefix{ folder_prefix + "rel_cutoff_0.6_rel_filter_1" };
@@ -889,6 +925,42 @@ SCENARIO( "classifying reads with errors", "[ganon-classify][with-errors]" )
}
}
+ SECTION( "--rel-cutoff 0 (OFF) --rel-filter 1 (OFF) --fpr-query 1e-10" )
+ {
+ std::string prefix{ folder_prefix + "rel_cutoff_1_rel_filter_1_fpr_query_1e-10" };
+ auto cfg = config_classify::defaultConfig( prefix );
+ cfg.ibf = { base_prefix + ".ibf" };
+ cfg.single_reads = { folder_prefix + "readF.fasta" };
+ cfg.rel_cutoff = { 0 };
+ cfg.rel_filter = { 1 };
+ cfg.fpr_query = { 1e-10 };
+
+ REQUIRE( GanonClassify::run( cfg ) );
+ config_classify::Res res{ cfg };
+ config_classify::sanity_check( cfg, res );
+
+ REQUIRE( res.all["readF"].size() == 2 );
+ REQUIRE( res.all["readF"]["e0"] == 9 );
+ REQUIRE( res.all["readF"]["e1F_e2R"] == 5 );
+
+ SECTION( "--paired-reads" )
+ {
+ prefix = prefix + "_paired";
+ cfg.output_prefix = prefix;
+ cfg.single_reads = {};
+ cfg.paired_reads = { folder_prefix + "readF.fasta", folder_prefix + "readR.fasta" };
+
+ REQUIRE( GanonClassify::run( cfg ) );
+ config_classify::Res res{ cfg };
+ config_classify::sanity_check( cfg, res );
+
+ REQUIRE( res.all["readF"].size() == 3 );
+ REQUIRE( res.all["readF"]["e0"] == 18 );
+ REQUIRE( res.all["readF"]["e1F"] == 14 );
+ REQUIRE( res.all["readF"]["e1F_e1R"] == 10 );
+ }
+ }
+
SECTION( "--window-size" )
{
// build with --window-size
diff --git a/tests/ganon/data/build/genomes/all/GCA/000/147/015/GCA_000147015.1_ASM14701v1/GCA_000147015.1_ASM14701v1_cds_from_genomic.fna.gz b/tests/ganon/data/build/genomes/all/GCA/000/147/015/GCA_000147015.1_ASM14701v1/GCA_000147015.1_ASM14701v1_cds_from_genomic.fna.gz
deleted file mode 100644
index ab4ba6bd..00000000
Binary files a/tests/ganon/data/build/genomes/all/GCA/000/147/015/GCA_000147015.1_ASM14701v1/GCA_000147015.1_ASM14701v1_cds_from_genomic.fna.gz and /dev/null differ
diff --git a/tests/ganon/data/build/genomes/all/GCA/000/147/015/GCA_000147015.1_ASM14701v1/GCA_000147015.1_ASM14701v1_rna_from_genomic.fna.gz b/tests/ganon/data/build/genomes/all/GCA/000/147/015/GCA_000147015.1_ASM14701v1/GCA_000147015.1_ASM14701v1_rna_from_genomic.fna.gz
deleted file mode 100644
index 675cd9f3..00000000
Binary files a/tests/ganon/data/build/genomes/all/GCA/000/147/015/GCA_000147015.1_ASM14701v1/GCA_000147015.1_ASM14701v1_rna_from_genomic.fna.gz and /dev/null differ
diff --git a/tests/ganon/data/build/genomes/all/GCA/002/254/805/GCA_002254805.1_ASM225480v1/GCA_002254805.1_ASM225480v1_cds_from_genomic.fna.gz b/tests/ganon/data/build/genomes/all/GCA/002/254/805/GCA_002254805.1_ASM225480v1/GCA_002254805.1_ASM225480v1_cds_from_genomic.fna.gz
deleted file mode 100644
index a40f18e3..00000000
Binary files a/tests/ganon/data/build/genomes/all/GCA/002/254/805/GCA_002254805.1_ASM225480v1/GCA_002254805.1_ASM225480v1_cds_from_genomic.fna.gz and /dev/null differ
diff --git a/tests/ganon/data/build/genomes/all/GCA/002/254/805/GCA_002254805.1_ASM225480v1/GCA_002254805.1_ASM225480v1_rna_from_genomic.fna.gz b/tests/ganon/data/build/genomes/all/GCA/002/254/805/GCA_002254805.1_ASM225480v1/GCA_002254805.1_ASM225480v1_rna_from_genomic.fna.gz
deleted file mode 100644
index 85c814ef..00000000
Binary files a/tests/ganon/data/build/genomes/all/GCA/002/254/805/GCA_002254805.1_ASM225480v1/GCA_002254805.1_ASM225480v1_rna_from_genomic.fna.gz and /dev/null differ
diff --git a/tests/ganon/data/build/genomes/all/GCA/004/132/065/GCA_004132065.1_ASM413206v1/GCA_004132065.1_ASM413206v1_cds_from_genomic.fna.gz b/tests/ganon/data/build/genomes/all/GCA/004/132/065/GCA_004132065.1_ASM413206v1/GCA_004132065.1_ASM413206v1_cds_from_genomic.fna.gz
deleted file mode 100644
index 37eeea86..00000000
Binary files a/tests/ganon/data/build/genomes/all/GCA/004/132/065/GCA_004132065.1_ASM413206v1/GCA_004132065.1_ASM413206v1_cds_from_genomic.fna.gz and /dev/null differ
diff --git a/tests/ganon/data/classify/test_db.hibf b/tests/ganon/data/classify/test_db.hibf
new file mode 100644
index 00000000..5755549f
Binary files /dev/null and b/tests/ganon/data/classify/test_db.hibf differ
diff --git a/tests/ganon/data/classify/test_db.tax b/tests/ganon/data/classify/test_db.tax
new file mode 100644
index 00000000..a6712a27
--- /dev/null
+++ b/tests/ganon/data/classify/test_db.tax
@@ -0,0 +1,29 @@
+1 0 no rank root 4222504
+1971485 204619 species Candidatus Nardonella dryophthoridicola 7882008
+1972133 1971485 forma specialis endosymbiont of Rhynchophorus ferrugineus 7882008
+2012515 1801617 species Candidatus Pacearchaeota archaeon ex4484_31 1970502
+2565781 2856051 species Candidatus Nanohalobium constans 6896757
+871271 884215 strain Candidatus Zinderia insecticola CARI 985251
+118884 1236 no rank Gammaproteobacteria incertae sedis 7882008
+1224 2 phylum Proteobacteria 3941004
+1236 1224 class Gammaproteobacteria 5418880
+131567 1 no rank cellular organisms 4222504
+1462430 1783276 phylum Candidatus Nanohaloarchaeota 6896757
+1783276 2157 clade DPANN group 4433629
+1801617 1783276 clade Candidatus Pacearchaeota 1970502
+2 131567 superkingdom Bacteria 3941004
+204619 118884 genus Candidatus Nardonella 7882008
+2157 131567 superkingdom Archaea 4433629
+28216 1224 class Betaproteobacteria 985251
+2856051 2856054 genus Candidatus Nanohalobium 6896757
+2856052 1462430 class Candidatus Nanohalobia 6896757
+2856053 2856052 order Candidatus Nanohalobiales 6896757
+2856054 2856053 family Candidatus Nanohalobiaceae 6896757
+75682 80840 family Oxalobacteraceae 985251
+80840 28216 order Burkholderiales 985251
+884214 75682 genus Candidatus Zinderia 985251
+884215 884214 species Candidatus Zinderia insecticola 985251
+GCF_004296495.1 1972133 assembly endosymbiont of Rhynchophorus ferrugineus 7882008
+GCA_000147015.1 871271 assembly Candidatus Zinderia insecticola CARI 985251
+GCF_009617975.1 2565781 assembly Candidatus Nanohalobium constans LC1Nh 6896757
+GCA_002254805.1 2012515 assembly Candidatus Pacearchaeota archaeon ex4484_31 1970502
diff --git a/tests/ganon/integration/test_classify.py b/tests/ganon/integration/test_classify.py
index aa755a10..02ad0cf0 100644
--- a/tests/ganon/integration/test_classify.py
+++ b/tests/ganon/integration/test_classify.py
@@ -146,5 +146,21 @@ def test_reassign(self):
# There are only single matches on output
self.assertEqual(len(res["all_pd"].readid), len(res["all_pd"].readid.unique()), "ganon reassign has multiple matches")
+ def test_hibf(self):
+ """
+ Test ganon classify with HIBF
+ """
+ params = self.default_params.copy()
+ params["db_prefix"] = data_dir + "classify/test_db"
+ params["output_prefix"] = self.results_dir + "hibf"
+ params["verbose"] = True
+ # Build config from params
+ cfg = Config("classify", **params)
+ # Run
+ self.assertTrue(run_ganon(cfg, params["output_prefix"]), "ganon classify exited with an error")
+ # General sanity check of results
+ res = classify_sanity_check_and_parse(vars(cfg))
+ self.assertIsNotNone(res, "ganon table has inconsistent results")
+
if __name__ == '__main__':
unittest.main()
diff --git a/tests/ganon/integration_online/test_report.py b/tests/ganon/integration_online/test_report.py
index ea0796b1..e25ce229 100644
--- a/tests/ganon/integration_online/test_report.py
+++ b/tests/ganon/integration_online/test_report.py
@@ -87,6 +87,22 @@ def test_ncbi(self):
res = report_sanity_check_and_parse(vars(cfg))
self.assertIsNotNone(res, "ganon report has inconsistent results")
+
+ # No genome size
+ params = self.default_params.copy()
+ params["input"] = self.results_dir + "base_classify_ncbi.rep"
+ params["output_prefix"] = self.results_dir + "test_ncbi_skip_genome_size"
+ params["taxonomy"] = "ncbi"
+ params["taxonomy_files"] = data_dir + "build-custom/taxdump.tar.gz"
+ params["skip_genome_size"] = True
+ # Build config from params
+ cfg = Config("report", **params)
+ self.assertTrue(
+ run_ganon(cfg, params["output_prefix"]), "ganon report exited with an error")
+ # General sanity check of results
+ res = report_sanity_check_and_parse(vars(cfg))
+ self.assertIsNotNone(res, "ganon report has inconsistent results")
+
def test_gtdb(self):
"""
Test run with --taxonomy gtdb, downloading .tax
@@ -145,5 +161,21 @@ def test_gtdb(self):
self.assertIsNotNone(res, "ganon report has inconsistent results")
+ # No genome size
+ params = self.default_params.copy()
+ params["input"] = self.results_dir + "base_classify_gtdb.rep"
+ params["output_prefix"] = self.results_dir + "test_gtdb_skip_genome_size"
+ params["taxonomy"] = "gtdb"
+ params["taxonomy_files"] = [data_dir + "build-custom/ar53_taxonomy.tsv.gz",
+ data_dir + "build-custom/bac120_taxonomy.tsv.gz"]
+ params["skip_genome_size"] = True
+ # Build config from params
+ cfg = Config("report", **params)
+ self.assertTrue(
+ run_ganon(cfg, params["output_prefix"]), "ganon report exited with an error")
+ # General sanity check of results
+ res = report_sanity_check_and_parse(vars(cfg))
+ self.assertIsNotNone(res, "ganon report has inconsistent results")
+
if __name__ == '__main__':
unittest.main()