diff --git a/CMakeLists.txt b/CMakeLists.txt index 91dc0428..bcba0c50 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,11 @@ cmake_minimum_required (VERSION 3.8) # Define the application name and version. project (raptor VERSION 2.0.0) +# Messages +string (ASCII 27 Esc) +set (FontBold "${Esc}[1m") +set (FontReset "${Esc}[m") + # Fallback to these values if there is no git or no git repository set (RAPTOR_COMMIT_DATE "2021-08-20--no-git" CACHE STRING @@ -52,24 +57,32 @@ if (NOT CMAKE_BUILD_TYPE) FORCE) endif () +set (RAPTOR_NATIVE_BUILD ON CACHE BOOL "Optimize build for current architecture.") +if (RAPTOR_NATIVE_BUILD) + message (STATUS "${FontBold}Native build enabled. Built binaries will be optimized for this system.${FontReset}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native") +else () + message (STATUS "${FontBold}Native build disabled. Detecting popcnt support.${FontReset}") + include (CheckCXXCompilerFlag) + check_cxx_compiler_flag ("-mpopcnt" RAPTOR_HAS_POPCNT) + if (RAPTOR_HAS_POPCNT) + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mpopcnt") + endif () +endif () + # Specify the directories where to store the built archives, libraries and executables set (CMAKE_ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") set (CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib") set (CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") -# Messages -string (ASCII 27 Esc) -set (FontBold "${Esc}[1m") -set (FontReset "${Esc}[m") - # Dependency: SeqAn3. set (SEQAN3_CEREAL ON CACHE BOOL "Require cereal to be present.") set (SEQAN3_SUBMODULES_DIR "lib") find_package (SeqAn3 QUIET REQUIRED HINTS lib/seqan3/build_system) # Use ccache. 
-set (USE_CCACHE ON CACHE BOOL "Use ccache if available.") -if (USE_CCACHE) +set (RAPTOR_USE_CCACHE ON CACHE BOOL "Use ccache if available.") +if (RAPTOR_USE_CCACHE) include ("${SEQAN3_CLONE_DIR}/test/cmake/seqan3_require_ccache.cmake") seqan3_require_ccache () endif () diff --git a/README.md b/README.md index 482a7483..7c644181 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,18 @@ -# Raptor [![build status](https://github.com/seqan/raptor/workflows/Raptor%20CI/badge.svg?branch=master)](https://github.com/seqan/raptor/actions) [![codecov](https://codecov.io/gh/seqan/raptor/branch/master/graph/badge.svg?token=SJVMYRUKW2)](https://codecov.io/gh/seqan/raptor) +# Raptor [![build status][1]][2] [![codecov][3]][4] [![install with bioconda][5]][6] + +[1]: https://github.com/seqan/raptor/workflows/Raptor%20CI/badge.svg?branch=master +[2]: https://github.com/seqan/raptor/actions +[3]: https://codecov.io/gh/seqan/raptor/branch/master/graph/badge.svg?token=SJVMYRUKW2 +[4]: https://codecov.io/gh/seqan/raptor +[5]: https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat +[6]: https://bioconda.github.io/recipes/raptor/README.html + ### A fast and space-efficient pre-filter for querying very large collections of nucleotide sequences ## Download and Installation -There may be performance benefits when compiling from source, especially when using `-march=native` as compiler -directive. +There may be performance benefits when compiling from source as the build can be optimized for the host system. ### Install with bioconda (Linux) -[![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg?style=flat)](http://bioconda.github.io/recipes/raptor/README.html) ```bash conda install -c bioconda -c conda-forge raptor @@ -62,16 +68,20 @@ make The binary can be found in `bin`. 
-You may want to add the raptor executable yo your PATH: +You may want to add the Raptor executable to your PATH: ``` export PATH=$(pwd)/bin:$PATH raptor --version ``` +By default, Raptor will be built with host-specific optimizations (`-march=native`). This behavior can be disabled by +passing `-DRAPTOR_NATIVE_BUILD=OFF` to CMake. + ## Example Data and Usage -A toy data set can be found [here](https://ftp.imp.fu-berlin.de/pub/seiler/raptor/). +A toy data set (124 MiB compressed, 983 MiB decompressed) can be found +[here](https://ftp.imp.fu-berlin.de/pub/seiler/raptor/). ```bash wget https://ftp.imp.fu-berlin.de/pub/seiler/raptor/example_data.tar.gz @@ -113,7 +123,7 @@ Afterwards, we can search for some reads: raptor search --error 2 --index raptor.index --query example_data/64/reads/mini.fastq --output search.output ``` -The output starts with a header section (lines starting with `\#`). The header maps a number to each input file. +The output starts with a header section (lines starting with `#`). The header maps a number to each input file. After the header section, each line of the output consists of the read ID (in the toy example these are numbers) and the corresponding bins in which they were found: ```text @@ -169,6 +179,18 @@ The preprocessing applies the same cutoffs as used in Mantis This means that only minimisers that occur more often than the cutoff specifies are included in the output. If you wish to process all minimisers, you can use `--disable-cutoffs`. +### Partitioned indices +To reduce the overall memory consumption, the index can be divided into multiple (a power of two) parts. +This can be done by passing `--parts n` to `raptor build`, where `n` is the number of parts you want to create. +This will create `n` files, each representing one part of the index. The `--size` parameter describes the overall size +of the index. For example, `--size 8g --parts 4` will create four 2 GiB indices. 
This will reduce the memory consumption +of `raptor build` and `raptor search` by approximately 6 GiB, since there will only be one part in memory at any given +time. `raptor search` will automatically detect the parts, and does not need any special parameters. + +### Upgrading the index (v1.1.0 to v2.0.0) +An old index can be upgraded by running `raptor upgrade` and providing some information about how the index was +constructed. + ### SOCKS interface We implement the core interface of [SOCKS](https://gitlab.ub.uni-bielefeld.de/gi/socks). For a list of options, see the help pages: diff --git a/src/argument_parsing/build.cpp b/src/argument_parsing/build.cpp index c22d3db7..6f816abd 100644 --- a/src/argument_parsing/build.cpp +++ b/src/argument_parsing/build.cpp @@ -44,7 +44,7 @@ void init_build_parser(seqan3::argument_parser & parser, build_arguments & argum '\0', "shape", "The shape to use for k-mers. Mutually exclusive with --kmer.", - seqan3::option_spec::hidden, // Add help in kmer_size + seqan3::option_spec::advanced, // Add help in kmer_size seqan3::regex_validator{"[01]+"}); parser.add_option(arguments.out_path, '\0', @@ -73,6 +73,11 @@ void init_build_parser(seqan3::argument_parser & parser, build_arguments & argum "compute-minimiser", "Computes minimisers using cutoffs from Mantis (Pandey et al.). Does not create the index.", arguments.is_socks ? seqan3::option_spec::hidden : seqan3::option_spec::standard); + parser.add_flag(arguments.compute_minimiser, + '\0', + "compute-minimizer", + "Hidden flag, alias of --compute-minimiser.", + seqan3::option_spec::hidden); parser.add_flag(arguments.disable_cutoffs, '\0', "disable-cutoffs", @@ -119,8 +124,13 @@ void run_build(seqan3::argument_parser & parser, bool const is_socks) arguments.window_size = arguments.shape.size(); } - std::filesystem::path output_directory = parser.is_option_set("compute-minimiser") ? 
arguments.out_path : - arguments.out_path.parent_path(); + bool const is_compute_minimiser_set{parser.is_option_set("compute-minimiser") || + parser.is_option_set("compute-minimizer")}; + + arguments.compute_minimiser = is_compute_minimiser_set; + + std::filesystem::path output_directory = is_compute_minimiser_set ? arguments.out_path : + arguments.out_path.parent_path(); std::error_code ec{}; std::filesystem::create_directories(output_directory, ec); @@ -132,7 +142,7 @@ void run_build(seqan3::argument_parser & parser, bool const is_socks) ec.message())}; // LCOV_EXCL_END - if (!parser.is_option_set("compute-minimiser")) + if (!is_compute_minimiser_set) { seqan3::output_file_validator{}(arguments.out_path); diff --git a/test/cli/raptor_options_test.cpp b/test/cli/raptor_options_test.cpp index 0e8b6238..8e87928a 100644 --- a/test/cli/raptor_options_test.cpp +++ b/test/cli/raptor_options_test.cpp @@ -131,6 +131,17 @@ TEST_F(raptor_build, directory_missing) EXPECT_EQ(result.err, std::string{"[Error] Option --output is required but not set.\n"}); } +TEST_F(raptor_build, alias) +{ + cli_test_result const result = execute_app("raptor", "build", + "--size 8m", + "--compute-minimizer", + tmp_bin_list_file.file_path); + EXPECT_NE(result.exit_code, 0); + EXPECT_EQ(result.out, std::string{}); + EXPECT_EQ(result.err, std::string{"[Error] Option --output is required but not set.\n"}); +} + TEST_F(raptor_build, size_missing) { cli_test_result const result = execute_app("raptor", "build", diff --git a/test/util/collect_compile_stats.sh b/test/util/collect_compile_stats.sh index c0c78ca3..c7ce896c 100755 --- a/test/util/collect_compile_stats.sh +++ b/test/util/collect_compile_stats.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" &> /dev/null && pwd)" @@ -8,14 +9,14 @@ reset_scripts() { } trap reset_scripts EXIT -set -ex - -cmake $SCRIPT_DIR/../.. 
-DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=$SCRIPT_DIR/g++.sh -DCMAKE_C_COMPILER=$SCRIPT_DIR/gcc.sh -DUSE_CCACHE=OFF +cmake $SCRIPT_DIR/../.. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_CXX_COMPILER=$SCRIPT_DIR/g++.sh -DCMAKE_C_COMPILER=$SCRIPT_DIR/gcc.sh -DRAPTOR_USE_CCACHE=OFF -DRAPTOR_NATIVE_BUILD=ON sed -i "s/DO_TIME=0/DO_TIME=1/" $SCRIPT_DIR/gcc.sh sed -i "s/DO_TIME=0/DO_TIME=1/" $SCRIPT_DIR/g++.sh -make -k -j4 cli_test api_test +make -k -j6 cli_test api_test find . -name "ram_usage.*" -exec cat {} + > complete.txt $SCRIPT_DIR/parse.py complete.txt stats.csv + +echo "Results can be found in $(pwd)/stats.csv"