generated from seqan/app-template
-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
34389f2
commit f45f6ba
Showing
1 changed file
with
152 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
// -------------------------------------------------------------------------------------------------- | ||
// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin | ||
// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik | ||
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License | ||
// shipped with this file and also available at: https://github.com/seqan/raptor/blob/main/LICENSE.md | ||
// -------------------------------------------------------------------------------------------------- | ||
|
||
/*!\file | ||
* \brief Provides raptor::search_singular_ibf. | ||
* \author Enrico Seiler <enrico.seiler AT fu-berlin.de> | ||
*/ | ||
|
||
#pragma once | ||
|
||
#include <seqan3/search/views/minimiser_hash.hpp> | ||
|
||
#include <raptor/adjust_seed.hpp> | ||
#include <raptor/contrib/std/chunk_view.hpp> | ||
#include <raptor/dna4_traits.hpp> | ||
#include <raptor/search/do_parallel.hpp> | ||
#include <raptor/search/load_index.hpp> | ||
#include <raptor/search/sync_out.hpp> | ||
#include <raptor/threshold/threshold.hpp> | ||
|
||
namespace raptor | ||
{ | ||
|
||
template <typename index_t> | ||
void search_singular_ibf(search_arguments const & arguments, index_t && index) | ||
{ | ||
constexpr bool is_ibf = std::same_as<index_t, raptor_index<index_structure::ibf>>; | ||
|
||
auto cereal_future = std::async(std::launch::async, | ||
[&]() | ||
{ | ||
load_index(index, arguments); | ||
}); | ||
|
||
seqan3::sequence_file_input<dna4_traits, seqan3::fields<seqan3::field::id, seqan3::field::seq>> fin{ | ||
arguments.query_file}; | ||
using record_type = typename decltype(fin)::record_type; | ||
std::vector<record_type> records{}; | ||
|
||
sync_out synced_out{arguments}; | ||
|
||
raptor::threshold::threshold const thresholder{arguments.make_threshold_parameters()}; | ||
|
||
auto worker = [&](size_t const start, size_t const extent) | ||
{ | ||
seqan::hibf::serial_timer local_compute_minimiser_timer{}; | ||
seqan::hibf::serial_timer local_query_ibf_timer{}; | ||
seqan::hibf::serial_timer local_generate_results_timer{}; | ||
|
||
#if defined(__clang__) | ||
auto counter = [&index]() | ||
#else | ||
auto counter = [&index, is_ibf]() | ||
#endif | ||
{ | ||
if constexpr (is_ibf) | ||
return index.ibf().template counting_agent<uint16_t>(); | ||
else | ||
return index.ibf().membership_agent(); | ||
}(); | ||
std::string result_string{}; | ||
std::vector<uint64_t> minimiser; | ||
|
||
auto hash_adaptor = seqan3::views::minimiser_hash(arguments.shape, | ||
seqan3::window_size{arguments.window_size}, | ||
seqan3::seed{adjust_seed(arguments.shape_weight)}); | ||
|
||
for (auto && [id, seq] : std::span{records.data() + start, extent}) | ||
{ | ||
result_string.clear(); | ||
result_string += id; | ||
result_string += '\t'; | ||
|
||
auto minimiser_view = seq | hash_adaptor | std::views::common; | ||
local_compute_minimiser_timer.start(); | ||
minimiser.assign(minimiser_view.begin(), minimiser_view.end()); | ||
local_compute_minimiser_timer.stop(); | ||
|
||
size_t const minimiser_count{minimiser.size()}; | ||
size_t const threshold = thresholder.get(minimiser_count); | ||
|
||
if constexpr (is_ibf) | ||
{ | ||
local_query_ibf_timer.start(); | ||
auto & result = counter.bulk_count(minimiser); | ||
local_query_ibf_timer.stop(); | ||
size_t current_bin{0}; | ||
local_generate_results_timer.start(); | ||
for (auto && count : result) | ||
{ | ||
if (count >= threshold) | ||
{ | ||
result_string += std::to_string(current_bin); | ||
result_string += ','; | ||
} | ||
++current_bin; | ||
} | ||
} | ||
else | ||
{ | ||
local_query_ibf_timer.start(); | ||
auto & result = counter.membership_for(minimiser, threshold); // Results contains user bin IDs | ||
local_query_ibf_timer.stop(); | ||
local_generate_results_timer.start(); | ||
for (auto && count : result) | ||
{ | ||
result_string += std::to_string(count); | ||
result_string += ','; | ||
} | ||
} | ||
|
||
if (auto & last_char = result_string.back(); last_char == ',') | ||
last_char = '\n'; | ||
else | ||
result_string += '\n'; | ||
|
||
synced_out.write(result_string); | ||
local_generate_results_timer.stop(); | ||
} | ||
|
||
arguments.compute_minimiser_timer += local_compute_minimiser_timer; | ||
arguments.query_ibf_timer += local_query_ibf_timer; | ||
arguments.generate_results_timer += local_generate_results_timer; | ||
}; | ||
|
||
auto write_header = [&]() | ||
{ | ||
if constexpr (is_ibf) | ||
return synced_out.write_header(arguments, index.ibf().hash_function_count()); | ||
else | ||
return synced_out.write_header(arguments, index.ibf().ibf_vector[0].hash_function_count()); | ||
}; | ||
|
||
for (auto && chunked_records : fin | seqan::stl::views::chunk((1ULL << 20) * 10)) | ||
{ | ||
records.clear(); | ||
arguments.query_file_io_timer.start(); | ||
std::ranges::move(chunked_records, std::back_inserter(records)); | ||
arguments.query_file_io_timer.stop(); | ||
|
||
cereal_future.get(); | ||
[[maybe_unused]] static bool header_written = write_header(); // called exactly once | ||
|
||
do_parallel(worker, records.size(), arguments.threads); | ||
} | ||
} | ||
|
||
} // namespace raptor |