Skip to content

Commit

Permalink
Merge pull request #237 from smehringer/multi_filenames
Browse files Browse the repository at this point in the history
[FEATURE] Allow multiple filenames per user bin.
  • Loading branch information
eseiler authored Dec 6, 2023
2 parents 17d067b + 8521f01 commit bdcecb6
Show file tree
Hide file tree
Showing 18 changed files with 137 additions and 62 deletions.
2 changes: 1 addition & 1 deletion include/chopper/input_functor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ struct input_functor
seqan3::fields<seqan3::field::seq>,
seqan3::type_list<seqan3::format_fasta, seqan3::format_fastq>>;

std::vector<std::string> filenames;
std::vector<std::vector<std::string>> filenames;

bool input_are_precomputed_files{false};

Expand Down
2 changes: 1 addition & 1 deletion include/chopper/layout/execute.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
namespace chopper::layout
{

int execute(chopper::configuration & config, std::vector<std::string> const & filenames);
int execute(chopper::configuration & config, std::vector<std::vector<std::string>> const & filenames);

} // namespace chopper::layout
2 changes: 1 addition & 1 deletion include/chopper/layout/output.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@
namespace chopper::layout
{

void write_user_bins_to(std::vector<std::string> const & filenames, std::ostream & stream);
void write_user_bins_to(std::vector<std::vector<std::string>> const & filenames, std::ostream & stream);

} // namespace chopper::layout
3 changes: 3 additions & 0 deletions include/chopper/sketch/check_filenames.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,7 @@ namespace chopper::sketch
//!\brief Checks the `filenames` for consistent files, either precomputed or sequence files.
void check_filenames(std::vector<std::string> const & filenames, configuration & config);

//!\overload
//!\details Takes one list of filenames per user bin. The definition forwards each of these lists, together with the
//!         same `config`, to the single-list overload declared above.
void check_filenames(std::vector<std::vector<std::string>> const & filenames, configuration & config);

} // namespace chopper::sketch
2 changes: 1 addition & 1 deletion include/chopper/sketch/read_data_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,6 @@
namespace chopper::sketch
{

void read_data_file(configuration const & config, std::vector<std::string> & filenames);
void read_data_file(configuration const & config, std::vector<std::vector<std::string>> & filenames);

} // namespace chopper::sketch
2 changes: 1 addition & 1 deletion src/chopper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ int main(int argc, char const * argv[])

int exit_code{};

std::vector<std::string> filenames{};
std::vector<std::vector<std::string>> filenames{};

chopper::sketch::read_data_file(config, filenames);

Expand Down
24 changes: 15 additions & 9 deletions src/input_functor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,30 @@ void input_functor::operator()(size_t const num, seqan::hibf::insert_iterator it
char * const hash_data{reinterpret_cast<char *>(&hash)};
std::streamsize const hash_bytes{sizeof(hash)};

std::ifstream infile{filenames[num], std::ios::binary};
for (std::string const & filename : filenames[num])
{
std::ifstream infile{filename, std::ios::binary};

while (infile.read(hash_data, hash_bytes))
it = hash;
while (infile.read(hash_data, hash_bytes))
it = hash;
}
}
else
{
sequence_file_type fin{filenames[num]};

seqan3::shape shape = seqan3::ungapped{kmer_size};
seqan3::shape const shape = seqan3::ungapped{kmer_size};
auto minimizer_view = seqan3::views::minimiser_hash(shape,
seqan3::window_size{window_size},
seqan3::seed{adjust_seed(shape.count())});

for (auto && [seq] : fin)
for (std::string const & filename : filenames[num])
{
for (auto hash_value : seq | minimizer_view)
it = hash_value;
sequence_file_type fin{filename};

for (auto && [seq] : fin)
{
for (auto hash_value : seq | minimizer_view)
it = hash_value;
}
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions src/layout/execute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
namespace chopper::layout
{

int execute(chopper::configuration & config, std::vector<std::string> const & filenames)
int execute(chopper::configuration & config, std::vector<std::vector<std::string>> const & filenames)
{
assert(config.hibf_config.number_of_user_bins > 0);

Expand Down Expand Up @@ -102,7 +102,7 @@ int execute(chopper::configuration & config, std::vector<std::string> const & fi

assert(filenames.size() == sketches.size());
for (size_t i = 0; i < filenames.size(); ++i)
sketch::write_sketch_file(filenames[i], sketches[i], config);
sketch::write_sketch_file(filenames[i][0], sketches[i], config);
}

// brief Write the output to the layout file.
Expand Down
14 changes: 11 additions & 3 deletions src/layout/output.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,20 @@
namespace chopper::layout
{

void write_user_bins_to(std::vector<std::string> const & filenames, std::ostream & stream)
void write_user_bins_to(std::vector<std::vector<std::string>> const & filenames, std::ostream & stream)
{
stream << chopper::prefix::meta_chopper_user_bins_start << '\n';
size_t counter{};
for (auto const & filename : filenames)
stream << seqan::hibf::prefix::meta_header << counter++ << ' ' << filename << '\n';
for (auto const & filenames_of_user_bin : filenames)
{
// the below will write lines like this:
// @0 file1.fa file2.fa
// @1 fileABC.fa
stream << seqan::hibf::prefix::meta_header << counter++;
for (std::string const & filename : filenames_of_user_bin)
stream << ' ' << filename;
stream << '\n';
}
stream << chopper::prefix::meta_chopper_user_bins_end << '\n';
}

Expand Down
6 changes: 6 additions & 0 deletions src/sketch/check_filenames.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,4 +70,10 @@ void check_filenames(std::vector<std::string> const & filenames, configuration &
}
}

//!\brief Applies the single-list check_filenames() to the file list of every user bin, in order.
//!\param[in] filenames One list of filenames per user bin.
//!\param[in,out] config Passed unchanged to every per-bin call.
// NOTE(review): each user bin is checked on its own here; whether consistency *across* user bins is enforced depends
// on what the single-list overload does with `config` between calls - confirm.
void check_filenames(std::vector<std::vector<std::string>> const & filenames, configuration & config)
{
    for (std::vector<std::string> const & user_bin_files : filenames)
    {
        check_filenames(user_bin_files, config);
    }
}

} // namespace chopper::sketch
26 changes: 16 additions & 10 deletions src/sketch/read_data_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,21 @@

#include <filesystem>
#include <fstream>
#include <ranges>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

#include <seqan3/utility/range/to.hpp>

#include <chopper/configuration.hpp>
#include <chopper/sketch/read_data_file.hpp>

namespace chopper::sketch
{

void read_data_file(configuration const & config, std::vector<std::string> & filenames)
void read_data_file(configuration const & config, std::vector<std::vector<std::string>> & filenames)
{
std::ifstream fin{config.data_file.string()};

Expand All @@ -27,18 +31,20 @@ void read_data_file(configuration const & config, std::vector<std::string> & fil
std::string line;
while (std::getline(fin, line))
{
auto tab_pos = line.find('\t');
std::vector<std::string> names;

if (tab_pos == std::string::npos)
{
std::string const filename{line.begin(), line.end()};
filenames.push_back(filename);
}
else
auto const tab_pos = line.find('\t');
std::string_view const filename_sv{line.begin(),
(tab_pos != std::string::npos) ? line.begin() + tab_pos : line.end()};

// multiple filenames may be separated by ' '
for (auto && name : std::views::split(filename_sv, ' '))
{
std::string const filename{line.begin(), line.begin() + tab_pos};
filenames.push_back(filename);
auto common_view = std::views::common(name);
names.emplace_back(common_view.begin(), common_view.end());
}

filenames.push_back(std::move(names));
}
}

Expand Down
12 changes: 7 additions & 5 deletions test/api/layout/execute_layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,13 @@ TEST(execute_test, few_ubs)
config.disable_sketch_output = true;
config.hibf_config.disable_estimate_union = true; // also disables rearrangement

std::vector<std::string> filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7"};
std::vector<std::vector<std::string>>
filenames{{"seq0a", "seq0b"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}};

chopper::layout::execute(config, filenames);

std::string const expected_file{"@CHOPPER_USER_BINS\n"
"@0 seq0\n"
"@0 seq0a seq0b\n"
"@1 seq1\n"
"@2 seq2\n"
"@3 seq3\n"
Expand Down Expand Up @@ -131,7 +132,8 @@ TEST(execute_test, set_default_tmax)
config.hibf_config.number_of_user_bins = 8;
config.hibf_config.disable_estimate_union = true; // also disables rearrangement

std::vector<std::string> filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7"};
std::vector<std::vector<std::string>>
filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}};

chopper::layout::execute(config, filenames);

Expand All @@ -143,10 +145,10 @@ TEST(execute_test, many_ubs)
seqan3::test::tmp_directory tmp_dir{};
std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"};

std::vector<std::string> many_filenames;
std::vector<std::vector<std::string>> many_filenames;

for (size_t i{0}; i < 96u; ++i)
many_filenames.push_back(seqan3::detail::to_string("seq", i));
many_filenames.push_back({seqan3::detail::to_string("seq", i)});

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
Expand Down
34 changes: 19 additions & 15 deletions test/api/layout/execute_with_estimation_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ TEST(execute_estimation_test, few_ubs)
config.output_filename = layout_file;
config.hibf_config.disable_estimate_union = true; // also disables rearrangement

std::vector<std::string> filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7"};
std::vector<std::vector<std::string>>
filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}};

chopper::layout::execute(config, filenames);

Expand Down Expand Up @@ -85,10 +86,10 @@ TEST(execute_estimation_test, many_ubs)
std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"};
std::filesystem::path const stats_file{layout_file.string() + ".stats"};

std::vector<std::string> many_filenames;
std::vector<std::vector<std::string>> many_filenames;

for (size_t i{0}; i < 96u; ++i)
many_filenames.push_back(seqan3::detail::to_string("seq", i));
many_filenames.push_back({seqan3::detail::to_string("seq", i)});

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
Expand Down Expand Up @@ -486,11 +487,11 @@ TEST(execute_estimation_test, many_ubs_force_all)
std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"};
std::filesystem::path const stats_file{layout_file.string() + ".stats"};

std::vector<std::string> many_filenames;
std::vector<std::vector<std::string>> many_filenames;
std::vector<size_t> many_kmer_counts;

for (size_t i{0}; i < 96u; ++i)
many_filenames.push_back(seqan3::detail::to_string("seq", i));
many_filenames.push_back({seqan3::detail::to_string("seq", i)});

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
Expand Down Expand Up @@ -559,16 +560,16 @@ TEST(execute_estimation_test, with_rearrangement)
std::filesystem::path const stats_file{layout_file.string() + ".stats"};
size_t const kmer_size{15};

std::vector<std::string> filenames{};
std::vector<std::vector<std::string>> filenames{};
std::vector<std::string> hll_filenames;
std::vector<size_t> expected_kmer_counts;

for (size_t i{0}; i < 49u; ++i)
{
filenames.push_back(data("seq1.fa").string());
filenames.push_back(data("seq2.fa").string());
filenames.push_back(data("seq3.fa").string());
filenames.push_back(data("small.fa").string());
filenames.push_back({data("seq1.fa").string()});
filenames.push_back({data("seq2.fa").string()});
filenames.push_back({data("seq3.fa").string()});
filenames.push_back({data("small.fa").string()});

hll_filenames.push_back("seq1.hll");
hll_filenames.push_back("seq2.hll");
Expand All @@ -584,12 +585,15 @@ TEST(execute_estimation_test, with_rearrangement)
// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto data_input = [&](size_t const num, seqan::hibf::insert_iterator it)
{
sequence_file_type3 fin{filenames[num]};

for (auto && [seq] : fin)
for (std::string const & filename : filenames[num])
{
for (auto hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size}))
it = hash_value;
sequence_file_type3 fin{filename};

for (auto && [seq] : fin)
{
for (auto hash_value : seq | seqan3::views::kmer_hash(seqan3::ungapped{kmer_size}))
it = hash_value;
}
}
};

Expand Down
7 changes: 4 additions & 3 deletions test/api/layout/hibf_statistics_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,10 +106,10 @@ TEST(execute_test, chopper_layout_statistics)
seqan3::test::tmp_directory tmp_dir{};
std::filesystem::path const layout_file{tmp_dir.path() / "layout.tsv"};

std::vector<std::string> many_filenames;
std::vector<std::vector<std::string>> many_filenames;

for (size_t i{0}; i < 96u; ++i)
many_filenames.push_back(seqan3::detail::to_string("seq", i));
many_filenames.push_back({seqan3::detail::to_string("seq", i)});

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
Expand Down Expand Up @@ -160,7 +160,8 @@ TEST(execute_test, chopper_layout_statistics_determine_best_bins)
std::filesystem::path const binning_filename{tmp_dir.path() / "output.binning"};
std::filesystem::path const stats_file{binning_filename.string() + ".stats"};

std::vector<std::string> filenames{"seq0", "seq1", "seq2", "seq3", "seq4", "seq5", "seq6", "seq7", "seq8", "seq9"};
std::vector<std::vector<std::string>>
filenames{{"seq0"}, {"seq1"}, {"seq2"}, {"seq3"}, {"seq4"}, {"seq5"}, {"seq6"}, {"seq7"}, {"seq8"}, {"seq9"}};

// There are 20 files with a count of {100,200,300,400} each. There are 16 files with count 500.
auto simulated_input = [&](size_t const num, seqan::hibf::insert_iterator it)
Expand Down
11 changes: 7 additions & 4 deletions test/api/layout/user_bin_io_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,16 @@

TEST(output, user_bins)
{
std::vector<std::string> const filenames{"file1.fa", "file2.fa", "path/to/file3.fa", "file4.fastq"};
std::vector<std::vector<std::string>> const filenames{{"file1.fa", "fileB.fa"},
{"file2.fa"},
{"path/to/file3.fa"},
{"file4.fastq"}};

std::stringstream ss{};
chopper::layout::write_user_bins_to(filenames, ss);

std::string const expected{"@CHOPPER_USER_BINS\n"
"@0 file1.fa\n"
"@0 file1.fa fileB.fa\n"
"@1 file2.fa\n"
"@2 path/to/file3.fa\n"
"@3 file4.fastq\n"
Expand All @@ -27,14 +30,14 @@ TEST(output, user_bins)
TEST(input, user_bins)
{
std::stringstream ss{"@CHOPPER_USER_BINS\n"
"@0 file1.fa\n"
"@0 file1.fa fileB.fa\n"
"@1 file2.fa\n"
"@2 path/to/file3.fa\n"
"@3 file4.fastq\n"
"@CHOPPER_USER_BINS_END\n"};

std::vector<std::vector<std::string>> filenames = chopper::layout::read_filenames_from(ss);
std::vector<std::vector<std::string>> const expected{{"file1.fa"},
std::vector<std::vector<std::string>> const expected{{"file1.fa", "fileB.fa"},
{"file2.fa"},
{"path/to/file3.fa"},
{"file4.fastq"}};
Expand Down
13 changes: 13 additions & 0 deletions test/api/sketch/check_filenames_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,19 @@ TEST(check_filenames_test, sequence_filenames)
EXPECT_FALSE(config.precomputed_files);
}

// Exercises the nested-vector overload with three user bins of one sequence file each:
// the call must not throw and `config.precomputed_files` must be false afterwards.
TEST(check_filenames_test, overload)
{
    chopper::configuration config;

    std::vector<std::vector<std::string>> filenames{};
    filenames.push_back({data("seq1.fa").string()});
    filenames.push_back({data("seq2.fa").string()});
    filenames.push_back({data("seq3.fa").string()});

    EXPECT_NO_THROW(chopper::sketch::check_filenames(filenames, config));

    EXPECT_FALSE(config.precomputed_files);
}

TEST(check_filenames_test, minimiser_filenames)
{
std::vector<std::string> filenames{data("small.minimiser").string(),
Expand Down
Loading

0 comments on commit bdcecb6

Please sign in to comment.