Skip to content

Commit

Permalink
feat: adding param --partial-matching-lower-bound to pandora map, dis…
Browse files Browse the repository at this point in the history
…cover and command subcommands
  • Loading branch information
leoisl committed Dec 13, 2023
1 parent 4a5651c commit 2b5f362
Show file tree
Hide file tree
Showing 8 changed files with 46 additions and 6 deletions.
1 change: 1 addition & 0 deletions include/compare_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,7 @@ struct CompareOptions {
float min_allele_fraction_covg_gt { 0 };
float genotyping_error_rate { 0.01 };
uint16_t confidence_threshold { 1 };
float partial_matching_lower_bound { 0.5 };
bool keep_extra_debugging_files { false };
};

Expand Down
1 change: 1 addition & 0 deletions include/denovo_discovery/discover_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ struct DiscoverOptions {
bool no_gene_coverage_filtering { false };
uint32_t min_cluster_size { 10 };
uint32_t max_num_kmers_to_avg { 100 };
float partial_matching_lower_bound { 0.5 };
bool keep_extra_debugging_files { false };
};

Expand Down
1 change: 1 addition & 0 deletions include/map_main.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ struct MapOptions {
float min_allele_fraction_covg_gt { 0 };
float genotyping_error_rate { 0.01 };
uint16_t confidence_threshold { 1 };
float partial_matching_lower_bound { 0.5 };
bool keep_extra_debugging_files { false };
};

Expand Down
2 changes: 1 addition & 1 deletion include/utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ uint32_t pangraph_from_read_file(const SampleData& sample,
const float conflicting_clusters_overlap_threshold=0.8,
const float conflicting_clusters_minimiser_tolerance=0.05,
uint32_t threads = 1, const bool keep_extra_debugging_files = false,
const uint32_t rng_seed = 0);
const uint32_t rng_seed = 0, const float partial_matching_lower_bound=0.5);

void infer_most_likely_prg_path_for_pannode(
const std::vector<std::shared_ptr<LocalPRG>>&, PanNode*, uint32_t, float);
Expand Down
14 changes: 13 additions & 1 deletion src/compare_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,17 @@ void setup_compare_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Mapping");

// Register the --partial-matching-lower-bound option on the compare subcommand.
// The parsed value is written to opt->partial_matching_lower_bound; the option is
// tagged with the "FLOAT" type name and placed in the "Mapping" group.
// capture_default_str() is chained so the field's current value is recorded as the
// default string (CLI11 behaviour; see the CLI11 option-modifier docs).
// The help text below is user-facing and is reused verbatim via `description`.
description
= "Allows for partial matching between reads and a PRG. If this value is for e.g. 0.5, it means that "
"pandora will match a read to a PRG if the cluster of hits has size at least 0.5 * expected cluster size "
"for the given error rate and kmer value. Lower values allow for more hits, but possibly for false positive "
"matches.";
compare_subcmd
->add_option("--partial-matching-lower-bound", opt->partial_matching_lower_bound, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

description = "Maximum number of kmers to average over when selecting the maximum "
"likelihood path";
compare_subcmd->add_option("--kmer-avg", opt->max_num_kmers_to_avg, description)
Expand Down Expand Up @@ -326,7 +337,8 @@ int pandora_compare(CompareOptions& opt)
opt.max_diff, opt.error_rate, sample_outdir,
opt.min_cluster_size, opt.genome_size, opt.max_covg,
opt.conflicting_clusters_overlap_threshold, opt.conflicting_clusters_minimiser_tolerance,
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed);
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed,
opt.partial_matching_lower_bound);

if (pangraph_sample->nodes.empty()) {
BOOST_LOG_TRIVIAL(warning)
Expand Down
14 changes: 13 additions & 1 deletion src/denovo_discovery/discover_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,17 @@ void setup_discover_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Mapping");

// Register the --partial-matching-lower-bound option on the discover subcommand.
// The parsed value is written to opt->partial_matching_lower_bound; the option is
// tagged with the "FLOAT" type name and placed in the "Mapping" group.
// capture_default_str() is chained so the field's current value is recorded as the
// default string (CLI11 behaviour; see the CLI11 option-modifier docs).
// The help text below is user-facing and is reused verbatim via `description`.
description
= "Allows for partial matching between reads and a PRG. If this value is for e.g. 0.5, it means that "
"pandora will match a read to a PRG if the cluster of hits has size at least 0.5 * expected cluster size "
"for the given error rate and kmer value. Lower values allow for more hits, but possibly for false positive "
"matches.";
discover_subcmd
->add_option("--partial-matching-lower-bound", opt->partial_matching_lower_bound, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

description
= "When two clusters of hits are conflicting, the one with highest number of unique minimisers "
"will be kept. However, if the difference between the number of unique minimisers is too small, "
Expand Down Expand Up @@ -241,7 +252,8 @@ void pandora_discover_core(const SampleData& sample, Index &index, DiscoverOptio
= pangraph_from_read_file(sample, pangraph, index, opt.max_diff, opt.error_rate, sample_outdir,
opt.min_cluster_size, opt.genome_size, opt.max_covg,
opt.conflicting_clusters_overlap_threshold, opt.conflicting_clusters_minimiser_tolerance,
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed);
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed,
opt.partial_matching_lower_bound);

if (pangraph->nodes.empty()) {
BOOST_LOG_TRIVIAL(warning)
Expand Down
14 changes: 13 additions & 1 deletion src/map_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,17 @@ void setup_map_subcommand(CLI::App& app)
->type_name("FLOAT")
->group("Mapping");

// Register the --partial-matching-lower-bound option on the map subcommand.
// The parsed value is written to opt->partial_matching_lower_bound; the option is
// tagged with the "FLOAT" type name and placed in the "Mapping" group.
// capture_default_str() is chained so the field's current value is recorded as the
// default string (CLI11 behaviour; see the CLI11 option-modifier docs).
// The help text below is user-facing and is reused verbatim via `description`.
description
= "Allows for partial matching between reads and a PRG. If this value is for e.g. 0.5, it means that "
"pandora will match a read to a PRG if the cluster of hits has size at least 0.5 * expected cluster size "
"for the given error rate and kmer value. Lower values allow for more hits, but possibly for false positive "
"matches.";
map_subcmd
->add_option("--partial-matching-lower-bound", opt->partial_matching_lower_bound, description)
->capture_default_str()
->type_name("FLOAT")
->group("Mapping");

map_subcmd
->add_flag("--kg", opt->output_kg,
"Save kmer graphs with forward and reverse coverage annotations for found "
Expand Down Expand Up @@ -322,7 +333,8 @@ int pandora_map(MapOptions& opt)
= pangraph_from_read_file(sample, pangraph, index, opt.max_diff, opt.error_rate,
opt.outdir, opt.min_cluster_size, opt.genome_size, opt.max_covg,
opt.conflicting_clusters_overlap_threshold, opt.conflicting_clusters_minimiser_tolerance,
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed);
opt.threads, opt.keep_extra_debugging_files, opt.rng_seed,
opt.partial_matching_lower_bound);

if (pangraph->nodes.empty()) {
BOOST_LOG_TRIVIAL(info) << "Found none of the LocalPRGs in the reads.";
Expand Down
5 changes: 3 additions & 2 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -544,15 +544,16 @@ uint32_t pangraph_from_read_file(const SampleData& sample,
const float conflicting_clusters_overlap_threshold,
const float conflicting_clusters_minimiser_tolerance,
uint32_t threads,
const bool keep_extra_debugging_files, const uint32_t rng_seed)
const bool keep_extra_debugging_files, const uint32_t rng_seed,
const float partial_matching_lower_bound)
{
// constant variables
const SampleIdText sample_name = sample.first;
const SampleFpath sample_filepath = sample.second;
const std::string tag = "[Sample " + sample_name + "]: ";
const uint32_t w = index.get_window_size();
const uint32_t k = index.get_kmer_size();
const double fraction_kmers_required_for_cluster = 0.1 / exp(e_rate * k);
const double fraction_kmers_required_for_cluster = partial_matching_lower_bound / exp(e_rate * k);
const uint32_t nb_reads_to_map_in_a_batch = 1000;

BOOST_LOG_TRIVIAL(trace) << tag << "e_rate: " << e_rate;
Expand Down

0 comments on commit 2b5f362

Please sign in to comment.