diff --git a/include/compare_main.h b/include/compare_main.h index e989576a..d9f69a9e 100644 --- a/include/compare_main.h +++ b/include/compare_main.h @@ -60,6 +60,7 @@ struct CompareOptions { float min_allele_fraction_covg_gt { 0 }; float genotyping_error_rate { 0.01 }; uint16_t confidence_threshold { 1 }; + float partial_matching_lower_bound { 0.5 }; bool keep_extra_debugging_files { false }; }; diff --git a/include/denovo_discovery/discover_main.h b/include/denovo_discovery/discover_main.h index 27b6b78f..97fa89e9 100644 --- a/include/denovo_discovery/discover_main.h +++ b/include/denovo_discovery/discover_main.h @@ -39,6 +39,7 @@ struct DiscoverOptions { bool no_gene_coverage_filtering { false }; uint32_t min_cluster_size { 10 }; uint32_t max_num_kmers_to_avg { 100 }; + float partial_matching_lower_bound { 0.5 }; bool keep_extra_debugging_files { false }; }; diff --git a/include/map_main.h b/include/map_main.h index 4377e8ec..28e32097 100644 --- a/include/map_main.h +++ b/include/map_main.h @@ -62,6 +62,7 @@ struct MapOptions { float min_allele_fraction_covg_gt { 0 }; float genotyping_error_rate { 0.01 }; uint16_t confidence_threshold { 1 }; + float partial_matching_lower_bound { 0.5 }; bool keep_extra_debugging_files { false }; }; diff --git a/include/utils.h b/include/utils.h index a4ae4809..892a600e 100644 --- a/include/utils.h +++ b/include/utils.h @@ -115,7 +115,7 @@ uint32_t pangraph_from_read_file(const SampleData& sample, const float conflicting_clusters_overlap_threshold=0.8, const float conflicting_clusters_minimiser_tolerance=0.05, uint32_t threads = 1, const bool keep_extra_debugging_files = false, - const uint32_t rng_seed = 0); + const uint32_t rng_seed = 0, const float partial_matching_lower_bound=0.5); void infer_most_likely_prg_path_for_pannode( const std::vector>&, PanNode*, uint32_t, float); diff --git a/src/compare_main.cpp b/src/compare_main.cpp index 5f7c06dc..d5dfc767 100644 --- a/src/compare_main.cpp +++ b/src/compare_main.cpp @@ -198,6 +198,17 @@ void setup_compare_subcommand(CLI::App& app) ->type_name("FLOAT") ->group("Mapping"); + description + = "Allows for partial matching between reads and a PRG. If this value is for e.g. 0.5, it means that " + "pandora will match a read to a PRG if the cluster of hits has size at least 0.5 * expected cluster size " + "for the given error rate and kmer value. Lower values allow for more hits, but possibly for false positive " + "matches."; + compare_subcmd + ->add_option("--partial-matching-lower-bound", opt->partial_matching_lower_bound, description) + ->capture_default_str() + ->type_name("FLOAT") + ->group("Mapping"); + description = "Maximum number of kmers to average over when selecting the maximum " "likelihood path"; compare_subcmd->add_option("--kmer-avg", opt->max_num_kmers_to_avg, description) @@ -326,7 +337,8 @@ int pandora_compare(CompareOptions& opt) opt.max_diff, opt.error_rate, sample_outdir, opt.min_cluster_size, opt.genome_size, opt.max_covg, opt.conflicting_clusters_overlap_threshold, opt.conflicting_clusters_minimiser_tolerance, - opt.threads, opt.keep_extra_debugging_files, opt.rng_seed); + opt.threads, opt.keep_extra_debugging_files, opt.rng_seed, + opt.partial_matching_lower_bound); if (pangraph_sample->nodes.empty()) { BOOST_LOG_TRIVIAL(warning) diff --git a/src/denovo_discovery/discover_main.cpp b/src/denovo_discovery/discover_main.cpp index 3f0833c2..0fb21b9d 100644 --- a/src/denovo_discovery/discover_main.cpp +++ b/src/denovo_discovery/discover_main.cpp @@ -75,6 +75,17 @@ void setup_discover_subcommand(CLI::App& app) ->type_name("FLOAT") ->group("Mapping"); + description + = "Allows for partial matching between reads and a PRG. If this value is for e.g. 0.5, it means that " + "pandora will match a read to a PRG if the cluster of hits has size at least 0.5 * expected cluster size " + "for the given error rate and kmer value. Lower values allow for more hits, but possibly for false positive " + "matches."; + discover_subcmd + ->add_option("--partial-matching-lower-bound", opt->partial_matching_lower_bound, description) + ->capture_default_str() + ->type_name("FLOAT") + ->group("Mapping"); + description = "When two clusters of hits are conflicting, the one with highest number of unique minimisers " "will be kept. However, if the difference between the number of unique minimisers is too small, " @@ -241,7 +252,8 @@ void pandora_discover_core(const SampleData& sample, Index &index, DiscoverOptio = pangraph_from_read_file(sample, pangraph, index, opt.max_diff, opt.error_rate, sample_outdir, opt.min_cluster_size, opt.genome_size, opt.max_covg, opt.conflicting_clusters_overlap_threshold, opt.conflicting_clusters_minimiser_tolerance, - opt.threads, opt.keep_extra_debugging_files, opt.rng_seed); + opt.threads, opt.keep_extra_debugging_files, opt.rng_seed, + opt.partial_matching_lower_bound); if (pangraph->nodes.empty()) { BOOST_LOG_TRIVIAL(warning) diff --git a/src/map_main.cpp b/src/map_main.cpp index f6cb9e5d..f03bda08 100644 --- a/src/map_main.cpp +++ b/src/map_main.cpp @@ -87,6 +87,17 @@ void setup_map_subcommand(CLI::App& app) ->type_name("FLOAT") ->group("Mapping"); + description + = "Allows for partial matching between reads and a PRG. If this value is for e.g. 0.5, it means that " + "pandora will match a read to a PRG if the cluster of hits has size at least 0.5 * expected cluster size " + "for the given error rate and kmer value. Lower values allow for more hits, but possibly for false positive " + "matches."; + map_subcmd + ->add_option("--partial-matching-lower-bound", opt->partial_matching_lower_bound, description) + ->capture_default_str() + ->type_name("FLOAT") + ->group("Mapping"); + map_subcmd ->add_flag("--kg", opt->output_kg, "Save kmer graphs with forward and reverse coverage annotations for found " @@ -322,7 +333,8 @@ int pandora_map(MapOptions& opt) = pangraph_from_read_file(sample, pangraph, index, opt.max_diff, opt.error_rate, opt.outdir, opt.min_cluster_size, opt.genome_size, opt.max_covg, opt.conflicting_clusters_overlap_threshold, opt.conflicting_clusters_minimiser_tolerance, - opt.threads, opt.keep_extra_debugging_files, opt.rng_seed); + opt.threads, opt.keep_extra_debugging_files, opt.rng_seed, + opt.partial_matching_lower_bound); if (pangraph->nodes.empty()) { BOOST_LOG_TRIVIAL(info) << "Found none of the LocalPRGs in the reads."; diff --git a/src/utils.cpp b/src/utils.cpp index 88910356..32594769 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -544,7 +544,8 @@ uint32_t pangraph_from_read_file(const SampleData& sample, const float conflicting_clusters_overlap_threshold, const float conflicting_clusters_minimiser_tolerance, uint32_t threads, - const bool keep_extra_debugging_files, const uint32_t rng_seed) + const bool keep_extra_debugging_files, const uint32_t rng_seed, + const float partial_matching_lower_bound) { // constant variables const SampleIdText sample_name = sample.first; @@ -552,7 +553,7 @@ uint32_t pangraph_from_read_file(const SampleData& sample, const std::string tag = "[Sample " + sample_name + "]: "; const uint32_t w = index.get_window_size(); const uint32_t k = index.get_kmer_size(); - const double fraction_kmers_required_for_cluster = 0.1 / exp(e_rate * k); + const double fraction_kmers_required_for_cluster = partial_matching_lower_bound / exp(e_rate * k); const uint32_t nb_reads_to_map_in_a_batch = 1000; BOOST_LOG_TRIVIAL(trace) << tag << "e_rate: " << e_rate;