From 79661646fb23d776817306ef59dbda0faca9ee82 Mon Sep 17 00:00:00 2001
From: Lucas Czech
Date: Wed, 22 May 2024 19:23:00 +0200
Subject: [PATCH] Add filter status category summary functions

---
 .../filter/sample_counts_filter.cpp           | 39 +++++++++++++++-
 .../filter/sample_counts_filter.hpp           | 23 ++++++++++
 .../population/filter/variant_filter.cpp      | 45 ++++++++++++++++++-
 .../population/filter/variant_filter.hpp      | 25 +++++++++++
 4 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/lib/genesis/population/filter/sample_counts_filter.cpp b/lib/genesis/population/filter/sample_counts_filter.cpp
index 2c33a62d..2c0b663d 100644
--- a/lib/genesis/population/filter/sample_counts_filter.cpp
+++ b/lib/genesis/population/filter/sample_counts_filter.cpp
@@ -44,7 +44,42 @@ namespace genesis {
 namespace population {
 
 // =================================================================================================
-//     Sample Counts Filter
+//     Stats
+// =================================================================================================
+
+std::array<size_t, 3> sample_counts_filter_stats_category_counts(
+    SampleCountsFilterStats const& stats
+) {
+    // We want to make sure that the tags enum is exactly as expected here. In case that we later
+    // add other values to that enum, we want to know here, in order to adapt the function
+    // accordingly. In the printing function below, we use a loop with a switch statement to be
+    // notified of any missing enum values. Here, this would be a bit too inefficient, as we expect
+    // this function here to be called once per position or window. Hence, we just statically
+    // assert that the last value of the enum has the numerical representation that we expect.
+    // If this fails, we know that we are missing a value.
+    static_assert(
+        static_cast<FilterStatus::IntType>( SampleCountsFilterTag::kEnd ) == 10,
+        "SampleCountsFilterTag::kEnd != 10. The enum has values that are not accounted for."
+    );
+
+    // Now we can build our result with some confidence, by simply adding up the values
+    // to our simple categories / classes.
+    auto result = std::array<size_t, 3>{};
+    result[0] += stats[ SampleCountsFilterTag::kPassed ];
+    result[1] += stats[ SampleCountsFilterTag::kMissing ];
+    result[1] += stats[ SampleCountsFilterTag::kNotPassed ];
+    result[1] += stats[ SampleCountsFilterTag::kInvalid ];
+    result[2] += stats[ SampleCountsFilterTag::kEmpty ];
+    result[2] += stats[ SampleCountsFilterTag::kBelowMinCoverage ];
+    result[2] += stats[ SampleCountsFilterTag::kAboveMaxCoverage ];
+    result[2] += stats[ SampleCountsFilterTag::kAboveDeletionsCountLimit ];
+    result[2] += stats[ SampleCountsFilterTag::kNotSnp ];
+    result[2] += stats[ SampleCountsFilterTag::kNotBiallelicSnp ];
+    return result;
+}
+
+// =================================================================================================
+//     Printing
 // =================================================================================================
 
 std::ostream& print_sample_counts_filter_stats(
@@ -57,6 +92,8 @@ std::ostream& print_sample_counts_filter_stats(
     // We use an explicit loop over the enum values here, which makes sure that we cannot
     // forget about any values in the future. This is a bit inefficient, but we do not expect
     // to call this function more than once.
+    // If this fails, the above function also needs to be updated. There, we expect the function
+    // to be called many times, and hence do not want this inefficient loop.
     for( size_t i = 0; i < static_cast<size_t>( SampleCountsFilterTag::kEnd ); ++i ) {
         switch( static_cast<SampleCountsFilterTag>(i) ) {
             case SampleCountsFilterTag::kPassed: {
diff --git a/lib/genesis/population/filter/sample_counts_filter.hpp b/lib/genesis/population/filter/sample_counts_filter.hpp
index 6275779c..403db7eb 100644
--- a/lib/genesis/population/filter/sample_counts_filter.hpp
+++ b/lib/genesis/population/filter/sample_counts_filter.hpp
@@ -36,6 +36,7 @@
 #include "genesis/population/filter/filter_status.hpp"
 #include "genesis/population/variant.hpp"
 
+#include <array>
 #include
 #include
 #include
@@ -156,6 +157,28 @@ enum class SampleCountsFilterPolicy
  */
 using SampleCountsFilterStats = FilterStats<SampleCountsFilterTag>;
 
+/**
+ * @brief Generate summary counts for a SampleCountsFilterStats counter.
+ *
+ * The given @p stats contain counts for different reasons of filters that could have failed when
+ * filtering a SampleCounts. This function summarizes those stats into three basic categories,
+ * and gives their sums:
+ *
+ *  0. Passing
+ *  1. Missing data and other invalids (basically, all non-numeric filters)
+ *  2. Numeric filters, such as zero counts, outside of read depth limits, etc
+ *
+ * This is meant as a broad summary, for instance for user output, where it might not be overly
+ * relevant which exact numerical filter got triggered how often by a particular filter, but rather
+ * we want to have an overview of which classes or categories of filters got triggered how often.
+ *
+ * Hence, the returned numbers depend on the exact usage of the SampleCountsFilterTag tags here.
+ * If other types of tags are used for the SampleCounts::status instead, this function cannot be used.
+ */
+std::array<size_t, 3> sample_counts_filter_stats_category_counts(
+    SampleCountsFilterStats const& stats
+);
+
 // =================================================================================================
 //     Filter Functions
 // =================================================================================================
diff --git a/lib/genesis/population/filter/variant_filter.cpp b/lib/genesis/population/filter/variant_filter.cpp
index a2caa688..f0931dfd 100644
--- a/lib/genesis/population/filter/variant_filter.cpp
+++ b/lib/genesis/population/filter/variant_filter.cpp
@@ -33,6 +33,7 @@
 #include "genesis/population/function/functions.hpp"
 #include "genesis/utils/text/char.hpp"
 
+#include <array>
 #include
 #include
 #include
@@ -44,7 +45,49 @@ namespace genesis {
 namespace population {
 
 // =================================================================================================
-//     Variant Filter
+//     Stats
+// =================================================================================================
+
+std::array<size_t, 6> variant_filter_stats_category_counts(
+    VariantFilterStats const& stats
+) {
+    // We want to make sure that the tags enum is exactly as expected here. In case that we later
+    // add other values to that enum, we want to know here, in order to adapt the function
+    // accordingly. In the printing function below, we use a loop with a switch statement to be
+    // notified of any missing enum values. Here, this would be a bit too inefficient, as we expect
+    // this function here to be called once per position or window. Hence, we just statically
+    // assert that the last value of the enum has the numerical representation that we expect.
+    // If this fails, we know that we are missing a value.
+    static_assert(
+        static_cast<FilterStatus::IntType>( VariantFilterTag::kEnd ) == 17,
+        "VariantFilterTag::kEnd != 17. The enum has values that are not accounted for."
+    );
+
+    // Now we can build our result with some confidence, by simply adding up the values
+    // to our simple categories / classes.
+    auto result = std::array<size_t, 6>{};
+    result[0] += stats[ VariantFilterTag::kPassed ];
+    result[1] += stats[ VariantFilterTag::kMissing ];
+    result[1] += stats[ VariantFilterTag::kNotPassed ];
+    result[1] += stats[ VariantFilterTag::kInvalid ];
+    result[2] += stats[ VariantFilterTag::kMaskedPosition ];
+    result[2] += stats[ VariantFilterTag::kMaskedRegion ];
+    result[3] += stats[ VariantFilterTag::kNoSamplePassed ];
+    result[3] += stats[ VariantFilterTag::kNotAllSamplesPassed ];
+    result[4] += stats[ VariantFilterTag::kEmpty ];
+    result[4] += stats[ VariantFilterTag::kBelowMinCoverage ];
+    result[4] += stats[ VariantFilterTag::kAboveMaxCoverage ];
+    result[4] += stats[ VariantFilterTag::kAboveDeletionsCountLimit ];
+    result[5] += stats[ VariantFilterTag::kNotSnp ];
+    result[5] += stats[ VariantFilterTag::kNotBiallelicSnp ];
+    result[5] += stats[ VariantFilterTag::kBelowSnpMinCount ];
+    result[5] += stats[ VariantFilterTag::kAboveSnpMaxCount ];
+    result[5] += stats[ VariantFilterTag::kBelowMinAlleleFreq ];
+    return result;
+}
+
+// =================================================================================================
+//     Printing
 // =================================================================================================
 
 std::ostream& print_variant_filter_stats(
diff --git a/lib/genesis/population/filter/variant_filter.hpp b/lib/genesis/population/filter/variant_filter.hpp
index dba77948..34a45ebb 100644
--- a/lib/genesis/population/filter/variant_filter.hpp
+++ b/lib/genesis/population/filter/variant_filter.hpp
@@ -252,6 +252,31 @@ enum class VariantFilterTag : FilterStatus::IntType
  */
 using VariantFilterStats = FilterStats<VariantFilterTag>;
 
+/**
+ * @brief Generate summary counts for a VariantFilterStats counter.
+ *
+ * The given @p stats contain counts for different reasons of filters that could have failed when
+ * filtering a Variant. This function summarizes those stats into some basic categories, and gives
+ * their sums:
+ *
+ *  0. Passing
+ *  1. Missing data and other invalids
+ *  2. Masked positions or regions
+ *  3. Samples failed, either all or some, depending on SampleCountsFilterPolicy
+ *  4. Numeric filters for read depth limits, etc
+ *  5. Not a (biallelic) SNP, according to the relevant numerical filters
+ *
+ * This is meant as a broad summary, for instance for user output, where it might not be overly
+ * relevant which exact numerical filter got triggered how often by a particular filter, but rather
+ * we want to have an overview of which classes or categories of filters got triggered how often.
+ *
+ * Hence, the returned numbers depend on the exact usage of the VariantFilterTag tags here.
+ * If other types of tags are used for the Variant::status instead, this function cannot be used.
+ */
+std::array<size_t, 6> variant_filter_stats_category_counts(
+    VariantFilterStats const& stats
+);
+
 // =================================================================================================
 //     Filter Functions
 // =================================================================================================