From ad54cbec7c757bd18b6bb2ade7fa5c7ccc828a9e Mon Sep 17 00:00:00 2001 From: Orri Erling Date: Thu, 26 Dec 2024 13:49:17 -0800 Subject: [PATCH] Add optional HLL for distinct value count to StatisticsBuilder Adds an optional HLL counter for distinct values to StatisticsBuilder. This is used in Verax sampling to estimate column cardinalities for scalar types. --- velox/dwio/common/Statistics.h | 17 ++++++- .../dwrf/test/TestStatisticsBuilderUtils.cpp | 16 ++++++- velox/dwio/dwrf/writer/CMakeLists.txt | 1 + velox/dwio/dwrf/writer/StatisticsBuilder.cpp | 17 ++++++- velox/dwio/dwrf/writer/StatisticsBuilder.h | 45 ++++++++++++++++--- 5 files changed, 85 insertions(+), 11 deletions(-) diff --git a/velox/dwio/common/Statistics.h b/velox/dwio/common/Statistics.h index 699066c1df4d..e6107c6cd344 100644 --- a/velox/dwio/common/Statistics.h +++ b/velox/dwio/common/Statistics.h @@ -81,11 +81,13 @@ class ColumnStatistics { std::optional valueCount, std::optional hasNull, std::optional rawSize, - std::optional size) + std::optional size, + std::optional numDistinct = std::nullopt) : valueCount_(valueCount), hasNull_(hasNull), rawSize_(rawSize), - size_(size) {} + size_(size), + numDistinct_(numDistinct) {} virtual ~ColumnStatistics() = default; @@ -123,6 +125,16 @@ class ColumnStatistics { return size_; } + std::optional numDistinct() const { + return numDistinct_; + } + + void setNumDistinct(int64_t count) { + VELOX_CHECK( + !numDistinct_.has_value(), "numDistinct_ can be set only once."); + numDistinct_ = count; + } + /** * return string representation of this stats object */ @@ -145,6 +157,7 @@ class ColumnStatistics { std::optional hasNull_; std::optional rawSize_; std::optional size_; + std::optional numDistinct_; }; /** diff --git a/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp b/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp index 6f4afacbd297..d13b09917b23 100644 --- a/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp +++ b/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,8 +27,6 @@ using namespace facebook::velox::dwio::common; using namespace facebook::velox; using namespace facebook::velox::dwrf; -StatisticsBuilderOptions options{16}; - template std::shared_ptr> makeFlatVector( facebook::velox::memory::MemoryPool* pool, @@ -58,8 +57,15 @@ class TestStatisticsBuilderUtils : public testing::Test { memory::MemoryManager::testingSetInstance({}); } + void SetUp() override { + StatisticsBuilderOptions options{16}; + } + const std::shared_ptr pool_ = memory::memoryManager()->addLeafPool(); + std::unique_ptr allocator_ = + std::make_unique(pool_.get()); + StatisticsBuilderOptions options{16, 100, true, allocator_.get()}; }; TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { @@ -85,6 +91,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { EXPECT_EQ(10, intStats->getMaximum().value()); EXPECT_EQ(1, intStats->getMinimum().value()); EXPECT_EQ(55, intStats->getSum()); + EXPECT_EQ(10, intStats->numDistinct()); } // add values with null @@ -103,6 +110,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { EXPECT_EQ(10, intStats->getMaximum().value()); EXPECT_EQ(1, intStats->getMinimum().value()); EXPECT_EQ(106, intStats->getSum().value()); + EXPECT_EQ(10, intStats->numDistinct()); } } @@ -129,6 +137,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) { EXPECT_EQ(10, doubleStats->getMaximum().value()); EXPECT_EQ(1, doubleStats->getMinimum().value()); EXPECT_EQ(55, doubleStats->getSum()); + EXPECT_EQ(10, doubleStats->numDistinct().value()); } // add values with null @@ -147,6 +156,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) { EXPECT_EQ(10, doubleStats->getMaximum().value()); EXPECT_EQ(1, doubleStats->getMinimum().value()); EXPECT_EQ(106, doubleStats->getSum()); + EXPECT_EQ(10, doubleStats->numDistinct().value()); } } @@ -174,6 +184,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) { EXPECT_EQ("j", strStats->getMaximum().value()); EXPECT_EQ("a", strStats->getMinimum().value()); EXPECT_EQ(10, strStats->getTotalLength()); + EXPECT_EQ(10, strStats->numDistinct()); } // add values with null @@ -191,6 +202,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) { EXPECT_EQ("j", strStats->getMaximum().value()); EXPECT_EQ("a", strStats->getMinimum().value()); EXPECT_EQ(19, strStats->getTotalLength().value()); + EXPECT_EQ(10, strStats->numDistinct()); } } diff --git a/velox/dwio/dwrf/writer/CMakeLists.txt b/velox/dwio/dwrf/writer/CMakeLists.txt index 72726d3e79cd..3150c30e55b8 100644 --- a/velox/dwio/dwrf/writer/CMakeLists.txt +++ b/velox/dwio/dwrf/writer/CMakeLists.txt @@ -28,6 +28,7 @@ velox_add_library( velox_link_libraries( velox_dwio_dwrf_writer + velox_common_hyperloglog velox_dwio_common velox_dwio_dwrf_common velox_dwio_dwrf_utils diff --git a/velox/dwio/dwrf/writer/StatisticsBuilder.cpp b/velox/dwio/dwrf/writer/StatisticsBuilder.cpp index 3940e29f17b1..3000230c1156 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilder.cpp +++ b/velox/dwio/dwrf/writer/StatisticsBuilder.cpp @@ -93,6 +93,12 @@ void StatisticsBuilder::merge( // Merge size mergeCount(size_, other.getSize()); } + if (hll_) { + auto* otherBuilder = dynamic_cast(&other); + VELOX_CHECK_NOT_NULL(otherBuilder); + VELOX_CHECK_NOT_NULL(otherBuilder->hll_); + hll_->mergeWith(*otherBuilder->hll_); + } } void StatisticsBuilder::toProto(proto::ColumnStatistics& stats) const { @@ -115,8 +121,15 @@ std::unique_ptr StatisticsBuilder::build() proto::ColumnStatistics stats; toProto(stats); StatsContext context{WriterVersion_CURRENT}; - return buildColumnStatisticsFromProto( - ColumnStatisticsWrapper(&stats), context); + auto result = + buildColumnStatisticsFromProto(ColumnStatisticsWrapper(&stats), context); + // We do not alter the proto since this is part of the file format + // and the file format. The distinct count does not exist in the + // file format but is added here for use in on demand sampling. + if (hll_) { + result->setNumDistinct(hll_->cardinality()); + } + return result; } std::unique_ptr StatisticsBuilder::create( diff --git a/velox/dwio/dwrf/writer/StatisticsBuilder.h b/velox/dwio/dwrf/writer/StatisticsBuilder.h index 35832145a302..d6e361359177 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilder.h +++ b/velox/dwio/dwrf/writer/StatisticsBuilder.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include "velox/dwio/dwrf/common/Config.h" #include "velox/dwio/dwrf/common/Statistics.h" #include "velox/dwio/dwrf/common/wrap/dwrf-proto-wrapper.h" @@ -76,11 +77,22 @@ inline dwio::common::KeyInfo constructKey(const dwrf::proto::KeyInfo& keyInfo) { struct StatisticsBuilderOptions { explicit StatisticsBuilderOptions( uint32_t stringLengthLimit, - std::optional initialSize = std::nullopt) - : stringLengthLimit{stringLengthLimit}, initialSize{initialSize} {} + std::optional initialSize = std::nullopt, + bool countDistincts = false, + HashStringAllocator* allocator = nullptr) + : stringLengthLimit{stringLengthLimit}, + initialSize{initialSize}, + countDistincts(countDistincts), + allocator(allocator) {} uint32_t stringLengthLimit; std::optional initialSize; + bool countDistincts{false}; + HashStringAllocator* allocator; + + StatisticsBuilderOptions withoutNumDistinct() const { + return StatisticsBuilderOptions(stringLengthLimit, initialSize); + } static StatisticsBuilderOptions fromConfig(const Config& config) { return StatisticsBuilderOptions{config.get(Config::STRING_STATS_LIMIT)}; @@ -90,9 +102,13 @@ struct StatisticsBuilderOptions { /* * Base class for stats builder. Stats builder is used in writer and file merge * to collect and merge stats. + * It can also be used for gathering stats in ad hoc sampling. In this case it + * may also count distinct values if enabled in 'options'. */ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics { public: + /// Constructs with 'options'. If 'options' enable count distinct and + /// 'disableNumDistinct' is true, distinct values will not be counted. explicit StatisticsBuilder(const StatisticsBuilderOptions& options) : options_{options} { init(); @@ -132,6 +148,18 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics { } } + template + void addHash(const T& data) { + if (hll_) { + hll_->insertHash(folly::hasher()(data)); + } + } + + int64_t cardinality() const { + VELOX_CHECK_NOT_NULL(hll_); + return hll_->cardinality(); + } + /* * Merge stats of same type. This is used in writer to aggregate file level * stats. @@ -170,17 +198,21 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics { hasNull_ = false; rawSize_ = 0; size_ = options_.initialSize; + if (options_.countDistincts) { + hll_ = std::make_shared(options_.allocator); + } } protected: StatisticsBuilderOptions options_; + std::shared_ptr hll_; }; class BooleanStatisticsBuilder : public StatisticsBuilder, public dwio::common::BooleanColumnStatistics { public: explicit BooleanStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options} { + : StatisticsBuilder{options.withoutNumDistinct()} { init(); } @@ -229,6 +261,7 @@ class IntegerStatisticsBuilder : public StatisticsBuilder, max_ = value; } addWithOverflowCheck(sum_, value, count); + addHash(value); } void merge( @@ -278,6 +311,7 @@ class DoubleStatisticsBuilder : public StatisticsBuilder, if (max_.has_value() && value > max_.value()) { max_ = value; } + addHash(value); // value * count sometimes is not same as adding values (count) times. So // add in a loop if (sum_.has_value()) { @@ -342,6 +376,7 @@ class StringStatisticsBuilder : public StatisticsBuilder, max_ = value; } } + addHash(value); addWithOverflowCheck(length_, value.size(), count); } @@ -375,7 +410,7 @@ class BinaryStatisticsBuilder : public StatisticsBuilder, public dwio::common::BinaryColumnStatistics { public: explicit BinaryStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options} { + : StatisticsBuilder{options.withoutNumDistinct()} { init(); } @@ -409,7 +444,7 @@ class MapStatisticsBuilder : public StatisticsBuilder, MapStatisticsBuilder( const Type& type, const StatisticsBuilderOptions& options) - : StatisticsBuilder{options}, + : StatisticsBuilder{options.withoutNumDistinct()}, valueType_{type.as().valueType()} { init(); }