diff --git a/velox/dwio/common/Statistics.h b/velox/dwio/common/Statistics.h index 699066c1df4d..e6107c6cd344 100644 --- a/velox/dwio/common/Statistics.h +++ b/velox/dwio/common/Statistics.h @@ -81,11 +81,13 @@ class ColumnStatistics { std::optional valueCount, std::optional hasNull, std::optional rawSize, - std::optional size) + std::optional size, + std::optional numDistinct = std::nullopt) : valueCount_(valueCount), hasNull_(hasNull), rawSize_(rawSize), - size_(size) {} + size_(size), + numDistinct_(numDistinct) {} virtual ~ColumnStatistics() = default; @@ -123,6 +125,16 @@ class ColumnStatistics { return size_; } + std::optional numDistinct() const { + return numDistinct_; + } + + void setNumDistinct(int64_t count) { + VELOX_CHECK( + !numDistinct_.has_value(), "numDistinct_ can be set only once."); + numDistinct_ = count; + } + /** * return string representation of this stats object */ @@ -145,6 +157,7 @@ class ColumnStatistics { std::optional hasNull_; std::optional rawSize_; std::optional size_; + std::optional numDistinct_; }; /** diff --git a/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp b/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp index 6f4afacbd297..d13b09917b23 100644 --- a/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp +++ b/velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,8 +27,6 @@ using namespace facebook::velox::dwio::common; using namespace facebook::velox; using namespace facebook::velox::dwrf; -StatisticsBuilderOptions options{16}; - template std::shared_ptr> makeFlatVector( facebook::velox::memory::MemoryPool* pool, @@ -58,8 +57,15 @@ class TestStatisticsBuilderUtils : public testing::Test { memory::MemoryManager::testingSetInstance({}); } + void SetUp() override { + StatisticsBuilderOptions options{16}; + } + const std::shared_ptr pool_ = memory::memoryManager()->addLeafPool(); + std::unique_ptr allocator_ = + std::make_unique(pool_.get()); + StatisticsBuilderOptions options{16, 100, true, allocator_.get()}; }; TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { @@ -85,6 +91,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { EXPECT_EQ(10, intStats->getMaximum().value()); EXPECT_EQ(1, intStats->getMinimum().value()); EXPECT_EQ(55, intStats->getSum()); + EXPECT_EQ(10, intStats->numDistinct()); } // add values with null @@ -103,6 +110,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) { EXPECT_EQ(10, intStats->getMaximum().value()); EXPECT_EQ(1, intStats->getMinimum().value()); EXPECT_EQ(106, intStats->getSum().value()); + EXPECT_EQ(10, intStats->numDistinct()); } } @@ -129,6 +137,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) { EXPECT_EQ(10, doubleStats->getMaximum().value()); EXPECT_EQ(1, doubleStats->getMinimum().value()); EXPECT_EQ(55, doubleStats->getSum()); + EXPECT_EQ(10, doubleStats->numDistinct().value()); } // add values with null @@ -147,6 +156,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) { EXPECT_EQ(10, doubleStats->getMaximum().value()); EXPECT_EQ(1, doubleStats->getMinimum().value()); EXPECT_EQ(106, doubleStats->getSum()); + EXPECT_EQ(10, doubleStats->numDistinct().value()); } } @@ -174,6 +184,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) { EXPECT_EQ("j", strStats->getMaximum().value()); EXPECT_EQ("a", strStats->getMinimum().value()); EXPECT_EQ(10, strStats->getTotalLength()); + EXPECT_EQ(10, strStats->numDistinct()); } // add values with null @@ -191,6 +202,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) { EXPECT_EQ("j", strStats->getMaximum().value()); EXPECT_EQ("a", strStats->getMinimum().value()); EXPECT_EQ(19, strStats->getTotalLength().value()); + EXPECT_EQ(10, strStats->numDistinct()); } } diff --git a/velox/dwio/dwrf/writer/CMakeLists.txt b/velox/dwio/dwrf/writer/CMakeLists.txt index 72726d3e79cd..3150c30e55b8 100644 --- a/velox/dwio/dwrf/writer/CMakeLists.txt +++ b/velox/dwio/dwrf/writer/CMakeLists.txt @@ -28,6 +28,7 @@ velox_add_library( velox_link_libraries( velox_dwio_dwrf_writer + velox_common_hyperloglog velox_dwio_common velox_dwio_dwrf_common velox_dwio_dwrf_utils diff --git a/velox/dwio/dwrf/writer/StatisticsBuilder.cpp b/velox/dwio/dwrf/writer/StatisticsBuilder.cpp index 3940e29f17b1..3000230c1156 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilder.cpp +++ b/velox/dwio/dwrf/writer/StatisticsBuilder.cpp @@ -93,6 +93,12 @@ void StatisticsBuilder::merge( // Merge size mergeCount(size_, other.getSize()); } + if (hll_) { + auto* otherBuilder = dynamic_cast(&other); + VELOX_CHECK_NOT_NULL(otherBuilder); + VELOX_CHECK_NOT_NULL(otherBuilder->hll_); + hll_->mergeWith(*otherBuilder->hll_); + } } void StatisticsBuilder::toProto(proto::ColumnStatistics& stats) const { @@ -115,8 +121,15 @@ std::unique_ptr StatisticsBuilder::build() proto::ColumnStatistics stats; toProto(stats); StatsContext context{WriterVersion_CURRENT}; - return buildColumnStatisticsFromProto( - ColumnStatisticsWrapper(&stats), context); + auto result = + buildColumnStatisticsFromProto(ColumnStatisticsWrapper(&stats), context); + // We do not alter the proto since this is part of the file format + // and the file format. The distinct count does not exist in the + // file format but is added here for use in on demand sampling. + if (hll_) { + result->setNumDistinct(hll_->cardinality()); + } + return result; } std::unique_ptr StatisticsBuilder::create( diff --git a/velox/dwio/dwrf/writer/StatisticsBuilder.h b/velox/dwio/dwrf/writer/StatisticsBuilder.h index 35832145a302..d6e361359177 100644 --- a/velox/dwio/dwrf/writer/StatisticsBuilder.h +++ b/velox/dwio/dwrf/writer/StatisticsBuilder.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include "velox/dwio/dwrf/common/Config.h" #include "velox/dwio/dwrf/common/Statistics.h" #include "velox/dwio/dwrf/common/wrap/dwrf-proto-wrapper.h" @@ -76,11 +77,22 @@ inline dwio::common::KeyInfo constructKey(const dwrf::proto::KeyInfo& keyInfo) { struct StatisticsBuilderOptions { explicit StatisticsBuilderOptions( uint32_t stringLengthLimit, - std::optional initialSize = std::nullopt) - : stringLengthLimit{stringLengthLimit}, initialSize{initialSize} {} + std::optional initialSize = std::nullopt, + bool countDistincts = false, + HashStringAllocator* allocator = nullptr) + : stringLengthLimit{stringLengthLimit}, + initialSize{initialSize}, + countDistincts(countDistincts), + allocator(allocator) {} uint32_t stringLengthLimit; std::optional initialSize; + bool countDistincts{false}; + HashStringAllocator* allocator; + + StatisticsBuilderOptions withoutNumDistinct() const { + return StatisticsBuilderOptions(stringLengthLimit, initialSize); + } static StatisticsBuilderOptions fromConfig(const Config& config) { return StatisticsBuilderOptions{config.get(Config::STRING_STATS_LIMIT)}; @@ -90,9 +102,13 @@ struct StatisticsBuilderOptions { /* * Base class for stats builder. Stats builder is used in writer and file merge * to collect and merge stats. + * It can also be used for gathering stats in ad hoc sampling. In this case it + * may also count distinct values if enabled in 'options'. */ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics { public: + /// Constructs with 'options'. If 'options' enable count distinct and + /// 'disableNumDistinct' is true, distinct values will not be counted. explicit StatisticsBuilder(const StatisticsBuilderOptions& options) : options_{options} { init(); @@ -132,6 +148,18 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics { } } + template + void addHash(const T& data) { + if (hll_) { + hll_->insertHash(folly::hasher()(data)); + } + } + + int64_t cardinality() const { + VELOX_CHECK_NOT_NULL(hll_); + return hll_->cardinality(); + } + /* * Merge stats of same type. This is used in writer to aggregate file level * stats. @@ -170,17 +198,21 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics { hasNull_ = false; rawSize_ = 0; size_ = options_.initialSize; + if (options_.countDistincts) { + hll_ = std::make_shared(options_.allocator); + } } protected: StatisticsBuilderOptions options_; + std::shared_ptr hll_; }; class BooleanStatisticsBuilder : public StatisticsBuilder, public dwio::common::BooleanColumnStatistics { public: explicit BooleanStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options} { + : StatisticsBuilder{options.withoutNumDistinct()} { init(); } @@ -229,6 +261,7 @@ class IntegerStatisticsBuilder : public StatisticsBuilder, max_ = value; } addWithOverflowCheck(sum_, value, count); + addHash(value); } void merge( @@ -278,6 +311,7 @@ class DoubleStatisticsBuilder : public StatisticsBuilder, if (max_.has_value() && value > max_.value()) { max_ = value; } + addHash(value); // value * count sometimes is not same as adding values (count) times. So // add in a loop if (sum_.has_value()) { @@ -342,6 +376,7 @@ class StringStatisticsBuilder : public StatisticsBuilder, max_ = value; } } + addHash(value); addWithOverflowCheck(length_, value.size(), count); } @@ -375,7 +410,7 @@ class BinaryStatisticsBuilder : public StatisticsBuilder, public dwio::common::BinaryColumnStatistics { public: explicit BinaryStatisticsBuilder(const StatisticsBuilderOptions& options) - : StatisticsBuilder{options} { + : StatisticsBuilder{options.withoutNumDistinct()} { init(); } @@ -409,7 +444,7 @@ class MapStatisticsBuilder : public StatisticsBuilder, MapStatisticsBuilder( const Type& type, const StatisticsBuilderOptions& options) - : StatisticsBuilder{options}, + : StatisticsBuilder{options.withoutNumDistinct()}, valueType_{type.as().valueType()} { init(); }