Skip to content

Commit

Permalink
Add optional HLL for distinct value count to StatisticsBuilder
Browse files Browse the repository at this point in the history
Adds an optional HLL counter for distinct values to
StatisticsBuilder. This is used in Verax sampling to estimate column
cardinalities for scalar types.
  • Loading branch information
Orri Erling committed Dec 26, 2024
1 parent 61b0b39 commit ad54cbe
Show file tree
Hide file tree
Showing 5 changed files with 85 additions and 11 deletions.
17 changes: 15 additions & 2 deletions velox/dwio/common/Statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,13 @@ class ColumnStatistics {
std::optional<uint64_t> valueCount,
std::optional<bool> hasNull,
std::optional<uint64_t> rawSize,
std::optional<uint64_t> size)
std::optional<uint64_t> size,
std::optional<int64_t> numDistinct = std::nullopt)
: valueCount_(valueCount),
hasNull_(hasNull),
rawSize_(rawSize),
size_(size) {}
size_(size),
numDistinct_(numDistinct) {}

virtual ~ColumnStatistics() = default;

Expand Down Expand Up @@ -123,6 +125,16 @@ class ColumnStatistics {
return size_;
}

/// Returns the distinct value count held by this object, or std::nullopt
/// when no count is present.
std::optional<uint64_t> numDistinct() const {
return numDistinct_;
}

/// Records the distinct value count. May be called at most once per object:
/// the call fails if a count is already present.
///
/// @param count Number of distinct values. Must not be negative, because the
/// value is stored as an unsigned 64-bit integer.
void setNumDistinct(int64_t count) {
  VELOX_CHECK(
      !numDistinct_.has_value(), "numDistinct_ can be set only once.");
  // Reject negative input instead of letting it wrap around to a huge
  // unsigned count on conversion to uint64_t.
  VELOX_CHECK(count >= 0, "numDistinct_ must not be negative.");
  numDistinct_ = static_cast<uint64_t>(count);
}

/**
* return string representation of this stats object
*/
Expand All @@ -145,6 +157,7 @@ class ColumnStatistics {
std::optional<bool> hasNull_;
std::optional<uint64_t> rawSize_;
std::optional<uint64_t> size_;
std::optional<uint64_t> numDistinct_;
};

/**
Expand Down
16 changes: 14 additions & 2 deletions velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <velox/common/memory/HashStringAllocator.h>
#include <velox/common/memory/Memory.h>
#include <cmath>

using namespace facebook::velox::dwio::common;
using namespace facebook::velox;
using namespace facebook::velox::dwrf;

StatisticsBuilderOptions options{16};

template <typename T>
std::shared_ptr<FlatVector<T>> makeFlatVector(
facebook::velox::memory::MemoryPool* pool,
Expand Down Expand Up @@ -58,8 +57,15 @@ class TestStatisticsBuilderUtils : public testing::Test {
memory::MemoryManager::testingSetInstance({});
}

/// Per-test setup hook. There is nothing to prepare here: the fixture's
/// 'options' member is set up by its own default member initializer, so no
/// local StatisticsBuilderOptions (which would only shadow that member and be
/// discarded on return) is created.
void SetUp() override {}

const std::shared_ptr<memory::MemoryPool> pool_ =
memory::memoryManager()->addLeafPool();
std::unique_ptr<HashStringAllocator> allocator_ =
std::make_unique<HashStringAllocator>(pool_.get());
StatisticsBuilderOptions options{16, 100, true, allocator_.get()};
};

TEST_F(TestStatisticsBuilderUtils, addIntegerValues) {
Expand All @@ -85,6 +91,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) {
EXPECT_EQ(10, intStats->getMaximum().value());
EXPECT_EQ(1, intStats->getMinimum().value());
EXPECT_EQ(55, intStats->getSum());
EXPECT_EQ(10, intStats->numDistinct());
}

// add values with null
Expand All @@ -103,6 +110,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) {
EXPECT_EQ(10, intStats->getMaximum().value());
EXPECT_EQ(1, intStats->getMinimum().value());
EXPECT_EQ(106, intStats->getSum().value());
EXPECT_EQ(10, intStats->numDistinct());
}
}

Expand All @@ -129,6 +137,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) {
EXPECT_EQ(10, doubleStats->getMaximum().value());
EXPECT_EQ(1, doubleStats->getMinimum().value());
EXPECT_EQ(55, doubleStats->getSum());
EXPECT_EQ(10, doubleStats->numDistinct().value());
}

// add values with null
Expand All @@ -147,6 +156,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) {
EXPECT_EQ(10, doubleStats->getMaximum().value());
EXPECT_EQ(1, doubleStats->getMinimum().value());
EXPECT_EQ(106, doubleStats->getSum());
EXPECT_EQ(10, doubleStats->numDistinct().value());
}
}

Expand Down Expand Up @@ -174,6 +184,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) {
EXPECT_EQ("j", strStats->getMaximum().value());
EXPECT_EQ("a", strStats->getMinimum().value());
EXPECT_EQ(10, strStats->getTotalLength());
EXPECT_EQ(10, strStats->numDistinct());
}

// add values with null
Expand All @@ -191,6 +202,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) {
EXPECT_EQ("j", strStats->getMaximum().value());
EXPECT_EQ("a", strStats->getMinimum().value());
EXPECT_EQ(19, strStats->getTotalLength().value());
EXPECT_EQ(10, strStats->numDistinct());
}
}

Expand Down
1 change: 1 addition & 0 deletions velox/dwio/dwrf/writer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ velox_add_library(

velox_link_libraries(
velox_dwio_dwrf_writer
velox_common_hyperloglog
velox_dwio_common
velox_dwio_dwrf_common
velox_dwio_dwrf_utils
Expand Down
17 changes: 15 additions & 2 deletions velox/dwio/dwrf/writer/StatisticsBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ void StatisticsBuilder::merge(
// Merge size
mergeCount(size_, other.getSize());
}
if (hll_) {
auto* otherBuilder = dynamic_cast<const StatisticsBuilder*>(&other);
VELOX_CHECK_NOT_NULL(otherBuilder);
VELOX_CHECK_NOT_NULL(otherBuilder->hll_);
hll_->mergeWith(*otherBuilder->hll_);
}
}

void StatisticsBuilder::toProto(proto::ColumnStatistics& stats) const {
Expand All @@ -115,8 +121,15 @@ std::unique_ptr<dwio::common::ColumnStatistics> StatisticsBuilder::build()
proto::ColumnStatistics stats;
toProto(stats);
StatsContext context{WriterVersion_CURRENT};
return buildColumnStatisticsFromProto(
ColumnStatisticsWrapper(&stats), context);
auto result =
buildColumnStatisticsFromProto(ColumnStatisticsWrapper(&stats), context);
// We do not alter the proto since it is part of the file format.
// The distinct count does not exist in the file format but is
// added here for use in on-demand sampling.
if (hll_) {
result->setNumDistinct(hll_->cardinality());
}
return result;
}

std::unique_ptr<StatisticsBuilder> StatisticsBuilder::create(
Expand Down
45 changes: 40 additions & 5 deletions velox/dwio/dwrf/writer/StatisticsBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once

#include <velox/common/base/Exceptions.h>
#include <velox/common/hyperloglog/SparseHll.h>
#include "velox/dwio/dwrf/common/Config.h"
#include "velox/dwio/dwrf/common/Statistics.h"
#include "velox/dwio/dwrf/common/wrap/dwrf-proto-wrapper.h"
Expand Down Expand Up @@ -76,11 +77,22 @@ inline dwio::common::KeyInfo constructKey(const dwrf::proto::KeyInfo& keyInfo) {
struct StatisticsBuilderOptions {
explicit StatisticsBuilderOptions(
uint32_t stringLengthLimit,
std::optional<uint64_t> initialSize = std::nullopt)
: stringLengthLimit{stringLengthLimit}, initialSize{initialSize} {}
std::optional<uint64_t> initialSize = std::nullopt,
bool countDistincts = false,
HashStringAllocator* allocator = nullptr)
: stringLengthLimit{stringLengthLimit},
initialSize{initialSize},
countDistincts(countDistincts),
allocator(allocator) {}

uint32_t stringLengthLimit;
std::optional<uint64_t> initialSize;
bool countDistincts{false};
HashStringAllocator* allocator;

/// Returns new options that carry over only 'stringLengthLimit' and
/// 'initialSize'. The distinct-count flag and the allocator take the
/// constructor's default values.
StatisticsBuilderOptions withoutNumDistinct() const {
  StatisticsBuilderOptions stripped{stringLengthLimit, initialSize};
  return stripped;
}

static StatisticsBuilderOptions fromConfig(const Config& config) {
return StatisticsBuilderOptions{config.get(Config::STRING_STATS_LIMIT)};
Expand All @@ -90,9 +102,13 @@ struct StatisticsBuilderOptions {
/*
* Base class for stats builder. Stats builder is used in writer and file merge
* to collect and merge stats.
* It can also be used for gathering stats in ad hoc sampling. In this case it
* may also count distinct values if enabled in 'options'.
*/
class StatisticsBuilder : public virtual dwio::common::ColumnStatistics {
public:
/// Constructs with 'options'. Distinct values are counted only if
/// 'options.countDistincts' is true.
explicit StatisticsBuilder(const StatisticsBuilderOptions& options)
: options_{options} {
init();
Expand Down Expand Up @@ -132,6 +148,18 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics {
}
}

/// Feeds the hash of 'data' (computed with folly::hasher<T>) into the
/// distinct value counter 'hll_'. Does nothing when 'hll_' is null.
template <typename T>
void addHash(const T& data) {
  if (!hll_) {
    return;
  }
  const auto hashed = folly::hasher<T>()(data);
  hll_->insertHash(hashed);
}

/// Returns the cardinality reported by the distinct value counter 'hll_'.
/// Fails the VELOX_CHECK_NOT_NULL check when 'hll_' is null.
int64_t cardinality() const {
VELOX_CHECK_NOT_NULL(hll_);
return hll_->cardinality();
}

/*
* Merge stats of same type. This is used in writer to aggregate file level
* stats.
Expand Down Expand Up @@ -170,17 +198,21 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics {
hasNull_ = false;
rawSize_ = 0;
size_ = options_.initialSize;
if (options_.countDistincts) {
hll_ = std::make_shared<common::hll::SparseHll>(options_.allocator);
}
}

protected:
StatisticsBuilderOptions options_;
std::shared_ptr<common::hll::SparseHll> hll_;
};

class BooleanStatisticsBuilder : public StatisticsBuilder,
public dwio::common::BooleanColumnStatistics {
public:
explicit BooleanStatisticsBuilder(const StatisticsBuilderOptions& options)
: StatisticsBuilder{options} {
: StatisticsBuilder{options.withoutNumDistinct()} {
init();
}

Expand Down Expand Up @@ -229,6 +261,7 @@ class IntegerStatisticsBuilder : public StatisticsBuilder,
max_ = value;
}
addWithOverflowCheck(sum_, value, count);
addHash(value);
}

void merge(
Expand Down Expand Up @@ -278,6 +311,7 @@ class DoubleStatisticsBuilder : public StatisticsBuilder,
if (max_.has_value() && value > max_.value()) {
max_ = value;
}
addHash(value);
// value * count sometimes is not same as adding values (count) times. So
// add in a loop
if (sum_.has_value()) {
Expand Down Expand Up @@ -342,6 +376,7 @@ class StringStatisticsBuilder : public StatisticsBuilder,
max_ = value;
}
}
addHash(value);

addWithOverflowCheck<uint64_t>(length_, value.size(), count);
}
Expand Down Expand Up @@ -375,7 +410,7 @@ class BinaryStatisticsBuilder : public StatisticsBuilder,
public dwio::common::BinaryColumnStatistics {
public:
explicit BinaryStatisticsBuilder(const StatisticsBuilderOptions& options)
: StatisticsBuilder{options} {
: StatisticsBuilder{options.withoutNumDistinct()} {
init();
}

Expand Down Expand Up @@ -409,7 +444,7 @@ class MapStatisticsBuilder : public StatisticsBuilder,
MapStatisticsBuilder(
const Type& type,
const StatisticsBuilderOptions& options)
: StatisticsBuilder{options},
: StatisticsBuilder{options.withoutNumDistinct()},
valueType_{type.as<velox::TypeKind::MAP>().valueType()} {
init();
}
Expand Down

0 comments on commit ad54cbe

Please sign in to comment.