Skip to content

Commit

Permalink
Add optional HLL for distinct value count to StatisticsBuilder (facebookincubator#11928)
Browse files Browse the repository at this point in the history

Summary:
Adds an optional HLL counter for distinct values to StatisticsBuilder. This is used in Verax sampling to estimate column cardinalities for scalar types.

Pull Request resolved: facebookincubator#11928

Reviewed By: xiaoxmeng

Differential Revision: D67565289

Pulled By: oerling

fbshipit-source-id: e5cbe96059a932377a3d977bca816be6ec52e038
  • Loading branch information
Orri Erling authored and athmaja-n committed Jan 10, 2025
1 parent 23c5845 commit 3aba4b8
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 10 deletions.
17 changes: 15 additions & 2 deletions velox/dwio/common/Statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,13 @@ class ColumnStatistics {
std::optional<uint64_t> valueCount,
std::optional<bool> hasNull,
std::optional<uint64_t> rawSize,
std::optional<uint64_t> size)
std::optional<uint64_t> size,
std::optional<int64_t> numDistinct = std::nullopt)
: valueCount_(valueCount),
hasNull_(hasNull),
rawSize_(rawSize),
size_(size) {}
size_(size),
numDistinct_(numDistinct) {}

virtual ~ColumnStatistics() = default;

Expand Down Expand Up @@ -123,6 +125,16 @@ class ColumnStatistics {
return size_;
}

/// Returns the number of distinct values recorded for this column, or
/// std::nullopt if no distinct count has been set.
std::optional<uint64_t> numDistinct() const {
return numDistinct_;
}

/// Records the distinct value count for this column.
///
/// The count may be set at most once; a second call fails the check. The
/// value is stored in an unsigned field, so a negative 'count' is rejected
/// instead of being silently converted to a very large unsigned number.
void setNumDistinct(int64_t count) {
  VELOX_CHECK(
      !numDistinct_.has_value(), "numDistinct_ can be set only once.");
  VELOX_CHECK(count >= 0, "numDistinct_ cannot be negative.");
  numDistinct_ = static_cast<uint64_t>(count);
}

/**
* return string representation of this stats object
*/
Expand All @@ -145,6 +157,7 @@ class ColumnStatistics {
std::optional<bool> hasNull_;
std::optional<uint64_t> rawSize_;
std::optional<uint64_t> size_;
std::optional<uint64_t> numDistinct_;
};

/**
Expand Down
16 changes: 14 additions & 2 deletions velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <velox/common/memory/HashStringAllocator.h>
#include <velox/common/memory/Memory.h>
#include <cmath>

using namespace facebook::velox::dwio::common;
using namespace facebook::velox;
using namespace facebook::velox::dwrf;

StatisticsBuilderOptions options{16};

template <typename T>
std::shared_ptr<FlatVector<T>> makeFlatVector(
facebook::velox::memory::MemoryPool* pool,
Expand Down Expand Up @@ -58,8 +57,15 @@ class TestStatisticsBuilderUtils : public testing::Test {
memory::MemoryManager::testingSetInstance({});
}

void SetUp() override {
  // Intentionally empty. A local 'options' variable used to be declared
  // here, but it was never read and only shadowed the fixture member of
  // the same name, so it had no effect on the tests.
}

const std::shared_ptr<memory::MemoryPool> pool_ =
memory::memoryManager()->addLeafPool();
std::unique_ptr<HashStringAllocator> allocator_ =
std::make_unique<HashStringAllocator>(pool_.get());
StatisticsBuilderOptions options{16, 100, true, allocator_.get()};
};

TEST_F(TestStatisticsBuilderUtils, addIntegerValues) {
Expand All @@ -85,6 +91,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) {
EXPECT_EQ(10, intStats->getMaximum().value());
EXPECT_EQ(1, intStats->getMinimum().value());
EXPECT_EQ(55, intStats->getSum());
EXPECT_EQ(10, intStats->numDistinct());
}

// add values with null
Expand All @@ -103,6 +110,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) {
EXPECT_EQ(10, intStats->getMaximum().value());
EXPECT_EQ(1, intStats->getMinimum().value());
EXPECT_EQ(106, intStats->getSum().value());
EXPECT_EQ(10, intStats->numDistinct());
}
}

Expand All @@ -129,6 +137,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) {
EXPECT_EQ(10, doubleStats->getMaximum().value());
EXPECT_EQ(1, doubleStats->getMinimum().value());
EXPECT_EQ(55, doubleStats->getSum());
EXPECT_EQ(10, doubleStats->numDistinct().value());
}

// add values with null
Expand All @@ -147,6 +156,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) {
EXPECT_EQ(10, doubleStats->getMaximum().value());
EXPECT_EQ(1, doubleStats->getMinimum().value());
EXPECT_EQ(106, doubleStats->getSum());
EXPECT_EQ(10, doubleStats->numDistinct().value());
}
}

Expand Down Expand Up @@ -174,6 +184,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) {
EXPECT_EQ("j", strStats->getMaximum().value());
EXPECT_EQ("a", strStats->getMinimum().value());
EXPECT_EQ(10, strStats->getTotalLength());
EXPECT_EQ(10, strStats->numDistinct());
}

// add values with null
Expand All @@ -191,6 +202,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) {
EXPECT_EQ("j", strStats->getMaximum().value());
EXPECT_EQ("a", strStats->getMinimum().value());
EXPECT_EQ(19, strStats->getTotalLength().value());
EXPECT_EQ(10, strStats->numDistinct());
}
}

Expand Down
1 change: 1 addition & 0 deletions velox/dwio/dwrf/writer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ velox_add_library(

velox_link_libraries(
velox_dwio_dwrf_writer
velox_common_hyperloglog
velox_dwio_common
velox_dwio_dwrf_common
velox_dwio_dwrf_utils
Expand Down
17 changes: 15 additions & 2 deletions velox/dwio/dwrf/writer/StatisticsBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ void StatisticsBuilder::merge(
// Merge size
mergeCount(size_, other.getSize());
}
if (hll_) {
auto* otherBuilder = dynamic_cast<const StatisticsBuilder*>(&other);
VELOX_CHECK_NOT_NULL(otherBuilder);
VELOX_CHECK_NOT_NULL(otherBuilder->hll_);
hll_->mergeWith(*otherBuilder->hll_);
}
}

void StatisticsBuilder::toProto(proto::ColumnStatistics& stats) const {
Expand All @@ -115,8 +121,15 @@ std::unique_ptr<dwio::common::ColumnStatistics> StatisticsBuilder::build()
proto::ColumnStatistics stats;
toProto(stats);
StatsContext context{WriterVersion_CURRENT};
return buildColumnStatisticsFromProto(
ColumnStatisticsWrapper(&stats), context);
auto result =
buildColumnStatisticsFromProto(ColumnStatisticsWrapper(&stats), context);
// We do not alter the proto since it is part of the file format. The
// distinct count does not exist in the file format but is added here
// for use in on-demand sampling.
if (hll_) {
result->setNumDistinct(hll_->cardinality());
}
return result;
}

std::unique_ptr<StatisticsBuilder> StatisticsBuilder::create(
Expand Down
43 changes: 39 additions & 4 deletions velox/dwio/dwrf/writer/StatisticsBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once

#include <velox/common/base/Exceptions.h>
#include <velox/common/hyperloglog/SparseHll.h>
#include "velox/dwio/dwrf/common/Config.h"
#include "velox/dwio/dwrf/common/Statistics.h"
#include "velox/dwio/dwrf/common/wrap/dwrf-proto-wrapper.h"
Expand Down Expand Up @@ -76,11 +77,22 @@ inline dwio::common::KeyInfo constructKey(const dwrf::proto::KeyInfo& keyInfo) {
struct StatisticsBuilderOptions {
explicit StatisticsBuilderOptions(
uint32_t stringLengthLimit,
std::optional<uint64_t> initialSize = std::nullopt)
: stringLengthLimit{stringLengthLimit}, initialSize{initialSize} {}
std::optional<uint64_t> initialSize = std::nullopt,
bool countDistincts = false,
HashStringAllocator* allocator = nullptr)
: stringLengthLimit{stringLengthLimit},
initialSize{initialSize},
countDistincts(countDistincts),
allocator(allocator) {}

uint32_t stringLengthLimit;
std::optional<uint64_t> initialSize;
bool countDistincts{false};
HashStringAllocator* allocator;

/// Returns a copy of these options with distinct value counting turned off:
/// the string length limit and initial size are kept, while the distinct
/// counting flag and the allocator are reset to their disabled values.
StatisticsBuilderOptions withoutNumDistinct() const {
  return StatisticsBuilderOptions{
      stringLengthLimit,
      initialSize,
      /*countDistincts=*/false,
      /*allocator=*/nullptr};
}

static StatisticsBuilderOptions fromConfig(const Config& config) {
return StatisticsBuilderOptions{config.get(Config::STRING_STATS_LIMIT)};
Expand All @@ -90,9 +102,12 @@ struct StatisticsBuilderOptions {
/*
* Base class for stats builder. Stats builder is used in writer and file merge
* to collect and merge stats.
* It can also be used for gathering stats in ad hoc sampling. In this case it
* may also count distinct values if enabled in 'options'.
*/
class StatisticsBuilder : public virtual dwio::common::ColumnStatistics {
public:
/// Constructs with 'options'.
explicit StatisticsBuilder(const StatisticsBuilderOptions& options)
: options_{options} {
init();
Expand Down Expand Up @@ -132,6 +147,18 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics {
}
}

/// Feeds the hash of 'data' into the distinct value counter. Does nothing
/// when no counter has been created for this builder.
template <typename T>
void addHash(const T& data) {
  if (!hll_) {
    return;
  }
  const auto hash = folly::hasher<T>()(data);
  hll_->insertHash(hash);
}

/// Returns the cardinality reported by the distinct value counter. Fails
/// the check if this builder has no counter.
int64_t cardinality() const {
  VELOX_CHECK_NOT_NULL(hll_);
  const int64_t estimate = hll_->cardinality();
  return estimate;
}

/*
* Merge stats of same type. This is used in writer to aggregate file level
* stats.
Expand Down Expand Up @@ -170,17 +197,21 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics {
hasNull_ = false;
rawSize_ = 0;
size_ = options_.initialSize;
if (options_.countDistincts) {
hll_ = std::make_shared<common::hll::SparseHll>(options_.allocator);
}
}

protected:
StatisticsBuilderOptions options_;
std::shared_ptr<common::hll::SparseHll> hll_;
};

class BooleanStatisticsBuilder : public StatisticsBuilder,
public dwio::common::BooleanColumnStatistics {
public:
explicit BooleanStatisticsBuilder(const StatisticsBuilderOptions& options)
: StatisticsBuilder{options} {
: StatisticsBuilder{options.withoutNumDistinct()} {
init();
}

Expand Down Expand Up @@ -229,6 +260,7 @@ class IntegerStatisticsBuilder : public StatisticsBuilder,
max_ = value;
}
addWithOverflowCheck(sum_, value, count);
addHash(value);
}

void merge(
Expand Down Expand Up @@ -278,6 +310,7 @@ class DoubleStatisticsBuilder : public StatisticsBuilder,
if (max_.has_value() && value > max_.value()) {
max_ = value;
}
addHash(value);
// value * count sometimes is not same as adding values (count) times. So
// add in a loop
if (sum_.has_value()) {
Expand Down Expand Up @@ -342,6 +375,7 @@ class StringStatisticsBuilder : public StatisticsBuilder,
max_ = value;
}
}
addHash(value);

addWithOverflowCheck<uint64_t>(length_, value.size(), count);
}
Expand Down Expand Up @@ -375,7 +409,7 @@ class BinaryStatisticsBuilder : public StatisticsBuilder,
public dwio::common::BinaryColumnStatistics {
public:
explicit BinaryStatisticsBuilder(const StatisticsBuilderOptions& options)
: StatisticsBuilder{options} {
: StatisticsBuilder{options.withoutNumDistinct()} {
init();
}

Expand Down Expand Up @@ -412,6 +446,7 @@ class MapStatisticsBuilder : public StatisticsBuilder,
: StatisticsBuilder{options},
valueType_{type.as<velox::TypeKind::MAP>().valueType()} {
init();
hll_.reset();
}

~MapStatisticsBuilder() override = default;
Expand Down

0 comments on commit 3aba4b8

Please sign in to comment.