Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optional HLL for distinct value count to StatisticsBuilder #11928

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 15 additions & 2 deletions velox/dwio/common/Statistics.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,11 +81,13 @@ class ColumnStatistics {
std::optional<uint64_t> valueCount,
std::optional<bool> hasNull,
std::optional<uint64_t> rawSize,
std::optional<uint64_t> size)
std::optional<uint64_t> size,
std::optional<int64_t> numDistinct = std::nullopt)
: valueCount_(valueCount),
hasNull_(hasNull),
rawSize_(rawSize),
size_(size) {}
size_(size),
numDistinct_(numDistinct) {}

virtual ~ColumnStatistics() = default;

Expand Down Expand Up @@ -123,6 +125,16 @@ class ColumnStatistics {
return size_;
}

/// Returns the distinct value count stored in this statistics object, or
/// std::nullopt if no count has been set.
std::optional<uint64_t> numDistinct() const {
return numDistinct_;
}

void setNumDistinct(int64_t count) {
xiaoxmeng marked this conversation as resolved.
Show resolved Hide resolved
VELOX_CHECK(
!numDistinct_.has_value(), "numDistinct_ can be set only once.");
numDistinct_ = count;
}

/**
* return string representation of this stats object
*/
Expand All @@ -145,6 +157,7 @@ class ColumnStatistics {
std::optional<bool> hasNull_;
std::optional<uint64_t> rawSize_;
std::optional<uint64_t> size_;
std::optional<uint64_t> numDistinct_;
};

/**
Expand Down
16 changes: 14 additions & 2 deletions velox/dwio/dwrf/test/TestStatisticsBuilderUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,14 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#include <velox/common/memory/HashStringAllocator.h>
#include <velox/common/memory/Memory.h>
#include <cmath>

using namespace facebook::velox::dwio::common;
using namespace facebook::velox;
using namespace facebook::velox::dwrf;

StatisticsBuilderOptions options{16};

template <typename T>
std::shared_ptr<FlatVector<T>> makeFlatVector(
facebook::velox::memory::MemoryPool* pool,
Expand Down Expand Up @@ -58,8 +57,15 @@ class TestStatisticsBuilderUtils : public testing::Test {
memory::MemoryManager::testingSetInstance({});
}

void SetUp() override {
  // No per-test setup is required: 'options' is a fixture member with an
  // in-class initializer. Do not declare a local 'options' here; it would
  // shadow the member and be discarded when SetUp() returns.
}

// Leaf memory pool obtained from memory::memoryManager().
const std::shared_ptr<memory::MemoryPool> pool_ =
memory::memoryManager()->addLeafPool();
// Allocator built over 'pool_'. Keep it declared after 'pool_': members are
// initialized in declaration order and this one reads pool_.get().
std::unique_ptr<HashStringAllocator> allocator_ =
std::make_unique<HashStringAllocator>(pool_.get());
// Options used by the tests: string length limit 16, initial size 100,
// distinct counting enabled, backed by 'allocator_'. Keep it declared after
// 'allocator_' because it stores the raw pointer taken at initialization.
StatisticsBuilderOptions options{16, 100, true, allocator_.get()};
};

TEST_F(TestStatisticsBuilderUtils, addIntegerValues) {
Expand All @@ -85,6 +91,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) {
EXPECT_EQ(10, intStats->getMaximum().value());
EXPECT_EQ(1, intStats->getMinimum().value());
EXPECT_EQ(55, intStats->getSum());
EXPECT_EQ(10, intStats->numDistinct());
}

// add values with null
Expand All @@ -103,6 +110,7 @@ TEST_F(TestStatisticsBuilderUtils, addIntegerValues) {
EXPECT_EQ(10, intStats->getMaximum().value());
EXPECT_EQ(1, intStats->getMinimum().value());
EXPECT_EQ(106, intStats->getSum().value());
EXPECT_EQ(10, intStats->numDistinct());
}
}

Expand All @@ -129,6 +137,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) {
EXPECT_EQ(10, doubleStats->getMaximum().value());
EXPECT_EQ(1, doubleStats->getMinimum().value());
EXPECT_EQ(55, doubleStats->getSum());
EXPECT_EQ(10, doubleStats->numDistinct().value());
}

// add values with null
Expand All @@ -147,6 +156,7 @@ TEST_F(TestStatisticsBuilderUtils, addDoubleValues) {
EXPECT_EQ(10, doubleStats->getMaximum().value());
EXPECT_EQ(1, doubleStats->getMinimum().value());
EXPECT_EQ(106, doubleStats->getSum());
EXPECT_EQ(10, doubleStats->numDistinct().value());
}
}

Expand Down Expand Up @@ -174,6 +184,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) {
EXPECT_EQ("j", strStats->getMaximum().value());
EXPECT_EQ("a", strStats->getMinimum().value());
EXPECT_EQ(10, strStats->getTotalLength());
EXPECT_EQ(10, strStats->numDistinct());
}

// add values with null
Expand All @@ -191,6 +202,7 @@ TEST_F(TestStatisticsBuilderUtils, addStringValues) {
EXPECT_EQ("j", strStats->getMaximum().value());
EXPECT_EQ("a", strStats->getMinimum().value());
EXPECT_EQ(19, strStats->getTotalLength().value());
EXPECT_EQ(10, strStats->numDistinct());
}
}

Expand Down
1 change: 1 addition & 0 deletions velox/dwio/dwrf/writer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ velox_add_library(

velox_link_libraries(
velox_dwio_dwrf_writer
velox_common_hyperloglog
velox_dwio_common
velox_dwio_dwrf_common
velox_dwio_dwrf_utils
Expand Down
17 changes: 15 additions & 2 deletions velox/dwio/dwrf/writer/StatisticsBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,12 @@ void StatisticsBuilder::merge(
// Merge size
mergeCount(size_, other.getSize());
}
if (hll_) {
auto* otherBuilder = dynamic_cast<const StatisticsBuilder*>(&other);
VELOX_CHECK_NOT_NULL(otherBuilder);
VELOX_CHECK_NOT_NULL(otherBuilder->hll_);
hll_->mergeWith(*otherBuilder->hll_);
}
}

void StatisticsBuilder::toProto(proto::ColumnStatistics& stats) const {
Expand All @@ -115,8 +121,15 @@ std::unique_ptr<dwio::common::ColumnStatistics> StatisticsBuilder::build()
proto::ColumnStatistics stats;
toProto(stats);
StatsContext context{WriterVersion_CURRENT};
return buildColumnStatisticsFromProto(
ColumnStatisticsWrapper(&stats), context);
auto result =
buildColumnStatisticsFromProto(ColumnStatisticsWrapper(&stats), context);
// We do not alter the proto since it is part of the file format. The
// distinct count does not exist in the file format but is added here for
// use in on-demand sampling.
if (hll_) {
result->setNumDistinct(hll_->cardinality());
xiaoxmeng marked this conversation as resolved.
Show resolved Hide resolved
}
return result;
}

std::unique_ptr<StatisticsBuilder> StatisticsBuilder::create(
Expand Down
43 changes: 39 additions & 4 deletions velox/dwio/dwrf/writer/StatisticsBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#pragma once

#include <velox/common/base/Exceptions.h>
#include <velox/common/hyperloglog/SparseHll.h>
#include "velox/dwio/dwrf/common/Config.h"
#include "velox/dwio/dwrf/common/Statistics.h"
#include "velox/dwio/dwrf/common/wrap/dwrf-proto-wrapper.h"
Expand Down Expand Up @@ -76,11 +77,22 @@ inline dwio::common::KeyInfo constructKey(const dwrf::proto::KeyInfo& keyInfo) {
struct StatisticsBuilderOptions {
explicit StatisticsBuilderOptions(
uint32_t stringLengthLimit,
std::optional<uint64_t> initialSize = std::nullopt)
: stringLengthLimit{stringLengthLimit}, initialSize{initialSize} {}
std::optional<uint64_t> initialSize = std::nullopt,
bool countDistincts = false,
HashStringAllocator* allocator = nullptr)
: stringLengthLimit{stringLengthLimit},
initialSize{initialSize},
countDistincts(countDistincts),
allocator(allocator) {}

uint32_t stringLengthLimit;
xiaoxmeng marked this conversation as resolved.
Show resolved Hide resolved
std::optional<uint64_t> initialSize;
bool countDistincts{false};
HashStringAllocator* allocator;

/// Builds new options from this object's 'stringLengthLimit' and
/// 'initialSize' only, so the remaining constructor arguments take their
/// defaults (distinct counting off, no allocator).
StatisticsBuilderOptions withoutNumDistinct() const {
  StatisticsBuilderOptions stripped{stringLengthLimit, initialSize};
  return stripped;
}

static StatisticsBuilderOptions fromConfig(const Config& config) {
return StatisticsBuilderOptions{config.get(Config::STRING_STATS_LIMIT)};
Expand All @@ -90,9 +102,12 @@ struct StatisticsBuilderOptions {
/*
* Base class for stats builder. Stats builder is used in writer and file merge
* to collect and merge stats.
* It can also be used for gathering stats in ad hoc sampling. In this case it
* may also count distinct values if enabled in 'options'.
*/
class StatisticsBuilder : public virtual dwio::common::ColumnStatistics {
public:
/// Constructs with 'options'.
explicit StatisticsBuilder(const StatisticsBuilderOptions& options)
: options_{options} {
init();
Expand Down Expand Up @@ -132,6 +147,18 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics {
}
}

/// Inserts the folly::hasher<T> hash of 'data' into 'hll_'. Does nothing
/// when 'hll_' is not set.
template <typename T>
void addHash(const T& data) {
  if (!hll_) {
    return;
  }
  const auto hashed = folly::hasher<T>()(data);
  hll_->insertHash(hashed);
}

/// Returns the cardinality reported by 'hll_'. Fails the
/// VELOX_CHECK_NOT_NULL check when 'hll_' is not set.
int64_t cardinality() const {
VELOX_CHECK_NOT_NULL(hll_);
return hll_->cardinality();
}

/*
* Merge stats of same type. This is used in writer to aggregate file level
* stats.
Expand Down Expand Up @@ -170,17 +197,21 @@ class StatisticsBuilder : public virtual dwio::common::ColumnStatistics {
hasNull_ = false;
rawSize_ = 0;
size_ = options_.initialSize;
if (options_.countDistincts) {
hll_ = std::make_shared<common::hll::SparseHll>(options_.allocator);
}
}

protected:
StatisticsBuilderOptions options_;
std::shared_ptr<common::hll::SparseHll> hll_;
};

class BooleanStatisticsBuilder : public StatisticsBuilder,
public dwio::common::BooleanColumnStatistics {
public:
explicit BooleanStatisticsBuilder(const StatisticsBuilderOptions& options)
: StatisticsBuilder{options} {
: StatisticsBuilder{options.withoutNumDistinct()} {
init();
}

Expand Down Expand Up @@ -229,6 +260,7 @@ class IntegerStatisticsBuilder : public StatisticsBuilder,
max_ = value;
}
addWithOverflowCheck(sum_, value, count);
addHash(value);
}

void merge(
Expand Down Expand Up @@ -278,6 +310,7 @@ class DoubleStatisticsBuilder : public StatisticsBuilder,
if (max_.has_value() && value > max_.value()) {
max_ = value;
}
addHash(value);
// value * count sometimes is not same as adding values (count) times. So
// add in a loop
if (sum_.has_value()) {
Expand Down Expand Up @@ -342,6 +375,7 @@ class StringStatisticsBuilder : public StatisticsBuilder,
max_ = value;
}
}
addHash(value);

addWithOverflowCheck<uint64_t>(length_, value.size(), count);
}
Expand Down Expand Up @@ -375,7 +409,7 @@ class BinaryStatisticsBuilder : public StatisticsBuilder,
public dwio::common::BinaryColumnStatistics {
public:
explicit BinaryStatisticsBuilder(const StatisticsBuilderOptions& options)
: StatisticsBuilder{options} {
: StatisticsBuilder{options.withoutNumDistinct()} {
init();
}

Expand Down Expand Up @@ -412,6 +446,7 @@ class MapStatisticsBuilder : public StatisticsBuilder,
: StatisticsBuilder{options},
valueType_{type.as<velox::TypeKind::MAP>().valueType()} {
init();
hll_.reset();
}

~MapStatisticsBuilder() override = default;
Expand Down
Loading