Skip to content

Commit

Permalink
Add null checks to zone map (#4642)
Browse files Browse the repository at this point in the history
  • Loading branch information
royi-luo authored Dec 20, 2024
1 parent 3f52346 commit ea98fb1
Show file tree
Hide file tree
Showing 19 changed files with 267 additions and 75 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
-NAME zonemap-node-null
-QUERY MATCH (c:Comment) WHERE c.length IS NULL RETURN *;
---- 0
4 changes: 4 additions & 0 deletions src/common/expression_type.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,10 @@ std::string ExpressionTypeUtil::toParsableString(ExpressionType type) {
return "<";
case ExpressionType::LESS_THAN_EQUALS:
return "<=";
case ExpressionType::IS_NULL:
return "IS NULL";
case ExpressionType::IS_NOT_NULL:
return "IS NOT NULL";
default:
throw RuntimeException(stringFormat(
"ExpressionTypeUtil::toParsableString not implemented for {}", toString(type)));
Expand Down
13 changes: 8 additions & 5 deletions src/include/storage/predicate/column_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
namespace kuzu {
namespace storage {

struct ColumnChunkStats;
struct MergedColumnChunkStats;

class ColumnPredicate;
class KUZU_API ColumnPredicateSet {
Expand All @@ -18,9 +18,10 @@ class KUZU_API ColumnPredicateSet {
void addPredicate(std::unique_ptr<ColumnPredicate> predicate) {
predicates.push_back(std::move(predicate));
}
void tryAddPredicate(const binder::Expression& column, const binder::Expression& predicate);
bool isEmpty() const { return predicates.empty(); }

common::ZoneMapCheckResult checkZoneMap(const ColumnChunkStats& stats) const;
common::ZoneMapCheckResult checkZoneMap(const MergedColumnChunkStats& stats) const;

std::string toString() const;

Expand All @@ -34,13 +35,14 @@ class KUZU_API ColumnPredicateSet {

class KUZU_API ColumnPredicate {
public:
explicit ColumnPredicate(std::string columnName) : columnName{std::move(columnName)} {}
ColumnPredicate(std::string columnName, common::ExpressionType expressionType)
: columnName{std::move(columnName)}, expressionType(expressionType) {}

virtual ~ColumnPredicate() = default;

virtual common::ZoneMapCheckResult checkZoneMap(const ColumnChunkStats& stats) const = 0;
virtual common::ZoneMapCheckResult checkZoneMap(const MergedColumnChunkStats& stats) const = 0;

virtual std::string toString() = 0;
virtual std::string toString();

virtual std::unique_ptr<ColumnPredicate> copy() const = 0;

Expand All @@ -51,6 +53,7 @@ class KUZU_API ColumnPredicate {

protected:
std::string columnName;
common::ExpressionType expressionType;
};

struct ColumnPredicateUtil {
Expand Down
6 changes: 2 additions & 4 deletions src/include/storage/predicate/constant_predicate.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,9 @@ class ColumnConstantPredicate : public ColumnPredicate {
public:
ColumnConstantPredicate(std::string columnName, common::ExpressionType expressionType,
common::Value value)
: ColumnPredicate{std::move(columnName)}, expressionType{expressionType},
value{std::move(value)} {}
: ColumnPredicate{std::move(columnName), expressionType}, value{std::move(value)} {}

common::ZoneMapCheckResult checkZoneMap(const ColumnChunkStats& stats) const override;
common::ZoneMapCheckResult checkZoneMap(const MergedColumnChunkStats& stats) const override;

std::string toString() override;

Expand All @@ -23,7 +22,6 @@ class ColumnConstantPredicate : public ColumnPredicate {
}

private:
common::ExpressionType expressionType;
common::Value value;
};

Expand Down
25 changes: 25 additions & 0 deletions src/include/storage/predicate/null_predicate.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once

#include "column_predicate.h"
#include "common/enums/expression_type.h"

namespace kuzu {
namespace storage {

class ColumnNullPredicate : public ColumnPredicate {
public:
explicit ColumnNullPredicate(std::string columnName, common::ExpressionType type)
: ColumnPredicate{std::move(columnName), type} {
KU_ASSERT(
type == common::ExpressionType::IS_NULL || type == common::ExpressionType::IS_NOT_NULL);
}

common::ZoneMapCheckResult checkZoneMap(const MergedColumnChunkStats& stats) const override;

std::unique_ptr<ColumnPredicate> copy() const override {
return std::make_unique<ColumnNullPredicate>(columnName, expressionType);
}
};

} // namespace storage
} // namespace kuzu
3 changes: 3 additions & 0 deletions src/include/storage/store/column_chunk.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,9 @@ class ColumnChunk {
void loadFromDisk() { data->loadFromDisk(); }
uint64_t spillToDisk() { return data->spillToDisk(); }

MergedColumnChunkStats getMergedColumnChunkStats(
const transaction::Transaction* transaction) const;

private:
void scanCommittedUpdates(const transaction::Transaction* transaction, ColumnChunkData& output,
common::offset_t startOffsetInOutput, common::row_idx_t startRowScanned,
Expand Down
32 changes: 19 additions & 13 deletions src/include/storage/store/column_chunk_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@ class ColumnChunkData {
void loadFromDisk();
uint64_t spillToDisk();

ColumnChunkStats getMergedColumnChunkStats() const;
MergedColumnChunkStats getMergedColumnChunkStats() const;

void updateStats(const common::ValueVector* vector, const common::SelectionVector& selVector);

Expand Down Expand Up @@ -325,37 +325,45 @@ class BoolChunkData : public ColumnChunkData {
class NullChunkData final : public BoolChunkData {
public:
NullChunkData(MemoryManager& mm, uint64_t capacity, bool enableCompression, ResidencyState type)
: BoolChunkData(mm, capacity, enableCompression, type, false /*hasNullData*/),
mayHaveNullValue{false} {}
: BoolChunkData(mm, capacity, enableCompression, type, false /*hasNullData*/) {}
NullChunkData(MemoryManager& mm, bool enableCompression, const ColumnChunkMetadata& metadata)
: BoolChunkData{mm, enableCompression, metadata, false /*hasNullData*/},
mayHaveNullValue{false} {}
: BoolChunkData{mm, enableCompression, metadata, false /*hasNullData*/} {}

// Maybe this should be combined with BoolChunkData if the only difference is these
// functions?
bool isNull(common::offset_t pos) const { return getValue<bool>(pos); }
void setNull(common::offset_t pos, bool isNull);

bool mayHaveNull() const { return mayHaveNullValue; }
bool noNullsGuaranteedInMem() const {
return !inMemoryStats.max || !inMemoryStats.max->get<bool>();
}
bool allNullsGuaranteedInMem() const {
return !inMemoryStats.min || inMemoryStats.min->get<bool>();
}
bool haveNoNullsGuaranteed() const;
bool haveAllNullsGuaranteed() const;

void resetToEmpty() override {
resetToNoNull();
numValues = 0;
}
void resetToNoNull() {
memset(getData(), 0 /* non null */, getBufferSize());
mayHaveNullValue = false;
inMemoryStats.min = inMemoryStats.max = false;
}
void resetToAllNull() override {
memset(getData(), 0xFF /* null */, getBufferSize());
mayHaveNullValue = true;
inMemoryStats.min = inMemoryStats.max = true;
}

void copyFromBuffer(uint64_t* srcBuffer, uint64_t srcOffset, uint64_t dstOffset,
uint64_t numBits, bool invert = false) {
if (common::NullMask::copyNullMask(srcBuffer, srcOffset, getData<uint64_t>(), dstOffset,
numBits, invert)) {
mayHaveNullValue = true;
// we pessimistically assume that the buffer contains both true/false values
// so the zone map won't skip scans
inMemoryStats.min = false;
inMemoryStats.max = true;
}
}

Expand All @@ -376,11 +384,9 @@ class NullChunkData final : public BoolChunkData {
common::Deserializer& deSer);

common::NullMask getNullMask() const {
return common::NullMask(std::span(getData<uint64_t>(), capacity / 64), mayHaveNullValue);
return common::NullMask(std::span(getData<uint64_t>(), capacity / 64),
!noNullsGuaranteedInMem());
}

protected:
bool mayHaveNullValue;
};

class InternalIDChunkData final : public ColumnChunkData {
Expand Down
14 changes: 13 additions & 1 deletion src/include/storage/store/column_chunk_stats.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,20 @@ struct ColumnChunkStats {
common::PhysicalTypeID dataType);
void update(StorageValue val, common::PhysicalTypeID dataType);
void update(uint8_t* data, uint64_t offset, uint64_t numValues,
common::PhysicalTypeID physicalType);
const common::NullMask* nullMask, common::PhysicalTypeID physicalType);
void reset();
};

struct MergedColumnChunkStats {
MergedColumnChunkStats(ColumnChunkStats stats, bool guaranteedNoNulls, bool guaranteedAllNulls)
: stats(stats), guaranteedNoNulls(guaranteedNoNulls),
guaranteedAllNulls(guaranteedAllNulls) {}

ColumnChunkStats stats;
bool guaranteedNoNulls;
bool guaranteedAllNulls;

void merge(const MergedColumnChunkStats& o, common::PhysicalTypeID dataType);
};

} // namespace kuzu::storage
1 change: 1 addition & 0 deletions src/storage/predicate/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
add_library(kuzu_storage_predicate
OBJECT
null_predicate.cpp
column_predicate.cpp
constant_predicate.cpp)

Expand Down
41 changes: 37 additions & 4 deletions src/storage/predicate/column_predicate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
#include "binder/expression/literal_expression.h"
#include "binder/expression/scalar_function_expression.h"
#include "storage/predicate/constant_predicate.h"
#include "storage/predicate/null_predicate.h"

using namespace kuzu::binder;
using namespace kuzu::common;

namespace kuzu {
namespace storage {

ZoneMapCheckResult ColumnPredicateSet::checkZoneMap(const ColumnChunkStats& stats) const {
ZoneMapCheckResult ColumnPredicateSet::checkZoneMap(const MergedColumnChunkStats& stats) const {
for (auto& predicate : predicates) {
if (predicate->checkZoneMap(stats) == ZoneMapCheckResult::SKIP_SCAN) {
return ZoneMapCheckResult::SKIP_SCAN;
Expand Down Expand Up @@ -45,9 +46,12 @@ static bool isCastedColumnRef(const Expression& expr) {
return false;
}

static bool isColumnOrCastedColumnRef(const Expression& expr) {
return isColumnRef(expr.expressionType) || isCastedColumnRef(expr);
}

static bool isColumnRefConstantPair(const Expression& left, const Expression& right) {
return (isColumnRef(left.expressionType) || isCastedColumnRef(left)) &&
right.expressionType == ExpressionType::LITERAL;
return isColumnOrCastedColumnRef(left) && right.expressionType == ExpressionType::LITERAL;
}

static bool columnMatchesExprChild(const Expression& column, const Expression& expr) {
Expand Down Expand Up @@ -78,12 +82,41 @@ static std::unique_ptr<ColumnPredicate> tryConvertToConstColumnPredicate(const E
return nullptr;
}

static std::unique_ptr<ColumnPredicate> tryConvertToIsNull(const Expression& column,
const Expression& predicate) {
// we only convert simple predicates
if (isColumnOrCastedColumnRef(*predicate.getChild(0))) {
return std::make_unique<ColumnNullPredicate>(column.toString(), ExpressionType::IS_NULL);
}
return nullptr;
}

static std::unique_ptr<ColumnPredicate> tryConvertToIsNotNull(const Expression& column,
const Expression& predicate) {
if (isColumnOrCastedColumnRef(*predicate.getChild(0))) {
return std::make_unique<ColumnNullPredicate>(column.toString(),
ExpressionType::IS_NOT_NULL);
}
return nullptr;
}

std::unique_ptr<ColumnPredicate> ColumnPredicateUtil::tryConvert(const Expression& property,
const Expression& predicate) {
if (ExpressionTypeUtil::isComparison(predicate.expressionType)) {
return tryConvertToConstColumnPredicate(property, predicate);
}
return nullptr;
switch (predicate.expressionType) {
case common::ExpressionType::IS_NULL:
return tryConvertToIsNull(property, predicate);
case common::ExpressionType::IS_NOT_NULL:
return tryConvertToIsNotNull(property, predicate);
default:
return nullptr;
}
}

std::string ColumnPredicate::toString() {
return stringFormat("{} {}", columnName, ExpressionTypeUtil::toParsableString(expressionType));
}

} // namespace storage
Expand Down
16 changes: 8 additions & 8 deletions src/storage/predicate/constant_predicate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@ bool inRange(T min, T max, T val) {
}

template<typename T>
ZoneMapCheckResult checkZoneMapSwitch(const ColumnChunkStats& stats, ExpressionType expressionType,
const Value& value) {
KU_ASSERT(stats.min.has_value() && stats.max.has_value());
auto max = stats.max->get<T>();
auto min = stats.min->get<T>();
ZoneMapCheckResult checkZoneMapSwitch(const MergedColumnChunkStats& mergedStats,
ExpressionType expressionType, const Value& value) {
KU_ASSERT(mergedStats.stats.min.has_value() && mergedStats.stats.max.has_value());
auto max = mergedStats.stats.max->get<T>();
auto min = mergedStats.stats.min->get<T>();
auto constant = value.getValue<T>();
switch (expressionType) {
case ExpressionType::EQUALS: {
Expand Down Expand Up @@ -62,7 +62,8 @@ ZoneMapCheckResult checkZoneMapSwitch(const ColumnChunkStats& stats, ExpressionT
return ZoneMapCheckResult::ALWAYS_SCAN;
}

ZoneMapCheckResult ColumnConstantPredicate::checkZoneMap(const ColumnChunkStats& stats) const {
ZoneMapCheckResult ColumnConstantPredicate::checkZoneMap(
const MergedColumnChunkStats& stats) const {
auto physicalType = value.getDataType().getPhysicalType();
return TypeUtils::visit(
physicalType,
Expand All @@ -84,8 +85,7 @@ std::string ColumnConstantPredicate::toString() {
} else {
valStr = value.toString();
}
return stringFormat("{} {} {}", columnName,
ExpressionTypeUtil::toParsableString(expressionType), valStr);
return stringFormat("{} {}", ColumnPredicate::toString(), valStr);
}

} // namespace storage
Expand Down
15 changes: 15 additions & 0 deletions src/storage/predicate/null_predicate.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#include "storage/predicate/null_predicate.h"

#include "storage/store/column_chunk_stats.h"

namespace kuzu::storage {
common::ZoneMapCheckResult ColumnNullPredicate::checkZoneMap(
const MergedColumnChunkStats& mergedStats) const {
const bool statToCheck = (expressionType == common::ExpressionType::IS_NULL) ?
mergedStats.guaranteedNoNulls :
mergedStats.guaranteedAllNulls;
return statToCheck ? common::ZoneMapCheckResult::SKIP_SCAN :
common::ZoneMapCheckResult::ALWAYS_SCAN;
}

} // namespace kuzu::storage
13 changes: 4 additions & 9 deletions src/storage/store/chunked_node_group.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,8 @@ void ChunkedNodeGroup::write(const ChunkedNodeGroup& data, column_id_t offsetCol
}
}

static ZoneMapCheckResult getZoneMapResult(const TableScanState& scanState,
const std::vector<std::unique_ptr<ColumnChunk>>& chunks) {
static ZoneMapCheckResult getZoneMapResult(const Transaction* transaction,
const TableScanState& scanState, const std::vector<std::unique_ptr<ColumnChunk>>& chunks) {
if (!scanState.columnPredicateSets.empty()) {
for (auto i = 0u; i < scanState.columnIDs.size(); i++) {
const auto columnID = scanState.columnIDs[i];
Expand All @@ -206,12 +206,7 @@ static ZoneMapCheckResult getZoneMapResult(const TableScanState& scanState,

KU_ASSERT(i < scanState.columnPredicateSets.size());
const auto columnZoneMapResult = scanState.columnPredicateSets[i].checkZoneMap(
chunks[columnID]->getData().getMergedColumnChunkStats());
RUNTIME_CHECK(const bool columnHasStorageValueType =
TypeUtils::visit(chunks[columnID]->getDataType().getPhysicalType(),
[]<typename T>(T) { return StorageValueType<T>; }));
KU_ASSERT(columnHasStorageValueType ||
columnZoneMapResult == common::ZoneMapCheckResult::ALWAYS_SCAN);
chunks[columnID]->getMergedColumnChunkStats(transaction));
if (columnZoneMapResult == common::ZoneMapCheckResult::SKIP_SCAN) {
return common::ZoneMapCheckResult::SKIP_SCAN;
}
Expand All @@ -225,7 +220,7 @@ void ChunkedNodeGroup::scan(const Transaction* transaction, const TableScanState
length_t numRowsToScan) const {
KU_ASSERT(rowIdxInGroup + numRowsToScan <= numRows);
auto& anchorSelVector = scanState.outState->getSelVectorUnsafe();
if (getZoneMapResult(scanState, chunks) == common::ZoneMapCheckResult::SKIP_SCAN) {
if (getZoneMapResult(transaction, scanState, chunks) == common::ZoneMapCheckResult::SKIP_SCAN) {
anchorSelVector.setToFiltered(0);
return;
}
Expand Down
Loading

0 comments on commit ea98fb1

Please sign in to comment.