Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] refactor array_map #51244

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
96 changes: 96 additions & 0 deletions be/src/column/array_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@

#include "column/column_helper.h"
#include "column/fixed_length_column.h"
#include "column/nullable_column.h"
#include "column/vectorized_fwd.h"
#include "exprs/function_helper.h"
#include "gutil/bits.h"
#include "gutil/casts.h"
#include "gutil/strings/fastmem.h"
Expand Down Expand Up @@ -616,4 +618,98 @@ Status ArrayColumn::unfold_const_children(const starrocks::TypeDescriptor& type)
return Status::OK();
}

size_t ArrayColumn::get_total_elements_num(const NullColumnPtr& null_column) const {
if (null_column == nullptr) {
return _elements->size();
}
DCHECK_LE(_offsets->size() -1, null_column->size());
size_t elements_num = 0;
size_t num_rows = _offsets->size() - 1;
const auto& null_data = null_column->get_data();
for (size_t i = 0;i < num_rows;i++) {
if (!null_data[i]) {
elements_num += _offsets->get_data()[i + 1] - _offsets->get_data()[i];
}
}
return elements_num;
}

template <bool ConstV1, bool ConstV2, bool IgnoreNull>
bool ArrayColumn::compare_lengths_from_offsets(const UInt32Column& v1, const UInt32Column& v2,
const NullColumnPtr& null_column) {
[[maybe_unused]] uint8_t* null_data = nullptr;
if constexpr (!IgnoreNull) {
null_data = null_column->get_data().data();
}

size_t num_rows = v1.size() - 1;
LOG(INFO) << "num_rows: " << num_rows;
if constexpr (ConstV1 && ConstV2) {
// if both are const column, we only compare the first row once
num_rows = 1;
}
bool result = true;
const auto& offsets_v1 = v1.get_data();
const auto& offsets_v2 = v2.get_data();
for (size_t i = 0;i < offsets_v1.size();i++) {
LOG(INFO) << "offset v1: " << offsets_v1[i] << ", v2:" << offsets_v2[i];
}


for (size_t i = 0; i < num_rows && result; i++) {
[[maybe_unused]] uint32_t len1 =
(ConstV1) ? (offsets_v1[1] - offsets_v1[0]) : (offsets_v1[i + 1] - offsets_v1[i]);
[[maybe_unused]] uint32_t len2 =
(ConstV2) ? (offsets_v2[1] - offsets_v2[0]) : (offsets_v2[i + 1] - offsets_v2[i]);
if constexpr (IgnoreNull) {
if (len1 != len2) {
LOG(INFO) << "array len mismatch, v1: " << len1 << ", v2: " << len2 << ", idx: " << i;
}
result &= (len1 == len2);
} else {
LOG(INFO) << "check idx: " << i << ", null: " << static_cast<uint32_t>(null_data[i]);
if (!null_data[i]) {
if (len1 != len2) {
LOG(INFO) << "array len mismatch, v1: " << len1 << ", v2: " << len2 << ", idx: " << i;
}
result &= (len1 == len2);
}
}
}
return result;
}

template <bool IgnoreNull>
bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2,
const NullColumnPtr& null_column) {
DCHECK(v1->is_array() && v2->is_array());
DCHECK(!v1->is_nullable() && !v2->is_nullable());

if (v1->size() != v2->size()) {
LOG(INFO) << "size not equal, v1: " << v1->size() << ", v2: " << v2->size();
return false;
}
auto data_v1 = FunctionHelper::get_data_column_of_const(v1);
auto data_v2 = FunctionHelper::get_data_column_of_const(v2);
auto* array_v1 = down_cast<ArrayColumn*>(data_v1.get());
auto* array_v2 = down_cast<ArrayColumn*>(data_v2.get());
const auto& offsets_v1 = array_v1->offsets();
const auto& offsets_v2 = array_v2->offsets();
LOG(INFO) << "v1 size: " << v1->size() << ", v2 size: " << v2->size() << ", offset v1: " << offsets_v1.size() << ", offset v2: " << offsets_v2.size();
if (v1->is_constant() && v2->is_constant()) {
return compare_lengths_from_offsets<true, true, IgnoreNull>(offsets_v1, offsets_v2, null_column);
} else if (v1->is_constant() && !v2->is_constant()) {
return compare_lengths_from_offsets<true, false, IgnoreNull>(offsets_v1, offsets_v2, null_column);
} else if (!v1->is_constant() && v2->is_constant()) {
return compare_lengths_from_offsets<false, true, IgnoreNull>(offsets_v1, offsets_v2, null_column);
}

return compare_lengths_from_offsets<false, false, IgnoreNull>(offsets_v1, offsets_v2, null_column);
}

template bool ArrayColumn::is_all_array_lengths_equal<true>(const ColumnPtr& v1, const ColumnPtr& v2,
const NullColumnPtr& null_data);
template bool ArrayColumn::is_all_array_lengths_equal<false>(const ColumnPtr& v1, const ColumnPtr& v2,
const NullColumnPtr& null_data);

} // namespace starrocks
20 changes: 19 additions & 1 deletion be/src/column/array_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ class ArrayColumn final : public ColumnFactory<Column, ArrayColumn> {

void put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool is_binary_protocol = false) const override;

std::string get_name() const override { return "array"; }
std::string get_name() const override { return "array-" + _elements->get_name(); }

Datum get(size_t idx) const override;

Expand Down Expand Up @@ -173,6 +173,7 @@ class ArrayColumn final : public ColumnFactory<Column, ArrayColumn> {

const UInt32Column& offsets() const { return *_offsets; }
UInt32Column::Ptr& offsets_column() { return _offsets; }
UInt32Column::Ptr offsets_column() const { return _offsets; }

bool is_nullable() const override { return false; }

Expand All @@ -195,7 +196,19 @@ class ArrayColumn final : public ColumnFactory<Column, ArrayColumn> {

Status unfold_const_children(const starrocks::TypeDescriptor& type) override;

// calculate all non-null elements' size
size_t get_total_elements_num(const NullColumnPtr& null_column) const;

// check if all of arrays' size is equal
// v1 and v2 must be one of ArrayColumn or Const(ArrayColumn)
template <bool IgnoreNull>
static bool is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_data);

private:
template <bool ConstV1, bool ConstV2, bool IgnoreNull>
static bool compare_lengths_from_offsets(const UInt32Column& v1, const UInt32Column& v2,
const NullColumnPtr& null_data);

// Elements must be NullableColumn to facilitate handling nested types.
ColumnPtr _elements;
// Offsets column will store the start position of every array element.
Expand All @@ -205,4 +218,9 @@ class ArrayColumn final : public ColumnFactory<Column, ArrayColumn> {
UInt32Column::Ptr _offsets;
};

extern template bool ArrayColumn::is_all_array_lengths_equal<true>(const ColumnPtr& v1, const ColumnPtr& v2,
const NullColumnPtr& null_data);
extern template bool ArrayColumn::is_all_array_lengths_equal<false>(const ColumnPtr& v1, const ColumnPtr& v2,
const NullColumnPtr& null_data);

} // namespace starrocks
1 change: 1 addition & 0 deletions be/src/column/chunk.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ void Chunk::append_column(ColumnPtr column, const FieldPtr& field) {
}

void Chunk::append_column(ColumnPtr column, SlotId slot_id) {
DCHECK(!_slot_id_to_index.contains(slot_id));
_slot_id_to_index[slot_id] = _columns.size();
_columns.emplace_back(std::move(column));
check_or_die();
Expand Down
3 changes: 2 additions & 1 deletion be/src/column/column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,8 @@ StatusOr<ColumnPtr> Column::upgrade_helper_func(ColumnPtr* col) {
}

bool Column::empty_null_in_complex_column(const Filter& null_data, const Buffer<uint32_t>& offsets) {
DCHECK(null_data.size() == this->size());
// DCHECK(null_data.size() == this->size());
DCHECK_EQ(null_data.size(), this->size());
if (!is_array() && !is_map()) {
throw std::runtime_error("empty_null_in_complex_column() only works for array and map column.");
}
Expand Down
3 changes: 3 additions & 0 deletions be/src/column/column.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,8 @@ class Column {

virtual bool is_array() const { return false; }

virtual bool is_array_view() const { return false; }

virtual bool is_map() const { return false; }

virtual bool is_struct() const { return false; }
Expand Down Expand Up @@ -177,6 +179,7 @@ class Column {
}
return dest;
}

// Update elements to default value which hit by the filter
virtual void fill_default(const Filter& filter) = 0;

Expand Down
1 change: 1 addition & 0 deletions be/src/column/const_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ class ConstColumn final : public ColumnFactory<Column, ConstColumn> {

bool is_nullable() const override { return _data->is_nullable(); }
bool is_json() const override { return _data->is_json(); }
bool is_array() const override { return _data->is_array(); }

bool is_null(size_t index) const override { return _data->is_null(0); }

Expand Down
1 change: 1 addition & 0 deletions be/src/column/nullable_column.h
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ class NullableColumn : public ColumnFactory<Column, NullableColumn> {

bool is_nullable() const override { return true; }
bool is_json() const override { return _data_column->is_json(); }
bool is_array() const override { return _data_column->is_array(); }

bool is_null(size_t index) const override {
DCHECK_EQ(_null_column->size(), _data_column->size());
Expand Down
2 changes: 1 addition & 1 deletion be/src/exec/pipeline/scan/olap_scan_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ Status OlapScanContext::capture_tablet_rowsets(const std::vector<TInternalScanRa
ASSIGN_OR_RETURN(tablet_rowsets[i], OlapScanNode::capture_tablet_rowsets(tablet, scan_range));

VLOG(1) << "capture tablet rowsets: " << tablet->full_name() << ", rowsets: " << tablet_rowsets[i].size()
<< ", version: " << scan_range->version << ", gtid: " << scan_range->gtid;
<< ", version: " << scan_range->version;

_tablets[i] = std::move(tablet);
}
Expand Down
1 change: 1 addition & 0 deletions be/src/exec/sorting/sort_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "column/map_column.h"
#include "column/nullable_column.h"
#include "column/struct_column.h"
#include "common/status.h"
#include "exec/sorting/sort_helper.h"
#include "exec/sorting/sort_permute.h"
#include "exec/sorting/sorting.h"
Expand Down
10 changes: 9 additions & 1 deletion be/src/exprs/array_functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1097,14 +1097,21 @@ StatusOr<ColumnPtr> ArrayFunctions::all_match(FunctionContext* context, const Co
}

StatusOr<ColumnPtr> ArrayFunctions::any_match(FunctionContext* context, const Columns& columns) {
LOG(INFO) << "evaluate any_match";
return ArrayMatch<true>::process(context, columns);
}

StatusOr<ColumnPtr> ArrayFunctions::concat(FunctionContext* ctx, const Columns& columns) {
RETURN_IF_COLUMNS_ONLY_NULL(columns);

auto num_rows = columns[0]->size();
// @TODO optimize for const column

auto num_rows = columns[0]->size();
LOG(INFO) << "array_concat, num_rows: " << num_rows;
for (auto& column : columns) {
LOG(INFO) << "column size: " << column->size() << ", is_const: " << column->is_constant()
<< ", is_nullable: " << column->is_nullable();
}
// compute nulls
NullColumnPtr nulls;
for (auto& column : columns) {
Expand All @@ -1126,6 +1133,7 @@ StatusOr<ColumnPtr> ArrayFunctions::concat(FunctionContext* ctx, const Columns&
auto nullable_column = down_cast<NullableColumn*>(column.get());
array_columns.emplace_back(std::static_pointer_cast<ArrayColumn>(nullable_column->data_column()));
} else if (column->is_constant()) {
// @TODO no need
// NOTE: I'm not sure if there will be const array, just to be safe
array_columns.emplace_back(std::static_pointer_cast<ArrayColumn>(
ColumnHelper::unpack_and_duplicate_const_column(num_rows, column)));
Expand Down
1 change: 1 addition & 0 deletions be/src/exprs/array_functions.tpp
Original file line number Diff line number Diff line change
Expand Up @@ -943,6 +943,7 @@ private:
}

size_t chunk_size = columns[0]->size();
// @TODO avoid unpack
ColumnPtr src_column = ColumnHelper::unpack_and_duplicate_const_column(chunk_size, columns[0]);
ColumnPtr dest_column = src_column->clone_empty();
if (columns[1]->only_null()) { // return empty array for non-null array by design, keep the same null with src.
Expand Down
Loading
Loading