From 2b154d9e9119bdcff93b94df63751b854f9b6a16 Mon Sep 17 00:00:00 2001
From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
Date: Thu, 12 Sep 2024 11:05:03 +0800
Subject: [PATCH 01/17] stash

Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com>
---
 be/src/column/CMakeLists.txt                |   1 +
 be/src/column/array_column.cpp              |  16 +
 be/src/column/array_column.h                |   4 +-
 be/src/column/array_view_column.cpp         | 187 ++++++++++++
 be/src/column/array_view_column.h           | 313 ++++++++++++++++++++
 be/src/column/column.h                      |  20 ++
 be/src/column/column_visitor.cpp            |   1 +
 be/src/column/column_visitor.h              |   2 +
 be/src/column/column_visitor_adapter.h      |   4 +
 be/src/column/column_visitor_mutable.cpp    |   1 +
 be/src/column/column_visitor_mutable.h      |   1 +
 be/src/column/const_column.h                |   2 +
 be/src/column/nullable_column.h             |   2 +
 be/src/column/vectorized_fwd.h              |   1 +
 be/src/exec/sorted_streaming_aggregator.cpp |   8 +
 be/src/exec/sorting/compare_column.cpp      |   8 +
 be/src/exec/sorting/sort_column.cpp         |  10 +
 be/src/exec/sorting/sort_permute.cpp        |   5 +
 be/src/exprs/array_functions.cpp            |   6 +-
 be/src/exprs/array_map_expr.cpp             | 190 ++++++++++--
 be/src/serde/column_array_serde.cpp         |  17 ++
 21 files changed, 770 insertions(+), 29 deletions(-)
 create mode 100644 be/src/column/array_view_column.cpp
 create mode 100644 be/src/column/array_view_column.h

diff --git a/be/src/column/CMakeLists.txt b/be/src/column/CMakeLists.txt
index b1b1418fdf06b..901de66d29393 100644
--- a/be/src/column/CMakeLists.txt
+++ b/be/src/column/CMakeLists.txt
@@ -16,6 +16,7 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/column")

 add_library(Column STATIC
         array_column.cpp
+        array_view_column.cpp
         adaptive_nullable_column.cpp
         chunk.cpp
         chunk_extra_data.cpp
diff --git a/be/src/column/array_column.cpp b/be/src/column/array_column.cpp
index 1926f2070bab7..4c306e19b7f0c 100644
--- a/be/src/column/array_column.cpp
+++ b/be/src/column/array_column.cpp
@@ -18,6 +18,7 @@

 #include "column/column_helper.h"
 #include "column/fixed_length_column.h"
+#include "column/nullable_column.h"
 #include "column/vectorized_fwd.h"
 #include "gutil/bits.h"
 #include "gutil/casts.h"
@@ -616,4 +617,19 @@ Status ArrayColumn::unfold_const_children(const starrocks::TypeDescriptor& type)
     return Status::OK();
 }

+// Placeholder: throws unless both inputs are arrays, compares row counts only; per-row lengths and null_column are not examined yet.
+bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& lhs, const ColumnPtr& rhs, const NullColumnPtr& null_column) {
+    if (!lhs->is_array() || !rhs->is_array()) {
+        throw std::runtime_error("input of is_all_array_lengths_equal shoule be array");
+    }
+    if (lhs->size() != rhs->size()) {
+        return false;
+    }
+    // @TODO reject nullable column
+    // @TODO consider nullable column
+    // if one of them is null, skip check
+    // otherwise, check length
+    return true;
+}
+
 } // namespace starrocks
diff --git a/be/src/column/array_column.h b/be/src/column/array_column.h
index 007b563e8878c..de44025ac2d9a 100644
--- a/be/src/column/array_column.h
+++ b/be/src/column/array_column.h
@@ -145,7 +145,7 @@ class ArrayColumn final : public ColumnFactory<Column, ArrayColumn> {

     void put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool is_binary_protocol = false) const override;

-    std::string get_name() const override { return "array"; }
+    std::string get_name() const override { return "array-"+ _elements->get_name(); }

     Datum get(size_t idx) const override;

@@ -195,6 +195,8 @@ class ArrayColumn final : public ColumnFactory<Column, ArrayColumn> {

     Status unfold_const_children(const starrocks::TypeDescriptor& type) override;

+    static bool is_all_array_lengths_equal(const ColumnPtr& lhs, const ColumnPtr& rhs, const NullColumnPtr& null_column);
+
 private:
     // Elements must be NullableColumn to facilitate handling nested types.
     ColumnPtr _elements;
diff --git a/be/src/column/array_view_column.cpp b/be/src/column/array_view_column.cpp
new file mode 100644
index 0000000000000..f13b1aa8833ed
--- /dev/null
+++ b/be/src/column/array_view_column.cpp
@@ -0,0 +1,187 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "column/array_view_column.h"
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+#include "column/array_column.h"
+#include "column/chunk.h"
+#include "column/vectorized_fwd.h"
+#include "gutil/casts.h"
+
+namespace starrocks {
+
+ColumnPtr ArrayViewColumn::replicate(const Buffer<uint32_t>& offsets) {
+    // @TODO clone empty???
+    // Row i is repeated (offsets[i + 1] - offsets[i]) times; only offsets/lengths are copied, elements stay shared.
+    auto dest_size = offsets.size() - 1;
+    auto new_offsets = UInt32Column::create();
+    auto new_lengths = UInt32Column::create();
+    new_offsets->reserve(offsets.back());
+    new_lengths->reserve(offsets.back());
+
+    for (size_t i = 0; i < dest_size; i++) {
+        uint32_t repeat_times = offsets[i + 1] - offsets[i];
+        new_offsets->append_value_multiple_times(*_offsets, i, repeat_times);
+        new_lengths->append_value_multiple_times(*_lengths, i, repeat_times);
+    }
+    return ArrayViewColumn::create(_elements, new_offsets, new_lengths);
+}
+
+void ArrayViewColumn::append(const Column& src, size_t offset, size_t count) { // appends rows [offset, offset + count) of src
+    const auto& array_view_column = down_cast<const ArrayViewColumn&>(src);
+    const auto& src_offsets = array_view_column.offsets();
+    const auto& src_lengths = array_view_column.lengths();
+
+    if (_elements == array_view_column._elements) {
+        LOG(INFO) << "shared elements, only copy offsets and lengths";
+        // if these two array view column share the same elements, just append offset and lengths
+        _offsets->append(src_offsets, offset, count);
+        _lengths->append(src_lengths, offset, count);
+    } else {
+        LOG(INFO) << "not shared elements, should copy all";
+        // append elements and re-compute offset and length for new data
+        // @TODO should optimize
+        // @TODO should avoid this copy...
+        // NOTE: never declare a local named `offset` in this branch, it would shadow the `offset` row argument.
+        for (size_t i = 0; i < count; i++) {
+            uint32_t src_offset = src_offsets.get_data()[offset + i];
+            uint32_t src_length = src_lengths.get_data()[offset + i];
+            DCHECK_LE(src_offset + src_length, array_view_column._elements->size());
+            _offsets->append(static_cast<uint32_t>(_elements->size())); // the copied slice starts at our current end
+            _elements->append(*(array_view_column._elements), src_offset, src_length);
+            _lengths->append(src_length);
+        }
+    }
+}
+
+void ArrayViewColumn::check_or_die() const { // every row must satisfy offset + length <= elements size
+    DCHECK(_elements);
+    DCHECK(_offsets);
+    DCHECK(_lengths);
+    DCHECK_EQ(_offsets->size(), _lengths->size());
+    for (size_t i = 0; i < _offsets->size(); i++) {
+        uint32_t offset = _offsets->get_data()[i];
+        uint32_t length = _lengths->get_data()[i];
+        DCHECK_LE(static_cast<size_t>(offset) + length, _elements->size()); // widen first: the 32-bit sum could wrap
+    }
+}
+
+// @TODO clone should share elements?
+MutableColumnPtr ArrayViewColumn::clone_empty() const {
+    return create_mutable(_elements, UInt32Column::create(), UInt32Column::create());
+}
+
+StatusOr<ColumnPtr> ArrayViewColumn::to_array_column() const { // copies each row's slice into a new ArrayColumn
+    LOG(INFO) << "ArrayViewColumn::to_array_column, cosnt ? " << is_constant();
+    // @TODO consider nullable ???
+    auto array_elements = _elements->clone_empty();
+    auto array_offsets = UInt32Column::create();
+    // @TODO reserve elements too?
+    LOG(INFO) << "ArrayViewColumn::to_array_column, size: " << _offsets->size();
+    array_offsets->reserve(_offsets->size() + 1);
+    array_offsets->append(0);
+    uint32_t last_offset = 0;
+    size_t num_rows = _offsets->size();
+    // @TODO maybe copy alot...
+    for (size_t i = 0; i < num_rows; i++) {
+        uint32_t offset = _offsets->get_data()[i];
+        uint32_t length = _lengths->get_data()[i];
+        LOG(INFO) << "offset: " << offset << ", len: " << length;
+        // append the elements of row i, then record the running end offset
+        array_elements->append(*_elements, offset, length);
+        array_offsets->append(last_offset + length);
+        last_offset += length;
+    }
+    return ArrayColumn::create(std::move(array_elements), std::move(array_offsets));
+}
+
+
+StatusOr<ColumnPtr> ArrayViewColumn::from_array_column(const ColumnPtr& column) {
+    if (!column->is_array()) {
+        LOG(INFO) << "from_array_column error...";
+        return Status::InternalError("input column must be array column");
+    }
+    LOG(INFO) << "from_array_column, size: " << column->size();
+    auto view_offsets = UInt32Column::create();
+    auto view_lengths = UInt32Column::create();
+    view_offsets->reserve(column->size());
+    view_lengths->reserve(column->size());
+    ColumnPtr view_elements;
+
+    // Nullable input: null rows become zero-length views and the source null column is reused as-is.
+    if (column->is_nullable()) {
+        auto nullable_column = down_cast<const NullableColumn*>(column.get());
+        DCHECK(nullable_column != nullptr);
+        const auto& null_data = nullable_column->null_column()->get_data();
+        auto array_column = down_cast<const ArrayColumn*>(nullable_column->data_column().get());
+        const auto& array_offsets = array_column->offsets().get_data();
+
+        view_elements = array_column->elements_column();
+        LOG(INFO) << "elements size: " << view_elements->size();
+        LOG(INFO) << "null size: " << nullable_column->null_column()->size();
+        // array column: [[1,2],null,[],[4]]
+        // null_data [0,1,0,0]
+        // elements column: [1,2,4]
+        // offsets column: [0, 2, 2, 2, 3]
+
+        // array view column: [[1,2], null, [], [4]]
+        // null_data[0,1,0,0]
+        // elements column: [1,2,4] (shared with the array column)
+        // offsets column: [0,2,2,2]
+        // length column: [2,0,0,1]
+        for (size_t i = 0; i < column->size(); i++) {
+            uint32_t offset = array_offsets[i];
+            uint32_t length = null_data[i] ? 0 : (array_offsets[i + 1] - offset);
+            LOG(INFO) << "append offset: " << offset << ", length: " << length;
+            view_offsets->append(offset);
+            view_lengths->append(length);
+        }
+        auto ret = NullableColumn::create(ArrayViewColumn::create(view_elements, view_offsets, view_lengths), nullable_column->null_column());
+        ret->check_or_die();
+        return ret;
+    }
+
+    auto array_column = down_cast<const ArrayColumn*>(column.get());
+    view_elements = array_column->elements_column();
+    const auto& array_offsets = array_column->offsets().get_data();
+
+    for (size_t i = 0; i < column->size(); i++) {
+        uint32_t offset = array_offsets[i];
+        uint32_t length = array_offsets[i + 1] - offset;
+        view_offsets->append(offset);
+        view_lengths->append(length);
+    }
+    return ArrayViewColumn::create(view_elements, view_offsets, view_lengths);
+}
+
+StatusOr<ColumnPtr> ArrayViewColumn::to_array_column(const ColumnPtr& column) {
+    if (!column->is_array_view()) {
+        LOG(INFO) << "to_array_column error....";
+        return Status::InternalError("input column must be array view column");
+    }
+
+    if (column->is_nullable()) {
+        auto nullable_column = down_cast<const NullableColumn*>(column.get());
+        DCHECK(nullable_column != nullptr);
+        auto array_view_column = down_cast<const ArrayViewColumn*>(nullable_column->data_column().get());
+        LOG(INFO) << "to_array_column";
+        ASSIGN_OR_RETURN(auto array_column, array_view_column->to_array_column());
+        return NullableColumn::create(std::move(array_column), nullable_column->null_column());
+    }
+    auto array_view_column = down_cast<const ArrayViewColumn*>(column.get());
+    return array_view_column->to_array_column();
+}
+} // namespace starrocks
\ No newline at end of file
diff --git a/be/src/column/array_view_column.h b/be/src/column/array_view_column.h
new file mode 100644
index 0000000000000..ce5f10099ac41
--- /dev/null
+++ b/be/src/column/array_view_column.h
@@ -0,0 +1,313 @@
+// Copyright 2021-present StarRocks, Inc. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <memory>
+
+#include "column/column.h"
+#include "column/fixed_length_column.h"
+#include "column/nullable_column.h"
+#include "column/vectorized_fwd.h"
+
+namespace starrocks {
+// Array column expressed as views: row i covers elements [offsets[i], offsets[i] + lengths[i]) of a shared elements column.
+class ArrayViewColumn final: public ColumnFactory<Column, ArrayViewColumn> {
+    friend class ColumnFactory<Column, ArrayViewColumn>;
+
+public:
+    using ValueType = void; // @TODO need array view?
+
+    ArrayViewColumn(ColumnPtr elements, UInt32Column::Ptr offsets, UInt32Column::Ptr lengths):
+        _elements(std::move(elements)), _offsets(std::move(offsets)), _lengths(std::move(lengths)) {}
+
+    ArrayViewColumn(const ArrayViewColumn& rhs)
+            : _elements(rhs._elements),
+              _offsets(std::static_pointer_cast<UInt32Column>(rhs._offsets->clone_shared())),
+              _lengths(std::static_pointer_cast<UInt32Column>(rhs._lengths->clone_shared())) {}
+
+    ArrayViewColumn(ArrayViewColumn&& rhs) noexcept:
+        _elements(std::move(rhs._elements)), _offsets(std::move(rhs._offsets)), _lengths(std::move(rhs._lengths)) {}
+
+    // Copy-assign goes through a temporary built by the copy constructor (elements shared, offsets/lengths cloned).
+    ArrayViewColumn& operator=(const ArrayViewColumn& rhs) { return *this = ArrayViewColumn(rhs); }
+    ArrayViewColumn& operator=(ArrayViewColumn&& rhs) noexcept {
+        _elements = std::move(rhs._elements);
+        _offsets = std::move(rhs._offsets);
+        _lengths = std::move(rhs._lengths);
+        return *this;
+    }
+
+    ~ArrayViewColumn() override = default;
+
+    bool is_array_view() const override {
+        return true;
+    }
+
+    const uint8_t* raw_data() const override{
+        DCHECK(false) << "ArrayViewColumn::raw_data() is not supported";
+        return nullptr;
+    }
+
+    uint8_t* mutable_raw_data() override {
+        DCHECK(false) << "ArrayViewColumn::mutable_raw_data() is not supported";
+        return nullptr;
+    }
+
+    size_t size() const override {
+        return _offsets->size();
+    }
+    size_t capacity() const override {
+        return _offsets->capacity() + _lengths->capacity();
+    }
+
+    size_t type_size() const override {
+        // @TODO need a array view type
+        return 0;
+    }
+
+    size_t byte_size() const override {
+        // @TODO
+        return 0;
+    }
+    size_t byte_size(size_t from, size_t size) const override {
+        // @TODO
+        return 0;
+    }
+
+    size_t byte_size(size_t idx) const override {
+        // @TODO
+        return 0;
+    }
+
+    void reserve(size_t n) override {
+        _elements->reserve(n);
+        _offsets->reserve(n);
+        _lengths->reserve(n);
+
+    }
+    void resize(size_t n) override {
+        // DCHECK(false) << "ArrayViewColumn::resize() is not supported";
+        // n is a row count: only the per-row offsets/lengths are resized, the shared elements are left untouched
+        _offsets->resize(n);
+        _lengths->resize(n);
+    }
+    void assign(size_t n, size_t idx) override {
+        // @TODO
+        DCHECK(false) << "ArrayViewColumn::assign() is not supported";
+    }
+    void append_datum(const Datum& datum) override {
+        DCHECK(false) << "ArrayViewColumn::append_datum() is not supported";
+    }
+
+    void append(const Column& src, size_t offset, size_t count) override;
+
+    void append_selective(const Column& src, const uint32_t* indexes, uint32_t from, uint32_t size) override {
+        DCHECK(false) << "ArrayViewColumn::append_selective() is not supported";
+    }
+
+    void append_value_multiple_times(const Column& src, uint32_t idx, uint32_t size) override {
+        DCHECK(false) << "ArrayViewColumn::append_value_multiple_times() is not supported";
+    }
+
+    bool append_nulls(size_t count) override {
+        // @TODO
+        return false;
+    }
+
+    size_t append_numbers(const void* buff, size_t length) override {
+        // @TODO
+        return -1;
+    }
+    void append_value_multiple_times(const void* value, size_t count) override {
+        DCHECK(false) << "ArrayViewColumn::append_value_multiple_times() is not supported";
+    }
+
+    void append_default() override {
+        DCHECK(false) << "ArrayViewColumn::append_default() is not supported";
+    }
+    void append_default(size_t count) override {
+        DCHECK(false) << "ArrayViewColumn::append_default() is not supported";
+    }
+    void fill_default(const Filter& filter) override {
+        // @TODO
+    }
+    void update_rows(const Column& src, const uint32_t* indexes) override {
+        DCHECK(false) << "ArrayViewColumn:::update_rows() is not supported";
+    }
+    void remove_first_n_values(size_t count) override {
+        // @TODO
+    }
+    uint32_t max_one_element_serialize_size() const override {
+        DCHECK(false) << "ArrayViewColumn::max_one_element_serialize_size() is not supported";
+        return 0;
+    }
+    uint32_t serialize(size_t idx, uint8_t* pos) override {
+        DCHECK(false);
+        return 0;
+    }
+    uint32_t serialize_default(uint8_t* pos) override {
+        DCHECK(false);
+        return 0;
+    }
+
+    void serialize_batch(uint8_t* dst, Buffer<uint32_t>& slice_sized, size_t chunk_size, uint32_t max_one_row_size) override {
+        DCHECK(false);
+    }
+    const uint8_t* deserialize_and_append(const uint8_t* pos) override {
+        DCHECK(false);
+        return nullptr;
+    }
+
+    uint32_t serialize_size(size_t idx) const override {
+        DCHECK(false);
+        return 0;
+    }
+    void deserialize_and_append_batch(Buffer<Slice>& srcs, size_t chunk_size) override {
+        DCHECK(false);
+    }
+
+    MutableColumnPtr clone_empty() const override;
+
+    size_t filter_range(const Filter& filter, size_t from, size_t to) override {
+        // @TODO
+        return 0;
+    }
+
+    int compare_at(size_t left, size_t right, const Column& right_column, int nan_direction_hint) const override {
+        // @TODO
+        return 0;
+    }
+
+    void compare_column(const Column& rhs, std::vector<int8_t>* output) const {
+
+    }
+
+    int equals(size_t left, const Column& rhs, size_t right, bool safe_eq = true) const override {
+        return 0;
+    }
+
+    void crc32_hash_at(uint32_t *seed, uint32_t idx) const override {
+
+    }
+    void fnv_hash_at(uint32_t* seed, uint32_t idx) const override {
+
+    }
+    void fnv_hash(uint32_t* hash, uint32_t from, uint32_t to) const override {
+
+    }
+
+    void crc32_hash(uint32_t* hash, uint32_t from, uint32_t to) const override {
+
+    }
+
+    int64_t xor_checksum(uint32_t from, uint32_t to) const override {
+        return 0;
+    }
+
+    void put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool is_binary_protocol = false) const override {
+
+    }
+
+    ColumnPtr replicate(const Buffer<uint32_t>& offsets) override;
+
+    std::string get_name() const override { return "array-view"; }
+
+    Datum get(size_t idx) const override {
+        return Datum();
+    }
+
+    size_t get_element_null_count(size_t idx) const {
+        return 0;
+    }
+    size_t get_element_size(size_t idx) const {
+        return 0;
+    }
+
+    bool set_null(size_t idx) override {
+        return false;
+    }
+
+    size_t memory_usage() const override { return _elements->memory_usage() + _offsets->memory_usage() + _lengths->memory_usage(); }
+
+    size_t container_memory_usage() const override {
+        return _elements->container_memory_usage() + _offsets->container_memory_usage() + _lengths->container_memory_usage();
+    }
+
+    size_t reference_memory_usage(size_t from, size_t size) const override {
+        return 0;
+    }
+
+    void swap_column(Column& rhs) override {} // @TODO not implemented: currently a no-op
+
+    void reset_column() override {} // @TODO not implemented: currently a no-op
+
+    const Column& elements() const { return *_elements; }
+    ColumnPtr& elements_column() { return _elements; }
+    ColumnPtr elements_column() const { return _elements; }
+
+    const UInt32Column& offsets() const { return *_offsets; }
+    UInt32Column::Ptr& offsets_column() { return _offsets; }
+    const UInt32Column& lengths() const {
+        return *_lengths;
+    }
+    UInt32Column::Ptr& lengths_column() { return _lengths; }
+
+    bool is_nullable() const override { return false; }
+
+    std::string debug_item(size_t idx) const override {
+        return "";
+    }
+
+    std::string debug_string() const override {
+        return "array-view-column";
+    }
+
+    Status capacity_limit_reached() const override {
+        RETURN_IF_ERROR(_elements->capacity_limit_reached());
+        return _offsets->capacity_limit_reached();
+    }
+
+    StatusOr<ColumnPtr> upgrade_if_overflow() override {
+        return nullptr;
+    }
+
+    StatusOr<ColumnPtr> downgrade() override {
+        return nullptr;
+    }
+
+    bool has_large_column() const override { return _elements->has_large_column(); }
+
+    void check_or_die() const override;
+
+    Status unfold_const_children(const starrocks::TypeDescriptor& type) override {
+        return Status::NotSupported("TBD");
+    }
+
+    // build array_view column from array_column, how to solve null??
+    // if array_column is nullable, return Nullable(ArrayViewColumn)
+    // else return ArrayViewColumn
+    static StatusOr<ColumnPtr> from_array_column(const ColumnPtr& column);
+    static StatusOr<ColumnPtr> to_array_column(const ColumnPtr& column);
+    // @TODO to_array_column
+    StatusOr<ColumnPtr> to_array_column() const;
+
+private:
+    // Elements must be NullableColumn to facilitate handling nested types.
+    ColumnPtr _elements;
+    UInt32Column::Ptr _offsets;
+    UInt32Column::Ptr _lengths;
+};
+} // namespace starrocks
\ No newline at end of file
diff --git a/be/src/column/column.h b/be/src/column/column.h
index e889fa272f81f..979165a98baed 100644
--- a/be/src/column/column.h
+++ b/be/src/column/column.h
@@ -97,6 +97,8 @@ class Column {

     virtual bool is_array() const { return false; }

+    virtual bool is_array_view() const { return false; }
+
     virtual bool is_map() const { return false; }

     virtual bool is_struct() const { return false; }
@@ -177,6 +179,24 @@ class Column {
         }
         return dest;
     }
+
+    // align columns' offsets
+    // column(1,2)->align_offsets({0,2,5}) -> column(1,_,2,_,_)
+    virtual ColumnPtr align_offsets(const Buffer<uint32_t>& offsets) {
+        auto dest = this->clone_empty();
+        auto dest_size = offsets.size() - 1;
+        DCHECK(this->size() >= dest_size) << "The size of the source column is less when aligning offsets.";
+        dest->reserve(offsets.back());
+        for (size_t i = 0; i < dest_size; i++) {
+            // first value is itself, others append default;
+            // an empty range contributes no rows, so dest ends up with exactly offsets.back() rows
+            const uint32_t span = offsets[i + 1] - offsets[i];
+            if (span == 0) continue;
+            dest->append_value_multiple_times(*this, i, 1);
+            if (span > 1) dest->append_default(span - 1);
+        }
+        return dest;
+    }

     // Update elements to default value which hit by the filter
     virtual void fill_default(const Filter& filter) = 0;
diff --git a/be/src/column/column_visitor.cpp b/be/src/column/column_visitor.cpp
index 5ffdfd43ae266..a5c72213a9420 100644
--- a/be/src/column/column_visitor.cpp
+++
b/be/src/column/column_visitor.cpp @@ -69,5 +69,6 @@ VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(ObjectColumn) +VISIT_IMPL(ArrayViewColumn) } // namespace starrocks diff --git a/be/src/column/column_visitor.h b/be/src/column/column_visitor.h index dd37e1d70ea46..7a400c3a916dc 100644 --- a/be/src/column/column_visitor.h +++ b/be/src/column/column_visitor.h @@ -80,6 +80,8 @@ class ColumnVisitor { virtual Status visit(const FixedLengthColumnBase& column); virtual Status visit(const FixedLengthColumnBase& column); virtual Status visit(const ObjectColumn& column); + virtual Status visit(const ArrayViewColumn& column); + }; } // namespace starrocks diff --git a/be/src/column/column_visitor_adapter.h b/be/src/column/column_visitor_adapter.h index 3f5a9d022ba02..85feb659808d2 100644 --- a/be/src/column/column_visitor_adapter.h +++ b/be/src/column/column_visitor_adapter.h @@ -93,6 +93,8 @@ class ColumnVisitorAdapter : public ColumnVisitor { Status visit(const LargeBinaryColumn& column) override { return _impl->do_visit(column); } + Status visit(const ArrayViewColumn& column) override { return _impl->do_visit(column); } + private: Impl* _impl; }; @@ -166,6 +168,8 @@ class ColumnVisitorMutableAdapter : public ColumnVisitorMutable { Status visit(LargeBinaryColumn* column) override { return _impl->do_visit(column); } + Status visit(ArrayViewColumn* column) override { return _impl->do_visit(column); } + private: Impl* _impl; }; diff --git a/be/src/column/column_visitor_mutable.cpp b/be/src/column/column_visitor_mutable.cpp index cd91d66081a58..9f303d7a73da2 100644 --- a/be/src/column/column_visitor_mutable.cpp +++ b/be/src/column/column_visitor_mutable.cpp @@ -69,5 +69,6 @@ VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(FixedLengthColumnBase) +VISIT_IMPL(ArrayViewColumn) } // namespace starrocks diff --git a/be/src/column/column_visitor_mutable.h 
b/be/src/column/column_visitor_mutable.h index 87291d2b1b3d9..00011434d0204 100644 --- a/be/src/column/column_visitor_mutable.h +++ b/be/src/column/column_visitor_mutable.h @@ -80,6 +80,7 @@ class ColumnVisitorMutable { virtual Status visit(FixedLengthColumnBase* column); virtual Status visit(FixedLengthColumnBase* column); virtual Status visit(ObjectColumn* column); + virtual Status visit(ArrayViewColumn* column); }; } // namespace starrocks diff --git a/be/src/column/const_column.h b/be/src/column/const_column.h index 04adbf617ca97..78d02d76c57fe 100644 --- a/be/src/column/const_column.h +++ b/be/src/column/const_column.h @@ -49,6 +49,8 @@ class ConstColumn final : public ColumnFactory { bool is_nullable() const override { return _data->is_nullable(); } bool is_json() const override { return _data->is_json(); } + bool is_array() const override { return _data->is_array(); } + bool is_array_view() const override { return _data->is_array_view(); } bool is_null(size_t index) const override { return _data->is_null(0); } diff --git a/be/src/column/nullable_column.h b/be/src/column/nullable_column.h index 44906cc51cfc5..02de533f35112 100644 --- a/be/src/column/nullable_column.h +++ b/be/src/column/nullable_column.h @@ -82,6 +82,8 @@ class NullableColumn : public ColumnFactory { bool is_nullable() const override { return true; } bool is_json() const override { return _data_column->is_json(); } + bool is_array() const override { return _data_column->is_array(); } + bool is_array_view() const override { return _data_column->is_array_view(); } bool is_null(size_t index) const override { DCHECK_EQ(_null_column->size(), _data_column->size()); diff --git a/be/src/column/vectorized_fwd.h b/be/src/column/vectorized_fwd.h index bea88d860a119..00472ef2f54ca 100644 --- a/be/src/column/vectorized_fwd.h +++ b/be/src/column/vectorized_fwd.h @@ -46,6 +46,7 @@ template using Buffer = std::vector>; class ArrayColumn; +class ArrayViewColumn; class MapColumn; class StructColumn; class 
NullableColumn; diff --git a/be/src/exec/sorted_streaming_aggregator.cpp b/be/src/exec/sorted_streaming_aggregator.cpp index f7c0d92d9af12..d2a715a269f85 100644 --- a/be/src/exec/sorted_streaming_aggregator.cpp +++ b/be/src/exec/sorted_streaming_aggregator.cpp @@ -154,6 +154,10 @@ class ColumnSelfComparator : public ColumnVisitorAdapter { return Status::NotSupported("Unsupported struct column in column wise comparator"); } + Status do_visit(const ArrayViewColumn& column) { + return Status::NotSupported("Unsupported array view column in column wise comparator"); + } + private: const ColumnPtr& _first_column; std::vector& _cmp_vector; @@ -251,6 +255,10 @@ class AppendWithMask : public ColumnVisitorMutableAdapter { return Status::NotSupported("Unsupported struct column in column wise comparator"); } + Status do_visit(ArrayViewColumn* column) { + return Status::NotSupported("Unsupported array view column in column wise comparator"); + } + private: Column* _column; const SelMask _sel_mask; diff --git a/be/src/exec/sorting/compare_column.cpp b/be/src/exec/sorting/compare_column.cpp index 5c33b1ab9f248..fef976df8588b 100644 --- a/be/src/exec/sorting/compare_column.cpp +++ b/be/src/exec/sorting/compare_column.cpp @@ -238,6 +238,11 @@ class ColumnCompare final : public ColumnVisitorAdapter { return Status::OK(); } + Status do_visit(const ArrayViewColumn& column) { + DCHECK(false) << "not support array view column sort_and_tie"; + return Status::NotSupported("not suport array view column"); + } + size_t get_equal_count() const { return _equal_count; } private: @@ -312,6 +317,9 @@ class ColumnTieBuilder final : public ColumnVisitorAdapter { Status do_visit(const ObjectColumn& column) { return Status::NotSupported("not support"); } + Status do_visit(const ArrayViewColumn& column) { + return Status::NotSupported("Not support"); + } private: const ColumnPtr _column; diff --git a/be/src/exec/sorting/sort_column.cpp b/be/src/exec/sorting/sort_column.cpp index 
8df27f6086343..baa629ea04382 100644 --- a/be/src/exec/sorting/sort_column.cpp +++ b/be/src/exec/sorting/sort_column.cpp @@ -27,6 +27,7 @@ #include "column/map_column.h" #include "column/nullable_column.h" #include "column/struct_column.h" +#include "common/status.h" #include "exec/sorting/sort_helper.h" #include "exec/sorting/sort_permute.h" #include "exec/sorting/sorting.h" @@ -217,6 +218,11 @@ class ColumnSorter final : public ColumnVisitorAdapter> { return sort_and_tie_helper(_cancel, &column, _sort_desc.asc_order(), _permutation, _tie, cmp, _range_or_ranges, _build_tie); } + Status do_visit(const ArrayViewColumn& column) { + DCHECK(false) << "not support array view column sort_and_tie"; + + return Status::NotSupported("not support array view column sort_and_tie"); + } private: const std::atomic& _cancel; @@ -406,6 +412,10 @@ class VerticalColumnSorter final : public ColumnVisitorAdapter diff --git a/be/src/exec/sorting/sort_permute.cpp b/be/src/exec/sorting/sort_permute.cpp index 83edb0101e474..84354df0b6ab1 100644 --- a/be/src/exec/sorting/sort_permute.cpp +++ b/be/src/exec/sorting/sort_permute.cpp @@ -234,6 +234,11 @@ class ColumnAppendPermutation final : public ColumnVisitorMutableAdapter ArrayFunctions::concat(FunctionContext* ctx, const Columns& RETURN_IF_COLUMNS_ONLY_NULL(columns); auto num_rows = columns[0]->size(); - + LOG(INFO) << "array_concat, num_rows: " << num_rows; + for (auto& column: columns) { + LOG(INFO) << "column size: " << column->size() << ", is_const: " << column->is_constant() << ", is_nullable: " << column->is_nullable(); + } // compute nulls NullColumnPtr nulls; for (auto& column : columns) { diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index 9f97cc38fa2bd..f645270efd50b 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -15,13 +15,17 @@ #include "exprs/array_map_expr.h" #include +#include #include "column/array_column.h" #include "column/chunk.h" #include 
"column/column_helper.h" #include "column/const_column.h" #include "column/fixed_length_column.h" +#include "column/array_view_column.h" #include "column/vectorized_fwd.h" +#include "common/constexpr.h" +#include "common/statusor.h" #include "exprs/anyval_util.h" #include "exprs/expr_context.h" #include "exprs/function_helper.h" @@ -41,8 +45,10 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* std::vector input_elements; NullColumnPtr null_column = nullptr; bool is_single_nullable_child = false; - ArrayColumn* input_array = nullptr; + // ArrayColumn* input_array = nullptr; + ColumnPtr input_array = nullptr; ColumnPtr input_array_ptr_ref = nullptr; // hold shared_ptr to avoid early deleted. + // for many valid arguments: // if one of them is a null literal, the result is a null literal; // if one of them is only null, then results are null; @@ -55,37 +61,80 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* if (child_col->only_null()) { return ColumnHelper::align_return_type(child_col, type(), chunk->num_rows(), true); } + LOG(INFO) << "eval child: " << child_col->get_name(); // no optimization for const columns. 
- child_col = ColumnHelper::unpack_and_duplicate_const_column(child_col->size(), child_col); + if (child_col->is_constant()) { + LOG(INFO) << "unpack const, " << child_col->get_name(); + } - auto column = child_col; - if (child_col->is_nullable()) { - auto nullable = down_cast(child_col.get()); - DCHECK(nullable != nullptr); - column = nullable->data_column(); + bool is_const = child_col->is_constant(); + bool is_nullable = child_col->is_nullable(); + size_t num_rows = child_col->size(); + + auto data_column = child_col; + if (is_const) { + auto const_column = down_cast(child_col.get()); + data_column = const_column->data_column(); + } + + // child_col = ColumnHelper::unpack_and_duplicate_const_column(child_col->size(), child_col); + // @TODO consider const nullable + if (is_nullable) { + // auto nullable = down_cast(child_col.get()); + auto nullable_column = down_cast(data_column.get()); + DCHECK(nullable_column); + data_column = nullable_column->data_column(); // empty null array with non-zero elements - column->empty_null_in_complex_column(nullable->null_column()->get_data(), - down_cast(column.get())->offsets().get_data()); + // @TODO remove this, do all things in align phase? + // column->empty_null_in_complex_column(nullable->null_column()->get_data(), + // down_cast(column.get())->offsets().get_data()); + + // @TODO what is is_single_nullable_child.. 
if (null_column) { is_single_nullable_child = false; - null_column = FunctionHelper::union_null_column(nullable->null_column(), null_column); // merge null + null_column = FunctionHelper::union_null_column(nullable_column->null_column(), null_column); // merge null } else { is_single_nullable_child = true; - null_column = nullable->null_column(); + null_column = nullable_column->null_column(); } } - DCHECK(column->is_array()); - auto cur_array = down_cast(column.get()); + DCHECK(data_column->is_array() && !data_column->is_nullable()); + // @TODO column maybe const + // auto cur_array = down_cast(column.get()); + + // @TODO should keep one column ,make sure array len is same + // check each array size in this column? if (input_array == nullptr) { - input_array = cur_array; - input_array_ptr_ref = column; + // input_array = cur_array; + input_array = data_column; + input_array_ptr_ref = data_column; + LOG(INFO) << "input_array: " << data_column->get_name(); + // input_array = column; + // input_array_ptr_ref = column; + // @TODO what if input_array is const column } else { - if (UNLIKELY(!ColumnHelper::offsets_equal(cur_array->offsets_column(), input_array->offsets_column()))) { - return Status::InternalError("Input array element's size is not equal in array_map()."); - } + // @TODO need a function to check each array size + // if (UNLIKELY(!ColumnHelper::offsets_equal(cur_array->offsets_column(), input_array->offsets_column()))) { + // return Status::InternalError("Input array element's size is not equal in array_map()."); + // } + } + + // @TODO + // elements maybe const + ColumnPtr elements_column = nullptr; + auto array_column = down_cast(data_column.get()); + if (is_const) { + // if original column is const column, should keep const + elements_column = ConstColumn::create(array_column->elements_column(), num_rows); + } else { + elements_column = array_column->elements_column(); } - input_elements.push_back(cur_array->elements_column()); + + // @TODO put all 
element column into input elements + input_elements.emplace_back(elements_column); + + // input_elements.push_back(cur_array->elements_column()); } if (is_single_nullable_child) { @@ -96,7 +145,11 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* } ColumnPtr column = nullptr; - if (input_array->elements_column()->empty()) { // arrays may be null or empty + // @TODO handle empty case? + // @TODO if all is null + if (null_column->only_null()) { + // @TODO need check + // if (input_array->elements_column()->empty()) { // arrays may be null or empty column = ColumnHelper::create_column(type().children[0], true); // array->elements must be of return array->elements' type } else { @@ -109,8 +162,12 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* DCHECK(argument_num == input_elements.size()); for (int i = 0; i < argument_num; ++i) { cur_chunk->append_column(input_elements[i], arguments_ids[i]); // column ref + LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; } + + // @TODO align all element column // put captured columns into the new chunk aligning with the first array's offsets + // @TODO we don't need align? std::vector slot_ids; _children[0]->get_slot_ids(&slot_ids); for (auto id : slot_ids) { @@ -120,16 +177,68 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* return Status::InternalError(fmt::format( "The size of the captured column {} is less than array's size.", captured->get_name())); } - cur_chunk->append_column(captured->replicate(input_array->offsets_column()->get_data()), id); + // @TODO maybe we need binary view column too... + // @TODO if capture is a binary column, replicate is expansive too + // @TODO not sure if captured is array?? 
+ LOG(INFO) << "capture column: " << captured->get_name() << ", id: " << id; + // if (captured->is_array()) { + // LOG(INFO) << "build array view column from captured, slot id: " << id; + // ASSIGN_OR_RETURN(captured, ArrayViewColumn::from_array_column(captured)); + // captured->check_or_die(); + // } + // capture column may be not lambda arguement? + + // align offsets + if (!captured->is_constant()) { + // @TODO replicate?? + // for non-const column, we should align-up offsets. + // @TODO must replicate... + cur_chunk->append_column(captured, id); + } else { + cur_chunk->append_column(captured, id); + } + + // @TODO align array-argument is enough, if other column don't ref to array, replictte is not necessar + // @TODO consider alignup, what if first column is const?? + // cur_chunk->append_column(captured->align_offsets({}), id); + // cur_chunk->append_column(captured->replicate(input_array->offsets_column()->get_data()), id); } - if (cur_chunk->num_rows() <= chunk->num_rows() * 8) { - ASSIGN_OR_RETURN(column, context->evaluate(_children[0], cur_chunk.get())); - column = ColumnHelper::align_return_type(column, type().children[0], cur_chunk->num_rows(), true); - } else { // split large chunks into small ones to avoid too large or various batch_size + + // @TODO + { + // @TODO evalu param may be very large?? 
+ // cut tmp chunk from cur_chunk, and eval + // cut data + // if cur_chunk has view_column, we should convert view_column to column again + + // @TODO cut row [x,y] into a tmp chunk ChunkAccumulator accumulator(DEFAULT_CHUNK_SIZE); RETURN_IF_ERROR(accumulator.push(std::move(cur_chunk))); accumulator.finalize(); while (auto tmp_chunk = accumulator.pull()) { + // if contains view, should translate it back + // TODO change column + auto new_chunk = std::make_shared(); + const auto& columns = tmp_chunk->columns(); + for(size_t idx = 0;idx < columns.size();idx++) { + const auto& column = columns[idx]; + if (column->is_array_view()) { + LOG(INFO) << "convert array-view to array, " << column->get_name(); + ASSIGN_OR_RETURN(auto new_column, ArrayViewColumn::to_array_column(column)); + LOG(INFO) << "convert done"; + new_column->check_or_die(); + // auto array_view_column = down_cast(column.get()); + // ASSIGN_OR_RETURN(auto new_column, array_view_column->to_array_column()); + LOG(INFO) << "update column, idx: " << idx; + tmp_chunk->update_column_by_index(new_column, idx); + } + } + tmp_chunk->check_or_die(); + for (const auto& column: tmp_chunk->columns()) { + LOG(INFO) << "column: " << column->get_name(); + DCHECK(!column->is_array_view()) << "unexpected array view"; + } + ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], tmp_chunk.get())); tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], tmp_chunk->num_rows(), true); if (column == nullptr) { @@ -140,13 +249,40 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* } } + + // @TODO + // evaluate lambda expr? 
+ // if (cur_chunk->num_rows() <= chunk->num_rows() * 8) { + // ASSIGN_OR_RETURN(column, context->evaluate(_children[0], cur_chunk.get())); + // column = ColumnHelper::align_return_type(column, type().children[0], cur_chunk->num_rows(), true); + // } else { // split large chunks into small ones to avoid too large or various batch_size + // ChunkAccumulator accumulator(DEFAULT_CHUNK_SIZE); + // RETURN_IF_ERROR(accumulator.push(std::move(cur_chunk))); + // accumulator.finalize(); + // while (auto tmp_chunk = accumulator.pull()) { + // ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], tmp_chunk.get())); + // tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], tmp_chunk->num_rows(), true); + // if (column == nullptr) { + // column = tmp_col; + // } else { + // column->append(*tmp_col); + // } + // } + // } + // construct the result array DCHECK(column != nullptr); column = ColumnHelper::cast_to_nullable_column(column); + } + // @TODO handle const? + // attach offsets - auto array_col = std::make_shared( - column, ColumnHelper::as_column(input_array->offsets_column()->clone_shared())); + // auto array_col = std::make_shared( + // column, ColumnHelper::as_column(input_array->offsets_column()->clone_shared())); + auto array_col = std::make_shared(column, UInt32Column::create()); + + if (null_column != nullptr) { return NullableColumn::create(std::move(array_col), null_column); } diff --git a/be/src/serde/column_array_serde.cpp b/be/src/serde/column_array_serde.cpp index f86700d65e0d5..fe11e7b377ac6 100644 --- a/be/src/serde/column_array_serde.cpp +++ b/be/src/serde/column_array_serde.cpp @@ -31,6 +31,8 @@ #include "column/nullable_column.h" #include "column/object_column.h" #include "column/struct_column.h" +#include "column/vectorized_fwd.h" +#include "common/status.h" #include "gutil/strings/substitute.h" #include "runtime/descriptors.h" #include "serde/protobuf_serde.h" @@ -542,6 +544,11 @@ class ColumnSerializedSizeVisitor final : 
public ColumnVisitorAdapter Date: Fri, 13 Sep 2024 16:38:09 +0800 Subject: [PATCH 02/17] extract common expr in array_map Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/column/array_column.h | 1 + be/src/exprs/array_functions.cpp | 3 + be/src/exprs/array_map_expr.cpp | 101 ++++++++++++++++++-------- be/src/exprs/array_map_expr.h | 6 ++ be/src/exprs/expr.cpp | 3 +- be/src/exprs/expr.h | 4 ++ be/src/exprs/lambda_function.cpp | 120 ++++++++++++++++++++++++++++--- be/src/exprs/lambda_function.h | 23 +++++- 8 files changed, 220 insertions(+), 41 deletions(-) diff --git a/be/src/column/array_column.h b/be/src/column/array_column.h index de44025ac2d9a..4e57b583b5c54 100644 --- a/be/src/column/array_column.h +++ b/be/src/column/array_column.h @@ -173,6 +173,7 @@ class ArrayColumn final : public ColumnFactory { const UInt32Column& offsets() const { return *_offsets; } UInt32Column::Ptr& offsets_column() { return _offsets; } + UInt32Column::Ptr offsets_column() const { return _offsets; } bool is_nullable() const override { return false; } diff --git a/be/src/exprs/array_functions.cpp b/be/src/exprs/array_functions.cpp index 52821ca847c5b..cf24dde409b40 100644 --- a/be/src/exprs/array_functions.cpp +++ b/be/src/exprs/array_functions.cpp @@ -1104,6 +1104,8 @@ StatusOr ArrayFunctions::any_match(FunctionContext* context, const Co StatusOr ArrayFunctions::concat(FunctionContext* ctx, const Columns& columns) { RETURN_IF_COLUMNS_ONLY_NULL(columns); + // @TODO optimize for const column + auto num_rows = columns[0]->size(); LOG(INFO) << "array_concat, num_rows: " << num_rows; for (auto& column: columns) { @@ -1130,6 +1132,7 @@ StatusOr ArrayFunctions::concat(FunctionContext* ctx, const Columns& auto nullable_column = down_cast(column.get()); array_columns.emplace_back(std::static_pointer_cast(nullable_column->data_column())); } else if (column->is_constant()) { + // @TODO no need // NOTE: I'm not sure if there will be const array, just to 
be safe array_columns.emplace_back(std::static_pointer_cast( ColumnHelper::unpack_and_duplicate_const_column(num_rows, column))); diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index f645270efd50b..56a1d112a74ee 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -38,6 +38,28 @@ ArrayMapExpr::ArrayMapExpr(const TExprNode& node) : Expr(node, false) {} ArrayMapExpr::ArrayMapExpr(TypeDescriptor type) : Expr(std::move(type), false) {} +Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { + for (int i = 1;i < _children.size(); ++i) { + RETURN_IF_ERROR(_children[i]->prepare(state, context)); + } + auto lambda_expr = down_cast(_children[0]); + // before prepare lambda + // collect max slot id + LambdaFunction::ExtractContext extract_ctx; + extract_ctx.next_slot_id = lambda_expr->max_used_slot_id() + 1; + + LOG(INFO) << "ArrayMap::prepre, next slot id: " << extract_ctx.next_slot_id; + RETURN_IF_ERROR(lambda_expr->extract_outer_common_exprs(state, &extract_ctx)); + _outer_common_exprs.swap(extract_ctx.outer_common_exprs); + + for (auto [_, expr]: _outer_common_exprs) { + RETURN_IF_ERROR(expr->prepare(state, context)); + } + RETURN_IF_ERROR(lambda_expr->prepare(state, context)); + + return Status::OK(); +} + // The input array column maybe nullable, so first remove the wrap of nullable property. // The result of lambda expressions do not change the offsets of the current array and the null map. // NOTE the return column must be of the return type. @@ -49,6 +71,9 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* ColumnPtr input_array = nullptr; ColumnPtr input_array_ptr_ref = nullptr; // hold shared_ptr to avoid early deleted. 
+ ColumnPtr aligned_offsets; + // @TODO we should eval common expr first + // for many valid arguments: // if one of them is a null literal, the result is a null literal; // if one of them is only null, then results are null; @@ -61,11 +86,11 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* if (child_col->only_null()) { return ColumnHelper::align_return_type(child_col, type(), chunk->num_rows(), true); } - LOG(INFO) << "eval child: " << child_col->get_name(); + LOG(INFO) << "eval child: " << child_col->get_name() << ", " << _children[i]->debug_string(); // no optimization for const columns. - if (child_col->is_constant()) { - LOG(INFO) << "unpack const, " << child_col->get_name(); - } + // if (child_col->is_constant()) { + // LOG(INFO) << "unpack const, " << child_col->get_name(); + // } bool is_const = child_col->is_constant(); bool is_nullable = child_col->is_nullable(); @@ -85,9 +110,9 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* DCHECK(nullable_column); data_column = nullable_column->data_column(); // empty null array with non-zero elements - // @TODO remove this, do all things in align phase? - // column->empty_null_in_complex_column(nullable->null_column()->get_data(), - // down_cast(column.get())->offsets().get_data()); + // @TODO can we remove it?? + data_column->empty_null_in_complex_column(nullable_column->null_column()->get_data(), + down_cast(data_column.get())->offsets().get_data()); // @TODO what is is_single_nullable_child.. if (null_column) { @@ -104,15 +129,20 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // @TODO should keep one column ,make sure array len is same + auto array_column = down_cast(data_column.get()); // check each array size in this column? 
if (input_array == nullptr) { // input_array = cur_array; input_array = data_column; input_array_ptr_ref = data_column; LOG(INFO) << "input_array: " << data_column->get_name(); - // input_array = column; - // input_array_ptr_ref = column; - // @TODO what if input_array is const column + // compute aligned_offsets + if (is_const) { + aligned_offsets = ColumnHelper::unpack_and_duplicate_const_column(child_col->size(), ConstColumn::create(array_column->offsets_column(), 1)); + } else { + aligned_offsets = array_column->offsets_column(); + } + } else { // @TODO need a function to check each array size // if (UNLIKELY(!ColumnHelper::offsets_equal(cur_array->offsets_column(), input_array->offsets_column()))) { @@ -123,7 +153,6 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // @TODO // elements maybe const ColumnPtr elements_column = nullptr; - auto array_column = down_cast(data_column.get()); if (is_const) { // if original column is const column, should keep const elements_column = ConstColumn::create(array_column->elements_column(), num_rows); @@ -137,6 +166,7 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // input_elements.push_back(cur_array->elements_column()); } + if (is_single_nullable_child) { DCHECK(null_column != nullptr); // If there are more than one nullable children, the nullable column has been cloned when calling @@ -155,6 +185,9 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* } else { // construct a new chunk to evaluate the lambda expression. 
auto cur_chunk = std::make_shared(); + // @TODO assign column id + // @TODO eval common expr + // put all arguments into the new chunk std::vector arguments_ids; auto lambda_func = dynamic_cast(_children[0]); @@ -164,12 +197,29 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* cur_chunk->append_column(input_elements[i], arguments_ids[i]); // column ref LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; } + // @TODO how to know + // @TODO we can choos to filter all non before eval?, not sure + + // @TODO capture column dont need // @TODO align all element column // put captured columns into the new chunk aligning with the first array's offsets // @TODO we don't need align? + + + // const auto& independent_capture_expr = lambda_func->get_independent_capture_exprs(); + LOG(INFO) << "eval outer common exprs, size: " << _outer_common_exprs.size(); + for (const auto& [column_ref, expr]: _outer_common_exprs) { + auto slot_id = down_cast(column_ref)->slot_id(); + LOG(INFO) << "eval non-capture expr: " << slot_id; + ASSIGN_OR_RETURN(auto col, context->evaluate(expr, chunk)); + chunk->append_column(col, slot_id); + } std::vector slot_ids; - _children[0]->get_slot_ids(&slot_ids); + lambda_func->get_slot_ids(&slot_ids); + for (auto id: slot_ids) { + LOG(INFO) << "lambda capture column: " << id << ", " << chunk->get_column_by_slot_id(id)->get_name(); + } for (auto id : slot_ids) { DCHECK(id > 0); auto captured = chunk->get_column_by_slot_id(id); @@ -180,28 +230,21 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // @TODO maybe we need binary view column too... // @TODO if capture is a binary column, replicate is expansive too // @TODO not sure if captured is array?? 
- LOG(INFO) << "capture column: " << captured->get_name() << ", id: " << id; + LOG(INFO) << "capture column: " << captured->get_name() << ", id: " << id + << ", size: " << captured->size() << ", num_rows:" << cur_chunk->num_rows(); + // @TODO how to know // if (captured->is_array()) { // LOG(INFO) << "build array view column from captured, slot id: " << id; // ASSIGN_OR_RETURN(captured, ArrayViewColumn::from_array_column(captured)); // captured->check_or_die(); // } // capture column may be not lambda arguement? + /// if this capture column is not lambada argument, we treat it as const column to avoid slot + auto offsets = down_cast(aligned_offsets.get())->get_data(); // align offsets - if (!captured->is_constant()) { - // @TODO replicate?? - // for non-const column, we should align-up offsets. - // @TODO must replicate... - cur_chunk->append_column(captured, id); - } else { - cur_chunk->append_column(captured, id); - } - - // @TODO align array-argument is enough, if other column don't ref to array, replictte is not necessar - // @TODO consider alignup, what if first column is const?? - // cur_chunk->append_column(captured->align_offsets({}), id); - // cur_chunk->append_column(captured->replicate(input_array->offsets_column()->get_data()), id); + LOG(INFO) << "relicate captured column, id: "<append_column(captured->replicate(offsets), id); } // @TODO @@ -278,10 +321,8 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // @TODO handle const? 
// attach offsets - // auto array_col = std::make_shared( - // column, ColumnHelper::as_column(input_array->offsets_column()->clone_shared())); - auto array_col = std::make_shared(column, UInt32Column::create()); - + auto array_col = std::make_shared( + column, ColumnHelper::as_column(aligned_offsets->clone_shared())); if (null_column != nullptr) { return NullableColumn::create(std::move(array_col), null_column); diff --git a/be/src/exprs/array_map_expr.h b/be/src/exprs/array_map_expr.h index 98718103564ad..0b954c35b4015 100644 --- a/be/src/exprs/array_map_expr.h +++ b/be/src/exprs/array_map_expr.h @@ -16,6 +16,7 @@ #include #include +#include #include "common/global_types.h" #include "common/object_pool.h" @@ -34,8 +35,13 @@ class ArrayMapExpr final : public Expr { // for tests explicit ArrayMapExpr(TypeDescriptor type); + Status prepare(RuntimeState* state, ExprContext* context) override; Expr* clone(ObjectPool* pool) const override { return pool->add(new ArrayMapExpr(*this)); } StatusOr evaluate_checked(ExprContext* context, Chunk* ptr) override; + +private: + // use map to make sure the order of execution + std::map _outer_common_exprs; }; } // namespace starrocks diff --git a/be/src/exprs/expr.cpp b/be/src/exprs/expr.cpp index cd805dcff4e61..5849df1cc2669 100644 --- a/be/src/exprs/expr.cpp +++ b/be/src/exprs/expr.cpp @@ -73,6 +73,7 @@ #include "exprs/match_expr.h" #include "exprs/placeholder_ref.h" #include "exprs/subfield_expr.h" +#include "gutil/casts.h" #include "gutil/strings/substitute.h" #include "runtime/runtime_state.h" #include "types/logical_type.h" @@ -606,7 +607,7 @@ std::string Expr::debug_string(const std::vector& exprs) { out << "["; for (int i = 0; i < exprs.size(); ++i) { - out << (i == 0 ? "" : " ") << exprs[i]->debug_string(); + out << (i == 0 ? 
"" : "\n") << exprs[i]->debug_string(); } out << "]"; diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h index 65182c7c17495..54bcbce03a294 100644 --- a/be/src/exprs/expr.h +++ b/be/src/exprs/expr.h @@ -66,6 +66,7 @@ class JITContext; class JITExpr; struct JitScore; struct LLVMDatum; +class LambdaFunction; // This is the superclass of all expr evaluation nodes. class Expr { @@ -197,6 +198,7 @@ class Expr { static void close(const std::vector& exprs); virtual std::string debug_string() const; + static std::string debug_string(const std::vector& exprs); static std::string debug_string(const std::vector& ctxs); @@ -270,6 +272,8 @@ class Expr { friend class Literal; friend class ExprContext; friend class ColumnPredicateRewriter; + friend class LambdaFunction; + friend class ArrayMapExpr; explicit Expr(TypeDescriptor type); explicit Expr(const TExprNode& node); diff --git a/be/src/exprs/lambda_function.cpp b/be/src/exprs/lambda_function.cpp index 3dff0cee3bb3e..1aa74eca14cdd 100644 --- a/be/src/exprs/lambda_function.cpp +++ b/be/src/exprs/lambda_function.cpp @@ -16,16 +16,87 @@ #include +#include #include #include "column/chunk.h" #include "column/column_helper.h" #include "column/vectorized_fwd.h" +#include "exec/exec_node.h" +#include "exprs/column_ref.h" +#include "exprs/expr.h" #include "exprs/expr_context.h" namespace starrocks { -LambdaFunction::LambdaFunction(const TExprNode& node) : Expr(node, false), _common_sub_expr_num(node.output_column) {} +LambdaFunction::LambdaFunction(const TExprNode& node) : Expr(node, false), _common_sub_expr_num(node.output_column) { +} + +Status LambdaFunction::extract_outer_common_exprs( + RuntimeState* state, Expr* expr, ExtractContext* ctx) { + if (expr->is_slotref()) { + return Status::OK(); + } + int child_num = expr->get_num_children(); + std::vector slot_ids; + for (int i = 0;i < child_num;i++) { + auto child = expr->get_child(i); + + RETURN_IF_ERROR(extract_outer_common_exprs(state, child, ctx)); + if 
(child->is_slotref()) { + continue; + } + slot_ids.clear(); + child->get_slot_ids(&slot_ids); + bool is_independent = std::all_of(slot_ids.begin(), slot_ids.end(), [ctx](const SlotId& id) { + return ctx->lambda_arguments.find(id) == ctx->lambda_arguments.end(); + }); + if (is_independent) { + SlotId slot_id = ctx->next_slot_id++; + ColumnRef* column_ref = state->obj_pool()->add(new ColumnRef(child->type(), slot_id)); + LOG(INFO) << "add new common expr, slot_id: " << slot_id << ", new expr: " << column_ref->debug_string() + << ", old expr: " << child->debug_string(); + expr->_children[i] = column_ref; + ctx->outer_common_exprs.insert({column_ref, child}); + } + } + return Status::OK(); +} + +Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, ExtractContext* ctx) { + RETURN_IF_ERROR(collect_lambda_argument_ids()); + for (auto argument_id: _arguments_ids) { + ctx->lambda_arguments.insert(argument_id); + LOG(INFO) << "lambda arg id: " << argument_id; + } + auto lambda_expr = _children[0]; + RETURN_IF_ERROR(extract_outer_common_exprs(state, lambda_expr, ctx)); + return Status::OK(); +} + +Status LambdaFunction::collect_lambda_argument_ids() { + if (!_arguments_ids.empty()) { + return Status::OK(); + } + const int child_num = get_num_children() - 2 * _common_sub_expr_num; + for (int i = 1;i < child_num;i++) { + _children[i]->get_slot_ids(&_arguments_ids); + } + if (child_num - 1 != _arguments_ids.size()) { + return Status::InternalError(fmt::format("Lambda arguments get ids failed, just get {} ids from {} arguments.", + _arguments_ids.size(), child_num - 1)); + } + return Status::OK(); +} + +SlotId LambdaFunction::max_used_slot_id() const { + std::vector ids; + for (auto child: _children) { + child->get_slot_ids(&ids); + } + DCHECK(!ids.empty()); + return *std::max_element(ids.begin(), ids.end()); +} Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprContext* context) { RETURN_IF_ERROR(Expr::prepare(state, context)); @@ 
-33,16 +104,26 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo return Status::OK(); } _is_prepared = true; + // common sub expressions include 2 parts in a pair: (slot id, expression) const int child_num = get_num_children() - 2 * _common_sub_expr_num; - // collect the slot ids of lambda arguments - for (int i = 1; i < child_num; ++i) { - get_child(i)->get_slot_ids(&_arguments_ids); - } - if (child_num - 1 != _arguments_ids.size()) { - return Status::InternalError(fmt::format("Lambda arguments get ids failed, just get {} ids from {} arguments.", - _arguments_ids.size(), child_num - 1)); + LOG(INFO) << "lambda child num: " << child_num << ", common: " << _common_sub_expr_num; + LOG(INFO) << debug_string(); + for (int i = 0; i< child_num;i++) { + LOG(INFO) << "child[" << i << "] = " << get_child(i)->debug_string(); } + RETURN_IF_ERROR(collect_lambda_argument_ids()); + // collect the slot ids of lambda arguments + // for (int i = 1; i < child_num; ++i) { + // get_child(i)->get_slot_ids(&_arguments_ids); + // } + // for (const auto& arg_id: _arguments_ids) { + // LOG(INFO) << "lambda arg id: " << arg_id; + // } + // if (child_num - 1 != _arguments_ids.size()) { + // return Status::InternalError(fmt::format("Lambda arguments get ids failed, just get {} ids from {} arguments.", + // _arguments_ids.size(), child_num - 1)); + // } // sorted common sub expressions so that the later expressions can reference the previous ones. 
for (auto i = child_num; i < child_num + _common_sub_expr_num; ++i) { get_child(i)->get_slot_ids(&_common_sub_expr_ids); @@ -52,18 +133,28 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo fmt::format("Lambda common sub expression id's size {} is not equal to expected {}", _common_sub_expr_ids.size(), _common_sub_expr_num)); } + LOG(INFO) << "lambda common_sub_expr_num: " << _common_sub_expr_num; for (auto i = child_num + _common_sub_expr_num; i < child_num + 2 * _common_sub_expr_num; ++i) { + LOG(INFO) << "commom expr: " << i << ", " << get_child(i)->debug_string(); _common_sub_expr.push_back(get_child(i)); get_child(i)->get_slot_ids(&_captured_slot_ids); + // @TODO why put into captured slot id } if (_common_sub_expr.size() != _common_sub_expr_num) { return Status::InternalError(fmt::format("Lambda common sub expressions' size {} is not equal to expected {}", _common_sub_expr.size(), _common_sub_expr_num)); - } + } + // get slot ids from the lambda expression get_child(0)->get_slot_ids(&_captured_slot_ids); + for (auto id: _captured_slot_ids) { + LOG(INFO) << "lambda capture id: " << id ; + } + + // @TODO find all independent capture column, evaluate them first... + // remove current argument ids and duplicated ids from captured_slot_ids std::map captured_mask; @@ -101,8 +192,19 @@ StatusOr LambdaFunction::evaluate_checked(ExprContext* context, Chunk for (auto i = 0; i < _common_sub_expr.size(); ++i) { auto sub_col = EVALUATE_NULL_IF_ERROR(context, _common_sub_expr[i], chunk); chunk->append_column(sub_col, _common_sub_expr_ids[i]); + LOG(INFO) << "eval common expr: " << _common_sub_expr_ids[i]; } return get_child(0)->evaluate_checked(context, chunk); } +std::string LambdaFunction::debug_string() const { + std::stringstream out; + auto expr_debug_string = Expr::debug_string(); + out << "LambaFunction ("; + for (int i = 0;i < _children.size();i++) { + out << (i == 0 ? 
"lambda expr, ": "input argument, ") << _children[i]->debug_string(); + } + out << ")"; + return out.str(); +} } // namespace starrocks diff --git a/be/src/exprs/lambda_function.h b/be/src/exprs/lambda_function.h index 2be3d08e0ccd0..6a8ec38a815d4 100644 --- a/be/src/exprs/lambda_function.h +++ b/be/src/exprs/lambda_function.h @@ -16,11 +16,11 @@ #include #include +#include #include #include "common/global_types.h" #include "common/object_pool.h" -#include "exprs/column_ref.h" #include "exprs/expr.h" #include "glog/logging.h" #include "gutil/casts.h" @@ -47,6 +47,7 @@ class LambdaFunction final : public Expr { // the slot ids of lambda expression may be originally from the arguments of this lambda function // or its parent lambda functions, or captured columns, remove the first one. + // only capture column id, int get_slot_ids(std::vector* slot_ids) const override { slot_ids->insert(slot_ids->end(), _captured_slot_ids.begin(), _captured_slot_ids.end()); return _captured_slot_ids.size(); @@ -58,12 +59,32 @@ class LambdaFunction final : public Expr { } Expr* get_lambda_expr() const { return _children[0]; } + std::string debug_string() const override; + + struct ExtractContext { + std::unordered_set lambda_arguments; + SlotId next_slot_id; + std::map outer_common_exprs; + }; + SlotId max_used_slot_id() const; + + Status extract_outer_common_exprs(RuntimeState* state, ExtractContext* ctx); private: + Status collect_lambda_argument_ids(); + Status collect_capture_slot_ids(); + Status extract_outer_common_exprs(RuntimeState* state, Expr* expr, ExtractContext* ctx); + // void extract_outer_common_exprs(RuntimeState* state); + // static const SlotId kIndependentStartId = 10000; + // void find_all_independent_capture_column(Expr* expr, std::vector* ids); + // void try_to_replace_commom_expr(RuntimeState* state, Expr* expr); + std::vector _captured_slot_ids; std::vector _arguments_ids; std::vector _common_sub_expr_ids; std::vector _common_sub_expr; + + // std::unordered_map 
_outer_common_exprs; int _common_sub_expr_num; bool _is_prepared = false; }; From 9e1b4b4f7cdb27fbda37464c89c34df902bbeaaa Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Wed, 18 Sep 2024 21:10:42 +0800 Subject: [PATCH 03/17] stash Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/column/array_column.cpp | 67 +++++- be/src/column/array_column.h | 12 +- be/src/column/chunk.cpp | 1 + be/src/column/column.cpp | 3 +- be/src/column/column_helper.h | 1 + be/src/exprs/array_map_expr.cpp | 353 ++++++++++++++++++------------- be/src/exprs/array_map_expr.h | 2 +- be/src/exprs/expr.h | 3 + be/src/exprs/lambda_function.cpp | 25 ++- be/src/exprs/lambda_function.h | 4 + 10 files changed, 299 insertions(+), 172 deletions(-) diff --git a/be/src/column/array_column.cpp b/be/src/column/array_column.cpp index 4c306e19b7f0c..633c2d1bc04cc 100644 --- a/be/src/column/array_column.cpp +++ b/be/src/column/array_column.cpp @@ -17,6 +17,7 @@ #include #include "column/column_helper.h" +#include "exprs/function_helper.h" #include "column/fixed_length_column.h" #include "column/nullable_column.h" #include "column/vectorized_fwd.h" @@ -617,19 +618,65 @@ Status ArrayColumn::unfold_const_children(const starrocks::TypeDescriptor& type) return Status::OK(); } -bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& lhs, const ColumnPtr& rhs, const NullColumnPtr& null_column) { - if (!lhs->is_array() || !rhs->is_array()) { - throw std::runtime_error("input of is_all_array_lengths_equal shoule be array"); + +template +bool ArrayColumn::compare_lengths_from_offsets(const UInt32Column& v1, const UInt32Column& v2, const NullColumnPtr& null_column) { + [[maybe_unused]] uint8_t* null_data = nullptr; + if constexpr (!IgnoreNull) { + null_data = null_column->get_data().data(); + } + + size_t num_rows = v1.size(); + if constexpr (ConstV1 && ConstV2) { + // if both are const column, we only compare the 
first row once + num_rows = 1; } - if (lhs->size() != rhs->size()) { + + bool result = true; + const auto& offsets_v1 = v1.get_data(); + const auto& offsets_v2 = v2.get_data(); + for (size_t i = 0;i < num_rows && result;i++) { + [[maybe_unused]] uint32_t len1 = (ConstV1) ? (offsets_v1[1] - offsets_v1[0]) : (offsets_v1[i + 1] - offsets_v1[i]); + [[maybe_unused]] uint32_t len2 = (ConstV2) ? (offsets_v2[1] - offsets_v2[0]) : (offsets_v2[i + 1] - offsets_v2[i]); + if constexpr (IgnoreNull) { + result &= (len1 == len2); + } else { + if (!null_data[i]) { + result &= (len1 == len2); + } + } + } + return result; +} + + +template +bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_column) { + DCHECK(v1->is_array() && v2->is_array()); + DCHECK(!v1->is_nullable() && !v2->is_nullable()); + + if (v1->size() != v2->size()) { return false; } - // @TODO reject nullable column + auto data_v1 = FunctionHelper::get_data_column_of_const(v1); + auto data_v2 = FunctionHelper::get_data_column_of_const(v2); + auto* array_v1 = down_cast(data_v1.get()); + auto* array_v2 = down_cast(data_v2.get()); + const auto& offsets_v1 = array_v1->offsets(); + const auto& offsets_v2 = array_v2->offsets(); + if (v1->is_constant() && v2->is_constant()) { + return compare_lengths_from_offsets(offsets_v1, offsets_v2, null_column); + } else if (v1->is_constant() && !v2->is_constant()) { + return compare_lengths_from_offsets(offsets_v1, offsets_v2, null_column); + } else if (!v1->is_constant() && v2->is_constant()) { + return compare_lengths_from_offsets(offsets_v1, offsets_v2, null_column); + } + + return compare_lengths_from_offsets(offsets_v1, offsets_v2, null_column); +} + +template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_data); +template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_data); - // @TODO 
consider nullable column - // if one of them is null, skip check - // otherwise, check length - return true; } // namespace starrocks -} diff --git a/be/src/column/array_column.h b/be/src/column/array_column.h index 4e57b583b5c54..15a15532fea53 100644 --- a/be/src/column/array_column.h +++ b/be/src/column/array_column.h @@ -196,9 +196,16 @@ class ArrayColumn final : public ColumnFactory { Status unfold_const_children(const starrocks::TypeDescriptor& type) override; - static bool is_all_array_lengths_equal(const ColumnPtr& lhs, const ColumnPtr& rhs, const NullColumnPtr& null_data); + + // check if all of arrays' size is equal + // v1 and v2 must be one of ArrayColumn or Const(ArrayColumn) + template + static bool is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_data); private: + template + static bool compare_lengths_from_offsets(const UInt32Column& v1, const UInt32Column& v2, const NullColumnPtr& null_data); + // Elements must be NullableColumn to facilitate handling nested types. ColumnPtr _elements; // Offsets column will store the start position of every array element. 
@@ -208,4 +215,7 @@ class ArrayColumn final : public ColumnFactory { UInt32Column::Ptr _offsets; }; +extern template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_data); +extern template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_data); + } // namespace starrocks diff --git a/be/src/column/chunk.cpp b/be/src/column/chunk.cpp index 887ed8f480658..343b66c5ec6ec 100644 --- a/be/src/column/chunk.cpp +++ b/be/src/column/chunk.cpp @@ -142,6 +142,7 @@ void Chunk::append_vector_column(ColumnPtr column, const FieldPtr& field, SlotId } void Chunk::append_column(ColumnPtr column, SlotId slot_id) { + DCHECK(!_slot_id_to_index.contains(slot_id)); _slot_id_to_index[slot_id] = _columns.size(); _columns.emplace_back(std::move(column)); check_or_die(); diff --git a/be/src/column/column.cpp b/be/src/column/column.cpp index 09c3da865e788..9960fc8dfc5b9 100644 --- a/be/src/column/column.cpp +++ b/be/src/column/column.cpp @@ -67,7 +67,8 @@ StatusOr Column::upgrade_helper_func(ColumnPtr* col) { } bool Column::empty_null_in_complex_column(const Filter& null_data, const Buffer& offsets) { - DCHECK(null_data.size() == this->size()); + // DCHECK(null_data.size() == this->size()); + DCHECK_EQ(null_data.size(), this->size()); if (!is_array() && !is_map()) { throw std::runtime_error("empty_null_in_complex_column() only works for array and map column."); } diff --git a/be/src/column/column_helper.h b/be/src/column/column_helper.h index 94cd37889ec73..e3f35b66a5fe7 100644 --- a/be/src/column/column_helper.h +++ b/be/src/column/column_helper.h @@ -124,6 +124,7 @@ class ColumnHelper { return column; } + static inline bool offsets_equal(const UInt32Column::Ptr& offset0, const UInt32Column::Ptr& offset1) { if (offset0->size() != offset1->size()) { return false; diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index 
56a1d112a74ee..c87bc5e3593a5 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -31,6 +31,7 @@ #include "exprs/function_helper.h" #include "exprs/lambda_function.h" #include "runtime/user_function_cache.h" +#include "simd/simd.h" #include "storage/chunk_helper.h" namespace starrocks { @@ -42,17 +43,24 @@ Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { for (int i = 1;i < _children.size(); ++i) { RETURN_IF_ERROR(_children[i]->prepare(state, context)); } + // if child 0 is not lambda, what will happen whe nevaluate + + // @TODO if children[0] not lambda + // @TODO _children[0] maybe not a lambda function? auto lambda_expr = down_cast(_children[0]); // before prepare lambda // collect max slot id LambdaFunction::ExtractContext extract_ctx; extract_ctx.next_slot_id = lambda_expr->max_used_slot_id() + 1; - LOG(INFO) << "ArrayMap::prepre, next slot id: " << extract_ctx.next_slot_id; + LOG(INFO) << "ArrayMap::prepare, next slot id: " << extract_ctx.next_slot_id << ", this: " << (void*)this; RETURN_IF_ERROR(lambda_expr->extract_outer_common_exprs(state, &extract_ctx)); _outer_common_exprs.swap(extract_ctx.outer_common_exprs); for (auto [_, expr]: _outer_common_exprs) { + // @TODO + LOG(INFO) << "prepare common expr: " << expr->debug_string(); + // @TODO if after rewrite, first expr of array_map become column ref, we can remove it? RETURN_IF_ERROR(expr->prepare(state, context)); } RETURN_IF_ERROR(lambda_expr->prepare(state, context)); @@ -64,16 +72,22 @@ Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { // The result of lambda expressions do not change the offsets of the current array and the null map. // NOTE the return column must be of the return type. 
StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* chunk) { + // @TODO just use one vector store array column std::vector input_elements; - NullColumnPtr null_column = nullptr; + + // NullColumnPtr null_column = nullptr; bool is_single_nullable_child = false; // ArrayColumn* input_array = nullptr; ColumnPtr input_array = nullptr; ColumnPtr input_array_ptr_ref = nullptr; // hold shared_ptr to avoid early deleted. - ColumnPtr aligned_offsets; + // ColumnPtr aligned_offsets; + UInt32Column::Ptr aligned_offsets; // @TODO we should eval common expr first + // maybe a NullColumn or a Const(NullColumn) + NullColumnPtr result_null_column = nullptr; + bool all_input_is_constant = true; // for many valid arguments: // if one of them is a null literal, the result is a null literal; // if one of them is only null, then results are null; @@ -86,15 +100,12 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* if (child_col->only_null()) { return ColumnHelper::align_return_type(child_col, type(), chunk->num_rows(), true); } - LOG(INFO) << "eval child: " << child_col->get_name() << ", " << _children[i]->debug_string(); - // no optimization for const columns. 
- // if (child_col->is_constant()) { - // LOG(INFO) << "unpack const, " << child_col->get_name(); - // } + LOG(INFO) << "eval child: " << child_col->get_name(); bool is_const = child_col->is_constant(); bool is_nullable = child_col->is_nullable(); size_t num_rows = child_col->size(); + all_input_is_constant &= is_const; auto data_column = child_col; if (is_const) { @@ -102,150 +113,195 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* data_column = const_column->data_column(); } - // child_col = ColumnHelper::unpack_and_duplicate_const_column(child_col->size(), child_col); // @TODO consider const nullable if (is_nullable) { - // auto nullable = down_cast(child_col.get()); auto nullable_column = down_cast(data_column.get()); DCHECK(nullable_column); data_column = nullable_column->data_column(); // empty null array with non-zero elements // @TODO can we remove it?? + + // @TODO we can check it before?? + // this will re-build data column, replace null element to empty array data_column->empty_null_in_complex_column(nullable_column->null_column()->get_data(), down_cast(data_column.get())->offsets().get_data()); - // @TODO what is is_single_nullable_child.. 
- if (null_column) { + auto null_column = nullable_column->null_column(); + if (is_const) { + LOG(INFO) << "input is const, should unpack null column"; + // if null_column is from const_column, should unpack + null_column->assign(num_rows, 0); + } + + // try to merge null column + if (result_null_column) { is_single_nullable_child = false; - null_column = FunctionHelper::union_null_column(nullable_column->null_column(), null_column); // merge null + // union two null column + LOG(INFO) << "union result_null_column, size: " << result_null_column->size() << ", null size: " << null_column->size(); + result_null_column = FunctionHelper::union_null_column(null_column, result_null_column); + LOG(INFO) << "union done: " << result_null_column->size(); } else { is_single_nullable_child = true; - null_column = nullable_column->null_column(); + result_null_column = null_column; + LOG(INFO) << "assign result_null_column, size: " << null_column->size(); } } DCHECK(data_column->is_array() && !data_column->is_nullable()); - // @TODO column maybe const - // auto cur_array = down_cast(column.get()); - - // @TODO should keep one column ,make sure array len is same - - auto array_column = down_cast(data_column.get()); - // check each array size in this column? 
- if (input_array == nullptr) { - // input_array = cur_array; - input_array = data_column; - input_array_ptr_ref = data_column; - LOG(INFO) << "input_array: " << data_column->get_name(); - // compute aligned_offsets - if (is_const) { - aligned_offsets = ColumnHelper::unpack_and_duplicate_const_column(child_col->size(), ConstColumn::create(array_column->offsets_column(), 1)); - } else { - aligned_offsets = array_column->offsets_column(); - } - } else { - // @TODO need a function to check each array size - // if (UNLIKELY(!ColumnHelper::offsets_equal(cur_array->offsets_column(), input_array->offsets_column()))) { - // return Status::InternalError("Input array element's size is not equal in array_map()."); - // } + ColumnPtr column = data_column; + if (is_const) { + // keep it as a Const(ArrayColumn) in input elelents + column = ConstColumn::create(data_column, num_rows); } - // @TODO - // elements maybe const - ColumnPtr elements_column = nullptr; - if (is_const) { - // if original column is const column, should keep const - elements_column = ConstColumn::create(array_column->elements_column(), num_rows); - } else { - elements_column = array_column->elements_column(); + // check each array's lengths in input_elements + if (!input_elements.empty()) { + const auto& first_input = input_elements[0]; + + bool is_array_lengths_valid = result_null_column ? 
+ ArrayColumn::is_all_array_lengths_equal(first_input, column, result_null_column): + ArrayColumn::is_all_array_lengths_equal(first_input, column, result_null_column); + if (!is_array_lengths_valid) { + return Status::InternalError("Input array element's size is not equal in array_map()."); + } } - // @TODO put all element column into input elements - input_elements.emplace_back(elements_column); - - // input_elements.push_back(cur_array->elements_column()); + input_elements.emplace_back(column); } - if (is_single_nullable_child) { - DCHECK(null_column != nullptr); + DCHECK(result_null_column != nullptr); // If there are more than one nullable children, the nullable column has been cloned when calling // union_null_column to merge, so only one nullable child needs to be cloned. - null_column = ColumnHelper::as_column(null_column->clone_shared()); + result_null_column = ColumnHelper::as_column(result_null_column->clone_shared()); } ColumnPtr column = nullptr; - // @TODO handle empty case? - // @TODO if all is null - if (null_column->only_null()) { - // @TODO need check - // if (input_array->elements_column()->empty()) { // arrays may be null or empty + size_t null_rows = result_null_column ? SIMD::count_nonzero(result_null_column->get_data()): 0; + if (null_rows == input_elements[0]->size()) { + // all input is null column = ColumnHelper::create_column(type().children[0], true); // array->elements must be of return array->elements' type } else { // construct a new chunk to evaluate the lambda expression. 
auto cur_chunk = std::make_shared(); - // @TODO assign column id - // @TODO eval common expr - - // put all arguments into the new chunk - std::vector arguments_ids; - auto lambda_func = dynamic_cast(_children[0]); - int argument_num = lambda_func->get_lambda_arguments_ids(&arguments_ids); - DCHECK(argument_num == input_elements.size()); - for (int i = 0; i < argument_num; ++i) { - cur_chunk->append_column(input_elements[i], arguments_ids[i]); // column ref - LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; - } - // @TODO how to know - // @TODO we can choos to filter all non before eval?, not sure - - // @TODO capture column dont need - // @TODO align all element column - // put captured columns into the new chunk aligning with the first array's offsets - // @TODO we don't need align? - - - // const auto& independent_capture_expr = lambda_func->get_independent_capture_exprs(); + // 1. evaluate all outer common exprs LOG(INFO) << "eval outer common exprs, size: " << _outer_common_exprs.size(); for (const auto& [column_ref, expr]: _outer_common_exprs) { auto slot_id = down_cast(column_ref)->slot_id(); LOG(INFO) << "eval non-capture expr: " << slot_id; ASSIGN_OR_RETURN(auto col, context->evaluate(expr, chunk)); + LOG(INFO) << "col size: " << col->size(); chunk->append_column(col, slot_id); } + + auto lambda_func = dynamic_cast(_children[0]); std::vector slot_ids; lambda_func->get_slot_ids(&slot_ids); - for (auto id: slot_ids) { - LOG(INFO) << "lambda capture column: " << id << ", " << chunk->get_column_by_slot_id(id)->get_name(); - } - for (auto id : slot_ids) { - DCHECK(id > 0); - auto captured = chunk->get_column_by_slot_id(id); - if (UNLIKELY(captured->size() < input_array->size())) { + // 2. 
check captured columns size + for (auto slot_id : slot_ids) { + LOG(INFO) << "check slot id: " << slot_id; + DCHECK(slot_id > 0); + auto captured_column = chunk->get_column_by_slot_id(slot_id); + if (UNLIKELY(captured_column->size() < input_elements[0]->size())) { return Status::InternalError(fmt::format( - "The size of the captured column {} is less than array's size.", captured->get_name())); + "The size of the captured column {} is less than array's size.", captured_column->get_name())); + } + } + + // 3. align up all columns offsets + // if most value is null, we remove all null column, create a new one to evaluate + // else alignup offset + // @TODO we can't avoid copy data here?? + // should we replicate capture column??? + // empty all null is ok + + // @TODO if all input is const, we don't need unpack const + if (all_input_is_constant) { + // if all input arguments are ConstColumn, we don't need unpack, just evaluate on ConstColumn + LOG(INFO) << "all inputs of array_map are ConstColumn"; + + } + // @TODO udpate aligned_offsets, we can use arg0's offsets? + + std::vector arguments_ids; + int argument_num = lambda_func->get_lambda_arguments_ids(&arguments_ids); + DCHECK(argument_num == input_elements.size()); + for (int i = 0; i < argument_num; ++i) { + auto data_column = FunctionHelper::get_data_column_of_const(input_elements[i]); + auto array_column = down_cast(data_column.get()); + auto elements_column = array_column->elements_column(); + UInt32Column::Ptr offsets_column = array_column->offsets_column(); + + if (input_elements[i]->is_constant()) { + // if input is const, we should assign data multiple times + // seems we cant avoid copy data if we don't have view column? 
+ // if input is const, we should wrap its element column as a const column too + // @TODO elements should not be a const column + size_t elements_num = array_column->get_element_size(0); + elements_column = elements_column->clone(); + // create a new offsets + // offsets_column = UInt32Column::create(); + offsets_column = UInt32Column::create(); + // replicate N time and ignore null + size_t repeat_times = input_elements[i]->size() - null_rows; + offsets_column->append(0); + size_t offset = elements_num; + for (size_t i = 0;i < repeat_times;i++) { + elements_column->append(*elements_column, 0, elements_num); + offset += elements_num; + offsets_column->append(offset); + } + + } else { + // @TODO null data size is ok, only one row, why offsets has too many data? + + // @TODO empty_null should apply on array column.. + // elements_column->empty_null_in_complex_column(result_null_column->get_data(), array_column->offsets().get_data()); + data_column->empty_null_in_complex_column(result_null_column->get_data(), array_column->offsets().get_data()); + elements_column = down_cast(data_column.get())->elements_column(); + } + if (aligned_offsets == nullptr) { + aligned_offsets = offsets_column; + } + //append elemt + cur_chunk->append_column(elements_column, arguments_ids[i]); + LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; + } + // @TODO put outer common expr into cur_chunk, + // align offset + for (const auto& [column_ref, expr]: _outer_common_exprs) { + auto slot_id = down_cast(column_ref)->slot_id(); + auto column = chunk->get_column_by_slot_id(slot_id); + column = ColumnHelper::unpack_and_duplicate_const_column(column->size(), column); + // replicate column and put int into cur_chunk + // @TODO what if column is const? + // @TODO this should be in cur_chunk and chunk? 
+ auto aligned_column = column->replicate(aligned_offsets->get_data()); + cur_chunk->append_column(aligned_column, slot_id); + LOG(INFO) << "append outer common column: " << slot_id; + // chunk->append_column(col, slot_id); + } + for (auto slot_id : slot_ids) { + DCHECK(slot_id > 0); + if (cur_chunk->is_slot_exist(slot_id)) { + continue; } - // @TODO maybe we need binary view column too... - // @TODO if capture is a binary column, replicate is expansive too - // @TODO not sure if captured is array?? - LOG(INFO) << "capture column: " << captured->get_name() << ", id: " << id - << ", size: " << captured->size() << ", num_rows:" << cur_chunk->num_rows(); - // @TODO how to know - // if (captured->is_array()) { - // LOG(INFO) << "build array view column from captured, slot id: " << id; - // ASSIGN_OR_RETURN(captured, ArrayViewColumn::from_array_column(captured)); - // captured->check_or_die(); - // } - // capture column may be not lambda arguement? - /// if this capture column is not lambada argument, we treat it as const column to avoid slot - auto offsets = down_cast(aligned_offsets.get())->get_data(); - - // align offsets - LOG(INFO) << "relicate captured column, id: "<append_column(captured->replicate(offsets), id); + auto captured_column = chunk->get_column_by_slot_id(slot_id); + auto aligned_column = captured_column->replicate(aligned_offsets->get_data()); + cur_chunk->append_column(aligned_column, slot_id); + LOG(INFO) << "append capture column, " << slot_id; } + #ifdef DEBUG + { + auto first_column = cur_chunk->get_column_by_slot_id(arguments_ids[0]); + for (int i = 1;i < argument_num;i++) { + auto column = cur_chunk->get_column_by_slot_id(arguments_ids[i]); + DCHECK_EQ(column->size(), first_column->size()) << "input arguments size should be same"; + } + } + #endif // @TODO { @@ -254,35 +310,45 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // cut data // if cur_chunk has view_column, we should convert view_column to column again + // 
@TODO can we find common expr from chunk? + for (const auto& [slot_id, _]: chunk->get_slot_id_to_index_map()) { + LOG(INFO) << "chunk contains slot id: " << slot_id; + } + for (const auto& [slot_id, _] : cur_chunk->get_slot_id_to_index_map()) { + LOG(INFO) << "cur_chunk contains slot id: " << slot_id; + } // @TODO cut row [x,y] into a tmp chunk ChunkAccumulator accumulator(DEFAULT_CHUNK_SIZE); + LOG(INFO) << "cur_chunk rows: " << cur_chunk->num_rows(); RETURN_IF_ERROR(accumulator.push(std::move(cur_chunk))); accumulator.finalize(); while (auto tmp_chunk = accumulator.pull()) { // if contains view, should translate it back // TODO change column auto new_chunk = std::make_shared(); - const auto& columns = tmp_chunk->columns(); - for(size_t idx = 0;idx < columns.size();idx++) { - const auto& column = columns[idx]; - if (column->is_array_view()) { - LOG(INFO) << "convert array-view to array, " << column->get_name(); - ASSIGN_OR_RETURN(auto new_column, ArrayViewColumn::to_array_column(column)); - LOG(INFO) << "convert done"; - new_column->check_or_die(); - // auto array_view_column = down_cast(column.get()); - // ASSIGN_OR_RETURN(auto new_column, array_view_column->to_array_column()); - LOG(INFO) << "update column, idx: " << idx; - tmp_chunk->update_column_by_index(new_column, idx); - } - } + // const auto& columns = tmp_chunk->columns(); + LOG(INFO) << "tmp_chunk rows: " << tmp_chunk->num_rows(); + // for(size_t idx = 0;idx < columns.size();idx++) { + // const auto& column = columns[idx]; + // if (column->is_array_view()) { + // LOG(INFO) << "convert array-view to array, " << column->get_name(); + // ASSIGN_OR_RETURN(auto new_column, ArrayViewColumn::to_array_column(column)); + // LOG(INFO) << "convert done"; + // new_column->check_or_die(); + // // auto array_view_column = down_cast(column.get()); + // // ASSIGN_OR_RETURN(auto new_column, array_view_column->to_array_column()); + // LOG(INFO) << "update column, idx: " << idx; + // 
tmp_chunk->update_column_by_index(new_column, idx); + // } + // } tmp_chunk->check_or_die(); - for (const auto& column: tmp_chunk->columns()) { - LOG(INFO) << "column: " << column->get_name(); - DCHECK(!column->is_array_view()) << "unexpected array view"; - } + // for (const auto& column: tmp_chunk->columns()) { + // LOG(INFO) << "column: " << column->get_name(); + // DCHECK(!column->is_array_view()) << "unexpected array view"; + // } ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], tmp_chunk.get())); + tmp_col->check_or_die(); tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], tmp_chunk->num_rows(), true); if (column == nullptr) { column = tmp_col; @@ -291,28 +357,6 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* } } } - - - // @TODO - // evaluate lambda expr? - // if (cur_chunk->num_rows() <= chunk->num_rows() * 8) { - // ASSIGN_OR_RETURN(column, context->evaluate(_children[0], cur_chunk.get())); - // column = ColumnHelper::align_return_type(column, type().children[0], cur_chunk->num_rows(), true); - // } else { // split large chunks into small ones to avoid too large or various batch_size - // ChunkAccumulator accumulator(DEFAULT_CHUNK_SIZE); - // RETURN_IF_ERROR(accumulator.push(std::move(cur_chunk))); - // accumulator.finalize(); - // while (auto tmp_chunk = accumulator.pull()) { - // ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], tmp_chunk.get())); - // tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], tmp_chunk->num_rows(), true); - // if (column == nullptr) { - // column = tmp_col; - // } else { - // column->append(*tmp_col); - // } - // } - // } - // construct the result array DCHECK(column != nullptr); column = ColumnHelper::cast_to_nullable_column(column); @@ -320,14 +364,27 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* } // @TODO handle const? 
+ // @TODO aligned offsets maybe null + // attach offsets auto array_col = std::make_shared( column, ColumnHelper::as_column(aligned_offsets->clone_shared())); - - if (null_column != nullptr) { - return NullableColumn::create(std::move(array_col), null_column); + array_col->check_or_die(); + if (result_null_column != nullptr) { + return NullableColumn::create(std::move(array_col), result_null_column); } return array_col; } +std::string ArrayMapExpr::debug_string() const { + std::stringstream out; + auto expr_debug_string = Expr::debug_string(); + out << "array_map ("; + for (int i = 0;i < _children.size();i++) { + out << (i == 0 ? "": ", ") << _children[i]->debug_string(); + } + out << ")"; + return out.str(); +} + } // namespace starrocks diff --git a/be/src/exprs/array_map_expr.h b/be/src/exprs/array_map_expr.h index 0b954c35b4015..b0d339956ad67 100644 --- a/be/src/exprs/array_map_expr.h +++ b/be/src/exprs/array_map_expr.h @@ -39,7 +39,7 @@ class ArrayMapExpr final : public Expr { Expr* clone(ObjectPool* pool) const override { return pool->add(new ArrayMapExpr(*this)); } StatusOr evaluate_checked(ExprContext* context, Chunk* ptr) override; - + std::string debug_string() const override; private: // use map to make sure the order of execution std::map _outer_common_exprs; diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h index 54bcbce03a294..f8a57a2f72147 100644 --- a/be/src/exprs/expr.h +++ b/be/src/exprs/expr.h @@ -120,6 +120,9 @@ class Expr { bool is_monotonic() const { return _is_monotonic; } bool is_cast_expr() const { return _node_type == TExprNodeType::CAST_EXPR; } + virtual bool is_lambda_function() const { + return false; + } // In most time, this field is passed from FE // Sometimes we want to construct expr on BE implicitly and we have knowledge about `monotonicity` diff --git a/be/src/exprs/lambda_function.cpp b/be/src/exprs/lambda_function.cpp index 1aa74eca14cdd..ca1abc7fcc6e3 100644 --- a/be/src/exprs/lambda_function.cpp +++ 
b/be/src/exprs/lambda_function.cpp @@ -37,8 +37,18 @@ Status LambdaFunction::extract_outer_common_exprs( if (expr->is_slotref()) { return Status::OK(); } + LOG(INFO) << "extract expr: " << expr->debug_string(); + + // @TODO can we remove lambda?? + // any_match(array_map( -> any_match(array_map( -> < 10, 3: arr_largeint)), 3: arr_largeint)) + // -> any_match(array_map( -> , arr_largeint)) + // slot 8: array_map( -> < 10, arr_largeint) + // slot 9: any_match(, arr_largeint) + // @OTOD what if expr is LambdaFunction int child_num = expr->get_num_children(); std::vector slot_ids; + + // @TODO we can't replace lambda? for (int i = 0;i < child_num;i++) { auto child = expr->get_child(i); @@ -51,6 +61,8 @@ Status LambdaFunction::extract_outer_common_exprs( bool is_independent = std::all_of(slot_ids.begin(), slot_ids.end(), [ctx](const SlotId& id) { return ctx->lambda_arguments.find(id) == ctx->lambda_arguments.end(); }); + // + // if (is_independent && !child->is_lambda_function()) { if (is_independent) { SlotId slot_id = ctx->next_slot_id++; ColumnRef* column_ref = state->obj_pool()->add(new ColumnRef(child->type(), slot_id)); @@ -113,17 +125,7 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo LOG(INFO) << "child[" << i << "] = " << get_child(i)->debug_string(); } RETURN_IF_ERROR(collect_lambda_argument_ids()); - // collect the slot ids of lambda arguments - // for (int i = 1; i < child_num; ++i) { - // get_child(i)->get_slot_ids(&_arguments_ids); - // } - // for (const auto& arg_id: _arguments_ids) { - // LOG(INFO) << "lambda arg id: " << arg_id; - // } - // if (child_num - 1 != _arguments_ids.size()) { - // return Status::InternalError(fmt::format("Lambda arguments get ids failed, just get {} ids from {} arguments.", - // _arguments_ids.size(), child_num - 1)); - // } + // sorted common sub expressions so that the later expressions can reference the previous ones. 
for (auto i = child_num; i < child_num + _common_sub_expr_num; ++i) { get_child(i)->get_slot_ids(&_common_sub_expr_ids); @@ -189,6 +191,7 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo } StatusOr LambdaFunction::evaluate_checked(ExprContext* context, Chunk* chunk) { + LOG(INFO) << "evaluate LambdaFunction, " << (void*)this; for (auto i = 0; i < _common_sub_expr.size(); ++i) { auto sub_col = EVALUATE_NULL_IF_ERROR(context, _common_sub_expr[i], chunk); chunk->append_column(sub_col, _common_sub_expr_ids[i]); diff --git a/be/src/exprs/lambda_function.h b/be/src/exprs/lambda_function.h index 6a8ec38a815d4..b9fe5b511c945 100644 --- a/be/src/exprs/lambda_function.h +++ b/be/src/exprs/lambda_function.h @@ -58,6 +58,10 @@ class LambdaFunction final : public Expr { return _arguments_ids.size(); } + bool is_lambda_function() const override { + return true; + } + Expr* get_lambda_expr() const { return _children[0]; } std::string debug_string() const override; From d38b66150b7a614e245b3627bb4db96ae597eb34 Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Thu, 19 Sep 2024 15:23:21 +0800 Subject: [PATCH 04/17] stash: pass common test Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/column/array_column.cpp | 43 ++- be/src/column/array_column.h | 12 +- be/src/column/array_view_column.cpp | 26 +- be/src/column/array_view_column.h | 124 ++----- be/src/column/column.h | 3 +- be/src/column/column_helper.h | 1 - be/src/column/column_visitor.h | 1 - be/src/exec/sorting/compare_column.cpp | 4 +- be/src/exprs/array_functions.cpp | 8 +- be/src/exprs/array_map_expr.cpp | 484 +++++++++++++++++-------- be/src/exprs/array_map_expr.h | 4 +- be/src/exprs/expr.h | 4 +- be/src/exprs/lambda_function.cpp | 46 ++- be/src/exprs/lambda_function.h | 11 +- 14 files changed, 461 insertions(+), 310 deletions(-) diff --git a/be/src/column/array_column.cpp 
b/be/src/column/array_column.cpp index 633c2d1bc04cc..584f3a5ddf6f2 100644 --- a/be/src/column/array_column.cpp +++ b/be/src/column/array_column.cpp @@ -17,10 +17,10 @@ #include #include "column/column_helper.h" -#include "exprs/function_helper.h" #include "column/fixed_length_column.h" #include "column/nullable_column.h" #include "column/vectorized_fwd.h" +#include "exprs/function_helper.h" #include "gutil/bits.h" #include "gutil/casts.h" #include "gutil/strings/fastmem.h" @@ -618,30 +618,44 @@ Status ArrayColumn::unfold_const_children(const starrocks::TypeDescriptor& type) return Status::OK(); } - template -bool ArrayColumn::compare_lengths_from_offsets(const UInt32Column& v1, const UInt32Column& v2, const NullColumnPtr& null_column) { +bool ArrayColumn::compare_lengths_from_offsets(const UInt32Column& v1, const UInt32Column& v2, + const NullColumnPtr& null_column) { [[maybe_unused]] uint8_t* null_data = nullptr; if constexpr (!IgnoreNull) { null_data = null_column->get_data().data(); } - size_t num_rows = v1.size(); + size_t num_rows = v1.size() - 1; + LOG(INFO) << "num_rows: " << num_rows; if constexpr (ConstV1 && ConstV2) { // if both are const column, we only compare the first row once num_rows = 1; } - bool result = true; const auto& offsets_v1 = v1.get_data(); const auto& offsets_v2 = v2.get_data(); - for (size_t i = 0;i < num_rows && result;i++) { - [[maybe_unused]] uint32_t len1 = (ConstV1) ? (offsets_v1[1] - offsets_v1[0]) : (offsets_v1[i + 1] - offsets_v1[i]); - [[maybe_unused]] uint32_t len2 = (ConstV2) ? (offsets_v2[1] - offsets_v2[0]) : (offsets_v2[i + 1] - offsets_v2[i]); + for (size_t i = 0;i < offsets_v1.size();i++) { + LOG(INFO) << "offset v1: " << offsets_v1[i] << ", v2:" << offsets_v2[i]; + } + + + for (size_t i = 0; i < num_rows && result; i++) { + [[maybe_unused]] uint32_t len1 = + (ConstV1) ? (offsets_v1[1] - offsets_v1[0]) : (offsets_v1[i + 1] - offsets_v1[i]); + [[maybe_unused]] uint32_t len2 = + (ConstV2) ? 
(offsets_v2[1] - offsets_v2[0]) : (offsets_v2[i + 1] - offsets_v2[i]); if constexpr (IgnoreNull) { + if (len1 != len2) { + LOG(INFO) << "array len mismatch, v1: " << len1 << ", v2: " << len2 << ", idx: " << i; + } result &= (len1 == len2); } else { + LOG(INFO) << "check idx: " << i << ", null: " << static_cast(null_data[i]); if (!null_data[i]) { + if (len1 != len2) { + LOG(INFO) << "array len mismatch, v1: " << len1 << ", v2: " << len2 << ", idx: " << i; + } result &= (len1 == len2); } } @@ -649,13 +663,14 @@ bool ArrayColumn::compare_lengths_from_offsets(const UInt32Column& v1, const UIn return result; } - template -bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_column) { +bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, + const NullColumnPtr& null_column) { DCHECK(v1->is_array() && v2->is_array()); DCHECK(!v1->is_nullable() && !v2->is_nullable()); if (v1->size() != v2->size()) { + LOG(INFO) << "size not equal, v1: " << v1->size() << ", v2: " << v2->size(); return false; } auto data_v1 = FunctionHelper::get_data_column_of_const(v1); @@ -664,6 +679,7 @@ bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPt auto* array_v2 = down_cast(data_v2.get()); const auto& offsets_v1 = array_v1->offsets(); const auto& offsets_v2 = array_v2->offsets(); + LOG(INFO) << "v1 size: " << v1->size() << ", v2 size: " << v2->size() << ", offset v1: " << offsets_v1.size() << ", offset v2: " << offsets_v2.size(); if (v1->is_constant() && v2->is_constant()) { return compare_lengths_from_offsets(offsets_v1, offsets_v2, null_column); } else if (v1->is_constant() && !v2->is_constant()) { @@ -675,8 +691,9 @@ bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPt return compare_lengths_from_offsets(offsets_v1, offsets_v2, null_column); } -template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const 
NullColumnPtr& null_data); -template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_data); - +template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, + const NullColumnPtr& null_data); +template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, + const NullColumnPtr& null_data); } // namespace starrocks diff --git a/be/src/column/array_column.h b/be/src/column/array_column.h index 15a15532fea53..c95796f1cd675 100644 --- a/be/src/column/array_column.h +++ b/be/src/column/array_column.h @@ -145,7 +145,7 @@ class ArrayColumn final : public ColumnFactory { void put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool is_binary_protocol = false) const override; - std::string get_name() const override { return "array-"+ _elements->get_name(); } + std::string get_name() const override { return "array-" + _elements->get_name(); } Datum get(size_t idx) const override; @@ -196,7 +196,6 @@ class ArrayColumn final : public ColumnFactory { Status unfold_const_children(const starrocks::TypeDescriptor& type) override; - // check if all of arrays' size is equal // v1 and v2 must be one of ArrayColumn or Const(ArrayColumn) template @@ -204,7 +203,8 @@ class ArrayColumn final : public ColumnFactory { private: template - static bool compare_lengths_from_offsets(const UInt32Column& v1, const UInt32Column& v2, const NullColumnPtr& null_data); + static bool compare_lengths_from_offsets(const UInt32Column& v1, const UInt32Column& v2, + const NullColumnPtr& null_data); // Elements must be NullableColumn to facilitate handling nested types. 
ColumnPtr _elements; @@ -215,7 +215,9 @@ class ArrayColumn final : public ColumnFactory { UInt32Column::Ptr _offsets; }; -extern template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_data); -extern template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_data); +extern template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, + const NullColumnPtr& null_data); +extern template bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, + const NullColumnPtr& null_data); } // namespace starrocks diff --git a/be/src/column/array_view_column.cpp b/be/src/column/array_view_column.cpp index f13b1aa8833ed..45259efa70f2b 100644 --- a/be/src/column/array_view_column.cpp +++ b/be/src/column/array_view_column.cpp @@ -13,9 +13,11 @@ // limitations under the License. #include "column/array_view_column.h" + #include #include #include + #include "column/array_column.h" #include "column/chunk.h" #include "column/vectorized_fwd.h" @@ -26,13 +28,13 @@ namespace starrocks { ColumnPtr ArrayViewColumn::replicate(const Buffer& offsets) { // @TODO clone empty??? // auto dest = this->clone_empty(); - auto dest_size = offsets.size() - 1; + auto dest_size = offsets.size() - 1; auto new_offsets = UInt32Column::create(); auto new_lengths = UInt32Column::create(); new_offsets->reserve(offsets.back()); new_lengths->reserve(offsets.back()); - for (size_t i = 0;i < dest_size;i++) { + for (size_t i = 0; i < dest_size; i++) { uint32_t repeat_times = offsets[i + 1] - offsets[i]; new_offsets->append_value_multiple_times(*_offsets, i, repeat_times); new_lengths->append_value_multiple_times(*_lengths, i, repeat_times); @@ -56,7 +58,7 @@ void ArrayViewColumn::append(const Column& src, size_t offset, size_t count) { // @TODO should optimize // @TODO should avoid this copy... 
uint32_t offset = _elements->size(); - for (size_t i = 0;i < count;i++) { + for (size_t i = 0; i < count; i++) { uint32_t src_offset = src_offsets.get_data()[offset + i]; uint32_t src_length = src_lengths.get_data()[offset + i]; DCHECK_LE(src_offset + src_length, array_view_column._elements->size()); @@ -72,7 +74,7 @@ void ArrayViewColumn::check_or_die() const { DCHECK(_offsets); DCHECK(_lengths); DCHECK_EQ(_offsets->size(), _lengths->size()); - for (size_t i = 0;i < _offsets->size();i++) { + for (size_t i = 0; i < _offsets->size(); i++) { uint32_t offset = _offsets->get_data()[i]; uint32_t length = _lengths->get_data()[i]; DCHECK_LE(offset + length, _elements->size()); @@ -96,7 +98,7 @@ StatusOr ArrayViewColumn::to_array_column() const { uint32_t last_offset = 0; size_t num_rows = _offsets->size(); // @TODO maybe copy alot... - for (size_t i = 0;i < num_rows;i++) { + for (size_t i = 0; i < num_rows; i++) { uint32_t offset = _offsets->get_data()[i]; uint32_t length = _lengths->get_data()[i]; LOG(INFO) << "offset: " << offset << ", len: " << length; @@ -108,7 +110,6 @@ StatusOr ArrayViewColumn::to_array_column() const { return ArrayColumn::create(std::move(array_elements), std::move(array_offsets)); } - StatusOr ArrayViewColumn::from_array_column(const ColumnPtr& column) { if (!column->is_array()) { LOG(INFO) << "from_array_column error..."; @@ -142,23 +143,24 @@ StatusOr ArrayViewColumn::from_array_column(const ColumnPtr& column) // elements column: [1,2,3,4] // offsets column: [0,2,2,3] // length column: [2,0,0,1] - for (size_t i = 0;i < column->size(); i ++) { + for (size_t i = 0; i < column->size(); i++) { uint32_t offset = array_offsets[i]; - uint32_t length = null_data[i] ? 0: (array_offsets[i + 1] - offset); + uint32_t length = null_data[i] ? 
0 : (array_offsets[i + 1] - offset); LOG(INFO) << "append offset: " << offset << ", length: " << length; view_offsets->append(offset); view_lengths->append(length); } - auto ret = NullableColumn::create(ArrayViewColumn::create(view_elements, view_offsets, view_lengths), nullable_column->null_column()); + auto ret = NullableColumn::create(ArrayViewColumn::create(view_elements, view_offsets, view_lengths), + nullable_column->null_column()); ret->check_or_die(); return ret; - } + } auto array_column = down_cast(column.get()); view_elements = array_column->elements_column(); const auto& array_offsets = array_column->offsets().get_data(); - for (size_t i = 0;i < column->size();i++) { + for (size_t i = 0; i < column->size(); i++) { uint32_t offset = array_offsets[i]; uint32_t length = array_offsets[i + 1] - offset; view_offsets->append(offset); @@ -184,4 +186,4 @@ StatusOr ArrayViewColumn::to_array_column(const ColumnPtr& column) { auto array_view_column = down_cast(column.get()); return array_view_column->to_array_column(); } -} \ No newline at end of file +} // namespace starrocks \ No newline at end of file diff --git a/be/src/column/array_view_column.h b/be/src/column/array_view_column.h index ce5f10099ac41..e185faf37e2f9 100644 --- a/be/src/column/array_view_column.h +++ b/be/src/column/array_view_column.h @@ -24,22 +24,24 @@ namespace starrocks { -class ArrayViewColumn final: public ColumnFactory { +class ArrayViewColumn final : public ColumnFactory { friend class ColumnFactory; public: using ValueType = void; // @TODO need array view? 
- ArrayViewColumn(ColumnPtr elements, UInt32Column::Ptr offsets, UInt32Column::Ptr lengths): - _elements(std::move(elements)), _offsets(std::move(offsets)), _lengths(std::move(lengths)) {} + ArrayViewColumn(ColumnPtr elements, UInt32Column::Ptr offsets, UInt32Column::Ptr lengths) + : _elements(std::move(elements)), _offsets(std::move(offsets)), _lengths(std::move(lengths)) {} ArrayViewColumn(const ArrayViewColumn& rhs) - : _elements(rhs._elements), - _offsets(std::static_pointer_cast(rhs._offsets->clone_shared())), - _lengths(std::static_pointer_cast(rhs._lengths->clone_shared())) {} + : _elements(rhs._elements), + _offsets(std::static_pointer_cast(rhs._offsets->clone_shared())), + _lengths(std::static_pointer_cast(rhs._lengths->clone_shared())) {} - ArrayViewColumn(ArrayViewColumn&& rhs) noexcept: - _elements(std::move(rhs._elements)), _offsets(std::move(rhs._offsets)), _lengths(std::move(rhs._lengths)) {} + ArrayViewColumn(ArrayViewColumn&& rhs) noexcept + : _elements(std::move(rhs._elements)), + _offsets(std::move(rhs._offsets)), + _lengths(std::move(rhs._lengths)) {} ArrayViewColumn& operator=(const ArrayViewColumn& rhs) { // @TODO @@ -52,11 +54,9 @@ class ArrayViewColumn final: public ColumnFactory { ~ArrayViewColumn() override = default; - bool is_array_view() const override { - return true; - } + bool is_array_view() const override { return true; } - const uint8_t* raw_data() const override{ + const uint8_t* raw_data() const override { DCHECK(false) << "ArrayViewColumn::raw_data() is not supported"; return nullptr; } @@ -66,12 +66,8 @@ class ArrayViewColumn final: public ColumnFactory { return nullptr; } - size_t size() const override { - return _offsets->size(); - } - size_t capacity() const override { - return _offsets->capacity() + _lengths->capacity(); - } + size_t size() const override { return _offsets->size(); } + size_t capacity() const override { return _offsets->capacity() + _lengths->capacity(); } size_t type_size() const override { // @TODO need 
a array view type @@ -96,7 +92,6 @@ class ArrayViewColumn final: public ColumnFactory { _elements->reserve(n); _offsets->reserve(n); _lengths->reserve(n); - } void resize(size_t n) override { // DCHECK(false) << "ArrayViewColumn::resize() is not supported"; @@ -135,9 +130,7 @@ class ArrayViewColumn final: public ColumnFactory { DCHECK(false) << "ArrayViewColumn::append_value_multiple_times() is not supported"; } - void append_default() override { - DCHECK(false) << "ArrayViewColumn::append_default() is not supported"; - } + void append_default() override { DCHECK(false) << "ArrayViewColumn::append_default() is not supported"; } void append_default(size_t count) override { DCHECK(false) << "ArrayViewColumn::append_default() is not supported"; } @@ -163,7 +156,8 @@ class ArrayViewColumn final: public ColumnFactory { return 0; } - void serialize_batch(uint8_t* dst, Buffer& slice_sized, size_t chunk_size, uint32_t max_one_row_size) override { + void serialize_batch(uint8_t* dst, Buffer& slice_sized, size_t chunk_size, + uint32_t max_one_row_size) override { DCHECK(false); } const uint8_t* deserialize_and_append(const uint8_t* pos) override { @@ -175,9 +169,7 @@ class ArrayViewColumn final: public ColumnFactory { DCHECK(false); return 0; } - void deserialize_and_append_batch(Buffer& srcs, size_t chunk_size) override { - DCHECK(false); - } + void deserialize_and_append_batch(Buffer& srcs, size_t chunk_size) override { DCHECK(false); } MutableColumnPtr clone_empty() const override; @@ -191,54 +183,30 @@ class ArrayViewColumn final: public ColumnFactory { return 0; } - void compare_column(const Column& rhs, std::vector* output) const { + void compare_column(const Column& rhs, std::vector* output) const {} - } + int equals(size_t left, const Column& rhs, size_t right, bool safe_eq = true) const override { return 0; } - int equals(size_t left, const Column& rhs, size_t right, bool safe_eq = true) const override { - return 0; - } + void crc32_hash_at(uint32_t* seed, uint32_t 
idx) const override {} + void fnv_hash_at(uint32_t* seed, uint32_t idx) const override {} + void fnv_hash(uint32_t* hash, uint32_t from, uint32_t to) const override {} - void crc32_hash_at(uint32_t *seed, uint32_t idx) const override { + void crc32_hash(uint32_t* hash, uint32_t from, uint32_t to) const override {} - } - void fnv_hash_at(uint32_t* seed, uint32_t idx) const override { - - } - void fnv_hash(uint32_t* hash, uint32_t from, uint32_t to) const override { - - } - - void crc32_hash(uint32_t* hash, uint32_t from, uint32_t to) const override { - - } - - int64_t xor_checksum(uint32_t from, uint32_t to) const override { - return 0; - } + int64_t xor_checksum(uint32_t from, uint32_t to) const override { return 0; } - void put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool is_binary_protocol = false) const override { - - } + void put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool is_binary_protocol = false) const override {} ColumnPtr replicate(const Buffer& offsets) override; std::string get_name() const override { return "array-view"; } - Datum get(size_t idx) const override { - return Datum(); - } + Datum get(size_t idx) const override { return Datum(); } - size_t get_element_null_count(size_t idx) const { - return 0; - } - size_t get_element_size(size_t idx) const { - return 0; - } + size_t get_element_null_count(size_t idx) const { return 0; } + size_t get_element_size(size_t idx) const { return 0; } - bool set_null(size_t idx) override { - return false; - } + bool set_null(size_t idx) override { return false; } size_t memory_usage() const override { return _elements->memory_usage() + _offsets->memory_usage(); } @@ -246,9 +214,7 @@ class ArrayViewColumn final: public ColumnFactory { return _elements->container_memory_usage() + _offsets->container_memory_usage(); } - size_t reference_memory_usage(size_t from, size_t size) const override { - return 0; - } + size_t reference_memory_usage(size_t from, size_t size) const override { return 0; } 
void swap_column(Column& rhs) override {} @@ -260,42 +226,30 @@ class ArrayViewColumn final: public ColumnFactory { const UInt32Column& offsets() const { return *_offsets; } UInt32Column::Ptr& offsets_column() { return _offsets; } - const UInt32Column& lengths() const { - return *_lengths; - } + const UInt32Column& lengths() const { return *_lengths; } UInt32Column::Ptr& lengths_column() { return _lengths; } bool is_nullable() const override { return false; } - std::string debug_item(size_t idx) const override { - return ""; - } + std::string debug_item(size_t idx) const override { return ""; } - std::string debug_string() const override { - return "array-view-column"; - } + std::string debug_string() const override { return "array-view-column"; } Status capacity_limit_reached() const override { RETURN_IF_ERROR(_elements->capacity_limit_reached()); return _offsets->capacity_limit_reached(); } - StatusOr upgrade_if_overflow() override { - return nullptr; - } + StatusOr upgrade_if_overflow() override { return nullptr; } - StatusOr downgrade() override { - return nullptr; - } + StatusOr downgrade() override { return nullptr; } bool has_large_column() const override { return _elements->has_large_column(); } void check_or_die() const override; - Status unfold_const_children(const starrocks::TypeDescriptor& type) override { - return Status::NotSupported("TBD"); - } - + Status unfold_const_children(const starrocks::TypeDescriptor& type) override { return Status::NotSupported("TBD"); } + // build array_view column from array_column, how to solve null?? 
// if array_column is nullable, return Nullable(ArrayViewColumn) // else return ArrayViewColumn @@ -310,4 +264,4 @@ class ArrayViewColumn final: public ColumnFactory { UInt32Column::Ptr _offsets; UInt32Column::Ptr _lengths; }; -} \ No newline at end of file +} // namespace starrocks \ No newline at end of file diff --git a/be/src/column/column.h b/be/src/column/column.h index 979165a98baed..6d653ad0022ce 100644 --- a/be/src/column/column.h +++ b/be/src/column/column.h @@ -187,13 +187,12 @@ class Column { auto dest_size = offsets.size() - 1; DCHECK(this->size() >= dest_size) << "The size of the source column is less when aligning offsets."; dest->reserve(offsets.back()); - for (size_t i = 0;i < dest_size;i++) { + for (size_t i = 0; i < dest_size; i++) { // first value is itself, others append default dest->append_value_multiple_times(*this, i, 1); if (offsets[i + 1] - offsets[i] > 1) { dest->append_default(offsets[i + 1] - offsets[i] - 1); } - } return dest; } diff --git a/be/src/column/column_helper.h b/be/src/column/column_helper.h index e3f35b66a5fe7..94cd37889ec73 100644 --- a/be/src/column/column_helper.h +++ b/be/src/column/column_helper.h @@ -124,7 +124,6 @@ class ColumnHelper { return column; } - static inline bool offsets_equal(const UInt32Column::Ptr& offset0, const UInt32Column::Ptr& offset1) { if (offset0->size() != offset1->size()) { return false; diff --git a/be/src/column/column_visitor.h b/be/src/column/column_visitor.h index 7a400c3a916dc..eb86bfa781627 100644 --- a/be/src/column/column_visitor.h +++ b/be/src/column/column_visitor.h @@ -81,7 +81,6 @@ class ColumnVisitor { virtual Status visit(const FixedLengthColumnBase& column); virtual Status visit(const ObjectColumn& column); virtual Status visit(const ArrayViewColumn& column); - }; } // namespace starrocks diff --git a/be/src/exec/sorting/compare_column.cpp b/be/src/exec/sorting/compare_column.cpp index fef976df8588b..4d7e309529c5e 100644 --- a/be/src/exec/sorting/compare_column.cpp +++ 
b/be/src/exec/sorting/compare_column.cpp @@ -317,9 +317,7 @@ class ColumnTieBuilder final : public ColumnVisitorAdapter { Status do_visit(const ObjectColumn& column) { return Status::NotSupported("not support"); } - Status do_visit(const ArrayViewColumn& column) { - return Status::NotSupported("Not support"); - } + Status do_visit(const ArrayViewColumn& column) { return Status::NotSupported("Not support"); } private: const ColumnPtr _column; diff --git a/be/src/exprs/array_functions.cpp b/be/src/exprs/array_functions.cpp index cf24dde409b40..a3eaf6d04b407 100644 --- a/be/src/exprs/array_functions.cpp +++ b/be/src/exprs/array_functions.cpp @@ -17,11 +17,11 @@ #include #include "column/array_column.h" +#include "column/array_view_column.h" #include "column/column_hash.h" #include "column/map_column.h" #include "column/struct_column.h" #include "column/type_traits.h" -#include "column/array_view_column.h" #include "common/statusor.h" #include "simd/simd.h" #include "util/raw_container.h" @@ -1098,6 +1098,7 @@ StatusOr ArrayFunctions::all_match(FunctionContext* context, const Co } StatusOr ArrayFunctions::any_match(FunctionContext* context, const Columns& columns) { + LOG(INFO) << "evaluate any_match"; return ArrayMatch::process(context, columns); } @@ -1108,8 +1109,9 @@ StatusOr ArrayFunctions::concat(FunctionContext* ctx, const Columns& auto num_rows = columns[0]->size(); LOG(INFO) << "array_concat, num_rows: " << num_rows; - for (auto& column: columns) { - LOG(INFO) << "column size: " << column->size() << ", is_const: " << column->is_constant() << ", is_nullable: " << column->is_nullable(); + for (auto& column : columns) { + LOG(INFO) << "column size: " << column->size() << ", is_const: " << column->is_constant() + << ", is_nullable: " << column->is_nullable(); } // compute nulls NullColumnPtr nulls; diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index c87bc5e3593a5..71e8f326f7197 100644 --- a/be/src/exprs/array_map_expr.cpp +++ 
b/be/src/exprs/array_map_expr.cpp @@ -15,14 +15,17 @@ #include "exprs/array_map_expr.h" #include + #include +#include #include "column/array_column.h" +#include "column/array_view_column.h" #include "column/chunk.h" #include "column/column_helper.h" #include "column/const_column.h" #include "column/fixed_length_column.h" -#include "column/array_view_column.h" +#include "column/nullable_column.h" #include "column/vectorized_fwd.h" #include "common/constexpr.h" #include "common/statusor.h" @@ -40,7 +43,7 @@ ArrayMapExpr::ArrayMapExpr(const TExprNode& node) : Expr(node, false) {} ArrayMapExpr::ArrayMapExpr(TypeDescriptor type) : Expr(std::move(type), false) {} Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { - for (int i = 1;i < _children.size(); ++i) { + for (int i = 1; i < _children.size(); ++i) { RETURN_IF_ERROR(_children[i]->prepare(state, context)); } // if child 0 is not lambda, what will happen whe nevaluate @@ -57,8 +60,8 @@ Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { RETURN_IF_ERROR(lambda_expr->extract_outer_common_exprs(state, &extract_ctx)); _outer_common_exprs.swap(extract_ctx.outer_common_exprs); - for (auto [_, expr]: _outer_common_exprs) { - // @TODO + for (auto [_, expr] : _outer_common_exprs) { + // @TODO LOG(INFO) << "prepare common expr: " << expr->debug_string(); // @TODO if after rewrite, first expr of array_map become column ref, we can remove it? RETURN_IF_ERROR(expr->prepare(state, context)); @@ -82,7 +85,7 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* ColumnPtr input_array_ptr_ref = nullptr; // hold shared_ptr to avoid early deleted. // ColumnPtr aligned_offsets; - UInt32Column::Ptr aligned_offsets; + UInt32Column::Ptr aligned_offsets = nullptr; // @TODO we should eval common expr first // maybe a NullColumn or a Const(NullColumn) @@ -123,8 +126,9 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // @TODO we can check it before?? 
// this will re-build data column, replace null element to empty array - data_column->empty_null_in_complex_column(nullable_column->null_column()->get_data(), - down_cast(data_column.get())->offsets().get_data()); + // data_column->empty_null_in_complex_column( + // nullable_column->null_column()->get_data(), + // down_cast(data_column.get())->offsets().get_data()); auto null_column = nullable_column->null_column(); if (is_const) { @@ -137,13 +141,24 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* if (result_null_column) { is_single_nullable_child = false; // union two null column - LOG(INFO) << "union result_null_column, size: " << result_null_column->size() << ", null size: " << null_column->size(); + LOG(INFO) << "union result_null_column, size: " << result_null_column->size() + << ", null size: " << null_column->size(); result_null_column = FunctionHelper::union_null_column(null_column, result_null_column); LOG(INFO) << "union done: " << result_null_column->size(); + std::ostringstream oss; + for (auto null_data: result_null_column->get_data()) { + oss << static_cast(null_data) << ","; + } + LOG(INFO) << "null data: " << oss.str(); } else { is_single_nullable_child = true; result_null_column = null_column; LOG(INFO) << "assign result_null_column, size: " << null_column->size(); + std::ostringstream oss; + for (auto null_data: result_null_column->get_data()) { + oss << static_cast(null_data) << ","; + } + LOG(INFO) << "null data: " << oss.str(); } } DCHECK(data_column->is_array() && !data_column->is_nullable()); @@ -158,14 +173,17 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* if (!input_elements.empty()) { const auto& first_input = input_elements[0]; - bool is_array_lengths_valid = result_null_column ? 
- ArrayColumn::is_all_array_lengths_equal(first_input, column, result_null_column): - ArrayColumn::is_all_array_lengths_equal(first_input, column, result_null_column); + bool is_array_lengths_valid = + result_null_column + ? ArrayColumn::is_all_array_lengths_equal(first_input, column, result_null_column) + : ArrayColumn::is_all_array_lengths_equal(first_input, column, result_null_column); if (!is_array_lengths_valid) { + LOG(INFO) << "chunk num: " << chunk->num_rows() << ", current idx: " << i << ", first input size: " << first_input->size() + << ", column size: " << column->size() << ", null size: " << result_null_column->size(); return Status::InternalError("Input array element's size is not equal in array_map()."); } } - + input_elements.emplace_back(column); } @@ -177,24 +195,41 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* } ColumnPtr column = nullptr; - size_t null_rows = result_null_column ? SIMD::count_nonzero(result_null_column->get_data()): 0; + size_t null_rows = result_null_column ? 
SIMD::count_nonzero(result_null_column->get_data()) : 0; if (null_rows == input_elements[0]->size()) { // all input is null + // @TODO we can give a Const(Nullable(ArrayColumn)) + LOG(INFO) << "all input is null, null_rows: " << null_rows << ", input_elements[0]->size(): " << input_elements[0]->size(); + // @TODO create an array column with all null column = ColumnHelper::create_column(type().children[0], true); // array->elements must be of return array->elements' type + column->append_default(1); + // @TODO handle aligned offsets + aligned_offsets = UInt32Column::create(0); + aligned_offsets->append(0); + aligned_offsets->append(1); + auto array_col = std::make_shared(column, aligned_offsets); + array_col->check_or_die(); + LOG(INFO) << "array_col size: " << array_col->size(); + result_null_column->resize(1); + auto result = ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), chunk->num_rows()); + result->check_or_die(); + return result; + // @TODO shoulw give + // aligned_offsets->append(0); } else { // construct a new chunk to evaluate the lambda expression. auto cur_chunk = std::make_shared(); // 1. 
evaluate all outer common exprs LOG(INFO) << "eval outer common exprs, size: " << _outer_common_exprs.size(); - for (const auto& [column_ref, expr]: _outer_common_exprs) { - auto slot_id = down_cast(column_ref)->slot_id(); - LOG(INFO) << "eval non-capture expr: " << slot_id; + for (const auto& [slot_id, expr] : _outer_common_exprs) { + LOG(INFO) << "eval non-capture expr, slot_id: " << slot_id << ", expr: " << expr->debug_string(); ASSIGN_OR_RETURN(auto col, context->evaluate(expr, chunk)); LOG(INFO) << "col size: " << col->size(); chunk->append_column(col, slot_id); } + LOG(INFO) << "eval outer common exprs done"; auto lambda_func = dynamic_cast(_children[0]); std::vector slot_ids; @@ -209,163 +244,296 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* "The size of the captured column {} is less than array's size.", captured_column->get_name())); } } + // if lambda expr is independet, we can treat is as a const column, skip repliacate + if (lambda_func->is_lambda_expr_independent()) { + LOG(INFO) << "lambda expr is independent, we can skip replicate"; + // we can eval lambda expr and return a const column + // @TODO consider const + // @TODO skip + std::vector arguments_ids; + int argument_num = lambda_func->get_lambda_arguments_ids(&arguments_ids); + DCHECK(argument_num == input_elements.size()); + for (int i = 0; i < argument_num; ++i) { + auto data_column = FunctionHelper::get_data_column_of_const(input_elements[i]); + auto array_column = down_cast(data_column.get()); + auto elements_column = array_column->elements_column(); + UInt32Column::Ptr offsets_column = array_column->offsets_column(); + + if (input_elements[i]->is_constant()) { + // if input is const, we should assign data multiple times + // seems we cant avoid copy data if we don't have view column? 
+ // if input is const, we should wrap its element column as a const column too + // @TODO elements should not be a const column + size_t elements_num = array_column->get_element_size(0); + elements_column = elements_column->clone(); + // create a new offsets + // offsets_column = UInt32Column::create(); + offsets_column = UInt32Column::create(); + // replicate N time and ignore null + size_t repeat_times = input_elements[i]->size() - null_rows; + offsets_column->append(0); + size_t offset = elements_num; + for (size_t i = 0; i < repeat_times; i++) { + elements_column->append(*elements_column, 0, elements_num); + offset += elements_num; + offsets_column->append(offset); + } + } else { + // @TODO null data size is ok, only one row, why offsets has too many data? - // 3. align up all columns offsets - // if most value is null, we remove all null column, create a new one to evaluate - // else alignup offset - // @TODO we can't avoid copy data here?? - // should we replicate capture column??? - // empty all null is ok - - // @TODO if all input is const, we don't need unpack const - if (all_input_is_constant) { - // if all input arguments are ConstColumn, we don't need unpack, just evaluate on ConstColumn - LOG(INFO) << "all inputs of array_map are ConstColumn"; - - } - // @TODO udpate aligned_offsets, we can use arg0's offsets? - - std::vector arguments_ids; - int argument_num = lambda_func->get_lambda_arguments_ids(&arguments_ids); - DCHECK(argument_num == input_elements.size()); - for (int i = 0; i < argument_num; ++i) { - auto data_column = FunctionHelper::get_data_column_of_const(input_elements[i]); - auto array_column = down_cast(data_column.get()); - auto elements_column = array_column->elements_column(); - UInt32Column::Ptr offsets_column = array_column->offsets_column(); - - if (input_elements[i]->is_constant()) { - // if input is const, we should assign data multiple times - // seems we cant avoid copy data if we don't have view column? 
- // if input is const, we should wrap its element column as a const column too - // @TODO elements should not be a const column - size_t elements_num = array_column->get_element_size(0); - elements_column = elements_column->clone(); - // create a new offsets - // offsets_column = UInt32Column::create(); - offsets_column = UInt32Column::create(); - // replicate N time and ignore null - size_t repeat_times = input_elements[i]->size() - null_rows; - offsets_column->append(0); - size_t offset = elements_num; - for (size_t i = 0;i < repeat_times;i++) { - elements_column->append(*elements_column, 0, elements_num); - offset += elements_num; - offsets_column->append(offset); + // @TODO empty_null should apply on array column.. + // elements_column->empty_null_in_complex_column(result_null_column->get_data(), array_column->offsets().get_data()); + data_column->empty_null_in_complex_column(result_null_column->get_data(), + array_column->offsets().get_data()); + elements_column = down_cast(data_column.get())->elements_column(); + } + if (aligned_offsets == nullptr) { + LOG(INFO) << "assign offsets: " << offsets_column->size(); + aligned_offsets = offsets_column; } + //append elemt + // cur_chunk->append_column(elements_column, arguments_ids[i]); + LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; + } + DCHECK(aligned_offsets != nullptr); + LOG(INFO) << "begin append outer common column, num: " << _outer_common_exprs.size(); + for (const auto& [slot_id, expr] : _outer_common_exprs) { + auto column = chunk->get_column_by_slot_id(slot_id); + LOG(INFO) << "unpack const column: " << column->get_name() << ", size: " << column->size(); + column = ColumnHelper::unpack_and_duplicate_const_column(column->size(), column); + // replicate column and put int into cur_chunk + // @TODO what if column is const? + // @TODO this should be in cur_chunk and chunk? + LOG(INFO) << "replicate column"; + // @TODO how to avoid replicate... 
+ // @TODO replicate may cost a lot of memory, can we evalute directly to avoid replicate? + // auto aligned_column = column->replicate(aligned_offsets->get_data()); + auto aligned_column = column; + LOG(INFO) << "append outer common column: " << slot_id; + cur_chunk->append_column(aligned_column, slot_id); + // chunk->append_column(col, slot_id); + } + LOG(INFO) << "begin append capture column, num: " << slot_ids.size(); + for (auto slot_id : slot_ids) { + DCHECK(slot_id > 0); + if (cur_chunk->is_slot_exist(slot_id)) { + continue; + } + auto captured_column = chunk->get_column_by_slot_id(slot_id); + // auto aligned_column = captured_column->replicate(aligned_offsets->get_data()); + auto aligned_column = captured_column; + cur_chunk->append_column(aligned_column, slot_id); + LOG(INFO) << "append capture column, " << slot_id; + } + // @TODO + // @TODO we should eval first, get column, then replicate it? + + // eval lambda + // @TODO since lambda not depend on argument, after eval all comon expr, we can get result, wrap it as a const column + LOG(INFO) << "eval lambda: " << _children[0]->debug_string(); + ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], cur_chunk.get())); + tmp_col->check_or_die(); + // tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], cur_chunk->num_rows(), true); + LOG(INFO) << "replicate result, result size: " << tmp_col->size() << ", align offset size: " << aligned_offsets->get_data().back() << ", offset num:" << aligned_offsets->size(); + column = tmp_col->replicate(aligned_offsets->get_data()); + LOG(INFO) << "column size: " << column->size(); + column = ColumnHelper::align_return_type(column, type().children[0], column->size(), true); + column = ColumnHelper::cast_to_nullable_column(column); + } else { + // 3. align up all columns offsets + // if most value is null, we remove all null column, create a new one to evaluate + // else alignup offset + // @TODO we can't avoid copy data here?? 
+ // should we replicate capture column??? + // empty all null is ok + + // @TODO if all input is const, we don't need unpack const + if (all_input_is_constant) { + // if all input arguments are ConstColumn, we don't need unpack, just evaluate on ConstColumn + LOG(INFO) << "all inputs of array_map are ConstColumn"; + // just eval, no unpack ,no xx... + } + // @TODO udpate aligned_offsets, we can use arg0's offsets? + + std::vector arguments_ids; + int argument_num = lambda_func->get_lambda_arguments_ids(&arguments_ids); + DCHECK(argument_num == input_elements.size()); + for (int i = 0; i < argument_num; ++i) { + auto data_column = FunctionHelper::get_data_column_of_const(input_elements[i]); + auto array_column = down_cast(data_column.get()); + auto elements_column = array_column->elements_column(); + UInt32Column::Ptr offsets_column = array_column->offsets_column(); + + if (input_elements[i]->is_constant()) { + // if input is const, we should assign data multiple times + // seems we cant avoid copy data if we don't have view column? + // if input is const, we should wrap its element column as a const column too + // @TODO elements should not be a const column + if (all_input_is_constant) { + LOG(INFO) << "all input is const, we just keep const"; + } else { + size_t elements_num = array_column->get_element_size(0); + elements_column = elements_column->clone(); + // create a new offsets + // offsets_column = UInt32Column::create(); + offsets_column = UInt32Column::create(); + // replicate N time and ignore null + size_t repeat_times = input_elements[i]->size() - null_rows; + offsets_column->append(0); + size_t offset = elements_num; + for (size_t i = 0; i < repeat_times; i++) { + elements_column->append(*elements_column, 0, elements_num); + offset += elements_num; + offsets_column->append(offset); + } + } - } else { - // @TODO null data size is ok, only one row, why offsets has too many data? 
+ } else { + // @TODO null data size is ok, only one row, why offsets has too many data? - // @TODO empty_null should apply on array column.. - // elements_column->empty_null_in_complex_column(result_null_column->get_data(), array_column->offsets().get_data()); - data_column->empty_null_in_complex_column(result_null_column->get_data(), array_column->offsets().get_data()); - elements_column = down_cast(data_column.get())->elements_column(); + // @TODO empty_null should apply on array column.. + // elements_column->empty_null_in_complex_column(result_null_column->get_data(), array_column->offsets().get_data()); + data_column->empty_null_in_complex_column(result_null_column->get_data(), + array_column->offsets().get_data()); + elements_column = down_cast(data_column.get())->elements_column(); + } + // @TODO consider all const case + if (aligned_offsets == nullptr) { + LOG(INFO) << "assign offsets: " << offsets_column->size(); + aligned_offsets = offsets_column; + } + //append elemt + cur_chunk->append_column(elements_column, arguments_ids[i]); + // LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; } - if (aligned_offsets == nullptr) { - aligned_offsets = offsets_column; + // @TODO put outer common expr into cur_chunk, + DCHECK(aligned_offsets != nullptr); + + // if capture column is empty + // align offset + LOG(INFO) << "begin append outer common column, num: " << _outer_common_exprs.size(); + for (const auto& [slot_id, expr] : _outer_common_exprs) { + auto column = chunk->get_column_by_slot_id(slot_id); + LOG(INFO) << "unpack const column: " << column->get_name() << ", size: " << column->size(); + column = ColumnHelper::unpack_and_duplicate_const_column(column->size(), column); + // replicate column and put int into cur_chunk + // @TODO what if column is const? + // @TODO this should be in cur_chunk and chunk? + LOG(INFO) << "replicate column"; + // @TODO how to avoid replicate... 
+ // @TODO replicate may cost a lot of memory, can we evalute directly to avoid replicate? + auto aligned_column = column->replicate(aligned_offsets->get_data()); + LOG(INFO) << "append outer common column: " << slot_id; + cur_chunk->append_column(aligned_column, slot_id); + // chunk->append_column(col, slot_id); } - //append elemt - cur_chunk->append_column(elements_column, arguments_ids[i]); - LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; - } - // @TODO put outer common expr into cur_chunk, - // align offset - for (const auto& [column_ref, expr]: _outer_common_exprs) { - auto slot_id = down_cast(column_ref)->slot_id(); - auto column = chunk->get_column_by_slot_id(slot_id); - column = ColumnHelper::unpack_and_duplicate_const_column(column->size(), column); - // replicate column and put int into cur_chunk - // @TODO what if column is const? - // @TODO this should be in cur_chunk and chunk? - auto aligned_column = column->replicate(aligned_offsets->get_data()); - cur_chunk->append_column(aligned_column, slot_id); - LOG(INFO) << "append outer common column: " << slot_id; - // chunk->append_column(col, slot_id); - } - for (auto slot_id : slot_ids) { - DCHECK(slot_id > 0); - if (cur_chunk->is_slot_exist(slot_id)) { - continue; + LOG(INFO) << "begin append capture column, num: " << slot_ids.size(); + for (auto slot_id : slot_ids) { + DCHECK(slot_id > 0); + if (cur_chunk->is_slot_exist(slot_id)) { + continue; + } + auto captured_column = chunk->get_column_by_slot_id(slot_id); + auto aligned_column = captured_column->replicate(aligned_offsets->get_data()); + cur_chunk->append_column(aligned_column, slot_id); + LOG(INFO) << "append capture column, " << slot_id; } - auto captured_column = chunk->get_column_by_slot_id(slot_id); - auto aligned_column = captured_column->replicate(aligned_offsets->get_data()); - cur_chunk->append_column(aligned_column, slot_id); - LOG(INFO) << "append capture column, " << slot_id; - } - 
#ifdef DEBUG - { - auto first_column = cur_chunk->get_column_by_slot_id(arguments_ids[0]); - for (int i = 1;i < argument_num;i++) { - auto column = cur_chunk->get_column_by_slot_id(arguments_ids[i]); - DCHECK_EQ(column->size(), first_column->size()) << "input arguments size should be same"; + #ifdef DEBUG + { + auto first_column = cur_chunk->get_column_by_slot_id(arguments_ids[0]); + for (int i = 1; i < argument_num; i++) { + auto column = cur_chunk->get_column_by_slot_id(arguments_ids[i]); + DCHECK_EQ(column->size(), first_column->size()) << "input arguments size should be same"; + } + LOG(INFO) << "check length done"; } - } - #endif - + #endif + // @TODO - { - // @TODO evalu param may be very large?? - // cut tmp chunk from cur_chunk, and eval - // cut data - // if cur_chunk has view_column, we should convert view_column to column again - - // @TODO can we find common expr from chunk? - for (const auto& [slot_id, _]: chunk->get_slot_id_to_index_map()) { - LOG(INFO) << "chunk contains slot id: " << slot_id; - } - for (const auto& [slot_id, _] : cur_chunk->get_slot_id_to_index_map()) { - LOG(INFO) << "cur_chunk contains slot id: " << slot_id; - } - // @TODO cut row [x,y] into a tmp chunk - ChunkAccumulator accumulator(DEFAULT_CHUNK_SIZE); - LOG(INFO) << "cur_chunk rows: " << cur_chunk->num_rows(); - RETURN_IF_ERROR(accumulator.push(std::move(cur_chunk))); - accumulator.finalize(); - while (auto tmp_chunk = accumulator.pull()) { - // if contains view, should translate it back - // TODO change column - auto new_chunk = std::make_shared(); - // const auto& columns = tmp_chunk->columns(); - LOG(INFO) << "tmp_chunk rows: " << tmp_chunk->num_rows(); - // for(size_t idx = 0;idx < columns.size();idx++) { - // const auto& column = columns[idx]; - // if (column->is_array_view()) { - // LOG(INFO) << "convert array-view to array, " << column->get_name(); - // ASSIGN_OR_RETURN(auto new_column, ArrayViewColumn::to_array_column(column)); - // LOG(INFO) << "convert done"; - // 
new_column->check_or_die(); - // // auto array_view_column = down_cast(column.get()); - // // ASSIGN_OR_RETURN(auto new_column, array_view_column->to_array_column()); - // LOG(INFO) << "update column, idx: " << idx; - // tmp_chunk->update_column_by_index(new_column, idx); - // } - // } - tmp_chunk->check_or_die(); - // for (const auto& column: tmp_chunk->columns()) { - // LOG(INFO) << "column: " << column->get_name(); - // DCHECK(!column->is_array_view()) << "unexpected array view"; - // } - - ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], tmp_chunk.get())); - tmp_col->check_or_die(); - tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], tmp_chunk->num_rows(), true); - if (column == nullptr) { - column = tmp_col; + { + // @TODO evalu param may be very large?? + // cut tmp chunk from cur_chunk, and eval + // cut data + // if cur_chunk has view_column, we should convert view_column to column again + if (all_input_is_constant) { + LOG(INFO) << "all input is constant, we just eval const column"; + LOG(INFO) << "eval lambda: " << _children[0]->debug_string(); + ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], cur_chunk.get())); + tmp_col->check_or_die(); + // tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], cur_chunk->num_rows(), true); + LOG(INFO) << "replicate result, result size: " << tmp_col->size() << ", align offset size: " << aligned_offsets->get_data().back() << ", offset num:" << aligned_offsets->size(); + column = ConstColumn::create(FunctionHelper::get_data_column_of_const(tmp_col), tmp_col->size()); + // column = FunctionHelper::get_data_column_of_nullable(tmp_col); + // column = tmp_col->replicate(aligned_offsets->get_data()); + LOG(INFO) << "column size: " << column->size(); + column = ColumnHelper::align_return_type(column, type().children[0], column->size(), true); + LOG(INFO) << "final column: " << column->get_name(); + // column = ColumnHelper::cast_to_nullable_column(column); } 
else { - column->append(*tmp_col); + + // @TODO can we find common expr from chunk? + for (const auto& [slot_id, _] : chunk->get_slot_id_to_index_map()) { + LOG(INFO) << "chunk contains slot id: " << slot_id; + } + for (const auto& [slot_id, _] : cur_chunk->get_slot_id_to_index_map()) { + LOG(INFO) << "cur_chunk contains slot id: " << slot_id; + } + // @TODO cut row [x,y] into a tmp chunk + ChunkAccumulator accumulator(DEFAULT_CHUNK_SIZE); + LOG(INFO) << "cur_chunk rows: " << cur_chunk->num_rows(); + RETURN_IF_ERROR(accumulator.push(std::move(cur_chunk))); + accumulator.finalize(); + while (auto tmp_chunk = accumulator.pull()) { + // if contains view, should translate it back + // TODO change column + auto new_chunk = std::make_shared(); + // const auto& columns = tmp_chunk->columns(); + LOG(INFO) << "tmp_chunk rows: " << tmp_chunk->num_rows(); + + tmp_chunk->check_or_die(); + + LOG(INFO) << "eval lambda: " << _children[0]->debug_string(); + ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], tmp_chunk.get())); + tmp_col->check_or_die(); + tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], tmp_chunk->num_rows(), true); + if (column == nullptr) { + column = tmp_col; + } else { + column->append(*tmp_col); + } + } } } + // construct the result array + DCHECK(column != nullptr); + column = ColumnHelper::cast_to_nullable_column(column); } - // construct the result array - DCHECK(column != nullptr); - column = ColumnHelper::cast_to_nullable_column(column); - } // @TODO handle const? 
+ if (all_input_is_constant) { + LOG(INFO) << "all input is const, create a const column as result"; + auto data_column = FunctionHelper::get_data_column_of_const(column); + aligned_offsets = UInt32Column::create(); + aligned_offsets->append(0); + aligned_offsets->append(column->size()); + auto array_col = std::make_shared( + data_column, ColumnHelper::as_column(aligned_offsets->clone_shared())); + array_col->check_or_die(); + ColumnPtr result_column = array_col; + if (result_null_column != nullptr) { + result_column = NullableColumn::create(std::move(array_col), result_null_column); + result_column->check_or_die(); + // return ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), result_null_column->size()); + // return NullableColumn::create(std::move(array_col), result_null_column); + } + result_column = ConstColumn::create(result_column, chunk->num_rows()); + result_column->check_or_die(); + LOG(INFO) << "result: " << result_column->get_name() << ", size: " << result_column->size(); + return result_column; + } // @TODO aligned offsets maybe null - + // @TODO // attach offsets auto array_col = std::make_shared( column, ColumnHelper::as_column(aligned_offsets->clone_shared())); @@ -380,8 +548,8 @@ std::string ArrayMapExpr::debug_string() const { std::stringstream out; auto expr_debug_string = Expr::debug_string(); out << "array_map ("; - for (int i = 0;i < _children.size();i++) { - out << (i == 0 ? "": ", ") << _children[i]->debug_string(); + for (int i = 0; i < _children.size(); i++) { + out << (i == 0 ? 
"" : ", ") << _children[i]->debug_string(); } out << ")"; return out.str(); diff --git a/be/src/exprs/array_map_expr.h b/be/src/exprs/array_map_expr.h index b0d339956ad67..84dc5776957d5 100644 --- a/be/src/exprs/array_map_expr.h +++ b/be/src/exprs/array_map_expr.h @@ -40,8 +40,10 @@ class ArrayMapExpr final : public Expr { StatusOr evaluate_checked(ExprContext* context, Chunk* ptr) override; std::string debug_string() const override; + private: // use map to make sure the order of execution - std::map _outer_common_exprs; + // std::map _outer_common_exprs; + std::map _outer_common_exprs; }; } // namespace starrocks diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h index f8a57a2f72147..ef861b1411313 100644 --- a/be/src/exprs/expr.h +++ b/be/src/exprs/expr.h @@ -120,9 +120,7 @@ class Expr { bool is_monotonic() const { return _is_monotonic; } bool is_cast_expr() const { return _node_type == TExprNodeType::CAST_EXPR; } - virtual bool is_lambda_function() const { - return false; - } + virtual bool is_lambda_function() const { return false; } // In most time, this field is passed from FE // Sometimes we want to construct expr on BE implicitly and we have knowledge about `monotonicity` diff --git a/be/src/exprs/lambda_function.cpp b/be/src/exprs/lambda_function.cpp index ca1abc7fcc6e3..f854b937b8a3a 100644 --- a/be/src/exprs/lambda_function.cpp +++ b/be/src/exprs/lambda_function.cpp @@ -29,11 +29,9 @@ namespace starrocks { -LambdaFunction::LambdaFunction(const TExprNode& node) : Expr(node, false), _common_sub_expr_num(node.output_column) { -} +LambdaFunction::LambdaFunction(const TExprNode& node) : Expr(node, false), _common_sub_expr_num(node.output_column) {} -Status LambdaFunction::extract_outer_common_exprs( - RuntimeState* state, Expr* expr, ExtractContext* ctx) { +Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, Expr* expr, ExtractContext* ctx) { if (expr->is_slotref()) { return Status::OK(); } @@ -49,11 +47,11 @@ Status 
LambdaFunction::extract_outer_common_exprs( std::vector slot_ids; // @TODO we can't replace lambda? - for (int i = 0;i < child_num;i++) { + for (int i = 0; i < child_num; i++) { auto child = expr->get_child(i); RETURN_IF_ERROR(extract_outer_common_exprs(state, child, ctx)); - if (child->is_slotref()) { + if (child->is_slotref() || child->is_lambda_function()) { continue; } slot_ids.clear(); @@ -67,9 +65,9 @@ Status LambdaFunction::extract_outer_common_exprs( SlotId slot_id = ctx->next_slot_id++; ColumnRef* column_ref = state->obj_pool()->add(new ColumnRef(child->type(), slot_id)); LOG(INFO) << "add new common expr, slot_id: " << slot_id << ", new expr: " << column_ref->debug_string() - << ", old expr: " << child->debug_string(); + << ", old expr: " << child->debug_string(); expr->_children[i] = column_ref; - ctx->outer_common_exprs.insert({column_ref, child}); + ctx->outer_common_exprs.insert({slot_id, child}); } } return Status::OK(); @@ -77,10 +75,11 @@ Status LambdaFunction::extract_outer_common_exprs( Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, ExtractContext* ctx) { RETURN_IF_ERROR(collect_lambda_argument_ids()); - for (auto argument_id: _arguments_ids) { + for (auto argument_id : _arguments_ids) { ctx->lambda_arguments.insert(argument_id); LOG(INFO) << "lambda arg id: " << argument_id; } + // @TODO what if lambda_expr is independent? 
auto lambda_expr = _children[0]; RETURN_IF_ERROR(extract_outer_common_exprs(state, lambda_expr, ctx)); return Status::OK(); @@ -91,7 +90,7 @@ Status LambdaFunction::collect_lambda_argument_ids() { return Status::OK(); } const int child_num = get_num_children() - 2 * _common_sub_expr_num; - for (int i = 1;i < child_num;i++) { + for (int i = 1; i < child_num; i++) { _children[i]->get_slot_ids(&_arguments_ids); } if (child_num - 1 != _arguments_ids.size()) { @@ -103,7 +102,7 @@ Status LambdaFunction::collect_lambda_argument_ids() { SlotId LambdaFunction::max_used_slot_id() const { std::vector ids; - for (auto child: _children) { + for (auto child : _children) { child->get_slot_ids(&ids); } DCHECK(!ids.empty()); @@ -121,7 +120,7 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo const int child_num = get_num_children() - 2 * _common_sub_expr_num; LOG(INFO) << "lambda child num: " << child_num << ", common: " << _common_sub_expr_num; LOG(INFO) << debug_string(); - for (int i = 0; i< child_num;i++) { + for (int i = 0; i < child_num; i++) { LOG(INFO) << "child[" << i << "] = " << get_child(i)->debug_string(); } RETURN_IF_ERROR(collect_lambda_argument_ids()); @@ -146,18 +145,27 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo if (_common_sub_expr.size() != _common_sub_expr_num) { return Status::InternalError(fmt::format("Lambda common sub expressions' size {} is not equal to expected {}", _common_sub_expr.size(), _common_sub_expr_num)); - } - + } // get slot ids from the lambda expression get_child(0)->get_slot_ids(&_captured_slot_ids); - for (auto id: _captured_slot_ids) { - LOG(INFO) << "lambda capture id: " << id ; + // bool is_lambda_independent = true; + _is_lambda_expr_independent = true; + for (auto id : _captured_slot_ids) { + LOG(INFO) << "lambda capture id: " << id; + for (const auto& arguments_id : _arguments_ids) { + if (id == arguments_id) { + // is_lambda_independent = false; + 
_is_lambda_expr_independent = false; + break; + } + } } + LOG(INFO) << "lambda is independent: " << _is_lambda_expr_independent; + // if lambda expr is independent, mark // @TODO find all independent capture column, evaluate them first... - // remove current argument ids and duplicated ids from captured_slot_ids std::map captured_mask; int valid_id = 0; @@ -204,8 +212,8 @@ std::string LambdaFunction::debug_string() const { std::stringstream out; auto expr_debug_string = Expr::debug_string(); out << "LambaFunction ("; - for (int i = 0;i < _children.size();i++) { - out << (i == 0 ? "lambda expr, ": "input argument, ") << _children[i]->debug_string(); + for (int i = 0; i < _children.size(); i++) { + out << (i == 0 ? "lambda expr, " : "input argument, ") << _children[i]->debug_string(); } out << ")"; return out.str(); diff --git a/be/src/exprs/lambda_function.h b/be/src/exprs/lambda_function.h index b9fe5b511c945..16b50bdfafb3e 100644 --- a/be/src/exprs/lambda_function.h +++ b/be/src/exprs/lambda_function.h @@ -47,7 +47,7 @@ class LambdaFunction final : public Expr { // the slot ids of lambda expression may be originally from the arguments of this lambda function // or its parent lambda functions, or captured columns, remove the first one. 
- // only capture column id, + // only capture column id, int get_slot_ids(std::vector* slot_ids) const override { slot_ids->insert(slot_ids->end(), _captured_slot_ids.begin(), _captured_slot_ids.end()); return _captured_slot_ids.size(); @@ -58,8 +58,9 @@ class LambdaFunction final : public Expr { return _arguments_ids.size(); } - bool is_lambda_function() const override { - return true; + bool is_lambda_function() const override { return true; } + bool is_lambda_expr_independent() const { + return _is_lambda_expr_independent; } Expr* get_lambda_expr() const { return _children[0]; } @@ -68,7 +69,7 @@ class LambdaFunction final : public Expr { struct ExtractContext { std::unordered_set lambda_arguments; SlotId next_slot_id; - std::map outer_common_exprs; + std::map outer_common_exprs; }; SlotId max_used_slot_id() const; @@ -84,6 +85,7 @@ class LambdaFunction final : public Expr { // void try_to_replace_commom_expr(RuntimeState* state, Expr* expr); std::vector _captured_slot_ids; + // @TODO change to set std::vector _arguments_ids; std::vector _common_sub_expr_ids; std::vector _common_sub_expr; @@ -91,5 +93,6 @@ class LambdaFunction final : public Expr { // std::unordered_map _outer_common_exprs; int _common_sub_expr_num; bool _is_prepared = false; + bool _is_lambda_expr_independent = false; }; } // namespace starrocks From ac6eae768cf717ee124158ea98e2b09facbf837f Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Fri, 20 Sep 2024 15:31:49 +0800 Subject: [PATCH 05/17] stash Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/column/array_column.cpp | 16 ++ be/src/column/array_column.h | 3 + be/src/exprs/array_map_expr.cpp | 330 ++++++++++++++++++++++-- be/src/exprs/array_map_expr.h | 7 + test/sql/test_array_fn/T/test_array_map | 58 +++++ 5 files changed, 387 insertions(+), 27 deletions(-) create mode 100644 test/sql/test_array_fn/T/test_array_map diff --git 
a/be/src/column/array_column.cpp b/be/src/column/array_column.cpp index 584f3a5ddf6f2..8b07a04fa7d22 100644 --- a/be/src/column/array_column.cpp +++ b/be/src/column/array_column.cpp @@ -618,6 +618,22 @@ Status ArrayColumn::unfold_const_children(const starrocks::TypeDescriptor& type) return Status::OK(); } +size_t ArrayColumn::get_total_elements_num(const NullColumnPtr& null_column) const { + if (null_column == nullptr) { + return _elements->size(); + } + DCHECK_LE(_offsets->size() -1, null_column->size()); + size_t elements_num = 0; + size_t num_rows = _offsets->size() - 1; + const auto& null_data = null_column->get_data(); + for (size_t i = 0;i < num_rows;i++) { + if (!null_data[i]) { + elements_num += _offsets->get_data()[i + 1] - _offsets->get_data()[i]; + } + } + return elements_num; +} + template bool ArrayColumn::compare_lengths_from_offsets(const UInt32Column& v1, const UInt32Column& v2, const NullColumnPtr& null_column) { diff --git a/be/src/column/array_column.h b/be/src/column/array_column.h index c95796f1cd675..73db78fb86d2f 100644 --- a/be/src/column/array_column.h +++ b/be/src/column/array_column.h @@ -196,6 +196,9 @@ class ArrayColumn final : public ColumnFactory { Status unfold_const_children(const starrocks::TypeDescriptor& type) override; + // calculate all non-null elements' size + size_t get_total_elements_num(const NullColumnPtr& null_column) const; + // check if all of arrays' size is equal // v1 and v2 must be one of ArrayColumn or Const(ArrayColumn) template diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index 71e8f326f7197..a25b9a512f4f6 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -71,6 +71,228 @@ Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { return Status::OK(); } +template +StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chunk* chunk, const std::vector& input_elements, NullColumnPtr result_null_column) { + + // create a 
new chunk to evaluate the lambda expression + auto cur_chunk = std::make_shared(); + + // 1. evaluate all outer common expressions + LOG(INFO) << "eval outer common exprs, size: " << _outer_common_exprs.size(); + for (const auto& [slot_id, expr] : _outer_common_exprs) { + LOG(INFO) << "eval non-capture expr, slot_id: " << slot_id << ", expr: " << expr->debug_string(); + ASSIGN_OR_RETURN(auto col, context->evaluate(expr, chunk)); + LOG(INFO) << "col size: " << col->size(); + chunk->append_column(col, slot_id); + } + LOG(INFO) << "eval outer common exprs done"; + auto lambda_func = dynamic_cast(_children[0]); + std::vector capture_slot_ids; + lambda_func->get_slot_ids(&capture_slot_ids); + // 2. check captured columns size + for (auto slot_id : capture_slot_ids) { + LOG(INFO) << "check slot id: " << slot_id; + DCHECK(slot_id > 0); + auto captured_column = chunk->get_column_by_slot_id(slot_id); + // @TODO why? + if (UNLIKELY(captured_column->size() < input_elements[0]->size())) { + return Status::InternalError(fmt::format( + "The size of the captured column {} is less than array's size.", captured_column->get_name())); + } + } + + // 3. prepare lambda arguments: + // 3.1 put all elements column into cur_chunk + // 3.2 get aligned_offset + + UInt32Column::Ptr aligned_offsets = nullptr; + size_t null_rows = result_null_column ? 
SIMD::count_nonzero(result_null_column->get_data()): 0; + + std::vector arguments_ids; + int argument_num = lambda_func->get_lambda_arguments_ids(&arguments_ids); + for (int i = 0; i < argument_num; ++i) { + auto data_column = FunctionHelper::get_data_column_of_const(input_elements[i]); + auto array_column = down_cast(data_column.get()); + auto elements_column = array_column->elements_column(); + UInt32Column::Ptr offsets_column = array_column->offsets_column(); + LOG(INFO) << "input element " << i << ", " << input_elements[i]->get_name() << ", size: " << input_elements[i]->size(); + if constexpr (!all_const_input) { + if (input_elements[i]->is_constant()) { + // @TODO const may be null + size_t elements_num = array_column->get_element_size(0); + elements_column = elements_column->clone(); + LOG(INFO) << "element size: " << elements_column->size(); + offsets_column = UInt32Column::create(); + // replicate N time and ignore null + size_t repeat_times = input_elements[i]->size() - null_rows; + LOG(INFO) << "repeat times: " << repeat_times << ", null_rows:" << null_rows; + size_t offset = elements_num; + offsets_column->append(0); + offsets_column->append(offset); + for (size_t i = 1; i < repeat_times; i++) { + elements_column->append(*elements_column, 0, elements_num); + LOG(INFO) << "element size: " << elements_column->size(); + offset += elements_num; + offsets_column->append(offset); + } + LOG(INFO) << "offset: " << offset; + } else { + data_column->empty_null_in_complex_column(result_null_column->get_data(), + array_column->offsets().get_data()); + elements_column = down_cast(data_column.get())->elements_column(); + } + } + + if (aligned_offsets == nullptr) { + LOG(INFO) << "assign offsets: " << offsets_column->size(); + aligned_offsets = offsets_column; + } + // if lambda expr doesn't rely on argument, we don't need to put it into cur_chunk + if constexpr (!independent_lambda_expr) { + // @TODO what if it is a const + cur_chunk->append_column(elements_column, 
arguments_ids[i]); + LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; + } + } + // @TODO put outer common expr into cur_chunk, + DCHECK(aligned_offsets != nullptr); + + // 4. prepare outer common expr + for (const auto& [slot_id, expr] : _outer_common_exprs) { + auto column = chunk->get_column_by_slot_id(slot_id); + LOG(INFO) << "unpack const column: " << column->get_name() << ", size: " << column->size(); + column = ColumnHelper::unpack_and_duplicate_const_column(column->size(), column); + LOG(INFO) << "append outer common column: " << slot_id; + if constexpr (independent_lambda_expr) { + // if lambda expr doesn't rely on arguments, we don't need to align offset + cur_chunk->append_column(column, slot_id); + } else { + cur_chunk->append_column(column->replicate(aligned_offsets->get_data()), slot_id); + } + LOG(INFO) << "append outer common column: " << slot_id; + } + // 5. append capture column + for (auto slot_id : capture_slot_ids) { + if (cur_chunk->is_slot_exist(slot_id)) { + continue; + } + auto captured_column = chunk->get_column_by_slot_id(slot_id); + if constexpr (independent_lambda_expr) { + cur_chunk->append_column(captured_column, slot_id); + } else { + cur_chunk->append_column(captured_column->replicate(aligned_offsets->get_data()), slot_id); + } + LOG(INFO) << "append capture column: " << slot_id; + } + // 6. 
eval lambda expr + ColumnPtr column = nullptr; + if constexpr (independent_lambda_expr) { + // if lambda expr doesn't rely on arguments, we evaluate it first, and then align offsets + // @TODO cur_chunk may empty + ColumnPtr tmp_col; + if (!cur_chunk->has_columns()) { + ASSIGN_OR_RETURN(tmp_col, context->evaluate(_children[0], nullptr)); + } else { + ASSIGN_OR_RETURN(tmp_col, context->evaluate(_children[0], cur_chunk.get())); + } + tmp_col->check_or_die(); + column = tmp_col->replicate(aligned_offsets->get_data()); + column = ColumnHelper::align_return_type(column, type().children[0], column->size(), true); + // column = ColumnHelper::cast_to_nullable_column(column); + } else { + // if all input arguments are const, + // @TODO what if cur_chunk is empty???? + if constexpr (all_const_input) { + // @TODO + ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], cur_chunk.get())); + tmp_col->check_or_die(); + LOG(INFO) << "tmp col: " << tmp_col->get_name() << ", tmp_col size: " << tmp_col->size(); + if (tmp_col->is_nullable()) { + auto null_col = ColumnHelper::as_raw_column(tmp_col)->null_column(); + std::stringstream oss; + for (size_t i = 0;i < null_col->get_data().size();i++) { + oss << static_cast(null_col->get_data()[i]) << ","; + } + LOG(INFO) << "tmp col null data: " << oss.str(); + } + // @TODO null + // @TODO pending fix + // @TODO don't need create + // column = ConstColumn::create(FunctionHelper::get_data_column_of_const(tmp_col), tmp_col->size()); + column = FunctionHelper::get_data_column_of_const(tmp_col); + column = ColumnHelper::align_return_type(column, type().children[0], column->size(), true); + // @TODO null??? + } else { + // create a ChunkAccumulator? + // do we need accumulator?? 
+ ChunkAccumulator accumulator(DEFAULT_CHUNK_SIZE); + LOG(INFO) << "cur_chunk rows: " << cur_chunk->num_rows(); + RETURN_IF_ERROR(accumulator.push(std::move(cur_chunk))); + accumulator.finalize(); + while (auto tmp_chunk = accumulator.pull()) { + LOG(INFO) << "tmp_chunk rows: " << tmp_chunk->num_rows(); + tmp_chunk->check_or_die(); + LOG(INFO) << "eval lambda: " << _children[0]->debug_string(); + ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], tmp_chunk.get())); + tmp_col->check_or_die(); + tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], tmp_chunk->num_rows(), true); + if (column == nullptr) { + column = tmp_col; + } else { + column->append(*tmp_col); + } + } + } + } + DCHECK(column != nullptr); + column = ColumnHelper::cast_to_nullable_column(column); + + if constexpr (all_const_input) { + LOG(INFO) << "all input arguments are constant, return a const column, has_null: " << (result_null_column == nullptr ? 0: SIMD::count_nonzero(result_null_column->get_data())); + LOG(INFO) << "column is nullable: " << column->is_nullable(); + // if all input is const, we can return a const column + // @TODO consider null + // @TODO column may be const/nullable column + + auto data_column = FunctionHelper::get_data_column_of_const(column); + LOG(INFO) << "data column: " << data_column->get_name(); + if (data_column->is_nullable()) { + auto null_column = ColumnHelper::as_column(data_column)->null_column(); + std::ostringstream oss; + for (size_t i = 0;i < null_column->get_data().size();i++) { + oss << static_cast(null_column->get_data()[i]) << ","; + } + LOG(INFO) << "null data: " << oss.str(); + } + aligned_offsets = UInt32Column::create(); + aligned_offsets->append(0); + aligned_offsets->append(column->size()); + auto array_column = std::make_shared(data_column, ColumnHelper::as_column(aligned_offsets)); + array_column->check_or_die(); + ColumnPtr result_column = array_column; + if (result_null_column != nullptr) { + result_column = 
NullableColumn::create(std::move(array_column), result_null_column); + result_column->check_or_die(); + } + result_column = ConstColumn::create(result_column, chunk->num_rows()); + result_column->check_or_die(); + return result_column; + } else { + auto array_column = std::make_shared( + column, ColumnHelper::as_column(aligned_offsets->clone_shared())); + array_column->check_or_die(); + if (result_null_column != nullptr) { + return NullableColumn::create(std::move(array_column), result_null_column); + } + return array_column; + } +} + +// split into multi process +// 1. eval lambda arugments and check array_length +// 2. if all input is null, return result +// 3. else prepare lambda expr input: consider all input is const and lambda expr don't rely on argument + // The input array column maybe nullable, so first remove the wrap of nullable property. // The result of lambda expressions do not change the offsets of the current array and the null map. // NOTE the return column must be of the return type. @@ -80,9 +302,6 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // NullColumnPtr null_column = nullptr; bool is_single_nullable_child = false; - // ArrayColumn* input_array = nullptr; - ColumnPtr input_array = nullptr; - ColumnPtr input_array_ptr_ref = nullptr; // hold shared_ptr to avoid early deleted. // ColumnPtr aligned_offsets; UInt32Column::Ptr aligned_offsets = nullptr; @@ -94,6 +313,7 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // for many valid arguments: // if one of them is a null literal, the result is a null literal; // if one of them is only null, then results are null; + // unfold const columns. // make sure all inputs have the same offsets. // TODO(fzh): support several arrays with different offsets and set null for non-equal size of arrays. 
@@ -121,14 +341,6 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* auto nullable_column = down_cast(data_column.get()); DCHECK(nullable_column); data_column = nullable_column->data_column(); - // empty null array with non-zero elements - // @TODO can we remove it?? - - // @TODO we can check it before?? - // this will re-build data column, replace null element to empty array - // data_column->empty_null_in_complex_column( - // nullable_column->null_column()->get_data(), - // down_cast(data_column.get())->offsets().get_data()); auto null_column = nullable_column->null_column(); if (is_const) { @@ -141,24 +353,10 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* if (result_null_column) { is_single_nullable_child = false; // union two null column - LOG(INFO) << "union result_null_column, size: " << result_null_column->size() - << ", null size: " << null_column->size(); result_null_column = FunctionHelper::union_null_column(null_column, result_null_column); - LOG(INFO) << "union done: " << result_null_column->size(); - std::ostringstream oss; - for (auto null_data: result_null_column->get_data()) { - oss << static_cast(null_data) << ","; - } - LOG(INFO) << "null data: " << oss.str(); } else { is_single_nullable_child = true; result_null_column = null_column; - LOG(INFO) << "assign result_null_column, size: " << null_column->size(); - std::ostringstream oss; - for (auto null_data: result_null_column->get_data()) { - oss << static_cast(null_data) << ","; - } - LOG(INFO) << "null data: " << oss.str(); } } DCHECK(data_column->is_array() && !data_column->is_nullable()); @@ -191,11 +389,18 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* DCHECK(result_null_column != nullptr); // If there are more than one nullable children, the nullable column has been cloned when calling // union_null_column to merge, so only one nullable child needs to be cloned. + // @TODO why?? 
result_null_column = ColumnHelper::as_column(result_null_column->clone_shared()); } ColumnPtr column = nullptr; size_t null_rows = result_null_column ? SIMD::count_nonzero(result_null_column->get_data()) : 0; + // @TODO we should know if elements are empty + size_t total_elements_num = down_cast( + FunctionHelper::get_data_column_of_const(input_elements[0]).get())->get_total_elements_num(result_null_column); + LOG(INFO) << "total elements num: " << total_elements_num; + + // @TODO what if array is empty if (null_rows == input_elements[0]->size()) { // all input is null // @TODO we can give a Const(Nullable(ArrayColumn)) @@ -211,13 +416,50 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* auto array_col = std::make_shared(column, aligned_offsets); array_col->check_or_die(); LOG(INFO) << "array_col size: " << array_col->size(); - result_null_column->resize(1); - auto result = ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), chunk->num_rows()); + if (result_null_column) { + result_null_column->resize(1); + auto result = ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), chunk->num_rows()); + result->check_or_die(); + return result; + } + // @TODO empty?? 
+ auto result = ConstColumn::create(std::move(array_col), chunk->num_rows()); result->check_or_die(); + // result_null_column->resize(1); + // auto result = ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), chunk->num_rows()); + // result->check_or_die(); return result; // @TODO shoulw give // aligned_offsets->append(0); + } else if (total_elements_num == 0) { + LOG(INFO) << "all input are empty, should return a const empty"; + column = ColumnHelper::create_column(type().children[0], + true); + aligned_offsets = UInt32Column::create(0); + aligned_offsets->append_default(2); + auto array_col = std::make_shared(column, aligned_offsets); + array_col->check_or_die(); + auto result = ConstColumn::create(std::move(array_col), chunk->num_rows() - null_rows); + result->check_or_die(); + return result; } else { + // @TODO move to a new function + + if (true) { + auto lambda_func = dynamic_cast(_children[0]); + bool is_lambda_expr_independent = lambda_func->is_lambda_expr_independent(); + if (all_input_is_constant && is_lambda_expr_independent) { + return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); + } else if (all_input_is_constant && !is_lambda_expr_independent) { + return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); + } else if (!all_input_is_constant && is_lambda_expr_independent) { + return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); + } else { + return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); + } + } + + // construct a new chunk to evaluate the lambda expression. 
auto cur_chunk = std::make_shared(); @@ -295,7 +537,11 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // cur_chunk->append_column(elements_column, arguments_ids[i]); LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; } + // @TODO if elements is null + DCHECK(aligned_offsets != nullptr); + LOG(INFO) << "last offset: " << aligned_offsets->get_data().back(); + LOG(INFO) << "begin append outer common column, num: " << _outer_common_exprs.size(); for (const auto& [slot_id, expr] : _outer_common_exprs) { auto column = chunk->get_column_by_slot_id(slot_id); @@ -347,6 +593,8 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // should we replicate capture column??? // empty all null is ok + // @TODO what if all input is empty + // @TODO if all input is const, we don't need unpack const if (all_input_is_constant) { // if all input arguments are ConstColumn, we don't need unpack, just evaluate on ConstColumn @@ -409,6 +657,28 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // @TODO put outer common expr into cur_chunk, DCHECK(aligned_offsets != nullptr); + LOG(INFO) << "last offset: " << aligned_offsets->get_data().back(); + if (aligned_offsets->get_data().back() == 0) { + // this means no elements for input, we can return an empty array column directly + // @OTOD + LOG(INFO) << "all input is empty, just return an empty array column"; + // @TODO create an array column with all null + column = ColumnHelper::create_column(type().children[0], + true); // array->elements must be of return array->elements' type + // column->append_default(1); + // @TODO handle aligned offsets + aligned_offsets = UInt32Column::create(0); + aligned_offsets->append(0); + // aligned_offsets->append(1); + auto array_col = std::make_shared(column, aligned_offsets); + array_col->check_or_die(); + LOG(INFO) << "array_col size: " << array_col->size(); + + result_null_column->resize(1); 
+ auto result = ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), chunk->num_rows()); + result->check_or_die(); + return result; + } // if capture column is empty // align offset LOG(INFO) << "begin append outer common column, num: " << _outer_common_exprs.size(); @@ -455,6 +725,12 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* // cut tmp chunk from cur_chunk, and eval // cut data // if cur_chunk has view_column, we should convert view_column to column again + + if (cur_chunk->is_empty()) { + // all input is empty??? should return empty result + + + } if (all_input_is_constant) { LOG(INFO) << "all input is constant, we just eval const column"; LOG(INFO) << "eval lambda: " << _children[0]->debug_string(); diff --git a/be/src/exprs/array_map_expr.h b/be/src/exprs/array_map_expr.h index 84dc5776957d5..fde35309f7124 100644 --- a/be/src/exprs/array_map_expr.h +++ b/be/src/exprs/array_map_expr.h @@ -22,6 +22,7 @@ #include "common/object_pool.h" #include "exprs/column_ref.h" #include "exprs/expr.h" +#include "column/nullable_column.h" #include "glog/logging.h" namespace starrocks { @@ -42,6 +43,12 @@ class ArrayMapExpr final : public Expr { std::string debug_string() const override; private: + template + StatusOr evaluate_lambda_expr(ExprContext* context, Chunk* chunk, + const std::vector& arguments, NullColumnPtr null_column); + + // Status prepare_lambda_arguments(ExprContext* context, Chunk* chunk, std::vector* input_elements); + // use map to make sure the order of execution // std::map _outer_common_exprs; std::map _outer_common_exprs; diff --git a/test/sql/test_array_fn/T/test_array_map b/test/sql/test_array_fn/T/test_array_map new file mode 100644 index 0000000000000..7862ac3280aba --- /dev/null +++ b/test/sql/test_array_fn/T/test_array_map @@ -0,0 +1,58 @@ +-- name: test_array_map_2 +CREATE TABLE `array_map_test` ( + `id` tinyint(4) NOT NULL COMMENT "", + `arr_str` array NULL COMMENT "", + `arr_largeint` 
array NULL COMMENT "" +) ENGINE=OLAP +DUPLICATE KEY(`id`) +DISTRIBUTED BY RANDOM +PROPERTIES ( +"replication_num" = "1" +); + +insert into array_map_test values (1, array_repeat("abcdefghasdasdasirnqwrq", 20000), array_repeat(100, 20000)); + +select count() from array_map_test where array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint)) > 10 ; +select count(array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint))) from array_map_test; + +select count() from array_map_test where any_match(x->any_match(x->x<10, arr_largeint), arr_largeint); +select count(any_match(x->any_match(x->x<10, arr_largeint), arr_largeint)) from array_map_test; +select count(array_map(x->array_length(array_concat(arr_str,[])), arr_largeint)) from array_map_test; + +set @arr=array_repeat("12345",1000000); +select array_length(array_map((x,y)->x > y, @arr,@arr)) from table(generate_series(1,10,1)); + +-- name: test_array_map_3 +CREATE TABLE `t` ( + `k` bigint NOT NULL COMMENT "", + `arr_0` array NOT NULL COMMENT "", + `arr_1` array NULL COMMENT "", + `arr_2` array NULL COMMENT "" +) ENGINE=OLAP +primary KEY(`k`) +DISTRIBUTED BY RANDOM BUCKETS 1 +PROPERTIES ( +"replication_num" = "1" +); + +insert into t values (1, [1,2], [1,2],[2,3]), (2, [1,2], null, [2,3]), (3, [1,2],[1,2],null),(4, [1,2],[null,null],[2,3]), (5, [1], [1,2], [3]); +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t; +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t where k != 5 order by k; +delete from t where k = 5; + +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t order by k; +select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1,2]) from t order by k; +select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1]) from t order by k; + +select array_map(x->x, arr_0) from t order by k; +-- independent expr +select array_map((x,y,z)->10, arr_0, arr_1, arr_2) from t; +select array_map((x,y)-> k, arr_0, arr_1) from t order by k; + +-- independent expr 
with all const +select array_map((x,y)->k, [1,2],[2,3]) from t order by k; + +-- non indepentdent with all const +select array_map((x,y,z)->x+y+z, [1,2],[2,3],[3,4]) from t; +select array_map((x,y,z)->x+y+z, [1,2],[2,null],[3,4]) from t; +select array_map((x,y,z)->x+y+z, [1,2],[2,null],null) from t; From 3a19f68cbe3364aa6440567158204d5e35f0da06 Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Fri, 20 Sep 2024 16:29:16 +0800 Subject: [PATCH 06/17] remove unused code Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/exprs/array_map_expr.cpp | 519 ++-------------------- be/src/exprs/array_map_expr.h | 3 - test/sql/test_array/R/test_array_map | 3 +- test/sql/test_array/T/test_array_map | 58 +++ test/sql/test_array_fn/R/test_array_map_2 | 150 +++++++ test/sql/test_array_fn/T/test_array_map_2 | 60 +++ 6 files changed, 302 insertions(+), 491 deletions(-) create mode 100644 test/sql/test_array_fn/R/test_array_map_2 create mode 100644 test/sql/test_array_fn/T/test_array_map_2 diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index a25b9a512f4f6..741ddcd76102a 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -46,13 +46,9 @@ Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { for (int i = 1; i < _children.size(); ++i) { RETURN_IF_ERROR(_children[i]->prepare(state, context)); } - // if child 0 is not lambda, what will happen whe nevaluate - // @TODO if children[0] not lambda - // @TODO _children[0] maybe not a lambda function? 
auto lambda_expr = down_cast(_children[0]); - // before prepare lambda - // collect max slot id + LambdaFunction::ExtractContext extract_ctx; extract_ctx.next_slot_id = lambda_expr->max_used_slot_id() + 1; @@ -61,9 +57,7 @@ Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { _outer_common_exprs.swap(extract_ctx.outer_common_exprs); for (auto [_, expr] : _outer_common_exprs) { - // @TODO LOG(INFO) << "prepare common expr: " << expr->debug_string(); - // @TODO if after rewrite, first expr of array_map become column ref, we can remove it? RETURN_IF_ERROR(expr->prepare(state, context)); } RETURN_IF_ERROR(lambda_expr->prepare(state, context)); @@ -78,23 +72,19 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu auto cur_chunk = std::make_shared(); // 1. evaluate all outer common expressions - LOG(INFO) << "eval outer common exprs, size: " << _outer_common_exprs.size(); for (const auto& [slot_id, expr] : _outer_common_exprs) { - LOG(INFO) << "eval non-capture expr, slot_id: " << slot_id << ", expr: " << expr->debug_string(); ASSIGN_OR_RETURN(auto col, context->evaluate(expr, chunk)); - LOG(INFO) << "col size: " << col->size(); chunk->append_column(col, slot_id); } - LOG(INFO) << "eval outer common exprs done"; + auto lambda_func = dynamic_cast(_children[0]); std::vector capture_slot_ids; lambda_func->get_slot_ids(&capture_slot_ids); - // 2. check captured columns size + + // 2. check captured columnss size for (auto slot_id : capture_slot_ids) { - LOG(INFO) << "check slot id: " << slot_id; DCHECK(slot_id > 0); auto captured_column = chunk->get_column_by_slot_id(slot_id); - // @TODO why? 
if (UNLIKELY(captured_column->size() < input_elements[0]->size())) { return Status::InternalError(fmt::format( "The size of the captured column {} is less than array's size.", captured_column->get_name())); @@ -115,27 +105,21 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu auto array_column = down_cast(data_column.get()); auto elements_column = array_column->elements_column(); UInt32Column::Ptr offsets_column = array_column->offsets_column(); - LOG(INFO) << "input element " << i << ", " << input_elements[i]->get_name() << ", size: " << input_elements[i]->size(); if constexpr (!all_const_input) { if (input_elements[i]->is_constant()) { - // @TODO const may be null size_t elements_num = array_column->get_element_size(0); elements_column = elements_column->clone(); - LOG(INFO) << "element size: " << elements_column->size(); offsets_column = UInt32Column::create(); // replicate N time and ignore null size_t repeat_times = input_elements[i]->size() - null_rows; - LOG(INFO) << "repeat times: " << repeat_times << ", null_rows:" << null_rows; size_t offset = elements_num; offsets_column->append(0); offsets_column->append(offset); for (size_t i = 1; i < repeat_times; i++) { elements_column->append(*elements_column, 0, elements_num); - LOG(INFO) << "element size: " << elements_column->size(); offset += elements_num; offsets_column->append(offset); } - LOG(INFO) << "offset: " << offset; } else { data_column->empty_null_in_complex_column(result_null_column->get_data(), array_column->offsets().get_data()); @@ -144,32 +128,27 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu } if (aligned_offsets == nullptr) { - LOG(INFO) << "assign offsets: " << offsets_column->size(); aligned_offsets = offsets_column; } + // if lambda expr doesn't rely on argument, we don't need to put it into cur_chunk if constexpr (!independent_lambda_expr) { // @TODO what if it is a const cur_chunk->append_column(elements_column, arguments_ids[i]); - LOG(INFO) 
<< "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; } } - // @TODO put outer common expr into cur_chunk, DCHECK(aligned_offsets != nullptr); // 4. prepare outer common expr for (const auto& [slot_id, expr] : _outer_common_exprs) { auto column = chunk->get_column_by_slot_id(slot_id); - LOG(INFO) << "unpack const column: " << column->get_name() << ", size: " << column->size(); column = ColumnHelper::unpack_and_duplicate_const_column(column->size(), column); - LOG(INFO) << "append outer common column: " << slot_id; if constexpr (independent_lambda_expr) { // if lambda expr doesn't rely on arguments, we don't need to align offset cur_chunk->append_column(column, slot_id); } else { cur_chunk->append_column(column->replicate(aligned_offsets->get_data()), slot_id); } - LOG(INFO) << "append outer common column: " << slot_id; } // 5. append capture column for (auto slot_id : capture_slot_ids) { @@ -182,13 +161,12 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu } else { cur_chunk->append_column(captured_column->replicate(aligned_offsets->get_data()), slot_id); } - LOG(INFO) << "append capture column: " << slot_id; } - // 6. eval lambda expr + + // 6. evaluate lambda expr ColumnPtr column = nullptr; if constexpr (independent_lambda_expr) { // if lambda expr doesn't rely on arguments, we evaluate it first, and then align offsets - // @TODO cur_chunk may empty ColumnPtr tmp_col; if (!cur_chunk->has_columns()) { ASSIGN_OR_RETURN(tmp_col, context->evaluate(_children[0], nullptr)); @@ -198,41 +176,20 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu tmp_col->check_or_die(); column = tmp_col->replicate(aligned_offsets->get_data()); column = ColumnHelper::align_return_type(column, type().children[0], column->size(), true); - // column = ColumnHelper::cast_to_nullable_column(column); } else { // if all input arguments are const, - // @TODO what if cur_chunk is empty???? 
if constexpr (all_const_input) { - // @TODO ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], cur_chunk.get())); tmp_col->check_or_die(); - LOG(INFO) << "tmp col: " << tmp_col->get_name() << ", tmp_col size: " << tmp_col->size(); - if (tmp_col->is_nullable()) { - auto null_col = ColumnHelper::as_raw_column(tmp_col)->null_column(); - std::stringstream oss; - for (size_t i = 0;i < null_col->get_data().size();i++) { - oss << static_cast(null_col->get_data()[i]) << ","; - } - LOG(INFO) << "tmp col null data: " << oss.str(); - } - // @TODO null - // @TODO pending fix - // @TODO don't need create - // column = ConstColumn::create(FunctionHelper::get_data_column_of_const(tmp_col), tmp_col->size()); + column = FunctionHelper::get_data_column_of_const(tmp_col); column = ColumnHelper::align_return_type(column, type().children[0], column->size(), true); - // @TODO null??? } else { - // create a ChunkAccumulator? - // do we need accumulator?? ChunkAccumulator accumulator(DEFAULT_CHUNK_SIZE); - LOG(INFO) << "cur_chunk rows: " << cur_chunk->num_rows(); RETURN_IF_ERROR(accumulator.push(std::move(cur_chunk))); accumulator.finalize(); while (auto tmp_chunk = accumulator.pull()) { - LOG(INFO) << "tmp_chunk rows: " << tmp_chunk->num_rows(); tmp_chunk->check_or_die(); - LOG(INFO) << "eval lambda: " << _children[0]->debug_string(); ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], tmp_chunk.get())); tmp_col->check_or_die(); tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], tmp_chunk->num_rows(), true); @@ -248,22 +205,9 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu column = ColumnHelper::cast_to_nullable_column(column); if constexpr (all_const_input) { - LOG(INFO) << "all input arguments are constant, return a const column, has_null: " << (result_null_column == nullptr ? 
0: SIMD::count_nonzero(result_null_column->get_data())); - LOG(INFO) << "column is nullable: " << column->is_nullable(); - // if all input is const, we can return a const column - // @TODO consider null - // @TODO column may be const/nullable column - + // if all input arguments are const, we can return a const column auto data_column = FunctionHelper::get_data_column_of_const(column); - LOG(INFO) << "data column: " << data_column->get_name(); - if (data_column->is_nullable()) { - auto null_column = ColumnHelper::as_column(data_column)->null_column(); - std::ostringstream oss; - for (size_t i = 0;i < null_column->get_data().size();i++) { - oss << static_cast(null_column->get_data()[i]) << ","; - } - LOG(INFO) << "null data: " << oss.str(); - } + aligned_offsets = UInt32Column::create(); aligned_offsets->append(0); aligned_offsets->append(column->size()); @@ -288,42 +232,23 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu } } -// split into multi process -// 1. eval lambda arugments and check array_length -// 2. if all input is null, return result -// 3. else prepare lambda expr input: consider all input is const and lambda expr don't rely on argument - // The input array column maybe nullable, so first remove the wrap of nullable property. // The result of lambda expressions do not change the offsets of the current array and the null map. // NOTE the return column must be of the return type. 
StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* chunk) { - // @TODO just use one vector store array column std::vector input_elements; - // NullColumnPtr null_column = nullptr; bool is_single_nullable_child = false; - // ColumnPtr aligned_offsets; - UInt32Column::Ptr aligned_offsets = nullptr; - // @TODO we should eval common expr first - - // maybe a NullColumn or a Const(NullColumn) NullColumnPtr result_null_column = nullptr; bool all_input_is_constant = true; - // for many valid arguments: - // if one of them is a null literal, the result is a null literal; - // if one of them is only null, then results are null; - // unfold const columns. - // make sure all inputs have the same offsets. - // TODO(fzh): support several arrays with different offsets and set null for non-equal size of arrays. for (int i = 1; i < _children.size(); ++i) { ASSIGN_OR_RETURN(auto child_col, context->evaluate(_children[i], chunk)); // the column is a null literal. if (child_col->only_null()) { return ColumnHelper::align_return_type(child_col, type(), chunk->num_rows(), true); } - LOG(INFO) << "eval child: " << child_col->get_name(); bool is_const = child_col->is_constant(); bool is_nullable = child_col->is_nullable(); @@ -336,7 +261,6 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* data_column = const_column->data_column(); } - // @TODO consider const nullable if (is_nullable) { auto nullable_column = down_cast(data_column.get()); DCHECK(nullable_column); @@ -344,15 +268,12 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* auto null_column = nullable_column->null_column(); if (is_const) { - LOG(INFO) << "input is const, should unpack null column"; // if null_column is from const_column, should unpack null_column->assign(num_rows, 0); } - // try to merge null column if (result_null_column) { is_single_nullable_child = false; - // union two null column result_null_column = FunctionHelper::union_null_column(null_column, 
result_null_column); } else { is_single_nullable_child = true; @@ -363,7 +284,7 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* ColumnPtr column = data_column; if (is_const) { - // keep it as a Const(ArrayColumn) in input elelents + // keep it as a const array column in input_elements column = ConstColumn::create(data_column, num_rows); } @@ -376,8 +297,6 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* ? ArrayColumn::is_all_array_lengths_equal(first_input, column, result_null_column) : ArrayColumn::is_all_array_lengths_equal(first_input, column, result_null_column); if (!is_array_lengths_valid) { - LOG(INFO) << "chunk num: " << chunk->num_rows() << ", current idx: " << i << ", first input size: " << first_input->size() - << ", column size: " << column->size() << ", null size: " << result_null_column->size(); return Status::InternalError("Input array element's size is not equal in array_map()."); } } @@ -389,435 +308,61 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* DCHECK(result_null_column != nullptr); // If there are more than one nullable children, the nullable column has been cloned when calling // union_null_column to merge, so only one nullable child needs to be cloned. - // @TODO why?? result_null_column = ColumnHelper::as_column(result_null_column->clone_shared()); } ColumnPtr column = nullptr; size_t null_rows = result_null_column ? 
SIMD::count_nonzero(result_null_column->get_data()) : 0; - // @TODO we should know if elements are empty - size_t total_elements_num = down_cast( - FunctionHelper::get_data_column_of_const(input_elements[0]).get())->get_total_elements_num(result_null_column); - LOG(INFO) << "total elements num: " << total_elements_num; - // @TODO what if array is empty if (null_rows == input_elements[0]->size()) { - // all input is null - // @TODO we can give a Const(Nullable(ArrayColumn)) - LOG(INFO) << "all input is null, null_rows: " << null_rows << ", input_elements[0]->size(): " << input_elements[0]->size(); - // @TODO create an array column with all null + // if all input rows are null, just return a const nullable array column as result column = ColumnHelper::create_column(type().children[0], true); // array->elements must be of return array->elements' type column->append_default(1); - // @TODO handle aligned offsets - aligned_offsets = UInt32Column::create(0); + auto aligned_offsets = UInt32Column::create(0); aligned_offsets->append(0); aligned_offsets->append(1); auto array_col = std::make_shared(column, aligned_offsets); array_col->check_or_die(); - LOG(INFO) << "array_col size: " << array_col->size(); if (result_null_column) { result_null_column->resize(1); auto result = ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), chunk->num_rows()); result->check_or_die(); return result; } - // @TODO empty?? 
auto result = ConstColumn::create(std::move(array_col), chunk->num_rows()); result->check_or_die(); - // result_null_column->resize(1); - // auto result = ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), chunk->num_rows()); - // result->check_or_die(); return result; - // @TODO shoulw give - // aligned_offsets->append(0); - } else if (total_elements_num == 0) { - LOG(INFO) << "all input are empty, should return a const empty"; + } + + size_t total_elements_num = down_cast( + FunctionHelper::get_data_column_of_const(input_elements[0]).get())->get_total_elements_num(result_null_column); + + if (total_elements_num == 0) { + // if all input rows are empty arrays, return a const empty array column as result column = ColumnHelper::create_column(type().children[0], true); - aligned_offsets = UInt32Column::create(0); + auto aligned_offsets = UInt32Column::create(0); aligned_offsets->append_default(2); auto array_col = std::make_shared(column, aligned_offsets); array_col->check_or_die(); auto result = ConstColumn::create(std::move(array_col), chunk->num_rows() - null_rows); result->check_or_die(); return result; - } else { - // @TODO move to a new function - - if (true) { - auto lambda_func = dynamic_cast(_children[0]); - bool is_lambda_expr_independent = lambda_func->is_lambda_expr_independent(); - if (all_input_is_constant && is_lambda_expr_independent) { - return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); - } else if (all_input_is_constant && !is_lambda_expr_independent) { - return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); - } else if (!all_input_is_constant && is_lambda_expr_independent) { - return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); - } else { - return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); - } - } - - - // construct a new chunk to evaluate the lambda expression. 
- auto cur_chunk = std::make_shared(); - - // 1. evaluate all outer common exprs - LOG(INFO) << "eval outer common exprs, size: " << _outer_common_exprs.size(); - for (const auto& [slot_id, expr] : _outer_common_exprs) { - LOG(INFO) << "eval non-capture expr, slot_id: " << slot_id << ", expr: " << expr->debug_string(); - ASSIGN_OR_RETURN(auto col, context->evaluate(expr, chunk)); - LOG(INFO) << "col size: " << col->size(); - chunk->append_column(col, slot_id); - } - LOG(INFO) << "eval outer common exprs done"; - - auto lambda_func = dynamic_cast(_children[0]); - std::vector slot_ids; - lambda_func->get_slot_ids(&slot_ids); - // 2. check captured columns size - for (auto slot_id : slot_ids) { - LOG(INFO) << "check slot id: " << slot_id; - DCHECK(slot_id > 0); - auto captured_column = chunk->get_column_by_slot_id(slot_id); - if (UNLIKELY(captured_column->size() < input_elements[0]->size())) { - return Status::InternalError(fmt::format( - "The size of the captured column {} is less than array's size.", captured_column->get_name())); - } - } - // if lambda expr is independet, we can treat is as a const column, skip repliacate - if (lambda_func->is_lambda_expr_independent()) { - LOG(INFO) << "lambda expr is independent, we can skip replicate"; - // we can eval lambda expr and return a const column - // @TODO consider const - // @TODO skip - std::vector arguments_ids; - int argument_num = lambda_func->get_lambda_arguments_ids(&arguments_ids); - DCHECK(argument_num == input_elements.size()); - for (int i = 0; i < argument_num; ++i) { - auto data_column = FunctionHelper::get_data_column_of_const(input_elements[i]); - auto array_column = down_cast(data_column.get()); - auto elements_column = array_column->elements_column(); - UInt32Column::Ptr offsets_column = array_column->offsets_column(); - - if (input_elements[i]->is_constant()) { - // if input is const, we should assign data multiple times - // seems we cant avoid copy data if we don't have view column? 
- // if input is const, we should wrap its element column as a const column too - // @TODO elements should not be a const column - size_t elements_num = array_column->get_element_size(0); - elements_column = elements_column->clone(); - // create a new offsets - // offsets_column = UInt32Column::create(); - offsets_column = UInt32Column::create(); - // replicate N time and ignore null - size_t repeat_times = input_elements[i]->size() - null_rows; - offsets_column->append(0); - size_t offset = elements_num; - for (size_t i = 0; i < repeat_times; i++) { - elements_column->append(*elements_column, 0, elements_num); - offset += elements_num; - offsets_column->append(offset); - } - } else { - // @TODO null data size is ok, only one row, why offsets has too many data? - - // @TODO empty_null should apply on array column.. - // elements_column->empty_null_in_complex_column(result_null_column->get_data(), array_column->offsets().get_data()); - data_column->empty_null_in_complex_column(result_null_column->get_data(), - array_column->offsets().get_data()); - elements_column = down_cast(data_column.get())->elements_column(); - } - if (aligned_offsets == nullptr) { - LOG(INFO) << "assign offsets: " << offsets_column->size(); - aligned_offsets = offsets_column; - } - //append elemt - // cur_chunk->append_column(elements_column, arguments_ids[i]); - LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; - } - // @TODO if elements is null - - DCHECK(aligned_offsets != nullptr); - LOG(INFO) << "last offset: " << aligned_offsets->get_data().back(); - - LOG(INFO) << "begin append outer common column, num: " << _outer_common_exprs.size(); - for (const auto& [slot_id, expr] : _outer_common_exprs) { - auto column = chunk->get_column_by_slot_id(slot_id); - LOG(INFO) << "unpack const column: " << column->get_name() << ", size: " << column->size(); - column = ColumnHelper::unpack_and_duplicate_const_column(column->size(), column); - // 
replicate column and put int into cur_chunk - // @TODO what if column is const? - // @TODO this should be in cur_chunk and chunk? - LOG(INFO) << "replicate column"; - // @TODO how to avoid replicate... - // @TODO replicate may cost a lot of memory, can we evalute directly to avoid replicate? - // auto aligned_column = column->replicate(aligned_offsets->get_data()); - auto aligned_column = column; - LOG(INFO) << "append outer common column: " << slot_id; - cur_chunk->append_column(aligned_column, slot_id); - // chunk->append_column(col, slot_id); - } - LOG(INFO) << "begin append capture column, num: " << slot_ids.size(); - for (auto slot_id : slot_ids) { - DCHECK(slot_id > 0); - if (cur_chunk->is_slot_exist(slot_id)) { - continue; - } - auto captured_column = chunk->get_column_by_slot_id(slot_id); - // auto aligned_column = captured_column->replicate(aligned_offsets->get_data()); - auto aligned_column = captured_column; - cur_chunk->append_column(aligned_column, slot_id); - LOG(INFO) << "append capture column, " << slot_id; - } - // @TODO - // @TODO we should eval first, get column, then replicate it? - - // eval lambda - // @TODO since lambda not depend on argument, after eval all comon expr, we can get result, wrap it as a const column - LOG(INFO) << "eval lambda: " << _children[0]->debug_string(); - ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], cur_chunk.get())); - tmp_col->check_or_die(); - // tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], cur_chunk->num_rows(), true); - LOG(INFO) << "replicate result, result size: " << tmp_col->size() << ", align offset size: " << aligned_offsets->get_data().back() << ", offset num:" << aligned_offsets->size(); - column = tmp_col->replicate(aligned_offsets->get_data()); - LOG(INFO) << "column size: " << column->size(); - column = ColumnHelper::align_return_type(column, type().children[0], column->size(), true); - column = ColumnHelper::cast_to_nullable_column(column); - } else { - // 3. 
align up all columns offsets - // if most value is null, we remove all null column, create a new one to evaluate - // else alignup offset - // @TODO we can't avoid copy data here?? - // should we replicate capture column??? - // empty all null is ok - - // @TODO what if all input is empty - - // @TODO if all input is const, we don't need unpack const - if (all_input_is_constant) { - // if all input arguments are ConstColumn, we don't need unpack, just evaluate on ConstColumn - LOG(INFO) << "all inputs of array_map are ConstColumn"; - // just eval, no unpack ,no xx... - } - // @TODO udpate aligned_offsets, we can use arg0's offsets? - - std::vector arguments_ids; - int argument_num = lambda_func->get_lambda_arguments_ids(&arguments_ids); - DCHECK(argument_num == input_elements.size()); - for (int i = 0; i < argument_num; ++i) { - auto data_column = FunctionHelper::get_data_column_of_const(input_elements[i]); - auto array_column = down_cast(data_column.get()); - auto elements_column = array_column->elements_column(); - UInt32Column::Ptr offsets_column = array_column->offsets_column(); - - if (input_elements[i]->is_constant()) { - // if input is const, we should assign data multiple times - // seems we cant avoid copy data if we don't have view column? 
- // if input is const, we should wrap its element column as a const column too - // @TODO elements should not be a const column - if (all_input_is_constant) { - LOG(INFO) << "all input is const, we just keep const"; - } else { - size_t elements_num = array_column->get_element_size(0); - elements_column = elements_column->clone(); - // create a new offsets - // offsets_column = UInt32Column::create(); - offsets_column = UInt32Column::create(); - // replicate N time and ignore null - size_t repeat_times = input_elements[i]->size() - null_rows; - offsets_column->append(0); - size_t offset = elements_num; - for (size_t i = 0; i < repeat_times; i++) { - elements_column->append(*elements_column, 0, elements_num); - offset += elements_num; - offsets_column->append(offset); - } - } - - } else { - // @TODO null data size is ok, only one row, why offsets has too many data? - - // @TODO empty_null should apply on array column.. - // elements_column->empty_null_in_complex_column(result_null_column->get_data(), array_column->offsets().get_data()); - data_column->empty_null_in_complex_column(result_null_column->get_data(), - array_column->offsets().get_data()); - elements_column = down_cast(data_column.get())->elements_column(); - } - // @TODO consider all const case - if (aligned_offsets == nullptr) { - LOG(INFO) << "assign offsets: " << offsets_column->size(); - aligned_offsets = offsets_column; - } - //append elemt - cur_chunk->append_column(elements_column, arguments_ids[i]); - // LOG(INFO) << "input elements: " << input_elements[i]->get_name() << ", arg id: " << arguments_ids[i]; - } - // @TODO put outer common expr into cur_chunk, - DCHECK(aligned_offsets != nullptr); - - LOG(INFO) << "last offset: " << aligned_offsets->get_data().back(); - if (aligned_offsets->get_data().back() == 0) { - // this means no elements for input, we can return an empty array column directly - // @OTOD - LOG(INFO) << "all input is empty, just return an empty array column"; - // @TODO create an 
array column with all null - column = ColumnHelper::create_column(type().children[0], - true); // array->elements must be of return array->elements' type - // column->append_default(1); - // @TODO handle aligned offsets - aligned_offsets = UInt32Column::create(0); - aligned_offsets->append(0); - // aligned_offsets->append(1); - auto array_col = std::make_shared(column, aligned_offsets); - array_col->check_or_die(); - LOG(INFO) << "array_col size: " << array_col->size(); - - result_null_column->resize(1); - auto result = ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), chunk->num_rows()); - result->check_or_die(); - return result; - } - // if capture column is empty - // align offset - LOG(INFO) << "begin append outer common column, num: " << _outer_common_exprs.size(); - for (const auto& [slot_id, expr] : _outer_common_exprs) { - auto column = chunk->get_column_by_slot_id(slot_id); - LOG(INFO) << "unpack const column: " << column->get_name() << ", size: " << column->size(); - column = ColumnHelper::unpack_and_duplicate_const_column(column->size(), column); - // replicate column and put int into cur_chunk - // @TODO what if column is const? - // @TODO this should be in cur_chunk and chunk? - LOG(INFO) << "replicate column"; - // @TODO how to avoid replicate... - // @TODO replicate may cost a lot of memory, can we evalute directly to avoid replicate? 
- auto aligned_column = column->replicate(aligned_offsets->get_data()); - LOG(INFO) << "append outer common column: " << slot_id; - cur_chunk->append_column(aligned_column, slot_id); - // chunk->append_column(col, slot_id); - } - LOG(INFO) << "begin append capture column, num: " << slot_ids.size(); - for (auto slot_id : slot_ids) { - DCHECK(slot_id > 0); - if (cur_chunk->is_slot_exist(slot_id)) { - continue; - } - auto captured_column = chunk->get_column_by_slot_id(slot_id); - auto aligned_column = captured_column->replicate(aligned_offsets->get_data()); - cur_chunk->append_column(aligned_column, slot_id); - LOG(INFO) << "append capture column, " << slot_id; - } - #ifdef DEBUG - { - auto first_column = cur_chunk->get_column_by_slot_id(arguments_ids[0]); - for (int i = 1; i < argument_num; i++) { - auto column = cur_chunk->get_column_by_slot_id(arguments_ids[i]); - DCHECK_EQ(column->size(), first_column->size()) << "input arguments size should be same"; - } - LOG(INFO) << "check length done"; - } - #endif - - // @TODO - { - // @TODO evalu param may be very large?? - // cut tmp chunk from cur_chunk, and eval - // cut data - // if cur_chunk has view_column, we should convert view_column to column again - - if (cur_chunk->is_empty()) { - // all input is empty??? 
should return empty result - - - } - if (all_input_is_constant) { - LOG(INFO) << "all input is constant, we just eval const column"; - LOG(INFO) << "eval lambda: " << _children[0]->debug_string(); - ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], cur_chunk.get())); - tmp_col->check_or_die(); - // tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], cur_chunk->num_rows(), true); - LOG(INFO) << "replicate result, result size: " << tmp_col->size() << ", align offset size: " << aligned_offsets->get_data().back() << ", offset num:" << aligned_offsets->size(); - column = ConstColumn::create(FunctionHelper::get_data_column_of_const(tmp_col), tmp_col->size()); - // column = FunctionHelper::get_data_column_of_nullable(tmp_col); - // column = tmp_col->replicate(aligned_offsets->get_data()); - LOG(INFO) << "column size: " << column->size(); - column = ColumnHelper::align_return_type(column, type().children[0], column->size(), true); - LOG(INFO) << "final column: " << column->get_name(); - // column = ColumnHelper::cast_to_nullable_column(column); - } else { - - // @TODO can we find common expr from chunk? 
- for (const auto& [slot_id, _] : chunk->get_slot_id_to_index_map()) { - LOG(INFO) << "chunk contains slot id: " << slot_id; - } - for (const auto& [slot_id, _] : cur_chunk->get_slot_id_to_index_map()) { - LOG(INFO) << "cur_chunk contains slot id: " << slot_id; - } - // @TODO cut row [x,y] into a tmp chunk - ChunkAccumulator accumulator(DEFAULT_CHUNK_SIZE); - LOG(INFO) << "cur_chunk rows: " << cur_chunk->num_rows(); - RETURN_IF_ERROR(accumulator.push(std::move(cur_chunk))); - accumulator.finalize(); - while (auto tmp_chunk = accumulator.pull()) { - // if contains view, should translate it back - // TODO change column - auto new_chunk = std::make_shared(); - // const auto& columns = tmp_chunk->columns(); - LOG(INFO) << "tmp_chunk rows: " << tmp_chunk->num_rows(); - - tmp_chunk->check_or_die(); - - LOG(INFO) << "eval lambda: " << _children[0]->debug_string(); - ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], tmp_chunk.get())); - tmp_col->check_or_die(); - tmp_col = ColumnHelper::align_return_type(tmp_col, type().children[0], tmp_chunk->num_rows(), true); - if (column == nullptr) { - column = tmp_col; - } else { - column->append(*tmp_col); - } - } - } - } - // construct the result array - DCHECK(column != nullptr); - column = ColumnHelper::cast_to_nullable_column(column); - } } - // @TODO handle const? 
- if (all_input_is_constant) { - LOG(INFO) << "all input is const, create a const column as result"; - auto data_column = FunctionHelper::get_data_column_of_const(column); - aligned_offsets = UInt32Column::create(); - aligned_offsets->append(0); - aligned_offsets->append(column->size()); - auto array_col = std::make_shared( - data_column, ColumnHelper::as_column(aligned_offsets->clone_shared())); - array_col->check_or_die(); - ColumnPtr result_column = array_col; - if (result_null_column != nullptr) { - result_column = NullableColumn::create(std::move(array_col), result_null_column); - result_column->check_or_die(); - // return ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), result_null_column->size()); - // return NullableColumn::create(std::move(array_col), result_null_column); - } - result_column = ConstColumn::create(result_column, chunk->num_rows()); - result_column->check_or_die(); - LOG(INFO) << "result: " << result_column->get_name() << ", size: " << result_column->size(); - return result_column; - } - // @TODO aligned offsets maybe null - // @TODO - // attach offsets - auto array_col = std::make_shared( - column, ColumnHelper::as_column(aligned_offsets->clone_shared())); - array_col->check_or_die(); - if (result_null_column != nullptr) { - return NullableColumn::create(std::move(array_col), result_null_column); + auto lambda_func = dynamic_cast(_children[0]); + bool is_lambda_expr_independent = lambda_func->is_lambda_expr_independent(); + if (all_input_is_constant && is_lambda_expr_independent) { + return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); + } else if (all_input_is_constant && !is_lambda_expr_independent) { + return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); + } else if (!all_input_is_constant && is_lambda_expr_independent) { + return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); + } else { + return 
evaluate_lambda_expr(context, chunk, input_elements, result_null_column); } - return array_col; + } std::string ArrayMapExpr::debug_string() const { diff --git a/be/src/exprs/array_map_expr.h b/be/src/exprs/array_map_expr.h index fde35309f7124..314af7c448b80 100644 --- a/be/src/exprs/array_map_expr.h +++ b/be/src/exprs/array_map_expr.h @@ -47,10 +47,7 @@ class ArrayMapExpr final : public Expr { StatusOr evaluate_lambda_expr(ExprContext* context, Chunk* chunk, const std::vector& arguments, NullColumnPtr null_column); - // Status prepare_lambda_arguments(ExprContext* context, Chunk* chunk, std::vector* input_elements); - // use map to make sure the order of execution - // std::map _outer_common_exprs; std::map _outer_common_exprs; }; } // namespace starrocks diff --git a/test/sql/test_array/R/test_array_map b/test/sql/test_array/R/test_array_map index f76f33c62b23d..9806b92c57b63 100644 --- a/test/sql/test_array/R/test_array_map +++ b/test/sql/test_array/R/test_array_map @@ -81,4 +81,5 @@ VALUES ARRAY_GENERATE(1, 1000) )); -- result: --- !result \ No newline at end of file +-- !result + diff --git a/test/sql/test_array/T/test_array_map b/test/sql/test_array/T/test_array_map index a91bc1024ff92..19e71a3788ec0 100644 --- a/test/sql/test_array/T/test_array_map +++ b/test/sql/test_array/T/test_array_map @@ -79,4 +79,62 @@ VALUES ARRAY_GENERATE(1, 1000) )); +-- name: test_array_map_2 +CREATE TABLE `array_map_test` ( + `id` tinyint(4) NOT NULL COMMENT "", + `arr_str` array NULL COMMENT "", + `arr_largeint` array NULL COMMENT "" +) ENGINE=OLAP +DUPLICATE KEY(`id`) +DISTRIBUTED BY RANDOM +PROPERTIES ( +"replication_num" = "1" +); + +insert into array_map_test values (1, array_repeat("abcdefghasdasdasirnqwrq", 20000), array_repeat(100, 20000)); + +select count() from array_map_test where array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint)) > 10 ; +select count(array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint))) from array_map_test; + 
+select count() from array_map_test where any_match(x->any_match(x->x<10, arr_largeint), arr_largeint); +select count(any_match(x->any_match(x->x<10, arr_largeint), arr_largeint)) from array_map_test; +select count(array_map(x->array_length(array_concat(arr_str,[])), arr_largeint)) from array_map_test; + +set @arr=array_repeat("12345",1000000); +select array_length(array_map((x,y)->x > y, @arr,@arr)) from table(generate_series(1,10,1)); + +-- name: test_array_map_3 +CREATE TABLE `t` ( + `k` bigint NOT NULL COMMENT "", + `arr_0` array NOT NULL COMMENT "", + `arr_1` array NULL COMMENT "", + `arr_2` array NULL COMMENT "" +) ENGINE=OLAP +primary KEY(`k`) +DISTRIBUTED BY RANDOM BUCKETS 1 +PROPERTIES ( +"replication_num" = "1" +); + +insert into t values (1, [1,2], [1,2],[2,3]), (2, [1,2], null, [2,3]), (3, [1,2],[1,2],null),(4, [1,2],[null,null],[2,3]), (5, [1], [1,2], [3]); +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t; +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t where k != 5 order by k; +delete from t where k = 5; + +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t order by k; +select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1,2]) from t order by k; +select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1]) from t order by k; + +select array_map(x->x, arr_0) from t order by k; +-- independent expr +select array_map((x,y,z)->10, arr_0, arr_1, arr_2) from t; +select array_map((x,y)-> k, arr_0, arr_1) from t order by k; + +-- independent expr with all const +select array_map((x,y)->k, [1,2],[2,3]) from t order by k; + +-- non indepentdent with all const +select array_map((x,y,z)->x+y+z, [1,2],[2,3],[3,4]) from t; +select array_map((x,y,z)->x+y+z, [1,2],[2,null],[3,4]) from t; +select array_map((x,y,z)->x+y+z, [1,2],[2,null],null) from t; diff --git a/test/sql/test_array_fn/R/test_array_map_2 b/test/sql/test_array_fn/R/test_array_map_2 new file mode 100644 index 0000000000000..66b8544923384 --- /dev/null +++ 
b/test/sql/test_array_fn/R/test_array_map_2 @@ -0,0 +1,150 @@ +-- name: test_array_map_2 +CREATE TABLE `array_map_test` ( + `id` tinyint(4) NOT NULL COMMENT "", + `arr_str` array NULL COMMENT "", + `arr_largeint` array NULL COMMENT "" +) ENGINE=OLAP +DUPLICATE KEY(`id`) +DISTRIBUTED BY RANDOM +PROPERTIES ( +"replication_num" = "1" +); +-- result: +-- !result +insert into array_map_test values (1, array_repeat("abcdefghasdasdasirnqwrq", 20000), array_repeat(100, 20000)); +-- result: +-- !result +select count() from array_map_test where array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint)) > 10 ; +-- result: +1 +-- !result +select count(array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint))) from array_map_test; +-- result: +1 +-- !result +select count() from array_map_test where any_match(x->any_match(x->x<10, arr_largeint), arr_largeint); +-- result: +0 +-- !result +select count(any_match(x->any_match(x->x<10, arr_largeint), arr_largeint)) from array_map_test; +-- result: +1 +-- !result +select count(array_map(x->array_length(array_concat(arr_str,[])), arr_largeint)) from array_map_test; +-- result: +1 +-- !result +set @arr=array_repeat("12345",1000000); +-- result: +-- !result +select array_length(array_map((x,y)->x > y, @arr,@arr)) from table(generate_series(1,10,1)); +-- result: +1000000 +1000000 +1000000 +1000000 +1000000 +1000000 +1000000 +1000000 +1000000 +1000000 +-- !result +-- name: test_array_map_3 +CREATE TABLE `t` ( + `k` bigint NOT NULL COMMENT "", + `arr_0` array NOT NULL COMMENT "", + `arr_1` array NULL COMMENT "", + `arr_2` array NULL COMMENT "" +) ENGINE=OLAP +PRIMARY KEY(`k`) +DISTRIBUTED BY HASH(`k`) BUCKETS 1 +PROPERTIES ( +"replication_num" = "1" +); +-- result: +-- !result +insert into t values (1, [1,2], [1,2],[2,3]), (2, [1,2], null, [2,3]), (3, [1,2],[1,2],null),(4, [1,2],[null,null],[2,3]), (5, [1], [1,2], [3]); +-- result: +-- !result +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t; +-- 
result: +E: (1064, "Input array element's size is not equal in array_map().") +-- !result +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t where k != 5 order by k; +-- result: +[4,7] +None +None +[null,null] +-- !result +delete from t where k = 5; +-- result: +-- !result +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t order by k; +-- result: +[4,7] +None +None +[null,null] +-- !result +select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1,2]) from t order by k; +-- result: +[5,9] +None +None +[null,null] +-- !result +select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1]) from t order by k; +-- result: +E: (1064, "Input array element's size is not equal in array_map().") +-- !result +select array_map(x->x, arr_0) from t order by k; +-- result: +[1,2] +[1,2] +[1,2] +[1,2] +-- !result +select array_map((x,y,z)->10, arr_0, arr_1, arr_2) from t; +-- result: +[10,10] +None +None +[10,10] +-- !result +select array_map((x,y)-> k, arr_0, arr_1) from t order by k; +-- result: +[1,1] +None +[3,3] +[4,4] +-- !result +select array_map((x,y)->k, [1,2],[2,3]) from t order by k; +-- result: +[1,1] +[1,1] +[1,1] +[1,1] +-- !result +select array_map((x,y,z)->x+y+z, [1,2],[2,3],[3,4]) from t; +-- result: +[6,9] +[6,9] +[6,9] +[6,9] +-- !result +select array_map((x,y,z)->x+y+z, [1,2],[2,null],[3,4]) from t; +-- result: +[6,null] +[6,null] +[6,null] +[6,null] +-- !result +select array_map((x,y,z)->x+y+z, [1,2],[2,null],null) from t; +-- result: +None +None +None +None +-- !result \ No newline at end of file diff --git a/test/sql/test_array_fn/T/test_array_map_2 b/test/sql/test_array_fn/T/test_array_map_2 new file mode 100644 index 0000000000000..cfc22d00c61fc --- /dev/null +++ b/test/sql/test_array_fn/T/test_array_map_2 @@ -0,0 +1,60 @@ + +-- name: test_array_map_2 +CREATE TABLE `array_map_test` ( + `id` tinyint(4) NOT NULL COMMENT "", + `arr_str` array NULL COMMENT "", + `arr_largeint` array NULL COMMENT "" +) ENGINE=OLAP +DUPLICATE KEY(`id`) 
+DISTRIBUTED BY RANDOM +PROPERTIES ( +"replication_num" = "1" +); + +insert into array_map_test values (1, array_repeat("abcdefghasdasdasirnqwrq", 20000), array_repeat(100, 20000)); + +select count() from array_map_test where array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint)) > 10 ; +select count(array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint))) from array_map_test; + +select count() from array_map_test where any_match(x->any_match(x->x<10, arr_largeint), arr_largeint); +select count(any_match(x->any_match(x->x<10, arr_largeint), arr_largeint)) from array_map_test; +select count(array_map(x->array_length(array_concat(arr_str,[])), arr_largeint)) from array_map_test; + +set @arr=array_repeat("12345",1000000); +select array_length(array_map((x,y)->x > y, @arr,@arr)) from table(generate_series(1,10,1)); + +-- name: test_array_map_3 +CREATE TABLE `t` ( + `k` bigint NOT NULL COMMENT "", + `arr_0` array NOT NULL COMMENT "", + `arr_1` array NULL COMMENT "", + `arr_2` array NULL COMMENT "" +) ENGINE=OLAP +PRIMARY KEY(`k`) +DISTRIBUTED BY HASH(`k`) BUCKETS 1 +PROPERTIES ( +"replication_num" = "1" +); + +insert into t values (1, [1,2], [1,2],[2,3]), (2, [1,2], null, [2,3]), (3, [1,2],[1,2],null),(4, [1,2],[null,null],[2,3]), (5, [1], [1,2], [3]); +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t; +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t where k != 5 order by k; +delete from t where k = 5; + +select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t order by k; +select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1,2]) from t order by k; +select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1]) from t order by k; + +select array_map(x->x, arr_0) from t order by k; +-- independent expr +select array_map((x,y,z)->10, arr_0, arr_1, arr_2) from t; +select array_map((x,y)-> k, arr_0, arr_1) from t order by k; + +-- independent expr with all const +select array_map((x,y)->k, [1,2],[2,3]) 
from t order by k; + +-- non indepentdent with all const +select array_map((x,y,z)->x+y+z, [1,2],[2,3],[3,4]) from t; +select array_map((x,y,z)->x+y+z, [1,2],[2,null],[3,4]) from t; +select array_map((x,y,z)->x+y+z, [1,2],[2,null],null) from t; + From 41d632e91a7b2babe9dc052cca7c92cab4154fe6 Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Sat, 21 Sep 2024 19:30:30 +0800 Subject: [PATCH 07/17] remove unused code Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/column/CMakeLists.txt | 1 - be/src/column/array_view_column.cpp | 189 ------------- be/src/column/array_view_column.h | 267 ------------------ be/src/column/column.h | 16 -- be/src/column/column_visitor.cpp | 1 - be/src/column/column_visitor.h | 1 - be/src/column/column_visitor_adapter.h | 4 - be/src/column/column_visitor_mutable.cpp | 1 - be/src/column/column_visitor_mutable.h | 1 - be/src/column/const_column.h | 1 - be/src/column/nullable_column.h | 1 - be/src/column/vectorized_fwd.h | 1 - .../exec/pipeline/scan/olap_scan_context.cpp | 2 +- be/src/exec/sorted_streaming_aggregator.cpp | 8 - be/src/exec/sorting/compare_column.cpp | 6 - be/src/exec/sorting/sort_column.cpp | 9 - be/src/exec/sorting/sort_permute.cpp | 5 - be/src/exprs/array_functions.cpp | 1 - be/src/exprs/array_map_expr.cpp | 1 - be/src/serde/column_array_serde.cpp | 15 - 20 files changed, 1 insertion(+), 530 deletions(-) delete mode 100644 be/src/column/array_view_column.cpp delete mode 100644 be/src/column/array_view_column.h diff --git a/be/src/column/CMakeLists.txt b/be/src/column/CMakeLists.txt index 901de66d29393..b1b1418fdf06b 100644 --- a/be/src/column/CMakeLists.txt +++ b/be/src/column/CMakeLists.txt @@ -16,7 +16,6 @@ set(LIBRARY_OUTPUT_PATH "${BUILD_DIR}/src/column") add_library(Column STATIC array_column.cpp - array_view_column.cpp adaptive_nullable_column.cpp chunk.cpp chunk_extra_data.cpp diff --git 
a/be/src/column/array_view_column.cpp b/be/src/column/array_view_column.cpp deleted file mode 100644 index 45259efa70f2b..0000000000000 --- a/be/src/column/array_view_column.cpp +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright 2021-present StarRocks, Inc. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "column/array_view_column.h" - -#include -#include -#include - -#include "column/array_column.h" -#include "column/chunk.h" -#include "column/vectorized_fwd.h" -#include "gutil/casts.h" - -namespace starrocks { - -ColumnPtr ArrayViewColumn::replicate(const Buffer& offsets) { - // @TODO clone empty??? 
- // auto dest = this->clone_empty(); - auto dest_size = offsets.size() - 1; - auto new_offsets = UInt32Column::create(); - auto new_lengths = UInt32Column::create(); - new_offsets->reserve(offsets.back()); - new_lengths->reserve(offsets.back()); - - for (size_t i = 0; i < dest_size; i++) { - uint32_t repeat_times = offsets[i + 1] - offsets[i]; - new_offsets->append_value_multiple_times(*_offsets, i, repeat_times); - new_lengths->append_value_multiple_times(*_lengths, i, repeat_times); - } - return ArrayViewColumn::create(_elements, new_offsets, new_lengths); -} - -void ArrayViewColumn::append(const Column& src, size_t offset, size_t count) { - const auto& array_view_column = down_cast(src); - const auto& src_offsets = array_view_column.offsets(); - const auto& src_lengths = array_view_column.lengths(); - - if (_elements == array_view_column._elements) { - LOG(INFO) << "shared elements, only copy offsets and lengths"; - // if these two array view column share the same elements, just append offset and lengths - _offsets->append(src_offsets, offset, count); - _lengths->append(src_lengths, offset, count); - } else { - LOG(INFO) << "not shared elements, should copy all"; - // append elements and re-compute offset and length for new data - // @TODO should optimize - // @TODO should avoid this copy... 
- uint32_t offset = _elements->size(); - for (size_t i = 0; i < count; i++) { - uint32_t src_offset = src_offsets.get_data()[offset + i]; - uint32_t src_length = src_lengths.get_data()[offset + i]; - DCHECK_LE(src_offset + src_length, array_view_column._elements->size()); - _elements->append(*(array_view_column._elements), src_offset, src_length); - _offsets->append(src_offset + offset); - _lengths->append(src_length); - } - } -} - -void ArrayViewColumn::check_or_die() const { - DCHECK(_elements); - DCHECK(_offsets); - DCHECK(_lengths); - DCHECK_EQ(_offsets->size(), _lengths->size()); - for (size_t i = 0; i < _offsets->size(); i++) { - uint32_t offset = _offsets->get_data()[i]; - uint32_t length = _lengths->get_data()[i]; - DCHECK_LE(offset + length, _elements->size()); - } -} - -// @TODO clone should share elements? -MutableColumnPtr ArrayViewColumn::clone_empty() const { - return create_mutable(_elements, UInt32Column::create(), UInt32Column::create()); -} - -StatusOr ArrayViewColumn::to_array_column() const { - LOG(INFO) << "ArrayViewColumn::to_array_column, cosnt ? " << is_constant(); - // @TODO consider nullable ??? - auto array_elements = _elements->clone_empty(); - auto array_offsets = UInt32Column::create(); - // @TODO reserve elements too? - LOG(INFO) << "ArrayViewColumn::to_array_column, size: " << _offsets->size(); - array_offsets->reserve(_offsets->size() + 1); - array_offsets->append(0); - uint32_t last_offset = 0; - size_t num_rows = _offsets->size(); - // @TODO maybe copy alot... 
- for (size_t i = 0; i < num_rows; i++) { - uint32_t offset = _offsets->get_data()[i]; - uint32_t length = _lengths->get_data()[i]; - LOG(INFO) << "offset: " << offset << ", len: " << length; - // append lement - array_elements->append(*_elements, offset, length); - array_offsets->append(last_offset + length); - last_offset += length; - } - return ArrayColumn::create(std::move(array_elements), std::move(array_offsets)); -} - -StatusOr ArrayViewColumn::from_array_column(const ColumnPtr& column) { - if (!column->is_array()) { - LOG(INFO) << "from_array_column error..."; - return Status::InternalError("input column must be array column"); - } - LOG(INFO) << "from_array_column, size: " << column->size(); - auto view_offsets = UInt32Column::create(); - auto view_lengths = UInt32Column::create(); - view_offsets->reserve(column->size()); - view_lengths->reserve(column->size()); - ColumnPtr view_elements; - - // const ArrayColumn* array_column = nullptr; - if (column->is_nullable()) { - auto nullable_column = down_cast(column.get()); - DCHECK(nullable_column != nullptr); - const auto& null_data = nullable_column->null_column()->get_data(); - auto array_column = down_cast(nullable_column->data_column().get()); - const auto& array_offsets = array_column->offsets().get_data(); - - view_elements = array_column->elements_column(); - LOG(INFO) << "elements size: " << view_elements->size(); - LOG(INFO) << "null size: " << nullable_column->null_column()->size(); - // array column: [[1,2],null,[],[4]] - // null_data [0,1,0,0] - // elements column: [1,2,3,4] - // offsets column: [0, 2, 2, 2, 3] - - // array view column: [[1,2], null, [], [4]] - // null_data[0,1,0,0] - // elements column: [1,2,3,4] - // offsets column: [0,2,2,3] - // length column: [2,0,0,1] - for (size_t i = 0; i < column->size(); i++) { - uint32_t offset = array_offsets[i]; - uint32_t length = null_data[i] ? 
0 : (array_offsets[i + 1] - offset); - LOG(INFO) << "append offset: " << offset << ", length: " << length; - view_offsets->append(offset); - view_lengths->append(length); - } - auto ret = NullableColumn::create(ArrayViewColumn::create(view_elements, view_offsets, view_lengths), - nullable_column->null_column()); - ret->check_or_die(); - return ret; - } - - auto array_column = down_cast(column.get()); - view_elements = array_column->elements_column(); - const auto& array_offsets = array_column->offsets().get_data(); - - for (size_t i = 0; i < column->size(); i++) { - uint32_t offset = array_offsets[i]; - uint32_t length = array_offsets[i + 1] - offset; - view_offsets->append(offset); - view_lengths->append(length); - } - return ArrayViewColumn::create(view_elements, view_offsets, view_lengths); -} - -StatusOr ArrayViewColumn::to_array_column(const ColumnPtr& column) { - if (!column->is_array_view()) { - LOG(INFO) << "to_array_column error...."; - return Status::InternalError("input column must be array view column"); - } - - if (column->is_nullable()) { - auto nullable_column = down_cast(column.get()); - DCHECK(nullable_column != nullptr); - auto array_view_column = down_cast(nullable_column->data_column().get()); - LOG(INFO) << "to_array_column"; - ASSIGN_OR_RETURN(auto array_column, array_view_column->to_array_column()); - return NullableColumn::create(std::move(array_column), nullable_column->null_column()); - } - auto array_view_column = down_cast(column.get()); - return array_view_column->to_array_column(); -} -} // namespace starrocks \ No newline at end of file diff --git a/be/src/column/array_view_column.h b/be/src/column/array_view_column.h deleted file mode 100644 index e185faf37e2f9..0000000000000 --- a/be/src/column/array_view_column.h +++ /dev/null @@ -1,267 +0,0 @@ -// Copyright 2021-present StarRocks, Inc. All rights reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// https://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include - -#include "column/column.h" -#include "column/fixed_length_column.h" -#include "column/nullable_column.h" -#include "column/vectorized_fwd.h" - -namespace starrocks { - -class ArrayViewColumn final : public ColumnFactory { - friend class ColumnFactory; - -public: - using ValueType = void; // @TODO need array view? - - ArrayViewColumn(ColumnPtr elements, UInt32Column::Ptr offsets, UInt32Column::Ptr lengths) - : _elements(std::move(elements)), _offsets(std::move(offsets)), _lengths(std::move(lengths)) {} - - ArrayViewColumn(const ArrayViewColumn& rhs) - : _elements(rhs._elements), - _offsets(std::static_pointer_cast(rhs._offsets->clone_shared())), - _lengths(std::static_pointer_cast(rhs._lengths->clone_shared())) {} - - ArrayViewColumn(ArrayViewColumn&& rhs) noexcept - : _elements(std::move(rhs._elements)), - _offsets(std::move(rhs._offsets)), - _lengths(std::move(rhs._lengths)) {} - - ArrayViewColumn& operator=(const ArrayViewColumn& rhs) { - // @TODO - return *this; - } - ArrayViewColumn& operator=(ArrayViewColumn&& rhs) noexcept { - // @TODO - return *this; - } - - ~ArrayViewColumn() override = default; - - bool is_array_view() const override { return true; } - - const uint8_t* raw_data() const override { - DCHECK(false) << "ArrayViewColumn::raw_data() is not supported"; - return nullptr; - } - - uint8_t* mutable_raw_data() override { - DCHECK(false) << 
"ArrayViewColumn::mutable_raw_data() is not supported"; - return nullptr; - } - - size_t size() const override { return _offsets->size(); } - size_t capacity() const override { return _offsets->capacity() + _lengths->capacity(); } - - size_t type_size() const override { - // @TODO need a array view type - return 0; - } - - size_t byte_size() const override { - // @TODO - return 0; - } - size_t byte_size(size_t from, size_t size) const override { - // @TODO - return 0; - } - - size_t byte_size(size_t idx) const override { - // @TODO - return 0; - } - - void reserve(size_t n) override { - _elements->reserve(n); - _offsets->reserve(n); - _lengths->reserve(n); - } - void resize(size_t n) override { - // DCHECK(false) << "ArrayViewColumn::resize() is not supported"; - _elements->resize(n); - _offsets->resize(n); - _lengths->resize(n); - } - void assign(size_t n, size_t idx) override { - // @TODO - DCHECK(false) << "ArrayViewColumn::assign() is not supported"; - } - void append_datum(const Datum& datum) override { - DCHECK(false) << "ArrayViewColumn::append_datum() is not supported"; - } - - void append(const Column& src, size_t offset, size_t count) override; - - void append_selective(const Column& src, const uint32_t* indexes, uint32_t from, uint32_t size) override { - DCHECK(false) << "ArrayViewColumn::append_selective() is not supported"; - } - - void append_value_multiple_times(const Column& src, uint32_t idx, uint32_t size) override { - DCHECK(false) << "ArrayViewColumn::append_value_multiple_times() is not supported"; - } - - bool append_nulls(size_t count) override { - // @TODO - return false; - } - - size_t append_numbers(const void* buff, size_t length) override { - // @TODO - return -1; - } - void append_value_multiple_times(const void* value, size_t count) override { - DCHECK(false) << "ArrayViewColumn::append_value_multiple_times() is not supported"; - } - - void append_default() override { DCHECK(false) << "ArrayViewColumn::append_default() is not 
supported"; } - void append_default(size_t count) override { - DCHECK(false) << "ArrayViewColumn::append_default() is not supported"; - } - void fill_default(const Filter& filter) override { - // @TODO - } - void update_rows(const Column& src, const uint32_t* indexes) override { - DCHECK(false) << "ArrayViewColumn:::update_rows() is not supported"; - } - void remove_first_n_values(size_t count) override { - // @TODO - } - uint32_t max_one_element_serialize_size() const override { - DCHECK(false) << "ArrayViewColumn::max_one_element_serialize_size() is not supported"; - return 0; - } - uint32_t serialize(size_t idx, uint8_t* pos) override { - DCHECK(false); - return 0; - } - uint32_t serialize_default(uint8_t* pos) override { - DCHECK(false); - return 0; - } - - void serialize_batch(uint8_t* dst, Buffer& slice_sized, size_t chunk_size, - uint32_t max_one_row_size) override { - DCHECK(false); - } - const uint8_t* deserialize_and_append(const uint8_t* pos) override { - DCHECK(false); - return nullptr; - } - - uint32_t serialize_size(size_t idx) const override { - DCHECK(false); - return 0; - } - void deserialize_and_append_batch(Buffer& srcs, size_t chunk_size) override { DCHECK(false); } - - MutableColumnPtr clone_empty() const override; - - size_t filter_range(const Filter& filter, size_t from, size_t to) override { - // @TODO - return 0; - } - - int compare_at(size_t left, size_t rifht, const Column& right_column, int nan_direction_hint) const override { - // @TODO - return 0; - } - - void compare_column(const Column& rhs, std::vector* output) const {} - - int equals(size_t left, const Column& rhs, size_t right, bool safe_eq = true) const override { return 0; } - - void crc32_hash_at(uint32_t* seed, uint32_t idx) const override {} - void fnv_hash_at(uint32_t* seed, uint32_t idx) const override {} - void fnv_hash(uint32_t* hash, uint32_t from, uint32_t to) const override {} - - void crc32_hash(uint32_t* hash, uint32_t from, uint32_t to) const override {} - - int64_t 
xor_checksum(uint32_t from, uint32_t to) const override { return 0; } - - void put_mysql_row_buffer(MysqlRowBuffer* buf, size_t idx, bool is_binary_protocol = false) const override {} - - ColumnPtr replicate(const Buffer& offsets) override; - - std::string get_name() const override { return "array-view"; } - - Datum get(size_t idx) const override { return Datum(); } - - size_t get_element_null_count(size_t idx) const { return 0; } - size_t get_element_size(size_t idx) const { return 0; } - - bool set_null(size_t idx) override { return false; } - - size_t memory_usage() const override { return _elements->memory_usage() + _offsets->memory_usage(); } - - size_t container_memory_usage() const override { - return _elements->container_memory_usage() + _offsets->container_memory_usage(); - } - - size_t reference_memory_usage(size_t from, size_t size) const override { return 0; } - - void swap_column(Column& rhs) override {} - - void reset_column() override {} - - const Column& elements() const { return *_elements; } - ColumnPtr& elements_column() { return _elements; } - ColumnPtr elements_column() const { return _elements; } - - const UInt32Column& offsets() const { return *_offsets; } - UInt32Column::Ptr& offsets_column() { return _offsets; } - const UInt32Column& lengths() const { return *_lengths; } - UInt32Column::Ptr& lengths_column() { return _lengths; } - - bool is_nullable() const override { return false; } - - std::string debug_item(size_t idx) const override { return ""; } - - std::string debug_string() const override { return "array-view-column"; } - - Status capacity_limit_reached() const override { - RETURN_IF_ERROR(_elements->capacity_limit_reached()); - return _offsets->capacity_limit_reached(); - } - - StatusOr upgrade_if_overflow() override { return nullptr; } - - StatusOr downgrade() override { return nullptr; } - - bool has_large_column() const override { return _elements->has_large_column(); } - - void check_or_die() const override; - - Status 
unfold_const_children(const starrocks::TypeDescriptor& type) override { return Status::NotSupported("TBD"); } - - // build array_view column from array_column, how to solve null?? - // if array_column is nullable, return Nullable(ArrayViewColumn) - // else return ArrayViewColumn - static StatusOr from_array_column(const ColumnPtr& column); - static StatusOr to_array_column(const ColumnPtr& column); - // @TODO to_array_column - StatusOr to_array_column() const; - -private: - // Elements must be NullableColumn to facilitate handling nested types. - ColumnPtr _elements; - UInt32Column::Ptr _offsets; - UInt32Column::Ptr _lengths; -}; -} // namespace starrocks \ No newline at end of file diff --git a/be/src/column/column.h b/be/src/column/column.h index 6d653ad0022ce..e7a008b85636e 100644 --- a/be/src/column/column.h +++ b/be/src/column/column.h @@ -180,22 +180,6 @@ class Column { return dest; } - // align columns' offsets - // column(1,2)->align_offsets({0,2,5}) -> column(1,_,2,_,_) - virtual ColumnPtr align_offsets(const Buffer& offsets) { - auto dest = this->clone_empty(); - auto dest_size = offsets.size() - 1; - DCHECK(this->size() >= dest_size) << "The size of the source column is less when aligning offsets."; - dest->reserve(offsets.back()); - for (size_t i = 0; i < dest_size; i++) { - // first value is itself, others append default - dest->append_value_multiple_times(*this, i, 1); - if (offsets[i + 1] - offsets[i] > 1) { - dest->append_default(offsets[i + 1] - offsets[i] - 1); - } - } - return dest; - } // Update elements to default value which hit by the filter virtual void fill_default(const Filter& filter) = 0; diff --git a/be/src/column/column_visitor.cpp b/be/src/column/column_visitor.cpp index a5c72213a9420..5ffdfd43ae266 100644 --- a/be/src/column/column_visitor.cpp +++ b/be/src/column/column_visitor.cpp @@ -69,6 +69,5 @@ VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(ObjectColumn) 
-VISIT_IMPL(ArrayViewColumn) } // namespace starrocks diff --git a/be/src/column/column_visitor.h b/be/src/column/column_visitor.h index eb86bfa781627..dd37e1d70ea46 100644 --- a/be/src/column/column_visitor.h +++ b/be/src/column/column_visitor.h @@ -80,7 +80,6 @@ class ColumnVisitor { virtual Status visit(const FixedLengthColumnBase& column); virtual Status visit(const FixedLengthColumnBase& column); virtual Status visit(const ObjectColumn& column); - virtual Status visit(const ArrayViewColumn& column); }; } // namespace starrocks diff --git a/be/src/column/column_visitor_adapter.h b/be/src/column/column_visitor_adapter.h index 85feb659808d2..3f5a9d022ba02 100644 --- a/be/src/column/column_visitor_adapter.h +++ b/be/src/column/column_visitor_adapter.h @@ -93,8 +93,6 @@ class ColumnVisitorAdapter : public ColumnVisitor { Status visit(const LargeBinaryColumn& column) override { return _impl->do_visit(column); } - Status visit(const ArrayViewColumn& column) override { return _impl->do_visit(column); } - private: Impl* _impl; }; @@ -168,8 +166,6 @@ class ColumnVisitorMutableAdapter : public ColumnVisitorMutable { Status visit(LargeBinaryColumn* column) override { return _impl->do_visit(column); } - Status visit(ArrayViewColumn* column) override { return _impl->do_visit(column); } - private: Impl* _impl; }; diff --git a/be/src/column/column_visitor_mutable.cpp b/be/src/column/column_visitor_mutable.cpp index 9f303d7a73da2..cd91d66081a58 100644 --- a/be/src/column/column_visitor_mutable.cpp +++ b/be/src/column/column_visitor_mutable.cpp @@ -69,6 +69,5 @@ VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(FixedLengthColumnBase) VISIT_IMPL(FixedLengthColumnBase) -VISIT_IMPL(ArrayViewColumn) } // namespace starrocks diff --git a/be/src/column/column_visitor_mutable.h b/be/src/column/column_visitor_mutable.h index 00011434d0204..87291d2b1b3d9 100644 --- a/be/src/column/column_visitor_mutable.h +++ b/be/src/column/column_visitor_mutable.h @@ -80,7 
+80,6 @@ class ColumnVisitorMutable { virtual Status visit(FixedLengthColumnBase* column); virtual Status visit(FixedLengthColumnBase* column); virtual Status visit(ObjectColumn* column); - virtual Status visit(ArrayViewColumn* column); }; } // namespace starrocks diff --git a/be/src/column/const_column.h b/be/src/column/const_column.h index 78d02d76c57fe..9492d48d58612 100644 --- a/be/src/column/const_column.h +++ b/be/src/column/const_column.h @@ -50,7 +50,6 @@ class ConstColumn final : public ColumnFactory { bool is_nullable() const override { return _data->is_nullable(); } bool is_json() const override { return _data->is_json(); } bool is_array() const override { return _data->is_array(); } - bool is_array_view() const override { return _data->is_array_view(); } bool is_null(size_t index) const override { return _data->is_null(0); } diff --git a/be/src/column/nullable_column.h b/be/src/column/nullable_column.h index 02de533f35112..87701f2a41bfd 100644 --- a/be/src/column/nullable_column.h +++ b/be/src/column/nullable_column.h @@ -83,7 +83,6 @@ class NullableColumn : public ColumnFactory { bool is_nullable() const override { return true; } bool is_json() const override { return _data_column->is_json(); } bool is_array() const override { return _data_column->is_array(); } - bool is_array_view() const override { return _data_column->is_array_view(); } bool is_null(size_t index) const override { DCHECK_EQ(_null_column->size(), _data_column->size()); diff --git a/be/src/column/vectorized_fwd.h b/be/src/column/vectorized_fwd.h index 00472ef2f54ca..bea88d860a119 100644 --- a/be/src/column/vectorized_fwd.h +++ b/be/src/column/vectorized_fwd.h @@ -46,7 +46,6 @@ template using Buffer = std::vector>; class ArrayColumn; -class ArrayViewColumn; class MapColumn; class StructColumn; class NullableColumn; diff --git a/be/src/exec/pipeline/scan/olap_scan_context.cpp b/be/src/exec/pipeline/scan/olap_scan_context.cpp index b9e9c43045c7d..fe98a32971f51 100644 --- 
a/be/src/exec/pipeline/scan/olap_scan_context.cpp +++ b/be/src/exec/pipeline/scan/olap_scan_context.cpp @@ -90,7 +90,7 @@ Status OlapScanContext::capture_tablet_rowsets(const std::vectorfull_name() << ", rowsets: " << tablet_rowsets[i].size() - << ", version: " << scan_range->version << ", gtid: " << scan_range->gtid; + << ", version: " << scan_range->version; _tablets[i] = std::move(tablet); } diff --git a/be/src/exec/sorted_streaming_aggregator.cpp b/be/src/exec/sorted_streaming_aggregator.cpp index d2a715a269f85..f7c0d92d9af12 100644 --- a/be/src/exec/sorted_streaming_aggregator.cpp +++ b/be/src/exec/sorted_streaming_aggregator.cpp @@ -154,10 +154,6 @@ class ColumnSelfComparator : public ColumnVisitorAdapter { return Status::NotSupported("Unsupported struct column in column wise comparator"); } - Status do_visit(const ArrayViewColumn& column) { - return Status::NotSupported("Unsupported array view column in column wise comparator"); - } - private: const ColumnPtr& _first_column; std::vector& _cmp_vector; @@ -255,10 +251,6 @@ class AppendWithMask : public ColumnVisitorMutableAdapter { return Status::NotSupported("Unsupported struct column in column wise comparator"); } - Status do_visit(ArrayViewColumn* column) { - return Status::NotSupported("Unsupported array view column in column wise comparator"); - } - private: Column* _column; const SelMask _sel_mask; diff --git a/be/src/exec/sorting/compare_column.cpp b/be/src/exec/sorting/compare_column.cpp index 4d7e309529c5e..5c33b1ab9f248 100644 --- a/be/src/exec/sorting/compare_column.cpp +++ b/be/src/exec/sorting/compare_column.cpp @@ -238,11 +238,6 @@ class ColumnCompare final : public ColumnVisitorAdapter { return Status::OK(); } - Status do_visit(const ArrayViewColumn& column) { - DCHECK(false) << "not support array view column sort_and_tie"; - return Status::NotSupported("not suport array view column"); - } - size_t get_equal_count() const { return _equal_count; } private: @@ -317,7 +312,6 @@ class 
ColumnTieBuilder final : public ColumnVisitorAdapter { Status do_visit(const ObjectColumn& column) { return Status::NotSupported("not support"); } - Status do_visit(const ArrayViewColumn& column) { return Status::NotSupported("Not support"); } private: const ColumnPtr _column; diff --git a/be/src/exec/sorting/sort_column.cpp b/be/src/exec/sorting/sort_column.cpp index baa629ea04382..1d8f440725268 100644 --- a/be/src/exec/sorting/sort_column.cpp +++ b/be/src/exec/sorting/sort_column.cpp @@ -218,11 +218,6 @@ class ColumnSorter final : public ColumnVisitorAdapter> { return sort_and_tie_helper(_cancel, &column, _sort_desc.asc_order(), _permutation, _tie, cmp, _range_or_ranges, _build_tie); } - Status do_visit(const ArrayViewColumn& column) { - DCHECK(false) << "not support array view column sort_and_tie"; - - return Status::NotSupported("not support array view column sort_and_tie"); - } private: const std::atomic& _cancel; @@ -412,10 +407,6 @@ class VerticalColumnSorter final : public ColumnVisitorAdapter diff --git a/be/src/exec/sorting/sort_permute.cpp b/be/src/exec/sorting/sort_permute.cpp index 84354df0b6ab1..83edb0101e474 100644 --- a/be/src/exec/sorting/sort_permute.cpp +++ b/be/src/exec/sorting/sort_permute.cpp @@ -234,11 +234,6 @@ class ColumnAppendPermutation final : public ColumnVisitorMutableAdapter #include "column/array_column.h" -#include "column/array_view_column.h" #include "column/column_hash.h" #include "column/map_column.h" #include "column/struct_column.h" diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index 741ddcd76102a..051bbaf65d568 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -20,7 +20,6 @@ #include #include "column/array_column.h" -#include "column/array_view_column.h" #include "column/chunk.h" #include "column/column_helper.h" #include "column/const_column.h" diff --git a/be/src/serde/column_array_serde.cpp b/be/src/serde/column_array_serde.cpp index 
fe11e7b377ac6..aad194952df7f 100644 --- a/be/src/serde/column_array_serde.cpp +++ b/be/src/serde/column_array_serde.cpp @@ -544,11 +544,6 @@ class ColumnSerializedSizeVisitor final : public ColumnVisitorAdapter Date: Mon, 23 Sep 2024 09:56:55 +0800 Subject: [PATCH 08/17] fix Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/column/array_column.cpp | 13 ------------- be/src/column/column.cpp | 1 - be/src/column/column.h | 2 -- be/src/exec/pipeline/scan/olap_scan_context.cpp | 2 +- be/src/exec/sorting/sort_column.cpp | 1 - be/src/exprs/array_functions.cpp | 9 --------- be/src/exprs/expr.cpp | 2 +- be/src/exprs/lambda_function.h | 5 ----- be/src/serde/column_array_serde.cpp | 2 -- 9 files changed, 2 insertions(+), 35 deletions(-) diff --git a/be/src/column/array_column.cpp b/be/src/column/array_column.cpp index 8b07a04fa7d22..bbf8b93ca9285 100644 --- a/be/src/column/array_column.cpp +++ b/be/src/column/array_column.cpp @@ -643,7 +643,6 @@ bool ArrayColumn::compare_lengths_from_offsets(const UInt32Column& v1, const UIn } size_t num_rows = v1.size() - 1; - LOG(INFO) << "num_rows: " << num_rows; if constexpr (ConstV1 && ConstV2) { // if both are const column, we only compare the first row once num_rows = 1; @@ -651,9 +650,6 @@ bool ArrayColumn::compare_lengths_from_offsets(const UInt32Column& v1, const UIn bool result = true; const auto& offsets_v1 = v1.get_data(); const auto& offsets_v2 = v2.get_data(); - for (size_t i = 0;i < offsets_v1.size();i++) { - LOG(INFO) << "offset v1: " << offsets_v1[i] << ", v2:" << offsets_v2[i]; - } for (size_t i = 0; i < num_rows && result; i++) { @@ -662,16 +658,9 @@ bool ArrayColumn::compare_lengths_from_offsets(const UInt32Column& v1, const UIn [[maybe_unused]] uint32_t len2 = (ConstV2) ? 
(offsets_v2[1] - offsets_v2[0]) : (offsets_v2[i + 1] - offsets_v2[i]); if constexpr (IgnoreNull) { - if (len1 != len2) { - LOG(INFO) << "array len mismatch, v1: " << len1 << ", v2: " << len2 << ", idx: " << i; - } result &= (len1 == len2); } else { - LOG(INFO) << "check idx: " << i << ", null: " << static_cast(null_data[i]); if (!null_data[i]) { - if (len1 != len2) { - LOG(INFO) << "array len mismatch, v1: " << len1 << ", v2: " << len2 << ", idx: " << i; - } result &= (len1 == len2); } } @@ -686,7 +675,6 @@ bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPt DCHECK(!v1->is_nullable() && !v2->is_nullable()); if (v1->size() != v2->size()) { - LOG(INFO) << "size not equal, v1: " << v1->size() << ", v2: " << v2->size(); return false; } auto data_v1 = FunctionHelper::get_data_column_of_const(v1); @@ -695,7 +683,6 @@ bool ArrayColumn::is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPt auto* array_v2 = down_cast(data_v2.get()); const auto& offsets_v1 = array_v1->offsets(); const auto& offsets_v2 = array_v2->offsets(); - LOG(INFO) << "v1 size: " << v1->size() << ", v2 size: " << v2->size() << ", offset v1: " << offsets_v1.size() << ", offset v2: " << offsets_v2.size(); if (v1->is_constant() && v2->is_constant()) { return compare_lengths_from_offsets(offsets_v1, offsets_v2, null_column); } else if (v1->is_constant() && !v2->is_constant()) { diff --git a/be/src/column/column.cpp b/be/src/column/column.cpp index 9960fc8dfc5b9..25c7db43901b7 100644 --- a/be/src/column/column.cpp +++ b/be/src/column/column.cpp @@ -67,7 +67,6 @@ StatusOr Column::upgrade_helper_func(ColumnPtr* col) { } bool Column::empty_null_in_complex_column(const Filter& null_data, const Buffer& offsets) { - // DCHECK(null_data.size() == this->size()); DCHECK_EQ(null_data.size(), this->size()); if (!is_array() && !is_map()) { throw std::runtime_error("empty_null_in_complex_column() only works for array and map column."); diff --git a/be/src/column/column.h 
b/be/src/column/column.h index e7a008b85636e..1b6ad54ae91f5 100644 --- a/be/src/column/column.h +++ b/be/src/column/column.h @@ -97,8 +97,6 @@ class Column { virtual bool is_array() const { return false; } - virtual bool is_array_view() const { return false; } - virtual bool is_map() const { return false; } virtual bool is_struct() const { return false; } diff --git a/be/src/exec/pipeline/scan/olap_scan_context.cpp b/be/src/exec/pipeline/scan/olap_scan_context.cpp index fe98a32971f51..b9e9c43045c7d 100644 --- a/be/src/exec/pipeline/scan/olap_scan_context.cpp +++ b/be/src/exec/pipeline/scan/olap_scan_context.cpp @@ -90,7 +90,7 @@ Status OlapScanContext::capture_tablet_rowsets(const std::vectorfull_name() << ", rowsets: " << tablet_rowsets[i].size() - << ", version: " << scan_range->version; + << ", version: " << scan_range->version << ", gtid: " << scan_range->gtid; _tablets[i] = std::move(tablet); } diff --git a/be/src/exec/sorting/sort_column.cpp b/be/src/exec/sorting/sort_column.cpp index 1d8f440725268..8df27f6086343 100644 --- a/be/src/exec/sorting/sort_column.cpp +++ b/be/src/exec/sorting/sort_column.cpp @@ -27,7 +27,6 @@ #include "column/map_column.h" #include "column/nullable_column.h" #include "column/struct_column.h" -#include "common/status.h" #include "exec/sorting/sort_helper.h" #include "exec/sorting/sort_permute.h" #include "exec/sorting/sorting.h" diff --git a/be/src/exprs/array_functions.cpp b/be/src/exprs/array_functions.cpp index e0357b0e88464..b9ea4b989b55e 100644 --- a/be/src/exprs/array_functions.cpp +++ b/be/src/exprs/array_functions.cpp @@ -1097,21 +1097,13 @@ StatusOr ArrayFunctions::all_match(FunctionContext* context, const Co } StatusOr ArrayFunctions::any_match(FunctionContext* context, const Columns& columns) { - LOG(INFO) << "evaluate any_match"; return ArrayMatch::process(context, columns); } StatusOr ArrayFunctions::concat(FunctionContext* ctx, const Columns& columns) { RETURN_IF_COLUMNS_ONLY_NULL(columns); - // @TODO optimize for 
const column - auto num_rows = columns[0]->size(); - LOG(INFO) << "array_concat, num_rows: " << num_rows; - for (auto& column : columns) { - LOG(INFO) << "column size: " << column->size() << ", is_const: " << column->is_constant() - << ", is_nullable: " << column->is_nullable(); - } // compute nulls NullColumnPtr nulls; for (auto& column : columns) { @@ -1133,7 +1125,6 @@ StatusOr ArrayFunctions::concat(FunctionContext* ctx, const Columns& auto nullable_column = down_cast(column.get()); array_columns.emplace_back(std::static_pointer_cast(nullable_column->data_column())); } else if (column->is_constant()) { - // @TODO no need // NOTE: I'm not sure if there will be const array, just to be safe array_columns.emplace_back(std::static_pointer_cast( ColumnHelper::unpack_and_duplicate_const_column(num_rows, column))); diff --git a/be/src/exprs/expr.cpp b/be/src/exprs/expr.cpp index 5849df1cc2669..1e020edd1b01e 100644 --- a/be/src/exprs/expr.cpp +++ b/be/src/exprs/expr.cpp @@ -607,7 +607,7 @@ std::string Expr::debug_string(const std::vector& exprs) { out << "["; for (int i = 0; i < exprs.size(); ++i) { - out << (i == 0 ? "" : "\n") << exprs[i]->debug_string(); + out << (i == 0 ? 
"" : " ") << exprs[i]->debug_string(); } out << "]"; diff --git a/be/src/exprs/lambda_function.h b/be/src/exprs/lambda_function.h index 16b50bdfafb3e..874aa695ded2f 100644 --- a/be/src/exprs/lambda_function.h +++ b/be/src/exprs/lambda_function.h @@ -79,10 +79,6 @@ class LambdaFunction final : public Expr { Status collect_lambda_argument_ids(); Status collect_capture_slot_ids(); Status extract_outer_common_exprs(RuntimeState* state, Expr* expr, ExtractContext* ctx); - // void extract_outer_common_exprs(RuntimeState* state); - // static const SlotId kIndependentStartId = 10000; - // void find_all_independent_capture_column(Expr* expr, std::vector* ids); - // void try_to_replace_commom_expr(RuntimeState* state, Expr* expr); std::vector _captured_slot_ids; // @TODO change to set @@ -90,7 +86,6 @@ class LambdaFunction final : public Expr { std::vector _common_sub_expr_ids; std::vector _common_sub_expr; - // std::unordered_map _outer_common_exprs; int _common_sub_expr_num; bool _is_prepared = false; bool _is_lambda_expr_independent = false; diff --git a/be/src/serde/column_array_serde.cpp b/be/src/serde/column_array_serde.cpp index aad194952df7f..f86700d65e0d5 100644 --- a/be/src/serde/column_array_serde.cpp +++ b/be/src/serde/column_array_serde.cpp @@ -31,8 +31,6 @@ #include "column/nullable_column.h" #include "column/object_column.h" #include "column/struct_column.h" -#include "column/vectorized_fwd.h" -#include "common/status.h" #include "gutil/strings/substitute.h" #include "runtime/descriptors.h" #include "serde/protobuf_serde.h" From 07a5dc61b0b24774767770526d6102e56dd1a01d Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Mon, 23 Sep 2024 13:00:00 +0800 Subject: [PATCH 09/17] fix Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/column/array_column.cpp | 5 ++-- be/src/column/array_column.h | 4 +-- be/src/exprs/array_map_expr.cpp | 40 +++++++++++++------------ 
be/src/exprs/array_map_expr.h | 4 +-- be/src/exprs/lambda_function.cpp | 50 +++++++++----------------------- be/src/exprs/lambda_function.h | 17 +++++++---- 6 files changed, 52 insertions(+), 68 deletions(-) diff --git a/be/src/column/array_column.cpp b/be/src/column/array_column.cpp index bbf8b93ca9285..fdf99d881202e 100644 --- a/be/src/column/array_column.cpp +++ b/be/src/column/array_column.cpp @@ -622,11 +622,11 @@ size_t ArrayColumn::get_total_elements_num(const NullColumnPtr& null_column) con if (null_column == nullptr) { return _elements->size(); } - DCHECK_LE(_offsets->size() -1, null_column->size()); + DCHECK_LE(_offsets->size() - 1, null_column->size()); size_t elements_num = 0; size_t num_rows = _offsets->size() - 1; const auto& null_data = null_column->get_data(); - for (size_t i = 0;i < num_rows;i++) { + for (size_t i = 0; i < num_rows; i++) { if (!null_data[i]) { elements_num += _offsets->get_data()[i + 1] - _offsets->get_data()[i]; } @@ -651,7 +651,6 @@ bool ArrayColumn::compare_lengths_from_offsets(const UInt32Column& v1, const UIn const auto& offsets_v1 = v1.get_data(); const auto& offsets_v2 = v2.get_data(); - for (size_t i = 0; i < num_rows && result; i++) { [[maybe_unused]] uint32_t len1 = (ConstV1) ? 
(offsets_v1[1] - offsets_v1[0]) : (offsets_v1[i + 1] - offsets_v1[i]); diff --git a/be/src/column/array_column.h b/be/src/column/array_column.h index 73db78fb86d2f..1398666d2c16f 100644 --- a/be/src/column/array_column.h +++ b/be/src/column/array_column.h @@ -196,10 +196,10 @@ class ArrayColumn final : public ColumnFactory { Status unfold_const_children(const starrocks::TypeDescriptor& type) override; - // calculate all non-null elements' size + // get the number of all non-null elements size_t get_total_elements_num(const NullColumnPtr& null_column) const; - // check if all of arrays' size is equal + // check if the length of each array in two columns is equal // v1 and v2 must be one of ArrayColumn or Const(ArrayColumn) template static bool is_all_array_lengths_equal(const ColumnPtr& v1, const ColumnPtr& v2, const NullColumnPtr& null_data); diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index 051bbaf65d568..ae8d66a36ad4e 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -65,8 +65,9 @@ Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { } template -StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chunk* chunk, const std::vector& input_elements, NullColumnPtr result_null_column) { - +StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chunk* chunk, + const std::vector& input_elements, + NullColumnPtr result_null_column) { // create a new chunk to evaluate the lambda expression auto cur_chunk = std::make_shared(); @@ -85,8 +86,8 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu DCHECK(slot_id > 0); auto captured_column = chunk->get_column_by_slot_id(slot_id); if (UNLIKELY(captured_column->size() < input_elements[0]->size())) { - return Status::InternalError(fmt::format( - "The size of the captured column {} is less than array's size.", captured_column->get_name())); + return Status::InternalError(fmt::format("The size of the 
captured column {} is less than array's size.", + captured_column->get_name())); } } @@ -95,7 +96,7 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu // 3.2 get aligned_offset UInt32Column::Ptr aligned_offsets = nullptr; - size_t null_rows = result_null_column ? SIMD::count_nonzero(result_null_column->get_data()): 0; + size_t null_rows = result_null_column ? SIMD::count_nonzero(result_null_column->get_data()) : 0; std::vector arguments_ids; int argument_num = lambda_func->get_lambda_arguments_ids(&arguments_ids); @@ -121,7 +122,7 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu } } else { data_column->empty_null_in_complex_column(result_null_column->get_data(), - array_column->offsets().get_data()); + array_column->offsets().get_data()); elements_column = down_cast(data_column.get())->elements_column(); } } @@ -129,10 +130,10 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu if (aligned_offsets == nullptr) { aligned_offsets = offsets_column; } - + // if lambda expr doesn't rely on argument, we don't need to put it into cur_chunk if constexpr (!independent_lambda_expr) { - // @TODO what if it is a const + // @TODO what if it is a const cur_chunk->append_column(elements_column, arguments_ids[i]); } } @@ -143,7 +144,7 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu auto column = chunk->get_column_by_slot_id(slot_id); column = ColumnHelper::unpack_and_duplicate_const_column(column->size(), column); if constexpr (independent_lambda_expr) { - // if lambda expr doesn't rely on arguments, we don't need to align offset + // if lambda expr doesn't rely on arguments, we don't need to align offset cur_chunk->append_column(column, slot_id); } else { cur_chunk->append_column(column->replicate(aligned_offsets->get_data()), slot_id); @@ -176,11 +177,11 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu column = tmp_col->replicate(aligned_offsets->get_data()); column 
= ColumnHelper::align_return_type(column, type().children[0], column->size(), true); } else { - // if all input arguments are const, + // if all input arguments are const, if constexpr (all_const_input) { ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], cur_chunk.get())); tmp_col->check_or_die(); - + column = FunctionHelper::get_data_column_of_const(tmp_col); column = ColumnHelper::align_return_type(column, type().children[0], column->size(), true); } else { @@ -206,11 +207,12 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu if constexpr (all_const_input) { // if all input arguments are const, we can return a const column auto data_column = FunctionHelper::get_data_column_of_const(column); - + aligned_offsets = UInt32Column::create(); aligned_offsets->append(0); aligned_offsets->append(column->size()); - auto array_column = std::make_shared(data_column, ColumnHelper::as_column(aligned_offsets)); + auto array_column = + std::make_shared(data_column, ColumnHelper::as_column(aligned_offsets)); array_column->check_or_die(); ColumnPtr result_column = array_column; if (result_null_column != nullptr) { @@ -325,7 +327,8 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* array_col->check_or_die(); if (result_null_column) { result_null_column->resize(1); - auto result = ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), chunk->num_rows()); + auto result = ConstColumn::create(NullableColumn::create(std::move(array_col), result_null_column), + chunk->num_rows()); result->check_or_die(); return result; } @@ -334,13 +337,13 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* return result; } - size_t total_elements_num = down_cast( - FunctionHelper::get_data_column_of_const(input_elements[0]).get())->get_total_elements_num(result_null_column); + size_t total_elements_num = + down_cast(FunctionHelper::get_data_column_of_const(input_elements[0]).get()) + 
->get_total_elements_num(result_null_column); if (total_elements_num == 0) { // if all input rows are empty arrays, return a const empty array column as result - column = ColumnHelper::create_column(type().children[0], - true); + column = ColumnHelper::create_column(type().children[0], true); auto aligned_offsets = UInt32Column::create(0); aligned_offsets->append_default(2); auto array_col = std::make_shared(column, aligned_offsets); @@ -361,7 +364,6 @@ StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* } else { return evaluate_lambda_expr(context, chunk, input_elements, result_null_column); } - } std::string ArrayMapExpr::debug_string() const { diff --git a/be/src/exprs/array_map_expr.h b/be/src/exprs/array_map_expr.h index 314af7c448b80..13fa59f627f31 100644 --- a/be/src/exprs/array_map_expr.h +++ b/be/src/exprs/array_map_expr.h @@ -18,11 +18,11 @@ #include #include +#include "column/nullable_column.h" #include "common/global_types.h" #include "common/object_pool.h" #include "exprs/column_ref.h" #include "exprs/expr.h" -#include "column/nullable_column.h" #include "glog/logging.h" namespace starrocks { @@ -45,7 +45,7 @@ class ArrayMapExpr final : public Expr { private: template StatusOr evaluate_lambda_expr(ExprContext* context, Chunk* chunk, - const std::vector& arguments, NullColumnPtr null_column); + const std::vector& arguments, NullColumnPtr null_column); // use map to make sure the order of execution std::map _outer_common_exprs; diff --git a/be/src/exprs/lambda_function.cpp b/be/src/exprs/lambda_function.cpp index f854b937b8a3a..8b98521ea6a9e 100644 --- a/be/src/exprs/lambda_function.cpp +++ b/be/src/exprs/lambda_function.cpp @@ -32,25 +32,14 @@ namespace starrocks { LambdaFunction::LambdaFunction(const TExprNode& node) : Expr(node, false), _common_sub_expr_num(node.output_column) {} Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, Expr* expr, ExtractContext* ctx) { - if (expr->is_slotref()) { - return Status::OK(); 
- } - LOG(INFO) << "extract expr: " << expr->debug_string(); - - // @TODO can we remove lambda?? - // any_match(array_map( -> any_match(array_map( -> < 10, 3: arr_largeint)), 3: arr_largeint)) - // -> any_match(array_map( -> , arr_largeint)) - // slot 8: array_map( -> < 10, arr_largeint) - // slot 9: any_match(, arr_largeint) - // @OTOD what if expr is LambdaFunction int child_num = expr->get_num_children(); std::vector slot_ids; - // @TODO we can't replace lambda? for (int i = 0; i < child_num; i++) { auto child = expr->get_child(i); RETURN_IF_ERROR(extract_outer_common_exprs(state, child, ctx)); + // if child is a slotref or a lambda function, we can't replace it. if (child->is_slotref() || child->is_lambda_function()) { continue; } @@ -59,13 +48,12 @@ Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, Expr* exp bool is_independent = std::all_of(slot_ids.begin(), slot_ids.end(), [ctx](const SlotId& id) { return ctx->lambda_arguments.find(id) == ctx->lambda_arguments.end(); }); - // - // if (is_independent && !child->is_lambda_function()) { + if (is_independent) { SlotId slot_id = ctx->next_slot_id++; ColumnRef* column_ref = state->obj_pool()->add(new ColumnRef(child->type(), slot_id)); - LOG(INFO) << "add new common expr, slot_id: " << slot_id << ", new expr: " << column_ref->debug_string() - << ", old expr: " << child->debug_string(); + VLOG(1) << "add new common expr, slot_id: " << slot_id << ", new expr: " << column_ref->debug_string() + << ", old expr: " << child->debug_string(); expr->_children[i] = column_ref; ctx->outer_common_exprs.insert({slot_id, child}); } @@ -77,9 +65,8 @@ Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, ExtractCo RETURN_IF_ERROR(collect_lambda_argument_ids()); for (auto argument_id : _arguments_ids) { ctx->lambda_arguments.insert(argument_id); - LOG(INFO) << "lambda arg id: " << argument_id; } - // @TODO what if lambda_expr is independent? 
+ auto lambda_expr = _children[0]; RETURN_IF_ERROR(extract_outer_common_exprs(state, lambda_expr, ctx)); return Status::OK(); @@ -118,11 +105,6 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo // common sub expressions include 2 parts in a pair: (slot id, expression) const int child_num = get_num_children() - 2 * _common_sub_expr_num; - LOG(INFO) << "lambda child num: " << child_num << ", common: " << _common_sub_expr_num; - LOG(INFO) << debug_string(); - for (int i = 0; i < child_num; i++) { - LOG(INFO) << "child[" << i << "] = " << get_child(i)->debug_string(); - } RETURN_IF_ERROR(collect_lambda_argument_ids()); // sorted common sub expressions so that the later expressions can reference the previous ones. @@ -134,13 +116,10 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo fmt::format("Lambda common sub expression id's size {} is not equal to expected {}", _common_sub_expr_ids.size(), _common_sub_expr_num)); } - LOG(INFO) << "lambda common_sub_expr_num: " << _common_sub_expr_num; for (auto i = child_num + _common_sub_expr_num; i < child_num + 2 * _common_sub_expr_num; ++i) { - LOG(INFO) << "commom expr: " << i << ", " << get_child(i)->debug_string(); _common_sub_expr.push_back(get_child(i)); get_child(i)->get_slot_ids(&_captured_slot_ids); - // @TODO why put into captured slot id } if (_common_sub_expr.size() != _common_sub_expr_num) { return Status::InternalError(fmt::format("Lambda common sub expressions' size {} is not equal to expected {}", @@ -149,22 +128,21 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo // get slot ids from the lambda expression get_child(0)->get_slot_ids(&_captured_slot_ids); - // bool is_lambda_independent = true; + _is_lambda_expr_independent = true; - for (auto id : _captured_slot_ids) { - LOG(INFO) << "lambda capture id: " << id; + + // if all captured slot ids are not in lambda arguments ids, then lambda expr is independent. 
+ // for example, + // in array_map(x->id, arg1), the lambda expr `id` is independent. + // but in array_map(x->arg1+id, arg1), the lambda expr `arg1+id` is not independent. + for (size_t i = 0; i < _captured_slot_ids.size() && _is_lambda_expr_independent; ++i) { for (const auto& arguments_id : _arguments_ids) { - if (id == arguments_id) { - // is_lambda_independent = false; + if (_captured_slot_ids[i] == arguments_id) { _is_lambda_expr_independent = false; break; } } } - LOG(INFO) << "lambda is independent: " << _is_lambda_expr_independent; - // if lambda expr is independent, mark - - // @TODO find all independent capture column, evaluate them first... // remove current argument ids and duplicated ids from captured_slot_ids std::map captured_mask; @@ -199,11 +177,9 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo } StatusOr LambdaFunction::evaluate_checked(ExprContext* context, Chunk* chunk) { - LOG(INFO) << "evaluate LambdaFunction, " << (void*)this; for (auto i = 0; i < _common_sub_expr.size(); ++i) { auto sub_col = EVALUATE_NULL_IF_ERROR(context, _common_sub_expr[i], chunk); chunk->append_column(sub_col, _common_sub_expr_ids[i]); - LOG(INFO) << "eval common expr: " << _common_sub_expr_ids[i]; } return get_child(0)->evaluate_checked(context, chunk); } diff --git a/be/src/exprs/lambda_function.h b/be/src/exprs/lambda_function.h index 874aa695ded2f..833b04dc3deca 100644 --- a/be/src/exprs/lambda_function.h +++ b/be/src/exprs/lambda_function.h @@ -47,7 +47,6 @@ class LambdaFunction final : public Expr { // the slot ids of lambda expression may be originally from the arguments of this lambda function // or its parent lambda functions, or captured columns, remove the first one. 
- // only capture column id, int get_slot_ids(std::vector* slot_ids) const override { slot_ids->insert(slot_ids->end(), _captured_slot_ids.begin(), _captured_slot_ids.end()); return _captured_slot_ids.size(); @@ -59,20 +58,28 @@ class LambdaFunction final : public Expr { } bool is_lambda_function() const override { return true; } - bool is_lambda_expr_independent() const { - return _is_lambda_expr_independent; - } + bool is_lambda_expr_independent() const { return _is_lambda_expr_independent; } Expr* get_lambda_expr() const { return _children[0]; } std::string debug_string() const override; + SlotId max_used_slot_id() const; + struct ExtractContext { std::unordered_set lambda_arguments; SlotId next_slot_id; std::map outer_common_exprs; }; - SlotId max_used_slot_id() const; + // Extract the outer common expression in lambda expr. + // Outer common expr is an expression that does not depend on lambda arguments at all. Such expressions can be calculated independently. + // NOTE: Calling this interface may rewrite Lambda expr, and all outer common expr will be replaced with ColumnRef expr. + // Functions using lambda expressions can extract common expressions first and calculate them separately, so as to optimize. + + // take `array_map(x->any_match(array_map(x->x < 10, arr1)), arr1)` as an example, + // `any_match(array_map(x->x<10, arr1))` is an outer common expr. it will create 2 column ref exprs to replace them. + // 1. slot 1 -> array_map(x->x<10, arr1) + // 2. 
slot 2 -> any_match(slot 1, arr1) Status extract_outer_common_exprs(RuntimeState* state, ExtractContext* ctx); private: From 33424ea1cc0fefe4856d8a18da901366cdc65f89 Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Mon, 23 Sep 2024 13:10:39 +0800 Subject: [PATCH 10/17] fix Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/exprs/array_map_expr.cpp | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index ae8d66a36ad4e..e30ef6b7e9f7e 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -49,14 +49,13 @@ Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { auto lambda_expr = down_cast(_children[0]); LambdaFunction::ExtractContext extract_ctx; + // assign slot ids to outer common exprs starting with max_used_slot_id + 1 extract_ctx.next_slot_id = lambda_expr->max_used_slot_id() + 1; - LOG(INFO) << "ArrayMap::prepare, next slot id: " << extract_ctx.next_slot_id << ", this: " << (void*)this; RETURN_IF_ERROR(lambda_expr->extract_outer_common_exprs(state, &extract_ctx)); _outer_common_exprs.swap(extract_ctx.outer_common_exprs); for (auto [_, expr] : _outer_common_exprs) { - LOG(INFO) << "prepare common expr: " << expr->debug_string(); RETURN_IF_ERROR(expr->prepare(state, context)); } RETURN_IF_ERROR(lambda_expr->prepare(state, context)); @@ -71,7 +70,7 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu // create a new chunk to evaluate the lambda expression auto cur_chunk = std::make_shared(); - // 1. evaluate all outer common expressions + // 1. 
evaluate outer common expressions for (const auto& [slot_id, expr] : _outer_common_exprs) { ASSIGN_OR_RETURN(auto col, context->evaluate(expr, chunk)); chunk->append_column(col, slot_id); @@ -81,7 +80,7 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu std::vector capture_slot_ids; lambda_func->get_slot_ids(&capture_slot_ids); - // 2. check captured columnss size + // 2. check captured columns' size for (auto slot_id : capture_slot_ids) { DCHECK(slot_id > 0); auto captured_column = chunk->get_column_by_slot_id(slot_id); @@ -91,15 +90,13 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu } } - // 3. prepare lambda arguments: - // 3.1 put all elements column into cur_chunk - // 3.2 get aligned_offset - UInt32Column::Ptr aligned_offsets = nullptr; size_t null_rows = result_null_column ? SIMD::count_nonzero(result_null_column->get_data()) : 0; std::vector arguments_ids; int argument_num = lambda_func->get_lambda_arguments_ids(&arguments_ids); + + // 3. prepare arguments of lambda expr, put all arguments into cur_chunk for (int i = 0; i < argument_num; ++i) { auto data_column = FunctionHelper::get_data_column_of_const(input_elements[i]); auto array_column = down_cast(data_column.get()); @@ -133,13 +130,12 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu // if lambda expr doesn't rely on argument, we don't need to put it into cur_chunk if constexpr (!independent_lambda_expr) { - // @TODO what if it is a const cur_chunk->append_column(elements_column, arguments_ids[i]); } } DCHECK(aligned_offsets != nullptr); - // 4. prepare outer common expr + // 4. 
prepare outer common exprs for (const auto& [slot_id, expr] : _outer_common_exprs) { auto column = chunk->get_column_by_slot_id(slot_id); column = ColumnHelper::unpack_and_duplicate_const_column(column->size(), column); @@ -150,7 +146,8 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu cur_chunk->append_column(column->replicate(aligned_offsets->get_data()), slot_id); } } - // 5. append capture column + + // 5. prepare capture columns for (auto slot_id : capture_slot_ids) { if (cur_chunk->is_slot_exist(slot_id)) { continue; From 54e7527302a491c26dd8993fd403f6c9ea24c871 Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Mon, 23 Sep 2024 16:52:31 +0800 Subject: [PATCH 11/17] fix test Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- test/sql/test_array/T/test_array_map | 60 ------------------------- test/sql/test_array_fn/T/test_array_map | 58 ------------------------ 2 files changed, 118 deletions(-) delete mode 100644 test/sql/test_array_fn/T/test_array_map diff --git a/test/sql/test_array/T/test_array_map b/test/sql/test_array/T/test_array_map index 19e71a3788ec0..6655e83d8e189 100644 --- a/test/sql/test_array/T/test_array_map +++ b/test/sql/test_array/T/test_array_map @@ -78,63 +78,3 @@ VALUES x -> CAST(x AS STRING), ARRAY_GENERATE(1, 1000) )); - --- name: test_array_map_2 -CREATE TABLE `array_map_test` ( - `id` tinyint(4) NOT NULL COMMENT "", - `arr_str` array NULL COMMENT "", - `arr_largeint` array NULL COMMENT "" -) ENGINE=OLAP -DUPLICATE KEY(`id`) -DISTRIBUTED BY RANDOM -PROPERTIES ( -"replication_num" = "1" -); - -insert into array_map_test values (1, array_repeat("abcdefghasdasdasirnqwrq", 20000), array_repeat(100, 20000)); - -select count() from array_map_test where array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint)) > 10 ; -select count(array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint))) from 
array_map_test; - -select count() from array_map_test where any_match(x->any_match(x->x<10, arr_largeint), arr_largeint); -select count(any_match(x->any_match(x->x<10, arr_largeint), arr_largeint)) from array_map_test; -select count(array_map(x->array_length(array_concat(arr_str,[])), arr_largeint)) from array_map_test; - -set @arr=array_repeat("12345",1000000); -select array_length(array_map((x,y)->x > y, @arr,@arr)) from table(generate_series(1,10,1)); - --- name: test_array_map_3 -CREATE TABLE `t` ( - `k` bigint NOT NULL COMMENT "", - `arr_0` array NOT NULL COMMENT "", - `arr_1` array NULL COMMENT "", - `arr_2` array NULL COMMENT "" -) ENGINE=OLAP -primary KEY(`k`) -DISTRIBUTED BY RANDOM BUCKETS 1 -PROPERTIES ( -"replication_num" = "1" -); - -insert into t values (1, [1,2], [1,2],[2,3]), (2, [1,2], null, [2,3]), (3, [1,2],[1,2],null),(4, [1,2],[null,null],[2,3]), (5, [1], [1,2], [3]); -select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t; -select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t where k != 5 order by k; -delete from t where k = 5; - -select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t order by k; -select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1,2]) from t order by k; -select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1]) from t order by k; - -select array_map(x->x, arr_0) from t order by k; --- independent expr -select array_map((x,y,z)->10, arr_0, arr_1, arr_2) from t; -select array_map((x,y)-> k, arr_0, arr_1) from t order by k; - --- independent expr with all const -select array_map((x,y)->k, [1,2],[2,3]) from t order by k; - --- non indepentdent with all const -select array_map((x,y,z)->x+y+z, [1,2],[2,3],[3,4]) from t; -select array_map((x,y,z)->x+y+z, [1,2],[2,null],[3,4]) from t; -select array_map((x,y,z)->x+y+z, [1,2],[2,null],null) from t; - diff --git a/test/sql/test_array_fn/T/test_array_map b/test/sql/test_array_fn/T/test_array_map deleted file mode 100644 index 7862ac3280aba..0000000000000 
--- a/test/sql/test_array_fn/T/test_array_map +++ /dev/null @@ -1,58 +0,0 @@ --- name: test_array_map_2 -CREATE TABLE `array_map_test` ( - `id` tinyint(4) NOT NULL COMMENT "", - `arr_str` array NULL COMMENT "", - `arr_largeint` array NULL COMMENT "" -) ENGINE=OLAP -DUPLICATE KEY(`id`) -DISTRIBUTED BY RANDOM -PROPERTIES ( -"replication_num" = "1" -); - -insert into array_map_test values (1, array_repeat("abcdefghasdasdasirnqwrq", 20000), array_repeat(100, 20000)); - -select count() from array_map_test where array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint)) > 10 ; -select count(array_length(array_map((x,y)->(id+length(x)+y), arr_str, arr_largeint))) from array_map_test; - -select count() from array_map_test where any_match(x->any_match(x->x<10, arr_largeint), arr_largeint); -select count(any_match(x->any_match(x->x<10, arr_largeint), arr_largeint)) from array_map_test; -select count(array_map(x->array_length(array_concat(arr_str,[])), arr_largeint)) from array_map_test; - -set @arr=array_repeat("12345",1000000); -select array_length(array_map((x,y)->x > y, @arr,@arr)) from table(generate_series(1,10,1)); - --- name: test_array_map_3 -CREATE TABLE `t` ( - `k` bigint NOT NULL COMMENT "", - `arr_0` array NOT NULL COMMENT "", - `arr_1` array NULL COMMENT "", - `arr_2` array NULL COMMENT "" -) ENGINE=OLAP -primary KEY(`k`) -DISTRIBUTED BY RANDOM BUCKETS 1 -PROPERTIES ( -"replication_num" = "1" -); - -insert into t values (1, [1,2], [1,2],[2,3]), (2, [1,2], null, [2,3]), (3, [1,2],[1,2],null),(4, [1,2],[null,null],[2,3]), (5, [1], [1,2], [3]); -select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t; -select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t where k != 5 order by k; -delete from t where k = 5; - -select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t order by k; -select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1,2]) from t order by k; -select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1]) from t order 
by k; - -select array_map(x->x, arr_0) from t order by k; --- independent expr -select array_map((x,y,z)->10, arr_0, arr_1, arr_2) from t; -select array_map((x,y)-> k, arr_0, arr_1) from t order by k; - --- independent expr with all const -select array_map((x,y)->k, [1,2],[2,3]) from t order by k; - --- non indepentdent with all const -select array_map((x,y,z)->x+y+z, [1,2],[2,3],[3,4]) from t; -select array_map((x,y,z)->x+y+z, [1,2],[2,null],[3,4]) from t; -select array_map((x,y,z)->x+y+z, [1,2],[2,null],null) from t; From cabbe0fa2a3efa9c3daa45f1c6b60baa573ce088 Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Thu, 26 Sep 2024 14:56:15 +0800 Subject: [PATCH 12/17] fix ut Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/exprs/array_map_expr.cpp | 8 +- be/src/exprs/lambda_function.h | 1 - be/test/exprs/lambda_array_expr_test.cpp | 166 ++++++++++++----------- 3 files changed, 87 insertions(+), 88 deletions(-) diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index e30ef6b7e9f7e..5581c1c58238a 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -69,7 +69,6 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu NullColumnPtr result_null_column) { // create a new chunk to evaluate the lambda expression auto cur_chunk = std::make_shared(); - // 1. 
evaluate outer common expressions for (const auto& [slot_id, expr] : _outer_common_exprs) { ASSIGN_OR_RETURN(auto col, context->evaluate(expr, chunk)); @@ -178,8 +177,8 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu if constexpr (all_const_input) { ASSIGN_OR_RETURN(auto tmp_col, context->evaluate(_children[0], cur_chunk.get())); tmp_col->check_or_die(); - - column = FunctionHelper::get_data_column_of_const(tmp_col); + // if result is a const column, we should unpack it first and make it to be the elements column of array column + column = ColumnHelper::unpack_and_duplicate_const_column(tmp_col->size(), tmp_col); column = ColumnHelper::align_return_type(column, type().children[0], column->size(), true); } else { ChunkAccumulator accumulator(DEFAULT_CHUNK_SIZE); @@ -207,7 +206,7 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu aligned_offsets = UInt32Column::create(); aligned_offsets->append(0); - aligned_offsets->append(column->size()); + aligned_offsets->append(data_column->size()); auto array_column = std::make_shared(data_column, ColumnHelper::as_column(aligned_offsets)); array_column->check_or_die(); @@ -235,7 +234,6 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu // NOTE the return column must be of the return type. 
StatusOr ArrayMapExpr::evaluate_checked(ExprContext* context, Chunk* chunk) { std::vector input_elements; - bool is_single_nullable_child = false; NullColumnPtr result_null_column = nullptr; diff --git a/be/src/exprs/lambda_function.h b/be/src/exprs/lambda_function.h index 833b04dc3deca..1c1e79dec3992 100644 --- a/be/src/exprs/lambda_function.h +++ b/be/src/exprs/lambda_function.h @@ -88,7 +88,6 @@ class LambdaFunction final : public Expr { Status extract_outer_common_exprs(RuntimeState* state, Expr* expr, ExtractContext* ctx); std::vector _captured_slot_ids; - // @TODO change to set std::vector _arguments_ids; std::vector _common_sub_expr_ids; std::vector _common_sub_expr; diff --git a/be/test/exprs/lambda_array_expr_test.cpp b/be/test/exprs/lambda_array_expr_test.cpp index 876a643fdf4e5..5fd7f149b0c0a 100644 --- a/be/test/exprs/lambda_array_expr_test.cpp +++ b/be/test/exprs/lambda_array_expr_test.cpp @@ -16,6 +16,7 @@ #include #include +#include #include "butil/time.h" #include "column/column_helper.h" @@ -28,6 +29,7 @@ #include "exprs/is_null_predicate.h" #include "exprs/lambda_function.h" #include "exprs/literal.h" +#include "exprs/function_helper.h" #include "exprs/mock_vectorized_expr.h" #include "runtime/runtime_state.h" #include "testutil/assert.h" @@ -272,37 +274,37 @@ TEST_F(VectorizedLambdaFunctionExprTest, array_map_lambda_test_normal_array) { ASSERT_FALSE(result->is_constant()); ASSERT_FALSE(result->is_numeric()); - EXPECT_EQ(3, result->size()); - EXPECT_EQ(1, result->get(0).get_array()[0].get_int32()); - EXPECT_EQ(4, result->get(0).get_array()[1].get_int32()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(1, result->get(0).get_array()[0].get_int32()); + ASSERT_EQ(4, result->get(0).get_array()[1].get_int32()); ASSERT_TRUE(result->get(1).get_array()[0].is_null()); ASSERT_TRUE(result->get(1).get_array()[1].is_null()); ASSERT_TRUE(result->get(2).get_array()[0].is_null()); - EXPECT_EQ(12, result->get(2).get_array()[1].get_int32()); + ASSERT_EQ(12, 
result->get(2).get_array()[1].get_int32()); } else if (i == 0 && j == 1) { // array_map(x -> x is null, array) - EXPECT_EQ(3, result->size()); - EXPECT_EQ(0, result->get(0).get_array()[0].get_int8()); - EXPECT_EQ(0, result->get(0).get_array()[1].get_int8()); - EXPECT_EQ(1, result->get(1).get_array()[0].get_int8()); - EXPECT_EQ(1, result->get(1).get_array()[1].get_int8()); - EXPECT_EQ(1, result->get(2).get_array()[0].get_int8()); - EXPECT_EQ(0, result->get(2).get_array()[1].get_int8()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(0, result->get(0).get_array()[0].get_int8()); + ASSERT_EQ(0, result->get(0).get_array()[1].get_int8()); + ASSERT_EQ(1, result->get(1).get_array()[0].get_int8()); + ASSERT_EQ(1, result->get(1).get_array()[1].get_int8()); + ASSERT_EQ(1, result->get(2).get_array()[0].get_int8()); + ASSERT_EQ(0, result->get(2).get_array()[1].get_int8()); } else if (i == 0 && j == 2) { // // array_map(x -> x+a, array) - EXPECT_EQ(3, result->size()); - EXPECT_EQ(2, result->get(0).get_array()[0].get_int32()); - EXPECT_EQ(5, result->get(0).get_array()[1].get_int32()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(2, result->get(0).get_array()[0].get_int32()); + ASSERT_EQ(5, result->get(0).get_array()[1].get_int32()); ASSERT_TRUE(result->get(1).get_array()[0].is_null()); ASSERT_TRUE(result->get(1).get_array()[1].is_null()); ASSERT_TRUE(result->get(2).get_array()[0].is_null()); - EXPECT_EQ(13, result->get(2).get_array()[1].get_int32()); + ASSERT_EQ(13, result->get(2).get_array()[1].get_int32()); } else if (i == 0 && j == 3) { - EXPECT_EQ(3, result->size()); - EXPECT_EQ(-110, result->get(0).get_array()[0].get_int32()); - EXPECT_EQ(-110, result->get(0).get_array()[1].get_int32()); - EXPECT_EQ(-110, result->get(1).get_array()[0].get_int32()); - EXPECT_EQ(-110, result->get(1).get_array()[1].get_int32()); - EXPECT_EQ(-110, result->get(2).get_array()[0].get_int32()); - EXPECT_EQ(-110, result->get(2).get_array()[1].get_int32()); + ASSERT_EQ(3, result->size()); + 
ASSERT_EQ(-110, result->get(0).get_array()[0].get_int32()); + ASSERT_EQ(-110, result->get(0).get_array()[1].get_int32()); + ASSERT_EQ(-110, result->get(1).get_array()[0].get_int32()); + ASSERT_EQ(-110, result->get(1).get_array()[1].get_int32()); + ASSERT_EQ(-110, result->get(2).get_array()[0].get_int32()); + ASSERT_EQ(-110, result->get(2).get_array()[1].get_int32()); } Expr::close(expr_ctxs, &_runtime_state); @@ -338,40 +340,39 @@ TEST_F(VectorizedLambdaFunctionExprTest, array_map_lambda_test_special_array) { } else { ASSERT_TRUE(ids.empty()); } - ColumnPtr result = array_map_expr.evaluate(&exprContext, cur_chunk.get()); if (i == 1) { // array_map(x->xxx,null) - EXPECT_EQ(1, result->size()); + ASSERT_EQ(3, result->size()); ASSERT_TRUE(result->is_null(0)); } else if (i == 2 && (j == 0 || j == 2)) { // array_map( x->x || x->x+a, [null]) - EXPECT_EQ(1, result->size()); + ASSERT_EQ(1, result->size()); ASSERT_TRUE(result->get(0).get_array()[0].is_null()); } else if (i == 2 && j == 1) { // array_map(x -> x is null,[null]) - EXPECT_EQ(1, result->size()); - EXPECT_EQ(1, result->get(0).get_array()[0].get_int8()); + ASSERT_EQ(1, result->size()); + ASSERT_EQ(1, result->get(0).get_array()[0].get_int8()); } else if (i == 2 && j == 3) { // array_map(x -> -110,[null]) - EXPECT_EQ(1, result->size()); - EXPECT_EQ(-110, result->get(0).get_array()[0].get_int32()); + ASSERT_EQ(1, result->size()); + ASSERT_EQ(-110, result->get(0).get_array()[0].get_int32()); } else if (i == 3) { // array_map(x->xxx,[]) - EXPECT_EQ(1, result->size()); + ASSERT_EQ(3, result->size()); ASSERT_TRUE(result->get(0).get_array().empty()); } else if (i == 4 && (j == 0 || j == 2)) { // array_map(x->x || x->x+a, array) // [null] // [] // NULL - EXPECT_EQ(3, result->size()); + ASSERT_EQ(3, result->size()); ASSERT_TRUE(result->get(0).get_array()[0].is_null()); ASSERT_TRUE(result->get(1).get_array().empty()); ASSERT_TRUE(result->is_null(2)); } else if (i == 4 && j == 1) { // array_map(x->x is null, array) - 
EXPECT_EQ(3, result->size()); - EXPECT_EQ(1, result->get(0).get_array()[0].get_int8()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(1, result->get(0).get_array()[0].get_int8()); ASSERT_TRUE(result->get(1).get_array().empty()); ASSERT_TRUE(result->is_null(2)); } else if (i == 4 && j == 3) { // array_map(x-> -110, array) - EXPECT_EQ(3, result->size()); - EXPECT_EQ(-110, result->get(0).get_array()[0].get_int32()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(-110, result->get(0).get_array()[0].get_int32()); ASSERT_TRUE(result->get(1).get_array().empty()); ASSERT_TRUE(result->is_null(2)); } @@ -409,77 +410,78 @@ TEST_F(VectorizedLambdaFunctionExprTest, array_map_lambda_test_const_array) { } else { ASSERT_TRUE(ids.empty()); } - ColumnPtr result = array_map_expr.evaluate(&exprContext, cur_chunk.get()); if (i == 5 && j == 0) { // array_map( x->x, array) - EXPECT_EQ(3, result->size()); - EXPECT_EQ(1, result->get(0).get_array()[0].get_int32()); - EXPECT_EQ(4, result->get(0).get_array()[1].get_int32()); - EXPECT_EQ(1, result->get(1).get_array()[0].get_int32()); - EXPECT_EQ(4, result->get(1).get_array()[1].get_int32()); - EXPECT_EQ(1, result->get(2).get_array()[0].get_int32()); - EXPECT_EQ(4, result->get(2).get_array()[1].get_int32()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(1, result->get(0).get_array()[0].get_int32()); + ASSERT_EQ(4, result->get(0).get_array()[1].get_int32()); + ASSERT_EQ(1, result->get(1).get_array()[0].get_int32()); + ASSERT_EQ(4, result->get(1).get_array()[1].get_int32()); + ASSERT_EQ(1, result->get(2).get_array()[0].get_int32()); + ASSERT_EQ(4, result->get(2).get_array()[1].get_int32()); } else if (i == 5 && j == 1) { // array_map(x->x is null, array) - EXPECT_EQ(3, result->size()); - EXPECT_EQ(0, result->get(0).get_array()[0].get_int8()); - EXPECT_EQ(0, result->get(0).get_array()[1].get_int8()); - EXPECT_EQ(0, result->get(1).get_array()[0].get_int8()); - EXPECT_EQ(0, result->get(1).get_array()[1].get_int8()); - EXPECT_EQ(0, 
result->get(2).get_array()[0].get_int8()); - EXPECT_EQ(0, result->get(2).get_array()[1].get_int8()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(0, result->get(0).get_array()[0].get_int8()); + ASSERT_EQ(0, result->get(0).get_array()[1].get_int8()); + ASSERT_EQ(0, result->get(1).get_array()[0].get_int8()); + ASSERT_EQ(0, result->get(1).get_array()[1].get_int8()); + ASSERT_EQ(0, result->get(2).get_array()[0].get_int8()); + ASSERT_EQ(0, result->get(2).get_array()[1].get_int8()); + LOG(INFO) << "pass"; } else if (i == 5 && j == 2) { // // array_map( x->x + a, array) - EXPECT_EQ(3, result->size()); - EXPECT_EQ(2, result->get(0).get_array()[0].get_int32()); - EXPECT_EQ(5, result->get(0).get_array()[1].get_int32()); - EXPECT_EQ(2, result->get(1).get_array()[0].get_int32()); - EXPECT_EQ(5, result->get(1).get_array()[1].get_int32()); - EXPECT_EQ(2, result->get(2).get_array()[0].get_int32()); - EXPECT_EQ(5, result->get(2).get_array()[1].get_int32()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(2, result->get(0).get_array()[0].get_int32()); + ASSERT_EQ(5, result->get(0).get_array()[1].get_int32()); + ASSERT_EQ(2, result->get(1).get_array()[0].get_int32()); + ASSERT_EQ(5, result->get(1).get_array()[1].get_int32()); + ASSERT_EQ(2, result->get(2).get_array()[0].get_int32()); + ASSERT_EQ(5, result->get(2).get_array()[1].get_int32()); } else if (i == 5 && j == 3) { // // array_map( x-> -110, array) - EXPECT_EQ(3, result->size()); - EXPECT_EQ(-110, result->get(0).get_array()[0].get_int32()); - EXPECT_EQ(-110, result->get(0).get_array()[1].get_int32()); - EXPECT_EQ(-110, result->get(1).get_array()[0].get_int32()); - EXPECT_EQ(-110, result->get(1).get_array()[1].get_int32()); - EXPECT_EQ(-110, result->get(2).get_array()[0].get_int32()); - EXPECT_EQ(-110, result->get(2).get_array()[1].get_int32()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(-110, result->get(0).get_array()[0].get_int32()); + ASSERT_EQ(-110, result->get(0).get_array()[1].get_int32()); + ASSERT_EQ(-110, 
result->get(1).get_array()[0].get_int32()); + ASSERT_EQ(-110, result->get(1).get_array()[1].get_int32()); + ASSERT_EQ(-110, result->get(2).get_array()[0].get_int32()); + ASSERT_EQ(-110, result->get(2).get_array()[1].get_int32()); } else if (i == 6) { // array_map(x -> x || x->x is null || x -> x+a, array) - EXPECT_EQ(3, result->size()); + ASSERT_EQ(3, result->size()); ASSERT_TRUE(result->is_null(0)); ASSERT_TRUE(result->is_null(1)); ASSERT_TRUE(result->is_null(2)); } else if (i == 7 && (j == 0 || j == 2)) { // array_map(x -> x || x-> x+a,array) - EXPECT_EQ(3, result->size()); + ASSERT_EQ(3, result->size()); ASSERT_TRUE(result->get(0).get_array()[0].is_null()); ASSERT_TRUE(result->get(1).get_array()[0].is_null()); ASSERT_TRUE(result->get(2).get_array()[0].is_null()); } else if (i == 7 && j == 1) { // array_map(x -> x is null, array) - EXPECT_EQ(3, result->size()); - EXPECT_EQ(1, result->get(0).get_array()[0].get_int8()); - EXPECT_EQ(1, result->get(1).get_array()[0].get_int8()); - EXPECT_EQ(1, result->get(2).get_array()[0].get_int8()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(1, result->get(0).get_array()[0].get_int8()); + ASSERT_EQ(1, result->get(1).get_array()[0].get_int8()); + ASSERT_EQ(1, result->get(2).get_array()[0].get_int8()); } else if (i == 7 && j == 3) { // array_map(x -> -110, array) - EXPECT_EQ(3, result->size()); - EXPECT_EQ(-110, result->get(0).get_array()[0].get_int32()); - EXPECT_EQ(-110, result->get(1).get_array()[0].get_int32()); - EXPECT_EQ(-110, result->get(2).get_array()[0].get_int32()); + ASSERT_EQ(3, result->size()); + ASSERT_EQ(-110, result->get(0).get_array()[0].get_int32()); + ASSERT_EQ(-110, result->get(1).get_array()[0].get_int32()); + ASSERT_EQ(-110, result->get(2).get_array()[0].get_int32()); } else if (i == 8) { // array_map(x -> x || x -> x is null || x -> x+a || x -> -110, array) - EXPECT_EQ(3, result->size()); + ASSERT_EQ(3, result->size()); ASSERT_TRUE(result->get(0).get_array().empty()); 
ASSERT_TRUE(result->get(1).get_array().empty()); ASSERT_TRUE(result->get(2).get_array().empty()); } if (j == 1) { // array -> array - if (result->is_nullable()) { - auto col = std::dynamic_pointer_cast(result); - auto array_col = std::dynamic_pointer_cast(col->data_column()); - EXPECT_EQ(2, array_col->elements_column()->type_size()); // nullable bool - } else { - auto array_col = std::dynamic_pointer_cast(result); - EXPECT_EQ(2, array_col->elements_column()->type_size()); // nullable bool + auto data_column = result; + if (data_column->is_constant()) { + data_column = FunctionHelper::get_data_column_of_const(data_column); + } + if (data_column->is_nullable()) { + data_column = down_cast(data_column.get())->data_column(); } + auto array_col = std::dynamic_pointer_cast(data_column); + ASSERT_EQ(2, array_col->elements_column()->type_size()); } Expr::close(expr_ctxs, &_runtime_state); } From 7f5a4797e8fe04a830fa8e39641636b3859f9d0a Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Thu, 26 Sep 2024 15:11:05 +0800 Subject: [PATCH 13/17] fix format Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/test/exprs/lambda_array_expr_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/test/exprs/lambda_array_expr_test.cpp b/be/test/exprs/lambda_array_expr_test.cpp index 5fd7f149b0c0a..74ebc15e84bec 100644 --- a/be/test/exprs/lambda_array_expr_test.cpp +++ b/be/test/exprs/lambda_array_expr_test.cpp @@ -26,10 +26,10 @@ #include "exprs/array_map_expr.h" #include "exprs/cast_expr.h" #include "exprs/function_call_expr.h" +#include "exprs/function_helper.h" #include "exprs/is_null_predicate.h" #include "exprs/lambda_function.h" #include "exprs/literal.h" -#include "exprs/function_helper.h" #include "exprs/mock_vectorized_expr.h" #include "runtime/runtime_state.h" #include "testutil/assert.h" From 74800176457f1b57eed3c3fc7f7981de50f91d75 Mon Sep 17 00:00:00 2001 From: 
silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Sun, 29 Sep 2024 16:15:07 +0800 Subject: [PATCH 14/17] fix clang-tidy Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/exprs/array_map_expr.cpp | 2 +- be/src/exprs/array_map_expr.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index 5581c1c58238a..ebeb87375af2e 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -66,7 +66,7 @@ Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { template StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chunk* chunk, const std::vector& input_elements, - NullColumnPtr result_null_column) { + const NullColumnPtr& result_null_column) { // create a new chunk to evaluate the lambda expression auto cur_chunk = std::make_shared(); // 1. evaluate outer common expressions diff --git a/be/src/exprs/array_map_expr.h b/be/src/exprs/array_map_expr.h index 13fa59f627f31..4becf3b9ca21e 100644 --- a/be/src/exprs/array_map_expr.h +++ b/be/src/exprs/array_map_expr.h @@ -45,7 +45,7 @@ class ArrayMapExpr final : public Expr { private: template StatusOr evaluate_lambda_expr(ExprContext* context, Chunk* chunk, - const std::vector& arguments, NullColumnPtr null_column); + const std::vector& arguments, const NullColumnPtr& null_column); // use map to make sure the order of execution std::map _outer_common_exprs; From e50d4f886f02c4082d49885c32cc39a13e372cc3 Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:39:54 +0800 Subject: [PATCH 15/17] fix Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/exprs/array_map_expr.cpp | 12 +++++-- be/src/exprs/array_map_expr.h | 1 + be/src/exprs/expr.h | 1 + be/src/exprs/lambda_function.cpp | 43 +++++++++++++++++------ 
be/src/exprs/lambda_function.h | 12 ++++--- be/src/exprs/literal.h | 2 +- test/sql/test_array_fn/R/test_array_map_2 | 4 +-- 7 files changed, 56 insertions(+), 19 deletions(-) diff --git a/be/src/exprs/array_map_expr.cpp b/be/src/exprs/array_map_expr.cpp index ebeb87375af2e..2b19370b09719 100644 --- a/be/src/exprs/array_map_expr.cpp +++ b/be/src/exprs/array_map_expr.cpp @@ -54,7 +54,6 @@ Status ArrayMapExpr::prepare(RuntimeState* state, ExprContext* context) { RETURN_IF_ERROR(lambda_expr->extract_outer_common_exprs(state, &extract_ctx)); _outer_common_exprs.swap(extract_ctx.outer_common_exprs); - for (auto [_, expr] : _outer_common_exprs) { RETURN_IF_ERROR(expr->prepare(state, context)); } @@ -77,7 +76,7 @@ StatusOr ArrayMapExpr::evaluate_lambda_expr(ExprContext* context, Chu auto lambda_func = dynamic_cast(_children[0]); std::vector capture_slot_ids; - lambda_func->get_slot_ids(&capture_slot_ids); + lambda_func->get_captured_slot_ids(&capture_slot_ids); // 2. check captured columns' size for (auto slot_id : capture_slot_ids) { @@ -372,4 +371,13 @@ std::string ArrayMapExpr::debug_string() const { return out.str(); } +int ArrayMapExpr::get_slot_ids(std::vector* slot_ids) const { + int num = Expr::get_slot_ids(slot_ids); + for (const auto& [slot_id, _] : _outer_common_exprs) { + slot_ids->push_back(slot_id); + num++; + } + return num; +} + } // namespace starrocks diff --git a/be/src/exprs/array_map_expr.h b/be/src/exprs/array_map_expr.h index 4becf3b9ca21e..85e4c5435f3db 100644 --- a/be/src/exprs/array_map_expr.h +++ b/be/src/exprs/array_map_expr.h @@ -41,6 +41,7 @@ class ArrayMapExpr final : public Expr { StatusOr evaluate_checked(ExprContext* context, Chunk* ptr) override; std::string debug_string() const override; + int get_slot_ids(std::vector* slot_ids) const override; private: template diff --git a/be/src/exprs/expr.h b/be/src/exprs/expr.h index ef861b1411313..6d01eadc6e736 100644 --- a/be/src/exprs/expr.h +++ b/be/src/exprs/expr.h @@ -121,6 +121,7 @@ class 
Expr { bool is_monotonic() const { return _is_monotonic; } bool is_cast_expr() const { return _node_type == TExprNodeType::CAST_EXPR; } virtual bool is_lambda_function() const { return false; } + virtual bool is_literal() const { return false; } // In most time, this field is passed from FE // Sometimes we want to construct expr on BE implicitly and we have knowledge about `monotonicity` diff --git a/be/src/exprs/lambda_function.cpp b/be/src/exprs/lambda_function.cpp index 8b98521ea6a9e..b9ec0fc70775f 100644 --- a/be/src/exprs/lambda_function.cpp +++ b/be/src/exprs/lambda_function.cpp @@ -22,16 +22,28 @@ #include "column/chunk.h" #include "column/column_helper.h" #include "column/vectorized_fwd.h" +#include "column_ref.h" #include "exec/exec_node.h" #include "exprs/column_ref.h" #include "exprs/expr.h" #include "exprs/expr_context.h" +#include "lambda_function.h" namespace starrocks { LambdaFunction::LambdaFunction(const TExprNode& node) : Expr(node, false), _common_sub_expr_num(node.output_column) {} Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, Expr* expr, ExtractContext* ctx) { + std::unordered_set cur_arguments; + if (expr->is_lambda_function()) { + auto lambda_function = static_cast(expr); + RETURN_IF_ERROR(lambda_function->collect_lambda_argument_ids()); + for (auto argument_id : lambda_function->get_lambda_arguments_ids()) { + cur_arguments.insert(argument_id); + ctx->lambda_arguments.insert(argument_id); + } + } + int child_num = expr->get_num_children(); std::vector slot_ids; @@ -39,10 +51,11 @@ Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, Expr* exp auto child = expr->get_child(i); RETURN_IF_ERROR(extract_outer_common_exprs(state, child, ctx)); - // if child is a slotref or a lambda function, we can't replace it. - if (child->is_slotref() || child->is_lambda_function()) { + // if child is a slotref or a lambda function or a literal, we can't replace it. 
+ if (child->is_slotref() || child->is_lambda_function() || child->is_literal()) { continue; } + slot_ids.clear(); child->get_slot_ids(&slot_ids); bool is_independent = std::all_of(slot_ids.begin(), slot_ids.end(), [ctx](const SlotId& id) { @@ -58,17 +71,17 @@ Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, Expr* exp ctx->outer_common_exprs.insert({slot_id, child}); } } + if (!cur_arguments.empty()) { + for (const auto id : cur_arguments) { + ctx->lambda_arguments.erase(id); + } + } + return Status::OK(); } Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, ExtractContext* ctx) { - RETURN_IF_ERROR(collect_lambda_argument_ids()); - for (auto argument_id : _arguments_ids) { - ctx->lambda_arguments.insert(argument_id); - } - - auto lambda_expr = _children[0]; - RETURN_IF_ERROR(extract_outer_common_exprs(state, lambda_expr, ctx)); + RETURN_IF_ERROR(extract_outer_common_exprs(state, this, ctx)); return Status::OK(); } @@ -184,12 +197,22 @@ StatusOr LambdaFunction::evaluate_checked(ExprContext* context, Chunk return get_child(0)->evaluate_checked(context, chunk); } +int LambdaFunction::get_slot_ids(std::vector* slot_ids) const { + if (_is_prepared) { + slot_ids->insert(slot_ids->end(), _captured_slot_ids.begin(), _captured_slot_ids.end()); + slot_ids->insert(slot_ids->end(), _arguments_ids.begin(), _arguments_ids.end()); + return _captured_slot_ids.size() + _arguments_ids.size(); + } else { + return Expr::get_slot_ids(slot_ids); + } +} + std::string LambdaFunction::debug_string() const { std::stringstream out; auto expr_debug_string = Expr::debug_string(); out << "LambaFunction ("; for (int i = 0; i < _children.size(); i++) { - out << (i == 0 ? "lambda expr, " : "input argument, ") << _children[i]->debug_string(); + out << (i == 0 ? 
"lambda expr: " : " input argument: ") << _children[i]->debug_string(); } out << ")"; return out.str(); diff --git a/be/src/exprs/lambda_function.h b/be/src/exprs/lambda_function.h index 1c1e79dec3992..f07725ff18e0e 100644 --- a/be/src/exprs/lambda_function.h +++ b/be/src/exprs/lambda_function.h @@ -45,18 +45,22 @@ class LambdaFunction final : public Expr { StatusOr evaluate_checked(ExprContext* context, Chunk* ptr) override; - // the slot ids of lambda expression may be originally from the arguments of this lambda function - // or its parent lambda functions, or captured columns, remove the first one. - int get_slot_ids(std::vector* slot_ids) const override { + int get_slot_ids(std::vector* slot_ids) const override; + + int get_captured_slot_ids(std::vector* slot_ids) const { + DCHECK(_is_prepared); slot_ids->insert(slot_ids->end(), _captured_slot_ids.begin(), _captured_slot_ids.end()); return _captured_slot_ids.size(); } - int get_lambda_arguments_ids(std::vector* ids) { + int get_lambda_arguments_ids(std::vector* ids) const { + DCHECK(_is_prepared); ids->assign(_arguments_ids.begin(), _arguments_ids.end()); return _arguments_ids.size(); } + const std::vector& get_lambda_arguments_ids() const { return _arguments_ids; } + bool is_lambda_function() const override { return true; } bool is_lambda_expr_independent() const { return _is_lambda_expr_independent; } diff --git a/be/src/exprs/literal.h b/be/src/exprs/literal.h index a01001a602f45..f177995edef5e 100644 --- a/be/src/exprs/literal.h +++ b/be/src/exprs/literal.h @@ -39,7 +39,7 @@ class VectorizedLiteral final : public Expr { StatusOr generate_ir_impl(ExprContext* context, JITContext* jit_ctx) override; #endif - + bool is_literal() const override { return true; } std::string debug_string() const override; private: diff --git a/test/sql/test_array_fn/R/test_array_map_2 b/test/sql/test_array_fn/R/test_array_map_2 index 66b8544923384..dac1097c06b0e 100644 --- a/test/sql/test_array_fn/R/test_array_map_2 +++ 
b/test/sql/test_array_fn/R/test_array_map_2 @@ -69,7 +69,7 @@ insert into t values (1, [1,2], [1,2],[2,3]), (2, [1,2], null, [2,3]), (3, [1,2] -- !result select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t; -- result: -E: (1064, "Input array element's size is not equal in array_map().") +[REGEX].*Input array element's size is not equal in array_map().* -- !result select array_map((x,y,z)->x+y+z, arr_0, arr_1, arr_2) from t where k != 5 order by k; -- result: @@ -97,7 +97,7 @@ None -- !result select array_map((x,y,z,d)->x+y+z+d, arr_0, arr_1, arr_2, [1]) from t order by k; -- result: -E: (1064, "Input array element's size is not equal in array_map().") +[REGEX].*Input array element's size is not equal in array_map().* -- !result select array_map(x->x, arr_0) from t order by k; -- result: From d9a4ed9bea8d49a12113c98afd14da23e03a176d Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Mon, 30 Sep 2024 18:21:36 +0800 Subject: [PATCH 16/17] fix ut Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/exprs/map_apply_expr.cpp | 3 ++- be/test/exprs/lambda_array_expr_test.cpp | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/be/src/exprs/map_apply_expr.cpp b/be/src/exprs/map_apply_expr.cpp index bc21157271e9a..e262b6d94ed92 100644 --- a/be/src/exprs/map_apply_expr.cpp +++ b/be/src/exprs/map_apply_expr.cpp @@ -112,8 +112,9 @@ StatusOr MapApplyExpr::evaluate_checked(ExprContext* context, Chunk* cur_chunk->append_column(input_columns[i], _arguments_ids[i]); // column ref } // put captured columns into the new chunk aligning with the first map's offsets + auto lambda_func = dynamic_cast(_children[0]); std::vector slot_ids; - _children[0]->get_slot_ids(&slot_ids); + lambda_func->get_captured_slot_ids(&slot_ids); for (auto id : slot_ids) { DCHECK(id > 0); auto captured = chunk->get_column_by_slot_id(id); diff --git a/be/test/exprs/lambda_array_expr_test.cpp 
b/be/test/exprs/lambda_array_expr_test.cpp index 74ebc15e84bec..4bcfa3b18cd44 100644 --- a/be/test/exprs/lambda_array_expr_test.cpp +++ b/be/test/exprs/lambda_array_expr_test.cpp @@ -255,7 +255,7 @@ TEST_F(VectorizedLambdaFunctionExprTest, array_map_lambda_test_normal_array) { // check LambdaFunction::prepare() std::vector ids, arguments; - lambda->get_slot_ids(&ids); + lambda->get_captured_slot_ids(&ids); lambda->get_lambda_arguments_ids(&arguments); ASSERT_TRUE(arguments.size() == 1 && arguments[0] == 100000); // the x's slot_id = 100000 @@ -331,7 +331,7 @@ TEST_F(VectorizedLambdaFunctionExprTest, array_map_lambda_test_special_array) { // check LambdaFunction::prepare() std::vector ids, arguments; - lambda->get_slot_ids(&ids); + lambda->get_captured_slot_ids(&ids); lambda->get_lambda_arguments_ids(&arguments); ASSERT_TRUE(arguments.size() == 1 && arguments[0] == 100000); // the x's slot_id = 100000 @@ -401,7 +401,7 @@ TEST_F(VectorizedLambdaFunctionExprTest, array_map_lambda_test_const_array) { // check LambdaFunction::prepare() std::vector ids, arguments; - lambda->get_slot_ids(&ids); + lambda->get_captured_slot_ids(&ids); lambda->get_lambda_arguments_ids(&arguments); ASSERT_TRUE(arguments.size() == 1 && arguments[0] == 100000); // the x's slot_id = 100000 From 5978dcfa168d74e4514bcbf05c7e0c40e3cbe354 Mon Sep 17 00:00:00 2001 From: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> Date: Mon, 30 Sep 2024 20:36:18 +0800 Subject: [PATCH 17/17] fix ut Signed-off-by: silverbullet233 <3675229+silverbullet233@users.noreply.github.com> --- be/src/exprs/lambda_function.cpp | 48 ++++++++++++++++++++++---------- be/src/exprs/lambda_function.h | 4 +++ 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/be/src/exprs/lambda_function.cpp b/be/src/exprs/lambda_function.cpp index b9ec0fc70775f..d2a186e0a521d 100644 --- a/be/src/exprs/lambda_function.cpp +++ b/be/src/exprs/lambda_function.cpp @@ -22,28 +22,41 @@ #include "column/chunk.h" #include 
"column/column_helper.h" #include "column/vectorized_fwd.h" -#include "column_ref.h" #include "exec/exec_node.h" #include "exprs/column_ref.h" #include "exprs/expr.h" #include "exprs/expr_context.h" -#include "lambda_function.h" +#include "util/defer_op.h" namespace starrocks { LambdaFunction::LambdaFunction(const TExprNode& node) : Expr(node, false), _common_sub_expr_num(node.output_column) {} Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, Expr* expr, ExtractContext* ctx) { - std::unordered_set cur_arguments; if (expr->is_lambda_function()) { auto lambda_function = static_cast(expr); RETURN_IF_ERROR(lambda_function->collect_lambda_argument_ids()); for (auto argument_id : lambda_function->get_lambda_arguments_ids()) { - cur_arguments.insert(argument_id); ctx->lambda_arguments.insert(argument_id); } + RETURN_IF_ERROR(lambda_function->collect_common_sub_exprs()); + for (auto slot_id : lambda_function->get_common_sub_expr_ids()) { + ctx->common_sub_expr_ids.insert(slot_id); + } } + DeferOp defer([&]() { + if (expr->is_lambda_function()) { + auto lambda_function = static_cast(expr); + for (auto argument_id : lambda_function->get_lambda_arguments_ids()) { + ctx->lambda_arguments.erase(argument_id); + } + for (auto slot_id : lambda_function->get_common_sub_expr_ids()) { + ctx->common_sub_expr_ids.erase(slot_id); + } + } + }); + int child_num = expr->get_num_children(); std::vector slot_ids; @@ -59,7 +72,8 @@ Status LambdaFunction::extract_outer_common_exprs(RuntimeState* state, Expr* exp slot_ids.clear(); child->get_slot_ids(&slot_ids); bool is_independent = std::all_of(slot_ids.begin(), slot_ids.end(), [ctx](const SlotId& id) { - return ctx->lambda_arguments.find(id) == ctx->lambda_arguments.end(); + return ctx->lambda_arguments.find(id) == ctx->lambda_arguments.end() && + ctx->common_sub_expr_ids.find(id) == ctx->common_sub_expr_ids.end(); }); if (is_independent) { @@ -71,11 +85,6 @@ Status 
LambdaFunction::extract_outer_common_exprs(RuntimeState* state, Expr* exp ctx->outer_common_exprs.insert({slot_id, child}); } } - if (!cur_arguments.empty()) { - for (const auto id : cur_arguments) { - ctx->lambda_arguments.erase(id); - } - } return Status::OK(); } @@ -109,16 +118,13 @@ SlotId LambdaFunction::max_used_slot_id() const { return *std::max_element(ids.begin(), ids.end()); } -Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprContext* context) { - RETURN_IF_ERROR(Expr::prepare(state, context)); - if (_is_prepared) { +Status LambdaFunction::collect_common_sub_exprs() { + if (!_common_sub_expr_ids.empty()) { return Status::OK(); } - _is_prepared = true; // common sub expressions include 2 parts in a pair: (slot id, expression) const int child_num = get_num_children() - 2 * _common_sub_expr_num; - RETURN_IF_ERROR(collect_lambda_argument_ids()); // sorted common sub expressions so that the later expressions can reference the previous ones. for (auto i = child_num; i < child_num + _common_sub_expr_num; ++i) { @@ -139,6 +145,18 @@ Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprCo _common_sub_expr.size(), _common_sub_expr_num)); } + return Status::OK(); +} + +Status LambdaFunction::prepare(starrocks::RuntimeState* state, starrocks::ExprContext* context) { + RETURN_IF_ERROR(Expr::prepare(state, context)); + if (_is_prepared) { + return Status::OK(); + } + _is_prepared = true; + + RETURN_IF_ERROR(collect_lambda_argument_ids()); + RETURN_IF_ERROR(collect_common_sub_exprs()); // get slot ids from the lambda expression get_child(0)->get_slot_ids(&_captured_slot_ids); diff --git a/be/src/exprs/lambda_function.h b/be/src/exprs/lambda_function.h index f07725ff18e0e..c6bf1928299f8 100644 --- a/be/src/exprs/lambda_function.h +++ b/be/src/exprs/lambda_function.h @@ -60,6 +60,7 @@ class LambdaFunction final : public Expr { } const std::vector& get_lambda_arguments_ids() const { return _arguments_ids; } + const 
std::vector& get_common_sub_expr_ids() const { return _common_sub_expr_ids; } bool is_lambda_function() const override { return true; } bool is_lambda_expr_independent() const { return _is_lambda_expr_independent; } @@ -71,6 +72,8 @@ class LambdaFunction final : public Expr { struct ExtractContext { std::unordered_set lambda_arguments; + // slot id of common sub expr inside lambda expr + std::unordered_set common_sub_expr_ids; SlotId next_slot_id; std::map outer_common_exprs; }; @@ -89,6 +92,7 @@ class LambdaFunction final : public Expr { private: Status collect_lambda_argument_ids(); Status collect_capture_slot_ids(); + Status collect_common_sub_exprs(); Status extract_outer_common_exprs(RuntimeState* state, Expr* expr, ExtractContext* ctx); std::vector _captured_slot_ids;