From 3c60280d2782523d3a02e2faa77a6da1f5fd10f7 Mon Sep 17 00:00:00 2001 From: eldenmoon Date: Tue, 24 Dec 2024 23:28:08 +0800 Subject: [PATCH] fix 10 --- .../olap/rowset/segment_v2/column_reader.cpp | 8 ++++---- .../segment_v2/hierarchical_data_reader.cpp | 18 ++++++++++++++++++ .../segment_v2/variant_column_writer_impl.cpp | 2 ++ be/src/vec/columns/column_object.cpp | 14 ++++++++++++++ .../variant_p0/compaction/test_compaction.out | 16 ++++++++-------- 5 files changed, 46 insertions(+), 12 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index 99a5dfc8c242a4c..b0cc41c288bd5a9 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -331,11 +331,11 @@ Status VariantColumnReader::init(const ColumnReaderOptions& opts, const SegmentF if (self_column_pb.has_variant_statistics()) { _statistics = std::make_unique(); const auto& variant_stats = self_column_pb.variant_statistics(); - for (const auto& [path, _] : variant_stats.sparse_column_non_null_size()) { - _statistics->sparse_column_non_null_size.emplace(path.data(), path.size()); + for (const auto& [path, size] : variant_stats.sparse_column_non_null_size()) { + _statistics->sparse_column_non_null_size.emplace(path, size); } - for (const auto& [path, _] : variant_stats.subcolumn_non_null_size()) { - _statistics->subcolumns_non_null_size.emplace(path.data(), path.size()); + for (const auto& [path, size] : variant_stats.subcolumn_non_null_size()) { + _statistics->subcolumns_non_null_size.emplace(path, size); } } return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp index 38ac20807da6ae2..651cfb696556dbb 100644 --- a/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp +++ b/be/src/olap/rowset/segment_v2/hierarchical_data_reader.cpp @@ -353,8 +353,26 @@ Status 
HierarchicalDataReader::_process_sparse_column(vectorized::ColumnObject& auto sub_path = get_sub_path(path, path_prefix); sparse_data_paths->insert_data(sub_path.data(), sub_path.size()); sparse_data_values->insert_from(src_sparse_data_values, lower_bound_index); + } else { + // insert into root column, example: access v['b'] and b is in sparse column + // data example: + // {"b" : 123} + // {"b" : {"c" : 456}} + // b may be in sparse column, and b.c is in subcolumn, put `b` into root column to distinguish + // from "" which is empty path and root + if (container_variant.is_null_root()) { + container_variant.add_sub_column({}, sparse_data_offsets.size()); + } + const auto& data = ColumnObject::deserialize_from_sparse_column( + &src_sparse_data_values, lower_bound_index); + container_variant.get_subcolumn({})->insert(data.first, data.second); } } + // if root was created, and not seen in sparse data, insert default + if (!container_variant.is_null_root() && + container_variant.get_subcolumn({})->size() == sparse_data_offsets.size()) { + container_variant.get_subcolumn({})->insert_default(); + } sparse_data_offsets.push_back(sparse_data_paths->size()); } } diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp index 2fe9f642100fff1..0326e31f09619ea 100644 --- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp @@ -110,6 +110,7 @@ Status VariantColumnWriterImpl::_get_subcolumn_paths_from_stats(std::setfirst << " as sparse column"; sorted_src_subcolumn_for_sparse_column.emplace_back(it->first, it->second); ++it; } @@ -1967,6 +1968,10 @@ Status ColumnObject::finalize(FinalizeMode mode) { // 3. 
pick MAX_SUBCOLUMNS selected subcolumns for (size_t i = 0; i < std::min(MAX_SUBCOLUMNS, sorted_by_size.size()); ++i) { + // if too many null values, then consider it as sparse column + if (sorted_by_size[i].second < num_rows * 0.95) { + continue; + } selected_path.insert(sorted_by_size[i].first); } std::map remaing_subcolumns; @@ -1975,6 +1980,7 @@ Status ColumnObject::finalize(FinalizeMode mode) { if (selected_path.find(entry->path.get_path()) != selected_path.end()) { new_subcolumns.add(entry->path, entry->data); } else { + VLOG_DEBUG << "pick " << entry->path.get_path() << " as sparse column"; remaing_subcolumns.emplace(entry->path.get_path(), entry->data); } } @@ -2143,7 +2149,15 @@ const DataTypePtr ColumnObject::NESTED_TYPE = std::make_shared(std::make_shared( std::make_shared()))); +// const size_t ColumnObject::MAX_SUBCOLUMNS = 5; +#ifndef NDEBUG +const size_t ColumnObject::MAX_SUBCOLUMNS = []() -> size_t { + std::srand(std::time(nullptr)); // seed the random number generator + return 1 + std::rand() % 10; // random value in the range [1, 10] +}(); +#else const size_t ColumnObject::MAX_SUBCOLUMNS = 5; +#endif DataTypePtr ColumnObject::get_root_type() const { return subcolumns.get_root()->data.get_least_common_type(); diff --git a/regression-test/data/variant_p0/compaction/test_compaction.out b/regression-test/data/variant_p0/compaction/test_compaction.out index 0b905e3930fc7f1..7ccf1277bc0b013 100644 --- a/regression-test/data/variant_p0/compaction/test_compaction.out +++ b/regression-test/data/variant_p0/compaction/test_compaction.out @@ -8,8 +8,8 @@ 3 {"x":[3]} 4 {"y":1} 4 {"y":1} -5 {"z":2.0} -5 {"z":2.0} +5 {"z":2} +5 {"z":2} 6 {"x":111} 6 {"x":111} 7 {"m":1} @@ -96,8 +96,8 @@ 3 {"x":[3]} 4 {"y":1} 4 {"y":1} -5 {"z":2.0} -5 {"z":2.0} +5 {"z":2} +5 {"z":2} 6 {"x":111} 6 {"x":111} 7 {"m":1} @@ -180,7 +180,7 @@ 2 {"a":"1"} 3 {"x":[3]} 4 {"y":1} -5 {"z":2.0} +5 {"z":2} 6 {"x":111} 7 {"m":1} 8 {"l":2} @@ -233,7 +233,7 @@ 2 {"a":"1"} 3 {"x":[3]} 4 {"y":1} -5 {"z":2.0} +5 {"z":2} 6 {"x":111} 7 {"m":1} 8 
{"l":2} @@ -284,7 +284,7 @@ 2 {"a":"1"} 3 {"x":[3]} 4 {"y":1} -5 {"z":2.0} +5 {"z":2} 6 {"x":111} 7 {"m":1} 8 {"l":2} @@ -337,7 +337,7 @@ 2 {"a":"1"} 3 {"x":[3]} 4 {"y":1} -5 {"z":2.0} +5 {"z":2} 6 {"x":111} 7 {"m":1} 8 {"l":2}