diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index 59971f01a4df24..e79f45f3d17fa8 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -98,6 +98,10 @@ std::unordered_map> s_task_signatur std::atomic_ulong s_report_version(time(nullptr) * 10000); +void increase_report_version() { + s_report_version.fetch_add(1, std::memory_order_relaxed); +} + // FIXME(plat1ko): Paired register and remove task info bool register_task_info(const TTaskType::type task_type, int64_t signature) { if (task_type == TTaskType::type::PUSH_STORAGE_POLICY || @@ -197,7 +201,7 @@ void alter_tablet(StorageEngine& engine, const TAgentTaskRequest& agent_task_req } if (status.ok()) { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); } // Return result to fe @@ -1363,7 +1367,9 @@ void create_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req) COUNTER_UPDATE(profile->total_time_counter(), elapsed_time); std::stringstream ss; profile->pretty_print(&ss); - LOG(WARNING) << "create tablet cost(s) " << elapsed_time / 1e9 << std::endl << ss.str(); + LOG(WARNING) << "create tablet " << create_tablet_req.tablet_id << " cost(s) " + << elapsed_time / 1e9 << std::endl + << ss.str(); } }); DorisMetrics::instance()->create_tablet_requests_total->increment(1); @@ -1379,7 +1385,7 @@ void create_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req) .tag("tablet_id", create_tablet_req.tablet_id) .error(status); } else { - s_report_version.fetch_add(1, std::memory_order_relaxed); + increase_report_version(); // get path hash of the created tablet TabletSharedPtr tablet; { @@ -1474,7 +1480,7 @@ void push_callback(const TAgentTaskRequest& req) { .tag("signature", req.signature) .tag("tablet_id", push_req.tablet_id) .tag("push_type", push_req.push_type); - ++s_report_version; + increase_report_version(); finish_task_request.__set_finish_tablet_infos(tablet_infos); } else { 
LOG_WARNING("failed to execute push task") @@ -1741,6 +1747,10 @@ void clone_callback(StorageEngine& engine, const TMasterInfo& master_info, LOG_INFO("successfully clone tablet") .tag("signature", req.signature) .tag("tablet_id", clone_req.tablet_id); + if (engine_task.is_new_tablet()) { + increase_report_version(); + finish_task_request.__set_report_version(s_report_version); + } finish_task_request.__set_finish_tablet_infos(tablet_infos); } diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 7bd237baf4970c..0289fadb716857 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -378,6 +378,7 @@ DEFINE_mInt32(max_single_replica_compaction_threads, "-1"); DEFINE_Bool(enable_base_compaction_idle_sched, "true"); DEFINE_mInt64(base_compaction_min_rowset_num, "5"); +DEFINE_mInt64(base_compaction_max_compaction_score, "20"); DEFINE_mDouble(base_compaction_min_data_ratio, "0.3"); DEFINE_mInt64(base_compaction_dup_key_max_file_size_mbytes, "1024"); @@ -408,6 +409,7 @@ DEFINE_mInt64(compaction_min_size_mbytes, "64"); // cumulative compaction policy: min and max delta file's number DEFINE_mInt64(cumulative_compaction_min_deltas, "5"); DEFINE_mInt64(cumulative_compaction_max_deltas, "1000"); +DEFINE_mInt32(cumulative_compaction_max_deltas_factor, "10"); // This config can be set to limit thread number in multiget thread pool. 
DEFINE_mInt32(multi_get_max_threads, "10"); @@ -1033,11 +1035,11 @@ DEFINE_Int32(inverted_index_read_buffer_size, "4096"); // tree depth for bkd index DEFINE_Int32(max_depth_in_bkd_tree, "32"); // index compaction -DEFINE_mBool(inverted_index_compaction_enable, "false"); +DEFINE_mBool(inverted_index_compaction_enable, "true"); // Only for debug, do not use in production DEFINE_mBool(debug_inverted_index_compaction, "false"); // index by RAM directory -DEFINE_mBool(inverted_index_ram_dir_enable, "false"); +DEFINE_mBool(inverted_index_ram_dir_enable, "true"); // use num_broadcast_buffer blocks as buffer to do broadcast DEFINE_Int32(num_broadcast_buffer, "32"); @@ -1051,8 +1053,6 @@ DEFINE_mInt64(max_tablet_io_errors, "-1"); DEFINE_Int32(tablet_path_check_interval_seconds, "-1"); DEFINE_mInt32(tablet_path_check_batch_size, "1000"); -// Page size of row column, default 4KB -DEFINE_mInt64(row_column_page_size, "4096"); // it must be larger than or equal to 5MB DEFINE_mInt32(s3_write_buffer_size, "5242880"); // The timeout config for S3 buffer allocation @@ -1256,10 +1256,14 @@ DEFINE_Int64(min_row_group_size, "134217728"); // The time out milliseconds for remote fetch schema RPC, default 60s DEFINE_mInt64(fetch_remote_schema_rpc_timeout_ms, "60000"); +DEFINE_mInt64(compaction_memory_bytes_limit, "1073741824"); + +DEFINE_mInt64(compaction_batch_size, "-1"); + // If set to false, the parquet reader will not use page index to filter data. // This is only for debug purpose, in case sometimes the page index // filter wrong data. 
-DEFINE_mBool(enable_parquet_page_index, "true"); +DEFINE_mBool(enable_parquet_page_index, "false"); DEFINE_mBool(ignore_not_found_file_in_external_table, "true"); diff --git a/be/src/common/config.h b/be/src/common/config.h index 2514b4f2fa86de..d226623f0e5496 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -434,6 +434,7 @@ DECLARE_mInt32(max_single_replica_compaction_threads); DECLARE_Bool(enable_base_compaction_idle_sched); DECLARE_mInt64(base_compaction_min_rowset_num); +DECLARE_mInt64(base_compaction_max_compaction_score); DECLARE_mDouble(base_compaction_min_data_ratio); DECLARE_mInt64(base_compaction_dup_key_max_file_size_mbytes); @@ -464,6 +465,7 @@ DECLARE_mInt64(compaction_min_size_mbytes); // cumulative compaction policy: min and max delta file's number DECLARE_mInt64(cumulative_compaction_min_deltas); DECLARE_mInt64(cumulative_compaction_max_deltas); +DECLARE_mInt32(cumulative_compaction_max_deltas_factor); // This config can be set to limit thread number in multiget thread pool. DECLARE_mInt32(multi_get_max_threads); @@ -1095,8 +1097,6 @@ DECLARE_mInt64(max_tablet_io_errors); DECLARE_Int32(tablet_path_check_interval_seconds); DECLARE_mInt32(tablet_path_check_batch_size); -// Page size of row column, default 4KB -DECLARE_mInt64(row_column_page_size); // it must be larger than or equal to 5MB DECLARE_mInt32(s3_write_buffer_size); // The timeout config for S3 buffer allocation @@ -1346,6 +1346,10 @@ DECLARE_mInt64(fetch_remote_schema_rpc_timeout_ms); // The minimum row group size when exporting Parquet files. 
DECLARE_Int64(min_row_group_size); +DECLARE_mInt64(compaction_memory_bytes_limit); + +DECLARE_mInt64(compaction_batch_size); + DECLARE_mBool(enable_parquet_page_index); // Wheather to ignore not found file in external teble(eg, hive) diff --git a/be/src/common/status.cpp b/be/src/common/status.cpp index d17e18951c5615..cc6c10c29414de 100644 --- a/be/src/common/status.cpp +++ b/be/src/common/status.cpp @@ -34,6 +34,13 @@ void Status::to_thrift(TStatus* s) const { // << "The error code has to > 0 because TStatusCode need it > 0, it's actual value is " // << _code; s->status_code = (int16_t)_code > 0 ? (TStatusCode::type)_code : TStatusCode::INTERNAL_ERROR; + + if (_code == ErrorCode::VERSION_ALREADY_MERGED) { + s->status_code = TStatusCode::OLAP_ERR_VERSION_ALREADY_MERGED; + } else if (_code == ErrorCode::TABLE_NOT_FOUND) { + s->status_code = TStatusCode::TABLET_MISSING; + } + s->error_msgs.push_back(fmt::format("({})[{}]{}", BackendOptions::get_localhost(), code_as_string(), _err_msg ? _err_msg->_msg : "")); s->__isset.error_msgs = true; diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp index 2b6b2c1f3c0321..d9cafcf9049a53 100644 --- a/be/src/exec/schema_scanner.cpp +++ b/be/src/exec/schema_scanner.cpp @@ -50,7 +50,10 @@ #include "exec/schema_scanner/schema_workload_groups_scanner.h" #include "exec/schema_scanner/schema_workload_sched_policy_scanner.h" #include "olap/hll.h" +#include "pipeline/pipeline_x/dependency.h" #include "runtime/define_primitive_type.h" +#include "runtime/fragment_mgr.h" +#include "runtime/types.h" #include "util/string_util.h" #include "util/types.h" #include "vec/columns/column.h" @@ -64,6 +67,7 @@ #include "vec/core/column_with_type_and_name.h" #include "vec/core/types.h" #include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_factory.hpp" namespace doris { class ObjectPool; @@ -84,7 +88,60 @@ Status SchemaScanner::start(RuntimeState* state) { return Status::OK(); } -Status 
SchemaScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaScanner::get_next_block(RuntimeState* state, vectorized::Block* block, bool* eos) { + if (_data_block == nullptr) { + return Status::InternalError("No data left!"); + } + DCHECK(_async_thread_running == false); + RETURN_IF_ERROR(_scanner_status.status()); + for (size_t i = 0; i < block->columns(); i++) { + std::move(*block->get_by_position(i).column) + .mutate() + ->insert_range_from(*_data_block->get_by_position(i).column, 0, + _data_block->rows()); + } + _data_block->clear_column_data(); + *eos = _eos; + if (!*eos) { + RETURN_IF_ERROR(get_next_block_async(state)); + } + return Status::OK(); +} + +Status SchemaScanner::get_next_block_async(RuntimeState* state) { + _dependency->block(); + auto task_ctx = state->get_task_execution_context(); + RETURN_IF_ERROR(ExecEnv::GetInstance()->fragment_mgr()->get_thread_pool()->submit_func( + [this, task_ctx, state]() { + DCHECK(_async_thread_running == false); + auto task_lock = task_ctx.lock(); + if (task_lock == nullptr) { + _scanner_status.update(Status::InternalError("Task context not exists!")); + return; + } + SCOPED_ATTACH_TASK(state); + _dependency->block(); + _async_thread_running = true; + _finish_dependency->block(); + if (!_opened) { + _data_block = vectorized::Block::create_unique(); + _init_block(_data_block.get()); + _scanner_status.update(start(state)); + _opened = true; + } + bool eos = false; + _scanner_status.update(get_next_block_internal(_data_block.get(), &eos)); + _eos = eos; + _async_thread_running = false; + _dependency->set_ready(); + if (eos) { + _finish_dependency->set_ready(); + } + })); + return Status::OK(); +} + +Status SchemaScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("used before initialized."); } @@ -173,6 +230,16 @@ std::unique_ptr SchemaScanner::create(TSchemaTableType::type type } } +void SchemaScanner::_init_block(vectorized::Block* 
src_block) { + const std::vector& columns_desc(get_column_desc()); + for (int i = 0; i < columns_desc.size(); ++i) { + TypeDescriptor descriptor(columns_desc[i].type); + auto data_type = vectorized::DataTypeFactory::instance().create_data_type(descriptor, true); + src_block->insert(vectorized::ColumnWithTypeAndName(data_type->create_column(), data_type, + columns_desc[i].name)); + } +} + Status SchemaScanner::fill_dest_column_for_range(vectorized::Block* block, size_t pos, const std::vector& datas) { const ColumnDesc& col_desc = _columns[pos]; diff --git a/be/src/exec/schema_scanner.h b/be/src/exec/schema_scanner.h index a23706ac6a440a..da61d58b943fc4 100644 --- a/be/src/exec/schema_scanner.h +++ b/be/src/exec/schema_scanner.h @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -43,6 +44,10 @@ namespace vectorized { class Block; } +namespace pipeline { +class Dependency; +} + struct SchemaScannerCommonParam { SchemaScannerCommonParam() : db(nullptr), @@ -64,6 +69,7 @@ struct SchemaScannerCommonParam { int32_t port; // frontend thrift port int64_t thread_id; const std::string* catalog = nullptr; + std::set fe_addr_list; }; // scanner parameter from frontend @@ -94,15 +100,23 @@ class SchemaScanner { // init object need information, schema etc. 
virtual Status init(SchemaScannerParam* param, ObjectPool* pool); + Status get_next_block(RuntimeState* state, vectorized::Block* block, bool* eos); // Start to work virtual Status start(RuntimeState* state); - virtual Status get_next_block(vectorized::Block* block, bool* eos); + virtual Status get_next_block_internal(vectorized::Block* block, bool* eos); const std::vector& get_column_desc() const { return _columns; } // factory function static std::unique_ptr create(TSchemaTableType::type type); TSchemaTableType::type type() const { return _schema_table_type; } + void set_dependency(std::shared_ptr dep, + std::shared_ptr fin_dep) { + _dependency = dep; + _finish_dependency = fin_dep; + } + Status get_next_block_async(RuntimeState* state); protected: + void _init_block(vectorized::Block* src_block); Status fill_dest_column_for_range(vectorized::Block* block, size_t pos, const std::vector& datas); @@ -125,6 +139,15 @@ class SchemaScanner { RuntimeProfile::Counter* _get_table_timer = nullptr; RuntimeProfile::Counter* _get_describe_timer = nullptr; RuntimeProfile::Counter* _fill_block_timer = nullptr; + + std::shared_ptr _dependency = nullptr; + std::shared_ptr _finish_dependency = nullptr; + + std::unique_ptr _data_block; + AtomicStatus _scanner_status; + std::atomic _eos = false; + std::atomic _opened = false; + std::atomic _async_thread_running = false; }; } // namespace doris diff --git a/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp b/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp index 2115a38a6ebce3..46522a36242fc1 100644 --- a/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_active_queries_scanner.cpp @@ -137,7 +137,7 @@ Status SchemaActiveQueriesScanner::_get_active_queries_block_from_fe() { return Status::OK(); } -Status SchemaActiveQueriesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaActiveQueriesScanner::get_next_block_internal(vectorized::Block* 
block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_active_queries_scanner.h b/be/src/exec/schema_scanner/schema_active_queries_scanner.h index 1df5b1f9d7402d..7e9ae4b8034083 100644 --- a/be/src/exec/schema_scanner/schema_active_queries_scanner.h +++ b/be/src/exec/schema_scanner/schema_active_queries_scanner.h @@ -36,7 +36,7 @@ class SchemaActiveQueriesScanner : public SchemaScanner { ~SchemaActiveQueriesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp b/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp index f1155796ed434d..b35e84a9f9c9f4 100644 --- a/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp +++ b/be/src/exec/schema_scanner/schema_backend_active_tasks.cpp @@ -51,7 +51,8 @@ Status SchemaBackendActiveTasksScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaBackendActiveTasksScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaBackendActiveTasksScanner::get_next_block_internal(vectorized::Block* block, + bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_backend_active_tasks.h b/be/src/exec/schema_scanner/schema_backend_active_tasks.h index d8a2a1ffa3f96a..43819818b57f69 100644 --- a/be/src/exec/schema_scanner/schema_backend_active_tasks.h +++ b/be/src/exec/schema_scanner/schema_backend_active_tasks.h @@ -36,7 +36,7 @@ class SchemaBackendActiveTasksScanner : public SchemaScanner { ~SchemaBackendActiveTasksScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status 
get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_charsets_scanner.cpp b/be/src/exec/schema_scanner/schema_charsets_scanner.cpp index 9bd7ad7919cdc8..ff42e7f5a059ac 100644 --- a/be/src/exec/schema_scanner/schema_charsets_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_charsets_scanner.cpp @@ -48,7 +48,7 @@ SchemaCharsetsScanner::SchemaCharsetsScanner() SchemaCharsetsScanner::~SchemaCharsetsScanner() {} -Status SchemaCharsetsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaCharsetsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_charsets_scanner.h b/be/src/exec/schema_scanner/schema_charsets_scanner.h index 1f01070875ccf6..d5089c62826b0b 100644 --- a/be/src/exec/schema_scanner/schema_charsets_scanner.h +++ b/be/src/exec/schema_scanner/schema_charsets_scanner.h @@ -36,7 +36,7 @@ class SchemaCharsetsScanner : public SchemaScanner { SchemaCharsetsScanner(); ~SchemaCharsetsScanner() override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: struct CharsetStruct { diff --git a/be/src/exec/schema_scanner/schema_collations_scanner.cpp b/be/src/exec/schema_scanner/schema_collations_scanner.cpp index 812a8cff18e997..271c9a6fb78485 100644 --- a/be/src/exec/schema_scanner/schema_collations_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_collations_scanner.cpp @@ -50,7 +50,7 @@ SchemaCollationsScanner::SchemaCollationsScanner() SchemaCollationsScanner::~SchemaCollationsScanner() {} -Status SchemaCollationsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaCollationsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return 
Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_collations_scanner.h b/be/src/exec/schema_scanner/schema_collations_scanner.h index f0f60538cacce0..2fe200da78d04d 100644 --- a/be/src/exec/schema_scanner/schema_collations_scanner.h +++ b/be/src/exec/schema_scanner/schema_collations_scanner.h @@ -36,7 +36,7 @@ class SchemaCollationsScanner : public SchemaScanner { SchemaCollationsScanner(); ~SchemaCollationsScanner() override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: struct CollationStruct { diff --git a/be/src/exec/schema_scanner/schema_columns_scanner.cpp b/be/src/exec/schema_scanner/schema_columns_scanner.cpp index 763f24b9e531ce..440370c36c9aec 100644 --- a/be/src/exec/schema_scanner/schema_columns_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_columns_scanner.cpp @@ -347,7 +347,7 @@ Status SchemaColumnsScanner::_get_new_table() { return Status::OK(); } -Status SchemaColumnsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaColumnsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("use this class before inited."); } diff --git a/be/src/exec/schema_scanner/schema_columns_scanner.h b/be/src/exec/schema_scanner/schema_columns_scanner.h index 2499db7ed82a2b..99150c36d109a2 100644 --- a/be/src/exec/schema_scanner/schema_columns_scanner.h +++ b/be/src/exec/schema_scanner/schema_columns_scanner.h @@ -38,7 +38,7 @@ class SchemaColumnsScanner : public SchemaScanner { SchemaColumnsScanner(); ~SchemaColumnsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git 
a/be/src/exec/schema_scanner/schema_dummy_scanner.cpp b/be/src/exec/schema_scanner/schema_dummy_scanner.cpp index 1d5956f390ea26..9e3a703d9fb5d6 100644 --- a/be/src/exec/schema_scanner/schema_dummy_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_dummy_scanner.cpp @@ -40,7 +40,7 @@ Status SchemaDummyScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaDummyScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaDummyScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { *eos = true; return Status::OK(); } diff --git a/be/src/exec/schema_scanner/schema_dummy_scanner.h b/be/src/exec/schema_scanner/schema_dummy_scanner.h index a67f6fa25c1648..0c5e4aabe357e4 100644 --- a/be/src/exec/schema_scanner/schema_dummy_scanner.h +++ b/be/src/exec/schema_scanner/schema_dummy_scanner.h @@ -33,7 +33,7 @@ class SchemaDummyScanner : public SchemaScanner { SchemaDummyScanner(); ~SchemaDummyScanner() override; Status start(RuntimeState* state = nullptr) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; }; } // namespace doris diff --git a/be/src/exec/schema_scanner/schema_files_scanner.cpp b/be/src/exec/schema_scanner/schema_files_scanner.cpp index 55b7a338c319e8..20aa07fa69116c 100644 --- a/be/src/exec/schema_scanner/schema_files_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_files_scanner.cpp @@ -113,7 +113,7 @@ Status SchemaFilesScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaFilesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaFilesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_files_scanner.h b/be/src/exec/schema_scanner/schema_files_scanner.h index 6805a04be4aacc..bb3b2d68493147 100644 --- 
a/be/src/exec/schema_scanner/schema_files_scanner.h +++ b/be/src/exec/schema_scanner/schema_files_scanner.h @@ -38,7 +38,7 @@ class SchemaFilesScanner : public SchemaScanner { ~SchemaFilesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; int _db_index; int _table_index; diff --git a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp index ef7b2b69c1e710..aacd94315249ba 100644 --- a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.cpp @@ -225,7 +225,7 @@ Status SchemaMetadataNameIdsScanner::_fill_block_impl(vectorized::Block* block) return Status::OK(); } -Status SchemaMetadataNameIdsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaMetadataNameIdsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h index 9981d441d856aa..c3beea7769754d 100644 --- a/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h +++ b/be/src/exec/schema_scanner/schema_metadata_name_ids_scanner.h @@ -39,7 +39,7 @@ class SchemaMetadataNameIdsScanner : public SchemaScanner { ~SchemaMetadataNameIdsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_partitions_scanner.cpp b/be/src/exec/schema_scanner/schema_partitions_scanner.cpp index f1ad1f594f883f..ea7394e15e12d2 100644 
--- a/be/src/exec/schema_scanner/schema_partitions_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_partitions_scanner.cpp @@ -101,7 +101,7 @@ Status SchemaPartitionsScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaPartitionsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaPartitionsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_partitions_scanner.h b/be/src/exec/schema_scanner/schema_partitions_scanner.h index 47e1d1fcf87d15..87e55db984a3de 100644 --- a/be/src/exec/schema_scanner/schema_partitions_scanner.h +++ b/be/src/exec/schema_scanner/schema_partitions_scanner.h @@ -38,7 +38,7 @@ class SchemaPartitionsScanner : public SchemaScanner { ~SchemaPartitionsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; int _db_index; int _table_index; diff --git a/be/src/exec/schema_scanner/schema_processlist_scanner.cpp b/be/src/exec/schema_scanner/schema_processlist_scanner.cpp index f5f5bc236343b9..c65e1d14c2c5ad 100644 --- a/be/src/exec/schema_scanner/schema_processlist_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_processlist_scanner.cpp @@ -55,14 +55,19 @@ Status SchemaProcessListScanner::start(RuntimeState* state) { TShowProcessListRequest request; request.__set_show_full_sql(true); - RETURN_IF_ERROR(SchemaHelper::show_process_list(*(_param->common_param->ip), - _param->common_param->port, request, - &_process_list_result)); + for (const auto& fe_addr : _param->common_param->fe_addr_list) { + TShowProcessListResult tmp_ret; + RETURN_IF_ERROR( + SchemaHelper::show_process_list(fe_addr.hostname, fe_addr.port, request, &tmp_ret)); + 
_process_list_result.process_list.insert(_process_list_result.process_list.end(), + tmp_ret.process_list.begin(), + tmp_ret.process_list.end()); + } return Status::OK(); } -Status SchemaProcessListScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaProcessListScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_processlist_scanner.h b/be/src/exec/schema_scanner/schema_processlist_scanner.h index 8aae87e1ef6d0f..c0b0a47f6154ee 100644 --- a/be/src/exec/schema_scanner/schema_processlist_scanner.h +++ b/be/src/exec/schema_scanner/schema_processlist_scanner.h @@ -40,7 +40,7 @@ class SchemaProcessListScanner : public SchemaScanner { ~SchemaProcessListScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_processlist_columns; diff --git a/be/src/exec/schema_scanner/schema_profiling_scanner.cpp b/be/src/exec/schema_scanner/schema_profiling_scanner.cpp index 2f71eb96f2613a..0a2a64330bb018 100644 --- a/be/src/exec/schema_scanner/schema_profiling_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_profiling_scanner.cpp @@ -88,7 +88,7 @@ Status SchemaProfilingScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaProfilingScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaProfilingScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_profiling_scanner.h b/be/src/exec/schema_scanner/schema_profiling_scanner.h index 5399cb14eb43f5..6b969a478aca69 100644 --- a/be/src/exec/schema_scanner/schema_profiling_scanner.h +++ 
b/be/src/exec/schema_scanner/schema_profiling_scanner.h @@ -38,7 +38,7 @@ class SchemaProfilingScanner : public SchemaScanner { ~SchemaProfilingScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; }; diff --git a/be/src/exec/schema_scanner/schema_routine_scanner.cpp b/be/src/exec/schema_scanner/schema_routine_scanner.cpp index 3d55addee6c093..8c263c99d2d6c8 100644 --- a/be/src/exec/schema_scanner/schema_routine_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_routine_scanner.cpp @@ -141,7 +141,7 @@ Status SchemaRoutinesScanner::get_block_from_fe() { return Status::OK(); } -Status SchemaRoutinesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaRoutinesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_routine_scanner.h b/be/src/exec/schema_scanner/schema_routine_scanner.h index 543f9e8e8f684a..c60d72340e1104 100644 --- a/be/src/exec/schema_scanner/schema_routine_scanner.h +++ b/be/src/exec/schema_scanner/schema_routine_scanner.h @@ -36,7 +36,7 @@ class SchemaRoutinesScanner : public SchemaScanner { ~SchemaRoutinesScanner() override = default; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp b/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp index b760d8bde04acc..8bfd785ce5df2e 100644 --- a/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_rowsets_scanner.cpp @@ -92,7 +92,7 @@ Status 
SchemaRowsetsScanner::_get_all_rowsets() { return Status::OK(); } -Status SchemaRowsetsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaRowsetsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_rowsets_scanner.h b/be/src/exec/schema_scanner/schema_rowsets_scanner.h index b975cc4231bc20..cad34fc04945e4 100644 --- a/be/src/exec/schema_scanner/schema_rowsets_scanner.h +++ b/be/src/exec/schema_scanner/schema_rowsets_scanner.h @@ -40,7 +40,7 @@ class SchemaRowsetsScanner : public SchemaScanner { ~SchemaRowsetsScanner() override = default; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_all_rowsets(); diff --git a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp index 09c470ff50a3a7..c24b1fbb071264 100644 --- a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.cpp @@ -82,7 +82,7 @@ Status SchemaSchemaPrivilegesScanner::_get_new_table() { return Status::OK(); } -Status SchemaSchemaPrivilegesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaSchemaPrivilegesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h index af2ad49634bd49..9522fba908bb2a 100644 --- a/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h +++ b/be/src/exec/schema_scanner/schema_schema_privileges_scanner.h @@ -38,7 +38,7 @@ class 
SchemaSchemaPrivilegesScanner : public SchemaScanner { ~SchemaSchemaPrivilegesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_schemata_scanner.cpp b/be/src/exec/schema_scanner/schema_schemata_scanner.cpp index e09817ca31044b..e70b3b7a32ec23 100644 --- a/be/src/exec/schema_scanner/schema_schemata_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_schemata_scanner.cpp @@ -81,7 +81,7 @@ Status SchemaSchemataScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaSchemataScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaSchemataScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before Initialized."); } diff --git a/be/src/exec/schema_scanner/schema_schemata_scanner.h b/be/src/exec/schema_scanner/schema_schemata_scanner.h index 46fad31af1fd5e..39a5ddda495bdd 100644 --- a/be/src/exec/schema_scanner/schema_schemata_scanner.h +++ b/be/src/exec/schema_scanner/schema_schemata_scanner.h @@ -38,7 +38,7 @@ class SchemaSchemataScanner : public SchemaScanner { ~SchemaSchemataScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _fill_block_impl(vectorized::Block* block); diff --git a/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp b/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp index 41a3faf7c5a8a4..4d2e6246656ba9 100644 --- a/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_table_privileges_scanner.cpp @@ -84,7 +84,7 @@ Status 
SchemaTablePrivilegesScanner::_get_new_table() { return Status::OK(); } -Status SchemaTablePrivilegesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaTablePrivilegesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_table_privileges_scanner.h b/be/src/exec/schema_scanner/schema_table_privileges_scanner.h index aa79c88304b7c5..4cfcc16d3583ce 100644 --- a/be/src/exec/schema_scanner/schema_table_privileges_scanner.h +++ b/be/src/exec/schema_scanner/schema_table_privileges_scanner.h @@ -38,7 +38,7 @@ class SchemaTablePrivilegesScanner : public SchemaScanner { ~SchemaTablePrivilegesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_tables_scanner.cpp b/be/src/exec/schema_scanner/schema_tables_scanner.cpp index 375ceb2c470f7c..6aaafe1ae57272 100644 --- a/be/src/exec/schema_scanner/schema_tables_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_tables_scanner.cpp @@ -342,7 +342,7 @@ Status SchemaTablesScanner::_fill_block_impl(vectorized::Block* block) { return Status::OK(); } -Status SchemaTablesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaTablesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_tables_scanner.h b/be/src/exec/schema_scanner/schema_tables_scanner.h index 11a96bf65d5271..7f8eb11f397e06 100644 --- a/be/src/exec/schema_scanner/schema_tables_scanner.h +++ b/be/src/exec/schema_scanner/schema_tables_scanner.h @@ -39,7 +39,7 @@ class SchemaTablesScanner : public SchemaScanner 
{ ~SchemaTablesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp b/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp index b636ff65fd73cc..f9f4b272aaaf4c 100644 --- a/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_user_privileges_scanner.cpp @@ -81,7 +81,7 @@ Status SchemaUserPrivilegesScanner::_get_new_table() { return Status::OK(); } -Status SchemaUserPrivilegesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaUserPrivilegesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_user_privileges_scanner.h b/be/src/exec/schema_scanner/schema_user_privileges_scanner.h index eb8f3c63f1433b..ffc3840db676c4 100644 --- a/be/src/exec/schema_scanner/schema_user_privileges_scanner.h +++ b/be/src/exec/schema_scanner/schema_user_privileges_scanner.h @@ -38,7 +38,7 @@ class SchemaUserPrivilegesScanner : public SchemaScanner { ~SchemaUserPrivilegesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_user_scanner.cpp b/be/src/exec/schema_scanner/schema_user_scanner.cpp index 9b153414380350..e56f18f05aea93 100644 --- a/be/src/exec/schema_scanner/schema_user_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_user_scanner.cpp @@ -76,7 +76,7 @@ Status SchemaUserScanner::start(RuntimeState* state) { return Status::OK(); } -Status 
SchemaUserScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaUserScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_user_scanner.h b/be/src/exec/schema_scanner/schema_user_scanner.h index c55f216804d5dd..bdc618eb5a0332 100644 --- a/be/src/exec/schema_scanner/schema_user_scanner.h +++ b/be/src/exec/schema_scanner/schema_user_scanner.h @@ -40,7 +40,7 @@ class SchemaUserScanner : public SchemaScanner { ~SchemaUserScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_user_columns; diff --git a/be/src/exec/schema_scanner/schema_variables_scanner.cpp b/be/src/exec/schema_scanner/schema_variables_scanner.cpp index 491a11f25722ef..445089b36ab370 100644 --- a/be/src/exec/schema_scanner/schema_variables_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_variables_scanner.cpp @@ -70,7 +70,7 @@ Status SchemaVariablesScanner::start(RuntimeState* state) { return Status::OK(); } -Status SchemaVariablesScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaVariablesScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("call this before initial."); } diff --git a/be/src/exec/schema_scanner/schema_variables_scanner.h b/be/src/exec/schema_scanner/schema_variables_scanner.h index 2d207ff8b2e6c2..31bbacf713be0f 100644 --- a/be/src/exec/schema_scanner/schema_variables_scanner.h +++ b/be/src/exec/schema_scanner/schema_variables_scanner.h @@ -40,7 +40,7 @@ class SchemaVariablesScanner : public SchemaScanner { ~SchemaVariablesScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) 
override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: struct VariableStruct { diff --git a/be/src/exec/schema_scanner/schema_views_scanner.cpp b/be/src/exec/schema_scanner/schema_views_scanner.cpp index 7d9ce671a663fd..5b5f0c1b72933b 100644 --- a/be/src/exec/schema_scanner/schema_views_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_views_scanner.cpp @@ -113,7 +113,7 @@ Status SchemaViewsScanner::_get_new_table() { return Status::OK(); } -Status SchemaViewsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaViewsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_views_scanner.h b/be/src/exec/schema_scanner/schema_views_scanner.h index bc473057905a12..b86ad922e5e76a 100644 --- a/be/src/exec/schema_scanner/schema_views_scanner.h +++ b/be/src/exec/schema_scanner/schema_views_scanner.h @@ -38,7 +38,7 @@ class SchemaViewsScanner : public SchemaScanner { ~SchemaViewsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; private: Status _get_new_table(); diff --git a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp index def52df531df7b..8b0c6be536b212 100644 --- a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp @@ -114,7 +114,7 @@ Status SchemaWorkloadGroupsScanner::_get_workload_groups_block_from_fe() { return Status::OK(); } -Status SchemaWorkloadGroupsScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaWorkloadGroupsScanner::get_next_block_internal(vectorized::Block* block, bool* eos) { if (!_is_init) { return 
Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_workload_groups_scanner.h b/be/src/exec/schema_scanner/schema_workload_groups_scanner.h index bf7a103526dc80..3121c4dbac149e 100644 --- a/be/src/exec/schema_scanner/schema_workload_groups_scanner.h +++ b/be/src/exec/schema_scanner/schema_workload_groups_scanner.h @@ -36,7 +36,7 @@ class SchemaWorkloadGroupsScanner : public SchemaScanner { ~SchemaWorkloadGroupsScanner() override; Status start(RuntimeState* state) override; - Status get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp index 035d3bfe217aec..2d91f151f5f2bb 100644 --- a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.cpp @@ -106,7 +106,8 @@ Status SchemaWorkloadSchedulePolicyScanner::_get_workload_schedule_policy_block_ return Status::OK(); } -Status SchemaWorkloadSchedulePolicyScanner::get_next_block(vectorized::Block* block, bool* eos) { +Status SchemaWorkloadSchedulePolicyScanner::get_next_block_internal(vectorized::Block* block, + bool* eos) { if (!_is_init) { return Status::InternalError("Used before initialized."); } diff --git a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h index 5284975fe66b31..da8d9f15c4989e 100644 --- a/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h +++ b/be/src/exec/schema_scanner/schema_workload_sched_policy_scanner.h @@ -36,7 +36,7 @@ class SchemaWorkloadSchedulePolicyScanner : public SchemaScanner { ~SchemaWorkloadSchedulePolicyScanner() override; Status start(RuntimeState* state) override; - Status 
get_next_block(vectorized::Block* block, bool* eos) override; + Status get_next_block_internal(vectorized::Block* block, bool* eos) override; static std::vector _s_tbls_columns; diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 0ea095e9a50a00..b03676d43c5a50 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -1755,8 +1755,8 @@ void IRuntimeFilter::to_protobuf(PMinMaxFilter* filter) { switch (_wrapper->column_type()) { case TYPE_BOOLEAN: { - filter->mutable_min_val()->set_boolval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_boolval(*reinterpret_cast(max_data)); + filter->mutable_min_val()->set_boolval(*reinterpret_cast(min_data)); + filter->mutable_max_val()->set_boolval(*reinterpret_cast(max_data)); return; } case TYPE_TINYINT: { diff --git a/be/src/http/action/show_nested_index_file_action.cpp b/be/src/http/action/show_nested_index_file_action.cpp new file mode 100644 index 00000000000000..08194ee5355c28 --- /dev/null +++ b/be/src/http/action/show_nested_index_file_action.cpp @@ -0,0 +1,80 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "http/action/show_nested_index_file_action.h" + +#include + +#include +#include + +#include "common/status.h" +#include "http/http_channel.h" +#include "http/http_headers.h" +#include "http/http_request.h" +#include "http/http_status.h" +#include "olap/storage_engine.h" +#include "olap/tablet_manager.h" +#include "util/stopwatch.hpp" + +namespace doris { +using namespace ErrorCode; + +const static std::string HEADER_JSON = "application/json"; + +ShowNestedIndexFileAction::ShowNestedIndexFileAction(ExecEnv* exec_env, TPrivilegeHier::type hier, + TPrivilegeType::type ptype) + : HttpHandlerWithAuth(exec_env, hier, ptype) {} + +// show the nested inverted index file in the tablet +Status ShowNestedIndexFileAction::_handle_show_nested_index_file(HttpRequest* req, + std::string* json_meta) { + req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.c_str()); + std::string req_tablet_id = req->param(TABLET_ID_KEY); + uint64_t tablet_id = 0; + try { + tablet_id = std::stoull(req_tablet_id); + } catch (const std::exception& e) { + LOG(WARNING) << "invalid argument.tablet_id:" << req_tablet_id; + return Status::InternalError("convert failed, {}", e.what()); + } + + auto base_tablet = DORIS_TRY(ExecEnv::get_tablet(tablet_id)); + // cast base tablet to tablet + auto tablet = std::dynamic_pointer_cast(base_tablet); + RETURN_IF_ERROR(tablet->show_nested_index_file(json_meta)); + return Status::OK(); +} + +void ShowNestedIndexFileAction::handle(HttpRequest* req) { + MonotonicStopWatch timer; + timer.start(); + + std::string json_meta; + Status status = _handle_show_nested_index_file(req, &json_meta); + std::string status_result = status.to_json(); + timer.stop(); + LOG(INFO) << "handle show_nested_index_file request finished, result:" << status_result + << ", use time = " << timer.elapsed_time() / 1000000 << "ms"; + if (status.ok()) { + HttpChannel::send_reply(req, HttpStatus::OK, json_meta); + } else { + HttpChannel::send_reply(req, 
HttpStatus::INTERNAL_SERVER_ERROR, status_result); + } +} + +} // end namespace doris diff --git a/be/src/http/action/show_nested_index_file_action.h b/be/src/http/action/show_nested_index_file_action.h new file mode 100644 index 00000000000000..913eec0aa27a7e --- /dev/null +++ b/be/src/http/action/show_nested_index_file_action.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include + +#include "common/status.h" +#include "http/http_handler_with_auth.h" + +namespace doris { +class HttpRequest; +class BaseStorageEngine; +class ExecEnv; + +// This action is used to show nested inverted index file in tablet +class ShowNestedIndexFileAction : public HttpHandlerWithAuth { +public: + ShowNestedIndexFileAction(ExecEnv* exec_env, TPrivilegeHier::type hier, + TPrivilegeType::type ptype); + + ~ShowNestedIndexFileAction() override = default; + + void handle(HttpRequest* req) override; + +private: + Status _handle_show_nested_index_file(HttpRequest* req, std::string* json_header); +}; + +} // end namespace doris diff --git a/be/src/io/fs/multi_table_pipe.cpp b/be/src/io/fs/multi_table_pipe.cpp index fa38b6440c1b1a..4469174211e9e8 100644 --- a/be/src/io/fs/multi_table_pipe.cpp +++ b/be/src/io/fs/multi_table_pipe.cpp @@ -324,6 +324,19 @@ void MultiTablePipe::_handle_consumer_finished() { _ctx->number_filtered_rows = _number_filtered_rows; _ctx->number_unselected_rows = _number_unselected_rows; _ctx->commit_infos = _tablet_commit_infos; + + // remove ctx to avoid memory leak. + for (const auto& pair : _planned_tables) { + if (pair.second) { + doris::ExecEnv::GetInstance()->new_load_stream_mgr()->remove(pair.second->id); + } + } + for (const auto& pair : _unplanned_tables) { + if (pair.second) { + doris::ExecEnv::GetInstance()->new_load_stream_mgr()->remove(pair.second->id); + } + } + LOG(INFO) << "all plan for multi-table load complete. 
number_total_rows=" << _ctx->number_total_rows << " number_loaded_rows=" << _ctx->number_loaded_rows << " number_filtered_rows=" << _ctx->number_filtered_rows diff --git a/be/src/olap/base_compaction.cpp b/be/src/olap/base_compaction.cpp index 474909cbf45b65..a9455d453818bb 100644 --- a/be/src/olap/base_compaction.cpp +++ b/be/src/olap/base_compaction.cpp @@ -154,6 +154,16 @@ Status BaseCompaction::pick_rowsets_to_compact() { "situation, no need to do base compaction."); } + int score = 0; + int rowset_cnt = 0; + while (rowset_cnt < _input_rowsets.size()) { + score += _input_rowsets[rowset_cnt++]->rowset_meta()->get_compaction_score(); + if (score > config::base_compaction_max_compaction_score) { + break; + } + } + _input_rowsets.resize(rowset_cnt); + // 1. cumulative rowset must reach base_compaction_num_cumulative_deltas threshold if (_input_rowsets.size() > config::base_compaction_min_rowset_num) { VLOG_NOTICE << "satisfy the base compaction policy. tablet=" << _tablet->tablet_id() diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index 768c69624fa92b..4338986efe6431 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -22,6 +22,7 @@ #include #include "common/status.h" +#include "olap/iterators.h" #include "olap/olap_common.h" #include "olap/tablet_fwd.h" #include "olap/tablet_meta.h" @@ -104,6 +105,10 @@ class BaseTablet { IntCounter* flush_finish_count = nullptr; std::atomic published_count = 0; + std::mutex sample_info_lock; + std::vector sample_infos; + Status last_compaction_status = Status::OK(); + std::atomic read_block_count = 0; std::atomic write_count = 0; std::atomic compaction_count = 0; diff --git a/be/src/olap/column_mapping.h b/be/src/olap/column_mapping.h index 047af1e9d1190b..bf3a6118d76bac 100644 --- a/be/src/olap/column_mapping.h +++ b/be/src/olap/column_mapping.h @@ -30,11 +30,11 @@ struct ColumnMapping { ColumnMapping() = default; virtual ~ColumnMapping() = default; - bool has_reference() const { return 
expr != nullptr || ref_column >= 0; } + bool has_reference() const { return expr != nullptr || ref_column_idx >= 0; } // <0: use default value // >=0: use origin column - int32_t ref_column = -1; + int32_t ref_column_idx = -1; // normally for default value. stores values for filters WrapperField* default_value = nullptr; std::shared_ptr expr; diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index 171b68f30b6428..2256d1c304efb3 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -316,6 +316,15 @@ bool Compaction::handle_ordered_data_compaction() { return st.ok(); } +int64_t Compaction::merge_way_num() { + int64_t way_num = 0; + for (auto&& rowset : _input_rowsets) { + way_num += rowset->rowset_meta()->get_merge_way_num(); + } + + return way_num; +} + Status Compaction::do_compaction_impl(int64_t permits) { OlapStopWatch watch; @@ -324,7 +333,11 @@ Status Compaction::do_compaction_impl(int64_t permits) { int64_t now = UnixMillis(); if (compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION) { - _tablet->set_last_cumu_compaction_success_time(now); + // TIME_SERIES_POLICY, generating an empty rowset doesn't need to update the timestamp. 
+ if (!(_tablet->tablet_meta()->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY && + _output_rowset->num_segments() == 0)) { + _tablet->set_last_cumu_compaction_success_time(now); + } } else if (compaction_type() == ReaderType::READER_BASE_COMPACTION) { _tablet->set_last_base_compaction_success_time(now); } else if (compaction_type() == ReaderType::READER_FULL_COMPACTION) { @@ -363,6 +376,7 @@ Status Compaction::do_compaction_impl(int64_t permits) { _tablet->enable_unique_key_merge_on_write())) { stats.rowid_conversion = &_rowid_conversion; } + int64_t way_num = merge_way_num(); Status res; { @@ -370,13 +384,15 @@ Status Compaction::do_compaction_impl(int64_t permits) { if (vertical_compaction) { res = Merger::vertical_merge_rowsets(_tablet, compaction_type(), _cur_tablet_schema, _input_rs_readers, _output_rs_writer.get(), - get_avg_segment_rows(), &stats); + get_avg_segment_rows(), way_num, &stats); } else { res = Merger::vmerge_rowsets(_tablet, compaction_type(), _cur_tablet_schema, _input_rs_readers, _output_rs_writer.get(), &stats); } } + _tablet->last_compaction_status = res; + if (!res.ok()) { LOG(WARNING) << "fail to do " << compaction_name() << ". res=" << res << ", tablet=" << _tablet->tablet_id() @@ -736,7 +752,11 @@ Status Compaction::do_compaction_impl(int64_t permits) { int64_t now = UnixMillis(); // TODO(yingchun): do the judge in Tablet class if (compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION) { - _tablet->set_last_cumu_compaction_success_time(now); + // TIME_SERIES_POLICY, generating an empty rowset doesn't need to update the timestamp. 
+ if (!(_tablet->tablet_meta()->compaction_policy() == CUMULATIVE_TIME_SERIES_POLICY && + _output_rowset->num_segments() == 0)) { + _tablet->set_last_cumu_compaction_success_time(now); + } } else if (compaction_type() == ReaderType::READER_BASE_COMPACTION) { _tablet->set_last_base_compaction_success_time(now); } else if (compaction_type() == ReaderType::READER_FULL_COMPACTION) { diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h index 5b1580f209defb..5aa3e260194319 100644 --- a/be/src/olap/compaction.h +++ b/be/src/olap/compaction.h @@ -105,6 +105,7 @@ class Compaction { private: bool _check_if_includes_input_rowsets(const RowsetIdUnorderedSet& commit_rowset_ids_set) const; void _load_segment_to_cache(); + int64_t merge_way_num(); protected: // the root tracker for this compaction diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index 42748012cabfc6..04504432f195fa 100644 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -82,8 +82,10 @@ Status CumulativeCompaction::execute_compact_impl() { _state = CompactionState::SUCCESS; // 5. set cumulative level - _tablet->cumulative_compaction_policy()->update_compaction_level(_tablet.get(), _input_rowsets, - _output_rowset); + if (_tablet->tablet_meta()->time_series_compaction_level_threshold() >= 2) { + _tablet->cumulative_compaction_policy()->update_compaction_level( + _tablet.get(), _input_rowsets, _output_rowset); + } // 6. 
set cumulative point _tablet->cumulative_compaction_policy()->update_cumulative_point( @@ -116,11 +118,20 @@ Status CumulativeCompaction::pick_rowsets_to_compact() { << ", tablet=" << _tablet->tablet_id(); } + int64_t max_score = config::cumulative_compaction_max_deltas; + auto process_memory_usage = doris::GlobalMemoryArbitrator::process_memory_usage(); + bool memory_usage_high = process_memory_usage > MemInfo::soft_mem_limit() * 0.8; + if (_tablet->last_compaction_status.is() || memory_usage_high) { + max_score = std::max(config::cumulative_compaction_max_deltas / + config::cumulative_compaction_max_deltas_factor, + config::cumulative_compaction_min_deltas + 1); + } + size_t compaction_score = 0; _tablet->cumulative_compaction_policy()->pick_input_rowsets( - _tablet.get(), candidate_rowsets, config::cumulative_compaction_max_deltas, - config::cumulative_compaction_min_deltas, &_input_rowsets, &_last_delete_version, - &compaction_score, allow_delete_in_cumu_compaction()); + _tablet.get(), candidate_rowsets, max_score, config::cumulative_compaction_min_deltas, + &_input_rowsets, &_last_delete_version, &compaction_score, + allow_delete_in_cumu_compaction()); // Cumulative compaction will process with at least 1 rowset. // So when there is no rowset being chosen, we should return Status::Error(): diff --git a/be/src/olap/iterators.h b/be/src/olap/iterators.h index deb14ff554f658..5d752a2bf735a6 100644 --- a/be/src/olap/iterators.h +++ b/be/src/olap/iterators.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include "common/status.h" @@ -122,6 +123,12 @@ class StorageReadOptions { size_t topn_limit = 0; }; +struct CompactionSampleInfo { + int64_t bytes = 0; + int64_t rows = 0; + int64_t group_data_size; +}; + class RowwiseIterator; using RowwiseIteratorUPtr = std::unique_ptr; class RowwiseIterator { @@ -134,7 +141,13 @@ class RowwiseIterator { // Input options may contain scan range in which this scan. 
// Return Status::OK() if init successfully, // Return other error otherwise - virtual Status init(const StorageReadOptions& opts) = 0; + virtual Status init(const StorageReadOptions& opts) { + return Status::NotSupported("to be implemented"); + } + + virtual Status init(const StorageReadOptions& opts, CompactionSampleInfo* sample_info) { + return Status::NotSupported("to be implemented"); + } // If there is any valid data, this function will load data // into input batch with Status::OK() returned diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp index 5bdfdfd9cacc2a..067280e84d670c 100644 --- a/be/src/olap/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -51,9 +51,9 @@ Status MatchPredicate::evaluate(const vectorized::IndexFieldNameAndTypePair& nam if (iterator == nullptr) { return Status::OK(); } - if (_skip_evaluate(iterator)) { - return Status::Error( - "match predicate evaluate skipped."); + if (_check_evaluate(iterator)) { + return Status::Error( + "phrase queries require setting support_phrase = true"); } auto type = name_with_type.second; const std::string& name = name_with_type.first; @@ -122,13 +122,14 @@ InvertedIndexQueryType MatchPredicate::_to_inverted_index_query_type(MatchType m return ret; } -bool MatchPredicate::_skip_evaluate(InvertedIndexIterator* iterator) const { - if ((_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX || - _match_type == MatchType::MATCH_PHRASE_EDGE) && - iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && - get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == - INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { - return true; +bool MatchPredicate::_check_evaluate(InvertedIndexIterator* iterator) const { + if (_match_type == MatchType::MATCH_PHRASE || _match_type == MatchType::MATCH_PHRASE_PREFIX || + _match_type == MatchType::MATCH_PHRASE_EDGE) { + if 
(iterator->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT && + get_parser_phrase_support_string_from_properties(iterator->get_index_properties()) == + INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) { + return true; + } } return false; } diff --git a/be/src/olap/match_predicate.h b/be/src/olap/match_predicate.h index 17d8e76ac88e11..ad202b7b2427cf 100644 --- a/be/src/olap/match_predicate.h +++ b/be/src/olap/match_predicate.h @@ -79,7 +79,7 @@ class MatchPredicate : public ColumnPredicate { std::string info = "MatchPredicate"; return info; } - bool _skip_evaluate(InvertedIndexIterator* iterator) const; + bool _check_evaluate(InvertedIndexIterator* iterator) const; private: std::string _value; diff --git a/be/src/olap/memtable_flush_executor.cpp b/be/src/olap/memtable_flush_executor.cpp index 4fc48f18edfd49..cf96d5f46a5dea 100644 --- a/be/src/olap/memtable_flush_executor.cpp +++ b/be/src/olap/memtable_flush_executor.cpp @@ -140,6 +140,7 @@ Status FlushToken::_do_flush_memtable(MemTable* memtable, int32_t segment_id, in SCOPED_RAW_TIMER(&duration_ns); SCOPED_ATTACH_TASK(memtable->query_thread_context()); signal::set_signal_task_id(_rowset_writer->load_id()); + signal::tablet_id = memtable->tablet_id(); { SCOPED_CONSUME_MEM_TRACKER(memtable->flush_mem_tracker()); std::unique_ptr block; diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index b73c5bda645563..b2e789b5b5a7c1 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -33,6 +34,8 @@ #include "common/config.h" #include "common/logging.h" +#include "common/status.h" +#include "olap/iterators.h" #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/rowid_conversion.h" @@ -42,6 +45,7 @@ #include "olap/rowset/segment_v2/segment_writer.h" #include "olap/storage_engine.h" #include "olap/tablet.h" +#include "olap/tablet_fwd.h" #include "olap/tablet_reader.h" #include 
"olap/utils.h" #include "util/slice.h" @@ -212,7 +216,8 @@ Status Merger::vertical_compact_one_group( const std::vector& column_group, vectorized::RowSourcesBuffer* row_source_buf, const std::vector& src_rowset_readers, RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment, Statistics* stats_output, - std::vector key_group_cluster_key_idxes) { + std::vector key_group_cluster_key_idxes, int64_t batch_size, + CompactionSampleInfo* sample_info) { // build tablet reader VLOG_NOTICE << "vertical compact one group, max_rows_per_segment=" << max_rows_per_segment; vectorized::VerticalBlockReader reader(row_source_buf); @@ -250,7 +255,8 @@ Status Merger::vertical_compact_one_group( reader_params.return_columns = column_group; reader_params.origin_return_columns = &reader_params.return_columns; - RETURN_IF_ERROR(reader.init(reader_params)); + reader_params.batch_size = batch_size; + RETURN_IF_ERROR(reader.init(reader_params, sample_info)); if (reader_params.record_rowids) { stats_output->rowid_conversion->set_dst_rowset_id(dst_rowset_writer->rowset_id()); @@ -356,6 +362,55 @@ Status Merger::vertical_compact_one_group(TabletSharedPtr tablet, ReaderType rea return Status::OK(); } +int64_t estimate_batch_size(int group_index, BaseTabletSPtr tablet, int64_t way_cnt) { + std::unique_lock lock(tablet->sample_info_lock); + CompactionSampleInfo info = tablet->sample_infos[group_index]; + if (way_cnt <= 0) { + LOG(INFO) << "estimate batch size for vertical compaction, tablet id: " + << tablet->tablet_id() << " way cnt: " << way_cnt; + return 4096 - 32; + } + int64_t block_mem_limit = config::compaction_memory_bytes_limit / way_cnt; + if (tablet->last_compaction_status.is()) { + block_mem_limit /= 4; + } + + int64_t group_data_size = 0; + if (info.group_data_size > 0 && info.bytes > 0 && info.rows > 0) { + float smoothing_factor = 0.5; + group_data_size = int64_t(info.group_data_size * (1 - smoothing_factor) + + info.bytes / info.rows * smoothing_factor); + 
tablet->sample_infos[group_index].group_data_size = group_data_size; + } else if (info.group_data_size > 0 && (info.bytes <= 0 || info.rows <= 0)) { + group_data_size = info.group_data_size; + } else if (info.group_data_size <= 0 && info.bytes > 0 && info.rows > 0) { + group_data_size = info.bytes / info.rows; + tablet->sample_infos[group_index].group_data_size = group_data_size; + } else { + LOG(INFO) << "estimate batch size for vertical compaction, tablet id: " + << tablet->tablet_id() << " group data size: " << info.group_data_size + << " row num: " << info.rows << " consume bytes: " << info.bytes; + return 1024 - 32; + } + + if (group_data_size <= 0) { + LOG(WARNING) << "estimate batch size for vertical compaction, tablet id: " + << tablet->tablet_id() << " unexpected group data size: " << group_data_size; + return 4096 - 32; + } + + tablet->sample_infos[group_index].bytes = 0; + tablet->sample_infos[group_index].rows = 0; + + int64_t batch_size = block_mem_limit / group_data_size; + int64_t res = std::max(std::min(batch_size, int64_t(4096 - 32)), int64_t(32L)); + LOG(INFO) << "estimate batch size for vertical compaction, tablet id: " << tablet->tablet_id() + << " group data size: " << info.group_data_size << " row num: " << info.rows + << " consume bytes: " << info.bytes << " way cnt: " << way_cnt + << " batch size: " << res; + return res; +} + // steps to do vertical merge: // 1. split columns into column groups // 2. 
compact groups one by one, generate a row_source_buf when compact key group @@ -365,7 +420,7 @@ Status Merger::vertical_merge_rowsets(TabletSharedPtr tablet, ReaderType reader_ TabletSchemaSPtr tablet_schema, const std::vector& src_rowset_readers, RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment, - Statistics* stats_output) { + int64_t merge_way_num, Statistics* stats_output) { LOG(INFO) << "Start to do vertical compaction, tablet_id: " << tablet->tablet_id(); std::vector> column_groups; vertical_split_columns(tablet_schema, &column_groups); @@ -376,14 +431,18 @@ Status Merger::vertical_merge_rowsets(TabletSharedPtr tablet, ReaderType reader_ vectorized::RowSourcesBuffer row_sources_buf(tablet->tablet_id(), tablet->tablet_path(), reader_type); + tablet->sample_infos.resize(column_groups.size(), {0, 0, 0}); // compact group one by one for (auto i = 0; i < column_groups.size(); ++i) { VLOG_NOTICE << "row source size: " << row_sources_buf.total_size(); bool is_key = (i == 0); + int64_t batch_size = config::compaction_batch_size != -1 + ? 
config::compaction_batch_size + : estimate_batch_size(i, tablet, merge_way_num); RETURN_IF_ERROR(vertical_compact_one_group( tablet, reader_type, tablet_schema, is_key, column_groups[i], &row_sources_buf, src_rowset_readers, dst_rowset_writer, max_rows_per_segment, stats_output, - key_group_cluster_key_idxes)); + key_group_cluster_key_idxes, batch_size, &(tablet->sample_infos[i]))); if (is_key) { RETURN_IF_ERROR(row_sources_buf.flush()); } diff --git a/be/src/olap/merger.h b/be/src/olap/merger.h index ab948f55ed9e61..49ca1e5227fe6e 100644 --- a/be/src/olap/merger.h +++ b/be/src/olap/merger.h @@ -23,6 +23,7 @@ #include "common/status.h" #include "io/io_common.h" +#include "olap/iterators.h" #include "olap/rowset/rowset_reader.h" #include "olap/tablet.h" #include "olap/tablet_schema.h" @@ -62,7 +63,7 @@ class Merger { static Status vertical_merge_rowsets( TabletSharedPtr tablet, ReaderType reader_type, TabletSchemaSPtr tablet_schema, const std::vector& src_rowset_readers, - RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment, + RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment, int64_t merge_way_num, Statistics* stats_output); public: @@ -75,7 +76,8 @@ class Merger { vectorized::RowSourcesBuffer* row_source_buf, const std::vector& src_rowset_readers, RowsetWriter* dst_rowset_writer, int64_t max_rows_per_segment, Statistics* stats_output, - std::vector key_group_cluster_key_idxes); + std::vector key_group_cluster_key_idxes, int64_t batch_size, + CompactionSampleInfo* sample_info); // for segcompaction static Status vertical_compact_one_group(TabletSharedPtr tablet, ReaderType reader_type, diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index c1a2e3c18b5b66..7e0cf6645d2bff 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -361,10 +361,13 @@ struct OlapReaderStatistics { int64_t inverted_index_query_timer = 0; int64_t inverted_index_query_cache_hit = 0; int64_t inverted_index_query_cache_miss = 0; + 
int64_t inverted_index_query_null_bitmap_timer = 0; int64_t inverted_index_query_bitmap_copy_timer = 0; int64_t inverted_index_query_bitmap_op_timer = 0; int64_t inverted_index_searcher_open_timer = 0; int64_t inverted_index_searcher_search_timer = 0; + int64_t inverted_index_searcher_cache_hit = 0; + int64_t inverted_index_searcher_cache_miss = 0; int64_t output_index_result_column_timer = 0; // number of segment filtered by column stat when creating seg iterator diff --git a/be/src/olap/partial_update_info.cpp b/be/src/olap/partial_update_info.cpp new file mode 100644 index 00000000000000..5867a77559b36d --- /dev/null +++ b/be/src/olap/partial_update_info.cpp @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "olap/partial_update_info.h" + +#include + +#include "olap/tablet_schema.h" + +namespace doris { + +/* Populates this PartialUpdateInfo from the tablet schema and the load options: column ids named in partial_update_cols go to update_cids, all others go to missing_cids, the flags/timestamp/timezone are recorded, and one default value per missing column is generated. The per-column lists are cleared first so that calling init() again does not accumulate entries. */ +void PartialUpdateInfo::init(const TabletSchema& tablet_schema, bool partial_update, + const std::set& partial_update_cols, bool is_strict_mode, + int64_t timestamp_ms, const std::string& timezone, + const std::string& auto_increment_column, int64_t cur_max_version) { + is_partial_update = partial_update; + partial_update_input_columns = partial_update_cols; + max_version_in_flush_phase = cur_max_version; + this->timestamp_ms = timestamp_ms; + this->timezone = timezone; + missing_cids.clear(); + update_cids.clear(); + default_values.clear(); /* keep default_values in step with missing_cids on re-init; otherwise the CHECK_EQ in _generate_default_values_for_missing_cids fails */ + for (auto i = 0; i < tablet_schema.num_columns(); ++i) { + const auto& tablet_column = tablet_schema.column(i); /* bind by reference (as _generate_default_values_for_missing_cids does) instead of copying the column each iteration */ + if (!partial_update_input_columns.contains(tablet_column.name())) { + missing_cids.emplace_back(i); + if (!tablet_column.has_default_value() && !tablet_column.is_nullable() && + tablet_schema.auto_increment_column() != tablet_column.name()) { + can_insert_new_rows_in_partial_update = false; + } + } else { + update_cids.emplace_back(i); + } + if (auto_increment_column == tablet_column.name()) { + is_schema_contains_auto_inc_column = true; + } + } + this->is_strict_mode = is_strict_mode; + is_input_columns_contains_auto_inc_column = + is_partial_update && partial_update_input_columns.contains(auto_increment_column); + _generate_default_values_for_missing_cids(tablet_schema); +} + +void PartialUpdateInfo::to_pb(PartialUpdateInfoPB* partial_update_info_pb) const { + partial_update_info_pb->set_is_partial_update(is_partial_update); + partial_update_info_pb->set_max_version_in_flush_phase(max_version_in_flush_phase); + for (const auto& col : partial_update_input_columns) { + partial_update_info_pb->add_partial_update_input_columns(col); + } + for (auto cid : missing_cids) { + partial_update_info_pb->add_missing_cids(cid); + } + for (auto cid : update_cids) { + partial_update_info_pb->add_update_cids(cid); + } +
partial_update_info_pb->set_can_insert_new_rows_in_partial_update( + can_insert_new_rows_in_partial_update); + partial_update_info_pb->set_is_strict_mode(is_strict_mode); + partial_update_info_pb->set_timestamp_ms(timestamp_ms); + partial_update_info_pb->set_timezone(timezone); + partial_update_info_pb->set_is_input_columns_contains_auto_inc_column( + is_input_columns_contains_auto_inc_column); + partial_update_info_pb->set_is_schema_contains_auto_inc_column( + is_schema_contains_auto_inc_column); + for (const auto& value : default_values) { + partial_update_info_pb->add_default_values(value); + } +} + +void PartialUpdateInfo::from_pb(PartialUpdateInfoPB* partial_update_info_pb) { + is_partial_update = partial_update_info_pb->is_partial_update(); + max_version_in_flush_phase = partial_update_info_pb->has_max_version_in_flush_phase() + ? partial_update_info_pb->max_version_in_flush_phase() + : -1; + partial_update_input_columns.clear(); + for (const auto& col : partial_update_info_pb->partial_update_input_columns()) { + partial_update_input_columns.insert(col); + } + missing_cids.clear(); + for (auto cid : partial_update_info_pb->missing_cids()) { + missing_cids.push_back(cid); + } + update_cids.clear(); + for (auto cid : partial_update_info_pb->update_cids()) { + update_cids.push_back(cid); + } + can_insert_new_rows_in_partial_update = + partial_update_info_pb->can_insert_new_rows_in_partial_update(); + is_strict_mode = partial_update_info_pb->is_strict_mode(); + timestamp_ms = partial_update_info_pb->timestamp_ms(); + timezone = partial_update_info_pb->timezone(); + is_input_columns_contains_auto_inc_column = + partial_update_info_pb->is_input_columns_contains_auto_inc_column(); + is_schema_contains_auto_inc_column = + partial_update_info_pb->is_schema_contains_auto_inc_column(); + default_values.clear(); + for (const auto& value : partial_update_info_pb->default_values()) { + default_values.push_back(value); + } +} + +std::string PartialUpdateInfo::summary() 
const { + return fmt::format( + "update_cids={}, missing_cids={}, is_strict_mode={}, max_version_in_flush_phase={}", + update_cids.size(), missing_cids.size(), is_strict_mode, max_version_in_flush_phase); +} + +void PartialUpdateInfo::_generate_default_values_for_missing_cids( + const TabletSchema& tablet_schema) { + for (unsigned int cur_cid : missing_cids) { + const auto& column = tablet_schema.column(cur_cid); + if (column.has_default_value()) { + std::string default_value; + if (UNLIKELY(tablet_schema.column(cur_cid).type() == + FieldType::OLAP_FIELD_TYPE_DATETIMEV2 && + to_lower(tablet_schema.column(cur_cid).default_value()) + .find(to_lower("CURRENT_TIMESTAMP")) != + std::string::npos)) { + DateV2Value dtv; + dtv.from_unixtime(timestamp_ms / 1000, timezone); + default_value = dtv.debug_string(); + } else if (UNLIKELY(tablet_schema.column(cur_cid).type() == + FieldType::OLAP_FIELD_TYPE_DATEV2 && + to_lower(tablet_schema.column(cur_cid).default_value()) + .find(to_lower("CURRENT_DATE")) != + std::string::npos)) { + DateV2Value dv; + dv.from_unixtime(timestamp_ms / 1000, timezone); + default_value = dv.debug_string(); + } else { + default_value = tablet_schema.column(cur_cid).default_value(); + } + default_values.emplace_back(default_value); + } else { + // place an empty string here + default_values.emplace_back(); + } + } + CHECK_EQ(missing_cids.size(), default_values.size()); +} +} // namespace doris diff --git a/be/src/olap/partial_update_info.h b/be/src/olap/partial_update_info.h index f20f9680b0b57a..987f31ec7f7eb9 100644 --- a/be/src/olap/partial_update_info.h +++ b/be/src/olap/partial_update_info.h @@ -16,81 +16,30 @@ // under the License. 
#pragma once - -#include "olap/tablet_schema.h" +#include +#include +#include +#include namespace doris { +class TabletSchema; +class PartialUpdateInfoPB; struct PartialUpdateInfo { void init(const TabletSchema& tablet_schema, bool partial_update, - const std::set& partial_update_cols, bool is_strict_mode, + const std::set& partial_update_cols, bool is_strict_mode, int64_t timestamp_ms, const std::string& timezone, - const std::string& auto_increment_column) { - is_partial_update = partial_update; - partial_update_input_columns = partial_update_cols; - - this->timestamp_ms = timestamp_ms; - this->timezone = timezone; - missing_cids.clear(); - update_cids.clear(); - for (auto i = 0; i < tablet_schema.num_columns(); ++i) { - auto tablet_column = tablet_schema.column(i); - if (!partial_update_input_columns.contains(tablet_column.name())) { - missing_cids.emplace_back(i); - if (!tablet_column.has_default_value() && !tablet_column.is_nullable() && - tablet_schema.auto_increment_column() != tablet_column.name()) { - can_insert_new_rows_in_partial_update = false; - } - } else { - update_cids.emplace_back(i); - } - if (auto_increment_column == tablet_column.name()) { - is_schema_contains_auto_inc_column = true; - } - } - this->is_strict_mode = is_strict_mode; - is_input_columns_contains_auto_inc_column = - is_partial_update && partial_update_input_columns.contains(auto_increment_column); - _generate_default_values_for_missing_cids(tablet_schema); - } + const std::string& auto_increment_column, int64_t cur_max_version = -1); + void to_pb(PartialUpdateInfoPB* partial_update_info) const; + void from_pb(PartialUpdateInfoPB* partial_update_info); + std::string summary() const; private: - void _generate_default_values_for_missing_cids(const TabletSchema& tablet_schema) { - for (auto i = 0; i < missing_cids.size(); ++i) { - auto cur_cid = missing_cids[i]; - const auto& column = tablet_schema.column(cur_cid); - if (column.has_default_value()) { - std::string default_value; - if 
(UNLIKELY(tablet_schema.column(cur_cid).type() == - FieldType::OLAP_FIELD_TYPE_DATETIMEV2 && - to_lower(tablet_schema.column(cur_cid).default_value()) - .find(to_lower("CURRENT_TIMESTAMP")) != - std::string::npos)) { - DateV2Value dtv; - dtv.from_unixtime(timestamp_ms / 1000, timezone); - default_value = dtv.debug_string(); - } else if (UNLIKELY(tablet_schema.column(cur_cid).type() == - FieldType::OLAP_FIELD_TYPE_DATEV2 && - to_lower(tablet_schema.column(cur_cid).default_value()) - .find(to_lower("CURRENT_DATE")) != - std::string::npos)) { - DateV2Value dv; - dv.from_unixtime(timestamp_ms / 1000, timezone); - default_value = dv.debug_string(); - } else { - default_value = tablet_schema.column(cur_cid).default_value(); - } - default_values.emplace_back(default_value); - } else { - // place an empty string here - default_values.emplace_back(); - } - } - CHECK_EQ(missing_cids.size(), default_values.size()); - } + void _generate_default_values_for_missing_cids(const TabletSchema& tablet_schema); public: bool is_partial_update {false}; + int64_t max_version_in_flush_phase {-1}; std::set partial_update_input_columns; std::vector missing_cids; std::vector update_cids; diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index 3372781f6e9d72..6e0c854ae2cdc3 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -40,6 +40,7 @@ #include "olap/rowset/beta_rowset_reader.h" #include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" +#include "olap/rowset/segment_v2/inverted_index_file_reader.h" #include "olap/tablet_schema.h" #include "olap/utils.h" #include "util/crc32c.h" @@ -702,4 +703,131 @@ Status BetaRowset::calc_local_file_crc(uint32_t* crc_value, int64_t* file_count) return Status::OK(); } +Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value, + rapidjson::Document::AllocatorType& allocator) { + const auto& fs = 
_rowset_meta->fs(); + auto storage_format = _schema->get_inverted_index_storage_format(); + const auto* format_str = storage_format == InvertedIndexStorageFormatPB::V1 ? "V1" : "V2"; + auto rs_id = rowset_id().to_string(); + rowset_value->AddMember("rowset_id", rapidjson::Value(rs_id.c_str(), allocator), allocator); + rowset_value->AddMember("index_storage_format", rapidjson::Value(format_str, allocator), + allocator); + rapidjson::Value segments(rapidjson::kArrayType); + for (int seg_id = 0; seg_id < num_segments(); ++seg_id) { + rapidjson::Value segment(rapidjson::kObjectType); + segment.AddMember("segment_id", rapidjson::Value(seg_id).Move(), allocator); + + auto seg_path = segment_file_path(seg_id); + // std::string convert to path and get parent path + auto seg_parent_path = std::filesystem::path(seg_path).parent_path(); + auto seg_file_name = std::filesystem::path(seg_path).filename().string(); + auto inverted_index_file_reader = std::make_unique( + fs, seg_parent_path, seg_file_name, storage_format); + RETURN_IF_ERROR(inverted_index_file_reader->init()); + auto dirs = inverted_index_file_reader->get_all_directories(); + + auto add_file_info_to_json = [&](const std::string& path, + rapidjson::Value& json_value) -> Status { + json_value.AddMember("idx_file_path", rapidjson::Value(path.c_str(), allocator), + allocator); + int64_t idx_file_size = 0; + auto st = fs->file_size(path, &idx_file_size); + if (st != Status::OK()) { + LOG(WARNING) << "show nested index file get file size error, file: " << path + << ", error: " << st.msg(); + return st; + } + json_value.AddMember("idx_file_size", rapidjson::Value(idx_file_size).Move(), + allocator); + return Status::OK(); + }; + + auto process_files = [&allocator, &inverted_index_file_reader]( + auto& index_meta, rapidjson::Value& indices, + rapidjson::Value& index) -> Status { + rapidjson::Value files_value(rapidjson::kArrayType); + std::vector files; + auto ret = inverted_index_file_reader->open(&index_meta); + if 
(!ret.has_value()) { + LOG(INFO) << "InvertedIndexFileReader open error:" << ret.error(); + return Status::InternalError("InvertedIndexFileReader open error"); + } + using T = std::decay_t; + auto reader = std::forward(ret).value(); + reader->list(&files); + for (auto& file : files) { + rapidjson::Value file_value(rapidjson::kObjectType); + auto size = reader->fileLength(file.c_str()); + file_value.AddMember("name", rapidjson::Value(file.c_str(), allocator), allocator); + file_value.AddMember("size", rapidjson::Value(size).Move(), allocator); + files_value.PushBack(file_value, allocator); + } + index.AddMember("files", files_value, allocator); + indices.PushBack(index, allocator); + return Status::OK(); + }; + + if (storage_format != InvertedIndexStorageFormatPB::V1) { + auto path = InvertedIndexDescriptor::get_index_file_name(seg_path); + auto st = add_file_info_to_json(path, segment); + if (!st.ok()) { + return st; + } + rapidjson::Value indices(rapidjson::kArrayType); + for (auto& dir : *dirs) { + rapidjson::Value index(rapidjson::kObjectType); + auto index_id = dir.first.first; + auto index_suffix = dir.first.second; + index.AddMember("index_id", rapidjson::Value(index_id).Move(), allocator); + index.AddMember("index_suffix", rapidjson::Value(index_suffix.c_str(), allocator), + allocator); + + rapidjson::Value files_value(rapidjson::kArrayType); + std::vector files; + doris::TabletIndexPB index_pb; + index_pb.set_index_id(index_id); + index_pb.set_index_suffix_name(index_suffix); + TabletIndex index_meta; + index_meta.init_from_pb(index_pb); + + auto status = process_files(index_meta, indices, index); + if (!status.ok()) { + return status; + } + } + segment.AddMember("indices", indices, allocator); + segments.PushBack(segment, allocator); + } else { + rapidjson::Value indices(rapidjson::kArrayType); + for (auto column : _rowset_meta->tablet_schema()->columns()) { + const auto* index_meta = _rowset_meta->tablet_schema()->get_inverted_index(*column); + if 
(index_meta == nullptr) { + continue; + } + rapidjson::Value index(rapidjson::kObjectType); + auto index_id = index_meta->index_id(); + auto index_suffix = index_meta->get_index_suffix(); + index.AddMember("index_id", rapidjson::Value(index_id).Move(), allocator); + index.AddMember("index_suffix", rapidjson::Value(index_suffix.c_str(), allocator), + allocator); + auto path = InvertedIndexDescriptor::get_index_file_name(seg_path, index_id, + index_suffix); + auto st = add_file_info_to_json(path, index); + if (!st.ok()) { + return st; + } + + auto status = process_files(*index_meta, indices, index); + if (!status.ok()) { + return status; + } + } + segment.AddMember("indices", indices, allocator); + segments.PushBack(segment, allocator); + } + } + rowset_value->AddMember("segments", segments, allocator); + return Status::OK(); +} + } // namespace doris diff --git a/be/src/olap/rowset/beta_rowset.h b/be/src/olap/rowset/beta_rowset.h index be7bb0c0e184fb..18c5f8cba4db0d 100644 --- a/be/src/olap/rowset/beta_rowset.h +++ b/be/src/olap/rowset/beta_rowset.h @@ -102,6 +102,9 @@ class BetaRowset final : public Rowset { Status calc_local_file_crc(uint32_t* crc_value, int64_t* file_count); + Status show_nested_index_file(rapidjson::Value* rowset_value, + rapidjson::Document::AllocatorType& allocator); + protected: BetaRowset(const TabletSchemaSPtr& schema, const std::string& tablet_path, const RowsetMetaSharedPtr& rowset_meta); diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index fd09f8b0a7e830..101571f0256831 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -781,9 +781,9 @@ Status BaseBetaRowsetWriter::_check_segment_number_limit() { if (UNLIKELY(total_segment_num > config::max_segment_num_per_rowset)) { return Status::Error( "too many segments in rowset. 
tablet_id:{}, rowset_id:{}, max:{}, " - "_num_segment:{}, ", + "_num_segment:{}, rowset_num_rows:{}", _context.tablet_id, _context.rowset_id.to_string(), - config::max_segment_num_per_rowset, _num_segment); + config::max_segment_num_per_rowset, _num_segment, get_rowset_num_rows()); } return Status::OK(); } @@ -795,10 +795,10 @@ Status BetaRowsetWriter::_check_segment_number_limit() { if (UNLIKELY(total_segment_num > config::max_segment_num_per_rowset)) { return Status::Error( "too many segments in rowset. tablet_id:{}, rowset_id:{}, max:{}, _num_segment:{}, " - "_segcompacted_point:{}, _num_segcompacted:{}", + "_segcompacted_point:{}, _num_segcompacted:{}, rowset_num_rows:{}", _context.tablet_id, _context.rowset_id.to_string(), config::max_segment_num_per_rowset, _num_segment, _segcompacted_point, - _num_segcompacted); + _num_segcompacted, get_rowset_num_rows()); } return Status::OK(); } diff --git a/be/src/olap/rowset/beta_rowset_writer.h b/be/src/olap/rowset/beta_rowset_writer.h index f169ce055fb210..3e285c7e508fb7 100644 --- a/be/src/olap/rowset/beta_rowset_writer.h +++ b/be/src/olap/rowset/beta_rowset_writer.h @@ -147,10 +147,13 @@ class BaseBetaRowsetWriter : public RowsetWriter { virtual int64_t _num_seg() const; // build a tmp rowset for load segment to calc delete_bitmap for this segment Status _build_tmp(RowsetSharedPtr& rowset_ptr); + uint64_t get_rowset_num_rows() { + std::lock_guard l(_segid_statistics_map_mutex); + return std::accumulate(_segment_num_rows.begin(), _segment_num_rows.end(), uint64_t(0)); + } RowsetWriterContext _context; std::shared_ptr _rowset_meta; - std::atomic _num_segment; // number of consecutive flushed segments roaring::Roaring _segment_set; // bitmap set to record flushed segment id std::mutex _segment_set_mutex; // mutex for _segment_set diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index 72c6c2fa29bec8..7677015f2e0d5c 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -166,6 
+166,7 @@ class Rowset : public std::enable_shared_from_this { bool is_segments_overlapping() const { return rowset_meta()->is_segments_overlapping(); } KeysType keys_type() { return _schema->keys_type(); } RowsetStatePB rowset_meta_state() const { return rowset_meta()->rowset_state(); } + bool produced_by_compaction() const { return rowset_meta()->produced_by_compaction(); } // remove all files in this rowset // TODO should we rename the method to remove_files() to be more specific? diff --git a/be/src/olap/rowset/rowset_meta.h b/be/src/olap/rowset/rowset_meta.h index 30457d30bc65da..99221789b8163c 100644 --- a/be/src/olap/rowset/rowset_meta.h +++ b/be/src/olap/rowset/rowset_meta.h @@ -242,6 +242,12 @@ class RowsetMeta { return num_segments() > 1 && is_singleton_delta() && segments_overlap() != NONOVERLAPPING; } + bool produced_by_compaction() const { + return has_version() && + (start_version() < end_version() || + (start_version() == end_version() && segments_overlap() == NONOVERLAPPING)); + } + // get the compaction score of this rowset. // if segments are overlapping, the score equals to the number of segments, // otherwise, score is 1. 
@@ -256,6 +262,21 @@ class RowsetMeta { return score; } + uint32_t get_merge_way_num() const { + uint32_t way_num = 0; + if (!is_segments_overlapping()) { + if (num_segments() == 0) { + way_num = 0; + } else { + way_num = 1; + } + } else { + way_num = num_segments(); + CHECK(way_num > 0); + } + return way_num; + } + void get_segments_key_bounds(std::vector* segments_key_bounds) const { for (const KeyBoundsPB& key_range : _rowset_meta_pb.segments_key_bounds()) { segments_key_bounds->push_back(key_range); diff --git a/be/src/olap/rowset/rowset_meta_manager.cpp b/be/src/olap/rowset/rowset_meta_manager.cpp index 38911327d84c65..d89be5ab8ecd93 100644 --- a/be/src/olap/rowset/rowset_meta_manager.cpp +++ b/be/src/olap/rowset/rowset_meta_manager.cpp @@ -535,4 +535,98 @@ Status RowsetMetaManager::load_json_rowset_meta(OlapMeta* meta, return status; } +Status RowsetMetaManager::save_partial_update_info( + OlapMeta* meta, int64_t tablet_id, int64_t partition_id, int64_t txn_id, + const PartialUpdateInfoPB& partial_update_info_pb) { + std::string key = + fmt::format("{}{}_{}_{}", PARTIAL_UPDATE_INFO_PREFIX, tablet_id, partition_id, txn_id); + std::string value; + if (!partial_update_info_pb.SerializeToString(&value)) { + return Status::Error( + "serialize partial update info failed. key={}", key); + } + VLOG_NOTICE << "save partial update info, key=" << key << ", value_size=" << value.size(); + return meta->put(META_COLUMN_FAMILY_INDEX, key, value); +} + +Status RowsetMetaManager::try_get_partial_update_info(OlapMeta* meta, int64_t tablet_id, + int64_t partition_id, int64_t txn_id, + PartialUpdateInfoPB* partial_update_info_pb) { + std::string key = + fmt::format("{}{}_{}_{}", PARTIAL_UPDATE_INFO_PREFIX, tablet_id, partition_id, txn_id); + std::string value; + Status status = meta->get(META_COLUMN_FAMILY_INDEX, key, &value); + if (status.is()) { + return status; + } + if (!status.ok()) { + LOG_WARNING("failed to get partial update info. 
tablet_id={}, partition_id={}, txn_id={}", + tablet_id, partition_id, txn_id); + return status; + } + if (!partial_update_info_pb->ParseFromString(value)) { + return Status::Error( + "fail to parse partial update info content to protobuf object. tablet_id={}, " + "partition_id={}, txn_id={}", + tablet_id, partition_id, txn_id); + } + return Status::OK(); +} + +Status RowsetMetaManager::traverse_partial_update_info( + OlapMeta* meta, + std::function const& func) { + auto traverse_partial_update_info_func = [&func](const std::string& key, + const std::string& value) -> bool { + std::vector parts; + // key format: pui_{tablet_id}_{partition_id}_{txn_id} + RETURN_IF_ERROR(split_string(key, '_', &parts)); + if (parts.size() != 4) { + LOG_WARNING("invalid rowset key={}, splitted size={}", key, parts.size()); + return true; + } + int64_t tablet_id = std::stoll(parts[1]); + int64_t partition_id = std::stoll(parts[2]); + int64_t txn_id = std::stoll(parts[3]); + return func(tablet_id, partition_id, txn_id, value); + }; + return meta->iterate(META_COLUMN_FAMILY_INDEX, PARTIAL_UPDATE_INFO_PREFIX, + traverse_partial_update_info_func); +} + +Status RowsetMetaManager::remove_partial_update_info(OlapMeta* meta, int64_t tablet_id, + int64_t partition_id, int64_t txn_id) { + std::string key = + fmt::format("{}{}_{}_{}", PARTIAL_UPDATE_INFO_PREFIX, tablet_id, partition_id, txn_id); + Status res = meta->remove(META_COLUMN_FAMILY_INDEX, key); + VLOG_NOTICE << "remove partial update info, key=" << key; + return res; +} + +Status RowsetMetaManager::remove_partial_update_infos( + OlapMeta* meta, const std::vector>& keys) { + std::vector remove_keys; + for (auto [tablet_id, partition_id, txn_id] : keys) { + remove_keys.push_back(fmt::format("{}{}_{}_{}", PARTIAL_UPDATE_INFO_PREFIX, tablet_id, + partition_id, txn_id)); + } + Status res = meta->remove(META_COLUMN_FAMILY_INDEX, remove_keys); + VLOG_NOTICE << "remove partial update info, remove_keys.size()=" << remove_keys.size(); + return 
res; +} + +/* Removes every persisted partial-update-info entry that belongs to tablet_id. Keys have the form pui_{tablet_id}_{partition_id}_{txn_id}; the matching keys are collected with a prefix scan and then removed in one batch. Returns the first failing Status from the scan or from the removal. */ +Status RowsetMetaManager::remove_tablet_related_partial_update_info(OlapMeta* meta, + int64_t tablet_id) { + std::string prefix = fmt::format("{}{}_", PARTIAL_UPDATE_INFO_PREFIX, tablet_id); /* trailing '_' ends the tablet id, so tablet 12 does not also match keys of tablet 120, 123, ... */ + std::vector remove_keys; + auto get_remove_keys_func = [&](const std::string& key, const std::string& value) -> bool { + remove_keys.emplace_back(key); + return true; + }; + RETURN_IF_ERROR(meta->iterate(META_COLUMN_FAMILY_INDEX, prefix, get_remove_keys_func)); + VLOG_NOTICE << "remove tablet related partial update info, tablet_id: " << tablet_id + << " removed keys size: " << remove_keys.size(); /* logged after the scan so the reported size is the real number of collected keys, not 0 */ + return meta->remove(META_COLUMN_FAMILY_INDEX, remove_keys); +} + +} // namespace doris diff --git a/be/src/olap/rowset/rowset_meta_manager.h b/be/src/olap/rowset/rowset_meta_manager.h index 9517ce3f51a2d6..0cfbb3383e3935 100644 --- a/be/src/olap/rowset/rowset_meta_manager.h +++ b/be/src/olap/rowset/rowset_meta_manager.h @@ -18,6 +18,8 @@ #ifndef DORIS_BE_SRC_OLAP_ROWSET_ROWSET_META_MANAGER_H #define DORIS_BE_SRC_OLAP_ROWSET_ROWSET_META_MANAGER_H +#include + #include #include #include @@ -32,11 +34,15 @@ namespace doris { class OlapMeta; class RowsetMetaPB; +class PartialUpdateInfoPB; } // namespace doris namespace doris { namespace { const std::string ROWSET_PREFIX = "rst_"; + +const std::string PARTIAL_UPDATE_INFO_PREFIX = "pui_"; + } // namespace // Helper class for managing rowset meta of one root path. 
@@ -80,6 +86,21 @@ class RowsetMetaManager { static Status load_json_rowset_meta(OlapMeta* meta, const std::string& rowset_meta_path); + static Status save_partial_update_info(OlapMeta* meta, int64_t tablet_id, int64_t partition_id, + int64_t txn_id, + const PartialUpdateInfoPB& partial_update_info_pb); + static Status try_get_partial_update_info(OlapMeta* meta, int64_t tablet_id, + int64_t partition_id, int64_t txn_id, + PartialUpdateInfoPB* partial_update_info_pb); + static Status traverse_partial_update_info( + OlapMeta* meta, + std::function const& func); + static Status remove_partial_update_info(OlapMeta* meta, int64_t tablet_id, + int64_t partition_id, int64_t txn_id); + static Status remove_partial_update_infos( + OlapMeta* meta, const std::vector>& keys); + static Status remove_tablet_related_partial_update_info(OlapMeta* meta, int64_t tablet_id); + private: static Status _save(OlapMeta* meta, TabletUid tablet_uid, const RowsetId& rowset_id, const RowsetMetaPB& rowset_meta_pb); diff --git a/be/src/olap/rowset/segcompaction.cpp b/be/src/olap/rowset/segcompaction.cpp index 8fee04ccb80e43..1c152c75c0fb43 100644 --- a/be/src/olap/rowset/segcompaction.cpp +++ b/be/src/olap/rowset/segcompaction.cpp @@ -102,7 +102,8 @@ Status SegcompactionWorker::_get_segcompaction_reader( reader_params.tablet = tablet; reader_params.return_columns = return_columns; reader_params.is_key_column_group = is_key; - return (*reader)->init(reader_params); + reader_params.use_page_cache = false; + return (*reader)->init(reader_params, nullptr); } std::unique_ptr SegcompactionWorker::_create_segcompaction_writer( diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index e9315f7a220c61..e463b883fd206d 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -506,6 +506,7 @@ Status ScalarColumnWriter::init() { return Status::OK(); } Status add_nulls(uint32_t count) 
override { return Status::OK(); } + Status add_array_nulls(uint32_t row_id) override { return Status::OK(); } Status finish() override { return Status::OK(); } int64_t size() const override { return 0; } int64_t file_size() const override { return 0; } @@ -1014,10 +1015,18 @@ Status ArrayColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t size_t num_rows) { RETURN_IF_ERROR(append_data(ptr, num_rows)); if (is_nullable()) { + if (_opts.need_inverted_index) { + for (int row_id = 0; row_id < num_rows; row_id++) { + if (null_map[row_id] == 1) { + RETURN_IF_ERROR(_inverted_index_builder->add_array_nulls(row_id)); + } + } + } RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); } return Status::OK(); } + Status ArrayColumnWriter::finish() { RETURN_IF_ERROR(_offset_writer->finish()); if (is_nullable()) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp index 428dc05e6f6aa5..ec1b5bdd9e4d35 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp @@ -31,7 +31,9 @@ namespace doris::segment_v2 { PhraseEdgeQuery::PhraseEdgeQuery(const std::shared_ptr& searcher, const TQueryOptions& query_options) - : _searcher(searcher), _query(std::make_unique()) {} + : _searcher(searcher), + _query(std::make_unique()), + _max_expansions(query_options.inverted_index_max_expansions) {} void PhraseEdgeQuery::add(const std::wstring& field_name, const std::vector& terms) { if (terms.empty()) { @@ -50,9 +52,9 @@ void PhraseEdgeQuery::search(roaring::Roaring& roaring) { } void PhraseEdgeQuery::search_one_term(roaring::Roaring& roaring) { - size_t count = 0; + bool first = true; std::wstring sub_term = StringUtil::string_to_wstring(_terms[0]); - find_words([this, &count, &sub_term, &roaring](Term* term) { + find_words([this, &first, &sub_term, 
&roaring](Term* term) { std::wstring_view ws_term(term->text(), term->textLength()); if (ws_term.find(sub_term) == std::wstring::npos) { return; @@ -70,12 +72,12 @@ void PhraseEdgeQuery::search_one_term(roaring::Roaring& roaring) { } _CLDELETE(term_doc); - if (count) { + if (!first) { roaring.swap(result); + first = false; } else { roaring |= result; } - count++; }); } @@ -86,15 +88,19 @@ void PhraseEdgeQuery::search_multi_term(roaring::Roaring& roaring) { std::vector suffix_terms; std::vector prefix_terms; - find_words([&suffix_term, &suffix_terms, &prefix_term, &prefix_terms](Term* term) { + find_words([this, &suffix_term, &suffix_terms, &prefix_term, &prefix_terms](Term* term) { std::wstring_view ws_term(term->text(), term->textLength()); - if (ws_term.ends_with(suffix_term)) { - suffix_terms.push_back(_CL_POINTER(term)); + if (_max_expansions == 0 || suffix_terms.size() < _max_expansions) { + if (ws_term.ends_with(suffix_term)) { + suffix_terms.push_back(_CL_POINTER(term)); + } } - if (ws_term.starts_with(prefix_term)) { - prefix_terms.push_back(_CL_POINTER(term)); + if (_max_expansions == 0 || prefix_terms.size() < _max_expansions) { + if (ws_term.starts_with(prefix_term)) { + prefix_terms.push_back(_CL_POINTER(term)); + } } }); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h index 823f46285b1d00..5daf382e0d08fa 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h @@ -52,6 +52,7 @@ class PhraseEdgeQuery : public Query { std::wstring _field_name; std::vector _terms; std::unique_ptr _query; + int32_t _max_expansions = 50; }; } // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp index 
1f988a758452c8..40a1d2218f8190 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp @@ -58,7 +58,7 @@ Status compact_column(int64_t index_id, std::vector& for (auto* d : src_index_dirs) { if (d != nullptr) { d->close(); - //_CLDELETE(d); + _CLDELETE(d); } } for (auto* d : dest_index_dirs) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h index 03306110a2816f..1d17bed4a4af81 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h @@ -51,7 +51,7 @@ class DorisCompoundFileWriter : LUCENE_BASE { class FileInfo { public: std::string filename; - int32_t filesize; + int64_t filesize; }; void sort_files(std::vector& file_infos); @@ -95,4 +95,4 @@ class InvertedIndexFileWriter { size_t _file_size = 0; }; } // namespace segment_v2 -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index 4e7fb3b3a557fa..7411bf9c4f76e9 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -146,8 +146,10 @@ void InvertedIndexReader::get_analyse_result(std::vector& analyse_r } } -Status InvertedIndexReader::read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, +Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, + InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir) { + SCOPED_RAW_TIMER(&stats->inverted_index_query_null_bitmap_timer); lucene::store::IndexInput* null_bitmap_in = nullptr; bool owned_dir = false; try { @@ -202,9 +204,11 @@ Status InvertedIndexReader::handle_searcher_cache( InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); if 
(InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, inverted_index_cache_handle)) { + stats->inverted_index_searcher_cache_hit++; return Status::OK(); } else { // searcher cache miss + stats->inverted_index_searcher_cache_miss++; auto mem_tracker = std::make_unique("InvertedIndexSearcherCacheWithRead"); SCOPED_RAW_TIMER(&stats->inverted_index_searcher_open_timer); IndexSearcherPtr searcher; @@ -214,7 +218,7 @@ Status InvertedIndexReader::handle_searcher_cache( // to avoid open directory additionally for null_bitmap // TODO: handle null bitmap procedure in new format. InvertedIndexQueryCacheHandle null_bitmap_cache_handle; - static_cast(read_null_bitmap(&null_bitmap_cache_handle, dir.get())); + static_cast(read_null_bitmap(stats, &null_bitmap_cache_handle, dir.get())); RETURN_IF_ERROR(create_index_searcher(dir.release(), &searcher, mem_tracker.get(), type())); auto* cache_value = new InvertedIndexSearcherCache::CacheValue( std::move(searcher), mem_tracker->consumption(), UnixMillis()); @@ -242,6 +246,27 @@ Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir, return Status::OK(); }; +Status InvertedIndexReader::match_index_search( + OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, + const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, + const std::shared_ptr& term_match_bitmap) { + TQueryOptions queryOptions = runtime_state->query_options(); + try { + SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); + auto query = QueryFactory::create(query_type, index_searcher, queryOptions); + if (!query) { + return Status::Error( + "query type " + query_type_to_string(query_type) + ", query is nullptr"); + } + query->add(query_info); + query->search(*term_match_bitmap); + } catch (const CLuceneError& e) { + return Status::Error("CLuceneError occured: {}", + e.what()); + } + return Status::OK(); +} + Status 
FullTextIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr* iterator) { *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); @@ -321,7 +346,6 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run if (cache_status.ok()) { return Status::OK(); } - stats->inverted_index_query_cache_miss++; FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; @@ -343,27 +367,6 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run } } -Status FullTextIndexReader::match_index_search( - OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, - const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, - const std::shared_ptr& term_match_bitmap) { - TQueryOptions queryOptions = runtime_state->query_options(); - try { - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - auto query = QueryFactory::create(query_type, index_searcher, queryOptions); - if (!query) { - return Status::Error( - "query type " + query_type_to_string(query_type) + ", query is nullptr"); - } - query->add(query_info); - query->search(*term_match_bitmap); - } catch (const CLuceneError& e) { - return Status::Error("CLuceneError occured: {}", - e.what()); - } - return Status::OK(); -} - InvertedIndexReaderType FullTextIndexReader::type() { return InvertedIndexReaderType::FULLTEXT; } @@ -420,13 +423,6 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, std::string search_str(search_query->data, act_len); VLOG_DEBUG << "begin to query the inverted index from clucene" << ", column_name: " << column_name << ", search_str: " << search_str; - std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); - std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); - // unique_ptr with custom deleter - 
std::unique_ptr term { - _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()), - [](lucene::index::Term* term) { _CLDECDELETE(term); }}; - std::unique_ptr query; auto index_file_key = _inverted_index_file_reader->get_index_file_key(&_index_meta); @@ -435,13 +431,18 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, search_str}; auto* cache = InvertedIndexQueryCache::instance(); InvertedIndexQueryCacheHandle cache_handler; - auto cache_status = handle_query_cache(cache, cache_key, &cache_handler, stats, bit_map); if (cache_status.ok()) { return Status::OK(); } - roaring::Roaring result; + std::wstring column_name_ws = StringUtil::string_to_wstring(column_name); + + InvertedIndexQueryInfo query_info; + query_info.field_name = column_name_ws; + query_info.terms.emplace_back(search_str); + + auto result = std::make_shared(); FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); @@ -453,33 +454,29 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, case InvertedIndexQueryType::MATCH_ANY_QUERY: case InvertedIndexQueryType::MATCH_ALL_QUERY: case InvertedIndexQueryType::EQUAL_QUERY: { - query = std::make_unique(term.get()); - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - (*searcher_ptr)->_search(query.get(), [&result](DocRange* doc_range) { - if (doc_range->type_ == DocRangeType::kMany) { - result.addMany(doc_range->doc_many_size_, doc_range->doc_many->data()); - } else { - result.addRange(doc_range->doc_range.first, doc_range->doc_range.second); - } - }); + RETURN_IF_ERROR(match_index_search(stats, runtime_state, + InvertedIndexQueryType::MATCH_ANY_QUERY, + query_info, *searcher_ptr, result)); break; } - case InvertedIndexQueryType::MATCH_PHRASE_QUERY: { - query = std::make_unique(term.get()); - SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); 
- (*searcher_ptr) - ->_search(query.get(), - [&result](const int32_t docid, const float_t /*score*/) { - // docid equal to rowid in segment - result.add(docid); - }); + case InvertedIndexQueryType::MATCH_PHRASE_QUERY: + case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: + case InvertedIndexQueryType::MATCH_REGEXP_QUERY: { + RETURN_IF_ERROR(match_index_search(stats, runtime_state, query_type, query_info, + *searcher_ptr, result)); break; } - case InvertedIndexQueryType::LESS_THAN_QUERY: case InvertedIndexQueryType::LESS_EQUAL_QUERY: case InvertedIndexQueryType::GREATER_THAN_QUERY: case InvertedIndexQueryType::GREATER_EQUAL_QUERY: { + std::wstring search_str_ws = StringUtil::string_to_wstring(search_str); + // unique_ptr with custom deleter + std::unique_ptr term { + _CLNEW lucene::index::Term(column_name_ws.c_str(), search_str_ws.c_str()), + [](lucene::index::Term* term) { _CLDECDELETE(term); }}; + std::unique_ptr query; + bool include_upper = query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY; bool include_lower = query_type == InvertedIndexQueryType::GREATER_EQUAL_QUERY; @@ -496,7 +493,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, (*searcher_ptr) ->_search(query.get(), [&result](const int32_t docid, const float_t /*score*/) { - result.add(docid); + result->add(docid); }); break; } @@ -519,12 +516,10 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, } // add to cache - std::shared_ptr term_match_bitmap = - std::make_shared(result); - term_match_bitmap->runOptimize(); - cache->insert(cache_key, term_match_bitmap, &cache_handler); + result->runOptimize(); + cache->insert(cache_key, result, &cache_handler); - bit_map = term_match_bitmap; + bit_map = result; } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index e496761ef08b71..cd5d89d4916f64 100644 --- 
a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -72,7 +72,6 @@ class InvertedIndexIterator; class InvertedIndexQueryCacheHandle; class InvertedIndexFileReader; struct InvertedIndexQueryInfo; - class InvertedIndexReader : public std::enable_shared_from_this { public: explicit InvertedIndexReader( @@ -93,7 +92,8 @@ class InvertedIndexReader : public std::enable_shared_from_this& term_match_bitmap); + friend class InvertedIndexIterator; std::shared_ptr _inverted_index_file_reader; TabletIndex _index_meta; bool _has_null = true; }; +using InvertedIndexReaderPtr = std::shared_ptr; class FullTextIndexReader : public InvertedIndexReader { ENABLE_FACTORY_CREATOR(FullTextIndexReader); @@ -176,13 +183,6 @@ class FullTextIndexReader : public InvertedIndexReader { const std::map& properties); static void setup_analyzer_use_stopwords(std::unique_ptr& analyzer, const std::map& properties); - -private: - Status match_index_search(OlapReaderStatistics* stats, RuntimeState* runtime_state, - InvertedIndexQueryType query_type, - const InvertedIndexQueryInfo& query_info, - const FulltextIndexSearcherPtr& index_searcher, - const std::shared_ptr& term_match_bitmap); }; class StringTypeInvertedIndexReader : public InvertedIndexReader { @@ -299,13 +299,15 @@ class InvertedIndexIterator { Status read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir = nullptr) { - return _reader->read_null_bitmap(cache_handle, dir); + return _reader->read_null_bitmap(_stats, cache_handle, dir); } [[nodiscard]] InvertedIndexReaderType get_inverted_index_reader_type() const; [[nodiscard]] const std::map& get_index_properties() const; [[nodiscard]] bool has_null() { return _reader->has_null(); }; + const InvertedIndexReaderPtr& reader() { return _reader; } + private: OlapReaderStatistics* _stats = nullptr; RuntimeState* _runtime_state = nullptr; diff --git 
a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index e2815dfa108500..5ac18999e9e898 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -304,6 +304,11 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { return Status::OK(); } + Status add_array_nulls(uint32_t row_id) override { + _null_bitmap.add(row_id); + return Status::OK(); + } + void new_inverted_index_field(const char* field_value_data, size_t field_value_size) { if (_parser_type != InvertedIndexParserType::PARSER_UNKNOWN && _parser_type != InvertedIndexParserType::PARSER_NONE) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_writer.h index 3b4e5ba2709a7d..c29bb8c0b9d8c1 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h @@ -64,6 +64,7 @@ class InvertedIndexColumnWriter { size_t count) = 0; virtual Status add_nulls(uint32_t count) = 0; + virtual Status add_array_nulls(uint32_t row_id) = 0; virtual Status finish() = 0; diff --git a/be/src/olap/rowset/segment_v2/options.h b/be/src/olap/rowset/segment_v2/options.h index 19041f4c51d1db..e68e4f6b6bc5c6 100644 --- a/be/src/olap/rowset/segment_v2/options.h +++ b/be/src/olap/rowset/segment_v2/options.h @@ -24,6 +24,8 @@ namespace segment_v2 { static constexpr size_t DEFAULT_PAGE_SIZE = 1024 * 1024; // default size: 1M +constexpr long ROW_STORE_PAGE_SIZE_DEFAULT_VALUE = 16384; // default row column page size: 16KB + struct PageBuilderOptions { size_t data_page_size = DEFAULT_PAGE_SIZE; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 9040cbf3e2733c..9e1133f46206e1 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
@@ -314,6 +314,7 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { for (auto& expr : _remaining_conjunct_roots) { _calculate_pred_in_remaining_conjunct_root(expr); } + _calculate_func_in_remaining_conjunct_root(); _column_predicate_info.reset(new ColumnPredicateInfo()); if (_schema->rowid_col_idx() > 0) { @@ -365,12 +366,18 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { std::set push_down_preds; for (auto* pred : _col_predicates) { if (!_check_apply_by_inverted_index(pred)) { + //column predicate, like column predicate etc. always need read data + auto cid = pred->column_id(); + _need_read_data_indices[cid] = true; continue; } push_down_preds.insert(_gen_predicate_result_sign(pred)); } for (auto* pred : _col_preds_except_leafnode_of_andnode) { if (!_check_apply_by_inverted_index(pred)) { + //column predicate, like column predicate etc. always need read data + auto cid = pred->column_id(); + _need_read_data_indices[cid] = true; continue; } push_down_preds.insert(_gen_predicate_result_sign(pred)); @@ -552,6 +559,8 @@ Status SegmentIterator::_get_row_ranges_by_column_conditions() { ++it; } } + _col_preds_except_leafnode_of_andnode.clear(); + compound_func_exprs.clear(); // 1. if all conditions in the compound hit the inverted index and there are no other expr to handle. // 2. then there is no need to generate index_result_column. 
if (_enable_common_expr_pushdown && _remaining_conjunct_roots.empty()) { @@ -687,6 +696,11 @@ Status SegmentIterator::_get_row_ranges_from_conditions(RowRanges* condition_row if (_opts.io_ctx.reader_type == ReaderType::READER_QUERY) { RowRanges dict_row_ranges = RowRanges::create_single(num_rows()); for (auto cid : cids) { + if (!_segment->can_apply_predicate_safely(cid, + _opts.col_id_to_predicates.at(cid).get(), + *_schema, _opts.io_ctx.reader_type)) { + continue; + } RowRanges tmp_row_ranges = RowRanges::create_single(num_rows()); DCHECK(_opts.col_id_to_predicates.count(cid) > 0); RETURN_IF_ERROR(_column_iterators[cid]->get_row_ranges_by_dict( @@ -796,25 +810,32 @@ Status SegmentIterator::_execute_predicates_except_leafnode_of_andnode( auto v_literal_expr = std::dynamic_pointer_cast(expr); _column_predicate_info->query_values.insert(v_literal_expr->value()); } else if (node_type == TExprNodeType::BINARY_PRED || node_type == TExprNodeType::MATCH_PRED || - node_type == TExprNodeType::IN_PRED) { - if (node_type == TExprNodeType::MATCH_PRED) { - _column_predicate_info->query_op = "match"; - } else if (node_type == TExprNodeType::IN_PRED) { - if (expr->op() == TExprOpcode::type::FILTER_IN) { - _column_predicate_info->query_op = "in"; + node_type == TExprNodeType::IN_PRED || node_type == TExprNodeType::FUNCTION_CALL) { + std::string result_sign; + if (node_type == TExprNodeType::FUNCTION_CALL) { + result_sign = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id()); + } else { + if (node_type == TExprNodeType::MATCH_PRED) { + _column_predicate_info->query_op = "match"; + } else if (node_type == TExprNodeType::IN_PRED) { + if (expr->op() == TExprOpcode::type::FILTER_IN) { + _column_predicate_info->query_op = "in"; + } else { + _column_predicate_info->query_op = "not_in"; + } } else { - _column_predicate_info->query_op = "not_in"; + _column_predicate_info->query_op = expr->fn().name.function_name; } - } else { - _column_predicate_info->query_op = 
expr->fn().name.function_name; + result_sign = _gen_predicate_result_sign(_column_predicate_info.get()); } + // get child condition result in compound conditions - auto pred_result_sign = _gen_predicate_result_sign(_column_predicate_info.get()); _column_predicate_info.reset(new ColumnPredicateInfo()); - VLOG_DEBUG << "_gen_predicate_result_sign " << pred_result_sign; - if (_rowid_result_for_index.count(pred_result_sign) > 0 && - _rowid_result_for_index[pred_result_sign].first) { - auto apply_result = _rowid_result_for_index[pred_result_sign].second; + VLOG_DEBUG << "result_sign " << result_sign; + if (_rowid_result_for_index.count(result_sign) > 0 && + _rowid_result_for_index[result_sign].first) { + auto apply_result = _rowid_result_for_index[result_sign].second; _pred_except_leafnode_of_andnode_evaluate_result.push_back(apply_result); } } else if (node_type == TExprNodeType::COMPOUND_PRED) { @@ -858,7 +879,7 @@ Status SegmentIterator::_execute_compound_fn(const std::string& function_name) { bool SegmentIterator::_can_filter_by_preds_except_leafnode_of_andnode() { // no compound predicates push down, so no need to filter - if (_col_preds_except_leafnode_of_andnode.size() == 0) { + if (_col_preds_except_leafnode_of_andnode.empty() && compound_func_exprs.empty()) { return false; } for (auto pred : _col_preds_except_leafnode_of_andnode) { @@ -872,6 +893,14 @@ bool SegmentIterator::_can_filter_by_preds_except_leafnode_of_andnode() { return false; } } + for (const auto& func_expr_pair : compound_func_exprs) { + const auto& expr = func_expr_pair.first; + std::string pred_result_sign = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id()); + if (!_rowid_result_for_index.contains(pred_result_sign)) { + return false; + } + } return true; } @@ -879,7 +908,8 @@ bool SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred, bool if (_opts.runtime_state && !_opts.runtime_state->query_options().enable_inverted_index_query) { return false; } 
- if (_inverted_index_iterators[pred->column_id()] == nullptr) { + auto pred_column_id = pred->column_id(); + if (_inverted_index_iterators[pred_column_id] == nullptr) { //this column without inverted index return false; } @@ -894,13 +924,21 @@ bool SegmentIterator::_check_apply_by_inverted_index(ColumnPredicate* pred, bool return false; } + // UNTOKENIZED strings exceed ignore_above, they are written as null, causing range query errors + if (PredicateTypeTraits::is_range(pred->type()) && + _inverted_index_iterators[pred_column_id] != nullptr && + _inverted_index_iterators[pred_column_id]->get_inverted_index_reader_type() == + InvertedIndexReaderType::STRING_TYPE) { + return false; + } + // Function filter no apply inverted index if (dynamic_cast*>(pred) != nullptr || dynamic_cast*>(pred) != nullptr) { return false; } - bool handle_by_fulltext = _column_has_fulltext_index(pred->column_id()); + bool handle_by_fulltext = _column_has_fulltext_index(pred_column_id); if (handle_by_fulltext) { // when predicate in compound condition which except leafNode of andNode, // only can apply match query for fulltext index, @@ -974,11 +1012,23 @@ Status SegmentIterator::_apply_index_except_leafnode_of_andnode() { } } + for (const auto& func_expr_pair : compound_func_exprs) { + const auto& expr = func_expr_pair.first; + const auto& expr_ctx = func_expr_pair.second; + auto result = std::make_shared(); + RETURN_IF_ERROR(execute_func_expr(expr, expr_ctx, result)); + std::string result_sign = + BeConsts::BLOCK_TEMP_COLUMN_PREFIX + std::to_string(expr->index_unique_id()); + _rowid_result_for_index.emplace(result_sign, std::make_pair(true, std::move(*result))); + } + return Status::OK(); } bool SegmentIterator::_downgrade_without_index(Status res, bool need_remaining) { - if (res.code() == ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND || + bool is_fallback = + _opts.runtime_state->query_options().enable_fallback_on_missing_inverted_index; + if ((res.code() == 
ErrorCode::INVERTED_INDEX_FILE_NOT_FOUND && is_fallback) || res.code() == ErrorCode::INVERTED_INDEX_BYPASS || res.code() == ErrorCode::INVERTED_INDEX_EVALUATE_SKIPPED || (res.code() == ErrorCode::INVERTED_INDEX_NO_TERMS && need_remaining)) { @@ -1168,6 +1218,9 @@ Status SegmentIterator::_apply_inverted_index_on_block_column_predicate( } bool SegmentIterator::_need_read_data(ColumnId cid) { + if (_opts.runtime_state && !_opts.runtime_state->query_options().enable_no_need_read_data_opt) { + return true; + } // only support DUP_KEYS and UNIQUE_KEYS with MOW if (!((_opts.tablet_schema->keys_type() == KeysType::DUP_KEYS || (_opts.tablet_schema->keys_type() == KeysType::UNIQUE_KEYS && @@ -1244,18 +1297,6 @@ Status SegmentIterator::_apply_inverted_index() { std::vector remaining_predicates; std::set no_need_to_pass_column_predicate_set; - // TODO:Comment out this code before introducing range query functionality - /*for (const auto& entry : _opts.col_id_to_predicates) { - ColumnId column_id = entry.first; - auto pred = entry.second; - bool continue_apply = true; - RETURN_IF_ERROR(_apply_inverted_index_on_block_column_predicate( - column_id, pred.get(), no_need_to_pass_column_predicate_set, &continue_apply)); - if (!continue_apply) { - break; - } - }*/ - for (auto pred : _col_predicates) { if (no_need_to_pass_column_predicate_set.count(pred) > 0) { continue; @@ -1291,6 +1332,23 @@ Status SegmentIterator::_apply_inverted_index() { } } + for (const auto& func_expr_pair : no_compound_func_exprs) { + const auto& expr = func_expr_pair.first; + const auto& expr_ctx = func_expr_pair.second; + auto result = std::make_shared(); + RETURN_IF_ERROR(execute_func_expr(expr, expr_ctx, result)); + _row_bitmap &= *result; + for (auto it = _remaining_conjunct_roots.begin(); it != _remaining_conjunct_roots.end();) { + if (*it == expr) { + std::erase_if(_common_expr_ctxs_push_down, + [&it](const auto& iter) { return iter->root() == *it; }); + it = _remaining_conjunct_roots.erase(it); + } else 
{ + ++it; + } + } + } + _col_predicates = std::move(remaining_predicates); _opts.stats->rows_inverted_index_filtered += (input_rows - _row_bitmap.cardinality()); return Status::OK(); @@ -1367,6 +1425,17 @@ Status SegmentIterator::_init_inverted_index_iterators() { return Status::OK(); } +Status SegmentIterator::_init_inverted_index_iterators(ColumnId cid) { + std::lock_guard lock(_idx_init_lock); + if (_inverted_index_iterators[cid] == nullptr) { + return _segment->new_inverted_index_iterator( + _opts.tablet_schema->column(cid), + _segment->_tablet_schema->get_inverted_index(_opts.tablet_schema->column(cid)), + _opts, &_inverted_index_iterators[cid]); + } + return Status::OK(); +} + Status SegmentIterator::_lookup_ordinal(const RowCursor& key, bool is_include, rowid_t upper_bound, rowid_t* rowid) { if (_segment->_tablet_schema->keys_type() == UNIQUE_KEYS && @@ -1865,7 +1934,8 @@ Status SegmentIterator::_read_columns(const std::vector& column_ids, } Status SegmentIterator::_init_current_block( - vectorized::Block* block, std::vector& current_columns) { + vectorized::Block* block, std::vector& current_columns, + uint32_t nrows_read_limit) { block->clear_column_data(_schema->num_column_ids()); for (size_t i = 0; i < _schema->num_column_ids(); i++) { @@ -1885,7 +1955,7 @@ Status SegmentIterator::_init_current_block( column_desc->path() == nullptr ? 
"" : column_desc->path()->get_path()); // TODO reuse current_columns[cid] = file_column_type->create_column(); - current_columns[cid]->reserve(_opts.block_row_max); + current_columns[cid]->reserve(nrows_read_limit); } else { // the column in block must clear() here to insert new data if (_is_pred_column[cid] || @@ -1904,7 +1974,7 @@ Status SegmentIterator::_init_current_block( } else if (column_desc->type() == FieldType::OLAP_FIELD_TYPE_DATETIME) { current_columns[cid]->set_datetime_type(); } - current_columns[cid]->reserve(_opts.block_row_max); + current_columns[cid]->reserve(nrows_read_limit); } } } @@ -1962,7 +2032,16 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32 } DBUG_EXECUTE_IF("segment_iterator._read_columns_by_index", { - return Status::Error("{} does not need to read data"); + auto debug_col_name = DebugPoints::instance()->get_debug_param_or_default( + "segment_iterator._read_columns_by_index", "column_name", ""); + if (debug_col_name.empty()) { + return Status::Error("does not need to read data"); + } + auto col_name = _opts.tablet_schema->column(cid).name(); + if (debug_col_name.find(col_name) != std::string::npos) { + return Status::Error("does not need to read data, {}", + debug_col_name); + } }) if (is_continuous) { @@ -2156,9 +2235,27 @@ Status SegmentIterator::_read_columns_by_rowids(std::vector& read_colu } for (auto cid : read_column_ids) { - if (_prune_column(cid, (*mutable_columns)[cid], true, select_size)) { + auto& colunm = (*mutable_columns)[cid]; + if (_no_need_read_key_data(cid, colunm, select_size)) { continue; } + if (_prune_column(cid, colunm, true, select_size)) { + continue; + } + + DBUG_EXECUTE_IF("segment_iterator._read_columns_by_index", { + auto debug_col_name = DebugPoints::instance()->get_debug_param_or_default( + "segment_iterator._read_columns_by_index", "column_name", ""); + if (debug_col_name.empty()) { + return Status::Error("does not need to read data"); + } + auto col_name = 
_opts.tablet_schema->column(cid).name(); + if (debug_col_name.find(col_name) != std::string::npos) { + return Status::Error("does not need to read data, {}", + debug_col_name); + } + }) + RETURN_IF_ERROR(_column_iterators[cid]->read_by_rowids(rowids.data(), select_size, _current_return_columns[cid])); } @@ -2310,14 +2407,16 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { } } } - RETURN_IF_ERROR(_init_current_block(block, _current_return_columns)); - _converted_column_ids.assign(_schema->columns().size(), 0); - _current_batch_rows_read = 0; uint32_t nrows_read_limit = _opts.block_row_max; if (_can_opt_topn_reads()) { nrows_read_limit = std::min(static_cast(_opts.topn_limit), nrows_read_limit); } + + RETURN_IF_ERROR(_init_current_block(block, _current_return_columns, nrows_read_limit)); + _converted_column_ids.assign(_schema->columns().size(), 0); + + _current_batch_rows_read = 0; RETURN_IF_ERROR(_read_columns_by_index( nrows_read_limit, _current_batch_rows_read, _lazy_materialization_read || _opts.record_rowids || _is_need_expr_eval)); @@ -2767,9 +2866,69 @@ void SegmentIterator::_calculate_pred_in_remaining_conjunct_root( } } +void SegmentIterator::_calculate_func_in_remaining_conjunct_root() { + auto hash = [](const vectorized::VExprSPtr& expr) -> std::size_t { + return std::hash()(expr->expr_name()); + }; + auto equal = [](const vectorized::VExprSPtr& lhs, const vectorized::VExprSPtr& rhs) -> bool { + return lhs->equals(*rhs); + }; + + uint32_t next_id = 0; + std::unordered_map unique_map( + 0, hash, equal); + + auto gen_func_unique_id = [&unique_map, &next_id](const vectorized::VExprSPtr& expr) { + auto it = unique_map.find(expr); + if (it != unique_map.end()) { + return it->second; + } else { + unique_map[expr] = ++next_id; + return next_id; + } + }; + + for (const auto& root_expr_ctx : _common_expr_ctxs_push_down) { + const auto& root_expr = root_expr_ctx->root(); + if (root_expr == nullptr) { + continue; + } + + std::stack> stack; 
+ stack.emplace(root_expr, false); + + while (!stack.empty()) { + const auto& [expr, has_compound_pred] = stack.top(); + stack.pop(); + + bool current_has_compound_pred = + has_compound_pred || (expr->node_type() == TExprNodeType::COMPOUND_PRED); + + if (expr->node_type() == TExprNodeType::FUNCTION_CALL && + expr->can_push_down_to_index()) { + expr->set_index_unique_id(gen_func_unique_id(expr)); + if (current_has_compound_pred) { + compound_func_exprs.emplace_back(expr, root_expr_ctx); + } else { + no_compound_func_exprs.emplace_back(expr, root_expr_ctx); + } + } + + const auto& children = expr->children(); + for (int32_t i = children.size() - 1; i >= 0; --i) { + if (!children[i]->children().empty()) { + stack.emplace(children[i], current_has_compound_pred); + } + } + } + } +} + bool SegmentIterator::_no_need_read_key_data(ColumnId cid, vectorized::MutableColumnPtr& column, size_t nrows_read) { - if (_opts.tablet_schema->keys_type() != KeysType::DUP_KEYS) { + if (!((_opts.tablet_schema->keys_type() == KeysType::DUP_KEYS || + (_opts.tablet_schema->keys_type() == KeysType::UNIQUE_KEYS && + _opts.enable_unique_key_merge_on_write)))) { return false; } @@ -2827,11 +2986,38 @@ bool SegmentIterator::_can_opt_topn_reads() const { return false; } - if (!_col_predicates.empty() || !_col_preds_except_leafnode_of_andnode.empty()) { - return false; + std::set cids; + for (auto* pred : _col_predicates) { + cids.insert(pred->column_id()); + } + for (auto* pred : _col_preds_except_leafnode_of_andnode) { + cids.insert(pred->column_id()); } - return true; + uint32_t delete_sign_idx = _opts.tablet_schema->delete_sign_idx(); + bool result = std::ranges::all_of(cids.begin(), cids.end(), [delete_sign_idx](auto cid) { + return cid == delete_sign_idx; + }); + + return result; +} + +Status SegmentIterator::execute_func_expr(const vectorized::VExprSPtr& expr, + const vectorized::VExprContextSPtr& expr_ctx, + std::shared_ptr& result) { + const auto& expr0 = expr->get_child(0); + if (!expr0 
|| expr0->node_type() != TExprNodeType::SLOT_REF) { + return Status::RuntimeError("cannot perform index filtering"); + } + + FuncExprParams params; + auto slot_expr = std::static_pointer_cast(expr0); + params._column_id = _schema->column_id(slot_expr->column_id()); + params._unique_id = _schema->unique_id(slot_expr->column_id()); + params._column_name = _opts.tablet_schema->column(params._column_id).name(); + params._segment_iterator = this; + + return expr->eval_inverted_index(expr_ctx.get(), params, result); } } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 6383a9435e8558..ecae1ea6affafe 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -107,6 +107,15 @@ struct ColumnPredicateInfo { int32_t column_id; }; +class SegmentIterator; +struct FuncExprParams { + ColumnId _column_id = 0; + uint32_t _unique_id = 0; + std::string _column_name; + SegmentIterator* _segment_iterator = nullptr; + std::shared_ptr result; +}; + class SegmentIterator : public RowwiseIterator { public: SegmentIterator(std::shared_ptr segment, SchemaSPtr schema); @@ -123,6 +132,8 @@ class SegmentIterator : public RowwiseIterator { std::vector* block_row_locations) override; const Schema& schema() const override { return *_schema; } + Segment& segment() { return *_segment; } + StorageReadOptions& storage_read_options() { return _opts; } bool is_lazy_materialization_read() const override { return _lazy_materialization_read; } uint64_t data_id() const override { return _segment->id(); } RowsetId rowset_id() const { return _segment->rowset_id(); } @@ -142,6 +153,11 @@ class SegmentIterator : public RowwiseIterator { return updated; } + std::vector>& inverted_index_iterators() { + return _inverted_index_iterators; + } + [[nodiscard]] Status _init_inverted_index_iterators(ColumnId cid); + private: Status 
_next_batch_internal(vectorized::Block* block); @@ -222,7 +238,8 @@ class SegmentIterator : public RowwiseIterator { bool set_block_rowid); void _replace_version_col(size_t num_rows); Status _init_current_block(vectorized::Block* block, - std::vector& non_pred_vector); + std::vector& non_pred_vector, + uint32_t nrows_read_limit); uint16_t _evaluate_vectorization_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size); uint16_t _evaluate_short_circuit_predicate(uint16_t* sel_rowid_idx, uint16_t selected_size); void _output_non_pred_columns(vectorized::Block* block); @@ -307,6 +324,7 @@ class SegmentIterator : public RowwiseIterator { bool _check_column_pred_all_push_down(const std::string& column_name, bool in_compound = false, bool is_match = false); void _calculate_pred_in_remaining_conjunct_root(const vectorized::VExprSPtr& expr); + void _calculate_func_in_remaining_conjunct_root(); // todo(wb) remove this method after RowCursor is removed void _convert_rowcursor_to_short_key(const RowCursor& key, size_t num_keys) { @@ -386,6 +404,10 @@ class SegmentIterator : public RowwiseIterator { bool _can_opt_topn_reads() const; + Status execute_func_expr(const vectorized::VExprSPtr& expr, + const vectorized::VExprContextSPtr& expr_ctx, + std::shared_ptr& result); + class BitmapRangeIterator; class BackwardBitmapRangeIterator; @@ -451,6 +473,11 @@ class SegmentIterator : public RowwiseIterator { // make a copy of `_opts.column_predicates` in order to make local changes std::vector _col_predicates; std::vector _col_preds_except_leafnode_of_andnode; + + using FuncExprPair = std::pair; + std::vector no_compound_func_exprs; + std::vector compound_func_exprs; + vectorized::VExprContextSPtrs _common_expr_ctxs_push_down; bool _enable_common_expr_pushdown = false; std::vector _remaining_conjunct_roots; @@ -492,6 +519,13 @@ class SegmentIterator : public RowwiseIterator { std::set _output_columns; std::unique_ptr _path_reader; + + std::vector _ret_flags; + + std::unordered_map> + 
_column_predicate_inverted_index_status; + + std::mutex _idx_init_lock; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 38b79f47f101e9..54c27205431d82 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -256,8 +256,11 @@ Status SegmentWriter::init(const std::vector& col_ids, bool has_key) { if (column.is_row_store_column()) { // smaller page size for row store column - opts.data_page_size = config::row_column_page_size; + auto page_size = _tablet_schema->row_store_page_size(); + opts.data_page_size = + (page_size > 0) ? page_size : segment_v2::ROW_STORE_PAGE_SIZE_DEFAULT_VALUE; } + std::unique_ptr writer; RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _file_writer, &writer)); RETURN_IF_ERROR(writer->init()); @@ -520,7 +523,7 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* // 1. if the delete sign is marked, it means that the value columns of the row will not // be read. So we don't need to read the missing values from the previous rows. - // 2. the one exception is when there are sequence columns in the table, we need to read + // 2. 
the one exception is when there is sequence column in the table, we need to read // the sequence columns, otherwise it may cause the merge-on-read based compaction // policy to produce incorrect results if (have_delete_sign && !_tablet_schema->has_sequence_col()) { @@ -639,9 +642,9 @@ Status SegmentWriter::fill_missing_columns(vectorized::MutableColumns& mutable_f auto rowset = _rsid_to_rowset[rs_it.first]; CHECK(rowset); std::vector rids; - for (auto id_and_pos : seg_it.second) { - rids.emplace_back(id_and_pos.rid); - read_index[id_and_pos.pos] = read_idx++; + for (auto [rid, pos] : seg_it.second) { + rids.emplace_back(rid); + read_index[pos] = read_idx++; } if (has_row_column) { auto st = tablet->fetch_value_through_row_column( @@ -695,7 +698,7 @@ Status SegmentWriter::fill_missing_columns(vectorized::MutableColumns& mutable_f // fill all missing value from mutable_old_columns, need to consider default value and null value for (auto idx = 0; idx < use_default_or_null_flag.size(); idx++) { - // `use_default_or_null_flag[idx] == true` doesn't mean that we should read values from the old row + // `use_default_or_null_flag[idx] == false` doesn't mean that we should read values from the old row // for the missing columns. For example, if a table has sequence column, the rows with DELETE_SIGN column // marked will not be marked in delete bitmap(see https://github.com/apache/doris/pull/24011), so it will // be found in Tablet::lookup_row_key() and `use_default_or_null_flag[idx]` will be false. 
But we should not diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index a6f9ceee7c521d..d32e75fbebbd6b 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -213,8 +213,11 @@ Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletCo if (column.is_row_store_column()) { // smaller page size for row store column - opts.data_page_size = config::row_column_page_size; + auto page_size = _tablet_schema->row_store_page_size(); + opts.data_page_size = + (page_size > 0) ? page_size : segment_v2::ROW_STORE_PAGE_SIZE_DEFAULT_VALUE; } + std::unique_ptr writer; RETURN_IF_ERROR(ColumnWriter::create(opts, &column, _file_writer, &writer)); RETURN_IF_ERROR(writer->init()); @@ -460,7 +463,7 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da // 1. if the delete sign is marked, it means that the value columns of the row will not // be read. So we don't need to read the missing values from the previous rows. - // 2. the one exception is when there are sequence columns in the table, we need to read + // 2. 
the one exception is when there is sequence column in the table, we need to read // the sequence columns, otherwise it may cause the merge-on-read based compaction // policy to produce incorrect results if (have_delete_sign && !_tablet_schema->has_sequence_col()) { @@ -579,9 +582,9 @@ Status VerticalSegmentWriter::_fill_missing_columns( auto rowset = _rsid_to_rowset[rs_it.first]; CHECK(rowset); std::vector rids; - for (auto id_and_pos : seg_it.second) { - rids.emplace_back(id_and_pos.rid); - read_index[id_and_pos.pos] = read_idx++; + for (auto [rid, pos] : seg_it.second) { + rids.emplace_back(rid); + read_index[pos] = read_idx++; } if (has_row_column) { auto st = tablet->fetch_value_through_row_column( @@ -633,7 +636,7 @@ Status VerticalSegmentWriter::_fill_missing_columns( // fill all missing value from mutable_old_columns, need to consider default value and null value for (auto idx = 0; idx < use_default_or_null_flag.size(); idx++) { - // `use_default_or_null_flag[idx] == true` doesn't mean that we should read values from the old row + // `use_default_or_null_flag[idx] == false` doesn't mean that we should read values from the old row // for the missing columns. For example, if a table has sequence column, the rows with DELETE_SIGN column // marked will not be marked in delete bitmap(see https://github.com/apache/doris/pull/24011), so it will // be found in Tablet::lookup_row_key() and `use_default_or_null_flag[idx]` will be false. 
But we should not diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp index 6ec2f4ff88376d..7dc34c507cbc51 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp @@ -96,36 +96,32 @@ Status VerticalBetaRowsetWriter::add_columns(const vectorized::Block* block, RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block(block, 0, num_rows)); } else { // value columns - uint32_t num_rows_written = _segment_writers[_cur_writer_idx]->num_rows_written(); - VLOG_NOTICE << "num_rows_written: " << num_rows_written - << ", _cur_writer_idx: " << _cur_writer_idx; - uint32_t num_rows_key_group = _segment_writers[_cur_writer_idx]->row_count(); - // init if it's first value column write in current segment - if (_cur_writer_idx == 0 && num_rows_written == 0) { - VLOG_NOTICE << "init first value column segment writer"; - RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->init(col_ids, is_key)); - } - // when splitting segment, need to make rows align between key columns and value columns - size_t start_offset = 0; - size_t limit = num_rows; - if (num_rows_written + num_rows >= num_rows_key_group && - _cur_writer_idx < _segment_writers.size() - 1) { - RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block( - block, 0, num_rows_key_group - num_rows_written)); - RETURN_IF_ERROR(_flush_columns(&_segment_writers[_cur_writer_idx])); - start_offset = num_rows_key_group - num_rows_written; - limit = num_rows - start_offset; - ++_cur_writer_idx; - // switch to next writer - RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->init(col_ids, is_key)); - num_rows_written = 0; - num_rows_key_group = _segment_writers[_cur_writer_idx]->row_count(); - } - if (limit > 0) { - RETURN_IF_ERROR( - _segment_writers[_cur_writer_idx]->append_block(block, start_offset, limit)); - DCHECK(_segment_writers[_cur_writer_idx]->num_rows_written() <= - 
_segment_writers[_cur_writer_idx]->row_count()); + int64_t left = num_rows; + while (left > 0) { + uint32_t num_rows_written = _segment_writers[_cur_writer_idx]->num_rows_written(); + VLOG_NOTICE << "num_rows_written: " << num_rows_written + << ", _cur_writer_idx: " << _cur_writer_idx; + uint32_t num_rows_key_group = _segment_writers[_cur_writer_idx]->row_count(); + CHECK_LT(num_rows_written, num_rows_key_group); + // init if it's first value column write in current segment + if (num_rows_written == 0) { + VLOG_NOTICE << "init first value column segment writer"; + RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->init(col_ids, is_key)); + } + + int64_t to_write = num_rows_written + left >= num_rows_key_group + ? num_rows_key_group - num_rows_written + : left; + RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block(block, num_rows - left, + to_write)); + left -= to_write; + CHECK_GE(left, 0); + + if (num_rows_key_group == num_rows_written + to_write && + _cur_writer_idx < _segment_writers.size() - 1) { + RETURN_IF_ERROR(_flush_columns(&_segment_writers[_cur_writer_idx])); + ++_cur_writer_idx; + } } } if (is_key) { diff --git a/be/src/olap/rowset_builder.cpp b/be/src/olap/rowset_builder.cpp index a1edc61e4784a1..4194d3ae6c3009 100644 --- a/be/src/olap/rowset_builder.cpp +++ b/be/src/olap/rowset_builder.cpp @@ -40,6 +40,7 @@ #include "olap/rowset/beta_rowset_writer.h" #include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset_meta.h" +#include "olap/rowset/rowset_meta_manager.h" #include "olap/rowset/rowset_writer.h" #include "olap/rowset/rowset_writer_context.h" #include "olap/schema_change.h" @@ -118,7 +119,7 @@ void RowsetBuilder::_garbage_collection() { Status RowsetBuilder::init_mow_context(std::shared_ptr& mow_context) { std::lock_guard lck(tablet()->get_header_lock()); - int64_t cur_max_version = tablet()->max_version_unlocked().second; + _max_version_in_flush_phase = tablet()->max_version_unlocked().second; std::vector rowset_ptrs; 
// tablet is under alter process. The delete bitmap will be calculated after conversion. if (tablet()->tablet_state() == TABLET_NOTREADY) { @@ -130,12 +131,12 @@ Status RowsetBuilder::init_mow_context(std::shared_ptr& mow_context) } _rowset_ids.clear(); } else { - RETURN_IF_ERROR(tablet()->all_rs_id(cur_max_version, &_rowset_ids)); + RETURN_IF_ERROR(tablet()->all_rs_id(_max_version_in_flush_phase, &_rowset_ids)); rowset_ptrs = tablet()->get_rowset_by_ids(&_rowset_ids); } _delete_bitmap = std::make_shared(tablet()->tablet_id()); - mow_context = std::make_shared(cur_max_version, _req.txn_id, _rowset_ids, - rowset_ptrs, _delete_bitmap); + mow_context = std::make_shared(_max_version_in_flush_phase, _req.txn_id, + _rowset_ids, rowset_ptrs, _delete_bitmap); return Status::OK(); } @@ -325,10 +326,11 @@ Status RowsetBuilder::commit_txn() { // => update_schema: A(bigint), B(double), C(int), D(int) RETURN_IF_ERROR(tablet()->update_by_least_common_schema(rw_ctx.tablet_schema)); } + // Transfer ownership of `PendingRowsetGuard` to `TxnManager` - Status res = _engine.txn_manager()->commit_txn(_req.partition_id, *tablet(), _req.txn_id, - _req.load_id, _rowset, - std::move(_pending_rs_guard), false); + Status res = _engine.txn_manager()->commit_txn( + _req.partition_id, *tablet(), _req.txn_id, _req.load_id, _rowset, + std::move(_pending_rs_guard), false, _partial_update_info); if (!res && !res.is()) { LOG(WARNING) << "Failed to commit txn: " << _req.txn_id @@ -402,7 +404,8 @@ void BaseRowsetBuilder::_build_current_tablet_schema(int64_t index_id, table_schema_param->partial_update_input_columns(), table_schema_param->is_strict_mode(), table_schema_param->timestamp_ms(), table_schema_param->timezone(), - table_schema_param->auto_increment_coulumn()); + table_schema_param->auto_increment_coulumn(), + _max_version_in_flush_phase); } } // namespace doris diff --git a/be/src/olap/rowset_builder.h b/be/src/olap/rowset_builder.h index 8f254074c3716d..362e976da71976 100644 --- 
a/be/src/olap/rowset_builder.h +++ b/be/src/olap/rowset_builder.h @@ -107,6 +107,7 @@ class BaseRowsetBuilder { std::unique_ptr _calc_delete_bitmap_token; // current rowset_ids, used to do diff in publish_version RowsetIdUnorderedSet _rowset_ids; + int64_t _max_version_in_flush_phase {-1}; std::shared_ptr _partial_update_info; diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index a0483ad5d8ec37..81e8d022ecc5b7 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -17,6 +17,10 @@ #include "olap/schema_change.h" +#include +#include +#include + #include #include #include @@ -279,51 +283,63 @@ Status BlockChanger::change_block(vectorized::Block* ref_block, vectorized::VExprContext::filter_block(ctx.get(), ref_block, ref_block->columns())); } - const int row_size = ref_block->rows(); - const int column_size = new_block->columns(); + const int row_num = ref_block->rows(); + const int new_schema_cols_num = new_block->columns(); - // swap ref_block[key] and new_block[value] + // will be used for swaping ref_block[entry.first] and new_block[entry.second] std::list> swap_idx_list; - for (int idx = 0; idx < column_size; idx++) { - if (_schema_mapping[idx].expr != nullptr) { + for (int idx = 0; idx < new_schema_cols_num; idx++) { + auto expr = _schema_mapping[idx].expr; + if (expr != nullptr) { vectorized::VExprContextSPtr ctx; - RETURN_IF_ERROR(vectorized::VExpr::create_expr_tree(*_schema_mapping[idx].expr, ctx)); + RETURN_IF_ERROR(vectorized::VExpr::create_expr_tree(*expr, ctx)); RETURN_IF_ERROR(ctx->prepare(state.get(), row_desc)); RETURN_IF_ERROR(ctx->open(state.get())); - int result_column_id = -1; - RETURN_IF_ERROR(ctx->execute(ref_block, &result_column_id)); - if (ref_block->get_by_position(result_column_id).column == nullptr) { + int result_tmp_column_idx = -1; + RETURN_IF_ERROR(ctx->execute(ref_block, &result_tmp_column_idx)); + auto& result_tmp_column_def = ref_block->get_by_position(result_tmp_column_idx); + if 
(result_tmp_column_def.column == nullptr) { return Status::Error( - "{} result column is nullptr", - ref_block->get_by_position(result_column_id).name); + "result column={} is nullptr, input expr={}", result_tmp_column_def.name, + apache::thrift::ThriftDebugString(*expr)); } - ref_block->replace_by_position_if_const(result_column_id); + ref_block->replace_by_position_if_const(result_tmp_column_idx); - if (ref_block->get_by_position(result_column_id).column->size() != row_size) { + if (result_tmp_column_def.column->size() != row_num) { return Status::Error( - "{} size invalid, expect={}, real={}", new_block->get_by_position(idx).name, - row_size, ref_block->get_by_position(result_column_id).column->size()); + "result size invalid, expect={}, real={}; input expr={}", row_num, + result_tmp_column_def.column->size(), + apache::thrift::ThriftDebugString(*expr)); + } + + if (_type == SCHEMA_CHANGE) { + // danger casts (expected to be rejected by upstream caller) may cause data to be null and result in data loss in schema change + // for rollup, this check is unecessary, and ref columns are not set in this case, it works on exprs + + // column_idx in base schema + int32_t ref_column_idx = _schema_mapping[idx].ref_column_idx; + DCHECK_GE(ref_column_idx, 0); + auto& ref_column_def = ref_block->get_by_position(ref_column_idx); + RETURN_IF_ERROR( + _check_cast_valid(ref_column_def.column, result_tmp_column_def.column)); } - RETURN_IF_ERROR(_check_cast_valid(ref_block->get_by_position(idx).column, - ref_block->get_by_position(result_column_id).column, - _type)); - swap_idx_list.emplace_back(result_column_id, idx); - } else if (_schema_mapping[idx].ref_column < 0) { + swap_idx_list.emplace_back(result_tmp_column_idx, idx); + } else if (_schema_mapping[idx].ref_column_idx < 0) { // new column, write default value auto* value = _schema_mapping[idx].default_value; auto column = new_block->get_by_position(idx).column->assume_mutable(); if (value->is_null()) { 
DCHECK(column->is_nullable()); - column->insert_many_defaults(row_size); + column->insert_many_defaults(row_num); } else { auto type_info = get_type_info(_schema_mapping[idx].new_column); DefaultValueColumnIterator::insert_default_data(type_info.get(), value->size(), - value->ptr(), column, row_size); + value->ptr(), column, row_num); } } else { // same type, just swap column - swap_idx_list.emplace_back(_schema_mapping[idx].ref_column, idx); + swap_idx_list.emplace_back(_schema_mapping[idx].ref_column_idx, idx); } } @@ -361,81 +377,90 @@ Status BlockChanger::change_block(vectorized::Block* ref_block, return Status::OK(); } -// This check is to prevent schema-change from causing data loss -Status BlockChanger::_check_cast_valid(vectorized::ColumnPtr ref_column, - vectorized::ColumnPtr new_column, - AlterTabletType type) const { - if (ref_column->size() != new_column->size()) { +// This check can prevent schema-change from causing data loss after type cast +Status BlockChanger::_check_cast_valid(vectorized::ColumnPtr input_column, + vectorized::ColumnPtr output_column) { + if (input_column->size() != output_column->size()) { return Status::InternalError( - "column size is changed, ref_column_size={}, new_column_size={}", - ref_column->size(), new_column->size()); - } - if (type == ROLLUP) { - return Status::OK(); + "column size is changed, input_column_size={}, output_column_size={}; " + "input_column={}", + input_column->size(), output_column->size(), input_column->get_name()); } - if (ref_column->is_nullable() != new_column->is_nullable()) { - if (ref_column->is_nullable()) { + DCHECK_EQ(input_column->size(), output_column->size()) + << "length check should have done before calling this function!"; + + if (input_column->is_nullable() != output_column->is_nullable()) { + if (input_column->is_nullable()) { const auto* ref_null_map = - vectorized::check_and_get_column(ref_column) + vectorized::check_and_get_column(input_column) ->get_null_map_column() .get_data() 
.data(); bool is_changed = false; - for (size_t i = 0; i < ref_column->size(); i++) { + for (size_t i = 0; i < input_column->size(); i++) { is_changed |= ref_null_map[i]; } if (is_changed) { - return Status::DataQualityError("Null data is changed to not nullable"); + return Status::DataQualityError( + "some null data is changed to not null, intput_column={}", + input_column->get_name()); } } else { const auto& null_map_column = - vectorized::check_and_get_column(new_column) + vectorized::check_and_get_column(output_column) ->get_null_map_column(); const auto& nested_column = - vectorized::check_and_get_column(new_column) + vectorized::check_and_get_column(output_column) ->get_nested_column(); const auto* new_null_map = null_map_column.get_data().data(); - if (null_map_column.size() != new_column->size() || - nested_column.size() != new_column->size()) { - DCHECK(false) << "null_map_column_size=" << null_map_column.size() - << " new_column_size=" << new_column->size() - << " nested_column_size=" << nested_column.size(); + if (null_map_column.size() != output_column->size()) { return Status::InternalError( - "null_map_column size is changed, null_map_column_size={}, " - "new_column_size={}", - null_map_column.size(), new_column->size()); + "null_map_column size mismatch output_column_size, " + "null_map_column_size={}, output_column_size={}; input_column={}", + null_map_column.size(), output_column->size(), input_column->get_name()); + } + + if (nested_column.size() != output_column->size()) { + return Status::InternalError( + "nested_column size is changed, nested_column_size={}, " + "ouput_column_size={}; input_column={}", + nested_column.size(), output_column->size(), input_column->get_name()); } bool is_changed = false; - for (size_t i = 0; i < ref_column->size(); i++) { + for (size_t i = 0; i < input_column->size(); i++) { is_changed |= new_null_map[i]; } if (is_changed) { - return Status::DataQualityError("Some data is changed to null"); + return 
Status::DataQualityError( + "some not null data is changed to null, intput_column={}", + input_column->get_name()); } } } - if (ref_column->is_nullable() && new_column->is_nullable()) { + if (input_column->is_nullable() && output_column->is_nullable()) { const auto* ref_null_map = - vectorized::check_and_get_column(ref_column) + vectorized::check_and_get_column(input_column) ->get_null_map_column() .get_data() .data(); const auto* new_null_map = - vectorized::check_and_get_column(new_column) + vectorized::check_and_get_column(output_column) ->get_null_map_column() .get_data() .data(); bool is_changed = false; - for (size_t i = 0; i < ref_column->size(); i++) { + for (size_t i = 0; i < input_column->size(); i++) { is_changed |= (ref_null_map[i] != new_null_map[i]); } if (is_changed) { - return Status::DataQualityError("is_null of data is changed!"); + return Status::DataQualityError( + "null map is changed after calculation, input_column={}", + input_column->get_name()); } } return Status::OK(); @@ -1197,6 +1222,8 @@ Status SchemaChangeHandler::_parse_request(const SchemaChangeParams& sc_params, ColumnMapping* column_mapping = changer->get_mutable_column_mapping(i); column_mapping->new_column = &new_column; + column_mapping->ref_column_idx = base_tablet_schema->field_index(new_column.name()); + if (materialized_function_map.find(column_name_lower) != materialized_function_map.end()) { auto mv_param = materialized_function_map.find(column_name_lower)->second; column_mapping->expr = mv_param.expr; @@ -1205,9 +1232,7 @@ Status SchemaChangeHandler::_parse_request(const SchemaChangeParams& sc_params, } } - int32_t column_index = base_tablet_schema->field_index(new_column.name()); - if (column_index >= 0) { - column_mapping->ref_column = column_index; + if (column_mapping->ref_column_idx >= 0) { continue; } @@ -1230,7 +1255,7 @@ Status SchemaChangeHandler::_parse_request(const SchemaChangeParams& sc_params, return Status::InternalError("failed due to operate on shadow 
column"); } // Newly added column go here - column_mapping->ref_column = -1; + column_mapping->ref_column_idx = -1; if (i < base_tablet_schema->num_short_key_columns()) { *sc_directly = true; @@ -1259,7 +1284,7 @@ Status SchemaChangeHandler::_parse_request(const SchemaChangeParams& sc_params, continue; } - if (column_mapping->ref_column != i - num_default_value) { + if (column_mapping->ref_column_idx != i - num_default_value) { *sc_sorting = true; return Status::OK(); } @@ -1316,9 +1341,9 @@ Status SchemaChangeHandler::_parse_request(const SchemaChangeParams& sc_params, if (column_mapping->expr != nullptr) { *sc_directly = true; return Status::OK(); - } else if (column_mapping->ref_column >= 0) { + } else if (column_mapping->ref_column_idx >= 0) { const auto& column_new = new_tablet_schema->column(i); - const auto& column_old = base_tablet_schema->column(column_mapping->ref_column); + const auto& column_old = base_tablet_schema->column(column_mapping->ref_column_idx); // index changed if (column_new.is_bf_column() != column_old.is_bf_column() || column_new.has_bitmap_index() != column_old.has_bitmap_index()) { diff --git a/be/src/olap/schema_change.h b/be/src/olap/schema_change.h index 6528b587a9aa0a..fb54f1c8a1b1b2 100644 --- a/be/src/olap/schema_change.h +++ b/be/src/olap/schema_change.h @@ -84,8 +84,8 @@ class BlockChanger { bool has_where() const { return _where_expr != nullptr; } private: - Status _check_cast_valid(vectorized::ColumnPtr ref_column, vectorized::ColumnPtr new_column, - AlterTabletType type) const; + static Status _check_cast_valid(vectorized::ColumnPtr ref_column, + vectorized::ColumnPtr new_column); // @brief column-mapping specification of new schema SchemaMapping _schema_mapping; diff --git a/be/src/olap/snapshot_manager.cpp b/be/src/olap/snapshot_manager.cpp index ffa3c5403a83f0..6e36d0756af195 100644 --- a/be/src/olap/snapshot_manager.cpp +++ b/be/src/olap/snapshot_manager.cpp @@ -88,6 +88,9 @@ Status SnapshotManager::make_snapshot(const 
TSnapshotRequest& request, string* s TabletSharedPtr ref_tablet = StorageEngine::instance()->tablet_manager()->get_tablet(request.tablet_id); + + DBUG_EXECUTE_IF("SnapshotManager::make_snapshot.inject_failure", { ref_tablet = nullptr; }) + if (ref_tablet == nullptr) { return Status::Error("failed to get tablet. tablet={}", request.tablet_id); } diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index 82c07a59152455..f4b11b8fb62145 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -63,6 +63,7 @@ #include "olap/olap_common.h" #include "olap/olap_define.h" #include "olap/olap_meta.h" +#include "olap/rowset/rowset_fwd.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_meta_manager.h" #include "olap/rowset/unique_rowset_id_generator.h" @@ -807,6 +808,9 @@ Status StorageEngine::start_trash_sweep(double* usage, bool ignore_guard) { // cleand unused pending publish info for deleted tablet _clean_unused_pending_publish_info(); + // clean unused partial update info for finished txns + _clean_unused_partial_update_info(); + // clean unused rowsets in remote storage backends for (auto data_dir : get_stores()) { data_dir->perform_remote_rowset_gc(); @@ -970,6 +974,34 @@ void StorageEngine::_clean_unused_pending_publish_info() { } } +void StorageEngine::_clean_unused_partial_update_info() { + std::vector> remove_infos; + auto unused_partial_update_info_collector = + [this, &remove_infos](int64_t tablet_id, int64_t partition_id, int64_t txn_id, + std::string_view value) -> bool { + TabletSharedPtr tablet = _tablet_manager->get_tablet(tablet_id); + if (tablet == nullptr) { + remove_infos.emplace_back(tablet_id, partition_id, txn_id); + return true; + } + TxnState txn_state = + _txn_manager->get_txn_state(partition_id, txn_id, tablet_id, tablet->tablet_uid()); + if (txn_state == TxnState::NOT_FOUND || txn_state == TxnState::ABORTED || + txn_state == TxnState::DELETED) { + 
remove_infos.emplace_back(tablet_id, partition_id, txn_id); + return true; + } + return true; + }; + auto data_dirs = get_stores(); + for (auto* data_dir : data_dirs) { + static_cast(RowsetMetaManager::traverse_partial_update_info( + data_dir->get_meta(), unused_partial_update_info_collector)); + static_cast( + RowsetMetaManager::remove_partial_update_infos(data_dir->get_meta(), remove_infos)); + } +} + void StorageEngine::gc_binlogs(const std::unordered_map& gc_tablet_infos) { for (auto [tablet_id, version] : gc_tablet_infos) { LOG(INFO) << fmt::format("start to gc binlogs for tablet_id: {}, version: {}", tablet_id, diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index f647869e82500c..5562257133c5fb 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -255,6 +255,8 @@ class StorageEngine { void _clean_unused_pending_publish_info(); + void _clean_unused_partial_update_info(); + Status _do_sweep(const std::string& scan_root, const time_t& local_tm_now, const int32_t expire); diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index fc72bd568644fb..79405b1fe0b214 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -969,6 +969,14 @@ Status Tablet::capture_consistent_versions(const Version& spec_version, } } } + + DBUG_EXECUTE_IF("TTablet::capture_consistent_versions.inject_failure", { + auto tablet_id = dp->param("tablet_id", -1); + if (tablet_id != -1 && tablet_id == _tablet_meta->tablet_id()) { + status = Status::Error("version already merged"); + } + }); + return status; } @@ -1997,8 +2005,11 @@ Status Tablet::prepare_compaction_and_calculate_permits(CompactionType compactio } permits = 0; - for (auto&& rowset : compaction->input_rowsets()) { - permits += rowset->rowset_meta()->get_compaction_score(); + // Time series policy does not rely on permits, it uses goal size to control memory + if (tablet->tablet_meta()->compaction_policy() != CUMULATIVE_TIME_SERIES_POLICY) { + for (auto&& rowset 
: compaction->input_rowsets()) { + permits += rowset->rowset_meta()->get_compaction_score(); + } } return Status::OK(); } @@ -3224,27 +3235,56 @@ Status Tablet::generate_new_block_for_partial_update( auto old_block = rowset_schema->create_block_by_cids(missing_cids); auto update_block = rowset_schema->create_block_by_cids(update_cids); - std::map read_index_old; - RETURN_IF_ERROR(read_columns_by_plan(rowset_schema, missing_cids, read_plan_ori, rsid_to_rowset, - old_block, &read_index_old)); + auto get_delete_sign_column_data = [](vectorized::Block& block, + size_t rows) -> const signed char* { + if (const vectorized::ColumnWithTypeAndName* delete_sign_column = + block.try_get_by_name(DELETE_SIGN); + delete_sign_column != nullptr) { + const auto& delete_sign_col = + reinterpret_cast(*(delete_sign_column->column)); + if (delete_sign_col.size() >= rows) { + return delete_sign_col.get_data().data(); + } + } + return nullptr; + }; + // rowid in the final block(start from 0, increase continuously) -> rowid to read in update_block std::map read_index_update; + + // read current rowset first, if a row in the current rowset has delete sign mark + // we don't need to read values from old block RETURN_IF_ERROR(read_columns_by_plan(rowset_schema, update_cids, read_plan_update, rsid_to_rowset, update_block, &read_index_update)); - const vectorized::Int8* delete_sign_column_data = nullptr; - if (const vectorized::ColumnWithTypeAndName* delete_sign_column = - old_block.try_get_by_name(DELETE_SIGN); - delete_sign_column != nullptr) { - auto& delete_sign_col = - reinterpret_cast(*(delete_sign_column->column)); - delete_sign_column_data = delete_sign_col.get_data().data(); + size_t update_rows = read_index_update.size(); + for (auto i = 0; i < update_cids.size(); ++i) { + for (auto idx = 0; idx < update_rows; ++idx) { + full_mutable_columns[update_cids[i]]->insert_from( + *update_block.get_columns_with_type_and_name()[i].column.get(), + read_index_update[idx]); + } } + // if there 
is sequence column in the table, we need to read the sequence column, + // otherwise it may cause the merge-on-read based compaction policy to produce incorrect results + const auto* __restrict new_block_delete_signs = + rowset_schema->has_sequence_col() + ? nullptr + : get_delete_sign_column_data(update_block, update_rows); + + // rowid in the final block(start from 0, increase, may not be continuous because we skip to read some rows) -> rowid to read in old_block + std::map read_index_old; + RETURN_IF_ERROR(read_columns_by_plan(rowset_schema, missing_cids, read_plan_ori, rsid_to_rowset, + old_block, &read_index_old, new_block_delete_signs)); + size_t old_rows = read_index_old.size(); + const auto* __restrict old_block_delete_signs = + get_delete_sign_column_data(old_block, old_rows); + // build default value block auto default_value_block = old_block.clone_empty(); auto mutable_default_value_columns = default_value_block.mutate_columns(); - if (delete_sign_column_data != nullptr) { + if (old_block_delete_signs != nullptr || new_block_delete_signs != nullptr) { for (auto i = 0; i < missing_cids.size(); ++i) { const auto& column = rowset_schema->column(missing_cids[i]); if (column.has_default_value()) { @@ -3257,22 +3297,26 @@ Status Tablet::generate_new_block_for_partial_update( } } - // build full block - CHECK(read_index_old.size() == read_index_update.size()); + CHECK(update_rows >= old_rows); + // build full block for (auto i = 0; i < missing_cids.size(); ++i) { const auto& rs_column = rowset_schema->column(missing_cids[i]); - for (auto idx = 0; idx < read_index_old.size(); ++idx) { - // if the conflict update is a delete sign, which means that the key is - // not exist now, we should not read old values from the deleted data, - // and should use default value instead.
- // NOTE: since now we are in the publishing phase, all data is commited - // before, even the `strict_mode` is true (which requires partial update - // load job can't insert new keys), this "new" key MUST be written into - // the new generated segment file. - if (delete_sign_column_data != nullptr && - delete_sign_column_data[read_index_old[idx]] != 0) { - auto& mutable_column = full_mutable_columns[missing_cids[i]]; + auto& mutable_column = full_mutable_columns[missing_cids[i]]; + for (auto idx = 0; idx < update_rows; ++idx) { + // There are two cases we don't need to read values from old data: + // 1. if the conflicting new row's delete sign is marked, which means the value columns + // of the row will not be read. So we don't need to read the missing values from the previous rows. + // 2. if the conflicting old row's delete sign is marked, which means that the key does not exist now, + // we should not read old values from the deleted data, and should use default value instead. + // NOTE: since now we are in the publishing phase, all data is committed + // before, even the `strict_mode` is true (which requires partial update + // load job can't insert new keys), this "new" key MUST be written into + // the new generated segment file.
+ if (new_block_delete_signs != nullptr && new_block_delete_signs[idx]) { + mutable_column->insert_default(); + } else if (old_block_delete_signs != nullptr && + old_block_delete_signs[read_index_old[idx]] != 0) { if (rs_column.has_default_value()) { mutable_column->insert_from(*mutable_default_value_columns[i].get(), 0); } else if (rs_column.is_nullable()) { @@ -3281,18 +3325,11 @@ Status Tablet::generate_new_block_for_partial_update( } else { mutable_column->insert_default(); } - continue; + } else { + mutable_column->insert_from( + *old_block.get_columns_with_type_and_name()[i].column.get(), + read_index_old[idx]); } - full_mutable_columns[missing_cids[i]]->insert_from( - *old_block.get_columns_with_type_and_name()[i].column.get(), - read_index_old[idx]); - } - } - for (auto i = 0; i < update_cids.size(); ++i) { - for (auto idx = 0; idx < read_index_update.size(); ++idx) { - full_mutable_columns[update_cids[i]]->insert_from( - *update_block.get_columns_with_type_and_name()[i].column.get(), - read_index_update[idx]); } } output_block->set_columns(std::move(full_mutable_columns)); @@ -3307,7 +3344,8 @@ Status Tablet::read_columns_by_plan(TabletSchemaSPtr tablet_schema, const PartialUpdateReadPlan& read_plan, const std::map& rsid_to_rowset, vectorized::Block& block, - std::map* read_index) { + std::map* read_index, + const signed char* __restrict skip_map) { bool has_row_column = tablet_schema->store_row_column(); auto mutable_columns = block.mutate_columns(); size_t read_idx = 0; @@ -3316,9 +3354,12 @@ Status Tablet::read_columns_by_plan(TabletSchemaSPtr tablet_schema, auto rowset_iter = rsid_to_rowset.find(rs_it.first); CHECK(rowset_iter != rsid_to_rowset.end()); std::vector rids; - for (auto id_and_pos : seg_it.second) { - rids.emplace_back(id_and_pos.rid); - (*read_index)[id_and_pos.pos] = read_idx++; + for (auto [rid, pos] : seg_it.second) { + if (skip_map && skip_map[pos]) { + continue; + } + rids.emplace_back(rid); + (*read_index)[pos] = read_idx++; } if 
(has_row_column) { auto st = fetch_value_through_row_column(rowset_iter->second, *tablet_schema, @@ -3506,7 +3547,9 @@ Status Tablet::update_delete_bitmap(TabletTxnInfo* txn_info, int64_t txn_id) { // When the new segment flush fails or the rowset build fails, the deletion marker for the // duplicate key of the original segment should not remain in `txn_info->delete_bitmap`, // so we need to make a copy of `txn_info->delete_bitmap` and make changes on it. - if (txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update) { + bool is_partial_update = + txn_info->partial_update_info && txn_info->partial_update_info->is_partial_update; + if (is_partial_update) { delete_bitmap = std::make_shared(*(txn_info->delete_bitmap)); } @@ -3540,6 +3583,37 @@ Status Tablet::update_delete_bitmap(TabletTxnInfo* txn_info, int64_t txn_id) { } auto t3 = watch.get_elapse_time_us(); + // If a rowset is produced by compaction before the commit phase of the partial update load + // and is not included in txn_info->rowset_ids, we can skip the alignment process of that rowset + // because data remains the same before and after compaction. But we still need to calculate the + // the delete bitmap for that rowset. 
+ std::vector rowsets_skip_alignment; + if (is_partial_update) { + int64_t max_version_in_flush_phase = + txn_info->partial_update_info->max_version_in_flush_phase; + DCHECK(max_version_in_flush_phase != -1); + std::vector remained_rowsets; + for (const auto& rowset : specified_rowsets) { + if (rowset->end_version() <= max_version_in_flush_phase && + rowset->produced_by_compaction()) { + rowsets_skip_alignment.emplace_back(rowset); + } else { + remained_rowsets.emplace_back(rowset); + } + } + if (!rowsets_skip_alignment.empty()) { + specified_rowsets = std::move(remained_rowsets); + } + } + + if (!rowsets_skip_alignment.empty()) { + auto token = _engine.calc_delete_bitmap_executor()->create_token(); + // set rowset_writer to nullptr to skip the alignment process + RETURN_IF_ERROR(calc_delete_bitmap(rowset, segments, rowsets_skip_alignment, delete_bitmap, + cur_version - 1, token.get(), nullptr)); + RETURN_IF_ERROR(token->wait()); + } + auto token = _engine.calc_delete_bitmap_executor()->create_token(); RETURN_IF_ERROR(calc_delete_bitmap(rowset, segments, specified_rowsets, delete_bitmap, cur_version - 1, token.get(), rowset_writer.get())); @@ -3945,6 +4019,7 @@ Status Tablet::ingest_binlog_metas(RowsetBinlogMetasPB* metas_pb) { void Tablet::clear_cache() { std::shared_lock rlock(get_header_lock()); + SCOPED_SIMPLE_TRACE_IF_TIMEOUT(TRACE_TABLET_LOCK_THRESHOLD); static auto recycle_segment_cache = [](const auto& rowset_map) { for (auto& [_, rowset] : rowset_map) { rowset->clear_cache(); @@ -4106,4 +4181,40 @@ Status Tablet::calc_local_file_crc(uint32_t* crc_value, int64_t start_version, i return Status::OK(); } +Status Tablet::show_nested_index_file(std::string* json_meta) { + Version v(0, max_version_unlocked().second); + std::vector rowsets; + traverse_rowsets([&rowsets, &v](const auto& rs) { + // get all rowsets + if (v.contains(rs->version())) { + rowsets.emplace_back(rs); + } + }); + std::sort(rowsets.begin(), rowsets.end(), Rowset::comparator); + + 
rapidjson::Document doc; + doc.SetObject(); + rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); + rapidjson::Value tabletIdValue(tablet_id()); + doc.AddMember("tablet_id", tabletIdValue, allocator); + + rapidjson::Value rowsets_value(rapidjson::kArrayType); + + for (const auto& rs : rowsets) { + rapidjson::Value rowset_value(rapidjson::kObjectType); + + auto rowset = std::static_pointer_cast(rs); + RETURN_IF_ERROR(rowset->show_nested_index_file(&rowset_value, allocator)); + rowsets_value.PushBack(rowset_value, allocator); + } + doc.AddMember("rowsets", rowsets_value, allocator); + + rapidjson::StringBuffer buffer; + rapidjson::PrettyWriter writer(buffer); + doc.Accept(writer); + *json_meta = std::string(buffer.GetString()); + + return Status::OK(); +} + } // namespace doris diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index 4de6a05d74532c..b4aca7ba3cbf67 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -486,7 +486,8 @@ class Tablet final : public BaseTablet { const std::vector cids_to_read, const PartialUpdateReadPlan& read_plan, const std::map& rsid_to_rowset, - vectorized::Block& block, std::map* read_index); + vectorized::Block& block, std::map* read_index, + const signed char* __restrict skip_map = nullptr); void prepare_to_read(const RowLocation& row_location, size_t pos, PartialUpdateReadPlan* read_plan); Status generate_new_block_for_partial_update( @@ -605,6 +606,7 @@ class Tablet final : public BaseTablet { void clear_cache(); Status calc_local_file_crc(uint32_t* crc_value, int64_t start_version, int64_t end_version, int32_t* rowset_count, int64_t* file_count); + Status show_nested_index_file(std::string* json_meta); private: Status _init_once_action(); diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 72bf0f0ee39ad2..4f20cbc01f589b 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -298,6 +298,9 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t 
partition_id, int64_t tablet_id if (tablet_schema.__isset.store_row_column) { schema->set_store_row_column(tablet_schema.store_row_column); } + if (tablet_schema.__isset.row_store_page_size) { + schema->set_row_store_page_size(tablet_schema.row_store_page_size); + } if (tablet_schema.__isset.skip_write_index_on_load) { schema->set_skip_write_index_on_load(tablet_schema.skip_write_index_on_load); } diff --git a/be/src/olap/tablet_reader.h b/be/src/olap/tablet_reader.h index 3bf83ec296c04b..942c61f8207727 100644 --- a/be/src/olap/tablet_reader.h +++ b/be/src/olap/tablet_reader.h @@ -184,6 +184,8 @@ class TabletReader { void check_validation() const; std::string to_string() const; + + int64_t batch_size = -1; }; TabletReader() = default; diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index 26d9d913f2f4e3..62d80fb28c195c 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -547,11 +547,18 @@ void TabletColumn::init_from_pb(const ColumnPB& column) { _visible = column.visible(); } if (_type == FieldType::OLAP_FIELD_TYPE_ARRAY) { - CHECK(column.children_columns_size() == 1) << "ARRAY type has more than 1 children types."; + CHECK(column.children_columns_size() == 1) + << "ARRAY type should has 1 children types, but got " + << column.children_columns_size(); } if (_type == FieldType::OLAP_FIELD_TYPE_MAP) { - DCHECK(column.children_columns_size() == 2) << "MAP type has more than 2 children types."; - LOG(WARNING) << "MAP type has more than 2 children types."; + DCHECK(column.children_columns_size() == 2) + << "MAP type should has 2 children types, but got " + << column.children_columns_size(); + if (UNLIKELY(column.children_columns_size() != 2)) { + LOG(WARNING) << "MAP type should has 2 children types, but got " + << column.children_columns_size(); + } } for (size_t i = 0; i < column.children_columns_size(); i++) { TabletColumn child_column; @@ -617,11 +624,15 @@ void TabletColumn::to_schema_pb(ColumnPB* column) 
const { column->set_visible(_visible); if (_type == FieldType::OLAP_FIELD_TYPE_ARRAY) { - CHECK(_sub_columns.size() == 1) << "ARRAY type has more than 1 children types."; + CHECK(_sub_columns.size() == 1) + << "ARRAY type should has 1 children types, but got " << _sub_columns.size(); } if (_type == FieldType::OLAP_FIELD_TYPE_MAP) { - DCHECK(_sub_columns.size() == 2) << "MAP type has more than 2 children types."; - LOG(WARNING) << "MAP type has more than 2 children types."; + DCHECK(_sub_columns.size() == 2) + << "MAP type should has 2 children types, but got " << _sub_columns.size(); + if (UNLIKELY(_sub_columns.size() != 2)) { + LOG(WARNING) << "MAP type should has 2 children types, but got " << _sub_columns.size(); + } } for (size_t i = 0; i < _sub_columns.size(); i++) { @@ -982,6 +993,7 @@ void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extrac _sort_type = schema.sort_type(); _sort_col_num = schema.sort_col_num(); _compression_type = schema.compression_type(); + _row_store_page_size = schema.row_store_page_size(); _schema_version = schema.schema_version(); // Default to V1 inverted index storage format for backward compatibility if not specified in schema. 
if (!schema.has_inverted_index_storage_format()) { @@ -1040,6 +1052,7 @@ void TabletSchema::build_current_tablet_schema(int64_t index_id, int32_t version _skip_write_index_on_load = ori_tablet_schema.skip_write_index_on_load(); _sort_type = ori_tablet_schema.sort_type(); _sort_col_num = ori_tablet_schema.sort_col_num(); + _row_store_page_size = ori_tablet_schema.row_store_page_size(); // copy from table_schema_param _schema_version = version; @@ -1193,6 +1206,7 @@ void TabletSchema::to_schema_pb(TabletSchemaPB* tablet_schema_pb) const { tablet_schema_pb->set_sort_col_num(_sort_col_num); tablet_schema_pb->set_schema_version(_schema_version); tablet_schema_pb->set_compression_type(_compression_type); + tablet_schema_pb->set_row_store_page_size(_row_store_page_size); tablet_schema_pb->set_version_col_idx(_version_col_idx); tablet_schema_pb->set_inverted_index_storage_format(_inverted_index_storage_format); } @@ -1494,6 +1508,7 @@ bool operator==(const TabletSchema& a, const TabletSchema& b) { if (a._disable_auto_compaction != b._disable_auto_compaction) return false; if (a._enable_single_replica_compaction != b._enable_single_replica_compaction) return false; if (a._store_row_column != b._store_row_column) return false; + if (a._row_store_page_size != b._row_store_page_size) return false; if (a._skip_write_index_on_load != b._skip_write_index_on_load) return false; return true; } diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index bd3b1f6ca4efad..ed7ab896107e36 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -36,6 +36,7 @@ #include "common/status.h" #include "gutil/stringprintf.h" #include "olap/olap_common.h" +#include "olap/rowset/segment_v2/options.h" #include "runtime/define_primitive_type.h" #include "runtime/descriptors.h" #include "util/string_util.h" @@ -338,6 +339,8 @@ class TabletSchema { void set_version_col_idx(int32_t version_col_idx) { _version_col_idx = version_col_idx; } int32_t version_col_idx() 
const { return _version_col_idx; } segment_v2::CompressionTypePB compression_type() const { return _compression_type; } + void set_row_store_page_size(long page_size) { _row_store_page_size = page_size; } + long row_store_page_size() const { return _row_store_page_size; } const std::vector& indexes() const { return _indexes; } bool has_inverted_index() const { @@ -482,6 +485,7 @@ class TabletSchema { size_t _num_rows_per_row_block = 0; CompressKind _compress_kind = COMPRESS_NONE; segment_v2::CompressionTypePB _compression_type = segment_v2::CompressionTypePB::LZ4F; + long _row_store_page_size = segment_v2::ROW_STORE_PAGE_SIZE_DEFAULT_VALUE; size_t _next_column_unique_id = 0; std::string _auto_increment_column; diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index 62af1fec61a2b1..300b65527c1006 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -190,7 +190,7 @@ Status EngineCloneTask::_do_clone() { tablet->tablet_id(), tablet->replica_id(), false)); tablet.reset(); } - bool is_new_tablet = tablet == nullptr; + _is_new_tablet = tablet == nullptr; // try to incremental clone std::vector missed_versions; // try to repair a tablet with missing version @@ -229,7 +229,7 @@ Status EngineCloneTask::_do_clone() { if (missed_versions.empty()) { LOG(INFO) << "missed version size = 0, skip clone and return success. 
tablet_id=" << _clone_req.tablet_id << " replica_id=" << _clone_req.replica_id; - RETURN_IF_ERROR(_set_tablet_info(is_new_tablet)); + RETURN_IF_ERROR(_set_tablet_info()); return Status::OK(); } @@ -308,10 +308,11 @@ Status EngineCloneTask::_do_clone() { TabletMeta::construct_header_file_path(tablet_dir, _clone_req.tablet_id); RETURN_IF_ERROR(io::global_local_filesystem()->delete_file(header_path)); } - return _set_tablet_info(is_new_tablet); + + return _set_tablet_info(); } -Status EngineCloneTask::_set_tablet_info(bool is_new_tablet) { +Status EngineCloneTask::_set_tablet_info() { // Get clone tablet info TTabletInfo tablet_info; tablet_info.__set_tablet_id(_clone_req.tablet_id); @@ -321,7 +322,7 @@ Status EngineCloneTask::_set_tablet_info(bool is_new_tablet) { if (_clone_req.__isset.version && tablet_info.version < _clone_req.version) { // if it is a new tablet and clone failed, then remove the tablet // if it is incremental clone, then must not drop the tablet - if (is_new_tablet) { + if (_is_new_tablet) { // we need to check if this cloned table's version is what we expect. // if not, maybe this is a stale remaining table which is waiting for drop. // we drop it. 
diff --git a/be/src/olap/task/engine_clone_task.h b/be/src/olap/task/engine_clone_task.h index 6924bfc2aa9ad7..80b9fdf4213f4a 100644 --- a/be/src/olap/task/engine_clone_task.h +++ b/be/src/olap/task/engine_clone_task.h @@ -56,6 +56,8 @@ class EngineCloneTask : public EngineTask { vector* tablet_infos); ~EngineCloneTask() {} + bool is_new_tablet() const { return _is_new_tablet; } + private: Status _do_clone(); @@ -72,7 +74,7 @@ class EngineCloneTask : public EngineTask { const vector& missing_versions, bool* allow_incremental_clone); - Status _set_tablet_info(bool is_new_tablet); + Status _set_tablet_info(); // Download tablet files from Status _download_files(DataDir* data_dir, const std::string& remote_url_prefix, @@ -95,6 +97,7 @@ class EngineCloneTask : public EngineTask { int64_t _copy_size; int64_t _copy_time_ms; std::vector _pending_rs_guards; + bool _is_new_tablet = false; }; // EngineTask } // namespace doris diff --git a/be/src/olap/task/engine_publish_version_task.cpp b/be/src/olap/task/engine_publish_version_task.cpp index 96cad7f934d1b6..6108e81bae3b59 100644 --- a/be/src/olap/task/engine_publish_version_task.cpp +++ b/be/src/olap/task/engine_publish_version_task.cpp @@ -110,6 +110,20 @@ Status EnginePublishVersionTask::execute() { std::this_thread::sleep_for(std::chrono::milliseconds(wait)); } }); + DBUG_EXECUTE_IF("EnginePublishVersionTask::execute.enable_spin_wait", { + auto token = dp->param("token", "invalid_token"); + while (DebugPoints::instance()->is_enable("EnginePublishVersionTask::execute.block")) { + auto block_dp = DebugPoints::instance()->get_debug_point( + "EnginePublishVersionTask::execute.block"); + if (block_dp) { + auto pass_token = block_dp->param("pass_token", ""); + if (pass_token == token) { + break; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + }); std::unique_ptr token = StorageEngine::instance()->tablet_publish_txn_thread_pool()->new_token( ThreadPool::ExecutionMode::CONCURRENT); diff --git 
a/be/src/olap/txn_manager.cpp b/be/src/olap/txn_manager.cpp index 2ed1ac5674d53f..373c398df61fb1 100644 --- a/be/src/olap/txn_manager.cpp +++ b/be/src/olap/txn_manager.cpp @@ -33,9 +33,11 @@ #include "common/config.h" #include "common/logging.h" +#include "common/status.h" #include "olap/data_dir.h" #include "olap/delta_writer.h" #include "olap/olap_common.h" +#include "olap/partial_update_info.h" #include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset_meta.h" #include "olap/rowset/rowset_meta_manager.h" @@ -173,10 +175,11 @@ Status TxnManager::prepare_txn(TPartitionId partition_id, TTransactionId transac Status TxnManager::commit_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id, const PUniqueId& load_id, const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, - bool is_recovery) { + bool is_recovery, + std::shared_ptr partial_update_info) { return commit_txn(tablet.data_dir()->get_meta(), partition_id, transaction_id, tablet.tablet_id(), tablet.tablet_uid(), load_id, rowset_ptr, - std::move(guard), is_recovery); + std::move(guard), is_recovery, partial_update_info); } Status TxnManager::publish_txn(TPartitionId partition_id, const TabletSharedPtr& tablet, @@ -259,7 +262,8 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, TTransactionId transaction_id, TTabletId tablet_id, TabletUid tablet_uid, const PUniqueId& load_id, const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, - bool is_recovery) { + bool is_recovery, + std::shared_ptr partial_update_info) { if (partition_id < 1 || transaction_id < 1 || tablet_id < 1) { LOG(WARNING) << "invalid commit req " << " partition_id=" << partition_id << " transaction_id=" << transaction_id @@ -369,6 +373,36 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, save_status.append(fmt::format(", txn id: {}", transaction_id)); return save_status; } + + if (partial_update_info && partial_update_info->is_partial_update) 
{ + PartialUpdateInfoPB partial_update_info_pb; + partial_update_info->to_pb(&partial_update_info_pb); + save_status = RowsetMetaManager::save_partial_update_info( + meta, tablet_id, partition_id, transaction_id, partial_update_info_pb); + if (!save_status.ok()) { + save_status.append(fmt::format(", txn_id: {}", transaction_id)); + return save_status; + } + } + } + + TabletSharedPtr tablet; + std::shared_ptr decoded_partial_update_info {nullptr}; + if (is_recovery) { + tablet = _engine.tablet_manager()->get_tablet(tablet_id, tablet_uid); + if (tablet != nullptr && tablet->enable_unique_key_merge_on_write()) { + PartialUpdateInfoPB partial_update_info_pb; + auto st = RowsetMetaManager::try_get_partial_update_info( + meta, tablet_id, partition_id, transaction_id, &partial_update_info_pb); + if (st.ok()) { + decoded_partial_update_info = std::make_shared(); + decoded_partial_update_info->from_pb(&partial_update_info_pb); + DCHECK(decoded_partial_update_info->is_partial_update); + } else if (!st.is()) { + // the load is not a partial update + return st; + } + } } { @@ -376,11 +410,17 @@ Status TxnManager::commit_txn(OlapMeta* meta, TPartitionId partition_id, auto load_info = std::make_shared(load_id, rowset_ptr); load_info->pending_rs_guard = std::move(guard); if (is_recovery) { - TabletSharedPtr tablet = _engine.tablet_manager()->get_tablet(tablet_info.tablet_id, - tablet_info.tablet_uid); if (tablet != nullptr && tablet->enable_unique_key_merge_on_write()) { load_info->unique_key_merge_on_write = true; load_info->delete_bitmap.reset(new DeleteBitmap(tablet->tablet_id())); + if (decoded_partial_update_info) { + LOG_INFO( + "get partial update info from RocksDB during recovery. 
txn_id={}, " + "partition_id={}, tablet_id={}, partial_update_info=[{}]", + transaction_id, partition_id, tablet_id, + decoded_partial_update_info->summary()); + load_info->partial_update_info = decoded_partial_update_info; + } } } load_info->commit(); @@ -513,6 +553,20 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, return status; } + if (tablet_txn_info->unique_key_merge_on_write && tablet_txn_info->partial_update_info && + tablet_txn_info->partial_update_info->is_partial_update) { + status = RowsetMetaManager::remove_partial_update_info(meta, tablet_id, partition_id, + transaction_id); + if (!status) { + // discard the error status and print the warning log + LOG_WARNING( + "fail to remove partial update info from RocksDB. txn_id={}, rowset_id={}, " + "tablet_id={}, tablet_uid={}", + transaction_id, rowset->rowset_id().to_string(), tablet_id, + tablet_uid.to_string()); + } + } + // TODO(Drogon): remove these test codes if (enable_binlog) { auto version_str = fmt::format("{}", version.first); @@ -692,6 +746,13 @@ void TxnManager::force_rollback_tablet_related_txns(OlapMeta* meta, TTabletId ta } } } + if (meta != nullptr) { + Status st = RowsetMetaManager::remove_tablet_related_partial_update_info(meta, tablet_id); + if (!st.ok()) { + LOG_WARNING("failed to partial update info, tablet_id={}, err={}", tablet_id, + st.to_string()); + } + } } void TxnManager::get_txn_related_tablets(const TTransactionId transaction_id, diff --git a/be/src/olap/txn_manager.h b/be/src/olap/txn_manager.h index 431ce6e49cf43d..ab34113c7e76c9 100644 --- a/be/src/olap/txn_manager.h +++ b/be/src/olap/txn_manager.h @@ -36,7 +36,6 @@ #include "common/status.h" #include "olap/olap_common.h" -#include "olap/partial_update_info.h" #include "olap/rowset/pending_rowset_helper.h" #include "olap/rowset/rowset.h" #include "olap/rowset/rowset_meta.h" @@ -52,6 +51,7 @@ namespace doris { class DeltaWriter; class OlapMeta; struct TabletPublishStatistics; +struct 
PartialUpdateInfo; enum class TxnState { NOT_FOUND = 0, @@ -143,8 +143,8 @@ class TxnManager { Status commit_txn(TPartitionId partition_id, const Tablet& tablet, TTransactionId transaction_id, const PUniqueId& load_id, - const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, - bool is_recovery); + const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, bool is_recovery, + std::shared_ptr partial_update_info = nullptr); Status publish_txn(TPartitionId partition_id, const TabletSharedPtr& tablet, TTransactionId transaction_id, const Version& version, @@ -159,8 +159,8 @@ class TxnManager { Status commit_txn(OlapMeta* meta, TPartitionId partition_id, TTransactionId transaction_id, TTabletId tablet_id, TabletUid tablet_uid, const PUniqueId& load_id, - const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, - bool is_recovery); + const RowsetSharedPtr& rowset_ptr, PendingRowsetGuard guard, bool is_recovery, + std::shared_ptr partial_update_info = nullptr); // remove a txn from txn manager // not persist rowset meta because diff --git a/be/src/olap/wal/wal_table.cpp b/be/src/olap/wal/wal_table.cpp index ec0c412379af2d..e45157626a09f1 100644 --- a/be/src/olap/wal/wal_table.cpp +++ b/be/src/olap/wal/wal_table.cpp @@ -86,17 +86,22 @@ void WalTable::_pick_relay_wals() { Status WalTable::_relay_wal_one_by_one() { std::vector> need_retry_wals; - std::vector> need_delete_wals; for (auto wal_info : _replaying_queue) { wal_info->add_retry_num(); auto st = _replay_wal_internal(wal_info->get_wal_path()); auto msg = st.msg(); if (st.ok() || st.is() || st.is() || st.is() || - msg.find("LabelAlreadyUsedException") != msg.npos) { + (msg.find("LabelAlreadyUsedException") != msg.npos && + (msg.find("[COMMITTED]") != msg.npos || msg.find("[VISIBLE]") != msg.npos))) { LOG(INFO) << "succeed to replay wal=" << wal_info->get_wal_path() << ", st=" << st.to_string(); - need_delete_wals.push_back(wal_info); + // delete wal + 
WARN_IF_ERROR(_exec_env->wal_mgr()->delete_wal(_table_id, wal_info->get_wal_id()), + "failed to delete wal=" + wal_info->get_wal_path()); + if (config::group_commit_wait_replay_wal_finish) { + RETURN_IF_ERROR(_exec_env->wal_mgr()->notify_relay_wal(wal_info->get_wal_id())); + } } else { doris::wal_fail << 1; LOG(WARNING) << "failed to replay wal=" << wal_info->get_wal_path() @@ -111,13 +116,6 @@ Status WalTable::_relay_wal_one_by_one() { _replay_wal_map.emplace(retry_wal_info->get_wal_path(), retry_wal_info); } } - for (auto delete_wal_info : need_delete_wals) { - [[maybe_unused]] auto st = - _exec_env->wal_mgr()->delete_wal(_table_id, delete_wal_info->get_wal_id()); - if (config::group_commit_wait_replay_wal_finish) { - RETURN_IF_ERROR(_exec_env->wal_mgr()->notify_relay_wal(delete_wal_info->get_wal_id())); - } - } return Status::OK(); } @@ -167,16 +165,14 @@ Status WalTable::_try_abort_txn(int64_t db_id, std::string& label) { request.__set_auth_code(0); // this is a fake, fe not check it now request.__set_db_id(db_id); request.__set_label(label); - std::string reason = "relay wal with label " + label; - request.__set_reason(reason); + request.__set_reason("relay wal with label " + label); TLoadTxnRollbackResult result; TNetworkAddress master_addr = _exec_env->master_info()->network_address; auto st = ThriftRpcHelper::rpc( master_addr.hostname, master_addr.port, [&request, &result](FrontendServiceConnection& client) { client->loadTxnRollback(result, request); - }, - 10000L); + }); auto result_status = Status::create(result.status); LOG(INFO) << "abort label " << label << ", st:" << st << ", result_status:" << result_status; return result_status; @@ -196,6 +192,8 @@ Status WalTable::_replay_wal_internal(const std::string& wal) { [[maybe_unused]] auto st = _try_abort_txn(_db_id, label); } #endif + DBUG_EXECUTE_IF("WalTable.replay_wals.stop", + { return Status::InternalError("WalTable.replay_wals.stop"); }); return _replay_one_wal_with_streamload(wal_id, wal, label); } 
diff --git a/be/src/pipeline/exec/file_scan_operator.cpp b/be/src/pipeline/exec/file_scan_operator.cpp index 6182d35b97cc68..d36d01427145b1 100644 --- a/be/src/pipeline/exec/file_scan_operator.cpp +++ b/be/src/pipeline/exec/file_scan_operator.cpp @@ -73,11 +73,13 @@ void FileScanLocalState::set_scan_ranges(RuntimeState* state, auto split_source = scan_range.split_source; RuntimeProfile::Counter* get_split_timer = ADD_TIMER(_runtime_profile, "GetSplitTime"); _split_source = std::make_shared( - state, get_split_timer, split_source.split_source_id, split_source.num_splits); + state, get_split_timer, split_source.split_source_id, split_source.num_splits, + _max_scanners); } } if (_split_source == nullptr) { - _split_source = std::make_shared(scan_ranges); + _split_source = + std::make_shared(scan_ranges, _max_scanners); } _max_scanners = std::min(_max_scanners, _split_source->num_scan_ranges()); if (scan_ranges.size() > 0 && diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index 002a79f2db2e0b..b7dd0622fe3349 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -21,6 +21,9 @@ #include "common/logging.h" #include "pipeline/exec/operator.h" +#include "runtime/descriptors.h" +#include "vec/common/assert_cast.h" +#include "vec/data_types/data_type_nullable.h" namespace doris { namespace pipeline { @@ -637,6 +640,54 @@ Status HashJoinProbeOperatorX::prepare(RuntimeState* state) { _left_table_data_types = vectorized::VectorizedUtils::get_data_types(_child_x->row_desc()); _right_table_column_names = vectorized::VectorizedUtils::get_column_names(_build_side_child->row_desc()); + + std::vector slots_to_check; + for (const auto& tuple_descriptor : _intermediate_row_desc->tuple_descriptors()) { + for (const auto& slot : tuple_descriptor->slots()) { + slots_to_check.emplace_back(slot); + } + } + + if (_is_mark_join) { + const auto* last_one = 
slots_to_check.back(); + slots_to_check.pop_back(); + auto data_type = last_one->get_data_type_ptr(); + if (!data_type->is_nullable()) { + return Status::InternalError( + "The last column for mark join should be Nullable(UInt8), not {}", + data_type->get_name()); + } + + const auto& null_data_type = assert_cast(*data_type); + if (null_data_type.get_nested_type()->get_type_id() != vectorized::TypeIndex::UInt8) { + return Status::InternalError( + "The last column for mark join should be Nullable(UInt8), not {}", + data_type->get_name()); + } + } + + const int right_col_idx = + (_is_right_semi_anti && !_have_other_join_conjunct) ? 0 : _left_table_data_types.size(); + size_t idx = 0; + for (const auto* slot : slots_to_check) { + auto data_type = slot->get_data_type_ptr(); + auto target_data_type = idx < right_col_idx ? _left_table_data_types[idx] + : _right_table_data_types[idx - right_col_idx]; + ++idx; + if (data_type->equals(*target_data_type)) { + continue; + } + + auto data_type_non_nullable = vectorized::remove_nullable(data_type); + if (data_type_non_nullable->equals(*target_data_type)) { + continue; + } + + return Status::InternalError("intermediate slot({}) data type not match: '{}' vs '{}'", + slot->id(), data_type->get_name(), + _left_table_data_types[idx]->get_name()); + } + _build_side_child.reset(); return Status::OK(); } diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index d9e228463776c6..e3bd69cb91e9a1 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -125,6 +125,8 @@ Status OlapScanLocalState::_init_profile() { _inverted_index_query_cache_miss_counter = ADD_COUNTER(_segment_profile, "InvertedIndexQueryCacheMiss", TUnit::UNIT); _inverted_index_query_timer = ADD_TIMER(_segment_profile, "InvertedIndexQueryTime"); + _inverted_index_query_null_bitmap_timer = + ADD_TIMER(_segment_profile, "InvertedIndexQueryNullBitmapTime"); 
_inverted_index_query_bitmap_copy_timer = ADD_TIMER(_segment_profile, "InvertedIndexQueryBitmapCopyTime"); _inverted_index_query_bitmap_op_timer = @@ -133,6 +135,10 @@ Status OlapScanLocalState::_init_profile() { ADD_TIMER(_segment_profile, "InvertedIndexSearcherOpenTime"); _inverted_index_searcher_search_timer = ADD_TIMER(_segment_profile, "InvertedIndexSearcherSearchTime"); + _inverted_index_searcher_cache_hit_counter = + ADD_COUNTER(_segment_profile, "InvertedIndexSearcherCacheHit", TUnit::UNIT); + _inverted_index_searcher_cache_miss_counter = + ADD_COUNTER(_segment_profile, "InvertedIndexSearcherCacheMiss", TUnit::UNIT); _output_index_result_column_timer = ADD_TIMER(_segment_profile, "OutputIndexResultColumnTimer"); diff --git a/be/src/pipeline/exec/olap_scan_operator.h b/be/src/pipeline/exec/olap_scan_operator.h index 8ec318e853bb71..d2659fd68f6821 100644 --- a/be/src/pipeline/exec/olap_scan_operator.h +++ b/be/src/pipeline/exec/olap_scan_operator.h @@ -174,6 +174,7 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _inverted_index_filter_counter = nullptr; RuntimeProfile::Counter* _inverted_index_filter_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_query_null_bitmap_timer = nullptr; RuntimeProfile::Counter* _inverted_index_query_cache_hit_counter = nullptr; RuntimeProfile::Counter* _inverted_index_query_cache_miss_counter = nullptr; RuntimeProfile::Counter* _inverted_index_query_timer = nullptr; @@ -181,6 +182,8 @@ class OlapScanLocalState final : public ScanLocalState { RuntimeProfile::Counter* _inverted_index_query_bitmap_op_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_open_timer = nullptr; RuntimeProfile::Counter* _inverted_index_searcher_search_timer = nullptr; + RuntimeProfile::Counter* _inverted_index_searcher_cache_hit_counter = nullptr; + RuntimeProfile::Counter* _inverted_index_searcher_cache_miss_counter = nullptr; RuntimeProfile::Counter* _output_index_result_column_timer = 
nullptr; diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index 7322b08556477a..39a57bee25b6fc 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -1268,9 +1268,8 @@ Status ScanLocalState::_start_scanners( state(), this, p._output_tuple_desc, p.output_row_descriptor(), scanners, p.limit(), state()->scan_queue_mem_limit(), _scan_dependency, // 1. If data distribution is ignored , we use 1 instance to scan. - // 2. Else if this operator is not file scan operator, we use config::doris_scanner_thread_pool_thread_num scanners to scan. - // 3. Else, file scanner will consume much memory so we use config::doris_scanner_thread_pool_thread_num / query_parallel_instance_num scanners to scan. - p.ignore_data_distribution() || !p.is_file_scan_operator() + // 2. Else, file scanner will consume much memory so we use config::doris_scanner_thread_pool_thread_num / query_parallel_instance_num scanners to scan. + p.ignore_data_distribution() && !p.is_file_scan_operator() ? 
1 : state()->query_parallel_instance_num()); return Status::OK(); @@ -1499,7 +1498,7 @@ Status ScanOperatorX::get_block(RuntimeState* state, vectorized: if (local_state._scanner_ctx) { local_state._scanner_ctx->stop_scanners(state); } - return Status::Cancelled("Query cancelled in ScanOperator"); + return Status::Cancelled(state->cancel_reason()); } if (local_state._eos) { diff --git a/be/src/pipeline/exec/schema_scan_operator.cpp b/be/src/pipeline/exec/schema_scan_operator.cpp index f26b2d706b7d95..d5353655ab070a 100644 --- a/be/src/pipeline/exec/schema_scan_operator.cpp +++ b/be/src/pipeline/exec/schema_scan_operator.cpp @@ -61,6 +61,7 @@ Status SchemaScanLocalState::init(RuntimeState* state, LocalStateInfo& info) { // new one scanner _schema_scanner = SchemaScanner::create(schema_table->schema_table_type()); + _schema_scanner->set_dependency(_data_dependency, _finish_dependency); if (nullptr == _schema_scanner) { return Status::InternalError("schema scanner get nullptr pointer."); } @@ -72,7 +73,7 @@ Status SchemaScanLocalState::open(RuntimeState* state) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(PipelineXLocalState<>::open(state)); - return _schema_scanner->start(state); + return _schema_scanner->get_next_block_async(state); } SchemaScanOperatorX::SchemaScanOperatorX(ObjectPool* pool, const TPlanNode& tnode, int operator_id, @@ -132,6 +133,17 @@ Status SchemaScanOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { _common_scanner_param->catalog = state->obj_pool()->add(new std::string(tnode.schema_scan_node.catalog)); } + + if (tnode.schema_scan_node.__isset.fe_addr_list) { + for (const auto& fe_addr : tnode.schema_scan_node.fe_addr_list) { + _common_scanner_param->fe_addr_list.insert(fe_addr); + } + } else if (tnode.schema_scan_node.__isset.ip && tnode.schema_scan_node.__isset.port) { + TNetworkAddress fe_addr; + fe_addr.hostname = tnode.schema_scan_node.ip; + fe_addr.port = tnode.schema_scan_node.port; + 
_common_scanner_param->fe_addr_list.insert(fe_addr); + } return Status::OK(); } @@ -239,8 +251,12 @@ Status SchemaScanOperatorX::get_block(RuntimeState* state, vectorized::Block* bl while (true) { RETURN_IF_CANCELLED(state); + if (local_state._data_dependency->is_blocked_by() != nullptr) { + break; + } // get all slots from schema table. - RETURN_IF_ERROR(local_state._schema_scanner->get_next_block(&src_block, &schema_eos)); + RETURN_IF_ERROR( + local_state._schema_scanner->get_next_block(state, &src_block, &schema_eos)); if (schema_eos) { *eos = true; diff --git a/be/src/pipeline/exec/schema_scan_operator.h b/be/src/pipeline/exec/schema_scan_operator.h index bd336132efb2b7..c026c105e95be7 100644 --- a/be/src/pipeline/exec/schema_scan_operator.h +++ b/be/src/pipeline/exec/schema_scan_operator.h @@ -55,18 +55,30 @@ class SchemaScanLocalState final : public PipelineXLocalState<> { ENABLE_FACTORY_CREATOR(SchemaScanLocalState); SchemaScanLocalState(RuntimeState* state, OperatorXBase* parent) - : PipelineXLocalState<>(state, parent) {} + : PipelineXLocalState<>(state, parent) { + _finish_dependency = + std::make_shared(parent->operator_id(), parent->node_id(), + parent->get_name() + "_FINISH_DEPENDENCY", true); + _data_dependency = std::make_shared(parent->operator_id(), parent->node_id(), + parent->get_name() + "_DEPENDENCY", true); + } ~SchemaScanLocalState() override = default; Status init(RuntimeState* state, LocalStateInfo& info) override; Status open(RuntimeState* state) override; + Dependency* finishdependency() override { return _finish_dependency.get(); } + std::vector dependencies() const override { return {_data_dependency.get()}; } + private: friend class SchemaScanOperatorX; SchemaScannerParam _scanner_param; std::unique_ptr _schema_scanner; + + std::shared_ptr _finish_dependency; + std::shared_ptr _data_dependency; }; class SchemaScanOperatorX final : public OperatorX { diff --git a/be/src/pipeline/exec/set_sink_operator.cpp 
b/be/src/pipeline/exec/set_sink_operator.cpp index aceeac25596213..1851519138865d 100644 --- a/be/src/pipeline/exec/set_sink_operator.cpp +++ b/be/src/pipeline/exec/set_sink_operator.cpp @@ -140,9 +140,13 @@ Status SetSinkOperatorX::_extract_build_column( block.get_by_position(result_col_id).column = block.get_by_position(result_col_id).column->convert_to_full_column_if_const(); + // Do make nullable should not change the origin column and type in origin block + // which may cause coredump problem if (local_state._shared_state->build_not_ignore_null[i]) { - block.get_by_position(result_col_id).column = - make_nullable(block.get_by_position(result_col_id).column); + auto column_ptr = make_nullable(block.get_by_position(result_col_id).column, false); + block.insert( + {column_ptr, make_nullable(block.get_by_position(result_col_id).type), ""}); + result_col_id = block.columns() - 1; } const auto* column = block.get_by_position(result_col_id).column.get(); diff --git a/be/src/pipeline/pipeline_x/dependency.h b/be/src/pipeline/pipeline_x/dependency.h index c4b8b9b9ff04f9..dfdb13b4414beb 100644 --- a/be/src/pipeline/pipeline_x/dependency.h +++ b/be/src/pipeline/pipeline_x/dependency.h @@ -89,20 +89,11 @@ class Dependency : public std::enable_shared_from_this { public: ENABLE_FACTORY_CREATOR(Dependency); Dependency(int id, int node_id, std::string name) - : _id(id), - _node_id(node_id), - _name(std::move(name)), - _is_write_dependency(false), - _ready(false) {} + : _id(id), _node_id(node_id), _name(std::move(name)), _ready(false) {} Dependency(int id, int node_id, std::string name, bool ready) - : _id(id), - _node_id(node_id), - _name(std::move(name)), - _is_write_dependency(true), - _ready(ready) {} + : _id(id), _node_id(node_id), _name(std::move(name)), _ready(ready) {} virtual ~Dependency() = default; - bool is_write_dependency() const { return _is_write_dependency; } [[nodiscard]] int id() const { return _id; } [[nodiscard]] virtual std::string name() const { return 
_name; } BasicSharedState* shared_state() { return _shared_state; } @@ -119,12 +110,10 @@ class Dependency : public std::enable_shared_from_this { // Notify downstream pipeline tasks this dependency is ready. void set_ready(); void set_ready_to_read() { - DCHECK(_is_write_dependency) << debug_string(); DCHECK(_shared_state->source_deps.size() == 1) << debug_string(); _shared_state->source_deps.front()->set_ready(); } void set_block_to_read() { - DCHECK(_is_write_dependency) << debug_string(); DCHECK(_shared_state->source_deps.size() == 1) << debug_string(); _shared_state->source_deps.front()->block(); } @@ -167,7 +156,6 @@ class Dependency : public std::enable_shared_from_this { const int _id; const int _node_id; const std::string _name; - const bool _is_write_dependency; std::atomic _ready; BasicSharedState* _shared_state = nullptr; @@ -684,10 +672,12 @@ struct SetSharedState : public BasicSharedState { // (select 0) intersect (select null) the build side hash table should not // ignore null value. std::vector data_types; + int i = 0; for (const auto& ctx : child_exprs_lists[0]) { - data_types.emplace_back(build_not_ignore_null[0] + data_types.emplace_back(build_not_ignore_null[i] ? 
make_nullable(ctx->root()->data_type()) : ctx->root()->data_type()); + i++; } if (!try_get_hash_map_context_fixed( *hash_table_variants, data_types)) { @@ -798,13 +788,13 @@ struct LocalExchangeSharedState : public BasicSharedState { } void add_total_mem_usage(size_t delta) { - if (mem_usage.fetch_add(delta) > config::local_exchange_buffer_mem_limit) { + if (mem_usage.fetch_add(delta) + delta > config::local_exchange_buffer_mem_limit) { sink_deps.front()->block(); } } void sub_total_mem_usage(size_t delta) { - if (mem_usage.fetch_sub(delta) <= config::local_exchange_buffer_mem_limit) { + if (mem_usage.fetch_sub(delta) - delta <= config::local_exchange_buffer_mem_limit) { sink_deps.front()->set_ready(); } } diff --git a/be/src/pipeline/pipeline_x/local_exchange/local_exchange_sink_operator.h b/be/src/pipeline/pipeline_x/local_exchange/local_exchange_sink_operator.h index 99b88747a98199..a32ecc21e000d4 100644 --- a/be/src/pipeline/pipeline_x/local_exchange/local_exchange_sink_operator.h +++ b/be/src/pipeline/pipeline_x/local_exchange/local_exchange_sink_operator.h @@ -49,6 +49,8 @@ class LocalExchangeSinkLocalState final : public PipelineXSinkLocalState + friend class Exchanger; ExchangerBase* _exchanger = nullptr; diff --git a/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.cpp b/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.cpp index 086a3b551fd126..b3a28a6404f46e 100644 --- a/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.cpp +++ b/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.cpp @@ -66,11 +66,13 @@ std::string LocalExchangeSourceLocalState::debug_string(int indentation_level) c fmt::memory_buffer debug_string_buffer; fmt::format_to(debug_string_buffer, "{}, _channel_id: {}, _num_partitions: {}, _num_senders: {}, _num_sources: {}, " - "_running_sink_operators: {}, _running_source_operators: {}, mem_usage: {}", + "_running_sink_operators: {}, 
_running_source_operators: {}, mem_usage: {}, " + "data queue info: {}", Base::debug_string(indentation_level), _channel_id, _exchanger->_num_partitions, _exchanger->_num_senders, _exchanger->_num_sources, _exchanger->_running_sink_operators, _exchanger->_running_source_operators, - _shared_state->mem_usage.load()); + _shared_state->mem_usage.load(), + _exchanger->data_queue_debug_string(_channel_id)); size_t i = 0; fmt::format_to(debug_string_buffer, ", MemTrackers: "); for (auto* mem_tracker : _shared_state->mem_trackers) { diff --git a/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.h b/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.h index 7cefc1ca9000e1..193b1c553f9ed9 100644 --- a/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.h +++ b/be/src/pipeline/pipeline_x/local_exchange/local_exchange_source_operator.h @@ -47,6 +47,8 @@ class LocalExchangeSourceLocalState final : public PipelineXLocalState + friend class Exchanger; ExchangerBase* _exchanger = nullptr; int _channel_id; diff --git a/be/src/pipeline/pipeline_x/local_exchange/local_exchanger.cpp b/be/src/pipeline/pipeline_x/local_exchange/local_exchanger.cpp index 7a044aaa77f153..eb3875dcf7cb21 100644 --- a/be/src/pipeline/pipeline_x/local_exchange/local_exchanger.cpp +++ b/be/src/pipeline/pipeline_x/local_exchange/local_exchanger.cpp @@ -24,6 +24,37 @@ namespace doris::pipeline { +template +bool Exchanger::_enqueue_data_and_set_ready(int channel_id, + LocalExchangeSinkLocalState& local_state, + BlockType&& block) { + std::unique_lock l(_m); + if (_data_queue[channel_id].enqueue(std::move(block))) { + local_state._shared_state->set_ready_to_read(channel_id); + return true; + } + return false; +} + +template +bool Exchanger::_dequeue_data(LocalExchangeSourceLocalState& local_state, + BlockType& block, bool* eos) { + bool all_finished = _running_sink_operators == 0; + if (_data_queue[local_state._channel_id].try_dequeue(block)) { + 
return true; + } else if (all_finished) { + *eos = true; + } else { + std::unique_lock l(_m); + if (_data_queue[local_state._channel_id].try_dequeue(block)) { + return true; + } + COUNTER_UPDATE(local_state._get_block_failed_counter, 1); + local_state._dependency->block(); + } + return false; +} + Status ShuffleExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, LocalExchangeSinkLocalState& local_state) { { @@ -72,17 +103,11 @@ Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block return Status::OK(); }; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(partitioned_block)) { + if (_dequeue_data(local_state, partitioned_block, eos)) { SCOPED_TIMER(local_state._copy_data_timer); mutable_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block( block, partitioned_block.first->data_block); RETURN_IF_ERROR(get_data(block)); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); } return Status::OK(); } @@ -90,7 +115,6 @@ Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, bool eos, LocalExchangeSinkLocalState& local_state) { - auto& data_queue = _data_queue; const auto rows = block->rows(); auto row_idx = std::make_shared>(rows); { @@ -133,9 +157,9 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest if (size > 0) { local_state._shared_state->add_mem_usage( it.second, new_block_wrapper->data_block.allocated_bytes(), false); - if (data_queue[it.second].enqueue({new_block_wrapper, {row_idx, start, size}})) { - local_state._shared_state->set_ready_to_read(it.second); - } else { + + if (!_enqueue_data_and_set_ready(it.second, local_state, + {new_block_wrapper, {row_idx, start, 
size}})) { local_state._shared_state->sub_mem_usage( it.second, new_block_wrapper->data_block.allocated_bytes(), false); new_block_wrapper->unref(local_state._shared_state); @@ -152,10 +176,8 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest if (size > 0) { local_state._shared_state->add_mem_usage( i % _num_sources, new_block_wrapper->data_block.allocated_bytes(), false); - if (data_queue[i % _num_sources].enqueue( - {new_block_wrapper, {row_idx, start, size}})) { - local_state._shared_state->set_ready_to_read(i % _num_sources); - } else { + if (!_enqueue_data_and_set_ready(i % _num_sources, local_state, + {new_block_wrapper, {row_idx, start, size}})) { local_state._shared_state->sub_mem_usage( i % _num_sources, new_block_wrapper->data_block.allocated_bytes(), false); @@ -175,9 +197,8 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest if (size > 0) { local_state._shared_state->add_mem_usage( map[i], new_block_wrapper->data_block.allocated_bytes(), false); - if (data_queue[map[i]].enqueue({new_block_wrapper, {row_idx, start, size}})) { - local_state._shared_state->set_ready_to_read(map[i]); - } else { + if (!_enqueue_data_and_set_ready(map[i], local_state, + {new_block_wrapper, {row_idx, start, size}})) { local_state._shared_state->sub_mem_usage( map[i], new_block_wrapper->data_block.allocated_bytes(), false); new_block_wrapper->unref(local_state._shared_state); @@ -201,9 +222,7 @@ Status PassthroughExchanger::sink(RuntimeState* state, vectorized::Block* in_blo auto channel_id = (local_state._channel_id++) % _num_partitions; size_t memory_usage = new_block.allocated_bytes(); local_state._shared_state->add_mem_usage(channel_id, memory_usage); - if (_data_queue[channel_id].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(channel_id); - } else { + if (!_enqueue_data_and_set_ready(channel_id, local_state, std::move(new_block))) { 
local_state._shared_state->sub_mem_usage(channel_id, memory_usage); } @@ -222,19 +241,13 @@ void PassthroughExchanger::close(LocalExchangeSourceLocalState& local_state) { Status PassthroughExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, LocalExchangeSourceLocalState& local_state) { vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(next_block)) { + if (_dequeue_data(local_state, next_block, eos)) { block->swap(next_block); local_state._shared_state->sub_mem_usage(local_state._channel_id, block->allocated_bytes()); if (_free_block_limit == 0 || _free_blocks.size_approx() < _free_block_limit * _num_sources) { _free_blocks.enqueue(std::move(next_block)); } - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); } return Status::OK(); } @@ -243,9 +256,7 @@ Status PassToOneExchanger::sink(RuntimeState* state, vectorized::Block* in_block LocalExchangeSinkLocalState& local_state) { vectorized::Block new_block(in_block->clone_empty()); new_block.swap(*in_block); - if (_data_queue[0].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(0); - } + _enqueue_data_and_set_ready(0, local_state, std::move(new_block)); return Status::OK(); } @@ -257,14 +268,8 @@ Status PassToOneExchanger::get_block(RuntimeState* state, vectorized::Block* blo return Status::OK(); } vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[0].try_dequeue(next_block)) { + if (_dequeue_data(local_state, next_block, eos)) { *block = std::move(next_block); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); } return Status::OK(); } @@ -274,9 +279,7 @@ Status BroadcastExchanger::sink(RuntimeState* state, vectorized::Block* in_block 
for (size_t i = 0; i < _num_partitions; i++) { auto mutable_block = vectorized::MutableBlock::create_unique(in_block->clone_empty()); RETURN_IF_ERROR(mutable_block->add_rows(in_block, 0, in_block->rows())); - if (_data_queue[i].enqueue(mutable_block->to_block())) { - local_state._shared_state->set_ready_to_read(i); - } + _enqueue_data_and_set_ready(i, local_state, mutable_block->to_block()); } return Status::OK(); @@ -293,14 +296,8 @@ void BroadcastExchanger::close(LocalExchangeSourceLocalState& local_state) { Status BroadcastExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, LocalExchangeSourceLocalState& local_state) { vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(next_block)) { + if (_dequeue_data(local_state, next_block, eos)) { *block = std::move(next_block); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); } return Status::OK(); } @@ -316,9 +313,8 @@ Status AdaptivePassthroughExchanger::_passthrough_sink(RuntimeState* state, auto channel_id = (local_state._channel_id++) % _num_partitions; size_t memory_usage = new_block.allocated_bytes(); local_state._shared_state->add_mem_usage(channel_id, memory_usage); - if (_data_queue[channel_id].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(channel_id); - } else { + + if (!_enqueue_data_and_set_ready(channel_id, local_state, std::move(new_block))) { local_state._shared_state->sub_mem_usage(channel_id, memory_usage); } @@ -349,7 +345,6 @@ Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, vectorized::Block* block, bool eos, LocalExchangeSinkLocalState& local_state) { - auto& data_queue = _data_queue; const auto rows = block->rows(); auto row_idx = std::make_shared>(rows); { @@ -378,9 +373,7 @@ Status 
AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, size_t memory_usage = new_block.allocated_bytes(); local_state._shared_state->add_mem_usage(i, memory_usage); - if (data_queue[i].enqueue(std::move(new_block))) { - local_state._shared_state->set_ready_to_read(i); - } else { + if (!_enqueue_data_and_set_ready(i, local_state, std::move(new_block))) { local_state._shared_state->sub_mem_usage(i, memory_usage); } } @@ -404,19 +397,13 @@ Status AdaptivePassthroughExchanger::get_block(RuntimeState* state, vectorized:: bool* eos, LocalExchangeSourceLocalState& local_state) { vectorized::Block next_block; - bool all_finished = _running_sink_operators == 0; - if (_data_queue[local_state._channel_id].try_dequeue(next_block)) { + if (_dequeue_data(local_state, next_block, eos)) { block->swap(next_block); if (_free_block_limit == 0 || _free_blocks.size_approx() < _free_block_limit * _num_sources) { _free_blocks.enqueue(std::move(next_block)); } local_state._shared_state->sub_mem_usage(local_state._channel_id, block->allocated_bytes()); - } else if (all_finished) { - *eos = true; - } else { - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); } return Status::OK(); } diff --git a/be/src/pipeline/pipeline_x/local_exchange/local_exchanger.h b/be/src/pipeline/pipeline_x/local_exchange/local_exchanger.h index ee0b5e286def73..fda86b5bb550b6 100644 --- a/be/src/pipeline/pipeline_x/local_exchange/local_exchanger.h +++ b/be/src/pipeline/pipeline_x/local_exchange/local_exchanger.h @@ -54,6 +54,8 @@ class ExchangerBase { virtual DependencySPtr get_local_state_dependency(int _channel_id) { return nullptr; } + virtual std::string data_queue_debug_string(int i) = 0; + protected: friend struct LocalExchangeSharedState; friend struct ShuffleBlockWrapper; @@ -114,9 +116,19 @@ class Exchanger : public ExchangerBase { : ExchangerBase(running_sink_operators, num_sources, num_partitions, free_block_limit) { } ~Exchanger() override = default; + 
std::string data_queue_debug_string(int i) override { + return fmt::format("Data Queue {}: [size approx = {}, eos = {}]", i, + _data_queue[i].data_queue.size_approx(), _data_queue[i].eos); + } protected: + bool _enqueue_data_and_set_ready(int channel_id, LocalExchangeSinkLocalState& local_state, + BlockType&& block); + bool _dequeue_data(LocalExchangeSourceLocalState& local_state, BlockType& block, bool* eos); std::vector> _data_queue; + +private: + std::mutex _m; }; class LocalExchangeSourceLocalState; diff --git a/be/src/pipeline/pipeline_x/operator.cpp b/be/src/pipeline/pipeline_x/operator.cpp index a8902cf2321170..6c2c2a6294d98e 100644 --- a/be/src/pipeline/pipeline_x/operator.cpp +++ b/be/src/pipeline/pipeline_x/operator.cpp @@ -220,7 +220,7 @@ Status OperatorXBase::do_projections(RuntimeState* state, vectorized::Block* ori vectorized::Block input_block = *origin_block; std::vector result_column_ids; - for (const auto& projections : _intermediate_projections) { + for (const auto& projections : local_state->_intermediate_projections) { result_column_ids.resize(projections.size()); for (int i = 0; i < projections.size(); i++) { RETURN_IF_ERROR(projections[i]->execute(&input_block, &result_column_ids[i])); diff --git a/be/src/pipeline/pipeline_x/pipeline_x_fragment_context.cpp b/be/src/pipeline/pipeline_x/pipeline_x_fragment_context.cpp index 90c5394debd76a..55a25718e0043d 100644 --- a/be/src/pipeline/pipeline_x/pipeline_x_fragment_context.cpp +++ b/be/src/pipeline/pipeline_x/pipeline_x_fragment_context.cpp @@ -178,6 +178,9 @@ Status PipelineXFragmentContext::prepare(const doris::TPipelineFragmentParams& r if (_prepared) { return Status::InternalError("Already prepared"); } + if (request.__isset.query_options && request.query_options.__isset.execution_timeout) { + _timeout = request.query_options.execution_timeout; + } _num_instances = request.local_params.size(); _total_instances = request.__isset.total_instances ? 
request.total_instances : _num_instances; _runtime_profile = std::make_unique("PipelineContext"); diff --git a/be/src/pipeline/pipeline_x/pipeline_x_task.cpp b/be/src/pipeline/pipeline_x/pipeline_x_task.cpp index b723fe02d7acab..f05b491d50bcd4 100644 --- a/be/src/pipeline/pipeline_x/pipeline_x_task.cpp +++ b/be/src/pipeline/pipeline_x/pipeline_x_task.cpp @@ -149,8 +149,6 @@ Status PipelineXTask::_extract_dependencies() { { auto* local_state = _state->get_sink_local_state(); write_dependencies = local_state->dependencies(); - DCHECK(std::all_of(write_dependencies.begin(), write_dependencies.end(), - [](auto* dep) { return dep->is_write_dependency(); })); auto* fin_dep = local_state->finishdependency(); if (fin_dep) { finish_dependencies.push_back(fin_dep); diff --git a/be/src/pipeline/pipeline_x/pipeline_x_task.h b/be/src/pipeline/pipeline_x/pipeline_x_task.h index ae89fe2cdde536..47746b76fb0194 100644 --- a/be/src/pipeline/pipeline_x/pipeline_x_task.h +++ b/be/src/pipeline/pipeline_x/pipeline_x_task.h @@ -139,6 +139,7 @@ class PipelineXTask : public PipelineTask { int task_id() const { return _index; }; void clear_blocking_state() { + _state->get_query_ctx()->get_execution_dependency()->set_always_ready(); // We use a lock to assure all dependencies are not deconstructed here. 
std::unique_lock lc(_dependency_lock); if (!_finished) { diff --git a/be/src/pipeline/task_queue.cpp b/be/src/pipeline/task_queue.cpp index 617cd7a78d110a..293769162f6aa3 100644 --- a/be/src/pipeline/task_queue.cpp +++ b/be/src/pipeline/task_queue.cpp @@ -130,37 +130,46 @@ Status PriorityTaskQueue::push(PipelineTask* task) { return Status::OK(); } -int PriorityTaskQueue::task_size() { - std::unique_lock lock(_work_size_mutex); - return _total_task_size; -} - MultiCoreTaskQueue::~MultiCoreTaskQueue() = default; -MultiCoreTaskQueue::MultiCoreTaskQueue(size_t core_size) : TaskQueue(core_size), _closed(false) { - _prio_task_queue_list.reset(new PriorityTaskQueue[core_size]); +MultiCoreTaskQueue::MultiCoreTaskQueue(int core_size) : TaskQueue(core_size), _closed(false) { + _prio_task_queue_list = + std::make_shared>>(core_size); + for (int i = 0; i < core_size; i++) { + (*_prio_task_queue_list)[i] = std::make_unique(); + } } void MultiCoreTaskQueue::close() { + if (_closed) { + return; + } _closed = true; for (int i = 0; i < _core_size; ++i) { - _prio_task_queue_list[i].close(); + (*_prio_task_queue_list)[i]->close(); } + std::atomic_store(&_prio_task_queue_list, + std::shared_ptr>>(nullptr)); } -PipelineTask* MultiCoreTaskQueue::take(size_t core_id) { +PipelineTask* MultiCoreTaskQueue::take(int core_id) { PipelineTask* task = nullptr; + auto prio_task_queue_list = + std::atomic_load_explicit(&_prio_task_queue_list, std::memory_order_relaxed); while (!_closed) { - task = _prio_task_queue_list[core_id].try_take(false); + DCHECK(prio_task_queue_list->size() > core_id) + << " list size: " << prio_task_queue_list->size() << " core_id: " << core_id + << " _core_size: " << _core_size << " _next_core: " << _next_core.load(); + task = (*prio_task_queue_list)[core_id]->try_take(false); if (task) { task->set_core_id(core_id); break; } - task = _steal_take(core_id); + task = _steal_take(core_id, *prio_task_queue_list); if (task) { break; } - task = 
_prio_task_queue_list[core_id].take(WAIT_CORE_TASK_TIMEOUT_MS /* timeout_ms */); + task = (*prio_task_queue_list)[core_id]->take(WAIT_CORE_TASK_TIMEOUT_MS /* timeout_ms */); if (task) { task->set_core_id(core_id); break; @@ -172,16 +181,17 @@ PipelineTask* MultiCoreTaskQueue::take(size_t core_id) { return task; } -PipelineTask* MultiCoreTaskQueue::_steal_take(size_t core_id) { +PipelineTask* MultiCoreTaskQueue::_steal_take( + int core_id, std::vector>& prio_task_queue_list) { DCHECK(core_id < _core_size); - size_t next_id = core_id; - for (size_t i = 1; i < _core_size; ++i) { + int next_id = core_id; + for (int i = 1; i < _core_size; ++i) { ++next_id; if (next_id == _core_size) { next_id = 0; } DCHECK(next_id < _core_size); - auto task = _prio_task_queue_list[next_id].try_take(true); + auto task = prio_task_queue_list[next_id]->try_take(true); if (task) { task->set_core_id(next_id); return task; @@ -198,10 +208,12 @@ Status MultiCoreTaskQueue::push_back(PipelineTask* task) { return push_back(task, core_id); } -Status MultiCoreTaskQueue::push_back(PipelineTask* task, size_t core_id) { +Status MultiCoreTaskQueue::push_back(PipelineTask* task, int core_id) { DCHECK(core_id < _core_size); task->put_in_runnable_queue(); - return _prio_task_queue_list[core_id].push(task); + auto prio_task_queue_list = + std::atomic_load_explicit(&_prio_task_queue_list, std::memory_order_relaxed); + return (*prio_task_queue_list)[core_id]->push(task); } } // namespace pipeline diff --git a/be/src/pipeline/task_queue.h b/be/src/pipeline/task_queue.h index 02994511019f7d..3ac9de460250d0 100644 --- a/be/src/pipeline/task_queue.h +++ b/be/src/pipeline/task_queue.h @@ -37,25 +37,25 @@ namespace pipeline { class TaskQueue { public: - TaskQueue(size_t core_size) : _core_size(core_size) {} + TaskQueue(int core_size) : _core_size(core_size) {} virtual ~TaskQueue(); virtual void close() = 0; // Get the task by core id. // TODO: To think the logic is useful? 
- virtual PipelineTask* take(size_t core_id) = 0; + virtual PipelineTask* take(int core_id) = 0; // push from scheduler virtual Status push_back(PipelineTask* task) = 0; // push from worker - virtual Status push_back(PipelineTask* task, size_t core_id) = 0; + virtual Status push_back(PipelineTask* task, int core_id) = 0; virtual void update_statistics(PipelineTask* task, int64_t time_spent) {} int cores() const { return _core_size; } protected: - size_t _core_size; + int _core_size; static constexpr auto WAIT_CORE_TASK_TIMEOUT_MS = 100; }; @@ -105,8 +105,6 @@ class PriorityTaskQueue { _sub_queues[level].inc_runtime(runtime); } - int task_size(); - private: PipelineTask* _try_take_unprotected(bool is_steal); static constexpr auto LEVEL_QUEUE_TIME_FACTOR = 2; @@ -130,32 +128,34 @@ class PriorityTaskQueue { // Need consider NUMA architecture class MultiCoreTaskQueue : public TaskQueue { public: - explicit MultiCoreTaskQueue(size_t core_size); + explicit MultiCoreTaskQueue(int core_size); ~MultiCoreTaskQueue() override; void close() override; // Get the task by core id. - // TODO: To think the logic is useful? 
- PipelineTask* take(size_t core_id) override; + PipelineTask* take(int core_id) override; // TODO combine these methods to `push_back(task, core_id = -1)` Status push_back(PipelineTask* task) override; - Status push_back(PipelineTask* task, size_t core_id) override; + Status push_back(PipelineTask* task, int core_id) override; void update_statistics(PipelineTask* task, int64_t time_spent) override { task->inc_runtime_ns(time_spent); - _prio_task_queue_list[task->get_core_id()].inc_sub_queue_runtime(task->get_queue_level(), - time_spent); + auto prio_task_queue_list = + std::atomic_load_explicit(&_prio_task_queue_list, std::memory_order_relaxed); + (*prio_task_queue_list)[task->get_core_id()]->inc_sub_queue_runtime(task->get_queue_level(), + time_spent); } private: - PipelineTask* _steal_take(size_t core_id); + PipelineTask* _steal_take( + int core_id, std::vector>& prio_task_queue_list); - std::unique_ptr _prio_task_queue_list; - std::atomic _next_core = 0; + std::shared_ptr>> _prio_task_queue_list; + std::atomic _next_core = 0; std::atomic _closed; }; diff --git a/be/src/pipeline/task_scheduler.cpp b/be/src/pipeline/task_scheduler.cpp index f2c86168180910..de697469575bc9 100644 --- a/be/src/pipeline/task_scheduler.cpp +++ b/be/src/pipeline/task_scheduler.cpp @@ -205,13 +205,13 @@ TaskScheduler::~TaskScheduler() { Status TaskScheduler::start() { int cores = _task_queue->cores(); - // Must be mutil number of cpu cores RETURN_IF_ERROR(ThreadPoolBuilder(_name) .set_min_threads(cores) .set_max_threads(cores) .set_max_queue_size(0) .set_cgroup_cpu_ctl(_cgroup_cpu_ctl) .build(&_fix_thread_pool)); + LOG_INFO("TaskScheduler set cores").tag("size", cores); _markers.reserve(cores); for (size_t i = 0; i < cores; ++i) { _markers.push_back(std::make_unique>(true)); diff --git a/be/src/runtime/buffer_control_block.cpp b/be/src/runtime/buffer_control_block.cpp index e16cad020bf09d..a10ce354325b2c 100644 --- a/be/src/runtime/buffer_control_block.cpp +++ 
b/be/src/runtime/buffer_control_block.cpp @@ -24,6 +24,7 @@ #include // IWYU pragma: no_include #include // IWYU pragma: keep +#include #include #include #include @@ -80,6 +81,13 @@ void GetResultBatchCtx::on_data(const std::unique_ptr& t_resul result->set_packet_seq(packet_seq); result->set_eos(eos); } + + /// The size limit of proto buffer message is 2G + if (result->ByteSizeLong() > std::numeric_limits::max()) { + st = Status::InternalError("Message size exceeds 2GB: {}", result->ByteSizeLong()); + result->clear_row_batch(); + result->set_empty_batch(true); + } st.to_protobuf(result->mutable_status()); { done->Run(); } delete this; diff --git a/be/src/runtime/group_commit_mgr.cpp b/be/src/runtime/group_commit_mgr.cpp index 7bb30b1cc8b1d0..d97b268fc27cdd 100644 --- a/be/src/runtime/group_commit_mgr.cpp +++ b/be/src/runtime/group_commit_mgr.cpp @@ -397,13 +397,18 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ Status result_status; DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.err_status", { status = Status::InternalError(""); }); + DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.load_error", + { status = Status::InternalError("load_error"); }); if (status.ok()) { + DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.commit_error", + { status = Status::InternalError(""); }); // commit txn TLoadTxnCommitRequest request; request.__set_auth_code(0); // this is a fake, fe not check it now request.__set_db_id(db_id); request.__set_table_id(table_id); request.__set_txnId(txn_id); + request.__set_thrift_rpc_timeout_ms(config::txn_commit_rpc_timeout_ms); request.__set_groupCommit(true); request.__set_receiveBytes(state->num_bytes_load_total()); if (_exec_env->master_info()->__isset.backend_id) { @@ -421,8 +426,10 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ [&request, &result](FrontendServiceConnection& client) { client->loadTxnCommit(result, request); }, - 10000L); + 
config::txn_commit_rpc_timeout_ms); result_status = Status::create(result.status); + DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.commit_success_and_rpc_error", + { result_status = Status::InternalError("commit_success_and_rpc_error"); }); } else { // abort txn TLoadTxnRollbackRequest request; @@ -436,8 +443,7 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ master_addr.hostname, master_addr.port, [&request, &result](FrontendServiceConnection& client) { client->loadTxnRollback(result, request); - }, - 10000L); + }); result_status = Status::create(result.status); DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.err_status", { std ::string msg = "abort txn"; @@ -503,10 +509,12 @@ Status GroupCommitTable::_finish_group_commit_load(int64_t db_id, int64_t table_ } LOG(INFO) << ss.str(); DBUG_EXECUTE_IF("LoadBlockQueue._finish_group_commit_load.get_wal_back_pressure_msg", { - std ::string msg = _exec_env->wal_mgr()->get_wal_dirs_info_string(); - LOG(INFO) << "debug promise set: " << msg; - ExecEnv::GetInstance()->group_commit_mgr()->debug_promise.set_value( - Status ::InternalError(msg)); + if (dp->param("table_id", -1) == table_id) { + std ::string msg = _exec_env->wal_mgr()->get_wal_dirs_info_string(); + LOG(INFO) << "table_id" << std::to_string(table_id) << " set debug promise: " << msg; + ExecEnv::GetInstance()->group_commit_mgr()->debug_promise.set_value( + Status ::InternalError(msg)); + } };); return st; } diff --git a/be/src/runtime/snapshot_loader.cpp b/be/src/runtime/snapshot_loader.cpp index a5061c4decfb03..1764e3d4322e14 100644 --- a/be/src/runtime/snapshot_loader.cpp +++ b/be/src/runtime/snapshot_loader.cpp @@ -464,7 +464,6 @@ Status SnapshotLoader::remote_http_download( } // Step 3: Validate remote tablet snapshot paths && remote files map - // TODO(Drogon): Add md5sum check // key is remote snapshot paths, value is filelist // get all these use http download action // 
http://172.16.0.14:6781/api/_tablet/_download?token=e804dd27-86da-4072-af58-70724075d2a4&file=/home/ubuntu/doris_master/output/be/storage/snapshot/20230410102306.9.180//2774718/217609978/2774718.hdr diff --git a/be/src/runtime/tablets_channel.cpp b/be/src/runtime/tablets_channel.cpp index adaced0b76ebfd..06bd73a3b200ce 100644 --- a/be/src/runtime/tablets_channel.cpp +++ b/be/src/runtime/tablets_channel.cpp @@ -133,8 +133,9 @@ Status BaseTabletsChannel::open(const PTabletWriterOpenRequest& request) { if (_state == kOpened || _state == kFinished) { return Status::OK(); } - LOG(INFO) << fmt::format("open tablets channel of index {}, tablets num: {} timeout(s): {}", - _index_id, request.tablets().size(), request.load_channel_timeout_s()); + LOG(INFO) << fmt::format("open tablets channel {}, tablets num: {} timeout(s): {}", + _key.to_string(), request.tablets().size(), + request.load_channel_timeout_s()); _txn_id = request.txn_id(); _index_id = request.index_id(); _schema = std::make_shared(); @@ -215,6 +216,7 @@ Status BaseTabletsChannel::incremental_open(const PTabletWriterOpenRequest& para ss << "LocalTabletsChannel txn_id: " << _txn_id << " load_id: " << print_id(params.id()) << " incremental open delta writer: "; + // every change will hold _lock. this find in under _lock too. so no need _tablet_writers_lock again. for (const auto& tablet : params.tablets()) { if (_tablet_writers.find(tablet.tablet_id()) != _tablet_writers.end()) { continue; @@ -238,6 +240,7 @@ Status BaseTabletsChannel::incremental_open(const PTabletWriterOpenRequest& para _profile, _load_id); ss << "[" << tablet.tablet_id() << "]"; { + // here we modify _tablet_writers. so need lock. std::lock_guard l(_tablet_writers_lock); _tablet_writers.emplace(tablet.tablet_id(), std::move(delta_writer)); } @@ -479,6 +482,7 @@ Status BaseTabletsChannel::_open_all_writers(const PTabletWriterOpenRequest& req #endif int tablet_cnt = 0; + // under _lock. no need _tablet_writers_lock again. 
for (const auto& tablet : request.tablets()) { if (_tablet_writers.find(tablet.tablet_id()) != _tablet_writers.end()) { continue; @@ -578,6 +582,11 @@ Status BaseTabletsChannel::add_batch(const PTabletWriterAddBlockRequest& request std::function write_func) { google::protobuf::RepeatedPtrField* tablet_errors = response->mutable_tablet_errors(); + + // add_batch may concurrency with inc_open but not under _lock. + // so need to protect it with _tablet_writers_lock. + std::lock_guard l(_tablet_writers_lock); + auto tablet_writer_it = _tablet_writers.find(tablet_id); if (tablet_writer_it == _tablet_writers.end()) { return Status::InternalError("unknown tablet to append data, tablet={}", tablet_id); diff --git a/be/src/runtime/tablets_channel.h b/be/src/runtime/tablets_channel.h index 54438be7690db8..8ed4c7ab1aa3c8 100644 --- a/be/src/runtime/tablets_channel.h +++ b/be/src/runtime/tablets_channel.h @@ -136,11 +136,8 @@ class BaseTabletsChannel { // id of this load channel TabletsChannelKey _key; - // make execute sequence + // protect _state change. open and close. when add_batch finished, lock to change _next_seqs also std::mutex _lock; - - SpinLock _tablet_writers_lock; - enum State { kInitialized, kOpened, @@ -166,8 +163,10 @@ class BaseTabletsChannel { // currently it's OK. Status _close_status; - // tablet_id -> TabletChannel + // tablet_id -> TabletChannel. it will only be changed in open() or inc_open() std::unordered_map> _tablet_writers; + // protect _tablet_writers + SpinLock _tablet_writers_lock; // broken tablet ids. // If a tablet write fails, it's id will be added to this set. // So that following batch will not handle this tablet anymore. 
diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index f9d873fc5bb356..15462a0915f066 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -51,6 +51,7 @@ #include "http/action/report_action.h" #include "http/action/reset_rpc_channel_action.h" #include "http/action/restore_tablet_action.h" +#include "http/action/show_nested_index_file_action.h" #include "http/action/snapshot_action.h" #include "http/action/stream_load.h" #include "http/action/stream_load_2pc.h" @@ -331,6 +332,11 @@ Status HttpService::start() { _pool.add(new CalcFileCrcAction(_env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); _ev_http_server->register_handler(HttpMethod::GET, "/api/calc_crc", calc_crc_action); + ShowNestedIndexFileAction* show_nested_index_file_action = _pool.add( + new ShowNestedIndexFileAction(_env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); + _ev_http_server->register_handler(HttpMethod::GET, "/api/show_nested_index_file", + show_nested_index_file_action); + ReportAction* report_task_action = _pool.add( new ReportAction(_env, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN, "REPORT_TASK")); _ev_http_server->register_handler(HttpMethod::GET, "/api/report/task", report_task_action); diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index c74e22bdbcd330..74700dff17fba3 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -43,6 +43,8 @@ class AggregateFunctionBitmapCount; template class AggregateFunctionBitmapOp; struct AggregateFunctionBitmapUnionOp; +class IAggregateFunction; +using AggregateFunctionPtr = std::shared_ptr; using DataTypePtr = std::shared_ptr; using DataTypes = std::vector; @@ -178,11 +180,6 @@ class IAggregateFunction { const size_t offset, IColumn& to, const size_t num_rows) const = 0; - /** Returns true for aggregate functions of type -State. 
- * They are executed as other aggregate functions, but not finalized (return an aggregation state that can be combined with another). - */ - virtual bool is_state() const { return false; } - /** Contains a loop with calls to "add" function. You can collect arguments into array "places" * and do a single call to "add_batch" for devirtualization and inlining. */ @@ -223,6 +220,8 @@ class IAggregateFunction { virtual void set_version(const int version_) { version = version_; } + virtual AggregateFunctionPtr transmit_to_stable() { return nullptr; } + protected: DataTypes argument_types; int version {}; @@ -519,8 +518,6 @@ class IAggregateFunctionDataHelper : public IAggregateFunctionHelper { } }; -using AggregateFunctionPtr = std::shared_ptr; - class AggregateFunctionGuard { public: using AggregateData = std::remove_pointer_t; diff --git a/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp b/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp index 5b2269a27d9a0b..4773a620e0ab72 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_distinct.cpp @@ -29,6 +29,16 @@ namespace doris::vectorized { +template +struct Reducer { + template + using Output = AggregateFunctionDistinctSingleNumericData; + using AggregateFunctionDistinctNormal = AggregateFunctionDistinct; +}; + +template +using AggregateFunctionDistinctNumeric = Reducer::AggregateFunctionDistinctNormal; + class AggregateFunctionCombinatorDistinct final : public IAggregateFunctionCombinator { public: String get_name() const override { return "Distinct"; } @@ -51,22 +61,15 @@ class AggregateFunctionCombinatorDistinct final : public IAggregateFunctionCombi if (arguments.size() == 1) { AggregateFunctionPtr res( - creator_with_numeric_type::create( + creator_with_numeric_type::create( arguments, result_is_nullable, nested_function)); if (res) { return res; } - if 
(arguments[0]->is_value_unambiguously_represented_in_contiguous_memory_region()) { - res = creator_without_type::create>>( - arguments, result_is_nullable, nested_function); - } else { - res = creator_without_type::create>>( - arguments, result_is_nullable, nested_function); - } + res = creator_without_type::create< + AggregateFunctionDistinct>( + arguments, result_is_nullable, nested_function); return res; } return creator_without_type::create< diff --git a/be/src/vec/aggregate_functions/aggregate_function_distinct.h b/be/src/vec/aggregate_functions/aggregate_function_distinct.h index c0c7a5b66dd58f..4f42e8509f2acc 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_distinct.h +++ b/be/src/vec/aggregate_functions/aggregate_function_distinct.h @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include "vec/aggregate_functions/aggregate_function.h" @@ -54,105 +56,170 @@ struct DefaultHash; namespace doris::vectorized { -template +template struct AggregateFunctionDistinctSingleNumericData { /// When creating, the hash table must be small. 
- using Set = HashSetWithStackMemory, 4>; - using Self = AggregateFunctionDistinctSingleNumericData; - Set set; + using Container = std::conditional_t, + HashSetWithStackMemory, 4>>; + using Self = AggregateFunctionDistinctSingleNumericData; + Container data; void add(const IColumn** columns, size_t /* columns_num */, size_t row_num, Arena*) { const auto& vec = assert_cast&>(*columns[0]).get_data(); - set.insert(vec[row_num]); + if constexpr (stable) { + data.emplace(vec[row_num], data.size()); + } else { + data.insert(vec[row_num]); + } } - void merge(const Self& rhs, Arena*) { set.merge(rhs.set); } + void merge(const Self& rhs, Arena*) { + DCHECK(!stable); + if constexpr (!stable) { + data.merge(rhs.data); + } + } - void serialize(BufferWritable& buf) const { set.write(buf); } + void serialize(BufferWritable& buf) const { + DCHECK(!stable); + if constexpr (!stable) { + data.write(buf); + } + } - void deserialize(BufferReadable& buf, Arena*) { set.read(buf); } + void deserialize(BufferReadable& buf, Arena*) { + DCHECK(!stable); + if constexpr (!stable) { + data.read(buf); + } + } MutableColumns get_arguments(const DataTypes& argument_types) const { MutableColumns argument_columns; argument_columns.emplace_back(argument_types[0]->create_column()); - for (const auto& elem : set) { - argument_columns[0]->insert(elem.get_value()); + + if constexpr (stable) { + argument_columns[0]->resize(data.size()); + auto ptr = (T*)const_cast(argument_columns[0]->get_raw_data().data); + for (auto it : data) { + ptr[it.second] = it.first; + } + } else { + for (const auto& elem : data) { + argument_columns[0]->insert(elem.get_value()); + } } return argument_columns; } }; +template struct AggregateFunctionDistinctGenericData { /// When creating, the hash table must be small. 
- using Set = HashSetWithStackMemory; + using Container = std::conditional_t, + HashSetWithStackMemory>; using Self = AggregateFunctionDistinctGenericData; - Set set; + Container data; void merge(const Self& rhs, Arena* arena) { - Set::LookupResult it; - bool inserted; - for (const auto& elem : rhs.set) { - StringRef key = elem.get_value(); - key.data = arena->insert(key.data, key.size); - set.emplace(key, it, inserted); + DCHECK(!stable); + if constexpr (!stable) { + typename Container::LookupResult it; + bool inserted; + for (const auto& elem : rhs.data) { + StringRef key = elem.get_value(); + key.data = arena->insert(key.data, key.size); + data.emplace(key, it, inserted); + } } } void serialize(BufferWritable& buf) const { - write_var_uint(set.size(), buf); - for (const auto& elem : set) { - write_string_binary(elem.get_value(), buf); + DCHECK(!stable); + if constexpr (!stable) { + write_var_uint(data.size(), buf); + for (const auto& elem : data) { + write_string_binary(elem.get_value(), buf); + } } } void deserialize(BufferReadable& buf, Arena* arena) { - UInt64 size; - read_var_uint(size, buf); - - StringRef ref; - for (size_t i = 0; i < size; ++i) { - read_string_binary(ref, buf); - set.insert(ref); + DCHECK(!stable); + if constexpr (!stable) { + UInt64 size; + read_var_uint(size, buf); + + StringRef ref; + for (size_t i = 0; i < size; ++i) { + read_string_binary(ref, buf); + data.insert(ref); + } } } }; -template -struct AggregateFunctionDistinctSingleGenericData : public AggregateFunctionDistinctGenericData { +template +struct AggregateFunctionDistinctSingleGenericData + : public AggregateFunctionDistinctGenericData { + using Base = AggregateFunctionDistinctGenericData; + using Base::data; void add(const IColumn** columns, size_t /* columns_num */, size_t row_num, Arena* arena) { - Set::LookupResult it; - bool inserted; auto key = columns[0]->get_data_at(row_num); key.data = arena->insert(key.data, key.size); - set.emplace(key, it, inserted); + + if 
constexpr (stable) { + data.emplace(key, data.size()); + } else { + typename Base::Container::LookupResult it; + bool inserted; + data.emplace(key, it, inserted); + } } MutableColumns get_arguments(const DataTypes& argument_types) const { MutableColumns argument_columns; argument_columns.emplace_back(argument_types[0]->create_column()); - for (const auto& elem : set) { - argument_columns[0]->insert_data(elem.get_value().data, elem.get_value().size); + if constexpr (stable) { + std::vector tmp(data.size()); + for (auto it : data) { + tmp[it.second] = it.first; + } + for (int i = 0; i < data.size(); i++) { + argument_columns[0]->insert_data(tmp[i].data, tmp[i].size); + } + } else { + for (const auto& elem : data) { + argument_columns[0]->insert_data(elem.get_value().data, elem.get_value().size); + } } return argument_columns; } }; -struct AggregateFunctionDistinctMultipleGenericData : public AggregateFunctionDistinctGenericData { +template +struct AggregateFunctionDistinctMultipleGenericData + : public AggregateFunctionDistinctGenericData { + using Base = AggregateFunctionDistinctGenericData; + using Base::data; void add(const IColumn** columns, size_t columns_num, size_t row_num, Arena* arena) { const char* begin = nullptr; - StringRef value(begin, 0); + StringRef key(begin, 0); for (size_t i = 0; i < columns_num; ++i) { auto cur_ref = columns[i]->serialize_value_into_arena(row_num, *arena, begin); - value.data = cur_ref.data - value.size; - value.size += cur_ref.size; + key.data = cur_ref.data - key.size; + key.size += cur_ref.size; } - Set::LookupResult it; - bool inserted; - value.data = arena->insert(value.data, value.size); - set.emplace(value, it, inserted); + if constexpr (stable) { + data.emplace(key, data.size()); + } else { + typename Base::Container::LookupResult it; + bool inserted; + data.emplace(key, it, inserted); + } } MutableColumns get_arguments(const DataTypes& argument_types) const { @@ -161,10 +228,23 @@ struct 
AggregateFunctionDistinctMultipleGenericData : public AggregateFunctionDi argument_columns[i] = argument_types[i]->create_column(); } - for (const auto& elem : set) { - const char* begin = elem.get_value().data; - for (auto& column : argument_columns) { - begin = column->deserialize_and_insert_from_arena(begin); + if constexpr (stable) { + std::vector tmp(data.size()); + for (auto it : data) { + tmp[it.second] = it.first; + } + for (int i = 0; i < data.size(); i++) { + const char* begin = tmp[i].data; + for (auto& column : argument_columns) { + begin = column->deserialize_and_insert_from_arena(begin); + } + } + } else { + for (const auto& elem : data) { + const char* begin = elem.get_value().data; + for (auto& column : argument_columns) { + begin = column->deserialize_and_insert_from_arena(begin); + } } } @@ -175,9 +255,10 @@ struct AggregateFunctionDistinctMultipleGenericData : public AggregateFunctionDi /** Adaptor for aggregate functions. * Adding -Distinct suffix to aggregate function **/ -template +template