diff --git a/.asf.yaml b/.asf.yaml index 3892aca2eddb77..e3d516b35c19a5 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -56,15 +56,13 @@ github: - cloud_p0 (Doris Cloud Regression) - FE UT (Doris FE UT) - BE UT (Doris BE UT) - - Build Broker - - ShellCheck + - Build Broker - Build Third Party Libraries (Linux) - Build Third Party Libraries (macOS) - Build Third Party Libraries (macOS-arm64) - COMPILE (DORIS_COMPILE) - Need_2_Approval - Cloud UT (Doris Cloud UT) - - performance (Doris Performance) required_pull_request_reviews: dismiss_stale_reviews: true @@ -80,7 +78,6 @@ github: - Clang Formatter - CheckStyle - Build Broker - - ShellCheck - Build Third Party Libraries (Linux) - Build Third Party Libraries (macOS) - FE UT (Doris FE UT) @@ -103,7 +100,6 @@ github: - Clang Formatter - CheckStyle - Build Broker - - ShellCheck - Build Third Party Libraries (Linux) - Build Third Party Libraries (macOS) - COMPILE (DORIS_COMPILE) @@ -128,7 +124,6 @@ github: - FE UT (Doris FE UT) - BE UT (Doris BE UT) - Build Broker - - ShellCheck - Build Third Party Libraries (Linux) - Build Third Party Libraries (macOS) - COMPILE (DORIS_COMPILE) diff --git a/.github/workflows/auto-cherry-pick.yml b/.github/workflows/auto-cherry-pick.yml index 2581de3f31cc4e..df1a44153ac9dd 100644 --- a/.github/workflows/auto-cherry-pick.yml +++ b/.github/workflows/auto-cherry-pick.yml @@ -21,6 +21,7 @@ on: pull_request_target: types: - closed + - labeled branches: - master permissions: @@ -30,7 +31,7 @@ permissions: jobs: auto_cherry_pick: runs-on: ubuntu-latest - if: ${{ (contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') || contains(github.event.pull_request.labels.*.name, 'dev/2.1.x')) && github.event.pull_request.merged == true }} + if: ${{(contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') || contains(github.event.pull_request.labels.*.name, 'dev/2.1.x') ||github.event.label.name == 'dev/3.0.x' || github.event.label.name == 'dev/2.1.x') && github.event.pull_request.merged == true 
}} steps: - name: Checkout repository uses: actions/checkout@v3 @@ -54,18 +55,18 @@ jobs: echo "SHA matches: $calculated_sha" fi - name: Auto cherry-pick to branch-3.0 - if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/3.0.x') }} + if: ${{ ((github.event.action == 'labeled' && github.event.label.name == 'dev/3.0.x'))|| ((github.event_name == 'pull_request_target' && github.event.action == 'closed') && contains(github.event.pull_request.labels.*.name, 'dev/3.0.x')) }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO_NAME: ${{ github.repository }} - CONFLICT_LABEL: cherry-pick-conflict-in-3.0 + CONFLICT_LABEL: dev/3.0.x-conflict run: | python tools/auto-pick-script.py ${{ github.event.pull_request.number }} branch-3.0 - name: Auto cherry-pick to branch-2.1 - if: ${{ contains(github.event.pull_request.labels.*.name, 'dev/2.1.x') }} + if: ${{ ((github.event.action == 'labeled' && github.event.label.name == 'dev/2.1.x'))|| ((github.event_name == 'pull_request_target' && github.event.action == 'closed') && contains(github.event.pull_request.labels.*.name, 'dev/2.1.x')) }} env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} REPO_NAME: ${{ github.repository }} - CONFLICT_LABEL: cherry-pick-conflict-in-2.1.x + CONFLICT_LABEL: dev/2.1.x-conflict run: | python tools/auto-pick-script.py ${{ github.event.pull_request.number }} branch-2.1 diff --git a/README.md b/README.md index c999651ddee68d..94f9f4b777f8f5 100644 --- a/README.md +++ b/README.md @@ -177,15 +177,10 @@ In terms of optimizers, Doris uses a combination of CBO and RBO. RBO supports co **Apache Doris has graduated from Apache incubator successfully and become a Top-Level Project in June 2022**. -Currently, the Apache Doris community has gathered more than 400 contributors from nearly 200 companies in different industries, and the number of active contributors is close to 100 per month. 
- - -[![Monthly Active Contributors](https://contributor-overtime-api.apiseven.com/contributors-svg?chart=contributorMonthlyActivity&repo=apache/doris)](https://www.apiseven.com/en/contributor-graph?chart=contributorMonthlyActivity&repo=apache/doris) - -[![Contributor over time](https://contributor-overtime-api.apiseven.com/contributors-svg?chart=contributorOverTime&repo=apache/doris)](https://www.apiseven.com/en/contributor-graph?chart=contributorOverTime&repo=apache/doris) - We deeply appreciate ๐Ÿ”—[community contributors](https://github.com/apache/doris/graphs/contributors) for their contribution to Apache Doris. +[![contrib graph](https://contrib.rocks/image?repo=apache/doris)](https://github.com/apache/doris/graphs/contributors) + ## ๐Ÿ‘จโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘ฆ Users Apache Doris now has a wide user base in China and around the world, and as of today, **Apache Doris is used in production environments in thousands of companies worldwide.** More than 80% of the top 50 Internet companies in China in terms of market capitalization or valuation have been using Apache Doris for a long time, including Baidu, Meituan, Xiaomi, Jingdong, Bytedance, Tencent, NetEase, Kwai, Sina, 360, Mihoyo, and Ke Holdings. It is also widely used in some traditional industries such as finance, energy, manufacturing, and telecommunications. 
diff --git a/aazcp.tar.gz b/aazcp.tar.gz new file mode 100644 index 00000000000000..681acf72cde859 Binary files /dev/null and b/aazcp.tar.gz differ diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt index 1d79048f96511c..d476af8e2110df 100644 --- a/be/CMakeLists.txt +++ b/be/CMakeLists.txt @@ -130,6 +130,8 @@ message(STATUS "THIRDPARTY_DIR is ${THIRDPARTY_DIR}") option(MAKE_TEST "ON for make unit test or OFF for not" OFF) message(STATUS "make test: ${MAKE_TEST}") +option(BUILD_BENCHMARK "ON for make google benchmark or OFF for not" OFF) +message(STATUS "make benchmark: ${BUILD_BENCHMARK}") option(WITH_MYSQL "Support access MySQL" ON) @@ -568,7 +570,7 @@ if (OS_MACOSX) ) endif() -if (MAKE_TEST) +if (BUILD_BENCHMARK) set(COMMON_THIRDPARTY ${COMMON_THIRDPARTY} benchmark @@ -708,6 +710,11 @@ if (MAKE_TEST) endif() endif () +# use this to avoid some runtime tracker. reuse BE_TEST symbol, no need another. +if (BUILD_BENCHMARK) + add_definitions(-DBE_TEST) +endif() + get_directory_property(COMPILER_FLAGS COMPILE_OPTIONS) get_directory_property(COMPILER_DEFINES COMPILE_DEFINITIONS) message(STATUS "Compiler: ${CMAKE_CXX_COMPILER_ID}-${CMAKE_CXX_COMPILER_VERSION}") @@ -754,7 +761,7 @@ add_subdirectory(${SRC_DIR}/http) add_subdirectory(${SRC_DIR}/io) add_subdirectory(${SRC_DIR}/olap) add_subdirectory(${SRC_DIR}/runtime) -add_subdirectory(${SRC_DIR}/service) +add_subdirectory(${SRC_DIR}/service) # this include doris_be add_subdirectory(${SRC_DIR}/udf) add_subdirectory(${SRC_DIR}/cloud) @@ -772,36 +779,44 @@ add_subdirectory(${SRC_DIR}/util) add_subdirectory(${SRC_DIR}/vec) add_subdirectory(${SRC_DIR}/pipeline) +# this include doris_be_test if (MAKE_TEST) add_subdirectory(${TEST_DIR}) endif () add_subdirectory(${COMMON_SRC_DIR}/cpp ${BUILD_DIR}/src/common_cpp) -# Install be -install(DIRECTORY DESTINATION ${OUTPUT_DIR}) -install(DIRECTORY DESTINATION ${OUTPUT_DIR}/bin) -install(DIRECTORY DESTINATION ${OUTPUT_DIR}/conf) - -install(FILES - ${BASE_DIR}/../bin/start_be.sh - 
${BASE_DIR}/../bin/stop_be.sh - ${BASE_DIR}/../tools/jeprof - PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE - GROUP_READ GROUP_WRITE GROUP_EXECUTE - WORLD_READ WORLD_EXECUTE - DESTINATION ${OUTPUT_DIR}/bin) - -install(FILES - ${BASE_DIR}/../conf/be.conf - ${BASE_DIR}/../conf/odbcinst.ini - ${BASE_DIR}/../conf/asan_suppr.conf - ${BASE_DIR}/../conf/lsan_suppr.conf - DESTINATION ${OUTPUT_DIR}/conf) +if(NOT BUILD_BENCHMARK) + # Install be + install(DIRECTORY DESTINATION ${OUTPUT_DIR}) + install(DIRECTORY DESTINATION ${OUTPUT_DIR}/bin) + install(DIRECTORY DESTINATION ${OUTPUT_DIR}/conf) + + install(FILES + ${BASE_DIR}/../bin/start_be.sh + ${BASE_DIR}/../bin/stop_be.sh + ${BASE_DIR}/../tools/jeprof + PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE + GROUP_READ GROUP_WRITE GROUP_EXECUTE + WORLD_READ WORLD_EXECUTE + DESTINATION ${OUTPUT_DIR}/bin) + + install(FILES + ${BASE_DIR}/../conf/be.conf + ${BASE_DIR}/../conf/odbcinst.ini + ${BASE_DIR}/../conf/asan_suppr.conf + ${BASE_DIR}/../conf/lsan_suppr.conf + DESTINATION ${OUTPUT_DIR}/conf) +endif() get_property(dirs DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) foreach(dir ${dirs}) message(STATUS "dir='${dir}'") endforeach() - +if (BUILD_BENCHMARK) + add_executable(benchmark_test ${BASE_DIR}/benchmark/benchmark_main.cpp) + target_link_libraries(benchmark_test ${DORIS_LINK_LIBS}) + message(STATUS "Add benchmark to build") + install(TARGETS benchmark_test DESTINATION ${OUTPUT_DIR}/lib) +endif() \ No newline at end of file diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp new file mode 100644 index 00000000000000..cad6463e981852 --- /dev/null +++ b/be/benchmark/benchmark_main.cpp @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include + +#include + +#include "vec/columns/column_string.h" +#include "vec/core/block.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_string.h" + +namespace doris::vectorized { // change if need + +static void Example1(benchmark::State& state) { + // init. dont time it. + state.PauseTiming(); + Block block; + DataTypePtr str_type = std::make_shared(); + std::vector vals {100, "content"}; + state.ResumeTiming(); + + // do test + for (auto _ : state) { + auto str_col = ColumnString::create(); + for (auto& v : vals) { + str_col->insert_data(v.data(), v.size()); + } + block.insert({std::move(str_col), str_type, "col"}); + benchmark::DoNotOptimize(block); // mark the watched target + } +} +// could BENCHMARK many functions to compare them together. +BENCHMARK(Example1); + +} // namespace doris::vectorized + +BENCHMARK_MAIN(); diff --git a/be/src/agent/cgroup_cpu_ctl.cpp b/be/src/agent/cgroup_cpu_ctl.cpp index e68535a708c49b..76b72f2c9d00ae 100644 --- a/be/src/agent/cgroup_cpu_ctl.cpp +++ b/be/src/agent/cgroup_cpu_ctl.cpp @@ -158,11 +158,11 @@ uint64_t CgroupCpuCtl::cpu_soft_limit_default_value() { return _is_enable_cgroup_v2_in_env ? 
100 : 1024; } -std::unique_ptr CgroupCpuCtl::create_cgroup_cpu_ctl(uint64_t wg_id) { +std::shared_ptr CgroupCpuCtl::create_cgroup_cpu_ctl(uint64_t wg_id) { if (_is_enable_cgroup_v2_in_env) { - return std::make_unique(wg_id); + return std::make_shared(wg_id); } else if (_is_enable_cgroup_v1_in_env) { - return std::make_unique(wg_id); + return std::make_shared(wg_id); } return nullptr; } diff --git a/be/src/agent/cgroup_cpu_ctl.h b/be/src/agent/cgroup_cpu_ctl.h index 84e191159f15f1..b23f1f4dd9cadb 100644 --- a/be/src/agent/cgroup_cpu_ctl.h +++ b/be/src/agent/cgroup_cpu_ctl.h @@ -52,7 +52,7 @@ class CgroupCpuCtl { static Status delete_unused_cgroup_path(std::set& used_wg_ids); - static std::unique_ptr create_cgroup_cpu_ctl(uint64_t wg_id); + static std::shared_ptr create_cgroup_cpu_ctl(uint64_t wg_id); static bool is_a_valid_cgroup_path(std::string cg_path); diff --git a/be/src/agent/task_worker_pool.cpp b/be/src/agent/task_worker_pool.cpp index c0f16d304a2b72..a8ab93de455c3b 100644 --- a/be/src/agent/task_worker_pool.cpp +++ b/be/src/agent/task_worker_pool.cpp @@ -1630,11 +1630,13 @@ void drop_tablet_callback(StorageEngine& engine, const TAgentTaskRequest& req) { dropped_tablet->tablet_uid()); LOG_INFO("successfully drop tablet") .tag("signature", req.signature) - .tag("tablet_id", drop_tablet_req.tablet_id); + .tag("tablet_id", drop_tablet_req.tablet_id) + .tag("replica_id", drop_tablet_req.replica_id); } else { LOG_WARNING("failed to drop tablet") .tag("signature", req.signature) .tag("tablet_id", drop_tablet_req.tablet_id) + .tag("replica_id", drop_tablet_req.replica_id) .error(status); } diff --git a/be/src/agent/topic_subscriber.cpp b/be/src/agent/topic_subscriber.cpp index f62bdaef0991c9..b470e1534e1c6f 100644 --- a/be/src/agent/topic_subscriber.cpp +++ b/be/src/agent/topic_subscriber.cpp @@ -40,14 +40,12 @@ void TopicSubscriber::handle_topic_info(const TPublishTopicRequest& topic_reques // eg, update workload info may delay other listener, then we need add a 
thread here // to handle_topic_info asynchronous std::shared_lock lock(_listener_mtx); - LOG(INFO) << "[topic_publish]begin handle topic info"; for (auto& listener_pair : _registered_listeners) { if (topic_request.topic_map.find(listener_pair.first) != topic_request.topic_map.end()) { - LOG(INFO) << "[topic_publish]begin handle topic " << listener_pair.first - << ", size=" << topic_request.topic_map.at(listener_pair.first).size(); listener_pair.second->handle_topic_info( topic_request.topic_map.at(listener_pair.first)); - LOG(INFO) << "[topic_publish]finish handle topic " << listener_pair.first; + LOG(INFO) << "[topic_publish]finish handle topic " << listener_pair.first + << ", size=" << topic_request.topic_map.at(listener_pair.first).size(); } } } diff --git a/be/src/agent/workload_group_listener.cpp b/be/src/agent/workload_group_listener.cpp index f0f57869f2545a..0cd5a3ee1ac748 100644 --- a/be/src/agent/workload_group_listener.cpp +++ b/be/src/agent/workload_group_listener.cpp @@ -17,6 +17,7 @@ #include "agent/workload_group_listener.h" +#include "runtime/exec_env.h" #include "runtime/workload_group/workload_group.h" #include "runtime/workload_group/workload_group_manager.h" #include "util/mem_info.h" @@ -59,7 +60,7 @@ void WorkloadGroupListener::handle_topic_info(const std::vector& topi workload_group_info.enable_cpu_hard_limit); // 4 create and update task scheduler - wg->upsert_task_scheduler(&workload_group_info, _exec_env); + wg->upsert_task_scheduler(&workload_group_info); // 5 upsert io throttle wg->upsert_scan_io_throttle(&workload_group_info); diff --git a/be/src/agent/workload_group_listener.h b/be/src/agent/workload_group_listener.h index f596535908d079..9578a36f70d63e 100644 --- a/be/src/agent/workload_group_listener.h +++ b/be/src/agent/workload_group_listener.h @@ -20,10 +20,11 @@ #include #include "agent/topic_listener.h" -#include "runtime/exec_env.h" namespace doris { +class ExecEnv; + class WorkloadGroupListener : public TopicListener { public: 
~WorkloadGroupListener() {} diff --git a/be/src/apache-orc b/be/src/apache-orc index db01184f765c03..2f937bdc76406f 160000 --- a/be/src/apache-orc +++ b/be/src/apache-orc @@ -1 +1 @@ -Subproject commit db01184f765c03496e4107bd3ac37c077ac4bc5f +Subproject commit 2f937bdc76406f150b484b6e57629aa8a03d48b6 diff --git a/be/src/cloud/cloud_base_compaction.cpp b/be/src/cloud/cloud_base_compaction.cpp index 88d83000e95dfa..9742e57dcf9d34 100644 --- a/be/src/cloud/cloud_base_compaction.cpp +++ b/be/src/cloud/cloud_base_compaction.cpp @@ -125,6 +125,7 @@ Status CloudBaseCompaction::prepare_compact() { _input_row_num += rs->num_rows(); _input_segments += rs->num_segments(); _input_rowsets_data_size += rs->data_disk_size(); + _input_rowsets_index_size += rs->index_disk_size(); _input_rowsets_total_size += rs->total_disk_size(); } LOG_INFO("start CloudBaseCompaction, tablet_id={}, range=[{}-{}]", _tablet->tablet_id(), @@ -320,6 +321,10 @@ Status CloudBaseCompaction::modify_rowsets() { compaction_job->add_output_versions(_output_rowset->end_version()); compaction_job->add_txn_id(_output_rowset->txn_id()); compaction_job->add_output_rowset_ids(_output_rowset->rowset_id().to_string()); + compaction_job->set_index_size_input_rowsets(_input_rowsets_index_size); + compaction_job->set_segment_size_input_rowsets(_input_rowsets_data_size); + compaction_job->set_index_size_output_rowsets(_output_rowset->index_disk_size()); + compaction_job->set_segment_size_output_rowsets(_output_rowset->data_disk_size()); DeleteBitmapPtr output_rowset_delete_bitmap = nullptr; if (_tablet->keys_type() == KeysType::UNIQUE_KEYS && diff --git a/be/src/cloud/cloud_cumulative_compaction.cpp b/be/src/cloud/cloud_cumulative_compaction.cpp index 6b74e70ee1b4b8..1acf8efe32e62b 100644 --- a/be/src/cloud/cloud_cumulative_compaction.cpp +++ b/be/src/cloud/cloud_cumulative_compaction.cpp @@ -33,6 +33,7 @@ #include "util/uuid_generator.h" namespace doris { +#include "common/compile_check_begin.h" using namespace 
ErrorCode; bvar::Adder cumu_output_size("cumu_compaction", "output_size"); @@ -91,6 +92,10 @@ Status CloudCumulativeCompaction::prepare_compact() { // plus 1 to skip the delete version. // NOTICE: after that, the cumulative point may be larger than max version of this tablet, but it doesn't matter. update_cumulative_point(); + if (!config::enable_sleep_between_delete_cumu_compaction) { + st = Status::Error( + "_last_delete_version.first not equal to -1"); + } } return st; } @@ -263,6 +268,10 @@ Status CloudCumulativeCompaction::modify_rowsets() { compaction_job->add_output_versions(_output_rowset->end_version()); compaction_job->add_txn_id(_output_rowset->txn_id()); compaction_job->add_output_rowset_ids(_output_rowset->rowset_id().to_string()); + compaction_job->set_index_size_input_rowsets(_input_rowsets_index_size); + compaction_job->set_segment_size_input_rowsets(_input_rowsets_data_size); + compaction_job->set_index_size_output_rowsets(_output_rowset->index_disk_size()); + compaction_job->set_segment_size_output_rowsets(_output_rowset->data_disk_size()); DBUG_EXECUTE_IF("CloudCumulativeCompaction::modify_rowsets.enable_spin_wait", { LOG(INFO) << "CloudCumulativeCompaction::modify_rowsets.enable_spin_wait, start"; @@ -371,40 +380,17 @@ Status CloudCumulativeCompaction::modify_rowsets() { Status CloudCumulativeCompaction::process_old_version_delete_bitmap() { // agg previously rowset old version delete bitmap std::vector pre_rowsets {}; - std::vector pre_rowset_ids {}; for (const auto& it : cloud_tablet()->rowset_map()) { if (it.first.second < _input_rowsets.front()->start_version()) { pre_rowsets.emplace_back(it.second); - pre_rowset_ids.emplace_back(it.second->rowset_id().to_string()); } } std::sort(pre_rowsets.begin(), pre_rowsets.end(), Rowset::comparator); if (!pre_rowsets.empty()) { - auto pre_max_version = _output_rowset->version().second; - DeleteBitmapPtr new_delete_bitmap = - std::make_shared(_tablet->tablet_meta()->tablet_id()); std::vector> 
to_remove_vec; - for (auto& rowset : pre_rowsets) { - if (rowset->rowset_meta()->total_disk_size() == 0) { - continue; - } - for (uint32_t seg_id = 0; seg_id < rowset->num_segments(); ++seg_id) { - rowset->rowset_id().to_string(); - DeleteBitmap::BitmapKey start {rowset->rowset_id(), seg_id, 0}; - DeleteBitmap::BitmapKey end {rowset->rowset_id(), seg_id, pre_max_version}; - DeleteBitmap::BitmapKey before_end {rowset->rowset_id(), seg_id, - pre_max_version - 1}; - auto d = _tablet->tablet_meta()->delete_bitmap().get_agg( - {rowset->rowset_id(), seg_id, pre_max_version}); - to_remove_vec.emplace_back( - std::make_tuple(_tablet->tablet_id(), start, before_end)); - if (d->isEmpty()) { - continue; - } - new_delete_bitmap->set(end, *d); - } - } + DeleteBitmapPtr new_delete_bitmap = nullptr; + agg_and_remove_old_version_delete_bitmap(pre_rowsets, to_remove_vec, new_delete_bitmap); if (!new_delete_bitmap->empty()) { // store agg delete bitmap DBUG_EXECUTE_IF("CloudCumulativeCompaction.modify_rowsets.update_delete_bitmap_failed", @@ -424,9 +410,9 @@ Status CloudCumulativeCompaction::process_old_version_delete_bitmap() { } _tablet->tablet_meta()->delete_bitmap().add_to_remove_queue(version.to_string(), to_remove_vec); - DBUG_EXECUTE_IF( - "CloudCumulativeCompaction.modify_rowsets.delete_expired_stale_rowsets", - { static_cast(_tablet.get())->delete_expired_stale_rowsets(); }); + DBUG_EXECUTE_IF("CumulativeCompaction.modify_rowsets.delete_expired_stale_rowsets", { + static_cast(_tablet.get())->delete_expired_stale_rowsets(); + }); } } return Status::OK(); @@ -489,8 +475,10 @@ Status CloudCumulativeCompaction::pick_rowsets_to_compact() { } int64_t max_score = config::cumulative_compaction_max_deltas; - auto process_memory_usage = doris::GlobalMemoryArbitrator::process_memory_usage(); - bool memory_usage_high = process_memory_usage > MemInfo::soft_mem_limit() * 0.8; + double process_memory_usage = + cast_set(doris::GlobalMemoryArbitrator::process_memory_usage()); + bool 
memory_usage_high = + process_memory_usage > cast_set(MemInfo::soft_mem_limit()) * 0.8; if (cloud_tablet()->last_compaction_status.is() || memory_usage_high) { max_score = std::max(config::cumulative_compaction_max_deltas / @@ -620,4 +608,5 @@ void CloudCumulativeCompaction::do_lease() { } } +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/cloud/cloud_cumulative_compaction.h b/be/src/cloud/cloud_cumulative_compaction.h index 1159dcb59ceef1..87fc0b62c9c389 100644 --- a/be/src/cloud/cloud_cumulative_compaction.h +++ b/be/src/cloud/cloud_cumulative_compaction.h @@ -24,6 +24,7 @@ #include "olap/compaction.h" namespace doris { +#include "common/compile_check_begin.h" class CloudCumulativeCompaction : public CloudCompactionMixin { public: @@ -60,4 +61,5 @@ class CloudCumulativeCompaction : public CloudCompactionMixin { Version _last_delete_version {-1, -1}; }; +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/cloud/cloud_cumulative_compaction_policy.cpp b/be/src/cloud/cloud_cumulative_compaction_policy.cpp index 5a9879387b2327..92a47fcc69f8d7 100644 --- a/be/src/cloud/cloud_cumulative_compaction_policy.cpp +++ b/be/src/cloud/cloud_cumulative_compaction_policy.cpp @@ -31,6 +31,7 @@ #include "olap/tablet_meta.h" namespace doris { +#include "common/compile_check_begin.h" CloudSizeBasedCumulativeCompactionPolicy::CloudSizeBasedCumulativeCompactionPolicy( int64_t promotion_size, double promotion_ratio, int64_t promotion_min_size, @@ -48,7 +49,7 @@ int64_t CloudSizeBasedCumulativeCompactionPolicy::_level_size(const int64_t size return (int64_t)1 << (sizeof(size) * 8 - 1 - __builtin_clzl(size)); } -int32_t CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets( +int64_t CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets( CloudTablet* tablet, const std::vector& candidate_rowsets, const int64_t max_compaction_score, const int64_t min_compaction_score, std::vector* input_rowsets, Version* 
last_delete_version, @@ -114,8 +115,8 @@ int32_t CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets( size_t new_compaction_score = *compaction_score; while (rs_begin != input_rowsets->end()) { auto& rs_meta = (*rs_begin)->rowset_meta(); - int current_level = _level_size(rs_meta->total_disk_size()); - int remain_level = _level_size(total_size - rs_meta->total_disk_size()); + int64_t current_level = _level_size(rs_meta->total_disk_size()); + int64_t remain_level = _level_size(total_size - rs_meta->total_disk_size()); // if current level less then remain level, input rowsets contain current rowset // and process return; otherwise, input rowsets do not contain current rowset. if (current_level <= remain_level) { @@ -185,7 +186,7 @@ int32_t CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets( } int64_t CloudSizeBasedCumulativeCompactionPolicy::cloud_promotion_size(CloudTablet* t) const { - int64_t promotion_size = int64_t(t->base_size() * _promotion_ratio); + int64_t promotion_size = int64_t(cast_set(t->base_size()) * _promotion_ratio); // promotion_size is between _size_based_promotion_size and _size_based_promotion_min_size return promotion_size > _promotion_size ? _promotion_size : promotion_size < _promotion_min_size ? 
_promotion_min_size @@ -215,7 +216,7 @@ int64_t CloudSizeBasedCumulativeCompactionPolicy::new_cumulative_point( : last_cumulative_point; } -int32_t CloudTimeSeriesCumulativeCompactionPolicy::pick_input_rowsets( +int64_t CloudTimeSeriesCumulativeCompactionPolicy::pick_input_rowsets( CloudTablet* tablet, const std::vector& candidate_rowsets, const int64_t max_compaction_score, const int64_t min_compaction_score, std::vector* input_rowsets, Version* last_delete_version, @@ -377,4 +378,5 @@ int64_t CloudTimeSeriesCumulativeCompactionPolicy::new_cumulative_point( return output_rowset->end_version() + 1; } +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/cloud/cloud_cumulative_compaction_policy.h b/be/src/cloud/cloud_cumulative_compaction_policy.h index c142a8a6d3dffe..9373728547241b 100644 --- a/be/src/cloud/cloud_cumulative_compaction_policy.h +++ b/be/src/cloud/cloud_cumulative_compaction_policy.h @@ -30,6 +30,7 @@ #include "olap/rowset/rowset_meta.h" namespace doris { +#include "common/compile_check_begin.h" class Tablet; struct Version; @@ -44,7 +45,7 @@ class CloudCumulativeCompactionPolicy { virtual int64_t new_compaction_level(const std::vector& input_rowsets) = 0; - virtual int32_t pick_input_rowsets(CloudTablet* tablet, + virtual int64_t pick_input_rowsets(CloudTablet* tablet, const std::vector& candidate_rowsets, const int64_t max_compaction_score, const int64_t min_compaction_score, @@ -71,7 +72,7 @@ class CloudSizeBasedCumulativeCompactionPolicy : public CloudCumulativeCompactio return 0; } - int32_t pick_input_rowsets(CloudTablet* tablet, + int64_t pick_input_rowsets(CloudTablet* tablet, const std::vector& candidate_rowsets, const int64_t max_compaction_score, const int64_t min_compaction_score, @@ -106,7 +107,7 @@ class CloudTimeSeriesCumulativeCompactionPolicy : public CloudCumulativeCompacti int64_t new_compaction_level(const std::vector& input_rowsets) override; - int32_t pick_input_rowsets(CloudTablet* tablet, + int64_t 
pick_input_rowsets(CloudTablet* tablet, const std::vector& candidate_rowsets, const int64_t max_compaction_score, const int64_t min_compaction_score, @@ -115,4 +116,5 @@ class CloudTimeSeriesCumulativeCompactionPolicy : public CloudCumulativeCompacti bool allow_delete = false) override; }; +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp index 6abc3958650ef6..fbf4b9cf303570 100644 --- a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp +++ b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.cpp @@ -34,6 +34,7 @@ #include "runtime/memory/mem_tracker_limiter.h" namespace doris { +#include "common/compile_check_begin.h" CloudEngineCalcDeleteBitmapTask::CloudEngineCalcDeleteBitmapTask( CloudStorageEngine& engine, const TCalcDeleteBitmapRequest& cal_delete_bitmap_req, @@ -75,7 +76,7 @@ Status CloudEngineCalcDeleteBitmapTask::execute() { for (size_t i = 0; i < partition.tablet_ids.size(); i++) { auto tablet_id = partition.tablet_ids[i]; auto tablet_calc_delete_bitmap_ptr = std::make_shared( - _engine, this, tablet_id, transaction_id, version); + _engine, this, tablet_id, transaction_id, version, partition.sub_txn_ids); if (has_compaction_stats) { tablet_calc_delete_bitmap_ptr->set_compaction_stats( partition.base_compaction_cnts[i], partition.cumulative_compaction_cnts[i], @@ -107,12 +108,13 @@ Status CloudEngineCalcDeleteBitmapTask::execute() { CloudTabletCalcDeleteBitmapTask::CloudTabletCalcDeleteBitmapTask( CloudStorageEngine& engine, CloudEngineCalcDeleteBitmapTask* engine_task, int64_t tablet_id, - int64_t transaction_id, int64_t version) + int64_t transaction_id, int64_t version, const std::vector& sub_txn_ids) : _engine(engine), _engine_calc_delete_bitmap_task(engine_task), _tablet_id(tablet_id), _transaction_id(transaction_id), - _version(version) { + _version(version), + _sub_txn_ids(sub_txn_ids) { _mem_tracker = 
MemTrackerLimiter::create_shared( MemTrackerLimiter::Type::OTHER, fmt::format("CloudTabletCalcDeleteBitmapTask#_transaction_id={}", _transaction_id)); @@ -189,6 +191,60 @@ Status CloudTabletCalcDeleteBitmapTask::handle() const { return error_st; } + int64_t t3 = MonotonicMicros(); + Status status; + if (_sub_txn_ids.empty()) { + status = _handle_rowset(tablet, _version); + } else { + std::stringstream ss; + for (const auto& sub_txn_id : _sub_txn_ids) { + ss << sub_txn_id << ", "; + } + LOG(INFO) << "start calc delete bitmap for txn_id=" << _transaction_id << ", sub_txn_ids=[" + << ss.str() << "], table_id=" << tablet->table_id() + << ", partition_id=" << tablet->partition_id() << ", tablet_id=" << _tablet_id + << ", start_version=" << _version; + std::vector invisible_rowsets; + DeleteBitmapPtr tablet_delete_bitmap = + std::make_shared(tablet->tablet_meta()->delete_bitmap()); + for (int i = 0; i < _sub_txn_ids.size(); ++i) { + int64_t sub_txn_id = _sub_txn_ids[i]; + int64_t version = _version + i; + LOG(INFO) << "start calc delete bitmap for txn_id=" << _transaction_id + << ", sub_txn_id=" << sub_txn_id << ", table_id=" << tablet->table_id() + << ", partition_id=" << tablet->partition_id() << ", tablet_id=" << _tablet_id + << ", start_version=" << _version << ", cur_version=" << version; + status = _handle_rowset(tablet, version, sub_txn_id, &invisible_rowsets, + tablet_delete_bitmap); + if (!status.ok()) { + LOG(INFO) << "failed to calculate delete bitmap on tablet" + << ", table_id=" << tablet->table_id() + << ", transaction_id=" << _transaction_id << ", sub_txn_id=" << sub_txn_id + << ", tablet_id=" << tablet->tablet_id() << ", start version=" << _version + << ", cur_version=" << version << ", status=" << status; + return status; + } + DCHECK(invisible_rowsets.size() == i + 1); + } + } + auto total_update_delete_bitmap_time_us = MonotonicMicros() - t3; + LOG(INFO) << "finish calculate delete bitmap on tablet" + << ", table_id=" << tablet->table_id() << ", 
transaction_id=" << _transaction_id + << ", tablet_id=" << tablet->tablet_id() + << ", get_tablet_time_us=" << get_tablet_time_us + << ", sync_rowset_time_us=" << sync_rowset_time_us + << ", total_update_delete_bitmap_time_us=" << total_update_delete_bitmap_time_us + << ", res=" << status; + return status; +} + +Status CloudTabletCalcDeleteBitmapTask::_handle_rowset( + std::shared_ptr tablet, int64_t version, int64_t sub_txn_id, + std::vector* invisible_rowsets, + DeleteBitmapPtr tablet_delete_bitmap) const { + int64_t transaction_id = sub_txn_id == -1 ? _transaction_id : sub_txn_id; + std::string txn_str = "txn_id=" + std::to_string(_transaction_id) + + (sub_txn_id == -1 ? "" : ", sub_txn_id=" + std::to_string(sub_txn_id)); RowsetSharedPtr rowset; DeleteBitmapPtr delete_bitmap; RowsetIdUnorderedSet rowset_ids; @@ -197,60 +253,78 @@ Status CloudTabletCalcDeleteBitmapTask::handle() const { int64_t txn_expiration; TxnPublishInfo previous_publish_info; Status status = _engine.txn_delete_bitmap_cache().get_tablet_txn_info( - _transaction_id, _tablet_id, &rowset, &delete_bitmap, &rowset_ids, &txn_expiration, + transaction_id, _tablet_id, &rowset, &delete_bitmap, &rowset_ids, &txn_expiration, &partial_update_info, &publish_status, &previous_publish_info); if (status != Status::OK()) { - LOG(WARNING) << "failed to get tablet txn info. tablet_id=" << _tablet_id - << ", txn_id=" << _transaction_id << ", status=" << status; + LOG(WARNING) << "failed to get tablet txn info. 
tablet_id=" << _tablet_id << ", " << txn_str + << ", status=" << status; _engine_calc_delete_bitmap_task->add_error_tablet_id(_tablet_id, status); return status; } int64_t t3 = MonotonicMicros(); - rowset->set_version(Version(_version, _version)); + rowset->set_version(Version(version, version)); TabletTxnInfo txn_info; txn_info.rowset = rowset; txn_info.delete_bitmap = delete_bitmap; txn_info.rowset_ids = rowset_ids; txn_info.partial_update_info = partial_update_info; txn_info.publish_status = publish_status; - txn_info.publish_info = {.publish_version = _version, + txn_info.publish_info = {.publish_version = version, .base_compaction_cnt = _ms_base_compaction_cnt, .cumulative_compaction_cnt = _ms_cumulative_compaction_cnt, .cumulative_point = _ms_cumulative_point}; - auto update_delete_bitmap_time_us = 0; + int64_t update_delete_bitmap_time_us = 0; if (txn_info.publish_status && (*(txn_info.publish_status) == PublishStatus::SUCCEED) && - _version == previous_publish_info.publish_version && + version == previous_publish_info.publish_version && _ms_base_compaction_cnt == previous_publish_info.base_compaction_cnt && _ms_cumulative_compaction_cnt == previous_publish_info.cumulative_compaction_cnt && _ms_cumulative_point == previous_publish_info.cumulative_point) { // if version or compaction stats can't match, it means that this is a retry and there are // compaction or other loads finished successfully on the same tablet. 
So the previous publish // is stale and we should re-calculate the delete bitmap - LOG(INFO) << "tablet=" << _tablet_id << ",txn=" << _transaction_id + LOG(INFO) << "tablet=" << _tablet_id << ", " << txn_str << ",publish_status=SUCCEED,not need to recalculate and update delete_bitmap."; } else { - status = CloudTablet::update_delete_bitmap(tablet, &txn_info, _transaction_id, - txn_expiration); + if (invisible_rowsets == nullptr) { + status = CloudTablet::update_delete_bitmap(tablet, &txn_info, transaction_id, + txn_expiration); + } else { + txn_info.is_txn_load = true; + txn_info.invisible_rowsets = *invisible_rowsets; + txn_info.lock_id = _transaction_id; + txn_info.next_visible_version = _version; + status = CloudTablet::update_delete_bitmap(tablet, &txn_info, transaction_id, + txn_expiration, tablet_delete_bitmap); + } update_delete_bitmap_time_us = MonotonicMicros() - t3; } if (status != Status::OK()) { LOG(WARNING) << "failed to calculate delete bitmap. rowset_id=" << rowset->rowset_id() - << ", tablet_id=" << _tablet_id << ", txn_id=" << _transaction_id - << ", status=" << status; + << ", tablet_id=" << _tablet_id << ", " << txn_str << ", status=" << status; _engine_calc_delete_bitmap_task->add_error_tablet_id(_tablet_id, status); return status; } _engine_calc_delete_bitmap_task->add_succ_tablet_id(_tablet_id); LOG(INFO) << "calculate delete bitmap successfully on tablet" - << ", table_id=" << tablet->table_id() << ", transaction_id=" << _transaction_id + << ", table_id=" << tablet->table_id() << ", " << txn_str << ", tablet_id=" << tablet->tablet_id() << ", num_rows=" << rowset->num_rows() - << ", get_tablet_time_us=" << get_tablet_time_us - << ", sync_rowset_time_us=" << sync_rowset_time_us << ", update_delete_bitmap_time_us=" << update_delete_bitmap_time_us << ", res=" << status; + if (invisible_rowsets != nullptr) { + invisible_rowsets->push_back(rowset); + // see CloudTablet::save_delete_bitmap + auto dm = txn_info.delete_bitmap->delete_bitmap; + for 
(auto it = dm.begin(); it != dm.end(); ++it) { + if (std::get<1>(it->first) != DeleteBitmap::INVALID_SEGMENT_ID) { + tablet_delete_bitmap->merge( + {std::get<0>(it->first), std::get<1>(it->first), version}, it->second); + } + } + } return status; } +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h index e3733d3e696ff8..c70a9cfa3903ba 100644 --- a/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h +++ b/be/src/cloud/cloud_engine_calc_delete_bitmap_task.h @@ -34,7 +34,8 @@ class CloudTabletCalcDeleteBitmapTask { public: CloudTabletCalcDeleteBitmapTask(CloudStorageEngine& engine, CloudEngineCalcDeleteBitmapTask* engine_task, int64_t tablet_id, - int64_t transaction_id, int64_t version); + int64_t transaction_id, int64_t version, + const std::vector& sub_txn_ids); ~CloudTabletCalcDeleteBitmapTask() = default; void set_compaction_stats(int64_t ms_base_compaction_cnt, int64_t ms_cumulative_compaction_cnt, @@ -43,12 +44,18 @@ class CloudTabletCalcDeleteBitmapTask { Status handle() const; private: + Status _handle_rowset(std::shared_ptr tablet, int64_t version, + int64_t sub_txn_id = -1, + std::vector* invisible_rowsets = nullptr, + DeleteBitmapPtr tablet_delete_bitmap = nullptr) const; + CloudStorageEngine& _engine; CloudEngineCalcDeleteBitmapTask* _engine_calc_delete_bitmap_task; int64_t _tablet_id; int64_t _transaction_id; int64_t _version; + std::vector _sub_txn_ids; int64_t _ms_base_compaction_cnt {-1}; int64_t _ms_cumulative_compaction_cnt {-1}; diff --git a/be/src/cloud/cloud_full_compaction.cpp b/be/src/cloud/cloud_full_compaction.cpp index c27b728c93d29b..bce00c9a2e74f6 100644 --- a/be/src/cloud/cloud_full_compaction.cpp +++ b/be/src/cloud/cloud_full_compaction.cpp @@ -216,6 +216,10 @@ Status CloudFullCompaction::modify_rowsets() { compaction_job->add_output_versions(_output_rowset->end_version()); 
compaction_job->add_txn_id(_output_rowset->txn_id()); compaction_job->add_output_rowset_ids(_output_rowset->rowset_id().to_string()); + compaction_job->set_index_size_input_rowsets(_input_rowsets_index_size); + compaction_job->set_segment_size_input_rowsets(_input_rowsets_data_size); + compaction_job->set_index_size_output_rowsets(_output_rowset->index_disk_size()); + compaction_job->set_segment_size_output_rowsets(_output_rowset->data_disk_size()); DeleteBitmapPtr output_rowset_delete_bitmap = nullptr; if (_tablet->keys_type() == KeysType::UNIQUE_KEYS && diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index 5c699ae0159050..835e74ca7d5687 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -64,6 +64,7 @@ #include "util/thrift_rpc_helper.h" namespace doris::cloud { +#include "common/compile_check_begin.h" using namespace ErrorCode; Status bthread_fork_join(const std::vector>& tasks, int concurrency) { @@ -243,12 +244,12 @@ class MetaServiceProxy { long deadline = now; // connection age only works without list endpoint. 
if (!is_meta_service_endpoint_list && - config::meta_service_connection_age_base_minutes > 0) { + config::meta_service_connection_age_base_seconds > 0) { std::default_random_engine rng(static_cast(now)); std::uniform_int_distribution<> uni( - config::meta_service_connection_age_base_minutes, - config::meta_service_connection_age_base_minutes * 2); - deadline = now + duration_cast(minutes(uni(rng))).count(); + config::meta_service_connection_age_base_seconds, + config::meta_service_connection_age_base_seconds * 2); + deadline = now + duration_cast(seconds(uni(rng))).count(); } else { deadline = LONG_MAX; } @@ -385,7 +386,7 @@ Status CloudMetaMgr::get_tablet_meta(int64_t tablet_id, TabletMetaSharedPtr* tab } Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_data, - bool sync_delete_bitmap) { + bool sync_delete_bitmap, bool full_sync) { using namespace std::chrono; TEST_SYNC_POINT_RETURN_WITH_VALUE("CloudMetaMgr::sync_tablet_rowsets", Status::OK(), tablet); @@ -411,7 +412,11 @@ Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_ idx->set_partition_id(tablet->partition_id()); { std::shared_lock rlock(tablet->get_header_lock()); - req.set_start_version(tablet->max_version_unlocked() + 1); + if (full_sync) { + req.set_start_version(0); + } else { + req.set_start_version(tablet->max_version_unlocked() + 1); + } req.set_base_compaction_cnt(tablet->base_compaction_cnt()); req.set_cumulative_compaction_cnt(tablet->cumulative_compaction_cnt()); req.set_cumulative_point(tablet->cumulative_layer_point()); @@ -471,7 +476,7 @@ Status CloudMetaMgr::sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_ DeleteBitmap delete_bitmap(tablet_id); int64_t old_max_version = req.start_version() - 1; auto st = sync_tablet_delete_bitmap(tablet, old_max_version, resp.rowset_meta(), - resp.stats(), req.idx(), &delete_bitmap); + resp.stats(), req.idx(), &delete_bitmap, full_sync); if (st.is() && tried++ < retry_times) { 
LOG_WARNING("rowset meta is expired, need to retry") .tag("tablet", tablet->tablet_id()) @@ -606,8 +611,9 @@ bool CloudMetaMgr::sync_tablet_delete_bitmap_by_cache(CloudTablet* tablet, int64 engine.txn_delete_bitmap_cache().remove_unused_tablet_txn_info(txn_id, tablet->tablet_id()); } else { - LOG(WARNING) << "failed to get tablet txn info. tablet_id=" << tablet->tablet_id() - << ", txn_id=" << txn_id << ", status=" << status; + LOG_EVERY_N(INFO, 20) + << "delete bitmap not found in cache, will sync rowset to get. tablet_id= " + << tablet->tablet_id() << ", txn_id=" << txn_id << ", status=" << status; return false; } } @@ -617,16 +623,15 @@ bool CloudMetaMgr::sync_tablet_delete_bitmap_by_cache(CloudTablet* tablet, int64 Status CloudMetaMgr::sync_tablet_delete_bitmap(CloudTablet* tablet, int64_t old_max_version, std::ranges::range auto&& rs_metas, const TabletStatsPB& stats, const TabletIndexPB& idx, - DeleteBitmap* delete_bitmap) { + DeleteBitmap* delete_bitmap, bool full_sync) { if (rs_metas.empty()) { return Status::OK(); } - if (sync_tablet_delete_bitmap_by_cache(tablet, old_max_version, rs_metas, delete_bitmap)) { + if (!full_sync && + sync_tablet_delete_bitmap_by_cache(tablet, old_max_version, rs_metas, delete_bitmap)) { return Status::OK(); } else { - LOG(WARNING) << "failed to sync delete bitmap by txn info. 
tablet_id=" - << tablet->tablet_id(); DeleteBitmapPtr new_delete_bitmap = std::make_shared(tablet->tablet_id()); *delete_bitmap = *new_delete_bitmap; } @@ -713,7 +718,7 @@ Status CloudMetaMgr::sync_tablet_delete_bitmap(CloudTablet* tablet, int64_t old_ "rowset_ids.size={},segment_ids.size={},vers.size={},delete_bitmaps.size={}", rowset_ids.size(), segment_ids.size(), vers.size(), delete_bitmaps.size()); } - for (size_t i = 0; i < rowset_ids.size(); i++) { + for (int i = 0; i < rowset_ids.size(); i++) { RowsetId rst_id; rst_id.init(rowset_ids[i]); delete_bitmap->merge( @@ -753,10 +758,10 @@ Status CloudMetaMgr::prepare_rowset(const RowsetMeta& rs_meta, Status st = retry_rpc("prepare rowset", req, &resp, &MetaService_Stub::prepare_rowset); if (!st.ok() && resp.status().code() == MetaServiceCode::ALREADY_EXISTED) { if (existed_rs_meta != nullptr && resp.has_existed_rowset_meta()) { - RowsetMetaPB doris_rs_meta = + RowsetMetaPB doris_rs_meta_tmp = cloud_rowset_meta_to_doris(std::move(*resp.mutable_existed_rowset_meta())); *existed_rs_meta = std::make_shared(); - (*existed_rs_meta)->init_from_pb(doris_rs_meta); + (*existed_rs_meta)->init_from_pb(doris_rs_meta_tmp); } return Status::AlreadyExist("failed to prepare rowset: {}", resp.status().msg()); } @@ -1282,4 +1287,5 @@ int64_t CloudMetaMgr::get_inverted_index_file_szie(const RowsetMeta& rs_meta) { return total_inverted_index_size; } +#include "common/compile_check_end.h" } // namespace doris::cloud diff --git a/be/src/cloud/cloud_meta_mgr.h b/be/src/cloud/cloud_meta_mgr.h index a657c0fdd8e350..913ef59489a1b3 100644 --- a/be/src/cloud/cloud_meta_mgr.h +++ b/be/src/cloud/cloud_meta_mgr.h @@ -27,6 +27,7 @@ #include "util/s3_util.h" namespace doris { +#include "common/compile_check_begin.h" class DeleteBitmap; class StreamLoadContext; @@ -58,7 +59,7 @@ class CloudMetaMgr { Status get_tablet_meta(int64_t tablet_id, std::shared_ptr* tablet_meta); Status sync_tablet_rowsets(CloudTablet* tablet, bool warmup_delta_data = 
false, - bool sync_delete_bitmap = true); + bool sync_delete_bitmap = true, bool full_sync = false); Status prepare_rowset(const RowsetMeta& rs_meta, std::shared_ptr* existed_rs_meta = nullptr); @@ -116,11 +117,13 @@ class CloudMetaMgr { Status sync_tablet_delete_bitmap(CloudTablet* tablet, int64_t old_max_version, std::ranges::range auto&& rs_metas, const TabletStatsPB& stats, - const TabletIndexPB& idx, DeleteBitmap* delete_bitmap); + const TabletIndexPB& idx, DeleteBitmap* delete_bitmap, + bool full_sync = false); void check_table_size_correctness(const RowsetMeta& rs_meta); int64_t get_segment_file_size(const RowsetMeta& rs_meta); int64_t get_inverted_index_file_szie(const RowsetMeta& rs_meta); }; } // namespace cloud +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/cloud/cloud_schema_change_job.cpp b/be/src/cloud/cloud_schema_change_job.cpp index 896804578d7db9..b086def3c03ee5 100644 --- a/be/src/cloud/cloud_schema_change_job.cpp +++ b/be/src/cloud/cloud_schema_change_job.cpp @@ -169,6 +169,15 @@ Status CloudSchemaChangeJob::process_alter_tablet(const TAlterTabletReqV2& reque reader_context.batch_size = ALTER_TABLE_BATCH_SIZE; reader_context.delete_bitmap = &_base_tablet->tablet_meta()->delete_bitmap(); reader_context.version = Version(0, start_resp.alter_version()); + std::vector cluster_key_idxes; + if (!_base_tablet_schema->cluster_key_uids().empty()) { + for (const auto& uid : _base_tablet_schema->cluster_key_uids()) { + cluster_key_idxes.emplace_back(_base_tablet_schema->field_index(uid)); + } + reader_context.read_orderby_key_columns = &cluster_key_idxes; + reader_context.is_unique = false; + reader_context.sequence_id_idx = -1; + } for (auto& split : rs_splits) { RETURN_IF_ERROR(split.rs_reader->init(&reader_context)); @@ -340,17 +349,23 @@ Status CloudSchemaChangeJob::_convert_historical_rowsets(const SchemaChangeParam int64_t num_output_rows = 0; int64_t size_output_rowsets = 0; int64_t num_output_segments = 0; + int64_t 
index_size_output_rowsets = 0; + int64_t segment_size_output_rowsets = 0; for (auto& rs : _output_rowsets) { sc_job->add_txn_ids(rs->txn_id()); sc_job->add_output_versions(rs->end_version()); num_output_rows += rs->num_rows(); size_output_rowsets += rs->total_disk_size(); num_output_segments += rs->num_segments(); + index_size_output_rowsets += rs->index_disk_size(); + segment_size_output_rowsets += rs->data_disk_size(); } sc_job->set_num_output_rows(num_output_rows); sc_job->set_size_output_rowsets(size_output_rowsets); sc_job->set_num_output_segments(num_output_segments); sc_job->set_num_output_rowsets(_output_rowsets.size()); + sc_job->set_index_size_output_rowsets(index_size_output_rowsets); + sc_job->set_segment_size_output_rowsets(segment_size_output_rowsets); } _output_cumulative_point = std::min(_output_cumulative_point, sc_job->alter_version() + 1); sc_job->set_output_cumulative_point(_output_cumulative_point); @@ -363,7 +378,8 @@ Status CloudSchemaChangeJob::_convert_historical_rowsets(const SchemaChangeParam // If there are historical versions of rowsets, we need to recalculate their delete // bitmaps, otherwise we will miss the delete bitmaps of incremental rowsets int64_t start_calc_delete_bitmap_version = - already_exist_any_version ? 0 : sc_job->alter_version() + 1; + // [0-1] is a placeholder rowset, start from 2. + already_exist_any_version ? 
2 : sc_job->alter_version() + 1; RETURN_IF_ERROR(_process_delete_bitmap(sc_job->alter_version(), start_calc_delete_bitmap_version, initiator)); sc_job->set_delete_bitmap_lock_initiator(initiator); diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index 5d7b445917aa20..650909a29157cd 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -52,6 +52,7 @@ #include "util/parse_util.h" namespace doris { +#include "common/compile_check_begin.h" using namespace std::literals; @@ -166,7 +167,8 @@ Status CloudStorageEngine::open() { _memtable_flush_executor = std::make_unique(); // Use file cache disks number - _memtable_flush_executor->init(io::FileCacheFactory::instance()->get_cache_instance_size()); + _memtable_flush_executor->init( + cast_set(io::FileCacheFactory::instance()->get_cache_instance_size())); _calc_delete_bitmap_executor = std::make_unique(); _calc_delete_bitmap_executor->init(); @@ -231,7 +233,7 @@ Result CloudStorageEngine::get_tablet(int64_t tablet_id) { }); } -Status CloudStorageEngine::start_bg_threads() { +Status CloudStorageEngine::start_bg_threads(std::shared_ptr wg_sptr) { RETURN_IF_ERROR(Thread::create( "CloudStorageEngine", "refresh_s3_info_thread", [this]() { this->_refresh_storage_vault_info_thread_callback(); }, @@ -266,14 +268,27 @@ Status CloudStorageEngine::start_bg_threads() { // compaction tasks producer thread int base_thread_num = get_base_thread_num(); int cumu_thread_num = get_cumu_thread_num(); - RETURN_IF_ERROR(ThreadPoolBuilder("BaseCompactionTaskThreadPool") - .set_min_threads(base_thread_num) - .set_max_threads(base_thread_num) - .build(&_base_compaction_thread_pool)); - RETURN_IF_ERROR(ThreadPoolBuilder("CumuCompactionTaskThreadPool") - .set_min_threads(cumu_thread_num) - .set_max_threads(cumu_thread_num) - .build(&_cumu_compaction_thread_pool)); + if (wg_sptr->get_cgroup_cpu_ctl_wptr().lock()) { + 
RETURN_IF_ERROR(ThreadPoolBuilder("BaseCompactionTaskThreadPool") + .set_min_threads(base_thread_num) + .set_max_threads(base_thread_num) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_base_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("CumuCompactionTaskThreadPool") + .set_min_threads(cumu_thread_num) + .set_max_threads(cumu_thread_num) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_cumu_compaction_thread_pool)); + } else { + RETURN_IF_ERROR(ThreadPoolBuilder("BaseCompactionTaskThreadPool") + .set_min_threads(base_thread_num) + .set_max_threads(base_thread_num) + .build(&_base_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("CumuCompactionTaskThreadPool") + .set_min_threads(cumu_thread_num) + .set_max_threads(cumu_thread_num) + .build(&_cumu_compaction_thread_pool)); + } RETURN_IF_ERROR(Thread::create( "StorageEngine", "compaction_tasks_producer_thread", [this]() { this->_compaction_tasks_producer_callback(); }, @@ -308,7 +323,7 @@ void CloudStorageEngine::_check_file_cache_ttl_block_valid() { for (const auto& rowset : rowsets) { int64_t ttl_seconds = tablet->tablet_meta()->ttl_seconds(); if (rowset->newest_write_timestamp() + ttl_seconds <= UnixSeconds()) continue; - for (int64_t seg_id = 0; seg_id < rowset->num_segments(); seg_id++) { + for (uint32_t seg_id = 0; seg_id < rowset->num_segments(); seg_id++) { auto hash = Segment::file_cache_key(rowset->rowset_id().to_string(), seg_id); auto* file_cache = io::FileCacheFactory::instance()->get_by_path(hash); file_cache->update_ttl_atime(hash); @@ -337,11 +352,11 @@ void CloudStorageEngine::sync_storage_vault() { for (auto& [id, vault_info, path_format] : vault_infos) { auto fs = get_filesystem(id); - auto st = (fs == nullptr) - ? std::visit(VaultCreateFSVisitor {id, path_format}, vault_info) - : std::visit(RefreshFSVaultVisitor {id, std::move(fs), path_format}, - vault_info); - if (!st.ok()) [[unlikely]] { + auto status = (fs == nullptr) + ? 
std::visit(VaultCreateFSVisitor {id, path_format}, vault_info) + : std::visit(RefreshFSVaultVisitor {id, std::move(fs), path_format}, + vault_info); + if (!status.ok()) [[unlikely]] { LOG(WARNING) << vault_process_error(id, vault_info, std::move(st)); } } @@ -491,13 +506,13 @@ void CloudStorageEngine::_compaction_tasks_producer_callback() { /// If it is not cleaned up, the reference count of the tablet will always be greater than 1, /// thus cannot be collected by the garbage collector. (TabletManager::start_trash_sweep) for (const auto& tablet : tablets_compaction) { - Status st = submit_compaction_task(tablet, compaction_type); - if (st.ok()) continue; - if ((!st.is() && - !st.is()) || + Status status = submit_compaction_task(tablet, compaction_type); + if (status.ok()) continue; + if ((!status.is() && + !status.is()) || VLOG_DEBUG_IS_ON) { LOG(WARNING) << "failed to submit compaction task for tablet: " - << tablet->tablet_id() << ", err: " << st; + << tablet->tablet_id() << ", err: " << status; } } interval = config::generate_compaction_tasks_interval_ms; @@ -531,7 +546,8 @@ std::vector CloudStorageEngine::_generate_cloud_compaction_task int num_cumu = std::accumulate(submitted_cumu_compactions.begin(), submitted_cumu_compactions.end(), 0, [](int a, auto& b) { return a + b.second.size(); }); - int num_base = submitted_base_compactions.size() + submitted_full_compactions.size(); + int num_base = + cast_set(submitted_base_compactions.size() + submitted_full_compactions.size()); int n = thread_per_disk - num_cumu - num_base; if (compaction_type == CompactionType::BASE_COMPACTION) { // We need to reserve at least one thread for cumulative compaction, @@ -661,7 +677,8 @@ Status CloudStorageEngine::_submit_cumulative_compaction_task(const CloudTabletS auto st = compaction->prepare_compact(); if (!st.ok()) { long now = duration_cast(system_clock::now().time_since_epoch()).count(); - if (st.is()) { + if (st.is() && + st.msg() != "_last_delete_version.first not equal to 
-1") { // Backoff strategy if no suitable version tablet->last_cumu_no_suitable_version_ms = now; } @@ -809,7 +826,7 @@ Status CloudStorageEngine::get_compaction_status_json(std::string* result) { // cumu std::string_view cumu = "CumulativeCompaction"; rapidjson::Value cumu_key; - cumu_key.SetString(cumu.data(), cumu.length(), root.GetAllocator()); + cumu_key.SetString(cumu.data(), cast_set(cumu.length()), root.GetAllocator()); rapidjson::Document cumu_arr; cumu_arr.SetArray(); for (auto& [tablet_id, v] : _submitted_cumu_compactions) { @@ -821,7 +838,7 @@ Status CloudStorageEngine::get_compaction_status_json(std::string* result) { // base std::string_view base = "BaseCompaction"; rapidjson::Value base_key; - base_key.SetString(base.data(), base.length(), root.GetAllocator()); + base_key.SetString(base.data(), cast_set(base.length()), root.GetAllocator()); rapidjson::Document base_arr; base_arr.SetArray(); for (auto& [tablet_id, _] : _submitted_base_compactions) { @@ -844,4 +861,5 @@ std::shared_ptr CloudStorageEngine::cumu_compac return _cumulative_compaction_policies.at(compaction_policy); } +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/cloud/cloud_storage_engine.h b/be/src/cloud/cloud_storage_engine.h index 92d2917a916f6a..072b8366542253 100644 --- a/be/src/cloud/cloud_storage_engine.h +++ b/be/src/cloud/cloud_storage_engine.h @@ -57,7 +57,7 @@ class CloudStorageEngine final : public BaseStorageEngine { Result get_tablet(int64_t tablet_id) override; - Status start_bg_threads() override; + Status start_bg_threads(std::shared_ptr wg_sptr = nullptr) override; Status set_cluster_id(int32_t cluster_id) override { _effective_cluster_id = cluster_id; diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index ebd1fea3dd9fac..93c7128756738c 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -50,6 +50,7 @@ #include "vec/common/schema_util.h" namespace doris { +#include 
"common/compile_check_begin.h" using namespace ErrorCode; static constexpr int COMPACTION_DELETE_BITMAP_LOCK_ID = -1; @@ -219,6 +220,7 @@ Status CloudTablet::sync_if_not_running() { } TabletSchemaSPtr CloudTablet::merged_tablet_schema() const { + std::shared_lock rlock(_meta_lock); return _merged_tablet_schema; } @@ -380,7 +382,7 @@ void CloudTablet::delete_rowsets(const std::vector& to_delete, _tablet_meta->modify_rs_metas({}, rs_metas, false); } -int CloudTablet::delete_expired_stale_rowsets() { +uint64_t CloudTablet::delete_expired_stale_rowsets() { std::vector expired_rowsets; int64_t expired_stale_sweep_endtime = ::time(nullptr) - config::tablet_rowset_stale_sweep_time_sec; @@ -397,8 +399,8 @@ int CloudTablet::delete_expired_stale_rowsets() { } for (int64_t path_id : path_ids) { - int start_version = -1; - int end_version = -1; + int64_t start_version = -1; + int64_t end_version = -1; // delete stale versions in version graph auto version_path = _timestamped_version_tracker.fetch_and_delete_path_by_id(path_id); for (auto& v_ts : version_path->timestamped_versions()) { @@ -449,6 +451,12 @@ void CloudTablet::recycle_cached_data(const std::vector& rowset if (config::enable_file_cache) { for (const auto& rs : rowsets) { + if (rs.use_count() >= 1) { + LOG(WARNING) << "Rowset " << rs->rowset_id().to_string() << " has " + << rs.use_count() + << " references. 
File Cache won't be recycled when query is using it."; + continue; + } for (int seg_id = 0; seg_id < rs->num_segments(); ++seg_id) { // TODO: Segment::file_cache_key auto file_key = Segment::file_cache_key(rs->rowset_id().to_string(), seg_id); @@ -533,7 +541,7 @@ Result> CloudTablet::create_transient_rowset_write return RowsetFactory::create_rowset_writer(_engine, context, false) .transform([&](auto&& writer) { - writer->set_segment_start_id(rowset.num_segments()); + writer->set_segment_start_id(cast_set(rowset.num_segments())); return writer; }); } @@ -611,7 +619,8 @@ void CloudTablet::get_compaction_status(std::string* json_result) { } rapidjson::Value value; std::string version_str = rowset->get_rowset_info_str(); - value.SetString(version_str.c_str(), version_str.length(), versions_arr.GetAllocator()); + value.SetString(version_str.c_str(), cast_set(version_str.length()), + versions_arr.GetAllocator()); versions_arr.PushBack(value, versions_arr.GetAllocator()); last_version = ver.second; } @@ -624,7 +633,7 @@ void CloudTablet::get_compaction_status(std::string* json_result) { for (auto& rowset : stale_rowsets) { rapidjson::Value value; std::string version_str = rowset->get_rowset_info_str(); - value.SetString(version_str.c_str(), version_str.length(), + value.SetString(version_str.c_str(), cast_set(version_str.length()), stale_versions_arr.GetAllocator()); stale_versions_arr.PushBack(value, stale_versions_arr.GetAllocator()); } @@ -684,7 +693,8 @@ CalcDeleteBitmapExecutor* CloudTablet::calc_delete_bitmap_executor() { Status CloudTablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t txn_id, DeleteBitmapPtr delete_bitmap, RowsetWriter* rowset_writer, - const RowsetIdUnorderedSet& cur_rowset_ids) { + const RowsetIdUnorderedSet& cur_rowset_ids, + int64_t lock_id) { RowsetSharedPtr rowset = txn_info->rowset; int64_t cur_version = rowset->start_version(); // update delete bitmap info, in order to avoid recalculation when trying again @@ -708,8 +718,9 @@ 
Status CloudTablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t tx } } + auto ms_lock_id = lock_id == -1 ? txn_id : lock_id; RETURN_IF_ERROR(_engine.meta_mgr().update_delete_bitmap( - *this, txn_id, COMPACTION_DELETE_BITMAP_LOCK_ID, new_delete_bitmap.get())); + *this, ms_lock_id, COMPACTION_DELETE_BITMAP_LOCK_ID, new_delete_bitmap.get())); // store the delete bitmap with sentinel marks in txn_delete_bitmap_cache because if the txn is retried for some reason, // it will use the delete bitmap from txn_delete_bitmap_cache when re-calculating the delete bitmap, during which it will do @@ -767,7 +778,8 @@ Status CloudTablet::calc_delete_bitmap_for_compaction( } std::unique_ptr> location_map; - if (config::enable_rowid_conversion_correctness_check) { + if (config::enable_rowid_conversion_correctness_check && + tablet_schema()->cluster_key_uids().empty()) { location_map = std::make_unique>(); LOG(INFO) << "Location Map inited succ for tablet:" << tablet_id(); } @@ -916,4 +928,5 @@ void CloudTablet::build_tablet_report_info(TTabletInfo* tablet_info) { // but it may be used in the future. 
} +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index 0fde2f5b1d93ff..fc0d64a493d316 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -92,7 +92,7 @@ class CloudTablet final : public BaseTablet { void clear_cache() override; // Return number of deleted stale rowsets - int delete_expired_stale_rowsets(); + uint64_t delete_expired_stale_rowsets(); bool has_stale_rowsets() const { return !_stale_rs_version_map.empty(); } @@ -170,7 +170,8 @@ class CloudTablet final : public BaseTablet { Status save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t txn_id, DeleteBitmapPtr delete_bitmap, RowsetWriter* rowset_writer, - const RowsetIdUnorderedSet& cur_rowset_ids) override; + const RowsetIdUnorderedSet& cur_rowset_ids, + int64_t lock_id = -1) override; Status calc_delete_bitmap_for_compaction(const std::vector& input_rowsets, const RowsetSharedPtr& output_rowset, diff --git a/be/src/cloud/config.cpp b/be/src/cloud/config.cpp index 32e3250f87c258..f90bf536f63018 100644 --- a/be/src/cloud/config.cpp +++ b/be/src/cloud/config.cpp @@ -20,6 +20,7 @@ #include "common/status.h" namespace doris::config { +#include "common/compile_check_begin.h" DEFINE_String(deploy_mode, ""); DEFINE_mString(cloud_unique_id, ""); @@ -28,7 +29,7 @@ DEFINE_Bool(meta_service_use_load_balancer, "false"); DEFINE_mInt32(meta_service_rpc_timeout_ms, "10000"); DEFINE_Bool(meta_service_connection_pooled, "true"); DEFINE_mInt64(meta_service_connection_pool_size, "20"); -DEFINE_mInt32(meta_service_connection_age_base_minutes, "5"); +DEFINE_mInt32(meta_service_connection_age_base_seconds, "30"); DEFINE_mInt32(meta_service_idle_connection_timeout_ms, "0"); DEFINE_mInt32(meta_service_rpc_retry_times, "200"); DEFINE_mInt32(meta_service_brpc_timeout_ms, "10000"); @@ -76,4 +77,5 @@ DEFINE_mInt32(tablet_txn_info_min_expired_seconds, "120"); DEFINE_mBool(enable_use_cloud_unique_id_from_fe, "true"); 
DEFINE_mBool(enable_cloud_tablet_report, "true"); +#include "common/compile_check_end.h" } // namespace doris::config diff --git a/be/src/cloud/config.h b/be/src/cloud/config.h index 8af967afb8c67b..a8a7c0c48ec91f 100644 --- a/be/src/cloud/config.h +++ b/be/src/cloud/config.h @@ -20,6 +20,7 @@ #include "common/config.h" namespace doris::config { +#include "common/compile_check_begin.h" DECLARE_String(deploy_mode); // deprecated do not configure directly @@ -34,7 +35,7 @@ static inline bool is_cloud_mode() { // If meta services are deployed behind a load balancer, set this config to "host:port" of the load balancer. // Here is a set of configs to configure the connection behaviors: // - meta_service_connection_pooled: distribute the long connections to different RS of the VIP. -// - meta_service_connection_age_base_minutes: expire the connection after a random time during [base, 2*base], +// - meta_service_connection_age_base_seconds: expire the connection after a random time during [base, 2*base], // so that the BE has a chance to connect to a new RS. (When you add a new RS, the BE will connect to it) // - meta_service_idle_connection_timeout_ms: rebuild the idle connections after the timeout exceeds. Some LB // vendors will reset the connection if it is idle for a long time. @@ -50,7 +51,7 @@ DECLARE_mInt64(meta_service_connection_pool_size); // has a chance to connect to a new RS. Set zero to disable it. // // Only works when meta_service_endpoint is set to a single host. -DECLARE_mInt32(meta_service_connection_age_base_minutes); +DECLARE_mInt32(meta_service_connection_age_base_seconds); // Rebuild the idle connections after the timeout exceeds. Set zero to disable it. // // Only works when meta_service_endpoint is set to a single host. 
@@ -110,4 +111,5 @@ DECLARE_mBool(enable_use_cloud_unique_id_from_fe); DECLARE_Bool(enable_cloud_tablet_report); +#include "common/compile_check_end.h" } // namespace doris::config diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp index bff7d8388d30d8..e655ceacf2f08d 100644 --- a/be/src/cloud/pb_convert.cpp +++ b/be/src/cloud/pb_convert.cpp @@ -324,7 +324,7 @@ void doris_tablet_schema_to_cloud(TabletSchemaCloudPB* out, const TabletSchemaPB out->set_store_row_column(in.store_row_column()); out->set_enable_single_replica_compaction(in.enable_single_replica_compaction()); out->set_skip_write_index_on_load(in.skip_write_index_on_load()); - out->mutable_cluster_key_idxes()->CopyFrom(in.cluster_key_idxes()); + out->mutable_cluster_key_uids()->CopyFrom(in.cluster_key_uids()); out->set_is_dynamic_schema(in.is_dynamic_schema()); out->mutable_row_store_column_unique_ids()->CopyFrom(in.row_store_column_unique_ids()); out->set_inverted_index_storage_format(in.inverted_index_storage_format()); @@ -353,7 +353,7 @@ void doris_tablet_schema_to_cloud(TabletSchemaCloudPB* out, TabletSchemaPB&& in) out->set_store_row_column(in.store_row_column()); out->set_enable_single_replica_compaction(in.enable_single_replica_compaction()); out->set_skip_write_index_on_load(in.skip_write_index_on_load()); - out->mutable_cluster_key_idxes()->Swap(in.mutable_cluster_key_idxes()); + out->mutable_cluster_key_uids()->Swap(in.mutable_cluster_key_uids()); out->set_is_dynamic_schema(in.is_dynamic_schema()); out->mutable_row_store_column_unique_ids()->Swap(in.mutable_row_store_column_unique_ids()); out->set_inverted_index_storage_format(in.inverted_index_storage_format()); @@ -395,7 +395,7 @@ void cloud_tablet_schema_to_doris(TabletSchemaPB* out, const TabletSchemaCloudPB out->set_store_row_column(in.store_row_column()); out->set_enable_single_replica_compaction(in.enable_single_replica_compaction()); out->set_skip_write_index_on_load(in.skip_write_index_on_load()); - 
out->mutable_cluster_key_idxes()->CopyFrom(in.cluster_key_idxes()); + out->mutable_cluster_key_uids()->CopyFrom(in.cluster_key_uids()); out->set_is_dynamic_schema(in.is_dynamic_schema()); out->mutable_row_store_column_unique_ids()->CopyFrom(in.row_store_column_unique_ids()); out->set_inverted_index_storage_format(in.inverted_index_storage_format()); @@ -425,7 +425,7 @@ void cloud_tablet_schema_to_doris(TabletSchemaPB* out, TabletSchemaCloudPB&& in) out->set_store_row_column(in.store_row_column()); out->set_enable_single_replica_compaction(in.enable_single_replica_compaction()); out->set_skip_write_index_on_load(in.skip_write_index_on_load()); - out->mutable_cluster_key_idxes()->Swap(in.mutable_cluster_key_idxes()); + out->mutable_cluster_key_uids()->Swap(in.mutable_cluster_key_uids()); out->set_is_dynamic_schema(in.is_dynamic_schema()); out->mutable_row_store_column_unique_ids()->Swap(in.mutable_row_store_column_unique_ids()); out->set_inverted_index_storage_format(in.inverted_index_storage_format()); diff --git a/be/src/clucene b/be/src/clucene index 7cf6cf410d41d9..a506dbb6c523aa 160000 --- a/be/src/clucene +++ b/be/src/clucene @@ -1 +1 @@ -Subproject commit 7cf6cf410d41d95456edba263cc55b7b6f5ab027 +Subproject commit a506dbb6c523aa65044eb1c527a066d236172543 diff --git a/be/src/common/cast_set.h b/be/src/common/cast_set.h index 08a9c8fbb7ebaa..dcc744d8122a3a 100644 --- a/be/src/common/cast_set.h +++ b/be/src/common/cast_set.h @@ -61,6 +61,12 @@ void cast_set(T& a, U b) { a = static_cast(b); } +template + requires std::is_floating_point_v and std::is_integral_v +void cast_set(T& a, U b) { + a = static_cast(b); +} + template requires std::is_integral_v && std::is_integral_v T cast_set(U b) { @@ -70,4 +76,10 @@ T cast_set(U b) { return static_cast(b); } +template + requires std::is_floating_point_v and std::is_integral_v +T cast_set(U b) { + return static_cast(b); +} + } // namespace doris diff --git a/be/src/common/cgroup_memory_ctl.cpp 
b/be/src/common/cgroup_memory_ctl.cpp index a29432bdb4ede5..dddcbd50338d82 100644 --- a/be/src/common/cgroup_memory_ctl.cpp +++ b/be/src/common/cgroup_memory_ctl.cpp @@ -27,6 +27,7 @@ #include "common/status.h" #include "util/cgroup_util.h" +#include "util/error_util.h" namespace doris { @@ -84,14 +85,23 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { : _mount_file_dir(std::move(mount_file_dir)) {} Status read_memory_limit(int64_t* value) override { - RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file((_mount_file_dir / "memory.max"), - value)); + std::filesystem::path file_path = _mount_file_dir / "memory.max"; + std::string line; + std::ifstream file_stream(file_path, std::ios::in); + getline(file_stream, line); + if (file_stream.fail() || file_stream.bad()) { + return Status::CgroupError("Error reading {}: {}", file_path.string(), + get_str_err_msg()); + } + if (line == "max") { + *value = std::numeric_limits::max(); + return Status::OK(); + } + RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file(file_path, value)); return Status::OK(); } Status read_memory_usage(int64_t* value) override { - // memory.current contains a single number - // the reason why we subtract it described here: https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667 RETURN_IF_ERROR(CGroupUtil::read_int_line_from_cgroup_file( (_mount_file_dir / "memory.current"), value)); std::unordered_map metrics_map; @@ -100,7 +110,12 @@ struct CgroupsV2Reader : CGroupMemoryCtl::ICgroupsReader { if (*value < metrics_map["inactive_file"]) { return Status::CgroupError("CgroupsV2Reader read_memory_usage negative memory usage"); } + // the reason why we subtract inactive_file described here: + // https://github.com/ClickHouse/ClickHouse/issues/64652#issuecomment-2149630667 *value -= metrics_map["inactive_file"]; + // Part of "slab" that might be reclaimed, such as dentries and inodes. 
+ // https://arthurchiao.art/blog/cgroupv2-zh/ + *value -= metrics_map["slab_reclaimable"]; return Status::OK(); } diff --git a/be/src/common/compile_check_begin.h b/be/src/common/compile_check_begin.h index 6da403f2894885..4d860d39d1cf72 100644 --- a/be/src/common/compile_check_begin.h +++ b/be/src/common/compile_check_begin.h @@ -23,8 +23,9 @@ #ifdef __clang__ #pragma clang diagnostic push #pragma clang diagnostic error "-Wconversion" +#pragma clang diagnostic error "-Wshadow" #pragma clang diagnostic ignored "-Wsign-conversion" #pragma clang diagnostic ignored "-Wfloat-conversion" #endif -//#include "common/compile_check_begin.h" \ No newline at end of file +//#include "common/compile_check_begin.h" diff --git a/be/src/common/compile_check_end.h b/be/src/common/compile_check_end.h index 0897965dc74a3d..40df41b6bdfc6c 100644 --- a/be/src/common/compile_check_end.h +++ b/be/src/common/compile_check_end.h @@ -20,4 +20,4 @@ #endif #undef COMPILE_CHECK -// #include "common/compile_check_end.h" \ No newline at end of file +// #include "common/compile_check_end.h" diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index a37a006acf0b6f..95a3e61fb5517a 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -63,8 +63,29 @@ DEFINE_Int32(brpc_port, "8060"); DEFINE_Int32(arrow_flight_sql_port, "-1"); -DEFINE_mString(public_access_ip, ""); -DEFINE_Int32(public_access_port, "-1"); +// If the external client cannot directly access priority_networks, set public_host to be accessible +// to external client. +// There are usually two usage scenarios: +// 1. in production environment, it is often inconvenient to expose Doris BE nodes to the external network. +// However, a reverse proxy (such as Nginx) can be added to all Doris BE nodes, and the external client will be +// randomly routed to a Doris BE node when connecting to Nginx. set public_host to the host of Nginx. +// 2. 
if priority_networks is an internal network IP, and BE node has its own independent external IP, +// but Doris currently does not support modifying priority_networks, setting public_host to the real external IP. +DEFINE_mString(public_host, ""); + +// If the BE node is connected to the external network through a reverse proxy like Nginx +// and need to use Arrow Flight SQL, should add a server in Nginx to reverse proxy +// `Nginx:arrow_flight_sql_proxy_port` to `BE_priority_networks:arrow_flight_sql_port`. For example: +// upstream arrowflight { +// server 10.16.10.8:8069; +// server 10.16.10.8:8068; +//} +// server { +// listen 8167 http2; +// listen [::]:8167 http2; +// server_name doris.arrowflight.com; +// } +DEFINE_Int32(arrow_flight_sql_proxy_port, "-1"); // the number of bthreads for brpc, the default value is set to -1, // which means the number of bthreads is #cpu-cores @@ -229,6 +250,8 @@ DEFINE_mInt32(max_download_speed_kbps, "50000"); DEFINE_mInt32(download_low_speed_limit_kbps, "50"); // download low speed time(seconds) DEFINE_mInt32(download_low_speed_time, "300"); +// whether to download small files in batch +DEFINE_mBool(enable_batch_download, "true"); DEFINE_String(sys_log_dir, ""); DEFINE_String(user_function_dir, "${DORIS_HOME}/lib/udf"); @@ -397,7 +420,9 @@ DEFINE_mInt64(base_compaction_max_compaction_score, "20"); DEFINE_mDouble(base_compaction_min_data_ratio, "0.3"); DEFINE_mInt64(base_compaction_dup_key_max_file_size_mbytes, "1024"); -DEFINE_Bool(enable_skip_tablet_compaction, "false"); +DEFINE_Bool(enable_skip_tablet_compaction, "true"); +DEFINE_mInt32(skip_tablet_compaction_second, "10"); + // output rowset of cumulative compaction total disk size exceed this config size, // this rowset will be given to base compaction, unit is m byte. 
DEFINE_mInt64(compaction_promotion_size_mbytes, "1024"); @@ -430,10 +455,10 @@ DEFINE_mInt32(cumulative_compaction_max_deltas_factor, "10"); DEFINE_mInt32(multi_get_max_threads, "10"); // The upper limit of "permits" held by all compaction tasks. This config can be set to limit memory consumption for compaction. -DEFINE_mInt64(total_permits_for_compaction_score, "10000"); +DEFINE_mInt64(total_permits_for_compaction_score, "1000000"); // sleep interval in ms after generated compaction tasks -DEFINE_mInt32(generate_compaction_tasks_interval_ms, "10"); +DEFINE_mInt32(generate_compaction_tasks_interval_ms, "100"); // sleep interval in second after update replica infos DEFINE_mInt32(update_replica_infos_interval_seconds, "60"); @@ -531,6 +556,8 @@ DEFINE_Bool(enable_brpc_builtin_services, "true"); // Enable brpc connection check DEFINE_Bool(enable_brpc_connection_check, "false"); +DEFINE_mInt64(brpc_connection_check_timeout_ms, "10000"); + // The maximum amount of data that can be processed by a stream load DEFINE_mInt64(streaming_load_max_mb, "102400"); // Some data formats, such as JSON, cannot be streamed. 
@@ -936,8 +963,6 @@ DEFINE_Int32(doris_remote_scanner_thread_pool_thread_num, "48"); // number of s3 scanner thread pool queue size DEFINE_Int32(doris_remote_scanner_thread_pool_queue_size, "102400"); DEFINE_mInt64(block_cache_wait_timeout_ms, "1000"); -DEFINE_mInt64(cache_lock_long_tail_threshold, "1000"); -DEFINE_Int64(file_cache_recycle_keys_size, "1000000"); // limit the queue of pending batches which will be sent by a single nodechannel DEFINE_mInt64(nodechannel_pending_queue_max_bytes, "67108864"); @@ -1004,7 +1029,7 @@ DEFINE_mBool(variant_throw_exeception_on_invalid_json, "false"); DEFINE_Bool(enable_file_cache, "false"); // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240}] // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240},{"path":"/path/to/file_cache2","total_size":21474836480,"query_limit":10737418240}] -// format: {"path": "/path/to/file_cache", "total_size":53687091200, "normal_percent":85, "disposable_percent":10, "index_percent":5} +// format: {"path": "/path/to/file_cache", "total_size":53687091200, "ttl_percent":50, "normal_percent":40, "disposable_percent":5, "index_percent":5} // format: [{"path": "xxx", "total_size":53687091200, "storage": "memory"}] // Note1: storage is "disk" by default // Note2: when the storage is "memory", the path is ignored. 
So you can set xxx to anything you like @@ -1020,7 +1045,7 @@ DEFINE_Int64(file_cache_each_block_size, "1048576"); // 1MB DEFINE_Bool(clear_file_cache, "false"); DEFINE_Bool(enable_file_cache_query_limit, "false"); -DEFINE_mInt32(file_cache_enter_disk_resource_limit_mode_percent, "90"); +DEFINE_mInt32(file_cache_enter_disk_resource_limit_mode_percent, "88"); DEFINE_mInt32(file_cache_exit_disk_resource_limit_mode_percent, "80"); DEFINE_mBool(enable_read_cache_file_directly, "false"); DEFINE_mBool(file_cache_enable_evict_from_other_queue_by_size, "true"); @@ -1031,6 +1056,9 @@ DEFINE_Bool(enable_ttl_cache_evict_using_lru, "true"); DEFINE_mBool(enbale_dump_error_file, "true"); // limit the max size of error log on disk DEFINE_mInt64(file_cache_error_log_limit_bytes, "209715200"); // 200MB +DEFINE_mInt64(cache_lock_long_tail_threshold, "1000"); +DEFINE_Int64(file_cache_recycle_keys_size, "1000000"); +DEFINE_mBool(enable_file_cache_keep_base_compaction_output, "false"); DEFINE_mInt32(index_cache_entry_stay_time_after_lookup_s, "1800"); DEFINE_mInt32(inverted_index_cache_stale_sweep_time_sec, "600"); @@ -1140,6 +1168,9 @@ DEFINE_mBool(enable_missing_rows_correctness_check, "false"); // When the number of missing versions is more than this value, do not directly // retry the publish and handle it through async publish. DEFINE_mInt32(mow_publish_max_discontinuous_version_num, "20"); +// When the size of primary keys in memory exceeds this value, finish current segment +// and create a new segment, used in compaction. Default 50MB. 
+DEFINE_mInt64(mow_primary_key_index_max_size_in_memory, "52428800"); // When the version is not continuous for MOW table in publish phase and the gap between // current txn's publishing version and the max version of the tablet exceeds this value, // don't print warning log @@ -1180,11 +1211,13 @@ DEFINE_Bool(exit_on_exception, "false"); DEFINE_Bool(enable_flush_file_cache_async, "true"); // cgroup -DEFINE_mString(doris_cgroup_cpu_path, ""); +DEFINE_String(doris_cgroup_cpu_path, ""); DEFINE_mBool(enable_be_proc_monitor, "false"); DEFINE_mInt32(be_proc_monitor_interval_ms, "10000"); +DEFINE_Int32(workload_group_metrics_interval_ms, "5000"); + DEFINE_mBool(enable_workload_group_memory_gc, "true"); DEFINE_Bool(ignore_always_true_predicate_for_segment, "true"); @@ -1301,8 +1334,6 @@ DEFINE_Int64(num_buffered_reader_prefetch_thread_pool_max_thread, "64"); DEFINE_Int64(num_s3_file_upload_thread_pool_min_thread, "16"); // The max thread num for S3FileUploadThreadPool DEFINE_Int64(num_s3_file_upload_thread_pool_max_thread, "64"); -// The max ratio for ttl cache's size -DEFINE_mInt64(max_ttl_cache_ratio, "50"); // The maximum jvm heap usage ratio for hdfs write workload DEFINE_mDouble(max_hdfs_wirter_jni_heap_usage_ratio, "0.5"); // The sleep milliseconds duration when hdfs write exceeds the maximum usage @@ -1370,6 +1401,8 @@ DEFINE_Int32(query_cache_size, "512"); DEFINE_mBool(enable_delete_bitmap_merge_on_compaction, "false"); // Enable validation to check the correctness of table size. DEFINE_Bool(enable_table_size_correctness_check, "false"); +DEFINE_Bool(force_regenerate_rowsetid_on_start_error, "false"); +DEFINE_mBool(enable_sleep_between_delete_cumu_compaction, "false"); // clang-format off #ifdef BE_TEST diff --git a/be/src/common/config.h b/be/src/common/config.h index 63d62b219c12f8..f8a9c3f7480b33 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -100,11 +100,29 @@ DECLARE_Int32(brpc_port); // Default -1, do not start arrow flight sql server. 
DECLARE_Int32(arrow_flight_sql_port); -// If priority_networks is incorrect but cannot be modified, set public_access_ip as BEโ€™s real IP. -// For ADBC client fetch result, default is empty, the ADBC client uses the backend ip to fetch the result. -// If ADBC client cannot access the backend ip, can set public_access_ip to modify the fetch result ip. -DECLARE_mString(public_access_ip); -DECLARE_Int32(public_access_port); +// If the external client cannot directly access priority_networks, set public_host to be accessible +// to external client. +// There are usually two usage scenarios: +// 1. in production environment, it is often inconvenient to expose Doris BE nodes to the external network. +// However, a reverse proxy (such as Nginx) can be added to all Doris BE nodes, and the external client will be +// randomly routed to a Doris BE node when connecting to Nginx. set public_host to the host of Nginx. +// 2. if priority_networks is an internal network IP, and BE node has its own independent external IP, +// but Doris currently does not support modifying priority_networks, setting public_host to the real external IP. +DECLARE_mString(public_host); + +// If the BE node is connected to the external network through a reverse proxy like Nginx +// and need to use Arrow Flight SQL, should add a server in Nginx to reverse proxy +// `Nginx:arrow_flight_sql_proxy_port` to `BE_priority_networks:arrow_flight_sql_port`. 
For example: +// upstream arrowflight { +// server 10.16.10.8:8069; +// server 10.16.10.8:8068; +//} +// server { +// listen 8167 http2; +// listen [::]:8167 http2; +// server_name doris.arrowflight.com; +// } +DECLARE_Int32(arrow_flight_sql_proxy_port); // the number of bthreads for brpc, the default value is set to -1, // which means the number of bthreads is #cpu-cores @@ -280,6 +298,8 @@ DECLARE_mInt32(max_download_speed_kbps); DECLARE_mInt32(download_low_speed_limit_kbps); // download low speed time(seconds) DECLARE_mInt32(download_low_speed_time); +// whether to download small files in batch. +DECLARE_mBool(enable_batch_download); // deprecated, use env var LOG_DIR in be.conf DECLARE_String(sys_log_dir); @@ -452,6 +472,7 @@ DECLARE_mDouble(base_compaction_min_data_ratio); DECLARE_mInt64(base_compaction_dup_key_max_file_size_mbytes); DECLARE_Bool(enable_skip_tablet_compaction); +DECLARE_mInt32(skip_tablet_compaction_second); // output rowset of cumulative compaction total disk size exceed this config size, // this rowset will be given to base compaction, unit is m byte. DECLARE_mInt64(compaction_promotion_size_mbytes); @@ -992,13 +1013,13 @@ DECLARE_mInt64(nodechannel_pending_queue_max_bytes); // The batch size for sending data by brpc streaming client DECLARE_mInt64(brpc_streaming_client_batch_bytes); DECLARE_mInt64(block_cache_wait_timeout_ms); -DECLARE_mInt64(cache_lock_long_tail_threshold); -DECLARE_Int64(file_cache_recycle_keys_size); DECLARE_Bool(enable_brpc_builtin_services); DECLARE_Bool(enable_brpc_connection_check); +DECLARE_mInt64(brpc_connection_check_timeout_ms); + // Max waiting time to wait the "plan fragment start" rpc. // If timeout, the fragment will be cancelled. 
// This parameter is usually only used when the FE loses connection, @@ -1050,7 +1071,7 @@ DECLARE_Int32(pipeline_executor_size); DECLARE_Bool(enable_file_cache); // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240}] // format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240},{"path":"/path/to/file_cache2","total_size":21474836480,"query_limit":10737418240}] -// format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240,"normal_percent":85, "disposable_percent":10, "index_percent":5}] +// format: [{"path":"/path/to/file_cache","total_size":21474836480,"query_limit":10737418240, "ttl_percent":50, "normal_percent":40, "disposable_percent":5, "index_percent":5}] // format: [{"path": "xxx", "total_size":53687091200, "storage": "memory"}] // Note1: storage is "disk" by default // Note2: when the storage is "memory", the path is ignored. So you can set xxx to anything you like @@ -1076,6 +1097,15 @@ DECLARE_Bool(enable_ttl_cache_evict_using_lru); DECLARE_mBool(enbale_dump_error_file); // limit the max size of error log on disk DECLARE_mInt64(file_cache_error_log_limit_bytes); +DECLARE_mInt64(cache_lock_long_tail_threshold); +DECLARE_Int64(file_cache_recycle_keys_size); +// Base compaction may retrieve and produce some less frequently accessed data, +// potentially affecting the file cache hit rate. +// This configuration determines whether to retain the output within the file cache. +// Make your choice based on the following considerations: +// If your file cache is ample enough to accommodate all the data in your database, +// enable this option; otherwise, it is recommended to leave it disabled. 
+DECLARE_mBool(enable_file_cache_keep_base_compaction_output); // inverted index searcher cache // cache entry stay time after lookup @@ -1208,6 +1238,9 @@ DECLARE_mBool(enable_missing_rows_correctness_check); // When the number of missing versions is more than this value, do not directly // retry the publish and handle it through async publish. DECLARE_mInt32(mow_publish_max_discontinuous_version_num); +// When the size of primary keys in memory exceeds this value, finish current segment +// and create a new segment, used in compaction. +DECLARE_mInt64(mow_primary_key_index_max_size_in_memory); // When the version is not continuous for MOW table in publish phase and the gap between // current txn's publishing version and the max version of the tablet exceeds this value, // don't print warning log @@ -1259,9 +1292,10 @@ DECLARE_mInt32(tablet_schema_cache_capacity); DECLARE_mBool(exit_on_exception); // cgroup -DECLARE_mString(doris_cgroup_cpu_path); +DECLARE_String(doris_cgroup_cpu_path); DECLARE_mBool(enable_be_proc_monitor); DECLARE_mInt32(be_proc_monitor_interval_ms); +DECLARE_Int32(workload_group_metrics_interval_ms); DECLARE_mBool(enable_workload_group_memory_gc); @@ -1382,8 +1416,6 @@ DECLARE_Int64(num_buffered_reader_prefetch_thread_pool_max_thread); DECLARE_Int64(num_s3_file_upload_thread_pool_min_thread); // The max thread num for S3FileUploadThreadPool DECLARE_Int64(num_s3_file_upload_thread_pool_max_thread); -// The max ratio for ttl cache's size -DECLARE_mInt64(max_ttl_cache_ratio); // The maximum jvm heap usage ratio for hdfs write workload DECLARE_mDouble(max_hdfs_wirter_jni_heap_usage_ratio); // The sleep milliseconds duration when hdfs write exceeds the maximum usage @@ -1450,10 +1482,13 @@ DECLARE_mInt32(check_score_rounds_num); // MB DECLARE_Int32(query_cache_size); +DECLARE_Bool(force_regenerate_rowsetid_on_start_error); DECLARE_mBool(enable_delete_bitmap_merge_on_compaction); // Enable validation to check the correctness of table size. 
DECLARE_Bool(enable_table_size_correctness_check); +// Enable sleep 5s between delete cumulative compaction. +DECLARE_mBool(enable_sleep_between_delete_cumu_compaction); #ifdef BE_TEST // test s3 diff --git a/be/src/common/daemon.cpp b/be/src/common/daemon.cpp index ce2a6878dba034..12bf1749a5694d 100644 --- a/be/src/common/daemon.cpp +++ b/be/src/common/daemon.cpp @@ -437,6 +437,8 @@ void Daemon::calculate_metrics_thread() { // update lst map DorisMetrics::instance()->system_metrics()->get_network_traffic( &lst_net_send_bytes, &lst_net_receive_bytes); + + DorisMetrics::instance()->system_metrics()->update_be_avail_cpu_num(); } update_rowsets_and_segments_num_metrics(); } @@ -500,15 +502,18 @@ void Daemon::cache_adjust_capacity_thread() { void Daemon::cache_prune_stale_thread() { int32_t interval = config::cache_periodic_prune_stale_sweep_sec; while (!_stop_background_threads_latch.wait_for(std::chrono::seconds(interval))) { - if (interval <= 0) { - LOG(WARNING) << "config of cache clean interval is illegal: [" << interval - << "], force set to 3600 "; - interval = 3600; + if (config::cache_periodic_prune_stale_sweep_sec <= 0) { + LOG(WARNING) << "config of cache clean interval is: [" << interval + << "], it means the cache prune stale thread is disabled, will wait 3s " + "and check again."; + interval = 3; + continue; } if (config::disable_memory_gc) { continue; } CacheManager::instance()->for_each_cache_prune_stale(); + interval = config::cache_periodic_prune_stale_sweep_sec; } } @@ -519,6 +524,13 @@ void Daemon::be_proc_monitor_thread() { } } +void Daemon::calculate_workload_group_metrics_thread() { + while (!_stop_background_threads_latch.wait_for( + std::chrono::milliseconds(config::workload_group_metrics_interval_ms))) { + ExecEnv::GetInstance()->workload_group_mgr()->refresh_workload_group_metrics(); + } +} + void Daemon::start() { Status st; st = Thread::create( @@ -567,6 +579,12 @@ void Daemon::start() { &_threads.emplace_back()); } CHECK(st.ok()) << st; + 
+ st = Thread::create( + "Daemon", "workload_group_metrics", + [this]() { this->calculate_workload_group_metrics_thread(); }, + &_threads.emplace_back()); + CHECK(st.ok()) << st; } void Daemon::stop() { diff --git a/be/src/common/daemon.h b/be/src/common/daemon.h index fe723877dcd027..bd635f5a4b1920 100644 --- a/be/src/common/daemon.h +++ b/be/src/common/daemon.h @@ -47,6 +47,7 @@ class Daemon { void cache_prune_stale_thread(); void report_runtime_query_statistics_thread(); void be_proc_monitor_thread(); + void calculate_workload_group_metrics_thread(); CountDownLatch _stop_background_threads_latch; std::vector> _threads; diff --git a/be/src/common/status.h b/be/src/common/status.h index fac63b19f075ff..d059f289402cea 100644 --- a/be/src/common/status.h +++ b/be/src/common/status.h @@ -77,6 +77,7 @@ namespace ErrorCode { TStatusError(TABLET_MISSING, true); \ TStatusError(NOT_MASTER, true); \ TStatusError(OBTAIN_LOCK_FAILED, false); \ + TStatusError(SNAPSHOT_EXPIRED, false); \ TStatusError(DELETE_BITMAP_LOCK_ERROR, false); // E error_name, error_code, print_stacktrace #define APPLY_FOR_OLAP_ERROR_CODES(E) \ @@ -488,6 +489,7 @@ class [[nodiscard]] Status { ERROR_CTOR_NOSTACK(NeedSendAgain, NEED_SEND_AGAIN) ERROR_CTOR_NOSTACK(CgroupError, CGROUP_ERROR) ERROR_CTOR_NOSTACK(ObtainLockFailed, OBTAIN_LOCK_FAILED) + ERROR_CTOR_NOSTACK(NetworkError, NETWORK_ERROR) #undef ERROR_CTOR template @@ -568,7 +570,7 @@ class [[nodiscard]] Status { // and another thread is call to_string method, it may core, because the _err_msg is an unique ptr and // it is deconstructed during copy method. // And also we could not use lock, because we need get status frequently to check if it is cancelled. -// The defaule value is ok. +// The default value is ok. 
class AtomicStatus { public: AtomicStatus() : error_st_(Status::OK()) {} diff --git a/be/src/common/version_internal.cpp b/be/src/common/version_internal.cpp index 1190242b6aa687..55402fab209400 100644 --- a/be/src/common/version_internal.cpp +++ b/be/src/common/version_internal.cpp @@ -34,6 +34,9 @@ int doris_build_version_minor() { int doris_build_version_patch() { return DORIS_BUILD_VERSION_PATCH; } +int doris_build_version_hotfix() { + return DORIS_BUILD_VERSION_HOTFIX; +} const char* doris_build_version_rc_version() { return DORIS_BUILD_VERSION_RC_VERSION; } @@ -56,4 +59,4 @@ const char* doris_build_info() { } // namespace version -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/common/version_internal.h b/be/src/common/version_internal.h index 8852d26dba9531..f4deaa15aff545 100644 --- a/be/src/common/version_internal.h +++ b/be/src/common/version_internal.h @@ -24,6 +24,7 @@ extern const char* doris_build_version_prefix(); extern int doris_build_version_major(); extern int doris_build_version_minor(); extern int doris_build_version_patch(); +extern int doris_build_version_hotfix(); extern const char* doris_build_version_rc_version(); extern const char* doris_build_version(); @@ -34,4 +35,4 @@ extern const char* doris_build_info(); } // namespace version -} // namespace doris \ No newline at end of file +} // namespace doris diff --git a/be/src/exec/decompressor.cpp b/be/src/exec/decompressor.cpp index 9365bb00288db1..5da2e6acbb9bdf 100644 --- a/be/src/exec/decompressor.cpp +++ b/be/src/exec/decompressor.cpp @@ -492,15 +492,15 @@ Status Lz4BlockDecompressor::decompress(uint8_t* input, size_t input_len, size_t auto* output_ptr = output; while (input_len > 0) { - //if faild , fall back to large block begin - auto* large_block_input_ptr = input_ptr; - auto* large_block_output_ptr = output_ptr; - if (input_len < sizeof(uint32_t)) { - return Status::InvalidArgument(strings::Substitute( - "fail to do hadoop-lz4 decompress, 
input_len=$0", input_len)); + *more_input_bytes = sizeof(uint32_t) - input_len; + break; } + //if faild, fall back to large block begin + auto* large_block_input_ptr = input_ptr; + auto* large_block_output_ptr = output_ptr; + uint32_t remaining_decompressed_large_block_len = BigEndian::Load32(input_ptr); input_ptr += sizeof(uint32_t); @@ -609,15 +609,15 @@ Status SnappyBlockDecompressor::decompress(uint8_t* input, size_t input_len, auto* output_ptr = output; while (input_len > 0) { - //if faild , fall back to large block begin - auto* large_block_input_ptr = input_ptr; - auto* large_block_output_ptr = output_ptr; - if (input_len < sizeof(uint32_t)) { - return Status::InvalidArgument(strings::Substitute( - "fail to do hadoop-snappy decompress, input_len=$0", input_len)); + *more_input_bytes = sizeof(uint32_t) - input_len; + break; } + //if faild, fall back to large block begin + auto* large_block_input_ptr = input_ptr; + auto* large_block_output_ptr = output_ptr; + uint32_t remaining_decompressed_large_block_len = BigEndian::Load32(input_ptr); input_ptr += sizeof(uint32_t); diff --git a/be/src/exec/lzo_decompressor.cpp b/be/src/exec/lzo_decompressor.cpp index b075509202b70f..b240e2995a0414 100644 --- a/be/src/exec/lzo_decompressor.cpp +++ b/be/src/exec/lzo_decompressor.cpp @@ -103,6 +103,7 @@ Status LzopDecompressor::decompress(uint8_t* input, size_t input_len, size_t* in ptr = get_uint32(ptr, &uncompressed_size); left_input_len -= sizeof(uint32_t); if (uncompressed_size == 0) { + *input_bytes_read += sizeof(uint32_t); *stream_end = true; return Status::OK(); } diff --git a/be/src/exec/schema_scanner.cpp b/be/src/exec/schema_scanner.cpp index 39dd45163322ac..4b430f04289d04 100644 --- a/be/src/exec/schema_scanner.cpp +++ b/be/src/exec/schema_scanner.cpp @@ -124,7 +124,6 @@ Status SchemaScanner::get_next_block_async(RuntimeState* state) { } SCOPED_ATTACH_TASK(state); _async_thread_running = true; - _finish_dependency->block(); if (!_opened) { _data_block = 
vectorized::Block::create_unique(); _init_block(_data_block.get()); @@ -140,9 +139,6 @@ Status SchemaScanner::get_next_block_async(RuntimeState* state) { _eos = eos; _async_thread_running = false; _dependency->set_ready(); - if (eos) { - _finish_dependency->set_ready(); - } })); return Status::OK(); } diff --git a/be/src/exec/schema_scanner.h b/be/src/exec/schema_scanner.h index 440912bff1d729..6e7a229b7fd7b9 100644 --- a/be/src/exec/schema_scanner.h +++ b/be/src/exec/schema_scanner.h @@ -106,11 +106,7 @@ class SchemaScanner { // factory function static std::unique_ptr create(TSchemaTableType::type type); TSchemaTableType::type type() const { return _schema_table_type; } - void set_dependency(std::shared_ptr dep, - std::shared_ptr fin_dep) { - _dependency = dep; - _finish_dependency = fin_dep; - } + void set_dependency(std::shared_ptr dep) { _dependency = dep; } Status get_next_block_async(RuntimeState* state); protected: @@ -139,7 +135,6 @@ class SchemaScanner { RuntimeProfile::Counter* _fill_block_timer = nullptr; std::shared_ptr _dependency = nullptr; - std::shared_ptr _finish_dependency = nullptr; std::unique_ptr _data_block; AtomicStatus _scanner_status; diff --git a/be/src/exec/schema_scanner/schema_columns_scanner.cpp b/be/src/exec/schema_scanner/schema_columns_scanner.cpp index 8325a7f5dc4f2d..b60dfc3d203f89 100644 --- a/be/src/exec/schema_scanner/schema_columns_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_columns_scanner.cpp @@ -450,7 +450,19 @@ Status SchemaColumnsScanner::_fill_block_impl(vectorized::Block* block) { RETURN_IF_ERROR(fill_dest_column_for_range(block, 4, datas)); } // COLUMN_DEFAULT - { RETURN_IF_ERROR(fill_dest_column_for_range(block, 5, null_datas)); } + { + std::vector strs(columns_num); + for (int i = 0; i < columns_num; ++i) { + if (_desc_result.columns[i].columnDesc.__isset.defaultValue) { + strs[i] = StringRef(_desc_result.columns[i].columnDesc.defaultValue.c_str(), + _desc_result.columns[i].columnDesc.defaultValue.length()); 
+ datas[i] = strs.data() + i; + } else { + datas[i] = nullptr; + } + } + RETURN_IF_ERROR(fill_dest_column_for_range(block, 5, datas)); + } // IS_NULLABLE { StringRef str_yes = StringRef("YES", 3); diff --git a/be/src/exec/schema_scanner/schema_tables_scanner.cpp b/be/src/exec/schema_scanner/schema_tables_scanner.cpp index 23710b81971c15..3aba0dfcc4f83c 100644 --- a/be/src/exec/schema_scanner/schema_tables_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_tables_scanner.cpp @@ -236,7 +236,7 @@ Status SchemaTablesScanner::_fill_block_impl(vectorized::Block* block) { std::vector srcs(table_num); for (int i = 0; i < table_num; ++i) { const TTableStatus& tbl_status = _table_result.tables[i]; - if (tbl_status.__isset.avg_row_length) { + if (tbl_status.__isset.data_length) { srcs[i] = tbl_status.data_length; datas[i] = srcs.data() + i; } else { @@ -248,7 +248,19 @@ Status SchemaTablesScanner::_fill_block_impl(vectorized::Block* block) { // max_data_length { RETURN_IF_ERROR(fill_dest_column_for_range(block, 10, null_datas)); } // index_length - { RETURN_IF_ERROR(fill_dest_column_for_range(block, 11, null_datas)); } + { + std::vector srcs(table_num); + for (int i = 0; i < table_num; ++i) { + const TTableStatus& tbl_status = _table_result.tables[i]; + if (tbl_status.__isset.index_length) { + srcs[i] = tbl_status.index_length; + datas[i] = srcs.data() + i; + } else { + datas[i] = nullptr; + } + } + RETURN_IF_ERROR(fill_dest_column_for_range(block, 11, datas)); + } // data_free { RETURN_IF_ERROR(fill_dest_column_for_range(block, 12, null_datas)); } // auto_increment diff --git a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp index 43562a8f52cbf1..481360eee90557 100644 --- a/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp +++ b/be/src/exec/schema_scanner/schema_workload_groups_scanner.cpp @@ -39,8 +39,8 @@ std::vector SchemaWorkloadGroupsScanner::_s_tbls_colu {"SCAN_THREAD_NUM", 
TYPE_BIGINT, sizeof(int64_t), true}, {"MAX_REMOTE_SCAN_THREAD_NUM", TYPE_BIGINT, sizeof(int64_t), true}, {"MIN_REMOTE_SCAN_THREAD_NUM", TYPE_BIGINT, sizeof(int64_t), true}, - {"SPILL_THRESHOLD_LOW_WATERMARK", TYPE_VARCHAR, sizeof(StringRef), true}, - {"SPILL_THRESHOLD_HIGH_WATERMARK", TYPE_VARCHAR, sizeof(StringRef), true}, + {"MEMORY_LOW_WATERMARK", TYPE_VARCHAR, sizeof(StringRef), true}, + {"MEMORY_HIGH_WATERMARK", TYPE_VARCHAR, sizeof(StringRef), true}, {"TAG", TYPE_VARCHAR, sizeof(StringRef), true}, {"READ_BYTES_PER_SECOND", TYPE_BIGINT, sizeof(int64_t), true}, {"REMOTE_READ_BYTES_PER_SECOND", TYPE_BIGINT, sizeof(int64_t), true}, diff --git a/be/src/exec/tablet_info.cpp b/be/src/exec/tablet_info.cpp index f1c0ad60e06455..acd923741eb73d 100644 --- a/be/src/exec/tablet_info.cpp +++ b/be/src/exec/tablet_info.cpp @@ -17,6 +17,7 @@ #include "exec/tablet_info.h" +#include #include #include #include @@ -180,6 +181,17 @@ Status OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema) { auto it = slots_map.find(to_lower(pcolumn_desc.name()) + "+" + data_type_str + is_null_str); if (it == std::end(slots_map)) { + std::string keys {}; + for (const auto& [key, _] : slots_map) { + keys += fmt::format("{},", key); + } + LOG_EVERY_SECOND(WARNING) << fmt::format( + "[OlapTableSchemaParam::init(const POlapTableSchemaParam& pschema)]: " + "unknown index column, column={}, type={}, data_type_str={}, " + "is_null_str={}, slots_map.keys()=[{}], {}\npschema={}", + pcolumn_desc.name(), pcolumn_desc.type(), data_type_str, is_null_str, + keys, debug_string(), pschema.ShortDebugString()); + return Status::InternalError("unknown index column, column={}, type={}", pcolumn_desc.name(), pcolumn_desc.type()); } @@ -286,6 +298,18 @@ Status OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema) { auto it = slots_map.find(to_lower(tcolumn_desc.column_name) + "+" + data_type_str + is_null_str); if (it == slots_map.end()) { + std::stringstream ss; + ss << tschema; + std::string 
keys {}; + for (const auto& [key, _] : slots_map) { + keys += fmt::format("{},", key); + } + LOG_EVERY_SECOND(WARNING) << fmt::format( + "[OlapTableSchemaParam::init(const TOlapTableSchemaParam& tschema)]: " + "unknown index column, column={}, type={}, data_type_str={}, " + "is_null_str={}, slots_map.keys()=[{}], {}\ntschema={}", + tcolumn_desc.column_name, tcolumn_desc.column_type.type, data_type_str, + is_null_str, keys, debug_string(), ss.str()); return Status::InternalError("unknown index column, column={}, type={}", tcolumn_desc.column_name, tcolumn_desc.column_type.type); diff --git a/be/src/exprs/bitmapfilter_predicate.h b/be/src/exprs/bitmapfilter_predicate.h index 5cb2b812220b10..8b161bf6213f40 100644 --- a/be/src/exprs/bitmapfilter_predicate.h +++ b/be/src/exprs/bitmapfilter_predicate.h @@ -30,11 +30,7 @@ namespace doris { // only used in Runtime Filter class BitmapFilterFuncBase : public RuntimeFilterFuncBase { public: - virtual void insert(const void* data) = 0; virtual void insert_many(const std::vector& bitmaps) = 0; - virtual bool empty() = 0; - virtual Status assign(BitmapValue* bitmap_value) = 0; - virtual void light_copy(BitmapFilterFuncBase* other) { _not_in = other->_not_in; } virtual uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number) = 0; virtual void find_batch(const char* data, const uint8* nullmap, size_t number, @@ -58,8 +54,6 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { ~BitmapFilterFunc() override = default; - void insert(const void* data) override; - void insert_many(const std::vector& bitmaps) override; uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, @@ -68,21 +62,8 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { void find_batch(const char* data, const uint8* nullmap, size_t number, uint8* results) const override; - bool empty() override { return _bitmap_value->empty(); } - - Status assign(BitmapValue* 
bitmap_value) override { - *_bitmap_value = *bitmap_value; - return Status::OK(); - } - - void light_copy(BitmapFilterFuncBase* bitmapfilter_func) override; - size_t size() const override { return _bitmap_value->cardinality(); } - uint64_t max() { return _bitmap_value->max(nullptr); } - - uint64_t min() { return _bitmap_value->min(nullptr); } - bool contains_any(CppType left, CppType right) { if (right < 0) { return false; @@ -90,23 +71,12 @@ class BitmapFilterFunc : public BitmapFilterFuncBase { return _bitmap_value->contains_any(std::max(left, (CppType)0), right); } - std::shared_ptr get_inner_bitmap() { return _bitmap_value; } - private: std::shared_ptr _bitmap_value; bool find(CppType data) const { return _not_in ^ (data >= 0 && _bitmap_value->contains(data)); } }; -template -void BitmapFilterFunc::insert(const void* data) { - if (data == nullptr) { - return; - } - - *_bitmap_value |= *reinterpret_cast(data); -} - template void BitmapFilterFunc::insert_many(const std::vector& bitmaps) { if (bitmaps.empty()) { @@ -147,12 +117,4 @@ void BitmapFilterFunc::find_batch(const char* data, const uint8* nullmap, } } -template -void BitmapFilterFunc::light_copy(BitmapFilterFuncBase* bitmapfilter_func) { - BitmapFilterFuncBase::light_copy(bitmapfilter_func); - auto other_func = reinterpret_cast(bitmapfilter_func); - _bitmap_value = other_func->_bitmap_value; - set_filter_id(bitmapfilter_func->get_filter_id()); -} - } // namespace doris diff --git a/be/src/exprs/bloom_filter_func.h b/be/src/exprs/bloom_filter_func.h index b5204fa767d59e..4d221f7bfe8421 100644 --- a/be/src/exprs/bloom_filter_func.h +++ b/be/src/exprs/bloom_filter_func.h @@ -17,6 +17,8 @@ #pragma once +#include "common/exception.h" +#include "common/status.h" #include "exprs/block_bloom_filter.hpp" #include "exprs/runtime_filter.h" #include "olap/rowset/segment_v2/bloom_filter.h" // IWYU pragma: keep @@ -24,17 +26,41 @@ #include "vec/common/string_ref.h" namespace doris { +// there are problems with the 
implementation of the old datetimev2. for compatibility reason, we will keep this code temporary. +struct fixed_len_to_uint32 { + template + uint32_t operator()(T value) { + if constexpr (sizeof(T) <= sizeof(uint32_t)) { + if constexpr (std::is_same_v>) { + return (uint32_t)value.to_int64(); + } else { + return (uint32_t)value; + } + } + return std::hash()(value); + } +}; + +struct fixed_len_to_uint32_v2 { + template + uint32_t operator()(T value) { + if constexpr (sizeof(T) <= sizeof(uint32_t)) { + if constexpr (std::is_same_v>) { + return (uint32_t)value.to_date_int_val(); + } else { + return (uint32_t)value; + } + } + return std::hash()(value); + } +}; class BloomFilterAdaptor { public: - BloomFilterAdaptor(bool null_aware = false) : _null_aware(null_aware) { + BloomFilterAdaptor(bool null_aware) : _null_aware(null_aware) { _bloom_filter = std::make_shared(); } - static int64_t optimal_bit_num(int64_t expect_num, double fpp) { - return doris::segment_v2::BloomFilter::optimal_bit_num(expect_num, fpp) / 8; - } - static BloomFilterAdaptor* create(bool null_aware) { return new BloomFilterAdaptor(null_aware); } @@ -55,27 +81,23 @@ class BloomFilterAdaptor { size_t size() { return _bloom_filter->directory().size; } - template - bool test(T data) const { - return _bloom_filter->find(data); - } + bool test(uint32_t data) const { return _bloom_filter->find(data); } - // test_element/find_element only used on vectorized engine - template + template bool test_element(T element) const { if constexpr (std::is_same_v) { return _bloom_filter->find(element); } else { - return _bloom_filter->find(HashUtil::fixed_len_to_uint32(element)); + return _bloom_filter->find(fixed_len_to_uint32_method()(element)); } } - template + template void add_element(T element) { if constexpr (std::is_same_v) { _bloom_filter->insert(element); } else { - _bloom_filter->insert(HashUtil::fixed_len_to_uint32(element)); + _bloom_filter->insert(fixed_len_to_uint32_method()(element)); } } @@ -155,19 
+177,18 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { return Status::InternalError("bloomfilter_func is nullptr"); } if (bloomfilter_func->_bloom_filter == nullptr) { - return Status::InternalError("bloomfilter_func->_bloom_filter is nullptr"); + return Status::InternalError( + "bloomfilter_func->_bloom_filter is nullptr, bloomfilter_func->inited: {}", + bloomfilter_func->_inited); } // If `_inited` is false, there is no memory allocated in bloom filter and this is the first // call for `merge` function. So we just reuse this bloom filter, and we don't need to // allocate memory again. if (!_inited) { - auto* other_func = static_cast(bloomfilter_func); if (_bloom_filter != nullptr) { - return Status::InternalError("_bloom_filter must is nullptr"); + return Status::InternalError("_bloom_filter must is nullptr, inited: {}", _inited); } - _bloom_filter = bloomfilter_func->_bloom_filter; - _bloom_filter_alloced = other_func->_bloom_filter_alloced; - _inited = true; + light_copy(bloomfilter_func); return Status::OK(); } auto* other_func = static_cast(bloomfilter_func); @@ -205,13 +226,16 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { bool contain_null() const { if (!_bloom_filter) { - throw Status::InternalError("_bloom_filter is nullptr"); + throw Exception(ErrorCode::INTERNAL_ERROR, "_bloom_filter is nullptr, inited: {}", + _inited); } return _bloom_filter->contain_null(); } void set_contain_null_and_null_aware() { _bloom_filter->set_contain_null_and_null_aware(); } + void set_enable_fixed_len_to_uint32_v2() { _enable_fixed_len_to_uint32_v2 = true; } + size_t get_size() const { return _bloom_filter ? 
_bloom_filter->size() : 0; } void light_copy(BloomFilterFuncBase* bloomfilter_func) { @@ -219,6 +243,7 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { _bloom_filter_alloced = other_func->_bloom_filter_alloced; _bloom_filter = other_func->_bloom_filter; _inited = other_func->_inited; + _enable_fixed_len_to_uint32_v2 |= other_func->_enable_fixed_len_to_uint32_v2; } virtual void insert(const void* data) = 0; @@ -231,6 +256,8 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { uint16_t* offsets, int number, bool is_parse_column) = 0; + bool inited() const { return _inited; } + private: void _limit_length() { if (_runtime_bloom_filter_min_size > 0) { @@ -251,9 +278,10 @@ class BloomFilterFuncBase : public RuntimeFilterFuncBase { int64_t _runtime_bloom_filter_max_size; bool _build_bf_exactly = false; bool _bloom_filter_size_calculated_by_ndv = false; + bool _enable_fixed_len_to_uint32_v2 = false; }; -template +template uint16_t find_batch_olap(const BloomFilterAdaptor& bloom_filter, const char* data, const uint8* nullmap, uint16_t* offsets, int number, const bool is_parse_column) { @@ -277,7 +305,8 @@ uint16_t find_batch_olap(const BloomFilterAdaptor& bloom_filter, const char* dat if (nullmap == nullptr) { for (int i = 0; i < number; i++) { uint16_t idx = offsets[i]; - if (!bloom_filter.test_element(get_element(data, idx))) { + if (!bloom_filter.test_element( + get_element(data, idx))) { continue; } offsets[new_size++] = idx; @@ -290,7 +319,8 @@ uint16_t find_batch_olap(const BloomFilterAdaptor& bloom_filter, const char* dat continue; } } else { - if (!bloom_filter.test_element(get_element(data, idx))) { + if (!bloom_filter.test_element( + get_element(data, idx))) { continue; } } @@ -300,7 +330,7 @@ uint16_t find_batch_olap(const BloomFilterAdaptor& bloom_filter, const char* dat } else { if (nullmap == nullptr) { for (int i = 0; i < number; i++) { - if (!bloom_filter.test_element(get_element(data, i))) { + if 
(!bloom_filter.test_element(get_element(data, i))) { continue; } offsets[new_size++] = i; @@ -312,7 +342,8 @@ uint16_t find_batch_olap(const BloomFilterAdaptor& bloom_filter, const char* dat continue; } } else { - if (!bloom_filter.test_element(get_element(data, i))) { + if (!bloom_filter.test_element( + get_element(data, i))) { continue; } } @@ -323,16 +354,17 @@ uint16_t find_batch_olap(const BloomFilterAdaptor& bloom_filter, const char* dat return new_size; } -template +template struct CommonFindOp { - uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, - const uint8* nullmap, uint16_t* offsets, int number, - const bool is_parse_column) { - return find_batch_olap(bloom_filter, data, nullmap, offsets, number, is_parse_column); + static uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, + const uint8* nullmap, uint16_t* offsets, int number, + const bool is_parse_column) { + return find_batch_olap(bloom_filter, data, nullmap, offsets, + number, is_parse_column); } - void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, - size_t start) const { + static void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, + size_t start) { const auto size = column->size(); if (column->is_nullable()) { const auto* nullable = assert_cast(column.get()); @@ -344,7 +376,7 @@ struct CommonFindOp { const T* data = (T*)col.get_raw_data().data; for (size_t i = start; i < size; i++) { if (!nullmap[i]) { - bloom_filter.add_element(*(data + i)); + bloom_filter.add_element(*(data + i)); } else { bloom_filter.set_contain_null(); } @@ -352,13 +384,13 @@ struct CommonFindOp { } else { const T* data = (T*)column->get_raw_data().data; for (size_t i = start; i < size; i++) { - bloom_filter.add_element(*(data + i)); + bloom_filter.add_element(*(data + i)); } } } - void find_batch(const BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, - 
uint8_t* results) const { + static void find_batch(const BloomFilterAdaptor& bloom_filter, + const vectorized::ColumnPtr& column, uint8_t* results) { const T* __restrict data = nullptr; const uint8_t* __restrict nullmap = nullptr; if (column->is_nullable()) { @@ -378,31 +410,32 @@ struct CommonFindOp { if (nullmap) { for (size_t i = 0; i < size; i++) { if (!nullmap[i]) { - results[i] = bloom_filter.test_element(data[i]); + results[i] = bloom_filter.test_element(data[i]); } else { results[i] = bloom_filter.contain_null(); } } } else { for (size_t i = 0; i < size; i++) { - results[i] = bloom_filter.test_element(data[i]); + results[i] = bloom_filter.test_element(data[i]); } } } - void insert(BloomFilterAdaptor& bloom_filter, const void* data) const { - bloom_filter.add_element(*(T*)data); + static void insert(BloomFilterAdaptor& bloom_filter, const void* data) { + bloom_filter.add_element(*(T*)data); } }; -struct StringFindOp : CommonFindOp { +template +struct StringFindOp : CommonFindOp { static void insert_batch(BloomFilterAdaptor& bloom_filter, const vectorized::ColumnPtr& column, size_t start) { auto _insert_batch_col_str = [&](const auto& col, const uint8_t* __restrict nullmap, size_t start, size_t size) { for (size_t i = start; i < size; i++) { if (nullmap == nullptr || !nullmap[i]) { - bloom_filter.add_element(col.get_data_at(i)); + bloom_filter.add_element(col.get_data_at(i)); } else { bloom_filter.set_contain_null(); } @@ -447,20 +480,23 @@ struct StringFindOp : CommonFindOp { if (nullable->has_null()) { for (size_t i = 0; i < col.size(); i++) { if (!nullmap[i]) { - results[i] = bloom_filter.test_element(col.get_data_at(i)); + results[i] = bloom_filter.test_element( + col.get_data_at(i)); } else { results[i] = bloom_filter.contain_null(); } } } else { for (size_t i = 0; i < col.size(); i++) { - results[i] = bloom_filter.test_element(col.get_data_at(i)); + results[i] = bloom_filter.test_element( + col.get_data_at(i)); } } } else { const auto& col = 
assert_cast(column.get()); for (size_t i = 0; i < col->size(); i++) { - results[i] = bloom_filter.test_element(col->get_data_at(i)); + results[i] = + bloom_filter.test_element(col->get_data_at(i)); } } } @@ -468,34 +504,35 @@ struct StringFindOp : CommonFindOp { // We do not need to judge whether data is empty, because null will not appear // when filer used by the storage engine -struct FixedStringFindOp : public StringFindOp { +template +struct FixedStringFindOp : public StringFindOp { static uint16_t find_batch_olap_engine(const BloomFilterAdaptor& bloom_filter, const char* data, const uint8* nullmap, uint16_t* offsets, int number, const bool is_parse_column) { - return find_batch_olap(bloom_filter, data, nullmap, offsets, number, - is_parse_column); + return find_batch_olap( + bloom_filter, data, nullmap, offsets, number, is_parse_column); } }; -template +template struct BloomFilterTypeTraits { using T = typename PrimitiveTypeTraits::CppType; - using FindOp = CommonFindOp; + using FindOp = CommonFindOp; }; -template <> -struct BloomFilterTypeTraits { - using FindOp = FixedStringFindOp; +template +struct BloomFilterTypeTraits { + using FindOp = FixedStringFindOp; }; -template <> -struct BloomFilterTypeTraits { - using FindOp = StringFindOp; +template +struct BloomFilterTypeTraits { + using FindOp = StringFindOp; }; -template <> -struct BloomFilterTypeTraits { - using FindOp = StringFindOp; +template +struct BloomFilterTypeTraits { + using FindOp = StringFindOp; }; template @@ -507,16 +544,28 @@ class BloomFilterFunc final : public BloomFilterFuncBase { void insert(const void* data) override { DCHECK(_bloom_filter != nullptr); - dummy.insert(*_bloom_filter, data); + if (_enable_fixed_len_to_uint32_v2) { + OpV2::insert(*_bloom_filter, data); + } else { + Op::insert(*_bloom_filter, data); + } } void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) override { DCHECK(_bloom_filter != nullptr); - dummy.insert_batch(*_bloom_filter, column, start); + 
if (_enable_fixed_len_to_uint32_v2) { + OpV2::insert_batch(*_bloom_filter, column, start); + } else { + Op::insert_batch(*_bloom_filter, column, start); + } } void find_fixed_len(const vectorized::ColumnPtr& column, uint8_t* results) override { - dummy.find_batch(*_bloom_filter, column, results); + if (_enable_fixed_len_to_uint32_v2) { + OpV2::find_batch(*_bloom_filter, column, results); + } else { + Op::find_batch(*_bloom_filter, column, results); + } } template @@ -538,12 +587,18 @@ class BloomFilterFunc final : public BloomFilterFuncBase { uint16_t find_fixed_len_olap_engine(const char* data, const uint8* nullmap, uint16_t* offsets, int number, bool is_parse_column) override { - return dummy.find_batch_olap_engine(*_bloom_filter, data, nullmap, offsets, number, - is_parse_column); + if (_enable_fixed_len_to_uint32_v2) { + return OpV2::find_batch_olap_engine(*_bloom_filter, data, nullmap, offsets, number, + is_parse_column); + } else { + return Op::find_batch_olap_engine(*_bloom_filter, data, nullmap, offsets, number, + is_parse_column); + } } private: - typename BloomFilterTypeTraits::FindOp dummy; + using Op = typename BloomFilterTypeTraits::FindOp; + using OpV2 = typename BloomFilterTypeTraits::FindOp; }; } // namespace doris diff --git a/be/src/exprs/create_predicate_function.h b/be/src/exprs/create_predicate_function.h index 387be1f9f0b11c..57a8b6376a9fed 100644 --- a/be/src/exprs/create_predicate_function.h +++ b/be/src/exprs/create_predicate_function.h @@ -17,6 +17,8 @@ #pragma once +#include "common/exception.h" +#include "common/status.h" #include "exprs/hybrid_set.h" #include "exprs/minmax_predicate.h" #include "function_filter.h" @@ -230,7 +232,7 @@ inline auto create_bitmap_filter(PrimitiveType type) { template ColumnPredicate* create_olap_column_predicate(uint32_t column_id, const std::shared_ptr& filter, - int be_exec_version, const TabletColumn*) { + const TabletColumn*) { std::shared_ptr filter_olap; filter_olap.reset(create_bloom_filter(PT)); 
filter_olap->light_copy(filter.get()); @@ -241,21 +243,18 @@ ColumnPredicate* create_olap_column_predicate(uint32_t column_id, template ColumnPredicate* create_olap_column_predicate(uint32_t column_id, const std::shared_ptr& filter, - int be_exec_version, const TabletColumn*) { + const TabletColumn*) { if constexpr (PT == TYPE_TINYINT || PT == TYPE_SMALLINT || PT == TYPE_INT || PT == TYPE_BIGINT) { - std::shared_ptr filter_olap; - filter_olap.reset(create_bitmap_filter(PT)); - filter_olap->light_copy(filter.get()); - return new BitmapFilterColumnPredicate(column_id, filter, be_exec_version); + return new BitmapFilterColumnPredicate(column_id, filter); } else { - return nullptr; + throw Exception(ErrorCode::INTERNAL_ERROR, "bitmap filter do not support type {}", PT); } } template ColumnPredicate* create_olap_column_predicate(uint32_t column_id, - const std::shared_ptr& filter, int, + const std::shared_ptr& filter, const TabletColumn* column = nullptr) { return create_in_list_predicate(column_id, filter, column->length()); @@ -263,40 +262,34 @@ ColumnPredicate* create_olap_column_predicate(uint32_t column_id, template ColumnPredicate* create_olap_column_predicate(uint32_t column_id, - const std::shared_ptr& filter, int, + const std::shared_ptr& filter, const TabletColumn* column = nullptr) { // currently only support like predicate - if constexpr (PT == TYPE_CHAR || PT == TYPE_VARCHAR || PT == TYPE_STRING) { - if constexpr (PT == TYPE_CHAR) { - return new LikeColumnPredicate(filter->_opposite, column_id, filter->_fn_ctx, - filter->_string_param); - } else { - return new LikeColumnPredicate(filter->_opposite, column_id, - filter->_fn_ctx, filter->_string_param); - } - } else { - return nullptr; + if constexpr (PT == TYPE_CHAR) { + return new LikeColumnPredicate(filter->_opposite, column_id, filter->_fn_ctx, + filter->_string_param); + } else if constexpr (PT == TYPE_VARCHAR || PT == TYPE_STRING) { + return new LikeColumnPredicate(filter->_opposite, column_id, 
filter->_fn_ctx, + filter->_string_param); } + throw Exception(ErrorCode::INTERNAL_ERROR, "function filter do not support type {}", PT); } template ColumnPredicate* create_column_predicate(uint32_t column_id, const std::shared_ptr& filter, - FieldType type, int be_exec_version, - const TabletColumn* column = nullptr) { + FieldType type, const TabletColumn* column = nullptr) { switch (type) { -#define M(NAME) \ - case FieldType::OLAP_FIELD_##NAME: { \ - return create_olap_column_predicate(column_id, filter, be_exec_version, column); \ +#define M(NAME) \ + case FieldType::OLAP_FIELD_##NAME: { \ + return create_olap_column_predicate(column_id, filter, column); \ } APPLY_FOR_PRIMTYPE(M) #undef M case FieldType::OLAP_FIELD_TYPE_DECIMAL: { - return create_olap_column_predicate(column_id, filter, be_exec_version, - column); + return create_olap_column_predicate(column_id, filter, column); } case FieldType::OLAP_FIELD_TYPE_BOOL: { - return create_olap_column_predicate(column_id, filter, be_exec_version, - column); + return create_olap_column_predicate(column_id, filter, column); } default: return nullptr; diff --git a/be/src/exprs/hybrid_set.h b/be/src/exprs/hybrid_set.h index 6536ec2430fe08..dbf31a54772ea3 100644 --- a/be/src/exprs/hybrid_set.h +++ b/be/src/exprs/hybrid_set.h @@ -17,21 +17,8 @@ #pragma once -#include - -#include - -#include "common/exception.h" -#include "common/object_pool.h" -#include "common/status.h" #include "exprs/runtime_filter.h" -#include "runtime/decimalv2_value.h" -#include "runtime/define_primitive_type.h" -#include "runtime/primitive_type.h" -#include "vec/columns/column_nullable.h" -#include "vec/columns/column_string.h" -#include "vec/common/hash_table/phmap_fwd_decl.h" -#include "vec/common/string_ref.h" +#include "exprs/runtime_filter_convertor.h" namespace doris { @@ -221,30 +208,19 @@ class HybridSetBase : public RuntimeFilterFuncBase { virtual bool find(const void* data, size_t) const = 0; virtual void find_batch(const 
doris::vectorized::IColumn& column, size_t rows, - doris::vectorized::ColumnUInt8::Container& results) { - LOG(FATAL) << "HybridSetBase not support find_batch"; - __builtin_unreachable(); - } - + doris::vectorized::ColumnUInt8::Container& results) = 0; virtual void find_batch_negative(const doris::vectorized::IColumn& column, size_t rows, - doris::vectorized::ColumnUInt8::Container& results) { - LOG(FATAL) << "HybridSetBase not support find_batch_negative"; - __builtin_unreachable(); - } - + doris::vectorized::ColumnUInt8::Container& results) = 0; virtual void find_batch_nullable(const doris::vectorized::IColumn& column, size_t rows, const doris::vectorized::NullMap& null_map, - doris::vectorized::ColumnUInt8::Container& results) { - LOG(FATAL) << "HybridSetBase not support find_batch_nullable"; - __builtin_unreachable(); - } + doris::vectorized::ColumnUInt8::Container& results) = 0; - virtual void find_batch_nullable_negative(const doris::vectorized::IColumn& column, size_t rows, - const doris::vectorized::NullMap& null_map, - doris::vectorized::ColumnUInt8::Container& results) { - LOG(FATAL) << "HybridSetBase not support find_batch_nullable_negative"; - __builtin_unreachable(); - } + virtual void find_batch_nullable_negative( + const doris::vectorized::IColumn& column, size_t rows, + const doris::vectorized::NullMap& null_map, + doris::vectorized::ColumnUInt8::Container& results) = 0; + + virtual void to_pb(PInFilter* filter) = 0; class IteratorBase { public: @@ -261,26 +237,6 @@ class HybridSetBase : public RuntimeFilterFuncBase { bool _contains_null = false; }; -template -const Type* check_and_get_hybrid_set(const HybridSetBase& column) { - return typeid_cast(&column); -} - -template -const Type* check_and_get_hybrid_set(const HybridSetBase* column) { - return typeid_cast(column); -} - -template -bool check_hybrid_set(const HybridSetBase& column) { - return check_and_get_hybrid_set(&column); -} - -template -bool check_hybrid_set(const HybridSetBase* column) { - 
return check_and_get_hybrid_set(column); -} - template ::CppType>, typename _ColumnType = typename PrimitiveTypeTraits::ColumnType> @@ -409,6 +365,14 @@ class HybridSet : public HybridSetBase { ContainerType* get_inner_set() { return &_set; } + void set_pb(PInFilter* filter, auto f) { + for (auto v : _set) { + f(filter->add_values(), v); + } + } + + void to_pb(PInFilter* filter) override { set_pb(filter, get_convertor()); } + private: ContainerType _set; ObjectPool _pool; @@ -569,6 +533,14 @@ class StringSet : public HybridSetBase { ContainerType* get_inner_set() { return &_set; } + void set_pb(PInFilter* filter, auto f) { + for (const auto& v : _set) { + f(filter->add_values(), v); + } + } + + void to_pb(PInFilter* filter) override { set_pb(filter, get_convertor()); } + private: ContainerType _set; ObjectPool _pool; @@ -735,6 +707,10 @@ class StringValueSet : public HybridSetBase { ContainerType* get_inner_set() { return &_set; } + void to_pb(PInFilter* filter) override { + throw Exception(ErrorCode::INTERNAL_ERROR, "StringValueSet do not support to_pb"); + } + private: ContainerType _set; ObjectPool _pool; diff --git a/be/src/exprs/minmax_predicate.h b/be/src/exprs/minmax_predicate.h index 377b33696c82b9..6c5d8a2d3c4bd5 100644 --- a/be/src/exprs/minmax_predicate.h +++ b/be/src/exprs/minmax_predicate.h @@ -17,16 +17,8 @@ #pragma once -#include - -#include "common/object_pool.h" #include "exprs/runtime_filter.h" -#include "runtime/type_limit.h" -#include "vec/columns/column.h" -#include "vec/columns/column_nullable.h" -#include "vec/columns/column_string.h" -#include "vec/common/assert_cast.h" -#include "vec/common/string_ref.h" +#include "exprs/runtime_filter_convertor.h" namespace doris { // only used in Runtime Filter @@ -45,6 +37,8 @@ class MinMaxFuncBase : public RuntimeFilterFuncBase { void set_contain_null() { _contain_null = true; } + virtual void to_pb(PMinMaxFilter* filter) = 0; + protected: bool _contain_null = false; }; @@ -165,6 +159,17 @@ class 
MinMaxNumFunc : public MinMaxFuncBase { return Status::OK(); } + void set_pb(PMinMaxFilter* filter, auto f) { + if constexpr (NeedMin) { + f(filter->mutable_min_val(), _min); + } + if constexpr (NeedMax) { + f(filter->mutable_max_val(), _max); + } + } + + void to_pb(PMinMaxFilter* filter) override { set_pb(filter, get_convertor()); } + protected: T _max = type_limit::min(); T _min = type_limit::max(); diff --git a/be/src/exprs/runtime_filter.cpp b/be/src/exprs/runtime_filter.cpp index 85f1c535c7038b..d1567a8fa79cb4 100644 --- a/be/src/exprs/runtime_filter.cpp +++ b/be/src/exprs/runtime_filter.cpp @@ -362,8 +362,11 @@ class RuntimePredicateWrapper { } Status init_bloom_filter(const size_t build_bf_cardinality) { - DCHECK(_filter_type == RuntimeFilterType::BLOOM_FILTER || - _filter_type == RuntimeFilterType::IN_OR_BLOOM_FILTER); + if (_filter_type != RuntimeFilterType::BLOOM_FILTER && + _filter_type != RuntimeFilterType::IN_OR_BLOOM_FILTER) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "init_bloom_filter meet invalid input type {}", int(_filter_type)); + } return _context->bloom_filter_func->init_with_cardinality(build_bf_cardinality); } @@ -391,7 +394,9 @@ class RuntimePredicateWrapper { BloomFilterFuncBase* get_bloomfilter() const { return _context->bloom_filter_func.get(); } void insert_fixed_len(const vectorized::ColumnPtr& column, size_t start) { - DCHECK(!is_ignored()); + if (is_ignored()) { + throw Exception(ErrorCode::INTERNAL_ERROR, "insert_fixed_len meet ignored rf"); + } switch (_filter_type) { case RuntimeFilterType::IN_FILTER: { _context->hybrid_set->insert_fixed_len(column, start); @@ -493,12 +498,12 @@ class RuntimePredicateWrapper { switch (_filter_type) { case RuntimeFilterType::IN_FILTER: { if (!_context->hybrid_set) { - _context->ignored = true; + set_ignored(); return Status::OK(); } _context->hybrid_set->insert(wrapper->_context->hybrid_set.get()); if (_max_in_num >= 0 && _context->hybrid_set->size() >= _max_in_num) { - _context->ignored = 
true; + set_ignored(); // release in filter _context->hybrid_set.reset(); } @@ -735,6 +740,12 @@ class RuntimePredicateWrapper { return Status::OK(); } + void set_enable_fixed_len_to_uint32_v2() { + if (is_bloomfilter()) { + _context->bloom_filter_func->set_enable_fixed_len_to_uint32_v2(); + } + } + // used by shuffle runtime filter // assign this filter by protobuf Status assign(const PBloomFilter* bloom_filter, butil::IOBufAsZeroCopyInputStream* data, @@ -898,17 +909,10 @@ class RuntimePredicateWrapper { return Status::InternalError("not support!"); } - HybridSetBase::IteratorBase* get_in_filter_iterator() { return _context->hybrid_set->begin(); } - void get_bloom_filter_desc(char** data, int* filter_length) { _context->bloom_filter_func->get_data(data, filter_length); } - void get_minmax_filter_desc(void** min_data, void** max_data) { - *min_data = _context->minmax_func->get_min(); - *max_data = _context->minmax_func->get_max(); - } - PrimitiveType column_type() { return _column_return_type; } bool is_bloomfilter() const { return get_real_type() == RuntimeFilterType::BLOOM_FILTER; } @@ -918,7 +922,10 @@ class RuntimePredicateWrapper { return _context->bloom_filter_func->contain_null(); } if (_context->hybrid_set) { - DCHECK(get_real_type() == RuntimeFilterType::IN_FILTER); + if (get_real_type() != RuntimeFilterType::IN_FILTER) { + throw Exception(ErrorCode::INTERNAL_ERROR, "rf has hybrid_set but real type is {}", + int(get_real_type())); + } return _context->hybrid_set->contain_null(); } if (_context->minmax_func) { @@ -974,11 +981,10 @@ class RuntimePredicateWrapper { Status IRuntimeFilter::create(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc, const TQueryOptions* query_options, const RuntimeFilterRole role, - int node_id, std::shared_ptr* res, - bool build_bf_exactly, bool need_local_merge) { - *res = std::make_shared(state, desc, need_local_merge); + int node_id, std::shared_ptr* res) { + *res = std::make_shared(state, desc); 
(*res)->set_role(role); - return (*res)->init_with_desc(desc, query_options, node_id, build_bf_exactly); + return (*res)->init_with_desc(desc, query_options, node_id); } RuntimeFilterContextSPtr& IRuntimeFilter::get_shared_context_ref() { @@ -990,53 +996,62 @@ void IRuntimeFilter::insert_batch(const vectorized::ColumnPtr column, size_t sta _wrapper->insert_batch(column, start); } -Status IRuntimeFilter::publish(bool publish_local) { +Status IRuntimeFilter::publish(RuntimeState* state, bool publish_local) { DCHECK(is_producer()); - auto send_to_remote = [&](IRuntimeFilter* filter) { + auto send_to_remote_targets = [&](IRuntimeFilter* filter, uint64_t local_merge_time) { TNetworkAddress addr; DCHECK(_state != nullptr); - RETURN_IF_ERROR(_state->runtime_filter_mgr->get_merge_addr(&addr)); - return filter->push_to_remote(&addr); + RETURN_IF_ERROR(_state->global_runtime_filter_mgr()->get_merge_addr(&addr)); + return filter->push_to_remote(state, &addr, local_merge_time); }; - auto send_to_local = [&](std::shared_ptr wrapper) { - std::vector> filters; - RETURN_IF_ERROR(_state->runtime_filter_mgr->get_consume_filters(_filter_id, filters)); - DCHECK(!filters.empty()); - // push down + auto send_to_local_targets = [&](std::shared_ptr wrapper, bool global, + uint64_t local_merge_time = 0) { + std::vector> filters = + global ? 
_state->global_runtime_filter_mgr()->get_consume_filters(_filter_id) + : _state->local_runtime_filter_mgr()->get_consume_filters(_filter_id); for (auto filter : filters) { filter->_wrapper = wrapper; - filter->update_runtime_filter_type_to_profile(); + filter->update_runtime_filter_type_to_profile(local_merge_time); filter->signal(); } return Status::OK(); }; - auto do_local_merge = [&]() { - LocalMergeFilters* local_merge_filters = nullptr; - RETURN_IF_ERROR(_state->runtime_filter_mgr->get_local_merge_producer_filters( - _filter_id, &local_merge_filters)); - std::lock_guard l(*local_merge_filters->lock); - RETURN_IF_ERROR(local_merge_filters->filters[0]->merge_from(_wrapper.get())); - local_merge_filters->merge_time--; - if (local_merge_filters->merge_time == 0) { - if (_has_local_target) { - RETURN_IF_ERROR(send_to_local(local_merge_filters->filters[0]->_wrapper)); - } else { - RETURN_IF_ERROR(send_to_remote(local_merge_filters->filters[0].get())); + auto do_merge = [&]() { + if (!_state->global_runtime_filter_mgr()->get_consume_filters(_filter_id).empty()) { + LocalMergeFilters* local_merge_filters = nullptr; + RETURN_IF_ERROR(_state->global_runtime_filter_mgr()->get_local_merge_producer_filters( + _filter_id, &local_merge_filters)); + local_merge_filters->merge_watcher.start(); + std::lock_guard l(*local_merge_filters->lock); + RETURN_IF_ERROR(local_merge_filters->filters[0]->merge_from(_wrapper.get())); + local_merge_filters->merge_time--; + local_merge_filters->merge_watcher.stop(); + if (local_merge_filters->merge_time == 0) { + if (_has_local_target) { + RETURN_IF_ERROR(send_to_local_targets( + local_merge_filters->filters[0]->_wrapper, true, + local_merge_filters->merge_watcher.elapsed_time())); + } else { + RETURN_IF_ERROR(send_to_remote_targets( + local_merge_filters->filters[0].get(), + local_merge_filters->merge_watcher.elapsed_time())); + } } } return Status::OK(); }; - if (_need_local_merge && _has_local_target) { - RETURN_IF_ERROR(do_local_merge()); 
- } else if (_has_local_target) { - RETURN_IF_ERROR(send_to_local(_wrapper)); + if (_has_local_target) { + // A runtime filter may have multiple targets and some of those are local-merge RF and others are not. + // So for all runtime filters' producers, `publish` should notify all consumers in global RF mgr which manages local-merge RF and local RF mgr which manages others. + RETURN_IF_ERROR(do_merge()); + RETURN_IF_ERROR(send_to_local_targets(_wrapper, false)); } else if (!publish_local) { - if (_is_broadcast_join || _state->be_exec_version < USE_NEW_SERDE) { - RETURN_IF_ERROR(send_to_remote(this)); + if (_is_broadcast_join || _state->get_query_ctx()->be_exec_version() < USE_NEW_SERDE) { + RETURN_IF_ERROR(send_to_remote_targets(this, 0)); } else { - RETURN_IF_ERROR(do_local_merge()); + RETURN_IF_ERROR(do_merge()); } } else { // remote broadcast join only push onetime in build shared hash table @@ -1088,16 +1103,18 @@ class SyncSizeClosure : public AutoReleaseClosure req, std::shared_ptr> callback, std::shared_ptr dependency, - RuntimeFilterContextSPtr rf_context) - : Base(req, callback), _dependency(std::move(dependency)), _rf_context(rf_context) {} + RuntimeFilterContextSPtr rf_context, std::weak_ptr context) + : Base(req, callback, context), + _dependency(std::move(dependency)), + _rf_context(rf_context) {} }; Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filter_size) { DCHECK(is_producer()); - if (_need_local_merge) { + if (!_state->global_runtime_filter_mgr()->get_consume_filters(_filter_id).empty()) { LocalMergeFilters* local_merge_filters = nullptr; - RETURN_IF_ERROR(_state->runtime_filter_mgr->get_local_merge_producer_filters( + RETURN_IF_ERROR(_state->global_runtime_filter_mgr()->get_local_merge_producer_filters( _filter_id, &local_merge_filters)); std::lock_guard l(*local_merge_filters->lock); local_merge_filters->merge_size_times--; @@ -1121,9 +1138,9 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t 
local_filt TNetworkAddress addr; DCHECK(_state != nullptr); - RETURN_IF_ERROR(_state->runtime_filter_mgr->get_merge_addr(&addr)); + RETURN_IF_ERROR(_state->global_runtime_filter_mgr()->get_merge_addr(&addr)); std::shared_ptr stub( - _state->exec_env->brpc_internal_client_cache()->get_client(addr)); + _state->get_query_ctx()->exec_env()->brpc_internal_client_cache()->get_client(addr)); if (!stub) { return Status::InternalError("Get rpc stub failed, host={}, port={}", addr.hostname, addr.port); @@ -1133,11 +1150,13 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt auto callback = DummyBrpcCallback::create_shared(); // IRuntimeFilter maybe deconstructed before the rpc finished, so that could not use // a raw pointer in closure. Has to use the context's shared ptr. - auto closure = - SyncSizeClosure::create_unique(request, callback, _dependency, _wrapper->_context); + auto closure = SyncSizeClosure::create_unique( + request, callback, _dependency, _wrapper->_context, + state->query_options().ignore_runtime_filter_error ? 
std::weak_ptr {} + : state->get_query_ctx_weak()); auto* pquery_id = request->mutable_query_id(); - pquery_id->set_hi(_state->query_id.hi()); - pquery_id->set_lo(_state->query_id.lo()); + pquery_id->set_hi(_state->get_query_ctx()->query_id().hi); + pquery_id->set_lo(_state->get_query_ctx()->query_id().lo); auto* source_addr = request->mutable_source_addr(); source_addr->set_hostname(BackendOptions::get_local_backend().host); @@ -1157,10 +1176,11 @@ Status IRuntimeFilter::send_filter_size(RuntimeState* state, uint64_t local_filt return Status::OK(); } -Status IRuntimeFilter::push_to_remote(const TNetworkAddress* addr) { +Status IRuntimeFilter::push_to_remote(RuntimeState* state, const TNetworkAddress* addr, + uint64_t local_merge_time) { DCHECK(is_producer()); std::shared_ptr stub( - _state->exec_env->brpc_internal_client_cache()->get_client(*addr)); + _state->get_query_ctx()->exec_env()->brpc_internal_client_cache()->get_client(*addr)); if (!stub) { return Status::InternalError( fmt::format("Get rpc stub failed, host={}, port={}", addr->hostname, addr->port)); @@ -1170,24 +1190,28 @@ Status IRuntimeFilter::push_to_remote(const TNetworkAddress* addr) { auto merge_filter_callback = DummyBrpcCallback::create_shared(); auto merge_filter_closure = AutoReleaseClosure>:: - create_unique(merge_filter_request, merge_filter_callback); + create_unique(merge_filter_request, merge_filter_callback, + state->query_options().ignore_runtime_filter_error + ? 
std::weak_ptr {} + : state->get_query_ctx_weak()); void* data = nullptr; int len = 0; auto* pquery_id = merge_filter_request->mutable_query_id(); - pquery_id->set_hi(_state->query_id.hi()); - pquery_id->set_lo(_state->query_id.lo()); + pquery_id->set_hi(_state->get_query_ctx()->query_id().hi); + pquery_id->set_lo(_state->get_query_ctx()->query_id().lo); auto* pfragment_instance_id = merge_filter_request->mutable_fragment_instance_id(); pfragment_instance_id->set_hi(BackendOptions::get_local_backend().id); pfragment_instance_id->set_lo((int64_t)this); merge_filter_request->set_filter_id(_filter_id); + merge_filter_request->set_local_merge_time(local_merge_time); auto column_type = _wrapper->column_type(); RETURN_IF_CATCH_EXCEPTION(merge_filter_request->set_column_type(to_proto(column_type))); merge_filter_callback->cntl_->set_timeout_ms( - get_execution_rpc_timeout_ms(_state->execution_timeout)); + get_execution_rpc_timeout_ms(_state->get_query_ctx()->execution_timeout())); if (config::execution_ignore_eovercrowded) { merge_filter_callback->cntl_->ignore_eovercrowded(); } @@ -1222,9 +1246,9 @@ Status IRuntimeFilter::get_push_expr_ctxs(std::listadd_info_string("Info", formatted_state()); // The runtime filter is pushed down, adding filtering information. 
- auto* expr_filtered_rows_counter = ADD_COUNTER(_profile, "expr_filtered_rows", TUnit::UNIT); - auto* expr_input_rows_counter = ADD_COUNTER(_profile, "expr_input_rows", TUnit::UNIT); - auto* always_true_counter = ADD_COUNTER(_profile, "always_true_pass_rows", TUnit::UNIT); + auto* expr_filtered_rows_counter = ADD_COUNTER(_profile, "ExprFilteredRows", TUnit::UNIT); + auto* expr_input_rows_counter = ADD_COUNTER(_profile, "ExprInputRows", TUnit::UNIT); + auto* always_true_counter = ADD_COUNTER(_profile, "AlwaysTruePassRows", TUnit::UNIT); for (auto i = origin_size; i < push_exprs.size(); i++) { push_exprs[i]->attach_profile_counter(expr_filtered_rows_counter, expr_input_rows_counter, always_true_counter); @@ -1234,8 +1258,8 @@ Status IRuntimeFilter::get_push_expr_ctxs(std::listexecution_timeout * 1000; - auto runtime_filter_wait_time_ms = _state->runtime_filter_wait_time_ms; + auto execution_timeout = _state->get_query_ctx()->execution_timeout() * 1000; + auto runtime_filter_wait_time_ms = _state->get_query_ctx()->runtime_filter_wait_time_ms(); // bitmap filter is precise filter and only filter once, so it must be applied. int64_t wait_times_ms = _runtime_filter_type == RuntimeFilterType::BITMAP_FILTER ? execution_timeout @@ -1244,6 +1268,8 @@ void IRuntimeFilter::update_state() { // In pipelineX, runtime filters will be ready or timeout before open phase. 
if (expected == RuntimeFilterState::NOT_READY) { DCHECK(MonotonicMillis() - registration_time_ >= wait_times_ms); + COUNTER_SET(_wait_timer, + int64_t((MonotonicMillis() - registration_time_) * NANOS_PER_MILLIS)); _rf_state_atomic = RuntimeFilterState::TIME_OUT; } } @@ -1262,6 +1288,14 @@ PrimitiveType IRuntimeFilter::column_type() const { void IRuntimeFilter::signal() { DCHECK(is_consumer()); + + if (!_wrapper->is_ignored() && _wrapper->is_bloomfilter() && + !_wrapper->get_bloomfilter()->inited()) { + throw Exception(ErrorCode::INTERNAL_ERROR, "bf not inited and not ignored, rf: {}", + debug_string()); + } + + COUNTER_SET(_wait_timer, int64_t((MonotonicMillis() - registration_time_) * NANOS_PER_MILLIS)); _rf_state_atomic.store(RuntimeFilterState::READY); if (!_filter_timer.empty()) { for (auto& timer : _filter_timer) { @@ -1303,7 +1337,7 @@ void IRuntimeFilter::set_synced_size(uint64_t global_size) { } void IRuntimeFilter::set_ignored() { - _wrapper->_context->ignored = true; + _wrapper->set_ignored(); } bool IRuntimeFilter::get_ignored() { @@ -1312,14 +1346,14 @@ bool IRuntimeFilter::get_ignored() { std::string IRuntimeFilter::formatted_state() const { return fmt::format( - "[IsPushDown = {}, RuntimeFilterState = {}, HasRemoteTarget = {}, " + "[Id = {}, IsPushDown = {}, RuntimeFilterState = {}, HasRemoteTarget = {}, " "HasLocalTarget = {}, Ignored = {}]", - _is_push_down, _get_explain_state_string(), _has_remote_target, _has_local_target, - _wrapper->_context->ignored); + _filter_id, _is_push_down, _get_explain_state_string(), _has_remote_target, + _has_local_target, _wrapper->_context->ignored); } Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQueryOptions* options, - int node_id, bool build_bf_exactly) { + int node_id) { // if node_id == -1 , it shouldn't be a consumer DCHECK(node_id >= 0 || (node_id == -1 && !is_consumer())); @@ -1329,6 +1363,8 @@ Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQue 
_expr_order = desc->expr_order; vectorized::VExprContextSPtr build_ctx; RETURN_IF_ERROR(vectorized::VExpr::create_expr_tree(desc->src_expr, build_ctx)); + _enable_fixed_len_to_uint32_v2 = options->__isset.enable_fixed_len_to_uint32_v2 && + options->enable_fixed_len_to_uint32_v2; RuntimeFilterParams params; params.filter_id = _filter_id; @@ -1341,20 +1377,10 @@ Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQue params.runtime_bloom_filter_max_size = options->__isset.runtime_bloom_filter_max_size ? options->runtime_bloom_filter_max_size : 0; - // We build runtime filter by exact distinct count iff three conditions are met: - // 1. Only 1 join key - // 2. Do not have remote target (e.g. do not need to merge), or broadcast join - // 3. Bloom filter - params.build_bf_exactly = - build_bf_exactly && (_runtime_filter_type == RuntimeFilterType::BLOOM_FILTER || - _runtime_filter_type == RuntimeFilterType::IN_OR_BLOOM_FILTER); + params.build_bf_exactly = desc->__isset.build_bf_exactly && desc->build_bf_exactly; params.bloom_filter_size_calculated_by_ndv = desc->bloom_filter_size_calculated_by_ndv; - if (!desc->__isset.sync_filter_size || !desc->sync_filter_size) { - params.build_bf_exactly &= (!_has_remote_target || _is_broadcast_join); - } - if (desc->__isset.bloom_filter_size_bytes) { params.bloom_filter_size = desc->bloom_filter_size_bytes; } @@ -1389,7 +1415,11 @@ Status IRuntimeFilter::init_with_desc(const TRuntimeFilterDesc* desc, const TQue } _wrapper = std::make_shared(¶ms); - return _wrapper->init(¶ms); + RETURN_IF_ERROR(_wrapper->init(¶ms)); + if (_enable_fixed_len_to_uint32_v2) { + _wrapper->set_enable_fixed_len_to_uint32_v2(); + } + return Status::OK(); } Status IRuntimeFilter::serialize(PMergeFilterRequest* request, void** data, int* len) { @@ -1500,19 +1530,25 @@ void IRuntimeFilter::init_profile(RuntimeProfile* parent_profile) { _profile_init = true; parent_profile->add_child(_profile.get(), true, nullptr); 
_profile->add_info_string("Info", formatted_state()); + _wait_timer = ADD_TIMER(_profile, "WaitTime"); } } -void IRuntimeFilter::update_runtime_filter_type_to_profile() { +void IRuntimeFilter::update_runtime_filter_type_to_profile(uint64_t local_merge_time) { _profile->add_info_string("RealRuntimeFilterType", to_string(_wrapper->get_real_type())); + _profile->add_info_string("LocalMergeTime", + std::to_string((double)local_merge_time / NANOS_PER_SEC) + " s"); } std::string IRuntimeFilter::debug_string() const { return fmt::format( - "RuntimeFilter: (id = {}, type = {}, need_local_merge: {}, is_broadcast: {}, " - "build_bf_cardinality: {}, error_msg: {}", - _filter_id, to_string(_runtime_filter_type), _need_local_merge, _is_broadcast_join, - _wrapper->get_build_bf_cardinality(), _wrapper->_context->err_msg); + "RuntimeFilter: (id = {}, type = {}, is_broadcast: {}, ignored: {}, " + "build_bf_cardinality: {}, dependency: {}, synced_size: {}, has_local_target: {}, " + "has_remote_target: {}, error_msg: [{}]", + _filter_id, to_string(_runtime_filter_type), _is_broadcast_join, + _wrapper->_context->ignored, _wrapper->get_build_bf_cardinality(), + _dependency ? 
_dependency->debug_string() : "none", _synced_size, _has_local_target, + _has_remote_target, _wrapper->_context->err_msg); } Status IRuntimeFilter::merge_from(const RuntimePredicateWrapper* wrapper) { @@ -1524,17 +1560,6 @@ Status IRuntimeFilter::merge_from(const RuntimePredicateWrapper* wrapper) { return Status::OK(); } -template -void batch_copy(PInFilter* filter, HybridSetBase::IteratorBase* it, - void (*set_func)(PColumnValue*, const T*)) { - while (it->has_next()) { - const void* void_value = it->get_value(); - auto origin_value = reinterpret_cast(void_value); - set_func(filter->add_values(), origin_value); - it->next(); - } -} - template Status IRuntimeFilter::serialize_impl(T* request, void** data, int* len) { auto real_runtime_filter_type = _wrapper->get_real_type(); @@ -1562,273 +1587,13 @@ Status IRuntimeFilter::serialize_impl(T* request, void** data, int* len) { } void IRuntimeFilter::to_protobuf(PInFilter* filter) { - auto column_type = _wrapper->column_type(); - filter->set_column_type(to_proto(column_type)); - - auto* it = _wrapper->get_in_filter_iterator(); - DCHECK(it != nullptr); - - switch (column_type) { - case TYPE_BOOLEAN: { - batch_copy(filter, it, [](PColumnValue* column, const bool* value) { - column->set_boolval(*value); - }); - return; - } - case TYPE_TINYINT: { - batch_copy(filter, it, [](PColumnValue* column, const int8_t* value) { - column->set_intval(*value); - }); - return; - } - case TYPE_SMALLINT: { - batch_copy(filter, it, [](PColumnValue* column, const int16_t* value) { - column->set_intval(*value); - }); - return; - } - case TYPE_INT: { - batch_copy(filter, it, [](PColumnValue* column, const int32_t* value) { - column->set_intval(*value); - }); - return; - } - case TYPE_BIGINT: { - batch_copy(filter, it, [](PColumnValue* column, const int64_t* value) { - column->set_longval(*value); - }); - return; - } - case TYPE_LARGEINT: { - batch_copy(filter, it, [](PColumnValue* column, const int128_t* value) { - 
column->set_stringval(LargeIntValue::to_string(*value)); - }); - return; - } - case TYPE_FLOAT: { - batch_copy(filter, it, [](PColumnValue* column, const float* value) { - column->set_doubleval(*value); - }); - return; - } - case TYPE_DOUBLE: { - batch_copy(filter, it, [](PColumnValue* column, const double* value) { - column->set_doubleval(*value); - }); - return; - } - case TYPE_DATEV2: { - batch_copy>( - filter, it, [](PColumnValue* column, const DateV2Value* value) { - column->set_intval(*reinterpret_cast(value)); - }); - return; - } - case TYPE_DATETIMEV2: { - batch_copy>( - filter, it, - [](PColumnValue* column, const DateV2Value* value) { - column->set_longval(*reinterpret_cast(value)); - }); - return; - } - case TYPE_DATE: - case TYPE_DATETIME: { - batch_copy(filter, it, - [](PColumnValue* column, const VecDateTimeValue* value) { - char convert_buffer[30]; - value->to_string(convert_buffer); - column->set_stringval(convert_buffer); - }); - return; - } - case TYPE_DECIMALV2: { - batch_copy(filter, it, - [](PColumnValue* column, const DecimalV2Value* value) { - column->set_stringval(value->to_string()); - }); - return; - } - case TYPE_DECIMAL32: { - batch_copy(filter, it, [](PColumnValue* column, const int32_t* value) { - column->set_intval(*value); - }); - return; - } - case TYPE_DECIMAL64: { - batch_copy(filter, it, [](PColumnValue* column, const int64_t* value) { - column->set_longval(*value); - }); - return; - } - case TYPE_DECIMAL128I: { - batch_copy(filter, it, [](PColumnValue* column, const int128_t* value) { - column->set_stringval(LargeIntValue::to_string(*value)); - }); - return; - } - case TYPE_DECIMAL256: { - batch_copy(filter, it, [](PColumnValue* column, const wide::Int256* value) { - column->set_stringval(wide::to_string(*value)); - }); - return; - } - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_STRING: { - //const void* void_value = it->get_value(); - //Now the get_value return void* is StringRef - batch_copy(filter, it, [](PColumnValue* 
column, const StringRef* value) { - column->set_stringval(value->to_string()); - }); - return; - } - case TYPE_IPV4: { - batch_copy(filter, it, [](PColumnValue* column, const IPv4* value) { - column->set_intval(*reinterpret_cast(value)); - }); - return; - } - case TYPE_IPV6: { - batch_copy(filter, it, [](PColumnValue* column, const IPv6* value) { - column->set_stringval(LargeIntValue::to_string(*value)); - }); - return; - } - default: { - throw Exception(ErrorCode::INTERNAL_ERROR, - "runtime filter meet invalid PrimitiveType type {}", int(column_type)); - } - } + filter->set_column_type(to_proto(_wrapper->column_type())); + _wrapper->_context->hybrid_set->to_pb(filter); } void IRuntimeFilter::to_protobuf(PMinMaxFilter* filter) { - void* min_data = nullptr; - void* max_data = nullptr; - _wrapper->get_minmax_filter_desc(&min_data, &max_data); - DCHECK(min_data != nullptr && max_data != nullptr); filter->set_column_type(to_proto(_wrapper->column_type())); - - switch (_wrapper->column_type()) { - case TYPE_BOOLEAN: { - filter->mutable_min_val()->set_boolval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_boolval(*reinterpret_cast(max_data)); - return; - } - case TYPE_TINYINT: { - filter->mutable_min_val()->set_intval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_intval(*reinterpret_cast(max_data)); - return; - } - case TYPE_SMALLINT: { - filter->mutable_min_val()->set_intval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_intval(*reinterpret_cast(max_data)); - return; - } - case TYPE_INT: { - filter->mutable_min_val()->set_intval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_intval(*reinterpret_cast(max_data)); - return; - } - case TYPE_BIGINT: { - filter->mutable_min_val()->set_longval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_longval(*reinterpret_cast(max_data)); - return; - } - case TYPE_LARGEINT: { - filter->mutable_min_val()->set_stringval( - 
LargeIntValue::to_string(*reinterpret_cast(min_data))); - filter->mutable_max_val()->set_stringval( - LargeIntValue::to_string(*reinterpret_cast(max_data))); - return; - } - case TYPE_FLOAT: { - filter->mutable_min_val()->set_doubleval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_doubleval(*reinterpret_cast(max_data)); - return; - } - case TYPE_DOUBLE: { - filter->mutable_min_val()->set_doubleval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_doubleval(*reinterpret_cast(max_data)); - return; - } - case TYPE_DATEV2: { - filter->mutable_min_val()->set_intval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_intval(*reinterpret_cast(max_data)); - return; - } - case TYPE_DATETIMEV2: { - filter->mutable_min_val()->set_longval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_longval(*reinterpret_cast(max_data)); - return; - } - case TYPE_DATE: - case TYPE_DATETIME: { - char convert_buffer[30]; - reinterpret_cast(min_data)->to_string(convert_buffer); - filter->mutable_min_val()->set_stringval(convert_buffer); - reinterpret_cast(max_data)->to_string(convert_buffer); - filter->mutable_max_val()->set_stringval(convert_buffer); - return; - } - case TYPE_DECIMALV2: { - filter->mutable_min_val()->set_stringval( - reinterpret_cast(min_data)->to_string()); - filter->mutable_max_val()->set_stringval( - reinterpret_cast(max_data)->to_string()); - return; - } - case TYPE_DECIMAL32: { - filter->mutable_min_val()->set_intval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_intval(*reinterpret_cast(max_data)); - return; - } - case TYPE_DECIMAL64: { - filter->mutable_min_val()->set_longval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_longval(*reinterpret_cast(max_data)); - return; - } - case TYPE_DECIMAL128I: { - filter->mutable_min_val()->set_stringval( - LargeIntValue::to_string(*reinterpret_cast(min_data))); - filter->mutable_max_val()->set_stringval( - 
LargeIntValue::to_string(*reinterpret_cast(max_data))); - return; - } - case TYPE_DECIMAL256: { - filter->mutable_min_val()->set_stringval( - wide::to_string(*reinterpret_cast(min_data))); - filter->mutable_max_val()->set_stringval( - wide::to_string(*reinterpret_cast(max_data))); - return; - } - case TYPE_CHAR: - case TYPE_VARCHAR: - case TYPE_STRING: { - const auto* min_string_value = reinterpret_cast(min_data); - filter->mutable_min_val()->set_stringval(*min_string_value); - const auto* max_string_value = reinterpret_cast(max_data); - filter->mutable_max_val()->set_stringval(*max_string_value); - break; - } - case TYPE_IPV4: { - filter->mutable_min_val()->set_intval(*reinterpret_cast(min_data)); - filter->mutable_max_val()->set_intval(*reinterpret_cast(max_data)); - return; - } - case TYPE_IPV6: { - filter->mutable_min_val()->set_stringval( - LargeIntValue::to_string(*reinterpret_cast(min_data))); - filter->mutable_max_val()->set_stringval( - LargeIntValue::to_string(*reinterpret_cast(max_data))); - return; - } - default: { - throw Exception(ErrorCode::INTERNAL_ERROR, - "runtime filter meet invalid PrimitiveType type {}", - int(_wrapper->column_type())); - } - } + _wrapper->_context->minmax_func->to_pb(filter); } RuntimeFilterType IRuntimeFilter::get_real_type() { @@ -1836,29 +1601,12 @@ RuntimeFilterType IRuntimeFilter::get_real_type() { } bool IRuntimeFilter::need_sync_filter_size() { - return (type() == RuntimeFilterType::IN_OR_BLOOM_FILTER || - type() == RuntimeFilterType::BLOOM_FILTER) && - _wrapper->get_build_bf_cardinality() && !_is_broadcast_join; -} - -Status IRuntimeFilter::update_filter(const UpdateRuntimeFilterParams* param) { - _profile->add_info_string("MergeTime", std::to_string(param->request->merge_time()) + " ms"); - - if (param->request->has_ignored() && param->request->ignored()) { - set_ignored(); - } else { - std::unique_ptr wrapper; - RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(param, &wrapper)); - 
RETURN_IF_ERROR(_wrapper->merge(wrapper.get())); - update_runtime_filter_type_to_profile(); - } - this->signal(); - - return Status::OK(); + return _wrapper->get_build_bf_cardinality() && !_is_broadcast_join; } void IRuntimeFilter::update_filter(std::shared_ptr wrapper, - int64_t merge_time, int64_t start_apply) { + int64_t merge_time, int64_t start_apply, + uint64_t local_merge_time) { _profile->add_info_string("UpdateTime", std::to_string(MonotonicMillis() - start_apply) + " ms"); _profile->add_info_string("MergeTime", std::to_string(merge_time) + " ms"); @@ -1868,7 +1616,10 @@ void IRuntimeFilter::update_filter(std::shared_ptr wrap wrapper->_column_return_type = _wrapper->_column_return_type; } _wrapper = wrapper; - update_runtime_filter_type_to_profile(); + if (_enable_fixed_len_to_uint32_v2) { + _wrapper->set_enable_fixed_len_to_uint32_v2(); + } + update_runtime_filter_type_to_profile(local_merge_time); signal(); } diff --git a/be/src/exprs/runtime_filter.h b/be/src/exprs/runtime_filter.h index f5a069d9e55f85..b0e82a75335cc5 100644 --- a/be/src/exprs/runtime_filter.h +++ b/be/src/exprs/runtime_filter.h @@ -192,8 +192,7 @@ enum RuntimeFilterState { /// that can be pushed down to node based on the results of the right table. 
class IRuntimeFilter { public: - IRuntimeFilter(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc, - bool need_local_merge = false) + IRuntimeFilter(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc) : _state(state), _filter_id(desc->filter_id), _is_broadcast_join(true), @@ -203,20 +202,18 @@ class IRuntimeFilter { _role(RuntimeFilterRole::PRODUCER), _expr_order(-1), registration_time_(MonotonicMillis()), - _wait_infinitely(_state->runtime_filter_wait_infinitely), - _rf_wait_time_ms(_state->runtime_filter_wait_time_ms), + _wait_infinitely(_state->get_query_ctx()->runtime_filter_wait_infinitely()), + _rf_wait_time_ms(_state->get_query_ctx()->runtime_filter_wait_time_ms()), _runtime_filter_type(get_runtime_filter_type(desc)), - _profile( - new RuntimeProfile(fmt::format("RuntimeFilter: (id = {}, type = {})", - _filter_id, to_string(_runtime_filter_type)))), - _need_local_merge(need_local_merge) {} + _profile(new RuntimeProfile(fmt::format("RuntimeFilter: (id = {}, type = {})", + _filter_id, + to_string(_runtime_filter_type)))) {} ~IRuntimeFilter() = default; static Status create(RuntimeFilterParamsContext* state, const TRuntimeFilterDesc* desc, const TQueryOptions* query_options, const RuntimeFilterRole role, - int node_id, std::shared_ptr* res, - bool build_bf_exactly = false, bool need_local_merge = false); + int node_id, std::shared_ptr* res); RuntimeFilterContextSPtr& get_shared_context_ref(); @@ -225,7 +222,7 @@ class IRuntimeFilter { // publish filter // push filter to remote node or push down it to scan_node - Status publish(bool publish_local = false); + Status publish(RuntimeState* state, bool publish_local = false); Status send_filter_size(RuntimeState* state, uint64_t local_filter_size); @@ -262,7 +259,7 @@ class IRuntimeFilter { // init filter with desc Status init_with_desc(const TRuntimeFilterDesc* desc, const TQueryOptions* options, - int node_id = -1, bool build_bf_exactly = false); + int node_id = -1); // serialize 
_wrapper to protobuf Status serialize(PMergeFilterRequest* request, void** data, int* len); @@ -280,9 +277,8 @@ class IRuntimeFilter { std::shared_ptr* wrapper); Status change_to_bloom_filter(); Status init_bloom_filter(const size_t build_bf_cardinality); - Status update_filter(const UpdateRuntimeFilterParams* param); void update_filter(std::shared_ptr filter_wrapper, int64_t merge_time, - int64_t start_apply); + int64_t start_apply, uint64_t local_merge_time); void set_ignored(); @@ -293,13 +289,14 @@ class IRuntimeFilter { bool need_sync_filter_size(); // async push runtimefilter to remote node - Status push_to_remote(const TNetworkAddress* addr); + Status push_to_remote(RuntimeState* state, const TNetworkAddress* addr, + uint64_t local_merge_time); void init_profile(RuntimeProfile* parent_profile); std::string debug_string() const; - void update_runtime_filter_type_to_profile(); + void update_runtime_filter_type_to_profile(uint64_t local_merge_time); int filter_id() const { return _filter_id; } @@ -335,7 +332,7 @@ class IRuntimeFilter { int32_t wait_time_ms() const { int32_t res = 0; if (wait_infinitely()) { - res = _state->execution_timeout; + res = _state->get_query_ctx()->execution_timeout(); // Convert to ms res *= 1000; } else { @@ -356,9 +353,13 @@ class IRuntimeFilter { void set_finish_dependency( const std::shared_ptr& dependency); - int64_t get_synced_size() const { return _synced_size; } - - bool isset_synced_size() const { return _synced_size != -1; } + int64_t get_synced_size() const { + if (_synced_size == -1 || !_dependency) { + throw Exception(doris::ErrorCode::INTERNAL_ERROR, + "sync filter size meet error, filter: {}", debug_string()); + } + return _synced_size; + } protected: // serialize _wrapper to protobuf @@ -417,14 +418,14 @@ class IRuntimeFilter { // parent profile // only effect on consumer std::unique_ptr _profile; - // `_need_local_merge` indicates whether this runtime filter is global on this BE. 
- // All runtime filters should be merged on each BE before push_to_remote or publish. - bool _need_local_merge = false; + RuntimeProfile::Counter* _wait_timer = nullptr; std::vector> _filter_timer; int64_t _synced_size = -1; std::shared_ptr _dependency; + + bool _enable_fixed_len_to_uint32_v2 = false; }; // avoid expose RuntimePredicateWrapper diff --git a/be/src/exprs/runtime_filter_convertor.h b/be/src/exprs/runtime_filter_convertor.h new file mode 100644 index 00000000000000..82df75e4abf329 --- /dev/null +++ b/be/src/exprs/runtime_filter_convertor.h @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "runtime/large_int_value.h" +#include "vec/common/string_ref.h" +#include "vec/core/wide_integer.h" + +namespace doris { + +template +auto get_convertor() { + if constexpr (std::is_same_v) { + return [](PColumnValue* value, const T& data) { value->set_boolval(data); }; + } else if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v) { + return [](PColumnValue* value, const T& data) { value->set_intval(data); }; + } else if constexpr (std::is_same_v || std::is_same_v) { + return [](PColumnValue* value, const T& data) { value->set_longval(data); }; + } else if constexpr (std::is_same_v || std::is_same_v) { + return [](PColumnValue* value, const T& data) { value->set_doubleval(data); }; + } else if constexpr (std::is_same_v || std::is_same_v || + std::is_same_v) { + return [](PColumnValue* value, const T& data) { + value->set_stringval(LargeIntValue::to_string(data)); + }; + } else if constexpr (std::is_same_v) { + return [](PColumnValue* value, const T& data) { + value->set_stringval(wide::to_string(wide::Int256(data))); + }; + } else if constexpr (std::is_same_v) { + return [](PColumnValue* value, const T& data) { value->set_stringval(data); }; + } else if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + return [](PColumnValue* value, const T& data) { value->set_stringval(data.to_string()); }; + } else if constexpr (std::is_same_v) { + return [](PColumnValue* value, const T& data) { + char convert_buffer[30]; + data.to_string(convert_buffer); + value->set_stringval(convert_buffer); + }; + } else if constexpr (std::is_same_v>) { + return [](PColumnValue* value, const T& data) { + value->set_intval(data.to_date_int_val()); + }; + } else if constexpr (std::is_same_v>) { + return [](PColumnValue* value, const T& data) { + value->set_longval(data.to_date_int_val()); + }; + } else { + throw Exception(ErrorCode::INTERNAL_ERROR, + "runtime filter data convertor 
meet invalid type {}", typeid(T).name()); + return [](PColumnValue* value, const T& data) {}; + } +} + +} // namespace doris diff --git a/be/src/exprs/runtime_filter_slots.h b/be/src/exprs/runtime_filter_slots.h index 42c5f598633ad9..7eb8c131c8a303 100644 --- a/be/src/exprs/runtime_filter_slots.h +++ b/be/src/exprs/runtime_filter_slots.h @@ -62,9 +62,8 @@ class VRuntimeFilterSlots { } // use synced size when this rf has global merged - static uint64_t get_real_size(IRuntimeFilter* runtime_filter, uint64_t hash_table_size) { - return runtime_filter->isset_synced_size() ? runtime_filter->get_synced_size() - : hash_table_size; + static uint64_t get_real_size(IRuntimeFilter* filter, uint64_t hash_table_size) { + return filter->need_sync_filter_size() ? filter->get_synced_size() : hash_table_size; } Status ignore_filters(RuntimeState* state) { @@ -119,10 +118,6 @@ class VRuntimeFilterSlots { } if (filter->get_real_type() == RuntimeFilterType::BLOOM_FILTER) { - if (filter->need_sync_filter_size() != filter->isset_synced_size()) { - return Status::InternalError("sync filter size meet error, filter: {}", - filter->debug_string()); - } RETURN_IF_ERROR(filter->init_bloom_filter( get_real_size(filter.get(), local_hash_table_size))); } @@ -149,10 +144,10 @@ class VRuntimeFilterSlots { } // publish runtime filter - Status publish(bool publish_local) { + Status publish(RuntimeState* state, bool publish_local) { for (auto& pair : _runtime_filters_map) { for (auto& filter : pair.second) { - RETURN_IF_ERROR(filter->publish(publish_local)); + RETURN_IF_ERROR(filter->publish(state, publish_local)); } } return Status::OK(); diff --git a/be/src/exprs/runtime_filter_slots_cross.h b/be/src/exprs/runtime_filter_slots_cross.h index 01ae21a75992de..a49f2928f842a9 100644 --- a/be/src/exprs/runtime_filter_slots_cross.h +++ b/be/src/exprs/runtime_filter_slots_cross.h @@ -72,9 +72,9 @@ class VRuntimeFilterSlotsCross { return Status::OK(); } - Status publish() { + Status publish(RuntimeState* 
state) { for (auto filter : _runtime_filters) { - RETURN_IF_ERROR(filter->publish()); + RETURN_IF_ERROR(filter->publish(state)); } return Status::OK(); } diff --git a/be/src/gutil/strings/stringpiece.h b/be/src/gutil/strings/stringpiece.h index 38e36a27099279..7a4ebabbf098e7 100644 --- a/be/src/gutil/strings/stringpiece.h +++ b/be/src/gutil/strings/stringpiece.h @@ -149,6 +149,12 @@ class StringPiece { assert(length <= static_cast(std::numeric_limits::max())); length_ = static_cast(length); } + StringPiece(std::string_view view) // NOLINT(runtime/explicit) + : ptr_(view.data()), length_(0) { + size_t length = view.size(); + assert(length <= static_cast(std::numeric_limits::max())); + length_ = static_cast(length); + } StringPiece(const char* offset, int len) : ptr_(offset), length_(len) { assert(len >= 0); } // Substring of another StringPiece. diff --git a/be/src/http/action/batch_download_action.cpp b/be/src/http/action/batch_download_action.cpp new file mode 100644 index 00000000000000..d486883e90be28 --- /dev/null +++ b/be/src/http/action/batch_download_action.cpp @@ -0,0 +1,216 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "http/action/batch_download_action.h" + +#include +#include +#include +#include + +#include "common/config.h" +#include "common/logging.h" +#include "common/status.h" +#include "gutil/strings/split.h" +#include "http/http_channel.h" +#include "http/http_method.h" +#include "http/http_request.h" +#include "http/utils.h" +#include "io/fs/local_file_system.h" +#include "runtime/exec_env.h" +#include "util/security.h" + +namespace doris { +namespace { +const std::string CHECK_PARAMETER = "check"; +const std::string LIST_PARAMETER = "list"; +const std::string DIR_PARAMETER = "dir"; +const std::string TOKEN_PARAMETER = "token"; +} // namespace + +BatchDownloadAction::BatchDownloadAction( + ExecEnv* exec_env, std::shared_ptr rate_limit_group, + const std::vector& allow_dirs) + : HttpHandlerWithAuth(exec_env), _rate_limit_group(std::move(rate_limit_group)) { + for (const auto& dir : allow_dirs) { + std::string p; + Status st = io::global_local_filesystem()->canonicalize(dir, &p); + if (!st.ok()) { + continue; + } + _allow_paths.emplace_back(std::move(p)); + } +} + +void BatchDownloadAction::handle(HttpRequest* req) { + if (VLOG_CRITICAL_IS_ON) { + VLOG_CRITICAL << "accept one batch download request " << req->debug_string(); + } + + if (req->param(CHECK_PARAMETER) == "true") { + // For API support check + HttpChannel::send_reply(req, "OK"); + return; + } + + // Get 'dir' parameter, then assembly file absolute path + const std::string& dir_path = req->param(DIR_PARAMETER); + if (dir_path.empty()) { + std::string error_msg = + std::string("parameter " + DIR_PARAMETER + " not specified in url."); + LOG(WARNING) << "handle batch download request: " << error_msg + << ", url: " << mask_token(req->uri()); + HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, error_msg); + return; + } + + if (dir_path.find("..") != std::string::npos) { + std::string error_msg = "Not allowed to read relative path: " + dir_path; + LOG(WARNING) << "handle batch download request: " << 
error_msg + << ", url: " << mask_token(req->uri()); + HttpChannel::send_reply(req, HttpStatus::FORBIDDEN, error_msg); + return; + } + + Status status; + if (config::enable_token_check) { + status = _check_token(req); + if (!status.ok()) { + std::string error_msg = status.to_string(); + if (status.is()) { + HttpChannel::send_reply(req, HttpStatus::UNAUTHORIZED, error_msg); + return; + } else { + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, error_msg); + return; + } + } + } + + status = _check_path_is_allowed(dir_path); + if (!status.ok()) { + std::string error_msg = status.to_string(); + if (status.is() || status.is()) { + HttpChannel::send_reply(req, HttpStatus::NOT_FOUND, error_msg); + return; + } else if (status.is()) { + HttpChannel::send_reply(req, HttpStatus::UNAUTHORIZED, error_msg); + return; + } else { + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, error_msg); + return; + } + } + + bool is_dir = false; + status = io::global_local_filesystem()->is_directory(dir_path, &is_dir); + if (!status.ok()) { + LOG(WARNING) << "handle batch download request: " << status.to_string() + << ", url: " << mask_token(req->uri()); + HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, status.to_string()); + return; + } + + if (!is_dir) { + std::string error_msg = fmt::format("The requested path is not a directory: {}", dir_path); + LOG(WARNING) << "handle batch download request: " << error_msg + << ", url: " << mask_token(req->uri()); + HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, error_msg); + return; + } + + _handle(req, dir_path); + + VLOG_CRITICAL << "deal with batch download request finished! 
"; +} + +void BatchDownloadAction::_handle(HttpRequest* req, const std::string& dir_path) { + bool is_list_request = req->param(LIST_PARAMETER) == "true"; + if (is_list_request) { + // return the list of files in the specified directory + bool is_acquire_filesize = true; + do_dir_response(dir_path, req, is_acquire_filesize); + } else { + _handle_batch_download(req, dir_path); + } +} + +void BatchDownloadAction::_handle_batch_download(HttpRequest* req, const std::string& dir_path) { + std::vector files = + strings::Split(req->get_request_body(), "\n", strings::SkipWhitespace()); + if (files.empty()) { + std::string error_msg = "No file specified in request body."; + LOG(WARNING) << "handle batch download request: " << error_msg + << ", url: " << mask_token(req->uri()); + HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, error_msg); + return; + } + + if (files.size() > 64) { + std::string error_msg = + "The number of files to download in a batch should be less than 64."; + LOG(WARNING) << "handle batch download request: " << error_msg + << ", url: " << mask_token(req->uri()); + HttpChannel::send_reply(req, HttpStatus::BAD_REQUEST, error_msg); + return; + } + + for (const auto& file : files) { + if (file.find('/') != std::string::npos) { + std::string error_msg = + fmt::format("Not allowed to read relative path: {}, dir: {}", file, dir_path); + LOG(WARNING) << "handle batch download request: " << error_msg + << ", url: " << mask_token(req->uri()); + HttpChannel::send_reply(req, HttpStatus::FORBIDDEN, error_msg); + return; + } + } + + HttpChannel::send_files(req, dir_path, std::move(files)); +} + +Status BatchDownloadAction::_check_token(HttpRequest* req) { + const std::string& token_str = req->param(TOKEN_PARAMETER); + if (token_str.empty()) { + LOG(WARNING) << "token is not specified in request. 
url: " << mask_token(req->uri()); + return Status::NotAuthorized("token is not specified."); + } + + const std::string& local_token = _exec_env->token(); + if (token_str != local_token) { + LOG(WARNING) << "invalid download token: " << mask_token(token_str) + << ", local token: " << mask_token(local_token) + << ", url: " << mask_token(req->uri()); + return Status::NotAuthorized("invalid token {}", mask_token(token_str)); + } + + return Status::OK(); +} + +Status BatchDownloadAction::_check_path_is_allowed(const std::string& file_path) { + std::string canonical_file_path; + RETURN_IF_ERROR(io::global_local_filesystem()->canonicalize(file_path, &canonical_file_path)); + for (auto& allow_path : _allow_paths) { + if (io::LocalFileSystem::contain_path(allow_path, canonical_file_path)) { + return Status::OK(); + } + } + + return Status::NotAuthorized("file path is not allowed: {}", canonical_file_path); +} + +} // end namespace doris diff --git a/be/src/http/action/batch_download_action.h b/be/src/http/action/batch_download_action.h new file mode 100644 index 00000000000000..f0b7e3576b9937 --- /dev/null +++ b/be/src/http/action/batch_download_action.h @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include + +#include "common/status.h" +#include "http/http_handler.h" +#include "http/http_handler_with_auth.h" +#include "util/threadpool.h" + +struct bufferevent_rate_limit_group; + +namespace doris { + +class ExecEnv; +class HttpRequest; + +// A simple handler that serves incoming HTTP requests of batching file-download to send their +// respective HTTP responses. +// +// We use parameter named 'dir' to specify the static resource path, it is an absolute path. +// +// In HEAD request, then this handler will return the list of files in the specified directory. +// +// In GET request, the file names to download are specified in the request body as a list of strings, +// separated by '\n'. To avoid cost resource, the maximum number of files to download in a batch is 64. +class BatchDownloadAction : public HttpHandlerWithAuth { +public: + BatchDownloadAction(ExecEnv* exec_env, + std::shared_ptr rate_limit_group, + const std::vector& allow_dirs); + + ~BatchDownloadAction() override = default; + + void handle(HttpRequest* req) override; + +private: + Status _check_token(HttpRequest* req); + Status _check_path_is_allowed(const std::string& path); + + void _handle(HttpRequest* req, const std::string& dir_path); + void _handle_batch_download(HttpRequest* req, const std::string& dir_path); + + std::vector _allow_paths; + std::shared_ptr _rate_limit_group; +}; + +} // end namespace doris diff --git a/be/src/http/action/calc_file_crc_action.cpp b/be/src/http/action/calc_file_crc_action.cpp index 66ec96a2a9ac65..123f55dd7fd744 100644 --- a/be/src/http/action/calc_file_crc_action.cpp +++ b/be/src/http/action/calc_file_crc_action.cpp @@ -46,7 +46,7 @@ CalcFileCrcAction::CalcFileCrcAction(ExecEnv* exec_env, BaseStorageEngine& engin // calculate the crc value of the files in the tablet Status CalcFileCrcAction::_handle_calc_crc(HttpRequest* req, uint32_t* crc_value, int64_t* start_version, int64_t* end_version, - int32_t* rowset_count, int64_t* 
file_count) { + uint32_t* rowset_count, int64_t* file_count) { uint64_t tablet_id = 0; const auto& req_tablet_id = req->param(TABLET_ID_KEY); if (req_tablet_id.empty()) { @@ -110,7 +110,7 @@ void CalcFileCrcAction::handle(HttpRequest* req) { uint32_t crc_value = 0; int64_t start_version = 0; int64_t end_version = 0; - int32_t rowset_count = 0; + uint32_t rowset_count = 0; int64_t file_count = 0; MonotonicStopWatch timer; diff --git a/be/src/http/action/calc_file_crc_action.h b/be/src/http/action/calc_file_crc_action.h index 30df8bfe629cf3..ea5fb894957d19 100644 --- a/be/src/http/action/calc_file_crc_action.h +++ b/be/src/http/action/calc_file_crc_action.h @@ -44,7 +44,7 @@ class CalcFileCrcAction : public HttpHandlerWithAuth { private: Status _handle_calc_crc(HttpRequest* req, uint32_t* crc_value, int64_t* start_version, - int64_t* end_version, int32_t* rowset_count, int64_t* file_count); + int64_t* end_version, uint32_t* rowset_count, int64_t* file_count); private: BaseStorageEngine& _engine; diff --git a/be/src/cloud/cloud_delete_bitmap_action.cpp b/be/src/http/action/delete_bitmap_action.cpp similarity index 52% rename from be/src/cloud/cloud_delete_bitmap_action.cpp rename to be/src/http/action/delete_bitmap_action.cpp index 60db5896dfab8a..2fa0a73c2f338c 100644 --- a/be/src/cloud/cloud_delete_bitmap_action.cpp +++ b/be/src/http/action/delete_bitmap_action.cpp @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "cloud_delete_bitmap_action.h" +#include "delete_bitmap_action.h" #include #include @@ -33,8 +33,11 @@ #include #include +#include "cloud/cloud_meta_mgr.h" +#include "cloud/cloud_storage_engine.h" #include "cloud/cloud_tablet.h" #include "cloud/cloud_tablet_mgr.h" +#include "cloud/config.h" #include "common/logging.h" #include "common/status.h" #include "gutil/strings/substitute.h" @@ -43,12 +46,12 @@ #include "http/http_request.h" #include "http/http_status.h" #include "olap/olap_define.h" -#include "olap/storage_engine.h" #include "olap/tablet_manager.h" #include "util/doris_metrics.h" #include "util/stopwatch.hpp" namespace doris { +#include "common/compile_check_begin.h" using namespace ErrorCode; namespace { @@ -57,10 +60,9 @@ constexpr std::string_view HEADER_JSON = "application/json"; } // namespace -CloudDeleteBitmapAction::CloudDeleteBitmapAction(DeleteBitmapActionType ctype, ExecEnv* exec_env, - CloudStorageEngine& engine, - TPrivilegeHier::type hier, - TPrivilegeType::type ptype) +DeleteBitmapAction::DeleteBitmapAction(DeleteBitmapActionType ctype, ExecEnv* exec_env, + BaseStorageEngine& engine, TPrivilegeHier::type hier, + TPrivilegeType::type ptype) : HttpHandlerWithAuth(exec_env, hier, ptype), _engine(engine), _delete_bitmap_action_type(ctype) {} @@ -78,8 +80,8 @@ static Status _check_param(HttpRequest* req, uint64_t* tablet_id) { return Status::OK(); } -Status CloudDeleteBitmapAction::_handle_show_delete_bitmap_count(HttpRequest* req, - std::string* json_result) { +Status DeleteBitmapAction::_handle_show_local_delete_bitmap_count(HttpRequest* req, + std::string* json_result) { uint64_t tablet_id = 0; // check & retrieve tablet_id from req if it contains RETURN_NOT_OK_STATUS_WITH_WARN(_check_param(req, &tablet_id), "check param failed"); @@ -87,14 +89,62 @@ Status CloudDeleteBitmapAction::_handle_show_delete_bitmap_count(HttpRequest* re return Status::InternalError("check param failed: missing tablet_id"); } - CloudTabletSPtr tablet = 
DORIS_TRY(_engine.tablet_mgr().get_tablet(tablet_id)); + BaseTabletSPtr tablet = nullptr; + if (config::is_cloud_mode()) { + tablet = DORIS_TRY(_engine.to_cloud().tablet_mgr().get_tablet(tablet_id)); + } else { + tablet = _engine.to_local().tablet_manager()->get_tablet(tablet_id); + } if (tablet == nullptr) { return Status::NotFound("Tablet not found. tablet_id={}", tablet_id); } + auto count = tablet->tablet_meta()->delete_bitmap().get_delete_bitmap_count(); + auto cardinality = tablet->tablet_meta()->delete_bitmap().cardinality(); + auto size = tablet->tablet_meta()->delete_bitmap().get_size(); + LOG(INFO) << "show_local_delete_bitmap_count,tablet_id=" << tablet_id << ",count=" << count + << ",cardinality=" << cardinality << ",size=" << size; + rapidjson::Document root; + root.SetObject(); + root.AddMember("delete_bitmap_count", count, root.GetAllocator()); + root.AddMember("cardinality", cardinality, root.GetAllocator()); + root.AddMember("size", size, root.GetAllocator()); + + // to json string + rapidjson::StringBuffer strbuf; + rapidjson::PrettyWriter writer(strbuf); + root.Accept(writer); + *json_result = std::string(strbuf.GetString()); + + return Status::OK(); +} + +Status DeleteBitmapAction::_handle_show_ms_delete_bitmap_count(HttpRequest* req, + std::string* json_result) { + uint64_t tablet_id = 0; + // check & retrieve tablet_id from req if it contains + RETURN_NOT_OK_STATUS_WITH_WARN(_check_param(req, &tablet_id), "check param failed"); + if (tablet_id == 0) { + return Status::InternalError("check param failed: missing tablet_id"); + } + TabletMetaSharedPtr tablet_meta; + auto st = _engine.to_cloud().meta_mgr().get_tablet_meta(tablet_id, &tablet_meta); + if (!st.ok()) { + LOG(WARNING) << "failed to get_tablet_meta tablet=" << tablet_id + << ", st=" << st.to_string(); + return st; + } + auto tablet = std::make_shared(_engine.to_cloud(), std::move(tablet_meta)); + st = _engine.to_cloud().meta_mgr().sync_tablet_rowsets(tablet.get(), false, true, true); + 
if (!st.ok()) { + LOG(WARNING) << "failed to sync tablet=" << tablet_id << ", st=" << st; + return st; + } auto count = tablet->tablet_meta()->delete_bitmap().get_delete_bitmap_count(); auto cardinality = tablet->tablet_meta()->delete_bitmap().cardinality(); auto size = tablet->tablet_meta()->delete_bitmap().get_size(); + LOG(INFO) << "show_ms_delete_bitmap_count,tablet_id=" << tablet_id << ",count=" << count + << ",cardinality=" << cardinality << ",size=" << size; rapidjson::Document root; root.SetObject(); @@ -111,11 +161,19 @@ Status CloudDeleteBitmapAction::_handle_show_delete_bitmap_count(HttpRequest* re return Status::OK(); } -void CloudDeleteBitmapAction::handle(HttpRequest* req) { +void DeleteBitmapAction::handle(HttpRequest* req) { req->add_output_header(HttpHeaders::CONTENT_TYPE, HEADER_JSON.data()); - if (_delete_bitmap_action_type == DeleteBitmapActionType::COUNT_INFO) { + if (_delete_bitmap_action_type == DeleteBitmapActionType::COUNT_LOCAL) { + std::string json_result; + Status st = _handle_show_local_delete_bitmap_count(req, &json_result); + if (!st.ok()) { + HttpChannel::send_reply(req, HttpStatus::OK, st.to_json()); + } else { + HttpChannel::send_reply(req, HttpStatus::OK, json_result); + } + } else if (_delete_bitmap_action_type == DeleteBitmapActionType::COUNT_MS) { std::string json_result; - Status st = _handle_show_delete_bitmap_count(req, &json_result); + Status st = _handle_show_ms_delete_bitmap_count(req, &json_result); if (!st.ok()) { HttpChannel::send_reply(req, HttpStatus::OK, st.to_json()); } else { @@ -124,4 +182,5 @@ void CloudDeleteBitmapAction::handle(HttpRequest* req) { } } +#include "common/compile_check_end.h" } // namespace doris \ No newline at end of file diff --git a/be/src/cloud/cloud_delete_bitmap_action.h b/be/src/http/action/delete_bitmap_action.h similarity index 64% rename from be/src/cloud/cloud_delete_bitmap_action.h rename to be/src/http/action/delete_bitmap_action.h index 9321661374c195..284e8dbcf5705b 100644 --- 
a/be/src/cloud/cloud_delete_bitmap_action.h +++ b/be/src/http/action/delete_bitmap_action.h @@ -21,34 +21,36 @@ #include -#include "cloud/cloud_storage_engine.h" #include "common/status.h" #include "http/http_handler_with_auth.h" +#include "olap/storage_engine.h" #include "olap/tablet.h" namespace doris { +#include "common/compile_check_begin.h" class HttpRequest; class ExecEnv; -enum class DeleteBitmapActionType { COUNT_INFO = 1 }; +enum class DeleteBitmapActionType { COUNT_LOCAL = 1, COUNT_MS = 2 }; /// This action is used for viewing the delete bitmap status -class CloudDeleteBitmapAction : public HttpHandlerWithAuth { +class DeleteBitmapAction : public HttpHandlerWithAuth { public: - CloudDeleteBitmapAction(DeleteBitmapActionType ctype, ExecEnv* exec_env, - CloudStorageEngine& engine, TPrivilegeHier::type hier, - TPrivilegeType::type ptype); + DeleteBitmapAction(DeleteBitmapActionType ctype, ExecEnv* exec_env, BaseStorageEngine& engine, + TPrivilegeHier::type hier, TPrivilegeType::type ptype); - ~CloudDeleteBitmapAction() override = default; + ~DeleteBitmapAction() override = default; void handle(HttpRequest* req) override; private: - Status _handle_show_delete_bitmap_count(HttpRequest* req, std::string* json_result); + Status _handle_show_local_delete_bitmap_count(HttpRequest* req, std::string* json_result); + Status _handle_show_ms_delete_bitmap_count(HttpRequest* req, std::string* json_result); private: - CloudStorageEngine& _engine; + BaseStorageEngine& _engine; DeleteBitmapActionType _delete_bitmap_action_type; }; +#include "common/compile_check_end.h" } // namespace doris \ No newline at end of file diff --git a/be/src/http/action/download_binlog_action.cpp b/be/src/http/action/download_binlog_action.cpp index 54701c5e463481..4bb8b8b70dd722 100644 --- a/be/src/http/action/download_binlog_action.cpp +++ b/be/src/http/action/download_binlog_action.cpp @@ -21,11 +21,9 @@ #include #include -#include #include #include #include -#include #include 
"common/config.h" #include "common/logging.h" @@ -34,7 +32,6 @@ #include "http/utils.h" #include "io/fs/local_file_system.h" #include "olap/storage_engine.h" -#include "olap/tablet.h" #include "olap/tablet_manager.h" #include "runtime/exec_env.h" @@ -147,8 +144,19 @@ void handle_get_segment_index_file(StorageEngine& engine, HttpRequest* req, const auto& rowset_id = get_http_param(req, kRowsetIdParameter); const auto& segment_index = get_http_param(req, kSegmentIndexParameter); const auto& segment_index_id = req->param(kSegmentIndexIdParameter); - segment_index_file_path = - tablet->get_segment_index_filepath(rowset_id, segment_index, segment_index_id); + auto segment_file_path = tablet->get_segment_filepath(rowset_id, segment_index); + if (tablet->tablet_schema()->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::V1) { + // now CCR not support for variant + index v1 + constexpr std::string_view index_suffix = ""; + segment_index_file_path = InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_file_path), + std::stoll(segment_index_id), index_suffix); + } else { + DCHECK(segment_index_id == "-1"); + segment_index_file_path = InvertedIndexDescriptor::get_index_file_path_v2( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_file_path)); + } is_acquire_md5 = !req->param(kAcquireMD5Parameter).empty(); } catch (const std::exception& e) { HttpChannel::send_reply(req, HttpStatus::INTERNAL_SERVER_ERROR, e.what()); diff --git a/be/src/http/action/stream_load.cpp b/be/src/http/action/stream_load.cpp index 7e71f3eb910053..e8db5cb542fb4b 100644 --- a/be/src/http/action/stream_load.cpp +++ b/be/src/http/action/stream_load.cpp @@ -145,7 +145,8 @@ void StreamLoadAction::handle(HttpRequest* req) { << ctx->commit_and_publish_txn_cost_nanos / 1000000 << ", number_total_rows=" << ctx->number_total_rows << ", number_loaded_rows=" << ctx->number_loaded_rows - << ", receive_bytes=" << 
ctx->receive_bytes << ", loaded_bytes=" << ctx->loaded_bytes; + << ", receive_bytes=" << ctx->receive_bytes << ", loaded_bytes=" << ctx->loaded_bytes + << ", error_url=" << ctx->error_url; // update statistics streaming_load_requests_total->increment(1); diff --git a/be/src/http/http_channel.cpp b/be/src/http/http_channel.cpp index 96679195316dac..312f1ab9286909 100644 --- a/be/src/http/http_channel.cpp +++ b/be/src/http/http_channel.cpp @@ -20,8 +20,8 @@ #include #include #include +#include -#include #include #include #include @@ -57,7 +57,7 @@ void HttpChannel::send_reply(HttpRequest* request, HttpStatus status) { } void HttpChannel::send_reply(HttpRequest* request, HttpStatus status, const std::string& content) { - auto evb = evbuffer_new(); + auto* evb = evbuffer_new(); std::string compressed_content; if (compress_content(request->header(HttpHeaders::ACCEPT_ENCODING), content, &compressed_content)) { @@ -72,7 +72,7 @@ void HttpChannel::send_reply(HttpRequest* request, HttpStatus status, const std: void HttpChannel::send_file(HttpRequest* request, int fd, size_t off, size_t size, bufferevent_rate_limit_group* rate_limit_group) { - auto evb = evbuffer_new(); + auto* evb = evbuffer_new(); evbuffer_add_file(evb, fd, off, size); auto* evhttp_request = request->get_evhttp_request(); if (rate_limit_group) { @@ -84,6 +84,56 @@ void HttpChannel::send_file(HttpRequest* request, int fd, size_t off, size_t siz evbuffer_free(evb); } +void HttpChannel::send_files(HttpRequest* request, const std::string& root_dir, + std::vector local_files, + bufferevent_rate_limit_group* rate_limit_group) { + if (rate_limit_group) { + auto* evhttp_request = request->get_evhttp_request(); + auto* evhttp_connection = evhttp_request_get_connection(evhttp_request); + auto* buffer_event = evhttp_connection_get_bufferevent(evhttp_connection); + bufferevent_add_to_rate_limit_group(buffer_event, rate_limit_group); + } + + send_files(request, root_dir, std::move(local_files)); +} + +void 
HttpChannel::send_files(HttpRequest* request, const std::string& root_dir, + std::vector local_files) { + std::unique_ptr evb(evbuffer_new(), &evbuffer_free); + for (const std::string& file : local_files) { + std::string file_path = fmt::format("{}/{}", root_dir, file); + int fd = open(file_path.c_str(), O_RDONLY); + if (fd < 0) { + std::string error_msg = "Failed to open file: " + file_path; + LOG(WARNING) << "http channel send files: " << error_msg; + HttpChannel::send_reply(request, HttpStatus::NOT_FOUND, error_msg); + return; + } + struct stat st; + auto res = fstat(fd, &st); + if (res < 0) { + close(fd); + std::string error_msg = "Failed to open file: " + file_path; + LOG(WARNING) << "http channel send files: " << error_msg; + HttpChannel::send_reply(request, HttpStatus::NOT_FOUND, error_msg); + return; + } + + int64_t file_size = st.st_size; + VLOG_DEBUG << "http channel send file " << file_path << ", size: " << file_size; + + evbuffer_add_printf(evb.get(), "File-Name: %s\r\n", file.c_str()); + evbuffer_add_printf(evb.get(), "Content-Length: %ld\r\n", file_size); + evbuffer_add_printf(evb.get(), "\r\n"); + if (file_size > 0) { + evbuffer_add_file(evb.get(), fd, 0, file_size); + } + } + + evhttp_send_reply(request->get_evhttp_request(), HttpStatus::OK, + default_reason(HttpStatus::OK).c_str(), evb.get()); +} + bool HttpChannel::compress_content(const std::string& accept_encoding, const std::string& input, std::string* output) { // Don't bother compressing empty content. 
diff --git a/be/src/http/http_channel.h b/be/src/http/http_channel.h index ee1e6c0888f1d3..0d5e5d4260af8c 100644 --- a/be/src/http/http_channel.h +++ b/be/src/http/http_channel.h @@ -20,6 +20,7 @@ #include #include +#include #include "http/http_status.h" @@ -47,6 +48,13 @@ class HttpChannel { static void send_file(HttpRequest* request, int fd, size_t off, size_t size, bufferevent_rate_limit_group* rate_limit_group = nullptr); + static void send_files(HttpRequest* request, const std::string& root_dir, + std::vector local_files, + bufferevent_rate_limit_group* rate_limit_group); + + static void send_files(HttpRequest* request, const std::string& root_dir, + std::vector local_files); + static bool compress_content(const std::string& accept_encoding, const std::string& input, std::string* output); }; diff --git a/be/src/http/http_client.cpp b/be/src/http/http_client.cpp index c842a4fe2dd4ce..767377cea3f365 100644 --- a/be/src/http/http_client.cpp +++ b/be/src/http/http_client.cpp @@ -24,13 +24,225 @@ #include #include "common/config.h" +#include "common/status.h" #include "http/http_headers.h" -#include "http/http_status.h" #include "runtime/exec_env.h" +#include "util/security.h" #include "util/stack_util.h" namespace doris { +class MultiFileSplitter { +public: + MultiFileSplitter(std::string local_dir, std::unordered_set expected_files) + : _local_dir_path(std::move(local_dir)), _expected_files(std::move(expected_files)) {} + ~MultiFileSplitter() { + if (_fd >= 0) { + close(_fd); + } + + if (!_status.ok() && !downloaded_files.empty()) { + LOG(WARNING) << "download files to " << _local_dir_path << " failed, try remove the " + << downloaded_files.size() << " downloaded files"; + for (const auto& file : downloaded_files) { + remove(file.c_str()); + } + } + } + + bool append(const char* data, size_t length) { + // Already failed. 
+ if (!_status.ok()) { + return false; + } + + std::string buf; + if (!_buffer.empty()) { + buf.swap(_buffer); + buf.append(data, length); + data = buf.data(); + length = buf.size(); + } + return append_inner(data, length); + } + + Status finish() { + if (_status.ok()) { + _status = finish_inner(); + } + + return _status; + } + +private: + bool append_inner(const char* data, size_t length) { + while (length > 0) { + int consumed = 0; + if (_is_reading_header) { + consumed = parse_header(data, length); + } else { + consumed = append_file(data, length); + } + + if (consumed < 0) { + return false; + } + + DCHECK(consumed <= length); + data += consumed; + length -= consumed; + } + return true; + } + + int parse_header(const char* data, size_t length) { + DCHECK(_fd < 0); + + std::string_view buf(data, length); + size_t pos = buf.find("\r\n\r\n"); + if (pos == std::string::npos) { + _buffer.append(data, length); + return static_cast(length); + } + + // header already read. + _is_reading_header = false; + + bool has_file_name = false; + bool has_file_size = false; + std::string_view header = buf.substr(0, pos); + std::vector headers = + strings::Split(header, "\r\n", strings::SkipWhitespace()); + for (auto& s : headers) { + size_t header_pos = s.find(':'); + if (header_pos == std::string::npos) { + continue; + } + std::string_view header_view(s); + std::string_view key = header_view.substr(0, header_pos); + std::string_view value = header_view.substr(header_pos + 1); + if (value.starts_with(' ')) { + value.remove_prefix(std::min(value.find_first_not_of(' '), value.size())); + } + if (key == "File-Name") { + _file_name = value; + has_file_name = true; + } else if (key == "Content-Length") { + auto res = std::from_chars(value.data(), value.data() + value.size(), _file_size); + if (res.ec != std::errc()) { + std::string error_msg = fmt::format("invalid content length: {}", value); + LOG(WARNING) << "download files to " << _local_dir_path + << "failed, err=" << error_msg; + 
_status = Status::HttpError(std::move(error_msg)); + return -1; + } + has_file_size = true; + } + } + + if (!has_file_name || !has_file_size) { + std::string error_msg = + fmt::format("invalid multi part header, has file name: {}, has file size: {}", + has_file_name, has_file_size); + LOG(WARNING) << "download files to " << _local_dir_path << "failed, err=" << error_msg; + _status = Status::HttpError(std::move(error_msg)); + return -1; + } + + if (!_expected_files.contains(_file_name)) { + std::string error_msg = fmt::format("unexpected file: {}", _file_name); + LOG(WARNING) << "download files to " << _local_dir_path << "failed, err=" << error_msg; + _status = Status::HttpError(std::move(error_msg)); + return -1; + } + + VLOG_DEBUG << "receive file " << _file_name << ", size " << _file_size; + + _written_size = 0; + _local_file_path = fmt::format("{}/{}", _local_dir_path, _file_name); + _fd = open(_local_file_path.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644); + if (_fd < 0) { + std::string error_msg = "fail to open file to write: " + _local_file_path; + LOG(WARNING) << "download files to " << _local_dir_path << "failed, err=" << error_msg; + _status = Status::IOError(std::move(error_msg)); + return -1; + } + downloaded_files.push_back(_local_file_path); + + return static_cast(pos + 4); + } + + int append_file(const char* data, size_t length) { + DCHECK(_fd >= 0); + DCHECK(_file_size >= _written_size); + + size_t write_size = std::min(length, _file_size - _written_size); + if (write_size > 0 && write(_fd, data, write_size) < 0) { + auto msg = fmt::format("write file failed, file={}, error={}", _local_file_path, + strerror(errno)); + LOG(WARNING) << "download files to " << _local_dir_path << "failed, err=" << msg; + _status = Status::HttpError(std::move(msg)); + return -1; + } + + _written_size += write_size; + if (_written_size == _file_size) { + // This file has been downloaded, switch to the next one. 
+ switchToNextFile(); + } + + return write_size; + } + + Status finish_inner() { + if (!_is_reading_header && _written_size == _file_size) { + switchToNextFile(); + } + + if (_fd >= 0) { + // This file is not completely downloaded. + close(_fd); + _fd = -1; + auto error_msg = fmt::format("file {} is not completely downloaded", _local_file_path); + LOG(WARNING) << "download files to " << _local_dir_path << "failed, err=" << error_msg; + return Status::HttpError(std::move(error_msg)); + } + + if (!_expected_files.empty()) { + auto error_msg = fmt::format("not all files are downloaded, {} missing files", + _expected_files.size()); + LOG(WARNING) << "download files to " << _local_dir_path << "failed, err=" << error_msg; + return Status::HttpError(std::move(error_msg)); + } + + downloaded_files.clear(); + return Status::OK(); + } + + void switchToNextFile() { + DCHECK(_fd >= 0); + DCHECK(_written_size == _file_size); + + close(_fd); + _fd = -1; + _expected_files.erase(_file_name); + _is_reading_header = true; + } + + const std::string _local_dir_path; + std::string _buffer; + std::unordered_set _expected_files; + Status _status; + + bool _is_reading_header = true; + int _fd = -1; + std::string _local_file_path; + std::string _file_name; + size_t _file_size = 0; + size_t _written_size = 0; + std::vector downloaded_files; +}; + static const char* header_error_msg(CURLHcode code) { switch (code) { case CURLHE_OK: @@ -173,6 +385,12 @@ void HttpClient::set_method(HttpMethod method) { } } +void HttpClient::set_speed_limit() { + curl_easy_setopt(_curl, CURLOPT_LOW_SPEED_LIMIT, config::download_low_speed_limit_kbps * 1024); + curl_easy_setopt(_curl, CURLOPT_LOW_SPEED_TIME, config::download_low_speed_time); + curl_easy_setopt(_curl, CURLOPT_MAX_RECV_SPEED_LARGE, config::max_download_speed_kbps * 1024); +} + size_t HttpClient::on_response_data(const void* data, size_t length) { if (*_callback != nullptr) { bool is_continue = (*_callback)(data, length); @@ -183,12 +401,6 @@ size_t 
HttpClient::on_response_data(const void* data, size_t length) { return length; } -// Status HttpClient::execute_post_request(const std::string& post_data, const std::function& callback = {}) { -// _callback = &callback; -// set_post_body(post_data); -// return execute(callback); -// } - Status HttpClient::execute_post_request(const std::string& payload, std::string* response) { set_method(POST); set_payload(payload); @@ -205,9 +417,11 @@ Status HttpClient::execute(const std::function fp(fopen(local_path.c_str(), "w"), fp_closer); @@ -267,6 +475,20 @@ Status HttpClient::download(const std::string& local_path) { return status; } +Status HttpClient::download_multi_files(const std::string& local_dir, + const std::unordered_set& expected_files) { + set_speed_limit(); + + MultiFileSplitter splitter(local_dir, expected_files); + auto callback = [&](const void* data, size_t length) { + return splitter.append(reinterpret_cast(data), length); + }; + if (auto s = execute(callback); !s.ok()) { + return s; + } + return splitter.finish(); +} + Status HttpClient::execute(std::string* response) { auto callback = [response](const void* data, size_t length) { response->append((char*)data, length); @@ -275,13 +497,22 @@ Status HttpClient::execute(std::string* response) { return execute(callback); } -const char* HttpClient::_to_errmsg(CURLcode code) { +const char* HttpClient::_to_errmsg(CURLcode code) const { if (_error_buf[0] == 0) { return curl_easy_strerror(code); } return _error_buf; } +const char* HttpClient::_get_url() const { + const char* url = nullptr; + curl_easy_getinfo(_curl, CURLINFO_EFFECTIVE_URL, &url); + if (!url) { + url = ""; + } + return url; +} + Status HttpClient::execute_with_retry(int retry_times, int sleep_time, const std::function& callback) { Status status; @@ -293,7 +524,9 @@ Status HttpClient::execute_with_retry(int retry_times, int sleep_time, if (http_status == 200) { return status; } else { - auto error_msg = fmt::format("http status code is not 200, 
code={}", http_status); + std::string url = mask_token(client._get_url()); + auto error_msg = fmt::format("http status code is not 200, code={}, url={}", + http_status, url); LOG(WARNING) << error_msg; return Status::HttpError(error_msg); } diff --git a/be/src/http/http_client.h b/be/src/http/http_client.h index fb692c50268484..a6f2f4fdff514b 100644 --- a/be/src/http/http_client.h +++ b/be/src/http/http_client.h @@ -24,6 +24,7 @@ #include #include #include +#include #include "common/status.h" #include "http/http_headers.h" @@ -81,6 +82,8 @@ class HttpClient { curl_easy_setopt(_curl, CURLOPT_SSL_VERIFYHOST, 0L); } + void set_speed_limit(); + // TODO(zc): support set header // void set_header(const std::string& key, const std::string& value) { // _cntl.http_request().SetHeader(key, value); @@ -141,6 +144,8 @@ class HttpClient { // helper function to download a file, you can call this function to download // a file to local_path Status download(const std::string& local_path); + Status download_multi_files(const std::string& local_dir, + const std::unordered_set& expected_files); Status execute_post_request(const std::string& payload, std::string* response); @@ -164,7 +169,8 @@ class HttpClient { Status _escape_url(const std::string& url, std::string* escaped_url); private: - const char* _to_errmsg(CURLcode code); + const char* _to_errmsg(CURLcode code) const; + const char* _get_url() const; private: CURL* _curl = nullptr; diff --git a/be/src/http/http_handler_with_auth.cpp b/be/src/http/http_handler_with_auth.cpp index 518b9868de191e..ae5c024e76d093 100644 --- a/be/src/http/http_handler_with_auth.cpp +++ b/be/src/http/http_handler_with_auth.cpp @@ -35,6 +35,7 @@ HttpHandlerWithAuth::HttpHandlerWithAuth(ExecEnv* exec_env, TPrivilegeHier::type : _exec_env(exec_env), _hier(hier), _type(type) {} int HttpHandlerWithAuth::on_header(HttpRequest* req) { + //if u return value isn't 0,u should `send_reply`,Avoid requesting links that never return. 
TCheckAuthRequest auth_request; TCheckAuthResult auth_result; AuthInfo auth_info; @@ -83,6 +84,11 @@ int HttpHandlerWithAuth::on_header(HttpRequest* req) { #ifndef BE_TEST TNetworkAddress master_addr = _exec_env->cluster_info()->master_fe_addr; + if (master_addr.hostname.empty() || master_addr.port == 0) { + LOG(WARNING) << "Not found master fe, Can't auth API request: " << req->debug_string(); + HttpChannel::send_error(req, HttpStatus::SERVICE_UNAVAILABLE); + return -1; + } { auto status = ThriftRpcHelper::rpc( master_addr.hostname, master_addr.port, @@ -90,6 +96,10 @@ int HttpHandlerWithAuth::on_header(HttpRequest* req) { client->checkAuth(auth_result, auth_request); }); if (!status) { + LOG(WARNING) << "CheckAuth Rpc Fail.Fe Ip:" << master_addr.hostname + << ", Fe port:" << master_addr.port << ".Status:" << status.to_string() + << ".Request: " << req->debug_string(); + HttpChannel::send_error(req, HttpStatus::SERVICE_UNAVAILABLE); return -1; } } @@ -98,6 +108,7 @@ int HttpHandlerWithAuth::on_header(HttpRequest* req) { auth_result.status.status_code = TStatusCode::type::OK; auth_result.status.error_msgs.clear(); } else { + HttpChannel::send_reply(req, HttpStatus::FORBIDDEN); return -1; } #endif diff --git a/be/src/http/utils.cpp b/be/src/http/utils.cpp index f91610476b4dc9..ee7a78113e555a 100644 --- a/be/src/http/utils.cpp +++ b/be/src/http/utils.cpp @@ -23,6 +23,8 @@ #include #include +#include +#include #include #include "common/config.h" @@ -30,6 +32,7 @@ #include "common/status.h" #include "common/utils.h" #include "http/http_channel.h" +#include "http/http_client.h" #include "http/http_common.h" #include "http/http_headers.h" #include "http/http_method.h" @@ -41,10 +44,15 @@ #include "runtime/exec_env.h" #include "util/md5.h" #include "util/path_util.h" +#include "util/security.h" #include "util/url_coding.h" namespace doris { +const uint32_t CHECK_SUPPORT_TIMEOUT = 3; +const uint32_t DOWNLOAD_FILE_MAX_RETRY = 3; +const uint32_t LIST_REMOTE_FILE_TIMEOUT = 
15; + std::string encode_basic_auth(const std::string& user, const std::string& passwd) { std::string auth = user + ":" + passwd; std::string encoded_auth; @@ -190,20 +198,26 @@ void do_file_response(const std::string& file_path, HttpRequest* req, HttpChannel::send_file(req, fd, 0, file_size, rate_limit_group); } -void do_dir_response(const std::string& dir_path, HttpRequest* req) { +void do_dir_response(const std::string& dir_path, HttpRequest* req, bool is_acquire_filesize) { bool exists = true; std::vector files; Status st = io::global_local_filesystem()->list(dir_path, true, &files, &exists); if (!st.ok()) { LOG(WARNING) << "Failed to scan dir. " << st; HttpChannel::send_error(req, HttpStatus::INTERNAL_SERVER_ERROR); + return; } + VLOG_DEBUG << "list dir: " << dir_path << ", file count: " << files.size(); + const std::string FILE_DELIMITER_IN_DIR_RESPONSE = "\n"; std::stringstream result; for (auto& file : files) { result << file.file_name << FILE_DELIMITER_IN_DIR_RESPONSE; + if (is_acquire_filesize) { + result << file.file_size << FILE_DELIMITER_IN_DIR_RESPONSE; + } } std::string result_str = result.str(); @@ -221,4 +235,118 @@ bool load_size_smaller_than_wal_limit(int64_t content_length) { return (content_length < 0.8 * max_available_size); } +Status is_support_batch_download(const std::string& endpoint) { + std::string url = fmt::format("http://{}/api/_tablet/_batch_download?check=true", endpoint); + auto check_support_cb = [&url](HttpClient* client) { + RETURN_IF_ERROR(client->init(url)); + client->set_timeout_ms(CHECK_SUPPORT_TIMEOUT * 1000); + client->set_method(HttpMethod::HEAD); + std::string response; + return client->execute(&response); + }; + return HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, check_support_cb); +} + +Status list_remote_files_v2(const std::string& address, const std::string& token, + const std::string& remote_dir, + std::vector>* file_info_list) { + std::string remote_url = + 
fmt::format("http://{}/api/_tablet/_batch_download?token={}&dir={}&list=true", address, + token, remote_dir); + + std::string file_list_str; + auto list_files_cb = [&](HttpClient* client) { + file_list_str.clear(); + RETURN_IF_ERROR(client->init(remote_url, false)); + client->set_method(HttpMethod::GET); + client->set_timeout_ms(LIST_REMOTE_FILE_TIMEOUT * 1000); + return client->execute(&file_list_str); + }; + Status status = HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, list_files_cb); + if (!status.ok()) { + LOG(WARNING) << "failed to list remote files from " << remote_url + << ", status: " << status.to_string() << ", response: " << file_list_str; + return status; + } + + std::vector file_list = strings::Split(file_list_str, "\n", strings::SkipWhitespace()); + if (file_list.size() % 2 != 0) { + return Status::InternalError("batch download files: invalid file list, size is not even"); + } + + VLOG_DEBUG << "list remote files from " << remote_url + << ", file count: " << file_list.size() / 2; + + for (size_t i = 0; i < file_list.size(); i += 2) { + uint64_t file_size = 0; + try { + file_size = std::stoull(file_list[i + 1]); + } catch (std::exception&) { + return Status::InternalError("batch download files: invalid file size format: " + + file_list[i + 1]); + } + file_info_list->emplace_back(std::move(file_list[i]), file_size); + } + + return Status::OK(); +} + +Status download_files_v2(const std::string& address, const std::string& token, + const std::string& remote_dir, const std::string& local_dir, + const std::vector>& file_info_list) { + std::string remote_url = fmt::format("http://{}/api/_tablet/_batch_download?dir={}&token={}", + address, remote_dir, token); + + size_t batch_file_size = 0; + std::unordered_set expected_files; + std::stringstream ss; + for (const auto& file_info : file_info_list) { + ss << file_info.first << "\n"; + batch_file_size += file_info.second; + expected_files.insert(file_info.first); + } + std::string payload = ss.str(); 
+ + uint64_t estimate_timeout = batch_file_size / config::download_low_speed_limit_kbps / 1024; + if (estimate_timeout < config::download_low_speed_time) { + estimate_timeout = config::download_low_speed_time; + } + + LOG(INFO) << "begin to download files from " << remote_url << " to " << local_dir + << ", file count: " << file_info_list.size() << ", total size: " << batch_file_size + << ", timeout: " << estimate_timeout; + + auto callback = [&](HttpClient* client) -> Status { + RETURN_IF_ERROR(client->init(remote_url, false)); + client->set_method(HttpMethod::POST); + client->set_payload(payload); + client->set_timeout_ms(estimate_timeout * 1000); + RETURN_IF_ERROR(client->download_multi_files(local_dir, expected_files)); + for (auto&& [file_name, file_size] : file_info_list) { + std::string local_file_path = local_dir + "/" + file_name; + + std::error_code ec; + // Check file length + uint64_t local_file_size = std::filesystem::file_size(local_file_path, ec); + if (ec) { + LOG(WARNING) << "download file error: " << ec.message(); + return Status::IOError("can't retrive file_size of {}, due to {}", local_file_path, + ec.message()); + } + if (local_file_size != file_size) { + LOG(WARNING) << "download file length error" + << ", remote_path=" << mask_token(remote_url) + << ", file_name=" << file_name << ", file_size=" << file_size + << ", local_file_size=" << local_file_size; + return Status::InternalError("downloaded file size is not equal"); + } + RETURN_IF_ERROR(io::global_local_filesystem()->permission( + local_file_path, io::LocalFileSystem::PERMS_OWNER_RW)); + } + + return Status::OK(); + }; + return HttpClient::execute_with_retry(DOWNLOAD_FILE_MAX_RETRY, 1, callback); +} + } // namespace doris diff --git a/be/src/http/utils.h b/be/src/http/utils.h index 20be6c0fcd7439..b9abb7c6208efb 100644 --- a/be/src/http/utils.h +++ b/be/src/http/utils.h @@ -40,9 +40,22 @@ void do_file_response(const std::string& dir_path, HttpRequest* req, bufferevent_rate_limit_group* 
rate_limit_group = nullptr, bool is_acquire_md5 = false); -void do_dir_response(const std::string& dir_path, HttpRequest* req); +void do_dir_response(const std::string& dir_path, HttpRequest* req, + bool is_acquire_filesize = false); std::string get_content_type(const std::string& file_name); bool load_size_smaller_than_wal_limit(int64_t content_length); + +// Whether a backend supports batch download +Status is_support_batch_download(const std::string& address); + +Status list_remote_files_v2(const std::string& address, const std::string& token, + const std::string& remote_dir, + std::vector>* file_info_list); + +Status download_files_v2(const std::string& address, const std::string& token, + const std::string& remote_dir, const std::string& local_dir, + const std::vector>& file_info_list); + } // namespace doris diff --git a/be/src/index-tools/index_tool.cpp b/be/src/index-tools/index_tool.cpp index adea2cd84c95f6..e45902c0f24df1 100644 --- a/be/src/index-tools/index_tool.cpp +++ b/be/src/index-tools/index_tool.cpp @@ -170,7 +170,7 @@ void search(lucene::store::Directory* dir, std::string& field, std::string& toke std::vector terms = split(token, '|'); doris::TQueryOptions queryOptions; - ConjunctionQuery conjunct_query(s, queryOptions); + ConjunctionQuery conjunct_query(s, queryOptions, nullptr); conjunct_query.add(field_ws, terms); conjunct_query.search(result); @@ -562,7 +562,7 @@ int main(int argc, char** argv) { auto dir = std::forward(st).value(); auto analyzer = _CLNEW lucene::analysis::standard95::StandardAnalyzer(); // auto analyzer = _CLNEW lucene::analysis::SimpleAnalyzer(); - auto indexwriter = _CLNEW lucene::index::IndexWriter(dir, analyzer, true, true); + auto indexwriter = _CLNEW lucene::index::IndexWriter(dir.get(), analyzer, true, true); indexwriter->setRAMBufferSizeMB(512); indexwriter->setMaxFieldLength(0x7FFFFFFFL); indexwriter->setMergeFactor(100000000); diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp 
index 4fb3f3e02cb58c..2a59a5158e46c2 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -86,42 +86,42 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _total_evict_size_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_total_evict_size"); - _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = + _evict_by_time_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::NORMAL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_disposable_to_normal"); - _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = + "file_cache_evict_by_time_disposable_to_normal"); + _evict_by_time_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::INDEX] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_disposable_to_index"); - _evict_by_heat_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = + "file_cache_evict_by_time_disposable_to_index"); + _evict_by_time_metrics_matrix[FileCacheType::DISPOSABLE][FileCacheType::TTL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_disposable_to_ttl"); - _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = + "file_cache_evict_by_time_disposable_to_ttl"); + _evict_by_time_metrics_matrix[FileCacheType::NORMAL][FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_normal_to_disposable"); - _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = + "file_cache_evict_by_time_normal_to_disposable"); + _evict_by_time_metrics_matrix[FileCacheType::NORMAL][FileCacheType::INDEX] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_normal_to_index"); - _evict_by_heat_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = + "file_cache_evict_by_time_normal_to_index"); + 
_evict_by_time_metrics_matrix[FileCacheType::NORMAL][FileCacheType::TTL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_normal_to_ttl"); - _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = + "file_cache_evict_by_time_normal_to_ttl"); + _evict_by_time_metrics_matrix[FileCacheType::INDEX][FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_index_to_disposable"); - _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = + "file_cache_evict_by_time_index_to_disposable"); + _evict_by_time_metrics_matrix[FileCacheType::INDEX][FileCacheType::NORMAL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_index_to_normal"); - _evict_by_heat_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = + "file_cache_evict_by_time_index_to_normal"); + _evict_by_time_metrics_matrix[FileCacheType::INDEX][FileCacheType::TTL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_index_to_ttl"); - _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = + "file_cache_evict_by_time_index_to_ttl"); + _evict_by_time_metrics_matrix[FileCacheType::TTL][FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_ttl_to_disposable"); - _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = + "file_cache_evict_by_time_ttl_to_disposable"); + _evict_by_time_metrics_matrix[FileCacheType::TTL][FileCacheType::NORMAL] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_ttl_to_normal"); - _evict_by_heat_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = + "file_cache_evict_by_time_ttl_to_normal"); + _evict_by_time_metrics_matrix[FileCacheType::TTL][FileCacheType::INDEX] = std::make_shared>(_cache_base_path.c_str(), - "file_cache_evict_by_heat_ttl_to_index"); + "file_cache_evict_by_time_ttl_to_index"); 
_evict_by_self_lru_metrics_matrix[FileCacheType::DISPOSABLE] = std::make_shared>(_cache_base_path.c_str(), @@ -197,8 +197,8 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, "file_cache_hit_ratio_5m", 0.0); _hit_ratio_1h = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_1h", 0.0); - _disk_limit_mode_metrics = - std::make_shared>(_cache_base_path.c_str(), "disk_limit_mode", 0); + _disk_limit_mode_metrics = std::make_shared>( + _cache_base_path.c_str(), "file_cache_disk_limit_mode", 0); _disposable_queue = LRUQueue(cache_settings.disposable_queue_size, cache_settings.disposable_queue_elements, 60 * 60); @@ -393,6 +393,15 @@ FileBlocks BlockFileCache::get_impl(const UInt128Wrapper& hash, const CacheConte auto& file_blocks = it->second; DCHECK(!file_blocks.empty()); + if (file_blocks.empty()) { + LOG(WARNING) << "file_blocks is empty for hash=" << hash.to_string() + << " cache type=" << context.cache_type + << " cache expiration time=" << context.expiration_time + << " cache range=" << range.left << " " << range.right + << " query id=" << context.query_id; + _files.erase(hash); + return {}; + } // change to ttl if the blocks aren't ttl if (context.cache_type == FileCacheType::TTL && _key_to_time.find(hash) == _key_to_time.end()) { for (auto& [_, cell] : file_blocks) { @@ -970,67 +979,6 @@ void BlockFileCache::find_evict_candidates(LRUQueue& queue, size_t size, size_t } } -bool BlockFileCache::try_reserve_for_ttl_without_lru(size_t size, - std::lock_guard& cache_lock) { - size_t removed_size = 0; - size_t cur_cache_size = _cur_cache_size; - auto limit = config::max_ttl_cache_ratio * _capacity; - - TEST_INJECTION_POINT_CALLBACK("BlockFileCache::change_limit1", &limit); - - if ((_cur_ttl_size + size) * 100 > limit) { - return false; - } - - size_t normal_queue_size = _normal_queue.get_capacity(cache_lock); - size_t disposable_queue_size = _disposable_queue.get_capacity(cache_lock); - size_t index_queue_size = 
_index_queue.get_capacity(cache_lock); - if (is_overflow(removed_size, size, cur_cache_size) && normal_queue_size == 0 && - disposable_queue_size == 0 && index_queue_size == 0) { - return false; - } - std::vector to_evict; - auto collect_eliminate_fragments = [&](LRUQueue& queue) { - size_t cur_removed_size = 0; - find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - cur_removed_size); - }; - if (disposable_queue_size != 0) { - collect_eliminate_fragments(get_queue(FileCacheType::DISPOSABLE)); - } - if (normal_queue_size != 0) { - collect_eliminate_fragments(get_queue(FileCacheType::NORMAL)); - } - if (index_queue_size != 0) { - collect_eliminate_fragments(get_queue(FileCacheType::INDEX)); - } - remove_file_blocks(to_evict, cache_lock); - if (is_overflow(removed_size, size, cur_cache_size)) { - return false; - } - return true; -} - -bool BlockFileCache::try_reserve_for_ttl(size_t size, std::lock_guard& cache_lock) { - if (try_reserve_for_ttl_without_lru(size, cache_lock)) { - return true; - } else if (config::enable_ttl_cache_evict_using_lru) { - auto& queue = get_queue(FileCacheType::TTL); - size_t removed_size = 0; - size_t cur_cache_size = _cur_cache_size; - - std::vector to_evict; - size_t cur_removed_size = 0; - find_evict_candidates(queue, size, cur_cache_size, removed_size, to_evict, cache_lock, - cur_removed_size); - remove_file_blocks_and_clean_time_maps(to_evict, cache_lock); - - return !is_overflow(removed_size, size, cur_cache_size); - } else { - return false; - } -} - // 1. if async load file cache not finish // a. evict from lru queue // 2. 
if ttl cache @@ -1145,15 +1093,16 @@ bool BlockFileCache::remove_if_ttl_file_unlock(const UInt128Wrapper& file_key, b _key_to_time.find(file_key) != _key_to_time.end()) { if (!remove_directly) { for (auto& [_, cell] : _files[file_key]) { - if (cell.file_block->cache_type() == FileCacheType::TTL) { - Status st = cell.file_block->update_expiration_time(0); - if (!st.ok()) { - LOG_WARNING("Failed to update expiration time to 0").error(st); - } + if (cell.file_block->cache_type() != FileCacheType::TTL) { + continue; + } + Status st = cell.file_block->update_expiration_time(0); + if (!st.ok()) { + LOG_WARNING("Failed to update expiration time to 0").error(st); } if (cell.file_block->cache_type() == FileCacheType::NORMAL) continue; - auto st = cell.file_block->change_cache_type_between_ttl_and_others( + st = cell.file_block->change_cache_type_between_ttl_and_others( FileCacheType::NORMAL); if (st.ok()) { if (cell.queue_iterator) { @@ -1283,7 +1232,7 @@ void BlockFileCache::reset_range(const UInt128Wrapper& hash, size_t offset, size _cur_cache_size += new_size; } -bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval( +bool BlockFileCache::try_reserve_from_other_queue_by_time_interval( FileCacheType cur_type, std::vector other_cache_types, size_t size, int64_t cur_time, std::lock_guard& cache_lock) { size_t removed_size = 0; @@ -1316,7 +1265,7 @@ bool BlockFileCache::try_reserve_from_other_queue_by_hot_interval( remove_size_per_type += cell_size; } } - *(_evict_by_heat_metrics_matrix[cache_type][cur_type]) << remove_size_per_type; + *(_evict_by_time_metrics_matrix[cache_type][cur_type]) << remove_size_per_type; } remove_file_blocks(to_evict, cache_lock); @@ -1365,7 +1314,7 @@ bool BlockFileCache::try_reserve_from_other_queue(FileCacheType cur_cache_type, std::lock_guard& cache_lock) { // currently, TTL cache is not considered as a candidate auto other_cache_types = get_other_cache_type_without_ttl(cur_cache_type); - bool reserve_success = 
try_reserve_from_other_queue_by_hot_interval( + bool reserve_success = try_reserve_from_other_queue_by_time_interval( cur_cache_type, other_cache_types, size, cur_time, cache_lock); if (reserve_success || !config::file_cache_enable_evict_from_other_queue_by_size) { return reserve_success; @@ -1730,13 +1679,16 @@ void BlockFileCache::check_disk_resource_limit() { LOG_ERROR("").tag("file cache path", _cache_base_path).tag("error", strerror(errno)); return; } - auto [capacity_percentage, inode_percentage] = percent; - auto inode_is_insufficient = [](const int& inode_percentage) { - return inode_percentage >= config::file_cache_enter_disk_resource_limit_mode_percent; + auto [space_percentage, inode_percentage] = percent; + auto is_insufficient = [](const int& percentage) { + return percentage >= config::file_cache_enter_disk_resource_limit_mode_percent; }; - DCHECK(capacity_percentage >= 0 && capacity_percentage <= 100); - DCHECK(inode_percentage >= 0 && inode_percentage <= 100); - // ATTN: due to that can be change, so if its invalid, set it to default value + DCHECK_GE(space_percentage, 0); + DCHECK_LE(space_percentage, 100); + DCHECK_GE(inode_percentage, 0); + DCHECK_LE(inode_percentage, 100); + // ATTN: due to that can be changed dynamically, set it to default value if it's invalid + // FIXME: reject with config validator if (config::file_cache_enter_disk_resource_limit_mode_percent <= config::file_cache_exit_disk_resource_limit_mode_percent) { LOG_WARNING("config error, set to default value") @@ -1745,23 +1697,21 @@ void BlockFileCache::check_disk_resource_limit() { config::file_cache_enter_disk_resource_limit_mode_percent = 90; config::file_cache_exit_disk_resource_limit_mode_percent = 80; } - if (capacity_percentage >= config::file_cache_enter_disk_resource_limit_mode_percent || - inode_is_insufficient(inode_percentage)) { + if (is_insufficient(space_percentage) || is_insufficient(inode_percentage)) { _disk_resource_limit_mode = true; 
_disk_limit_mode_metrics->set_value(1); } else if (_disk_resource_limit_mode && - (capacity_percentage < config::file_cache_exit_disk_resource_limit_mode_percent) && + (space_percentage < config::file_cache_exit_disk_resource_limit_mode_percent) && (inode_percentage < config::file_cache_exit_disk_resource_limit_mode_percent)) { _disk_resource_limit_mode = false; _disk_limit_mode_metrics->set_value(0); } if (_disk_resource_limit_mode) { - // log per mins - LOG_EVERY_N(WARNING, 3) << "file cache background thread space percent=" - << capacity_percentage << " inode percent=" << inode_percentage - << " is inode insufficient=" - << inode_is_insufficient(inode_percentage) - << " mode run in resource limit"; + LOG(WARNING) << "file_cache=" << get_base_path() << " space_percent=" << space_percentage + << " inode_percent=" << inode_percentage + << " is_space_insufficient=" << is_insufficient(space_percentage) + << " is_inode_insufficient=" << is_insufficient(inode_percentage) + << " mode run in resource limit"; } } @@ -1777,50 +1727,56 @@ void BlockFileCache::run_background_operation() { break; } } + // report + { + SCOPED_CACHE_LOCK(_mutex); + _cur_cache_size_metrics->set_value(_cur_cache_size); + _cur_ttl_cache_size_metrics->set_value(_cur_cache_size - + _index_queue.get_capacity(cache_lock) - + _normal_queue.get_capacity(cache_lock) - + _disposable_queue.get_capacity(cache_lock)); + _cur_ttl_cache_lru_queue_cache_size_metrics->set_value( + _ttl_queue.get_capacity(cache_lock)); + _cur_ttl_cache_lru_queue_element_count_metrics->set_value( + _ttl_queue.get_elements_num(cache_lock)); + _cur_normal_queue_cache_size_metrics->set_value(_normal_queue.get_capacity(cache_lock)); + _cur_normal_queue_element_count_metrics->set_value( + _normal_queue.get_elements_num(cache_lock)); + _cur_index_queue_cache_size_metrics->set_value(_index_queue.get_capacity(cache_lock)); + _cur_index_queue_element_count_metrics->set_value( + _index_queue.get_elements_num(cache_lock)); + 
_cur_disposable_queue_cache_size_metrics->set_value( + _disposable_queue.get_capacity(cache_lock)); + _cur_disposable_queue_element_count_metrics->set_value( + _disposable_queue.get_elements_num(cache_lock)); + + if (_num_read_blocks->get_value() > 0) { + _hit_ratio->set_value((double)_num_hit_blocks->get_value() / + _num_read_blocks->get_value()); + } + if (_num_read_blocks_5m->get_value() > 0) { + _hit_ratio_5m->set_value((double)_num_hit_blocks_5m->get_value() / + _num_read_blocks_5m->get_value()); + } + if (_num_read_blocks_1h->get_value() > 0) { + _hit_ratio_1h->set_value((double)_num_hit_blocks_1h->get_value() / + _num_read_blocks_1h->get_value()); + } + } + recycle_stale_rowset_async_bottom_half(); recycle_deleted_blocks(); // gc - int64_t cur_time = UnixSeconds(); - SCOPED_CACHE_LOCK(_mutex); - while (!_time_to_key.empty()) { - auto begin = _time_to_key.begin(); - if (cur_time < begin->first) { - break; + { + int64_t cur_time = UnixSeconds(); + SCOPED_CACHE_LOCK(_mutex); + while (!_time_to_key.empty()) { + auto begin = _time_to_key.begin(); + if (cur_time < begin->first) { + break; + } + remove_if_ttl_file_unlock(begin->second, false, cache_lock); } - remove_if_ttl_file_unlock(begin->second, false, cache_lock); - } - - // report - _cur_cache_size_metrics->set_value(_cur_cache_size); - _cur_ttl_cache_size_metrics->set_value(_cur_cache_size - - _index_queue.get_capacity(cache_lock) - - _normal_queue.get_capacity(cache_lock) - - _disposable_queue.get_capacity(cache_lock)); - _cur_ttl_cache_lru_queue_cache_size_metrics->set_value(_ttl_queue.get_capacity(cache_lock)); - _cur_ttl_cache_lru_queue_element_count_metrics->set_value( - _ttl_queue.get_elements_num(cache_lock)); - _cur_normal_queue_cache_size_metrics->set_value(_normal_queue.get_capacity(cache_lock)); - _cur_normal_queue_element_count_metrics->set_value( - _normal_queue.get_elements_num(cache_lock)); - _cur_index_queue_cache_size_metrics->set_value(_index_queue.get_capacity(cache_lock)); - 
_cur_index_queue_element_count_metrics->set_value( - _index_queue.get_elements_num(cache_lock)); - _cur_disposable_queue_cache_size_metrics->set_value( - _disposable_queue.get_capacity(cache_lock)); - _cur_disposable_queue_element_count_metrics->set_value( - _disposable_queue.get_elements_num(cache_lock)); - - if (_num_read_blocks->get_value() > 0) { - _hit_ratio->set_value((double)_num_hit_blocks->get_value() / - _num_read_blocks->get_value()); - } - if (_num_read_blocks_5m->get_value() > 0) { - _hit_ratio_5m->set_value((double)_num_hit_blocks_5m->get_value() / - _num_read_blocks_5m->get_value()); - } - if (_num_read_blocks_1h->get_value() > 0) { - _hit_ratio_1h->set_value((double)_num_hit_blocks_1h->get_value() / - _num_read_blocks_1h->get_value()); } } } diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index 0de33dadc8249d..f23d5a3799e0cf 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -397,10 +397,6 @@ class BlockFileCache { size_t get_available_cache_size(FileCacheType cache_type) const; - bool try_reserve_for_ttl(size_t size, std::lock_guard& cache_lock); - - bool try_reserve_for_ttl_without_lru(size_t size, std::lock_guard& cache_lock); - FileBlocks split_range_into_cells(const UInt128Wrapper& hash, const CacheContext& context, size_t offset, size_t size, FileBlock::State state, std::lock_guard& cache_lock); @@ -436,10 +432,10 @@ class BlockFileCache { void recycle_deleted_blocks(); - bool try_reserve_from_other_queue_by_hot_interval(FileCacheType cur_type, - std::vector other_cache_types, - size_t size, int64_t cur_time, - std::lock_guard& cache_lock); + bool try_reserve_from_other_queue_by_time_interval(FileCacheType cur_type, + std::vector other_cache_types, + size_t size, int64_t cur_time, + std::lock_guard& cache_lock); bool try_reserve_from_other_queue_by_size(FileCacheType cur_type, std::vector other_cache_types, @@ -515,7 +511,7 @@ class BlockFileCache { std::shared_ptr> 
_cur_disposable_queue_cache_size_metrics; std::array>, 4> _queue_evict_size_metrics; std::shared_ptr> _total_evict_size_metrics; - std::shared_ptr> _evict_by_heat_metrics_matrix[4][4]; + std::shared_ptr> _evict_by_time_metrics_matrix[4][4]; std::shared_ptr> _evict_by_size_metrics_matrix[4][4]; std::shared_ptr> _evict_by_self_lru_metrics_matrix[4]; std::shared_ptr> _evict_by_try_release; diff --git a/be/src/io/cache/block_file_cache_profile.h b/be/src/io/cache/block_file_cache_profile.h index 19d7f4139f7f15..54118d5094cd52 100644 --- a/be/src/io/cache/block_file_cache_profile.h +++ b/be/src/io/cache/block_file_cache_profile.h @@ -75,6 +75,7 @@ struct FileCacheProfile { struct FileCacheProfileReporter { RuntimeProfile::Counter* num_local_io_total = nullptr; RuntimeProfile::Counter* num_remote_io_total = nullptr; + RuntimeProfile::Counter* num_inverted_index_remote_io_total = nullptr; RuntimeProfile::Counter* local_io_timer = nullptr; RuntimeProfile::Counter* bytes_scanned_from_cache = nullptr; RuntimeProfile::Counter* bytes_scanned_from_remote = nullptr; @@ -90,6 +91,8 @@ struct FileCacheProfileReporter { cache_profile, 1); num_remote_io_total = ADD_CHILD_COUNTER_WITH_LEVEL(profile, "NumRemoteIOTotal", TUnit::UNIT, cache_profile, 1); + num_inverted_index_remote_io_total = ADD_CHILD_COUNTER_WITH_LEVEL( + profile, "NumInvertedIndexRemoteIOTotal", TUnit::UNIT, cache_profile, 1); local_io_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "LocalIOUseTimer", cache_profile, 1); remote_io_timer = ADD_CHILD_TIMER_WITH_LEVEL(profile, "RemoteIOUseTimer", cache_profile, 1); write_cache_io_timer = @@ -107,6 +110,8 @@ struct FileCacheProfileReporter { void update(const FileCacheStatistics* statistics) const { COUNTER_UPDATE(num_local_io_total, statistics->num_local_io_total); COUNTER_UPDATE(num_remote_io_total, statistics->num_remote_io_total); + COUNTER_UPDATE(num_inverted_index_remote_io_total, + statistics->num_inverted_index_remote_io_total); COUNTER_UPDATE(local_io_timer, 
statistics->local_io_timer); COUNTER_UPDATE(remote_io_timer, statistics->remote_io_timer); COUNTER_UPDATE(write_cache_io_timer, statistics->write_cache_io_timer); diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index c9a273c5d368a6..f16e0019b6dcc5 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -126,7 +126,7 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* ReadStatistics stats; auto defer_func = [&](int*) { if (io_ctx->file_cache_stats) { - _update_state(stats, io_ctx->file_cache_stats); + _update_state(stats, io_ctx->file_cache_stats, io_ctx->is_inverted_index); io::FileCacheProfile::instance().update(io_ctx->file_cache_stats); } }; @@ -312,7 +312,8 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* } void CachedRemoteFileReader::_update_state(const ReadStatistics& read_stats, - FileCacheStatistics* statis) const { + FileCacheStatistics* statis, + bool is_inverted_index) const { if (statis == nullptr) { return; } @@ -320,6 +321,9 @@ void CachedRemoteFileReader::_update_state(const ReadStatistics& read_stats, statis->num_local_io_total++; statis->bytes_read_from_local += read_stats.bytes_read; } else { + if (is_inverted_index) { + statis->num_inverted_index_remote_io_total++; + } statis->num_remote_io_total++; statis->bytes_read_from_remote += read_stats.bytes_read; } diff --git a/be/src/io/cache/cached_remote_file_reader.h b/be/src/io/cache/cached_remote_file_reader.h index b3efb83c0803c8..685414cfa3aba6 100644 --- a/be/src/io/cache/cached_remote_file_reader.h +++ b/be/src/io/cache/cached_remote_file_reader.h @@ -76,7 +76,8 @@ class CachedRemoteFileReader final : public FileReader { int64_t local_read_timer = 0; int64_t local_write_timer = 0; }; - void _update_state(const ReadStatistics& stats, FileCacheStatistics* state) const; + void _update_state(const ReadStatistics& 
stats, FileCacheStatistics* state, + bool is_inverted_index) const; }; } // namespace doris::io diff --git a/be/src/io/cache/file_cache_common.cpp b/be/src/io/cache/file_cache_common.cpp index 674879300452df..19041938a08346 100644 --- a/be/src/io/cache/file_cache_common.cpp +++ b/be/src/io/cache/file_cache_common.cpp @@ -42,7 +42,8 @@ std::string FileCacheSettings::to_string() const { FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cache_size, size_t normal_percent, size_t disposable_percent, - size_t index_percent, const std::string& storage) { + size_t index_percent, size_t ttl_percent, + const std::string& storage) { io::FileCacheSettings settings; if (capacity == 0) return settings; settings.capacity = capacity; @@ -59,12 +60,12 @@ FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cach std::max(settings.index_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); - settings.ttl_queue_size = per_size * config::max_ttl_cache_ratio; + settings.ttl_queue_size = per_size * ttl_percent; settings.ttl_queue_elements = std::max(settings.ttl_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); - settings.query_queue_size = - settings.capacity - settings.disposable_queue_size - settings.index_queue_size; + settings.query_queue_size = settings.capacity - settings.disposable_queue_size - + settings.index_queue_size - settings.ttl_queue_size; settings.query_queue_elements = std::max(settings.query_queue_size / settings.max_file_block_size, REMOTE_FS_OBJECTS_CACHE_DEFAULT_ELEMENTS); diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index 30579ba7851b28..0d700d9303191f 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -29,6 +29,7 @@ inline static constexpr size_t FILE_CACHE_MAX_FILE_BLOCK_SIZE = 1 * 1024 * 1024; inline static constexpr size_t DEFAULT_NORMAL_PERCENT = 40; inline 
static constexpr size_t DEFAULT_DISPOSABLE_PERCENT = 5; inline static constexpr size_t DEFAULT_INDEX_PERCENT = 5; +inline static constexpr size_t DEFAULT_TTL_PERCENT = 50; using uint128_t = vectorized::UInt128; @@ -107,6 +108,7 @@ FileCacheSettings get_file_cache_settings(size_t capacity, size_t max_query_cach size_t normal_percent = DEFAULT_NORMAL_PERCENT, size_t disposable_percent = DEFAULT_DISPOSABLE_PERCENT, size_t index_percent = DEFAULT_INDEX_PERCENT, + size_t ttl_percent = DEFAULT_TTL_PERCENT, const std::string& storage = "disk"); struct CacheContext { diff --git a/be/src/io/fs/err_utils.cpp b/be/src/io/fs/err_utils.cpp index 6552d454824796..e9bed7f5887dc3 100644 --- a/be/src/io/fs/err_utils.cpp +++ b/be/src/io/fs/err_utils.cpp @@ -122,13 +122,13 @@ Status s3fs_error(const Aws::S3::S3Error& err, std::string_view msg) { using namespace Aws::Http; switch (err.GetResponseCode()) { case HttpResponseCode::NOT_FOUND: - return Status::Error("{}: {} {} type={}, request_id={}", msg, - err.GetExceptionName(), err.GetMessage(), + return Status::Error("{}: {} {} code=NOT_FOUND, type={}, request_id={}", + msg, err.GetExceptionName(), err.GetMessage(), err.GetErrorType(), err.GetRequestId()); case HttpResponseCode::FORBIDDEN: - return Status::Error("{}: {} {} type={}, request_id={}", msg, - err.GetExceptionName(), err.GetMessage(), - err.GetErrorType(), err.GetRequestId()); + return Status::Error( + "{}: {} {} code=FORBIDDEN, type={}, request_id={}", msg, err.GetExceptionName(), + err.GetMessage(), err.GetErrorType(), err.GetRequestId()); default: return Status::Error( "{}: {} {} code={} type={}, request_id={}", msg, err.GetExceptionName(), diff --git a/be/src/io/fs/file_system.cpp b/be/src/io/fs/file_system.cpp index 3579a5323d9217..e6b5ef7df1a8f5 100644 --- a/be/src/io/fs/file_system.cpp +++ b/be/src/io/fs/file_system.cpp @@ -25,58 +25,70 @@ namespace io { Status FileSystem::create_file(const Path& file, FileWriterPtr* writer, const FileWriterOptions* opts) { - auto 
path = absolute_path(file); + Path path; + RETURN_IF_ERROR(absolute_path(file, path)); FILESYSTEM_M(create_file_impl(path, writer, opts)); } Status FileSystem::open_file(const Path& file, FileReaderSPtr* reader, const FileReaderOptions* opts) { - auto path = absolute_path(file); + Path path; + RETURN_IF_ERROR(absolute_path(file, path)); FILESYSTEM_M(open_file_impl(path, reader, opts)); } Status FileSystem::create_directory(const Path& dir, bool failed_if_exists) { - auto path = absolute_path(dir); + Path path; + RETURN_IF_ERROR(absolute_path(dir, path)); FILESYSTEM_M(create_directory_impl(path, failed_if_exists)); } Status FileSystem::delete_file(const Path& file) { - auto path = absolute_path(file); + Path path; + RETURN_IF_ERROR(absolute_path(file, path)); FILESYSTEM_M(delete_file_impl(path)); } Status FileSystem::delete_directory(const Path& dir) { - auto path = absolute_path(dir); + Path path; + RETURN_IF_ERROR(absolute_path(dir, path)); FILESYSTEM_M(delete_directory_impl(path)); } Status FileSystem::batch_delete(const std::vector& files) { std::vector abs_files; for (auto& file : files) { - abs_files.push_back(absolute_path(file)); + Path abs_file; + RETURN_IF_ERROR(absolute_path(file, abs_file)); + abs_files.push_back(abs_file); } FILESYSTEM_M(batch_delete_impl(abs_files)); } Status FileSystem::exists(const Path& path, bool* res) const { - auto fs_path = absolute_path(path); + Path fs_path; + RETURN_IF_ERROR(absolute_path(path, fs_path)); FILESYSTEM_M(exists_impl(fs_path, res)); } Status FileSystem::file_size(const Path& file, int64_t* file_size) const { - auto path = absolute_path(file); + Path path; + RETURN_IF_ERROR(absolute_path(file, path)); FILESYSTEM_M(file_size_impl(path, file_size)); } Status FileSystem::list(const Path& dir, bool only_file, std::vector* files, bool* exists) { - auto path = absolute_path(dir); + Path path; + RETURN_IF_ERROR(absolute_path(dir, path)); FILESYSTEM_M(list_impl(path, only_file, files, exists)); } Status 
FileSystem::rename(const Path& orig_name, const Path& new_name) { - auto orig_path = absolute_path(orig_name); - auto new_path = absolute_path(new_name); + Path orig_path; + RETURN_IF_ERROR(absolute_path(orig_name, orig_path)); + Path new_path; + RETURN_IF_ERROR(absolute_path(new_name, new_path)); FILESYSTEM_M(rename_impl(orig_path, new_path)); } diff --git a/be/src/io/fs/file_system.h b/be/src/io/fs/file_system.h index a8ccc8756bb60a..6baf07917d35cb 100644 --- a/be/src/io/fs/file_system.h +++ b/be/src/io/fs/file_system.h @@ -163,7 +163,7 @@ class FileSystem { // FIMXE(plat1ko): The implementation and semantics of this function are not completely // consistent, which is confused. - virtual Path absolute_path(const Path& path) const = 0; + virtual Status absolute_path(const Path& path, Path& abs_path) const = 0; FileSystem(std::string id, FileSystemType type) : _id(std::move(id)), _type(type) {} diff --git a/be/src/io/fs/local_file_system.cpp b/be/src/io/fs/local_file_system.cpp index 0107ed57dc8fb1..9270d919a37519 100644 --- a/be/src/io/fs/local_file_system.cpp +++ b/be/src/io/fs/local_file_system.cpp @@ -471,4 +471,54 @@ Status LocalFileSystem::permission_impl(const Path& file, std::filesystem::perms return Status::OK(); } +Status LocalFileSystem::convert_to_abs_path(const Path& input_path_str, Path& abs_path) { + // valid path include: + // 1. abc/def will return abc/def + // 2. /abc/def will return /abc/def + // 3. file:/abc/def will return /abc/def + // 4. 
file:///abc/def will return /abc/def + std::string path_str = input_path_str; + size_t slash = path_str.find('/'); + if (slash == 0) { + abs_path = input_path_str; + return Status::OK(); + } + + // Initialize scheme and authority + std::string scheme; + size_t start = 0; + + // Parse URI scheme + size_t colon = path_str.find(':'); + if (colon != std::string::npos && (slash == std::string::npos || colon < slash)) { + // Has a scheme + scheme = path_str.substr(0, colon); + if (scheme != "file") { + return Status::InternalError( + "Only supports `file` type scheme, like 'file:///path', 'file:/path'."); + } + start = colon + 1; + } + + // Parse URI authority, if any + if (path_str.compare(start, 2, "//") == 0 && path_str.length() - start > 2) { + // Has authority + // such as : path_str = "file://authority/abc/def" + // and now : start = 5 + size_t next_slash = path_str.find('/', start + 2); + // now : next_slash = 16 + if (next_slash == std::string::npos) { + return Status::InternalError( + "This input string only has authority, but has no path information"); + } + // We will skit authority + // now : start = 16 + start = next_slash; + } + + // URI path is the rest of the string + abs_path = path_str.substr(start); + return Status::OK(); +} + } // namespace doris::io diff --git a/be/src/io/fs/local_file_system.h b/be/src/io/fs/local_file_system.h index c6295b0bae1d6c..4540df47c16d81 100644 --- a/be/src/io/fs/local_file_system.h +++ b/be/src/io/fs/local_file_system.h @@ -34,6 +34,8 @@ class LocalFileSystem final : public FileSystem { public: ~LocalFileSystem() override; + static Status convert_to_abs_path(const Path& path, Path& abs_path); + /// hard link dest file to src file Status link_file(const Path& src, const Path& dest); @@ -104,7 +106,9 @@ class LocalFileSystem final : public FileSystem { // `LocalFileSystem` always use absolute path as arguments // FIXME(plat1ko): Eliminate this method - Path absolute_path(const Path& path) const override { return path; } + 
Status absolute_path(const Path& path, Path& abs_path) const override { + return convert_to_abs_path(path, abs_path); + } friend const std::shared_ptr& global_local_filesystem(); }; diff --git a/be/src/io/fs/remote_file_system.cpp b/be/src/io/fs/remote_file_system.cpp index 2b6af2af046afc..fd793f60cdc5d8 100644 --- a/be/src/io/fs/remote_file_system.cpp +++ b/be/src/io/fs/remote_file_system.cpp @@ -29,7 +29,8 @@ namespace doris::io { Status RemoteFileSystem::upload(const Path& local_file, const Path& dest_file) { - auto dest_path = absolute_path(dest_file); + Path dest_path; + RETURN_IF_ERROR(absolute_path(dest_file, dest_path)); FILESYSTEM_M(upload_impl(local_file, dest_path)); } @@ -37,13 +38,16 @@ Status RemoteFileSystem::batch_upload(const std::vector& local_files, const std::vector& remote_files) { std::vector remote_paths; for (auto& path : remote_files) { - remote_paths.push_back(absolute_path(path)); + Path abs_path; + RETURN_IF_ERROR(absolute_path(path, abs_path)); + remote_paths.push_back(abs_path); } FILESYSTEM_M(batch_upload_impl(local_files, remote_paths)); } Status RemoteFileSystem::download(const Path& remote_file, const Path& local) { - auto remote_path = absolute_path(remote_file); + Path remote_path; + RETURN_IF_ERROR(absolute_path(remote_file, remote_path)); FILESYSTEM_M(download_impl(remote_path, local)); } diff --git a/be/src/io/fs/remote_file_system.h b/be/src/io/fs/remote_file_system.h index e9472140ab7b08..de0a1b71519a92 100644 --- a/be/src/io/fs/remote_file_system.h +++ b/be/src/io/fs/remote_file_system.h @@ -64,11 +64,13 @@ class RemoteFileSystem : public FileSystem { virtual Status open_file_internal(const Path& file, FileReaderSPtr* reader, const FileReaderOptions& opts) = 0; - Path absolute_path(const Path& path) const override { + Status absolute_path(const Path& path, Path& abs_path) const override { if (path.is_absolute()) { - return path; + abs_path = path; + } else { + abs_path = _root_path / path; } - return _root_path / path; + 
return Status::OK(); } Path _root_path; diff --git a/be/src/io/fs/s3_file_system.h b/be/src/io/fs/s3_file_system.h index d1e8b5b6e31a61..f6efa5053324ff 100644 --- a/be/src/io/fs/s3_file_system.h +++ b/be/src/io/fs/s3_file_system.h @@ -113,16 +113,17 @@ class S3FileSystem final : public RemoteFileSystem { const std::vector& remote_files) override; Status download_impl(const Path& remote_file, const Path& local_file) override; - Path absolute_path(const Path& path) const override { + Status absolute_path(const Path& path, Path& abs_path) const override { if (path.string().find("://") != std::string::npos) { // the path is with schema, which means this is a full path like: // s3://bucket/path/to/file.txt // so no need to concat with prefix - return path; + abs_path = path; } else { // path with no schema - return _root_path / path; + abs_path = _prefix / path; } + return Status::OK(); } private: diff --git a/be/src/io/fs/s3_file_writer.cpp b/be/src/io/fs/s3_file_writer.cpp index e40b9e171eb08f..7a06ce22074621 100644 --- a/be/src/io/fs/s3_file_writer.cpp +++ b/be/src/io/fs/s3_file_writer.cpp @@ -379,7 +379,14 @@ Status S3FileWriter::_set_upload_to_remote_less_than_buffer_size() { } void S3FileWriter::_put_object(UploadFileBuffer& buf) { - DCHECK(state() != State::CLOSED) << fmt::format("state is {}", state()); + if (state() == State::CLOSED) { + DCHECK(state() != State::CLOSED) + << "state=" << (int)state() << " path=" << _obj_storage_path_opts.path.native(); + LOG_WARNING("failed to put object because file closed, file path {}", + _obj_storage_path_opts.path.native()); + buf.set_status(Status::InternalError("try to put closed file")); + return; + } const auto& client = _obj_client->get(); if (nullptr == client) { buf.set_status(Status::InternalError("invalid obj storage client")); diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h index 80a594473dc376..4acc0538b7ef4f 100644 --- a/be/src/io/io_common.h +++ b/be/src/io/io_common.h @@ -38,6 +38,7 @@ namespace io 
{ struct FileCacheStatistics { int64_t num_local_io_total = 0; int64_t num_remote_io_total = 0; + int64_t num_inverted_index_remote_io_total = 0; int64_t local_io_timer = 0; int64_t bytes_read_from_local = 0; int64_t bytes_read_from_remote = 0; @@ -60,6 +61,7 @@ struct IOContext { int64_t expiration_time = 0; const TUniqueId* query_id = nullptr; // Ref FileCacheStatistics* file_cache_stats = nullptr; // Ref + bool is_inverted_index = false; }; } // namespace io diff --git a/be/src/olap/base_compaction.cpp b/be/src/olap/base_compaction.cpp index 8b9cbd75ed33b8..9e428f27b0b576 100644 --- a/be/src/olap/base_compaction.cpp +++ b/be/src/olap/base_compaction.cpp @@ -23,6 +23,7 @@ #include #include +#include "common/cast_set.h" #include "common/config.h" #include "common/logging.h" #include "olap/compaction.h" @@ -35,6 +36,8 @@ #include "util/trace.h" namespace doris { +#include "common/compile_check_begin.h" + using namespace ErrorCode; BaseCompaction::BaseCompaction(StorageEngine& engine, const TabletSharedPtr& tablet) @@ -184,7 +187,8 @@ Status BaseCompaction::pick_rowsets_to_compact() { // set to 1 to void divide by zero base_size = 1; } - double cumulative_base_ratio = static_cast(cumulative_total_size) / base_size; + double cumulative_base_ratio = + cast_set(cumulative_total_size) / cast_set(base_size); if (cumulative_base_ratio > min_data_ratio) { VLOG_NOTICE << "satisfy the base compaction policy. 
tablet=" << _tablet->tablet_id() diff --git a/be/src/olap/base_tablet.cpp b/be/src/olap/base_tablet.cpp index e5ec38738155e5..82dc122e19f5ef 100644 --- a/be/src/olap/base_tablet.cpp +++ b/be/src/olap/base_tablet.cpp @@ -20,6 +20,10 @@ #include #include +#include +#include + +#include "common/cast_set.h" #include "common/logging.h" #include "common/status.h" #include "olap/calc_delete_bitmap_executor.h" @@ -45,6 +49,8 @@ #include "vec/jsonb/serialize.h" namespace doris { +#include "common/compile_check_begin.h" + using namespace ErrorCode; namespace { @@ -370,7 +376,7 @@ Status BaseTablet::calc_delete_bitmap_between_segments( seq_col_length = _tablet_meta->tablet_schema()->column(seq_col_idx).length() + 1; } size_t rowid_length = 0; - if (!_tablet_meta->tablet_schema()->cluster_key_idxes().empty()) { + if (!_tablet_meta->tablet_schema()->cluster_key_uids().empty()) { rowid_length = PrimaryKeyIndexReader::ROW_ID_LENGTH; } @@ -432,7 +438,6 @@ Status BaseTablet::lookup_row_data(const Slice& encoded_key, const RowLocation& StringRef value = string_column->get_data_at(0); values = value.to_string(); if (write_to_cache) { - StringRef value = string_column->get_data_at(0); RowCache::instance()->insert({tablet_id(), encoded_key}, Slice {value.data, value.size}); } return Status::OK(); @@ -444,7 +449,8 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest RowLocation* row_location, uint32_t version, std::vector>& segment_caches, RowsetSharedPtr* rowset, bool with_rowid, - std::string* encoded_seq_value, OlapReaderStatistics* stats) { + std::string* encoded_seq_value, OlapReaderStatistics* stats, + DeleteBitmapPtr delete_bitmap) { SCOPED_BVAR_LATENCY(g_tablet_lookup_rowkey_latency); size_t seq_col_length = 0; // use the latest tablet schema to decide if the tablet has sequence column currently @@ -454,25 +460,27 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest seq_col_length = 
schema->column(schema->sequence_col_idx()).length() + 1; } size_t rowid_length = 0; - if (with_rowid && !schema->cluster_key_idxes().empty()) { + if (with_rowid && !schema->cluster_key_uids().empty()) { rowid_length = PrimaryKeyIndexReader::ROW_ID_LENGTH; } Slice key_without_seq = Slice(encoded_key.get_data(), encoded_key.get_size() - seq_col_length - rowid_length); RowLocation loc; + auto tablet_delete_bitmap = + delete_bitmap == nullptr ? _tablet_meta->delete_bitmap_ptr() : delete_bitmap; for (size_t i = 0; i < specified_rowsets.size(); i++) { - auto& rs = specified_rowsets[i]; - auto& segments_key_bounds = rs->rowset_meta()->get_segments_key_bounds(); - int num_segments = rs->num_segments(); + const auto& rs = specified_rowsets[i]; + const auto& segments_key_bounds = rs->rowset_meta()->get_segments_key_bounds(); + int num_segments = cast_set(rs->num_segments()); DCHECK_EQ(segments_key_bounds.size(), num_segments); std::vector picked_segments; - for (int i = num_segments - 1; i >= 0; i--) { - if (key_without_seq.compare(segments_key_bounds[i].max_key()) > 0 || - key_without_seq.compare(segments_key_bounds[i].min_key()) < 0) { + for (int j = num_segments - 1; j >= 0; j--) { + if (key_without_seq.compare(segments_key_bounds[j].max_key()) > 0 || + key_without_seq.compare(segments_key_bounds[j].min_key()) < 0) { continue; } - picked_segments.emplace_back(i); + picked_segments.emplace_back(j); } if (picked_segments.empty()) { continue; @@ -495,7 +503,7 @@ Status BaseTablet::lookup_row_key(const Slice& encoded_key, TabletSchema* latest if (!s.ok() && !s.is()) { return s; } - if (s.ok() && _tablet_meta->delete_bitmap().contains_agg_without_cache( + if (s.ok() && tablet_delete_bitmap->contains_agg_without_cache( {loc.rowset_id, loc.segment_id, version}, loc.row_id)) { // if has sequence col, we continue to compare the sequence_id of // all rowsets, util we find an existing key. 
@@ -529,7 +537,8 @@ Status BaseTablet::calc_delete_bitmap(const BaseTabletSPtr& tablet, RowsetShared const std::vector& segments, const std::vector& specified_rowsets, DeleteBitmapPtr delete_bitmap, int64_t end_version, - CalcDeleteBitmapToken* token, RowsetWriter* rowset_writer) { + CalcDeleteBitmapToken* token, RowsetWriter* rowset_writer, + DeleteBitmapPtr tablet_delete_bitmap) { auto rowset_id = rowset->rowset_id(); if (specified_rowsets.empty() || segments.empty()) { LOG(INFO) << "skip to construct delete bitmap tablet: " << tablet->tablet_id() @@ -542,10 +551,11 @@ Status BaseTablet::calc_delete_bitmap(const BaseTabletSPtr& tablet, RowsetShared const auto& seg = segment; if (token != nullptr) { RETURN_IF_ERROR(token->submit(tablet, rowset, seg, specified_rowsets, end_version, - delete_bitmap, rowset_writer)); + delete_bitmap, rowset_writer, tablet_delete_bitmap)); } else { RETURN_IF_ERROR(tablet->calc_segment_delete_bitmap( - rowset, segment, specified_rowsets, delete_bitmap, end_version, rowset_writer)); + rowset, segment, specified_rowsets, delete_bitmap, end_version, rowset_writer, + tablet_delete_bitmap)); } } @@ -556,7 +566,8 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, const segment_v2::SegmentSharedPtr& seg, const std::vector& specified_rowsets, DeleteBitmapPtr delete_bitmap, int64_t end_version, - RowsetWriter* rowset_writer) { + RowsetWriter* rowset_writer, + DeleteBitmapPtr tablet_delete_bitmap) { OlapStopWatch watch; auto rowset_id = rowset->rowset_id(); Version dummy_version(end_version + 1, end_version + 1); @@ -642,7 +653,7 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, Slice key = Slice(index_column->get_data_at(i).data, index_column->get_data_at(i).size); RowLocation loc; // calculate row id - if (!_tablet_meta->tablet_schema()->cluster_key_idxes().empty()) { + if (!_tablet_meta->tablet_schema()->cluster_key_uids().empty()) { size_t seq_col_length = 0; if 
(_tablet_meta->tablet_schema()->has_sequence_col()) { seq_col_length = @@ -670,8 +681,16 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, } RowsetSharedPtr rowset_find; - auto st = lookup_row_key(key, rowset_schema.get(), true, specified_rowsets, &loc, - dummy_version.first - 1, segment_caches, &rowset_find); + Status st = Status::OK(); + if (tablet_delete_bitmap == nullptr) { + st = lookup_row_key(key, rowset_schema.get(), true, specified_rowsets, &loc, + cast_set(dummy_version.first - 1), segment_caches, + &rowset_find); + } else { + st = lookup_row_key(key, rowset_schema.get(), true, specified_rowsets, &loc, + cast_set(dummy_version.first - 1), segment_caches, + &rowset_find, true, nullptr, nullptr, tablet_delete_bitmap); + } bool expected_st = st.ok() || st.is() || st.is(); // It's a defensive DCHECK, we need to exclude some common errors to avoid core-dump // while stress test @@ -758,11 +777,11 @@ Status BaseTablet::calc_segment_delete_bitmap(RowsetSharedPtr rowset, if (config::enable_merge_on_write_correctness_check) { RowsetIdUnorderedSet rowsetids; - for (const auto& rowset : specified_rowsets) { - rowsetids.emplace(rowset->rowset_id()); + for (const auto& specified_rowset : specified_rowsets) { + rowsetids.emplace(specified_rowset->rowset_id()); VLOG_NOTICE << "[tabletID:" << tablet_id() << "]" << "[add_sentinel_mark_to_delete_bitmap][end_version:" << end_version << "]" - << "add:" << rowset->rowset_id(); + << "add:" << specified_rowset->rowset_id(); } add_sentinel_mark_to_delete_bitmap(delete_bitmap.get(), rowsetids); } @@ -872,11 +891,11 @@ Status BaseTablet::fetch_value_through_row_column(RowsetSharedPtr input_rowset, std::vector default_values; default_values.resize(cids.size()); for (int i = 0; i < cids.size(); ++i) { - const TabletColumn& column = tablet_schema.column(cids[i]); + const TabletColumn& tablet_column = tablet_schema.column(cids[i]); vectorized::DataTypePtr type = - 
vectorized::DataTypeFactory::instance().create_data_type(column); - col_uid_to_idx[column.unique_id()] = i; - default_values[i] = column.default_value(); + vectorized::DataTypeFactory::instance().create_data_type(tablet_column); + col_uid_to_idx[tablet_column.unique_id()] = i; + default_values[i] = tablet_column.default_value(); serdes[i] = type->get_serde(); } vectorized::JsonbSerializeUtil::jsonb_to_block(serdes, *string_column, col_uid_to_idx, block, @@ -1130,7 +1149,7 @@ Status BaseTablet::generate_new_block_for_flexible_partial_update( const signed char* delete_sign_column_data) { if (skipped) { if (delete_sign_column_data != nullptr && - delete_sign_column_data[read_index_old[idx]] != 0) { + delete_sign_column_data[read_index_old[cast_set(idx)]] != 0) { if (tablet_column.has_default_value()) { new_col->insert_from(default_value_col, 0); } else if (tablet_column.is_nullable()) { @@ -1300,20 +1319,22 @@ Status BaseTablet::check_delete_bitmap_correctness(DeleteBitmapPtr delete_bitmap for (const auto& rowset : *rowsets) { rapidjson::Value value; std::string version_str = rowset->get_rowset_info_str(); - value.SetString(version_str.c_str(), version_str.length(), + value.SetString(version_str.c_str(), + cast_set(version_str.length()), required_rowsets_arr.GetAllocator()); required_rowsets_arr.PushBack(value, required_rowsets_arr.GetAllocator()); } } else { - std::vector rowsets; + std::vector tablet_rowsets; { std::shared_lock meta_rlock(_meta_lock); - rowsets = get_rowset_by_ids(&rowset_ids); + tablet_rowsets = get_rowset_by_ids(&rowset_ids); } - for (const auto& rowset : rowsets) { + for (const auto& rowset : tablet_rowsets) { rapidjson::Value value; std::string version_str = rowset->get_rowset_info_str(); - value.SetString(version_str.c_str(), version_str.length(), + value.SetString(version_str.c_str(), + cast_set(version_str.length()), required_rowsets_arr.GetAllocator()); required_rowsets_arr.PushBack(value, required_rowsets_arr.GetAllocator()); } @@ -1321,7 
+1342,8 @@ Status BaseTablet::check_delete_bitmap_correctness(DeleteBitmapPtr delete_bitmap for (const auto& missing_rowset_id : missing_ids) { rapidjson::Value miss_value; std::string rowset_id_str = missing_rowset_id.to_string(); - miss_value.SetString(rowset_id_str.c_str(), rowset_id_str.length(), + miss_value.SetString(rowset_id_str.c_str(), + cast_set(rowset_id_str.length()), missing_rowsets_arr.GetAllocator()); missing_rowsets_arr.PushBack(miss_value, missing_rowsets_arr.GetAllocator()); } @@ -1341,7 +1363,8 @@ Status BaseTablet::check_delete_bitmap_correctness(DeleteBitmapPtr delete_bitmap } Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInfo* txn_info, - int64_t txn_id, int64_t txn_expiration) { + int64_t txn_id, int64_t txn_expiration, + DeleteBitmapPtr tablet_delete_bitmap) { SCOPED_BVAR_LATENCY(g_tablet_update_delete_bitmap_latency); RowsetIdUnorderedSet cur_rowset_ids; RowsetIdUnorderedSet rowset_ids_to_add; @@ -1370,6 +1393,8 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf auto t1 = watch.get_elapse_time_us(); { + int64_t next_visible_version = txn_info->is_txn_load ? txn_info->next_visible_version + : txn_info->rowset->start_version(); std::shared_lock meta_rlock(self->_meta_lock); // tablet is under alter process. The delete bitmap will be calculated after conversion. 
if (self->tablet_state() == TABLET_NOTREADY) { @@ -1377,7 +1402,7 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf << self->tablet_id(); return Status::OK(); } - RETURN_IF_ERROR(self->get_all_rs_id_unlocked(cur_version - 1, &cur_rowset_ids)); + RETURN_IF_ERROR(self->get_all_rs_id_unlocked(next_visible_version - 1, &cur_rowset_ids)); } auto t2 = watch.get_elapse_time_us(); @@ -1392,6 +1417,15 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf std::shared_lock meta_rlock(self->_meta_lock); specified_rowsets = self->get_rowset_by_ids(&rowset_ids_to_add); } + if (txn_info->is_txn_load) { + for (auto invisible_rowset : txn_info->invisible_rowsets) { + specified_rowsets.emplace_back(invisible_rowset); + } + std::sort(specified_rowsets.begin(), specified_rowsets.end(), + [](RowsetSharedPtr& lhs, RowsetSharedPtr& rhs) { + return lhs->end_version() > rhs->end_version(); + }); + } auto t3 = watch.get_elapse_time_us(); // If a rowset is produced by compaction before the commit phase of the partial update load @@ -1404,12 +1438,12 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf txn_info->partial_update_info->max_version_in_flush_phase; DCHECK(max_version_in_flush_phase != -1); std::vector remained_rowsets; - for (const auto& rowset : specified_rowsets) { - if (rowset->end_version() <= max_version_in_flush_phase && - rowset->produced_by_compaction()) { - rowsets_skip_alignment.emplace_back(rowset); + for (const auto& specified_rowset : specified_rowsets) { + if (specified_rowset->end_version() <= max_version_in_flush_phase && + specified_rowset->produced_by_compaction()) { + rowsets_skip_alignment.emplace_back(specified_rowset); } else { - remained_rowsets.emplace_back(rowset); + remained_rowsets.emplace_back(specified_rowset); } } if (!rowsets_skip_alignment.empty()) { @@ -1436,7 +1470,8 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf auto 
token = self->calc_delete_bitmap_executor()->create_token(); // set rowset_writer to nullptr to skip the alignment process RETURN_IF_ERROR(calc_delete_bitmap(self, rowset, segments, rowsets_skip_alignment, - delete_bitmap, cur_version - 1, token.get(), nullptr)); + delete_bitmap, cur_version - 1, token.get(), nullptr, + tablet_delete_bitmap)); RETURN_IF_ERROR(token->wait()); } @@ -1444,13 +1479,14 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf // Otherwise, it will be submitted to the thread pool for calculation. if (segments.size() <= 1) { RETURN_IF_ERROR(calc_delete_bitmap(self, rowset, segments, specified_rowsets, delete_bitmap, - cur_version - 1, nullptr, transient_rs_writer.get())); + cur_version - 1, nullptr, transient_rs_writer.get(), + tablet_delete_bitmap)); } else { auto token = self->calc_delete_bitmap_executor()->create_token(); RETURN_IF_ERROR(calc_delete_bitmap(self, rowset, segments, specified_rowsets, delete_bitmap, - cur_version - 1, token.get(), - transient_rs_writer.get())); + cur_version - 1, token.get(), transient_rs_writer.get(), + tablet_delete_bitmap)); RETURN_IF_ERROR(token->wait()); } @@ -1501,8 +1537,9 @@ Status BaseTablet::update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInf segments.begin(), segments.end(), 0, [](size_t sum, const segment_v2::SegmentSharedPtr& s) { return sum += s->num_rows(); }); auto t5 = watch.get_elapse_time_us(); + int64_t lock_id = txn_info->is_txn_load ? 
txn_info->lock_id : -1; RETURN_IF_ERROR(self->save_delete_bitmap(txn_info, txn_id, delete_bitmap, - transient_rs_writer.get(), cur_rowset_ids)); + transient_rs_writer.get(), cur_rowset_ids, lock_id)); LOG(INFO) << "[Publish] construct delete bitmap tablet: " << self->tablet_id() << ", rowset_ids to add: " << rowset_ids_to_add.size() << ", rowset_ids to del: " << rowset_ids_to_del.size() @@ -1720,8 +1757,8 @@ std::vector BaseTablet::get_snapshot_rowset(bool include_stale_ void BaseTablet::calc_consecutive_empty_rowsets( std::vector* empty_rowsets, - const std::vector& candidate_rowsets, int limit) { - int len = candidate_rowsets.size(); + const std::vector& candidate_rowsets, int64_t limit) { + int len = cast_set(candidate_rowsets.size()); for (int i = 0; i < len - 1; ++i) { auto rowset = candidate_rowsets[i]; auto next_rowset = candidate_rowsets[i + 1]; @@ -1757,7 +1794,7 @@ void BaseTablet::calc_consecutive_empty_rowsets( } Status BaseTablet::calc_file_crc(uint32_t* crc_value, int64_t start_version, int64_t end_version, - int32_t* rowset_count, int64_t* file_count) { + uint32_t* rowset_count, int64_t* file_count) { Version v(start_version, end_version); std::vector rowsets; traverse_rowsets([&rowsets, &v](const auto& rs) { @@ -1767,7 +1804,7 @@ Status BaseTablet::calc_file_crc(uint32_t* crc_value, int64_t start_version, int } }); std::sort(rowsets.begin(), rowsets.end(), Rowset::comparator); - *rowset_count = rowsets.size(); + *rowset_count = cast_set(rowsets.size()); *crc_value = 0; *file_count = 0; diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index b5da0e3bf06be1..c6de447200f87c 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -156,7 +156,8 @@ class BaseTablet { std::vector>& segment_caches, RowsetSharedPtr* rowset = nullptr, bool with_rowid = true, std::string* encoded_seq_value = nullptr, - OlapReaderStatistics* stats = nullptr); + OlapReaderStatistics* stats = nullptr, + DeleteBitmapPtr tablet_delete_bitmap = 
nullptr); // calc delete bitmap when flush memtable, use a fake version to calc // For example, cur max version is 5, and we use version 6 to calc but @@ -169,13 +170,15 @@ class BaseTablet { const std::vector& specified_rowsets, DeleteBitmapPtr delete_bitmap, int64_t version, CalcDeleteBitmapToken* token, - RowsetWriter* rowset_writer = nullptr); + RowsetWriter* rowset_writer = nullptr, + DeleteBitmapPtr tablet_delete_bitmap = nullptr); Status calc_segment_delete_bitmap(RowsetSharedPtr rowset, const segment_v2::SegmentSharedPtr& seg, const std::vector& specified_rowsets, DeleteBitmapPtr delete_bitmap, int64_t end_version, - RowsetWriter* rowset_writer); + RowsetWriter* rowset_writer, + DeleteBitmapPtr tablet_delete_bitmap = nullptr); Status calc_delete_bitmap_between_segments( RowsetSharedPtr rowset, const std::vector& segments, @@ -235,11 +238,13 @@ class BaseTablet { int64_t txn_expiration = 0) = 0; static Status update_delete_bitmap(const BaseTabletSPtr& self, TabletTxnInfo* txn_info, - int64_t txn_id, int64_t txn_expiration = 0); + int64_t txn_id, int64_t txn_expiration = 0, + DeleteBitmapPtr tablet_delete_bitmap = nullptr); virtual Status save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t txn_id, DeleteBitmapPtr delete_bitmap, RowsetWriter* rowset_writer, - const RowsetIdUnorderedSet& cur_rowset_ids) = 0; + const RowsetIdUnorderedSet& cur_rowset_ids, + int64_t lock_id = -1) = 0; virtual CalcDeleteBitmapExecutor* calc_delete_bitmap_executor() = 0; void calc_compaction_output_rowset_delete_bitmap( @@ -271,10 +276,13 @@ class BaseTablet { // Find the first consecutive empty rowsets. 
output->size() >= limit void calc_consecutive_empty_rowsets(std::vector* empty_rowsets, const std::vector& candidate_rowsets, - int limit); + int64_t limit); // Return the merged schema of all rowsets - virtual TabletSchemaSPtr merged_tablet_schema() const { return _max_version_schema; } + virtual TabletSchemaSPtr merged_tablet_schema() const { + std::shared_lock rlock(_meta_lock); + return _max_version_schema; + } void traverse_rowsets(std::function visitor, bool include_stale = false) { @@ -289,7 +297,7 @@ class BaseTablet { } Status calc_file_crc(uint32_t* crc_value, int64_t start_version, int64_t end_version, - int32_t* rowset_count, int64_t* file_count); + uint32_t* rowset_count, int64_t* file_count); Status show_nested_index_file(std::string* json_meta); diff --git a/be/src/olap/bitmap_filter_predicate.h b/be/src/olap/bitmap_filter_predicate.h index 716c99927bf2d6..8d89c7a31fb271 100644 --- a/be/src/olap/bitmap_filter_predicate.h +++ b/be/src/olap/bitmap_filter_predicate.h @@ -37,7 +37,7 @@ class BitmapFilterColumnPredicate : public ColumnPredicate { using SpecificFilter = BitmapFilterFunc; BitmapFilterColumnPredicate(uint32_t column_id, - const std::shared_ptr& filter, int) + const std::shared_ptr& filter) : ColumnPredicate(column_id), _filter(filter), _specific_filter(assert_cast(_filter.get())) {} diff --git a/be/src/olap/bloom_filter.hpp b/be/src/olap/bloom_filter.hpp deleted file mode 100644 index 5c7cb5f9e6419f..00000000000000 --- a/be/src/olap/bloom_filter.hpp +++ /dev/null @@ -1,272 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef DORIS_BE_SRC_OLAP_COLUMN_FILE_BLOOM_FILTER_HPP -#define DORIS_BE_SRC_OLAP_COLUMN_FILE_BLOOM_FILTER_HPP - -#include - -#include -#include - -#include "olap/olap_define.h" -#include "olap/utils.h" -#include "util/hash_util.hpp" - -namespace doris { - -static const uint64_t DEFAULT_SEED = 104729; -static const uint64_t BLOOM_FILTER_NULL_HASHCODE = 2862933555777941757ULL; - -struct BloomFilterIndexHeader { - uint64_t block_count; - BloomFilterIndexHeader() : block_count(0) {} -} __attribute__((packed)); - -// Bare metal bit set implementation. For performance reasons, this implementation does not -// check for index bounds nor expand the bit set if the specified index is greater than the size. 
-class BitSet { -public: - BitSet() : _data(nullptr), _data_len(0) {} - - ~BitSet() { SAFE_DELETE_ARRAY(_data); } - - // Init BitSet with given bit_num, which will align up to uint64_t - bool init(uint32_t bit_num) { - if (bit_num <= 0) { - return false; - } - - _data_len = (bit_num + sizeof(uint64_t) * 8 - 1) / (sizeof(uint64_t) * 8); - _data = new (std::nothrow) uint64_t[_data_len]; - if (_data == nullptr) { - return false; - } - - memset(_data, 0, _data_len * sizeof(uint64_t)); - return true; - } - - // Init BitSet with given buffer - bool init(uint64_t* data, uint32_t data_len) { - _data = data; - _data_len = data_len; - return true; - } - - // Set the bit specified by param, note that uint64_t type contains 2^6 bits - void set(uint32_t index) { _data[index >> 6] |= 1L << (index % 64); } - - // Return true if the bit specified by param is set - bool get(uint32_t index) const { return (_data[index >> 6] & (1L << (index % 64))) != 0; } - - // Merge with another BitSet by byte, return false when the length is not equal - bool merge(const BitSet& set) { - if (_data_len != set.data_len()) { - return false; - } - - for (uint32_t i = 0; i < _data_len; ++i) { - _data[i] |= set.data()[i]; - } - - return true; - } - - // Convert BitSet to string to convenient debug and test - std::string to_string() const { - uint32_t bit_num = _data_len * sizeof(uint64_t) * 8; - std::string str(bit_num, '0'); - for (uint32_t i = 0; i < bit_num; ++i) { - if ((_data[i >> 6] & (1L << i)) != 0) { - str[i] = '1'; - } - } - - return str; - } - - uint64_t* data() const { return _data; } - - uint32_t data_len() const { return _data_len; } - - uint32_t bit_num() const { return _data_len * sizeof(uint64_t) * 8; } - - void clear() { memset(_data, 0, _data_len * sizeof(uint64_t)); } - - void reset() { - _data = NULL; - _data_len = 0; - } - -private: - uint64_t* _data; - uint32_t _data_len; -}; - -class BloomFilter { -public: - BloomFilter() : _bit_num(0), _hash_function_num(0) {} - ~BloomFilter() 
{} - - // Create BloomFilter with given entry num and fpp, which is used for loading data - bool init(int64_t expected_entries, double fpp) { - uint32_t bit_num = _optimal_bit_num(expected_entries, fpp); - if (!_bit_set.init(bit_num)) { - return false; - } - - _bit_num = _bit_set.bit_num(); - _hash_function_num = _optimal_hash_function_num(expected_entries, _bit_num); - return true; - } - - // Create BloomFilter with given entry num and default fpp - bool init(int64_t expected_entries) { - return this->init(expected_entries, BLOOM_FILTER_DEFAULT_FPP); - } - - // Init BloomFilter with given buffer, which is used for query - bool init(uint64_t* data, uint32_t len, uint32_t hash_function_num) { - _bit_num = sizeof(uint64_t) * 8 * len; - _hash_function_num = hash_function_num; - return _bit_set.init(data, len); - } - - // Compute hash value of given buffer and add to BloomFilter - void add_bytes(const char* buf, uint32_t len) { - uint64_t hash = buf == nullptr ? BLOOM_FILTER_NULL_HASHCODE - : HashUtil::hash64(buf, len, DEFAULT_SEED); - add_hash(hash); - } - - // Generate multiple hash value according to following rule: - // new_hash_value = hash_high_part + (i * hash_low_part) - void add_hash(uint64_t hash) { - uint32_t hash1 = (uint32_t)hash; - uint32_t hash2 = (uint32_t)(hash >> 32); - - for (uint32_t i = 0; i < _hash_function_num; ++i) { - uint64_t combine_hash = hash1 + hash2 * i; - uint32_t index = combine_hash % _bit_num; - _bit_set.set(index); - } - } - - // Compute hash value of given buffer and verify whether exist in BloomFilter - bool test_bytes(const char* buf, uint32_t len) const { - uint64_t hash = buf == nullptr ? 
BLOOM_FILTER_NULL_HASHCODE - : HashUtil::hash64(buf, len, DEFAULT_SEED); - return test_hash(hash); - } - - // Verify whether hash value in BloomFilter - bool test_hash(uint64_t hash) const { - uint32_t hash1 = (uint32_t)hash; - uint32_t hash2 = (uint32_t)(hash >> 32); - - for (uint32_t i = 0; i < _hash_function_num; ++i) { - uint64_t combine_hash = hash1 + hash2 * i; - uint32_t index = combine_hash % _bit_num; - if (!_bit_set.get(index)) { - return false; - } - } - - return true; - } - - // Merge with another BloomFilter, return false when the length - // and hash function number is not equal - bool merge(const BloomFilter& that) { - if (_bit_num == that.bit_num() && _hash_function_num == that.hash_function_num()) { - _bit_set.merge(that.bit_set()); - return true; - } - - return false; - } - - void clear() { _bit_set.clear(); } - - void reset() { - _bit_num = 0; - _hash_function_num = 0; - _bit_set.reset(); - } - - uint32_t bit_num() const { return _bit_num; } - - uint32_t hash_function_num() const { return _hash_function_num; } - - const BitSet& bit_set() const { return _bit_set; } - - uint64_t* bit_set_data() const { return _bit_set.data(); } - - uint32_t bit_set_data_len() const { return _bit_set.data_len(); } - - // Convert BloomFilter to string to convenient debug and test - std::string to_string() const { - std::stringstream bf_stream; - bf_stream << "bit_num:" << _bit_num << " hash_function_num:" << _hash_function_num - << " bit_set:" << _bit_set.to_string(); - return bf_stream.str(); - } - - // Get points which set by given buffer in the BitSet - std::string get_bytes_points_string(const char* buf, uint32_t len) const { - uint64_t hash = buf == nullptr ? 
BLOOM_FILTER_NULL_HASHCODE - : HashUtil::hash64(buf, len, DEFAULT_SEED); - uint32_t hash1 = (uint32_t)hash; - uint32_t hash2 = (uint32_t)(hash >> 32); - - std::stringstream stream; - for (uint32_t i = 0; i < _hash_function_num; ++i) { - if (i != 0) { - stream << "-"; - } - - uint32_t combine_hash = hash1 + hash2 * i; - uint32_t index = combine_hash % _bit_num; - stream << index; - } - - return stream.str(); - } - -private: - // Compute the optimal bit number according to the following rule: - // m = -n * ln(fpp) / (ln(2) ^ 2) - uint32_t _optimal_bit_num(int64_t n, double fpp) { - return (uint32_t)(-n * log(fpp) / (log(2) * log(2))); - } - - // Compute the optimal hash function number according to the following rule: - // k = round(m * ln(2) / n) - uint32_t _optimal_hash_function_num(int64_t n, uint32_t m) { - uint32_t k = (uint32_t)round(m * log(2) / n); - return k > 1 ? k : 1; - } - - BitSet _bit_set; - uint32_t _bit_num; - uint32_t _hash_function_num; -}; - -} // namespace doris - -#endif // DORIS_BE_SRC_OLAP_COLUMN_FILE_BLOOM_FILTER_HPP diff --git a/be/src/olap/bloom_filter_predicate.h b/be/src/olap/bloom_filter_predicate.h index d9d37d13198bbd..cd4f89b57ec50d 100644 --- a/be/src/olap/bloom_filter_predicate.h +++ b/be/src/olap/bloom_filter_predicate.h @@ -65,7 +65,9 @@ class BloomFilterColumnPredicate : public ColumnPredicate { uint16_t evaluate(const vectorized::IColumn& column, const uint8_t* null_map, uint16_t* sel, uint16_t size) const { if constexpr (is_nullable) { - DCHECK(null_map); + if (!null_map) { + throw Exception(ErrorCode::INTERNAL_ERROR, "null_map is nullptr"); + } } uint16_t new_size = 0; @@ -91,7 +93,9 @@ class BloomFilterColumnPredicate : public ColumnPredicate { int get_filter_id() const override { int filter_id = _filter->get_filter_id(); - DCHECK(filter_id != -1); + if (filter_id == 1) { + throw Exception(ErrorCode::INTERNAL_ERROR, "filter_id is -1"); + } return filter_id; } diff --git a/be/src/olap/calc_delete_bitmap_executor.cpp 
b/be/src/olap/calc_delete_bitmap_executor.cpp index 3983dc0a98642a..e45f9801f68ba4 100644 --- a/be/src/olap/calc_delete_bitmap_executor.cpp +++ b/be/src/olap/calc_delete_bitmap_executor.cpp @@ -34,7 +34,8 @@ Status CalcDeleteBitmapToken::submit(BaseTabletSPtr tablet, RowsetSharedPtr cur_ const segment_v2::SegmentSharedPtr& cur_segment, const std::vector& target_rowsets, int64_t end_version, DeleteBitmapPtr delete_bitmap, - RowsetWriter* rowset_writer) { + RowsetWriter* rowset_writer, + DeleteBitmapPtr tablet_delete_bitmap) { { std::shared_lock rlock(_lock); RETURN_IF_ERROR(_status); @@ -44,7 +45,8 @@ Status CalcDeleteBitmapToken::submit(BaseTabletSPtr tablet, RowsetSharedPtr cur_ return _thread_token->submit_func([=, this]() { SCOPED_ATTACH_TASK(_query_thread_context); auto st = tablet->calc_segment_delete_bitmap(cur_rowset, cur_segment, target_rowsets, - delete_bitmap, end_version, rowset_writer); + delete_bitmap, end_version, rowset_writer, + tablet_delete_bitmap); if (!st.ok()) { LOG(WARNING) << "failed to calc segment delete bitmap, tablet_id: " << tablet->tablet_id() << " rowset: " << cur_rowset->rowset_id() diff --git a/be/src/olap/calc_delete_bitmap_executor.h b/be/src/olap/calc_delete_bitmap_executor.h index fa1e79b7feaa19..288108b04971df 100644 --- a/be/src/olap/calc_delete_bitmap_executor.h +++ b/be/src/olap/calc_delete_bitmap_executor.h @@ -52,7 +52,8 @@ class CalcDeleteBitmapToken { Status submit(BaseTabletSPtr tablet, RowsetSharedPtr cur_rowset, const segment_v2::SegmentSharedPtr& cur_segment, const std::vector& target_rowsets, int64_t end_version, - DeleteBitmapPtr delete_bitmap, RowsetWriter* rowset_writer); + DeleteBitmapPtr delete_bitmap, RowsetWriter* rowset_writer, + DeleteBitmapPtr tablet_delete_bitmap); // wait all tasks in token to be completed. 
Status wait(); diff --git a/be/src/olap/compaction.cpp b/be/src/olap/compaction.cpp index a40e28669e90cc..aec38699e014a2 100644 --- a/be/src/olap/compaction.cpp +++ b/be/src/olap/compaction.cpp @@ -44,6 +44,7 @@ #include "io/fs/file_system.h" #include "io/fs/file_writer.h" #include "io/fs/remote_file_system.h" +#include "io/io_common.h" #include "olap/cumulative_compaction_policy.h" #include "olap/cumulative_compaction_time_series_policy.h" #include "olap/data_dir.h" @@ -190,11 +191,14 @@ Status Compaction::merge_input_rowsets() { SCOPED_TIMER(_merge_rowsets_latency_timer); // 1. Merge segment files and write bkd inverted index if (_is_vertical) { + if (!_tablet->tablet_schema()->cluster_key_uids().empty()) { + RETURN_IF_ERROR(update_delete_bitmap()); + } res = Merger::vertical_merge_rowsets(_tablet, compaction_type(), *_cur_tablet_schema, input_rs_readers, _output_rs_writer.get(), get_avg_segment_rows(), way_num, &_stats); } else { - if (!_tablet->tablet_schema()->cluster_key_idxes().empty()) { + if (!_tablet->tablet_schema()->cluster_key_uids().empty()) { return Status::InternalError( "mow table with cluster keys does not support non vertical compaction"); } @@ -345,8 +349,9 @@ bool CompactionMixin::handle_ordered_data_compaction() { if (!config::enable_ordered_data_compaction) { return false; } - if (compaction_type() == ReaderType::READER_COLD_DATA_COMPACTION) { - // The remote file system does not support to link files. + if (compaction_type() == ReaderType::READER_COLD_DATA_COMPACTION || + compaction_type() == ReaderType::READER_FULL_COMPACTION) { + // The remote file system and full compaction does not support to link files. 
return false; } if (_tablet->keys_type() == KeysType::UNIQUE_KEYS && @@ -490,10 +495,35 @@ Status CompactionMixin::execute_compact_impl(int64_t permits) { Status Compaction::do_inverted_index_compaction() { const auto& ctx = _output_rs_writer->context(); if (!config::inverted_index_compaction_enable || _input_row_num <= 0 || - !_stats.rowid_conversion || ctx.columns_to_do_index_compaction.empty()) { + ctx.columns_to_do_index_compaction.empty()) { return Status::OK(); } + auto error_handler = [this](int64_t index_id, int64_t column_uniq_id) { + LOG(WARNING) << "failed to do index compaction" + << ". tablet=" << _tablet->tablet_id() << ". column uniq id=" << column_uniq_id + << ". index_id=" << index_id; + for (auto& rowset : _input_rowsets) { + rowset->set_skip_index_compaction(column_uniq_id); + LOG(INFO) << "mark skipping inverted index compaction next time" + << ". tablet=" << _tablet->tablet_id() << ", rowset=" << rowset->rowset_id() + << ", column uniq id=" << column_uniq_id << ", index_id=" << index_id; + } + }; + + DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_rowid_conversion_null", + { _stats.rowid_conversion = nullptr; }) + if (!_stats.rowid_conversion) { + LOG(WARNING) << "failed to do index compaction, rowid conversion is null" + << ". tablet=" << _tablet->tablet_id() + << ", input row number=" << _input_row_num; + mark_skip_index_compaction(ctx, error_handler); + + return Status::Error( + "failed to do index compaction, rowid conversion is null. tablet={}", + _tablet->tablet_id()); + } + OlapStopWatch inverted_watch; // translation vec @@ -516,8 +546,7 @@ Status Compaction::do_inverted_index_compaction() { auto src_segment_num = src_seg_to_id_map.size(); auto dest_segment_num = dest_segment_num_rows.size(); - DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_dest_segment_num_is_zero", - { dest_segment_num = 0; }) + // when all the input rowsets are deleted, the output rowset will be empty and dest_segment_num will be 0. 
if (dest_segment_num <= 0) { LOG(INFO) << "skip doing index compaction due to no output segments" << ". tablet=" << _tablet->tablet_id() << ", input row number=" << _input_row_num @@ -595,29 +624,62 @@ Status Compaction::do_inverted_index_compaction() { DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_find_rowset_error", { find_it = rs_id_to_rowset_map.end(); }) if (find_it == rs_id_to_rowset_map.end()) [[unlikely]] { - // DCHECK(false) << _tablet->tablet_id() << ' ' << rowset_id; - return Status::InternalError("cannot find rowset. tablet_id={} rowset_id={}", - _tablet->tablet_id(), rowset_id.to_string()); + LOG(WARNING) << "failed to do index compaction, cannot find rowset. tablet_id=" + << _tablet->tablet_id() << " rowset_id=" << rowset_id.to_string(); + mark_skip_index_compaction(ctx, error_handler); + return Status::Error( + "failed to do index compaction, cannot find rowset. tablet_id={} rowset_id={}", + _tablet->tablet_id(), rowset_id.to_string()); } auto* rowset = find_it->second; auto fs = rowset->rowset_meta()->fs(); DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_get_fs_error", { fs = nullptr; }) if (!fs) { - return Status::InternalError("get fs failed, resource_id={}", - rowset->rowset_meta()->resource_id()); + LOG(WARNING) << "failed to do index compaction, get fs failed. resource_id=" + << rowset->rowset_meta()->resource_id(); + mark_skip_index_compaction(ctx, error_handler); + return Status::Error( + "get fs failed, resource_id={}", rowset->rowset_meta()->resource_id()); } - auto seg_path = DORIS_TRY(rowset->segment_path(seg_id)); + auto seg_path = rowset->segment_path(seg_id); + DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_seg_path_nullptr", { + seg_path = ResultError(Status::Error( + "do_inverted_index_compaction_seg_path_nullptr")); + }) + if (!seg_path.has_value()) { + LOG(WARNING) << "failed to do index compaction, get segment path failed. 
tablet_id=" + << _tablet->tablet_id() << " rowset_id=" << rowset_id.to_string() + << " seg_id=" << seg_id; + mark_skip_index_compaction(ctx, error_handler); + return Status::Error( + "get segment path failed. tablet_id={} rowset_id={} seg_id={}", + _tablet->tablet_id(), rowset_id.to_string(), seg_id); + } auto inverted_index_file_reader = std::make_unique( - fs, std::string {InvertedIndexDescriptor::get_index_file_path_prefix(seg_path)}, + fs, + std::string {InvertedIndexDescriptor::get_index_file_path_prefix(seg_path.value())}, _cur_tablet_schema->get_inverted_index_storage_format(), rowset->rowset_meta()->inverted_index_file_info(seg_id)); - bool open_idx_file_cache = false; - RETURN_NOT_OK_STATUS_WITH_WARN( - inverted_index_file_reader->init(config::inverted_index_read_buffer_size, - open_idx_file_cache), - "inverted_index_file_reader init failed"); + auto st = inverted_index_file_reader->init(config::inverted_index_read_buffer_size); + DBUG_EXECUTE_IF("Compaction::do_inverted_index_compaction_init_inverted_index_file_reader", + { + st = Status::Error( + "debug point: " + "Compaction::do_inverted_index_compaction_init_inverted_index_" + "file_reader error"); + }) + if (!st.ok()) { + LOG(WARNING) << "failed to do index compaction, init inverted index file reader " + "failed. tablet_id=" + << _tablet->tablet_id() << " rowset_id=" << rowset_id.to_string() + << " seg_id=" << seg_id; + mark_skip_index_compaction(ctx, error_handler); + return Status::Error( + "init inverted index file reader failed. 
tablet_id={} rowset_id={} seg_id={}", + _tablet->tablet_id(), rowset_id.to_string(), seg_id); + } inverted_index_file_readers[m.second] = std::move(inverted_index_file_reader); } @@ -625,7 +687,20 @@ Status Compaction::do_inverted_index_compaction() { // format: rowsetId_segmentId auto& inverted_index_file_writers = dynamic_cast(_output_rs_writer.get()) ->inverted_index_file_writers(); - DCHECK_EQ(inverted_index_file_writers.size(), dest_segment_num); + DBUG_EXECUTE_IF( + "Compaction::do_inverted_index_compaction_inverted_index_file_writers_size_error", + { inverted_index_file_writers.clear(); }) + if (inverted_index_file_writers.size() != dest_segment_num) { + LOG(WARNING) << "failed to do index compaction, dest segment num not match. tablet_id=" + << _tablet->tablet_id() << " dest_segment_num=" << dest_segment_num + << " inverted_index_file_writers.size()=" + << inverted_index_file_writers.size(); + mark_skip_index_compaction(ctx, error_handler); + return Status::Error( + "dest segment num not match. tablet_id={} dest_segment_num={} " + "inverted_index_file_writers.size()={}", + _tablet->tablet_id(), dest_segment_num, inverted_index_file_writers.size()); + } // use tmp file dir to store index files auto tmp_file_dir = ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir(); @@ -634,18 +709,6 @@ Status Compaction::do_inverted_index_compaction() { << ". tablet=" << _tablet->tablet_id() << ", source index size=" << src_segment_num << ", destination index size=" << dest_segment_num << "."; - auto error_handler = [this](int64_t index_id, int64_t column_uniq_id) { - LOG(WARNING) << "failed to do index compaction" - << ". tablet=" << _tablet->tablet_id() << ". column uniq id=" << column_uniq_id - << ". index_id=" << index_id; - for (auto& rowset : _input_rowsets) { - rowset->set_skip_index_compaction(column_uniq_id); - LOG(INFO) << "mark skipping inverted index compaction next time" - << ". 
tablet=" << _tablet->tablet_id() << ", rowset=" << rowset->rowset_id() - << ", column uniq id=" << column_uniq_id << ", index_id=" << index_id; - } - }; - Status status = Status::OK(); for (auto&& column_uniq_id : ctx.columns_to_do_index_compaction) { auto col = _cur_tablet_schema->column_by_uid(column_uniq_id); @@ -655,6 +718,10 @@ Status Compaction::do_inverted_index_compaction() { if (index_meta == nullptr) { status = Status::Error( fmt::format("Can not find index_meta for col {}", col.name())); + LOG(WARNING) << "failed to do index compaction, can not find index_meta for column" + << ". tablet=" << _tablet->tablet_id() + << ", column uniq id=" << column_uniq_id; + error_handler(-1, column_uniq_id); break; } @@ -662,13 +729,38 @@ Status Compaction::do_inverted_index_compaction() { try { std::vector> src_idx_dirs(src_segment_num); for (int src_segment_id = 0; src_segment_id < src_segment_num; src_segment_id++) { - src_idx_dirs[src_segment_id] = - DORIS_TRY(inverted_index_file_readers[src_segment_id]->open(index_meta)); + auto res = inverted_index_file_readers[src_segment_id]->open(index_meta); + DBUG_EXECUTE_IF("Compaction::open_inverted_index_file_reader", { + res = ResultError(Status::Error( + "debug point: Compaction::open_index_file_reader error")); + }) + if (!res.has_value()) { + LOG(WARNING) << "failed to do index compaction, open inverted index file " + "reader failed" + << ". 
tablet=" << _tablet->tablet_id() + << ", column uniq id=" << column_uniq_id + << ", src_segment_id=" << src_segment_id; + throw Exception(ErrorCode::INVERTED_INDEX_COMPACTION_ERROR, res.error().msg()); + } + src_idx_dirs[src_segment_id] = std::move(res.value()); } for (int dest_segment_id = 0; dest_segment_id < dest_segment_num; dest_segment_id++) { - auto* dest_dir = - DORIS_TRY(inverted_index_file_writers[dest_segment_id]->open(index_meta)); - dest_index_dirs[dest_segment_id] = dest_dir; + auto res = inverted_index_file_writers[dest_segment_id]->open(index_meta); + DBUG_EXECUTE_IF("Compaction::open_inverted_index_file_writer", { + res = ResultError(Status::Error( + "debug point: Compaction::open_inverted_index_file_writer error")); + }) + if (!res.has_value()) { + LOG(WARNING) << "failed to do index compaction, open inverted index file " + "writer failed" + << ". tablet=" << _tablet->tablet_id() + << ", column uniq id=" << column_uniq_id + << ", dest_segment_id=" << dest_segment_id; + throw Exception(ErrorCode::INVERTED_INDEX_COMPACTION_ERROR, res.error().msg()); + } + // Destination directories in dest_index_dirs do not need to be deconstructed, + // but their lifecycle must be managed by inverted_index_file_writers. 
+ dest_index_dirs[dest_segment_id] = res.value().get(); } auto st = compact_column(index_meta->index_id(), src_idx_dirs, dest_index_dirs, index_tmp_path.native(), trans_vec, dest_segment_num_rows); @@ -679,6 +771,9 @@ Status Compaction::do_inverted_index_compaction() { } catch (CLuceneError& e) { error_handler(index_meta->index_id(), column_uniq_id); status = Status::Error(e.what()); + } catch (const Exception& e) { + error_handler(index_meta->index_id(), column_uniq_id); + status = Status::Error(e.what()); } } @@ -693,6 +788,23 @@ Status Compaction::do_inverted_index_compaction() { return Status::OK(); } +void Compaction::mark_skip_index_compaction( + const RowsetWriterContext& context, + const std::function& error_handler) { + for (auto&& column_uniq_id : context.columns_to_do_index_compaction) { + auto col = _cur_tablet_schema->column_by_uid(column_uniq_id); + const auto* index_meta = _cur_tablet_schema->inverted_index(col); + if (index_meta == nullptr) { + LOG(WARNING) << "mark skip index compaction, can not find index_meta for column" + << ". 
tablet=" << _tablet->tablet_id() + << ", column uniq id=" << column_uniq_id; + error_handler(-1, column_uniq_id); + continue; + } + error_handler(index_meta->index_id(), column_uniq_id); + } +} + void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { for (const auto& index : _cur_tablet_schema->inverted_indexes()) { auto col_unique_ids = index->col_unique_ids(); @@ -768,7 +880,8 @@ void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { // TODO: inverted_index_path auto seg_path = rowset->segment_path(i); DBUG_EXECUTE_IF("Compaction::construct_skip_inverted_index_seg_path_nullptr", { - seg_path = ResultError(Status::Error("error")); + seg_path = ResultError(Status::Error( + "construct_skip_inverted_index_seg_path_nullptr")); }) if (!seg_path) { LOG(WARNING) << seg_path.error(); @@ -779,13 +892,12 @@ void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { try { auto inverted_index_file_reader = std::make_unique( fs, - std::string { - InvertedIndexDescriptor::get_index_file_path_prefix(*seg_path)}, + std::string {InvertedIndexDescriptor::get_index_file_path_prefix( + seg_path.value())}, _cur_tablet_schema->get_inverted_index_storage_format(), rowset->rowset_meta()->inverted_index_file_info(i)); - bool open_idx_file_cache = false; auto st = inverted_index_file_reader->init( - config::inverted_index_read_buffer_size, open_idx_file_cache); + config::inverted_index_read_buffer_size); index_file_path = inverted_index_file_reader->get_index_file_path(index_meta); DBUG_EXECUTE_IF( "Compaction::construct_skip_inverted_index_index_file_reader_init_" @@ -855,6 +967,87 @@ void Compaction::construct_index_compaction_columns(RowsetWriterContext& ctx) { } } +Status CompactionMixin::update_delete_bitmap() { + // for mow with cluster keys, compaction read data with delete bitmap + // if tablet is not ready(such as schema change), we need to update delete bitmap + { + std::shared_lock 
meta_rlock(_tablet->get_header_lock()); + if (_tablet->tablet_state() != TABLET_NOTREADY) { + return Status::OK(); + } + } + OlapStopWatch watch; + std::vector rowsets; + for (const auto& rowset : _input_rowsets) { + std::lock_guard rwlock(tablet()->get_rowset_update_lock()); + std::shared_lock rlock(_tablet->get_header_lock()); + Status st = _tablet->update_delete_bitmap_without_lock(_tablet, rowset, &rowsets); + if (!st.ok()) { + LOG(INFO) << "failed update_delete_bitmap_without_lock for tablet_id=" + << _tablet->tablet_id() << ", st=" << st.to_string(); + return st; + } + rowsets.push_back(rowset); + } + LOG(INFO) << "finish update delete bitmap for tablet: " << _tablet->tablet_id() + << ", rowsets: " << _input_rowsets.size() << ", cost: " << watch.get_elapse_time_us() + << "(us)"; + return Status::OK(); +} + +Status CloudCompactionMixin::update_delete_bitmap() { + // for mow with cluster keys, compaction read data with delete bitmap + // if tablet is not ready(such as schema change), we need to update delete bitmap + { + std::shared_lock meta_rlock(_tablet->get_header_lock()); + if (_tablet->tablet_state() != TABLET_NOTREADY) { + return Status::OK(); + } + } + OlapStopWatch watch; + std::vector rowsets; + for (const auto& rowset : _input_rowsets) { + Status st = _tablet->update_delete_bitmap_without_lock(_tablet, rowset, &rowsets); + if (!st.ok()) { + LOG(INFO) << "failed update_delete_bitmap_without_lock for tablet_id=" + << _tablet->tablet_id() << ", st=" << st.to_string(); + return st; + } + rowsets.push_back(rowset); + } + LOG(INFO) << "finish update delete bitmap for tablet: " << _tablet->tablet_id() + << ", rowsets: " << _input_rowsets.size() << ", cost: " << watch.get_elapse_time_us() + << "(us)"; + return Status::OK(); +} + +void Compaction::agg_and_remove_old_version_delete_bitmap( + std::vector& pre_rowsets, + std::vector>& + to_remove_vec, + DeleteBitmapPtr& new_delete_bitmap) { + // agg previously rowset old version delete bitmap + auto 
pre_max_version = _output_rowset->version().second; + new_delete_bitmap = std::make_shared(_tablet->tablet_meta()->tablet_id()); + for (auto& rowset : pre_rowsets) { + if (rowset->rowset_meta()->total_disk_size() == 0) { + continue; + } + for (uint32_t seg_id = 0; seg_id < rowset->num_segments(); ++seg_id) { + rowset->rowset_id().to_string(); + DeleteBitmap::BitmapKey start {rowset->rowset_id(), seg_id, 0}; + DeleteBitmap::BitmapKey end {rowset->rowset_id(), seg_id, pre_max_version}; + auto d = _tablet->tablet_meta()->delete_bitmap().get_agg( + {rowset->rowset_id(), seg_id, pre_max_version}); + to_remove_vec.emplace_back(std::make_tuple(_tablet->tablet_id(), start, end)); + if (d->isEmpty()) { + continue; + } + new_delete_bitmap->set(end, *d); + } + } +} + Status CompactionMixin::construct_output_rowset_writer(RowsetWriterContext& ctx) { // only do index compaction for dup_keys and unique_keys with mow enabled if (config::inverted_index_compaction_enable && @@ -891,7 +1084,8 @@ Status CompactionMixin::modify_rowsets() { LOG(INFO) << "RowLocation Set inited succ for tablet:" << _tablet->tablet_id(); } std::unique_ptr> location_map; - if (config::enable_rowid_conversion_correctness_check) { + if (config::enable_rowid_conversion_correctness_check && + tablet()->tablet_schema()->cluster_key_uids().empty()) { location_map = std::make_unique>(); LOG(INFO) << "Location Map inited succ for tablet:" << _tablet->tablet_id(); } @@ -908,11 +1102,34 @@ Status CompactionMixin::modify_rowsets() { if (missed_rows) { missed_rows_size = missed_rows->size(); std::size_t merged_missed_rows_size = _stats.merged_rows; - if (!_tablet->tablet_meta()->tablet_schema()->cluster_key_idxes().empty()) { + if (!_tablet->tablet_meta()->tablet_schema()->cluster_key_uids().empty()) { merged_missed_rows_size += _stats.filtered_rows; } + + // Suppose a heavy schema change process on BE converting tablet A to tablet B. + // 1. during schema change double write, new loads write [X-Y] on tablet B. 
+ // 2. rowsets with version [a],[a+1],...,[b-1],[b] on tablet B are picked for cumu compaction(X<=aget_header_lock()); + need_to_check_missed_rows = + std::all_of(_input_rowsets.begin(), _input_rowsets.end(), + [&](const RowsetSharedPtr& rowset) { + return tablet()->rowset_exists_unlocked(rowset); + }); + } + if (_tablet->tablet_state() == TABLET_RUNNING && - merged_missed_rows_size != missed_rows_size) { + merged_missed_rows_size != missed_rows_size && need_to_check_missed_rows) { std::stringstream ss; ss << "cumulative compaction: the merged rows(" << _stats.merged_rows << "), filtered rows(" << _stats.filtered_rows @@ -1028,6 +1245,13 @@ Status CompactionMixin::modify_rowsets() { tablet()->delete_expired_stale_rowset(); } + if (config::enable_delete_bitmap_merge_on_compaction && + compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION && + _tablet->keys_type() == KeysType::UNIQUE_KEYS && + _tablet->enable_unique_key_merge_on_write() && _input_rowsets.size() != 1) { + process_old_version_delete_bitmap(); + } + int64_t cur_max_version = 0; { std::shared_lock rlock(_tablet->get_header_lock()); @@ -1046,6 +1270,36 @@ Status CompactionMixin::modify_rowsets() { return Status::OK(); } +void CompactionMixin::process_old_version_delete_bitmap() { + std::vector pre_rowsets {}; + for (const auto& it : tablet()->rowset_map()) { + if (it.first.second < _input_rowsets.front()->start_version()) { + pre_rowsets.emplace_back(it.second); + } + } + std::sort(pre_rowsets.begin(), pre_rowsets.end(), Rowset::comparator); + if (!pre_rowsets.empty()) { + std::vector> + to_remove_vec; + DeleteBitmapPtr new_delete_bitmap = nullptr; + agg_and_remove_old_version_delete_bitmap(pre_rowsets, to_remove_vec, new_delete_bitmap); + if (!new_delete_bitmap->empty()) { + // store agg delete bitmap + Version version(_input_rowsets.front()->start_version(), + _input_rowsets.back()->end_version()); + for (auto it = new_delete_bitmap->delete_bitmap.begin(); + it != 
new_delete_bitmap->delete_bitmap.end(); it++) { + _tablet->tablet_meta()->delete_bitmap().set(it->first, it->second); + } + _tablet->tablet_meta()->delete_bitmap().add_to_remove_queue(version.to_string(), + to_remove_vec); + DBUG_EXECUTE_IF("CumulativeCompaction.modify_rowsets.delete_expired_stale_rowsets", { + static_cast(_tablet.get())->delete_expired_stale_rowset(); + }); + } + } +} + bool CompactionMixin::_check_if_includes_input_rowsets( const RowsetIdUnorderedSet& commit_rowset_ids_set) const { std::vector commit_rowset_ids {}; @@ -1127,6 +1381,18 @@ Status CloudCompactionMixin::execute_compact_impl(int64_t permits) { RETURN_IF_ERROR(merge_input_rowsets()); + DBUG_EXECUTE_IF("CloudFullCompaction::modify_rowsets.wrong_rowset_id", { + DCHECK(compaction_type() == ReaderType::READER_FULL_COMPACTION); + RowsetId id; + id.version = 2; + id.hi = _output_rowset->rowset_meta()->rowset_id().hi + ((int64_t)(1) << 56); + id.mi = _output_rowset->rowset_meta()->rowset_id().mi; + id.lo = _output_rowset->rowset_meta()->rowset_id().lo; + _output_rowset->rowset_meta()->set_rowset_id(id); + LOG(INFO) << "[Debug wrong rowset id]:" + << _output_rowset->rowset_meta()->rowset_id().to_string(); + }) + RETURN_IF_ERROR(_engine.meta_mgr().commit_rowset(*_output_rowset->rowset_meta().get())); // 4. modify rowsets in memory @@ -1177,8 +1443,12 @@ Status CloudCompactionMixin::construct_output_rowset_writer(RowsetWriterContext& ctx.compaction_level = _engine.cumu_compaction_policy(compaction_policy) ->new_compaction_level(_input_rowsets); } - - ctx.write_file_cache = compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION; + // We presume that the data involved in cumulative compaction is sufficiently 'hot' + // and should always be retained in the cache. + // TODO(gavin): Ensure that the retention of hot data is implemented with precision. 
+ ctx.write_file_cache = (compaction_type() == ReaderType::READER_CUMULATIVE_COMPACTION) || + (config::enable_file_cache_keep_base_compaction_output && + compaction_type() == ReaderType::READER_BASE_COMPACTION); ctx.file_cache_ttl_sec = _tablet->ttl_seconds(); _output_rs_writer = DORIS_TRY(_tablet->create_rowset_writer(ctx, _is_vertical)); RETURN_IF_ERROR(_engine.meta_mgr().prepare_rowset(*_output_rs_writer->rowset_meta().get())); diff --git a/be/src/olap/compaction.h b/be/src/olap/compaction.h index 06ef4268529247..057f4084b068b3 100644 --- a/be/src/olap/compaction.h +++ b/be/src/olap/compaction.h @@ -70,6 +70,10 @@ class Compaction { // merge inverted index files Status do_inverted_index_compaction(); + // mark all columns in columns_to_do_index_compaction to skip index compaction next time. + void mark_skip_index_compaction(const RowsetWriterContext& context, + const std::function& error_handler); + void construct_index_compaction_columns(RowsetWriterContext& ctx); virtual Status construct_output_rowset_writer(RowsetWriterContext& ctx) = 0; @@ -84,6 +88,14 @@ class Compaction { int64_t merge_way_num(); + virtual Status update_delete_bitmap() = 0; + + void agg_and_remove_old_version_delete_bitmap( + std::vector& pre_rowsets, + std::vector>& + to_remove_vec, + DeleteBitmapPtr& new_delete_bitmap); + // the root tracker for this compaction std::shared_ptr _mem_tracker; @@ -146,6 +158,8 @@ class CompactionMixin : public Compaction { virtual Status modify_rowsets(); + Status update_delete_bitmap() override; + StorageEngine& _engine; private: @@ -158,6 +172,8 @@ class CompactionMixin : public Compaction { Status do_compact_ordered_rowsets(); + void process_old_version_delete_bitmap(); + bool _check_if_includes_input_rowsets(const RowsetIdUnorderedSet& commit_rowset_ids_set) const; PendingRowsetGuard _pending_rs_guard; @@ -175,6 +191,8 @@ class CloudCompactionMixin : public Compaction { protected: CloudTablet* cloud_tablet() { return static_cast(_tablet.get()); } + 
Status update_delete_bitmap() override; + virtual void garbage_collection(); CloudStorageEngine& _engine; diff --git a/be/src/olap/cumulative_compaction.cpp b/be/src/olap/cumulative_compaction.cpp index b961c694ede4d0..a9509a005763f6 100644 --- a/be/src/olap/cumulative_compaction.cpp +++ b/be/src/olap/cumulative_compaction.cpp @@ -100,6 +100,20 @@ Status CumulativeCompaction::prepare_compact() { } Status CumulativeCompaction::execute_compact() { + DBUG_EXECUTE_IF("CumulativeCompaction::execute_compact.block", { + auto target_tablet_id = dp->param("tablet_id", -1); + if (target_tablet_id == _tablet->tablet_id()) { + LOG(INFO) << "start debug block " + << "CumulativeCompaction::execute_compact.block"; + while (DebugPoints::instance()->is_enable( + "CumulativeCompaction::execute_compact.block")) { + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + } + LOG(INFO) << "end debug block " + << "CumulativeCompaction::execute_compact.block"; + } + }) + std::unique_lock lock(tablet()->get_cumulative_compaction_lock(), std::try_to_lock); if (!lock.owns_lock()) { return Status::Error( @@ -145,7 +159,7 @@ Status CumulativeCompaction::pick_rowsets_to_compact() { DCHECK(missing_versions.size() % 2 == 0); LOG(WARNING) << "There are missed versions among rowsets. 
" << "total missed version size: " << missing_versions.size() / 2 - << " first missed version prev rowset verison=" << missing_versions[0] + << ", first missed version prev rowset verison=" << missing_versions[0] << ", first missed version next rowset version=" << missing_versions[1] << ", tablet=" << _tablet->tablet_id(); } diff --git a/be/src/olap/cumulative_compaction_policy.cpp b/be/src/olap/cumulative_compaction_policy.cpp index ee7a2b1812a0ae..c812a12b656580 100644 --- a/be/src/olap/cumulative_compaction_policy.cpp +++ b/be/src/olap/cumulative_compaction_policy.cpp @@ -28,6 +28,7 @@ #include "olap/olap_common.h" #include "olap/tablet.h" #include "olap/tablet_meta.h" +#include "util/debug_points.h" namespace doris { @@ -246,6 +247,21 @@ int SizeBasedCumulativeCompactionPolicy::pick_input_rowsets( const int64_t max_compaction_score, const int64_t min_compaction_score, std::vector* input_rowsets, Version* last_delete_version, size_t* compaction_score, bool allow_delete) { + DBUG_EXECUTE_IF("SizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", { + auto target_tablet_id = dp->param("tablet_id", -1); + if (target_tablet_id == tablet->tablet_id()) { + auto start_version = dp->param("start_version", -1); + auto end_version = dp->param("end_version", -1); + for (auto& rowset : candidate_rowsets) { + if (rowset->start_version() >= start_version && + rowset->end_version() <= end_version) { + input_rowsets->push_back(rowset); + } + } + } + return input_rowsets->size(); + }) + size_t promotion_size = tablet->cumulative_promotion_size(); auto max_version = tablet->max_version().first; int transient_size = 0; diff --git a/be/src/olap/cumulative_compaction_time_series_policy.cpp b/be/src/olap/cumulative_compaction_time_series_policy.cpp index 6fa4b8d014313f..64e51c77641311 100644 --- a/be/src/olap/cumulative_compaction_time_series_policy.cpp +++ b/be/src/olap/cumulative_compaction_time_series_policy.cpp @@ -27,11 +27,14 @@ namespace doris { uint32_t 
TimeSeriesCumulativeCompactionPolicy::calc_cumulative_compaction_score(Tablet* tablet) { uint32_t score = 0; + uint32_t level0_score = 0; bool base_rowset_exist = false; const int64_t point = tablet->cumulative_layer_point(); + int64_t level0_total_size = 0; RowsetMetaSharedPtr first_meta; int64_t first_version = INT64_MAX; + std::list checked_rs_metas; // NOTE: tablet._meta_lock is hold auto& rs_metas = tablet->tablet_meta()->all_rs_metas(); // check the base rowset and collect the rowsets of cumulative part @@ -50,6 +53,12 @@ uint32_t TimeSeriesCumulativeCompactionPolicy::calc_cumulative_compaction_score( } else { // collect the rowsets of cumulative part score += rs_meta->get_compaction_score(); + if (rs_meta->compaction_level() == 0) { + level0_total_size += rs_meta->total_disk_size(); + level0_score += rs_meta->get_compaction_score(); + } else { + checked_rs_metas.push_back(rs_meta); + } } } @@ -64,7 +73,64 @@ uint32_t TimeSeriesCumulativeCompactionPolicy::calc_cumulative_compaction_score( return 0; } - return score; + // Condition 1: the size of input files for compaction meets the requirement of parameter compaction_goal_size + int64_t compaction_goal_size_mbytes = + tablet->tablet_meta()->time_series_compaction_goal_size_mbytes(); + if (level0_total_size >= compaction_goal_size_mbytes * 1024 * 1024) { + return score; + } + + // Condition 2: the number of input files reaches the threshold specified by parameter compaction_file_count_threshold + if (level0_score >= tablet->tablet_meta()->time_series_compaction_file_count_threshold()) { + return score; + } + + // Condition 3: level1 achieve compaction_goal_size + if (tablet->tablet_meta()->time_series_compaction_level_threshold() >= 2) { + checked_rs_metas.sort([](const RowsetMetaSharedPtr& a, const RowsetMetaSharedPtr& b) { + return a->version().first < b->version().first; + }); + int32_t rs_meta_count = 0; + int64_t continuous_size = 0; + for (const auto& rs_meta : checked_rs_metas) { + rs_meta_count++; + 
continuous_size += rs_meta->total_disk_size(); + if (rs_meta_count >= 2) { + if (continuous_size >= compaction_goal_size_mbytes * 1024 * 1024) { + return score; + } + } + } + } + + int64_t now = UnixMillis(); + int64_t last_cumu = tablet->last_cumu_compaction_success_time(); + if (last_cumu != 0) { + int64_t cumu_interval = now - last_cumu; + + // Condition 4: the time interval between compactions exceeds the value specified by parameter _compaction_time_threshold_second + if (cumu_interval > + (tablet->tablet_meta()->time_series_compaction_time_threshold_seconds() * 1000)) { + return score; + } + } else if (score > 0) { + // If the compaction process has not been successfully executed, + // the condition for triggering compaction based on the last successful compaction time (condition 3) will never be met + tablet->set_last_cumu_compaction_success_time(now); + } + + // Condition 5: If there is a continuous set of empty rowsets, prioritize merging. + std::vector input_rowsets; + std::vector candidate_rowsets = + tablet->pick_candidate_rowsets_to_cumulative_compaction(); + tablet->calc_consecutive_empty_rowsets( + &input_rowsets, candidate_rowsets, + tablet->tablet_meta()->time_series_compaction_empty_rowsets_threshold()); + if (!input_rowsets.empty()) { + return score; + } + + return 0; } void TimeSeriesCumulativeCompactionPolicy::calculate_cumulative_point( diff --git a/be/src/olap/delete_bitmap_calculator.cpp b/be/src/olap/delete_bitmap_calculator.cpp index 6f6e0ec8889954..017e3cff3d0489 100644 --- a/be/src/olap/delete_bitmap_calculator.cpp +++ b/be/src/olap/delete_bitmap_calculator.cpp @@ -90,8 +90,10 @@ bool MergeIndexDeleteBitmapCalculatorContext::Comparator::operator()( // std::proiroty_queue is a max heap, and function should return the result of `lhs < rhs` // so if the result of the function is true, rhs will be popped before lhs Slice key1, key2; - RETURN_IF_ERROR(lhs->get_current_key(key1)); - RETURN_IF_ERROR(rhs->get_current_key(key2)); + // 
MergeIndexDeleteBitmapCalculatorContext::get_current_key may return non-OK status if encounter + // memory allocation failure, we can only throw exception here to propagate error in this situation + THROW_IF_ERROR(lhs->get_current_key(key1)); + THROW_IF_ERROR(rhs->get_current_key(key2)); if (_sequence_length == 0 && _rowid_length == 0) { auto cmp_result = key1.compare(key2); // when key1 is the same as key2, @@ -135,28 +137,30 @@ Status MergeIndexDeleteBitmapCalculator::init(RowsetId rowset_id, std::vector const& segments, size_t seq_col_length, size_t rowdid_length, size_t max_batch_size) { - _rowset_id = rowset_id; - _seq_col_length = seq_col_length; - _rowid_length = rowdid_length; - _comparator = - MergeIndexDeleteBitmapCalculatorContext::Comparator(seq_col_length, _rowid_length); - _contexts.reserve(segments.size()); - _heap = std::make_unique(_comparator); + RETURN_IF_CATCH_EXCEPTION({ + _rowset_id = rowset_id; + _seq_col_length = seq_col_length; + _rowid_length = rowdid_length; + _comparator = + MergeIndexDeleteBitmapCalculatorContext::Comparator(seq_col_length, _rowid_length); + _contexts.reserve(segments.size()); + _heap = std::make_unique(_comparator); - for (auto& segment : segments) { - RETURN_IF_ERROR(segment->load_index()); - auto pk_idx = segment->get_primary_key_index(); - std::unique_ptr index; - RETURN_IF_ERROR(pk_idx->new_iterator(&index)); - auto index_type = vectorized::DataTypeFactory::instance().create_data_type( - pk_idx->type_info()->type(), 1, 0); - _contexts.emplace_back(std::move(index), index_type, segment->id(), pk_idx->num_rows()); - _heap->push(&_contexts.back()); - } - if (_rowid_length > 0) { - _rowid_coder = get_key_coder( - get_scalar_type_info()->type()); - } + for (auto& segment : segments) { + RETURN_IF_ERROR(segment->load_index()); + auto pk_idx = segment->get_primary_key_index(); + std::unique_ptr index; + RETURN_IF_ERROR(pk_idx->new_iterator(&index)); + auto index_type = 
vectorized::DataTypeFactory::instance().create_data_type( + pk_idx->type_info()->type(), 1, 0); + _contexts.emplace_back(std::move(index), index_type, segment->id(), pk_idx->num_rows()); + _heap->push(&_contexts.back()); + } + if (_rowid_length > 0) { + _rowid_coder = get_key_coder( + get_scalar_type_info()->type()); + } + }); return Status::OK(); } @@ -209,16 +213,18 @@ Status MergeIndexDeleteBitmapCalculator::calculate_one(RowLocation& loc) { } Status MergeIndexDeleteBitmapCalculator::calculate_all(DeleteBitmapPtr delete_bitmap) { - RowLocation loc; - while (true) { - auto st = calculate_one(loc); - if (st.is()) { - break; + RETURN_IF_CATCH_EXCEPTION({ + RowLocation loc; + while (true) { + auto st = calculate_one(loc); + if (st.is()) { + break; + } + RETURN_IF_ERROR(st); + delete_bitmap->add({_rowset_id, loc.segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, + loc.row_id); } - RETURN_IF_ERROR(st); - delete_bitmap->add({_rowset_id, loc.segment_id, DeleteBitmap::TEMP_VERSION_COMMON}, - loc.row_id); - } + }); return Status::OK(); } diff --git a/be/src/olap/delta_writer_v2.h b/be/src/olap/delta_writer_v2.h index beeb3d3ecd3ec5..f9c2800a68f499 100644 --- a/be/src/olap/delta_writer_v2.h +++ b/be/src/olap/delta_writer_v2.h @@ -46,7 +46,6 @@ namespace doris { class FlushToken; class MemTable; -class MemTracker; class Schema; class StorageEngine; class TupleDescriptor; diff --git a/be/src/olap/full_compaction.cpp b/be/src/olap/full_compaction.cpp index 9d675f731924c1..529efa2e069faa 100644 --- a/be/src/olap/full_compaction.cpp +++ b/be/src/olap/full_compaction.cpp @@ -59,6 +59,9 @@ Status FullCompaction::prepare_compact() { std::unique_lock cumu_lock(tablet()->get_cumulative_compaction_lock()); tablet()->set_is_full_compaction_running(true); + DBUG_EXECUTE_IF("FullCompaction.prepare_compact.set_cumu_point", + { tablet()->set_cumulative_layer_point(tablet()->max_version_unlocked() + 1); }) + // 1. 
pick rowsets to compact RETURN_IF_ERROR(pick_rowsets_to_compact()); diff --git a/be/src/olap/hll.cpp b/be/src/olap/hll.cpp index 80be40f5643977..2b5d213c952589 100644 --- a/be/src/olap/hll.cpp +++ b/be/src/olap/hll.cpp @@ -367,43 +367,4 @@ int64_t HyperLogLog::estimate_cardinality() const { return (int64_t)(estimate + 0.5); } -void HllSetResolver::parse() { - // skip LengthValueType - char* pdata = _buf_ref; - _set_type = (HllDataType)pdata[0]; - char* sparse_data = nullptr; - switch (_set_type) { - case HLL_DATA_EXPLICIT: - // first byte : type - // second๏ฝžfive byte : hash values's number - // five byte later : hash value - _explicit_num = (ExplicitLengthValueType)(pdata[sizeof(SetTypeValueType)]); - _explicit_value = - (uint64_t*)(pdata + sizeof(SetTypeValueType) + sizeof(ExplicitLengthValueType)); - break; - case HLL_DATA_SPARSE: - // first byte : type - // second ๏ฝž๏ผˆ2^HLL_COLUMN_PRECISION)/8 byte : bitmap mark which is not zero - // 2^HLL_COLUMN_PRECISION)/8 ๏ผ‹ 1ไปฅๅŽvalue - _sparse_count = (SparseLengthValueType*)(pdata + sizeof(SetTypeValueType)); - sparse_data = pdata + sizeof(SetTypeValueType) + sizeof(SparseLengthValueType); - for (int i = 0; i < *_sparse_count; i++) { - auto* index = (SparseIndexType*)sparse_data; - sparse_data += sizeof(SparseIndexType); - auto* value = (SparseValueType*)sparse_data; - _sparse_map[*index] = *value; - sparse_data += sizeof(SetTypeValueType); - } - break; - case HLL_DATA_FULL: - // first byte : type - // second byte later : hll register value - _full_value_position = pdata + sizeof(SetTypeValueType); - break; - default: - // HLL_DATA_EMPTY - break; - } -} - } // namespace doris diff --git a/be/src/olap/hll.h b/be/src/olap/hll.h index 56cbd97110dd97..1d01223c2573ad 100644 --- a/be/src/olap/hll.h +++ b/be/src/olap/hll.h @@ -303,57 +303,4 @@ class HyperLogLog { uint8_t* _registers = nullptr; }; -// todo(kks): remove this when dpp_sink class was removed -class HllSetResolver { -public: - HllSetResolver() = default; - - 
~HllSetResolver() = default; - - using SetTypeValueType = uint8_t; - using ExplicitLengthValueType = uint8_t; - using SparseLengthValueType = int32_t; - using SparseIndexType = uint16_t; - using SparseValueType = uint8_t; - - // only save pointer - void init(char* buf, int len) { - this->_buf_ref = buf; - this->_buf_len = len; - } - - // hll set type - HllDataType get_hll_data_type() { return _set_type; } - - // explicit value num - int get_explicit_count() const { return (int)_explicit_num; } - - // get explicit index value 64bit - uint64_t get_explicit_value(int index) { - if (index >= _explicit_num) { - return -1; - } - return _explicit_value[index]; - } - - // get full register value - char* get_full_value() { return _full_value_position; } - - // get (index, value) map - std::map& get_sparse_map() { return _sparse_map; } - - // parse set , call after copy() or init() - void parse(); - -private: - char* _buf_ref = nullptr; // set - int _buf_len {}; // set len - HllDataType _set_type {}; //set type - char* _full_value_position = nullptr; - uint64_t* _explicit_value = nullptr; - ExplicitLengthValueType _explicit_num {}; - std::map _sparse_map; - SparseLengthValueType* _sparse_count; -}; - } // namespace doris diff --git a/be/src/olap/inverted_index_parser.cpp b/be/src/olap/inverted_index_parser.cpp index f7e511970d91f2..f1de5a5e0c10fc 100644 --- a/be/src/olap/inverted_index_parser.cpp +++ b/be/src/olap/inverted_index_parser.cpp @@ -136,4 +136,13 @@ std::string get_parser_stopwords_from_properties( } } +std::string get_parser_dict_compression_from_properties( + const std::map& properties) { + if (properties.find(INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY) != properties.end()) { + return properties.at(INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY); + } else { + return ""; + } +} + } // namespace doris diff --git a/be/src/olap/inverted_index_parser.h b/be/src/olap/inverted_index_parser.h index 0b8426d74c7ab3..f1f85995a206a8 100644 --- 
a/be/src/olap/inverted_index_parser.h +++ b/be/src/olap/inverted_index_parser.h @@ -83,6 +83,8 @@ const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case"; const std::string INVERTED_INDEX_PARSER_STOPWORDS_KEY = "stopwords"; +const std::string INVERTED_INDEX_PARSER_DICT_COMPRESSION_KEY = "dict_compression"; + std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type); InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::string& parser_str); @@ -119,4 +121,7 @@ std::string get_parser_lowercase_from_properties( std::string get_parser_stopwords_from_properties( const std::map& properties); +std::string get_parser_dict_compression_from_properties( + const std::map& properties); + } // namespace doris diff --git a/be/src/olap/memtable.cpp b/be/src/olap/memtable.cpp index e0f19b1624df5b..f8cc79b205535f 100644 --- a/be/src/olap/memtable.cpp +++ b/be/src/olap/memtable.cpp @@ -34,6 +34,7 @@ #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/thread_context.h" +#include "util/debug_points.h" #include "util/runtime_profile.h" #include "util/stopwatch.hpp" #include "vec/aggregate_functions/aggregate_function_reader.h" @@ -43,7 +44,6 @@ namespace doris { bvar::Adder g_memtable_cnt("memtable_cnt"); -bvar::Adder g_memtable_input_block_allocated_size("memtable_input_block_allocated_size"); using namespace ErrorCode; @@ -151,7 +151,6 @@ MemTable::~MemTable() { << _mem_tracker->consumption(); } } - g_memtable_input_block_allocated_size << -_input_mutable_block.allocated_bytes(); g_memtable_cnt << -1; if (_keys_type != KeysType::DUP_KEYS) { for (auto it = _row_in_blocks.begin(); it != _row_in_blocks.end(); it++) { @@ -222,11 +221,8 @@ Status MemTable::insert(const vectorized::Block* input_block, auto num_rows = row_idxs.size(); size_t cursor_in_mutableblock = _input_mutable_block.rows(); - auto block_size0 = _input_mutable_block.allocated_bytes(); 
RETURN_IF_ERROR(_input_mutable_block.add_rows(input_block, row_idxs.data(), row_idxs.data() + num_rows, &_column_offset)); - auto block_size1 = _input_mutable_block.allocated_bytes(); - g_memtable_input_block_allocated_size << block_size1 - block_size0; for (int i = 0; i < num_rows; i++) { _row_in_blocks.emplace_back(new RowInBlock {cursor_in_mutableblock + i}); } @@ -355,7 +351,7 @@ Status MemTable::_sort_by_cluster_keys() { } Tie tie = Tie(0, mutable_block.rows()); - for (auto cid : _tablet_schema->cluster_key_idxes()) { + for (auto cid : _tablet_schema->cluster_key_uids()) { auto index = _tablet_schema->field_index(cid); if (index == -1) { return Status::InternalError("could not find cluster key column with unique_id=" + @@ -385,8 +381,12 @@ Status MemTable::_sort_by_cluster_keys() { for (int i = 0; i < row_in_blocks.size(); i++) { row_pos_vec.emplace_back(row_in_blocks[i]->_row_pos); } + std::vector column_offset; + for (int i = 0; i < _column_offset.size(); ++i) { + column_offset.emplace_back(i); + } return _output_mutable_block.add_rows(&in_block, row_pos_vec.data(), - row_pos_vec.data() + in_block.rows(), &_column_offset); + row_pos_vec.data() + in_block.rows(), &column_offset); } void MemTable::_sort_one_column(std::vector& row_in_blocks, Tie& tie, @@ -590,6 +590,7 @@ void MemTable::shrink_memtable_by_agg() { } bool MemTable::need_flush() const { + DBUG_EXECUTE_IF("MemTable.need_flush", { return true; }); auto max_size = config::write_buffer_size; if (_partial_update_mode == UniqueKeyUpdateModePB::UPDATE_FIXED_COLUMNS) { auto update_columns_size = _num_columns; @@ -620,14 +621,13 @@ Status MemTable::_to_block(std::unique_ptr* res) { (_skip_bitmap_col_idx == -1) ? 
_aggregate() : _aggregate(); } if (_keys_type == KeysType::UNIQUE_KEYS && _enable_unique_key_mow && - !_tablet_schema->cluster_key_idxes().empty()) { + !_tablet_schema->cluster_key_uids().empty()) { if (_partial_update_mode != UniqueKeyUpdateModePB::UPSERT) { return Status::InternalError( "Partial update for mow with cluster keys is not supported"); } RETURN_IF_ERROR(_sort_by_cluster_keys()); } - g_memtable_input_block_allocated_size << -_input_mutable_block.allocated_bytes(); _input_mutable_block.clear(); // After to block, all data in arena is saved in the block _arena.reset(); diff --git a/be/src/olap/memtable_memory_limiter.cpp b/be/src/olap/memtable_memory_limiter.cpp index 1cb6c0c8e2de04..043ce9967fbe5a 100644 --- a/be/src/olap/memtable_memory_limiter.cpp +++ b/be/src/olap/memtable_memory_limiter.cpp @@ -141,7 +141,7 @@ void MemTableMemoryLimiter::handle_memtable_flush() { << ", flush: " << PrettyPrinter::print_bytes(_flush_mem_usage); _flush_active_memtables(need_flush); } - } while (_hard_limit_reached()); + } while (_hard_limit_reached() && !_load_usage_low()); g_memtable_memory_limit_waiting_threads << -1; timer.stop(); int64_t time_ms = timer.elapsed_time() / 1000 / 1000; diff --git a/be/src/olap/memtable_writer.h b/be/src/olap/memtable_writer.h index fb07e740fa3cf6..713400793a1754 100644 --- a/be/src/olap/memtable_writer.h +++ b/be/src/olap/memtable_writer.h @@ -45,7 +45,6 @@ namespace doris { class FlushToken; class MemTable; -class MemTracker; class StorageEngine; class TupleDescriptor; class SlotDescriptor; diff --git a/be/src/olap/merger.cpp b/be/src/olap/merger.cpp index a79434551b5cc1..975aaa0bca3de5 100644 --- a/be/src/olap/merger.cpp +++ b/be/src/olap/merger.cpp @@ -86,7 +86,7 @@ Status Merger::vmerge_rowsets(BaseTabletSPtr tablet, ReaderType reader_type, merge_tablet_schema->merge_dropped_columns(*del_pred_rs->tablet_schema()); } reader_params.tablet_schema = merge_tablet_schema; - if (!tablet->tablet_schema()->cluster_key_idxes().empty()) { + 
if (!tablet->tablet_schema()->cluster_key_uids().empty()) { reader_params.delete_bitmap = &tablet->tablet_meta()->delete_bitmap(); } @@ -173,8 +173,8 @@ void Merger::vertical_split_columns(const TabletSchema& tablet_schema, if (delete_sign_idx != -1) { key_columns.emplace_back(delete_sign_idx); } - if (!tablet_schema.cluster_key_idxes().empty()) { - for (const auto& cid : tablet_schema.cluster_key_idxes()) { + if (!tablet_schema.cluster_key_uids().empty()) { + for (const auto& cid : tablet_schema.cluster_key_uids()) { auto idx = tablet_schema.field_index(cid); DCHECK(idx >= 0) << "could not find cluster key column with unique_id=" << cid << " in tablet schema, table_id=" << tablet_schema.table_id(); @@ -186,7 +186,7 @@ void Merger::vertical_split_columns(const TabletSchema& tablet_schema, // cluster key unique ids: [3, 1, 4] // the key_columns should be [0, 1, 3, 5] // the key_group_cluster_key_idxes should be [2, 1, 3] - for (const auto& cid : tablet_schema.cluster_key_idxes()) { + for (const auto& cid : tablet_schema.cluster_key_uids()) { auto idx = tablet_schema.field_index(cid); for (auto i = 0; i < key_columns.size(); ++i) { if (idx == key_columns[i]) { @@ -201,7 +201,7 @@ void Merger::vertical_split_columns(const TabletSchema& tablet_schema, << ", delete_sign_idx=" << delete_sign_idx; // for duplicate no keys if (!key_columns.empty()) { - column_groups->emplace_back(std::move(key_columns)); + column_groups->emplace_back(key_columns); } std::vector value_columns; @@ -260,8 +260,10 @@ Status Merger::vertical_compact_one_group( } reader_params.tablet_schema = merge_tablet_schema; - if (!tablet->tablet_schema()->cluster_key_idxes().empty()) { + bool has_cluster_key = false; + if (!tablet->tablet_schema()->cluster_key_uids().empty()) { reader_params.delete_bitmap = &tablet->tablet_meta()->delete_bitmap(); + has_cluster_key = true; } if (is_key && stats_output && stats_output->rowid_conversion) { @@ -290,7 +292,8 @@ Status Merger::vertical_compact_one_group( 
"failed to read next block when merging rowsets of tablet " + std::to_string(tablet->tablet_id())); RETURN_NOT_OK_STATUS_WITH_WARN( - dst_rowset_writer->add_columns(&block, column_group, is_key, max_rows_per_segment), + dst_rowset_writer->add_columns(&block, column_group, is_key, max_rows_per_segment, + has_cluster_key), "failed to write block when merging rowsets of tablet " + std::to_string(tablet->tablet_id())); diff --git a/be/src/olap/metadata_adder.h b/be/src/olap/metadata_adder.h index bdc9e7a398d8a3..5b5ba16322490e 100644 --- a/be/src/olap/metadata_adder.h +++ b/be/src/olap/metadata_adder.h @@ -20,11 +20,18 @@ #include #include +#include "runtime/exec_env.h" +#include "runtime/memory/mem_tracker_limiter.h" +#include "util/runtime_profile.h" + namespace doris { inline bvar::Adder g_rowset_meta_mem_bytes("doris_rowset_meta_mem_bytes"); inline bvar::Adder g_rowset_meta_num("doris_rowset_meta_num"); +inline bvar::Adder g_rowset_mem_bytes("doris_rowset_mem_bytes"); +inline bvar::Adder g_rowset_num("doris_rowset_num"); + inline bvar::Adder g_tablet_meta_mem_bytes("doris_tablet_meta_mem_bytes"); inline bvar::Adder g_tablet_meta_num("doris_tablet_meta_num"); @@ -39,6 +46,7 @@ inline bvar::Adder g_tablet_schema_num("doris_tablet_schema_num"); inline bvar::Adder g_segment_mem_bytes("doris_segment_mem_bytes"); inline bvar::Adder g_segment_num("doris_segment_num"); +inline bvar::Adder g_segment_estimate_mem_bytes("doris_segment_estimate_mem_bytes"); inline bvar::Adder g_column_reader_mem_bytes("doris_column_reader_mem_bytes"); inline bvar::Adder g_column_reader_num("doris_column_reader_num"); @@ -91,6 +99,10 @@ class ZoneMapIndexReader; When a derived Class extends MetadataAdder, then the Class's number and fixed length field's memory can be counted automatically. But if the Class has variable length field, then you should overwrite get_metadata_size and call update_metadata_size when the Class's memory changes. 
+ get_metadata_size is only the memory of the metadata object itself, not include child objects, + for example, TabletMeta::get_metadata_size does not include the memory of TabletSchema. + Note, the memory allocated by Doris Allocator is not included. + There are some special situations that need to be noted: 1. when the derived Class override copy constructor, you'd better update memory size(call update_metadata_size) if derived class's memory changed in its copy constructor or you not call MetadataAdder's copy constructor. @@ -104,6 +116,33 @@ class MetadataAdder { public: MetadataAdder(); + static void dump_metadata_object(RuntimeProfile* object_heap_dump_snapshot); + + static int64_t get_all_tablets_size() { + return g_tablet_meta_mem_bytes.get_value() + g_tablet_column_mem_bytes.get_value() + + g_tablet_index_mem_bytes.get_value() + g_tablet_schema_mem_bytes.get_value(); + } + + static int64_t get_all_rowsets_size() { + return g_rowset_meta_mem_bytes.get_value() + g_rowset_mem_bytes.get_value(); + } + + static int64_t get_all_segments_size() { + return g_segment_mem_bytes.get_value() + g_column_reader_mem_bytes.get_value() + + g_bitmap_index_reader_mem_bytes.get_value() + + g_bloom_filter_index_reader_mem_bytes.get_value() + + g_index_page_reader_mem_bytes.get_value() + + g_indexed_column_reader_mem_bytes.get_value() + + g_inverted_index_reader_mem_bytes.get_value() + + g_ordinal_index_reader_mem_bytes.get_value() + + g_zone_map_index_reader_mem_bytes.get_value(); + } + + // Doris currently uses the estimated segments memory as the basis, maybe it is more realistic. 
+ static int64_t get_all_segments_estimate_size() { + return g_segment_estimate_mem_bytes.get_value(); + } + protected: MetadataAdder(const MetadataAdder& other); @@ -115,7 +154,6 @@ class MetadataAdder { MetadataAdder& operator=(const MetadataAdder& other) = default; -private: int64_t _current_meta_size {0}; void add_mem_size(int64_t val); @@ -159,6 +197,8 @@ void MetadataAdder::add_mem_size(int64_t val) { } if constexpr (std::is_same_v) { g_rowset_meta_mem_bytes << val; + } else if constexpr (std::is_same_v) { + g_rowset_mem_bytes << val; } else if constexpr (std::is_same_v) { g_tablet_meta_mem_bytes << val; } else if constexpr (std::is_same_v) { @@ -185,6 +225,9 @@ void MetadataAdder::add_mem_size(int64_t val) { g_ordinal_index_reader_mem_bytes << val; } else if constexpr (std::is_same_v) { g_zone_map_index_reader_mem_bytes << val; + } else { + LOG(FATAL) << "add_mem_size not match class type: " << typeid(T).name() << ", " << val; + __builtin_unreachable(); } } @@ -195,6 +238,8 @@ void MetadataAdder::add_num(int64_t val) { } if constexpr (std::is_same_v) { g_rowset_meta_num << val; + } else if constexpr (std::is_same_v) { + g_rowset_num << val; } else if constexpr (std::is_same_v) { g_tablet_meta_num << val; } else if constexpr (std::is_same_v) { @@ -221,7 +266,123 @@ void MetadataAdder::add_num(int64_t val) { g_ordinal_index_reader_num << val; } else if constexpr (std::is_same_v) { g_zone_map_index_reader_num << val; + } else { + LOG(FATAL) << "add_num not match class type: " << typeid(T).name() << ", " << val; + __builtin_unreachable(); } } -}; // namespace doris \ No newline at end of file +template +void MetadataAdder::dump_metadata_object(RuntimeProfile* object_heap_dump_snapshot) { + RuntimeProfile::Counter* rowset_meta_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "RowsetMetaMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* rowset_meta_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "RowsetMetaNum", TUnit::UNIT); + 
COUNTER_SET(rowset_meta_mem_bytes_counter, g_rowset_meta_mem_bytes.get_value()); + COUNTER_SET(rowset_meta_num_counter, g_rowset_meta_num.get_value()); + + RuntimeProfile::Counter* rowset_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "RowsetMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* rowset_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "RowsetNum", TUnit::UNIT); + COUNTER_SET(rowset_mem_bytes_counter, g_rowset_mem_bytes.get_value()); + COUNTER_SET(rowset_num_counter, g_rowset_num.get_value()); + + RuntimeProfile::Counter* tablet_meta_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "TabletMetaMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* tablet_meta_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "TabletMetaNum", TUnit::UNIT); + COUNTER_SET(tablet_meta_mem_bytes_counter, g_tablet_meta_mem_bytes.get_value()); + COUNTER_SET(tablet_meta_num_counter, g_tablet_meta_num.get_value()); + + RuntimeProfile::Counter* tablet_column_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "TabletColumnMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* tablet_column_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "TabletColumnNum", TUnit::UNIT); + COUNTER_SET(tablet_column_mem_bytes_counter, g_tablet_column_mem_bytes.get_value()); + COUNTER_SET(tablet_column_num_counter, g_tablet_column_num.get_value()); + + RuntimeProfile::Counter* tablet_index_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "TabletIndexMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* tablet_index_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "TabletIndexNum", TUnit::UNIT); + COUNTER_SET(tablet_index_mem_bytes_counter, g_tablet_index_mem_bytes.get_value()); + COUNTER_SET(tablet_index_num_counter, g_tablet_index_num.get_value()); + + RuntimeProfile::Counter* tablet_schema_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "TabletSchemaMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* 
tablet_schema_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "TabletSchemaNum", TUnit::UNIT); + COUNTER_SET(tablet_schema_mem_bytes_counter, g_tablet_schema_mem_bytes.get_value()); + COUNTER_SET(tablet_schema_num_counter, g_tablet_schema_num.get_value()); + + RuntimeProfile::Counter* segment_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "SegmentMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* segment_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "SegmentNum", TUnit::UNIT); + COUNTER_SET(segment_mem_bytes_counter, g_segment_mem_bytes.get_value()); + COUNTER_SET(segment_num_counter, g_segment_num.get_value()); + + RuntimeProfile::Counter* column_reader_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "ColumnReaderMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* column_reader_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "ColumnReaderNum", TUnit::UNIT); + COUNTER_SET(column_reader_mem_bytes_counter, g_column_reader_mem_bytes.get_value()); + COUNTER_SET(column_reader_num_counter, g_column_reader_num.get_value()); + + RuntimeProfile::Counter* bitmap_index_reader_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "BitmapIndexReaderMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* bitmap_index_reader_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "BitmapIndexReaderNum", TUnit::UNIT); + COUNTER_SET(bitmap_index_reader_mem_bytes_counter, g_bitmap_index_reader_mem_bytes.get_value()); + COUNTER_SET(bitmap_index_reader_num_counter, g_bitmap_index_reader_num.get_value()); + + RuntimeProfile::Counter* bloom_filter_index_reader_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "BloomFilterIndexReaderMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* filter_index_reader_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "BloomFilterIndexReaderNum", TUnit::UNIT); + COUNTER_SET(bloom_filter_index_reader_mem_bytes_counter, + g_bloom_filter_index_reader_mem_bytes.get_value()); + 
COUNTER_SET(filter_index_reader_num_counter, g_bloom_filter_index_reader_num.get_value()); + + RuntimeProfile::Counter* index_page_reader_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "IndexPageReaderMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* index_page_reader_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "IndexPageReaderNum", TUnit::UNIT); + COUNTER_SET(index_page_reader_mem_bytes_counter, g_index_page_reader_mem_bytes.get_value()); + COUNTER_SET(index_page_reader_num_counter, g_index_page_reader_num.get_value()); + + RuntimeProfile::Counter* indexed_column_reader_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "IndexedColumnReaderMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* indexed_column_reader_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "IndexedColumnReaderNum", TUnit::UNIT); + COUNTER_SET(indexed_column_reader_mem_bytes_counter, + g_indexed_column_reader_mem_bytes.get_value()); + COUNTER_SET(indexed_column_reader_num_counter, g_indexed_column_reader_num.get_value()); + + RuntimeProfile::Counter* inverted_index_reader_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "InvertedIndexReaderMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* inverted_index_reader_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "InvertedIndexReaderNum", TUnit::UNIT); + COUNTER_SET(inverted_index_reader_mem_bytes_counter, + g_inverted_index_reader_mem_bytes.get_value()); + COUNTER_SET(inverted_index_reader_num_counter, g_inverted_index_reader_num.get_value()); + + RuntimeProfile::Counter* ordinal_index_reader_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "OrdinalIndexReaderMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* ordinal_index_reader_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "OrdinalIndexReaderNum", TUnit::UNIT); + COUNTER_SET(ordinal_index_reader_mem_bytes_counter, + g_ordinal_index_reader_mem_bytes.get_value()); + COUNTER_SET(ordinal_index_reader_num_counter, 
g_ordinal_index_reader_num.get_value()); + + RuntimeProfile::Counter* zone_map_index_reader_mem_bytes_counter = + ADD_COUNTER(object_heap_dump_snapshot, "ZoneMapIndexReaderMemBytes", TUnit::BYTES); + RuntimeProfile::Counter* zone_map_index_reader_num_counter = + ADD_COUNTER(object_heap_dump_snapshot, "ZoneMapIndexReaderNum", TUnit::UNIT); + COUNTER_SET(zone_map_index_reader_mem_bytes_counter, + g_zone_map_index_reader_mem_bytes.get_value()); + COUNTER_SET(zone_map_index_reader_num_counter, g_zone_map_index_reader_num.get_value()); +} + +}; // namespace doris diff --git a/be/src/olap/olap_common.h b/be/src/olap/olap_common.h index d3bd0f0a3a2436..11249bafb1e3c0 100644 --- a/be/src/olap/olap_common.h +++ b/be/src/olap/olap_common.h @@ -35,6 +35,7 @@ #include #include +#include "common/config.h" #include "io/io_common.h" #include "olap/olap_define.h" #include "olap/rowset/rowset_fwd.h" @@ -394,6 +395,8 @@ using ColumnId = uint32_t; using UniqueIdSet = std::set; // Column unique Id -> column id map using UniqueIdToColumnIdMap = std::map; +struct RowsetId; +RowsetId next_rowset_id(); // 8 bit rowset id version // 56 bit, inc number from 1 @@ -412,7 +415,12 @@ struct RowsetId { auto [_, ec] = std::from_chars(rowset_id_str.data(), rowset_id_str.data() + rowset_id_str.length(), high); if (ec != std::errc {}) [[unlikely]] { - LOG(FATAL) << "failed to init rowset id: " << rowset_id_str; + if (config::force_regenerate_rowsetid_on_start_error) { + LOG(WARNING) << "failed to init rowset id: " << rowset_id_str; + high = next_rowset_id().hi; + } else { + LOG(FATAL) << "failed to init rowset id: " << rowset_id_str; + } } init(1, high, 0, 0); } else { diff --git a/be/src/olap/olap_server.cpp b/be/src/olap/olap_server.cpp index a0c5a05636bfa2..90d0883984e78b 100644 --- a/be/src/olap/olap_server.cpp +++ b/be/src/olap/olap_server.cpp @@ -210,7 +210,7 @@ static int32_t get_single_replica_compaction_threads_num(size_t data_dirs_num) { return threads_num; } -Status 
StorageEngine::start_bg_threads() { +Status StorageEngine::start_bg_threads(std::shared_ptr wg_sptr) { RETURN_IF_ERROR(Thread::create( "StorageEngine", "unused_rowset_monitor_thread", [this]() { this->_unused_rowset_monitor_thread_callback(); }, @@ -243,29 +243,60 @@ Status StorageEngine::start_bg_threads() { auto single_replica_compaction_threads = get_single_replica_compaction_threads_num(data_dirs.size()); - RETURN_IF_ERROR(ThreadPoolBuilder("BaseCompactionTaskThreadPool") - .set_min_threads(base_compaction_threads) - .set_max_threads(base_compaction_threads) - .build(&_base_compaction_thread_pool)); - RETURN_IF_ERROR(ThreadPoolBuilder("CumuCompactionTaskThreadPool") - .set_min_threads(cumu_compaction_threads) - .set_max_threads(cumu_compaction_threads) - .build(&_cumu_compaction_thread_pool)); - RETURN_IF_ERROR(ThreadPoolBuilder("SingleReplicaCompactionTaskThreadPool") - .set_min_threads(single_replica_compaction_threads) - .set_max_threads(single_replica_compaction_threads) - .build(&_single_replica_compaction_thread_pool)); - - if (config::enable_segcompaction) { - RETURN_IF_ERROR(ThreadPoolBuilder("SegCompactionTaskThreadPool") - .set_min_threads(config::segcompaction_num_threads) - .set_max_threads(config::segcompaction_num_threads) - .build(&_seg_compaction_thread_pool)); + if (wg_sptr->get_cgroup_cpu_ctl_wptr().lock()) { + RETURN_IF_ERROR(ThreadPoolBuilder("gBaseCompactionTaskThreadPool") + .set_min_threads(base_compaction_threads) + .set_max_threads(base_compaction_threads) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_base_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("gCumuCompactionTaskThreadPool") + .set_min_threads(cumu_compaction_threads) + .set_max_threads(cumu_compaction_threads) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_cumu_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("gSingleReplicaCompactionTaskThreadPool") + .set_min_threads(single_replica_compaction_threads) 
+ .set_max_threads(single_replica_compaction_threads) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_single_replica_compaction_thread_pool)); + + if (config::enable_segcompaction) { + RETURN_IF_ERROR(ThreadPoolBuilder("gSegCompactionTaskThreadPool") + .set_min_threads(config::segcompaction_num_threads) + .set_max_threads(config::segcompaction_num_threads) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_seg_compaction_thread_pool)); + } + RETURN_IF_ERROR(ThreadPoolBuilder("gColdDataCompactionTaskThreadPool") + .set_min_threads(config::cold_data_compaction_thread_num) + .set_max_threads(config::cold_data_compaction_thread_num) + .set_cgroup_cpu_ctl(wg_sptr->get_cgroup_cpu_ctl_wptr()) + .build(&_cold_data_compaction_thread_pool)); + } else { + RETURN_IF_ERROR(ThreadPoolBuilder("BaseCompactionTaskThreadPool") + .set_min_threads(base_compaction_threads) + .set_max_threads(base_compaction_threads) + .build(&_base_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("CumuCompactionTaskThreadPool") + .set_min_threads(cumu_compaction_threads) + .set_max_threads(cumu_compaction_threads) + .build(&_cumu_compaction_thread_pool)); + RETURN_IF_ERROR(ThreadPoolBuilder("SingleReplicaCompactionTaskThreadPool") + .set_min_threads(single_replica_compaction_threads) + .set_max_threads(single_replica_compaction_threads) + .build(&_single_replica_compaction_thread_pool)); + + if (config::enable_segcompaction) { + RETURN_IF_ERROR(ThreadPoolBuilder("SegCompactionTaskThreadPool") + .set_min_threads(config::segcompaction_num_threads) + .set_max_threads(config::segcompaction_num_threads) + .build(&_seg_compaction_thread_pool)); + } + RETURN_IF_ERROR(ThreadPoolBuilder("ColdDataCompactionTaskThreadPool") + .set_min_threads(config::cold_data_compaction_thread_num) + .set_max_threads(config::cold_data_compaction_thread_num) + .build(&_cold_data_compaction_thread_pool)); } - RETURN_IF_ERROR(ThreadPoolBuilder("ColdDataCompactionTaskThreadPool") - 
.set_min_threads(config::cold_data_compaction_thread_num) - .set_max_threads(config::cold_data_compaction_thread_num) - .build(&_cold_data_compaction_thread_pool)); // compaction tasks producer thread RETURN_IF_ERROR(Thread::create( @@ -1040,7 +1071,8 @@ Status StorageEngine::_submit_compaction_task(TabletSharedPtr tablet, if (!tablet->can_do_compaction(tablet->data_dir()->path_hash(), compaction_type)) { LOG(INFO) << "Tablet state has been changed, no need to begin this compaction " "task, tablet_id=" - << tablet->tablet_id() << "tablet_state=" << tablet->tablet_state(); + << tablet->tablet_id() << ", tablet_state=" << tablet->tablet_state(); + _pop_tablet_from_submitted_compaction(tablet, compaction_type); return; } tablet->compaction_stage = CompactionStage::EXECUTING; diff --git a/be/src/olap/options.cpp b/be/src/olap/options.cpp index 9c500c10993395..6e4cb61e3d0182 100644 --- a/be/src/olap/options.cpp +++ b/be/src/olap/options.cpp @@ -32,6 +32,7 @@ #include "common/status.h" #include "gutil/strings/split.h" #include "gutil/strings/strip.h" +#include "io/cache/file_cache_common.h" #include "io/fs/local_file_system.h" #include "olap/olap_define.h" #include "olap/utils.h" @@ -56,6 +57,7 @@ static std::string CACHE_QUERY_LIMIT_SIZE = "query_limit"; static std::string CACHE_NORMAL_PERCENT = "normal_percent"; static std::string CACHE_DISPOSABLE_PERCENT = "disposable_percent"; static std::string CACHE_INDEX_PERCENT = "index_percent"; +static std::string CACHE_TTL_PERCENT = "ttl_percent"; static std::string CACHE_STORAGE = "storage"; static std::string CACHE_STORAGE_DISK = "disk"; static std::string CACHE_STORAGE_MEMORY = "memory"; @@ -206,7 +208,7 @@ void parse_conf_broken_store_paths(const string& config_path, std::set& auto rowset = reader->rowset(); const auto rowset_id = rowset->rowset_id(); - DCHECK(_segment_cache_handles.contains(rowset_id)); - auto& segment_cache_handle = _segment_cache_handles[rowset_id]; + const auto& segments_rows = 
_all_segments_rows[rowset_id]; if (rowset->num_rows() == 0) { continue; } - const auto& segments = segment_cache_handle.get_segments(); int segment_start = 0; auto split = RowSetSplits(reader->clone()); - for (size_t i = 0; i != segments.size(); ++i) { - const auto& segment = segments[i]; + for (size_t i = 0; i != segments_rows.size(); ++i) { + const size_t rows_of_segment = segments_rows[i]; RowRanges row_ranges; - const size_t rows_of_segment = segment->num_rows(); int64_t offset_in_segment = 0; // try to split large segments into RowRanges @@ -125,7 +123,7 @@ Status ParallelScannerBuilder::_build_scanners_by_rowid(std::list& // The non-empty `row_ranges` means there are some rows left in this segment not added into `split`. if (!row_ranges.is_empty()) { DCHECK_GT(rows_collected, 0); - DCHECK_EQ(row_ranges.to(), segment->num_rows()); + DCHECK_EQ(row_ranges.to(), rows_of_segment); split.segment_row_ranges.emplace_back(std::move(row_ranges)); } } @@ -133,7 +131,7 @@ Status ParallelScannerBuilder::_build_scanners_by_rowid(std::list& DCHECK_LE(rows_collected, _rows_per_scanner); if (rows_collected > 0) { split.segment_offsets.first = segment_start; - split.segment_offsets.second = segments.size(); + split.segment_offsets.second = segments_rows.size(); DCHECK_GT(split.segment_offsets.second, split.segment_offsets.first); DCHECK_EQ(split.segment_row_ranges.size(), split.segment_offsets.second - split.segment_offsets.first); @@ -181,11 +179,15 @@ Status ParallelScannerBuilder::_load() { auto rowset = rs_split.rs_reader->rowset(); RETURN_IF_ERROR(rowset->load()); const auto rowset_id = rowset->rowset_id(); - auto& segment_cache_handle = _segment_cache_handles[rowset_id]; + SegmentCacheHandle segment_cache_handle; RETURN_IF_ERROR(SegmentLoader::instance()->load_segments( std::dynamic_pointer_cast(rowset), &segment_cache_handle, enable_segment_cache, false)); + + for (const auto& segment : segment_cache_handle.get_segments()) { + 
_all_segments_rows[rowset_id].emplace_back(segment->num_rows()); + } _total_rows += rowset->num_rows(); } } diff --git a/be/src/olap/parallel_scanner_builder.h b/be/src/olap/parallel_scanner_builder.h index 934d769ed59aa0..7c6b5648e89e04 100644 --- a/be/src/olap/parallel_scanner_builder.h +++ b/be/src/olap/parallel_scanner_builder.h @@ -83,7 +83,7 @@ class ParallelScannerBuilder { size_t _rows_per_scanner {_min_rows_per_scanner}; - std::map _segment_cache_handles; + std::map> _all_segments_rows; std::shared_ptr _scanner_profile; RuntimeState* _state; diff --git a/be/src/olap/primary_key_index.cpp b/be/src/olap/primary_key_index.cpp index 9d40ff5a8fad51..5f7bedb01fc8de 100644 --- a/be/src/olap/primary_key_index.cpp +++ b/be/src/olap/primary_key_index.cpp @@ -50,8 +50,8 @@ Status PrimaryKeyIndexBuilder::init() { auto opt = segment_v2::BloomFilterOptions(); opt.fpp = 0.01; - _bloom_filter_index_builder.reset( - new segment_v2::PrimaryKeyBloomFilterIndexWriterImpl(opt, type_info)); + RETURN_IF_ERROR(segment_v2::PrimaryKeyBloomFilterIndexWriterImpl::create( + opt, type_info, &_bloom_filter_index_builder)); return Status::OK(); } @@ -64,6 +64,9 @@ Status PrimaryKeyIndexBuilder::add_item(const Slice& key) { if (UNLIKELY(_num_rows == 0)) { _min_key.append(key.get_data(), key.get_size()); } + DCHECK(key.compare(_max_key) > 0) + << "found duplicate key or key is not sorted! 
current key: " << key + << ", last max key: " << _max_key; _max_key.clear(); _max_key.append(key.get_data(), key.get_size()); _num_rows++; diff --git a/be/src/olap/rowid_conversion.h b/be/src/olap/rowid_conversion.h index 01a2cea0d5e308..8f9d96a136aab4 100644 --- a/be/src/olap/rowid_conversion.h +++ b/be/src/olap/rowid_conversion.h @@ -37,21 +37,33 @@ class RowIdConversion { ~RowIdConversion() { RELEASE_THREAD_MEM_TRACKER(_seg_rowid_map_mem_used); } // resize segment rowid map to its rows num - void init_segment_map(const RowsetId& src_rowset_id, const std::vector& num_rows) { - size_t delta_std_pair_cap = 0; + Status init_segment_map(const RowsetId& src_rowset_id, const std::vector& num_rows) { for (size_t i = 0; i < num_rows.size(); i++) { + constexpr size_t RESERVED_MEMORY = 10 * 1024 * 1024; // 10M + if (doris::GlobalMemoryArbitrator::is_exceed_hard_mem_limit(RESERVED_MEMORY)) { + return Status::MemoryLimitExceeded(fmt::format( + "RowIdConversion init_segment_map failed, memory exceed limit, {}, " + "consuming " + "tracker:<{}>, peak used {}, current used {}.", + doris::GlobalMemoryArbitrator::process_limit_exceeded_errmsg_str(), + doris::thread_context()->thread_mem_tracker()->label(), + doris::thread_context()->thread_mem_tracker()->peak_consumption(), + doris::thread_context()->thread_mem_tracker()->consumption())); + } + uint32_t id = _segments_rowid_map.size(); _segment_to_id_map.emplace(std::pair {src_rowset_id, i}, id); _id_to_segment_map.emplace_back(src_rowset_id, i); std::vector> vec( num_rows[i], std::pair(UINT32_MAX, UINT32_MAX)); - delta_std_pair_cap += vec.capacity(); + + //NOTE: manually count _segments_rowid_map's memory here, because _segments_rowid_map could be used by indexCompaction. + // indexCompaction is a thridparty code, it's too complex to modify it. + // refer compact_column. 
+ track_mem_usage(vec.capacity()); _segments_rowid_map.emplace_back(std::move(vec)); } - //NOTE: manually count _segments_rowid_map's memory here, because _segments_rowid_map could be used by indexCompaction. - // indexCompaction is a thridparty code, it's too complex to modify it. - // refer compact_column. - track_mem_usage(delta_std_pair_cap); + return Status::OK(); } // set dst rowset id @@ -124,9 +136,7 @@ class RowIdConversion { size_t new_size = _std_pair_cap * sizeof(std::pair) + _segments_rowid_map.capacity() * sizeof(std::vector>); - - RELEASE_THREAD_MEM_TRACKER(_seg_rowid_map_mem_used); - CONSUME_THREAD_MEM_TRACKER(new_size); + CONSUME_THREAD_MEM_TRACKER(new_size - _seg_rowid_map_mem_used); _seg_rowid_map_mem_used = new_size; } diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index bbb2ca72b4ae7f..a328b1b9e8b90e 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -18,6 +18,7 @@ #include "olap/rowset/beta_rowset.h" #include +#include #include #include @@ -557,10 +558,6 @@ Status BetaRowset::add_to_binlog() { } const auto& fs = io::global_local_filesystem(); - - // all segments are in the same directory, so cache binlog_dir without multi times check - std::string binlog_dir; - auto segments_num = num_segments(); VLOG_DEBUG << fmt::format("add rowset to binlog. 
rowset_id={}, segments_num={}", rowset_id().to_string(), segments_num); @@ -569,17 +566,25 @@ Status BetaRowset::add_to_binlog() { std::vector linked_success_files; Defer remove_linked_files {[&]() { // clear linked files if errors happen if (!status.ok()) { - LOG(WARNING) << "will delete linked success files due to error " << status; + LOG(WARNING) << "will delete linked success files due to error " + << status.to_string_no_stack(); std::vector paths; for (auto& file : linked_success_files) { paths.emplace_back(file); LOG(WARNING) << "will delete linked success file " << file << " due to error"; } static_cast(fs->batch_delete(paths)); - LOG(WARNING) << "done delete linked success files due to error " << status; + LOG(WARNING) << "done delete linked success files due to error " + << status.to_string_no_stack(); } }}; + // The publish_txn might fail even if the add_to_binlog success, so we need to check + // whether a file already exists before linking. + auto errno_is_file_exists = []() { return Errno::no() == EEXIST; }; + + // all segments are in the same directory, so cache binlog_dir without multi times check + std::string binlog_dir; for (int i = 0; i < segments_num; ++i) { auto seg_file = local_segment_path(_tablet_path, rowset_id().to_string(), i); @@ -597,7 +602,7 @@ Status BetaRowset::add_to_binlog() { (std::filesystem::path(binlog_dir) / std::filesystem::path(seg_file).filename()) .string(); VLOG_DEBUG << "link " << seg_file << " to " << binlog_file; - if (!fs->link_file(seg_file, binlog_file).ok()) { + if (!fs->link_file(seg_file, binlog_file).ok() && !errno_is_file_exists()) { status = Status::Error("fail to create hard link. 
from={}, to={}, errno={}", seg_file, binlog_file, Errno::no()); return status; @@ -614,7 +619,12 @@ Status BetaRowset::add_to_binlog() { std::filesystem::path(index_file).filename()) .string(); VLOG_DEBUG << "link " << index_file << " to " << binlog_index_file; - RETURN_IF_ERROR(fs->link_file(index_file, binlog_index_file)); + if (!fs->link_file(index_file, binlog_index_file).ok() && !errno_is_file_exists()) { + status = Status::Error( + "fail to create hard link. from={}, to={}, errno={}", index_file, + binlog_index_file, Errno::no()); + return status; + } linked_success_files.push_back(binlog_index_file); } } else { @@ -625,7 +635,12 @@ Status BetaRowset::add_to_binlog() { std::filesystem::path(index_file).filename()) .string(); VLOG_DEBUG << "link " << index_file << " to " << binlog_index_file; - RETURN_IF_ERROR(fs->link_file(index_file, binlog_index_file)); + if (!fs->link_file(index_file, binlog_index_file).ok() && !errno_is_file_exists()) { + status = Status::Error( + "fail to create hard link. from={}, to={}, errno={}", index_file, + binlog_index_file, Errno::no()); + return status; + } linked_success_files.push_back(binlog_index_file); } } @@ -703,10 +718,24 @@ Status BetaRowset::show_nested_index_file(rapidjson::Value* rowset_value, rapidjson::Document::AllocatorType& allocator) { const auto& fs = _rowset_meta->fs(); auto storage_format = _schema->get_inverted_index_storage_format(); - auto format_str = storage_format == InvertedIndexStorageFormatPB::V1 ? 
"V1" : "V2"; + std::string format_str; + switch (storage_format) { + case InvertedIndexStorageFormatPB::V1: + format_str = "V1"; + break; + case InvertedIndexStorageFormatPB::V2: + format_str = "V2"; + break; + case InvertedIndexStorageFormatPB::V3: + format_str = "V3"; + break; + default: + return Status::InternalError("inverted index storage format error"); + break; + } auto rs_id = rowset_id().to_string(); rowset_value->AddMember("rowset_id", rapidjson::Value(rs_id.c_str(), allocator), allocator); - rowset_value->AddMember("index_storage_format", rapidjson::Value(format_str, allocator), + rowset_value->AddMember("index_storage_format", rapidjson::Value(format_str.c_str(), allocator), allocator); rapidjson::Value segments(rapidjson::kArrayType); for (int seg_id = 0; seg_id < num_segments(); ++seg_id) { diff --git a/be/src/olap/rowset/beta_rowset_reader.cpp b/be/src/olap/rowset/beta_rowset_reader.cpp index 042893f1374374..47cf9b820e8562 100644 --- a/be/src/olap/rowset/beta_rowset_reader.cpp +++ b/be/src/olap/rowset/beta_rowset_reader.cpp @@ -239,7 +239,8 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context // init segment rowid map for rowid conversion std::vector segment_num_rows; RETURN_IF_ERROR(get_segment_num_rows(&segment_num_rows)); - _read_context->rowid_conversion->init_segment_map(rowset()->rowset_id(), segment_num_rows); + RETURN_IF_ERROR(_read_context->rowid_conversion->init_segment_map(rowset()->rowset_id(), + segment_num_rows)); } auto [seg_start, seg_end] = _segment_offsets; diff --git a/be/src/olap/rowset/beta_rowset_writer.cpp b/be/src/olap/rowset/beta_rowset_writer.cpp index 198b4e8595ed20..dc155efe0165bc 100644 --- a/be/src/olap/rowset/beta_rowset_writer.cpp +++ b/be/src/olap/rowset/beta_rowset_writer.cpp @@ -60,6 +60,7 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" using namespace ErrorCode; namespace { @@ -336,7 +337,8 @@ Status 
BaseBetaRowsetWriter::_generate_delete_bitmap(int32_t segment_id) { LOG(INFO) << "[Memtable Flush] construct delete bitmap tablet: " << _context.tablet->tablet_id() << ", rowset_ids: " << _context.mow_context->rowset_ids.size() << ", cur max_version: " << _context.mow_context->max_version - << ", transaction_id: " << _context.mow_context->txn_id + << ", transaction_id: " << _context.mow_context->txn_id << ", delete_bitmap_count: " + << _context.tablet->tablet_meta()->delete_bitmap().get_delete_bitmap_count() << ", cost: " << watch.get_elapse_time_us() << "(us), total rows: " << total_rows; return Status::OK(); } @@ -475,15 +477,15 @@ Status BetaRowsetWriter::_rename_compacted_segments(int64_t begin, int64_t end) return Status::OK(); } -void BetaRowsetWriter::_clear_statistics_for_deleting_segments_unsafe(uint64_t begin, - uint64_t end) { +void BetaRowsetWriter::_clear_statistics_for_deleting_segments_unsafe(uint32_t begin, + uint32_t end) { VLOG_DEBUG << "_segid_statistics_map clear record segid range from:" << begin << " to:" << end; - for (int i = begin; i <= end; ++i) { + for (uint32_t i = begin; i <= end; ++i) { _segid_statistics_map.erase(i); } } -Status BetaRowsetWriter::_rename_compacted_segment_plain(uint64_t seg_id) { +Status BetaRowsetWriter::_rename_compacted_segment_plain(uint32_t seg_id) { if (seg_id == _num_segcompacted) { ++_num_segcompacted; return Status::OK(); @@ -581,7 +583,7 @@ Status BetaRowsetWriter::_segcompaction_if_necessary() { Status status = Status::OK(); // if not doing segcompaction, just check segment number if (!config::enable_segcompaction || !_context.enable_segcompaction || - !_context.tablet_schema->cluster_key_idxes().empty() || + !_context.tablet_schema->cluster_key_uids().empty() || _context.tablet_schema->num_variant_columns() > 0) { return _check_segment_number_limit(_num_segment); } @@ -653,7 +655,7 @@ Status BaseBetaRowsetWriter::add_rowset(RowsetSharedPtr rowset) { _num_rows_written += rowset->num_rows(); _total_data_size 
+= rowset->rowset_meta()->data_disk_size(); _total_index_size += rowset->rowset_meta()->index_disk_size(); - _num_segment += rowset->num_segments(); + _num_segment += cast_set(rowset->num_segments()); // append key_bounds to current rowset RETURN_IF_ERROR(rowset->get_segments_key_bounds(&_segments_encoded_key_bounds)); @@ -1043,7 +1045,7 @@ Status BaseBetaRowsetWriter::add_segment(uint32_t segment_id, const SegmentStati if (segment_id >= _segment_num_rows.size()) { _segment_num_rows.resize(segment_id + 1); } - _segment_num_rows[segid_offset] = segstat.row_num; + _segment_num_rows[segid_offset] = cast_set(segstat.row_num); } VLOG_DEBUG << "_segid_statistics_map add new record. segment_id:" << segment_id << " row_num:" << segstat.row_num << " data_size:" << segstat.data_size @@ -1111,4 +1113,5 @@ Status BetaRowsetWriter::flush_segment_writer_for_segcompaction( return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/olap/rowset/beta_rowset_writer.h b/be/src/olap/rowset/beta_rowset_writer.h index d96301af22630d..a69d1063a55086 100644 --- a/be/src/olap/rowset/beta_rowset_writer.h +++ b/be/src/olap/rowset/beta_rowset_writer.h @@ -298,9 +298,9 @@ class BetaRowsetWriter : public BaseBetaRowsetWriter { Status _load_noncompacted_segment(segment_v2::SegmentSharedPtr& segment, int32_t segment_id); Status _find_longest_consecutive_small_segment(SegCompactionCandidatesSharedPtr& segments); Status _rename_compacted_segments(int64_t begin, int64_t end); - Status _rename_compacted_segment_plain(uint64_t seg_id); + Status _rename_compacted_segment_plain(uint32_t seg_id); Status _rename_compacted_indices(int64_t begin, int64_t end, uint64_t seg_id); - void _clear_statistics_for_deleting_segments_unsafe(uint64_t begin, uint64_t end); + void _clear_statistics_for_deleting_segments_unsafe(uint32_t begin, uint32_t end); StorageEngine& _engine; diff --git a/be/src/olap/rowset/rowset.cpp b/be/src/olap/rowset/rowset.cpp index 
256f4d35313d13..ac3a2a7a1dc5c2 100644 --- a/be/src/olap/rowset/rowset.cpp +++ b/be/src/olap/rowset/rowset.cpp @@ -27,8 +27,6 @@ namespace doris { -static bvar::Adder g_total_rowset_num("doris_total_rowset_num"); - Rowset::Rowset(const TabletSchemaSPtr& schema, RowsetMetaSharedPtr rowset_meta, std::string tablet_path) : _rowset_meta(std::move(rowset_meta)), @@ -56,11 +54,6 @@ Rowset::Rowset(const TabletSchemaSPtr& schema, RowsetMetaSharedPtr rowset_meta, } // build schema from RowsetMeta.tablet_schema or Tablet.tablet_schema _schema = _rowset_meta->tablet_schema() ? _rowset_meta->tablet_schema() : schema; - g_total_rowset_num << 1; -} - -Rowset::~Rowset() { - g_total_rowset_num << -1; } Status Rowset::load(bool use_cache) { diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index e1a2347f6aeaa8..98d88ba19f2068 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -33,6 +33,7 @@ #include "common/logging.h" #include "common/status.h" +#include "olap/metadata_adder.h" #include "olap/olap_common.h" #include "olap/rowset/rowset_meta.h" #include "olap/tablet_schema.h" @@ -116,10 +117,8 @@ class RowsetStateMachine { RowsetState _rowset_state; }; -class Rowset : public std::enable_shared_from_this { +class Rowset : public std::enable_shared_from_this, public MetadataAdder { public: - virtual ~Rowset(); - // Open all segment files in this rowset and load necessary metadata. 
// - `use_cache` : whether to use fd cache, only applicable to alpha rowset now // diff --git a/be/src/olap/rowset/rowset_writer.h b/be/src/olap/rowset/rowset_writer.h index ad42982488b316..f84ff964ea3051 100644 --- a/be/src/olap/rowset/rowset_writer.h +++ b/be/src/olap/rowset/rowset_writer.h @@ -80,7 +80,7 @@ class RowsetWriter { "RowsetWriter not support add_block"); } virtual Status add_columns(const vectorized::Block* block, const std::vector& col_ids, - bool is_key, uint32_t max_rows_per_segment) { + bool is_key, uint32_t max_rows_per_segment, bool has_cluster_key) { return Status::Error( "RowsetWriter not support add_columns"); } diff --git a/be/src/olap/rowset/segment_v2/binary_plain_page.h b/be/src/olap/rowset/segment_v2/binary_plain_page.h index e043164ef28633..3b3c6ad3feab92 100644 --- a/be/src/olap/rowset/segment_v2/binary_plain_page.h +++ b/be/src/olap/rowset/segment_v2/binary_plain_page.h @@ -266,8 +266,7 @@ class BinaryPlainPageDecoder : public PageDecoder { auto total = *n; size_t read_count = 0; - _len_array.resize(total); - _start_offset_array.resize(total); + _binary_data.resize(total); for (size_t i = 0; i < total; ++i) { ordinal_t ord = rowids[i] - page_first_ordinal; if (UNLIKELY(ord >= _num_elems)) { @@ -275,14 +274,13 @@ class BinaryPlainPageDecoder : public PageDecoder { } const uint32_t start_offset = offset(ord); - _start_offset_array[read_count] = start_offset; - _len_array[read_count] = offset(ord + 1) - start_offset; + _binary_data[read_count].data = _data.mutable_data() + start_offset; + _binary_data[read_count].size = offset(ord + 1) - start_offset; read_count++; } if (LIKELY(read_count > 0)) { - dst->insert_many_binary_data(_data.mutable_data(), _len_array.data(), - _start_offset_array.data(), read_count); + dst->insert_many_strings(_binary_data.data(), read_count); } *n = read_count; @@ -342,13 +340,11 @@ class BinaryPlainPageDecoder : public PageDecoder { if (idx >= _num_elems) { return _offsets_pos; } - const uint8_t* p = - 
reinterpret_cast(&_data[_offsets_pos + idx * SIZE_OF_INT32]); - return decode_fixed32_le(p); + return guarded_offset(idx); } uint32_t guarded_offset(size_t idx) const { - const uint8_t* p = + const auto* p = reinterpret_cast(&_data[_offsets_pos + idx * SIZE_OF_INT32]); return decode_fixed32_le(p); } @@ -361,8 +357,7 @@ class BinaryPlainPageDecoder : public PageDecoder { uint32_t _offsets_pos; std::vector _offsets; - std::vector _len_array; - std::vector _start_offset_array; + std::vector _binary_data; // Index of the currently seeked element in the page. uint32_t _cur_idx; diff --git a/be/src/olap/rowset/segment_v2/block_split_bloom_filter.h b/be/src/olap/rowset/segment_v2/block_split_bloom_filter.h index f68ddd7e74bfc5..8dc470d9da4f88 100644 --- a/be/src/olap/rowset/segment_v2/block_split_bloom_filter.h +++ b/be/src/olap/rowset/segment_v2/block_split_bloom_filter.h @@ -34,7 +34,6 @@ class BlockSplitBloomFilter : public BloomFilter { void add_hash(uint64_t hash) override; bool test_hash(uint64_t hash) const override; - bool contains(const BloomFilter&) const override { return true; } private: // Bytes in a tiny Bloom filter block. 
diff --git a/be/src/olap/rowset/segment_v2/bloom_filter.h b/be/src/olap/rowset/segment_v2/bloom_filter.h index 20a903e65c1b3b..4f4adf0fd12283 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter.h @@ -132,14 +132,20 @@ class BloomFilter { // for read // use deep copy to acquire the data virtual Status init(const char* buf, uint32_t size, HashStrategyPB strategy) { + if (size <= 1) { + return Status::InvalidArgument("invalid size:{}", size); + } DCHECK(size > 1); if (strategy == HASH_MURMUR3_X64_64) { _hash_func = murmur_hash3_x64_64; } else { return Status::InvalidArgument("invalid strategy:{}", strategy); } - if (size == 0) { - return Status::InvalidArgument("invalid size:{}", size); + if (buf == nullptr) { + return Status::InvalidArgument("buf is nullptr"); + } + if (((size - 1) & (size - 2)) != 0) { + return Status::InvalidArgument("size - 1 must be power of two"); } _data = new char[size]; memcpy(_data, buf, size); @@ -180,7 +186,7 @@ class BloomFilter { /// Checks if this contains everything from another bloom filter. /// Bloom filters must have equal size and seed. 
- virtual bool contains(const BloomFilter& bf_) const = 0; + virtual bool contains(const BloomFilter& bf_) const { return true; }; virtual char* data() const { return _data; } diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp index 609d21ce4f5c22..8c63c25d20acee 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_reader.cpp @@ -70,6 +70,7 @@ Status BloomFilterIndexIterator::read_bloom_filter(rowid_t ordinal, auto column = data_type->create_column(); RETURN_IF_ERROR(_bloom_filter_iter.seek_to_ordinal(ordinal)); + DCHECK(current_bloom_filter_index() == ordinal); size_t num_read = num_to_read; RETURN_IF_ERROR(_bloom_filter_iter.next_batch(&num_read, column)); DCHECK(num_to_read == num_read); diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp index 98669ccb141ae7..3f9fb94df0a844 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.cpp @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -68,15 +69,12 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { explicit BloomFilterIndexWriterImpl(const BloomFilterOptions& bf_options, const TypeInfo* type_info) - : _bf_options(bf_options), - _type_info(type_info), - _has_null(false), - _bf_buffer_size(0) {} + : _bf_options(bf_options), _type_info(type_info) {} ~BloomFilterIndexWriterImpl() override = default; Status add_values(const void* values, size_t count) override { - const CppType* v = (const CppType*)values; + const auto* v = (const CppType*)values; for (int i = 0; i < count; ++i) { if (_values.find(*v) == _values.end()) { if constexpr (_is_slice_type()) { @@ -105,7 +103,7 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { 
bf->set_has_null(_has_null); for (auto& v : _values) { if constexpr (_is_slice_type()) { - Slice* s = (Slice*)&v; + auto* s = (Slice*)&v; bf->add_bytes(s->data, s->size); } else { bf->add_bytes((char*)&v, sizeof(CppType)); @@ -160,11 +158,11 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { static constexpr bool _is_int128() { return field_type == FieldType::OLAP_FIELD_TYPE_LARGEINT; } private: - BloomFilterOptions _bf_options; - const TypeInfo* _type_info; + BloomFilterOptions _bf_options {}; + const TypeInfo* _type_info = nullptr; vectorized::Arena _arena; - bool _has_null; - uint64_t _bf_buffer_size; + bool _has_null = false; + uint64_t _bf_buffer_size = 0; // distinct values ValueDict _values; std::vector> _bfs; @@ -173,7 +171,7 @@ class BloomFilterIndexWriterImpl : public BloomFilterIndexWriter { } // namespace Status PrimaryKeyBloomFilterIndexWriterImpl::add_values(const void* values, size_t count) { - const Slice* v = (const Slice*)values; + const auto* v = (const Slice*)values; for (int i = 0; i < count; ++i) { Slice new_value; RETURN_IF_CATCH_EXCEPTION(_type_info->deep_copy(&new_value, v, &_arena)); @@ -189,7 +187,7 @@ Status PrimaryKeyBloomFilterIndexWriterImpl::flush() { RETURN_IF_ERROR(bf->init(_values.size(), _bf_options.fpp, _bf_options.strategy)); bf->set_has_null(_has_null); for (auto& v : _values) { - Slice* s = (Slice*)&v; + auto* s = (Slice*)&v; bf->add_bytes(s->data, s->size); } _bf_buffer_size += bf->size(); @@ -205,7 +203,7 @@ Status PrimaryKeyBloomFilterIndexWriterImpl::flush() { Status PrimaryKeyBloomFilterIndexWriterImpl::finish(io::FileWriter* file_writer, ColumnIndexMetaPB* index_meta) { - if (_values.size() > 0) { + if (!_values.empty()) { RETURN_IF_ERROR(flush()); } index_meta->set_type(BLOOM_FILTER_INDEX); @@ -246,7 +244,7 @@ NGramBloomFilterIndexWriterImpl::NGramBloomFilterIndexWriterImpl( } Status NGramBloomFilterIndexWriterImpl::add_values(const void* values, size_t count) { - const Slice* src = 
reinterpret_cast(values); + const auto* src = reinterpret_cast(values); for (int i = 0; i < count; ++i, ++src) { if (src->size < _gram_size) { continue; @@ -339,7 +337,8 @@ Status NGramBloomFilterIndexWriterImpl::create(const BloomFilterOptions& bf_opti case FieldType::OLAP_FIELD_TYPE_CHAR: case FieldType::OLAP_FIELD_TYPE_VARCHAR: case FieldType::OLAP_FIELD_TYPE_STRING: - res->reset(new NGramBloomFilterIndexWriterImpl(bf_options, gram_size, gram_bf_size)); + *res = std::make_unique(bf_options, gram_size, + gram_bf_size); break; default: return Status::NotSupported("unsupported type for ngram bloom filter index:{}", @@ -348,5 +347,22 @@ Status NGramBloomFilterIndexWriterImpl::create(const BloomFilterOptions& bf_opti return Status::OK(); } +Status PrimaryKeyBloomFilterIndexWriterImpl::create(const BloomFilterOptions& bf_options, + const TypeInfo* typeinfo, + std::unique_ptr* res) { + FieldType type = typeinfo->type(); + switch (type) { + case FieldType::OLAP_FIELD_TYPE_CHAR: + case FieldType::OLAP_FIELD_TYPE_VARCHAR: + case FieldType::OLAP_FIELD_TYPE_STRING: + *res = std::make_unique(bf_options, typeinfo); + break; + default: + return Status::NotSupported("unsupported type for primary key bloom filter index:{}", + std::to_string(int(type))); + } + return Status::OK(); +} + } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h index 2cdf7171e3e276..a94982438f651a 100644 --- a/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h +++ b/be/src/olap/rowset/segment_v2/bloom_filter_index_writer.h @@ -85,6 +85,8 @@ class PrimaryKeyBloomFilterIndexWriterImpl : public BloomFilterIndexWriter { } }; + static Status create(const BloomFilterOptions& bf_options, const TypeInfo* typeinfo, + std::unique_ptr* res); // This method may allocate large memory for bf, will return error // when memory is exhaused to prevent oom. 
Status add_values(const void* values, size_t count) override; diff --git a/be/src/olap/rowset/segment_v2/column_reader.cpp b/be/src/olap/rowset/segment_v2/column_reader.cpp index aad3725d5a3f6e..9d5328de869304 100644 --- a/be/src/olap/rowset/segment_v2/column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/column_reader.cpp @@ -344,7 +344,7 @@ Status ColumnReader::new_inverted_index_iterator( { std::shared_lock rlock(_load_index_lock); if (_inverted_index) { - RETURN_IF_ERROR(_inverted_index->new_iterator(read_options.stats, + RETURN_IF_ERROR(_inverted_index->new_iterator(read_options.io_ctx, read_options.stats, read_options.runtime_state, iterator)); } } @@ -411,7 +411,7 @@ Status ColumnReader::next_batch_of_zone_map(size_t* n, vectorized::MutableColumn } else { if (is_string) { auto sv = (StringRef*)min_value->cell_ptr(); - dst->insert_many_data(sv->data, sv->size, size); + dst->insert_data_repeatedly(sv->data, sv->size, size); } else { // TODO: the work may cause performance problem, opt latter for (int i = 0; i < size; ++i) { @@ -871,8 +871,18 @@ Status MapFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnPtr size_t num_read = *n; auto null_map_ptr = static_cast(*dst).get_null_map_column_ptr(); - bool null_signs_has_null = false; - RETURN_IF_ERROR(_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + // in not-null to null linked-schemachange mode, + // actually we do not change dat data include meta in footer, + // so may dst from changed meta which is nullable but old data is not nullable, + // if so, we should set null_map to all null by default + if (_null_iterator) { + bool null_signs_has_null = false; + RETURN_IF_ERROR( + _null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + } else { + auto& null_map = assert_cast(*null_map_ptr); + null_map.insert_many_vals(0, num_read); + } DCHECK(num_read == *n); } return Status::OK(); @@ -932,8 +942,18 @@ Status StructFileColumnIterator::next_batch(size_t* 
n, vectorized::MutableColumn size_t num_read = *n; auto null_map_ptr = static_cast(*dst).get_null_map_column_ptr(); - bool null_signs_has_null = false; - RETURN_IF_ERROR(_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + // in not-null to null linked-schemachange mode, + // actually we do not change dat data include meta in footer, + // so may dst from changed meta which is nullable but old data is not nullable, + // if so, we should set null_map to all null by default + if (_null_iterator) { + bool null_signs_has_null = false; + RETURN_IF_ERROR( + _null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + } else { + auto& null_map = assert_cast(*null_map_ptr); + null_map.insert_many_vals(0, num_read); + } DCHECK(num_read == *n); } @@ -1086,8 +1106,18 @@ Status ArrayFileColumnIterator::next_batch(size_t* n, vectorized::MutableColumnP auto null_map_ptr = static_cast(*dst).get_null_map_column_ptr(); size_t num_read = *n; - bool null_signs_has_null = false; - RETURN_IF_ERROR(_null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + // in not-null to null linked-schemachange mode, + // actually we do not change dat data include meta in footer, + // so may dst from changed meta which is nullable but old data is not nullable, + // if so, we should set null_map to all null by default + if (_null_iterator) { + bool null_signs_has_null = false; + RETURN_IF_ERROR( + _null_iterator->next_batch(&num_read, null_map_ptr, &null_signs_has_null)); + } else { + auto& null_map = assert_cast(*null_map_ptr); + null_map.insert_many_vals(0, num_read); + } DCHECK(num_read == *n); } @@ -1508,7 +1538,7 @@ void DefaultValueColumnIterator::insert_default_data(const TypeInfo* type_info, value.cast_to_date(); int64 = binary_cast(value); - dst->insert_many_data(data_ptr, data_len, n); + dst->insert_data_repeatedly(data_ptr, data_len, n); break; } case FieldType::OLAP_FIELD_TYPE_DATETIME: { @@ -1526,7 +1556,7 @@ void 
DefaultValueColumnIterator::insert_default_data(const TypeInfo* type_info, value.to_datetime(); int64 = binary_cast(value); - dst->insert_many_data(data_ptr, data_len, n); + dst->insert_data_repeatedly(data_ptr, data_len, n); break; } case FieldType::OLAP_FIELD_TYPE_DECIMAL: { @@ -1538,7 +1568,7 @@ void DefaultValueColumnIterator::insert_default_data(const TypeInfo* type_info, sizeof(FieldTypeTraits::CppType)); //decimal12_t decimal12_t* d = (decimal12_t*)mem_value; int128 = DecimalV2Value(d->integer, d->fraction).value(); - dst->insert_many_data(data_ptr, data_len, n); + dst->insert_data_repeatedly(data_ptr, data_len, n); break; } case FieldType::OLAP_FIELD_TYPE_STRING: @@ -1548,7 +1578,7 @@ void DefaultValueColumnIterator::insert_default_data(const TypeInfo* type_info, case FieldType::OLAP_FIELD_TYPE_AGG_STATE: { char* data_ptr = ((Slice*)mem_value)->data; size_t data_len = ((Slice*)mem_value)->size; - dst->insert_many_data(data_ptr, data_len, n); + dst->insert_data_repeatedly(data_ptr, data_len, n); break; } case FieldType::OLAP_FIELD_TYPE_ARRAY: { @@ -1566,7 +1596,7 @@ void DefaultValueColumnIterator::insert_default_data(const TypeInfo* type_info, default: { char* data_ptr = (char*)mem_value; size_t data_len = type_size; - dst->insert_many_data(data_ptr, data_len, n); + dst->insert_data_repeatedly(data_ptr, data_len, n); } } } diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp index 3028211f266157..da6beff5d8d6a2 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.cpp @@ -81,7 +81,8 @@ Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, _sole_data_page = PagePointer(_meta.ordinal_index_meta().root_page()); } else { RETURN_IF_ERROR(load_index_page(_meta.ordinal_index_meta().root_page(), - &_ordinal_index_page_handle, &_ordinal_index_reader)); + &_ordinal_index_page_handle, + 
_ordinal_index_reader.get())); _has_index_page = true; } } @@ -92,7 +93,7 @@ Status IndexedColumnReader::load(bool use_page_cache, bool kept_in_memory, _sole_data_page = PagePointer(_meta.value_index_meta().root_page()); } else { RETURN_IF_ERROR(load_index_page(_meta.value_index_meta().root_page(), - &_value_index_page_handle, &_value_index_reader)); + &_value_index_page_handle, _value_index_reader.get())); _has_index_page = true; } } diff --git a/be/src/olap/rowset/segment_v2/indexed_column_reader.h b/be/src/olap/rowset/segment_v2/indexed_column_reader.h index c3469f9f6bed0d..c9640c0007c153 100644 --- a/be/src/olap/rowset/segment_v2/indexed_column_reader.h +++ b/be/src/olap/rowset/segment_v2/indexed_column_reader.h @@ -50,9 +50,12 @@ class EncodingInfo; class IndexedColumnReader : public MetadataAdder { public: explicit IndexedColumnReader(io::FileReaderSPtr file_reader, const IndexedColumnMetaPB& meta) - : _file_reader(std::move(file_reader)), _meta(meta) {} + : _file_reader(std::move(file_reader)), _meta(meta) { + _ordinal_index_reader = std::make_unique(); + _value_index_reader = std::make_unique(); + } - ~IndexedColumnReader(); + ~IndexedColumnReader() override; Status load(bool use_page_cache, bool kept_in_memory, OlapReaderStatistics* index_load_stats = nullptr); @@ -90,8 +93,8 @@ class IndexedColumnReader : public MetadataAdder { bool _has_index_page = false; // valid only when the column contains only one data page PagePointer _sole_data_page; - IndexPageReader _ordinal_index_reader; - IndexPageReader _value_index_reader; + std::unique_ptr _ordinal_index_reader; + std::unique_ptr _value_index_reader; PageHandle _ordinal_index_page_handle; PageHandle _value_index_page_handle; @@ -108,8 +111,8 @@ class IndexedColumnIterator { explicit IndexedColumnIterator(const IndexedColumnReader* reader, OlapReaderStatistics* stats = nullptr) : _reader(reader), - _ordinal_iter(&reader->_ordinal_index_reader), - _value_iter(&reader->_value_index_reader), + 
_ordinal_iter(reader->_ordinal_index_reader.get()), + _value_iter(reader->_value_index_reader.get()), _stats(stats) {} // Seek to the given ordinal entry. Entry 0 is the first entry. diff --git a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h index d9e5080d2d584d..1e5e6f5d5cedd0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter.h @@ -17,7 +17,7 @@ #pragma once -#include +#include // IWYU pragma: keep #include #include diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp index fb2479517166fc..6e9d61db7fddb4 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.cpp @@ -20,8 +20,9 @@ namespace doris::segment_v2 { ConjunctionQuery::ConjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, const io::IOContext* io_ctx) : _searcher(searcher), + _io_ctx(io_ctx), _index_version(_searcher->getReader()->getIndexVersion()), _conjunction_ratio(query_options.inverted_index_conjunction_opt_threshold) {} @@ -48,7 +49,7 @@ void ConjunctionQuery::add(const std::wstring& field_name, const std::vectorgetReader()->termDocs(t); + TermDocs* term_doc = _searcher->getReader()->termDocs(t, _io_ctx); _term_docs.push_back(term_doc); iterators.emplace_back(term_doc); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h index 2571392d5294e9..b9bfee2bfb1f7a 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h +++ 
b/be/src/olap/rowset/segment_v2/inverted_index/query/conjunction_query.h @@ -27,7 +27,7 @@ namespace doris::segment_v2 { class ConjunctionQuery : public Query { public: ConjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~ConjunctionQuery() override; void add(const std::wstring& field_name, const std::vector& terms) override; @@ -41,6 +41,7 @@ class ConjunctionQuery : public Query { public: std::shared_ptr _searcher; + const io::IOContext* _io_ctx = nullptr; IndexVersion _index_version = IndexVersion::kV0; int32_t _conjunction_ratio = 1000; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp index 650a88c064611c..852357073d3b1d 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.cpp @@ -20,8 +20,8 @@ namespace doris::segment_v2 { DisjunctionQuery::DisjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) - : _searcher(searcher) {} + const TQueryOptions& query_options, const io::IOContext* io_ctx) + : _searcher(searcher), _io_ctx(io_ctx) {} void DisjunctionQuery::add(const std::wstring& field_name, const std::vector& terms) { if (terms.empty()) { @@ -36,7 +36,7 @@ void DisjunctionQuery::search(roaring::Roaring& roaring) { auto func = [this, &roaring](const std::string& term, bool first) { std::wstring ws_term = StringUtil::string_to_wstring(term); auto* t = _CLNEW Term(_field_name.c_str(), ws_term.c_str()); - auto* term_doc = _searcher->getReader()->termDocs(t); + auto* term_doc = _searcher->getReader()->termDocs(t, _io_ctx); TermIterator iterator(term_doc); DocRange doc_range; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h 
b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h index 357831461571c7..8d0559ee4b0c98 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/disjunction_query.h @@ -27,7 +27,7 @@ namespace doris::segment_v2 { class DisjunctionQuery : public Query { public: DisjunctionQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~DisjunctionQuery() override = default; void add(const std::wstring& field_name, const std::vector& terms) override; @@ -35,6 +35,7 @@ class DisjunctionQuery : public Query { private: std::shared_ptr _searcher; + const io::IOContext* _io_ctx = nullptr; std::wstring _field_name; std::vector _terms; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp index ec1b5bdd9e4d35..f82433826e9581 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.cpp @@ -30,7 +30,7 @@ namespace doris::segment_v2 { PhraseEdgeQuery::PhraseEdgeQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, const io::IOContext* io_ctx) : _searcher(searcher), _query(std::make_unique()), _max_expansions(query_options.inverted_index_max_expansions) {} diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h index 5daf382e0d08fa..9eb3bd57c4a916 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_edge_query.h @@ -31,7 +31,7 @@ namespace doris::segment_v2 { class PhraseEdgeQuery : public Query { public: PhraseEdgeQuery(const 
std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~PhraseEdgeQuery() override = default; void add(const std::wstring& field_name, const std::vector& terms) override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp index 407e515dc9212f..88bb3c1171fa30 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.cpp @@ -23,7 +23,8 @@ namespace doris::segment_v2 { PhrasePrefixQuery::PhrasePrefixQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, + const io::IOContext* io_ctx) : _searcher(searcher), _query(std::make_unique()), _max_expansions(query_options.inverted_index_max_expansions) {} diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h index e565c0409cf4cd..5cac597951eac7 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_prefix_query.h @@ -31,7 +31,7 @@ namespace doris::segment_v2 { class PhrasePrefixQuery : public Query { public: PhrasePrefixQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~PhrasePrefixQuery() override = default; void add(const std::wstring& field_name, const std::vector& terms) override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp index 9a3ecc68f89fa0..38e60b0f089dc0 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp +++ 
b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.cpp @@ -123,8 +123,8 @@ bool OrderedSloppyPhraseMatcher::stretch_to_order(PostingsAndPosition* prev_post } PhraseQuery::PhraseQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) - : _searcher(searcher) {} + const TQueryOptions& query_options, const io::IOContext* io_ctx) + : _searcher(searcher), _io_ctx(io_ctx) {} PhraseQuery::~PhraseQuery() { for (auto& term_doc : _term_docs) { @@ -173,7 +173,7 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vectorgetReader()->termDocs(t); + TermDocs* term_doc = _searcher->getReader()->termDocs(t, _io_ctx); _term_docs.push_back(term_doc); _lead1 = TermIterator(term_doc); return; @@ -185,7 +185,7 @@ void PhraseQuery::add(const std::wstring& field_name, const std::vectorgetReader()->termPositions(t); + TermPositions* term_pos = _searcher->getReader()->termPositions(t, _io_ctx); _term_docs.push_back(term_pos); if (is_save_iter) { iterators.emplace_back(term_pos); diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h index 35a479ff7f9781..a2c3a7ae91afcc 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/phrase_query.h @@ -87,7 +87,7 @@ using Matcher = std::variant; class PhraseQuery : public Query { public: PhraseQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~PhraseQuery() override; void add(const InvertedIndexQueryInfo& query_info) override; @@ -112,6 +112,7 @@ class PhraseQuery : public Query { private: std::shared_ptr _searcher; + const io::IOContext* _io_ctx = nullptr; TermIterator _lead1; TermIterator _lead2; diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h index 
c295765ec63478..c0eac69deaeaf3 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/query.h @@ -27,6 +27,7 @@ #include #include "common/status.h" +#include "io/io_common.h" #include "roaring/roaring.hh" CL_NS_USE(index) diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp index 007da8289dcdb0..69de4b7818b870 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.cpp @@ -25,10 +25,10 @@ namespace doris::segment_v2 { RegexpQuery::RegexpQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options) + const TQueryOptions& query_options, const io::IOContext* io_ctx) : _searcher(searcher), _max_expansions(query_options.inverted_index_max_expansions), - _query(searcher, query_options) {} + _query(searcher, query_options, io_ctx) {} void RegexpQuery::add(const std::wstring& field_name, const std::vector& patterns) { if (patterns.size() != 1) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h index 336b2d0b6a671d..650ad2bf10b002 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h +++ b/be/src/olap/rowset/segment_v2/inverted_index/query/regexp_query.h @@ -28,7 +28,7 @@ namespace doris::segment_v2 { class RegexpQuery : public Query { public: RegexpQuery(const std::shared_ptr& searcher, - const TQueryOptions& query_options); + const TQueryOptions& query_options, const io::IOContext* io_ctx); ~RegexpQuery() override = default; void add(const std::wstring& field_name, const std::vector& patterns) override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_common.h b/be/src/olap/rowset/segment_v2/inverted_index_common.h new file mode 100644 index 
00000000000000..1fdb7df2931de4 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index_common.h @@ -0,0 +1,103 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include // IWYU pragma: keep + +#include + +#include "common/logging.h" + +namespace lucene::store { +class Directory; +} // namespace lucene::store + +namespace doris::segment_v2 { + +struct DirectoryDeleter { + void operator()(lucene::store::Directory* ptr) const { _CLDECDELETE(ptr); } +}; + +struct ErrorContext { + std::string err_msg; + std::exception_ptr eptr; +}; + +template +concept HasClose = requires(T t) { + { t->close() }; +}; + +template + requires HasClose +void finally_close(PtrType& resource, ErrorContext& error_context) { + if (resource) { + try { + resource->close(); + } catch (CLuceneError& err) { + error_context.eptr = std::current_exception(); + error_context.err_msg.append("Error occurred while closing resource: "); + error_context.err_msg.append(err.what()); + LOG(ERROR) << error_context.err_msg; + } catch (...) 
{ + error_context.eptr = std::current_exception(); + error_context.err_msg.append("Error occurred while closing resource"); + LOG(ERROR) << error_context.err_msg; + } + } +} + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-macros" +#endif + +#define FINALLY_CLOSE(resource) \ + { \ + static_assert(sizeof(error_context) > 0, \ + "error_context must be defined before using FINALLY macro!"); \ + finally_close(resource, error_context); \ + } + +// Return ERROR after finally +#define FINALLY(finally_block) \ + { \ + static_assert(sizeof(error_context) > 0, \ + "error_context must be defined before using FINALLY macro!"); \ + finally_block; \ + if (error_context.eptr) { \ + return Status::Error(error_context.err_msg); \ + } \ + } + +// Re-throw the exception after finally +#define FINALLY_EXCEPTION(finally_block) \ + { \ + static_assert(sizeof(error_context) > 0, \ + "error_context must be defined before using FINALLY macro!"); \ + finally_block; \ + if (error_context.eptr) { \ + std::rethrow_exception(error_context.eptr); \ + } \ + } + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +} // namespace doris::segment_v2 \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp index 88a8f2417228bc..dcbdca921ab8e8 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compaction.cpp @@ -76,17 +76,19 @@ Status compact_column(int64_t index_id, // when index_writer is destroyed, if closeDir is set, dir will be close // _CLDECDELETE(dir) will try to ref_cnt--, when it decreases to 1, dir will be destroyed. _CLDECDELETE(dir) - for (auto* d : dest_index_dirs) { - if (d != nullptr) { - // NOTE: DO NOT close dest dir here, because it will be closed when dest index writer finalize. 
- //d->close(); - //_CLDELETE(d); - } - } // delete temporary segment_path, only when inverted_index_ram_dir_enable is false if (!config::inverted_index_ram_dir_enable) { - std::ignore = io::global_local_filesystem()->delete_directory(tmp_path.data()); + auto st = io::global_local_filesystem()->delete_directory(tmp_path.data()); + DBUG_EXECUTE_IF("compact_column_delete_tmp_path_error", { + st = Status::Error( + "debug point: compact_column_delete_tmp_path_error in index compaction"); + }) + if (!st.ok()) { + LOG(WARNING) << "compact column failed to delete tmp path: " << tmp_path + << ", error: " << st.to_string(); + return st; + } } return Status::OK(); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp index 7613df112ed9aa..f1b2b0eaedd4fd 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_compound_reader.cpp @@ -59,6 +59,8 @@ class CSIndexInput : public lucene::store::BufferedIndexInput { CL_NS(store)::IndexInput* base; int64_t fileOffset; int64_t _length; + const io::IOContext* _io_ctx = nullptr; + bool _is_index_file = false; // Indicates if the file is a TII file protected: void readInternal(uint8_t* /*b*/, const int32_t /*len*/) override; @@ -75,6 +77,8 @@ class CSIndexInput : public lucene::store::BufferedIndexInput { const char* getDirectoryType() const override { return DorisCompoundReader::getClassName(); } const char* getObjectName() const override { return getClassName(); } static const char* getClassName() { return "CSIndexInput"; } + void setIoContext(const void* io_ctx) override; + void setIndexFile(bool isIndexFile) override; }; CSIndexInput::CSIndexInput(CL_NS(store)::IndexInput* base, const int64_t fileOffset, @@ -92,9 +96,19 @@ void CSIndexInput::readInternal(uint8_t* b, const int32_t len) { if (start + len > _length) { _CLTHROWA(CL_ERR_IO, "read past EOF"); } + + if 
(_io_ctx) { + base->setIoContext(_io_ctx); + } + + base->setIndexFile(_is_index_file); base->seek(fileOffset + start); bool read_from_buffer = true; base->readBytes(b, len, read_from_buffer); + + if (_io_ctx) { + base->setIoContext(nullptr); + } } CSIndexInput::~CSIndexInput() = default; @@ -111,6 +125,14 @@ CSIndexInput::CSIndexInput(const CSIndexInput& clone) : BufferedIndexInput(clone void CSIndexInput::close() {} +void CSIndexInput::setIoContext(const void* io_ctx) { + _io_ctx = static_cast(io_ctx); +} + +void CSIndexInput::setIndexFile(bool isIndexFile) { + _is_index_file = isIndexFile; +} + DorisCompoundReader::DorisCompoundReader(CL_NS(store)::IndexInput* stream, int32_t read_buffer_size) : _ram_dir(new lucene::store::RAMDirectory()), _stream(stream), diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp index e0c75922c98bb2..813a78f2a3fa86 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.cpp @@ -27,22 +27,27 @@ namespace doris::segment_v2 { -Status InvertedIndexFileReader::init(int32_t read_buffer_size, bool open_idx_file_cache) { +Status InvertedIndexFileReader::init(int32_t read_buffer_size, const io::IOContext* io_ctx) { if (!_inited) { _read_buffer_size = read_buffer_size; - _open_idx_file_cache = open_idx_file_cache; - if (_storage_format == InvertedIndexStorageFormatPB::V2) { - auto st = _init_from_v2(read_buffer_size); + if (_storage_format >= InvertedIndexStorageFormatPB::V2) { + auto st = _init_from(read_buffer_size, io_ctx); if (!st.ok()) { return st; } } _inited = true; + } else { + if (_storage_format == InvertedIndexStorageFormatPB::V2) { + if (_stream) { + _stream->setIoContext(io_ctx); + } + } } return Status::OK(); } -Status InvertedIndexFileReader::_init_from_v2(int32_t read_buffer_size) { +Status InvertedIndexFileReader::_init_from(int32_t read_buffer_size, const 
io::IOContext* io_ctx) { auto index_file_full_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix); std::unique_lock lock(_mutex); // Lock for writing @@ -76,12 +81,12 @@ Status InvertedIndexFileReader::_init_from_v2(int32_t read_buffer_size) { "CLuceneError occur when open idx file {}, error msg: {}", index_file_full_path, err.what()); } - index_input->setIdxFileCache(_open_idx_file_cache); _stream = std::unique_ptr(index_input); + _stream->setIoContext(io_ctx); // 3. read file int32_t version = _stream->readInt(); // Read version number - if (version == InvertedIndexStorageFormatPB::V2) { + if (version >= InvertedIndexStorageFormatPB::V2) { DCHECK(version == _storage_format); int32_t numIndices = _stream->readInt(); // Read number of indices ReaderFileEntry* entry = nullptr; @@ -198,7 +203,6 @@ Result> InvertedIndexFileReader::_open( } // 3. read file in DorisCompoundReader - index_input->setIdxFileCache(_open_idx_file_cache); compound_reader = std::make_unique(index_input, _read_buffer_size); } catch (CLuceneError& err) { return ResultError(Status::Error( diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h index 8bc28b1882f9d8..ed6ee85e7d7bf1 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_reader.h @@ -59,7 +59,7 @@ class InvertedIndexFileReader { _idx_file_info(idx_file_info) {} Status init(int32_t read_buffer_size = config::inverted_index_read_buffer_size, - bool open_idx_file_cache = false); + const io::IOContext* io_ctx = nullptr); Result> open(const TabletIndex* index_meta) const; void debug_file_entries(); std::string get_index_file_cache_key(const TabletIndex* index_meta) const; @@ -71,7 +71,7 @@ class InvertedIndexFileReader { int64_t get_inverted_file_size() const { return _stream == nullptr ? 
0 : _stream->length(); } private: - Status _init_from_v2(int32_t read_buffer_size); + Status _init_from(int32_t read_buffer_size, const io::IOContext* io_ctx); Result> _open(int64_t index_id, const std::string& index_suffix) const; @@ -80,7 +80,6 @@ class InvertedIndexFileReader { const io::FileSystemSPtr _fs; std::string _index_path_prefix; int32_t _read_buffer_size = -1; - bool _open_idx_file_cache = false; InvertedIndexStorageFormatPB _storage_format; mutable std::shared_mutex _mutex; // Use mutable for const read operations bool _inited = false; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp index 5599faa351dfd6..4d6892aa78568f 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.cpp @@ -19,17 +19,14 @@ #include +#include #include #include "common/status.h" -#include "io/fs/file_writer.h" -#include "io/fs/local_file_system.h" -#include "olap/rowset/segment_v2/inverted_index_cache.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" #include "olap/rowset/segment_v2/inverted_index_fs_directory.h" #include "olap/rowset/segment_v2/inverted_index_reader.h" #include "olap/tablet_schema.h" -#include "runtime/exec_env.h" namespace doris::segment_v2 { @@ -38,32 +35,11 @@ Status InvertedIndexFileWriter::initialize(InvertedIndexDirectoryMap& indices_di return Status::OK(); } -Result InvertedIndexFileWriter::open(const TabletIndex* index_meta) { - auto tmp_file_dir = ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir(); - const auto& local_fs = io::global_local_filesystem(); - auto local_fs_index_path = InvertedIndexDescriptor::get_temporary_index_path( - tmp_file_dir.native(), _rowset_id, _seg_id, index_meta->index_id(), - index_meta->get_index_suffix()); - bool exists = false; - auto st = local_fs->exists(local_fs_index_path, &exists); - 
DBUG_EXECUTE_IF("InvertedIndexFileWriter::open_local_fs_exists_error", - { st = Status::Error("debug point: no such file error"); }) - if (!st.ok()) { - LOG(ERROR) << "index_path:" << local_fs_index_path << " exists error:" << st; - return ResultError(st); - } - DBUG_EXECUTE_IF("InvertedIndexFileWriter::open_local_fs_exists_true", { exists = true; }) - if (exists) { - LOG(ERROR) << "try to init a directory:" << local_fs_index_path << " already exists"; - return ResultError( - Status::InternalError("InvertedIndexFileWriter::open directory already exists")); - } - - bool can_use_ram_dir = true; - auto* dir = DorisFSDirectoryFactory::getDirectory(local_fs, local_fs_index_path.c_str(), - can_use_ram_dir); - auto key = std::make_pair(index_meta->index_id(), index_meta->get_index_suffix()); - auto [it, inserted] = _indices_dirs.emplace(key, std::unique_ptr(dir)); +Status InvertedIndexFileWriter::_insert_directory_into_map(int64_t index_id, + const std::string& index_suffix, + std::shared_ptr dir) { + auto key = std::make_pair(index_id, index_suffix); + auto [it, inserted] = _indices_dirs.emplace(key, std::move(dir)); if (!inserted) { LOG(ERROR) << "InvertedIndexFileWriter::open attempted to insert a duplicate key: (" << key.first << ", " << key.second << ")"; @@ -71,8 +47,23 @@ Result InvertedIndexFileWriter::open(const TabletIndex* index for (const auto& entry : _indices_dirs) { LOG(ERROR) << "Key: (" << entry.first.first << ", " << entry.first.second << ")"; } - return ResultError(Status::InternalError( - "InvertedIndexFileWriter::open attempted to insert a duplicate dir")); + return Status::InternalError( + "InvertedIndexFileWriter::open attempted to insert a duplicate dir"); + } + return Status::OK(); +} + +Result> InvertedIndexFileWriter::open( + const TabletIndex* index_meta) { + auto local_fs_index_path = InvertedIndexDescriptor::get_temporary_index_path( + _tmp_dir, _rowset_id, _seg_id, index_meta->index_id(), index_meta->get_index_suffix()); + bool 
can_use_ram_dir = true; + auto dir = std::shared_ptr(DorisFSDirectoryFactory::getDirectory( + _local_fs, local_fs_index_path.c_str(), can_use_ram_dir)); + auto st = + _insert_directory_into_map(index_meta->index_id(), index_meta->get_index_suffix(), dir); + if (!st.ok()) { + return ResultError(st); } return dir; @@ -159,7 +150,7 @@ Status InvertedIndexFileWriter::close() { } } else { try { - RETURN_IF_ERROR(write_v2()); + RETURN_IF_ERROR(write()); for (const auto& entry : _indices_dirs) { const auto& dir = entry.second; // delete index path, which contains separated inverted index files @@ -222,7 +213,7 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire int64_t chunk = bufferLength; while (remainder > 0) { - int64_t len = std::min(std::min(chunk, length), remainder); + int64_t len = std::min({chunk, length, remainder}); input->readBytes(buffer, len); output->writeBytes(buffer, len); remainder -= len; @@ -252,244 +243,316 @@ void InvertedIndexFileWriter::copyFile(const char* fileName, lucene::store::Dire Status InvertedIndexFileWriter::write_v1() { int64_t total_size = 0; + std::unique_ptr out_dir = nullptr; + std::unique_ptr output = nullptr; + ErrorContext error_context; for (const auto& entry : _indices_dirs) { const int64_t index_id = entry.first.first; const auto& index_suffix = entry.first.second; try { - const auto& directory = entry.second; - std::vector files; - directory->list(&files); - // remove write.lock file - auto it = std::find(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE); - if (it != files.end()) { - files.erase(it); - } + const auto& directory = entry.second.get(); - std::vector sorted_files; - for (auto file : files) { - FileInfo file_info; - file_info.filename = file; - file_info.filesize = directory->fileLength(file.c_str()); - sorted_files.emplace_back(std::move(file_info)); - } - sort_files(sorted_files); - - int32_t file_count = sorted_files.size(); - - io::Path 
cfs_path(InvertedIndexDescriptor::get_index_file_path_v1( - _index_path_prefix, index_id, index_suffix)); - auto idx_path = cfs_path.parent_path(); - std::string idx_name = cfs_path.filename(); - // write file entries to ram directory to get header length - lucene::store::RAMDirectory ram_dir; - auto* out_idx = ram_dir.createOutput(idx_name.c_str()); - DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_ram_output_is_nullptr", - { out_idx = nullptr; }) - if (out_idx == nullptr) { - LOG(WARNING) << "Write compound file error: RAMDirectory output is nullptr."; - _CLTHROWA(CL_ERR_IO, "Create RAMDirectory output error"); - } + // Prepare sorted file list + auto sorted_files = prepare_sorted_files(directory); + + // Calculate header length + auto [header_length, header_file_count] = + calculate_header_length(sorted_files, directory); + + // Create output stream + auto result = create_output_stream_v1(index_id, index_suffix); + out_dir = std::move(result.first); + output = std::move(result.second); - std::unique_ptr ram_output(out_idx); - ram_output->writeVInt(file_count); - // write file entries in ram directory - // number of files, which data are in header - int header_file_count = 0; - int64_t header_file_length = 0; - const int64_t buffer_length = 16384; - uint8_t ram_buffer[buffer_length]; - for (auto file : sorted_files) { - ram_output->writeString(file.filename); // file name - ram_output->writeLong(0); // data offset - ram_output->writeLong(file.filesize); // file length - header_file_length += file.filesize; - if (header_file_length <= DorisFSDirectory::MAX_HEADER_DATA_SIZE) { - copyFile(file.filename.c_str(), directory.get(), ram_output.get(), ram_buffer, - buffer_length); - header_file_count++; - } - } - auto header_len = ram_output->getFilePointer(); - ram_output->close(); - ram_dir.deleteFile(idx_name.c_str()); - ram_dir.close(); - - auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, idx_path.c_str()); - out_dir->set_file_writer_opts(_opts); - - 
auto* out = out_dir->createOutput(idx_name.c_str()); - DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_out_dir_createOutput_nullptr", - { out = nullptr; }); - if (out == nullptr) { - LOG(WARNING) << "Write compound file error: CompoundDirectory output is nullptr."; - _CLTHROWA(CL_ERR_IO, "Create CompoundDirectory output error"); - } - std::unique_ptr output(out); size_t start = output->getFilePointer(); - output->writeVInt(file_count); - // write file entries - int64_t data_offset = header_len; - uint8_t header_buffer[buffer_length]; - for (int i = 0; i < sorted_files.size(); ++i) { - auto file = sorted_files[i]; - output->writeString(file.filename); // FileName - // DataOffset - if (i < header_file_count) { - // file data write in header, so we set its offset to -1. - output->writeLong(-1); - } else { - output->writeLong(data_offset); - } - output->writeLong(file.filesize); // FileLength - if (i < header_file_count) { - // append data - copyFile(file.filename.c_str(), directory.get(), output.get(), header_buffer, - buffer_length); - } else { - data_offset += file.filesize; - } - } - // write rest files' data - uint8_t data_buffer[buffer_length]; - for (int i = header_file_count; i < sorted_files.size(); ++i) { - auto file = sorted_files[i]; - copyFile(file.filename.c_str(), directory.get(), output.get(), data_buffer, - buffer_length); - } - out_dir->close(); - // NOTE: need to decrease ref count, but not to delete here, - // because index cache may get the same directory from DIRECTORIES - _CLDECDELETE(out_dir) + // Write header and data + write_header_and_data_v1(output.get(), sorted_files, directory, header_length, + header_file_count); + + // Collect file information auto compound_file_size = output->getFilePointer() - start; - output->close(); - //LOG(INFO) << (idx_path / idx_name).c_str() << " size:" << compound_file_size; total_size += compound_file_size; - InvertedIndexFileInfo_IndexInfo index_info; - index_info.set_index_id(index_id); - 
index_info.set_index_suffix(index_suffix); - index_info.set_index_file_size(compound_file_size); - auto* new_index_info = _file_info.add_index_info(); - *new_index_info = index_info; + add_index_info(index_id, index_suffix, compound_file_size); } catch (CLuceneError& err) { + error_context.eptr = std::current_exception(); auto index_path = InvertedIndexDescriptor::get_index_file_path_v1( _index_path_prefix, index_id, index_suffix); - LOG(ERROR) << "CLuceneError occur when write_v1 idx file " << index_path - << " error msg: " << err.what(); - - return Status::Error( - "CLuceneError occur when write_v1 idx file: {}, error msg: {}", index_path, - err.what()); + error_context.err_msg.append("CLuceneError occur when write_v1 idx file: "); + error_context.err_msg.append(index_path); + error_context.err_msg.append(", error msg: "); + error_context.err_msg.append(err.what()); + LOG(ERROR) << error_context.err_msg; } + FINALLY({ + FINALLY_CLOSE(output); + FINALLY_CLOSE(out_dir); + }) } + _total_file_size = total_size; return Status::OK(); } -Status InvertedIndexFileWriter::write_v2() { - io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; - std::unique_ptr compound_file_output; +Status InvertedIndexFileWriter::write() { + std::unique_ptr out_dir = nullptr; + std::unique_ptr compound_file_output = nullptr; + ErrorContext error_context; try { - // Create the output stream to write the compound file + // Calculate header length and initialize offset int64_t current_offset = headerLength(); + // Prepare file metadata + auto file_metadata = prepare_file_metadata(current_offset); - io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; + // Create output stream + auto result = create_output_stream(); + out_dir = std::move(result.first); + compound_file_output = std::move(result.second); - auto* out_dir = - DorisFSDirectoryFactory::getDirectory(_fs, index_path.parent_path().c_str()); - 
out_dir->set_file_writer_opts(_opts); + // Write version and number of indices + write_version_and_indices_count(compound_file_output.get()); - std::unique_ptr compound_file_output; + // Write index headers and file metadata + write_index_headers_and_metadata(compound_file_output.get(), file_metadata); - DCHECK(_idx_v2_writer != nullptr) << "inverted index file writer v2 is nullptr"; - compound_file_output = std::unique_ptr( - out_dir->createOutputV2(_idx_v2_writer.get())); + // Copy file data + copy_files_data(compound_file_output.get(), file_metadata); - // Write the version number - compound_file_output->writeInt(InvertedIndexStorageFormatPB::V2); + _total_file_size = compound_file_output->getFilePointer(); + _file_info.set_index_size(_total_file_size); + } catch (CLuceneError& err) { + error_context.eptr = std::current_exception(); + auto index_path = InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix); + error_context.err_msg.append("CLuceneError occur when close idx file: "); + error_context.err_msg.append(index_path); + error_context.err_msg.append(", error msg: "); + error_context.err_msg.append(err.what()); + LOG(ERROR) << error_context.err_msg; + } + FINALLY({ + FINALLY_CLOSE(compound_file_output); + FINALLY_CLOSE(out_dir); + }) - // Write the number of indices - const auto numIndices = static_cast(_indices_dirs.size()); - compound_file_output->writeInt(numIndices); + return Status::OK(); +} - std::vector> - file_metadata; // Store file name, offset, file length, and corresponding directory +// Helper function implementations +std::vector InvertedIndexFileWriter::prepare_sorted_files( + lucene::store::Directory* directory) { + std::vector files; + directory->list(&files); + + // Remove write.lock file + files.erase(std::remove(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE), + files.end()); + + std::vector sorted_files; + for (const auto& file : files) { + FileInfo file_info; + file_info.filename = file; + file_info.filesize 
= directory->fileLength(file.c_str()); + sorted_files.push_back(std::move(file_info)); + } - // First, write all index information and file metadata - for (const auto& entry : _indices_dirs) { - const int64_t index_id = entry.first.first; - const auto& index_suffix = entry.first.second; - const auto& dir = entry.second; - std::vector files; - dir->list(&files); - - auto it = std::find(files.begin(), files.end(), DorisFSDirectory::WRITE_LOCK_FILE); - if (it != files.end()) { - files.erase(it); - } - // sort file list by file length - std::vector> sorted_files; - for (const auto& file : files) { - sorted_files.emplace_back(file, dir->fileLength(file.c_str())); - } + // Sort the files + sort_files(sorted_files); + return sorted_files; +} - std::sort( - sorted_files.begin(), sorted_files.end(), - [](const std::pair& a, - const std::pair& b) { return (a.second < b.second); }); - - int32_t file_count = sorted_files.size(); - - // Write the index ID and the number of files - compound_file_output->writeLong(index_id); - compound_file_output->writeInt(static_cast(index_suffix.length())); - compound_file_output->writeBytes(reinterpret_cast(index_suffix.data()), - index_suffix.length()); - compound_file_output->writeInt(file_count); - - // Calculate the offset for each file and write the file metadata - for (const auto& file : sorted_files) { - int64_t file_length = dir->fileLength(file.first.c_str()); - compound_file_output->writeInt(static_cast(file.first.length())); - compound_file_output->writeBytes( - reinterpret_cast(file.first.data()), file.first.length()); - compound_file_output->writeLong(current_offset); - compound_file_output->writeLong(file_length); - - file_metadata.emplace_back(file.first, current_offset, file_length, dir.get()); - current_offset += file_length; // Update the data offset - } +void InvertedIndexFileWriter::add_index_info(int64_t index_id, const std::string& index_suffix, + int64_t compound_file_size) { + InvertedIndexFileInfo_IndexInfo 
index_info; + index_info.set_index_id(index_id); + index_info.set_index_suffix(index_suffix); + index_info.set_index_file_size(compound_file_size); + auto* new_index_info = _file_info.add_index_info(); + *new_index_info = index_info; +} + +std::pair InvertedIndexFileWriter::calculate_header_length( + const std::vector& sorted_files, lucene::store::Directory* directory) { + // Use RAMDirectory to calculate header length + lucene::store::RAMDirectory ram_dir; + auto* out_idx = ram_dir.createOutput("temp_idx"); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::calculate_header_length_ram_output_is_nullptr", + { out_idx = nullptr; }) + if (out_idx == nullptr) { + LOG(WARNING) << "InvertedIndexFileWriter::calculate_header_length error: RAMDirectory " + "output is nullptr."; + _CLTHROWA(CL_ERR_IO, "Create RAMDirectory output error"); + } + std::unique_ptr ram_output(out_idx); + int32_t file_count = sorted_files.size(); + ram_output->writeVInt(file_count); + + int64_t header_file_length = 0; + const int64_t buffer_length = 16384; + uint8_t ram_buffer[buffer_length]; + int32_t header_file_count = 0; + for (const auto& file : sorted_files) { + ram_output->writeString(file.filename); + ram_output->writeLong(0); + ram_output->writeLong(file.filesize); + header_file_length += file.filesize; + + if (header_file_length <= DorisFSDirectory::MAX_HEADER_DATA_SIZE) { + copyFile(file.filename.c_str(), directory, ram_output.get(), ram_buffer, buffer_length); + header_file_count++; } + } + + int64_t header_length = ram_output->getFilePointer(); + ram_output->close(); + ram_dir.close(); + return {header_length, header_file_count}; +} - const int64_t buffer_length = 16384; - uint8_t header_buffer[buffer_length]; +std::pair, + std::unique_ptr> +InvertedIndexFileWriter::create_output_stream_v1(int64_t index_id, + const std::string& index_suffix) { + io::Path cfs_path(InvertedIndexDescriptor::get_index_file_path_v1(_index_path_prefix, index_id, + index_suffix)); + auto idx_path = 
cfs_path.parent_path(); + std::string idx_name = cfs_path.filename(); + + auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, idx_path.c_str()); + out_dir->set_file_writer_opts(_opts); + std::unique_ptr out_dir_ptr(out_dir); + + auto* out = out_dir->createOutput(idx_name.c_str()); + DBUG_EXECUTE_IF("InvertedIndexFileWriter::write_v1_out_dir_createOutput_nullptr", + { out = nullptr; }); + if (out == nullptr) { + LOG(WARNING) << "InvertedIndexFileWriter::create_output_stream_v1 error: CompoundDirectory " + "output is nullptr."; + _CLTHROWA(CL_ERR_IO, "Create CompoundDirectory output error"); + } + std::unique_ptr output(out); - // Next, write the file data - for (const auto& info : file_metadata) { - const std::string& file = std::get<0>(info); - auto* dir = std::get<3>(info); + return {std::move(out_dir_ptr), std::move(output)}; +} - // Write the actual file data - copyFile(file.c_str(), dir, compound_file_output.get(), header_buffer, buffer_length); +void InvertedIndexFileWriter::write_header_and_data_v1(lucene::store::IndexOutput* output, + const std::vector& sorted_files, + lucene::store::Directory* directory, + int64_t header_length, + int32_t header_file_count) { + output->writeVInt(sorted_files.size()); + int64_t data_offset = header_length; + const int64_t buffer_length = 16384; + uint8_t buffer[buffer_length]; + + for (int i = 0; i < sorted_files.size(); ++i) { + auto file = sorted_files[i]; + output->writeString(file.filename); + + // DataOffset + if (i < header_file_count) { + // file data write in header, so we set its offset to -1. 
+ output->writeLong(-1); + } else { + output->writeLong(data_offset); + } + output->writeLong(file.filesize); // FileLength + if (i < header_file_count) { + // append data + copyFile(file.filename.c_str(), directory, output, buffer, buffer_length); + } else { + data_offset += file.filesize; } + } - out_dir->close(); - // NOTE: need to decrease ref count, but not to delete here, - // because index cache may get the same directory from DIRECTORIES - _CLDECDELETE(out_dir) - _total_file_size = compound_file_output->getFilePointer(); - compound_file_output->close(); - _file_info.set_index_size(_total_file_size); - } catch (CLuceneError& err) { - LOG(ERROR) << "CLuceneError occur when close idx file " << index_path - << " error msg: " << err.what(); - if (compound_file_output) { - compound_file_output->close(); - compound_file_output.reset(); + for (size_t i = header_file_count; i < sorted_files.size(); ++i) { + copyFile(sorted_files[i].filename.c_str(), directory, output, buffer, buffer_length); + } +} + +std::pair, + std::unique_ptr> +InvertedIndexFileWriter::create_output_stream() { + io::Path index_path {InvertedIndexDescriptor::get_index_file_path_v2(_index_path_prefix)}; + + auto* out_dir = DorisFSDirectoryFactory::getDirectory(_fs, index_path.parent_path().c_str()); + out_dir->set_file_writer_opts(_opts); + std::unique_ptr out_dir_ptr(out_dir); + + DCHECK(_idx_v2_writer != nullptr) << "inverted index file writer v2 is nullptr"; + auto compound_file_output = std::unique_ptr( + out_dir->createOutputV2(_idx_v2_writer.get())); + + return {std::move(out_dir_ptr), std::move(compound_file_output)}; +} + +void InvertedIndexFileWriter::write_version_and_indices_count(lucene::store::IndexOutput* output) { + // Write the version number + output->writeInt(_storage_format); + + // Write the number of indices + const auto num_indices = static_cast(_indices_dirs.size()); + output->writeInt(num_indices); +} + +std::vector InvertedIndexFileWriter::prepare_file_metadata( + int64_t& 
current_offset) { + std::vector file_metadata; + + for (const auto& entry : _indices_dirs) { + const int64_t index_id = entry.first.first; + const auto& index_suffix = entry.first.second; + auto* dir = entry.second.get(); + + // Get sorted files + auto sorted_files = prepare_sorted_files(dir); + + for (const auto& file : sorted_files) { + file_metadata.emplace_back(index_id, index_suffix, file.filename, current_offset, + file.filesize, dir); + current_offset += file.filesize; // Update the data offset } - return Status::Error( - "CLuceneError occur when close idx file: {}, error msg: {}", index_path.c_str(), - err.what()); } - return Status::OK(); + return file_metadata; +} + +void InvertedIndexFileWriter::write_index_headers_and_metadata( + lucene::store::IndexOutput* output, const std::vector& file_metadata) { + // Group files by index_id and index_suffix + std::map, std::vector> indices; + + for (const auto& meta : file_metadata) { + indices[{meta.index_id, meta.index_suffix}].push_back(meta); + } + + for (const auto& index_entry : indices) { + int64_t index_id = index_entry.first.first; + const std::string& index_suffix = index_entry.first.second; + const auto& files = index_entry.second; + + // Write the index ID and the number of files + output->writeLong(index_id); + output->writeInt(static_cast(index_suffix.length())); + output->writeBytes(reinterpret_cast(index_suffix.data()), + index_suffix.length()); + output->writeInt(static_cast(files.size())); + + // Write file metadata + for (const auto& file : files) { + output->writeInt(static_cast(file.filename.length())); + output->writeBytes(reinterpret_cast(file.filename.data()), + file.filename.length()); + output->writeLong(file.offset); + output->writeLong(file.length); + } + } +} + +void InvertedIndexFileWriter::copy_files_data(lucene::store::IndexOutput* output, + const std::vector& file_metadata) { + const int64_t buffer_length = 16384; + uint8_t buffer[buffer_length]; + + for (const auto& meta : 
file_metadata) { + copyFile(meta.filename.c_str(), meta.directory, output, buffer, buffer_length); + } } } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h index 31e287d6dd3f71..ab7cdbff152460 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h @@ -28,7 +28,10 @@ #include "io/fs/file_system.h" #include "io/fs/file_writer.h" +#include "io/fs/local_file_system.h" +#include "olap/rowset/segment_v2/inverted_index_common.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" +#include "runtime/exec_env.h" namespace doris { class TabletIndex; @@ -36,7 +39,7 @@ class TabletIndex; namespace segment_v2 { class DorisFSDirectory; using InvertedIndexDirectoryMap = - std::map, std::unique_ptr>; + std::map, std::shared_ptr>; class InvertedIndexFileWriter; using InvertedIndexFileWriterPtr = std::unique_ptr; @@ -58,16 +61,19 @@ class InvertedIndexFileWriter { _rowset_id(std::move(rowset_id)), _seg_id(seg_id), _storage_format(storage_format), - _idx_v2_writer(std::move(file_writer)) {} + _local_fs(io::global_local_filesystem()), + _idx_v2_writer(std::move(file_writer)) { + auto tmp_file_dir = ExecEnv::GetInstance()->get_tmp_file_dirs()->get_tmp_file_dir(); + _tmp_dir = tmp_file_dir.native(); + } - Result open(const TabletIndex* index_meta); + Result> open(const TabletIndex* index_meta); Status delete_index(const TabletIndex* index_meta); Status initialize(InvertedIndexDirectoryMap& indices_dirs); - ~InvertedIndexFileWriter() = default; - Status write_v2(); + virtual ~InvertedIndexFileWriter() = default; + Status write(); Status write_v1(); Status close(); - int64_t headerLength(); const InvertedIndexFileInfo* get_index_file_info() const { DCHECK(_closed) << debug_string(); return &_file_info; @@ -77,11 +83,7 @@ class InvertedIndexFileWriter { return _total_file_size; } const 
io::FileSystemSPtr& get_fs() const { return _fs; } - void sort_files(std::vector& file_infos); - void copyFile(const char* fileName, lucene::store::Directory* dir, - lucene::store::IndexOutput* output, uint8_t* buffer, int64_t bufferLength); InvertedIndexStorageFormatPB get_storage_format() const { return _storage_format; } - void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = opts; } std::string debug_string() const { @@ -99,12 +101,62 @@ class InvertedIndexFileWriter { } private: + // Helper functions shared between write_v1 and write_v2 + std::vector prepare_sorted_files(lucene::store::Directory* directory); + void sort_files(std::vector& file_infos); + void copyFile(const char* fileName, lucene::store::Directory* dir, + lucene::store::IndexOutput* output, uint8_t* buffer, int64_t bufferLength); + void add_index_info(int64_t index_id, const std::string& index_suffix, + int64_t compound_file_size); + int64_t headerLength(); + // Helper functions specific to write_v1 + std::pair calculate_header_length(const std::vector& sorted_files, + lucene::store::Directory* directory); + virtual std::pair, + std::unique_ptr> + create_output_stream_v1(int64_t index_id, const std::string& index_suffix); + virtual void write_header_and_data_v1(lucene::store::IndexOutput* output, + const std::vector& sorted_files, + lucene::store::Directory* directory, + int64_t header_length, int32_t header_file_count); + // Helper functions specific to write_v2 + virtual std::pair, + std::unique_ptr> + create_output_stream(); + void write_version_and_indices_count(lucene::store::IndexOutput* output); + struct FileMetadata { + int64_t index_id; + std::string index_suffix; + std::string filename; + int64_t offset; + int64_t length; + lucene::store::Directory* directory; + + FileMetadata(int64_t id, const std::string& suffix, const std::string& file, int64_t off, + int64_t len, lucene::store::Directory* dir) + : index_id(id), + index_suffix(suffix), + filename(file), + 
offset(off), + length(len), + directory(dir) {} + }; + std::vector prepare_file_metadata(int64_t& current_offset); + virtual void write_index_headers_and_metadata(lucene::store::IndexOutput* output, + const std::vector& file_metadata); + void copy_files_data(lucene::store::IndexOutput* output, + const std::vector& file_metadata); + Status _insert_directory_into_map(int64_t index_id, const std::string& index_suffix, + std::shared_ptr dir); + // Member variables... InvertedIndexDirectoryMap _indices_dirs; const io::FileSystemSPtr _fs; std::string _index_path_prefix; std::string _rowset_id; int64_t _seg_id; InvertedIndexStorageFormatPB _storage_format; + std::string _tmp_dir; + const std::shared_ptr& _local_fs; // write to disk or stream io::FileWriterPtr _idx_v2_writer = nullptr; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp index ded71c8a6cc73e..fe0a81c41a6970 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.cpp @@ -83,39 +83,6 @@ namespace doris::segment_v2 { const char* const DorisFSDirectory::WRITE_LOCK_FILE = "write.lock"; -class DorisFSDirectory::FSIndexOutput : public lucene::store::BufferedIndexOutput { -protected: - void flushBuffer(const uint8_t* b, const int32_t size) override; - -public: - FSIndexOutput() = default; - void init(const io::FileSystemSPtr& fs, const char* path); - ~FSIndexOutput() override; - void close() override; - int64_t length() const override; - - void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = opts; } - -private: - io::FileWriterPtr _writer; - io::FileWriterOptions _opts; -}; - -class DorisFSDirectory::FSIndexOutputV2 : public lucene::store::BufferedIndexOutput { -private: - io::FileWriter* _index_v2_file_writer = nullptr; - -protected: - void flushBuffer(const uint8_t* b, const int32_t size) override; - -public: - FSIndexOutputV2() 
= default; - void init(io::FileWriter* file_writer); - ~FSIndexOutputV2() override; - void close() override; - int64_t length() const override; -}; - bool DorisFSDirectory::FSIndexInput::open(const io::FileSystemSPtr& fs, const char* path, IndexInput*& ret, CLuceneError& error, int32_t buffer_size, int64_t file_size) { @@ -219,6 +186,27 @@ void DorisFSDirectory::FSIndexInput::close() { }*/ } +void DorisFSDirectory::FSIndexInput::setIoContext(const void* io_ctx) { + if (io_ctx) { + const auto& ctx = static_cast(io_ctx); + _io_ctx.reader_type = ctx->reader_type; + _io_ctx.query_id = ctx->query_id; + _io_ctx.file_cache_stats = ctx->file_cache_stats; + } else { + _io_ctx.reader_type = ReaderType::UNKNOWN; + _io_ctx.query_id = nullptr; + _io_ctx.file_cache_stats = nullptr; + } +} + +const void* DorisFSDirectory::FSIndexInput::getIoContext() { + return &_io_ctx; +} + +void DorisFSDirectory::FSIndexInput::setIndexFile(bool isIndexFile) { + _io_ctx.is_index_data = isIndexFile; +} + void DorisFSDirectory::FSIndexInput::seekInternal(const int64_t position) { CND_PRECONDITION(position >= 0 && position < _handle->_length, "Seeking out of range"); _pos = position; @@ -239,9 +227,23 @@ void DorisFSDirectory::FSIndexInput::readInternal(uint8_t* b, const int32_t len) _handle->_fpos = _pos; } + DBUG_EXECUTE_IF( + "DorisFSDirectory::FSIndexInput::readInternal", ({ + static thread_local std::unordered_map + thread_file_cache_map; + auto it = thread_file_cache_map.find(_io_ctx.query_id); + if (it != thread_file_cache_map.end()) { + if (_io_ctx.file_cache_stats != it->second) { + _CLTHROWA(CL_ERR_IO, "File cache statistics mismatch"); + } + } else { + thread_file_cache_map[_io_ctx.query_id] = _io_ctx.file_cache_stats; + } + })); + Slice result {b, (size_t)len}; size_t bytes_read = 0; - auto st = _handle->_reader->read_at(_pos, result, &bytes_read, &_io_ctx); + Status st = _handle->_reader->read_at(_pos, result, &bytes_read, &_io_ctx); 
DBUG_EXECUTE_IF("DorisFSDirectory::FSIndexInput::readInternal_reader_read_at_error", { st = Status::InternalError( "debug point: DorisFSDirectory::FSIndexInput::readInternal_reader_read_at_error"); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h index 59ae6db1a9630d..41d9fb48356299 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_fs_directory.h @@ -180,8 +180,7 @@ class DorisFSDirectory::FSIndexInput : public lucene::store::BufferedIndexInput : BufferedIndexInput(buffer_size) { this->_pos = 0; this->_handle = std::move(handle); - this->_io_ctx.reader_type = ReaderType::READER_QUERY; - this->_io_ctx.is_index_data = false; + _io_ctx.is_inverted_index = true; } protected: @@ -199,8 +198,9 @@ class DorisFSDirectory::FSIndexInput : public lucene::store::BufferedIndexInput const char* getDirectoryType() const override { return DorisFSDirectory::getClassName(); } const char* getObjectName() const override { return getClassName(); } static const char* getClassName() { return "FSIndexInput"; } - - void setIdxFileCache(bool index) override { _io_ctx.is_index_data = index; } + void setIoContext(const void* io_ctx) override; + const void* getIoContext() override; + void setIndexFile(bool isIndexFile) override; std::mutex _this_lock; @@ -211,6 +211,39 @@ class DorisFSDirectory::FSIndexInput : public lucene::store::BufferedIndexInput void readInternal(uint8_t* b, const int32_t len) override; }; +class DorisFSDirectory::FSIndexOutput : public lucene::store::BufferedIndexOutput { +protected: + void flushBuffer(const uint8_t* b, const int32_t size) override; + +public: + FSIndexOutput() = default; + void init(const io::FileSystemSPtr& fs, const char* path); + ~FSIndexOutput() override; + void close() override; + int64_t length() const override; + + void set_file_writer_opts(const io::FileWriterOptions& opts) { _opts = 
opts; } + +private: + io::FileWriterPtr _writer; + io::FileWriterOptions _opts; +}; + +class DorisFSDirectory::FSIndexOutputV2 : public lucene::store::BufferedIndexOutput { +private: + io::FileWriter* _index_v2_file_writer = nullptr; + +protected: + void flushBuffer(const uint8_t* b, const int32_t size) override; + +public: + FSIndexOutputV2() = default; + void init(io::FileWriter* file_writer); + ~FSIndexOutputV2() override; + void close() override; + int64_t length() const override; +}; + /** * Factory function to create DorisFSDirectory */ diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp index b7cfe7dfaffb31..b40f9121125207 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.cpp @@ -102,7 +102,8 @@ std::string InvertedIndexReader::get_index_file_path() { return _inverted_index_file_reader->get_index_file_path(&_index_meta); } -Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, +Status InvertedIndexReader::read_null_bitmap(const io::IOContext* io_ctx, + OlapReaderStatistics* stats, InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir) { SCOPED_RAW_TIMER(&stats->inverted_index_query_null_bitmap_timer); @@ -120,9 +121,8 @@ Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, if (!dir) { // TODO: ugly code here, try to refact. 
- bool open_idx_file_cache = true; auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, - open_idx_file_cache); + io_ctx); if (!st.ok()) { LOG(WARNING) << st; return st; @@ -165,7 +165,8 @@ Status InvertedIndexReader::read_null_bitmap(OlapReaderStatistics* stats, } Status InvertedIndexReader::handle_searcher_cache( - InvertedIndexCacheHandle* inverted_index_cache_handle, OlapReaderStatistics* stats) { + InvertedIndexCacheHandle* inverted_index_cache_handle, const io::IOContext* io_ctx, + OlapReaderStatistics* stats) { auto index_file_key = _inverted_index_file_reader->get_index_file_cache_key(&_index_meta); InvertedIndexSearcherCache::CacheKey searcher_cache_key(index_file_key); if (InvertedIndexSearcherCache::instance()->lookup(searcher_cache_key, @@ -179,9 +180,8 @@ Status InvertedIndexReader::handle_searcher_cache( SCOPED_RAW_TIMER(&stats->inverted_index_searcher_open_timer); IndexSearcherPtr searcher; - bool open_idx_file_cache = true; - auto st = _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, - open_idx_file_cache); + auto st = + _inverted_index_file_reader->init(config::inverted_index_read_buffer_size, io_ctx); if (!st.ok()) { LOG(WARNING) << st; return st; @@ -191,7 +191,7 @@ Status InvertedIndexReader::handle_searcher_cache( // to avoid open directory additionally for null_bitmap // TODO: handle null bitmap procedure in new format. 
InvertedIndexQueryCacheHandle null_bitmap_cache_handle; - static_cast(read_null_bitmap(stats, &null_bitmap_cache_handle, dir.get())); + static_cast(read_null_bitmap(io_ctx, stats, &null_bitmap_cache_handle, dir.get())); RETURN_IF_ERROR(create_index_searcher(dir.release(), &searcher, mem_tracker.get(), type())); auto* cache_value = new InvertedIndexSearcherCache::CacheValue( std::move(searcher), mem_tracker->consumption(), UnixMillis()); @@ -211,22 +211,24 @@ Status InvertedIndexReader::create_index_searcher(lucene::store::Directory* dir, auto searcher_result = DORIS_TRY(index_searcher_builder->get_index_searcher(dir)); *searcher = searcher_result; - if (std::string(dir->getObjectName()) == "DorisCompoundReader") { - static_cast(dir)->getDorisIndexInput()->setIdxFileCache(false); - } + + // When the meta information has been read, the ioContext needs to be reset to prevent it from being used by other queries. + static_cast(dir)->getDorisIndexInput()->setIoContext(nullptr); + // NOTE: before mem_tracker hook becomes active, we caculate reader memory size by hand. 
mem_tracker->consume(index_searcher_builder->get_reader_size()); return Status::OK(); }; Status InvertedIndexReader::match_index_search( - OlapReaderStatistics* stats, RuntimeState* runtime_state, InvertedIndexQueryType query_type, - const InvertedIndexQueryInfo& query_info, const FulltextIndexSearcherPtr& index_searcher, + const io::IOContext* io_ctx, OlapReaderStatistics* stats, RuntimeState* runtime_state, + InvertedIndexQueryType query_type, const InvertedIndexQueryInfo& query_info, + const FulltextIndexSearcherPtr& index_searcher, const std::shared_ptr& term_match_bitmap) { TQueryOptions queryOptions = runtime_state->query_options(); try { SCOPED_RAW_TIMER(&stats->inverted_index_searcher_search_timer); - auto query = QueryFactory::create(query_type, index_searcher, queryOptions); + auto query = QueryFactory::create(query_type, index_searcher, queryOptions, io_ctx); if (!query) { return Status::Error( "query type " + query_type_to_string(query_type) + ", query is nullptr"); @@ -240,15 +242,17 @@ Status InvertedIndexReader::match_index_search( return Status::OK(); } -Status FullTextIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, +Status FullTextIndexReader::new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); + *iterator = + InvertedIndexIterator::create_unique(io_ctx, stats, runtime_state, shared_from_this()); return Status::OK(); } -Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, +Status FullTextIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) 
{ SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); @@ -314,12 +318,12 @@ Status FullTextIndexReader::query(OlapReaderStatistics* stats, RuntimeState* run FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); + RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); searcher_ptr = std::get_if(&searcher_variant); if (searcher_ptr != nullptr) { term_match_bitmap = std::make_shared(); - RETURN_IF_ERROR(match_index_search(stats, runtime_state, query_type, query_info, + RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, query_type, query_info, *searcher_ptr, term_match_bitmap)); term_match_bitmap->runOptimize(); cache->insert(cache_key, term_match_bitmap, &cache_handler); @@ -337,13 +341,15 @@ InvertedIndexReaderType FullTextIndexReader::type() { } Status StringTypeInvertedIndexReader::new_iterator( - OlapReaderStatistics* stats, RuntimeState* runtime_state, + const io::IOContext& io_ctx, OlapReaderStatistics* stats, RuntimeState* runtime_state, std::unique_ptr* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); + *iterator = + InvertedIndexIterator::create_unique(io_ctx, stats, runtime_state, shared_from_this()); return Status::OK(); } -Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, +Status StringTypeInvertedIndexReader::query(const io::IOContext* io_ctx, + OlapReaderStatistics* stats, RuntimeState* runtime_state, const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, @@ -387,7 +393,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, auto result = std::make_shared(); FulltextIndexSearcherPtr* searcher_ptr = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - 
RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); + RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); searcher_ptr = std::get_if(&searcher_variant); if (searcher_ptr != nullptr) { @@ -396,7 +402,7 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, case InvertedIndexQueryType::MATCH_ANY_QUERY: case InvertedIndexQueryType::MATCH_ALL_QUERY: case InvertedIndexQueryType::EQUAL_QUERY: { - RETURN_IF_ERROR(match_index_search(stats, runtime_state, + RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, InvertedIndexQueryType::MATCH_ANY_QUERY, query_info, *searcher_ptr, result)); break; @@ -404,8 +410,8 @@ Status StringTypeInvertedIndexReader::query(OlapReaderStatistics* stats, case InvertedIndexQueryType::MATCH_PHRASE_QUERY: case InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY: case InvertedIndexQueryType::MATCH_REGEXP_QUERY: { - RETURN_IF_ERROR(match_index_search(stats, runtime_state, query_type, query_info, - *searcher_ptr, result)); + RETURN_IF_ERROR(match_index_search(io_ctx, stats, runtime_state, query_type, + query_info, *searcher_ptr, result)); break; } case InvertedIndexQueryType::LESS_THAN_QUERY: @@ -470,9 +476,11 @@ InvertedIndexReaderType StringTypeInvertedIndexReader::type() { return InvertedIndexReaderType::STRING_TYPE; } -Status BkdIndexReader::new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, +Status BkdIndexReader::new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) { - *iterator = InvertedIndexIterator::create_unique(stats, runtime_state, shared_from_this()); + *iterator = + InvertedIndexIterator::create_unique(io_ctx, stats, runtime_state, shared_from_this()); return Status::OK(); } @@ -600,12 +608,12 @@ Status BkdIndexReader::invoke_bkd_query(const void* query_value, InvertedIndexQu 
return Status::OK(); } -Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) { +Status BkdIndexReader::try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) { try { std::shared_ptr r; - auto st = get_bkd_reader(r, stats); + auto st = get_bkd_reader(r, io_ctx, stats); if (!st.ok()) { LOG(WARNING) << "get bkd reader for " << _inverted_index_file_reader->get_index_file_path(&_index_meta) @@ -637,15 +645,15 @@ Status BkdIndexReader::try_query(OlapReaderStatistics* stats, const std::string& return Status::OK(); } -Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, +Status BkdIndexReader::query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) { SCOPED_RAW_TIMER(&stats->inverted_index_query_timer); try { std::shared_ptr r; - auto st = get_bkd_reader(r, stats); + auto st = get_bkd_reader(r, io_ctx, stats); if (!st.ok()) { LOG(WARNING) << "get bkd reader for " << _inverted_index_file_reader->get_index_file_path(&_index_meta) @@ -681,11 +689,11 @@ Status BkdIndexReader::query(OlapReaderStatistics* stats, RuntimeState* runtime_ } } -Status BkdIndexReader::get_bkd_reader(BKDIndexSearcherPtr& bkd_reader, +Status BkdIndexReader::get_bkd_reader(BKDIndexSearcherPtr& bkd_reader, const io::IOContext* io_ctx, OlapReaderStatistics* stats) { BKDIndexSearcherPtr* bkd_searcher = nullptr; InvertedIndexCacheHandle inverted_index_cache_handle; - RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, stats)); + 
RETURN_IF_ERROR(handle_searcher_cache(&inverted_index_cache_handle, io_ctx, stats)); auto searcher_variant = inverted_index_cache_handle.get_index_searcher(); bkd_searcher = std::get_if(&searcher_variant); if (bkd_searcher) { @@ -1115,8 +1123,8 @@ Status InvertedIndexIterator::read_from_inverted_index( } } - RETURN_IF_ERROR( - _reader->query(_stats, _runtime_state, column_name, query_value, query_type, bit_map)); + RETURN_IF_ERROR(_reader->query(&_io_ctx, _stats, _runtime_state, column_name, query_value, + query_type, bit_map)); return Status::OK(); } @@ -1130,7 +1138,8 @@ Status InvertedIndexIterator::try_read_from_inverted_index(const std::string& co query_type == InvertedIndexQueryType::LESS_EQUAL_QUERY || query_type == InvertedIndexQueryType::LESS_THAN_QUERY || query_type == InvertedIndexQueryType::EQUAL_QUERY) { - RETURN_IF_ERROR(_reader->try_query(_stats, column_name, query_value, query_type, count)); + RETURN_IF_ERROR( + _reader->try_query(&_io_ctx, _stats, column_name, query_value, query_type, count)); } return Status::OK(); } @@ -1148,4 +1157,5 @@ template class InvertedIndexVisitor; template class InvertedIndexVisitor; template class InvertedIndexVisitor; template class InvertedIndexVisitor; + } // namespace doris::segment_v2 diff --git a/be/src/olap/rowset/segment_v2/inverted_index_reader.h b/be/src/olap/rowset/segment_v2/inverted_index_reader.h index d3a0ff3cf118ba..a1445603286619 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_reader.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_reader.h @@ -182,17 +182,18 @@ class InvertedIndexReader : public std::enable_shared_from_this* iterator) = 0; - virtual Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + virtual Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, 
InvertedIndexQueryType query_type, std::shared_ptr& bit_map) = 0; - virtual Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) = 0; + virtual Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) = 0; - Status read_null_bitmap(OlapReaderStatistics* stats, + Status read_null_bitmap(const io::IOContext* io_ctx, OlapReaderStatistics* stats, InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir = nullptr); @@ -223,15 +224,15 @@ class InvertedIndexReader : public std::enable_shared_from_this& term_match_bitmap); @@ -253,15 +254,16 @@ class FullTextIndexReader : public InvertedIndexReader { : InvertedIndexReader(index_meta, inverted_index_file_reader) {} ~FullTextIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, + Status new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) override; - Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; - Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) override { + Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) override { return Status::Error( 
"FullTextIndexReader not support try_query"); } @@ -279,15 +281,16 @@ class StringTypeInvertedIndexReader : public InvertedIndexReader { : InvertedIndexReader(index_meta, inverted_index_file_reader) {} ~StringTypeInvertedIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, + Status new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) override; - Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; - Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) override { + Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) override { return Status::Error( "StringTypeInvertedIndexReader not support try_query"); } @@ -338,16 +341,17 @@ class BkdIndexReader : public InvertedIndexReader { : InvertedIndexReader(index_meta, inverted_index_file_reader) {} ~BkdIndexReader() override = default; - Status new_iterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, + Status new_iterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::unique_ptr* iterator) override; - Status query(OlapReaderStatistics* stats, RuntimeState* runtime_state, - const std::string& column_name, const void* query_value, - InvertedIndexQueryType query_type, + Status query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + 
RuntimeState* runtime_state, const std::string& column_name, + const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr& bit_map) override; - Status try_query(OlapReaderStatistics* stats, const std::string& column_name, - const void* query_value, InvertedIndexQueryType query_type, - uint32_t* count) override; + Status try_query(const io::IOContext* io_ctx, OlapReaderStatistics* stats, + const std::string& column_name, const void* query_value, + InvertedIndexQueryType query_type, uint32_t* count) override; Status invoke_bkd_try_query(const void* query_value, InvertedIndexQueryType query_type, std::shared_ptr r, uint32_t* count); Status invoke_bkd_query(const void* query_value, InvertedIndexQueryType query_type, @@ -359,7 +363,8 @@ class BkdIndexReader : public InvertedIndexReader { InvertedIndexVisitor* visitor); InvertedIndexReaderType type() override; - Status get_bkd_reader(BKDIndexSearcherPtr& reader, OlapReaderStatistics* stats); + Status get_bkd_reader(BKDIndexSearcherPtr& reader, const io::IOContext* io_ctx, + OlapReaderStatistics* stats); private: const TypeInfo* _type_info {}; @@ -447,9 +452,12 @@ class InvertedIndexIterator { ENABLE_FACTORY_CREATOR(InvertedIndexIterator); public: - InvertedIndexIterator(OlapReaderStatistics* stats, RuntimeState* runtime_state, - std::shared_ptr reader) - : _stats(stats), _runtime_state(runtime_state), _reader(std::move(reader)) {} + InvertedIndexIterator(const io::IOContext& io_ctx, OlapReaderStatistics* stats, + RuntimeState* runtime_state, std::shared_ptr reader) + : _io_ctx(io_ctx), + _stats(stats), + _runtime_state(runtime_state), + _reader(std::move(reader)) {} Status read_from_inverted_index(const std::string& column_name, const void* query_value, InvertedIndexQueryType query_type, uint32_t segment_num_rows, @@ -460,7 +468,7 @@ class InvertedIndexIterator { Status read_null_bitmap(InvertedIndexQueryCacheHandle* cache_handle, lucene::store::Directory* dir = nullptr) { - return 
_reader->read_null_bitmap(_stats, cache_handle, dir); + return _reader->read_null_bitmap(&_io_ctx, _stats, cache_handle, dir); } [[nodiscard]] InvertedIndexReaderType get_inverted_index_reader_type() const; @@ -470,6 +478,7 @@ class InvertedIndexIterator { const InvertedIndexReaderPtr& reader() { return _reader; } private: + io::IOContext _io_ctx; OlapReaderStatistics* _stats = nullptr; RuntimeState* _runtime_state = nullptr; std::shared_ptr _reader; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp b/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp index de8b494cd8be6d..5dfbd984813fd8 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_searcher.cpp @@ -34,6 +34,9 @@ Status FulltextIndexSearcherBuilder::build(lucene::store::Directory* directory, reader = lucene::index::IndexReader::open( directory, config::inverted_index_read_buffer_size, close_directory); } catch (const CLuceneError& e) { + std::vector file_names; + directory->list(&file_names); + LOG(ERROR) << fmt::format("Directory list: {}", fmt::join(file_names, ", ")); std::string msg = "FulltextIndexSearcherBuilder build error: " + std::string(e.what()); if (e.number() == CL_ERR_EmptyIndexSegment) { return Status::Error(msg); diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 29fe4609e59e9c..02edf2f1976e3e 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -51,6 +51,7 @@ #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h" #include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h" +#include "olap/rowset/segment_v2/inverted_index_common.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" #include "olap/rowset/segment_v2/inverted_index_file_writer.h" #include 
"olap/rowset/segment_v2/inverted_index_fs_directory.h" @@ -63,11 +64,6 @@ #include "util/slice.h" #include "util/string_util.h" -#define FINALLY_CLOSE_OUTPUT(x) \ - try { \ - if (x != nullptr) x->close(); \ - } catch (...) { \ - } namespace doris::segment_v2 { const int32_t MAX_FIELD_LEN = 0x7FFFFFFFL; const int32_t MERGE_FACTOR = 100000000; @@ -138,13 +134,6 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { } } - void close() { - if (_index_writer) { - _index_writer->close(); - _index_writer.reset(); - } - } - void close_on_error() override { try { DBUG_EXECUTE_IF("InvertedIndexColumnWriter::close_on_error_throw_exception", @@ -197,7 +186,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { bool create_index = true; bool close_dir_on_shutdown = true; auto index_writer = std::make_unique( - _dir, _analyzer.get(), create_index, close_dir_on_shutdown); + _dir.get(), _analyzer.get(), create_index, close_dir_on_shutdown); DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_index_writer_setRAMBufferSizeMB_error", { index_writer->setRAMBufferSizeMB(-100); }) DBUG_EXECUTE_IF("InvertedIndexColumnWriter::create_index_writer_setMaxBufferedDocs_error", @@ -223,6 +212,28 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { (*field)->setOmitTermFreqAndPositions( !(get_parser_phrase_support_string_from_properties(_index_meta->properties()) == INVERTED_INDEX_PARSER_PHRASE_SUPPORT_YES)); + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::create_field_v3", { + if (_index_file_writer->get_storage_format() != InvertedIndexStorageFormatPB::V3) { + return Status::Error( + "debug point: InvertedIndexColumnWriterImpl::create_field_v3 error"); + } + }) + if (_index_file_writer->get_storage_format() >= InvertedIndexStorageFormatPB::V3) { + (*field)->setIndexVersion(IndexVersion::kV3); + // Only effective in v3 + std::string dict_compression = + get_parser_dict_compression_from_properties(_index_meta->properties()); + 
DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::create_field_dic_compression", { + if (dict_compression != INVERTED_INDEX_PARSER_TRUE) { + return Status::Error( + "debug point: " + "InvertedIndexColumnWriterImpl::create_field_dic_compression error"); + } + }) + if (dict_compression == INVERTED_INDEX_PARSER_TRUE) { + (*field)->updateFlag(FlagBits::DICT_COMPRESS); + } + } return Status::OK(); } @@ -618,7 +629,6 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { buf.resize(size); _null_bitmap.write(reinterpret_cast(buf.data()), false); null_bitmap_out->writeBytes(buf.data(), size); - null_bitmap_out->close(); } } @@ -628,6 +638,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { std::unique_ptr data_out = nullptr; std::unique_ptr index_out = nullptr; std::unique_ptr meta_out = nullptr; + ErrorContext error_context; try { // write bkd file if constexpr (field_is_numeric_type(field_type)) { @@ -656,16 +667,11 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { << "Inverted index writer create output error occurred: nullptr"; _CLTHROWA(CL_ERR_IO, "Create output error with nullptr"); } - meta_out->close(); - data_out->close(); - index_out->close(); - _dir->close(); } else if constexpr (field_is_slice_type(field_type)) { null_bitmap_out = std::unique_ptr< lucene::store::IndexOutput>(_dir->createOutput( InvertedIndexDescriptor::get_temporary_null_bitmap_file_name())); write_null_bitmap(null_bitmap_out.get()); - close(); DBUG_EXECUTE_IF( "InvertedIndexWriter._throw_clucene_error_in_fulltext_writer_close", { _CLTHROWA(CL_ERR_IO, @@ -673,19 +679,24 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { }); } } catch (CLuceneError& e) { - FINALLY_CLOSE_OUTPUT(null_bitmap_out) - FINALLY_CLOSE_OUTPUT(meta_out) - FINALLY_CLOSE_OUTPUT(data_out) - FINALLY_CLOSE_OUTPUT(index_out) + error_context.eptr = std::current_exception(); + error_context.err_msg.append("Inverted index writer finish 
error occurred: "); + error_context.err_msg.append(e.what()); + LOG(ERROR) << error_context.err_msg; + } + FINALLY({ + FINALLY_CLOSE(null_bitmap_out); + FINALLY_CLOSE(meta_out); + FINALLY_CLOSE(data_out); + FINALLY_CLOSE(index_out); if constexpr (field_is_numeric_type(field_type)) { - FINALLY_CLOSE_OUTPUT(_dir) + FINALLY_CLOSE(_dir); } else if constexpr (field_is_slice_type(field_type)) { - FINALLY_CLOSE_OUTPUT(_index_writer); + FINALLY_CLOSE(_index_writer); + // After closing the _index_writer, it needs to be reset to null to prevent issues of not closing it or closing it multiple times. + _index_writer.reset(); } - LOG(WARNING) << "Inverted index writer finish error occurred: " << e.what(); - return Status::Error( - "Inverted index writer finish error occurred:{}", e.what()); - } + }) return Status::OK(); } @@ -708,7 +719,7 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter { std::unique_ptr _char_string_reader = nullptr; std::shared_ptr _bkd_writer = nullptr; InvertedIndexCtxSPtr _inverted_index_ctx = nullptr; - DorisFSDirectory* _dir = nullptr; + std::shared_ptr _dir = nullptr; const KeyCoder* _value_key_coder; const TabletIndex* _index_meta; InvertedIndexParserType _parser_type; diff --git a/be/src/olap/rowset/segment_v2/page_handle.h b/be/src/olap/rowset/segment_v2/page_handle.h index b1e53ee808697e..d4dfdfb2ff3c55 100644 --- a/be/src/olap/rowset/segment_v2/page_handle.h +++ b/be/src/olap/rowset/segment_v2/page_handle.h @@ -23,6 +23,10 @@ #include "util/slice.h" // for Slice namespace doris { + +// After disable page cache, sometimes we need to know the percentage of data pages in query memory. +inline bvar::Adder g_page_no_cache_mem_bytes("doris_page_no_cache_mem_bytes"); + namespace segment_v2 { // When a column page is read into memory, we use this to store it. @@ -37,8 +41,7 @@ class PageHandle { // This class will take the ownership of input data's memory. It will // free it when deconstructs. 
PageHandle(DataPage* data) : _is_data_owner(true), _data(data) { - _page_tracker = ExecEnv::GetInstance()->page_no_cache_mem_tracker(); - _page_tracker->consume(_data->capacity()); + g_page_no_cache_mem_bytes << _data->capacity(); } // This class will take the content of cache data, and will make input @@ -51,20 +54,18 @@ class PageHandle { // we can use std::exchange if we switch c++14 on std::swap(_is_data_owner, other._is_data_owner); std::swap(_data, other._data); - _page_tracker = ExecEnv::GetInstance()->page_no_cache_mem_tracker(); } PageHandle& operator=(PageHandle&& other) noexcept { std::swap(_is_data_owner, other._is_data_owner); std::swap(_data, other._data); _cache_data = std::move(other._cache_data); - _page_tracker = ExecEnv::GetInstance()->page_no_cache_mem_tracker(); return *this; } ~PageHandle() { if (_is_data_owner) { - _page_tracker->release(_data->capacity()); + g_page_no_cache_mem_bytes << -_data->capacity(); delete _data; } else { DCHECK(_data == nullptr); @@ -85,7 +86,6 @@ class PageHandle { // otherwise _cache_data is valid, and data is belong to cache. 
bool _is_data_owner = false; DataPage* _data = nullptr; - std::shared_ptr _page_tracker; PageCacheHandle _cache_data; // Don't allow copy and assign diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 0ad799683fc458..513c0be4f8cd14 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -163,7 +163,11 @@ Segment::Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr table _tablet_schema(std::move(tablet_schema)), _idx_file_info(idx_file_info) {} -Segment::~Segment() = default; +Segment::~Segment() { + g_segment_estimate_mem_bytes << -_tracked_meta_mem_usage; + // if failed, fix `_tracked_meta_mem_usage` accuracy + DCHECK(_tracked_meta_mem_usage == meta_mem_usage()); +} io::UInt128Wrapper Segment::file_cache_key(std::string_view rowset_id, uint32_t seg_id) { return io::BlockFileCache::hash(fmt::format("{}_{}.dat", rowset_id, seg_id)); @@ -174,6 +178,12 @@ int64_t Segment::get_metadata_size() const { (_pk_index_meta ? 
_pk_index_meta->ByteSizeLong() : 0); } +void Segment::update_metadata_size() { + MetadataAdder::update_metadata_size(); + g_segment_estimate_mem_bytes << _meta_mem_usage - _tracked_meta_mem_usage; + _tracked_meta_mem_usage = _meta_mem_usage; +} + Status Segment::_open() { _footer_pb = std::make_unique(); RETURN_IF_ERROR(_parse_footer(_footer_pb.get())); @@ -191,8 +201,6 @@ Status Segment::_open() { _meta_mem_usage += _pk_index_meta->ByteSizeLong(); } - update_metadata_size(); - _meta_mem_usage += sizeof(*this); _meta_mem_usage += _tablet_schema->num_columns() * config::estimated_mem_per_column_reader; @@ -201,6 +209,8 @@ Status Segment::_open() { // 0.01 comes from PrimaryKeyIndexBuilder::init _meta_mem_usage += BloomFilter::optimal_bit_num(_num_rows, 0.01) / 8; + update_metadata_size(); + return Status::OK(); } @@ -467,6 +477,7 @@ Status Segment::_load_pk_bloom_filter() { // for BE UT "segment_cache_test" return _load_pk_bf_once.call([this] { _meta_mem_usage += 100; + update_metadata_size(); return Status::OK(); }); } @@ -955,7 +966,7 @@ Status Segment::lookup_row_key(const Slice& key, const TabletSchema* latest_sche std::string* encoded_seq_value, OlapReaderStatistics* stats) { RETURN_IF_ERROR(load_pk_index_and_bf()); bool has_seq_col = latest_schema->has_sequence_col(); - bool has_rowid = !latest_schema->cluster_key_idxes().empty(); + bool has_rowid = !latest_schema->cluster_key_uids().empty(); size_t seq_col_length = 0; if (has_seq_col) { seq_col_length = latest_schema->column(latest_schema->sequence_col_idx()).length() + 1; @@ -1065,7 +1076,7 @@ Status Segment::read_key_by_rowid(uint32_t row_id, std::string* key) { RETURN_IF_ERROR(iter->next_batch(&num_read, index_column)); CHECK(num_read == 1); // trim row id - if (_tablet_schema->cluster_key_idxes().empty()) { + if (_tablet_schema->cluster_key_uids().empty()) { *key = index_column->get_data_at(0).to_string(); } else { Slice sought_key = diff --git a/be/src/olap/rowset/segment_v2/segment.h 
b/be/src/olap/rowset/segment_v2/segment.h index bc5ab1e1fdc80a..1b20c1f066bdf9 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -57,7 +57,6 @@ class IDataType; class ShortKeyIndexDecoder; class Schema; class StorageReadOptions; -class MemTracker; class PrimaryKeyIndexReader; class RowwiseIterator; struct RowLocation; @@ -93,6 +92,7 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd ~Segment(); int64_t get_metadata_size() const override; + void update_metadata_size(); Status new_iterator(SchemaSPtr schema, const StorageReadOptions& read_options, std::unique_ptr* iter); @@ -163,6 +163,8 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd io::FileReaderSPtr file_reader() { return _file_reader; } + // Including the column reader memory. + // another method `get_metadata_size` not include the column reader, only the segment object itself. int64_t meta_mem_usage() const { return _meta_mem_usage; } // Identify the column by unique id or path info @@ -249,9 +251,8 @@ class Segment : public std::enable_shared_from_this, public MetadataAdd // 1. Tracking memory use by segment meta data such as footer or index page. // 2. Tracking memory use by segment column reader // The memory consumed by querying is tracked in segment iterator. - // TODO: Segment::_meta_mem_usage Unknown value overflow, causes the value of SegmentMeta mem tracker - // is similar to `-2912341218700198079`. So, temporarily put it in experimental type tracker. 
int64_t _meta_mem_usage; + int64_t _tracked_meta_mem_usage = 0; RowsetId _rowset_id; TabletSchemaSPtr _tablet_schema; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 5b1bfaf076279f..abdf9116756f0e 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -377,7 +377,7 @@ Status SegmentIterator::_lazy_init() { _row_bitmap.addRange(0, _segment->num_rows()); // z-order can not use prefix index if (_segment->_tablet_schema->sort_type() != SortType::ZORDER && - _segment->_tablet_schema->cluster_key_idxes().empty()) { + _segment->_tablet_schema->cluster_key_uids().empty()) { RETURN_IF_ERROR(_get_row_ranges_by_keys()); } RETURN_IF_ERROR(_get_row_ranges_by_column_conditions()); @@ -1193,7 +1193,7 @@ Status SegmentIterator::_lookup_ordinal_from_pk_index(const RowCursor& key, bool bool has_seq_col = _segment->_tablet_schema->has_sequence_col(); // Used to get key range from primary key index, // for mow with cluster key table, we should get key range from short key index. 
- DCHECK(_segment->_tablet_schema->cluster_key_idxes().empty()); + DCHECK(_segment->_tablet_schema->cluster_key_uids().empty()); // if full key is exact_match, the primary key without sequence column should also the same if (has_seq_col && !exact_match) { @@ -1998,6 +1998,12 @@ Status SegmentIterator::copy_column_data_by_selector(vectorized::IColumn* input_ return input_col_ptr->filter_by_selector(sel_rowid_idx, select_size, output_col); } +void SegmentIterator::_clear_iterators() { + _column_iterators.clear(); + _bitmap_index_iterators.clear(); + _inverted_index_iterators.clear(); +} + Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { bool is_mem_reuse = block->mem_reuse(); DCHECK(is_mem_reuse); @@ -2104,6 +2110,8 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { } } block->clear_column_data(); + // clear and release iterators memory footprint in advance + _clear_iterators(); return Status::EndOfFile("no more data in segment"); } @@ -2167,11 +2175,11 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { if (block->rows() == 0) { vectorized::MutableColumnPtr col0 = std::move(*block->get_by_position(0).column).mutate(); - auto res_column = vectorized::ColumnString::create(); - res_column->insert_data("", 0); - auto col_const = vectorized::ColumnConst::create(std::move(res_column), - selected_size); - block->replace_by_position(0, std::move(col_const)); + auto tmp_indicator_col = + block->get_by_position(0) + .type->create_column_const_with_default_value( + selected_size); + block->replace_by_position(0, std::move(tmp_indicator_col)); _output_index_result_column_for_expr(_sel_rowid_idx.data(), selected_size, block); block->shrink_char_type_column_suffix_zero(_char_type_idx_no_0); @@ -2205,17 +2213,23 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { _sel_rowid_idx[i] = i; } + // Here we just use col0 as row_number indicator. 
when we reach here, we will calculate the predicates first. + // then use the result to reduce our data read (that is, expr push down). there's no row in block, which means the first + // column is not in common expr. so it's safe to replace it temporarily to provide a correct `selected_size`. if (block->rows() == 0) { vectorized::MutableColumnPtr col0 = std::move(*block->get_by_position(0).column).mutate(); - auto res_column = vectorized::ColumnString::create(); - res_column->insert_data("", 0); - auto col_const = - vectorized::ColumnConst::create(std::move(res_column), selected_size); - block->replace_by_position(0, std::move(col_const)); + // temporarily replace the column with a row number indicator. using a ColumnConst is more efficient than + // insert_many_default + auto tmp_indicator_col = + block->get_by_position(0).type->create_column_const_with_default_value( + selected_size); + block->replace_by_position(0, std::move(tmp_indicator_col)); + _output_index_result_column_for_expr(_sel_rowid_idx.data(), selected_size, block); block->shrink_char_type_column_suffix_zero(_char_type_idx_no_0); RETURN_IF_ERROR(_execute_common_expr(_sel_rowid_idx.data(), selected_size, block)); + // now recover the original col0 block->replace_by_position(0, std::move(col0)); } else { _output_index_result_column_for_expr(_sel_rowid_idx.data(), selected_size, block); @@ -2258,8 +2272,10 @@ Status SegmentIterator::_next_batch_internal(vectorized::Block* block) { size_t rows = block->rows(); for (const auto& entry : *block) { if (entry.column->size() != rows) { - throw doris::Exception(ErrorCode::INTERNAL_ERROR, "unmatched size {}, expected {}", - entry.column->size(), rows); + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "unmatched size {}, expected {}, column: {}, type: {}", + entry.column->size(), rows, entry.column->get_name(), + entry.type->get_name()); } } #endif diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index 
5588661302dd06..5b4c8f6d73d0cd 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -383,6 +383,8 @@ class SegmentIterator : public RowwiseIterator { void _calculate_expr_in_remaining_conjunct_root(); + void _clear_iterators(); + class BitmapRangeIterator; class BackwardBitmapRangeIterator; diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 09ff3f6ed3be86..fe465f98a2aad2 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -103,7 +103,7 @@ SegmentWriter::SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, << ", table_id=" << _tablet_schema->table_id() << ", num_key_columns=" << _num_sort_key_columns << ", num_short_key_columns=" << _num_short_key_columns - << ", cluster_key_columns=" << _tablet_schema->cluster_key_idxes().size(); + << ", cluster_key_columns=" << _tablet_schema->cluster_key_uids().size(); } for (size_t cid = 0; cid < _num_sort_key_columns; ++cid) { const auto& column = _tablet_schema->column(cid); @@ -125,8 +125,8 @@ SegmentWriter::SegmentWriter(io::FileWriter* file_writer, uint32_t segment_id, // cluster keys _key_coders.clear(); _key_index_size.clear(); - _num_sort_key_columns = _tablet_schema->cluster_key_idxes().size(); - for (auto cid : _tablet_schema->cluster_key_idxes()) { + _num_sort_key_columns = _tablet_schema->cluster_key_uids().size(); + for (auto cid : _tablet_schema->cluster_key_uids()) { const auto& column = _tablet_schema->column_by_uid(cid); _key_coders.push_back(get_key_coder(column.type())); _key_index_size.push_back(column.index_length()); @@ -253,10 +253,10 @@ Status SegmentWriter::_create_column_writer(uint32_t cid, const TabletColumn& co opts.data_page_size = storage_page_size; } DBUG_EXECUTE_IF("VerticalSegmentWriter._create_column_writer.storage_page_size", { - auto table_id = 
DebugPoints::instance()->get_debug_param_or_default( + auto table_id = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "table_id", INT_MIN); - auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( + auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "storage_page_size", INT_MIN); if (table_id == INT_MIN || target_data_page_size == INT_MIN) { @@ -545,6 +545,39 @@ Status SegmentWriter::probe_key_for_mow( return Status::OK(); } +Status SegmentWriter::partial_update_preconditions_check(size_t row_pos) { + if (!_is_mow()) { + auto msg = fmt::format( + "Can only do partial update on merge-on-write unique table, but found: " + "keys_type={}, _opts.enable_unique_key_merge_on_write={}, tablet_id={}", + _tablet_schema->keys_type(), _opts.enable_unique_key_merge_on_write, + _tablet->tablet_id()); + DCHECK(false) << msg; + return Status::InternalError(msg); + } + if (_opts.rowset_ctx->partial_update_info == nullptr) { + auto msg = + fmt::format("partial_update_info should not be nullptr, please check, tablet_id={}", + _tablet->tablet_id()); + DCHECK(false) << msg; + return Status::InternalError(msg); + } + if (!_opts.rowset_ctx->partial_update_info->is_fixed_partial_update()) { + auto msg = fmt::format( + "in fixed partial update code, but update_mode={}, please check, tablet_id={}", + _opts.rowset_ctx->partial_update_info->update_mode(), _tablet->tablet_id()); + DCHECK(false) << msg; + return Status::InternalError(msg); + } + if (row_pos != 0) { + auto msg = fmt::format("row_pos should be 0, but found {}, tablet_id={}", row_pos, + _tablet->tablet_id()); + DCHECK(false) << msg; + return Status::InternalError(msg); + } + return Status::OK(); +} + // for partial update, we should do following steps to fill content of block: // 1. 
set block data to data convertor, and get all key_column's converted slice // 2. get pk of input block, and read missing columns @@ -562,11 +595,7 @@ Status SegmentWriter::append_block_with_partial_content(const vectorized::Block* block->columns(), _tablet_schema->num_key_columns(), _tablet_schema->num_columns())); } - DCHECK(_is_mow()); - - DCHECK(_opts.rowset_ctx->partial_update_info); - DCHECK(_opts.rowset_ctx->partial_update_info->is_fixed_partial_update()); - DCHECK(row_pos == 0); + RETURN_IF_ERROR(partial_update_preconditions_check(row_pos)); // find missing column cids const auto& missing_cids = _opts.rowset_ctx->partial_update_info->missing_cids; @@ -788,7 +817,7 @@ Status SegmentWriter::append_block(const vectorized::Block* block, size_t row_po seq_column, num_rows, true)); // 2. generate short key index (use cluster key) key_columns.clear(); - for (const auto& cid : _tablet_schema->cluster_key_idxes()) { + for (const auto& cid : _tablet_schema->cluster_key_uids()) { // find cluster key index in tablet schema auto cluster_key_index = _tablet_schema->field_index(cid); if (cluster_key_index == -1) { @@ -1016,6 +1045,18 @@ Status SegmentWriter::finalize_columns_index(uint64_t* index_size) { *index_size = _file_writer->bytes_appended() - index_start; if (_has_key) { if (_is_mow_with_cluster_key()) { + // 1. sort primary keys + std::sort(_primary_keys.begin(), _primary_keys.end()); + // 2. write primary keys index + std::string last_key; + for (const auto& key : _primary_keys) { + DCHECK(key.compare(last_key) > 0) + << "found duplicate key or key is not sorted! 
current key: " << key + << ", last key: " << last_key; + RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); + last_key = key; + } + RETURN_IF_ERROR(_write_short_key_index()); *index_size = _file_writer->bytes_appended() - index_start; RETURN_IF_ERROR(_write_primary_key_index()); @@ -1236,27 +1277,16 @@ Status SegmentWriter::_generate_primary_key_index( last_key = std::move(key); } } else { // mow table with cluster key - // 1. generate primary keys in memory - std::vector primary_keys; + // generate primary keys in memory for (uint32_t pos = 0; pos < num_rows; pos++) { std::string key = _full_encode_keys(primary_key_coders, primary_key_columns, pos); _maybe_invalid_row_cache(key); if (_tablet_schema->has_sequence_col()) { _encode_seq_column(seq_column, pos, &key); } - _encode_rowid(pos, &key); - primary_keys.emplace_back(std::move(key)); - } - // 2. sort primary keys - std::sort(primary_keys.begin(), primary_keys.end()); - // 3. write primary keys index - std::string last_key; - for (const auto& key : primary_keys) { - DCHECK(key.compare(last_key) > 0) - << "found duplicate key or key is not sorted! 
current key: " << key - << ", last key: " << last_key; - RETURN_IF_ERROR(_primary_key_index_builder->add_item(key)); - last_key = key; + _encode_rowid(pos + _num_rows_written, &key); + _primary_keys_size += key.size(); + _primary_keys.emplace_back(std::move(key)); } } return Status::OK(); @@ -1289,7 +1319,7 @@ inline bool SegmentWriter::_is_mow() { } inline bool SegmentWriter::_is_mow_with_cluster_key() { - return _is_mow() && !_tablet_schema->cluster_key_idxes().empty(); + return _is_mow() && !_tablet_schema->cluster_key_uids().empty(); } } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index 9a8af131087f92..60300383d7287d 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -105,6 +105,7 @@ class SegmentWriter { const std::function& found_cb, const std::function& not_found_cb, PartialUpdateStats& stats); + Status partial_update_preconditions_check(size_t row_pos); Status append_block_with_partial_content(const vectorized::Block* block, size_t row_pos, size_t num_rows); Status append_block_with_variant_subcolumns(vectorized::Block& data); @@ -155,6 +156,8 @@ class SegmentWriter { return Status::OK(); } + uint64_t primary_keys_size() const { return _primary_keys_size; } + private: DISALLOW_COPY_AND_ASSIGN(SegmentWriter); Status _create_column_writer(uint32_t cid, const TabletColumn& column, @@ -260,6 +263,8 @@ class SegmentWriter { std::map _rsid_to_rowset; // contains auto generated columns, should be nullptr if no variants's subcolumns TabletSchemaSPtr _flush_schema = nullptr; + std::vector _primary_keys; + uint64_t _primary_keys_size = 0; }; } // namespace segment_v2 diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index 12028812f0d92b..0846b0fc1186a8 100644 --- 
a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -109,7 +109,7 @@ VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32 << ", table_id=" << _tablet_schema->table_id() << ", num_key_columns=" << _num_sort_key_columns << ", num_short_key_columns=" << _num_short_key_columns - << ", cluster_key_columns=" << _tablet_schema->cluster_key_idxes().size(); + << ", cluster_key_columns=" << _tablet_schema->cluster_key_uids().size(); } for (size_t cid = 0; cid < _num_sort_key_columns; ++cid) { const auto& column = _tablet_schema->column(cid); @@ -131,8 +131,8 @@ VerticalSegmentWriter::VerticalSegmentWriter(io::FileWriter* file_writer, uint32 // cluster keys _key_coders.clear(); _key_index_size.clear(); - _num_sort_key_columns = _tablet_schema->cluster_key_idxes().size(); - for (auto cid : _tablet_schema->cluster_key_idxes()) { + _num_sort_key_columns = _tablet_schema->cluster_key_uids().size(); + for (auto cid : _tablet_schema->cluster_key_uids()) { const auto& column = _tablet_schema->column_by_uid(cid); _key_coders.push_back(get_key_coder(column.type())); _key_index_size.push_back(column.index_length()); @@ -248,10 +248,10 @@ Status VerticalSegmentWriter::_create_column_writer(uint32_t cid, const TabletCo opts.data_page_size = storage_page_size; } DBUG_EXECUTE_IF("VerticalSegmentWriter._create_column_writer.storage_page_size", { - auto table_id = DebugPoints::instance()->get_debug_param_or_default( + auto table_id = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "table_id", INT_MIN); - auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( + auto target_data_page_size = DebugPoints::instance()->get_debug_param_or_default( "VerticalSegmentWriter._create_column_writer.storage_page_size", "storage_page_size", INT_MIN); if (table_id == INT_MIN || target_data_page_size == INT_MIN) 
{ @@ -418,6 +418,51 @@ Status VerticalSegmentWriter::_probe_key_for_mow( return Status::OK(); } +Status VerticalSegmentWriter::_partial_update_preconditions_check(size_t row_pos, + bool is_flexible_update) { + if (!_is_mow()) { + auto msg = fmt::format( + "Can only do partial update on merge-on-write unique table, but found: " + "keys_type={}, _opts.enable_unique_key_merge_on_write={}, tablet_id={}", + _tablet_schema->keys_type(), _opts.enable_unique_key_merge_on_write, + _tablet->tablet_id()); + DCHECK(false) << msg; + return Status::InternalError(msg); + } + if (_opts.rowset_ctx->partial_update_info == nullptr) { + auto msg = + fmt::format("partial_update_info should not be nullptr, please check, tablet_id={}", + _tablet->tablet_id()); + DCHECK(false) << msg; + return Status::InternalError(msg); + } + if (!is_flexible_update) { + if (!_opts.rowset_ctx->partial_update_info->is_fixed_partial_update()) { + auto msg = fmt::format( + "in fixed partial update code, but update_mode={}, please check, tablet_id={}", + _opts.rowset_ctx->partial_update_info->update_mode(), _tablet->tablet_id()); + DCHECK(false) << msg; + return Status::InternalError(msg); + } + } else { + if (!_opts.rowset_ctx->partial_update_info->is_flexible_partial_update()) { + auto msg = fmt::format( + "in flexible partial update code, but update_mode={}, please check, " + "tablet_id={}", + _opts.rowset_ctx->partial_update_info->update_mode(), _tablet->tablet_id()); + DCHECK(false) << msg; + return Status::InternalError(msg); + } + } + if (row_pos != 0) { + auto msg = fmt::format("row_pos should be 0, but found {}, tablet_id={}", row_pos, + _tablet->tablet_id()); + DCHECK(false) << msg; + return Status::InternalError(msg); + } + return Status::OK(); +} + // for partial update, we should do following steps to fill content of block: // 1. set block data to data convertor, and get all key_column's converted slice // 2. 
get pk of input block, and read missing columns @@ -427,11 +472,7 @@ Status VerticalSegmentWriter::_probe_key_for_mow( // 3. set columns to data convertor and then write all columns Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& data, vectorized::Block& full_block) { - DCHECK(_is_mow()); - DCHECK(_opts.rowset_ctx->partial_update_info != nullptr); - DCHECK(_opts.rowset_ctx->partial_update_info->is_fixed_partial_update()); - DCHECK(data.row_pos == 0); - + RETURN_IF_ERROR(_partial_update_preconditions_check(data.row_pos, false)); // create full block and fill with input columns full_block = _tablet_schema->create_block(); const auto& including_cids = _opts.rowset_ctx->partial_update_info->update_cids; @@ -580,10 +621,7 @@ Status VerticalSegmentWriter::_append_block_with_partial_content(RowsInBlock& da Status VerticalSegmentWriter::_append_block_with_flexible_partial_content( RowsInBlock& data, vectorized::Block& full_block) { - DCHECK(_is_mow()); - DCHECK(_opts.rowset_ctx->partial_update_info != nullptr); - DCHECK(_opts.rowset_ctx->partial_update_info->is_flexible_partial_update()); - DCHECK(data.row_pos == 0); + RETURN_IF_ERROR(_partial_update_preconditions_check(data.row_pos, true)); // data.block has the same schema with full_block DCHECK(data.block->columns() == _tablet_schema->num_columns()); @@ -1149,9 +1187,9 @@ Status VerticalSegmentWriter::write_batch() { } auto column_unique_id = _tablet_schema->column(cid).unique_id(); if (_is_mow_with_cluster_key() && - std::find(_tablet_schema->cluster_key_idxes().begin(), - _tablet_schema->cluster_key_idxes().end(), - column_unique_id) != _tablet_schema->cluster_key_idxes().end()) { + std::find(_tablet_schema->cluster_key_uids().begin(), + _tablet_schema->cluster_key_uids().end(), + column_unique_id) != _tablet_schema->cluster_key_uids().end()) { cid_to_column[column_unique_id] = column; } RETURN_IF_ERROR(_column_writers[cid]->append(column->get_nullmap(), column->get_data(), @@ -1213,7 
+1251,7 @@ Status VerticalSegmentWriter::_generate_key_index( data.num_rows, true)); // 2. generate short key index (use cluster key) std::vector short_key_columns; - for (const auto& cid : _tablet_schema->cluster_key_idxes()) { + for (const auto& cid : _tablet_schema->cluster_key_uids()) { short_key_columns.push_back(cid_to_column[cid]); } RETURN_IF_ERROR(_generate_short_key_index(short_key_columns, data.num_rows, short_key_pos)); @@ -1572,7 +1610,7 @@ inline bool VerticalSegmentWriter::_is_mow() { } inline bool VerticalSegmentWriter::_is_mow_with_cluster_key() { - return _is_mow() && !_tablet_schema->cluster_key_idxes().empty(); + return _is_mow() && !_tablet_schema->cluster_key_uids().empty(); } } // namespace segment_v2 } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.h b/be/src/olap/rowset/segment_v2/vertical_segment_writer.h index 951e9c2e2838c3..8cec6ed4d1abd6 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.h +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.h @@ -175,6 +175,7 @@ class VerticalSegmentWriter { const std::function& found_cb, const std::function& not_found_cb, PartialUpdateStats& stats); + Status _partial_update_preconditions_check(size_t row_pos, bool is_flexible_update); Status _append_block_with_partial_content(RowsInBlock& data, vectorized::Block& full_block); Status _append_block_with_flexible_partial_content(RowsInBlock& data, vectorized::Block& full_block); diff --git a/be/src/olap/rowset/unique_rowset_id_generator.cpp b/be/src/olap/rowset/unique_rowset_id_generator.cpp index 0ac7f63837a099..49e07e5835957a 100644 --- a/be/src/olap/rowset/unique_rowset_id_generator.cpp +++ b/be/src/olap/rowset/unique_rowset_id_generator.cpp @@ -17,8 +17,17 @@ #include "olap/rowset/unique_rowset_id_generator.h" +#include + +#include "olap/storage_engine.h" +#include "runtime/exec_env.h" + namespace doris { +RowsetId next_rowset_id() { + return 
ExecEnv::GetInstance()->storage_engine().next_rowset_id(); +} + UniqueRowsetIdGenerator::UniqueRowsetIdGenerator(const UniqueId& backend_uid) : _backend_uid(backend_uid), _inc_id(1) {} diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp index 46070f8dccd7ce..f493f21ac97fb7 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.cpp +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.cpp @@ -50,7 +50,8 @@ template requires std::is_base_of_v Status VerticalBetaRowsetWriter::add_columns(const vectorized::Block* block, const std::vector& col_ids, bool is_key, - uint32_t max_rows_per_segment) { + uint32_t max_rows_per_segment, + bool has_cluster_key) { auto& context = this->_context; VLOG_NOTICE << "VerticalBetaRowsetWriter::add_columns, columns: " << block->columns(); @@ -71,7 +72,9 @@ Status VerticalBetaRowsetWriter::add_columns(const vectorized::Block* block, _cur_writer_idx = 0; RETURN_IF_ERROR(_segment_writers[_cur_writer_idx]->append_block(block, 0, num_rows)); } else if (is_key) { - if (_segment_writers[_cur_writer_idx]->num_rows_written() > max_rows_per_segment) { + if (_segment_writers[_cur_writer_idx]->num_rows_written() > max_rows_per_segment || + (has_cluster_key && _segment_writers[_cur_writer_idx]->primary_keys_size() > + config::mow_primary_key_index_max_size_in_memory)) { // segment is full, need flush columns and create new segment writer RETURN_IF_ERROR(_flush_columns(_segment_writers[_cur_writer_idx].get(), true)); @@ -177,6 +180,7 @@ Status VerticalBetaRowsetWriter::_create_segment_writer( writer_options.enable_unique_key_merge_on_write = context.enable_unique_key_merge_on_write; writer_options.rowset_ctx = &context; writer_options.max_rows_per_segment = context.max_rows_per_segment; + // TODO if support VerticalSegmentWriter, also need to handle cluster key primary key index *writer = std::make_unique( segment_file_writer.get(), seg_id, context.tablet_schema, context.tablet, 
context.data_dir, writer_options, inverted_index_file_writer.get()); diff --git a/be/src/olap/rowset/vertical_beta_rowset_writer.h b/be/src/olap/rowset/vertical_beta_rowset_writer.h index dcb4ae5a8b5d16..ce756334308fcd 100644 --- a/be/src/olap/rowset/vertical_beta_rowset_writer.h +++ b/be/src/olap/rowset/vertical_beta_rowset_writer.h @@ -41,7 +41,7 @@ class VerticalBetaRowsetWriter final : public T { ~VerticalBetaRowsetWriter() override = default; Status add_columns(const vectorized::Block* block, const std::vector& col_ids, - bool is_key, uint32_t max_rows_per_segment) override; + bool is_key, uint32_t max_rows_per_segment, bool has_cluster_key) override; // flush last segment's column Status flush_columns(bool is_key) override; diff --git a/be/src/olap/rowset_builder.cpp b/be/src/olap/rowset_builder.cpp index 9bb0df318ee11c..ec7463d5b9d75d 100644 --- a/be/src/olap/rowset_builder.cpp +++ b/be/src/olap/rowset_builder.cpp @@ -148,8 +148,13 @@ Status BaseRowsetBuilder::init_mow_context(std::shared_ptr& mow_cont } Status RowsetBuilder::check_tablet_version_count() { - if (!_tablet->exceed_version_limit(config::max_tablet_version_num - 100) || - GlobalMemoryArbitrator::is_exceed_soft_mem_limit(GB_EXCHANGE_BYTE)) { + bool injection = false; + DBUG_EXECUTE_IF("RowsetBuilder.check_tablet_version_count.too_many_version", + { injection = true; }); + if (injection) { + // do not return if injection + } else if (!_tablet->exceed_version_limit(config::max_tablet_version_num - 100) || + GlobalMemoryArbitrator::is_exceed_soft_mem_limit(GB_EXCHANGE_BYTE)) { return Status::OK(); } //trigger compaction diff --git a/be/src/olap/rowset_builder.h b/be/src/olap/rowset_builder.h index 7fd578037363a0..fb2294d1770cc4 100644 --- a/be/src/olap/rowset_builder.h +++ b/be/src/olap/rowset_builder.h @@ -38,7 +38,6 @@ namespace doris { class CalcDeleteBitmapToken; class FlushToken; class MemTable; -class MemTracker; class StorageEngine; class TupleDescriptor; class SlotDescriptor; diff --git 
a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index ec291d8d2f0068..7f947612eed4ac 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -198,6 +198,21 @@ class MultiBlockMerger { pushed_row_refs.push_back(row_refs[i]); } } + if (!_tablet->tablet_schema()->cluster_key_uids().empty()) { + std::vector ids; + for (const auto& cid : _tablet->tablet_schema()->cluster_key_uids()) { + auto index = _tablet->tablet_schema()->field_index(cid); + if (index == -1) { + return Status::InternalError( + "could not find cluster key column with unique_id=" + + std::to_string(cid) + " in tablet schema"); + } + ids.push_back(index); + } + // sort by cluster key + std::stable_sort(pushed_row_refs.begin(), pushed_row_refs.end(), + ClusterKeyRowRefComparator(ids)); + } } // update real inserted row number @@ -249,6 +264,20 @@ class MultiBlockMerger { const size_t _num_columns; }; + struct ClusterKeyRowRefComparator { + ClusterKeyRowRefComparator(std::vector columns) : _columns(columns) {} + + int compare(const RowRef& lhs, const RowRef& rhs) const { + return lhs.block->compare_at(lhs.position, rhs.position, &_columns, *rhs.block, -1); + } + + bool operator()(const RowRef& lhs, const RowRef& rhs) const { + return compare(lhs, rhs) < 0; + } + + const std::vector _columns; + }; + BaseTabletSPtr _tablet; RowRefComparator _cmp; vectorized::Arena _arena; @@ -837,6 +866,9 @@ Status SchemaChangeJob::_do_process_alter_tablet(const TAlterTabletReqV2& reques for (int i = 0; i < num_cols; ++i) { return_columns[i] = i; } + std::vector cluster_key_idxes; + + DBUG_EXECUTE_IF("SchemaChangeJob::_do_process_alter_tablet.block", DBUG_BLOCK); // begin to find deltas to convert from base tablet to new tablet so that // obtain base tablet and new tablet's push lock and header write lock to prevent loading data @@ -951,6 +983,14 @@ Status SchemaChangeJob::_do_process_alter_tablet(const TAlterTabletReqV2& reques reader_context.batch_size = ALTER_TABLE_BATCH_SIZE; 
reader_context.delete_bitmap = &_base_tablet->tablet_meta()->delete_bitmap(); reader_context.version = Version(0, end_version); + if (!_base_tablet_schema->cluster_key_uids().empty()) { + for (const auto& uid : _base_tablet_schema->cluster_key_uids()) { + cluster_key_idxes.emplace_back(_base_tablet_schema->field_index(uid)); + } + reader_context.read_orderby_key_columns = &cluster_key_idxes; + reader_context.is_unique = false; + reader_context.sequence_id_idx = -1; + } for (auto& rs_split : rs_splits) { res = rs_split.rs_reader->init(&reader_context); if (!res) { @@ -1158,6 +1198,7 @@ Status SchemaChangeJob::_convert_historical_rowsets(const SchemaChangeParams& sc } context.write_type = DataWriteType::TYPE_SCHEMA_CHANGE; + // TODO if support VerticalSegmentWriter, also need to handle cluster key primary key index auto result = _new_tablet->create_rowset_writer(context, false); if (!result.has_value()) { res = Status::Error("create_rowset_writer failed, reason={}", diff --git a/be/src/olap/segment_loader.cpp b/be/src/olap/segment_loader.cpp index 26ac54c699b81a..4240f7e250a06b 100644 --- a/be/src/olap/segment_loader.cpp +++ b/be/src/olap/segment_loader.cpp @@ -77,9 +77,8 @@ Status SegmentLoader::load_segments(const BetaRowsetSharedPtr& rowset, } if (use_cache && !config::disable_segment_cache) { // memory of SegmentCache::CacheValue will be handled by SegmentCache - auto* cache_value = new SegmentCache::CacheValue(); + auto* cache_value = new SegmentCache::CacheValue(segment); _cache_mem_usage += segment->meta_mem_usage(); - cache_value->segment = std::move(segment); _segment_cache->insert(cache_key, *cache_value, cache_handle); } else { cache_handle->push_segment(std::move(segment)); diff --git a/be/src/olap/segment_loader.h b/be/src/olap/segment_loader.h index 834906da93bf74..2c5b1ed200dde7 100644 --- a/be/src/olap/segment_loader.h +++ b/be/src/olap/segment_loader.h @@ -75,9 +75,9 @@ class SegmentCache : public LRUCachePolicy { // Holding all opened segments of a 
rowset. class CacheValue : public LRUCacheValueBase { public: - ~CacheValue() override { segment.reset(); } + CacheValue(segment_v2::SegmentSharedPtr segment_) : segment(std::move(segment_)) {} - segment_v2::SegmentSharedPtr segment; + const segment_v2::SegmentSharedPtr segment; }; SegmentCache(size_t memory_bytes_limit, size_t segment_num_limit) @@ -124,8 +124,13 @@ class SegmentLoader { void erase_segments(const RowsetId& rowset_id, int64_t num_segments); - // Just used for BE UT - int64_t cache_mem_usage() const { return _cache_mem_usage; } + int64_t cache_mem_usage() const { +#ifdef BE_TEST + return _cache_mem_usage; +#else + return _segment_cache->value_mem_consumption(); +#endif + } private: SegmentLoader(); diff --git a/be/src/olap/single_replica_compaction.cpp b/be/src/olap/single_replica_compaction.cpp index 7470afe0ef62c7..458f3949b17017 100644 --- a/be/src/olap/single_replica_compaction.cpp +++ b/be/src/olap/single_replica_compaction.cpp @@ -39,6 +39,7 @@ #include "task/engine_clone_task.h" #include "util/brpc_client_cache.h" #include "util/doris_metrics.h" +#include "util/security.h" #include "util/thrift_rpc_helper.h" #include "util/trace.h" @@ -373,7 +374,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, // then it will try to clone from BE 2, but it will find the file 1 already exist, but file 1 with same // name may have different versions. 
VLOG_DEBUG << "single replica compaction begin to download files, remote path=" - << _mask_token(remote_url_prefix) << " local_path=" << local_path; + << mask_token(remote_url_prefix) << " local_path=" << local_path; RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(local_path)); RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(local_path)); @@ -438,10 +439,10 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, std::string local_file_path = local_path + file_name; LOG(INFO) << "single replica compaction begin to download file from: " - << _mask_token(remote_file_url) << " to: " << local_file_path + << mask_token(remote_file_url) << " to: " << local_file_path << ". size(B): " << file_size << ", timeout(s): " << estimate_timeout; - auto download_cb = [this, &remote_file_url, estimate_timeout, &local_file_path, + auto download_cb = [&remote_file_url, estimate_timeout, &local_file_path, file_size](HttpClient* client) { RETURN_IF_ERROR(client->init(remote_file_url)); client->set_timeout_ms(estimate_timeout * 1000); @@ -453,7 +454,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, uint64_t local_file_size = std::filesystem::file_size(local_file_path); if (local_file_size != file_size) { LOG(WARNING) << "download file length error" - << ", remote_path=" << _mask_token(remote_file_url) + << ", remote_path=" << mask_token(remote_file_url) << ", file_size=" << file_size << ", local_file_size=" << local_file_size; return Status::InternalError("downloaded file size is not equal"); @@ -585,9 +586,4 @@ Status SingleReplicaCompaction::_finish_clone(const string& clone_dir, return res; } -std::string SingleReplicaCompaction::_mask_token(const std::string& str) { - std::regex pattern("token=[\\w|-]+"); - return regex_replace(str, pattern, "token=******"); -} - } // namespace doris diff --git a/be/src/olap/single_replica_compaction.h b/be/src/olap/single_replica_compaction.h index 67f5527dd7b336..10ec65ec3f0570 
100644 --- a/be/src/olap/single_replica_compaction.h +++ b/be/src/olap/single_replica_compaction.h @@ -62,7 +62,6 @@ class SingleReplicaCompaction final : public CompactionMixin { const std::string& local_path); Status _release_snapshot(const std::string& ip, int port, const std::string& snapshot_path); Status _finish_clone(const std::string& clone_dir, const Version& version); - std::string _mask_token(const std::string& str); CompactionType _compaction_type; std::vector _pending_rs_guards; diff --git a/be/src/olap/snapshot_manager.cpp b/be/src/olap/snapshot_manager.cpp index 67205835b53947..8202feb68c65b5 100644 --- a/be/src/olap/snapshot_manager.cpp +++ b/be/src/olap/snapshot_manager.cpp @@ -700,8 +700,10 @@ Status SnapshotManager::_create_snapshot_files(const TabletSharedPtr& ref_tablet InvertedIndexStorageFormatPB::V1) { for (const auto& index : tablet_schema.inverted_indexes()) { auto index_id = index->index_id(); - auto index_file = ref_tablet->get_segment_index_filepath( - rowset_id, segment_index, index_id); + auto index_file = InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix( + segment_file_path), + index_id, index->get_index_suffix()); auto snapshot_segment_index_file_path = fmt::format("{}/{}_{}_{}.binlog-index", schema_full_path, rowset_id, segment_index, index_id); diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index e00b5b595e20dc..24cda8232f115c 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -463,6 +463,16 @@ Status StorageEngine::_check_file_descriptor_number() { << ", use default configuration instead."; return Status::OK(); } + if (getenv("SKIP_CHECK_ULIMIT") == nullptr) { + LOG(INFO) << "will check 'ulimit' value."; + } else if (std::string(getenv("SKIP_CHECK_ULIMIT")) == "true") { + LOG(INFO) << "the 'ulimit' value check is skipped" + << ", the SKIP_CHECK_ULIMIT env value is " << getenv("SKIP_CHECK_ULIMIT"); + return 
Status::OK(); + } else { + LOG(INFO) << "the SKIP_CHECK_ULIMIT env value is " << getenv("SKIP_CHECK_ULIMIT") + << ", will check ulimit value."; + } if (l.rlim_cur < config::min_file_descriptor_number) { LOG(ERROR) << "File descriptor number is less than " << config::min_file_descriptor_number << ". Please use (ulimit -n) to set a value equal or greater than " diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index 421c0eb352d712..a22015898988b3 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -72,6 +72,7 @@ class ReportWorker; class CreateTabletRRIdxCache; struct DirInfo; class SnapshotManager; +class WorkloadGroup; using SegCompactionCandidates = std::vector; using SegCompactionCandidatesSharedPtr = std::shared_ptr; @@ -105,7 +106,7 @@ class BaseStorageEngine { virtual bool stopped() = 0; // start all background threads. This should be call after env is ready. - virtual Status start_bg_threads() = 0; + virtual Status start_bg_threads(std::shared_ptr wg_sptr = nullptr) = 0; virtual Result get_tablet(int64_t tablet_id) = 0; @@ -278,7 +279,7 @@ class StorageEngine final : public BaseStorageEngine { return _default_rowset_type; } - Status start_bg_threads() override; + Status start_bg_threads(std::shared_ptr wg_sptr = nullptr) override; // clear trash and snapshot file // option: update disk usage after sweep diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index 62efef111d3584..c7919b3f8dca24 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -512,6 +512,15 @@ Status Tablet::add_rowset(RowsetSharedPtr rowset) { return Status::OK(); } +bool Tablet::rowset_exists_unlocked(const RowsetSharedPtr& rowset) { + if (auto it = _rs_version_map.find(rowset->version()); it == _rs_version_map.end()) { + return false; + } else if (rowset->rowset_id() != it->second->rowset_id()) { + return false; + } + return true; +} + Status Tablet::modify_rowsets(std::vector& to_add, std::vector& to_delete, bool 
check_delete) { // the compaction process allow to compact the single version, eg: version[4-4]. @@ -817,10 +826,13 @@ void Tablet::delete_expired_stale_rowset() { auto old_meta_size = _tablet_meta->all_stale_rs_metas().size(); // do delete operation + std::vector version_to_delete; auto to_delete_iter = stale_version_path_map.begin(); while (to_delete_iter != stale_version_path_map.end()) { std::vector& to_delete_version = to_delete_iter->second->timestamped_versions(); + int64_t start_version = -1; + int64_t end_version = -1; for (auto& timestampedVersion : to_delete_version) { auto it = _stale_rs_version_map.find(timestampedVersion->version()); if (it != _stale_rs_version_map.end()) { @@ -841,10 +853,17 @@ void Tablet::delete_expired_stale_rowset() { << timestampedVersion->version().second << "] not find in stale rs version map"; } + if (start_version < 0) { + start_version = timestampedVersion->version().first; + } + end_version = timestampedVersion->version().second; _delete_stale_rowset_by_version(timestampedVersion->version()); } + Version version(start_version, end_version); + version_to_delete.emplace_back(version.to_string()); to_delete_iter++; } + _tablet_meta->delete_bitmap().remove_stale_delete_bitmap_from_queue(version_to_delete); bool reconstructed = _reconstruct_version_tracker_if_necessary(); @@ -1692,6 +1711,10 @@ void Tablet::build_tablet_report_info(TTabletInfo* tablet_info, // tablet may not have cooldowned data, but the storage policy is set tablet_info->__set_cooldown_term(_cooldown_conf.term); } + tablet_info->__set_local_index_size(_tablet_meta->tablet_local_index_size()); + tablet_info->__set_local_segment_size(_tablet_meta->tablet_local_segment_size()); + tablet_info->__set_remote_index_size(_tablet_meta->tablet_remote_index_size()); + tablet_info->__set_remote_segment_size(_tablet_meta->tablet_remote_segment_size()); } void Tablet::report_error(const Status& st) { @@ -1727,8 +1750,13 @@ Status 
Tablet::prepare_compaction_and_calculate_permits( } if (!res.ok()) { - tablet->set_last_cumu_compaction_failure_time(UnixMillis()); permits = 0; + // if we meet a delete version, should increase the cumulative point to let base compaction handle the delete version. + // no need to wait 5s. + if (!(res.msg() == "_last_delete_version.first not equal to -1") || + config::enable_sleep_between_delete_cumu_compaction) { + tablet->set_last_cumu_compaction_failure_time(UnixMillis()); + } if (!res.is()) { DorisMetrics::instance()->cumulative_compaction_request_failed->increment(1); return Status::InternalError("prepare cumulative compaction with err: {}", res); @@ -1736,6 +1764,12 @@ Status Tablet::prepare_compaction_and_calculate_permits( // return OK if OLAP_ERR_CUMULATIVE_NO_SUITABLE_VERSION, so that we don't need to // print too much useless logs. // And because we set permits to 0, so even if we return OK here, nothing will be done. + LOG_INFO( + "cumulative compaction meet delete rowset, increase cumu point without other " + "operation.") + .tag("tablet id:", tablet->tablet_id()) + .tag("after cumulative compaction, cumu point:", + tablet->cumulative_layer_point()); return Status::OK(); } } else if (compaction_type == CompactionType::BASE_COMPACTION) { @@ -2490,7 +2524,7 @@ CalcDeleteBitmapExecutor* Tablet::calc_delete_bitmap_executor() { Status Tablet::save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t txn_id, DeleteBitmapPtr delete_bitmap, RowsetWriter* rowset_writer, - const RowsetIdUnorderedSet& cur_rowset_ids) { + const RowsetIdUnorderedSet& cur_rowset_ids, int64_t lock_id) { RowsetSharedPtr rowset = txn_info->rowset; int64_t cur_version = rowset->start_version(); @@ -2542,10 +2576,10 @@ void Tablet::set_skip_compaction(bool skip, CompactionType compaction_type, int6 bool Tablet::should_skip_compaction(CompactionType compaction_type, int64_t now) { if (compaction_type == CompactionType::CUMULATIVE_COMPACTION && _skip_cumu_compaction && - now < 
_skip_cumu_compaction_ts + 120) { + now < _skip_cumu_compaction_ts + config::skip_tablet_compaction_second) { return true; } else if (compaction_type == CompactionType::BASE_COMPACTION && _skip_base_compaction && - now < _skip_base_compaction_ts + 120) { + now < _skip_base_compaction_ts + config::skip_tablet_compaction_second) { return true; } return false; @@ -2581,30 +2615,6 @@ std::string Tablet::get_segment_filepath(std::string_view rowset_id, int64_t seg return fmt::format("{}/_binlog/{}_{}.dat", _tablet_path, rowset_id, segment_index); } -std::string Tablet::get_segment_index_filepath(std::string_view rowset_id, - std::string_view segment_index, - std::string_view index_id) const { - auto format = _tablet_meta->tablet_schema()->get_inverted_index_storage_format(); - if (format == doris::InvertedIndexStorageFormatPB::V1) { - return fmt::format("{}/_binlog/{}_{}_{}.idx", _tablet_path, rowset_id, segment_index, - index_id); - } else { - return fmt::format("{}/_binlog/{}_{}.idx", _tablet_path, rowset_id, segment_index); - } -} - -std::string Tablet::get_segment_index_filepath(std::string_view rowset_id, int64_t segment_index, - int64_t index_id) const { - auto format = _tablet_meta->tablet_schema()->get_inverted_index_storage_format(); - if (format == doris::InvertedIndexStorageFormatPB::V1) { - return fmt::format("{}/_binlog/{}_{}_{}.idx", _tablet_path, rowset_id, segment_index, - index_id); - } else { - DCHECK(index_id == -1); - return fmt::format("{}/_binlog/{}_{}.idx", _tablet_path, rowset_id, segment_index); - } -} - std::vector Tablet::get_binlog_filepath(std::string_view binlog_version) const { const auto& [rowset_id, num_segments] = get_binlog_info(binlog_version); std::vector binlog_filepath; @@ -2649,10 +2659,25 @@ void Tablet::gc_binlogs(int64_t version) { // add binlog segment files and index files for (int64_t i = 0; i < num_segments; ++i) { - wait_for_deleted_binlog_files.emplace_back(get_segment_filepath(rowset_id, i)); - for (const auto& index : 
this->tablet_schema()->inverted_indexes()) { - wait_for_deleted_binlog_files.emplace_back( - get_segment_index_filepath(rowset_id, i, index->index_id())); + auto segment_file_path = get_segment_filepath(rowset_id, i); + wait_for_deleted_binlog_files.emplace_back(segment_file_path); + + // index files + if (tablet_schema()->has_inverted_index()) { + if (tablet_schema()->get_inverted_index_storage_format() == + InvertedIndexStorageFormatPB::V1) { + for (const auto& index : tablet_schema()->inverted_indexes()) { + auto index_file = InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix( + segment_file_path), + index->index_id(), index->get_index_suffix()); + wait_for_deleted_binlog_files.emplace_back(index_file); + } + } else { + auto index_file = InvertedIndexDescriptor::get_index_file_path_v2( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_file_path)); + wait_for_deleted_binlog_files.emplace_back(index_file); + } } } }; diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index f5866c67641581..d00476f044191c 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -173,6 +173,7 @@ class Tablet final : public BaseTablet { // MUST hold EXCLUSIVE `_meta_lock`. Status modify_rowsets(std::vector& to_add, std::vector& to_delete, bool check_delete = false); + bool rowset_exists_unlocked(const RowsetSharedPtr& rowset); Status add_inc_rowset(const RowsetSharedPtr& rowset); /// Delete stale rowset by timing. 
This delete policy uses now() minutes @@ -417,7 +418,8 @@ class Tablet final : public BaseTablet { CalcDeleteBitmapExecutor* calc_delete_bitmap_executor() override; Status save_delete_bitmap(const TabletTxnInfo* txn_info, int64_t txn_id, DeleteBitmapPtr delete_bitmap, RowsetWriter* rowset_writer, - const RowsetIdUnorderedSet& cur_rowset_ids) override; + const RowsetIdUnorderedSet& cur_rowset_ids, + int64_t lock_id = -1) override; void merge_delete_bitmap(const DeleteBitmap& delete_bitmap); bool check_all_rowset_segment(); @@ -439,11 +441,6 @@ class Tablet final : public BaseTablet { std::string get_segment_filepath(std::string_view rowset_id, std::string_view segment_index) const; std::string get_segment_filepath(std::string_view rowset_id, int64_t segment_index) const; - std::string get_segment_index_filepath(std::string_view rowset_id, - std::string_view segment_index, - std::string_view index_id) const; - std::string get_segment_index_filepath(std::string_view rowset_id, int64_t segment_index, - int64_t index_id) const; bool can_add_binlog(uint64_t total_binlog_size) const; void gc_binlogs(int64_t version); Status ingest_binlog_metas(RowsetBinlogMetasPB* metas_pb); diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index d6a944dbc39853..33fee7ca350900 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -57,8 +57,6 @@ #include "olap/tablet_schema.h" #include "olap/txn_manager.h" #include "runtime/exec_env.h" -#include "runtime/memory/mem_tracker.h" -#include "runtime/thread_context.h" #include "service/backend_options.h" #include "util/defer_op.h" #include "util/doris_metrics.h" @@ -83,28 +81,18 @@ using std::vector; namespace doris { using namespace ErrorCode; -DEFINE_GAUGE_METRIC_PROTOTYPE_5ARG(tablet_meta_mem_consumption, MetricUnit::BYTES, "", - mem_consumption, Labels({{"type", "tablet_meta"}})); - bvar::Adder g_tablet_meta_schema_columns_count("tablet_meta_schema_columns_count"); 
TabletManager::TabletManager(StorageEngine& engine, int32_t tablet_map_lock_shard_size) : _engine(engine), - _tablet_meta_mem_tracker(std::make_shared("TabletMeta(experimental)")), _tablets_shards_size(tablet_map_lock_shard_size), _tablets_shards_mask(tablet_map_lock_shard_size - 1) { CHECK_GT(_tablets_shards_size, 0); CHECK_EQ(_tablets_shards_size & _tablets_shards_mask, 0); _tablets_shards.resize(_tablets_shards_size); - REGISTER_HOOK_METRIC(tablet_meta_mem_consumption, - [this]() { return _tablet_meta_mem_tracker->consumption(); }); } -TabletManager::~TabletManager() { -#ifndef BE_TEST - DEREGISTER_HOOK_METRIC(tablet_meta_mem_consumption); -#endif -} +TabletManager::~TabletManager() = default; Status TabletManager::_add_tablet_unlocked(TTabletId tablet_id, const TabletSharedPtr& tablet, bool update_meta, bool force, RuntimeProfile* profile) { @@ -242,10 +230,6 @@ Status TabletManager::_add_tablet_to_map_unlocked(TTabletId tablet_id, tablet_map_t& tablet_map = _get_tablet_map(tablet_id); tablet_map[tablet_id] = tablet; _add_tablet_to_partition(tablet); - // TODO: remove multiply 2 of tablet meta mem size - // Because table schema will copy in tablet, there will be double mem cost - // so here multiply 2 - _tablet_meta_mem_tracker->consume(tablet->tablet_meta()->mem_size() * 2); g_tablet_meta_schema_columns_count << tablet->tablet_meta()->tablet_columns_num(); COUNTER_UPDATE(ADD_CHILD_TIMER(profile, "RegisterTabletInfo", "AddTablet"), static_cast(watch.reset())); @@ -599,7 +583,6 @@ Status TabletManager::_drop_tablet(TTabletId tablet_id, TReplicaId replica_id, b } to_drop_tablet->deregister_tablet_from_dir(); - _tablet_meta_mem_tracker->release(to_drop_tablet->tablet_meta()->mem_size() * 2); g_tablet_meta_schema_columns_count << -to_drop_tablet->tablet_meta()->tablet_columns_num(); return Status::OK(); } @@ -1083,6 +1066,10 @@ void TabletManager::build_all_report_tablets_info(std::map* t_tablet_stat.__set_total_version_count(tablet_info.total_version_count); 
t_tablet_stat.__set_visible_version_count(tablet_info.visible_version_count); t_tablet_stat.__set_visible_version(tablet_info.version); + t_tablet_stat.__set_local_index_size(tablet_info.local_index_size); + t_tablet_stat.__set_local_segment_size(tablet_info.local_segment_size); + t_tablet_stat.__set_remote_index_size(tablet_info.remote_index_size); + t_tablet_stat.__set_remote_segment_size(tablet_info.remote_segment_size); }; for_each_tablet(handler, filter_all_tablets); @@ -1183,14 +1170,14 @@ bool TabletManager::_move_tablet_to_trash(const TabletSharedPtr& tablet) { if (tablet_in_not_shutdown->tablet_path() != tablet->tablet_path()) { LOG(INFO) << "tablet path not eq shutdown tablet path, move it to trash, tablet_id=" << tablet_in_not_shutdown->tablet_id() - << " mem manager tablet path=" << tablet_in_not_shutdown->tablet_path() - << " shutdown tablet path=" << tablet->tablet_path(); + << ", mem manager tablet path=" << tablet_in_not_shutdown->tablet_path() + << ", shutdown tablet path=" << tablet->tablet_path(); return tablet->data_dir()->move_to_trash(tablet->tablet_path()); } else { LOG(INFO) << "tablet path eq shutdown tablet path, not move to trash, tablet_id=" << tablet_in_not_shutdown->tablet_id() - << " mem manager tablet path=" << tablet_in_not_shutdown->tablet_path() - << " shutdown tablet path=" << tablet->tablet_path(); + << ", mem manager tablet path=" << tablet_in_not_shutdown->tablet_path() + << ", shutdown tablet path=" << tablet->tablet_path(); return true; } } @@ -1295,7 +1282,7 @@ Status TabletManager::register_transition_tablet(int64_t tablet_id, std::string // not found shard.tablets_under_transition[tablet_id] = std::make_tuple(reason, thread_id, 1); LOG(INFO) << "add tablet_id= " << tablet_id << " to map, reason=" << reason - << " lock times=1 thread_id_in_map=" << thread_id; + << ", lock times=1, thread_id_in_map=" << thread_id; return Status::OK(); } else { // found @@ -1303,15 +1290,15 @@ Status 
TabletManager::register_transition_tablet(int64_t tablet_id, std::string if (thread_id != thread_id_in_map) { // other thread, failed LOG(INFO) << "tablet_id = " << tablet_id << " is doing " << r - << " thread_id_in_map=" << thread_id_in_map << " , add reason=" << reason - << " thread_id=" << thread_id; + << ", thread_id_in_map=" << thread_id_in_map << " , add reason=" << reason + << ", thread_id=" << thread_id; return Status::InternalError("{} failed try later, tablet_id={}", reason, tablet_id); } // add lock times ++lock_times; LOG(INFO) << "add tablet_id= " << tablet_id << " to map, reason=" << reason - << " lock times=" << lock_times << " thread_id_in_map=" << thread_id_in_map; + << ", lock times=" << lock_times << ", thread_id_in_map=" << thread_id_in_map; return Status::OK(); } } @@ -1335,10 +1322,10 @@ void TabletManager::unregister_transition_tablet(int64_t tablet_id, std::string --lock_times; if (lock_times != 0) { LOG(INFO) << "erase tablet_id= " << tablet_id << " from map, reason=" << reason - << " left=" << lock_times << " thread_id_in_map=" << thread_id_in_map; + << ", left=" << lock_times << ", thread_id_in_map=" << thread_id_in_map; } else { LOG(INFO) << "erase tablet_id= " << tablet_id << " from map, reason=" << reason - << " thread_id_in_map=" << thread_id_in_map; + << ", thread_id_in_map=" << thread_id_in_map; shard.tablets_under_transition.erase(tablet_id); } } diff --git a/be/src/olap/tablet_manager.h b/be/src/olap/tablet_manager.h index 42623cf05f2aea..6b6e7998f9cee1 100644 --- a/be/src/olap/tablet_manager.h +++ b/be/src/olap/tablet_manager.h @@ -251,9 +251,6 @@ class TabletManager { StorageEngine& _engine; - // TODO: memory size of TabletSchema cannot be accurately tracked. 
- std::shared_ptr _tablet_meta_mem_tracker; - const int32_t _tablets_shards_size; const int32_t _tablets_shards_mask; std::vector _tablets_shards; diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index 13bbdaa9389faa..43b0d5d8bd0ae0 100644 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -57,6 +57,7 @@ using std::unordered_map; using std::vector; namespace doris { +#include "common/compile_check_begin.h" using namespace ErrorCode; TabletMetaSharedPtr TabletMeta::create( @@ -106,7 +107,7 @@ TabletMeta::TabletMeta() _delete_bitmap(new DeleteBitmap(_tablet_id)) {} TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id, - int64_t replica_id, int32_t schema_hash, uint64_t shard_id, + int64_t replica_id, int32_t schema_hash, int32_t shard_id, const TTabletSchema& tablet_schema, uint32_t next_unique_id, const std::unordered_map& col_ordinal_to_unique_id, TabletUid tablet_uid, TTabletType::type tabletType, @@ -203,6 +204,9 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id case TInvertedIndexFileStorageFormat::V2: schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); break; + case TInvertedIndexFileStorageFormat::V3: + schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V3); + break; default: schema->set_inverted_index_storage_format(InvertedIndexStorageFormatPB::V2); break; @@ -216,8 +220,8 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id schema->set_sort_type(SortType::LEXICAL); } schema->set_sort_col_num(tablet_schema.sort_col_num); - for (const auto& i : tablet_schema.cluster_key_idxes) { - schema->add_cluster_key_idxes(i); + for (const auto& i : tablet_schema.cluster_key_uids) { + schema->add_cluster_key_uids(i); } tablet_meta_pb.set_in_restore_mode(false); @@ -571,7 +575,8 @@ void TabletMeta::serialize(string* meta_binary) { Status TabletMeta::deserialize(std::string_view meta_binary) { 
TabletMetaPB tablet_meta_pb; - bool parsed = tablet_meta_pb.ParseFromArray(meta_binary.data(), meta_binary.size()); + bool parsed = tablet_meta_pb.ParseFromArray(meta_binary.data(), + static_cast(meta_binary.size())); if (!parsed) { return Status::Error("parse tablet meta failed"); } @@ -664,11 +669,11 @@ void TabletMeta::init_from_pb(const TabletMetaPB& tablet_meta_pb) { int seg_maps_size = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps_size(); CHECK(rst_ids_size == seg_ids_size && seg_ids_size == seg_maps_size && seg_maps_size == versions_size); - for (size_t i = 0; i < rst_ids_size; ++i) { + for (int i = 0; i < rst_ids_size; ++i) { RowsetId rst_id; rst_id.init(tablet_meta_pb.delete_bitmap().rowset_ids(i)); auto seg_id = tablet_meta_pb.delete_bitmap().segment_ids(i); - uint32_t ver = tablet_meta_pb.delete_bitmap().versions(i); + auto ver = tablet_meta_pb.delete_bitmap().versions(i); auto bitmap = tablet_meta_pb.delete_bitmap().segment_delete_bitmaps(i).data(); delete_bitmap().delete_bitmap[{rst_id, seg_id, ver}] = roaring::Roaring::read(bitmap); } @@ -781,12 +786,6 @@ void TabletMeta::to_meta_pb(TabletMetaPB* tablet_meta_pb) { time_series_compaction_level_threshold()); } -int64_t TabletMeta::mem_size() const { - auto size = sizeof(TabletMeta); - size += _schema->mem_size(); - return size; -} - void TabletMeta::to_json(string* json_string, json2pb::Pb2JsonOptions& options) { TabletMetaPB tablet_meta_pb; to_meta_pb(&tablet_meta_pb); @@ -1092,7 +1091,9 @@ uint64_t DeleteBitmap::cardinality() const { std::shared_lock l(lock); uint64_t res = 0; for (auto entry : delete_bitmap) { - res += entry.second.cardinality(); + if (std::get<1>(entry.first) != DeleteBitmap::INVALID_SEGMENT_ID) { + res += entry.second.cardinality(); + } } return res; } @@ -1101,7 +1102,9 @@ uint64_t DeleteBitmap::get_size() const { std::shared_lock l(lock); uint64_t charge = 0; for (auto& [k, v] : delete_bitmap) { - charge += v.getSizeInBytes(); + if (std::get<1>(k) != 
DeleteBitmap::INVALID_SEGMENT_ID) { + charge += v.getSizeInBytes(); + } } return charge; } @@ -1198,7 +1201,7 @@ void DeleteBitmap::remove_stale_delete_bitmap_from_queue(const std::vector std::vector> to_delete; - auto tablet_id = -1; + int64_t tablet_id = -1; for (auto& version_str : vector) { auto it = _stale_delete_bitmap.find(version_str); if (it != _stale_delete_bitmap.end()) { @@ -1209,14 +1212,18 @@ void DeleteBitmap::remove_stale_delete_bitmap_from_queue(const std::vector(delete_bitmap_tuple); auto end_bmk = std::get<2>(delete_bitmap_tuple); + // the key range of to be removed is [start_bmk,end_bmk), + // due to the different definitions of the right boundary, + // so use end_bmk as right boundary when removing local delete bitmap, + // use (end_bmk - 1) as right boundary when removing ms delete bitmap remove(start_bmk, end_bmk); to_delete.emplace_back(std::make_tuple(std::get<0>(start_bmk).to_string(), 0, - std::get<2>(end_bmk))); + std::get<2>(end_bmk) - 1)); } _stale_delete_bitmap.erase(version_str); } } - if (tablet_id == -1 || to_delete.empty()) { + if (tablet_id == -1 || to_delete.empty() || !config::is_cloud_mode()) { return; } CloudStorageEngine& engine = ExecEnv::GetInstance()->storage_engine().to_cloud(); @@ -1229,7 +1236,13 @@ void DeleteBitmap::remove_stale_delete_bitmap_from_queue(const std::vector(it->first) != DeleteBitmap::INVALID_SEGMENT_ID) { + count++; + } + } + return count; } // We cannot just copy the underlying memory to construct a string @@ -1308,4 +1321,5 @@ std::string tablet_state_name(TabletState state) { } } +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/olap/tablet_meta.h b/be/src/olap/tablet_meta.h index d56e529e42bf4b..25f6bcd569be43 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -51,6 +51,7 @@ #include "util/uid_util.h" namespace json2pb { +#include "common/compile_check_begin.h" struct Pb2JsonOptions; } // namespace json2pb @@ -100,7 +101,7 @@ class TabletMeta : 
public MetadataAdder { TabletMeta(); TabletMeta(int64_t table_id, int64_t partition_id, int64_t tablet_id, int64_t replica_id, - int32_t schema_hash, uint64_t shard_id, const TTabletSchema& tablet_schema, + int32_t schema_hash, int32_t shard_id, const TTabletSchema& tablet_schema, uint32_t next_unique_id, const std::unordered_map& col_ordinal_to_unique_id, TabletUid tablet_uid, TTabletType::type tabletType, @@ -140,10 +141,6 @@ class TabletMeta : public MetadataAdder { void to_meta_pb(TabletMetaPB* tablet_meta_pb); void to_json(std::string* json_string, json2pb::Pb2JsonOptions& options); - // Don't use. - // TODO: memory size of TabletSchema cannot be accurately tracked. - // In some places, temporarily use num_columns() as TabletSchema size. - int64_t mem_size() const; size_t tablet_columns_num() const { return _schema->num_columns(); } TabletTypePB tablet_type() const { return _tablet_type; } @@ -156,7 +153,7 @@ class TabletMeta : public MetadataAdder { int64_t replica_id() const; void set_replica_id(int64_t replica_id) { _replica_id = replica_id; } int32_t schema_hash() const; - int16_t shard_id() const; + int32_t shard_id() const; void set_shard_id(int32_t shard_id); int64_t creation_time() const; void set_creation_time(int64_t creation_time); @@ -170,6 +167,12 @@ class TabletMeta : public MetadataAdder { size_t tablet_local_size() const; // Remote disk space occupied by tablet. 
size_t tablet_remote_size() const; + + size_t tablet_local_index_size() const; + size_t tablet_local_segment_size() const; + size_t tablet_remote_index_size() const; + size_t tablet_remote_segment_size() const; + size_t version_count() const; size_t stale_version_count() const; size_t version_count_cross_with_range(const Version& range) const; @@ -236,6 +239,7 @@ class TabletMeta : public MetadataAdder { static void init_column_from_tcolumn(uint32_t unique_id, const TColumn& tcolumn, ColumnPB* column); + DeleteBitmapPtr delete_bitmap_ptr() { return _delete_bitmap; } DeleteBitmap& delete_bitmap() { return *_delete_bitmap; } bool enable_unique_key_merge_on_write() const { return _enable_unique_key_merge_on_write; } @@ -607,7 +611,7 @@ inline int32_t TabletMeta::schema_hash() const { return _schema_hash; } -inline int16_t TabletMeta::shard_id() const { +inline int32_t TabletMeta::shard_id() const { return _shard_id; } @@ -667,6 +671,46 @@ inline size_t TabletMeta::tablet_remote_size() const { return total_size; } +inline size_t TabletMeta::tablet_local_index_size() const { + size_t total_size = 0; + for (auto& rs : _rs_metas) { + if (rs->is_local()) { + total_size += rs->index_disk_size(); + } + } + return total_size; +} + +inline size_t TabletMeta::tablet_local_segment_size() const { + size_t total_size = 0; + for (auto& rs : _rs_metas) { + if (rs->is_local()) { + total_size += rs->data_disk_size(); + } + } + return total_size; +} + +inline size_t TabletMeta::tablet_remote_index_size() const { + size_t total_size = 0; + for (auto& rs : _rs_metas) { + if (!rs->is_local()) { + total_size += rs->index_disk_size(); + } + } + return total_size; +} + +inline size_t TabletMeta::tablet_remote_segment_size() const { + size_t total_size = 0; + for (auto& rs : _rs_metas) { + if (!rs->is_local()) { + total_size += rs->data_disk_size(); + } + } + return total_size; +} + inline size_t TabletMeta::version_count() const { return _rs_metas.size(); } @@ -731,4 +775,5 @@ std::string 
tablet_state_name(TabletState state); bool operator==(const TabletMeta& a, const TabletMeta& b); bool operator!=(const TabletMeta& a, const TabletMeta& b); +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/olap/tablet_meta_manager.cpp b/be/src/olap/tablet_meta_manager.cpp index 6f27dd4db4e672..7c08d7856200f9 100644 --- a/be/src/olap/tablet_meta_manager.cpp +++ b/be/src/olap/tablet_meta_manager.cpp @@ -291,8 +291,7 @@ Status TabletMetaManager::remove_old_version_delete_bitmap(DataDir* store, TTabl return true; }; LOG(INFO) << "remove old version delete bitmap, tablet_id: " << tablet_id - << " version: " << version << " removed keys size: " << remove_keys.size(); - ; + << " version: " << version << ", removed keys size: " << remove_keys.size(); RETURN_IF_ERROR(meta->iterate(META_COLUMN_FAMILY_INDEX, begin_key, get_remove_keys_func)); return meta->remove(META_COLUMN_FAMILY_INDEX, remove_keys); } diff --git a/be/src/olap/tablet_reader.cpp b/be/src/olap/tablet_reader.cpp index 7410b70f4aa471..a83e0bfdbf4c30 100644 --- a/be/src/olap/tablet_reader.cpp +++ b/be/src/olap/tablet_reader.cpp @@ -464,13 +464,39 @@ Status TabletReader::_init_orderby_keys_param(const ReaderParams& read_params) { // UNIQUE_KEYS will compare all keys as before if (_tablet_schema->keys_type() == DUP_KEYS || (_tablet_schema->keys_type() == UNIQUE_KEYS && _tablet->enable_unique_key_merge_on_write())) { - // find index in vector _return_columns - // for the read_orderby_key_num_prefix_columns orderby keys - for (uint32_t i = 0; i < read_params.read_orderby_key_num_prefix_columns; i++) { - for (uint32_t idx = 0; idx < _return_columns.size(); idx++) { - if (_return_columns[idx] == i) { - _orderby_key_columns.push_back(idx); - break; + if (!_tablet_schema->cluster_key_uids().empty()) { + if (read_params.read_orderby_key_num_prefix_columns > + _tablet_schema->cluster_key_uids().size()) { + return Status::Error( + "read_orderby_key_num_prefix_columns={} > 
cluster_keys.size()={}", + read_params.read_orderby_key_num_prefix_columns, + _tablet_schema->cluster_key_uids().size()); + } + for (uint32_t i = 0; i < read_params.read_orderby_key_num_prefix_columns; i++) { + auto cid = _tablet_schema->cluster_key_uids()[i]; + auto index = _tablet_schema->field_index(cid); + if (index < 0) { + return Status::Error( + "could not find cluster key column with unique_id=" + + std::to_string(cid) + + " in tablet schema, tablet_id=" + std::to_string(_tablet->tablet_id())); + } + for (uint32_t idx = 0; idx < _return_columns.size(); idx++) { + if (_return_columns[idx] == index) { + _orderby_key_columns.push_back(idx); + break; + } + } + } + } else { + // find index in vector _return_columns + // for the read_orderby_key_num_prefix_columns orderby keys + for (uint32_t i = 0; i < read_params.read_orderby_key_num_prefix_columns; i++) { + for (uint32_t idx = 0; idx < _return_columns.size(); idx++) { + if (_return_columns[idx] == i) { + _orderby_key_columns.push_back(idx); + break; + } } } } @@ -579,8 +605,7 @@ ColumnPredicate* TabletReader::_parse_to_predicate( return nullptr; } const TabletColumn& column = materialize_column(_tablet_schema->column(index)); - return create_column_predicate(index, bloom_filter.second, column.type(), - _reader_context.runtime_state->be_exec_version(), &column); + return create_column_predicate(index, bloom_filter.second, column.type(), &column); } ColumnPredicate* TabletReader::_parse_to_predicate( @@ -590,8 +615,7 @@ ColumnPredicate* TabletReader::_parse_to_predicate( return nullptr; } const TabletColumn& column = materialize_column(_tablet_schema->column(index)); - return create_column_predicate(index, in_filter.second, column.type(), - _reader_context.runtime_state->be_exec_version(), &column); + return create_column_predicate(index, in_filter.second, column.type(), &column); } ColumnPredicate* TabletReader::_parse_to_predicate( @@ -601,8 +625,7 @@ ColumnPredicate* TabletReader::_parse_to_predicate( return 
nullptr; } const TabletColumn& column = materialize_column(_tablet_schema->column(index)); - return create_column_predicate(index, bitmap_filter.second, column.type(), - _reader_context.runtime_state->be_exec_version(), &column); + return create_column_predicate(index, bitmap_filter.second, column.type(), &column); } ColumnPredicate* TabletReader::_parse_to_predicate(const FunctionFilter& function_filter) { @@ -612,8 +635,7 @@ ColumnPredicate* TabletReader::_parse_to_predicate(const FunctionFilter& functio } const TabletColumn& column = materialize_column(_tablet_schema->column(index)); return create_column_predicate(index, std::make_shared(function_filter), - column.type(), _reader_context.runtime_state->be_exec_version(), - &column); + column.type(), &column); } Status TabletReader::_init_delete_condition(const ReaderParams& read_params) { diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index c4f96e2214853d..3ec5d22166477f 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -41,8 +41,6 @@ #include "olap/tablet_column_object_pool.h" #include "olap/types.h" #include "olap/utils.h" -#include "runtime/memory/lru_cache_policy.h" -#include "runtime/thread_context.h" #include "tablet_meta.h" #include "vec/aggregate_functions/aggregate_function_simple_factory.h" #include "vec/aggregate_functions/aggregate_function_state_union.h" @@ -975,10 +973,10 @@ void TabletSchema::init_from_pb(const TabletSchemaPB& schema, bool ignore_extrac _indexes.clear(); _field_name_to_index.clear(); _field_id_to_index.clear(); - _cluster_key_idxes.clear(); + _cluster_key_uids.clear(); clear_column_cache_handlers(); - for (const auto& i : schema.cluster_key_idxes()) { - _cluster_key_idxes.push_back(i); + for (const auto& i : schema.cluster_key_uids()) { + _cluster_key_uids.push_back(i); } for (auto& column_pb : schema.column()) { TabletColumnPtr column; @@ -1126,10 +1124,10 @@ void TabletSchema::build_current_tablet_schema(int64_t 
index_id, int32_t version _sequence_col_idx = -1; _version_col_idx = -1; _skip_bitmap_col_idx = -1; - _cluster_key_idxes.clear(); + _cluster_key_uids.clear(); clear_column_cache_handlers(); - for (const auto& i : ori_tablet_schema._cluster_key_idxes) { - _cluster_key_idxes.push_back(i); + for (const auto& i : ori_tablet_schema._cluster_key_uids) { + _cluster_key_uids.push_back(i); } for (auto& column : index->columns) { if (column->is_key()) { @@ -1237,8 +1235,8 @@ void TabletSchema::reserve_extracted_columns() { } void TabletSchema::to_schema_pb(TabletSchemaPB* tablet_schema_pb) const { - for (const auto& i : _cluster_key_idxes) { - tablet_schema_pb->add_cluster_key_idxes(i); + for (const auto& i : _cluster_key_uids) { + tablet_schema_pb->add_cluster_key_uids(i); } tablet_schema_pb->set_keys_type(_keys_type); for (const auto& col : _cols) { diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 5fb3deafd77319..c813d6f0ef8722 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -332,10 +332,8 @@ class TabletSchema : public MetadataAdder { void copy_from(const TabletSchema& tablet_schema); void update_index_info_from(const TabletSchema& tablet_schema); std::string to_key() const; - // Don't use. - // TODO: memory size of TabletSchema cannot be accurately tracked. - // In some places, temporarily use num_columns() as TabletSchema size. - int64_t mem_size() const { return _mem_size; } + // get_metadata_size is only the memory of the TabletSchema itself, not include child objects. 
+ int64_t mem_size() const { return get_metadata_size(); } size_t row_size() const; int32_t field_index(const std::string& field_name) const; int32_t field_index(const vectorized::PathInData& path) const; @@ -351,7 +349,7 @@ class TabletSchema : public MetadataAdder { const std::vector& columns() const; size_t num_columns() const { return _num_columns; } size_t num_key_columns() const { return _num_key_columns; } - const std::vector& cluster_key_idxes() const { return _cluster_key_idxes; } + const std::vector& cluster_key_uids() const { return _cluster_key_uids; } size_t num_null_columns() const { return _num_null_columns; } size_t num_short_key_columns() const { return _num_short_key_columns; } size_t num_rows_per_row_block() const { return _num_rows_per_row_block; } @@ -550,7 +548,7 @@ class TabletSchema : public MetadataAdder { size_t _num_columns = 0; size_t _num_variant_columns = 0; size_t _num_key_columns = 0; - std::vector _cluster_key_idxes; + std::vector _cluster_key_uids; size_t _num_null_columns = 0; size_t _num_short_key_columns = 0; size_t _num_rows_per_row_block = 0; @@ -573,7 +571,6 @@ class TabletSchema : public MetadataAdder { int64_t _db_id = -1; bool _disable_auto_compaction = false; bool _enable_single_replica_compaction = false; - int64_t _mem_size = 0; bool _store_row_column = false; bool _skip_write_index_on_load = false; InvertedIndexStorageFormatPB _inverted_index_storage_format = InvertedIndexStorageFormatPB::V1; diff --git a/be/src/olap/tablet_schema_cache.cpp b/be/src/olap/tablet_schema_cache.cpp index fd238fa5affb3f..e044ef9c0426f4 100644 --- a/be/src/olap/tablet_schema_cache.cpp +++ b/be/src/olap/tablet_schema_cache.cpp @@ -56,7 +56,7 @@ std::pair TabletSchemaCache::insert(const std: tablet_schema_ptr->init_from_pb(pb, false, true); value->tablet_schema = tablet_schema_ptr; lru_handle = LRUCachePolicy::insert(key_signature, value, tablet_schema_ptr->num_columns(), - 0, CachePriority::NORMAL); + tablet_schema_ptr->mem_size(), 
CachePriority::NORMAL); g_tablet_schema_cache_count << 1; g_tablet_schema_cache_columns_count << tablet_schema_ptr->num_columns(); } diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index fc3a69fd5cde52..fa8d9b8248e3f4 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -45,6 +44,7 @@ #include "gutil/strings/split.h" #include "gutil/strings/strip.h" #include "http/http_client.h" +#include "http/utils.h" #include "io/fs/file_system.h" #include "io/fs/local_file_system.h" #include "io/fs/path.h" @@ -64,6 +64,7 @@ #include "util/debug_points.h" #include "util/defer_op.h" #include "util/network_util.h" +#include "util/security.h" #include "util/stopwatch.hpp" #include "util/thrift_rpc_helper.h" #include "util/trace.h" @@ -82,8 +83,9 @@ namespace { /// if not equal, then return error /// return value: if binlog file not exist, then return to binlog file path Result check_dest_binlog_valid(const std::string& tablet_dir, + const std::string& clone_dir, const std::string& clone_file, bool* skip_link_file) { - std::string to; + std::string from, to; std::string new_clone_file = clone_file; if (clone_file.ends_with(".binlog")) { // change clone_file suffix from .binlog to .dat @@ -92,6 +94,7 @@ Result check_dest_binlog_valid(const std::string& tablet_dir, // change clone_file suffix from .binlog-index to .idx new_clone_file.replace(clone_file.size() - 13, 13, ".idx"); } + from = fmt::format("{}/{}", clone_dir, clone_file); to = fmt::format("{}/_binlog/{}", tablet_dir, new_clone_file); // check to to file exist @@ -106,10 +109,10 @@ Result check_dest_binlog_valid(const std::string& tablet_dir, } LOG(WARNING) << "binlog file already exist. 
" - << "tablet_dir=" << tablet_dir << ", clone_file=" << clone_file; + << "tablet_dir=" << tablet_dir << ", clone_file=" << from << ", to=" << to; std::string clone_file_md5sum; - status = io::global_local_filesystem()->md5sum(clone_file, &clone_file_md5sum); + status = io::global_local_filesystem()->md5sum(from, &clone_file_md5sum); if (!status.ok()) { return ResultError(std::move(status)); } @@ -397,28 +400,62 @@ Status EngineCloneTask::_make_and_download_snapshots(DataDir& data_dir, .error(st); } }}; - std::string remote_url_prefix; + + std::string remote_dir; { std::stringstream ss; if (snapshot_path->back() == '/') { - ss << "http://" << get_host_port(src.host, src.http_port) << HTTP_REQUEST_PREFIX - << HTTP_REQUEST_TOKEN_PARAM << token << HTTP_REQUEST_FILE_PARAM << *snapshot_path - << _clone_req.tablet_id << "/" << _clone_req.schema_hash << "/"; + ss << *snapshot_path << _clone_req.tablet_id << "/" << _clone_req.schema_hash + << "/"; } else { - ss << "http://" << get_host_port(src.host, src.http_port) << HTTP_REQUEST_PREFIX - << HTTP_REQUEST_TOKEN_PARAM << token << HTTP_REQUEST_FILE_PARAM << *snapshot_path - << "/" << _clone_req.tablet_id << "/" << _clone_req.schema_hash << "/"; + ss << *snapshot_path << "/" << _clone_req.tablet_id << "/" << _clone_req.schema_hash + << "/"; } - remote_url_prefix = ss.str(); + remote_dir = ss.str(); } - status = _download_files(&data_dir, remote_url_prefix, local_data_path); - if (!status.ok()) [[unlikely]] { - LOG_WARNING("failed to download snapshot from remote BE") - .tag("url", _mask_token(remote_url_prefix)) - .error(status); - continue; // Try another BE + std::string address = get_host_port(src.host, src.http_port); + if (config::enable_batch_download && is_support_batch_download(address).ok()) { + // download files via batch api. 
+ LOG_INFO("remote BE supports batch download, use batch file download") + .tag("address", address) + .tag("remote_dir", remote_dir); + status = _batch_download_files(&data_dir, address, remote_dir, local_data_path); + if (!status.ok()) [[unlikely]] { + LOG_WARNING("failed to download snapshot from remote BE in batch") + .tag("address", address) + .tag("remote_dir", remote_dir) + .error(status); + continue; // Try another BE + } + } else { + if (config::enable_batch_download) { + LOG_INFO("remote BE does not support batch download, use single file download") + .tag("address", address) + .tag("remote_dir", remote_dir); + } else { + LOG_INFO("batch download is disabled, use single file download") + .tag("address", address) + .tag("remote_dir", remote_dir); + } + + std::string remote_url_prefix; + { + std::stringstream ss; + ss << "http://" << address << HTTP_REQUEST_PREFIX << HTTP_REQUEST_TOKEN_PARAM + << token << HTTP_REQUEST_FILE_PARAM << remote_dir; + remote_url_prefix = ss.str(); + } + + status = _download_files(&data_dir, remote_url_prefix, local_data_path); + if (!status.ok()) [[unlikely]] { + LOG_WARNING("failed to download snapshot from remote BE") + .tag("url", mask_token(remote_url_prefix)) + .error(status); + continue; // Try another BE + } } + // No need to try again with another BE _pending_rs_guards = DORIS_TRY(_engine.snapshot_mgr()->convert_rowset_ids( local_data_path, _clone_req.tablet_id, _clone_req.replica_id, _clone_req.table_id, @@ -512,7 +549,7 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re // If the header file is not exist, the table couldn't loaded by olap engine. // Avoid of data is not complete, we copy the header file at last. // The header file's name is end of .hdr. 
- for (int i = 0; i < file_name_list.size() - 1; ++i) { + for (int i = 0; i + 1 < file_name_list.size(); ++i) { if (file_name_list[i].ends_with(".hdr")) { std::swap(file_name_list[i], file_name_list[file_name_list.size() - 1]); break; @@ -552,11 +589,11 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re std::string local_file_path = local_path + "/" + file_name; - LOG(INFO) << "clone begin to download file from: " << _mask_token(remote_file_url) + LOG(INFO) << "clone begin to download file from: " << mask_token(remote_file_url) << " to: " << local_file_path << ". size(B): " << file_size << ", timeout(s): " << estimate_timeout; - auto download_cb = [this, &remote_file_url, estimate_timeout, &local_file_path, + auto download_cb = [&remote_file_url, estimate_timeout, &local_file_path, file_size](HttpClient* client) { RETURN_IF_ERROR(client->init(remote_file_url)); client->set_timeout_ms(estimate_timeout * 1000); @@ -572,7 +609,7 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re } if (local_file_size != file_size) { LOG(WARNING) << "download file length error" - << ", remote_path=" << _mask_token(remote_file_url) + << ", remote_path=" << mask_token(remote_file_url) << ", file_size=" << file_size << ", local_file_size=" << local_file_size; return Status::InternalError("downloaded file size is not equal"); @@ -591,16 +628,94 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re } _copy_size = (int64_t)total_file_size; _copy_time_ms = (int64_t)total_time_ms; - LOG(INFO) << "succeed to copy tablet " << _signature << ", total file size: " << total_file_size - << " B" - << ", cost: " << total_time_ms << " ms" + LOG(INFO) << "succeed to copy tablet " << _signature + << ", total files: " << file_name_list.size() + << ", total file size: " << total_file_size << " B, cost: " << total_time_ms << " ms" << ", rate: " << copy_rate << " MB/s"; return Status::OK(); } +Status 
EngineCloneTask::_batch_download_files(DataDir* data_dir, const std::string& address, + const std::string& remote_dir, + const std::string& local_dir) { + constexpr size_t BATCH_FILE_SIZE = 64 << 20; // 64MB + constexpr size_t BATCH_FILE_NUM = 64; + + // Check local path exist, if exist, remove it, then create the dir + // local_file_full_path = tabletid/clone, for a specific tablet, there should be only one folder + // if this folder exists, then should remove it + // for example, BE clone from BE 1 to download file 1 with version (2,2), but clone from BE 1 failed + // then it will try to clone from BE 2, but it will find the file 1 already exist, but file 1 with same + // name may have different versions. + RETURN_IF_ERROR(io::global_local_filesystem()->delete_directory(local_dir)); + RETURN_IF_ERROR(io::global_local_filesystem()->create_directory(local_dir)); + + const std::string& token = _cluster_info->token; + std::vector> file_info_list; + RETURN_IF_ERROR(list_remote_files_v2(address, token, remote_dir, &file_info_list)); + + // If the header file is not exist, the table couldn't loaded by olap engine. + // Avoid of data is not complete, we copy the header file at last. + // The header file's name is end of .hdr. + for (int i = 0; i + 1 < file_info_list.size(); ++i) { + if (file_info_list[i].first.ends_with(".hdr")) { + std::swap(file_info_list[i], file_info_list[file_info_list.size() - 1]); + break; + } + } + + MonotonicStopWatch watch; + watch.start(); + + size_t total_file_size = 0; + size_t total_files = file_info_list.size(); + std::vector> batch_files; + for (size_t i = 0; i < total_files;) { + size_t batch_file_size = 0; + for (size_t j = i; j < total_files; j++) { + // Split batches by file number and file size, + if (BATCH_FILE_NUM <= batch_files.size() || BATCH_FILE_SIZE <= batch_file_size || + // ... or separate the last .hdr file into a single batch.
+ (j + 1 == total_files && !batch_files.empty())) { + break; + } + batch_files.push_back(file_info_list[j]); + batch_file_size += file_info_list[j].second; + } + + // check disk capacity + if (data_dir->reach_capacity_limit(batch_file_size)) { + return Status::Error( + "reach the capacity limit of path {}, file_size={}", data_dir->path(), + batch_file_size); + } + + RETURN_IF_ERROR(download_files_v2(address, token, remote_dir, local_dir, batch_files)); + + total_file_size += batch_file_size; + i += batch_files.size(); + batch_files.clear(); + } + + uint64_t total_time_ms = watch.elapsed_time() / 1000 / 1000; + total_time_ms = total_time_ms > 0 ? total_time_ms : 0; + double copy_rate = 0.0; + if (total_time_ms > 0) { + copy_rate = total_file_size / ((double)total_time_ms) / 1000; + } + _copy_size = (int64_t)total_file_size; + _copy_time_ms = (int64_t)total_time_ms; + LOG(INFO) << "succeed to copy tablet " << _signature + << ", total files: " << file_info_list.size() + << ", total file size: " << total_file_size << " B, cost: " << total_time_ms << " ms" + << ", rate: " << copy_rate << " MB/s"; + + return Status::OK(); +} + /// This method will only be called if tablet already exist in this BE when doing clone. /// This method will do the following things: -/// 1. Linke all files from CLONE dir to tablet dir if file does not exist in tablet dir +/// 1. Link all files from CLONE dir to tablet dir if file does not exist in tablet dir /// 2. Call _finish_xx_clone() to revise the tablet meta. 
Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_dir, int64_t version, bool is_incremental_clone) { @@ -697,7 +812,6 @@ Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_d continue; } - auto from = fmt::format("{}/{}", clone_dir, clone_file); std::string to; if (clone_file.ends_with(".binlog") || clone_file.ends_with(".binlog-index")) { if (!contain_binlog) { @@ -706,7 +820,8 @@ Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_d break; } - if (auto&& result = check_dest_binlog_valid(tablet_dir, clone_file, &skip_link_file); + if (auto&& result = + check_dest_binlog_valid(tablet_dir, clone_dir, clone_file, &skip_link_file); result) { to = std::move(result.value()); } else { @@ -718,6 +833,7 @@ Status EngineCloneTask::_finish_clone(Tablet* tablet, const std::string& clone_d } if (!skip_link_file) { + auto from = fmt::format("{}/{}", clone_dir, clone_file); status = io::global_local_filesystem()->link_file(from, to); if (!status.ok()) { return status; @@ -864,9 +980,4 @@ Status EngineCloneTask::_finish_full_clone(Tablet* tablet, // TODO(plat1ko): write cooldown meta to remote if this replica is cooldown replica } -std::string EngineCloneTask::_mask_token(const std::string& str) { - std::regex pattern("token=[\\w|-]+"); - return regex_replace(str, pattern, "token=******"); -} - } // namespace doris diff --git a/be/src/olap/task/engine_clone_task.h b/be/src/olap/task/engine_clone_task.h index 9290ed9552ecf9..e2ced28f03c88d 100644 --- a/be/src/olap/task/engine_clone_task.h +++ b/be/src/olap/task/engine_clone_task.h @@ -79,6 +79,9 @@ class EngineCloneTask final : public EngineTask { Status _download_files(DataDir* data_dir, const std::string& remote_url_prefix, const std::string& local_path); + Status _batch_download_files(DataDir* data_dir, const std::string& endpoint, + const std::string& remote_dir, const std::string& local_dir); + Status _make_snapshot(const std::string& ip, int 
port, TTableId tablet_id, TSchemaHash schema_hash, int timeout_s, const std::vector& missing_versions, std::string* snapshot_path, @@ -86,8 +89,6 @@ class EngineCloneTask final : public EngineTask { Status _release_snapshot(const std::string& ip, int port, const std::string& snapshot_path); - std::string _mask_token(const std::string& str); - private: StorageEngine& _engine; const TCloneReq& _clone_req; diff --git a/be/src/olap/task/engine_storage_migration_task.cpp b/be/src/olap/task/engine_storage_migration_task.cpp index a300e6e0f09fa3..210aa6a8c56f08 100644 --- a/be/src/olap/task/engine_storage_migration_task.cpp +++ b/be/src/olap/task/engine_storage_migration_task.cpp @@ -409,8 +409,9 @@ Status EngineStorageMigrationTask::_copy_index_and_data_files( InvertedIndexStorageFormatPB::V1) { for (const auto& index : tablet_schema.inverted_indexes()) { auto index_id = index->index_id(); - auto index_file = - _tablet->get_segment_index_filepath(rowset_id, segment_index, index_id); + auto index_file = InvertedIndexDescriptor::get_index_file_path_v1( + InvertedIndexDescriptor::get_index_file_path_prefix(segment_file_path), + index_id, index->get_index_suffix()); auto snapshot_segment_index_file_path = fmt::format("{}/{}_{}_{}.binlog-index", full_path, rowset_id, segment_index, index_id); diff --git a/be/src/olap/txn_manager.cpp b/be/src/olap/txn_manager.cpp index d227f53053128b..c54b9c5e8f980f 100644 --- a/be/src/olap/txn_manager.cpp +++ b/be/src/olap/txn_manager.cpp @@ -548,8 +548,9 @@ Status TxnManager::publish_txn(OlapMeta* meta, TPartitionId partition_id, if (!status.ok()) { return Status::Error( "add rowset to binlog failed. 
when publish txn rowset_id: {}, tablet id: {}, " - "txn id: {}", - rowset->rowset_id().to_string(), tablet_id, transaction_id); + "txn id: {}, status: {}", + rowset->rowset_id().to_string(), tablet_id, transaction_id, + status.to_string_no_stack()); } } diff --git a/be/src/olap/txn_manager.h b/be/src/olap/txn_manager.h index 88ee97c5f6a3b9..1994dec949414f 100644 --- a/be/src/olap/txn_manager.h +++ b/be/src/olap/txn_manager.h @@ -87,6 +87,12 @@ struct TabletTxnInfo { std::shared_ptr publish_status; TxnPublishInfo publish_info; + // for cloud only, used to calculate delete bitmap for txn load + bool is_txn_load = false; + std::vector invisible_rowsets; + int64_t lock_id; + int64_t next_visible_version; + TxnState state {TxnState::PREPARED}; TabletTxnInfo() = default; diff --git a/be/src/pipeline/common/set_utils.h b/be/src/pipeline/common/set_utils.h index 2caf5b7d0b814c..38a82a501ff534 100644 --- a/be/src/pipeline/common/set_utils.h +++ b/be/src/pipeline/common/set_utils.h @@ -20,13 +20,20 @@ #include #include -#include "pipeline/exec/join/join_op.h" #include "vec/common/hash_table/hash_map_util.h" namespace doris { +struct RowRefWithFlag { + bool visited; + uint32_t row_num = 0; + RowRefWithFlag() = default; + RowRefWithFlag(size_t row_num_count, bool is_visited = false) + : visited(is_visited), row_num(row_num_count) {} +}; + template -using SetData = PHHashMap>; +using SetData = PHHashMap>; template using SetFixedKeyHashTableContext = vectorized::MethodKeysFixed>; @@ -39,9 +46,8 @@ using SetPrimaryTypeHashTableContextNullable = vectorized::MethodSingleNullableC vectorized::MethodOneNumber>>>; using SetSerializedHashTableContext = - vectorized::MethodSerialized>; -using SetMethodOneString = - vectorized::MethodStringNoCache>; + vectorized::MethodSerialized>; +using SetMethodOneString = vectorized::MethodStringNoCache>; using SetHashTableVariants = std::variant lc(le_lock); if (exchanger->_running_source_operators.fetch_sub(1) == 1) { _set_always_ready(); - 
exchanger->finalize(local_state); + exchanger->finalize(); } } diff --git a/be/src/pipeline/dependency.h b/be/src/pipeline/dependency.h index 4cc3aceaeebdfa..f1cfe2b02977e1 100644 --- a/be/src/pipeline/dependency.h +++ b/be/src/pipeline/dependency.h @@ -34,6 +34,7 @@ #include "pipeline/common/set_utils.h" #include "pipeline/exec/data_queue.h" #include "pipeline/exec/join/process_hash_table_probe.h" +#include "util/stack_util.h" #include "vec/common/sort/partition_sorter.h" #include "vec/common/sort/sorter.h" #include "vec/core/block.h" @@ -107,7 +108,7 @@ class Dependency : public std::enable_shared_from_this { // Which dependency current pipeline task is blocked by. `nullptr` if this dependency is ready. [[nodiscard]] virtual Dependency* is_blocked_by(PipelineTask* task = nullptr); // Notify downstream pipeline tasks this dependency is ready. - void set_ready(); + virtual void set_ready(); void set_ready_to_read() { DCHECK_EQ(_shared_state->source_deps.size(), 1) << debug_string(); _shared_state->source_deps.front()->set_ready(); @@ -571,14 +572,12 @@ struct AnalyticSharedState : public BasicSharedState { int64_t current_row_position = 0; BlockRowPos partition_by_end; - vectorized::VExprContextSPtrs partition_by_eq_expr_ctxs; int64_t input_total_rows = 0; BlockRowPos all_block_end; std::vector input_blocks; bool input_eos = false; BlockRowPos found_partition_end; std::vector origin_cols; - vectorized::VExprContextSPtrs order_by_eq_expr_ctxs; std::vector input_block_first_row_positions; std::vector> agg_input_columns; @@ -759,7 +758,7 @@ struct LocalExchangeSharedState : public BasicSharedState { } } void sub_running_sink_operators(); - void sub_running_source_operators(LocalExchangeSourceLocalState& local_state); + void sub_running_source_operators(); void _set_always_ready() { for (auto& dep : source_deps) { DCHECK(dep); diff --git a/be/src/pipeline/exec/aggregation_sink_operator.cpp b/be/src/pipeline/exec/aggregation_sink_operator.cpp index 
27400fba474eef..44e58535b75b71 100644 --- a/be/src/pipeline/exec/aggregation_sink_operator.cpp +++ b/be/src/pipeline/exec/aggregation_sink_operator.cpp @@ -231,7 +231,6 @@ void AggSinkLocalState::_update_memusage_with_serialized_key() { COUNTER_SET(_memory_used_counter, arena_memory_usage + hash_table_memory_usage); - COUNTER_SET(_peak_memory_usage_counter, _memory_used_counter->value()); COUNTER_SET(_serialize_key_arena_memory_usage, arena_memory_usage); COUNTER_SET(_hash_table_memory_usage, hash_table_memory_usage); @@ -415,7 +414,6 @@ Status AggSinkLocalState::_merge_without_key(vectorized::Block* block) { void AggSinkLocalState::_update_memusage_without_key() { int64_t arena_memory_usage = _agg_arena_pool->size(); COUNTER_SET(_memory_used_counter, arena_memory_usage); - COUNTER_SET(_peak_memory_usage_counter, arena_memory_usage); COUNTER_SET(_serialize_key_arena_memory_usage, arena_memory_usage); } diff --git a/be/src/pipeline/exec/analytic_sink_operator.cpp b/be/src/pipeline/exec/analytic_sink_operator.cpp index abde34a1d0255b..7cc25eef9446d6 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.cpp +++ b/be/src/pipeline/exec/analytic_sink_operator.cpp @@ -60,15 +60,14 @@ Status AnalyticSinkLocalState::open(RuntimeState* state) { _agg_expr_ctxs[i][j]->root()->data_type()->create_column(); } } - _shared_state->partition_by_eq_expr_ctxs.resize(p._partition_by_eq_expr_ctxs.size()); - for (size_t i = 0; i < _shared_state->partition_by_eq_expr_ctxs.size(); i++) { - RETURN_IF_ERROR(p._partition_by_eq_expr_ctxs[i]->clone( - state, _shared_state->partition_by_eq_expr_ctxs[i])); - } - _shared_state->order_by_eq_expr_ctxs.resize(p._order_by_eq_expr_ctxs.size()); - for (size_t i = 0; i < _shared_state->order_by_eq_expr_ctxs.size(); i++) { + _partition_by_eq_expr_ctxs.resize(p._partition_by_eq_expr_ctxs.size()); + for (size_t i = 0; i < _partition_by_eq_expr_ctxs.size(); i++) { RETURN_IF_ERROR( - p._order_by_eq_expr_ctxs[i]->clone(state, 
_shared_state->order_by_eq_expr_ctxs[i])); + p._partition_by_eq_expr_ctxs[i]->clone(state, _partition_by_eq_expr_ctxs[i])); + } + _order_by_eq_expr_ctxs.resize(p._order_by_eq_expr_ctxs.size()); + for (size_t i = 0; i < _order_by_eq_expr_ctxs.size(); i++) { + RETURN_IF_ERROR(p._order_by_eq_expr_ctxs[i]->clone(state, _order_by_eq_expr_ctxs[i])); } return Status::OK(); } @@ -80,11 +79,11 @@ bool AnalyticSinkLocalState::_whether_need_next_partition(BlockRowPos& found_par shared_state.partition_by_end.pos)) { //now still have partition data return false; } - if ((shared_state.partition_by_eq_expr_ctxs.empty() && !shared_state.input_eos) || + if ((_partition_by_eq_expr_ctxs.empty() && !shared_state.input_eos) || (found_partition_end.pos == 0)) { //no partition, get until fetch to EOS return true; } - if (!shared_state.partition_by_eq_expr_ctxs.empty() && + if (!_partition_by_eq_expr_ctxs.empty() && found_partition_end.pos == shared_state.all_block_end.pos && !shared_state.input_eos) { //current partition data calculate done return true; @@ -177,13 +176,13 @@ BlockRowPos AnalyticSinkLocalState::_get_partition_by_end() { return shared_state.partition_by_end; } - if (shared_state.partition_by_eq_expr_ctxs.empty() || + if (_partition_by_eq_expr_ctxs.empty() || (shared_state.input_total_rows == 0)) { //no partition_by, the all block is end return shared_state.all_block_end; } BlockRowPos cal_end = shared_state.all_block_end; - for (size_t i = 0; i < shared_state.partition_by_eq_expr_ctxs.size(); + for (size_t i = 0; i < _partition_by_eq_expr_ctxs.size(); ++i) { //have partition_by, binary search the partiton end cal_end = _compare_row_to_find_end(shared_state.partition_by_column_idxs[i], shared_state.partition_by_end, cal_end); @@ -303,10 +302,10 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block } { SCOPED_TIMER(local_state._compute_partition_by_timer); - for (size_t i = 0; i < local_state._shared_state->partition_by_eq_expr_ctxs.size(); ++i) 
{ + for (size_t i = 0; i < local_state._partition_by_eq_expr_ctxs.size(); ++i) { int result_col_id = -1; - RETURN_IF_ERROR(local_state._shared_state->partition_by_eq_expr_ctxs[i]->execute( - input_block, &result_col_id)); + RETURN_IF_ERROR(local_state._partition_by_eq_expr_ctxs[i]->execute(input_block, + &result_col_id)); DCHECK_GE(result_col_id, 0); local_state._shared_state->partition_by_column_idxs[i] = result_col_id; } @@ -314,17 +313,16 @@ Status AnalyticSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block { SCOPED_TIMER(local_state._compute_order_by_timer); - for (size_t i = 0; i < local_state._shared_state->order_by_eq_expr_ctxs.size(); ++i) { + for (size_t i = 0; i < local_state._order_by_eq_expr_ctxs.size(); ++i) { int result_col_id = -1; - RETURN_IF_ERROR(local_state._shared_state->order_by_eq_expr_ctxs[i]->execute( - input_block, &result_col_id)); + RETURN_IF_ERROR( + local_state._order_by_eq_expr_ctxs[i]->execute(input_block, &result_col_id)); DCHECK_GE(result_col_id, 0); local_state._shared_state->ordey_by_column_idxs[i] = result_col_id; } } COUNTER_UPDATE(local_state._memory_used_counter, input_block->allocated_bytes()); - COUNTER_SET(local_state._peak_memory_usage_counter, local_state._memory_used_counter->value()); //TODO: if need improvement, the is a tips to maintain a free queue, //so the memory could reuse, no need to new/delete again; diff --git a/be/src/pipeline/exec/analytic_sink_operator.h b/be/src/pipeline/exec/analytic_sink_operator.h index e04b220ee351e7..0ff7c4e4e047bd 100644 --- a/be/src/pipeline/exec/analytic_sink_operator.h +++ b/be/src/pipeline/exec/analytic_sink_operator.h @@ -63,6 +63,8 @@ class AnalyticSinkLocalState : public PipelineXSinkLocalState _agg_expr_ctxs; + vectorized::VExprContextSPtrs _partition_by_eq_expr_ctxs; + vectorized::VExprContextSPtrs _order_by_eq_expr_ctxs; }; class AnalyticSinkOperatorX final : public DataSinkOperatorX { diff --git a/be/src/pipeline/exec/analytic_source_operator.cpp 
b/be/src/pipeline/exec/analytic_source_operator.cpp index 2e041ab45d20bf..fe0ab0b148e55a 100644 --- a/be/src/pipeline/exec/analytic_source_operator.cpp +++ b/be/src/pipeline/exec/analytic_source_operator.cpp @@ -122,13 +122,15 @@ BlockRowPos AnalyticLocalState::_get_partition_by_end() { return shared_state.partition_by_end; } - if (shared_state.partition_by_eq_expr_ctxs.empty() || + const auto partition_exprs_size = + _parent->cast()._partition_exprs_size; + if (partition_exprs_size == 0 || (shared_state.input_total_rows == 0)) { //no partition_by, the all block is end return shared_state.all_block_end; } BlockRowPos cal_end = shared_state.all_block_end; - for (size_t i = 0; i < shared_state.partition_by_eq_expr_ctxs.size(); + for (size_t i = 0; i < partition_exprs_size; ++i) { //have partition_by, binary search the partiton end cal_end = _compare_row_to_find_end(shared_state.partition_by_column_idxs[i], shared_state.partition_by_end, cal_end); @@ -144,12 +146,13 @@ bool AnalyticLocalState::_whether_need_next_partition(BlockRowPos& found_partiti shared_state.partition_by_end.pos)) { //now still have partition data return false; } - if ((shared_state.partition_by_eq_expr_ctxs.empty() && !shared_state.input_eos) || + const auto partition_exprs_size = + _parent->cast()._partition_exprs_size; + if ((partition_exprs_size == 0 && !shared_state.input_eos) || (found_partition_end.pos == 0)) { //no partition, get until fetch to EOS return true; } - if (!shared_state.partition_by_eq_expr_ctxs.empty() && - found_partition_end.pos == shared_state.all_block_end.pos && + if (partition_exprs_size != 0 && found_partition_end.pos == shared_state.all_block_end.pos && !shared_state.input_eos) { //current partition data calculate done return true; } @@ -349,17 +352,17 @@ Status AnalyticLocalState::_get_next_for_rows(size_t current_block_rows) { int64_t range_start, range_end; if (!_parent->cast()._window.__isset.window_start && _parent->cast()._window.window_end.type == - 
TAnalyticWindowBoundaryType:: - CURRENT_ROW) { //[preceding, current_row],[current_row, following] + TAnalyticWindowBoundaryType::CURRENT_ROW) { + // [preceding, current_row], [current_row, following] rewrite it's same + // as could reuse the previous calculate result, so don't call _reset_agg_status function + // going on calculate, add up data, no need to reset state range_start = _shared_state->current_row_position; - range_end = _shared_state->current_row_position + - 1; //going on calculate,add up data, no need to reset state + range_end = _shared_state->current_row_position + 1; } else { _reset_agg_status(); range_end = _shared_state->current_row_position + _rows_end_offset + 1; - if (!_parent->cast() - ._window.__isset - .window_start) { //[preceding, offset] --unbound: [preceding, following] + //[preceding, offset] --unbound: [preceding, following] + if (!_parent->cast()._window.__isset.window_start) { range_start = _partition_by_start.pos; } else { range_start = _shared_state->current_row_position + _rows_start_offset; @@ -401,7 +404,7 @@ Status AnalyticLocalState::_get_next_for_range(size_t current_block_rows) { void AnalyticLocalState::_update_order_by_range() { _order_by_start = _order_by_end; _order_by_end = _shared_state->partition_by_end; - for (size_t i = 0; i < _shared_state->order_by_eq_expr_ctxs.size(); ++i) { + for (size_t i = 0; i < _parent->cast()._order_by_exprs_size; ++i) { _order_by_end = _compare_row_to_find_end(_shared_state->ordey_by_column_idxs[i], _order_by_start, _order_by_end, true); } @@ -476,7 +479,9 @@ AnalyticSourceOperatorX::AnalyticSourceOperatorX(ObjectPool* pool, const TPlanNo _has_window(tnode.analytic_node.__isset.window), _has_range_window(tnode.analytic_node.window.type == TAnalyticWindowType::RANGE), _has_window_start(tnode.analytic_node.window.__isset.window_start), - _has_window_end(tnode.analytic_node.window.__isset.window_end) { + _has_window_end(tnode.analytic_node.window.__isset.window_end), + 
_partition_exprs_size(tnode.analytic_node.partition_exprs.size()), + _order_by_exprs_size(tnode.analytic_node.order_by_exprs.size()) { _is_serial_operator = tnode.__isset.is_serial_operator && tnode.is_serial_operator; _fn_scope = AnalyticFnScope::PARTITION; if (tnode.analytic_node.__isset.window && diff --git a/be/src/pipeline/exec/analytic_source_operator.h b/be/src/pipeline/exec/analytic_source_operator.h index 8f44b77f567e55..56c664cec6193b 100644 --- a/be/src/pipeline/exec/analytic_source_operator.h +++ b/be/src/pipeline/exec/analytic_source_operator.h @@ -150,6 +150,8 @@ class AnalyticSourceOperatorX final : public OperatorX { size_t _align_aggregate_states = 1; std::vector _change_to_nullable_flags; + const size_t _partition_exprs_size; + const size_t _order_by_exprs_size; }; } // namespace pipeline diff --git a/be/src/pipeline/exec/assert_num_rows_operator.h b/be/src/pipeline/exec/assert_num_rows_operator.h index dcc64f57878d38..a7408d695928c5 100644 --- a/be/src/pipeline/exec/assert_num_rows_operator.h +++ b/be/src/pipeline/exec/assert_num_rows_operator.h @@ -20,6 +20,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class AssertNumRowsLocalState final : public PipelineXLocalState { public: @@ -55,4 +56,5 @@ class AssertNumRowsOperatorX final : public StreamingOperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/exec/cache_source_operator.h b/be/src/pipeline/exec/cache_source_operator.h index e764323846b153..146c984d04aa3f 100644 --- a/be/src/pipeline/exec/cache_source_operator.h +++ b/be/src/pipeline/exec/cache_source_operator.h @@ -25,6 +25,7 @@ #include "pipeline/query_cache/query_cache.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace vectorized { @@ -101,4 +102,5 @@ class CacheSourceOperatorX final : public OperatorX { }; } // namespace pipeline +#include 
"common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/data_queue.h b/be/src/pipeline/exec/data_queue.h index f5bd84cc278d0a..d97f58c0debdb6 100644 --- a/be/src/pipeline/exec/data_queue.h +++ b/be/src/pipeline/exec/data_queue.h @@ -29,6 +29,7 @@ #include "vec/core/block.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class Dependency; @@ -108,4 +109,5 @@ class DataQueue { SpinLock _source_lock; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/datagen_operator.h b/be/src/pipeline/exec/datagen_operator.h index bada5ec4080d08..ffc2c6f946fb3a 100644 --- a/be/src/pipeline/exec/datagen_operator.h +++ b/be/src/pipeline/exec/datagen_operator.h @@ -24,6 +24,7 @@ #include "pipeline/exec/operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -70,4 +71,5 @@ class DataGenSourceOperatorX final : public OperatorX { std::vector _runtime_filter_descs; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/pipeline/exec/empty_set_operator.cpp b/be/src/pipeline/exec/empty_set_operator.cpp index 7233e46dfd1e52..2dfe9701558da0 100644 --- a/be/src/pipeline/exec/empty_set_operator.cpp +++ b/be/src/pipeline/exec/empty_set_operator.cpp @@ -22,6 +22,7 @@ #include "pipeline/exec/operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status EmptySetSourceOperatorX::get_block(RuntimeState* state, vectorized::Block* block, bool* eos) { @@ -29,4 +30,5 @@ Status EmptySetSourceOperatorX::get_block(RuntimeState* state, vectorized::Block return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/empty_set_operator.h b/be/src/pipeline/exec/empty_set_operator.h index 6b200bfdbde249..d8e920b256494d 100644 --- a/be/src/pipeline/exec/empty_set_operator.h +++ 
b/be/src/pipeline/exec/empty_set_operator.h @@ -22,6 +22,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class EmptySetLocalState final : public PipelineXLocalState { public: @@ -43,4 +44,5 @@ class EmptySetSourceOperatorX final : public OperatorX { [[nodiscard]] bool is_source() const override { return true; } }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/es_scan_operator.h b/be/src/pipeline/exec/es_scan_operator.h index 2ae562e4fc7f32..6e64110997e3af 100644 --- a/be/src/pipeline/exec/es_scan_operator.h +++ b/be/src/pipeline/exec/es_scan_operator.h @@ -26,6 +26,7 @@ #include "pipeline/exec/scan_operator.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class NewEsScanner; @@ -86,4 +87,5 @@ class EsScanOperatorX final : public ScanOperatorX { std::vector _column_names; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/exchange_sink_buffer.cpp b/be/src/pipeline/exec/exchange_sink_buffer.cpp index 0f02ffc2b9a4b1..e3f895444d4168 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.cpp +++ b/be/src/pipeline/exec/exchange_sink_buffer.cpp @@ -47,6 +47,7 @@ #include "vec/sink/vdata_stream_sender.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { BroadcastPBlockHolder::~BroadcastPBlockHolder() { @@ -87,19 +88,22 @@ void BroadcastPBlockHolderMemLimiter::release(const BroadcastPBlockHolder& holde } // namespace vectorized namespace pipeline { -ExchangeSinkBuffer::ExchangeSinkBuffer(PUniqueId query_id, PlanNodeId dest_node_id, int send_id, - int be_number, RuntimeState* state, - ExchangeSinkLocalState* parent) +ExchangeSinkBuffer::ExchangeSinkBuffer(PUniqueId query_id, PlanNodeId dest_node_id, + RuntimeState* state, + const std::vector& sender_ins_ids) : HasTaskExecutionCtx(state), _queue_capacity(0), - _is_finishing(false), + 
_is_failed(false), _query_id(std::move(query_id)), _dest_node_id(dest_node_id), - _sender_id(send_id), - _be_number(be_number), _state(state), _context(state->get_query_ctx()), - _parent(parent) {} + _exchange_sink_num(sender_ins_ids.size()) { + for (auto sender_ins_id : sender_ins_ids) { + _queue_deps.emplace(sender_ins_id, nullptr); + _parents.emplace(sender_ins_id, nullptr); + } +} void ExchangeSinkBuffer::close() { // Could not clear the queue here, because there maybe a running rpc want to @@ -110,8 +114,8 @@ void ExchangeSinkBuffer::close() { //_instance_to_request.clear(); } -void ExchangeSinkBuffer::register_sink(TUniqueId fragment_instance_id) { - if (_is_finishing) { +void ExchangeSinkBuffer::construct_request(TUniqueId fragment_instance_id) { + if (_is_failed) { return; } auto low_id = fragment_instance_id.lo; @@ -129,22 +133,27 @@ void ExchangeSinkBuffer::register_sink(TUniqueId fragment_instance_id) { finst_id.set_hi(fragment_instance_id.hi); finst_id.set_lo(fragment_instance_id.lo); _rpc_channel_is_idle[low_id] = true; - _instance_to_receiver_eof[low_id] = false; + _rpc_channel_is_turn_off[low_id] = false; _instance_to_rpc_stats_vec.emplace_back(std::make_shared(low_id)); _instance_to_rpc_stats[low_id] = _instance_to_rpc_stats_vec.back().get(); - _construct_request(low_id, finst_id); + _instance_to_request[low_id] = std::make_shared(); + _instance_to_request[low_id]->mutable_finst_id()->CopyFrom(finst_id); + _instance_to_request[low_id]->mutable_query_id()->CopyFrom(_query_id); + + _instance_to_request[low_id]->set_node_id(_dest_node_id); + _running_sink_count[low_id] = _exchange_sink_num; } Status ExchangeSinkBuffer::add_block(TransmitInfo&& request) { - if (_is_finishing) { + if (_is_failed) { return Status::OK(); } - auto ins_id = request.channel->_fragment_instance_id.lo; + auto ins_id = request.channel->dest_ins_id(); if (!_instance_to_package_queue_mutex.contains(ins_id)) { return Status::InternalError("fragment_instance_id {} not do 
register_sink", print_id(request.channel->_fragment_instance_id)); } - if (_is_receiver_eof(ins_id)) { + if (_rpc_channel_is_turn_off[ins_id]) { return Status::EndOfFile("receiver eof"); } bool send_now = false; @@ -158,14 +167,15 @@ Status ExchangeSinkBuffer::add_block(TransmitInfo&& request) { if (request.block) { RETURN_IF_ERROR( BeExecVersionManager::check_be_exec_version(request.block->be_exec_version())); - COUNTER_UPDATE(_parent->memory_used_counter(), request.block->ByteSizeLong()); - COUNTER_SET(_parent->peak_memory_usage_counter(), - _parent->memory_used_counter()->value()); + COUNTER_UPDATE(request.channel->_parent->memory_used_counter(), + request.block->ByteSizeLong()); } _instance_to_package_queue[ins_id].emplace(std::move(request)); _total_queue_size++; - if (_queue_dependency && _total_queue_size > _queue_capacity) { - _queue_dependency->block(); + if (_total_queue_size > _queue_capacity) { + for (auto& [_, dep] : _queue_deps) { + dep->block(); + } } } if (send_now) { @@ -176,15 +186,15 @@ Status ExchangeSinkBuffer::add_block(TransmitInfo&& request) { } Status ExchangeSinkBuffer::add_block(BroadcastTransmitInfo&& request) { - if (_is_finishing) { + if (_is_failed) { return Status::OK(); } - auto ins_id = request.channel->_fragment_instance_id.lo; + auto ins_id = request.channel->dest_ins_id(); if (!_instance_to_package_queue_mutex.contains(ins_id)) { return Status::InternalError("fragment_instance_id {} not do register_sink", print_id(request.channel->_fragment_instance_id)); } - if (_is_receiver_eof(ins_id)) { + if (_rpc_channel_is_turn_off[ins_id]) { return Status::EndOfFile("receiver eof"); } bool send_now = false; @@ -211,16 +221,17 @@ Status ExchangeSinkBuffer::add_block(BroadcastTransmitInfo&& request) { Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { std::unique_lock lock(*_instance_to_package_queue_mutex[id]); - DCHECK(_rpc_channel_is_idle[id] == false); - std::queue>& q = _instance_to_package_queue[id]; std::queue>& broadcast_q = 
_instance_to_broadcast_package_queue[id]; - if (_is_finishing) { + if (_is_failed) { _turn_off_channel(id, lock); return Status::OK(); } + if (_rpc_channel_is_turn_off[id]) { + return Status::OK(); + } if (!q.empty()) { // If we have data to shuffle which is not broadcasted @@ -228,6 +239,8 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { auto& brpc_request = _instance_to_request[id]; brpc_request->set_eos(request.eos); brpc_request->set_packet_seq(_instance_to_seq[id]++); + brpc_request->set_sender_id(request.channel->_parent->sender_id()); + brpc_request->set_be_number(request.channel->_parent->be_number()); if (request.block && !request.block->column_metas().empty()) { brpc_request->set_allocated_block(request.block.get()); } @@ -273,14 +286,16 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { } else if (!s.ok()) { _failed(id, fmt::format("exchange req success but status isn't ok: {}", s.to_string())); + return; } else if (eos) { _ended(id); - } else { - s = _send_rpc(id); - if (!s) { - _failed(id, fmt::format("exchange req success but status isn't ok: {}", - s.to_string())); - } + } + // The eos here only indicates that the current exchange sink has reached eos. + // However, the queue still contains data from other exchange sinks, so RPCs need to continue being sent. 
+ s = _send_rpc(id); + if (!s) { + _failed(id, + fmt::format("exchange req success but status isn't ok: {}", s.to_string())); } }); { @@ -298,13 +313,16 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { } } if (request.block) { - COUNTER_UPDATE(_parent->memory_used_counter(), -request.block->ByteSizeLong()); + COUNTER_UPDATE(request.channel->_parent->memory_used_counter(), + -request.block->ByteSizeLong()); static_cast(brpc_request->release_block()); } q.pop(); _total_queue_size--; - if (_queue_dependency && _total_queue_size <= _queue_capacity) { - _queue_dependency->set_ready(); + if (_total_queue_size <= _queue_capacity) { + for (auto& [_, dep] : _queue_deps) { + dep->set_ready(); + } } } else if (!broadcast_q.empty()) { // If we have data to shuffle which is broadcasted @@ -312,6 +330,8 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { auto& brpc_request = _instance_to_request[id]; brpc_request->set_eos(request.eos); brpc_request->set_packet_seq(_instance_to_seq[id]++); + brpc_request->set_sender_id(request.channel->_parent->sender_id()); + brpc_request->set_be_number(request.channel->_parent->be_number()); if (request.block_holder->get_block() && !request.block_holder->get_block()->column_metas().empty()) { brpc_request->set_allocated_block(request.block_holder->get_block()); @@ -354,14 +374,17 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { } else if (!s.ok()) { _failed(id, fmt::format("exchange req success but status isn't ok: {}", s.to_string())); + return; } else if (eos) { _ended(id); - } else { - s = _send_rpc(id); - if (!s) { - _failed(id, fmt::format("exchange req success but status isn't ok: {}", - s.to_string())); - } + } + + // The eos here only indicates that the current exchange sink has reached eos. + // However, the queue still contains data from other exchange sinks, so RPCs need to continue being sent. 
+ s = _send_rpc(id); + if (!s) { + _failed(id, + fmt::format("exchange req success but status isn't ok: {}", s.to_string())); } }); { @@ -389,16 +412,6 @@ Status ExchangeSinkBuffer::_send_rpc(InstanceLoId id) { return Status::OK(); } -void ExchangeSinkBuffer::_construct_request(InstanceLoId id, PUniqueId finst_id) { - _instance_to_request[id] = std::make_shared(); - _instance_to_request[id]->mutable_finst_id()->CopyFrom(finst_id); - _instance_to_request[id]->mutable_query_id()->CopyFrom(_query_id); - - _instance_to_request[id]->set_node_id(_dest_node_id); - _instance_to_request[id]->set_sender_id(_sender_id); - _instance_to_request[id]->set_be_number(_be_number); -} - void ExchangeSinkBuffer::_ended(InstanceLoId id) { if (!_instance_to_package_queue_mutex.template contains(id)) { std::stringstream ss; @@ -413,24 +426,29 @@ void ExchangeSinkBuffer::_ended(InstanceLoId id) { __builtin_unreachable(); } else { std::unique_lock lock(*_instance_to_package_queue_mutex[id]); - _turn_off_channel(id, lock); + _running_sink_count[id]--; + if (_running_sink_count[id] == 0) { + _turn_off_channel(id, lock); + } } } void ExchangeSinkBuffer::_failed(InstanceLoId id, const std::string& err) { - _is_finishing = true; + _is_failed = true; _context->cancel(Status::Cancelled(err)); } void ExchangeSinkBuffer::_set_receiver_eof(InstanceLoId id) { std::unique_lock lock(*_instance_to_package_queue_mutex[id]); - _instance_to_receiver_eof[id] = true; + // When the receiving side reaches eof, it means the receiver has finished early. + // The remaining data in the current rpc_channel does not need to be sent, + // and the rpc_channel should be turned off immediately. 
_turn_off_channel(id, lock); std::queue>& broadcast_q = _instance_to_broadcast_package_queue[id]; for (; !broadcast_q.empty(); broadcast_q.pop()) { if (broadcast_q.front().block_holder->get_block()) { - COUNTER_UPDATE(_parent->memory_used_counter(), + COUNTER_UPDATE(broadcast_q.front().channel->_parent->memory_used_counter(), -broadcast_q.front().block_holder->get_block()->ByteSizeLong()); } } @@ -442,7 +460,8 @@ void ExchangeSinkBuffer::_set_receiver_eof(InstanceLoId id) { std::queue>& q = _instance_to_package_queue[id]; for (; !q.empty(); q.pop()) { if (q.front().block) { - COUNTER_UPDATE(_parent->memory_used_counter(), -q.front().block->ByteSizeLong()); + COUNTER_UPDATE(q.front().channel->_parent->memory_used_counter(), + -q.front().block->ByteSizeLong()); } } @@ -452,22 +471,22 @@ void ExchangeSinkBuffer::_set_receiver_eof(InstanceLoId id) { } } -bool ExchangeSinkBuffer::_is_receiver_eof(InstanceLoId id) { - std::unique_lock lock(*_instance_to_package_queue_mutex[id]); - return _instance_to_receiver_eof[id]; -} - // The unused parameter `with_lock` is to ensure that the function is called when the lock is held. void ExchangeSinkBuffer::_turn_off_channel(InstanceLoId id, std::unique_lock& /*with_lock*/) { if (!_rpc_channel_is_idle[id]) { _rpc_channel_is_idle[id] = true; } - _instance_to_receiver_eof[id] = true; - + // Ensure that each RPC is turned off only once. 
+ if (_rpc_channel_is_turn_off[id]) { + return; + } + _rpc_channel_is_turn_off[id] = true; auto weak_task_ctx = weak_task_exec_ctx(); if (auto pip_ctx = weak_task_ctx.lock()) { - _parent->on_channel_finished(id); + for (auto& [_, parent] : _parents) { + parent->on_channel_finished(id); + } } } @@ -511,7 +530,7 @@ void ExchangeSinkBuffer::update_profile(RuntimeProfile* profile) { auto* _max_rpc_timer = ADD_TIMER_WITH_LEVEL(profile, "RpcMaxTime", 1); auto* _min_rpc_timer = ADD_TIMER(profile, "RpcMinTime"); auto* _sum_rpc_timer = ADD_TIMER(profile, "RpcSumTime"); - auto* _count_rpc = ADD_COUNTER_WITH_LEVEL(profile, "RpcCount", TUnit::UNIT, 1); + auto* _count_rpc = ADD_COUNTER(profile, "RpcCount", TUnit::UNIT); auto* _avg_rpc_timer = ADD_TIMER(profile, "RpcAvgTime"); int64_t max_rpc_time = 0, min_rpc_time = 0; @@ -558,4 +577,5 @@ void ExchangeSinkBuffer::update_profile(RuntimeProfile* profile) { } } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/exchange_sink_buffer.h b/be/src/pipeline/exec/exchange_sink_buffer.h index 22a1452f8d545c..458c7c3f66e3ee 100644 --- a/be/src/pipeline/exec/exchange_sink_buffer.h +++ b/be/src/pipeline/exec/exchange_sink_buffer.h @@ -40,6 +40,7 @@ #include "util/ref_count_closure.h" namespace doris { +#include "common/compile_check_begin.h" class PTransmitDataParams; class TUniqueId; @@ -169,13 +170,61 @@ class ExchangeSendCallback : public ::doris::DummyBrpcCallback { bool _eos; }; -// Each ExchangeSinkOperator have one ExchangeSinkBuffer -class ExchangeSinkBuffer final : public HasTaskExecutionCtx { +// ExchangeSinkBuffer can either be shared among multiple ExchangeSinkLocalState instances +// or be individually owned by each ExchangeSinkLocalState. +// The following describes the scenario where ExchangeSinkBuffer is shared among multiple ExchangeSinkLocalState instances. 
+// Of course, individual ownership can be seen as a special case where only one ExchangeSinkLocalState shares the buffer. + +// A sink buffer contains multiple rpc_channels. +// Each rpc_channel corresponds to a target instance on the receiving side. +// Data is sent using a ping-pong mode within each rpc_channel, +// meaning that at most one RPC can exist in a single rpc_channel at a time. +// The next RPC can only be sent after the previous one has completed. +// +// Each exchange sink sends data to all target instances on the receiving side. +// If the concurrency is 3, a single rpc_channel will be used simultaneously by three exchange sinks. + +/* + +-----------+ +-----------+ +-----------+ + |dest ins id| |dest ins id| |dest ins id| + | | | | | | + +----+------+ +-----+-----+ +------+----+ + | | | + | | | + +----------------+ +----------------+ +----------------+ + | | | | | | + sink buffer -------- | rpc_channel | | rpc_channel | | rpc_channel | + | | | | | | + +-------+--------+ +----------------+ +----------------+ + | | | + |------------------------+----------------------+ + | | | + | | | + +-----------------+ +-------+---------+ +-------+---------+ + | | | | | | + | exchange sink | | exchange sink | | exchange sink | + | | | | | | + +-----------------+ +-----------------+ +-----------------+ +*/ + +#ifdef BE_TEST +void transmit_blockv2(PBackendService_Stub& stub, + std::unique_ptr>> + closure); +#endif +class ExchangeSinkBuffer : public HasTaskExecutionCtx { public: - ExchangeSinkBuffer(PUniqueId query_id, PlanNodeId dest_node_id, int send_id, int be_number, - RuntimeState* state, ExchangeSinkLocalState* parent); + ExchangeSinkBuffer(PUniqueId query_id, PlanNodeId dest_node_id, RuntimeState* state, + const std::vector& sender_ins_ids); + +#ifdef BE_TEST + ExchangeSinkBuffer(RuntimeState* state, int64_t sinknum) + : HasTaskExecutionCtx(state), _exchange_sink_num(sinknum) {}; +#endif ~ExchangeSinkBuffer() override = default; - void 
register_sink(TUniqueId); + + void construct_request(TUniqueId); Status add_block(TransmitInfo&& request); Status add_block(BroadcastTransmitInfo&& request); @@ -183,17 +232,18 @@ class ExchangeSinkBuffer final : public HasTaskExecutionCtx { void update_rpc_time(InstanceLoId id, int64_t start_rpc_time, int64_t receive_rpc_time); void update_profile(RuntimeProfile* profile); - void set_dependency(std::shared_ptr queue_dependency, - std::shared_ptr finish_dependency) { - _queue_dependency = queue_dependency; - _finish_dependency = finish_dependency; - } - - void set_broadcast_dependency(std::shared_ptr broadcast_dependency) { - _broadcast_dependency = broadcast_dependency; + void set_dependency(InstanceLoId sender_ins_id, std::shared_ptr queue_dependency, + ExchangeSinkLocalState* local_state) { + DCHECK(_queue_deps.contains(sender_ins_id)); + DCHECK(_parents.contains(sender_ins_id)); + _queue_deps[sender_ins_id] = queue_dependency; + _parents[sender_ins_id] = local_state; } - +#ifdef BE_TEST +public: +#else private: +#endif friend class ExchangeSinkLocalState; phmap::flat_hash_map> @@ -214,7 +264,10 @@ class ExchangeSinkBuffer final : public HasTaskExecutionCtx { // One channel is corresponding to a downstream instance. phmap::flat_hash_map _rpc_channel_is_idle; - phmap::flat_hash_map _instance_to_receiver_eof; + // There could be multiple situations that cause an rpc_channel to be turned off, + // such as receiving the eof, manual cancellation by the user, or all sinks reaching eos. + // Therefore, it is necessary to prevent an rpc_channel from being turned off multiple times. 
+ phmap::flat_hash_map _rpc_channel_is_turn_off; struct RpcInstanceStatistics { RpcInstanceStatistics(InstanceLoId id) : inst_lo_id(id) {} InstanceLoId inst_lo_id; @@ -226,33 +279,45 @@ class ExchangeSinkBuffer final : public HasTaskExecutionCtx { std::vector> _instance_to_rpc_stats_vec; phmap::flat_hash_map _instance_to_rpc_stats; - std::atomic _is_finishing; + // It is set to true only when an RPC fails. Currently, we do not have an error retry mechanism. + // If an RPC error occurs, the query will be canceled. + std::atomic _is_failed; PUniqueId _query_id; PlanNodeId _dest_node_id; - // Sender instance id, unique within a fragment. StreamSender save the variable - int _sender_id; - int _be_number; std::atomic _rpc_count = 0; RuntimeState* _state = nullptr; QueryContext* _context = nullptr; Status _send_rpc(InstanceLoId); - // must hold the _instance_to_package_queue_mutex[id] mutex to opera - void _construct_request(InstanceLoId id, PUniqueId); + +#ifndef BE_TEST inline void _ended(InstanceLoId id); inline void _failed(InstanceLoId id, const std::string& err); inline void _set_receiver_eof(InstanceLoId id); - inline bool _is_receiver_eof(InstanceLoId id); inline void _turn_off_channel(InstanceLoId id, std::unique_lock& with_lock); + +#else + virtual void _ended(InstanceLoId id); + virtual void _failed(InstanceLoId id, const std::string& err); + virtual void _set_receiver_eof(InstanceLoId id); + virtual void _turn_off_channel(InstanceLoId id, std::unique_lock& with_lock); +#endif + void get_max_min_rpc_time(int64_t* max_time, int64_t* min_time); int64_t get_sum_rpc_time(); std::atomic _total_queue_size = 0; - std::shared_ptr _queue_dependency = nullptr; - std::shared_ptr _finish_dependency = nullptr; - std::shared_ptr _broadcast_dependency = nullptr; - ExchangeSinkLocalState* _parent = nullptr; + + // _running_sink_count is used to track how many sinks have not finished yet. + // It is only decremented when eos is reached. 
+ phmap::flat_hash_map _running_sink_count; + // _queue_deps is used for memory control. + phmap::flat_hash_map> _queue_deps; + // The ExchangeSinkLocalState in _parents is only used in _turn_off_channel. + phmap::flat_hash_map _parents; + const int64_t _exchange_sink_num; }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/exchange_sink_operator.cpp b/be/src/pipeline/exec/exchange_sink_operator.cpp index 1f91af01aa1f6b..e7fed76be8fa16 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.cpp +++ b/be/src/pipeline/exec/exchange_sink_operator.cpp @@ -32,10 +32,13 @@ #include "pipeline/exec/operator.h" #include "pipeline/exec/sort_source_operator.h" #include "pipeline/local_exchange/local_exchange_sink_operator.h" +#include "pipeline/local_exchange/local_exchange_source_operator.h" +#include "pipeline/pipeline_fragment_context.h" #include "util/runtime_profile.h" #include "util/uid_util.h" #include "vec/columns/column_const.h" #include "vec/exprs/vexpr.h" +#include "vec/sink/tablet_sink_hash_partitioner.h" namespace doris::pipeline { #include "common/compile_check_begin.h" @@ -59,6 +62,7 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf _local_send_timer = ADD_TIMER(_profile, "LocalSendTime"); _split_block_hash_compute_timer = ADD_TIMER(_profile, "SplitBlockHashComputeTime"); _distribute_rows_into_channels_timer = ADD_TIMER(_profile, "DistributeRowsIntoChannelsTime"); + _send_new_partition_timer = ADD_TIMER(_profile, "SendNewPartitionTime"); _blocks_sent_counter = ADD_COUNTER_WITH_LEVEL(_profile, "BlocksProduced", TUnit::UNIT, 1); _overall_throughput = _profile->add_derived_counter( "OverallThroughput", TUnit::BYTES_PER_SECOND, @@ -99,6 +103,76 @@ Status ExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& inf fmt::format("WaitForLocalExchangeBuffer{}", i), TUnit ::TIME_NS, timer_name, 1)); } _wait_broadcast_buffer_timer = 
ADD_CHILD_TIMER(_profile, "WaitForBroadcastBuffer", timer_name); + + size_t local_size = 0; + for (int i = 0; i < channels.size(); ++i) { + if (channels[i]->is_local()) { + local_size++; + _last_local_channel_idx = i; + } + } + only_local_exchange = local_size == channels.size(); + _rpc_channels_num = channels.size() - local_size; + + if (!only_local_exchange) { + _sink_buffer = p.get_sink_buffer(state->fragment_instance_id().lo); + register_channels(_sink_buffer.get()); + _queue_dependency = Dependency::create_shared(_parent->operator_id(), _parent->node_id(), + "ExchangeSinkQueueDependency", true); + _sink_buffer->set_dependency(state->fragment_instance_id().lo, _queue_dependency, this); + } + + if (_part_type == TPartitionType::HASH_PARTITIONED) { + _partition_count = channels.size(); + _partitioner = + std::make_unique>( + channels.size()); + RETURN_IF_ERROR(_partitioner->init(p._texprs)); + RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); + _profile->add_info_string("Partitioner", + fmt::format("Crc32HashPartitioner({})", _partition_count)); + } else if (_part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED) { + _partition_count = channels.size(); + _partitioner = + std::make_unique>( + channels.size()); + RETURN_IF_ERROR(_partitioner->init(p._texprs)); + RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); + _profile->add_info_string("Partitioner", + fmt::format("Crc32HashPartitioner({})", _partition_count)); + } else if (_part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED) { + _partition_count = channels.size(); + _profile->add_info_string("Partitioner", + fmt::format("Crc32HashPartitioner({})", _partition_count)); + _partitioner = std::make_unique( + _partition_count, p._tablet_sink_txn_id, p._tablet_sink_schema, + p._tablet_sink_partition, p._tablet_sink_location, p._tablet_sink_tuple_id, this); + RETURN_IF_ERROR(_partitioner->init({})); + RETURN_IF_ERROR(_partitioner->prepare(state, {})); + } else if (_part_type == 
TPartitionType::TABLE_SINK_HASH_PARTITIONED) { + _partition_count = + channels.size() * config::table_sink_partition_write_max_partition_nums_per_writer; + _partitioner = std::make_unique( + channels.size(), _partition_count, channels.size(), 1, + config::table_sink_partition_write_min_partition_data_processed_rebalance_threshold / + state->task_num() == + 0 + ? config::table_sink_partition_write_min_partition_data_processed_rebalance_threshold + : config::table_sink_partition_write_min_partition_data_processed_rebalance_threshold / + state->task_num(), + config::table_sink_partition_write_min_data_processed_rebalance_threshold / + state->task_num() == + 0 + ? config::table_sink_partition_write_min_data_processed_rebalance_threshold + : config::table_sink_partition_write_min_data_processed_rebalance_threshold / + state->task_num()); + + RETURN_IF_ERROR(_partitioner->init(p._texprs)); + RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); + _profile->add_info_string("Partitioner", + fmt::format("Crc32HashPartitioner({})", _partition_count)); + } + return Status::OK(); } @@ -124,6 +198,7 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { SCOPED_TIMER(exec_time_counter()); SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(Base::open(state)); + _writer.reset(new Writer()); auto& p = _parent->cast(); if (_part_type == TPartitionType::UNPARTITIONED || _part_type == TPartitionType::RANDOM || @@ -132,39 +207,24 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { std::mt19937 g(rd()); shuffle(channels.begin(), channels.end(), g); } - size_t local_size = 0; for (int i = 0; i < channels.size(); ++i) { RETURN_IF_ERROR(channels[i]->open(state)); if (channels[i]->is_local()) { - local_size++; _last_local_channel_idx = i; } } - only_local_exchange = local_size == channels.size(); - - _rpc_channels_num = channels.size() - local_size; PUniqueId id; id.set_hi(_state->query_id().hi); id.set_lo(_state->query_id().lo); - if (!only_local_exchange) { - _sink_buffer = 
std::make_unique(id, p._dest_node_id, _sender_id, - _state->be_number(), state, this); - register_channels(_sink_buffer.get()); - _queue_dependency = Dependency::create_shared(_parent->operator_id(), _parent->node_id(), - "ExchangeSinkQueueDependency", true); - _sink_buffer->set_dependency(_queue_dependency, _finish_dependency); - } - if ((_part_type == TPartitionType::UNPARTITIONED || channels.size() == 1) && !only_local_exchange) { _broadcast_dependency = Dependency::create_shared( _parent->operator_id(), _parent->node_id(), "BroadcastDependency", true); - _sink_buffer->set_broadcast_dependency(_broadcast_dependency); _broadcast_pb_mem_limiter = vectorized::BroadcastPBlockHolderMemLimiter::create_shared(_broadcast_dependency); - } else if (local_size > 0) { + } else if (!only_local_exchange) { size_t dep_id = 0; for (auto& channel : channels) { if (channel->is_local()) { @@ -179,118 +239,12 @@ Status ExchangeSinkLocalState::open(RuntimeState* state) { } } } - if (_part_type == TPartitionType::HASH_PARTITIONED) { - _partition_count = channels.size(); - _partitioner = - std::make_unique>( - channels.size()); - RETURN_IF_ERROR(_partitioner->init(p._texprs)); - RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); - _profile->add_info_string("Partitioner", - fmt::format("Crc32HashPartitioner({})", _partition_count)); - } else if (_part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED) { - _partition_count = channels.size(); - _partitioner = - std::make_unique>( - channels.size()); - RETURN_IF_ERROR(_partitioner->init(p._texprs)); - RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); - _profile->add_info_string("Partitioner", - fmt::format("Crc32HashPartitioner({})", _partition_count)); - } else if (_part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED) { - _partition_count = channels.size(); - _profile->add_info_string("Partitioner", - fmt::format("Crc32HashPartitioner({})", _partition_count)); - _txn_id = p._tablet_sink_txn_id; - _schema 
= std::make_shared(); - RETURN_IF_ERROR(_schema->init(p._tablet_sink_schema)); - _vpartition = std::make_unique(_schema, p._tablet_sink_partition); - RETURN_IF_ERROR(_vpartition->init()); - auto find_tablet_mode = vectorized::OlapTabletFinder::FindTabletMode::FIND_TABLET_EVERY_ROW; - _tablet_finder = - std::make_unique(_vpartition.get(), find_tablet_mode); - _tablet_sink_tuple_desc = _state->desc_tbl().get_tuple_descriptor(p._tablet_sink_tuple_id); - _tablet_sink_row_desc = p._pool->add(new RowDescriptor(_tablet_sink_tuple_desc, false)); - _tablet_sink_expr_ctxs.resize(p._tablet_sink_expr_ctxs.size()); - for (size_t i = 0; i < _tablet_sink_expr_ctxs.size(); i++) { - RETURN_IF_ERROR(p._tablet_sink_expr_ctxs[i]->clone(state, _tablet_sink_expr_ctxs[i])); - } - // if _part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED, we handle the processing of auto_increment column - // on exchange node rather than on TabletWriter - _block_convertor = - std::make_unique(_tablet_sink_tuple_desc); - _block_convertor->init_autoinc_info(_schema->db_id(), _schema->table_id(), - _state->batch_size()); - _location = p._pool->add(new OlapTableLocationParam(p._tablet_sink_location)); - _row_distribution.init( - {.state = _state, - .block_convertor = _block_convertor.get(), - .tablet_finder = _tablet_finder.get(), - .vpartition = _vpartition.get(), - .add_partition_request_timer = _add_partition_request_timer, - .txn_id = _txn_id, - .pool = p._pool.get(), - .location = _location, - .vec_output_expr_ctxs = &_tablet_sink_expr_ctxs, - .schema = _schema, - .caller = (void*)this, - .create_partition_callback = &ExchangeSinkLocalState::empty_callback_function}); - } else if (_part_type == TPartitionType::TABLE_SINK_HASH_PARTITIONED) { - _partition_count = - channels.size() * config::table_sink_partition_write_max_partition_nums_per_writer; - _partitioner = - std::make_unique>( - _partition_count); - _partition_function = std::make_unique(_partitioner.get()); - - 
scale_writer_partitioning_exchanger = std::make_unique< - vectorized::ScaleWriterPartitioningExchanger>( - channels.size(), *_partition_function, _partition_count, channels.size(), 1, - config::table_sink_partition_write_min_partition_data_processed_rebalance_threshold / - state->task_num() == - 0 - ? config::table_sink_partition_write_min_partition_data_processed_rebalance_threshold - : config::table_sink_partition_write_min_partition_data_processed_rebalance_threshold / - state->task_num(), - config::table_sink_partition_write_min_data_processed_rebalance_threshold / - state->task_num() == - 0 - ? config::table_sink_partition_write_min_data_processed_rebalance_threshold - : config::table_sink_partition_write_min_data_processed_rebalance_threshold / - state->task_num()); - - RETURN_IF_ERROR(_partitioner->init(p._texprs)); - RETURN_IF_ERROR(_partitioner->prepare(state, p._row_desc)); - _profile->add_info_string("Partitioner", - fmt::format("Crc32HashPartitioner({})", _partition_count)); - } if (_part_type == TPartitionType::HASH_PARTITIONED || _part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED || - _part_type == TPartitionType::TABLE_SINK_HASH_PARTITIONED) { + _part_type == TPartitionType::TABLE_SINK_HASH_PARTITIONED || + _part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED) { RETURN_IF_ERROR(_partitioner->open(state)); - } else if (_part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED) { - RETURN_IF_ERROR(_row_distribution.open(_tablet_sink_row_desc)); - } - return Status::OK(); -} - -Status ExchangeSinkLocalState::_send_new_partition_batch() { - if (_row_distribution.need_deal_batching()) { // maybe try_close more than 1 time - RETURN_IF_ERROR(_row_distribution.automatic_create_partition()); - vectorized::Block tmp_block = - _row_distribution._batching_block->to_block(); // Borrow out, for lval ref - auto& p = _parent->cast(); - // these order is unique. - // 1. 
clear batching stats(and flag goes true) so that we won't make a new batching process in dealing batched block. - // 2. deal batched block - // 3. now reuse the column of lval block. cuz write doesn't real adjust it. it generate a new block from that. - _row_distribution.clear_batching_stats(); - RETURN_IF_ERROR(p.sink(_state, &tmp_block, false)); - // Recovery back - _row_distribution._batching_block->set_mutable_columns(tmp_block.mutate_columns()); - _row_distribution._batching_block->clear_column_data(); - _row_distribution._deal_batched = false; } return Status::OK(); } @@ -309,7 +263,8 @@ segment_v2::CompressionTypePB ExchangeSinkLocalState::compression_type() const { ExchangeSinkOperatorX::ExchangeSinkOperatorX( RuntimeState* state, const RowDescriptor& row_desc, int operator_id, - const TDataStreamSink& sink, const std::vector& destinations) + const TDataStreamSink& sink, const std::vector& destinations, + const std::vector& fragment_instance_ids) : DataSinkOperatorX(operator_id, sink.dest_node_id), _texprs(sink.output_partition.partition_exprs), _row_desc(row_desc), @@ -323,7 +278,8 @@ ExchangeSinkOperatorX::ExchangeSinkOperatorX( _tablet_sink_tuple_id(sink.tablet_sink_tuple_id), _tablet_sink_txn_id(sink.tablet_sink_txn_id), _t_tablet_sink_exprs(&sink.tablet_sink_exprs), - _enable_local_merge_sort(state->enable_local_merge_sort()) { + _enable_local_merge_sort(state->enable_local_merge_sort()), + _fragment_instance_ids(fragment_instance_ids) { DCHECK_GT(destinations.size(), 0); DCHECK(sink.output_partition.type == TPartitionType::UNPARTITIONED || sink.output_partition.type == TPartitionType::HASH_PARTITIONED || @@ -368,6 +324,11 @@ Status ExchangeSinkOperatorX::open(RuntimeState* state) { } RETURN_IF_ERROR(vectorized::VExpr::open(_tablet_sink_expr_ctxs, state)); } + std::vector ins_ids; + for (auto fragment_instance_id : _fragment_instance_ids) { + ins_ids.push_back(fragment_instance_id.lo); + } + _sink_buffer = _create_buffer(ins_ids); return Status::OK(); 
} @@ -393,10 +354,6 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block if (all_receiver_eof) { return Status::EndOfFile("all data stream channels EOF"); } - Defer defer([&]() { - COUNTER_SET(local_state._peak_memory_usage_counter, - local_state._memory_used_counter->value()); - }); if (_part_type == TPartitionType::UNPARTITIONED || local_state.channels.size() == 1) { // 1. serialize depends on it is not local exchange @@ -486,112 +443,10 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block local_state.current_channel_idx = (local_state.current_channel_idx + 1) % local_state.channels.size(); } else if (_part_type == TPartitionType::HASH_PARTITIONED || - _part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED) { - auto rows = block->rows(); - { - SCOPED_TIMER(local_state._split_block_hash_compute_timer); - RETURN_IF_ERROR(local_state._partitioner->do_partitioning(state, block)); - } - int64_t old_channel_mem_usage = 0; - for (const auto& channel : local_state.channels) { - old_channel_mem_usage += channel->mem_usage(); - } - if (_part_type == TPartitionType::HASH_PARTITIONED) { - SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); - RETURN_IF_ERROR(channel_add_rows( - state, local_state.channels, local_state._partition_count, - local_state._partitioner->get_channel_ids().get(), rows, block, eos)); - } else { - SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); - RETURN_IF_ERROR(channel_add_rows( - state, local_state.channels, local_state._partition_count, - local_state._partitioner->get_channel_ids().get(), rows, block, eos)); - } - int64_t new_channel_mem_usage = 0; - for (const auto& channel : local_state.channels) { - new_channel_mem_usage += channel->mem_usage(); - } - COUNTER_UPDATE(local_state.memory_used_counter(), - new_channel_mem_usage - old_channel_mem_usage); - COUNTER_SET(local_state.peak_memory_usage_counter(), - local_state.memory_used_counter()->value()); - 
} else if (_part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED) { - int64_t old_channel_mem_usage = 0; - for (const auto& channel : local_state.channels) { - old_channel_mem_usage += channel->mem_usage(); - } - // check out of limit - RETURN_IF_ERROR(local_state._send_new_partition_batch()); - std::shared_ptr convert_block = std::make_shared(); - const auto& num_channels = local_state._partition_count; - std::vector> channel2rows; - channel2rows.resize(num_channels); - auto input_rows = block->rows(); - - if (input_rows > 0) { - bool has_filtered_rows = false; - int64_t filtered_rows = 0; - local_state._number_input_rows += input_rows; - - RETURN_IF_ERROR(local_state._row_distribution.generate_rows_distribution( - *block, convert_block, filtered_rows, has_filtered_rows, - local_state._row_part_tablet_ids, local_state._number_input_rows)); - - const auto& row_ids = local_state._row_part_tablet_ids[0].row_ids; - const auto& tablet_ids = local_state._row_part_tablet_ids[0].tablet_ids; - for (int idx = 0; idx < row_ids.size(); ++idx) { - const auto& row = row_ids[idx]; - const auto& tablet_id_hash = - HashUtil::zlib_crc_hash(&tablet_ids[idx], sizeof(int64), 0); - channel2rows[tablet_id_hash % num_channels].emplace_back(row); - } - } - - if (eos) { - local_state._row_distribution._deal_batched = true; - RETURN_IF_ERROR(local_state._send_new_partition_batch()); - } - { - SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); - // the convert_block maybe different with block after execute exprs - // when send data we still use block - RETURN_IF_ERROR(channel_add_rows_with_idx(state, local_state.channels, num_channels, - channel2rows, block, eos)); - } - int64_t new_channel_mem_usage = 0; - for (const auto& channel : local_state.channels) { - new_channel_mem_usage += channel->mem_usage(); - } - COUNTER_UPDATE(local_state.memory_used_counter(), - new_channel_mem_usage - old_channel_mem_usage); - COUNTER_SET(local_state.peak_memory_usage_counter(), - 
local_state.memory_used_counter()->value()); - } else if (_part_type == TPartitionType::TABLE_SINK_HASH_PARTITIONED) { - int64_t old_channel_mem_usage = 0; - for (const auto& channel : local_state.channels) { - old_channel_mem_usage += channel->mem_usage(); - } - { - SCOPED_TIMER(local_state._split_block_hash_compute_timer); - RETURN_IF_ERROR(local_state._partitioner->do_partitioning(state, block)); - } - std::vector> assignments = - local_state.scale_writer_partitioning_exchanger->accept(block); - { - SCOPED_TIMER(local_state._distribute_rows_into_channels_timer); - RETURN_IF_ERROR(channel_add_rows_with_idx(state, local_state.channels, - local_state.channels.size(), assignments, - block, eos)); - } - - int64_t new_channel_mem_usage = 0; - for (const auto& channel : local_state.channels) { - new_channel_mem_usage += channel->mem_usage(); - } - COUNTER_UPDATE(local_state.memory_used_counter(), - new_channel_mem_usage - old_channel_mem_usage); - COUNTER_SET(local_state.peak_memory_usage_counter(), - local_state.memory_used_counter()->value()); + _part_type == TPartitionType::BUCKET_SHFFULE_HASH_PARTITIONED || + _part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED || + _part_type == TPartitionType::TABLE_SINK_HASH_PARTITIONED) { + RETURN_IF_ERROR(local_state._writer->write(&local_state, state, block, eos)); } else if (_part_type == TPartitionType::TABLE_SINK_RANDOM_PARTITIONED) { // Control the number of channels according to the flow, thereby controlling the number of table sink writers. // 1. 
select channel @@ -639,46 +494,8 @@ Status ExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* block void ExchangeSinkLocalState::register_channels(pipeline::ExchangeSinkBuffer* buffer) { for (auto& channel : channels) { - channel->register_exchange_buffer(buffer); - } -} - -Status ExchangeSinkOperatorX::channel_add_rows( - RuntimeState* state, std::vector>& channels, - size_t num_channels, const uint32_t* __restrict channel_ids, size_t rows, - vectorized::Block* block, bool eos) { - std::vector> channel2rows; - channel2rows.resize(num_channels); - for (uint32_t i = 0; i < rows; i++) { - channel2rows[channel_ids[i]].emplace_back(i); - } - - RETURN_IF_ERROR( - channel_add_rows_with_idx(state, channels, num_channels, channel2rows, block, eos)); - return Status::OK(); -} - -Status ExchangeSinkOperatorX::channel_add_rows_with_idx( - RuntimeState* state, std::vector>& channels, - size_t num_channels, std::vector>& channel2rows, - vectorized::Block* block, bool eos) { - Status status = Status::OK(); - for (int i = 0; i < num_channels; ++i) { - if (!channels[i]->is_receiver_eof() && !channel2rows[i].empty()) { - status = channels[i]->add_rows(block, channel2rows[i], false); - HANDLE_CHANNEL_STATUS(state, channels[i], status); - channel2rows[i].clear(); - } + channel->set_exchange_buffer(buffer); } - if (eos) { - for (int i = 0; i < num_channels; ++i) { - if (!channels[i]->is_receiver_eof()) { - status = channels[i]->add_rows(block, channel2rows[i], true); - HANDLE_CHANNEL_STATUS(state, channels[i], status); - } - } - } - return Status::OK(); } std::string ExchangeSinkLocalState::debug_string(int indentation_level) const { @@ -688,8 +505,8 @@ std::string ExchangeSinkLocalState::debug_string(int indentation_level) const { fmt::format_to(debug_string_buffer, ", Sink Buffer: (_is_finishing = {}, blocks in queue: {}, queue capacity: " "{}, queue dep: {}), _reach_limit: {}, working channels: {}", - _sink_buffer->_is_finishing.load(), 
_sink_buffer->_total_queue_size, - _sink_buffer->_queue_capacity, (void*)_sink_buffer->_queue_dependency.get(), + _sink_buffer->_is_failed.load(), _sink_buffer->_total_queue_size, + _sink_buffer->_queue_capacity, (void*)_queue_dependency.get(), _reach_limit.load(), _working_channels_count.load()); } return fmt::to_string(debug_string_buffer); @@ -699,17 +516,10 @@ Status ExchangeSinkLocalState::close(RuntimeState* state, Status exec_status) { if (_closed) { return Status::OK(); } - if (_part_type == TPartitionType::TABLET_SINK_SHUFFLE_PARTITIONED && - _block_convertor != nullptr && _tablet_finder != nullptr) { - _state->update_num_rows_load_filtered(_block_convertor->num_filtered_rows() + - _tablet_finder->num_filtered_rows()); - _state->update_num_rows_load_unselected( - _tablet_finder->num_immutable_partition_filtered_rows()); - // sink won't see those filtered rows, we should compensate here - _state->set_num_rows_load_total(_state->num_rows_load_filtered() + - _state->num_rows_load_unselected()); - } SCOPED_TIMER(exec_time_counter()); + if (_partitioner) { + RETURN_IF_ERROR(_partitioner->close(state)); + } SCOPED_TIMER(_close_timer); if (_queue_dependency) { COUNTER_UPDATE(_wait_queue_timer, _queue_dependency->watcher_elapse_time()); @@ -743,4 +553,42 @@ DataDistribution ExchangeSinkOperatorX::required_data_distribution() const { return DataSinkOperatorX::required_data_distribution(); } +std::shared_ptr ExchangeSinkOperatorX::_create_buffer( + const std::vector& sender_ins_ids) { + PUniqueId id; + id.set_hi(_state->query_id().hi); + id.set_lo(_state->query_id().lo); + auto sink_buffer = + std::make_unique(id, _dest_node_id, state(), sender_ins_ids); + for (const auto& _dest : _dests) { + sink_buffer->construct_request(_dest.fragment_instance_id); + } + return sink_buffer; +} + +// For a normal shuffle scenario, if the concurrency is n, +// there can be up to n * n RPCs in the current fragment. 
+// Therefore, a shared sink buffer is used here to limit the number of concurrent RPCs. +// (Note: This does not reduce the total number of RPCs.) +// In a merge sort scenario, there are only n RPCs, so a shared sink buffer is not needed. +/// TODO: Modify this to let FE handle the judgment instead of BE. +std::shared_ptr ExchangeSinkOperatorX::get_sink_buffer( + InstanceLoId sender_ins_id) { + if (!_child) { + throw doris::Exception(ErrorCode::INTERNAL_ERROR, + "ExchangeSinkOperatorX did not correctly set the child."); + } + // When the child is SortSourceOperatorX or LocalExchangeSourceOperatorX, + // it is an order-by scenario. + // In this case, there is only one target instance, and no n * n RPC concurrency will occur. + // Therefore, sharing a sink buffer is not necessary. + if (std::dynamic_pointer_cast(_child) || + std::dynamic_pointer_cast(_child)) { + return _create_buffer({sender_ins_id}); + } + if (_state->enable_shared_exchange_sink_buffer()) { + return _sink_buffer; + } + return _create_buffer({sender_ins_id}); +} } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/exchange_sink_operator.h b/be/src/pipeline/exec/exchange_sink_operator.h index 63d50290005470..85575beb9f7e47 100644 --- a/be/src/pipeline/exec/exchange_sink_operator.h +++ b/be/src/pipeline/exec/exchange_sink_operator.h @@ -26,10 +26,12 @@ #include "common/status.h" #include "exchange_sink_buffer.h" #include "operator.h" +#include "pipeline/shuffle/writer.h" #include "vec/sink/scale_writer_partitioning_exchanger.hpp" #include "vec/sink/vdata_stream_sender.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; class TDataSink; @@ -39,20 +41,6 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { ENABLE_FACTORY_CREATOR(ExchangeSinkLocalState); using Base = PipelineXSinkLocalState<>; -private: - class HashPartitionFunction { - public: - HashPartitionFunction(vectorized::PartitionerBase* partitioner) - : _partitioner(partitioner) {} 
- - int get_partition(vectorized::Block* block, int position) { - return _partitioner->get_channel_ids().get()[position]; - } - - private: - vectorized::PartitionerBase* _partitioner; - }; - public: ExchangeSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) : Base(parent, state), _serializer(this) { @@ -61,6 +49,14 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { parent->get_name() + "_FINISH_DEPENDENCY", false); } +#ifdef BE_TEST + ExchangeSinkLocalState(RuntimeState* state) : Base(nullptr, state) { + _profile = state->obj_pool()->add(new RuntimeProfile("mock")); + _memory_used_counter = + _profile->AddHighWaterMarkCounter("MemoryUsage", TUnit::BYTES, "", 1); + } +#endif + std::vector dependencies() const override { std::vector dep_vec; if (_queue_dependency) { @@ -88,31 +84,37 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { bool is_finished() const override { return _reach_limit.load(); } void set_reach_limit() { _reach_limit = true; }; + // sender_id indicates which instance within a fragment, while be_number indicates which instance + // across all fragments. For example, with 3 BEs and 8 instances, the range of sender_id would be 0 to 24, + // and the range of be_number would be from n + 0 to n + 24. + // Since be_number is a required field, it still needs to be set for compatibility with older code. 
[[nodiscard]] int sender_id() const { return _sender_id; } + [[nodiscard]] int be_number() const { return _state->be_number(); } std::string name_suffix() override; segment_v2::CompressionTypePB compression_type() const; std::string debug_string(int indentation_level) const override; - static Status empty_callback_function(void* sender, TCreatePartitionResult* result) { - return Status::OK(); + RuntimeProfile::Counter* send_new_partition_timer() { return _send_new_partition_timer; } + RuntimeProfile::Counter* add_partition_request_timer() { return _add_partition_request_timer; } + RuntimeProfile::Counter* split_block_hash_compute_timer() { + return _split_block_hash_compute_timer; + } + RuntimeProfile::Counter* distribute_rows_into_channels_timer() { + return _distribute_rows_into_channels_timer; } - Status _send_new_partition_batch(); std::vector> channels; int current_channel_idx {0}; // index of current channel to send to if _random == true bool only_local_exchange {false}; void on_channel_finished(InstanceLoId channel_id); - - // for external table sink hash partition - std::unique_ptr> - scale_writer_partitioning_exchanger; + vectorized::PartitionerBase* partitioner() const { return _partitioner.get(); } private: friend class ExchangeSinkOperatorX; friend class vectorized::Channel; friend class vectorized::BlockSerializer; - std::unique_ptr _sink_buffer = nullptr; + std::shared_ptr _sink_buffer = nullptr; RuntimeProfile::Counter* _serialize_batch_timer = nullptr; RuntimeProfile::Counter* _compress_timer = nullptr; RuntimeProfile::Counter* _bytes_sent_counter = nullptr; @@ -127,6 +129,7 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { // Used to counter send bytes under local data exchange RuntimeProfile::Counter* _local_bytes_send_counter = nullptr; RuntimeProfile::Counter* _merge_block_timer = nullptr; + RuntimeProfile::Counter* _send_new_partition_timer = nullptr; RuntimeProfile::Counter* _wait_queue_timer = nullptr; 
RuntimeProfile::Counter* _wait_broadcast_buffer_timer = nullptr; @@ -162,28 +165,16 @@ class ExchangeSinkLocalState final : public PipelineXSinkLocalState<> { */ std::vector> _local_channels_dependency; std::unique_ptr _partitioner; + std::unique_ptr _writer; size_t _partition_count; std::shared_ptr _finish_dependency; // for shuffle data by partition and tablet - int64_t _txn_id = -1; - vectorized::VExprContextSPtrs _tablet_sink_expr_ctxs; - std::unique_ptr _vpartition = nullptr; - std::unique_ptr _tablet_finder = nullptr; - std::shared_ptr _schema = nullptr; - std::unique_ptr _block_convertor = nullptr; - TupleDescriptor* _tablet_sink_tuple_desc = nullptr; - RowDescriptor* _tablet_sink_row_desc = nullptr; - OlapTableLocationParam* _location = nullptr; - vectorized::VRowDistribution _row_distribution; + RuntimeProfile::Counter* _add_partition_request_timer = nullptr; - std::vector _row_part_tablet_ids; - int64_t _number_input_rows = 0; TPartitionType::type _part_type; - // for external table sink hash partition - std::unique_ptr _partition_function = nullptr; std::atomic _reach_limit = false; int _last_local_channel_idx = -1; @@ -196,7 +187,8 @@ class ExchangeSinkOperatorX final : public DataSinkOperatorX& destinations); + const std::vector& destinations, + const std::vector& fragment_instance_ids); Status init(const TDataSink& tsink) override; RuntimeState* state() { return _state; } @@ -208,22 +200,27 @@ class ExchangeSinkOperatorX final : public DataSinkOperatorX get_sink_buffer(InstanceLoId sender_ins_id); + vectorized::VExprContextSPtrs& tablet_sink_expr_ctxs() { return _tablet_sink_expr_ctxs; } + private: friend class ExchangeSinkLocalState; template void _handle_eof_channel(RuntimeState* state, ChannelPtrType channel, Status st); - Status channel_add_rows(RuntimeState* state, - std::vector>& channels, - size_t num_channels, const uint32_t* __restrict channel_ids, - size_t rows, vectorized::Block* block, bool eos); - - Status 
channel_add_rows_with_idx(RuntimeState* state, - std::vector>& channels, - size_t num_channels, - std::vector>& channel2rows, - vectorized::Block* block, bool eos); + // Use ExchangeSinkOperatorX to create a sink buffer. + // The sink buffer can be shared among multiple ExchangeSinkLocalState instances, + // or each ExchangeSinkLocalState can have its own sink buffer. + std::shared_ptr _create_buffer( + const std::vector& sender_ins_ids); + std::shared_ptr _sink_buffer = nullptr; RuntimeState* _state = nullptr; const std::vector _texprs; @@ -263,7 +260,9 @@ class ExchangeSinkOperatorX final : public DataSinkOperatorX& _fragment_instance_ids; }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/exchange_source_operator.h b/be/src/pipeline/exec/exchange_source_operator.h index f938f5007d1643..ff9c5840033777 100644 --- a/be/src/pipeline/exec/exchange_source_operator.h +++ b/be/src/pipeline/exec/exchange_source_operator.h @@ -22,6 +22,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class ExecNode; } // namespace doris @@ -109,4 +110,5 @@ class ExchangeSourceOperatorX final : public OperatorX { std::vector _nulls_first; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/file_scan_operator.h b/be/src/pipeline/exec/file_scan_operator.h index 2777a013d62f61..87c5bcd2e54de5 100644 --- a/be/src/pipeline/exec/file_scan_operator.h +++ b/be/src/pipeline/exec/file_scan_operator.h @@ -29,6 +29,7 @@ #include "vec/exec/scan/split_source_connector.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class VFileScanner; } // namespace vectorized @@ -86,4 +87,5 @@ class FileScanOperatorX final : public ScanOperatorX { const std::string _table_name; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/group_commit_block_sink_operator.h 
b/be/src/pipeline/exec/group_commit_block_sink_operator.h index e469aee8df595c..5eabb280c4315d 100644 --- a/be/src/pipeline/exec/group_commit_block_sink_operator.h +++ b/be/src/pipeline/exec/group_commit_block_sink_operator.h @@ -22,8 +22,9 @@ #include "runtime/group_commit_mgr.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" class OlapTableBlockConvertor; -} +} // namespace doris::vectorized namespace doris::pipeline { @@ -125,4 +126,5 @@ class GroupCommitBlockSinkOperatorX final TGroupCommitMode::type _group_commit_mode; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/group_commit_scan_operator.h b/be/src/pipeline/exec/group_commit_scan_operator.h index 46f50f3772440a..d1428899ede6b9 100644 --- a/be/src/pipeline/exec/group_commit_scan_operator.h +++ b/be/src/pipeline/exec/group_commit_scan_operator.h @@ -27,6 +27,7 @@ #include "runtime/group_commit_mgr.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class GroupCommitOperatorX; class GroupCommitLocalState final : public ScanLocalState { @@ -60,4 +61,5 @@ class GroupCommitOperatorX final : public ScanOperatorX { const int64_t _table_id; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/hashjoin_build_sink.cpp b/be/src/pipeline/exec/hashjoin_build_sink.cpp index 37de9ac93d839f..19e8493e596a7e 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.cpp +++ b/be/src/pipeline/exec/hashjoin_build_sink.cpp @@ -91,9 +91,8 @@ Status HashJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo RETURN_IF_ERROR(_hash_table_init(state)); _runtime_filters.resize(p._runtime_filter_descs.size()); for (size_t i = 0; i < p._runtime_filter_descs.size(); i++) { - RETURN_IF_ERROR(state->register_producer_runtime_filter( - p._runtime_filter_descs[i], p._need_local_merge, &_runtime_filters[i], - _build_expr_ctxs.size() == 1)); + 
RETURN_IF_ERROR(state->register_producer_runtime_filter(p._runtime_filter_descs[i], + &_runtime_filters[i])); } _runtime_filter_slots = @@ -115,49 +114,68 @@ Status HashJoinBuildSinkLocalState::close(RuntimeState* state, Status exec_statu } auto p = _parent->cast(); Defer defer {[&]() { - if (_should_build_hash_table) { - // The build side hash key column maybe no need output, but we need to keep the column in block - // because it is used to compare with probe side hash key column - if (p._should_keep_hash_key_column && _build_col_ids.size() == 1) { - p._should_keep_column_flags[_build_col_ids[0]] = true; - } + if (!_should_build_hash_table) { + return; + } + // The build side hash key column maybe no need output, but we need to keep the column in block + // because it is used to compare with probe side hash key column - if (_shared_state->build_block) { - // release the memory of unused column in probe stage - _shared_state->build_block->clear_column_mem_not_keep( - p._should_keep_column_flags, bool(p._shared_hashtable_controller)); - } + if (p._should_keep_hash_key_column && _build_col_ids.size() == 1) { + p._should_keep_column_flags[_build_col_ids[0]] = true; + } + + if (_shared_state->build_block) { + // release the memory of unused column in probe stage + _shared_state->build_block->clear_column_mem_not_keep( + p._should_keep_column_flags, bool(p._shared_hashtable_controller)); } - if (_should_build_hash_table && p._shared_hashtable_controller) { + if (p._shared_hashtable_controller) { p._shared_hashtable_controller->signal_finish(p.node_id()); } }}; - if (!_runtime_filter_slots || _runtime_filters.empty() || state->is_cancelled()) { + if (!_runtime_filter_slots || _runtime_filters.empty() || state->is_cancelled() || !_eos) { return Base::close(state, exec_status); } - if (state->get_task()->wake_up_by_downstream()) { - RETURN_IF_ERROR(_runtime_filter_slots->send_filter_size(state, 0, _finish_dependency)); - 
RETURN_IF_ERROR(_runtime_filter_slots->ignore_all_filters()); - } else { - auto* block = _shared_state->build_block.get(); - uint64_t hash_table_size = block ? block->rows() : 0; - { - SCOPED_TIMER(_runtime_filter_init_timer); - if (_should_build_hash_table) { + try { + if (state->get_task()->wake_up_early()) { + // partitial ignore rf to make global rf work or ignore useless rf + RETURN_IF_ERROR(_runtime_filter_slots->send_filter_size(state, 0, _finish_dependency)); + RETURN_IF_ERROR(_runtime_filter_slots->ignore_all_filters()); + } else if (_should_build_hash_table) { + auto* block = _shared_state->build_block.get(); + uint64_t hash_table_size = block ? block->rows() : 0; + { + SCOPED_TIMER(_runtime_filter_init_timer); RETURN_IF_ERROR(_runtime_filter_slots->init_filters(state, hash_table_size)); + RETURN_IF_ERROR(_runtime_filter_slots->ignore_filters(state)); + } + if (hash_table_size > 1) { + SCOPED_TIMER(_runtime_filter_compute_timer); + _runtime_filter_slots->insert(block); } - RETURN_IF_ERROR(_runtime_filter_slots->ignore_filters(state)); - } - if (_should_build_hash_table && hash_table_size > 1) { - SCOPED_TIMER(_runtime_filter_compute_timer); - _runtime_filter_slots->insert(block); } + + SCOPED_TIMER(_publish_runtime_filter_timer); + RETURN_IF_ERROR(_runtime_filter_slots->publish(state, !_should_build_hash_table)); + } catch (Exception& e) { + bool blocked_by_complete_build_stage = p._shared_hashtable_controller && + !p._shared_hash_table_context->complete_build_stage; + bool blocked_by_shared_hash_table_signal = !_should_build_hash_table && + p._shared_hashtable_controller && + !p._shared_hash_table_context->signaled; + + return Status::InternalError( + "rf process meet error: {}, wake_up_early: {}, should_build_hash_table: " + "{}, _finish_dependency: {}, blocked_by_complete_build_stage: {}, " + "blocked_by_shared_hash_table_signal: " + "{}", + e.to_string(), state->get_task()->wake_up_early(), _should_build_hash_table, + 
_finish_dependency->debug_string(), blocked_by_complete_build_stage, + blocked_by_shared_hash_table_signal); } - SCOPED_TIMER(_publish_runtime_filter_timer); - RETURN_IF_ERROR(_runtime_filter_slots->publish(!_should_build_hash_table)); return Base::close(state, exec_status); } @@ -304,7 +322,6 @@ Status HashJoinBuildSinkLocalState::process_build_block(RuntimeState* state, _build_blocks_memory_usage->value() + (int64_t)(arg.hash_table->get_byte_size() + arg.serialized_keys_size(true))); - COUNTER_SET(_peak_memory_usage_counter, _memory_used_counter->value()); return st; }}, _shared_state->hash_table_variants->method_variant, _shared_state->join_op_variants, @@ -353,8 +370,7 @@ Status HashJoinBuildSinkLocalState::_hash_table_init(RuntimeState* state) { HashJoinBuildSinkOperatorX::HashJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, - const DescriptorTbl& descs, - bool need_local_merge) + const DescriptorTbl& descs) : JoinBuildSinkOperatorX(pool, operator_id, tnode, descs), _join_distribution(tnode.hash_join_node.__isset.dist_type ? tnode.hash_join_node.dist_type : TJoinDistributionType::NONE), @@ -362,8 +378,7 @@ HashJoinBuildSinkOperatorX::HashJoinBuildSinkOperatorX(ObjectPool* pool, int ope tnode.hash_join_node.is_broadcast_join), _partition_exprs(tnode.__isset.distribute_expr_lists && !_is_broadcast_join ? 
tnode.distribute_expr_lists[1] - : std::vector {}), - _need_local_merge(need_local_merge) {} + : std::vector {}) {} Status HashJoinBuildSinkOperatorX::init(const TPlanNode& tnode, RuntimeState* state) { RETURN_IF_ERROR(JoinBuildSinkOperatorX::init(tnode, state)); @@ -453,7 +468,6 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); - local_state._eos = eos; if (local_state._should_build_hash_table) { // If eos or have already met a null value using short-circuit strategy, we do not need to pull // data from probe side. @@ -488,7 +502,6 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* std::move(*in_block))); int64_t blocks_mem_usage = local_state._build_side_mutable_block.allocated_bytes(); COUNTER_SET(local_state._memory_used_counter, blocks_mem_usage); - COUNTER_SET(local_state._peak_memory_usage_counter, blocks_mem_usage); COUNTER_SET(local_state._build_blocks_memory_usage, blocks_mem_usage); } } @@ -559,6 +572,7 @@ Status HashJoinBuildSinkOperatorX::sink(RuntimeState* state, vectorized::Block* } if (eos) { + local_state._eos = true; local_state.init_short_circuit_for_probe(); // Since the comparison of null values is meaningless, null aware left anti/semi join should not output null // when the build side is not empty. 
diff --git a/be/src/pipeline/exec/hashjoin_build_sink.h b/be/src/pipeline/exec/hashjoin_build_sink.h index 45aa1e8c8a262d..cc78e6a769f3c3 100644 --- a/be/src/pipeline/exec/hashjoin_build_sink.h +++ b/be/src/pipeline/exec/hashjoin_build_sink.h @@ -106,7 +106,7 @@ class HashJoinBuildSinkOperatorX final : public JoinBuildSinkOperatorX { public: HashJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, - const DescriptorTbl& descs, bool use_global_rf); + const DescriptorTbl& descs); Status init(const TDataSink& tsink) override { return Status::InternalError("{} should not init with TDataSink", JoinBuildSinkOperatorX::_name); @@ -163,8 +163,6 @@ class HashJoinBuildSinkOperatorX final vectorized::SharedHashTableContextPtr _shared_hash_table_context = nullptr; const std::vector _partition_exprs; - const bool _need_local_merge; - std::vector _hash_output_slot_ids; std::vector _should_keep_column_flags; bool _should_keep_hash_key_column = false; @@ -199,7 +197,10 @@ struct ProcessHashTableBuild { SCOPED_TIMER(_parent->_build_table_insert_timer); hash_table_ctx.hash_table->template prepare_build(_rows, _batch_size, *has_null_key); - + // In order to make the null keys equal when using single null eq, all null keys need to be set to default value. + if (_build_raw_ptrs.size() == 1 && null_map) { + _build_raw_ptrs[0]->assume_mutable()->replace_column_null_data(null_map->data()); + } hash_table_ctx.init_serialized_keys(_build_raw_ptrs, _rows, null_map ? 
null_map->data() : nullptr, true, true, hash_table_ctx.hash_table->get_bucket_size()); diff --git a/be/src/pipeline/exec/hashjoin_probe_operator.cpp b/be/src/pipeline/exec/hashjoin_probe_operator.cpp index 426bfcb219dc04..0db525f1bf5222 100644 --- a/be/src/pipeline/exec/hashjoin_probe_operator.cpp +++ b/be/src/pipeline/exec/hashjoin_probe_operator.cpp @@ -17,6 +17,8 @@ #include "hashjoin_probe_operator.h" +#include + #include #include "common/cast_set.h" @@ -240,7 +242,7 @@ Status HashJoinProbeOperatorX::pull(doris::RuntimeState* state, vectorized::Bloc // If we use a short-circuit strategy, should return block directly by add additional null data. auto block_rows = local_state._probe_block.rows(); if (local_state._probe_eos && block_rows == 0) { - *eos = local_state._probe_eos; + *eos = true; return Status::OK(); } @@ -483,8 +485,6 @@ Status HashJoinProbeOperatorX::push(RuntimeState* state, vectorized::Block* inpu input_block->swap(local_state._probe_block); COUNTER_SET(local_state._memory_used_counter, (int64_t)local_state._probe_block.allocated_bytes()); - COUNTER_SET(local_state._peak_memory_usage_counter, - local_state._memory_used_counter->value()); } } return Status::OK(); @@ -618,21 +618,34 @@ Status HashJoinProbeOperatorX::open(RuntimeState* state) { size_t idx = 0; for (const auto* slot : slots_to_check) { auto data_type = slot->get_data_type_ptr(); - auto target_data_type = idx < right_col_idx ? _left_table_data_types[idx] - : _right_table_data_types[idx - right_col_idx]; + const auto slot_on_left = idx < right_col_idx; + auto target_data_type = slot_on_left ? _left_table_data_types[idx] + : _right_table_data_types[idx - right_col_idx]; ++idx; if (data_type->equals(*target_data_type)) { continue; } - auto data_type_non_nullable = vectorized::remove_nullable(data_type); - if (data_type_non_nullable->equals(*target_data_type)) { + /// For outer join(left/right/full), the non-nullable columns may be converted to nullable. 
+ const auto accept_nullable_not_match = + _join_op == TJoinOp::FULL_OUTER_JOIN || + (slot_on_left ? _join_op == TJoinOp::RIGHT_OUTER_JOIN + : _join_op == TJoinOp::LEFT_OUTER_JOIN); + + if (accept_nullable_not_match) { + auto data_type_non_nullable = vectorized::remove_nullable(data_type); + if (data_type_non_nullable->equals(*target_data_type)) { + continue; + } + } else if (data_type->equals(*target_data_type)) { continue; } - return Status::InternalError("intermediate slot({}) data type not match: '{}' vs '{}'", - slot->id(), data_type->get_name(), - _left_table_data_types[idx]->get_name()); + return Status::InternalError( + "Join node(id={}, OP={}) intermediate slot({}, #{})'s on {} table data type not " + "match: '{}' vs '{}'", + _node_id, _join_op, slot->col_name(), slot->id(), (slot_on_left ? "left" : "right"), + data_type->get_name(), target_data_type->get_name()); } _build_side_child.reset(); diff --git a/be/src/pipeline/exec/hive_table_sink_operator.h b/be/src/pipeline/exec/hive_table_sink_operator.h index 58e705fd8e46c7..8af3e5bd5e9764 100644 --- a/be/src/pipeline/exec/hive_table_sink_operator.h +++ b/be/src/pipeline/exec/hive_table_sink_operator.h @@ -21,6 +21,7 @@ #include "vec/sink/writer/vhive_table_writer.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class HiveTableSinkOperatorX; @@ -83,4 +84,5 @@ class HiveTableSinkOperatorX final : public DataSinkOperatorX { TOdbcTableType::type _table_type; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/jdbc_table_sink_operator.h b/be/src/pipeline/exec/jdbc_table_sink_operator.h index 3ea702fd0baf0a..a0dae301a5fcad 100644 --- a/be/src/pipeline/exec/jdbc_table_sink_operator.h +++ b/be/src/pipeline/exec/jdbc_table_sink_operator.h @@ -23,6 +23,7 @@ #include "vec/sink/writer/vjdbc_table_writer.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class JdbcTableSinkOperatorX; class JdbcTableSinkLocalState final @@ 
-59,4 +60,5 @@ class JdbcTableSinkOperatorX final : public DataSinkOperatorX -struct Batch { - static constexpr uint32_t MAX_SIZE = 7; /// Adequate values are 3, 7, 15, 31. - - uint8_t size = 0; /// It's smaller than size_t but keeps align in Arena. - Batch* next = nullptr; - RowRefType row_refs[MAX_SIZE]; - - Batch(Batch* parent) : next(parent) {} - - bool full() const { return size == MAX_SIZE; } - - Batch* insert(RowRefType&& row_ref, vectorized::Arena& pool) { - if (full()) { - auto batch = pool.alloc>(); - *batch = Batch(this); - batch->insert(std::move(row_ref), pool); - return batch; - } - - row_refs[size++] = std::move(row_ref); - return this; - } -}; - -template -class ForwardIterator { -public: - using RowRefType = typename RowRefListType::RowRefType; - ForwardIterator() : root(nullptr), first(false), batch(nullptr), position(0) {} - - ForwardIterator(RowRefListType* begin) - : root(begin), first(true), batch((&root->next)), position(0) {} - - RowRefType& operator*() { - if (first) { - return *root; - } - return batch->operator[](position); - } - - RowRefType* operator->() { return &(**this); } - - void operator++() { - if (first) { - first = false; - return; - } - - if (batch && position < batch->size()) { - ++position; - } - } - - bool ok() const { return first || (batch && position < batch->size()); } - -private: - RowRefListType* root = nullptr; - bool first; - std::vector* batch = nullptr; - size_t position; -}; - -struct RowRefList : RowRef { - using RowRefType = RowRef; - - RowRefList() = default; - RowRefList(size_t row_num_) : RowRef(row_num_) {} - - ForwardIterator begin() { return {this}; } - - /// insert element after current one - void insert(RowRefType&& row_ref, vectorized::Arena& pool) { next.emplace_back(row_ref); } - - void clear() { next.clear(); } - -private: - friend class ForwardIterator; - std::vector next; -}; - -struct RowRefListWithFlag : RowRef { - using RowRefType = RowRef; - - RowRefListWithFlag() = default; - 
RowRefListWithFlag(size_t row_num_) : RowRef(row_num_) {} - - ForwardIterator begin() { return {this}; } - - /// insert element after current one - void insert(RowRefType&& row_ref, vectorized::Arena& pool) { next.emplace_back(row_ref); } - - void clear() { next.clear(); } - - bool visited = false; - -private: - friend class ForwardIterator; - std::vector next; -}; - -struct RowRefListWithFlags : RowRefWithFlag { - using RowRefType = RowRefWithFlag; - - RowRefListWithFlags() = default; - RowRefListWithFlags(size_t row_num_) : RowRefWithFlag(row_num_) {} - - ForwardIterator begin() { return {this}; } - - /// insert element after current one - void insert(RowRefType&& row_ref, vectorized::Arena& pool) { next.emplace_back(row_ref); } - - void clear() { next.clear(); } - -private: - friend class ForwardIterator; - std::vector next; -}; - -} // namespace doris diff --git a/be/src/pipeline/exec/join/process_hash_table_probe.h b/be/src/pipeline/exec/join/process_hash_table_probe.h index 14e0edd977f57b..91fd82f0644939 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe.h @@ -19,7 +19,6 @@ #include -#include "join_op.h" #include "vec/columns/column.h" #include "vec/columns/columns_number.h" #include "vec/common/arena.h" diff --git a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h index 05cd3d7d9e0590..24a9a7f67431da 100644 --- a/be/src/pipeline/exec/join/process_hash_table_probe_impl.h +++ b/be/src/pipeline/exec/join/process_hash_table_probe_impl.h @@ -173,6 +173,10 @@ typename HashTableType::State ProcessHashTableProbe::_init_probe_sid if (!_parent->_ready_probe) { _parent->_ready_probe = true; hash_table_ctx.reset(); + // In order to make the null keys equal when using single null eq, all null keys need to be set to default value. 
+ if (_parent->_probe_columns.size() == 1 && null_map) { + _parent->_probe_columns[0]->assume_mutable()->replace_column_null_data(null_map); + } hash_table_ctx.init_serialized_keys(_parent->_probe_columns, probe_rows, null_map, true, false, hash_table_ctx.hash_table->get_bucket_size()); hash_table_ctx.hash_table->pre_build_idxs(hash_table_ctx.bucket_nums, @@ -180,7 +184,6 @@ typename HashTableType::State ProcessHashTableProbe::_init_probe_sid int64_t arena_memory_usage = hash_table_ctx.serialized_keys_size(false); COUNTER_SET(_parent->_probe_arena_memory_usage, arena_memory_usage); COUNTER_UPDATE(_parent->_memory_used_counter, arena_memory_usage); - COUNTER_SET(_parent->_peak_memory_usage_counter, _parent->_memory_used_counter->value()); } return typename HashTableType::State(_parent->_probe_columns); diff --git a/be/src/pipeline/exec/join_build_sink_operator.h b/be/src/pipeline/exec/join_build_sink_operator.h index 9d79a97397ff77..2a24f6a0492f3b 100644 --- a/be/src/pipeline/exec/join_build_sink_operator.h +++ b/be/src/pipeline/exec/join_build_sink_operator.h @@ -20,6 +20,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" template class JoinBuildSinkOperatorX; @@ -78,4 +79,5 @@ class JoinBuildSinkOperatorX : public DataSinkOperatorX { const std::vector _runtime_filter_descs; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/join_probe_operator.h b/be/src/pipeline/exec/join_probe_operator.h index 078806cea4fc5a..161fd18fa1dab8 100644 --- a/be/src/pipeline/exec/join_probe_operator.h +++ b/be/src/pipeline/exec/join_probe_operator.h @@ -20,6 +20,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" template class JoinProbeOperatorX; template @@ -123,4 +124,5 @@ class JoinProbeOperatorX : public StatefulOperatorX { const bool _use_specific_projections; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff 
--git a/be/src/pipeline/exec/memory_scratch_sink_operator.h b/be/src/pipeline/exec/memory_scratch_sink_operator.h index c74659d15b96f2..352826955fca99 100644 --- a/be/src/pipeline/exec/memory_scratch_sink_operator.h +++ b/be/src/pipeline/exec/memory_scratch_sink_operator.h @@ -23,6 +23,7 @@ #include "runtime/result_queue_mgr.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class MemoryScratchSinkOperatorX; class MemoryScratchSinkLocalState final : public PipelineXSinkLocalState { @@ -67,4 +68,5 @@ class MemoryScratchSinkOperatorX final : public DataSinkOperatorX { TUserIdentity _user_identity; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/multi_cast_data_stream_sink.h b/be/src/pipeline/exec/multi_cast_data_stream_sink.h index 57b5974064b6a2..9d69b3fb5bdc9e 100644 --- a/be/src/pipeline/exec/multi_cast_data_stream_sink.h +++ b/be/src/pipeline/exec/multi_cast_data_stream_sink.h @@ -20,6 +20,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class MultiCastDataStreamSinkOperatorX; class MultiCastDataStreamSinkLocalState final @@ -75,4 +76,5 @@ class MultiCastDataStreamSinkOperatorX final std::atomic _num_dests; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/multi_cast_data_streamer.h b/be/src/pipeline/exec/multi_cast_data_streamer.h index 51a73cf0c2b053..380538d0ac0805 100644 --- a/be/src/pipeline/exec/multi_cast_data_streamer.h +++ b/be/src/pipeline/exec/multi_cast_data_streamer.h @@ -20,6 +20,7 @@ #include "vec/sink/vdata_stream_sender.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class Dependency; struct MultiCastBlock { @@ -84,4 +85,5 @@ class MultiCastDataStreamer { std::vector _dependencies; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline \ No newline at end of file diff --git 
a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp index 83b378e792c3fa..35b9de619f393d 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.cpp @@ -43,7 +43,7 @@ struct RuntimeFilterBuild { } { SCOPED_TIMER(_parent->publish_runtime_filter_timer()); - RETURN_IF_ERROR(runtime_filter_slots.publish()); + RETURN_IF_ERROR(runtime_filter_slots.publish(state)); } return Status::OK(); @@ -66,8 +66,8 @@ Status NestedLoopJoinBuildSinkLocalState::init(RuntimeState* state, LocalSinkSta _shared_state->join_op_variants = p._join_op_variants; _runtime_filters.resize(p._runtime_filter_descs.size()); for (size_t i = 0; i < p._runtime_filter_descs.size(); i++) { - RETURN_IF_ERROR(state->register_producer_runtime_filter( - p._runtime_filter_descs[i], p._need_local_merge, &_runtime_filters[i], false)); + RETURN_IF_ERROR(state->register_producer_runtime_filter(p._runtime_filter_descs[i], + &_runtime_filters[i])); } return Status::OK(); } @@ -87,11 +87,9 @@ Status NestedLoopJoinBuildSinkLocalState::open(RuntimeState* state) { NestedLoopJoinBuildSinkOperatorX::NestedLoopJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, - const DescriptorTbl& descs, - bool need_local_merge) + const DescriptorTbl& descs) : JoinBuildSinkOperatorX(pool, operator_id, tnode, descs), - _need_local_merge(need_local_merge), _is_output_left_side_only(tnode.nested_loop_join_node.__isset.is_output_left_side_only && tnode.nested_loop_join_node.is_output_left_side_only), _row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples) {} diff --git a/be/src/pipeline/exec/nested_loop_join_build_operator.h b/be/src/pipeline/exec/nested_loop_join_build_operator.h index d6e72799f97d92..11bcba2bd8fc3a 100644 --- a/be/src/pipeline/exec/nested_loop_join_build_operator.h +++ b/be/src/pipeline/exec/nested_loop_join_build_operator.h @@ -23,6 +23,7 @@ 
#include "pipeline/exec/join_build_sink_operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class NestedLoopJoinBuildSinkOperatorX; @@ -59,7 +60,7 @@ class NestedLoopJoinBuildSinkOperatorX final : public JoinBuildSinkOperatorX { public: NestedLoopJoinBuildSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, - const DescriptorTbl& descs, bool need_local_merge); + const DescriptorTbl& descs); Status init(const TDataSink& tsink) override { return Status::InternalError( "{} should not init with TDataSink", @@ -85,9 +86,9 @@ class NestedLoopJoinBuildSinkOperatorX final vectorized::VExprContextSPtrs _filter_src_expr_ctxs; - bool _need_local_merge; const bool _is_output_left_side_only; RowDescriptor _row_descriptor; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp index afa1a2e59b798c..f4f4ef21ece746 100644 --- a/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp +++ b/be/src/pipeline/exec/nested_loop_join_probe_operator.cpp @@ -516,23 +516,20 @@ Status NestedLoopJoinProbeOperatorX::pull(RuntimeState* state, vectorized::Block local_state._matched_rows_done : local_state._matched_rows_done); + size_t join_block_column_size = local_state._join_block.columns(); { - vectorized::Block tmp_block = local_state._join_block; - - // Here make _join_block release the columns' ptr - local_state._join_block.set_columns(local_state._join_block.clone_empty_columns()); - - local_state.add_tuple_is_null_column(&tmp_block); + local_state.add_tuple_is_null_column(&local_state._join_block); { SCOPED_TIMER(local_state._join_filter_timer); RETURN_IF_ERROR(vectorized::VExprContext::filter_block( - local_state._conjuncts, &tmp_block, tmp_block.columns())); + local_state._conjuncts, &local_state._join_block, + local_state._join_block.columns())); } - 
RETURN_IF_ERROR(local_state._build_output_block(&tmp_block, block, false)); + RETURN_IF_ERROR( + local_state._build_output_block(&local_state._join_block, block, false)); local_state._reset_tuple_is_null_column(); } - local_state._join_block.clear_column_data(); - + local_state._join_block.clear_column_data(join_block_column_size); if (!(*eos) and !local_state._need_more_input_data) { auto func = [&](auto&& join_op_variants, auto set_build_side_flag, auto set_probe_side_flag) { diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index 7b06e216b81bf7..34fa741ff1ec00 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -40,6 +40,7 @@ #include "vec/functions/in.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status OlapScanLocalState::_init_profile() { RETURN_IF_ERROR(ScanLocalState::_init_profile()); @@ -347,13 +348,13 @@ Status OlapScanLocalState::_init_scanners(std::list* s int ranges_per_scanner = std::max(1, (int)ranges->size() / std::min(scanners_per_tablet, size_based_scanners_per_tablet)); - int num_ranges = ranges->size(); - for (int i = 0; i < num_ranges;) { + int64_t num_ranges = ranges->size(); + for (int64_t i = 0; i < num_ranges;) { std::vector scanner_ranges; scanner_ranges.push_back((*ranges)[i].get()); ++i; - for (int j = 1; i < num_ranges && j < ranges_per_scanner && - (*ranges)[i]->end_include == (*ranges)[i - 1]->end_include; + for (int64_t j = 1; i < num_ranges && j < ranges_per_scanner && + (*ranges)[i]->end_include == (*ranges)[i - 1]->end_include; ++j, ++i) { scanner_ranges.push_back((*ranges)[i].get()); } @@ -587,4 +588,5 @@ OlapScanOperatorX::OlapScanOperatorX(ObjectPool* pool, const TPlanNode& tnode, i } } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/olap_scan_operator.h b/be/src/pipeline/exec/olap_scan_operator.h index 9e8624b3a0b255..91980d6a3f172b 
100644 --- a/be/src/pipeline/exec/olap_scan_operator.h +++ b/be/src/pipeline/exec/olap_scan_operator.h @@ -26,6 +26,7 @@ #include "pipeline/exec/scan_operator.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class NewOlapScanner; @@ -198,4 +199,5 @@ class OlapScanOperatorX final : public ScanOperatorX { TQueryCacheParam _cache_param; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/olap_table_sink_operator.h b/be/src/pipeline/exec/olap_table_sink_operator.h index 8a9ffaaf769c31..3453a57a67b9bc 100644 --- a/be/src/pipeline/exec/olap_table_sink_operator.h +++ b/be/src/pipeline/exec/olap_table_sink_operator.h @@ -21,6 +21,7 @@ #include "vec/sink/writer/vtablet_writer.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class OlapTableSinkOperatorX; @@ -75,4 +76,5 @@ class OlapTableSinkOperatorX final : public DataSinkOperatorX_peak_memory_usage_counter->set(local_state->_memory_used_counter->value()); return status; } @@ -441,11 +441,7 @@ PipelineXSinkLocalStateBase::PipelineXSinkLocalStateBase(DataSinkOperatorXBase* } PipelineXLocalStateBase::PipelineXLocalStateBase(RuntimeState* state, OperatorXBase* parent) - : _num_rows_returned(0), - _rows_returned_counter(nullptr), - _peak_memory_usage_counter(nullptr), - _parent(parent), - _state(state) { + : _num_rows_returned(0), _rows_returned_counter(nullptr), _parent(parent), _state(state) { _query_statistics = std::make_shared(); } @@ -484,9 +480,8 @@ Status PipelineXLocalState::init(RuntimeState* state, LocalState _open_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, "OpenTime", 1); _close_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, "CloseTime", 1); _exec_timer = ADD_TIMER_WITH_LEVEL(_runtime_profile, "ExecTime", 1); - _memory_used_counter = ADD_COUNTER_WITH_LEVEL(_runtime_profile, "MemoryUsage", TUnit::BYTES, 1); - _peak_memory_usage_counter = - _runtime_profile->AddHighWaterMarkCounter("MemoryUsagePeak", 
TUnit::BYTES, "", 1); + _memory_used_counter = + _runtime_profile->AddHighWaterMarkCounter("MemoryUsage", TUnit::BYTES, "", 1); return Status::OK(); } @@ -519,9 +514,6 @@ Status PipelineXLocalState::close(RuntimeState* state) { if constexpr (!std::is_same_v) { COUNTER_SET(_wait_for_dependency_timer, _dependency->watcher_elapse_time()); } - if (_peak_memory_usage_counter) { - _peak_memory_usage_counter->set(_memory_used_counter->value()); - } _closed = true; // Some kinds of source operators has a 1-1 relationship with a sink operator (such as AnalyticOperator). // We must ensure AnalyticSinkOperator will not be blocked if AnalyticSourceOperator already closed. @@ -560,9 +552,7 @@ Status PipelineXSinkLocalState::init(RuntimeState* state, LocalSink _close_timer = ADD_TIMER_WITH_LEVEL(_profile, "CloseTime", 1); _exec_timer = ADD_TIMER_WITH_LEVEL(_profile, "ExecTime", 1); info.parent_profile->add_child(_profile, true, nullptr); - _memory_used_counter = ADD_COUNTER_WITH_LEVEL(_profile, "MemoryUsage", TUnit::BYTES, 1); - _peak_memory_usage_counter = - _profile->AddHighWaterMarkCounter("MemoryUsagePeak", TUnit::BYTES, "", 1); + _memory_used_counter = _profile->AddHighWaterMarkCounter("MemoryUsage", TUnit::BYTES, "", 1); return Status::OK(); } @@ -574,9 +564,6 @@ Status PipelineXSinkLocalState::close(RuntimeState* state, Status e if constexpr (!std::is_same_v) { COUNTER_SET(_wait_for_dependency_timer, _dependency->watcher_elapse_time()); } - if (_peak_memory_usage_counter) { - _peak_memory_usage_counter->set(_memory_used_counter->value()); - } _closed = true; return Status::OK(); } @@ -794,4 +781,5 @@ template class AsyncWriterSink; template class AsyncWriterSink; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/operator.h b/be/src/pipeline/exec/operator.h index 6053b1a2f48e87..a2c8e110cedac3 100644 --- a/be/src/pipeline/exec/operator.h +++ b/be/src/pipeline/exec/operator.h @@ -39,6 +39,7 @@ #include 
"vec/runtime/vdata_stream_recvr.h" namespace doris { +#include "common/compile_check_begin.h" class RowDescriptor; class RuntimeState; class TDataSink; @@ -165,9 +166,6 @@ class PipelineXLocalStateBase { RuntimeProfile::Counter* exec_time_counter() { return _exec_timer; } RuntimeProfile::Counter* memory_used_counter() { return _memory_used_counter; } - RuntimeProfile::HighWaterMarkCounter* peak_memory_usage_counter() { - return _peak_memory_usage_counter; - } OperatorXBase* parent() { return _parent; } RuntimeState* state() { return _state; } vectorized::VExprContextSPtrs& conjuncts() { return _conjuncts; } @@ -202,11 +200,10 @@ class PipelineXLocalStateBase { RuntimeProfile::Counter* _rows_returned_counter = nullptr; RuntimeProfile::Counter* _blocks_returned_counter = nullptr; RuntimeProfile::Counter* _wait_for_dependency_timer = nullptr; - RuntimeProfile::Counter* _memory_used_counter = nullptr; + // Account for current memory and peak memory used by this node + RuntimeProfile::HighWaterMarkCounter* _memory_used_counter = nullptr; RuntimeProfile::Counter* _projection_timer = nullptr; RuntimeProfile::Counter* _exec_timer = nullptr; - // Account for peak memory used by this node - RuntimeProfile::HighWaterMarkCounter* _peak_memory_usage_counter = nullptr; RuntimeProfile::Counter* _init_timer = nullptr; RuntimeProfile::Counter* _open_timer = nullptr; RuntimeProfile::Counter* _close_timer = nullptr; @@ -348,9 +345,6 @@ class PipelineXSinkLocalStateBase { RuntimeProfile::Counter* rows_input_counter() { return _rows_input_counter; } RuntimeProfile::Counter* exec_time_counter() { return _exec_timer; } RuntimeProfile::Counter* memory_used_counter() { return _memory_used_counter; } - RuntimeProfile::HighWaterMarkCounter* peak_memory_usage_counter() { - return _peak_memory_usage_counter; - } virtual std::vector dependencies() const { return {nullptr}; } // override in exchange sink , AsyncWriterSink @@ -380,8 +374,7 @@ class PipelineXSinkLocalStateBase { 
RuntimeProfile::Counter* _wait_for_dependency_timer = nullptr; RuntimeProfile::Counter* _wait_for_finish_dependency_timer = nullptr; RuntimeProfile::Counter* _exec_timer = nullptr; - RuntimeProfile::Counter* _memory_used_counter = nullptr; - RuntimeProfile::HighWaterMarkCounter* _peak_memory_usage_counter = nullptr; + RuntimeProfile::HighWaterMarkCounter* _memory_used_counter = nullptr; std::shared_ptr _query_statistics = nullptr; }; @@ -867,4 +860,5 @@ class AsyncWriterSink : public PipelineXSinkLocalState { std::shared_ptr _finish_dependency; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.cpp b/be/src/pipeline/exec/partition_sort_sink_operator.cpp index 48b8fe9cb765a1..d0c28afe9de5ba 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/partition_sort_sink_operator.cpp @@ -24,6 +24,7 @@ #include "vec/common/hash_table/hash.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status PartitionSortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(PipelineXSinkLocalState::init(state, info)); @@ -66,7 +67,7 @@ PartitionSortSinkOperatorX::PartitionSortSinkOperatorX(ObjectPool* pool, int ope _pool(pool), _row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples), _limit(tnode.limit), - _partition_exprs_num(tnode.partition_sort_node.partition_exprs.size()), + _partition_exprs_num(cast_set(tnode.partition_sort_node.partition_exprs.size())), _topn_phase(tnode.partition_sort_node.ptopn_phase), _has_global_limit(tnode.partition_sort_node.has_global_limit), _top_n_algorithm(tnode.partition_sort_node.top_n_algorithm), @@ -212,7 +213,7 @@ Status PartitionSortSinkOperatorX::_emplace_into_hash_table( }; SCOPED_TIMER(local_state._emplace_key_timer); - int row = num_rows; + int64_t row = num_rows; for (row = row - 1; row >= 0 && !local_state._is_need_passthrough; --row) { auto& mapped = 
*agg_method.lazy_emplace(state, row, creator, creator_for_null_key); @@ -274,4 +275,5 @@ bool PartitionSortSinkLocalState::check_whether_need_passthrough() { } // NOLINTEND(readability-simplify-boolean-expr) +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partition_sort_sink_operator.h b/be/src/pipeline/exec/partition_sort_sink_operator.h index 6926445f18f2f4..32bbf38202713f 100644 --- a/be/src/pipeline/exec/partition_sort_sink_operator.h +++ b/be/src/pipeline/exec/partition_sort_sink_operator.h @@ -24,6 +24,7 @@ #include "vec/common/sort/partition_sorter.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class PartitionSortSinkOperatorX; class PartitionSortSinkLocalState : public PipelineXSinkLocalState { @@ -110,4 +111,5 @@ class PartitionSortSinkOperatorX final : public DataSinkOperatorXfragment_instance_id(), state->query_id(), state->fragment_id(), + state->fragment_instance_id(), state->query_id(), state->fragment_id(), state->query_options(), TQueryGlobals {}, state->exec_env(), state->get_query_ctx()); _runtime_state->set_task_execution_context(state->get_task_execution_context().lock()); _runtime_state->set_be_number(state->be_number()); @@ -319,4 +320,5 @@ Status PartitionedAggSinkLocalState::revoke_memory(RuntimeState* state) { std::move(spill_runnable)); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h index 15f6b22387a8e2..63cd95534dc9f5 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h +++ b/be/src/pipeline/exec/partitioned_aggregation_sink_operator.h @@ -23,6 +23,7 @@ #include "vec/spill/spill_stream_manager.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class PartitionedAggSinkOperatorX; class PartitionedAggSinkLocalState : public PipelineXSpillSinkLocalState { @@ 
-324,4 +325,5 @@ class PartitionedAggSinkOperatorX : public DataSinkOperatorXfragment_instance_id(), state->query_id(), state->fragment_id(), + state->fragment_instance_id(), state->query_id(), state->fragment_id(), state->query_options(), TQueryGlobals {}, state->exec_env(), state->get_query_ctx()); _runtime_state->set_task_execution_context(state->get_task_execution_context().lock()); _runtime_state->set_be_number(state->be_number()); @@ -295,4 +296,5 @@ Status PartitionedAggLocalState::initiate_merge_spill_partition_agg_data(Runtime std::make_shared(state, _shared_state->shared_from_this(), exception_catch_func)); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partitioned_aggregation_source_operator.h b/be/src/pipeline/exec/partitioned_aggregation_source_operator.h index 7e73241745e029..6fb0ecaba01e20 100644 --- a/be/src/pipeline/exec/partitioned_aggregation_source_operator.h +++ b/be/src/pipeline/exec/partitioned_aggregation_source_operator.h @@ -22,6 +22,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -99,4 +100,5 @@ class PartitionedAggSourceOperatorX : public OperatorX std::unique_ptr _agg_source_operator; }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp index 0e56acc1c574b2..3e7f95374f53d2 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.cpp @@ -23,6 +23,7 @@ #include "vec/spill/spill_stream_manager.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" PartitionedHashJoinProbeLocalState::PartitionedHashJoinProbeLocalState(RuntimeState* state, OperatorXBase* parent) @@ -606,7 +607,7 @@ Status 
PartitionedHashJoinProbeOperatorX::_setup_internal_operators( } local_state._runtime_state = RuntimeState::create_unique( - nullptr, state->fragment_instance_id(), state->query_id(), state->fragment_id(), + state->fragment_instance_id(), state->query_id(), state->fragment_id(), state->query_options(), TQueryGlobals {}, state->exec_env(), state->get_query_ctx()); local_state._runtime_state->set_task_execution_context( @@ -866,4 +867,5 @@ Status PartitionedHashJoinProbeOperatorX::get_block(RuntimeState* state, vectori return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h index f8fc0780b6fc3f..a19e88d7203e62 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h +++ b/be/src/pipeline/exec/partitioned_hash_join_probe_operator.h @@ -27,6 +27,7 @@ #include "pipeline/exec/spill_utils.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -213,4 +214,5 @@ class PartitionedHashJoinProbeOperatorX final }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp index 83a205e59c78fb..852dccae71ca3b 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.cpp @@ -23,6 +23,7 @@ #include "vec/spill/spill_stream_manager.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status PartitionedHashJoinSinkLocalState::init(doris::RuntimeState* state, doris::pipeline::LocalSinkStateInfo& info) { @@ -246,11 +247,11 @@ Status PartitionedHashJoinSinkLocalState::revoke_memory(RuntimeState* state) { return _revoke_unpartitioned_block(state); } - 
_spilling_streams_count = _shared_state->partitioned_build_blocks.size(); + _spilling_streams_count = cast_set(_shared_state->partitioned_build_blocks.size()); auto query_id = state->query_id(); - for (size_t i = 0; i != _shared_state->partitioned_build_blocks.size(); ++i) { + for (int i = 0; i != _shared_state->partitioned_build_blocks.size(); ++i) { vectorized::SpillStreamSPtr& spilling_stream = _shared_state->spilled_streams[i]; auto& mutable_block = _shared_state->partitioned_build_blocks[i]; @@ -393,9 +394,11 @@ void PartitionedHashJoinSinkLocalState::_spill_to_disk( } } -PartitionedHashJoinSinkOperatorX::PartitionedHashJoinSinkOperatorX( - ObjectPool* pool, int operator_id, const TPlanNode& tnode, const DescriptorTbl& descs, - bool use_global_rf, uint32_t partition_count) +PartitionedHashJoinSinkOperatorX::PartitionedHashJoinSinkOperatorX(ObjectPool* pool, + int operator_id, + const TPlanNode& tnode, + const DescriptorTbl& descs, + uint32_t partition_count) : JoinBuildSinkOperatorX(pool, operator_id, tnode, descs), _join_distribution(tnode.hash_join_node.__isset.dist_type ? 
tnode.hash_join_node.dist_type @@ -436,7 +439,7 @@ Status PartitionedHashJoinSinkOperatorX::_setup_internal_operator(RuntimeState* auto& local_state = get_local_state(state); local_state._shared_state->inner_runtime_state = RuntimeState::create_unique( - nullptr, state->fragment_instance_id(), state->query_id(), state->fragment_id(), + state->fragment_instance_id(), state->query_id(), state->fragment_id(), state->query_options(), TQueryGlobals {}, state->exec_env(), state->get_query_ctx()); local_state._shared_state->inner_runtime_state->set_task_execution_context( state->get_task_execution_context().lock()); @@ -553,4 +556,5 @@ Status PartitionedHashJoinSinkOperatorX::revoke_memory(RuntimeState* state) { return local_state.revoke_memory(state); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h index 8e89763b50a9d5..e16e52dcaf9453 100644 --- a/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h +++ b/be/src/pipeline/exec/partitioned_hash_join_sink_operator.h @@ -28,6 +28,7 @@ #include "vec/runtime/partitioner.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -82,8 +83,7 @@ class PartitionedHashJoinSinkOperatorX : public JoinBuildSinkOperatorX { public: PartitionedHashJoinSinkOperatorX(ObjectPool* pool, int operator_id, const TPlanNode& tnode, - const DescriptorTbl& descs, bool use_global_rf, - uint32_t partition_count); + const DescriptorTbl& descs, uint32_t partition_count); Status init(const TDataSink& tsink) override { return Status::InternalError("{} should not init with TDataSink", @@ -149,4 +149,5 @@ class PartitionedHashJoinSinkOperatorX }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/repeat_operator.cpp b/be/src/pipeline/exec/repeat_operator.cpp index 
5c94d43f0d1e05..48131e0d96e4c6 100644 --- a/be/src/pipeline/exec/repeat_operator.cpp +++ b/be/src/pipeline/exec/repeat_operator.cpp @@ -24,6 +24,7 @@ #include "vec/core/block.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -221,8 +222,7 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, vectorized::Block* outp _repeat_id_idx++; - int size = _repeat_id_list.size(); - if (_repeat_id_idx >= size) { + if (_repeat_id_idx >= _repeat_id_list.size()) { _intermediate_block->clear(); _child_block.clear_column_data(_child->row_desc().num_materialized_slots()); _repeat_id_idx = 0; @@ -251,4 +251,5 @@ Status RepeatOperatorX::pull(doris::RuntimeState* state, vectorized::Block* outp return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/repeat_operator.h b/be/src/pipeline/exec/repeat_operator.h index 31f88f37231aaa..2c2af32de0b0fb 100644 --- a/be/src/pipeline/exec/repeat_operator.h +++ b/be/src/pipeline/exec/repeat_operator.h @@ -23,6 +23,7 @@ #include "pipeline/exec/operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -92,4 +93,5 @@ class RepeatOperatorX final : public StatefulOperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/result_file_sink_operator.cpp b/be/src/pipeline/exec/result_file_sink_operator.cpp index c65b9dda89d0ec..f806d9533d9e4c 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.cpp +++ b/be/src/pipeline/exec/result_file_sink_operator.cpp @@ -28,6 +28,7 @@ #include "vec/sink/vdata_stream_sender.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" ResultFileSinkLocalState::ResultFileSinkLocalState(DataSinkOperatorXBase* parent, RuntimeState* state) @@ -143,4 +144,5 @@ Status ResultFileSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_ return 
local_state.sink(state, in_block, eos); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/result_file_sink_operator.h b/be/src/pipeline/exec/result_file_sink_operator.h index e9f2b8eeb9c670..c3c5e345f77e1a 100644 --- a/be/src/pipeline/exec/result_file_sink_operator.h +++ b/be/src/pipeline/exec/result_file_sink_operator.h @@ -21,6 +21,7 @@ #include "vec/sink/writer/vfile_result_writer.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" class BroadcastPBlockHolder; } // namespace doris::vectorized @@ -88,4 +89,5 @@ class ResultFileSinkOperatorX final : public DataSinkOperatorX _sender = nullptr; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/result_sink_operator.cpp b/be/src/pipeline/exec/result_sink_operator.cpp index a3f1133f00e78e..8aeecbbddc12dc 100644 --- a/be/src/pipeline/exec/result_sink_operator.cpp +++ b/be/src/pipeline/exec/result_sink_operator.cpp @@ -35,6 +35,7 @@ #include "vec/sink/vmysql_result_writer.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); @@ -46,14 +47,25 @@ Status ResultSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) _wait_for_dependency_timer = ADD_TIMER_WITH_LEVEL(_profile, timer_name, 1); auto fragment_instance_id = state->fragment_instance_id(); + auto& p = _parent->cast(); if (state->query_options().enable_parallel_result_sink) { _sender = _parent->cast()._sender; } else { - auto& p = _parent->cast(); RETURN_IF_ERROR(state->exec_env()->result_mgr()->create_sender( fragment_instance_id, p._result_sink_buffer_size_rows, &_sender, state)); } _sender->set_dependency(fragment_instance_id, _dependency->shared_from_this()); + + _output_vexpr_ctxs.resize(p._output_vexpr_ctxs.size()); + for (size_t i = 0; i < _output_vexpr_ctxs.size(); i++) { + 
RETURN_IF_ERROR(p._output_vexpr_ctxs[i]->clone(state, _output_vexpr_ctxs[i])); + } + if (p._sink_type == TResultSinkType::ARROW_FLIGHT_PROTOCAL) { + std::shared_ptr arrow_schema; + RETURN_IF_ERROR(get_arrow_schema_from_expr_ctxs(_output_vexpr_ctxs, &arrow_schema, + state->timezone())); + _sender->register_arrow_schema(arrow_schema); + } return Status::OK(); } @@ -62,10 +74,6 @@ Status ResultSinkLocalState::open(RuntimeState* state) { SCOPED_TIMER(_open_timer); RETURN_IF_ERROR(Base::open(state)); auto& p = _parent->cast(); - _output_vexpr_ctxs.resize(p._output_vexpr_ctxs.size()); - for (size_t i = 0; i < _output_vexpr_ctxs.size(); i++) { - RETURN_IF_ERROR(p._output_vexpr_ctxs[i]->clone(state, _output_vexpr_ctxs[i])); - } // create writer based on sink type switch (p._sink_type) { case TResultSinkType::MYSQL_PROTOCAL: { @@ -79,10 +87,6 @@ Status ResultSinkLocalState::open(RuntimeState* state) { break; } case TResultSinkType::ARROW_FLIGHT_PROTOCAL: { - std::shared_ptr arrow_schema; - RETURN_IF_ERROR(get_arrow_schema_from_expr_ctxs(_output_vexpr_ctxs, &arrow_schema, - state->timezone())); - _sender->register_arrow_schema(arrow_schema); _writer.reset(new (std::nothrow) vectorized::VArrowFlightResultWriter( _sender.get(), _output_vexpr_ctxs, _profile)); break; @@ -205,4 +209,5 @@ Status ResultSinkLocalState::close(RuntimeState* state, Status exec_status) { return final_status; } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/result_sink_operator.h b/be/src/pipeline/exec/result_sink_operator.h index 339c167825643b..479343ed6d5ea5 100644 --- a/be/src/pipeline/exec/result_sink_operator.h +++ b/be/src/pipeline/exec/result_sink_operator.h @@ -25,6 +25,7 @@ #include "runtime/result_writer.h" namespace doris { +#include "common/compile_check_begin.h" class BufferControlBlock; namespace pipeline { @@ -172,4 +173,5 @@ class ResultSinkOperatorX final : public DataSinkOperatorX }; } // namespace pipeline +#include 
"common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/scan_operator.cpp b/be/src/pipeline/exec/scan_operator.cpp index 21c3103fe5a708..ae4396b22c7eec 100644 --- a/be/src/pipeline/exec/scan_operator.cpp +++ b/be/src/pipeline/exec/scan_operator.cpp @@ -1037,7 +1037,6 @@ Status ScanLocalState::_init_profile() { _total_throughput_counter = profile()->add_rate_counter("TotalReadThroughput", _rows_read_counter); _num_scanners = ADD_COUNTER(_runtime_profile, "NumScanners", TUnit::UNIT); - _scanner_peak_memory_usage = _peak_memory_usage_counter; //_runtime_profile->AddHighWaterMarkCounter("PeakMemoryUsage", TUnit::BYTES); // 2. counters for scanners diff --git a/be/src/pipeline/exec/scan_operator.h b/be/src/pipeline/exec/scan_operator.h index 5d41c800383bd0..c6c9cdf405d5a4 100644 --- a/be/src/pipeline/exec/scan_operator.h +++ b/be/src/pipeline/exec/scan_operator.h @@ -35,8 +35,9 @@ #include "vec/utils/util.hpp" namespace doris::vectorized { +#include "common/compile_check_begin.h" class ScannerDelegate; -} +} // namespace doris::vectorized namespace doris::pipeline { @@ -108,7 +109,6 @@ class ScanLocalStateBase : public PipelineXLocalState<>, public RuntimeFilterCon // Max num of scanner thread RuntimeProfile::Counter* _max_scanner_thread_num = nullptr; RuntimeProfile::HighWaterMarkCounter* _peak_running_scanner = nullptr; - RuntimeProfile::HighWaterMarkCounter* _scanner_peak_memory_usage = nullptr; // time of get block from scanner RuntimeProfile::Counter* _scan_timer = nullptr; RuntimeProfile::Counter* _scan_cpu_timer = nullptr; @@ -437,4 +437,5 @@ class ScanOperatorX : public OperatorX { std::vector topn_filter_source_node_ids; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/schema_scan_operator.cpp b/be/src/pipeline/exec/schema_scan_operator.cpp index 006ecf8ad82e84..2e2f80f5e24838 100644 --- a/be/src/pipeline/exec/schema_scan_operator.cpp +++ 
b/be/src/pipeline/exec/schema_scan_operator.cpp @@ -26,6 +26,7 @@ #include "vec/data_types/data_type_factory.hpp" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -48,7 +49,7 @@ Status SchemaScanLocalState::init(RuntimeState* state, LocalStateInfo& info) { // new one scanner _schema_scanner = SchemaScanner::create(schema_table->schema_table_type()); - _schema_scanner->set_dependency(_data_dependency, _finish_dependency); + _schema_scanner->set_dependency(_data_dependency); if (nullptr == _schema_scanner) { return Status::InternalError("schema scanner get nullptr pointer."); } @@ -144,7 +145,7 @@ Status SchemaScanOperatorX::open(RuntimeState* state) { return Status::InternalError("Failed to get tuple descriptor."); } - _slot_num = _dest_tuple_desc->slots().size(); + _slot_num = cast_set(_dest_tuple_desc->slots().size()); // get src tuple desc const auto* schema_table = static_cast(_dest_tuple_desc->table_desc()); @@ -266,10 +267,8 @@ Status SchemaScanOperatorX::get_block(RuntimeState* state, vectorized::Block* bl } while (block->rows() == 0 && !*eos); local_state.reached_limit(block, eos); - if (*eos) { - local_state._finish_dependency->set_always_ready(); - } return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/schema_scan_operator.h b/be/src/pipeline/exec/schema_scan_operator.h index 03cf422fbc52e6..2d861002748163 100644 --- a/be/src/pipeline/exec/schema_scan_operator.h +++ b/be/src/pipeline/exec/schema_scan_operator.h @@ -24,6 +24,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -36,9 +37,6 @@ class SchemaScanLocalState final : public PipelineXLocalState<> { SchemaScanLocalState(RuntimeState* state, OperatorXBase* parent) : PipelineXLocalState<>(state, parent) { - _finish_dependency = - std::make_shared(parent->operator_id(), parent->node_id(), - 
parent->get_name() + "_FINISH_DEPENDENCY", true); _data_dependency = std::make_shared(parent->operator_id(), parent->node_id(), parent->get_name() + "_DEPENDENCY", true); } @@ -48,7 +46,6 @@ class SchemaScanLocalState final : public PipelineXLocalState<> { Status open(RuntimeState* state) override; - Dependency* finishdependency() override { return _finish_dependency.get(); } std::vector dependencies() const override { return {_data_dependency.get()}; } private: @@ -57,7 +54,6 @@ class SchemaScanLocalState final : public PipelineXLocalState<> { SchemaScannerParam _scanner_param; std::unique_ptr _schema_scanner; - std::shared_ptr _finish_dependency; std::shared_ptr _data_dependency; }; @@ -93,4 +89,5 @@ class SchemaScanOperatorX final : public OperatorX { std::unique_ptr _schema_scanner; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/pipeline/exec/select_operator.h b/be/src/pipeline/exec/select_operator.h index 5370cd9e293c34..584a6f74308903 100644 --- a/be/src/pipeline/exec/select_operator.h +++ b/be/src/pipeline/exec/select_operator.h @@ -22,6 +22,7 @@ #include "operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class SelectOperatorX; class SelectLocalState final : public PipelineXLocalState { @@ -55,4 +56,5 @@ class SelectOperatorX final : public StreamingOperatorX { [[nodiscard]] bool is_source() const override { return false; } }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/set_probe_sink_operator.cpp b/be/src/pipeline/exec/set_probe_sink_operator.cpp index 4c250d5603b499..db487b0f9e7252 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.cpp +++ b/be/src/pipeline/exec/set_probe_sink_operator.cpp @@ -25,6 +25,7 @@ #include "vec/common/hash_table/hash_table_set_probe.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace vectorized { @@ -69,7 +70,7 @@ Status 
SetProbeSinkOperatorX::sink(RuntimeState* state, vectorized SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); - auto probe_rows = in_block->rows(); + uint32_t probe_rows = cast_set(in_block->rows()); if (probe_rows > 0) { { SCOPED_TIMER(local_state._extract_probe_data_timer); @@ -220,8 +221,8 @@ void SetProbeSinkOperatorX::_refresh_hash_table( ? (valid_element_in_hash_tbl < arg.hash_table ->size()) // When intersect, shrink as long as the element decreases - : (valid_element_in_hash_tbl < - arg.hash_table->size() * + : ((double)valid_element_in_hash_tbl < + (double)arg.hash_table->size() * need_shrink_ratio); // When except, element decreases need to within the 'need_shrink_ratio' before shrinking if (is_need_shrink) { @@ -231,7 +232,7 @@ void SetProbeSinkOperatorX::_refresh_hash_table( local_state._shared_state->valid_element_in_hash_tbl); while (iter != iter_end) { auto& mapped = iter->get_second(); - auto it = mapped.begin(); + auto* it = &mapped; if constexpr (is_intersect) { if (it->visited) { @@ -249,7 +250,7 @@ void SetProbeSinkOperatorX::_refresh_hash_table( } else if (is_intersect) { while (iter != iter_end) { auto& mapped = iter->get_second(); - auto it = mapped.begin(); + auto* it = &mapped; it->visited = false; ++iter; } @@ -269,4 +270,5 @@ template class SetProbeSinkLocalState; template class SetProbeSinkOperatorX; template class SetProbeSinkOperatorX; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/set_probe_sink_operator.h b/be/src/pipeline/exec/set_probe_sink_operator.h index 368ea812cdfe01..6b764c1e509951 100644 --- a/be/src/pipeline/exec/set_probe_sink_operator.h +++ b/be/src/pipeline/exec/set_probe_sink_operator.h @@ -23,6 +23,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace vectorized { @@ -116,4 +117,5 @@ class SetProbeSinkOperatorX final : public 
DataSinkOperatorX::_get_data_in_hashtable( auto block_size = 0; auto add_result = [&local_state, &block_size, this](auto value) { - auto it = value.begin(); + auto* it = &value; if constexpr (is_intersect) { if (it->visited) { //intersected: have done probe, so visited values it's the result _add_result_columns(local_state, value, block_size); @@ -147,8 +147,8 @@ Status SetSourceOperatorX::_get_data_in_hashtable( *eos = iter == hash_table_ctx.hash_table->end(); if (*eos && hash_table_ctx.hash_table->has_null_key_data()) { - auto value = hash_table_ctx.hash_table->template get_null_key_data(); - if constexpr (std::is_same_v>) { + auto value = hash_table_ctx.hash_table->template get_null_key_data(); + if constexpr (std::is_same_v>) { add_result(value); } } @@ -168,15 +168,13 @@ Status SetSourceOperatorX::_get_data_in_hashtable( template void SetSourceOperatorX::_add_result_columns( - SetSourceLocalState& local_state, RowRefListWithFlags& value, - int& block_size) { + SetSourceLocalState& local_state, RowRefWithFlag& value, int& block_size) { auto& build_col_idx = local_state._shared_state->build_col_idx; auto& build_block = local_state._shared_state->build_block; - auto it = value.begin(); for (auto idx = build_col_idx.begin(); idx != build_col_idx.end(); ++idx) { auto& column = *build_block.get_by_position(idx->second).column; - local_state._mutable_cols[idx->first]->insert_from(column, it->row_num); + local_state._mutable_cols[idx->first]->insert_from(column, value.row_num); } block_size++; } diff --git a/be/src/pipeline/exec/set_source_operator.h b/be/src/pipeline/exec/set_source_operator.h index 976ffde3bf23ea..d881e9277fb7b6 100644 --- a/be/src/pipeline/exec/set_source_operator.h +++ b/be/src/pipeline/exec/set_source_operator.h @@ -83,8 +83,8 @@ class SetSourceOperatorX final : public OperatorX& local_state, - RowRefListWithFlags& value, int& block_size); + void _add_result_columns(SetSourceLocalState& local_state, RowRefWithFlag& value, + int& block_size); 
const size_t _child_quantity; }; #include "common/compile_check_end.h" diff --git a/be/src/pipeline/exec/sort_sink_operator.cpp b/be/src/pipeline/exec/sort_sink_operator.cpp index faec4961af93b7..6bec42ac62d192 100644 --- a/be/src/pipeline/exec/sort_sink_operator.cpp +++ b/be/src/pipeline/exec/sort_sink_operator.cpp @@ -25,6 +25,7 @@ #include "vec/common/sort/topn_sorter.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status SortSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); @@ -128,7 +129,6 @@ Status SortSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Block* in int64_t data_size = local_state._shared_state->sorter->data_size(); COUNTER_SET(local_state._sort_blocks_memory_usage, data_size); COUNTER_SET(local_state._memory_used_counter, data_size); - COUNTER_SET(local_state._peak_memory_usage_counter, data_size); RETURN_IF_CANCELLED(state); @@ -177,4 +177,5 @@ void SortSinkOperatorX::reset(RuntimeState* state) { auto& local_state = get_local_state(state); local_state._shared_state->sorter->reset(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/sort_sink_operator.h b/be/src/pipeline/exec/sort_sink_operator.h index 6bf87164e71026..766c6c0ffc9a59 100644 --- a/be/src/pipeline/exec/sort_sink_operator.h +++ b/be/src/pipeline/exec/sort_sink_operator.h @@ -23,6 +23,7 @@ #include "vec/core/field.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" class SortSinkOperatorX; @@ -109,4 +110,5 @@ class SortSinkOperatorX final : public DataSinkOperatorX { const bool _reuse_mem; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/sort_source_operator.cpp b/be/src/pipeline/exec/sort_source_operator.cpp index 7f801b79c0b12b..2fb09d7278fda8 100644 --- a/be/src/pipeline/exec/sort_source_operator.cpp +++ b/be/src/pipeline/exec/sort_source_operator.cpp @@ 
-22,6 +22,7 @@ #include "pipeline/exec/operator.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" SortLocalState::SortLocalState(RuntimeState* state, OperatorXBase* parent) : PipelineXLocalState(state, parent) {} @@ -79,4 +80,5 @@ Status SortSourceOperatorX::build_merger(RuntimeState* state, return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/sort_source_operator.h b/be/src/pipeline/exec/sort_source_operator.h index 20714eb44e5e60..a638b04b368eaa 100644 --- a/be/src/pipeline/exec/sort_source_operator.h +++ b/be/src/pipeline/exec/sort_source_operator.h @@ -23,6 +23,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -69,4 +70,5 @@ class SortSourceOperatorX final : public OperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/spill_sort_sink_operator.cpp b/be/src/pipeline/exec/spill_sort_sink_operator.cpp index 267bcc83aad92c..6071301c1d7bcc 100644 --- a/be/src/pipeline/exec/spill_sort_sink_operator.cpp +++ b/be/src/pipeline/exec/spill_sort_sink_operator.cpp @@ -80,7 +80,7 @@ Status SpillSortSinkLocalState::close(RuntimeState* state, Status execsink_statu Status SpillSortSinkLocalState::setup_in_memory_sort_op(RuntimeState* state) { _runtime_state = RuntimeState::create_unique( - nullptr, state->fragment_instance_id(), state->query_id(), state->fragment_id(), + state->fragment_instance_id(), state->query_id(), state->fragment_id(), state->query_options(), TQueryGlobals {}, state->exec_env(), state->get_query_ctx()); _runtime_state->set_task_execution_context(state->get_task_execution_context().lock()); _runtime_state->set_be_number(state->be_number()); @@ -160,7 +160,6 @@ Status SpillSortSinkOperatorX::sink(doris::RuntimeState* state, vectorized::Bloc int64_t data_size = 
local_state._shared_state->in_mem_shared_state->sorter->data_size(); COUNTER_SET(local_state._sort_blocks_memory_usage, data_size); COUNTER_SET(local_state._memory_used_counter, data_size); - COUNTER_SET(local_state._peak_memory_usage_counter, data_size); if (eos) { if (local_state._shared_state->is_spilled) { diff --git a/be/src/pipeline/exec/spill_sort_source_operator.cpp b/be/src/pipeline/exec/spill_sort_source_operator.cpp index e766cb27168de1..69ed816fa9142d 100644 --- a/be/src/pipeline/exec/spill_sort_source_operator.cpp +++ b/be/src/pipeline/exec/spill_sort_source_operator.cpp @@ -212,7 +212,7 @@ Status SpillSortLocalState::_create_intermediate_merger( } Status SpillSortLocalState::setup_in_memory_sort_op(RuntimeState* state) { _runtime_state = RuntimeState::create_unique( - nullptr, state->fragment_instance_id(), state->query_id(), state->fragment_id(), + state->fragment_instance_id(), state->query_id(), state->fragment_id(), state->query_options(), TQueryGlobals {}, state->exec_env(), state->get_query_ctx()); _runtime_state->set_task_execution_context(state->get_task_execution_context().lock()); _runtime_state->set_be_number(state->be_number()); diff --git a/be/src/pipeline/exec/spill_utils.h b/be/src/pipeline/exec/spill_utils.h index 925e7df44e607e..2ba6f22a60b10c 100644 --- a/be/src/pipeline/exec/spill_utils.h +++ b/be/src/pipeline/exec/spill_utils.h @@ -26,6 +26,7 @@ #include "vec/runtime/partitioner.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" using SpillPartitionerType = vectorized::Crc32HashPartitioner; class SpillRunnable : public Runnable { @@ -70,4 +71,5 @@ class SpillRunnable : public Runnable { std::function _func; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline \ No newline at end of file diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.cpp b/be/src/pipeline/exec/streaming_aggregation_operator.cpp index cf5071d62e4737..b6e5788a07c626 100644 --- 
a/be/src/pipeline/exec/streaming_aggregation_operator.cpp +++ b/be/src/pipeline/exec/streaming_aggregation_operator.cpp @@ -29,6 +29,7 @@ #include "vec/exprs/vslot_ref.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -228,7 +229,7 @@ Status StreamingAggLocalState::_merge_with_serialized_key_helper(vectorized::Blo } } - int rows = block->rows(); + size_t rows = block->rows(); if (_places.size() < rows) { _places.resize(rows); } @@ -270,7 +271,7 @@ Status StreamingAggLocalState::_merge_with_serialized_key_helper(vectorized::Blo for (int i = 0; i < _aggregate_evaluators.size(); ++i) { if (_aggregate_evaluators[i]->is_merge() || for_spill) { - int col_id = 0; + size_t col_id = 0; if constexpr (for_spill) { col_id = _probe_expr_ctxs.size() + i; } else { @@ -353,7 +354,6 @@ Status StreamingAggLocalState::_merge_without_key(vectorized::Block* block) { void StreamingAggLocalState::_update_memusage_without_key() { int64_t arena_memory_usage = _agg_arena_pool->size(); COUNTER_SET(_memory_used_counter, arena_memory_usage); - COUNTER_SET(_peak_memory_usage_counter, arena_memory_usage); COUNTER_SET(_serialize_key_arena_memory_usage, arena_memory_usage); } @@ -378,8 +378,6 @@ void StreamingAggLocalState::_update_memusage_with_serialized_key() { COUNTER_SET(_memory_used_counter, arena_memory_usage + hash_table_memory_usage); - COUNTER_SET(_peak_memory_usage_counter, - arena_memory_usage + hash_table_memory_usage); COUNTER_SET(_serialize_key_arena_memory_usage, arena_memory_usage); COUNTER_SET(_hash_table_memory_usage, hash_table_memory_usage); @@ -406,7 +404,7 @@ Status StreamingAggLocalState::_execute_with_serialized_key_helper(vectorized::B } } - int rows = block->rows(); + size_t rows = block->rows(); if (_places.size() < rows) { _places.resize(rows); } @@ -545,8 +543,8 @@ bool StreamingAggLocalState::_should_expand_preagg_hash_tables() { const int64_t aggregated_input_rows = input_rows - _cur_num_rows_returned; // TODO 
chenhao // const int64_t expected_input_rows = estimated_input_cardinality_ - num_rows_returned_; - double current_reduction = - static_cast(aggregated_input_rows) / ht_rows; + double current_reduction = static_cast(aggregated_input_rows) / + static_cast(ht_rows); // TODO: workaround for IMPALA-2490: subplan node rows_returned counter may be // inaccurate, which could lead to a divide by zero below. @@ -618,7 +616,7 @@ Status StreamingAggLocalState::_pre_agg_with_serialized_key(doris::vectorized::B } } - int rows = in_block->rows(); + size_t rows = in_block->rows(); _places.resize(rows); // Stop expanding hash tables if we're not reducing the input sufficiently. As our @@ -742,7 +740,7 @@ Status StreamingAggLocalState::_get_with_serialized_key_result(RuntimeState* sta auto columns_with_schema = vectorized::VectorizedUtils::create_columns_with_type_and_name(p._row_descriptor); - int key_size = _probe_expr_ctxs.size(); + size_t key_size = _probe_expr_ctxs.size(); vectorized::MutableColumns key_columns; for (int i = 0; i < key_size; ++i) { @@ -753,7 +751,7 @@ Status StreamingAggLocalState::_get_with_serialized_key_result(RuntimeState* sta } } vectorized::MutableColumns value_columns; - for (int i = key_size; i < columns_with_schema.size(); ++i) { + for (size_t i = key_size; i < columns_with_schema.size(); ++i) { if (!mem_reuse) { value_columns.emplace_back(columns_with_schema[i].type->create_column()); } else { @@ -855,7 +853,7 @@ Status StreamingAggLocalState::_get_results_without_key(RuntimeState* state, block->clear(); DCHECK(_agg_data->without_key != nullptr); - int agg_size = _aggregate_evaluators.size(); + const auto agg_size = _aggregate_evaluators.size(); vectorized::MutableColumns value_columns(agg_size); std::vector data_types(agg_size); @@ -891,8 +889,8 @@ Status StreamingAggLocalState::_get_results_with_serialized_key(RuntimeState* st bool* eos) { SCOPED_TIMER(_get_results_timer); auto& p = _parent->cast(); - int key_size = _probe_expr_ctxs.size(); - int 
agg_size = _aggregate_evaluators.size(); + const auto key_size = _probe_expr_ctxs.size(); + const auto agg_size = _aggregate_evaluators.size(); vectorized::MutableColumns value_columns(agg_size); vectorized::DataTypes value_data_types(agg_size); @@ -1016,7 +1014,7 @@ Status StreamingAggLocalState::_get_without_key_result(RuntimeState* state, auto& p = _parent->cast(); *block = vectorized::VectorizedUtils::create_empty_columnswithtypename(p._row_descriptor); - int agg_size = _aggregate_evaluators.size(); + const auto agg_size = _aggregate_evaluators.size(); vectorized::MutableColumns columns(agg_size); std::vector data_types(agg_size); @@ -1173,8 +1171,8 @@ Status StreamingAggOperatorX::open(RuntimeState* state) { DCHECK_EQ(_intermediate_tuple_desc->slots().size(), _output_tuple_desc->slots().size()); RETURN_IF_ERROR(vectorized::VExpr::prepare(_probe_expr_ctxs, state, _child->row_desc())); - int j = _probe_expr_ctxs.size(); - for (int i = 0; i < j; ++i) { + size_t j = _probe_expr_ctxs.size(); + for (size_t i = 0; i < j; ++i) { auto nullable_output = _output_tuple_desc->slots()[i]->is_nullable(); auto nullable_input = _probe_expr_ctxs[i]->root()->is_nullable(); if (nullable_output != nullable_input) { @@ -1182,7 +1180,7 @@ Status StreamingAggOperatorX::open(RuntimeState* state) { _make_nullable_keys.emplace_back(i); } } - for (int i = 0; i < _aggregate_evaluators.size(); ++i, ++j) { + for (size_t i = 0; i < _aggregate_evaluators.size(); ++i, ++j) { SlotDescriptor* intermediate_slot_desc = _intermediate_tuple_desc->slots()[j]; SlotDescriptor* output_slot_desc = _output_tuple_desc->slots()[j]; RETURN_IF_ERROR(_aggregate_evaluators[i]->prepare( @@ -1293,4 +1291,5 @@ bool StreamingAggOperatorX::need_more_input_data(RuntimeState* state) const { return local_state._pre_aggregated_block->empty() && !local_state._child_eos; } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/streaming_aggregation_operator.h 
b/be/src/pipeline/exec/streaming_aggregation_operator.h index b695880ac2857b..bd35cd940f2974 100644 --- a/be/src/pipeline/exec/streaming_aggregation_operator.h +++ b/be/src/pipeline/exec/streaming_aggregation_operator.h @@ -27,6 +27,7 @@ #include "vec/core/block.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -237,4 +238,5 @@ class StreamingAggOperatorX final : public StatefulOperatorXget_value( columns[p._child_slots.size() + p._fn_num - 1], - state->batch_size() - columns[p._child_slots.size()]->size()); + //// It has already been checked that + // columns[p._child_slots.size()]->size() < state->batch_size(), + // so columns[p._child_slots.size()]->size() will not exceed the range of int. + state->batch_size() - (int)columns[p._child_slots.size()]->size()); _current_row_insert_times += repeat_times; for (int i = 0; i < p._fn_num - 1; i++) { _fns[i]->get_same_many_values(columns[i + p._child_slots.size()], repeat_times); @@ -276,7 +280,7 @@ Status TableFunctionOperatorX::init(const TPlanNode& tnode, RuntimeState* state) fn->set_expr_context(ctx); _fns.push_back(fn); } - _fn_num = _fns.size(); + _fn_num = cast_set(_fns.size()); // Prepare output slot ids RETURN_IF_ERROR(_prepare_output_slot_ids(tnode)); @@ -304,7 +308,7 @@ Status TableFunctionOperatorX::open(doris::RuntimeState* state) { } } - for (size_t i = 0; i < _child_slots.size(); i++) { + for (int i = 0; i < _child_slots.size(); i++) { if (_slot_need_copy(i)) { _output_slot_indexs.push_back(i); } else { @@ -315,4 +319,5 @@ Status TableFunctionOperatorX::open(doris::RuntimeState* state) { return vectorized::VExpr::open(_vfn_ctxs, state); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/table_function_operator.h b/be/src/pipeline/exec/table_function_operator.h index 81160acb7f7611..9aa26e9ae22b10 100644 --- a/be/src/pipeline/exec/table_function_operator.h +++ 
b/be/src/pipeline/exec/table_function_operator.h @@ -24,6 +24,7 @@ #include "vec/exprs/table_function/table_function.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; } // namespace doris @@ -154,4 +155,5 @@ class TableFunctionOperatorX final : public StatefulOperatorX _child_slot_sizes; }; +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/union_sink_operator.cpp b/be/src/pipeline/exec/union_sink_operator.cpp index 8467eeb1d5467a..56491b5258bc55 100644 --- a/be/src/pipeline/exec/union_sink_operator.cpp +++ b/be/src/pipeline/exec/union_sink_operator.cpp @@ -19,6 +19,7 @@ #include +#include "common/cast_set.h" #include "common/compiler_util.h" // IWYU pragma: keep #include "common/status.h" #include "pipeline/exec/data_queue.h" @@ -27,6 +28,7 @@ #include "util/runtime_profile.h" namespace doris::pipeline { +#include "common/compile_check_begin.h" Status UnionSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo& info) { RETURN_IF_ERROR(Base::init(state, info)); @@ -54,7 +56,8 @@ Status UnionSinkLocalState::open(RuntimeState* state) { UnionSinkOperatorX::UnionSinkOperatorX(int child_id, int sink_id, ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs) : Base(sink_id, tnode.node_id, tnode.node_id), - _first_materialized_child_idx(tnode.union_node.first_materialized_child_idx), + _first_materialized_child_idx( + cast_set(tnode.union_node.first_materialized_child_idx)), _row_descriptor(descs, tnode.row_tuples, tnode.nullable_tuples), _cur_child_id(child_id), _child_size(tnode.num_children) {} @@ -130,4 +133,5 @@ Status UnionSinkOperatorX::sink(RuntimeState* state, vectorized::Block* in_block return Status::OK(); } +#include "common/compile_check_end.h" } // namespace doris::pipeline diff --git a/be/src/pipeline/exec/union_sink_operator.h b/be/src/pipeline/exec/union_sink_operator.h index aa94ed9a73038f..3a8880622cb108 100644 --- 
a/be/src/pipeline/exec/union_sink_operator.h +++ b/be/src/pipeline/exec/union_sink_operator.h @@ -26,6 +26,7 @@ #include "vec/core/block.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -152,4 +153,5 @@ class UnionSinkOperatorX final : public DataSinkOperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/exec/union_source_operator.cpp b/be/src/pipeline/exec/union_source_operator.cpp index ecaaf22922b657..d13658488e2c9b 100644 --- a/be/src/pipeline/exec/union_source_operator.cpp +++ b/be/src/pipeline/exec/union_source_operator.cpp @@ -30,6 +30,7 @@ #include "vec/core/block.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace pipeline { @@ -148,7 +149,7 @@ Status UnionSourceOperatorX::get_next_const(RuntimeState* state, vectorized::Blo vectorized::Block tmp_block; tmp_block.insert({vectorized::ColumnUInt8::create(1), std::make_shared(), ""}); - int const_expr_lists_size = _const_expr_lists[_const_expr_list_idx].size(); + int const_expr_lists_size = cast_set(_const_expr_lists[_const_expr_list_idx].size()); if (_const_expr_list_idx && const_expr_lists_size != _const_expr_lists[0].size()) { return Status::InternalError( "[UnionNode]const expr at {}'s count({}) not matched({} expected)", @@ -183,4 +184,5 @@ Status UnionSourceOperatorX::get_next_const(RuntimeState* state, vectorized::Blo } } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/pipeline/exec/union_source_operator.h b/be/src/pipeline/exec/union_source_operator.h index 200e7de8597b91..0ee66c3da7447b 100644 --- a/be/src/pipeline/exec/union_source_operator.h +++ b/be/src/pipeline/exec/union_source_operator.h @@ -24,6 +24,7 @@ #include "operator.h" namespace doris { +#include "common/compile_check_begin.h" class RuntimeState; namespace vectorized { @@ -123,4 +124,5 @@ 
class UnionSourceOperatorX final : public OperatorX { }; } // namespace pipeline +#include "common/compile_check_end.h" } // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp index a939d25654b4cc..b22ee9fd77e72f 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.cpp @@ -62,6 +62,7 @@ Status LocalExchangeSinkOperatorX::init(ExchangeType type, const int num_buckets _num_partitions)); RETURN_IF_ERROR(_partitioner->init(_texprs)); } else if (_type == ExchangeType::BUCKET_HASH_SHUFFLE) { + DCHECK_GT(num_buckets, 0); _partitioner.reset( new vectorized::Crc32HashPartitioner(num_buckets)); RETURN_IF_ERROR(_partitioner->init(_texprs)); @@ -90,6 +91,9 @@ Status LocalExchangeSinkLocalState::init(RuntimeState* state, LocalSinkStateInfo "UseGlobalShuffle", std::to_string(_parent->cast()._use_global_shuffle)); } + _profile->add_info_string( + "PartitionExprsSize", + std::to_string(_parent->cast()._partitioned_exprs_num)); _channel_id = info.task_idx; return Status::OK(); } @@ -140,7 +144,10 @@ Status LocalExchangeSinkOperatorX::sink(RuntimeState* state, vectorized::Block* auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); COUNTER_UPDATE(local_state.rows_input_counter(), (int64_t)in_block->rows()); - RETURN_IF_ERROR(local_state._exchanger->sink(state, in_block, eos, local_state)); + RETURN_IF_ERROR(local_state._exchanger->sink( + state, in_block, eos, + {local_state._compute_hash_value_timer, local_state._distribute_timer, nullptr}, + {&local_state._channel_id, local_state._partitioner.get(), &local_state})); // If all exchange sources ended due to limit reached, current task should also finish if (local_state._exchanger->_running_source_operators == 0) { diff --git 
a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h index 4c4a400c2bde3b..c067f023c8d420 100644 --- a/be/src/pipeline/local_exchange/local_exchange_sink_operator.h +++ b/be/src/pipeline/local_exchange/local_exchange_sink_operator.h @@ -65,7 +65,6 @@ class LocalExchangeSinkLocalState final : public PipelineXSinkLocalState _partitioner = nullptr; - std::vector _partition_rows_histogram; // Used by random passthrough exchanger int _channel_id = 0; @@ -91,6 +90,7 @@ class LocalExchangeSinkOperatorX final : public DataSinkOperatorX& _texprs; + const size_t _partitioned_exprs_num; std::unique_ptr _partitioner; const std::map _bucket_seq_to_instance_idx; std::vector> _shuffle_idx_to_instance_idx; diff --git a/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp b/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp index c4832b9958c00d..63e36cdfdb0c01 100644 --- a/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp +++ b/be/src/pipeline/local_exchange/local_exchange_source_operator.cpp @@ -61,10 +61,10 @@ Status LocalExchangeSourceLocalState::close(RuntimeState* state) { } if (_exchanger) { - _exchanger->close(*this); + _exchanger->close({_channel_id, this}); } if (_shared_state) { - _shared_state->sub_running_source_operators(*this); + _shared_state->sub_running_source_operators(); } std::vector {}.swap(_local_merge_deps); @@ -116,7 +116,9 @@ Status LocalExchangeSourceOperatorX::get_block(RuntimeState* state, vectorized:: bool* eos) { auto& local_state = get_local_state(state); SCOPED_TIMER(local_state.exec_time_counter()); - RETURN_IF_ERROR(local_state._exchanger->get_block(state, block, eos, local_state)); + RETURN_IF_ERROR(local_state._exchanger->get_block( + state, block, eos, {nullptr, nullptr, local_state._copy_data_timer}, + {local_state._channel_id, &local_state})); local_state.reached_limit(block, eos); return Status::OK(); } diff --git 
a/be/src/pipeline/local_exchange/local_exchanger.cpp b/be/src/pipeline/local_exchange/local_exchanger.cpp index c5f99ca5d6a4a5..a963de8b684310 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.cpp +++ b/be/src/pipeline/local_exchange/local_exchanger.cpp @@ -29,8 +29,12 @@ namespace doris::pipeline { #include "common/compile_check_begin.h" template void Exchanger::_enqueue_data_and_set_ready(int channel_id, - LocalExchangeSinkLocalState& local_state, + LocalExchangeSinkLocalState* local_state, BlockType&& block) { + if (local_state == nullptr) { + _enqueue_data_and_set_ready(channel_id, std::move(block)); + return; + } size_t allocated_bytes = 0; // PartitionedBlock is used by shuffle exchanger. // PartitionedBlock will be push into multiple queues with different row ranges, so it will be @@ -44,47 +48,47 @@ void Exchanger::_enqueue_data_and_set_ready(int channel_id, allocated_bytes = block->data_block.allocated_bytes(); } std::unique_lock l(_m); - local_state._shared_state->add_mem_usage(channel_id, allocated_bytes, - !std::is_same_v && - !std::is_same_v); + local_state->_shared_state->add_mem_usage(channel_id, allocated_bytes, + !std::is_same_v && + !std::is_same_v); if (_data_queue[channel_id].enqueue(std::move(block))) { - local_state._shared_state->set_ready_to_read(channel_id); + local_state->_shared_state->set_ready_to_read(channel_id); } else { - local_state._shared_state->sub_mem_usage(channel_id, allocated_bytes); + local_state->_shared_state->sub_mem_usage(channel_id, allocated_bytes); // `enqueue(block)` return false iff this queue's source operator is already closed so we // just unref the block. 
if constexpr (std::is_same_v || std::is_same_v) { - block.first->unref(local_state._shared_state, allocated_bytes, channel_id); + block.first->unref(local_state->_shared_state, allocated_bytes, channel_id); } else { - block->unref(local_state._shared_state, allocated_bytes, channel_id); + block->unref(local_state->_shared_state, allocated_bytes, channel_id); DCHECK_EQ(block->ref_value(), 0); } } } template -bool Exchanger::_dequeue_data(LocalExchangeSourceLocalState& local_state, - BlockType& block, bool* eos, - vectorized::Block* data_block) { - return _dequeue_data(local_state, block, eos, data_block, local_state._channel_id); -} - -template -bool Exchanger::_dequeue_data(LocalExchangeSourceLocalState& local_state, +bool Exchanger::_dequeue_data(LocalExchangeSourceLocalState* local_state, BlockType& block, bool* eos, vectorized::Block* data_block, int channel_id) { + if (local_state == nullptr) { + if (!_dequeue_data(block, eos, data_block, channel_id)) { + throw Exception(ErrorCode::INTERNAL_ERROR, "Exchanger has no data: {}", + data_queue_debug_string(channel_id)); + } + return true; + } bool all_finished = _running_sink_operators == 0; if (_data_queue[channel_id].try_dequeue(block)) { if constexpr (std::is_same_v || std::is_same_v) { - local_state._shared_state->sub_mem_usage(channel_id, - block.first->data_block.allocated_bytes()); + local_state->_shared_state->sub_mem_usage(channel_id, + block.first->data_block.allocated_bytes()); } else { - local_state._shared_state->sub_mem_usage(channel_id, - block->data_block.allocated_bytes()); + local_state->_shared_state->sub_mem_usage(channel_id, + block->data_block.allocated_bytes()); data_block->swap(block->data_block); - block->unref(local_state._shared_state, data_block->allocated_bytes(), channel_id); + block->unref(local_state->_shared_state, data_block->allocated_bytes(), channel_id); DCHECK_EQ(block->ref_value(), 0); } return true; @@ -95,54 +99,88 @@ bool 
Exchanger::_dequeue_data(LocalExchangeSourceLocalState& local_st if (_data_queue[channel_id].try_dequeue(block)) { if constexpr (std::is_same_v || std::is_same_v) { - local_state._shared_state->sub_mem_usage(channel_id, - block.first->data_block.allocated_bytes()); + local_state->_shared_state->sub_mem_usage( + channel_id, block.first->data_block.allocated_bytes()); } else { - local_state._shared_state->sub_mem_usage(channel_id, - block->data_block.allocated_bytes()); + local_state->_shared_state->sub_mem_usage(channel_id, + block->data_block.allocated_bytes()); data_block->swap(block->data_block); - block->unref(local_state._shared_state, data_block->allocated_bytes(), channel_id); + block->unref(local_state->_shared_state, data_block->allocated_bytes(), channel_id); DCHECK_EQ(block->ref_value(), 0); } return true; } - COUNTER_UPDATE(local_state._get_block_failed_counter, 1); - local_state._dependency->block(); + COUNTER_UPDATE(local_state->_get_block_failed_counter, 1); + local_state->_dependency->block(); + } + return false; +} + +template +void Exchanger::_enqueue_data_and_set_ready(int channel_id, BlockType&& block) { + if constexpr (!std::is_same_v && + !std::is_same_v) { + block->ref(1); + } + if (!_data_queue[channel_id].enqueue(std::move(block))) { + if constexpr (std::is_same_v || + std::is_same_v) { + block.first->unref(); + } else { + block->unref(); + DCHECK_EQ(block->ref_value(), 0); + } + } +} + +template +bool Exchanger::_dequeue_data(BlockType& block, bool* eos, vectorized::Block* data_block, + int channel_id) { + if (_data_queue[channel_id].try_dequeue(block)) { + if constexpr (!std::is_same_v && + !std::is_same_v) { + data_block->swap(block->data_block); + block->unref(); + DCHECK_EQ(block->ref_value(), 0); + } + return true; } return false; } Status ShuffleExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) { + Profile&& profile, SinkInfo&& sink_info) { if (in_block->empty()) { 
return Status::OK(); } { - SCOPED_TIMER(local_state._compute_hash_value_timer); - RETURN_IF_ERROR(local_state._partitioner->do_partitioning(state, in_block)); + SCOPED_TIMER(profile.compute_hash_value_timer); + RETURN_IF_ERROR(sink_info.partitioner->do_partitioning(state, in_block)); } { - SCOPED_TIMER(local_state._distribute_timer); - RETURN_IF_ERROR(_split_rows(state, - local_state._partitioner->get_channel_ids().get(), - in_block, local_state)); + SCOPED_TIMER(profile.distribute_timer); + RETURN_IF_ERROR(_split_rows(state, sink_info.partitioner->get_channel_ids().get(), + in_block, *sink_info.channel_id, sink_info.local_state)); } return Status::OK(); } -void ShuffleExchanger::close(LocalExchangeSourceLocalState& local_state) { +void ShuffleExchanger::close(SourceInfo&& source_info) { PartitionedBlock partitioned_block; bool eos; vectorized::Block block; - _data_queue[local_state._channel_id].set_eos(); - while (_dequeue_data(local_state, partitioned_block, &eos, &block)) { - partitioned_block.first->unref(local_state._shared_state, local_state._channel_id); + _data_queue[source_info.channel_id].set_eos(); + while (_dequeue_data(source_info.local_state, partitioned_block, &eos, &block, + source_info.channel_id)) { + partitioned_block.first->unref( + source_info.local_state ? 
source_info.local_state->_shared_state : nullptr, + source_info.channel_id); } } Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { + Profile&& profile, SourceInfo&& source_info) { PartitionedBlock partitioned_block; vectorized::MutableBlock mutable_block; @@ -153,14 +191,18 @@ Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block auto block_wrapper = partitioned_block.first; RETURN_IF_ERROR(mutable_block.add_rows(&block_wrapper->data_block, offset_start, offset_start + partitioned_block.second.length)); - block_wrapper->unref(local_state._shared_state, local_state._channel_id); + block_wrapper->unref( + source_info.local_state ? source_info.local_state->_shared_state : nullptr, + source_info.channel_id); } while (mutable_block.rows() < state->batch_size() && !*eos && - _dequeue_data(local_state, partitioned_block, eos, block)); + _dequeue_data(source_info.local_state, partitioned_block, eos, block, + source_info.channel_id)); return Status::OK(); }; - if (_dequeue_data(local_state, partitioned_block, eos, block)) { - SCOPED_TIMER(local_state._copy_data_timer); + if (_dequeue_data(source_info.local_state, partitioned_block, eos, block, + source_info.channel_id)) { + SCOPED_TIMER(profile.copy_data_timer); mutable_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block( block, partitioned_block.first->data_block); RETURN_IF_ERROR(get_data()); @@ -169,22 +211,25 @@ Status ShuffleExchanger::get_block(RuntimeState* state, vectorized::Block* block } Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, - vectorized::Block* block, - LocalExchangeSinkLocalState& local_state) { + vectorized::Block* block, int channel_id, + LocalExchangeSinkLocalState* local_state) { + if (local_state == nullptr) { + return _split_rows(state, channel_ids, block, channel_id); + } const auto rows = cast_set(block->rows()); 
auto row_idx = std::make_shared>(rows); + auto& partition_rows_histogram = _partition_rows_histogram[channel_id]; { - local_state._partition_rows_histogram.assign(_num_partitions + 1, 0); + partition_rows_histogram.assign(_num_partitions + 1, 0); for (int32_t i = 0; i < rows; ++i) { - local_state._partition_rows_histogram[channel_ids[i]]++; + partition_rows_histogram[channel_ids[i]]++; } for (int32_t i = 1; i <= _num_partitions; ++i) { - local_state._partition_rows_histogram[i] += - local_state._partition_rows_histogram[i - 1]; + partition_rows_histogram[i] += partition_rows_histogram[i - 1]; } for (int32_t i = rows - 1; i >= 0; --i) { - (*row_idx)[local_state._partition_rows_histogram[channel_ids[i]] - 1] = i; - local_state._partition_rows_histogram[channel_ids[i]]--; + (*row_idx)[partition_rows_histogram[channel_ids[i]] - 1] = i; + partition_rows_histogram[channel_ids[i]]--; } } @@ -200,10 +245,10 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest if (new_block_wrapper->data_block.empty()) { return Status::OK(); } - local_state._shared_state->add_total_mem_usage(new_block_wrapper->data_block.allocated_bytes(), - local_state._channel_id); + local_state->_shared_state->add_total_mem_usage(new_block_wrapper->data_block.allocated_bytes(), + channel_id); auto bucket_seq_to_instance_idx = - local_state._parent->cast()._bucket_seq_to_instance_idx; + local_state->_parent->cast()._bucket_seq_to_instance_idx; if (get_type() == ExchangeType::HASH_SHUFFLE) { /** * If type is `HASH_SHUFFLE`, data are hash-shuffled and distributed to all instances of @@ -211,45 +256,32 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest * For example, row 1 get a hash value 1 which means we should distribute to instance 1 on * BE 1 and row 2 get a hash value 2 which means we should distribute to instance 1 on BE 3. 
*/ - const auto& map = local_state._parent->cast() + const auto& map = local_state->_parent->cast() ._shuffle_idx_to_instance_idx; new_block_wrapper->ref(cast_set(map.size())); for (const auto& it : map) { DCHECK(it.second >= 0 && it.second < _num_partitions) << it.first << " : " << it.second << " " << _num_partitions; - uint32_t start = local_state._partition_rows_histogram[it.first]; - uint32_t size = local_state._partition_rows_histogram[it.first + 1] - start; + uint32_t start = partition_rows_histogram[it.first]; + uint32_t size = partition_rows_histogram[it.first + 1] - start; if (size > 0) { _enqueue_data_and_set_ready(it.second, local_state, {new_block_wrapper, {row_idx, start, size}}); } else { - new_block_wrapper->unref(local_state._shared_state, local_state._channel_id); - } - } - } else if (_num_senders != _num_sources) { - // In this branch, data just should be distributed equally into all instances. - new_block_wrapper->ref(_num_partitions); - for (size_t i = 0; i < _num_partitions; i++) { - uint32_t start = local_state._partition_rows_histogram[i]; - uint32_t size = local_state._partition_rows_histogram[i + 1] - start; - if (size > 0) { - _enqueue_data_and_set_ready(i % _num_sources, local_state, - {new_block_wrapper, {row_idx, start, size}}); - } else { - new_block_wrapper->unref(local_state._shared_state, local_state._channel_id); + new_block_wrapper->unref(local_state->_shared_state, channel_id); } } } else { DCHECK(!bucket_seq_to_instance_idx.empty()); new_block_wrapper->ref(_num_partitions); for (int i = 0; i < _num_partitions; i++) { - uint32_t start = local_state._partition_rows_histogram[i]; - uint32_t size = local_state._partition_rows_histogram[i + 1] - start; + uint32_t start = partition_rows_histogram[i]; + uint32_t size = partition_rows_histogram[i + 1] - start; if (size > 0) { _enqueue_data_and_set_ready(bucket_seq_to_instance_idx[i], local_state, {new_block_wrapper, {row_idx, start, size}}); } else { - 
new_block_wrapper->unref(local_state._shared_state, local_state._channel_id); + new_block_wrapper->unref(local_state->_shared_state, channel_id); } } } @@ -257,8 +289,53 @@ Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __rest return Status::OK(); } +Status ShuffleExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, + vectorized::Block* block, int channel_id) { + const auto rows = cast_set(block->rows()); + auto row_idx = std::make_shared>(rows); + auto& partition_rows_histogram = _partition_rows_histogram[channel_id]; + { + partition_rows_histogram.assign(_num_partitions + 1, 0); + for (int32_t i = 0; i < rows; ++i) { + partition_rows_histogram[channel_ids[i]]++; + } + for (int32_t i = 1; i <= _num_partitions; ++i) { + partition_rows_histogram[i] += partition_rows_histogram[i - 1]; + } + for (int32_t i = rows - 1; i >= 0; --i) { + (*row_idx)[partition_rows_histogram[channel_ids[i]] - 1] = i; + partition_rows_histogram[channel_ids[i]]--; + } + } + + vectorized::Block data_block; + std::shared_ptr new_block_wrapper; + if (_free_blocks.try_dequeue(data_block)) { + new_block_wrapper = BlockWrapper::create_shared(std::move(data_block)); + } else { + new_block_wrapper = BlockWrapper::create_shared(block->clone_empty()); + } + + new_block_wrapper->data_block.swap(*block); + if (new_block_wrapper->data_block.empty()) { + return Status::OK(); + } + new_block_wrapper->ref(cast_set(_num_partitions)); + for (int i = 0; i < _num_partitions; i++) { + uint32_t start = partition_rows_histogram[i]; + uint32_t size = partition_rows_histogram[i + 1] - start; + if (size > 0) { + _enqueue_data_and_set_ready(i, {new_block_wrapper, {row_idx, start, size}}); + } else { + new_block_wrapper->unref(); + } + } + + return Status::OK(); +} + Status PassthroughExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) { + Profile&& profile, SinkInfo&& sink_info) { if 
(in_block->empty()) { return Status::OK(); } @@ -269,41 +346,43 @@ Status PassthroughExchanger::sink(RuntimeState* state, vectorized::Block* in_blo } new_block.swap(*in_block); wrapper = BlockWrapper::create_shared(std::move(new_block)); - auto channel_id = (local_state._channel_id++) % _num_partitions; - _enqueue_data_and_set_ready(channel_id, local_state, std::move(wrapper)); + auto channel_id = ((*sink_info.channel_id)++) % _num_partitions; + _enqueue_data_and_set_ready(channel_id, sink_info.local_state, std::move(wrapper)); return Status::OK(); } -void PassthroughExchanger::close(LocalExchangeSourceLocalState& local_state) { +void PassthroughExchanger::close(SourceInfo&& source_info) { vectorized::Block next_block; BlockWrapperSPtr wrapper; bool eos; - _data_queue[local_state._channel_id].set_eos(); - while (_dequeue_data(local_state, wrapper, &eos, &next_block)) { + _data_queue[source_info.channel_id].set_eos(); + while (_dequeue_data(source_info.local_state, wrapper, &eos, &next_block, + source_info.channel_id)) { // do nothing } } -void PassToOneExchanger::close(LocalExchangeSourceLocalState& local_state) { +void PassToOneExchanger::close(SourceInfo&& source_info) { vectorized::Block next_block; BlockWrapperSPtr wrapper; bool eos; - _data_queue[local_state._channel_id].set_eos(); - while (_dequeue_data(local_state, wrapper, &eos, &next_block)) { + _data_queue[source_info.channel_id].set_eos(); + while (_dequeue_data(source_info.local_state, wrapper, &eos, &next_block, + source_info.channel_id)) { // do nothing } } Status PassthroughExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { + Profile&& profile, SourceInfo&& source_info) { BlockWrapperSPtr next_block; - _dequeue_data(local_state, next_block, eos, block); + _dequeue_data(source_info.local_state, next_block, eos, block, source_info.channel_id); return Status::OK(); } Status PassToOneExchanger::sink(RuntimeState* state, 
vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) { + Profile&& profile, SinkInfo&& sink_info) { if (in_block->empty()) { return Status::OK(); } @@ -314,70 +393,72 @@ Status PassToOneExchanger::sink(RuntimeState* state, vectorized::Block* in_block new_block.swap(*in_block); BlockWrapperSPtr wrapper = BlockWrapper::create_shared(std::move(new_block)); - _enqueue_data_and_set_ready(0, local_state, std::move(wrapper)); + _enqueue_data_and_set_ready(0, sink_info.local_state, std::move(wrapper)); return Status::OK(); } Status PassToOneExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { - if (local_state._channel_id != 0) { + Profile&& profile, SourceInfo&& source_info) { + if (source_info.channel_id != 0) { *eos = true; return Status::OK(); } BlockWrapperSPtr next_block; - _dequeue_data(local_state, next_block, eos, block); + _dequeue_data(source_info.local_state, next_block, eos, block, source_info.channel_id); return Status::OK(); } Status LocalMergeSortExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) { + Profile&& profile, SinkInfo&& sink_info) { if (!in_block->empty()) { vectorized::Block new_block; if (!_free_blocks.try_dequeue(new_block)) { new_block = {in_block->clone_empty()}; } - DCHECK_LE(local_state._channel_id, _data_queue.size()); + DCHECK_LE(*sink_info.channel_id, _data_queue.size()); new_block.swap(*in_block); - _enqueue_data_and_set_ready(local_state._channel_id, local_state, + _enqueue_data_and_set_ready(*sink_info.channel_id, sink_info.local_state, BlockWrapper::create_shared(std::move(new_block))); } - if (eos) { - local_state._shared_state->source_deps[local_state._channel_id]->set_always_ready(); + if (eos && sink_info.local_state) { + sink_info.local_state->_shared_state->source_deps[*sink_info.channel_id] + ->set_always_ready(); } return Status::OK(); } -void 
ExchangerBase::finalize(LocalExchangeSourceLocalState& local_state) { +void ExchangerBase::finalize() { DCHECK(_running_source_operators == 0); vectorized::Block block; while (_free_blocks.try_dequeue(block)) { // do nothing } } -void LocalMergeSortExchanger::finalize(LocalExchangeSourceLocalState& local_state) { + +void LocalMergeSortExchanger::finalize() { BlockWrapperSPtr next_block; vectorized::Block block; bool eos; int id = 0; for (auto& data_queue : _data_queue) { data_queue.set_eos(); - while (_dequeue_data(local_state, next_block, &eos, &block, id)) { + while (_dequeue_data(next_block, &eos, &block, id)) { block = vectorized::Block(); } id++; } - ExchangerBase::finalize(local_state); + ExchangerBase::finalize(); } Status LocalMergeSortExchanger::build_merger(RuntimeState* state, - LocalExchangeSourceLocalState& local_state) { - RETURN_IF_ERROR(_sort_source->build_merger(state, _merger, local_state.profile())); + LocalExchangeSourceLocalState* local_state) { + RETURN_IF_ERROR(_sort_source->build_merger(state, _merger, local_state->profile())); std::vector child_block_suppliers; for (int channel_id = 0; channel_id < _num_partitions; channel_id++) { - vectorized::BlockSupplier block_supplier = [&, id = channel_id](vectorized::Block* block, - bool* eos) { + vectorized::BlockSupplier block_supplier = [&, local_state, id = channel_id]( + vectorized::Block* block, bool* eos) { BlockWrapperSPtr next_block; _dequeue_data(local_state, next_block, eos, block, id); return Status::OK(); @@ -401,20 +482,21 @@ now sort(8) --> local merge(1) ---> datasink(1) [2] ----> */ Status LocalMergeSortExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { - if (local_state._channel_id != 0) { + Profile&& profile, SourceInfo&& source_info) { + if (source_info.channel_id != 0) { *eos = true; return Status::OK(); } if (!_merger) { - RETURN_IF_ERROR(build_merger(state, local_state)); + 
DCHECK(source_info.local_state); + RETURN_IF_ERROR(build_merger(state, source_info.local_state)); } RETURN_IF_ERROR(_merger->get_next(block, eos)); return Status::OK(); } Status BroadcastExchanger::sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) { + Profile&& profile, SinkInfo&& sink_info) { if (in_block->empty()) { return Status::OK(); } @@ -424,32 +506,40 @@ Status BroadcastExchanger::sink(RuntimeState* state, vectorized::Block* in_block } new_block.swap(*in_block); auto wrapper = BlockWrapper::create_shared(std::move(new_block)); - local_state._shared_state->add_total_mem_usage(wrapper->data_block.allocated_bytes(), - local_state._channel_id); + if (sink_info.local_state) { + sink_info.local_state->_shared_state->add_total_mem_usage( + wrapper->data_block.allocated_bytes(), *sink_info.channel_id); + } + wrapper->ref(_num_partitions); for (int i = 0; i < _num_partitions; i++) { - _enqueue_data_and_set_ready(i, local_state, {wrapper, {0, wrapper->data_block.rows()}}); + _enqueue_data_and_set_ready(i, sink_info.local_state, + {wrapper, {0, wrapper->data_block.rows()}}); } return Status::OK(); } -void BroadcastExchanger::close(LocalExchangeSourceLocalState& local_state) { +void BroadcastExchanger::close(SourceInfo&& source_info) { BroadcastBlock partitioned_block; bool eos; vectorized::Block block; - _data_queue[local_state._channel_id].set_eos(); - while (_dequeue_data(local_state, partitioned_block, &eos, &block)) { - partitioned_block.first->unref(local_state._shared_state, local_state._channel_id); + _data_queue[source_info.channel_id].set_eos(); + while (_dequeue_data(source_info.local_state, partitioned_block, &eos, &block, + source_info.channel_id)) { + partitioned_block.first->unref( + source_info.local_state ? 
source_info.local_state->_shared_state : nullptr, + source_info.channel_id); } } Status BroadcastExchanger::get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) { + Profile&& profile, SourceInfo&& source_info) { BroadcastBlock partitioned_block; - if (_dequeue_data(local_state, partitioned_block, eos, block)) { - SCOPED_TIMER(local_state._copy_data_timer); + if (_dequeue_data(source_info.local_state, partitioned_block, eos, block, + source_info.channel_id)) { + SCOPED_TIMER(profile.copy_data_timer); vectorized::MutableBlock mutable_block = vectorized::VectorizedUtils::build_mutable_mem_reuse_block( block, partitioned_block.first->data_block); @@ -457,7 +547,9 @@ Status BroadcastExchanger::get_block(RuntimeState* state, vectorized::Block* blo RETURN_IF_ERROR(mutable_block.add_rows(&block_wrapper->data_block, partitioned_block.second.offset_start, partitioned_block.second.length)); - block_wrapper->unref(local_state._shared_state, local_state._channel_id); + block_wrapper->unref( + source_info.local_state ? 
source_info.local_state->_shared_state : nullptr, + source_info.channel_id); } return Status::OK(); @@ -465,21 +557,21 @@ Status BroadcastExchanger::get_block(RuntimeState* state, vectorized::Block* blo Status AdaptivePassthroughExchanger::_passthrough_sink(RuntimeState* state, vectorized::Block* in_block, - LocalExchangeSinkLocalState& local_state) { + SinkInfo&& sink_info) { vectorized::Block new_block; if (!_free_blocks.try_dequeue(new_block)) { new_block = {in_block->clone_empty()}; } new_block.swap(*in_block); - auto channel_id = (local_state._channel_id++) % _num_partitions; - _enqueue_data_and_set_ready(channel_id, local_state, + auto channel_id = ((*sink_info.channel_id)++) % _num_partitions; + _enqueue_data_and_set_ready(channel_id, sink_info.local_state, BlockWrapper::create_shared(std::move(new_block))); return Status::OK(); } Status AdaptivePassthroughExchanger::_shuffle_sink(RuntimeState* state, vectorized::Block* block, - LocalExchangeSinkLocalState& local_state) { + SinkInfo&& sink_info) { std::vector channel_ids; const auto num_rows = block->rows(); channel_ids.resize(num_rows, 0); @@ -494,40 +586,39 @@ Status AdaptivePassthroughExchanger::_shuffle_sink(RuntimeState* state, vectoriz std::iota(channel_ids.begin() + i, channel_ids.end(), 0); } } - return _split_rows(state, channel_ids.data(), block, local_state); + return _split_rows(state, channel_ids.data(), block, std::move(sink_info)); } Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, - vectorized::Block* block, - LocalExchangeSinkLocalState& local_state) { + vectorized::Block* block, SinkInfo&& sink_info) { const auto rows = cast_set(block->rows()); auto row_idx = std::make_shared>(rows); + auto& partition_rows_histogram = _partition_rows_histogram[*sink_info.channel_id]; { - local_state._partition_rows_histogram.assign(_num_partitions + 1, 0); + partition_rows_histogram.assign(_num_partitions + 1, 0); for (int32_t i = 0; i < rows; 
++i) { - local_state._partition_rows_histogram[channel_ids[i]]++; + partition_rows_histogram[channel_ids[i]]++; } for (int32_t i = 1; i <= _num_partitions; ++i) { - local_state._partition_rows_histogram[i] += - local_state._partition_rows_histogram[i - 1]; + partition_rows_histogram[i] += partition_rows_histogram[i - 1]; } for (int32_t i = rows - 1; i >= 0; --i) { - (*row_idx)[local_state._partition_rows_histogram[channel_ids[i]] - 1] = i; - local_state._partition_rows_histogram[channel_ids[i]]--; + (*row_idx)[partition_rows_histogram[channel_ids[i]] - 1] = i; + partition_rows_histogram[channel_ids[i]]--; } } for (int32_t i = 0; i < _num_partitions; i++) { - const size_t start = local_state._partition_rows_histogram[i]; - const size_t size = local_state._partition_rows_histogram[i + 1] - start; + const size_t start = partition_rows_histogram[i]; + const size_t size = partition_rows_histogram[i + 1] - start; if (size > 0) { std::unique_ptr mutable_block = vectorized::MutableBlock::create_unique(block->clone_empty()); RETURN_IF_ERROR(mutable_block->add_rows(block, start, size)); auto new_block = mutable_block->to_block(); - _enqueue_data_and_set_ready(i, local_state, + _enqueue_data_and_set_ready(i, sink_info.local_state, BlockWrapper::create_shared(std::move(new_block))); } } @@ -535,34 +626,35 @@ Status AdaptivePassthroughExchanger::_split_rows(RuntimeState* state, } Status AdaptivePassthroughExchanger::sink(RuntimeState* state, vectorized::Block* in_block, - bool eos, LocalExchangeSinkLocalState& local_state) { + bool eos, Profile&& profile, SinkInfo&& sink_info) { if (in_block->empty()) { return Status::OK(); } if (_is_pass_through) { - return _passthrough_sink(state, in_block, local_state); + return _passthrough_sink(state, in_block, std::move(sink_info)); } else { if (_total_block++ > _num_partitions) { _is_pass_through = true; } - return _shuffle_sink(state, in_block, local_state); + return _shuffle_sink(state, in_block, std::move(sink_info)); } } Status 
AdaptivePassthroughExchanger::get_block(RuntimeState* state, vectorized::Block* block, - bool* eos, - LocalExchangeSourceLocalState& local_state) { + bool* eos, Profile&& profile, + SourceInfo&& source_info) { BlockWrapperSPtr next_block; - _dequeue_data(local_state, next_block, eos, block); + _dequeue_data(source_info.local_state, next_block, eos, block, source_info.channel_id); return Status::OK(); } -void AdaptivePassthroughExchanger::close(LocalExchangeSourceLocalState& local_state) { +void AdaptivePassthroughExchanger::close(SourceInfo&& source_info) { vectorized::Block next_block; bool eos; BlockWrapperSPtr wrapper; - _data_queue[local_state._channel_id].set_eos(); - while (_dequeue_data(local_state, wrapper, &eos, &next_block)) { + _data_queue[source_info.channel_id].set_eos(); + while (_dequeue_data(source_info.local_state, wrapper, &eos, &next_block, + source_info.channel_id)) { // do nothing } } diff --git a/be/src/pipeline/local_exchange/local_exchanger.h b/be/src/pipeline/local_exchange/local_exchanger.h index bf052ac3b924ca..d6871b2ba97cc3 100644 --- a/be/src/pipeline/local_exchange/local_exchanger.h +++ b/be/src/pipeline/local_exchange/local_exchanger.h @@ -20,14 +20,33 @@ #include "pipeline/dependency.h" #include "pipeline/exec/operator.h" -namespace doris::pipeline { +namespace doris { #include "common/compile_check_begin.h" - +namespace vectorized { +class PartitionerBase; +} +namespace pipeline { class LocalExchangeSourceLocalState; class LocalExchangeSinkLocalState; struct BlockWrapper; class SortSourceOperatorX; +struct Profile { + RuntimeProfile::Counter* compute_hash_value_timer = nullptr; + RuntimeProfile::Counter* distribute_timer = nullptr; + RuntimeProfile::Counter* copy_data_timer = nullptr; +}; + +struct SinkInfo { + int* channel_id; + vectorized::PartitionerBase* partitioner; + LocalExchangeSinkLocalState* local_state; +}; + +struct SourceInfo { + int channel_id; + LocalExchangeSourceLocalState* local_state; +}; /** * One exchanger is 
hold by one `LocalExchangeSharedState`. And one `LocalExchangeSharedState` is * shared by all local exchange sink operators and source operators with the same id. @@ -53,22 +72,22 @@ class ExchangerBase { ExchangerBase(int running_sink_operators, int num_sources, int num_partitions, int free_block_limit) : _running_sink_operators(running_sink_operators), - _running_source_operators(num_partitions), + _running_source_operators(num_sources), _num_partitions(num_partitions), _num_senders(running_sink_operators), _num_sources(num_sources), _free_block_limit(free_block_limit) {} virtual ~ExchangerBase() = default; virtual Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) = 0; + Profile&& profile, SourceInfo&& source_info) = 0; virtual Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) = 0; + Profile&& profile, SinkInfo&& sink_info) = 0; virtual ExchangeType get_type() const = 0; // Called if a local exchanger source operator are closed. Free the unused data block in data_queue. - virtual void close(LocalExchangeSourceLocalState& local_state) = 0; + virtual void close(SourceInfo&& source_info) = 0; // Called if all local exchanger source operators are closed. We free the memory in // `_free_blocks` here. 
- virtual void finalize(LocalExchangeSourceLocalState& local_state); + virtual void finalize(); virtual std::string data_queue_debug_string(int i) = 0; @@ -110,7 +129,11 @@ struct BlockQueue { : eos(other.eos.load()), data_queue(std::move(other.data_queue)) {} inline bool enqueue(BlockType const& item) { if (!eos) { - data_queue.enqueue(item); + if (!data_queue.enqueue(item)) [[unlikely]] { + throw Exception(ErrorCode::INTERNAL_ERROR, + "Exception occurs in data queue [size = {}] of local exchange.", + data_queue.size_approx()); + } return true; } return false; @@ -118,7 +141,11 @@ struct BlockQueue { inline bool enqueue(BlockType&& item) { if (!eos) { - data_queue.enqueue(std::move(item)); + if (!data_queue.enqueue(std::move(item))) [[unlikely]] { + throw Exception(ErrorCode::INTERNAL_ERROR, + "Exception occurs in data queue [size = {}] of local exchange.", + data_queue.size_approx()); + } return true; } return false; @@ -147,12 +174,13 @@ class Exchanger : public ExchangerBase { protected: // Enqueue data block and set downstream source operator to read. 
- void _enqueue_data_and_set_ready(int channel_id, LocalExchangeSinkLocalState& local_state, + void _enqueue_data_and_set_ready(int channel_id, LocalExchangeSinkLocalState* local_state, BlockType&& block); - bool _dequeue_data(LocalExchangeSourceLocalState& local_state, BlockType& block, bool* eos, - vectorized::Block* data_block); - bool _dequeue_data(LocalExchangeSourceLocalState& local_state, BlockType& block, bool* eos, + bool _dequeue_data(LocalExchangeSourceLocalState* local_state, BlockType& block, bool* eos, vectorized::Block* data_block, int channel_id); + + void _enqueue_data_and_set_ready(int channel_id, BlockType&& block); + bool _dequeue_data(BlockType& block, bool* eos, vectorized::Block* data_block, int channel_id); std::vector> _data_queue; private: @@ -178,7 +206,7 @@ struct BlockWrapper { ~BlockWrapper() { DCHECK_EQ(ref_count.load(), 0); } void ref(int delta) { ref_count += delta; } void unref(LocalExchangeSharedState* shared_state, size_t allocated_bytes, int channel_id) { - if (ref_count.fetch_sub(1) == 1) { + if (ref_count.fetch_sub(1) == 1 && shared_state != nullptr) { DCHECK_GT(allocated_bytes, 0); shared_state->sub_total_mem_usage(allocated_bytes, channel_id); if (shared_state->exchanger->_free_block_limit == 0 || @@ -186,12 +214,14 @@ struct BlockWrapper { shared_state->exchanger->_free_block_limit * shared_state->exchanger->_num_sources) { data_block.clear_column_data(); + // Free blocks is used to improve memory efficiency. Failure during pushing back + // free block will not incur any bad result so just ignore the return value. 
shared_state->exchanger->_free_blocks.enqueue(std::move(data_block)); } } } - void unref(LocalExchangeSharedState* shared_state, int channel_id) { + void unref(LocalExchangeSharedState* shared_state = nullptr, int channel_id = 0) { unref(shared_state, data_block.allocated_bytes(), channel_id); } int ref_value() const { return ref_count.load(); } @@ -202,29 +232,31 @@ struct BlockWrapper { class ShuffleExchanger : public Exchanger { public: ENABLE_FACTORY_CREATOR(ShuffleExchanger); - ShuffleExchanger(int running_sink_operators, int num_partitions, int free_block_limit) - : Exchanger(running_sink_operators, num_partitions, + ShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, + int free_block_limit) + : Exchanger(running_sink_operators, num_sources, num_partitions, free_block_limit) { - _data_queue.resize(num_partitions); + DCHECK_GT(num_partitions, 0); + DCHECK_GT(num_sources, 0); + _data_queue.resize(num_sources); + _partition_rows_histogram.resize(running_sink_operators); } ~ShuffleExchanger() override = default; - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; - void close(LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; + void close(SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::HASH_SHUFFLE; } protected: - ShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, - int free_block_limit) - : Exchanger(running_sink_operators, num_sources, num_partitions, - free_block_limit) { - 
_data_queue.resize(num_partitions); - } Status _split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, - vectorized::Block* block, LocalExchangeSinkLocalState& local_state); + vectorized::Block* block, int channel_id, + LocalExchangeSinkLocalState* local_state); + Status _split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, + vectorized::Block* block, int channel_id); + std::vector> _partition_rows_histogram; }; class BucketShuffleExchanger final : public ShuffleExchanger { @@ -232,7 +264,9 @@ class BucketShuffleExchanger final : public ShuffleExchanger { BucketShuffleExchanger(int running_sink_operators, int num_sources, int num_partitions, int free_block_limit) : ShuffleExchanger(running_sink_operators, num_sources, num_partitions, - free_block_limit) {} + free_block_limit) { + DCHECK_GT(num_partitions, 0); + } ~BucketShuffleExchanger() override = default; ExchangeType get_type() const override { return ExchangeType::BUCKET_HASH_SHUFFLE; } }; @@ -246,13 +280,13 @@ class PassthroughExchanger final : public Exchanger { _data_queue.resize(num_partitions); } ~PassthroughExchanger() override = default; - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::PASSTHROUGH; } - void close(LocalExchangeSourceLocalState& local_state) override; + void close(SourceInfo&& source_info) override; }; class PassToOneExchanger final : public Exchanger { @@ -264,13 +298,13 @@ class PassToOneExchanger final : public Exchanger { 
_data_queue.resize(num_partitions); } ~PassToOneExchanger() override = default; - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::PASS_TO_ONE; } - void close(LocalExchangeSourceLocalState& local_state) override; + void close(SourceInfo&& source_info) override; }; class LocalMergeSortExchanger final : public Exchanger { @@ -283,17 +317,17 @@ class LocalMergeSortExchanger final : public Exchanger { _data_queue.resize(num_partitions); } ~LocalMergeSortExchanger() override = default; - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::LOCAL_MERGE_SORT; } - Status build_merger(RuntimeState* statem, LocalExchangeSourceLocalState& local_state); + Status build_merger(RuntimeState* statem, LocalExchangeSourceLocalState* local_state); - void close(LocalExchangeSourceLocalState& local_state) override {} - void finalize(LocalExchangeSourceLocalState& local_state) override; + void close(SourceInfo&& source_info) override {} + void 
finalize() override; private: std::unique_ptr _merger; @@ -309,13 +343,13 @@ class BroadcastExchanger final : public Exchanger { _data_queue.resize(num_partitions); } ~BroadcastExchanger() override = default; - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::BROADCAST; } - void close(LocalExchangeSourceLocalState& local_state) override; + void close(SourceInfo&& source_info) override; }; //The code in AdaptivePassthroughExchanger is essentially @@ -328,26 +362,28 @@ class AdaptivePassthroughExchanger : public Exchanger { : Exchanger(running_sink_operators, num_partitions, free_block_limit) { _data_queue.resize(num_partitions); + _partition_rows_histogram.resize(running_sink_operators); } - Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, - LocalExchangeSinkLocalState& local_state) override; + Status sink(RuntimeState* state, vectorized::Block* in_block, bool eos, Profile&& profile, + SinkInfo&& sink_info) override; - Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, - LocalExchangeSourceLocalState& local_state) override; + Status get_block(RuntimeState* state, vectorized::Block* block, bool* eos, Profile&& profile, + SourceInfo&& source_info) override; ExchangeType get_type() const override { return ExchangeType::ADAPTIVE_PASSTHROUGH; } - void close(LocalExchangeSourceLocalState& local_state) override; + void close(SourceInfo&& source_info) override; private: Status 
_passthrough_sink(RuntimeState* state, vectorized::Block* in_block, - LocalExchangeSinkLocalState& local_state); - Status _shuffle_sink(RuntimeState* state, vectorized::Block* in_block, - LocalExchangeSinkLocalState& local_state); + SinkInfo&& sink_info); + Status _shuffle_sink(RuntimeState* state, vectorized::Block* in_block, SinkInfo&& sink_info); Status _split_rows(RuntimeState* state, const uint32_t* __restrict channel_ids, - vectorized::Block* block, LocalExchangeSinkLocalState& local_state); + vectorized::Block* block, SinkInfo&& sink_info); std::atomic_bool _is_pass_through = false; std::atomic_int32_t _total_block = 0; + std::vector> _partition_rows_histogram; }; #include "common/compile_check_end.h" -} // namespace doris::pipeline +} // namespace pipeline +} // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/pipeline.cpp b/be/src/pipeline/pipeline.cpp index 96da754daa5d98..6c39d361e59c77 100644 --- a/be/src/pipeline/pipeline.cpp +++ b/be/src/pipeline/pipeline.cpp @@ -112,7 +112,12 @@ void Pipeline::make_all_runnable() { if (_sink->count_down_destination()) { for (auto* task : _tasks) { if (task) { - task->clear_blocking_state(true); + task->set_wake_up_early(); + } + } + for (auto* task : _tasks) { + if (task) { + task->clear_blocking_state(); } } } diff --git a/be/src/pipeline/pipeline.h b/be/src/pipeline/pipeline.h index b969186b178bf7..afbe6c77596432 100644 --- a/be/src/pipeline/pipeline.h +++ b/be/src/pipeline/pipeline.h @@ -73,6 +73,14 @@ class Pipeline : public std::enable_shared_from_this { return idx == ExchangeType::HASH_SHUFFLE || idx == ExchangeType::BUCKET_HASH_SHUFFLE; } + // For HASH_SHUFFLE, BUCKET_HASH_SHUFFLE, and ADAPTIVE_PASSTHROUGH, + // data is processed and shuffled on the sink. + // Compared to PASSTHROUGH, this is a relatively heavy operation. 
+ static bool heavy_operations_on_the_sink(ExchangeType idx) { + return idx == ExchangeType::HASH_SHUFFLE || idx == ExchangeType::BUCKET_HASH_SHUFFLE || + idx == ExchangeType::ADAPTIVE_PASSTHROUGH; + } + bool need_to_local_exchange(const DataDistribution target_data_distribution, const int idx) const; void init_data_distribution() { diff --git a/be/src/pipeline/pipeline_fragment_context.cpp b/be/src/pipeline/pipeline_fragment_context.cpp index d14a0d0c3cd4a7..5ae89db55a45ac 100644 --- a/be/src/pipeline/pipeline_fragment_context.cpp +++ b/be/src/pipeline/pipeline_fragment_context.cpp @@ -35,6 +35,7 @@ #include "cloud/config.h" #include "common/cast_set.h" #include "common/config.h" +#include "common/exception.h" #include "common/logging.h" #include "common/status.h" #include "io/fs/stream_load_pipe.h" @@ -260,7 +261,7 @@ Status PipelineFragmentContext::prepare(const doris::TPipelineFragmentParams& re _runtime_state = RuntimeState::create_unique( request.query_id, request.fragment_id, request.query_options, _query_ctx->query_globals, _exec_env, _query_ctx.get()); - + _runtime_state->set_task_execution_context(shared_from_this()); SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_runtime_state->query_mem_tracker()); if (request.__isset.backend_id) { _runtime_state->set_backend_id(request.backend_id); @@ -297,7 +298,13 @@ Status PipelineFragmentContext::prepare(const doris::TPipelineFragmentParams& re _query_ctx->init_runtime_predicates(local_params.topn_filter_descs); } - _need_local_merge = request.__isset.parallel_instances; + // init fragment_instance_ids + const auto target_size = request.local_params.size(); + _fragment_instance_ids.resize(target_size); + for (size_t i = 0; i < request.local_params.size(); i++) { + auto fragment_instance_id = request.local_params[i].fragment_instance_id; + _fragment_instance_ids[i] = fragment_instance_id; + } } { @@ -355,7 +362,6 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag _total_tasks = 0; const 
auto target_size = request.local_params.size(); _tasks.resize(target_size); - _fragment_instance_ids.resize(target_size); _runtime_filter_states.resize(target_size); _task_runtime_states.resize(_pipelines.size()); for (size_t pip_idx = 0; pip_idx < _pipelines.size(); pip_idx++) { @@ -367,31 +373,9 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag auto pre_and_submit = [&](int i, PipelineFragmentContext* ctx) { const auto& local_params = request.local_params[i]; auto fragment_instance_id = local_params.fragment_instance_id; - _fragment_instance_ids[i] = fragment_instance_id; - - auto filterparams = std::make_unique(); - - { - filterparams->runtime_filter_wait_infinitely = - _runtime_state->runtime_filter_wait_infinitely(); - filterparams->runtime_filter_wait_time_ms = - _runtime_state->runtime_filter_wait_time_ms(); - filterparams->execution_timeout = _runtime_state->execution_timeout(); - - filterparams->exec_env = ExecEnv::GetInstance(); - filterparams->query_id.set_hi(_runtime_state->query_id().hi); - filterparams->query_id.set_lo(_runtime_state->query_id().lo); - - filterparams->be_exec_version = _runtime_state->be_exec_version(); - filterparams->query_ctx = _query_ctx.get(); - } - - auto runtime_filter_mgr = std::make_unique( - request.query_id, filterparams.get(), _query_ctx->query_mem_tracker); - - filterparams->runtime_filter_mgr = runtime_filter_mgr.get(); - - _runtime_filter_states[i] = std::move(filterparams); + _runtime_filter_states[i] = RuntimeFilterParamsContext::create(_query_ctx.get()); + std::unique_ptr runtime_filter_mgr = std::make_unique( + request.query_id, _runtime_filter_states[i], _query_ctx->query_mem_tracker, false); std::map pipeline_id_to_task; auto get_local_exchange_state = [&](PipelinePtr pipeline) -> std::map, @@ -419,10 +403,11 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag << print_id(_task_runtime_states[pip_idx][i]->fragment_instance_id()) << " " << 
pipeline->debug_string(); _task_runtime_states[pip_idx][i] = RuntimeState::create_unique( - this, local_params.fragment_instance_id, request.query_id, - request.fragment_id, request.query_options, _query_ctx->query_globals, - _exec_env, _query_ctx.get()); + local_params.fragment_instance_id, request.query_id, request.fragment_id, + request.query_options, _query_ctx->query_globals, _exec_env, + _query_ctx.get()); auto& task_runtime_state = _task_runtime_states[pip_idx][i]; + _runtime_filter_states[i]->set_state(task_runtime_state.get()); { // Initialize runtime state for this task task_runtime_state->set_query_mem_tracker(_query_ctx->query_mem_tracker); @@ -454,9 +439,8 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag task_runtime_state->set_load_stream_per_node(request.load_stream_per_node); task_runtime_state->set_total_load_streams(request.total_load_streams); task_runtime_state->set_num_local_sink(request.num_local_sink); - DCHECK(_runtime_filter_states[i]->runtime_filter_mgr); - task_runtime_state->set_runtime_filter_mgr( - _runtime_filter_states[i]->runtime_filter_mgr); + + task_runtime_state->set_runtime_filter_mgr(runtime_filter_mgr.get()); } auto cur_task_id = _total_tasks++; task_runtime_state->set_task_id(cur_task_id); @@ -515,8 +499,8 @@ Status PipelineFragmentContext::_build_pipeline_tasks(const doris::TPipelineFrag if (pipeline_id_to_task.contains(_pipelines[pip_idx]->id())) { auto* task = pipeline_id_to_task[_pipelines[pip_idx]->id()]; DCHECK(pipeline_id_to_profile[pip_idx]); - RETURN_IF_ERROR(task->prepare(local_params, request.fragment.output_sink, - _query_ctx.get())); + RETURN_IF_ERROR_OR_CATCH_EXCEPTION(task->prepare( + local_params, request.fragment.output_sink, _query_ctx.get())); } } { @@ -769,7 +753,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( switch (data_distribution.distribution_type) { case ExchangeType::HASH_SHUFFLE: shared_state->exchanger = ShuffleExchanger::create_unique( - 
std::max(cur_pipe->num_tasks(), _num_instances), + std::max(cur_pipe->num_tasks(), _num_instances), _num_instances, use_global_hash_shuffle ? _total_instances : _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit ? cast_set( @@ -836,7 +820,7 @@ Status PipelineFragmentContext::_add_local_exchange_impl( } case ExchangeType::ADAPTIVE_PASSTHROUGH: shared_state->exchanger = AdaptivePassthroughExchanger::create_unique( - cur_pipe->num_tasks(), _num_instances, + std::max(cur_pipe->num_tasks(), _num_instances), _num_instances, _runtime_state->query_options().__isset.local_exchange_free_blocks_limit ? cast_set( _runtime_state->query_options().local_exchange_free_blocks_limit) @@ -937,9 +921,13 @@ Status PipelineFragmentContext::_add_local_exchange( << " cur_pipe->operators().size(): " << cur_pipe->operators().size() << " new_pip->operators().size(): " << new_pip->operators().size(); - // Add passthrough local exchanger if necessary + // There are some local shuffles with relatively heavy operations on the sink. + // If the local sink concurrency is 1 and the local source concurrency is n, the sink becomes a bottleneck. + // Therefore, local passthrough is used to increase the concurrency of the sink. + // op -> local sink(1) -> local source (n) + // op -> local passthrough(1) -> local passthrough(n) -> local sink(n) -> local source (n) if (cur_pipe->num_tasks() > 1 && new_pip->num_tasks() == 1 && - Pipeline::is_hash_exchange(data_distribution.distribution_type)) { + Pipeline::heavy_operations_on_the_sink(data_distribution.distribution_type)) { RETURN_IF_ERROR(_add_local_exchange_impl( cast_set(new_pip->operators().size()), pool, new_pip, add_pipeline(new_pip, pip_idx + 2), DataDistribution(ExchangeType::PASSTHROUGH), @@ -969,9 +957,9 @@ Status PipelineFragmentContext::_plan_local_exchange( // if 'num_buckets == 0' means the fragment is colocated by exchange node not the // scan node. 
so here use `_num_instance` to replace the `num_buckets` to prevent dividing 0 // still keep colocate plan after local shuffle - RETURN_IF_ERROR(_plan_local_exchange( - _use_serial_source || num_buckets == 0 ? _num_instances : num_buckets, pip_idx, - _pipelines[pip_idx], bucket_seq_to_instance_idx, shuffle_idx_to_instance_idx)); + RETURN_IF_ERROR(_plan_local_exchange(num_buckets, pip_idx, _pipelines[pip_idx], + bucket_seq_to_instance_idx, + shuffle_idx_to_instance_idx)); } return Status::OK(); } @@ -1025,7 +1013,8 @@ Status PipelineFragmentContext::_create_data_sink(ObjectPool* pool, const TDataS return Status::InternalError("Missing data stream sink."); } _sink.reset(new ExchangeSinkOperatorX(state, row_desc, next_sink_operator_id(), - thrift_sink.stream_sink, params.destinations)); + thrift_sink.stream_sink, params.destinations, + _fragment_instance_ids)); break; } case TDataSinkType::RESULT_SINK: { @@ -1152,10 +1141,10 @@ Status PipelineFragmentContext::_create_data_sink(ObjectPool* pool, const TDataS // 2. 
create and set sink operator of data stream sender for new pipeline DataSinkOperatorPtr sink_op; - sink_op.reset( - new ExchangeSinkOperatorX(state, *_row_desc, next_sink_operator_id(), - thrift_sink.multi_cast_stream_sink.sinks[i], - thrift_sink.multi_cast_stream_sink.destinations[i])); + sink_op.reset(new ExchangeSinkOperatorX( + state, *_row_desc, next_sink_operator_id(), + thrift_sink.multi_cast_stream_sink.sinks[i], + thrift_sink.multi_cast_stream_sink.destinations[i], _fragment_instance_ids)); RETURN_IF_ERROR(new_pipeline->set_sink(sink_op)); { @@ -1386,8 +1375,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo const uint32_t partition_count = 32; auto inner_probe_operator = std::make_shared(pool, tnode_, 0, descs); - auto inner_sink_operator = std::make_shared( - pool, 0, tnode_, descs, _need_local_merge); + auto inner_sink_operator = + std::make_shared(pool, 0, tnode_, descs); RETURN_IF_ERROR(inner_probe_operator->init(tnode_, _runtime_state.get())); RETURN_IF_ERROR(inner_sink_operator->init(tnode_, _runtime_state.get())); @@ -1407,8 +1396,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _dag[downstream_pipeline_id].push_back(build_side_pipe->id()); auto sink_operator = std::make_shared( - pool, next_sink_operator_id(), tnode_, descs, _need_local_merge, - partition_count); + pool, next_sink_operator_id(), tnode_, descs, partition_count); sink_operator->set_inner_operators(inner_sink_operator, inner_probe_operator); DataSinkOperatorPtr sink = std::move(sink_operator); sink->set_dests_id({op->operator_id()}); @@ -1432,8 +1420,7 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _dag[downstream_pipeline_id].push_back(build_side_pipe->id()); DataSinkOperatorPtr sink; - sink.reset(new HashJoinBuildSinkOperatorX(pool, next_sink_operator_id(), tnode, descs, - _need_local_merge)); + sink.reset(new HashJoinBuildSinkOperatorX(pool, next_sink_operator_id(), tnode, 
descs)); sink->set_dests_id({op->operator_id()}); RETURN_IF_ERROR(build_side_pipe->set_sink(sink)); RETURN_IF_ERROR(build_side_pipe->sink()->init(tnode, _runtime_state.get())); @@ -1460,8 +1447,8 @@ Status PipelineFragmentContext::_create_operator(ObjectPool* pool, const TPlanNo _dag[downstream_pipeline_id].push_back(build_side_pipe->id()); DataSinkOperatorPtr sink; - sink.reset(new NestedLoopJoinBuildSinkOperatorX(pool, next_sink_operator_id(), tnode, descs, - _need_local_merge)); + sink.reset( + new NestedLoopJoinBuildSinkOperatorX(pool, next_sink_operator_id(), tnode, descs)); sink->set_dests_id({op->operator_id()}); RETURN_IF_ERROR(build_side_pipe->set_sink(sink)); RETURN_IF_ERROR(build_side_pipe->sink()->init(tnode, _runtime_state.get())); diff --git a/be/src/pipeline/pipeline_fragment_context.h b/be/src/pipeline/pipeline_fragment_context.h index 289f5c8236522f..1674afa886d520 100644 --- a/be/src/pipeline/pipeline_fragment_context.h +++ b/be/src/pipeline/pipeline_fragment_context.h @@ -228,8 +228,6 @@ class PipelineFragmentContext : public TaskExecutionContext { // this is a [n * m] matrix. n is parallelism of pipeline engine and m is the number of pipelines. 
std::vector>> _tasks; - bool _need_local_merge = false; - // TODO: remove the _sink and _multi_cast_stream_sink_senders to set both // of it in pipeline task not the fragment_context #ifdef __clang__ @@ -301,7 +299,7 @@ class PipelineFragmentContext : public TaskExecutionContext { */ std::vector>> _task_runtime_states; - std::vector> _runtime_filter_states; + std::vector _runtime_filter_states; // Total instance num running on all BEs int _total_instances = -1; diff --git a/be/src/pipeline/pipeline_task.cpp b/be/src/pipeline/pipeline_task.cpp index 6f9e59c8291966..5ed725010ec364 100644 --- a/be/src/pipeline/pipeline_task.cpp +++ b/be/src/pipeline/pipeline_task.cpp @@ -22,6 +22,7 @@ #include #include +#include #include #include @@ -223,9 +224,6 @@ bool PipelineTask::_wait_to_start() { _blocked_dep = _execution_dep->is_blocked_by(this); if (_blocked_dep != nullptr) { static_cast(_blocked_dep)->start_watcher(); - if (_wake_up_by_downstream) { - _eos = true; - } return true; } @@ -233,9 +231,6 @@ bool PipelineTask::_wait_to_start() { _blocked_dep = op_dep->is_blocked_by(this); if (_blocked_dep != nullptr) { _blocked_dep->start_watcher(); - if (_wake_up_by_downstream) { - _eos = true; - } return true; } } @@ -257,9 +252,6 @@ bool PipelineTask::_is_blocked() { _blocked_dep = dep->is_blocked_by(this); if (_blocked_dep != nullptr) { _blocked_dep->start_watcher(); - if (_wake_up_by_downstream) { - _eos = true; - } return true; } } @@ -279,9 +271,6 @@ bool PipelineTask::_is_blocked() { _blocked_dep = op_dep->is_blocked_by(this); if (_blocked_dep != nullptr) { _blocked_dep->start_watcher(); - if (_wake_up_by_downstream) { - _eos = true; - } return true; } } @@ -289,15 +278,15 @@ bool PipelineTask::_is_blocked() { } Status PipelineTask::execute(bool* eos) { - SCOPED_TIMER(_task_profile->total_time_counter()); - SCOPED_TIMER(_exec_timer); - SCOPED_ATTACH_TASK(_state); - _eos = _sink->is_finished(_state) || _eos || _wake_up_by_downstream; - *eos = _eos; if (_eos) { - // If task 
is waken up by finish dependency, `_eos` is set to true by last execution, and we should return here. + *eos = true; return Status::OK(); } + + SCOPED_TIMER(_task_profile->total_time_counter()); + SCOPED_TIMER(_exec_timer); + SCOPED_ATTACH_TASK(_state); + int64_t time_spent = 0; DBUG_EXECUTE_IF("fault_inject::PipelineXTask::execute", { Status status = Status::Error("fault_inject pipeline_task execute failed"); @@ -315,32 +304,36 @@ Status PipelineTask::execute(bool* eos) { if (cpu_qs) { cpu_qs->add_cpu_nanos(delta_cpu_time); } - query_context()->update_wg_cpu_adder(delta_cpu_time); + query_context()->update_cpu_time(delta_cpu_time); }}; if (_wait_to_start()) { return Status::OK(); } - if (_wake_up_by_downstream) { - _eos = true; - *eos = true; - return Status::OK(); - } + // The status must be runnable if (!_opened && !_fragment_context->is_canceled()) { + if (_wake_up_early) { + *eos = true; + _eos = true; + return Status::OK(); + } RETURN_IF_ERROR(_open()); } + auto set_wake_up_and_dep_ready = [&]() { + if (wake_up_early()) { + return; + } + set_wake_up_early(); + clear_blocking_state(); + }; + _task_profile->add_info_string("TaskState", "Runnable"); _task_profile->add_info_string("BlockedByDependency", ""); while (!_fragment_context->is_canceled()) { if (_is_blocked()) { return Status::OK(); } - if (_wake_up_by_downstream) { - _eos = true; - *eos = true; - return Status::OK(); - } /// When a task is cancelled, /// its blocking state will be cleared and it will transition to a ready state (though it is not truly ready). 
@@ -361,47 +354,47 @@ Status PipelineTask::execute(bool* eos) { RETURN_IF_ERROR(_sink->revoke_memory(_state)); continue; } - *eos = _eos; DBUG_EXECUTE_IF("fault_inject::PipelineXTask::executing", { Status status = Status::Error("fault_inject pipeline_task executing failed"); return status; }); - // `_dry_run` means sink operator need no more data // `_sink->is_finished(_state)` means sink operator should be finished - if (_dry_run || _sink->is_finished(_state)) { - *eos = true; - _eos = true; - } else { + if (_sink->is_finished(_state)) { + set_wake_up_and_dep_ready(); + } + + // `_dry_run` means sink operator need no more data + *eos = wake_up_early() || _dry_run; + if (!*eos) { SCOPED_TIMER(_get_block_timer); _get_block_counter->update(1); RETURN_IF_ERROR(_root->get_block_after_projects(_state, block, eos)); } + if (*eos) { + RETURN_IF_ERROR(close(Status::OK(), false)); + } + if (_block->rows() != 0 || *eos) { SCOPED_TIMER(_sink_timer); - Status status = Status::OK(); - // Define a lambda function to catch sink exception, because sink will check - // return error status with EOF, it is special, could not return directly. - auto sink_function = [&]() -> Status { - Status internal_st; - internal_st = _sink->sink(_state, block, *eos); - return internal_st; - }; - status = sink_function(); - if (!status.is()) { - RETURN_IF_ERROR(status); + Status status = _sink->sink(_state, block, *eos); + + if (status.is()) { + set_wake_up_and_dep_ready(); + } else if (!status) { + return status; } - *eos = status.is() ? 
true : *eos; + if (*eos) { // just return, the scheduler will do finish work - _eos = true; _task_profile->add_info_string("TaskState", "Finished"); + _eos = true; return Status::OK(); } } } - static_cast(get_task_queue()->push_back(this)); + RETURN_IF_ERROR(get_task_queue()->push_back(this)); return Status::OK(); } @@ -470,17 +463,14 @@ void PipelineTask::finalize() { _le_state_map.clear(); } -Status PipelineTask::close(Status exec_status) { +Status PipelineTask::close(Status exec_status, bool close_sink) { int64_t close_ns = 0; - Defer defer {[&]() { - if (_task_queue) { - _task_queue->update_statistics(this, close_ns); - } - }}; Status s; { SCOPED_RAW_TIMER(&close_ns); - s = _sink->close(_state, exec_status); + if (close_sink) { + s = _sink->close(_state, exec_status); + } for (auto& op : _operators) { auto tem = op->close(_state); if (!tem.ok() && s.ok()) { @@ -489,10 +479,18 @@ Status PipelineTask::close(Status exec_status) { } } if (_opened) { - _fresh_profile_counter(); - COUNTER_SET(_close_timer, close_ns); + COUNTER_UPDATE(_close_timer, close_ns); COUNTER_UPDATE(_task_profile->total_time_counter(), close_ns); } + + if (close_sink && _opened) { + _task_profile->add_info_string("WakeUpEarly", wake_up_early() ? "true" : "false"); + _fresh_profile_counter(); + } + + if (_task_queue) { + _task_queue->update_statistics(this, close_ns); + } return s; } @@ -508,10 +506,10 @@ std::string PipelineTask::debug_string() { auto elapsed = _fragment_context->elapsed_time() / 1000000000.0; fmt::format_to(debug_string_buffer, "PipelineTask[this = {}, id = {}, open = {}, eos = {}, finish = {}, dry run = " - "{}, elapse time = {}s, _wake_up_by_downstream = {}], block dependency = {}, is " + "{}, elapse time = {}s, _wake_up_early = {}], block dependency = {}, is " "running = {}\noperators: ", (void*)this, _index, _opened, _eos, _finalized, _dry_run, elapsed, - _wake_up_by_downstream.load(), + _wake_up_early.load(), cur_blocked_dep && !_finalized ? 
cur_blocked_dep->debug_string() : "NULL", is_running()); for (size_t i = 0; i < _operators.size(); i++) { diff --git a/be/src/pipeline/pipeline_task.h b/be/src/pipeline/pipeline_task.h index 3b4627f589dc54..1a31e5954f479c 100644 --- a/be/src/pipeline/pipeline_task.h +++ b/be/src/pipeline/pipeline_task.h @@ -61,7 +61,7 @@ class PipelineTask { // if the pipeline create a bunch of pipeline task // must be call after all pipeline task is finish to release resource - Status close(Status exec_status); + Status close(Status exec_status, bool close_sink = true); PipelineFragmentContext* fragment_context() { return _fragment_context; } @@ -135,11 +135,12 @@ class PipelineTask { int task_id() const { return _index; }; bool is_finalized() const { return _finalized; } - void clear_blocking_state(bool wake_up_by_downstream = false) { + void set_wake_up_early() { _wake_up_early = true; } + + void clear_blocking_state() { _state->get_query_ctx()->get_execution_dependency()->set_always_ready(); // We use a lock to assure all dependencies are not deconstructed here. 
std::unique_lock lc(_dependency_lock); - _wake_up_by_downstream = _wake_up_by_downstream || wake_up_by_downstream; if (!_finalized) { _execution_dep->set_always_ready(); for (auto* dep : _filter_dependencies) { @@ -236,7 +237,7 @@ class PipelineTask { PipelineId pipeline_id() const { return _pipeline->id(); } - bool wake_up_by_downstream() const { return _wake_up_by_downstream; } + bool wake_up_early() const { return _wake_up_early; } private: friend class RuntimeFilterDependency; @@ -318,7 +319,7 @@ class PipelineTask { std::atomic _running = false; std::atomic _eos = false; - std::atomic _wake_up_by_downstream = false; + std::atomic _wake_up_early = false; }; } // namespace doris::pipeline diff --git a/be/src/pipeline/shuffle/writer.cpp b/be/src/pipeline/shuffle/writer.cpp new file mode 100644 index 00000000000000..c27fd9a7aeb731 --- /dev/null +++ b/be/src/pipeline/shuffle/writer.cpp @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "writer.h" + +#include "pipeline/exec/exchange_sink_operator.h" +#include "vec/core/block.h" + +namespace doris::pipeline { +#include "common/compile_check_begin.h" +template +void Writer::_handle_eof_channel(RuntimeState* state, ChannelPtrType channel, Status st) const { + channel->set_receiver_eof(st); + // Chanel will not send RPC to the downstream when eof, so close chanel by OK status. + static_cast(channel->close(state)); +} + +Status Writer::write(ExchangeSinkLocalState* local_state, RuntimeState* state, + vectorized::Block* block, bool eos) const { + auto rows = block->rows(); + { + SCOPED_TIMER(local_state->split_block_hash_compute_timer()); + RETURN_IF_ERROR(local_state->partitioner()->do_partitioning(state, block)); + } + int64_t old_channel_mem_usage = 0; + for (const auto& channel : local_state->channels) { + old_channel_mem_usage += channel->mem_usage(); + } + { + SCOPED_TIMER(local_state->distribute_rows_into_channels_timer()); + const auto& channel_filed = local_state->partitioner()->get_channel_ids(); + if (channel_filed.len == sizeof(uint32_t)) { + RETURN_IF_ERROR(_channel_add_rows(state, local_state->channels, + local_state->channels.size(), + channel_filed.get(), rows, block, eos)); + } else { + RETURN_IF_ERROR(_channel_add_rows(state, local_state->channels, + local_state->channels.size(), + channel_filed.get(), rows, block, eos)); + } + } + int64_t new_channel_mem_usage = 0; + for (const auto& channel : local_state->channels) { + new_channel_mem_usage += channel->mem_usage(); + } + COUNTER_UPDATE(local_state->memory_used_counter(), + new_channel_mem_usage - old_channel_mem_usage); + return Status::OK(); +} + +template +Status Writer::_channel_add_rows(RuntimeState* state, + std::vector>& channels, + size_t partition_count, + const ChannelIdType* __restrict channel_ids, size_t rows, + vectorized::Block* block, bool eos) const { + std::vector partition_rows_histogram; + auto row_idx = vectorized::PODArray(rows); + { + 
partition_rows_histogram.assign(partition_count + 2, 0); + for (size_t i = 0; i < rows; ++i) { + partition_rows_histogram[channel_ids[i] + 1]++; + } + for (size_t i = 1; i <= partition_count + 1; ++i) { + partition_rows_histogram[i] += partition_rows_histogram[i - 1]; + } + for (int32_t i = cast_set(rows) - 1; i >= 0; --i) { + row_idx[partition_rows_histogram[channel_ids[i] + 1] - 1] = i; + partition_rows_histogram[channel_ids[i] + 1]--; + } + } +#define HANDLE_CHANNEL_STATUS(state, channel, status) \ + do { \ + if (status.is()) { \ + _handle_eof_channel(state, channel, status); \ + } else { \ + RETURN_IF_ERROR(status); \ + } \ + } while (0) + Status status = Status::OK(); + for (size_t i = 0; i < partition_count; ++i) { + uint32_t start = partition_rows_histogram[i + 1]; + uint32_t size = partition_rows_histogram[i + 2] - start; + if (!channels[i]->is_receiver_eof() && size > 0) { + status = channels[i]->add_rows(block, row_idx.data(), start, size, false); + HANDLE_CHANNEL_STATUS(state, channels[i], status); + } + } + if (eos) { + for (int i = 0; i < partition_count; ++i) { + if (!channels[i]->is_receiver_eof()) { + status = channels[i]->add_rows(block, row_idx.data(), 0, 0, true); + HANDLE_CHANNEL_STATUS(state, channels[i], status); + } + } + } + return Status::OK(); +} + +} // namespace doris::pipeline diff --git a/be/src/pipeline/shuffle/writer.h b/be/src/pipeline/shuffle/writer.h new file mode 100644 index 00000000000000..0eb772120293e5 --- /dev/null +++ b/be/src/pipeline/shuffle/writer.h @@ -0,0 +1,53 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "vec/sink/vdata_stream_sender.h" + +namespace doris { +class RuntimeState; +class Status; +namespace vectorized { +class Block; +class Channel; +} // namespace vectorized +namespace pipeline { + +#include "common/compile_check_begin.h" +class ExchangeSinkLocalState; + +class Writer { +public: + Writer() = default; + + Status write(ExchangeSinkLocalState* local_state, RuntimeState* state, vectorized::Block* block, + bool eos) const; + +private: + template + Status _channel_add_rows(RuntimeState* state, + std::vector>& channels, + size_t partition_count, const ChannelIdType* __restrict channel_ids, + size_t rows, vectorized::Block* block, bool eos) const; + + template + void _handle_eof_channel(RuntimeState* state, ChannelPtrType channel, Status st) const; +}; +#include "common/compile_check_end.h" +} // namespace pipeline +} // namespace doris \ No newline at end of file diff --git a/be/src/pipeline/task_scheduler.h b/be/src/pipeline/task_scheduler.h index bdb5bec1776f58..3c1b08063dfa61 100644 --- a/be/src/pipeline/task_scheduler.h +++ b/be/src/pipeline/task_scheduler.h @@ -43,7 +43,7 @@ namespace doris::pipeline { class TaskScheduler { public: - TaskScheduler(int core_num, std::string name, CgroupCpuCtl* cgroup_cpu_ctl) + TaskScheduler(int core_num, std::string name, std::shared_ptr cgroup_cpu_ctl) : _task_queue(core_num), _shutdown(false), _name(std::move(name)), @@ -65,7 +65,7 @@ class TaskScheduler { std::vector _markers; bool _shutdown; std::string _name; - CgroupCpuCtl* _cgroup_cpu_ctl = nullptr; + std::weak_ptr 
_cgroup_cpu_ctl; void _do_work(int index); }; diff --git a/be/src/runtime/buffer_control_block.cpp b/be/src/runtime/buffer_control_block.cpp index 1ed2836f8eb616..6f4427746f8103 100644 --- a/be/src/runtime/buffer_control_block.cpp +++ b/be/src/runtime/buffer_control_block.cpp @@ -30,12 +30,10 @@ #include #include -#include "arrow/record_batch.h" #include "arrow/type_fwd.h" #include "pipeline/dependency.h" #include "runtime/thread_context.h" #include "util/runtime_profile.h" -#include "util/string_util.h" #include "util/thrift_util.h" #include "vec/core/block.h" @@ -149,8 +147,8 @@ void GetArrowResultBatchCtx::on_data( delete this; } -BufferControlBlock::BufferControlBlock(const TUniqueId& id, int buffer_size, RuntimeState* state) - : _fragment_id(id), +BufferControlBlock::BufferControlBlock(TUniqueId id, int buffer_size, RuntimeState* state) + : _fragment_id(std::move(id)), _is_close(false), _is_cancelled(false), _buffer_limit(buffer_size), @@ -292,6 +290,9 @@ Status BufferControlBlock::get_arrow_batch(std::shared_ptr* r _arrow_data_arrival.wait_for(l, std::chrono::milliseconds(20)); } + if (!_status.ok()) { + return _status; + } if (_is_cancelled) { return Status::Cancelled(fmt::format("Cancelled ()", print_id(_fragment_id))); } @@ -311,9 +312,12 @@ Status BufferControlBlock::get_arrow_batch(std::shared_ptr* r // normal path end if (_is_close) { + if (!_status.ok()) { + return _status; + } std::stringstream ss; _profile.pretty_print(&ss); - VLOG_NOTICE << fmt::format( + LOG(INFO) << fmt::format( "BufferControlBlock finished, fragment_id={}, is_close={}, is_cancelled={}, " "packet_num={}, peak_memory_usage={}, profile={}", print_id(_fragment_id), _is_close, _is_cancelled, _packet_num, @@ -321,7 +325,7 @@ Status BufferControlBlock::get_arrow_batch(std::shared_ptr* r return Status::OK(); } return Status::InternalError( - fmt::format("Get Arrow Batch Abnormal Ending ()", print_id(_fragment_id))); + fmt::format("Get Arrow Batch Abnormal Ending (), ()", 
print_id(_fragment_id), _status)); } void BufferControlBlock::get_arrow_batch(GetArrowResultBatchCtx* ctx) { @@ -354,10 +358,14 @@ void BufferControlBlock::get_arrow_batch(GetArrowResultBatchCtx* ctx) { // normal path end if (_is_close) { + if (!_status.ok()) { + ctx->on_failure(_status); + return; + } ctx->on_close(_packet_num); std::stringstream ss; _profile.pretty_print(&ss); - VLOG_NOTICE << fmt::format( + LOG(INFO) << fmt::format( "BufferControlBlock finished, fragment_id={}, is_close={}, is_cancelled={}, " "packet_num={}, peak_memory_usage={}, profile={}", print_id(_fragment_id), _is_close, _is_cancelled, _packet_num, @@ -391,8 +399,8 @@ Status BufferControlBlock::find_arrow_schema(std::shared_ptr* arr if (_is_close) { return Status::RuntimeError(fmt::format("Closed ()", print_id(_fragment_id))); } - return Status::InternalError( - fmt::format("Get Arrow Schema Abnormal Ending ()", print_id(_fragment_id))); + return Status::InternalError(fmt::format("Get Arrow Schema Abnormal Ending (), ()", + print_id(_fragment_id), _status)); } Status BufferControlBlock::close(const TUniqueId& id, Status exec_status) { diff --git a/be/src/runtime/buffer_control_block.h b/be/src/runtime/buffer_control_block.h index 249e1ba7652df7..9060007232e3bd 100644 --- a/be/src/runtime/buffer_control_block.h +++ b/be/src/runtime/buffer_control_block.h @@ -21,10 +21,10 @@ #include #include #include -#include #include #include +#include #include #include #include @@ -34,7 +34,6 @@ #include "common/status.h" #include "runtime/query_statistics.h" #include "runtime/runtime_state.h" -#include "util/hash_util.hpp" namespace google::protobuf { class Closure; @@ -98,13 +97,15 @@ struct GetArrowResultBatchCtx { // buffer used for result customer and producer class BufferControlBlock { public: - BufferControlBlock(const TUniqueId& id, int buffer_size, RuntimeState* state); + BufferControlBlock(TUniqueId id, int buffer_size, RuntimeState* state); ~BufferControlBlock(); Status init(); + // try to 
consume _waiting_rpc or make data waiting in _fe_result_batch_queue. try to combine block to reduce rpc first. Status add_batch(RuntimeState* state, std::unique_ptr& result); Status add_arrow_batch(RuntimeState* state, std::shared_ptr& result); + // if there's Block waiting in _fe_result_batch_queue, send it(by on_data). otherwise make a rpc wait in _waiting_rpc. void get_batch(GetResultBatchCtx* ctx); // for ArrowFlightBatchLocalReader Status get_arrow_batch(std::shared_ptr* result, @@ -150,7 +151,7 @@ class BufferControlBlock { const int _buffer_limit; int64_t _packet_num; - // blocking queue for batch + // Producer. blocking queue for result batch waiting to sent to FE by _waiting_rpc. FeResultQueue _fe_result_batch_queue; ArrowFlightResultQueue _arrow_flight_result_batch_queue; // for arrow flight @@ -163,6 +164,7 @@ class BufferControlBlock { // TODO, waiting for data will block pipeline, so use a request pool to save requests waiting for data. std::condition_variable _arrow_data_arrival; + // Consumer. RPCs which FE waiting for result. when _fe_result_batch_queue filled, the rpc could be sent. 
std::deque _waiting_rpc; std::deque _waiting_arrow_result_batch_rpc; diff --git a/be/src/runtime/exec_env.h b/be/src/runtime/exec_env.h index 813f0833ad4061..636ce2bf288b58 100644 --- a/be/src/runtime/exec_env.h +++ b/be/src/runtime/exec_env.h @@ -178,7 +178,6 @@ class ExecEnv { std::vector mem_tracker_limiter_pool; void init_mem_tracker(); std::shared_ptr orphan_mem_tracker() { return _orphan_mem_tracker; } - std::shared_ptr page_no_cache_mem_tracker() { return _page_no_cache_mem_tracker; } std::shared_ptr brpc_iobuf_block_memory_tracker() { return _brpc_iobuf_block_memory_tracker; } @@ -188,6 +187,15 @@ class ExecEnv { std::shared_ptr stream_load_pipe_tracker() { return _stream_load_pipe_tracker; } + std::shared_ptr tablets_no_cache_mem_tracker() { + return _tablets_no_cache_mem_tracker; + } + std::shared_ptr rowsets_no_cache_mem_tracker() { + return _rowsets_no_cache_mem_tracker; + } + std::shared_ptr segments_no_cache_mem_tracker() { + return _segments_no_cache_mem_tracker; + } std::shared_ptr point_query_executor_mem_tracker() { return _point_query_executor_mem_tracker; } @@ -294,6 +302,7 @@ class ExecEnv { static void set_tracking_memory(bool tracking_memory) { _s_tracking_memory.store(tracking_memory, std::memory_order_release); } + void set_orc_memory_pool(orc::MemoryPool* pool) { _orc_memory_pool = pool; } #endif LoadStreamMapPool* load_stream_map_pool() { return _load_stream_map_pool.get(); } @@ -376,13 +385,15 @@ class ExecEnv { // Ideally, all threads are expected to attach to the specified tracker, so that "all memory has its own ownership", // and the consumption of the orphan mem tracker is close to 0, but greater than 0. std::shared_ptr _orphan_mem_tracker; - // page size not in cache, data page/index page/etc. - std::shared_ptr _page_no_cache_mem_tracker; std::shared_ptr _brpc_iobuf_block_memory_tracker; // Count the memory consumption of segment compaction tasks. 
std::shared_ptr _segcompaction_mem_tracker; std::shared_ptr _stream_load_pipe_tracker; + std::shared_ptr _tablets_no_cache_mem_tracker; + std::shared_ptr _rowsets_no_cache_mem_tracker; + std::shared_ptr _segments_no_cache_mem_tracker; + // Tracking memory may be shared between multiple queries. std::shared_ptr _point_query_executor_mem_tracker; std::shared_ptr _block_compression_mem_tracker; diff --git a/be/src/runtime/exec_env_init.cpp b/be/src/runtime/exec_env_init.cpp index 75ec588aa50c1d..a371cdb947ff56 100644 --- a/be/src/runtime/exec_env_init.cpp +++ b/be/src/runtime/exec_env_init.cpp @@ -276,6 +276,7 @@ Status ExecEnv::_init(const std::vector& store_paths, _pipeline_tracer_ctx = std::make_unique(); // before query RETURN_IF_ERROR(init_pipeline_task_scheduler()); _workload_group_manager = new WorkloadGroupMgr(); + _workload_group_manager->init_internal_workload_group(); _scanner_scheduler = new doris::vectorized::ScannerScheduler(); _fragment_mgr = new FragmentMgr(this); _result_cache = new ResultCache(config::query_cache_max_size_mb, @@ -364,7 +365,8 @@ Status ExecEnv::_init(const std::vector& store_paths, return st; } _storage_engine->set_heartbeat_flags(this->heartbeat_flags()); - if (st = _storage_engine->start_bg_threads(); !st.ok()) { + WorkloadGroupPtr internal_wg = _workload_group_manager->get_internal_wg(); + if (st = _storage_engine->start_bg_threads(internal_wg); !st.ok()) { LOG(ERROR) << "Failed to starge bg threads of storage engine, res=" << st; return st; } @@ -419,40 +421,28 @@ void ExecEnv::init_file_cache_factory(std::vector& cache_paths std::unordered_set cache_path_set; Status rest = doris::parse_conf_cache_paths(doris::config::file_cache_path, cache_paths); if (!rest) { - LOG(FATAL) << "parse config file cache path failed, path=" - << doris::config::file_cache_path; + LOG(FATAL) << "parse config file cache path failed, path=" << doris::config::file_cache_path + << ", reason=" << rest.msg(); exit(-1); } - std::vector 
file_cache_init_threads; - std::list cache_status; + doris::Status cache_status; for (auto& cache_path : cache_paths) { if (cache_path_set.find(cache_path.path) != cache_path_set.end()) { LOG(WARNING) << fmt::format("cache path {} is duplicate", cache_path.path); continue; } - file_cache_init_threads.emplace_back([&, status = &cache_status.emplace_back()]() { - *status = doris::io::FileCacheFactory::instance()->create_file_cache( - cache_path.path, cache_path.init_settings()); - }); - - cache_path_set.emplace(cache_path.path); - } - - for (std::thread& thread : file_cache_init_threads) { - if (thread.joinable()) { - thread.join(); - } - } - for (const auto& status : cache_status) { - if (!status.ok()) { + cache_status = doris::io::FileCacheFactory::instance()->create_file_cache( + cache_path.path, cache_path.init_settings()); + if (!cache_status.ok()) { if (!doris::config::ignore_broken_disk) { - LOG(FATAL) << "failed to init file cache, err: " << status; + LOG(FATAL) << "failed to init file cache, err: " << cache_status; exit(-1); } - LOG(WARNING) << "failed to init file cache, err: " << status; + LOG(WARNING) << "failed to init file cache, err: " << cache_status; } + cache_path_set.emplace(cache_path.path); } } @@ -609,15 +599,20 @@ void ExecEnv::init_mem_tracker() { _s_tracking_memory = true; _orphan_mem_tracker = MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "Orphan"); - _page_no_cache_mem_tracker = std::make_shared("PageNoCache"); _brpc_iobuf_block_memory_tracker = MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "IOBufBlockMemory"); _segcompaction_mem_tracker = - MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "SegCompaction"); + MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::COMPACTION, "SegCompaction"); + _tablets_no_cache_mem_tracker = MemTrackerLimiter::create_shared( + MemTrackerLimiter::Type::METADATA, "Tablets(not in SchemaCache, TabletSchemaCache)"); + _segments_no_cache_mem_tracker = 
MemTrackerLimiter::create_shared( + MemTrackerLimiter::Type::METADATA, "Segments(not in SegmentCache)"); + _rowsets_no_cache_mem_tracker = + MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::METADATA, "Rowsets"); _point_query_executor_mem_tracker = MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "PointQueryExecutor"); _query_cache_mem_tracker = - MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "QueryCache"); + MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::CACHE, "QueryCache"); _block_compression_mem_tracker = MemTrackerLimiter::create_shared(MemTrackerLimiter::Type::GLOBAL, "BlockCompression"); _rowid_storage_reader_tracker = @@ -726,7 +721,7 @@ void ExecEnv::destroy() { _file_cache_open_fd_cache.reset(); SAFE_STOP(_write_cooldown_meta_executors); - // StorageEngine must be destoried before _page_no_cache_mem_tracker.reset and _cache_manager destory + // StorageEngine must be destoried before _cache_manager destory SAFE_STOP(_storage_engine); _storage_engine.reset(); diff --git a/be/src/runtime/fragment_mgr.cpp b/be/src/runtime/fragment_mgr.cpp index 95e5f8e2ce14f3..19e8f76366c084 100644 --- a/be/src/runtime/fragment_mgr.cpp +++ b/be/src/runtime/fragment_mgr.cpp @@ -34,17 +34,16 @@ #include #include #include -#include #include #include #include #include #include -#include #include #include -#include +#include +#include #include "common/status.h" // IWYU pragma: no_include @@ -58,19 +57,16 @@ #include #include -#include "cloud/config.h" #include "common/config.h" #include "common/logging.h" #include "common/object_pool.h" #include "common/utils.h" -#include "gutil/strings/substitute.h" #include "io/fs/stream_load_pipe.h" #include "pipeline/pipeline_fragment_context.h" #include "runtime/client_cache.h" #include "runtime/descriptors.h" #include "runtime/exec_env.h" #include "runtime/frontend_info.h" -#include "runtime/memory/mem_tracker_limiter.h" #include "runtime/primitive_type.h" #include 
"runtime/query_context.h" #include "runtime/runtime_filter_mgr.h" @@ -89,24 +85,20 @@ #include "util/debug_points.h" #include "util/debug_util.h" #include "util/doris_metrics.h" -#include "util/hash_util.hpp" -#include "util/mem_info.h" #include "util/network_util.h" -#include "util/pretty_printer.h" #include "util/runtime_profile.h" #include "util/thread.h" #include "util/threadpool.h" #include "util/thrift_util.h" #include "util/uid_util.h" -#include "util/url_coding.h" #include "vec/runtime/shared_hash_table_controller.h" -#include "vec/runtime/vdatetime_value.h" namespace doris { DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(fragment_instance_count, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(timeout_canceled_fragment_count, MetricUnit::NOUNIT); DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(fragment_thread_pool_queue_size, MetricUnit::NOUNIT); +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(fragment_thread_pool_num_active_threads, MetricUnit::NOUNIT); bvar::LatencyRecorder g_fragmentmgr_prepare_latency("doris_FragmentMgr", "prepare"); bvar::Adder g_fragment_executing_count("fragment_executing_count"); @@ -184,7 +176,7 @@ static Status _do_fetch_running_queries_rpc(const FrontendInfo& fe_info, } // Avoid logic error in frontend. 
- if (rpc_result.__isset.status == false || rpc_result.status.status_code != TStatusCode::OK) { + if (!rpc_result.__isset.status || rpc_result.status.status_code != TStatusCode::OK) { LOG_WARNING("Failed to fetch running queries from {}, reason: {}", PrintThriftNetworkAddress(fe_info.info.coordinator_address), doris::to_string(rpc_result.status.status_code)); @@ -193,7 +185,7 @@ static Status _do_fetch_running_queries_rpc(const FrontendInfo& fe_info, doris::to_string(rpc_result.status.status_code)); } - if (rpc_result.__isset.running_queries == false) { + if (!rpc_result.__isset.running_queries) { return Status::InternalError("Failed to fetch running queries from {}, reason: {}", PrintThriftNetworkAddress(fe_info.info.coordinator_address), "running_queries is not set"); @@ -254,6 +246,8 @@ FragmentMgr::FragmentMgr(ExecEnv* exec_env) REGISTER_HOOK_METRIC(fragment_thread_pool_queue_size, [this]() { return _thread_pool->get_queue_size(); }); + REGISTER_HOOK_METRIC(fragment_thread_pool_num_active_threads, + [this]() { return _thread_pool->num_active_threads(); }); CHECK(s.ok()) << s.to_string(); } @@ -262,6 +256,7 @@ FragmentMgr::~FragmentMgr() = default; void FragmentMgr::stop() { DEREGISTER_HOOK_METRIC(fragment_instance_count); DEREGISTER_HOOK_METRIC(fragment_thread_pool_queue_size); + DEREGISTER_HOOK_METRIC(fragment_thread_pool_num_active_threads); _stop_background_threads_latch.count_down(); if (_cancel_thread) { _cancel_thread->join(); @@ -269,8 +264,11 @@ void FragmentMgr::stop() { // Only me can delete { - std::lock_guard lock(_lock); + std::unique_lock lock(_query_ctx_map_mutex); _query_ctx_map.clear(); + } + { + std::unique_lock lock(_pipeline_map_mutex); _pipeline_map.clear(); } _thread_pool->shutdown(); @@ -392,12 +390,20 @@ void FragmentMgr::coordinator_callback(const ReportStatusRequest& req) { params.load_counters.emplace(s_unselected_rows, std::to_string(num_rows_load_unselected)); if (!req.runtime_state->get_error_log_file_path().empty()) { - 
params.__set_tracking_url( - to_load_error_http_path(req.runtime_state->get_error_log_file_path())); + std::string error_log_url = + to_load_error_http_path(req.runtime_state->get_error_log_file_path()); + LOG(INFO) << "error log file path: " << error_log_url + << ", query id: " << print_id(req.query_id) + << ", fragment instance id: " << print_id(req.fragment_instance_id); + params.__set_tracking_url(error_log_url); } else if (!req.runtime_states.empty()) { for (auto* rs : req.runtime_states) { if (!rs->get_error_log_file_path().empty()) { - params.__set_tracking_url(to_load_error_http_path(rs->get_error_log_file_path())); + std::string error_log_url = to_load_error_http_path(rs->get_error_log_file_path()); + LOG(INFO) << "error log file path: " << error_log_url + << ", query id: " << print_id(req.query_id) + << ", fragment instance id: " << print_id(rs->fragment_instance_id()); + params.__set_tracking_url(error_log_url); } if (rs->wal_id() > 0) { params.__set_txn_id(rs->wal_id()); @@ -583,11 +589,7 @@ Status FragmentMgr::start_query_execution(const PExecPlanFragmentStartRequest* r TUniqueId query_id; query_id.__set_hi(request->query_id().hi()); query_id.__set_lo(request->query_id().lo()); - std::shared_ptr q_ctx = nullptr; - { - std::lock_guard lock(_lock); - q_ctx = _get_or_erase_query_ctx(query_id); - } + auto q_ctx = get_query_ctx(query_id); if (q_ctx) { q_ctx->set_ready_to_execute(Status::OK()); LOG_INFO("Query {} start execution", print_id(query_id)); @@ -602,114 +604,112 @@ Status FragmentMgr::start_query_execution(const PExecPlanFragmentStartRequest* r void FragmentMgr::remove_pipeline_context( std::shared_ptr f_context) { - { - std::lock_guard lock(_lock); - auto query_id = f_context->get_query_id(); - int64 now = duration_cast( - std::chrono::system_clock::now().time_since_epoch()) - .count(); - g_fragment_executing_count << -1; - g_fragment_last_active_time.set_value(now); - _pipeline_map.erase({query_id, f_context->get_fragment_id()}); - } + auto 
query_id = f_context->get_query_id(); + int64 now = duration_cast( + std::chrono::system_clock::now().time_since_epoch()) + .count(); + g_fragment_executing_count << -1; + g_fragment_last_active_time.set_value(now); + + std::unique_lock lock(_pipeline_map_mutex); + _pipeline_map.erase({query_id, f_context->get_fragment_id()}); } -std::shared_ptr FragmentMgr::_get_or_erase_query_ctx(const TUniqueId& query_id) { +std::shared_ptr FragmentMgr::get_query_ctx(const TUniqueId& query_id) { + std::shared_lock lock(_query_ctx_map_mutex); auto search = _query_ctx_map.find(query_id); if (search != _query_ctx_map.end()) { if (auto q_ctx = search->second.lock()) { return q_ctx; - } else { - LOG(WARNING) << "Query context (query id = " << print_id(query_id) - << ") has been released."; - _query_ctx_map.erase(search); - return nullptr; } } return nullptr; } -std::shared_ptr FragmentMgr::get_or_erase_query_ctx_with_lock( - const TUniqueId& query_id) { - std::unique_lock lock(_lock); - return _get_or_erase_query_ctx(query_id); -} - -template -Status FragmentMgr::_get_query_ctx(const Params& params, TUniqueId query_id, bool pipeline, - QuerySource query_source, - std::shared_ptr& query_ctx) { +Status FragmentMgr::_get_or_create_query_ctx(const TPipelineFragmentParams& params, + TUniqueId query_id, bool pipeline, + QuerySource query_source, + std::shared_ptr& query_ctx) { DBUG_EXECUTE_IF("FragmentMgr._get_query_ctx.failed", { return Status::InternalError("FragmentMgr._get_query_ctx.failed, query id {}", print_id(query_id)); }); + + // Find _query_ctx_map, in case some other request has already + // create the query fragments context. + query_ctx = get_query_ctx(query_id); if (params.is_simplified_param) { // Get common components from _query_ctx_map - std::lock_guard lock(_lock); - if (auto q_ctx = _get_or_erase_query_ctx(query_id)) { - query_ctx = q_ctx; - } else { + if (!query_ctx) { return Status::InternalError( "Failed to get query fragments context. 
Query {} may be timeout or be " "cancelled. host: {}", print_id(query_id), BackendOptions::get_localhost()); } } else { - // Find _query_ctx_map, in case some other request has already - // create the query fragments context. - std::lock_guard lock(_lock); - if (auto q_ctx = _get_or_erase_query_ctx(query_id)) { - query_ctx = q_ctx; - return Status::OK(); - } + if (!query_ctx) { + std::unique_lock lock(_query_ctx_map_mutex); + // Only one thread need create query ctx. other thread just get query_ctx in _query_ctx_map. + auto search = _query_ctx_map.find(query_id); + if (search != _query_ctx_map.end()) { + query_ctx = search->second.lock(); + } - // First time a fragment of a query arrived. print logs. - LOG(INFO) << "query_id: " << print_id(query_id) << ", coord_addr: " << params.coord - << ", total fragment num on current host: " << params.fragment_num_on_host - << ", fe process uuid: " << params.query_options.fe_process_uuid - << ", query type: " << params.query_options.query_type - << ", report audit fe:" << params.current_connect_fe; - - // This may be a first fragment request of the query. - // Create the query fragments context. 
- query_ctx = QueryContext::create_shared(query_id, _exec_env, params.query_options, - params.coord, params.is_nereids, - params.current_connect_fe, query_source); - SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(query_ctx->query_mem_tracker); - RETURN_IF_ERROR(DescriptorTbl::create(&(query_ctx->obj_pool), params.desc_tbl, - &(query_ctx->desc_tbl))); - // set file scan range params - if (params.__isset.file_scan_params) { - query_ctx->file_scan_range_params_map = params.file_scan_params; - } + if (!query_ctx) { + WorkloadGroupPtr workload_group_ptr = nullptr; + std::string wg_info_str = "Workload Group not set"; + if (params.__isset.workload_groups && !params.workload_groups.empty()) { + uint64_t wg_id = params.workload_groups[0].id; + workload_group_ptr = _exec_env->workload_group_mgr()->get_group(wg_id); + if (workload_group_ptr != nullptr) { + wg_info_str = workload_group_ptr->debug_string(); + } else { + wg_info_str = "set wg but not find it in be"; + } + } - query_ctx->query_globals = params.query_globals; + // First time a fragment of a query arrived. print logs. + LOG(INFO) << "query_id: " << print_id(query_id) << ", coord_addr: " << params.coord + << ", total fragment num on current host: " << params.fragment_num_on_host + << ", fe process uuid: " << params.query_options.fe_process_uuid + << ", query type: " << params.query_options.query_type + << ", report audit fe:" << params.current_connect_fe + << ", use wg:" << wg_info_str; + + // This may be a first fragment request of the query. + // Create the query fragments context. 
+ query_ctx = QueryContext::create_shared(query_id, _exec_env, params.query_options, + params.coord, params.is_nereids, + params.current_connect_fe, query_source); + SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(query_ctx->query_mem_tracker); + RETURN_IF_ERROR(DescriptorTbl::create(&(query_ctx->obj_pool), params.desc_tbl, + &(query_ctx->desc_tbl))); + // set file scan range params + if (params.__isset.file_scan_params) { + query_ctx->file_scan_range_params_map = params.file_scan_params; + } - if (params.__isset.resource_info) { - query_ctx->user = params.resource_info.user; - query_ctx->group = params.resource_info.group; - query_ctx->set_rsc_info = true; - } + query_ctx->query_globals = params.query_globals; - _set_scan_concurrency(params, query_ctx.get()); - - if (params.__isset.workload_groups && !params.workload_groups.empty()) { - uint64_t tg_id = params.workload_groups[0].id; - WorkloadGroupPtr workload_group_ptr = - _exec_env->workload_group_mgr()->get_task_group_by_id(tg_id); - if (workload_group_ptr != nullptr) { - RETURN_IF_ERROR(workload_group_ptr->add_query(query_id, query_ctx)); - RETURN_IF_ERROR(query_ctx->set_workload_group(workload_group_ptr)); - _exec_env->runtime_query_statistics_mgr()->set_workload_group_id(print_id(query_id), - tg_id); - } else { - LOG(WARNING) << "Query/load id: " << print_id(query_ctx->query_id()) - << "can't find its workload group " << tg_id; + if (params.__isset.resource_info) { + query_ctx->user = params.resource_info.user; + query_ctx->group = params.resource_info.group; + query_ctx->set_rsc_info = true; + } + + _set_scan_concurrency(params, query_ctx.get()); + + if (workload_group_ptr != nullptr) { + RETURN_IF_ERROR(workload_group_ptr->add_query(query_id, query_ctx)); + query_ctx->set_workload_group(workload_group_ptr); + _exec_env->runtime_query_statistics_mgr()->set_workload_group_id( + print_id(query_id), workload_group_ptr->id()); + } + // There is some logic in query ctx's dctor, we could not check if exists and delete 
the + // temp query ctx now. For example, the query id maybe removed from workload group's queryset. + _query_ctx_map.insert({query_id, query_ctx}); } } - // There is some logic in query ctx's dctor, we could not check if exists and delete the - // temp query ctx now. For example, the query id maybe removed from workload group's queryset. - _query_ctx_map.insert(std::make_pair(query_ctx->query_id(), query_ctx)); } return Status::OK(); } @@ -723,13 +723,13 @@ std::string FragmentMgr::dump_pipeline_tasks(int64_t duration) { fmt::memory_buffer debug_string_buffer; size_t i = 0; { - std::lock_guard lock(_lock); fmt::format_to(debug_string_buffer, "{} pipeline fragment contexts are still running! duration_limit={}\n", _pipeline_map.size(), duration); - timespec now; clock_gettime(CLOCK_MONOTONIC, &now); + + std::shared_lock lock(_pipeline_map_mutex); for (auto& it : _pipeline_map) { auto elapsed = it.second->elapsed_time() / 1000000000.0; if (elapsed < duration) { @@ -748,10 +748,12 @@ std::string FragmentMgr::dump_pipeline_tasks(int64_t duration) { } std::string FragmentMgr::dump_pipeline_tasks(TUniqueId& query_id) { - if (auto q_ctx = _get_or_erase_query_ctx(query_id)) { + if (auto q_ctx = get_query_ctx(query_id)) { return q_ctx->print_all_pipeline_context(); } else { - return fmt::format("Query context (query id = {}) not found. \n", print_id(query_id)); + return fmt::format( + "Dump pipeline tasks failed: Query context (query id = {}) not found. 
\n", + print_id(query_id)); } } @@ -765,7 +767,8 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, << apache::thrift::ThriftDebugString(params.query_options).c_str(); std::shared_ptr query_ctx; - RETURN_IF_ERROR(_get_query_ctx(params, params.query_id, true, query_source, query_ctx)); + RETURN_IF_ERROR( + _get_or_create_query_ctx(params, params.query_id, true, query_source, query_ctx)); SCOPED_ATTACH_TASK(query_ctx.get()); int64_t duration_ns = 0; std::shared_ptr context = @@ -798,16 +801,8 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, } { - // (query_id, fragment_id) is executed only on one BE, locks _pipeline_map. - std::lock_guard lock(_lock); for (const auto& local_param : params.local_params) { const TUniqueId& fragment_instance_id = local_param.fragment_instance_id; - auto iter = _pipeline_map.find({params.query_id, params.fragment_id}); - if (iter != _pipeline_map.end()) { - return Status::InternalError( - "exec_plan_fragment query_id({}) input duplicated fragment_id({})", - print_id(params.query_id), params.fragment_id); - } query_ctx->fragment_instance_ids.push_back(fragment_instance_id); } @@ -816,7 +811,15 @@ Status FragmentMgr::exec_plan_fragment(const TPipelineFragmentParams& params, .count(); g_fragment_executing_count << 1; g_fragment_last_active_time.set_value(now); - // TODO: simplify this mapping + + // (query_id, fragment_id) is executed only on one BE, locks _pipeline_map. 
+ std::unique_lock lock(_pipeline_map_mutex); + auto iter = _pipeline_map.find({params.query_id, params.fragment_id}); + if (iter != _pipeline_map.end()) { + return Status::InternalError( + "exec_plan_fragment query_id({}) input duplicated fragment_id({})", + print_id(params.query_id), params.fragment_id); + } _pipeline_map.insert({{params.query_id, params.fragment_id}, context}); } @@ -846,8 +849,7 @@ void FragmentMgr::cancel_query(const TUniqueId query_id, const Status reason) { std::shared_ptr query_ctx = nullptr; std::vector all_instance_ids; { - std::lock_guard state_lock(_lock); - if (auto q_ctx = _get_or_erase_query_ctx(query_id)) { + if (auto q_ctx = get_query_ctx(query_id)) { query_ctx = q_ctx; // Copy instanceids to avoid concurrent modification. // And to reduce the scope of lock. @@ -860,7 +862,7 @@ void FragmentMgr::cancel_query(const TUniqueId query_id, const Status reason) { } query_ctx->cancel(reason); { - std::lock_guard state_lock(_lock); + std::unique_lock l(_query_ctx_map_mutex); _query_ctx_map.erase(query_id); } LOG(INFO) << "Query " << print_id(query_id) @@ -896,7 +898,7 @@ void FragmentMgr::cancel_worker() { std::vector> ctx; { - std::lock_guard lock(_lock); + std::shared_lock lock(_pipeline_map_mutex); ctx.reserve(_pipeline_map.size()); for (auto& pipeline_itr : _pipeline_map) { ctx.push_back(pipeline_itr.second); @@ -908,29 +910,34 @@ void FragmentMgr::cancel_worker() { std::unordered_map, BrpcItem> brpc_stub_with_queries; { - std::lock_guard lock(_lock); - for (auto it = _query_ctx_map.begin(); it != _query_ctx_map.end();) { - if (auto q_ctx = it->second.lock()) { - if (q_ctx->is_timeout(now)) { - LOG_WARNING("Query {} is timeout", print_id(it->first)); - queries_timeout.push_back(it->first); - } else if (config::enable_brpc_connection_check) { - auto brpc_stubs = q_ctx->get_using_brpc_stubs(); - for (auto& item : brpc_stubs) { - if (!brpc_stub_with_queries.contains(item.second)) { - brpc_stub_with_queries.emplace(item.second, - BrpcItem 
{item.first, {q_ctx}}); - } else { - brpc_stub_with_queries[item.second].queries.emplace_back(q_ctx); + { + // TODO: Now only the cancel worker do the GC the _query_ctx_map. each query must + // do erase the finish query unless in _query_ctx_map. Rethink the logic is ok + std::unique_lock lock(_query_ctx_map_mutex); + for (auto it = _query_ctx_map.begin(); it != _query_ctx_map.end();) { + if (auto q_ctx = it->second.lock()) { + if (q_ctx->is_timeout(now)) { + LOG_WARNING("Query {} is timeout", print_id(it->first)); + queries_timeout.push_back(it->first); + } else if (config::enable_brpc_connection_check) { + auto brpc_stubs = q_ctx->get_using_brpc_stubs(); + for (auto& item : brpc_stubs) { + if (!brpc_stub_with_queries.contains(item.second)) { + brpc_stub_with_queries.emplace(item.second, + BrpcItem {item.first, {q_ctx}}); + } else { + brpc_stub_with_queries[item.second].queries.emplace_back(q_ctx); + } } } + ++it; + } else { + it = _query_ctx_map.erase(it); } - ++it; - } else { - it = _query_ctx_map.erase(it); } } + std::shared_lock lock(_query_ctx_map_mutex); // We use a very conservative cancel strategy. // 0. If there are no running frontends, do not cancel any queries. // 1. 
If query's process uuid is zero, do not cancel @@ -1017,8 +1024,14 @@ void FragmentMgr::cancel_worker() { } } - for (auto it : brpc_stub_with_queries) { - _check_brpc_available(it.first, it.second); + if (config::enable_brpc_connection_check) { + for (auto it : brpc_stub_with_queries) { + if (!it.first) { + LOG(WARNING) << "brpc stub is nullptr, skip it."; + continue; + } + _check_brpc_available(it.first, it.second); + } } if (!queries_lost_coordinator.empty()) { @@ -1054,12 +1067,15 @@ void FragmentMgr::_check_brpc_available(const std::shared_ptr(100, config::brpc_connection_check_timeout_ms); + while (true) { PHandShakeRequest request; request.set_hello(message); PHandShakeResponse response; brpc::Controller cntl; - cntl.set_timeout_ms(500 * (failed_count + 1)); + cntl.set_timeout_ms(check_timeout_ms); cntl.set_max_retry(10); brpc_stub->hand_shake(&cntl, &request, &response, nullptr); @@ -1204,7 +1220,7 @@ Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, const auto& fragment_ids = request->fragment_ids(); { - std::unique_lock lock(_lock); + std::shared_lock lock(_pipeline_map_mutex); for (auto fragment_id : fragment_ids) { auto iter = _pipeline_map.find({UniqueId(request->query_id()).to_thrift(), fragment_id}); @@ -1239,7 +1255,9 @@ Status FragmentMgr::apply_filterv2(const PPublishFilterRequestV2* request, RETURN_IF_ERROR(IRuntimeFilter::create_wrapper(¶ms, &filter_wrapper)); std::ranges::for_each(filters, [&](auto& filter) { - filter->update_filter(filter_wrapper, request->merge_time(), start_apply); + filter->update_filter( + filter_wrapper, request->merge_time(), start_apply, + request->has_local_merge_time() ? 
request->local_merge_time() : 0); }); } @@ -1254,18 +1272,19 @@ Status FragmentMgr::send_filter_size(const PSendFilterSizeRequest* request) { TUniqueId query_id; query_id.__set_hi(queryid.hi); query_id.__set_lo(queryid.lo); - std::lock_guard lock(_lock); - if (auto q_ctx = _get_or_erase_query_ctx(query_id)) { + if (auto q_ctx = get_query_ctx(query_id)) { query_ctx = q_ctx; } else { - return Status::EndOfFile("Query context (query-id: {}) not found, maybe finished", - queryid.to_string()); + return Status::EndOfFile( + "Send filter size failed: Query context (query-id: {}) not found, maybe " + "finished", + queryid.to_string()); } } std::shared_ptr filter_controller; RETURN_IF_ERROR(_runtimefilter_controller.acquire(queryid, &filter_controller)); - auto merge_status = filter_controller->send_filter_size(request); + auto merge_status = filter_controller->send_filter_size(query_ctx, request); return merge_status; } @@ -1276,12 +1295,12 @@ Status FragmentMgr::sync_filter_size(const PSyncFilterSizeRequest* request) { TUniqueId query_id; query_id.__set_hi(queryid.hi); query_id.__set_lo(queryid.lo); - std::lock_guard lock(_lock); - if (auto q_ctx = _get_or_erase_query_ctx(query_id)) { + if (auto q_ctx = get_query_ctx(query_id)) { query_ctx = q_ctx; } else { - return Status::InvalidArgument("Query context (query-id: {}) not found", - queryid.to_string()); + return Status::EndOfFile( + "Sync filter size failed: Query context (query-id: {}) already finished", + queryid.to_string()); } } return query_ctx->runtime_filter_mgr()->sync_filter_size(request); @@ -1296,24 +1315,24 @@ Status FragmentMgr::merge_filter(const PMergeFilterRequest* request, TUniqueId query_id; query_id.__set_hi(queryid.hi); query_id.__set_lo(queryid.lo); - std::lock_guard lock(_lock); - if (auto q_ctx = _get_or_erase_query_ctx(query_id)) { + if (auto q_ctx = get_query_ctx(query_id)) { query_ctx = q_ctx; } else { - return Status::InvalidArgument("Query context (query-id: {}) not found", - 
queryid.to_string()); + return Status::EndOfFile( + "Merge filter size failed: Query context (query-id: {}) already finished", + queryid.to_string()); } } SCOPED_ATTACH_TASK(query_ctx.get()); std::shared_ptr filter_controller; RETURN_IF_ERROR(_runtimefilter_controller.acquire(queryid, &filter_controller)); - auto merge_status = filter_controller->merge(request, attach_data); + auto merge_status = filter_controller->merge(query_ctx, request, attach_data); return merge_status; } void FragmentMgr::get_runtime_query_info(std::vector* query_info_list) { { - std::lock_guard lock(_lock); + std::unique_lock lock(_query_ctx_map_mutex); for (auto iter = _query_ctx_map.begin(); iter != _query_ctx_map.end();) { if (auto q_ctx = iter->second.lock()) { WorkloadQueryInfo workload_query_info; @@ -1336,19 +1355,9 @@ Status FragmentMgr::get_realtime_exec_status(const TUniqueId& query_id, return Status::InvalidArgument("exes_status is nullptr"); } - std::shared_ptr query_context = nullptr; - - { - std::lock_guard lock(_lock); - if (auto q_ctx = _get_or_erase_query_ctx(query_id)) { - query_context = q_ctx; - } else { - return Status::NotFound("Query {} has been released", print_id(query_id)); - } - } - + std::shared_ptr query_context = get_query_ctx(query_id); if (query_context == nullptr) { - return Status::NotFound("Query {} not found", print_id(query_id)); + return Status::NotFound("Query {} not found or released", print_id(query_id)); } *exec_status = query_context->get_realtime_exec_status(); diff --git a/be/src/runtime/fragment_mgr.h b/be/src/runtime/fragment_mgr.h index 0eac0469683961..e85fb07cba68da 100644 --- a/be/src/runtime/fragment_mgr.h +++ b/be/src/runtime/fragment_mgr.h @@ -21,9 +21,8 @@ #include #include #include -#include -#include +#include #include #include #include @@ -133,7 +132,7 @@ class FragmentMgr : public RestMonitorIface { ThreadPool* get_thread_pool() { return _thread_pool.get(); } int32_t running_query_num() { - std::unique_lock ctx_lock(_lock); + 
std::shared_lock lock(_query_ctx_map_mutex); return _query_ctx_map.size(); } @@ -145,7 +144,7 @@ class FragmentMgr : public RestMonitorIface { Status get_realtime_exec_status(const TUniqueId& query_id, TReportExecStatusParams* exec_status); - std::shared_ptr get_or_erase_query_ctx_with_lock(const TUniqueId& query_id); + std::shared_ptr get_query_ctx(const TUniqueId& query_id); private: struct BrpcItem { @@ -153,14 +152,12 @@ class FragmentMgr : public RestMonitorIface { std::vector> queries; }; - std::shared_ptr _get_or_erase_query_ctx(const TUniqueId& query_id); - template void _set_scan_concurrency(const Param& params, QueryContext* query_ctx); - template - Status _get_query_ctx(const Params& params, TUniqueId query_id, bool pipeline, - QuerySource query_type, std::shared_ptr& query_ctx); + Status _get_or_create_query_ctx(const TPipelineFragmentParams& params, TUniqueId query_id, + bool pipeline, QuerySource query_type, + std::shared_ptr& query_ctx); void _check_brpc_available(const std::shared_ptr& brpc_stub, const BrpcItem& brpc_item); @@ -168,20 +165,21 @@ class FragmentMgr : public RestMonitorIface { // This is input params ExecEnv* _exec_env = nullptr; + // The lock protect the `_pipeline_map` + std::shared_mutex _pipeline_map_mutex; + // (QueryID, FragmentID) -> PipelineFragmentContext + phmap::flat_hash_map, + std::shared_ptr> + _pipeline_map; + // The lock should only be used to protect the structures in fragment manager. Has to be // used in a very small scope because it may dead lock. For example, if the _lock is used // in prepare stage, the call path is prepare --> expr prepare --> may call allocator // when allocate failed, allocator may call query_is_cancelled, query is callced will also // call _lock, so that there is dead lock. 
- std::mutex _lock; - - // (QueryID, FragmentID) -> PipelineFragmentContext - std::unordered_map, - std::shared_ptr> - _pipeline_map; - + std::shared_mutex _query_ctx_map_mutex; // query id -> QueryContext - std::unordered_map> _query_ctx_map; + phmap::flat_hash_map> _query_ctx_map; std::unordered_map> _bf_size_map; CountDownLatch _stop_background_threads_latch; diff --git a/be/src/runtime/jsonb_value.cpp b/be/src/runtime/jsonb_value.cpp index e88ce3b3d74d1a..0227281fdd0d0e 100644 --- a/be/src/runtime/jsonb_value.cpp +++ b/be/src/runtime/jsonb_value.cpp @@ -28,7 +28,7 @@ namespace doris { -Status JsonBinaryValue::from_json_string(const char* s, int length) { +Status JsonBinaryValue::from_json_string(const char* s, size_t length) { JsonbErrType error = JsonbErrType::E_NONE; if (!parser.parse(s, length)) { error = parser.getErrorCode(); diff --git a/be/src/runtime/jsonb_value.h b/be/src/runtime/jsonb_value.h index 1df9469e1720cd..65f4927759c304 100644 --- a/be/src/runtime/jsonb_value.h +++ b/be/src/runtime/jsonb_value.h @@ -43,7 +43,7 @@ struct JsonBinaryValue { JsonbParser parser; JsonBinaryValue() : ptr(nullptr), len(0) {} - JsonBinaryValue(char* ptr, int len) { + JsonBinaryValue(char* ptr, size_t len) { static_cast(from_json_string(const_cast(ptr), len)); } JsonBinaryValue(const std::string& s) { @@ -115,7 +115,7 @@ struct JsonBinaryValue { __builtin_unreachable(); } - Status from_json_string(const char* s, int len); + Status from_json_string(const char* s, size_t len); std::string to_json_string() const; diff --git a/be/src/runtime/load_channel.cpp b/be/src/runtime/load_channel.cpp index 9369c0c833c53c..6dfd5d46eb6480 100644 --- a/be/src/runtime/load_channel.cpp +++ b/be/src/runtime/load_channel.cpp @@ -45,8 +45,7 @@ LoadChannel::LoadChannel(const UniqueId& load_id, int64_t timeout_s, bool is_hig _backend_id(backend_id), _enable_profile(enable_profile) { std::shared_ptr query_context = - ExecEnv::GetInstance()->fragment_mgr()->get_or_erase_query_ctx_with_lock( - 
_load_id.to_thrift()); + ExecEnv::GetInstance()->fragment_mgr()->get_query_ctx(_load_id.to_thrift()); std::shared_ptr mem_tracker = nullptr; WorkloadGroupPtr wg_ptr = nullptr; @@ -60,7 +59,7 @@ LoadChannel::LoadChannel(const UniqueId& load_id, int64_t timeout_s, bool is_hig fmt::format("(FromLoadChannel)Load#Id={}", _load_id.to_string())); if (wg_id > 0) { WorkloadGroupPtr workload_group_ptr = - ExecEnv::GetInstance()->workload_group_mgr()->get_task_group_by_id(wg_id); + ExecEnv::GetInstance()->workload_group_mgr()->get_group(wg_id); if (workload_group_ptr) { wg_ptr = workload_group_ptr; wg_ptr->add_mem_tracker_limiter(mem_tracker); @@ -98,7 +97,6 @@ void LoadChannel::_init_profile() { _load_id.to_string(), _sender_ip, _backend_id), true, true); _add_batch_number_counter = ADD_COUNTER(_self_profile, "NumberBatchAdded", TUnit::UNIT); - _peak_memory_usage_counter = ADD_COUNTER(_self_profile, "PeakMemoryUsage", TUnit::BYTES); _add_batch_timer = ADD_TIMER(_self_profile, "AddBatchTime"); _handle_eos_timer = ADD_CHILD_TIMER(_self_profile, "HandleEosTime", "AddBatchTime"); _add_batch_times = ADD_COUNTER(_self_profile, "AddBatchTimes", TUnit::UNIT); diff --git a/be/src/runtime/load_channel.h b/be/src/runtime/load_channel.h index 36a8f363ba9bac..2889bcf256515b 100644 --- a/be/src/runtime/load_channel.h +++ b/be/src/runtime/load_channel.h @@ -91,7 +91,6 @@ class LoadChannel { std::unique_ptr _profile; RuntimeProfile* _self_profile = nullptr; RuntimeProfile::Counter* _add_batch_number_counter = nullptr; - RuntimeProfile::Counter* _peak_memory_usage_counter = nullptr; RuntimeProfile::Counter* _add_batch_timer = nullptr; RuntimeProfile::Counter* _add_batch_times = nullptr; RuntimeProfile::Counter* _mgr_add_batch_timer = nullptr; diff --git a/be/src/runtime/load_stream.cpp b/be/src/runtime/load_stream.cpp index 752e2ff95b2917..60da45fa685fbf 100644 --- a/be/src/runtime/load_stream.cpp +++ b/be/src/runtime/load_stream.cpp @@ -428,7 +428,7 @@ LoadStream::LoadStream(PUniqueId 
load_id, LoadStreamMgr* load_stream_mgr, bool e TUniqueId load_tid = ((UniqueId)load_id).to_thrift(); #ifndef BE_TEST std::shared_ptr query_context = - ExecEnv::GetInstance()->fragment_mgr()->get_or_erase_query_ctx_with_lock(load_tid); + ExecEnv::GetInstance()->fragment_mgr()->get_query_ctx(load_tid); if (query_context != nullptr) { _query_thread_context = {load_tid, query_context->query_mem_tracker, query_context->workload_group()}; diff --git a/be/src/runtime/memory/cache_policy.h b/be/src/runtime/memory/cache_policy.h index e7e1c73e7cbb41..8f077a4eb45bb1 100644 --- a/be/src/runtime/memory/cache_policy.h +++ b/be/src/runtime/memory/cache_policy.h @@ -17,6 +17,8 @@ #pragma once +#include + #include "util/runtime_profile.h" namespace doris { @@ -123,6 +125,7 @@ class CachePolicy { {"CloudTabletCache", CacheType::CLOUD_TABLET_CACHE}, {"CloudTxnDeleteBitmapCache", CacheType::CLOUD_TXN_DELETE_BITMAP_CACHE}, {"ForUTCacheNumber", CacheType::FOR_UT_CACHE_NUMBER}, + {"QueryCache", CacheType::QUERY_CACHE}, {"TabletColumnObjectPool", CacheType::TABLET_COLUMN_OBJECT_POOL}}; static CacheType string_to_type(std::string type) { @@ -133,6 +136,9 @@ class CachePolicy { } } + inline static std::vector MetadataCache { + CacheType::SEGMENT_CACHE, CacheType::SCHEMA_CACHE, CacheType::TABLET_SCHEMA_CACHE}; + CachePolicy(CacheType type, size_t capacity, uint32_t stale_sweep_time_s, bool enable_prune); virtual ~CachePolicy(); diff --git a/be/src/runtime/memory/jemalloc_hook.cpp b/be/src/runtime/memory/jemalloc_hook.cpp index 445d60d382c270..dffc1344b71dbc 100644 --- a/be/src/runtime/memory/jemalloc_hook.cpp +++ b/be/src/runtime/memory/jemalloc_hook.cpp @@ -60,7 +60,7 @@ void* doris_realloc(void* p, size_t size) __THROW { return nullptr; } -#if USE_MEM_TRACKER +#if defined(USE_MEM_TRACKER) && !defined(BE_TEST) int64_t old_size = jemalloc_usable_size(p); CONSUME_THREAD_MEM_TRACKER_BY_HOOK_WITH_FN( [](size_t size, int64_t old_size) { return jenallocx(size, 0) - old_size; }, size, diff --git 
a/be/src/runtime/memory/lru_cache_policy.h b/be/src/runtime/memory/lru_cache_policy.h index ea34e2837f1313..3fdb43facd7715 100644 --- a/be/src/runtime/memory/lru_cache_policy.h +++ b/be/src/runtime/memory/lru_cache_policy.h @@ -104,20 +104,26 @@ class LRUCachePolicy : public CachePolicy { return _mem_tracker->consumption(); } + int64_t value_mem_consumption() { + DCHECK(_value_mem_tracker != nullptr); + return _value_mem_tracker->consumption(); + } + // Insert will consume tracking_bytes to _mem_tracker and cache value destroy will release tracking_bytes. - // If LRUCacheType::SIZE, tracking_bytes usually equal to charge. - // If LRUCacheType::NUMBER, tracking_bytes usually not equal to charge, at this time charge is an weight. - // If LRUCacheType::SIZE and tracking_bytes equals 0, memory must be tracked in Doris Allocator, + // If LRUCacheType::SIZE, value_tracking_bytes usually equal to charge. + // If LRUCacheType::NUMBER, value_tracking_bytes usually not equal to charge, at this time charge is an weight. + // If LRUCacheType::SIZE and value_tracking_bytes equals 0, memory must be tracked in Doris Allocator, // cache value is allocated using Alloctor. - // If LRUCacheType::NUMBER and tracking_bytes equals 0, usually currently cannot accurately tracking memory size, + // If LRUCacheType::NUMBER and value_tracking_bytes equals 0, usually currently cannot accurately tracking memory size, // only tracking handle_size(106). 
- Cache::Handle* insert(const CacheKey& key, void* value, size_t charge, size_t tracking_bytes, + Cache::Handle* insert(const CacheKey& key, void* value, size_t charge, + size_t value_tracking_bytes, CachePriority priority = CachePriority::NORMAL) { - size_t tracking_bytes_with_handle = sizeof(LRUHandle) - 1 + key.size() + tracking_bytes; + size_t tracking_bytes = sizeof(LRUHandle) - 1 + key.size() + value_tracking_bytes; if (value != nullptr) { - mem_tracker()->consume(tracking_bytes_with_handle); ((LRUCacheValueBase*)value) - ->set_tracking_bytes(tracking_bytes_with_handle, _mem_tracker); + ->set_tracking_bytes(tracking_bytes, _mem_tracker, value_tracking_bytes, + _value_mem_tracker); } return _cache->insert(key, value, charge, priority); } @@ -265,9 +271,18 @@ class LRUCachePolicy : public CachePolicy { protected: void _init_mem_tracker(const std::string& type_name) { - _mem_tracker = MemTrackerLimiter::create_shared( - MemTrackerLimiter::Type::GLOBAL, - fmt::format("{}[{}]", type_string(_type), type_name)); + if (std::find(CachePolicy::MetadataCache.begin(), CachePolicy::MetadataCache.end(), + _type) == CachePolicy::MetadataCache.end()) { + _mem_tracker = MemTrackerLimiter::create_shared( + MemTrackerLimiter::Type::CACHE, + fmt::format("{}[{}]", type_string(_type), type_name)); + } else { + _mem_tracker = MemTrackerLimiter::create_shared( + MemTrackerLimiter::Type::METADATA, + fmt::format("{}[{}]", type_string(_type), type_name)); + } + _value_mem_tracker = std::make_shared( + fmt::format("{}::Value[{}]", type_string(_type), type_name)); } // if check_capacity failed, will return dummy lru cache, @@ -277,6 +292,7 @@ class LRUCachePolicy : public CachePolicy { LRUCacheType _lru_cache_type; std::shared_ptr _mem_tracker; + std::shared_ptr _value_mem_tracker; }; } // namespace doris diff --git a/be/src/runtime/memory/lru_cache_value_base.h b/be/src/runtime/memory/lru_cache_value_base.h index f9e534e6600df8..a9a3ae5ddab632 100644 --- 
a/be/src/runtime/memory/lru_cache_value_base.h +++ b/be/src/runtime/memory/lru_cache_value_base.h @@ -28,18 +28,27 @@ class LRUCacheValueBase { virtual ~LRUCacheValueBase() { if (_tracking_bytes > 0) { _mem_tracker->release(_tracking_bytes); + _value_mem_tracker->release(_value_tracking_bytes); } } void set_tracking_bytes(size_t tracking_bytes, - const std::shared_ptr& mem_tracker) { + const std::shared_ptr& mem_tracker, + size_t value_tracking_bytes, + const std::shared_ptr& value_mem_tracker) { this->_tracking_bytes = tracking_bytes; this->_mem_tracker = mem_tracker; + this->_value_tracking_bytes = value_tracking_bytes; + this->_value_mem_tracker = value_mem_tracker; + _mem_tracker->consume(_tracking_bytes); + _value_mem_tracker->consume(_value_tracking_bytes); } protected: size_t _tracking_bytes = 0; + size_t _value_tracking_bytes = 0; std::shared_ptr _mem_tracker; + std::shared_ptr _value_mem_tracker; }; } // namespace doris diff --git a/be/src/runtime/memory/mem_tracker_limiter.cpp b/be/src/runtime/memory/mem_tracker_limiter.cpp index 05ff13f0e7c646..ac4684835a670c 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.cpp +++ b/be/src/runtime/memory/mem_tracker_limiter.cpp @@ -66,9 +66,13 @@ MemTrackerLimiter::MemTrackerLimiter(Type type, const std::string& label, int64_ _uid = UniqueId::gen_uid(); if (_type == Type::GLOBAL) { _group_num = 0; + } else if (_type == Type::METADATA) { + _group_num = 1; + } else if (_type == Type::CACHE) { + _group_num = 2; } else { _group_num = - mem_tracker_limiter_group_counter.fetch_add(1) % (MEM_TRACKER_GROUP_NUM - 1) + 1; + mem_tracker_limiter_group_counter.fetch_add(1) % (MEM_TRACKER_GROUP_NUM - 3) + 3; } // currently only select/load need runtime query statistics @@ -208,24 +212,20 @@ std::string MemTrackerLimiter::print_address_sanitizers() { RuntimeProfile* MemTrackerLimiter::make_profile(RuntimeProfile* profile) const { RuntimeProfile* profile_snapshot = profile->create_child( fmt::format("{}@{}@id={}", _label, 
type_string(_type), _uid.to_string()), true, false); - RuntimeProfile::Counter* current_usage_counter = - ADD_COUNTER(profile_snapshot, "CurrentUsage", TUnit::BYTES); - RuntimeProfile::Counter* peak_usage_counter = - ADD_COUNTER(profile_snapshot, "PeakUsage", TUnit::BYTES); - COUNTER_SET(current_usage_counter, consumption()); - COUNTER_SET(peak_usage_counter, peak_consumption()); + RuntimeProfile::HighWaterMarkCounter* usage_counter = + profile_snapshot->AddHighWaterMarkCounter("Memory", TUnit::BYTES); + COUNTER_SET(usage_counter, peak_consumption()); + COUNTER_SET(usage_counter, consumption()); if (has_limit()) { RuntimeProfile::Counter* limit_counter = ADD_COUNTER(profile_snapshot, "Limit", TUnit::BYTES); COUNTER_SET(limit_counter, _limit); } if (reserved_peak_consumption() != 0) { - RuntimeProfile::Counter* reserved_counter = - ADD_COUNTER(profile_snapshot, "ReservedMemory", TUnit::BYTES); - RuntimeProfile::Counter* reserved_peak_counter = - ADD_COUNTER(profile_snapshot, "ReservedPeakMemory", TUnit::BYTES); + RuntimeProfile::HighWaterMarkCounter* reserved_counter = + profile_snapshot->AddHighWaterMarkCounter("ReservedMemory", TUnit::BYTES); + COUNTER_SET(reserved_counter, reserved_peak_consumption()); COUNTER_SET(reserved_counter, reserved_consumption()); - COUNTER_SET(reserved_peak_counter, reserved_peak_consumption()); } return profile_snapshot; } @@ -268,8 +268,26 @@ void MemTrackerLimiter::make_type_trackers_profile(RuntimeProfile* profile, tracker->make_profile(profile); } } + } else if (type == Type::METADATA) { + std::lock_guard l( + ExecEnv::GetInstance()->mem_tracker_limiter_pool[1].group_lock); + for (auto trackerWptr : ExecEnv::GetInstance()->mem_tracker_limiter_pool[1].trackers) { + auto tracker = trackerWptr.lock(); + if (tracker != nullptr) { + tracker->make_profile(profile); + } + } + } else if (type == Type::CACHE) { + std::lock_guard l( + ExecEnv::GetInstance()->mem_tracker_limiter_pool[2].group_lock); + for (auto trackerWptr : 
ExecEnv::GetInstance()->mem_tracker_limiter_pool[2].trackers) { + auto tracker = trackerWptr.lock(); + if (tracker != nullptr) { + tracker->make_profile(profile); + } + } } else { - for (unsigned i = 1; i < ExecEnv::GetInstance()->mem_tracker_limiter_pool.size(); ++i) { + for (unsigned i = 3; i < ExecEnv::GetInstance()->mem_tracker_limiter_pool.size(); ++i) { std::lock_guard l( ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].group_lock); for (auto trackerWptr : ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].trackers) { @@ -296,8 +314,8 @@ void MemTrackerLimiter::make_top_consumption_tasks_tracker_profile(RuntimeProfil std::unique_ptr tmp_profile_snapshot = std::make_unique("tmpSnapshot"); std::priority_queue> max_pq; - // start from 2, not include global type. - for (unsigned i = 1; i < ExecEnv::GetInstance()->mem_tracker_limiter_pool.size(); ++i) { + // start from 3, not include global/metadata/cache type. + for (unsigned i = 3; i < ExecEnv::GetInstance()->mem_tracker_limiter_pool.size(); ++i) { std::lock_guard l( ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].group_lock); for (auto trackerWptr : ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].trackers) { @@ -326,13 +344,19 @@ void MemTrackerLimiter::make_all_tasks_tracker_profile(RuntimeProfile* profile) types_profile[Type::SCHEMA_CHANGE] = profile->create_child("SchemaChangeTasks", true, false); types_profile[Type::OTHER] = profile->create_child("OtherTasks", true, false); - // start from 2, not include global type. - for (unsigned i = 1; i < ExecEnv::GetInstance()->mem_tracker_limiter_pool.size(); ++i) { + // start from 3, not include global/metadata/cache type. 
+ for (unsigned i = 3; i < ExecEnv::GetInstance()->mem_tracker_limiter_pool.size(); ++i) { std::lock_guard l( ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].group_lock); for (auto trackerWptr : ExecEnv::GetInstance()->mem_tracker_limiter_pool[i].trackers) { auto tracker = trackerWptr.lock(); if (tracker != nullptr) { + // BufferControlBlock will continue to exist for 5 minutes after the query ends, even if the + // result buffer is empty, and will not be shown in the profile. of course, this code is tricky. + if (tracker->consumption() == 0 && + tracker->label().starts_with("BufferControlBlock")) { + continue; + } tracker->make_profile(types_profile[tracker->type()]); } } diff --git a/be/src/runtime/memory/mem_tracker_limiter.h b/be/src/runtime/memory/mem_tracker_limiter.h index 445856b1f6af83..43b20a410ff27c 100644 --- a/be/src/runtime/memory/mem_tracker_limiter.h +++ b/be/src/runtime/memory/mem_tracker_limiter.h @@ -77,12 +77,14 @@ class MemTrackerLimiter final { enum class GCType { PROCESS = 0, WORK_LOAD_GROUP = 1 }; enum class Type { - GLOBAL = 0, // Life cycle is the same as the process, e.g. Cache and default Orphan + GLOBAL = 0, // Life cycle is the same as the process, except cache and metadata. QUERY = 1, // Count the memory consumption of all Query tasks. LOAD = 2, // Count the memory consumption of all Load tasks. COMPACTION = 3, // Count the memory consumption of all Base and Cumulative tasks. SCHEMA_CHANGE = 4, // Count the memory consumption of all SchemaChange tasks. - OTHER = 5, + METADATA = 5, // Count the memory consumption of all Metadata. + CACHE = 6, // Count the memory consumption of all Cache. + OTHER = 7, // Count the memory consumption of all other tasks, such as Clone, Snapshot, etc.. 
}; static std::string type_string(Type type) { @@ -97,8 +99,12 @@ class MemTrackerLimiter final { return "compaction"; case Type::SCHEMA_CHANGE: return "schema_change"; + case Type::METADATA: + return "metadata"; + case Type::CACHE: + return "cache"; case Type::OTHER: - return "other"; + return "other_task"; default: LOG(FATAL) << "not match type of mem tracker limiter :" << static_cast(type); } @@ -158,6 +164,8 @@ class MemTrackerLimiter final { int64_t consumption() const { return _mem_counter.current_value(); } int64_t peak_consumption() const { return _mem_counter.peak_value(); } + // Use carefully! only memory that cannot be allocated using Doris Allocator needs to be consumed manually. + // Ideally, all memory should use Doris Allocator. void consume(int64_t bytes) { _mem_counter.add(bytes); if (_query_statistics) { diff --git a/be/src/runtime/memory/memory_profile.cpp b/be/src/runtime/memory/memory_profile.cpp index 8dbdcbdd3af769..5d649c526014af 100644 --- a/be/src/runtime/memory/memory_profile.cpp +++ b/be/src/runtime/memory/memory_profile.cpp @@ -18,6 +18,9 @@ #include "runtime/memory/memory_profile.h" #include "bvar/reducer.h" +#include "olap/metadata_adder.h" +#include "olap/schema_cache.h" +#include "olap/tablet_schema_cache.h" #include "runtime/exec_env.h" #include "runtime/memory/global_memory_arbitrator.h" #include "runtime/memory/mem_tracker_limiter.h" @@ -28,6 +31,9 @@ namespace doris { static bvar::Adder memory_all_tracked_sum_bytes("memory_all_tracked_sum_bytes"); static bvar::Adder memory_global_trackers_sum_bytes("memory_global_trackers_sum_bytes"); +static bvar::Adder memory_metadata_trackers_sum_bytes( + "memory_metadata_trackers_sum_bytes"); +static bvar::Adder memory_cache_trackers_sum_bytes("memory_cache_trackers_sum_bytes"); static bvar::Adder memory_query_trackers_sum_bytes("memory_query_trackers_sum_bytes"); static bvar::Adder memory_load_trackers_sum_bytes("memory_load_trackers_sum_bytes"); static bvar::Adder 
memory_compaction_trackers_sum_bytes( @@ -40,140 +46,122 @@ static bvar::Adder memory_all_tasks_memory_bytes("memory_all_tasks_memo static bvar::Adder memory_untracked_memory_bytes("memory_untracked_memory_bytes"); MemoryProfile::MemoryProfile() { - _memory_overview_profile.set(std::make_unique("MemoryOverviewSnapshot")); +#ifdef ADDRESS_SANITIZER + _memory_overview_profile = std::make_unique("[ASAN]MemoryOverviewSnapshot"); +#else + _memory_overview_profile = std::make_unique("MemoryOverviewSnapshot"); +#endif _global_memory_profile.set(std::make_unique("GlobalMemorySnapshot")); + _metadata_memory_profile.set(std::make_unique("MetadataMemorySnapshot")); + _cache_memory_profile.set(std::make_unique("CacheMemorySnapshot")); _top_memory_tasks_profile.set(std::make_unique("TopMemoryTasksSnapshot")); _tasks_memory_profile.set(std::make_unique("TasksMemorySnapshot")); + init_memory_overview_counter(); } -void MemoryProfile::refresh_memory_overview_profile() { -#ifdef ADDRESS_SANITIZER - std::unique_ptr memory_overview_profile = - std::make_unique("[ASAN]MemoryOverviewSnapshot"); -#else - std::unique_ptr memory_overview_profile = - std::make_unique("MemoryOverviewSnapshot"); -#endif - std::unique_ptr global_memory_profile = - std::make_unique("GlobalMemorySnapshot"); - std::unique_ptr top_memory_tasks_profile = - std::make_unique("TopMemoryTasksSnapshot"); - - // 1. 
create profile +void MemoryProfile::init_memory_overview_counter() { RuntimeProfile* untracked_memory_profile = - memory_overview_profile->create_child("UntrackedMemory", true, false); + _memory_overview_profile->create_child("UntrackedMemory", true, false); RuntimeProfile* tracked_memory_profile = - memory_overview_profile->create_child("TrackedMemory", true, false); + _memory_overview_profile->create_child("TrackedMemory", true, false); RuntimeProfile* tasks_memory_overview_profile = tracked_memory_profile->create_child("TasksMemory", true, false); RuntimeProfile* tasks_memory_overview_details_profile = tasks_memory_overview_profile->create_child("Details", true, false); RuntimeProfile* global_memory_overview_profile = tracked_memory_profile->create_child("GlobalMemory", true, false); + RuntimeProfile* metadata_memory_overview_profile = + tracked_memory_profile->create_child("MetadataMemory", true, false); + RuntimeProfile* cache_memory_overview_profile = + tracked_memory_profile->create_child("CacheMemory", true, false); RuntimeProfile* jemalloc_memory_profile = tracked_memory_profile->create_child("JemallocMemory", true, false); RuntimeProfile* jemalloc_memory_details_profile = jemalloc_memory_profile->create_child("Details", true, false); - // 2. 
add counter - // 2.1 add process memory counter - RuntimeProfile::Counter* process_physical_memory_current_usage_counter = - ADD_COUNTER(memory_overview_profile, "PhysicalMemory(VmRSS)", TUnit::BYTES); - RuntimeProfile::Counter* process_physical_memory_peak_usage_counter = - memory_overview_profile->AddHighWaterMarkCounter("PhysicalMemoryPeak", TUnit::BYTES); - RuntimeProfile::Counter* process_virtual_memory_current_usage_counter = - ADD_COUNTER(memory_overview_profile, "VirtualMemory(VmSize)", TUnit::BYTES); - RuntimeProfile::Counter* process_virtual_memory_peak_usage_counter = - memory_overview_profile->AddHighWaterMarkCounter("VirtualMemoryPeak", TUnit::BYTES); - - // 2.2 add untracked memory counter - RuntimeProfile::Counter* untracked_memory_current_usage_counter = - ADD_COUNTER(untracked_memory_profile, "CurrentUsage", TUnit::BYTES); - RuntimeProfile::Counter* untracked_memory_peak_usage_counter = - untracked_memory_profile->AddHighWaterMarkCounter("PeakUsage", TUnit::BYTES); - - // 2.3 add tracked memory counter - RuntimeProfile::Counter* tracked_memory_current_usage_counter = - ADD_COUNTER(tracked_memory_profile, "CurrentUsage", TUnit::BYTES); - RuntimeProfile::Counter* tracked_memory_peak_usage_counter = - tracked_memory_profile->AddHighWaterMarkCounter("PeakUsage", TUnit::BYTES); - - // 2.4 add jemalloc memory counter - RuntimeProfile::Counter* jemalloc_memory_current_usage_counter = - ADD_COUNTER(jemalloc_memory_profile, "CurrentUsage", TUnit::BYTES); - RuntimeProfile::Counter* jemalloc_memory_peak_usage_counter = - jemalloc_memory_profile->AddHighWaterMarkCounter("PeakUsage", TUnit::BYTES); - RuntimeProfile::Counter* jemalloc_cache_current_usage_counter = - ADD_COUNTER(jemalloc_memory_details_profile, "Cache", TUnit::BYTES); - RuntimeProfile::Counter* jemalloc_cache_peak_usage_counter = - jemalloc_memory_details_profile->AddHighWaterMarkCounter("CachePeak", TUnit::BYTES); - RuntimeProfile::Counter* jemalloc_metadata_current_usage_counter = - 
ADD_COUNTER(jemalloc_memory_details_profile, "Metadata", TUnit::BYTES); - RuntimeProfile::Counter* jemalloc_metadata_peak_usage_counter = - jemalloc_memory_details_profile->AddHighWaterMarkCounter("MetadataPeak", TUnit::BYTES); - - // 2.5 add global memory counter - RuntimeProfile::Counter* global_current_usage_counter = - ADD_COUNTER(global_memory_overview_profile, "CurrentUsage", TUnit::BYTES); - RuntimeProfile::Counter* global_peak_usage_counter = - global_memory_overview_profile->AddHighWaterMarkCounter("PeakUsage", TUnit::BYTES); - - // 2.6 add tasks memory counter - RuntimeProfile::Counter* tasks_memory_current_usage_counter = - ADD_COUNTER_WITH_LEVEL(tasks_memory_overview_profile, "CurrentUsage", TUnit::BYTES, 1); + // 1 add process memory counter + _process_physical_memory_usage_counter = _memory_overview_profile->AddHighWaterMarkCounter( + "PhysicalMemory(VmRSS)", TUnit::BYTES); + _process_virtual_memory_usage_counter = _memory_overview_profile->AddHighWaterMarkCounter( + "VirtualMemory(VmSize)", TUnit::BYTES); + + // 2 add untracked/tracked memory counter + _untracked_memory_usage_counter = + untracked_memory_profile->AddHighWaterMarkCounter("Memory", TUnit::BYTES); + _tracked_memory_usage_counter = + tracked_memory_profile->AddHighWaterMarkCounter("Memory", TUnit::BYTES); + + // 3 add Jemalloc memory counter + _jemalloc_memory_usage_counter = + jemalloc_memory_profile->AddHighWaterMarkCounter("Memory", TUnit::BYTES); + _jemalloc_cache_usage_counter = + jemalloc_memory_details_profile->AddHighWaterMarkCounter("Cache", TUnit::BYTES); + _jemalloc_metadata_usage_counter = + jemalloc_memory_details_profile->AddHighWaterMarkCounter("Metadata", TUnit::BYTES); + + // 4 add global/metadata/cache memory counter + _global_usage_counter = + global_memory_overview_profile->AddHighWaterMarkCounter("Memory", TUnit::BYTES); + _metadata_usage_counter = + metadata_memory_overview_profile->AddHighWaterMarkCounter("Memory", TUnit::BYTES); + _cache_usage_counter = + 
cache_memory_overview_profile->AddHighWaterMarkCounter("Memory", TUnit::BYTES); + + // 5 add tasks memory counter + _tasks_memory_usage_counter = + tasks_memory_overview_profile->AddHighWaterMarkCounter("Memory", TUnit::BYTES); // Reserved memory is the sum of all task reserved memory, is duplicated with all task memory counter. - RuntimeProfile::Counter* reserved_memory_current_usage_counter = ADD_CHILD_COUNTER_WITH_LEVEL( - tasks_memory_overview_profile, "ReservedMemory", TUnit::BYTES, "CurrentUsage", 1); - RuntimeProfile::Counter* reserved_memory_peak_usage_counter = - tasks_memory_overview_profile->AddHighWaterMarkCounter("ReservedMemoryPeak", - TUnit::BYTES, "CurrentUsage", 1); - RuntimeProfile::Counter* tasks_memory_peak_usage_counter = - tasks_memory_overview_profile->AddHighWaterMarkCounter("PeakUsage", TUnit::BYTES); - RuntimeProfile::Counter* query_current_usage_counter = - ADD_COUNTER_WITH_LEVEL(tasks_memory_overview_details_profile, "Query", TUnit::BYTES, 1); - RuntimeProfile::Counter* query_peak_usage_counter = - tasks_memory_overview_details_profile->AddHighWaterMarkCounter( - "QueryPeak", TUnit::BYTES, "Query", 1); - RuntimeProfile::Counter* load_current_usage_counter = - ADD_COUNTER_WITH_LEVEL(tasks_memory_overview_details_profile, "Load", TUnit::BYTES, 1); - RuntimeProfile::Counter* load_peak_usage_counter = - tasks_memory_overview_details_profile->AddHighWaterMarkCounter("LoadPeak", TUnit::BYTES, - "Load", 1); - RuntimeProfile::Counter* load_all_memtables_current_usage_counter = - ADD_CHILD_COUNTER_WITH_LEVEL(tasks_memory_overview_details_profile, - "AllMemTablesMemory", TUnit::BYTES, "Load", 1); - RuntimeProfile::Counter* load_all_memtables_peak_usage_counter = - ADD_CHILD_COUNTER_WITH_LEVEL(tasks_memory_overview_details_profile, - "AllMemTablesMemoryPeak", TUnit::BYTES, "Load", 1); - RuntimeProfile::Counter* compaction_current_usage_counter = ADD_COUNTER_WITH_LEVEL( - tasks_memory_overview_details_profile, "Compaction", TUnit::BYTES, 1); - 
RuntimeProfile::Counter* compaction_peak_usage_counter = - tasks_memory_overview_details_profile->AddHighWaterMarkCounter( - "CompactionPeak", TUnit::BYTES, "Compaction", 1); - RuntimeProfile::Counter* schema_change_current_usage_counter = ADD_COUNTER_WITH_LEVEL( - tasks_memory_overview_details_profile, "SchemaChange", TUnit::BYTES, 1); - RuntimeProfile::Counter* schema_change_peak_usage_counter = - tasks_memory_overview_details_profile->AddHighWaterMarkCounter( - "SchemaChangePeak", TUnit::BYTES, "SchemaChange", 1); - RuntimeProfile::Counter* other_current_usage_counter = - ADD_COUNTER_WITH_LEVEL(tasks_memory_overview_details_profile, "Other", TUnit::BYTES, 1); - RuntimeProfile::Counter* other_peak_usage_counter = - tasks_memory_overview_details_profile->AddHighWaterMarkCounter( - "OtherPeak", TUnit::BYTES, "Other", 1); - // 3. refresh counter - // 3.1 refresh process memory counter - COUNTER_SET(process_physical_memory_current_usage_counter, + _reserved_memory_usage_counter = tasks_memory_overview_profile->AddHighWaterMarkCounter( + "ReservedMemory", TUnit::BYTES, "Memory", 1); + _query_usage_counter = + tasks_memory_overview_details_profile->AddHighWaterMarkCounter("Query", TUnit::BYTES); + _load_usage_counter = + tasks_memory_overview_details_profile->AddHighWaterMarkCounter("Load", TUnit::BYTES); + _load_all_memtables_usage_counter = + tasks_memory_overview_details_profile->AddHighWaterMarkCounter("AllMemTablesMemory", + TUnit::BYTES, "Load", 1); + _compaction_usage_counter = tasks_memory_overview_details_profile->AddHighWaterMarkCounter( + "Compaction", TUnit::BYTES); + _schema_change_usage_counter = tasks_memory_overview_details_profile->AddHighWaterMarkCounter( + "SchemaChange", TUnit::BYTES); + _other_usage_counter = + tasks_memory_overview_details_profile->AddHighWaterMarkCounter("Other", TUnit::BYTES); +} + +void MemoryProfile::refresh_memory_overview_profile() { + // 1 create profile + std::unique_ptr global_memory_profile = + 
std::make_unique("GlobalMemorySnapshot"); + std::unique_ptr metadata_memory_profile = + std::make_unique("MetadataMemorySnapshot"); + std::unique_ptr cache_memory_profile = + std::make_unique("CacheMemorySnapshot"); + std::unique_ptr top_memory_tasks_profile = + std::make_unique("TopMemoryTasksSnapshot"); + + // 2 refresh process memory counter + COUNTER_SET(_process_physical_memory_usage_counter, PerfCounters::get_vm_rss()); // from /proc VmRSS VmHWM - COUNTER_SET(process_physical_memory_peak_usage_counter, PerfCounters::get_vm_hwm()); - COUNTER_SET(process_virtual_memory_current_usage_counter, + COUNTER_SET(_process_virtual_memory_usage_counter, PerfCounters::get_vm_size()); // from /proc VmSize VmPeak - COUNTER_SET(process_virtual_memory_peak_usage_counter, PerfCounters::get_vm_peak()); - // 3.2 refresh tracked memory counter + // 2 refresh metadata memory tracker + ExecEnv::GetInstance()->tablets_no_cache_mem_tracker()->set_consumption( + MetadataAdder::get_all_tablets_size() - + TabletSchemaCache::instance()->value_mem_consumption() - + SchemaCache::instance()->value_mem_consumption()); + ExecEnv::GetInstance()->rowsets_no_cache_mem_tracker()->set_consumption( + MetadataAdder::get_all_rowsets_size()); + ExecEnv::GetInstance()->segments_no_cache_mem_tracker()->set_consumption( + MetadataAdder::get_all_segments_estimate_size() - + SegmentLoader::instance()->cache_mem_usage()); + + // 4 refresh tracked memory counter std::unordered_map type_mem_sum = { {MemTrackerLimiter::Type::GLOBAL, 0}, {MemTrackerLimiter::Type::QUERY, 0}, {MemTrackerLimiter::Type::LOAD, 0}, {MemTrackerLimiter::Type::COMPACTION, 0}, - {MemTrackerLimiter::Type::SCHEMA_CHANGE, 0}, {MemTrackerLimiter::Type::OTHER, 0}}; + {MemTrackerLimiter::Type::SCHEMA_CHANGE, 0}, {MemTrackerLimiter::Type::METADATA, 0}, + {MemTrackerLimiter::Type::CACHE, 0}, {MemTrackerLimiter::Type::OTHER, 0}}; // always ExecEnv::ready(), because Daemon::_stop_background_threads_latch for (auto& group : 
ExecEnv::GetInstance()->mem_tracker_limiter_pool) { std::lock_guard l(group.group_lock); @@ -191,42 +179,46 @@ void MemoryProfile::refresh_memory_overview_profile() { all_tracked_mem_sum += it.second; switch (it.first) { case MemTrackerLimiter::Type::GLOBAL: - COUNTER_SET(global_current_usage_counter, it.second); - COUNTER_SET(global_peak_usage_counter, it.second); + COUNTER_SET(_global_usage_counter, it.second); memory_global_trackers_sum_bytes << it.second - memory_global_trackers_sum_bytes.get_value(); break; case MemTrackerLimiter::Type::QUERY: - COUNTER_SET(query_current_usage_counter, it.second); - COUNTER_SET(query_peak_usage_counter, it.second); + COUNTER_SET(_query_usage_counter, it.second); tasks_trackers_mem_sum += it.second; memory_query_trackers_sum_bytes << it.second - memory_query_trackers_sum_bytes.get_value(); break; case MemTrackerLimiter::Type::LOAD: - COUNTER_SET(load_current_usage_counter, it.second); - COUNTER_SET(load_peak_usage_counter, it.second); + COUNTER_SET(_load_usage_counter, it.second); tasks_trackers_mem_sum += it.second; memory_load_trackers_sum_bytes << it.second - memory_load_trackers_sum_bytes.get_value(); break; case MemTrackerLimiter::Type::COMPACTION: - COUNTER_SET(compaction_current_usage_counter, it.second); - COUNTER_SET(compaction_peak_usage_counter, it.second); + COUNTER_SET(_compaction_usage_counter, it.second); tasks_trackers_mem_sum += it.second; memory_compaction_trackers_sum_bytes << it.second - memory_compaction_trackers_sum_bytes.get_value(); break; case MemTrackerLimiter::Type::SCHEMA_CHANGE: - COUNTER_SET(schema_change_current_usage_counter, it.second); - COUNTER_SET(schema_change_peak_usage_counter, it.second); + COUNTER_SET(_schema_change_usage_counter, it.second); tasks_trackers_mem_sum += it.second; memory_schema_change_trackers_sum_bytes << it.second - memory_schema_change_trackers_sum_bytes.get_value(); break; + case MemTrackerLimiter::Type::METADATA: + COUNTER_SET(_metadata_usage_counter, it.second); + 
memory_metadata_trackers_sum_bytes + << it.second - memory_metadata_trackers_sum_bytes.get_value(); + break; + case MemTrackerLimiter::Type::CACHE: + COUNTER_SET(_cache_usage_counter, it.second); + memory_cache_trackers_sum_bytes + << it.second - memory_cache_trackers_sum_bytes.get_value(); + break; case MemTrackerLimiter::Type::OTHER: - COUNTER_SET(other_current_usage_counter, it.second); - COUNTER_SET(other_peak_usage_counter, it.second); + COUNTER_SET(_other_usage_counter, it.second); tasks_trackers_mem_sum += it.second; memory_other_trackers_sum_bytes << it.second - memory_other_trackers_sum_bytes.get_value(); @@ -235,60 +227,52 @@ void MemoryProfile::refresh_memory_overview_profile() { MemTrackerLimiter::make_type_trackers_profile(global_memory_profile.get(), MemTrackerLimiter::Type::GLOBAL); + MemTrackerLimiter::make_type_trackers_profile(metadata_memory_profile.get(), + MemTrackerLimiter::Type::METADATA); + MemTrackerLimiter::make_type_trackers_profile(cache_memory_profile.get(), + MemTrackerLimiter::Type::CACHE); MemTrackerLimiter::make_top_consumption_tasks_tracker_profile(top_memory_tasks_profile.get(), 15); - COUNTER_SET(tasks_memory_current_usage_counter, tasks_trackers_mem_sum); - COUNTER_SET(tasks_memory_peak_usage_counter, tasks_trackers_mem_sum); + COUNTER_SET(_tasks_memory_usage_counter, tasks_trackers_mem_sum); memory_all_tasks_memory_bytes << tasks_trackers_mem_sum - memory_all_tasks_memory_bytes.get_value(); - COUNTER_SET(reserved_memory_current_usage_counter, - GlobalMemoryArbitrator::process_reserved_memory()); - COUNTER_SET(reserved_memory_peak_usage_counter, - GlobalMemoryArbitrator::process_reserved_memory()); + COUNTER_SET(_reserved_memory_usage_counter, GlobalMemoryArbitrator::process_reserved_memory()); memory_reserved_memory_bytes << GlobalMemoryArbitrator::process_reserved_memory() - memory_reserved_memory_bytes.get_value(); all_tracked_mem_sum += MemInfo::allocator_cache_mem(); - COUNTER_SET(jemalloc_cache_current_usage_counter, - 
static_cast(MemInfo::allocator_cache_mem())); - COUNTER_SET(jemalloc_cache_peak_usage_counter, + COUNTER_SET(_jemalloc_cache_usage_counter, static_cast(MemInfo::allocator_cache_mem())); all_tracked_mem_sum += MemInfo::allocator_metadata_mem(); - COUNTER_SET(jemalloc_metadata_current_usage_counter, - static_cast(MemInfo::allocator_metadata_mem())); - COUNTER_SET(jemalloc_metadata_peak_usage_counter, + COUNTER_SET(_jemalloc_metadata_usage_counter, static_cast(MemInfo::allocator_metadata_mem())); - COUNTER_SET(jemalloc_memory_current_usage_counter, - jemalloc_cache_current_usage_counter->value() + - jemalloc_metadata_current_usage_counter->value()); - COUNTER_SET(jemalloc_memory_peak_usage_counter, - jemalloc_cache_current_usage_counter->value() + - jemalloc_metadata_current_usage_counter->value()); - - COUNTER_SET(tracked_memory_current_usage_counter, all_tracked_mem_sum); - COUNTER_SET(tracked_memory_peak_usage_counter, all_tracked_mem_sum); + COUNTER_SET(_jemalloc_memory_usage_counter, + _jemalloc_cache_usage_counter->current_value() + + _jemalloc_metadata_usage_counter->current_value()); + + COUNTER_SET(_tracked_memory_usage_counter, all_tracked_mem_sum); memory_all_tracked_sum_bytes << all_tracked_mem_sum - memory_all_tracked_sum_bytes.get_value(); - // 3.3 refresh untracked memory counter + // 5 refresh untracked memory counter int64_t untracked_memory = - process_physical_memory_current_usage_counter->value() - all_tracked_mem_sum; - COUNTER_SET(untracked_memory_current_usage_counter, untracked_memory); - COUNTER_SET(untracked_memory_peak_usage_counter, untracked_memory); + _process_physical_memory_usage_counter->current_value() - all_tracked_mem_sum; + COUNTER_SET(_untracked_memory_usage_counter, untracked_memory); memory_untracked_memory_bytes << untracked_memory - memory_untracked_memory_bytes.get_value(); - // 3.4 refresh additional tracker printed when memory exceeds limit. 
- COUNTER_SET(load_all_memtables_current_usage_counter, - ExecEnv::GetInstance()->memtable_memory_limiter()->mem_tracker()->consumption()); + // 6 refresh additional tracker printed when memory exceeds limit. COUNTER_SET( - load_all_memtables_peak_usage_counter, + _load_all_memtables_usage_counter, ExecEnv::GetInstance()->memtable_memory_limiter()->mem_tracker()->peak_consumption()); + COUNTER_SET(_load_all_memtables_usage_counter, + ExecEnv::GetInstance()->memtable_memory_limiter()->mem_tracker()->consumption()); - // 4. reset profile - _memory_overview_profile.set(std::move(memory_overview_profile)); + // 7. reset profile _global_memory_profile.set(std::move(global_memory_profile)); + _metadata_memory_profile.set(std::move(metadata_memory_profile)); + _cache_memory_profile.set(std::move(cache_memory_profile)); _top_memory_tasks_profile.set(std::move(top_memory_tasks_profile)); } @@ -302,16 +286,25 @@ void MemoryProfile::refresh_tasks_memory_profile() { void MemoryProfile::make_memory_profile(RuntimeProfile* profile) const { RuntimeProfile* memory_profile_snapshot = profile->create_child("MemoryProfile", true, false); - auto memory_overview_version_ptr = _memory_overview_profile.get(); RuntimeProfile* memory_overview_profile = - memory_profile_snapshot->create_child(memory_overview_version_ptr->name(), true, false); - memory_overview_profile->merge(const_cast(memory_overview_version_ptr.get())); + memory_profile_snapshot->create_child(_memory_overview_profile->name(), true, false); + memory_overview_profile->merge(const_cast(_memory_overview_profile.get())); auto global_memory_version_ptr = _global_memory_profile.get(); RuntimeProfile* global_memory_profile = memory_profile_snapshot->create_child(global_memory_version_ptr->name(), true, false); global_memory_profile->merge(const_cast(global_memory_version_ptr.get())); + auto metadata_memory_version_ptr = _metadata_memory_profile.get(); + RuntimeProfile* metadata_memory_profile = + 
memory_profile_snapshot->create_child(metadata_memory_version_ptr->name(), true, false); + metadata_memory_profile->merge(const_cast(metadata_memory_version_ptr.get())); + + auto cache_memory_version_ptr = _cache_memory_profile.get(); + RuntimeProfile* cache_memory_profile = + memory_profile_snapshot->create_child(cache_memory_version_ptr->name(), true, false); + cache_memory_profile->merge(const_cast(cache_memory_version_ptr.get())); + auto top_memory_tasks_version_ptr = _top_memory_tasks_profile.get(); RuntimeProfile* top_memory_tasks_profile = memory_profile_snapshot->create_child( top_memory_tasks_version_ptr->name(), true, false); @@ -346,6 +339,8 @@ void MemoryProfile::print_log_process_usage() { LOG(WARNING) << "Process Memory Summary: " + GlobalMemoryArbitrator::process_mem_log_str(); LOG(WARNING) << "\n" << print_memory_overview_profile(); LOG(WARNING) << "\n" << print_global_memory_profile(); + LOG(WARNING) << "\n" << print_metadata_memory_profile(); + LOG(WARNING) << "\n" << print_cache_memory_profile(); LOG(WARNING) << "\n" << print_top_memory_tasks_profile(); } } diff --git a/be/src/runtime/memory/memory_profile.h b/be/src/runtime/memory/memory_profile.h index 9f1bab0c02a802..c6aefb72f22e1a 100644 --- a/be/src/runtime/memory/memory_profile.h +++ b/be/src/runtime/memory/memory_profile.h @@ -33,31 +33,27 @@ class MemoryProfile { void make_memory_profile(RuntimeProfile* profile) const; std::string print_memory_overview_profile() const { - std::stringstream ss; - auto version_ptr = _memory_overview_profile.get(); - version_ptr->pretty_print(&ss); - return ss.str(); + return return_memory_profile_str(_memory_overview_profile.get()); } std::string print_global_memory_profile() const { - std::stringstream ss; - auto version_ptr = _global_memory_profile.get(); - version_ptr->pretty_print(&ss); - return ss.str(); + return return_memory_profile_str(_global_memory_profile.get().get()); + } + + std::string print_metadata_memory_profile() const { + return 
return_memory_profile_str(_metadata_memory_profile.get().get()); + } + + std::string print_cache_memory_profile() const { + return return_memory_profile_str(_cache_memory_profile.get().get()); } std::string print_top_memory_tasks_profile() const { - std::stringstream ss; - auto version_ptr = _top_memory_tasks_profile.get(); - version_ptr->pretty_print(&ss); - return ss.str(); + return return_memory_profile_str(_top_memory_tasks_profile.get().get()); } std::string print_tasks_memory_profile() const { - std::stringstream ss; - auto version_ptr = _tasks_memory_profile.get(); - version_ptr->pretty_print(&ss); - return ss.str(); + return return_memory_profile_str(_tasks_memory_profile.get().get()); } static int64_t query_current_usage(); @@ -71,11 +67,50 @@ class MemoryProfile { void print_log_process_usage(); private: - MultiVersion _memory_overview_profile; + std::string return_memory_profile_str(const RuntimeProfile* profile) const { + std::stringstream ss; + profile->pretty_print(&ss); + return ss.str(); + } + + void init_memory_overview_counter(); + + std::unique_ptr _memory_overview_profile; MultiVersion _global_memory_profile; + MultiVersion _metadata_memory_profile; + MultiVersion _cache_memory_profile; MultiVersion _top_memory_tasks_profile; MultiVersion _tasks_memory_profile; + // process memory counter + RuntimeProfile::HighWaterMarkCounter* _process_physical_memory_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _process_virtual_memory_usage_counter; + + // untracked/tracked memory counter + RuntimeProfile::HighWaterMarkCounter* _untracked_memory_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _tracked_memory_usage_counter; + + // Jemalloc memory counter + RuntimeProfile::HighWaterMarkCounter* _jemalloc_memory_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _jemalloc_cache_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _jemalloc_metadata_usage_counter; + + // global/metadata/cache memory counter + 
RuntimeProfile::HighWaterMarkCounter* _global_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _metadata_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _cache_usage_counter; + + // tasks memory counter + RuntimeProfile::HighWaterMarkCounter* _tasks_memory_usage_counter; + // reserved memory is the sum of all task reserved memory, is duplicated with all task memory counter. + RuntimeProfile::HighWaterMarkCounter* _reserved_memory_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _query_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _load_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _load_all_memtables_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _compaction_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _schema_change_usage_counter; + RuntimeProfile::HighWaterMarkCounter* _other_usage_counter; + std::atomic _enable_print_log_process_usage {true}; }; diff --git a/be/src/runtime/process_profile.cpp b/be/src/runtime/process_profile.cpp index d91aedbeac2025..60b381e9c31578 100644 --- a/be/src/runtime/process_profile.cpp +++ b/be/src/runtime/process_profile.cpp @@ -19,6 +19,7 @@ #include +#include "olap/metadata_adder.h" #include "runtime/memory/memory_profile.h" namespace doris { @@ -37,8 +38,15 @@ void ProcessProfile::refresh_profile() { std::unique_ptr process_profile = std::make_unique("ProcessProfile"); _memory_profile->make_memory_profile(process_profile.get()); - _process_profile.set(std::move(process_profile)); // TODO make other profile + + // 3. dump object heap + RuntimeProfile* object_heap_dump_snapshot = + process_profile->create_child("ObjectHeapDump", true, false); + MetadataAdder::dump_metadata_object(object_heap_dump_snapshot); + // TODO dump other object (block, column, etc.) 
+ + _process_profile.set(std::move(process_profile)); } } // namespace doris diff --git a/be/src/runtime/query_context.cpp b/be/src/runtime/query_context.cpp index 811fa6002b5cf5..c777c8100ef213 100644 --- a/be/src/runtime/query_context.cpp +++ b/be/src/runtime/query_context.cpp @@ -86,7 +86,7 @@ QueryContext::QueryContext(TUniqueId query_id, ExecEnv* exec_env, _shared_hash_table_controller.reset(new vectorized::SharedHashTableController()); _execution_dependency = pipeline::Dependency::create_unique(-1, -1, "ExecutionDependency"); _runtime_filter_mgr = std::make_unique( - TUniqueId(), RuntimeFilterParamsContext::create(this), query_mem_tracker); + TUniqueId(), RuntimeFilterParamsContext::create(this), query_mem_tracker, true); _timeout_second = query_options.execution_timeout; @@ -323,14 +323,13 @@ ThreadPool* QueryContext::get_memtable_flush_pool() { } } -Status QueryContext::set_workload_group(WorkloadGroupPtr& tg) { +void QueryContext::set_workload_group(WorkloadGroupPtr& tg) { _workload_group = tg; // Should add query first, then the workload group will not be deleted. // see task_group_manager::delete_workload_group_by_ids _workload_group->add_mem_tracker_limiter(query_mem_tracker); _workload_group->get_query_scheduler(&_task_scheduler, &_scan_task_scheduler, &_memtable_flush_pool, &_remote_scan_task_scheduler); - return Status::OK(); } void QueryContext::add_fragment_profile( diff --git a/be/src/runtime/query_context.h b/be/src/runtime/query_context.h index 4746553040521b..621c5ebca90cad 100644 --- a/be/src/runtime/query_context.h +++ b/be/src/runtime/query_context.h @@ -138,7 +138,7 @@ class QueryContext { } } - Status set_workload_group(WorkloadGroupPtr& tg); + void set_workload_group(WorkloadGroupPtr& tg); int execution_timeout() const { return _query_options.__isset.execution_timeout ? _query_options.execution_timeout @@ -165,6 +165,12 @@ class QueryContext { return _query_options.__isset.fe_process_uuid ? 
_query_options.fe_process_uuid : 0; } + bool ignore_runtime_filter_error() const { + return _query_options.__isset.ignore_runtime_filter_error + ? _query_options.ignore_runtime_filter_error + : false; + } + // global runtime filter mgr, the runtime filter have remote target or // need local merge should regist here. before publish() or push_to_remote() // the runtime filter should do the local merge work @@ -235,9 +241,9 @@ class QueryContext { // only for file scan node std::map file_scan_range_params_map; - void update_wg_cpu_adder(int64_t delta_cpu_time) { + void update_cpu_time(int64_t delta_cpu_time) { if (_workload_group != nullptr) { - _workload_group->update_cpu_adder(delta_cpu_time); + _workload_group->update_cpu_time(delta_cpu_time); } } diff --git a/be/src/runtime/routine_load/routine_load_task_executor.h b/be/src/runtime/routine_load/routine_load_task_executor.h index 0e597d796c9f77..b1196f7824afac 100644 --- a/be/src/runtime/routine_load/routine_load_task_executor.h +++ b/be/src/runtime/routine_load/routine_load_task_executor.h @@ -73,6 +73,8 @@ class RoutineLoadTaskExecutor { std::vector* partition_offsets, int timeout); + ThreadPool& get_thread_pool() { return *_thread_pool; } + private: // execute the task void exec_task(std::shared_ptr ctx, DataConsumerPool* pool, diff --git a/be/src/runtime/runtime_filter_mgr.cpp b/be/src/runtime/runtime_filter_mgr.cpp index 1a238787207b17..b4a38173d72222 100644 --- a/be/src/runtime/runtime_filter_mgr.cpp +++ b/be/src/runtime/runtime_filter_mgr.cpp @@ -45,12 +45,12 @@ namespace doris { RuntimeFilterMgr::RuntimeFilterMgr(const UniqueId& query_id, RuntimeFilterParamsContext* state, - const std::shared_ptr& query_mem_tracker) { - _state = state; - _state->runtime_filter_mgr = this; - _query_mem_tracker = query_mem_tracker; - _tracker = std::make_unique("RuntimeFilterMgr(experimental)"); -} + const std::shared_ptr& query_mem_tracker, + const bool is_global) + : _is_global(is_global), + _state(state), + 
_tracker(std::make_unique("RuntimeFilterMgr(experimental)")), + _query_mem_tracker(query_mem_tracker) {} RuntimeFilterMgr::~RuntimeFilterMgr() { CHECK(_query_mem_tracker != nullptr); @@ -60,6 +60,7 @@ RuntimeFilterMgr::~RuntimeFilterMgr() { Status RuntimeFilterMgr::get_consume_filters( const int filter_id, std::vector>& consumer_filters) { + DCHECK(_is_global); std::lock_guard l(_lock); auto iter = _consumer_map.find(filter_id); if (iter == _consumer_map.end()) { @@ -72,10 +73,24 @@ Status RuntimeFilterMgr::get_consume_filters( return Status::OK(); } +std::vector> RuntimeFilterMgr::get_consume_filters( + const int filter_id) { + std::lock_guard l(_lock); + auto iter = _consumer_map.find(filter_id); + if (iter == _consumer_map.end()) { + return {}; + } + std::vector> consumer_filters; + for (auto& holder : iter->second) { + consumer_filters.emplace_back(holder.filter); + } + return consumer_filters; +} + Status RuntimeFilterMgr::register_consumer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, int node_id, std::shared_ptr* consumer_filter, - bool build_bf_exactly, bool need_local_merge) { + bool need_local_merge) { SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); int32_t key = desc.filter_id; bool has_exist = false; @@ -90,11 +105,12 @@ Status RuntimeFilterMgr::register_consumer_filter(const TRuntimeFilterDesc& desc } } + DCHECK(!(_is_global xor need_local_merge)) + << " _is_global: " << _is_global << " need_local_merge: " << need_local_merge; if (!has_exist) { std::shared_ptr filter; RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::CONSUMER, - node_id, &filter, build_bf_exactly, - need_local_merge)); + node_id, &filter)); _consumer_map[key].emplace_back(node_id, filter); *consumer_filter = filter; } else if (!need_local_merge) { @@ -106,7 +122,8 @@ Status RuntimeFilterMgr::register_consumer_filter(const TRuntimeFilterDesc& desc Status RuntimeFilterMgr::register_local_merge_producer_filter( const 
doris::TRuntimeFilterDesc& desc, const doris::TQueryOptions& options, - std::shared_ptr* producer_filter, bool build_bf_exactly) { + std::shared_ptr producer_filter) { + DCHECK(_is_global); SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); int32_t key = desc.filter_id; @@ -121,27 +138,25 @@ Status RuntimeFilterMgr::register_local_merge_producer_filter( } DCHECK(_state != nullptr); - RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::PRODUCER, -1, - producer_filter, build_bf_exactly, true)); { std::lock_guard l(*iter->second.lock); if (iter->second.filters.empty()) { std::shared_ptr merge_filter; RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, - RuntimeFilterRole::PRODUCER, -1, &merge_filter, - build_bf_exactly, true)); + RuntimeFilterRole::PRODUCER, -1, &merge_filter)); merge_filter->set_ignored(); iter->second.filters.emplace_back(merge_filter); } iter->second.merge_time++; iter->second.merge_size_times++; - iter->second.filters.emplace_back(*producer_filter); + iter->second.filters.emplace_back(producer_filter); } return Status::OK(); } Status RuntimeFilterMgr::get_local_merge_producer_filters( int filter_id, doris::LocalMergeFilters** local_merge_filters) { + DCHECK(_is_global); std::lock_guard l(_lock); auto iter = _local_merge_producer_map.find(filter_id); if (iter == _local_merge_producer_map.end()) { @@ -155,10 +170,20 @@ Status RuntimeFilterMgr::get_local_merge_producer_filters( return Status::OK(); } -Status RuntimeFilterMgr::register_producer_filter(const TRuntimeFilterDesc& desc, - const TQueryOptions& options, - std::shared_ptr* producer_filter, - bool build_bf_exactly) { +doris::LocalMergeFilters* RuntimeFilterMgr::get_local_merge_producer_filters(int filter_id) { + DCHECK(_is_global); + std::lock_guard l(_lock); + auto iter = _local_merge_producer_map.find(filter_id); + if (iter == _local_merge_producer_map.end()) { + return nullptr; + } + return &iter->second; +} + +Status 
RuntimeFilterMgr::register_producer_filter( + const TRuntimeFilterDesc& desc, const TQueryOptions& options, + std::shared_ptr* producer_filter) { + DCHECK(!_is_global); SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); int32_t key = desc.filter_id; std::lock_guard l(_lock); @@ -169,38 +194,11 @@ Status RuntimeFilterMgr::register_producer_filter(const TRuntimeFilterDesc& desc return Status::InvalidArgument("filter has registed"); } RETURN_IF_ERROR(IRuntimeFilter::create(_state, &desc, &options, RuntimeFilterRole::PRODUCER, -1, - producer_filter, build_bf_exactly)); + producer_filter)); _producer_map.emplace(key, *producer_filter); return Status::OK(); } -Status RuntimeFilterMgr::update_filter(const PPublishFilterRequest* request, - butil::IOBufAsZeroCopyInputStream* data) { - SCOPED_CONSUME_MEM_TRACKER(_tracker.get()); - UpdateRuntimeFilterParams params(request, data); - int filter_id = request->filter_id(); - std::vector> filters; - // The code is organized for upgrade compatibility to prevent infinite waiting - // old way update filter the code should be deleted after the upgrade is complete. 
- { - std::lock_guard l(_lock); - auto iter = _consumer_map.find(filter_id); - if (iter == _consumer_map.end()) { - return Status::InternalError("update_filter meet unknown filter: {}, role: CONSUMER.", - filter_id); - } - for (auto& holder : iter->second) { - filters.emplace_back(holder.filter); - } - iter->second.clear(); - } - for (auto filter : filters) { - RETURN_IF_ERROR(filter->update_filter(¶ms)); - } - - return Status::OK(); -} - void RuntimeFilterMgr::set_runtime_filter_params( const TRuntimeFilterParams& runtime_filter_params) { std::lock_guard l(_lock); @@ -221,35 +219,14 @@ Status RuntimeFilterMgr::get_merge_addr(TNetworkAddress* addr) { Status RuntimeFilterMergeControllerEntity::_init_with_desc( const TRuntimeFilterDesc* runtime_filter_desc, const TQueryOptions* query_options, - const std::vector* target_info, + const std::vector&& targetv2_info, const int producer_size) { - std::unique_lock guard(_filter_map_mutex); std::shared_ptr cnt_val = std::make_shared(); // runtime_filter_desc and target will be released, // so we need to copy to cnt_val cnt_val->producer_size = producer_size; cnt_val->runtime_filter_desc = *runtime_filter_desc; - cnt_val->pool.reset(new ObjectPool()); - cnt_val->filter = cnt_val->pool->add(new IRuntimeFilter(_state, runtime_filter_desc)); - - auto filter_id = runtime_filter_desc->filter_id; - RETURN_IF_ERROR(cnt_val->filter->init_with_desc(&cnt_val->runtime_filter_desc, query_options, - -1, false)); - cnt_val->filter->set_ignored(); - _filter_map.emplace(filter_id, cnt_val); - return Status::OK(); -} - -Status RuntimeFilterMergeControllerEntity::_init_with_desc( - const TRuntimeFilterDesc* runtime_filter_desc, const TQueryOptions* query_options, - const std::vector* targetv2_info, - const int producer_size) { - std::shared_ptr cnt_val = std::make_shared(); - // runtime_filter_desc and target will be released, - // so we need to copy to cnt_val - cnt_val->producer_size = producer_size; - cnt_val->runtime_filter_desc = 
*runtime_filter_desc; - cnt_val->targetv2_info = *targetv2_info; + cnt_val->targetv2_info = targetv2_info; cnt_val->pool.reset(new ObjectPool()); cnt_val->filter = cnt_val->pool->add(new IRuntimeFilter(_state, runtime_filter_desc)); auto filter_id = runtime_filter_desc->filter_id; @@ -270,42 +247,28 @@ Status RuntimeFilterMergeControllerEntity::init(UniqueId query_id, if (runtime_filter_params.__isset.rid_to_runtime_filter) { for (const auto& filterid_to_desc : runtime_filter_params.rid_to_runtime_filter) { int filter_id = filterid_to_desc.first; - const auto& target_iter = runtime_filter_params.rid_to_target_param.find(filter_id); - if (target_iter == runtime_filter_params.rid_to_target_param.end() && - !runtime_filter_params.__isset.rid_to_target_paramv2) { - // This runtime filter has to target info - return Status::InternalError("runtime filter params meet error"); - } else if (target_iter == runtime_filter_params.rid_to_target_param.end()) { - const auto& targetv2_iter = - runtime_filter_params.rid_to_target_paramv2.find(filter_id); - if (targetv2_iter == runtime_filter_params.rid_to_target_paramv2.end()) { - // This runtime filter has to target info - return Status::InternalError("runtime filter params meet error"); - } - const auto& build_iter = - runtime_filter_params.runtime_filter_builder_num.find(filter_id); - if (build_iter == runtime_filter_params.runtime_filter_builder_num.end()) { - // This runtime filter has to builder info - return Status::InternalError("runtime filter params meet error"); - } - - RETURN_IF_ERROR(_init_with_desc(&filterid_to_desc.second, &query_options, - &targetv2_iter->second, build_iter->second)); - } else { - const auto& build_iter = - runtime_filter_params.runtime_filter_builder_num.find(filter_id); - if (build_iter == runtime_filter_params.runtime_filter_builder_num.end()) { - return Status::InternalError("runtime filter params meet error"); - } - RETURN_IF_ERROR(_init_with_desc(&filterid_to_desc.second, &query_options, - 
&target_iter->second, build_iter->second)); + const auto& targetv2_iter = runtime_filter_params.rid_to_target_paramv2.find(filter_id); + const auto& build_iter = + runtime_filter_params.runtime_filter_builder_num.find(filter_id); + if (build_iter == runtime_filter_params.runtime_filter_builder_num.end()) { + // This runtime filter has no builder info + return Status::InternalError( + "Runtime filter has a wrong parameter. Maybe FE version is mismatched."); } + + RETURN_IF_ERROR(_init_with_desc( + &filterid_to_desc.second, &query_options, + targetv2_iter == runtime_filter_params.rid_to_target_paramv2.end() + ? std::vector {} + : std::move(targetv2_iter->second), + build_iter->second)); } } return Status::OK(); } -Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSizeRequest* request) { +Status RuntimeFilterMergeControllerEntity::send_filter_size(std::weak_ptr query_ctx, + const PSendFilterSizeRequest* request) { SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); std::shared_ptr cnt_val; @@ -326,6 +289,8 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz Status st = Status::OK(); if (cnt_val->source_addrs.size() == cnt_val->producer_size) { + auto ctx = query_ctx.lock()->ignore_runtime_filter_error() ? 
std::weak_ptr {} + : query_ctx; for (auto addr : cnt_val->source_addrs) { std::shared_ptr stub( ExecEnv::GetInstance()->brpc_internal_client_cache()->get_client(addr)); @@ -339,12 +304,13 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz auto closure = AutoReleaseClosure>:: create_unique(std::make_shared(), - DummyBrpcCallback::create_shared()); + DummyBrpcCallback::create_shared(), ctx); auto* pquery_id = closure->request_->mutable_query_id(); - pquery_id->set_hi(_state->query_id.hi()); - pquery_id->set_lo(_state->query_id.lo()); - closure->cntl_->set_timeout_ms(get_execution_rpc_timeout_ms(_state->execution_timeout)); + pquery_id->set_hi(_state->get_query_ctx()->query_id().hi); + pquery_id->set_lo(_state->get_query_ctx()->query_id().lo); + closure->cntl_->set_timeout_ms( + get_execution_rpc_timeout_ms(_state->get_query_ctx()->execution_timeout())); if (config::execution_ignore_eovercrowded) { closure->cntl_->ignore_eovercrowded(); } @@ -361,12 +327,6 @@ Status RuntimeFilterMergeControllerEntity::send_filter_size(const PSendFilterSiz } Status RuntimeFilterMgr::sync_filter_size(const PSyncFilterSizeRequest* request) { - auto filter = try_get_product_filter(request->filter_id()); - if (filter) { - filter->set_synced_size(request->filter_size()); - return Status::OK(); - } - LocalMergeFilters* local_merge_filters = nullptr; RETURN_IF_ERROR(get_local_merge_producer_filters(request->filter_id(), &local_merge_filters)); // first filter size merged filter @@ -377,7 +337,8 @@ Status RuntimeFilterMgr::sync_filter_size(const PSyncFilterSizeRequest* request) } // merge data -Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* request, +Status RuntimeFilterMergeControllerEntity::merge(std::weak_ptr query_ctx, + const PMergeFilterRequest* request, butil::IOBufAsZeroCopyInputStream* attach_data) { SCOPED_CONSUME_MEM_TRACKER(_mem_tracker); std::shared_ptr cnt_val; @@ -416,6 +377,8 @@ Status 
RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ DCHECK_LE(merged_size, cnt_val->producer_size); cnt_val->merge_time += (MonotonicMillis() - start_merge); merge_time = cnt_val->merge_time; + cnt_val->local_merge_time += + request->has_local_merge_time() ? request->local_merge_time() : 0; } if (merged_size == cnt_val->producer_size) { @@ -444,21 +407,25 @@ Status RuntimeFilterMergeControllerEntity::merge(const PMergeFilterRequest* requ has_attachment = true; } + auto ctx = query_ctx.lock()->ignore_runtime_filter_error() ? std::weak_ptr {} + : query_ctx; std::vector& targets = cnt_val->targetv2_info; for (auto& target : targets) { auto closure = AutoReleaseClosure>:: create_unique(std::make_shared(apply_request), - DummyBrpcCallback::create_shared()); + DummyBrpcCallback::create_shared(), ctx); closure->request_->set_filter_id(request->filter_id()); closure->request_->set_merge_time(merge_time); + closure->request_->set_local_merge_time(cnt_val->local_merge_time); *closure->request_->mutable_query_id() = request->query_id(); if (has_attachment) { closure->cntl_->request_attachment().append(request_attachment); } - closure->cntl_->set_timeout_ms(get_execution_rpc_timeout_ms(_state->execution_timeout)); + closure->cntl_->set_timeout_ms( + get_execution_rpc_timeout_ms(_state->get_query_ctx()->execution_timeout())); if (config::execution_ignore_eovercrowded) { closure->cntl_->ignore_eovercrowded(); } @@ -521,31 +488,22 @@ void RuntimeFilterMergeController::remove_entity(UniqueId query_id) { RuntimeFilterParamsContext* RuntimeFilterParamsContext::create(RuntimeState* state) { RuntimeFilterParamsContext* params = state->get_query_ctx()->obj_pool.add(new RuntimeFilterParamsContext()); - params->runtime_filter_wait_infinitely = state->runtime_filter_wait_infinitely(); - params->runtime_filter_wait_time_ms = state->runtime_filter_wait_time_ms(); - params->execution_timeout = state->execution_timeout(); - params->runtime_filter_mgr = 
state->local_runtime_filter_mgr(); - params->exec_env = state->exec_env(); - params->query_id.set_hi(state->query_id().hi); - params->query_id.set_lo(state->query_id().lo); - - params->be_exec_version = state->be_exec_version(); - params->query_ctx = state->get_query_ctx(); + params->_query_ctx = state->get_query_ctx(); + params->_state = state; return params; } +RuntimeFilterMgr* RuntimeFilterParamsContext::global_runtime_filter_mgr() { + return _query_ctx->runtime_filter_mgr(); +} + +RuntimeFilterMgr* RuntimeFilterParamsContext::local_runtime_filter_mgr() { + return _state->local_runtime_filter_mgr(); +} + RuntimeFilterParamsContext* RuntimeFilterParamsContext::create(QueryContext* query_ctx) { RuntimeFilterParamsContext* params = query_ctx->obj_pool.add(new RuntimeFilterParamsContext()); - params->runtime_filter_wait_infinitely = query_ctx->runtime_filter_wait_infinitely(); - params->runtime_filter_wait_time_ms = query_ctx->runtime_filter_wait_time_ms(); - params->execution_timeout = query_ctx->execution_timeout(); - params->runtime_filter_mgr = query_ctx->runtime_filter_mgr(); - params->exec_env = query_ctx->exec_env(); - params->query_id.set_hi(query_ctx->query_id().hi); - params->query_id.set_lo(query_ctx->query_id().lo); - - params->be_exec_version = query_ctx->be_exec_version(); - params->query_ctx = query_ctx; + params->_query_ctx = query_ctx; return params; } diff --git a/be/src/runtime/runtime_filter_mgr.h b/be/src/runtime/runtime_filter_mgr.h index b0aea7568cff65..c54be905f28f08 100644 --- a/be/src/runtime/runtime_filter_mgr.h +++ b/be/src/runtime/runtime_filter_mgr.h @@ -34,6 +34,7 @@ #include "common/object_pool.h" #include "common/status.h" +#include "util/stopwatch.hpp" #include "util/uid_util.h" namespace butil { @@ -60,6 +61,7 @@ struct LocalMergeFilters { int merge_size_times = 0; uint64_t local_merged_size = 0; std::vector> filters; + MonotonicStopWatch merge_watcher; }; /// producer: @@ -77,12 +79,14 @@ struct LocalMergeFilters { class 
RuntimeFilterMgr { public: RuntimeFilterMgr(const UniqueId& query_id, RuntimeFilterParamsContext* state, - const std::shared_ptr& query_mem_tracker); + const std::shared_ptr& query_mem_tracker, + const bool is_global); ~RuntimeFilterMgr(); Status get_consume_filters(const int filter_id, std::vector>& consumer_filters); + std::vector> get_consume_filters(const int filter_id); std::shared_ptr try_get_product_filter(const int filter_id) { std::lock_guard l(_lock); @@ -96,23 +100,19 @@ class RuntimeFilterMgr { // register filter Status register_consumer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, int node_id, std::shared_ptr* consumer_filter, - bool build_bf_exactly = false, bool need_local_merge = false); + bool need_local_merge = false); Status register_local_merge_producer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, - std::shared_ptr* producer_filter, - bool build_bf_exactly = false); + std::shared_ptr producer_filter); Status get_local_merge_producer_filters(int filter_id, LocalMergeFilters** local_merge_filters); + LocalMergeFilters* get_local_merge_producer_filters(int filter_id); Status register_producer_filter(const TRuntimeFilterDesc& desc, const TQueryOptions& options, - std::shared_ptr* producer_filter, - bool build_bf_exactly = false); + std::shared_ptr* producer_filter); // update filter by remote - Status update_filter(const PPublishFilterRequest* request, - butil::IOBufAsZeroCopyInputStream* data); - void set_runtime_filter_params(const TRuntimeFilterParams& runtime_filter_params); Status get_merge_addr(TNetworkAddress* addr); @@ -124,6 +124,18 @@ class RuntimeFilterMgr { int node_id; std::shared_ptr filter; }; + /** + * `_is_global = true` means this runtime filter manager menages query-level runtime filters. + * If so, all consumers in this query shared the same RF with the same ID. For producers, all + * RFs produced should be merged. 
+ * + * If `_is_global` is false, a RF will be produced and consumed by a single-producer-single-consumer mode. + * This is usually happened in a co-located join and scan operators are not serial. + * + * `_local_merge_producer_map` is used only if `_is_global` is true. It is said, RFs produced by + * different producers need to be merged only if it is a global RF. + */ + const bool _is_global; // RuntimeFilterMgr is owned by RuntimeState, so we only // use filter_id as key // key: "filter-id" @@ -156,10 +168,11 @@ class RuntimeFilterMergeControllerEntity { const TQueryOptions& query_options); // handle merge rpc - Status merge(const PMergeFilterRequest* request, + Status merge(std::weak_ptr query_ctx, const PMergeFilterRequest* request, butil::IOBufAsZeroCopyInputStream* attach_data); - Status send_filter_size(const PSendFilterSizeRequest* request); + Status send_filter_size(std::weak_ptr query_ctx, + const PSendFilterSizeRequest* request); UniqueId query_id() const { return _query_id; } @@ -173,17 +186,13 @@ class RuntimeFilterMergeControllerEntity { std::unordered_set arrive_id; std::vector source_addrs; std::shared_ptr pool; + uint64_t local_merge_time = 0; }; private: Status _init_with_desc(const TRuntimeFilterDesc* runtime_filter_desc, const TQueryOptions* query_options, - const std::vector* target_info, - const int producer_size); - - Status _init_with_desc(const TRuntimeFilterDesc* runtime_filter_desc, - const TQueryOptions* query_options, - const std::vector* target_info, + const std::vector&& target_info, const int producer_size); UniqueId _query_id; @@ -267,24 +276,22 @@ class RuntimeFilterMergeController { FilterControllerMap _filter_controller_map[kShardNum]; }; -//There are two types of runtime filters: -// one is global, originating from QueryContext, -// and the other is local, originating from RuntimeState. -// In practice, we have already distinguished between them through UpdateRuntimeFilterParamsV2/V1. 
-// RuntimeState/QueryContext is only used to store runtime_filter_wait_time_ms... +// There are two types of runtime filters: +// 1. Global runtime filter. Managed by QueryContext's RuntimeFilterMgr which is produced by multiple producers and shared by multiple consumers. +// 2. Local runtime filter. Managed by RuntimeState's RuntimeFilterMgr which is 1-producer-1-consumer mode. struct RuntimeFilterParamsContext { - RuntimeFilterParamsContext() = default; static RuntimeFilterParamsContext* create(RuntimeState* state); static RuntimeFilterParamsContext* create(QueryContext* query_ctx); - bool runtime_filter_wait_infinitely; - int32_t runtime_filter_wait_time_ms; - int32_t execution_timeout; - RuntimeFilterMgr* runtime_filter_mgr; - ExecEnv* exec_env; - PUniqueId query_id; - int be_exec_version; - QueryContext* query_ctx; - QueryContext* get_query_ctx() const { return query_ctx; } + QueryContext* get_query_ctx() const { return _query_ctx; } + void set_state(RuntimeState* state) { _state = state; } + RuntimeFilterMgr* global_runtime_filter_mgr(); + RuntimeFilterMgr* local_runtime_filter_mgr(); + +private: + RuntimeFilterParamsContext() = default; + + QueryContext* _query_ctx; + RuntimeState* _state; }; } // namespace doris diff --git a/be/src/runtime/runtime_state.cpp b/be/src/runtime/runtime_state.cpp index e3f9d075c8ffc2..df7c4141691d0b 100644 --- a/be/src/runtime/runtime_state.cpp +++ b/be/src/runtime/runtime_state.cpp @@ -40,6 +40,7 @@ #include "pipeline/exec/operator.h" #include "pipeline/pipeline_task.h" #include "runtime/exec_env.h" +#include "runtime/fragment_mgr.h" #include "runtime/load_path_mgr.h" #include "runtime/memory/mem_tracker_limiter.h" #include "runtime/memory/thread_mem_tracker_mgr.h" @@ -122,38 +123,6 @@ RuntimeState::RuntimeState(const TUniqueId& instance_id, const TUniqueId& query_ DCHECK(_query_mem_tracker != nullptr && _query_mem_tracker->label() != "Orphan"); } -RuntimeState::RuntimeState(pipeline::PipelineFragmentContext*, const TUniqueId& 
instance_id, - const TUniqueId& query_id, int32_t fragment_id, - const TQueryOptions& query_options, const TQueryGlobals& query_globals, - ExecEnv* exec_env, QueryContext* ctx) - : _profile("Fragment " + print_id(instance_id)), - _load_channel_profile(""), - _obj_pool(new ObjectPool()), - _runtime_filter_mgr(nullptr), - _unreported_error_idx(0), - _query_id(query_id), - _fragment_id(fragment_id), - _per_fragment_instance_idx(0), - _num_rows_load_total(0), - _num_rows_load_filtered(0), - _num_rows_load_unselected(0), - _num_rows_filtered_in_strict_mode_partial_update(0), - _num_print_error_rows(0), - _num_bytes_load_total(0), - _num_finished_scan_range(0), - _error_row_number(0), - _query_ctx(ctx) { - [[maybe_unused]] auto status = init(instance_id, query_options, query_globals, exec_env); - _query_mem_tracker = ctx->query_mem_tracker; -#ifdef BE_TEST - if (_query_mem_tracker == nullptr) { - init_mem_trackers(); - } -#endif - DCHECK(_query_mem_tracker != nullptr && _query_mem_tracker->label() != "Orphan"); - DCHECK(status.ok()); -} - RuntimeState::RuntimeState(const TUniqueId& query_id, int32_t fragment_id, const TQueryOptions& query_options, const TQueryGlobals& query_globals, ExecEnv* exec_env, QueryContext* ctx) @@ -294,6 +263,10 @@ Status RuntimeState::init(const TUniqueId& fragment_instance_id, const TQueryOpt return Status::OK(); } +std::weak_ptr RuntimeState::get_query_ctx_weak() { + return _exec_env->fragment_mgr()->get_query_ctx(_query_ctx->query_id()); +} + void RuntimeState::init_mem_trackers(const std::string& name, const TUniqueId& id) { _query_mem_tracker = MemTrackerLimiter::create_shared( MemTrackerLimiter::Type::OTHER, fmt::format("{}#Id={}", name, print_id(id))); @@ -352,7 +325,7 @@ Status RuntimeState::create_error_log_file() { // https://dev.mysql.com/doc/dev/mysql-server/latest/page_protocol_basic_err_packet.html // shorten the path as much as possible to prevent the length of the presigned URL from // exceeding the MySQL error packet size limit 
- ss << "error_log/" << _import_label << "_" << std::hex << _fragment_instance_id.hi; + ss << "error_log/" << std::hex << _query_id.hi; _s3_error_log_file_path = ss.str(); } } @@ -368,7 +341,9 @@ Status RuntimeState::create_error_log_file() { LOG(WARNING) << error_msg.str(); return Status::InternalError(error_msg.str()); } - VLOG_FILE << "create error log file: " << _error_log_file_path; + LOG(INFO) << "create error log file: " << _error_log_file_path + << ", query id: " << print_id(_query_id) + << ", fragment instance id: " << print_id(_fragment_instance_id); return Status::OK(); } @@ -512,15 +487,14 @@ RuntimeFilterMgr* RuntimeState::global_runtime_filter_mgr() { } Status RuntimeState::register_producer_runtime_filter( - const TRuntimeFilterDesc& desc, bool need_local_merge, - std::shared_ptr* producer_filter, bool build_bf_exactly) { - if (desc.has_remote_targets || need_local_merge) { - return global_runtime_filter_mgr()->register_local_merge_producer_filter( - desc, query_options(), producer_filter, build_bf_exactly); - } else { - return local_runtime_filter_mgr()->register_producer_filter( - desc, query_options(), producer_filter, build_bf_exactly); - } + const TRuntimeFilterDesc& desc, std::shared_ptr* producer_filter) { + // Producers are created by local runtime filter mgr and shared by global runtime filter manager. + // When RF is published, consumers in both global and local RF mgr will be found. 
+ RETURN_IF_ERROR(local_runtime_filter_mgr()->register_producer_filter(desc, query_options(), + producer_filter)); + RETURN_IF_ERROR(global_runtime_filter_mgr()->register_local_merge_producer_filter( + desc, query_options(), *producer_filter)); + return Status::OK(); } Status RuntimeState::register_consumer_runtime_filter( @@ -528,10 +502,10 @@ Status RuntimeState::register_consumer_runtime_filter( std::shared_ptr* consumer_filter) { if (desc.has_remote_targets || need_local_merge) { return global_runtime_filter_mgr()->register_consumer_filter(desc, query_options(), node_id, - consumer_filter, false, true); + consumer_filter, true); } else { return local_runtime_filter_mgr()->register_consumer_filter(desc, query_options(), node_id, - consumer_filter, false, false); + consumer_filter, false); } } diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 88deee491d19c4..1e7c1e579f7735 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -85,12 +85,7 @@ class RuntimeState { const TQueryOptions& query_options, const TQueryGlobals& query_globals, ExecEnv* exec_env, QueryContext* ctx); - // for only use in pipelineX - RuntimeState(pipeline::PipelineFragmentContext*, const TUniqueId& instance_id, - const TUniqueId& query_id, int32 fragment_id, const TQueryOptions& query_options, - const TQueryGlobals& query_globals, ExecEnv* exec_env, QueryContext* ctx); - - // Used by pipelineX. This runtime state is only used for setup. + // Used by pipeline. This runtime state is only used for setup. 
RuntimeState(const TUniqueId& query_id, int32 fragment_id, const TQueryOptions& query_options, const TQueryGlobals& query_globals, ExecEnv* exec_env, QueryContext* ctx); @@ -449,6 +444,8 @@ class RuntimeState { QueryContext* get_query_ctx() { return _query_ctx; } + std::weak_ptr get_query_ctx_weak(); + void set_query_mem_tracker(const std::shared_ptr& tracker) { _query_mem_tracker = tracker; } @@ -559,9 +556,7 @@ class RuntimeState { } Status register_producer_runtime_filter(const doris::TRuntimeFilterDesc& desc, - bool need_local_merge, - std::shared_ptr* producer_filter, - bool build_bf_exactly); + std::shared_ptr* producer_filter); Status register_consumer_runtime_filter(const doris::TRuntimeFilterDesc& desc, bool need_local_merge, int node_id, @@ -592,6 +587,11 @@ class RuntimeState { _query_options.enable_local_merge_sort; } + bool enable_shared_exchange_sink_buffer() const { + return _query_options.__isset.enable_shared_exchange_sink_buffer && + _query_options.enable_shared_exchange_sink_buffer; + } + int64_t min_revocable_mem() const { if (_query_options.__isset.min_revocable_mem) { return std::max(_query_options.min_revocable_mem, (int64_t)1); diff --git a/be/src/runtime/stream_load/stream_load_executor.cpp b/be/src/runtime/stream_load/stream_load_executor.cpp index 482fadac44e051..ad4d22946f1b83 100644 --- a/be/src/runtime/stream_load/stream_load_executor.cpp +++ b/be/src/runtime/stream_load/stream_load_executor.cpp @@ -85,13 +85,18 @@ Status StreamLoadExecutor::execute_plan_fragment(std::shared_ptrnumber_unselected_rows = state->num_rows_load_unselected(); ctx->loaded_bytes = state->num_bytes_load_total(); int64_t num_selected_rows = ctx->number_total_rows - ctx->number_unselected_rows; + ctx->error_url = to_load_error_http_path(state->get_error_log_file_path()); if (!ctx->group_commit && num_selected_rows > 0 && (double)ctx->number_filtered_rows / num_selected_rows > ctx->max_filter_ratio) { // NOTE: Do not modify the error message here, for historical 
reasons, // some users may rely on this error message. - *status = Status::DataQualityError("too many filtered rows"); + if (ctx->need_commit_self) { + *status = + Status::DataQualityError("too many filtered rows, url: " + ctx->error_url); + } else { + *status = Status::DataQualityError("too many filtered rows"); + } } - ctx->error_url = to_load_error_http_path(state->get_error_log_file_path()); if (status->ok()) { DorisMetrics::instance()->stream_receive_bytes_total->increment(ctx->receive_bytes); diff --git a/be/src/runtime/thread_context.h b/be/src/runtime/thread_context.h index 72e1532e58d4f1..e0a44af69c1d66 100644 --- a/be/src/runtime/thread_context.h +++ b/be/src/runtime/thread_context.h @@ -119,38 +119,40 @@ __VA_ARGS__; \ } while (0) -#define LIMIT_LOCAL_SCAN_IO(data_dir, bytes_read) \ - std::shared_ptr iot = nullptr; \ - auto* t_ctx = doris::thread_context(true); \ - if (t_ctx) { \ - iot = t_ctx->get_local_scan_io_throttle(data_dir); \ - } \ - if (iot) { \ - iot->acquire(-1); \ - } \ - Defer defer { \ - [&]() { \ - if (iot) { \ - iot->update_next_io_time(*bytes_read); \ - t_ctx->update_total_local_scan_io_adder(*bytes_read); \ - } \ - } \ +#define LIMIT_LOCAL_SCAN_IO(data_dir, bytes_read) \ + std::shared_ptr iot = nullptr; \ + auto* t_ctx = doris::thread_context(true); \ + if (t_ctx) { \ + iot = t_ctx->get_local_scan_io_throttle(data_dir); \ + } \ + if (iot) { \ + iot->acquire(-1); \ + } \ + Defer defer { \ + [&]() { \ + if (iot) { \ + iot->update_next_io_time(*bytes_read); \ + t_ctx->update_local_scan_io(data_dir, *bytes_read); \ + } \ + } \ } -#define LIMIT_REMOTE_SCAN_IO(bytes_read) \ - std::shared_ptr iot = nullptr; \ - if (auto* t_ctx = doris::thread_context(true)) { \ - iot = t_ctx->get_remote_scan_io_throttle(); \ - } \ - if (iot) { \ - iot->acquire(-1); \ - } \ - Defer defer { \ - [&]() { \ - if (iot) { \ - iot->update_next_io_time(*bytes_read); \ - } \ - } \ +#define LIMIT_REMOTE_SCAN_IO(bytes_read) \ + std::shared_ptr iot = nullptr; \ + auto* 
t_ctx = doris::thread_context(true); \ + if (t_ctx) { \ + iot = t_ctx->get_remote_scan_io_throttle(); \ + } \ + if (iot) { \ + iot->acquire(-1); \ + } \ + Defer defer { \ + [&]() { \ + if (iot) { \ + iot->update_next_io_time(*bytes_read); \ + t_ctx->update_remote_scan_io(*bytes_read); \ + } \ + } \ } namespace doris { @@ -282,9 +284,15 @@ class ThreadContext { return nullptr; } - void update_total_local_scan_io_adder(size_t bytes_read) { + void update_local_scan_io(std::string path, size_t bytes_read) { + if (std::shared_ptr wg_ptr = _wg_wptr.lock()) { + wg_ptr->update_local_scan_io(path, bytes_read); + } + } + + void update_remote_scan_io(size_t bytes_read) { if (std::shared_ptr wg_ptr = _wg_wptr.lock()) { - wg_ptr->update_total_local_scan_io_adder(bytes_read); + wg_ptr->update_remote_scan_io(bytes_read); } } diff --git a/be/src/runtime/workload_group/workload_group.cpp b/be/src/runtime/workload_group/workload_group.cpp index c6a3c07adda1dd..6b9388af30a7f7 100644 --- a/be/src/runtime/workload_group/workload_group.cpp +++ b/be/src/runtime/workload_group/workload_group.cpp @@ -35,6 +35,7 @@ #include "runtime/exec_env.h" #include "runtime/memory/global_memory_arbitrator.h" #include "runtime/memory/mem_tracker_limiter.h" +#include "runtime/workload_group/workload_group_metrics.h" #include "runtime/workload_management/io_throttle.h" #include "util/mem_info.h" #include "util/parse_util.h" @@ -47,10 +48,12 @@ namespace doris { const static std::string MEMORY_LIMIT_DEFAULT_VALUE = "0%"; const static bool ENABLE_MEMORY_OVERCOMMIT_DEFAULT_VALUE = true; const static int CPU_HARD_LIMIT_DEFAULT_VALUE = -1; -const static int SPILL_LOW_WATERMARK_DEFAULT_VALUE = 50; -const static int SPILL_HIGH_WATERMARK_DEFAULT_VALUE = 80; +const static int MEMORY_LOW_WATERMARK_DEFAULT_VALUE = 50; +const static int MEMORY_HIGH_WATERMARK_DEFAULT_VALUE = 80; -WorkloadGroup::WorkloadGroup(const WorkloadGroupInfo& tg_info) +WorkloadGroup::WorkloadGroup(const WorkloadGroupInfo& wg_info) : 
WorkloadGroup(wg_info, true) {} + +WorkloadGroup::WorkloadGroup(const WorkloadGroupInfo& tg_info, bool need_create_query_thread_pool) : _id(tg_info.id), _name(tg_info.name), _version(tg_info.version), @@ -62,24 +65,18 @@ WorkloadGroup::WorkloadGroup(const WorkloadGroupInfo& tg_info) _scan_thread_num(tg_info.scan_thread_num), _max_remote_scan_thread_num(tg_info.max_remote_scan_thread_num), _min_remote_scan_thread_num(tg_info.min_remote_scan_thread_num), - _spill_low_watermark(tg_info.spill_low_watermark), - _spill_high_watermark(tg_info.spill_high_watermark), + _memory_low_watermark(tg_info.memory_low_watermark), + _memory_high_watermark(tg_info.memory_high_watermark), _scan_bytes_per_second(tg_info.read_bytes_per_second), - _remote_scan_bytes_per_second(tg_info.remote_read_bytes_per_second) { + _remote_scan_bytes_per_second(tg_info.remote_read_bytes_per_second), + _need_create_query_thread_pool(need_create_query_thread_pool) { std::vector& data_dir_list = io::BeConfDataDirReader::be_config_data_dir_list; for (const auto& data_dir : data_dir_list) { - _scan_io_throttle_map[data_dir.path] = - std::make_shared(_name, data_dir.bvar_name + "_read_bytes"); - } - _remote_scan_io_throttle = std::make_shared(_name, "remote_read_bytes"); - _mem_used_status = std::make_unique>(_name, "memory_used", 0); - _cpu_usage_adder = std::make_unique>(_name, "cpu_usage_adder"); - _cpu_usage_per_second = std::make_unique>>( - _name, "cpu_usage", _cpu_usage_adder.get(), 10); - _total_local_scan_io_adder = - std::make_unique>(_name, "total_local_read_bytes"); - _total_local_scan_io_per_second = std::make_unique>>( - _name, "total_local_read_bytes_per_second", _total_local_scan_io_adder.get(), 1); + _scan_io_throttle_map[data_dir.path] = std::make_shared(data_dir.bvar_name); + } + _remote_scan_io_throttle = std::make_shared(); + + _wg_metrics = std::make_shared(this); } std::string WorkloadGroup::debug_string() const { @@ -88,12 +85,12 @@ std::string WorkloadGroup::debug_string() const { 
"TG[id = {}, name = {}, cpu_share = {}, memory_limit = {}, enable_memory_overcommit = " "{}, version = {}, cpu_hard_limit = {}, scan_thread_num = " "{}, max_remote_scan_thread_num = {}, min_remote_scan_thread_num = {}, " - "spill_low_watermark={}, spill_high_watermark={}, is_shutdown={}, query_num={}, " + "memory_low_watermark={}, memory_high_watermark={}, is_shutdown={}, query_num={}, " "read_bytes_per_second={}, remote_read_bytes_per_second={}]", _id, _name, cpu_share(), PrettyPrinter::print(_memory_limit, TUnit::BYTES), _enable_memory_overcommit ? "true" : "false", _version, cpu_hard_limit(), _scan_thread_num, _max_remote_scan_thread_num, _min_remote_scan_thread_num, - _spill_low_watermark, _spill_high_watermark, _is_shutdown, _query_ctxs.size(), + _memory_low_watermark, _memory_high_watermark, _is_shutdown, _query_ctxs.size(), _scan_bytes_per_second, _remote_scan_bytes_per_second); } @@ -101,14 +98,14 @@ std::string WorkloadGroup::memory_debug_string() const { return fmt::format( "TG[id = {}, name = {}, memory_limit = {}, enable_memory_overcommit = " "{}, weighted_memory_limit = {}, total_mem_used = {}, " - "wg_refresh_interval_memory_growth = {}, spill_low_watermark = {}, " - "spill_high_watermark = {}, version = {}, is_shutdown = {}, query_num = {}]", + "wg_refresh_interval_memory_growth = {}, memory_low_watermark = {}, " + "memory_high_watermark = {}, version = {}, is_shutdown = {}, query_num = {}]", _id, _name, PrettyPrinter::print(_memory_limit, TUnit::BYTES), _enable_memory_overcommit ? 
"true" : "false", PrettyPrinter::print(_weighted_memory_limit, TUnit::BYTES), PrettyPrinter::print(_total_mem_used, TUnit::BYTES), PrettyPrinter::print(_wg_refresh_interval_memory_growth, TUnit::BYTES), - _spill_low_watermark, _spill_high_watermark, _version, _is_shutdown, + _memory_low_watermark, _memory_high_watermark, _version, _is_shutdown, _query_ctxs.size()); } @@ -134,8 +131,8 @@ void WorkloadGroup::check_and_update(const WorkloadGroupInfo& tg_info) { _scan_thread_num = tg_info.scan_thread_num; _max_remote_scan_thread_num = tg_info.max_remote_scan_thread_num; _min_remote_scan_thread_num = tg_info.min_remote_scan_thread_num; - _spill_low_watermark = tg_info.spill_low_watermark; - _spill_high_watermark = tg_info.spill_high_watermark; + _memory_low_watermark = tg_info.memory_low_watermark; + _memory_high_watermark = tg_info.memory_high_watermark; _scan_bytes_per_second = tg_info.read_bytes_per_second; _remote_scan_bytes_per_second = tg_info.remote_read_bytes_per_second; } else { @@ -166,11 +163,11 @@ int64_t WorkloadGroup::make_memory_tracker_snapshots( } // refresh total memory used. _total_mem_used = used_memory; + _wg_metrics->update_memory_used_bytes(used_memory); // reserve memory is recorded in the query mem tracker // and _total_mem_used already contains all the current reserve memory. // so after refreshing _total_mem_used, reset _wg_refresh_interval_memory_growth. 
_wg_refresh_interval_memory_growth.store(0.0); - _mem_used_status->set_value(used_memory); return used_memory; } @@ -342,19 +339,19 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( // 4 cpu_share uint64_t cpu_share = CgroupCpuCtl::cpu_soft_limit_default_value(); - if (tworkload_group_info.__isset.cpu_share) { + if (tworkload_group_info.__isset.cpu_share && tworkload_group_info.cpu_share > 0) { cpu_share = tworkload_group_info.cpu_share; } // 5 cpu hard limit int cpu_hard_limit = CPU_HARD_LIMIT_DEFAULT_VALUE; - if (tworkload_group_info.__isset.cpu_hard_limit) { + if (tworkload_group_info.__isset.cpu_hard_limit && tworkload_group_info.cpu_hard_limit > 0) { cpu_hard_limit = tworkload_group_info.cpu_hard_limit; } // 6 mem_limit std::string mem_limit_str = MEMORY_LIMIT_DEFAULT_VALUE; - if (tworkload_group_info.__isset.mem_limit) { + if (tworkload_group_info.__isset.mem_limit && tworkload_group_info.mem_limit != "-1") { mem_limit_str = tworkload_group_info.mem_limit; } bool is_percent = true; @@ -393,27 +390,29 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( min_remote_scan_thread_num = tworkload_group_info.min_remote_scan_thread_num; } - // 12 spill low watermark - int spill_low_watermark = SPILL_LOW_WATERMARK_DEFAULT_VALUE; - if (tworkload_group_info.__isset.spill_threshold_low_watermark) { - spill_low_watermark = tworkload_group_info.spill_threshold_low_watermark; + // 12 memory low watermark + int memory_low_watermark = MEMORY_LOW_WATERMARK_DEFAULT_VALUE; + if (tworkload_group_info.__isset.memory_low_watermark) { + memory_low_watermark = tworkload_group_info.memory_low_watermark; } - // 13 spil high watermark - int spill_high_watermark = SPILL_HIGH_WATERMARK_DEFAULT_VALUE; - if (tworkload_group_info.__isset.spill_threshold_high_watermark) { - spill_high_watermark = tworkload_group_info.spill_threshold_high_watermark; + // 13 memory high watermark + int memory_high_watermark = MEMORY_HIGH_WATERMARK_DEFAULT_VALUE; + if 
(tworkload_group_info.__isset.memory_high_watermark) { + memory_high_watermark = tworkload_group_info.memory_high_watermark; } // 14 scan io int read_bytes_per_second = -1; - if (tworkload_group_info.__isset.read_bytes_per_second) { + if (tworkload_group_info.__isset.read_bytes_per_second && + tworkload_group_info.read_bytes_per_second > 0) { read_bytes_per_second = tworkload_group_info.read_bytes_per_second; } // 15 remote scan io int remote_read_bytes_per_second = -1; - if (tworkload_group_info.__isset.remote_read_bytes_per_second) { + if (tworkload_group_info.__isset.remote_read_bytes_per_second && + tworkload_group_info.remote_read_bytes_per_second > 0) { remote_read_bytes_per_second = tworkload_group_info.remote_read_bytes_per_second; } @@ -428,60 +427,66 @@ WorkloadGroupInfo WorkloadGroupInfo::parse_topic_info( .scan_thread_num = scan_thread_num, .max_remote_scan_thread_num = max_remote_scan_thread_num, .min_remote_scan_thread_num = min_remote_scan_thread_num, - .spill_low_watermark = spill_low_watermark, - .spill_high_watermark = spill_high_watermark, + .memory_low_watermark = memory_low_watermark, + .memory_high_watermark = memory_high_watermark, .read_bytes_per_second = read_bytes_per_second, .remote_read_bytes_per_second = remote_read_bytes_per_second}; } -void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* exec_env) { - uint64_t tg_id = tg_info->id; - std::string tg_name = tg_info->name; - int cpu_hard_limit = tg_info->cpu_hard_limit; - uint64_t cpu_shares = tg_info->cpu_share; - bool enable_cpu_hard_limit = tg_info->enable_cpu_hard_limit; - int scan_thread_num = tg_info->scan_thread_num; - int max_remote_scan_thread_num = tg_info->max_remote_scan_thread_num; - int min_remote_scan_thread_num = tg_info->min_remote_scan_thread_num; +std::weak_ptr WorkloadGroup::get_cgroup_cpu_ctl_wptr() { + std::shared_lock rlock(_task_sched_lock); + return _cgroup_cpu_ctl; +} +void WorkloadGroup::create_cgroup_cpu_ctl() { std::lock_guard 
wlock(_task_sched_lock); + create_cgroup_cpu_ctl_no_lock(); +} + +void WorkloadGroup::create_cgroup_cpu_ctl_no_lock() { if (config::doris_cgroup_cpu_path != "" && _cgroup_cpu_ctl == nullptr) { - std::unique_ptr cgroup_cpu_ctl = CgroupCpuCtl::create_cgroup_cpu_ctl(tg_id); + std::shared_ptr cgroup_cpu_ctl = CgroupCpuCtl::create_cgroup_cpu_ctl(_id); if (cgroup_cpu_ctl) { Status ret = cgroup_cpu_ctl->init(); if (ret.ok()) { _cgroup_cpu_ctl = std::move(cgroup_cpu_ctl); - LOG(INFO) << "[upsert wg thread pool] cgroup init success, wg_id=" << tg_id; + LOG(INFO) << "[upsert wg thread pool] cgroup init success, wg_id=" << _id; } else { - LOG(INFO) << "[upsert wg thread pool] cgroup init failed, wg_id=" << tg_id + LOG(INFO) << "[upsert wg thread pool] cgroup init failed, wg_id=" << _id << ", reason=" << ret.to_string(); } } else { - LOG(INFO) << "[upsert wg thread pool] create cgroup cpu ctl for " << tg_id << " failed"; + LOG(INFO) << "[upsert wg thread pool] create cgroup cpu ctl wg_id=" << _id << " failed"; } } +} - CgroupCpuCtl* cg_cpu_ctl_ptr = _cgroup_cpu_ctl.get(); - +void WorkloadGroup::upsert_thread_pool_no_lock(WorkloadGroupInfo* wg_info, + std::shared_ptr cg_cpu_ctl_ptr) { + uint64_t wg_id = wg_info->id; + std::string wg_name = wg_info->name; + int scan_thread_num = wg_info->scan_thread_num; + int max_remote_scan_thread_num = wg_info->max_remote_scan_thread_num; + int min_remote_scan_thread_num = wg_info->min_remote_scan_thread_num; if (_task_sched == nullptr) { int32_t executors_size = config::pipeline_executor_size; if (executors_size <= 0) { executors_size = CpuInfo::num_cores(); } std::unique_ptr pipeline_task_scheduler = - std::make_unique(executors_size, "Pipe_" + tg_name, + std::make_unique(executors_size, "Pipe_" + wg_name, cg_cpu_ctl_ptr); Status ret = pipeline_task_scheduler->start(); if (ret.ok()) { _task_sched = std::move(pipeline_task_scheduler); } else { - LOG(INFO) << "[upsert wg thread pool] task scheduler start failed, gid= " << tg_id; + LOG(INFO) 
<< "[upsert wg thread pool] task scheduler start failed, gid= " << wg_id; } } if (_scan_task_sched == nullptr) { std::unique_ptr scan_scheduler = - std::make_unique("Scan_" + tg_name, + std::make_unique("Scan_" + wg_name, cg_cpu_ctl_ptr); Status ret = scan_scheduler->start(config::doris_scanner_thread_pool_thread_num, config::doris_scanner_thread_pool_thread_num, @@ -489,7 +494,7 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e if (ret.ok()) { _scan_task_sched = std::move(scan_scheduler); } else { - LOG(INFO) << "[upsert wg thread pool] scan scheduler start failed, gid=" << tg_id; + LOG(INFO) << "[upsert wg thread pool] scan scheduler start failed, gid=" << wg_id; } } if (scan_thread_num > 0 && _scan_task_sched) { @@ -501,7 +506,7 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e int remote_scan_thread_queue_size = vectorized::ScannerScheduler::get_remote_scan_thread_queue_size(); std::unique_ptr remote_scan_scheduler = - std::make_unique("RScan_" + tg_name, + std::make_unique("RScan_" + wg_name, cg_cpu_ctl_ptr); Status ret = remote_scan_scheduler->start(remote_max_thread_num, config::doris_scanner_min_thread_pool_thread_num, @@ -510,7 +515,7 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e _remote_scan_task_sched = std::move(remote_scan_scheduler); } else { LOG(INFO) << "[upsert wg thread pool] remote scan scheduler start failed, gid=" - << tg_id; + << wg_id; } } if (max_remote_scan_thread_num >= min_remote_scan_thread_num && _remote_scan_task_sched) { @@ -532,7 +537,7 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e : std::min(num_disk * min_threads, num_cpus * config::wg_flush_thread_num_per_cpu); - std::string pool_name = "wg_flush_" + tg_name; + std::string pool_name = "wg_flush_" + wg_name; auto ret = ThreadPoolBuilder(pool_name) .set_min_threads(min_threads) .set_max_threads(max_threads) @@ -540,17 +545,24 @@ void 
WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e .build(&thread_pool); if (!ret.ok()) { LOG(INFO) << "[upsert wg thread pool] create " + pool_name + " failed, gid=" - << tg_id; + << wg_id; } else { _memtable_flush_pool = std::move(thread_pool); - LOG(INFO) << "[upsert wg thread pool] create " + pool_name + " succ, gid=" << tg_id + LOG(INFO) << "[upsert wg thread pool] create " + pool_name + " succ, gid=" << wg_id << ", max thread num=" << max_threads << ", min thread num=" << min_threads; } } } +} + +void WorkloadGroup::upsert_cgroup_cpu_ctl_no_lock(WorkloadGroupInfo* wg_info) { + uint64_t wg_id = wg_info->id; + int cpu_hard_limit = wg_info->cpu_hard_limit; + uint64_t cpu_shares = wg_info->cpu_share; + bool enable_cpu_hard_limit = wg_info->enable_cpu_hard_limit; + create_cgroup_cpu_ctl_no_lock(); - // step 6: update cgroup cpu if needed if (_cgroup_cpu_ctl) { if (enable_cpu_hard_limit) { if (cpu_hard_limit > 0) { @@ -560,15 +572,24 @@ void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* e } else { LOG(INFO) << "[upsert wg thread pool] enable cpu hard limit but value is " "illegal: " - << cpu_hard_limit << ", gid=" << tg_id; + << cpu_hard_limit << ", gid=" << wg_id; } } else { _cgroup_cpu_ctl->update_cpu_soft_limit(cpu_shares); _cgroup_cpu_ctl->update_cpu_hard_limit( CPU_HARD_LIMIT_DEFAULT_VALUE); // disable cpu hard limit } - _cgroup_cpu_ctl->get_cgroup_cpu_info(&(tg_info->cgroup_cpu_shares), - &(tg_info->cgroup_cpu_hard_limit)); + _cgroup_cpu_ctl->get_cgroup_cpu_info(&(wg_info->cgroup_cpu_shares), + &(wg_info->cgroup_cpu_hard_limit)); + } +} + +void WorkloadGroup::upsert_task_scheduler(WorkloadGroupInfo* wg_info) { + std::lock_guard wlock(_task_sched_lock); + upsert_cgroup_cpu_ctl_no_lock(wg_info); + + if (_need_create_query_thread_pool) { + upsert_thread_pool_no_lock(wg_info, _cgroup_cpu_ctl); } } @@ -631,16 +652,20 @@ std::shared_ptr WorkloadGroup::get_remote_scan_io_throttle() { return 
_remote_scan_io_throttle; } -void WorkloadGroup::update_cpu_adder(int64_t delta_cpu_time) { - (*_cpu_usage_adder) << (uint64_t)delta_cpu_time; +void WorkloadGroup::update_cpu_time(int64_t delta_cpu_time) { + _wg_metrics->update_cpu_time_nanos(delta_cpu_time); +} + +void WorkloadGroup::update_local_scan_io(std::string path, size_t scan_bytes) { + _wg_metrics->update_local_scan_io_bytes(path, (uint64_t)scan_bytes); } -void WorkloadGroup::update_total_local_scan_io_adder(size_t scan_bytes) { - (*_total_local_scan_io_adder) << scan_bytes; +void WorkloadGroup::update_remote_scan_io(size_t scan_bytes) { + _wg_metrics->update_remote_scan_io_bytes((uint64_t)scan_bytes); } -int64_t WorkloadGroup::get_remote_scan_bytes_per_second() { - return _remote_scan_io_throttle->get_bvar_io_per_second(); +int64_t WorkloadGroup::get_mem_used() { + return _total_mem_used; } void WorkloadGroup::try_stop_schedulers() { diff --git a/be/src/runtime/workload_group/workload_group.h b/be/src/runtime/workload_group/workload_group.h index 2ba84ce982b304..35a8802e4c449a 100644 --- a/be/src/runtime/workload_group/workload_group.h +++ b/be/src/runtime/workload_group/workload_group.h @@ -17,7 +17,6 @@ #pragma once -#include #include #include #include @@ -54,10 +53,14 @@ class TaskScheduler; class WorkloadGroup; struct WorkloadGroupInfo; struct TrackerLimiterGroup; +class WorkloadGroupMetrics; + class WorkloadGroup : public std::enable_shared_from_this { public: explicit WorkloadGroup(const WorkloadGroupInfo& tg_info); + explicit WorkloadGroup(const WorkloadGroupInfo& tg_info, bool need_create_query_thread_pool); + int64_t version() const { return _version; } uint64_t cpu_share() const { return _cpu_share.load(); } @@ -92,11 +95,11 @@ class WorkloadGroup : public std::enable_shared_from_this { void do_sweep(); - int spill_threshold_low_water_mark() const { - return _spill_low_watermark.load(std::memory_order_relaxed); + int memory_low_watermark() const { + return 
_memory_low_watermark.load(std::memory_order_relaxed); } - int spill_threashold_high_water_mark() const { - return _spill_high_watermark.load(std::memory_order_relaxed); + int memory_high_watermark() const { + return _memory_high_watermark.load(std::memory_order_relaxed); } void set_weighted_memory_ratio(double ratio); @@ -105,7 +108,7 @@ class WorkloadGroup : public std::enable_shared_from_this { _total_mem_used + _wg_refresh_interval_memory_growth.load() + size; if ((realtime_total_mem_used > ((double)_weighted_memory_limit * - _spill_high_watermark.load(std::memory_order_relaxed) / 100))) { + _memory_high_watermark.load(std::memory_order_relaxed) / 100))) { return false; } else { _wg_refresh_interval_memory_growth.fetch_add(size); @@ -120,10 +123,10 @@ class WorkloadGroup : public std::enable_shared_from_this { auto realtime_total_mem_used = _total_mem_used + _wg_refresh_interval_memory_growth.load(); *is_low_wartermark = (realtime_total_mem_used > ((double)_weighted_memory_limit * - _spill_low_watermark.load(std::memory_order_relaxed) / 100)); + _memory_low_watermark.load(std::memory_order_relaxed) / 100)); *is_high_wartermark = (realtime_total_mem_used > ((double)_weighted_memory_limit * - _spill_high_watermark.load(std::memory_order_relaxed) / 100)); + _memory_high_watermark.load(std::memory_order_relaxed) / 100)); } std::string debug_string() const; @@ -165,7 +168,7 @@ class WorkloadGroup : public std::enable_shared_from_this { int64_t gc_memory(int64_t need_free_mem, RuntimeProfile* profile, bool is_minor_gc); - void upsert_task_scheduler(WorkloadGroupInfo* tg_info, ExecEnv* exec_env); + void upsert_task_scheduler(WorkloadGroupInfo* tg_info); void get_query_scheduler(doris::pipeline::TaskScheduler** exec_sched, vectorized::SimplifiedScanScheduler** scan_sched, @@ -187,29 +190,33 @@ class WorkloadGroup : public std::enable_shared_from_this { void upsert_scan_io_throttle(WorkloadGroupInfo* tg_info); - void update_cpu_adder(int64_t delta_cpu_time); + void 
update_cpu_time(int64_t delta_cpu_time); - void update_total_local_scan_io_adder(size_t scan_bytes); + void update_local_scan_io(std::string path, size_t scan_bytes); - int64_t get_mem_used() { return _mem_used_status->get_value(); } - uint64_t get_cpu_usage() { return _cpu_usage_per_second->get_value(); } - int64_t get_local_scan_bytes_per_second() { - return _total_local_scan_io_per_second->get_value(); - } - int64_t get_remote_scan_bytes_per_second(); + void update_remote_scan_io(size_t scan_bytes); - CgroupCpuCtl* get_cgroup_cpu_ctl_ptr() { - std::shared_lock rlock(_task_sched_lock); - return _cgroup_cpu_ctl.get(); - } + int64_t get_mem_used(); ThreadPool* get_memtable_flush_pool_ptr() { // no lock here because this is called by memtable flush, // to avoid lock competition with the workload thread pool's update return _memtable_flush_pool.get(); } + void create_cgroup_cpu_ctl(); + + std::weak_ptr get_cgroup_cpu_ctl_wptr(); + + std::shared_ptr get_metrics() { return _wg_metrics; } + + friend class WorkloadGroupMetrics; private: + void create_cgroup_cpu_ctl_no_lock(); + void upsert_cgroup_cpu_ctl_no_lock(WorkloadGroupInfo* wg_info); + void upsert_thread_pool_no_lock(WorkloadGroupInfo* wg_info, + std::shared_ptr cg_cpu_ctl_ptr); + mutable std::shared_mutex _mutex; // lock _name, _version, _cpu_share, _memory_limit const uint64_t _id; std::string _name; @@ -228,8 +235,8 @@ class WorkloadGroup : public std::enable_shared_from_this { std::atomic _scan_thread_num; std::atomic _max_remote_scan_thread_num; std::atomic _min_remote_scan_thread_num; - std::atomic _spill_low_watermark; - std::atomic _spill_high_watermark; + std::atomic _memory_low_watermark; + std::atomic _memory_high_watermark; std::atomic _scan_bytes_per_second {-1}; std::atomic _remote_scan_bytes_per_second {-1}; @@ -240,7 +247,10 @@ class WorkloadGroup : public std::enable_shared_from_this { std::unordered_map> _query_ctxs; std::shared_mutex _task_sched_lock; - std::unique_ptr _cgroup_cpu_ctl {nullptr}; 
+ // _cgroup_cpu_ctl not only used by threadpool which managed by WorkloadGroup, + // but also some global background threadpool which not owned by WorkloadGroup, + // so it should be shared ptr; + std::shared_ptr _cgroup_cpu_ctl {nullptr}; std::unique_ptr _task_sched {nullptr}; std::unique_ptr _scan_task_sched {nullptr}; std::unique_ptr _remote_scan_task_sched {nullptr}; @@ -249,12 +259,10 @@ class WorkloadGroup : public std::enable_shared_from_this { std::map> _scan_io_throttle_map; std::shared_ptr _remote_scan_io_throttle {nullptr}; - // bvar metric - std::unique_ptr> _mem_used_status; - std::unique_ptr> _cpu_usage_adder; - std::unique_ptr>> _cpu_usage_per_second; - std::unique_ptr> _total_local_scan_io_adder; - std::unique_ptr>> _total_local_scan_io_per_second; + // for some background workload, it doesn't need to create query thread pool + const bool _need_create_query_thread_pool; + + std::shared_ptr _wg_metrics {nullptr}; }; using WorkloadGroupPtr = std::shared_ptr; @@ -271,8 +279,8 @@ struct WorkloadGroupInfo { const int scan_thread_num = 0; const int max_remote_scan_thread_num = 0; const int min_remote_scan_thread_num = 0; - const int spill_low_watermark = 0; - const int spill_high_watermark = 0; + const int memory_low_watermark = 0; + const int memory_high_watermark = 0; const int read_bytes_per_second = -1; const int remote_read_bytes_per_second = -1; // log cgroup cpu info diff --git a/be/src/runtime/workload_group/workload_group_manager.cpp b/be/src/runtime/workload_group/workload_group_manager.cpp index 927d4d13814267..1e01a7ce1bafb1 100644 --- a/be/src/runtime/workload_group/workload_group_manager.cpp +++ b/be/src/runtime/workload_group/workload_group_manager.cpp @@ -26,6 +26,7 @@ #include "pipeline/task_scheduler.h" #include "runtime/memory/mem_tracker_limiter.h" #include "runtime/workload_group/workload_group.h" +#include "runtime/workload_group/workload_group_metrics.h" #include "util/mem_info.h" #include "util/threadpool.h" #include "util/time.h" 
@@ -34,6 +35,25 @@ namespace doris { +void WorkloadGroupMgr::init_internal_workload_group() { + WorkloadGroupPtr internal_wg = nullptr; + { + std::lock_guard w_lock(_group_mutex); + if (_workload_groups.find(INTERNAL_WORKLOAD_GROUP_ID) == _workload_groups.end()) { + WorkloadGroupInfo internal_wg_info { + .id = INTERNAL_WORKLOAD_GROUP_ID, + .name = INTERNAL_WORKLOAD_GROUP_NAME, + .cpu_share = CgroupCpuCtl::cpu_soft_limit_default_value()}; + internal_wg = std::make_shared(internal_wg_info, false); + _workload_groups[internal_wg_info.id] = internal_wg; + } + } + DCHECK(internal_wg != nullptr); + if (internal_wg) { + internal_wg->create_cgroup_cpu_ctl(); + } +} + WorkloadGroupPtr WorkloadGroupMgr::get_or_create_workload_group( const WorkloadGroupInfo& workload_group_info) { { @@ -67,10 +87,10 @@ void WorkloadGroupMgr::get_related_workload_groups( } } -WorkloadGroupPtr WorkloadGroupMgr::get_task_group_by_id(uint64_t tg_id) { +WorkloadGroupPtr WorkloadGroupMgr::get_group(uint64_t wg_id) { std::shared_lock r_lock(_group_mutex); - if (_workload_groups.find(tg_id) != _workload_groups.end()) { - return _workload_groups.at(tg_id); + if (_workload_groups.find(wg_id) != _workload_groups.end()) { + return _workload_groups.at(wg_id); } return nullptr; } @@ -86,6 +106,10 @@ void WorkloadGroupMgr::delete_workload_group_by_ids(std::set used_wg_i old_wg_size = _workload_groups.size(); for (auto iter = _workload_groups.begin(); iter != _workload_groups.end(); iter++) { uint64_t wg_id = iter->first; + // internal workload group created by BE can not be dropped + if (wg_id == INTERNAL_WORKLOAD_GROUP_ID) { + continue; + } auto workload_group_ptr = iter->second; if (used_wg_id.find(wg_id) == used_wg_id.end()) { workload_group_ptr->shutdown(); @@ -264,16 +288,25 @@ void WorkloadGroupMgr::get_wg_resource_usage(vectorized::Block* block) { for (const auto& [id, wg] : _workload_groups) { SchemaScannerHelper::insert_int64_value(0, be_id, block); SchemaScannerHelper::insert_int64_value(1, 
wg->id(), block); - SchemaScannerHelper::insert_int64_value(2, wg->get_mem_used(), block); + SchemaScannerHelper::insert_int64_value(2, wg->get_metrics()->get_memory_used(), block); - double cpu_usage_p = - (double)wg->get_cpu_usage() / (double)total_cpu_time_ns_per_second * 100; + double cpu_usage_p = (double)wg->get_metrics()->get_cpu_time_nanos_per_second() / + (double)total_cpu_time_ns_per_second * 100; cpu_usage_p = std::round(cpu_usage_p * 100.0) / 100.0; SchemaScannerHelper::insert_double_value(3, cpu_usage_p, block); - SchemaScannerHelper::insert_int64_value(4, wg->get_local_scan_bytes_per_second(), block); - SchemaScannerHelper::insert_int64_value(5, wg->get_remote_scan_bytes_per_second(), block); + SchemaScannerHelper::insert_int64_value( + 4, wg->get_metrics()->get_local_scan_bytes_per_second(), block); + SchemaScannerHelper::insert_int64_value( + 5, wg->get_metrics()->get_remote_scan_bytes_per_second(), block); + } +} + +void WorkloadGroupMgr::refresh_workload_group_metrics() { + std::shared_lock r_lock(_group_mutex); + for (const auto& [id, wg] : _workload_groups) { + wg->get_metrics()->refresh_metrics(); } } diff --git a/be/src/runtime/workload_group/workload_group_manager.h b/be/src/runtime/workload_group/workload_group_manager.h index f76e98d26063ba..5d75a4558ef4f8 100644 --- a/be/src/runtime/workload_group/workload_group_manager.h +++ b/be/src/runtime/workload_group/workload_group_manager.h @@ -36,11 +36,18 @@ class TaskScheduler; class MultiCoreTaskQueue; } // namespace pipeline +// internal_group is used for doris internal workload, currently is mainly compaction +const static uint64_t INTERNAL_WORKLOAD_GROUP_ID = + static_cast(TWorkloadType::type::INTERNAL); +const static std::string INTERNAL_WORKLOAD_GROUP_NAME = "_internal"; + class WorkloadGroupMgr { public: WorkloadGroupMgr() = default; ~WorkloadGroupMgr() = default; + void init_internal_workload_group(); + WorkloadGroupPtr get_or_create_workload_group(const WorkloadGroupInfo& 
workload_group_info); void get_related_workload_groups(const std::function& pred, @@ -48,7 +55,7 @@ class WorkloadGroupMgr { void delete_workload_group_by_ids(std::set id_set); - WorkloadGroupPtr get_task_group_by_id(uint64_t tg_id); + WorkloadGroupPtr get_group(uint64_t wg_id); void do_sweep(); @@ -64,6 +71,13 @@ class WorkloadGroupMgr { void get_wg_resource_usage(vectorized::Block* block); + WorkloadGroupPtr get_internal_wg() { + std::shared_lock r_lock(_group_mutex); + return _workload_groups[INTERNAL_WORKLOAD_GROUP_ID]; + } + + void refresh_workload_group_metrics(); + private: std::shared_mutex _group_mutex; std::unordered_map _workload_groups; diff --git a/be/src/runtime/workload_group/workload_group_metrics.cpp b/be/src/runtime/workload_group/workload_group_metrics.cpp new file mode 100644 index 00000000000000..18ff7aa2f4f185 --- /dev/null +++ b/be/src/runtime/workload_group/workload_group_metrics.cpp @@ -0,0 +1,128 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "runtime/workload_group/workload_group_metrics.h" + +#include "runtime/workload_group/workload_group.h" +#include "runtime/workload_management/io_throttle.h" +#include "util/doris_metrics.h" +#include "util/metrics.h" + +namespace doris { + +#include "common/compile_check_begin.h" + +WorkloadGroupMetrics::~WorkloadGroupMetrics() { + DorisMetrics::instance()->metric_registry()->deregister_entity(_entity); +} + +WorkloadGroupMetrics::WorkloadGroupMetrics(WorkloadGroup* wg) { + _entity = DorisMetrics::instance()->metric_registry()->register_entity( + "workload_group." + wg->name(), {{"name", wg->name()}}); + + _cpu_time_metric = std::make_unique( + doris::MetricType::COUNTER, doris::MetricUnit::SECONDS, "workload_group_cpu_time_sec"); + _cpu_time_counter = + (IntAtomicCounter*)(_entity->register_metric(_cpu_time_metric.get())); + + _mem_used_bytes_metric = std::make_unique( + doris::MetricType::COUNTER, doris::MetricUnit::BYTES, "workload_group_mem_used_bytes"); + _mem_used_bytes_counter = (IntAtomicCounter*)(_entity->register_metric( + _mem_used_bytes_metric.get())); + + _local_scan_bytes_metric = std::make_unique( + doris::MetricType::COUNTER, doris::MetricUnit::BYTES, + "workload_group_local_scan_bytes"); + _local_scan_bytes_counter = (IntAtomicCounter*)(_entity->register_metric( + _local_scan_bytes_metric.get())); + + _remote_scan_bytes_metric = std::make_unique( + doris::MetricType::COUNTER, doris::MetricUnit::BYTES, + "workload_group_remote_scan_bytes"); + _remote_scan_bytes_counter = (IntAtomicCounter*)(_entity->register_metric( + _remote_scan_bytes_metric.get())); + + for (const auto& [key, io_throttle] : wg->_scan_io_throttle_map) { + std::unique_ptr metric = std::make_unique( + doris::MetricType::COUNTER, doris::MetricUnit::BYTES, + "workload_group_local_scan_bytes_" + io_throttle->metric_name()); + _local_scan_bytes_counter_map[key] = + (IntAtomicCounter*)(_entity->register_metric(metric.get())); + _local_scan_bytes_metric_map[key] = 
std::move(metric); + } +} + +void WorkloadGroupMetrics::update_cpu_time_nanos(uint64_t delta_cpu_time) { + _cpu_time_nanos += delta_cpu_time; +} + +void WorkloadGroupMetrics::update_memory_used_bytes(int64_t memory_used) { + _memory_used = memory_used; +} + +void WorkloadGroupMetrics::update_local_scan_io_bytes(std::string path, uint64_t delta_io_bytes) { + _local_scan_bytes_counter->increment(delta_io_bytes); + _local_scan_bytes_counter_map[path]->increment((int64_t)delta_io_bytes); +} + +void WorkloadGroupMetrics::update_remote_scan_io_bytes(uint64_t delta_io_bytes) { + _remote_scan_bytes_counter->increment(delta_io_bytes); +} + +void WorkloadGroupMetrics::refresh_metrics() { + int interval_second = config::workload_group_metrics_interval_ms / 1000; + + // cpu + uint64_t _current_cpu_time_nanos = _cpu_time_nanos.load(); + uint64_t _cpu_time_sec = _current_cpu_time_nanos / (1000L * 1000L * 1000L); + _cpu_time_counter->set_value(_cpu_time_sec); + _per_sec_cpu_time_nanos = (_current_cpu_time_nanos - _last_cpu_time_nanos) / interval_second; + _last_cpu_time_nanos = _current_cpu_time_nanos; + + // memory + _mem_used_bytes_counter->set_value(_memory_used); + + // local scan + int64_t current_local_scan_bytes = _local_scan_bytes_counter->value(); + _per_sec_local_scan_bytes = + (current_local_scan_bytes - _last_local_scan_bytes) / interval_second; + _last_local_scan_bytes = current_local_scan_bytes; + + // remote scan + int64_t current_remote_scan_bytes = _remote_scan_bytes_counter->value(); + _per_sec_remote_scan_bytes = + (current_remote_scan_bytes - _last_remote_scan_bytes) / interval_second; + _last_remote_scan_bytes = current_remote_scan_bytes; +} + +uint64_t WorkloadGroupMetrics::get_cpu_time_nanos_per_second() { + return _per_sec_cpu_time_nanos.load(); +} + +int64_t WorkloadGroupMetrics::get_local_scan_bytes_per_second() { + return _per_sec_local_scan_bytes.load(); +} + +int64_t WorkloadGroupMetrics::get_remote_scan_bytes_per_second() { + return 
_per_sec_remote_scan_bytes.load(); +} + +int64_t WorkloadGroupMetrics::get_memory_used() { + return _mem_used_bytes_counter->value(); +} + +} // namespace doris \ No newline at end of file diff --git a/be/src/runtime/workload_group/workload_group_metrics.h b/be/src/runtime/workload_group/workload_group_metrics.h new file mode 100644 index 00000000000000..e68715df249dee --- /dev/null +++ b/be/src/runtime/workload_group/workload_group_metrics.h @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +namespace doris { + +class WorkloadGroup; + +template +class AtomicCounter; +using IntAtomicCounter = AtomicCounter; +class MetricEntity; +struct MetricPrototype; + +class WorkloadGroupMetrics { +public: + WorkloadGroupMetrics(WorkloadGroup* wg); + + ~WorkloadGroupMetrics(); + + void update_cpu_time_nanos(uint64_t delta_cpu_time); + + void update_memory_used_bytes(int64_t memory_used); + + void update_local_scan_io_bytes(std::string path, uint64_t delta_io_bytes); + + void update_remote_scan_io_bytes(uint64_t delta_io_bytes); + + void refresh_metrics(); + + uint64_t get_cpu_time_nanos_per_second(); + + int64_t get_local_scan_bytes_per_second(); + + int64_t get_remote_scan_bytes_per_second(); + + int64_t get_memory_used(); + +private: + std::unique_ptr _cpu_time_metric {nullptr}; + std::unique_ptr _mem_used_bytes_metric {nullptr}; + std::unique_ptr _local_scan_bytes_metric {nullptr}; + std::unique_ptr _remote_scan_bytes_metric {nullptr}; + // NOTE: _local_scan_bytes_metric is sum of all disk's IO + // _local_disk_io_metric is every disk's IO + std::map> _local_scan_bytes_metric_map; + + IntAtomicCounter* _cpu_time_counter {nullptr}; // used for metric + IntAtomicCounter* _mem_used_bytes_counter {nullptr}; // used for metric + IntAtomicCounter* _local_scan_bytes_counter {nullptr}; // used for metric + IntAtomicCounter* _remote_scan_bytes_counter {nullptr}; // used for metric + std::map _local_scan_bytes_counter_map; // used for metric + + std::atomic _cpu_time_nanos {0}; + std::atomic _last_cpu_time_nanos {0}; + std::atomic _per_sec_cpu_time_nanos {0}; // used for system table + + std::atomic _per_sec_local_scan_bytes {0}; + std::atomic _last_local_scan_bytes {0}; // used for system table + + std::atomic _per_sec_remote_scan_bytes {0}; + std::atomic _last_remote_scan_bytes {0}; // used for system table + + std::atomic _memory_used {0}; + + std::shared_ptr _entity {nullptr}; +}; + +} // namespace doris \ No 
newline at end of file diff --git a/be/src/runtime/workload_management/io_throttle.cpp b/be/src/runtime/workload_management/io_throttle.cpp index dacfa29012f59f..118fc518072272 100644 --- a/be/src/runtime/workload_management/io_throttle.cpp +++ b/be/src/runtime/workload_management/io_throttle.cpp @@ -22,12 +22,6 @@ namespace doris { -IOThrottle::IOThrottle(std::string prefix, std::string name) { - _io_adder = std::make_unique>(prefix, name); - _io_adder_per_second = std::make_unique>>( - prefix, name + "_per_second", _io_adder.get(), 1); -} - bool IOThrottle::acquire(int64_t block_timeout_ms) { if (_io_bytes_per_second_limit < 0) { return true; @@ -57,11 +51,6 @@ bool IOThrottle::try_acquire() { } void IOThrottle::update_next_io_time(int64_t io_bytes) { - Defer defer {[&]() { - if (io_bytes > 0) { - (*_io_adder) << io_bytes; - } - }}; if (_io_bytes_per_second_limit <= 0 || io_bytes <= 0) { return; } diff --git a/be/src/runtime/workload_management/io_throttle.h b/be/src/runtime/workload_management/io_throttle.h index 4212527020e0e2..f688922fcd29f9 100644 --- a/be/src/runtime/workload_management/io_throttle.h +++ b/be/src/runtime/workload_management/io_throttle.h @@ -28,7 +28,9 @@ namespace doris { class IOThrottle { public: - IOThrottle(std::string prefix, std::string name); + IOThrottle() = default; + + IOThrottle(std::string metric_name) : _metric_name(metric_name) {} ~IOThrottle() = default; @@ -41,7 +43,7 @@ class IOThrottle { void set_io_bytes_per_second(int64_t read_bytes_per_second); - size_t get_bvar_io_per_second() { return _io_adder_per_second->get_value(); } + std::string metric_name() { return _metric_name; } private: std::mutex _mutex; @@ -49,8 +51,6 @@ class IOThrottle { int64_t _next_io_time_micros {0}; std::atomic _io_bytes_per_second_limit {-1}; - // bvar monitor - std::unique_ptr> _io_adder; - std::unique_ptr>> _io_adder_per_second; + std::string _metric_name; }; }; // namespace doris \ No newline at end of file diff --git 
a/be/src/runtime/workload_management/workload_action.cpp b/be/src/runtime/workload_management/workload_action.cpp index 8e6e3b19e2c385..77042b074fd624 100644 --- a/be/src/runtime/workload_management/workload_action.cpp +++ b/be/src/runtime/workload_management/workload_action.cpp @@ -25,7 +25,7 @@ void WorkloadActionCancelQuery::exec(WorkloadQueryInfo* query_info) { std::stringstream msg; msg << "query " << query_info->query_id << " cancelled by workload policy: " << query_info->policy_name - << ", id:" << query_info->policy_id; + << ", id:" << query_info->policy_id << ", " << query_info->cond_eval_msg; std::string msg_str = msg.str(); LOG(INFO) << "[workload_schedule]" << msg_str; ExecEnv::GetInstance()->fragment_mgr()->cancel_query(query_info->tquery_id, diff --git a/be/src/runtime/workload_management/workload_condition.h b/be/src/runtime/workload_management/workload_condition.h index a85268a8dc3a6a..cf53a5f07ddf9d 100644 --- a/be/src/runtime/workload_management/workload_condition.h +++ b/be/src/runtime/workload_management/workload_condition.h @@ -33,6 +33,10 @@ class WorkloadCondition { virtual bool eval(std::string str_val) = 0; virtual WorkloadMetricType get_workload_metric_type() = 0; + + virtual std::string get_metric_string() = 0; + + virtual std::string get_metric_value_string() = 0; }; class WorkloadConditionQueryTime : public WorkloadCondition { @@ -45,6 +49,10 @@ class WorkloadConditionQueryTime : public WorkloadCondition { return WorkloadMetricType::QUERY_TIME; } + std::string get_metric_string() override { return "query_time"; } + + std::string get_metric_value_string() override { return std::to_string(_query_time); } + private: int64_t _query_time; WorkloadCompareOperator _op; @@ -56,6 +64,10 @@ class WorkloadConditionScanRows : public WorkloadCondition { bool eval(std::string str_val) override; WorkloadMetricType get_workload_metric_type() override { return WorkloadMetricType::SCAN_ROWS; } + std::string get_metric_string() override { return 
"scan_rows"; } + + std::string get_metric_value_string() override { return std::to_string(_scan_rows); } + private: int64_t _scan_rows; WorkloadCompareOperator _op; @@ -69,6 +81,10 @@ class WorkloadConditionScanBytes : public WorkloadCondition { return WorkloadMetricType::SCAN_BYTES; } + std::string get_metric_string() override { return "scan_bytes"; } + + std::string get_metric_value_string() override { return std::to_string(_scan_bytes); } + private: int64_t _scan_bytes; WorkloadCompareOperator _op; @@ -82,6 +98,10 @@ class WorkloadConditionQueryMemory : public WorkloadCondition { return WorkloadMetricType::QUERY_MEMORY_BYTES; } + std::string get_metric_string() override { return "query_memory"; } + + std::string get_metric_value_string() override { return std::to_string(_query_memory_bytes); } + private: int64_t _query_memory_bytes; WorkloadCompareOperator _op; diff --git a/be/src/runtime/workload_management/workload_query_info.h b/be/src/runtime/workload_management/workload_query_info.h index e544668e1039ed..16151eec390746 100644 --- a/be/src/runtime/workload_management/workload_query_info.h +++ b/be/src/runtime/workload_management/workload_query_info.h @@ -30,7 +30,8 @@ class WorkloadQueryInfo { std::string query_id; int64_t wg_id; int64_t policy_id; - std::string policy_name; + std::string policy_name {""}; + std::string cond_eval_msg {""}; }; } // namespace doris \ No newline at end of file diff --git a/be/src/runtime/workload_management/workload_sched_policy.cpp b/be/src/runtime/workload_management/workload_sched_policy.cpp index efa8965dd77121..63b9362bc217be 100644 --- a/be/src/runtime/workload_management/workload_sched_policy.cpp +++ b/be/src/runtime/workload_management/workload_sched_policy.cpp @@ -60,6 +60,7 @@ bool WorkloadSchedPolicy::is_match(WorkloadQueryInfo* query_info_ptr) { } auto& metric_val_map = query_info_ptr->metric_map; + std::string cond_eval_msg = ""; for (auto& cond : _condition_list) { if 
(metric_val_map.find(cond->get_workload_metric_type()) == metric_val_map.end()) { return false; @@ -69,7 +70,11 @@ bool WorkloadSchedPolicy::is_match(WorkloadQueryInfo* query_info_ptr) { if (!cond->eval(val)) { return false; } + cond_eval_msg += cond->get_metric_string() + ":" + val + "(" + + cond->get_metric_value_string() + "), "; } + cond_eval_msg = cond_eval_msg.substr(0, cond_eval_msg.size() - 2); + query_info_ptr->cond_eval_msg = cond_eval_msg; return true; } diff --git a/be/src/service/CMakeLists.txt b/be/src/service/CMakeLists.txt index 4ce611345840c1..e44045dffce17e 100644 --- a/be/src/service/CMakeLists.txt +++ b/be/src/service/CMakeLists.txt @@ -28,7 +28,7 @@ add_library(Service STATIC ${SRC_FILES}) pch_reuse(Service) -if (${MAKE_TEST} STREQUAL "OFF") +if (${MAKE_TEST} STREQUAL "OFF" AND ${BUILD_BENCHMARK} STREQUAL "OFF") add_executable(doris_be doris_main.cpp ) diff --git a/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp b/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp index e935aff996d55e..c24fcb73384494 100644 --- a/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp +++ b/be/src/service/arrow_flight/arrow_flight_batch_reader.cpp @@ -56,7 +56,7 @@ arrow::Status ArrowFlightBatchReaderBase::_return_invalid_status(const std::stri } ArrowFlightBatchReaderBase::~ArrowFlightBatchReaderBase() { - VLOG_NOTICE << fmt::format( + LOG(INFO) << fmt::format( "ArrowFlightBatchReader finished, packet_seq={}, result_addr={}:{}, finistId={}, " "convert_arrow_batch_timer={}, deserialize_block_timer={}, peak_memory_usage={}", _packet_seq, _statement->result_addr.hostname, _statement->result_addr.port, diff --git a/be/src/service/http_service.cpp b/be/src/service/http_service.cpp index e7b920796a1b98..eb0824170b30cc 100644 --- a/be/src/service/http_service.cpp +++ b/be/src/service/http_service.cpp @@ -25,13 +25,13 @@ #include #include "cloud/cloud_compaction_action.h" -#include "cloud/cloud_delete_bitmap_action.h" #include "cloud/config.h" 
#include "cloud/injection_point_action.h" #include "common/config.h" #include "common/status.h" #include "http/action/adjust_log_level.h" #include "http/action/adjust_tracing_dump.h" +#include "http/action/batch_download_action.h" #include "http/action/be_proc_thread_action.h" #include "http/action/calc_file_crc_action.h" #include "http/action/check_rpc_channel_action.h" @@ -42,6 +42,7 @@ #include "http/action/compaction_score_action.h" #include "http/action/config_action.h" #include "http/action/debug_point_action.h" +#include "http/action/delete_bitmap_action.h" #include "http/action/download_action.h" #include "http/action/download_binlog_action.h" #include "http/action/file_cache_action.h" @@ -80,6 +81,7 @@ #include "util/doris_metrics.h" namespace doris { +#include "common/compile_check_begin.h" namespace { std::shared_ptr get_rate_limit_group(event_base* event_base) { auto rate_limit = config::download_binlog_rate_limit_kbs; @@ -308,6 +310,16 @@ void HttpService::register_local_handler(StorageEngine& engine) { tablet_download_action); _ev_http_server->register_handler(HttpMethod::GET, "/api/_tablet/_download", tablet_download_action); + + BatchDownloadAction* batch_download_action = + _pool.add(new BatchDownloadAction(_env, _rate_limit_group, allow_paths)); + _ev_http_server->register_handler(HttpMethod::HEAD, "/api/_tablet/_batch_download", + batch_download_action); + _ev_http_server->register_handler(HttpMethod::GET, "/api/_tablet/_batch_download", + batch_download_action); + _ev_http_server->register_handler(HttpMethod::POST, "/api/_tablet/_batch_download", + batch_download_action); + if (config::enable_single_replica_load) { DownloadAction* single_replica_download_action = _pool.add(new DownloadAction( _env, nullptr, allow_paths, config::single_replica_load_download_num_workers)); @@ -377,6 +389,13 @@ void HttpService::register_local_handler(StorageEngine& engine) { _ev_http_server->register_handler(HttpMethod::GET, "/api/compaction/run_status", 
run_status_compaction_action); + + DeleteBitmapAction* count_delete_bitmap_action = + _pool.add(new DeleteBitmapAction(DeleteBitmapActionType::COUNT_LOCAL, _env, engine, + TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); + _ev_http_server->register_handler(HttpMethod::GET, "/api/delete_bitmap/count_local", + count_delete_bitmap_action); + CheckTabletSegmentAction* check_tablet_segment_action = _pool.add(new CheckTabletSegmentAction( _env, engine, TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); _ev_http_server->register_handler(HttpMethod::POST, "/api/check_tablet_segment_lost", @@ -425,11 +444,16 @@ void HttpService::register_cloud_handler(CloudStorageEngine& engine) { TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); _ev_http_server->register_handler(HttpMethod::GET, "/api/compaction/run_status", run_status_compaction_action); - CloudDeleteBitmapAction* count_delete_bitmap_action = - _pool.add(new CloudDeleteBitmapAction(DeleteBitmapActionType::COUNT_INFO, _env, engine, - TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); - _ev_http_server->register_handler(HttpMethod::GET, "/api/delete_bitmap/count", - count_delete_bitmap_action); + DeleteBitmapAction* count_local_delete_bitmap_action = + _pool.add(new DeleteBitmapAction(DeleteBitmapActionType::COUNT_LOCAL, _env, engine, + TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); + _ev_http_server->register_handler(HttpMethod::GET, "/api/delete_bitmap/count_local", + count_local_delete_bitmap_action); + DeleteBitmapAction* count_ms_delete_bitmap_action = + _pool.add(new DeleteBitmapAction(DeleteBitmapActionType::COUNT_MS, _env, engine, + TPrivilegeHier::GLOBAL, TPrivilegeType::ADMIN)); + _ev_http_server->register_handler(HttpMethod::GET, "/api/delete_bitmap/count_ms", + count_ms_delete_bitmap_action); #ifdef ENABLE_INJECTION_POINT InjectionPointAction* injection_point_action = _pool.add(new InjectionPointAction); _ev_http_server->register_handler(HttpMethod::GET, "/api/injection_point/{op}", @@ -468,4 +492,5 @@ int 
HttpService::get_real_port() const { return _ev_http_server->get_real_port(); } +#include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/service/internal_service.cpp b/be/src/service/internal_service.cpp index 29eb01bad2aaa8..fb0b2f090bc045 100644 --- a/be/src/service/internal_service.cpp +++ b/be/src/service/internal_service.cpp @@ -665,15 +665,11 @@ void PInternalService::cancel_plan_fragment(google::protobuf::RpcController* /*c void PInternalService::fetch_data(google::protobuf::RpcController* controller, const PFetchDataRequest* request, PFetchDataResult* result, google::protobuf::Closure* done) { - bool ret = _heavy_work_pool.try_offer([this, controller, request, result, done]() { - brpc::Controller* cntl = static_cast(controller); - GetResultBatchCtx* ctx = new GetResultBatchCtx(cntl, result, done); - _exec_env->result_mgr()->fetch_data(request->finst_id(), ctx); - }); - if (!ret) { - offer_failed(result, done, _heavy_work_pool); - return; - } + // fetch_data is a light operation which will put a request rather than wait inplace when there's no data ready. + // when there's data ready, use brpc to send. there's queue in brpc service. won't take it too long. 
+ auto* cntl = static_cast(controller); + auto* ctx = new GetResultBatchCtx(cntl, result, done); + _exec_env->result_mgr()->fetch_data(request->finst_id(), ctx); } void PInternalService::fetch_arrow_data(google::protobuf::RpcController* controller, @@ -903,6 +899,7 @@ void PInternalService::fetch_arrow_flight_schema(google::protobuf::RpcController auto st = ExecEnv::GetInstance()->result_mgr()->find_arrow_schema( UniqueId(request->finst_id()).to_thrift(), &schema); if (!st.ok()) { + LOG(WARNING) << "fetch arrow flight schema failed, errmsg=" << st; st.to_protobuf(result->mutable_status()); return; } @@ -911,9 +908,11 @@ void PInternalService::fetch_arrow_flight_schema(google::protobuf::RpcController st = serialize_arrow_schema(&schema, &schema_str); if (st.ok()) { result->set_schema(std::move(schema_str)); - if (!config::public_access_ip.empty() && config::public_access_port != -1) { - result->set_be_arrow_flight_ip(config::public_access_ip); - result->set_be_arrow_flight_port(config::public_access_port); + if (!config::public_host.empty()) { + result->set_be_arrow_flight_ip(config::public_host); + } + if (config::arrow_flight_sql_proxy_port != -1) { + result->set_be_arrow_flight_port(config::arrow_flight_sql_proxy_port); } } st.to_protobuf(result->mutable_status()); @@ -1237,7 +1236,10 @@ void PInternalService::report_stream_load_status(google::protobuf::RpcController void PInternalService::get_info(google::protobuf::RpcController* controller, const PProxyRequest* request, PProxyResult* response, google::protobuf::Closure* done) { - bool ret = _heavy_work_pool.try_offer([this, request, response, done]() { + bool ret = _exec_env->routine_load_task_executor()->get_thread_pool().submit_func([this, + request, + response, + done]() { brpc::ClosureGuard closure_guard(done); // PProxyRequest is defined in gensrc/proto/internal_service.proto // Currently it supports 2 kinds of requests: diff --git a/be/src/service/point_query_executor.cpp 
b/be/src/service/point_query_executor.cpp index 74dab466340330..ea991e158a1138 100644 --- a/be/src/service/point_query_executor.cpp +++ b/be/src/service/point_query_executor.cpp @@ -396,17 +396,6 @@ Status PointQueryExecutor::_lookup_row_key() { specified_rowsets = _tablet->get_rowset_by_ids(nullptr); } std::vector> segment_caches(specified_rowsets.size()); - // init segment_cache - { - SCOPED_TIMER(&_profile_metrics.load_segment_key_stage_ns); - for (size_t i = 0; i < specified_rowsets.size(); i++) { - auto& rs = specified_rowsets[i]; - segment_caches[i] = std::make_unique(); - RETURN_IF_ERROR(SegmentLoader::instance()->load_segments( - std::static_pointer_cast(rs), segment_caches[i].get(), true, true, - &_profile_metrics.read_stats)); - } - } for (size_t i = 0; i < _row_read_ctxs.size(); ++i) { RowLocation location; if (!config::disable_storage_row_cache) { diff --git a/be/src/util/arrow/block_convertor.cpp b/be/src/util/arrow/block_convertor.cpp index 817231e02ba03e..eb2508c8d0cb74 100644 --- a/be/src/util/arrow/block_convertor.cpp +++ b/be/src/util/arrow/block_convertor.cpp @@ -391,8 +391,9 @@ Status FromBlockConverter::convert(std::shared_ptr* out) { _cur_start, _cur_start + _cur_rows, _timezone_obj); } catch (std::exception& e) { - return Status::InternalError("Fail to convert block data to arrow data, error: {}", - e.what()); + return Status::InternalError( + "Fail to convert block data to arrow data, type: {}, error: {}", + _cur_type->get_name(), e.what()); } arrow_st = _cur_builder->Finish(&_arrays[_cur_field_idx]); if (!arrow_st.ok()) { diff --git a/be/src/util/arrow/row_batch.cpp b/be/src/util/arrow/row_batch.cpp index dd11d5ae46f740..a0cd77aee41931 100644 --- a/be/src/util/arrow/row_batch.cpp +++ b/be/src/util/arrow/row_batch.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -84,12 +85,10 @@ Status convert_to_arrow_type(const TypeDescriptor& type, std::shared_ptr
sub Status CGroupUtil::read_int_line_from_cgroup_file(const std::filesystem::path& file_path, int64_t* val) { std::ifstream file_stream(file_path, std::ios::in); + if (!file_stream.is_open()) { + return Status::CgroupError("Error open {}", file_path.string()); + } + string line; getline(file_stream, line); if (file_stream.fail() || file_stream.bad()) { @@ -264,4 +268,167 @@ void CGroupUtil::read_int_metric_from_cgroup_file( } } +Status CGroupUtil::read_string_line_from_cgroup_file(const std::filesystem::path& file_path, + std::string* line_ptr) { + std::ifstream file_stream(file_path, std::ios::in); + if (!file_stream.is_open()) { + return Status::CgroupError("Error open {}", file_path.string()); + } + string line; + getline(file_stream, line); + if (file_stream.fail() || file_stream.bad()) { + return Status::CgroupError("Error reading {}: {}", file_path.string(), get_str_err_msg()); + } + *line_ptr = line; + return Status::OK(); +} + +Status CGroupUtil::parse_cpuset_line(std::string cpuset_line, int* cpu_count_ptr) { + if (cpuset_line.empty()) { + return Status::CgroupError("cpuset line is empty"); + } + std::vector ranges; + boost::split(ranges, cpuset_line, boost::is_any_of(",")); + int cpu_count = 0; + + for (const std::string& range : ranges) { + std::vector cpu_values; + boost::split(cpu_values, range, boost::is_any_of("-")); + + if (cpu_values.size() == 2) { + int start = std::stoi(cpu_values[0]); + int end = std::stoi(cpu_values[1]); + cpu_count += (end - start) + 1; + } else { + cpu_count++; + } + } + *cpu_count_ptr = cpu_count; + return Status::OK(); +} + +int CGroupUtil::get_cgroup_limited_cpu_number(int physical_cores) { + if (physical_cores <= 0) { + return physical_cores; + } + int ret = physical_cores; +#if defined(OS_LINUX) + // For cgroup v2 + // Child cgroup's cpu.max may bigger than parent group's cpu.max, + // so it should look up from current cgroup to top group. 
+ // For cpuset, child cgroup's cpuset.cpus cannot be bigger than parent's cpuset.cpus. + if (CGroupUtil::cgroupsv2_enable()) { + std::string cgroupv2_process_path = CGroupUtil::cgroupv2_of_process(); + if (cgroupv2_process_path.empty()) { + return ret; + } + std::filesystem::path current_cgroup_path = (default_cgroups_mount / cgroupv2_process_path); + ret = get_cgroup_v2_cpu_quota_number(current_cgroup_path, default_cgroups_mount, ret); + + current_cgroup_path = (default_cgroups_mount / cgroupv2_process_path); + ret = get_cgroup_v2_cpuset_number(current_cgroup_path, default_cgroups_mount, ret); + } else if (CGroupUtil::cgroupsv1_enable()) { + // cpu quota, should find first not empty config from current path to top. + // because if a process attach to current cgroup, its cpu quota may not be set. + std::string cpu_quota_path = ""; + Status cpu_quota_ret = CGroupUtil::find_abs_cgroupv1_path("cpu", &cpu_quota_path); + if (cpu_quota_ret.ok() && !cpu_quota_path.empty()) { + std::filesystem::path current_cgroup_path = cpu_quota_path; + ret = get_cgroup_v1_cpu_quota_number(current_cgroup_path, default_cgroups_mount, ret); + } + + //cpuset + // just lookup current process cgroup path is enough + // because if a process attach to current cgroup, its cpuset.cpus must be set.
+ std::string cpuset_path = ""; + Status cpuset_ret = CGroupUtil::find_abs_cgroupv1_path("cpuset", &cpuset_path); + if (cpuset_ret.ok() && !cpuset_path.empty()) { + std::filesystem::path current_path = cpuset_path; + ret = get_cgroup_v1_cpuset_number(current_path, ret); + } + } +#endif + return ret; +} + +int CGroupUtil::get_cgroup_v2_cpu_quota_number(std::filesystem::path& current_path, + const std::filesystem::path& default_cg_mout_path, + int cpu_num) { + int ret = cpu_num; + while (current_path != default_cg_mout_path.parent_path()) { + std::ifstream cpu_max_file(current_path / "cpu.max"); + if (cpu_max_file.is_open()) { + std::string cpu_limit_str; + double cpu_period; + cpu_max_file >> cpu_limit_str >> cpu_period; + if (cpu_limit_str != "max" && cpu_period != 0) { + double cpu_limit = std::stod(cpu_limit_str); + ret = std::min(static_cast(std::ceil(cpu_limit / cpu_period)), ret); + } + } + current_path = current_path.parent_path(); + } + return ret; +} + +int CGroupUtil::get_cgroup_v2_cpuset_number(std::filesystem::path& current_path, + const std::filesystem::path& default_cg_mout_path, + int cpu_num) { + int ret = cpu_num; + while (current_path != default_cg_mout_path.parent_path()) { + std::ifstream cpuset_cpus_file(current_path / "cpuset.cpus.effective"); + current_path = current_path.parent_path(); + if (cpuset_cpus_file.is_open()) { + std::string cpuset_line; + cpuset_cpus_file >> cpuset_line; + if (cpuset_line.empty()) { + continue; + } + int cpus_count = 0; + static_cast(CGroupUtil::parse_cpuset_line(cpuset_line, &cpus_count)); + ret = std::min(cpus_count, ret); + break; + } + } + return ret; +} + +int CGroupUtil::get_cgroup_v1_cpu_quota_number(std::filesystem::path& current_path, + const std::filesystem::path& default_cg_mout_path, + int cpu_num) { + int ret = cpu_num; + while (current_path != default_cg_mout_path.parent_path()) { + std::ifstream cpu_quota_file(current_path / "cpu.cfs_quota_us"); + std::ifstream cpu_period_file(current_path / 
"cpu.cfs_period_us"); + if (cpu_quota_file.is_open() && cpu_period_file.is_open()) { + double cpu_quota_value; + double cpu_period_value; + cpu_quota_file >> cpu_quota_value; + cpu_period_file >> cpu_period_value; + if (cpu_quota_value > 0 && cpu_period_value > 0) { + ret = std::min(ret, + static_cast(std::ceil(cpu_quota_value / cpu_period_value))); + break; + } + } + current_path = current_path.parent_path(); + } + return ret; +} + +int CGroupUtil::get_cgroup_v1_cpuset_number(std::filesystem::path& current_path, int cpu_num) { + int ret = cpu_num; + std::string cpuset_line = ""; + Status cpuset_ret = CGroupUtil::read_string_line_from_cgroup_file( + (current_path / "cpuset.cpus"), &cpuset_line); + if (cpuset_ret.ok() && !cpuset_line.empty()) { + int cpuset_count = 0; + static_cast(CGroupUtil::parse_cpuset_line(cpuset_line, &cpuset_count)); + if (cpuset_count > 0) { + ret = std::min(ret, cpuset_count); + } + } + return ret; +} + } // namespace doris diff --git a/be/src/util/cgroup_util.h b/be/src/util/cgroup_util.h index bc1417453f41f6..54fc9494599f15 100644 --- a/be/src/util/cgroup_util.h +++ b/be/src/util/cgroup_util.h @@ -104,5 +104,27 @@ class CGroupUtil { static void read_int_metric_from_cgroup_file( const std::filesystem::path& file_path, std::unordered_map& metrics_map); + + static Status read_string_line_from_cgroup_file(const std::filesystem::path& file_path, + std::string* line_ptr); + + // cpuset_line: 0-4,6,8-10 + static Status parse_cpuset_line(std::string cpuset_line, int* cpu_count_ptr); + + static int get_cgroup_limited_cpu_number(int physical_cores); + + static int get_cgroup_v2_cpu_quota_number(std::filesystem::path& current_path, + const std::filesystem::path& default_cg_mout_path, + int cpu_num); + + static int get_cgroup_v2_cpuset_number(std::filesystem::path& current_path, + const std::filesystem::path& default_cg_mout_path, + int cpu_num); + + static int get_cgroup_v1_cpu_quota_number(std::filesystem::path& current_path, + const 
std::filesystem::path& default_cg_mout_path, + int cpu_num); + + static int get_cgroup_v1_cpuset_number(std::filesystem::path& current_path, int cpu_num); }; } // namespace doris diff --git a/be/src/util/cpu_info.cpp b/be/src/util/cpu_info.cpp index 116dacb8da7ed4..b49985cdc06830 100644 --- a/be/src/util/cpu_info.cpp +++ b/be/src/util/cpu_info.cpp @@ -59,6 +59,7 @@ #include "gflags/gflags.h" #include "gutil/stringprintf.h" #include "gutil/strings/substitute.h" +#include "util/cgroup_util.h" #include "util/pretty_printer.h" using boost::algorithm::contains; @@ -109,58 +110,6 @@ static struct { {"popcnt", CpuInfo::POPCNT}, {"avx", CpuInfo::AVX}, {"avx2", CpuInfo::AVX2}, }; -int cgroup_bandwidth_quota(int physical_cores) { - namespace fs = std::filesystem; - fs::path cpu_max = "/sys/fs/cgroup/cpu.max"; - fs::path cfs_quota = "/sys/fs/cgroup/cpu/cpu.cfs_quota_us"; - fs::path cfs_period = "/sys/fs/cgroup/cpu/cpu.cfs_period_us"; - - int64_t quota, period; - char byte_buffer[1000]; - int64_t read_bytes; - - if (fs::exists(cpu_max)) { - // cgroup v2 - // https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html - std::ifstream file(cpu_max); - file.read(byte_buffer, 999); - read_bytes = file.gcount(); - byte_buffer[read_bytes] = '\0'; - if (sscanf(byte_buffer, "%" SCNd64 " %" SCNd64 "", "a, &period) != 2) { - return physical_cores; - } - } else if (fs::exists(cfs_quota) && fs::exists(cfs_period)) { - // cgroup v1 - // https://www.kernel.org/doc/html/latest/scheduler/sched-bwc.html#management - - // Read the quota, this indicates how many microseconds the CPU can be utilized by this cgroup per period - std::ifstream quota_file(cfs_quota); - quota_file.read(byte_buffer, 999); - read_bytes = quota_file.gcount(); - byte_buffer[read_bytes] = '\0'; - if (sscanf(byte_buffer, "%" SCNd64 "", "a) != 1) { - return physical_cores; - } - - // Read the time period, a cgroup can utilize the CPU up to quota microseconds every period - std::ifstream period_file(cfs_period); - 
period_file.read(byte_buffer, 999); - read_bytes = period_file.gcount(); - byte_buffer[read_bytes] = '\0'; - if (sscanf(byte_buffer, "%" SCNd64 "", &period) != 1) { - return physical_cores; - } - } else { - // No cgroup quota - return physical_cores; - } - if (quota > 0 && period > 0) { - return int64_t(ceil(double(quota) / double(period))); - } else { - return physical_cores; - } -} - // Helper function to parse for hardware flags. // values contains a list of space-separated flags. check to see if the flags we // care about are present. @@ -212,7 +161,7 @@ void CpuInfo::init() { } } - int num_cores = cgroup_bandwidth_quota(physical_num_cores); + int num_cores = CGroupUtil::get_cgroup_limited_cpu_number(physical_num_cores); if (max_mhz != 0) { cycles_per_ms_ = int64_t(max_mhz) * 1000; } else { diff --git a/be/src/util/debug_util.cpp b/be/src/util/debug_util.cpp index 1cf03d2c22d0e1..0856b10c051709 100644 --- a/be/src/util/debug_util.cpp +++ b/be/src/util/debug_util.cpp @@ -17,6 +17,7 @@ #include "util/debug_util.h" +#include #include #include #include @@ -104,6 +105,16 @@ std::string hexdump(const char* buf, int len) { return ss.str(); } +bvar::Status be_version_metrics("doris_be_version", [] { + std::stringstream ss; + ss << version::doris_build_version_major() << 0 << version::doris_build_version_minor() << 0 + << version::doris_build_version_patch(); + if (version::doris_build_version_hotfix() > 0) { + ss << 0 << version::doris_build_version_hotfix(); + } + return std::strtoul(ss.str().c_str(), nullptr, 10); +}()); + std::string PrintThriftNetworkAddress(const TNetworkAddress& add) { std::stringstream ss; add.printTo(ss); diff --git a/be/src/util/doris_metrics.h b/be/src/util/doris_metrics.h index 69516773debdbc..31b907eec9ed6c 100644 --- a/be/src/util/doris_metrics.h +++ b/be/src/util/doris_metrics.h @@ -202,6 +202,7 @@ class DorisMetrics { UIntGauge* send_batch_thread_pool_thread_num = nullptr; UIntGauge* send_batch_thread_pool_queue_size = nullptr; 
UIntGauge* fragment_thread_pool_queue_size = nullptr; + UIntGauge* fragment_thread_pool_num_active_threads = nullptr; // Upload metrics UIntGauge* upload_total_byte = nullptr; diff --git a/be/src/util/hash_util.hpp b/be/src/util/hash_util.hpp index e9ac72c5ccdcb4..fbf10b75ae02c0 100644 --- a/be/src/util/hash_util.hpp +++ b/be/src/util/hash_util.hpp @@ -38,15 +38,7 @@ namespace doris { // Utility class to compute hash values. class HashUtil { public: - template - static uint32_t fixed_len_to_uint32(T value) { - if constexpr (sizeof(T) <= sizeof(uint32_t)) { - return (uint32_t)value; - } - return std::hash()(value); - } - - static uint32_t zlib_crc_hash(const void* data, int32_t bytes, uint32_t hash) { + static uint32_t zlib_crc_hash(const void* data, uint32_t bytes, uint32_t hash) { return crc32(hash, (const unsigned char*)data, bytes); } @@ -66,7 +58,7 @@ class HashUtil { // NOTE: Any changes made to this function need to be reflected in Codegen::GetHashFn. // TODO: crc32 hashes with different seeds do not result in different hash functions. // The resulting hashes are correlated. 
- static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) { + static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) { if (!CpuInfo::is_supported(CpuInfo::SSE4_2)) { return zlib_crc_hash(data, bytes, hash); } @@ -93,7 +85,7 @@ class HashUtil { return hash; } - static uint64_t crc_hash64(const void* data, int32_t bytes, uint64_t hash) { + static uint64_t crc_hash64(const void* data, uint32_t bytes, uint64_t hash) { uint32_t words = bytes / sizeof(uint32_t); bytes = bytes % sizeof(uint32_t); @@ -125,7 +117,7 @@ class HashUtil { return converter.u64; } #else - static uint32_t crc_hash(const void* data, int32_t bytes, uint32_t hash) { + static uint32_t crc_hash(const void* data, uint32_t bytes, uint32_t hash) { return zlib_crc_hash(data, bytes, hash); } #endif @@ -202,7 +194,7 @@ class HashUtil { // For example, if the data is <1000, 2000, 3000, 4000, ..> and then the mod of 1000 // is taken on the hash, all values will collide to the same bucket. // For string values, Fnv is slightly faster than boost. - static uint32_t fnv_hash(const void* data, int32_t bytes, uint32_t hash) { + static uint32_t fnv_hash(const void* data, uint32_t bytes, uint32_t hash) { const uint8_t* ptr = reinterpret_cast(data); while (bytes--) { @@ -213,7 +205,7 @@ class HashUtil { return hash; } - static uint64_t fnv_hash64(const void* data, int32_t bytes, uint64_t hash) { + static uint64_t fnv_hash64(const void* data, uint32_t bytes, uint64_t hash) { const uint8_t* ptr = reinterpret_cast(data); while (bytes--) { @@ -291,7 +283,7 @@ class HashUtil { // depending on hardware capabilities. // Seed values for different steps of the query execution should use different seeds // to prevent accidental key collisions. (See IMPALA-219 for more details). 
- static uint32_t hash(const void* data, int32_t bytes, uint32_t seed) { + static uint32_t hash(const void* data, uint32_t bytes, uint32_t seed) { #ifdef __SSE4_2__ if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) { @@ -305,7 +297,7 @@ class HashUtil { #endif } - static uint64_t hash64(const void* data, int32_t bytes, uint64_t seed) { + static uint64_t hash64(const void* data, uint64_t bytes, uint64_t seed) { #ifdef _SSE4_2_ if (LIKELY(CpuInfo::is_supported(CpuInfo::SSE4_2))) { return crc_hash64(data, bytes, seed); diff --git a/be/src/util/jsonb_document.h b/be/src/util/jsonb_document.h index 016da3142cd24c..909ee70742998e 100644 --- a/be/src/util/jsonb_document.h +++ b/be/src/util/jsonb_document.h @@ -180,7 +180,7 @@ class JsonbDocument { static JsonbDocument* createDocument(const char* pb, size_t size); // create an JsonbValue from JSONB packed bytes - static JsonbValue* createValue(const char* pb, uint32_t size); + static JsonbValue* createValue(const char* pb, size_t size); uint8_t version() { return header_.ver_; } @@ -1160,7 +1160,7 @@ inline void JsonbDocument::setValue(const JsonbValue* value) { memcpy(payload_, value, value->numPackedBytes()); } -inline JsonbValue* JsonbDocument::createValue(const char* pb, uint32_t size) { +inline JsonbValue* JsonbDocument::createValue(const char* pb, size_t size) { if (!pb || size < sizeof(JsonbHeader) + sizeof(JsonbValue)) { return nullptr; } diff --git a/be/src/util/jsonb_parser.h b/be/src/util/jsonb_parser.h index c90012a4fbef30..4192e36ea5cc80 100644 --- a/be/src/util/jsonb_parser.h +++ b/be/src/util/jsonb_parser.h @@ -84,16 +84,16 @@ class JsonbParserT { // parse a UTF-8 JSON string bool parse(const std::string& str, hDictInsert handler = nullptr) { - return parse(str.c_str(), (unsigned int)str.size(), handler); + return parse(str.c_str(), str.size(), handler); } // parse a UTF-8 JSON c-style string (NULL terminated) bool parse(const char* c_str, hDictInsert handler = nullptr) { - return parse(c_str, (unsigned 
int)strlen(c_str), handler); + return parse(c_str, strlen(c_str), handler); } // parse a UTF-8 JSON string with length - bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) { + bool parse(const char* pch, size_t len, hDictInsert handler = nullptr) { if (!pch || len == 0) { err_ = JsonbErrType::E_EMPTY_DOCUMENT; return false; diff --git a/be/src/util/jsonb_parser_simd.h b/be/src/util/jsonb_parser_simd.h index 6621912a9d0400..96ce866f74e256 100644 --- a/be/src/util/jsonb_parser_simd.h +++ b/be/src/util/jsonb_parser_simd.h @@ -85,16 +85,16 @@ class JsonbParserTSIMD { // parse a UTF-8 JSON string bool parse(const std::string& str, hDictInsert handler = nullptr) { - return parse(str.c_str(), (unsigned int)str.size(), handler); + return parse(str.c_str(), str.size(), handler); } // parse a UTF-8 JSON c-style string (NULL terminated) bool parse(const char* c_str, hDictInsert handler = nullptr) { - return parse(c_str, (unsigned int)strlen(c_str), handler); + return parse(c_str, strlen(c_str), handler); } // parse a UTF-8 JSON string with length - bool parse(const char* pch, unsigned int len, hDictInsert handler = nullptr) { + bool parse(const char* pch, size_t len, hDictInsert handler = nullptr) { // reset state before parse reset(); diff --git a/be/src/util/jsonb_stream.h b/be/src/util/jsonb_stream.h index 4567ab8384bd9d..2ea5d9090c735b 100644 --- a/be/src/util/jsonb_stream.h +++ b/be/src/util/jsonb_stream.h @@ -72,7 +72,7 @@ class JsonbInBuffer : public std::streambuf { */ class JsonbOutStream : public std::ostream { public: - explicit JsonbOutStream(uint32_t capacity = 1024) + explicit JsonbOutStream(uint64_t capacity = 1024) : std::ostream(nullptr), head_(nullptr), size_(0), capacity_(capacity), alloc_(true) { if (capacity_ == 0) { capacity_ = 1024; @@ -81,7 +81,7 @@ class JsonbOutStream : public std::ostream { head_ = (char*)malloc(capacity_); } - JsonbOutStream(char* buffer, uint32_t capacity) + JsonbOutStream(char* buffer, uint64_t capacity) 
: std::ostream(nullptr), head_(buffer), size_(0), capacity_(capacity), alloc_(false) { assert(buffer && capacity_ > 0); } @@ -94,10 +94,12 @@ class JsonbOutStream : public std::ostream { void put(char c) { write(&c, 1); } - void write(const char* c_str) { write(c_str, (uint32_t)strlen(c_str)); } + void write(const char* c_str) { write(c_str, strlen(c_str)); } - void write(const char* bytes, uint32_t len) { - if (len == 0) return; + void write(const char* bytes, uint64_t len) { + if (len == 0) { + return; + } if (size_ + len > capacity_) { realloc(len); @@ -156,14 +158,14 @@ class JsonbOutStream : public std::ostream { pos_type tellp() const { return size_; } - void seekp(pos_type pos) { size_ = (uint32_t)pos; } + void seekp(pos_type pos) { size_ = (uint64_t)pos; } const char* getBuffer() const { return head_; } pos_type getSize() const { return tellp(); } private: - void realloc(uint32_t len) { + void realloc(uint64_t len) { assert(capacity_ > 0); capacity_ *= 2; @@ -186,8 +188,8 @@ class JsonbOutStream : public std::ostream { private: char* head_ = nullptr; - uint32_t size_; - uint32_t capacity_; + uint64_t size_; + uint64_t capacity_; bool alloc_; }; diff --git a/be/src/util/jsonb_writer.h b/be/src/util/jsonb_writer.h index 61bd28bb783bd2..52d912d29d3b6d 100644 --- a/be/src/util/jsonb_writer.h +++ b/be/src/util/jsonb_writer.h @@ -315,7 +315,9 @@ class JsonbWriterT { return false; } - uint32_t writeString(const char* str, uint32_t len) { + // TODO: here changed length to uint64_t, as some api also need changed, But the thirdparty api is uint_32t + // need consider a better way to handle case. 
+ uint64_t writeString(const char* str, uint64_t len) { if (kvState_ == WS_String) { os_->write(str, len); return len; @@ -324,9 +326,7 @@ class JsonbWriterT { return 0; } - uint32_t writeString(const std::string& str) { - return writeString(str.c_str(), (uint32_t)str.size()); - } + uint32_t writeString(const std::string& str) { return writeString(str.c_str(), str.size()); } uint32_t writeString(char ch) { if (kvState_ == WS_String) { os_->put(ch); @@ -372,7 +372,7 @@ class JsonbWriterT { return false; } - uint32_t writeBinary(const char* bin, uint32_t len) { + uint64_t writeBinary(const char* bin, uint64_t len) { if (kvState_ == WS_Binary) { os_->write(bin, len); return len; @@ -483,8 +483,7 @@ class JsonbWriterT { } JsonbValue* getValue() { - return JsonbDocument::createValue(getOutput()->getBuffer(), - (uint32_t)getOutput()->getSize()); + return JsonbDocument::createValue(getOutput()->getBuffer(), getOutput()->getSize()); } bool writeEnd() { diff --git a/be/src/util/jvm_metrics.cpp b/be/src/util/jvm_metrics.cpp index 4cb71f5e827878..b1089ef413628d 100644 --- a/be/src/util/jvm_metrics.cpp +++ b/be/src/util/jvm_metrics.cpp @@ -485,8 +485,8 @@ Status JvmStats::refresh(JvmMetrics* jvm_metrics) const { jvm_metrics->jvm_thread_count->set_value(threadCount < 0 ? 0 : threadCount); for (int i = 0; i < threadCount; i++) { - JNI_CALL_METHOD_CHECK_EXCEPTION(jobject, threadInfo, env, - GetObjectArrayElement((jobjectArray)threadInfos, i)); + JNI_CALL_METHOD_CHECK_EXCEPTION_DELETE_REF( + jobject, threadInfo, env, GetObjectArrayElement((jobjectArray)threadInfos, i)); if (threadInfo == nullptr) { continue; diff --git a/be/src/util/mem_info.cpp b/be/src/util/mem_info.cpp index 36579452db3f85..fe9cf84b2aed54 100644 --- a/be/src/util/mem_info.cpp +++ b/be/src/util/mem_info.cpp @@ -197,9 +197,10 @@ void MemInfo::refresh_proc_meminfo() { _s_cgroup_mem_limit = std::numeric_limits::max(); // find cgroup limit failed, wait 300s, 1000 * 100ms. 
_s_cgroup_mem_refresh_wait_times = -3000; - LOG(INFO) << "Refresh cgroup memory limit failed, refresh again after 300s, cgroup " - "mem limit: " - << _s_cgroup_mem_limit; + LOG(WARNING) + << "Refresh cgroup memory limit failed, refresh again after 300s, cgroup " + "mem limit: " + << _s_cgroup_mem_limit << ", " << status; } else { _s_cgroup_mem_limit = cgroup_mem_limit; // wait 10s, 100 * 100ms, avoid too frequently. @@ -209,12 +210,17 @@ void MemInfo::refresh_proc_meminfo() { _s_cgroup_mem_refresh_wait_times++; } + // cgroup mem limit is refreshed every 10 seconds, + // cgroup mem usage is refreshed together with memInfo every time, which is very frequent. if (_s_cgroup_mem_limit != std::numeric_limits::max()) { int64_t cgroup_mem_usage; auto status = CGroupMemoryCtl::find_cgroup_mem_usage(&cgroup_mem_usage); if (!status.ok()) { _s_cgroup_mem_usage = std::numeric_limits::min(); _s_cgroup_mem_refresh_state = false; + LOG_EVERY_N(WARNING, 500) + << "Refresh cgroup memory usage failed, cgroup mem limit: " + << _s_cgroup_mem_limit << ", " << status; } else { _s_cgroup_mem_usage = cgroup_mem_usage; _s_cgroup_mem_refresh_state = true; @@ -279,6 +285,12 @@ void MemInfo::refresh_proc_meminfo() { mem_available = _mem_info_bytes["MemAvailable"]; } if (_s_cgroup_mem_refresh_state) { + // Note, CgroupV2 MemAvailable is usually a little smaller than Process MemAvailable. + // Process `MemAvailable = MemFree - LowWaterMark + (PageCache - min(PageCache / 2, LowWaterMark))`, + // from `MemAvailable` in `/proc/meminfo`, calculated by OS. + // CgroupV2 `MemAvailable = cgroup_mem_limit - cgroup_mem_usage`, + // `cgroup_mem_usage = memory.current - inactive_file - slab_reclaimable`, in fact, + // there seems to be some memory that can be reused in `cgroup_mem_usage`. 
if (mem_available < 0) { mem_available = _s_cgroup_mem_limit - _s_cgroup_mem_usage; } else { diff --git a/be/src/util/mysql_row_buffer.cpp b/be/src/util/mysql_row_buffer.cpp index 3e20a2d9de72fe..4fd7de13753a95 100644 --- a/be/src/util/mysql_row_buffer.cpp +++ b/be/src/util/mysql_row_buffer.cpp @@ -107,7 +107,11 @@ MysqlRowBuffer::~MysqlRowBuffer() { template void MysqlRowBuffer::open_dynamic_mode() { if (!_dynamic_mode) { - *_pos++ = NEXT_EIGHT_BYTE; + // if _pos now exactly at the end of _buf memory, + // we should reserve 1 byte for _dynamic_mode flag byte to avoid *pos = 254 + // cause _dynamic_mode flag byte be overwritten + reserve(1); + *_pos++ = NEXT_EIGHT_BYTE; // *_pos = 254 ; _pos++ // write length when dynamic mode close _len_pos = (_pos - _buf); _pos = _pos + 8; diff --git a/be/src/util/ref_count_closure.h b/be/src/util/ref_count_closure.h index 92772a82373fec..560aebb98ee15e 100644 --- a/be/src/util/ref_count_closure.h +++ b/be/src/util/ref_count_closure.h @@ -20,7 +20,9 @@ #include #include +#include +#include "runtime/query_context.h" #include "runtime/thread_context.h" #include "service/brpc.h" #include "util/ref_count_closure.h" @@ -79,8 +81,9 @@ class AutoReleaseClosure : public google::protobuf::Closure { ENABLE_FACTORY_CREATOR(AutoReleaseClosure); public: - AutoReleaseClosure(std::shared_ptr req, std::shared_ptr callback) - : request_(req), callback_(callback) { + AutoReleaseClosure(std::shared_ptr req, std::shared_ptr callback, + std::weak_ptr context = {}) + : request_(req), callback_(callback), context_(std::move(context)) { this->cntl_ = callback->cntl_; this->response_ = callback->response_; } @@ -113,12 +116,22 @@ class AutoReleaseClosure : public google::protobuf::Closure { protected: virtual void _process_if_rpc_failed() { - LOG(WARNING) << "RPC meet failed: " << cntl_->ErrorText(); + std::string error_msg = "RPC meet failed: " + cntl_->ErrorText(); + if (auto ctx = context_.lock(); ctx) { + ctx->cancel(Status::NetworkError(error_msg)); 
+ } else { + LOG(WARNING) << error_msg; + } } virtual void _process_if_meet_error_status(const Status& status) { - // no need to log END_OF_FILE, reduce the unlessful log - if (!status.is()) { + if (status.is()) { + // no need to log END_OF_FILE, reduce the unlessful log + return; + } + if (auto ctx = context_.lock(); ctx) { + ctx->cancel(status); + } else { LOG(WARNING) << "RPC meet error status: " << status; } } @@ -136,6 +149,7 @@ class AutoReleaseClosure : public google::protobuf::Closure { // Use a weak ptr to keep the callback, so that the callback can be deleted if the main // thread is freed. Weak callback_; + std::weak_ptr context_; }; } // namespace doris diff --git a/be/src/util/runtime_profile.cpp b/be/src/util/runtime_profile.cpp index e87301880d2479..1df4d8b55c278e 100644 --- a/be/src/util/runtime_profile.cpp +++ b/be/src/util/runtime_profile.cpp @@ -28,6 +28,7 @@ #include #include #include +#include #include "common/object_pool.h" #include "util/container_util.hpp" @@ -72,8 +73,7 @@ void RuntimeProfile::merge(RuntimeProfile* other) { dst_iter = _counter_map.find(src_iter->first); if (dst_iter == _counter_map.end()) { - _counter_map[src_iter->first] = _pool->add( - new Counter(src_iter->second->type(), src_iter->second->value())); + _counter_map[src_iter->first] = _pool->add(src_iter->second->clone()); } else { DCHECK(dst_iter->second->type() == src_iter->second->type()); @@ -363,28 +363,24 @@ const std::string* RuntimeProfile::get_info_string(const std::string& key) { return &it->second; } -#define ADD_COUNTER_IMPL(NAME, T) \ - RuntimeProfile::T* RuntimeProfile::NAME(const std::string& name, TUnit::type unit, \ - const std::string& parent_counter_name, \ - int64_t level) { \ - DCHECK_EQ(_is_averaged_profile, false); \ - std::lock_guard l(_counter_map_lock); \ - if (_counter_map.find(name) != _counter_map.end()) { \ - return reinterpret_cast(_counter_map[name]); \ - } \ - DCHECK(parent_counter_name == ROOT_COUNTER || \ - 
_counter_map.find(parent_counter_name) != _counter_map.end()); \ - T* counter = _pool->add(new T(unit, level)); \ - _counter_map[name] = counter; \ - std::set* child_counters = \ - find_or_insert(&_child_counter_map, parent_counter_name, std::set()); \ - child_counters->insert(name); \ - return counter; \ - } - -//ADD_COUNTER_IMPL(AddCounter, Counter); -ADD_COUNTER_IMPL(AddHighWaterMarkCounter, HighWaterMarkCounter); -//ADD_COUNTER_IMPL(AddConcurrentTimerCounter, ConcurrentTimerCounter); +RuntimeProfile::HighWaterMarkCounter* RuntimeProfile::AddHighWaterMarkCounter( + const std::string& name, TUnit::type unit, const std::string& parent_counter_name, + int64_t level) { + DCHECK_EQ(_is_averaged_profile, false); + std::lock_guard l(_counter_map_lock); + if (_counter_map.find(name) != _counter_map.end()) { + return reinterpret_cast(_counter_map[name]); + } + DCHECK(parent_counter_name == ROOT_COUNTER || + _counter_map.find(parent_counter_name) != _counter_map.end()); + RuntimeProfile::HighWaterMarkCounter* counter = + _pool->add(new RuntimeProfile::HighWaterMarkCounter(unit, level, parent_counter_name)); + _counter_map[name] = counter; + std::set* child_counters = + find_or_insert(&_child_counter_map, parent_counter_name, std::set()); + child_counters->insert(name); + return counter; +} std::shared_ptr RuntimeProfile::AddSharedHighWaterMarkCounter( const std::string& name, TUnit::type unit, const std::string& parent_counter_name) { @@ -395,7 +391,8 @@ std::shared_ptr RuntimeProfile::AddSharedH } DCHECK(parent_counter_name == ROOT_COUNTER || _counter_map.find(parent_counter_name) != _counter_map.end()); - std::shared_ptr counter = std::make_shared(unit); + std::shared_ptr counter = + std::make_shared(unit, 2, parent_counter_name); _shared_counter_pool[name] = counter; DCHECK(_counter_map.find(name) == _counter_map.end()) @@ -577,8 +574,6 @@ void RuntimeProfile::to_thrift(TRuntimeProfileTree* tree) { } void RuntimeProfile::to_thrift(std::vector* nodes) { - 
nodes->reserve(nodes->size() + _children.size()); - int index = nodes->size(); nodes->push_back(TRuntimeProfileNode()); TRuntimeProfileNode& node = (*nodes)[index]; @@ -605,10 +600,13 @@ void RuntimeProfile::to_thrift(std::vector* nodes) { ChildVector children; { + // _children may be modified during to_thrift(), + // so we have to lock and copy _children to avoid race condition std::lock_guard l(_children_lock); children = _children; } node.num_children = children.size(); + nodes->reserve(nodes->size() + children.size()); for (int i = 0; i < children.size(); ++i) { int child_idx = nodes->size(); @@ -697,17 +695,14 @@ void RuntimeProfile::print_child_counters(const std::string& prefix, const CounterMap& counter_map, const ChildCounterMap& child_counter_map, std::ostream* s) { - std::ostream& stream = *s; - ChildCounterMap::const_iterator itr = child_counter_map.find(counter_name); + auto itr = child_counter_map.find(counter_name); if (itr != child_counter_map.end()) { const std::set& child_counters = itr->second; for (const std::string& child_counter : child_counters) { - CounterMap::const_iterator iter = counter_map.find(child_counter); + auto iter = counter_map.find(child_counter); DCHECK(iter != counter_map.end()); - stream << prefix << " - " << iter->first << ": " - << PrettyPrinter::print(iter->second->value(), iter->second->type()) - << std::endl; + iter->second->pretty_print(s, prefix, iter->first); RuntimeProfile::print_child_counters(prefix + " ", child_counter, counter_map, child_counter_map, s); } diff --git a/be/src/util/runtime_profile.h b/be/src/util/runtime_profile.h index 955d77b72aa51c..7130acbd2f9427 100644 --- a/be/src/util/runtime_profile.h +++ b/be/src/util/runtime_profile.h @@ -39,6 +39,7 @@ #include "common/compiler_util.h" // IWYU pragma: keep #include "util/binary_cast.hpp" +#include "util/container_util.hpp" #include "util/pretty_printer.h" #include "util/stopwatch.hpp" @@ -99,6 +100,8 @@ class RuntimeProfile { : _value(value), 
_type(type), _level(level) {} virtual ~Counter() = default; + virtual Counter* clone() const { return new Counter(type(), value(), _level); } + virtual void update(int64_t delta) { _value.fetch_add(delta, std::memory_order_relaxed); } void bit_or(int64_t delta) { _value.fetch_or(delta, std::memory_order_relaxed); } @@ -126,9 +129,17 @@ class RuntimeProfile { tcounters.push_back(std::move(counter)); } + virtual void pretty_print(std::ostream* s, const std::string& prefix, + const std::string& name) const { + std::ostream& stream = *s; + stream << prefix << " - " << name << ": " + << PrettyPrinter::print(_value.load(std::memory_order_relaxed), type()) + << std::endl; + } + TUnit::type type() const { return _type; } - virtual int64_t level() { return _level; } + virtual int64_t level() const { return _level; } private: friend class RuntimeProfile; @@ -142,15 +153,56 @@ class RuntimeProfile { /// as value()) and the current value. class HighWaterMarkCounter : public Counter { public: - HighWaterMarkCounter(TUnit::type unit, int64_t level = 2) - : Counter(unit, 0, level), current_value_(0) {} + HighWaterMarkCounter(TUnit::type unit, int64_t level, const std::string& parent_name, + int64_t value = 0, int64_t current_value = 0) + : Counter(unit, value, level), + current_value_(current_value), + _parent_name(parent_name) {} + + virtual Counter* clone() const override { + return new HighWaterMarkCounter(type(), level(), parent_name(), value(), + current_value()); + } - virtual void add(int64_t delta) { + void add(int64_t delta) { current_value_.fetch_add(delta, std::memory_order_relaxed); if (delta > 0) { UpdateMax(current_value_); } } + virtual void update(int64_t delta) override { add(delta); } + + virtual void to_thrift( + const std::string& name, std::vector& tcounters, + std::map>& child_counters_map) override { + { + TCounter counter; + counter.name = name; + counter.value = this->current_value(); + counter.type = this->type(); + counter.__set_level(this->level()); + 
tcounters.push_back(std::move(counter)); + } + { + TCounter counter; + std::string peak_name = name + "Peak"; + counter.name = peak_name; + counter.value = this->value(); + counter.type = this->type(); + counter.__set_level(this->level()); + tcounters.push_back(std::move(counter)); + child_counters_map[_parent_name].insert(peak_name); + } + } + + virtual void pretty_print(std::ostream* s, const std::string& prefix, + const std::string& name) const override { + std::ostream& stream = *s; + stream << prefix << " - " << name + << " Current: " << PrettyPrinter::print(current_value(), type()) << " (Peak: " + << PrettyPrinter::print(_value.load(std::memory_order_relaxed), type()) << ")" + << std::endl; + } /// Tries to increase the current value by delta. If current_value() + delta /// exceeds max, return false and current_value is not changed. @@ -174,6 +226,8 @@ class RuntimeProfile { int64_t current_value() const { return current_value_.load(std::memory_order_relaxed); } + std::string parent_name() const { return _parent_name; } + private: /// Set '_value' to 'v' if 'v' is larger than '_value'. The entire operation is /// atomic. @@ -194,6 +248,8 @@ class RuntimeProfile { /// The current value of the counter. _value in the super class represents /// the high water mark. std::atomic current_value_; + + const std::string _parent_name; }; using DerivedCounterFunction = std::function; @@ -202,8 +258,13 @@ class RuntimeProfile { // Do not call Set() and Update(). 
class DerivedCounter : public Counter { public: - DerivedCounter(TUnit::type type, const DerivedCounterFunction& counter_fn) - : Counter(type, 0), _counter_fn(counter_fn) {} + DerivedCounter(TUnit::type type, const DerivedCounterFunction& counter_fn, + int64_t value = 0, int64_t level = 1) + : Counter(type, value, level), _counter_fn(counter_fn) {} + + virtual Counter* clone() const override { + return new DerivedCounter(type(), _counter_fn, value(), level()); + } int64_t value() const override { return _counter_fn(); } @@ -214,8 +275,13 @@ class RuntimeProfile { // NonZeroCounter will not be converted to Thrift if the value is 0. class NonZeroCounter : public Counter { public: - NonZeroCounter(TUnit::type type, int64_t level, const std::string& parent_name) - : Counter(type, 0, level), _parent_name(parent_name) {} + NonZeroCounter(TUnit::type type, int64_t level, const std::string& parent_name, + int64_t value = 0) + : Counter(type, value, level), _parent_name(parent_name) {} + + virtual Counter* clone() const override { + return new NonZeroCounter(type(), level(), parent_name(), value()); + } void to_thrift(const std::string& name, std::vector& tcounters, std::map>& child_counters_map) override { @@ -227,6 +293,8 @@ class RuntimeProfile { } } + std::string parent_name() const { return _parent_name; } + private: const std::string _parent_name; }; @@ -561,10 +629,6 @@ class RuntimeProfile { static void print_child_counters(const std::string& prefix, const std::string& counter_name, const CounterMap& counter_map, const ChildCounterMap& child_counter_map, std::ostream* s); - - static std::string print_counter(Counter* counter) { - return PrettyPrinter::print(counter->value(), counter->type()); - } }; // Utility class to update the counter at object construction and destruction. 
diff --git a/be/src/util/security.h b/be/src/util/security.h new file mode 100644 index 00000000000000..d2201b1b297b70 --- /dev/null +++ b/be/src/util/security.h @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +namespace doris { + +inline std::string mask_token(const std::string& str) { + std::regex pattern("token=[\\w|-]+"); + return std::regex_replace(str, pattern, "token=******"); +} + +inline std::string mask_token(const char* str) { + std::regex pattern("token=[\\w|-]+"); + return std::regex_replace(str, pattern, "token=******"); +} + +} // namespace doris diff --git a/be/src/util/simd/vstring_function.h b/be/src/util/simd/vstring_function.h index 99313132382e5c..bfa75b728d5620 100644 --- a/be/src/util/simd/vstring_function.h +++ b/be/src/util/simd/vstring_function.h @@ -309,8 +309,11 @@ class VStringFunctions { // is to say, counting bytes which do not match 10xx_xxxx pattern. // All 0xxx_xxxx, 110x_xxxx, 1110_xxxx and 1111_0xxx are greater than 1011_1111 when use int8_t arithmetic, // so just count bytes greater than 1011_1111 in a byte string as the result of utf8_length. 
- static inline size_t get_char_len(const char* src, size_t len) { - size_t char_len = 0; + // get_char_len is used to return the UTF-8 length of a string. + // The return value will never exceed len. + template + static inline T get_char_len(const char* src, T len) { + T char_len = 0; const char* p = src; const char* end = p + len; #if defined(__SSE2__) || defined(__aarch64__) diff --git a/be/src/util/slice.h b/be/src/util/slice.h index fd6bcf0adfb510..b70a82e17ce74d 100644 --- a/be/src/util/slice.h +++ b/be/src/util/slice.h @@ -302,27 +302,6 @@ inline int Slice::compare(const Slice& b) const { return r; } -/// @brief STL map whose keys are Slices. -/// -/// An example of usage: -/// @code -/// typedef SliceMap::type MySliceMap; -/// -/// MySliceMap my_map; -/// my_map.insert(MySliceMap::value_type(a, 1)); -/// my_map.insert(MySliceMap::value_type(b, 2)); -/// my_map.insert(MySliceMap::value_type(c, 3)); -/// -/// for (const MySliceMap::value_type& pair : my_map) { -/// ... -/// } -/// @endcode -template -struct SliceMap { - /// A handy typedef for the slice map with appropriate comparison operator. - typedef std::map type; -}; - // A move-only type which manage the lifecycle of externally allocated data. // Unlike std::unique_ptr, OwnedSlice remembers the size of data so that clients can access // the underlying buffer as a Slice. 
diff --git a/be/src/util/system_metrics.cpp b/be/src/util/system_metrics.cpp index c1385b6244bf62..fc2cdcc9262b31 100644 --- a/be/src/util/system_metrics.cpp +++ b/be/src/util/system_metrics.cpp @@ -33,11 +33,26 @@ #include "gutil/strings/split.h" // for string split #include "gutil/strtoint.h" // for atoi64 +#include "util/cgroup_util.h" #include "util/mem_info.h" #include "util/perf_counters.h" namespace doris { +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(avail_cpu_num, MetricUnit::NOUNIT); + +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(host_cpu_num, MetricUnit::NOUNIT); +struct CpuNumberMetrics { + CpuNumberMetrics(MetricEntity* ent) : entity(ent) { + INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, host_cpu_num); + INT_ATOMIC_COUNTER_METRIC_REGISTER(entity, avail_cpu_num); + } + + IntAtomicCounter* host_cpu_num {nullptr}; + IntAtomicCounter* avail_cpu_num {nullptr}; + MetricEntity* entity = nullptr; +}; + #define DEFINE_CPU_COUNTER_METRIC(metric) \ DEFINE_COUNTER_METRIC_PROTOTYPE_5ARG(cpu_##metric, MetricUnit::PERCENT, "", cpu, \ Labels({{"mode", #metric}})); @@ -386,11 +401,22 @@ void SystemMetrics::update() { void SystemMetrics::_install_cpu_metrics() { get_cpu_name(); + + int cpu_num = 0; for (auto cpu_name : _cpu_names) { + // NOTE: cpu_name comes from /proc/stat which named 'cpu' is not a real cpu name, it should be skipped. 
+ if (cpu_name != "cpu") { + cpu_num++; + } auto cpu_entity = _registry->register_entity(cpu_name, {{"device", cpu_name}}); CpuMetrics* metrics = new CpuMetrics(cpu_entity.get()); _cpu_metrics.emplace(cpu_name, metrics); } + + auto cpu_num_entity = _registry->register_entity("doris_be_host_cpu_num"); + _cpu_num_metrics = std::make_unique(cpu_num_entity.get()); + + _cpu_num_metrics->host_cpu_num->set_value(cpu_num); } #ifdef BE_TEST @@ -983,6 +1009,14 @@ void SystemMetrics::_update_proc_metrics() { fclose(fp); } +void SystemMetrics::update_be_avail_cpu_num() { + int64_t physical_cpu_num = _cpu_num_metrics->host_cpu_num->value(); + if (physical_cpu_num > 0) { + physical_cpu_num = CGroupUtil::get_cgroup_limited_cpu_number(physical_cpu_num); + _cpu_num_metrics->avail_cpu_num->set_value(physical_cpu_num); + } +} + void SystemMetrics::get_metrics_from_proc_vmstat() { #ifdef BE_TEST FILE* fp = fopen(k_ut_vmstat_path, "r"); diff --git a/be/src/util/system_metrics.h b/be/src/util/system_metrics.h index c72ba3693012fb..2c5446b81f4f71 100644 --- a/be/src/util/system_metrics.h +++ b/be/src/util/system_metrics.h @@ -31,6 +31,7 @@ namespace doris { struct CpuMetrics; +struct CpuNumberMetrics; struct MemoryMetrics; struct DiskMetrics; struct NetworkMetrics; @@ -65,6 +66,8 @@ class SystemMetrics { void update_max_network_receive_bytes_rate(int64_t max_receive_bytes_rate); void update_allocator_metrics(); + void update_be_avail_cpu_num(); + private: void _install_cpu_metrics(); // On Intel(R) Xeon(R) CPU E5-2450 0 @ 2.10GHz; @@ -99,6 +102,7 @@ class SystemMetrics { static const char* _s_hook_name; std::map _cpu_metrics; + std::unique_ptr _cpu_num_metrics; std::unique_ptr _memory_metrics; std::map _disk_metrics; std::map _network_metrics; diff --git a/be/src/util/threadpool.cpp b/be/src/util/threadpool.cpp index 15fb36181d4336..f5ea38515def36 100644 --- a/be/src/util/threadpool.cpp +++ b/be/src/util/threadpool.cpp @@ -75,7 +75,8 @@ ThreadPoolBuilder& 
ThreadPoolBuilder::set_max_queue_size(int max_queue_size) { return *this; } -ThreadPoolBuilder& ThreadPoolBuilder::set_cgroup_cpu_ctl(CgroupCpuCtl* cgroup_cpu_ctl) { +ThreadPoolBuilder& ThreadPoolBuilder::set_cgroup_cpu_ctl( + std::weak_ptr cgroup_cpu_ctl) { _cgroup_cpu_ctl = cgroup_cpu_ctl; return *this; } @@ -476,8 +477,8 @@ void ThreadPool::dispatch_thread() { _num_threads++; _num_threads_pending_start--; - if (_cgroup_cpu_ctl != nullptr) { - static_cast(_cgroup_cpu_ctl->add_thread_to_cgroup()); + if (std::shared_ptr cg_cpu_ctl_sptr = _cgroup_cpu_ctl.lock()) { + static_cast(cg_cpu_ctl_sptr->add_thread_to_cgroup()); } // Owned by this worker thread and added/removed from _idle_threads as needed. diff --git a/be/src/util/threadpool.h b/be/src/util/threadpool.h index 5ce27e2f27b9a5..f822c307aa6b8e 100644 --- a/be/src/util/threadpool.h +++ b/be/src/util/threadpool.h @@ -20,12 +20,11 @@ #pragma once -#include -#include - #include #include #include +#include +#include // IWYU pragma: no_include #include // IWYU pragma: keep #include @@ -50,7 +49,7 @@ class ThreadPoolToken; class Runnable { public: virtual void run() = 0; - virtual ~Runnable() {} + virtual ~Runnable() = default; }; // ThreadPool takes a lot of arguments. We provide sane defaults with a builder. 
@@ -107,7 +106,7 @@ class ThreadPoolBuilder { ThreadPoolBuilder& set_min_threads(int min_threads); ThreadPoolBuilder& set_max_threads(int max_threads); ThreadPoolBuilder& set_max_queue_size(int max_queue_size); - ThreadPoolBuilder& set_cgroup_cpu_ctl(CgroupCpuCtl* cgroup_cpu_ctl); + ThreadPoolBuilder& set_cgroup_cpu_ctl(std::weak_ptr cgroup_cpu_ctl); template ThreadPoolBuilder& set_idle_timeout(const std::chrono::duration& idle_timeout) { _idle_timeout = std::chrono::duration_cast(idle_timeout); @@ -127,18 +126,18 @@ class ThreadPoolBuilder { return Status::OK(); } + ThreadPoolBuilder(const ThreadPoolBuilder&) = delete; + void operator=(const ThreadPoolBuilder&) = delete; + private: friend class ThreadPool; const std::string _name; int _min_threads; int _max_threads; int _max_queue_size; - CgroupCpuCtl* _cgroup_cpu_ctl = nullptr; + std::weak_ptr _cgroup_cpu_ctl; std::chrono::milliseconds _idle_timeout; - ThreadPoolBuilder(const ThreadPoolBuilder&) = delete; - void operator=(const ThreadPoolBuilder&) = delete; - template static constexpr bool always_false_v = false; }; @@ -256,13 +255,22 @@ class ThreadPool { return _total_queued_tasks; } - std::vector debug_info() { + std::vector debug_info() const { std::lock_guard l(_lock); std::vector arr = {_num_threads, static_cast(_threads.size()), _min_threads, _max_threads}; return arr; } + std::string get_info() const { + std::lock_guard l(_lock); + return fmt::format("ThreadPool(name={}, threads(active/pending)=({}/{}), queued_task={})", + _name, _active_threads, _num_threads_pending_start, _total_queued_tasks); + } + + ThreadPool(const ThreadPool&) = delete; + void operator=(const ThreadPool&) = delete; + private: friend class ThreadPoolBuilder; friend class ThreadPoolToken; @@ -345,7 +353,7 @@ class ThreadPool { // Protected by _lock. int _total_queued_tasks; - CgroupCpuCtl* _cgroup_cpu_ctl = nullptr; + std::weak_ptr _cgroup_cpu_ctl; // All allocated tokens. 
// @@ -372,7 +380,7 @@ class ThreadPool { // // Protected by _lock. struct IdleThread : public boost::intrusive::list_base_hook<> { - explicit IdleThread() {} + explicit IdleThread() = default; // Condition variable for "queue is not empty". Waiters wake up when a new // task is queued. @@ -384,9 +392,6 @@ class ThreadPool { // ExecutionMode::CONCURRENT token used by the pool for tokenless submission. std::unique_ptr _tokenless; - - ThreadPool(const ThreadPool&) = delete; - void operator=(const ThreadPool&) = delete; }; // Entry point for token-based task submission and blocking for a particular @@ -434,6 +439,9 @@ class ThreadPoolToken { return _entries.size(); } + ThreadPoolToken(const ThreadPoolToken&) = delete; + void operator=(const ThreadPoolToken&) = delete; + private: // All possible token states. Legal state transitions: // IDLE -> RUNNING: task is submitted via token @@ -516,9 +524,6 @@ class ThreadPoolToken { int _num_submitted_tasks; // Number of tasks which has not been submitted to the thread pool's queue. int _num_unsubmitted_tasks; - - ThreadPoolToken(const ThreadPoolToken&) = delete; - void operator=(const ThreadPoolToken&) = delete; }; } // namespace doris diff --git a/be/src/util/work_thread_pool.hpp b/be/src/util/work_thread_pool.hpp index 00430ff75148fc..1da8a08f90d234 100644 --- a/be/src/util/work_thread_pool.hpp +++ b/be/src/util/work_thread_pool.hpp @@ -18,7 +18,6 @@ #pragma once #include -#include #include "util/blocking_priority_queue.hpp" #include "util/blocking_queue.hpp" @@ -126,12 +125,13 @@ class WorkThreadPool { } std::string get_info() const { - return fmt::format( - "PriorityThreadPool(name={}, queue_size={}/{}, active_thread={}/{}, " - "total_get_wait_time={}, total_put_wait_time={})", - _name, get_queue_size(), _work_queue.get_capacity(), _active_threads, - _threads.size(), _work_queue.total_get_wait_time(), - _work_queue.total_put_wait_time()); + return (Priority ? 
"PriorityThreadPool" : "FifoThreadPool") + + fmt::format( + "(name={}, queue_size={}/{}, active_thread={}/{}, " + "total_get_wait_time={}, total_put_wait_time={})", + _name, get_queue_size(), _work_queue.get_capacity(), _active_threads, + _threads.size(), _work_queue.total_get_wait_time(), + _work_queue.total_put_wait_time()); } protected: diff --git a/be/src/vec/aggregate_functions/aggregate_function.h b/be/src/vec/aggregate_functions/aggregate_function.h index e9148716f99f35..d761d40c4c932c 100644 --- a/be/src/vec/aggregate_functions/aggregate_function.h +++ b/be/src/vec/aggregate_functions/aggregate_function.h @@ -20,6 +20,8 @@ #pragma once +#include + #include "common/exception.h" #include "common/status.h" #include "util/defer_op.h" @@ -36,6 +38,7 @@ #include "vec/data_types/data_type_string.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" class Arena; class IColumn; @@ -43,7 +46,7 @@ class IDataType; struct AggregateFunctionAttr { bool enable_decimal256 {false}; - std::vector> column_infos; + std::vector column_names; }; template @@ -80,7 +83,7 @@ using ConstAggregateDataPtr = const char*; */ class IAggregateFunction { public: - IAggregateFunction(const DataTypes& argument_types_) : argument_types(argument_types_) {} + IAggregateFunction(DataTypes argument_types_) : argument_types(std::move(argument_types_)) {} /// Get main function name. 
virtual String get_name() const = 0; @@ -224,7 +227,7 @@ class IAggregateFunction { virtual void set_version(const int version_) { version = version_; } - virtual AggregateFunctionPtr transmit_to_stable() { return nullptr; } + virtual IAggregateFunction* transmit_to_stable() { return nullptr; } /// Verify function signature virtual Status verify_result_type(const bool without_key, const DataTypes& argument_types, @@ -598,3 +601,5 @@ class AggregateFunctionGuard { }; } // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp index 18662bf66cf38c..8bf6c32c0872de 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.cpp @@ -29,6 +29,7 @@ #include "vec/functions/function.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" AggregateFunctionPtr create_aggregate_function_approx_count_distinct( const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.h b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.h index d267499e059818..3ef22be9fca74c 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.h +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_count_distinct.h @@ -38,6 +38,7 @@ #include "vec/io/io_helper.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class Arena; class BufferReadable; @@ -64,8 +65,7 @@ struct AggregateFunctionApproxCountDistinctData { void write(BufferWritable& buf) const { std::string result; result.resize(hll_data.max_serialized_size()); - int size = hll_data.serialize((uint8_t*)result.data()); - result.resize(size); + 
result.resize(hll_data.serialize((uint8_t*)result.data())); write_binary(result, buf); } @@ -136,3 +136,5 @@ class AggregateFunctionApproxCountDistinct final }; } // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top.h index 7885321bba3e11..399af84f43cf20 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_top.h +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top.h @@ -18,12 +18,92 @@ #pragma once #include "vec/core/types.h" +#include "vec/data_types/data_type.h" +#include "vec/data_types/data_type_nullable.h" namespace doris::vectorized { class AggregateFunctionApproxTop { public: + AggregateFunctionApproxTop(const std::vector& column_names) + : _column_names(column_names) {} + + static int32_t is_valid_const_columns(const std::vector& is_const_columns) { + int32_t true_count = 0; + bool found_false_after_true = false; + for (int32_t i = is_const_columns.size() - 1; i >= 0; --i) { + if (is_const_columns[i]) { + true_count++; + if (found_false_after_true) { + return false; + } + } else { + if (true_count > 2) { + return false; + } + found_false_after_true = true; + } + } + if (true_count > 2) { + throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid is_const_columns configuration"); + } + return true_count; + } + +protected: + void lazy_init(const IColumn** columns, ssize_t row_num, + const DataTypes& argument_types) const { + auto get_param = [](size_t idx, const DataTypes& data_types, + const IColumn** columns) -> uint64_t { + const auto& data_type = data_types.at(idx); + const IColumn* column = columns[idx]; + + const auto* type = data_type.get(); + if (type->is_nullable()) { + type = assert_cast(type) + ->get_nested_type() + .get(); + } + int64_t value = 0; + WhichDataType which(type); + if (which.idx == TypeIndex::Int8) { + value = assert_cast(column) + ->get_element(0); + 
} else if (which.idx == TypeIndex::Int16) { + value = assert_cast(column) + ->get_element(0); + } else if (which.idx == TypeIndex::Int32) { + value = assert_cast(column) + ->get_element(0); + } + if (value <= 0) { + throw Exception(ErrorCode::INVALID_ARGUMENT, + "The parameter cannot be less than or equal to 0."); + } + return value; + }; + + _threshold = + std::min(get_param(_column_names.size(), argument_types, columns), (uint64_t)4096); + _reserved = std::min( + std::max(get_param(_column_names.size() + 1, argument_types, columns), _threshold), + (uint64_t)4096); + + if (_threshold == 0 || _reserved == 0 || _threshold > 4096 || _reserved > 4096) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "approx_top_sum param error, _threshold: {}, _reserved: {}", _threshold, + _reserved); + } + + _init_flag = true; + } + static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; + + mutable std::vector _column_names; + mutable bool _init_flag = false; + mutable uint64_t _threshold = 10; + mutable uint64_t _reserved = 30; }; } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp index d6298881a90630..0aa7adc253da0f 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.cpp @@ -24,58 +24,16 @@ namespace doris::vectorized { -int32_t is_valid_const_columns(const std::vector& is_const_columns) { - int32_t true_count = 0; - bool found_false_after_true = false; - for (int32_t i = is_const_columns.size() - 1; i >= 0; --i) { - if (is_const_columns[i]) { - true_count++; - if (found_false_after_true) { - return false; - } - } else { - if (true_count > 2) { - return false; - } - found_false_after_true = true; - } - } - if (true_count > 2) { - throw Exception(ErrorCode::INVALID_ARGUMENT, "Invalid is_const_columns configuration"); - } - 
return true_count; -} - AggregateFunctionPtr create_aggregate_function_approx_top_k(const std::string& name, const DataTypes& argument_types, const bool result_is_nullable, const AggregateFunctionAttr& attr) { - if (argument_types.empty()) { + if (argument_types.size() < 3) { return nullptr; } - std::vector is_const_columns; - std::vector column_names; - for (const auto& [name, is_const] : attr.column_infos) { - is_const_columns.push_back(is_const); - if (!is_const) { - column_names.push_back(name); - } - } - - int32_t true_count = is_valid_const_columns(is_const_columns); - if (true_count == 0) { - return creator_without_type::create>( - argument_types, result_is_nullable, column_names); - } else if (true_count == 1) { - return creator_without_type::create>( - argument_types, result_is_nullable, column_names); - } else if (true_count == 2) { - return creator_without_type::create>( - argument_types, result_is_nullable, column_names); - } else { - return nullptr; - } + return creator_without_type::create( + argument_types, result_is_nullable, attr.column_names); } void register_aggregate_function_approx_top_k(AggregateFunctionSimpleFactory& factory) { diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h index 7253ae8a96e200..93ea3232c311a1 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_k.h @@ -45,28 +45,25 @@ namespace doris::vectorized { -inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF; - struct AggregateFunctionTopKGenericData { using Set = SpaceSaving; Set value; }; -template class AggregateFunctionApproxTopK final : public IAggregateFunctionDataHelper>, + AggregateFunctionApproxTopK>, AggregateFunctionApproxTop { private: using State = AggregateFunctionTopKGenericData; public: - AggregateFunctionApproxTopK(std::vector column_names, + AggregateFunctionApproxTopK(const 
std::vector& column_names, const DataTypes& argument_types_) : IAggregateFunctionDataHelper>(argument_types_), - _column_names(std::move(column_names)) {} + AggregateFunctionApproxTopK>(argument_types_), + AggregateFunctionApproxTop(column_names) {} String get_name() const override { return "approx_top_k"; } @@ -88,7 +85,7 @@ class AggregateFunctionApproxTopK final void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, Arena* arena) const override { auto readStringBinaryInto = [](Arena& arena, BufferReadable& buf) { - size_t size = 0; + uint64_t size = 0; read_var_uint(size, buf); if (UNLIKELY(size > DEFAULT_MAX_STRING_SIZE)) { @@ -104,7 +101,7 @@ class AggregateFunctionApproxTopK final auto& set = this->data(place).value; set.clear(); - size_t size = 0; + uint64_t size = 0; read_var_uint(size, buf); if (UNLIKELY(size > TOP_K_MAX_SIZE)) { throw Exception(ErrorCode::INTERNAL_ERROR, @@ -141,7 +138,7 @@ class AggregateFunctionApproxTopK final void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, Arena* arena) const override { if (!_init_flag) { - lazy_init(columns, row_num); + lazy_init(columns, row_num, this->get_argument_types()); } auto& set = this->data(place).value; @@ -227,64 +224,6 @@ class AggregateFunctionApproxTopK final std::string res = buffer.GetString(); data_to.insert_data(res.data(), res.size()); } - -private: - void lazy_init(const IColumn** columns, ssize_t row_num) const { - auto get_param = [](size_t idx, const DataTypes& data_types, - const IColumn** columns) -> uint64_t { - const auto& data_type = data_types.at(idx); - const IColumn* column = columns[idx]; - - const auto* type = data_type.get(); - if (type->is_nullable()) { - type = assert_cast(type) - ->get_nested_type() - .get(); - } - int64_t value = 0; - WhichDataType which(type); - if (which.idx == TypeIndex::Int8) { - value = assert_cast(column) - ->get_element(0); - } else if (which.idx == TypeIndex::Int16) { - value = assert_cast(column) - 
->get_element(0); - } else if (which.idx == TypeIndex::Int32) { - value = assert_cast(column) - ->get_element(0); - } - if (value <= 0) { - throw Exception(ErrorCode::INVALID_ARGUMENT, - "The parameter cannot be less than or equal to 0."); - } - return value; - }; - - const auto& data_types = this->get_argument_types(); - if (ArgsSize == 1) { - _threshold = - std::min(get_param(_column_names.size(), data_types, columns), (uint64_t)1000); - } else if (ArgsSize == 2) { - _threshold = - std::min(get_param(_column_names.size(), data_types, columns), (uint64_t)1000); - _reserved = std::min( - std::max(get_param(_column_names.size() + 1, data_types, columns), _threshold), - (uint64_t)1000); - } - - if (_threshold == 0 || _reserved == 0 || _threshold > 1000 || _reserved > 1000) { - throw Exception(ErrorCode::INTERNAL_ERROR, - "approx_top_k param error, _threshold: {}, _reserved: {}", _threshold, - _reserved); - } - - _init_flag = true; - } - - mutable std::vector _column_names; - mutable bool _init_flag = false; - mutable uint64_t _threshold = 10; - mutable uint64_t _reserved = 300; }; } // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp new file mode 100644 index 00000000000000..7325651d141c13 --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.cpp @@ -0,0 +1,71 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "vec/aggregate_functions/aggregate_function_approx_top_sum.h" + +#include "common/exception.h" +#include "vec/aggregate_functions/aggregate_function_simple_factory.h" +#include "vec/aggregate_functions/helpers.h" +#include "vec/data_types/data_type.h" + +namespace doris::vectorized { + +template +AggregateFunctionPtr create_aggregate_function_multi_top_sum_impl( + const DataTypes& argument_types, const bool result_is_nullable, + const std::vector& column_names) { + if (N == argument_types.size() - 3) { + return creator_with_type_base::template create< + AggregateFunctionApproxTopSumSimple>(argument_types, result_is_nullable, + column_names); + } else { + return create_aggregate_function_multi_top_sum_impl( + argument_types, result_is_nullable, column_names); + } +} + +template <> +AggregateFunctionPtr create_aggregate_function_multi_top_sum_impl<0>( + const DataTypes& argument_types, const bool result_is_nullable, + const std::vector& column_names) { + return creator_with_type_base::template create< + AggregateFunctionApproxTopSumSimple>(argument_types, result_is_nullable, column_names); +} + +AggregateFunctionPtr create_aggregate_function_approx_top_sum(const std::string& name, + const DataTypes& argument_types, + const bool result_is_nullable, + const AggregateFunctionAttr& attr) { + if (argument_types.size() < 3) { + return nullptr; + } + + constexpr size_t max_param_value = 10; + if (argument_types.size() > max_param_value) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "Argument types size exceeds the supported limit."); + } + + 
return create_aggregate_function_multi_top_sum_impl( + argument_types, result_is_nullable, attr.column_names); +} + +void register_aggregate_function_approx_top_sum(AggregateFunctionSimpleFactory& factory) { + factory.register_function_both("approx_top_sum", create_aggregate_function_approx_top_sum); +} + +} // namespace doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h new file mode 100644 index 00000000000000..12b89bd02b51fd --- /dev/null +++ b/be/src/vec/aggregate_functions/aggregate_function_approx_top_sum.h @@ -0,0 +1,245 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include +#include +#include +#include + +#include +#include + +#include "vec/aggregate_functions/aggregate_function.h" +#include "vec/aggregate_functions/aggregate_function_approx_top.h" +#include "vec/columns/column.h" +#include "vec/columns/column_array.h" +#include "vec/columns/column_string.h" +#include "vec/columns/column_struct.h" +#include "vec/columns/column_vector.h" +#include "vec/columns/columns_number.h" +#include "vec/common/assert_cast.h" +#include "vec/common/space_saving.h" +#include "vec/common/string_ref.h" +#include "vec/core/types.h" +#include "vec/data_types/data_type_array.h" +#include "vec/data_types/data_type_ipv4.h" +#include "vec/data_types/data_type_struct.h" +#include "vec/io/io_helper.h" + +namespace doris::vectorized { + +struct AggregateFunctionTopKGenericData { + using Set = SpaceSaving; + + Set value; +}; + +template +class AggregateFunctionApproxTopSum final + : public IAggregateFunctionDataHelper>, + AggregateFunctionApproxTop { +private: + using State = AggregateFunctionTopKGenericData; + + using ResultDataType = DataTypeNumber; + using ColVecType = ColumnVector; + using ColVecResult = ColumnVector; + +public: + AggregateFunctionApproxTopSum(const std::vector& column_names, + const DataTypes& argument_types_) + : IAggregateFunctionDataHelper>( + argument_types_), + AggregateFunctionApproxTop(column_names) {} + + String get_name() const override { return "approx_top_sum"; } + + DataTypePtr get_return_type() const override { return std::make_shared(); } + + // Serializes the aggregate function's state (including the SpaceSaving structure and threshold) into a buffer. 
+ void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override { + this->data(place).value.write(buf); + + write_var_uint(_column_names.size(), buf); + for (const auto& column_name : _column_names) { + write_string_binary(column_name, buf); + } + write_var_uint(_threshold, buf); + write_var_uint(_reserved, buf); + } + + // Deserializes the aggregate function's state from a buffer (including the SpaceSaving structure and threshold). + void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, + Arena* arena) const override { + auto readStringBinaryInto = [](Arena& arena, BufferReadable& buf) { + uint64_t size = 0; + read_var_uint(size, buf); + + if (UNLIKELY(size > DEFAULT_MAX_STRING_SIZE)) { + throw Exception(ErrorCode::INTERNAL_ERROR, "Too large string size."); + } + + char* data = arena.alloc(size); + buf.read(data, size); + + return StringRef(data, size); + }; + + auto& set = this->data(place).value; + set.clear(); + + uint64_t size = 0; + read_var_uint(size, buf); + if (UNLIKELY(size > TOP_K_MAX_SIZE)) { + throw Exception(ErrorCode::INTERNAL_ERROR, + "Too large size ({}) for aggregate function '{}' state (maximum is {})", + size, get_name(), TOP_K_MAX_SIZE); + } + + set.resize(size); + for (size_t i = 0; i < size; ++i) { + auto ref = readStringBinaryInto(*arena, buf); + uint64_t count = 0; + uint64_t error = 0; + read_var_uint(count, buf); + read_var_uint(error, buf); + set.insert(ref, count, error); + arena->rollback(ref.size); + } + + set.read_alpha_map(buf); + + uint64_t column_size = 0; + read_var_uint(column_size, buf); + _column_names.clear(); + for (uint64_t i = 0; i < column_size; i++) { + std::string column_name; + read_string_binary(column_name, buf); + _column_names.emplace_back(std::move(column_name)); + } + read_var_uint(_threshold, buf); + read_var_uint(_reserved, buf); + } + + // Adds a new row of data to the aggregate function (inserts a new value into the SpaceSaving structure). 
+ void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, + Arena* arena) const override { + if (!_init_flag) { + lazy_init(columns, row_num, this->get_argument_types()); + } + + auto& set = this->data(place).value; + if (set.capacity() != _reserved) { + set.resize(_reserved); + } + + auto all_serialize_value_into_arena = + [](size_t i, size_t keys_size, const IColumn** columns, Arena* arena) -> StringRef { + const char* begin = nullptr; + + size_t sum_size = 0; + for (size_t j = 0; j < keys_size; ++j) { + sum_size += columns[j]->serialize_value_into_arena(i, *arena, begin).size; + } + + return {begin, sum_size}; + }; + + StringRef str_serialized = + all_serialize_value_into_arena(row_num, _column_names.size(), columns, arena); + const auto& column = assert_cast( + *columns[_column_names.size() - 1]); + set.insert(str_serialized, TResult(column.get_data()[row_num])); + arena->rollback(str_serialized.size); + } + + void add_many(AggregateDataPtr __restrict place, const IColumn** columns, + std::vector& rows, Arena* arena) const override { + for (auto row : rows) { + add(place, columns, row, arena); + } + } + + void reset(AggregateDataPtr __restrict place) const override { + this->data(place).value.clear(); + } + + // Merges the state of another aggregate function into the current one (merges two SpaceSaving sets). 
+ void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, + Arena*) const override { + auto& rhs_set = this->data(rhs).value; + if (!rhs_set.size()) { + return; + } + + auto& set = this->data(place).value; + if (set.capacity() != _reserved) { + set.resize(_reserved); + } + set.merge(rhs_set); + } + + void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { + auto& data_to = assert_cast(to); + + const typename State::Set& set = this->data(place).value; + auto result_vec = set.top_k(_threshold); + + rapidjson::StringBuffer buffer; + rapidjson::PrettyWriter writer(buffer); + writer.StartArray(); + for (auto& result : result_vec) { + auto argument_types = this->get_argument_types(); + MutableColumns argument_columns(_column_names.size()); + for (size_t i = 0; i < _column_names.size(); ++i) { + argument_columns[i] = argument_types[i]->create_column(); + } + rapidjson::StringBuffer sub_buffer; + rapidjson::Writer sub_writer(sub_buffer); + sub_writer.StartObject(); + const char* begin = result.key.data; + for (size_t i = 0; i < _column_names.size(); i++) { + begin = argument_columns[i]->deserialize_and_insert_from_arena(begin); + std::string row_str = argument_types[i]->to_string(*argument_columns[i], 0); + sub_writer.Key(_column_names[i].data(), _column_names[i].size()); + sub_writer.String(row_str.data(), row_str.size()); + } + sub_writer.Key("sum"); + sub_writer.String(std::to_string(result.count).c_str()); + sub_writer.EndObject(); + writer.RawValue(sub_buffer.GetString(), sub_buffer.GetSize(), rapidjson::kObjectType); + } + writer.EndArray(); + std::string res = buffer.GetString(); + data_to.insert_data(res.data(), res.size()); + } +}; + +template +struct TopSumSimple { + using ResultType = T; + using AggregateDataType = AggregateFunctionTopKGenericData; + using Function = AggregateFunctionApproxTopSum; +}; + +template +using AggregateFunctionApproxTopSumSimple = typename TopSumSimple::Function; + +} // namespace 
doris::vectorized \ No newline at end of file diff --git a/be/src/vec/aggregate_functions/aggregate_function_avg.cpp b/be/src/vec/aggregate_functions/aggregate_function_avg.cpp index 6a6711f90f983e..6109f0b0c601cd 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_avg.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_avg.cpp @@ -25,6 +25,7 @@ #include "vec/core/field.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" template struct Avg { diff --git a/be/src/vec/aggregate_functions/aggregate_function_avg.h b/be/src/vec/aggregate_functions/aggregate_function_avg.h index 62fbb8078ea949..8b24db692aef05 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_avg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_avg.h @@ -41,6 +41,7 @@ #include "vec/io/io_helper.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class Arena; class BufferReadable; @@ -72,7 +73,8 @@ struct AggregateFunctionAvgData { ResultT result() const { if constexpr (std::is_floating_point_v) { if constexpr (std::numeric_limits::is_iec559) { - return static_cast(sum) / count; /// allow division by zero + return static_cast(sum) / + static_cast(count); /// allow division by zero } } @@ -91,7 +93,7 @@ struct AggregateFunctionAvgData { if constexpr (IsDecimal256) { return static_cast(sum / T(count)); } else { - return static_cast(sum) / count; + return static_cast(sum) / static_cast(count); } } } @@ -124,7 +126,11 @@ class AggregateFunctionAvg final IsDecimalV2, ColumnDecimal, std::conditional_t, ColumnDecimal, ColumnFloat64>>; + // The result calculated by PercentileApprox is an approximate value, + // so the underlying storage uses float. The following calls will involve + // an implicit cast to float. 
+ using DataType = typename Data::ResultType; /// ctor for native types AggregateFunctionAvg(const DataTypes& argument_types_) : IAggregateFunctionDataHelper>(argument_types_), @@ -148,9 +154,9 @@ class AggregateFunctionAvg final const auto& column = assert_cast(*columns[0]); if constexpr (IsDecimalNumber) { - this->data(place).sum += column.get_data()[row_num].value; + this->data(place).sum += (DataType)column.get_data()[row_num].value; } else { - this->data(place).sum += column.get_data()[row_num]; + this->data(place).sum += (DataType)column.get_data()[row_num]; } ++this->data(place).count; } @@ -282,3 +288,5 @@ class AggregateFunctionAvg final }; } // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.cpp b/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.cpp index fc5df5303fd15d..70a707b02e992b 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.cpp @@ -21,6 +21,7 @@ #include "vec/aggregate_functions/helpers.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" void register_aggregate_function_avg_weighted(AggregateFunctionSimpleFactory& factory) { factory.register_function_both("avg_weighted", creator_with_type::creator); diff --git a/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.h b/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.h index b59a3dccf0cea8..d1a5921b45039f 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.h +++ b/be/src/vec/aggregate_functions/aggregate_function_avg_weighted.h @@ -35,6 +35,7 @@ #include "vec/io/io_helper.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class Arena; class BufferReadable; @@ -57,7 +58,7 @@ struct AggregateFunctionAvgWeightedData { DecimalV2Value value = binary_cast(data_val); data_sum = data_sum + 
(double(value) * weight_val); } else { - data_sum = data_sum + (data_val * weight_val); + data_sum = data_sum + (double(data_val) * weight_val); } weight_sum = weight_sum + weight_val; } @@ -138,3 +139,5 @@ class AggregateFunctionAvgWeight final }; } // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_binary.h b/be/src/vec/aggregate_functions/aggregate_function_binary.h index 9fba9d11a1013a..fd5fc55d253661 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_binary.h +++ b/be/src/vec/aggregate_functions/aggregate_function_binary.h @@ -36,6 +36,7 @@ #include "vec/io/io_helper.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" template typename Moments> struct StatFunc { @@ -127,3 +128,5 @@ AggregateFunctionPtr create_with_two_basic_numeric_types(const DataTypePtr& firs } } // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_bit.cpp b/be/src/vec/aggregate_functions/aggregate_function_bit.cpp index 97a6c0e92fa723..981ced1fbd5a46 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bit.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_bit.cpp @@ -24,6 +24,7 @@ #include "vec/aggregate_functions/helpers.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" void register_aggregate_function_bit(AggregateFunctionSimpleFactory& factory) { factory.register_function_both( diff --git a/be/src/vec/aggregate_functions/aggregate_function_bit.h b/be/src/vec/aggregate_functions/aggregate_function_bit.h index 1ab01b03ceea38..d9760fdd30080b 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bit.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bit.h @@ -30,6 +30,7 @@ #include "vec/io/io_helper.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class Arena; class BufferReadable; @@ -142,4 +143,5 @@ 
class AggregateFunctionBitwise final } }; -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp b/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp index e9c86d4b9556da..47ddf2d81b6a71 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap.cpp @@ -23,6 +23,7 @@ #include "vec/data_types/data_type_nullable.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" template class AggregateFunctionTemplate> AggregateFunctionPtr create_with_int_data_type(const DataTypes& argument_type) { @@ -33,7 +34,11 @@ AggregateFunctionPtr create_with_int_data_type(const DataTypes& argument_type) { return std::make_shared>>( \ argument_type); \ } - FOR_INTEGER_TYPES(DISPATCH) + // Keep consistent with the FE definition; the function does not have an int128 type. 
+ DISPATCH(Int8) + DISPATCH(Int16) + DISPATCH(Int32) + DISPATCH(Int64) #undef DISPATCH LOG(WARNING) << "with unknowed type, failed in create_with_int_data_type bitmap_union_int" << " and type is: " << argument_type[0]->get_name(); diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h index b0619a63e1ffe8..fb17b0a80be092 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap.h @@ -38,6 +38,7 @@ #include "vec/data_types/data_type_number.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class Arena; class BufferReadable; @@ -432,4 +433,5 @@ AggregateFunctionPtr create_aggregate_function_bitmap_union(const std::string& n const DataTypes& argument_types, const bool result_is_nullable); -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp index 0b95ddfd46f0d5..2a2c86303f3000 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.cpp @@ -23,6 +23,7 @@ #include "vec/data_types/data_type_nullable.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" template AggregateFunctionPtr create_with_int_data_type(const DataTypes& argument_types) { @@ -32,7 +33,11 @@ AggregateFunctionPtr create_with_int_data_type(const DataTypes& argument_types) if (which.idx == TypeIndex::TYPE) { \ return std::make_shared>(argument_types); \ } - FOR_INTEGER_TYPES(DISPATCH) + // Keep consistent with the FE definition; the function does not have an int128 type. 
+ DISPATCH(Int8) + DISPATCH(Int16) + DISPATCH(Int32) + DISPATCH(Int64) #undef DISPATCH LOG(WARNING) << "with unknown type, failed in create_with_int_data_type bitmap_union_int" << " and type is: " << argument_types[0]->get_name(); diff --git a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h index 5747faf1b8e8c1..bff32aa606ccd2 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h +++ b/be/src/vec/aggregate_functions/aggregate_function_bitmap_agg.h @@ -31,6 +31,7 @@ #include "vec/data_types/data_type_bitmap.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class Arena; class BufferReadable; @@ -226,4 +227,5 @@ class AggregateFunctionBitmapAgg final } }; -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_collect.cpp b/be/src/vec/aggregate_functions/aggregate_function_collect.cpp index d726b7c6355318..15806c739ed58c 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_collect.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_collect.cpp @@ -26,6 +26,7 @@ #include "vec/aggregate_functions/helpers.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" template AggregateFunctionPtr do_create_agg_function_collect(bool distinct, const DataTypes& argument_types, @@ -72,12 +73,18 @@ AggregateFunctionPtr create_aggregate_function_collect_impl(const std::string& n if (which.is_date_or_datetime()) { return do_create_agg_function_collect(distinct, argument_types, result_is_nullable); - } else if (which.is_date_v2() || which.is_ipv4()) { + } else if (which.is_date_v2()) { return do_create_agg_function_collect(distinct, argument_types, result_is_nullable); - } else if (which.is_date_time_v2() || which.is_ipv6()) { + } else if (which.is_date_time_v2()) { return 
do_create_agg_function_collect(distinct, argument_types, result_is_nullable); + } else if (which.is_ipv6()) { + return do_create_agg_function_collect(distinct, argument_types, + result_is_nullable); + } else if (which.is_ipv4()) { + return do_create_agg_function_collect(distinct, argument_types, + result_is_nullable); } else if (which.is_string()) { return do_create_agg_function_collect( distinct, argument_types, result_is_nullable); diff --git a/be/src/vec/aggregate_functions/aggregate_function_collect.h b/be/src/vec/aggregate_functions/aggregate_function_collect.h index 02490be56a0bf1..2d18a56313f3f9 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_collect.h +++ b/be/src/vec/aggregate_functions/aggregate_function_collect.h @@ -46,6 +46,7 @@ #include "vec/io/var_int.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class Arena; } // namespace vectorized @@ -98,7 +99,7 @@ struct AggregateFunctionCollectSetData { } void read(BufferReadable& buf) { - size_t new_size = 0; + uint64_t new_size = 0; read_var_uint(new_size, buf); ElementNativeType x; for (size_t i = 0; i < new_size; ++i) { @@ -836,3 +837,5 @@ class AggregateFunctionCollect }; } // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_combinator.h b/be/src/vec/aggregate_functions/aggregate_function_combinator.h index 1593d74ed4e59d..0908ac8d0278f1 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_combinator.h +++ b/be/src/vec/aggregate_functions/aggregate_function_combinator.h @@ -26,6 +26,7 @@ #include "vec/data_types/data_type.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" /** Aggregate function combinator allows to take one aggregate function * and transform it to another aggregate function. 
@@ -69,3 +70,5 @@ class IAggregateFunctionCombinator { }; } // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_corr.cpp b/be/src/vec/aggregate_functions/aggregate_function_corr.cpp index cdaab6e086f4a5..e0a51ca6629a06 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_corr.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_corr.cpp @@ -21,6 +21,7 @@ #include "vec/core/types.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" template struct CorrMoment { diff --git a/be/src/vec/aggregate_functions/aggregate_function_count.cpp b/be/src/vec/aggregate_functions/aggregate_function_count.cpp index 5cfe5af41982f6..72d12cf65fe9d0 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_count.cpp @@ -26,6 +26,7 @@ #include "vec/aggregate_functions/factory_helpers.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" AggregateFunctionPtr create_aggregate_function_count(const std::string& name, const DataTypes& argument_types, diff --git a/be/src/vec/aggregate_functions/aggregate_function_count.h b/be/src/vec/aggregate_functions/aggregate_function_count.h index 7b54d074683b04..630994a7967957 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count.h +++ b/be/src/vec/aggregate_functions/aggregate_function_count.h @@ -41,6 +41,7 @@ #include "vec/io/var_int.h" namespace doris { +#include "common/compile_check_begin.h" namespace vectorized { class Arena; class BufferReadable; @@ -321,3 +322,5 @@ class AggregateFunctionCountNotNullUnary final }; } // namespace doris::vectorized + +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp index 093b31d57db554..20235d9e2ef2e9 100644 --- 
a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.cpp @@ -26,6 +26,7 @@ #include "vec/core/types.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" AggregateFunctionPtr create_aggregate_function_count_by_enum(const std::string& name, const DataTypes& argument_types, diff --git a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h index 1f5093de68263e..543ae55f872da6 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h +++ b/be/src/vec/aggregate_functions/aggregate_function_count_by_enum.h @@ -32,6 +32,7 @@ #include "vec/io/io_helper.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" struct CountByEnumData { std::unordered_map cbe; @@ -46,8 +47,7 @@ void build_json_from_vec(rapidjson::StringBuffer& buffer, doc.SetArray(); rapidjson::Document::AllocatorType& allocator = doc.GetAllocator(); - int vec_size_number = data_vec.size(); - for (int idx = 0; idx < vec_size_number; ++idx) { + for (size_t idx = 0; idx < data_vec.size(); ++idx) { rapidjson::Value obj(rapidjson::kObjectType); rapidjson::Value obj_cbe(rapidjson::kObjectType); @@ -239,4 +239,5 @@ class AggregateFunctionCountByEnum final size_t arg_count; }; -} // namespace doris::vectorized \ No newline at end of file +} // namespace doris::vectorized +#include "common/compile_check_end.h" diff --git a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp index 71d09f61de4302..d9c091fb601868 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_covar.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_covar.cpp @@ -28,21 +28,17 @@ #include "vec/data_types/data_type_nullable.h" namespace doris::vectorized { +#include "common/compile_check_begin.h" -template