Skip to content

Commit

Permalink
[opt](inverted index) Merge-on-write (MOW) unique-key tables support inverted index optimization (apache#38180)
Browse files Browse the repository at this point in the history
## Proposed changes

apache#37428
apache#37429

<!--Describe your changes.-->
  • Loading branch information
zzzxl1993 authored Aug 6, 2024
1 parent ab3057b commit ff6fa33
Show file tree
Hide file tree
Showing 6 changed files with 334 additions and 10 deletions.
47 changes: 37 additions & 10 deletions be/src/olap/rowset/segment_v2/segment_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1980,11 +1980,12 @@ Status SegmentIterator::_read_columns_by_index(uint32_t nrows_read_limit, uint32
auto debug_col_name = DebugPoints::instance()->get_debug_param_or_default<std::string>(
"segment_iterator._read_columns_by_index", "column_name", "");
if (debug_col_name.empty()) {
return Status::Error<ErrorCode::INTERNAL_ERROR>("{} does not need to read data");
return Status::Error<ErrorCode::INTERNAL_ERROR>("does not need to read data");
}
auto col_name = _opts.tablet_schema->column(cid).name();
if (debug_col_name.find(col_name) != std::string::npos) {
return Status::Error<ErrorCode::INTERNAL_ERROR>("{} does not need to read data");
return Status::Error<ErrorCode::INTERNAL_ERROR>("does not need to read data, {}",
debug_col_name);
}
})

Expand Down Expand Up @@ -2179,9 +2180,27 @@ Status SegmentIterator::_read_columns_by_rowids(std::vector<ColumnId>& read_colu
}

for (auto cid : read_column_ids) {
if (_prune_column(cid, (*mutable_columns)[cid], true, select_size)) {
auto& colunm = (*mutable_columns)[cid];
if (_no_need_read_key_data(cid, colunm, select_size)) {
continue;
}
if (_prune_column(cid, colunm, true, select_size)) {
continue;
}

DBUG_EXECUTE_IF("segment_iterator._read_columns_by_index", {
auto debug_col_name = DebugPoints::instance()->get_debug_param_or_default<std::string>(
"segment_iterator._read_columns_by_index", "column_name", "");
if (debug_col_name.empty()) {
return Status::Error<ErrorCode::INTERNAL_ERROR>("does not need to read data");
}
auto col_name = _opts.tablet_schema->column(cid).name();
if (debug_col_name.find(col_name) != std::string::npos) {
return Status::Error<ErrorCode::INTERNAL_ERROR>("does not need to read data, {}",
debug_col_name);
}
})

RETURN_IF_ERROR(_column_iterators[cid]->read_by_rowids(rowids.data(), select_size,
_current_return_columns[cid]));
}
Expand Down Expand Up @@ -2794,10 +2813,9 @@ void SegmentIterator::_calculate_pred_in_remaining_conjunct_root(

bool SegmentIterator::_no_need_read_key_data(ColumnId cid, vectorized::MutableColumnPtr& column,
size_t nrows_read) {
if (_opts.runtime_state && !_opts.runtime_state->query_options().enable_no_need_read_data_opt) {
return false;
}
if (_opts.tablet_schema->keys_type() != KeysType::DUP_KEYS) {
if (!((_opts.tablet_schema->keys_type() == KeysType::DUP_KEYS ||
(_opts.tablet_schema->keys_type() == KeysType::UNIQUE_KEYS &&
_opts.enable_unique_key_merge_on_write)))) {
return false;
}

Expand Down Expand Up @@ -2855,11 +2873,20 @@ bool SegmentIterator::_can_opt_topn_reads() const {
return false;
}

if (!_col_predicates.empty() || !_col_preds_except_leafnode_of_andnode.empty()) {
return false;
std::set<uint32_t> cids;
for (auto* pred : _col_predicates) {
cids.insert(pred->column_id());
}
for (auto* pred : _col_preds_except_leafnode_of_andnode) {
cids.insert(pred->column_id());
}

return true;
uint32_t delete_sign_idx = _opts.tablet_schema->delete_sign_idx();
bool result = std::ranges::all_of(cids.begin(), cids.end(), [delete_sign_idx](auto cid) {
return cid == delete_sign_idx;
});

return result;
}

} // namespace segment_v2
Expand Down
2 changes: 2 additions & 0 deletions be/src/vec/exprs/vexpr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -604,6 +604,8 @@ bool VExpr::fast_execute(Block& block, const ColumnNumbers& arguments, size_t re
size_t input_rows_count, const std::string& function_name) {
std::string result_column_name = gen_predicate_result_sign(block, arguments, function_name);
if (!block.has(result_column_name)) {
DBUG_EXECUTE_IF("segment_iterator.fast_execute",
{ return Status::Error<ErrorCode::INTERNAL_ERROR>("fast_execute failed"); })
return false;
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
210

-- !sql --
2

-- !sql --
4

-- !sql --
29

-- !sql --
14

-- !sql --
120

-- !sql --
2

-- !sql --
4

-- !sql --
22

-- !sql --
11

Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !sql --
893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736

-- !sql --
893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736

-- !sql --
893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736

-- !sql --
893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736

-- !sql --
893964617 40.135.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736

-- !sql --
893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736

-- !sql --
893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736

-- !sql --
893964672 26.1.0.0 GET /images/hm_bg.jpg HTTP/1.0 200 24736

Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Fault-injection regression suite for the "all predicates hit the inverted
// index" read path.
//
// Two tables with identical columns and inverted indexes are created: a
// DUPLICATE KEY table and a UNIQUE KEY table with merge-on-write enabled. The
// same count() queries are run against both while two BE debug points are
// enabled:
//   * "segment_iterator._read_columns_by_index" with column_name
//     "clientip,request" - turns any data read of those columns into an
//     INTERNAL_ERROR, so a query only succeeds if the index alone answers it.
//   * "segment_iterator.fast_execute" - NOTE(review): presumably fails the
//     query when the pre-computed index result is missing from the block;
//     confirm against VExpr::fast_execute.
// The suite is tagged "nonConcurrent" because debug points are enabled on all
// BEs and would affect other suites running at the same time.
suite("test_all_index_hit_fault_injection", "nonConcurrent") {
    // define a sql table
    def indexTbName1 = "test_all_index_hit_fault_injection_1"
    def indexTbName2 = "test_all_index_hit_fault_injection_2"

    // Table 1: DUPLICATE KEY model.
    sql "DROP TABLE IF EXISTS ${indexTbName1}"
    sql """
CREATE TABLE ${indexTbName1} (
`@timestamp` int(11) NULL COMMENT "",
`clientip` varchar(20) NULL COMMENT "",
`request` text NULL COMMENT "",
`status` int(11) NULL COMMENT "",
`size` int(11) NULL COMMENT "",
INDEX clientip_idx (`clientip`) USING INVERTED COMMENT '',
INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
) ENGINE=OLAP
DUPLICATE KEY(`@timestamp`)
COMMENT "OLAP"
DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"disable_auto_compaction" = "true"
);
"""

    // Table 2: UNIQUE KEY model with merge-on-write enabled.
    sql "DROP TABLE IF EXISTS ${indexTbName2}"
    sql """
CREATE TABLE ${indexTbName2} (
`@timestamp` int(11) NULL COMMENT "",
`clientip` varchar(20) NULL COMMENT "",
`request` text NULL COMMENT "",
`status` int(11) NULL COMMENT "",
`size` int(11) NULL COMMENT "",
INDEX clientip_idx (`clientip`) USING INVERTED COMMENT '',
INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
) ENGINE=OLAP
UNIQUE KEY(`@timestamp`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"enable_unique_key_merge_on_write" = "true",
"disable_auto_compaction" = "true"
);
"""

    // Stream-loads `file_name` into `table_name` and verifies the load result.
    //   label              - label prefix; a random UUID is appended to keep it unique
    //   read_flag          - value of the 'read_json_by_line' header
    //   format_flag        - value of the 'format' header
    //   ignore_failure     - when true (and no expected row count is given) the
    //                        result is not checked at all
    //   expected_succ_rows - when >= 0, the exact number of rows that must load;
    //                        filtered rows are then tolerated (max_filter_ratio = 1)
    //   load_to_single_tablet - currently unused; kept so existing callers still work
    def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false,
                              expected_succ_rows = -1, load_to_single_tablet = 'true' ->

        // load the json data
        streamLoad {
            table "${table_name}"

            // set http request header params
            set 'label', label + "_" + UUID.randomUUID().toString()
            set 'read_json_by_line', read_flag
            set 'format', format_flag
            file file_name // import json file
            time 10000 // limit inflight 10s
            if (expected_succ_rows >= 0) {
                set 'max_filter_ratio', '1'
            }

            // Declaring a check callback disables the default check, so every
            // condition that matters has to be verified here.
            check { result, exception, startTime, endTime ->
                if (ignore_failure && expected_succ_rows < 0) { return }
                if (exception != null) {
                    throw exception
                }
                log.info("Stream load result: ${result}".toString())
                def json = parseJson(result)
                // Fail fast on an unsuccessful load instead of letting it surface
                // later as a confusing query-result mismatch.
                assertEquals("success", json.Status.toLowerCase())
                if (expected_succ_rows >= 0) {
                    assertEquals(expected_succ_rows, json.NumberLoadedRows)
                } else {
                    assertTrue(json.NumberLoadedRows > 0)
                }
            }
        }
    }

    load_httplogs_data.call(indexTbName1, 'test_all_index_hit_fault_injection_1', 'true', 'json', 'documents-1000.json')
    load_httplogs_data.call(indexTbName2, 'test_all_index_hit_fault_injection_2', 'true', 'json', 'documents-1000.json')

    sql "sync"

    try {
        GetDebugPoint().enableDebugPointForAllBEs("segment_iterator._read_columns_by_index", [column_name: "clientip,request"])
        GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.fast_execute")

        // DUPLICATE KEY table
        qt_sql """ select count() from ${indexTbName1} where (request match_phrase 'hm'); """
        qt_sql """ select count() from ${indexTbName1} where (request match_phrase 'hm' and clientip = '126.1.0.0'); """
        qt_sql """ select count() from ${indexTbName1} where (request match_phrase 'hm' and clientip = '126.1.0.0') or (request match_phrase 'bg' and clientip = '201.0.0.0'); """
        qt_sql """ select count() from ${indexTbName1} where (request match_phrase 'hm' and clientip = '126.1.0.0' or clientip = '247.37.0.0') or (request match_phrase 'bg' and clientip = '201.0.0.0' or clientip = '232.0.0.0'); """
        qt_sql """ select count() from ${indexTbName1} where (request match_phrase 'hm' and clientip in ('126.1.0.0', '247.37.0.0')) or (request match_phrase 'bg' and clientip in ('201.0.0.0', '232.0.0.0')); """

        // Merge-on-write UNIQUE KEY table: same queries as above
        qt_sql """ select count() from ${indexTbName2} where (request match_phrase 'hm'); """
        qt_sql """ select count() from ${indexTbName2} where (request match_phrase 'hm' and clientip = '126.1.0.0'); """
        qt_sql """ select count() from ${indexTbName2} where (request match_phrase 'hm' and clientip = '126.1.0.0') or (request match_phrase 'bg' and clientip = '201.0.0.0'); """
        qt_sql """ select count() from ${indexTbName2} where (request match_phrase 'hm' and clientip = '126.1.0.0' or clientip = '247.37.0.0') or (request match_phrase 'bg' and clientip = '201.0.0.0' or clientip = '232.0.0.0'); """
        qt_sql """ select count() from ${indexTbName2} where (request match_phrase 'hm' and clientip in ('126.1.0.0', '247.37.0.0')) or (request match_phrase 'bg' and clientip in ('201.0.0.0', '232.0.0.0')); """

    } finally {
        // Always clear the debug points so later suites are not affected.
        GetDebugPoint().disableDebugPointForAllBEs("segment_iterator._read_columns_by_index")
        GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.fast_execute")
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Fault-injection regression suite for top-N ("order by ... limit") queries
// filtered by inverted-index predicates.
//
// Two tables with identical columns and inverted indexes are created: a
// UNIQUE KEY table with merge-on-write enabled and a DUPLICATE KEY table. The
// same "order by `@timestamp` limit 1" queries are run against both while the
// BE debug point "segment_iterator.topn_opt" is enabled.
// NOTE(review): the debug point presumably makes the BE fail when the top-N
// read optimization is not applied - confirm against segment_iterator.cpp.
// The suite is tagged "nonConcurrent" because the debug point is enabled on
// all BEs and would affect other suites running at the same time.
suite("test_topn_fault_injection", "nonConcurrent") {
    // define a sql table
    def indexTbName1 = "test_topn_fault_injection1"
    def indexTbName2 = "test_topn_fault_injection2"

    // Table 1: UNIQUE KEY model with merge-on-write enabled.
    sql "DROP TABLE IF EXISTS ${indexTbName1}"
    sql """
CREATE TABLE ${indexTbName1} (
`@timestamp` int(11) NULL COMMENT "",
`clientip` varchar(20) NULL COMMENT "",
`request` text NULL COMMENT "",
`status` int(11) NULL COMMENT "",
`size` int(11) NULL COMMENT "",
INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '',
INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
) ENGINE=OLAP
UNIQUE KEY(`@timestamp`)
COMMENT "OLAP"
DISTRIBUTED BY HASH(`@timestamp`) BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"enable_unique_key_merge_on_write" = "true",
"disable_auto_compaction" = "true"
);
"""

    // Table 2: DUPLICATE KEY model.
    sql "DROP TABLE IF EXISTS ${indexTbName2}"
    sql """
CREATE TABLE ${indexTbName2} (
`@timestamp` int(11) NULL COMMENT "",
`clientip` varchar(20) NULL COMMENT "",
`request` text NULL COMMENT "",
`status` int(11) NULL COMMENT "",
`size` int(11) NULL COMMENT "",
INDEX clientip_idx (`clientip`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT '',
INDEX request_idx (`request`) USING INVERTED PROPERTIES("parser" = "english", "support_phrase" = "true") COMMENT ''
) ENGINE=OLAP
DUPLICATE KEY(`@timestamp`)
COMMENT "OLAP"
DISTRIBUTED BY RANDOM BUCKETS 1
PROPERTIES (
"replication_allocation" = "tag.location.default: 1",
"disable_auto_compaction" = "true"
);
"""

    // Stream-loads `file_name` into `table_name` and verifies the load result.
    //   label              - label prefix; a random UUID is appended to keep it unique
    //   read_flag          - value of the 'read_json_by_line' header
    //   format_flag        - value of the 'format' header
    //   ignore_failure     - when true (and no expected row count is given) the
    //                        result is not checked at all
    //   expected_succ_rows - when >= 0, the exact number of rows that must load;
    //                        filtered rows are then tolerated (max_filter_ratio = 1)
    //   load_to_single_tablet - currently unused; kept so existing callers still work
    def load_httplogs_data = {table_name, label, read_flag, format_flag, file_name, ignore_failure=false,
                              expected_succ_rows = -1, load_to_single_tablet = 'true' ->

        // load the json data
        streamLoad {
            table "${table_name}"

            // set http request header params
            set 'label', label + "_" + UUID.randomUUID().toString()
            set 'read_json_by_line', read_flag
            set 'format', format_flag
            file file_name // import json file
            time 10000 // limit inflight 10s
            if (expected_succ_rows >= 0) {
                set 'max_filter_ratio', '1'
            }

            // Declaring a check callback disables the default check, so every
            // condition that matters has to be verified here.
            check { result, exception, startTime, endTime ->
                if (ignore_failure && expected_succ_rows < 0) { return }
                if (exception != null) {
                    throw exception
                }
                log.info("Stream load result: ${result}".toString())
                def json = parseJson(result)
                // Fail fast on an unsuccessful load instead of letting it surface
                // later as a confusing query-result mismatch.
                assertEquals("success", json.Status.toLowerCase())
                if (expected_succ_rows >= 0) {
                    assertEquals(expected_succ_rows, json.NumberLoadedRows)
                } else {
                    assertTrue(json.NumberLoadedRows > 0)
                }
            }
        }
    }

    load_httplogs_data.call(indexTbName1, 'test_topn_fault_injection1', 'true', 'json', 'documents-1000.json')
    load_httplogs_data.call(indexTbName2, 'test_topn_fault_injection2', 'true', 'json', 'documents-1000.json')

    sql "sync"

    try {
        GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.topn_opt")

        // Merge-on-write UNIQUE KEY table
        qt_sql """ select * from ${indexTbName1} where (request match_phrase 'hm') order by `@timestamp` limit 1; """
        qt_sql """ select * from ${indexTbName1} where (request match_phrase 'hm' and clientip match_phrase '1') order by `@timestamp` limit 1; """
        qt_sql """ select * from ${indexTbName1} where (request match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'bg' and clientip match_phrase '2') order by `@timestamp` limit 1; """
        qt_sql """ select * from ${indexTbName1} where (request match_phrase 'hm' and clientip match_phrase '1' or clientip match_phrase '3') or (request match_phrase 'bg' and clientip match_phrase '2' or clientip match_phrase '4') order by `@timestamp` limit 1; """

        // DUPLICATE KEY table: same queries as above
        qt_sql """ select * from ${indexTbName2} where (request match_phrase 'hm') order by `@timestamp` limit 1; """
        qt_sql """ select * from ${indexTbName2} where (request match_phrase 'hm' and clientip match_phrase '1') order by `@timestamp` limit 1; """
        qt_sql """ select * from ${indexTbName2} where (request match_phrase 'hm' and clientip match_phrase '1') or (request match_phrase 'bg' and clientip match_phrase '2') order by `@timestamp` limit 1; """
        qt_sql """ select * from ${indexTbName2} where (request match_phrase 'hm' and clientip match_phrase '1' or clientip match_phrase '3') or (request match_phrase 'bg' and clientip match_phrase '2' or clientip match_phrase '4') order by `@timestamp` limit 1; """
    } finally {
        // Always clear the debug point so later suites are not affected.
        GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.topn_opt")
    }
}

0 comments on commit ff6fa33

Please sign in to comment.