From 6b57a4e40f5fc90b900017af1659b6f19716994f Mon Sep 17 00:00:00 2001 From: Murphy <96611012+murphyatwork@users.noreply.github.com> Date: Tue, 3 Sep 2024 18:42:01 +0800 Subject: [PATCH] [BugFix] fix multiple partition column statistics (#50488) Signed-off-by: Murphy (cherry picked from commit 7bdc5bf5635b18c23960f9341afb767520c4f960) # Conflicts: # fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java # test/lib/sr_sql_lib.py --- .../statistics/StatisticsCalculator.java | 43 +++++++++ test/lib/sr_sql_lib.py | 88 +++++++++++++++++++ .../R/test_list_partition_cardinality | 81 +++++++++++++++++ .../T/test_list_partition_cardinality | 41 +++++++++ 4 files changed, 253 insertions(+) create mode 100644 test/sql/test_list_partition/R/test_list_partition_cardinality create mode 100644 test/sql/test_list_partition/T/test_list_partition_cardinality diff --git a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java index 6ca11c2f7d012..be00e4f35d7ba 100644 --- a/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java +++ b/fe/fe-core/src/main/java/com/starrocks/sql/optimizer/statistics/StatisticsCalculator.java @@ -531,10 +531,26 @@ private ColumnStatistic adjustPartitionStatistic(Collection selectedPartit ColumnStatistic partitionColumnStatistic = GlobalStateMgr.getCurrentStatisticStorage().getColumnStatistic(olapTable, partitionColumn); +<<<<<<< HEAD +======= + Map colNameMap = Maps.newHashMap(); + colRefToColumnMetaMap.entrySet().stream().forEach(e -> colNameMap.put(e.getValue().getName(), e.getKey())); + // It might contain null value, if some partition columns are not referenced in the scan + List partitionCols = + olapTable.getPartitionColumnNames().stream() + .map(colNameMap::get) + .collect(Collectors.toList()); + PartitionInfo partitionInfo = olapTable.getPartitionInfo(); + if (partitionInfo instanceof RangePartitionInfo) { + if (partitionCols.size() != 1 || partitionCols.stream().anyMatch(Objects::isNull)) { + return; + } +>>>>>>> 7bdc5bf563 ([BugFix] fix multiple partition column statistics (#50488)) if (optimizerContext.getDumpInfo() != null) { optimizerContext.getDumpInfo().addTableStatistics(olapTable, partitionColumn, partitionColumnStatistic); } +<<<<<<< HEAD PartitionInfo partitionInfo = olapTable.getPartitionInfo(); if (partitionInfo instanceof RangePartitionInfo) { RangePartitionInfo rangePartitionInfo = (RangePartitionInfo) partitionInfo; @@ -572,6 +588,33 @@ private ColumnStatistic adjustPartitionStatistic(Collection selectedPartit allNoEmptyPartitionsSize; return buildFrom(partitionColumnStatistic). setMinValue(min).setMaxValue(max).setDistinctValuesCount(max(distinctValues, 1)).build(); +======= + int selectedPartitionsSize = selectedPartitionId.size(); + int allNoEmptyPartitionsSize = (int) olapTable.getPartitions().stream().filter(Partition::hasData).count(); + double distinctValues = + builder.getColumnStatistics(partitionCols.get(0)).getDistinctValuesCount() * 1.0 * selectedPartitionsSize / + allNoEmptyPartitionsSize; + ColumnStatistic columnStatistic = ColumnStatistic.buildFrom(builder.getColumnStatistics(partitionCols.get(0))) + .setMinValue(min).setMaxValue(max).setDistinctValuesCount(max(distinctValues, 1)).build(); + builder.addColumnStatistic(partitionCols.get(0), columnStatistic); + } else if (partitionInfo instanceof ListPartitionInfo) { + ListPartitionInfo listPartitionInfo = (ListPartitionInfo) partitionInfo; + for (int i = 0; i < partitionCols.size(); i++) { + ColumnRefOperator columnRef = partitionCols.get(i); + // For multi-column list partition, pruning on any column should adjust the statistics + if (columnRef == null) { + continue; + } + if (optimizerContext.getDumpInfo() != null) { + optimizerContext.getDumpInfo().addTableStatistics(olapTable, + partitionCols.get(i).getName(), + builder.getColumnStatistics(partitionCols.get(i))); + } + long ndv = extractDistinctPartitionValues(listPartitionInfo, selectedPartitionId, i); + ColumnStatistic columnStatistic = ColumnStatistic.buildFrom(builder.getColumnStatistics(columnRef)) + .setDistinctValuesCount(ndv).build(); + builder.addColumnStatistic(columnRef, columnStatistic); +>>>>>>> 7bdc5bf563 ([BugFix] fix multiple partition column statistics (#50488)) } } return null; diff --git a/test/lib/sr_sql_lib.py b/test/lib/sr_sql_lib.py index bf7075bc92677..1b459177deef3 100644 --- a/test/lib/sr_sql_lib.py +++ b/test/lib/sr_sql_lib.py @@ -1751,6 +1751,94 @@ def set_first_tablet_bad_and_recover(self, table_name): time.sleep(0.5) else: break +<<<<<<< HEAD +======= + + def assert_explain_contains(self, query, *expects): + """ + assert explain result contains expect string + """ + sql = "explain %s" % query + res = self.execute_sql(sql, True) + for expect in expects: + tools.assert_true( + str(res["result"]).find(expect) > 0, + "assert expect {} is not found in plan {}".format(expect, res["result"]), + ) + + def assert_explain_not_contains(self, query, *expects): + """ + assert explain result contains expect string + """ + sql = "explain %s" % query + res = self.execute_sql(sql, True) + for expect in expects: + tools.assert_true(str(res["result"]).find(expect) == -1, "assert expect %s is found in plan" % (expect)) + + def assert_explain_verbose_contains(self, query, *expects): + """ + assert explain verbose result contains expect string + """ + sql = "explain verbose %s" % (query) + res = self.execute_sql(sql, True) + tools.assert_true(res["status"], res['msg']) + for expect in expects: + plan_string = "\n".join(item[0] for item in res["result"]) + tools.assert_true(plan_string.find(expect) > 0, "assert expect %s is not found in plan: %s" % (expect, plan_string)) + + def assert_explain_costs_contains(self, query, *expects): + """ + assert explain costs result contains expect string + """ + sql = "explain costs %s" % query + res = self.execute_sql(sql, True) + for expect in expects: + tools.assert_true(str(res["result"]).find(expect) > 0, "assert expect %s is not found in plan" % (expect)) + + def assert_trace_values_contains(self, query, *expects): + """ + assert trace values result contains expect string + """ + sql = "trace values %s" % query + res = self.execute_sql(sql, True) + for expect in expects: + tools.assert_true( + str(res["result"]).find(expect) > 0, + "assert expect %s is not found in plan, error msg is %s" % (expect, str(res["result"])), + ) + + def assert_prepare_execute(self, db, query, params=()): + conn = mysql.connector.connect( + host=self.mysql_host, user=self.mysql_user, password="", port=self.mysql_port, database=db + ) + cursor = conn.cursor(prepared=True) + + try: + if params: + cursor.execute(query, params) + else: + cursor.execute(query) + cursor.fetchall() + except mysql.connector.Error as e: + tools.assert_true(1 == 0, e) + + finally: + cursor.close() + conn.close() + + def assert_trace_times_contains(self, query, *expects): + """ + assert trace times result contains expect string + """ + sql = "trace times %s" % query + res = self.execute_sql(sql, True) + for expect in expects: + tools.assert_true( + str(res["result"]).find(expect) > 0, + "assert expect %s is not found in plan, error msg is %s" % (expect, str(res["result"])), + ) + +>>>>>>> 7bdc5bf563 ([BugFix] fix multiple partition column statistics (#50488)) def assert_clear_stale_stats(self, query, expect_num): timeout = 300 num = 0; diff --git a/test/sql/test_list_partition/R/test_list_partition_cardinality b/test/sql/test_list_partition/R/test_list_partition_cardinality new file mode 100644 index 0000000000000..3331bc007ba63 --- /dev/null +++ b/test/sql/test_list_partition/R/test_list_partition_cardinality @@ -0,0 +1,81 @@ +-- name: test_list_partition_cardinality +DROP DATABASE IF EXISTS test_list_partition_cardinality; +-- result: +-- !result +CREATE DATABASE test_list_partition_cardinality; +-- result: +-- !result +USE test_list_partition_cardinality; +-- result: +-- !result +CREATE TABLE partitions_multi_column_1 ( + c1 int NOT NULL, + c2 int NOT NULL, + c3 int +) +PARTITION BY (c1, c2); +-- result: +-- !result +INSERT INTO partitions_multi_column_1 VALUES + (1,1,1), + (1,2,4), + (1,2,4), + (1,2,4), + (2,3,2), + (2,4,5), + (3,5,3), + (3,6,6); +-- result: +-- !result +INSERT INTO partitions_multi_column_1 +SELECT 4, 7, generate_series FROM TABLE(generate_series(1, 1000)); +-- result: +-- !result +ANALYZE FULL TABLE partitions_multi_column_1 WITH SYNC MODE; +-- result: +test_list_partition_cardinality.partitions_multi_column_1 analyze status OK +-- !result +SELECT count(*) FROM partitions_multi_column_1; +-- result: +1008 +-- !result +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c1=0', 'EMPTYSET') +-- result: +None +-- !result +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c1=1', 'cardinality: 2') +-- result: +None +-- !result +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c1=2', 'cardinality: 1') +-- result: +None +-- !result +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c1=3', 'cardinality: 1') +-- result: +None +-- !result +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c1=4', 'cardinality: 500') +-- result: +None +-- !result +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c2=0', 'EMPTYSET') +-- result: +None +-- !result +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c2=1', 'cardinality: 1') +-- result: +None +-- !result +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c2=2', 'cardinality: 2') +-- result: +None +-- !result +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c2=3', 'cardinality: 1') +-- result: +None +-- !result +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c2=7', 'cardinality: 500') +-- result: +None +-- !result \ No newline at end of file diff --git a/test/sql/test_list_partition/T/test_list_partition_cardinality b/test/sql/test_list_partition/T/test_list_partition_cardinality new file mode 100644 index 0000000000000..c2d13a17b5e90 --- /dev/null +++ b/test/sql/test_list_partition/T/test_list_partition_cardinality @@ -0,0 +1,41 @@ +-- name: test_list_partition_cardinality + +DROP DATABASE IF EXISTS test_list_partition_cardinality; +CREATE DATABASE test_list_partition_cardinality; +USE test_list_partition_cardinality; + +CREATE TABLE partitions_multi_column_1 ( + c1 int NOT NULL, + c2 int NOT NULL, + c3 int +) +PARTITION BY (c1, c2); + +INSERT INTO partitions_multi_column_1 VALUES + (1,1,1), + (1,2,4), + (1,2,4), + (1,2,4), + (2,3,2), + (2,4,5), + (3,5,3), + (3,6,6); + +INSERT INTO partitions_multi_column_1 +SELECT 4, 7, generate_series FROM TABLE(generate_series(1, 1000)); + +ANALYZE FULL TABLE partitions_multi_column_1 WITH SYNC MODE; + +SELECT count(*) FROM partitions_multi_column_1; + +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c1=0', 'EMPTYSET') +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c1=1', 'cardinality: 2') +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c1=2', 'cardinality: 1') +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c1=3', 'cardinality: 1') +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c1=4', 'cardinality: 500') + +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c2=0', 'EMPTYSET') +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c2=1', 'cardinality: 1') +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c2=2', 'cardinality: 2') +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c2=3', 'cardinality: 1') +function: assert_explain_verbose_contains('SELECT COUNT(*) FROM partitions_multi_column_1 WHERE c2=7', 'cardinality: 500') \ No newline at end of file