From 6ee389b914a79d4dc5abdf7c33bcae9e9e4c7b0b Mon Sep 17 00:00:00 2001 From: Socrates Date: Thu, 5 Dec 2024 01:05:54 +0800 Subject: [PATCH] [fix](hudi) support reading hudi read optimized table with orc format (#44995) ### What problem does this PR solve? Problem Summary: When reading a Hudi read-optimized (RO) table, the scan falls back from the JNI reader to the native reader. However, this fallback always set the file format to Parquet and did not handle Hudi tables stored in ORC format. 1. Support reading Hudi read-optimized tables stored in ORC format. 2. Fix the explain output of HudiScanNode when force_jni_scanner=true. 3. Add test cases for timestamps under different time zones. --- .../datasource/hudi/source/HudiScanNode.java | 14 +++++-- .../hudi/test_hudi_orc_tables.out | 15 +++++++ .../hudi/test_hudi_timestamp.out | 31 ++++++++++++-- .../hudi/test_hudi_orc_tables.groovy | 41 +++++++++++++++++++ .../hudi/test_hudi_timestamp.groovy | 18 +++++++- 5 files changed, 111 insertions(+), 8 deletions(-) create mode 100644 regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out create mode 100644 regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java index 28805aae63c1e3..a73a2065d0ffaf 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hudi/source/HudiScanNode.java @@ -25,6 +25,7 @@ import org.apache.doris.catalog.Type; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.UserException; +import org.apache.doris.common.util.FileFormatUtils; import org.apache.doris.common.util.LocationPath; import org.apache.doris.datasource.ExternalTable; import org.apache.doris.datasource.FileSplit; @@ -247,8 +248,15 @@ protected void
setScanParams(TFileRangeDesc rangeDesc, Split split) { && !sessionVariable.isForceJniScanner() && hudiSplit.getHudiDeltaLogs().isEmpty()) { // no logs, is read optimize table, fallback to use native reader - // TODO: support read orc hudi table in native reader - rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET); + String fileFormat = FileFormatUtils.getFileFormatBySuffix(hudiSplit.getDataFilePath()) + .orElse("Unknown"); + if (fileFormat.equals("parquet")) { + rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET); + } else if (fileFormat.equals("orc")) { + rangeDesc.setFormatType(TFileFormatType.FORMAT_ORC); + } else { + throw new RuntimeException("Unsupported file format: " + fileFormat); + } } setHudiParams(rangeDesc, hudiSplit); } @@ -495,7 +503,7 @@ private HudiSplit generateHudiSplit(FileSlice fileSlice, List partitionV List logs = fileSlice.getLogFiles().map(HoodieLogFile::getPath) .map(StoragePath::toString) .collect(Collectors.toList()); - if (logs.isEmpty()) { + if (logs.isEmpty() && !sessionVariable.isForceJniScanner()) { noLogsSplitNum.incrementAndGet(); } diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out b/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out new file mode 100644 index 00000000000000..9e28074dc9114e --- /dev/null +++ b/regression-test/data/external_table_p2/hudi/test_hudi_orc_tables.out @@ -0,0 +1,15 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !cow -- +20241204190011744 20241204190011744_0_6 20241204190011744_0_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 1 A +20241204190011744 20241204190011744_0_7 20241204190011744_2_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 3 C +20241204190011744 20241204190011744_0_8 20241204190011744_4_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 5 E +20241204190011744 20241204190011744_0_9 20241204190011744_1_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 2 B +20241204190011744 20241204190011744_0_10 20241204190011744_3_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 4 D + +-- !mor -- +20241204190002046 20241204190002046_0_11 20241204190002046_0_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 1 A +20241204190002046 20241204190002046_0_12 20241204190002046_2_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 3 C +20241204190002046 20241204190002046_0_13 20241204190002046_4_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 5 E +20241204190002046 20241204190002046_0_14 20241204190002046_1_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 2 B +20241204190002046 20241204190002046_0_15 20241204190002046_3_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 4 D + diff --git a/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out b/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out index dc47ff86d90a8d..9bdb0f7cb7285f 100644 --- a/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out +++ b/regression-test/data/external_table_p2/hudi/test_hudi_timestamp.out @@ -1,6 +1,31 @@ -- This file is automatically generated. 
You should know what you did if you want to edit this --- !timestamp -- +-- !timestamp1 -- 20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T08:00 -20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T09:30:00 -20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T11:00:00 +20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T09:30 +20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T11:00 + +-- !timestamp2 -- +20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T23:00 +20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-26T00:30 +20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-26T02:00 + +-- !timestamp3 -- +20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T15:00 +20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T16:30 +20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T18:00 + +-- !timestamp1 -- +20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T08:00 +20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T09:30 +20241115015956800 
20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T11:00 + +-- !timestamp2 -- +20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T23:00 +20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-26T00:30 +20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-26T02:00 + +-- !timestamp3 -- +20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T15:00 +20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T16:30 +20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T18:00 diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy new file mode 100644 index 00000000000000..43638a23881e0e --- /dev/null +++ b/regression-test/suites/external_table_p2/hudi/test_hudi_orc_tables.groovy @@ -0,0 +1,41 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite("test_hudi_orc_tables", "p2,external,hudi,external_remote,external_remote_hudi") { + String enabled = context.config.otherConfigs.get("enableExternalHudiTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("disable hudi test") + } + + String catalog_name = "test_hudi_orc_tables" + String props = context.config.otherConfigs.get("hudiEmrCatalog") + sql """drop catalog if exists ${catalog_name};""" + sql """ + create catalog if not exists ${catalog_name} properties ( + ${props} + ); + """ + + sql """ switch ${catalog_name};""" + sql """ use regression_hudi;""" + sql """ set enable_fallback_to_original_planner=false """ + + qt_cow """ select * from orc_hudi_table_cow; """ + qt_mor """ select * from orc_hudi_table_mor; """ + + sql """drop catalog if exists ${catalog_name};""" +} \ No newline at end of file diff --git a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy b/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy index 36309322558f52..3d7bd40b2d54cf 100644 --- a/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy +++ b/regression-test/suites/external_table_p2/hudi/test_hudi_timestamp.groovy @@ -34,8 +34,22 @@ suite("test_hudi_timestamp", "p2,external,hudi,external_remote,external_remote_h sql """ use regression_hudi;""" sql """ set enable_fallback_to_original_planner=false """ - // TODO: fix hudi timezone issue and enable this - // qt_timestamp """ select * from hudi_table_with_timestamp order by id; """ + def 
test_timestamp_different_timezones = { + sql """set time_zone = 'America/Los_Angeles';""" + qt_timestamp1 """ select * from hudi_table_with_timestamp order by id; """ + sql """set time_zone = 'Asia/Shanghai';""" + qt_timestamp2 """ select * from hudi_table_with_timestamp order by id; """ + sql """set time_zone = 'UTC';""" + qt_timestamp3 """ select * from hudi_table_with_timestamp order by id; """ + } + + // test native reader + test_timestamp_different_timezones() + sql """ set force_jni_scanner = true; """ + // test jni reader + test_timestamp_different_timezones() + sql """ set force_jni_scanner = false; """ + sql """drop catalog if exists ${catalog_name};""" }