Skip to content

Commit

Permalink
[fix](hudi) support reading hudi read optimized table with orc format (
Browse files Browse the repository at this point in the history
…apache#44995)

### What problem does this PR solve?
Problem Summary:
When reading a Hudi read-optimized (ro) table, the scan falls back from the
JNI reader to the native reader. However, this fallback always sets the file
format to parquet and does not handle the case where the Hudi table is
stored in ORC format.

1. support reading hudi read optimized table with orc format
2. fix explain results of hudiScanNode when force_jni_reader=true
3. add cases about timestamp with different timezones
  • Loading branch information
suxiaogang223 committed Dec 5, 2024
1 parent fe87b4d commit 6ee389b
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.UserException;
import org.apache.doris.common.util.FileFormatUtils;
import org.apache.doris.common.util.LocationPath;
import org.apache.doris.datasource.ExternalTable;
import org.apache.doris.datasource.FileSplit;
Expand Down Expand Up @@ -247,8 +248,15 @@ protected void setScanParams(TFileRangeDesc rangeDesc, Split split) {
&& !sessionVariable.isForceJniScanner()
&& hudiSplit.getHudiDeltaLogs().isEmpty()) {
// no logs, is read optimize table, fallback to use native reader
// TODO: support read orc hudi table in native reader
rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET);
String fileFormat = FileFormatUtils.getFileFormatBySuffix(hudiSplit.getDataFilePath())
.orElse("Unknown");
if (fileFormat.equals("parquet")) {
rangeDesc.setFormatType(TFileFormatType.FORMAT_PARQUET);
} else if (fileFormat.equals("orc")) {
rangeDesc.setFormatType(TFileFormatType.FORMAT_ORC);
} else {
throw new RuntimeException("Unsupported file format: " + fileFormat);
}
}
setHudiParams(rangeDesc, hudiSplit);
}
Expand Down Expand Up @@ -495,7 +503,7 @@ private HudiSplit generateHudiSplit(FileSlice fileSlice, List<String> partitionV
List<String> logs = fileSlice.getLogFiles().map(HoodieLogFile::getPath)
.map(StoragePath::toString)
.collect(Collectors.toList());
if (logs.isEmpty()) {
if (logs.isEmpty() && !sessionVariable.isForceJniScanner()) {
noLogsSplitNum.incrementAndGet();
}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !cow --
20241204190011744 20241204190011744_0_6 20241204190011744_0_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 1 A
20241204190011744 20241204190011744_0_7 20241204190011744_2_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 3 C
20241204190011744 20241204190011744_0_8 20241204190011744_4_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 5 E
20241204190011744 20241204190011744_0_9 20241204190011744_1_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 2 B
20241204190011744 20241204190011744_0_10 20241204190011744_3_0 a99e363a-6c10-40f3-a675-9117506d1a43-0_0-38-94_20241204190011744.orc 4 D

-- !mor --
20241204190002046 20241204190002046_0_11 20241204190002046_0_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 1 A
20241204190002046 20241204190002046_0_12 20241204190002046_2_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 3 C
20241204190002046 20241204190002046_0_13 20241204190002046_4_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 5 E
20241204190002046 20241204190002046_0_14 20241204190002046_1_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 2 B
20241204190002046 20241204190002046_0_15 20241204190002046_3_0 b1e68412-01d6-467f-b4c2-b4b18ec71346-0_0-30-75_20241204190002046.orc 4 D

Original file line number Diff line number Diff line change
@@ -1,6 +1,31 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !timestamp --
-- !timestamp1 --
20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T08:00
20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T09:30:00
20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T11:00:00
20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T09:30
20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T11:00

-- !timestamp2 --
20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T23:00
20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-26T00:30
20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-26T02:00

-- !timestamp3 --
20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T15:00
20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T16:30
20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T18:00

-- !timestamp1 --
20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T08:00
20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T09:30
20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T11:00

-- !timestamp2 --
20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T23:00
20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-26T00:30
20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-26T02:00

-- !timestamp3 --
20241115015956800 20241115015956800_0_2 1 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 1 Alice 2024-10-25T15:00
20241115015956800 20241115015956800_0_0 2 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 2 Bob 2024-10-25T16:30
20241115015956800 20241115015956800_0_1 3 eec4913a-0d5f-4b8b-a0f5-934e252c2e45-0_0-7-14_20241115015956800.parquet 3 Charlie 2024-10-25T18:00

Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// Regression suite: queries ORC-backed Hudi tables (one copy-on-write, one
// merge-on-read) through an external catalog and records the results under
// the `cow` and `mor` tags.
//
// The suite only runs when the `enableExternalHudiTest` config entry is "true"
// (case-insensitive); the catalog properties come from the `hudiEmrCatalog`
// config entry.
suite("test_hudi_orc_tables", "p2,external,hudi,external_remote,external_remote_hudi") {
    String enabled = context.config.otherConfigs.get("enableExternalHudiTest")
    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
        logger.info("disable hudi test")
        // Stop here: without this return the suite would still try to create the
        // catalog and run the queries even though the Hudi tests are disabled
        // (and `hudiEmrCatalog` may not be configured at all).
        return
    }

    String catalog_name = "test_hudi_orc_tables"
    String props = context.config.otherConfigs.get("hudiEmrCatalog")

    // Start from a clean state in case a previous run left the catalog behind.
    sql """drop catalog if exists ${catalog_name};"""
    sql """
        create catalog if not exists ${catalog_name} properties (
            ${props}
        );
    """

    sql """ switch ${catalog_name};"""
    sql """ use regression_hudi;"""
    // Fail instead of silently falling back to the original planner.
    sql """ set enable_fallback_to_original_planner=false """

    // Copy-on-write and merge-on-read tables stored in ORC format.
    qt_cow """ select * from orc_hudi_table_cow; """
    qt_mor """ select * from orc_hudi_table_mor; """

    // Clean up the catalog created by this suite.
    sql """drop catalog if exists ${catalog_name};"""
}
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,22 @@ suite("test_hudi_timestamp", "p2,external,hudi,external_remote,external_remote_h
sql """ use regression_hudi;"""
sql """ set enable_fallback_to_original_planner=false """

// TODO: fix hudi timezone issue and enable this
// qt_timestamp """ select * from hudi_table_with_timestamp order by id; """
// Runs the same ordered query against hudi_table_with_timestamp under three
// different session time zones, recording each result under its own tag
// (timestamp1, timestamp2, timestamp3).
// Note: the session time_zone is left at 'UTC' when the closure returns.
def test_timestamp_different_timezones = {
// Los Angeles -> tag timestamp1
sql """set time_zone = 'America/Los_Angeles';"""
qt_timestamp1 """ select * from hudi_table_with_timestamp order by id; """
// Shanghai -> tag timestamp2
sql """set time_zone = 'Asia/Shanghai';"""
qt_timestamp2 """ select * from hudi_table_with_timestamp order by id; """
// UTC -> tag timestamp3
sql """set time_zone = 'UTC';"""
qt_timestamp3 """ select * from hudi_table_with_timestamp order by id; """
}

// test native reader
test_timestamp_different_timezones()
sql """ set force_jni_scanner = true; """
// test jni reader
test_timestamp_different_timezones()
sql """ set force_jni_scanner = false; """


sql """drop catalog if exists ${catalog_name};"""
}
Expand Down

0 comments on commit 6ee389b

Please sign in to comment.