[enhancement](utf8) Introduce enable_text_validate_utf8 session variable (apache#45537)

Problem Summary:
When reading text-format files through the Hive catalog or a TVF, you may sometimes hit the exception `Only support csv data in utf8 codec`. This change introduces a new session variable, `enable_text_validate_utf8`, to control whether the UTF-8 check is performed.
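
For example, the check can be relaxed per session (a minimal sketch; the catalog, database, and table names are hypothetical placeholders):

```sql
-- Default: invalid UTF-8 bytes in a text file make the scan fail.
SET enable_text_validate_utf8 = true;
SELECT * FROM hive_catalog.db.text_table;
-- ERROR: Only support csv data in utf8 codec

-- Disable the check: the scan succeeds and non-UTF-8 bytes are
-- passed through (they typically render as garbled characters).
SET enable_text_validate_utf8 = false;
SELECT * FROM hive_catalog.db.text_table;
```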
hubgeter committed Dec 27, 2024

1 parent 4746f16 commit 6596cea
Showing 11 changed files with 223 additions and 3 deletions.
7 changes: 7 additions & 0 deletions be/src/util/utf8_check.cpp
@@ -327,4 +327,11 @@ bool validate_utf8(const char* src, size_t len) {
return validate_utf8_naive(src, len);
}
#endif

bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len) {
if (params.__isset.file_attributes && !params.file_attributes.enable_text_validate_utf8) {
return true;
}
return validate_utf8(src, len);
}
} // namespace doris
4 changes: 4 additions & 0 deletions be/src/util/utf8_check.h
@@ -17,6 +17,8 @@

#pragma once

#include <gen_cpp/PlanNodes_types.h>

#include <cstddef>

namespace doris {
@@ -25,4 +27,6 @@ namespace doris {
bool validate_utf8(const char* src, size_t len);
// check utf8 using naive C++
bool validate_utf8_naive(const char* data, size_t len);

bool validate_utf8(const TFileScanRangeParams& params, const char* src, size_t len);
} // namespace doris
6 changes: 3 additions & 3 deletions be/src/vec/exec/format/csv/csv_reader.cpp
@@ -713,7 +713,7 @@ Status CsvReader::_fill_empty_line(Block* block, std::vector<MutableColumnPtr>&
}

Status CsvReader::_validate_line(const Slice& line, bool* success) {
if (!_is_proto_format && !validate_utf8(line.data, line.size)) {
if (!_is_proto_format && !validate_utf8(_params, line.data, line.size)) {
if (!_is_load) {
return Status::InternalError<false>("Only support csv data in utf8 codec");
} else {
@@ -954,7 +954,7 @@ Status CsvReader::_parse_col_nums(size_t* col_nums) {
return Status::InternalError<false>(
"The first line is empty, can not parse column numbers");
}
if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
return Status::InternalError<false>("Only support csv data in utf8 codec");
}
ptr = _remove_bom(ptr, size);
@@ -971,7 +971,7 @@ Status CsvReader::_parse_col_names(std::vector<std::string>* col_names) {
if (size == 0) {
return Status::InternalError<false>("The first line is empty, can not parse column names");
}
if (!validate_utf8(const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
if (!validate_utf8(_params, const_cast<char*>(reinterpret_cast<const char*>(ptr)), size)) {
return Status::InternalError<false>("Only support csv data in utf8 codec");
}
ptr = _remove_bom(ptr, size);
@@ -0,0 +1,31 @@
CREATE TABLE invalid_utf8_data (
id INT,
corrupted_data STRING,
string_data1 STRING,
string_data2 STRING
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
location '/user/doris/preinstalled_data/text/utf8_check';


CREATE TABLE invalid_utf8_data2 (
id INT,
corrupted_data STRING,
string_data1 STRING,
string_data2 STRING
)
ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.OpenCSVSerde'
WITH SERDEPROPERTIES (
"separatorChar" = ",",
"quoteChar" = "\"",
"escapeChar" = "\\"
)
location '/user/doris/preinstalled_data/text/utf8_check';
msck repair table invalid_utf8_data;
msck repair table invalid_utf8_data2;
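
Both tables read the same HDFS directory of rows containing invalid UTF-8 bytes, exercising both the delimited (LazySimpleSerDe) and OpenCSVSerde text paths. Queried from Doris with validation on, either should fail (a sketch; the catalog name is a placeholder):

```sql
SET enable_text_validate_utf8 = true;
SELECT * FROM hive_catalog.`default`.invalid_utf8_data;
-- expected: Only support csv data in utf8 codec
```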
@@ -0,0 +1,5 @@
1,�,AAB,helloworld
2,��,AAB,helloworld
2,���,AAB,helloworld
4,����,AAB,helloworld
5,�����,AAB,helloworld
@@ -435,6 +435,8 @@ protected TFileAttributes getFileAttributes() throws UserException {
textParams.setNullFormat(HiveProperties.getNullFormat(table));
fileAttributes.setTextParams(textParams);
fileAttributes.setHeaderType("");
fileAttributes.setEnableTextValidateUtf8(
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
// set properties of OpenCSVSerde
@@ -451,6 +453,8 @@ protected TFileAttributes getFileAttributes() throws UserException {
if (textParams.isSetEnclose()) {
fileAttributes.setTrimDoubleQuotes(true);
}
fileAttributes.setEnableTextValidateUtf8(
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else if (serDeLib.equals("org.apache.hive.hcatalog.data.JsonSerDe")) {
TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
textParams.setColumnSeparator("\t");
10 changes: 10 additions & 0 deletions fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -690,11 +690,14 @@ public class SessionVariable implements Serializable, Writable {
*/
public static final String ENABLE_AUTO_CREATE_WHEN_OVERWRITE = "enable_auto_create_when_overwrite";


public static final String ENABLE_ADAPTIVE_PIPELINE_TASK_SERIAL_READ_ON_LIMIT =
"enable_adaptive_pipeline_task_serial_read_on_limit";
public static final String ADAPTIVE_PIPELINE_TASK_SERIAL_READ_ON_LIMIT =
"adaptive_pipeline_task_serial_read_on_limit";

public static final String ENABLE_TEXT_VALIDATE_UTF8 = "enable_text_validate_utf8";

/**
* If set false, user couldn't submit analyze SQL and FE won't allocate any related resources.
*/
@@ -2298,6 +2301,13 @@ public void setIgnoreShapePlanNodes(String ignoreShapePlanNodes) {
})
public boolean enableAutoCreateWhenOverwrite = false;

@VariableMgr.VarAttr(name = ENABLE_TEXT_VALIDATE_UTF8, needForward = true, description = {
"对于 text 类型的文件读取,是否开启utf8编码检查。非utf8字符会显示成乱码。",
"For text type file reading, whether to enable utf8 encoding check."
+ "non-utf8 characters will be displayed as garbled characters."
})
public boolean enableTextValidateUtf8 = true;

@VariableMgr.VarAttr(name = SKIP_CHECKING_ACID_VERSION_FILE, needForward = true, description = {
"跳过检查 transactional hive 版本文件 '_orc_acid_version.'",
"Skip checking transactional hive version file '_orc_acid_version.'"
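
The variable is marked `needForward = true` and defaults to true, so existing deployments keep the old always-validate behavior until a user opts out. A quick way to inspect and toggle it from a MySQL client (a sketch; `SET GLOBAL` is assumed to behave as it does for other Doris session variables):

```sql
SHOW VARIABLES LIKE 'enable_text_validate_utf8';
SET enable_text_validate_utf8 = false;        -- current session only
SET GLOBAL enable_text_validate_utf8 = false; -- persists for new sessions
```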
@@ -305,6 +305,8 @@ public TFileAttributes getFileAttributes() {
fileAttributes.setHeaderType(this.headerType);
fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes);
fileAttributes.setSkipLines(skipLines);
fileAttributes.setEnableTextValidateUtf8(
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
} else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) {
fileAttributes.setJsonRoot(jsonRoot);
fileAttributes.setJsonpaths(jsonPaths);
2 changes: 2 additions & 0 deletions gensrc/thrift/PlanNodes.thrift
@@ -284,6 +284,8 @@ struct TFileAttributes {
10: optional bool trim_double_quotes;
// csv skip line num, only used when csv header_type is not set.
11: optional i32 skip_lines;
// For text type file reading, whether to enable utf8 encoding check. (Catalog && TVF)
12: optional bool enable_text_validate_utf8 = true;
// for cloud copy into
1001: optional bool ignore_csv_redundant_col;
}
55 changes: 55 additions & 0 deletions regression-test/data/external_table_p0/hive/test_utf8_check.out
@@ -0,0 +1,55 @@
-- This file is automatically generated. You should know what you did if you want to edit this
-- !1 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !2 --
c1 text Yes false \N NONE
c2 text Yes false \N NONE
c3 text Yes false \N NONE
c4 text Yes false \N NONE

-- !3 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !4 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !1 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !2 --
c1 text Yes false \N NONE
c2 text Yes false \N NONE
c3 text Yes false \N NONE
c4 text Yes false \N NONE

-- !3 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

-- !4 --
1 � AAB helloworld
2 �� AAB helloworld
2 ��� AAB helloworld
4 ���� AAB helloworld
5 ����� AAB helloworld

100 changes: 100 additions & 0 deletions regression-test/suites/external_table_p0/hive/test_utf8_check.groovy
@@ -0,0 +1,100 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.


suite("test_utf8_check","p0,external,tvf,hive,external_docker,external_docker_hive") {
String enabled = context.config.otherConfigs.get("enableHiveTest")
if (enabled == null || !enabled.equalsIgnoreCase("true")) {
logger.info("diable Hive test.")
return;
}

for (String hivePrefix : ["hive2","hive3"]) {

String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
String catalog_name = "${hivePrefix}_test_utf8_check"
String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
def hdfsUserName = "doris"
String hdfs_port = context.config.otherConfigs.get(hivePrefix + "HdfsPort")
def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"

sql """drop catalog if exists ${catalog_name}"""
sql """create catalog if not exists ${catalog_name} properties (
"type"="hms",
'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
);"""
sql """use `${catalog_name}`.`default`"""


sql """ set enable_text_validate_utf8 = true; """

test {
sql """ select * from invalid_utf8_data """
exception """Only support csv data in utf8 codec"""
}


test {
sql """ select * from invalid_utf8_data2; """
exception """Only support csv data in utf8 codec"""
}


def uri = "${defaultFS}" + "/user/doris/preinstalled_data/text/utf8_check/utf8_check_fail.csv"


test {
sql """ desc function HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "csv",
"column_separator"=",")"""
exception """Only support csv data in utf8 codec"""
}

test {
sql """select * from HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "csv",
"column_separator"=",")"""
exception """Only support csv data in utf8 codec"""
}


sql """ set enable_text_validate_utf8 = false; """

qt_1 """select * from invalid_utf8_data order by id """

qt_2 """ desc function HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "csv",
"column_separator"=",")"""


qt_3 """select * from HDFS(
"uri" = "${uri}",
"hadoop.username" = "${hdfsUserName}",
"format" = "csv",
"column_separator"=",") order by c1"""
qt_4 """select * from invalid_utf8_data2 order by id """


}

}
