Skip to content

Commit

Permalink
[BugFix] Fix UDTF wrong result when miss multibyte UTF-8 (#51232)
Browse files Browse the repository at this point in the history
Signed-off-by: stdpain <[email protected]>
(cherry picked from commit 266bfc8)

# Conflicts:
#	test/sql/test_udf/R/test_jvm_udf
#	test/sql/test_udf/T/test_jvm_udf
  • Loading branch information
stdpain authored and mergify[bot] committed Sep 20, 2024
1 parent de661db commit 85650a2
Show file tree
Hide file tree
Showing 3 changed files with 232 additions and 7 deletions.
11 changes: 4 additions & 7 deletions be/src/udf/java/java_udf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,14 +399,11 @@ jobject JVMFunctionHelper::newString(const char* data, size_t size) {
return nstr;
}

// Reports the length of `jstr` as given by JNI GetStringUTFLength, i.e. the
// size of its modified UTF-8 encoding rather than its UTF-16 character count.
size_t JVMFunctionHelper::string_length(jstring jstr) {
    const auto utf_length = _env->GetStringUTFLength(jstr);
    return utf_length;
}

// Copies the contents of `jstr` into `*buffer` as modified UTF-8 and returns a
// Slice that views the buffer's bytes.
//
// jstr:   Java string to convert.
// buffer: caller-owned storage; it is resized to the encoded length and
//         overwritten. The returned Slice points into it, so the buffer must
//         outlive the Slice.
Slice JVMFunctionHelper::sliceVal(jstring jstr, std::string* buffer) {
    // The destination is sized in bytes of the modified UTF-8 encoding.
    const size_t utf_length = _env->GetStringUTFLength(jstr);
    buffer->resize(utf_length);
    // GetStringUTFRegion takes its length in UTF-16 code units, not bytes.
    // Passing the byte length here is out of range for any string containing
    // multibyte characters, so the character count is queried separately.
    const size_t string_length = _env->GetStringLength(jstr);
    _env->GetStringUTFRegion(jstr, 0, string_length, buffer->data());
    return {buffer->data(), buffer->length()};
}

Expand Down
141 changes: 141 additions & 0 deletions test/sql/test_udf/R/test_jvm_udf
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
-- name: test_jvm_udf
-- Recorded session setup: enable group execution before registering the Java UDFs.
set enable_group_execution = true;
-- result:
-- !result
-- Java aggregate function; compared against the built-in sum() in the aggregation queries of this case.
CREATE AGGREGATE FUNCTION sumbigint(bigint)
RETURNS bigint
symbol = "Sumbigint"
type = "StarrocksJar"
file = "${udf_url}/starrocks-jdbc%2FSumbigint.jar";
-- result:
-- !result
-- Java table function taking a string and returning string rows.
CREATE TABLE FUNCTION udtfstring(string)
RETURNS string
symbol = "UDTFstring"
type = "StarrocksJar"
file = "${udf_url}/starrocks-jdbc%2FUDTFstring.jar";
-- result:
-- !result
-- Same symbol and jar as udtfstring, but declared as returning int; the query using it is recorded with a type-mismatch error.
CREATE TABLE FUNCTION udtfstring_wrong_match(string)
RETURNS int
symbol = "UDTFstring"
type = "StarrocksJar"
file = "${udf_url}/starrocks-jdbc%2FUDTFstring.jar";
-- result:
-- !result
-- Table functions for the int, bigint, float and double types follow, one per type.
CREATE TABLE FUNCTION udtfint(int)
RETURNS int
symbol = "UDTFint"
type = "StarrocksJar"
file = "${udf_url}/starrocks-jdbc%2FUDTFint.jar";
-- result:
-- !result
CREATE TABLE FUNCTION udtfbigint(bigint)
RETURNS bigint
symbol = "UDTFbigint"
type = "StarrocksJar"
file = "${udf_url}/starrocks-jdbc%2FUDTFbigint.jar";
-- result:
-- !result
CREATE TABLE FUNCTION udtffloat(float)
RETURNS float
symbol = "UDTFfloat"
type = "StarrocksJar"
file = "${udf_url}/starrocks-jdbc%2FUDTFfloat.jar";
-- result:
-- !result
CREATE TABLE FUNCTION udtfdouble(double)
RETURNS double
symbol = "UDTFdouble"
type = "StarrocksJar"
file = "${udf_url}/starrocks-jdbc%2FUDTFdouble.jar";
-- result:
-- !result
-- Fixture table read by the UDTF and aggregate queries of this case.
CREATE TABLE `t0` (
`c0` int(11) NULL COMMENT "",
`c1` varchar(20) NULL COMMENT "",
`c2` varchar(200) NULL COMMENT "",
`c3` int(11) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`c0`, `c1`)
COMMENT "OLAP"
PROPERTIES (
"replication_num" = "1"
);
-- result:
-- !result
-- Load the series 1..40960; all four columns receive the same generate_series value.
insert into t0 SELECT generate_series, generate_series, generate_series, generate_series FROM TABLE(generate_series(1, 40960));
-- result:
-- !result
-- Each table function is applied to t0.c1; the recorded count 81920 is twice the 40960 rows loaded into t0.
select count(udtfstring) from t0, udtfstring(c1);
-- result:
81920
-- !result
-- The function declared with a mismatched return type is recorded as failing with error 1064.
select count(udtfstring_wrong_match) from t0, udtfstring_wrong_match(c1);
-- result:
E: (1064, 'Type not matched, expect class java.lang.Integer, but got class java.lang.String')
-- !result
select count(udtfint) from t0, udtfint(c1);
-- result:
81920
-- !result
select count(udtfbigint) from t0, udtfbigint(c1);
-- result:
81920
-- !result
select count(udtffloat) from t0, udtffloat(c1);
-- result:
81920
-- !result
select count(udtfdouble) from t0, udtfdouble(c1);
-- result:
81920
-- !result
-- UTF-8 cases for udtfstring: the empty string is recorded with no output rows.
select * from TABLE(udtfstring(""));
-- result:
-- !result
-- Each non-empty multibyte input below is recorded as two output rows identical to the input.
select * from TABLE(udtfstring("▁▂▃▄▅▆▇█"));
-- result:
▁▂▃▄▅▆▇█
▁▂▃▄▅▆▇█
-- !result
select * from TABLE(udtfstring("中文测试"));
-- result:
中文测试
中文测试
-- !result
select * from TABLE(udtfstring("∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა"));
-- result:
∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
-- !result
-- Input mixing single-byte ASCII with multibyte characters.
select * from TABLE(udtfstring("2H₂ + O₂ ⇌ 2H₂O"));
-- result:
2H₂ + O₂ ⇌ 2H₂O
2H₂ + O₂ ⇌ 2H₂O
-- !result
select * from TABLE(udtfstring("ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ"));
-- result:
ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
-- !result
-- Group-by with limit under forced streaming pre-aggregation.
set streaming_preaggregation_mode="force_streaming";
-- result:
-- !result
-- delta is sum(c3) minus the Java sumbigint(c3) per group; the recorded row shows a zero total over 10 groups.
select sum(delta), count(*), count(delta) from (select (sum(c3) - sumbigint(c3)) as delta from t0 group by c0,c1 limit 10) tb;
-- result:
0	10	10
-- !result
-- Group-by with spill forced: pre-aggregation mode goes back to auto, then spill is enabled and forced.
set streaming_preaggregation_mode="auto";
-- result:
-- !result
set enable_spill=true;
-- result:
-- !result
set spill_mode="force";
-- result:
-- !result
-- Same delta check without a limit; the recorded row shows a zero total over 40960 groups.
select sum(delta), count(*), count(delta) from (select (sum(c3) - sumbigint(c3)) as delta from t0 group by c0,c1) tb;
-- result:
0	40960	40960
-- !result
87 changes: 87 additions & 0 deletions test/sql/test_udf/T/test_jvm_udf
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
-- name: test_jvm_udf

-- Session setup: turn on group execution before any UDF is registered.
SET enable_group_execution = true;

-- Java aggregate UDF, compared against sum() in the aggregation queries of this case.
CREATE AGGREGATE FUNCTION sumbigint(bigint)
    RETURNS bigint
    symbol = "Sumbigint"
    type = "StarrocksJar"
    file = "${udf_url}/starrocks-jdbc%2FSumbigint.jar";

-- Java table function: string in, string rows out.
CREATE TABLE FUNCTION udtfstring(string)
    RETURNS string
    symbol = "UDTFstring"
    type = "StarrocksJar"
    file = "${udf_url}/starrocks-jdbc%2FUDTFstring.jar";

-- Reuses the UDTFstring symbol but declares an int return type.
CREATE TABLE FUNCTION udtfstring_wrong_match(string)
    RETURNS int
    symbol = "UDTFstring"
    type = "StarrocksJar"
    file = "${udf_url}/starrocks-jdbc%2FUDTFstring.jar";

-- Table functions for the numeric types.
CREATE TABLE FUNCTION udtfint(int)
    RETURNS int
    symbol = "UDTFint"
    type = "StarrocksJar"
    file = "${udf_url}/starrocks-jdbc%2FUDTFint.jar";

CREATE TABLE FUNCTION udtfbigint(bigint)
    RETURNS bigint
    symbol = "UDTFbigint"
    type = "StarrocksJar"
    file = "${udf_url}/starrocks-jdbc%2FUDTFbigint.jar";

CREATE TABLE FUNCTION udtffloat(float)
    RETURNS float
    symbol = "UDTFfloat"
    type = "StarrocksJar"
    file = "${udf_url}/starrocks-jdbc%2FUDTFfloat.jar";

CREATE TABLE FUNCTION udtfdouble(double)
    RETURNS double
    symbol = "UDTFdouble"
    type = "StarrocksJar"
    file = "${udf_url}/starrocks-jdbc%2FUDTFdouble.jar";


-- Fixture table shared by the UDTF and aggregate queries below.
CREATE TABLE `t0` (
    `c0` int(11) NULL COMMENT "",
    `c1` varchar(20) NULL COMMENT "",
    `c2` varchar(200) NULL COMMENT "",
    `c3` int(11) NULL COMMENT ""
) ENGINE=OLAP
DUPLICATE KEY(`c0`, `c1`)
COMMENT "OLAP"
PROPERTIES (
    "replication_num" = "1"
);

-- Fill t0 with the series 1..40960, using the same value for every column.
INSERT INTO t0
SELECT
    generate_series,
    generate_series,
    generate_series,
    generate_series
FROM TABLE(generate_series(1, 40960));

-- UDTF cases: apply each table function to t0.c1 and count the produced rows.
SELECT count(udtfstring)
FROM t0, udtfstring(c1);
-- Declared return type does not match what the Java class produces.
SELECT count(udtfstring_wrong_match)
FROM t0, udtfstring_wrong_match(c1);
SELECT count(udtfint)
FROM t0, udtfint(c1);
SELECT count(udtfbigint)
FROM t0, udtfbigint(c1);
SELECT count(udtffloat)
FROM t0, udtffloat(c1);
SELECT count(udtfdouble)
FROM t0, udtfdouble(c1);
-- UDTF with UTF-8 input: empty string first, then multibyte literals.
SELECT * FROM TABLE(udtfstring(""));
SELECT * FROM TABLE(udtfstring("▁▂▃▄▅▆▇█"));
SELECT * FROM TABLE(udtfstring("中文测试"));
SELECT * FROM TABLE(udtfstring("∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა"));
-- ASCII mixed with multibyte characters.
SELECT * FROM TABLE(udtfstring("2H₂ + O₂ ⇌ 2H₂O"));
SELECT * FROM TABLE(udtfstring("ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ"));


-- Group-by with limit, under forced streaming pre-aggregation.
-- delta is the built-in sum(c3) minus the Java sumbigint(c3) for each group.
SET streaming_preaggregation_mode = "force_streaming";
SELECT sum(delta), count(*), count(delta)
FROM (
    SELECT (sum(c3) - sumbigint(c3)) AS delta
    FROM t0
    GROUP BY c0, c1
    LIMIT 10
) tb;

-- Group-by with spill forced; pre-aggregation mode goes back to auto first.
SET streaming_preaggregation_mode = "auto";
SET enable_spill = true;
SET spill_mode = "force";

SELECT sum(delta), count(*), count(delta)
FROM (
    SELECT (sum(c3) - sumbigint(c3)) AS delta
    FROM t0
    GROUP BY c0, c1
) tb;

0 comments on commit 85650a2

Please sign in to comment.