Skip to content

Commit

Permalink
[BugFix] Fix wrong UDTF result for strings containing multibyte UTF-8 characters (#51232)
Browse files Browse the repository at this point in the history
Signed-off-by: stdpain <[email protected]>
(cherry picked from commit 266bfc8)
  • Loading branch information
stdpain authored and mergify[bot] committed Sep 20, 2024
1 parent 6eb366f commit 041793a
Show file tree
Hide file tree
Showing 4 changed files with 40 additions and 8 deletions.
11 changes: 4 additions & 7 deletions be/src/udf/java/java_udf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -406,14 +406,11 @@ jobject JVMFunctionHelper::newString(const char* data, size_t size) {
return nstr;
}

// Returns the value JNI's GetStringUTFLength reports for `jstr`: the size in
// bytes of its modified UTF-8 encoding. Note that this is NOT the number of
// characters (UTF-16 code units) in the string; the two differ as soon as the
// string contains non-ASCII text.
size_t JVMFunctionHelper::string_length(jstring jstr) {
    const auto utf8_byte_count = _env->GetStringUTFLength(jstr);
    return utf8_byte_count;
}

// Copies the contents of `jstr` into `*buffer`, encoded as modified UTF-8, and
// returns a Slice that views the bytes now held by `*buffer`.
//
// JNI exposes two different lengths for a Java string and they must not be
// mixed up:
//   - GetStringUTFLength: size in bytes of the modified UTF-8 encoding. This is
//     how large the destination buffer has to be.
//   - GetStringLength: number of UTF-16 code units. This is the unit in which
//     GetStringUTFRegion expects its `start` / `len` arguments.
// The two values only coincide for pure-ASCII strings. Passing the byte length
// as the region length asks for more code units than the string has, which
// fails for multibyte text (StringIndexOutOfBoundsException) and yields wrong
// results.
//
// @param jstr   Java string to convert.
// @param buffer Destination storage; resized to the UTF-8 byte length and
//               overwritten. The returned Slice points into this buffer, so it
//               is only usable while `*buffer` is alive and unmodified.
// @return Slice over {buffer->data(), buffer->length()}.
Slice JVMFunctionHelper::sliceVal(jstring jstr, std::string* buffer) {
    // Size the destination by the encoded byte length.
    const size_t utf_length = _env->GetStringUTFLength(jstr);
    buffer->resize(utf_length);

    // Request the whole string, measured in UTF-16 code units.
    const size_t string_length = _env->GetStringLength(jstr);
    _env->GetStringUTFRegion(jstr, 0, string_length, buffer->data());

    return {buffer->data(), buffer->length()};
}

Expand Down
1 change: 0 additions & 1 deletion be/src/udf/java/java_udf.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,6 @@ class JVMFunctionHelper {
DECLARE_NEW_BOX(double, double, Double)

jobject newString(const char* data, size_t size);
size_t string_length(jstring jstr);

Slice sliceVal(jstring jstr, std::string* buffer);
jclass string_clazz() { return _string_class; }
Expand Down
28 changes: 28 additions & 0 deletions test/sql/test_udf/R/test_jvm_udf
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,34 @@ select count(udtfdouble) from t0, udtfdouble(c1);
-- result:
81920
-- !result
select * from TABLE(udtfstring(""));
-- result:
-- !result
select * from TABLE(udtfstring("▁▂▃▄▅▆▇█"));
-- result:
▁▂▃▄▅▆▇█
▁▂▃▄▅▆▇█
-- !result
select * from TABLE(udtfstring("中文测试"));
-- result:
中文测试
中文测试
-- !result
select * from TABLE(udtfstring("∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა"));
-- result:
∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა
-- !result
select * from TABLE(udtfstring("2H₂ + O₂ ⇌ 2H₂O"));
-- result:
2H₂ + O₂ ⇌ 2H₂O
2H₂ + O₂ ⇌ 2H₂O
-- !result
select * from TABLE(udtfstring("ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ"));
-- result:
ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ
-- !result
set streaming_preaggregation_mode="force_streaming";
-- result:
-- !result
Expand Down
8 changes: 8 additions & 0 deletions test/sql/test_udf/T/test_jvm_udf
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ select count(udtfint) from t0, udtfint(c1);
select count(udtfbigint) from t0, udtfbigint(c1);
select count(udtffloat) from t0, udtffloat(c1);
select count(udtfdouble) from t0, udtfdouble(c1);
-- test UDTF with empty and multibyte UTF-8 string arguments
select * from TABLE(udtfstring(""));
select * from TABLE(udtfstring("▁▂▃▄▅▆▇█"));
select * from TABLE(udtfstring("中文测试"));
select * from TABLE(udtfstring("∀∂∈ℝ∧∪≡∞ ↑↗↨↻⇣ ┐┼╔╘░►☺♀ fi�⑀₂ἠḂӥẄɐː⍎אԱა"));
select * from TABLE(udtfstring("2H₂ + O₂ ⇌ 2H₂O"));
select * from TABLE(udtfstring("ᚻᛖ ᚳᚹᚫᚦ ᚦᚫᛏ ᚻᛖ ᛒᚢᛞᛖ ᚩᚾ ᚦᚫᛗ ᛚᚪᚾᛞᛖ ᚾᚩᚱᚦᚹᛖᚪᚱᛞᚢᛗ ᚹᛁᚦ ᚦᚪ ᚹᛖᛥᚫ"));


-- test group by limit case:
set streaming_preaggregation_mode="force_streaming";
Expand Down

0 comments on commit 041793a

Please sign in to comment.