Skip to content

HIVE-27370: support 4 bytes characters #5624

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public StringSubstrColStart() {
* @param len length of the bytes the string holds in the byte array
* @param substrStart the Start index for the substring operation
*/
static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
public static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
int end = start + len;

if (substrStart < 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,10 @@ public StringSubstrColStartLen() {
* @param start start offset of the byte array the string starts at
* @param len length of the bytes the string holds in the byte array
* @param substrStart the Start index for the substring operation
* @param substrLen the length of the substring
* @param offsetArray the array that indexes are populated to. Assume its length >= 2.
* @param substrLength the length of the substring
* @param offsetArray the array that indexes are populated to. Assume its {@literal length >= 2}.
*/
static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
public static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
int substrLength, int[] offsetArray) {
int curIdx = -1;
offsetArray[0] = -1;
Expand Down
64 changes: 44 additions & 20 deletions ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
Original file line number Diff line number Diff line change
Expand Up @@ -96,13 +96,12 @@ private Text evaluateInternal(Text t, int pos, int len) {
return r;
}

String s = t.toString();
int[] index = makeIndex(pos, len, s.length());
if (index == null) {
StringSubstrColStartLen.populateSubstrOffsets(t.getBytes(), 0, t.getLength(), adjustStartPos(pos), len, index);
if (index[0] == -1) {
return r;
}

r.set(s.substring(index[0], index[1]));
r.set(t.getBytes(), index[0], index[1]);
return r;
}

Expand Down Expand Up @@ -133,16 +132,44 @@ private int[] makeIndex(int pos, int len, int inputLen) {

private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE);

// Even though we are using longs, substr can only deal with ints, so we use
//Even though we are using longs, substr can only deal with ints, so we use
// the maximum int value as the maxValue
private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE);

private Text evaluateInternal(Text t, int pos) {
r.clear();

int offset = StringSubstrColStart.getSubstrStartOffset(t.getBytes(), 0, t.getLength(), adjustStartPos(pos));
if (offset == -1) {
return r;
}

r.set(t.getBytes(), offset, t.getLength() - offset);
return r;
}

public Text evaluate(Text s, IntWritable pos) {
return evaluate(s, pos, maxValue);
if ((s == null) || (pos == null)) {
return null;
}

return evaluateInternal(s, pos.get());
}

public Text evaluate(Text s, LongWritable pos) {
return evaluate(s, pos, maxLongValue);
if ((s == null) || (pos == null)) {
return null;
}

long longPos = pos.get();
// If an unsupported value is seen, we don't want to return a string
// that doesn't match what the user expects, so we return NULL (still
// unexpected, of course, but probably better than a bad string).
if (longPos > Integer.MAX_VALUE || longPos < Integer.MIN_VALUE) {
return null;
}

return evaluateInternal(s, (int) pos.get());
}

public BytesWritable evaluate(BytesWritable bw, LongWritable pos, LongWritable len) {
Expand Down Expand Up @@ -198,6 +225,14 @@ public StatEstimator getStatEstimator() {
return new SubStrStatEstimator();
}

private int adjustStartPos(int pos) {
if (pos <= 0) {
return pos;
}

return pos - 1;
}

private static class SubStrStatEstimator implements StatEstimator {

@Override
Expand All @@ -208,7 +243,6 @@ public Optional<ColStatistics> estimate(List<ColStatistics> csList) {
// 99 rows with 0 length
// orig avg is 10
// new avg is 5 (if substr(5)) ; but in reality it will stay ~10
Optional<Double> start = getRangeWidth(csList.get(1).getRange());
Range startRange = csList.get(1).getRange();
if (startRange != null && startRange.minValue != null) {
double newAvgColLen = cs.getAvgColLen() - startRange.minValue.doubleValue();
Expand All @@ -219,23 +253,13 @@ public Optional<ColStatistics> estimate(List<ColStatistics> csList) {
if (csList.size() > 2) {
Range lengthRange = csList.get(2).getRange();
if (lengthRange != null && lengthRange.maxValue != null) {
Double w = lengthRange.maxValue.doubleValue();
double w = lengthRange.maxValue.doubleValue();
if (cs.getAvgColLen() > w) {
cs.setAvgColLen(w);
}
}
}
return Optional.of(cs);
}

private Optional<Double> getRangeWidth(Range range) {
if (range != null) {
if (range.minValue != null && range.maxValue != null) {
return Optional.of(range.maxValue.doubleValue() - range.minValue.doubleValue());
}
}
return Optional.empty();
}

}
}
}
8 changes: 8 additions & 0 deletions ql/src/test/queries/clientpositive/udf_substr.q
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,11 @@ FROM src tablesample (1 rows);
SELECT
substr('ABC', cast(2147483649 as bigint))
FROM src tablesample (1 rows);

--test 4-byte charactor
set hive.vectorized.execution.enabled=false;
SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I verified the master branch can't pass this test case

+POSTHOOK: query: SELECT
+  substr('あa🤎いiうu', 1, 3) as b1,
+  substr('あa🤎いiうu', 3) as b2,
+  substr('あa🤎いiうu', -5) as b3
+FROM src tablesample (1 rows)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+#### A masked pattern was here ####
+あa?   🤎いiうu        ?いiうu

17 changes: 17 additions & 0 deletions ql/src/test/results/clientpositive/llap/udf_substr.q.out
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,20 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
NULL
PREHOOK: query: SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows)
PREHOOK: type: QUERY
PREHOOK: Input: default@src
#### A masked pattern was here ####
POSTHOOK: query: SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows)
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
あa🤎 🤎いiうu 🤎いiうu
Loading