diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java index 3d7742bf1407..146cb0709567 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStart.java @@ -75,7 +75,7 @@ public StringSubstrColStart() { * @param len length of the bytes the string holds in the byte array * @param substrStart the Start index for the substring operation */ - static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) { + public static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) { int end = start + len; if (substrStart < 0) { diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java index 7ef8552123f6..957c46ea6b1b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/vector/expressions/StringSubstrColStartLen.java @@ -83,10 +83,10 @@ public StringSubstrColStartLen() { * @param start start offset of the byte array the string starts at * @param len length of the bytes the string holds in the byte array * @param substrStart the Start index for the substring operation - * @param substrLen the length of the substring - * @param offsetArray the array that indexes are populated to. Assume its length >= 2. + * @param substrLength the length of the substring + * @param offsetArray the array that indexes are populated to. Assume its {@literal length >= 2}. */ - static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart, + public static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart, int substrLength, int[] offsetArray) { int curIdx = -1; offsetArray[0] = -1; diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java index 7c6de37c8073..419f65778a5c 100755 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java @@ -96,53 +96,51 @@ private Text evaluateInternal(Text t, int pos, int len) { return r; } - String s = t.toString(); - int[] index = makeIndex(pos, len, s.length()); - if (index == null) { + byte[] utf8String = t.toString().getBytes(); + StringSubstrColStartLen.populateSubstrOffsets(utf8String, 0, utf8String.length, craetePos(pos), len, index); + if (index[0] == -1) { return r; } - r.set(s.substring(index[0], index[1])); + r.set(new String(utf8String, index[0], index[1])); return r; } - private int[] makeIndex(int pos, int len, int inputLen) { - if ((Math.abs(pos) > inputLen)) { - return null; - } - - int start, end; + private Text evaluateInternal(Text t, int pos) { + r.clear(); - if (pos > 0) { - start = pos - 1; - } else if (pos < 0) { - start = inputLen + pos; - } else { - start = 0; + byte[] utf8String = t.toString().getBytes(); + int offset = StringSubstrColStart.getSubstrStartOffset(utf8String, 0, utf8String.length, craetePos(pos)); + if (offset == -1) { + return r; } - if ((inputLen - start) < len) { - end = inputLen; - } else { - end = start + len; - } - index[0] = start; - index[1] = end; - return index; + r.set(new String(utf8String, offset, utf8String.length - offset)); + return r; } - private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE); - - // Even though we are using longs, substr can only deal with ints, so we use - // the maximum int value as the maxValue - private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE); - public Text evaluate(Text s, IntWritable pos) { - return evaluate(s, pos, maxValue); + if ((s == null) || (pos == null)) { + return null; + } + + return evaluateInternal(s, pos.get()); } public Text evaluate(Text s, LongWritable pos) { - return evaluate(s, pos, maxLongValue); + if ((s == null) || (pos == null)) { + return null; + } + + long longPos = pos.get(); + // If an unsupported value is seen, we don't want to return a string + // that doesn't match what the user expects, so we return NULL (still + // unexpected, of course, but probably better than a bad string). + if (longPos > Integer.MAX_VALUE || longPos < Integer.MIN_VALUE) { + return null; + } + + return evaluateInternal(s, (int) pos.get()); } public BytesWritable evaluate(BytesWritable bw, LongWritable pos, LongWritable len) { @@ -172,25 +170,42 @@ public BytesWritable evaluate(BytesWritable bw, IntWritable pos, IntWritable len } private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) { - if (len <= 0) { return new BytesWritable(); } - int[] index = makeIndex(pos, len, bw.getLength()); - if (index == null) { + byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength()); + StringSubstrColStartLen.populateSubstrOffsets(b, 0, b.length, craetePos(pos), len, index); + if (index[0] == -1) { return new BytesWritable(); } - return new BytesWritable(Arrays.copyOfRange(bw.getBytes(), index[0], index[1])); + return new BytesWritable(arrayCopy(b, index[0], index[1])); + } + + private BytesWritable evaluateInternal(BytesWritable bw, int pos) { + byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength()); + int offset = StringSubstrColStart.getSubstrStartOffset(b, 0, b.length, craetePos(pos)); + if (offset == -1) { + return new BytesWritable(); + } + + return new BytesWritable(arrayCopy(b, offset, bw.getLength() - offset)); } public BytesWritable evaluate(BytesWritable bw, IntWritable pos){ - return evaluate(bw, pos, maxValue); + if ((bw == null) || (pos == null)) { + return null; + } + return evaluateInternal(bw, pos.get()); } public BytesWritable evaluate(BytesWritable bw, LongWritable pos){ - return evaluate(bw, pos, maxLongValue); + if ((bw == null) || (pos == null)) { + return null; + } + + return evaluateInternal(bw, (int) pos.get()); } @Override @@ -198,6 +213,26 @@ public StatEstimator getStatEstimator() { return new SubStrStatEstimator(); } + private byte[] arrayCopy(byte[] src, int pos, int len) { + byte[] b = new byte[len]; + + int copyIdx = 0; + for (int srcIdx = pos; copyIdx < len; srcIdx++) { + b[copyIdx] = src[srcIdx]; + copyIdx++; + } + + return b; + } + + private int craetePos(int pos) { + if (pos <= 0) { + return pos; + } + + return pos - 1; + } + private static class SubStrStatEstimator implements StatEstimator { @Override diff --git a/ql/src/test/queries/clientpositive/udf_substr.q b/ql/src/test/queries/clientpositive/udf_substr.q index a609536f37e5..f1ea68d99c5b 100644 --- a/ql/src/test/queries/clientpositive/udf_substr.q +++ b/ql/src/test/queries/clientpositive/udf_substr.q @@ -89,3 +89,11 @@ FROM src tablesample (1 rows); SELECT substr('ABC', cast(2147483649 as bigint)) FROM src tablesample (1 rows); + +--test 4-byte charactor +set hive.vectorized.execution.enabled=false; +SELECT + substr('あa🤎いiうu', 1, 3) as b1, + substr('あa🤎いiうu', 3) as b2, + substr('あa🤎いiうu', -5) as b3 +FROM src tablesample (1 rows); diff --git a/ql/src/test/results/clientpositive/llap/udf_substr.q.out b/ql/src/test/results/clientpositive/llap/udf_substr.q.out index 9ffa39b03356..ebf541a31e6d 100644 --- a/ql/src/test/results/clientpositive/llap/udf_substr.q.out +++ b/ql/src/test/results/clientpositive/llap/udf_substr.q.out @@ -240,3 +240,20 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@src #### A masked pattern was here #### NULL +PREHOOK: query: SELECT + substr('あa🤎いiうu', 1, 3) as b1, + substr('あa🤎いiうu', 3) as b2, + substr('あa🤎いiうu', -5) as b3 +FROM src tablesample (1 rows) +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT + substr('あa🤎いiうu', 1, 3) as b1, + substr('あa🤎いiうu', 3) as b2, + substr('あa🤎いiうu', -5) as b3 +FROM src tablesample (1 rows) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +あa🤎 🤎いiうu 🤎いiうu