Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HIVE-27370: support 4 bytes characters #5624

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ public StringSubstrColStart() {
* @param len length of the bytes the string holds in the byte array
* @param substrStart the Start index for the substring operation
*/
static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
public static int getSubstrStartOffset(byte[] utf8String, int start, int len, int substrStart) {
int end = start + len;

if (substrStart < 0) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ public StringSubstrColStartLen() {
* @param substrLen the length of the substring
* @param offsetArray the array that indexes are populated to. Assume its length >= 2.
*/
static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
public static void populateSubstrOffsets(byte[] utf8String, int start, int len, int substrStart,
int substrLength, int[] offsetArray) {
int curIdx = -1;
offsetArray[0] = -1;
Expand Down
112 changes: 74 additions & 38 deletions ql/src/java/org/apache/hadoop/hive/ql/udf/UDFSubstr.java
Original file line number Diff line number Diff line change
Expand Up @@ -96,53 +96,51 @@ private Text evaluateInternal(Text t, int pos, int len) {
return r;
}

String s = t.toString();
int[] index = makeIndex(pos, len, s.length());
if (index == null) {
byte[] utf8String = t.toString().getBytes();
StringSubstrColStartLen.populateSubstrOffsets(utf8String, 0, utf8String.length, craetePos(pos), len, index);
if (index[0] == -1) {
return r;
}

r.set(s.substring(index[0], index[1]));
r.set(new String(utf8String, index[0], index[1]));
return r;
}

private int[] makeIndex(int pos, int len, int inputLen) {
if ((Math.abs(pos) > inputLen)) {
return null;
}

int start, end;
private Text evaluateInternal(Text t, int pos) {
r.clear();

if (pos > 0) {
start = pos - 1;
} else if (pos < 0) {
start = inputLen + pos;
} else {
start = 0;
byte[] utf8String = t.toString().getBytes();
int offset = StringSubstrColStart.getSubstrStartOffset(utf8String, 0, utf8String.length, craetePos(pos));
if (offset == -1) {
return r;
}

if ((inputLen - start) < len) {
end = inputLen;
} else {
end = start + len;
}
index[0] = start;
index[1] = end;
return index;
r.set(new String(utf8String, offset, utf8String.length - offset));
return r;
}

private final IntWritable maxValue = new IntWritable(Integer.MAX_VALUE);

// Even though we are using longs, substr can only deal with ints, so we use
// the maximum int value as the maxValue
private final LongWritable maxLongValue = new LongWritable(Integer.MAX_VALUE);

public Text evaluate(Text s, IntWritable pos) {
return evaluate(s, pos, maxValue);
if ((s == null) || (pos == null)) {
return null;
}

return evaluateInternal(s, pos.get());
}

public Text evaluate(Text s, LongWritable pos) {
return evaluate(s, pos, maxLongValue);
if ((s == null) || (pos == null)) {
return null;
}

long longPos = pos.get();
// If an unsupported value is seen, we don't want to return a string
// that doesn't match what the user expects, so we return NULL (still
// unexpected, of course, but probably better than a bad string).
if (longPos > Integer.MAX_VALUE || longPos < Integer.MIN_VALUE) {
return null;
}

return evaluateInternal(s, (int) pos.get());
}

public BytesWritable evaluate(BytesWritable bw, LongWritable pos, LongWritable len) {
Expand Down Expand Up @@ -172,32 +170,70 @@ public BytesWritable evaluate(BytesWritable bw, IntWritable pos, IntWritable len
}

private BytesWritable evaluateInternal(BytesWritable bw, int pos, int len) {

if (len <= 0) {
return new BytesWritable();
}

int[] index = makeIndex(pos, len, bw.getLength());
if (index == null) {
byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength());
StringSubstrColStartLen.populateSubstrOffsets(b, 0, b.length, craetePos(pos), len, index);
if (index[0] == -1) {
return new BytesWritable();
}

return new BytesWritable(Arrays.copyOfRange(bw.getBytes(), index[0], index[1]));
return new BytesWritable(arrayCopy(b, index[0], index[1]));
}

private BytesWritable evaluateInternal(BytesWritable bw, int pos) {
byte[] b = Arrays.copyOf(bw.getBytes(), bw.getLength());
int offset = StringSubstrColStart.getSubstrStartOffset(b, 0, b.length, craetePos(pos));

if (offset == -1) {
return new BytesWritable();
}

return new BytesWritable(arrayCopy(b, offset, bw.getLength() - offset));
}

public BytesWritable evaluate(BytesWritable bw, IntWritable pos){
return evaluate(bw, pos, maxValue);
if ((bw == null) || (pos == null)) {
return null;
}
return evaluateInternal(bw, pos.get());
}

public BytesWritable evaluate(BytesWritable bw, LongWritable pos){
return evaluate(bw, pos, maxLongValue);
if ((bw == null) || (pos == null)) {
return null;
}

return evaluateInternal(bw, (int) pos.get());
}

@Override
public StatEstimator getStatEstimator() {
return new SubStrStatEstimator();
}

private byte[] arrayCopy(byte[] src, int pos, int len) {
byte[] b = new byte[len];

int copyIdx = 0;
for (int srcIdx = pos; copyIdx < len; srcIdx++) {
b[copyIdx] = src[srcIdx];
copyIdx++;
}

return b;
}

private int craetePos(int pos) {
if (pos <= 0) {
return pos;
}

return pos - 1;
}

private static class SubStrStatEstimator implements StatEstimator {

@Override
Expand Down
8 changes: 8 additions & 0 deletions ql/src/test/queries/clientpositive/udf_substr.q
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,11 @@ FROM src tablesample (1 rows);
SELECT
substr('ABC', cast(2147483649 as bigint))
FROM src tablesample (1 rows);

--test 4-byte charactor
set hive.vectorized.execution.enabled=false;
SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows);
17 changes: 17 additions & 0 deletions ql/src/test/results/clientpositive/llap/udf_substr.q.out
Original file line number Diff line number Diff line change
Expand Up @@ -240,3 +240,20 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
NULL
PREHOOK: query: SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows)
PREHOOK: type: QUERY
PREHOOK: Input: default@src
#### A masked pattern was here ####
POSTHOOK: query: SELECT
substr('あa🤎いiうu', 1, 3) as b1,
substr('あa🤎いiうu', 3) as b2,
substr('あa🤎いiうu', -5) as b3
FROM src tablesample (1 rows)
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src
#### A masked pattern was here ####
あa🤎 🤎いiうu 🤎いiうu
Loading