diff --git a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/SubstringEvaluator.java b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/SubstringEvaluator.java index 2830e5012c4..5537ab4afae 100644 --- a/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/SubstringEvaluator.java +++ b/kernel/kernel-defaults/src/main/java/io/delta/kernel/defaults/internal/expressions/SubstringEvaluator.java @@ -106,7 +106,7 @@ public String getString(int rowId) { String inputString = input.getString(rowId); int position = positionVector.getInt(rowId); Optional length = lengthVector.map(columnVector -> columnVector.getInt(rowId)); - if (position > getStringLengthWithCodePoint(inputString) + if (position > getStringLength(inputString) || (length.isPresent() && length.get() < 1)) { return ""; } @@ -119,11 +119,11 @@ public String getString(int rowId) { // e.g. Substring("aaa", -100, 95), should be read as Substring("aaa", 0, 0) int endIndex = Math.min( - getStringLengthWithCodePoint(inputString), + getStringLength(inputString), Math.max(startPosition + len, 0)); - return subStringWithCodePoint(inputString, startIndex, Optional.of(endIndex)); + return getSubstring(inputString, startIndex, Optional.of(endIndex)); }) - .orElse(subStringWithCodePoint(inputString, startIndex, Optional.empty())); + .orElse(getSubstring(inputString, startIndex, Optional.empty())); } }; } @@ -140,18 +140,19 @@ public String getString(int rowId) { private static int buildStartPosition(String inputString, int pos) { // Handles the negative position (substring("abc", -2, 1), the start position should be 1("b")) if (pos < 0) { - return getStringLengthWithCodePoint(inputString) + pos; + return getStringLength(inputString) + pos; } // Pos is 1 based and pos = 0 is treated as 1. return Math.max(pos - 1, 0); } /** Returns code point based string length for handling surrogate pairs. */ - private static int getStringLengthWithCodePoint(String s) { + private static int getStringLength(String s) { return s.codePointCount(/* beginIndex = */ 0, s.length()); } - private static String subStringWithCodePoint(String s, int start, Optional end) { + /** Returns code point based substring for handling surrogate pairs. */ + private static String getSubstring(String s, int start, Optional end) { int startIndex = s.offsetByCodePoints(/* beginIndex = */ 0, start); return end.map(e -> s.substring(startIndex, s.offsetByCodePoints(0, e))) .orElse(s.substring(startIndex));