Skip to content

Commit

Permalink
Remove JsonParserVisitor#indexes (#3597)
Browse files Browse the repository at this point in the history
The `JsonParserVisitor#indexes` array requires an additional 4 bytes of memory per code point in the source code (only when the source contains at least one surrogate pair). Instead, the parser can maintain two cursors that are kept in sync while advancing through the source code: a code point cursor and a code unit cursor. The former aligns with the indexes of the ANTLR tokens, and the latter is used to read from the underlying source string.
  • Loading branch information
knutwannheden authored Oct 3, 2023
1 parent 5675dd6 commit 05f7981
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,50 +48,14 @@ public class JsonParserVisitor extends JSON5BaseVisitor<Json> {
private final FileAttributes fileAttributes;

private int cursor = 0;
private int codePointCursor = 0;

// Whether the source has multi bytes (> 2 bytes) unicode characters
private final boolean hasMultiBytesUnicode;
// Antlr index to source index mapping
private final int[] indexes;

public JsonParserVisitor(Path path,
@Nullable FileAttributes fileAttributes,
EncodingDetectingInputStream sourceInput
) {
public JsonParserVisitor(Path path, @Nullable FileAttributes fileAttributes, EncodingDetectingInputStream source) {
this.path = path;
this.fileAttributes = fileAttributes;
this.source = sourceInput.readFully();
this.charset = sourceInput.getCharset();
this.charsetBomMarked = sourceInput.isCharsetBomMarked();

boolean hasMultiBytesUnicode = false;
int[] pos = new int[source.length() + 1];
int cursor = 0;
int i = 1;
pos[0] = 0;

while (cursor < source.length()) {
int newCursor = source.offsetByCodePoints(cursor, 1);
if (newCursor > cursor + 1) {
hasMultiBytesUnicode = true;
}
pos[i++] = newCursor;
cursor = newCursor;
}

this.hasMultiBytesUnicode = hasMultiBytesUnicode;
this.indexes = hasMultiBytesUnicode ? pos : null;
}

/**
* Characters index to source index mapping, valid only when `hasMultiBytesUnicode` is true.
* Antlr index is based on characters index and reader is based on source index.
* If there are any >2 bytes unicode characters in source code, it will make the index mismatch.
* @param index index from Antlr
* @return corrected cursor index
*/
private int getCursorIndex(int index) {
return hasMultiBytesUnicode ? indexes[index] : index;
this.source = source.readFully();
this.charset = source.getCharset();
this.charsetBomMarked = source.isCharsetBomMarked();
}

@Override
Expand Down Expand Up @@ -301,15 +265,21 @@ private Space prefix(@Nullable TerminalNode terminalNode) {
}

private Space prefix(Token token) {
int start = getCursorIndex(token.getStartIndex());
if (start < cursor) {
int start = token.getStartIndex();
if (start < codePointCursor) {
return Space.EMPTY;
}
String prefix = source.substring(cursor, start);
cursor = start;
String prefix = source.substring(cursor, advanceCursor(start));
return Space.format(prefix);
}

/**
 * Moves both cursors forward, in lockstep, until the code point cursor reaches the given index.
 * <p>
 * For every code point stepped over, {@code codePointCursor} grows by one while {@code cursor}
 * (the index used to read from {@code source}) is moved past that code point via
 * {@link String#offsetByCodePoints(int, int)}. If {@code newCodePointIndex} is not greater than
 * the current {@code codePointCursor}, neither cursor changes.
 *
 * @param newCodePointIndex the code point index to advance to
 * @return the resulting value of {@code cursor}
 */
public int advanceCursor(int newCodePointIndex) {
    while (codePointCursor < newCodePointIndex) {
        // Step over exactly one code point; the position in `source` moves with it.
        cursor = source.offsetByCodePoints(cursor, 1);
        codePointCursor++;
    }
    return cursor;
}

@Nullable
private <C extends ParserRuleContext, T> T convert(C ctx, BiFunction<C, Space, T> conversion) {
if (ctx == null) {
Expand All @@ -318,20 +288,20 @@ private <C extends ParserRuleContext, T> T convert(C ctx, BiFunction<C, Space, T

T t = conversion.apply(ctx, prefix(ctx));
if (ctx.getStop() != null) {
cursor = getCursorIndex(ctx.getStop().getStopIndex()) + (Character.isWhitespace(source.charAt(getCursorIndex(ctx.getStop().getStopIndex()))) ? 0 : 1);
advanceCursor(ctx.getStop().getStopIndex() + 1);
}

return t;
}

private <T> T convert(TerminalNode node, BiFunction<TerminalNode, Space, T> conversion) {
T t = conversion.apply(node, prefix(node));
cursor = getCursorIndex(node.getSymbol().getStopIndex()) + 1;
advanceCursor(node.getSymbol().getStopIndex() + 1);
return t;
}

private void skip(TerminalNode node) {
cursor = node.getSymbol().getStopIndex() + 1;
advanceCursor(node.getSymbol().getStopIndex() + 1);
}

/**
Expand All @@ -346,7 +316,7 @@ private Space sourceBefore(String untilDelim) {
}

String prefix = source.substring(cursor, delimIndex);
cursor += prefix.length() + untilDelim.length(); // advance past the delimiter
advanceCursor(codePointCursor + Character.codePointCount(prefix, 0, prefix.length()) + untilDelim.length());
return Space.format(prefix);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,9 @@ void multiBytesUnicode() {
json(
"""
{
"🤖" : "robot"
"🤖" : "robot",
"robot" : "🤖",
"நடித்த" : 3 /* 🇩🇪 */
}
"""
)
Expand Down

0 comments on commit 05f7981

Please sign in to comment.