Skip to content

Commit

Permalink
Initial unicode character support for identifiers and whitespace
Browse files Browse the repository at this point in the history
Summary:

Test Plan:
Added a test

Reviewers:

Subscribers:

Tasks:

Tags:
  • Loading branch information
kaikalur committed Jan 10, 2023
1 parent 19f8399 commit e9f30c0
Show file tree
Hide file tree
Showing 7 changed files with 25,375 additions and 5 deletions.
2 changes: 1 addition & 1 deletion parser/cpp/prepare-javacc-grammar.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ pwd
GRAMMAR_DIR='../grammar'
GEN_DIR='target/generated-sources/javacc'
mkdir -p $GEN_DIR
cat ./javacc-options.txt $GRAMMAR_DIR/nonreservedwords.txt $GRAMMAR_DIR/reservedwords.txt $GRAMMAR_DIR/sql-spec.txt $GRAMMAR_DIR/presto-extensions.txt $GRAMMAR_DIR/lexical-elements.txt > $GEN_DIR/parser_tmp.jjt
cat ./javacc-options.txt $GRAMMAR_DIR/nonreservedwords.txt $GRAMMAR_DIR/reservedwords.txt $GRAMMAR_DIR/sql-spec.txt $GRAMMAR_DIR/presto-extensions.txt $GRAMMAR_DIR/unicode-identifier-start.txt $GRAMMAR_DIR/unicode-identifier-extend.txt $GRAMMAR_DIR/ws.txt $GRAMMAR_DIR/lexical-elements.txt > $GEN_DIR/parser_tmp.jjt
15 changes: 12 additions & 3 deletions parser/grammar/lexical-elements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,14 @@ regular_identifier()

| <#identifier_part: <identifier_start> | <identifier_extend> >

| <#identifier_start: ["a"-"z"] // temp
| <#identifier_start: (<UnicodeIdentifierStart>)
/*!! See the Syntax Rules.*/
/* Unicode char classes: <Ll> | <Lm> | <Lo> | <Lt> | <Lu> | <Nl> */
>

| <#identifier_extend: ["\u00B7", "0"-"9", "_"] // temp
| <#identifier_extend: ["\u00B7"] | <UnicodeIdentifierExtend>
//!! See the Syntax Rules.
/* Unicode char classes: <Mn>, <Mc>, <Nd>, <Pc>, <Cf> */
>

| <large_object_length_token: ( <digit> )+ <multiplier> >
Expand Down Expand Up @@ -150,7 +152,14 @@ delimiter_token:

SPECIAL_TOKEN:
{
<white_space: <newline> | [ " ", "\t" ] // temp
<white_space: (<UnicodeWhiteSpace> |
[ "\u0009" // Horizontal Tabulation
, "\n" // "\\u000A" //, Line Feed
, "\u000B" // Vertical Tabulation
, "\f" // "\\u000C" //, Form Feed
, "\r" // "\\u000D" //, Carriage Return
, "\u0085" // Next Line
])
//!! See the Syntax Rules.
>

Expand Down
2 changes: 1 addition & 1 deletion parser/grammar/prepare-javacc-grammar.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Concatenate all the fragments into a .jj file.
gendir='../target/generated-sources/javacc'
mkdir -p $gendir
cat javacc-options-java.txt nonreservedwords.txt reservedwords.txt sql-spec.txt presto-extensions.txt lexical-elements.txt > $gendir/parser_tmp.jjt
cat javacc-options-java.txt nonreservedwords.txt reservedwords.txt sql-spec.txt presto-extensions.txt unicode-identifier-start.txt unicode-identifier-extend.txt ws.txt lexical-elements.txt > $gendir/parser_tmp.jjt
3,316 changes: 3,316 additions & 0 deletions parser/grammar/unicode-identifier-extend.txt

Large diffs are not rendered by default.

22,012 changes: 22,012 additions & 0 deletions parser/grammar/unicode-identifier-start.txt

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions parser/grammar/ws.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Unicode separator characters, grouped by general category (Zl, Zp, Zs).
// Every token below is private (#): none of them is emitted on its own, they
// only serve as building blocks for other lexical rules.
TOKEN:
{
  // Zl -- line separator
  <#Zl: ["\u2028"]>                 // LINE SEPARATOR

  // Zp -- paragraph separator
| <#Zp: ["\u2029"]>                 // PARAGRAPH SEPARATOR

  // Zs -- space separators
| <#Zs: [
    "\u0020"                        // SPACE
  , "\u00A0"                        // NO-BREAK SPACE
  , "\u1680"                        // OGHAM SPACE MARK
  , "\u2000"-"\u200A"               // EN QUAD, EM QUAD, EN SPACE, EM SPACE,
                                    // THREE-/FOUR-/SIX-PER-EM SPACE, FIGURE SPACE,
                                    // PUNCTUATION SPACE, THIN SPACE, HAIR SPACE
  , "\u202F"                        // NARROW NO-BREAK SPACE
  , "\u205F"                        // MEDIUM MATHEMATICAL SPACE
  , "\u3000"                        // IDEOGRAPHIC SPACE
  ]>

  // Union of the three separator categories above.
| <#UnicodeWhiteSpace: <Zl> | <Zp> | <Zs> >
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ public class TestSqlParser
"SELECT f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f())))))))))))))))))))))))))))));",
"SELECT abs, 2 as abs;",
"SELECT sqrt(x), power(y, 5), myFunction('a') FROM T;",
"SELECT 1 ఒకటి;",
};

private AstNode parse(String sql)
Expand Down

0 comments on commit e9f30c0

Please sign in to comment.