Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial unicode support for identifiers and whitespace from SQL 2016 spec #41

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parser/cpp/prepare-javacc-grammar.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,4 @@ pwd
GRAMMAR_DIR='../grammar'
GEN_DIR='target/generated-sources/javacc'
mkdir -p $GEN_DIR
cat ./javacc-options.txt $GRAMMAR_DIR/nonreservedwords.txt $GRAMMAR_DIR/reservedwords.txt $GRAMMAR_DIR/sql-spec.txt $GRAMMAR_DIR/presto-extensions.txt $GRAMMAR_DIR/lexical-elements.txt > $GEN_DIR/parser_tmp.jjt
cat ./javacc-options.txt $GRAMMAR_DIR/nonreservedwords.txt $GRAMMAR_DIR/reservedwords.txt $GRAMMAR_DIR/sql-spec.txt $GRAMMAR_DIR/presto-extensions.txt $GRAMMAR_DIR/unicode-identifier-start.txt $GRAMMAR_DIR/unicode-identifier-extend.txt $GRAMMAR_DIR/ws.txt $GRAMMAR_DIR/lexical-elements.txt > $GEN_DIR/parser_tmp.jjt
15 changes: 12 additions & 3 deletions parser/grammar/lexical-elements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,12 +80,14 @@ regular_identifier()

| <#identifier_part: <identifier_start> | <identifier_extend> >

| <#identifier_start: ["a"-"z"] // temp
| <#identifier_start: (<UnicodeIdentifierStart>)
/*!! See the Syntax Rules.*/
/* Unicode char classes: <Ll> | <Lm> | <Lo> | <Lt> | <Lu> | <Nl> */
>

| <#identifier_extend: ["\u00B7", "0"-"9", "_"] // temp
| <#identifier_extend: ["\u00B7"] | <UnicodeIdentifierExtend>
//!! See the Syntax Rules.
/* Unicode char classes: <Mn>, <Mc>, <Nd>, <Pc>, <Cf> */
>

| <large_object_length_token: ( <digit> )+ <multiplier> >
Expand Down Expand Up @@ -150,7 +152,14 @@ delimiter_token:

SPECIAL_TOKEN:
{
<white_space: <newline> | [ " ", "\t" ] // temp
<white_space: (<UnicodeWhiteSpace> |
[ "\u0009" // Horizontal Tabulation
, "\n" // "\\u000A" //, Line Feed
, "\u000B" // Vertical Tabulation
, "\f" // "\\u000C" //, Form Feed
, "\r" // "\\u000D" //, Carriage Return
, "\u0085" // Next Line
])
//!! See the Syntax Rules.
>

Expand Down
2 changes: 1 addition & 1 deletion parser/grammar/prepare-javacc-grammar.sh
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Concatenate all the fragments into a .jj file.
gendir='../target/generated-sources/javacc'
mkdir -p $gendir
cat javacc-options-java.txt nonreservedwords.txt reservedwords.txt sql-spec.txt presto-extensions.txt lexical-elements.txt > $gendir/parser_tmp.jjt
cat javacc-options-java.txt nonreservedwords.txt reservedwords.txt sql-spec.txt presto-extensions.txt unicode-identifier-start.txt unicode-identifier-extend.txt ws.txt lexical-elements.txt > $gendir/parser_tmp.jjt
2 changes: 1 addition & 1 deletion parser/grammar/presto-extensions.txt
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,6 @@ void weird_identifiers():

TOKEN:
{
<identifier_starting_with_underscore: ("_")+ (<regular_identifier>)? > { setKindToIdentifier(matchedToken); }
<identifier_with_underscore: (<regular_identifier>|"_")+ > { setKindToIdentifier(matchedToken); }
| <generic_unicode: "U&'" ( ~["'"] | ("''") )* "'"> { setUnicodeLiteralType(matchedToken); }
}
3,316 changes: 3,316 additions & 0 deletions parser/grammar/unicode-identifier-extend.txt

Large diffs are not rendered by default.

22,012 changes: 22,012 additions & 0 deletions parser/grammar/unicode-identifier-start.txt

Large diffs are not rendered by default.

32 changes: 32 additions & 0 deletions parser/grammar/ws.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// Private ('#') character classes for the Unicode separator general
// categories. Private regular expressions never produce a token themselves;
// they exist only to be referenced by name from other token definitions.
TOKEN:
{
  // General category Zl (line separator).
  <#Zl: [ "\u2028" ]>                 // LINE SEPARATOR

  // General category Zp (paragraph separator).
| <#Zp: [ "\u2029" ]>                 // PARAGRAPH SEPARATOR

  // General category Zs (space separator).
| <#Zs: [
      "\u0020"                        // SPACE
    , "\u00A0"                        // NO-BREAK SPACE
    , "\u1680"                        // OGHAM SPACE MARK
    , "\u2000" - "\u200A"             // EN QUAD, EM QUAD, EN SPACE, EM SPACE,
                                      // THREE-PER-EM SPACE, FOUR-PER-EM SPACE,
                                      // SIX-PER-EM SPACE, FIGURE SPACE,
                                      // PUNCTUATION SPACE, THIN SPACE, HAIR SPACE
    , "\u202F"                        // NARROW NO-BREAK SPACE
    , "\u205F"                        // MEDIUM MATHEMATICAL SPACE
    , "\u3000"                        // IDEOGRAPHIC SPACE
  ]>

  // Union of the three separator categories above.
| <#UnicodeWhiteSpace: <Zl> | <Zp> | <Zs> >
}
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ public class TestSqlParser
"SELECT f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f(f())))))))))))))))))))))))))))));",
"SELECT abs, 2 as abs;",
"SELECT sqrt(x), power(y, 5), myFunction('a') FROM T;",
"SELECT 1 ఒకటి;",
"SELECT a_b(a,'a', 1);",
"SELECT if(regexp_like(content_fbtype,'comment'),content_id,container_post_fbid) as content_id;",
};

private AstNode parse(String sql)
Expand Down