Skip to content

Commit

Permalink
Fix #143 Disallow quotes in roles and symbols
Browse files Browse the repository at this point in the history
Quotes are now only allowed to delimit strings or as escaped
characters within strings.

This commit also changes the PEG grammar to disallow line-breaking
spaces within strings so it more accurately represents the line-based
parsing behavior.
  • Loading branch information
goodmami committed Aug 7, 2024
1 parent 479266f commit 0dfbcf3
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 4 deletions.
6 changes: 4 additions & 2 deletions docs/notation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,10 @@ grammar to allow for surface alignments.
Symbol <- NameChar+
Role <- ':' NameChar*
Alignment <- '~' ([a-zA-Z] '.'?)? Digit+ (',' Digit+)*
String <- '"' (!'"' ('\\' . / .))* '"'
NameChar <- ![ \n\t\r\f\v()/:~] .
String <- '"' (!'"' (StrEscape / StrChar))* '"'
StrEscape <- '\\' StrChar
StrChar <- ![\n\r\f\v] .
NameChar <- ![ \n\t\r\f\v"()/:~] .
Digit <- [0-9]
This grammar has some seemingly unnecessary ambiguity in that both the
Expand Down
4 changes: 2 additions & 2 deletions penman/_lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
'ALIGNMENT': r'~(?:[a-z]\.?)?[0-9]+(?:,[0-9]+)*',
# ROLE cannot be made up of COLON + SYMBOL because it then becomes
# difficult to detect anonymous roles: (a : b) vs (a :b c)
'ROLE': r':[^ \t\r\n\v\f()\/:~]*',
'SYMBOL': r'[^ \t\r\n\v\f()\/:~]+',
'ROLE': r':[^ \t\r\n\v\f"()\/:~]*',
'SYMBOL': r'[^ \t\r\n\v\f"()\/:~]+',
'LPAREN': r'\(',
'RPAREN': r'\)',
'SLASH': r'\/', # concept (node label) role
Expand Down
10 changes: 10 additions & 0 deletions tests/test_codec.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,16 @@ def test_decode_recursion_limit(self):
assert len(g.triples) == (n # n :instance triples
+ n - 1) # n - 1 :ARG0 triples

def test_decode_issue_143(self):
    """Decoding must reject quotes that do not delimit a string."""
    # https://github.com/goodmami/penman/issues/143
    malformed_inputs = (
        '(a :op ")',              # lone quote in target position
        '(a :op1 " :op2 "foo")',  # odd number of quotes
        '(a :" foo)',             # quote inside a role name
    )
    for text in malformed_inputs:
        with pytest.raises(penman.DecodeError):
            decode(text)


def test_encode(self, x1):
# empty graph
g = penman.Graph([])
Expand Down
15 changes: 15 additions & 0 deletions tests/test_lexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,3 +74,18 @@ def test_nonbreaking_space_issue_99():
assert [tok.type for tok in lexer.lex('1\r2')] == ['SYMBOL', 'SYMBOL']
assert [tok.type for tok in lexer.lex('1\u00a02')] == ['SYMBOL']
assert [tok.type for tok in lexer.lex('あ い')] == ['SYMBOL']


def test_unterminated_string_issue_143():
    """Quotes outside of well-formed strings lex as UNEXPECTED tokens."""
    # https://github.com/goodmami/penman/issues/143

    def token_types(text):
        # Collect only the token type names produced for *text*.
        return [token.type for token in lexer.lex(text)]

    # unmatched quotes result in unexpected tokens
    lone_quote = token_types('(a :op ")')
    assert lone_quote == [
        'LPAREN', 'SYMBOL', 'ROLE', 'UNEXPECTED', 'RPAREN'
    ]

    odd_quotes = token_types('(a :op1 " :op2 "foo")')
    assert odd_quotes == [
        'LPAREN', 'SYMBOL', 'ROLE', 'STRING', 'SYMBOL', 'UNEXPECTED', 'RPAREN'
    ]

    # also disallow quotes in role names
    quoted_role = token_types('(a :" b)')
    assert quoted_role == [
        'LPAREN', 'SYMBOL', 'ROLE', 'UNEXPECTED', 'SYMBOL', 'RPAREN'
    ]

0 comments on commit 0dfbcf3

Please sign in to comment.