user_syntax.text

		A User-Friendly Syntax for Core SHACL
	
	
This is a user-friendly syntax for the core of SHACL that tries to align
directly with my SHACL RDF syntax.   The grammar is compatible with ANTLR.
	
It would be possible to modify the syntax to come up with a more standard
treatment of conjunction, disjunction, and negation but that syntax would
not align as closely with the RDF syntax.
	
	
	Example    				Description
	(the terminating '.' that the grammar requires after each statement is omitted)
	
∈ ex:Person ⊩ |≤27|			// there are at most 27 people
¹ex:child ⊩ ∋ ex:john			// child subjects include John
¹ex:child ⊩ ∈ ex:Person ∧ IRI		// ... are people and not blank
ex:child² ⊩ ∈ ex:Person 		// child objects are people
∈ ex:Patriot ⊩ ⋹ ex:Citizen		// patriots are directly citizens
ex:password² ⊩ ^^xs:string ∧ ℓ≤24 ∧ ℓ≥8	// passwords are between 8 and 24 long
ex:age² ⊩ ^^xs:integer ∧ ≥0		// ages are non-negative integers
ex:john ⊩ ex:name ∝ "^John.*" ★ 	// John's name starts with John
ex:mstatus² ⊩ ∈ { ex:single ex:married ex:divorced }
					// three marital statuses only
∈ ex:Person ⊩ ex:mstatus ∝ |=1|		// people have one marital status
∈ ex:Person ⊩ ex:mstatus ∝ ∋ ex:married → ex:spouse ∝ |≥1|
					// married people have a spouse
∈ ex:Person ⊩ ex:mstatus ∝ ∋ ex:single → ex:spouse ∝ |≤0|
					// single people don't have a spouse
∈ ex:Person ⊩ ex:spouse ∅ ex:child ∧	// people can't marry their children
	ex:child ex:age ≤ ex:age ∧	// ... are older than their children
	ex:age ≤ ex:child⁻¹ ex:age ∧	// ... are younger than their parents
	ex:child⁻¹ ex:child ∝ |≤9| ∧	// ... have at most 8 siblings (the count of 9 includes the person)
	ex:name ∝ ^^rdf:langString ∧	// ... names are lang-tagged strings
	ex:name ∝ ➀			// ... have only one name per language
∈ ex:Person ⊩
  ( ( ex:spouse ∝ |≤0| → ex:mstatus ∝ ( |≥1| ∧ ∈{ex:single ex:divorced} ) ) ∖
    ( ex:spouse ∝ |≤1| → ex:mstatus ∝ ( |≥1| ∧ ∈{ex:married} ) ) ∖
    |≤3| ∖ )      		// people with no spouse are single or divorced
				// people with one spouse are married
				// and there are at most three people left over
∈ ex:Isolated ⊩ ⟦ rdf:type ⟧		// isolated nodes have only types
∈ ex:nonIsolated ⊩ ¬ ⟦ rdf:type ⟧	// non-isolated nodes have other values
sh:partShape ≡ (IRI ∨ sh:inverse ∝ IRI)	// parts are properties or inverses
sh:pathShape ≡ ∈ sh:path ⊩ ( sh:partShape ∨ ⦇ sh:partShape ⦈ )
					// paths are parts or lists of parts
	
	
	Grammar					Meaning

// Entry rule: zero or more statements followed by end of input.
shaclDoc  : statement* EOF ;

// A statement is a prefix declaration, a named-shape definition, or a scoping.
statement : prefix | define | scoping ;
prefix    : '@prefix' PNAME_NS IRIREF '.' ;	// prefix declaration: namespace prefix, then an IRI
define    : iri '≡' ( scope ( '∪' scope )* '⊩' )? shape '.' ; // name a shape; the scope list before '⊩' is optional
scoping	  : scope ( '∪' scope )* '⊩' shape '.' ; // nodes in the listed scopes validate against the shape

// A scope selects the nodes that are validated against a shape.
// Each alternative carries a label (#...) naming it in the generated parser.
scope	  : value		    #svalue	// the given value (IRI or literal) itself
	  | '∈' clss		    #sclass	// SHACL instances of class
	  | '¹' proprty		    #spsubject	// subjects of property
	  | proprty '²'		    #spobject	// objects of property
	  | '¹' '?'		    #sasubject	// all subjects
	  | '?' '²'		    #saobject 	// all objects
	  ;

// A shape is an optional list of filters ending in '→', followed by one or
// more components joined by '∧'.
shape	  : ( filtr ( '∧' filtr )* '→' )? component ( '∧' component )* ;
			// the set of nodes that validate against all filtrs
			// must validate against each component

// A filter has the same syntax as a component; it appears only before '→'.
filtr	  : component ;				// nodes that validate vs shape

// A component is a single constraint.  Every alternative is labelled (#...).
// Several alternatives begin with the same token ('∈', '(', '|', or a path),
// so the order of alternatives should be preserved.
component : iri			    #cname	// validate against named shape
 | '∈' clss ( '∪' clss )*	    #cclass	// SHACL instance of some class
 | '^^' datatype ( '∪' datatype )*  #cdatatype	// has one of datatypes
 | '∈' '{' value* '}'		    #cin	// is one of values
 | '⋹' clss  			    #cdirect	// has rdf:type of class
 | 'ℓ' '≤' UNSIGNED_INTEGER	    #cmaxLength	// maximum string length
 | 'ℓ' '≥' UNSIGNED_INTEGER	    #cminLength	// minimum string length
 | '>' literal			    #cminExcl	// exclusive minimum
 | '≥' literal			    #cminIncl	// inclusive minimum
 | '<' literal			    #cmaxExcl	// exclusive maximum
 | '≤' literal			    #cmaxIncl	// inclusive maximum
 | 'IRI'			    #cnkIRI	// kind of node: IRI
 | 'Literal'			    #cnkLit	// kind of node: literal
 | 'BlankNode'			    #cnkBlank	// kind of node: blank node
 | regex '★' ( string )?	    #cpattern	// matches pattern (optional string gives flags)
 | path '=' path		    #cequals	// path values the same
 | path '∅' path		    #cdisjoint	// path values disjoint
 | path '<' path		    #clt	// path1 values < path2 values
 | path '≤' path		    #clte	// path1 values ≤ path2 values
 | '⟦' pathpart * '⟧'		    #cclosed	// no other property has values
 | '∋' value			    #chasValue	// set contains value
 | '|' '≥' UNSIGNED_INTEGER '|'	    #cminCount	// minimum size of set
 | '|' '≤' UNSIGNED_INTEGER '|'	    #cmaxCount	// maximum size of set
 | '|' '=' UNSIGNED_INTEGER '|'	    #cexactCount// exact size of set
 | '➀'			    	    #cuniqueLang// only one value per language
 | '¬' component		    #cnot	// doesn't validate against the component
 | path '∝' component		    #cpathValues// path values in shape
 | '⦇' shape '⦈'		    #clist	// list members in shape
 | '(' shape ')'		    #cshape	// validate against shape
 | '(' component ('∨'component)+ ')'#cor	// validate against one or more
 | '(' ( component '∖' ) + ')' 	    #cpartition // partition - see below
 ;		// Partition semantics:
		// The initial remnant is the entire set being validated.
		// The next remnant is the subset of the current one that fails
		// to validate against the filter of the respective component.
		// The final remnant is empty.
		// Each remnant validates against the respective component.

// A path is one or more path parts written one after another.
path	  : pathpart + ;			// composition
// A path part is a property, optionally marked as inverse with '⁻¹'.
pathpart  : proprty		    #pprop	// the property itself
	  | proprty '⁻¹' 	    #pinv 	// inverse of property
	  ;

// Role-naming wrappers: each is syntactically just an iri (or string); the
// separate rule names record how the term is used.
clss	  : iri ;				// a class
datatype  : iri ;				// a datatype
proprty   : iri ;				// a property
value	  : iri | literal ;			// object: an IRI or a literal
regex	  : string ;				// regular expression, written as a string

// Productions from Turtle grammar

// Literals: quoted strings (optionally tagged or typed), numbers, booleans.
literal        : rdfLiteral | numericLiteral | booleanLiteral ;
// A string, optionally followed by a language tag or by '^^' and a datatype IRI.
rdfLiteral     : string ( LANGTAG |  '^^' iri )? ;
numericLiteral : integer | DECIMAL | DOUBLE ;
booleanLiteral : 'true' | 'false' ;
// Any of the four quoting styles.
string         : STRING_LITERAL_QUOTE | STRING_LITERAL_SINGLE_QUOTE
	       | STRING_LITERAL_LONG_SINGLE_QUOTE | STRING_LITERAL_LONG_QUOTE ;
// An IRI is either a full reference in angle brackets or a prefixed name.
iri            : IRIREF | prefixedName ;
prefixedName   : PNAME_LN | PNAME_NS ;
// The integer token is split so that other rules can require UNSIGNED_INTEGER alone.
integer	       : UNSIGNED_INTEGER | SIGNED_INTEGER ;

// IRI reference in angle brackets; control characters, space and the listed
// punctuation are excluded, and \u / \U escapes (UCHAR) are allowed.
IRIREF            : '<' (~[\u0000-\u0020<>"{}|^`\\] | UCHAR)* '>' ;
// Namespace prefix: optional prefix name followed by ':' (a bare ':' also matches).
PNAME_NS          : PN_PREFIX? ':' ;
// Prefixed name with a local part.
PNAME_LN          : PNAME_NS PN_LOCAL ;
// Language tag: '@', letters, then optional '-'-separated alphanumeric subtags.
LANGTAG           : '@' [a-zA-Z]+ ('-' [a-zA-Z0-9]+)* ;
// Numeric tokens.
// The single INTEGER token ( ('+'|'-')? [0-9]+ ) is split in two:
// SIGNED_INTEGER carries a mandatory sign, UNSIGNED_INTEGER has none.
SIGNED_INTEGER    : [+\-] [0-9]+ ;
UNSIGNED_INTEGER  : [0-9]+ ;
// Optional sign, optional integer part, mandatory '.' and fraction digits.
DECIMAL           : [+\-]? [0-9]* '.' [0-9]+ ;
// Optional sign, then one of three mantissa forms, each with an exponent.
DOUBLE            : [+\-]?
                    ( [0-9]+ '.' [0-9]* EXPONENT
                    | '.' [0-9]+ EXPONENT
                    | [0-9]+ EXPONENT
                    ) ;
// 'e' or 'E', optional sign, one or more digits.
fragment EXPONENT : [eE] [+\-]? [0-9]+ ;

// Single-line strings: the body excludes the delimiter, backslash, LF and CR,
// but may contain character escapes (ECHAR) and unicode escapes (UCHAR).
STRING_LITERAL_QUOTE        : '"'  (~[\u0022\u005C\u000A\u000D] | ECHAR | UCHAR)* '"'  ;
STRING_LITERAL_SINGLE_QUOTE : '\'' (~[\u0027\u005C\u000A\u000D] | ECHAR | UCHAR)* '\'' ;
// Triple-quoted strings: up to two consecutive quote characters may appear
// inside the body as long as they are followed by a non-quote item.
STRING_LITERAL_LONG_SINGLE_QUOTE:'\'\'\'' (('\''|'\'\'')?(~['\\]|ECHAR|UCHAR))* '\'\'\'' ;
STRING_LITERAL_LONG_QUOTE 	: '"""' (('"' | '""')? (~["\\] | ECHAR | UCHAR))* '"""' ;
// Unicode escape: \u with 4 hex digits or \U with 8 hex digits.
fragment UCHAR         : '\\u' HEX HEX HEX HEX | '\\U' HEX HEX HEX HEX HEX HEX HEX HEX ;
// Character escape: backslash followed by one of t b n r f " ' \
fragment ECHAR         : '\\' [tbnrf"'\\] ;
// Base characters allowed in prefixed names, following Turtle's PN_CHARS_BASE.
// The final range covers the supplementary planes U+10000..U+EFFFF; it uses
// the \u{...} escape form, which requires ANTLR 4.7 or later.
fragment PN_CHARS_BASE : [A-Za-z\u00C0-\u00D6\u00D8-\u00F6] |
                         [\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF] |
			 [\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF] |
			 [\uF900-\uFDCF\uFDF0-\uFFFD\u{10000}-\u{EFFFF}] ;
// Base characters plus underscore.
fragment PN_CHARS_U    : PN_CHARS_BASE | '_' ;
// Name characters: PN_CHARS_U plus '-', digits, and the listed extra ranges.
fragment PN_CHARS      : PN_CHARS_U | '-' | [0-9\u00B7\u0300-\u036F\u203F-\u2040] ;
// Prefix name: starts with a base character; may contain '.' but not end with it.
fragment PN_PREFIX     : PN_CHARS_BASE ((PN_CHARS | '.')* PN_CHARS)? ;
// Local name: may start with a digit or ':' and contain escapes (PLX);
// may contain '.' but not end with it.
fragment PN_LOCAL      : (PN_CHARS_U | [:0-9] | PLX) ((PN_CHARS | '.' | ':' | PLX)* (PN_CHARS | ':' | PLX))? ;
// Escaped character in a local name: percent-encoded or backslash-escaped.
fragment PLX	       : PERCENT | PN_LOCAL_ESC ;
// '%' followed by two hex digits.
fragment PERCENT       : '%' HEX HEX ;
// One hexadecimal digit, either case.
fragment HEX           : [0-9A-Fa-f] ;
// Backslash escape of a punctuation character inside a local name.
// The hyphen is written as \- so that it is a literal member of the set;
// unescaped, ".-!" would be read as a (reversed) character range.
fragment PN_LOCAL_ESC  : '\\' [_~.\-!$&'()*+,;=/?#@%] ;

// Runs of space, tab, CR and LF are matched and discarded (never sent to the parser).
WS 	 	       : [ \t\r\n]+ -> skip ;