diff --git a/Schema/ParlaMint-TEI.ana.rnc b/Schema/ParlaMint-TEI.ana.rnc index cbb0a69e6..2e9e5874f 100644 --- a/Schema/ParlaMint-TEI.ana.rnc +++ b/Schema/ParlaMint-TEI.ana.rnc @@ -26,23 +26,26 @@ include "ParlaMint-TEI.rnc" { ## The only element that contains analysed text is the segment. It can, ## however, contain transcription commentary. - seg = - element seg { - global.atts - >> a:documentation [ "Optional @corresp for MTed corpora." ], - corresp.att, - (comment | pb | sentence)+ - } + seg = element seg { global.atts, (comment | pb | sentence)+ } } ## A sentence. sentence = element s { - global.atts - >> a:documentation [ "Optional @corresp for MTed corpora." ], - corresp.att, - (comment | pb | ner | word | punct)+, - syntax? + global.atts, (comment | pb | ner | phr | word | punct)+, syntax? + } + +## Multi-Word Unit +phr = + element phr { + global.atts, + attribute function { xsd:token }, + attribute ana { xsd:anyURI } + >> a:documentation [ + "Currently we only support phrases with semantic markers" + ], + attribute type { "sem" }, + mwe_tokens } ## Named entity element @@ -70,6 +73,9 @@ ner.atts = global.atts, attribute ana { xsd:anyURI } +## Annotations that can appear in a MWE. +mwe_tokens = (word | punct)+ + ## Annotations that can appear in a NE. ner_tokens = (word | punct | comment | pb | ner)+ @@ -123,7 +129,12 @@ token.atts = attribute msd { non-empty.string }, attribute ana { anyURIs }?, attribute pos { non-empty.string }?, - attribute function { normalized-space.string }? + attribute function { + # This is different from TEI, where it is a name token (which can't contain comma!?) + + # ref name="normalized-space.string"/ + xsd:token + }? ## Obligatory attributes of word. word.atts = diff --git a/Schema/ParlaMint-TEI.ana.rng b/Schema/ParlaMint-TEI.ana.rng index c2992c7db..5d72ee677 100644 --- a/Schema/ParlaMint-TEI.ana.rng +++ b/Schema/ParlaMint-TEI.ana.rng @@ -36,8 +36,6 @@ however, contain transcription commentary. - Optional @corresp for MTed corpora. - @@ -54,13 +52,12 @@ A sentence. - Optional @corresp for MTed corpora. - + @@ -71,6 +68,24 @@ + + Multi-Word Unit + + + + + + + + + Currently we only support phrases with semantic markers + + sem + + + + + Named entity element @@ -139,6 +154,16 @@ + + Annotations that can appear in a MWE. + + + + + + + + Annotations that can appear in a NE. @@ -261,7 +286,9 @@ - + + + diff --git a/Schema/ParlaMint-TEI.rnc b/Schema/ParlaMint-TEI.rnc index 57f21cb33..7821a61a8 100644 --- a/Schema/ParlaMint-TEI.rnc +++ b/Schema/ParlaMint-TEI.rnc @@ -174,7 +174,7 @@ title = } ## A "meeting" (e.g. session) of the parliament. -meeting = element meeting { global.atts, ana.att, corresp.att, text } +meeting = element meeting { global.atts, ana.att, text } ## Publisher of the corpus. publisher = @@ -218,9 +218,7 @@ text-body = ## The definition of the div element. \div = element div { - global.atts - >> a:documentation [ "Optional @corresp for MTed corpora." ], - corresp.att, + global.atts, ## A standard div with utterances. ((attribute type { "debateSection" }, @@ -249,18 +247,14 @@ u = "Utterance has obligatory @ana, for the role of the speaker." ], ana.att, - (attribute source { anyURIs }?) - >> a:documentation [ "Optional @corresp for MTed corpora." ], - corresp.att, + attribute source { anyURIs }?, (seg | comment | pb)+ } ## A segment (i.e paragraph) inside a speech. seg = element seg { - global.atts - >> a:documentation [ "Optional @corresp for MTed corpora." ], - corresp.att, + global.atts, (comment | pb | text diff --git a/Schema/ParlaMint-TEI.rng b/Schema/ParlaMint-TEI.rng index acba7c4e3..03f342af7 100644 --- a/Schema/ParlaMint-TEI.rng +++ b/Schema/ParlaMint-TEI.rng @@ -318,7 +318,6 @@ - @@ -408,8 +407,6 @@ The definition of the div element. - Optional @corresp for MTed corpora. - A standard div with utterances. @@ -479,8 +476,6 @@ - Optional @corresp for MTed corpora. - @@ -495,8 +490,6 @@ A segment (i.e paragraph) inside a speech. - Optional @corresp for MTed corpora. - diff --git a/Schema/ParlaMint.rnc b/Schema/ParlaMint.rnc index d2e878916..ef437aa42 100644 --- a/Schema/ParlaMint.rnc +++ b/Schema/ParlaMint.rnc @@ -267,7 +267,7 @@ head = } ## A page break, possibly with its source URI. -pb = element pb { global.atts, corresp.att, source.att, empty } +pb = element pb { global.atts, source.att, empty } ## A series of paragraphs. Paragraphs are only used in the teiHeader. paras = element p { global.atts, annotated.text }+ @@ -339,6 +339,7 @@ corresp.att = attribute corresp { anyURIs }? global.atts = id.att?, lang.att, + corresp.att, attribute n { xsd:string }? ## The identifier attribute, giving the unique ID of the diff --git a/Schema/ParlaMint.rng b/Schema/ParlaMint.rng index d40037581..d10327576 100644 --- a/Schema/ParlaMint.rng +++ b/Schema/ParlaMint.rng @@ -549,7 +549,6 @@ A page break, possibly with its source URI. - @@ -733,6 +732,7 @@ + diff --git a/Schema/compile.log b/Schema/compile.log index a5ba19f27..516027f0a 100644 --- a/Schema/compile.log +++ b/Schema/compile.log @@ -7,5 +7,5 @@ java -jar /usr/share/java/trang.jar ParlaMint-listOrg.rng ParlaMint-listOrg.rnc java -jar /usr/share/java/trang.jar ParlaMint-listPerson.rng ParlaMint-listPerson.rnc java -jar /usr/share/java/trang.jar ParlaMint-taxonomy.rng ParlaMint-taxonomy.rnc make[1]: Leaving directory '/home/project/corpora/Parla/ParlaMint/ParlaMint/Schema' -4.92user 0.61system 0:02.46elapsed 224%CPU (0avgtext+0avgdata 67164maxresident)k -1872inputs+824outputs (0major+71458minor)pagefaults 0swaps +5.33user 0.56system 0:02.59elapsed 227%CPU (0avgtext+0avgdata 67200maxresident)k +1600inputs+872outputs (0major+71468minor)pagefaults 0swaps