diff --git a/CHANGELOG.md b/CHANGELOG.md index 24eb434..a1ae9db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,14 @@ The format is loosely based on [Keep a Changelog](http://keepachangelog.com/). L This project tries to adhere to [Semantic Versioning](http://semver.org/). +## Version 2.0.2 - 2017-04-09 + +- Fixed bug where id attributes were added to source tags (issue #56) +- Fixed bug where directory names could not have spaces in them (issue #42) +- Added explanation on unintuitive character offsets of docelement (issue #15) +- Changed how Tag instances are created, which used to be somewhat inconsistent. + + ## Version 2.0.1 - 2017-04-03 - Added links to Tarsqi publications to the manual. diff --git a/README.md b/README.md index 19fa84c..6e77856 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,4 @@ This is the main repository for the Tarsqi Toolkit (TTK), a set of processing components for extracting temporal information from news wire texts. TTK extracts time expressions, events, subordination links and temporal links; in addition, it can ensure consistency of temporal information. -To use the Tarsqi Toolkit first either clone this repository or download the most recent release from https://github.com/tarsqi/ttk/releases, then follow the instructions in the manual at `docs/manual/index.html`. Manuals can also be browsed on the [TimeML website](http://timeml.org/tarsqi/toolkit/manual/versions/). +To use the Tarsqi Toolkit first either clone this repository or download the most recent release from https://github.com/tarsqi/ttk/releases, then follow the instructions in the manual at `docs/manual/index.html`. Manuals can also be browsed on the [TimeML website](http://timeml.org/tarsqi/toolkit/docs/versions/). 
diff --git a/VERSION b/VERSION index 38f77a6..e9307ca 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.0.1 +2.0.2 diff --git a/components/common_modules/tree.py b/components/common_modules/tree.py index 2c10873..f688cd7 100644 --- a/components/common_modules/tree.py +++ b/components/common_modules/tree.py @@ -39,7 +39,7 @@ def create_tarsqi_tree(tarsqidoc, element, links=False): tree = TarsqiTree(tarsqidoc, element) o1 = element.begin o2 = element.end - top_tag = Tag(None, None, o1, o2, {}) + top_tag = Tag(None, o1, o2, {}) top_node = Node(top_tag, None, tree) for tag in (tarsqidoc.tags.find_tags(SENTENCE, o1, o2) + tarsqidoc.tags.find_tags(NOUNCHUNK, o1, o2) + diff --git a/components/preprocessing/wrapper.py b/components/preprocessing/wrapper.py index 994e7fc..6d54ca1 100644 --- a/components/preprocessing/wrapper.py +++ b/components/preprocessing/wrapper.py @@ -180,12 +180,13 @@ def _export(self, text): TagRepository using the preprocessing result.""" ctag = None for sentence in text: - stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR}) + sentence_attrs = { 'id': TagId.next('s'), 'origin': PREPROCESSOR } + stag = Tag('s', None, None, sentence_attrs) for token in sentence: if _is_tag(token): if not token.startswith('', ''): in_chunk = False chunk_tag = token[2:-1] - ctag = Tag(TagId.next('c'), chunk_tag, chunk_begin, chunk_end, {'origin': CHUNKER}) + ctag = Tag(chunk_tag, chunk_begin, chunk_end, + { 'id': TagId.next('c'), 'origin': CHUNKER }) self.document.tags.append(ctag) elif in_chunk: if chunk_begin is None: diff --git a/docmodel/document.py b/docmodel/document.py index 7838904..3a772cc 100644 --- a/docmodel/document.py +++ b/docmodel/document.py @@ -222,7 +222,6 @@ def __init__(self, filename=''): self.metadata = {} self.tags = TagRepository() self.offset = 0 - self.tag_number = 0 def __getitem__(self, i): return self.text[i] @@ -230,15 +229,13 @@ def __getitem__(self, i): def add_opening_tag(self, name, attrs): """Add an opening tag to 
source_tags. This is used by the StartElementHandler of the Expat parser in SourceParserXML.""" - self.tag_number += 1 - opening_tag = OpeningTag(self.tag_number, name, self.offset, attrs) + opening_tag = OpeningTag(name, self.offset, attrs) self.tags.add_tmp_tag(opening_tag) def add_closing_tag(self, name): """Add a closing tag to source_tags. This is used by the EndElementHandler of the Expat parser in SourceParserXML.""" - self.tag_number += 1 - closing_tag = ClosingTag(self.tag_number, name, self.offset) + closing_tag = ClosingTag(name, self.offset) self.tags.add_tmp_tag(closing_tag) def add_characters(self, string): @@ -288,10 +285,10 @@ def print_source(self, filename): class TagRepository: - """Class that provides access to the tags for a document. An instance of - this class is used for the DocSource instance, other instances will be used - for the elements in a TarsqiDocument. For now, the repository has the - following structure: + """Class that provides access to the tags for a document. An instance of this + class is used for the DocSource instance, other instances will be used for + the elements in a TarsqiDocument. 
For now, the repository has the following + structure: self.tmp A list of OpeningTag and ClosingTag elements, used only to build the tags @@ -344,15 +341,7 @@ def add_tmp_tag(self, tagInstance): def add_tag(self, name, begin, end, attrs): """Add a tag to the tags list and the opening_tags and closing_tags dictionaries.""" - tag = Tag(None, name, begin, end, attrs) - self.tags.append(tag) - self.opening_tags.setdefault(begin, []).append(tag) - self.closing_tags.setdefault(end, {}).setdefault(begin, {})[tag.name] = True - - def add_tag_with_id(self, name, identifier, begin, end, attrs): - """Add a tag to the tags list and the opening_tags and closing_tags - dictionaries.""" - tag = Tag(identifier, name, begin, end, attrs) + tag = Tag(name, begin, end, attrs) self.tags.append(tag) self.opening_tags.setdefault(begin, []).append(tag) self.closing_tags.setdefault(end, {}).setdefault(begin, {})[tag.name] = True @@ -376,7 +365,9 @@ def merge(self): stack.append(t) elif t.name == stack[-1].name: t1 = stack.pop() - tag = Tag(t1.id, t1.name, t1.begin, t.end, t1.attrs) + tag = Tag(t1.name, t1.begin, t.end, t1.attrs) + # We are not bothering to use add_tag since we will be building + # the index right after the merge. self.tags.append(tag) else: raise TarsqiInputError("non-matching tag %s" % t) @@ -457,7 +448,7 @@ def find_tags_at(self, begin_offset): def import_tags(self, tag_repository, tagname): """Import all tags with name=tagname from tag_repository into self. This - is moslty used when we want to take tags from the SourceDoc and add them + is mostly used when we want to take tags from the SourceDoc and add them to the tags on the TarsqiDocument.""" for tag in tag_repository.find_tags(tagname): self.add_tag(tagname, tag.begin, tag.end, tag.attrs) @@ -485,29 +476,23 @@ def pp_closing_tags(self): class Tag: - """A Tag has a name, an id, a begin offset, an end offset and a dictionary - of attributes. 
The id is handed in by the code that creates the Tag which - could be: (1) the code that parses the source document, in which case - identifiers are numbered depending on text position, (2) the preprocessor - code, which assigns identifiers for lex, ng, vg and s tags, or (3) one of - the components that creates tarsqi tags, in which case the identifier is - None because special identifiers like eid, eiid, tid and lid are used.""" - - def __init__(self, identifier, name, o1, o2, attrs): - """Initialize id, name, begin, end and attrs instance variables and make - sure that what we have can be turned into valid XML by removing duplicate + """A Tag has a name, an id, a begin offset, an end offset and a dictionary of + attributes. The id is handed in by the code that creates the Tag which could + be: (1) the code that parses the source document, which will only assign an + identifier if the source had an id attribute, (2) the preprocessor code, + which assigns identifiers for lex, ng, vg and s tags, or (3) one of the + components that creates tarsqi tags, in which case the identifier is None, + but special identifiers like eid, eiid, tid and lid are used.""" + + def __init__(self, name, o1, o2, attrs): + """Initialize name, begin, end and attrs instance variables and make sure + that what we have can be turned into valid XML by removing duplicate attribute names.""" - self.id = identifier self.name = name self.begin = o1 self.end = o2 - # Sometimes attrs is None, use an empty dictionary in that case + # Sometimes attrs is None self.attrs = attrs or {} - # If there already was an 'id' attribute, use it to set or overwrite the - # identifier that was handed in. - if 'id' in self.attrs: - self.id = self.attrs.get('id') - del(self.attrs['id']) # In case existing tags have a begin or end attribute, replace it with a # generated new attribute name (if we have 'end', then the new attribute # name will be 'end-N' where N is 1 or a higher number if needed). 
@@ -516,10 +501,9 @@ def __init__(self, identifier, name, o1, o2, attrs): self.attrs[self.new_attr(attr, self.attrs)] = self.attrs.pop(attr) def __str__(self): - id_string = "id=%s " % self.id if self.id is not None else '' attrs = ''.join([" %s='%s'" % (k, v) for k, v in self.attrs.items()]) - return "" % \ - (self.name, id_string, self.begin, self.end, attrs) + return "" % \ + (self.name, self.begin, self.end, attrs) def __cmp__(self, other): """Order two Tags based on their begin offset and end offsets. Tags with @@ -551,23 +535,16 @@ def is_closing_tag(self): def as_ttk_tag(self): """Return the tag as a tag in the Tarsqi output format.""" - # move id tag from attrs to toplevel if needed - # TODO: maybe this should happen elsewhere - if self.id is None and self.attrs.get('id'): - self.id = self.attrs.get('id') - del(self.attrs['id']) begin = " begin=\"%s\"" % self.begin if self.begin >= 0 else '' end = " end=\"%s\"" % self.end if self.end >= 0 else '' - identifier = "" if self.id is None else " id=" + quoteattr(str(self.id)) - return "<%s%s%s%s%s />" % \ - (self.name, identifier, begin, end, self.attributes_as_string()) + return "<%s%s%s%s />" % (self.name, begin, end, self.attributes_as_string()) def as_lex_xml_string(self, text): """Return an opening and closing tag wrapped around text. 
This is used only by the GUTime wrapper to create input for GUTime, and it therefore has a narrow focus and does not get all information from the tag.""" return "%s" % \ - (self.id, self.begin, self.end, str(self.attrs['pos']), escape(text)) + (None, self.begin, self.end, str(self.attrs['pos']), escape(text)) def attributes_as_string(self): """Return a string representation of the attributes dictionary.""" @@ -581,12 +558,12 @@ class OpeningTag(Tag): "Like Tag, but self.end is always None.""" - def __init__(self, id, name, offset, attrs): - Tag.__init__(self, id, name, offset, None, attrs) + def __init__(self, name, offset, attrs): + Tag.__init__(self, name, offset, None, attrs) def __str__(self): - return "" % \ - (self.id, self.name, self.begin, str(self.attrs)) + return "" % \ + (self.name, self.begin, str(self.attrs)) def is_opening_tag(self): return True @@ -596,12 +573,12 @@ class ClosingTag(Tag): "Like Tag, but self.begin and self.attrs are always None.""" - def __init__(self, id, name, offset): - Tag.__init__(self, id, name, None, offset, None) + def __init__(self, name, offset): + Tag.__init__(self, name, None, offset, None) def __str__(self): - return "" % \ - (self.id, self.name, self.end) + return "" % \ + (self.name, self.end) def is_closing_tag(self): return True diff --git a/docmodel/source_parser.py b/docmodel/source_parser.py index e29bcdc..ca87cbc 100644 --- a/docmodel/source_parser.py +++ b/docmodel/source_parser.py @@ -141,12 +141,11 @@ def _add_to_tag_repository(self, node, tag_repository): name = node.tagName o1 = node.getAttribute('begin') o2 = node.getAttribute('end') - o1 = int(o1) if o1 else -1 - o2 = int(o2) if o2 else -1 + o1 = int(o1) if o1 is not None else -1 + o2 = int(o2) if o2 is not None else -1 attrs = dict(node.attributes.items()) attrs = dict([(k, v) for (k, v) in attrs.items() if k not in ('begin', 'end')]) - # print name, o1, o2, attrs tag_repository.add_tag(name, o1, o2, attrs) @@ -234,6 +233,7 @@ def _handle_start(self, 
name, attrs): of attributes. Asks the SourceDoc instance in the sourcedoc variable to add an opening tag.""" self._debug('start', name, attrs) + #print ',,,', name, attrs self.sourcedoc.add_opening_tag(name, attrs) def _handle_end(self, name): diff --git a/docs/code/modules/docmodel.document.html b/docs/code/modules/docmodel.document.html index 293aa4a..ec07173 100644 --- a/docs/code/modules/docmodel.document.html +++ b/docs/code/modules/docmodel.document.html @@ -48,7 +48,7 @@

Public Functions

-
__init__(self, id, name, offset)
+
__init__(self, name, offset)
 
__str__(self)
@@ -67,7 +67,7 @@

Public Functions

Public Functions

-
__init__(self, id, name, offset, attrs)
+
__init__(self, name, offset, attrs)
 
__str__(self)
@@ -141,13 +141,13 @@

Public Functions

class Tag
 
-A Tag has a name, an id, a begin offset, an end offset and a dictionary
-of attributes. The id is handed in by the code that creates the Tag which
-could be: (1) the code that parses the source document, in which case
-identifiers are numbered depending on text position, (2) the preprocessor
-code, which assigns identifiers for lex, ng, vg and s tags, or (3) one of
-the components that creates tarsqi tags, in which case the identifier is
-None because special identifiers like eid, eiid, tid and lid are used.
+A Tag has a name, an id, a begin offset, an end offset and a dictionary of +attributes. The id is handed in by the code that creates the Tag which could +be: (1) the code that parses the source document, which will only assign an +identifier if the source had an id attribute, (2) the preprocessor code, +which assigns identifiers for lex, ng, vg and s tags, or (3) one of the +components that creates tarsqi tags, in which case the identifier is None, +but special identifiers like eid, eiid, tid and lid are used.

Public Functions

@@ -159,9 +159,9 @@

Public Functions

no begin (that is, it is set to -1) will be ordered at the end. The order of two tags with the same begin and end is undefined.
-
__init__(self, identifier, name, o1, o2, attrs)
-Initialize id, name, begin, end and attrs instance variables and make -sure that what we have can be turned into valid XML by removing duplicate +
__init__(self, name, o1, o2, attrs)
+Initialize name, begin, end and attrs instance variables and make sure +that what we have can be turned into valid XML by removing duplicate attribute names.
 
__str__(self)
@@ -188,10 +188,10 @@

Public Functions

class TagRepository
 
-Class that provides access to the tags for a document. An instance of
-this class is used for the DocSource instance, other instances will be used
-for the elements in a TarsqiDocument. For now, the repository has the
-following structure:
+Class that provides access to the tags for a document. An instance of this
+class is used for the DocSource instance, other instances will be used for
+the elements in a TarsqiDocument. For now, the repository has the following
+structure:
 
 self.tmp
    A list of OpeningTag and ClosingTag elements, used only to build the tags
@@ -224,10 +224,6 @@ 

Public Functions

Add a tag to the tags list and the opening_tags and closing_tags dictionaries.
-
add_tag_with_id(self, name, identifier, begin, end, attrs)
-Add a tag to the tags list and the opening_tags and closing_tags -dictionaries.
-
 
add_tmp_tag(self, tagInstance)
Add an OpeningTag or ClosingTag to a temporary list. Used by the XML handlers.
@@ -255,7 +251,7 @@

Public Functions

 
import_tags(self, tag_repository, tagname)
Import all tags with name=tagname from tag_repository into self. This -is moslty used when we want to take tags from the SourceDoc and add them +is mostly used when we want to take tags from the SourceDoc and add them to the tags on the TarsqiDocument.
 
index(self)
diff --git a/docs/manual/index.html b/docs/manual/index.html index efc0441..1139a06 100644 --- a/docs/manual/index.html +++ b/docs/manual/index.html @@ -9,7 +9,7 @@

The TARSQI Toolkit Manual

-

Release Notes and Manual for TTK Version 2.0.1. +

Release Notes and Manual for TTK Version 2.0.2.

Marc Verhagen, April 2017. @@ -405,9 +405,9 @@

7. Known Issues

5. Contributors

Many people have contributed to the Tarsqi project, they are listed here in -alphabetical order: Alex Baron, Swini Garimella, Linda van Guilder, Josh -Gieringer, Catherine Havasi, Jerry Hobbs, Seokbae Jang, Bob Knippen, Congmin -Lee, Inderjeet Mani, Emin Mimaroglu, Jessica Moszkowicz, Feng Pan, John +alphabetical order: Alex Baron, John Frank, Swini Garimella, Linda van Guilder, +Josh Gieringer, Catherine Havasi, Jerry Hobbs, Seokbae Jang, Bob Knippen, +Congmin Lee, Inderjeet Mani, Emin Mimaroglu, Jessica Moszkowicz, Feng Pan, John Phillips, Alex Plotnick, James Pustejovsky, Hongyuan Qiu, Ruth Reeves, Anna Rumshisky, Sanjib Kumar Saha, Roser Saurí, Barry Schiffman, Andrew See, Amber Stubbs, Kevin Thomas, Marc Verhagen and Ben Wellner.

diff --git a/docs/notes/creating-tags.md b/docs/notes/creating-tags.md new file mode 100644 index 0000000..1225d91 --- /dev/null +++ b/docs/notes/creating-tags.md @@ -0,0 +1,68 @@ +# Creating Tags + +This document has some notes on how Tags are created, with particular attention +to when and how identifiers are added. + +There are two kinds of Tags: + +- components.common_modules.tags.Tag, with subtypes EventTag and TimexTag +- docmodel.document.Tag, with subtypes OpeningTag and ClosingTag + +This document is about the second one. + +Tags are initialized in several spots: + +- TagRepository.merge() +- SourceParserTTK +- PreprocessorWrapper +- TagRepository.add\_tag() +- create\_tarsqi\_tree() + + +#### docmodel.document.TagRepository.merge() + +This is used when an XML document is parsed and source tags are added to the +SourceDoc. There is a method SourceDoc.finish() that will call merge() which +takes the OpeningTags and ClosingTags and merges them into Tags, taking the +identifier from the OpeningTag. + +This merging has to be done because initially the XML parser in SourceParserXML +triggers invocation of add\_opening\_tag() and add\_closing\_tag(), which add +OpeningTags and ClosingTags to a temporary list of tags as found by the parser. + +Tags added this way used to always have an `id` attribute, but this seemed +useless and `id` attributes are not added anymore. + + +#### docmodel.source_parser.SourceParserTTK + +This loads the DOM and then adds DOM Nodes as Tags to the TarsqiDocument or +SourceDoc using \_add\_to\_tag\_repository(), which calls add_tag(). + + +#### components.preprocessing.wrapper.PreprocessorWrapper + +Tags are created when the PreProcessor wrapper exports its results to the +TagRepository on the TarsqiDocument. Tag identifiers in the `id` attribute are +created in the wrapper module by the TagId class, which maintains counters for +s, ng, vg and lex tags (where ng and vg tags share a counter). 
Tags are also +added by the TokenizerWrapper (for s and lex tags) and the ChunkerWrapper (for +ng and vg tags), but not by the TaggerWrapper because part-of-speech and lemma +are added to existing Tags. + + +#### docmodel.document.TagRepository.add_tag() + +Used by processing components, the SourceParserTTK class, the conversion code in +utilities.convert and convenience methods on TarsqiDocument. If identifiers are +added they are generated by upstream code that creates the attributes dictionary +for the Tag. + + +#### components.common_modules.tree.create\_tarsqi\_tree() + +Uses a Tag with just begin and end offsets as one step in the process of +building an instance of TarsqiTree. No other Tags are created, but note that the +Node objects contain Tag instances that were created earlier. + + diff --git a/docs/notes/offset-curiosity.md b/docs/notes/offset-curiosity.md new file mode 100644 index 0000000..52fdb89 --- /dev/null +++ b/docs/notes/offset-curiosity.md @@ -0,0 +1,34 @@ +# Curious offsets + +This document explains why the docelement tag might appear to be outside the top-level xml tag of a file. + +Consider the following minimal input. + +```xml + + +``` + +And the resulting ttk file after processing. + + +```xml + + + + + + + + + + + + + + +``` + +It may look weird that the text tag has offsets 1:1 while the docelement tag has offsets 0:2. But note that the opening text tag does not start immediately after the ?xml processing instruction and that the closing text tag is followed by a newline. + +The document structure parser in docmodel.docstructure_parser.DocumentStructureParser takes the full text content of the document, which includes characters before and after the main tag. Therefore, if you have only one docelement, the main tag's offsets will be included in the docelement tag's offsets. diff --git a/utilities/convert.py b/utilities/convert.py index 27ec7b9..86fddda 100644 --- a/utilities/convert.py +++ b/utilities/convert.py @@ -2,7 +2,7 @@ 1. 
Convert LDC TimeBank into a modern TimeBank in the TTK format. - $ python convert.py --convert-timebank TIMEBANK_DIR TTK_DIR + $ python convert.py --timebank2ttk TIMEBANK_DIR TTK_DIR Converts TimeBank 1.2 as released by LDC into a version without makeinstance tags using the TTK format. This should be run on the data/extra files in the @@ -11,7 +11,7 @@ 2. Convert Thyme format into TTK format. - $ python convert.py --convert-thyme THYME_TEXT_DIR THYME_ANNO_DIR TTK_DIR + $ python convert.py --thyme2ttk THYME_TEXT_DIR THYME_ANNO_DIR TTK_DIR Note that in the Thyme corpus we have annotation directories like AnnotationData/coloncancer/Dev, whereas in the text directories we find @@ -20,8 +20,8 @@ 3. Convert the TTK format into HTML. - $ python convert.py --convert-ttk-into-html TTK_DIR HTML_DIR - $ python convert.py --convert-ttk-into-html --show-links TTK_DIR HTML_DIR + $ python convert.py --ttk2html TTK_DIR HTML_DIR + $ python convert.py --ttk2html --show-links TTK_DIR HTML_DIR Converts TTK files in TTK_DIR into HTML files in HTML_DIR, if --show-links is used links are shown in addition to the timexes and events. 
@@ -51,6 +51,8 @@ SLINK = LIBRARY.timeml.SLINK TLINK = LIBRARY.timeml.TLINK +LID = LIBRARY.timeml.LID +TID = LIBRARY.timeml.TID EID = LIBRARY.timeml.EID EIID = LIBRARY.timeml.EIID EVENTID = LIBRARY.timeml.EVENTID @@ -124,6 +126,7 @@ def convert_thyme(thyme_text_dir, thyme_anno_dir, out_dir, limit=sys.maxint): def _convert_thyme_file(thyme_text_file, thyme_anno_file, out_file): + LinkID.reset() tarsqidoc = _get_tarsqidoc(thyme_text_file, "text") dom = minidom.parse(thyme_anno_file) entities = [Entity(e) for e in dom.getElementsByTagName('entity')] @@ -151,7 +154,7 @@ def _add_timexes_to_tarsqidoc(timexes, timex_idx, metadata, tarsqidoc): if timex_idx.has_key(timex.id): print "WARNING: timex %s already exists" % timex.id timex_idx[timex.id] = begin - attrs = {'tid': timex.id} + attrs = { TID: timex.id } if timex.type == 'DOCTIME': metadata['dct'] = timex attrs['functionInDocument'] = 'DOCTIME' @@ -161,13 +164,14 @@ def _add_timexes_to_tarsqidoc(timexes, timex_idx, metadata, tarsqidoc): tarsqidoc.metadata['dct'] = dct_value elif timex.type == 'SECTIONTIME': attrs['functionInDocument'] = 'SECTIONTIME' - # TODO: see comment above - tarsqidoc.tags.add_tag_with_id('TIMEX3', timex.id, begin, end, attrs) + tarsqidoc.tags.add_tag('TIMEX3', begin, end, attrs) except ValueError: print "Skipping discontinuous timex" def _add_events_to_tarsqidoc(events, event_idx, dct, tarsqidoc): + """Add an event from the Thyme file. Also includes adding a TLINK to the DCT, + for this link we generate a new link identifier.""" dct_rel_id = 0 for event in events: try: @@ -176,28 +180,31 @@ def _add_events_to_tarsqidoc(events, event_idx, dct, tarsqidoc): print "WARNING: event %s already exists" % event.id event_idx[event.id] = begin # TODO: is it okay for these to be the same? - attrs = {'eid': event.id, 'eiid': event.id} - # TODO: could I just use add_tag()? 
- tarsqidoc.tags.add_tag_with_id('EVENT', event.id, begin, end, attrs) + attrs = { EID: event.id, EIID: event.id} + tarsqidoc.tags.add_tag('EVENT', begin, end, attrs) dct_rel_id += 1 - attrs = {RELTYPE: event.DocTimeRel, - EVENT_INSTANCE_ID: event.id, - RELATED_TO_TIME: dct.id} - tarsqidoc.tags.add_tag_with_id('TLINK', "d%s" % dct_rel_id, None, None, attrs) + attrs = { LID: LinkID.next(), + RELTYPE: event.DocTimeRel, + EVENT_INSTANCE_ID: event.id, + RELATED_TO_TIME: dct.id } + tarsqidoc.tags.add_tag('TLINK', None, None, attrs) except ValueError: print "Skipping discontinuous event" def _add_links_to_tarsqidoc(links, timex_idx, event_idx, tarsqidoc): + """Add a link from the Thyme file. Inherit the identifier on the Thyme + relation, even though it does not adhere to TimeML id formatting.""" for rel in links: linkid = "r%s" % rel.id.split('@')[0] sourceid = "%s%s" % (rel.Source.split('@')[1], rel.Source.split('@')[0]) targetid = "%s%s" % (rel.Target.split('@')[1], rel.Target.split('@')[0]) attrs = { + LID: linkid, _source_attr_name(rel.type, sourceid, timex_idx, event_idx): sourceid, _target_attr_name(rel.type, targetid, timex_idx, event_idx): targetid, RELTYPE: rel.RelType} - tarsqidoc.tags.add_tag_with_id(rel.type, linkid, None, None, attrs) + tarsqidoc.tags.add_tag(rel.type, None, None, attrs) def _source_attr_name(link_type, source_id, timex_idx, event_idx): @@ -224,7 +231,8 @@ def _target_attr_name(link_type, target_id, timex_idx, event_idx): class Entity(object): - """An entity from a Thyme annotation, either an event or a timex.""" + """An entity from a Thyme annotation, either an event or a timex (note that + a timex can be a DOCTIME or SECTIONTIME type).""" def __init__(self, dom_element): self.id = get_simple_value(dom_element, 'id') @@ -265,6 +273,24 @@ def __str__(self): (self.type, self.id, self.RelType, self.Source, self.Target) +class LinkID(object): + + """Class to provide fresh identifiers for TLINK tags.""" + + # TODO: should probably combine this 
with TagID in the preprocessor wrapper + + IDENTIFIER = 0 + + @classmethod + def next(cls): + cls.IDENTIFIER += 1 + return "l%d" % cls.IDENTIFIER + + @classmethod + def reset(cls): + cls.IDENTIFIER = 0 + + def get_value(entity, attr): return entity.getElementsByTagName(attr)[0] @@ -450,16 +476,15 @@ def _get_tarsqidoc(infile, source, metadata=True): if __name__ == '__main__': - long_options = ['convert-timebank', 'convert-thyme', - 'convert-ttk-into-html', 'show-links'] + long_options = ['timebank2ttk', 'thyme2ttk', 'ttk2html', 'show-links'] (opts, args) = getopt.getopt(sys.argv[1:], 'i:o:', long_options) opts = { k: v for k, v in opts } - if '--convert-timebank' in opts: + if '--timebank2ttk' in opts: convert_timebank(args[0], args[1]) - elif '--convert-thyme' in opts: + elif '--thyme2ttk' in opts: limit = 10 if DEBUG else sys.maxint convert_thyme(args[0], args[1], args[2], limit) - elif '--convert-ttk-into-html' in opts: + elif '--ttk2html' in opts: limit = 10 if DEBUG else sys.maxint showlinks = True if '--show-links' in opts else False convert_ttk_into_html(args[0], args[1], showlinks, limit) diff --git a/utilities/mallet.py b/utilities/mallet.py index 31d4851..69b57fc 100644 --- a/utilities/mallet.py +++ b/utilities/mallet.py @@ -80,11 +80,11 @@ def classify_command(mallet, vectors, model): regexp = "--line-regex \"^(\S*)[\s,]*(\S*)[\s]*(.*)$\"" regexp = "--line-regex \"^(\S*)[\s]*(\S*)[\s]*(.*)$\"" name_and_data = "--name 1 --data 3" - vectors_in = "--input %s" % vectors - classifier = "--classifier %s" % model + vectors_in = "--input '%s'" % vectors + classifier = "--classifier '%s'" % model output = '--output -' - stdout = "%s.out" % vectors - stderr = "%s.err" % vectors + stdout = "'%s.out'" % vectors + stderr = "'%s.err'" % vectors scriptname = os.path.join(mallet, 'bin', mallet_script) if not os.path.isfile(scriptname): logger.error("Cannot find %s" % scriptname) @@ -183,6 +183,7 @@ def _make_pipe(self, classifier): def _pipe_command(self, classifier): 
"""Assemble the classifier command for use in a pipe.""" + # TODO: when testing this make sure you allow spaces in the classifier path return "sh %s classify-file --input - --output - --classifier %s" \ % (self.mallet, classifier)