Merge pull request #64 from tarsqi/hotfixes

Updating to version 2.0.2
tarsqi · Apr 9, 2017 · a7fed3d · a7fed3d
2 parents 6facdb8 + 357d5a6
commit a7fed3d
Show file tree

Hide file tree

Showing 13 changed files with 237 additions and 124 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,14 @@ The format is loosely based on [Keep a Changelog](http://keepachangelog.com/). L
 This project tries to adhere to [Semantic Versioning](http://semver.org/).
 
 
+## Version 2.0.2 - 2017-04-09
+
+- Fixed bug where id attributes were added to source tags (issue #56)
+- Fixed bug where directory names could not have spaces in them (issue #42)
+- Added explanation on unintuitive character offsets of docelement (issue #15)
+- Changed how Tag instances are created, which used to be somewhat inconsistent.
+
+
 ## Version 2.0.1 - 2017-04-03
 
 - Added links to Tarsqi publications to the manual.

diff --git a/README.md b/README.md
@@ -2,4 +2,4 @@
 
 This is the main repository for the Tarsqi Toolkit (TTK), a set of processing components for extracting temporal information from news wire texts. TTK extracts time expressions, events, subordination links and temporal links; in addition, it can ensure consistency of temporal information.
 
-To use the Tarsqi Toolkit first either clone this repository or download the most recent release from https://github.com/tarsqi/ttk/releases, then follow the instructions in the manual at `docs/manual/index.html`. Manuals can also be browsed on the [TimeML website](http://timeml.org/tarsqi/toolkit/manual/versions/).
+To use the Tarsqi Toolkit, first either clone this repository or download the most recent release from https://github.com/tarsqi/ttk/releases, then follow the instructions in the manual at `docs/manual/index.html`. Manuals can also be browsed on the [TimeML website](http://timeml.org/tarsqi/toolkit/docs/versions/).
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.0.1
+2.0.2
diff --git a/components/common_modules/tree.py b/components/common_modules/tree.py
@@ -39,7 +39,7 @@ def create_tarsqi_tree(tarsqidoc, element, links=False):
     tree = TarsqiTree(tarsqidoc, element)
     o1 = element.begin
     o2 = element.end
-    top_tag = Tag(None, None, o1, o2, {})
+    top_tag = Tag(None, o1, o2, {})
     top_node = Node(top_tag, None, tree)
     for tag in (tarsqidoc.tags.find_tags(SENTENCE, o1, o2) +
                 tarsqidoc.tags.find_tags(NOUNCHUNK, o1, o2) +

diff --git a/components/preprocessing/wrapper.py b/components/preprocessing/wrapper.py
@@ -180,12 +180,13 @@ def _export(self, text):
         TagRepository using the preprocessing result."""
         ctag = None
         for sentence in text:
-            stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})
+            sentence_attrs = { 'id': TagId.next('s'), 'origin': PREPROCESSOR }
+            stag = Tag('s', None, None, sentence_attrs)
             for token in sentence:
                 if _is_tag(token):
                     if not token.startswith('</'):
-                        ctag = Tag(TagId.next('c'), token[1:-1], None, None,
-                                   {'origin': PREPROCESSOR})
+                        ctag = Tag(token[1:-1], None, None,
+                                   { 'id': TagId.next('c'), 'origin': PREPROCESSOR })
                     else:
                         ctag.end = last_ltag.end
                         self.document.tags.append(ctag)
@@ -216,9 +217,9 @@ def _is_tag(token):
 
 def _make_ltag(token):
     """Return an instance of Tag for the token."""
-    return Tag(TagId.next('l'), 'lex', token[3], token[4],
-               { 'lemma': token[2], 'pos': token[1], 'text': token[0],
-                 'origin': PREPROCESSOR })
+    return Tag('lex', token[3], token[4],
+               { 'id': TagId.next('l'), 'lemma': token[2], 'pos': token[1],
+                  'text': token[0], 'origin': PREPROCESSOR })
 
 
 class TokenizerWrapper:
@@ -266,8 +267,9 @@ def _export_tokens(self, tokens):
                 s_begin, s_end = None, None
             else:
                 begin, end = t.begin, t.end
-                attrs = { 'text': t.text, 'origin': TOKENIZER }
-                ltag = Tag(TagId.next('l'), 'lex', begin, end, attrs)
+                lid = TagId.next('l')
+                ltag = Tag('lex', begin, end,
+                           { 'id': lid, 'text': t.text, 'origin': TOKENIZER })
                 self.document.tags.append(ltag)
                 if s_begin is None:
                     s_begin = begin
@@ -278,7 +280,8 @@ def _export_tokens(self, tokens):
     def _export_sentence(self, s_begin, s_end):
         """Add an s tag to the TagRepository of the TarsqiDocument."""
         if s_begin is not None:
-            stag = Tag(TagId.next('s'), 's', s_begin, s_end, {'origin': TOKENIZER})
+            stag = Tag('s', s_begin, s_end,
+                       { 'id': TagId.next('s'), 'origin': TOKENIZER })
             self.document.tags.append(stag)
 
 
@@ -429,7 +432,8 @@ def _export_chunks(self, text):
                 elif token in ('</ng>', '</vg>'):
                     in_chunk = False
                     chunk_tag = token[2:-1]
-                    ctag = Tag(TagId.next('c'), chunk_tag, chunk_begin, chunk_end, {'origin': CHUNKER})
+                    ctag = Tag(chunk_tag, chunk_begin, chunk_end,
+                               { 'id': TagId.next('c'), 'origin': CHUNKER })
                     self.document.tags.append(ctag)
                 elif in_chunk:
                     if chunk_begin is None:

diff --git a/docmodel/document.py b/docmodel/document.py
@@ -222,23 +222,20 @@ def __init__(self, filename='<STRING>'):
         self.metadata = {}
         self.tags = TagRepository()
         self.offset = 0
-        self.tag_number = 0
 
     def __getitem__(self, i):
         return self.text[i]
 
     def add_opening_tag(self, name, attrs):
         """Add an opening tag to source_tags. This is used by the
         StartElementHandler of the Expat parser in SourceParserXML."""
-        self.tag_number += 1
-        opening_tag = OpeningTag(self.tag_number, name, self.offset, attrs)
+        opening_tag = OpeningTag(name, self.offset, attrs)
         self.tags.add_tmp_tag(opening_tag)
 
     def add_closing_tag(self, name):
         """Add a closing tag to source_tags. This is used by the
         EndElementHandler of the Expat parser in SourceParserXML."""
-        self.tag_number += 1
-        closing_tag = ClosingTag(self.tag_number, name, self.offset)
+        closing_tag = ClosingTag(name, self.offset)
         self.tags.add_tmp_tag(closing_tag)
 
     def add_characters(self, string):
@@ -288,10 +285,10 @@ def print_source(self, filename):
 
 class TagRepository:
 
-    """Class that provides access to the tags for a document. An instance of
-    this class is used for the DocSource instance, other instances will be used
-    for the elements in a TarsqiDocument. For now, the repository has the
-    following structure:
+    """Class that provides access to the tags for a document. An instance of this
+    class is used for the DocSource instance, other instances will be used for
+    the elements in a TarsqiDocument. For now, the repository has the following
+    structure:
 
     self.tmp
        A list of OpeningTag and ClosingTag elements, used only to build the tags
@@ -344,15 +341,7 @@ def add_tmp_tag(self, tagInstance):
     def add_tag(self, name, begin, end, attrs):
         """Add a tag to the tags list and the opening_tags and closing_tags
         dictionaries."""
-        tag = Tag(None, name, begin, end, attrs)
-        self.tags.append(tag)
-        self.opening_tags.setdefault(begin, []).append(tag)
-        self.closing_tags.setdefault(end, {}).setdefault(begin, {})[tag.name] = True
-
-    def add_tag_with_id(self, name, identifier, begin, end, attrs):
-        """Add a tag to the tags list and the opening_tags and closing_tags
-        dictionaries."""
-        tag = Tag(identifier, name, begin, end, attrs)
+        tag = Tag(name, begin, end, attrs)
         self.tags.append(tag)
         self.opening_tags.setdefault(begin, []).append(tag)
         self.closing_tags.setdefault(end, {}).setdefault(begin, {})[tag.name] = True
@@ -376,7 +365,9 @@ def merge(self):
                 stack.append(t)
             elif t.name == stack[-1].name:
                 t1 = stack.pop()
-                tag = Tag(t1.id, t1.name, t1.begin, t.end, t1.attrs)
+                tag = Tag(t1.name, t1.begin, t.end, t1.attrs)
+                # We are not bothering to use add_tag since we will be building
+                # the index right after the merge.
                 self.tags.append(tag)
             else:
                 raise TarsqiInputError("non-matching tag %s" % t)
@@ -457,7 +448,7 @@ def find_tags_at(self, begin_offset):
 
     def import_tags(self, tag_repository, tagname):
         """Import all tags with name=tagname from tag_repository into self. This
-        is moslty used when we want to take tags from the SourceDoc and add them
+        is mostly used when we want to take tags from the SourceDoc and add them
         to the tags on the TarsqiDocument."""
         for tag in tag_repository.find_tags(tagname):
             self.add_tag(tagname, tag.begin, tag.end, tag.attrs)
@@ -485,29 +476,23 @@ def pp_closing_tags(self):
 
 class Tag:
 
-    """A Tag has a name, an id, a begin offset, an end offset and a dictionary
-    of attributes. The id is handed in by the code that creates the Tag which
-    could be: (1) the code that parses the source document, in which case
-    identifiers are numbered depending on text position, (2) the preprocessor
-    code, which assigns identifiers for lex, ng, vg and s tags, or (3) one of
-    the components that creates tarsqi tags, in which case the identifier is
-    None because special identifiers like eid, eiid, tid and lid are used."""
-
-    def __init__(self, identifier, name, o1, o2, attrs):
-        """Initialize id, name, begin, end and attrs instance variables and make
-        sure that what we have can be turned into valid XML by removing duplicate
+    """A Tag has a name, a begin offset, an end offset and a dictionary of
+    attributes. There is no separate id instance variable: an identifier, if
+    any, lives under 'id' in the attributes dictionary. It is put there by (1)
+    the code that parses the source document, but only if the source tag had an
+    id attribute, or (2) the preprocessor code, which assigns identifiers for
+    lex, ng, vg and s tags. (3) Components that create tarsqi tags do not set
+    an id but use special identifiers like eid, eiid, tid and lid."""
+
+    def __init__(self, name, o1, o2, attrs):
+        """Initialize name, begin, end and attrs instance variables and make sure
+        that what we have can be turned into valid XML by removing duplicate
         attribute names."""
-        self.id = identifier
         self.name = name
         self.begin = o1
         self.end = o2
-        # Sometimes attrs is None, use an empty dictionary in that case
+        # Sometimes attrs is None, use an empty dictionary in that case
         self.attrs = attrs or {}
-        # If there already was an 'id' attribute, use it to set or overwrite the
-        # identifier that was handed in.
-        if 'id' in self.attrs:
-            self.id = self.attrs.get('id')
-            del(self.attrs['id'])
         # In case existing tags have a begin or end attribute, replace it with a
         # generated new attribute name (if we have 'end', then the new attribute
         # name will be 'end-N' where N is 1 or a higher number if needed).
@@ -516,10 +501,9 @@ def __init__(self, identifier, name, o1, o2, attrs):
                 self.attrs[self.new_attr(attr, self.attrs)] = self.attrs.pop(attr)
 
     def __str__(self):
-        id_string = "id=%s " % self.id if self.id is not None else ''
         attrs = ''.join([" %s='%s'" % (k, v) for k, v in self.attrs.items()])
-        return "<Tag %s %s%s:%s {%s }>" % \
-               (self.name, id_string, self.begin, self.end, attrs)
+        return "<Tag %s %s:%s {%s }>" % \
+               (self.name, self.begin, self.end, attrs)
 
     def __cmp__(self, other):
         """Order two Tags based on their begin offset and end offsets. Tags with
@@ -551,23 +535,16 @@ def is_closing_tag(self):
 
     def as_ttk_tag(self):
         """Return the tag as a tag in the Tarsqi output format."""
-        # move id tag from attrs to toplevel if needed
-        # TODO: maybe this should happen elsewhere
-        if self.id is None and self.attrs.get('id'):
-            self.id = self.attrs.get('id')
-            del(self.attrs['id'])
         begin = " begin=\"%s\"" % self.begin if self.begin >= 0 else ''
         end = " end=\"%s\"" % self.end if self.end >= 0 else ''
-        identifier = "" if self.id is None else " id=" + quoteattr(str(self.id))
-        return "<%s%s%s%s%s />" % \
-            (self.name, identifier, begin, end, self.attributes_as_string())
+        return "<%s%s%s%s />" % (self.name, begin, end, self.attributes_as_string())
 
     def as_lex_xml_string(self, text):
         """Return an opening and closing tag wrapped around text. This is used only by
         the GUTime wrapper to create input for GUTime, and it therefore has a narrow
         focus and does not get all information from the tag."""
         return "<lex id=\"%s\" begin=\"%d\" end=\"%d\" pos=\"%s\">%s</lex>" % \
-            (self.id, self.begin, self.end, str(self.attrs['pos']), escape(text))
+            (None, self.begin, self.end, str(self.attrs['pos']), escape(text))
 
     def attributes_as_string(self):
         """Return a string representation of the attributes dictionary."""
@@ -581,12 +558,12 @@ class OpeningTag(Tag):
 
     "Like Tag, but self.end is always None."""
 
-    def __init__(self, id, name, offset, attrs):
-        Tag.__init__(self, id, name, offset, None, attrs)
+    def __init__(self, name, offset, attrs):
+        Tag.__init__(self, name, offset, None, attrs)
 
     def __str__(self):
-        return "<OpeningTag %d %s %d %s>" % \
-            (self.id, self.name, self.begin, str(self.attrs))
+        return "<OpeningTag %s %d %s>" % \
+            (self.name, self.begin, str(self.attrs))
 
     def is_opening_tag(self):
         return True
@@ -596,12 +573,12 @@ class ClosingTag(Tag):
 
     "Like Tag, but self.begin and self.attrs are always None."""
 
-    def __init__(self, id, name, offset):
-        Tag.__init__(self, id, name, None, offset, None)
+    def __init__(self, name, offset):
+        Tag.__init__(self, name, None, offset, None)
 
     def __str__(self):
-        return "<ClosingTag %d %s %d>" % \
-            (self.id, self.name, self.end)
+        return "<ClosingTag %s %d>" % \
+            (self.name, self.end)
 
     def is_closing_tag(self):
         return True

diff --git a/docmodel/source_parser.py b/docmodel/source_parser.py
@@ -141,12 +141,11 @@ def _add_to_tag_repository(self, node, tag_repository):
         name = node.tagName
         o1 = node.getAttribute('begin')
         o2 = node.getAttribute('end')
-        o1 = int(o1) if o1 else -1
-        o2 = int(o2) if o2 else -1
+        o1 = int(o1) if o1 is not None else -1
+        o2 = int(o2) if o2 is not None else -1
         attrs = dict(node.attributes.items())
         attrs = dict([(k, v) for (k, v) in attrs.items()
                       if k not in ('begin', 'end')])
-        # print name, o1, o2, attrs
         tag_repository.add_tag(name, o1, o2, attrs)
 
 
@@ -234,6 +233,7 @@ def _handle_start(self, name, attrs):
         of attributes. Asks the SourceDoc instance in the sourcedoc variable to
         add an opening tag."""
         self._debug('start', name, attrs)
+        #print ',,,', name, attrs
         self.sourcedoc.add_opening_tag(name, attrs)
 
     def _handle_end(self, name):

diff --git a/docs/code/modules/docmodel.document.html b/docs/code/modules/docmodel.document.html
@@ -48,7 +48,7 @@
 <blockquote>
 <h3>Public Functions</h3>
 <pre>
-<div class=function>__init__(self, id, name, offset)</div>
+<div class=function>__init__(self, name, offset)</div>
 </pre>
 <pre>
 <div class=function>__str__(self)</div>
@@ -67,7 +67,7 @@ <h3>Public Functions</h3>
 <blockquote>
 <h3>Public Functions</h3>
 <pre>
-<div class=function>__init__(self, id, name, offset, attrs)</div>
+<div class=function>__init__(self, name, offset, attrs)</div>
 </pre>
 <pre>
 <div class=function>__str__(self)</div>
@@ -141,13 +141,13 @@ <h3>Public Functions</h3>
 <a name="Tag"/><div class="section">class Tag</div>
 <pre>
 
-A Tag has a name, an id, a begin offset, an end offset and a dictionary
-of attributes. The id is handed in by the code that creates the Tag which
-could be: (1) the code that parses the source document, in which case
-identifiers are numbered depending on text position, (2) the preprocessor
-code, which assigns identifiers for lex, ng, vg and s tags, or (3) one of
-the components that creates tarsqi tags, in which case the identifier is
-None because special identifiers like eid, eiid, tid and lid are used.</pre>
+A Tag has a name, a begin offset, an end offset and a dictionary of
+attributes. There is no separate id instance variable: an identifier, if
+any, lives under 'id' in the attributes dictionary. It is put there by (1)
+the code that parses the source document, but only if the source tag had an
+id attribute, or (2) the preprocessor code, which assigns identifiers for
+lex, ng, vg and s tags. (3) Components that create tarsqi tags do not set
+an id but use special identifiers like eid, eiid, tid and lid.</pre>
 
 <blockquote>
 <h3>Public Functions</h3>
@@ -159,9 +159,9 @@ <h3>Public Functions</h3>
 no begin (that is, it is set to -1) will be ordered at the end. The
 order of two tags with the same begin and end is undefined.</pre>
 <pre>
-<div class=function>__init__(self, identifier, name, o1, o2, attrs)</div>
-Initialize id, name, begin, end and attrs instance variables and make
-sure that what we have can be turned into valid XML by removing duplicate
+<div class=function>__init__(self, name, o1, o2, attrs)</div>
+Initialize name, begin, end and attrs instance variables and make sure
+that what we have can be turned into valid XML by removing duplicate
 attribute names.</pre>
 <pre>
 <div class=function>__str__(self)</div>
@@ -188,10 +188,10 @@ <h3>Public Functions</h3>
 <a name="TagRepository"/><div class="section">class TagRepository</div>
 <pre>
 
-Class that provides access to the tags for a document. An instance of
-this class is used for the DocSource instance, other instances will be used
-for the elements in a TarsqiDocument. For now, the repository has the
-following structure:
+Class that provides access to the tags for a document. An instance of this
+class is used for the DocSource instance, other instances will be used for
+the elements in a TarsqiDocument. For now, the repository has the following
+structure:
 
 self.tmp
    A list of OpeningTag and ClosingTag elements, used only to build the tags
@@ -224,10 +224,6 @@ <h3>Public Functions</h3>
 Add a tag to the tags list and the opening_tags and closing_tags
 dictionaries.</pre>
 <pre>
-<div class=function>add_tag_with_id(self, name, identifier, begin, end, attrs)</div>
-Add a tag to the tags list and the opening_tags and closing_tags
-dictionaries.</pre>
-<pre>
 <div class=function>add_tmp_tag(self, tagInstance)</div>
 Add an OpeningTag or ClosingTag to a temporary list. Used by the XML
 handlers.</pre>
@@ -255,7 +251,7 @@ <h3>Public Functions</h3>
 <pre>
 <div class=function>import_tags(self, tag_repository, tagname)</div>
 Import all tags with name=tagname from tag_repository into self. This
-is moslty used when we want to take tags from the SourceDoc and add them
+is mostly used when we want to take tags from the SourceDoc and add them
 to the tags on the TarsqiDocument.</pre>
 <pre>
 <div class=function>index(self)</div>