Skip to content

Commit

Permalink
Merge pull request #64 from tarsqi/hotfixes
Browse files Browse the repository at this point in the history
Updating to version 2.0.2
  • Loading branch information
marcverhagen authored Apr 9, 2017
2 parents 6facdb8 + 357d5a6 commit a7fed3d
Show file tree
Hide file tree
Showing 13 changed files with 237 additions and 124 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ The format is loosely based on [Keep a Changelog](http://keepachangelog.com/). L
This project tries to adhere to [Semantic Versioning](http://semver.org/).


## Version 2.0.2 - 2017-04-09

- Fixed bug where id attributes were added to source tags (issue #56)
- Fixed bug where directory names could not have spaces in them (issue #42)
- Added explanation of the unintuitive character offsets of docelement (issue #15)
- Changed how Tag instances are created, which used to be somewhat inconsistent.


## Version 2.0.1 - 2017-04-03

- Added links to Tarsqi publications to the manual.
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@

This is the main repository for the Tarsqi Toolkit (TTK), a set of processing components for extracting temporal information from news wire texts. TTK extracts time expressions, events, subordination links and temporal links; in addition, it can ensure consistency of temporal information.

To use the Tarsqi Toolkit first either clone this repository or download the most recent release from https://github.com/tarsqi/ttk/releases, then follow the instructions in the manual at `docs/manual/index.html`. Manuals can also be browsed on the [TimeML website](http://timeml.org/tarsqi/toolkit/manual/versions/).
To use the Tarsqi Toolkit, first either clone this repository or download the most recent release from https://github.com/tarsqi/ttk/releases, then follow the instructions in the manual at `docs/manual/index.html`. Manuals can also be browsed on the [TimeML website](http://timeml.org/tarsqi/toolkit/docs/versions/).
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.0.1
2.0.2
2 changes: 1 addition & 1 deletion components/common_modules/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def create_tarsqi_tree(tarsqidoc, element, links=False):
tree = TarsqiTree(tarsqidoc, element)
o1 = element.begin
o2 = element.end
top_tag = Tag(None, None, o1, o2, {})
top_tag = Tag(None, o1, o2, {})
top_node = Node(top_tag, None, tree)
for tag in (tarsqidoc.tags.find_tags(SENTENCE, o1, o2) +
tarsqidoc.tags.find_tags(NOUNCHUNK, o1, o2) +
Expand Down
24 changes: 14 additions & 10 deletions components/preprocessing/wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,12 +180,13 @@ def _export(self, text):
TagRepository using the preprocessing result."""
ctag = None
for sentence in text:
stag = Tag(TagId.next('s'), 's', None, None, {'origin': PREPROCESSOR})
sentence_attrs = { 'id': TagId.next('s'), 'origin': PREPROCESSOR }
stag = Tag('s', None, None, sentence_attrs)
for token in sentence:
if _is_tag(token):
if not token.startswith('</'):
ctag = Tag(TagId.next('c'), token[1:-1], None, None,
{'origin': PREPROCESSOR})
ctag = Tag(token[1:-1], None, None,
{ 'id': TagId.next('c'), 'origin': PREPROCESSOR })
else:
ctag.end = last_ltag.end
self.document.tags.append(ctag)
Expand Down Expand Up @@ -216,9 +217,9 @@ def _is_tag(token):

def _make_ltag(token):
"""Return an instance of Tag for the token."""
return Tag(TagId.next('l'), 'lex', token[3], token[4],
{ 'lemma': token[2], 'pos': token[1], 'text': token[0],
'origin': PREPROCESSOR })
return Tag('lex', token[3], token[4],
{ 'id': TagId.next('l'), 'lemma': token[2], 'pos': token[1],
'text': token[0], 'origin': PREPROCESSOR })


class TokenizerWrapper:
Expand Down Expand Up @@ -266,8 +267,9 @@ def _export_tokens(self, tokens):
s_begin, s_end = None, None
else:
begin, end = t.begin, t.end
attrs = { 'text': t.text, 'origin': TOKENIZER }
ltag = Tag(TagId.next('l'), 'lex', begin, end, attrs)
lid = TagId.next('l')
ltag = Tag('lex', begin, end,
{ 'id': lid, 'text': t.text, 'origin': TOKENIZER })
self.document.tags.append(ltag)
if s_begin is None:
s_begin = begin
Expand All @@ -278,7 +280,8 @@ def _export_tokens(self, tokens):
def _export_sentence(self, s_begin, s_end):
"""Add an s tag to the TagRepository of the TarsqiDocument."""
if s_begin is not None:
stag = Tag(TagId.next('s'), 's', s_begin, s_end, {'origin': TOKENIZER})
stag = Tag('s', s_begin, s_end,
{ 'id': TagId.next('s'), 'origin': TOKENIZER })
self.document.tags.append(stag)


Expand Down Expand Up @@ -429,7 +432,8 @@ def _export_chunks(self, text):
elif token in ('</ng>', '</vg>'):
in_chunk = False
chunk_tag = token[2:-1]
ctag = Tag(TagId.next('c'), chunk_tag, chunk_begin, chunk_end, {'origin': CHUNKER})
ctag = Tag(chunk_tag, chunk_begin, chunk_end,
{ 'id': TagId.next('c'), 'origin': CHUNKER })
self.document.tags.append(ctag)
elif in_chunk:
if chunk_begin is None:
Expand Down
93 changes: 35 additions & 58 deletions docmodel/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,23 +222,20 @@ def __init__(self, filename='<STRING>'):
self.metadata = {}
self.tags = TagRepository()
self.offset = 0
self.tag_number = 0

def __getitem__(self, i):
return self.text[i]

def add_opening_tag(self, name, attrs):
"""Add an opening tag to source_tags. This is used by the
StartElementHandler of the Expat parser in SourceParserXML."""
self.tag_number += 1
opening_tag = OpeningTag(self.tag_number, name, self.offset, attrs)
opening_tag = OpeningTag(name, self.offset, attrs)
self.tags.add_tmp_tag(opening_tag)

def add_closing_tag(self, name):
"""Add a closing tag to source_tags. This is used by the
EndElementHandler of the Expat parser in SourceParserXML."""
self.tag_number += 1
closing_tag = ClosingTag(self.tag_number, name, self.offset)
closing_tag = ClosingTag(name, self.offset)
self.tags.add_tmp_tag(closing_tag)

def add_characters(self, string):
Expand Down Expand Up @@ -288,10 +285,10 @@ def print_source(self, filename):

class TagRepository:

"""Class that provides access to the tags for a document. An instance of
this class is used for the DocSource instance, other instances will be used
for the elements in a TarsqiDocument. For now, the repository has the
following structure:
"""Class that provides access to the tags for a document. An instance of this
class is used for the DocSource instance, other instances will be used for
the elements in a TarsqiDocument. For now, the repository has the following
structure:
self.tmp
A list of OpeningTag and ClosingTag elements, used only to build the tags
Expand Down Expand Up @@ -344,15 +341,7 @@ def add_tmp_tag(self, tagInstance):
def add_tag(self, name, begin, end, attrs):
"""Add a tag to the tags list and the opening_tags and closing_tags
dictionaries."""
tag = Tag(None, name, begin, end, attrs)
self.tags.append(tag)
self.opening_tags.setdefault(begin, []).append(tag)
self.closing_tags.setdefault(end, {}).setdefault(begin, {})[tag.name] = True

def add_tag_with_id(self, name, identifier, begin, end, attrs):
"""Add a tag to the tags list and the opening_tags and closing_tags
dictionaries."""
tag = Tag(identifier, name, begin, end, attrs)
tag = Tag(name, begin, end, attrs)
self.tags.append(tag)
self.opening_tags.setdefault(begin, []).append(tag)
self.closing_tags.setdefault(end, {}).setdefault(begin, {})[tag.name] = True
Expand All @@ -376,7 +365,9 @@ def merge(self):
stack.append(t)
elif t.name == stack[-1].name:
t1 = stack.pop()
tag = Tag(t1.id, t1.name, t1.begin, t.end, t1.attrs)
tag = Tag(t1.name, t1.begin, t.end, t1.attrs)
# We are not bothering to use add_tag since we will be building
# the index right after the merge.
self.tags.append(tag)
else:
raise TarsqiInputError("non-matching tag %s" % t)
Expand Down Expand Up @@ -457,7 +448,7 @@ def find_tags_at(self, begin_offset):

def import_tags(self, tag_repository, tagname):
"""Import all tags with name=tagname from tag_repository into self. This
is moslty used when we want to take tags from the SourceDoc and add them
is mostly used when we want to take tags from the SourceDoc and add them
to the tags on the TarsqiDocument."""
for tag in tag_repository.find_tags(tagname):
self.add_tag(tagname, tag.begin, tag.end, tag.attrs)
Expand Down Expand Up @@ -485,29 +476,23 @@ def pp_closing_tags(self):

class Tag:

"""A Tag has a name, an id, a begin offset, an end offset and a dictionary
of attributes. The id is handed in by the code that creates the Tag which
could be: (1) the code that parses the source document, in which case
identifiers are numbered depending on text position, (2) the preprocessor
code, which assigns identifiers for lex, ng, vg and s tags, or (3) one of
the components that creates tarsqi tags, in which case the identifier is
None because special identifiers like eid, eiid, tid and lid are used."""

def __init__(self, identifier, name, o1, o2, attrs):
"""Initialize id, name, begin, end and attrs instance variables and make
sure that what we have can be turned into valid XML by removing duplicate
"""A Tag has a name, an id, a begin offset, an end offset and a dictionary of
attributes. The id is handed in by the code that creates the Tag which could
be: (1) the code that parses the source document, which will only assign an
identifier if the source had an id attribute, (2) the preprocessor code,
which assigns identifiers for lex, ng, vg and s tags, or (3) one of the
components that creates tarsqi tags, in which case the identifier is None,
but special identifiers like eid, eiid, tid and lid are used."""

def __init__(self, name, o1, o2, attrs):
"""Initialize name, begin, end and attrs instance variables and make sure
that what we have can be turned into valid XML by removing duplicate
attribute names."""
self.id = identifier
self.name = name
self.begin = o1
self.end = o2
# Sometimes attrs is None, use an empty dictionary in that case
# Sometimes attrs is None
self.attrs = attrs or {}
# If there already was an 'id' attribute, use it to set or overwrite the
# identifier that was handed in.
if 'id' in self.attrs:
self.id = self.attrs.get('id')
del(self.attrs['id'])
# In case existing tags have a begin or end attribute, replace it with a
# generated new attribute name (if we have 'end', then the new attribute
# name will be 'end-N' where N is 1 or a higher number if needed).
Expand All @@ -516,10 +501,9 @@ def __init__(self, identifier, name, o1, o2, attrs):
self.attrs[self.new_attr(attr, self.attrs)] = self.attrs.pop(attr)

def __str__(self):
id_string = "id=%s " % self.id if self.id is not None else ''
attrs = ''.join([" %s='%s'" % (k, v) for k, v in self.attrs.items()])
return "<Tag %s %s%s:%s {%s }>" % \
(self.name, id_string, self.begin, self.end, attrs)
return "<Tag %s %s:%s {%s }>" % \
(self.name, self.begin, self.end, attrs)

def __cmp__(self, other):
"""Order two Tags based on their begin offset and end offsets. Tags with
Expand Down Expand Up @@ -551,23 +535,16 @@ def is_closing_tag(self):

def as_ttk_tag(self):
"""Return the tag as a tag in the Tarsqi output format."""
# move id tag from attrs to toplevel if needed
# TODO: maybe this should happen elsewhere
if self.id is None and self.attrs.get('id'):
self.id = self.attrs.get('id')
del(self.attrs['id'])
begin = " begin=\"%s\"" % self.begin if self.begin >= 0 else ''
end = " end=\"%s\"" % self.end if self.end >= 0 else ''
identifier = "" if self.id is None else " id=" + quoteattr(str(self.id))
return "<%s%s%s%s%s />" % \
(self.name, identifier, begin, end, self.attributes_as_string())
return "<%s%s%s%s />" % (self.name, begin, end, self.attributes_as_string())

def as_lex_xml_string(self, text):
"""Return an opening and closing tag wrapped around text. This is used only by
the GUTime wrapper to create input for GUTime, and it therefore has a narrow
focus and does not get all information from the tag."""
return "<lex id=\"%s\" begin=\"%d\" end=\"%d\" pos=\"%s\">%s</lex>" % \
(self.id, self.begin, self.end, str(self.attrs['pos']), escape(text))
(None, self.begin, self.end, str(self.attrs['pos']), escape(text))

def attributes_as_string(self):
"""Return a string representation of the attributes dictionary."""
Expand All @@ -581,12 +558,12 @@ class OpeningTag(Tag):

"Like Tag, but self.end is always None."""

def __init__(self, id, name, offset, attrs):
Tag.__init__(self, id, name, offset, None, attrs)
def __init__(self, name, offset, attrs):
Tag.__init__(self, name, offset, None, attrs)

def __str__(self):
return "<OpeningTag %d %s %d %s>" % \
(self.id, self.name, self.begin, str(self.attrs))
return "<OpeningTag %s %d %s>" % \
(self.name, self.begin, str(self.attrs))

def is_opening_tag(self):
return True
Expand All @@ -596,12 +573,12 @@ class ClosingTag(Tag):

"Like Tag, but self.begin and self.attrs are always None."""

def __init__(self, id, name, offset):
Tag.__init__(self, id, name, None, offset, None)
def __init__(self, name, offset):
Tag.__init__(self, name, None, offset, None)

def __str__(self):
return "<ClosingTag %d %s %d>" % \
(self.id, self.name, self.end)
return "<ClosingTag %s %d>" % \
(self.name, self.end)

def is_closing_tag(self):
return True
Expand Down
6 changes: 3 additions & 3 deletions docmodel/source_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,12 +141,11 @@ def _add_to_tag_repository(self, node, tag_repository):
name = node.tagName
o1 = node.getAttribute('begin')
o2 = node.getAttribute('end')
o1 = int(o1) if o1 else -1
o2 = int(o2) if o2 else -1
o1 = int(o1) if o1 is not None else -1
o2 = int(o2) if o2 is not None else -1
attrs = dict(node.attributes.items())
attrs = dict([(k, v) for (k, v) in attrs.items()
if k not in ('begin', 'end')])
# print name, o1, o2, attrs
tag_repository.add_tag(name, o1, o2, attrs)


Expand Down Expand Up @@ -234,6 +233,7 @@ def _handle_start(self, name, attrs):
of attributes. Asks the SourceDoc instance in the sourcedoc variable to
add an opening tag."""
self._debug('start', name, attrs)
#print ',,,', name, attrs
self.sourcedoc.add_opening_tag(name, attrs)

def _handle_end(self, name):
Expand Down
38 changes: 17 additions & 21 deletions docs/code/modules/docmodel.document.html
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
<blockquote>
<h3>Public Functions</h3>
<pre>
<div class=function>__init__(self, id, name, offset)</div>
<div class=function>__init__(self, name, offset)</div>
</pre>
<pre>
<div class=function>__str__(self)</div>
Expand All @@ -67,7 +67,7 @@ <h3>Public Functions</h3>
<blockquote>
<h3>Public Functions</h3>
<pre>
<div class=function>__init__(self, id, name, offset, attrs)</div>
<div class=function>__init__(self, name, offset, attrs)</div>
</pre>
<pre>
<div class=function>__str__(self)</div>
Expand Down Expand Up @@ -141,13 +141,13 @@ <h3>Public Functions</h3>
<a name="Tag"/><div class="section">class Tag</div>
<pre>

A Tag has a name, an id, a begin offset, an end offset and a dictionary
of attributes. The id is handed in by the code that creates the Tag which
could be: (1) the code that parses the source document, in which case
identifiers are numbered depending on text position, (2) the preprocessor
code, which assigns identifiers for lex, ng, vg and s tags, or (3) one of
the components that creates tarsqi tags, in which case the identifier is
None because special identifiers like eid, eiid, tid and lid are used.</pre>
A Tag has a name, an id, a begin offset, an end offset and a dictionary of
attributes. The id is handed in by the code that creates the Tag which could
be: (1) the code that parses the source document, which will only assign an
identifier if the source had an id attribute, (2) the preprocessor code,
which assigns identifiers for lex, ng, vg and s tags, or (3) one of the
components that creates tarsqi tags, in which case the identifier is None,
but special identifiers like eid, eiid, tid and lid are used.</pre>

<blockquote>
<h3>Public Functions</h3>
Expand All @@ -159,9 +159,9 @@ <h3>Public Functions</h3>
no begin (that is, it is set to -1) will be ordered at the end. The
order of two tags with the same begin and end is undefined.</pre>
<pre>
<div class=function>__init__(self, identifier, name, o1, o2, attrs)</div>
Initialize id, name, begin, end and attrs instance variables and make
sure that what we have can be turned into valid XML by removing duplicate
<div class=function>__init__(self, name, o1, o2, attrs)</div>
Initialize name, begin, end and attrs instance variables and make sure
that what we have can be turned into valid XML by removing duplicate
attribute names.</pre>
<pre>
<div class=function>__str__(self)</div>
Expand All @@ -188,10 +188,10 @@ <h3>Public Functions</h3>
<a name="TagRepository"/><div class="section">class TagRepository</div>
<pre>

Class that provides access to the tags for a document. An instance of
this class is used for the DocSource instance, other instances will be used
for the elements in a TarsqiDocument. For now, the repository has the
following structure:
Class that provides access to the tags for a document. An instance of this
class is used for the DocSource instance, other instances will be used for
the elements in a TarsqiDocument. For now, the repository has the following
structure:

self.tmp
A list of OpeningTag and ClosingTag elements, used only to build the tags
Expand Down Expand Up @@ -224,10 +224,6 @@ <h3>Public Functions</h3>
Add a tag to the tags list and the opening_tags and closing_tags
dictionaries.</pre>
<pre>
<div class=function>add_tag_with_id(self, name, identifier, begin, end, attrs)</div>
Add a tag to the tags list and the opening_tags and closing_tags
dictionaries.</pre>
<pre>
<div class=function>add_tmp_tag(self, tagInstance)</div>
Add an OpeningTag or ClosingTag to a temporary list. Used by the XML
handlers.</pre>
Expand Down Expand Up @@ -255,7 +251,7 @@ <h3>Public Functions</h3>
<pre>
<div class=function>import_tags(self, tag_repository, tagname)</div>
Import all tags with name=tagname from tag_repository into self. This
is moslty used when we want to take tags from the SourceDoc and add them
is mostly used when we want to take tags from the SourceDoc and add them
to the tags on the TarsqiDocument.</pre>
<pre>
<div class=function>index(self)</div>
Expand Down
Loading

0 comments on commit a7fed3d

Please sign in to comment.