From 2018eeb6ea695639b815ac55c1f133d16c29a73e Mon Sep 17 00:00:00 2001 From: Nicolas Garneau Date: Thu, 8 Feb 2018 17:56:01 -0500 Subject: [PATCH] Add the feature for automatically annotate relations --- .gitignore | 3 + client/src/annotator_ui.js | 44 ++++++++++++++ diff.xhtml | 12 ++++ index.xhtml | 12 ++++ server/src/dispatch.py | 4 +- server/src/document.py | 7 +++ server/src/projectconfig.py | 22 ++++++- server/src/tag.py | 111 ++++++++++++++++++++++++++++++++++-- 8 files changed, 207 insertions(+), 8 deletions(-) diff --git a/.gitignore b/.gitignore index b943b0257..052a5af4a 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,6 @@ config.py # Standard locations of data and server temporary files data/ work/ + +venv/ +.idea/ diff --git a/client/src/annotator_ui.js b/client/src/annotator_ui.js index b3afe214f..e96af6930 100644 --- a/client/src/annotator_ui.js +++ b/client/src/annotator_ui.js @@ -2102,6 +2102,16 @@ var AnnotatorUI = (function($, window, undefined) { dispatcher.post('ajax', [tagOptions, 'edited']); } + var tagRelationCurrentDocument = function(taggerId) { + var tagOptions = { + action: 'link', + collection: coll, + 'document': doc, + tagger: taggerId, + }; + dispatcher.post('ajax', [tagOptions, 'edited']); + } + var setupTaggerUI = function(response) { var taggers = response.ner_taggers || []; $taggerButtons = $('#tagger_buttons').empty(); @@ -2135,6 +2145,39 @@ var AnnotatorUI = (function($, window, undefined) { } } + var setupLinkerUI = function(response) { + var taggers = response.re_taggers || []; + $taggerButtons = $('#rel_tagger_buttons').empty(); + $.each(taggers, function(taggerNo, tagger) { + // expect a tuple with ID, name, model, and URL + var taggerId = tagger[0]; + var taggerName = tagger[1]; + var taggerModel = tagger[2]; + if (!taggerId || !taggerName || !taggerModel) { + dispatcher.post('messages', [[['Invalid tagger specification received from server', 'error']]]); + return true; // continue + } + var $row = $('
'); + var $label = $(''+Util.escapeHTML(taggerName)+''); + var $button = $(''); + $row.append($label).append($button); + $taggerButtons.append($row); + $button.click(function(evt) { + tagRelationCurrentDocument(taggerId); + }); + }); + $taggerButtons.find('input').button(); + // if nothing was set up, hide the whole fieldset and show + // a message to this effect, else the other way around + if ($taggerButtons.find('input').length == 0) { + $('#auto_rel_tagging_fieldset').hide(); + $('#no_rel_tagger_message').show(); + } else { + $('#auto_rel_tagging_fieldset').show(); + $('#no_rel_tagger_message').hide(); + } + } + // recursively traverses type hierarchy (entity_types or // event_types) and stores normalizations in normDbsByType. var rememberNormDbsForType = function(types) { @@ -2810,6 +2853,7 @@ var AnnotatorUI = (function($, window, undefined) { on('dataReady', rememberData). on('collectionLoaded', rememberSpanSettings). on('collectionLoaded', setupTaggerUI). + on('collectionLoaded', setupLinkerUI). on('collectionLoaded', setupNormalizationUI). on('spanAndAttributeTypesLoaded', spanAndAttributeTypesLoaded). on('newSourceData', onNewSourceData). diff --git a/diff.xhtml b/diff.xhtml index 10119d081..217b9f478 100644 --- a/diff.xhtml +++ b/diff.xhtml @@ -255,6 +255,18 @@
(No tools set up. Please contact server maintainer if needed.)
+ +
+
+ Automatic relation annotation +
Automatically tag relations on current document
+
+
+ +
Import diff --git a/index.xhtml b/index.xhtml index 933e271cf..28771669c 100644 --- a/index.xhtml +++ b/index.xhtml @@ -209,6 +209,18 @@
(No tools set up. Please contact server administrator if needed.)
+ +
+
+ Automatic relation annotation +
Automatically tag relations on current document
+
+
+ +
Import diff --git a/server/src/dispatch.py b/server/src/dispatch.py index a849514e8..7d874ae7e 100644 --- a/server/src/dispatch.py +++ b/server/src/dispatch.py @@ -34,7 +34,7 @@ from search import search_text, search_entity, search_event, search_relation, search_note from predict import suggest_span_types from undo import undo -from tag import tag +from tag import tag, link from delete import delete_document, delete_collection from norm import norm_get_name, norm_search, norm_get_data @@ -90,6 +90,7 @@ def logging_no_op(collection, document, log): 'undo': undo, 'tag': tag, + 'link': link, 'deleteDocument': delete_document, 'deleteCollection': delete_collection, @@ -134,6 +135,7 @@ def logging_no_op(collection, document, log): 'searchNoteInCollection', 'tag', + 'link', )) # Sanity check diff --git a/server/src/document.py b/server/src/document.py index 2f430983a..ac4415f98 100644 --- a/server/src/document.py +++ b/server/src/document.py @@ -405,6 +405,9 @@ def get_annotator_config(directory): # where most annotators are expected to be human. Rethink. return ProjectConfiguration(directory).get_annotator_config() +def get_linker_config(directory): + return ProjectConfiguration(directory).get_linker_config() + def assert_allowed_to_read(doc_path): if not allowed_to_read(doc_path): raise AccessDeniedError # Permission denied by access control @@ -586,6 +589,9 @@ def get_directory_information(collection): # fill in NER services, if any ner_taggers = get_annotator_config(real_dir) + # fill in RE services, if any + re_taggers = get_linker_config(real_dir) + return _inject_annotation_type_conf(real_dir, json_dic={ 'items': combolist, 'header' : doclist_header, @@ -597,6 +603,7 @@ def get_directory_information(collection): 'normalization_config' : normalization_config, 'annotation_logging': ann_logging, 'ner_taggers': ner_taggers, + 're_taggers': re_taggers }) class UnableToReadTextFile(ProtocolError): diff --git a/server/src/projectconfig.py b/server/src/projectconfig.py index caa41a9eb..a18ea890d 100644 --- a/server/src/projectconfig.py +++ b/server/src/projectconfig.py @@ -58,11 +58,12 @@ class InvalidProjectConfigException(Exception): # tools config section name constants SEARCH_SECTION = "search" ANNOTATORS_SECTION = "annotators" +LINKERS_SECTION = "linkers" DISAMBIGUATORS_SECTION = "disambiguators" NORMALIZATION_SECTION = "normalization" -__expected_tools_sections = (OPTIONS_SECTION, SEARCH_SECTION, ANNOTATORS_SECTION, DISAMBIGUATORS_SECTION, NORMALIZATION_SECTION) -__optional_tools_sections = (OPTIONS_SECTION, SEARCH_SECTION, ANNOTATORS_SECTION, DISAMBIGUATORS_SECTION, NORMALIZATION_SECTION) +__expected_tools_sections = (OPTIONS_SECTION, SEARCH_SECTION, ANNOTATORS_SECTION, LINKERS_SECTION, DISAMBIGUATORS_SECTION, NORMALIZATION_SECTION) +__optional_tools_sections = (OPTIONS_SECTION, SEARCH_SECTION, ANNOTATORS_SECTION, LINKERS_SECTION, DISAMBIGUATORS_SECTION, NORMALIZATION_SECTION) # special relation types for marking which spans can overlap # ENTITY_NESTING_TYPE used up to version 1.3, now deprecated @@ -711,6 +712,7 @@ def get_visual_configs(directory): OPTIONS_SECTION : [], SEARCH_SECTION : [TypeHierarchyNode(["google"], [":http://www.google.com/search?q=%s"])], ANNOTATORS_SECTION : [], + LINKERS_SECTION : [], DISAMBIGUATORS_SECTION : [], NORMALIZATION_SECTION : [], } @@ -781,6 +783,9 @@ def get_search_config(directory): def get_annotator_config(directory): return get_tools_configs(directory)[0][ANNOTATORS_SECTION] +def get_linker_config(directory): + return get_tools_configs(directory)[0][LINKERS_SECTION] + def get_disambiguator_config(directory): return get_tools_configs(directory)[0][DISAMBIGUATORS_SECTION] @@ -875,7 +880,14 @@ def get_annotator_config_list(directory): if directory not in cache: cache[directory] = __type_hierarchy_to_list(get_annotator_config(directory)) return cache[directory] -get_annotator_config_list.__cache = {} +get_annotator_config_list.__cache = {} + +def get_linker_config_list(directory): + cache = get_linker_config_list.__cache + if directory not in cache: + cache[directory] = __type_hierarchy_to_list(get_linker_config(directory)) + return cache[directory] +get_linker_config_list.__cache = {} def get_disambiguator_config_list(directory): cache = get_disambiguator_config_list.__cache @@ -1536,6 +1548,10 @@ def get_annotator_config(self): tool_list = get_annotator_config_list(self.directory) return self._get_tool_config(tool_list) + def get_linker_config(self): + tool_list = get_linker_config_list(self.directory) + return self._get_tool_config(tool_list) + def get_normalization_config(self): norm_list = get_normalization_config_list(self.directory) norm_config = [] diff --git a/server/src/tag.py b/server/src/tag.py index 23b73993f..099825517 100644 --- a/server/src/tag.py +++ b/server/src/tag.py @@ -16,12 +16,12 @@ from socket import error as SocketError from urlparse import urlparse -from annotation import TextAnnotations, TextBoundAnnotationWithText +from annotation import TextAnnotations, TextBoundAnnotationWithText, BinaryRelationAnnotation from annotation import NormalizationAnnotation from annotator import _json_from_ann, ModificationTracker from common import ProtocolError from document import real_directory -from jsonwrap import loads +from jsonwrap import loads, dumps from message import Messager from projectconfig import ProjectConfiguration @@ -87,6 +87,9 @@ def _is_textbound(ann): def _is_normalization(ann): return 'target' in ann +def _is_relation(ann): + return 'rel_type' in ann + def tag(collection, document, tagger): pconf = ProjectConfiguration(real_directory(collection)) for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config(): @@ -157,8 +160,7 @@ def tag(collection, document, tagger): mods = ModificationTracker() cidmap = {} - for cid, ann in ((i, a) for i, a in json_resp.iteritems() - if _is_textbound(a)): + for cid, ann in ((i, a) for i, a in json_resp.iteritems() if _is_textbound(a)): assert 'offsets' in ann, 'Tagger response lacks offsets' offsets = ann['offsets'] assert 'type' in ann, 'Tagger response lacks type' @@ -202,6 +204,107 @@ def tag(collection, document, tagger): mod_resp['annotations'] = _json_from_ann(ann_obj) return mod_resp + +def link(collection, document, tagger): + pconf = ProjectConfiguration(real_directory(collection)) + for linker_token, _, _, linker_service_url in pconf.get_linker_config(): + if tagger == linker_token: + break + else: + raise UnknownTaggerError(tagger) + + with TextAnnotations(path_join(real_directory(collection), document)) as ann_obj: + + url_soup = urlparse(linker_service_url) + + if url_soup.scheme == 'http': + Connection = HTTPConnection + elif url_soup.scheme == 'https': + # Delayed HTTPS import since it relies on SSL which is commonly + # missing if you roll your own Python, for once we should not + # fail early since tagging is currently an edge case and we + # can't allow it to bring down the whole server. + from httplib import HTTPSConnection + Connection = HTTPSConnection + else: + raise InvalidConnectionSchemeError(linker_token, url_soup.scheme) + + conn = None + try: + conn = Connection(url_soup.netloc) + req_headers = { + 'Content-type': 'text/plain; charset=utf-8', + 'Accept': 'application/json', + } + # Build a new service URL since the request method doesn't accept + # a parameters argument + service_url = url_soup.path + ( + '?' + url_soup.query if url_soup.query else '') + try: + entities = list() + for e in ann_obj.get_entities(): + s = "{}\t{} {} {}\t{}\n".format( + e.id, + e.type, + e.start, + e.end, + e.text.encode('utf-8') + ) + entities.append(s) + data = { + 'document': ann_obj.get_document_text().encode('utf-8'), + 'entities': entities + } + # req_headers['Content-length'] = len(data) + # Note: Trout slapping for anyone sending Unicode objects here + conn.request('POST', + # As per: http://bugs.python.org/issue11898 + # Force the url to be an ascii string + str(service_url), + dumps(data), + headers=req_headers) + except SocketError, e: + raise TaggerConnectionError(linker_token, e) + resp = conn.getresponse() + + # Did the request succeed? + if resp.status != 200: + raise TaggerConnectionError(linker_token, + '%s %s' % (resp.status, resp.reason)) + # Finally, we can read the response data + resp_data = resp.read() + finally: + if conn is not None: + conn.close() + + try: + json_resp = loads(resp_data) + except ValueError: + raise InvalidTaggerResponseError(linker_token, resp_data) + + mods = ModificationTracker() + cidmap = {} + + for cid, ann in ((i, a) for i, a in json_resp.iteritems() if _is_relation(a)): + assert 'rel_type' in ann, 'Tagger response lacks rel_type' + rel_type = ann['rel_type'] + assert 'arg1' in ann, 'Tagger response lacks arg1' + arg1 = ann['arg1'] + assert 'arg2' in ann, 'Tagger response lacks arg2' + arg2 = ann['arg2'] + + _id = ann_obj.get_new_id('R') + cidmap[cid] = _id + + tb = BinaryRelationAnnotation(_id, rel_type, 'Arg1', arg1, 'Arg2', arg2, "") + + mods.addition(tb) + ann_obj.add_annotation(tb) + + mod_resp = mods.json_response() + mod_resp['annotations'] = _json_from_ann(ann_obj) + return mod_resp + if __name__ == '__main__': # Silly test, but helps tag('/BioNLP-ST_2011_ID_devel', 'PMC1874608-01-INTRODUCTION', 'random')