From 3fc6668bc605b9c8766a9a2093eec5d91ee5278d Mon Sep 17 00:00:00 2001
From: Leon Oostrum
Date: Wed, 29 May 2024 11:21:14 +0200
Subject: [PATCH 1/3] Run text model inputs one-by-one through model to avoid
 shape mismatch errors

---
 dianna/dashboard/_movie_model.py          | 33 ++++++++++++-----------
 tutorials/explainers/LIME/lime_text.ipynb | 24 ++++++++---------
 2 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/dianna/dashboard/_movie_model.py b/dianna/dashboard/_movie_model.py
index 2cd48496..bd32ebb7 100644
--- a/dianna/dashboard/_movie_model.py
+++ b/dianna/dashboard/_movie_model.py
@@ -26,25 +26,26 @@ def __call__(self, sentences):
         if isinstance(sentences, str):
             sentences = [sentences]
 
-        tokenized_sentences = [
-            self.tokenize(sentence) for sentence in sentences
-        ]
+        output = []
+        for sentence in sentences:
+            # tokenize and pad to minimum length
+            tokens = self.tokenizer.tokenize(sentence)
+            if len(tokens) < self.max_filter_size:
+                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))
 
-        expected_length = len(tokenized_sentences[0])
-        if not all(
-                len(tokens) == expected_length
-                for tokens in tokenized_sentences):
-            raise ValueError(
-                'Mismatch in length of tokenized sentences.'
-                'This is a problem in the tokenizer:'
-                'https://github.com/dianna-ai/dianna/issues/531', )
+            # numericalize the tokens
+            tokens_numerical = [
+                self.vocab.stoi[token]
+                if token in self.vocab.stoi else self.vocab.stoi['<unk>']
+                for token in tokens
+            ]
 
-        # run the model, applying a sigmoid because the model outputs logits
-        logits = self.run_model(tokenized_sentences)
-        pred = np.apply_along_axis(sigmoid, 1, logits)
+            # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis
+            pred = float(sigmoid(self.run_model([tokens_numerical])))
+            output.append(pred)
 
-        # output pos/neg
-        positivity = pred[:, 0]
+        # output two classes
+        positivity = np.array(output)
         negativity = 1 - positivity
         return np.transpose([negativity, positivity])
 
diff --git a/tutorials/explainers/LIME/lime_text.ipynb b/tutorials/explainers/LIME/lime_text.ipynb
index 74846fbd..d92efbdb 100644
--- a/tutorials/explainers/LIME/lime_text.ipynb
+++ b/tutorials/explainers/LIME/lime_text.ipynb
@@ -187,38 +187,36 @@
    "source": [
     "class MovieReviewsModelRunner:\n",
     "    def __init__(self, model, word_vectors, max_filter_size):\n",
-    "        self.run_model = utils.get_function(str(model))\n",
+    "        self.run_model = utils.get_function(model)\n",
     "        self.vocab = Vectors(word_vectors, cache=os.path.dirname(word_vectors))\n",
     "        self.max_filter_size = max_filter_size\n",
     "        \n",
-    "        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')\n",
+    "        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')\n",
     "\n",
     "    def __call__(self, sentences):\n",
     "        # ensure the input has a batch axis\n",
     "        if isinstance(sentences, str):\n",
     "            sentences = [sentences]\n",
     "\n",
-    "        tokenized_sentences = []\n",
+    "        output = []\n",
     "        for sentence in sentences:\n",
    "            # tokenize and pad to minimum length\n",
-    "            tokens = self.tokenizer.tokenize(sentence.lower())\n",
+    "            tokens = self.tokenizer.tokenize(sentence)\n",
     "            if len(tokens) < self.max_filter_size:\n",
     "                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))\n",
     "            \n",
     "            # numericalize the tokens\n",
     "            tokens_numerical = [self.vocab.stoi[token] if token in self.vocab.stoi else self.vocab.stoi['<unk>']\n",
     "                                for token in tokens]\n",
-    "            tokenized_sentences.append(tokens_numerical)\n",
-    "            \n",
-    "        # run the model, applying a sigmoid because the model outputs logits\n",
-    "        logits = self.run_model(tokenized_sentences)\n",
-    "        pred = np.apply_along_axis(sigmoid, 1, logits)\n",
-    "        \n",
+    "\n",
+    "            # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis\n",
+    "            pred = float(sigmoid(self.run_model([tokens_numerical])))\n",
+    "            output.append(pred)\n",
+    "\n",
     "        # output two classes\n",
-    "        positivity = pred[:, 0]\n",
+    "        positivity = np.array(output)\n",
     "        negativity = 1 - positivity\n",
-    "        return np.transpose([negativity, positivity])\n",
-    "        "
+    "        return np.transpose([negativity, positivity]) "
    ]
   },
   {

From b8fa45b9e27db66c68addcbe9ef6698442000468 Mon Sep 17 00:00:00 2001
From: Leon Oostrum
Date: Wed, 29 May 2024 11:28:42 +0200
Subject: [PATCH 2/3] Fix tests movie model, also make sure to always make
 text lower case

---
 dianna/dashboard/_movie_model.py          |  2 +-
 tests/utils.py                            | 47 ++++++++++-----------
 tutorials/explainers/LIME/lime_text.ipynb |  2 +-
 tutorials/explainers/RISE/rise_text.ipynb |  2 +-
 4 files changed, 24 insertions(+), 29 deletions(-)

diff --git a/dianna/dashboard/_movie_model.py b/dianna/dashboard/_movie_model.py
index bd32ebb7..15377873 100644
--- a/dianna/dashboard/_movie_model.py
+++ b/dianna/dashboard/_movie_model.py
@@ -29,7 +29,7 @@ def __call__(self, sentences):
         output = []
         for sentence in sentences:
             # tokenize and pad to minimum length
-            tokens = self.tokenizer.tokenize(sentence)
+            tokens = self.tokenizer.tokenize(sentence.lower())
             if len(tokens) < self.max_filter_size:
                 tokens += ['<pad>'] * (self.max_filter_size - len(tokens))
 
diff --git a/tests/utils.py b/tests/utils.py
index ea7e731b..ed458e9c 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,5 +1,4 @@
 import numpy as np
-import onnxruntime as ort
 import spacy
 from scipy.special import expit as sigmoid
 from torchtext.vocab import Vectors
@@ -80,35 +79,31 @@ def __init__(self, model_path, word_vector_file, max_filter_size):
         self.max_filter_size = max_filter_size
 
     def __call__(self, sentences):
-        """Call function."""
+        """Call Runner."""
         # ensure the input has a batch axis
         if isinstance(sentences, str):
             sentences = [sentences]
 
-        sess = ort.InferenceSession(self.filename)
-        input_name = sess.get_inputs()[0].name
-        output_name = sess.get_outputs()[0].name
-
-        tokenized_sentences = [
-            self.tokenize(sentence) for sentence in sentences
-        ]
-
-        expected_length = len(tokenized_sentences[0])
-        if not all(
-                len(tokens) == expected_length
-                for tokens in tokenized_sentences):
-            raise ValueError(
-                'Mismatch in length of tokenized sentences.'
-                'This is a problem in the tokenizer:'
-                'https://github.com/dianna-ai/dianna/issues/531', )
-
-        # run the model, applying a sigmoid because the model outputs logits
-        onnx_input = {input_name: tokenized_sentences}
-        logits = sess.run([output_name], onnx_input)[0]
-        pred = np.apply_along_axis(sigmoid, 1, logits)
-
-        # output pos/neg
-        positivity = pred[:, 0]
+        output = []
+        for sentence in sentences:
+            # tokenize and pad to minimum length
+            tokens = self.tokenizer.tokenize(sentence.lower())
+            if len(tokens) < self.max_filter_size:
+                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))
+
+            # numericalize the tokens
+            tokens_numerical = [
+                self.vocab.stoi[token]
+                if token in self.vocab.stoi else self.vocab.stoi['<unk>']
+                for token in tokens
+            ]
+
+            # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis
+            pred = float(sigmoid(self.run_model([tokens_numerical])))
+            output.append(pred)
+
+        # output two classes
+        positivity = np.array(output)
         negativity = 1 - positivity
         return np.transpose([negativity, positivity])
 
diff --git a/tutorials/explainers/LIME/lime_text.ipynb b/tutorials/explainers/LIME/lime_text.ipynb
index d92efbdb..a5506f88 100644
--- a/tutorials/explainers/LIME/lime_text.ipynb
+++ b/tutorials/explainers/LIME/lime_text.ipynb
@@ -201,7 +201,7 @@
     "        output = []\n",
     "        for sentence in sentences:\n",
     "            # tokenize and pad to minimum length\n",
-    "            tokens = self.tokenizer.tokenize(sentence)\n",
+    "            tokens = self.tokenizer.tokenize(sentence.lower())\n",
     "            if len(tokens) < self.max_filter_size:\n",
     "                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))\n",
     "            \n",
diff --git a/tutorials/explainers/RISE/rise_text.ipynb b/tutorials/explainers/RISE/rise_text.ipynb
index d564194f..6ebfb9a0 100644
--- a/tutorials/explainers/RISE/rise_text.ipynb
+++ b/tutorials/explainers/RISE/rise_text.ipynb
@@ -169,7 +169,7 @@
     "        output = []\n",
     "        for sentence in sentences:\n",
     "            # tokenize and pad to minimum length\n",
-    "            tokens = self.tokenizer.tokenize(sentence)\n",
+    "            tokens = self.tokenizer.tokenize(sentence.lower())\n",
     "            if len(tokens) < self.max_filter_size:\n",
     "                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))\n",
     "            \n",

From 2fff22f75f6067910647e69baa4a444e32a76421 Mon Sep 17 00:00:00 2001
From: Leon Oostrum
Date: Wed, 29 May 2024 11:55:27 +0200
Subject: [PATCH 3/3] fix tests movie model, the lime output is now slightly
 different as well

---
 tests/methods/test_lime_text.py | 10 +++++-----
 tests/utils.py                  |  9 ++++++++-
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/methods/test_lime_text.py b/tests/methods/test_lime_text.py
index 5cec5cab..034c90fd 100644
--- a/tests/methods/test_lime_text.py
+++ b/tests/methods/test_lime_text.py
@@ -31,11 +31,11 @@ def test_lime_text(self):
     def test_lime_text_special_chars(self):
         """Tests exact expected output given a text with special characters and model for Lime."""
         review = 'such a bad movie "!?\'"'
-        expected_words = ['bad', '?', '!', 'movie', 'such', 'a', "'", '"', '"']
-        expected_word_indices = [2, 6, 5, 3, 0, 1, 7, 4, 8]
+        expected_words = ['bad', 'movie', '?', 'such', '!', "'", '"', 'a', '"']
+        expected_word_indices = [2, 3, 6, 0, 5, 7, 8, 1, 4]
         expected_scores = [
-            0.50032869, 0.06458735, -0.05793979, 0.01413776, -0.01246357,
-            -0.00528022, 0.00305347, 0.00185159, -0.00165128
+            0.51140699, 0.02827488, 0.02657974, -0.02208464, -0.02140743,
+            0.00962419, 0.00746798, -0.00743376, -0.0012061
         ]
 
         explanation = dianna.explain_text(self.runner,
@@ -44,7 +44,7 @@ def test_lime_text_special_chars(self):
                                           labels=[0],
                                           method='LIME',
                                           random_state=42)[0]
-
+        print(explanation)
         assert_explanation_satisfies_expectations(explanation, expected_scores,
                                                   expected_word_indices,
                                                   expected_words)
diff --git a/tests/utils.py b/tests/utils.py
index ed458e9c..c5ec7bb0 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,4 +1,5 @@
 import numpy as np
+import onnxruntime as ort
 import spacy
 from scipy.special import expit as sigmoid
 from torchtext.vocab import Vectors
@@ -84,6 +85,10 @@ def __call__(self, sentences):
         if isinstance(sentences, str):
             sentences = [sentences]
 
+        sess = ort.InferenceSession(self.filename)
+        input_name = sess.get_inputs()[0].name
+        output_name = sess.get_outputs()[0].name
+
         output = []
         for sentence in sentences:
             # tokenize and pad to minimum length
@@ -99,7 +104,9 @@ def __call__(self, sentences):
             ]
 
             # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis
-            pred = float(sigmoid(self.run_model([tokens_numerical])))
+            onnx_input = {input_name: [tokens_numerical]}
+            logits = sess.run([output_name], onnx_input)[0]
+            pred = float(sigmoid(logits))
             output.append(pred)
 
         # output two classes
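
For reference, below is a minimal standalone sketch of the per-sentence pattern these patches introduce (lowercase, pad to the minimum filter size, numericalize, run one input at a time, sigmoid the logit). The `run_per_sentence` helper, the toy `vocab`, and `fake_model` are illustrative assumptions and not code from this PR; only the `<pad>`/`<unk>` handling and the one-sentence-per-model-call structure mirror the patched runners.

```python
import numpy as np
from scipy.special import expit as sigmoid


def run_per_sentence(sentences, tokenize, stoi, run_model, max_filter_size=5):
    """Score each sentence independently so inputs never need a shared length."""
    if isinstance(sentences, str):
        sentences = [sentences]

    output = []
    for sentence in sentences:
        # tokenize and pad to the minimum length the model's filters require
        tokens = tokenize(sentence.lower())
        if len(tokens) < max_filter_size:
            tokens += ['<pad>'] * (max_filter_size - len(tokens))

        # numericalize, falling back to the unknown-token index
        tokens_numerical = [stoi.get(token, stoi['<unk>']) for token in tokens]

        # one sentence per model call; sigmoid because the model returns a logit
        output.append(float(sigmoid(run_model([tokens_numerical]))))

    positivity = np.array(output)
    return np.transpose([1 - positivity, positivity])


# toy stand-ins, purely illustrative
vocab = {'<pad>': 0, '<unk>': 1, 'bad': 2, 'movie': 3}


def fake_model(batch):
    return np.sum(batch) * 0.1  # pretend this is the model's logit


print(run_per_sentence(['such a bad movie', 'what a movie'], str.split, vocab, fake_model))
```

Because each sentence is padded and scored on its own, sentences of different lengths never have to be stacked into one rectangular batch, which is what triggered the original shape mismatch errors.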