From 3fc6668bc605b9c8766a9a2093eec5d91ee5278d Mon Sep 17 00:00:00 2001
From: Leon Oostrum
Date: Wed, 29 May 2024 11:21:14 +0200
Subject: [PATCH 1/3] Run text model inputs one-by-one through model to avoid
 shape mismatch errors

---
 dianna/dashboard/_movie_model.py          | 33 ++++++++++++-----------
 tutorials/explainers/LIME/lime_text.ipynb | 24 ++++++++---------
 2 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/dianna/dashboard/_movie_model.py b/dianna/dashboard/_movie_model.py
index 2cd48496..bd32ebb7 100644
--- a/dianna/dashboard/_movie_model.py
+++ b/dianna/dashboard/_movie_model.py
@@ -26,25 +26,26 @@ def __call__(self, sentences):
         if isinstance(sentences, str):
             sentences = [sentences]
 
-        tokenized_sentences = [
-            self.tokenize(sentence) for sentence in sentences
-        ]
+        output = []
+        for sentence in sentences:
+            # tokenize and pad to minimum length
+            tokens = self.tokenizer.tokenize(sentence)
+            if len(tokens) < self.max_filter_size:
+                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))
 
-        expected_length = len(tokenized_sentences[0])
-        if not all(
-                len(tokens) == expected_length
-                for tokens in tokenized_sentences):
-            raise ValueError(
-                'Mismatch in length of tokenized sentences.'
-                'This is a problem in the tokenizer:'
-                'https://github.com/dianna-ai/dianna/issues/531', )
+            # numericalize the tokens
+            tokens_numerical = [
+                self.vocab.stoi[token]
+                if token in self.vocab.stoi else self.vocab.stoi['<unk>']
+                for token in tokens
+            ]
 
-        # run the model, applying a sigmoid because the model outputs logits
-        logits = self.run_model(tokenized_sentences)
-        pred = np.apply_along_axis(sigmoid, 1, logits)
+            # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis
+            pred = float(sigmoid(self.run_model([tokens_numerical])))
+            output.append(pred)
 
-        # output pos/neg
-        positivity = pred[:, 0]
+        # output two classes
+        positivity = np.array(output)
         negativity = 1 - positivity
         return np.transpose([negativity, positivity])
 
diff --git a/tutorials/explainers/LIME/lime_text.ipynb b/tutorials/explainers/LIME/lime_text.ipynb
index 74846fbd..d92efbdb 100644
--- a/tutorials/explainers/LIME/lime_text.ipynb
+++ b/tutorials/explainers/LIME/lime_text.ipynb
@@ -187,38 +187,36 @@
    "source": [
     "class MovieReviewsModelRunner:\n",
     "    def __init__(self, model, word_vectors, max_filter_size):\n",
-    "        self.run_model = utils.get_function(str(model))\n",
+    "        self.run_model = utils.get_function(model)\n",
     "        self.vocab = Vectors(word_vectors, cache=os.path.dirname(word_vectors))\n",
     "        self.max_filter_size = max_filter_size\n",
     "        \n",
-    "        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')\n",
+    "        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')\n",
     "\n",
     "    def __call__(self, sentences):\n",
     "        # ensure the input has a batch axis\n",
     "        if isinstance(sentences, str):\n",
     "            sentences = [sentences]\n",
     "\n",
-    "        tokenized_sentences = []\n",
+    "        output = []\n",
     "        for sentence in sentences:\n",
    "            # tokenize and pad to minimum length\n",
-    "            tokens = self.tokenizer.tokenize(sentence.lower())\n",
+    "            tokens = self.tokenizer.tokenize(sentence)\n",
     "            if len(tokens) < self.max_filter_size:\n",
     "                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))\n",
     "            \n",
     "            # numericalize the tokens\n",
     "            tokens_numerical = [self.vocab.stoi[token] if token in self.vocab.stoi else self.vocab.stoi['<unk>']\n",
     "                                for token in tokens]\n",
-    "            tokenized_sentences.append(tokens_numerical)\n",
-    "            \n",
-    "        # run the model, applying a sigmoid because the model outputs logits\n",
-    "        logits = self.run_model(tokenized_sentences)\n",
-    "        pred = np.apply_along_axis(sigmoid, 1, logits)\n",
-    "        \n",
+    "\n",
+    "            # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis\n",
+    "            pred = float(sigmoid(self.run_model([tokens_numerical])))\n",
+    "            output.append(pred)\n",
+    "\n",
     "        # output two classes\n",
-    "        positivity = pred[:, 0]\n",
+    "        positivity = np.array(output)\n",
     "        negativity = 1 - positivity\n",
-    "        return np.transpose([negativity, positivity])\n",
-    "        "
+    "        return np.transpose([negativity, positivity]) "
    ]
   },
   {

From b8fa45b9e27db66c68addcbe9ef6698442000468 Mon Sep 17 00:00:00 2001
From: Leon Oostrum
Date: Wed, 29 May 2024 11:28:42 +0200
Subject: [PATCH 2/3] Fix tests movie model, also make sure to always make
 text lower case

---
 dianna/dashboard/_movie_model.py          |  2 +-
 tests/utils.py                            | 47 ++++++++++-----------
 tutorials/explainers/LIME/lime_text.ipynb |  2 +-
 tutorials/explainers/RISE/rise_text.ipynb |  2 +-
 4 files changed, 24 insertions(+), 29 deletions(-)

diff --git a/dianna/dashboard/_movie_model.py b/dianna/dashboard/_movie_model.py
index bd32ebb7..15377873 100644
--- a/dianna/dashboard/_movie_model.py
+++ b/dianna/dashboard/_movie_model.py
@@ -29,7 +29,7 @@ def __call__(self, sentences):
         output = []
         for sentence in sentences:
             # tokenize and pad to minimum length
-            tokens = self.tokenizer.tokenize(sentence)
+            tokens = self.tokenizer.tokenize(sentence.lower())
             if len(tokens) < self.max_filter_size:
                 tokens += ['<pad>'] * (self.max_filter_size - len(tokens))
 
diff --git a/tests/utils.py b/tests/utils.py
index ea7e731b..ed458e9c 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,5 +1,4 @@
 import numpy as np
-import onnxruntime as ort
 import spacy
 from scipy.special import expit as sigmoid
 from torchtext.vocab import Vectors
@@ -80,35 +79,31 @@ def __init__(self, model_path, word_vector_file, max_filter_size):
         self.max_filter_size = max_filter_size
 
     def __call__(self, sentences):
-        """Call function."""
+        """Call Runner."""
         # ensure the input has a batch axis
         if isinstance(sentences, str):
             sentences = [sentences]
 
-        sess = ort.InferenceSession(self.filename)
-        input_name = sess.get_inputs()[0].name
-        output_name = sess.get_outputs()[0].name
-
-        tokenized_sentences = [
-            self.tokenize(sentence) for sentence in sentences
-        ]
-
-        expected_length = len(tokenized_sentences[0])
-        if not all(
-                len(tokens) == expected_length
-                for tokens in tokenized_sentences):
-            raise ValueError(
-                'Mismatch in length of tokenized sentences.'
-                'This is a problem in the tokenizer:'
-                'https://github.com/dianna-ai/dianna/issues/531', )
-
-        # run the model, applying a sigmoid because the model outputs logits
-        onnx_input = {input_name: tokenized_sentences}
-        logits = sess.run([output_name], onnx_input)[0]
-        pred = np.apply_along_axis(sigmoid, 1, logits)
-
-        # output pos/neg
-        positivity = pred[:, 0]
+        output = []
+        for sentence in sentences:
+            # tokenize and pad to minimum length
+            tokens = self.tokenizer.tokenize(sentence.lower())
+            if len(tokens) < self.max_filter_size:
+                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))
+
+            # numericalize the tokens
+            tokens_numerical = [
+                self.vocab.stoi[token]
+                if token in self.vocab.stoi else self.vocab.stoi['<unk>']
+                for token in tokens
+            ]
+
+            # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis
+            pred = float(sigmoid(self.run_model([tokens_numerical])))
+            output.append(pred)
+
+        # output two classes
+        positivity = np.array(output)
         negativity = 1 - positivity
         return np.transpose([negativity, positivity])
 
diff --git a/tutorials/explainers/LIME/lime_text.ipynb b/tutorials/explainers/LIME/lime_text.ipynb
index d92efbdb..a5506f88 100644
--- a/tutorials/explainers/LIME/lime_text.ipynb
+++ b/tutorials/explainers/LIME/lime_text.ipynb
@@ -201,7 +201,7 @@
     "        output = []\n",
     "        for sentence in sentences:\n",
     "            # tokenize and pad to minimum length\n",
-    "            tokens = self.tokenizer.tokenize(sentence)\n",
+    "            tokens = self.tokenizer.tokenize(sentence.lower())\n",
     "            if len(tokens) < self.max_filter_size:\n",
     "                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))\n",
     "            \n",
diff --git a/tutorials/explainers/RISE/rise_text.ipynb b/tutorials/explainers/RISE/rise_text.ipynb
index d564194f..6ebfb9a0 100644
--- a/tutorials/explainers/RISE/rise_text.ipynb
+++ b/tutorials/explainers/RISE/rise_text.ipynb
@@ -169,7 +169,7 @@
     "        output = []\n",
     "        for sentence in sentences:\n",
     "            # tokenize and pad to minimum length\n",
-    "            tokens = self.tokenizer.tokenize(sentence)\n",
+    "            tokens = self.tokenizer.tokenize(sentence.lower())\n",
     "            if len(tokens) < self.max_filter_size:\n",
     "                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))\n",
     "            \n",

From 2fff22f75f6067910647e69baa4a444e32a76421 Mon Sep 17 00:00:00 2001
From: Leon Oostrum
Date: Wed, 29 May 2024 11:55:27 +0200
Subject: [PATCH 3/3] fix tests movie model, the lime output is now slightly
 different as well

---
 tests/methods/test_lime_text.py | 10 +++++-----
 tests/utils.py                  |  9 ++++++++-
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/tests/methods/test_lime_text.py b/tests/methods/test_lime_text.py
index 5cec5cab..034c90fd 100644
--- a/tests/methods/test_lime_text.py
+++ b/tests/methods/test_lime_text.py
@@ -31,11 +31,11 @@ def test_lime_text(self):
     def test_lime_text_special_chars(self):
         """Tests exact expected output given a text with special characters and model for Lime."""
         review = 'such a bad movie "!?\'"'
-        expected_words = ['bad', '?', '!', 'movie', 'such', 'a', "'", '"', '"']
-        expected_word_indices = [2, 6, 5, 3, 0, 1, 7, 4, 8]
+        expected_words = ['bad', 'movie', '?', 'such', '!', "'", '"', 'a', '"']
+        expected_word_indices = [2, 3, 6, 0, 5, 7, 8, 1, 4]
         expected_scores = [
-            0.50032869, 0.06458735, -0.05793979, 0.01413776, -0.01246357,
-            -0.00528022, 0.00305347, 0.00185159, -0.00165128
+            0.51140699, 0.02827488, 0.02657974, -0.02208464, -0.02140743,
+            0.00962419, 0.00746798, -0.00743376, -0.0012061
         ]
 
         explanation = dianna.explain_text(self.runner,
@@ -44,7 +44,7 @@ def test_lime_text_special_chars(self):
                                           labels=[0],
                                           method='LIME',
                                           random_state=42)[0]
-
+        print(explanation)
         assert_explanation_satisfies_expectations(explanation, expected_scores,
                                                   expected_word_indices,
                                                   expected_words)
diff --git a/tests/utils.py b/tests/utils.py
index ed458e9c..c5ec7bb0 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -1,4 +1,5 @@
 import numpy as np
+import onnxruntime as ort
 import spacy
 from scipy.special import expit as sigmoid
 from torchtext.vocab import Vectors
@@ -84,6 +85,10 @@ def __call__(self, sentences):
         if isinstance(sentences, str):
             sentences = [sentences]
 
+        sess = ort.InferenceSession(self.filename)
+        input_name = sess.get_inputs()[0].name
+        output_name = sess.get_outputs()[0].name
+
         output = []
         for sentence in sentences:
             # tokenize and pad to minimum length
@@ -99,7 +104,9 @@ def __call__(self, sentences):
             ]
 
             # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis
-            pred = float(sigmoid(self.run_model([tokens_numerical])))
+            onnx_input = {input_name: [tokens_numerical]}
+            logits = sess.run([output_name], onnx_input)[0]
+            pred = float(sigmoid(logits))
             output.append(pred)
 
         # output two classes
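
For reference, below is a minimal standalone sketch of the per-sentence pattern these patches introduce (lowercase, pad to the minimum filter size, numericalize, run one input at a time, sigmoid the logit). The `run_per_sentence` helper, the toy `vocab`, and `fake_model` are illustrative assumptions and not code from this PR; only the `<pad>`/`<unk>` handling and the one-sentence-per-model-call structure mirror the patched runners.

```python
import numpy as np
from scipy.special import expit as sigmoid


def run_per_sentence(sentences, tokenize, stoi, run_model, max_filter_size=5):
    """Score each sentence independently so inputs never need a shared length."""
    if isinstance(sentences, str):
        sentences = [sentences]

    output = []
    for sentence in sentences:
        # tokenize and pad to the minimum length the model's filters require
        tokens = tokenize(sentence.lower())
        if len(tokens) < max_filter_size:
            tokens += ['<pad>'] * (max_filter_size - len(tokens))

        # numericalize, falling back to the unknown-token index
        tokens_numerical = [stoi.get(token, stoi['<unk>']) for token in tokens]

        # one sentence per model call; sigmoid because the model returns a logit
        output.append(float(sigmoid(run_model([tokens_numerical]))))

    positivity = np.array(output)
    return np.transpose([1 - positivity, positivity])


# toy stand-ins, purely illustrative
vocab = {'<pad>': 0, '<unk>': 1, 'bad': 2, 'movie': 3}


def fake_model(batch):
    return np.sum(batch) * 0.1  # pretend this is the model's logit


print(run_per_sentence(['such a bad movie', 'what a movie'], str.split, vocab, fake_model))
```

Because each sentence is padded and scored on its own, sentences of different lengths never have to be stacked into one rectangular batch, which is what triggered the original shape mismatch errors.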