Merge pull request #773 from dianna-ai/fix-lime-text-special-chars
Run text model inputs one-by-one through model to avoid shape mismatch errors
elboyran authored May 29, 2024
2 parents 780f1c3 + 2fff22f commit 209a0ed
Showing 5 changed files with 56 additions and 55 deletions.
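
For context, the shape mismatch mentioned in the commit title comes from batching: the runners pad tokenized sentences only up to the model's minimum filter size, so two sentences in the same batch can still end up with different token counts and cannot be stacked into one rectangular model input (the situation the removed ValueError below guarded against, see https://github.com/dianna-ai/dianna/issues/531). A minimal sketch of the failure mode and the one-by-one workaround — the vocabulary, tokenizer, and lengths are made up for illustration, not taken from DIANNA:

import numpy as np

# Hypothetical vocabulary and tokenizer standing in for DIANNA's runners.
vocab = {'<pad>': 0, '<unk>': 1, 'bad': 2, 'movie': 3}

def tokenize_numericalize(sentence, min_length=5):
    # pad only up to a *minimum* length, as the runners do
    tokens = sentence.lower().split()
    if len(tokens) < min_length:
        tokens += ['<pad>'] * (min_length - len(tokens))
    return [vocab.get(token, vocab['<unk>']) for token in tokens]

sentences = ['bad movie', 'such a very bad movie indeed']
tokenized = [tokenize_numericalize(s) for s in sentences]
print([len(t) for t in tokenized])  # [5, 6] -- ragged rows

# Stacking ragged rows cannot produce a rectangular batch, which is what
# surfaces as a shape error once the batch reaches the model:
# np.asarray(tokenized, dtype=np.int64)  # ValueError

# Feeding sentences one at a time sidesteps this: every call is a
# well-formed (1, sequence_length) batch.
for row in tokenized:
    batch = np.asarray([row], dtype=np.int64)
    print(batch.shape)  # (1, 5), then (1, 6)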
33 changes: 17 additions & 16 deletions dianna/dashboard/_movie_model.py
@@ -26,25 +26,26 @@ def __call__(self, sentences):
         if isinstance(sentences, str):
             sentences = [sentences]
 
-        tokenized_sentences = [
-            self.tokenize(sentence) for sentence in sentences
-        ]
+        output = []
+        for sentence in sentences:
+            # tokenize and pad to minimum length
+            tokens = self.tokenizer.tokenize(sentence.lower())
+            if len(tokens) < self.max_filter_size:
+                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))
 
-        expected_length = len(tokenized_sentences[0])
-        if not all(
-                len(tokens) == expected_length
-                for tokens in tokenized_sentences):
-            raise ValueError(
-                'Mismatch in length of tokenized sentences.'
-                'This is a problem in the tokenizer:'
-                'https://github.com/dianna-ai/dianna/issues/531', )
+            # numericalize the tokens
+            tokens_numerical = [
+                self.vocab.stoi[token]
+                if token in self.vocab.stoi else self.vocab.stoi['<unk>']
+                for token in tokens
+            ]
 
-        # run the model, applying a sigmoid because the model outputs logits
-        logits = self.run_model(tokenized_sentences)
-        pred = np.apply_along_axis(sigmoid, 1, logits)
+            # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis
+            pred = float(sigmoid(self.run_model([tokens_numerical])))
+            output.append(pred)
 
-        # output pos/neg
-        positivity = pred[:, 0]
+        # output two classes
+        positivity = np.array(output)
         negativity = 1 - positivity
         return np.transpose([negativity, positivity])

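The patched __call__ keeps the runner's output contract intact: explainers still receive one [negativity, positivity] row per input sentence, now assembled from per-sentence scalars instead of a sliced batch prediction. A short sketch of the tail of the loop above, with made-up logit values:

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# Per-sentence positivity scores as collected by the loop; the logits
# (2.0 and -1.0) are invented for illustration.
output = [float(sigmoid(2.0)), float(sigmoid(-1.0))]

positivity = np.array(output)                    # shape (2,)
negativity = 1 - positivity
scores = np.transpose([negativity, positivity])  # shape (2, 2)
print(scores)  # [[0.1192 0.8808]
               #  [0.7311 0.2689]]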
10 changes: 5 additions & 5 deletions tests/methods/test_lime_text.py
@@ -31,11 +31,11 @@ def test_lime_text(self):
     def test_lime_text_special_chars(self):
         """Tests exact expected output given a text with special characters and model for Lime."""
         review = 'such a bad movie "!?\'"'
-        expected_words = ['bad', '?', '!', 'movie', 'such', 'a', "'", '"', '"']
-        expected_word_indices = [2, 6, 5, 3, 0, 1, 7, 4, 8]
+        expected_words = ['bad', 'movie', '?', 'such', '!', "'", '"', 'a', '"']
+        expected_word_indices = [2, 3, 6, 0, 5, 7, 8, 1, 4]
         expected_scores = [
-            0.50032869, 0.06458735, -0.05793979, 0.01413776, -0.01246357,
-            -0.00528022, 0.00305347, 0.00185159, -0.00165128
+            0.51140699, 0.02827488, 0.02657974, -0.02208464, -0.02140743,
+            0.00962419, 0.00746798, -0.00743376, -0.0012061
         ]
 
         explanation = dianna.explain_text(self.runner,
@@ -44,7 +44,7 @@
                                           labels=[0],
                                           method='LIME',
                                           random_state=42)[0]
-
+        print(explanation)
         assert_explanation_satisfies_expectations(explanation, expected_scores,
                                                   expected_word_indices,
                                                   expected_words)
44 changes: 23 additions & 21 deletions tests/utils.py
@@ -80,7 +80,7 @@ def __init__(self, model_path, word_vector_file, max_filter_size):
         self.max_filter_size = max_filter_size
 
     def __call__(self, sentences):
-        """Call function."""
+        """Call Runner."""
         # ensure the input has a batch axis
         if isinstance(sentences, str):
             sentences = [sentences]
@@ -89,26 +89,28 @@ def __call__(self, sentences):
         input_name = sess.get_inputs()[0].name
         output_name = sess.get_outputs()[0].name
 
-        tokenized_sentences = [
-            self.tokenize(sentence) for sentence in sentences
-        ]
-
-        expected_length = len(tokenized_sentences[0])
-        if not all(
-                len(tokens) == expected_length
-                for tokens in tokenized_sentences):
-            raise ValueError(
-                'Mismatch in length of tokenized sentences.'
-                'This is a problem in the tokenizer:'
-                'https://github.com/dianna-ai/dianna/issues/531', )
-
-        # run the model, applying a sigmoid because the model outputs logits
-        onnx_input = {input_name: tokenized_sentences}
-        logits = sess.run([output_name], onnx_input)[0]
-        pred = np.apply_along_axis(sigmoid, 1, logits)
-
-        # output pos/neg
-        positivity = pred[:, 0]
+        output = []
+        for sentence in sentences:
+            # tokenize and pad to minimum length
+            tokens = self.tokenizer.tokenize(sentence.lower())
+            if len(tokens) < self.max_filter_size:
+                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))
+
+            # numericalize the tokens
+            tokens_numerical = [
+                self.vocab.stoi[token]
+                if token in self.vocab.stoi else self.vocab.stoi['<unk>']
+                for token in tokens
+            ]
+
+            # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis
+            onnx_input = {input_name: [tokens_numerical]}
+            logits = sess.run([output_name], onnx_input)[0]
+            pred = float(sigmoid(logits))
+            output.append(pred)
+
+        # output two classes
+        positivity = np.array(output)
         negativity = 1 - positivity
         return np.transpose([negativity, positivity])

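Both runners rely on float(sigmoid(...)) to drop the leftover batch axis: a single-sentence batch yields a size-one logits array, which Python can collapse to a scalar. A minimal sketch with a made-up logit value; note that recent NumPy releases deprecate this implicit conversion for arrays with ndim > 0, so .item() is the stricter spelling:

import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

logits = np.array([[1.5]])     # made-up (1, 1) output of a 1-sentence batch
pred = float(sigmoid(logits))  # collapses the batch axis to a scalar
print(pred)                    # ~0.8176

pred_strict = sigmoid(logits).item()  # equivalent, warning-free alternative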
22 changes: 10 additions & 12 deletions tutorials/explainers/LIME/lime_text.ipynb
@@ -187,18 +187,18 @@
    "source": [
     "class MovieReviewsModelRunner:\n",
     "    def __init__(self, model, word_vectors, max_filter_size):\n",
-    "        self.run_model = utils.get_function(str(model))\n",
+    "        self.run_model = utils.get_function(model)\n",
     "        self.vocab = Vectors(word_vectors, cache=os.path.dirname(word_vectors))\n",
     "        self.max_filter_size = max_filter_size\n",
-    "        \n",
-    "        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')\n",
+    "        self.tokenizer = SpacyTokenizer(name='en_core_web_sm')\n",
     "\n",
     "    def __call__(self, sentences):\n",
     "        # ensure the input has a batch axis\n",
     "        if isinstance(sentences, str):\n",
     "            sentences = [sentences]\n",
     "\n",
-    "        tokenized_sentences = []\n",
+    "        output = []\n",
     "        for sentence in sentences:\n",
     "            # tokenize and pad to minimum length\n",
     "            tokens = self.tokenizer.tokenize(sentence.lower())\n",
@@ -208,17 +208,15 @@
     "            # numericalize the tokens\n",
     "            tokens_numerical = [self.vocab.stoi[token] if token in self.vocab.stoi else self.vocab.stoi['<unk>']\n",
     "                                for token in tokens]\n",
-    "            tokenized_sentences.append(tokens_numerical)\n",
-    "    \n",
-    "        # run the model, applying a sigmoid because the model outputs logits\n",
-    "        logits = self.run_model(tokenized_sentences)\n",
-    "        pred = np.apply_along_axis(sigmoid, 1, logits)\n",
-    "    \n",
+    "\n",
+    "            # run the model, applying a sigmoid because the model outputs logits, remove any remaining batch axis\n",
+    "            pred = float(sigmoid(self.run_model([tokens_numerical])))\n",
+    "            output.append(pred)\n",
     "\n",
     "        # output two classes\n",
-    "        positivity = pred[:, 0]\n",
+    "        positivity = np.array(output)\n",
     "        negativity = 1 - positivity\n",
-    "        return np.transpose([negativity, positivity])\n",
-    "    "
+    "        return np.transpose([negativity, positivity]) "
    ]
   },
   {
2 changes: 1 addition & 1 deletion tutorials/explainers/RISE/rise_text.ipynb
@@ -169,7 +169,7 @@
     "        output = []\n",
     "        for sentence in sentences:\n",
     "            # tokenize and pad to minimum length\n",
-    "            tokens = self.tokenizer.tokenize(sentence)\n",
+    "            tokens = self.tokenizer.tokenize(sentence.lower())\n",
     "            if len(tokens) < self.max_filter_size:\n",
     "                tokens += ['<pad>'] * (self.max_filter_size - len(tokens))\n",
     "    \n",