diff --git a/docs/book/reference/all-metrics.md b/docs/book/reference/all-metrics.md
index 29d5762a5b..35a3d50ba0 100644
--- a/docs/book/reference/all-metrics.md
+++ b/docs/book/reference/all-metrics.md
@@ -319,7 +319,8 @@ Use pre-trained machine learning models for evaluation.
| **Sentiment()**
- Analyzes the sentiment of the text using a word-based model.
- Returns a score on a scale: -1 (negative) to 1 (positive).
| **Required:**
n/a
**Optional:** |
| **HuggingFaceModel()**
Scores the text using the user-selected HuggingFace model.| See [docs](../customization/huggingface_descriptor.md) with some example models (classification by topic, emotion, etc.)|
| **HuggingFaceToxicityModel()** - Detects hate speech using [HuggingFace Model](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target).
- Returns predicted probability for the “hate” label.
- Scale: 0 to 1.
| **Optional**: - `toxic_label="hate"` (default)
- `display_name`
|
-
+| **BERTScore()** - Calculates similarity between two text columns based on token embeddings from a pre-trained BERT model.
- Returns [BERTScore](https://arxiv.org/pdf/1904.09675) (F1 Score) based on cosine similarity between token embeddings.
| **Required:** - `with_column`: Name of the second text column to compare against. **Optional:** - `model`: Name of the pre-trained BERT model to use (default: `"bert-base-uncased"`).
- `tfidf_weighted`: Boolean indicating whether embeddings should be weighted with inverse document frequency (IDF) scores (default: `False`).
- `display_name`
|
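+
+A minimal usage sketch (the `Report` / `TextEvals` wiring and the column names below are illustrative assumptions, not part of this table entry):
+
+```python
+from evidently.report import Report
+from evidently.metric_preset import TextEvals
+from evidently.descriptors import BERTScore
+
+report = Report(metrics=[
+    TextEvals(column_name="response", descriptors=[
+        BERTScore(with_column="reference_response", display_name="BERTScore vs reference"),
+    ]),
+])
+report.run(reference_data=None, current_data=df)  # df is a pandas DataFrame containing both text columns
+```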
+
# Data Drift
**Defaults for Data Drift**. By default, all data drift metrics use the Evidently [drift detection logic](data-drift-algorithm.md) that selects a drift detection method based on feature type and volume. You always need a reference dataset.
diff --git a/src/evidently/descriptors/BERTScore_descriptor.py b/src/evidently/descriptors/BERTScore_descriptor.py
new file mode 100644
index 0000000000..f10253b636
--- /dev/null
+++ b/src/evidently/descriptors/BERTScore_descriptor.py
@@ -0,0 +1,13 @@
+from evidently.features.BERTScore_feature import BERTScoreFeature
+from evidently.features.generated_features import FeatureDescriptor
+from evidently.features.generated_features import GeneratedFeatures
+
+
+class BERTScore(FeatureDescriptor):
+ class Config:
+ type_alias = "evidently:descriptor:BERTScore"
+
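+    # Name of the second text column to compare against; the first column is the one the descriptor is applied to.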
+ with_column: str
+
+ def feature(self, column_name: str) -> GeneratedFeatures:
+ return BERTScoreFeature(columns=[column_name, self.with_column], display_name=self.display_name)
diff --git a/src/evidently/descriptors/__init__.py b/src/evidently/descriptors/__init__.py
index 174c3a73f1..3c3d0228eb 100644
--- a/src/evidently/descriptors/__init__.py
+++ b/src/evidently/descriptors/__init__.py
@@ -1,4 +1,5 @@
from . import _registry
+from .BERTScore_descriptor import BERTScore
from .contains_link_descriptor import ContainsLink
from .custom_descriptor import CustomColumnEval
from .custom_descriptor import CustomPairColumnEval
@@ -38,6 +39,7 @@
from .words_descriptor import WordNoMatch
__all__ = [
+ "BERTScore",
"CustomColumnEval",
"CustomPairColumnEval",
"HuggingFaceModel",
diff --git a/src/evidently/descriptors/_registry.py b/src/evidently/descriptors/_registry.py
index cf385db060..4a73d281be 100644
--- a/src/evidently/descriptors/_registry.py
+++ b/src/evidently/descriptors/_registry.py
@@ -66,6 +66,11 @@
"evidently.descriptors.semantic_similarity.SemanticSimilarity",
"evidently:descriptor:SemanticSimilarity",
)
+register_type_alias(
+ FeatureDescriptor,
+ "evidently.descriptors.BERTScore_descriptor.BERTScore",
+ "evidently:descriptor:BERTScore",
+)
register_type_alias(
FeatureDescriptor,
"evidently.descriptors.sentence_count_descriptor.SentenceCount",
diff --git a/src/evidently/features/BERTScore_feature.py b/src/evidently/features/BERTScore_feature.py
new file mode 100644
index 0000000000..88dfecdc14
--- /dev/null
+++ b/src/evidently/features/BERTScore_feature.py
@@ -0,0 +1,120 @@
+from collections import defaultdict
+from typing import ClassVar
+from typing import Dict
+from typing import List
+from typing import Tuple
+
+import numpy as np
+import pandas as pd
+
+from evidently.base_metric import ColumnName
+from evidently.core import ColumnType
+from evidently.features.generated_features import GeneratedFeature
+from evidently.utils.data_preprocessing import DataDefinition
+
+
+class BERTScoreFeature(GeneratedFeature):
+ class Config:
+ type_alias = "evidently:feature:BERTScoreFeature"
+
+ __feature_type__: ClassVar = ColumnType.Numerical
+ columns: List[str]
+ model: str = "bert-base-uncased" # Pretrained BERT model
+ tfidf_weighted: bool = False # Whether to weight embeddings with IDF
+
+ def generate_feature(self, data: pd.DataFrame, data_definition: DataDefinition) -> pd.DataFrame:
+ # Load BERT model and tokenizer
+ from transformers import BertModel
+ from transformers import BertTokenizer
+
+ tokenizer = BertTokenizer.from_pretrained(self.model)
+ model = BertModel.from_pretrained(self.model)
+
+ # Tokenize sentences
+ tokens_first = tokenizer(
+ data[self.columns[0]].fillna("").tolist(), return_tensors="pt", padding=True, truncation=True
+ )
+ tokens_second = tokenizer(
+ data[self.columns[1]].fillna("").tolist(), return_tensors="pt", padding=True, truncation=True
+ )
+
+ # Get embeddings
+ embeddings_first = model(**tokens_first).last_hidden_state.detach().numpy()
+ embeddings_second = model(**tokens_second).last_hidden_state.detach().numpy()
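+        # Note: last_hidden_state covers every position, including [CLS], [SEP] and padding tokens,
+        # so these positions also take part in the token matching below.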
+ # Obtain IDF scores
+ idf_scores = self.compute_idf_scores(data[self.columns[0]], data[self.columns[1]], tokenizer)
+
+ scores = []
+ for i, (emb1, emb2) in enumerate(zip(embeddings_first, embeddings_second)):
+ recall, precision = self.calculate_scores(emb1, emb2, idf_scores, i)
+ f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
+ scores.append(f1_score)
+
+ # Return as a DataFrame
+ return pd.DataFrame(
+ {
+ self._feature_name(): pd.Series(scores, index=data.index),
+ }
+ )
+
+    def compute_idf_scores(self, col1: pd.Series, col2: pd.Series, tokenizer) -> Tuple[np.ndarray, np.ndarray]:
+ # Combine reference sentences
+ reference_sentences = pd.concat([col1, col2]).dropna().tolist()
+ M = len(reference_sentences)
+
+ # Compute IDF for each unique token
+ token_counts: Dict[str, int] = defaultdict(int)
+ for sentence in reference_sentences:
+ tokens = [tokenizer.cls_token] + tokenizer.tokenize(sentence) + [tokenizer.sep_token]
+ unique_tokens = set(tokens)
+ for token in unique_tokens:
+ token_counts[token] += 1
+
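+        # IDF of a token that appears in `count` of the M reference sentences: -log(count / M)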
+ idf_scores = {token: -np.log(count / M) for token, count in token_counts.items()}
+
+ # Convert IDF scores to numpy arrays
+ def convert_to_idf_arrays(sentences):
+ idf_arrays = []
+ for sentence in sentences:
+ tokens = tokenizer.tokenize(sentence)
+
+ # Add special tokens
+ tokens = [tokenizer.cls_token] + tokens + [tokenizer.sep_token]
+            # Look up each token's IDF (0 for tokens unseen in the reference sentences)
+            # and add 1 so every token keeps a non-zero weight
+ idf_array = np.array([idf_scores.get(token, 0) + 1 for token in tokens])
+ idf_arrays.append(idf_array)
+ # Pad sequences to the same length
+ max_len = max(len(arr) for arr in idf_arrays)
+ idf_arrays = np.array([np.pad(arr, (0, max_len - len(arr)), "constant") for arr in idf_arrays])
+ return idf_arrays
+
+ idf_arrays1 = convert_to_idf_arrays(col1.fillna("").tolist())
+ idf_arrays2 = convert_to_idf_arrays(col2.fillna("").tolist())
+ return idf_arrays1, idf_arrays2
+
+ def max_similarity(self, embeddings1, embeddings2):
+ # Compute max cosine similarity for each token in embeddings1 with respect to embeddings2
+ similarity_matrix = np.dot(embeddings1, embeddings2.T) / (
+ np.linalg.norm(embeddings1, axis=1, keepdims=True) * np.linalg.norm(embeddings2, axis=1, keepdims=True).T
+ )
+ return similarity_matrix.max(axis=1)
+
+ def calculate_scores(self, emb1, emb2, idf_scores, index):
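+        # Greedy token matching: recall averages each emb1 token's best cosine match in emb2,
+        # precision the reverse; with tfidf_weighted=True these averages are IDF-weighted.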
+ if self.tfidf_weighted:
+ weighted_scores = np.multiply(self.max_similarity(emb1, emb2), idf_scores[0][index])
+ recall = weighted_scores.sum() / idf_scores[0][index].sum()
+
+ weighted_scores = np.multiply(self.max_similarity(emb2, emb1), idf_scores[1][index])
+ precision = weighted_scores.sum() / idf_scores[1][index].sum()
+ else:
+ recall = self.max_similarity(emb1, emb2).mean()
+ precision = self.max_similarity(emb2, emb1).mean()
+ return recall, precision
+
+ def _feature_name(self):
+ return "|".join(self.columns)
+
+ def _as_column(self) -> "ColumnName":
+ return self._create_column(
+ self._feature_name(),
+ default_display_name=f"BERTScore for {' '.join(self.columns)}.",
+ )
diff --git a/src/evidently/features/_registry.py b/src/evidently/features/_registry.py
index e5c9199968..d567380d0a 100644
--- a/src/evidently/features/_registry.py
+++ b/src/evidently/features/_registry.py
@@ -52,6 +52,11 @@
"evidently.features.semantic_similarity_feature.SemanticSimilarityFeature",
"evidently:feature:SemanticSimilarityFeature",
)
+register_type_alias(
+ GeneratedFeatures,
+ "evidently.features.BERTScore_feature.BERTScoreFeature",
+ "evidently:feature:BERTScoreFeature",
+)
register_type_alias(
GeneratedFeatures, "evidently.features.sentence_count_feature.SentenceCount", "evidently:feature:SentenceCount"
)
diff --git a/tests/features/test_bertscore_feature.py b/tests/features/test_bertscore_feature.py
new file mode 100644
index 0000000000..7f37a14aa2
--- /dev/null
+++ b/tests/features/test_bertscore_feature.py
@@ -0,0 +1,40 @@
+import pandas as pd
+import pytest
+
+from evidently.features.BERTScore_feature import BERTScoreFeature
+from evidently.pipeline.column_mapping import ColumnMapping
+from evidently.utils.data_preprocessing import create_data_definition
+
+
+@pytest.mark.parametrize(
+ (
+ "column_1",
+ "column_2",
+ "expected",
+ ), # expected values obtained from the BERTScore library https://github.com/Tiiiger/bert_score
+ [
+ ("The quick brown fox jumps over the lazy dog", "A fast brown fox leaps over a lazy dog", 0.8917),
+ ("Hello world", "Hi universe", 0.7707),
+ ("Machine learning is fascinating", "Artificial intelligence is intriguing", 0.8238),
+ ("I love apples", "I adore oranges", 0.7017),
+ ("Python is a great programming language", "Python is an excellent coding language", 0.8689),
+ ],
+)
+def test_bert_score_feature(column_1: str, column_2: str, expected: float):
+ feature_generator = BERTScoreFeature(columns=["column_1", "column_2"], tfidf_weighted=False)
+ data = pd.DataFrame(dict(column_1=[column_1], column_2=[column_2]))
+
+ result = feature_generator.generate_feature(
+ data=data,
+ data_definition=create_data_definition(None, data, ColumnMapping()),
+ )
+ column_expected = feature_generator._feature_name()
+ assert result[column_expected].iloc[0] == pytest.approx(expected, rel=0.1)