Add results for same topic versus different topic based on the topic IDs provided in the dataset
helena-balabin committed Jul 27, 2023
1 parent 5f8d904 commit 763a44e
Showing 3 changed files with 160 additions and 65 deletions.
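For context, every scorer touched in this diff builds on the pairwise (2v2) accuracy: a pair of held-out paragraphs $(i, j)$ counts as correct when

$$d(\hat{y}_i, y_i) + d(\hat{y}_j, y_j) < d(\hat{y}_i, y_j) + d(\hat{y}_j, y_i),$$

where $d$ is the cosine distance and $\hat{y}_i$, $y_i$ are the predicted and true fMRI vectors for paragraph $i$. The new scoring_variation option restricts the evaluated pairs to those whose passage topic IDs match ("same-topic") or differ ("different-topic").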
2 changes: 2 additions & 0 deletions src/sentencefmricomparison/data/preprocess_pereira.py
@@ -123,6 +123,7 @@ def get_subject_data(
for i in range(0, len(data_pic["keySentences"]), 4)
]
)
fmri["topic_indices"] = np.array([i[0] for i in data_pic["labelsPassageCategory"]])
else:
fmri["sentences"] = np.array(
[
@@ -226,6 +227,7 @@ def convert_to_hf_dataset(
mri_file_names = [f for f in mri_file_names if "passage" not in f]

# Add the permuted sentences to the dataset as well
permuted_file = None
if passage_wise_processing:
permuted_file = pd.read_csv(
os.path.join(processed_mri_dir, "pereira_permuted_passages.csv")
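The new topic_indices field simply flattens the per-passage category labels that ship with the Pereira data. A minimal sketch of what the added line computes, assuming labelsPassageCategory is a column vector with one topic ID per passage (the values below are made up for illustration):

import numpy as np

# Hypothetical stand-in for data_pic["labelsPassageCategory"]:
# one topic ID per passage, stored as an (n_passages, 1) column vector
labels_passage_category = np.array([[1], [1], [2], [3], [3]])

topic_indices = np.array([row[0] for row in labels_passage_category])
print(topic_indices)  # [1 1 2 3 3]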
83 changes: 19 additions & 64 deletions src/sentencefmricomparison/models/neural_encoder.py
@@ -11,83 +11,22 @@
import pandas as pd
import torch
from datasets import load_dataset
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

from sentencefmricomparison.constants import (
PEREIRA_OUTPUT_DIR,
SENT_EMBED_MODEL_LIST_EN,
)
from sentencefmricomparison.models.sentence_embedding_base import SentenceEmbeddingModel
from sentencefmricomparison.utils import cross_val_score_with_topic_ids, pairwise_accuracy, pearson_scoring

# Initialize logger
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def pairwise_accuracy(
estimator: BaseEstimator = None,
X: torch.Tensor = None, # noqa
y: torch.Tensor = None,
) -> float:
"""Calculate the average pairwise accuracy of all pairs of true and predicted vectors.
Based on the pairwise accuracy as defined in Oota et al. (2022), Sun et al. (2021), and Pereira et al. (2018).
:param estimator: Estimator object (e.g., a Ridge regression)
:type estimator: BaseEstimator
:param X: Sentence embeddings used as a basis to predict MRI vectors with the estimator
:type X: torch.Tensor
:param y: True MRI vectors
:type y: torch.Tensor
:return: Average pairwise accuracy from all possible sentence pairs
:rtype: float
"""
pred = estimator.predict(X)

# For every possible sentence pair, check whether the summed cosine distance between the correctly matched
# predicted and true vectors is smaller than the summed distance between the mismatched (swapped) pairings
res = [
cosine(pred[i], y[i]) + cosine(pred[j], y[j]) < cosine(pred[i], y[j]) + cosine(pred[j], y[i])
for i in range(len(X))
for j in range(i + 1, len(X))
]

# Return the fraction of instances for which the condition holds versus all possible pairs
return sum(res) / len(res)


def pearson_scoring(
estimator: BaseEstimator = None,
X: torch.Tensor = None, # noqa
y: torch.Tensor = None,
) -> float:
"""Calculate the average pearson correlation for the given set of true and predicted MRI vectors.
:param estimator: Estimator object (e.g., a Ridge regression)
:type estimator: BaseEstimator
:param X: Sentence embeddings used as a basis to predict MRI vectors with the estimator
:type X: torch.Tensor
:param y: True MRI vectors
:type y: torch.Tensor
:return: Average Pearson correlation from all pairs of predicted and true MRI vectors
:rtype: float
"""
pred = estimator.predict(X)

# Compute the Pearson correlation between each true MRI vector and its prediction
res = [pearsonr(t, p).statistic for t, p in zip(y, pred)]

# Return the average correlation across all true/predicted vector pairs
return np.mean(res)


def calculate_brain_scores_cv(
dataset_hf_name: str = "helena-balabin/pereira_fMRI_passages",
sent_embed_models: List[str] = SENT_EMBED_MODEL_LIST_EN,
@@ -101,6 +40,7 @@ def calculate_brain_scores_cv(
mapping_params: Union[Dict[str, Union[str, int, float]], None] = None, # noqa
cv: int = 5,
scoring: str = "pairwise_accuracy",
scoring_variation: Union[str, None] = None,
write_output: bool = True,
output_dir: str = PEREIRA_OUTPUT_DIR,
) -> Union[pd.DataFrame, float]:
@@ -134,6 +74,9 @@ def calculate_brain_scores_cv(
:param scoring: Scoring function used to evaluate the performance of the neural encoding approach, defaults to
"pairwise_accuracy"
:type scoring: str
:param scoring_variation: Variation of the scoring function, e.g., "same-topic" or "different-topic" for
"pairwise_accuracy", defaults to None
:type scoring_variation: str
:param write_output: Whether to write the output to a file, defaults to True
:type write_output: bool
:param output_dir: Output directory to save the brain scores to
@@ -203,12 +146,14 @@ def calculate_brain_scores_cv(
for roi_name, roi_features in brain_rois.items():
# Average across cross-validated results
brain_score = np.mean(
cross_val_score(
cross_val_score_with_topic_ids(
mapping_model,
sents_encoded.to("cpu").numpy(),
roi_features.to("cpu").numpy(),
topic_ids=torch.tensor(dataset["train"][i]["topic_indices"]),
cv=cv,
scoring=scoring_func or scoring,
scoring_variation=scoring_variation,
)
)
roi_results[roi_name] = [brain_score]
@@ -226,11 +171,14 @@ def calculate_brain_scores_cv(
else:
brain_voxels = torch.tensor(subj["all"])
brain_score = np.mean(
cross_val_score(
cross_val_score_with_topic_ids(
mapping_model,
sents_encoded.to("cpu").numpy(),
brain_voxels.to("cpu").numpy(),
topic_ids=torch.tensor(dataset["train"][i]["topic_indices"]),
cv=cv,
scoring=scoring_func or scoring,
scoring_variation=scoring_variation,
)
)
subj_results.append(pd.DataFrame({"all": [brain_score]}))
@@ -247,6 +195,8 @@ def calculate_brain_scores_cv(
results["mean"] = results.mean(axis=1)

# 7. Save the averaged results
if scoring_variation:
scoring = f"{scoring}_{scoring_variation}"
if write_output:
if only_middle:
output_file = f"pereira_neural_enc_{dataset_hf_name.split('_')[-1]}_{scoring}_{mapping}_middle.csv"
@@ -276,6 +226,7 @@ def calculate_brain_scores_cv(
@click.option("--mapping", type=str, default="ridge")
@click.option("--cv", type=int, default=5)
@click.option("--scoring", type=str, default="pairwise_accuracy")
@click.option("--scoring-variation", type=str, default=None)
@click.option("--output-dir", type=str, default=PEREIRA_OUTPUT_DIR)
def calculate_brain_scores_cv_wrapper(
dataset_hf_name: str = "helena-balabin/pereira_fMRI_passages",
@@ -288,6 +239,7 @@ def calculate_brain_scores_cv_wrapper(
mapping: str = "ridge",
cv: int = 5,
scoring: str = "pairwise_accuracy",
scoring_variation: Union[str, None] = None,
output_dir: str = PEREIRA_OUTPUT_DIR,
):
"""Use calculate_brain_scores_cv (wrapper function).
@@ -317,6 +269,8 @@ def calculate_brain_scores_cv_wrapper(
:param scoring: Scoring function used to evaluate the performance of the neural encoding approach, defaults to
"pairwise_accuracy"
:type scoring: str
:param scoring_variation: Optional variation of the scoring function, defaults to None
:type scoring_variation: str
:param output_dir: Output directory to save the brain scores to
:type output_dir: str
"""
@@ -331,6 +285,7 @@ def calculate_brain_scores_cv_wrapper(
region_based=region_based,
cv=cv,
scoring=scoring,
scoring_variation=scoring_variation,
write_output=True,
output_dir=output_dir,
)
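With scoring_variation threaded through both the CLI option and the library function, a comparison of the two pair subsets could look like the sketch below. It assumes the elided parameters of calculate_brain_scores_cv keep their defaults; per the renaming logic above, each run writes its results to a file whose name carries the pairwise_accuracy_same-topic or pairwise_accuracy_different-topic suffix.

from sentencefmricomparison.models.neural_encoder import calculate_brain_scores_cv

# Hypothetical comparison run: one output file per scoring variation
for variation in ["same-topic", "different-topic"]:
    calculate_brain_scores_cv(
        dataset_hf_name="helena-balabin/pereira_fMRI_passages",
        scoring="pairwise_accuracy",
        scoring_variation=variation,
        write_output=True,
    )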
140 changes: 139 additions & 1 deletion src/sentencefmricomparison/utils.py
@@ -2,13 +2,16 @@

from dataclasses import make_dataclass
from itertools import combinations, permutations, product
from typing import Iterable, List, Optional, Union
from typing import Callable, Iterable, List, Optional, Union

import numpy as np
import torch
from scipy._lib._util import check_random_state # noqa
from scipy.spatial.distance import cosine
from scipy.special import comb, factorial
from scipy.stats import kendalltau, pearsonr, spearmanr
from sklearn.base import BaseEstimator, clone
from sklearn.model_selection import KFold
from torchmetrics.functional.pairwise import (
pairwise_cosine_similarity,
pairwise_euclidean_distance,
@@ -566,3 +569,138 @@ def two_sided(null_distribution, observed): # noqa
p_values = np.clip(p_values, 0, 1)

return PermutationTestResult(observed, p_values, null_distribution)


def cross_val_score_with_topic_ids(
estimator,
X, # noqa
y,
topic_ids: Optional[torch.Tensor] = None,
cv: int = 5,
scoring: Optional[Callable] = None,
scoring_variation: Optional[str] = None,
):
"""Perform cross-validation with an additional feature 'topic_ids'.
:param estimator: Estimator object (e.g., a Ridge regression)
:type estimator: BaseEstimator
:param X: Sentence embeddings used as a basis to predict MRI vectors with the estimator
:type X: torch.Tensor
:param y: True MRI vectors
:type y: torch.Tensor
:param topic_ids: Topic IDs for each paragraph, defaults to None
:type topic_ids: torch.Tensor, optional
:param cv: Number of cross-validation folds, defaults to 5
:type cv: int, optional
:param scoring: Scoring function, defaults to None
:type scoring: Callable, optional
:param scoring_variation: Variation of the scoring function, defaults to None
:type scoring_variation: str, optional
:return: Cross-validation scores
:rtype: np.ndarray
"""
kf = KFold(n_splits=cv, shuffle=False)

scores = []

for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index] # noqa
y_train, y_test = y[train_index], y[test_index]
topic_ids_test = topic_ids[test_index] if topic_ids is not None else None

estimator_clone = clone(estimator)
estimator_clone.fit(X_train, y_train) # noqa

if callable(scoring):
score = scoring(
estimator_clone,
X_test,
y_test,
topic_ids=topic_ids_test,
scoring_variation=scoring_variation,
)
else:
score = estimator_clone.score(X_test, y_test) # noqa

scores.append(score)

return np.array(scores)


def pairwise_accuracy(
estimator: BaseEstimator = None,
X: torch.Tensor = None, # noqa
y: torch.Tensor = None,
topic_ids: Optional[torch.Tensor] = None,
scoring_variation: Optional[str] = None,
) -> float:
"""Calculate the average pairwise accuracy of all pairs of true and predicted vectors.
Based on the pairwise accuracy as defined in Oota et al. (2022), Sun et al. (2021), and Pereira et al. (2018).
:param estimator: Estimator object (e.g., a Ridge regression)
:type estimator: BaseEstimator
:param X: Sentence embeddings used as a basis to predict MRI vectors with the estimator
:type X: torch.Tensor
:param y: True MRI vectors
:type y: torch.Tensor
:param topic_ids: Topic IDs for each paragraph
:type topic_ids: torch.Tensor
:param scoring_variation: Variation of the scoring function, defaults to None
:type scoring_variation: str, optional
:return: Average pairwise accuracy from all possible sentence pairs
:rtype: float
"""
pred = estimator.predict(X) # noqa

if scoring_variation == "same-topic":
# Calculate pairwise accuracy for same-topic sentences
res = [
cosine(pred[i], y[i]) + cosine(pred[j], y[j]) < cosine(pred[i], y[j]) + cosine(pred[j], y[i])
for i in range(len(X))
for j in range(i + 1, len(X)) if topic_ids[i] == topic_ids[j]
]
elif scoring_variation == "different-topic":
# Calculate pairwise accuracy for different-topic sentences
res = [
cosine(pred[i], y[i]) + cosine(pred[j], y[j]) < cosine(pred[i], y[j]) + cosine(pred[j], y[i])
for i in range(len(X))
for j in range(i + 1, len(X)) if topic_ids[i] != topic_ids[j]
]
else:
# For every possible sentence pair, check whether the summed cosine distance between the correctly matched
# predicted and true vectors is smaller than the summed distance between the mismatched (swapped) pairings
res = [
cosine(pred[i], y[i]) + cosine(pred[j], y[j]) < cosine(pred[i], y[j]) + cosine(pred[j], y[i])
for i in range(len(X))
for j in range(i + 1, len(X))
]

# Return the fraction of evaluated pairs for which the condition holds; guard against folds
# in which no pair matches the requested variation (avoids a ZeroDivisionError)
return sum(res) / len(res) if res else float("nan")


def pearson_scoring(
estimator: BaseEstimator = None,
X: torch.Tensor = None, # noqa
y: torch.Tensor = None,
topic_ids: Optional[torch.Tensor] = None, # accepted for scorer API compatibility, unused
scoring_variation: Optional[str] = None, # accepted for scorer API compatibility, unused
) -> float:
"""Calculate the average pearson correlation for the given set of true and predicted MRI vectors.
:param estimator: Estimator object (e.g., a Ridge regression)
:type estimator: BaseEstimator
:param X: Sentence embeddings used as a basis to predict MRI vectors with the estimator
:type X: torch.Tensor
:param y: True MRI vectors
:type y: torch.Tensor
:return: Average Pearson correlation from all pairs of predicted and true MRI vectors
:rtype: float
"""
pred = estimator.predict(X) # noqa

# Compute the Pearson correlation between each true MRI vector and its prediction
res = [pearsonr(t, p).statistic for t, p in zip(y, pred)]

# Return the average correlation across all true/predicted vector pairs
return np.mean(res) # noqa

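As a self-contained smoke test of the new utilities, here is a sketch with synthetic data. The import path matches this repository, but the shapes (10 paragraphs, 8 embedding dimensions, 20 voxels), the topic layout, and the Ridge settings are arbitrary illustrative choices; with shuffle=False and two paragraphs per topic, every two-sample fold contains at least one same-topic pair.

import numpy as np
import torch
from sklearn.linear_model import Ridge

from sentencefmricomparison.utils import cross_val_score_with_topic_ids, pairwise_accuracy

rng = np.random.default_rng(0)

# Synthetic stand-ins: 10 paragraphs, 8-dim sentence embeddings, 20 voxels
X = rng.normal(size=(10, 8))
y = rng.normal(size=(10, 20))
topic_ids = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3, 4, 4])

# Cross-validate a ridge mapping, scoring only same-topic test pairs
scores = cross_val_score_with_topic_ids(
    Ridge(alpha=1.0),
    X,
    y,
    topic_ids=topic_ids,
    cv=5,
    scoring=pairwise_accuracy,
    scoring_variation="same-topic",
)
print(scores.mean())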