Add results for same topic versus different topic based on the topic IDs provided in the dataset
helena-balabin committed Jul 27, 2023
1 parent 5f8d904 commit 763a44e
Showing 3 changed files with 160 additions and 65 deletions.
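For context, every scorer touched in this diff builds on the pairwise (2v2) accuracy: a pair of held-out paragraphs $(i, j)$ counts as correct when

$$d(\hat{y}_i, y_i) + d(\hat{y}_j, y_j) < d(\hat{y}_i, y_j) + d(\hat{y}_j, y_i),$$

where $d$ is the cosine distance and $\hat{y}_i$, $y_i$ are the predicted and true fMRI vectors for paragraph $i$. The new scoring_variation option restricts the evaluated pairs to those whose passage topic IDs match ("same-topic") or differ ("different-topic").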
2 changes: 2 additions & 0 deletions src/sentencefmricomparison/data/preprocess_pereira.py
@@ -123,6 +123,7 @@ def get_subject_data(
for i in range(0, len(data_pic["keySentences"]), 4)
]
)
fmri["topic_indices"] = np.array([i[0] for i in data_pic["labelsPassageCategory"]])
else:
fmri["sentences"] = np.array(
[
@@ -226,6 +227,7 @@ def convert_to_hf_dataset(
mri_file_names = [f for f in mri_file_names if "passage" not in f]

# Add the permuted sentences to the dataset as well
permuted_file = None
if passage_wise_processing:
permuted_file = pd.read_csv(
os.path.join(processed_mri_dir, "pereira_permuted_passages.csv")
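The new topic_indices field simply flattens the per-passage category labels that ship with the Pereira data. A minimal sketch of what the added line computes, assuming labelsPassageCategory is a column vector with one topic ID per passage (the values below are made up for illustration):

import numpy as np

# Hypothetical stand-in for data_pic["labelsPassageCategory"]:
# one topic ID per passage, stored as an (n_passages, 1) column vector
labels_passage_category = np.array([[1], [1], [2], [3], [3]])

topic_indices = np.array([row[0] for row in labels_passage_category])
print(topic_indices)  # [1 1 2 3 3]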
83 changes: 19 additions & 64 deletions src/sentencefmricomparison/models/neural_encoder.py
@@ -11,83 +11,22 @@
import pandas as pd
import torch
from datasets import load_dataset
from scipy.spatial.distance import cosine
from scipy.stats import pearsonr
from sklearn.base import BaseEstimator
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import cross_val_score
from tqdm import tqdm

from sentencefmricomparison.constants import (
PEREIRA_OUTPUT_DIR,
SENT_EMBED_MODEL_LIST_EN,
)
from sentencefmricomparison.models.sentence_embedding_base import SentenceEmbeddingModel
from sentencefmricomparison.utils import cross_val_score_with_topic_ids, pairwise_accuracy, pearson_scoring

# Initialize logger
logging.basicConfig()
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def pairwise_accuracy(
estimator: BaseEstimator = None,
X: torch.Tensor = None, # noqa
y: torch.Tensor = None,
) -> float:
"""Calculate the average pairwise accuracy of all pairs of true and predicted vectors.
Based on the pairwise accuracy as defined in Oota et al. (2022), Sun et al. (2021), and Pereira et al. (2018).
:param estimator: Estimator object (e.g., a Ridge regression)
:type estimator: BaseEstimator
:param X: Sentence embeddings used as a basis to predict MRI vectors with the estimator
:type X: torch.Tensor
:param y: True MRI vectors
:type y: torch.Tensor
:return: Average pairwise accuracy from all possible sentence pairs
:rtype: float
"""
pred = estimator.predict(X)

# For every possible sentence pair, check whether the summed cosine distance between the correctly matched
# predicted and true vectors is smaller than the summed distance between the mismatched (swapped) pairings
res = [
cosine(pred[i], y[i]) + cosine(pred[j], y[j]) < cosine(pred[i], y[j]) + cosine(pred[j], y[i])
for i in range(len(X))
for j in range(i + 1, len(X))
]

# Return the fraction of instances for which the condition holds versus all possible pairs
return sum(res) / len(res)


def pearson_scoring(
estimator: BaseEstimator = None,
X: torch.Tensor = None, # noqa
y: torch.Tensor = None,
) -> float:
"""Calculate the average pearson correlation for the given set of true and predicted MRI vectors.
:param estimator: Estimator object (e.g., a Ridge regression)
:type estimator: BaseEstimator
:param X: Sentence embeddings used as a basis to predict MRI vectors with the estimator
:type X: torch.Tensor
:param y: True MRI vectors
:type y: torch.Tensor
:return: Average Pearson correlation from all pairs of predicted and true MRI vectors
:rtype: float
"""
pred = estimator.predict(X)

# Compute the Pearson correlation between each true MRI vector and its prediction
res = [pearsonr(t, p).statistic for t, p in zip(y, pred)]

# Return the average correlation across all true/predicted vector pairs
return np.mean(res)


def calculate_brain_scores_cv(
dataset_hf_name: str = "helena-balabin/pereira_fMRI_passages",
sent_embed_models: List[str] = SENT_EMBED_MODEL_LIST_EN,
@@ -101,6 +40,7 @@ def calculate_brain_scores_cv(
mapping_params: Union[Dict[str, Union[str, int, float]], None] = None, # noqa
cv: int = 5,
scoring: str = "pairwise_accuracy",
scoring_variation: Union[str, None] = None,
write_output: bool = True,
output_dir: str = PEREIRA_OUTPUT_DIR,
) -> Union[pd.DataFrame, float]:
@@ -134,6 +74,9 @@ def calculate_brain_scores_cv(
:param scoring: Scoring function used to evaluate the performance of the neural encoding approach, defaults to
"pairwise_accuracy"
:type scoring: str
:param scoring_variation: Variation of the scoring function, e.g., "same-topic" or "different-topic" for
"pairwise_accuracy", defaults to None
:type scoring_variation: str
:param write_output: Whether to write the output to a file, defaults to True
:type write_output: bool
:param output_dir: Output directory to save the brain scores to
@@ -203,12 +146,14 @@ def calculate_brain_scores_cv(
for roi_name, roi_features in brain_rois.items():
# Average across cross-validated results
brain_score = np.mean(
cross_val_score(
cross_val_score_with_topic_ids(
mapping_model,
sents_encoded.to("cpu").numpy(),
roi_features.to("cpu").numpy(),
topic_ids=torch.tensor(dataset["train"][i]["topic_indices"]),
cv=cv,
scoring=scoring_func or scoring,
scoring_variation=scoring_variation,
)
)
roi_results[roi_name] = [brain_score]
@@ -226,11 +171,14 @@ def calculate_brain_scores_cv(
else:
brain_voxels = torch.tensor(subj["all"])
brain_score = np.mean(
cross_val_score(
cross_val_score_with_topic_ids(
mapping_model,
sents_encoded.to("cpu").numpy(),
brain_voxels.to("cpu").numpy(),
topic_ids=torch.tensor(dataset["train"][i]["topic_indices"]),
cv=cv,
scoring=scoring_func or scoring,
scoring_variation=scoring_variation,
)
)
subj_results.append(pd.DataFrame({"all": [brain_score]}))
@@ -247,6 +195,8 @@ def calculate_brain_scores_cv(
results["mean"] = results.mean(axis=1)

# 7. Save the averaged results
if scoring_variation:
scoring = f"{scoring}_{scoring_variation}"
if write_output:
if only_middle:
output_file = f"pereira_neural_enc_{dataset_hf_name.split('_')[-1]}_{scoring}_{mapping}_middle.csv"
@@ -276,6 +226,7 @@ def calculate_brain_scores_cv(
@click.option("--mapping", type=str, default="ridge")
@click.option("--cv", type=int, default=5)
@click.option("--scoring", type=str, default="pairwise_accuracy")
@click.option("--scoring-variation", type=str, default=None)
@click.option("--output-dir", type=str, default=PEREIRA_OUTPUT_DIR)
def calculate_brain_scores_cv_wrapper(
dataset_hf_name: str = "helena-balabin/pereira_fMRI_passages",
@@ -288,6 +239,7 @@ def calculate_brain_scores_cv_wrapper(
mapping: str = "ridge",
cv: int = 5,
scoring: str = "pairwise_accuracy",
scoring_variation: Union[str, None] = None,
output_dir: str = PEREIRA_OUTPUT_DIR,
):
"""Use calculate_brain_scores_cv (wrapper function).
@@ -317,6 +269,8 @@ def calculate_brain_scores_cv_wrapper(
:param scoring: Scoring function used to evaluate the performance of the neural encoding approach, defaults to
"pairwise_accuracy"
:type scoring: str
:param scoring_variation: Optional variation of the scoring function, defaults to None
:type scoring_variation: str
:param output_dir: Output directory to save the brain scores to
:type output_dir: str
"""
@@ -331,6 +285,7 @@ def calculate_brain_scores_cv_wrapper(
region_based=region_based,
cv=cv,
scoring=scoring,
scoring_variation=scoring_variation,
write_output=True,
output_dir=output_dir,
)
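With scoring_variation threaded through both the CLI option and the library function, a comparison of the two pair subsets could look like the sketch below. It assumes the elided parameters of calculate_brain_scores_cv keep their defaults; per the renaming logic above, each run writes its results to a file whose name carries the pairwise_accuracy_same-topic or pairwise_accuracy_different-topic suffix.

from sentencefmricomparison.models.neural_encoder import calculate_brain_scores_cv

# Hypothetical comparison run: one output file per scoring variation
for variation in ["same-topic", "different-topic"]:
    calculate_brain_scores_cv(
        dataset_hf_name="helena-balabin/pereira_fMRI_passages",
        scoring="pairwise_accuracy",
        scoring_variation=variation,
        write_output=True,
    )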
140 changes: 139 additions & 1 deletion src/sentencefmricomparison/utils.py
@@ -2,13 +2,16 @@

from dataclasses import make_dataclass
from itertools import combinations, permutations, product
from typing import Iterable, List, Optional, Union
from typing import Callable, Iterable, List, Optional, Union

import numpy as np
import torch
from scipy._lib._util import check_random_state # noqa
from scipy.spatial.distance import cosine
from scipy.special import comb, factorial
from scipy.stats import kendalltau, pearsonr, spearmanr
from sklearn.base import BaseEstimator, clone
from sklearn.model_selection import KFold
from torchmetrics.functional.pairwise import (
pairwise_cosine_similarity,
pairwise_euclidean_distance,
@@ -566,3 +569,138 @@ def two_sided(null_distribution, observed): # noqa
p_values = np.clip(p_values, 0, 1)

return PermutationTestResult(observed, p_values, null_distribution)


def cross_val_score_with_topic_ids(
estimator,
X, # noqa
y,
topic_ids: Optional[torch.Tensor] = None,
cv: int = 5,
scoring: Optional[Callable] = None,
scoring_variation: Optional[str] = None,
):
"""Perform cross-validation with an additional feature 'topic_ids'.
:param estimator: Estimator object (e.g., a Ridge regression)
:type estimator: BaseEstimator
:param X: Sentence embeddings used as a basis to predict MRI vectors with the estimator
:type X: torch.Tensor
:param y: True MRI vectors
:type y: torch.Tensor
:param topic_ids: Topic IDs for each paragraph, defaults to None
:type topic_ids: torch.Tensor, optional
:param cv: Number of cross-validation folds, defaults to 5
:type cv: int, optional
:param scoring: Scoring function, defaults to None
:type scoring: Callable, optional
:param scoring_variation: Variation of the scoring function, defaults to None
:type scoring_variation: str, optional
:return: Cross-validation scores
:rtype: np.ndarray
"""
kf = KFold(n_splits=cv, shuffle=False)

scores = []

for train_index, test_index in kf.split(X):
X_train, X_test = X[train_index], X[test_index] # noqa
y_train, y_test = y[train_index], y[test_index]
topic_ids_test = topic_ids[test_index] if topic_ids is not None else None

estimator_clone = clone(estimator)
estimator_clone.fit(X_train, y_train) # noqa

if callable(scoring):
score = scoring(
estimator_clone,
X_test,
y_test,
topic_ids=topic_ids_test,
scoring_variation=scoring_variation,
)
else:
score = estimator_clone.score(X_test, y_test) # noqa

scores.append(score)

return np.array(scores)


def pairwise_accuracy(
estimator: BaseEstimator = None,
X: torch.Tensor = None, # noqa
y: torch.Tensor = None,
topic_ids: Optional[torch.Tensor] = None,
scoring_variation: Optional[str] = None,
) -> float:
"""Calculate the average pairwise accuracy of all pairs of true and predicted vectors.
Based on the pairwise accuracy as defined in Oota et al. (2022), Sun et al. (2021), and Pereira et al. (2018).
:param estimator: Estimator object (e.g., a Ridge regression)
:type estimator: BaseEstimator
:param X: Sentence embeddings used as a basis to predict MRI vectors with the estimator
:type X: torch.Tensor
:param y: True MRI vectors
:type y: torch.Tensor
:param topic_ids: Topic IDs for each paragraph
:type topic_ids: torch.Tensor
:param scoring_variation: Variation of the scoring function, defaults to None
:type scoring_variation: str, optional
:return: Average pairwise accuracy from all possible sentence pairs
:rtype: float
"""
pred = estimator.predict(X) # noqa

if scoring_variation == "same-topic":
# Calculate pairwise accuracy for same-topic sentences
res = [
cosine(pred[i], y[i]) + cosine(pred[j], y[j]) < cosine(pred[i], y[j]) + cosine(pred[j], y[i])
for i in range(len(X))
for j in range(i + 1, len(X)) if topic_ids[i] == topic_ids[j]
]
elif scoring_variation == "different-topic":
# Calculate pairwise accuracy for different-topic sentences
res = [
cosine(pred[i], y[i]) + cosine(pred[j], y[j]) < cosine(pred[i], y[j]) + cosine(pred[j], y[i])
for i in range(len(X))
for j in range(i + 1, len(X)) if topic_ids[i] != topic_ids[j]
]
else:
# For every possible sentence pair, check whether the summed cosine distance between the correctly matched
# predicted and true vectors is smaller than the summed distance between the mismatched (swapped) pairings
res = [
cosine(pred[i], y[i]) + cosine(pred[j], y[j]) < cosine(pred[i], y[j]) + cosine(pred[j], y[i])
for i in range(len(X))
for j in range(i + 1, len(X))
]

# Return the fraction of evaluated pairs for which the condition holds; guard against folds
# in which no pair matches the requested variation (avoids a ZeroDivisionError)
return sum(res) / len(res) if res else float("nan")


def pearson_scoring(
estimator: BaseEstimator = None,
X: torch.Tensor = None, # noqa
y: torch.Tensor = None,
topic_ids: Optional[torch.Tensor] = None, # accepted for scorer API compatibility, unused
scoring_variation: Optional[str] = None, # accepted for scorer API compatibility, unused
) -> float:
"""Calculate the average pearson correlation for the given set of true and predicted MRI vectors.
:param estimator: Estimator object (e.g., a Ridge regression)
:type estimator: BaseEstimator
:param X: Sentence embeddings used as a basis to predict MRI vectors with the estimator
:type X: torch.Tensor
:param y: True MRI vectors
:type y: torch.Tensor
:return: Average Pearson correlation from all pairs of predicted and true MRI vectors
:rtype: float
"""
pred = estimator.predict(X) # noqa

# Compute the Pearson correlation between each true MRI vector and its prediction
res = [pearsonr(t, p).statistic for t, p in zip(y, pred)]

# Return the average correlation across all true/predicted vector pairs
return np.mean(res) # noqa

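As a self-contained smoke test of the new utilities, here is a sketch with synthetic data. The import path matches this repository, but the shapes (10 paragraphs, 8 embedding dimensions, 20 voxels), the topic layout, and the Ridge settings are arbitrary illustrative choices; with shuffle=False and two paragraphs per topic, every two-sample fold contains at least one same-topic pair.

import numpy as np
import torch
from sklearn.linear_model import Ridge

from sentencefmricomparison.utils import cross_val_score_with_topic_ids, pairwise_accuracy

rng = np.random.default_rng(0)

# Synthetic stand-ins: 10 paragraphs, 8-dim sentence embeddings, 20 voxels
X = rng.normal(size=(10, 8))
y = rng.normal(size=(10, 20))
topic_ids = torch.tensor([0, 0, 1, 1, 2, 2, 3, 3, 4, 4])

# Cross-validate a ridge mapping, scoring only same-topic test pairs
scores = cross_val_score_with_topic_ids(
    Ridge(alpha=1.0),
    X,
    y,
    topic_ids=topic_ids,
    cv=5,
    scoring=pairwise_accuracy,
    scoring_variation="same-topic",
)
print(scores.mean())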