Add surrogate data generation

tsbinns · Aug 2, 2024 · e4f40cc · e4f40cc
1 parent 00c381a
commit e4f40cc
Show file tree

Hide file tree

Showing 7 changed files with 472 additions and 171 deletions.
diff --git a/doc/api.rst b/doc/api.rst
@@ -111,4 +111,5 @@ Dataset functions
 .. autosummary::
    :toctree: generated/
 
-   make_signals_in_freq_bands
+   make_signals_in_freq_bands
+   make_surrogate_data
diff --git a/doc/references.bib b/doc/references.bib
@@ -185,6 +185,17 @@ @book{OppenheimEtAl1999
  year = {1999}
 }
 
+@article{PellegriniEtAl2023,
+  title={Identifying good practices for detecting inter-regional linear functional connectivity from {EEG}},
+  author={Pellegrini, Franziska and Delorme, Arnaud and Nikulin, Vadim and Haufe, Stefan},
+  doi={10.1016/j.neuroimage.2023.120218},
+  journal={NeuroImage},
+  volume={277},
+  pages={120218},
+  year={2023},
+  publisher={Elsevier}
+}
+
 @book{SekiharaNagarajan2008,
  author = {Sekihara, Kensuke and Nagarajan, Srikantan S.},
  doi = {10.1007/978-3-540-79370-0},

diff --git a/mne_connectivity/__init__.py b/mne_connectivity/__init__.py
@@ -23,7 +23,7 @@
     SpectroTemporalConnectivity,
     TemporalConnectivity,
 )
-from .datasets import make_signals_in_freq_bands
+from .datasets import make_signals_in_freq_bands, make_surrogate_data
 from .decoding import CoherencyDecomposition
 from .effective import phase_slope_index
 from .envelope import envelope_correlation, symmetric_orth

diff --git a/mne_connectivity/datasets/__init__.py b/mne_connectivity/datasets/__init__.py
@@ -1 +1,2 @@
 from .frequency import make_signals_in_freq_bands
+from .surrogate import make_surrogate_data
diff --git a/mne_connectivity/datasets/surrogate.py b/mne_connectivity/datasets/surrogate.py
@@ -0,0 +1,141 @@
+# Authors: Thomas S. Binns <[email protected]>
+#
+# License: BSD (3-clause)
+
+import numpy as np
+from mne.time_frequency import EpochsSpectrum, EpochsSpectrumArray
+from mne.utils import _validate_type
+
+
+def make_surrogate_data(data, n_shuffles=1000, rng_seed=None, return_generator=True):
+    """Create surrogate data for a null hypothesis of connectivity.
+
+    Parameters
+    ----------
+    data : ~mne.time_frequency.EpochsSpectrum | ~mne.time_frequency.EpochsSpectrumArray
+        The Fourier coefficients to create the null hypothesis surrogate data for. Can
+        be generated from :meth:`mne.Epochs.compute_psd` with ``output='complex'``
+        (requires ``mne >= 1.8``).
+    n_shuffles : int (default 1000)
+        The number of surrogate datasets to create.
+    rng_seed : int | None (default None)
+        The seed to use for the random number generator. If `None`, no seed is
+        specified.
+    return_generator : bool (default True)
+        Whether or not to return the surrogate data as a :term:`generator` object
+        instead of a :class:`list`. This allows iterating over the surrogates without
+        having to keep them all in memory.
+
+    Returns
+    -------
+    surrogate_data : list of ~mne.time_frequency.EpochsSpectrum
+        The surrogate data for the null hypothesis with ``n_shuffles`` entries. Returned
+        as a :term:`generator` if ``return_generator=True``.
+
+    Notes
+    -----
+    Surrogate data is generated by randomly shuffling the order of epochs, independently
+    for each channel. This destroys the covariance of the data, such that connectivity
+    estimates should reflect the null hypothesis of no genuine connectivity between
+    signals (i.e., only interactions due to background noise)
+    :footcite:`PellegriniEtAl2023`.
+
+    For the surrogate data to properly reflect a null hypothesis, the data which is
+    shuffled **must not** have a temporal structure that is consistent across epochs.
+    Examples of this data include evoked potentials, where a stimulus is presented or an
+    action performed at a set time during each epoch. Such data should not be used for
+    generating surrogates, as even after shuffling the epochs, it will still show a high
+    degree of residual connectivity between channels. As a result, connectivity
+    estimates from your surrogate data will capture genuine interactions, instead of the
+    desired background noise. Treating these estimates as a null hypothesis will
+    increase the likelihood of a type II (false negative) error, i.e., that there is no
+    significant connectivity in your data.
+
+    Appropriate data for generating surrogates includes data from a resting state,
+    inter-trial period, or similar. Here, a strong temporal consistency across epochs is
+    not assumed, reducing the chances that connectivity information of interest is
+    captured in your surrogate connectivity estimates.
+
+    In situations where you want to assess whether evoked data has significant
+    connectivity, you can generate your surrogate connectivity estimates from non-evoked
+    data (e.g., rest data, inter-trial data) and compare this to your true connectivity
+    estimates from the evoked data.
+
+    Regardless of whether you are working with evoked or non-evoked data, **you should
+    always compare true and surrogate connectivity estimates from epochs of the same
+    duration**. This will ensure that spectral information is captured with the same
+    accuracy in both sets of connectivity estimates. Ideally, **you should also compare
+    true and surrogate connectivity estimates from the same number of epochs** to avoid
+    a noise bias for one set of estimates (fewer epochs gives noisier connectivity
+    estimates).
+
+    .. versionadded:: 0.8
+
+    References
+    ----------
+    .. footbibliography::
+    """
+    # Validate inputs
+    _validate_type(
+        data,
+        (EpochsSpectrum, EpochsSpectrumArray),
+        "data",
+        "mne.time_frequency.EpochsSpectrum or mne.time_frequency.EpochsSpectrumArray",
+    )
+    if not np.iscomplexobj(data.get_data()):
+        raise TypeError("values in `data` must be complex-valued")
+    n_epochs, n_chans = data.get_data().shape[:2]
+    if n_epochs == 1:
+        raise ValueError("data must contain more than one epoch for shuffling")
+    if n_chans == 1:
+        raise ValueError("data must contain more than one channel for shuffling")
+
+    _validate_type(n_shuffles, "int-like", "n_shuffles", "int")
+    if n_shuffles < 1:
+        raise ValueError("number of shuffles must be >= 1")
+
+    _validate_type(return_generator, bool, "return_generator", "bool")
+    # rng_seed checked by NumPy later
+
+    # Make surrogate data and package into EpochsSpectrum objects
+    surrogate_data = _shuffle_coefficients(data, n_shuffles, rng_seed)
+    if not return_generator:
+        surrogate_data = [shuffle for shuffle in surrogate_data]
+
+    return surrogate_data
+
+
+def _shuffle_coefficients(data, n_shuffles, rng_seed):
+    """Shuffle coefficients over epochs to create surrogate data.
+
+    Surrogate data for each shuffle packaged into an EpochsSpectrum object, which are
+    together returned as a generator to minimise memory demand.
+    """
+    # Extract data array and EpochsSpectrum information
+    data_arr = data.get_data()
+    state = data.__getstate__()
+    defaults = dict(
+        method=None,
+        fmin=None,
+        fmax=None,
+        tmin=None,
+        tmax=None,
+        picks=None,
+        exclude=(),
+        proj=None,
+        remove_dc=None,
+        n_jobs=None,
+        verbose=None,
+    )
+
+    # Make surrogate data
+    rng = np.random.default_rng(rng_seed)
+    for _ in range(n_shuffles):
+        # Shuffle epochs for each channel independently
+        surrogate_arr = np.zeros_like(data_arr, dtype=data_arr.dtype)
+        for chan_i in range(data_arr.shape[1]):
+            surrogate_arr[:, chan_i] = rng.permutation(data_arr[:, chan_i], axis=0)
+
+        # Package surrogate data for this shuffle
+        state["data"] = surrogate_arr
+        yield EpochsSpectrum(state, **defaults)  # return surrogate data as a generator