
Add Accuracy in time #75

Open
wants to merge 13 commits into base: main
1 change: 1 addition & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
@@ -30,6 +30,7 @@ Metrics
metrics.brier_score_incidence
metrics.integrated_brier_score_survival
metrics.integrated_brier_score_incidence
metrics.accuracy_in_time

Datasets
--------
186 changes: 186 additions & 0 deletions examples/plot_03_competing_risks.py
@@ -0,0 +1,186 @@
"""
==============================
Exploring the accuracy in time
==============================

In this notebook, we showcase how the accuracy in time metric behaves, and how
to interpret it.
"""
# %%
# We begin by generating a linear, synthetic dataset. For each individual, we uniformly
# sample a shape and scale value, which we use to parameterize a Weibull distribution,
# from which we sample a duration.
from hazardous.data import make_synthetic_competing_weibull
from sklearn.model_selection import train_test_split


X, y = make_synthetic_competing_weibull(n_events=3, n_samples=10_000, return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)

X_train.shape, y_train.shape

# %%
# Next, we display the distribution of our target.
import seaborn as sns
from matplotlib import pyplot as plt


sns.histplot(
y_test,
x="duration",
hue="event",
multiple="stack",
palette="colorblind",
)

# %%
# We train a Survival Boost model and compute its accuracy in time.
import numpy as np
from hazardous import SurvivalBoost
from hazardous.metrics import accuracy_in_time


results = []

time_grid = np.arange(0, 4000, 100)
surv = SurvivalBoost(show_progressbar=False).fit(X_train, y_train)
y_pred = surv.predict_cumulative_incidence(X_test, times=time_grid)

quantiles = np.linspace(0.125, 1, 16)
accuracy, taus = accuracy_in_time(y_test, y_pred, time_grid, quantiles=quantiles)
results.append(dict(model_name="Survival Boost", accuracy=accuracy, taus=taus))

# %%
# We also compute the accuracy in time of the Aalen-Johansen estimator, which is
# a marginal model (it doesn't use the covariates X), similar to the Kaplan-Meier
# estimator, except that it estimates the cumulative incidence functions of the
# competing risks instead of a survival function.
from scipy.interpolate import interp1d
from lifelines import AalenJohansenFitter
from hazardous.utils import check_y_survival


def predict_aalen_johansen(y_train, time_grid, n_sample_test):
event, duration = check_y_survival(y_train)
event_ids = sorted(set(event) - set([0]))

y_pred = []
for event_id in event_ids:
aj = AalenJohansenFitter(calculate_variance=False).fit(
durations=duration,
event_observed=event,
event_of_interest=event_id,
)
cif = aj.cumulative_density_
y_pred_ = interp1d(
x=cif.index,
y=cif[cif.columns[0]],
kind="linear",
fill_value="extrapolate",
)(time_grid)

y_pred.append(
# shape: (n_sample_test, 1, n_time_steps)
np.tile(y_pred_, (n_sample_test, 1, 1))
)

y_survival = (1 - np.sum(np.concatenate(y_pred, axis=1), axis=1))[:, None, :]
y_pred.insert(0, y_survival)

return np.concatenate(y_pred, axis=1)


y_pred_aj = predict_aalen_johansen(y_train, time_grid, n_sample_test=X_test.shape[0])
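The stacking inside ``predict_aalen_johansen`` determines the final shape of ``y_pred_aj``. A minimal sketch with made-up CIF values and sizes (all names here are hypothetical) checks that the survival channel is prepended and that the channels sum to one at every time point:

```python
import numpy as np

# Hypothetical sizes: 5 test samples, 3 competing events, 40 time steps.
n_sample_test, n_events, n_times = 5, 3, 40

# Fake marginal CIFs, one per event, each tiled across test samples
# exactly as in predict_aalen_johansen above.
fake_cifs = [
    np.tile(np.linspace(0, 0.2, n_times), (n_sample_test, 1, 1))
    for _ in range(n_events)
]

# Prepend the survival channel so that the channels sum to 1 everywhere.
survival = (1 - np.sum(np.concatenate(fake_cifs, axis=1), axis=1))[:, None, :]
y_pred_fake = np.concatenate([survival] + fake_cifs, axis=1)

print(y_pred_fake.shape)  # (5, 4, 40)
```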

accuracy, taus = accuracy_in_time(y_test, y_pred_aj, time_grid, quantiles=quantiles)
results.append(dict(model_name="Aalen-Johansen", accuracy=accuracy, taus=taus))

# %%
# We display the accuracy in time to compare Survival Boost with Aalen-Johansen.
# Higher is better. Note that the accuracy is high at the very beginning
# (t < 1000), because both models predict that every individual survives.
# Then, beyond the time horizon 1000, the discriminative power of the conditional
# Survival Boost model yields a better accuracy than the marginal, unbiased,
# Aalen-Johansen estimator.
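To see why any model scores high at small horizons, consider a degenerate baseline that always predicts class 0 ("no event yet"). A small sketch on made-up data (hypothetical names, durations drawn uniformly on the same 0-4000 range as above) shows its accuracy decaying as events accumulate:

```python
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
# Hypothetical target: 3 competing events plus censoring (event 0), with
# durations uniform on the same 0-4000 range as the synthetic dataset above.
y_toy = pd.DataFrame(
    {
        "event": rng.integers(0, 4, size=1000),
        "duration": rng.uniform(0, 4000, size=1000),
    }
)

# A baseline always predicting class 0 is right exactly for the individuals
# whose event has not occurred by tau, so its accuracy decays with tau.
baseline_accs = []
for tau in [500, 2000, 3500]:
    kept = ~((y_toy["event"] == 0) & (y_toy["duration"] <= tau))
    observed_class = (y_toy["event"] * (y_toy["duration"] <= tau))[kept]
    baseline_accs.append((observed_class == 0).mean())

print(baseline_accs)
```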
import pandas as pd


fig, ax = plt.subplots(figsize=(6, 3), dpi=300)

results = pd.DataFrame(results).explode(column=["accuracy", "taus"])

sns.lineplot(
results,
x="taus",
y="accuracy",
hue="model_name",
ax=ax,
legend=False,
)

sns.scatterplot(
results,
x="taus",
y="accuracy",
hue="model_name",
ax=ax,
s=50,
zorder=100,
style="model_name",
)


# %%
# We can drill into this metric by counting the observed events cumulatively
# across time, and comparing that count to the predictions.
#
# We display below the distribution of ground-truth labels. Each group of
# colored bars represents the event distribution at a given time horizon.
# Almost no individuals have experienced an event at the very beginning.
# Then, as time passes, events occur and the number of censored individuals at
# each time horizon shrinks. Therefore, the very last distribution represents
# the overall event distribution of the dataset.
def plot_event_in_time(y_in_time):
event_in_times = []
for event_id in range(4):
event_in_times.append(
dict(
event_count=(y_in_time == event_id).sum(axis=0),
time_grid=time_grid,
event=event_id,
)
)

event_in_times = pd.DataFrame(event_in_times).explode(["event_count", "time_grid"])

ax = sns.barplot(
event_in_times,
x="time_grid",
y="event_count",
hue="event",
palette="colorblind",
)

ax.set_xticks(ax.get_xticks()[::10])


time_grid_2d = np.tile(time_grid, (y_test.shape[0], 1))
y_test_class = (y_test["duration"].values[:, None] <= time_grid_2d) * y_test[
"event"
].values[:, None]
plot_event_in_time(y_test_class)
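The broadcasting trick used to build ``y_test_class`` can be checked on a tiny hand-made example: an individual's class at horizon t is its event label once its duration has passed, and 0 before.

```python
import numpy as np

# Two hypothetical individuals: event 2 at t=150 and event 1 at t=300.
durations = np.array([150.0, 300.0])
events = np.array([2, 1])
small_grid = np.array([100, 200, 300, 400])

# Same construction as above: broadcast durations against the grid, then
# multiply by the event label so past-event entries carry the class.
y_class = (durations[:, None] <= small_grid[None, :]) * events[:, None]
print(y_class)  # [[0 2 2 2]
                #  [0 0 1 1]]
```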
# %%
# Now, we compare this ground truth to the classes predicted by our Survival
# Boost model. Interestingly, it seems overconfident about the censoring event
# at the beginning (t < 500), underconfident in the middle (t > 1500), and
# very overconfident about class 3 at the end (t > 3000).

y_pred_class = y_pred.argmax(axis=1)
plot_event_in_time(y_pred_class)

# %%
# Finally, we compare this to the classes predicted by the Aalen-Johansen model.
# The predictions are constant across individuals because this model is
# marginal: we simply duplicated the global cumulative incidences for each
# individual.
y_pred_class_aj = y_pred_aj.argmax(axis=1)
plot_event_in_time(y_pred_class_aj)
# %%
9 changes: 4 additions & 5 deletions hazardous/_survival_boost.py
@@ -169,7 +169,7 @@ def fit(self, X):


class SurvivalBoost(BaseEstimator, ClassifierMixin):
r"""Cause-specific Cumulative Incidence Function (CIF) with GBDT [1]_.
r"""Cause-specific Cumulative Incidence Function (CIF) with GBDT [Alberge2024]_.

This model estimates the cause-specific Cumulative Incidence Function (CIF) for
each event of interest, as well as the survival function for any event, using a
@@ -297,10 +297,9 @@ class SurvivalBoost(BaseEstimator, ClassifierMixin):

References
----------
.. [1] J. Alberge, V. Maladière, O. Grisel, J. Abécassis, G. Varoquaux,
"Teaching Models To Survive: Proper Scoring Rule and Stochastic Optimization
with Competing Risks", 2024.
https://arxiv.org/pdf/2406.14085
.. [Alberge2024] J. Alberge, V. Maladiere, O. Grisel, J. Abécassis, G. Varoquaux,
"Survival Models: Proper Scoring Rule and Stochastic Optimization
with Competing Risks", 2024

Examples
--------
2 changes: 2 additions & 0 deletions hazardous/metrics/__init__.py
@@ -1,3 +1,4 @@
from ._accuracy_in_time import accuracy_in_time
from ._brier_score import (
brier_score_incidence,
brier_score_survival,
@@ -10,4 +11,5 @@
"brier_score_incidence",
"integrated_brier_score_survival",
"integrated_brier_score_incidence",
"accuracy_in_time",
]
131 changes: 131 additions & 0 deletions hazardous/metrics/_accuracy_in_time.py
@@ -0,0 +1,131 @@
import numpy as np

from ..utils import check_y_survival


def accuracy_in_time(y_test, y_pred, time_grid, quantiles=None, taus=None):
r"""Accuracy in time for prognostic models using competing risks.

.. math::

\mathrm{acc}(\zeta) = \frac{1}{n_{nc}} \sum_{i=1}^n I\{\hat{y}_i=y_{i,\zeta}\}
\overline{I\{\delta_i = 0 \cap t_i \leq \zeta \}}

where:

- :math:`\zeta` is a fixed time horizon
- :math:`n_{nc}` is the number of uncensored individuals at :math:`\zeta`
- :math:`\delta_i` is the event experienced by the individual :math:`i` at
:math:`t_i`
- :math:`\hat{y} = \text{arg}\max\limits_{k \in [1, K]} \hat{F}_k(\zeta|X=x_i)` is
the most probable predicted event for individual :math:`i` at :math:`\zeta`
- :math:`y_{i,\zeta} = \delta_i I\{t_i \leq \zeta \}` is the observed event
for individual :math:`i` at :math:`\zeta`

The accuracy in time is a metric introduced in [Alberge2024]_ which evaluates
whether observed events are predicted as the most likely at given times.
It is defined as the probability that the maximum predicted cumulative incidence
function (CIF) across the :math:`k` events corresponds to the observed event at a
fixed time horizon :math:`\zeta`.

We remove individuals that were censored at times :math:`t \leq \zeta`, so the
accuracy in time essentially represents the accuracy of the estimator on
observed events up to :math:`\zeta`.

While the C-index can help clinicians prioritize treatment allocation by ranking
individuals by risk of a given event of interest, the accuracy in time answers
a different question: "`what is the most likely event that this individual will
experience at some fixed time horizon?`". Conceptually, it helps clinicians choose
the right treatment by prioritizing the risks for a given individual.

Parameters
----------
y_test : array, dictionary or dataframe of shape (n_samples, 2)
The test target, consisting of the 'event' and 'duration' columns.

y_pred : array of shape (n_samples_test, n_events, n_time_grid)
Cumulative incidence for all competing events, at the time points
from the input time_grid.

time_grid : array of shape (n_time_grid,)
Time points used to predict the cumulative incidence.

quantiles : array or list of shape (n_quantiles,), default=None
The quantiles of ``time_grid`` used to define the fixed horizons at which
to compute the accuracy in time. The values of ``time_grid`` at these
quantiles are equivalent to ``taus``, so ``quantiles`` can't be set if
``taus`` is set. If neither ``taus`` nor ``quantiles`` is set, ``quantiles``
defaults to a uniform grid of 8 values from 0 to 1.

taus : array or list of shape (n_taus,), default=None
The fixed time horizons at which to compute the accuracy in time. Can't be
set if ``quantiles`` is set.

Returns
-------
acc_in_time : array of shape (n_quantiles or n_taus)
The accuracy in time computed at the fixed horizons ``taus``.

taus : array of shape (n_quantiles or n_taus)
The fixed time horizons effectively used to compute the accuracy in time.

References
----------
.. [Alberge2024] J. Alberge, V. Maladiere, O. Grisel, J. Abécassis, G. Varoquaux,
"Survival Models: Proper Scoring Rule and Stochastic Optimization
with Competing Risks", 2024

"""
event_true, _ = check_y_survival(y_test)

if y_pred.ndim != 3:
raise ValueError(
"'y_pred' must be a 3D array with shape (n_samples, n_events, n_times), got"
f" shape {y_pred.shape}."
)
if y_pred.shape[0] != event_true.shape[0]:
raise ValueError(
"'y_test' and 'y_pred' must have the same number of samples, "
f"got {event_true.shape[0]} and {y_pred.shape[0]} respectively."
)
time_grid = np.atleast_1d(time_grid)
if y_pred.shape[2] != time_grid.shape[0]:
raise ValueError(
f"'time_grid' length ({time_grid.shape[0]}) "
f"must be equal to y_pred.shape[2] ({y_pred.shape[2]})."
)

if quantiles is not None:
if taus is not None:
raise ValueError("'quantiles' and 'taus' can't be set at the same time.")

quantiles = np.atleast_1d(quantiles)
if any(quantiles < 0) or any(quantiles > 1):
raise ValueError(f"quantiles must be in [0, 1], got {quantiles}.")
taus = np.quantile(time_grid, quantiles)

elif quantiles is None and taus is None:
n_quantiles = min(time_grid.shape[0], 8)
quantiles = np.linspace(0, 1, n_quantiles)
taus = np.quantile(time_grid, quantiles)

acc_in_time = []

for tau in taus:
mask_past_censored = (y_test["event"] == 0) & (y_test["duration"] <= tau)

tau_idx = np.searchsorted(time_grid, tau)


# If tau is beyond the time_grid, we extrapolate its accuracy as
# the accuracy at max(time_grid).
if tau_idx == time_grid.shape[0]:
tau_idx = -1

y_pred_at_t = y_pred[:, :, tau_idx]
y_pred_class = y_pred_at_t[~mask_past_censored, :].argmax(axis=1)

y_test_class = y_test["event"] * (y_test["duration"] <= tau)
y_test_class = y_test_class.loc[~mask_past_censored].values

acc_in_time.append((y_test_class == y_pred_class).mean())

return np.array(acc_in_time), np.asarray(taus)
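For a single ``tau``, the per-horizon loop above reduces to a few numpy/pandas operations. A self-contained sketch with made-up predictions (hypothetical names, not the library API) illustrates the bookkeeping, including the removal of individuals censored before ``tau``:

```python
import numpy as np
import pandas as pd

# Hypothetical toy data: 4 individuals, 2 competing events, 0 = censoring.
y_test_toy = pd.DataFrame(
    {"event": [1, 2, 0, 1], "duration": [5.0, 12.0, 8.0, 20.0]}
)
tau = 10.0

# Made-up predicted incidences at tau, columns = (survival, event 1, event 2).
y_pred_at_tau = np.array(
    [
        [0.1, 0.7, 0.2],  # argmax 1 -> predicts event 1
        [0.6, 0.2, 0.2],  # argmax 0 -> predicts "no event yet"
        [0.3, 0.3, 0.4],  # censored before tau, will be dropped
        [0.2, 0.3, 0.5],  # argmax 2 -> predicts event 2
    ]
)

# Drop individuals censored at or before tau (here: individual 2).
mask_past_censored = (y_test_toy["event"] == 0) & (y_test_toy["duration"] <= tau)

# Observed class at tau: the event label if it happened by tau, else 0.
y_true_class = (
    (y_test_toy["event"] * (y_test_toy["duration"] <= tau))
    .loc[~mask_past_censored]
    .values
)
y_pred_class = y_pred_at_tau[~mask_past_censored.values].argmax(axis=1)

accuracy_at_tau = (y_true_class == y_pred_class).mean()
print(accuracy_at_tau)  # 2 of the 3 kept individuals match
```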
1 change: 1 addition & 0 deletions hazardous/metrics/_brier_score.py
@@ -145,6 +145,7 @@ def brier_score_incidence(self, y_true, y_pred, times):
"'y_true' and 'y_pred' must have the same number of samples, "
f"got {event_true.shape[0]} and {y_pred.shape[0]} respectively."
)
times = np.atleast_1d(times)
if y_pred.shape[1] != times.shape[0]:
raise ValueError(
f"'times' length ({times.shape[0]}) "