diff --git a/.cspell/library_terms.txt b/.cspell/library_terms.txt index 9d142a96..36d4bf38 100644 --- a/.cspell/library_terms.txt +++ b/.cspell/library_terms.txt @@ -83,6 +83,7 @@ ndim newaxis nobs nonzero +notin numpy opencv operatorname diff --git a/coreax/coreset.py b/coreax/coreset.py index 2881683b..64471e3c 100644 --- a/coreax/coreset.py +++ b/coreax/coreset.py @@ -35,41 +35,16 @@ class Coreset(eqx.Module, Generic[_Data]): r""" Data structure for representing a coreset. - TLDR: a coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data - points that, in some sense, best represent the "important" properties of a larger - set of :math:`n > \hat{n}` (potentially weighted) data points. + A coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data points, + :math:`\hat{X} := \{(\hat{x}_i, \hat{w}_i)\}_{i=1}^\hat{n}` that, in some sense, + best represent the "important" properties of a larger set of :math:`n > \hat{n}` + (potentially weighted) data points :math:`X := \{(x_i, w_i)\}_{i=1}^n`. - Given a dataset :math:`X = \{x_i\}_{i=1}^n, x \in \Omega`, where each node is paired - with a non-negative (probability) weight :math:`w_i \in \mathbb{R} \ge 0`, there - exists an implied discrete (probability) measure over :math:`\Omega` + :math:`\hat{x}_i, x_i \in \Omega` represent the data points/nodes and + :math:`\hat{w}_i, w_i \in \mathbb{R}` represent the associated weights. - .. math:: - \eta_n = \sum_{i=1}^{n} w_i \delta_{x_i}. - - If we then specify a set of test-functions :math:`\Phi = {\phi_1, \dots, \phi_M}`, - where :math:`\phi_i \colon \Omega \to \mathbb{R}`, which somehow capture the - "important" properties of the data, then there also exists an implied push-forward - measure over :math:`\mathbb{R}^M` - - .. math:: - \mu_n = \sum_{i=1}^{n} w_i \delta_{\Phi(x_i)}. 
- - A coreset is simply a reduced measure containing :math:`\hat{n} < n` updated nodes - :math:`\hat{x}_i` and weights :math:`\hat{w}_i`, such that the push-forward measure - of the coreset :math:`\nu_\hat{n}` has (approximately for some algorithms) the same - "centre-of-mass" as the push-forward measure for the original data :math:`\mu_n` - - .. math:: - \text{CoM}(\mu_n) = \text{CoM}(\nu_\hat{n}), - \text{CoM}(\nu_\hat{n}) = \int_\Omega \Phi(\omega) d\nu_\hat{x}(\omega), - \text{CoM}(\nu_\hat{n}) = \sum_{i=1}^\hat{n} \hat{w}_i \delta_{\Phi(\hat{x}_i)}. - - .. note:: - Depending on the algorithm, the test-functions may be explicitly specified by - the user, or implicitly defined by the algorithm's specific objectives. - - :param nodes: The (weighted) coreset nodes, math:`x_i \in \text{supp}(\nu_\hat{n})`; - once instantiated, the nodes should be accessed via :meth:`Coresubset.coreset` + :param nodes: The (weighted) coreset nodes, :math:`\hat{x}_i`; once instantiated, + the nodes should only be accessed via :meth:`Coresubset.coreset` :param pre_coreset_data: The dataset :math:`X` used to construct the coreset. """ @@ -125,27 +100,24 @@ class Coresubset(Coreset[Data], Generic[_Data]): r""" Data structure for representing a coresubset. - A coresubset is a :class`Coreset`, with the additional condition that the support of - the reduced measure (the coreset), must be a subset of the support of the original - measure (the original data), such that + A coresubset is a :class:`Coreset`, with the additional condition that the coreset + data points/nodes must be a subset of the original data points/nodes, such that .. math:: \hat{x}_i = x_i, \forall i \in I, - I \subset \{1, \dots, n\}, text{card}(I) = \hat{n}. + I \subset \{1, \dots, n\}, \text{card}(I) = \hat{n}. Thus, a coresubset, unlike a coreset, ensures that feasibility constraints on the - support of the measure are maintained :cite:`litterer2012recombination`. 
This is - vital if, for example, the test-functions are only defined on the support of the - original measure/nodes, rather than all of :math:`\Omega`. + support of the measure are maintained :cite:`litterer2012recombination`. - In coresubsets, the measure reduction can be implicit (setting weights/nodes to - zero for all :math:`i \in I \ {1, \dots, n}`) or explicit (removing entries from the - weight/node arrays). The implicit approach is useful when input/output array shape - stability is required (E.G. for some JAX transformations); the explicit approach is - more similar to a standard coreset. + In coresubsets, the dataset reduction can be implicit (setting weights/nodes to zero + for all :math:`i \notin I`) or explicit (removing entries from the weight/node + arrays). The implicit approach is useful when input/output array shape stability is + required (e.g. for some JAX transformations); the explicit approach is more similar + to a standard coreset. :param nodes: The (weighted) coresubset node indices, :math:`I`; the materialised - coresubset nodes should be accessed via :meth:`Coresubset.coreset`. + coresubset nodes should only be accessed via :meth:`Coresubset.coreset`. :param pre_coreset_data: The dataset :math:`X` used to construct the coreset. """