From facc25f98ede54f7ba8f3e9d7c4871fa3d7efa78 Mon Sep 17 00:00:00 2001 From: tc85324 <158290903+tc85324@users.noreply.github.com> Date: Fri, 28 Jun 2024 09:55:55 +0100 Subject: [PATCH 1/3] docs: simplify `Coreax.Coreset` docstring maths --- coreax/coreset.py | 63 ++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 22 deletions(-) diff --git a/coreax/coreset.py b/coreax/coreset.py index 511f6293..a20c91a6 100644 --- a/coreax/coreset.py +++ b/coreax/coreset.py @@ -21,41 +21,60 @@ class Coreset(eqx.Module, Generic[_Data]): r""" Data structure for representing a coreset. - TLDR: a coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data + **TLDR:** a coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data points that, in some sense, best represent the "important" properties of a larger set of :math:`n > \hat{n}` (potentially weighted) data points. - Given a dataset :math:`X = \{x_i\}_{i=1}^n, x \in \Omega`, where each node is paired - with a non-negative (probability) weight :math:`w_i \in \mathbb{R} \ge 0`, there - exists an implied discrete (probability) measure over :math:`\Omega` + For a dataset :math:`\{(x_i, w_i)\}_{i=1}^n`, where each node :math:`x_i \in \Omega` + is paired with a non-negative weight :math:`w_i \in \mathbb{R} \ge 0`, there exists + an implied (discrete) measure :math:`\nu_n = \sum_{i=1}^{n} w_i \delta_{x_i}` on + :math:`\Omega`. While not very useful on its own, when combined with a set of + :math:`\nu_n`-integrable test-functions :math:`\Phi = \{ \phi_1, \dots, \phi_M \}`, + where :math:`\phi_i\ \colon\ \Omega \to \mathbb{R}`, the measure :math:`\nu_n` + implies the following push-forward measure over :math:`\mathbb{R}^M` .. math:: - \eta_n = \sum_{i=1}^{n} w_i \delta_{x_i}. + \begin{align} + \mu_n &:= \Phi_* \nu_n,\\ + \mu_n &= \sum_{i=1}^{n} w_i \delta_{\Phi(x_i)}. 
+ \end{align} - If we then specify a set of test-functions :math:`\Phi = {\phi_1, \dots, \phi_M}`, - where :math:`\phi_i \colon \Omega \to \mathbb{R}`, which somehow capture the - "important" properties of the data, then there also exists an implied push-forward - measure over :math:`\mathbb{R}^M` + We assume, that for some choice of test-functions, the "important" properties of + :math:`\nu_n` (the original dataset) are encoded in the "centre-of-mass" of the + pushed-forward measure :math:`\mu_n` .. math:: - \mu_n = \sum_{i=1}^{n} w_i \delta_{\Phi(x_i)}. + \begin{align} + \text{CoM}(\mu_n) &:= \sum_{i}^{n} w_i \Phi(x_i),\\ + \text{CoM}(\mu_n) &= \int_\Omega \phi_j(\omega) d\mu_n.\ + \end{align} - A coreset is simply a reduced measure containing :math:`\hat{n} < n` updated nodes - :math:`\hat{x}_i` and weights :math:`\hat{w}_i`, such that the push-forward measure - of the coreset :math:`\nu_\hat{n}` has (approximately for some algorithms) the same - "centre-of-mass" as the push-forward measure for the original data :math:`\mu_n` + .. note:: + Depending on the coreset solver, the test-functions may be explicitly specified + by the user (the user makes a choice about what properties are "important"), or + implicitly defined by the solvers's specific objectives (the solver specifies + what properties are "important"). + + A coreset is simply a reduced measure :math:`\hat{\nu}_\hat{n}`, whose push-forward + :math:`\hat{\mu}_\hat{n} := \Phi_* \hat{\nu}_\hat{n}` has, approximately in some + cases, the same "centre-of-mass" as the push-forward measure of the original dataset .. math:: - \text{CoM}(\mu_n) = \text{CoM}(\nu_\hat{n}), - \text{CoM}(\nu_\hat{n}) = \int_\Omega \Phi(\omega) d\nu_\hat{x}(\omega), - \text{CoM}(\nu_\hat{n}) = \sum_{i=1}^\hat{n} \hat{w}_i \delta_{\Phi(\hat{x}_i)}. + \hat{\nu}_\hat{n} := \sum_{i=1}^\hat{n} \hat{w}_i \delta_{\hat{x}_i}, \quad + \text{CoM}(\hat{\mu}_\hat{n}) = \text{CoM}(\mu_n), - .. 
note:: - Depending on the algorithm, the test-functions may be explicitly specified by - the user, or implicitly defined by the algorithm's specific objectives. + where :math:`\hat{x}_i \in \Omega` and :math:`\hat{w}_i \in \mathbb{R} \ge 0`. In + preserving the "centre-of-mass", the coreset satisfies + + .. math:: + \int_\Omega f(\omega)\ d\mu_n = \int_\Omega f(\omega)\ d\hat{\mu}_\hat{n}, + + for all functions :math:`f \in \text{span}(\Phi)`. I.E. integration against the + push-forward of the original dataset and the push-forward of the coreset is + identical for all functions in the span of the test-functions. - :param nodes: The (weighted) coreset nodes, math:`x_i \in \text{supp}(\nu_\hat{n})`; - once instantiated, the nodes should be accessed via :meth:`Coresubset.coreset` + :param nodes: The (weighted) coreset nodes, :math:`\hat{x}_i`; once instantiated, + the nodes should only be accessed via :meth:`Coresubset.coreset` :param pre_coreset_data: The dataset :math:`X` used to construct the coreset. """ From e3d229195a9a4dedd0b4937221bc499abe32acf2 Mon Sep 17 00:00:00 2001 From: tc85324 <158290903+tc85324@users.noreply.github.com> Date: Fri, 18 Oct 2024 15:53:58 +0100 Subject: [PATCH 2/3] docs: simplify coreset docstrings. Replaces complex measure-theoretic definitions with more accessible ones while maintaining the required mathematical notation. Thanks to @db091756 for the helpful discussions. Refs: #684 --- coreax/coreset.py | 71 ++++++++--------------------------------------- 1 file changed, 12 insertions(+), 59 deletions(-) diff --git a/coreax/coreset.py b/coreax/coreset.py index a20c91a6..0ea8fb6f 100644 --- a/coreax/coreset.py +++ b/coreax/coreset.py @@ -21,57 +21,13 @@ class Coreset(eqx.Module, Generic[_Data]): r""" Data structure for representing a coreset. 
- **TLDR:** a coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data - points that, in some sense, best represent the "important" properties of a larger - set of :math:`n > \hat{n}` (potentially weighted) data points. - - For a dataset :math:`\{(x_i, w_i)\}_{i=1}^n`, where each node :math:`x_i \in \Omega` - is paired with a non-negative weight :math:`w_i \in \mathbb{R} \ge 0`, there exists - an implied (discrete) measure :math:`\nu_n = \sum_{i=1}^{n} w_i \delta_{x_i}` on - :math:`\Omega`. While not very useful on its own, when combined with a set of - :math:`\nu_n`-integrable test-functions :math:`\Phi = \{ \phi_1, \dots, \phi_M \}`, - where :math:`\phi_i\ \colon\ \Omega \to \mathbb{R}`, the measure :math:`\nu_n` - implies the following push-forward measure over :math:`\mathbb{R}^M` + A coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data points, + :math:`\hat{X} := \{(\hat{x}_i, \hat{w}_i)\}_{i=1}^\hat{n}` that, in some sense, + best represent the "important" properties of a larger set of :math:`n > \hat{n}` + (potentially weighted) data points :math:`X := \{(x_i, w_i)\}_{i=1}^n`,. - .. math:: - \begin{align} - \mu_n &:= \Phi_* \nu_n,\\ - \mu_n &= \sum_{i=1}^{n} w_i \delta_{\Phi(x_i)}. - \end{align} - - We assume, that for some choice of test-functions, the "important" properties of - :math:`\nu_n` (the original dataset) are encoded in the "centre-of-mass" of the - pushed-forward measure :math:`\mu_n` - - .. math:: - \begin{align} - \text{CoM}(\mu_n) &:= \sum_{i}^{n} w_i \Phi(x_i),\\ - \text{CoM}(\mu_n) &= \int_\Omega \phi_j(\omega) d\mu_n.\ - \end{align} - - .. note:: - Depending on the coreset solver, the test-functions may be explicitly specified - by the user (the user makes a choice about what properties are "important"), or - implicitly defined by the solvers's specific objectives (the solver specifies - what properties are "important"). 
- - A coreset is simply a reduced measure :math:`\hat{\nu}_\hat{n}`, whose push-forward - :math:`\hat{\mu}_\hat{n} := \Phi_* \hat{\nu}_\hat{n}` has, approximately in some - cases, the same "centre-of-mass" as the push-forward measure of the original dataset - - .. math:: - \hat{\nu}_\hat{n} := \sum_{i=1}^\hat{n} \hat{w}_i \delta_{\hat{x}_i}, \quad - \text{CoM}(\hat{\mu}_\hat{n}) = \text{CoM}(\mu_n), - - where :math:`\hat{x}_i \in \Omega` and :math:`\hat{w}_i \in \mathbb{R} \ge 0`. In - preserving the "centre-of-mass", the coreset satisfies - - .. math:: - \int_\Omega f(\omega)\ d\mu_n = \int_\Omega f(\omega)\ d\hat{\mu}_\hat{n}, - - for all functions :math:`f \in \text{span}(\Phi)`. I.E. integration against the - push-forward of the original dataset and the push-forward of the coreset is - identical for all functions in the span of the test-functions. + :math:`\hat{x}_i, x_i \in \Omega` represent the data points/nodes and + :math:`\hat{w}_i, w_i \in \mathbb{R}` represent the associated weights. :param nodes: The (weighted) coreset nodes, :math:`\hat{x}_i`; once instantiated, the nodes should only be accessed via :meth:`Coresubset.coreset` @@ -112,27 +68,24 @@ class Coresubset(Coreset[_Data], Generic[_Data]): r""" Data structure for representing a coresubset. - A coresubset is a :class`Coreset`, with the additional condition that the support of - the reduced measure (the coreset), must be a subset of the support of the original - measure (the original data), such that + A coresubset is a :class`Coreset`, with the additional condition that the coreset + data points/nodes must be a subset of the the original data points/nodes, such that .. math:: \hat{x}_i = x_i, \forall i \in I, I \subset \{1, \dots, n\}, text{card}(I) = \hat{n}. Thus, a coresubset, unlike a coreset, ensures that feasibility constraints on the - support of the measure are maintained :cite:`litterer2012recombination`. 
This is - vital if, for example, the test-functions are only defined on the support of the - original measure/nodes, rather than all of :math:`\Omega`. + support of the measure are maintained :cite:`litterer2012recombination`. - In coresubsets, the measure reduction can be implicit (setting weights/nodes to - zero for all :math:`i \in I \ {1, \dots, n}`) or explicit (removing entries from the + In coresubsets, the dataset reduction can be implicit (setting weights/nodes to zero + for all :math:`i \in I \ {1, \dots, n}`) or explicit (removing entries from the weight/node arrays). The implicit approach is useful when input/output array shape stability is required (E.G. for some JAX transformations); the explicit approach is more similar to a standard coreset. :param nodes: The (weighted) coresubset node indices, :math:`I`; the materialised - coresubset nodes should be accessed via :meth:`Coresubset.coreset`. + coresubset nodes should only be accessed via :meth:`Coresubset.coreset`. :param pre_coreset_data: The dataset :math:`X` used to construct the coreset. 
""" From a9e8571bddbb9181ddb751eaecd991e592d90a54 Mon Sep 17 00:00:00 2001 From: pc532627 <138115355+pc532627@users.noreply.github.com> Date: Mon, 21 Oct 2024 12:35:40 +0100 Subject: [PATCH 3/3] Minor typo updates --- .cspell/library_terms.txt | 1 + coreax/coreset.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/.cspell/library_terms.txt b/.cspell/library_terms.txt index 9d142a96..36d4bf38 100644 --- a/.cspell/library_terms.txt +++ b/.cspell/library_terms.txt @@ -83,6 +83,7 @@ ndim newaxis nobs nonzero +notin numpy opencv operatorname diff --git a/coreax/coreset.py b/coreax/coreset.py index e88514be..64471e3c 100644 --- a/coreax/coreset.py +++ b/coreax/coreset.py @@ -38,7 +38,7 @@ class Coreset(eqx.Module, Generic[_Data]): A coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data points, :math:`\hat{X} := \{(\hat{x}_i, \hat{w}_i)\}_{i=1}^\hat{n}` that, in some sense, best represent the "important" properties of a larger set of :math:`n > \hat{n}` - (potentially weighted) data points :math:`X := \{(x_i, w_i)\}_{i=1}^n`,. + (potentially weighted) data points :math:`X := \{(x_i, w_i)\}_{i=1}^n`. :math:`\hat{x}_i, x_i \in \Omega` represent the data points/nodes and :math:`\hat{w}_i, w_i \in \mathbb{R}` represent the associated weights. @@ -100,21 +100,21 @@ class Coresubset(Coreset[Data], Generic[_Data]): r""" Data structure for representing a coresubset. - A coresubset is a :class`Coreset`, with the additional condition that the coreset - data points/nodes must be a subset of the the original data points/nodes, such that + A coresubset is a :class:`Coreset`, with the additional condition that the coreset + data points/nodes must be a subset of the original data points/nodes, such that .. math:: \hat{x}_i = x_i, \forall i \in I, - I \subset \{1, \dots, n\}, text{card}(I) = \hat{n}. + I \subset \{1, \dots, n\}, \text{card}(I) = \hat{n}. 
Thus, a coresubset, unlike a coreset, ensures that feasibility constraints on the support of the measure are maintained :cite:`litterer2012recombination`. In coresubsets, the dataset reduction can be implicit (setting weights/nodes to zero - for all :math:`i \in I \ {1, \dots, n}`) or explicit (removing entries from the - weight/node arrays). The implicit approach is useful when input/output array shape - stability is required (E.G. for some JAX transformations); the explicit approach is - more similar to a standard coreset. + for all :math:`i \notin I`) or explicit (removing entries from the weight/node + arrays). The implicit approach is useful when input/output array shape stability is + required (E.G. for some JAX transformations); the explicit approach is more similar + to a standard coreset. :param nodes: The (weighted) coresubset node indices, :math:`I`; the materialised coresubset nodes should only be accessed via :meth:`Coresubset.coreset`.