From facc25f98ede54f7ba8f3e9d7c4871fa3d7efa78 Mon Sep 17 00:00:00 2001
From: tc85324 <158290903+tc85324@users.noreply.github.com>
Date: Fri, 28 Jun 2024 09:55:55 +0100
Subject: [PATCH 1/3] docs: simplify `coreax.coreset.Coreset` docstring maths

---
 coreax/coreset.py | 63 ++++++++++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 22 deletions(-)

diff --git a/coreax/coreset.py b/coreax/coreset.py
index 511f6293..a20c91a6 100644
--- a/coreax/coreset.py
+++ b/coreax/coreset.py
@@ -21,41 +21,60 @@ class Coreset(eqx.Module, Generic[_Data]):
     r"""
     Data structure for representing a coreset.
 
-    TLDR: a coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data
+    **TLDR:** a coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data
     points that, in some sense, best represent the "important" properties of a larger
     set of :math:`n > \hat{n}` (potentially weighted) data points.
 
-    Given a dataset :math:`X = \{x_i\}_{i=1}^n, x \in \Omega`, where each node is paired
-    with a non-negative (probability) weight :math:`w_i \in \mathbb{R} \ge 0`, there
-    exists an implied discrete (probability) measure over :math:`\Omega`
+    For a dataset :math:`\{(x_i, w_i)\}_{i=1}^n`, where each node :math:`x_i \in \Omega`
+    is paired with a non-negative weight :math:`w_i \in \mathbb{R} \ge 0`, there exists
+    an implied (discrete) measure :math:`\nu_n = \sum_{i=1}^{n} w_i \delta_{x_i}` on
+    :math:`\Omega`. While not very useful on its own, when combined with a set of
+    :math:`\nu_n`-integrable test-functions :math:`\Phi = \{ \phi_1, \dots, \phi_M \}`,
+    where :math:`\phi_i\ \colon\ \Omega \to \mathbb{R}`, the measure :math:`\nu_n`
+    implies the following push-forward measure over :math:`\mathbb{R}^M`
 
     .. math::
-        \eta_n = \sum_{i=1}^{n} w_i \delta_{x_i}.
+        \begin{align}
+            \mu_n &:= \Phi_* \nu_n,\\
+            \mu_n &= \sum_{i=1}^{n} w_i \delta_{\Phi(x_i)}.
+        \end{align}
 
-    If we then specify a set of test-functions :math:`\Phi = {\phi_1, \dots, \phi_M}`,
-    where :math:`\phi_i \colon \Omega \to \mathbb{R}`, which somehow capture the
-    "important" properties of the data, then there also exists an implied push-forward
-    measure over :math:`\mathbb{R}^M`
+    We assume, that for some choice of test-functions, the "important" properties of
+    :math:`\nu_n` (the original dataset) are encoded in the "centre-of-mass" of the
+    pushed-forward measure :math:`\mu_n`
 
     .. math::
-        \mu_n = \sum_{i=1}^{n} w_i \delta_{\Phi(x_i)}.
+        \begin{align}
+            \text{CoM}(\mu_n) &:= \sum_{i}^{n} w_i \Phi(x_i),\\
+            \text{CoM}(\mu_n) &= \int_\Omega \phi_j(\omega) d\mu_n.\
+        \end{align}
 
-    A coreset is simply a reduced measure containing :math:`\hat{n} < n` updated nodes
-    :math:`\hat{x}_i` and weights :math:`\hat{w}_i`, such that the push-forward measure
-    of the coreset :math:`\nu_\hat{n}` has (approximately for some algorithms) the same
-    "centre-of-mass" as the push-forward measure for the original data :math:`\mu_n`
+    .. note::
+        Depending on the coreset solver, the test-functions may be explicitly specified
+        by the user (the user makes a choice about what properties are "important"), or
+        implicitly defined by the solvers's specific objectives (the solver specifies
+        what properties are "important").
+
+    A coreset is simply a reduced measure :math:`\hat{\nu}_\hat{n}`, whose push-forward
+    :math:`\hat{\mu}_\hat{n} := \Phi_* \hat{\nu}_\hat{n}` has, approximately in some
+    cases, the same "centre-of-mass" as the push-forward measure of the original dataset
 
     .. math::
-        \text{CoM}(\mu_n) = \text{CoM}(\nu_\hat{n}),
-        \text{CoM}(\nu_\hat{n}) = \int_\Omega \Phi(\omega) d\nu_\hat{x}(\omega),
-        \text{CoM}(\nu_\hat{n}) = \sum_{i=1}^\hat{n} \hat{w}_i \delta_{\Phi(\hat{x}_i)}.
+        \hat{\nu}_\hat{n} := \sum_{i=1}^\hat{n} \hat{w}_i \delta_{\hat{x}_i}, \quad
+        \text{CoM}(\hat{\mu}_\hat{n}) = \text{CoM}(\mu_n),
 
-    .. note::
-        Depending on the algorithm, the test-functions may be explicitly specified by
-        the user, or implicitly defined by the algorithm's specific objectives.
+    where :math:`\hat{x}_i \in \Omega` and :math:`\hat{w}_i \in \mathbb{R} \ge 0`. In
+    preserving the "centre-of-mass", the coreset satisfies
+
+    .. math::
+        \int_\Omega f(\omega)\ d\mu_n = \int_\Omega f(\omega)\ d\hat{\mu}_\hat{n},
+
+    for all functions :math:`f \in \text{span}(\Phi)`. I.E. integration against the
+    push-forward of the original dataset and the push-forward of the coreset is
+    identical for all functions in the span of the test-functions.
 
-    :param nodes: The (weighted) coreset nodes, math:`x_i \in \text{supp}(\nu_\hat{n})`;
-        once instantiated, the nodes should be accessed via :meth:`Coresubset.coreset`
+    :param nodes: The (weighted) coreset nodes, :math:`\hat{x}_i`; once instantiated,
+        the nodes should only be accessed via :meth:`Coresubset.coreset`
     :param pre_coreset_data: The dataset :math:`X` used to construct the coreset.
     """
 

From e3d229195a9a4dedd0b4937221bc499abe32acf2 Mon Sep 17 00:00:00 2001
From: tc85324 <158290903+tc85324@users.noreply.github.com>
Date: Fri, 18 Oct 2024 15:53:58 +0100
Subject: [PATCH 2/3] docs: simplify coreset docstrings.

Replaces complex measure-theoretic definitions with more accessible
ones while maintaining the required mathematical notation.

Thanks to @db091756 for the helpful discussions.

Refs: #684
---
 coreax/coreset.py | 71 ++++++++---------------------------------------
 1 file changed, 12 insertions(+), 59 deletions(-)

diff --git a/coreax/coreset.py b/coreax/coreset.py
index a20c91a6..0ea8fb6f 100644
--- a/coreax/coreset.py
+++ b/coreax/coreset.py
@@ -21,57 +21,13 @@ class Coreset(eqx.Module, Generic[_Data]):
     r"""
     Data structure for representing a coreset.
 
-    **TLDR:** a coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data
-    points that, in some sense, best represent the "important" properties of a larger
-    set of :math:`n > \hat{n}` (potentially weighted) data points.
-
-    For a dataset :math:`\{(x_i, w_i)\}_{i=1}^n`, where each node :math:`x_i \in \Omega`
-    is paired with a non-negative weight :math:`w_i \in \mathbb{R} \ge 0`, there exists
-    an implied (discrete) measure :math:`\nu_n = \sum_{i=1}^{n} w_i \delta_{x_i}` on
-    :math:`\Omega`. While not very useful on its own, when combined with a set of
-    :math:`\nu_n`-integrable test-functions :math:`\Phi = \{ \phi_1, \dots, \phi_M \}`,
-    where :math:`\phi_i\ \colon\ \Omega \to \mathbb{R}`, the measure :math:`\nu_n`
-    implies the following push-forward measure over :math:`\mathbb{R}^M`
+    A coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data points,
+    :math:`\hat{X} := \{(\hat{x}_i, \hat{w}_i)\}_{i=1}^\hat{n}` that, in some sense,
+    best represent the "important" properties of a larger set of :math:`n > \hat{n}`
+    (potentially weighted) data points :math:`X := \{(x_i, w_i)\}_{i=1}^n`,.
 
-    .. math::
-        \begin{align}
-            \mu_n &:= \Phi_* \nu_n,\\
-            \mu_n &= \sum_{i=1}^{n} w_i \delta_{\Phi(x_i)}.
-        \end{align}
-
-    We assume, that for some choice of test-functions, the "important" properties of
-    :math:`\nu_n` (the original dataset) are encoded in the "centre-of-mass" of the
-    pushed-forward measure :math:`\mu_n`
-
-    .. math::
-        \begin{align}
-            \text{CoM}(\mu_n) &:= \sum_{i}^{n} w_i \Phi(x_i),\\
-            \text{CoM}(\mu_n) &= \int_\Omega \phi_j(\omega) d\mu_n.\
-        \end{align}
-
-    .. note::
-        Depending on the coreset solver, the test-functions may be explicitly specified
-        by the user (the user makes a choice about what properties are "important"), or
-        implicitly defined by the solvers's specific objectives (the solver specifies
-        what properties are "important").
-
-    A coreset is simply a reduced measure :math:`\hat{\nu}_\hat{n}`, whose push-forward
-    :math:`\hat{\mu}_\hat{n} := \Phi_* \hat{\nu}_\hat{n}` has, approximately in some
-    cases, the same "centre-of-mass" as the push-forward measure of the original dataset
-
-    .. math::
-        \hat{\nu}_\hat{n} := \sum_{i=1}^\hat{n} \hat{w}_i \delta_{\hat{x}_i}, \quad
-        \text{CoM}(\hat{\mu}_\hat{n}) = \text{CoM}(\mu_n),
-
-    where :math:`\hat{x}_i \in \Omega` and :math:`\hat{w}_i \in \mathbb{R} \ge 0`. In
-    preserving the "centre-of-mass", the coreset satisfies
-
-    .. math::
-        \int_\Omega f(\omega)\ d\mu_n = \int_\Omega f(\omega)\ d\hat{\mu}_\hat{n},
-
-    for all functions :math:`f \in \text{span}(\Phi)`. I.E. integration against the
-    push-forward of the original dataset and the push-forward of the coreset is
-    identical for all functions in the span of the test-functions.
+    :math:`\hat{x}_i, x_i \in \Omega` represent the data points/nodes and
+    :math:`\hat{w}_i, w_i \in \mathbb{R}` represent the associated weights.
 
     :param nodes: The (weighted) coreset nodes, :math:`\hat{x}_i`; once instantiated,
         the nodes should only be accessed via :meth:`Coresubset.coreset`
@@ -112,27 +68,24 @@ class Coresubset(Coreset[_Data], Generic[_Data]):
     r"""
     Data structure for representing a coresubset.
 
-    A coresubset is a :class`Coreset`, with the additional condition that the support of
-    the reduced measure (the coreset), must be a subset of the support of the original
-    measure (the original data), such that
+    A coresubset is a :class`Coreset`, with the additional condition that the coreset
+    data points/nodes must be a subset of the the original data points/nodes, such that
 
     .. math::
         \hat{x}_i = x_i, \forall i \in I,
         I \subset \{1, \dots, n\}, text{card}(I) = \hat{n}.
 
     Thus, a coresubset, unlike a coreset, ensures that feasibility constraints on the
-    support of the measure are maintained :cite:`litterer2012recombination`. This is
-    vital if, for example, the test-functions are only defined on the support of the
-    original measure/nodes, rather than all of :math:`\Omega`.
+    support of the measure are maintained :cite:`litterer2012recombination`.
 
-    In coresubsets, the measure reduction can be implicit (setting weights/nodes to
-    zero for all :math:`i \in I \ {1, \dots, n}`) or explicit (removing entries from the
+    In coresubsets, the dataset reduction can be implicit (setting weights/nodes to zero
+    for all :math:`i \in I \ {1, \dots, n}`) or explicit (removing entries from the
     weight/node arrays). The implicit approach is useful when input/output array shape
     stability is required (E.G. for some JAX transformations); the explicit approach is
     more similar to a standard coreset.
 
     :param nodes: The (weighted) coresubset node indices, :math:`I`; the materialised
-        coresubset nodes should be accessed via :meth:`Coresubset.coreset`.
+        coresubset nodes should only be accessed via :meth:`Coresubset.coreset`.
     :param pre_coreset_data: The dataset :math:`X` used to construct the coreset.
     """
 

From a9e8571bddbb9181ddb751eaecd991e592d90a54 Mon Sep 17 00:00:00 2001
From: pc532627 <138115355+pc532627@users.noreply.github.com>
Date: Mon, 21 Oct 2024 12:35:40 +0100
Subject: [PATCH 3/3] docs: fix minor typos in coreset docstrings

---
 .cspell/library_terms.txt |  1 +
 coreax/coreset.py         | 16 ++++++++--------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/.cspell/library_terms.txt b/.cspell/library_terms.txt
index 9d142a96..36d4bf38 100644
--- a/.cspell/library_terms.txt
+++ b/.cspell/library_terms.txt
@@ -83,6 +83,7 @@ ndim
 newaxis
 nobs
 nonzero
+notin
 numpy
 opencv
 operatorname
diff --git a/coreax/coreset.py b/coreax/coreset.py
index e88514be..64471e3c 100644
--- a/coreax/coreset.py
+++ b/coreax/coreset.py
@@ -38,7 +38,7 @@ class Coreset(eqx.Module, Generic[_Data]):
     A coreset is a reduced set of :math:`\hat{n}` (potentially weighted) data points,
     :math:`\hat{X} := \{(\hat{x}_i, \hat{w}_i)\}_{i=1}^\hat{n}` that, in some sense,
     best represent the "important" properties of a larger set of :math:`n > \hat{n}`
-    (potentially weighted) data points :math:`X := \{(x_i, w_i)\}_{i=1}^n`,.
+    (potentially weighted) data points :math:`X := \{(x_i, w_i)\}_{i=1}^n`.
 
     :math:`\hat{x}_i, x_i \in \Omega` represent the data points/nodes and
     :math:`\hat{w}_i, w_i \in \mathbb{R}` represent the associated weights.
@@ -100,21 +100,21 @@ class Coresubset(Coreset[Data], Generic[_Data]):
     r"""
     Data structure for representing a coresubset.
 
-    A coresubset is a :class`Coreset`, with the additional condition that the coreset
-    data points/nodes must be a subset of the the original data points/nodes, such that
+    A coresubset is a :class:`Coreset`, with the additional condition that the coreset
+    data points/nodes must be a subset of the original data points/nodes, such that
 
     .. math::
         \hat{x}_i = x_i, \forall i \in I,
-        I \subset \{1, \dots, n\}, text{card}(I) = \hat{n}.
+        I \subset \{1, \dots, n\}, \text{card}(I) = \hat{n}.
 
     Thus, a coresubset, unlike a coreset, ensures that feasibility constraints on the
     support of the measure are maintained :cite:`litterer2012recombination`.
 
     In coresubsets, the dataset reduction can be implicit (setting weights/nodes to zero
-    for all :math:`i \in I \ {1, \dots, n}`) or explicit (removing entries from the
-    weight/node arrays). The implicit approach is useful when input/output array shape
-    stability is required (E.G. for some JAX transformations); the explicit approach is
-    more similar to a standard coreset.
+    for all :math:`i \notin I`) or explicit (removing entries from the weight/node
+    arrays). The implicit approach is useful when input/output array shape stability is
+    required (e.g. for some JAX transformations); the explicit approach is more similar
+    to a standard coreset.
 
     :param nodes: The (weighted) coresubset node indices, :math:`I`; the materialised
         coresubset nodes should only be accessed via :meth:`Coresubset.coreset`.