From a4cbc802a241857972da9481df1640d1cc88f005 Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Wed, 23 Oct 2024 15:23:03 +0100
Subject: [PATCH 1/7] fix: Correct plot scaling calculation and leaf parameters

- Fix scale calculations to divide by the downsampling factor squared
- Update `MapReduce` leaf size to 16,000 (scaled by the squared downsampling
  factor)
---
 examples/david_map_reduce_weighted.py | 61 ++++++++++++++++++---------
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/examples/david_map_reduce_weighted.py b/examples/david_map_reduce_weighted.py
index 44cdbe151..4b69aaabf 100644
--- a/examples/david_map_reduce_weighted.py
+++ b/examples/david_map_reduce_weighted.py
@@ -44,7 +44,6 @@
 import jax.numpy as jnp
 import matplotlib.pyplot as plt
 import numpy as np
-from flax import linen
 from jax import random
 
 from coreax import (
@@ -104,32 +103,51 @@ def main(
     if out_path is not None and not out_path.is_absolute():
         out_path = Path(__file__).parent.joinpath(out_path)
 
+    def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
+        """
+        Downsample an image using :func:`cv2.resize` and convert it to grayscale.
+
+        :param image_path: Path to the input image file.
+        :param downsampling_factor: Factor by which to downsample the image.
+        :return: Grayscale image after downsampling.
+        """
+        img = cv2.imread(image_path)
+
+        # Calculate new dimensions based on the downsampling factor
+        scale_factor = 1 / downsampling_factor
+        width = int(img.shape[1] * scale_factor)
+        height = int(img.shape[0] * scale_factor)
+        dim = (width, height)
+
+        # Resize using INTER_AREA for better downsampling
+        resized = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
+
+        # Convert to grayscale after resizing
+        grayscale_resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
+
+        return grayscale_resized
+
     # Path to original image
-    original_data = cv2.imread(str(in_path))
-    image_data = np.asarray(cv2.cvtColor(original_data, cv2.COLOR_BGR2GRAY))
-    # Pool/downsample the image
-    window_shape = (downsampling_factor, downsampling_factor)
-    pooled_image_data = linen.avg_pool(
-        image_data[..., None], window_shape, strides=window_shape
-    )[..., 0]
-    block_size = 1_000 // downsampling_factor
-
-    print(f"Image dimensions: {pooled_image_data.shape}")
-    pre_coreset_data = np.column_stack(np.nonzero(pooled_image_data < MAX_8BIT))
-    pixel_values = pooled_image_data[pooled_image_data < MAX_8BIT]
+    original_data = downsample_opencv(str(in_path), downsampling_factor)
+
+    block_size = 1_000 // (downsampling_factor**2)
+
+    print(f"Image dimensions: {original_data.shape}")
+    pre_coreset_data = np.column_stack(np.nonzero(original_data < MAX_8BIT))
+    pixel_values = original_data[original_data < MAX_8BIT]
     pre_coreset_data = np.column_stack((pre_coreset_data, pixel_values)).astype(
         np.float32
     )
     num_data_points = pre_coreset_data.shape[0]
 
     # Request coreset points
-    coreset_size = 8_000 // downsampling_factor
+    coreset_size = 8_000 // (downsampling_factor**2)
 
     # Setup the original data object
     data = Data(pre_coreset_data)
 
     # Set the length_scale parameter of the kernel from at most 1000 samples
-    num_samples_length_scale = min(num_data_points, 1000 // downsampling_factor)
+    num_samples_length_scale = min(num_data_points, 1000 // (downsampling_factor**2))
     random_seed = 1_989
     generator = np.random.default_rng(random_seed)
     idx = generator.choice(num_data_points, num_samples_length_scale, replace=False)
@@ -159,10 +177,10 @@ def main(
     herding_solver = KernelHerding(
         coreset_size,
         kernel=herding_kernel,
-        block_size=1_000 // downsampling_factor,
+        block_size=block_size,
     )
     mapped_herding_solver = MapReduce(
-        herding_solver, leaf_size=10_000 // downsampling_factor
+        herding_solver, leaf_size=16_000 // (downsampling_factor**2)
     )
     herding_coreset, _ = eqx.filter_jit(mapped_herding_solver.reduce)(data)
     herding_weights = weights_optimiser.solve(data, herding_coreset.coreset)
@@ -171,6 +189,7 @@ def main(
     # Generate a coreset via uniform random sampling for comparison
     random_solver = RandomSample(coreset_size, sample_key, unique=True)
     random_coreset, _ = eqx.filter_jit(random_solver.reduce)(data)
+    random_weights = weights_optimiser.solve(data, random_coreset.coreset)
 
     # Define a reference kernel to use for comparisons of MMD. We'll use a normalised
     # SquaredExponentialKernel (which is also a Gaussian kernel)
@@ -196,7 +215,7 @@ def main(
     # Plot the pre-coreset image
     plt.figure(figsize=(10, 5))
     plt.subplot(1, 3, 1)
-    plt.imshow(pooled_image_data, cmap="gray")
+    plt.imshow(original_data, cmap="gray")
     plt.title("Pre-Coreset")
     plt.axis("off")
 
@@ -208,7 +227,7 @@ def main(
         -herding_coreset.coreset.data[:, 0],
         c=herding_coreset.coreset.data[:, 2],
         cmap="gray",
-        s=np.exp(2.0 * coreset_size * herding_weights).reshape(1, -1),
+        s=(5.0 * coreset_size * random_weights * downsampling_factor**2).reshape(1, -1),
         marker="h",
         alpha=0.8,
     )
@@ -222,7 +241,9 @@ def main(
         random_coreset.coreset.data[:, 1],
         -random_coreset.coreset.data[:, 0],
         c=random_coreset.coreset.data[:, 2],
-        s=1.0,
+        s=(5.0 * coreset_size * herding_weights * downsampling_factor**2).reshape(
+            1, -1
+        ),
         cmap="gray",
         marker="h",
         alpha=0.8,
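A note on the arithmetic behind patch 1: downsampling by a factor f along each
axis divides the pixel count, and hence the number of candidate coreset points,
by f**2, so the block size, coreset size, and length-scale sample budget above
are all divided by the squared factor rather than the factor itself. A minimal
sanity check of that claim, with plain array slicing standing in for
cv2.resize and a hypothetical image shape:

    import numpy as np

    image = np.zeros((1_000, 800), dtype=np.uint8)  # hypothetical image
    factor = 2

    # Crude stand-in for cv2.resize with INTER_AREA: keep every factor-th pixel.
    downsampled = image[::factor, ::factor]

    # The pixel count falls by factor**2, not by factor, which is why the
    # patch divides block_size, coreset_size, etc. by downsampling_factor**2.
    assert downsampled.size == image.size // factor**2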
From 07dbc36545065dac4beedff95f5c762df2f94ce9 Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Thu, 24 Oct 2024 14:21:09 +0100
Subject: [PATCH 2/7] fix: Use `herding_weights` and `random_weights` with the
 correct plots in david_map_reduce_weighted.py

---
 examples/david_map_reduce_weighted.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/david_map_reduce_weighted.py b/examples/david_map_reduce_weighted.py
index 4b69aaabf..1de332029 100644
--- a/examples/david_map_reduce_weighted.py
+++ b/examples/david_map_reduce_weighted.py
@@ -227,7 +227,9 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
         -herding_coreset.coreset.data[:, 0],
         c=herding_coreset.coreset.data[:, 2],
         cmap="gray",
-        s=(5.0 * coreset_size * random_weights * downsampling_factor**2).reshape(1, -1),
+        s=(5.0 * coreset_size * herding_weights * downsampling_factor**2).reshape(
+            1, -1
+        ),
         marker="h",
         alpha=0.8,
     )
@@ -241,9 +243,7 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
         random_coreset.coreset.data[:, 1],
         -random_coreset.coreset.data[:, 0],
         c=random_coreset.coreset.data[:, 2],
-        s=(5.0 * coreset_size * herding_weights * downsampling_factor**2).reshape(
-            1, -1
-        ),
+        s=(5.0 * coreset_size * random_weights * downsampling_factor**2).reshape(1, -1),
         cmap="gray",
         marker="h",
         alpha=0.8,
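Patch 2 only swaps which weight vector each scatter call receives; the sizing
rule itself is unchanged. Assuming the weight optimiser returns convex weights
(non-negative, summing to one), the 5.0 * coreset_size prefactor pins the mean
marker area at 5 whatever the coreset size. A standalone sketch of that rule
with made-up data, not coreax output:

    import matplotlib.pyplot as plt
    import numpy as np

    rng = np.random.default_rng(0)
    points = rng.uniform(size=(200, 2))
    weights = rng.dirichlet(np.ones(200))  # non-negative, sums to one

    # Marker area proportional to weight; the 5.0 * n prefactor fixes the
    # mean area at 5 regardless of the number of points.
    sizes = 5.0 * len(weights) * weights
    plt.scatter(points[:, 0], points[:, 1], s=sizes, marker="h", alpha=0.8)
    plt.show()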
From ff209ddf5f59db9663aeea4225689f58bc136340 Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Mon, 4 Nov 2024 16:50:55 +0000
Subject: [PATCH 3/7] fix: Increase the difference between the smallest and the
 largest marker sizes

---
 examples/david_map_reduce_weighted.py | 32 ++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/examples/david_map_reduce_weighted.py b/examples/david_map_reduce_weighted.py
index 1de332029..9ff98a34e 100644
--- a/examples/david_map_reduce_weighted.py
+++ b/examples/david_map_reduce_weighted.py
@@ -189,7 +189,6 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
     # Generate a coreset via uniform random sampling for comparison
     random_solver = RandomSample(coreset_size, sample_key, unique=True)
     random_coreset, _ = eqx.filter_jit(random_solver.reduce)(data)
-    random_weights = weights_optimiser.solve(data, random_coreset.coreset)
 
     # Define a reference kernel to use for comparisons of MMD. We'll use a normalised
     # SquaredExponentialKernel (which is also a Gaussian kernel)
@@ -211,6 +210,29 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
     print(f"Random sampling coreset MMD: {random_mmd}")
     print(f"Herding coreset MMD: {herding_mmd}")
 
+    def transform_marker_size(
+        weights, scale_factor=15, min_size=4 * downsampling_factor
+    ):
+        # Define threshold percentiles
+        lower_percentile, upper_percentile = 1, 99
+
+        # Clip weights to reduce the effect of outliers
+        clipped_weights = np.clip(
+            weights,
+            np.percentile(weights, lower_percentile),
+            np.percentile(weights, upper_percentile),
+        )
+
+        # Normalize weights to a [0, 1] range
+        normalized_weights = (clipped_weights - clipped_weights.min()) / (
+            clipped_weights.max() - clipped_weights.min()
+        )
+
+        # Apply exponential scaling to get the desired spread
+        transformed_sizes = min_size + (scale_factor**normalized_weights - 1) * min_size
+
+        return transformed_sizes
+
     print("Plotting")
     # Plot the pre-coreset image
     plt.figure(figsize=(10, 5))
@@ -227,9 +249,7 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
         -herding_coreset.coreset.data[:, 0],
         c=herding_coreset.coreset.data[:, 2],
         cmap="gray",
-        s=(5.0 * coreset_size * herding_weights * downsampling_factor**2).reshape(
-            1, -1
-        ),
+        s=(transform_marker_size(herding_weights)).reshape(1, -1),
         marker="h",
         alpha=0.8,
     )
@@ -241,7 +263,7 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
         random_coreset.coreset.data[:, 1],
         -random_coreset.coreset.data[:, 0],
         c=random_coreset.coreset.data[:, 2],
-        s=(5.0 * coreset_size * random_weights * downsampling_factor**2).reshape(1, -1),
+        s=25 * downsampling_factor,
         cmap="gray",
         marker="h",
         alpha=0.8,
@@ -269,4 +289,4 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
 
 
 if __name__ == "__main__":
-    main()
+    main(out_path=Path("data/david_coreset_2.png"))

From 1f70b12bcadf18fa5a930238b093bcc23a464f40 Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Mon, 4 Nov 2024 17:10:29 +0000
Subject: [PATCH 4/7] chore: Add docstring and type hints

---
 examples/david_map_reduce_weighted.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/examples/david_map_reduce_weighted.py b/examples/david_map_reduce_weighted.py
index 9ff98a34e..f6893adba 100644
--- a/examples/david_map_reduce_weighted.py
+++ b/examples/david_map_reduce_weighted.py
@@ -211,8 +211,18 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
     print(f"Herding coreset MMD: {herding_mmd}")
 
     def transform_marker_size(
-        weights, scale_factor=15, min_size=4 * downsampling_factor
-    ):
+        weights: np.ndarray,
+        scale_factor: int = 15,
+        min_size: int = 4 * downsampling_factor,
+    ) -> np.ndarray:
+        """
+        Transform coreset weights to marker sizes for plotting.
+
+        :param weights: Array of coreset weights to be transformed.
+        :param scale_factor: Ratio of the largest to the smallest marker size.
+        :param min_size: Smallest marker size.
+        :return: Array of transformed marker sizes for plotting.
+        """
         # Define threshold percentiles
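The transform introduced in patch 3 and documented in patch 4 can be checked in
isolation: after percentile clipping and min-max normalisation, a weight at the
bottom of the range maps to min_size (since scale_factor**0 - 1 = 0) and one at
the top maps to scale_factor * min_size, so scale_factor really is the
largest-to-smallest size ratio. A self-contained sketch, with fixed defaults in
place of the closure over downsampling_factor:

    import numpy as np

    def transform_marker_size(weights, scale_factor=15, min_size=4):
        # Clip to the 1st-99th percentile band so outliers cannot dominate.
        clipped = np.clip(
            weights, np.percentile(weights, 1), np.percentile(weights, 99)
        )
        # Min-max normalise to [0, 1]; assumes the clipped weights are not all equal.
        normalized = (clipped - clipped.min()) / (clipped.max() - clipped.min())
        # Exponential spread from min_size up to scale_factor * min_size.
        return min_size + (scale_factor**normalized - 1) * min_size

    weights = np.random.default_rng(0).dirichlet(np.ones(1_000))
    print(transform_marker_size(weights).min())  # -> 4.0
    print(transform_marker_size(weights).max())  # -> 60.0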
From ea2cc843be768db84a7d48d42790aabe62d96a58 Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Tue, 5 Nov 2024 14:12:19 +0000
Subject: [PATCH 5/7] chore: Add changes made to the David example to
 CHANGELOG.md

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b8ef41fc7..6acaa8b24 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,7 +19,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   points in the coreset.(https://github.com/gchq/coreax/pull/836)
 
 ### Changed
--
+- Adjusted `examples.david_map_reduce_weighted`: increased leaf size to
+  double the coreset size, and set marker sizes of the random and herding
+  coresets to a common scale for fairer comparison.
 
 ### Removed
 -

From b0c7ab177074e71e5cc3e8552536c30fc2f981ae Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Wed, 6 Nov 2024 10:45:13 +0000
Subject: [PATCH 6/7] chore: Link the PR to the entry in CHANGELOG.md

---
 CHANGELOG.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6acaa8b24..c785840a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,11 +17,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   correctly.(https://github.com/gchq/coreax/pull/825)
 - `RPCholesky.reduce` in `coreax.solvers.coresubset` now does not produce duplicate
   points in the coreset.(https://github.com/gchq/coreax/pull/836)
+- Fixed the example `examples.david_map_reduce_weighted` so that it no longer errors if
+  downsampling is enabled and it runs faster, set marker sizes of the random and herding
+  coresets to a common scale for fairer comparison.(https://github.com/gchq/coreax/pull/821)
 
 ### Changed
-- Adjusted `examples.david_map_reduce_weighted`: increased leaf size to
-  double the coreset size, and set marker sizes of the random and herding
-  coresets to a common scale for fairer comparison.
+-
 
 ### Removed
 -

From bf68a303b5215820b46b25f09e9f086ba9ce4c6f Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Wed, 6 Nov 2024 11:13:17 +0000
Subject: [PATCH 7/7] chore: Reword the entry in CHANGELOG.md

---
 CHANGELOG.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c785840a6..2e82b5360 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,9 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   correctly.(https://github.com/gchq/coreax/pull/825)
 - `RPCholesky.reduce` in `coreax.solvers.coresubset` now does not produce duplicate
   points in the coreset.(https://github.com/gchq/coreax/pull/836)
-- Fixed the example `examples.david_map_reduce_weighted` so that it no longer errors if
-  downsampling is enabled and it runs faster, set marker sizes of the random and herding
-  coresets to a common scale for fairer comparison.(https://github.com/gchq/coreax/pull/821)
+- Fixed the example `examples.david_map_reduce_weighted` to prevent errors when
+  downsampling is enabled, and to make it run faster.(https://github.com/gchq/coreax/pull/821)
 
 ### Changed
 -