From a4cbc802a241857972da9481df1640d1cc88f005 Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Wed, 23 Oct 2024 15:23:03 +0100
Subject: [PATCH 1/7] fix: Correct plot scaling calculation and leaf parameters

- Fix scale calculations to divide by the downsampling factor squared
- Update `MapReduce` leaf size to 16,000 (scaled by the squared downsampling
  factor)
---
 examples/david_map_reduce_weighted.py | 61 ++++++++++++++++++---------
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/examples/david_map_reduce_weighted.py b/examples/david_map_reduce_weighted.py
index 44cdbe151..4b69aaabf 100644
--- a/examples/david_map_reduce_weighted.py
+++ b/examples/david_map_reduce_weighted.py
@@ -44,7 +44,6 @@
 import jax.numpy as jnp
 import matplotlib.pyplot as plt
 import numpy as np
-from flax import linen
 from jax import random
 
 from coreax import (
@@ -104,32 +103,51 @@ def main(
     if out_path is not None and not out_path.is_absolute():
         out_path = Path(__file__).parent.joinpath(out_path)
 
+    def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
+        """
+        Downsample an image using :func:`cv2.resize` and convert it to grayscale.
+
+        :param image_path: Path to the input image file.
+        :param downsampling_factor: Factor by which to downsample the image.
+        :return: Grayscale image after downsampling.
+        """
+        img = cv2.imread(image_path)
+
+        # Calculate new dimensions based on the downsampling factor
+        scale_factor = 1 / downsampling_factor
+        width = int(img.shape[1] * scale_factor)
+        height = int(img.shape[0] * scale_factor)
+        dim = (width, height)
+
+        # Resize using INTER_AREA for better downsampling
+        resized = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
+
+        # Convert to grayscale after resizing
+        grayscale_resized = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
+
+        return grayscale_resized
+
     # Path to original image
-    original_data = cv2.imread(str(in_path))
-    image_data = np.asarray(cv2.cvtColor(original_data, cv2.COLOR_BGR2GRAY))
-    # Pool/downsample the image
-    window_shape = (downsampling_factor, downsampling_factor)
-    pooled_image_data = linen.avg_pool(
-        image_data[..., None], window_shape, strides=window_shape
-    )[..., 0]
-    block_size = 1_000 // downsampling_factor
-
-    print(f"Image dimensions: {pooled_image_data.shape}")
-    pre_coreset_data = np.column_stack(np.nonzero(pooled_image_data < MAX_8BIT))
-    pixel_values = pooled_image_data[pooled_image_data < MAX_8BIT]
+    original_data = downsample_opencv(str(in_path), downsampling_factor)
+
+    block_size = 1_000 // (downsampling_factor**2)
+
+    print(f"Image dimensions: {original_data.shape}")
+    pre_coreset_data = np.column_stack(np.nonzero(original_data < MAX_8BIT))
+    pixel_values = original_data[original_data < MAX_8BIT]
     pre_coreset_data = np.column_stack((pre_coreset_data, pixel_values)).astype(
         np.float32
     )
     num_data_points = pre_coreset_data.shape[0]
 
     # Request coreset points
-    coreset_size = 8_000 // downsampling_factor
+    coreset_size = 8_000 // (downsampling_factor**2)
 
     # Setup the original data object
     data = Data(pre_coreset_data)
 
     # Set the length_scale parameter of the kernel from at most 1000 samples
-    num_samples_length_scale = min(num_data_points, 1000 // downsampling_factor)
+    num_samples_length_scale = min(num_data_points, 1000 // (downsampling_factor**2))
     random_seed = 1_989
     generator = np.random.default_rng(random_seed)
     idx = generator.choice(num_data_points, num_samples_length_scale, replace=False)
@@ -159,10 +177,10 @@ def main(
     herding_solver = KernelHerding(
         coreset_size,
         kernel=herding_kernel,
-        block_size=1_000 // downsampling_factor,
+        block_size=block_size,
     )
     mapped_herding_solver = MapReduce(
-        herding_solver, leaf_size=10_000 // downsampling_factor
+        herding_solver, leaf_size=16_000 // (downsampling_factor**2)
     )
     herding_coreset, _ = eqx.filter_jit(mapped_herding_solver.reduce)(data)
     herding_weights = weights_optimiser.solve(data, herding_coreset.coreset)
@@ -171,6 +189,7 @@ def main(
     # Generate a coreset via uniform random sampling for comparison
     random_solver = RandomSample(coreset_size, sample_key, unique=True)
     random_coreset, _ = eqx.filter_jit(random_solver.reduce)(data)
+    random_weights = weights_optimiser.solve(data, random_coreset.coreset)
 
     # Define a reference kernel to use for comparisons of MMD. We'll use a normalised
     # SquaredExponentialKernel (which is also a Gaussian kernel)
@@ -196,7 +215,7 @@ def main(
     # Plot the pre-coreset image
     plt.figure(figsize=(10, 5))
     plt.subplot(1, 3, 1)
-    plt.imshow(pooled_image_data, cmap="gray")
+    plt.imshow(original_data, cmap="gray")
     plt.title("Pre-Coreset")
     plt.axis("off")
 
@@ -208,7 +227,7 @@ def main(
         -herding_coreset.coreset.data[:, 0],
         c=herding_coreset.coreset.data[:, 2],
         cmap="gray",
-        s=np.exp(2.0 * coreset_size * herding_weights).reshape(1, -1),
+        s=(5.0 * coreset_size * random_weights * downsampling_factor**2).reshape(1, -1),
         marker="h",
         alpha=0.8,
     )
@@ -222,7 +241,9 @@ def main(
         random_coreset.coreset.data[:, 1],
         -random_coreset.coreset.data[:, 0],
         c=random_coreset.coreset.data[:, 2],
-        s=1.0,
+        s=(5.0 * coreset_size * herding_weights * downsampling_factor**2).reshape(
+            1, -1
+        ),
         cmap="gray",
         marker="h",
         alpha=0.8,
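A note on the arithmetic behind patch 1: downsampling by a factor f along each
axis divides the pixel count, and hence the number of candidate coreset points,
by f**2, so the block size, coreset size, and length-scale sample budget above
are all divided by the squared factor rather than the factor itself. A minimal
sanity check of that claim, with plain array slicing standing in for
cv2.resize and a hypothetical image shape:

    import numpy as np

    image = np.zeros((1_000, 800), dtype=np.uint8)  # hypothetical image
    factor = 2

    # Crude stand-in for cv2.resize with INTER_AREA: keep every factor-th pixel.
    downsampled = image[::factor, ::factor]

    # The pixel count falls by factor**2, not by factor, which is why the
    # patch divides block_size, coreset_size, etc. by downsampling_factor**2.
    assert downsampled.size == image.size // factor**2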
From 07dbc36545065dac4beedff95f5c762df2f94ce9 Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Thu, 24 Oct 2024 14:21:09 +0100
Subject: [PATCH 2/7] fix: Use `herding_weights` and `random_weights` with the
 correct plots in david_map_reduce_weighted.py

---
 examples/david_map_reduce_weighted.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/examples/david_map_reduce_weighted.py b/examples/david_map_reduce_weighted.py
index 4b69aaabf..1de332029 100644
--- a/examples/david_map_reduce_weighted.py
+++ b/examples/david_map_reduce_weighted.py
@@ -227,7 +227,9 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
         -herding_coreset.coreset.data[:, 0],
         c=herding_coreset.coreset.data[:, 2],
         cmap="gray",
-        s=(5.0 * coreset_size * random_weights * downsampling_factor**2).reshape(1, -1),
+        s=(5.0 * coreset_size * herding_weights * downsampling_factor**2).reshape(
+            1, -1
+        ),
         marker="h",
         alpha=0.8,
     )
@@ -241,9 +243,7 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
         random_coreset.coreset.data[:, 1],
         -random_coreset.coreset.data[:, 0],
         c=random_coreset.coreset.data[:, 2],
-        s=(5.0 * coreset_size * herding_weights * downsampling_factor**2).reshape(
-            1, -1
-        ),
+        s=(5.0 * coreset_size * random_weights * downsampling_factor**2).reshape(1, -1),
         cmap="gray",
         marker="h",
         alpha=0.8,
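Patch 2 only swaps which weight vector each scatter call receives; the sizing
rule itself is unchanged. Assuming the weight optimiser returns convex weights
(non-negative, summing to one), the 5.0 * coreset_size prefactor pins the mean
marker area at 5 whatever the coreset size. A standalone sketch of that rule
with made-up data, not coreax output:

    import matplotlib.pyplot as plt
    import numpy as np

    rng = np.random.default_rng(0)
    points = rng.uniform(size=(200, 2))
    weights = rng.dirichlet(np.ones(200))  # non-negative, sums to one

    # Marker area proportional to weight; the 5.0 * n prefactor fixes the
    # mean area at 5 regardless of the number of points.
    sizes = 5.0 * len(weights) * weights
    plt.scatter(points[:, 0], points[:, 1], s=sizes, marker="h", alpha=0.8)
    plt.show()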
From ff209ddf5f59db9663aeea4225689f58bc136340 Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Mon, 4 Nov 2024 16:50:55 +0000
Subject: [PATCH 3/7] fix: Increase the difference between the smallest and the
 largest marker sizes

---
 examples/david_map_reduce_weighted.py | 32 ++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/examples/david_map_reduce_weighted.py b/examples/david_map_reduce_weighted.py
index 1de332029..9ff98a34e 100644
--- a/examples/david_map_reduce_weighted.py
+++ b/examples/david_map_reduce_weighted.py
@@ -189,7 +189,6 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
     # Generate a coreset via uniform random sampling for comparison
     random_solver = RandomSample(coreset_size, sample_key, unique=True)
     random_coreset, _ = eqx.filter_jit(random_solver.reduce)(data)
-    random_weights = weights_optimiser.solve(data, random_coreset.coreset)
 
     # Define a reference kernel to use for comparisons of MMD. We'll use a normalised
     # SquaredExponentialKernel (which is also a Gaussian kernel)
@@ -211,6 +210,29 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
     print(f"Random sampling coreset MMD: {random_mmd}")
     print(f"Herding coreset MMD: {herding_mmd}")
 
+    def transform_marker_size(
+        weights, scale_factor=15, min_size=4 * downsampling_factor
+    ):
+        # Define threshold percentiles
+        lower_percentile, upper_percentile = 1, 99
+
+        # Clip weights to reduce the effect of outliers
+        clipped_weights = np.clip(
+            weights,
+            np.percentile(weights, lower_percentile),
+            np.percentile(weights, upper_percentile),
+        )
+
+        # Normalize weights to a [0, 1] range
+        normalized_weights = (clipped_weights - clipped_weights.min()) / (
+            clipped_weights.max() - clipped_weights.min()
+        )
+
+        # Apply exponential scaling to get the desired spread
+        transformed_sizes = min_size + (scale_factor**normalized_weights - 1) * min_size
+
+        return transformed_sizes
+
     print("Plotting")
     # Plot the pre-coreset image
     plt.figure(figsize=(10, 5))
@@ -227,9 +249,7 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
         -herding_coreset.coreset.data[:, 0],
         c=herding_coreset.coreset.data[:, 2],
         cmap="gray",
-        s=(5.0 * coreset_size * herding_weights * downsampling_factor**2).reshape(
-            1, -1
-        ),
+        s=(transform_marker_size(herding_weights)).reshape(1, -1),
         marker="h",
         alpha=0.8,
     )
@@ -241,7 +263,7 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
         random_coreset.coreset.data[:, 1],
         -random_coreset.coreset.data[:, 0],
         c=random_coreset.coreset.data[:, 2],
-        s=(5.0 * coreset_size * random_weights * downsampling_factor**2).reshape(1, -1),
+        s=25 * downsampling_factor,
         cmap="gray",
         marker="h",
         alpha=0.8,
@@ -269,4 +289,4 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
 
 
 if __name__ == "__main__":
-    main()
+    main(out_path=Path("data/david_coreset_2.png"))

From 1f70b12bcadf18fa5a930238b093bcc23a464f40 Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Mon, 4 Nov 2024 17:10:29 +0000
Subject: [PATCH 4/7] chore: Add docstring and type hints

---
 examples/david_map_reduce_weighted.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/examples/david_map_reduce_weighted.py b/examples/david_map_reduce_weighted.py
index 9ff98a34e..f6893adba 100644
--- a/examples/david_map_reduce_weighted.py
+++ b/examples/david_map_reduce_weighted.py
@@ -211,8 +211,18 @@ def downsample_opencv(image_path: str, downsampling_factor: int) -> np.ndarray:
     print(f"Herding coreset MMD: {herding_mmd}")
 
     def transform_marker_size(
-        weights, scale_factor=15, min_size=4 * downsampling_factor
-    ):
+        weights: np.ndarray,
+        scale_factor: int = 15,
+        min_size: int = 4 * downsampling_factor,
+    ) -> np.ndarray:
+        """
+        Transform coreset weights to marker sizes for plotting.
+
+        :param weights: Array of coreset weights to be transformed.
+        :param scale_factor: Ratio of the largest to the smallest marker size.
+        :param min_size: Smallest marker size.
+        :return: Array of transformed marker sizes for plotting.
+        """
         # Define threshold percentiles
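The transform introduced in patch 3 and documented in patch 4 can be checked in
isolation: after percentile clipping and min-max normalisation, a weight at the
bottom of the range maps to min_size (since scale_factor**0 - 1 = 0) and one at
the top maps to scale_factor * min_size, so scale_factor really is the
largest-to-smallest size ratio. A self-contained sketch, with fixed defaults in
place of the closure over downsampling_factor:

    import numpy as np

    def transform_marker_size(weights, scale_factor=15, min_size=4):
        # Clip to the 1st-99th percentile band so outliers cannot dominate.
        clipped = np.clip(
            weights, np.percentile(weights, 1), np.percentile(weights, 99)
        )
        # Min-max normalise to [0, 1]; assumes the clipped weights are not all equal.
        normalized = (clipped - clipped.min()) / (clipped.max() - clipped.min())
        # Exponential spread from min_size up to scale_factor * min_size.
        return min_size + (scale_factor**normalized - 1) * min_size

    weights = np.random.default_rng(0).dirichlet(np.ones(1_000))
    print(transform_marker_size(weights).min())  # -> 4.0
    print(transform_marker_size(weights).max())  # -> 60.0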
From ea2cc843be768db84a7d48d42790aabe62d96a58 Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Tue, 5 Nov 2024 14:12:19 +0000
Subject: [PATCH 5/7] chore: Add changes made to the David example to
 CHANGELOG.md

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b8ef41fc7..6acaa8b24 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -19,7 +19,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   points in the coreset.(https://github.com/gchq/coreax/pull/836)
 
 ### Changed
--
+- Adjusted `examples.david_map_reduce_weighted`: increased leaf size to
+  double the coreset size, and set marker sizes of the random and herding
+  coresets to a common scale for fairer comparison.
 
 ### Removed
 -

From b0c7ab177074e71e5cc3e8552536c30fc2f981ae Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Wed, 6 Nov 2024 10:45:13 +0000
Subject: [PATCH 6/7] chore: Link the PR to the entry in CHANGELOG.md

---
 CHANGELOG.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6acaa8b24..c785840a6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,11 +17,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   correctly.(https://github.com/gchq/coreax/pull/825)
 - `RPCholesky.reduce` in `coreax.solvers.coresubset` now does not produce duplicate
   points in the coreset.(https://github.com/gchq/coreax/pull/836)
+- Fixed the example `examples.david_map_reduce_weighted` so that it no longer errors if
+  downsampling is enabled and it runs faster, set marker sizes of the random and herding
+  coresets to a common scale for fairer comparison.(https://github.com/gchq/coreax/pull/821)
 
 ### Changed
-- Adjusted `examples.david_map_reduce_weighted`: increased leaf size to
-  double the coreset size, and set marker sizes of the random and herding
-  coresets to a common scale for fairer comparison.
+-
 
 ### Removed
 -

From bf68a303b5215820b46b25f09e9f086ba9ce4c6f Mon Sep 17 00:00:00 2001
From: qh681248 <181246904+qh681248@users.noreply.github.com>
Date: Wed, 6 Nov 2024 11:13:17 +0000
Subject: [PATCH 7/7] chore: Reword the entry in CHANGELOG.md

---
 CHANGELOG.md | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c785840a6..2e82b5360 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -17,9 +17,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
   correctly.(https://github.com/gchq/coreax/pull/825)
 - `RPCholesky.reduce` in `coreax.solvers.coresubset` now does not produce duplicate
   points in the coreset.(https://github.com/gchq/coreax/pull/836)
-- Fixed the example `examples.david_map_reduce_weighted` so that it no longer errors if
-  downsampling is enabled and it runs faster, set marker sizes of the random and herding
-  coresets to a common scale for fairer comparison.(https://github.com/gchq/coreax/pull/821)
+- Fixed the example `examples.david_map_reduce_weighted` to prevent errors when
+  downsampling is enabled, and to make it run faster.(https://github.com/gchq/coreax/pull/821)
 
 ### Changed
 -