# Helpers introduced by patch 0600b89915f9823a4e9059c4eba0e3910ba0e196
# ("fix clustering in rs") for
# clouds/redshift/libraries/python/lib/clustering/__init__.py.
#
# Context visible in the same patch (partial hunks only):
#   * clusterkmeanstable / clusterkmeans now do
#         coords = reorder_coords(np.array(...))
#         # k cannot be greater than the number of distinct coordinates
#         k = min(k, count_distinct_coords(coords))
#         cluster_idxs, centers, loss = KMeans()(coords, k)
#   * KMeans.__call__ (kmeans.py) replaces
#         centers = self._init_centers(points, k, **kwargs)
#     with
#         centers = points[:k]
# Because the first k rows become the initial centers, placing one copy of
# every distinct coordinate at the front (and clamping k to the distinct
# count) keeps those initial centers distinct.


def _lexsorted(rows):
    """Return the rows of a 2-D array sorted lexicographically by column."""
    import numpy as np

    # np.lexsort treats its LAST key as the primary one, so the columns are
    # handed over in reverse order to make column 0 the most significant.
    return rows[np.lexsort(rows.T[::-1])]


def reorder_coords(coords):
    """Move one copy of every distinct coordinate ahead of its repeats.

    The rows are split into first occurrences and repeats (any later row
    equal to an earlier one). Each group is sorted lexicographically and the
    result is the sorted first occurrences followed by the sorted repeats.

    Args:
        coords: iterable of coordinate rows, e.g. an n x d numpy array.

    Returns:
        A numpy array holding the same rows in the new order. When there are
        no rows to reorder the input object itself is returned unchanged.
    """
    import numpy as np

    seen = set()
    first_seen = []
    repeats = []
    for coord in coords:
        key = tuple(coord)  # rows must be hashable to be remembered
        if key in seen:
            repeats.append(coord)
        else:
            seen.add(key)
            first_seen.append(coord)

    unique_rows = np.array(first_seen)
    if unique_rows.size == 0:
        # Nothing to sort; a repeat cannot exist without a first occurrence,
        # so there is no second group to consider either.
        return coords

    unique_sorted = _lexsorted(unique_rows)
    repeated_rows = np.array(repeats)
    if repeated_rows.size == 0:
        return unique_sorted
    return np.concatenate((unique_sorted, _lexsorted(repeated_rows)))


def count_distinct_coords(coords):
    """Return how many different coordinate rows `coords` contains.

    Args:
        coords: iterable of coordinate rows, e.g. an n x d numpy array.

    Returns:
        The number of distinct rows as an int (0 for empty input).
    """
    return len({tuple(coord) for coord in coords})
--git a/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt b/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt index 11dae73d3..ac6d1e166 100644 --- a/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt +++ b/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt @@ -1,3 +1,3 @@ -[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}}] -[{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}}] 
-[{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}}] \ No newline at end of file +[{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}}] 
+[{"cluster":0,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}}] 
+[{"cluster":0,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[5.0,0.0]}}]