Skip to content

Commit

Permalink
fix clustering in rs
Browse files Browse the repository at this point in the history
  • Loading branch information
vdelacruzb committed Apr 16, 2024
1 parent f871201 commit 0600b89
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 7 deletions.
59 changes: 56 additions & 3 deletions clouds/redshift/libraries/python/lib/clustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,66 @@ def remove_duplicated_coords(arr):
unique_rows.append(row)
return np.array(unique_rows)

def reorder_coords(coords):
    """Reorder coordinates so first occurrences precede repeated ones.

    The rows of ``coords`` are split into two groups: the first occurrence
    of every distinct coordinate, and every later repetition of an already
    seen coordinate. Each group is sorted lexicographically (first column,
    then second, ...) and the groups are concatenated, first occurrences
    leading.

    NOTE(review): the apparent intent is that a consumer taking the first
    ``k`` rows (with ``k`` no larger than the number of distinct rows)
    receives ``k`` distinct points -- confirm against the KMeans caller.

    Args:
        coords: iterable of coordinate rows (e.g. an ``(n, d)`` NumPy array).

    Returns:
        A NumPy array holding the reordered rows, or ``coords`` itself,
        unchanged, when it contains no elements.
    """
    import numpy as np

    def _lex_sorted(rows):
        # np.rot90 turns the columns into key rows in reversed order, and
        # np.lexsort treats its LAST key as the primary one, so the rows end
        # up ordered by column 0 first, then column 1, and so on.
        return rows[np.lexsort(np.rot90(rows))]

    seen = set()
    first_occurrences = []
    repeats = []
    for coord in coords:
        key = tuple(coord)  # rows must be hashable to be tracked in a set
        if key in seen:
            repeats.append(coord)
        else:
            seen.add(key)
            first_occurrences.append(coord)

    first_occurrences = np.array(first_occurrences)
    if first_occurrences.size == 0:
        # Nothing to sort. A repeat can only exist after a first occurrence,
        # so there are no repeated rows to handle here either.
        return coords

    ordered = _lex_sorted(first_occurrences)

    repeats = np.array(repeats)
    if repeats.size == 0:
        return ordered
    return np.concatenate((ordered, _lex_sorted(repeats)))

def count_distinct_coords(coords):
    """Return the number of distinct coordinate rows in ``coords``.

    Each row is converted to a tuple so that it can be hashed; two rows are
    considered the same coordinate when their tuples compare equal.

    Args:
        coords: iterable of coordinate rows (lists, tuples or array rows).

    Returns:
        The count of distinct rows; 0 for an empty input.
    """
    # Only the number of distinct keys matters, so a set replaces the former
    # per-coordinate occurrence counter (and the unused numpy import).
    return len({tuple(coord) for coord in coords})

def clusterkmeanstable(geom, k):
from .kmeans import KMeans
import json
import numpy as np

geom = load_geom(geom)
points = geom['_coords']
coords = np.array(
coords = reorder_coords(np.array(
[[points[i], points[i + 1]] for i in range(0, len(points) - 1, 2)]
)
))
# k cannot be greater than the number of distinct coordinates
k = min(k, count_distinct_coords(coords))

cluster_idxs, centers, loss = KMeans()(coords, k)

Expand All @@ -46,7 +96,10 @@ def clusterkmeans(geom, k):
if geom.type != 'MultiPoint':
raise Exception('Invalid operation: Input points parameter must be MultiPoint.')
else:
coords = remove_duplicated_coords(np.array(list(geojson.utils.coords(geom))))
coords = reorder_coords(np.array(list(geojson.utils.coords(geom))))
# k cannot be greater than the number of distinct coordinates
k = min(k, count_distinct_coords(coords))

cluster_idxs, centers, loss = KMeans()(coords, k)
return geojson.dumps(
[
Expand Down
4 changes: 3 additions & 1 deletion clouds/redshift/libraries/python/lib/clustering/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,9 @@ def __call__(
cluster centers: k x d numpy array, the centers
loss: final loss value of the objective function of KMeans
"""
centers = self._init_centers(points, k, **kwargs)
# centers = self._init_centers(points, k, **kwargs)
# instead of using random initialization, we will use the first k points
centers = points[:k]
prev_loss = 0
for it in range(max_iters):
cluster_idx = self._update_assignment(centers, points)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}}]
[{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
[{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
[{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}}]
[{"cluster":0,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}}]
[{"cluster":0,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[5.0,0.0]}}]

0 comments on commit 0600b89

Please sign in to comment.