CartoDB · vdelacruzb · Apr 18, 2024 · Apr 16, 2024 · Apr 16, 2024 · Apr 16, 2024
diff --git a/clouds/bigquery/libraries/javascript/src/clustering.js b/clouds/bigquery/libraries/javascript/src/clustering.js
@@ -1,7 +1,35 @@
 import { featureCollection, feature, clustersKmeans } from '@turf/turf';
 
+/**
+ * Reorders `arr` so that one copy of every distinct value comes first and
+ * the repeated copies come last; each group is sorted with `Array#sort()`.
+ * The input array is not modified.
+ *
+ * @param {string[]} arr - Values to reorder (callers pass JSON strings).
+ * @returns {string[]} New array holding the same items as `arr`.
+ */
+function prioritizeDistinctSort(arr) {
+    // A Set is used instead of a plain object: object lookups also match
+    // inherited keys such as 'constructor' or '__proto__', which made the
+    // first occurrence of those values look like a duplicate.
+    const seen = new Set();
+    const firstOccurrences = [];
+    const repeatedOccurrences = [];
+
+    for (const item of arr) {
+        if (seen.has(item)) {
+            repeatedOccurrences.push(item);
+        } else {
+            seen.add(item);
+            firstOccurrences.push(item);
+        }
+    }
+    return [...firstOccurrences.sort(), ...repeatedOccurrences.sort()];
+}
+
 export default {
     featureCollection,
     feature,
-    clustersKmeans
+    clustersKmeans,
+    prioritizeDistinctSort
 };
diff --git a/clouds/bigquery/libraries/javascript/test/clustering.test.js b/clouds/bigquery/libraries/javascript/test/clustering.test.js
@@ -4,4 +4,5 @@ test('clustering library defined', () => {
     expect(lib.clustering.featureCollection).toBeDefined();
     expect(lib.clustering.feature).toBeDefined();
     expect(lib.clustering.clustersKmeans).toBeDefined();
+    expect(lib.clustering.prioritizeDistinctSort).toBeDefined();
 });
diff --git a/clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md b/clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md
@@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog, numberOfClusters)
 Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.
 
 * `geog`: `ARRAY<GEOGRAPHY>` points to be clustered.
-* `numberOfClusters`: `INT64`|`NULL` numberOfClusters that will be generated. If `NULL` the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used.
-
-````hint:info
-The resulting geometries are unique. So duplicated points will be removed from the input array
-````
+* `numberOfClusters`: `INT64`|`NULL` numberOfClusters that will be generated. If `NULL`, the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used. The number of output clusters cannot be greater than the number of distinct points in `geog`.
 
 **Return type**
 

diff --git a/clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql b/clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql
@@ -19,8 +19,7 @@ AS """
         options.numberOfClusters = parseInt(Math.sqrt(geojson.length/2))
     }
     options.mutate = true;
-    geojson = Array.from(new Set(geojson));
-    const featuresCollection = lib.clustering.featureCollection(geojson.map(x => lib.clustering.feature(JSON.parse(x))));
+    const featuresCollection = lib.clustering.featureCollection(lib.clustering.prioritizeDistinctSort(geojson).map(x => lib.clustering.feature(JSON.parse(x))));
     lib.clustering.clustersKmeans(featuresCollection, options);
     const cluster = [];
     featuresCollection.features.forEach(function(item, index, array) {

diff --git a/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points1_out.js b/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points1_out.js
@@ -1,3 +1,3 @@
 module.exports = {
-    value: '[{"cluster":1,"geom":{"value":"POINT(0 0)"}},{"cluster":1,"geom":{"value":"POINT(0 1)"}},{"cluster":0,"geom":{"value":"POINT(5 0)"}},{"cluster":1,"geom":{"value":"POINT(1 0)"}}]'
+    value: '[{"cluster":1,"geom":{"value":"POINT(0 0)"}},{"cluster":1,"geom":{"value":"POINT(0 1)"}},{"cluster":1,"geom":{"value":"POINT(1 0)"}},{"cluster":0,"geom":{"value":"POINT(5 0)"}}]'
   }
diff --git a/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points2_out.js b/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points2_out.js
diff --git a/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points3_out.js b/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points3_out.js
diff --git a/clouds/redshift/libraries/python/lib/clustering/__init__.py b/clouds/redshift/libraries/python/lib/clustering/__init__.py
@@ -18,16 +18,66 @@ def remove_duplicated_coords(arr):
             unique_rows.append(row)
     return np.array(unique_rows)
 
+def reorder_coords(coords):
+    """Put one copy of every distinct coordinate first and the repeats last.
+
+    The first occurrence of each row goes to a "first" group and every later
+    occurrence to a "repeat" group. Non-empty groups are sorted
+    lexicographically (first column, then the next ones) and returned as the
+    first group followed by the repeat group.
+
+    :param coords: iterable of coordinate rows, each convertible to a tuple;
+        the callers in this module pass a 2-D NumPy array.
+    :return: NumPy array with the reordered rows, or ``coords`` itself when
+        neither group holds any element.
+    """
+    import numpy as np
+
+    seen = set()
+    firsts = []
+    repeats = []
+    for row in coords:
+        key = tuple(row)
+        if key in seen:
+            repeats.append(row)
+        else:
+            seen.add(key)
+            firsts.append(row)
+
+    # np.rot90 turns the columns into lexsort keys with the first column
+    # last, and np.lexsort uses its last key as the primary sort key.
+    groups = [
+        group[np.lexsort(np.rot90(group))]
+        for group in (np.array(firsts), np.array(repeats))
+        if group.size > 0
+    ]
+
+    if not groups:
+        return coords
+    if len(groups) == 1:
+        return groups[0]
+    return np.concatenate(groups)
+
+def count_distinct_coords(coords):
+    """Return how many distinct coordinate rows ``coords`` contains.
+
+    Rows are compared by value after converting each one to a tuple.
+    """
+    # A set keeps one entry per distinct row, so its size is the answer.
+    return len({tuple(coord) for coord in coords})
+
 def clusterkmeanstable(geom, k):
     from .kmeans import KMeans
     import json
     import numpy as np
 
     geom = load_geom(geom)
     points = geom['_coords']
-    coords = np.array(
+    coords = reorder_coords(np.array(
         [[points[i], points[i + 1]] for i in range(0, len(points) - 1, 2)]
-    )
+    ))
+    # k cannot be greater than the number of distinct coordinates
+    k = min(k, count_distinct_coords(coords))
 
     cluster_idxs, centers, loss = KMeans()(coords, k)
 
@@ -46,7 +96,10 @@ def clusterkmeans(geom, k):
     if geom.type != 'MultiPoint':
         raise Exception('Invalid operation: Input points parameter must be MultiPoint.')
     else:
-        coords = remove_duplicated_coords(np.array(list(geojson.utils.coords(geom))))
+        coords = reorder_coords(np.array(list(geojson.utils.coords(geom))))
+    # k cannot be greater than the number of distinct coordinates
+    k = min(k, count_distinct_coords(coords))
+
     cluster_idxs, centers, loss = KMeans()(coords, k)
     return geojson.dumps(
         [

diff --git a/clouds/redshift/libraries/python/lib/clustering/kmeans.py b/clouds/redshift/libraries/python/lib/clustering/kmeans.py
@@ -154,7 +154,9 @@ def __call__(
         cluster centers: k x d numpy array, the centers
         loss: final loss value of the objective function of KMeans
         """
-        centers = self._init_centers(points, k, **kwargs)
+        # centers = self._init_centers(points, k, **kwargs)
+        # instead of using random initialization, we will use the first k points
+        centers = points[:k]
         prev_loss = 0
         for it in range(max_iters):
             cluster_idx = self._update_assignment(centers, points)

diff --git a/clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md b/clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md
@@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog [, numberOfClusters])
 Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.
 
 * `geog`: `GEOMETRY` points to be clustered.
-* `numberOfClusters` (optional): `INT` number of clusters that will be generated. It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`).
-
-````hint:info
-The resulting geometries are unique. So duplicated points will be removed from the input multipoint
-````
+* `numberOfClusters` (optional): `INT` number of clusters that will be generated. It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`). The number of output clusters cannot be greater than the number of distinct points in `geog`.
 
 **Return type**
 

diff --git a/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt b/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt
@@ -1,3 +1,3 @@
-[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}}]
-[{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
-[{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
+[{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}}]
+[{"cluster":0,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}}]
+[{"cluster":0,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[5.0,0.0]}}]
diff --git a/clouds/snowflake/libraries/javascript/libs/clustering.js b/clouds/snowflake/libraries/javascript/libs/clustering.js
@@ -1,7 +1,35 @@
 import { featureCollection, feature, clustersKmeans } from '@turf/turf';
 
+/**
+ * Returns a reordered copy of `arr`: the first occurrence of each distinct
+ * value, sorted with `Array#sort()`, followed by all remaining repeated
+ * occurrences, also sorted. `arr` itself is left untouched.
+ *
+ * @param {string[]} arr - Values to reorder (callers pass JSON strings).
+ * @returns {string[]} New array with the same items as `arr`.
+ */
+function prioritizeDistinctSort(arr) {
+    // Membership is tracked in a Set rather than a plain object, because a
+    // plain-object lookup also finds inherited keys ('constructor',
+    // 'toString', ...) and would misfile those values as duplicates.
+    const seen = new Set();
+    const firstOccurrences = [];
+    const repeatedOccurrences = [];
+
+    for (const item of arr) {
+        if (seen.has(item)) {
+            repeatedOccurrences.push(item);
+        } else {
+            seen.add(item);
+            firstOccurrences.push(item);
+        }
+    }
+    return [...firstOccurrences.sort(), ...repeatedOccurrences.sort()];
+}
+
 export default {
     featureCollection,
     feature,
-    clustersKmeans
+    clustersKmeans,
+    prioritizeDistinctSort
 };
diff --git a/clouds/snowflake/libraries/javascript/test/clustering.test.js b/clouds/snowflake/libraries/javascript/test/clustering.test.js
@@ -4,4 +4,5 @@ test('clustering library defined', () => {
     expect(clusteringLib.featureCollection).toBeDefined();
     expect(clusteringLib.feature).toBeDefined();
     expect(clusteringLib.clustersKmeans).toBeDefined();
+    expect(clusteringLib.prioritizeDistinctSort).toBeDefined();
 });
diff --git a/clouds/snowflake/modules/doc/clustering/ST_CLUSTERKMEANS.md b/clouds/snowflake/modules/doc/clustering/ST_CLUSTERKMEANS.md
@@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog [, numberOfClusters])
 Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.
 
 * `geojsons`: `ARRAY` points to be clustered.
-* `numberOfClusters` (optional): `INT` numberOfClusters that will be generated. By default `numberOfClusters` is `Math.sqrt(<NUMBER OF POINTS>/2)`.
-
-````hint:info
-The resulting geometries are unique. So duplicated points will be removed from the input array
-````
+* `numberOfClusters` (optional): `INT` numberOfClusters that will be generated. By default `numberOfClusters` is `Math.sqrt(<NUMBER OF POINTS>/2)`. The number of output clusters cannot be greater than the number of distinct points in `geojsons`.
 
 **Return type**
 

diff --git a/clouds/snowflake/modules/sql/clustering/ST_CLUSTERKMEANS.sql b/clouds/snowflake/modules/sql/clustering/ST_CLUSTERKMEANS.sql
@@ -17,8 +17,7 @@ AS $$
     const options = {};
     options.numberOfClusters = Number(NUMBEROFCLUSTERS);
     options.mutate = true;
-    GEOJSONS = Array.from(new Set(GEOJSONS));
-    const featuresCollection = clusteringLib.featureCollection(GEOJSONS.map(x => clusteringLib.feature(JSON.parse(x))));
+    const featuresCollection = clusteringLib.featureCollection(clusteringLib.prioritizeDistinctSort(GEOJSONS).map(x => clusteringLib.feature(JSON.parse(x))));
     clusteringLib.clustersKmeans(featuresCollection, options);
     const cluster = [];
     featuresCollection.features.forEach(function(item, index, array) {

diff --git a/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points1.js b/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points1.js