fix(bq,sf,rs|clustering): improve how ST_CLUSTERKMEANS deals with duplicates (#495)
CartoDB · Apr 18, 2024 · 8a29098 · 8a29098
1 parent d51021b
commit 8a29098
Show file tree

Hide file tree

Showing 20 changed files with 182 additions and 36 deletions.
diff --git a/clouds/bigquery/libraries/javascript/src/clustering.js b/clouds/bigquery/libraries/javascript/src/clustering.js
@@ -1,7 +1,29 @@
 import { featureCollection, feature, clustersKmeans } from '@turf/turf';
 
+function prioritizeDistinctSort (arr) {
+    const uniqueValues = [];
+    const duplicatedValues = [];
+
+    // Split the array into unique and duplicated values
+    const countMap = {};
+    for (const item of arr) {
+        if (countMap[item] === undefined) {
+            countMap[item] = 1;
+            uniqueValues.push(item);
+        } else {
+            countMap[item]++;
+            duplicatedValues.push(item);
+        }
+    }
+
+    // Concatenate unique and duplicated values
+    const result = [...uniqueValues, ...duplicatedValues];
+    return result;
+}
+
 export default {
     featureCollection,
     feature,
-    clustersKmeans
+    clustersKmeans,
+    prioritizeDistinctSort
 };
diff --git a/clouds/bigquery/libraries/javascript/test/clustering.test.js b/clouds/bigquery/libraries/javascript/test/clustering.test.js
@@ -4,4 +4,5 @@ test('clustering library defined', () => {
     expect(lib.clustering.featureCollection).toBeDefined();
     expect(lib.clustering.feature).toBeDefined();
     expect(lib.clustering.clustersKmeans).toBeDefined();
+    expect(lib.clustering.prioritizeDistinctSort).toBeDefined();
 });
diff --git a/clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md b/clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md
@@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog, numberOfClusters)
 Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.
 
 * `geog`: `ARRAY<GEOGRAPHY>` points to be clustered.
-* `numberOfClusters`: `INT64`|`NULL` numberOfClusters that will be generated. If `NULL` the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used.
-
-````hint:info
-The resulting geometries are unique. So duplicated points will be removed from the input array
-````
+* `numberOfClusters`: `INT64`|`NULL` numberOfClusters that will be generated. If `NULL` the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used. The number of output clusters cannot be greater than the number of distinct points in `geog`.
 
 **Return type**
 

diff --git a/clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql b/clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql
@@ -19,8 +19,7 @@ AS """
         options.numberOfClusters = parseInt(Math.sqrt(geojson.length/2))
     }
     options.mutate = true;
-    geojson = Array.from(new Set(geojson));
-    const featuresCollection = lib.clustering.featureCollection(geojson.map(x => lib.clustering.feature(JSON.parse(x))));
+    const featuresCollection = lib.clustering.featureCollection(lib.clustering.prioritizeDistinctSort(geojson).map(x => lib.clustering.feature(JSON.parse(x))));
     lib.clustering.clustersKmeans(featuresCollection, options);
     const cluster = [];
     featuresCollection.features.forEach(function(item, index, array) {

diff --git a/clouds/bigquery/modules/test/clustering/ST_CLUSTERKMEANS.test.js b/clouds/bigquery/modules/test/clustering/ST_CLUSTERKMEANS.test.js
@@ -17,6 +17,24 @@ test('ST_CLUSTERKMEANS should work', async () => {
     expect(rows[0].clusterKMeans3).toEqual(JSON.parse(points3FixturesOut.value));
 });
 
+test('ST_CLUSTERKMEANS should work for duplicated entries ', async () => {
+    const requestedClusters = 3;
+    // When the input array contains consecutive duplicated entries at the beginning,
+    // it should be reordered so that the requested number of clusters is still produced
+    const query = `SELECT
+        \`@@BQ_DATASET@@.ST_CLUSTERKMEANS\`([ST_GEOGPOINT(0, 0),ST_GEOGPOINT(0, 0), ST_GEOGPOINT(0, 0), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(5, 0)], ${requestedClusters}) as clusterKMeans
+    `;
+    const rows = await runQuery(query);
+    const uniqueClusters = new Set();
+
+    rows[0].clusterKMeans.forEach(item => {
+        uniqueClusters.add(item.cluster);
+    });
+
+    expect(rows.length).toEqual(1);
+    expect(uniqueClusters.size).toEqual(requestedClusters);
+});
+
 test('ST_CLUSTERKMEANS should return NULL if any NULL mandatory argument', async () => {
     const query = `SELECT
         \`@@BQ_DATASET@@.ST_CLUSTERKMEANS\`(NULL, 2) as clusterKMeans1

diff --git a/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points2_out.js b/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points2_out.js
diff --git a/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points3_out.js b/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points3_out.js
diff --git a/clouds/redshift/libraries/python/lib/clustering/__init__.py b/clouds/redshift/libraries/python/lib/clustering/__init__.py
@@ -10,24 +10,74 @@ def load_geom(geom):
     geom = json.dumps(_geom)
     return loads(geom)
 
+
 def remove_duplicated_coords(arr):
     import numpy as np
+
     unique_rows = []
     for row in arr:
         if not any(np.array_equal(row, unique_row) for unique_row in unique_rows):
             unique_rows.append(row)
     return np.array(unique_rows)
 
+
+def reorder_coords(coords):
+    import numpy as np
+
+    unique_coords = []
+    duplicated_coords = []
+
+    # Split the array into unique and duplicated coordinates
+    count_map = {}
+    for coord in coords:
+        coord_str = tuple(coord)
+        if coord_str not in count_map:
+            count_map[coord_str] = 1
+            unique_coords.append(coord)
+        else:
+            count_map[coord_str] += 1
+            duplicated_coords.append(coord)
+
+    # Convert lists to NumPy arrays for sorting
+    unique_coords = np.array(unique_coords)
+    duplicated_coords = np.array(duplicated_coords)
+
+    if unique_coords.size > 0:
+        if duplicated_coords.size > 0:
+            # Concatenate unique and duplicated coordinates
+            return np.concatenate((unique_coords, duplicated_coords))
+        else:
+            return unique_coords
+    else:
+        if duplicated_coords.size > 0:
+            return duplicated_coords
+        else:
+            # This should never happen, so just returning the input
+            return coords
+
+
+def count_distinct_coords(coords):
+    import numpy as np
+
+    count_map = {}
+    for coord in coords:
+        coord_str = tuple(coord)
+        count_map[coord_str] = count_map.get(coord_str, 0) + 1
+    return len(count_map)
+
+
 def clusterkmeanstable(geom, k):
     from .kmeans import KMeans
     import json
     import numpy as np
 
     geom = load_geom(geom)
     points = geom['_coords']
-    coords = np.array(
-        [[points[i], points[i + 1]] for i in range(0, len(points) - 1, 2)]
+    coords = reorder_coords(
+        np.array([[points[i], points[i + 1]] for i in range(0, len(points) - 1, 2)])
     )
+    # k cannot be greater than the number of distinct coordinates
+    k = min(k, count_distinct_coords(coords))
 
     cluster_idxs, centers, loss = KMeans()(coords, k)
 
@@ -46,7 +96,10 @@ def clusterkmeans(geom, k):
     if geom.type != 'MultiPoint':
         raise Exception('Invalid operation: Input points parameter must be MultiPoint.')
     else:
-        coords = remove_duplicated_coords(np.array(list(geojson.utils.coords(geom))))
+        coords = reorder_coords(np.array(list(geojson.utils.coords(geom))))
+    # k cannot be greater than the number of distinct coordinates
+    k = min(k, count_distinct_coords(coords))
+
     cluster_idxs, centers, loss = KMeans()(coords, k)
     return geojson.dumps(
         [

diff --git a/clouds/redshift/libraries/python/lib/clustering/kmeans.py b/clouds/redshift/libraries/python/lib/clustering/kmeans.py
@@ -154,7 +154,9 @@ def __call__(
         cluster centers: k x d numpy array, the centers
         loss: final loss value of the objective function of KMeans
         """
-        centers = self._init_centers(points, k, **kwargs)
+        # centers = self._init_centers(points, k, **kwargs)
+        # instead of using random initialization, we will use the first k points
+        centers = points[:k]
         prev_loss = 0
         for it in range(max_iters):
             cluster_idx = self._update_assignment(centers, points)

diff --git a/clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md b/clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md
@@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog [, numberOfClusters])
 Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.
 
 * `geog`: `GEOMETRY` points to be clustered.
-* `numberOfClusters` (optional): `INT` number of clusters that will be generated. It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`).
-
-````hint:info
-The resulting geometries are unique. So duplicated points will be removed from the input multipoint
-````
+* `numberOfClusters` (optional): `INT` number of clusters that will be generated. It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`). The number of output clusters cannot be greater than the number of distinct points in `geog`.
 
 **Return type**
 

diff --git a/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt b/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt
@@ -1,3 +1,3 @@
-[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}}]
-[{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
-[{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
+[{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,0.0]}}]
+[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}}]
+[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}}]
diff --git a/clouds/redshift/modules/test/clustering/test_ST_CLUSTERKMEANS.py b/clouds/redshift/modules/test/clustering/test_ST_CLUSTERKMEANS.py
@@ -28,6 +28,28 @@ def test_st_clusterkmeans():
         assert str(result[0]) == lines[idx].rstrip()
 
 
+def test_st_clusterkmeans_duplicated_entries():
+    import json
+
+    requested_clusters = 3
+    # When the input array contains consecutive duplicated entries at the beginning,
+    # it should be reordered so that the requested number of clusters is still produced
+    results = run_query(
+        f"""
+        SELECT @@RS_SCHEMA@@.ST_CLUSTERKMEANS(
+            ST_GEOMFROMTEXT(
+                'MULTIPOINT ((0 0), (0 0), (0 0), (0 1), (0 1), (0 1), (5 0))'),
+            {requested_clusters})
+        """
+    )
+    results_data = json.loads(results[0][0])
+    unique_clusters = set()
+    for item in results_data:
+        unique_clusters.add(item['cluster'])
+
+    assert len(unique_clusters) == requested_clusters
+
+
 def test_st_clusterkmeans_default_args_success():
     with open(f'{here}/fixtures/st_clusterkmeans_in.txt', 'r') as fixture_file:
         lines = fixture_file.readlines()

diff --git a/clouds/snowflake/libraries/javascript/libs/clustering.js b/clouds/snowflake/libraries/javascript/libs/clustering.js
@@ -1,7 +1,29 @@
 import { featureCollection, feature, clustersKmeans } from '@turf/turf';
 
+function prioritizeDistinctSort (arr) {
+    const uniqueValues = [];
+    const duplicatedValues = [];
+
+    // Split the array into unique and duplicated values
+    const countMap = {};
+    for (const item of arr) {
+        if (countMap[item] === undefined) {
+            countMap[item] = 1;
+            uniqueValues.push(item);
+        } else {
+            countMap[item]++;
+            duplicatedValues.push(item);
+        }
+    }
+
+    // Concatenate unique and duplicated values
+    const result = [...uniqueValues, ...duplicatedValues];
+    return result;
+}
+
 export default {
     featureCollection,
     feature,
-    clustersKmeans
+    clustersKmeans,
+    prioritizeDistinctSort
 };
diff --git a/clouds/snowflake/libraries/javascript/test/clustering.test.js b/clouds/snowflake/libraries/javascript/test/clustering.test.js
@@ -4,4 +4,5 @@ test('clustering library defined', () => {
     expect(clusteringLib.featureCollection).toBeDefined();
     expect(clusteringLib.feature).toBeDefined();
     expect(clusteringLib.clustersKmeans).toBeDefined();
+    expect(clusteringLib.prioritizeDistinctSort).toBeDefined();
 });
diff --git a/clouds/snowflake/modules/doc/clustering/ST_CLUSTERKMEANS.md b/clouds/snowflake/modules/doc/clustering/ST_CLUSTERKMEANS.md
@@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog [, numberOfClusters])
 Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.
 
 * `geojsons`: `ARRAY` points to be clustered.
-* `numberOfClusters` (optional): `INT` numberOfClusters that will be generated. By default `numberOfClusters` is `Math.sqrt(<NUMBER OF POINTS>/2)`.
-
-````hint:info
-The resulting geometries are unique. So duplicated points will be removed from the input array
-````
+* `numberOfClusters` (optional): `INT` numberOfClusters that will be generated. By default `numberOfClusters` is `Math.sqrt(<NUMBER OF POINTS>/2)`. The number of output clusters cannot be greater than the number of distinct points in `geojsons`.
 
 **Return type**