diff --git a/clouds/bigquery/libraries/javascript/src/clustering.js b/clouds/bigquery/libraries/javascript/src/clustering.js index dd050e749..cf17cabfa 100644 --- a/clouds/bigquery/libraries/javascript/src/clustering.js +++ b/clouds/bigquery/libraries/javascript/src/clustering.js @@ -1,7 +1,29 @@ import { featureCollection, feature, clustersKmeans } from '@turf/turf'; +function prioritizeDistinctSort (arr) { + const uniqueValues = []; + const duplicatedValues = []; + + // Split the array into unique and duplicated values + const countMap = {}; + for (const item of arr) { + if (countMap[item] === undefined) { + countMap[item] = 1; + uniqueValues.push(item); + } else { + countMap[item]++; + duplicatedValues.push(item); + } + } + + // Concatenate unique and duplicated values + const result = [...uniqueValues, ...duplicatedValues]; + return result; +} + export default { featureCollection, feature, - clustersKmeans + clustersKmeans, + prioritizeDistinctSort }; \ No newline at end of file diff --git a/clouds/bigquery/libraries/javascript/test/clustering.test.js b/clouds/bigquery/libraries/javascript/test/clustering.test.js index 1f77e4efc..0441d1608 100644 --- a/clouds/bigquery/libraries/javascript/test/clustering.test.js +++ b/clouds/bigquery/libraries/javascript/test/clustering.test.js @@ -4,4 +4,5 @@ test('clustering library defined', () => { expect(lib.clustering.featureCollection).toBeDefined(); expect(lib.clustering.feature).toBeDefined(); expect(lib.clustering.clustersKmeans).toBeDefined(); + expect(lib.clustering.prioritizeDistinctSort).toBeDefined(); }); \ No newline at end of file diff --git a/clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md b/clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md index 793582540..8234c6e0a 100644 --- a/clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md +++ b/clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md @@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog, numberOfClusters) Takes a set of points as input 
and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry. * `geog`: `ARRAY` points to be clustered. -* `numberOfClusters`: `INT64`|`NULL` numberOfClusters that will be generated. If `NULL` the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used. - -````hint:info -The resulting geometries are unique. So duplicated points will be removed from the input array -```` +* `numberOfClusters`: `INT64`|`NULL` numberOfClusters that will be generated. If `NULL` the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used. The number of output clusters cannot be greater than the number of distinct points in `geog`. **Return type** diff --git a/clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql b/clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql index 63d7125ef..3a84db951 100644 --- a/clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql +++ b/clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql @@ -19,8 +19,7 @@ AS """ options.numberOfClusters = parseInt(Math.sqrt(geojson.length/2)) } options.mutate = true; - geojson = Array.from(new Set(geojson)); - const featuresCollection = lib.clustering.featureCollection(geojson.map(x => lib.clustering.feature(JSON.parse(x)))); + const featuresCollection = lib.clustering.featureCollection(lib.clustering.prioritizeDistinctSort(geojson).map(x => lib.clustering.feature(JSON.parse(x)))); lib.clustering.clustersKmeans(featuresCollection, options); const cluster = []; featuresCollection.features.forEach(function(item, index, array) { diff --git a/clouds/bigquery/modules/test/clustering/ST_CLUSTERKMEANS.test.js b/clouds/bigquery/modules/test/clustering/ST_CLUSTERKMEANS.test.js index 0b9867861..7e8f3fbf5 100644 --- a/clouds/bigquery/modules/test/clustering/ST_CLUSTERKMEANS.test.js +++ b/clouds/bigquery/modules/test/clustering/ST_CLUSTERKMEANS.test.js @@ -17,6 +17,24 @@ test('ST_CLUSTERKMEANS should work', async () => { 
expect(rows[0].clusterKMeans3).toEqual(JSON.parse(points3FixturesOut.value)); }); +test('ST_CLUSTERKMEANS should work for duplicated entries ', async () => { + const requestedClusters = 3; + // When the input array contains consecutives entries at the beggining, + // it should be reordered to the required number of clusters + const query = `SELECT + \`@@BQ_DATASET@@.ST_CLUSTERKMEANS\`([ST_GEOGPOINT(0, 0),ST_GEOGPOINT(0, 0), ST_GEOGPOINT(0, 0), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(5, 0)], ${requestedClusters}) as clusterKMeans + `; + const rows = await runQuery(query); + const uniqueClusters = new Set(); + + rows[0].clusterKMeans.forEach(item => { + uniqueClusters.add(item.cluster); + }); + + expect(rows.length).toEqual(1); + expect(uniqueClusters.size).toEqual(requestedClusters); +}); + test('ST_CLUSTERKMEANS should return NULL if any NULL mandatory argument', async () => { const query = `SELECT \`@@BQ_DATASET@@.ST_CLUSTERKMEANS\`(NULL, 2) as clusterKMeans1 diff --git a/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points2_out.js b/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points2_out.js index 0a03efd4f..07aa17ffc 100644 --- a/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points2_out.js +++ b/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points2_out.js @@ -1,3 +1,3 @@ module.exports = { - value: '[{"cluster":0,"geom":{"value":"POINT(0 0)"}},{"cluster":0,"geom":{"value":"POINT(0 1)"}},{"cluster":2,"geom":{"value":"POINT(5 0)"}},{"cluster":0,"geom":{"value":"POINT(1 0)"}},{"cluster":1,"geom":{"value":"POINT(1 19)"}},{"cluster":2,"geom":{"value":"POINT(12 1)"}},{"cluster":2,"geom":{"value":"POINT(9 2)"}},{"cluster":1,"geom":{"value":"POINT(1 10)"}},{"cluster":0,"geom":{"value":"POINT(-3 1)"}},{"cluster":2,"geom":{"value":"POINT(5 5)"}},{"cluster":2,"geom":{"value":"POINT(8 6)"}},{"cluster":2,"geom":{"value":"POINT(10 
10)"}},{"cluster":0,"geom":{"value":"POINT(-3 -5)"}},{"cluster":2,"geom":{"value":"POINT(6 5)"}},{"cluster":1,"geom":{"value":"POINT(-8 9)"}},{"cluster":0,"geom":{"value":"POINT(1 -10)"}},{"cluster":0,"geom":{"value":"POINT(2 -2)"}},{"cluster":1,"geom":{"value":"POINT(3 10)"}}]' + value: '[{"cluster":0,"geom":{"value":"POINT(0 0)"}},{"cluster":0,"geom":{"value":"POINT(0 1)"}},{"cluster":2,"geom":{"value":"POINT(5 0)"}},{"cluster":0,"geom":{"value":"POINT(1 0)"}},{"cluster":1,"geom":{"value":"POINT(1 19)"}},{"cluster":2,"geom":{"value":"POINT(12 1)"}},{"cluster":2,"geom":{"value":"POINT(9 2)"}},{"cluster":1,"geom":{"value":"POINT(1 10)"}},{"cluster":0,"geom":{"value":"POINT(-3 1)"}},{"cluster":2,"geom":{"value":"POINT(5 5)"}},{"cluster":2,"geom":{"value":"POINT(8 6)"}},{"cluster":2,"geom":{"value":"POINT(10 10)"}},{"cluster":0,"geom":{"value":"POINT(-3 -5)"}},{"cluster":2,"geom":{"value":"POINT(6 5)"}},{"cluster":1,"geom":{"value":"POINT(-8 9)"}},{"cluster":0,"geom":{"value":"POINT(1 -10)"}},{"cluster":0,"geom":{"value":"POINT(2 -2)"}},{"cluster":1,"geom":{"value":"POINT(3 10)"}},{"cluster":0,"geom":{"value":"POINT(0 1)"}},{"cluster":2,"geom":{"value":"POINT(5 0)"}},{"cluster":0,"geom":{"value":"POINT(0 0)"}}]' } \ No newline at end of file diff --git a/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points3_out.js b/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points3_out.js index 27a0f651e..a03a346df 100644 --- a/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points3_out.js +++ b/clouds/bigquery/modules/test/clustering/fixtures/st_clusterkmeans_points3_out.js @@ -1,3 +1,3 @@ module.exports = { - value: '[{"cluster":0,"geom":{"value":"POINT(0 0)"}},{"cluster":0,"geom":{"value":"POINT(0 1)"}},{"cluster":0,"geom":{"value":"POINT(5 0)"}},{"cluster":0,"geom":{"value":"POINT(1 0)"}},{"cluster":4,"geom":{"value":"POINT(1 19)"}},{"cluster":2,"geom":{"value":"POINT(12 1)"}},{"cluster":2,"geom":{"value":"POINT(9 
2)"}},{"cluster":4,"geom":{"value":"POINT(1 10)"}},{"cluster":0,"geom":{"value":"POINT(-3 1)"}},{"cluster":2,"geom":{"value":"POINT(5 5)"}},{"cluster":2,"geom":{"value":"POINT(8 6)"}},{"cluster":2,"geom":{"value":"POINT(10 10)"}},{"cluster":3,"geom":{"value":"POINT(-3 -5)"}},{"cluster":2,"geom":{"value":"POINT(6 5)"}},{"cluster":1,"geom":{"value":"POINT(-8 9)"}},{"cluster":3,"geom":{"value":"POINT(1 -10)"}},{"cluster":0,"geom":{"value":"POINT(2 -2)"}},{"cluster":4,"geom":{"value":"POINT(3 10)"}}]' + value: '[{"cluster":0,"geom":{"value":"POINT(0 0)"}},{"cluster":0,"geom":{"value":"POINT(0 1)"}},{"cluster":2,"geom":{"value":"POINT(5 0)"}},{"cluster":0,"geom":{"value":"POINT(1 0)"}},{"cluster":4,"geom":{"value":"POINT(1 19)"}},{"cluster":2,"geom":{"value":"POINT(12 1)"}},{"cluster":2,"geom":{"value":"POINT(9 2)"}},{"cluster":4,"geom":{"value":"POINT(1 10)"}},{"cluster":0,"geom":{"value":"POINT(-3 1)"}},{"cluster":2,"geom":{"value":"POINT(5 5)"}},{"cluster":2,"geom":{"value":"POINT(8 6)"}},{"cluster":2,"geom":{"value":"POINT(10 10)"}},{"cluster":3,"geom":{"value":"POINT(-3 -5)"}},{"cluster":2,"geom":{"value":"POINT(6 5)"}},{"cluster":1,"geom":{"value":"POINT(-8 9)"}},{"cluster":3,"geom":{"value":"POINT(1 -10)"}},{"cluster":0,"geom":{"value":"POINT(2 -2)"}},{"cluster":4,"geom":{"value":"POINT(3 10)"}},{"cluster":0,"geom":{"value":"POINT(0 1)"}},{"cluster":2,"geom":{"value":"POINT(5 0)"}},{"cluster":0,"geom":{"value":"POINT(0 0)"}}]' } \ No newline at end of file diff --git a/clouds/redshift/libraries/python/lib/clustering/__init__.py b/clouds/redshift/libraries/python/lib/clustering/__init__.py index c7c275ddb..8db9a6911 100644 --- a/clouds/redshift/libraries/python/lib/clustering/__init__.py +++ b/clouds/redshift/libraries/python/lib/clustering/__init__.py @@ -10,14 +10,62 @@ def load_geom(geom): geom = json.dumps(_geom) return loads(geom) + def remove_duplicated_coords(arr): import numpy as np + unique_rows = [] for row in arr: if not any(np.array_equal(row, 
unique_row) for unique_row in unique_rows): unique_rows.append(row) return np.array(unique_rows) + +def reorder_coords(coords): + import numpy as np + + unique_coords = [] + duplicated_coords = [] + + # Split the array into unique and duplicated coordinates + count_map = {} + for coord in coords: + coord_str = tuple(coord) + if coord_str not in count_map: + count_map[coord_str] = 1 + unique_coords.append(coord) + else: + count_map[coord_str] += 1 + duplicated_coords.append(coord) + + # Convert lists to NumPy arrays for sorting + unique_coords = np.array(unique_coords) + duplicated_coords = np.array(duplicated_coords) + + if unique_coords.size > 0: + if duplicated_coords.size > 0: + # Concatenate unique and duplicated coordinates + return np.concatenate((unique_coords, duplicated_coords)) + else: + return unique_coords + else: + if duplicated_coords.size > 0: + return duplicated_coords + else: + # This should never happen, so just returning the input + return coords + + +def count_distinct_coords(coords): + import numpy as np + + count_map = {} + for coord in coords: + coord_str = tuple(coord) + count_map[coord_str] = count_map.get(coord_str, 0) + 1 + return len(count_map) + + def clusterkmeanstable(geom, k): from .kmeans import KMeans import json @@ -25,9 +73,11 @@ def clusterkmeanstable(geom, k): geom = load_geom(geom) points = geom['_coords'] - coords = np.array( - [[points[i], points[i + 1]] for i in range(0, len(points) - 1, 2)] + coords = reorder_coords( + np.array([[points[i], points[i + 1]] for i in range(0, len(points) - 1, 2)]) ) + # k cannot be greater than the number of distinct coordinates + k = min(k, count_distinct_coords(coords)) cluster_idxs, centers, loss = KMeans()(coords, k) @@ -46,7 +96,10 @@ def clusterkmeans(geom, k): if geom.type != 'MultiPoint': raise Exception('Invalid operation: Input points parameter must be MultiPoint.') else: - coords = remove_duplicated_coords(np.array(list(geojson.utils.coords(geom)))) + coords = 
reorder_coords(np.array(list(geojson.utils.coords(geom)))) + # k cannot be greater than the number of distinct coordinates + k = min(k, count_distinct_coords(coords)) + cluster_idxs, centers, loss = KMeans()(coords, k) return geojson.dumps( [ diff --git a/clouds/redshift/libraries/python/lib/clustering/kmeans.py b/clouds/redshift/libraries/python/lib/clustering/kmeans.py index bad0c8db2..d86542f0d 100644 --- a/clouds/redshift/libraries/python/lib/clustering/kmeans.py +++ b/clouds/redshift/libraries/python/lib/clustering/kmeans.py @@ -154,7 +154,9 @@ def __call__( cluster centers: k x d numpy array, the centers loss: final loss value of the objective function of KMeans """ - centers = self._init_centers(points, k, **kwargs) + # centers = self._init_centers(points, k, **kwargs) + # instead of using random initialization, we will use the first k points + centers = points[:k] prev_loss = 0 for it in range(max_iters): cluster_idx = self._update_assignment(centers, points) diff --git a/clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md b/clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md index 16e703a94..805acd5a6 100644 --- a/clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md +++ b/clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md @@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog [, numberOfClusters]) Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry. * `geog`: `GEOMETRY` points to be clustered. -* `numberOfClusters` (optional): `INT` number of clusters that will be generated. It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`). - -````hint:info -The resulting geometries are unique. So duplicated points will be removed from the input multipoint -```` +* `numberOfClusters` (optional): `INT` number of clusters that will be generated. 
It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`). The number of output clusters cannot be greater than the number of distinct points in `geog`. **Return type** diff --git a/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt b/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt index 11dae73d3..753914357 100644 --- a/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt +++ b/clouds/redshift/modules/test/clustering/fixtures/st_clusterkmeans_out.txt @@ -1,3 +1,3 @@ -[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}}] -[{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}}] 
-[{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}}] \ No newline at end of file +[{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,0.0]}}] 
+[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}}] 
+[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}}] diff --git a/clouds/redshift/modules/test/clustering/test_ST_CLUSTERKMEANS.py b/clouds/redshift/modules/test/clustering/test_ST_CLUSTERKMEANS.py index 8d37a8e35..2073b1374 100644 --- a/clouds/redshift/modules/test/clustering/test_ST_CLUSTERKMEANS.py +++ b/clouds/redshift/modules/test/clustering/test_ST_CLUSTERKMEANS.py @@ -28,6 +28,28 @@ def test_st_clusterkmeans(): assert str(result[0]) == lines[idx].rstrip() +def test_st_clusterkmeans_duplicated_entries(): + import json + + requested_clusters = 3 + # When the input array contains consecutives entries at the beggining, + # it should be reordered to the required number of clusters + results = run_query( + f""" + SELECT 
@@RS_SCHEMA@@.ST_CLUSTERKMEANS( + ST_GEOMFROMTEXT( + 'MULTIPOINT ((0 0), (0 0), (0 0), (0 1), (0 1), (0 1), (5 0))'), + {requested_clusters}) + """ + ) + results_data = json.loads(results[0][0]) + unique_clusters = set() + for item in results_data: + unique_clusters.add(item['cluster']) + + assert len(unique_clusters) == requested_clusters + + def test_st_clusterkmeans_default_args_success(): with open(f'{here}/fixtures/st_clusterkmeans_in.txt', 'r') as fixture_file: lines = fixture_file.readlines() diff --git a/clouds/snowflake/libraries/javascript/libs/clustering.js b/clouds/snowflake/libraries/javascript/libs/clustering.js index dd050e749..cf17cabfa 100644 --- a/clouds/snowflake/libraries/javascript/libs/clustering.js +++ b/clouds/snowflake/libraries/javascript/libs/clustering.js @@ -1,7 +1,29 @@ import { featureCollection, feature, clustersKmeans } from '@turf/turf'; +function prioritizeDistinctSort (arr) { + const uniqueValues = []; + const duplicatedValues = []; + + // Split the array into unique and duplicated values + const countMap = {}; + for (const item of arr) { + if (countMap[item] === undefined) { + countMap[item] = 1; + uniqueValues.push(item); + } else { + countMap[item]++; + duplicatedValues.push(item); + } + } + + // Concatenate unique and duplicated values + const result = [...uniqueValues, ...duplicatedValues]; + return result; +} + export default { featureCollection, feature, - clustersKmeans + clustersKmeans, + prioritizeDistinctSort }; \ No newline at end of file diff --git a/clouds/snowflake/libraries/javascript/test/clustering.test.js b/clouds/snowflake/libraries/javascript/test/clustering.test.js index 774ab1a32..4a2166d60 100644 --- a/clouds/snowflake/libraries/javascript/test/clustering.test.js +++ b/clouds/snowflake/libraries/javascript/test/clustering.test.js @@ -4,4 +4,5 @@ test('clustering library defined', () => { expect(clusteringLib.featureCollection).toBeDefined(); expect(clusteringLib.feature).toBeDefined(); 
expect(clusteringLib.clustersKmeans).toBeDefined(); + expect(clusteringLib.prioritizeDistinctSort).toBeDefined(); }); \ No newline at end of file diff --git a/clouds/snowflake/modules/doc/clustering/ST_CLUSTERKMEANS.md b/clouds/snowflake/modules/doc/clustering/ST_CLUSTERKMEANS.md index 46e9e4b4b..28e98196d 100644 --- a/clouds/snowflake/modules/doc/clustering/ST_CLUSTERKMEANS.md +++ b/clouds/snowflake/modules/doc/clustering/ST_CLUSTERKMEANS.md @@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog [, numberOfClusters]) Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry. * `geojsons`: `ARRAY` points to be clustered. -* `numberOfClusters` (optional): `INT` numberOfClusters that will be generated. By default `numberOfClusters` is `Math.sqrt(<NUMBER OF POINTS>/2)`. - -````hint:info -The resulting geometries are unique. So duplicated points will be removed from the input array -```` +* `numberOfClusters` (optional): `INT` numberOfClusters that will be generated. By default `numberOfClusters` is `Math.sqrt(<NUMBER OF POINTS>/2)`. The number of output clusters cannot be greater than the number of distinct points in `geojsons`. 
**Return type** diff --git a/clouds/snowflake/modules/sql/clustering/ST_CLUSTERKMEANS.sql b/clouds/snowflake/modules/sql/clustering/ST_CLUSTERKMEANS.sql index 8c6f44d78..96f0c3eb4 100644 --- a/clouds/snowflake/modules/sql/clustering/ST_CLUSTERKMEANS.sql +++ b/clouds/snowflake/modules/sql/clustering/ST_CLUSTERKMEANS.sql @@ -17,8 +17,7 @@ AS $$ const options = {}; options.numberOfClusters = Number(NUMBEROFCLUSTERS); options.mutate = true; - GEOJSONS = Array.from(new Set(GEOJSONS)); - const featuresCollection = clusteringLib.featureCollection(GEOJSONS.map(x => clusteringLib.feature(JSON.parse(x)))); + const featuresCollection = clusteringLib.featureCollection(clusteringLib.prioritizeDistinctSort(GEOJSONS).map(x => clusteringLib.feature(JSON.parse(x)))); clusteringLib.clustersKmeans(featuresCollection, options); const cluster = []; featuresCollection.features.forEach(function(item, index, array) { diff --git a/clouds/snowflake/modules/test/clustering/ST_CLUSTERKMEANS.test.js b/clouds/snowflake/modules/test/clustering/ST_CLUSTERKMEANS.test.js index c615ce403..f0e2a129c 100644 --- a/clouds/snowflake/modules/test/clustering/ST_CLUSTERKMEANS.test.js +++ b/clouds/snowflake/modules/test/clustering/ST_CLUSTERKMEANS.test.js @@ -17,6 +17,25 @@ test('ST_CLUSTERKMEANS should work', async () => { expect(JSON.stringify(rows[0].CLUSTERKMEANS3)).toEqual(points3FixturesOut.value); }); +test('ST_CLUSTERKMEANS should work for duplicated entries ', async () => { + const requestedClusters = 3; + // When the input array contains consecutives entries at the beggining, + // it should be reordered to the required number of clusters + const query = `SELECT + @@SF_SCHEMA@@.ST_CLUSTERKMEANS(ARRAY_CONSTRUCT(ST_ASGEOJSON(ST_POINT(0, 0))::STRING, ST_ASGEOJSON(ST_POINT(0, 0))::STRING, ST_ASGEOJSON(ST_POINT(0, 0))::STRING, ST_ASGEOJSON(ST_POINT(0, 1))::STRING, ST_ASGEOJSON(ST_POINT(0, 1))::STRING, ST_ASGEOJSON(ST_POINT(0, 1))::STRING, ST_ASGEOJSON(ST_POINT(5, 0))::STRING), ${requestedClusters}) as 
clusterKMeans + `; + const rows = await runQuery(query); + const uniqueClusters = new Set(); + + rows[0].CLUSTERKMEANS.forEach(item => { + uniqueClusters.add(item.cluster); + }); + + expect(rows.length).toEqual(1); + expect(uniqueClusters.size).toEqual(requestedClusters); + +}); + test('ST_CLUSTERKMEANS should return NULL if any NULL mandatory argument', async () => { const query = `SELECT @@SF_SCHEMA@@.ST_CLUSTERKMEANS(NULL, 2) as clusterKMeans1, diff --git a/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points1.js b/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points1.js index 483619843..1d27769b6 100644 --- a/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points1.js +++ b/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points1.js @@ -1,3 +1,3 @@ module.exports = { - value: '[{"cluster":1,"geom":"{\\\"coordinates\\\":[0,0],\\\"type\\\":\\\"Point\\\"}"},{"cluster":1,"geom":"{\\\"coordinates\\\":[0,1],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[5,0],\\\"type\\\":\\\"Point\\\"}"},{"cluster":1,"geom":"{\\\"coordinates\\\":[1,0],\\\"type\\\":\\\"Point\\\"}"}]' - } \ No newline at end of file + value: '[{"cluster":1,"geom":"{\\"coordinates\\":[0,0],\\"type\\":\\"Point\\"}"},{"cluster":1,"geom":"{\\"coordinates\\":[0,1],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[5,0],\\"type\\":\\"Point\\"}"},{"cluster":1,"geom":"{\\"coordinates\\":[1,0],\\"type\\":\\"Point\\"}"}]' +} \ No newline at end of file diff --git a/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points2.js b/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points2.js index 88ff55bfb..4bbee7ecb 100644 --- a/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points2.js +++ b/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points2.js @@ -1,3 +1,3 @@ module.exports = { - value: 
'[{"cluster":0,"geom":"{\\\"coordinates\\\":[0,0],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[0,1],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[5,0],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[1,0],\\\"type\\\":\\\"Point\\\"}"},{"cluster":1,"geom":"{\\\"coordinates\\\":[1,19],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[12,1],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[9,2],\\\"type\\\":\\\"Point\\\"}"},{"cluster":1,"geom":"{\\\"coordinates\\\":[1,10],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[-3,1],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[5,5],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[8,6],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[10,10],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[-3,-5],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[6,5],\\\"type\\\":\\\"Point\\\"}"},{"cluster":1,"geom":"{\\\"coordinates\\\":[-8,9],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[1,-10],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[2,-2],\\\"type\\\":\\\"Point\\\"}"},{"cluster":1,"geom":"{\\\"coordinates\\\":[3,10],\\\"type\\\":\\\"Point\\\"}"}]' - } \ No newline at end of file + value: 
'[{"cluster":0,"geom":"{\\"coordinates\\":[0,0],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[0,1],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[5,0],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[1,0],\\"type\\":\\"Point\\"}"},{"cluster":1,"geom":"{\\"coordinates\\":[1,19],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[12,1],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[9,2],\\"type\\":\\"Point\\"}"},{"cluster":1,"geom":"{\\"coordinates\\":[1,10],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[-3,1],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[5,5],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[8,6],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[10,10],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[-3,-5],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[6,5],\\"type\\":\\"Point\\"}"},{"cluster":1,"geom":"{\\"coordinates\\":[-8,9],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[1,-10],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[2,-2],\\"type\\":\\"Point\\"}"},{"cluster":1,"geom":"{\\"coordinates\\":[3,10],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[0,1],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[5,0],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[0,0],\\"type\\":\\"Point\\"}"}]' +} \ No newline at end of file diff --git a/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points3.js b/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points3.js index e4a9a9e0c..a004fc46c 100644 --- a/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points3.js +++ b/clouds/snowflake/modules/test/clustering/fixtures/st_clusterkmeans_out_points3.js @@ -1,3 +1,3 @@ module.exports = { - value: 
'[{"cluster":0,"geom":"{\\\"coordinates\\\":[0,0],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[0,1],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[5,0],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[1,0],\\\"type\\\":\\\"Point\\\"}"},{"cluster":4,"geom":"{\\\"coordinates\\\":[1,19],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[12,1],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[9,2],\\\"type\\\":\\\"Point\\\"}"},{"cluster":4,"geom":"{\\\"coordinates\\\":[1,10],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[-3,1],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[5,5],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[8,6],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[10,10],\\\"type\\\":\\\"Point\\\"}"},{"cluster":3,"geom":"{\\\"coordinates\\\":[-3,-5],\\\"type\\\":\\\"Point\\\"}"},{"cluster":2,"geom":"{\\\"coordinates\\\":[6,5],\\\"type\\\":\\\"Point\\\"}"},{"cluster":1,"geom":"{\\\"coordinates\\\":[-8,9],\\\"type\\\":\\\"Point\\\"}"},{"cluster":3,"geom":"{\\\"coordinates\\\":[1,-10],\\\"type\\\":\\\"Point\\\"}"},{"cluster":0,"geom":"{\\\"coordinates\\\":[2,-2],\\\"type\\\":\\\"Point\\\"}"},{"cluster":4,"geom":"{\\\"coordinates\\\":[3,10],\\\"type\\\":\\\"Point\\\"}"}]' - } \ No newline at end of file + value: 
'[{"cluster":0,"geom":"{\\"coordinates\\":[0,0],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[0,1],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[5,0],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[1,0],\\"type\\":\\"Point\\"}"},{"cluster":4,"geom":"{\\"coordinates\\":[1,19],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[12,1],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[9,2],\\"type\\":\\"Point\\"}"},{"cluster":4,"geom":"{\\"coordinates\\":[1,10],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[-3,1],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[5,5],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[8,6],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[10,10],\\"type\\":\\"Point\\"}"},{"cluster":3,"geom":"{\\"coordinates\\":[-3,-5],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[6,5],\\"type\\":\\"Point\\"}"},{"cluster":1,"geom":"{\\"coordinates\\":[-8,9],\\"type\\":\\"Point\\"}"},{"cluster":3,"geom":"{\\"coordinates\\":[1,-10],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[2,-2],\\"type\\":\\"Point\\"}"},{"cluster":4,"geom":"{\\"coordinates\\":[3,10],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[0,1],\\"type\\":\\"Point\\"}"},{"cluster":2,"geom":"{\\"coordinates\\":[5,0],\\"type\\":\\"Point\\"}"},{"cluster":0,"geom":"{\\"coordinates\\":[0,0],\\"type\\":\\"Point\\"}"}]' +} \ No newline at end of file