Skip to content

Commit

Permalink
fix(bq,sf,rs|clustering): improve how ST_CLUSTERKMEANS deals with dup…
Browse files Browse the repository at this point in the history
…licates (#495)
  • Loading branch information
vdelacruzb authored Apr 18, 2024
1 parent d51021b commit 8a29098
Show file tree
Hide file tree
Showing 20 changed files with 182 additions and 36 deletions.
24 changes: 23 additions & 1 deletion clouds/bigquery/libraries/javascript/src/clustering.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,29 @@
import { featureCollection, feature, clustersKmeans } from '@turf/turf';

/**
 * Reorders values so that the first occurrence of every distinct value comes
 * first, followed by every repeated occurrence. Relative input order is kept
 * inside both groups, and no element is dropped.
 *
 * Distinctness is tracked with a `Set` rather than a plain object, so values
 * that collide with `Object.prototype` members (e.g. 'constructor',
 * 'toString') are handled like any other value.
 *
 * NOTE(review): presumably used so that the clustering step sees distinct
 * points at the start of its input — confirm against the SQL caller.
 *
 * @param {Iterable<*>} arr - Values to reorder; the input is not modified.
 * @returns {Array<*>} New array: first occurrences, then repeated occurrences.
 */
function prioritizeDistinctSort (arr) {
    const seen = new Set();
    const firstOccurrences = [];
    const repeatedOccurrences = [];

    for (const item of arr) {
        if (seen.has(item)) {
            repeatedOccurrences.push(item);
        } else {
            seen.add(item);
            firstOccurrences.push(item);
        }
    }

    return [...firstOccurrences, ...repeatedOccurrences];
}

// Default export of the clustering library: the helpers imported from
// '@turf/turf' plus the local prioritizeDistinctSort function.
export default {
featureCollection,
feature,
clustersKmeans
clustersKmeans,
prioritizeDistinctSort
};
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ test('clustering library defined', () => {
expect(lib.clustering.featureCollection).toBeDefined();
expect(lib.clustering.feature).toBeDefined();
expect(lib.clustering.clustersKmeans).toBeDefined();
expect(lib.clustering.prioritizeDistinctSort).toBeDefined();
});
6 changes: 1 addition & 5 deletions clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog, numberOfClusters)
Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.

* `geog`: `ARRAY<GEOGRAPHY>` points to be clustered.
* `numberOfClusters`: `INT64`|`NULL` numberOfClusters that will be generated. If `NULL` the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used.

````hint:info
The resulting geometries are unique. So duplicated points will be removed from the input array
````
* `numberOfClusters`: `INT64`|`NULL` number of clusters that will be generated. If `NULL` the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used. The number of output clusters cannot be greater than the number of distinct points in `geog`.

**Return type**

Expand Down
3 changes: 1 addition & 2 deletions clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ AS """
options.numberOfClusters = parseInt(Math.sqrt(geojson.length/2))
}
options.mutate = true;
geojson = Array.from(new Set(geojson));
const featuresCollection = lib.clustering.featureCollection(geojson.map(x => lib.clustering.feature(JSON.parse(x))));
const featuresCollection = lib.clustering.featureCollection(lib.clustering.prioritizeDistinctSort(geojson).map(x => lib.clustering.feature(JSON.parse(x))));
lib.clustering.clustersKmeans(featuresCollection, options);
const cluster = [];
featuresCollection.features.forEach(function(item, index, array) {
Expand Down
18 changes: 18 additions & 0 deletions clouds/bigquery/modules/test/clustering/ST_CLUSTERKMEANS.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,24 @@ test('ST_CLUSTERKMEANS should work', async () => {
expect(rows[0].clusterKMeans3).toEqual(JSON.parse(points3FixturesOut.value));
});

test('ST_CLUSTERKMEANS should work for duplicated entries ', async () => {
    const requestedClusters = 3;
    // When the input array starts with consecutive duplicated entries, the
    // points should be reordered so that the requested number of clusters is
    // still produced
    const query = `SELECT
\`@@BQ_DATASET@@.ST_CLUSTERKMEANS\`([ST_GEOGPOINT(0, 0),ST_GEOGPOINT(0, 0), ST_GEOGPOINT(0, 0), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(5, 0)], ${requestedClusters}) as clusterKMeans
`;
    const rows = await runQuery(query);

    // Assert the row count before dereferencing rows[0], so an empty result
    // fails with a clear expectation message instead of a TypeError
    expect(rows.length).toEqual(1);

    // Collect the distinct cluster ids assigned to the returned points
    const uniqueClusters = new Set(rows[0].clusterKMeans.map(item => item.cluster));
    expect(uniqueClusters.size).toEqual(requestedClusters);
});

test('ST_CLUSTERKMEANS should return NULL if any NULL mandatory argument', async () => {
const query = `SELECT
\`@@BQ_DATASET@@.ST_CLUSTERKMEANS\`(NULL, 2) as clusterKMeans1
Expand Down

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

59 changes: 56 additions & 3 deletions clouds/redshift/libraries/python/lib/clustering/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,74 @@ def load_geom(geom):
geom = json.dumps(_geom)
return loads(geom)


def remove_duplicated_coords(arr):
    """Drop repeated rows from ``arr``, keeping the first occurrence of each.

    Rows are compared by shape and by value, and the order of the surviving
    rows matches their first appearance in the input.

    Args:
        arr: iterable of rows (e.g. an ``n x d`` numpy array of coordinates).

    Returns:
        A numpy array built from the distinct rows.
    """
    import numpy as np

    seen = set()
    unique_rows = []
    for row in arr:
        # Hashable fingerprint of the row: membership is then a constant-time
        # set lookup instead of a scan over every row kept so far.
        key = (np.shape(row), tuple(np.ravel(row).tolist()))
        if key not in seen:
            seen.add(key)
            unique_rows.append(row)
    return np.array(unique_rows)


def reorder_coords(coords):
    """Move the first occurrence of every distinct coordinate to the front.

    The result holds the same coordinates as the input: first occurrences
    come first, followed by all repeated occurrences. Input order is kept
    inside both groups.

    Args:
        coords: iterable of coordinates (e.g. an ``n x d`` numpy array).

    Returns:
        A numpy array with the reordered coordinates. When the input holds no
        coordinate values at all, ``coords`` itself is returned unchanged.
    """
    import numpy as np

    seen = set()
    unique_coords = []
    duplicated_coords = []

    # Split the input into first occurrences and repeated occurrences
    for coord in coords:
        key = tuple(coord)
        if key in seen:
            duplicated_coords.append(coord)
        else:
            seen.add(key)
            unique_coords.append(coord)

    unique_coords = np.array(unique_coords)
    if unique_coords.size == 0:
        # Nothing to reorder (empty input), so just return the input
        return coords
    if not duplicated_coords:
        return unique_coords
    # A duplicate can only exist after a first occurrence, so the unique
    # group is never empty here
    return np.concatenate((unique_coords, np.array(duplicated_coords)))


def count_distinct_coords(coords):
    """Return how many distinct coordinates ``coords`` contains.

    Each coordinate is converted to a tuple so that it can be hashed; two
    coordinates are the same when their tuples compare equal.

    Args:
        coords: iterable of coordinates (e.g. an ``n x d`` numpy array).

    Returns:
        The number of distinct coordinates (0 for an empty input).
    """
    return len({tuple(coord) for coord in coords})


def clusterkmeanstable(geom, k):
from .kmeans import KMeans
import json
import numpy as np

geom = load_geom(geom)
points = geom['_coords']
coords = np.array(
[[points[i], points[i + 1]] for i in range(0, len(points) - 1, 2)]
coords = reorder_coords(
np.array([[points[i], points[i + 1]] for i in range(0, len(points) - 1, 2)])
)
# k cannot be greater than the number of distinct coordinates
k = min(k, count_distinct_coords(coords))

cluster_idxs, centers, loss = KMeans()(coords, k)

Expand All @@ -46,7 +96,10 @@ def clusterkmeans(geom, k):
if geom.type != 'MultiPoint':
raise Exception('Invalid operation: Input points parameter must be MultiPoint.')
else:
coords = remove_duplicated_coords(np.array(list(geojson.utils.coords(geom))))
coords = reorder_coords(np.array(list(geojson.utils.coords(geom))))
# k cannot be greater than the number of distinct coordinates
k = min(k, count_distinct_coords(coords))

cluster_idxs, centers, loss = KMeans()(coords, k)
return geojson.dumps(
[
Expand Down
4 changes: 3 additions & 1 deletion clouds/redshift/libraries/python/lib/clustering/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,9 @@ def __call__(
cluster centers: k x d numpy array, the centers
loss: final loss value of the objective function of KMeans
"""
centers = self._init_centers(points, k, **kwargs)
# centers = self._init_centers(points, k, **kwargs)
# instead of using random initialization, we will use the first k points
centers = points[:k]
prev_loss = 0
for it in range(max_iters):
cluster_idx = self._update_assignment(centers, points)
Expand Down
6 changes: 1 addition & 5 deletions clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog [, numberOfClusters])
Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.

* `geog`: `GEOMETRY` points to be clustered.
* `numberOfClusters` (optional): `INT` number of clusters that will be generated. It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`).

````hint:info
The resulting geometries are unique. So duplicated points will be removed from the input multipoint
````
* `numberOfClusters` (optional): `INT` number of clusters that will be generated. It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`). The number of output clusters cannot be greater than the number of distinct points in `geog`.

**Return type**

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}}]
[{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
[{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}}]
[{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,0.0]}}]
[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}}]
[{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[1.0,0.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,19.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[12.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[9.0,2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[1.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[-3.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[8.0,6.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[10.0,10.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[-3.0,-5.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[6.0,5.0]}},{"cluster":1,"geom":{"type":"Point","coordinates":[-8.0,9.0]}},{"cluster":3,"geom":{"type":"Point","coordinates":[1.0,-10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[2.0,-2.0]}},{"cluster":4,"geom":{"type":"Point","coordinates":[3.0,10.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,1.0]}},{"cluster":2,"geom":{"type":"Point","coordinates":[5.0,0.0]}},{"cluster":0,"geom":{"type":"Point","coordinates":[0.0,0.0]}}]
22 changes: 22 additions & 0 deletions clouds/redshift/modules/test/clustering/test_ST_CLUSTERKMEANS.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,28 @@ def test_st_clusterkmeans():
assert str(result[0]) == lines[idx].rstrip()


def test_st_clusterkmeans_duplicated_entries():
    """Duplicated leading points must not reduce the number of clusters."""
    import json

    requested_clusters = 3
    # When the input starts with consecutive duplicated entries, the points
    # should be reordered so that the requested number of clusters is still
    # produced
    query = f"""
SELECT @@RS_SCHEMA@@.ST_CLUSTERKMEANS(
ST_GEOMFROMTEXT(
'MULTIPOINT ((0 0), (0 0), (0 0), (0 1), (0 1), (0 1), (5 0))'),
{requested_clusters})
"""
    results = run_query(query)
    clustered_points = json.loads(results[0][0])
    # Distinct cluster ids assigned across all returned points
    unique_clusters = {item['cluster'] for item in clustered_points}

    assert len(unique_clusters) == requested_clusters


def test_st_clusterkmeans_default_args_success():
with open(f'{here}/fixtures/st_clusterkmeans_in.txt', 'r') as fixture_file:
lines = fixture_file.readlines()
Expand Down
24 changes: 23 additions & 1 deletion clouds/snowflake/libraries/javascript/libs/clustering.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,29 @@
import { featureCollection, feature, clustersKmeans } from '@turf/turf';

/**
 * Reorders values so that the first occurrence of every distinct value comes
 * first, followed by every repeated occurrence. Relative input order is kept
 * inside both groups, and no element is dropped.
 *
 * Distinctness is tracked with a `Set` rather than a plain object, so values
 * that collide with `Object.prototype` members (e.g. 'constructor',
 * 'toString') are handled like any other value.
 *
 * NOTE(review): presumably used so that the clustering step sees distinct
 * points at the start of its input — confirm against the SQL caller.
 *
 * @param {Iterable<*>} arr - Values to reorder; the input is not modified.
 * @returns {Array<*>} New array: first occurrences, then repeated occurrences.
 */
function prioritizeDistinctSort (arr) {
    const seen = new Set();
    const firstOccurrences = [];
    const repeatedOccurrences = [];

    for (const item of arr) {
        if (seen.has(item)) {
            repeatedOccurrences.push(item);
        } else {
            seen.add(item);
            firstOccurrences.push(item);
        }
    }

    return [...firstOccurrences, ...repeatedOccurrences];
}

// Default export of the clustering library: the helpers imported from
// '@turf/turf' plus the local prioritizeDistinctSort function.
export default {
featureCollection,
feature,
clustersKmeans
clustersKmeans,
prioritizeDistinctSort
};
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ test('clustering library defined', () => {
expect(clusteringLib.featureCollection).toBeDefined();
expect(clusteringLib.feature).toBeDefined();
expect(clusteringLib.clustersKmeans).toBeDefined();
expect(clusteringLib.prioritizeDistinctSort).toBeDefined();
});
6 changes: 1 addition & 5 deletions clouds/snowflake/modules/doc/clustering/ST_CLUSTERKMEANS.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,7 @@ ST_CLUSTERKMEANS(geog [, numberOfClusters])
Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.

* `geojsons`: `ARRAY` points to be clustered.
* `numberOfClusters` (optional): `INT` numberOfClusters that will be generated. By default `numberOfClusters` is `Math.sqrt(<NUMBER OF POINTS>/2)`.

````hint:info
The resulting geometries are unique. So duplicated points will be removed from the input array
````
* `numberOfClusters` (optional): `INT` number of clusters that will be generated. By default `numberOfClusters` is `Math.sqrt(<NUMBER OF POINTS>/2)`. The number of output clusters cannot be greater than the number of distinct points in `geojsons`.

**Return type**

Expand Down
Loading

0 comments on commit 8a29098

Please sign in to comment.