release: 2024-04-18 (#497)

vdelacruzb authored Apr 18, 2024
1 parent 532fc22 commit 703ca29
Showing 97 changed files with 1,146 additions and 8,967 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/snowflake.yml
@@ -92,7 +92,7 @@ jobs:
    if: github.ref_name == 'main'
    needs: test
    runs-on: ubuntu-20.04
    timeout-minutes: 20
    timeout-minutes: 30
    env:
      APP_PACKAGE_NAME: ${{ secrets.SF_NATIVE_APP_PACKAGE_NAME_CD }}
      APP_NAME: ${{ secrets.SF_NATIVE_APP_NAME_CD }}
3 changes: 3 additions & 0 deletions .gitignore
@@ -25,3 +25,6 @@ MANIFEST

# IntelliJ
.idea/

# Vim
*.swp
10 changes: 10 additions & 0 deletions CHANGELOG.md
@@ -4,6 +4,16 @@ CARTO Analytics Toolbox Core.

All notable commits to this project will be documented in this file.

## 2024-04-18

- chore(sf|h3): reimplement basic h3 functions (#489)
- docs(bq,sf,rs|processing): update voronoi doc (#492)
- chore(sf|h3): reimplement polyfill h3 functions (#490)
- fix(bq,sf,rs|clustering): improve how ST_CLUSTERKMEANS deals with duplicates (#491, #495)
- chore(deps): bump sqlparse from 0.4.4 to 0.5.0 in /clouds/redshift/common (#494)
- chore(deps): bump sqlparse from 0.4.4 to 0.5.0 in /clouds/postgres/common (#493)
- chore(deps): fix CI crashing because native-apps timeout and sql-parse version (#496)

## 2024-03-18

- fix(sf): CI and CD not working because of snowflake driver breaking changes (#484)
17 changes: 17 additions & 0 deletions README.md
@@ -62,6 +62,23 @@ Right now the only way to get access the Analytics toolbox is by installing it d
| Postgres | [README.md](./clouds/postgres/README.md) |
| Databricks | [README.md](./clouds/databricks/README.md) |

### Useful make commands

To run tests, switch to a specific cloud directory. For example, Snowflake: `cd clouds/snowflake`.

```sh
# All tests
make test
# Specific module(s)
make test modules=h3
make test modules=h3,transformations
# Specific function(s)
make test functions=H3_POLYFILL
make test functions=H3_POLYFILL,ST_BUFFER
```

## Contribute

This project is public. We are more than happy to receive feedback and contributions. Feel free to open an issue to report a bug, ask a question, or start a discussion, or open a pull request with a fix or a new feature.
5 changes: 5 additions & 0 deletions clouds/bigquery/CHANGELOG.md
@@ -4,6 +4,11 @@ CARTO Analytics Toolbox Core for BigQuery.

All notable commits to this project will be documented in this file.

## [1.2.2] - 2024-04-18

- docs(processing): update voronoi doc (#492)
- fix(clustering): improve how ST_CLUSTERKMEANS deals with duplicates (#491, #495)

## [1.2.1] - 2024-03-18

- fix(random): ST_GENERATEPOINTS returning exact name of points (#486)
24 changes: 23 additions & 1 deletion clouds/bigquery/libraries/javascript/src/clustering.js
@@ -1,7 +1,29 @@
import { featureCollection, feature, clustersKmeans } from '@turf/turf';

function prioritizeDistinctSort (arr) {
    const uniqueValues = [];
    const duplicatedValues = [];

    // Split the array into unique and duplicated values
    const countMap = {};
    for (const item of arr) {
        if (countMap[item] === undefined) {
            countMap[item] = 1;
            uniqueValues.push(item);
        } else {
            countMap[item]++;
            duplicatedValues.push(item);
        }
    }

    // Concatenate unique and duplicated values
    const result = [...uniqueValues, ...duplicatedValues];
    return result;
}
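
// Illustrative trace (editorial note, not part of the commit):
// prioritizeDistinctSort(['a', 'a', 'b', 'c']) returns ['a', 'b', 'c', 'a'];
// first occurrences lead and repeats trail, so k-means seeding over the first
// k features sees as many distinct points as the input allows.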

export default {
    featureCollection,
    feature,
    clustersKmeans
    clustersKmeans,
    prioritizeDistinctSort
};
@@ -4,4 +4,5 @@ test('clustering library defined', () => {
    expect(lib.clustering.featureCollection).toBeDefined();
    expect(lib.clustering.feature).toBeDefined();
    expect(lib.clustering.clustersKmeans).toBeDefined();
    expect(lib.clustering.prioritizeDistinctSort).toBeDefined();
});
2 changes: 1 addition & 1 deletion clouds/bigquery/modules/doc/clustering/ST_CLUSTERKMEANS.md
@@ -9,7 +9,7 @@ ST_CLUSTERKMEANS(geog, numberOfClusters)
Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.

* `geog`: `ARRAY<GEOGRAPHY>` points to be clustered.
* `numberOfClusters`: `INT64`|`NULL` numberOfClusters that will be generated. If `NULL` the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used.
* `numberOfClusters`: `INT64`|`NULL` number of clusters that will be generated. If `NULL`, the default value `Math.sqrt(<NUMBER OF POINTS>/2)` is used. The output number of clusters cannot be greater than the number of distinct points in `geog`.
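
A minimal usage sketch (illustrative points; `@@BQ_DATASET@@` is the dataset placeholder used throughout this repo, replaced at deployment time):

```sql
SELECT `@@BQ_DATASET@@.ST_CLUSTERKMEANS`(
  [ST_GEOGPOINT(0, 0), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(5, 0)],
  2
) AS clusters;
```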

**Return type**

6 changes: 6 additions & 0 deletions clouds/bigquery/modules/doc/processing/ST_DELAUNAYLINES.md
@@ -12,6 +12,12 @@ Calculates the Delaunay triangulation of the points provided. An array of line s

Due to technical limitations of the underlying libraries used, the input points' coordinates are truncated to 5 decimal places in order to avoid problems that happen with close but distinct input points. This limits the precision of the results and can alter slightly the position of the resulting polygons (about 1 meter). This can also result in some points being merged together, so that fewer polygons than expected may result.

````hint:warning
**warning**
The maximum number of points typically used to compute Delaunay triangulations is 300,000. This limit ensures efficient computation while maintaining the accuracy of the resulting triangulation.
````

**Return type**

`ARRAY<GEOGRAPHY>`
6 changes: 6 additions & 0 deletions clouds/bigquery/modules/doc/processing/ST_DELAUNAYPOLYGONS.md
@@ -12,6 +12,12 @@ Calculates the Delaunay triangulation of the points provided. An array of polygo

Due to technical limitations of the underlying libraries used, the input points' coordinates are truncated to 5 decimal places in order to avoid problems that happen with close but distinct input points. This limits the precision of the results and can alter slightly the position of the resulting polygons (about 1 meter). This can also result in some points being merged together, so that fewer polygons than expected may result.

````hint:warning
**warning**
The maximum number of points typically used to compute Delaunay triangulations is 300,000. This limit ensures efficient computation while maintaining the accuracy of the resulting triangulation.
````

**Return type**

`ARRAY<GEOGRAPHY>`
6 changes: 6 additions & 0 deletions clouds/bigquery/modules/doc/processing/ST_VORONOILINES.md
@@ -13,6 +13,12 @@ Calculates the Voronoi diagram of the points provided. An array of lines is retu

Due to technical limitations of the underlying libraries used, the input points' coordinates are truncated to 5 decimal places in order to avoid problems that happen with close but distinct input points. This limits the precision of the results and can alter slightly the position of the resulting lines (about 1 meter). This can also result in some points being merged together, so that fewer lines than input points may result.

````hint:warning
**warning**
The maximum number of points typically used to compute Voronoi diagrams is 300,000. This limit ensures efficient computation while maintaining accuracy in delineating regions based on proximity to specified points.
````

**Return type**

`ARRAY<GEOGRAPHY>`
6 changes: 6 additions & 0 deletions clouds/bigquery/modules/doc/processing/ST_VORONOIPOLYGONS.md
@@ -13,6 +13,12 @@ Calculates the Voronoi diagram of the points provided. An array of polygons is r

Due to technical limitations of the underlying libraries used, the input points' coordinates are truncated to 5 decimal places in order to avoid problems that happen with close but distinct input points. This limits the precision of the results and can alter slightly the position of the resulting polygons (about 1 meter). This can also result in some points being merged together, so that fewer polygons than input points may result.

````hint:warning
**warning**
The maximum number of points typically used to compute Voronoi diagrams is 300,000. This limit ensures efficient computation while maintaining accuracy in delineating regions based on proximity to specified points.
````

**Return type**

`ARRAY<GEOGRAPHY>`
10 changes: 6 additions & 4 deletions clouds/bigquery/modules/sql/clustering/ST_CLUSTERKMEANS.sql
@@ -1,6 +1,6 @@
----------------------------
-- Copyright (C) 2021 CARTO
----------------------------
--------------------------------
-- Copyright (C) 2021-2024 CARTO
--------------------------------

CREATE OR REPLACE FUNCTION `@@BQ_DATASET@@.__CLUSTERKMEANS`
(geojson ARRAY<STRING>, numberOfClusters INT64)
@@ -15,9 +15,11 @@ AS """
    const options = {};
    if (numberOfClusters != null) {
        options.numberOfClusters = Number(numberOfClusters);
    } else {
        options.numberOfClusters = parseInt(Math.sqrt(geojson.length/2))
    }
    options.mutate = true;
    const featuresCollection = lib.clustering.featureCollection(geojson.map(x => lib.clustering.feature(JSON.parse(x))));
    const featuresCollection = lib.clustering.featureCollection(lib.clustering.prioritizeDistinctSort(geojson).map(x => lib.clustering.feature(JSON.parse(x))));
    lib.clustering.clustersKmeans(featuresCollection, options);
    const cluster = [];
    featuresCollection.features.forEach(function(item, index, array) {
18 changes: 18 additions & 0 deletions clouds/bigquery/modules/test/clustering/ST_CLUSTERKMEANS.test.js
@@ -17,6 +17,24 @@ test('ST_CLUSTERKMEANS should work', async () => {
    expect(rows[0].clusterKMeans3).toEqual(JSON.parse(points3FixturesOut.value));
});

test('ST_CLUSTERKMEANS should work for duplicated entries', async () => {
    const requestedClusters = 3;
    // When the input array contains consecutive duplicated entries at the beginning,
    // it is reordered so that the requested number of clusters can still be produced
    const query = `SELECT
        \`@@BQ_DATASET@@.ST_CLUSTERKMEANS\`([ST_GEOGPOINT(0, 0), ST_GEOGPOINT(0, 0), ST_GEOGPOINT(0, 0), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(0, 1), ST_GEOGPOINT(5, 0)], ${requestedClusters}) as clusterKMeans
    `;
    const rows = await runQuery(query);
    const uniqueClusters = new Set();

    rows[0].clusterKMeans.forEach(item => {
        uniqueClusters.add(item.cluster);
    });

    expect(rows.length).toEqual(1);
    expect(uniqueClusters.size).toEqual(requestedClusters);
});

test('ST_CLUSTERKMEANS should return NULL if any NULL mandatory argument', async () => {
    const query = `SELECT
        \`@@BQ_DATASET@@.ST_CLUSTERKMEANS\`(NULL, 2) as clusterKMeans1
2 changes: 1 addition & 1 deletion clouds/bigquery/version
@@ -1 +1 @@
1.2.1
1.2.2
2 changes: 1 addition & 1 deletion clouds/postgres/common/python3_requirements.txt
@@ -5,7 +5,7 @@ pep8-naming==0.12.1
brunette==0.2.0
pytest==6.2.4
psycopg2-binary==2.9.1
sqlparse==0.4.4
sqlparse==0.5.0
wheel==0.38.1
tqdm==4.64.0
GeoAlchemy2==0.9.3
5 changes: 5 additions & 0 deletions clouds/redshift/CHANGELOG.md
@@ -4,6 +4,11 @@ CARTO Analytics Toolbox Core for Redshift.

All notable commits to this project will be documented in this file.

## [1.1.1] - 2024-04-18

- docs(processing): update voronoi doc (#492)
- fix(clustering): improve how ST_CLUSTERKMEANS deals with duplicates (#491, #495)

## [1.1.0] - 2024-01-17

- feat(quadbin): add function QUADBIN_DISTANCE (#457)
70 changes: 65 additions & 5 deletions clouds/redshift/libraries/python/lib/clustering/__init__.py
@@ -11,16 +11,73 @@ def load_geom(geom):
    return loads(geom)


def remove_duplicated_coords(arr):
    import numpy as np

    unique_rows = []
    for row in arr:
        if not any(np.array_equal(row, unique_row) for unique_row in unique_rows):
            unique_rows.append(row)
    return np.array(unique_rows)


def reorder_coords(coords):
    import numpy as np

    unique_coords = []
    duplicated_coords = []

    # Split the array into unique and duplicated coordinates
    count_map = {}
    for coord in coords:
        coord_str = tuple(coord)
        if coord_str not in count_map:
            count_map[coord_str] = 1
            unique_coords.append(coord)
        else:
            count_map[coord_str] += 1
            duplicated_coords.append(coord)

    # Convert lists to NumPy arrays for concatenation
    unique_coords = np.array(unique_coords)
    duplicated_coords = np.array(duplicated_coords)

    if unique_coords.size > 0:
        if duplicated_coords.size > 0:
            # Concatenate unique and duplicated coordinates
            return np.concatenate((unique_coords, duplicated_coords))
        else:
            return unique_coords
    else:
        if duplicated_coords.size > 0:
            return duplicated_coords
        else:
            # This should never happen, so just return the input
            return coords


def count_distinct_coords(coords):
    import numpy as np

    count_map = {}
    for coord in coords:
        coord_str = tuple(coord)
        count_map[coord_str] = count_map.get(coord_str, 0) + 1
    return len(count_map)
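
# Illustrative trace (editorial note, not part of the commit):
# reorder_coords(np.array([[0, 0], [0, 0], [1, 1]])) -> [[0, 0], [1, 1], [0, 0]]
# count_distinct_coords of that input == 2, so k is clamped to 2 and KMeans
# can seed from two distinct leading points.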


def clusterkmeanstable(geom, k):
    from .kmeans import KMeans
    import json
    import numpy as np

    geom = load_geom(geom)
    points = geom['_coords']
    coords = np.array(
        [[points[i], points[i + 1]] for i in range(0, len(points) - 1, 2)]
    coords = reorder_coords(
        np.array([[points[i], points[i + 1]] for i in range(0, len(points) - 1, 2)])
    )
    # k cannot be greater than the number of distinct coordinates
    k = min(k, count_distinct_coords(coords))

    cluster_idxs, centers, loss = KMeans()(coords, k)

@@ -39,14 +96,17 @@ def clusterkmeans(geom, k):
    if geom.type != 'MultiPoint':
        raise Exception('Invalid operation: Input points parameter must be MultiPoint.')
    else:
        coords = np.array(list(geojson.utils.coords(geom)))
        coords = reorder_coords(np.array(list(geojson.utils.coords(geom))))
        # k cannot be greater than the number of distinct coordinates
        k = min(k, count_distinct_coords(coords))

        cluster_idxs, centers, loss = KMeans()(coords, k)
        return geojson.dumps(
            [
                {
                    'cluster': cluster_idxs[idx],
                    'geom': {'coordinates': point, 'type': 'Point'},
                    'geom': {'coordinates': point.tolist(), 'type': 'Point'},
                }
                for idx, point in enumerate(geom['coordinates'])
                for idx, point in enumerate(coords)
            ]
        )
4 changes: 3 additions & 1 deletion clouds/redshift/libraries/python/lib/clustering/kmeans.py
@@ -154,7 +154,9 @@ def __call__(
        cluster centers: k x d numpy array, the centers
        loss: final loss value of the objective function of KMeans
        """
        centers = self._init_centers(points, k, **kwargs)
        # centers = self._init_centers(points, k, **kwargs)
        # instead of using random initialization, we will use the first k points
        centers = points[:k]
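        # reorder_coords() upstream places distinct points first and k is clamped
        # to the distinct-coordinate count, so points[:k] yields k distinct centers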
        prev_loss = 0
        for it in range(max_iters):
            cluster_idx = self._update_assignment(centers, points)
2 changes: 1 addition & 1 deletion clouds/redshift/modules/doc/clustering/ST_CLUSTERKMEANS.md
@@ -9,7 +9,7 @@ ST_CLUSTERKMEANS(geog [, numberOfClusters])
Takes a set of points as input and partitions them into clusters using the k-means algorithm. Returns an array of tuples with the cluster index for each of the input features and the input geometry.

* `geog`: `GEOMETRY` points to be clustered.
* `numberOfClusters` (optional): `INT` number of clusters that will be generated. It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`).
* `numberOfClusters` (optional): `INT` number of clusters that will be generated. It defaults to the square root of half the number of points (`sqrt(<NUMBER OF POINTS>/2)`). The output number of clusters cannot be greater than the number of distinct points in `geog`.
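
A minimal usage sketch (illustrative points; assumes the toolbox is installed in a schema named `carto`):

```sql
SELECT carto.ST_CLUSTERKMEANS(
    ST_GEOMFROMTEXT('MULTIPOINT((0 0), (0 1), (5 0))'),
    2
);
```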

**Return type**

6 changes: 6 additions & 0 deletions clouds/redshift/modules/doc/processing/ST_DELAUNAYLINES.md
@@ -10,6 +10,12 @@ Calculates the Delaunay triangulation of the points provided. A MultiLineString

* `points`: `GEOMETRY` MultiPoint input to the Delaunay triangulation.

````hint:warning
**warning**
The maximum number of points typically used to compute Delaunay triangulations is 300,000. This limit ensures efficient computation while maintaining the accuracy of the resulting triangulation.
````

**Return type**

`VARCHAR(MAX)`
6 changes: 6 additions & 0 deletions clouds/redshift/modules/doc/processing/ST_DELAUNAYPOLYGONS.md
@@ -10,6 +10,12 @@ Calculates the Delaunay triangulation of the points provided. A MultiPolygon obj

* `points`: `GEOMETRY` MultiPoint input to the Delaunay triangulation.

````hint:warning
**warning**
The maximum number of points typically used to compute Delaunay triangulations is 300,000. This limit ensures efficient computation while maintaining the accuracy of the resulting triangulation.
````

**Return type**

`VARCHAR(MAX)`