From b61286935f941c34cc1dd30ba33a6952797d924e Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Fri, 13 Dec 2024 00:36:28 -0500 Subject: [PATCH] Initial DataFusion extension crate (#939) ### Change list - Initial UDFs that wrap native Rust code - Array types for Geometry, Box2D, Box3D, Point2D, and Point3D. - Function support matrix in the README compared to PostGIS --- Cargo.lock | 664 +++++++++++++++++- Cargo.toml | 2 +- .../src/algorithm/native/bounding_rect.rs | 78 ++ rust/geoarrow/src/algorithm/native/mod.rs | 1 + rust/geoarrow/src/array/geometry/array.rs | 407 +++++++++-- rust/geoarrow/src/array/rect/builder.rs | 19 + rust/geoarrow/src/datatypes.rs | 6 + rust/geodatafusion/Cargo.toml | 32 + rust/geodatafusion/README.md | 365 ++++++++++ rust/geodatafusion/src/data_types.rs | 52 ++ rust/geodatafusion/src/error.rs | 33 + rust/geodatafusion/src/lib.rs | 3 + rust/geodatafusion/src/udf/geos/mod.rs | 1 + rust/geodatafusion/src/udf/mod.rs | 2 + .../src/udf/native/accessors/coord_dim.rs | 96 +++ .../src/udf/native/accessors/envelope.rs | 73 ++ .../src/udf/native/accessors/line_string.rs | 99 +++ .../src/udf/native/accessors/mod.rs | 12 + .../src/udf/native/bounding_box/box.rs | 70 ++ .../src/udf/native/bounding_box/extrema.rs | 288 ++++++++ .../src/udf/native/bounding_box/mod.rs | 13 + .../src/udf/native/constructors/mod.rs | 9 + .../src/udf/native/constructors/point.rs | 234 ++++++ rust/geodatafusion/src/udf/native/io/mod.rs | 14 + rust/geodatafusion/src/udf/native/io/wkb.rs | 137 ++++ rust/geodatafusion/src/udf/native/io/wkt.rs | 137 ++++ .../src/udf/native/measurement/area.rs | 118 ++++ .../src/udf/native/measurement/mod.rs | 8 + rust/geodatafusion/src/udf/native/mod.rs | 20 + .../src/udf/native/processing/centroid.rs | 125 ++++ .../src/udf/native/processing/convex_hull.rs | 75 ++ .../src/udf/native/processing/mod.rs | 10 + 32 files changed, 3157 insertions(+), 46 deletions(-) create mode 100644 rust/geodatafusion/Cargo.toml create mode 100644 rust/geodatafusion/README.md create mode 100644 rust/geodatafusion/src/data_types.rs create mode 100644 rust/geodatafusion/src/error.rs create mode 100644 rust/geodatafusion/src/lib.rs create mode 100644 rust/geodatafusion/src/udf/geos/mod.rs create mode 100644 rust/geodatafusion/src/udf/mod.rs create mode 100644 rust/geodatafusion/src/udf/native/accessors/coord_dim.rs create mode 100644 rust/geodatafusion/src/udf/native/accessors/envelope.rs create mode 100644 rust/geodatafusion/src/udf/native/accessors/line_string.rs create mode 100644 rust/geodatafusion/src/udf/native/accessors/mod.rs create mode 100644 rust/geodatafusion/src/udf/native/bounding_box/box.rs create mode 100644 rust/geodatafusion/src/udf/native/bounding_box/extrema.rs create mode 100644 rust/geodatafusion/src/udf/native/bounding_box/mod.rs create mode 100644 rust/geodatafusion/src/udf/native/constructors/mod.rs create mode 100644 rust/geodatafusion/src/udf/native/constructors/point.rs create mode 100644 rust/geodatafusion/src/udf/native/io/mod.rs create mode 100644 rust/geodatafusion/src/udf/native/io/wkb.rs create mode 100644 rust/geodatafusion/src/udf/native/io/wkt.rs create mode 100644 rust/geodatafusion/src/udf/native/measurement/area.rs create mode 100644 rust/geodatafusion/src/udf/native/measurement/mod.rs create mode 100644 rust/geodatafusion/src/udf/native/mod.rs create mode 100644 rust/geodatafusion/src/udf/native/processing/centroid.rs create mode 100644 rust/geodatafusion/src/udf/native/processing/convex_hull.rs create mode 100644 rust/geodatafusion/src/udf/native/processing/mod.rs diff --git a/Cargo.lock b/Cargo.lock index f00042c2..e63c7c98 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -97,6 +97,18 @@ dependencies = [ "num-traits", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + [[package]] name = "arrow" version = "53.3.0" @@ -175,6 +187,7 @@ dependencies = [ "atoi", "base64 0.22.1", "chrono", + "comfy-table", "half", "lexical-core 1.0.2", "num", @@ -317,6 +330,24 @@ dependencies = [ "regex-syntax", ] +[[package]] +name = "async-compression" +version = "0.4.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df895a515f70646414f4b45c0b79082783b80552b373a68283012928df56f522" +dependencies = [ + "bzip2", + "flate2", + "futures-core", + "futures-io", + "memchr", + "pin-project-lite", + "tokio", + "xz2", + "zstd", + "zstd-safe", +] + [[package]] name = "async-stream" version = "0.3.6" @@ -465,6 +496,28 @@ dependencies = [ "serde", ] +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8ee0c1824c4dea5b5f81736aff91bae041d2c07ee1192bec91054e10e3e601e" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -476,9 +529,9 @@ dependencies = [ [[package]] name = "brotli" -version = "6.0.0" +version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74f7971dbd9326d58187408ab83117d8ac1bb9c17b085fdacd1cf2f598719b6b" +checksum = "cc97b8f16f944bba54f0433f07e30be199b6dc2bd25937444bbad560bcea29bd" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -519,6 +572,27 @@ version = "1.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "428d9aa8fbc0670b7b8d6030a7fadd0f86151cae55e4dbbece15f3780a3dfaf3" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.11+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "736a955f3fa7875102d57c82b8cac37ec45224a07fd32d58f9f7a186b6cd4cdc" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "c_vec" version = "2.0.0" @@ -665,6 +739,17 @@ dependencies = [ "cc", ] +[[package]] +name = "comfy-table" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24f165e7b643266ea80cb858aed492ad9280e3e05ce24d4a99d7d7b889b6a4d9" +dependencies = [ + "strum", + "strum_macros", + "unicode-width", +] + [[package]] name = "const-oid" version = "0.9.6" @@ -691,6 +776,12 @@ dependencies = [ "tiny-keccak", ] +[[package]] +name = "constant_time_eq" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c74b8349d32d297c9134b8c88677813a227df8f779daa29bfc29c183fe3dca6" + [[package]] name = "core-foundation" version = "0.9.4" @@ -847,6 +938,425 @@ dependencies = [ "memchr", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbba0799cf6913b456ed07a94f0f3b6e12c62a5d88b10809e2284a0f2b915c05" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-ipc", + "arrow-schema", + "async-compression", + "async-trait", + "bytes", + "bzip2", + "chrono", + "dashmap", + "datafusion-catalog", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-sql", + "flate2", + "futures", + "glob", + "half", + "hashbrown 0.14.5", + "indexmap", + "itertools 0.13.0", + "log", + "num_cpus", + "object_store", + "parking_lot", + "parquet", + "paste", + "pin-project-lite", + "rand", + "sqlparser", + "tempfile", + "tokio", + "tokio-util", + "url", + "uuid", + "xz2", + "zstd", +] + +[[package]] +name = "datafusion-catalog" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7493c5c2d40eec435b13d92e5703554f4efc7059451fcb8d3a79580ff0e45560" +dependencies = [ + "arrow-schema", + "async-trait", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", +] + +[[package]] +name = "datafusion-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24953049ebbd6f8964f91f60aa3514e121b5e81e068e33b60e77815ab369b25c" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-schema", + "chrono", + "half", + "hashbrown 0.14.5", + "indexmap", + "instant", + "libc", + "num_cpus", + "object_store", + "parquet", + "paste", + "sqlparser", + "tokio", +] + +[[package]] +name = "datafusion-common-runtime" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f06df4ef76872e11c924d3c814fd2a8dd09905ed2e2195f71c857d78abd19685" +dependencies = [ + "log", + "tokio", +] + +[[package]] +name = "datafusion-execution" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bbdcb628d690f3ce5fea7de81642b514486d58ff9779a51f180a69a4eadb361" +dependencies = [ + "arrow", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-expr", + "futures", + "hashbrown 0.14.5", + "log", + "object_store", + "parking_lot", + "rand", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8036495980e3131f706b7d33ab00b4492d73dc714e3cb74d11b50f9602a73246" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "chrono", + "datafusion-common", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap", + "paste", + "serde_json", + "sqlparser", + "strum", + "strum_macros", +] + +[[package]] +name = "datafusion-expr-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4da0f3cb4669f9523b403d6b5a0ec85023e0ab3bf0183afd1517475b3e64fdd2" +dependencies = [ + "arrow", + "datafusion-common", + "itertools 0.13.0", + "paste", +] + +[[package]] +name = "datafusion-functions" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f52c4012648b34853e40a2c6bcaa8772f837831019b68aca384fb38436dba162" +dependencies = [ + "arrow", + "arrow-buffer", + "base64 0.22.1", + "blake2", + "blake3", + "chrono", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "hashbrown 0.14.5", + "hex", + "itertools 0.13.0", + "log", + "md-5", + "rand", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5b8bb624597ba28ed7446df4a9bd7c7a7bde7c578b6b527da3f47371d5f6741" +dependencies = [ + "ahash", + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "indexmap", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fb06208fc470bc8cf1ce2d9a1159d42db591f2c7264a8c1776b53ad8f675143" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", + "rand", +] + +[[package]] +name = "datafusion-functions-nested" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fca25bbb87323716d05e54114666e942172ccca23c5a507e9c7851db6e965317" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-physical-expr-common", + "itertools 0.13.0", + "log", + "paste", + "rand", +] + +[[package]] +name = "datafusion-functions-window" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ae23356c634e54c59f7c51acb7a5b9f6240ffb2cf997049a1a24a8a88598dbe" +dependencies = [ + "datafusion-common", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4b3d6ff7794acea026de36007077a06b18b89e4f9c3fea7f2215f9f7dd9059b" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-optimizer" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bec6241eb80c595fa0e1a8a6b69686b5cf3bd5fdacb8319582a0943b0bd788aa" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "hashbrown 0.14.5", + "indexmap", + "itertools 0.13.0", + "log", + "paste", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3370357b8fc75ec38577700644e5d1b0bc78f38babab99c0b8bd26bafb3e4335" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "arrow-string", + "chrono", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.14.5", + "indexmap", + "itertools 0.13.0", + "log", + "paste", + "petgraph", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8b7734d94bf2fa6f6e570935b0ddddd8421179ce200065be97874e13d46a47b" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.14.5", + "rand", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eee8c479522df21d7b395640dff88c5ed05361852dce6544d7c98e9dbcebffe" +dependencies = [ + "arrow", + "arrow-schema", + "datafusion-common", + "datafusion-execution", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-plan", + "itertools 0.13.0", +] + +[[package]] +name = "datafusion-physical-plan" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17e1fc2e2c239d14e8556f2622b19a726bf6bc6962cc00c71fc52626274bee24" +dependencies = [ + "ahash", + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.14.5", + "indexmap", + "itertools 0.13.0", + "log", + "once_cell", + "parking_lot", + "pin-project-lite", + "rand", + "tokio", +] + +[[package]] +name = "datafusion-sql" +version = "43.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63e3a4ed41dbee20a5d947a59ca035c225d67dc9cbe869c10f66dcdf25e7ce51" +dependencies = [ + "arrow", + "arrow-array", + "arrow-schema", + "datafusion-common", + "datafusion-expr", + "indexmap", + "log", + "regex", + "sqlparser", + "strum", +] + [[package]] name = "dbase" version = "0.5.0" @@ -992,6 +1502,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "flatbuffers" version = "24.3.25" @@ -1335,6 +1851,28 @@ dependencies = [ "wkt 0.12.0", ] +[[package]] +name = "geodatafusion" +version = "0.1.0-dev" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-schema", + "async-stream", + "async-trait", + "datafusion", + "geo 0.29.3", + "geo-traits", + "geoarrow", + "thiserror", + "tokio", +] + [[package]] name = "geographiclib-rs" version = "0.2.4" @@ -1772,6 +2310,18 @@ dependencies = [ "hashbrown 0.15.2", ] +[[package]] +name = "instant" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222" +dependencies = [ + "cfg-if", + "js-sys", + "wasm-bindgen", + "web-sys", +] + [[package]] name = "integer-encoding" version = "3.0.4" @@ -2073,6 +2623,17 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "md-5" version = "0.10.6" @@ -2246,6 +2807,16 @@ dependencies = [ "libm", ] +[[package]] +name = "num_cpus" +version = "1.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" +dependencies = [ + "hermit-abi 0.3.9", + "libc", +] + [[package]] name = "num_enum" version = "0.7.3" @@ -2396,9 +2967,9 @@ dependencies = [ [[package]] name = "parquet" -version = "53.1.0" +version = "53.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "310c46a70a3ba90d98fec39fa2da6d9d731e544191da6fb56c9d199484d0dd3e" +checksum = "2b449890367085eb65d7d3321540abc3d7babbd179ce31df0016e90719114191" dependencies = [ "ahash", "arrow-array", @@ -2415,7 +2986,7 @@ dependencies = [ "flate2", "futures", "half", - "hashbrown 0.14.5", + "hashbrown 0.15.2", "lz4_flex", "num", "num-bigint", @@ -2466,6 +3037,16 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" +[[package]] +name = "petgraph" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4c5cc86750666a3ed20bdaf5ca2a0344f9c67674cae0515bec2da16fbaa47db" +dependencies = [ + "fixedbitset", + "indexmap", +] + [[package]] name = "phf" version = "0.11.2" @@ -3055,6 +3636,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "rustversion" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e819f2bc632f285be6d7cd36e25940d45b2391dd6d9b939e79de557f7014248" + [[package]] name = "ryu" version = "1.0.18" @@ -3327,6 +3914,27 @@ dependencies = [ "unicode_categories", ] +[[package]] +name = "sqlparser" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fe11944a61da0da3f592e19a45ebe5ab92dc14a779907ff1f08fbb797bfefc7" +dependencies = [ + "log", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.79", +] + [[package]] name = "sqlx" version = "0.7.4" @@ -3551,6 +4159,28 @@ dependencies = [ "unicode-properties", ] +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.79", +] + [[package]] name = "subtle" version = "2.6.1" @@ -3899,6 +4529,12 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +[[package]] +name = "unicode-width" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd" + [[package]] name = "unicode_categories" version = "0.1.1" @@ -3928,6 +4564,15 @@ version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" +[[package]] +name = "uuid" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8c5f0a0af699448548ad1a2fbf920fb4bee257eae39953ba95cb84891a0446a" +dependencies = [ + "getrandom", +] + [[package]] name = "vcpkg" version = "0.2.15" @@ -4342,6 +4987,15 @@ dependencies = [ "rustix", ] +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "zerocopy" version = "0.7.35" diff --git a/Cargo.toml b/Cargo.toml index e9d56a31..6f0487d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["rust/geoarrow"] +members = ["rust/geoarrow", "rust/geodatafusion"] exclude = ["js"] resolver = "2" diff --git a/rust/geoarrow/src/algorithm/native/bounding_rect.rs b/rust/geoarrow/src/algorithm/native/bounding_rect.rs index 12c64b49..9d8198ee 100644 --- a/rust/geoarrow/src/algorithm/native/bounding_rect.rs +++ b/rust/geoarrow/src/algorithm/native/bounding_rect.rs @@ -6,6 +6,11 @@ use geo_traits::{ MultiLineStringTrait, MultiPointTrait, MultiPolygonTrait, PointTrait, PolygonTrait, RectTrait, }; +use crate::array::*; +use crate::datatypes::{Dimension, NativeType}; +use crate::error::Result; +use crate::trait_::ArrayAccessor; + #[derive(Debug, Clone, Copy)] pub struct BoundingRect { minx: f64, @@ -301,4 +306,77 @@ pub fn bounding_rect_rect(geom: &impl RectTrait) -> ([f64; 2], [f64; 2] rect.into() } +/// Calculation of the bounding rectangle of a geometry. +pub trait BoundingRectArray { + type Output; + + fn bounding_rect(&self) -> Self::Output; +} + +/// Implementation that iterates over geo objects +macro_rules! array_impl { + ($type:ty, $bounding_rect_fn:ident) => { + impl BoundingRectArray for $type { + type Output = RectArray; + + fn bounding_rect(&self) -> Self::Output { + let mut builder = RectBuilder::with_capacity_and_options( + Dimension::XY, + self.len(), + self.metadata().clone(), + ); + for geom in self.iter() { + if let Some(geom) = geom { + let ([minx, miny], [maxx, maxy]) = $bounding_rect_fn(&geom); + builder.push_box2d(Some([minx, miny, maxx, maxy])); + } else { + builder.push_null(); + } + } + + builder.finish() + } + } + }; +} + +array_impl!(PointArray, bounding_rect_point); +array_impl!(LineStringArray, bounding_rect_linestring); +array_impl!(PolygonArray, bounding_rect_polygon); +array_impl!(MultiPointArray, bounding_rect_multipoint); +array_impl!(MultiLineStringArray, bounding_rect_multilinestring); +array_impl!(MultiPolygonArray, bounding_rect_multipolygon); +array_impl!(MixedGeometryArray, bounding_rect_geometry); +array_impl!(GeometryCollectionArray, bounding_rect_geometry_collection); +array_impl!(GeometryArray, bounding_rect_geometry); + +impl BoundingRectArray for RectArray { + type Output = RectArray; + + fn bounding_rect(&self) -> Self::Output { + self.clone() + } +} + +impl BoundingRectArray for &dyn NativeArray { + type Output = Result; + + fn bounding_rect(&self) -> Self::Output { + use NativeType::*; + + let result = match self.data_type() { + Point(_, _) => self.as_point().bounding_rect(), + LineString(_, _) => self.as_line_string().bounding_rect(), + Polygon(_, _) => self.as_polygon().bounding_rect(), + MultiPoint(_, _) => self.as_multi_point().bounding_rect(), + MultiLineString(_, _) => self.as_multi_line_string().bounding_rect(), + MultiPolygon(_, _) => self.as_multi_polygon().bounding_rect(), + GeometryCollection(_, _) => self.as_geometry_collection().bounding_rect(), + Geometry(_) => self.as_geometry().bounding_rect(), + Rect(_) => self.as_rect().bounding_rect(), + }; + Ok(result) + } +} + // TODO: add tests from geo diff --git a/rust/geoarrow/src/algorithm/native/mod.rs b/rust/geoarrow/src/algorithm/native/mod.rs index 9509e355..f6a896fe 100644 --- a/rust/geoarrow/src/algorithm/native/mod.rs +++ b/rust/geoarrow/src/algorithm/native/mod.rs @@ -19,6 +19,7 @@ pub(crate) mod type_id; mod unary; pub use binary::Binary; +pub use bounding_rect::BoundingRectArray; pub use cast::Cast; pub use concatenate::Concatenate; pub use downcast::{Downcast, DowncastTable}; diff --git a/rust/geoarrow/src/array/geometry/array.rs b/rust/geoarrow/src/array/geometry/array.rs index 0703b31f..b1c0fce6 100644 --- a/rust/geoarrow/src/array/geometry/array.rs +++ b/rust/geoarrow/src/array/geometry/array.rs @@ -996,49 +996,376 @@ impl TryFrom> for GeometryArray { } } -macro_rules! impl_to_geometry_array { - ($source_array:ty, $typeid_xy:expr, $typeid_xyz:expr, $child_xy:ident, $child_xyz:ident) => { - impl From<$source_array> for GeometryArray { - fn from(value: $source_array) -> Self { - let dim = value.dimension(); - let type_ids = match dim { - Dimension::XY => vec![$typeid_xy; value.len()], - Dimension::XYZ => vec![$typeid_xyz; value.len()], - }; - let mut slf = Self { - data_type: NativeType::Geometry(value.coord_type()), - metadata: value.metadata().clone(), - type_ids: type_ids.into(), - offsets: ScalarBuffer::from_iter(0..value.len() as i32), - ..Default::default() - }; - match dim { - Dimension::XY => { - slf.$child_xy = value; - } - Dimension::XYZ => { - slf.$child_xyz = value; - } - } - slf - } +impl From for GeometryArray { + fn from(value: PointArray) -> Self { + let dim = value.dimension(); + let type_ids = match dim { + Dimension::XY => vec![1; value.len()], + Dimension::XYZ => vec![11; value.len()], + } + .into(); + let offsets = ScalarBuffer::from_iter(0..value.len() as i32); + let metadata = value.metadata().clone(); + match dim { + Dimension::XY => Self::new( + type_ids, + offsets, + Some(value), + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + metadata, + ), + Dimension::XYZ => Self::new( + type_ids, + offsets, + None, + None, + None, + None, + None, + None, + None, + Some(value), + None, + None, + None, + None, + None, + None, + metadata, + ), + } + } +} + +impl From for GeometryArray { + fn from(value: LineStringArray) -> Self { + let dim = value.dimension(); + let type_ids = match dim { + Dimension::XY => vec![2; value.len()], + Dimension::XYZ => vec![12; value.len()], + } + .into(); + let offsets = ScalarBuffer::from_iter(0..value.len() as i32); + let metadata = value.metadata().clone(); + match dim { + Dimension::XY => Self::new( + type_ids, + offsets, + None, + Some(value), + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + metadata, + ), + Dimension::XYZ => Self::new( + type_ids, + offsets, + None, + None, + None, + None, + None, + None, + None, + None, + Some(value), + None, + None, + None, + None, + None, + metadata, + ), + } + } +} + +impl From for GeometryArray { + fn from(value: PolygonArray) -> Self { + let dim = value.dimension(); + let type_ids = match dim { + Dimension::XY => vec![3; value.len()], + Dimension::XYZ => vec![13; value.len()], + } + .into(); + let offsets = ScalarBuffer::from_iter(0..value.len() as i32); + let metadata = value.metadata().clone(); + match dim { + Dimension::XY => Self::new( + type_ids, + offsets, + None, + None, + Some(value), + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + metadata, + ), + Dimension::XYZ => Self::new( + type_ids, + offsets, + None, + None, + None, + None, + None, + None, + None, + None, + None, + Some(value), + None, + None, + None, + None, + metadata, + ), + } + } +} + +impl From for GeometryArray { + fn from(value: MultiPointArray) -> Self { + let dim = value.dimension(); + let type_ids = match dim { + Dimension::XY => vec![4; value.len()], + Dimension::XYZ => vec![14; value.len()], } - }; + .into(); + let offsets = ScalarBuffer::from_iter(0..value.len() as i32); + let metadata = value.metadata().clone(); + match dim { + Dimension::XY => Self::new( + type_ids, + offsets, + None, + None, + None, + Some(value), + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + metadata, + ), + Dimension::XYZ => Self::new( + type_ids, + offsets, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + Some(value), + None, + None, + None, + metadata, + ), + } + } +} + +impl From for GeometryArray { + fn from(value: MultiLineStringArray) -> Self { + let dim = value.dimension(); + let type_ids = match dim { + Dimension::XY => vec![5; value.len()], + Dimension::XYZ => vec![15; value.len()], + } + .into(); + let offsets = ScalarBuffer::from_iter(0..value.len() as i32); + let metadata = value.metadata().clone(); + match dim { + Dimension::XY => Self::new( + type_ids, + offsets, + None, + None, + None, + None, + Some(value), + None, + None, + None, + None, + None, + None, + None, + None, + None, + metadata, + ), + Dimension::XYZ => Self::new( + type_ids, + offsets, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + Some(value), + None, + None, + metadata, + ), + } + } +} + +impl From for GeometryArray { + fn from(value: MultiPolygonArray) -> Self { + let dim = value.dimension(); + let type_ids = match dim { + Dimension::XY => vec![6; value.len()], + Dimension::XYZ => vec![16; value.len()], + } + .into(); + let offsets = ScalarBuffer::from_iter(0..value.len() as i32); + let metadata = value.metadata().clone(); + match dim { + Dimension::XY => Self::new( + type_ids, + offsets, + None, + None, + None, + None, + None, + Some(value), + None, + None, + None, + None, + None, + None, + None, + None, + metadata, + ), + Dimension::XYZ => Self::new( + type_ids, + offsets, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + Some(value), + None, + metadata, + ), + } + } } -impl_to_geometry_array!(PointArray, 1, 11, point_xy, point_xyz); -impl_to_geometry_array!(LineStringArray, 2, 12, line_string_xy, line_string_xy); -impl_to_geometry_array!(PolygonArray, 3, 13, polygon_xy, polygon_xyz); -impl_to_geometry_array!(MultiPointArray, 4, 14, mpoint_xy, mpoint_xyz); -impl_to_geometry_array!( - MultiLineStringArray, - 5, - 15, - mline_string_xy, - mline_string_xyz -); -impl_to_geometry_array!(MultiPolygonArray, 6, 16, mpolygon_xy, mpolygon_xyz); -impl_to_geometry_array!(GeometryCollectionArray, 7, 17, gc_xy, gc_xyz); +impl From for GeometryArray { + fn from(value: GeometryCollectionArray) -> Self { + let dim = value.dimension(); + let type_ids = match dim { + Dimension::XY => vec![7; value.len()], + Dimension::XYZ => vec![17; value.len()], + } + .into(); + let offsets = ScalarBuffer::from_iter(0..value.len() as i32); + let metadata = value.metadata().clone(); + match dim { + Dimension::XY => Self::new( + type_ids, + offsets, + None, + None, + None, + None, + None, + None, + Some(value), + None, + None, + None, + None, + None, + None, + None, + metadata, + ), + Dimension::XYZ => Self::new( + type_ids, + offsets, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + None, + Some(value), + metadata, + ), + } + } +} impl From for GeometryArray { fn from(value: MixedGeometryArray) -> Self { diff --git a/rust/geoarrow/src/array/rect/builder.rs b/rust/geoarrow/src/array/rect/builder.rs index 4136021d..4f589156 100644 --- a/rust/geoarrow/src/array/rect/builder.rs +++ b/rust/geoarrow/src/array/rect/builder.rs @@ -149,6 +149,25 @@ impl RectBuilder { self.push_rect(None::<&Rect>); } + /// Push a 2D box to the builder. + /// + /// The array should be `[minx, miny, maxx, maxy]`. + #[inline] + pub fn push_box2d(&mut self, value: Option<[f64; 4]>) { + if let Some(value) = value { + self.lower + .push_coord(&geo::coord! { x: value[0], y: value[1] }); + self.upper + .push_coord(&geo::coord! { x: value[2], y: value[3] }); + self.validity.append_non_null() + } else { + // Since it's a struct, we still need to push coords when null + self.lower.push_nan_coord(); + self.upper.push_nan_coord(); + self.validity.append_null(); + } + } + /// Create this builder from a iterator of Rects. pub fn from_rects<'a>( geoms: impl ExactSizeIterator + 'a)>, diff --git a/rust/geoarrow/src/datatypes.rs b/rust/geoarrow/src/datatypes.rs index 8857648a..4258ec08 100644 --- a/rust/geoarrow/src/datatypes.rs +++ b/rust/geoarrow/src/datatypes.rs @@ -146,6 +146,12 @@ pub enum NativeType { Geometry(CoordType), } +impl From for DataType { + fn from(value: NativeType) -> Self { + value.to_data_type() + } +} + /// A type enum representing "serialized" GeoArrow geometry types. #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum SerializedType { diff --git a/rust/geodatafusion/Cargo.toml b/rust/geodatafusion/Cargo.toml new file mode 100644 index 00000000..dcbcfc1e --- /dev/null +++ b/rust/geodatafusion/Cargo.toml @@ -0,0 +1,32 @@ + +[package] +name = "geodatafusion" +version = "0.1.0-dev" +authors = ["Kyle Barron "] +edition = "2021" +license = "MIT OR Apache-2.0" +repository = "https://github.com/geoarrow/geoarrow-rs" +description = "Rust implementation of GeoArrow" +categories = ["science::geo"] +rust-version = "1.82" + + +[dependencies] +datafusion = "43" +arrow = { version = "53.3", features = ["ffi"] } +arrow-array = { version = "53.3", features = ["chrono-tz"] } +arrow-buffer = "53.3" +arrow-cast = { version = "53.3" } +arrow-csv = { version = "53", optional = true } +arrow-data = "53.3" +arrow-ipc = "53.3" +arrow-schema = "53.3" +async-stream = { version = "0.3", optional = true } +async-trait = { version = "0.1", optional = true } +geo = "0.29.3" +geo-traits = "0.2" +geoarrow = { path = "../geoarrow", features = ["flatgeobuf"] } +thiserror = "1" + +[dev-dependencies] +tokio = { version = "1.9", features = ["macros", "fs", "rt-multi-thread"] } diff --git a/rust/geodatafusion/README.md b/rust/geodatafusion/README.md new file mode 100644 index 00000000..c8f1cde1 --- /dev/null +++ b/rust/geodatafusion/README.md @@ -0,0 +1,365 @@ +# `geodatafusion` + +Spatial extensions for [Apache DataFusion](https://datafusion.apache.org/), an extensible query engine written in Rust that uses Apache Arrow as its in-memory format. + +## Functions supported + +### Geometry Constructors + +| Name | Implemented | Description | +| --------------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------- | +| ST_Collect | | Creates a GeometryCollection or Multi\* geometry from a set of geometries. | +| ST_LineFromMultiPoint | | Creates a LineString from a MultiPoint geometry. | +| ST_MakeEnvelope | | Creates a rectangular Polygon from minimum and maximum coordinates. | +| ST_MakeLine | | Creates a LineString from Point, MultiPoint, or LineString geometries. | +| ST_MakePoint | ✅ | Creates a 2D, 3DZ or 4D Point. | +| ST_MakePointM | | Creates a Point from X, Y and M values. | +| ST_MakePolygon | | Creates a Polygon from a shell and optional list of holes. | +| ST_Point | ✅ | Creates a Point with X, Y and SRID values. | +| ST_PointZ | | Creates a Point with X, Y, Z and SRID values. | +| ST_PointM | | Creates a Point with X, Y, M and SRID values. | +| ST_PointZM | | Creates a Point with X, Y, Z, M and SRID values. | +| ST_Polygon | | Creates a Polygon from a LineString with a specified SRID. | +| ST_TileEnvelope | | Creates a rectangular Polygon in Web Mercator (SRID:3857) using the XYZ tile system. | +| ST_HexagonGrid | | Returns a set of hexagons and cell indices that completely cover the bounds of the geometry argument. | +| ST_Hexagon | | Returns a single hexagon, using the provided edge size and cell coordinate within the hexagon grid space. | +| ST_SquareGrid | | Returns a set of grid squares and cell indices that completely cover the bounds of the geometry argument. | +| ST_Square | | Returns a single square, using the provided edge size and cell coordinate within the square grid space. | +| ST_Letters | | Returns the input letters rendered as geometry with a default start position at the origin and default text height of 100. | + +### Geometry Accessors + +| Name | Implemented | Description | +| ------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------- | +| GeometryType | | Returns the type of a geometry as text. | +| ST_Boundary | | Returns the boundary of a geometry. | +| ST_BoundingDiagonal | | Returns the diagonal of a geometry's bounding box. | +| ST_CoordDim | ✅ | Return the coordinate dimension of a geometry. | +| ST_Dimension | | Returns the topological dimension of a geometry. | +| ST_Dump | | Returns a set of geometry_dump rows for the components of a geometry. | +| ST_DumpPoints | | Returns a set of geometry_dump rows for the coordinates in a geometry. | +| ST_DumpSegments | | Returns a set of geometry_dump rows for the segments in a geometry. | +| ST_DumpRings | | Returns a set of geometry_dump rows for the exterior and interior rings of a Polygon. | +| ST_EndPoint | | Returns the last point of a LineString or CircularLineString. | +| ST_Envelope | ✅ | Returns a geometry representing the bounding box of a geometry. | +| ST_ExteriorRing | | Returns a LineString representing the exterior ring of a Polygon. | +| ST_GeometryN | | Return an element of a geometry collection. | +| ST_GeometryType | | Returns the SQL-MM type of a geometry as text. | +| ST_HasArc | | Tests if a geometry contains a circular arc | +| ST_InteriorRingN | | Returns the Nth interior ring (hole) of a Polygon. | +| ST_NumCurves | | Return the number of component curves in a CompoundCurve. | +| ST_CurveN | | Returns the Nth component curve geometry of a CompoundCurve. | +| ST_IsClosed | | Tests if a LineStrings's start and end points are coincident. For a PolyhedralSurface tests if it is closed (volumetric). | +| ST_IsCollection | | Tests if a geometry is a geometry collection type. | +| ST_IsEmpty | | Tests if a geometry is empty. | +| ST_IsPolygonCCW | | Tests if Polygons have exterior rings oriented counter-clockwise and interior rings oriented clockwise. | +| ST_IsPolygonCW | | Tests if Polygons have exterior rings oriented clockwise and interior rings oriented counter-clockwise. | +| ST_IsRing | | Tests if a LineString is closed and simple. | +| ST_IsSimple | | Tests if a geometry has no points of self-intersection or self-tangency. | +| ST_M | | Returns the M coordinate of a Point. | +| ST_MemSize | | Returns the amount of memory space a geometry takes. | +| ST_NDims | | Returns the coordinate dimension of a geometry. | +| ST_NPoints | | Returns the number of points (vertices) in a geometry. | +| ST_NRings | | Returns the number of rings in a polygonal geometry. | +| ST_NumGeometries | | Returns the number of elements in a geometry collection. | +| ST_NumInteriorRings | | Returns the number of interior rings (holes) of a Polygon. | +| ST_NumInteriorRing | | Returns the number of interior rings (holes) of a Polygon. Aias for ST_NumInteriorRings | +| ST_NumPatches | | Return the number of faces on a Polyhedral Surface. Will return null for non-polyhedral geometries. | +| ST_NumPoints | | Returns the number of points in a LineString or CircularString. | +| ST_PatchN | | Returns the Nth geometry (face) of a PolyhedralSurface. | +| ST_PointN | | Returns the Nth point in the first LineString or circular LineString in a geometry. | +| ST_Points | | Returns a MultiPoint containing the coordinates of a geometry. | +| ST_StartPoint | ✅ | Returns the first point of a LineString. | +| ST_Summary | | Returns a text summary of the contents of a geometry. | +| ST_X | | Returns the X coordinate of a Point. | +| ST_Y | | Returns the Y coordinate of a Point. | +| ST_Z | | Returns the Z coordinate of a Point. | +| ST_Zmflag | | Returns a code indicating the ZM coordinate dimension of a geometry. | +| ST_HasZ | | Checks if a geometry has a Z dimension. | +| ST_HasM | | Checks if a geometry has an M (measure) dimension. | + +### Geometry Editors + +| Name | Implemented | Description | +| -------------------------------- | ----------- | --------------------------------------------------------------------------------------------------- | +| ST_AddPoint | | Add a point to a LineString. | +| ST_CollectionExtract | | Given a geometry collection, returns a multi-geometry containing only elements of a specified type. | +| ST_CollectionHomogenize | | Returns the simplest representation of a geometry collection. | +| ST_CurveToLine | | Converts a geometry containing curves to a linear geometry. | +| ST_Scroll | | Change start point of a closed LineString. | +| ST_FlipCoordinates | | Returns a version of a geometry with X and Y axis flipped. | +| ST_Force2D | | Force the geometries into a "2-dimensional mode". | +| ST_Force3D | | Force the geometries into XYZ mode. This is an alias for ST_Force3DZ. | +| ST_Force3DZ | | Force the geometries into XYZ mode. | +| ST_Force3DM | | Force the geometries into XYM mode. | +| ST_Force4D | | Force the geometries into XYZM mode. | +| ST_ForceCollection | | Convert the geometry into a GEOMETRYCOLLECTION. | +| ST_ForceCurve | | Upcast a geometry into its curved type, if applicable. | +| ST_ForcePolygonCCW | | Orients all exterior rings counter-clockwise and all interior rings clockwise. | +| ST_ForcePolygonCW | | Orients all exterior rings clockwise and all interior rings counter-clockwise. | +| ST_ForceSFS | | Force the geometries to use SFS 1.1 geometry types only. | +| ST_ForceRHR | | Force the orientation of the vertices in a polygon to follow the Right-Hand-Rule. | +| ST_LineExtend | | Returns a line extended forwards and backwards by specified distances. | +| ST_LineToCurve | | Converts a linear geometry to a curved geometry. | +| ST_Multi | | Return the geometry as a MULTI\* geometry. | +| ST_Normalize | | Return the geometry in its canonical form. | +| ST_Project | | Returns a point projected from a start point by a distance and bearing (azimuth). | +| ST_QuantizeCoordinates | | Sets least significant bits of coordinates to zero | +| ST_RemovePoint | | Remove a point from a linestring. | +| ST_RemoveRepeatedPoints | | Returns a version of a geometry with duplicate points removed. | +| ST_RemoveIrrelevantPointsForView | | Removes points that are irrelevant for rendering a specific rectangluar view of a geometry. | +| ST_RemoveSmallParts | | Removes small parts (polygon rings or linestrings) of a geometry. | +| ST_Reverse | | Return the geometry with vertex order reversed. | +| ST_Segmentize | | Returns a modified geometry/geography having no segment longer than a given distance. | +| ST_SetPoint | | Replace point of a linestring with a given point. | +| ST_ShiftLongitude | | Shifts the longitude coordinates of a geometry between -180..180 and 0..360. | +| ST_WrapX | | Wrap a geometry around an X value. | +| ST_SnapToGrid | | Snap all points of the input geometry to a regular grid. | +| ST_Snap | | Snap segments and vertices of input geometry to vertices of a reference geometry. | +| ST_SwapOrdinates | | Returns a version of the given geometry with given ordinate values swapped. | + +### Geometry Validation + +| Name | Implemented | Description | +| ---------------- | ----------- | -------------------------------------------------------------------------------------------- | +| ST_IsValid | | Tests if a geometry is well-formed in 2D. | +| ST_IsValidDetail | | Returns a valid_detail row stating if a geometry is valid or if not a reason and a location. | +| ST_IsValidReason | | Returns text stating if a geometry is valid, or a reason for invalidity. | +| ST_MakeValid | | Attempts to make an invalid geometry valid without losing vertices. | + +### Geometry Input + +#### Well-Known Text (WKT) + +| Name | Implemented | Description | +| -------------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | +| ST_BdPolyFromText | | Construct a Polygon given an arbitrary collection of closed linestrings as a MultiLineString Well-Known text representation. | +| ST_BdMPolyFromText | | Construct a MultiPolygon given an arbitrary collection of closed linestrings as a MultiLineString text representation Well-Known text representation. | +| ST_GeogFromText | | Return a specified geography value from Well-Known Text representation or extended (WKT). | +| ST_GeographyFromText | | Return a specified geography value from Well-Known Text representation or extended (WKT). | +| ST_GeomCollFromText | | Makes a collection Geometry from collection WKT with the given SRID. If SRID is not given, it defaults to 0. | +| ST_GeomFromEWKT | | Return a specified ST_Geometry value from Extended Well-Known Text representation (EWKT). | +| ST_GeomFromMARC21 | | Takes MARC21/XML geographic data as input and returns a PostGIS geometry object. | +| ST_GeometryFromText | | Return a specified ST_Geometry value from Well-Known Text representation (WKT). This is an alias name for ST_GeomFromText | +| ST_GeomFromText | ✅ | Return a specified ST_Geometry value from Well-Known Text representation (WKT). | +| ST_LineFromText | | Makes a Geometry from WKT representation with the given SRID. If SRID is not given, it defaults to 0. | +| ST_MLineFromText | | Return a specified ST_MultiLineString value from WKT representation. | +| ST_MPointFromText | | Makes a Geometry from WKT with the given SRID. If SRID is not given, it defaults to 0. | +| ST_MPolyFromText | | Makes a MultiPolygon Geometry from WKT with the given SRID. If SRID is not given, it defaults to 0. | +| ST_PointFromText | | Makes a point Geometry from WKT with the given SRID. If SRID is not given, it defaults to unknown. | +| ST_PolygonFromText | | Makes a Geometry from WKT with the given SRID. If SRID is not given, it defaults to 0. | +| ST_WKTToSQL | | Return a specified ST_Geometry value from Well-Known Text representation (WKT). This is an alias name for ST_GeomFromText | + +#### Well-Known Binary (WKB) + +| Name | Implemented | Description | +| -------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------- | +| ST_GeogFromWKB | | Creates a geography instance from a Well-Known Binary geometry representation (WKB) or extended Well Known Binary (EWKB). | +| ST_GeomFromEWKB | | Return a specified ST_Geometry value from Extended Well-Known Binary representation (EWKB). | +| ST_GeomFromWKB | ✅ | Creates a geometry instance from a Well-Known Binary geometry representation (WKB) and optional SRID. | +| ST_LineFromWKB | | Makes a LINESTRING from WKB with the given SRID | +| ST_LinestringFromWKB | | Makes a geometry from WKB with the given SRID. | +| ST_PointFromWKB | | Makes a geometry from WKB with the given SRID | +| ST_WKBToSQL | | Return a specified ST_Geometry value from Well-Known Binary representation (WKB). This is an alias name for ST_GeomFromWKB that takes no srid | + +#### Other Formats + +| Name | Implemented | Description | +| -------------------------- | ----------- | ------------------------------------------------------------------------------------------------------ | +| ST_Box2dFromGeoHash | | Return a BOX2D from a GeoHash string. | +| ST_GeomFromGeoHash | | Return a geometry from a GeoHash string. | +| ST_GeomFromGML | | Takes as input GML representation of geometry and outputs a PostGIS geometry object | +| ST_GeomFromGeoJSON | | Takes as input a geojson representation of a geometry and outputs a PostGIS geometry object | +| ST_GeomFromKML | | Takes as input KML representation of geometry and outputs a PostGIS geometry object | +| ST_GeomFromTWKB | | Creates a geometry instance from a TWKB ("Tiny Well-Known Binary") geometry representation. | +| ST_GMLToSQL | | Return a specified ST_Geometry value from GML representation. This is an alias name for ST_GeomFromGML | +| ST_LineFromEncodedPolyline | | Creates a LineString from an Encoded Polyline. | +| ST_PointFromGeoHash | | Return a point from a GeoHash string. | +| ST_FromFlatGeobufToTable | | Creates a table based on the structure of FlatGeobuf data. | +| ST_FromFlatGeobuf | | Reads FlatGeobuf data. | + +### Geometry Output + +#### Well-Known Text (WKT) + +| Name | Implemented | Description | +| --------- | ----------- | ------------------------------------------------------------------------------------------------ | +| ST_AsEWKT | | Return the Well-Known Text (WKT) representation of the geometry with SRID meta data. | +| ST_AsText | ✅ | Return the Well-Known Text (WKT) representation of the geometry/geography without SRID metadata. | + +#### Well-Known Binary (WKB) + +| Name | Implemented | Description | +| ------------ | ----------- | ------------------------------------------------------------------------------------------------------------- | +| ST_AsBinary | ✅ | Return the OGC/ISO Well-Known Binary (WKB) representation of the geometry/geography without SRID meta data. | +| ST_AsEWKB | | Return the Extended Well-Known Binary (EWKB) representation of the geometry with SRID meta data. | +| ST_AsHEXEWKB | | Returns a Geometry in HEXEWKB format (as text) using either little-endian (NDR) or big-endian (XDR) encoding. | + +#### Other Formats + +| Name | Implemented | Description | +| -------------------- | ----------- | ------------------------------------------------------------------------------------- | +| ST_AsEncodedPolyline | | Returns an Encoded Polyline from a LineString geometry. | +| ST_AsFlatGeobuf | | Return a FlatGeobuf representation of a set of rows. | +| ST_AsGeobuf | | Return a Geobuf representation of a set of rows. | +| ST_AsGeoJSON | | Return a geometry or feature in GeoJSON format. | +| ST_AsGML | | Return the geometry as a GML version 2 or 3 element. | +| ST_AsKML | | Return the geometry as a KML element. | +| ST_AsLatLonText | | Return the Degrees, Minutes, Seconds representation of the given point. | +| ST_AsMARC21 | | Returns geometry as a MARC21/XML record with a geographic datafield (034). | +| ST_AsMVTGeom | | Transforms a geometry into the coordinate space of a MVT tile. | +| ST_AsMVT | | Aggregate function returning a MVT representation of a set of rows. | +| ST_AsSVG | | Returns SVG path data for a geometry. | +| ST_AsTWKB | | Returns the geometry as TWKB, aka "Tiny Well-Known Binary" | +| ST_AsX3D | | Returns a Geometry in X3D xml node element format: ISO-IEC-19776-1.2-X3DEncodings-XML | +| ST_GeoHash | | Return a GeoHash representation of the geometry. | + +### Operators + +### Spatial Relationships + +### Measurement Functions + +| Name | Implemented | Description | +| ----------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------ | +| ST_Area | | Returns the area of a polygonal geometry. | +| ST_Azimuth | | Returns the north-based azimuth of a line between two points. | +| ST_Angle | | Returns the angle between two vectors defined by 3 or 4 points, or 2 lines. | +| ST_ClosestPoint | | Returns the 2D point on g1 that is closest to g2. This is the first point of the shortest line from one geometry to the other. | +| ST_3DClosestPoint | | Returns the 3D point on g1 that is closest to g2. This is the first point of the 3D shortest line. | +| ST_Distance | | Returns the distance between two geometry or geography values. | +| ST_3DDistance | | Returns the 3D cartesian minimum distance (based on spatial ref) between two geometries in projected units. | +| ST_DistanceSphere | | Returns minimum distance in meters between two lon/lat geometries using a spherical earth model. | +| ST_DistanceSpheroid | | Returns the minimum distance between two lon/lat geometries using a spheroidal earth model. | +| ST_FrechetDistance | | Returns the Fréchet distance between two geometries. | +| ST_HausdorffDistance | | Returns the Hausdorff distance between two geometries. | +| ST_Length | | Returns the 2D length of a linear geometry. | +| ST_Length2D | | Returns the 2D length of a linear geometry. Alias for ST_Length | +| ST_3DLength | | Returns the 3D length of a linear geometry. | +| ST_LengthSpheroid | | Returns the 2D or 3D length/perimeter of a lon/lat geometry on a spheroid. | +| ST_LongestLine | | Returns the 2D longest line between two geometries. | +| ST_3DLongestLine | | Returns the 3D longest line between two geometries | +| ST_MaxDistance | | Returns the 2D largest distance between two geometries in projected units. | +| ST_3DMaxDistance | | Returns the 3D cartesian maximum distance (based on spatial ref) between two geometries in projected units. | +| ST_MinimumClearance | | Returns the minimum clearance of a geometry, a measure of a geometry's robustness. | +| ST_MinimumClearanceLine | | Returns the two-point LineString spanning a geometry's minimum clearance. | +| ST_Perimeter | | Returns the length of the boundary of a polygonal geometry or geography. | +| ST_Perimeter2D | | Returns the 2D perimeter of a polygonal geometry. Alias for ST_Perimeter. | +| ST_3DPerimeter | | Returns the 3D perimeter of a polygonal geometry. | +| ST_ShortestLine | | Returns the 2D shortest line between two geometries | +| ST_3DShortestLine | | Returns the 3D shortest line between two geometries | + +### Overlay Functions + +| Name | Implemented | Description | +| ---------------- | ----------- | ------------------------------------------------------------------------------------------- | +| ST_ClipByBox2D | | Computes the portion of a geometry falling within a rectangle. | +| ST_Difference | | Computes a geometry representing the part of geometry A that does not intersect geometry B. | +| ST_Intersection | | Computes a geometry representing the shared portion of geometries A and B. | +| ST_MemUnion | | Aggregate function which unions geometries in a memory-efficent but slower way | +| ST_Node | | Nodes a collection of lines. | +| ST_Split | | Returns a collection of geometries created by splitting a geometry by another geometry. | +| ST_Subdivide | | Computes a rectilinear subdivision of a geometry. | +| ST_SymDifference | | Computes a geometry representing the portions of geometries A and B that do not intersect. | +| ST_UnaryUnion | | Computes the union of the components of a single geometry. | +| ST_Union | | Computes a geometry representing the point-set union of the input geometries. | + +### Geometry Processing + +| Name | Implemented | Description | +| --------------------------- | ----------- | ------------------------------------------------------------------------------------------------- | +| ST_Buffer | | Computes a geometry covering all points within a given distance from a geometry. | +| ST_BuildArea | | Creates a polygonal geometry formed by the linework of a geometry. | +| ST_Centroid | ✅ | Returns the geometric center of a geometry. | +| ST_ChaikinSmoothing | | Returns a smoothed version of a geometry, using the Chaikin algorithm | +| ST_ConcaveHull | | Computes a possibly concave geometry that contains all input geometry vertices | +| ST_ConvexHull | ✅ | Computes the convex hull of a geometry. | +| ST_DelaunayTriangles | | Returns the Delaunay triangulation of the vertices of a geometry. | +| ST_FilterByM | | Removes vertices based on their M value | +| ST_GeneratePoints | | Generates a multipoint of random points contained in a Polygon or MultiPolygon. | +| ST_GeometricMedian | | Returns the geometric median of a MultiPoint. | +| ST_LineMerge | | Return the lines formed by sewing together a MultiLineString. | +| ST_MaximumInscribedCircle | | Computes the largest circle contained within a geometry. | +| ST_LargestEmptyCircle | | Computes the largest circle not overlapping a geometry. | +| ST_MinimumBoundingCircle | | Returns the smallest circle polygon that contains a geometry. | +| ST_MinimumBoundingRadius | | Returns the center point and radius of the smallest circle that contains a geometry. | +| ST_OrientedEnvelope | | Returns a minimum-area rectangle containing a geometry. | +| ST_OffsetCurve | | Returns an offset line at a given distance and side from an input line. | +| ST_PointOnSurface | | Computes a point guaranteed to lie in a polygon, or on a geometry. | +| ST_Polygonize | | Computes a collection of polygons formed from the linework of a set of geometries. | +| ST_ReducePrecision | | Returns a valid geometry with points rounded to a grid tolerance. | +| ST_SharedPaths | | Returns a collection containing paths shared by the two input linestrings/multilinestrings. | +| ST_Simplify | | Returns a simplified representation of a geometry, using the Douglas-Peucker algorithm. | +| ST_SimplifyPreserveTopology | | Returns a simplified and valid representation of a geometry, using the Douglas-Peucker algorithm. | +| ST_SimplifyPolygonHull | | Computes a simplifed topology-preserving outer or inner hull of a polygonal geometry. | +| ST_SimplifyVW | | Returns a simplified representation of a geometry, using the Visvalingam-Whyatt algorithm | +| ST_SetEffectiveArea | | Sets the effective area for each vertex, using the Visvalingam-Whyatt algorithm. | +| ST_TriangulatePolygon | | Computes the constrained Delaunay triangulation of polygons | +| ST_VoronoiLines | | Returns the boundaries of the Voronoi diagram of the vertices of a geometry. | +| ST_VoronoiPolygons | | Returns the cells of the Voronoi diagram of the vertices of a geometry. | + +### Coverages + +| Name | Implemented | Description | +| ----------------------- | ----------- | ------------------------------------------------------------------------------------ | +| ST_CoverageInvalidEdges | | Window function that finds locations where polygons fail to form a valid coverage. | +| ST_CoverageSimplify | | Window function that simplifies the edges of a polygonal coverage. | +| ST_CoverageUnion | | Computes the union of a set of polygons forming a coverage by removing shared edges. | + +### Affine Transformations + +| Name | Implemented | Description | +| ------------- | ----------- | -------------------------------------------------------------- | +| ST_Affine | | Apply a 3D affine transformation to a geometry. | +| ST_Rotate | | Rotates a geometry about an origin point. | +| ST_RotateX | | Rotates a geometry about the X axis. | +| ST_RotateY | | Rotates a geometry about the Y axis. | +| ST_RotateZ | | Rotates a geometry about the Z axis. | +| ST_Scale | | Scales a geometry by given factors. | +| ST_Translate | | Translates a geometry by given offsets. | +| ST_TransScale | | Translates and scales a geometry by given offsets and factors. | + +### Clustering Functions + +| Name | Implemented | Description | +| ------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------- | +| ST_ClusterDBSCAN | | Window function that returns a cluster id for each input geometry using the DBSCAN algorithm. | +| ST_ClusterIntersecting | | Aggregate function that clusters input geometries into connected sets. | +| ST_ClusterIntersectingWin | | Window function that returns a cluster id for each input geometry, clustering input geometries into connected sets. | +| ST_ClusterKMeans | | Window function that returns a cluster id for each input geometry using the K-means algorithm. | +| ST_ClusterWithin | | Aggregate function that clusters geometries by separation distance. | +| ST_ClusterWithinWin | | Window function that returns a cluster id for each input geometry, clustering using separation distance. | + +### Bounding Box Functions + +| Name | Implemented | Description | +| ------------------ | ----------- | ------------------------------------------------------------------------ | +| Box2D | ✅ | Returns a BOX2D representing the 2D extent of a geometry. | +| Box3D | | Returns a BOX3D representing the 3D extent of a geometry. | +| ST_EstimatedExtent | | Returns the estimated extent of a spatial table. | +| ST_Expand | | Returns a bounding box expanded from another bounding box or a geometry. | +| ST_Extent | | Aggregate function that returns the bounding box of geometries. | +| ST_3DExtent | | Aggregate function that returns the 3D bounding box of geometries. | +| ST_MakeBox2D | | Creates a BOX2D defined by two 2D point geometries. | +| ST_3DMakeBox | | Creates a BOX3D defined by two 3D point geometries. | +| ST_XMax | ✅ | Returns the X maxima of a 2D or 3D bounding box or a geometry. | +| ST_XMin | ✅ | Returns the X minima of a 2D or 3D bounding box or a geometry. | +| ST_YMax | ✅ | Returns the Y maxima of a 2D or 3D bounding box or a geometry. | +| ST_YMin | ✅ | Returns the Y minima of a 2D or 3D bounding box or a geometry. | +| ST_ZMax | | Returns the Z maxima of a 2D or 3D bounding box or a geometry. | +| ST_ZMin | | Returns the Z minima of a 2D or 3D bounding box or a geometry. | + +### Linear Referencing + +| Name | Implemented | Description | +| -------------------------- | ----------- | -------------------------------------------------------------------------- | +| ST_LineInterpolatePoint | | Returns a point interpolated along a line at a fractional location. | +| ST_3DLineInterpolatePoint | | Returns a point interpolated along a 3D line at a fractional location. | +| ST_LineInterpolatePoints | | Returns points interpolated along a line at a fractional interval. | +| ST_LineLocatePoint | | Returns the fractional location of the closest point on a line to a point. | +| ST_LineSubstring | | Returns the part of a line between two fractional locations. | +| ST_LocateAlong | | Returns the point(s) on a geometry that match a measure value. | +| ST_LocateBetween | | Returns the portions of a geometry that match a measure range. | +| ST_LocateBetweenElevations | | Returns the portions of a geometry that lie in an elevation (Z) range. | +| ST_InterpolatePoint | | Returns the interpolated measure of a geometry closest to a point. | +| ST_AddMeasure | | Interpolates measures along a linear geometry. | diff --git a/rust/geodatafusion/src/data_types.rs b/rust/geodatafusion/src/data_types.rs new file mode 100644 index 00000000..70959788 --- /dev/null +++ b/rust/geodatafusion/src/data_types.rs @@ -0,0 +1,52 @@ +use std::sync::Arc; + +use arrow_array::ArrayRef; +use datafusion::error::DataFusionError; +use datafusion::logical_expr::{Signature, Volatility}; +use geoarrow::array::{CoordType, GeometryArray, PointArray, RectArray}; +use geoarrow::datatypes::{Dimension, NativeType}; +use geoarrow::NativeArray; + +use crate::error::GeoDataFusionResult; + +pub const POINT2D_TYPE: NativeType = NativeType::Point(CoordType::Separated, Dimension::XY); +pub const POINT3D_TYPE: NativeType = NativeType::Point(CoordType::Separated, Dimension::XYZ); +pub const BOX2D_TYPE: NativeType = NativeType::Rect(Dimension::XY); +pub const BOX3D_TYPE: NativeType = NativeType::Rect(Dimension::XYZ); +pub const GEOMETRY_TYPE: NativeType = NativeType::Geometry(CoordType::Separated); + +pub(crate) fn any_single_geometry_type_input() -> Signature { + Signature::uniform( + 1, + vec![ + POINT2D_TYPE.into(), + POINT3D_TYPE.into(), + BOX2D_TYPE.into(), + BOX3D_TYPE.into(), + GEOMETRY_TYPE.into(), + ], + Volatility::Immutable, + ) +} + +/// This will not cast a PointArray to a GeometryArray +pub(crate) fn parse_to_native_array(array: ArrayRef) -> GeoDataFusionResult> { + let data_type = array.data_type(); + if data_type.equals_datatype(&POINT2D_TYPE.into()) { + let point_array = PointArray::try_from((array.as_ref(), Dimension::XY))?; + Ok(Arc::new(point_array)) + } else if data_type.equals_datatype(&POINT3D_TYPE.into()) { + let point_array = PointArray::try_from((array.as_ref(), Dimension::XYZ))?; + Ok(Arc::new(point_array)) + } else if data_type.equals_datatype(&BOX2D_TYPE.into()) { + let rect_array = RectArray::try_from((array.as_ref(), Dimension::XY))?; + Ok(Arc::new(rect_array)) + } else if data_type.equals_datatype(&BOX3D_TYPE.into()) { + let rect_array = RectArray::try_from((array.as_ref(), Dimension::XYZ))?; + Ok(Arc::new(rect_array)) + } else if data_type.equals_datatype(&GEOMETRY_TYPE.into()) { + Ok(Arc::new(GeometryArray::try_from(array.as_ref())?)) + } else { + Err(DataFusionError::Execution(format!("Unexpected input data type: {}", data_type)).into()) + } +} diff --git a/rust/geodatafusion/src/error.rs b/rust/geodatafusion/src/error.rs new file mode 100644 index 00000000..a219e01e --- /dev/null +++ b/rust/geodatafusion/src/error.rs @@ -0,0 +1,33 @@ +//! Defines [`GeoArrowError`], representing all errors returned by this crate. + +use arrow_schema::ArrowError; +use datafusion::error::DataFusionError; +use geoarrow::error::GeoArrowError; +use std::fmt::Debug; +use thiserror::Error; + +/// Enum with all errors in this crate. +#[derive(Error, Debug)] +pub(crate) enum GeoDataFusionError { + #[error(transparent)] + Arrow(#[from] ArrowError), + + #[error(transparent)] + DataFusion(#[from] DataFusionError), + + #[error(transparent)] + GeoArrow(#[from] GeoArrowError), +} + +/// Crate-specific result type. +pub(crate) type GeoDataFusionResult = std::result::Result; + +impl From for DataFusionError { + fn from(value: GeoDataFusionError) -> Self { + match value { + GeoDataFusionError::Arrow(err) => DataFusionError::ArrowError(err, None), + GeoDataFusionError::DataFusion(err) => err, + GeoDataFusionError::GeoArrow(err) => DataFusionError::External(Box::new(err)), + } + } +} diff --git a/rust/geodatafusion/src/lib.rs b/rust/geodatafusion/src/lib.rs new file mode 100644 index 00000000..7581c8a9 --- /dev/null +++ b/rust/geodatafusion/src/lib.rs @@ -0,0 +1,3 @@ +pub(crate) mod data_types; +pub(crate) mod error; +pub mod udf; diff --git a/rust/geodatafusion/src/udf/geos/mod.rs b/rust/geodatafusion/src/udf/geos/mod.rs new file mode 100644 index 00000000..2491e36c --- /dev/null +++ b/rust/geodatafusion/src/udf/geos/mod.rs @@ -0,0 +1 @@ +//! User-defined functions that wrap the [geos] crate. diff --git a/rust/geodatafusion/src/udf/mod.rs b/rust/geodatafusion/src/udf/mod.rs new file mode 100644 index 00000000..1a7a9bd7 --- /dev/null +++ b/rust/geodatafusion/src/udf/mod.rs @@ -0,0 +1,2 @@ +pub mod geos; +pub mod native; diff --git a/rust/geodatafusion/src/udf/native/accessors/coord_dim.rs b/rust/geodatafusion/src/udf/native/accessors/coord_dim.rs new file mode 100644 index 00000000..531c25d2 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/accessors/coord_dim.rs @@ -0,0 +1,96 @@ +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow::array::UInt8Builder; +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ColumnarValue, Documentation, ScalarUDFImpl, Signature}; +use datafusion::scalar::ScalarValue; +use geo_traits::{GeometryTrait, PointTrait}; +use geoarrow::array::AsNativeArray; +use geoarrow::datatypes::NativeType; +use geoarrow::trait_::ArrayAccessor; + +use crate::data_types::{any_single_geometry_type_input, parse_to_native_array}; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct CoordDim { + signature: Signature, +} + +impl CoordDim { + pub fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for CoordDim { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_coorddim" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(DataType::UInt8) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(coord_dim_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Return the coordinate dimension of the ST_Geometry value.") + .with_argument("g1", "geometry") + .build() + .unwrap() + })) + } +} + +fn coord_dim_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let native_array = parse_to_native_array(array)?; + + match native_array.data_type() { + NativeType::Point(_, _) => { + let array_ref = native_array.as_ref(); + let arr = array_ref.as_point(); + let mut output_array = UInt8Builder::with_capacity(native_array.len()); + for geom in arr.iter() { + output_array.append_option(geom.map(|g| g.dim().size().try_into().unwrap())); + } + Ok(ColumnarValue::Array(Arc::new(output_array.finish()))) + } + NativeType::Rect(dim) => Ok(ColumnarValue::Scalar(ScalarValue::UInt8(Some( + dim.size().try_into().unwrap(), + )))), + NativeType::Geometry(_) => { + let array_ref = native_array.as_ref(); + let arr = array_ref.as_geometry(); + let mut output_array = UInt8Builder::with_capacity(native_array.len()); + for geom in arr.iter() { + output_array.append_option(geom.map(|g| g.dim().size().try_into().unwrap())); + } + Ok(ColumnarValue::Array(Arc::new(output_array.finish()))) + } + _ => unreachable!(), + } +} diff --git a/rust/geodatafusion/src/udf/native/accessors/envelope.rs b/rust/geodatafusion/src/udf/native/accessors/envelope.rs new file mode 100644 index 00000000..29b7438b --- /dev/null +++ b/rust/geodatafusion/src/udf/native/accessors/envelope.rs @@ -0,0 +1,73 @@ +use std::any::Any; +use std::sync::OnceLock; + +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ColumnarValue, Documentation, ScalarUDFImpl, Signature}; +use geoarrow::algorithm::native::BoundingRectArray; +use geoarrow::ArrayBase; + +use crate::data_types::{any_single_geometry_type_input, parse_to_native_array, GEOMETRY_TYPE}; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct Envelope { + signature: Signature, +} + +impl Envelope { + pub fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Envelope { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_envelope" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(GEOMETRY_TYPE.into()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(envelope_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description( + "Computes a point which is the geometric center of mass of a geometry.", + ) + .with_argument("g1", "geometry") + .build() + .unwrap() + })) + } +} + +fn envelope_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let native_array = parse_to_native_array(array)?; + // Since a RectArray is a valid normal geometry type for us, we don't have to cast it to a + // Geometry array. That just has overhead. + let output = native_array.as_ref().bounding_rect()?; + Ok(output.into_array_ref().into()) +} diff --git a/rust/geodatafusion/src/udf/native/accessors/line_string.rs b/rust/geodatafusion/src/udf/native/accessors/line_string.rs new file mode 100644 index 00000000..7375fbbb --- /dev/null +++ b/rust/geodatafusion/src/udf/native/accessors/line_string.rs @@ -0,0 +1,99 @@ +//! Accessors from LineString geometries + +use std::any::Any; +use std::sync::OnceLock; + +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ColumnarValue, Documentation, ScalarUDFImpl, Signature}; +use geo_traits::LineStringTrait; +use geoarrow::array::{AsNativeArray, CoordType, PointBuilder}; +use geoarrow::datatypes::Dimension; +use geoarrow::error::GeoArrowError; +use geoarrow::scalar::Geometry; +use geoarrow::trait_::ArrayAccessor; +use geoarrow::ArrayBase; + +use crate::data_types::{any_single_geometry_type_input, parse_to_native_array, POINT2D_TYPE}; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct StartPoint { + signature: Signature, +} + +impl StartPoint { + pub fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static START_POINT_DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for StartPoint { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_startpoint" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(POINT2D_TYPE.into()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(start_point_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(START_POINT_DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description( + "Returns the first point of a LINESTRING geometry as a POINT. Returns NULL if the input is not a LINESTRING", + ) + .with_argument("g1", "geometry") + .build() + .unwrap() + })) + } +} + +fn start_point_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let native_array = parse_to_native_array(array)?; + let native_array_ref = native_array.as_ref(); + let geometry_array = native_array_ref + .as_geometry_opt() + .ok_or(GeoArrowError::General( + "Expected Geometry-typed array in ST_StartPoint".to_string(), + ))?; + + let mut output_builder = PointBuilder::with_capacity_and_options( + Dimension::XY, + geometry_array.len(), + CoordType::Separated, + Default::default(), + ); + + for geom in geometry_array.iter() { + if let Some(Geometry::LineString(line_string)) = geom { + output_builder.push_coord(line_string.coord(0).as_ref()); + } else { + output_builder.push_null(); + } + } + + Ok(output_builder.finish().into_array_ref().into()) +} diff --git a/rust/geodatafusion/src/udf/native/accessors/mod.rs b/rust/geodatafusion/src/udf/native/accessors/mod.rs new file mode 100644 index 00000000..8118b6d0 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/accessors/mod.rs @@ -0,0 +1,12 @@ +mod coord_dim; +mod envelope; +mod line_string; + +use datafusion::prelude::SessionContext; + +/// Register all provided [geo] functions for constructing geometries +pub fn register_udfs(ctx: &SessionContext) { + ctx.register_udf(coord_dim::CoordDim::new().into()); + ctx.register_udf(envelope::Envelope::new().into()); + ctx.register_udf(line_string::StartPoint::new().into()); +} diff --git a/rust/geodatafusion/src/udf/native/bounding_box/box.rs b/rust/geodatafusion/src/udf/native/bounding_box/box.rs new file mode 100644 index 00000000..c5d2c6ae --- /dev/null +++ b/rust/geodatafusion/src/udf/native/bounding_box/box.rs @@ -0,0 +1,70 @@ +use std::any::Any; +use std::sync::OnceLock; + +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ColumnarValue, Documentation, ScalarUDFImpl, Signature}; +use geoarrow::algorithm::native::BoundingRectArray; +use geoarrow::ArrayBase; + +use crate::data_types::{any_single_geometry_type_input, parse_to_native_array, BOX2D_TYPE}; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct Box2D { + signature: Signature, +} + +impl Box2D { + pub fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Box2D { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_box2d" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(BOX2D_TYPE.into()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(box2d_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Returns a box2d representing the 2D extent of the geometry.") + .with_argument("geom", "geometry") + .build() + .unwrap() + })) + } +} + +// Note: this is exactly the same impl as ST_Envelope. Perhaps we should use an alias instead +fn box2d_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let native_array = parse_to_native_array(array)?; + let output = native_array.as_ref().bounding_rect()?; + Ok(output.into_array_ref().into()) +} diff --git a/rust/geodatafusion/src/udf/native/bounding_box/extrema.rs b/rust/geodatafusion/src/udf/native/bounding_box/extrema.rs new file mode 100644 index 00000000..62dbb361 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/bounding_box/extrema.rs @@ -0,0 +1,288 @@ +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow::array::Float64Builder; +use arrow_array::ArrayRef; +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ColumnarValue, Documentation, ScalarUDFImpl, Signature}; +use geo_traits::{CoordTrait, RectTrait}; +use geoarrow::algorithm::native::BoundingRectArray; +use geoarrow::array::RectArray; +use geoarrow::trait_::ArrayAccessor; + +use crate::data_types::{any_single_geometry_type_input, parse_to_native_array}; +use crate::error::GeoDataFusionResult; + +fn rect_array_from_array_ref(array: ArrayRef) -> GeoDataFusionResult { + let native_arr = parse_to_native_array(array)?; + Ok(native_arr.as_ref().bounding_rect()?) +} + +#[derive(Debug)] +pub(super) struct XMin { + signature: Signature, +} + +impl XMin { + pub(super) fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static XMIN_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for XMin { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_xmin" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(DataType::Float64) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + let arg = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let mut output_array = Float64Builder::with_capacity(arg.len()); + + let rect_array = rect_array_from_array_ref(arg)?; + + for rect in rect_array.iter() { + output_array.append_option(rect.map(|r| r.min().x())); + } + Ok(ColumnarValue::from( + Arc::new(output_array.finish()) as ArrayRef + )) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(XMIN_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Returns X minima of a bounding box 2d or 3d or a geometry") + .with_syntax_example("ST_XMin(geometry)") + .with_argument("box", "The geometry or box input") + .with_related_udf("st_xmin") + .with_related_udf("st_ymin") + .with_related_udf("st_zmin") + .with_related_udf("st_xmax") + .with_related_udf("st_ymax") + .with_related_udf("st_zmax") + .build() + .unwrap() + })) + } +} + +#[derive(Debug)] +pub(super) struct YMin { + signature: Signature, +} + +impl YMin { + pub(super) fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static YMIN_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for YMin { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_ymin" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(DataType::Float64) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + let arg = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let mut output_array = Float64Builder::with_capacity(arg.len()); + + let rect_array = rect_array_from_array_ref(arg)?; + + for rect in rect_array.iter() { + output_array.append_option(rect.map(|r| r.min().y())); + } + Ok(ColumnarValue::from( + Arc::new(output_array.finish()) as ArrayRef + )) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(YMIN_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Returns Y minima of a bounding box 2d or 3d or a geometry") + .with_syntax_example("ST_YMin(geometry)") + .with_argument("box", "The geometry or box input") + .with_related_udf("st_xmin") + .with_related_udf("st_ymin") + .with_related_udf("st_zmin") + .with_related_udf("st_xmax") + .with_related_udf("st_ymax") + .with_related_udf("st_zmax") + .build() + .unwrap() + })) + } +} + +#[derive(Debug)] +pub(super) struct XMax { + signature: Signature, +} + +impl XMax { + pub(super) fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static XMAX_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for XMax { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_xmax" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(DataType::Float64) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + let arg = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let mut output_array = Float64Builder::with_capacity(arg.len()); + let rect_array = rect_array_from_array_ref(arg)?; + for rect in rect_array.iter() { + output_array.append_option(rect.map(|r| r.max().x())); + } + Ok(ColumnarValue::from( + Arc::new(output_array.finish()) as ArrayRef + )) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(XMAX_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Returns X maxima of a bounding box 2d or 3d or a geometry") + .with_syntax_example("ST_XMax(geometry)") + .with_argument("box", "The geometry or box input") + .with_related_udf("st_xmin") + .with_related_udf("st_ymin") + .with_related_udf("st_zmin") + .with_related_udf("st_xmax") + .with_related_udf("st_ymax") + .with_related_udf("st_zmax") + .build() + .unwrap() + })) + } +} + +#[derive(Debug)] +pub(super) struct YMax { + signature: Signature, +} + +impl YMax { + pub(super) fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static YMAX_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for YMax { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_ymax" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(DataType::Float64) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + let arg = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let mut output_array = Float64Builder::with_capacity(arg.len()); + let rect_array = rect_array_from_array_ref(arg)?; + for rect in rect_array.iter() { + output_array.append_option(rect.map(|r| r.max().y())); + } + Ok(ColumnarValue::from( + Arc::new(output_array.finish()) as ArrayRef + )) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(YMAX_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Returns Y maxima of a bounding box 2d or 3d or a geometry") + .with_syntax_example("ST_YMax(geometry)") + .with_argument("box", "The geometry or box input") + .with_related_udf("st_xmin") + .with_related_udf("st_ymin") + .with_related_udf("st_zmin") + .with_related_udf("st_xmax") + .with_related_udf("st_ymax") + .with_related_udf("st_zmax") + .build() + .unwrap() + })) + } +} diff --git a/rust/geodatafusion/src/udf/native/bounding_box/mod.rs b/rust/geodatafusion/src/udf/native/bounding_box/mod.rs new file mode 100644 index 00000000..962ec875 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/bounding_box/mod.rs @@ -0,0 +1,13 @@ +mod r#box; +mod extrema; + +use datafusion::prelude::SessionContext; + +/// Register all provided bounding box functions +pub fn register_udfs(ctx: &SessionContext) { + ctx.register_udf(extrema::XMin::new().into()); + ctx.register_udf(extrema::YMin::new().into()); + ctx.register_udf(extrema::XMax::new().into()); + ctx.register_udf(extrema::YMax::new().into()); + ctx.register_udf(r#box::Box2D::new().into()); +} diff --git a/rust/geodatafusion/src/udf/native/constructors/mod.rs b/rust/geodatafusion/src/udf/native/constructors/mod.rs new file mode 100644 index 00000000..7111a584 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/constructors/mod.rs @@ -0,0 +1,9 @@ +mod point; + +use datafusion::prelude::SessionContext; + +/// Register all provided [geo] functions for constructing geometries +pub fn register_udfs(ctx: &SessionContext) { + ctx.register_udf(point::Point::new().into()); + ctx.register_udf(point::MakePoint::new().into()); +} diff --git a/rust/geodatafusion/src/udf/native/constructors/point.rs b/rust/geodatafusion/src/udf/native/constructors/point.rs new file mode 100644 index 00000000..c4e9b560 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/constructors/point.rs @@ -0,0 +1,234 @@ +//! Point constructors + +use std::any::Any; +use std::sync::OnceLock; + +use arrow::array::AsArray; +use arrow::datatypes::Float64Type; +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, TypeSignature, Volatility, +}; +use geo_traits::CoordTrait; +use geoarrow::array::{CoordType, GeometryArray, PointBuilder}; +use geoarrow::datatypes::Dimension; +use geoarrow::ArrayBase; + +use crate::data_types::{POINT2D_TYPE, POINT3D_TYPE}; + +#[derive(Debug)] +pub(super) struct Point { + signature: Signature, +} + +impl Point { + pub fn new() -> Self { + Self { + signature: Signature::exact( + vec![DataType::Float64, DataType::Float64], + Volatility::Immutable, + ), + } + } +} + +static POINT_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Point { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_point" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(POINT2D_TYPE.into()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + let mut args = ColumnarValue::values_to_arrays(args)?.into_iter(); + let x = args.next().unwrap(); + let y = args.next().unwrap(); + + let x = x.as_primitive::(); + let y = y.as_primitive::(); + + let mut builder = PointBuilder::with_capacity_and_options( + Dimension::XY, + x.len(), + CoordType::Separated, + Default::default(), + ); + for (x, y) in x.iter().zip(y.iter()) { + if let (Some(x), Some(y)) = (x, y) { + builder.push_coord(Some(&geo::coord! { x: x, y: y})); + } else { + builder.push_null(); + } + } + + Ok(builder.finish().into_array_ref().into()) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(POINT_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Returns a Point with the given X and Y coordinate values.") + .with_syntax_example("ST_Point(-71.104, 42.315)") + .with_argument("x", "x value") + .with_argument("y", "y value") + .with_related_udf("st_makepoint") + .with_related_udf("st_pointz") + .build() + .unwrap() + })) + } +} + +#[derive(Debug)] +pub(super) struct MakePoint { + signature: Signature, +} + +impl MakePoint { + pub fn new() -> Self { + Self { + signature: Signature::one_of( + vec![ + TypeSignature::Exact(vec![DataType::Float64, DataType::Float64]), + TypeSignature::Exact(vec![ + DataType::Float64, + DataType::Float64, + DataType::Float64, + ]), + ], + Volatility::Immutable, + ), + } + } +} + +static MAKE_POINT_DOC: OnceLock = OnceLock::new(); + +struct PointZ { + x: f64, + y: f64, + z: f64, +} + +impl CoordTrait for PointZ { + type T = f64; + + fn dim(&self) -> geo_traits::Dimensions { + geo_traits::Dimensions::Xyz + } + + fn x(&self) -> Self::T { + self.x + } + + fn y(&self) -> Self::T { + self.y + } + + fn nth_or_panic(&self, n: usize) -> Self::T { + match n { + 0 => self.x, + 1 => self.y, + 2 => self.z, + _ => panic!("invalid dimension index"), + } + } +} + +impl ScalarUDFImpl for MakePoint { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_makepoint" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> datafusion::error::Result { + match arg_types.len() { + 2 => Ok(POINT2D_TYPE.into()), + 3 => Ok(POINT3D_TYPE.into()), + _ => unreachable!(), + } + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + let mut args = ColumnarValue::values_to_arrays(args)?.into_iter(); + let x = args.next().unwrap(); + let y = args.next().unwrap(); + let z = args.next(); + + let x = x.as_primitive::(); + let y = y.as_primitive::(); + + let dim = if z.is_some() { + Dimension::XYZ + } else { + Dimension::XY + }; + let mut builder = PointBuilder::with_capacity_and_options( + dim, + x.len(), + CoordType::Separated, + Default::default(), + ); + + if let Some(z) = z { + let z = z.as_primitive::(); + + for ((x, y), z) in x.iter().zip(y.iter()).zip(z.iter()) { + if let (Some(x), Some(y), Some(z)) = (x, y, z) { + builder.push_coord(Some(&PointZ { x, y, z })); + } else { + builder.push_null(); + } + } + } else { + for (x, y) in x.iter().zip(y.iter()) { + if let (Some(x), Some(y)) = (x, y) { + builder.push_coord(Some(&geo::coord! { x: x, y: y})); + } else { + builder.push_null(); + } + } + } + + Ok(GeometryArray::from(builder.finish()) + .into_array_ref() + .into()) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(MAKE_POINT_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Creates a 2D XY or 3D XYZ Point geometry.") + .with_syntax_example("ST_MakePoint(-71.104, 42.315)") + .with_argument("x", "x value") + .with_argument("y", "y value") + .with_argument("z", "z value") + .with_related_udf("st_point") + .with_related_udf("st_pointz") + .build() + .unwrap() + })) + } +} diff --git a/rust/geodatafusion/src/udf/native/io/mod.rs b/rust/geodatafusion/src/udf/native/io/mod.rs new file mode 100644 index 00000000..341beca7 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/io/mod.rs @@ -0,0 +1,14 @@ +//! Geometry Input and Output + +mod wkb; +mod wkt; + +use datafusion::prelude::SessionContext; + +/// Register all provided functions for geometry input and output +pub fn register_udfs(ctx: &SessionContext) { + ctx.register_udf(wkb::AsBinary::new().into()); + ctx.register_udf(wkb::GeomFromWKB::new().into()); + ctx.register_udf(wkt::AsText::new().into()); + ctx.register_udf(wkt::GeomFromText::new().into()); +} diff --git a/rust/geodatafusion/src/udf/native/io/wkb.rs b/rust/geodatafusion/src/udf/native/io/wkb.rs new file mode 100644 index 00000000..7bda68fa --- /dev/null +++ b/rust/geodatafusion/src/udf/native/io/wkb.rs @@ -0,0 +1,137 @@ +use std::any::Any; +use std::sync::OnceLock; + +use arrow::array::AsArray; +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use geoarrow::array::{CoordType, WKBArray}; +use geoarrow::datatypes::NativeType; +use geoarrow::io::wkb::{from_wkb, to_wkb}; +use geoarrow::ArrayBase; + +use crate::data_types::{any_single_geometry_type_input, parse_to_native_array, GEOMETRY_TYPE}; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct AsBinary { + signature: Signature, +} + +impl AsBinary { + pub fn new() -> Self { + // TODO: extend to allow specifying little/big endian + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static AS_BINARY_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for AsBinary { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_asbinary" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(DataType::Binary) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(as_binary_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(AS_BINARY_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description( + "Returns the OGC/ISO Well-Known Binary (WKB) representation of the geometry.", + ) + .with_argument("g1", "geometry") + .build() + .unwrap() + })) + } +} + +fn as_binary_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let native_array = parse_to_native_array(array)?; + let wkb_arr = to_wkb::(native_array.as_ref()); + Ok(wkb_arr.into_array_ref().into()) +} + +#[derive(Debug)] +pub(super) struct GeomFromWKB { + signature: Signature, +} + +impl GeomFromWKB { + pub fn new() -> Self { + Self { + signature: Signature::coercible(vec![DataType::Binary], Volatility::Immutable), + } + } +} + +static GEOM_FROM_WKB_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for GeomFromWKB { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_geomfromwkb" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(GEOMETRY_TYPE.into()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(geom_from_wkb_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(GEOM_FROM_WKB_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description( + "Takes a well-known binary representation of a geometry and a Spatial Reference System ID (SRID) and creates an instance of the appropriate geometry type", + ) + .with_argument("geom", "WKB buffers") + .build() + .unwrap() + })) + } +} + +fn geom_from_wkb_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let wkb_arr = WKBArray::new(array.as_binary::().clone(), Default::default()); + let native_arr = from_wkb(&wkb_arr, NativeType::Geometry(CoordType::Separated), false)?; + Ok(native_arr.to_array_ref().into()) +} diff --git a/rust/geodatafusion/src/udf/native/io/wkt.rs b/rust/geodatafusion/src/udf/native/io/wkt.rs new file mode 100644 index 00000000..1d46fb85 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/io/wkt.rs @@ -0,0 +1,137 @@ +use std::any::Any; +use std::sync::OnceLock; + +use arrow::array::AsArray; +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ + ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, +}; +use geoarrow::array::{CoordType, WKTArray}; +use geoarrow::io::wkt::{read_wkt, ToWKT}; +use geoarrow::ArrayBase; + +use crate::data_types::{any_single_geometry_type_input, parse_to_native_array, GEOMETRY_TYPE}; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct AsText { + signature: Signature, +} + +impl AsText { + pub fn new() -> Self { + // TODO: extend to allow specifying little/big endian + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static AS_TEXT_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for AsText { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_astext" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(DataType::Utf8) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(as_text_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(AS_TEXT_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description( + "Returns the OGC Well-Known Text (WKT) representation of the geometry/geography.", + ) + .with_argument("g1", "geometry") + .build() + .unwrap() + })) + } +} + +fn as_text_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let native_array = parse_to_native_array(array)?; + let wkt_arr = native_array.as_ref().to_wkt::()?; + Ok(wkt_arr.into_array_ref().into()) +} + +#[derive(Debug)] +pub(super) struct GeomFromText { + signature: Signature, +} + +impl GeomFromText { + pub fn new() -> Self { + // TODO: extend to allow specifying little/big endian + Self { + signature: Signature::coercible(vec![DataType::Utf8], Volatility::Immutable), + } + } +} + +static GEOM_FROM_TEXT_DOC: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for GeomFromText { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_astext" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(GEOMETRY_TYPE.into()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(geom_from_text_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(GEOM_FROM_TEXT_DOC.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description( + "Constructs a geometry object from the OGC Well-Known text representation.", + ) + .with_argument("g1", "geometry") + .build() + .unwrap() + })) + } +} + +fn geom_from_text_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let wkt_arr = WKTArray::new(array.as_string::().clone(), Default::default()); + let native_arr = read_wkt(&wkt_arr, CoordType::Separated, false)?; + Ok(native_arr.to_array_ref().into()) +} diff --git a/rust/geodatafusion/src/udf/native/measurement/area.rs b/rust/geodatafusion/src/udf/native/measurement/area.rs new file mode 100644 index 00000000..0738105c --- /dev/null +++ b/rust/geodatafusion/src/udf/native/measurement/area.rs @@ -0,0 +1,118 @@ +use std::any::Any; +use std::sync::{Arc, OnceLock}; + +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ColumnarValue, Documentation, ScalarUDFImpl, Signature}; +use geoarrow::algorithm::geo::Area as _Area; + +use crate::data_types::{any_single_geometry_type_input, parse_to_native_array}; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct Area { + signature: Signature, +} + +impl Area { + pub fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Area { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_area" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(DataType::Float64) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(area_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description("Returns the area of a polygonal geometry.") + .with_argument("geom", "geometry") + .build() + .unwrap() + })) + } +} + +fn area_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let native_array = parse_to_native_array(array)?; + let area = native_array.as_ref().unsigned_area()?; + Ok(ColumnarValue::Array(Arc::new(area))) +} + +#[cfg(test)] +mod test { + use arrow_array::RecordBatch; + use arrow_schema::Schema; + use datafusion::error::Result; + use datafusion::prelude::SessionContext; + use geoarrow::algorithm::native::Cast; + use geoarrow::array::CoordType; + use geoarrow::datatypes::NativeType; + use geoarrow::io::flatgeobuf::read_flatgeobuf; + use std::fs::File; + use std::sync::Arc; + + use super::*; + + fn load_file() -> RecordBatch { + let mut file = File::open("../../fixtures/flatgeobuf/countries.fgb").unwrap(); + let table = read_flatgeobuf(&mut file, Default::default()).unwrap(); + let geometry = table.geometry_column(None).unwrap(); + let geometry = geometry + .as_ref() + .cast(NativeType::Geometry(CoordType::Separated)) + .unwrap(); + let field = geometry.extension_field(); + let chunk = geometry.array_refs()[0].clone(); + RecordBatch::try_new(Arc::new(Schema::new(vec![field])), vec![chunk]).unwrap() + } + + fn create_context() -> Result { + let ctx = SessionContext::new(); + + let batch = load_file(); + + ctx.register_batch("t", batch).unwrap(); + Ok(ctx) + } + + #[tokio::test] + async fn test() -> Result<()> { + let ctx = create_context()?; + ctx.register_udf(Area::new().into()); + + let sql_df = ctx.sql("SELECT ST_Area(geometry) FROM t;").await?; + // print the results + sql_df.show().await?; + + Ok(()) + } +} diff --git a/rust/geodatafusion/src/udf/native/measurement/mod.rs b/rust/geodatafusion/src/udf/native/measurement/mod.rs new file mode 100644 index 00000000..026bf7ad --- /dev/null +++ b/rust/geodatafusion/src/udf/native/measurement/mod.rs @@ -0,0 +1,8 @@ +mod area; + +use datafusion::prelude::SessionContext; + +/// Register all provided [geo] functions for constructing geometries +pub fn register_udfs(ctx: &SessionContext) { + ctx.register_udf(area::Area::new().into()); +} diff --git a/rust/geodatafusion/src/udf/native/mod.rs b/rust/geodatafusion/src/udf/native/mod.rs new file mode 100644 index 00000000..15ac8b46 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/mod.rs @@ -0,0 +1,20 @@ +//! User-defined functions that wrap native Rust implementations. + +mod accessors; +mod bounding_box; +mod constructors; +mod io; +mod measurement; +mod processing; + +use datafusion::prelude::SessionContext; + +/// Register all provided [geo] functions +pub fn register_geo(ctx: &SessionContext) { + accessors::register_udfs(ctx); + bounding_box::register_udfs(ctx); + constructors::register_udfs(ctx); + io::register_udfs(ctx); + measurement::register_udfs(ctx); + processing::register_udfs(ctx); +} diff --git a/rust/geodatafusion/src/udf/native/processing/centroid.rs b/rust/geodatafusion/src/udf/native/processing/centroid.rs new file mode 100644 index 00000000..f5136bd3 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/processing/centroid.rs @@ -0,0 +1,125 @@ +use std::any::Any; +use std::sync::OnceLock; + +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ColumnarValue, Documentation, ScalarUDFImpl, Signature}; +use geoarrow::algorithm::geo::Centroid as _Centroid; +use geoarrow::array::CoordType; +use geoarrow::ArrayBase; + +use crate::data_types::{any_single_geometry_type_input, parse_to_native_array, POINT2D_TYPE}; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct Centroid { + signature: Signature, +} + +impl Centroid { + pub fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for Centroid { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_centroid" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(POINT2D_TYPE.into()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(centroid_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description( + "Computes a point which is the geometric center of mass of a geometry.", + ) + .with_argument("g1", "geometry") + .build() + .unwrap() + })) + } +} + +fn centroid_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let native_array = parse_to_native_array(array)?; + let output = native_array.as_ref().centroid()?; + Ok(output + .into_coord_type(CoordType::Separated) + .into_array_ref() + .into()) +} + +#[cfg(test)] +mod test { + use arrow_array::RecordBatch; + use arrow_schema::Schema; + use datafusion::error::Result; + use datafusion::prelude::SessionContext; + use geoarrow::algorithm::native::Cast; + use geoarrow::array::CoordType; + use geoarrow::datatypes::NativeType; + use geoarrow::io::flatgeobuf::read_flatgeobuf; + use std::fs::File; + use std::sync::Arc; + + use super::*; + + fn load_file() -> RecordBatch { + let mut file = File::open("../../fixtures/flatgeobuf/countries.fgb").unwrap(); + let table = read_flatgeobuf(&mut file, Default::default()).unwrap(); + let geometry = table.geometry_column(None).unwrap(); + let geometry = geometry + .as_ref() + .cast(NativeType::Geometry(CoordType::Separated)) + .unwrap(); + let field = geometry.extension_field(); + let chunk = geometry.array_refs()[0].clone(); + RecordBatch::try_new(Arc::new(Schema::new(vec![field])), vec![chunk]).unwrap() + } + + fn create_context() -> Result { + let ctx = SessionContext::new(); + + let batch = load_file(); + + ctx.register_batch("t", batch).unwrap(); + Ok(ctx) + } + + #[tokio::test] + async fn test() -> Result<()> { + let ctx = create_context()?; + ctx.register_udf(Centroid::new().into()); + + let sql_df = ctx.sql("SELECT ST_centroid(geometry) FROM t;").await?; + // print the results + sql_df.show().await?; + + Ok(()) + } +} diff --git a/rust/geodatafusion/src/udf/native/processing/convex_hull.rs b/rust/geodatafusion/src/udf/native/processing/convex_hull.rs new file mode 100644 index 00000000..4b66b4c0 --- /dev/null +++ b/rust/geodatafusion/src/udf/native/processing/convex_hull.rs @@ -0,0 +1,75 @@ +use std::any::Any; +use std::sync::OnceLock; + +use arrow_schema::DataType; +use datafusion::logical_expr::scalar_doc_sections::DOC_SECTION_OTHER; +use datafusion::logical_expr::{ColumnarValue, Documentation, ScalarUDFImpl, Signature}; +use geoarrow::algorithm::geo::ConvexHull as _ConvexHull; +use geoarrow::array::{CoordType, GeometryArray}; +use geoarrow::ArrayBase; + +use crate::data_types::{any_single_geometry_type_input, parse_to_native_array, GEOMETRY_TYPE}; +use crate::error::GeoDataFusionResult; + +#[derive(Debug)] +pub(super) struct ConvexHull { + signature: Signature, +} + +impl ConvexHull { + pub fn new() -> Self { + Self { + signature: any_single_geometry_type_input(), + } + } +} + +static DOCUMENTATION: OnceLock = OnceLock::new(); + +impl ScalarUDFImpl for ConvexHull { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "st_convexhull" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, _arg_types: &[DataType]) -> datafusion::error::Result { + Ok(GEOMETRY_TYPE.into()) + } + + fn invoke(&self, args: &[ColumnarValue]) -> datafusion::error::Result { + Ok(convex_hull_impl(args)?) + } + + fn documentation(&self) -> Option<&Documentation> { + Some(DOCUMENTATION.get_or_init(|| { + Documentation::builder() + .with_doc_section(DOC_SECTION_OTHER) + .with_description( + "Computes the convex hull of a geometry. The convex hull is the smallest convex geometry that encloses all geometries in the input.", + ) + .with_argument("g1", "geometry") + .build() + .unwrap() + })) + } +} + +fn convex_hull_impl(args: &[ColumnarValue]) -> GeoDataFusionResult { + let array = ColumnarValue::values_to_arrays(args)? + .into_iter() + .next() + .unwrap(); + let native_array = parse_to_native_array(array)?; + let output = native_array + .as_ref() + .convex_hull()? + .into_coord_type(CoordType::Separated); + Ok(GeometryArray::from(output).into_array_ref().into()) +} diff --git a/rust/geodatafusion/src/udf/native/processing/mod.rs b/rust/geodatafusion/src/udf/native/processing/mod.rs new file mode 100644 index 00000000..e6bbe9cb --- /dev/null +++ b/rust/geodatafusion/src/udf/native/processing/mod.rs @@ -0,0 +1,10 @@ +mod centroid; +mod convex_hull; + +use datafusion::prelude::SessionContext; + +/// Register all provided [geo] functions for processing geometries +pub fn register_udfs(ctx: &SessionContext) { + ctx.register_udf(centroid::Centroid::new().into()); + ctx.register_udf(convex_hull::ConvexHull::new().into()); +}