From 9d8fadcab43e86d2b998e07b3b3dcfe44be1195e Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Fri, 20 Dec 2024 17:12:41 -0500 Subject: [PATCH 1/4] Improve docs for geoarrow::io --- rust/geoarrow/src/io/csv/mod.rs | 8 ++++ rust/geoarrow/src/io/csv/reader.rs | 6 +++ rust/geoarrow/src/io/csv/writer.rs | 4 +- rust/geoarrow/src/io/flatgeobuf/writer.rs | 10 ++--- .../src/io/geozero/table/data_source.rs | 4 +- rust/geoarrow/src/io/ipc/writer.rs | 14 +++---- rust/geoarrow/src/io/mod.rs | 2 +- rust/geoarrow/src/io/stream.rs | 41 +++++++++++-------- 8 files changed, 51 insertions(+), 38 deletions(-) diff --git a/rust/geoarrow/src/io/csv/mod.rs b/rust/geoarrow/src/io/csv/mod.rs index 12deed89..4bf724c6 100644 --- a/rust/geoarrow/src/io/csv/mod.rs +++ b/rust/geoarrow/src/io/csv/mod.rs @@ -1,5 +1,13 @@ //! Read from and write to CSV files. //! +//! The CSV reader implements [`RecordBatchReader`], so you can iterate over the batches of the CSV +//! without materializing the entire file in memory. +//! +//! [`RecordBatchReader`]: arrow_array::RecordBatchReader +//! +//! Additionally, the CSV writer takes in a [`RecordBatchReader`], so you can write an Arrow +//! iterator to CSV without materializing all batches in memory at once. +//! //! # Examples //! //! ``` diff --git a/rust/geoarrow/src/io/csv/reader.rs b/rust/geoarrow/src/io/csv/reader.rs index f045bea4..9720cd96 100644 --- a/rust/geoarrow/src/io/csv/reader.rs +++ b/rust/geoarrow/src/io/csv/reader.rs @@ -30,6 +30,11 @@ pub struct CSVReaderOptions { /// When `true`, the first row of the CSV file is treated as a header row pub has_header: Option, + /// The maximum number of records to read for schema inference. + /// + /// See [`arrow_csv::reader::Format::infer_schema`]. + /// + /// **By default, all rows are read to infer the CSV schema.** pub max_records: Option, /// Specify a custom delimiter character, defaults to comma `','` @@ -119,6 +124,7 @@ pub struct CSVReader { } impl CSVReader { + /// Access the schema of this reader pub fn schema(&self) -> SchemaRef { self.output_schema.clone() } diff --git a/rust/geoarrow/src/io/csv/writer.rs b/rust/geoarrow/src/io/csv/writer.rs index 36eeacfc..2599abd1 100644 --- a/rust/geoarrow/src/io/csv/writer.rs +++ b/rust/geoarrow/src/io/csv/writer.rs @@ -12,8 +12,8 @@ use std::sync::Arc; /// Write a Table to CSV pub fn write_csv>(stream: S, writer: W) -> Result<()> { - let mut stream: RecordBatchReader = stream.into(); - let reader = stream.take().unwrap(); + let stream: RecordBatchReader = stream.into(); + let reader = stream.into_inner(); let mut csv_writer = arrow_csv::Writer::new(writer); for batch in reader { diff --git a/rust/geoarrow/src/io/flatgeobuf/writer.rs b/rust/geoarrow/src/io/flatgeobuf/writer.rs index c9916775..3a69c80a 100644 --- a/rust/geoarrow/src/io/flatgeobuf/writer.rs +++ b/rust/geoarrow/src/io/flatgeobuf/writer.rs @@ -20,11 +20,11 @@ pub struct FlatGeobufWriterOptions { pub detect_type: bool, /// Convert single to multi geometries, if `geometry_type` is multi type or Unknown pub promote_to_multi: bool, - // Dataset title + /// Dataset title pub title: Option, - // Dataset description (intended for free form long text) + /// Dataset description (intended for free form long text) pub description: Option, - // Dataset metadata (intended to be application specific and + /// Dataset metadata (intended to be application specific and pub metadata: Option, /// A method for transforming CRS to WKT /// @@ -119,7 +119,7 @@ pub fn write_flatgeobuf_with_options>( ) -> Result<()> { let mut stream: RecordBatchReader = stream.into(); - let schema = stream.schema()?; + let schema = stream.schema(); let fields = &schema.fields; let geom_col_idxs = schema.as_ref().geometry_columns(); if geom_col_idxs.len() != 1 { @@ -133,7 +133,7 @@ pub fn write_flatgeobuf_with_options>( let wkt_crs_str = options.create_wkt_crs(&array_meta)?; let fgb_options = options.create_fgb_options(geo_data_type, wkt_crs_str.as_deref()); - let geometry_type = infer_flatgeobuf_geometry_type(stream.schema()?.as_ref())?; + let geometry_type = infer_flatgeobuf_geometry_type(stream.schema().as_ref())?; let mut fgb = FgbWriter::create_with_options(name, geometry_type, fgb_options)?; stream.process(&mut fgb)?; diff --git a/rust/geoarrow/src/io/geozero/table/data_source.rs b/rust/geoarrow/src/io/geozero/table/data_source.rs index 33508711..48d7a137 100644 --- a/rust/geoarrow/src/io/geozero/table/data_source.rs +++ b/rust/geoarrow/src/io/geozero/table/data_source.rs @@ -25,9 +25,7 @@ use geozero::{ColumnValue, FeatureProcessor, GeomProcessor, GeozeroDatasource, P impl GeozeroDatasource for RecordBatchReader { fn process(&mut self, processor: &mut P) -> Result<(), GeozeroError> { - let reader = self.take().ok_or(GeozeroError::Dataset( - "Cannot read from closed RecordBatchReader".to_string(), - ))?; + let reader = self.inner_mut(); let schema = reader.schema(); let geom_indices = schema.as_ref().geometry_columns(); let geometry_column_index = if geom_indices.len() != 1 { diff --git a/rust/geoarrow/src/io/ipc/writer.rs b/rust/geoarrow/src/io/ipc/writer.rs index da9bf11e..5ac02f20 100644 --- a/rust/geoarrow/src/io/ipc/writer.rs +++ b/rust/geoarrow/src/io/ipc/writer.rs @@ -2,15 +2,13 @@ use std::io::Write; use arrow_ipc::writer::{FileWriter, StreamWriter}; -use crate::error::{GeoArrowError, Result}; +use crate::error::Result; use crate::io::stream::RecordBatchReader; /// Write a Table to an Arrow IPC (Feather v2) file pub fn write_ipc>(stream: S, writer: W) -> Result<()> { - let inner = stream - .into() - .take() - .ok_or(GeoArrowError::General("Closed stream".to_string()))?; + let inner: RecordBatchReader = stream.into(); + let inner = inner.into_inner(); let schema = inner.schema(); let mut writer = FileWriter::try_new(writer, &schema)?; @@ -23,10 +21,8 @@ pub fn write_ipc>(stream: S, writer: W) -> /// Write a Table to an Arrow IPC stream pub fn write_ipc_stream>(stream: S, writer: W) -> Result<()> { - let inner = stream - .into() - .take() - .ok_or(GeoArrowError::General("Closed stream".to_string()))?; + let inner: RecordBatchReader = stream.into(); + let inner = inner.into_inner(); let schema = inner.schema(); let mut writer = StreamWriter::try_new(writer, &schema)?; diff --git a/rust/geoarrow/src/io/mod.rs b/rust/geoarrow/src/io/mod.rs index a3ee3557..1c12a9a8 100644 --- a/rust/geoarrow/src/io/mod.rs +++ b/rust/geoarrow/src/io/mod.rs @@ -1,7 +1,7 @@ //! Reader and writer implementations of many common geospatial file formats, including //! interoperability with the `geozero` crate. -#![allow(missing_docs)] // FIXME +// #![allow(missing_docs)] // FIXME pub mod crs; #[cfg(feature = "csv")] diff --git a/rust/geoarrow/src/io/stream.rs b/rust/geoarrow/src/io/stream.rs index 786e64de..22ff57f8 100644 --- a/rust/geoarrow/src/io/stream.rs +++ b/rust/geoarrow/src/io/stream.rs @@ -5,33 +5,40 @@ use arrow_schema::SchemaRef; /// A newtype wrapper around an [`arrow_array::RecordBatchReader`] so that we can implement the /// [`geozero::GeozeroDatasource`] trait on it. -pub struct RecordBatchReader(Option>); +/// +/// This allows for exporting Arrow data to a geozero-based consumer even when not all of the Arrow +/// data is present in memory at once. +pub struct RecordBatchReader(Box); impl RecordBatchReader { + /// Create a new RecordBatchReader from an [`arrow_array::RecordBatchReader`]. pub fn new(reader: Box) -> Self { - Self(Some(reader)) + Self(reader) } - pub fn schema(&self) -> Result { - let reader = self - .0 - .as_ref() - .ok_or(GeoArrowError::General("Closed stream".to_string()))?; - Ok(reader.schema()) + /// Access the schema of this reader. + pub fn schema(&self) -> SchemaRef { + self.0.schema() } - pub fn take(&mut self) -> Option> { - self.0.take() + /// Access a mutable reference to the underlying [`arrow_array::RecordBatchReader`]. + pub fn inner_mut(&mut self) -> &mut Box { + &mut self.0 + } + + /// Access the underlying [`arrow_array::RecordBatchReader`]. + pub fn into_inner(self) -> Box { + self.0 } } impl From for RecordBatchReader { fn from(value: Table) -> Self { let (batches, schema) = value.into_inner(); - Self(Some(Box::new(RecordBatchIterator::new( + Self(Box::new(RecordBatchIterator::new( batches.into_iter().map(Ok), schema, - )))) + ))) } } @@ -44,10 +51,8 @@ impl From<&Table> for RecordBatchReader { impl TryFrom for Table { type Error = GeoArrowError; - fn try_from(mut value: RecordBatchReader) -> Result { - let reader = value - .take() - .ok_or(GeoArrowError::General("Closed stream".to_string()))?; + fn try_from(value: RecordBatchReader) -> Result { + let reader = value.0; let schema = reader.schema(); Table::try_new(reader.collect::>()?, schema) } @@ -55,12 +60,12 @@ impl TryFrom for Table { impl From> for RecordBatchReader { fn from(value: Box) -> Self { - Self(Some(value)) + Self(value) } } impl From> for RecordBatchReader { fn from(value: Box) -> Self { - Self(Some(value)) + Self(value) } } From 358b11544dbce5f8494b8e16bf9c3cb2e83aa8cf Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Fri, 20 Dec 2024 17:33:24 -0500 Subject: [PATCH 2/4] Flesh out io docs --- .../src/io/flatgeobuf/reader/async.rs | 1 + rust/geoarrow/src/io/geos/array/binary.rs | 3 +- rust/geoarrow/src/io/geos/array/linestring.rs | 6 ++- .../src/io/geos/array/multilinestring.rs | 6 ++- rust/geoarrow/src/io/geos/array/multipoint.rs | 6 ++- .../src/io/geos/array/multipolygon.rs | 6 ++- rust/geoarrow/src/io/geos/array/point.rs | 6 ++- rust/geoarrow/src/io/geos/array/polygon.rs | 6 ++- .../geoarrow/src/io/geos/scalar/linestring.rs | 4 +- .../src/io/geozero/scalar/geometry.rs | 2 + rust/geoarrow/src/io/postgis/reader.rs | 1 + rust/geoarrow/src/io/shapefile/reader.rs | 1 + rust/geoarrow/src/io/wkb/api.rs | 43 ++++++++++++------- rust/geoarrow/src/io/wkb/writer/mod.rs | 1 + rust/geoarrow/src/io/wkb/writer/rect.rs | 42 ++++++++++++++++++ 15 files changed, 104 insertions(+), 30 deletions(-) create mode 100644 rust/geoarrow/src/io/wkb/writer/rect.rs diff --git a/rust/geoarrow/src/io/flatgeobuf/reader/async.rs b/rust/geoarrow/src/io/flatgeobuf/reader/async.rs index 71434433..ac4733d4 100644 --- a/rust/geoarrow/src/io/flatgeobuf/reader/async.rs +++ b/rust/geoarrow/src/io/flatgeobuf/reader/async.rs @@ -16,6 +16,7 @@ use crate::io::geozero::array::MixedGeometryStreamBuilder; use crate::io::geozero::table::{GeoTableBuilder, GeoTableBuilderOptions}; use crate::table::Table; +/// Read a FlatGeobuf file to a Table asynchronously from object storage. pub async fn read_flatgeobuf_async( reader: Arc, location: Path, diff --git a/rust/geoarrow/src/io/geos/array/binary.rs b/rust/geoarrow/src/io/geos/array/binary.rs index 8f77d21f..8b8efe1f 100644 --- a/rust/geoarrow/src/io/geos/array/binary.rs +++ b/rust/geoarrow/src/io/geos/array/binary.rs @@ -6,7 +6,8 @@ use crate::array::WKBArray; use crate::error::Result; impl WKBArray { - pub fn from_geos(value: Vec>) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>) -> Result { let mut builder = GenericBinaryBuilder::new(); for maybe_geom in value { if let Some(geom) = maybe_geom { diff --git a/rust/geoarrow/src/io/geos/array/linestring.rs b/rust/geoarrow/src/io/geos/array/linestring.rs index 9a2e899c..994b75f5 100644 --- a/rust/geoarrow/src/io/geos/array/linestring.rs +++ b/rust/geoarrow/src/io/geos/array/linestring.rs @@ -4,7 +4,8 @@ use crate::error::Result; use crate::io::geos::scalar::GEOSLineString; impl LineStringBuilder { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { // TODO: don't use new_unchecked let geos_objects: Vec> = value .into_iter() @@ -15,7 +16,8 @@ impl LineStringBuilder { } impl LineStringArray { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { let mutable_arr = LineStringBuilder::from_geos(value, dim)?; Ok(mutable_arr.into()) } diff --git a/rust/geoarrow/src/io/geos/array/multilinestring.rs b/rust/geoarrow/src/io/geos/array/multilinestring.rs index d5eda1a8..939f8be2 100644 --- a/rust/geoarrow/src/io/geos/array/multilinestring.rs +++ b/rust/geoarrow/src/io/geos/array/multilinestring.rs @@ -4,7 +4,8 @@ use crate::error::Result; use crate::io::geos::scalar::GEOSMultiLineString; impl MultiLineStringBuilder { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { // TODO: don't use new_unchecked let geos_objects: Vec> = value .into_iter() @@ -15,7 +16,8 @@ impl MultiLineStringBuilder { } impl MultiLineStringArray { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { let mutable_arr = MultiLineStringBuilder::from_geos(value, dim)?; Ok(mutable_arr.into()) } diff --git a/rust/geoarrow/src/io/geos/array/multipoint.rs b/rust/geoarrow/src/io/geos/array/multipoint.rs index 58a7ad4a..f2f2ffc5 100644 --- a/rust/geoarrow/src/io/geos/array/multipoint.rs +++ b/rust/geoarrow/src/io/geos/array/multipoint.rs @@ -4,7 +4,8 @@ use crate::error::Result; use crate::io::geos::scalar::GEOSMultiPoint; impl MultiPointBuilder { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { // TODO: don't use new_unchecked let geos_objects: Vec> = value .into_iter() @@ -15,7 +16,8 @@ impl MultiPointBuilder { } impl MultiPointArray { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { let mutable_arr = MultiPointBuilder::from_geos(value, dim)?; Ok(mutable_arr.into()) } diff --git a/rust/geoarrow/src/io/geos/array/multipolygon.rs b/rust/geoarrow/src/io/geos/array/multipolygon.rs index 5ddd91a4..3364a7b3 100644 --- a/rust/geoarrow/src/io/geos/array/multipolygon.rs +++ b/rust/geoarrow/src/io/geos/array/multipolygon.rs @@ -4,7 +4,8 @@ use crate::error::Result; use crate::io::geos::scalar::GEOSMultiPolygon; impl MultiPolygonBuilder { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { // TODO: don't use new_unchecked let geos_objects: Vec> = value .into_iter() @@ -15,7 +16,8 @@ impl MultiPolygonBuilder { } impl MultiPolygonArray { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { let mutable_arr = MultiPolygonBuilder::from_geos(value, dim)?; Ok(mutable_arr.into()) } diff --git a/rust/geoarrow/src/io/geos/array/point.rs b/rust/geoarrow/src/io/geos/array/point.rs index d70558e2..bd6775e7 100644 --- a/rust/geoarrow/src/io/geos/array/point.rs +++ b/rust/geoarrow/src/io/geos/array/point.rs @@ -4,7 +4,8 @@ use crate::error::Result; use crate::io::geos::scalar::GEOSPoint; impl PointBuilder { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { // TODO: don't use new_unchecked let geos_linestring_objects: Vec> = value .into_iter() @@ -15,7 +16,8 @@ impl PointBuilder { } impl PointArray { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { let mutable_arr = PointBuilder::from_geos(value, dim)?; Ok(mutable_arr.into()) } diff --git a/rust/geoarrow/src/io/geos/array/polygon.rs b/rust/geoarrow/src/io/geos/array/polygon.rs index afc83b6d..2b1079a2 100644 --- a/rust/geoarrow/src/io/geos/array/polygon.rs +++ b/rust/geoarrow/src/io/geos/array/polygon.rs @@ -4,7 +4,8 @@ use crate::error::Result; use crate::io::geos::scalar::GEOSPolygon; impl PolygonBuilder { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { // TODO: don't use new_unchecked let geos_objects: Vec> = value .into_iter() @@ -16,7 +17,8 @@ impl PolygonBuilder { } impl PolygonArray { - pub fn from_geos(value: Vec>, dim: Dimension) -> Result { + #[allow(dead_code)] + pub(crate) fn from_geos(value: Vec>, dim: Dimension) -> Result { let mutable_arr = PolygonBuilder::from_geos(value, dim)?; Ok(mutable_arr.into()) } diff --git a/rust/geoarrow/src/io/geos/scalar/linestring.rs b/rust/geoarrow/src/io/geos/scalar/linestring.rs index 5ef8897f..0c8b49e1 100644 --- a/rust/geoarrow/src/io/geos/scalar/linestring.rs +++ b/rust/geoarrow/src/io/geos/scalar/linestring.rs @@ -18,7 +18,9 @@ impl<'a> TryFrom<&'a LineString<'_>> for geos::Geometry { } impl LineString<'_> { - pub fn to_geos_linear_ring(&self) -> std::result::Result { + /// Convert to a GEOS LinearRing + #[allow(dead_code)] + pub(crate) fn to_geos_linear_ring(&self) -> std::result::Result { let (start, end) = self.geom_offsets.start_end(self.geom_index); let sliced_coords = self.coords.clone().slice(start, end - start); diff --git a/rust/geoarrow/src/io/geozero/scalar/geometry.rs b/rust/geoarrow/src/io/geozero/scalar/geometry.rs index 60e9b3ed..a34f89df 100644 --- a/rust/geoarrow/src/io/geozero/scalar/geometry.rs +++ b/rust/geoarrow/src/io/geozero/scalar/geometry.rs @@ -45,7 +45,9 @@ impl GeozeroGeometry for Geometry<'_> { } } +/// Convert a geozero scalar data source to an [OwnedGeometry]. pub trait ToGeometry { + /// Convert a geozero scalar data source to an [OwnedGeometry]. fn to_geometry(&self, dim: Dimension) -> geozero::error::Result; } diff --git a/rust/geoarrow/src/io/postgis/reader.rs b/rust/geoarrow/src/io/postgis/reader.rs index 9cd20982..d04634e2 100644 --- a/rust/geoarrow/src/io/postgis/reader.rs +++ b/rust/geoarrow/src/io/postgis/reader.rs @@ -167,6 +167,7 @@ impl GeoTableBuilder { } } +/// Execute a SQL string against a PostGIS database, returning the result as an Arrow table. pub async fn read_postgis<'c, E: Executor<'c, Database = Postgres>>( executor: E, sql: &str, diff --git a/rust/geoarrow/src/io/shapefile/reader.rs b/rust/geoarrow/src/io/shapefile/reader.rs index 1aa5174a..53f20aab 100644 --- a/rust/geoarrow/src/io/shapefile/reader.rs +++ b/rust/geoarrow/src/io/shapefile/reader.rs @@ -33,6 +33,7 @@ pub struct ShapefileReaderOptions { // TODO: // stretch goal: return a record batch reader. +/// Read a Shapefile into a [Table]. pub fn read_shapefile( shp_reader: T, dbf_reader: T, diff --git a/rust/geoarrow/src/io/wkb/api.rs b/rust/geoarrow/src/io/wkb/api.rs index a7691e86..ea8d7c13 100644 --- a/rust/geoarrow/src/io/wkb/api.rs +++ b/rust/geoarrow/src/io/wkb/api.rs @@ -18,8 +18,10 @@ use arrow_array::OffsetSizeTrait; /// determine the exact buffer sizes, then making a single set of allocations and filling those new /// arrays with the WKB coordinate values. pub trait FromWKB: Sized { + /// The input array type. Either [`WKBArray`] or [`ChunkedWKBArray`] type Input; + /// Parse the WKB input. fn from_wkb( arr: &Self::Input, coord_type: CoordType, @@ -100,6 +102,20 @@ impl FromWKB for GeometryCollectionArray { } } +impl FromWKB for GeometryArray { + type Input = WKBArray; + + fn from_wkb( + arr: &WKBArray, + coord_type: CoordType, + _dim: Dimension, + ) -> Result { + let wkb_objects: Vec>> = arr.iter().collect(); + let builder = GeometryBuilder::from_wkb(&wkb_objects, coord_type, arr.metadata(), true)?; + Ok(builder.finish()) + } +} + impl FromWKB for Arc { type Input = WKBArray; @@ -108,15 +124,7 @@ impl FromWKB for Arc { coord_type: CoordType, dim: Dimension, ) -> Result { - let wkb_objects: Vec>> = arr.iter().collect(); - let builder = GeometryCollectionBuilder::from_wkb( - &wkb_objects, - dim, - coord_type, - arr.metadata(), - true, - )?; - builder.finish().downcast() + Ok(Arc::new(GeometryArray::from_wkb(arr, coord_type, dim)?)) } } @@ -175,15 +183,17 @@ impl FromWKB for Arc { /// /// This supports either ISO or EWKB-flavored data. /// -/// Does not downcast automatically +/// The returned array is guaranteed to have exactly the type of `target_type`. +/// +/// `NativeType::Rect` is currently not allowed. pub fn from_wkb( arr: &WKBArray, - target_geo_data_type: NativeType, + target_type: NativeType, prefer_multi: bool, ) -> Result> { use NativeType::*; let wkb_objects: Vec>> = arr.iter().collect(); - match target_geo_data_type { + match target_type { Point(coord_type, dim) => { let builder = PointBuilder::from_wkb(&wkb_objects, dim, coord_type, arr.metadata())?; Ok(Arc::new(builder.finish())) @@ -224,7 +234,7 @@ pub fn from_wkb( } Rect(_) => Err(GeoArrowError::General(format!( "Unexpected data type {:?}", - target_geo_data_type, + target_type, ))), Geometry(coord_type) => { let builder = @@ -240,8 +250,10 @@ pub fn from_wkb( /// determine the exact buffer sizes, then making a single set of allocations and filling those new /// arrays with the WKB coordinate values. pub trait ToWKB: Sized { + /// The output type, either [WKBArray] or [ChunkedWKBArray] type Output; + /// Encode as WKB fn to_wkb(&self) -> Self::Output; } @@ -259,8 +271,7 @@ impl ToWKB for &dyn NativeArray { MultiLineString(_, _) => self.as_multi_line_string().into(), MultiPolygon(_, _) => self.as_multi_polygon().into(), GeometryCollection(_, _) => self.as_geometry_collection().into(), - - Rect(_) => todo!(), + Rect(_) => self.as_rect().into(), Geometry(_) => self.as_geometry().into(), } } @@ -308,7 +319,7 @@ pub fn to_wkb(arr: &dyn NativeArray) -> WKBArray { MultiLineString(_, _) => arr.as_multi_line_string().into(), MultiPolygon(_, _) => arr.as_multi_polygon().into(), GeometryCollection(_, _) => arr.as_geometry_collection().into(), - Rect(_) => todo!(), + Rect(_) => arr.as_rect().into(), Geometry(_) => arr.as_geometry().into(), } } diff --git a/rust/geoarrow/src/io/wkb/writer/mod.rs b/rust/geoarrow/src/io/wkb/writer/mod.rs index cb6e07c1..858b31c5 100644 --- a/rust/geoarrow/src/io/wkb/writer/mod.rs +++ b/rust/geoarrow/src/io/wkb/writer/mod.rs @@ -6,3 +6,4 @@ mod multipoint; mod multipolygon; mod point; mod polygon; +mod rect; diff --git a/rust/geoarrow/src/io/wkb/writer/rect.rs b/rust/geoarrow/src/io/wkb/writer/rect.rs new file mode 100644 index 00000000..f16ad8e2 --- /dev/null +++ b/rust/geoarrow/src/io/wkb/writer/rect.rs @@ -0,0 +1,42 @@ +use crate::array::offset_builder::OffsetsBuilder; +use crate::array::{RectArray, WKBArray}; +use crate::trait_::ArrayAccessor; +use crate::ArrayBase; +use arrow_array::{GenericBinaryArray, OffsetSizeTrait}; +use arrow_buffer::Buffer; +use std::io::Cursor; +use wkb::writer::{rect_wkb_size, write_rect}; +use wkb::Endianness; + +impl From<&RectArray> for WKBArray { + fn from(value: &RectArray) -> Self { + let mut offsets: OffsetsBuilder = OffsetsBuilder::with_capacity(value.len()); + + // First pass: calculate binary array offsets + for maybe_geom in value.iter() { + if let Some(geom) = maybe_geom { + offsets.try_push_usize(rect_wkb_size(&geom)).unwrap(); + } else { + offsets.extend_constant(1); + } + } + + let values = { + let values = Vec::with_capacity(offsets.last().to_usize().unwrap()); + let mut writer = Cursor::new(values); + + for geom in value.iter().flatten() { + write_rect(&mut writer, &geom, Endianness::LittleEndian).unwrap(); + } + + writer.into_inner() + }; + + let binary_arr = GenericBinaryArray::new( + offsets.into(), + Buffer::from_vec(values), + value.nulls().cloned(), + ); + WKBArray::new(binary_arr, value.metadata()) + } +} From 19d8715cd00d9e5d69a977270f9e2255feff597c Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Fri, 20 Dec 2024 17:34:35 -0500 Subject: [PATCH 3/4] remove comment --- rust/geoarrow/src/io/mod.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/rust/geoarrow/src/io/mod.rs b/rust/geoarrow/src/io/mod.rs index 1c12a9a8..efe12a4b 100644 --- a/rust/geoarrow/src/io/mod.rs +++ b/rust/geoarrow/src/io/mod.rs @@ -1,8 +1,6 @@ //! Reader and writer implementations of many common geospatial file formats, including //! interoperability with the `geozero` crate. -// #![allow(missing_docs)] // FIXME - pub mod crs; #[cfg(feature = "csv")] pub mod csv; From 910d92b63ae8f34ea1a0560c2d4b5e04587758fe Mon Sep 17 00:00:00 2001 From: Kyle Barron Date: Fri, 20 Dec 2024 17:54:49 -0500 Subject: [PATCH 4/4] Fix GeometryArray FFI --- python/tests/interop/test_wkb.py | 7 +- rust/geoarrow/src/array/geometry/array.rs | 18 ++--- rust/geoarrow/src/datatypes.rs | 96 +++++++++++++++-------- rust/geoarrow/src/io/wkb/api.rs | 6 +- 4 files changed, 75 insertions(+), 52 deletions(-) diff --git a/python/tests/interop/test_wkb.py b/python/tests/interop/test_wkb.py index 41ddaa43..5b3de336 100644 --- a/python/tests/interop/test_wkb.py +++ b/python/tests/interop/test_wkb.py @@ -24,10 +24,9 @@ def test_geometry_collection(): retour = to_wkb(parsed_geoarrow) retour_shapely = shapely.from_wkb(retour[0].as_py()) - # Need to unpack the geoms because they're returned as multi-geoms - assert retour_shapely.geoms[0].geoms[0] == point - assert retour_shapely.geoms[1].geoms[0] == point2 - assert retour_shapely.geoms[2].geoms[0] == line_string + assert retour_shapely.geoms[0] == point + assert retour_shapely.geoms[1] == point2 + assert retour_shapely.geoms[2] == line_string def test_ewkb_srid(): diff --git a/rust/geoarrow/src/array/geometry/array.rs b/rust/geoarrow/src/array/geometry/array.rs index 0820a3f4..a10b85b3 100644 --- a/rust/geoarrow/src/array/geometry/array.rs +++ b/rust/geoarrow/src/array/geometry/array.rs @@ -676,18 +676,14 @@ impl NativeGeometryAccessor for GeometryArray { 4 => Geometry::MultiPoint(self.mpoint_xy.value(offset)), 5 => Geometry::MultiLineString(self.mline_string_xy.value(offset)), 6 => Geometry::MultiPolygon(self.mpolygon_xy.value(offset)), - 7 => { - panic!("nested geometry collections not supported") - } + 7 => Geometry::GeometryCollection(self.gc_xy.value(offset)), 11 => Geometry::Point(self.point_xyz.value(offset)), 12 => Geometry::LineString(self.line_string_xyz.value(offset)), 13 => Geometry::Polygon(self.polygon_xyz.value(offset)), 14 => Geometry::MultiPoint(self.mpoint_xyz.value(offset)), 15 => Geometry::MultiLineString(self.mline_string_xyz.value(offset)), 16 => Geometry::MultiPolygon(self.mpolygon_xyz.value(offset)), - 17 => { - panic!("nested geometry collections not supported") - } + 17 => Geometry::GeometryCollection(self.gc_xyz.value(offset)), _ => panic!("unknown type_id {}", type_id), } } @@ -719,18 +715,14 @@ impl<'a> ArrayAccessor<'a> for GeometryArray { 4 => Geometry::MultiPoint(self.mpoint_xy.value(offset)), 5 => Geometry::MultiLineString(self.mline_string_xy.value(offset)), 6 => Geometry::MultiPolygon(self.mpolygon_xy.value(offset)), - 7 => { - panic!("nested geometry collections not supported") - } + 7 => Geometry::GeometryCollection(self.gc_xy.value(offset)), 11 => Geometry::Point(self.point_xyz.value(offset)), 12 => Geometry::LineString(self.line_string_xyz.value(offset)), 13 => Geometry::Polygon(self.polygon_xyz.value(offset)), 14 => Geometry::MultiPoint(self.mpoint_xyz.value(offset)), 15 => Geometry::MultiLineString(self.mline_string_xyz.value(offset)), 16 => Geometry::MultiPolygon(self.mpolygon_xyz.value(offset)), - 17 => { - panic!("nested geometry collections not supported") - } + 17 => Geometry::GeometryCollection(self.gc_xyz.value(offset)), _ => panic!("unknown type_id {}", type_id), } } @@ -752,12 +744,14 @@ impl IntoArrow for GeometryArray { self.mpoint_xy.into_array_ref(), self.mline_string_xy.into_array_ref(), self.mpolygon_xy.into_array_ref(), + self.gc_xy.into_array_ref(), self.point_xyz.into_array_ref(), self.line_string_xyz.into_array_ref(), self.polygon_xyz.into_array_ref(), self.mpoint_xyz.into_array_ref(), self.mline_string_xyz.into_array_ref(), self.mpolygon_xyz.into_array_ref(), + self.gc_xyz.into_array_ref(), ]; UnionArray::try_new( diff --git a/rust/geoarrow/src/datatypes.rs b/rust/geoarrow/src/datatypes.rs index 4258ec08..2db35667 100644 --- a/rust/geoarrow/src/datatypes.rs +++ b/rust/geoarrow/src/datatypes.rs @@ -333,9 +333,9 @@ fn rect_data_type(dim: Dimension) -> DataType { DataType::Struct(rect_fields(dim)) } -fn unknown_data_type(coord_type: CoordType) -> DataType { +fn geometry_data_type(coord_type: CoordType) -> DataType { let mut fields = vec![]; - let type_ids = vec![1, 2, 3, 4, 5, 6, 11, 12, 13, 14, 15, 16]; + let type_ids = vec![1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14, 15, 16, 17]; // Note: we manually construct the fields because these fields shouldn't have their own // GeoArrow extension metadata @@ -344,42 +344,72 @@ fn unknown_data_type(coord_type: CoordType) -> DataType { NativeType::Point(coord_type, Dimension::XY).to_data_type(), true, )); - - let linestring = NativeType::LineString(coord_type, Dimension::XY); - fields.push(Field::new("", linestring.to_data_type(), true)); - - let polygon = NativeType::Polygon(coord_type, Dimension::XY); - fields.push(Field::new("", polygon.to_data_type(), true)); - - let multi_point = NativeType::MultiPoint(coord_type, Dimension::XY); - fields.push(Field::new("", multi_point.to_data_type(), true)); - - let multi_line_string = NativeType::MultiLineString(coord_type, Dimension::XY); - fields.push(Field::new("", multi_line_string.to_data_type(), true)); - - let multi_polygon = NativeType::MultiPolygon(coord_type, Dimension::XY); - fields.push(Field::new("", multi_polygon.to_data_type(), true)); + fields.push(Field::new( + "", + NativeType::LineString(coord_type, Dimension::XY).to_data_type(), + true, + )); + fields.push(Field::new( + "", + NativeType::Polygon(coord_type, Dimension::XY).to_data_type(), + true, + )); + fields.push(Field::new( + "", + NativeType::MultiPoint(coord_type, Dimension::XY).to_data_type(), + true, + )); + fields.push(Field::new( + "", + NativeType::MultiLineString(coord_type, Dimension::XY).to_data_type(), + true, + )); + fields.push(Field::new( + "", + NativeType::MultiPolygon(coord_type, Dimension::XY).to_data_type(), + true, + )); + fields.push(Field::new( + "", + NativeType::GeometryCollection(coord_type, Dimension::XY).to_data_type(), + true, + )); fields.push(Field::new( "", NativeType::Point(coord_type, Dimension::XYZ).to_data_type(), true, )); - - let linestring = NativeType::LineString(coord_type, Dimension::XYZ); - fields.push(Field::new("", linestring.to_data_type(), true)); - - let polygon = NativeType::Polygon(coord_type, Dimension::XYZ); - fields.push(Field::new("", polygon.to_data_type(), true)); - - let multi_point = NativeType::MultiPoint(coord_type, Dimension::XYZ); - fields.push(Field::new("", multi_point.to_data_type(), true)); - - let multi_line_string = NativeType::MultiLineString(coord_type, Dimension::XYZ); - fields.push(Field::new("", multi_line_string.to_data_type(), true)); - - let multi_polygon = NativeType::MultiPolygon(coord_type, Dimension::XYZ); - fields.push(Field::new("", multi_polygon.to_data_type(), true)); + fields.push(Field::new( + "", + NativeType::LineString(coord_type, Dimension::XYZ).to_data_type(), + true, + )); + fields.push(Field::new( + "", + NativeType::Polygon(coord_type, Dimension::XYZ).to_data_type(), + true, + )); + fields.push(Field::new( + "", + NativeType::MultiPoint(coord_type, Dimension::XYZ).to_data_type(), + true, + )); + fields.push(Field::new( + "", + NativeType::MultiLineString(coord_type, Dimension::XYZ).to_data_type(), + true, + )); + fields.push(Field::new( + "", + NativeType::MultiPolygon(coord_type, Dimension::XYZ).to_data_type(), + true, + )); + fields.push(Field::new( + "", + NativeType::GeometryCollection(coord_type, Dimension::XYZ).to_data_type(), + true, + )); let union_fields = UnionFields::new(type_ids, fields); DataType::Union(union_fields, UnionMode::Dense) @@ -445,7 +475,7 @@ impl NativeType { MultiPolygon(coord_type, dim) => multi_polygon_data_type(*coord_type, *dim), GeometryCollection(coord_type, dim) => geometry_collection_data_type(*coord_type, *dim), Rect(dim) => rect_data_type(*dim), - Geometry(coord_type) => unknown_data_type(*coord_type), + Geometry(coord_type) => geometry_data_type(*coord_type), } } diff --git a/rust/geoarrow/src/io/wkb/api.rs b/rust/geoarrow/src/io/wkb/api.rs index ea8d7c13..9ccc483f 100644 --- a/rust/geoarrow/src/io/wkb/api.rs +++ b/rust/geoarrow/src/io/wkb/api.rs @@ -77,7 +77,7 @@ impl FromWKB for MixedGeometryArray { ) -> Result { let wkb_objects: Vec>> = arr.iter().collect(); let builder = - MixedGeometryBuilder::from_wkb(&wkb_objects, dim, coord_type, arr.metadata(), true)?; + MixedGeometryBuilder::from_wkb(&wkb_objects, dim, coord_type, arr.metadata(), false)?; Ok(builder.finish()) } } @@ -96,7 +96,7 @@ impl FromWKB for GeometryCollectionArray { dim, coord_type, arr.metadata(), - true, + false, )?; Ok(builder.finish()) } @@ -111,7 +111,7 @@ impl FromWKB for GeometryArray { _dim: Dimension, ) -> Result { let wkb_objects: Vec>> = arr.iter().collect(); - let builder = GeometryBuilder::from_wkb(&wkb_objects, coord_type, arr.metadata(), true)?; + let builder = GeometryBuilder::from_wkb(&wkb_objects, coord_type, arr.metadata(), false)?; Ok(builder.finish()) } }