diff --git a/rust/geoarrow/src/datatypes.rs b/rust/geoarrow/src/datatypes.rs
index de9e8408..0b3243a7 100644
--- a/rust/geoarrow/src/datatypes.rs
+++ b/rust/geoarrow/src/datatypes.rs
@@ -469,7 +469,7 @@ impl NativeType {
             Mixed(_, _) => "geoarrow.geometry",
             GeometryCollection(_, _) => "geoarrow.geometrycollection",
             Rect(_) => "geoarrow.box",
-            Geometry(_) => "geoarrow.unknown",
+            Geometry(_) => "geoarrow.geometry",
         }
     }
 
@@ -794,7 +794,7 @@ fn parse_multi_polygon(field: &Field) -> Result<NativeType> {
     }
 }
 
-fn parse_geometry(field: &Field) -> Result<NativeType> {
+fn parse_mixed(field: &Field) -> Result<NativeType> {
     match field.data_type() {
         DataType::Union(fields, _) => {
             let mut coord_types: HashSet<CoordType> = HashSet::new();
@@ -927,13 +927,13 @@ fn parse_geometry_collection(field: &Field) -> Result<NativeType> {
     // We need to parse the _inner_ type of the geometry collection as a union so that we can check
     // what coordinate type it's using.
     match field.data_type() {
-        DataType::List(inner_field) => match parse_geometry(inner_field)? {
+        DataType::List(inner_field) => match parse_mixed(inner_field)? {
             NativeType::Mixed(coord_type, dim) => {
                 Ok(NativeType::GeometryCollection(coord_type, dim))
             }
             _ => panic!(),
         },
-        DataType::LargeList(inner_field) => match parse_geometry(inner_field)? {
+        DataType::LargeList(inner_field) => match parse_mixed(inner_field)? {
             NativeType::Mixed(coord_type, dim) => {
                 Ok(NativeType::GeometryCollection(coord_type, dim))
             }
@@ -970,7 +970,7 @@ fn parse_rect(field: &Field) -> NativeType {
     }
 }
 
-fn parse_unknown(field: &Field) -> Result<NativeType> {
+fn parse_geometry(field: &Field) -> Result<NativeType> {
     if let DataType::Union(fields, _mode) = field.data_type() {
         let mut coord_types: HashSet<CoordType> = HashSet::new();
 
@@ -1090,10 +1090,10 @@ impl TryFrom<&Field> for NativeType {
             "geoarrow.multipoint" => parse_multi_point(field)?,
             "geoarrow.multilinestring" => parse_multi_linestring(field)?,
             "geoarrow.multipolygon" => parse_multi_polygon(field)?,
-            "geoarrow.geometry" => parse_geometry(field)?,
             "geoarrow.geometrycollection" => parse_geometry_collection(field)?,
             "geoarrow.box" => parse_rect(field),
-            "geoarrow.unknown" => parse_unknown(field)?,
+            "geoarrow.geometry" => parse_geometry(field)?,
+            // "geoarrow.geometry" => parse_mixed(field)?,
            name => return Err(GeoArrowError::General(format!("Expected GeoArrow native type, got '{}'.\nIf you're passing a serialized GeoArrow type like 'geoarrow.wkb' or 'geoarrow.wkt', you need to parse to a native representation.", name))),
        };
        Ok(data_type)
diff --git a/rust/geoarrow/src/io/crs.rs b/rust/geoarrow/src/io/crs.rs
index 55424de0..e391cad4 100644
--- a/rust/geoarrow/src/io/crs.rs
+++ b/rust/geoarrow/src/io/crs.rs
@@ -1,3 +1,6 @@
+//! Defines CRS transforms used for writing GeoArrow data to file formats that require different
+//! CRS representations.
+
 use std::fmt::Debug;
 
 use serde_json::Value;
@@ -51,7 +54,9 @@ pub trait CRSTransform: Debug {
     }
 }
 
-/// A default implementation for [CRSTransform] which errors on any CRS conversion.
+/// A default implementation for [CRSTransform] which does not do any CRS conversion.
+///
+/// Instead of raising an error, this will **silently drop any CRS information when writing data**.
 #[derive(Debug, Clone, Default)]
 pub struct DefaultCRSTransform {}
 
diff --git a/rust/geoarrow/src/io/csv/mod.rs b/rust/geoarrow/src/io/csv/mod.rs
index 5d0c94fa..fc63844f 100644
--- a/rust/geoarrow/src/io/csv/mod.rs
+++ b/rust/geoarrow/src/io/csv/mod.rs
@@ -1,4 +1,46 @@
 //! Read from and write to CSV files.
+//!
+//! # Examples
+//!
+//! ```
+//! use std::io::{Cursor, Seek};
+//!
+//! use arrow_array::RecordBatchReader;
+//!
+//! use geoarrow::array::CoordType;
+//! use geoarrow::io::csv::{infer_csv_schema, read_csv, CSVReaderOptions};
+//! use geoarrow::table::Table;
+//!
+//! let s = r#"
+//! address,type,datetime,report location,incident number
+//! 904 7th Av,Car Fire,05/22/2019 12:55:00 PM,POINT (-122.329051 47.6069),F190051945
+//! 9610 53rd Av S,Aid Response,05/22/2019 12:55:00 PM,POINT (-122.266529 47.515984),F190051946"#;
+//! let mut cursor = Cursor::new(s);
+//!
+//! let options = CSVReaderOptions {
+//!     coord_type: CoordType::Separated,
+//!     geometry_column_name: Some("report location".to_string()),
+//!     has_header: Some(true),
+//!     ..Default::default()
+//! };
+//!
+//! // Note: this initial schema currently represents the CSV data _on disk_. That is, the
+//! // geometry column is represented as a string. This may change in the future.
+//! let (schema, _read_records, _geometry_column_name) =
+//!     infer_csv_schema(&mut cursor, &options).unwrap();
+//! cursor.rewind().unwrap();
+//!
+//! // `read_csv` returns a RecordBatchReader, which enables streaming the CSV without reading
+//! // all of it.
+//! let record_batch_reader = read_csv(cursor, schema, options).unwrap();
+//! let geospatial_schema = record_batch_reader.schema();
+//! let table = Table::try_new(
+//!     record_batch_reader.collect::<Result<Vec<_>, _>>().unwrap(),
+//!     geospatial_schema,
+//! )
+//! .unwrap();
+//! ```
+//!
 
 pub use reader::{infer_csv_schema, read_csv, CSVReaderOptions};
 pub use writer::write_csv;
diff --git a/rust/geoarrow/src/io/csv/reader.rs b/rust/geoarrow/src/io/csv/reader.rs
index 73317bcc..8fae0d8d 100644
--- a/rust/geoarrow/src/io/csv/reader.rs
+++ b/rust/geoarrow/src/io/csv/reader.rs
@@ -95,29 +95,34 @@ impl Default for CSVReaderOptions {
     }
 }
 
-/// Infer a CSV file's schema
+/// Infer a CSV file's schema.
+///
+/// By default, the reader will **scan the entire CSV file** to infer the data's
+/// schema. If your data is large, you can limit the number of records scanned
+/// with the [CSVReaderOptions].
+///
 /// Returns (Schema, records_read, geometry column name)
 ///
 /// Note that the geometry column in the Schema is still left as a String.
 pub fn infer_csv_schema(
     reader: impl Read,
     options: &CSVReaderOptions,
-) -> Result<(Schema, usize, String)> {
+) -> Result<(SchemaRef, usize, String)> {
     let format = options.to_format();
 
     let (schema, records_read) = format.infer_schema(reader, options.max_records)?;
     let geometry_col_name = find_geometry_column(&schema, options.geometry_column_name.as_deref())?;
 
-    Ok((schema, records_read, geometry_col_name))
+    Ok((Arc::new(schema), records_read, geometry_col_name))
 }
 
-/// Read a CSV file to a Table
+/// Read a CSV file to a [RecordBatchReader].
 ///
 /// This expects a geometry to be encoded as WKT within one column.
 ///
-/// Note that this is Read and not Read + Seek. This means that you must infer the schema yourself
-/// before calling this function. This allows using with objects that are only `Read` in the case
-/// when you already know the file's schema.
+/// Note that the input required here is [`Read`] and not [`Read`] + [`Seek`][std::io::Seek]. This
+/// means that you must infer the schema yourself before calling this function. This allows using
+/// with objects that are only `Read` in the case when you already know the file's schema.
 ///
 /// This schema is expected to be the schema inferred by `arrow-csv`'s
 /// [`infer_schema`][Format::infer_schema]. That means the geometry should be a string in the
diff --git a/rust/geoarrow/src/io/gdal/mod.rs b/rust/geoarrow/src/io/gdal/mod.rs
index a6d363c8..7981536d 100644
--- a/rust/geoarrow/src/io/gdal/mod.rs
+++ b/rust/geoarrow/src/io/gdal/mod.rs
@@ -1,3 +1,5 @@
+//! Read-only integration with [GDAL][gdal].
+
 mod reader;
 
 pub use reader::read_gdal;
diff --git a/rust/geoarrow/src/io/geos/mod.rs b/rust/geoarrow/src/io/geos/mod.rs
index aced9efe..cf2f6c8b 100644
--- a/rust/geoarrow/src/io/geos/mod.rs
+++ b/rust/geoarrow/src/io/geos/mod.rs
@@ -1,4 +1,4 @@
 //! Export to and import from data structures of the [`geos`] crate.
 
 mod array;
-pub mod scalar;
+pub(crate) mod scalar;
diff --git a/rust/geoarrow/src/io/mod.rs b/rust/geoarrow/src/io/mod.rs
index 5f450c88..38eeadd3 100644
--- a/rust/geoarrow/src/io/mod.rs
+++ b/rust/geoarrow/src/io/mod.rs
@@ -6,7 +6,7 @@
 pub mod crs;
 #[cfg(feature = "csv")]
 pub mod csv;
-pub mod display;
+pub(crate) mod display;
 #[cfg(feature = "flatgeobuf")]
 pub mod flatgeobuf;
 #[cfg(feature = "gdal")]
@@ -14,7 +14,7 @@ pub mod gdal;
 pub mod geojson;
 pub mod geojson_lines;
 #[cfg(feature = "geos")]
-pub mod geos;
+pub(crate) mod geos;
 pub mod geozero;
 pub mod ipc;
 #[cfg(feature = "parquet")]
diff --git a/rust/geoarrow/src/io/shapefile/mod.rs b/rust/geoarrow/src/io/shapefile/mod.rs
index ac1f77ff..868124c5 100644
--- a/rust/geoarrow/src/io/shapefile/mod.rs
+++ b/rust/geoarrow/src/io/shapefile/mod.rs
@@ -1,3 +1,7 @@
+//! Read from [Shapefile](https://www.esri.com/content/dam/esrisites/sitecore-archive/Files/Pdfs/library/whitepapers/pdfs/shapefile.pdf) datasets.
+//!
+//! This wraps the [shapefile] crate.
+
 mod reader;
 mod scalar;
 
diff --git a/rust/geoarrow/src/io/stream.rs b/rust/geoarrow/src/io/stream.rs
index 19534e91..7076f51c 100644
--- a/rust/geoarrow/src/io/stream.rs
+++ b/rust/geoarrow/src/io/stream.rs
@@ -3,8 +3,8 @@ use crate::table::Table;
 use arrow_array::{RecordBatchIterator, RecordBatchReader as _RecordBatchReader};
 use arrow_schema::SchemaRef;
 
-/// A newtype wrapper around an [arrow_array::RecordBatchReader] so that we can impl the
-/// [geozero::GeozeroDatasource] trait.
+/// A newtype wrapper around an [`arrow_array::RecordBatchReader`] so that we can implement the
+/// [`geozero::GeozeroDatasource`] trait on it.
 pub struct RecordBatchReader(Option<Box<dyn _RecordBatchReader>>);
 
 impl RecordBatchReader {
diff --git a/rust/geoarrow/src/table.rs b/rust/geoarrow/src/table.rs
index 9bc87bb6..d35fbb9c 100644
--- a/rust/geoarrow/src/table.rs
+++ b/rust/geoarrow/src/table.rs
@@ -29,11 +29,10 @@ pub(crate) static GEOARROW_EXTENSION_NAMES: Set<&'static str> = phf_set! {
     "geoarrow.geometrycollection",
     "geoarrow.wkb",
     "geoarrow.wkt",
-    "geoarrow.unknown",
     "ogc.wkb",
 };
 
-/// An Arrow table that MAY contain one or more geospatial columns.
+/// An Arrow table that may contain one or more geospatial columns.
 ///
 /// This Table object is designed to be interoperable with non-geospatial Arrow libraries, and thus
 /// does not _require_ a geometry column.