Commit 900dd1d: Improved rust docs

kylebarron committed Dec 10, 2024
1 parent bf58429
Showing 10 changed files with 79 additions and 22 deletions.
14 changes: 7 additions & 7 deletions rust/geoarrow/src/datatypes.rs
@@ -469,7 +469,7 @@ impl NativeType {
Mixed(_, _) => "geoarrow.geometry",
GeometryCollection(_, _) => "geoarrow.geometrycollection",
Rect(_) => "geoarrow.box",
Geometry(_) => "geoarrow.unknown",
Geometry(_) => "geoarrow.geometry",
}
}

@@ -794,7 +794,7 @@ fn parse_multi_polygon(field: &Field) -> Result<NativeType> {
}
}

-fn parse_geometry(field: &Field) -> Result<NativeType> {
+fn parse_mixed(field: &Field) -> Result<NativeType> {
match field.data_type() {
DataType::Union(fields, _) => {
let mut coord_types: HashSet<CoordType> = HashSet::new();
@@ -927,13 +927,13 @@ fn parse_geometry_collection(field: &Field) -> Result<NativeType> {
// We need to parse the _inner_ type of the geometry collection as a union so that we can check
// what coordinate type it's using.
match field.data_type() {
-DataType::List(inner_field) => match parse_geometry(inner_field)? {
+DataType::List(inner_field) => match parse_mixed(inner_field)? {
NativeType::Mixed(coord_type, dim) => {
Ok(NativeType::GeometryCollection(coord_type, dim))
}
_ => panic!(),
},
-DataType::LargeList(inner_field) => match parse_geometry(inner_field)? {
+DataType::LargeList(inner_field) => match parse_mixed(inner_field)? {
NativeType::Mixed(coord_type, dim) => {
Ok(NativeType::GeometryCollection(coord_type, dim))
}
@@ -970,7 +970,7 @@ fn parse_rect(field: &Field) -> NativeType {
}
}

-fn parse_unknown(field: &Field) -> Result<NativeType> {
+fn parse_geometry(field: &Field) -> Result<NativeType> {
if let DataType::Union(fields, _mode) = field.data_type() {
let mut coord_types: HashSet<CoordType> = HashSet::new();

@@ -1090,10 +1090,10 @@ impl TryFrom<&Field> for NativeType {
"geoarrow.multipoint" => parse_multi_point(field)?,
"geoarrow.multilinestring" => parse_multi_linestring(field)?,
"geoarrow.multipolygon" => parse_multi_polygon(field)?,
"geoarrow.geometry" => parse_geometry(field)?,
"geoarrow.geometrycollection" => parse_geometry_collection(field)?,
"geoarrow.box" => parse_rect(field),
"geoarrow.unknown" => parse_unknown(field)?,
"geoarrow.geometry" => parse_geometry(field)?,
// "geoarrow.geometry" => parse_mixed(field)?,
name => return Err(GeoArrowError::General(format!("Expected GeoArrow native type, got '{}'.\nIf you're passing a serialized GeoArrow type like 'geoarrow.wkb' or 'geoarrow.wkt', you need to parse to a native representation.", name))),
};
Ok(data_type)
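
Not part of the commit, but for context on the rename above: after this change the type-erased geometry array is tagged with the "geoarrow.geometry" extension name rather than "geoarrow.unknown". A minimal consumer-side sketch of checking for it, assuming only arrow_schema's Field API and Arrow's standard "ARROW:extension:name" metadata key:

use arrow_schema::Field;

/// Returns true if `field` carries the GeoArrow geometry extension name used
/// after this commit (previously spelled "geoarrow.unknown").
fn is_geoarrow_geometry(field: &Field) -> bool {
    field
        .metadata()
        .get("ARROW:extension:name")
        .map(|name| name.as_str() == "geoarrow.geometry")
        .unwrap_or(false)
}
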
7 changes: 6 additions & 1 deletion rust/geoarrow/src/io/crs.rs
@@ -1,3 +1,6 @@
+//! Defines CRS transforms used for writing GeoArrow data to file formats that require different
+//! CRS representations.
use std::fmt::Debug;

use serde_json::Value;
@@ -51,7 +54,9 @@ pub trait CRSTransform: Debug {
}
}

-/// A default implementation for [CRSTransform] which errors on any CRS conversion.
+/// A default implementation for [CRSTransform] which does not do any CRS conversion.
+///
+/// Instead of raising an error, this will **silently drop any CRS information when writing data**.
#[derive(Debug, Clone, Default)]
pub struct DefaultCRSTransform {}

42 changes: 42 additions & 0 deletions rust/geoarrow/src/io/csv/mod.rs
@@ -1,4 +1,46 @@
//! Read from and write to CSV files.
//!
//! # Examples
//!
//! ```
//! use std::io::{Cursor, Seek};
//!
//! use arrow_array::RecordBatchReader;
//!
//! use crate::array::CoordType;
//! use crate::io::csv::{infer_csv_schema, read_csv, CSVReaderOptions};
//! use crate::table::Table;
//!
//! let s = r#"
//! address,type,datetime,report location,incident number
//! 904 7th Av,Car Fire,05/22/2019 12:55:00 PM,POINT (-122.329051 47.6069),F190051945
//! 9610 53rd Av S,Aid Response,05/22/2019 12:55:00 PM,POINT (-122.266529 47.515984),F190051946"#;
//! let mut cursor = Cursor::new(s);
//!
//! let options = CSVReaderOptions {
//! coord_type: CoordType::Separated,
//! geometry_column_name: Some("report location".to_string()),
//! has_header: Some(true),
//! ..Default::default()
//! };
//!
//! // Note: this initial schema currently represents the CSV data _on disk_. That is, the
//! // geometry column is represented as a string. This may change in the future.
//! let (schema, _read_records, _geometry_column_name) =
//! infer_csv_schema(&mut cursor, &options).unwrap();
//! cursor.rewind().unwrap();
//!
//! // `read_csv` returns a RecordBatchReader, which enables streaming the CSV without reading
//! // all of it.
//! let record_batch_reader = read_csv(cursor, schema, options).unwrap();
//! let geospatial_schema = record_batch_reader.schema();
//! let table = Table::try_new(
//! record_batch_reader.collect::<Result<_, _>>().unwrap(),
//! geospatial_schema,
//! )
//! .unwrap();
//! ```
//!
pub use reader::{infer_csv_schema, read_csv, CSVReaderOptions};
pub use writer::write_csv;
19 changes: 12 additions & 7 deletions rust/geoarrow/src/io/csv/reader.rs
@@ -95,29 +95,34 @@ impl Default for CSVReaderOptions {
}
}

-/// Infer a CSV file's schema
+/// Infer a CSV file's schema.
///
+/// By default, the reader will **scan the entire CSV file** to infer the data's
+/// schema. If your data is large, you can limit the number of records scanned
+/// with the [CSVReaderOptions].
///
/// Returns (Schema, records_read, geometry column name)
///
/// Note that the geometry column in the Schema is still left as a String.
pub fn infer_csv_schema(
reader: impl Read,
options: &CSVReaderOptions,
-) -> Result<(Schema, usize, String)> {
+) -> Result<(SchemaRef, usize, String)> {
let format = options.to_format();
let (schema, records_read) = format.infer_schema(reader, options.max_records)?;

let geometry_col_name = find_geometry_column(&schema, options.geometry_column_name.as_deref())?;

-Ok((schema, records_read, geometry_col_name))
+Ok((Arc::new(schema), records_read, geometry_col_name))
}

-/// Read a CSV file to a Table
+/// Read a CSV file to a [RecordBatchReader].
///
/// This expects a geometry to be encoded as WKT within one column.
///
-/// Note that this is Read and not Read + Seek. This means that you must infer the schema yourself
-/// before calling this function. This allows using with objects that are only `Read` in the case
-/// when you already know the file's schema.
+/// Note that the input required here is [`Read`] and not [`Read`] + [`Seek`][std::io::Seek]. This
+/// means that you must infer the schema yourself before calling this function. This allows using
+/// with objects that are only `Read` in the case when you already know the file's schema.
///
/// This schema is expected to be the schema inferred by `arrow-csv`'s
/// [`infer_schema`][Format::infer_schema]. That means the geometry should be a string in the
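
The note above about schema inference scanning the entire CSV file suggests capping it for large inputs. A rough sketch (not from this commit; it assumes CSVReaderOptions exposes a public max_records field of type Option<usize>, as implied by the call to format.infer_schema(reader, options.max_records) above):

use std::fs::File;

use geoarrow::io::csv::{infer_csv_schema, CSVReaderOptions};

fn infer_schema_quickly(path: &str) {
    let mut file = File::open(path).unwrap();
    // Only scan the first 1,000 records instead of the entire file.
    let options = CSVReaderOptions {
        max_records: Some(1_000),
        ..Default::default()
    };
    let (schema, records_read, geometry_column) =
        infer_csv_schema(&mut file, &options).unwrap();
    println!(
        "inferred {} fields from {} records; geometry column: {}",
        schema.fields().len(),
        records_read,
        geometry_column
    );
}
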
2 changes: 2 additions & 0 deletions rust/geoarrow/src/io/gdal/mod.rs
@@ -1,3 +1,5 @@
+//! Read-only integration with [GDAL][gdal].
mod reader;

pub use reader::read_gdal;
2 changes: 1 addition & 1 deletion rust/geoarrow/src/io/geos/mod.rs
@@ -1,4 +1,4 @@
//! Export to and import from data structures of the [`geos`] crate.
mod array;
-pub mod scalar;
+pub(crate) mod scalar;
4 changes: 2 additions & 2 deletions rust/geoarrow/src/io/mod.rs
@@ -6,15 +6,15 @@
pub mod crs;
#[cfg(feature = "csv")]
pub mod csv;
-pub mod display;
+pub(crate) mod display;
#[cfg(feature = "flatgeobuf")]
pub mod flatgeobuf;
#[cfg(feature = "gdal")]
pub mod gdal;
pub mod geojson;
pub mod geojson_lines;
#[cfg(feature = "geos")]
-pub mod geos;
+pub(crate) mod geos;
pub mod geozero;
pub mod ipc;
#[cfg(feature = "parquet")]
4 changes: 4 additions & 0 deletions rust/geoarrow/src/io/shapefile/mod.rs
@@ -1,3 +1,7 @@
+//! Read from [Shapefile](https://www.esri.com/content/dam/esrisites/sitecore-archive/Files/Pdfs/library/whitepapers/pdfs/shapefile.pdf) datasets.
+//!
+//! This wraps the [shapefile] crate.
mod reader;
mod scalar;

4 changes: 2 additions & 2 deletions rust/geoarrow/src/io/stream.rs
@@ -3,8 +3,8 @@ use crate::table::Table;
use arrow_array::{RecordBatchIterator, RecordBatchReader as _RecordBatchReader};
use arrow_schema::SchemaRef;

-/// A newtype wrapper around an [arrow_array::RecordBatchReader] so that we can impl the
-/// [geozero::GeozeroDatasource] trait.
+/// A newtype wrapper around an [`arrow_array::RecordBatchReader`] so that we can implement the
+/// [`geozero::GeozeroDatasource`] trait on it.
pub struct RecordBatchReader(Option<Box<dyn _RecordBatchReader>>);

impl RecordBatchReader {
3 changes: 1 addition & 2 deletions rust/geoarrow/src/table.rs
@@ -29,11 +29,10 @@ pub(crate) static GEOARROW_EXTENSION_NAMES: Set<&'static str> = phf_set! {
"geoarrow.geometrycollection",
"geoarrow.wkb",
"geoarrow.wkt",
"geoarrow.unknown",
"ogc.wkb",
};

-/// An Arrow table that MAY contain one or more geospatial columns.
+/// An Arrow table that may contain one or more geospatial columns.
///
/// This Table object is designed to be interoperable with non-geospatial Arrow libraries, and thus
/// does not _require_ a geometry column.
