Skip to content

Commit

Permalink
Remove temporal strategies, fix tests and docs
Browse files Browse the repository at this point in the history
  • Loading branch information
chmp committed Jan 19, 2025
1 parent c240d69 commit 3c4bb73
Show file tree
Hide file tree
Showing 13 changed files with 105 additions and 235 deletions.
21 changes: 9 additions & 12 deletions serde_arrow/Status.md
Original file line number Diff line number Diff line change
Expand Up @@ -88,26 +88,25 @@ The page documents the supported types both from an Arrow and a Rust perspective
#### `chrono::DateTime<Utc>`

- is serialized / deserialized as strings
- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("Utc"))`, `Date64` with strategy `UtcStrAsDate64`
- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("UTC"))`
- `from_samples` detects
- `LargeUtf8` without configuration
- `Date64` with strategy `UtcStrAsDate64` when setting `guess_dates = true`
- `Timestamp(Millisecond, Some("UTC"))` when setting `guess_dates = true`
- `from_type` is not supported, as the type is not self-describing

With [`chrono::serde::ts_microseconds`][chrono-ts-microseconds]:

- is serialized / deserialized as `i64`
- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("Utc"))`, `Date64` without Strategy,
`Date64` with strategy `UtcStrAsDate64`
- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("UTC"))`
- `from_samples` and `from_type` detect `Int64`

#### `chrono::NaiveDateTime`

- is serialized / deserialized as strings
- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., None)`, `Date64` with strategy `NaiveStrAsDate64`
- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., None)`
- `from_samples` detects
- `LargeUtf8` without configuration
- `Date64` with strategy `NaiveStrAsDate64` when setting `guess_dates = true`
- `Timestamp(Millisecond, None)` when setting `guess_dates = true`
- `from_type` is not supported, as the type is not self-describing

#### `chrono::NaiveTime`
Expand Down Expand Up @@ -153,21 +152,19 @@ With [`chrono::serde::ts_microseconds`][chrono-ts-microseconds]:
#### `jiff::DateTime`

- is serialized as Serde strings
- can me mapped to `Utf8`, `LargeUtf8`, `Timestmap(.., None)`, `Date64` with strategy
`NaiveStrAsDate64`
- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., None)`
- `from_samples` detects
- `LargeUtf8` without configuration
- `Date64` with strategy `NaiveStrAsDate64` when setting `guess_dates = true`
- `Timestamp(Millisecond, None)` when setting `guess_dates = true`
- `from_type` is not supported, as the type is not self-describing

#### `jiff::Timestamp`

- is serialized as Serde strings
- can me mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("UTC"))`, `Date64` with strategy
`UtcStrAsDate64`
- can be mapped to `Utf8`, `LargeUtf8`, `Timestamp(.., Some("UTC"))`
- `from_samples` detects
- `LargeUtf8` without configuration
- `Date64` with strategy `UtcStrDate64` when setting `guess_dates = true`
- `Timestamp(Millisecond, Some("UTC"))` when setting `guess_dates = true`
- `from_type` is not supported, as the type is not self-describing

#### `jiff::Span`
Expand Down
60 changes: 29 additions & 31 deletions serde_arrow/src/_impl/docs/quickstart.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
//! `NaiveDateTime`, the values are per default encoded as strings. To store them compactly as
//! integer columns, the data type has to be modified.
//!
//! For example, consider a list of [`NaiveDateTime`][chrono::NaiveDateTime]
//! objects. The traced field `val` will be of type `Utf8`.
//! For example, consider a list of [`NaiveDateTime`][chrono::NaiveDateTime] objects. The traced
//! field `val` will be of type `Utf8`.
//!
//! ```rust
//! # #[cfg(has_arrow)]
Expand All @@ -52,36 +52,36 @@
//! # #[cfg(not(has_arrow))] fn main() { }
//! ```
//!
//! To store it as `Date64` field, modify the data type as in
//! To store it as a `Timestamp` field, modify the data type as in
//!
//! ```rust
//! # #[cfg(has_arrow)]
//! # fn main() {
//! # use serde_arrow::_impl::arrow::datatypes::{DataType, Field};
//! # use serde_arrow::_impl::arrow::datatypes::{DataType, TimeUnit, Field};
//! # use serde_arrow::schema::Strategy;
//! # let mut fields = vec![Field::new("dummy", DataType::Null, true)];
//! fields[0] = Field::new("item", DataType::Date64, false)
//! .with_metadata(Strategy::NaiveStrAsDate64.into());
//! fields[0] = Field::new("item", DataType::Timestamp(TimeUnit::Millisecond, None), false);
//! # }
//! # #[cfg(not(has_arrow))] fn main() { }
//! ```
//!
//! Integer fields containing timestamps in milliseconds since the epoch or
//! `DateTime<Utc>` objects can be directly stored as `Date64` without any
//! configuration:
//! Integer fields containing timestamps in milliseconds since the epoch or `DateTime<Utc>` objects
//! can be directly stored as `Timestamp(..)` without any configuration:
//!
//! ```rust
//! # #[cfg(has_arrow)]
//! # fn main() -> serde_arrow::_impl::PanicOnError<()> {
//! # use std::sync::Arc;
//! # use serde_arrow::_impl::arrow::datatypes::{DataType, Field};
//! # use serde_arrow::_impl::arrow::datatypes::{DataType, TimeUnit, Field};
//! # use serde_arrow::utils::Item;
//! let records: &[Item<i64>] = &[
//! Item(12 * 60 * 60 * 24 * 1000),
//! Item(9 * 60 * 60 * 24 * 1000),
//! ];
//!
//! let fields = vec![Arc::new(Field::new("item", DataType::Date64, false))];
//! let fields = vec![
//! Arc::new(Field::new("item", DataType::Timestamp(TimeUnit::Millisecond, None), false)),
//! ];
//! let arrays = serde_arrow::to_arrow(&fields, records)?;
//! # Ok(())
//! # }
Expand Down Expand Up @@ -119,8 +119,8 @@
//!
//! ## Dictionary encoding for strings
//!
//! Strings with repeated values can be encoded as dictionaries. The data type
//! of the corresponding field must be changed to `Dictionary`.
//! Strings with repeated values can be encoded as dictionaries. The data type of the corresponding
//! field must be changed to `Dictionary`.
//!
//! For an existing field this can be done via:
//!
Expand All @@ -139,8 +139,8 @@
//! # #[cfg(not(has_arrow))] fn main() { }
//! ```
//!
//! To dictionary encode all string fields, set the `string_dictionary_encoding`
//! of `TracingOptions`, when tracing the fields:
//! To dictionary encode all string fields, set the `string_dictionary_encoding` of
//! `TracingOptions`, when tracing the fields:
//!
//! ```rust
//! # #[cfg(has_arrow)]
Expand All @@ -159,13 +159,12 @@
//!
//! ## Working with enums
//!
//! Rust enums correspond to arrow's union types and are supported by
//! `serde_arrow`. Both enums with and without fields are supported. Variants
//! without fields are mapped to null arrays. Only variants that are included in
//! schema can be serialized or deserialized and the variants must have the
//! correct index. When using
//! [`SchemaLike::from_type`][crate::schema::SchemaLike::from_type] these
//! requirements will automatically be met.
//! Rust enums correspond to arrow's union types and are supported by `serde_arrow`. Both enums with
//! and without fields are supported. Variants without fields are mapped to null arrays. Only
//! variants that are included in the schema can be serialized or deserialized, and the variants
//! must have the correct index. When using
//! [`SchemaLike::from_type`][crate::schema::SchemaLike::from_type] these requirements will
//! automatically be met.
//!
//! For example:
//!
Expand All @@ -188,10 +187,10 @@
//! - `type = 1`: `Struct { 0: u32, 1: u32 }`
//! - `type = 2`: `Struct { a: f32, b: f32 }`
//!
//! Enums without data can also be serialized to and deserialized from strings,
//! both dictionary encoded or non-dictionary encoded. To select this encoding,
//! either set the field data type manually to a string data type or trace the
//! field with `enums_without_data_as_strings(true)`. E.g.,
//! Enums without data can also be serialized to and deserialized from strings, both dictionary
//! encoded or non-dictionary encoded. To select this encoding, either set the field data type
//! manually to a string data type or trace the field with `enums_without_data_as_strings(true)`.
//! E.g.,
//!
//! ```rust
//! # use serde::{Deserialize, Serialize};
Expand Down Expand Up @@ -249,16 +248,15 @@
//!
//! ## Convert from arrow2 to arrow arrays
//!
//! Both `arrow` and `arrow2` use the Arrow memory format. Hence, it is possible
//! to convert arrays between both packages with minimal work using their
//! respective FFI interfaces:
//! Both `arrow` and `arrow2` use the Arrow memory format. Hence, it is possible to convert arrays
//! between both packages with minimal work using their respective FFI interfaces:
//!
//! - [`arrow2::ffi::export_field_to_c`](https://docs.rs/arrow2/latest/arrow2/ffi/fn.export_field_to_c.html)
//! - [`arrow2::ffi::export_array_to_c`](https://docs.rs/arrow2/latest/arrow2/ffi/fn.export_array_to_c.html)
//! - [`arrow::ffi::ArrowArray::new`](https://docs.rs/arrow/latest/arrow/ffi/struct.ArrowArray.html#method.new)
//!
//! The arrow2 crate includes [a helper
//! trait](https://docs.rs/arrow2/latest/arrow2/array/trait.Arrow2Arrow.html) to
//! perform this conversion when used with the `arrow` feature.
//! trait](https://docs.rs/arrow2/latest/arrow2/array/trait.Arrow2Arrow.html) to perform this
//! conversion when used with the `arrow` feature.
//!
//!
30 changes: 9 additions & 21 deletions serde_arrow/src/internal/deserialization/array_deserializer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ pub enum ArrayDeserializer<'a> {
}

impl<'a> ArrayDeserializer<'a> {
pub fn new(path: String, strategy: Option<&Strategy>, array: View<'a>) -> Result<Self> {
// TODO: decide whether to keep strategy parameter
pub fn new(path: String, _strategy: Option<&Strategy>, array: View<'a>) -> Result<Self> {
use {ArrayDeserializer as D, View as V};
match array {
View::Null(_) => Ok(Self::Null(NullDeserializer::new(path))),
Expand Down Expand Up @@ -105,26 +106,13 @@ impl<'a> ArrayDeserializer<'a> {
))),
V::Time32(view) => Ok(D::Time32(TimeDeserializer::new(path, view))),
V::Time64(view) => Ok(D::Time64(TimeDeserializer::new(path, view))),
V::Timestamp(view) => match strategy {
// TODO: fix this: move functionality into timestamp deserializer
Some(Strategy::DateTimeAsStr) => Ok(Self::Date64(Date64Deserializer::new(
path,
view.values,
view.validity,
view.unit,
is_utc_timestamp(view.timezone.as_deref())?,
))),
Some(strategy) => {
fail!("Invalid strategy: {strategy} is not supported for timestamp field")
}
None => Ok(Self::Date64(Date64Deserializer::new(
path,
view.values,
view.validity,
view.unit,
is_utc_timestamp(view.timezone.as_deref())?,
))),
},
V::Timestamp(view) => Ok(Self::Date64(Date64Deserializer::new(
path,
view.values,
view.validity,
view.unit,
is_utc_timestamp(view.timezone.as_deref())?,
))),
V::Duration(view) => Ok(D::Duration(DurationDeserializer::new(
path,
view.unit,
Expand Down
9 changes: 3 additions & 6 deletions serde_arrow/src/internal/schema/from_samples/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use serde::{ser::Impossible, Serialize};
use crate::internal::{
chrono,
error::{fail, try_, Context, ContextSupport, Error, Result},
schema::{Strategy, TracingMode, TracingOptions},
schema::{TracingMode, TracingOptions},
};

use super::tracer::{
Expand Down Expand Up @@ -340,14 +340,11 @@ impl<'a> serde::ser::Serializer for TracerSerializer<'a> {
(self.0.get_options().string_type(), None)
} else {
if chrono::matches_naive_datetime(s) {
(
DataType::Timestamp(TimeUnit::Millisecond, None),
Some(Strategy::DateTimeAsStr),
)
(DataType::Timestamp(TimeUnit::Millisecond, None), None)
} else if chrono::matches_utc_datetime(s) {
(
DataType::Timestamp(TimeUnit::Millisecond, Some(String::from("UTC"))),
Some(Strategy::DateTimeAsStr),
None,
)
} else if chrono::matches_naive_time(s) {
(DataType::Time64(TimeUnit::Nanosecond), None)
Expand Down
48 changes: 21 additions & 27 deletions serde_arrow/src/internal/schema/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,7 @@ pub trait Sealed {}
/// ```
///
pub trait SchemaLike: Sized + Sealed {
/// Build the schema from an object that implements serialize (e.g.,
/// `serde_json::Value`)
/// Build the schema from an object that implements serialize (e.g., `serde_json::Value`)
///
/// ```rust
/// # #[cfg(has_arrow)]
Expand Down Expand Up @@ -99,12 +98,9 @@ pub trait SchemaLike: Sized + Sealed {
///
/// - `"name"` (**required**): the name of the field
/// - `"data_type"` (**required**): the data type of the field as a string
/// - `"nullable"` (**optional**): if `true`, the field can contain null
/// values
/// - `"strategy"` (**optional**): if given a string describing the strategy
/// to use (e.g., "NaiveStrAsDate64").
/// - `"children"` (**optional**): a list of child fields, the semantics
/// depend on the data type
/// - `"nullable"` (**optional**): if `true`, the field can contain null values
/// - `"strategy"` (**optional**): if given, a string describing the strategy to use
/// - `"children"` (**optional**): a list of child fields, the semantics depend on the data type
///
/// The following data types are supported:
///
Expand All @@ -115,22 +111,20 @@ pub trait SchemaLike: Sized + Sealed {
/// - strings: `"Utf8"`, `"LargeUtf8"`
/// - decimals: `"Decimal128(precision, scale)"`, as in `"Decimal128(5, 2)"`
/// - date objects: `"Date32"`
/// - date time objects: , `"Date64"`, `"Timestamp(unit, timezone)"` with
/// unit being one of `Second`, `Millisecond`, `Microsecond`,
/// `Nanosecond`.
/// - time objects: `"Time32(unit)"`, `"Time64(unit)"` with unit being one
/// of `Second`, `Millisecond`, `Microsecond`, `Nanosecond`.
/// - durations: `"Duration(unit)"` with unit being one of `Second`,
/// - date time objects: `"Date64"`, `"Timestamp(unit, timezone)"` with unit being one of
/// `Second`, `Millisecond`, `Microsecond`, `Nanosecond`.
/// - time objects: `"Time32(unit)"`, `"Time64(unit)"` with unit being one of `Second`,
/// `Millisecond`, `Microsecond`, `Nanosecond`.
/// - lists: `"List"`, `"LargeList"`. `"children"` must contain a single
/// field named `"element"` that describes the element type
/// - durations: `"Duration(unit)"` with unit being one of `Second`, `Millisecond`,
/// `Microsecond`, `Nanosecond`.
/// - lists: `"List"`, `"LargeList"`. `"children"` must contain a single field named `"element"`
/// that describes the element type
/// - structs: `"Struct"`. `"children"` must contain the child fields
/// - maps: `"Map"`. `"children"` must contain two fields, named `"key"` and
/// `"value"` that encode the key and value types
/// - maps: `"Map"`. `"children"` must contain two fields, named `"key"` and `"value"` that
/// encode the key and value types
/// - unions: `"Union"`. `"children"` must contain the different variants
/// - dictionaries: `"Dictionary"`. `"children"` must contain two different
/// fields, named `"key"` of integer type and named `"value"` of string
/// type
/// - dictionaries: `"Dictionary"`. `"children"` must contain two different fields, named
/// `"key"` of integer type and named `"value"` of string type
///
fn from_value<T: Serialize>(value: T) -> Result<Self>;

Expand Down Expand Up @@ -435,17 +429,17 @@ fn validate_dictionary_field(field: &Field, key: &DataType, value: &DataType) ->
}

fn validate_date64_field(field: &Field) -> Result<()> {
match get_strategy_from_metadata(&field.metadata)? {
None | Some(Strategy::DateTimeAsStr) => Ok(()),
Some(strategy) => fail!("invalid strategy for Date64 field: {strategy}"),
if let Some(strategy) = get_strategy_from_metadata(&field.metadata)? {
fail!("invalid strategy for Date64 field: {strategy}");
}
Ok(())
}

fn validate_timestamp_field(field: &Field, unit: TimeUnit, tz: Option<&str>) -> Result<()> {
match get_strategy_from_metadata(&field.metadata)? {
None | Some(Strategy::DateTimeAsStr) => Ok(()),
Some(strategy) => fail!("invalid strategy for Timestamp({unit}, {tz:?}) field: {strategy}"),
if let Some(strategy) = get_strategy_from_metadata(&field.metadata)? {
fail!("invalid strategy for Timestamp({unit}, {tz:?}) field: {strategy}");
}
Ok(())
}

fn validate_time32_field(field: &Field, unit: TimeUnit) -> Result<()> {
Expand Down
4 changes: 0 additions & 4 deletions serde_arrow/src/internal/schema/strategy.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,6 @@ pub enum Strategy {
/// serialization or deserialization of such a field is attempted, it will
/// result in an error.
UnknownVariant,
/// Serialize date time types as strings
DateTimeAsStr,
}

impl std::fmt::Display for Strategy {
Expand All @@ -67,7 +65,6 @@ impl std::fmt::Display for Strategy {
Self::TupleAsStruct => write!(f, "TupleAsStruct"),
Self::MapAsStruct => write!(f, "MapAsStruct"),
Self::UnknownVariant => write!(f, "UnknownVariant"),
Self::DateTimeAsStr => write!(f, "DateTimeAsStr"),
}
}
}
Expand Down Expand Up @@ -95,7 +92,6 @@ impl FromStr for Strategy {
"TupleAsStruct" => Ok(Self::TupleAsStruct),
"MapAsStruct" => Ok(Self::MapAsStruct),
"UnknownVariant" => Ok(Self::UnknownVariant),
"DateTimeAsStr" => Ok(Self::DateTimeAsStr),
_ => fail!("Unknown strategy {s}"),
}
}
Expand Down
Loading

0 comments on commit 3c4bb73

Please sign in to comment.