diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 3ec9134b3..6c9ab4710 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -45,6 +45,9 @@ jobs: - name: Test with no default features run: cargo test --no-default-features + - name: Test with alloc feature + run: cargo test --no-default-features --features alloc + rust-minimal-versions: name: Compile with minimum dependency versions runs-on: ubuntu-latest diff --git a/Cargo.toml b/Cargo.toml index 0afef66d5..a448ac9db 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "roe" -version = "0.1.0" # remember to set `html_root_url` in `src/lib.rs`. +version = "0.0.1" # remember to set `html_root_url` in `src/lib.rs`. authors = ["Ryan Lopopolo "] license = "MIT" edition = "2018" @@ -16,7 +16,7 @@ include = ["src/**/*", "tests/**/*", "LICENSE", "README.md"] [features] default = ["std"] # Enable dependency on `std`, the Rust standard library. This feature enables -# `std::error::Error` implementations on the error types in `boba`. +# `std::error::Error` implementations on the error types in `roe`. std = ["alloc"] # Enable a dependency on `alloc`, The Rust collections library. This feature # enables APIs that depend on `Vec` and `String`. @@ -33,6 +33,6 @@ version-sync = "0.9, >= 0.9.2" [package.metadata.docs.rs] # This sets the default target to `x86_64-unknown-linux-gnu` and only builds -# that target. `boba` has the same API and code on all targets. +# that target. `roe` has the same API and code on all targets. targets = ["x86_64-unknown-linux-gnu"] rustdoc-args = ["--cfg", "docsrs"] diff --git a/README.md b/README.md index 9ad5cf6ae..abbc81370 100644 --- a/README.md +++ b/README.md @@ -21,20 +21,30 @@ and uppercase forms. This crate is used to implement [`String#capitalize`], This crate depends on [`bstr`]. +## Status + +This crate is currently a _work in progress_. 
When the API is complete, Roe will +support lowercase, uppercase, titlecase, and case folding iterators for +conventionally UTF-8 byte slices. + +Roe will implement support for full, Turkic, ASCII, and case folding transforms. + ## Usage Add this to your `Cargo.toml`: ```toml [dependencies] -roe = "0.1" +roe = "0.0.1" ``` Then convert case like: ```rust -assert_eq!(roe::lowercase("Pineapple").collect::>(), b"pineapple"); -assert_eq!(roe::upercase(b"xexax").collect::>(), b"XEXAX"); +use roe::{LowercaseMode, UppercaseMode}; + +assert_eq!(roe::lowercase(b"Artichoke Ruby", LowercaseMode::Ascii).collect::>(), b"artichoke ruby"); +assert_eq!(roe::uppercase("Αύριο".as_bytes(), UppercaseMode::Full).collect::>(), "ΑΎΡΙΟ".as_bytes()); ``` ## Crate Features @@ -47,8 +57,8 @@ assert_eq!(roe::upercase(b"xexax").collect::>(), b"XEXAX"); feature enables [`std::error::Error`] implementations on error types in this crate. Enabling the **std** feature also enables the **alloc** feature. - **alloc** - Adds a dependency on [`alloc`], the Rust allocation and - collections library. This feature enables APIs that depend on [`Vec`] and - [`String`]. + collections library. This feature enables APIs that allocate [`String`] or + [`Vec`]. 
## License @@ -69,10 +79,8 @@ assert_eq!(roe::upercase(b"xexax").collect::>(), b"XEXAX"); [`symbol#upcase`]: https://ruby-doc.org/core-2.6.3/Symbol.html#method-i-upcase [artichoke ruby]: https://github.com/artichoke/artichoke [`bstr`]: https://crates.io/crates/bstr -[`alloc`]: https://doc.rust-lang.org/stable/alloc/index.html -[`std`]: https://doc.rust-lang.org/stable/std/index.html -[`std::error::error`]: - https://doc.rust-lang.org/stable/std/error/trait.Error.html -[`vec`]: https://doc.rust-lang.org/stable/alloc/vec/struct.Vec.html +[`alloc`]: https://doc.rust-lang.org/alloc/index.html +[`std`]: https://doc.rust-lang.org/std/index.html +[`std::error::error`]: https://doc.rust-lang.org/std/error/trait.Error.html [`string`]: https://doc.rust-lang.org/stable/alloc/string/struct.String.html -[cargo-fuzz]: https://crates.io/crates/cargo-fuzz +[`vec`]: https://doc.rust-lang.org/stable/alloc/vec/struct.Vec.html diff --git a/Rakefile b/Rakefile index 0555c418e..b2d9b3cfe 100644 --- a/Rakefile +++ b/Rakefile @@ -17,7 +17,7 @@ namespace :lint do FileList['**/{build,lib,main}.rs'].each do |root| FileUtils.touch(root) end - sh 'cargo clippy --workspace --all-features' + sh 'cargo clippy --workspace --all-features --all-targets' end desc 'Lint Rust sources with Clippy restriction pass (unenforced lints)' diff --git a/src/ascii/lowercase.rs b/src/ascii/lowercase.rs index 7f20053e8..6443aaf16 100644 --- a/src/ascii/lowercase.rs +++ b/src/ascii/lowercase.rs @@ -30,6 +30,7 @@ use alloc::vec::Vec; /// [`String#downcase!`]: https://ruby-doc.org/core-2.6.3/String.html#method-i-downcase-21 /// [slice-primitive]: https://doc.rust-lang.org/std/primitive.slice.html#method.make_ascii_lowercase #[inline] +#[allow(clippy::module_name_repetitions)] pub fn make_ascii_lowercase>(mut slice: T) { let slice = slice.as_mut(); slice.make_ascii_lowercase(); @@ -62,6 +63,7 @@ pub fn make_ascii_lowercase>(mut slice: T) { #[inline] #[cfg(feature = "alloc")] #[cfg_attr(docsrs, doc(cfg(feature = 
"alloc")))] +#[allow(clippy::module_name_repetitions)] pub fn to_ascii_lowercase>(slice: T) -> Vec { let slice = slice.as_ref(); slice.to_ascii_lowercase() diff --git a/src/ascii/titlecase.rs b/src/ascii/titlecase.rs index 0ba5233c0..559c617dc 100644 --- a/src/ascii/titlecase.rs +++ b/src/ascii/titlecase.rs @@ -4,7 +4,7 @@ use alloc::vec::Vec; /// Converts the given slice to its ASCII title case equivalent in-place. /// /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z' in the first byte; -/// subsequent bytes with ASCII letters 'A' to 'Z' are mapped to 'a' to 'z; +/// subsequent bytes with ASCII letters 'A' to 'Z' are mapped to 'a' to 'z'; /// non-ASCII letters are unchanged. /// /// This function can be used to implement [`String#capitalize!`] for ASCII @@ -40,6 +40,7 @@ use alloc::vec::Vec; /// /// [`String#capitalize!`]: https://ruby-doc.org/core-2.6.3/String.html#method-i-capitalize-21 #[inline] +#[allow(clippy::module_name_repetitions)] pub fn make_ascii_titlecase>(slice: &mut T) { let slice = slice.as_mut(); if let Some((head, tail)) = slice.split_first_mut() { @@ -52,7 +53,7 @@ pub fn make_ascii_titlecase>(slice: &mut T) { /// mapped to its ASCII title case equivalent. /// /// ASCII letters 'a' to 'z' are mapped to 'A' to 'Z' in the first byte; -/// subsequent bytes with ASCII letters 'A' to 'Z' are mapped to 'a' to 'z; +/// subsequent bytes with ASCII letters 'A' to 'Z' are mapped to 'a' to 'z'; /// non-ASCII letters are unchanged. 
/// /// This function can be used to implement [`String#capitalize`] and @@ -76,6 +77,7 @@ pub fn make_ascii_titlecase>(slice: &mut T) { #[inline] #[cfg(feature = "alloc")] #[cfg_attr(docsrs, doc(cfg(feature = "alloc")))] +#[allow(clippy::module_name_repetitions)] pub fn to_ascii_titlecase>(slice: T) -> Vec { let slice = slice.as_ref(); let mut titlecase = slice.to_ascii_lowercase(); diff --git a/src/ascii/uppercase.rs b/src/ascii/uppercase.rs index 741304c52..8dc0cc231 100644 --- a/src/ascii/uppercase.rs +++ b/src/ascii/uppercase.rs @@ -30,6 +30,7 @@ use alloc::vec::Vec; /// [`String#upcase!`]: https://ruby-doc.org/core-2.6.3/String.html#method-i-upcase-21 /// [slice-primitive]: https://doc.rust-lang.org/std/primitive.u8.html#method.make_ascii_uppercase #[inline] +#[allow(clippy::module_name_repetitions)] pub fn make_ascii_uppercase>(slice: &mut T) { let slice = slice.as_mut(); slice.make_ascii_uppercase(); @@ -62,6 +63,7 @@ pub fn make_ascii_uppercase>(slice: &mut T) { #[inline] #[cfg(feature = "alloc")] #[cfg_attr(docsrs, doc(cfg(feature = "alloc")))] +#[allow(clippy::module_name_repetitions)] pub fn to_ascii_uppercase>(slice: T) -> Vec { let slice = slice.as_ref(); slice.to_ascii_uppercase() diff --git a/src/lib.rs b/src/lib.rs index 88013e8fb..125245483 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,17 @@ -#![no_std] +#![warn(clippy::all)] +#![warn(clippy::pedantic)] +#![cfg_attr(test, allow(clippy::non_ascii_literal))] +#![cfg_attr(test, allow(clippy::shadow_unrelated))] +#![warn(clippy::cargo)] +#![allow(unknown_lints)] +#![warn(missing_docs, broken_intra_doc_links)] +#![warn(missing_debug_implementations)] +#![warn(missing_copy_implementations)] +#![warn(rust_2018_idioms)] +#![warn(trivial_casts, trivial_numeric_casts)] +#![warn(unused_qualifications)] +#![warn(variant_size_differences)] +#![forbid(unsafe_code)] // Enable feature callouts in generated documentation: // https://doc.rust-lang.org/beta/unstable-book/language-features/doc-cfg.html // @@ 
-6,12 +19,81 @@ #![cfg_attr(docsrs, feature(doc_cfg))] #![cfg_attr(docsrs, feature(doc_alias))] +//! This crate provides [Unicode case mapping] routines and iterators for +//! [conventionally UTF-8 binary strings]. +//! +//! Unicode case mapping or case conversion can be used to transform the +//! characters in a string. To quote the Unicode FAQ: +//! +//! > Case mapping or case conversion is a process whereby strings are converted +//! > to a particular form—uppercase, lowercase, or titlecase—possibly for +//! > display to the user. +//! +//! This crate is currently a *work in progress*. When the API is complete, Roe +//! will support lowercase, uppercase, titlecase, and case folding iterators for +//! conventionally UTF-8 byte slices. +//! +//! Roe will implement support for full, Turkic, ASCII, and case folding +//! transforms. +//! +//! # Usage +//! +//! You can convert case like: +//! +//! ``` +//! # use roe::{LowercaseMode, UppercaseMode}; +//! assert_eq!(roe::lowercase(b"Artichoke Ruby", LowercaseMode::Ascii).collect::>(), b"artichoke ruby"); +//! assert_eq!(roe::uppercase("Αύριο".as_bytes(), UppercaseMode::Full).collect::>(), "ΑΎΡΙΟ".as_bytes()); +//! ``` +//! +//! +//! Roe provides fast path routines that assume the byte slice is ASCII-only. +//! +//! # Crate Features +//! +//! Roe is `no_std` compatible with an optional dependency on the [`alloc`] +//! crate. +//! +//! Roe has several Cargo features, all of which are enabled by default: +//! +//! - **std** - Adds a dependency on [`std`], the Rust Standard Library. This +//! feature enables [`std::error::Error`] implementations on error types in +//! this crate. Enabling the **std** feature also enables the **alloc** +//! feature. +//! - **alloc** - Adds a dependency on [`alloc`], the Rust allocation and +//! collections library. This feature enables APIs that allocate [`String`] or +//! [`Vec`]. +//! +//! [Unicode case mapping]: https://unicode.org/faq/casemap_charprop.html#casemap +//! 
[conventionally UTF-8 binary strings]: https://docs.rs/bstr/0.2.*/bstr/#when-should-i-use-byte-strings +//! [`std`]: https://doc.rust-lang.org/std/index.html +//! [`std::error::Error`]: https://doc.rust-lang.org/std/error/trait.Error.html +//! [`String`]: https://doc.rust-lang.org/alloc/string/struct.String.html +//! [`Vec`]: https://doc.rust-lang.org/alloc/vec/struct.Vec.html + +#![no_std] +#![doc(html_root_url = "https://docs.rs/roe/0.0.1")] + #[cfg(any(feature = "alloc", test))] extern crate alloc; #[cfg(feature = "std")] extern crate std; +// Ensure code blocks in README.md compile +#[cfg(doctest)] +macro_rules! readme { + ($x:expr) => { + #[doc = $x] + mod readme {} + }; + () => { + readme!(include_str!("../README.md")); + }; +} +#[cfg(all(feature = "alloc", doctest))] +readme!(); + use core::convert::{TryFrom, TryInto}; use core::fmt; use core::str::FromStr; @@ -26,49 +108,68 @@ pub use ascii::{to_ascii_lowercase, to_ascii_titlecase, to_ascii_uppercase}; pub use lowercase::Lowercase; pub use uppercase::Uppercase; +/// Error that indicates a failure to parse a [`LowercaseMode`] or +/// [`UppercaseMode`]. +/// +/// This error corresponds to the [Ruby `ArgumentError` Exception class]. +/// +/// # Examples +/// +/// ``` +/// # use core::convert::TryInto; +/// # use roe::{InvalidCaseMappingMode, LowercaseMode}; +/// let err = InvalidCaseMappingMode::new(); +/// assert_eq!(err.message(), "invalid option"); +/// +/// let mode: Result = "full".try_into(); +/// ``` +/// +/// [Ruby `ArgumentError` Exception class]: https://ruby-doc.org/core-2.6.3/ArgumentError.html #[derive(Default, Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] -pub struct InvalidLowercaseMode { +pub struct InvalidCaseMappingMode { _private: (), } -impl InvalidLowercaseMode { - /// Construct a new `InvalidLowercaseMode` error. +impl InvalidCaseMappingMode { + /// Construct a new `InvalidCaseMappingMode` error. 
/// /// # Examples /// /// ``` - /// # use roe::InvalidLowercaseMode; - /// const ERR: InvalidLowercaseMode = InvalidLowercaseMode::new(); + /// # use roe::InvalidCaseMappingMode; + /// const ERR: InvalidCaseMappingMode = InvalidCaseMappingMode::new(); /// assert_eq!(ERR.message(), "invalid option"); /// ``` + #[must_use] pub const fn new() -> Self { Self { _private: () } } - /// Retrieve the error message associated with this `InvalidLowercaseMode`. + /// Retrieve the error message associated with this `InvalidCaseMappingMode`. /// /// # Examples /// /// ``` - /// # use roe::InvalidLowercaseMode; - /// const MESSAGE: &str = InvalidLowercaseMode::new().message(); + /// # use roe::InvalidCaseMappingMode; + /// const MESSAGE: &str = InvalidCaseMappingMode::new().message(); /// assert_eq!(MESSAGE, "invalid option"); /// ``` + #[must_use] #[allow(clippy::clippy::unused_self)] pub const fn message(self) -> &'static str { "invalid option" } } -impl fmt::Display for InvalidLowercaseMode { +impl fmt::Display for InvalidCaseMappingMode { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - const MESSAGE: &str = InvalidLowercaseMode::new().message(); + const MESSAGE: &str = InvalidCaseMappingMode::new().message(); f.write_str(MESSAGE) } } #[cfg(feature = "std")] -impl std::error::Error for InvalidLowercaseMode {} +impl std::error::Error for InvalidCaseMappingMode {} /// Options to configure the behavior of [`lowercase`]. /// @@ -77,6 +178,9 @@ impl std::error::Error for InvalidLowercaseMode {} /// /// See individual variants for a description of the available behaviors. /// +/// If you're not sure which mode to choose, [`LowercaseMode::Full`] is a good +/// default.
+/// /// [`lowercase`]: crate::lowercase() #[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] pub enum LowercaseMode { @@ -122,17 +226,43 @@ impl Default for LowercaseMode { } } +impl TryFrom<&str> for LowercaseMode { + type Error = InvalidCaseMappingMode; + + #[inline] + fn try_from(value: &str) -> Result { + value.as_bytes().try_into() + } +} + impl TryFrom> for LowercaseMode { - type Error = InvalidLowercaseMode; + type Error = InvalidCaseMappingMode; + #[inline] fn try_from(value: Option<&str>) -> Result { value.map(str::as_bytes).try_into() } } +impl TryFrom<&[u8]> for LowercaseMode { + type Error = InvalidCaseMappingMode; + + #[inline] + fn try_from(value: &[u8]) -> Result { + match value { + b"ascii" => Ok(Self::Ascii), + b"turkic" => Ok(Self::Turkic), + b"lithuanian" => Ok(Self::Lithuanian), + b"fold" => Ok(Self::Fold), + _ => Err(InvalidCaseMappingMode::new()), + } + } +} + impl TryFrom> for LowercaseMode { - type Error = InvalidLowercaseMode; + type Error = InvalidCaseMappingMode; + #[inline] fn try_from(value: Option<&[u8]>) -> Result { match value { None => Ok(Self::Full), @@ -140,43 +270,178 @@ impl TryFrom> for LowercaseMode { Some(b"turkic") => Ok(Self::Turkic), Some(b"lithuanian") => Ok(Self::Lithuanian), Some(b"fold") => Ok(Self::Fold), - Some(_) => Err(InvalidLowercaseMode::new()), + Some(_) => Err(InvalidCaseMappingMode::new()), } } } impl FromStr for LowercaseMode { - type Err = InvalidLowercaseMode; + type Err = InvalidCaseMappingMode; + #[inline] fn from_str(s: &str) -> Result { - Some(s).try_into() + s.try_into() } } -// Returns an iterator that yields a copy of the bytes in the given slice with -// all uppercase letters replaced with their lowercase counterparts. -// -// This function treats the given slice as a conventionally UTF-8 string. UTF-8 -// byte sequences are converted to their Unicode lowercase equivalents. Invalid -// UTF-8 byte sequences are yielded as is. 
-// -// The case mapping mode is determined by the given [`LowercaseMode`]. See its -// documentation for details on the available case mapping modes. -pub const fn lowercase(slice: &[u8], options: LowercaseMode) -> Lowercase<'_> { +/// Returns an iterator that yields a copy of the bytes in the given slice with +/// all uppercase letters replaced with their lowercase counterparts. +/// +/// This function treats the given slice as a [conventionally UTF-8 string]. +/// UTF-8 byte sequences are converted to their Unicode lowercase equivalents. +/// Invalid UTF-8 byte sequences are yielded as is. +/// +/// The case mapping mode is determined by the given [`LowercaseMode`]. See its +/// documentation for details on the available case mapping modes. +/// +/// # Panics +/// +/// Not all [`LowercaseMode`]s are currently implemented. This function will +/// panic if the caller supplies [Turkic] or [case folding] lowercasing mode. +/// +/// [conventionally UTF-8 string]: https://docs.rs/bstr/0.2.*/bstr/#when-should-i-use-byte-strings +/// [Turkic]: LowercaseMode::Turkic +/// [case folding]: LowercaseMode::Fold +// TODO: make this const once we're no longer panicking. +pub fn lowercase(slice: &[u8], options: LowercaseMode) -> Lowercase<'_> { match options { LowercaseMode::Full | LowercaseMode::Lithuanian => Lowercase::with_slice(slice), LowercaseMode::Ascii => Lowercase::with_ascii_slice(slice), // TODO: implement `turkic` and `fold` modes. - LowercaseMode::Turkic | LowercaseMode::Fold => Lowercase::new(), + LowercaseMode::Turkic => panic!("lowercase Turkic mode is not yet implemented"), + LowercaseMode::Fold => panic!("lowercase case folding mode is not yet implemented"), } } -// Returns an iterator that yields a copy of the bytes in the given slice with -// all lowercase letters replaced with their uppercase counterparts. -// -// This function treats the given slice as a conventionally UTF-8 string. 
UTF-8 -// byte sequences are converted to their Unicode uppercase equivalents. Invalid -// UTF-8 byte sequences are yielded as is. -pub const fn uppercase(slice: &[u8]) -> Uppercase<'_> { - Uppercase::with_slice(slice) +/// Options to configure the behavior of [`uppercase`]. +/// +/// Which letters exactly are replaced, and by which other letters, depends on +/// the given options. +/// +/// See individual variants for a description of the available behaviors. +/// +/// If you're not sure which mode to choose, [`UppercaseMode::Full`] is a good +/// default. +/// +/// [`uppercase`]: crate::uppercase() +#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] +pub enum UppercaseMode { + /// Full Unicode case mapping, suitable for most languages. + /// + /// See the [Turkic] and [Lithuanian] variants for exceptions. + /// + /// Context-dependent case mapping as described in Table 3-14 of the Unicode + /// standard is currently not supported. + /// + /// [Turkic]: Self::Turkic + /// [Lithuanian]: Self::Lithuanian + Full, + /// Only the ASCII region, i.e. the characters `'A'..='Z'` and `'a'..='z'`, + /// are affected. + /// + /// This option cannot be combined with any other option. + Ascii, + /// Full Unicode case mapping, adapted for Turkic languages (Turkish, + /// Azerbaijani, …). + /// + /// This means that upper case I is mapped to lower case dotless i, and so + /// on. + Turkic, + /// Currently, just [full Unicode case mapping]. + /// + /// In the future, full Unicode case mapping adapted for Lithuanian (keeping + /// the dot on the lower case i even if there is an accent on top).
+ /// + /// [full Unicode case mapping]: Self::Full + Lithuanian, +} + +impl Default for UppercaseMode { + fn default() -> Self { + Self::Full + } +} + +impl TryFrom<&str> for UppercaseMode { + type Error = InvalidCaseMappingMode; + + #[inline] + fn try_from(value: &str) -> Result { + value.as_bytes().try_into() + } +} + +impl TryFrom> for UppercaseMode { + type Error = InvalidCaseMappingMode; + + #[inline] + fn try_from(value: Option<&str>) -> Result { + value.map(str::as_bytes).try_into() + } +} + +impl TryFrom<&[u8]> for UppercaseMode { + type Error = InvalidCaseMappingMode; + + #[inline] + fn try_from(value: &[u8]) -> Result { + match value { + b"ascii" => Ok(Self::Ascii), + b"turkic" => Ok(Self::Turkic), + b"lithuanian" => Ok(Self::Lithuanian), + _ => Err(InvalidCaseMappingMode::new()), + } + } +} + +impl TryFrom> for UppercaseMode { + type Error = InvalidCaseMappingMode; + + #[inline] + fn try_from(value: Option<&[u8]>) -> Result { + match value { + None => Ok(Self::Full), + Some(b"ascii") => Ok(Self::Ascii), + Some(b"turkic") => Ok(Self::Turkic), + Some(b"lithuanian") => Ok(Self::Lithuanian), + Some(_) => Err(InvalidCaseMappingMode::new()), + } + } +} + +impl FromStr for UppercaseMode { + type Err = InvalidCaseMappingMode; + + #[inline] + fn from_str(s: &str) -> Result { + s.try_into() + } +} + +/// Returns an iterator that yields a copy of the bytes in the given slice with +/// all lowercase letters replaced with their uppercase counterparts. +/// +/// This function treats the given slice as a [conventionally UTF-8 string]. +/// UTF-8 byte sequences are converted to their Unicode uppercase equivalents. +/// Invalid UTF-8 byte sequences are yielded as is. +/// +/// The case mapping mode is determined by the given [`UppercaseMode`]. See its +/// documentation for details on the available case mapping modes. +/// +/// # Panics +/// +/// Not all [`UppercaseMode`]s are currently implemented. 
This function will +/// panic if the caller supplies [Turkic] uppercasing mode. +/// +/// [conventionally UTF-8 string]: https://docs.rs/bstr/0.2.*/bstr/#when-should-i-use-byte-strings +/// [Turkic]: UppercaseMode::Turkic +/// [case folding]: LowercaseMode::Fold +// TODO: make this const once we're no longer panicking. +pub fn uppercase(slice: &[u8], options: UppercaseMode) -> Uppercase<'_> { + match options { + UppercaseMode::Full | UppercaseMode::Lithuanian => Uppercase::with_slice(slice), + UppercaseMode::Ascii => Uppercase::with_ascii_slice(slice), + // TODO: implement `turkic` mode. + UppercaseMode::Turkic => panic!("uppercase Turkic mode is not yet implemented"), + } } diff --git a/src/lowercase.rs b/src/lowercase.rs index 791ae709f..e87b43820 100644 --- a/src/lowercase.rs +++ b/src/lowercase.rs @@ -4,6 +4,7 @@ mod ascii; mod full; #[derive(Debug, Clone)] +#[allow(variant_size_differences)] enum Inner<'a> { Empty, Full(full::Lowercase<'a>), diff --git a/src/uppercase.rs b/src/uppercase.rs index 4ea5a7342..ec67fb293 100644 --- a/src/uppercase.rs +++ b/src/uppercase.rs @@ -1,9 +1,15 @@ -use core::char::ToUppercase; -use core::fmt; use core::iter::FusedIterator; -use core::ops::Range; -use bstr::ByteSlice; +mod ascii; +mod full; + +#[derive(Debug, Clone)] +#[allow(variant_size_differences)] +enum Inner<'a> { + Empty, + Full(full::Uppercase<'a>), + Ascii(ascii::Uppercase<'a>), +} /// An iterator that yields the uppercase equivalent of a conventionally UTF-8 /// byte string.
@@ -15,36 +21,10 @@ use bstr::ByteSlice; /// /// [bytes]: u8 /// [`uppercase`]: crate::uppercase() -#[derive(Clone)] +#[derive(Debug, Clone)] #[must_use = "Uppercase is a Iterator and must be used"] pub struct Uppercase<'a> { - slice: &'a [u8], - next_bytes: [u8; 4], - next_range: Range, - uppercase: Option, -} - -impl<'a> fmt::Debug for Uppercase<'a> { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - f.debug_struct("Uppercase") - .field("slice", &self.slice.as_bstr()) - .field("next_bytes", &self.next_bytes) - .field("next_range", &self.next_range) - .field("uppercase", &self.uppercase) - .finish() - } -} - -impl<'a> Default for Uppercase<'a> { - fn default() -> Self { - Self::new() - } -} - -impl<'a> From<&'a [u8]> for Uppercase<'a> { - fn from(slice: &'a [u8]) -> Self { - Self::with_slice(slice) - } + iter: Inner<'a>, } impl<'a> Uppercase<'a> { @@ -58,15 +38,11 @@ impl<'a> Uppercase<'a> { /// assert_eq!(uppercase.next(), None); /// ``` pub const fn new() -> Self { - Self { - slice: &[], - next_bytes: [0; 4], - next_range: 0..0, - uppercase: None, - } + Self { iter: Inner::Empty } } - /// Create a new uppercase iterator with the given byte slice. + /// Create a new uppercase iterator with the given byte slice using full + /// Unicode case mapping. 
/// /// # Examples /// @@ -81,12 +57,69 @@ impl<'a> Uppercase<'a> { /// assert_eq!(uppercase.next(), Some(b'Z')); /// assert_eq!(uppercase.next(), None); /// ``` + /// + /// Non-ASCII characters are case mapped: + /// + /// ``` + /// # use roe::Uppercase; + /// let uppercase = Uppercase::with_slice("Αύριο".as_bytes()); + /// assert_eq!(uppercase.collect::>(), "ΑΎΡΙΟ".as_bytes()); + /// ``` + /// + /// Invalid UTF-8 bytes are yielded as is without impacting Unicode + /// characters: + /// + /// ``` + /// # use roe::Uppercase; + /// let mut s = "Αύριο".to_string().into_bytes(); + /// s.extend(b"\xFF\xFE"); + /// let uppercase = Uppercase::with_slice(s.as_slice()); + /// + /// let mut expected = "ΑΎΡΙΟ".to_string().into_bytes(); + /// expected.extend(b"\xFF\xFE"); + /// assert_eq!(uppercase.collect::>(), expected); + /// ``` pub const fn with_slice(slice: &'a [u8]) -> Self { Self { - slice, - next_bytes: [0; 4], - next_range: 0..0, - uppercase: None, + iter: Inner::Full(full::Uppercase::with_slice(slice)), + } + } + + /// Create a new uppercase iterator with the given byte slice using ASCII + /// case mapping. 
+ /// + /// # Examples + /// + /// ``` + /// # use roe::Uppercase; + /// let mut uppercase = Uppercase::with_ascii_slice(b"abcXYZ"); + /// assert_eq!(uppercase.next(), Some(b'A')); + /// assert_eq!(uppercase.next(), Some(b'B')); + /// assert_eq!(uppercase.next(), Some(b'C')); + /// assert_eq!(uppercase.next(), Some(b'X')); + /// assert_eq!(uppercase.next(), Some(b'Y')); + /// assert_eq!(uppercase.next(), Some(b'Z')); + /// assert_eq!(uppercase.next(), None); + /// ``` + /// + /// Non-ASCII characters are ignored: + /// + /// ``` + /// # use roe::Uppercase; + /// let uppercase = Uppercase::with_ascii_slice("Αύριο".as_bytes()); + /// assert_eq!(uppercase.collect::>(), "Αύριο".as_bytes()); + /// ``` + /// + /// Invalid UTF-8 bytes are yielded as is without impacting ASCII bytes: + /// + /// ``` + /// # use roe::Uppercase; + /// let uppercase = Uppercase::with_ascii_slice(b"abc\xFF\xFEXYZ"); + /// assert_eq!(uppercase.collect::>(), b"ABC\xFF\xFEXYZ"); + /// ``` + pub const fn with_ascii_slice(slice: &'a [u8]) -> Self { + Self { + iter: Inner::Ascii(ascii::Uppercase::with_slice(slice)), } } } @@ -95,50 +128,26 @@ impl<'a> Iterator for Uppercase<'a> { type Item = u8; fn next(&mut self) -> Option { - if let Some(idx) = self.next_range.next() { - debug_assert!(self.next_bytes.get(idx).is_some()); - - return Some(self.next_bytes[idx]); + match self.iter { + Inner::Empty => None, + Inner::Full(ref mut iter) => iter.next(), + Inner::Ascii(ref mut iter) => iter.next(), } + } - if let Some(ch) = self.uppercase.as_mut().and_then(Iterator::next) { - let enc = ch.encode_utf8(&mut self.next_bytes); - - self.next_range = 1..enc.len(); - debug_assert!(self.next_bytes.get(self.next_range.clone()).is_some()); - - return Some(self.next_bytes[0]); + fn size_hint(&self) -> (usize, Option) { + match self.iter { + Inner::Empty => (0, Some(0)), + Inner::Full(ref iter) => iter.size_hint(), + Inner::Ascii(ref iter) => iter.size_hint(), } + } - self.uppercase = None; - - match 
bstr::decode_utf8(self.slice) { - (_, 0) => None, - (Some(ch), size) => { - self.slice = &self.slice[size..]; - let mut uppercase = ch.to_uppercase(); - let ch = uppercase - .next() - .expect("ToUppercase yields at least one char"); - let enc = ch.encode_utf8(&mut self.next_bytes); - - self.next_range = 1..enc.len(); - debug_assert!(self.next_bytes.get(self.next_range.clone()).is_some()); - - self.uppercase = Some(uppercase); - Some(self.next_bytes[0]) - } - (None, size) => { - let (bytes, remainder) = self.slice.split_at(size); - self.slice = remainder; - - // Invalid byte sequences are at most three bytes. - debug_assert!(self.next_bytes.get(..bytes.len()).is_some()); - - self.next_bytes[..bytes.len()].copy_from_slice(bytes); - self.next_range = 1..bytes.len(); - Some(self.next_bytes[0]) - } + fn count(self) -> usize { + match self.iter { + Inner::Empty => 0, + Inner::Full(iter) => iter.count(), + Inner::Ascii(iter) => iter.count(), } } } @@ -154,175 +163,111 @@ mod tests { #[test] fn empty() { - let iter = Uppercase::from(&b""[..]); + let iter = Uppercase::new(); assert_eq!(iter.collect::>().as_bstr(), b"".as_bstr()); - } - #[test] - fn ascii() { - let iter = Uppercase::from(&b"abc"[..]); - assert_eq!(iter.collect::>().as_bstr(), b"ABC".as_bstr()); - - let iter = Uppercase::from(&b"aBC"[..]); - assert_eq!(iter.collect::>().as_bstr(), b"ABC".as_bstr()); - - let iter = Uppercase::from(&b"ABC"[..]); - assert_eq!(iter.collect::>().as_bstr(), b"ABC".as_bstr()); + let iter = Uppercase::with_slice(b""); + assert_eq!(iter.collect::>().as_bstr(), b"".as_bstr()); - let iter = Uppercase::from(&b"aBC, 123, ABC, baby you and me girl"[..]); - assert_eq!( - iter.collect::>().as_bstr(), - b"ABC, 123, ABC, BABY YOU AND ME GIRL".as_bstr() - ); + let iter = Uppercase::with_ascii_slice(b""); + assert_eq!(iter.collect::>().as_bstr(), b"".as_bstr()); } #[test] - fn utf8() { - let s = "ß".as_bytes(); - let iter = Uppercase::from(s); - assert_eq!( - iter.collect::>().as_bstr(), - 
"SS".as_bytes().as_bstr() - ); + fn size_hint() { + assert_eq!(Uppercase::new().size_hint(), (0, Some(0))); - let s = "Αύριο".as_bytes(); - let iter = Uppercase::from(s); + assert_eq!(Uppercase::with_slice(b"abc, xyz").size_hint(), (8, Some(8))); assert_eq!( - iter.collect::>().as_bstr(), - "ΑΎΡΙΟ".as_bytes().as_bstr() + Uppercase::with_slice(b"abc, \xFF\xFE, xyz").size_hint(), + (12, Some(144)) ); - - let s = "Έτος".as_bytes(); - let iter = Uppercase::from(s); assert_eq!( - iter.collect::>().as_bstr(), - "ΈΤΟΣ".as_bytes().as_bstr() + Uppercase::with_slice("�".as_bytes()).size_hint(), + (3, Some(36)) ); - - // two-byte characters - // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L198-L200 - let s = "𐑄 𐐼𐐯𐑅𐐨𐑉𐐯𐐻 𐑁𐐲𐑉𐑅𐐻/𐑅𐐯𐐿𐐲𐑌𐐼 𐐺𐐳𐐿 𐐺𐐴 𐑄 𐑉𐐨𐐾𐐯𐑌𐐻𐑅 𐐱𐑂 𐑄 𐐼𐐯𐑅𐐨𐑉𐐯𐐻 𐐷𐐮𐐭𐑌𐐮𐑂𐐲𐑉𐑅𐐮𐐻𐐮".as_bytes(); - let iter = Uppercase::from(s); assert_eq!( - iter.collect::>().as_bstr(), - "𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐙𐐊𐐡𐐝𐐓/𐐝𐐇𐐗𐐊𐐤𐐔 𐐒𐐋𐐗 𐐒𐐌 𐐜 𐐡𐐀𐐖𐐇𐐤𐐓𐐝 𐐉𐐚 𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐏𐐆𐐅𐐤𐐆𐐚𐐊𐐡𐐝𐐆𐐓𐐆" - .as_bytes() - .as_bstr() + Uppercase::with_slice("Έτος".as_bytes()).size_hint(), + (8, Some(96)) ); - - // Change length when uppercased - // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L226-L232 - let s = "zⱥⱦ".as_bytes(); - let iter = Uppercase::from(s); assert_eq!( - iter.collect::>().as_bstr(), - "ZȺȾ".as_bytes().as_bstr() + Uppercase::with_slice("ZȺȾ".as_bytes()).size_hint(), + (5, Some(60)) ); - } - #[test] - fn invalid_utf8() { - let iter = Uppercase::from(&b"\xFF\xFE"[..]); - assert_eq!(iter.collect::>().as_bstr(), b"\xFF\xFE".as_bstr()); - - let iter = Uppercase::from(&b"abc\xFF\xFExyz"[..]); - assert_eq!( - iter.collect::>().as_bstr(), - b"ABC\xFF\xFEXYZ".as_bstr() - ); - - let iter = Uppercase::from(&b"abc\xFF\xFEXYZ"[..]); + let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec(); + utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes()); assert_eq!( - iter.collect::>().as_bstr(), - b"ABC\xFF\xFEXYZ".as_bstr() + 
Uppercase::with_slice(&utf8_with_invalid_bytes).size_hint(), + (10, Some(120)) ); - // The bytes \xF0\x9F\x87 could lead to a valid UTF-8 sequence, but 3 of - // them on their own are invalid. Only one replacement codepoint is - // substituted, which demonstrates the "substitution of maximal - // subparts" strategy. - // - // See: https://docs.rs/bstr/0.2.*/bstr/#handling-of-invalid-utf-8 - let iter = Uppercase::from(&b"aB\xF0\x9F\x87Yz"[..]); assert_eq!( - iter.collect::>().as_bstr(), - b"AB\xF0\x9F\x87YZ".as_bstr() + Uppercase::with_ascii_slice(b"abc, xyz").size_hint(), + (8, Some(8)) ); - } - - #[test] - fn unicode_replacement_character() { - let s = "�".as_bytes(); - let iter = Uppercase::from(s); - assert_eq!(iter.collect::>().as_bstr(), "�".as_bytes().as_bstr()); - } - - #[test] - fn dz_titlecase() { - let s = "Dž".as_bytes(); - let iter = Uppercase::from(s); - assert_eq!(iter.collect::>().as_bstr(), "DŽ".as_bytes().as_bstr()); - } - - #[test] - fn latin_small_i_with_dot_above() { - let s = "i̇".as_bytes(); - let iter = Uppercase::from(s); assert_eq!( - iter.collect::>().as_bstr(), - [73_u8, 204, 135].as_bstr() + Uppercase::with_ascii_slice(b"abc, \xFF\xFE, xyz").size_hint(), + (12, Some(12)) ); - } - - #[test] - fn case_map_to_two_chars() { - let s = "և".as_bytes(); - let iter = Uppercase::from(s); assert_eq!( - iter.collect::>().as_bstr(), - "ԵՒ".as_bytes().as_bstr() + Uppercase::with_ascii_slice("�".as_bytes()).size_hint(), + (3, Some(3)) ); - - let s = "ẙ".as_bytes(); - let iter = Uppercase::from(s); assert_eq!( - iter.collect::>().as_bstr(), - "Y\u{30a}".as_bytes().as_bstr() + Uppercase::with_ascii_slice("Έτος".as_bytes()).size_hint(), + (8, Some(8)) ); - - let s = "ᾂ".as_bytes(); - let iter = Uppercase::from(s); assert_eq!( - iter.collect::>().as_bstr(), - "ἊΙ".as_bytes().as_bstr() + Uppercase::with_ascii_slice("ZȺȾ".as_bytes()).size_hint(), + (5, Some(5)) ); - let s = "ﬗ".as_bytes(); - let iter = Uppercase::from(s); + let mut utf8_with_invalid_bytes = 
b"\xFF\xFE".to_vec(); + utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes()); assert_eq!( - iter.collect::>().as_bstr(), - "ՄԽ".as_bytes().as_bstr() + Uppercase::with_ascii_slice(&utf8_with_invalid_bytes).size_hint(), + (10, Some(10)) ); } #[test] - fn case_map_to_three_chars() { - let s = "ffi".as_bytes(); - let iter = Uppercase::from(s); - assert_eq!(iter.collect::>().as_bstr(), b"FFI".as_bstr()); + fn count() { + assert_eq!(Uppercase::new().count(), 0); + + assert_eq!(Uppercase::with_slice(b"abc, xyz").count(), 8); + assert_eq!(Uppercase::with_slice(b"abc, \xFF\xFE, xyz").count(), 12); + assert_eq!(Uppercase::with_slice("�".as_bytes()).count(), 3); + assert_eq!(Uppercase::with_slice("Έτος".as_bytes()).count(), 8); + assert_eq!(Uppercase::with_slice("zⱥⱦ".as_bytes()).count(), 5); + + let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec(); + utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes()); + assert_eq!(Uppercase::with_slice(&utf8_with_invalid_bytes).count(), 10); - let s = "ὖ".as_bytes(); - let iter = Uppercase::from(s); + assert_eq!(Uppercase::with_ascii_slice(b"abc, xyz").count(), 8); assert_eq!( - iter.collect::>().as_bstr(), - "Υ\u{313}\u{342}".as_bytes().as_bstr() + Uppercase::with_ascii_slice(b"abc, \xFF\xFE, xyz").count(), + 12 ); + assert_eq!(Uppercase::with_ascii_slice("�".as_bytes()).count(), 3); + assert_eq!(Uppercase::with_ascii_slice("Έτος".as_bytes()).count(), 8); + assert_eq!(Uppercase::with_ascii_slice("ZȺȾ".as_bytes()).count(), 5); - let s = "ῷ".as_bytes(); - let iter = Uppercase::from(s); + let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec(); + utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes()); assert_eq!( - iter.collect::>().as_bstr(), - "Ω\u{342}Ι".as_bytes().as_bstr() + Uppercase::with_ascii_slice(&utf8_with_invalid_bytes).count(), + 10 ); } + + #[test] + fn size_hint_covers_count() { + let iter = Uppercase::new(); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + 
assert!(count <= max.unwrap()); + } } diff --git a/src/uppercase/ascii.rs b/src/uppercase/ascii.rs new file mode 100644 index 000000000..fb3e3018a --- /dev/null +++ b/src/uppercase/ascii.rs @@ -0,0 +1,287 @@ +use core::fmt; +use core::iter::FusedIterator; + +use bstr::ByteSlice; + +#[derive(Clone)] +#[must_use = "Uppercase is a Iterator and must be used"] +pub struct Uppercase<'a> { + slice: &'a [u8], +} + +impl<'a> fmt::Debug for Uppercase<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Uppercase") + .field("slice", &self.slice.as_bstr()) + .finish() + } +} + +impl<'a> From<&'a [u8]> for Uppercase<'a> { + fn from(slice: &'a [u8]) -> Self { + Self::with_slice(slice) + } +} + +impl<'a> Uppercase<'a> { + pub const fn with_slice(slice: &'a [u8]) -> Self { + Self { slice } + } +} + +impl<'a> Iterator for Uppercase<'a> { + type Item = u8; + + fn next(&mut self) -> Option { + let (&byte, remainder) = self.slice.split_first()?; + self.slice = remainder; + Some(byte.to_ascii_uppercase()) + } + + fn size_hint(&self) -> (usize, Option) { + let len = self.slice.len(); + (len, Some(len)) + } + + fn count(self) -> usize { + self.slice.len() + } +} + +impl<'a> DoubleEndedIterator for Uppercase<'a> { + fn next_back(&mut self) -> Option { + let (&byte, remainder) = self.slice.split_last()?; + self.slice = remainder; + Some(byte.to_ascii_uppercase()) + } +} + +impl<'a> ExactSizeIterator for Uppercase<'a> {} + +impl<'a> FusedIterator for Uppercase<'a> {} + +#[cfg(test)] +mod tests { + use alloc::vec::Vec; + use bstr::ByteSlice; + + use super::Uppercase; + + #[test] + fn empty() { + let iter = Uppercase::from(&b""[..]); + assert_eq!(iter.collect::>().as_bstr(), b"".as_bstr()); + } + + #[test] + fn ascii() { + let iter = Uppercase::from(&b"abc"[..]); + assert_eq!(iter.collect::>().as_bstr(), b"ABC".as_bstr()); + + let iter = Uppercase::from(&b"aBC"[..]); + assert_eq!(iter.collect::>().as_bstr(), b"ABC".as_bstr()); + + let iter = 
Uppercase::from(&b"ABC"[..]); + assert_eq!(iter.collect::>().as_bstr(), b"ABC".as_bstr()); + + let iter = Uppercase::from(&b"aBC, 123, ABC, baby you and me girl"[..]); + assert_eq!( + iter.collect::>().as_bstr(), + b"ABC, 123, ABC, BABY YOU AND ME GIRL".as_bstr() + ); + } + + // ignore unicode for ASCII iterator + #[test] + fn utf8() { + let s = "ß".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!(iter.collect::>().as_bstr(), "ß".as_bytes().as_bstr()); + + let s = "Αύριο".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "Αύριο".as_bytes().as_bstr() + ); + + let s = "Έτος".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "Έτος".as_bytes().as_bstr() + ); + + // two-byte characters + // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L198-L200 + let s = "𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐙𐐊𐐡𐐝𐐓/𐐝𐐇𐐗𐐊𐐤𐐔 𐐒𐐋𐐗 𐐒𐐌 𐐜 𐐡𐐀𐐖𐐇𐐤𐐓𐐝 𐐱𐑂 𐑄 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐏𐐆𐐅𐐤𐐆𐐚𐐊𐐡𐐝𐐆𐐓𐐆".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐙𐐊𐐡𐐝𐐓/𐐝𐐇𐐗𐐊𐐤𐐔 𐐒𐐋𐐗 𐐒𐐌 𐐜 𐐡𐐀𐐖𐐇𐐤𐐓𐐝 𐐱𐑂 𐑄 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐏𐐆𐐅𐐤𐐆𐐚𐐊𐐡𐐝𐐆𐐓𐐆" + .as_bytes() + .as_bstr() + ); + + // Change length when lowercased + // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L226-L232 + let s = "zȺȾ".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "ZȺȾ".as_bytes().as_bstr() + ); + } + + #[test] + fn invalid_utf8() { + let iter = Uppercase::from(&b"\xFF\xFE"[..]); + assert_eq!(iter.collect::>().as_bstr(), b"\xFF\xFE".as_bstr()); + + let iter = Uppercase::from(&b"ABC\xFF\xFEXYZ"[..]); + assert_eq!( + iter.collect::>().as_bstr(), + b"ABC\xFF\xFEXYZ".as_bstr() + ); + + let iter = Uppercase::from(&b"abc\xFF\xFEXYZ"[..]); + assert_eq!( + iter.collect::>().as_bstr(), + b"ABC\xFF\xFEXYZ".as_bstr() + ); + + // The bytes \xF0\x9F\x87 could lead to a valid UTF-8 sequence, but 3 of + // them on their own are invalid. 
Only one replacement codepoint is + // substituted, which demonstrates the "substitution of maximal + // subparts" strategy. + // + // See: https://docs.rs/bstr/0.2.*/bstr/#handling-of-invalid-utf-8 + let iter = Uppercase::from(&b"aB\xF0\x9F\x87Yz"[..]); + assert_eq!( + iter.collect::>().as_bstr(), + b"AB\xF0\x9F\x87YZ".as_bstr() + ); + } + + // ignore unicode for ASCII iterator + #[test] + fn unicode_replacement_character() { + let s = "�".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!(iter.collect::>().as_bstr(), "�".as_bytes().as_bstr()); + } + + // ignore unicode for ASCII iterator + #[test] + fn dz_titlecase() { + let s = "Dž".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!(iter.collect::>().as_bstr(), "Dž".as_bytes().as_bstr()); + } + + // ignore unicode for ASCII iterator + #[test] + fn latin_capital_i_with_dot_above() { + let s = "İ".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!(iter.collect::>().as_bstr(), "İ".as_bytes().as_bstr()); + } + + // ignore unicode for ASCII iterator + #[test] + fn case_map_to_two_chars() { + let s = "İ".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!(iter.collect::>().as_bstr(), "İ".as_bytes().as_bstr()); + } + + #[test] + fn size_hint() { + assert_eq!(Uppercase::with_slice(b"").size_hint(), (0, Some(0))); + assert_eq!(Uppercase::with_slice(b"abc, xyz").size_hint(), (8, Some(8))); + assert_eq!( + Uppercase::with_slice(b"abc, \xFF\xFE, xyz").size_hint(), + (12, Some(12)) + ); + assert_eq!( + Uppercase::with_slice("�".as_bytes()).size_hint(), + (3, Some(3)) + ); + assert_eq!( + Uppercase::with_slice("Έτος".as_bytes()).size_hint(), + (8, Some(8)) + ); + assert_eq!( + Uppercase::with_slice("ZȺȾ".as_bytes()).size_hint(), + (5, Some(5)) + ); + + let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec(); + utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes()); + assert_eq!( + Uppercase::with_slice(&utf8_with_invalid_bytes).size_hint(), + (10, Some(10)) + ); + } + + #[test] + fn count() { + 
assert_eq!(Uppercase::with_slice(b"").count(), 0); + assert_eq!(Uppercase::with_slice(b"abc, xyz").count(), 8); + assert_eq!(Uppercase::with_slice(b"abc, \xFF\xFE, xyz").count(), 12); + assert_eq!(Uppercase::with_slice("�".as_bytes()).count(), 3); + assert_eq!(Uppercase::with_slice("Έτος".as_bytes()).count(), 8); + assert_eq!(Uppercase::with_slice("ZȺȾ".as_bytes()).count(), 5); + + let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec(); + utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes()); + assert_eq!(Uppercase::with_slice(&utf8_with_invalid_bytes).count(), 10); + } + + #[test] + fn size_hint_covers_count() { + let iter = Uppercase::with_slice(b""); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let iter = Uppercase::with_slice(b"abc, xyz"); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let iter = Uppercase::with_slice(b"abc, \xFF\xFE, xyz"); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let iter = Uppercase::with_slice("�".as_bytes()); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let iter = Uppercase::with_slice("Έτος".as_bytes()); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let iter = Uppercase::with_slice("ZȺȾ".as_bytes()); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec(); + utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes()); + let iter = Uppercase::with_slice(&utf8_with_invalid_bytes); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= 
max.unwrap()); + } +} diff --git a/src/uppercase/full.rs b/src/uppercase/full.rs new file mode 100644 index 000000000..bdf5e3a3c --- /dev/null +++ b/src/uppercase/full.rs @@ -0,0 +1,393 @@ +use core::char::ToUppercase; +use core::fmt; +use core::iter::FusedIterator; +use core::ops::Range; + +use bstr::ByteSlice; + +#[derive(Clone)] +#[must_use = "Uppercase is a Iterator and must be used"] +pub struct Uppercase<'a> { + slice: &'a [u8], + next_bytes: [u8; 4], + next_range: Range, + uppercase: Option, +} + +impl<'a> fmt::Debug for Uppercase<'a> { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Uppercase") + .field("slice", &self.slice.as_bstr()) + .field("next_bytes", &self.next_bytes) + .field("next_range", &self.next_range) + .field("uppercase", &self.uppercase) + .finish() + } +} + +impl<'a> From<&'a [u8]> for Uppercase<'a> { + fn from(slice: &'a [u8]) -> Self { + Self::with_slice(slice) + } +} + +impl<'a> Uppercase<'a> { + pub const fn with_slice(slice: &'a [u8]) -> Self { + Self { + slice, + next_bytes: [0; 4], + next_range: 0..0, + uppercase: None, + } + } +} + +impl<'a> Iterator for Uppercase<'a> { + type Item = u8; + + fn next(&mut self) -> Option { + if let Some(idx) = self.next_range.next() { + debug_assert!(self.next_bytes.get(idx).is_some()); + + return Some(self.next_bytes[idx]); + } + + if let Some(ch) = self.uppercase.as_mut().and_then(Iterator::next) { + let enc = ch.encode_utf8(&mut self.next_bytes); + + self.next_range = 1..enc.len(); + debug_assert!(self.next_bytes.get(self.next_range.clone()).is_some()); + + return Some(self.next_bytes[0]); + } + + self.uppercase = None; + + match bstr::decode_utf8(self.slice) { + (_, 0) => None, + (Some(ch), size) => { + self.slice = &self.slice[size..]; + let mut uppercase = ch.to_uppercase(); + let ch = uppercase + .next() + .expect("ToUppercase yields at least one char"); + let enc = ch.encode_utf8(&mut self.next_bytes); + + self.next_range = 1..enc.len(); + 
debug_assert!(self.next_bytes.get(self.next_range.clone()).is_some()); + + self.uppercase = Some(uppercase); + Some(self.next_bytes[0]) + } + (None, size) => { + let (bytes, remainder) = self.slice.split_at(size); + self.slice = remainder; + + // Invalid byte sequences are at most three bytes. + debug_assert!(self.next_bytes.get(..bytes.len()).is_some()); + + self.next_bytes[..bytes.len()].copy_from_slice(bytes); + self.next_range = 1..bytes.len(); + Some(self.next_bytes[0]) + } + } + } + + fn size_hint(&self) -> (usize, Option) { + const TO_UPPER_EXPAND: usize = 3; + const UTF_8_CHAR_MAX_BYTES: usize = 4; + if self.slice.is_empty() { + (0, Some(0)) + } else if self.slice.is_ascii() { + let len = self.slice.len(); + (len, Some(len)) + } else { + let len = self.slice.len(); + (len, Some(len * TO_UPPER_EXPAND * UTF_8_CHAR_MAX_BYTES)) + } + } + + fn count(self) -> usize { + if self.slice.is_empty() { + 0 + } else if self.slice.is_ascii() { + self.slice.len() + } else { + self.fold(0, |acc, _| acc + 1) + } + } +} + +impl<'a> FusedIterator for Uppercase<'a> {} + +#[cfg(test)] +mod tests { + use alloc::vec::Vec; + use bstr::ByteSlice; + + use super::Uppercase; + + #[test] + fn empty() { + let iter = Uppercase::from(&b""[..]); + assert_eq!(iter.collect::>().as_bstr(), b"".as_bstr()); + } + + #[test] + fn ascii() { + let iter = Uppercase::from(&b"abc"[..]); + assert_eq!(iter.collect::>().as_bstr(), b"ABC".as_bstr()); + + let iter = Uppercase::from(&b"aBC"[..]); + assert_eq!(iter.collect::>().as_bstr(), b"ABC".as_bstr()); + + let iter = Uppercase::from(&b"ABC"[..]); + assert_eq!(iter.collect::>().as_bstr(), b"ABC".as_bstr()); + + let iter = Uppercase::from(&b"aBC, 123, ABC, baby you and me girl"[..]); + assert_eq!( + iter.collect::>().as_bstr(), + b"ABC, 123, ABC, BABY YOU AND ME GIRL".as_bstr() + ); + } + + #[test] + fn utf8() { + let s = "ß".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "SS".as_bytes().as_bstr() + ); + + let s 
= "Αύριο".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "ΑΎΡΙΟ".as_bytes().as_bstr() + ); + + let s = "Έτος".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "ΈΤΟΣ".as_bytes().as_bstr() + ); + + // two-byte characters + // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L198-L200 + let s = "𐑄 𐐼𐐯𐑅𐐨𐑉𐐯𐐻 𐑁𐐲𐑉𐑅𐐻/𐑅𐐯𐐿𐐲𐑌𐐼 𐐺𐐳𐐿 𐐺𐐴 𐑄 𐑉𐐨𐐾𐐯𐑌𐐻𐑅 𐐱𐑂 𐑄 𐐼𐐯𐑅𐐨𐑉𐐯𐐻 𐐷𐐮𐐭𐑌𐐮𐑂𐐲𐑉𐑅𐐮𐐻𐐮".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐙𐐊𐐡𐐝𐐓/𐐝𐐇𐐗𐐊𐐤𐐔 𐐒𐐋𐐗 𐐒𐐌 𐐜 𐐡𐐀𐐖𐐇𐐤𐐓𐐝 𐐉𐐚 𐐜 𐐔𐐇𐐝𐐀𐐡𐐇𐐓 𐐏𐐆𐐅𐐤𐐆𐐚𐐊𐐡𐐝𐐆𐐓𐐆" + .as_bytes() + .as_bstr() + ); + + // Change length when uppercased + // https://github.com/minimaxir/big-list-of-naughty-strings/blob/894882e7/blns.txt#L226-L232 + let s = "zⱥⱦ".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "ZȺȾ".as_bytes().as_bstr() + ); + } + + #[test] + fn invalid_utf8() { + let iter = Uppercase::from(&b"\xFF\xFE"[..]); + assert_eq!(iter.collect::>().as_bstr(), b"\xFF\xFE".as_bstr()); + + let iter = Uppercase::from(&b"abc\xFF\xFExyz"[..]); + assert_eq!( + iter.collect::>().as_bstr(), + b"ABC\xFF\xFEXYZ".as_bstr() + ); + + let iter = Uppercase::from(&b"abc\xFF\xFEXYZ"[..]); + assert_eq!( + iter.collect::>().as_bstr(), + b"ABC\xFF\xFEXYZ".as_bstr() + ); + + // The bytes \xF0\x9F\x87 could lead to a valid UTF-8 sequence, but 3 of + // them on their own are invalid. Only one replacement codepoint is + // substituted, which demonstrates the "substitution of maximal + // subparts" strategy. 
+ // + // See: https://docs.rs/bstr/0.2.*/bstr/#handling-of-invalid-utf-8 + let iter = Uppercase::from(&b"aB\xF0\x9F\x87Yz"[..]); + assert_eq!( + iter.collect::>().as_bstr(), + b"AB\xF0\x9F\x87YZ".as_bstr() + ); + } + + #[test] + fn unicode_replacement_character() { + let s = "�".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!(iter.collect::>().as_bstr(), "�".as_bytes().as_bstr()); + } + + #[test] + fn dz_titlecase() { + let s = "Dž".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!(iter.collect::>().as_bstr(), "DŽ".as_bytes().as_bstr()); + } + + #[test] + fn latin_small_i_with_dot_above() { + let s = "i̇".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + [73_u8, 204, 135].as_bstr() + ); + } + + #[test] + fn case_map_to_two_chars() { + let s = "և".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "ԵՒ".as_bytes().as_bstr() + ); + + let s = "ẙ".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "Y\u{30a}".as_bytes().as_bstr() + ); + + let s = "ᾂ".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "ἊΙ".as_bytes().as_bstr() + ); + + let s = "ﬗ".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "ՄԽ".as_bytes().as_bstr() + ); + } + + #[test] + fn case_map_to_three_chars() { + let s = "ffi".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!(iter.collect::>().as_bstr(), b"FFI".as_bstr()); + + let s = "ὖ".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "Υ\u{313}\u{342}".as_bytes().as_bstr() + ); + + let s = "ῷ".as_bytes(); + let iter = Uppercase::from(s); + assert_eq!( + iter.collect::>().as_bstr(), + "Ω\u{342}Ι".as_bytes().as_bstr() + ); + } + + #[test] + fn size_hint() { + assert_eq!(Uppercase::with_slice(b"").size_hint(), (0, Some(0))); + assert_eq!(Uppercase::with_slice(b"abc, xyz").size_hint(), 
(8, Some(8))); + assert_eq!( + Uppercase::with_slice(b"abc, \xFF\xFE, xyz").size_hint(), + (12, Some(144)) + ); + assert_eq!( + Uppercase::with_slice("�".as_bytes()).size_hint(), + (3, Some(36)) + ); + assert_eq!( + Uppercase::with_slice("Έτος".as_bytes()).size_hint(), + (8, Some(96)) + ); + assert_eq!( + Uppercase::with_slice("ZȺȾ".as_bytes()).size_hint(), + (5, Some(60)) + ); + + let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec(); + utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes()); + assert_eq!( + Uppercase::with_slice(&utf8_with_invalid_bytes).size_hint(), + (10, Some(120)) + ); + } + + #[test] + fn count() { + assert_eq!(Uppercase::with_slice(b"").count(), 0); + assert_eq!(Uppercase::with_slice(b"abc, xyz").count(), 8); + assert_eq!(Uppercase::with_slice(b"abc, \xFF\xFE, xyz").count(), 12); + assert_eq!(Uppercase::with_slice("�".as_bytes()).count(), 3); + assert_eq!(Uppercase::with_slice("Έτος".as_bytes()).count(), 8); + assert_eq!(Uppercase::with_slice("zⱥⱦ".as_bytes()).count(), 5); + + let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec(); + utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes()); + assert_eq!(Uppercase::with_slice(&utf8_with_invalid_bytes).count(), 10); + } + + #[test] + fn size_hint_covers_count() { + let iter = Uppercase::with_slice(b""); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let iter = Uppercase::with_slice(b"abc, xyz"); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let iter = Uppercase::with_slice(b"abc, \xFF\xFE, xyz"); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let iter = Uppercase::with_slice("�".as_bytes()); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let iter = 
Uppercase::with_slice("Έτος".as_bytes()); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let iter = Uppercase::with_slice("ZȺȾ".as_bytes()); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + + let mut utf8_with_invalid_bytes = b"\xFF\xFE".to_vec(); + utf8_with_invalid_bytes.extend_from_slice("Έτος".as_bytes()); + let iter = Uppercase::with_slice(&utf8_with_invalid_bytes); + let (min, max) = iter.size_hint(); + let count = iter.count(); + assert!(min <= count); + assert!(count <= max.unwrap()); + } +}