Skip to content

Commit

Permalink
Merge pull request #139 from artichoke/choznerol/65-titlecase
Browse files Browse the repository at this point in the history
Implement titlecase iterator
  • Loading branch information
lopopolo authored Dec 6, 2023
2 parents fa47095 + cad0ef1 commit 7209421
Show file tree
Hide file tree
Showing 24 changed files with 39,172 additions and 31 deletions.
1 change: 1 addition & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
generated/*.rs linguist-generated=true
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ homepage = "https://github.com/artichoke/roe"
description = "Unicode case conversion"
keywords = ["capitalize", "case", "lowercase", "unicode", "uppercase"]
categories = ["encoding", "internationalization", "no-std", "no-std::no-alloc", "text-processing"]
include = ["src/**/*", "tests/**/*", "LICENSE", "README.md"]
include = ["src/**/*", "generated/**/*", "tests/**/*", "LICENSE", "README.md"]

[features]
default = ["std"]
Expand Down
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ roe = "0.0.5"
Then convert case like:

```rust
use roe::{LowercaseMode, UppercaseMode};
use roe::{LowercaseMode, UppercaseMode, TitlecaseMode};

assert_eq!(
roe::lowercase(b"Artichoke Ruby", LowercaseMode::Ascii).collect::<Vec<_>>(),
Expand All @@ -52,6 +52,10 @@ assert_eq!(
roe::uppercase("Αύριο".as_bytes(), UppercaseMode::Full).collect::<Vec<_>>(),
"ΑΎΡΙΟ".as_bytes()
);
assert_eq!(
roe::titlecase("ﬃ".as_bytes(), TitlecaseMode::Full).collect::<Vec<_>>(),
"Ffi".as_bytes()
);
```

## Crate Features
Expand Down
39 changes: 39 additions & 0 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ require 'open-uri'
require 'shellwords'
require 'bundler/audit/task'
require 'rubocop/rake_task'
require 'pathname'

task default: %i[format lint]

Expand Down Expand Up @@ -89,6 +90,44 @@ task :test do
sh 'cargo test --workspace'
end

namespace :unicode do
  # Generated Rust sources are written to `generated/`; the Unicode Character
  # Database files they are derived from are stored in `generated/ucd/`.
  generated_dir = Pathname.pwd.join('generated')
  ucd_dir = generated_dir.join('ucd')

  desc 'Rebuild generated Rust sources from Unicode data'
  task :build do
    unless system 'which ucd-generate'
      raise '`ucd-generate` not found. ' \
            "Install it for generating Unicode data: \n\n " \
            "cargo install 'ucd-generate@>=0.3.0'\n\n"
    end

    # Pull the `MAJOR.MINOR.PATCH` portion out of the version banner.
    installed_version = `ucd-generate --version`[/(\d+\.\d+\.\d+)/]
    # Fail with a clear message instead of handing `nil` to `Gem::Version.new`
    # when the banner does not contain a recognizable version number.
    raise 'Unable to determine the installed ucd-generate version' if installed_version.nil?

    unless Gem::Version.new(installed_version) >= Gem::Version.new('0.3.0')
      # The `--include` flag used later is only available after 0.3.0
      raise 'Please upgrade ucd-generate to >=0.3.0 to run this task ' \
            "(Using ucd-generate #{installed_version})."
    end

    # `git diff --exit-code` exits non-zero when the working tree has unstaged
    # changes; refuse to run so the regenerated output is easy to review.
    raise 'Stage your changes before running this task' unless system 'git diff --exit-code'

    filename = generated_dir.join('case_mapping.rs')
    sh "ucd-generate case-mapping #{ucd_dir.relative_path_from(Pathname.pwd)} " \
       "--include TITLE --flat-table > #{filename.relative_path_from(Pathname.pwd)}"
    # Let clippy apply its automatic fixes to the freshly generated source.
    sh 'cargo clippy --fix --allow-dirty'
  end

  desc 'Update Unicode data'
  task :update do
    # Download the latest copy of each UCD file used by the `build` task into
    # `generated/ucd/`, overwriting the previous copy.
    %w[UnicodeData.txt SpecialCasing.txt PropList.txt].each do |filename|
      uri = "https://www.unicode.org/Public/UCD/latest/ucd/#{filename}"
      URI.parse(uri).open do |data|
        IO.copy_stream(data, ucd_dir.join(filename))
      end
    end
  end
end

Bundler::Audit::Task.new

namespace :release do
Expand Down
6 changes: 6 additions & 0 deletions generated/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Generated Unicode Tables and Mappings

Do not edit files under this directory. They are automatically generated by:

    bundle exec rake unicode:update
    bundle exec rake unicode:build
558 changes: 558 additions & 0 deletions generated/case_mapping.rs

Large diffs are not rendered by default.

1,767 changes: 1,767 additions & 0 deletions generated/ucd/PropList.txt

Large diffs are not rendered by default.

281 changes: 281 additions & 0 deletions generated/ucd/SpecialCasing.txt

Large diffs are not rendered by default.

34,924 changes: 34,924 additions & 0 deletions generated/ucd/UnicodeData.txt

Large diffs are not rendered by default.

File renamed without changes.
251 changes: 247 additions & 4 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
//! You can convert case like:
//!
//! ```
//! # use roe::{LowercaseMode, UppercaseMode};
//! # use roe::{LowercaseMode, UppercaseMode, TitlecaseMode};
//! assert_eq!(
//! roe::lowercase(b"Artichoke Ruby", LowercaseMode::Ascii).collect::<Vec<_>>(),
//! b"artichoke ruby"
Expand All @@ -50,6 +50,10 @@
//! roe::uppercase("Αύριο".as_bytes(), UppercaseMode::Full).collect::<Vec<_>>(),
//! "ΑΎΡΙΟ".as_bytes()
//! );
//! assert_eq!(
//! roe::titlecase("ffi".as_bytes(), TitlecaseMode::Full).collect::<Vec<_>>(),
//! "Ffi".as_bytes()
//! );
//! ```
//!
//!
Expand Down Expand Up @@ -110,16 +114,20 @@ use core::str::FromStr;

mod ascii;
mod lowercase;
mod titlecase;
mod unicode;
mod uppercase;

pub use ascii::{make_ascii_lowercase, make_ascii_titlecase, make_ascii_uppercase};
#[cfg(feature = "alloc")]
pub use ascii::{to_ascii_lowercase, to_ascii_titlecase, to_ascii_uppercase};
pub use lowercase::Lowercase;
pub use titlecase::Titlecase;
pub use unicode::to_titlecase;
pub use uppercase::Uppercase;

/// Error that indicates a failure to parse a [`LowercaseMode`] or
/// [`UppercaseMode`].
/// Error that indicates a failure to parse a [`LowercaseMode`],
/// [`UppercaseMode`], or [`TitlecaseMode`].
///
/// This error corresponds to the [Ruby `ArgumentError` Exception class].
///
Expand Down Expand Up @@ -445,7 +453,6 @@ impl FromStr for UppercaseMode {
///
/// [conventionally UTF-8 string]: https://docs.rs/bstr/0.2.*/bstr/#when-should-i-use-byte-strings
/// [Turkic]: LowercaseMode::Turkic
/// [case folding]: LowercaseMode::Fold
// TODO: make this const once we're no longer panicking.
pub fn uppercase(slice: &[u8], options: UppercaseMode) -> Uppercase<'_> {
match options {
Expand All @@ -456,6 +463,136 @@ pub fn uppercase(slice: &[u8], options: UppercaseMode) -> Uppercase<'_> {
}
}

/// Options to configure the behavior of [`titlecase`].
///
/// Which letters exactly are replaced, and by which other letters, depends on
/// the given options.
///
/// See individual variants for a description of the available behaviors.
///
/// If you're not sure which mode to choose, [`TitlecaseMode::Full`] is a good
/// default.
///
/// [`titlecase`]: crate::titlecase()
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)]
pub enum TitlecaseMode {
    /// Full Unicode case mapping, suitable for most languages.
    ///
    /// See the [Turkic] and [Lithuanian] variants for exceptions.
    ///
    /// Context-dependent case mapping as described in Table 3-14 of the Unicode
    /// standard is currently not supported.
    ///
    /// [Turkic]: Self::Turkic
    /// [Lithuanian]: Self::Lithuanian
    Full,
    /// Only the ASCII region, i.e. the characters `'A'..='Z'` and `'a'..='z'`,
    /// are affected.
    ///
    /// This option cannot be combined with any other option.
    Ascii,
    /// Full Unicode case mapping, adapted for Turkic languages (Turkish,
    /// Azerbaijani, …).
    ///
    /// This means that lower case i is mapped to title case dotted İ, and so
    /// on.
    ///
    /// This mode is not yet implemented: [`titlecase`](crate::titlecase())
    /// currently panics when it is given this mode.
    Turkic,
    /// Currently, just [full Unicode case mapping].
    ///
    /// In the future, full Unicode case mapping adapted for Lithuanian (keeping
    /// the dot on the title case i even if there is an accent on top).
    ///
    /// [full Unicode case mapping]: Self::Full
    Lithuanian,
}

impl Default for TitlecaseMode {
fn default() -> Self {
Self::Full
}
}

impl TryFrom<&str> for TitlecaseMode {
type Error = InvalidCaseMappingMode;

#[inline]
fn try_from(value: &str) -> Result<Self, Self::Error> {
value.as_bytes().try_into()
}
}

impl TryFrom<Option<&str>> for TitlecaseMode {
type Error = InvalidCaseMappingMode;

#[inline]
fn try_from(value: Option<&str>) -> Result<Self, Self::Error> {
value.map(str::as_bytes).try_into()
}
}

impl TryFrom<&[u8]> for TitlecaseMode {
    type Error = InvalidCaseMappingMode;

    /// Parses a mode name given as bytes.
    ///
    /// The recognized names are `ascii`, `turkic` and `lithuanian`; the match
    /// is exact and case-sensitive. Any other input, including `full`, yields
    /// an [`InvalidCaseMappingMode`] error.
    #[inline]
    fn try_from(value: &[u8]) -> Result<Self, Self::Error> {
        let mode = match value {
            b"ascii" => Self::Ascii,
            b"turkic" => Self::Turkic,
            b"lithuanian" => Self::Lithuanian,
            _ => return Err(InvalidCaseMappingMode::new()),
        };
        Ok(mode)
    }
}

impl TryFrom<Option<&[u8]>> for TitlecaseMode {
type Error = InvalidCaseMappingMode;

#[inline]
fn try_from(value: Option<&[u8]>) -> Result<Self, Self::Error> {
match value {
None => Ok(Self::default()),
Some(value) => value.try_into(),
}
}
}

impl FromStr for TitlecaseMode {
    type Err = InvalidCaseMappingMode;

    /// Parses a mode name, accepting exactly what the `TryFrom<&str>`
    /// conversion accepts.
    #[inline]
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        <Self as TryFrom<&str>>::try_from(s)
    }
}

/// Returns an iterator that yields a copy of the bytes in the given slice with
/// the leading letter replaced with its titlecase counterpart, and the
/// remaining letters replaced with their lowercase counterparts.
///
/// This function treats the given slice as a [conventionally UTF-8 string].
/// UTF-8 byte sequences are converted to their Unicode titlecase equivalents.
/// Invalid UTF-8 byte sequences are yielded as is.
///
/// The case mapping mode is determined by the given [`TitlecaseMode`]. See its
/// documentation for details on the available case mapping modes.
///
/// # Panics
///
/// Not all [`TitlecaseMode`]s are currently implemented. This function will
/// panic if the caller supplies [Turkic] titlecasing mode.
///
/// [conventionally UTF-8 string]: https://docs.rs/bstr/0.2.*/bstr/#when-should-i-use-byte-strings
/// [Turkic]: TitlecaseMode::Turkic
// TODO: make this const once we're no longer panicking.
pub fn titlecase(slice: &[u8], options: TitlecaseMode) -> Titlecase<'_> {
    match options {
        TitlecaseMode::Ascii => Titlecase::with_ascii_slice(slice),
        // TODO: implement `turkic` mode.
        TitlecaseMode::Turkic => panic!("titlecase Turkic mode is not yet implemented"),
        // Lithuanian currently shares the full Unicode case mapping.
        TitlecaseMode::Full | TitlecaseMode::Lithuanian => Titlecase::with_slice(slice),
    }
}

// Ensure code blocks in README.md compile
//
// This module and macro declaration should be kept at the end of the file, in
Expand All @@ -472,3 +609,109 @@ macro_rules! readme {
}
#[cfg(doctest)]
readme!();

#[cfg(test)]
mod tests {
    use core::{convert::TryInto, str::FromStr};

    use alloc::format;

    use crate::{InvalidCaseMappingMode, LowercaseMode, TitlecaseMode, UppercaseMode};

    // The error's `Display` output is a fixed message.
    #[test]
    fn test_invalid_case_mapping_mode_fmt() {
        let err = InvalidCaseMappingMode::new();
        assert_eq!(format!("{err}"), "invalid option");
    }

    // Mode names parse to the matching variant; `full` is not a valid name.
    #[test]
    fn test_lowercase_mode_parsing() {
        assert_eq!(LowercaseMode::from_str("ascii"), Ok(LowercaseMode::Ascii));
        assert_eq!(LowercaseMode::from_str("turkic"), Ok(LowercaseMode::Turkic));
        assert_eq!(
            LowercaseMode::from_str("lithuanian"),
            Ok(LowercaseMode::Lithuanian)
        );
        assert_eq!(LowercaseMode::from_str("fold"), Ok(LowercaseMode::Fold));
        assert_eq!(
            LowercaseMode::from_str("full"),
            Err(InvalidCaseMappingMode::new())
        );
    }

    // Every supported source type (`&str`, `Option<&str>`, `&[u8]`,
    // `Option<&[u8]>`) converts to the same mode.
    #[test]
    fn test_lowercase_mode_conversion() {
        let mut mode: LowercaseMode;
        mode = "turkic".try_into().unwrap();
        assert_eq!(mode, LowercaseMode::Turkic);

        mode = Some("turkic").try_into().unwrap();
        assert_eq!(mode, LowercaseMode::Turkic);

        mode = b"turkic"[..].try_into().unwrap();
        assert_eq!(mode, LowercaseMode::Turkic);

        mode = Some(&b"turkic"[..]).try_into().unwrap();
        assert_eq!(mode, LowercaseMode::Turkic);
    }

    // Mode names parse to the matching variant; `full` is not a valid name.
    #[test]
    fn test_uppercase_mode_parsing() {
        assert_eq!(UppercaseMode::from_str("ascii"), Ok(UppercaseMode::Ascii));
        assert_eq!(UppercaseMode::from_str("turkic"), Ok(UppercaseMode::Turkic));
        assert_eq!(
            UppercaseMode::from_str("lithuanian"),
            Ok(UppercaseMode::Lithuanian)
        );
        assert_eq!(
            UppercaseMode::from_str("full"),
            Err(InvalidCaseMappingMode::new())
        );
    }

    // Every supported source type converts to the same mode.
    #[test]
    fn test_uppercase_mode_conversion() {
        let mut mode: UppercaseMode;
        mode = "turkic".try_into().unwrap();
        assert_eq!(mode, UppercaseMode::Turkic);

        mode = Some("turkic").try_into().unwrap();
        assert_eq!(mode, UppercaseMode::Turkic);

        mode = b"turkic"[..].try_into().unwrap();
        assert_eq!(mode, UppercaseMode::Turkic);

        mode = Some(&b"turkic"[..]).try_into().unwrap();
        assert_eq!(mode, UppercaseMode::Turkic);
    }

    // Mode names parse to the matching variant; `full` is not a valid name.
    #[test]
    fn test_titlecase_mode_parsing() {
        assert_eq!(TitlecaseMode::from_str("ascii"), Ok(TitlecaseMode::Ascii));
        assert_eq!(TitlecaseMode::from_str("turkic"), Ok(TitlecaseMode::Turkic));
        assert_eq!(
            TitlecaseMode::from_str("lithuanian"),
            Ok(TitlecaseMode::Lithuanian)
        );
        assert_eq!(
            TitlecaseMode::from_str("full"),
            Err(InvalidCaseMappingMode::new())
        );
    }

    // Every supported source type converts to the same mode, and a missing
    // option falls back to the default mode.
    #[test]
    fn test_titlecase_mode_conversion() {
        let mut mode: TitlecaseMode;
        mode = "turkic".try_into().unwrap();
        assert_eq!(mode, TitlecaseMode::Turkic);

        mode = Some("turkic").try_into().unwrap();
        assert_eq!(mode, TitlecaseMode::Turkic);

        mode = b"turkic"[..].try_into().unwrap();
        assert_eq!(mode, TitlecaseMode::Turkic);

        mode = Some(&b"turkic"[..]).try_into().unwrap();
        assert_eq!(mode, TitlecaseMode::Turkic);

        // `None` takes the default branch of the `Option` conversions.
        mode = None::<&str>.try_into().unwrap();
        assert_eq!(mode, TitlecaseMode::Full);

        mode = None::<&[u8]>.try_into().unwrap();
        assert_eq!(mode, TitlecaseMode::Full);
    }
}
Loading

0 comments on commit 7209421

Please sign in to comment.