Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add SourceInfo option to the data export #5506

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions provider/core/src/export/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,46 @@
mod payload;

pub use payload::{ExportBox, ExportMarker};
use zerofrom::ZeroFrom;

use crate::prelude::*;
use std::collections::HashSet;

/// Choices for determining the deduplication of locales for exported data payloads.
///
/// Deduplication affects the lookup table from locales to data payloads. If a child locale
/// points to the same payload as its parent locale, then the child locale can be removed from
/// the lookup table. Therefore, all deduplication strategies guarantee that data requests for
/// selected locales will succeed so long as fallback is enabled at runtime (either internally
/// or externally). They also do not impact which _payloads_ are included: only the lookup table.
///
/// Comparison of the deduplication strategies:
///
/// | Name | Data file size | Supported locale queries? | Needs runtime fallback? |
/// |---|---|---|---|
/// | [`Maximal`] | Smallest | No | Yes |
/// | [`RetainBaseLanguages`] | Small | Yes | Yes |
/// | [`None`] | Medium/Small | Yes | No |
///
/// This enum is `#[non_exhaustive]`, so a `match` on it from another crate needs a
/// wildcard arm. When the `serde` feature is enabled, the variants are named in camelCase
/// in serialized form (for example, `retainBaseLanguages`).
///
/// [`Maximal`]: DeduplicationStrategy::Maximal
/// [`RetainBaseLanguages`]: DeduplicationStrategy::RetainBaseLanguages
/// [`None`]: DeduplicationStrategy::None
#[non_exhaustive]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash, ZeroFrom)]
#[cfg_attr(feature = "export", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "export", databake(path = icu_provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
#[cfg_attr(feature = "serde", serde(rename_all = "camelCase"))]
pub enum DeduplicationStrategy {
/// Removes from the lookup table any locale whose parent maps to the same data.
Maximal,
/// Removes from the lookup table any locale whose parent maps to the same data, except if
/// the parent is `und`.
RetainBaseLanguages,
/// Keeps all selected locales in the lookup table.
None,
}

/// An object capable of exporting data payloads in some form.
pub trait DataExporter: Sync {
/// Save a `payload` corresponding to the given marker and locale.
Expand Down
5 changes: 5 additions & 0 deletions provider/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ pub mod marker {
};
}

#[cfg(feature = "macros")]
mod source_info;
#[cfg(feature = "macros")]
pub use source_info::{SourceInfo, SourceInfoMarker};

/// Core selection of APIs and structures for the ICU4X data provider.
pub mod prelude {
#[doc(no_inline)]
Expand Down
32 changes: 32 additions & 0 deletions provider/core/src/source_info.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use core::marker::PhantomData;

use crate::export::DeduplicationStrategy;
use crate::prelude::*;
use alloc::borrow::Cow;

use crate as icu_provider;

/// Information about how a provider's data was exported.
///
/// Declared as a singleton data struct with the marker `SourceInfoMarker` and the
/// path `_sourceinfo@1`.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
#[crate::data_struct(marker(SourceInfoMarker, "_sourceinfo@1", singleton))]
#[cfg_attr(feature = "export", derive(serde::Serialize, databake::Bake))]
#[cfg_attr(feature = "export", databake(path = icu_provider))]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub enum SourceInfo<'data> {
/// Version 1 of the source information.
V001 {
/// The deduplication strategy that the data was exported with.
deduplication: DeduplicationStrategy,
/// Carries no data (the borrowed type is `()`).
///
/// NOTE(review): presumably present only so that the enum uses the `'data`
/// lifetime expected of data structs — confirm against `data_struct`.
lifetime_holder: Cow<'data, ()>,
},
}

impl SourceInfo<'_> {
    /// Returns the [`DeduplicationStrategy`] recorded in this source info, i.e. the
    /// strategy the provider's data was exported with.
    pub fn deduplication(&self) -> DeduplicationStrategy {
        // `V001` is currently the only variant, so this match is exhaustive.
        match self {
            Self::V001 { deduplication, .. } => *deduplication,
        }
    }
}
31 changes: 30 additions & 1 deletion provider/export/src/export_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,12 @@
use crate::{DataLocaleFamilyAnnotations, DeduplicationStrategy, ExportDriver};
use icu_locale::fallback::LocaleFallbackIterator;
use icu_locale::LocaleFallbacker;
use icu_provider::dynutil::UpcastDataPayload;
use icu_provider::export::*;
use icu_provider::prelude::*;
use icu_provider::SourceInfo;
use icu_provider::SourceInfoMarker;
use std::borrow::Cow;
use std::collections::HashMap;
use std::collections::HashSet;
use std::fmt;
Expand Down Expand Up @@ -55,21 +59,30 @@ impl ExportDriver {
fallbacker,
deduplication_strategy,
attributes_filters,
with_source_info,
} = self;

let markers = markers.unwrap_or_else(|| provider.supported_markers());
let mut markers = markers.unwrap_or_else(|| provider.supported_markers());

if markers.is_empty() {
log::warn!("No markers selected");
}

if with_source_info {
markers.insert(SourceInfoMarker::INFO);
}

log::info!(
"Datagen configured with {}, and these locales: {:?}",
match deduplication_strategy {
DeduplicationStrategy::Maximal => "maximal deduplication",
DeduplicationStrategy::RetainBaseLanguages =>
"deduplication retaining base languages",
DeduplicationStrategy::None => "no deduplication",
_ => {
return Err(DataError::custom("Unsupported deduplication strategy")
.with_debug_context(&deduplication_strategy));
}
},
if include_full {
vec!["<all>".to_string()]
Expand Down Expand Up @@ -138,6 +151,18 @@ impl ExportDriver {
log::trace!("Generating marker {marker:?}");
let instant1 = Instant::now();

if marker == SourceInfoMarker::INFO {
let payload = UpcastDataPayload::upcast(
DataPayload::<SourceInfoMarker>::from_owned(SourceInfo::V001 {
deduplication: self.deduplication_strategy,
lifetime_holder: Cow::Borrowed(&()),
}),
);
sink.flush_singleton(marker, &payload)
.map_err(|e| e.with_req(marker, Default::default()))?;
return Ok(());
}

if marker.is_singleton {
let supported = provider.iter_ids_for_marker(marker)?;
if supported.len() != 1 || !supported.first().unwrap().is_default() {
Expand Down Expand Up @@ -231,6 +256,10 @@ impl ExportDriver {
.collect::<Result<Vec<_>, DataError>>()?
.into_iter()
.max(),
_ => {
return Err(DataError::custom("Unsupported deduplication strategy")
.with_debug_context(&deduplication_strategy));
}
}
.unwrap_or_default();

Expand Down
50 changes: 15 additions & 35 deletions provider/export/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,16 +67,18 @@ pub use icu_provider_blob::export as blob_exporter;
#[cfg(feature = "fs_exporter")]
pub use icu_provider_fs::export as fs_exporter;

pub(crate) use icu_provider::export::DeduplicationStrategy;

/// A prelude for using the datagen API
pub mod prelude {
#[doc(no_inline)]
pub use crate::{
DataLocaleFamily, DeduplicationStrategy, ExportDriver, FallbackOptions, NoFallbackOptions,
};
pub use crate::{DataLocaleFamily, ExportDriver, FallbackOptions, NoFallbackOptions};
#[doc(no_inline)]
pub use icu_locale::{locale, LocaleFallbacker};
#[doc(no_inline)]
pub use icu_provider::{export::DataExporter, DataLocale, DataMarker, DataMarkerInfo};
pub use icu_provider::{
export::DataExporter, export::DeduplicationStrategy, DataLocale, DataMarker, DataMarkerInfo,
};
}

use icu_locale::LocaleFallbacker;
Expand Down Expand Up @@ -118,6 +120,7 @@ pub struct ExportDriver {
fallbacker: LocaleFallbacker,
include_full: bool,
deduplication_strategy: DeduplicationStrategy,
with_source_info: bool,
}

impl core::fmt::Debug for ExportDriver {
Expand Down Expand Up @@ -169,6 +172,7 @@ impl ExportDriver {
include_full,
fallbacker,
deduplication_strategy: options.deduplication_strategy,
with_source_info: false,
}
.with_recommended_segmenter_models()
.with_additional_collations([])
Expand All @@ -195,6 +199,13 @@ impl ExportDriver {
}
}

/// Sets whether the export should include source information.
///
/// When `with_source_info` is `true`, the driver adds `SourceInfoMarker` to the set of
/// markers it exports. A newly constructed driver has this option set to `false`.
pub fn with_source_info(mut self, with_source_info: bool) -> Self {
    self.with_source_info = with_source_info;
    self
}

/// This option is only relevant if using `icu::collator`.
///
/// By default, the collations `big5han`, `gb2312`, and those starting with `search`
Expand Down Expand Up @@ -270,37 +281,6 @@ impl ExportDriver {
#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]
pub struct NoFallbackOptions {}

/// Choices for determining the deduplication of locales for exported data payloads.
///
/// Deduplication affects the lookup table from locales to data payloads. If a child locale
/// points to the same payload as its parent locale, then the child locale can be removed from
/// the lookup table. Therefore, all deduplication strategies guarantee that data requests for
/// selected locales will succeed so long as fallback is enabled at runtime (either internally
/// or externally). They also do not impact which _payloads_ are included: only the lookup table.
///
/// Comparison of the deduplication strategies:
///
/// | Name | Data file size | Supported locale queries? | Needs runtime fallback? |
/// |---|---|---|---|
/// | [`Maximal`] | Smallest | No | Yes |
/// | [`RetainBaseLanguages`] | Small | Yes | Yes |
/// | [`None`] | Medium/Small | Yes | No |
///
/// This enum is `#[non_exhaustive]`, so a `match` on it from another crate needs a
/// wildcard arm.
///
/// [`Maximal`]: DeduplicationStrategy::Maximal
/// [`RetainBaseLanguages`]: DeduplicationStrategy::RetainBaseLanguages
/// [`None`]: DeduplicationStrategy::None
#[non_exhaustive]
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub enum DeduplicationStrategy {
/// Removes from the lookup table any locale whose parent maps to the same data.
Maximal,
/// Removes from the lookup table any locale whose parent maps to the same data, except if
/// the parent is `und`.
RetainBaseLanguages,
/// Keeps all selected locales in the lookup table.
None,
}

/// Options bag configuring locale inclusion and behavior when runtime fallback is enabled.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
#[non_exhaustive]
Expand Down
6 changes: 3 additions & 3 deletions provider/icu4x-datagen/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -463,11 +463,11 @@ fn main() -> eyre::Result<()> {
};

let deduplication_strategy = match cli.deduplication {
Some(Deduplication::Maximal) => icu_provider_export::DeduplicationStrategy::Maximal,
Some(Deduplication::Maximal) => icu_provider::export::DeduplicationStrategy::Maximal,
Some(Deduplication::RetainBaseLanguages) => {
icu_provider_export::DeduplicationStrategy::RetainBaseLanguages
icu_provider::export::DeduplicationStrategy::RetainBaseLanguages
}
Some(Deduplication::None) => icu_provider_export::DeduplicationStrategy::None,
Some(Deduplication::None) => icu_provider::export::DeduplicationStrategy::None,
None => match cli.format {
Format::Fs | Format::Blob | Format::Blob2 => DeduplicationStrategy::None,
Format::Baked if cli.no_internal_fallback && cli.deduplication.is_none() =>
Expand Down
6 changes: 6 additions & 0 deletions provider/source/data/debug/[email protected]

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions provider/source/src/tests/make_testdata.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ fn make_testdata() {
DeduplicationStrategy::None.into(),
LocaleFallbacker::try_new_unstable(&provider).unwrap(),
)
.with_source_info(true)
.with_segmenter_models([
"thaidict".into(),
"Thai_codepoints_exclusive_model4_heavy".into(),
Expand Down Expand Up @@ -142,6 +143,10 @@ impl DataExporter for ZeroCopyCheckExporter {
let deserialized: DataPayload<icu_provider::hello_world::HelloWorldV1Marker> = buffer_payload.into_deserialized(icu_provider::buf::BufferFormat::Postcard1).unwrap();
(MeasuringAllocator::end_measure(), UpcastDataPayload::upcast(deserialized))
}
k if k == icu_provider::SourceInfoMarker::INFO => {
let deserialized: DataPayload<icu_provider::SourceInfoMarker> = buffer_payload.into_deserialized(icu_provider::buf::BufferFormat::Postcard1).unwrap();
(MeasuringAllocator::end_measure(), UpcastDataPayload::upcast(deserialized))
}
$(
k if k == <$marker>::INFO => {
let deserialized: DataPayload<$marker> = buffer_payload.into_deserialized(icu_provider::buf::BufferFormat::Postcard1).unwrap();
Expand Down
Loading