Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use a simplified codepath if no bidi isolation controls are present. #131

Merged
merged 4 commits into from
Mar 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 30 additions & 1 deletion src/explicit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,19 +19,25 @@ use super::char_data::{
BidiClass::{self, *},
};
use super::level::Level;
use super::prepare::removed_by_x9;
use super::LevelRunVec;
use super::TextSource;

/// Compute explicit embedding levels for one paragraph of text (X1-X8).
/// Compute explicit embedding levels for one paragraph of text (X1-X8), and identify
/// level runs (BD7) for use when determining Isolating Run Sequences (X10).
///
/// `processing_classes[i]` must contain the `BidiClass` of the char at byte index `i`,
/// for each char in `text`.
///
/// `runs` returns the list of level runs (BD7) of the text.
#[cfg_attr(feature = "flame_it", flamer::flame)]
pub fn compute<'a, T: TextSource<'a> + ?Sized>(
text: &'a T,
para_level: Level,
original_classes: &[BidiClass],
levels: &mut [Level],
processing_classes: &mut [BidiClass],
runs: &mut LevelRunVec,
) {
assert_eq!(text.len(), original_classes.len());

Expand All @@ -51,6 +57,9 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>(
let mut overflow_embedding_count = 0u32;
let mut valid_isolate_count = 0u32;

let mut current_run_level = Level::ltr();
let mut current_run_start = 0;

for (i, len) in text.indices_lengths() {
let last = stack.last().unwrap();

Expand Down Expand Up @@ -182,6 +191,26 @@ pub fn compute<'a, T: TextSource<'a> + ?Sized>(
levels[i + j] = levels[i];
processing_classes[i + j] = processing_classes[i];
}

// Identify level runs to be passed to prepare::isolating_run_sequences().
if i == 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if we're moving level_runs logic here then we should:

  • link to the spec
  • mention in the fn docs that this code also handles that part of the spec
  • ideally, more comments referencing spec list items or other text

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've added/tidied up comments, as well as added a test for the edge-case of calling the paragraph API with empty text (and RTL direction, which prevents us taking the early exit in compute_bidi_info_for_para).

// Initialize for the first (or only) run.
current_run_level = levels[i];
} else {
// Check if we need to start a new level run.
// <https://www.unicode.org/reports/tr9/#BD7>
if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level {
// End the last run and start a new one.
runs.push(current_run_start..i);
current_run_level = levels[i];
current_run_start = i;
}
}
}

// Append the trailing level run, if non-empty.
if levels.len() > current_run_start {
runs.push(current_run_start..levels.len());
}
}

Expand Down
104 changes: 83 additions & 21 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ mod prepare;
pub use crate::char_data::{BidiClass, UNICODE_VERSION};
pub use crate::data_source::BidiDataSource;
pub use crate::level::{Level, LTR_LEVEL, RTL_LEVEL};
pub use crate::prepare::LevelRun;
pub use crate::prepare::{LevelRun, LevelRunVec};

#[cfg(feature = "hardcoded-data")]
pub use crate::char_data::{bidi_class, HardcodedBidiData};
Expand Down Expand Up @@ -248,8 +248,14 @@ struct InitialInfoExt<'text> {

/// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that
/// requires no further bidi processing (i.e. there are no RTL characters or bidi
/// control codes present).
pure_ltr: Vec<bool>,
/// control codes present), and whether any bidi isolation controls are present.
flags: Vec<ParagraphInfoFlags>,
}

#[derive(PartialEq, Debug)]
struct ParagraphInfoFlags {
is_pure_ltr: bool,
has_isolate_controls: bool,
}

impl<'text> InitialInfoExt<'text> {
Expand All @@ -269,12 +275,12 @@ impl<'text> InitialInfoExt<'text> {
default_para_level: Option<Level>,
) -> InitialInfoExt<'a> {
let mut paragraphs = Vec::<ParagraphInfo>::new();
let mut pure_ltr = Vec::<bool>::new();
let (original_classes, _, _) = compute_initial_info(
let mut flags = Vec::<ParagraphInfoFlags>::new();
let (original_classes, _, _, _) = compute_initial_info(
data_source,
text,
default_para_level,
Some((&mut paragraphs, &mut pure_ltr)),
Some((&mut paragraphs, &mut flags)),
);

InitialInfoExt {
Expand All @@ -283,7 +289,7 @@ impl<'text> InitialInfoExt<'text> {
original_classes,
paragraphs,
},
pure_ltr,
flags,
}
}
}
Expand All @@ -299,8 +305,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
data_source: &D,
text: &'a T,
default_para_level: Option<Level>,
mut split_paragraphs: Option<(&mut Vec<ParagraphInfo>, &mut Vec<bool>)>,
) -> (Vec<BidiClass>, Level, bool) {
mut split_paragraphs: Option<(&mut Vec<ParagraphInfo>, &mut Vec<ParagraphInfoFlags>)>,
) -> (Vec<BidiClass>, Level, bool, bool) {
let mut original_classes = Vec::with_capacity(text.len());

// The stack contains the starting code unit index for each nested isolate we're inside.
Expand All @@ -310,8 +316,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
let mut isolate_stack = Vec::new();

debug_assert!(
if let Some((ref paragraphs, ref pure_ltr)) = split_paragraphs {
paragraphs.is_empty() && pure_ltr.is_empty()
if let Some((ref paragraphs, ref flags)) = split_paragraphs {
paragraphs.is_empty() && flags.is_empty()
} else {
true
}
Expand All @@ -323,6 +329,8 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
// Per-paragraph flag: can subsequent processing be skipped? Set to false if any
// RTL characters or bidi control characters are encountered in the paragraph.
let mut is_pure_ltr = true;
// Set to true if any bidi isolation controls are present in the paragraph.
let mut has_isolate_controls = false;

#[cfg(feature = "flame_it")]
flame::start("compute_initial_info(): iter text.char_indices()");
Expand All @@ -341,7 +349,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(

match class {
B => {
if let Some((ref mut paragraphs, ref mut pure_ltr)) = split_paragraphs {
if let Some((ref mut paragraphs, ref mut flags)) = split_paragraphs {
// P1. Split the text into separate paragraphs. The paragraph separator is kept
// with the previous paragraph.
let para_end = i + len;
Expand All @@ -350,14 +358,18 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
// P3. If no character is found in p2, set the paragraph level to zero.
level: para_level.unwrap_or(LTR_LEVEL),
});
pure_ltr.push(is_pure_ltr);
flags.push(ParagraphInfoFlags {
is_pure_ltr,
has_isolate_controls,
});
// Reset state for the start of the next paragraph.
para_start = para_end;
// TODO: Support defaulting to direction of previous paragraph
//
// <http://www.unicode.org/reports/tr9/#HL1>
para_level = default_para_level;
is_pure_ltr = true;
has_isolate_controls = false;
isolate_stack.clear();
}
}
Expand Down Expand Up @@ -394,6 +406,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(

RLI | LRI | FSI => {
is_pure_ltr = false;
has_isolate_controls = true;
isolate_stack.push(i);
}

Expand All @@ -405,15 +418,18 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
}
}

if let Some((paragraphs, pure_ltr)) = split_paragraphs {
if let Some((paragraphs, flags)) = split_paragraphs {
if para_start < text.len() {
paragraphs.push(ParagraphInfo {
range: para_start..text.len(),
level: para_level.unwrap_or(LTR_LEVEL),
});
pure_ltr.push(is_pure_ltr);
flags.push(ParagraphInfoFlags {
is_pure_ltr,
has_isolate_controls,
});
}
debug_assert_eq!(paragraphs.len(), pure_ltr.len());
debug_assert_eq!(paragraphs.len(), flags.len());
}
debug_assert_eq!(original_classes.len(), text.len());

Expand All @@ -424,6 +440,7 @@ fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
original_classes,
para_level.unwrap_or(LTR_LEVEL),
is_pure_ltr,
has_isolate_controls,
)
}

Expand Down Expand Up @@ -482,20 +499,21 @@ impl<'text> BidiInfo<'text> {
text: &'a str,
default_para_level: Option<Level>,
) -> BidiInfo<'a> {
let InitialInfoExt { base, pure_ltr, .. } =
let InitialInfoExt { base, flags, .. } =
InitialInfoExt::new_with_data_source(data_source, text, default_para_level);

let mut levels = Vec::<Level>::with_capacity(text.len());
let mut processing_classes = base.original_classes.clone();

for (para, is_pure_ltr) in base.paragraphs.iter().zip(pure_ltr.iter()) {
for (para, flags) in base.paragraphs.iter().zip(flags.iter()) {
let text = &text[para.range.clone()];
let original_classes = &base.original_classes[para.range.clone()];

compute_bidi_info_for_para(
data_source,
para,
*is_pure_ltr,
flags.is_pure_ltr,
flags.has_isolate_controls,
text,
original_classes,
&mut processing_classes,
Expand Down Expand Up @@ -720,7 +738,7 @@ impl<'text> ParagraphBidiInfo<'text> {
) -> ParagraphBidiInfo<'a> {
// Here we could create a ParagraphInitialInfo struct to parallel the one
// used by BidiInfo, but there doesn't seem any compelling reason for it.
let (original_classes, paragraph_level, is_pure_ltr) =
let (original_classes, paragraph_level, is_pure_ltr, has_isolate_controls) =
compute_initial_info(data_source, text, default_para_level, None);

let mut levels = Vec::<Level>::with_capacity(text.len());
Expand All @@ -738,6 +756,7 @@ impl<'text> ParagraphBidiInfo<'text> {
data_source,
&para_info,
is_pure_ltr,
has_isolate_controls,
text,
&original_classes,
&mut processing_classes,
Expand Down Expand Up @@ -1066,6 +1085,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>
data_source: &D,
para: &ParagraphInfo,
is_pure_ltr: bool,
has_isolate_controls: bool,
text: &'a T,
original_classes: &[BidiClass],
processing_classes: &mut [BidiClass],
Expand All @@ -1079,16 +1099,26 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>

let processing_classes = &mut processing_classes[para.range.clone()];
let levels = &mut levels[para.range.clone()];
let mut level_runs = LevelRunVec::new();

explicit::compute(
text,
para.level,
original_classes,
levels,
processing_classes,
&mut level_runs,
);

let sequences = prepare::isolating_run_sequences(para.level, original_classes, levels);
let mut sequences = prepare::IsolatingRunSequenceVec::new();
prepare::isolating_run_sequences(
para.level,
original_classes,
levels,
level_runs,
has_isolate_controls,
&mut sequences,
);
for sequence in &sequences {
implicit::resolve_weak(text, sequence, processing_classes);
implicit::resolve_neutral(
Expand All @@ -1100,6 +1130,7 @@ fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>
processing_classes,
);
}

implicit::resolve_levels(processing_classes, levels);

assign_levels_to_removed_chars(para.level, original_classes, levels);
Expand Down Expand Up @@ -1549,6 +1580,24 @@ mod tests {
#[cfg(feature = "hardcoded-data")]
fn test_process_text() {
let tests = vec![
(
// text
"",
// base level
Some(RTL_LEVEL),
// levels
Level::vec(&[]),
// original_classes
vec![],
// paragraphs
vec![],
// levels_u16
Level::vec(&[]),
// original_classes_u16
vec![],
// paragraphs_u16
vec![],
),
(
// text
"abc123",
Expand Down Expand Up @@ -1710,6 +1759,19 @@ mod tests {
paragraphs: t.4.clone(),
}
);
// If it was empty, also test that ParagraphBidiInfo handles it safely.
if t.4.len() == 0 {
assert_eq!(
ParagraphBidiInfo::new(t.0, t.1),
ParagraphBidiInfo {
text: t.0,
original_classes: t.3.clone(),
levels: t.2.clone(),
paragraph_level: RTL_LEVEL,
is_pure_ltr: true,
}
)
}
// If it was a single paragraph, also test ParagraphBidiInfo.
if t.4.len() == 1 {
assert_eq!(
Expand Down
Loading