Skip to content

Commit a2a3ba1

Browse files
authored
Merge pull request #105 from Manishearth/reorder-conformance
Run reordering conformance tests, more docs for reordering, fix bug
2 parents 492381c + 511db3f commit a2a3ba1

File tree

2 files changed

+185
-75
lines changed

2 files changed

+185
-75
lines changed

src/lib.rs

Lines changed: 110 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -376,16 +376,93 @@ impl<'text> BidiInfo<'text> {
376376
}
377377
}
378378

379-
/// Re-order a line based on resolved levels and return only the embedding levels, one `Level`
380-
/// per *byte*.
379+
/// Produce the levels for this paragraph as needed for reordering, one level per *byte*
380+
/// in the paragraph. The returned vector includes bytes that are not included
381+
/// in the `line`, but will not adjust them.
382+
///
383+
/// This runs [Rule L1], you can run
384+
/// [Rule L2] by calling [`Self::reorder_visual()`].
385+
/// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
386+
/// to avoid non-byte indices.
387+
///
388+
/// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
389+
///
390+
/// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
391+
/// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
381392
#[cfg_attr(feature = "flame_it", flamer::flame)]
382393
pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range<usize>) -> Vec<Level> {
383-
let (levels, _) = self.visual_runs(para, line);
394+
assert!(line.start <= self.levels.len());
395+
assert!(line.end <= self.levels.len());
396+
397+
let mut levels = self.levels.clone();
398+
let line_classes = &self.original_classes[line.clone()];
399+
let line_levels = &mut levels[line.clone()];
400+
401+
// Reset some whitespace chars to paragraph level.
402+
// <http://www.unicode.org/reports/tr9/#L1>
403+
let line_str: &str = &self.text[line.clone()];
404+
let mut reset_from: Option<usize> = Some(0);
405+
let mut reset_to: Option<usize> = None;
406+
let mut prev_level = para.level;
407+
for (i, c) in line_str.char_indices() {
408+
match line_classes[i] {
409+
// Segment separator, Paragraph separator
410+
B | S => {
411+
assert_eq!(reset_to, None);
412+
reset_to = Some(i + c.len_utf8());
413+
if reset_from == None {
414+
reset_from = Some(i);
415+
}
416+
}
417+
// Whitespace, isolate formatting
418+
WS | FSI | LRI | RLI | PDI => {
419+
if reset_from == None {
420+
reset_from = Some(i);
421+
}
422+
}
423+
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
424+
// same as above + set the level
425+
RLE | LRE | RLO | LRO | PDF | BN => {
426+
if reset_from == None {
427+
reset_from = Some(i);
428+
}
429+
// also set the level to previous
430+
line_levels[i] = prev_level;
431+
}
432+
_ => {
433+
reset_from = None;
434+
}
435+
}
436+
if let (Some(from), Some(to)) = (reset_from, reset_to) {
437+
for level in &mut line_levels[from..to] {
438+
*level = para.level;
439+
}
440+
reset_from = None;
441+
reset_to = None;
442+
}
443+
prev_level = line_levels[i];
444+
}
445+
if let Some(from) = reset_from {
446+
for level in &mut line_levels[from..] {
447+
*level = para.level;
448+
}
449+
}
384450
levels
385451
}
386452

387-
/// Re-order a line based on resolved levels and return only the embedding levels, one `Level`
388-
/// per *character*.
453+
/// Produce the levels for this paragraph as needed for reordering, one level per *character*
454+
/// in the paragraph. The returned vector includes characters that are not included
455+
/// in the `line`, but will not adjust them.
456+
///
457+
/// This runs [Rule L1], you can run
458+
/// [Rule L2] by calling [`Self::reorder_visual()`].
459+
/// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead
460+
/// to avoid non-byte indices.
461+
///
462+
/// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`].
463+
///
464+
/// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
465+
/// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
389466
#[cfg_attr(feature = "flame_it", flamer::flame)]
390467
pub fn reordered_levels_per_char(
391468
&self,
@@ -397,6 +474,11 @@ impl<'text> BidiInfo<'text> {
397474
}
398475

399476
/// Re-order a line based on resolved levels and return the line in display order.
477+
///
478+
/// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring.
479+
///
480+
/// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
481+
/// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
400482
#[cfg_attr(feature = "flame_it", flamer::flame)]
401483
pub fn reorder_line(&self, para: &ParagraphInfo, line: Range<usize>) -> Cow<'text, str> {
402484
let (levels, runs) = self.visual_runs(para, line.clone());
@@ -536,69 +618,33 @@ impl<'text> BidiInfo<'text> {
536618
///
537619
/// `line` is a range of bytes indices within `levels`.
538620
///
621+
/// The first return value is a vector of levels used by the reordering algorithm,
622+
/// i.e. the result of [Rule L1]. The second return value is a vector of level runs,
623+
/// the result of [Rule L2], showing the visual order that each level run (a run of text with the
624+
/// same level) should be displayed. Within each run, the display order can be checked
625+
/// against the Level vector.
626+
///
627+
/// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring),
628+
/// as that should be handled by the engine using this API.
629+
///
630+
/// Conceptually, this is the same as running [`Self::reordered_levels()`] followed by
631+
/// [`Self::reorder_visual()`], however it returns the result as a list of level runs instead
632+
/// of producing a level map, since one may wish to deal with the fact that this is operating on
633+
/// byte rather than character indices.
634+
///
539635
/// <http://www.unicode.org/reports/tr9/#Reordering_Resolved_Levels>
636+
///
637+
/// [Rule L1]: https://www.unicode.org/reports/tr9/#L1
638+
/// [Rule L2]: https://www.unicode.org/reports/tr9/#L2
639+
/// [Rule L3]: https://www.unicode.org/reports/tr9/#L3
640+
/// [Rule L4]: https://www.unicode.org/reports/tr9/#L4
540641
#[cfg_attr(feature = "flame_it", flamer::flame)]
541642
pub fn visual_runs(
542643
&self,
543644
para: &ParagraphInfo,
544645
line: Range<usize>,
545646
) -> (Vec<Level>, Vec<LevelRun>) {
546-
assert!(line.start <= self.levels.len());
547-
assert!(line.end <= self.levels.len());
548-
549-
let mut levels = self.levels.clone();
550-
let line_classes = &self.original_classes[line.clone()];
551-
let line_levels = &mut levels[line.clone()];
552-
553-
// Reset some whitespace chars to paragraph level.
554-
// <http://www.unicode.org/reports/tr9/#L1>
555-
let line_str: &str = &self.text[line.clone()];
556-
let mut reset_from: Option<usize> = Some(0);
557-
let mut reset_to: Option<usize> = None;
558-
let mut prev_level = para.level;
559-
for (i, c) in line_str.char_indices() {
560-
match line_classes[i] {
561-
// Segment separator, Paragraph separator
562-
B | S => {
563-
assert_eq!(reset_to, None);
564-
reset_to = Some(i + c.len_utf8());
565-
if reset_from == None {
566-
reset_from = Some(i);
567-
}
568-
}
569-
// Whitespace, isolate formatting
570-
WS | FSI | LRI | RLI | PDI => {
571-
if reset_from == None {
572-
reset_from = Some(i);
573-
}
574-
}
575-
// <https://www.unicode.org/reports/tr9/#Retaining_Explicit_Formatting_Characters>
576-
// same as above + set the level
577-
RLE | LRE | RLO | LRO | PDF | BN => {
578-
if reset_from == None {
579-
reset_from = Some(i);
580-
}
581-
// also set the level to previous
582-
line_levels[i] = prev_level;
583-
}
584-
_ => {
585-
reset_from = None;
586-
}
587-
}
588-
if let (Some(from), Some(to)) = (reset_from, reset_to) {
589-
for level in &mut line_levels[from..to] {
590-
*level = para.level;
591-
}
592-
reset_from = None;
593-
reset_to = None;
594-
}
595-
prev_level = line_levels[i];
596-
}
597-
if let Some(from) = reset_from {
598-
for level in &mut line_levels[from..] {
599-
*level = para.level;
600-
}
601-
}
647+
let levels = self.reordered_levels(para, line.clone());
602648

603649
// Find consecutive level runs.
604650
let mut runs = Vec::new();
@@ -626,31 +672,25 @@ impl<'text> BidiInfo<'text> {
626672

627673
// Stop at the lowest *odd* level.
628674
min_level = min_level.new_lowest_ge_rtl().expect("Level error");
629-
630675
// This loop goes through contiguous chunks of level runs that have a level
631676
// ≥ max_level and reverses their contents, reducing max_level by 1 each time.
632-
//
633-
// It can do this check with the original levels instead of checking reorderings because all
634-
// prior reorderings will have been for contiguous chunks of levels >> max, which will
635-
// be a subset of these chunks anyway.
636677
while max_level >= min_level {
637678
// Look for the start of a sequence of consecutive runs of max_level or higher.
638679
let mut seq_start = 0;
639680
while seq_start < run_count {
640-
if self.levels[runs[seq_start].start] < max_level {
681+
if levels[runs[seq_start].start] < max_level {
641682
seq_start += 1;
642683
continue;
643684
}
644685

645686
// Found the start of a sequence. Now find the end.
646687
let mut seq_end = seq_start + 1;
647688
while seq_end < run_count {
648-
if self.levels[runs[seq_end].start] < max_level {
689+
if levels[runs[seq_end].start] < max_level {
649690
break;
650691
}
651692
seq_end += 1;
652693
}
653-
654694
// Reverse the runs within this sequence.
655695
runs[seq_start..seq_end].reverse();
656696

@@ -660,7 +700,6 @@ impl<'text> BidiInfo<'text> {
660700
.lower(1)
661701
.expect("Lowering embedding level below zero");
662702
}
663-
664703
(levels, runs)
665704
}
666705

@@ -984,7 +1023,7 @@ mod tests {
9841023
// Testing for RLE Character
9851024
assert_eq!(
9861025
reorder_paras("\u{202B}abc אבג\u{202C}"),
987-
vec!["\u{202B}\u{202C}גבא abc"]
1026+
vec!["\u{202b}גבא abc\u{202c}"]
9881027
);
9891028

9901029
// Testing neutral characters

tests/conformance_tests.rs

Lines changed: 75 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,10 +7,12 @@
77
// option. This file may not be copied, modified, or distributed
88
// except according to those terms.
99

10+
use std::collections::BTreeMap;
1011
use unicode_bidi::bidi_class;
1112
use unicode_bidi::{format_chars, level, BidiInfo, Level};
1213

1314
#[derive(Debug)]
15+
#[allow(dead_code)] // lint ignores the Debug impl but that's what we use this for
1416
struct Fail {
1517
pub line_num: usize,
1618
pub input_base_level: Option<Level>,
@@ -21,7 +23,41 @@ struct Fail {
2123
pub exp_ordering: Vec<String>,
2224
pub actual_base_level: Option<Level>,
2325
pub actual_levels: Vec<Level>,
24-
// TODO pub actual_ordering: Vec<String>,
26+
/// A list of reordered indices, with X9 characters removed
27+
pub actual_ordering: Vec<String>,
28+
/// The full reordered index map (map[visual] = logical)
29+
/// without X9 characters removed
30+
pub actual_unfiltered_ordering: Vec<usize>,
31+
}
32+
33+
/// Turns the output of BidiInfo::visual_runs() to a per-*character* visual-to-logical map,
34+
/// compatible with that returned by `reorder_visual()`
35+
fn reorder_map_from_visual_runs(info: &BidiInfo) -> Vec<usize> {
36+
let para = &info.paragraphs[0];
37+
let (levels, runs) = info.visual_runs(para, para.range.clone());
38+
let char_index_map: BTreeMap<usize, usize> = info
39+
.text
40+
.char_indices()
41+
.enumerate()
42+
.map(|(logical, (byte, _ch))| (byte, logical))
43+
.collect();
44+
let mut map = Vec::new();
45+
for run in runs {
46+
if levels[run.start].is_rtl() {
47+
for byte_idx in run.rev() {
48+
if let Some(logical) = char_index_map.get(&byte_idx) {
49+
map.push(*logical);
50+
}
51+
}
52+
} else {
53+
for byte_idx in run {
54+
if let Some(logical) = char_index_map.get(&byte_idx) {
55+
map.push(*logical);
56+
}
57+
}
58+
}
59+
}
60+
map
2561
}
2662

2763
#[test]
@@ -82,7 +118,23 @@ fn test_basic_conformance() {
82118
let exp_levels: Vec<String> = exp_levels.iter().map(|x| x.to_owned()).collect();
83119
let para = &bidi_info.paragraphs[0];
84120
let levels = bidi_info.reordered_levels_per_char(para, para.range.clone());
85-
if levels != exp_levels {
121+
122+
let reorder_map = BidiInfo::reorder_visual(&levels);
123+
let visual_runs_levels = reorder_map_from_visual_runs(&bidi_info);
124+
125+
// The conformance tests only require this to be true for the non-ignored characters
126+
// However, as an internal invariant of this crate we would like to ensure these stay
127+
// the same to reduce confusion. This is an assert instead of appending to the `fails`
128+
// list since it is testing an internal invariant between two APIs.
129+
assert_eq!(reorder_map, visual_runs_levels, "Maps returned by reorder_visual() and visual_runs() must be the same, for line {line}");
130+
131+
let actual_ordering: Vec<String> = reorder_map
132+
.iter()
133+
.filter(|logical_idx| exp_levels[**logical_idx] != "x")
134+
.map(|logical_idx| logical_idx.to_string())
135+
.collect();
136+
137+
if levels != exp_levels || actual_ordering != exp_ordering {
86138
fails.push(Fail {
87139
line_num: line_idx + 1,
88140
input_base_level,
@@ -91,6 +143,8 @@ fn test_basic_conformance() {
91143
exp_base_level: None,
92144
exp_levels: exp_levels.to_owned(),
93145
exp_ordering: exp_ordering.to_owned(),
146+
actual_ordering,
147+
actual_unfiltered_ordering: reorder_map,
94148
actual_base_level: None,
95149
actual_levels: levels.to_owned(),
96150
});
@@ -175,15 +229,32 @@ fn test_character_conformance() {
175229
// Check levels
176230
let para = &bidi_info.paragraphs[0];
177231
let levels = bidi_info.reordered_levels_per_char(para, para.range.clone());
178-
if levels != exp_levels {
232+
233+
let reorder_map = BidiInfo::reorder_visual(&levels);
234+
let visual_runs_levels = reorder_map_from_visual_runs(&bidi_info);
235+
236+
// The conformance tests only require this to be true for the non-ignored characters
237+
// However, as an internal invariant of this crate we would like to ensure these stay
238+
// the same to reduce confusion. This is an assert instead of appending to the `fails`
239+
// list since it is testing an internal invariant between two APIs.
240+
assert_eq!(reorder_map, visual_runs_levels, "Maps returned by reorder_visual() and visual_runs() must be the same, for line {line}");
241+
242+
let actual_ordering: Vec<String> = reorder_map
243+
.iter()
244+
.filter(|logical_idx| exp_levels[**logical_idx] != "x")
245+
.map(|logical_idx| logical_idx.to_string())
246+
.collect();
247+
if levels != exp_levels || exp_ordering != actual_ordering {
179248
fails.push(Fail {
180249
line_num: line_idx + 1,
181250
input_base_level,
182251
input_classes: vec![],
183252
input_string: input_string.to_owned(),
184253
exp_base_level: Some(exp_base_level),
185254
exp_levels: exp_levels.to_owned(),
186-
exp_ordering: exp_ordering.to_owned(),
255+
exp_ordering: exp_ordering,
256+
actual_ordering: actual_ordering,
257+
actual_unfiltered_ordering: reorder_map,
187258
actual_base_level: None,
188259
actual_levels: levels.to_owned(),
189260
});

0 commit comments

Comments
 (0)