Skip to content

Commit 84966f9

Browse files
authored
Simplify Hangul composition (#2200)
Removes the special-case Hangul composition code path, a premature optimization that did not actually make sense as an optimization.
1 parent 219e20f commit 84966f9

File tree

1 file changed

+4
-39
lines changed
  • components/normalizer/src

1 file changed

+4
-39
lines changed

components/normalizer/src/lib.rs

Lines changed: 4 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -1002,44 +1002,6 @@ where
10021002
return Some(starter);
10031003
}
10041004
}
1005-
// Even a Hangul LVT syllable may have decomposed if followed by something that
1006-
// could combine backwards with some other base.
1007-
// `buffer_pos` may be non-zero for NFKC and parenthesized Hangul.
1008-
if let Some(potential) = self.decomposition.buffer.get(self.decomposition.buffer_pos) {
1009-
let potential_c = potential.character();
1010-
let v = u32::from(potential_c).wrapping_sub(HANGUL_V_BASE);
1011-
if v < HANGUL_V_COUNT {
1012-
// Hangul vowel
1013-
let l = u32::from(starter).wrapping_sub(HANGUL_L_BASE);
1014-
if l < HANGUL_L_COUNT {
1015-
let lv = l * HANGUL_N_COUNT + v * HANGUL_T_COUNT;
1016-
// Safe, because the inputs are known to be in range.
1017-
starter = unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lv) };
1018-
1019-
self.decomposition.buffer_pos += 1;
1020-
}
1021-
}
1022-
}
1023-
1024-
if let Some(potential) = self.decomposition.buffer.get(self.decomposition.buffer_pos) {
1025-
let potential_c = potential.character();
1026-
if in_inclusive_range(potential_c, '\u{11A8}', '\u{11C2}') {
1027-
// Hangul trail
1028-
let lv = u32::from(starter).wrapping_sub(HANGUL_S_BASE);
1029-
if lv < HANGUL_S_COUNT && lv % HANGUL_T_COUNT == 0 {
1030-
let lvt = lv + u32::from(potential_c) - HANGUL_T_BASE;
1031-
// Safe, because the inputs are known to be in range.
1032-
starter = unsafe { char::from_u32_unchecked(HANGUL_S_BASE + lvt) };
1033-
1034-
self.decomposition.buffer_pos += 1;
1035-
}
1036-
}
1037-
}
1038-
1039-
// We could skip the loop based on the knowledge that we saw Hangul, but it
1040-
// would only be an optimization in a non-realistic case, so we'll let the loop
1041-
// below try composing stuff with Hangul without success.
1042-
10431005
// We first loop by index to avoid moving the contents of `buffer`, but
10441006
// if there's a discontiguous match, we'll start modifying `buffer` instead.
10451007
loop {
@@ -1055,7 +1017,7 @@ where
10551017
self.decomposition.buffer_pos = 0;
10561018
break;
10571019
};
1058-
if let Some(composed) = self.compose_non_hangul(starter, character) {
1020+
if let Some(composed) = self.compose(starter, character) {
10591021
starter = composed;
10601022
self.decomposition.buffer_pos += 1;
10611023
continue;
@@ -1086,6 +1048,9 @@ where
10861048
}
10871049
debug_assert!(ccc >= most_recent_skipped_ccc);
10881050
if ccc != most_recent_skipped_ccc {
1051+
// Using the non-Hangul version as a micro-optimization, since
1052+
// we already rejected the case where `character` is a starter
1053+
// above, and conjoining jamo are starters.
10891054
if let Some(composed) = self.compose_non_hangul(starter, character) {
10901055
self.decomposition.buffer.remove(i);
10911056
starter = composed;

0 commit comments

Comments
 (0)