Skip to content

Commit 79687ca

Browse files
authored
Merge pull request #100 from unicode-rs/safety-comments
Add safety comments
2 parents 71a54fa + a97388a commit 79687ca

File tree

1 file changed

+26
-3
lines changed

1 file changed

+26
-3
lines changed

src/normalize.rs

+26-3
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,7 @@ where
7171
}
7272

7373
#[inline]
74+
#[allow(unsafe_code)]
7475
fn decompose<D, F>(c: char, decompose_char: D, mut emit_char: F)
7576
where
7677
D: Fn(char) -> Option<&'static [char]>,
@@ -84,7 +85,10 @@ where
8485

8586
// Perform decomposition for Hangul
8687
if is_hangul_syllable(c) {
87-
decompose_hangul(c, emit_char);
88+
// Safety: Hangul Syllables invariant checked by is_hangul_syllable above
89+
unsafe {
90+
decompose_hangul(c, emit_char);
91+
}
8892
return;
8993
}
9094

@@ -127,27 +131,37 @@ const T_LAST: u32 = T_BASE + T_COUNT - 1;
127131
// i.e. `T_BASE + 1 ..= T_LAST`.
128132
const T_FIRST: u32 = T_BASE + 1;
129133

134+
// Safety-usable invariant: This ensures that c is a valid precomposed Hangul Syllable character (U+AC00..=U+D7A3, which lies inside the Hangul Syllables block U+AC00..U+D7AF)
130135
pub(crate) fn is_hangul_syllable(c: char) -> bool {
136+
// Safety: This checks the range 0xAC00 (S_BASE) to 0xD7A4 (S_BASE + S_COUNT), upholding the safety-usable invariant
131137
(c as u32) >= S_BASE && (c as u32) < (S_BASE + S_COUNT)
132138
}
133139

134140
// Decompose a precomposed Hangul syllable
135-
#[allow(unsafe_code)]
141+
// Safety: `s` MUST be a valid Hangul Syllable character, between U+AC00..U+D7AF
142+
#[allow(unsafe_code, unused_unsafe)]
136143
#[inline(always)]
137-
fn decompose_hangul<F>(s: char, mut emit_char: F)
144+
unsafe fn decompose_hangul<F>(s: char, mut emit_char: F)
138145
where
139146
F: FnMut(char),
140147
{
148+
// This will be at most 0x2baf, the size of the Hangul Syllables block
141149
let s_index = s as u32 - S_BASE;
150+
// This will be at most 0x2baf / (21 * 28), 19
142151
let l_index = s_index / N_COUNT;
143152
unsafe {
153+
// Safety: L_BASE (0x1100) plus at most 19 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
144154
emit_char(char::from_u32_unchecked(L_BASE + l_index));
145155

156+
// Safety: This will be at most (N_COUNT - 1) / T_COUNT = (V*T - 1) / T, which is V_COUNT - 1 = 20, i.e. strictly below V_COUNT = 21
146157
let v_index = (s_index % N_COUNT) / T_COUNT;
158+
// Safety: V_BASE (0x1161) plus at most 21 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
147159
emit_char(char::from_u32_unchecked(V_BASE + v_index));
148160

161+
// Safety: This will be at most T_COUNT - 1 (27)
149162
let t_index = s_index % T_COUNT;
150163
if t_index > 0 {
164+
// Safety: T_BASE (0x11A7) plus at most 27 is still going to be in range for a valid Unicode code point in the BMP (< 0xD800)
151165
emit_char(char::from_u32_unchecked(T_BASE + t_index));
152166
}
153167
}
@@ -173,14 +187,23 @@ fn compose_hangul(a: char, b: char) -> Option<char> {
173187
match (a, b) {
174188
// Compose a leading consonant and a vowel together into an LV_Syllable
175189
(L_BASE..=L_LAST, V_BASE..=V_LAST) => {
190+
// Safety: based on the above bounds, l_index will be strictly less than L_COUNT (19), i.e. at most 18,
191+
// and v_index will be strictly less than V_COUNT (21), i.e. at most 20 (assuming L_LAST and V_LAST are defined like T_LAST, as BASE + COUNT - 1)
176192
let l_index = a - L_BASE;
177193
let v_index = b - V_BASE;
194+
// Safety: With N_COUNT = 21 * 28 = 588 and T_COUNT = 28, this will be <= 18 * 588 + 20 * 28, which is 11144.
178195
let lv_index = l_index * N_COUNT + v_index * T_COUNT;
196+
// Safety: This is between 0xAC00 and 0xD788 (0xAC00 + 11144), which are in range for Hangul Syllables (U+AC00..U+D7AF) and also in range
197+
// for BMP unicode
179198
let s = S_BASE + lv_index;
199+
// Safety: We've verified this is in-range
180200
Some(unsafe { char::from_u32_unchecked(s) })
181201
}
182202
// Compose an LV_Syllable and a trailing consonant into an LVT_Syllable
183203
(S_BASE..=S_LAST, T_FIRST..=T_LAST) if (a - S_BASE) % T_COUNT == 0 => {
204+
// Safety: a is between 0xAC00 and (0xAC00 + 19 * 21 * 28). b - T_BASE is between 1 and 27 (T_FIRST is T_BASE + 1, T_LAST is T_BASE + T_COUNT - 1).
205+
// Adding a number 1 to 27 to a number that is at largest 0xD7A4 gives at most 0xD7BF, which will not go out of bounds to 0xD800 (where the
206+
// surrogates start), so this is safe.
184207
Some(unsafe { char::from_u32_unchecked(a + (b - T_BASE)) })
185208
}
186209
_ => None,

0 commit comments

Comments
 (0)