Skip to content

Commit 6998018

Browse files
authored
Auto merge of #33907 - strake:decode_utf8, r=alexcrichton
add core::char::DecodeUtf8 See [issue](#33906)
2 parents 3c85f41 + 837029f commit 6998018

File tree

5 files changed

+80
-0
lines changed

5 files changed

+80
-0
lines changed

src/libcore/char.rs

+47
Original file line numberDiff line numberDiff line change
@@ -676,3 +676,50 @@ impl Iterator for EncodeUtf16 {
676676
self.as_slice().iter().size_hint()
677677
}
678678
}
679+
680+
681+
/// An iterator that decodes the bytes yielded by an inner iterator as UTF-8,
682+
/// producing the characters those bytes represent.
///
/// Constructed by `decode_utf8`. Each item is `Ok(char)` for a well-formed
/// sequence or `Err(InvalidSequence)` for a malformed one.
683+
#[unstable(feature = "decode_utf8", issue = "33906")]
684+
#[derive(Clone, Debug)]
685+
// The byte iterator is wrapped in `Peekable` so that the byte following a lead
// byte can be inspected before deciding whether to consume it.
pub struct DecodeUtf8<I: Iterator<Item = u8>>(::iter::Peekable<I>);
686+
687+
/// Decodes an `Iterator` of bytes as UTF-8.
///
/// Accepts any `IntoIterator<Item = u8>` and returns a `DecodeUtf8` adapter
/// around it. No bytes are read here; decoding happens as the returned
/// iterator is advanced.
688+
#[unstable(feature = "decode_utf8", issue = "33906")]
689+
#[inline]
690+
pub fn decode_utf8<I: IntoIterator<Item = u8>>(i: I) -> DecodeUtf8<I::IntoIter> {
691+
DecodeUtf8(i.into_iter().peekable()) // peekable: the decoder needs one byte of lookahead
692+
}
693+
694+
/// The error yielded by `<DecodeUtf8 as Iterator>::next` when the input bytes
/// do not form a valid UTF-8 sequence.
695+
#[unstable(feature = "decode_utf8", issue = "33906")]
696+
#[derive(PartialEq, Debug)]
697+
pub struct InvalidSequence(()); // the `()` field is private, so values can only be built inside this module
698+
699+
#[unstable(feature = "decode_utf8", issue = "33906")]
700+
impl<I: Iterator<Item = u8>> Iterator for DecodeUtf8<I> {
701+
type Item = Result<char, InvalidSequence>;
702+
/// Decodes the next character from the underlying byte iterator.
///
/// Returns `None` when the byte iterator is exhausted, `Some(Ok(c))` for a
/// well-formed sequence, and `Some(Err(InvalidSequence))` otherwise. A byte
/// that fails the continuation-byte check is only peeked, never consumed, so
/// it becomes the first byte examined by the following call.
#[inline]
703+
fn next(&mut self) -> Option<Result<char, InvalidSequence>> {
704+
self.0.next().map(|b| { // `b` is the lead byte; `None` from the inner iterator passes straight through
705+
if b & 0x80 == 0 { Ok(b as char) } else { // high bit clear: single-byte (ASCII) character
706+
let l = (!b).leading_zeros() as usize; // count of leading 1 bits in `b`, i.e. the sequence length the lead byte announces
707+
// l == 1 is a continuation byte (10xxxxxx) in lead position; l > 6 is 0xFE or 0xFF.
// NOTE(review): l == 5 and l == 6 pass this check and consume their
// continuation bytes; they appear to be rejected only by the final
// `l == x.len_utf8()` comparison — confirm `len_utf8` never exceeds 4.
if l < 2 || l > 6 { return Err(InvalidSequence(())) };
708+
let mut x = (b as u32) & (0x7F >> l); // mask off the length prefix, keeping the lead byte's payload bits
709+
for _ in 0..l-1 { // one iteration per expected continuation byte
710+
match self.0.peek() {
711+
Some(&b) if b & 0xC0 == 0x80 => { // continuation bytes have the form 10xxxxxx
712+
self.0.next(); // consume the byte that was just peeked
713+
x = (x << 6) | (b as u32) & 0x3F; // append its low six payload bits (`&` binds tighter than `|`)
714+
},
715+
_ => return Err(InvalidSequence(())), // input ended or next byte is not a continuation; it is left unconsumed
716+
}
717+
}
718+
match from_u32(x) {
719+
Some(x) if l == x.len_utf8() => Ok(x), // accept only if the announced length equals the char's own encoded length (rejects overlong forms)
720+
_ => Err(InvalidSequence(())), // `from_u32` returned `None`, or the length did not match
721+
}
722+
}
723+
})
724+
}
725+
}

src/libcoretest/char.rs

+29
Original file line numberDiff line numberDiff line change
@@ -302,3 +302,32 @@ fn eu_iterator_specializations() {
302302
check('\u{12340}');
303303
check('\u{10FFFF}');
304304
}
305+
306+
// Table-driven test: each entry pairs an expected string with the input bytes.
// Decode errors are mapped to U+FFFD before comparison, so every "�" in an
// expected string stands for exactly one `Err(InvalidSequence)` item.
#[test]
307+
fn test_decode_utf8() {
308+
use core::char::*;
309+
use core::iter::FromIterator;
310+
311+
for &(str, bs) in [("", &[] as &[u8]), // empty input yields no items
312+
("A", &[0x41u8] as &[u8]), // single ASCII byte
313+
("�", &[0xC1u8, 0x81u8] as &[u8]), // overlong two-byte encoding of 'A'
314+
("♥", &[0xE2u8, 0x99u8, 0xA5u8]), // well-formed three-byte sequence
315+
("♥A", &[0xE2u8, 0x99u8, 0xA5u8, 0x41u8] as &[u8]),
316+
("�", &[0xE2u8, 0x99u8] as &[u8]), // three-byte sequence cut short by end of input
317+
("�A", &[0xE2u8, 0x99u8, 0x41u8] as &[u8]), // cut short by an ASCII byte, which must still decode
318+
("�", &[0xC0u8] as &[u8]), // two-byte lead with no continuation byte
319+
("�A", &[0xC0u8, 0x41u8] as &[u8]),
320+
("�", &[0x80u8] as &[u8]), // continuation byte in lead position
321+
("�A", &[0x80u8, 0x41u8] as &[u8]),
322+
("�", &[0xFEu8] as &[u8]), // 0xFE: seven leading 1 bits, rejected as a lead byte
323+
("�A", &[0xFEu8, 0x41u8] as &[u8]),
324+
("�", &[0xFFu8] as &[u8]), // 0xFF: eight leading 1 bits, rejected as a lead byte
325+
("�A", &[0xFFu8, 0x41u8] as &[u8])].into_iter() {
326+
assert!(Iterator::eq(str.chars(),
327+
decode_utf8(bs.into_iter().map(|&b|b))
328+
.map(|r_b| r_b.unwrap_or('\u{FFFD}'))),
329+
"chars = {}, bytes = {:?}, decoded = {:?}", str, bs,
330+
Vec::from_iter(decode_utf8(bs.into_iter().map(|&b|b)) // decoded again only to build the failure message
331+
.map(|r_b| r_b.unwrap_or('\u{FFFD}'))));
332+
}
333+
}

src/libcoretest/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#![feature(core_private_bignum)]
1919
#![feature(core_private_diy_float)]
2020
#![feature(dec2flt)]
21+
#![feature(decode_utf8)]
2122
#![feature(fixed_size_array)]
2223
#![feature(flt2dec)]
2324
#![feature(libc)]

src/librustc_unicode/char.rs

+2
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ pub use core::char::{MAX, from_digit, from_u32, from_u32_unchecked};
3939
pub use core::char::{EncodeUtf16, EncodeUtf8, EscapeDefault, EscapeUnicode};
4040

4141
// unstable reexports
42+
#[unstable(feature = "decode_utf8", issue = "33906")]
43+
pub use core::char::{DecodeUtf8, decode_utf8};
4244
#[unstable(feature = "unicode", issue = "27783")]
4345
pub use tables::UNICODE_VERSION;
4446

src/librustc_unicode/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
#![no_std]
3434

3535
#![feature(core_char_ext)]
36+
#![feature(decode_utf8)]
3637
#![feature(lang_items)]
3738
#![feature(staged_api)]
3839
#![feature(unicode)]

0 commit comments

Comments
 (0)