Skip to content

Commit ca38434

Browse files
committed
auto merge of #15638 : blake2-ppc/rust/ptr-arithmetic-chars, r=huonw
Reimplement the string slice's `Iterator<char>` by wrapping the already efficient slice iterator.

The iterator uses our guarantee that the string contains valid UTF-8, but its only unsafe code is transmuting the decoded `u32` into `char`.

Benchmarks suggest that the runtime of the `Chars` benchmarks is reduced by up to 30%, and the runtime of `Chars` reversed is reduced by up to 60%.

```
BEFORE
test str::bench::char_indicesator ... bench: 124 ns/iter (+/- 1)
test str::bench::char_indicesator_rev ... bench: 188 ns/iter (+/- 9)
test str::bench::char_iterator ... bench: 122 ns/iter (+/- 2)
test str::bench::char_iterator_ascii ... bench: 302 ns/iter (+/- 41)
test str::bench::char_iterator_for ... bench: 123 ns/iter (+/- 4)
test str::bench::char_iterator_rev ... bench: 189 ns/iter (+/- 14)
test str::bench::char_iterator_rev_for ... bench: 177 ns/iter (+/- 4)

AFTER
test str::bench::char_indicesator ... bench: 85 ns/iter (+/- 3)
test str::bench::char_indicesator_rev ... bench: 82 ns/iter (+/- 2)
test str::bench::char_iterator ... bench: 100 ns/iter (+/- 3)
test str::bench::char_iterator_ascii ... bench: 317 ns/iter (+/- 3)
test str::bench::char_iterator_for ... bench: 86 ns/iter (+/- 2)
test str::bench::char_iterator_rev ... bench: 80 ns/iter (+/- 6)
test str::bench::char_iterator_rev_for ... bench: 68 ns/iter (+/- 0)
```

Note: Branch name is no longer indicative of the implementation.
2 parents e0a6e2b + c5e0736 commit ca38434

File tree

2 files changed

+161
-57
lines changed

2 files changed

+161
-57
lines changed

src/libcollections/str.rs

+48-6
Original file line number | Diff line number | Diff line change
@@ -808,6 +808,7 @@ impl OwnedStr for String {
808808
#[cfg(test)]
809809
mod tests {
810810
use std::iter::AdditiveIterator;
811+
use std::iter::range;
811812
use std::default::Default;
812813
use std::char::Char;
813814
use std::clone::Clone;
@@ -1610,6 +1611,30 @@ mod tests {
16101611
assert_eq!(pos, v.len());
16111612
}
16121613

1614+
#[test]
1615+
fn test_chars_decoding() {
1616+
let mut bytes = [0u8, ..4];
1617+
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
1618+
let len = c.encode_utf8(bytes);
1619+
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
1620+
if Some(c) != s.chars().next() {
1621+
fail!("character {:x}={} does not decode correctly", c as u32, c);
1622+
}
1623+
}
1624+
}
1625+
1626+
#[test]
1627+
fn test_chars_rev_decoding() {
1628+
let mut bytes = [0u8, ..4];
1629+
for c in range(0u32, 0x110000).filter_map(|c| ::core::char::from_u32(c)) {
1630+
let len = c.encode_utf8(bytes);
1631+
let s = ::core::str::from_utf8(bytes.slice_to(len)).unwrap();
1632+
if Some(c) != s.chars().rev().next() {
1633+
fail!("character {:x}={} does not decode correctly", c as u32, c);
1634+
}
1635+
}
1636+
}
1637+
16131638
#[test]
16141639
fn test_iterator_clone() {
16151640
let s = "ศไทย中华Việt Nam";
@@ -2240,16 +2265,26 @@ mod tests {
22402265
#[cfg(test)]
22412266
mod bench {
22422267
use test::Bencher;
2268+
use test::black_box;
22432269
use super::*;
2270+
use std::option::{None, Some};
22442271
use std::iter::{Iterator, DoubleEndedIterator};
22452272
use std::collections::Collection;
22462273

22472274
#[bench]
22482275
fn char_iterator(b: &mut Bencher) {
22492276
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
2250-
let len = s.char_len();
22512277

2252-
b.iter(|| assert_eq!(s.chars().count(), len));
2278+
b.iter(|| s.chars().count());
2279+
}
2280+
2281+
#[bench]
2282+
fn char_iterator_for(b: &mut Bencher) {
2283+
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
2284+
2285+
b.iter(|| {
2286+
for ch in s.chars() { black_box(ch) }
2287+
});
22532288
}
22542289

22552290
#[bench]
@@ -2260,17 +2295,24 @@ mod bench {
22602295
Mary had a little lamb, Little lamb
22612296
Mary had a little lamb, Little lamb
22622297
Mary had a little lamb, Little lamb";
2263-
let len = s.char_len();
22642298

2265-
b.iter(|| assert_eq!(s.chars().count(), len));
2299+
b.iter(|| s.chars().count());
22662300
}
22672301

22682302
#[bench]
22692303
fn char_iterator_rev(b: &mut Bencher) {
22702304
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
2271-
let len = s.char_len();
22722305

2273-
b.iter(|| assert_eq!(s.chars().rev().count(), len));
2306+
b.iter(|| s.chars().rev().count());
2307+
}
2308+
2309+
#[bench]
2310+
fn char_iterator_rev_for(b: &mut Bencher) {
2311+
let s = "ศไทย中华Việt Nam; Mary had a little lamb, Little lamb";
2312+
2313+
b.iter(|| {
2314+
for ch in s.chars().rev() { black_box(ch) }
2315+
});
22742316
}
22752317

22762318
#[bench]

src/libcore/str.rs

+113-51
Original file line number | Diff line number | Diff line change
@@ -97,47 +97,110 @@ impl<'a> CharEq for &'a [char] {
9797
Section: Iterators
9898
*/
9999

100-
/// External iterator for a string's characters.
101-
/// Use with the `std::iter` module.
100+
/// Iterator for the char (representing *Unicode Scalar Values*) of a string
101+
///
102+
/// Created with the method `.chars()`.
102103
#[deriving(Clone)]
103104
pub struct Chars<'a> {
104-
/// The slice remaining to be iterated
105-
string: &'a str,
105+
iter: slice::Items<'a, u8>
106+
}
107+
108+
// Return the initial codepoint accumulator for the first byte.
109+
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
110+
// for width 3, and 3 bits for width 4
111+
macro_rules! utf8_first_byte(
112+
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
113+
)
114+
115+
// return the value of $ch updated with continuation byte $byte
116+
macro_rules! utf8_acc_cont_byte(
117+
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & CONT_MASK) as u32)
118+
)
119+
120+
macro_rules! utf8_is_cont_byte(
121+
($byte:expr) => (($byte & !CONT_MASK) == TAG_CONT_U8)
122+
)
123+
124+
#[inline]
125+
fn unwrap_or_0(opt: Option<&u8>) -> u8 {
126+
match opt {
127+
Some(&byte) => byte,
128+
None => 0,
129+
}
106130
}
107131

108132
impl<'a> Iterator<char> for Chars<'a> {
109133
#[inline]
110134
fn next(&mut self) -> Option<char> {
111-
// Decode the next codepoint, then update
112-
// the slice to be just the remaining part
113-
if self.string.len() != 0 {
114-
let CharRange {ch, next} = self.string.char_range_at(0);
115-
unsafe {
116-
self.string = raw::slice_unchecked(self.string, next, self.string.len());
135+
// Decode UTF-8, using the valid UTF-8 invariant
136+
let x = match self.iter.next() {
137+
None => return None,
138+
Some(&next_byte) if next_byte < 128 => return Some(next_byte as char),
139+
Some(&next_byte) => next_byte,
140+
};
141+
142+
// Multibyte case follows
143+
// Decode from a byte combination out of: [[[x y] z] w]
144+
// NOTE: Performance is sensitive to the exact formulation here
145+
let init = utf8_first_byte!(x, 2);
146+
let y = unwrap_or_0(self.iter.next());
147+
let mut ch = utf8_acc_cont_byte!(init, y);
148+
if x >= 0xE0 {
149+
// [[x y z] w] case
150+
// 5th bit in 0xE0 .. 0xEF is always clear, so `init` is still valid
151+
let z = unwrap_or_0(self.iter.next());
152+
let y_z = utf8_acc_cont_byte!((y & CONT_MASK) as u32, z);
153+
ch = init << 12 | y_z;
154+
if x >= 0xF0 {
155+
// [x y z w] case
156+
// use only the lower 3 bits of `init`
157+
let w = unwrap_or_0(self.iter.next());
158+
ch = (init & 7) << 18 | utf8_acc_cont_byte!(y_z, w);
117159
}
118-
Some(ch)
119-
} else {
120-
None
160+
}
161+
162+
// str invariant says `ch` is a valid Unicode Scalar Value
163+
unsafe {
164+
Some(mem::transmute(ch))
121165
}
122166
}
123167

124168
#[inline]
125169
fn size_hint(&self) -> (uint, Option<uint>) {
126-
(self.string.len().saturating_add(3)/4, Some(self.string.len()))
170+
let (len, _) = self.iter.size_hint();
171+
(len.saturating_add(3) / 4, Some(len))
127172
}
128173
}
129174

130175
impl<'a> DoubleEndedIterator<char> for Chars<'a> {
131176
#[inline]
132177
fn next_back(&mut self) -> Option<char> {
133-
if self.string.len() != 0 {
134-
let CharRange {ch, next} = self.string.char_range_at_reverse(self.string.len());
135-
unsafe {
136-
self.string = raw::slice_unchecked(self.string, 0, next);
178+
let w = match self.iter.next_back() {
179+
None => return None,
180+
Some(&back_byte) if back_byte < 128 => return Some(back_byte as char),
181+
Some(&back_byte) => back_byte,
182+
};
183+
184+
// Multibyte case follows
185+
// Decode from a byte combination out of: [x [y [z w]]]
186+
let mut ch;
187+
let z = unwrap_or_0(self.iter.next_back());
188+
ch = utf8_first_byte!(z, 2);
189+
if utf8_is_cont_byte!(z) {
190+
let y = unwrap_or_0(self.iter.next_back());
191+
ch = utf8_first_byte!(y, 3);
192+
if utf8_is_cont_byte!(y) {
193+
let x = unwrap_or_0(self.iter.next_back());
194+
ch = utf8_first_byte!(x, 4);
195+
ch = utf8_acc_cont_byte!(ch, y);
137196
}
138-
Some(ch)
139-
} else {
140-
None
197+
ch = utf8_acc_cont_byte!(ch, z);
198+
}
199+
ch = utf8_acc_cont_byte!(ch, w);
200+
201+
// str invariant says `ch` is a valid Unicode Scalar Value
202+
unsafe {
203+
Some(mem::transmute(ch))
141204
}
142205
}
143206
}
@@ -146,18 +209,23 @@ impl<'a> DoubleEndedIterator<char> for Chars<'a> {
146209
/// Use with the `std::iter` module.
147210
#[deriving(Clone)]
148211
pub struct CharOffsets<'a> {
149-
/// The original string to be iterated
150-
string: &'a str,
212+
front_offset: uint,
151213
iter: Chars<'a>,
152214
}
153215

154216
impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
155217
#[inline]
156218
fn next(&mut self) -> Option<(uint, char)> {
157-
// Compute the byte offset by using the pointer offset between
158-
// the original string slice and the iterator's remaining part
159-
let offset = self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
160-
self.iter.next().map(|ch| (offset, ch))
219+
let (pre_len, _) = self.iter.iter.size_hint();
220+
match self.iter.next() {
221+
None => None,
222+
Some(ch) => {
223+
let index = self.front_offset;
224+
let (len, _) = self.iter.iter.size_hint();
225+
self.front_offset += pre_len - len;
226+
Some((index, ch))
227+
}
228+
}
161229
}
162230

163231
#[inline]
@@ -169,11 +237,14 @@ impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
169237
impl<'a> DoubleEndedIterator<(uint, char)> for CharOffsets<'a> {
170238
#[inline]
171239
fn next_back(&mut self) -> Option<(uint, char)> {
172-
self.iter.next_back().map(|ch| {
173-
let offset = self.iter.string.len() +
174-
self.iter.string.as_ptr() as uint - self.string.as_ptr() as uint;
175-
(offset, ch)
176-
})
240+
match self.iter.next_back() {
241+
None => None,
242+
Some(ch) => {
243+
let (len, _) = self.iter.iter.size_hint();
244+
let index = self.front_offset + len;
245+
Some((index, ch))
246+
}
247+
}
177248
}
178249
}
179250

@@ -672,9 +743,9 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
672743
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
673744
// %xF4 %x80-8F 2( UTF8-tail )
674745
match w {
675-
2 => if second & 192 != TAG_CONT_U8 {err!()},
746+
2 => if second & !CONT_MASK != TAG_CONT_U8 {err!()},
676747
3 => {
677-
match (first, second, next!() & 192) {
748+
match (first, second, next!() & !CONT_MASK) {
678749
(0xE0 , 0xA0 .. 0xBF, TAG_CONT_U8) |
679750
(0xE1 .. 0xEC, 0x80 .. 0xBF, TAG_CONT_U8) |
680751
(0xED , 0x80 .. 0x9F, TAG_CONT_U8) |
@@ -683,7 +754,7 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
683754
}
684755
}
685756
4 => {
686-
match (first, second, next!() & 192, next!() & 192) {
757+
match (first, second, next!() & !CONT_MASK, next!() & !CONT_MASK) {
687758
(0xF0 , 0x90 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
688759
(0xF1 .. 0xF3, 0x80 .. 0xBF, TAG_CONT_U8, TAG_CONT_U8) |
689760
(0xF4 , 0x80 .. 0x8F, TAG_CONT_U8, TAG_CONT_U8) => {}
@@ -880,19 +951,10 @@ pub struct CharRange {
880951
pub next: uint,
881952
}
882953

883-
// Return the initial codepoint accumulator for the first byte.
884-
// The first byte is special, only want bottom 5 bits for width 2, 4 bits
885-
// for width 3, and 3 bits for width 4
886-
macro_rules! utf8_first_byte(
887-
($byte:expr, $width:expr) => (($byte & (0x7F >> $width)) as u32)
888-
)
889-
890-
// return the value of $ch updated with continuation byte $byte
891-
macro_rules! utf8_acc_cont_byte(
892-
($ch:expr, $byte:expr) => (($ch << 6) | ($byte & 63u8) as u32)
893-
)
894-
895-
static TAG_CONT_U8: u8 = 128u8;
954+
/// Mask of the value bits of a continuation byte
955+
static CONT_MASK: u8 = 0b0011_1111u8;
956+
/// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
957+
static TAG_CONT_U8: u8 = 0b1000_0000u8;
896958

897959
/// Unsafe operations
898960
pub mod raw {
@@ -1608,7 +1670,7 @@ impl<'a> StrSlice<'a> for &'a str {
16081670

16091671
#[inline]
16101672
fn chars(&self) -> Chars<'a> {
1611-
Chars{string: *self}
1673+
Chars{iter: self.as_bytes().iter()}
16121674
}
16131675

16141676
#[inline]
@@ -1618,7 +1680,7 @@ impl<'a> StrSlice<'a> for &'a str {
16181680

16191681
#[inline]
16201682
fn char_indices(&self) -> CharOffsets<'a> {
1621-
CharOffsets{string: *self, iter: self.chars()}
1683+
CharOffsets{front_offset: 0, iter: self.chars()}
16221684
}
16231685

16241686
#[inline]
@@ -1828,7 +1890,7 @@ impl<'a> StrSlice<'a> for &'a str {
18281890
// Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
18291891
fn multibyte_char_range_at_reverse(s: &str, mut i: uint) -> CharRange {
18301892
// while there is a previous byte == 10......
1831-
while i > 0 && s.as_bytes()[i] & 192u8 == TAG_CONT_U8 {
1893+
while i > 0 && s.as_bytes()[i] & !CONT_MASK == TAG_CONT_U8 {
18321894
i -= 1u;
18331895
}
18341896

0 commit comments

Comments
 (0)