@@ -97,47 +97,110 @@ impl<'a> CharEq for &'a [char] {
97
97
Section: Iterators
98
98
*/
99
99
100
- /// External iterator for a string's characters.
101
- /// Use with the `std::iter` module.
100
+ /// Iterator for the chars (representing *Unicode Scalar Values*) of a string
101
+ ///
102
+ /// Created with the method `.chars()`.
102
103
#[ deriving( Clone ) ]
103
104
pub struct Chars < ' a > {
104
- /// The slice remaining to be iterated
105
- string : & ' a str ,
105
+ iter : slice:: Items < ' a , u8 >
106
+ }
107
+
108
+ // Return the initial codepoint accumulator for the first byte.
109
+ // The first byte is special: keep only the bottom 5 bits for width 2, 4 bits
110
+ // for width 3, and 3 bits for width 4
111
+ macro_rules! utf8_first_byte(
112
+ ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as u32 )
113
+ )
114
+
115
+ // Return the value of $ch shifted left by 6 bits, with the low 6 bits of continuation byte $byte OR-ed in
116
+ macro_rules! utf8_acc_cont_byte(
117
+ ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & CONT_MASK ) as u32 )
118
+ )
119
+
120
+ macro_rules! utf8_is_cont_byte(
121
+ ( $byte: expr) => ( ( $byte & !CONT_MASK ) == TAG_CONT_U8 )
122
+ )
123
+
124
+ #[ inline]
125
+ fn unwrap_or_0 ( opt : Option < & u8 > ) -> u8 {
126
+ match opt {
127
+ Some ( & byte) => byte,
128
+ None => 0 ,
129
+ }
106
130
}
107
131
108
132
impl < ' a > Iterator < char > for Chars < ' a > {
109
133
#[ inline]
110
134
fn next ( & mut self ) -> Option < char > {
111
- // Decode the next codepoint, then update
112
- // the slice to be just the remaining part
113
- if self . string . len ( ) != 0 {
114
- let CharRange { ch, next} = self . string . char_range_at ( 0 ) ;
115
- unsafe {
116
- self . string = raw:: slice_unchecked ( self . string , next, self . string . len ( ) ) ;
135
+ // Decode UTF-8, relying on the invariant that a `str` always holds valid UTF-8
136
+ let x = match self . iter . next ( ) {
137
+ None => return None ,
138
+ Some ( & next_byte) if next_byte < 128 => return Some ( next_byte as char ) ,
139
+ Some ( & next_byte) => next_byte,
140
+ } ;
141
+
142
+ // Multibyte case follows
143
+ // Decode from a byte combination out of: [[[x y] z] w]
144
+ // NOTE: Performance is sensitive to the exact formulation here
145
+ let init = utf8_first_byte ! ( x, 2 ) ;
146
+ let y = unwrap_or_0 ( self . iter . next ( ) ) ;
147
+ let mut ch = utf8_acc_cont_byte ! ( init, y) ;
148
+ if x >= 0xE0 {
149
+ // [[x y z] w] case
150
+ // bit 4 (0x10) of 0xE0 .. 0xEF is always clear, so `init` (masked with 0x1F) is still valid
151
+ let z = unwrap_or_0 ( self . iter . next ( ) ) ;
152
+ let y_z = utf8_acc_cont_byte ! ( ( y & CONT_MASK ) as u32 , z) ;
153
+ ch = init << 12 | y_z;
154
+ if x >= 0xF0 {
155
+ // [x y z w] case
156
+ // use only the lower 3 bits of `init`
157
+ let w = unwrap_or_0 ( self . iter . next ( ) ) ;
158
+ ch = ( init & 7 ) << 18 | utf8_acc_cont_byte ! ( y_z, w) ;
117
159
}
118
- Some ( ch)
119
- } else {
120
- None
160
+ }
161
+
162
+ // SAFETY: the str invariant (valid UTF-8) guarantees `ch` is a valid Unicode Scalar Value
163
+ unsafe {
164
+ Some ( mem:: transmute ( ch) )
121
165
}
122
166
}
123
167
124
168
#[ inline]
125
169
fn size_hint ( & self ) -> ( uint , Option < uint > ) {
126
- ( self . string . len ( ) . saturating_add ( 3 ) /4 , Some ( self . string . len ( ) ) )
170
+ let ( len, _) = self . iter . size_hint ( ) ;
171
+ ( len. saturating_add ( 3 ) / 4 , Some ( len) )
127
172
}
128
173
}
129
174
130
175
impl < ' a > DoubleEndedIterator < char > for Chars < ' a > {
131
176
#[ inline]
132
177
fn next_back ( & mut self ) -> Option < char > {
133
- if self . string . len ( ) != 0 {
134
- let CharRange { ch, next} = self . string . char_range_at_reverse ( self . string . len ( ) ) ;
135
- unsafe {
136
- self . string = raw:: slice_unchecked ( self . string , 0 , next) ;
178
+ let w = match self . iter . next_back ( ) {
179
+ None => return None ,
180
+ Some ( & back_byte) if back_byte < 128 => return Some ( back_byte as char ) ,
181
+ Some ( & back_byte) => back_byte,
182
+ } ;
183
+
184
+ // Multibyte case follows
185
+ // Decode from a byte combination out of: [x [y [z w]]]
186
+ let mut ch;
187
+ let z = unwrap_or_0 ( self . iter . next_back ( ) ) ;
188
+ ch = utf8_first_byte ! ( z, 2 ) ;
189
+ if utf8_is_cont_byte ! ( z) {
190
+ let y = unwrap_or_0 ( self . iter . next_back ( ) ) ;
191
+ ch = utf8_first_byte ! ( y, 3 ) ;
192
+ if utf8_is_cont_byte ! ( y) {
193
+ let x = unwrap_or_0 ( self . iter . next_back ( ) ) ;
194
+ ch = utf8_first_byte ! ( x, 4 ) ;
195
+ ch = utf8_acc_cont_byte ! ( ch, y) ;
137
196
}
138
- Some ( ch)
139
- } else {
140
- None
197
+ ch = utf8_acc_cont_byte ! ( ch, z) ;
198
+ }
199
+ ch = utf8_acc_cont_byte ! ( ch, w) ;
200
+
201
+ // SAFETY: the str invariant (valid UTF-8) guarantees `ch` is a valid Unicode Scalar Value
202
+ unsafe {
203
+ Some ( mem:: transmute ( ch) )
141
204
}
142
205
}
143
206
}
@@ -146,18 +209,23 @@ impl<'a> DoubleEndedIterator<char> for Chars<'a> {
146
209
/// Use with the `std::iter` module.
147
210
#[ deriving( Clone ) ]
148
211
pub struct CharOffsets < ' a > {
149
- /// The original string to be iterated
150
- string : & ' a str ,
212
+ front_offset : uint ,
151
213
iter : Chars < ' a > ,
152
214
}
153
215
154
216
impl < ' a > Iterator < ( uint , char ) > for CharOffsets < ' a > {
155
217
#[ inline]
156
218
fn next ( & mut self ) -> Option < ( uint , char ) > {
157
- // Compute the byte offset by using the pointer offset between
158
- // the original string slice and the iterator's remaining part
159
- let offset = self . iter . string . as_ptr ( ) as uint - self . string . as_ptr ( ) as uint ;
160
- self . iter . next ( ) . map ( |ch| ( offset, ch) )
219
+ let ( pre_len, _) = self . iter . iter . size_hint ( ) ;
220
+ match self . iter . next ( ) {
221
+ None => None ,
222
+ Some ( ch) => {
223
+ let index = self . front_offset ;
224
+ let ( len, _) = self . iter . iter . size_hint ( ) ;
225
+ self . front_offset += pre_len - len;
226
+ Some ( ( index, ch) )
227
+ }
228
+ }
161
229
}
162
230
163
231
#[ inline]
@@ -169,11 +237,14 @@ impl<'a> Iterator<(uint, char)> for CharOffsets<'a> {
169
237
impl < ' a > DoubleEndedIterator < ( uint , char ) > for CharOffsets < ' a > {
170
238
#[ inline]
171
239
fn next_back ( & mut self ) -> Option < ( uint , char ) > {
172
- self . iter . next_back ( ) . map ( |ch| {
173
- let offset = self . iter . string . len ( ) +
174
- self . iter . string . as_ptr ( ) as uint - self . string . as_ptr ( ) as uint ;
175
- ( offset, ch)
176
- } )
240
+ match self . iter . next_back ( ) {
241
+ None => None ,
242
+ Some ( ch) => {
243
+ let ( len, _) = self . iter . iter . size_hint ( ) ;
244
+ let index = self . front_offset + len;
245
+ Some ( ( index, ch) )
246
+ }
247
+ }
177
248
}
178
249
}
179
250
@@ -672,9 +743,9 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
672
743
// UTF8-4 = %xF0 %x90-BF 2( UTF8-tail ) / %xF1-F3 3( UTF8-tail ) /
673
744
// %xF4 %x80-8F 2( UTF8-tail )
674
745
match w {
675
- 2 => if second & 192 != TAG_CONT_U8 { err ! ( ) } ,
746
+ 2 => if second & ! CONT_MASK != TAG_CONT_U8 { err ! ( ) } ,
676
747
3 => {
677
- match ( first, second, next ! ( ) & 192 ) {
748
+ match ( first, second, next ! ( ) & ! CONT_MASK ) {
678
749
( 0xE0 , 0xA0 .. 0xBF , TAG_CONT_U8 ) |
679
750
( 0xE1 .. 0xEC , 0x80 .. 0xBF , TAG_CONT_U8 ) |
680
751
( 0xED , 0x80 .. 0x9F , TAG_CONT_U8 ) |
@@ -683,7 +754,7 @@ fn run_utf8_validation_iterator(iter: &mut slice::Items<u8>) -> bool {
683
754
}
684
755
}
685
756
4 => {
686
- match ( first, second, next ! ( ) & 192 , next ! ( ) & 192 ) {
757
+ match ( first, second, next ! ( ) & ! CONT_MASK , next ! ( ) & ! CONT_MASK ) {
687
758
( 0xF0 , 0x90 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
688
759
( 0xF1 .. 0xF3 , 0x80 .. 0xBF , TAG_CONT_U8 , TAG_CONT_U8 ) |
689
760
( 0xF4 , 0x80 .. 0x8F , TAG_CONT_U8 , TAG_CONT_U8 ) => { }
@@ -880,19 +951,10 @@ pub struct CharRange {
880
951
pub next : uint ,
881
952
}
882
953
883
- // Return the initial codepoint accumulator for the first byte.
884
- // The first byte is special, only want bottom 5 bits for width 2, 4 bits
885
- // for width 3, and 3 bits for width 4
886
- macro_rules! utf8_first_byte(
887
- ( $byte: expr, $width: expr) => ( ( $byte & ( 0x7F >> $width) ) as u32 )
888
- )
889
-
890
- // return the value of $ch updated with continuation byte $byte
891
- macro_rules! utf8_acc_cont_byte(
892
- ( $ch: expr, $byte: expr) => ( ( $ch << 6 ) | ( $byte & 63u8 ) as u32 )
893
- )
894
-
895
- static TAG_CONT_U8 : u8 = 128u8 ;
954
+ /// Mask of the value bits of a continuation byte
955
+ static CONT_MASK : u8 = 0b0011_1111u8 ;
956
+ /// Value of the tag bits (tag mask is !CONT_MASK) of a continuation byte
957
+ static TAG_CONT_U8 : u8 = 0b1000_0000u8 ;
896
958
897
959
/// Unsafe operations
898
960
pub mod raw {
@@ -1608,7 +1670,7 @@ impl<'a> StrSlice<'a> for &'a str {
1608
1670
1609
1671
#[ inline]
1610
1672
fn chars ( & self ) -> Chars < ' a > {
1611
- Chars { string : * self }
1673
+ Chars { iter : self . as_bytes ( ) . iter ( ) }
1612
1674
}
1613
1675
1614
1676
#[ inline]
@@ -1618,7 +1680,7 @@ impl<'a> StrSlice<'a> for &'a str {
1618
1680
1619
1681
#[ inline]
1620
1682
fn char_indices ( & self ) -> CharOffsets < ' a > {
1621
- CharOffsets { string : * self , iter : self . chars ( ) }
1683
+ CharOffsets { front_offset : 0 , iter : self . chars ( ) }
1622
1684
}
1623
1685
1624
1686
#[ inline]
@@ -1828,7 +1890,7 @@ impl<'a> StrSlice<'a> for &'a str {
1828
1890
// Multibyte case is a fn to allow char_range_at_reverse to inline cleanly
1829
1891
fn multibyte_char_range_at_reverse ( s : & str , mut i : uint ) -> CharRange {
1830
1892
// while there is a previous byte == 10......
1831
- while i > 0 && s. as_bytes ( ) [ i] & 192u8 == TAG_CONT_U8 {
1893
+ while i > 0 && s. as_bytes ( ) [ i] & ! CONT_MASK == TAG_CONT_U8 {
1832
1894
i -= 1 u;
1833
1895
}
1834
1896
0 commit comments