Commit 8aafbd2

Remove newline handling from preprocessing.

1 parent b6c0b0c commit 8aafbd2

File tree

1 file changed (+42 -24 lines)

src/tokenizer.rs

Lines changed: 42 additions & 24 deletions
@@ -32,8 +32,7 @@ impl Iterator<Node> for Tokenizer {
 
 #[inline]
 fn preprocess(input: &str) -> String {
-    // TODO: Is this faster if done in one pass?
-    input.replace("\r\n", "\n").replace("\r", "\n").replace("\x0C", "\n")
+    input.into_string()
 }
 
 
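Context for the change: the CSS Syntax spec's preprocessing step replaces "\r\n", "\r", and "\x0C" with "\n" before tokenizing. Chained replace() calls can copy the input up to three times, so the pass is dropped and the newline variants are instead recognized at every point in the tokenizer where a newline matters. A minimal sketch of the trade-off in current Rust (normalize and is_css_newline are illustrative names, not part of the patch):

// Old approach: normalize every newline variant up front.
// Each replace() may allocate and copy the whole input.
fn normalize(input: &str) -> String {
    input.replace("\r\n", "\n").replace('\r', "\n").replace('\x0C', "\n")
}

// New approach: leave the input alone and classify on the fly.
fn is_css_newline(c: char) -> bool {
    matches!(c, '\n' | '\r' | '\x0C')
}

fn main() {
    assert_eq!(normalize("a{\r\n}"), "a{\n}");
    // Un-normalized input still has its newline recognized in place:
    assert!("a{\r\n}".chars().any(is_css_newline));
}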
@@ -43,6 +42,12 @@ pub struct Tokenizer {
     position: uint,  // All counted in bytes, not characters
 }
 
+macro_rules! is_match(
+    ($value:expr, $($pattern:pat)|+) => (
+        match $value { $($pattern)|+ => true, _ => false }
+    );
+)
+
 
 impl Tokenizer {
     #[inline]
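The is_match! macro is only moved here (it previously sat below impl Tokenizer, see the deletion further down) so that the new has_newline_at method can use it. It is the same device that later reached the standard library as matches!, stable since Rust 1.42. A modern sketch, assuming edition 2021, where a pat fragment accepts | alternatives directly:

// Modern equivalent of the is_match! macro above.
macro_rules! is_match {
    ($value:expr, $pattern:pat) => {
        match $value { $pattern => true, _ => false }
    };
}

fn main() {
    assert!(is_match!('\r', '\n' | '\r' | '\x0C'));
    assert!(!is_match!('a', '\n' | '\r' | '\x0C'));
    assert!(matches!('\x0C', '\n' | '\r' | '\x0C')); // same thing via std
}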
@@ -57,6 +62,12 @@ impl Tokenizer {
         self.input.as_slice().char_at(self.position + offset)
     }
 
+    #[inline]
+    fn has_newline_at(&self, offset: uint) -> bool {
+        self.position + offset < self.length &&
+        is_match!(self.char_at(offset), '\n' | '\r' | '\x0C')
+    }
+
     #[inline]
     fn consume_char(&mut self) -> char {
         let range = self.input.as_slice().char_range_at(self.position);
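A sketch of the new helper in current Rust, with the byte-offset convention from the struct comment made explicit (the struct here is a stripped-down stand-in, not the real Tokenizer):

struct Tokenizer {
    input: String,
    position: usize, // byte offset of the current char
    length: usize,   // total input length in bytes
}

impl Tokenizer {
    // Byte-indexed lookahead, mirroring char_at in the diff.
    fn char_at(&self, offset: usize) -> char {
        self.input[self.position + offset..].chars().next().unwrap()
    }

    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.length
            && matches!(self.char_at(offset), '\n' | '\r' | '\x0C')
    }
}

fn main() {
    let input = String::from("\\\r\nx"); // a backslash, CRLF, then 'x'
    let length = input.len();
    let t = Tokenizer { input, position: 0, length };
    assert!(t.has_newline_at(1));  // '\r' right after the backslash
    assert!(!t.has_newline_at(3)); // 'x' is not a newline
    assert!(!t.has_newline_at(9)); // out of bounds: the length check short-circuits
}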
@@ -70,12 +81,6 @@
     }
 }
 
-macro_rules! is_match(
-    ($value:expr, $($pattern:pat)|+) => (
-        match $value { $($pattern)|+ => true, _ => false }
-    );
-)
-
 
 fn next_component_value(tokenizer: &mut Tokenizer) -> Option<Node> {
     consume_comments(tokenizer);
@@ -89,10 +94,10 @@ fn next_component_value(tokenizer: &mut Tokenizer) -> Option<Node> {
     };
     let c = tokenizer.current_char();
     let component_value = match c {
-        '\t' | '\n' | ' ' => {
+        '\t' | '\n' | ' ' | '\r' | '\x0C' => {
             while !tokenizer.is_eof() {
                 match tokenizer.current_char() {
-                    ' ' | '\t' | '\n' => tokenizer.position += 1,
+                    ' ' | '\t' | '\n' | '\r' | '\x0C' => tokenizer.position += 1,
                     _ => break,
                 }
             }
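With preprocessing gone, every arm that used to match only '\n' must accept the full set of code points the spec folds into newlines: CR, FF, and the CRLF pair. A quick check of the resulting whitespace set (is_css_whitespace is an illustrative helper, not from the patch):

// CSS whitespace after this change: space, tab, and the three
// newline variants that preprocessing used to collapse into '\n'.
fn is_css_whitespace(c: char) -> bool {
    matches!(c, ' ' | '\t' | '\n' | '\r' | '\x0C')
}

fn main() {
    // A single whitespace token should swallow the whole run,
    // including an un-normalized CRLF.
    let run = " \t\r\n\x0C";
    assert!(run.chars().all(is_css_whitespace));
}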
@@ -104,7 +109,7 @@ fn next_component_value(tokenizer: &mut Tokenizer) -> Option<Node> {
             if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
             else if !tokenizer.is_eof() && match tokenizer.current_char() {
                 'a'...'z' | 'A'...'Z' | '0'...'9' | '-' | '_' => true,
-                '\\' => !tokenizer.starts_with("\\\n"),
+                '\\' => !tokenizer.has_newline_at(1),
                 _ => c > '\x7F',  // Non-ASCII
             } { Hash(consume_name(tokenizer)) }
             else { Delim(c) }
@@ -193,7 +198,7 @@ fn next_component_value(tokenizer: &mut Tokenizer) -> Option<Node> {
         'a'...'z' | 'A'...'Z' | '_' | '\0' => consume_ident_like(tokenizer),
         '[' => SquareBracketBlock(consume_block(tokenizer, CloseSquareBracket)),
         '\\' => {
-            if !tokenizer.starts_with("\\\n") { consume_ident_like(tokenizer) }
+            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
             else { tokenizer.position += 1; Delim(c) }
         },
         ']' => { tokenizer.position += 1; CloseSquareBracket },
@@ -289,16 +294,23 @@ fn consume_quoted_string(tokenizer: &mut Tokenizer, single_quote: bool) -> Resul
         match tokenizer.consume_char() {
             '"' if !single_quote => break,
             '\'' if single_quote => break,
-            '\n' => {
+            '\n' | '\r' | '\x0C' => {
                 tokenizer.position -= 1;
                 return Err(());
             },
             '\\' => {
                 if !tokenizer.is_eof() {
-                    if tokenizer.current_char() == '\n' {  // Escaped newline
-                        tokenizer.position += 1;
+                    match tokenizer.current_char() {
+                        // Escaped newline
+                        '\n' | '\x0C' => tokenizer.position += 1,
+                        '\r' => {
+                            tokenizer.position += 1;
+                            if !tokenizer.is_eof() && tokenizer.current_char() == '\n' {
+                                tokenizer.position += 1;
+                            }
+                        }
+                        _ => string.push(consume_escape(tokenizer))
                     }
-                    else { string.push(consume_escape(tokenizer)) }
                 }
                 // else: escaped EOF, do nothing.
             }
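The rewritten escape arm encodes two rules at once: an unescaped newline (now any of '\n', '\r', '\x0C') aborts the string, while an escaped newline is dropped, with "\r\n" consumed as a single line break so its '\n' is not mistaken for a second escape. A standalone sketch of that skip logic (escaped_newline_len is a hypothetical name):

// After a '\\' inside a string, skip an escaped newline.
// Returns how many chars were consumed: "\r\n" counts as one
// line break but advances two positions, mirroring the diff.
fn escaped_newline_len(rest: &str) -> usize {
    let mut chars = rest.chars();
    match chars.next() {
        Some('\n') | Some('\x0C') => 1,
        Some('\r') => if chars.next() == Some('\n') { 2 } else { 1 },
        _ => 0, // a real escape, handled by consume_escape instead
    }
}

fn main() {
    assert_eq!(escaped_newline_len("\r\nrest"), 2); // CRLF: one break, two chars
    assert_eq!(escaped_newline_len("\rrest"), 1);
    assert_eq!(escaped_newline_len("nrest"), 0);
}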
@@ -316,10 +328,10 @@ fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
         'a'...'z' | 'A'...'Z' | '_' | '\0' => true,
         '-' => tokenizer.position + 1 < tokenizer.length && match tokenizer.char_at(1) {
             'a'...'z' | 'A'...'Z' | '_' | '\0' => true,
-            '\\' => !tokenizer.input.as_slice().slice_from(tokenizer.position + 1).starts_with("\\\n"),
+            '\\' => !tokenizer.has_newline_at(1),
             c => c > '\x7F',  // Non-ASCII
         },
-        '\\' => !tokenizer.starts_with("\\\n"),
+        '\\' => !tokenizer.has_newline_at(1),
         c => c > '\x7F',  // Non-ASCII
     }
 }
@@ -342,7 +354,7 @@ fn consume_name(tokenizer: &mut Tokenizer) -> String {
         value.push(match c {
             'a'...'z' | 'A'...'Z' | '0'...'9' | '_' | '-' => { tokenizer.position += 1; c },
             '\\' => {
-                if tokenizer.starts_with("\\\n") { break }
+                if tokenizer.has_newline_at(1) { break }
                 tokenizer.position += 1;
                 consume_escape(tokenizer)
             },
@@ -435,7 +447,7 @@ fn consume_url(tokenizer: &mut Tokenizer) -> ComponentValue {
     tokenizer.position += 1;  // Skip the ( of url(
     while !tokenizer.is_eof() {
         match tokenizer.current_char() {
-            ' ' | '\t' | '\n' => tokenizer.position += 1,
+            ' ' | '\t' | '\n' | '\r' | '\x0C' => tokenizer.position += 1,
             '"' => return consume_quoted_url(tokenizer, false),
             '\'' => return consume_quoted_url(tokenizer, true),
             ')' => { tokenizer.position += 1; break },
@@ -455,12 +467,12 @@
     let mut string = String::new();
     while !tokenizer.is_eof() {
         let next_char = match tokenizer.consume_char() {
-            ' ' | '\t' | '\n' => return consume_url_end(tokenizer, string),
+            ' ' | '\t' | '\n' | '\r' | '\x0C' => return consume_url_end(tokenizer, string),
             ')' => break,
             '\x01'...'\x08' | '\x0B' | '\x0E'...'\x1F' | '\x7F'  // non-printable
                 | '"' | '\'' | '(' => return consume_bad_url(tokenizer),
             '\\' => {
-                if !tokenizer.is_eof() && tokenizer.current_char() == '\n' {
+                if tokenizer.has_newline_at(0) {
                     return consume_bad_url(tokenizer)
                 }
                 consume_escape(tokenizer)
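Note the offset: the ident-related call sites peek with has_newline_at(1) because position still rests on the backslash, whereas this loop has already stepped past the '\\' via consume_char(), so the check is at offset 0. A tiny illustration of the two vantage points:

fn main() {
    let input = "\\\nrest"; // a backslash followed by a newline
    let is_newline = |c: char| matches!(c, '\n' | '\r' | '\x0C');

    // Before consuming: position is on the '\\', the newline is at offset 1.
    let before: Vec<char> = input.chars().collect();
    assert!(is_newline(before[1]));

    // After consuming the '\\': the newline now sits at offset 0.
    let after = &input[1..];
    assert!(is_newline(after.chars().next().unwrap()));
}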
@@ -476,7 +488,7 @@ fn consume_url(tokenizer: &mut Tokenizer) -> ComponentValue {
 fn consume_url_end(tokenizer: &mut Tokenizer, string: String) -> ComponentValue {
     while !tokenizer.is_eof() {
         match tokenizer.consume_char() {
-            ' ' | '\t' | '\n' => (),
+            ' ' | '\t' | '\n' | '\r' | '\x0C' => (),
             ')' => break,
             _ => return consume_bad_url(tokenizer)
         }
@@ -557,7 +569,13 @@ fn consume_escape(tokenizer: &mut Tokenizer) -> char {
     }
     if !tokenizer.is_eof() {
         match tokenizer.current_char() {
-            ' ' | '\t' | '\n' => tokenizer.position += 1,
+            ' ' | '\t' | '\n' | '\x0C' => tokenizer.position += 1,
+            '\r' => {
+                tokenizer.position += 1;
+                if !tokenizer.is_eof() && tokenizer.current_char() == '\n' {
+                    tokenizer.position += 1;
+                }
+            }
             _ => ()
         }
     }
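The same CRLF pairing appears at the tail of consume_escape: the spec allows one whitespace character after a hex escape, and "\r\n" must count as that single character. A sketch of the rule as a pure function (trailing_escape_whitespace is an illustrative name):

// Width in chars of the single optional whitespace that may follow
// a hex escape like "\41 ": a CRLF pairs up, everything else is one.
fn trailing_escape_whitespace(rest: &str) -> usize {
    let mut chars = rest.chars();
    match chars.next() {
        Some(' ') | Some('\t') | Some('\n') | Some('\x0C') => 1,
        Some('\r') => if chars.next() == Some('\n') { 2 } else { 1 },
        _ => 0,
    }
}

fn main() {
    // "\41 B" escapes 'A'; the one space after the hex digits is eaten.
    assert_eq!(trailing_escape_whitespace(" B"), 1);
    // An un-normalized CRLF after the hex digits is eaten as a unit.
    assert_eq!(trailing_escape_whitespace("\r\nB"), 2);
    assert_eq!(trailing_escape_whitespace("B"), 0);
}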
