|
1 |
| -mod parser; |
2 |
| -mod string; |
| 1 | +use crate::{TextRange, TextUnit}; |
| 2 | +use self::StringComponentKind::*; |
3 | 3 |
|
4 |
| -pub use self::{ |
5 |
| - parser::{StringComponent, StringComponentKind}, |
6 |
| - string::{parse_string_literal, parse_char_literal, parse_byte_literal, parse_byte_string_literal}, |
7 |
| -}; |
| 4 | +#[derive(Debug, Eq, PartialEq, Clone)] |
| 5 | +pub(crate) struct StringComponent { |
| 6 | + pub(crate) range: TextRange, |
| 7 | + pub(crate) kind: StringComponentKind, |
| 8 | +} |
| 9 | + |
/// Classification of a single component inside a quoted literal.
#[derive(Clone, Debug, PartialEq, Eq)]
pub(crate) enum StringComponentKind {
    /// A `\` directly followed by a newline, together with the whitespace after it.
    /// Only produced for `"`-quoted literals.
    IgnoreNewline,
    /// A single character that does not start with `\`.
    CodePoint,
    /// A `\` followed by one character other than `x` or `u` (or a trailing `\` at the
    /// very end of the input).
    AsciiEscape,
    /// A `\x` escape together with the code characters that follow it.
    AsciiCodeEscape,
    /// A `\u` escape, including its `{...}` part when one is present.
    UnicodeEscape,
}
| 18 | + |
| 19 | +pub(crate) fn parse_quoted_literal( |
| 20 | + prefix: Option<char>, |
| 21 | + quote: char, |
| 22 | + src: &str, |
| 23 | +) -> StringComponentIter { |
| 24 | + let prefix = prefix.map(|p| match p { |
| 25 | + 'b' => b'b', |
| 26 | + _ => panic!("invalid prefix"), |
| 27 | + }); |
| 28 | + let quote = match quote { |
| 29 | + '\'' => b'\'', |
| 30 | + '"' => b'"', |
| 31 | + _ => panic!("invalid quote"), |
| 32 | + }; |
| 33 | + StringComponentIter { src, prefix, quote, pos: 0, has_closing_quote: false, suffix: None } |
| 34 | +} |
| 35 | + |
| 36 | +pub(crate) struct StringComponentIter<'a> { |
| 37 | + src: &'a str, |
| 38 | + prefix: Option<u8>, |
| 39 | + quote: u8, |
| 40 | + pos: usize, |
| 41 | + pub(crate) has_closing_quote: bool, |
| 42 | + pub(crate) suffix: Option<TextRange>, |
| 43 | +} |
| 44 | + |
| 45 | +impl<'a> Iterator for StringComponentIter<'a> { |
| 46 | + type Item = StringComponent; |
| 47 | + fn next(&mut self) -> Option<StringComponent> { |
| 48 | + if self.pos == 0 { |
| 49 | + if let Some(prefix) = self.prefix { |
| 50 | + assert!( |
| 51 | + self.advance() == prefix as char, |
| 52 | + "literal should start with a {:?}", |
| 53 | + prefix as char, |
| 54 | + ); |
| 55 | + } |
| 56 | + assert!( |
| 57 | + self.advance() == self.quote as char, |
| 58 | + "literal should start with a {:?}", |
| 59 | + self.quote as char, |
| 60 | + ); |
| 61 | + } |
| 62 | + |
| 63 | + if let Some(component) = self.parse_component() { |
| 64 | + return Some(component); |
| 65 | + } |
| 66 | + |
| 67 | + // We get here when there are no char components left to parse |
| 68 | + if self.peek() == Some(self.quote as char) { |
| 69 | + self.advance(); |
| 70 | + self.has_closing_quote = true; |
| 71 | + if let Some(range) = self.parse_suffix() { |
| 72 | + self.suffix = Some(range); |
| 73 | + } |
| 74 | + } |
| 75 | + |
| 76 | + assert!( |
| 77 | + self.peek() == None, |
| 78 | + "literal should leave no unparsed input: src = {:?}, pos = {}, length = {}", |
| 79 | + self.src, |
| 80 | + self.pos, |
| 81 | + self.src.len() |
| 82 | + ); |
| 83 | + |
| 84 | + None |
| 85 | + } |
| 86 | +} |
| 87 | + |
| 88 | +impl<'a> StringComponentIter<'a> { |
| 89 | + fn peek(&self) -> Option<char> { |
| 90 | + if self.pos == self.src.len() { |
| 91 | + return None; |
| 92 | + } |
| 93 | + |
| 94 | + self.src[self.pos..].chars().next() |
| 95 | + } |
| 96 | + |
| 97 | + fn advance(&mut self) -> char { |
| 98 | + let next = self.peek().expect("cannot advance if end of input is reached"); |
| 99 | + self.pos += next.len_utf8(); |
| 100 | + next |
| 101 | + } |
| 102 | + |
| 103 | + fn parse_component(&mut self) -> Option<StringComponent> { |
| 104 | + let next = self.peek()?; |
| 105 | + |
| 106 | + // Ignore string close |
| 107 | + if next == self.quote as char { |
| 108 | + return None; |
| 109 | + } |
| 110 | + |
| 111 | + let start = self.start_range(); |
| 112 | + self.advance(); |
| 113 | + |
| 114 | + if next == '\\' { |
| 115 | + // Strings can use `\` to ignore newlines, so we first try to parse one of those |
| 116 | + // before falling back to parsing char escapes |
| 117 | + if self.quote == b'"' { |
| 118 | + if let Some(component) = self.parse_ignore_newline(start) { |
| 119 | + return Some(component); |
| 120 | + } |
| 121 | + } |
| 122 | + |
| 123 | + Some(self.parse_escape(start)) |
| 124 | + } else { |
| 125 | + Some(self.finish_component(start, CodePoint)) |
| 126 | + } |
| 127 | + } |
| 128 | + |
| 129 | + fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> { |
| 130 | + // In string literals, when a `\` occurs immediately before the newline, the `\`, |
| 131 | + // the newline, and all whitespace at the beginning of the next line are ignored |
| 132 | + match self.peek() { |
| 133 | + Some('\n') | Some('\r') => { |
| 134 | + self.skip_whitespace(); |
| 135 | + Some(self.finish_component(start, IgnoreNewline)) |
| 136 | + } |
| 137 | + _ => None, |
| 138 | + } |
| 139 | + } |
| 140 | + |
| 141 | + fn skip_whitespace(&mut self) { |
| 142 | + while self.peek().map(|c| c.is_whitespace()) == Some(true) { |
| 143 | + self.advance(); |
| 144 | + } |
| 145 | + } |
| 146 | + |
| 147 | + fn parse_escape(&mut self, start: TextUnit) -> StringComponent { |
| 148 | + if self.peek().is_none() { |
| 149 | + return self.finish_component(start, AsciiEscape); |
| 150 | + } |
| 151 | + |
| 152 | + let next = self.advance(); |
| 153 | + match next { |
| 154 | + 'x' => self.parse_ascii_code_escape(start), |
| 155 | + 'u' => self.parse_unicode_escape(start), |
| 156 | + _ => self.finish_component(start, AsciiEscape), |
| 157 | + } |
| 158 | + } |
| 159 | + |
| 160 | + fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent { |
| 161 | + match self.peek() { |
| 162 | + Some('{') => { |
| 163 | + self.advance(); |
| 164 | + |
| 165 | + // Parse anything until we reach `}` |
| 166 | + while let Some(next) = self.peek() { |
| 167 | + self.advance(); |
| 168 | + if next == '}' { |
| 169 | + break; |
| 170 | + } |
| 171 | + } |
| 172 | + |
| 173 | + self.finish_component(start, UnicodeEscape) |
| 174 | + } |
| 175 | + Some(_) | None => self.finish_component(start, UnicodeEscape), |
| 176 | + } |
| 177 | + } |
| 178 | + |
| 179 | + fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent { |
| 180 | + let code_start = self.pos; |
| 181 | + while let Some(next) = self.peek() { |
| 182 | + if next == '\'' || (self.pos - code_start == 2) { |
| 183 | + break; |
| 184 | + } |
| 185 | + |
| 186 | + self.advance(); |
| 187 | + } |
| 188 | + self.finish_component(start, AsciiCodeEscape) |
| 189 | + } |
| 190 | + |
| 191 | + fn parse_suffix(&mut self) -> Option<TextRange> { |
| 192 | + let start = self.start_range(); |
| 193 | + let _ = self.peek()?; |
| 194 | + while let Some(_) = self.peek() { |
| 195 | + self.advance(); |
| 196 | + } |
| 197 | + Some(self.finish_range(start)) |
| 198 | + } |
| 199 | + |
| 200 | + fn start_range(&self) -> TextUnit { |
| 201 | + TextUnit::from_usize(self.pos) |
| 202 | + } |
| 203 | + |
| 204 | + fn finish_range(&self, start: TextUnit) -> TextRange { |
| 205 | + TextRange::from_to(start, TextUnit::from_usize(self.pos)) |
| 206 | + } |
| 207 | + |
| 208 | + fn finish_component(&self, start: TextUnit, kind: StringComponentKind) -> StringComponent { |
| 209 | + let range = self.finish_range(start); |
| 210 | + StringComponent { range, kind } |
| 211 | + } |
| 212 | +} |
| 213 | + |
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` as an unprefixed char literal and returns whether a closing quote
    /// was seen together with all components.
    fn parse(src: &str) -> (bool, Vec<StringComponent>) {
        let mut component_iterator = parse_quoted_literal(None, '\'', src);
        // `by_ref` keeps the iterator alive so `has_closing_quote` can be read afterwards.
        let components: Vec<_> = component_iterator.by_ref().collect();
        (component_iterator.has_closing_quote, components)
    }

    /// Returns the single component of a literal that must lack a closing quote.
    fn unclosed_char_component(src: &str) -> StringComponent {
        let (has_closing_quote, components) = parse(src);
        assert!(!has_closing_quote, "char should not have closing quote");
        assert_eq!(components.len(), 1);
        components[0].clone()
    }

    /// Returns the single component of a literal that must have a closing quote.
    fn closed_char_component(src: &str) -> StringComponent {
        let (has_closing_quote, components) = parse(src);
        assert!(has_closing_quote, "char should have closing quote");
        assert_eq!(components.len(), 1, "Literal: {}\nComponents: {:#?}", src, components);
        components[0].clone()
    }

    /// Returns every component of a literal that must have a closing quote.
    fn closed_char_components(src: &str) -> Vec<StringComponent> {
        let (has_closing_quote, components) = parse(src);
        assert!(has_closing_quote, "char should have closing quote");
        components
    }

    /// Range of everything between the opening and closing quotes of `src`.
    fn range_closed(src: &str) -> TextRange {
        TextRange::from_to(1.into(), (src.len() as u32 - 1).into())
    }

    /// Range of everything after the opening quote of `src`, up to its end.
    fn range_unclosed(src: &str) -> TextRange {
        TextRange::from_to(1.into(), (src.len() as u32).into())
    }

    #[test]
    fn test_unicode_escapes() {
        let unicode_escapes = &[r"{DEAD}", "{BEEF}", "{FF}", "{}", ""];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = closed_char_component(&escape_sequence);
            let expected_range = range_closed(&escape_sequence);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, expected_range);
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        let unicode_escapes = &["{DEAD", "{BEEF", "{FF"];
        for escape in unicode_escapes {
            let escape_sequence = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&escape_sequence);
            let expected_range = range_unclosed(&escape_sequence);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, expected_range);
        }
    }

    #[test]
    fn test_empty_char() {
        let (has_closing_quote, components) = parse("''");
        assert!(has_closing_quote, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert_eq!(component.kind, CodePoint);
        assert_eq!(component.range, TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        let literals = &[r"", r"5", r"55"];

        for literal in literals {
            let lit_text = format!(r"'\x{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, AsciiCodeEscape);
            assert_eq!(component.range, range_closed(&lit_text));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert_eq!(components.len(), 2);
        assert_eq!(components[1].kind, CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let literals = &[
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, AsciiEscape);
            assert_eq!(component.range, range_closed(&lit_text));
        }
    }

    #[test]
    fn test_no_escapes() {
        let literals = &['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &literal in literals {
            let lit_text = format!("'{}'", literal);
            let component = closed_char_component(&lit_text);
            assert_eq!(component.kind, CodePoint);
            assert_eq!(component.range, range_closed(&lit_text));
        }
    }
}
0 commit comments