Skip to content

Commit 5cdf525

Browse files
bors[bot]matklad
andcommitted
Merge #1093
1093: simplify r=matklad a=matklad Co-authored-by: Aleksey Kladov <[email protected]>
2 parents 9e46400 + cf1caf5 commit 5cdf525

File tree

7 files changed

+336
-400
lines changed

7 files changed

+336
-400
lines changed

crates/ra_syntax/src/string_lexing.rs

+332-6
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,333 @@
1-
mod parser;
2-
mod string;
1+
use crate::{TextRange, TextUnit};
2+
use self::StringComponentKind::*;
33

4-
pub use self::{
5-
parser::{StringComponent, StringComponentKind},
6-
string::{parse_string_literal, parse_char_literal, parse_byte_literal, parse_byte_string_literal},
7-
};
4+
#[derive(Debug, Eq, PartialEq, Clone)]
5+
pub(crate) struct StringComponent {
6+
pub(crate) range: TextRange,
7+
pub(crate) kind: StringComponentKind,
8+
}
9+
10+
/// Classification of a single component found inside a quoted literal.
///
/// The enum carries no data, so it is `Copy`: callers can pass it around and
/// compare it without cloning or moving it out of a [`StringComponent`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum StringComponentKind {
    /// A `\` directly followed by a newline (`\n` or `\r`), together with all
    /// whitespace that follows. Only recognised in `"`-quoted literals.
    IgnoreNewline,
    /// A single character that is not the start of an escape sequence.
    CodePoint,
    /// A `\` followed by one character other than `x` or `u` (or a lone `\`
    /// at the very end of the input).
    AsciiEscape,
    /// `\x` followed by the characters that form its code.
    AsciiCodeEscape,
    /// `\u`, optionally followed by a `{ ... }` group.
    UnicodeEscape,
}
18+
19+
pub(crate) fn parse_quoted_literal(
20+
prefix: Option<char>,
21+
quote: char,
22+
src: &str,
23+
) -> StringComponentIter {
24+
let prefix = prefix.map(|p| match p {
25+
'b' => b'b',
26+
_ => panic!("invalid prefix"),
27+
});
28+
let quote = match quote {
29+
'\'' => b'\'',
30+
'"' => b'"',
31+
_ => panic!("invalid quote"),
32+
};
33+
StringComponentIter { src, prefix, quote, pos: 0, has_closing_quote: false, suffix: None }
34+
}
35+
36+
pub(crate) struct StringComponentIter<'a> {
37+
src: &'a str,
38+
prefix: Option<u8>,
39+
quote: u8,
40+
pos: usize,
41+
pub(crate) has_closing_quote: bool,
42+
pub(crate) suffix: Option<TextRange>,
43+
}
44+
45+
impl<'a> Iterator for StringComponentIter<'a> {
46+
type Item = StringComponent;
47+
fn next(&mut self) -> Option<StringComponent> {
48+
if self.pos == 0 {
49+
if let Some(prefix) = self.prefix {
50+
assert!(
51+
self.advance() == prefix as char,
52+
"literal should start with a {:?}",
53+
prefix as char,
54+
);
55+
}
56+
assert!(
57+
self.advance() == self.quote as char,
58+
"literal should start with a {:?}",
59+
self.quote as char,
60+
);
61+
}
62+
63+
if let Some(component) = self.parse_component() {
64+
return Some(component);
65+
}
66+
67+
// We get here when there are no char components left to parse
68+
if self.peek() == Some(self.quote as char) {
69+
self.advance();
70+
self.has_closing_quote = true;
71+
if let Some(range) = self.parse_suffix() {
72+
self.suffix = Some(range);
73+
}
74+
}
75+
76+
assert!(
77+
self.peek() == None,
78+
"literal should leave no unparsed input: src = {:?}, pos = {}, length = {}",
79+
self.src,
80+
self.pos,
81+
self.src.len()
82+
);
83+
84+
None
85+
}
86+
}
87+
88+
impl<'a> StringComponentIter<'a> {
89+
fn peek(&self) -> Option<char> {
90+
if self.pos == self.src.len() {
91+
return None;
92+
}
93+
94+
self.src[self.pos..].chars().next()
95+
}
96+
97+
fn advance(&mut self) -> char {
98+
let next = self.peek().expect("cannot advance if end of input is reached");
99+
self.pos += next.len_utf8();
100+
next
101+
}
102+
103+
fn parse_component(&mut self) -> Option<StringComponent> {
104+
let next = self.peek()?;
105+
106+
// Ignore string close
107+
if next == self.quote as char {
108+
return None;
109+
}
110+
111+
let start = self.start_range();
112+
self.advance();
113+
114+
if next == '\\' {
115+
// Strings can use `\` to ignore newlines, so we first try to parse one of those
116+
// before falling back to parsing char escapes
117+
if self.quote == b'"' {
118+
if let Some(component) = self.parse_ignore_newline(start) {
119+
return Some(component);
120+
}
121+
}
122+
123+
Some(self.parse_escape(start))
124+
} else {
125+
Some(self.finish_component(start, CodePoint))
126+
}
127+
}
128+
129+
fn parse_ignore_newline(&mut self, start: TextUnit) -> Option<StringComponent> {
130+
// In string literals, when a `\` occurs immediately before the newline, the `\`,
131+
// the newline, and all whitespace at the beginning of the next line are ignored
132+
match self.peek() {
133+
Some('\n') | Some('\r') => {
134+
self.skip_whitespace();
135+
Some(self.finish_component(start, IgnoreNewline))
136+
}
137+
_ => None,
138+
}
139+
}
140+
141+
fn skip_whitespace(&mut self) {
142+
while self.peek().map(|c| c.is_whitespace()) == Some(true) {
143+
self.advance();
144+
}
145+
}
146+
147+
fn parse_escape(&mut self, start: TextUnit) -> StringComponent {
148+
if self.peek().is_none() {
149+
return self.finish_component(start, AsciiEscape);
150+
}
151+
152+
let next = self.advance();
153+
match next {
154+
'x' => self.parse_ascii_code_escape(start),
155+
'u' => self.parse_unicode_escape(start),
156+
_ => self.finish_component(start, AsciiEscape),
157+
}
158+
}
159+
160+
fn parse_unicode_escape(&mut self, start: TextUnit) -> StringComponent {
161+
match self.peek() {
162+
Some('{') => {
163+
self.advance();
164+
165+
// Parse anything until we reach `}`
166+
while let Some(next) = self.peek() {
167+
self.advance();
168+
if next == '}' {
169+
break;
170+
}
171+
}
172+
173+
self.finish_component(start, UnicodeEscape)
174+
}
175+
Some(_) | None => self.finish_component(start, UnicodeEscape),
176+
}
177+
}
178+
179+
fn parse_ascii_code_escape(&mut self, start: TextUnit) -> StringComponent {
180+
let code_start = self.pos;
181+
while let Some(next) = self.peek() {
182+
if next == '\'' || (self.pos - code_start == 2) {
183+
break;
184+
}
185+
186+
self.advance();
187+
}
188+
self.finish_component(start, AsciiCodeEscape)
189+
}
190+
191+
fn parse_suffix(&mut self) -> Option<TextRange> {
192+
let start = self.start_range();
193+
let _ = self.peek()?;
194+
while let Some(_) = self.peek() {
195+
self.advance();
196+
}
197+
Some(self.finish_range(start))
198+
}
199+
200+
fn start_range(&self) -> TextUnit {
201+
TextUnit::from_usize(self.pos)
202+
}
203+
204+
fn finish_range(&self, start: TextUnit) -> TextRange {
205+
TextRange::from_to(start, TextUnit::from_usize(self.pos))
206+
}
207+
208+
fn finish_component(&self, start: TextUnit, kind: StringComponentKind) -> StringComponent {
209+
let range = self.finish_range(start);
210+
StringComponent { range, kind }
211+
}
212+
}
213+
214+
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `src` as an unprefixed `'`-quoted literal, returning whether the
    /// closing quote was seen together with every component produced.
    fn parse(src: &str) -> (bool, Vec<StringComponent>) {
        let mut iter = parse_quoted_literal(None, '\'', src);
        // `by_ref` keeps `iter` usable afterwards so the flag can be read.
        let components: Vec<_> = iter.by_ref().collect();
        (iter.has_closing_quote, components)
    }

    /// Returns the sole component of a literal that must lack a closing quote.
    fn unclosed_char_component(src: &str) -> StringComponent {
        let (closed, mut components) = parse(src);
        assert!(!closed, "char should not have closing quote");
        assert!(components.len() == 1);
        components.pop().unwrap()
    }

    /// Returns the sole component of a literal that must have a closing quote.
    fn closed_char_component(src: &str) -> StringComponent {
        let (closed, mut components) = parse(src);
        assert!(closed, "char should have closing quote");
        assert!(components.len() == 1, "Literal: {}\nComponents: {:#?}", src, components);
        components.pop().unwrap()
    }

    /// Returns every component of a literal that must have a closing quote.
    fn closed_char_components(src: &str) -> Vec<StringComponent> {
        let (closed, components) = parse(src);
        assert!(closed, "char should have closing quote");
        components
    }

    /// Range of everything between the opening and the closing quote.
    fn range_closed(src: &str) -> TextRange {
        let end = src.len() as u32 - 1;
        TextRange::from_to(1.into(), end.into())
    }

    /// Range of everything after the opening quote, up to the end of input.
    fn range_unclosed(src: &str) -> TextRange {
        let end = src.len() as u32;
        TextRange::from_to(1.into(), end.into())
    }

    #[test]
    fn test_unicode_escapes() {
        for escape in [r"{DEAD}", "{BEEF}", "{FF}", "{}", ""].iter() {
            let literal = format!(r"'\u{}'", escape);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, range_closed(&literal));
        }
    }

    #[test]
    fn test_unicode_escapes_unclosed() {
        for escape in ["{DEAD", "{BEEF", "{FF"].iter() {
            let literal = format!(r"'\u{}'", escape);
            let component = unclosed_char_component(&literal);
            assert_eq!(component.kind, UnicodeEscape);
            assert_eq!(component.range, range_unclosed(&literal));
        }
    }

    #[test]
    fn test_empty_char() {
        let (closed, components) = parse("''");
        assert!(closed, "char should have closing quote");
        assert!(components.is_empty());
    }

    #[test]
    fn test_unclosed_char() {
        let component = unclosed_char_component("'a");
        assert_eq!(component.kind, CodePoint);
        assert_eq!(component.range, TextRange::from_to(1.into(), 2.into()));
    }

    #[test]
    fn test_digit_escapes() {
        for digits in [r"", r"5", r"55"].iter() {
            let literal = format!(r"'\x{}'", digits);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, AsciiCodeEscape);
            assert_eq!(component.range, range_closed(&literal));
        }

        // More than 2 digits starts a new codepoint
        let components = closed_char_components(r"'\x555'");
        assert_eq!(components.len(), 2);
        assert_eq!(components[1].kind, CodePoint);
    }

    #[test]
    fn test_ascii_escapes() {
        let escapes = [
            r"\'", "\\\"", // equivalent to \"
            r"\n", r"\r", r"\t", r"\\", r"\0",
        ];

        for escape in escapes.iter() {
            let literal = format!("'{}'", escape);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, AsciiEscape);
            assert_eq!(component.range, range_closed(&literal));
        }
    }

    #[test]
    fn test_no_escapes() {
        let plain_chars = ['"', 'n', 'r', 't', '0', 'x', 'u'];

        for &ch in plain_chars.iter() {
            let literal = format!("'{}'", ch);
            let component = closed_char_component(&literal);
            assert_eq!(component.kind, CodePoint);
            assert_eq!(component.range, range_closed(&literal));
        }
    }
}

0 commit comments

Comments
 (0)