Skip to content

Commit 70c817a

Browse files
committed
Allow lexer to recover from some homoglyphs
1 parent 27a6a30 commit 70c817a

6 files changed

+74
-34
lines changed

src/libsyntax/parse/lexer/mod.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -389,7 +389,10 @@ impl<'a> StringReader<'a> {
389389
self.pos,
390390
"unknown start of token",
391391
c);
392-
unicode_chars::check_for_substitution(self, start, c, &mut err);
392+
if let Some(t) = unicode_chars::check_for_substitution(self, start, c, &mut err) {
393+
err.emit();
394+
return Ok(t);
395+
}
393396
return Err(err)
394397
}
395398
};

src/libsyntax/parse/lexer/unicode_chars.rs

Lines changed: 33 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
use super::StringReader;
55
use errors::{Applicability, DiagnosticBuilder};
66
use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
7+
use crate::parse::token;
78

89
#[rustfmt::skip] // for line breaks
910
const UNICODE_ARRAY: &[(char, &str, char)] = &[
@@ -297,53 +298,53 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
297298
('＞', "Fullwidth Greater-Than Sign", '>'),
298299
];
299300

300-
const ASCII_ARRAY: &[(char, &str)] = &[
301-
(' ', "Space"),
302-
('_', "Underscore"),
303-
('-', "Minus/Hyphen"),
304-
(',', "Comma"),
305-
(';', "Semicolon"),
306-
(':', "Colon"),
307-
('!', "Exclamation Mark"),
308-
('?', "Question Mark"),
309-
('.', "Period"),
310-
('\'', "Single Quote"),
311-
('"', "Quotation Mark"),
312-
('(', "Left Parenthesis"),
313-
(')', "Right Parenthesis"),
314-
('[', "Left Square Bracket"),
315-
(']', "Right Square Bracket"),
316-
('{', "Left Curly Brace"),
317-
('}', "Right Curly Brace"),
318-
('*', "Asterisk"),
319-
('/', "Slash"),
320-
('\\', "Backslash"),
321-
('&', "Ampersand"),
322-
('+', "Plus Sign"),
323-
('<', "Less-Than Sign"),
324-
('=', "Equals Sign"),
325-
('>', "Greater-Than Sign"),
301+
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
302+
(' ', "Space", Some(token::Whitespace)),
303+
('_', "Underscore", None),
304+
('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
305+
(',', "Comma", Some(token::Comma)),
306+
(';', "Semicolon", Some(token::Semi)),
307+
(':', "Colon", Some(token::Colon)),
308+
('!', "Exclamation Mark", Some(token::Not)),
309+
('?', "Question Mark", Some(token::Question)),
310+
('.', "Period", Some(token::Dot)),
311+
('\'', "Single Quote", None), // Literals are already lexed by this point, so we can't recover
312+
('"', "Quotation Mark", None), // gracefully just by spitting the correct token out.
313+
('(', "Left Parenthesis", Some(token::OpenDelim(token::Paren))),
314+
(')', "Right Parenthesis", Some(token::CloseDelim(token::Paren))),
315+
('[', "Left Square Bracket", Some(token::OpenDelim(token::Bracket))),
316+
(']', "Right Square Bracket", Some(token::CloseDelim(token::Bracket))),
317+
('{', "Left Curly Brace", Some(token::OpenDelim(token::Brace))),
318+
('}', "Right Curly Brace", Some(token::CloseDelim(token::Brace))),
319+
('*', "Asterisk", Some(token::BinOp(token::Star))),
320+
('/', "Slash", Some(token::BinOp(token::Slash))),
321+
('\\', "Backslash", None),
322+
('&', "Ampersand", Some(token::BinOp(token::And))),
323+
('+', "Plus Sign", Some(token::BinOp(token::Plus))),
324+
('<', "Less-Than Sign", Some(token::Lt)),
325+
('=', "Equals Sign", Some(token::Eq)),
326+
('>', "Greater-Than Sign", Some(token::Gt)),
326327
];
327328

328329
crate fn check_for_substitution<'a>(
329330
reader: &StringReader<'a>,
330331
pos: BytePos,
331332
ch: char,
332333
err: &mut DiagnosticBuilder<'a>,
333-
) -> bool {
334+
) -> Option<token::TokenKind> {
334335
let (u_name, ascii_char) = match UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch) {
335336
Some(&(_u_char, u_name, ascii_char)) => (u_name, ascii_char),
336-
None => return false,
337+
None => return None,
337338
};
338339

339340
let span = Span::new(pos, pos + Pos::from_usize(ch.len_utf8()), NO_EXPANSION);
340341

341-
let ascii_name = match ASCII_ARRAY.iter().find(|&&(c, _)| c == ascii_char) {
342-
Some((_ascii_char, ascii_name)) => ascii_name,
342+
let (ascii_name, token) = match ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) {
343+
Some((_ascii_char, ascii_name, token)) => (ascii_name, token),
343344
None => {
344345
let msg = format!("substitution character not found for '{}'", ch);
345346
reader.sess.span_diagnostic.span_bug_no_panic(span, &msg);
346-
return false;
347+
return None;
347348
}
348349
};
349350

@@ -371,7 +372,7 @@ crate fn check_for_substitution<'a>(
371372
);
372373
err.span_suggestion(span, &msg, ascii_char.to_string(), Applicability::MaybeIncorrect);
373374
}
374-
true
375+
token.clone()
375376
}
376377

377378
/// Extract string if found at current position with given delimiters
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
22
//~^ ERROR expected at least one digit in exponent
33
//~| ERROR unknown start of token: \u{2212}
4+
//~| ERROR cannot subtract `{integer}` from `{float}`
45

56
fn main() {}

src/test/ui/did_you_mean/issue-49746-unicode-confusable-in-float-literal-expt.stderr

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,5 +14,14 @@ help: Unicode character '−' (Minus Sign) looks like '-' (Minus/Hyphen), but it
1414
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e-11; // m³⋅kg⁻¹⋅s⁻²
1515
| ^
1616

17-
error: aborting due to 2 previous errors
17+
error[E0277]: cannot subtract `{integer}` from `{float}`
18+
--> $DIR/issue-49746-unicode-confusable-in-float-literal-expt.rs:1:53
19+
|
20+
LL | const UNIVERSAL_GRAVITATIONAL_CONSTANT: f64 = 6.674e−11; // m³⋅kg⁻¹⋅s⁻²
21+
| ^ no implementation for `{float} - {integer}`
22+
|
23+
= help: the trait `std::ops::Sub<{integer}>` is not implemented for `{float}`
24+
25+
error: aborting due to 3 previous errors
1826

27+
For more information about this error, try `rustc --explain E0277`.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
fn main() {
2+
println!("""")
3+
let x: usize = (); //~ ERROR mismatched types
4+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
error: unknown start of token: \u{37e}
2+
--> $DIR/recover-from-homoglyph.rs:2:17
3+
|
4+
LL | println!("");
5+
| ^
6+
help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
7+
|
8+
LL | println!("");
9+
| ^
10+
11+
error[E0308]: mismatched types
12+
--> $DIR/recover-from-homoglyph.rs:3:20
13+
|
14+
LL | let x: usize = ();
15+
| ^^ expected usize, found ()
16+
|
17+
= note: expected type `usize`
18+
found type `()`
19+
20+
error: aborting due to 2 previous errors
21+
22+
For more information about this error, try `rustc --explain E0308`.

0 commit comments

Comments
 (0)