From 3b12e784114cd3dc2ba28811559d846e31d53bae Mon Sep 17 00:00:00 2001
From: Evan Haas
Date: Sun, 12 Jan 2025 15:01:58 -0800
Subject: [PATCH] Preprocessor: add support for UCN identifiers

Closes #823
---
 src/aro/Preprocessor.zig     | 34 +++++++++++++++++---
 src/aro/Tokenizer.zig        | 37 ++++++++++++++++++++++-
 src/aro/ucn.zig              | 69 ++++++++++++++++++++++++++++++++++++++++
 test/cases/ucn identifiers.c | 17 ++++++++++
 4 files changed, 153 insertions(+), 4 deletions(-)
 create mode 100644 src/aro/ucn.zig
 create mode 100644 test/cases/ucn identifiers.c

diff --git a/src/aro/Preprocessor.zig b/src/aro/Preprocessor.zig
index 6a81b162..d8866528 100644
--- a/src/aro/Preprocessor.zig
+++ b/src/aro/Preprocessor.zig
@@ -16,6 +16,7 @@ const RawToken = Tokenizer.Token;
 const Tree = @import("Tree.zig");
 const Token = Tree.Token;
 const TokenWithExpansionLocs = Tree.TokenWithExpansionLocs;
+const ucn = @import("ucn.zig");
 
 const DefineMap = std.StringHashMapUnmanaged(Macro);
 const RawTokenList = std.ArrayList(RawToken);
@@ -991,7 +992,7 @@ fn expr(pp: *Preprocessor, tokenizer: *Tokenizer) MacroError!bool {
                 }
             },
         }
-        pp.addTokenAssumeCapacity(tok);
+        pp.addTokenAssumeCapacity(try pp.unescapeUcn(tok));
     }
     try pp.addToken(.{
         .id = .eof,
@@ -2398,6 +2399,32 @@ fn expandMacroExhaustive(
     buf.items.len = moving_end_idx;
 }
 
+fn writeUnescapedChar(pp: *Preprocessor, codepoint: u32) !void {
+    var space: [4]u8 = undefined;
+    const len = std.unicode.utf8Encode(@as(u21, @intCast(codepoint)), space[0..]) catch unreachable; // ucn.CharIterator only yields valid Unicode scalar values
+    pp.comp.generated_buf.appendSliceAssumeCapacity(space[0..len]);
+}
+
+fn unescapeUcn(pp: *Preprocessor, tok: TokenWithExpansionLocs) !TokenWithExpansionLocs {
+    if (tok.id == .extended_identifier) {
+        @branchHint(.cold);
+        const identifier = pp.expandedSlice(tok);
+        if (mem.indexOfScalar(u8, identifier, '\\') != null) {
+            @branchHint(.cold);
+            const start = pp.comp.generated_buf.items.len;
+            try pp.comp.generated_buf.ensureUnusedCapacity(pp.gpa, identifier.len + 1); // decoded UTF-8 is never longer than its UCN spelling
+
+            var it: ucn.CharIterator = .{ .str = identifier };
+            while (it.next()) |c| {
+                try pp.writeUnescapedChar(c);
+            }
+            pp.comp.generated_buf.appendAssumeCapacity('\n');
+            return pp.makeGeneratedToken(start, .extended_identifier, tok);
+        }
+    }
+    return tok;
+}
+
 /// Try to expand a macro after a possible candidate has been read from the `tokenizer`
 /// into the `raw` token passed as argument
 fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroError!void {
@@ -2427,7 +2454,7 @@ fn expandMacro(pp: *Preprocessor, tokenizer: *Tokenizer, raw: RawToken) MacroErr
             continue;
         }
         tok.id.simplifyMacroKeywordExtra(true);
-        pp.addTokenAssumeCapacity(tok.*);
+        pp.addTokenAssumeCapacity(try pp.unescapeUcn(tok.*));
     }
     if (pp.preserve_whitespace) {
         try pp.ensureUnusedTokenCapacity(pp.add_expansion_nl);
@@ -3100,7 +3127,8 @@ fn makePragmaToken(pp: *Preprocessor, raw: RawToken, operator_loc: ?Source.Locat
     return tok;
 }
 
-pub fn addToken(pp: *Preprocessor, tok: TokenWithExpansionLocs) !void {
+pub fn addToken(pp: *Preprocessor, tok_arg: TokenWithExpansionLocs) !void {
+    const tok = try pp.unescapeUcn(tok_arg);
     if (tok.expansion_locs) |expansion_locs| {
         try pp.expansion_entries.append(pp.gpa, .{ .idx = @intCast(pp.tokens.len), .locs = expansion_locs });
     }
diff --git a/src/aro/Tokenizer.zig b/src/aro/Tokenizer.zig
index fb0ecfe2..dc1a1401 100644
--- a/src/aro/Tokenizer.zig
+++ b/src/aro/Tokenizer.zig
@@ -19,7 +19,7 @@ pub const Token = struct {
         eof,
         /// identifier containing solely basic character set characters
         identifier,
-        /// identifier with at least one extended character
+        /// identifier with at least one extended character or UCN escape sequence
         extended_identifier,
 
         // string literals with prefixes
@@ -1074,14 +1074,46 @@ pub fn next(self: *Tokenizer) Token {
         pp_num,
         pp_num_exponent,
         pp_num_digit_separator,
+        ucn_slash,
+        ucn,
     } = .start;
 
     var start = self.index;
     var id: Token.Id = .eof;
+    var ucn_wants: u8 = undefined;
+    var ucn_consumed: u8 = undefined;
 
     while (self.index < self.buf.len) : (self.index += 1) {
         const c = self.buf[self.index];
         switch (state) {
+            .ucn_slash => switch (c) {
+                'u' => {
+                    ucn_wants = 4;
+                    ucn_consumed = 0;
+                    state = .ucn;
+                },
+                'U' => {
+                    ucn_wants = 8;
+                    ucn_consumed = 0;
+                    state = .ucn;
+                },
+                else => {
+                    id = .invalid;
+                    break;
+                },
+            },
+            .ucn => switch (c) {
+                'a'...'f', 'A'...'F', '0'...'9' => {
+                    ucn_consumed += 1;
+                    if (ucn_consumed == ucn_wants) {
+                        state = .extended_identifier;
+                    }
+                },
+                else => {
+                    id = .invalid;
+                    break;
+                },
+            },
             .start => switch (c) {
                 '\n' => {
                     id = .nl;
@@ -1100,6 +1132,7 @@ pub fn next(self: *Tokenizer) Token {
                 'u' => state = .u,
                 'U' => state = .U,
                 'L' => state = .L,
+                '\\' => state = .ucn_slash,
                 'a'...'t', 'v'...'z', 'A'...'K', 'M'...'T', 'V'...'Z', '_' => state = .identifier,
                 '=' => state = .equal,
                 '!' => state = .bang,
@@ -1325,6 +1358,7 @@ pub fn next(self: *Tokenizer) Token {
                     break;
                 },
                 0x80...0xFF => state = .extended_identifier,
+                '\\' => state = .ucn_slash,
                 else => {
                     id = if (state == .identifier) Token.getTokenId(self.langopts, self.buf[start..self.index]) else .extended_identifier;
                     break;
@@ -1732,6 +1766,7 @@ pub fn next(self: *Tokenizer) Token {
         }
     } else if (self.index == self.buf.len) {
         switch (state) {
+            .ucn_slash, .ucn => id = .invalid,
             .start, .line_comment => {},
             .u, .u8, .U, .L, .identifier => id = Token.getTokenId(self.langopts, self.buf[start..self.index]),
             .extended_identifier => id = .extended_identifier,
diff --git a/src/aro/ucn.zig b/src/aro/ucn.zig
new file mode 100644
index 00000000..e5ec8923
--- /dev/null
+++ b/src/aro/ucn.zig
@@ -0,0 +1,69 @@
+const std = @import("std");
+
+const DecodedUniversalChar = struct {
+    codepoint: u32,
+    consumed: usize,
+};
+
+/// Decodes a C99-style universal character name (e.g., \uXXXX or \UXXXXXXXX)
+/// into a unicode codepoint. Returns the decoded character and the number of
+/// bytes consumed from the input string.
+fn decodeUniversalChar(input: []const u8) ?DecodedUniversalChar {
+    const is_long = input[1] == 'U'; // caller guarantees input.len >= 2 and input[0] == '\\'
+    const required: usize = if (is_long) 10 else 6;
+
+    if (input.len < required)
+        return null;
+
+    const hex_part = input[2..required];
+    var codepoint: u32 = 0;
+    for (hex_part) |c| {
+        codepoint *= 16;
+        const value = switch (c) {
+            '0'...'9' => c - '0',
+            'a'...'f' => 10 + (c - 'a'),
+            'A'...'F' => 10 + (c - 'A'),
+            else => return null,
+        };
+        codepoint += value;
+    }
+
+    if (codepoint > 0x10FFFF or (codepoint >= 0xD800 and codepoint <= 0xDFFF))
+        return null; // not a valid Unicode scalar value; treat as a failed decode
+
+    return .{ .codepoint = codepoint, .consumed = required };
+}
+
+/// Iterates the Unicode codepoints of an identifier, decoding \u and \U
+/// universal character names along the way. Invalid escapes and invalid
+/// UTF-8 bytes are passed through one byte at a time.
+pub const CharIterator = struct {
+    str: []const u8,
+    i: usize = 0,
+
+    pub fn next(self: *@This()) ?u32 {
+        if (self.i >= self.str.len) return null;
+        if (self.str[self.i] == '\\' and self.i + 1 < self.str.len and (self.str[self.i + 1] == 'u' or self.str[self.i + 1] == 'U')) {
+            const decoded = decodeUniversalChar(self.str[self.i..]) orelse {
+                self.i += 1;
+                return '\\';
+            };
+            self.i += decoded.consumed;
+            return decoded.codepoint;
+        } else {
+            const len = std.unicode.utf8ByteSequenceLength(self.str[self.i]) catch 1;
+            const cp = switch (len) {
+                1 => self.str[self.i],
+                2 => std.unicode.utf8Decode2(self.str[self.i..][0..2].*),
+                3 => std.unicode.utf8Decode3(self.str[self.i..][0..3].*),
+                4 => std.unicode.utf8Decode4(self.str[self.i..][0..4].*),
+                else => unreachable,
+            } catch {
+                defer self.i += 1;
+                return self.str[self.i];
+            };
+            self.i += len;
+            return cp;
+        }
+    }
+};
diff --git a/test/cases/ucn identifiers.c b/test/cases/ucn identifiers.c
new file mode 100644
index 00000000..2988e6ae
--- /dev/null
+++ b/test/cases/ucn identifiers.c
@@ -0,0 +1,17 @@
+int foo(void) {
+    int \u4F60\u597D = 5;
+    int \u0061 = 5; // TODO: error: character 'a' cannot be specified by a universal character name
+    return 你好;
+}
+
+struct S {
+    int 你好;
+};
+
+int bar(int x) {
+    struct S s;
+    s.\u4F60\u597D = x;
+    return s.你好;
+}
+
+#define TESTS_SKIPPED 1