Skip to content

Commit c64ea2e

Browse files
eddybFriz64
authored and committed
analysis/cdecl: custom parser for a large enough subset of C's declaration syntax.
1 parent dfe2261 commit c64ea2e

File tree

3 files changed

+466
-27
lines changed

3 files changed

+466
-27
lines changed

analysis/src/cdecl.rs

Lines changed: 393 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,393 @@
1+
use std::num::NonZeroU8;
2+
3+
/// Identifier-category-aware minimal tokenization of a subset of C syntax,
/// sufficient for parsing the C declarations used in `vk.xml`.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum CTok<'a> {
    /// Identifier referring to a type declaration in scope.
    TypeName(&'a str),

    /// Identifier referring to a value declaration in scope.
    ValueName(&'a str),

    /// Identifier that is being presently declared (exactly one per `CDecl`).
    DeclName(&'a str),

    /// Supported keyword (one of [`CTok::SUPPORTED_KEYWORDS`]).
    Kw(&'static str),

    /// Any ASCII punctuation (i.e. as determined by [`char::is_ascii_punctuation`]).
    // FIXME(eddyb) this could really use the `std::ascii` API.
    Punct(char),

    /// Integer literal (for e.g. array lengths).
    IntLit(&'a str),

    /// Unknown identifier (all known cases are spec bugs or deficiencies).
    StrayIdent(&'a str),
}

/// Error produced by [`CTok::lex_into`]: wraps the not-yet-lexed remainder of
/// the input, starting at the character that could not be tokenized.
#[derive(Debug)]
pub struct UnsupportedCTok<'a>(&'a str);

impl<'a> CTok<'a> {
    /// Keywords recognized by [`CTok::lex_into`] (and emitted as [`CTok::Kw`]).
    pub const SUPPORTED_KEYWORDS: &'static [&'static str] = &["const", "struct", "typedef", "void"];

    /// Tokenizes `s` and appends the resulting tokens to `out`.
    ///
    /// Only [`CTok::IntLit`], [`CTok::Kw`], [`CTok::StrayIdent`] and
    /// [`CTok::Punct`] are ever produced here: the lexer has no knowledge of
    /// which identifiers are types/values/the declared name, so every
    /// non-keyword identifier comes out as [`CTok::StrayIdent`].
    ///
    /// Any run of ASCII alphanumerics/`_` starting with a digit is emitted as
    /// [`CTok::IntLit`] without validation (that happens when it is parsed).
    ///
    /// # Errors
    ///
    /// Returns [`UnsupportedCTok`] on the first character that is not ASCII,
    /// or is ASCII but neither alphanumeric/`_`, punctuation, nor whitespace
    /// (per [`char::is_ascii_whitespace`]). Tokens lexed before the error
    /// have already been appended to `out`.
    pub fn lex_into(
        s: &'a str,
        out: &mut impl Extend<CTok<'a>>,
    ) -> Result<(), UnsupportedCTok<'a>> {
        // FIXME(eddyb) this could really use the `std::ascii` API.
        let mut s = s;
        while let Some(c) = s.chars().next() {
            if !c.is_ascii() {
                return Err(UnsupportedCTok(s));
            }

            let is_ident_or_number = |c: char| c.is_ascii_alphanumeric() || c == '_';
            let tok = if is_ident_or_number(c) {
                // Every accepted character is ASCII (1 byte), so the character
                // count doubles as the byte length for `split_at`.
                let len = s.chars().take_while(|&c| is_ident_or_number(c)).count();
                let (tok, rest) = s.split_at(len);
                s = rest;
                if c.is_ascii_digit() {
                    CTok::IntLit(tok)
                } else if let Some(kw) = CTok::SUPPORTED_KEYWORDS.iter().find(|&&kw| kw == tok) {
                    CTok::Kw(kw)
                } else {
                    CTok::StrayIdent(tok)
                }
            } else if c.is_ascii_punctuation() {
                s = &s[1..];
                CTok::Punct(c)
            } else if c.is_ascii_whitespace() {
                // Only skip *ASCII* whitespace: `str::trim_start` would also
                // swallow Unicode whitespace (and vertical tab), which the
                // checks above reject when they appear first.
                s = s.trim_start_matches(|c: char| c.is_ascii_whitespace());
                continue;
            } else {
                return Err(UnsupportedCTok(s));
            };
            out.extend([tok]);
        }
        Ok(())
    }
}

/// A single parsed C declaration: a name, its type, and (for struct members)
/// an optional bitfield width.
#[derive(Debug, PartialEq, Eq)]
pub struct CDecl<'a> {
    pub ty: CType<'a>,
    pub name: &'a str,
    pub bitfield_width: Option<NonZeroU8>,
}

/// Syntactic context a declaration is parsed in (see [`CDecl::parse`]).
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum CDeclMode {
    /// Must start with the `typedef` keyword and end with `;`.
    TypeDef,
    /// May end with a `: N` bitfield width.
    StructMember,
    /// Array types get wrapped in an implicit ("decayed") pointer.
    FuncParam,
    /// Parameter of a function type nested in another declaration; accepts a
    /// [`CTok::StrayIdent`] as the declared name.
    FuncTypeParam,
}

/// Type of a C declaration, as built up by [`CDecl::parse`].
#[derive(Debug, PartialEq, Eq)]
pub enum CType<'a> {
    Base(CBaseType<'a>),
    Ptr {
        /// `true` only for the pointer synthesized around an array-typed
        /// [`CDeclMode::FuncParam`] declaration.
        implicit_for_decay: bool,
        /// Whether a `const` qualifier was pending (i.e. written to the left
        /// of the `*`) when this pointer was formed.
        is_const: bool,
        pointee: Box<CType<'a>>,
    },
    Array {
        element: Box<CType<'a>>,
        len: CArrayLen<'a>,
    },
    Func {
        /// `None` when the return type is plain `void` ([`CType::VOID`]).
        ret_ty: Option<Box<CType<'a>>>,
        /// Empty for a `(void)` parameter list.
        params: Vec<CDecl<'a>>,
    },
}

impl CType<'_> {
    /// The plain (non-`struct`) `void` base type.
    pub const VOID: CType<'static> = CType::Base(CBaseType {
        struct_tag: false,
        name: "void",
    });
}

/// Named base type, optionally spelled with a leading `struct` keyword.
#[derive(Debug, PartialEq, Eq)]
pub struct CBaseType<'a> {
    pub struct_tag: bool,
    pub name: &'a str,
}

/// Length of an array type: either a named value or an integer literal.
#[derive(Debug, PartialEq, Eq)]
pub enum CArrayLen<'a> {
    Named(&'a str),
    Literal(u128),
}

/// Error from [`CDecl::parse`], pairing the failure reason with the full
/// token slice that was being parsed.
#[derive(Debug)]
pub struct CDeclParseError<'a, 'b> {
    pub kind: CDeclParseErrorKind<'a>,
    pub tokens: &'b [CTok<'a>],
}

/// Reason a declaration failed to parse.
#[derive(Debug)]
pub enum CDeclParseErrorKind<'a> {
    /// A required element is absent (`"DeclName"`, `"typedef"`, `";"`,
    /// `"TypeName"` or `"parameters"`).
    Missing(&'static str),
    /// An element appeared more than once (`"DeclName"` or `"const"`).
    Multiple(&'static str),
    /// A `const` qualifier was parsed but nothing could absorb it.
    Unused(&'static str),
    /// A bitfield width or array length literal failed to parse.
    InvalidIntLit(std::num::ParseIntError),
    /// Unexpected token on the left side of the declared name.
    UnsupportedLeftmostToken(CTok<'a>),
    /// Unexpected token on the right side of the declared name.
    UnsupportedRightmostToken(CTok<'a>),
    /// Brackets/parentheses do not pair up.
    UnbalancedBrackets,
    /// Array length is neither a single value name nor a single literal.
    UnsupportedArrayLength,
}

impl<'a> CDecl<'a> {
    /// Parses one C declaration from pre-categorized `tokens`.
    ///
    /// `tokens` must contain exactly one [`CTok::DeclName`] (in
    /// [`CDeclMode::FuncTypeParam`] mode a [`CTok::StrayIdent`] is accepted
    /// as the name too); everything to its left/right is interpreted as the
    /// declaration's type, according to `mode`.
    ///
    /// # Errors
    ///
    /// Returns a [`CDeclParseError`] carrying the reason and the original
    /// `tokens` when the input is outside the supported subset of C, or is
    /// malformed (including unbalanced parentheses around the name).
    // HACK(eddyb) this split is literally just to simplify error tracking.
    pub fn parse<'b>(
        mode: CDeclMode,
        tokens: &'b [CTok<'a>],
    ) -> Result<CDecl<'a>, CDeclParseError<'a, 'b>> {
        CDecl::parse_inner(mode, tokens).map_err(|kind| CDeclParseError { kind, tokens })
    }

    /// Implementation of [`CDecl::parse`], returning only the error kind
    /// (also used recursively for function type parameters).
    fn parse_inner<'b>(
        mode: CDeclMode,
        tokens: &'b [CTok<'a>],
    ) -> Result<CDecl<'a>, CDeclParseErrorKind<'a>> {
        use CDeclParseErrorKind as ErrorKind;

        // Like `Option::insert`, but refuses (returning `None`) to overwrite
        // an existing value, so duplicates can be turned into errors.
        trait InsertIfNone<T> {
            fn insert_if_none(&mut self, value: T) -> Option<&mut T>;
        }
        impl<T> InsertIfNone<T> for Option<T> {
            fn insert_if_none(&mut self, value: T) -> Option<&mut T> {
                self.is_none().then(|| self.insert(value))
            }
        }

        // Split the tokens around the (unique) name being declared.
        let (mut left, decl_name, mut right) = {
            let mut decl_names =
                tokens
                    .iter()
                    .copied()
                    .enumerate()
                    .filter_map(|(i, tok)| match tok {
                        CTok::DeclName(name) => Some((i, name)),

                        // HACK(eddyb) this is only allowed due to the (few)
                        // function pointer typedefs in `vk.xml`, which don't
                        // label parameter names in any special way.
                        CTok::StrayIdent(name) if mode == CDeclMode::FuncTypeParam => {
                            Some((i, name))
                        }

                        _ => None,
                    });
            match (decl_names.next(), decl_names.next()) {
                (Some((i, name)), None) => (&tokens[..i], name, &tokens[i + 1..]),
                (None, _) => return Err(ErrorKind::Missing("DeclName")),
                (Some(_), Some(_)) => return Err(ErrorKind::Multiple("DeclName")),
            }
        };

        if mode == CDeclMode::TypeDef {
            // NOTE(eddyb) `typedef` can appear later on as well, so this is
            // unnecessarily strict, but it avoids much more involved tracking.
            left = left
                .strip_prefix(&[CTok::Kw("typedef")])
                .ok_or(ErrorKind::Missing("typedef"))?;
            right = right
                .strip_suffix(&[CTok::Punct(';')])
                .ok_or(ErrorKind::Missing(";"))?;
        }

        // A trailing `: N` is only meaningful (and only stripped) for struct
        // members; in other modes it is left in `right` and rejected later.
        let bitfield_width = match right {
            [rest @ .., CTok::Punct(':'), CTok::IntLit(width_lit)]
                if mode == CDeclMode::StructMember =>
            {
                right = rest;
                Some(width_lit.parse().map_err(ErrorKind::InvalidIntLit)?)
            }
            _ => None,
        };

        // FIXME(eddyb) deduplicate qualifier parsing somehow.
        let mut const_qualif = match left {
            [CTok::Kw("const"), rest @ ..] => {
                left = rest;
                Some(())
            }
            _ => None,
        };

        let mut ty = CType::Base(match left {
            [CTok::Kw("struct"), CTok::TypeName(name), rest @ ..] => {
                left = rest;
                CBaseType {
                    struct_tag: true,
                    name,
                }
            }
            [CTok::TypeName(name) | CTok::Kw(name @ "void"), rest @ ..] => {
                left = rest;
                CBaseType {
                    struct_tag: false,
                    name,
                }
            }
            _ => return Err(ErrorKind::Missing("TypeName")),
        });

        // This is the core of the C declaration parsing strategy: we have some
        // type `T` (held in the variable `ty`) and tokens to either side of the
        // name being declared, and at every step of the loops below there is a
        // "closest binding" (postfix) "type operator", which we pattern-match
        // from its side and then apply to `T`, replacing `T` with any of:
        // - `T*` pointers (like Rust `*T`), from `T* ...`
        //   (only `left` side "type operator", and it takes precedence, making
        //   array-of-pointers much easier to spell out than pointer-to-array)
        // - `T[N]` arrays (like Rust `[T; N]`), from `T ...[N]`
        // - `T(A, B, C)` functions, from `T ...(A, B, C)`
        //   (Rust only has pointers to such types, `fn(A, B, C) -> T`)
        //
        // Notably, both sides are consumed outside-in (`left` LTR, `right` RTL),
        // converging on the middle (where the name being declared is), and that
        // can get confusing (an older comment below also tried to explain it).
        //
        // Once we run out of "type operators", and the declaration isn't trivial,
        // only syntax left is parenthesization *around* the name being declared,
        // with everything inside the parentheses applying *on top of* everything
        // outside: but we've consumed everything outside so we're actually left
        // with `T (...)` and we can simply drop the parentheses!
        while !left.is_empty() || !right.is_empty() {
            while let Some((&leftmost, after_leftmost)) = left.split_first() {
                match leftmost {
                    CTok::Kw("const") => {
                        const_qualif
                            .insert_if_none(())
                            .ok_or(ErrorKind::Multiple("const"))?;
                    }
                    CTok::Punct('*') => {
                        ty = CType::Ptr {
                            implicit_for_decay: false,
                            is_const: const_qualif.take().is_some(),
                            pointee: Box::new(ty),
                        };
                    }

                    // Outermost parentheses around the name being declared,
                    // handled together after both `left` and `right` loops.
                    CTok::Punct('(') => break,

                    _ => return Err(ErrorKind::UnsupportedLeftmostToken(leftmost)),
                }
                left = after_leftmost;
            }
            'right: while let Some(&rightmost) = right.last() {
                // NOTE(eddyb) outermost (i.e. rightmost) suffixes apply first,
                // and the only way this is "intuitive" is that e.g. a 2D array
                // like `T m[A][B]` means `typeof(m[i][j]) = T`, and the lvalue
                // syntax has to match the declaration (so `i < A` and `j < B`),
                // IOW it's equivalent to `(T[B]) m[A]` / `typeof((m[i])[j]) = T`
                // (if C had type parenthesization, or via C++ type aliases).
                match rightmost {
                    CTok::Punct(']' | ')') => {}

                    _ => return Err(ErrorKind::UnsupportedRightmostToken(rightmost)),
                }

                // As `rightmost` is `]`/`)`, the matching `[`/`(` must be found.
                let (before_rightmost_group, rightmost_group) = {
                    let mut i = right.len() - 1;
                    let mut nesting = 0;
                    loop {
                        let checked_dec =
                            |x: usize| x.checked_sub(1).ok_or(ErrorKind::UnbalancedBrackets);
                        match right[i] {
                            CTok::Punct(']' | ')') => nesting += 1,
                            CTok::Punct('[' | '(') => nesting = checked_dec(nesting)?,
                            _ => {}
                        }
                        if nesting == 0 {
                            break;
                        }

                        // Outermost parentheses around the name being declared,
                        // handled together after both `left` and `right` loops.
                        if i == 0 && rightmost == CTok::Punct(')') {
                            break 'right;
                        }

                        i = checked_dec(i)?;
                    }
                    right.split_at(i)
                };

                match rightmost_group {
                    [CTok::Punct('['), len @ .., CTok::Punct(']')] => {
                        ty = CType::Array {
                            element: Box::new(ty),
                            len: match len {
                                [CTok::ValueName(name)] => CArrayLen::Named(name),
                                [CTok::IntLit(lit)] => CArrayLen::Literal(
                                    lit.parse().map_err(ErrorKind::InvalidIntLit)?,
                                ),
                                _ => return Err(ErrorKind::UnsupportedArrayLength),
                            },
                        };
                    }
                    [CTok::Punct('('), params @ .., CTok::Punct(')')] => {
                        if const_qualif.is_some() {
                            return Err(ErrorKind::Unused("const"));
                        }

                        let params = match params {
                            [] => return Err(ErrorKind::Missing("parameters")),
                            [CTok::Kw("void")] => vec![],
                            _ => params
                                .split(|&tok| tok == CTok::Punct(','))
                                .map(|param| CDecl::parse_inner(CDeclMode::FuncTypeParam, param))
                                .collect::<Result<_, _>>()?,
                        };
                        ty = CType::Func {
                            ret_ty: Some(ty).filter(|ty| *ty != CType::VOID).map(Box::new),
                            params,
                        };
                    }
                    // Mismatched pair, e.g. `( ... ]`.
                    _ => return Err(ErrorKind::UnbalancedBrackets),
                }
                right = before_rightmost_group;
            }

            // Outermost parentheses around the name being declared, handled here
            // to ensure there is nothing else left around them, and can therefore
            // be cleanly removed.
            if let ([CTok::Punct('('), left_inner @ ..], [right_inner @ .., CTok::Punct(')')]) =
                (left, right)
            {
                left = left_inner;
                right = right_inner;
            } else if !left.is_empty() || !right.is_empty() {
                // At this point `left` is empty or starts with `(`, and `right`
                // is empty or ends with an unmatched `)`: without a pair to
                // strip, another iteration could not consume anything, so bail
                // out instead of spinning forever (e.g. `void (*f` or `void f)`).
                return Err(ErrorKind::UnbalancedBrackets);
            }
        }

        // NOTE(eddyb) parameters to functions decay "into" pointers, but because
        // we control the typesystem, we can keep both the array types, and the
        // implicit pointer, closer to Rust e.g. `&[T; N]` arguments.
        if let (CDeclMode::FuncParam, CType::Array { .. }) = (mode, &ty) {
            ty = CType::Ptr {
                implicit_for_decay: true,
                is_const: const_qualif.take().is_some(),
                pointee: Box::new(ty),
            };
        }

        // A `const` that no pointer absorbed has nowhere to be recorded.
        if const_qualif.is_some() {
            return Err(ErrorKind::Unused("const"));
        }

        Ok(CDecl {
            ty,
            name: decl_name,
            bitfield_width,
        })
    }
}

analysis/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
mod cdecl;
12
mod xml;
23

34
use std::{fs, path::Path};

0 commit comments

Comments
 (0)