From e1b8c2f5e57436b515001a70e717bba455b1da80 Mon Sep 17 00:00:00 2001
From: Will Wray <wjwray@gmail.com>
Date: Thu, 24 Nov 2022 07:09:15 -0400
Subject: [PATCH] Add PP_NUMBER, remove CPP_INTEGER, CPP_FLOAT

A simple proof of concept change that fixes #79.
With it, pcpp can do codegen using the IREPEAT library.

I believe it's conceptually correct, but my Python may not be;
please test this against your suite and review the method
(hack) carefully. There's not much code! Mostly deletions.

The change removes CPP_INTEGER, effectively replacing it with
PP_NUMBER, and entirely removes CPP_FLOAT as superfluous for
preprocessing purposes.

pp-number is sufficient for preprocessing up to stage 4.

The pp-number regex in the issue is incorrect, lifted from
unpublished WG21 https://isocpp.org/files/papers/D2180R0.html
"pp-number makes cpp dumber" (best proposal title ever).

Instead, I crafted a regex based on the latest C++ draft
https://eel.is/c++draft/lex.ppnumber#ntref:pp-number
which accepts the ' character as a digit separator:

  regex string   r'\.?\d(\.|[\w_]|\'[\w_]|[eEpP][-+])*'

(It also admits binary literals, with digit separators of course,
 so support for them can now be added to the Value parsing code.)

Only the conditional evaluator is required to interpret the
numbers as integer constant expressions.

This is achieved by hacky means:

    def p_expression_number(p):
        'expression : PP_NUMBER'
        try:
            p[0] = Value(p[1])
        except:
            p[0] = p[1]

The idea is that if the parsed string p[1] can be interpreted as
an integer constant-expression Value(p[1]) then do so, otherwise
simply pass through the string for possible further pasting and
processing.

A robust method might check p[1] against the CPP_INTEGER regex
(removed in this commit) for a full match, consuming all input.
On the other hand, relying on Value to validate the input while
parsing and to raise an exception on failure may be Pythonic.

It seems that pp-number itself is a hack in the standard; I see
no way to incorporate pp-number alongside INTEGER and FLOAT tokens
meaningful in C; but then there's no need to. Happy Thanksgiving!
---
 pcpp/evaluator.py | 10 +++++++---
 pcpp/parser.py    | 13 +++++--------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/pcpp/evaluator.py b/pcpp/evaluator.py
index 38abe20..498ec1f 100644
--- a/pcpp/evaluator.py
+++ b/pcpp/evaluator.py
@@ -357,7 +357,7 @@ def __gt__(self, other):
 
 # The subset of tokens from Preprocessor used in preprocessor expressions
 tokens = (
-   'CPP_ID', 'CPP_INTEGER', 'CPP_CHAR', 'CPP_STRING',
+   'CPP_ID', 'PP_NUMBER', 'CPP_CHAR', 'CPP_STRING',
    'CPP_PLUS', 'CPP_MINUS', 'CPP_STAR', 'CPP_FSLASH', 'CPP_PERCENT', 'CPP_BAR',
    'CPP_AMPERSAND', 'CPP_TILDE', 'CPP_HAT', 'CPP_LESS', 'CPP_GREATER', 'CPP_EXCLAMATION',
    'CPP_QUESTION', 'CPP_LPAREN', 'CPP_RPAREN',
@@ -394,8 +394,12 @@ def p_error(p):
         raise SyntaxError("at EOF")
 
 def p_expression_number(p):
-    'expression : CPP_INTEGER'
-    p[0] = Value(p[1])
+    'expression : PP_NUMBER'
+    try:
+        p[0] = Value(p[1])
+    except:
+        p[0] = p[1]
+
 
 def p_expression_character(p):
     'expression : CPP_CHAR'
diff --git a/pcpp/parser.py b/pcpp/parser.py
index 8e6a10a..6e55987 100644
--- a/pcpp/parser.py
+++ b/pcpp/parser.py
@@ -27,7 +27,7 @@
 # -----------------------------------------------------------------------------
 
 tokens = (
-   'CPP_ID','CPP_INTEGER', 'CPP_FLOAT', 'CPP_STRING', 'CPP_CHAR', 'CPP_WS', 'CPP_LINECONT', 'CPP_COMMENT1', 'CPP_COMMENT2',
+   'CPP_ID', 'PP_NUMBER', 'CPP_STRING', 'CPP_CHAR', 'CPP_WS', 'CPP_LINECONT', 'CPP_COMMENT1', 'CPP_COMMENT2',
    'CPP_POUND','CPP_DPOUND', 'CPP_PLUS', 'CPP_MINUS', 'CPP_STAR', 'CPP_FSLASH', 'CPP_PERCENT', 'CPP_BAR',
    'CPP_AMPERSAND', 'CPP_TILDE', 'CPP_HAT', 'CPP_LESS', 'CPP_GREATER', 'CPP_EQUAL', 'CPP_EXCLAMATION',
    'CPP_QUESTION', 'CPP_LPAREN', 'CPP_RPAREN', 'CPP_LBRACKET', 'CPP_RBRACKET', 'CPP_LCURLY', 'CPP_RCURLY',
@@ -111,15 +111,12 @@ def t_CPP_LINECONT(t):
 # Identifier
 t_CPP_ID = r'[A-Za-z_][\w_]*'
 
-# Integer literal
-def CPP_INTEGER(t):
-    r'(((((0x)|(0X))[0-9a-fA-F]+)|(\d+))([uU][lL]|[lL][uU]|[uU]|[lL])?)'
+# Preprocessor number
+def PP_NUMBER(t):
+    r'\.?\d(\.|[\w_]|\'[\w_]|[eEpP][-+])*'
     return t
 
-t_CPP_INTEGER = CPP_INTEGER
-
-# Floating literal
-t_CPP_FLOAT = r'((\d+)(\.\d+)(e(\+|-)?(\d+))?|(\d+)e(\+|-)?(\d+))([lL]|[fF])?'
+t_PP_NUMBER = PP_NUMBER
 
 # String literal
 def t_CPP_STRING(t):