source/compiler-core/slang-lexer.cpp

// slang-lexer.cpp
#include "slang-lexer.h"

// This file implements the lexer/scanner, which is responsible for taking a raw stream of
// input bytes and turning it into semantically useful tokens.
//

#include "core/slang-char-encode.h"
#include "slang-core-diagnostics.h"
#include "slang-name.h"
#include "slang-source-loc.h"

namespace Slang
{
// Build a stand-alone end-of-file token: its type is `TokenType::EndOfFile`,
// its content is the empty string, and its location is a default-constructed
// `SourceLoc`.
Token TokenReader::getEndOfFileToken()
{
    return Token(TokenType::EndOfFile, UnownedStringSlice::fromLiteral(""), SourceLoc());
}

// Return a pointer to the first token in the list.
// The list is asserted to be non-empty.
const Token* TokenList::begin() const
{
    SLANG_ASSERT(m_tokens.getCount());
    return &m_tokens[0];
}

// Return the "end" pointer for iterating the list.
//
// This is the address of the *last* stored token, which is asserted to be
// the terminating end-of-file token, so the EOF token itself is excluded
// from the `[begin(), end())` range.
const Token* TokenList::end() const
{
    SLANG_ASSERT(m_tokens.getCount());

    const auto lastIndex = m_tokens.getCount() - 1;
    SLANG_ASSERT(m_tokens[lastIndex].type == TokenType::EndOfFile);
    return &m_tokens[lastIndex];
}

// Default-construct a span whose begin and end pointers are both null.
TokenSpan::TokenSpan()
    : m_begin(nullptr), m_end(nullptr)
{
}

// Default-construct a reader with a null cursor and a null end pointer.
// The lookahead token is refreshed right away; with a null cursor that
// yields the synthesized end-of-file token.
TokenReader::TokenReader()
    : m_cursor(nullptr), m_end(nullptr)
{
    _updateLookaheadToken();
}

// Return a mutable reference to the cached lookahead token, without
// advancing the reader.
Token& TokenReader::peekToken()
{
    return m_nextToken;
}

// Return the type of the cached lookahead token, without advancing.
TokenType TokenReader::peekTokenType() const
{
    return m_nextToken.type;
}

// Return the source location of the cached lookahead token, without advancing.
SourceLoc TokenReader::peekLoc() const
{
    return m_nextToken.loc;
}

// Consume the current lookahead token and return it.
//
// The cursor only moves when it has not yet reached `m_end`, so calling this
// repeatedly at the end of the sequence keeps returning the end-of-file
// lookahead.
Token TokenReader::advanceToken()
{
    Token consumed = m_nextToken;

    if (m_cursor != m_end)
    {
        ++m_cursor;
    }

    _updateLookaheadToken();
    return consumed;
}

// Refresh the cached lookahead token from the current cursor position.
void TokenReader::_updateLookaheadToken()
{
    // A non-null cursor is dereferenced even when it equals `m_end`. This
    // relies on the invariant that every token list is terminated by an
    // end-of-file token, so there is always a readable token "one past the
    // end" of any sub-sequence. A null cursor gets a synthesized EOF token.
    //
    if (m_cursor)
    {
        m_nextToken = *m_cursor;
    }
    else
    {
        m_nextToken = getEndOfFileToken();
    }

    // When the reader is sitting at the end of its sub-sequence, present the
    // token as end-of-file regardless of what it really is, so that clients
    // expecting a terminating EOF will find one.
    //
    // TODO: We might eventually want a way to look at the actual token type
    // and not just use EOF in all cases: e.g., when emitting diagnostic
    // messages that include the token that is seen.
    //
    if (m_cursor == m_end)
    {
        m_nextToken.type = TokenType::EndOfFile;
    }
}

// Lexer

// Prepare the lexer to scan the content of `sourceView` from its beginning.
//
// The sink, name pool and memory arena pointers are stored as given.
void Lexer::initialize(
    SourceView* sourceView,
    DiagnosticSink* sink,
    NamePool* namePool,
    MemoryArena* memoryArena)
{
    // Remember the collaborators handed to us.
    m_sourceView = sourceView;
    m_sink = sink;
    m_namePool = namePool;
    m_memoryArena = memoryArena;

    // Scan the view's text from its first byte to its last.
    const auto text = sourceView->getContent();
    m_begin = text.begin();
    m_end = text.end();
    m_cursor = m_begin;

    // Source locations are computed relative to the start of the view's range.
    m_startLoc = sourceView->getRange().begin;

    m_lexerFlags = 0;

    // The first token read from a translation unit is treated as being at the
    // start of a line, and *also* as coming after whitespace (conceptually
    // both the end-of-file and beginning-of-file pseudo-tokens are whitespace).
    m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
}

// The destructor has nothing to do here; its body is empty.
Lexer::~Lexer() {}

// Sentinel value returned by the byte and code-point readers below when the
// cursor has reached the end of the input.
enum
{
    kEOF = -1
};

// Look at the next input byte without consuming it.
//
// No handling of escaped newlines, non-ASCII code points, source locations,
// etc. is performed. Returns `kEOF` when the cursor is at the end of input.
static int _peekRaw(Lexer* lexer)
{
    if (lexer->m_cursor != lexer->m_end)
    {
        // There is still input left: report the byte under the cursor.
        return *lexer->m_cursor;
    }

    return kEOF;
}

// Consume one input byte without any special handling (the consuming
// counterpart of `_peekRaw`).
//
// Returns the byte that was consumed, or `kEOF` (leaving the cursor where it
// is) when the cursor is already at the end of input.
static int _advanceRaw(Lexer* lexer)
{
    if (lexer->m_cursor != lexer->m_end)
    {
        const int rawByte = *lexer->m_cursor;
        ++lexer->m_cursor;
        return rawByte;
    }

    return kEOF;
}

// Finish consuming an end-of-line sequence whose first byte `c` has already
// been consumed by the caller.
//
// The recognized end-of-line sequences are:
//
//  "\n"
//  "\r"
//  "\r\n"
//  "\n\r"
//
// The longest match is always taken: if the next raw byte is the *other*
// newline character, it is consumed as part of the same line ending.
//
static void _handleNewLineInner(Lexer* lexer, int c)
{
    SLANG_ASSERT(c == '\n' || c == '\r');

    // XOR-ing with ('\n' ^ '\r') maps '\n' to '\r' and vice versa, which
    // gives the byte that would complete a two-byte line ending.
    const int pairedByte = c ^ ('\n' ^ '\r');

    const int nextByte = _peekRaw(lexer);
    if (nextByte == pairedByte)
    {
        _advanceRaw(lexer);
    }
}

// Look ahead at a code point without consuming anything, dealing with
// complications like escaped newlines and multi-byte UTF-8 sequences.
//
// `offset` selects which upcoming code point to return: 0 (the default) is
// the next one, 1 is the one after that, and so on. Returns `kEOF` if the end
// of input is reached at the start of any of the code points that are walked.
static int _peek(Lexer* lexer, int offset = 0)
{
    // Byte offset from `m_cursor` of the next unread byte.
    int pos = 0;
    int c = kEOF;

    do
    {
        // Use `>=` rather than `==` so that the bound check cannot be stepped
        // over if an escaped newline at the very end of the input has already
        // moved `pos` past `m_end`.
        if (lexer->m_cursor + pos >= lexer->m_end)
            return kEOF;

        c = lexer->m_cursor[pos++];

        while (c == '\\')
        {
            // We might have a backslash-escaped newline.
            // Look at the next byte (if any) to see.
            //
            // Note(tfoley): We are assuming a null-terminated input here,
            // so that we can safely look at the next byte without issue.
            //
            // The byte is only *inspected* here. It must not be consumed
            // unless it really is a newline, because otherwise a look-ahead
            // with `offset > 0` would skip over it.
            int d = lexer->m_cursor[pos];
            if (d != '\r' && d != '\n')
            {
                // A plain backslash: hand it along as-is.
                break;
            }
            pos++;

            // The newline was escaped, so take the code point after *that*,
            // treating "\r\n" and "\n\r" as a single line ending.
            int e = lexer->m_cursor[pos++];
            if ((d ^ e) == ('\r' ^ '\n'))
                c = lexer->m_cursor[pos++];
            else
                c = e;

            // Loop again: the byte after the escaped newline may itself be a
            // backslash that starts another escaped newline.
        }
        if (isUtf8LeadingByte((Byte)c))
        {
            // Back up to the leading byte and decode the whole sequence.
            pos--;
            c = getUnicodePointFromUTF8([&]() { return lexer->m_cursor[pos++]; });
        }
        // Default case is to just hand along the byte we read as an ASCII code point.
    } while (offset--);

    return c;
}

// Get the next code point from the input, and advance the cursor.
//
// Returns `kEOF`, without moving the cursor, when the cursor is already at
// the end of the input. A backslash that is immediately followed by a newline
// sequence is skipped together with that newline, and
// `TokenFlag::ScrubbingNeeded` is added to the lexer's token flags. A byte
// accepted by `isUtf8LeadingByte` is decoded via `getUnicodePointFromUTF8`,
// which consumes the bytes it reads.
static int _advance(Lexer* lexer)
{
    // We are going to loop, but only as a way of handling
    // escaped line endings.
    for (;;)
    {
        // If we are at the end of the input, then the task is easy.
        if (lexer->m_cursor == lexer->m_end)
            return kEOF;

        // Look at the next raw byte, and decide what to do
        int c = *lexer->m_cursor++;

        if (c == '\\')
        {
            // We might have a backslash-escaped newline.
            // Look at the next byte (if any) to see.
            //
            // Note(tfoley): We are assuming a null-terminated input here,
            // so that we can safely look at the next byte without issue.
            int d = *lexer->m_cursor;
            switch (d)
            {
            case '\r':
            case '\n':
                // Consume the first newline byte, then let the helper consume
                // the second byte of a two-byte line ending, if present.
                lexer->m_cursor++;
                _handleNewLineInner(lexer, d);

                // Record that an escaped newline was skipped while reading
                // this token.
                lexer->m_tokenFlags |= TokenFlag::ScrubbingNeeded;

                // Now try again, looking at the character after the
                // escaped newline.
                continue;

            default:
                // Not an escaped newline: the backslash is returned below.
                break;
            }
        }

        // Consume all bytes of a multi-byte UTF-8 sequence: back up to the
        // leading byte and let the decoder read from there.
        if (isUtf8LeadingByte((Byte)c))
        {
            lexer->m_cursor--;
            c = getUnicodePointFromUTF8([&]() { return *lexer->m_cursor++; });
        }

        // Default case is to return the raw byte we saw.
        return c;
    }
}

// Consume a complete end-of-line sequence starting at the cursor.
//
// The first code point is consumed with `_advance`, and the helper then
// consumes the second byte of a two-byte line ending if there is one.
static void _handleNewLine(Lexer* lexer)
{
    _handleNewLineInner(lexer, _advance(lexer));
}

// Consume the body of a line comment.
//
// Scanning stops *before* the terminating newline (or at end of input), so
// the line ending itself is left for the caller to deal with.
static void _lexLineComment(Lexer* lexer)
{
    for (;;)
    {
        const int c = _peek(lexer);
        if (c == '\n' || c == '\r' || c == kEOF)
            return;

        _advance(lexer);
    }
}

// Consume the body of a block comment, up to and including the closing `*/`.
//
// Reaching the end of input before the comment is closed simply stops the
// scan; no diagnostic is issued at present.
static void _lexBlockComment(Lexer* lexer)
{
    for (;;)
    {
        const int c = _peek(lexer);

        if (c == kEOF)
        {
            // TODO(tfoley) diagnostic!
            return;
        }

        if (c == '\n' || c == '\r')
        {
            // Line endings inside the comment go through the newline handler.
            _handleNewLine(lexer);
            continue;
        }

        _advance(lexer);

        // A `*` that is directly followed by `/` closes the comment.
        if (c == '*' && _peek(lexer) == '/')
        {
            _advance(lexer);
            return;
        }
    }
}

// Consume a run of horizontal whitespace (spaces and tabs).
static void _lexHorizontalSpace(Lexer* lexer)
{
    int c = _peek(lexer);
    while (c == ' ' || c == '\t')
    {
        _advance(lexer);
        c = _peek(lexer);
    }
}

// Test whether `codePoint` lies outside the 7-bit ASCII range.
//
// The value 0xFFFFFFFF is explicitly rejected; it is what the `kEOF`
// sentinel (-1) becomes when converted to `unsigned int`.
static bool isNonAsciiCodePoint(unsigned int codePoint)
{
    if (codePoint == 0xFFFFFFFF)
        return false;

    return codePoint > 0x7F;
}

// Consume the characters of an identifier: ASCII letters, ASCII digits,
// underscores, and any non-ASCII code point.
static void _lexIdentifier(Lexer* lexer)
{
    for (;;)
    {
        const int c = _peek(lexer);

        const bool isLower = (c >= 'a') && (c <= 'z');
        const bool isUpper = (c >= 'A') && (c <= 'Z');
        const bool isDigit = (c >= '0') && (c <= '9');
        const bool isIdentifierChar = isLower || isUpper || isDigit || (c == '_') ||
                                      isNonAsciiCodePoint((unsigned int)c);

        // Stop at the first character that cannot be part of an identifier.
        if (!isIdentifierChar)
            return;

        _advance(lexer);
    }
}

// Compute the source location that corresponds to the lexer's cursor: the
// start location plus the cursor's byte offset from the start of the input.
static SourceLoc _getSourceLoc(Lexer* lexer)
{
    const auto byteOffset = lexer->m_cursor - lexer->m_begin;
    return lexer->m_startLoc + byteOffset;
}

// Consume a run of digits for a literal in the given `base`.
//
// Decimal digits are always consumed. Hexadecimal letters (`a`-`f`, `A`-`F`)
// are only consumed when `base` is greater than 10; otherwise they end the
// run. A consumed digit whose value is not valid for `base` is reported
// through the diagnostic sink (when there is one) but is still consumed.
static void _lexDigits(Lexer* lexer, int base)
{
    for (;;)
    {
        const int c = _peek(lexer);

        int digitVal = 0;
        if (c >= '0' && c <= '9')
        {
            digitVal = c - '0';
        }
        else if (c >= 'a' && c <= 'f')
        {
            if (base <= 10)
                return;
            digitVal = 10 + c - 'a';
        }
        else if (c >= 'A' && c <= 'F')
        {
            if (base <= 10)
                return;
            digitVal = 10 + c - 'A';
        }
        else
        {
            // No more digits!
            return;
        }

        if (digitVal >= base)
        {
            if (auto sink = lexer->getDiagnosticSink())
            {
                // The offending digit is passed as a NUL-terminated string.
                char buffer[] = {(char)c, 0};
                sink->diagnose(
                    _getSourceLoc(lexer),
                    LexerDiagnostics::invalidDigitForBase,
                    buffer,
                    base);
            }
        }

        _advance(lexer);
    }
}

// Consume an optional suffix on a numeric literal and return `tokenType`
// unchanged.
//
// The lexer is deliberately liberal here: any run of ASCII alphanumeric
// characters and underscores is accepted, and working out what the suffix
// means is left to the parser and semantic checking logic.
static TokenType _maybeLexNumberSuffix(Lexer* lexer, TokenType tokenType)
{
    for (;;)
    {
        const int c = _peek(lexer);

        const bool isLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        const bool isDigit = (c >= '0' && c <= '9');

        // Stop at the first character that is neither alphanumeric nor `_`.
        if (!isLetter && !isDigit && c != '_')
            return tokenType;

        _advance(lexer);
    }
}

// Test whether `c` introduces an exponent for a literal in the given `base`:
// `e`/`E` for base 10, and `p`/`P` for base 16.
static bool _isNumberExponent(int c, int base)
{
    if (c == 'e' || c == 'E')
        return base == 10;

    if (c == 'p' || c == 'P')
        return base == 16;

    return false;
}

// Consume an optional exponent part of a numeric literal.
//
// Returns true when an exponent (or the special `#INF` marker) was consumed.
// Note that a partial match of `#INF` returns false after having consumed
// the characters that did match.
static bool _maybeLexNumberExponent(Lexer* lexer, int base)
{
    if (_peek(lexer) == '#')
    {
        // Special case #INF
        const auto inf = toSlice("#INF");
        for (const auto expected : inf)
        {
            if (_peek(lexer) != expected)
                return false;

            _advance(lexer);
        }
        return true;
    }

    if (!_isNumberExponent(_peek(lexer), base))
        return false;

    // Consume the exponent marker itself.
    _advance(lexer);

    // An optional sign may precede the exponent digits.
    const int sign = _peek(lexer);
    if (sign == '+' || sign == '-')
    {
        _advance(lexer);
    }

    // TODO(tfoley): it would be an error to not see digits here...

    // The exponent is always lexed as decimal digits.
    _lexDigits(lexer, 10);

    return true;
}

// Lex the remainder of a numeric literal: digits in `base`, then an optional
// exponent, then an optional suffix.
// NOTE(review): the name suggests the caller has already consumed the `.`;
// confirm against the call sites.
//
// The result comes from `_maybeLexNumberSuffix`, which is given
// `TokenType::FloatingPointLiteral` as the token type.
static TokenType _lexNumberAfterDecimalPoint(Lexer* lexer, int base)
{
    _lexDigits(lexer, base);
    _maybeLexNumberExponent(lexer, base);

    return _maybeLexNumberSuffix(lexer, TokenType::FloatingPointLiteral);
}

// Lex a numeric literal in the given `base`, starting at its first digit.
//
// Returns `TokenType::IntegerLiteral` unless a fractional part or an
// exponent was seen, in which case the literal is floating-point.
static TokenType _lexNumber(Lexer* lexer, int base)
{
    // TODO(tfoley): Need to consider whether to allow any kind of digit separator character.

    TokenType tokenType = TokenType::IntegerLiteral;

    // Integral part.
    _lexDigits(lexer, base);

    if (_peek(lexer) == '.')
    {
        // A `.` directly followed by `x` or `r` (as in `123.xxxx` or
        // `123.rrrr`) is not treated as a decimal point, so the `.` is left
        // unconsumed and the literal stays an integer.
        const int afterDot = _peek(lexer, 1);
        if (afterDot != 'x' && afterDot != 'r')
        {
            tokenType = TokenType::FloatingPointLiteral;

            _advance(lexer);
            _lexDigits(lexer, base);
        }
    }

    if (_maybeLexNumberExponent(lexer, base))
    {
        tokenType = TokenType::FloatingPointLiteral;
    }

    // The suffix lexer hands back the token type it was given.
    return _maybeLexNumberSuffix(lexer, tokenType);
}

// Try to read one digit at `*ioCursor`, advancing the cursor past it.
//
// Any `_` characters in front of the digit are skipped (and stay skipped even
// when no digit follows). Decimal digits are always accepted; the letters
// `a`-`f` / `A`-`F` are only accepted when `base` is greater than 10.
//
// Returns the digit's value, or -1 if the cursor is not at a digit. The input
// is read until a non-digit byte is found, so it must be terminated by one.
static int _maybeReadDigit(char const** ioCursor, int base)
{
    char const*& cursor = *ioCursor;

    // TODO: need to decide on digit separator characters
    while (*cursor == '_')
    {
        ++cursor;
    }

    const int c = *cursor;

    if (c >= '0' && c <= '9')
    {
        ++cursor;
        return c - '0';
    }

    // Work out which letter range (if any) the character falls into.
    int firstLetter = 0;
    if (c >= 'a' && c <= 'f')
    {
        firstLetter = 'a';
    }
    else if (c >= 'A' && c <= 'F')
    {
        firstLetter = 'A';
    }
    else
    {
        return -1;
    }

    // Letters only count as digits in bases above ten.
    if (base <= 10)
        return -1;

    ++cursor;
    return 10 + c - firstLetter;
}

// Read an optional base prefix at `*ioCursor` and return the base it selects.
//
//  - `0x` / `0X` selects base 16; both characters are consumed.
//  - `0b` / `0B` selects base 2; both characters are consumed.
//  - `0` followed by a decimal digit selects base 8; only the `0` is consumed.
//  - A `0` followed by anything else gives base 10, with the `0` consumed.
//  - Input that does not start with `0` gives base 10 and consumes nothing.
static int _readOptionalBase(char const** ioCursor)
{
    char const*& cursor = *ioCursor;

    if (*cursor != '0')
        return 10;

    // Consume the leading zero, then inspect what follows it.
    ++cursor;
    const char marker = *cursor;

    if (marker == 'x' || marker == 'X')
    {
        ++cursor;
        return 16;
    }

    if (marker == 'b' || marker == 'B')
    {
        ++cursor;
        return 2;
    }

    if (marker >= '0' && marker <= '9')
        return 8;

    return 10;
}


// Compute the value of an integer literal token.
//
// The token's text is read as an optional base prefix followed by digits.
// Whatever text remains after the digits is stored to `*outSuffix` when that
// pointer is non-null, and `*outIsDecimalBase` (when non-null) is set to
// whether the literal was written in base 10. No overflow checking is done
// while the value is accumulated.
IntegerLiteralValue getIntegerLiteralValue(Token const& token, UnownedStringSlice* outSuffix, bool* outIsDecimalBase)
{
    const UnownedStringSlice text = token.getContent();
    char const* cursor = text.begin();

    const int base = _readOptionalBase(&cursor);

    // Accumulate digits until something that is not a digit is found.
    IntegerLiteralValue value = 0;
    for (int digit = _maybeReadDigit(&cursor, base); digit >= 0;
         digit = _maybeReadDigit(&cursor, base))
    {
        value = value * base + digit;
    }

    if (outIsDecimalBase)
    {
        *outIsDecimalBase = (base == 10);
    }

    if (outSuffix)
    {
        *outSuffix = UnownedStringSlice(cursor, text.end());
    }

    return value;
}

// Compute the value of a floating-point literal token.
//
// The token's text is read as an optional base prefix, a run of digits that
// may contain `.`, an optional `#INF` marker, and an optional exponent.
// Whatever text remains after those parts is stored to `*outSuffix` when that
// pointer is non-null.
//
// The mantissa is accumulated as if it were an integer, while `divisor`
// collects the scaling for the digits seen after the decimal point; the
// division is applied once at the end.
FloatingPointLiteralValue getFloatingPointLiteralValue(
    Token const& token,
    UnownedStringSlice* outSuffix)
{
    FloatingPointLiteralValue value = 0;

    const UnownedStringSlice content = token.getContent();

    char const* cursor = content.begin();
    char const* end = content.end();

    int radix = _readOptionalBase(&cursor);

    bool seenDot = false;
    FloatingPointLiteralValue divisor = 1;
    for (;;)
    {
        // Any `.` encountered in the digit run just switches on fraction mode.
        if (*cursor == '.')
        {
            cursor++;
            seenDot = true;
            continue;
        }

        int digit = _maybeReadDigit(&cursor, radix);
        if (digit < 0)
            break;

        value = value * radix + digit;

        // Each fractional digit scales the final result down by one radix step.
        if (seenDot)
        {
            divisor *= radix;
        }
    }

    if (*cursor == '#')
    {
        // It must be INF
        const auto inf = toSlice("#INF");

        if (UnownedStringSlice(cursor, end).startsWith(inf))
        {
            // The suffix, if any, begins right after the `#INF` marker.
            if (outSuffix)
            {
                *outSuffix = UnownedStringSlice(cursor + inf.getLength(), end);
            }

            value = INFINITY;

            return value;
        }
    }

    // Now read optional exponent
    if (_isNumberExponent(*cursor, radix))
    {
        // Skip the exponent marker.
        cursor++;

        // Optional sign of the exponent.
        bool exponentIsNegative = false;
        switch (*cursor)
        {
        default:
            break;

        case '-':
            exponentIsNegative = true;
            cursor++;
            break;

        case '+':
            cursor++;
            break;
        }

        // The exponent digits are always read as decimal.
        int exponentRadix = 10;
        int exponent = 0;

        for (;;)
        {
            int digit = _maybeReadDigit(&cursor, exponentRadix);
            if (digit < 0)
                break;

            exponent = exponent * exponentRadix + digit;
        }

        // The exponent scales by powers of 2 for hexadecimal literals, and by
        // powers of 10 otherwise.
        FloatingPointLiteralValue exponentBase = 10;
        if (radix == 16)
        {
            exponentBase = 2;
        }

        FloatingPointLiteralValue exponentValue = pow(exponentBase, exponent);

        // A negative exponent is folded into the divisor instead of
        // multiplying the value by a reciprocal.
        if (exponentIsNegative)
        {
            divisor *= exponentValue;
        }
        else
        {
            value *= exponentValue;
        }
    }

    // Apply the accumulated scaling in a single division.
    value /= divisor;

    if (outSuffix)
    {
        *outSuffix = UnownedStringSlice(cursor, end);
    }

    return value;
}

// Consume the body of a string or character literal, up to and including
// the closing `quote` character.
//
// If the end of input or a newline is found before the closing quote, a
// diagnostic is issued (when there is a sink) and scanning stops without
// consuming that newline. Escape sequences are consumed here but not
// decoded.
static void _lexStringLiteralBody(Lexer* lexer, char quote)
{
    for (;;)
    {
        int c = _peek(lexer);
        if (c == quote)
        {
            // Closing quote: consume it and finish.
            _advance(lexer);
            return;
        }

        switch (c)
        {
        case kEOF:
            if (auto sink = lexer->getDiagnosticSink())
            {
                sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral);
            }
            return;

        case '\n':
        case '\r':
            if (auto sink = lexer->getDiagnosticSink())
            {
                sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::newlineInLiteral);
            }
            return;

        case '\\':
            // Need to handle various escape sequence cases
            _advance(lexer);
            switch (_peek(lexer))
            {
            // Single-character escapes: consume the escaped character so
            // that an escaped quote does not terminate the literal.
            case '\'':
            case '\"':
            case '\\':
            case '?':
            case 'a':
            case 'b':
            case 'f':
            case 'n':
            case 'r':
            case 't':
            case 'v':
                _advance(lexer);
                break;

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
                // Octal escape: the first digit is consumed here, and the loop
                // below accepts up to three further octal digits.
                _advance(lexer);
                for (int ii = 0; ii < 3; ++ii)
                {
                    int d = _peek(lexer);
                    if (('0' <= d) && (d <= '7'))
                    {
                        _advance(lexer);
                        continue;
                    }
                    else
                    {
                        break;
                    }
                }
                break;

            case 'x':
                // hexadecimal escape: any number of characters
                _advance(lexer);
                for (;;)
                {
                    int d = _peek(lexer);
                    if (('0' <= d) && (d <= '9') || ('a' <= d) && (d <= 'f') ||
                        ('A' <= d) && (d <= 'F'))
                    {
                        _advance(lexer);
                        continue;
                    }
                    else
                    {
                        break;
                    }
                }
                break;

                // TODO: Unicode escape sequences

                // Any other character after the backslash is not consumed
                // here; the next iteration of the outer loop handles it.
            }
            break;

        default:
            _advance(lexer);
            continue;
        }
    }
}

// Consume the body of a raw string literal, starting at the cursor position
// on entry (which is taken to be the start of the delimiter) and running up
// to and including the closing quote.
//
// The delimiter is the text between the entry position and the first `(`.
// The literal ends at a `"` that is immediately preceded by `)` followed by
// that same delimiter. Reaching the end of input first issues a diagnostic
// (when there is a sink) and stops.
static void _lexRawStringLiteralBody(Lexer* lexer)
{
    // Start of the delimiter text.
    const char* start = lexer->m_cursor;
    // Position of the first `(`, i.e. one past the end of the delimiter;
    // null until that `(` has been seen.
    const char* endOfDelimiter = nullptr;
    for (;;)
    {
        int c = _peek(lexer);
        if (c == '(' && endOfDelimiter == nullptr)
            endOfDelimiter = lexer->m_cursor;
        if (c == '\"')
        {
            if (!endOfDelimiter)
            {
                // A quote showed up before the opening `(`, so it would have
                // to be part of the delimiter, which is not allowed.
                if (auto sink = lexer->getDiagnosticSink())
                {
                    sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::quoteCannotBeDelimiter);
                }
            }
            else
            {
                // Take the text just before this quote that has the same
                // length as the delimiter, and check that it comes after the
                // opening `(`.
                auto testStart = lexer->m_cursor - (endOfDelimiter - start);
                if (testStart > endOfDelimiter)
                {
                    auto testDelimiter = UnownedStringSlice(testStart, lexer->m_cursor);
                    auto delimiter = UnownedStringSlice(start, endOfDelimiter);
                    // The literal closes on `)` + delimiter + `"`.
                    if (*(testStart - 1) == ')' && testDelimiter == delimiter)
                    {
                        _advance(lexer);
                        return;
                    }
                }
            }
        }

        switch (c)
        {
        case kEOF:
            if (auto sink = lexer->getDiagnosticSink())
            {
                sink->diagnose(_getSourceLoc(lexer), LexerDiagnostics::endOfFileInLiteral);
            }
            return;
        default:
            // Everything else, including newlines and quotes that do not
            // close the literal, is part of the body.
            _advance(lexer);
            continue;
        }
    }
}

// Extract the body of a raw string literal token.
//
// The code below assumes the token text has the shape
// `R"` delimiter `(` body `)` delimiter `"`. An empty slice is returned when
// the text is too short to hold a non-empty body.
UnownedStringSlice getRawStringLiteralTokenValue(Token const& token)
{
    const auto text = token.getContent();

    // Five characters or fewer leaves no room for a body.
    if (text.getLength() <= 5)
        return UnownedStringSlice();

    // Skip the two leading characters, then find where the delimiter ends.
    const auto delimBegin = text.begin() + 2;
    auto openParen = delimBegin;
    while (openParen < text.end() && *openParen != '(')
    {
        ++openParen;
    }
    const auto delimLength = openParen - delimBegin;

    // The body starts right after the `(`, and ends before the trailing `)`,
    // delimiter and closing quote.
    const auto bodyBegin = delimBegin + delimLength + 1;
    const auto bodyEnd = text.end() - delimLength - 2;

    if (bodyEnd <= bodyBegin)
        return UnownedStringSlice();

    return UnownedStringSlice(bodyBegin, bodyEnd);
}

// Compute the value of a string or character literal token, with its
// surrounding quotes removed and its escape sequences decoded.
//
// Token text that starts with `R` is handled as a raw string literal, which
// has no escape sequences. Otherwise the text must start with a `'` or `"`
// and end with the matching quote.
//
// Octal and hexadecimal escapes are each appended as a single `char`, so
// values that do not fit in a `char` are truncated.
String getStringLiteralTokenValue(Token const& token)
{
    SLANG_ASSERT(token.type == TokenType::StringLiteral || token.type == TokenType::CharLiteral);

    if (token.getContent().startsWith("R"))
        return getRawStringLiteralTokenValue(token);

    const UnownedStringSlice content = token.getContent();

    char const* cursor = content.begin();
    char const* end = content.end();
    SLANG_UNREFERENCED_VARIABLE(end);

    auto quote = *cursor++;
    SLANG_ASSERT(quote == '\'' || quote == '"');

    StringBuilder valueBuilder;
    for (;;)
    {
        SLANG_ASSERT(cursor != end);

        auto c = *cursor++;

        // If we see a closing quote, then we are at the end of the string literal
        if (c == quote)
        {
            SLANG_ASSERT(cursor == end);
            return valueBuilder.produceString();
        }

        // Characters that don't begin escape sequences are easy;
        // just append them to the buffer and move on.
        if (c != '\\')
        {
            valueBuilder.append(c);
            continue;
        }

        // Now we look at another character to figure out the kind of
        // escape sequence we are dealing with:

        char d = *cursor++;

        switch (d)
        {
        // Simple characters that just needed to be escaped
        case '\'':
        case '\"':
        case '\\':
        case '?':
            valueBuilder.append(d);
            continue;

        // Traditional escape sequences for special characters
        case 'a':
            valueBuilder.append('\a');
            continue;
        case 'b':
            valueBuilder.append('\b');
            continue;
        case 'f':
            valueBuilder.append('\f');
            continue;
        case 'n':
            valueBuilder.append('\n');
            continue;
        case 'r':
            valueBuilder.append('\r');
            continue;
        case 't':
            valueBuilder.append('\t');
            continue;
        case 'v':
            valueBuilder.append('\v');
            continue;

        // Octal escape: up to 3 characters
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
            {
                // Back up so that the loop below also reads the first digit.
                cursor--;
                int value = 0;
                for (int ii = 0; ii < 3; ++ii)
                {
                    d = *cursor;
                    if (('0' <= d) && (d <= '7'))
                    {
                        value = value * 8 + (d - '0');

                        cursor++;
                        continue;
                    }
                    else
                    {
                        break;
                    }
                }

                // TODO: add support for appending an arbitrary code point?
                valueBuilder.append((char)value);
            }
            continue;

        // Hexadecimal escape: any number of characters
        case 'x':
            {
                int value = 0;
                for (;;)
                {
                    d = *cursor++;
                    int digitValue = 0;
                    if (('0' <= d) && (d <= '9'))
                    {
                        digitValue = d - '0';
                    }
                    else if (('a' <= d) && (d <= 'f'))
                    {
                        // The letters `a`-`f` stand for the values 10-15.
                        digitValue = 10 + (d - 'a');
                    }
                    else if (('A' <= d) && (d <= 'F'))
                    {
                        // The letters `A`-`F` stand for the values 10-15.
                        digitValue = 10 + (d - 'A');
                    }
                    else
                    {
                        // Not a hex digit: un-read it and stop.
                        cursor--;
                        break;
                    }

                    value = value * 16 + digitValue;
                }

                // TODO: add support for appending an arbitrary code point?
                valueBuilder.append((char)value);
            }
            continue;

            // TODO: Unicode escape sequences
        }

        // Any other character after a backslash is not a recognized escape.
        // Both the backslash and that character are dropped, and scanning
        // carries on with the rest of the literal.
    }
}

// Compute the file name that a file-name token refers to.
//
// Escape sequences are deliberately *not* processed here (this is important
// on Windows, where `\` is a valid path separator character). The first and
// last characters of the token text are simply trimmed off to remove the
// enclosing `""` or `<>`.
String getFileNameTokenValue(Token const& token)
{
    const UnownedStringSlice quoted = token.getContent();

    char const* const nameBegin = quoted.begin() + 1;
    char const* const nameEnd = quoted.end() - 1;

    return String(nameBegin, nameEnd);
}

// Lex a single token starting at the lexer's current position and return its type.
//
// The first code point is inspected with `_peek` and dispatched on by the switch
// below. Each case consumes input through `_advance` or one of the `_lex*` helpers
// and returns the matching `TokenType`. Multi-character operators are recognized by
// consuming the first character and then peeking at what follows.
//
// Only the token *type* is produced here; `Lexer::lexToken` builds the token text
// from how far the cursor moved during this call.
//
// Code points that match no case fall out of the switch: non-ASCII code points are
// lexed as identifiers, and anything else is consumed, reported to the diagnostic
// sink (when the lexer has one) and returned as `TokenType::Invalid`.
static TokenType _lexTokenImpl(Lexer* lexer)
{
    int nextCodePoint = _peek(lexer);
    switch (nextCodePoint)
    {
    default:
        // Not handled by any case below; dealt with after the switch.
        break;

    case kEOF:
        // Note: the end-of-input marker is reported without advancing.
        return TokenType::EndOfFile;

    case '\r':
    case '\n':
        _handleNewLine(lexer);
        return TokenType::NewLine;

    case ' ':
    case '\t':
        _lexHorizontalSpace(lexer);
        return TokenType::WhiteSpace;

    case '.':
        // A `.` may start a number (`.5`), or be one of `.`, `..`, `...`.
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
            return _lexNumberAfterDecimalPoint(lexer, 10);

        case '.':
            // Note: consuming the second `.` here means that
            // we cannot back up and return a `.` token by itself
            // any more. We thus end up having distinct tokens for
            // `.`, `..`, and `...` even though the `..` case is
            // not part of HLSL.
            //
            _advance(lexer);
            switch (_peek(lexer))
            {
            case '.':
                _advance(lexer);
                return TokenType::Ellipsis;

            default:
                return TokenType::DotDot;
            }

        default:
            return TokenType::Dot;
        }

    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
        // A non-zero leading digit always starts a base-10 number.
        return _lexNumber(lexer, 10);

    case '0':
        {
            // A leading `0` selects the radix from the character that follows it.
            // The location is captured first so that the octal diagnostic below
            // points at the `0` itself.
            auto loc = _getSourceLoc(lexer);
            _advance(lexer);
            switch (_peek(lexer))
            {
            default:
                // A lone `0`, possibly followed by a literal suffix.
                return _maybeLexNumberSuffix(lexer, TokenType::IntegerLiteral);

            case '.':
                // `_peek(lexer, 1)` is used to look at what follows the `.`
                // (presumably an offset from the current position; confirm
                // against `_peek`'s definition).
                switch (_peek(lexer, 1))
                {
                    // 0.xxxx or 0.rrrr
                    //
                    // The `.` is left unconsumed here, so the `0` is returned
                    // as an integer literal and the `.` starts the next token.
                case 'x':
                case 'r':
                    return _maybeLexNumberSuffix(lexer, TokenType::IntegerLiteral);
                default:
                    _advance(lexer);
                    return _lexNumberAfterDecimalPoint(lexer, 10);
                }

            case 'x':
            case 'X':
                // `0x` / `0X` prefix: base 16.
                _advance(lexer);
                return _lexNumber(lexer, 16);

            case 'b':
            case 'B':
                // `0b` / `0B` prefix: base 2.
                _advance(lexer);
                return _lexNumber(lexer, 2);

            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9':
                // `0` followed by another digit is lexed in base 8, after
                // emitting the `octalLiteral` diagnostic when a sink is present.
                // The digit itself is not consumed here; `_lexNumber` sees it.
                if (auto sink = lexer->getDiagnosticSink())
                {
                    sink->diagnose(loc, LexerDiagnostics::octalLiteral);
                }
                return _lexNumber(lexer, 8);
            }
        }

    // Identifier start characters: ASCII letters and `_`.
    // `R` is deliberately missing from this list; it has its own case below
    // because it can also introduce a raw string literal.
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'e':
    case 'f':
    case 'g':
    case 'h':
    case 'i':
    case 'j':
    case 'k':
    case 'l':
    case 'm':
    case 'n':
    case 'o':
    case 'p':
    case 'q':
    case 'r':
    case 's':
    case 't':
    case 'u':
    case 'v':
    case 'w':
    case 'x':
    case 'y':
    case 'z':
    case 'A':
    case 'B':
    case 'C':
    case 'D':
    case 'E':
    case 'F':
    case 'G':
    case 'H':
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'O':
    case 'P':
    case 'Q':
    case 'S':
    case 'T':
    case 'U':
    case 'V':
    case 'W':
    case 'X':
    case 'Y':
    case 'Z':
    case '_':
        _lexIdentifier(lexer);
        return TokenType::Identifier;
    case 'R':
        // `R"` starts a raw string literal; any other continuation means the
        // `R` was just the first character of an identifier.
        _advance(lexer);
        switch (_peek(lexer))
        {
        default:
            _lexIdentifier(lexer);
            return TokenType::Identifier;
        case '\"':
            _advance(lexer);
            _lexRawStringLiteralBody(lexer);
            return TokenType::StringLiteral;
        }

    // String and character literals share one body lexer; the second argument
    // is the quote character that was just consumed.
    case '\"':
        _advance(lexer);
        _lexStringLiteralBody(lexer, '\"');
        return TokenType::StringLiteral;

    case '\'':
        _advance(lexer);
        _lexStringLiteralBody(lexer, '\'');
        return TokenType::CharLiteral;


    // Operators: consume the first character, then peek to see whether it is
    // the start of a longer operator.
    case '+':
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '+':
            _advance(lexer);
            return TokenType::OpInc;
        case '=':
            _advance(lexer);
            return TokenType::OpAddAssign;
        default:
            return TokenType::OpAdd;
        }

    case '-':
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '-':
            _advance(lexer);
            return TokenType::OpDec;
        case '=':
            _advance(lexer);
            return TokenType::OpSubAssign;
        case '>':
            _advance(lexer);
            return TokenType::RightArrow;
        default:
            return TokenType::OpSub;
        }

    case '*':
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '=':
            _advance(lexer);
            return TokenType::OpMulAssign;
        default:
            return TokenType::OpMul;
        }

    case '/':
        // `/` is either a division operator or the start of a comment.
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '=':
            _advance(lexer);
            return TokenType::OpDivAssign;
        case '/':
            _advance(lexer);
            _lexLineComment(lexer);
            return TokenType::LineComment;
        case '*':
            _advance(lexer);
            _lexBlockComment(lexer);
            return TokenType::BlockComment;
        default:
            return TokenType::OpDiv;
        }

    case '%':
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '=':
            _advance(lexer);
            return TokenType::OpModAssign;
        default:
            return TokenType::OpMod;
        }

    case '|':
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '|':
            _advance(lexer);
            return TokenType::OpOr;
        case '=':
            _advance(lexer);
            return TokenType::OpOrAssign;
        default:
            return TokenType::OpBitOr;
        }

    case '&':
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '&':
            _advance(lexer);
            return TokenType::OpAnd;
        case '=':
            _advance(lexer);
            return TokenType::OpAndAssign;
        default:
            return TokenType::OpBitAnd;
        }

    case '^':
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '=':
            _advance(lexer);
            return TokenType::OpXorAssign;
        default:
            return TokenType::OpBitXor;
        }

    case '>':
        // `>`, `>=`, `>>`, `>>=`
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '>':
            _advance(lexer);
            switch (_peek(lexer))
            {
            case '=':
                _advance(lexer);
                return TokenType::OpShrAssign;
            default:
                return TokenType::OpRsh;
            }
        case '=':
            _advance(lexer);
            return TokenType::OpGeq;
        default:
            return TokenType::OpGreater;
        }

    case '<':
        // `<`, `<=`, `<<`, `<<=`
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '<':
            _advance(lexer);
            switch (_peek(lexer))
            {
            case '=':
                _advance(lexer);
                return TokenType::OpShlAssign;
            default:
                return TokenType::OpLsh;
            }
        case '=':
            _advance(lexer);
            return TokenType::OpLeq;
        default:
            return TokenType::OpLess;
        }

    case '=':
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '=':
            _advance(lexer);
            return TokenType::OpEql;
        default:
            return TokenType::OpAssign;
        }

    case '!':
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '=':
            _advance(lexer);
            return TokenType::OpNeq;
        default:
            return TokenType::OpNot;
        }

    case '#':
        // `#`, `##`, or the `#?` completion-request marker.
        _advance(lexer);
        switch (_peek(lexer))
        {
        case '#':
            _advance(lexer);
            return TokenType::PoundPound;

        case '?':
            _advance(lexer);
            return TokenType::CompletionRequest;

        default:
            return TokenType::Pound;
        }

    case '~':
        _advance(lexer);
        return TokenType::OpBitNot;

    case ':':
        {
            // `:` or `::`
            _advance(lexer);
            if (_peek(lexer) == ':')
            {
                _advance(lexer);
                return TokenType::Scope;
            }
            return TokenType::Colon;
        }
    case ';':
        _advance(lexer);
        return TokenType::Semicolon;
    case ',':
        _advance(lexer);
        return TokenType::Comma;

    // Bracketing punctuation: always a single character.
    case '{':
        _advance(lexer);
        return TokenType::LBrace;
    case '}':
        _advance(lexer);
        return TokenType::RBrace;
    case '[':
        _advance(lexer);
        return TokenType::LBracket;
    case ']':
        _advance(lexer);
        return TokenType::RBracket;
    case '(':
        _advance(lexer);
        return TokenType::LParent;
    case ')':
        _advance(lexer);
        return TokenType::RParent;

    case '?':
        _advance(lexer);
        return TokenType::QuestionMark;
    case '@':
        _advance(lexer);
        return TokenType::At;
    case '$':
        {
            // `$` or `$$`
            _advance(lexer);
            if (_peek(lexer) == '$')
            {
                _advance(lexer);
                return TokenType::DollarDollar;
            }
            return TokenType::Dollar;
        }
    }

    // We treat all unicode characters as a part of an identifier.
    if (isNonAsciiCodePoint(nextCodePoint))
    {
        _lexIdentifier(lexer);
        return TokenType::Identifier;
    }

    {
        // If none of the above cases matched, then we have an
        // unexpected/invalid character.
        //
        // The character is consumed even when there is no sink to report to,
        // so the lexer moves past it.

        auto loc = _getSourceLoc(lexer);
        int c = _advance(lexer);

        if (auto sink = lexer->getDiagnosticSink())
        {
            // Printable ASCII is shown as the character itself.
            if (c >= 0x20 && c <= 0x7E)
            {
                char buffer[] = {(char)c, 0};
                sink->diagnose(loc, LexerDiagnostics::illegalCharacterPrint, buffer);
            }
            // NOTE(review): `kEOF` from `_peek` already returns `EndOfFile` in the
            // switch above, so this branch looks reachable only if `_advance` can
            // report `kEOF` where `_peek` did not; confirm before relying on it.
            else if (c == kEOF)
            {
                sink->diagnose(loc, LexerDiagnostics::unexpectedEndOfInput);
            }
            else
            {
                // Fallback: print as hexadecimal
                sink->diagnose(
                    loc,
                    LexerDiagnostics::illegalCharacterHex,
                    String((unsigned char)c, 16));
            }
        }

        return TokenType::Invalid;
    }
}

/// Lex one token from the current position and return it fully populated.
///
/// The returned token carries:
///  - the source location at which lexing started,
///  - the type reported by `_lexTokenImpl`,
///  - the lexer flags in effect once `_lexTokenImpl` returned,
///  - its text (left unset when no characters were consumed), and
///  - a `Name` from the name pool for identifier and completion-request
///    tokens, when the lexer has a name pool.
///
/// As a side effect, `m_tokenFlags` is updated to describe the position of
/// the *following* token.
Token Lexer::lexToken()
{
    Token result;
    result.loc = _getSourceLoc(this);

    // The token text is whatever lies between the cursor positions before
    // and after the call to `_lexTokenImpl`.
    char const* const rawBegin = m_cursor;

    const auto lexedType = _lexTokenImpl(this);

    // This token is stamped with the lexer state as it is right now...
    const auto flagsForThisToken = m_tokenFlags;

    // ...and the lexer state is then adjusted for whatever comes next.
    if (lexedType == TokenType::NewLine)
    {
        // After a line break the next token starts a line, and is also
        // considered to follow whitespace.
        m_tokenFlags = TokenFlag::AtStartOfLine | TokenFlag::AfterWhitespace;
    }
    else if (
        lexedType == TokenType::WhiteSpace || lexedType == TokenType::BlockComment ||
        lexedType == TokenType::LineComment)
    {
        // Horizontal whitespace and comments both count as whitespace.
        // A line comment stops short of its terminating newline, so there
        // is no need to set `AtStartOfLine` here; existing flags are kept.
        m_tokenFlags |= TokenFlag::AfterWhitespace;
    }
    else
    {
        // Any other token means the next one is neither at the start of a
        // line nor preceded by whitespace.
        m_tokenFlags = 0;
    }

    result.type = lexedType;
    result.flags = flagsForThisToken;

    char const* const rawEnd = m_cursor;

    // Historical note (tfoley): `StringBuilder::Append()` seemed to crash when
    // appending zero bytes, so tokens with no text never have content set.
    if (rawEnd != rawBegin)
    {
        if (flagsForThisToken & TokenFlag::ScrubbingNeeded)
        {
            // The raw text has to be "scrubbed": every backslash that is
            // directly followed by a line break is dropped together with
            // that line break. The result can only be shorter than the raw
            // text, so a buffer of the raw length is always large enough.
            char* const scrubbed = (char*)m_memoryArena->allocateUnaligned(rawEnd - rawBegin);
            char* out = scrubbed;

            char const* in = rawBegin;
            while (in != rawEnd)
            {
                const char ch = *in++;
                if (ch == '\\')
                {
                    const char lineBreak = *in;
                    if (lineBreak == '\r' || lineBreak == '\n')
                    {
                        // Skip the line break itself...
                        ++in;

                        // ...and, for a two-character break (`\r\n` or
                        // `\n\r`), its second half as well.
                        const char follower = *in;
                        const bool completesPair =
                            (lineBreak == '\r') ? (follower == '\n') : (follower == '\r');
                        if (completesPair)
                        {
                            ++in;
                        }

                        // Neither the backslash nor the break is copied.
                        continue;
                    }
                }
                *out++ = ch;
            }
            result.setContent(UnownedStringSlice(scrubbed, out));
        }
        else
        {
            // No scrubbing required: refer to the source text directly.
            result.setContent(UnownedStringSlice(rawBegin, rawEnd));
        }
    }

    // Identifier-like tokens get a `Name` when a name pool is available.
    const bool wantsName =
        (lexedType == TokenType::Identifier) || (lexedType == TokenType::CompletionRequest);
    if (m_namePool && wantsName)
    {
        result.setName(m_namePool->getName(result.getContent()));
    }

    return result;
}

/// Lex the rest of the input, keeping only semantically significant tokens.
///
/// Whitespace, newline, block-comment and line-comment tokens are dropped.
/// The returned list always ends with the end-of-file token.
TokenList Lexer::lexAllSemanticTokens()
{
    TokenList semanticTokens;
    for (;;)
    {
        Token current = lexToken();
        const auto kind = current.type;

        // Whitespace in any form and comments carry no meaning for the
        // consumers of this list, so they are skipped.
        const bool isTrivia = kind == TokenType::WhiteSpace || kind == TokenType::BlockComment ||
                              kind == TokenType::LineComment || kind == TokenType::NewLine;
        if (isTrivia)
        {
            continue;
        }

        semanticTokens.add(current);

        // The end-of-file token is recorded too, and terminates the list.
        if (kind == TokenType::EndOfFile)
        {
            break;
        }
    }
    return semanticTokens;
}

/// Lex the rest of the input, keeping comments but not whitespace.
///
/// Only whitespace and newline tokens are dropped; block and line comments
/// stay in the list. The returned list always ends with the end-of-file token.
TokenList Lexer::lexAllMarkupTokens()
{
    TokenList markupTokens;
    for (;;)
    {
        Token current = lexToken();
        const auto kind = current.type;

        // Unlike the semantic token list, comments are retained here.
        const bool isBlankSpace = kind == TokenType::WhiteSpace || kind == TokenType::NewLine;
        if (isBlankSpace)
        {
            continue;
        }

        markupTokens.add(current);

        // The end-of-file token is recorded too, and terminates the list.
        if (kind == TokenType::EndOfFile)
        {
            break;
        }
    }
    return markupTokens;
}

/// Lex the first token of `in` and return the part of `in` that it covers.
///
/// A throw-away source manager, diagnostic sink, memory arena and name pool
/// are created just for this call. An empty slice is returned when the first
/// token is `TokenType::Invalid`; otherwise the result is a sub-slice of `in`
/// located by the token's offset and `charsCount`.
/* static */ UnownedStringSlice Lexer::sourceLocationLexer(const UnownedStringSlice& in)
{
    Lexer lexer;

    // Minimal stand-alone environment for the lexer to run in.
    SourceManager sourceManager;
    sourceManager.initialize(nullptr, nullptr);

    auto sourceFile = sourceManager.createSourceFileWithString(PathInfo::makeUnknown(), in);
    auto sourceView = sourceManager.createSourceView(sourceFile, nullptr, SourceLoc::fromRaw(0));

    DiagnosticSink sink(&sourceManager, nullptr);

    MemoryArena arena;

    RootNamePool rootNamePool;
    NamePool namePool;
    namePool.setRootNamePool(&rootNamePool);

    lexer.initialize(sourceView, &sink, &namePool, &arena);

    // Only the very first token is of interest.
    const Token firstToken = lexer.lexToken();

    if (firstToken.type != TokenType::Invalid)
    {
        // Translate the token's location into an offset within the view,
        // which is used as an offset into `in`.
        const int startOffset = sourceView->getRange().getOffset(firstToken.loc);

        SLANG_ASSERT(startOffset >= 0 && startOffset <= in.getLength());
        SLANG_ASSERT(Index(startOffset + firstToken.charsCount) <= in.getLength());

        const char* const sliceBegin = in.begin() + startOffset;
        return UnownedStringSlice(sliceBegin, sliceBegin + firstToken.charsCount);
    }

    // The input did not start with a valid token.
    return UnownedStringSlice();
}

} // namespace Slang