Skip to content

[TableGen] Remove explicit recursion in LexToken #143697

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
267 changes: 156 additions & 111 deletions llvm/lib/TableGen/TGLexer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,129 +174,174 @@ int TGLexer::peekNextChar(int Index) const {
}

/// Lex and return the next token from the current buffer.
///
/// Whitespace, newlines, comments and the end of an include file do not
/// produce a token. They are skipped by going around the loop again rather
/// than by calling LexToken() recursively, so a long run of blank or comment
/// text cannot grow the call stack.
///
/// \param FileOrLineStart true when the lexer is positioned at the start of a
///        file or line. Only in that state is a '#' checked for being a
///        preprocessing directive; otherwise it lexes as tgtok::paste. The
///        flag is updated locally as newlines and include-file ends are
///        consumed.
/// \returns the kind of the token that was lexed, tgtok::Eof once
///          processEOF() reports there is nothing to resume, or the error
///          result of ReturnError() / tgtok::Error on malformed input.
tgtok::TokKind TGLexer::LexToken(bool FileOrLineStart) {
  while (true) {
    TokStart = CurPtr;
    // This always consumes at least one character.
    int CurChar = getNextChar();

    switch (CurChar) {
    default:
      // Handle letters: [a-zA-Z_]
      if (isValidIDChar(CurChar, /*First=*/true))
        return LexIdentifier();

      // Unknown character, emit an error.
      return ReturnError(TokStart, "unexpected character");
    case EOF:
      // Lex next token, if we just left an include file.
      if (processEOF()) {
        // Leaving an include file means that the next symbol is located at the
        // end of the 'include "..."' construct.
        FileOrLineStart = false;
        break;
      }

      // Return EOF denoting the end of lexing.
      return tgtok::Eof;

    case ':':
      return tgtok::colon;
    case ';':
      return tgtok::semi;
    case ',':
      return tgtok::comma;
    case '<':
      return tgtok::less;
    case '>':
      return tgtok::greater;
    case ']':
      return tgtok::r_square;
    case '{':
      return tgtok::l_brace;
    case '}':
      return tgtok::r_brace;
    case '(':
      return tgtok::l_paren;
    case ')':
      return tgtok::r_paren;
    case '=':
      return tgtok::equal;
    case '?':
      return tgtok::question;
    case '#':
      // A '#' is only considered as a directive at the start of a file/line.
      if (FileOrLineStart) {
        tgtok::TokKind Kind = prepIsDirective();
        if (Kind != tgtok::Error)
          return lexPreprocessor(Kind);
      }

      return tgtok::paste;

    // The period is a separate case so we can recognize the "..."
    // range punctuator.
    case '.':
      if (peekNextChar(0) == '.') {
        ++CurPtr; // Eat second dot.
        if (peekNextChar(0) == '.') {
          ++CurPtr; // Eat third dot.
          return tgtok::dotdotdot;
        }
        return ReturnError(TokStart, "invalid '..' punctuation");
      }
      return tgtok::dot;

    case '\r':
      llvm_unreachable("getNextChar() must never return '\r'");

    case ' ':
    case '\t':
      // Ignore whitespace. FileOrLineStart is left as it was.
      break;
    case '\n':
      // Ignore whitespace, and identify the new line.
      FileOrLineStart = true;
      break;
    case '/':
      // If this is the start of a // comment, skip until the end of the line or
      // the end of the buffer.
      if (*CurPtr == '/')
        SkipBCPLComment();
      else if (*CurPtr == '*') {
        if (SkipCComment())
          return tgtok::Error;
      } else // Otherwise, this is an error.
        return ReturnError(TokStart, "unexpected character");
      // Comment skipped; go around the loop for the next token.
      break;
    case '-':
    case '+':
    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9': {
      int NextChar = 0;
      if (isDigit(CurChar)) {
        // Allow identifiers to start with a number if it is followed by
        // an identifier. This can happen with paste operations like
        // foo#8i.
        int i = 0;
        do {
          NextChar = peekNextChar(i++);
        } while (isDigit(NextChar));

        if (NextChar == 'x' || NextChar == 'b') {
          // If this is [0-9]b[01] or [0-9]x[0-9A-Fa-f] this is most
          // likely a number.
          int NextNextChar = peekNextChar(i);
          // Note: a 'break' in this inner switch only leaves the inner switch.
          switch (NextNextChar) {
          default:
            break;
          case '0':
          case '1':
            if (NextChar == 'b')
              return LexNumber();
            [[fallthrough]];
          case '2':
          case '3':
          case '4':
          case '5':
          case '6':
          case '7':
          case '8':
          case '9':
          case 'a':
          case 'b':
          case 'c':
          case 'd':
          case 'e':
          case 'f':
          case 'A':
          case 'B':
          case 'C':
          case 'D':
          case 'E':
          case 'F':
            if (NextChar == 'x')
              return LexNumber();
            break;
          }
        }
      }

      if (isValidIDChar(NextChar, /*First=*/true))
        return LexIdentifier();

      return LexNumber();
    }
    case '"':
      return LexString();
    case '$':
      return LexVarName();
    case '[':
      return LexBracket();
    case '!':
      return LexExclaim();
    }
  }
}

Expand Down