// Copyright Epic Games, Inc. All Rights Reserved.

using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace UnrealBuildTool
{
	/// <summary>
	/// Tokenizer for C++ source files. Provides functions for navigating a source file skipping whitespace, comments, and so on when required.
	/// </summary>
	class TokenReader : IEnumerator<Token>
	{
		/// <summary>
		/// Flags used to classify the first character in a token
		/// </summary>
		enum FirstCharacterClass : byte
		{
			Unknown,
			Terminator,
			Whitespace,
			Newline,
			Identifier,
			Number,
			Character,
			String,
			Dot,
			QuestionMark,
			Colon,
			ExclamationMark,
			Pipe,
			Ampersand,
			Caret,
			Equals,
			LeftTriangleBracket,
			RightTriangleBracket,
			Plus,
			Minus,
			Star,
			Slash,
			PercentSign,
			Tilde,
			LeftParen,
			RightParen,
			Hash,
			Comma,
		}

		/// <summary>
		/// Flags used to classify subsequent characters in a token
		/// </summary>
		[Flags]
		enum CharacterFlags : byte
		{
			Identifier = 1,
			Digit = 2,
			NumberTail = 4,
			Whitespace = 8,
		}

		/// <summary>
		/// Map of utf-8 leading bytes to their class. Populated once by the static constructor.
		/// </summary>
		static readonly FirstCharacterClass[] FirstCharacters;

		/// <summary>
		/// Array of flags for different leading utf-8 sequences. Populated once by the static constructor.
		/// </summary>
		static readonly CharacterFlags[] Characters;

		/// <summary>
		/// The current buffer being read from. Encoded as UTF-8 with a null terminator.
		/// </summary>
		readonly byte[] Data;

		/// <summary>
		/// Current offset within the buffer
		/// </summary>
		int Offset;

		/// <summary>
		/// The current token
		/// </summary>
		Token? CurrentToken;

		/// <summary>
		/// Constructor
		/// </summary>
		/// <param name="Data">The raw byte stream to read from, encoded as UTF-8. Must end with a null byte.</param>
		public TokenReader(byte[] Data)
		{
			this.Data = Data;
			this.LineNumber = 0;
			this.LineNumberAfterToken = 1;

			// Make sure the input data has a null terminator
			if(Data.Length == 0 || Data[Data.Length - 1] != 0)
			{
				throw new ArgumentException("Data parameter must be null terminated.");
			}

			// If the data contains a UTF-8 BOM, skip over it. The null terminator guarantees that each
			// successive index is only read when the previous byte was non-zero, so this cannot overrun.
			if(Data[0] == 0xef && Data[1] == 0xbb && Data[2] == 0xbf)
			{
				Offset = 3;
			}
		}

		/// <summary>
		/// Constructor
		/// </summary>
		/// <param name="Text">The text to read from</param>
		public TokenReader(string Text)
			: this(GetNullTerminatedByteArray(Text))
		{
		}

		/// <summary>
		/// Copy constructor. The new reader shares the underlying buffer and starts at the same position,
		/// line numbers and current token as the reader being copied.
		/// </summary>
		/// <param name="Other">Token reader to copy from</param>
		public TokenReader(TokenReader Other)
		{
			this.Data = Other.Data;
			this.Offset = Other.Offset;
			this.LineNumber = Other.LineNumber;
			this.LineNumberAfterToken = Other.LineNumberAfterToken;

			// Carry the current token across as well, so that Current on the copy matches the source
			// reader instead of being null until the next MoveNext() call.
			this.CurrentToken = Other.CurrentToken;
		}

		/// <summary>
		/// Gets a null terminated byte array from the given string
		/// </summary>
		/// <param name="Text">String to convert into bytes</param>
		/// <returns>Array of bytes</returns>
		public static byte[] GetNullTerminatedByteArray(string Text)
		{
			// Allocate one extra byte; it is left as zero and acts as the terminator
			byte[] Bytes = new byte[Encoding.UTF8.GetByteCount(Text) + 1];
			Encoding.UTF8.GetBytes(Text, 0, Text.Length, Bytes, 0);
			return Bytes;
		}

		/// <summary>
		/// Tokenize an input string
		/// </summary>
		/// <param name="Text">Text to tokenize</param>
		/// <returns>List of tokens parsed from the text</returns>
		public static List<Token> Tokenize(string Text)
		{
			List<Token> Tokens = new List<Token>();

			TokenReader Reader = new TokenReader(Text);
			while(Reader.MoveNext())
			{
				Tokens.Add(Reader.Current);
			}

			return Tokens;
		}

		/// <summary>
		/// Current line number (one-based)
		/// </summary>
		public int LineNumber
		{
			get;
			private set;
		}

		/// <summary>
		/// Line number at the end of this token (one-based)
		/// </summary>
		public int LineNumberAfterToken
		{
			get;
			private set;
		}

		/// <summary>
		/// Returns the current token
		/// </summary>
		public Token Current
		{
			get { return CurrentToken!; }
		}

		/// <summary>
		/// Untyped implementation of Current for IEnumerator.
		/// </summary>
		object IEnumerator.Current
		{
			get { return Current; }
		}

		/// <summary>
		/// Override of IEnumerator.Dispose. Not required.
		/// </summary>
		void IDisposable.Dispose()
		{
		}

		/// <summary>
		/// Move to the next token
		/// </summary>
		/// <returns>True if the reader could move to the next token, false otherwise</returns>
		public bool MoveNext()
		{
			// Skip past the leading whitespace
			TokenFlags Flags = SkipWhitespace();

			// Update the line number to the one after the current token
			LineNumber = LineNumberAfterToken;

			// Capture the start of the token
			int StartOffset = Offset;

			// Initialize the flags for the next token
			byte FirstByte = Data[Offset++];
			switch(FirstCharacters[FirstByte])
			{
				case FirstCharacterClass.Unknown:
					// For a non-ASCII lead byte, consume every following byte that also has the high bit set
					if((FirstByte & 0x80) != 0)
					{
						while((Data[Offset] & 0x80) != 0)
						{
							Offset++;
						}
					}
					CurrentToken = CreateLiteral(TokenType.Unknown, Flags, StartOffset, Offset - StartOffset);
					return true;
				case FirstCharacterClass.Terminator:
					// Stay on the terminator so that further calls keep reporting the end of the stream
					Offset = StartOffset;
					CurrentToken = new Token(TokenType.End, Flags);
					return false;
				case FirstCharacterClass.Newline:
					LineNumberAfterToken++;
					CurrentToken = new Token(TokenType.Newline, Flags);
					return true;
				case FirstCharacterClass.Identifier:
					// Identifier (or text literal with prefix)
					while((Characters[Data[Offset]] & CharacterFlags.Identifier) != 0)
					{
						Offset++;
					}

					// Check if it's a prefixed text literal
					if(Data[Offset] == '\'')
					{
						Offset++;
						SkipTextLiteral('\'');
						CurrentToken = CreateLiteral(TokenType.Character, Flags, StartOffset, Offset - StartOffset);
						return true;
					}
					else if(Data[Offset] == '\"')
					{
						Offset++;
						SkipTextLiteral('\"');
						CurrentToken = CreateLiteral(TokenType.String, Flags, StartOffset, Offset - StartOffset);
						return true;
					}
					else
					{
						string Name = Encoding.UTF8.GetString(Data, StartOffset, Offset - StartOffset);
						Identifier Identifier = Identifier.FindOrAdd(Name);
						CurrentToken = new Token(Identifier, Flags);
						return true;
					}
				case FirstCharacterClass.Number:
					// Numeric literal
					SkipNumericLiteral();
					CurrentToken = CreateLiteral(TokenType.Number, Flags, StartOffset, Offset - StartOffset);
					return true;
				case FirstCharacterClass.Character:
					// Character literal
					SkipTextLiteral('\'');
					CurrentToken = CreateLiteral(TokenType.Character, Flags, StartOffset, Offset - StartOffset);
					return true;
				case FirstCharacterClass.String:
					// String literal
					SkipTextLiteral('\"');
					CurrentToken = CreateLiteral(TokenType.String, Flags, StartOffset, Offset - StartOffset);
					return true;
				case FirstCharacterClass.Dot:
					// Numeric literal, ellipsis, or dot
					if((Characters[Data[Offset]] & CharacterFlags.Digit) != 0)
					{
						Offset++;
						SkipNumericLiteral();
						CurrentToken = CreateLiteral(TokenType.Number, Flags, StartOffset, Offset - StartOffset);
						return true;
					}
					else if(Data[Offset] == '.' && Data[Offset + 1] == '.')
					{
						Offset += 2;
						CurrentToken = new Token(TokenType.Ellipsis, Flags);
						return true;
					}
					else
					{
						CurrentToken = new Token(TokenType.Dot, Flags);
						return true;
					}
				case FirstCharacterClass.QuestionMark:
					CurrentToken = new Token(TokenType.QuestionMark, Flags);
					return true;
				case FirstCharacterClass.Colon:
					CurrentToken = new Token(TokenType.Colon, Flags);
					return true;
				case FirstCharacterClass.ExclamationMark:
					if(ReadCharacter('='))
					{
						CurrentToken = new Token(TokenType.CompareNotEqual, Flags);
					}
					else
					{
						CurrentToken = new Token(TokenType.LogicalNot, Flags);
					}
					return true;
				case FirstCharacterClass.Pipe:
					if(ReadCharacter('|'))
					{
						CurrentToken = new Token(TokenType.LogicalOr, Flags);
					}
					else
					{
						CurrentToken = new Token(TokenType.BitwiseOr, Flags);
					}
					return true;
				case FirstCharacterClass.Ampersand:
					if(ReadCharacter('&'))
					{
						CurrentToken = new Token(TokenType.LogicalAnd, Flags);
					}
					else
					{
						CurrentToken = new Token(TokenType.BitwiseAnd, Flags);
					}
					return true;
				case FirstCharacterClass.Caret:
					CurrentToken = new Token(TokenType.BitwiseXor, Flags);
					return true;
				case FirstCharacterClass.Equals:
					if(ReadCharacter('='))
					{
						CurrentToken = new Token(TokenType.CompareEqual, Flags);
					}
					else
					{
						CurrentToken = new Token(TokenType.Equals, Flags);
					}
					return true;
				case FirstCharacterClass.LeftTriangleBracket:
					if(ReadCharacter('<'))
					{
						CurrentToken = new Token(TokenType.LeftShift, Flags);
					}
					else if(ReadCharacter('='))
					{
						CurrentToken = new Token(TokenType.CompareLessOrEqual, Flags);
					}
					else
					{
						CurrentToken = new Token(TokenType.CompareLess, Flags);
					}
					return true;
				case FirstCharacterClass.RightTriangleBracket:
					if(ReadCharacter('>'))
					{
						CurrentToken = new Token(TokenType.RightShift, Flags);
					}
					else if(ReadCharacter('='))
					{
						CurrentToken = new Token(TokenType.CompareGreaterOrEqual, Flags);
					}
					else
					{
						CurrentToken = new Token(TokenType.CompareGreater, Flags);
					}
					return true;
				case FirstCharacterClass.Plus:
					CurrentToken = new Token(TokenType.Plus, Flags);
					return true;
				case FirstCharacterClass.Minus:
					CurrentToken = new Token(TokenType.Minus, Flags);
					return true;
				case FirstCharacterClass.Star:
					CurrentToken = new Token(TokenType.Multiply, Flags);
					return true;
				case FirstCharacterClass.Slash:
					CurrentToken = new Token(TokenType.Divide, Flags);
					return true;
				case FirstCharacterClass.PercentSign:
					CurrentToken = new Token(TokenType.Modulo, Flags);
					return true;
				case FirstCharacterClass.Tilde:
					CurrentToken = new Token(TokenType.BitwiseNot, Flags);
					return true;
				case FirstCharacterClass.LeftParen:
					CurrentToken = new Token(TokenType.LeftParen, Flags);
					return true;
				case FirstCharacterClass.RightParen:
					CurrentToken = new Token(TokenType.RightParen, Flags);
					return true;
				case FirstCharacterClass.Hash:
					if(ReadCharacter('#'))
					{
						CurrentToken = new Token(TokenType.HashHash, Flags);
					}
					else
					{
						CurrentToken = new Token(TokenType.Hash, Flags);
					}
					return true;
				case FirstCharacterClass.Comma:
					CurrentToken = new Token(TokenType.Comma, Flags);
					return true;
				default:
					throw new NotImplementedException();
			}
		}

		/// <summary>
		/// Move to the next token, parsing a single literal token string until the end of the line (in the form used by error directives).
		/// Leading whitespace and comments are skipped; the resulting token is always created with TokenFlags.None.
		/// </summary>
		/// <returns>True if a token was parsed</returns>
		public bool MoveNextTokenString()
		{
			// Only the side effects (offset and line tracking) of skipping whitespace are needed here
			SkipWhitespace();

			int StartOffset = Offset;
			while(Data[Offset] != 0 && Data[Offset] != '\n')
			{
				Offset++;
			}

			CurrentToken = CreateLiteral(TokenType.StringOfTokens, TokenFlags.None, StartOffset, Offset - StartOffset);
			return true;
		}

		/// <summary>
		/// Move to the next token, allowing include directives in triangle brackets in the style permitted by system include directives
		/// </summary>
		/// <returns>True if a token was read</returns>
		public bool MoveNextIncludePath()
		{
			bool bResult = MoveNext();
			if(bResult && Current.Type == TokenType.CompareLess)
			{
				// The '<' has just been consumed; look for the matching '>' on the same line. If there
				// isn't one, the plain CompareLess token is left as the current token.
				int StartOffset = Offset - 1;
				for(int EndOffset = Offset; Data[EndOffset] != 0 && Data[EndOffset] != '\n'; EndOffset++)
				{
					if(Data[EndOffset] == '>')
					{
						CurrentToken = CreateLiteral(TokenType.SystemInclude, TokenFlags.None, StartOffset, EndOffset + 1 - StartOffset);
						Offset = EndOffset + 1;
						break;
					}
				}
			}
			return bResult;
		}

		/// <summary>
		/// Creates a literal token
		/// </summary>
		/// <param name="Type">Type of token to create</param>
		/// <param name="Flags">Flags for the token</param>
		/// <param name="Offset">Offset of the literal within the source data stream</param>
		/// <param name="Length">Length of the literal</param>
		/// <returns>New token for the literal</returns>
		private Token CreateLiteral(TokenType Type, TokenFlags Flags, int Offset, int Length)
		{
			byte[] Literal = new byte[Length];
			Array.Copy(Data, Offset, Literal, 0, Length);
			return new Token(Type, Flags, Literal);
		}

		/// <summary>
		/// Scan ahead until we reach the next directive (a hash token after a newline). Assumes that the current line does not contain a directive.
		/// Line comments and block comments are skipped; string and character literals are not tracked.
		/// </summary>
		/// <returns>True if we were able to find another token</returns>
		public bool MoveToNextDirective()
		{
			// Scan lines until we reach a directive
			for(;;)
			{
				// Move to the next newline
				while(Data[Offset] != '\n')
				{
					if(Data[Offset] == 0)
					{
						return false;
					}

					// Backslash-newline joins the next physical line onto this one
					if(ReadLineContinuation())
					{
						continue;
					}

					if(Data[Offset] == '/' && Data[Offset + 1] == '/')
					{
						// Line comment. Consume it to the end of the (possibly continued) line so that a "/*"
						// inside the comment text is not mistaken for the start of a block comment.
						Offset += 2;
						while(Data[Offset] != 0 && Data[Offset] != '\n')
						{
							if(!ReadLineContinuation())
							{
								Offset++;
							}
						}
						continue;
					}

					if(Data[Offset] == '/' && Data[Offset + 1] == '*')
					{
						// Block comment. Keep the line count in sync while skipping over it.
						Offset += 2;
						for(; Data[Offset] != 0; Offset++)
						{
							if(Data[Offset] == '\n')
							{
								LineNumberAfterToken++;
								continue;
							}
							if(Data[Offset] == '*' && Data[Offset + 1] == '/')
							{
								Offset += 2;
								break;
							}
						}
						continue;
					}

					Offset++;
				}

				// Move past the newline
				Offset++;
				LineNumberAfterToken++;

				// Skip any horizontal whitespace
				TokenFlags Flags = SkipWhitespace();

				// Check if this is a line marker
				if(Data[Offset] == '#' && Data[Offset + 1] != '#')
				{
					Offset++;
					LineNumber = LineNumberAfterToken;
					CurrentToken = new Token(TokenType.Hash, Flags);
					return true;
				}
			}
		}

		/// <summary>
		/// Definition of IEnumerator.Reset(). Not supported.
		/// </summary>
		void IEnumerator.Reset()
		{
			throw new NotSupportedException();
		}

		/// <summary>
		/// Attempts to read a given character from the stream
		/// </summary>
		/// <param name="Character">Character to read</param>
		/// <returns>True if the character was read (and the current position was updated)</returns>
		bool ReadCharacter(char Character)
		{
			if(Data[Offset] == Character)
			{
				Offset++;
				return true;
			}
			return false;
		}

		/// <summary>
		/// Attempts to consume a line continuation (a backslash followed by LF or CRLF) at the current position
		/// </summary>
		/// <returns>True if a continuation was consumed (the position and LineNumberAfterToken were updated)</returns>
		bool ReadLineContinuation()
		{
			// The null terminator guarantees each successive index is valid whenever the previous byte matched
			if(Data[Offset] == '\\')
			{
				if(Data[Offset + 1] == '\r' && Data[Offset + 2] == '\n')
				{
					LineNumberAfterToken++;
					Offset += 3;
					return true;
				}
				if(Data[Offset + 1] == '\n')
				{
					LineNumberAfterToken++;
					Offset += 2;
					return true;
				}
			}
			return false;
		}

		/// <summary>
		/// Advances the given position past any horizontal whitespace or comments
		/// </summary>
		/// <returns>Flags for the following token</returns>
		TokenFlags SkipWhitespace()
		{
			TokenFlags Flags = TokenFlags.None;
			for(;;)
			{
				// Quickly skip over trivial whitespace
				while((Characters[Data[Offset]] & CharacterFlags.Whitespace) != 0)
				{
					Offset++;
					Flags |= TokenFlags.HasLeadingSpace;
				}

				// Look at what's next
				char Character = (char)Data[Offset];
				if (Character == '\\')
				{
					// Line continuation; anything else after a backslash is left for the caller
					byte NextCharacter = Data[Offset + 1];
					if(NextCharacter == '\r' && Data[Offset + 2] == '\n')
					{
						LineNumberAfterToken++;
						Offset += 3;
					}
					else if (NextCharacter == '\n')
					{
						LineNumberAfterToken++;
						Offset += 2;
					}
					else
					{
						break;
					}
				}
				else if (Character == '/')
				{
					byte NextCharacter = Data[Offset + 1];
					if (NextCharacter == '/')
					{
						// Line comment: stop on the newline (or terminator) without consuming it
						Offset += 2;
						for(; Data[Offset] != 0; Offset++)
						{
							if(Data[Offset] == '\n')
							{
								break;
							}
						}
						Flags |= TokenFlags.HasLeadingSpace;
					}
					else if (NextCharacter == '*')
					{
						// Block comment: count the newlines inside it
						Offset += 2;
						for(; Data[Offset] != 0; Offset++)
						{
							if(Data[Offset] == '\n')
							{
								LineNumberAfterToken++;
								continue;
							}
							if(Data[Offset] == '*' && Data[Offset + 1] == '/')
							{
								Offset += 2;
								break;
							}
						}
						Flags |= TokenFlags.HasLeadingSpace;
					}
					else
					{
						break;
					}
				}
				else
				{
					// Carriage returns are dropped without setting the leading-space flag
					if(Character == '\r')
					{
						Offset++;
					}
					else
					{
						break;
					}
				}
			}
			return Flags;
		}

		/// <summary>
		/// Skip past a text literal (a quoted character literal or string literal)
		/// </summary>
		/// <param name="LastCharacter">The terminating character to look for, ignoring escape sequences</param>
		void SkipTextLiteral(char LastCharacter)
		{
			for(;;)
			{
				char Character = (char)Data[Offset];
				if(Character == '\0')
				{
					throw new Exception("Unexpected end of file in text literal");
				}

				Offset++;

				// A backslash escapes the following byte, unless that byte is the terminator
				if(Character == '\\' && Data[Offset] != 0)
				{
					Offset++;
				}
				else if(Character == LastCharacter)
				{
					break;
				}
			}
		}

		/// <summary>
		/// Skips over a numeric literal after the initial digit or dot/digit pair. Follows the shape of a C++
		/// preprocessing number: digits, letters, underscores and dots, a sign directly after an exponent
		/// marker (e, E, p or P), and single-quote digit separators.
		/// </summary>
		void SkipNumericLiteral()
		{
			for (;;)
			{
				while((Characters[Data[Offset]] & CharacterFlags.NumberTail) != 0)
				{
					Offset++;
				}

				byte NextCharacter = Data[Offset];
				if((NextCharacter == '+' || NextCharacter == '-') && IsExponentMarker(Data[Offset - 1]))
				{
					// Signed exponent, eg. 1e+5 or 0x1p-3
					Offset++;
				}
				else if(NextCharacter == '\'' && (Characters[Data[Offset + 1]] & CharacterFlags.Identifier) != 0)
				{
					// Digit separator, eg. 1'000'000. The quote only belongs to the number when it is directly
					// followed by a digit or identifier character; otherwise it starts a character literal.
					Offset += 2;
				}
				else
				{
					break;
				}
			}
		}

		/// <summary>
		/// Checks whether a byte is one of the characters that may precede a sign inside a numeric literal
		/// </summary>
		/// <param name="Character">The byte to test</param>
		/// <returns>True for e, E (decimal exponent) and p, P (hexadecimal floating point exponent)</returns>
		static bool IsExponentMarker(byte Character)
		{
			return Character == 'e' || Character == 'E' || Character == 'p' || Character == 'P';
		}

		/// <summary>
		/// Static constructor. Initializes the lookup tables used by the lexer.
		/// </summary>
		static TokenReader()
		{
			FirstCharacters = new FirstCharacterClass[256];

			// End of file
			FirstCharacters[0] = FirstCharacterClass.Terminator;

			// Horizontal whitespace
			FirstCharacters[' '] = FirstCharacterClass.Whitespace;
			FirstCharacters['\t'] = FirstCharacterClass.Whitespace;
			FirstCharacters['\v'] = FirstCharacterClass.Whitespace;
			FirstCharacters['\r'] = FirstCharacterClass.Whitespace;

			// Newline
			FirstCharacters['\n'] = FirstCharacterClass.Newline;

			// Identifiers
			FirstCharacters['_'] = FirstCharacterClass.Identifier;
			for(int Idx = 'a'; Idx <= 'z'; Idx++)
			{
				FirstCharacters[Idx] = FirstCharacterClass.Identifier;
			}
			for(int Idx = 'A'; Idx <= 'Z'; Idx++)
			{
				FirstCharacters[Idx] = FirstCharacterClass.Identifier;
			}

			// Numeric literals
			for(int Idx = '0'; Idx <= '9'; Idx++)
			{
				FirstCharacters[Idx] = FirstCharacterClass.Number;
			}

			// Character literals
			FirstCharacters['\''] = FirstCharacterClass.Character;

			// String literals
			FirstCharacters['\"'] = FirstCharacterClass.String;

			// Other symbols
			FirstCharacters['.'] = FirstCharacterClass.Dot;
			FirstCharacters['?'] = FirstCharacterClass.QuestionMark;
			FirstCharacters[':'] = FirstCharacterClass.Colon;
			FirstCharacters['!'] = FirstCharacterClass.ExclamationMark;
			FirstCharacters['|'] = FirstCharacterClass.Pipe;
			FirstCharacters['&'] = FirstCharacterClass.Ampersand;
			FirstCharacters['^'] = FirstCharacterClass.Caret;
			FirstCharacters['='] = FirstCharacterClass.Equals;
			FirstCharacters['<'] = FirstCharacterClass.LeftTriangleBracket;
			FirstCharacters['>'] = FirstCharacterClass.RightTriangleBracket;
			FirstCharacters['+'] = FirstCharacterClass.Plus;
			FirstCharacters['-'] = FirstCharacterClass.Minus;
			FirstCharacters['*'] = FirstCharacterClass.Star;
			FirstCharacters['/'] = FirstCharacterClass.Slash;
			FirstCharacters['%'] = FirstCharacterClass.PercentSign;
			FirstCharacters['~'] = FirstCharacterClass.Tilde;
			FirstCharacters['('] = FirstCharacterClass.LeftParen;
			FirstCharacters[')'] = FirstCharacterClass.RightParen;
			FirstCharacters['#'] = FirstCharacterClass.Hash;
			FirstCharacters[','] = FirstCharacterClass.Comma;

			// Flags for secondary characters
			Characters = new CharacterFlags[256];

			// Identifiers
			Characters['_'] |= CharacterFlags.Identifier;
			for(int Idx = 'a'; Idx <= 'z'; Idx++)
			{
				Characters[Idx] |= CharacterFlags.Identifier;
			}
			for(int Idx = 'A'; Idx <= 'Z'; Idx++)
			{
				Characters[Idx] |= CharacterFlags.Identifier;
			}
			for(int Idx = '0'; Idx <= '9'; Idx++)
			{
				Characters[Idx] |= CharacterFlags.Identifier;
			}

			// Numbers
			for(int Idx = '0'; Idx <= '9'; Idx++)
			{
				Characters[Idx] |= CharacterFlags.Digit;
			}

			// Preprocessing number tail
			Characters['.'] |= CharacterFlags.NumberTail;
			Characters['_'] |= CharacterFlags.NumberTail;
			for(int Idx = '0'; Idx <= '9'; Idx++)
			{
				Characters[Idx] |= CharacterFlags.NumberTail;
			}
			for(int Idx = 'a'; Idx <= 'z'; Idx++)
			{
				Characters[Idx] |= CharacterFlags.NumberTail;
			}
			for(int Idx = 'A'; Idx <= 'Z'; Idx++)
			{
				Characters[Idx] |= CharacterFlags.NumberTail;
			}

			// Whitespace
			Characters[' '] |= CharacterFlags.Whitespace;
			Characters['\t'] |= CharacterFlags.Whitespace;
			Characters['\v'] |= CharacterFlags.Whitespace;
		}
	}
}