mirror of
https://github.com/danbulant/Cosmos
synced 2026-05-19 20:39:01 +00:00
324 lines
13 KiB
C#
324 lines
13 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.Linq;
|
|
using System.Text;
|
|
|
|
namespace Cosmos.Compiler.XSharp {
|
|
public class Parser {
|
|
/// <summary>Position in <see cref="mData"/> where the next, not yet consumed, token begins.</summary>
protected int mStart;

/// <summary>The text handed to the constructor; this is what gets tokenized.</summary>
protected string mData;

/// <summary>When true, whitespace tokens are added to the token list instead of being
/// dropped.</summary>
protected bool mIncludeWhiteSpace;

/// <summary>Stays true as long as every token produced so far was a whitespace token.</summary>
protected bool mAllWhitespace;

/// <summary>When true, the special pattern names are recognized while tokenizing.</summary>
protected bool mAllowPatterns;

/// <summary>The tokens produced by the parser.</summary>
protected TokenList mTokens;

/// <summary>Gets the token list that was built when this instance was constructed.</summary>
public TokenList Tokens {
  get {
    return mTokens;
  }
}
|
|
|
|
/// <summary>Separator used to split comma-delimited lists.</summary>
protected static readonly char[] mComma = { ',' };

/// <summary>Separator used to split space-delimited lists.</summary>
protected static readonly char[] mSpace = { ' ' };

/// <summary>Keyword list. The entries are upper-cased with the culture-sensitive
/// <c>ToUpper()</c>, so lookups must upper-case their input the same way.</summary>
public static string[] mKeywords =
  "As,All,BYTE,CALL,CONST,DWORD,exit,function,goto,IF,INTERRUPT,iret,namespace,PORT,return,ret,REPEAT,times,var,word,while"
    .ToUpper()
    .Split(mComma);

/// <summary>All register names; filled in by the static constructor.</summary>
public static readonly string[] Registers;

/// <summary>32-bit and index register names; filled in by the static constructor.</summary>
public static readonly string[] RegistersAddr;

/// <summary>8-bit register names.</summary>
public static readonly string[] Registers8 = { "AH", "AL", "BH", "BL", "CH", "CL", "DH", "DL" };

/// <summary>16-bit register names.</summary>
public static readonly string[] Registers16 = { "AX", "BX", "CX", "DX" };

/// <summary>32-bit register names.</summary>
public static readonly string[] Registers32 = { "EAX", "EBX", "ECX", "EDX" };

/// <summary>Index register names.</summary>
public static readonly string[] RegistersIdx = { "ESI", "EDI", "ESP", "EBP" };

/// <summary>Wildcard names that stand for a register when parsing a pattern.</summary>
public static readonly string[] RegisterPatterns = { "_REG", "_REG8", "_REG16", "_REG32", "_REGIDX", "_REGADDR" };

/// <summary>Delimiter tokens; currently only the comma.</summary>
public static readonly string[] Delimiters = { "," };

// The characters _ . $ are treated as AlphaNum (see IsAlphaNum and the comments in Parse).
// # is listed as an operator here; comments and literal assembler lines are detected
// separately from their first character.
// An array of strings rather than chars is used because operators can span several
// characters (!=, >=, ...).
/// <summary>Operator tokens, one entry per operator.</summary>
public static readonly string[] Operators =
  "( ) () ! = != >= <= [ [- ] + - * : { } < > ?= ?& @ ~> <~ >> << ++ -- # +# & | ^".Split(mSpace);
|
|
|
|
/// <summary>Builds the combined register tables from the per-size tables.</summary>
static Parser() {
  // 8-bit, 16-bit and 32-bit registers followed by the index registers, in that order.
  Registers = Registers8
    .Concat(Registers16)
    .Concat(Registers32)
    .Concat(RegistersIdx)
    .ToArray();

  // 32-bit registers followed by the index registers.
  RegistersAddr = Registers32.Concat(RegistersIdx).ToArray();
}
|
|
|
|
/// <summary>Build one token from the text between <see cref="mStart"/> and
/// <paramref name="rPos"/>, classify it and append it to the given token list. When no
/// non-whitespace token has been produced yet and the text starts with '/' or '!', the rest of
/// the line is consumed as a single token.</summary>
/// <param name="aList">The token list that receives the newly recognized token. Whitespace
/// tokens are only added when whitespace propagation was requested.</param>
/// <param name="lineNumber">Line number stored in the token, for diagnostics and debugging.</param>
/// <param name="rPos">Index in the current source line of the first character that does not
/// belong to the token. On return it is updated to account for the characters actually
/// consumed (it is moved to the end of the line for comments and literal assembler).</param>
protected void NewToken(TokenList aList, int lineNumber, ref int rPos) {
  #region Pattern Notes
  // All patterns start with _, this makes them reserved. User can use too, but at own risk of conflict.
  //
  // Wildcards
  // -_REG or ??X
  // -_REG8 or ?H,?L
  // -_REG16 or ?X
  // -_REG32 or E?X
  //     - ? based ones are ugly and less clear
  // -_Keyword
  // -_ABC
  //
  //
  // Multiple Options (All caps only) - Registers only
  // Used to support EAX,EBX - ie lists. But found out it wasn't really needed. May add again later.
  //
  // -AX/AL - Conflict if we ever use /
  // -AX|AL - Conflict if we ever use |
  // -AX,AL - , is unlikely to ever be used as an operator and is logical as a separator. Method calls might use, but likely better to use a space
  //          since we will only allow simple arguments, not compound.
  // -_REG:AX|AL - End terminator issue
  // -_REG[AX|AL] - Conflict with existing indirect access. Is indirect access always numeric? I think x86 has some register based ones too.
  //
  //
  // Specific: Register, Keyword, AlphaNum
  // -EAX
  #endregion

  string xString = null;
  char xChar1 = mData[mStart];
  var xToken = new Token(lineNumber);

  // Recognize comments and literal assembler code. Only possible while nothing but
  // whitespace has been seen on the line.
  if (mAllWhitespace && "/!".Contains(xChar1)) {
    rPos = mData.Length; // This will account for the dummy whitespace at the end.
    xString = mData.Substring(mStart + 1, rPos - mStart - 1).Trim();
    // So ToString/Format wont generate error
    xString = xString.Replace("{", "{{");
    xString = xString.Replace("}", "}}");
    // The second '/' is looked up in mData rather than in xString (issue #15663), and the
    // index is bounds-checked first (issue #15662). Checking the index instead of the
    // trimmed text length lets an empty comment ("//" alone) be accepted as a comment
    // with an empty value instead of being left as an Unknown token.
    if (('/' == xChar1) && (mStart + 1 < mData.Length) && ('/' == mData[mStart + 1])) {
      // xString still starts with the second '/': Trim() cannot remove it.
      xString = xString.Substring(1);
      xToken.Type = TokenType.Comment;
    } else if (xChar1 == '!') {
      // Literal assembler code.
      xToken.Type = TokenType.LiteralAsm;
    }
    // A single '/' that is not followed by another '/' keeps the default token type.
  } else {
    xString = mData.Substring(mStart, rPos - mStart);

    if (string.IsNullOrWhiteSpace(xString) && xString.Length > 0) {
      xToken.Type = TokenType.WhiteSpace;

    } else if (xChar1 == '\'') {
      xToken.Type = TokenType.ValueString;
      // Remove surrounding '
      xString = xString.Substring(1, xString.Length - 2);

    } else if (char.IsDigit(xChar1)) {
      xToken.Type = TokenType.ValueInt;

    } else if (xChar1 == '$') {
      xToken.Type = TokenType.ValueInt;
      // Replace the leading $ with a 0x prefix.
      xString = "0x" + xString.Substring(1);

    } else if (IsAlphaNum(xChar1)) { // This must be after check for ValueInt
      // Keywords are stored upper-cased with the culture-sensitive ToUpper(), so the same
      // call is used here to stay consistent with that table.
      string xUpper = xString.ToUpper();

      // Special parsing when in pattern mode. We recognize some special strings
      // which would otherwise be considered as simple AlphaNum tokens.
      if (mAllowPatterns) {
        // Pattern and register names are fixed ASCII constants: compare them ordinally and
        // case-insensitively so the current culture (e.g. Turkish dotted/dotless i in
        // "_regidx", "edi", "esi") cannot prevent a match.
        if (RegisterPatterns.Contains(xString, StringComparer.OrdinalIgnoreCase)) {
          xToken.Type = TokenType.Register;
        } else if (xUpper == "_KEYWORD") {
          xToken.Type = TokenType.Keyword;
          xString = null;
        } else if (xUpper == "_ABC") {
          xToken.Type = TokenType.AlphaNum;
          xString = null;
        } else if (xUpper == "_PCALL") {
          xString = null;
          xToken.Type = TokenType.Call;
        }
      }

      if (xToken.Type == TokenType.Unknown) {
        if (Registers.Contains(xString, StringComparer.OrdinalIgnoreCase)) {
          xToken.Type = TokenType.Register;
        } else if (mKeywords.Contains(xUpper)) {
          xToken.Type = TokenType.Keyword;
        } else if (xString.Contains("(") && xString.Contains(")")) {
          // Text starting with an AlphaNum character and holding both parentheses is a call.
          xToken.Type = TokenType.Call;
        } else {
          xToken.Type = TokenType.AlphaNum;
        }
      }

    } else if (Delimiters.Contains(xString)) {
      xToken.Type = TokenType.Delimiter;

    } else if (Operators.Contains(xString)) {
      xToken.Type = TokenType.Operator;
    }
  }

  bool xIsCall = xToken.Type == TokenType.Call;
  bool xIsWhiteSpace = xToken.Type == TokenType.WhiteSpace;

  xToken.Value = xString;
  xToken.SrcPosStart = mStart;
  // Call tokens record rPos itself as their end; every other token ends one before rPos.
  xToken.SrcPosEnd = xIsCall ? rPos : rPos - 1;
  if (mAllWhitespace && !xIsWhiteSpace) {
    mAllWhitespace = false;
  }
  // After a call token the next token starts one character further than rPos.
  mStart = xIsCall ? rPos + 1 : rPos;

  if (mIncludeWhiteSpace || !xIsWhiteSpace) {
    aList.Add(xToken);
  }
}
|
|
|
|
/// <summary>Character classes used by <c>Parse</c> to detect token boundaries.</summary>
protected enum CharType {
  /// <summary>Whitespace character.</summary>
  WhiteSpace,
  /// <summary>Character accepted by <c>IsAlphaNum</c>.</summary>
  Identifier,
  /// <summary>Any character that falls in none of the other classes.</summary>
  Symbol,
  /// <summary>A quoted string literal that has just been scanned.</summary>
  String
}
|
|
|
|
/// <summary>Tell whether a character counts as part of an AlphaNum token: any letter or
/// digit, plus the three characters '_', '.' and '$'.</summary>
/// <param name="aChar">The character to classify.</param>
/// <returns>true if the character is treated as alphanumeric by the parser.</returns>
protected bool IsAlphaNum(char aChar) {
  switch (aChar) {
    case '_':
    case '.':
    case '$':
      return true;
    default:
      return char.IsLetterOrDigit(aChar);
  }
}
|
|
|
|
/// <summary>Consume the text that has been provided to the class constructor, splitting it
/// into a list of tokens. Token boundaries are placed where the character class changes;
/// quoted strings and parenthesized text are scanned as a unit.</summary>
/// <param name="lineNumber">Line number for diagnostics and debugging.</param>
/// <returns>The resulting tokens list.</returns>
/// <exception cref="Exception">A string literal has no closing quote.</exception>
protected TokenList Parse(int lineNumber) {
  // Save in comment, might be useful in future. Already had to dig it out of TFS once
  //var xRegex = new System.Text.RegularExpressions.Regex(@"(\W)");

  var xResult = new TokenList();
  CharType xLastCharType = CharType.WhiteSpace;
  CharType xCharType = CharType.WhiteSpace;
  int i;
  for (i = 0; i < mData.Length; i++) {
    char xChar = mData[i];
    // Extract string literal (surrounded with single quote characters).
    if (xChar == '\'') {
      // Take data before the ' as a token. When nothing is pending (the literal starts
      // exactly where the previous token ended, e.g. at the start of the line) there is
      // nothing to flush; flushing an empty span would make NewToken fail.
      if (mStart < i) {
        NewToken(xResult, lineNumber, ref i);
        // A comment or literal assembler token swallows the rest of the line and moves
        // i to the end: nothing is left to scan.
        if (i >= mData.Length) {
          break;
        }
      }
      // Now scan to the next ' taking into account escaped single quotes.
      bool escapedCharacter = false;
      bool done = false;
      for (i = i + 1; i < mData.Length; i++) {
        switch (mData[i]) {
          case '\'':
            if (escapedCharacter) {
              // An escaped quote is part of the string; the escape is now used up, so a
              // quote that immediately follows can close the literal.
              escapedCharacter = false;
            } else {
              done = true;
            }
            break;
          case '\\':
            // Two backslashes in a row cancel each other out.
            escapedCharacter = !escapedCharacter;
            break;
          default:
            escapedCharacter = false;
            break;
        }
        if (done) { break; }
      }
      if (!done) {
        throw new Exception("Unterminated string.");
      }
      // Step past the closing quote so that the token emitted below includes it.
      i++;
      xCharType = CharType.String;
    }
    else if (xChar == '(')
    {
      // Everything from the pending text up to and including the LAST ')' of the line
      // becomes one token. The position of that ')' is computed once instead of on every
      // scanned character.
      int xLastClose = mData.LastIndexOf(')');
      if (xLastClose > i)
      {
        i = xLastClose + 1;
        NewToken(xResult, lineNumber, ref i);
      }
      else
      {
        // NOTE(review): no ')' follows. As before, the rest of the line is skipped here,
        // which leaves i at mData.Length; the loop increment then moves it one past the
        // end before the final flush below. Confirm how an unmatched '(' should be handled.
        i = mData.Length;
      }
      // xCharType is intentionally left untouched in this branch.
    }
    else if (char.IsWhiteSpace(xChar))
    {
      xCharType = CharType.WhiteSpace;
    }
    else if (IsAlphaNum(xChar))
    {
      // _ and . were never likely to stand on their own. ie ESP _ 2 and ESP . 2 are never likely to be used.
      // Having them on their own required a lot of code
      // to treat them as a single unit where we did use them. So we treat them as AlphaNum.
      xCharType = CharType.Identifier;
    }
    else
    {
      xCharType = CharType.Symbol;
    }

    // i > 0 - Never do NewToken on first char. i = 0 is just a pass to get char and set lastchar.
    // But its faster as the second short circuit rather than a separate if.
    if ((xCharType != xLastCharType) && (0 < i)) {
      NewToken(xResult, lineNumber, ref i);
    }

    xLastCharType = xCharType;
  }

  // Last token
  if (mStart < mData.Length) {
    NewToken(xResult, lineNumber, ref i);
  }

  return xResult;
}
|
|
|
|
/// <summary>Create a new Parser instance and immediately consume the given
/// <paramref name="aData"/> string. On return the <see cref="Tokens"/> property is available
/// for enumeration.</summary>
/// <param name="aData">The text to be parsed. WARNING : This is expected to be a single full
/// line of text.</param>
/// <param name="lineNumber">Line number recorded in every token and reported in error
/// messages.</param>
/// <param name="aIncludeWhiteSpace">True if whitespace tokens must be kept in
/// <see cref="Tokens"/>; false to drop them.</param>
/// <param name="aAllowPatterns">True if <paramref name="aData"/> is a pattern, in which case
/// the special pattern names are recognized while parsing.</param>
/// <exception cref="ArgumentNullException"><paramref name="aData"/> is null.</exception>
/// <exception cref="Exception">At least one unrecognized token has been parsed, or a string
/// literal is not terminated.</exception>
public Parser(string aData, int lineNumber, bool aIncludeWhiteSpace, bool aAllowPatterns) {
  // Fail with a meaningful error instead of a NullReferenceException deep inside Parse.
  if (aData == null) {
    throw new ArgumentNullException("aData");
  }

  mData = aData;
  mIncludeWhiteSpace = aIncludeWhiteSpace;
  mAllowPatterns = aAllowPatterns;
  mAllWhitespace = true;

  mTokens = Parse(lineNumber);

  // Report the first token that could not be classified. One pass is enough; there is no
  // need to count the unknown tokens beforehand.
  foreach (Token xToken in mTokens) {
    if (xToken.Type == TokenType.Unknown) {
      throw new Exception(string.Format("Unknown token '{0}' found at {1}/{2}.",
        xToken.Value ?? "NULL", xToken.LineNumber, xToken.SrcPosStart));
    }
  }
}
|
|
}
|
|
}
|