Hello,
I am trying to make a small generic lexer that bases its token analysis on regular expressions. The principle I have in mind is to define a token type table with its corresponding regular expression, here is the code I currently have:
import std.regex;
/// ditto
struct Token
{
/// The token type
string type;
/// The regex to match the token
Regex!char re;
/// The matched string
string matched = null;
}
/// Function to find the right token in the given table
Token find(Token[] table, const(Captures!string delegate(Token) pure @safe) fn)
{
foreach (token; table)
if (fn(token)) return token;
return Token("", regex(r""));
}
/// The lexer class
class Lexer
{
private Token[] tokens;
/// ditto
this(Token[] tkns = [])
{
this.tokens = tkns;
}
override string toString() const
{
import std.algorithm : map;
import std.conv : to;
import std.format : format;
return to!string
(this.tokens.map!(tok =>
format("(%s, %s)", tok.type, tok.matched)));
}
// Others useful methods ...
}
/// My token table
static Token[] table =
[ Token("NUMBER", regex(r"(?:\d+(?:\.\d*)?|\.\d+)"))
, Token("MINS", regex(r"\-"))
, Token("PLUS", regex(r"\+")) ];
/// Build a new lexer
Lexer lex(string text)
{
Token[] result = [];
while (text.length > 0)
{
Token token = table.find((Token t) => matchFirst(text, t.re));
const string tmatch = matchFirst(text, token.re)[0];
result ~= Token(token.type, token.re, tmatch);
text = text[tmatch.length .. $];
}
return new Lexer(result);
}
void main()
{
import std.stdio : writeln;
const auto l = lex("3+2");
writeln(l);
}
When I run this program, it gives the following sequence:
["(NUMBER, 3)", "(NUMBER, 2)", "(NUMBER, 2)"]
While I want this:
["(NUMBER, 3)", "(PLUS, +)", "(NUMBER, 2)"]
The problem seems to come from the find
function which returns the first regex to have match and not the regex of the first substring to have match (I hope I am clear enough 😅).
I'm not used to manipulating regex, especially in D, so I'm not sure how to consider a solution to this problem.
I thank you in advance for your help.