module linetoken;

private
{
    import std.ctype;
    import std.string;
    import std.utf;
}

// UTF-8 front end for the tokenizer.
//
// Converts all three arguments to UTF-32, delegates to the dchar[] overload
// below, and converts every returned token back to UTF-8.
//
// Params:
//   pSource  = the line to split into tokens.
//   pDelim   = delimiter; only its first character is used. An empty string
//              means "any run of whitespace".
//   pComment = comment-introducer string; an empty string disables comment
//              detection.
// Returns: one string per token, in the order they appear in pSource.
char[][] TokenizeLine(char[] pSource, char[] pDelim = ",", char[] pComment = "//")
{
    dchar[][] lTemp;
    char[][] lResult;

    lTemp = TokenizeLine(std.utf.toUTF32(pSource),
                         std.utf.toUTF32(pDelim),
                         std.utf.toUTF32(pComment));

    foreach (dchar[] lLine; lTemp)
    {
        lResult ~= std.utf.toUTF8(lLine);
    }
    return lResult;
}

// Splits pSource into tokens.
//
// Behaviour, as implemented below:
//   * Leading whitespace before a token is skipped; trailing whitespace of a
//     token is trimmed when a single-character delimiter is in use.
//   * A token that starts with one of  " ' ( [ { `  yields the opening
//     character as its own token, followed by a token holding everything up
//     to (not including) the matching closer. Nested brackets of the same
//     kind are counted and kept in the token.
//   * A back-slash makes the following character a literal part of the token.
//   * Outside brackets, the first occurrence of pComment ends the scan.
//   * The result always has at least one element (an empty string if no
//     token was found).
//
// Params:
//   pSource  = the line to split into tokens.
//   pDelim   = delimiter; only its first character is used. An empty string
//              means "any run of whitespace".
//   pComment = comment-introducer string; an empty string disables comment
//              detection.
// Returns: the tokens, in the order they appear in pSource.
dchar[][] TokenizeLine(dchar[] pSource, dchar[] pDelim = ",", dchar[] pComment = "//")
{
    dchar[][] lResult;
    dchar lOpenBracket;
    dchar lCloseBracket;
    int lNestLevel;     // > 0 while inside a bracketed/quoted section.
    int lInToken;       // Index of the token being built, or -1 if none.
    dchar[] lDelim;
    int lTrimSpot;      // Token length just after the first trailing space, or -1.
    int lPos;
    bool lLitMode;      // True when the previous char was a back-slash.

    // Parallel tables: vCloseBracket[n] closes vOpenBracket[n].
    static dchar[] vOpenBracket = "\"'([{`";
    static dchar[] vCloseBracket = "\"')]}`";

    if (pDelim.length > 0)
        // Only use single-char delimiters. Excess chars are ignored.
        lDelim ~= pDelim[0];
    else
        lDelim = ""; // Meaning 'any group of whitespace chars'

    lInToken = -1;
    lTrimSpot = -1;

    foreach (int i, dchar c; pSource)
    {
        if (lNestLevel == 0)
        {
            // Check for comment string.
            if (pComment.length > 0)
            {
                if (c == pComment[0])
                {
                    // '>=' (not '>') so that a comment marker that ends
                    // exactly at the end of the input is still recognised.
                    if ((pSource.length - i) >= pComment.length)
                    {
                        if (pSource[i .. i + pComment.length] == pComment)
                            break;
                    }
                }
            }
        }

        if (lInToken == -1)
        {
            // Not in a token yet.
            if (std.ctype.isspace(c))
                continue; // Skip over spaces

            // Non-space so a token is about to start.
            lInToken = lResult.length;
            lResult.length = lInToken + 1;
            lTrimSpot = -1;
        }

        if (lLitMode)
        {
            // Previous char was a back-slash: take this one verbatim.
            lResult[lInToken] ~= c;
            lLitMode = false;
            lTrimSpot = -1;
            continue;
        }

        if (c == '\\')
        {
            lLitMode = true;
            continue;
        }

        if (lNestLevel == 0)
        {
            // Only check for delimiters if not in 'bracket'-mode.
            if (lDelim.length == 0)
            {
                if (std.ctype.isspace(c))
                {
                    lTrimSpot = -1;
                    lInToken = -1;
                    // Go fetch next character.
                    continue;
                }
            }
            else if (c == lDelim[0])
            {
                // Found a token delimiter, so I end the current token.
                if (lTrimSpot != -1)
                {
                    // But first I trim off trailing spaces.
                    lResult[lInToken].length = lTrimSpot - 1;
                    lTrimSpot = -1;
                }
                lInToken = -1;
                // Go fetch next character.
                continue;
            }
        }

        // Open-bracket detection applies only outside 'bracket'-mode.
        // The token created right after an opening bracket is empty, so
        // without the nest-level guard an immediately following quote or
        // bracket (e.g. `""` or `[[a]]`) would wrongly start a second
        // bracket section instead of closing/nesting the current one.
        if (lNestLevel == 0 && lResult[lInToken].length == 0)
        {
            // Not started a token yet.
            lPos = find(vOpenBracket, c);
            if (lPos != -1)
            {
                // An 'open' bracket was found, so make this its
                // own token, start another new one, and go into
                // 'bracket'-mode.
                lResult[lInToken] ~= c;
                lInToken = lResult.length;
                lResult.length = lInToken + 1;

                lOpenBracket = c;
                lCloseBracket = vCloseBracket[lPos];
                lNestLevel = 1;
                // Go fetch next character.
                continue;
            }
        }

        if (lNestLevel > 0)
        {
            // The closer is tested first, so for quotes (where opener and
            // closer are the same char) the next quote always closes.
            if (c == lCloseBracket)
            {
                lNestLevel--;
                if (lNestLevel == 0)
                {
                    // Okay, I've found the end of the bracketed chars.
                    // Note that this doesn't necessarily mean the end of
                    // a token was also found. And I can start checking
                    // again for trailing spaces.
                    lTrimSpot = -1;
                    // Go fetch next character
                    continue;
                }
            }
            else if (c == lOpenBracket)
            {
                // Note that the char is added to the token too.
                lNestLevel++;
            }
        }

        // Finally, I get to add this char to the token.
        lResult[lInToken] ~= c;

        if (lNestLevel == 0)
        {
            // Only check for trailing spaces if not in 'bracket'-mode
            if (std.ctype.isspace(c))
            {
                // It was a space, so it is potentially a trailing space,
                // thus I mark its spot (if it's the first in a set of spaces.)
                if (lTrimSpot == -1)
                    lTrimSpot = lResult[lInToken].length;
            }
            else
                lTrimSpot = -1;
        }
    }

    if (lResult.length == 0)
        lResult ~= "";

    if (lTrimSpot != -1)
    {
        // Trim off trailing spaces on last token.
        lResult[$ - 1].length = lTrimSpot - 1;
    }

    return lResult;
}

// Returns the index of the first occurrence of pCharToFind in
// pStringToScan, or -1 if it does not occur.
int find(dchar[] pStringToScan, dchar pCharToFind)
{
    foreach (int i, dchar c; pStringToScan)
    {
        if (pCharToFind == c)
            return i;
    }
    return -1;
}

/*
How To Use
===============================
Insert this into your code ...

    private import linetoken;

Then to call it use ...

    char[][] Toks;
    char[] InputLine;
    char[] DelimChar;
    char[] CommentString;

    Toks = TokenizeLine(InputLine, DelimChar, CommentString);

** Note that it accepts all 'char[]' or all 'dchar[]' arguments.

The routine scans the input string and returns a set of strings, one per
token found in the input string.
The tokens are delimited by the single character in DelimChar. However, if DelimChar is an empty string, then tokens are delimited by any group of one or more white-space characters. By default, DelimChar is ",".

If CommentString is not empty, then all parts of the input string from the beginning of the comment to the end are ignored. By default CommentString is "//".

If a token begins with a quote (single, double or back), then you will get back two tokens. The first is the quote as a single character string, and the second is all the characters up to, but not including, the next quote of the same type. The ending quote is discarded.

If a token begins with a bracket (parenthesis, square, or brace), then you will get back two tokens. The first is the opening bracket as a single character string, and the second is all the characters up to, but not including, the matching end bracket, taking nested brackets (of the same type) into consideration.

All whitespace in between tokens is ignored, and not returned.

If the tokenizer finds a back-slash character (\), then the next character is always considered as a part of a token. You can use this to force the delimiter character or spaces to be inserted into a token.

Examples:
 TokenizeLine(" abc, def , ghi, ") --> {"abc", "def", "ghi", ""}
 TokenizeLine("character or spaces to be \t inserted", "") --> {"character", "or", "spaces", "to", "be", "inserted"}
 TokenizeLine(" abc; def , ghi; ", ";") --> {"abc", "def , ghi", "" }
 TokenizeLine(" abc, [def , ghi] ") --> {"abc", "[", "def , ghi"}
 TokenizeLine(" abc, [def , ghi] // comment") --> {"abc", "[", "def , ghi"}
 TokenizeLine(" abc, [def , [ghi, jkl] ] ") --> {"abc", "[", "def , [ghi, jkl] "}
 TokenizeLine(" abc, def , ghi ; comment", ",", ";") --> {"abc", "def", "ghi"}
 TokenizeLine(` abc, "def , ghi" , jkl `) --> {"abc", `"`, "def , ghi", "jkl"}
*/