| Posted by H. S. Teoh in reply to Dukc | PermalinkReply |
|
H. S. Teoh
| On Tue, Sep 29, 2020 at 04:22:18PM +0000, Dukc via Digitalmars-d-learn wrote:
> On Monday, 28 September 2020 at 18:23:43 UTC, Chloé Kekoa wrote:
> > The documentation of std.uni [1] says that the unicode struct provides sets for several binary properties. I am looking for a way to query non-binary properties of a character. Is that possible with std.uni or do I need to use a third-party library?
> >
> > I am specifically interested in the East_Asian_Width property [2] (which has six allowed values). Trying to access std.uni.unicode.East_Asian_Width results in the error message:
> >
> > > No unicode set by name East_Asian_Width was found.
> >
> > [1]: https://dlang.org/library/std/uni.html
> > [2]: https://www.unicode.org/reports/tr11/tr11-38.html
>
> It seems the East Asian width is Unicode standard 13.0, while Phobos implements 6.2. So it seems like a case for a third-party library :(.
[...]
OTOH, the relevant Unicode data file that contains East_Asian_Width data (EastAsianWidth.txt) is relatively straightforward to parse. In one of my projects, I wrote a little helper program to parse this file and generate a function that tells me if a given dchar is wide or narrow.
Here's the generated function (just copy-n-paste this into your code, no need for yet another external library dependency):
/**
 * Tells whether a code point is classified as wide.
 *
 * The classification is a fixed table of code point intervals; according to
 * the accompanying notes it was generated from EastAsianWidth.txt.
 *
 * Params:
 *     ch = the code point to classify
 *
 * Returns: true if ch lies inside one of the wide intervals, false otherwise.
 */
bool isWide(dchar ch) @safe pure nothrow @nogc
{
    // Half-open [first, pastLast) intervals of wide code points, sorted in
    // ascending order and non-overlapping.
    static immutable uint[2][] wideSpans = [
        [4352, 4448],
        [9001, 9003],
        [11904, 12351],
        [12353, 12872],
        [12880, 19904],
        [19968, 42183],
        [43360, 43389],
        [44032, 55204],
        [63744, 64256],
        [65040, 65050],
        [65072, 65132],
        [65281, 65377],
        [65504, 65511],
        [110592, 110594],
        [127488, 127570],
        [131072, 262142]
    ];

    foreach (span; wideSpans)
    {
        // The table is sorted, so falling short of an interval's start means
        // ch sits in the gap before it and cannot match any later interval.
        if (ch < span[0])
            return false;
        if (ch < span[1])
            return true;
    }

    // Past the last interval.
    return false;
}
Here's the utility that generated this code:
/**
* Simple program to parse EastAsianWidth.txt to extract some useful info.
*/
import std.algorithm;
import std.conv;
import std.range;
import std.regex;
import std.stdio;
/**
 * A half-open interval [start, end) of code points: `start` is the first
 * member and `end` is one past the last member.
 */
struct CodeRange
{
    dchar start, end;

    /**
     * Tests whether this range intersects or directly touches `cr`.
     *
     * Ranges that merely abut (one's `end` equals the other's `start`) count
     * as overlapping. The test is symmetric and also covers the cases where
     * one range fully contains the other.
     *
     * Params:
     *     cr = the range to compare against
     *
     * Returns: true if the two ranges intersect or are adjacent.
     */
    bool overlaps(CodeRange cr) const
    {
        // Standard interval test; `<=` rather than `<` so that adjacent
        // ranges are reported as overlapping.
        return start <= cr.end && cr.start <= end;
    }

    unittest
    {
        assert(CodeRange(1,11).overlaps(CodeRange(11,12)));
        assert(!CodeRange(1,10).overlaps(CodeRange(11,12)));

        // Symmetry: the answer must not depend on the receiver.
        assert(CodeRange(11,12).overlaps(CodeRange(1,11)));
        assert(!CodeRange(11,12).overlaps(CodeRange(1,10)));

        // Containment and shared end points.
        assert(CodeRange(1,20).overlaps(CodeRange(5,10)));
        assert(CodeRange(5,10).overlaps(CodeRange(1,20)));
        assert(CodeRange(1,12).overlaps(CodeRange(11,12)));
    }

    /**
     * Grows this range to the smallest range covering both itself and `cr`.
     *
     * Any gap between the two ranges becomes part of the result.
     *
     * Params:
     *     cr = the range to absorb
     */
    void merge(CodeRange cr)
    {
        start = min(start, cr.start);
        end = max(end, cr.end);
    }

    unittest
    {
        auto cr = CodeRange(10,20);
        cr.merge(CodeRange(20,30));
        assert(cr == CodeRange(10,30));
    }

    /**
     * Writes the range as upper-case hex, zero-padded to at least four
     * digits: "XXXX" for a single code point, "XXXX..YYYY" (inclusive last
     * code point) otherwise.
     *
     * Params:
     *     sink = receives the formatted text
     */
    void toString(scope void delegate(const(char)[]) sink)
    {
        import std.format : formattedWrite;

        sink.formattedWrite("%04X", start);
        // `end` is exclusive, so the last member is end-1; it is printed only
        // when the range holds more than one code point.
        if (end > start+1)
            sink.formattedWrite("..%04X", end-1);
    }
}
/**
 * One record: a code point range together with its width class.
 */
struct Entry
{
    CodeRange range;
    string width;

    /**
     * Writes the entry as the formatted range, a semicolon, and the width.
     *
     * Params:
     *     sink = receives the formatted text
     */
    void toString(scope void delegate(const(char)[]) sink)
    {
        import std.format : formattedWrite;

        formattedWrite(sink, "%s;%s", range, width);
    }
}
/**
 * Lazily converts text lines into Entry records.
 *
 * Recognized data lines are either `XXXX;W` (one code point) or
 * `XXXX..YYYY;W` (an inclusive range), where the width is one of
 * N, A, H, W, F or Na. Lines starting with `#` and blank lines are skipped.
 * The six width classes are collapsed to just "W" (from W and F) and "N"
 * (everything else).
 *
 * Params:
 *     input = an input range of lines
 *
 * Returns: An input range of Entry objects.
 *
 * Throws: Exception when a line is neither data, comment, nor blank.
 */
auto parse(R)(R input)
    if (isInputRange!R && is(ElementType!R : const(char)[]))
{
    // Only wide-versus-narrow matters here: explicit and implicit narrowness
    // are not distinguished, and ambiguous-width characters default to
    // narrow. This table maps each original class to its reduced equivalent.
    string[string] equivs = [
        "W"  : "W",
        "F"  : "W",
        "N"  : "N",
        "Na" : "N",
        "H"  : "N",
        "A"  : "N"
    ];

    auto reEmpty = regex(`^\s*$`);
    auto reSingle = regex(`^([0-9A-F]+);(N|A|H|W|F|Na)\b`);
    auto reRange = regex(`^([0-9A-F]+)\.\.([0-9A-F]+);(N|A|H|W|F|Na)\b`);

    struct Result
    {
        R range;
        Entry front;
        bool empty;

        this(R _range)
        {
            range = _range;
            next(); // position on the first data line
        }

        // Scans forward from the current line to the next data line and
        // loads it into `front`. The data line itself is NOT consumed here;
        // popFront() steps past it before calling next() again.
        void next()
        {
            for (; !range.empty; range.popFront())
            {
                auto line = range.front;

                if (auto single = line.match(reSingle))
                {
                    auto width = equivs[single.captures[2]];
                    dchar cp = cast(dchar) single.captures[1].to!int(16);
                    // Stored half-open, hence cp+1 as the end.
                    front = Entry(CodeRange(cp, cp+1), width);
                    empty = false;
                    return;
                }

                if (auto span = line.match(reRange))
                {
                    auto width = equivs[span.captures[3]];
                    dchar first = cast(dchar) span.captures[1].to!int(16);
                    // The file gives an inclusive last code point; +1 makes
                    // the end exclusive.
                    dchar pastLast = cast(dchar) span.captures[2].to!int(16) + 1;
                    front = Entry(CodeRange(first, pastLast), width);
                    empty = false;
                    return;
                }

                if (!line.startsWith("#") && !line.match(reEmpty))
                {
                    import std.string : format;
                    throw new Exception("Couldn't parse line:\n%s"
                                        .format(line));
                }
                // Comment or blank line: fall through to the loop increment.
            }

            empty = true;
        }

        void popFront()
        {
            range.popFront();
            next();
        }
    }

    static assert(isInputRange!Result);

    return Result(input);
}
/**
 * Prints the parsed ranges grouped by width class.
 *
 * Consecutive entries of the same width are merged into one range before
 * printing. For each width a "# <width>" header is written to stdout,
 * followed by one "<range>;<width>" line per range and a blank line.
 *
 * Params:
 *     input = an input range of lines, passed on to parse()
 */
void outputByWidthType(R)(R input)
    if (isInputRange!R && is(ElementType!R : const(char)[]))
{
    CodeRange[][string] widths;
    string lastWidth;

    foreach (entry; input.parse())
    {
        auto bucket = entry.width in widths;

        // Extend the most recent range only when the immediately preceding
        // entry had the same width; otherwise start a new range.
        if (bucket && (*bucket).length > 0 && entry.width == lastWidth)
            (*bucket)[$-1].merge(entry.range);
        else
            widths[entry.width] ~= entry.range;

        lastWidth = entry.width;
    }

    foreach (width; widths.byKey())
    {
        writeln("# ", width);
        foreach (range; widths[width])
            writefln("%s;%s", range, width);
        writeln();
    }
}
/**
 * Collapses each run of consecutive entries that share the same width into a
 * single entry whose range covers the whole run.
 *
 * Every run is yielded exactly once, including a final run that consists of
 * a single entry.
 *
 * Params:
 *     input = an input range of Entry objects
 *
 * Returns: An input range of Entry objects.
 */
auto mergeConsecutive(R)(R input)
    if (isInputRange!R && is(ElementType!R : Entry))
{
    struct Result
    {
        R range;
        bool empty;
        Entry front;
        Entry current;  // the run being accumulated; valid only if `pending`
        bool pending;   // true while `current` holds a run not yet yielded

        this(R _range)
        {
            range = _range;
            next();
        }

        // Publishes the next complete run in `front`, or sets `empty` once
        // both the source and the pending run are exhausted.
        void next()
        {
            if (!pending)
            {
                if (range.empty)
                {
                    empty = true;
                    return;
                }
                // Start a new run from the next source entry.
                current = range.front;
                range.popFront();
                pending = true;
            }

            while (!range.empty)
            {
                auto e = range.front;
                range.popFront();

                if (e.width != current.width)
                {
                    // Width changed: the accumulated run is complete. The
                    // entry just read starts the next run and stays pending,
                    // so it is still yielded if the source is now empty.
                    front = current;
                    current = e;
                    empty = false;
                    return;
                }

                current.range.merge(e.range);
            }

            // Source exhausted: flush the last run.
            front = current;
            pending = false;
            empty = false;
        }

        void popFront()
        {
            next();
        }
    }

    static assert(isInputRange!Result);

    return Result(input);
}
/**
 * Prints the parsed entries in input order, one per line, after merging
 * consecutive entries of equal width.
 *
 * Params:
 *     input = an input range of lines, passed on to parse()
 */
void outputByCodePoint(R)(R input)
    if (isInputRange!R && is(ElementType!R : const(char)[]))
{
    auto merged = mergeConsecutive(parse(input));
    writefln("%(%s\n%)", merged);
}
/**
 * Adds up the sizes of the merged "W" ranges and of the merged "N" ranges
 * and prints both totals to stdout.
 *
 * Params:
 *     input = an input range of lines, passed on to parse()
 */
void tally(R)(R input)
    if (isInputRange!R && is(ElementType!R : const(char)[]))
{
    int totalW, totalN;

    foreach (e; input.parse().mergeConsecutive())
    {
        // Ranges are half-open, so end - start is the range's size.
        const span = e.range.end - e.range.start;

        switch (e.width)
        {
            case "W":
                totalW += span;
                break;
            case "N":
                totalN += span;
                break;
            default:
                // Any other width is treated as a program error.
                assert(0);
        }
    }

    writefln("Tally: W=%d N=%d\n", totalW, totalN);
}
/**
 * Collects every merged "W" range into a code point set and prints the
 * source code that the set generates for a function named "isWide".
 *
 * Params:
 *     input = an input range of lines, passed on to parse()
 */
void genRecogCode(R)(R input)
    if (isInputRange!R && is(ElementType!R : const(char)[]))
{
    import std.uni : CodepointSet;

    CodepointSet wideChars;

    foreach (e; input.parse().mergeConsecutive())
    {
        if (e.width != "W")
            continue;

        // NOTE(review): relies on add(a, b) taking a half-open [a, b)
        // interval, which matches CodeRange's convention; confirm against
        // the std.uni documentation.
        wideChars.add(e.range.start, e.range.end);
    }

    writeln(wideChars.toSourceCode("isWide"));
}
/**
 * Entry point: runs the sub-command named by the first argument over the
 * lines of "ext/EastAsianWidth.txt".
 *
 * Params:
 *     args = program name followed by one of
 *            bywidth, bypoint, tally or gencode
 *
 * Returns: 0 on success, 1 if the command is missing or unknown.
 */
int main(string[] args)
{
    if (args.length < 2)
    {
        assert(args.length > 0);
        stderr.writefln("Usage: %s (bywidth|bypoint|tally|gencode)", args[0]);
        return 1;
    }

    // The data file is opened before the command is checked, so a missing
    // file is reported even for an unknown command.
    auto input = File("ext/EastAsianWidth.txt", "r").byLine();
    auto cmd = args[1];

    if (cmd == "bywidth")
    {
        outputByWidthType(input);
    }
    else if (cmd == "bypoint")
    {
        outputByCodePoint(input);
    }
    else if (cmd == "tally")
    {
        tally(input);
    }
    else if (cmd == "gencode")
    {
        genRecogCode(input);
    }
    else
    {
        stderr.writefln("Unknown command: %s", cmd);
        return 1;
    }

    return 0;
}
T
--
People tell me that I'm skeptical, but I don't believe them.
|