Accessing non-binary Unicode properties with std.uni

On Monday, 28 September 2020 at 18:23:43 UTC, Chloé Kekoa wrote:
> The documentation of std.uni [1] says that the unicode struct provides sets for several binary properties. I am looking for a way to query non-binary properties of a character. Is that possible with std.uni or do I need to use a third-party library?
>
> I am specifically interested in the East_Asian_Width property [2] (which has six allowed values). Trying to access std.uni.unicode.East_Asian_Width results in the error message:
>
>> No unicode set by name East_Asian_Width was found.
>
> [1]: https://dlang.org/library/std/uni.html
> [2]: https://www.unicode.org/reports/tr11/tr11-38.html

It seems the East Asian Width property is from Unicode standard 13.0, while Phobos implements 6.2. So it seems like a case for a third-party library :(.

September 29, 2020

Re: Accessing non-binary Unicode properties with std.uni

Posted by H. S. Teoh
in reply to Dukc

Permalink

H. S. Teoh

Posted in reply to Dukc

Permalink

On Tue, Sep 29, 2020 at 04:22:18PM +0000, Dukc via Digitalmars-d-learn wrote:
> On Monday, 28 September 2020 at 18:23:43 UTC, Chloé Kekoa wrote:
> > The documentation of std.uni [1] says that the unicode struct provides sets for several binary properties. I am looking for a way to query non-binary properties of a character. Is that possible with std.uni or do I need to use a third-party library?
> > 
> > I am specifically interested in the East_Asian_Width property [2] (which has six allowed values). Trying to access std.uni.unicode.East_Asian_Width results in the error message:
> > 
> > > No unicode set by name East_Asian_Width was found.
> > 
> > [1]: https://dlang.org/library/std/uni.html
> > [2]: https://www.unicode.org/reports/tr11/tr11-38.html
> 
> It seems the East Asian Width property is from Unicode standard 13.0, while Phobos implements 6.2. So it seems like a case for a third-party library :(.
[...]

OTOH, the relevant Unicode data file that contains East_Asian_Width data (EastAsianWidth.txt) is relatively straightforward to parse.  In one of my projects, I wrote a little helper program to parse this file and generate a function that tells me if a given dchar is wide or narrow.

Here's the generated function (just copy-n-paste this into your code, no need for yet another external library dependency):

	/**
	 * Tells whether `ch` lies in one of the hard-coded "wide" code point
	 * intervals.
	 *
	 * The checks run in ascending order of code point; each `ch < bound` test
	 * closes the interval opened by the previous one, so the answer alternates
	 * between narrow (false) and wide (true) as the bounds increase.
	 *
	 * Params:
	 *   ch = the code point to classify
	 * Returns: true if `ch` is in a wide interval, false otherwise.
	 */
	bool isWide(dchar ch) @safe pure nothrow @nogc
	{
	    if (ch < 4352) return false;    // below U+1100
	    if (ch < 4448) return true;     // U+1100 .. U+115F
	    // Between U+1160 and U+2E7F only U+2329 and U+232A are wide.
	    if (ch < 11904) return ch == 9001 || ch == 9002;
	    if (ch < 12351) return true;    // U+2E80 .. U+303E
	    if (ch < 12353) return false;   // U+303F .. U+3040
	    if (ch < 12872) return true;    // U+3041 .. U+3247
	    if (ch < 12880) return false;   // U+3248 .. U+324F
	    if (ch < 19904) return true;    // U+3250 .. U+4DBF
	    if (ch < 19968) return false;   // U+4DC0 .. U+4DFF
	    if (ch < 42183) return true;    // U+4E00 .. U+A4C6
	    if (ch < 43360) return false;   // U+A4C7 .. U+A95F
	    if (ch < 43389) return true;    // U+A960 .. U+A97C
	    if (ch < 44032) return false;   // U+A97D .. U+ABFF
	    if (ch < 55204) return true;    // U+AC00 .. U+D7A3
	    if (ch < 63744) return false;   // U+D7A4 .. U+F8FF
	    if (ch < 64256) return true;    // U+F900 .. U+FAFF
	    if (ch < 65040) return false;   // U+FB00 .. U+FE0F
	    if (ch < 65050) return true;    // U+FE10 .. U+FE19
	    if (ch < 65072) return false;   // U+FE1A .. U+FE2F
	    if (ch < 65132) return true;    // U+FE30 .. U+FE6B
	    if (ch < 65281) return false;   // U+FE6C .. U+FF00
	    if (ch < 65377) return true;    // U+FF01 .. U+FF60
	    if (ch < 65504) return false;   // U+FF61 .. U+FFDF
	    if (ch < 65511) return true;    // U+FFE0 .. U+FFE6
	    // Between U+FFE7 and U+1F1FF only U+1B000 and U+1B001 are wide.
	    if (ch < 127488) return ch == 110592 || ch == 110593;
	    if (ch < 127570) return true;   // U+1F200 .. U+1F251
	    if (ch < 131072) return false;  // U+1F252 .. U+1FFFF
	    if (ch < 262142) return true;   // U+20000 .. U+3FFFD
	    return false;                   // U+3FFFE and above
	}

Here's the utility that generated this code:

	/**
	 * Simple program to parse EastAsianWidth.txt to extract some useful info.
	 */

	import std.algorithm;
	import std.conv;
	import std.range;
	import std.regex;
	import std.stdio;

	/**
	 * A half-open interval of code points: `start` is included, `end` is not.
	 */
	struct CodeRange
	{
	    dchar start, end;

	    /**
	     * Tests whether this range and `cr` share at least one code point or
	     * touch each other (one ends exactly where the other begins), i.e.
	     * whether merging them produces a single gap-free range.
	     *
	     * The test is symmetric and also covers the case where one range
	     * completely contains the other.
	     *
	     * Params:
	     *   cr = the range to compare against
	     * Returns: true if the two ranges overlap or are adjacent.
	     */
	    bool overlaps(CodeRange cr) const
	    {
	        // Two intervals are disjoint-with-a-gap only when one ends strictly
	        // before the other starts; everything else counts as overlapping.
	        return start <= cr.end && cr.start <= end;
	    }

	    unittest
	    {
	        assert(CodeRange(1,11).overlaps(CodeRange(11,12)));
	        assert(!CodeRange(1,10).overlaps(CodeRange(11,12)));

	        // Symmetry: adjacency is detected from either side.
	        assert(CodeRange(11,12).overlaps(CodeRange(1,11)));
	        assert(!CodeRange(11,12).overlaps(CodeRange(1,10)));

	        // Containment in either direction.
	        assert(CodeRange(1,20).overlaps(CodeRange(5,10)));
	        assert(CodeRange(5,10).overlaps(CodeRange(1,20)));
	    }

	    /**
	     * Grows this range in place so that it spans both itself and `cr`.
	     * Any gap between the two ranges becomes part of the result.
	     */
	    void merge(CodeRange cr)
	    {
	        start = min(start, cr.start);
	        end = max(end, cr.end);
	    }

	    unittest
	    {
	        auto cr = CodeRange(10,20);
	        cr.merge(CodeRange(20,30));
	        assert(cr == CodeRange(10,30));
	    }

	    /**
	     * Writes the range to `sink` as upper-case hex, padded to at least four
	     * digits: "XXXX" for a single code point, otherwise "XXXX..YYYY" where
	     * YYYY is the last code point included (`end - 1`).
	     */
	    void toString(scope void delegate(const(char)[]) sink)
	    {
	        import std.format : formattedWrite;
	        sink.formattedWrite("%04X", start);
	        if (end > start+1)
	            sink.formattedWrite("..%04X", end-1);
	    }
	}

	/**
	 * One record: a code point range paired with its width label.
	 */
	struct Entry
	{
	    CodeRange range;
	    string width;

	    /// Writes the record to `sink` in the form "<range>;<width>".
	    void toString(scope void delegate(const(char)[]) sink)
	    {
	        import std.format : formattedWrite;

	        formattedWrite(sink, "%s;%s", range, width);
	    }
	}

	/**
	 * Lazily parses the lines of EastAsianWidth.txt.
	 *
	 * Lines starting with '#' and blank lines are skipped. Every other line
	 * must begin with `XXXX;T` (single code point) or `XXXX..YYYY;T` (inclusive
	 * range), where T is one of N, A, H, W, F, Na. Whitespace on either side
	 * of the ';' is tolerated. Anything after the width field is ignored.
	 *
	 * The width of each entry is reduced to "N" or "W" (see the equivalence
	 * table in the body), and ranges are converted to half-open CodeRanges.
	 *
	 * Params:
	 *   input = input range of text lines
	 * Returns: An input range of Entry objects.
	 * Throws: Exception on a line that is neither data, a comment, nor blank.
	 */
	auto parse(R)(R input)
	    if (isInputRange!R && is(ElementType!R : const(char)[]))
	{
	    // For our purposes, we don't need to distinguish between explicit/implicit
	    // narrowness, and ambiguous cases can just default to narrow. So we map
	    // the original width to its equivalent using the following equivalence
	    // table.
	    string[string] equivs = [
	        "Na" : "N",
	        "N"  : "N",
	        "H"  : "N",
	        "A"  : "N",
	        "W"  : "W",
	        "F"  : "W"
	    ];

	    // Leading/trailing spaces inside a field are not significant in UCD
	    // data files, so allow optional whitespace around the ';' separator.
	    auto reEmpty = regex(`^\s*$`);
	    auto reSingle = regex(`^([0-9A-F]+)\s*;\s*(N|A|H|W|F|Na)\b`);
	    auto reRange = regex(`^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(N|A|H|W|F|Na)\b`);

	    struct Result
	    {
	        R     range;
	        Entry front;
	        bool  empty;

	        this(R _range)
	        {
	            range = _range;
	            next(); // get things started
	        }

	        // Advances `range` until it sits on a data line, then decodes that
	        // line into `front`. The data line itself is NOT consumed here;
	        // popFront() drops it before looking for the next one.
	        void next()
	        {
	            while (!range.empty)
	            {
	                auto line = range.front;

	                if (auto m = line.match(reSingle))
	                {
	                    auto width = equivs[m.captures[2]];
	                    dchar ch = cast(dchar) m.captures[1].to!int(16);
	                    // Single code point -> half-open range of length 1.
	                    front = Entry(CodeRange(ch, ch+1), width);
	                    empty = false;
	                    return;
	                }
	                else if (auto m = line.match(reRange))
	                {
	                    auto width = equivs[m.captures[3]];
	                    dchar start = cast(dchar) m.captures[1].to!int(16);
	                    // The file's upper bound is inclusive; +1 makes it exclusive.
	                    dchar end = cast(dchar) m.captures[2].to!int(16) + 1;
	                    front = Entry(CodeRange(start, end), width);
	                    empty = false;
	                    return;
	                }
	                else if (!line.startsWith("#") && !line.match(reEmpty))
	                {
	                    import std.string : format;
	                    throw new Exception("Couldn't parse line:\n%s"
	                                        .format(line));
	                }

	                // Comment or blank line: skip it.
	                range.popFront();
	            }
	            empty = true;
	        }

	        void popFront()
	        {
	            range.popFront();
	            next();
	        }
	    }
	    static assert(isInputRange!Result);

	    return Result(input);
	}

	/**
	 * Parses `input` and prints the resulting ranges to stdout grouped by
	 * width: a "# <width>" header, one "<range>;<width>" line per range, then
	 * a blank line.
	 *
	 * An entry whose width equals that of the entry immediately before it is
	 * merged into the last range already stored for that width rather than
	 * being stored separately.
	 *
	 * Params:
	 *   input = input range of text lines, as accepted by parse()
	 */
	void outputByWidthType(R)(R input)
	    if (isInputRange!R && is(ElementType!R : const(char)[]))
	{
	    CodeRange[][string] byWidth;
	    string previousWidth;

	    foreach (entry; input.parse())
	    {
	        auto bucket = entry.width in byWidth;
	        const bool continuesRun = bucket
	            && (*bucket).length > 0
	            && entry.width == previousWidth;

	        if (continuesRun)
	            (*bucket)[$-1].merge(entry.range);
	        else
	            byWidth[entry.width] ~= entry.range;

	        previousWidth = entry.width;
	    }

	    foreach (width; byWidth.byKey())
	    {
	        writeln("# ", width);
	        foreach (range; byWidth[width])
	            writefln("%s;%s", range, width);
	        writeln();
	    }
	}

	/**
	 * Collapses each run of consecutive entries that share the same width
	 * into a single entry whose range spans the whole run (via
	 * CodeRange.merge, so gaps inside a run are absorbed).
	 *
	 * Params:
	 *   input = input range of Entry objects
	 * Returns: An input range of Entry objects.
	 */
	auto mergeConsecutive(R)(R input)
	    if (isInputRange!R && is(ElementType!R : Entry))
	{
	    struct Result
	    {
	        R     range;
	        bool  empty;
	        Entry front;
	        Entry current;      // run being accumulated, not yet yielded
	        bool  hasPending;   // true while `current` holds an unyielded run

	        this(R _range)
	        {
	            range = _range;
	            next();
	        }

	        // Pulls entries from `range` until the current run is complete,
	        // then publishes it in `front`. The entry that ended the run has
	        // already been consumed and is kept in `current` as the start of
	        // the next run, so it must still be yielded even if `range` is
	        // exhausted by then.
	        void next()
	        {
	            while (!range.empty)
	            {
	                auto e = range.front;
	                range.popFront();

	                if (!hasPending)
	                {
	                    current = e;
	                    hasPending = true;
	                }
	                else if (current.width == e.width)
	                {
	                    //writefln("Merging: %s with %s", current, e);
	                    current.range.merge(e.range);
	                }
	                else
	                {
	                    //writefln("Yielding: %s", front);
	                    front = current;
	                    current = e;
	                    empty = false;
	                    return;
	                }
	            }

	            if (hasPending)
	            {
	                // Source exhausted: flush the final run exactly once.
	                front = current;
	                hasPending = false;
	                empty = false;
	            }
	            else
	                empty = true;
	        }

	        void popFront()
	        {
	            next();
	        }
	    }

	    return Result(input);
	}

	/**
	 * Parses `input`, merges consecutive entries of equal width, and prints
	 * the merged entries to stdout, one per line.
	 */
	void outputByCodePoint(R)(R input)
	    if (isInputRange!R && is(ElementType!R : const(char)[]))
	{
	    auto merged = input.parse().mergeConsecutive();
	    writefln("%(%s\n%)", merged);
	}

	/**
	 * Sums the lengths (`end - start`) of all merged "W" ranges and all merged
	 * "N" ranges parsed from `input`, and prints both totals to stdout.
	 * Any other width value trips an assert(0).
	 */
	void tally(R)(R input)
	    if (isInputRange!R && is(ElementType!R : const(char)[]))
	{
	    int wideTotal, narrowTotal;

	    foreach (entry; input.parse().mergeConsecutive())
	    {
	        switch (entry.width)
	        {
	            case "W":
	                wideTotal += (entry.range.end - entry.range.start);
	                break;

	            case "N":
	                narrowTotal += (entry.range.end - entry.range.start);
	                break;

	            default:
	                assert(0);
	        }
	    }

	    writefln("Tally: W=%d N=%d\n", wideTotal, narrowTotal);
	}

	/**
	 * Collects every merged "W" range parsed from `input` into a CodepointSet
	 * and prints the source code that the set's toSourceCode("isWide")
	 * produces to stdout.
	 */
	void genRecogCode(R)(R input)
	    if (isInputRange!R && is(ElementType!R : const(char)[]))
	{
	    import std.uni;

	    CodepointSet wideSet;
	    foreach (entry; input.parse().mergeConsecutive())
	    {
	        // Only wide ranges go into the set; everything else is skipped.
	        if (entry.width != "W")
	            continue;
	        wideSet.add(entry.range.start, entry.range.end);
	    }

	    auto source = wideSet.toSourceCode("isWide");
	    writeln(source);
	}

	/**
	 * Entry point. Expects one sub-command in args[1] (bywidth, bypoint,
	 * tally or gencode), reads "ext/EastAsianWidth.txt" line by line and
	 * hands the lines to the matching routine.
	 *
	 * Returns: 0 on success, 1 when the sub-command is missing or unknown.
	 */
	int main(string[] args)
	{
	    immutable bool missingCommand = args.length < 2;
	    if (missingCommand)
	    {
	        assert(args.length > 0);
	        stderr.writefln("Usage: %s (bywidth|bypoint|tally|gencode)", args[0]);
	        return 1;
	    }

	    // The data file is opened before the command is inspected.
	    auto lines = File("ext/EastAsianWidth.txt", "r").byLine();
	    string command = args[1];

	    if (command == "bywidth")
	        outputByWidthType(lines);
	    else if (command == "bypoint")
	        outputByCodePoint(lines);
	    else if (command == "tally")
	        tally(lines);
	    else if (command == "gencode")
	        genRecogCode(lines);
	    else
	    {
	        stderr.writefln("Unknown command: %s", command);
	        return 1;
	    }

	    return 0;
	}


T

-- 
People tell me that I'm skeptical, but I don't believe them.

Forums