Jump to page: 1 2
Thread overview
UTF-8 to dchar conversion
Jul 28, 2004
Arcane Jill
Jul 28, 2004
Arcane Jill
Jul 28, 2004
Arcane Jill
Jul 28, 2004
Arcane Jill
Jul 28, 2004
parabolis
Jul 28, 2004
Arcane Jill
Jul 28, 2004
Sean Kelly
Jul 28, 2004
Walter
Jul 29, 2004
Arcane Jill
Jul 29, 2004
Walter
Jul 29, 2004
Arcane Jill
Jul 29, 2004
Arcane Jill
Jul 29, 2004
Walter
July 28, 2004
For Sean...

I noticed your std.utf update on the bugs forum. Using delegates is obviously sensible, but I noticed the routine looked a tad on the slow side. Here's a faster algorithm - it doesn't use delegates, but I'm sure you could do some mixing and matching to get the best of both. Here's my fast converter:

#    const ubyte[256] LENGTH =
#    [
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
#        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
#        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
#        4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,
#    ];
#
#    const ubyte[256] START_CALC =
#    [
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
#        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
#        0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27
#        0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
#        0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
#        0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
#        0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
#        0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
#        0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,
#        0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
#        0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
#        0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
#        0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
#        0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
#        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#    ];
#
#    dchar decode(inout char[] s)
#    {
#        if (s.length > 0)
#        {
#            uint firstChar = s[0];
#            uint len = LENGTH[firstChar];
#            if (len != 0 && s.length >= len)
#            {
#                if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
#                   (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
#                {
#                    uint c = START_CALC[firstChar];
#                    for (uint i=1; i<len; ++i)
#                    {
#                        c <<= 6;
#                        c |= s[i];
#                    }
#                    if (isValidDchar(s))
#                    {
#                        p = p[len..p.length];
#                        return c;
#                    }
#                }
#            }
#        }
#        throw new UtfError("invalid UTF-8 sequence");
#    }

(and no nasty gotos either!)
Jill


July 28, 2004
In article <ce91ga$jnj$1@digitaldaemon.com>, Arcane Jill says...

Ah, bugger!

#    c |= s[i];

should read:

#    c |= s[i] & 0x3F;

That'll teach me to post code without testing it first! Jill


July 28, 2004
In article <ce91t7$jrt$1@digitaldaemon.com>, Arcane Jill says...

And

#    p = p[len..p.length];

should read

#    s = s[len..s.length];

(Aren't you glad I'm not writing real code myself just now. Just think how many bugs it would end up with! Still - the /principle/ is sound.)


July 28, 2004
Aaargh!

Found even more bugs. Fixed them. Let's just start again. HERE's the fast UTF-8 routine... (If there are any more bugs after this, someone else can find them).


#    // Total length in bytes of the UTF-8 sequence introduced by each possible
#    // first byte, indexed directly by that byte. A zero marks a byte that may
#    // not start a sequence; convert() rejects it.
#    const ubyte[256] LENGTH =
#    [
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x00-0x0F: ASCII
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x10-0x1F
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x20-0x2F
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x30-0x3F
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x40-0x4F
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x50-0x5F
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x60-0x6F
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0x70-0x7F
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80-0x8F: continuation bytes
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x90-0x9F
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xA0-0xAF
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xB0-0xBF
#        // 0xC0 and 0xC1 could only introduce overlong two-byte encodings of
#        // U+0000..U+007F, so both are rejected; 0xC2 is the first usable lead.
#        0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0-0xCF
#        // 0xD0 is an ordinary two-byte lead (U+0400..U+043F) and must be 2.
#        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xD0-0xDF
#        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xE0-0xEF: three-byte leads
#        4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0, // 0xF0-0xF7: four-byte leads; 0xF8-0xFF: rejected
#    ];
#
#    // Initial accumulator value for convert(), indexed by the first byte of
#    // the sequence: the byte's own value for ASCII, the low payload bits for
#    // a multi-byte lead, and zero for every byte LENGTH rejects.
#    const ubyte[256] START_CALC =
#    [
#        // 0x00-0x7F: ASCII bytes map to themselves
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
#        0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
#        0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
#        0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
#        0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
#        0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
#        0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
#        // 0x80-0xBF: continuation bytes, never a first byte
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        // 0xC0-0xDF: two-byte leads, low five bits
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
#        // 0xE0-0xEF: three-byte leads, low four bits
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        // 0xF0-0xF7: four-byte leads, low three bits; 0xF8-0xFF: unused
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#    ];
#
#    /**
#     * Decodes the UTF-8 sequence at the front of s and returns its code point.
#     *
#     * On success s is re-sliced past the bytes that were consumed. On failure
#     * s is left unchanged and a UtfError is thrown.
#     *
#     * A UtfError("invalid UTF-8 sequence") is thrown when s is empty, when the
#     * first byte has a LENGTH entry of 0, when s holds fewer bytes than the
#     * sequence needs, when an 0xE0 or 0xF0 lead is followed by a second byte
#     * that would make the encoding overlong, when a trailing byte is not of
#     * the form 10xxxxxx, or when isValidDchar() refuses the decoded value.
#     *
#     * Relies on LENGTH, START_CALC, isValidDchar and UtfError, all defined
#     * outside this function.
#     */
#    dchar convert(inout char[] s)
#    {
#        if (s.length > 0)
#        {
#            uint firstChar = s[0];
#            uint len = LENGTH[firstChar];
#            if (len != 0 && s.length >= len)
#            {
#                // Overlong guards. LENGTH gives 3 for 0xE0 and 4 for 0xF0, so
#                // s[1] is in bounds whenever it is actually evaluated here.
#                //   0xE0 followed by 0x80..0x9F would encode a value < U+0800
#                //   0xF0 followed by 0x80..0x8F would encode a value < U+10000
#                if ((firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
#                    (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
#                {
#                    uint c = START_CALC[firstChar];
#                    uint i;
#                    for (i = 1; i < len; ++i)
#                    {
#                        // Every trailing byte must look like 10xxxxxx.
#                        if ((s[i] & 0xC0) != 0x80) break;
#                        c <<= 6;
#                        c |= s[i] & 0x3F;
#                    }
#                    // i == len only if the loop never hit a bad trailing byte.
#                    if (i == len && isValidDchar(c))
#                    {
#                        s = s[len..s.length];
#                        return c;
#                    }
#                }
#            }
#        }
#        throw new UtfError("invalid UTF-8 sequence");
#    }


July 28, 2004
This function does not verify any non-first byte in a UTF-8 sequence actually starts with 10xxxxxx... So it accepts
  0xC1,0xBF  (correct)
  and
  0xC1,0xFF (incorrect)

You also probably wanted
    isValidDchar(c)
        instead of
    isValidDchar(s)
        and
    s = s[len..s.length];
        instead of
    p = p[len..p.length];

(I also noticed you used uint exclusively... :P)

Out of curiosity why did you define the LENGTH and the START_CALC arrays?

Arcane Jill wrote:
> For Sean...
> 
> I noticed your std.utf update on the bugs forum. Using delegates is obviously
> sensible, but I noticed the routine looked a tad on the slow side. Here's a
> faster algorithm - it doesn't use delegates, but I'm sure you could do some
> mixing and matching to get the best of both. Here's my fast converter:
> 
> #    const ubyte[256] LENGTH =
> #    [
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> #        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> #        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
> #        4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,
> #    ];
> #    #    const ubyte[256] START_CALC = #    [
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> #        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
> #        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
> #        0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27
> #        0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
> #        0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
> #        0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
> #        0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
> #        0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
> #        0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,
> #        0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
> #        0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
> #        0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
> #        0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
> #        0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> #        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
> #        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #    ];
> #    #    dchar decode(inout char[] s)
> #    {
> #        if (s.length > 0)
> #        {
> #            uint firstChar = s[0];
> #            uint len = LENGTH[firstChar];
> #            if (len != 0 && s.length >= len)
> #            {
> #                if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
> #                   (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
> #                {
> #                    uint c = START_CALC[firstChar];
> #                    for (uint i=1; i<len; ++i)
> #                    {
> #                        c <<= 6;
> #                        c |= s[i];
> #                    }
> #                    if (isValidDchar(s))
> #                    {
> #                        p = p[len..p.length];
> #                        return c;
> #                    }
> #                }
> #            }
> #        }
> #        throw new UtfError("invalid UTF-8 sequence");
> #    }
> 
> (and no nasty gotos either!)
> Jill
> 
> 
July 28, 2004
The routines themselves were left unaltered from the original UTF functions. I'll play with your suggestions and see if I can get it all working though.  If the code can be made faster then that's fine with me :)


Sean


July 28, 2004
In article <ce9483$kq0$1@digitaldaemon.com>, parabolis says...
>
>This function does not verify any non-first byte in a UTF-8 sequence actually starts with 10xxxxxx... So it accepts
>   0xC1,0xBF  (correct)
>   and
>   0xC1,0xFF (incorrect)

Well spotted. Okay, so replace

#    c |= s[i] & 0x3F;
#    // etc

with

#    if ((s[i] & 0xC0) == 0x80)
#    {
#        c |= s[i] & 0x3F;
#       // etc
#    }

Thanks very much for pointing that out. I appreciate it.


>You also probably wanted

Yeah, there were some typos in the original post. I fixed them in the repost.

>Out of curiosity why did you define the LENGTH and the START_CALC arrays?

Because they're the fast lookup tables.
Jill


July 28, 2004
One aspect to consider when writing fast conversion code is the frequency of various characters. Characters do not have a flat random distribution. I'd wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations on ASCII chars, so while it may be faster if the data is random, it would be slower on text data.


July 29, 2004
In article <ce98eo$n71$1@digitaldaemon.com>, Walter says...
>
>One aspect to consider when writing fast conversion code is the frequency of various characters. Characters do not have a flat random distribution. I'd wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations on ASCII chars, so while it may be faster if the data is random, it would be slower on text data.

Good point.

Here's a new version then, which tests for ASCII first. (It also makes the lookup tables half the size!)

#    // Total length in bytes of the UTF-8 sequence introduced by a non-ASCII
#    // first byte. convert() indexes this table with (first byte - 0x80), so
#    // entry 0 describes byte 0x80. A zero marks a byte that may not start a
#    // sequence; convert() rejects it.
#    const ubyte[128] LENGTH =
#    [
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80-0x8F: continuation bytes
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x90-0x9F
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xA0-0xAF
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xB0-0xBF
#        // 0xC0 and 0xC1 could only introduce overlong two-byte encodings of
#        // U+0000..U+007F, so both are rejected; 0xC2 is the first usable lead.
#        0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xC0-0xCF
#        // 0xD0 is an ordinary two-byte lead (U+0400..U+043F) and must be 2.
#        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xD0-0xDF
#        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, // 0xE0-0xEF: three-byte leads
#        4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0, // 0xF0-0xF7: four-byte leads; 0xF8-0xFF: rejected
#    ];
#
#    // Initial accumulator value for convert(), indexed by (first byte - 0x80):
#    // the low payload bits of a multi-byte lead, and zero for every byte that
#    // LENGTH rejects.
#    const ubyte[128] START_CALC =
#    [
#        // 0x80-0xBF: continuation bytes, never a first byte
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        // 0xC0-0xDF: two-byte leads, low five bits
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
#        // 0xE0-0xEF: three-byte leads, low four bits
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        // 0xF0-0xF7: four-byte leads, low three bits; 0xF8-0xFF: unused
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#    ];
#
#    /**
#     * Decodes the UTF-8 sequence at the front of s and returns its code point.
#     *
#     * ASCII bytes (below 0x80) are returned immediately without touching the
#     * lookup tables. On success s is re-sliced past the bytes that were
#     * consumed. On failure s is left unchanged and a UtfError is thrown.
#     *
#     * A UtfError("invalid UTF-8 sequence") is thrown when s is empty, when the
#     * first byte has a LENGTH entry of 0, when s holds fewer bytes than the
#     * sequence needs, when an 0xE0 or 0xF0 lead is followed by a second byte
#     * that would make the encoding overlong, when a trailing byte is not of
#     * the form 10xxxxxx, or when isValidDchar() refuses the decoded value.
#     *
#     * Relies on LENGTH, START_CALC, isValidDchar and UtfError, all defined
#     * outside this function.
#     */
#    dchar convert(inout char[] s)
#    {
#        if (s.length > 0)
#        {
#            uint firstChar = s[0];
#            if (firstChar < 0x80) // ASCII
#            {
#                s = s[1..s.length];
#                return firstChar;
#            }
#            // The tables only cover 0x80..0xFF, so they are indexed with an
#            // offset. firstChar itself keeps the raw byte value, because the
#            // overlong guards below compare it against 0xE0 and 0xF0.
#            uint slot = firstChar - 0x80;
#            uint len = LENGTH[slot];
#            if (len != 0 && s.length >= len)
#            {
#                // Overlong guards. LENGTH gives 3 for 0xE0 and 4 for 0xF0, so
#                // s[1] is in bounds whenever it is actually evaluated here.
#                //   0xE0 followed by 0x80..0x9F would encode a value < U+0800
#                //   0xF0 followed by 0x80..0x8F would encode a value < U+10000
#                if ((firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
#                    (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
#                {
#                    uint c = START_CALC[slot];
#                    uint i;
#                    for (i = 1; i < len; ++i)
#                    {
#                        // Every trailing byte must look like 10xxxxxx.
#                        if ((s[i] & 0xC0) != 0x80) break;
#                        c <<= 6;
#                        c |= s[i] & 0x3F;
#                    }
#                    // i == len only if the loop never hit a bad trailing byte.
#                    if (i == len && isValidDchar(c))
#                    {
#                        s = s[len..s.length];
#                        return c;
#                    }
#                }
#            }
#        }
#        throw new UtfError("invalid UTF-8 sequence");
#    }

Jill



July 29, 2004
Does your version also reject UTF-8 sequences that produce the correct value, but are not the shortest possible sequence?

"Arcane Jill" <Arcane_member@pathlink.com> wrote in message news:cea792$14f4$1@digitaldaemon.com...
> In article <ce98eo$n71$1@digitaldaemon.com>, Walter says...
> >
> >One aspect to consider when writing fast conversion code is the frequency of various characters. Characters do not have a flat random distribution. I'd wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations on ASCII chars, so while it may be faster if the data is random, it would be slower on text data.
>
> Good point.
>
> Here's a new version then, which tests for ASCII first. (It also makes the lookup tables half the size!)
>
> #    const ubyte[128] LENGTH =
> #    [
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> #        0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> #        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
> #        4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,
> #    ];
> #
> #    const ubyte[128] START_CALC =
> #    [
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> #        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
> #        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #    ];
> #
> #    dchar convert(inout char[] s)
> #    {
> #        if (s.length > 0)
> #        {
> #            uint firstChar = s[0];
> #            if (firstChar < 0x80) // ASCII
> #            {
> #                s = s[1..s.length];
> #                return firstChar;
> #            }
> #            firstChar -= 0x80;
> #            uint len = LENGTH[firstChar];
> #            if (len != 0 && s.length >= len)
> #            {
> #                if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
> #                   (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
> #                {
> #                    uint c = START_CALC[firstChar];
> #                    uint i;
> #                    for (i=1; i<len; ++i)
> #                    {
> #                        if ((s[i] & 0xC0) != 0x80) break;
> #                        c <<= 6;
> #                        c |= s[i] & 0x3F;
> #                    }
> #                    if (i == len && isValidDchar(c))
> #                    {
> #                        s = s[len..s.length];
> #                        return c;
> #                    }
> #                }
> #            }
> #        }
> #        throw new UtfError("invalid UTF-8 sequence");
> #    }
>
> Jill
>
>
>


« First   ‹ Prev
1 2