UTF-8 to dchar conversion - D Programming Language Discussion Forum

Forums

New users
- Learn
Community
- General
- Announce
Improvements
- DIP Ideas
- DIP Devel.
Ecosystem
- GDC
- LDC
- Debuggers
- IDEs
- DWT
Development
- Internals
- Issues
- Beta
- DMD
- Phobos
- Druntime
- Study
Turkish
- Genel
- Duyuru

Index » General » UTF-8 to dchar conversion

Thread overview

UTF-8 to dchar conversion
Jul 28, 2004 Arcane Jill
Jul 28, 2004 Arcane Jill
Jul 28, 2004 Arcane Jill
Jul 28, 2004 Arcane Jill
Jul 28, 2004 parabolis
Jul 28, 2004 Arcane Jill
Jul 28, 2004 Sean Kelly
Jul 28, 2004 Walter
Jul 29, 2004 Arcane Jill
Jul 29, 2004 Walter
Jul 29, 2004 Arcane Jill
Jul 29, 2004 Arcane Jill
Jul 29, 2004 Walter

July 28, 2004

UTF-8 to dchar conversion

Posted by Arcane Jill

Arcane Jill

For Sean...

I noticed your std.utf update on the bugs forum. Using delegates is obviously sensible, but I noticed the routine looked a tad on the slow side. Here's a faster algorithm - it doesn't use delegates, but I'm sure you could do some mixing and matching to get the best of both. Here's my fast converter:

#    const ubyte[256] LENGTH =
#    [
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
#        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
#        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
#        4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,
#    ];
#
#    const ubyte[256] START_CALC =
#    [
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
#        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
#        0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27
#        0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
#        0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
#        0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
#        0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
#        0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
#        0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,
#        0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
#        0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
#        0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
#        0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
#        0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
#        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#    ];
#
#    dchar decode(inout char[] s)
#    {
#        if (s.length > 0)
#        {
#            uint firstChar = s[0];
#            uint len = LENGTH[firstChar];
#            if (len != 0 && s.length >= len)
#            {
#                if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
#                   (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
#                {
#                    uint c = START_CALC[firstChar];
#                    for (uint i=1; i<len; ++i)
#                    {
#                        c <<= 6;
#                        c |= s[i];
#                    }
#                    if (isValidDchar(s))
#                    {
#                        p = p[len..p.length];
#                        return c;
#                    }
#                }
#            }
#        }
#        throw new UtfError("invalid UTF-8 sequence");
#    }

(and no nasty gotos either!)
Jill

July 28, 2004

Re: UTF-8 to dchar conversion

Posted by Arcane Jill
in reply to Arcane Jill

Arcane Jill

Posted in reply to Arcane Jill

In article <ce91ga$jnj$1@digitaldaemon.com>, Arcane Jill says...

Ah, bugger!

#    c |= s[i];

should read:

#    c |= s[i] & 0x3F;

That'll teach me to post code without testing it first! Jill

July 28, 2004

Re: UTF-8 to dchar conversion

Posted by Arcane Jill
in reply to Arcane Jill

Arcane Jill

Posted in reply to Arcane Jill

In article <ce91t7$jrt$1@digitaldaemon.com>, Arcane Jill says...

And

#    p = p[len..p.length];

should read

#    s = s[len..s.length];

(Aren't you glad I'm not writing real code myself just now. Just think how many bugs it would end up with! Still - the /principle/ is sound.)

July 28, 2004

Re: UTF-8 to dchar conversion

Posted by Arcane Jill
in reply to Arcane Jill

Arcane Jill

Posted in reply to Arcane Jill

Aaargh!

Found even more bugs. Fixed them. Let's just start again. HERE's the fast UTF-8 routine... (If there are any more bugs after this, someone else can find them).


#    const ubyte[256] LENGTH =
#    [
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
#        0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
#        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
#        4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,
#    ];
#
#    const ubyte[256] START_CALC =
#    [
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
#        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
#        0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27,
#        0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
#        0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
#        0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
#        0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
#        0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
#        0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,
#        0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
#        0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
#        0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
#        0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
#        0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
#        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#    ];
#
#    dchar convert(inout char[] s)
#    {
#        if (s.length > 0)
#        {
#            uint firstChar = s[0];
#            uint len = LENGTH[firstChar];
#            if (len != 0 && s.length >= len)
#            {
#                if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
#                   (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
#                {
#                    uint c = START_CALC[firstChar];
#                    for (uint i=1; i<len; ++i)
#                    {
#                        c <<= 6;
#                        c |= s[i] & 0x3F;
#                    }
#                    if (isValidDchar(c))
#                    {
#                        s = s[len..s.length];
#                        return c;
#                    }
#                }
#            }
#        }
#        throw new UtfError("invalid UTF-8 sequence");
#    }

July 28, 2004

Re: UTF-8 to dchar conversion

Posted by parabolis
in reply to Arcane Jill

parabolis

Posted in reply to Arcane Jill

This function does not verify any non-first byte in a UTF-8 sequence actually starts with 10xxxxxx... So it accepts
  0xC1,0xBF  (correct)
  and
  0xC1,0xFF (incorrect)

You also probably wanted
    isValidDchar(c)
        instead of
    isValidDchar(s)
        and
    s = s[len..s.length];
        instead of
    p = p[len..p.length];

(I also noticed you used uint exclusively... :P)

Out of curiosity why did you define the LENGTH and the START_CALC arrays?

Arcane Jill wrote:
> For Sean...
> 
> I noticed your std.utf update on the bugs forum. Using delegates is obviously
> sensible, but I noticed the routine looked a tad on the slow side. Here's a
> faster algorithm - it doesn't use delegates, but I'm sure you could do some
> mixing and matching to get the best of both. Here's my fast converter:
> 
> #    const ubyte[256] LENGTH =
> #    [
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> #        2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> #        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
> #        4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,
> #    ];
> #    #    const ubyte[256] START_CALC = #    [
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> #        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
> #        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
> #        0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27
> #        0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
> #        0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
> #        0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
> #        0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
> #        0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
> #        0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,
> #        0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
> #        0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
> #        0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
> #        0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
> #        0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> #        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
> #        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #    ];
> #    #    dchar decode(inout char[] s)
> #    {
> #        if (s.length > 0)
> #        {
> #            uint firstChar = s[0];
> #            uint len = LENGTH[firstChar];
> #            if (len != 0 && s.length >= len)
> #            {
> #                if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
> #                   (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
> #                {
> #                    uint c = START_CALC[firstChar];
> #                    for (uint i=1; i<len; ++i)
> #                    {
> #                        c <<= 6;
> #                        c |= s[i];
> #                    }
> #                    if (isValidDchar(s))
> #                    {
> #                        p = p[len..p.length];
> #                        return c;
> #                    }
> #                }
> #            }
> #        }
> #        throw new UtfError("invalid UTF-8 sequence");
> #    }
> 
> (and no nasty gotos either!)
> Jill
> 
>

July 28, 2004

Re: UTF-8 to dchar conversion

Posted by Sean Kelly
in reply to Arcane Jill

Sean Kelly

Posted in reply to Arcane Jill

The routines themselves were left unaltered from the original UTF functions. I'll play with your suggestions and see if I can get it all working though.  If the code can be made faster then that's fine with me :)


Sean

July 28, 2004

Re: UTF-8 to dchar conversion

Posted by Arcane Jill
in reply to parabolis

Arcane Jill

Posted in reply to parabolis

In article <ce9483$kq0$1@digitaldaemon.com>, parabolis says...
>
>This function does not verify any non-first byte in a UTF-8 sequence actually starts with 10xxxxxx... So it accepts
>   0xC1,0xBF  (correct)
>   and
>   0xC1,0xFF (incorrect)

Well spotted. Okay, so replace

#    c |= s[i] & 0x3F;
#    // etc

with

#    if ((s[i] & 0xC0) == 0x80)
#    {
#        c |= s[i] & 0x3F;
#       // etc
#    }

Thanks very much for pointing that out. I appreciate it.


>You also probably wanted

Yeah, there were some typos in the original post. I fixed them in the repost.

>Out of curiosity why did you define the LENGTH and the START_CALC arrays?

Because they're the fast lookup tables.
Jill

July 28, 2004

Re: UTF-8 to dchar conversion

Posted by Walter
in reply to Arcane Jill

Walter

Posted in reply to Arcane Jill

One aspect to consider when writing fast conversion code is the frequency of various characters. Characters do not have a flat random distribution. I'd wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations on ASCII chars, so while it may be faster if the data is random, it would be slower on text data.

July 29, 2004

Re: UTF-8 to dchar conversion

Posted by Arcane Jill
in reply to Walter

Arcane Jill

Posted in reply to Walter

In article <ce98eo$n71$1@digitaldaemon.com>, Walter says...
>
>One aspect to consider when writing fast conversion code is the frequency of various characters. Characters do not have a flat random distribution. I'd wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations on ASCII chars, so while it may be faster if the data is random, it would be slower on text data.

Good point.

Here's a new version then, which tests for ASCII first. (It also makes the lookup tables half the size!)

#    const ubyte[128] LENGTH =
#    [
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
#        0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
#        0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
#        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
#        4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,
#    ];
#
#    const ubyte[128] START_CALC =
#    [
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
#        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
#        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
#        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
#    ];
#
#    dchar convert(inout char[] s)
#    {
#        if (s.length > 0)
#        {
#            uint firstChar = s[0];
#            if (firstChar < 0x80) // ASCII
#            {
#                s = s[1..s.length];
#                return firstChar;
#            }
#            firstChar -= 0x80;
#            uint len = LENGTH[firstChar];
#            if (len != 0 && s.length >= len)
#            {
#                if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
#                   (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
#                {
#                    uint c = START_CALC[firstChar];
#                    uint i;
#                    for (i=1; i<len; ++i)
#                    {
#                        if ((s[i] & 0xC0) != 0x80) break;
#                        c <<= 6;
#                        c |= s[i] & 0x3F;
#                    }
#                    if (i == len && isValidDchar(c))
#                    {
#                        s = s[len..s.length];
#                        return c;
#                    }
#                }
#            }
#        }
#        throw new UtfError("invalid UTF-8 sequence");
#    }

Jill

July 29, 2004

Re: UTF-8 to dchar conversion

Posted by Walter
in reply to Arcane Jill

Walter

Posted in reply to Arcane Jill

Does your version also reject UTF-8 sequences that produce the correct value, but are not the shortest possible sequence?

"Arcane Jill" <Arcane_member@pathlink.com> wrote in message news:cea792$14f4$1@digitaldaemon.com...
> In article <ce98eo$n71$1@digitaldaemon.com>, Walter says...
> >
> >One aspect to consider when writing fast conversion code is the frequency
of
> >various characters. Characters do not have a flat random distribution.
I'd
> >wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary
operations
> >on ASCII chars, so while it may be faster if the data is random, it would
be
> >slower on text data.
>
> Good point.
>
> Here's a new version then, which tests for ASCII first. (It also makes the lookup tables half the size!)
>
> #    const ubyte[128] LENGTH =
> #    [
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> #        0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> #        0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> #        3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
> #        4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,
> #    ];
> #
> #    const ubyte[128] START_CALC =
> #    [
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> #        0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
> #        0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> #        0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> #        0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> #    ];
> #
> #    dchar convert(inout char[] s)
> #    {
> #        if (s.length > 0)
> #        {
> #            uint firstChar = s[0];
> #            if (firstChar < 0x80) // ASCII
> #            {
> #                s = s[1..s.length];
> #                return firstChar;
> #            }
> #            firstChar -= 0x80;
> #            uint len = LENGTH[firstChar];
> #            if (len != 0 && s.length >= len)
> #            {
> #                if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
> #                   (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
> #                {
> #                    uint c = START_CALC[firstChar];
> #                    uint i;
> #                    for (i=1; i<len; ++i)
> #                    {
> #                        if ((s[i] & 0xC0) != 0x80) break;
> #                        c <<= 6;
> #                        c |= s[i] & 0x3F;
> #                    }
> #                    if (i == len && isValidDchar(c))
> #                    {
> #                        s = s[len..s.length];
> #                        return c;
> #                    }
> #                }
> #            }
> #        }
> #        throw new UtfError("invalid UTF-8 sequence");
> #    }
>
> Jill
>
>
>

Top | Forum index | About this forum

Copyright © 1999-2021 by the D Language Foundation