Thread overview | |||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
July 28, 2004 UTF-8 to dchar conversion | ||||
---|---|---|---|---|
| ||||
For Sean... I noticed your std.utf update on the bugs forum. Using delegates is obviously sensible, but I noticed the routine looked a tad on the slow side. Here's a faster algorithm - it doesn't use delegates, but I'm sure you could do some mixing and matching to get the best of both. Here's my fast converter: # const ubyte[256] LENGTH = # [ # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, # 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, # 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, # 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0, # ]; # # const ubyte[256] START_CALC = # [ # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, # 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, # 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, # 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27 # 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, # 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, # 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, # 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47, # 0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F, # 0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57, # 0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F, # 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67, # 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, # 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, # 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 
0x00,0x00,0x02,0x03,0x04,0x05,0x06,0x07, # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, # 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, # 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # ]; # # dchar decode(inout char[] s) # { # if (s.length > 0) # { # uint firstChar = s[0]; # uint len = LENGTH[firstChar]; # if (len != 0 && s.length >= len) # { # if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) && # (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80)) # { # uint c = START_CALC[firstChar]; # for (uint i=1; i<len; ++i) # { # c <<= 6; # c |= s[i]; # } # if (isValidDchar(s)) # { # p = p[len..p.length]; # return c; # } # } # } # } # throw new UtfError("invalid UTF-8 sequence"); # } (and no nasty gotos either!) Jill |
July 28, 2004 Re: UTF-8 to dchar conversion | ||||
---|---|---|---|---|
| ||||
Posted in reply to Arcane Jill | In article <ce91ga$jnj$1@digitaldaemon.com>, Arcane Jill says... Ah, bugger! # c |= s[i]; should read: # c |= s[i] & 0x3F; That'll teach me to post code without testing it first! Jill |
July 28, 2004 Re: UTF-8 to dchar conversion | ||||
---|---|---|---|---|
| ||||
Posted in reply to Arcane Jill | In article <ce91t7$jrt$1@digitaldaemon.com>, Arcane Jill says... And # p = p[len..p.length]; should read # s = s[len..s.length]; (Aren't you glad I'm not writing real code myself just now? Just think how many bugs it would end up with! Still - the /principle/ is sound.) |
July 28, 2004 Re: UTF-8 to dchar conversion | ||||
---|---|---|---|---|
| ||||
Posted in reply to Arcane Jill | Aaargh! Found even more bugs. Fixed them. Let's just start again. HERE's the fast UTF-8 routine... (If there are any more bugs after this, someone else can find them). # const ubyte[256] LENGTH = # [ # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, # 0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, # 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, # 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0, # ]; # # const ubyte[256] START_CALC = # [ # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, # 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, # 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, # 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27, # 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F, # 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37, # 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F, # 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47, # 0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F, # 0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57, # 0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F, # 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67, # 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F, # 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77, # 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, # 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, # 
0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # ]; # # dchar convert(inout char[] s) # { # if (s.length > 0) # { # uint firstChar = s[0]; # uint len = LENGTH[firstChar]; # if (len != 0 && s.length >= len) # { # if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) && # (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80)) # { # uint c = START_CALC[firstChar]; # for (uint i=1; i<len; ++i) # { # c <<= 6; # c |= s[i] & 0x3F; # } # if (isValidDchar(c)) # { # s = s[len..s.length]; # return c; # } # } # } # } # throw new UtfError("invalid UTF-8 sequence"); # } |
July 28, 2004 Re: UTF-8 to dchar conversion | ||||
---|---|---|---|---|
| ||||
Posted in reply to Arcane Jill | This function does not verify that any non-first byte in a UTF-8 sequence actually starts with 10xxxxxx... So it accepts
0xC1,0xBF (correct)
and
0xC1,0xFF (incorrect)
You also probably wanted
isValidDchar(c)
instead of
isValidDchar(s)
and
s = s[len..s.length];
instead of
p = p[len..p.length];
(I also noticed you used uint exclusively... :P)
Out of curiosity why did you define the LENGTH and the START_CALC arrays?
Arcane Jill wrote:
> For Sean...
>
> I noticed your std.utf update on the bugs forum. Using delegates is obviously
> sensible, but I noticed the routine looked a tad on the slow side. Here's a
> faster algorithm - it doesn't use delegates, but I'm sure you could do some
> mixing and matching to get the best of both. Here's my fast converter:
>
> # const ubyte[256] LENGTH =
> # [
> # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> # 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
> # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
> # 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> # 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
> # 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
> # 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0,
> # ];
> # # const ubyte[256] START_CALC = # [
> # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> # 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
> # 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
> # 0x20,0x21,0x22,0x23,0x24,0x25,0x26,0x27
> # 0x28,0x29,0x2A,0x2B,0x2C,0x2D,0x2E,0x2F,
> # 0x30,0x31,0x32,0x33,0x34,0x35,0x36,0x37,
> # 0x38,0x39,0x3A,0x3B,0x3C,0x3D,0x3E,0x3F,
> # 0x40,0x41,0x42,0x43,0x44,0x45,0x46,0x47,
> # 0x48,0x49,0x4A,0x4B,0x4C,0x4D,0x4E,0x4F,
> # 0x50,0x51,0x52,0x53,0x54,0x55,0x56,0x57,
> # 0x58,0x59,0x5A,0x5B,0x5C,0x5D,0x5E,0x5F,
> # 0x60,0x61,0x62,0x63,0x64,0x65,0x66,0x67,
> # 0x68,0x69,0x6A,0x6B,0x6C,0x6D,0x6E,0x6F,
> # 0x70,0x71,0x72,0x73,0x74,0x75,0x76,0x77,
> # 0x78,0x79,0x7A,0x7B,0x7C,0x7D,0x7E,0x7F,
> # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> # 0x00,0x00,0x02,0x03,0x04,0x05,0x06,0x07,
> # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> # 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17,
> # 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F,
> # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F,
> # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,
> # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
> # ];
> # # dchar decode(inout char[] s)
> # {
> # if (s.length > 0)
> # {
> # uint firstChar = s[0];
> # uint len = LENGTH[firstChar];
> # if (len != 0 && s.length >= len)
> # {
> # if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) &&
> # (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80))
> # {
> # uint c = START_CALC[firstChar];
> # for (uint i=1; i<len; ++i)
> # {
> # c <<= 6;
> # c |= s[i];
> # }
> # if (isValidDchar(s))
> # {
> # p = p[len..p.length];
> # return c;
> # }
> # }
> # }
> # }
> # throw new UtfError("invalid UTF-8 sequence");
> # }
>
> (and no nasty gotos either!)
> Jill
>
>
|
July 28, 2004 Re: UTF-8 to dchar conversion | ||||
---|---|---|---|---|
| ||||
Posted in reply to Arcane Jill | The routines themselves were left unaltered from the original UTF functions. I'll play with your suggestions and see if I can get it all working though. If the code can be made faster then that's fine with me :) Sean |
July 28, 2004 Re: UTF-8 to dchar conversion | ||||
---|---|---|---|---|
| ||||
Posted in reply to parabolis | In article <ce9483$kq0$1@digitaldaemon.com>, parabolis says... > >This function does not verify any non-first byte in a UTF-8 sequence actually starts with 10xxxxxx... So it accepts > 0xC1,0xBF (correct) > and > 0xC1,0xFF (incorrect) Well spotted. Okay, so replace # c |= s[i] & 0x3F; # // etc with # if ((s[i] & 0xC0) == 0x80) # { # c |= s[i] & 0x3F; # // etc # } Thanks very much for pointing that out. I appreciate it. >You also probably wanted Yeah, there were some typos in the original post. I fixed them in the repost. >Out of curiosity why did you define the LENGTH and the START_CALC arrays? Because they're the fast lookup tables. Jill |
July 28, 2004 Re: UTF-8 to dchar conversion | ||||
---|---|---|---|---|
| ||||
Posted in reply to Arcane Jill | One aspect to consider when writing fast conversion code is the frequency of various characters. Characters do not have a flat random distribution. I'd wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations on ASCII chars, so while it may be faster if the data is random, it would be slower on text data. |
July 29, 2004 Re: UTF-8 to dchar conversion | ||||
---|---|---|---|---|
| ||||
Posted in reply to Walter | In article <ce98eo$n71$1@digitaldaemon.com>, Walter says... > >One aspect to consider when writing fast conversion code is the frequency of various characters. Characters do not have a flat random distribution. I'd wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations on ASCII chars, so while it may be faster if the data is random, it would be slower on text data. Good point. Here's a new version then, which tests for ASCII first. (It also makes the lookup tables half the size!) # const ubyte[128] LENGTH = # [ # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, # 0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, # 0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, # 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, # 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0, # ]; # # const ubyte[128] START_CALC = # [ # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, # 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, # 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, # ]; # # dchar convert(inout char[] s) # { # if (s.length > 0) # { # uint firstChar = s[0]; # if (firstChar < 0x80) // ASCII # { # s = s[1..s.length]; # return firstChar; # } # firstChar -= 0x80; # uint len = LENGTH[firstChar]; # if (len != 0 && s.length >= len) # { # if 
(firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) && # (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80)) # { # uint c = START_CALC[firstChar]; # uint i; # for (i=1; i<len; ++i) # { # if ((s[i] & 0xC0) != 0x80) break; # c <<= 6; # c |= s[i] & 0x3F; # } # if (i == len && isValidDchar(c)) # { # s = s[len..s.length]; # return c; # } # } # } # } # throw new UtfError("invalid UTF-8 sequence"); # } Jill |
July 29, 2004 Re: UTF-8 to dchar conversion | ||||
---|---|---|---|---|
| ||||
Posted in reply to Arcane Jill | Does your version also reject UTF-8 sequences that produce the correct value, but are not the shortest possible sequence? "Arcane Jill" <Arcane_member@pathlink.com> wrote in message news:cea792$14f4$1@digitaldaemon.com... > In article <ce98eo$n71$1@digitaldaemon.com>, Walter says... > > > >One aspect to consider when writing fast conversion code is the frequency of > >various characters. Characters do not have a flat random distribution. I'd > >wager that the overwhelming majority of them will be ASCII. Thus, a fast converter would first just test for ASCII, and save the more complex processing for non-ASCII. Your routine does numerous unnecessary operations > >on ASCII chars, so while it may be faster if the data is random, it would be > >slower on text data. > > Good point. > > Here's a new version then, which tests for ASCII first. (It also makes the lookup tables half the size!) > > # const ubyte[128] LENGTH = > # [ > # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, > # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, > # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, > # 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, > # 0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, > # 0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, > # 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, > # 4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0, > # ]; > # > # const ubyte[128] START_CALC = > # [ > # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, > # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, > # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, > # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, > # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, > # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, > # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, > # 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, > # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, > # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, > # 0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17, > # 0x18,0x19,0x1A,0x1B,0x1C,0x1D,0x1E,0x1F, > # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, > # 0x08,0x09,0x0A,0x0B,0x0C,0x0D,0x0E,0x0F, > # 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, > # 
0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, > # ]; > # > # dchar convert(inout char[] s) > # { > # if (s.length > 0) > # { > # uint firstChar = s[0]; > # if (firstChar < 0x80) // ASCII > # { > # s = s[1..s.length]; > # return firstChar; > # } > # firstChar -= 0x80; > # uint len = LENGTH[firstChar]; > # if (len != 0 && s.length >= len) > # { > # if (firstChar != 0xE0 || (s[1] & 0xE0) != 0x80) && > # (firstChar != 0xF0 || (s[1] & 0xF0) != 0x80)) > # { > # uint c = START_CALC[firstChar]; > # uint i; > # for (i=1; i<len; ++i) > # { > # if ((s[i] & 0xC0) != 0x80) break; > # c <<= 6; > # c |= s[i] & 0x3F; > # } > # if (i == len && isValidDchar(c)) > # { > # s = s[len..s.length]; > # return c; > # } > # } > # } > # } > # throw new UtfError("invalid UTF-8 sequence"); > # } > > Jill > > > |
Copyright © 1999-2021 by the D Language Foundation