October 15, 2016
It can also be written like this producing smaller code.
But at the cost of slower decoding.

// Returns a code point computed from the start of `str`, using charWidthTab
// to look up the sequence length from the first byte.
// A first byte with bit 7 clear is returned unchanged.
// Throws Exception("yadayada") when the first byte has bit 7 set but bit 6
// clear, or when `str` is shorter than the looked-up length.
// NOTE(review): `idx` starts at 0, so the first byte OR-ed into `c` is
// str.ptr[0] again, and no byte has its marker bits masked off before being
// merged. The multi-byte result therefore does not look like a decoded code
// point — confirm before reuse.
dchar myFront(ref char[] str) pure
{
    // Indexing through .ptr bypasses array bounds checking; an empty `str`
    // is not guarded against here.
    dchar c = cast(dchar) str.ptr[0];
    if (c & 128)
    {
        // Bit 7 set: not a single-byte (ASCII) value.
        if (c & 64)
        {
            // Bits 7 and 6 set: c is in 192..255, so c - 192 is 0..63.
            int idx = 0;
            int l = charWidthTab.ptr[c - 192];
            if (str.length < l)
                goto Linvalid;
            c = 0;
          l--;
            // Runs l - 1 times: OR in one byte, then shift left by 6 bits.
            while(l) {
              l--;
               c |= str.ptr[idx++];
               c <<= 6;
            }
            // Last byte is merged without a following shift.
            c |= str.ptr[idx];

       }
        else
    // The labelled throw is the else branch (bit 6 clear) and is also the
    // target of the goto above.
    Linvalid : throw new Exception("yadayada");

    }
    return c;
}
October 15, 2016
On Saturday, 15 October 2016 at 18:40:11 UTC, Uplink_Coder wrote:
> It can also be written like this producing smaller code.
> But it the cost of slower decoding.
>
> dchar myFront(ref char[] str) pure
> {
>     dchar c = cast(dchar) str.ptr[0];
>     if (c & 128)
>     {
>         if (c & 64)
>         {
>             int idx = 0;
>             int l = charWidthTab.ptr[c - 192];
>             if (str.length < l)
>                 goto Linvalid;
>             c = 0;
>           l--;
>             while(l) {
>               l--;
>                c |= str.ptr[idx++];
>                c <<= 6;
>             }
>             c |= str.ptr[idx];
>
>        }
>         else
>     Linvalid : throw new Exception("yadayada");
>
>     }
>     return c;
> }

Just a question. Do encoding errors not have to be detected or is validity of the string guaranteed? Wrong continuation bytes or overlong encodings are not detected by this routine.
October 15, 2016
At least with that lookup table below, you can detect isolated continuation bytes (192 and 193) and invalid codes (above 244).

// Sequence-length table, 4 rows of 16 entries, one entry per byte value
// 192..255; the decoder indexes it with (first byte - 192).
// Bytes 192, 193 and 245..255 map to 1; the remaining entries are 2, 3 or 4.
__gshared static immutable ubyte[] charWidthTab = [
            1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
            3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
];

Lengths 5 and 6 need not be tested specifically for your goto.
October 15, 2016
On Saturday, 15 October 2016 at 19:07:50 UTC, Patrick Schluter wrote:
> At least with that lookup table below, you can detect isolated continuation bytes (192 and 193) and invalid codes (above 244).

192 and 193 can never appear in UTF-8 text; they would start overlong encodings and are not continuation bytes. Continuation bytes are those between 128 and 191; they are not allowed as the first byte of a sequence, so that should be checked.

>
> __gshared static immutable ubyte[] charWidthTab = [
>             1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>             3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
>             4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
> ];
>
> length 5 and 6 need not to be tested specifically for your goto.


October 15, 2016
On Saturday, 15 October 2016 at 19:00:12 UTC, Patrick Schluter wrote:
>
> Just a question. Do encoding errors not have to be detected or is validity of the string guaranteed?

AFAIK they have to be detected, otherwise it would be a regression.


October 15, 2016
On Saturday, 15 October 2016 at 19:07:50 UTC, Patrick Schluter wrote:
> At least with that lookup table below, you can detect isolated continuation bytes (192 and 193) and invalid codes (above 244).
>
> __gshared static immutable ubyte[] charWidthTab = [
>             1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>             3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
>             4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
> ];
>
> length 5 and 6 need not to be tested specifically for your goto.

If you use 0 instead of 1 the length check will suffice for throwing on invalid.

October 15, 2016
On Saturday, 15 October 2016 at 19:42:03 UTC, Uplink_Coder wrote:
> On Saturday, 15 October 2016 at 19:07:50 UTC, Patrick Schluter wrote:
>> At least with that lookup table below, you can detect isolated continuation bytes (192 and 193) and invalid codes (above 244).
>>
>> __gshared static immutable ubyte[] charWidthTab = [
>>             1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>>             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>>             3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
>>             4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
>> ];
>>
>> length 5 and 6 need not to be tested specifically for your goto.
>
> If you use 0 instead of 1 the length check will suffice for throwing on invalid.

// Sequence-length table indexed with (first byte - 192), as done in myFront2.
// Entries are 2, 3 or 4, followed by a run of 0 entries at the end of the
// table (the highest byte values).
// NOTE(review): presumably 64 entries covering bytes 192..255 — verify the count.
__gshared static immutable ubyte[] charWidthTab = [2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0,
    0, 0, 0, 0];

// Returns a code point computed from the start of `str`, using charWidthTab
// to look up the sequence length from the first byte.
// A first byte with bit 7 clear is returned unchanged.
// Throws Exception("yadayada") when the first byte has bit 7 set but bit 6
// clear, or when `str` is shorter than the looked-up length.
// NOTE(review): `idx` starts at 0, so the first byte merged into `c` is the
// first byte again, and marker bits are never masked off — the multi-byte
// result does not look like a decoded code point; confirm before reuse.
// NOTE(review): a table value of 0 is not caught by `str.length < l`; `l--`
// then makes it -1 and `while (l)` keeps running while reading through .ptr.
dchar myFront2(ref char[] str) pure
{
    // Indexing through .ptr bypasses array bounds checking; an empty `str`
    // is not guarded against here.
    auto c1 = str.ptr[0];
    if (c1 & 128)
    {
        if (c1 & 64)
        {
            // Bits 7 and 6 set: c1 is in 192..255, so c1 - 192 is 0..63.
            int idx = 0;
            int l = charWidthTab.ptr[c1 - 192];
            if (str.length < l)
                goto Linvalid;
            dchar c = 0;
            l--;
            // For l >= 1 this runs l - 1 times: OR in one byte, shift left by 6.
            while (l)
            {
                l--;
                immutable cc = str.ptr[idx++];
                // `debug` statement: only compiled in when building with -debug.
                // On the first iteration cc is str.ptr[0], which has bit 6 set
                // in this branch, so the check would fire immediately.
                debug if (cc & 64) goto Linvalid;
                c |= cc;
                c <<= 6;
            }
            // Last byte is merged without a following shift.
            c |= str.ptr[idx];
            return c;

        }
    // Reached by fall-through when bit 6 is clear, and by the gotos above.
    Linvalid:
        throw new Exception("yadayada");

    }
    else
    {
        return c1;
    }
}

This code proves to be the fastest so far.
On UTF and non-UTF text.
It's also fairly small.

October 16, 2016
On Saturday, 15 October 2016 at 21:21:22 UTC, Stefan Koch wrote:
> On Saturday, 15 October 2016 at 19:42:03 UTC, Uplink_Coder wrote:
>> On Saturday, 15 October 2016 at 19:07:50 UTC, Patrick Schluter wrote:
>>> At least with that lookup table below, you can detect isolated continuation bytes (192 and 193) and invalid codes (above 244).
>>>
>>> __gshared static immutable ubyte[] charWidthTab = [
>>>             1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>>>             2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
>>>             3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
>>>             4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
>>> ];
>>>
>>> length 5 and 6 need not to be tested specifically for your goto.
>>
>> If you use 0 instead of 1 the length check will suffice for throwing on invalid.
>
> __gshared static immutable ubyte[] charWidthTab = [2, 2, 2, 2, 2, 2, 2, 2, 2,
>     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
>     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0,
>     0, 0, 0, 0];
>
> dchar myFront2(ref char[] str) pure
> {
>     auto c1 = str.ptr[0];
>     if (c1 & 128)
>     {
>         if (c1 & 64)
>         {
>             int idx = 0;
>             int l = charWidthTab.ptr[c1 - 192];
>             if (str.length < l)
>                 goto Linvalid;
>             dchar c = 0;
>             l--;
>             while (l)
>             {
>                 l--;
>                 immutable cc = str.ptr[idx++];
>                 debug if (cc & 64) goto Linvalid;
>                 c |= cc;
>                 c <<= 6;
>             }
>             c |= str.ptr[idx];
>             return c;
>
>         }
>     Linvalid:
>         throw new Exception("yadayada");
>
>     }
>     else
>     {
>         return c1;
>     }
> }
>
> This code proofs to be the fastest so far.
> On UTF and non-UTF text.
> It's also fairly small.

What does "debug if" do ? Because when I replace it with a simple "if" the code generated by "LDC 1.1.0-beta2 -release -O3 -boundscheck=off" is 15 lines shorter.
That was my first question but I think I see the issue now. By removing the debug, the condition is compiled in and the compiler short circuits the whole loop and goes to the Linvalid.
The error is that cc is loaded the first time with the same value as c1 because idx=0 and it is post incremented. It should be preincremented and c would have to be initialised with the data bits of c1 not 0.

October 16, 2016
Here is my version. It's probably not the shortest (100 lines of assembly with LDC) but it is correct and has the following properties:
- Performance proportional to the encoding length
- Detects Invalid byte sequences
- Detects Overlong encodings
- Detects Invalid code points

I put the exception to be comparable to other routines but Unicode specifies that it is preferable to not abort on encoding errors (to avoid denial of service attacks).

/**
 * Decodes the first UTF-8 sequence of `str` and returns its code point.
 * `str` itself is not modified; the work done is proportional to the
 * length of the encoded sequence.
 *
 * Rejected inputs: an empty array, a continuation byte (0x80..0xBF) or a
 * byte 0xF5..0xFF in lead position, truncated sequences, malformed
 * continuation bytes, overlong encodings, UTF-16 surrogates
 * (U+D800..U+DFFF) and values above U+10FFFF.
 *
 * Example: the bytes [0xC2, 0xA2] decode to U+00A2.
 *
 * Params:
 *   str = UTF-8 code units; only the first sequence is examined.
 * Returns: the decoded code point.
 * Throws: Exception for any of the rejected inputs listed above.
 */
dchar myFront2(ref char[] str)
{
  // Nothing to decode; this also guards the unchecked .ptr[0] read below.
  if(str.length == 0) throw new Exception("yadayada");

  dchar c0 = str.ptr[0];
  if(c0 < 0x80) {
    return c0;
  }
  // A lead byte has its two top bits set. Without the lower bound a stray
  // continuation byte (0x80..0xBF) would be decoded as a 2-byte lead.
  else if(c0 >= 0xC0 && str.length > 1) {
    dchar c1 = str.ptr[1];
    if(c0 < 0xE0 && (c1 & 0xC0) == 0x80) {
      c1 = ((c0 & 0x1F) << 6)|(c1 & 0x3F);
      // Overlong: leads 0xC0 and 0xC1 can only encode values below 0x80.
      if(c1 < 0x80) goto Linvalid;
      return c1;
    }
    else if(str.length > 2) {
      dchar c2 = str.ptr[2];
      if(c0 < 0xF0 && (c1 & 0xC0) == 0x80 && (c2 & 0xC0) == 0x80) {
        c2 = ((c0 & 0x0F) << 12)|((c1 & 0x3F) << 6)|(c2 & 0x3F);
        // Overlong 3-byte forms, and surrogates which are not valid scalars.
        if(c2 < 0x800 || (c2 >= 0xD800 && c2 <= 0xDFFF)) goto Linvalid;
        return c2;
      }
      else if(str.length > 3) {
        dchar c3 = str.ptr[3];
        if(c0 < 0xF5 && (c1 & 0xC0) == 0x80 && (c2 & 0xC0) == 0x80 && (c3 & 0xC0) == 0x80) {
          // The lead byte carries bits 18..20; c1 already occupies 12..17.
          c3 = ((c0 & 0x07) << 18)|((c1 & 0x3F) << 12)|((c2 & 0x3F) << 6)|(c3 & 0x3F);
          if(c3 < 0x10000  || c3 > 0x10ffff) goto Linvalid;
          return c3;
        }
      }
    }
  }
  Linvalid:
     throw new Exception("yadayada");
}


October 16, 2016
On Sunday, 16 October 2016 at 07:59:16 UTC, Patrick Schluter wrote:
> Here my version. It's probably not the shortest (100 ligns of assembly with LDC) but it is correct and has following properties:
> - Performance proportional to the encoding length
> - Detects Invalid byte sequences
> - Detects Overlong encodings
> - Detects Invalid code points
>
> I put the exception to be comparable to other routines but Unicode specifies that it is preferable to not abort on encoding errors (to avoid denial of service attacks).
>
> dchar myFront2(ref char[] str)
> {
>   dchar c0 = str.ptr[0];
>   if(c0 < 0x80) {
>     return c0;
>   }
>   else if(str.length > 1) {
>     dchar c1 = str.ptr[1];
>     if(c0 < 0xE0 && (c1 & 0xC0) == 0x80) {
>       c1 = ((c0 & 0x1F) << 6)|(c1 & 0x3F);
>       if(c1 < 0x80) goto Linvalid;
>       return c1;
>     }
>     else if(str.length > 2) {
>       dchar c2 = str.ptr[2];
>       if(c0 < 0xF0 && (c1 & 0xC0) == 0x80 && (c2 & 0xC0) == 0x80) {
>         c2 = ((c0 & 0x0F) << 12)|((c1 & 0x3F) << 6)|(c2 & 0x3F);
>         if(c2 < 0x800) goto Linvalid;
>         return c2;
>       }
>       else if(str.length > 3) {
>         dchar c3 = str.ptr[3];
>         if(c0 < 0xF5 && (c1 & 0xC0) == 0x80 && (c2 & 0xC0) == 0x80 && (c3 & 0xC0) == 0x80) {
>           c3 = ((c0 & 0x07) << 16)|((c1 & 0x3F) << 12)|((c2 & 0x3F) << 6)|(c3 & 0x3F);
>           if(c3 < 0x10000  || c3 > 0x10ffff) goto Linvalid;
>           return c3;
>         }
>       }
>     }
>   }
>   Linvalid:
>      throw new Exception("yadayada");
> //assert(myFront2(['\xC2','\xA2'])==0xA3);
> }

This looks quite slow.
We already have a correct version in utf.decodeImpl.
The goal here was to find a small and fast alternative.