July 10, 2013
http://d.puremagic.com/issues/show_bug.cgi?id=6829



--- Comment #10 from bearophile_hugs@eml.cc 2013-07-10 04:15:16 PDT ---
(In reply to comment #9)

> Excellent, so could put these as templates then. :)

I have improved your code a little. Follows the 32 bit asm for ldc2 and dmd:


import std.traits: isIntegral, isUnsigned;

T rol(T)(in T x, in uint y) @safe pure nothrow
if (isIntegral!T && isUnsigned!T)
{
    return cast(T)((x << y) | (x >> ((T.sizeof * 8) - y)));
}

T ror(T)(in T x, in uint y) @safe pure nothrow
if (isIntegral!T && isUnsigned!T)
{
    return cast(T)((x >> y) | (x << ((T.sizeof * 8) - y)));
}

void main() {
    // Tests to check for assembly output.
    {
        __gshared static ubyte xb;
        __gshared static ushort xs;
        __gshared static uint xi;
        __gshared static ulong xl;
        __gshared static uint yi;

        rol(xb, yi);   // rolb
        ror(xb, yi);   // rorb

        rol(xs, yi);   // rolw
        ror(xs, yi);   // rorw

        rol(xi, yi);   // roll
        ror(xi, yi);   // rorl

        rol(xl, yi);   // version(X86_64) rolq
        ror(xl, yi);   // version(X86_64) rorq
    }
}


/*
ldmd2 -O -output-s


__D5temp210__T3rolThZ3rolFNaNbNfxhxkZh:
    pushl   %esi
    movzbl  8(%esp), %edx
    movb    %al, %cl
    movl    %edx, %esi
    shll    %cl, %esi
    movl    $8, %ecx
    subl    %eax, %ecx
    shrl    %cl, %edx
    orl %esi, %edx
    movzbl  %dl, %eax
    popl    %esi
    ret $4

__D5temp210__T3rorThZ3rorFNaNbNfxhxkZh:
    pushl   %esi
    movzbl  8(%esp), %edx
    movb    %al, %cl
    movl    %edx, %esi
    shrl    %cl, %esi
    movl    $8, %ecx
    subl    %eax, %ecx
    shll    %cl, %edx
    orl %esi, %edx
    movzbl  %dl, %eax
    popl    %esi
    ret $4

__D5temp210__T3rolTtZ3rolFNaNbNfxtxkZt:
    pushl   %esi
    movzwl  8(%esp), %edx
    movb    %al, %cl
    movl    %edx, %esi
    shll    %cl, %esi
    movl    $16, %ecx
    subl    %eax, %ecx
    shrl    %cl, %edx
    orl %esi, %edx
    movzwl  %dx, %eax
    popl    %esi
    ret $4

__D5temp210__T3rorTtZ3rorFNaNbNfxtxkZt:
    pushl   %esi
    movzwl  8(%esp), %edx
    movb    %al, %cl
    movl    %edx, %esi
    shrl    %cl, %esi
    movl    $16, %ecx
    subl    %eax, %ecx
    shll    %cl, %edx
    orl %esi, %edx
    movzwl  %dx, %eax
    popl    %esi
    ret $4

__D5temp210__T3rolTkZ3rolFNaNbNfxkxkZk:
    movl    4(%esp), %edx
    movb    %al, %cl
    roll    %cl, %edx
    movl    %edx, %eax
    ret $4

__D5temp210__T3rorTkZ3rorFNaNbNfxkxkZk:
    movl    4(%esp), %edx
    movb    %al, %cl
    rorl    %cl, %edx
    movl    %edx, %eax
    ret $4

__D5temp210__T3rolTmZ3rolFNaNbNfxmxkZm:
    pushl   %ebp
    pushl   %ebx
    pushl   %edi
    pushl   %esi
    movl    %eax, %ebx
    movl    $64, %ecx
    subl    %ebx, %ecx
    movl    24(%esp), %edx
    movl    20(%esp), %eax
    movl    %eax, %esi
    shrdl   %cl, %edx, %esi
    movl    %edx, %edi
    shrl    %cl, %edi
    xorl    %ebp, %ebp
    testb   $32, %cl
    cmovnel %edi, %esi
    cmovnel %ebp, %edi
    movb    %bl, %cl
    shldl   %cl, %eax, %edx
    movb    %bl, %cl
    shll    %cl, %eax
    testb   $32, %bl
    cmovnel %eax, %edx
    cmovnel %ebp, %eax
    orl %esi, %eax
    orl %edi, %edx
    popl    %esi
    popl    %edi
    popl    %ebx
    popl    %ebp
    ret $8

__D5temp210__T3rorTmZ3rorFNaNbNfxmxkZm:
    pushl   %ebp
    pushl   %ebx
    pushl   %edi
    pushl   %esi
    movl    %eax, %ecx
    movl    24(%esp), %edx
    movl    20(%esp), %eax
    movl    %eax, %esi
    shrdl   %cl, %edx, %esi
    movl    %edx, %edi
    shrl    %cl, %edi
    xorl    %ebp, %ebp
    testb   $32, %cl
    cmovnel %edi, %esi
    cmovnel %ebp, %edi
    movl    $64, %ebx
    subl    %ecx, %ebx
    movb    %bl, %cl
    shldl   %cl, %eax, %edx
    movb    %bl, %cl
    shll    %cl, %eax
    testb   $32, %bl
    cmovnel %eax, %edx
    cmovnel %ebp, %eax
    orl %esi, %eax
    orl %edi, %edx
    popl    %esi
    popl    %edi
    popl    %ebx
    popl    %ebp
    ret $8

--------------------------------

dmd -O

_D5temp210__T3rolThZ3rolFNaNbNfxhxkZh:
        push    EAX
        push    EAX
        movzx   ECX,byte ptr 0Ch[ESP]
        mov EAX,ECX
        mov 0[ESP],ECX
        mov CL,4[ESP]
        mov EDX,0[ESP]
        shl AL,CL
        mov ECX,8
        sub ECX,4[ESP]
        sar EDX,CL
        add ESP,8
        or  AL,DL
        ret 4

_D5temp210__T3rorThZ3rorFNaNbNfxhxkZh:
        push    EAX
        push    EAX
        movzx   ECX,byte ptr 0Ch[ESP]
        mov EAX,ECX
        mov 0[ESP],ECX
        mov ECX,8
        sub CL,4[ESP]
        mov EDX,0[ESP]
        shl AL,CL
        mov ECX,4[ESP]
        sar EDX,CL
        add ESP,8
        or  AL,DL
        ret 4

_D5temp210__T3rolTtZ3rolFNaNbNfxtxkZt:
        push    EAX
        mov AX,8[ESP]
        mov CX,[ESP]
        mov DX,8[ESP]
        shl EAX,CL
        mov ECX,010h
        sub ECX,[ESP]
        and EDX,0FFFFh
        sar EDX,CL
        pop ECX
        or  EAX,EDX
        ret 4

_D5temp210__T3rorTtZ3rorFNaNbNfxtxkZt:
        push    EAX
        mov ECX,010h
        mov AX,8[ESP]
        sub ECX,[ESP]
        mov DX,8[ESP]
        shl EAX,CL
        mov ECX,[ESP]
        and EDX,0FFFFh
        sar EDX,CL
        or  EAX,EDX
        pop ECX
        ret 4

_D5temp210__T3rolTkZ3rolFNaNbNfxkxkZk:
        push    EAX
        mov EAX,8[ESP]
        mov ECX,[ESP]
        rol EAX,CL
        pop ECX
        ret 4

_D5temp210__T3rorTkZ3rorFNaNbNfxkxkZk:
        push    EAX
        mov EAX,8[ESP]
        mov ECX,[ESP]
        ror EAX,CL
        pop ECX
        ret 4

_D5temp210__T3rolTmZ3rolFNaNbNfxmxkZm:
        push    EAX
        mov ECX,0[ESP]
        mov EDX,0Ch[ESP]
        push    EBX
        mov EAX,0Ch[ESP]
        test    CL,020h
        jne L1A
        shld    EDX,EAX,CL
        shl EAX,CL
        jmp short   L20
L1A:        shl EAX,CL
        mov EDX,EAX
        xor EAX,EAX
L20:        mov ECX,040h
        mov EBX,0Ch[ESP]
        push    EDX
        mov EDX,014h[ESP]
        sub ECX,8[ESP]
        test    CL,020h
        jne L3E
        shrd    EBX,EDX,CL
        shr EDX,CL
        jmp short   L44
L3E:        shr EDX,CL
        mov EBX,EDX
        xor EDX,EDX
L44:        mov ECX,EDX
        or  EAX,EBX
        pop EDX
        or  EDX,ECX
        pop EBX
        pop ECX
        ret 8

_D5temp210__T3rorTmZ3rorFNaNbNfxmxkZm:
        push    EAX
        mov ECX,0[ESP]
        mov EDX,0Ch[ESP]
        push    EBX
        mov EAX,0Ch[ESP]
        test    CL,020h
        jne L1A
        shrd    EAX,EDX,CL
        shr EDX,CL
        jmp short   L20
L1A:        shr EDX,CL
        mov EAX,EDX
        xor EDX,EDX
L20:        mov ECX,040h
        mov EBX,0Ch[ESP]
        push    EDX
        mov EDX,014h[ESP]
        sub ECX,8[ESP]
        test    CL,020h
        jne L3E
        shld    EDX,EBX,CL
        shl EBX,CL
        jmp short   L44
L3E:        shl EBX,CL
        mov EDX,EBX
        xor EBX,EBX
L44:        mov ECX,EDX
        or  EAX,EBX
        pop EDX
        or  EDX,ECX
        pop EBX
        pop ECX
        ret 8

*/

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
July 10, 2013
http://d.puremagic.com/issues/show_bug.cgi?id=6829



--- Comment #11 from Iain Buclaw <ibuclaw@ubuntu.com> 2013-07-10 04:57:05 PDT ---
(In reply to comment #10)
> (In reply to comment #9)
> 
> > Excellent, so could put these as templates then. :)
> 
> I have improved your code a little. Follows the 32 bit asm for ldc2 and dmd:
> 

gdc > (ldc && dmd);

It has no problem detecting all those cases.  :o)

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
July 10, 2013
http://d.puremagic.com/issues/show_bug.cgi?id=6829



--- Comment #12 from bearophile_hugs@eml.cc 2013-07-10 06:45:05 PDT ---
A bit better, with a pre-condition to catch some bugs:


T rol(T)(in T x, in uint n) @safe pure nothrow
if (isIntegral!T && isUnsigned!T)
in {
    assert(n < (T.sizeof * 8));
} body {
    return cast(T)((x << n) | (x >> ((T.sizeof * 8) - n)));
}

T ror(T)(in T x, in uint n) @safe pure nothrow
if (isIntegral!T && isUnsigned!T)
in {
    assert(n < (T.sizeof * 8));
} body {
    return cast(T)((x >> n) | (x << ((T.sizeof * 8) - n)));
}

--------------

(In reply to comment #11)

> gdc > (ldc && dmd);
> 
> It has no problem detecting all those cases.  :o)

But is that asm generated by gdc actually faster than this asm generated by ldc2?

One of the main points of adding those two templated functions to core.bitop is to have a single common standard pair of rotations that all present and future D compilers can recognize and optimize well.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
July 10, 2013
http://d.puremagic.com/issues/show_bug.cgi?id=6829



--- Comment #13 from bearophile_hugs@eml.cc 2013-07-10 06:46:44 PDT ---
(In reply to comment #11)

> It has no problem detecting all those cases.  :o)

Perhaps you want to show the asm generated by gdc for those functions?

(Perhaps here there's material for a small enhancement request for LLVM.)

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
July 10, 2013
http://d.puremagic.com/issues/show_bug.cgi?id=6829



--- Comment #14 from Iain Buclaw <ibuclaw@ubuntu.com> 2013-07-10 06:53:27 PDT ---
(In reply to comment #12)
> (In reply to comment #11)
> 
> > gdc > (ldc && dmd);
> > 
> > It has no problem detecting all those cases.  :o)
> 
> But is that asm generated by gdc actually faster than this asm generated by ldc2?
> 

Well, I would have to assume that ro{lr}{bwlq} is faster than shl/shr.  Someone should get speed reports in.  =)

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
July 10, 2013
http://d.puremagic.com/issues/show_bug.cgi?id=6829



--- Comment #15 from Iain Buclaw <ibuclaw@ubuntu.com> 2013-07-10 06:55:08 PDT ---
(In reply to comment #13)
> (In reply to comment #11)
> 
> > It has no problem detecting all those cases.  :o)
> 
> Perhaps you want to show the asm generated by gdc for those functions?
> 
> (Perhaps here there's material for a small enhancement request for LLVM.)

You see the assembler output for roll/rorl.  It's identical with exception to the operand suffix - eg: rolb/rorb for ubyte.

But to give you a comparison.

// LDC

__D5temp210__T3rolThZ3rolFNaNbNfxhxkZh:
    pushl   %esi
    movzbl  8(%esp), %edx
    movb    %al, %cl
    movl    %edx, %esi
    shll    %cl, %esi
    movl    $8, %ecx
    subl    %eax, %ecx
    shrl    %cl, %edx
    orl %esi, %edx
    movzbl  %dl, %eax
    popl    %esi
    ret $4

vs.

// GDC

__D5temp210__T3rolThZ3rolFNaNbNfxhxkZh:
    movl    %edi, %eax
    movl    %esi, %ecx
    rolb    %cl, %al
    ret

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
July 10, 2013
http://d.puremagic.com/issues/show_bug.cgi?id=6829



--- Comment #16 from Iain Buclaw <ibuclaw@ubuntu.com> 2013-07-10 06:59:38 PDT ---
(In reply to comment #13)
> (In reply to comment #11)
> 
> > It has no problem detecting all those cases.  :o)
> 
> Perhaps you want to show the asm generated by gdc for those functions?
> 
> (Perhaps here there's material for a small enhancement request for LLVM.)

Full listing:

_D4temp10__T3rolThZ3rolFNaNbNfxhxkZh:
        movl    %edi, %eax
        movl    %esi, %ecx
        rolb    %cl, %al
        ret

_D4temp10__T3rorThZ3rorFNaNbNfxhxkZh:
        movl    %edi, %eax
        movl    %esi, %ecx
        rorb    %cl, %al
        ret

_D4temp10__T3rolTtZ3rolFNaNbNfxtxkZt:
        movl    %edi, %eax
        movl    %esi, %ecx
        rolw    %cl, %ax
        ret

_D4temp10__T3rorTtZ3rorFNaNbNfxtxkZt:
        movl    %edi, %eax
        movl    %esi, %ecx
        rorw    %cl, %ax
        ret

_D4temp10__T3rolTkZ3rolFNaNbNfxkxkZk:
        movl    %edi, %eax
        movl    %esi, %ecx
        roll    %cl, %eax
        ret

_D4temp10__T3rorTkZ3rorFNaNbNfxkxkZk:
        movl    %edi, %eax
        movl    %esi, %ecx
        rorl    %cl, %eax
        ret

_D4temp10__T3rolTmZ3rolFNaNbNfxmxkZm:
        movq    %rdi, %rax
        movl    %esi, %ecx
        rolq    %cl, %rax
        ret

_D4temp10__T3rorTmZ3rorFNaNbNfxmxkZm:
        movq    %rdi, %rax
        movl    %esi, %ecx
        rorq    %cl, %rax
        ret

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
July 10, 2013
http://d.puremagic.com/issues/show_bug.cgi?id=6829



--- Comment #17 from bearophile_hugs@eml.cc 2013-07-10 09:29:51 PDT ---
(In reply to comment #15)

> But to give you a comparison.
> 
> // LDC
> 
> __D5temp210__T3rolThZ3rolFNaNbNfxhxkZh:
>     pushl   %esi
>     movzbl  8(%esp), %edx
>     movb    %al, %cl
>     movl    %edx, %esi
>     shll    %cl, %esi
>     movl    $8, %ecx
>     subl    %eax, %ecx
>     shrl    %cl, %edx
>     orl %esi, %edx
>     movzbl  %dl, %eax
>     popl    %esi
>     ret $4
> 
> vs.
> 
> // GDC
> 
> __D5temp210__T3rolThZ3rolFNaNbNfxhxkZh:
>     movl    %edi, %eax
>     movl    %esi, %ecx
>     rolb    %cl, %al
>     ret

I have asked in the LLVM IRC channel, and it seems a limit/problem of the llvm back-end. So maybe it's worth writing an LLVM enhancement request.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
July 12, 2013
http://d.puremagic.com/issues/show_bug.cgi?id=6829



--- Comment #18 from hsteoh@quickfur.ath.cx 2013-07-11 21:37:37 PDT ---
Hmph. I'm using bearophile's code with DMD git HEAD, running as dmd -O, and *none* of the cases produce a rotate instruction. :-(  Could it be an issue with 32bit vs. 64-bit? I'm running in a purely 64-bit environment.

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------
July 12, 2013
http://d.puremagic.com/issues/show_bug.cgi?id=6829



--- Comment #19 from hsteoh@quickfur.ath.cx 2013-07-11 21:40:34 PDT ---
I can't get GDC git HEAD to recognize this pattern either. What am I doing wrong??

-- 
Configure issuemail: http://d.puremagic.com/issues/userprefs.cgi?tab=email
------- You are receiving this mail because: -------