[Issue 18627] std.complex is a lot slower than builtin complex types at number crunching (page 2)

April 16, 2021

[Issue 18627] std.complex is a lot slower than builtin complex types at number crunching

Posted by Iain Buclaw

Permalink

Iain Buclaw

Permalink

https://issues.dlang.org/show_bug.cgi?id=18627

Iain Buclaw <ibuclaw@gdcproject.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|RESOLVED                    |REOPENED
         Resolution|FIXED                       |---

--- Comment #15 from Iain Buclaw <ibuclaw@gdcproject.org> ---
Not sure if this should really be marked as resolved/fixed, but anyhow...

With the following (lazy) function generator:
---
import std.complex : C = Complex;
import std.meta : AliasSeq;
import std.format : format;

// Generate one thin wrapper per operator for each built-in complex type, so
// that every operation is emitted as its own named function
// (e.g. `cfloat_unary_add`, `cdouble_binary_div`).
static foreach (T; AliasSeq!(cfloat, cdouble, creal))
{
    // Unary operators: `%1$s` is the type name, reused for the return type,
    // the function-name prefix and the parameter type.
    mixin(format!"%1$s %1$s_unary_add(%1$s a) { return +a; }"(T.stringof));
    mixin(format!"%1$s %1$s_unary_sub(%1$s a) { return -a; }"(T.stringof));

    // Binary operators: both operands and the result share the same type.
    mixin(format!"%1$s %1$s_binary_add(%1$s a, %1$s b) { return a + b; }"(T.stringof));
    mixin(format!"%1$s %1$s_binary_sub(%1$s a, %1$s b) { return a - b; }"(T.stringof));
    mixin(format!"%1$s %1$s_binary_mul(%1$s a, %1$s b) { return a * b; }"(T.stringof));
    mixin(format!"%1$s %1$s_binary_div(%1$s a, %1$s b) { return a / b; }"(T.stringof));
}

// Generate the matching wrappers for std.complex (`C` aliases `Complex`):
// for each component type T, every operator on `C!T` is emitted as its own
// named function (e.g. `std_cfloat_unary_add`, `std_cdouble_binary_div`).
static foreach (T; AliasSeq!(float, double, real))
{
    // Unary operators: `%1$s` is the component type name, used both inside
    // `C!...` and in the `std_c...` function-name prefix.
    mixin(format!"C!%1$s std_c%1$s_unary_add(C!%1$s a) { return +a; }"(T.stringof));
    mixin(format!"C!%1$s std_c%1$s_unary_sub(C!%1$s a) { return -a; }"(T.stringof));

    // Binary operators: both operands and the result are `C!T`.
    mixin(format!"C!%1$s std_c%1$s_binary_add(C!%1$s a, C!%1$s b) { return a + b; }"(T.stringof));
    mixin(format!"C!%1$s std_c%1$s_binary_sub(C!%1$s a, C!%1$s b) { return a - b; }"(T.stringof));
    mixin(format!"C!%1$s std_c%1$s_binary_mul(C!%1$s a, C!%1$s b) { return a * b; }"(T.stringof));
    mixin(format!"C!%1$s std_c%1$s_binary_div(C!%1$s a, C!%1$s b) { return a / b; }"(T.stringof));
}
---


On x86_64/GDC, the results are:

========================================

cfloat_unary_add:
        movq    %xmm0, -8(%rsp)
        movss   -8(%rsp), %xmm0
        movss   %xmm0, -16(%rsp)
        movss   -4(%rsp), %xmm0
        movss   %xmm0, -12(%rsp)
        movq    -16(%rsp), %xmm0
        ret
---
std_cfloat_unary_add:
        ret

========================================

cdouble_unary_add:
        ret
---
std_cdouble_unary_add:
        ret

========================================

creal_unary_add:
        fldt    8(%rsp)
        fldt    24(%rsp)
        fxch    %st(1)
        ret
---
std_creal_unary_add:
        movdqa  8(%rsp), %xmm0
        movdqa  24(%rsp), %xmm1
        movq    %rdi, %rax
        movaps  %xmm0, (%rdi)
        movaps  %xmm1, 16(%rdi)
        ret

========================================

cfloat_unary_sub:
        movq    %xmm0, -8(%rsp)
        movss   -8(%rsp), %xmm0
        movss   .LC4(%rip), %xmm2
        movaps  %xmm0, %xmm1
        movss   -4(%rsp), %xmm0
        xorps   %xmm2, %xmm1
        xorps   %xmm2, %xmm0
        movss   %xmm1, -16(%rsp)
        movss   %xmm0, -12(%rsp)
        movq    -16(%rsp), %xmm0
        ret
.LC4:
        .long   -2147483648
        .long   0
        .long   0
        .long   0
---
std_cfloat_unary_sub:
        movq    .LC7(%rip), %xmm1
        xorps   %xmm1, %xmm0
        ret
.LC7:
        .long   -2147483648
        .long   -2147483648

========================================

cdouble_unary_sub:
        movq    .LC5(%rip), %xmm2
        xorpd   %xmm2, %xmm1
        xorpd   %xmm2, %xmm0
        ret
.LC5:
        .long   0
        .long   -2147483648
        .long   0
        .long   0
---
std_cdouble_unary_sub:
        movq    %xmm0, -24(%rsp)
        movq    %xmm1, -16(%rsp)
        movapd  -24(%rsp), %xmm2
        xorpd   .LC8(%rip), %xmm2
        movaps  %xmm2, -24(%rsp)
        movsd   -16(%rsp), %xmm1
        movsd   -24(%rsp), %xmm0
        ret
.LC8:
        .long   0
        .long   -2147483648
        .long   0
        .long   -2147483648

========================================

creal_unary_sub:
        fldt    8(%rsp)
        fchs
        fldt    24(%rsp)
        fchs
        fxch    %st(1)
        ret
---
std_creal_unary_sub:
        fldt    24(%rsp)
        movq    %rdi, %rax
        fchs
        fldt    8(%rsp)
        fchs
        fstpt   (%rdi)
        fstpt   16(%rdi)
        ret

========================================

cfloat_binary_add:
        movq    %xmm0, -8(%rsp)
        movq    %xmm1, -16(%rsp)
        movss   -8(%rsp), %xmm1
        movss   -16(%rsp), %xmm0
        addss   %xmm0, %xmm1
        movss   -12(%rsp), %xmm0
        addss   -4(%rsp), %xmm0
        movss   %xmm1, -24(%rsp)
        movss   %xmm0, -20(%rsp)
        movq    -24(%rsp), %xmm0
        ret
---
std_cfloat_binary_add:
        addps   %xmm1, %xmm0
        ret


========================================

cdouble_binary_add:
        addsd   %xmm3, %xmm1
        addsd   %xmm2, %xmm0
        ret
---
std_cdouble_binary_add:
        movq    %xmm0, -40(%rsp)
        movq    %xmm1, -32(%rsp)
        movq    %xmm2, -24(%rsp)
        movq    %xmm3, -16(%rsp)
        movapd  -24(%rsp), %xmm4
        addpd   -40(%rsp), %xmm4
        movaps  %xmm4, -40(%rsp)
        movsd   -32(%rsp), %xmm1
        movsd   -40(%rsp), %xmm0
        ret

========================================

creal_binary_add:
        fldt    8(%rsp)
        fldt    40(%rsp)
        faddp   %st, %st(1)
        fldt    24(%rsp)
        fldt    56(%rsp)
        faddp   %st, %st(1)
        fxch    %st(1)
        ret
---
std_creal_binary_add:
        fldt    24(%rsp)
        movq    %rdi, %rax
        fldt    56(%rsp)
        faddp   %st, %st(1)
        fldt    40(%rsp)
        fldt    8(%rsp)
        faddp   %st, %st(1)
        fstpt   (%rdi)
        fstpt   16(%rdi)
        ret

========================================

cfloat_binary_sub:
        movq    %xmm0, -8(%rsp)
        movss   -8(%rsp), %xmm0
        movq    %xmm1, -16(%rsp)
        movaps  %xmm0, %xmm1
        movss   -4(%rsp), %xmm0
        subss   -16(%rsp), %xmm1
        subss   -12(%rsp), %xmm0
        movss   %xmm1, -24(%rsp)
        movss   %xmm0, -20(%rsp)
        movq    -24(%rsp), %xmm0
        ret
---
std_cfloat_binary_sub:
        subps   %xmm1, %xmm0
        ret

========================================

cdouble_binary_sub:
        subsd   %xmm3, %xmm1
        subsd   %xmm2, %xmm0
        ret
---
std_cdouble_binary_sub:
        movq    %xmm0, -40(%rsp)
        movq    %xmm1, -32(%rsp)
        movapd  -40(%rsp), %xmm4
        movq    %xmm2, -24(%rsp)
        movq    %xmm3, -16(%rsp)
        subpd   -24(%rsp), %xmm4
        movaps  %xmm4, -40(%rsp)
        movsd   -32(%rsp), %xmm1
        movsd   -40(%rsp), %xmm0
        ret

========================================

creal_binary_sub:
        fldt    8(%rsp)
        fldt    40(%rsp)
        fsubrp  %st, %st(1)
        fldt    24(%rsp)
        fldt    56(%rsp)
        fsubrp  %st, %st(1)
        fxch    %st(1)
        ret
---
std_creal_binary_sub:
        fldt    24(%rsp)
        movq    %rdi, %rax
        fldt    56(%rsp)
        fsubrp  %st, %st(1)
        fldt    8(%rsp)
        fldt    40(%rsp)
        fsubrp  %st, %st(1)
        fstpt   (%rdi)
        fstpt   16(%rdi)
        ret

========================================

cfloat_binary_mul:
        movq    %xmm0, -8(%rsp)
        movss   -8(%rsp), %xmm0
        movss   -4(%rsp), %xmm2
        movq    %xmm1, -16(%rsp)
        movss   -16(%rsp), %xmm3
        movss   -12(%rsp), %xmm4
        movaps  %xmm0, %xmm1
        movaps  %xmm2, %xmm5
        mulss   %xmm3, %xmm1
        mulss   %xmm4, %xmm5
        mulss   %xmm4, %xmm0
        mulss   %xmm3, %xmm2
        subss   %xmm5, %xmm1
        addss   %xmm2, %xmm0
        movss   %xmm1, -24(%rsp)
        movss   %xmm0, -20(%rsp)
        movq    -24(%rsp), %xmm0
        ret
---
std_cfloat_binary_mul:
        movdqa  %xmm0, %xmm2
        movaps  %xmm1, %xmm0
        shufps  $0xe5, %xmm1, %xmm1
        shufps  $0xe0, %xmm0, %xmm0
        mulps   %xmm2, %xmm0
        shufps  $0xe1, %xmm2, %xmm2
        mulps   %xmm1, %xmm2
        movaps  %xmm0, %xmm1
        subps   %xmm2, %xmm1
        addps   %xmm2, %xmm0
        movss   %xmm1, %xmm0
        ret

========================================

cdouble_binary_mul:
        movapd  %xmm0, %xmm4
        movapd  %xmm1, %xmm5
        mulsd   %xmm2, %xmm0
        mulsd   %xmm3, %xmm5
        mulsd   %xmm3, %xmm4
        mulsd   %xmm2, %xmm1
        subsd   %xmm5, %xmm0
        addsd   %xmm4, %xmm1
        ret
---
std_cdouble_binary_mul:
        movq    %xmm2, -40(%rsp)
        movq    %xmm3, -32(%rsp)
        movapd  -40(%rsp), %xmm2
        movq    %xmm1, -16(%rsp)
        movapd  -40(%rsp), %xmm1
        movq    %xmm0, -24(%rsp)
        movapd  -24(%rsp), %xmm0
        unpcklpd        %xmm2, %xmm2
        mulpd   -24(%rsp), %xmm2
        unpckhpd        %xmm1, %xmm1
        shufpd  $1, %xmm0, %xmm0
        mulpd   %xmm1, %xmm0
        movapd  %xmm2, %xmm1
        subpd   %xmm0, %xmm1
        addpd   %xmm0, %xmm2
        movsd   %xmm1, %xmm2
        movaps  %xmm2, -40(%rsp)
        movsd   -32(%rsp), %xmm1
        movsd   -40(%rsp), %xmm0
        ret

========================================

creal_binary_mul:
        fldt    8(%rsp)
        fldt    24(%rsp)
        fldt    40(%rsp)
        fldt    56(%rsp)
        fld     %st(3)
        fmul    %st(2), %st
        fld     %st(3)
        fmul    %st(2), %st
        fsubrp  %st, %st(1)
        fxch    %st(4)
        fmulp   %st, %st(1)
        fxch    %st(2)
        fmulp   %st, %st(1)
        faddp   %st, %st(1)
        fxch    %st(1)
        ret
---
std_creal_binary_mul:
        fldt    40(%rsp)
        movq    %rdi, %rax
        fldt    56(%rsp)
        fldt    24(%rsp)
        fldt    8(%rsp)
        fld     %st(3)
        fmul    %st(1), %st
        fld     %st(2)
        fmul    %st(4), %st
        fsubrp  %st, %st(1)
        fstpt   (%rdi)
        fxch    %st(3)
        fmulp   %st, %st(1)
        fxch    %st(2)
        fmulp   %st, %st(1)
        faddp   %st, %st(1)
        fstpt   16(%rdi)
        ret

========================================

cfloat_binary_div:
        movq    %xmm1, -16(%rsp)
        movss   -16(%rsp), %xmm5
        movss   -12(%rsp), %xmm4
        movq    %xmm0, -8(%rsp)
        movss   -8(%rsp), %xmm3
        movss   -4(%rsp), %xmm0
        movaps  %xmm5, %xmm2
        movaps  %xmm4, %xmm1
        mulss   %xmm4, %xmm1
        movaps  %xmm0, %xmm6
        mulss   %xmm5, %xmm2
        mulss   %xmm4, %xmm6
        mulss   %xmm5, %xmm0
        addss   %xmm1, %xmm2
        movaps  %xmm3, %xmm1
        mulss   %xmm5, %xmm1
        mulss   %xmm4, %xmm3
        addss   %xmm6, %xmm1
        subss   %xmm3, %xmm0
        divss   %xmm2, %xmm1
        divss   %xmm2, %xmm0
        movss   %xmm1, -24(%rsp)
        movss   %xmm0, -20(%rsp)
        movq    -24(%rsp), %xmm0
        ret
---
std_cfloat_binary_div:
        movq    %xmm1, %rax
        movdqa  %xmm1, %xmm2
        movdqa  %xmm0, %xmm3
        shrq    $32, %rax
        movaps  %xmm2, %xmm4
        mulss   %xmm2, %xmm4
        movd    %eax, %xmm1
        movq    %xmm0, %rax
        movaps  %xmm1, %xmm0
        shrq    $32, %rax
        mulss   %xmm1, %xmm0
        movq    %rax, %xmm5
        movd    %eax, %xmm6
        mulss   %xmm1, %xmm6
        addss   %xmm0, %xmm4
        movaps  %xmm2, %xmm0
        mulss   %xmm3, %xmm0
        mulss   %xmm5, %xmm2
        mulss   %xmm1, %xmm3
        addss   %xmm6, %xmm0
        subss   %xmm3, %xmm2
        divss   %xmm4, %xmm0
        divss   %xmm4, %xmm2
        unpcklps        %xmm2, %xmm0
        ret

========================================

cdouble_binary_div:
        movapd  %xmm0, %xmm4
        movapd  %xmm2, %xmm5
        movapd  %xmm3, %xmm0
        mulsd   %xmm3, %xmm0
        movapd  %xmm1, %xmm6
        mulsd   %xmm2, %xmm5
        mulsd   %xmm3, %xmm6
        mulsd   %xmm2, %xmm1
        addsd   %xmm0, %xmm5
        movapd  %xmm4, %xmm0
        mulsd   %xmm2, %xmm0
        mulsd   %xmm3, %xmm4
        addsd   %xmm6, %xmm0
        subsd   %xmm4, %xmm1
        divsd   %xmm5, %xmm0
        divsd   %xmm5, %xmm1
        ret
---
std_cdouble_binary_div:
        movq    %xmm2, -40(%rsp)
        movsd   -40(%rsp), %xmm2
        movq    %xmm3, -32(%rsp)
        movapd  -40(%rsp), %xmm3
        movsd   -32(%rsp), %xmm4
        movq    %xmm1, -16(%rsp)
        movapd  -40(%rsp), %xmm1
        mulsd   %xmm2, %xmm2
        movq    %xmm0, -24(%rsp)
        mulsd   %xmm4, %xmm4
        unpcklpd        %xmm3, %xmm3
        movapd  -24(%rsp), %xmm0
        mulpd   -24(%rsp), %xmm3
        unpckhpd        %xmm1, %xmm1
        shufpd  $1, %xmm0, %xmm0
        mulpd   %xmm1, %xmm0
        addsd   %xmm4, %xmm2
        movapd  %xmm3, %xmm1
        addpd   %xmm0, %xmm1
        subpd   %xmm0, %xmm3
        unpcklpd        %xmm2, %xmm2
        movsd   %xmm1, %xmm3
        divpd   %xmm2, %xmm3
        movaps  %xmm3, -40(%rsp)
        movsd   -32(%rsp), %xmm1
        movsd   -40(%rsp), %xmm0
        ret

========================================

creal_binary_div:
        fldt    8(%rsp)
        fldt    24(%rsp)
        fldt    40(%rsp)
        fldt    56(%rsp)
        fld     %st(1)
        fmul    %st(2), %st
        fld     %st(1)
        fmul    %st(2), %st
        faddp   %st, %st(1)
        fld     %st(4)
        fmul    %st(3), %st
        fld     %st(4)
        fmul    %st(3), %st
        faddp   %st, %st(1)
        fdiv    %st(1), %st
        fxch    %st(4)
        fmulp   %st, %st(3)
        fxch    %st(4)
        fmulp   %st, %st(1)
        fsubrp  %st, %st(1)
        fdivp   %st, %st(2)
        ret
---
std_creal_binary_div:
        fldt    40(%rsp)
        movq    %rdi, %rax
        fldt    56(%rsp)
        fldt    24(%rsp)
        fldt    8(%rsp)
        fld     %st(3)
        fmul    %st(4), %st
        fld     %st(3)
        fmul    %st(4), %st
        faddp   %st, %st(1)
        fld     %st(4)
        fmul    %st(2), %st
        fld     %st(3)
        fmul    %st(5), %st
        faddp   %st, %st(1)
        fdiv    %st(1), %st
        fstpt   (%rdi)
        fxch    %st(4)
        fmulp   %st, %st(2)
        fmulp   %st, %st(2)
        fsubp   %st, %st(1)
        fdivp   %st, %st(1)
        fstpt   16(%rdi)
        ret

========================================

Just visually comparing:

- cfloat -> Complex!float looks to be negligible.
- creal -> Complex!real just adds a small overhead of moving data on/off ST
registers (this is expected, and not a performance bug).
- cdouble -> Complex!double, it may look like cdouble still has a small edge,
however the use of *pd instructions in the std.complex code would in fact make
it quicker (i.e: one divpd is 2x faster than two divsd instructions in the
cdouble_binary_div functions).

I actually found that LLVM seemed to be able to pick up the intent of the FastMath complex divide functions, so LDC might give a more pleasing output.

Benchmarks to follow soon...

--

https://issues.dlang.org/show_bug.cgi?id=18627

Iain Buclaw <ibuclaw@gdcproject.org> changed:

           What    |Removed                     |Added
----------------------------------------------------------------------------
             Status|REOPENED                    |RESOLVED
         Resolution|---                         |FIXED

--- Comment #16 from Iain Buclaw <ibuclaw@gdcproject.org> ---
cfloat_unary_add: 15 secs, 195 ms, 935 μs, and 5 hnsecs
std_cfloat_unary_add: 2 secs, 491 ms, 834 μs, and 9 hnsecs

cfloat_unary_sub: 14 secs, 926 ms, 587 μs, and 6 hnsecs
std_cfloat_unary_sub: 4 secs, 858 ms, 349 μs, and 4 hnsecs

cfloat_binary_add: 22 secs, 363 ms, 951 μs, and 9 hnsecs
std_cfloat_binary_add: 5 secs, 403 ms, 108 μs, and 9 hnsecs

cfloat_binary_sub: 22 secs, 236 ms, and 902 μs
std_cfloat_binary_sub: 5 secs, 266 ms, 697 μs, and 6 hnsecs

cfloat_binary_mul: 24 secs, 858 ms, 63 μs, and 7 hnsecs
std_cfloat_binary_mul: 7 secs, 186 ms, 291 μs, and 8 hnsecs

cfloat_binary_div: 30 secs, 225 ms, 114 μs, and 4 hnsecs
std_cfloat_binary_div: 17 secs, 900 ms, 164 μs, and 6 hnsecs

cfloat_binary_div(FastMath): 29 secs, 230 ms, 821 μs, and 5 hnsecs
std_cfloat_binary_div(FastMath): 12 secs, 208 ms, 118 μs, and 7 hnsecs


cdouble_unary_add: 2 secs, 788 ms, 525 μs, and 6 hnsecs
std_cdouble_unary_add: 2 secs, 922 ms, 224 μs, and 1 hnsec

cdouble_unary_sub: 2 secs, 502 ms, and 734 μs
std_cdouble_unary_sub: 2 secs, 915 ms, 203 μs, and 9 hnsecs

cdouble_binary_add: 2 secs, 869 ms, 820 μs, and 1 hnsec
std_cdouble_binary_add: 3 secs, 108 ms, 545 μs, and 4 hnsecs

cdouble_binary_sub: 2 secs, 836 ms, 796 μs, and 5 hnsecs
std_cdouble_binary_sub: 3 secs, 159 ms, 209 μs, and 3 hnsecs

cdouble_binary_mul: 4 secs, 785 ms, 197 μs, and 6 hnsecs
std_cdouble_binary_mul: 5 secs, 197 ms, 572 μs, and 9 hnsecs

cdouble_binary_div: 14 secs, 238 ms, 332 μs, and 6 hnsecs
std_cdouble_binary_div: 15 secs, 933 ms, 301 μs, and 8 hnsecs

cdouble_binary_div(FastMath): 10 secs, 700 ms, and 32 μs
std_cdouble_binary_div(FastMath): 11 secs, 8 ms, 868 μs, and 5 hnsecs


creal_unary_add: 8 secs, 183 ms, 254 μs, and 3 hnsecs
std_creal_unary_add: 14 secs, 72 ms, 96 μs, and 2 hnsecs

creal_unary_sub: 8 secs, 425 ms, 681 μs, and 9 hnsecs
std_creal_unary_sub: 10 secs, 854 ms, 312 μs, and 8 hnsecs

creal_binary_add: 3 minutes, 50 secs, 877 ms, 637 μs, and 6 hnsecs
std_creal_binary_add: 3 minutes, 57 secs, 397 ms, 952 μs, and 4 hnsecs

creal_binary_sub: 4 minutes, 4 secs, 982 ms, 715 μs, and 2 hnsecs
std_creal_binary_sub: 4 minutes, 11 secs, 485 ms, 74 μs, and 8 hnsecs

creal_binary_mul: 11 minutes, 31 secs, 328 ms, 600 μs, and 7 hnsecs
std_creal_binary_mul: 11 minutes, 46 secs, 26 ms, 451 μs, and 2 hnsecs

creal_binary_div: 20 minutes, 48 secs, 778 ms, and 747 μs
std_creal_binary_div: 20 minutes, 2 secs, 439 ms, and 535 μs

creal_binary_div(FastMath): 18 minutes, 38 secs, 613 ms, 679 μs, and 6 hnsecs
std_creal_binary_div(FastMath): 18 minutes, 42 secs, 400 ms, 343 μs, and 7 hnsecs

--

Forums