Thread overview | |||||||||
---|---|---|---|---|---|---|---|---|---|
|
December 19, 2013 floor operation problem | ||||
---|---|---|---|---|
| ||||
While I was debugging a performance problem, I have found the cause is the floor operation. Below there is a small benchmark to show it. I have compiled the code with: gcc -Ofast -std=c99 -s -flto -mfpmath=sse -ffast-math -msse3 test1.c -o test1 ldmd2 -O -release -inline -noboundscheck test2.d ldmd2 -O -release -inline -noboundscheck test3.d 32 bit system gcc version 4.8.0 LDC 0.12.1 (based on DMD v2.063.2 and LLVM 3.3.1) Run-time, seconds: test1.c: 1.01 test2.d: 4.14 test3.d: 6.62 --------------------- // test1.c #include <stdio.h> #include <math.h> #include <stdlib.h> static inline float foo(const float x, const float y) { return floorf(x) + floorf(y); } int main() { float total = 0.0f; for (int i = 0; i < 1000; i++) for (int y = 0; y < 256; y++) for (int x = 0; x < 256; x++) total += foo(x * 0.1f, y * 0.1f); printf("%f\n", total); return 0; } --------------------- // test2.d import core.stdc.stdio, core.stdc.math; float foo(const float x, const float y) nothrow { return floorf(x) + floorf(y); } int main() { float total = 0.0f; for (int i = 0; i < 1000; i++) for (int y = 0; y < 256; y++) for (int x = 0; x < 256; x++) total += foo(x * 0.1f, y * 0.1f); printf("%f\n", total); return 0; } --------------------- // test3.d import core.stdc.stdio, std.math; float foo(const float x, const float y) nothrow { return floor(x) + floor(y); } int main() { float total = 0.0f; for (int i = 0; i < 1000; i++) for (int y = 0; y < 256; y++) for (int x = 0; x < 256; x++) total += foo(x * 0.1f, y * 0.1f); printf("%f\n", total); return 0; } --------------------- test1.c asm: _main: pushl %ebp movl %esp, %ebp pushl %ebx movl $1000, %ebx andl $-16, %esp subl $16, %esp call ___main xorps %xmm1, %xmm1 movss LC3, %xmm5 L2: movss LC1, %xmm6 xorps %xmm3, %xmm3 xorl %ecx, %ecx .p2align 4,,7 L9: movss LC2, %xmm4 xorps %xmm0, %xmm0 xorl %eax, %eax .p2align 4,,7 L7: addss %xmm0, %xmm1 addl $1, %eax cmpl $256, %eax addss %xmm3, %xmm1 je L12 cvtsi2ss %eax, %xmm0 mulss %xmm6, %xmm0 movaps %xmm0, 
%xmm2 andps %xmm5, %xmm2 ucomiss %xmm2, %xmm4 jbe L7 cvttss2si %xmm0, %edx cvtsi2ss %edx, %xmm2 movaps %xmm2, %xmm7 cmpnless %xmm0, %xmm7 movaps %xmm7, %xmm0 movss LC4, %xmm7 andps %xmm7, %xmm0 subss %xmm0, %xmm2 movaps %xmm2, %xmm0 jmp L7 .p2align 4,,7 L12: addl $1, %ecx cmpl $256, %ecx je L5 cvtsi2ss %ecx, %xmm3 movss LC6, %xmm0 movss LC2, %xmm2 mulss LC1, %xmm3 andps %xmm3, %xmm0 ucomiss %xmm0, %xmm2 jbe L9 cvttss2si %xmm3, %eax cvtsi2ss %eax, %xmm0 movaps %xmm0, %xmm2 cmpnless %xmm3, %xmm2 movss LC4, %xmm3 andps %xmm3, %xmm2 movaps %xmm0, %xmm3 subss %xmm2, %xmm3 jmp L9 L5: subl $1, %ebx jne L2 unpcklps %xmm1, %xmm1 movl $LC5, (%esp) cvtps2pd %xmm1, %xmm5 movsd %xmm5, 4(%esp) call _printf xorl %eax, %eax movl -4(%ebp), %ebx leave ret --------------------- test2.d asm: __Dmain: pushl %ebx pushl %edi pushl %esi subl $28, %esp xorps %xmm0, %xmm0 xorl %esi, %esi movss LCPI1_0, %xmm1 .align 16, 0x90 LBB1_1: xorl %edi, %edi .align 16, 0x90 LBB1_2: xorps %xmm2, %xmm2 cvtsi2ssl %edi, %xmm2 mulss %xmm1, %xmm2 movss %xmm2, 12(%esp) xorl %ebx, %ebx .align 16, 0x90 LBB1_3: movss %xmm0, 16(%esp) xorps %xmm0, %xmm0 cvtsi2ssl %ebx, %xmm0 mulss %xmm1, %xmm0 movss %xmm0, (%esp) calll _floorf movss 12(%esp), %xmm0 movss %xmm0, (%esp) fstps 24(%esp) calll _floorf movss LCPI1_0, %xmm1 fstps 20(%esp) movss 24(%esp), %xmm0 addss 20(%esp), %xmm0 movss 16(%esp), %xmm2 addss %xmm0, %xmm2 movss %xmm2, 16(%esp) movss 16(%esp), %xmm0 incl %ebx cmpl $256, %ebx jne LBB1_3 incl %edi cmpl $256, %edi jne LBB1_2 incl %esi cmpl $1000, %esi jne LBB1_1 cvtss2sd %xmm0, %xmm0 movsd %xmm0, 4(%esp) movl $_.str, (%esp) calll ___mingw_printf xorl %eax, %eax addl $28, %esp popl %esi popl %edi popl %ebx ret --------------------- test3.d asm: __Dmain: pushl %ebx pushl %edi pushl %esi subl $52, %esp xorps %xmm1, %xmm1 xorl %esi, %esi movss LCPI1_0, %xmm2 .align 16, 0x90 LBB1_1: xorl %edi, %edi .align 16, 0x90 LBB1_2: xorps %xmm0, %xmm0 cvtsi2ssl %edi, %xmm0 mulss %xmm2, %xmm0 movss %xmm0, 48(%esp) xorl 
%ebx, %ebx flds 48(%esp) fstpt 12(%esp) movaps %xmm1, %xmm0 .align 16, 0x90 LBB1_3: movss %xmm0, 36(%esp) xorps %xmm0, %xmm0 cvtsi2ssl %ebx, %xmm0 mulss %xmm2, %xmm0 movss %xmm0, 44(%esp) flds 44(%esp) fstpt (%esp) calll __D3std4math5floorFNbNeeZe subl $12, %esp fstpt 24(%esp) fldt 12(%esp) fstpt (%esp) calll __D3std4math5floorFNbNeeZe subl $12, %esp movss 36(%esp), %xmm0 movss LCPI1_0, %xmm2 fldt 24(%esp) faddp %st(1) fstps 40(%esp) addss 40(%esp), %xmm0 incl %ebx cmpl $256, %ebx jne LBB1_3 movaps %xmm0, %xmm1 incl %edi cmpl $256, %edi jne LBB1_2 incl %esi cmpl $1000, %esi jne LBB1_1 xorps %xmm0, %xmm0 cvtss2sd %xmm1, %xmm0 movsd %xmm0, 4(%esp) movl $_.str, (%esp) calll ___mingw_printf xorl %eax, %eax addl $52, %esp popl %esi popl %edi popl %ebx ret --------------------- Bye, bearophile |
December 19, 2013 Re: floor operation problem | ||||
---|---|---|---|---|
| ||||
Posted in reply to bearophile | Am Thu, 19 Dec 2013 01:15:27 +0100 schrieb "bearophile" <bearophileHUGS@lycos.com>: > While I was debugging a performance problem, I have found the cause is the floor operation. Below there is a small benchmark to show it. > > I have compiled the code with: > > gcc -Ofast -std=c99 -s -flto -mfpmath=sse -ffast-math -msse3 > test1.c -o test1 > ldmd2 -O -release -inline -noboundscheck test2.d > ldmd2 -O -release -inline -noboundscheck test3.d > > 32 bit system > > gcc version 4.8.0 > LDC 0.12.1 (based on DMD v2.063.2 and LLVM 3.3.1) > > Run-time, seconds: > test1.c: 1.01 > test2.d: 4.14 > test3.d: 6.62 > > --------------------- > > // test1.c > #include <stdio.h> > #include <math.h> > #include <stdlib.h> > > static inline float foo(const float x, const float y) { > return floorf(x) + floorf(y); > } > > int main() { > float total = 0.0f; > > for (int i = 0; i < 1000; i++) > for (int y = 0; y < 256; y++) > for (int x = 0; x < 256; x++) > total += foo(x * 0.1f, y * 0.1f); > > printf("%f\n", total); > return 0; > } > > --------------------- > > // test2.d > import core.stdc.stdio, core.stdc.math; > > float foo(const float x, const float y) nothrow { > return floorf(x) + floorf(y); > } > > int main() { > float total = 0.0f; > > for (int i = 0; i < 1000; i++) > for (int y = 0; y < 256; y++) > for (int x = 0; x < 256; x++) > total += foo(x * 0.1f, y * 0.1f); > > printf("%f\n", total); > return 0; > } > > --------------------- > > // test3.d > import core.stdc.stdio, std.math; > > float foo(const float x, const float y) nothrow { > return floor(x) + floor(y); > } > > int main() { > float total = 0.0f; > > for (int i = 0; i < 1000; i++) > for (int y = 0; y < 256; y++) > for (int x = 0; x < 256; x++) > total += foo(x * 0.1f, y * 0.1f); > > printf("%f\n", total); > return 0; > } > > --------------------- > > test1.c asm: > > _main: > pushl %ebp > movl %esp, %ebp > pushl %ebx > movl $1000, %ebx > andl $-16, %esp > subl $16, %esp > call ___main > 
xorps %xmm1, %xmm1 > movss LC3, %xmm5 > L2: > movss LC1, %xmm6 > xorps %xmm3, %xmm3 > xorl %ecx, %ecx > .p2align 4,,7 > L9: > movss LC2, %xmm4 > xorps %xmm0, %xmm0 > xorl %eax, %eax > .p2align 4,,7 > L7: > addss %xmm0, %xmm1 > addl $1, %eax > cmpl $256, %eax > addss %xmm3, %xmm1 > je L12 > cvtsi2ss %eax, %xmm0 > mulss %xmm6, %xmm0 > movaps %xmm0, %xmm2 > andps %xmm5, %xmm2 > ucomiss %xmm2, %xmm4 > jbe L7 > cvttss2si %xmm0, %edx > cvtsi2ss %edx, %xmm2 > movaps %xmm2, %xmm7 > cmpnless %xmm0, %xmm7 > movaps %xmm7, %xmm0 > movss LC4, %xmm7 > andps %xmm7, %xmm0 > subss %xmm0, %xmm2 > movaps %xmm2, %xmm0 > jmp L7 > .p2align 4,,7 > L12: > addl $1, %ecx > cmpl $256, %ecx > je L5 > cvtsi2ss %ecx, %xmm3 > movss LC6, %xmm0 > movss LC2, %xmm2 > mulss LC1, %xmm3 > andps %xmm3, %xmm0 > ucomiss %xmm0, %xmm2 > jbe L9 > cvttss2si %xmm3, %eax > cvtsi2ss %eax, %xmm0 > movaps %xmm0, %xmm2 > cmpnless %xmm3, %xmm2 > movss LC4, %xmm3 > andps %xmm3, %xmm2 > movaps %xmm0, %xmm3 > subss %xmm2, %xmm3 > jmp L9 > L5: > subl $1, %ebx > jne L2 > unpcklps %xmm1, %xmm1 > movl $LC5, (%esp) > cvtps2pd %xmm1, %xmm5 > movsd %xmm5, 4(%esp) > call _printf > xorl %eax, %eax > movl -4(%ebp), %ebx > leave > ret > > --------------------- > > test2.d asm: > > __Dmain: > pushl %ebx > pushl %edi > pushl %esi > subl $28, %esp > xorps %xmm0, %xmm0 > xorl %esi, %esi > movss LCPI1_0, %xmm1 > .align 16, 0x90 > LBB1_1: > xorl %edi, %edi > .align 16, 0x90 > LBB1_2: > xorps %xmm2, %xmm2 > cvtsi2ssl %edi, %xmm2 > mulss %xmm1, %xmm2 > movss %xmm2, 12(%esp) > xorl %ebx, %ebx > .align 16, 0x90 > LBB1_3: > movss %xmm0, 16(%esp) > xorps %xmm0, %xmm0 > cvtsi2ssl %ebx, %xmm0 > mulss %xmm1, %xmm0 > movss %xmm0, (%esp) > calll _floorf > movss 12(%esp), %xmm0 > movss %xmm0, (%esp) > fstps 24(%esp) > calll _floorf > movss LCPI1_0, %xmm1 > fstps 20(%esp) > movss 24(%esp), %xmm0 > addss 20(%esp), %xmm0 > movss 16(%esp), %xmm2 > addss %xmm0, %xmm2 > movss %xmm2, 16(%esp) > movss 16(%esp), %xmm0 > incl %ebx > cmpl $256, %ebx > jne 
LBB1_3 > incl %edi > cmpl $256, %edi > jne LBB1_2 > incl %esi > cmpl $1000, %esi > jne LBB1_1 > cvtss2sd %xmm0, %xmm0 > movsd %xmm0, 4(%esp) > movl $_.str, (%esp) > calll ___mingw_printf > xorl %eax, %eax > addl $28, %esp > popl %esi > popl %edi > popl %ebx > ret > > --------------------- > > test3.d asm: > > __Dmain: > pushl %ebx > pushl %edi > pushl %esi > subl $52, %esp > xorps %xmm1, %xmm1 > xorl %esi, %esi > movss LCPI1_0, %xmm2 > .align 16, 0x90 > LBB1_1: > xorl %edi, %edi > .align 16, 0x90 > LBB1_2: > xorps %xmm0, %xmm0 > cvtsi2ssl %edi, %xmm0 > mulss %xmm2, %xmm0 > movss %xmm0, 48(%esp) > xorl %ebx, %ebx > flds 48(%esp) > fstpt 12(%esp) > movaps %xmm1, %xmm0 > .align 16, 0x90 > LBB1_3: > movss %xmm0, 36(%esp) > xorps %xmm0, %xmm0 > cvtsi2ssl %ebx, %xmm0 > mulss %xmm2, %xmm0 > movss %xmm0, 44(%esp) > flds 44(%esp) > fstpt (%esp) > calll __D3std4math5floorFNbNeeZe > subl $12, %esp > fstpt 24(%esp) > fldt 12(%esp) > fstpt (%esp) > calll __D3std4math5floorFNbNeeZe > subl $12, %esp > movss 36(%esp), %xmm0 > movss LCPI1_0, %xmm2 > fldt 24(%esp) > faddp %st(1) > fstps 40(%esp) > addss 40(%esp), %xmm0 > incl %ebx > cmpl $256, %ebx > jne LBB1_3 > movaps %xmm0, %xmm1 > incl %edi > cmpl $256, %edi > jne LBB1_2 > incl %esi > cmpl $1000, %esi > jne LBB1_1 > xorps %xmm0, %xmm0 > cvtss2sd %xmm1, %xmm0 > movsd %xmm0, 4(%esp) > movl $_.str, (%esp) > calll ___mingw_printf > xorl %eax, %eax > addl $52, %esp > popl %esi > popl %edi > popl %ebx > ret > > --------------------- > > Bye, > bearophile but... fast-math isn't kosher -- Marco |
December 19, 2013 Re: floor operation problem | ||||
---|---|---|---|---|
| ||||
Posted in reply to bearophile | I cannot reproduce this on 64-bit Linux.
Compiled the C version with:
gcc -std=c99 -march=native -O3 -s -flto test1.c -o test1 -Wl,-lm
and the D version with:
ldc2 -release -O3 test2.d -of=test2 -ffunction-sections
-fdata-sections -L--gc-sections -vectorize-slp
-vectorize-loops -unit-at-a-time -L-O1 -L--as-needed -L-lrt
-L-znorelro -L--no-copy-dt-needed-entries -L--relax
-L--sort-common -L--export-dynamic
strip test2 -R .comment -R .note.ABI-tag -R .gnu.version
-R .jcr -R .got
Runtimes for both executables are around 0.8s.
--
Marco
|
December 19, 2013 Re: floor operation problem | ||||
---|---|---|---|---|
| ||||
Posted in reply to Marco Leise | Marco Leise: > but... fast-math isn't kosher Programming practice shows that there are many situations today where fast-math is strictly necessary to allow the compiler to perform some important optimizations. > I cannot reproduce this on 64-bit Linux. >... > Runtimes for both executables are around 0.8s. Oh, good, so is it a 32 bit problem? Bye, bearophile |
December 19, 2013 Re: floor operation problem | ||||
---|---|---|---|---|
| ||||
Posted in reply to bearophile | On 19 Dec 2013, at 12:18, bearophile wrote:
> Marco Leise:
>> I cannot reproduce this on 64-bit Linux.
>> ...
>> Runtimes for both executables are around 0.8s.
>
> Oh, good, so is it a 32 bit problem?
I don't think Marco is building his C executable with -ffast-math.
We should definitely be able to provide a switch to enable the same unsafe/wrong FP optimizations in LDC as well.
David
|
December 19, 2013 Re: floor operation problem | ||||
---|---|---|---|---|
| ||||
Posted in reply to David Nadlinger | David Nadlinger: > I don't think Marco is building his C executable with -ffast-math. Showing the asm is a good way to understand what's going on in those 64 bit builds. > We should definitely be able to provide a switch to enable the same unsafe/wrong FP optimizations in LDC as well. So is this floor problem caused by those (missing) FP optimizations? :-) Bye, bearophile |
December 19, 2013 Re: floor operation problem | ||||
---|---|---|---|---|
| ||||
Posted in reply to bearophile | Am Thu, 19 Dec 2013 14:54:56 +0100 schrieb "bearophile" <bearophileHUGS@lycos.com>: > David Nadlinger: > > > I don't think Marco is building his C executable with -ffast-math. > > Showing the asm is a good way to understand what's going on in those 64 bit builds. > > > > We should definitely be able to provide a switch to enable the same unsafe/wrong FP optimizations in LDC as well. > > So is this floor problem caused by those (missing) FP optimizations? :-) > > Bye, > bearophile At first my gcc executable had the same speed as on your computer, bearophile (~1s). Then I removed some of the flags including -ffast-math from the gcc command-line and it became 25% faster -> 0.8s. I didn't try your original ldmd2 command-line, but used one from a generic "omg-uber-optimize" Makefile I often use for stuff like this and immediately had the same performance in D as for C. I cannot show you the disassembly though, I'm in the middle of upgrading to Gnome 3 and most programs don't work, e.g. copy&paste from terminals. -- Marco |
Copyright © 1999-2021 by the D Language Foundation