Thread overview
floor operation problem
Dec 19, 2013
bearophile
Dec 19, 2013
Marco Leise
Dec 19, 2013
Marco Leise
Dec 19, 2013
bearophile
Dec 19, 2013
David Nadlinger
Dec 19, 2013
bearophile
Dec 19, 2013
Marco Leise
December 19, 2013
While I was debugging a performance problem, I have found the cause is the floor operation. Below there is a small benchmark to show it.

I have compiled the code with:

gcc -Ofast -std=c99 -s -flto -mfpmath=sse -ffast-math -msse3 test1.c -o test1
ldmd2 -O -release -inline -noboundscheck test2.d
ldmd2 -O -release -inline -noboundscheck test3.d

32 bit system

gcc version 4.8.0
LDC 0.12.1 (based on DMD v2.063.2 and LLVM 3.3.1)

Run-time, seconds:
test1.c: 1.01
test2.d: 4.14
test3.d: 6.62

---------------------

// test1.c
#include <stdio.h>
#include <math.h>
#include <stdlib.h>

/* Returns floorf(x) + floorf(y), evaluated in single precision.
 * This is the operation whose cost the benchmark measures. */
static inline float foo(const float x, const float y) {
    return floorf(x) + floorf(y);
}

/* Benchmark driver: calls foo() on a 256 x 256 grid of (x * 0.1f, y * 0.1f)
 * points, repeats the whole grid 1000 times, and prints the accumulated sum. */
int main() {
    /* Single-precision accumulator; printing it keeps the loops observable. */
    float total = 0.0f;

    /* i only repeats the work; it does not feed into the computation. */
    for (int i = 0; i < 1000; i++)
        for (int y = 0; y < 256; y++)
            for (int x = 0; x < 256; x++)
                total += foo(x * 0.1f, y * 0.1f);

    printf("%f\n", total);
    return 0;
}

---------------------

// test2.d
import core.stdc.stdio, core.stdc.math;

/// Returns floorf(x) + floorf(y), using the C library floorf made available
/// by the core.stdc.math import of this file.
float foo(const float x, const float y) nothrow {
    return floorf(x) + floorf(y);
}

/// Benchmark driver: calls foo() on a 256 x 256 grid of (x * 0.1f, y * 0.1f)
/// points, repeats the whole grid 1000 times, and prints the accumulated sum.
int main() {
    // Single-precision accumulator; printing it keeps the loops observable.
    float total = 0.0f;

    // i only repeats the work; it does not feed into the computation.
    for (int i = 0; i < 1000; i++)
        for (int y = 0; y < 256; y++)
            for (int x = 0; x < 256; x++)
                total += foo(x * 0.1f, y * 0.1f);

    printf("%f\n", total);
    return 0;
}

---------------------

// test3.d
import core.stdc.stdio, std.math;

/// Returns floor(x) + floor(y), using floor from the std.math import of this
/// file (test2.d differs only in calling the C library floorf instead).
float foo(const float x, const float y) nothrow {
    return floor(x) + floor(y);
}

/// Benchmark driver: calls foo() on a 256 x 256 grid of (x * 0.1f, y * 0.1f)
/// points, repeats the whole grid 1000 times, and prints the accumulated sum.
int main() {
    // Single-precision accumulator; printing it keeps the loops observable.
    float total = 0.0f;

    // i only repeats the work; it does not feed into the computation.
    for (int i = 0; i < 1000; i++)
        for (int y = 0; y < 256; y++)
            for (int x = 0; x < 256; x++)
                total += foo(x * 0.1f, y * 0.1f);

    printf("%f\n", total);
    return 0;
}

---------------------

test1.c asm:

# Compiler output for main() of test1.c (32-bit x86, AT&T syntax: op src, dst).
# There is no call to floorf anywhere in this listing: the rounding is done
# inline with SSE conversions, and the only calls are ___main and _printf.
# Register roles as used below:
#   ebx  = repetition counter, starts at 1000 and counts down to 0
#   ecx  = middle-loop counter (0..255), appears to be the source's y
#   eax  = inner-loop counter (0..255), appears to be the source's x
#   xmm1 = running total
#   xmm0 = rounded value for the current inner-loop counter
#   xmm3 = rounded value for the current middle-loop counter
# The constants LC1..LC6 are defined outside this listing; the notes on what
# they hold are assumptions to be confirmed against the full assembly.
_main:
    pushl   %ebp
    movl    %esp, %ebp
    pushl   %ebx
    movl    $1000, %ebx
    # Align the stack to 16 bytes and reserve 16 bytes for outgoing arguments.
    andl    $-16, %esp
    subl    $16, %esp
    call    ___main
    xorps   %xmm1, %xmm1
    # LC3 is used only as an andps mask (presumably clears the sign bit).
    movss   LC3, %xmm5
L2:
    # LC1 is the multiplier applied to the loop counters (presumably 0.1f).
    movss   LC1, %xmm6
    # Counter 0 needs no rounding: its value is simply zero.
    xorps   %xmm3, %xmm3
    xorl    %ecx, %ecx
    .p2align 4,,7
L9:
    # LC2 is the bound the masked value is compared against before rounding
    # (presumably the magnitude above which a float is already integral).
    movss   LC2, %xmm4
    xorps   %xmm0, %xmm0
    xorl    %eax, %eax
    .p2align 4,,7
L7:
    # The two terms are added to the total one at a time; the source's
    # floorf(x) + floorf(y) is never formed as a separate sum.
    addss   %xmm0, %xmm1
    addl    $1, %eax
    cmpl    $256, %eax
    addss   %xmm3, %xmm1
    je  L12
    # xmm0 = (float)eax * LC1
    cvtsi2ss    %eax, %xmm0
    mulss   %xmm6, %xmm0
    movaps  %xmm0, %xmm2
    andps   %xmm5, %xmm2
    # Skip the rounding when LC2 <= masked value (or the compare is unordered);
    # xmm0 is then used unrounded.
    ucomiss %xmm2, %xmm4
    jbe L7
    # Truncate toward zero and convert back to float.
    cvttss2si   %xmm0, %edx
    cvtsi2ss    %edx, %xmm2
    movaps  %xmm2, %xmm7
    # xmm7 = all-ones mask when the truncated value is not <= the original.
    cmpnless    %xmm0, %xmm7
    movaps  %xmm7, %xmm0
    # LC4 is the correction subtracted when the mask is set (presumably 1.0f).
    movss   LC4, %xmm7
    andps   %xmm7, %xmm0
    subss   %xmm0, %xmm2
    movaps  %xmm2, %xmm0
    jmp L7
    .p2align 4,,7
L12:
    # Inner loop finished: advance the middle counter and recompute xmm3 with
    # the same truncate-and-correct sequence as above.
    addl    $1, %ecx
    cmpl    $256, %ecx
    je  L5
    cvtsi2ss    %ecx, %xmm3
    # LC6 plays the same masking role here that LC3 plays in the inner loop.
    movss   LC6, %xmm0
    movss   LC2, %xmm2
    mulss   LC1, %xmm3
    andps   %xmm3, %xmm0
    ucomiss %xmm0, %xmm2
    jbe L9
    cvttss2si   %xmm3, %eax
    cvtsi2ss    %eax, %xmm0
    movaps  %xmm0, %xmm2
    cmpnless    %xmm3, %xmm2
    movss   LC4, %xmm3
    andps   %xmm3, %xmm2
    movaps  %xmm0, %xmm3
    subss   %xmm2, %xmm3
    jmp L9
L5:
    subl    $1, %ebx
    jne L2
    # Widen the float total to double and pass it with LC5 (presumably the
    # "%f\n" format string) to printf on the stack.
    unpcklps    %xmm1, %xmm1
    movl    $LC5, (%esp)
    cvtps2pd    %xmm1, %xmm5
    movsd   %xmm5, 4(%esp)
    call    _printf
    # Return 0, restoring the saved ebx before tearing down the frame.
    xorl    %eax, %eax
    movl    -4(%ebp), %ebx
    leave
    ret

---------------------

test2.d asm:

# Compiler output for main() of test2.d (32-bit x86, AT&T syntax: op src, dst).
# Unlike the test1.c listing, floorf is a real call made twice per inner
# iteration. Each argument is passed on the stack and each result is popped
# from the x87 stack with fstps, so every value makes a round trip through
# memory between the SSE and x87 units.
# Register roles as used below:
#   esi = repetition counter (0..999)
#   edi = middle-loop counter (0..255), appears to be the source's y
#   ebx = inner-loop counter (0..255), appears to be the source's x
#   xmm0 = running total on loop entry and exit
#   xmm1 = multiplier loaded from LCPI1_0 (defined outside this listing,
#          presumably 0.1f)
# Stack slots: 12(%esp) = scaled middle counter, 16(%esp) = spilled total,
#   24(%esp) / 20(%esp) = results of the first / second floorf call.
__Dmain:
	pushl	%ebx
	pushl	%edi
	pushl	%esi
	subl	$28, %esp
	xorps	%xmm0, %xmm0
	xorl	%esi, %esi
	movss	LCPI1_0, %xmm1
	.align	16, 0x90
LBB1_1:
	xorl	%edi, %edi
	.align	16, 0x90
LBB1_2:
	# Scale the middle counter once per row and keep it in memory.
	xorps	%xmm2, %xmm2
	cvtsi2ssl	%edi, %xmm2
	mulss	%xmm1, %xmm2
	movss	%xmm2, 12(%esp)
	xorl	%ebx, %ebx
	.align	16, 0x90
LBB1_3:
	# Spill the running total before the calls.
	movss	%xmm0, 16(%esp)
	# First call: floorf((float)ebx * xmm1), argument at (%esp).
	xorps	%xmm0, %xmm0
	cvtsi2ssl	%ebx, %xmm0
	mulss	%xmm1, %xmm0
	movss	%xmm0, (%esp)
	calll	_floorf
	# Set up the second argument, then pop the first result off the x87 stack.
	movss	12(%esp), %xmm0
	movss	%xmm0, (%esp)
	fstps	24(%esp)
	calll	_floorf
	# Reload the multiplier for the next iteration.
	movss	LCPI1_0, %xmm1
	fstps	20(%esp)
	# Sum the two results, then add the sum to the spilled total.
	movss	24(%esp), %xmm0
	addss	20(%esp), %xmm0
	movss	16(%esp), %xmm2
	addss	%xmm0, %xmm2
	# The total is stored and immediately reloaded from the same slot.
	movss	%xmm2, 16(%esp)
	movss	16(%esp), %xmm0
	incl	%ebx
	cmpl	$256, %ebx
	jne	LBB1_3
	incl	%edi
	cmpl	$256, %edi
	jne	LBB1_2
	incl	%esi
	cmpl	$1000, %esi
	jne	LBB1_1
	# Widen the total to double and print it with the format string _.str.
	cvtss2sd	%xmm0, %xmm0
	movsd	%xmm0, 4(%esp)
	movl	$_.str, (%esp)
	calll	___mingw_printf
	# Return 0 and restore the registers pushed on entry.
	xorl	%eax, %eax
	addl	$28, %esp
	popl	%esi
	popl	%edi
	popl	%ebx
	ret

---------------------

test3.d asm:

# Compiler output for main() of test3.d (32-bit x86, AT&T syntax: op src, dst).
# floor is a real call to __D3std4math5floorFNbNeeZe (the mangled name contains
# std.math.floor), made twice per inner iteration. Arguments and results are
# moved with fstpt/fldt, i.e. as 80-bit x87 extended values, so each float is
# widened through memory before the call and narrowed again afterwards.
# Register roles as used below:
#   esi = repetition counter (0..999)
#   edi = middle-loop counter (0..255), appears to be the source's y
#   ebx = inner-loop counter (0..255), appears to be the source's x
#   xmm1 = running total between rows, xmm0 = running total inside a row
#   xmm2 = multiplier loaded from LCPI1_0 (defined outside this listing,
#          presumably 0.1f)
# Stack slots: 12(%esp) = scaled middle counter (80-bit), 24(%esp) = first
#   call's result (80-bit), 36(%esp) = spilled total, 40/44/48(%esp) = float
#   temporaries used to move values between SSE and x87.
__Dmain:
    pushl   %ebx
    pushl   %edi
    pushl   %esi
    subl    $52, %esp
    xorps   %xmm1, %xmm1
    xorl    %esi, %esi
    movss   LCPI1_0, %xmm2
    .align  16, 0x90
LBB1_1:
    xorl    %edi, %edi
    .align  16, 0x90
LBB1_2:
    # Scale the middle counter once per row, then widen it to 80 bits by
    # storing it as a float and reloading it through the x87 unit.
    xorps   %xmm0, %xmm0
    cvtsi2ssl   %edi, %xmm0
    mulss   %xmm2, %xmm0
    movss   %xmm0, 48(%esp)
    xorl    %ebx, %ebx
    flds    48(%esp)
    fstpt   12(%esp)
    movaps  %xmm1, %xmm0
    .align  16, 0x90
LBB1_3:
    # Spill the running total before the calls.
    movss   %xmm0, 36(%esp)
    # First call: the scaled inner counter, widened to 80 bits, at (%esp).
    xorps   %xmm0, %xmm0
    cvtsi2ssl   %ebx, %xmm0
    mulss   %xmm2, %xmm0
    movss   %xmm0, 44(%esp)
    flds    44(%esp)
    fstpt   (%esp)
    calll   __D3std4math5floorFNbNeeZe
    # NOTE(review): esp is lowered by 12 after each call, presumably because
    # the callee pops its own 12-byte argument - confirm against the D ABI.
    subl    $12, %esp
    # Save the first result, then pass the scaled middle counter.
    fstpt   24(%esp)
    fldt    12(%esp)
    fstpt   (%esp)
    calll   __D3std4math5floorFNbNeeZe
    subl    $12, %esp
    # Reload the total and the multiplier after the calls.
    movss   36(%esp), %xmm0
    movss   LCPI1_0, %xmm2
    # Add the two results on the x87 stack, narrow the sum to float through
    # memory, and add it to the total in SSE.
    fldt    24(%esp)
    faddp   %st(1)
    fstps   40(%esp)
    addss   40(%esp), %xmm0
    incl    %ebx
    cmpl    $256, %ebx
    jne LBB1_3
    movaps  %xmm0, %xmm1
    incl    %edi
    cmpl    $256, %edi
    jne LBB1_2
    incl    %esi
    cmpl    $1000, %esi
    jne LBB1_1
    # Widen the total to double and print it with the format string _.str.
    xorps   %xmm0, %xmm0
    cvtss2sd    %xmm1, %xmm0
    movsd   %xmm0, 4(%esp)
    movl    $_.str, (%esp)
    calll   ___mingw_printf
    # Return 0 and restore the registers pushed on entry.
    xorl    %eax, %eax
    addl    $52, %esp
    popl    %esi
    popl    %edi
    popl    %ebx
    ret

---------------------

Bye,
bearophile
December 19, 2013
Am Thu, 19 Dec 2013 01:15:27 +0100
schrieb "bearophile" <bearophileHUGS@lycos.com>:

> While I was debugging a performance problem, I have found the cause is the floor operation. Below there is a small benchmark to show it.
> 
> I have compiled the code with:
> 
> gcc -Ofast -std=c99 -s -flto -mfpmath=sse -ffast-math -msse3
> test1.c -o test1
> ldmd2 -O -release -inline -noboundscheck test2.d
> ldmd2 -O -release -inline -noboundscheck test3.d
> 
> 32 bit system
> 
> gcc version 4.8.0
> LDC 0.12.1 (based on DMD v2.063.2 and LLVM 3.3.1)
> 
> Run-time, seconds:
> test1.c: 1.01
> test2.d: 4.14
> test3.d: 6.62
> 
> ---------------------
> 
> // test1.c
> #include <stdio.h>
> #include <math.h>
> #include <stdlib.h>
> 
> static inline float foo(const float x, const float y) {
>      return floorf(x) + floorf(y);
> }
> 
> int main() {
>      float total = 0.0f;
> 
>      for (int i = 0; i < 1000; i++)
>          for (int y = 0; y < 256; y++)
>              for (int x = 0; x < 256; x++)
>                  total += foo(x * 0.1f, y * 0.1f);
> 
>      printf("%f\n", total);
>      return 0;
> }
> 
> ---------------------
> 
> // test2.d
> import core.stdc.stdio, core.stdc.math;
> 
> float foo(const float x, const float y) nothrow {
>      return floorf(x) + floorf(y);
> }
> 
> int main() {
>      float total = 0.0f;
> 
>      for (int i = 0; i < 1000; i++)
>          for (int y = 0; y < 256; y++)
>              for (int x = 0; x < 256; x++)
>                  total += foo(x * 0.1f, y * 0.1f);
> 
>      printf("%f\n", total);
>      return 0;
> }
> 
> ---------------------
> 
> // test3.d
> import core.stdc.stdio, std.math;
> 
> float foo(const float x, const float y) nothrow {
>      return floor(x) + floor(y);
> }
> 
> int main() {
>      float total = 0.0f;
> 
>      for (int i = 0; i < 1000; i++)
>          for (int y = 0; y < 256; y++)
>              for (int x = 0; x < 256; x++)
>                  total += foo(x * 0.1f, y * 0.1f);
> 
>      printf("%f\n", total);
>      return 0;
> }
> 
> ---------------------
> 
> test1.c asm:
> 
> _main:
>      pushl   %ebp
>      movl    %esp, %ebp
>      pushl   %ebx
>      movl    $1000, %ebx
>      andl    $-16, %esp
>      subl    $16, %esp
>      call    ___main
>      xorps   %xmm1, %xmm1
>      movss   LC3, %xmm5
> L2:
>      movss   LC1, %xmm6
>      xorps   %xmm3, %xmm3
>      xorl    %ecx, %ecx
>      .p2align 4,,7
> L9:
>      movss   LC2, %xmm4
>      xorps   %xmm0, %xmm0
>      xorl    %eax, %eax
>      .p2align 4,,7
> L7:
>      addss   %xmm0, %xmm1
>      addl    $1, %eax
>      cmpl    $256, %eax
>      addss   %xmm3, %xmm1
>      je  L12
>      cvtsi2ss    %eax, %xmm0
>      mulss   %xmm6, %xmm0
>      movaps  %xmm0, %xmm2
>      andps   %xmm5, %xmm2
>      ucomiss %xmm2, %xmm4
>      jbe L7
>      cvttss2si   %xmm0, %edx
>      cvtsi2ss    %edx, %xmm2
>      movaps  %xmm2, %xmm7
>      cmpnless    %xmm0, %xmm7
>      movaps  %xmm7, %xmm0
>      movss   LC4, %xmm7
>      andps   %xmm7, %xmm0
>      subss   %xmm0, %xmm2
>      movaps  %xmm2, %xmm0
>      jmp L7
>      .p2align 4,,7
> L12:
>      addl    $1, %ecx
>      cmpl    $256, %ecx
>      je  L5
>      cvtsi2ss    %ecx, %xmm3
>      movss   LC6, %xmm0
>      movss   LC2, %xmm2
>      mulss   LC1, %xmm3
>      andps   %xmm3, %xmm0
>      ucomiss %xmm0, %xmm2
>      jbe L9
>      cvttss2si   %xmm3, %eax
>      cvtsi2ss    %eax, %xmm0
>      movaps  %xmm0, %xmm2
>      cmpnless    %xmm3, %xmm2
>      movss   LC4, %xmm3
>      andps   %xmm3, %xmm2
>      movaps  %xmm0, %xmm3
>      subss   %xmm2, %xmm3
>      jmp L9
> L5:
>      subl    $1, %ebx
>      jne L2
>      unpcklps    %xmm1, %xmm1
>      movl    $LC5, (%esp)
>      cvtps2pd    %xmm1, %xmm5
>      movsd   %xmm5, 4(%esp)
>      call    _printf
>      xorl    %eax, %eax
>      movl    -4(%ebp), %ebx
>      leave
>      ret
> 
> ---------------------
> 
> test2.d asm:
> 
> __Dmain:
> 	pushl	%ebx
> 	pushl	%edi
> 	pushl	%esi
> 	subl	$28, %esp
> 	xorps	%xmm0, %xmm0
> 	xorl	%esi, %esi
> 	movss	LCPI1_0, %xmm1
> 	.align	16, 0x90
> LBB1_1:
> 	xorl	%edi, %edi
> 	.align	16, 0x90
> LBB1_2:
> 	xorps	%xmm2, %xmm2
> 	cvtsi2ssl	%edi, %xmm2
> 	mulss	%xmm1, %xmm2
> 	movss	%xmm2, 12(%esp)
> 	xorl	%ebx, %ebx
> 	.align	16, 0x90
> LBB1_3:
> 	movss	%xmm0, 16(%esp)
> 	xorps	%xmm0, %xmm0
> 	cvtsi2ssl	%ebx, %xmm0
> 	mulss	%xmm1, %xmm0
> 	movss	%xmm0, (%esp)
> 	calll	_floorf
> 	movss	12(%esp), %xmm0
> 	movss	%xmm0, (%esp)
> 	fstps	24(%esp)
> 	calll	_floorf
> 	movss	LCPI1_0, %xmm1
> 	fstps	20(%esp)
> 	movss	24(%esp), %xmm0
> 	addss	20(%esp), %xmm0
> 	movss	16(%esp), %xmm2
> 	addss	%xmm0, %xmm2
> 	movss	%xmm2, 16(%esp)
> 	movss	16(%esp), %xmm0
> 	incl	%ebx
> 	cmpl	$256, %ebx
> 	jne	LBB1_3
> 	incl	%edi
> 	cmpl	$256, %edi
> 	jne	LBB1_2
> 	incl	%esi
> 	cmpl	$1000, %esi
> 	jne	LBB1_1
> 	cvtss2sd	%xmm0, %xmm0
> 	movsd	%xmm0, 4(%esp)
> 	movl	$_.str, (%esp)
> 	calll	___mingw_printf
> 	xorl	%eax, %eax
> 	addl	$28, %esp
> 	popl	%esi
> 	popl	%edi
> 	popl	%ebx
> 	ret
> 
> ---------------------
> 
> test3.d asm:
> 
> __Dmain:
>      pushl   %ebx
>      pushl   %edi
>      pushl   %esi
>      subl    $52, %esp
>      xorps   %xmm1, %xmm1
>      xorl    %esi, %esi
>      movss   LCPI1_0, %xmm2
>      .align  16, 0x90
> LBB1_1:
>      xorl    %edi, %edi
>      .align  16, 0x90
> LBB1_2:
>      xorps   %xmm0, %xmm0
>      cvtsi2ssl   %edi, %xmm0
>      mulss   %xmm2, %xmm0
>      movss   %xmm0, 48(%esp)
>      xorl    %ebx, %ebx
>      flds    48(%esp)
>      fstpt   12(%esp)
>      movaps  %xmm1, %xmm0
>      .align  16, 0x90
> LBB1_3:
>      movss   %xmm0, 36(%esp)
>      xorps   %xmm0, %xmm0
>      cvtsi2ssl   %ebx, %xmm0
>      mulss   %xmm2, %xmm0
>      movss   %xmm0, 44(%esp)
>      flds    44(%esp)
>      fstpt   (%esp)
>      calll   __D3std4math5floorFNbNeeZe
>      subl    $12, %esp
>      fstpt   24(%esp)
>      fldt    12(%esp)
>      fstpt   (%esp)
>      calll   __D3std4math5floorFNbNeeZe
>      subl    $12, %esp
>      movss   36(%esp), %xmm0
>      movss   LCPI1_0, %xmm2
>      fldt    24(%esp)
>      faddp   %st(1)
>      fstps   40(%esp)
>      addss   40(%esp), %xmm0
>      incl    %ebx
>      cmpl    $256, %ebx
>      jne LBB1_3
>      movaps  %xmm0, %xmm1
>      incl    %edi
>      cmpl    $256, %edi
>      jne LBB1_2
>      incl    %esi
>      cmpl    $1000, %esi
>      jne LBB1_1
>      xorps   %xmm0, %xmm0
>      cvtss2sd    %xmm1, %xmm0
>      movsd   %xmm0, 4(%esp)
>      movl    $_.str, (%esp)
>      calll   ___mingw_printf
>      xorl    %eax, %eax
>      addl    $52, %esp
>      popl    %esi
>      popl    %edi
>      popl    %ebx
>      ret
> 
> ---------------------
> 
> Bye,
> bearophile

but... fast-math isn't kosher

-- 
Marco

December 19, 2013
I cannot reproduce this on 64-bit Linux.

Compiled the C version with:
gcc -std=c99 -march=native -O3 -s -flto test1.c -o test1 -Wl,-lm

and the D version with:
ldc2 -release -O3 test2.d -of=test2 -ffunction-sections
-fdata-sections -L--gc-sections -vectorize-slp
-vectorize-loops -unit-at-a-time -L-O1 -L--as-needed -L-lrt
-L-znorelro -L--no-copy-dt-needed-entries -L--relax
-L--sort-common -L--export-dynamic
strip test2 -R .comment -R .note.ABI-tag -R .gnu.version
-R .jcr -R .got

Runtimes for both executables are around 0.8s.

-- 
Marco

December 19, 2013
Marco Leise:

> but... fast-math isn't kosher

Practice of programming shows that there are many situations where today fast-math is strictly necessary to allow the compiler to perform some important optimizations.


> I cannot reproduce this on 64-bit Linux.
>...
> Runtimes for both executables are around 0.8s.

Oh, good, so is it a 32 bit problem?

Bye,
bearophile
December 19, 2013
On 19 Dec 2013, at 12:18, bearophile wrote:
> Marco Leise:
>> I cannot reproduce this on 64-bit Linux.
>> ...
>> Runtimes for both executables are around 0.8s.
>
> Oh, good, so is it a 32 bit problem?

I don't think Marco is building his C executable with -ffast-math.

We should definitely be able to provide a switch to enable the same unsafe/wrong FP optimizations in LDC as well.

David
December 19, 2013
David Nadlinger:

> I don't think Marco is building his C executable with -ffast-math.

Showing the asm is a good way to understand what's going on in those 64 bit builds.


> We should definitely be able to provide a switch to enable the same unsafe/wrong FP optimizations in LDC as well.

So is this floor problem caused by those (missing) FP optimizations? :-)

Bye,
bearophile
December 19, 2013
Am Thu, 19 Dec 2013 14:54:56 +0100
schrieb "bearophile" <bearophileHUGS@lycos.com>:

> David Nadlinger:
> 
> > I don't think Marco is building his C executable with -ffast-math.
> 
> Showing the asm is a good way to understand what's going on in those 64 bit builds.
> 
> 
> > We should definitely be able to provide a switch to enable the same unsafe/wrong FP optimizations in LDC as well.
> 
> So is this floor problem caused by those (missing) FP optimizations? :-)
> 
> Bye,
> bearophile

At first my gcc executable had the same speed as on your computer, bearophile (~1s). Then I removed some of the flags including -ffast-math from the gcc command-line and it became 25% faster -> 0.8s. I didn't try your original ldmd2 command-line, but one from a generic "omg-uber-optimize" Makefile I often use for stuff like this and immediately had the same performance in D as for C. I cannot show you the disassembly though, I'm in the middle of upgrading to Gnome 3 and most programs don't work, like e.g. copy&paste from terminals.

-- 
Marco