December 01, 2013
I have found a small performance problem porting some C code to D and compiling it with LDC2. This post shows a reduced version of the D code. In this post I am not using LTO (link time optimization).


This version uses a private module-level N:


private __gshared private uint N = 9;
__gshared int[16 * 16] M;

int bar() nothrow {
    int tot = 0;
    foreach (immutable i; 0 .. N)
        foreach (immutable j; 0 .. N)
            tot += M[i * N + j];
    return tot;
}

int main(in string[] args) nothrow {
    N = 9;
    M[0] = args.length;
    return bar();
}


The asm of the two functions:


__D5test13barFNbZi:
    pushl   %ebp
    pushl   %ebx
    pushl   %edi
    pushl   %esi
    xorl    %eax, %eax
    movl    __D5test11Nk, %ecx
    testl   %ecx, %ecx
    je  LBB0_5
    leal    (,%ecx,4), %edx
    xorl    %eax, %eax
    movl    $__D5test11MG256i, %esi
    xorl    %edi, %edi
    .align  16, 0x90
LBB0_4:
    movl    %esi, %ebx
    movl    %ecx, %ebp
    .align  16, 0x90
LBB0_2:
    addl    (%ebx), %eax
    addl    $4, %ebx
    decl    %ebp
    jne LBB0_2
    addl    %edx, %esi
    incl    %edi
    cmpl    %ecx, %edi
    jne LBB0_4
LBB0_5:
    popl    %esi
    popl    %edi
    popl    %ebx
    popl    %ebp
    ret


__Dmain:
    pushl   %edi
    pushl   %esi
    movl    $9, __D5test11Nk
    movl    12(%esp), %eax
    movl    %eax, __D5test11MG256i
    xorl    %eax, %eax
    movl    $__D5test11MG256i, %ecx
    xorl    %edx, %edx
    .align  16, 0x90
LBB1_3:
    movl    $9, %esi
    movl    %ecx, %edi
    .align  16, 0x90
LBB1_1:
    addl    (%edi), %eax
    addl    $4, %edi
    decl    %esi
    jne LBB1_1
    addl    $36, %ecx
    incl    %edx
    cmpl    $9, %edx
    jne LBB1_3
    popl    %esi
    popl    %edi
    ret



A modified program, now N is a compile-time constant:


enum uint N = 9;
__gshared int[16 * 16] M;

int bar() nothrow {
    int tot = 0;
    foreach (immutable i; 0 .. N)
        foreach (immutable j; 0 .. N)
            tot += M[i * N + j];
    return tot;
}

int main(in string[] args) nothrow {
    //N = 9;
    M[0] = args.length;
    return bar();
}



And the asm of the two functions:

__D5test23barFNbZi:
    xorl    %eax, %eax
    movl    $-324, %ecx
    .align  16, 0x90
LBB0_1:
    addl    __D5test21MG256i+324(%ecx), %eax
    addl    __D5test21MG256i+328(%ecx), %eax
    addl    __D5test21MG256i+332(%ecx), %eax
    addl    __D5test21MG256i+336(%ecx), %eax
    addl    __D5test21MG256i+340(%ecx), %eax
    addl    __D5test21MG256i+344(%ecx), %eax
    addl    __D5test21MG256i+348(%ecx), %eax
    addl    __D5test21MG256i+352(%ecx), %eax
    addl    __D5test21MG256i+356(%ecx), %eax
    addl    $36, %ecx
    jne LBB0_1
    ret


__Dmain:
    movl    4(%esp), %eax
    movl    %eax, __D5test21MG256i
    xorl    %eax, %eax
    movl    $-324, %ecx
    .align  16, 0x90
LBB1_1:
    addl    __D5test21MG256i+324(%ecx), %eax
    addl    __D5test21MG256i+328(%ecx), %eax
    addl    __D5test21MG256i+332(%ecx), %eax
    addl    __D5test21MG256i+336(%ecx), %eax
    addl    __D5test21MG256i+340(%ecx), %eax
    addl    __D5test21MG256i+344(%ecx), %eax
    addl    __D5test21MG256i+348(%ecx), %eax
    addl    __D5test21MG256i+352(%ecx), %eax
    addl    __D5test21MG256i+356(%ecx), %eax
    addl    $36, %ecx
    jne LBB1_1
    ret


C code, with N global variable:

unsigned int N = 9;
int M [16 * 16];

int bar() {
    int tot = 0;
    int i, j;
    for (i = 0; i < N; i++)
        for (j = 0; j < N; j++)
            tot += M[i * N + j];
    return tot;
}

int main(int argc, char** argv) {
    N = 9;
    M[0] = argc;
    return bar();
}


gcc 4.8.0 with -Ofast -S gives:


_bar:
	pushl	%edi
	pushl	%esi
	pushl	%ebx
	movl	_N, %ebx
	testl	%ebx, %ebx
	je	L6
	xorl	%esi, %esi
	xorl	%edi, %edi
	xorl	%eax, %eax
L3:
	imull	%ebx, %esi
	xorl	%ecx, %ecx
	xorl	%edx, %edx
	.p2align 4,,7
L5:
	addl	%esi, %ecx
	addl	$1, %edx
	addl	_M(,%ecx,4), %eax
	cmpl	%ebx, %edx
	movl	%edx, %ecx
	jne	L5
	addl	$1, %edi
	cmpl	%ebx, %edi
	movl	%edi, %esi
	jne	L3
	popl	%ebx
	popl	%esi
	popl	%edi
	ret
L6:
	popl	%ebx
	xorl	%eax, %eax
	popl	%esi
	popl	%edi
	ret


_main:
	pushl	%ebp
	movl	%esp, %ebp
	pushl	%esi
	movl	8(%ebp), %esi
	pushl	%ebx
	movl	$_M+36, %ebx
	andl	$-16, %esp
	call	___main
	xorl	%eax, %eax
	xorl	%ecx, %ecx
	movl	$9, _N
	xorl	%edx, %edx
	movl	%esi, _M
	jmp	L11
	.p2align 4,,7
L13:
	movl	(%ebx), %esi
	addl	$36, %ebx
L11:
	leal	(%edx,%edx,8), %edx
	addl	$1, %ecx
	addl	%esi, %eax
	addl	_M+4(,%edx,4), %eax
	addl	_M+8(,%edx,4), %eax
	addl	_M+12(,%edx,4), %eax
	addl	_M+16(,%edx,4), %eax
	addl	_M+20(,%edx,4), %eax
	addl	_M+24(,%edx,4), %eax
	addl	_M+28(,%edx,4), %eax
	addl	_M+32(,%edx,4), %eax
	cmpl	$9, %ecx
	movl	%ecx, %edx
	jne	L13
	leal	-8(%ebp), %esp
	popl	%ebx
	popl	%esi
	popl	%ebp
	ret


As you see the asm of the function bar is comparable, but in the main the inlined bar is more efficient.

I have seen that the performance difference in my translation is removed if I replace a global variable with a global constant. But I don't know know if the asm code I've shown in this post is really showing the problem that caused the slowdown in the D version of the code.

Is ldc2 able to perform LTO with just a handy compilation switch (like -flto of GCC)?

Bye,
bearophile