December 01, 2013 A small problem | ||||
---|---|---|---|---|
| ||||
I have found a small performance problem porting some C code to D and compiling it with LDC2. This post shows a reduced version of the D code. In this post I am not using LTO (link time optimization). This version uses a private module-level N: private __gshared private uint N = 9; __gshared int[16 * 16] M; int bar() nothrow { int tot = 0; foreach (immutable i; 0 .. N) foreach (immutable j; 0 .. N) tot += M[i * N + j]; return tot; } int main(in string[] args) nothrow { N = 9; M[0] = args.length; return bar(); } The asm of the two functions: __D5test13barFNbZi: pushl %ebp pushl %ebx pushl %edi pushl %esi xorl %eax, %eax movl __D5test11Nk, %ecx testl %ecx, %ecx je LBB0_5 leal (,%ecx,4), %edx xorl %eax, %eax movl $__D5test11MG256i, %esi xorl %edi, %edi .align 16, 0x90 LBB0_4: movl %esi, %ebx movl %ecx, %ebp .align 16, 0x90 LBB0_2: addl (%ebx), %eax addl $4, %ebx decl %ebp jne LBB0_2 addl %edx, %esi incl %edi cmpl %ecx, %edi jne LBB0_4 LBB0_5: popl %esi popl %edi popl %ebx popl %ebp ret __Dmain: pushl %edi pushl %esi movl $9, __D5test11Nk movl 12(%esp), %eax movl %eax, __D5test11MG256i xorl %eax, %eax movl $__D5test11MG256i, %ecx xorl %edx, %edx .align 16, 0x90 LBB1_3: movl $9, %esi movl %ecx, %edi .align 16, 0x90 LBB1_1: addl (%edi), %eax addl $4, %edi decl %esi jne LBB1_1 addl $36, %ecx incl %edx cmpl $9, %edx jne LBB1_3 popl %esi popl %edi ret A modified program, now N is a compile-time constant: enum uint N = 9; __gshared int[16 * 16] M; int bar() nothrow { int tot = 0; foreach (immutable i; 0 .. N) foreach (immutable j; 0 .. N) tot += M[i * N + j]; return tot; } int main(in string[] args) nothrow { //N = 9; M[0] = args.length; return bar(); } And the asm of the two functions: __D5test23barFNbZi: xorl %eax, %eax movl $-324, %ecx .align 16, 0x90 LBB0_1: addl __D5test21MG256i+324(%ecx), %eax addl __D5test21MG256i+328(%ecx), %eax addl __D5test21MG256i+332(%ecx), %eax addl __D5test21MG256i+336(%ecx), %eax addl __D5test21MG256i+340(%ecx), %eax addl __D5test21MG256i+344(%ecx), %eax addl __D5test21MG256i+348(%ecx), %eax addl __D5test21MG256i+352(%ecx), %eax addl __D5test21MG256i+356(%ecx), %eax addl $36, %ecx jne LBB0_1 ret __Dmain: movl 4(%esp), %eax movl %eax, __D5test21MG256i xorl %eax, %eax movl $-324, %ecx .align 16, 0x90 LBB1_1: addl __D5test21MG256i+324(%ecx), %eax addl __D5test21MG256i+328(%ecx), %eax addl __D5test21MG256i+332(%ecx), %eax addl __D5test21MG256i+336(%ecx), %eax addl __D5test21MG256i+340(%ecx), %eax addl __D5test21MG256i+344(%ecx), %eax addl __D5test21MG256i+348(%ecx), %eax addl __D5test21MG256i+352(%ecx), %eax addl __D5test21MG256i+356(%ecx), %eax addl $36, %ecx jne LBB1_1 ret C code, with N global variable: unsigned int N = 9; int M [16 * 16]; int bar() { int tot = 0; int i, j; for (i = 0; i < N; i++) for (j = 0; j < N; j++) tot += M[i * N + j]; return tot; } int main(int argc, char** argv) { N = 9; M[0] = argc; return bar(); } gcc 4.8.0 with -Ofast -S gives: _bar: pushl %edi pushl %esi pushl %ebx movl _N, %ebx testl %ebx, %ebx je L6 xorl %esi, %esi xorl %edi, %edi xorl %eax, %eax L3: imull %ebx, %esi xorl %ecx, %ecx xorl %edx, %edx .p2align 4,,7 L5: addl %esi, %ecx addl $1, %edx addl _M(,%ecx,4), %eax cmpl %ebx, %edx movl %edx, %ecx jne L5 addl $1, %edi cmpl %ebx, %edi movl %edi, %esi jne L3 popl %ebx popl %esi popl %edi ret L6: popl %ebx xorl %eax, %eax popl %esi popl %edi ret _main: pushl %ebp movl %esp, %ebp pushl %esi movl 8(%ebp), %esi pushl %ebx movl $_M+36, %ebx andl $-16, %esp call ___main xorl %eax, %eax xorl %ecx, %ecx movl $9, _N xorl %edx, %edx movl %esi, _M jmp L11 .p2align 4,,7 L13: movl (%ebx), %esi addl $36, %ebx L11: leal (%edx,%edx,8), %edx addl $1, %ecx addl %esi, %eax addl _M+4(,%edx,4), %eax addl _M+8(,%edx,4), %eax addl _M+12(,%edx,4), %eax addl _M+16(,%edx,4), %eax addl _M+20(,%edx,4), %eax addl _M+24(,%edx,4), %eax addl _M+28(,%edx,4), %eax addl _M+32(,%edx,4), %eax cmpl $9, %ecx movl %ecx, %edx jne L13 leal -8(%ebp), %esp popl %ebx popl %esi popl %ebp ret As you see the asm of the function bar is comparable, but in the main the inlined bar is more efficient. I have seen that the performance difference in my translation is removed if I replace a global variable with a global constant. But I don't know know if the asm code I've shown in this post is really showing the problem that caused the slowdown in the D version of the code. Is ldc2 able to perform LTO with just a handy compilation switch (like -flto of GCC)? Bye, bearophile |
Copyright © 1999-2021 by the D Language Foundation