Thread overview | ||||||||||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
March 04, 2013 How to tune numerical D? (matrix multiplication is faster in g++ vs gdc) | ||||
---|---|---|---|---|
| ||||
Dear D pros, As a fan of D, I was hoping to be able to get similar results as this fellow on stack overflow, by noting his tuning steps; http://stackoverflow.com/questions/5142366/how-fast-is-d-compared-to-c Sadly however, when I pull out a simple matrix multiplication benchmark from the old language shootout (back when it had D), it is disturbingly slower in D when pit against C++. Details? I ran with very recent gdc (gcc 4.7.2, gdc on the 4.7.2 branch, pullreq #51, commit b8f5c22b0e7afa7e68a287ed788597e783540063), and the exact same gcc c++ compiler. How would I tune this to be more competitive? I'm comparing gdc vs g++ both built using the exact same gcc-4.7.2 back end, so it has to be something in the front end. I've disabled GC after the matrices are made in D, so that doesn't explain it. What is going on? I'm hoping I'm making a silly, naive, obvious beginner mistake, but could that be? I'm not sure how to apply the 'in' argument advice given on stackoverflow; if that is the answer, could someone summarise the best practice for 'in' use? Thank you! - J $ g++ --version #shows: g++ (GCC) 4.7.2 $ uname -a Linux gofast 2.6.35-24-generic #42-Ubuntu SMP Thu Dec 2 02:41:37 UTC 2010 x86_64 GNU/Linux # first, g++, two runs: $ g++ -O3 matrix.cpp -ocppmatrix $ time ./cppmatrix -1015380632 859379360 -367726792 -1548829944 real 1m31.941s user 1m31.920s sys 0m0.010s $ time ./cppmatrix -1015380632 859379360 -367726792 -1548829944 real 1m32.068s user 1m32.010s sys 0m0.050s # second, gdc, two runs: $ gdmd -O -inline -release -noboundscheck -m64 matrix.d -ofdmatrix $ time ./dmatrix -1015380632 859379360 -367726792 -1548829944 real 2m10.677s user 2m10.650s sys 0m0.020s $ $ time ./dmatrix -1015380632 859379360 -367726792 -1548829944 real 2m12.664s user 2m12.600s sys 0m0.030s # SIZE = 2000 results: # It appears D (gdc) is 30% slower that C++ (g++); using the exact same backend compiler. 
# it doesn't even appear to help to request O3 directly: it goes slower-- $ gdmd -O -q,-O3 -inline -release -noboundscheck -m64 matrix.d -ofdmatrix $ time ./dmatrix -1015380632 859379360 -367726792 -1548829944 real 2m17.107s user 2m17.080s sys 0m0.020s jaten@afarm:~/tmp$ # Though still beating java, but not by much. (Java code not shown; it's same source as all of these; the historical http://shootout.alioth.debian.org/ code from when D was in the shootout.) $ time java matrix -1015380632 859379360 -367726792 -1548829944 real 2m23.739s user 2m23.650s sys 0m0.130s $ Slightly bigger matrix? SIZE = 2500 results: 25% slower in D $ time ./cpp.O3.matrix -1506465222 -119774408 -1600478274 1285663906 real 3m1.340s user 3m1.290s sys 0m0.040s $ time ./dmatrix -1506465222 -119774408 -1600478274 1285663906 real 4m2.109s user 4m2.050s sys 0m0.050s //////// D version import core.memory; import std.stdio, std.string, std.array, std.conv; const int SIZE = 2000; int main(string[] args) { int i, n = args.length > 1 ? 
to!int(args[1]) : 1; int[][] m1 = mkmatrix(SIZE,SIZE); int[][] m2 = mkmatrix(SIZE,SIZE); int[][] mm = mkmatrix(SIZE,SIZE); GC.disable; for (i=0; i<n; i++) { mmult(m1, m2, mm); } writefln("%d %d %d %d",mm[0][0],mm[2][3],mm[3][2],mm[4][4]); return 0; } int[][] mkmatrix(int rows, int cols) { int[][] m; int count = 1; m.length = rows; foreach(ref int[] mi; m) { mi.length = cols; foreach(ref int mij; mi) { mij = count++; } } return(m); } void mmult(int[][] m1, int[][] m2, int[][] m3) { foreach(int i, int[] m1i; m1) { foreach(int j, ref int m3ij; m3[i]) { int val; foreach(int k, int[] m2k; m2) { val += m1i[k] * m2k[j]; } m3ij = val; } } } ////// C++ version #include <stdio.h> #include <stdlib.h> #include <unistd.h> #define SIZE 2000 int **mkmatrix(int rows, int cols) { int i, j, count = 1; int **m = (int **) malloc(rows * sizeof(int *)); for (i=0; i<rows; i++) { m[i] = (int *) malloc(cols * sizeof(int)); for (j=0; j<cols; j++) { m[i][j] = count++; } } return(m); } void zeromatrix(int rows, int cols, int **m) { int i, j; for (i=0; i<rows; i++) for (j=0; j<cols; j++) m[i][j] = 0; } void freematrix(int rows, int **m) { while (--rows > -1) { free(m[rows]); } free(m); } int **mmult(int rows, int cols, int **m1, int **m2, int **m3) { int i, j, k, val; for (i=0; i<rows; i++) { for (j=0; j<cols; j++) { val = 0; for (k=0; k<cols; k++) { val += m1[i][k] * m2[k][j]; } m3[i][j] = val; } } return(m3); } int main(int argc, char *argv[]) { int i, n = ((argc == 2) ? atoi(argv[1]) : 1); int **m1 = mkmatrix(SIZE, SIZE); int **m2 = mkmatrix(SIZE, SIZE); int **mm = mkmatrix(SIZE, SIZE); for (i=0; i<n; i++) { mm = mmult(SIZE, SIZE, m1, m2, mm); } printf("%d %d %d %d\n", mm[0][0], mm[2][3], mm[3][2], mm[4][4]); freematrix(SIZE, m1); freematrix(SIZE, m2); freematrix(SIZE, mm); return(0); } |
March 04, 2013 Re: How to tune numerical D? (matrix multiplication is faster in g++ vs gdc) | ||||
---|---|---|---|---|
| ||||
Posted in reply to J | On Monday, 4 March 2013 at 03:48:45 UTC, J wrote: > Dear D pros, > > As a fan of D, I was hoping to be able to get similar results as this fellow on stack overflow, by noting his tuning steps; > http://stackoverflow.com/questions/5142366/how-fast-is-d-compared-to-c > > Sadly however, when I pull out a simple matrix multiplication benchmark from the old language shootout (back when it had D), it is disturbingly slower in D when pit against C++. > > Details? I ran with very recent gdc (gcc 4.7.2, gdc on the 4.7.2 branch, pullreq #51, commit b8f5c22b0e7afa7e68a287ed788597e783540063), and the exact same gcc c++ compiler. > > How would I tune this to be more competitive? I'm comparing gdc vs g++ both built using the exact same gcc-4.7.2 back end, so it has to be something in the front end. I've disabled GC after the matrices are made in D, so that doesn't explain it. > > What is going on? I'm hoping I'm making a silly, naive, obvious beginner mistake, but could that be? I'm not sure how to apply the 'in' argument advice given on stackoverflow; if that is the answer, could someone summarise the best practice for 'in' use? > > Thank you! 
> > - J > > $ g++ --version #shows: g++ (GCC) 4.7.2 > $ uname -a > Linux gofast 2.6.35-24-generic #42-Ubuntu SMP Thu Dec 2 02:41:37 UTC 2010 x86_64 GNU/Linux > > # first, g++, two runs: > > $ g++ -O3 matrix.cpp -ocppmatrix > $ time ./cppmatrix > -1015380632 859379360 -367726792 -1548829944 > > real 1m31.941s > user 1m31.920s > sys 0m0.010s > $ time ./cppmatrix > -1015380632 859379360 -367726792 -1548829944 > > real 1m32.068s > user 1m32.010s > sys 0m0.050s > > > # second, gdc, two runs: > > $ gdmd -O -inline -release -noboundscheck -m64 matrix.d -ofdmatrix > $ time ./dmatrix > -1015380632 859379360 -367726792 -1548829944 > > real 2m10.677s > user 2m10.650s > sys 0m0.020s > $ > $ time ./dmatrix > -1015380632 859379360 -367726792 -1548829944 > > real 2m12.664s > user 2m12.600s > sys 0m0.030s > > # SIZE = 2000 results: > > # It appears D (gdc) is 30% slower that C++ (g++); using the exact same backend compiler. > > # it doesn't even appear to help to request O3 directly: it goes slower-- > > $ gdmd -O -q,-O3 -inline -release -noboundscheck -m64 matrix.d -ofdmatrix > $ time ./dmatrix > -1015380632 859379360 -367726792 -1548829944 > > real 2m17.107s > user 2m17.080s > sys 0m0.020s > jaten@afarm:~/tmp$ > > > # Though still beating java, but not by much. (Java code not shown; it's same source as all of these; the historical http://shootout.alioth.debian.org/ code from when D was in the shootout.) > > $ time java matrix > -1015380632 859379360 -367726792 -1548829944 > > real 2m23.739s > user 2m23.650s > sys 0m0.130s > $ > > > Slightly bigger matrix? 
> > SIZE = 2500 results: 25% slower in D > > $ time ./cpp.O3.matrix > -1506465222 -119774408 -1600478274 1285663906 > > real 3m1.340s > user 3m1.290s > sys 0m0.040s > > $ time ./dmatrix > -1506465222 -119774408 -1600478274 1285663906 > > real 4m2.109s > user 4m2.050s > sys 0m0.050s > > > //////// D version > > import core.memory; > > import std.stdio, std.string, std.array, std.conv; > > const int SIZE = 2000; > > int main(string[] args) > { > int i, n = args.length > 1 ? to!int(args[1]) : 1; > > int[][] m1 = mkmatrix(SIZE,SIZE); > int[][] m2 = mkmatrix(SIZE,SIZE); > int[][] mm = mkmatrix(SIZE,SIZE); > > GC.disable; > > for (i=0; i<n; i++) { > mmult(m1, m2, mm); > } > > writefln("%d %d %d %d",mm[0][0],mm[2][3],mm[3][2],mm[4][4]); > > return 0; > } > > int[][] mkmatrix(int rows, int cols) > { > int[][] m; > int count = 1; > > m.length = rows; > foreach(ref int[] mi; m) > { > mi.length = cols; > foreach(ref int mij; mi) > { > mij = count++; > } > } > > return(m); > } > > void mmult(int[][] m1, int[][] m2, int[][] m3) > { > foreach(int i, int[] m1i; m1) > { > foreach(int j, ref int m3ij; m3[i]) > { > int val; > foreach(int k, int[] m2k; m2) > { > val += m1i[k] * m2k[j]; > } > m3ij = val; > } > } > } > > ////// C++ version > > #include <stdio.h> > #include <stdlib.h> > #include <unistd.h> > > #define SIZE 2000 > > int **mkmatrix(int rows, int cols) { > int i, j, count = 1; > int **m = (int **) malloc(rows * sizeof(int *)); > for (i=0; i<rows; i++) { > m[i] = (int *) malloc(cols * sizeof(int)); > for (j=0; j<cols; j++) { > m[i][j] = count++; > } > } > return(m); > } > > void zeromatrix(int rows, int cols, int **m) { > int i, j; > for (i=0; i<rows; i++) > for (j=0; j<cols; j++) > m[i][j] = 0; > } > > void freematrix(int rows, int **m) { > while (--rows > -1) { free(m[rows]); } > free(m); > } > > int **mmult(int rows, int cols, int **m1, int **m2, int **m3) { > int i, j, k, val; > for (i=0; i<rows; i++) { > for (j=0; j<cols; j++) { > val = 0; > for (k=0; k<cols; k++) { > 
val += m1[i][k] * m2[k][j]; > } > m3[i][j] = val; > } > } > return(m3); > } > > int main(int argc, char *argv[]) { > int i, n = ((argc == 2) ? atoi(argv[1]) : 1); > > int **m1 = mkmatrix(SIZE, SIZE); > int **m2 = mkmatrix(SIZE, SIZE); > int **mm = mkmatrix(SIZE, SIZE); > > for (i=0; i<n; i++) { > mm = mmult(SIZE, SIZE, m1, m2, mm); > } > printf("%d %d %d %d\n", mm[0][0], mm[2][3], mm[3][2], mm[4][4]); > > freematrix(SIZE, m1); > freematrix(SIZE, m2); > freematrix(SIZE, mm); > return(0); > } First things first: You're not just timing the multiplication, you're timing the memory allocation as well. I suggest using http://dlang.org/phobos/std_datetime.html#StopWatch to do some proper timings in D Also, there is a semi-documented multi-dimensional array allocation syntax that is very neat, see here a simplified version of mkmatrix using it: int[][] mkmatrix(size_t rows, size_t cols) { int[][] m = new int[][](rows, cols); size_t count = 1; foreach(ref mi; m) foreach(ref mij; mi) mij = count++; return(m); } However, I have found myself that D is slower than C for these sort of intense numerical things. The assembly code should show why quite easily. |
March 04, 2013 Re: How to tune numerical D? (matrix multiplication is faster in g++ vs gdc) | ||||
---|---|---|---|---|
| ||||
Posted in reply to J | Your benchmark code updated to D2: http://codepad.org/WMgu6XQG Bye, bearophile |
March 04, 2013 Re: How to tune numerical D? (matrix multiplication is faster in g++ vs gdc) | ||||
---|---|---|---|---|
| ||||
Posted in reply to John Colvin | John Colvin:
> First things first:
> You're not just timing the multiplication, you're timing the memory allocation as well. I suggest using http://dlang.org/phobos/std_datetime.html#StopWatch to do some proper timings in D
Nope, what matters is the total program runtime.
Bye,
bearophile
|
March 04, 2013 Re: How to tune numerical D? (matrix multiplication is faster in g++ vs gdc) | ||||
---|---|---|---|---|
| ||||
Posted in reply to bearophile | On Monday, 4 March 2013 at 04:12:18 UTC, bearophile wrote:
> Your benchmark code updated to D2:
>
> http://codepad.org/WMgu6XQG
Sorry, this line:
enum size_t SIZE = 200;
Should be:
enum size_t SIZE = 2_000;
Bye,
bearophile
|
March 04, 2013 Re: How to tune numerical D? (matrix multiplication is faster in g++ vs gdc) | ||||
---|---|---|---|---|
| ||||
Posted in reply to bearophile | So this should be better: http://codepad.org/B5b4uyBM Bye, bearophile |
March 04, 2013 Re: How to tune numerical D? (matrix multiplication is faster in g++ vs gdc) | ||||
---|---|---|---|---|
| ||||
Posted in reply to bearophile | Generally for such matrix benchmarks if you chose the compilation flags really well (including link-time optimization!) I've seen that with LDC you get "good enough" timings. Bye, bearophile |
March 04, 2013 Re: How to tune numerical D? (matrix multiplication is faster in g++ vs gdc) | ||||
---|---|---|---|---|
| ||||
Posted in reply to J | I suggest that you move this line GC.disable; to the first line. I don't see how you are doing your timings so that part is a wild card. Also note that when the GC is re-enabled it can add a significant amount of time to the tests. You are not explicitly re-enabling the GC, but I don't know if the GC kicks back in as part of program termination; it probably shouldn't, but we lack precise documentation and cannot be certain. I would test timings immediately after the GC is disabled and prior to program termination to see what effects the GC may or may not have on the timings. --rt |
March 04, 2013 Re: How to tune numerical D? (matrix multiplication is faster in g++ vs gdc) | ||||
---|---|---|---|---|
| ||||
Posted in reply to J | On 3/3/13 10:48 PM, J wrote:
> Dear D pros,
>
> As a fan of D, I was hoping to be able to get similar results as this
> fellow on stack overflow, by noting his tuning steps;
> http://stackoverflow.com/questions/5142366/how-fast-is-d-compared-to-c
>
> Sadly however, when I pull out a simple matrix multiplication benchmark
> from the old language shootout (back when it had D), it is disturbingly
> slower in D when pit against C++.
You're measuring the speed of a couple of tight loops. The smallest differences in codegen between them will be on the radar. Use straight for loops or "foreach (i; 0 .. limit)" for those loops; the foreach forms you currently use may introduce slight differences in codegen.
Andrei
|
March 04, 2013 Re: How to tune numerical D? (matrix multiplication is faster in g++ vs gdc) | ||||
---|---|---|---|---|
| ||||
Posted in reply to bearophile | On Monday, 4 March 2013 at 04:22:01 UTC, bearophile wrote: > So this should be better: > > http://codepad.org/B5b4uyBM > > Bye, > bearophile @bearophile: Thank you! Unfortunately the http://codepad.org/B5b4uyBM code runs a bit *slower* than the original D code. Yikes! $ gdmd -O -inline -release -noboundscheck -m64 bear.d -ofdbear $ time ./dbear -1015380632 859379360 -367726792 -1548829944 real 2m36.971s user 2m36.910s sys 0m0.030s $ time ./dbear -1015380632 859379360 -367726792 -1548829944 real 2m34.425s user 2m34.370s sys 0m0.020s @John Colvin: here is the disassembly of mmult() in both languages. Unfortunately I'm not literate in x86_64 assembly. Perhaps the problem is obvious to you? All I can really tell is that the g++ version is shorter. The memory allocation, when timed separately (comment out mmult), is less than 60 msec for either version, so I don't think its a memory issue, although it could be caching issue since the matrix layouts are different. 
### gdc version of mmult: (gdb) disas /m _D6matrix5mmultFAAiAAiAAiZv Dump of assembler code for function _D6matrix5mmultFAAiAAiAAiZv: 56 void mmult(int[][] m1, int[][] m2, int[][] m3) 0x00000000004352a0 <+0>: push %r15 0x00000000004352a5 <+5>: push %r14 0x00000000004352a7 <+7>: push %r13 0x00000000004352a9 <+9>: push %r12 0x00000000004352ab <+11>: mov %rdx,%r12 0x00000000004352ae <+14>: push %rbp 0x00000000004352af <+15>: mov %rsi,%rbp 0x00000000004352b2 <+18>: push %rbx 0x00000000004352b3 <+19>: mov %r9,-0x40(%rsp) 0x00000000004352b8 <+24>: mov %rdi,-0x10(%rsp) 0x00000000004352bd <+29>: mov %rsi,-0x8(%rsp) 0x00000000004352c2 <+34>: mov %rdx,-0x20(%rsp) 0x00000000004352c7 <+39>: mov %rcx,-0x18(%rsp) 0x00000000004352cc <+44>: mov %r8,-0x30(%rsp) 0x00000000004352d1 <+49>: mov %r9,-0x28(%rsp) 0x00000000004352dc <+60>: add $0x1,%rdi 0x00000000004352e0 <+64>: lea 0x1(%rdx),%rdx 0x00000000004352e4 <+68>: mov $0x1,%r15d 0x00000000004352ea <+74>: mov %rdi,-0x38(%rsp) 0x0000000000435315 <+117>: add $0x1,%r13 0x0000000000435319 <+121>: mov $0x1,%r11d 0x0000000000435330 <+144>: mov $0x1,%esi 57 { 58 foreach(int i, int[] m1i; m1) 0x00000000004352a2 <+2>: test %rdi,%rdi 0x00000000004352d6 <+54>: je 0x4353aa <_D6matrix5mmultFAAiAAiAAiZv+266> 0x00000000004352ef <+79>: xor %esi,%esi 0x00000000004352f1 <+81>: mov %rsi,%rax 0x00000000004352f4 <+84>: shl $0x4,%rax 0x00000000004352f8 <+88>: mov 0x8(%rbp,%rax,1),%r10 0x0000000000435398 <+248>: cmp -0x38(%rsp),%rax 0x000000000043539d <+253>: mov %r15,%rsi 0x00000000004353a0 <+256>: je 0x4353aa <_D6matrix5mmultFAAiAAiAAiZv+266> 0x00000000004353a2 <+258>: mov %rax,%r15 0x00000000004353a5 <+261>: jmpq 0x4352f1 <_D6matrix5mmultFAAiAAiAAiZv+81> 59 { 60 foreach(int j, ref int m3ij; m3[i]) 0x00000000004352fd <+93>: add -0x40(%rsp),%rax 0x0000000000435302 <+98>: mov (%rax),%r13 0x0000000000435305 <+101>: mov 0x8(%rax),%r14 0x0000000000435309 <+105>: test %r13,%r13 0x000000000043530c <+108>: je 0x435394 <_D6matrix5mmultFAAiAAiAAiZv+244> 
0x0000000000435312 <+114>: xor %r9d,%r9d 0x000000000043531f <+127>: shl $0x2,%r9 0x0000000000435326 <+134>: lea (%r14,%r9,1),%rbx 0x000000000043536c <+204>: mov %r11,%r9 0x000000000043536f <+207>: cmp %r13,%rax 0x0000000000435372 <+210>: je 0x435394 <_D6matrix5mmultFAAiAAiAAiZv+244> 0x0000000000435374 <+212>: shl $0x2,%r9 0x000000000043537b <+219>: mov %rax,%r11 0x000000000043537e <+222>: lea (%r14,%r9,1),%rbx 0x000000000043538a <+234>: mov %r11,%r9 0x000000000043538f <+239>: cmp %r13,%rax 0x0000000000435392 <+242>: jne 0x435374 <_D6matrix5mmultFAAiAAiAAiZv+212> 0x0000000000435394 <+244>: lea 0x1(%r15),%rax 61 { 62 int val; 0x0000000000435337 <+151>: xor %edi,%edi 0x0000000000435339 <+153>: jmp 0x435343 <_D6matrix5mmultFAAiAAiAAiZv+163> 0x000000000043533b <+155>: nopl 0x0(%rax,%rax,1) 0x0000000000435388 <+232>: xor %edi,%edi 63 foreach(int k, int[] m2k; m2) 0x0000000000435323 <+131>: test %r12,%r12 0x000000000043532a <+138>: je 0x435384 <_D6matrix5mmultFAAiAAiAAiZv+228> 0x000000000043532c <+140>: nopl 0x0(%rax) 0x0000000000435335 <+149>: xor %eax,%eax 0x0000000000435340 <+160>: mov %r8,%rsi 0x0000000000435343 <+163>: mov %rax,%r8 0x000000000043534a <+170>: shl $0x4,%r8 0x000000000043535e <+190>: cmp %rdx,%r8 0x0000000000435361 <+193>: mov %rsi,%rax 0x0000000000435364 <+196>: jne 0x435340 <_D6matrix5mmultFAAiAAiAAiZv+160> 0x0000000000435366 <+198>: lea 0x1(%r11),%rax 0x0000000000435378 <+216>: test %r12,%r12 0x0000000000435382 <+226>: jne 0x435330 <_D6matrix5mmultFAAiAAiAAiZv+144> 0x0000000000435384 <+228>: lea 0x1(%r11),%rax 64 { 65 val += m1i[k] * m2k[j]; 0x0000000000435346 <+166>: mov (%r10,%rax,4),%eax 0x000000000043534e <+174>: mov 0x8(%rcx,%r8,1),%r8 0x0000000000435353 <+179>: imul (%r8,%r9,1),%eax 0x0000000000435358 <+184>: lea 0x1(%rsi),%r8 0x000000000043535c <+188>: add %eax,%edi 66 } 67 m3ij = val; 0x000000000043536a <+202>: mov %edi,(%rbx) 0x000000000043538d <+237>: mov %edi,(%rbx) 68 } 69 } 70 } 0x00000000004353aa <+266>: pop %rbx 0x00000000004353ab 
<+267>: pop %rbp 0x00000000004353ac <+268>: pop %r12 0x00000000004353ae <+270>: pop %r13 0x00000000004353b0 <+272>: pop %r14 0x00000000004353b2 <+274>: pop %r15 0x00000000004353b4 <+276>: retq 0x00000000004353b5: data32 nopw %cs:0x0(%rax,%rax,1) End of assembler dump. (gdb) ### g++ version of mmult: (gdb) disas /m mmult Dump of assembler code for function mmult(int, int, int**, int**, int**): 36 int **mmult(int rows, int cols, int **m1, int **m2, int **m3) { 0x0000000000400a10 <+0>: push %r14 0x0000000000400a14 <+4>: push %r13 0x0000000000400a16 <+6>: mov %r8,%r13 0x0000000000400a19 <+9>: push %r12 0x0000000000400a1b <+11>: push %rbp 0x0000000000400a1c <+12>: push %rbx 0x0000000000400a1d <+13>: mov %edi,%ebx 0x0000000000400a21 <+17>: lea -0x1(%rsi),%eax 0x0000000000400a24 <+20>: mov %rdx,%r12 0x0000000000400a27 <+23>: xor %ebp,%ebp 0x0000000000400a29 <+25>: lea 0x4(,%rax,4),%rdi 0x0000000000400a40 <+48>: xor %r9d,%r9d 0x0000000000400a43 <+51>: xor %r11d,%r11d 0x0000000000400a46 <+54>: nopw %cs:0x0(%rax,%rax,1) 37 int i, j, k, val; 38 for (i=0; i<rows; i++) { 0x0000000000400a12 <+2>: test %edi,%edi 0x0000000000400a1f <+15>: jle 0x400a7e <mmult(int, int, int**, int**, int**)+110> 0x0000000000400a7a <+106>: cmp %ebp,%ebx 0x0000000000400a7c <+108>: jg 0x400a31 <mmult(int, int, int**, int**, int**)+33> 39 for (j=0; j<cols; j++) { 0x0000000000400a31 <+33>: test %esi,%esi 0x0000000000400a33 <+35>: jle 0x400a76 <mmult(int, int, int**, int**, int**)+102> 0x0000000000400a35 <+37>: mov (%r12,%rbp,8),%r8 0x0000000000400a39 <+41>: mov 0x0(%r13,%rbp,8),%rdx 0x0000000000400a3e <+46>: xor %eax,%eax 0x0000000000400a71 <+97>: cmp %rdi,%rax 0x0000000000400a74 <+100>: jne 0x400a40 <mmult(int, int, int**, int**, int**)+48> 0x0000000000400a76 <+102>: add $0x1,%rbp 40 val = 0; 41 for (k=0; k<cols; k++) { 0x0000000000400a64 <+84>: cmp %r9d,%esi 0x0000000000400a67 <+87>: jg 0x400a50 <mmult(int, int, int**, int**, int**)+64> 42 val += m1[i][k] * m2[k][j]; 0x0000000000400a50 <+64>: mov 
(%rcx,%r9,8),%r14 0x0000000000400a54 <+68>: mov (%r8,%r9,4),%r10d 0x0000000000400a58 <+72>: add $0x1,%r9 0x0000000000400a5c <+76>: imul (%r14,%rax,1),%r10d 0x0000000000400a61 <+81>: add %r10d,%r11d 43 } 44 m3[i][j] = val; 0x0000000000400a69 <+89>: mov %r11d,(%rdx,%rax,1) 0x0000000000400a6d <+93>: add $0x4,%rax 45 } 46 } 47 return(m3); 48 } 0x0000000000400a7e <+110>: pop %rbx 0x0000000000400a7f <+111>: pop %rbp 0x0000000000400a80 <+112>: pop %r12 0x0000000000400a82 <+114>: mov %r13,%rax 0x0000000000400a85 <+117>: pop %r13 0x0000000000400a87 <+119>: pop %r14 0x0000000000400a89 <+121>: retq End of assembler dump. (gdb) |
Copyright © 1999-2021 by the D Language Foundation