Jump to page: 1 2 3
Thread overview
How to tune numerical D? (matrix multiplication is faster in g++ vs gdc)
Mar 04, 2013
J
Mar 04, 2013
John Colvin
Mar 04, 2013
bearophile
Mar 04, 2013
John Colvin
Mar 04, 2013
bearophile
Mar 04, 2013
John Colvin
Mar 05, 2013
Walter Bright
Mar 04, 2013
bearophile
Mar 04, 2013
bearophile
Mar 04, 2013
bearophile
Mar 04, 2013
bearophile
Mar 04, 2013
J
Mar 04, 2013
Manu
Mar 04, 2013
J
Mar 04, 2013
J
Mar 04, 2013
bearophile
Mar 04, 2013
John Colvin
Mar 04, 2013
John Colvin
Mar 04, 2013
jerro
Mar 04, 2013
jerro
Mar 05, 2013
J
Mar 05, 2013
Timon Gehr
Mar 04, 2013
Walter Bright
Mar 04, 2013
Rob T
Mar 04, 2013
J
Mar 04, 2013
Walter Bright
March 04, 2013
Dear D pros,

As a fan of D, I was hoping to be able to get similar results as this fellow on stack overflow, by noting his tuning steps;
http://stackoverflow.com/questions/5142366/how-fast-is-d-compared-to-c

Sadly however, when I pull out a simple matrix multiplication benchmark from the old language shootout (back when it had D), it is disturbingly slower in D when pit against C++.

Details? I ran with very recent gdc (gcc 4.7.2, gdc on the 4.7.2 branch, pullreq #51, commit b8f5c22b0e7afa7e68a287ed788597e783540063), and the exact same gcc c++ compiler.

How would I tune this to be more competitive?  I'm comparing gdc vs g++ both built using the exact same gcc-4.7.2 back end, so it has to be something in the front end.  I've disabled GC after the matrices are made in D, so that doesn't explain it.

What is going on?  I'm hoping I'm making a silly, naive, obvious beginner mistake, but could that be?  I'm not sure how to apply the 'in' argument advice given on stackoverflow; if that is the answer, could someone summarise the best practice for 'in' use?

Thank you!

- J

$ g++ --version #shows: g++ (GCC) 4.7.2
$ uname -a
Linux gofast 2.6.35-24-generic #42-Ubuntu SMP Thu Dec 2 02:41:37 UTC 2010 x86_64 GNU/Linux

# first, g++, two runs:

$ g++  -O3 matrix.cpp -ocppmatrix
$ time ./cppmatrix
-1015380632 859379360 -367726792 -1548829944

real    1m31.941s
user    1m31.920s
sys 0m0.010s
$ time ./cppmatrix
-1015380632 859379360 -367726792 -1548829944

real    1m32.068s
user    1m32.010s
sys 0m0.050s


# second, gdc, two runs:

$ gdmd -O -inline -release -noboundscheck -m64 matrix.d -ofdmatrix
$ time ./dmatrix
-1015380632 859379360 -367726792 -1548829944

real    2m10.677s
user    2m10.650s
sys 0m0.020s
$
$ time ./dmatrix
-1015380632 859379360 -367726792 -1548829944

real    2m12.664s
user    2m12.600s
sys 0m0.030s

# SIZE = 2000 results:

# It appears D (gdc) is 30% slower that C++ (g++); using the exact same backend compiler.

# it doesn't even appear to help to request O3 directly: it goes slower--

$ gdmd -O -q,-O3 -inline -release -noboundscheck -m64 matrix.d -ofdmatrix
$ time ./dmatrix
-1015380632 859379360 -367726792 -1548829944

real    2m17.107s
user    2m17.080s
sys 0m0.020s
jaten@afarm:~/tmp$


# Though still beating java, but not by much. (Java code not shown; it's same source as all of these; the historical http://shootout.alioth.debian.org/ code from when D was in the shootout.)

$ time java matrix
-1015380632 859379360 -367726792 -1548829944

real    2m23.739s
user    2m23.650s
sys 0m0.130s
$


Slightly bigger matrix?

SIZE = 2500 results: 25% slower in D

$ time ./cpp.O3.matrix
-1506465222 -119774408 -1600478274 1285663906

real    3m1.340s
user    3m1.290s
sys 0m0.040s

$ time ./dmatrix
-1506465222 -119774408 -1600478274 1285663906

real    4m2.109s
user    4m2.050s
sys 0m0.050s


//////// D version

import core.memory;

import std.stdio, std.string, std.array, std.conv;

const int SIZE = 2000;

int main(string[] args)
{
    int i, n = args.length > 1 ? to!int(args[1]) : 1;

    int[][] m1 = mkmatrix(SIZE,SIZE);
    int[][] m2 = mkmatrix(SIZE,SIZE);
    int[][] mm = mkmatrix(SIZE,SIZE);

    GC.disable;

    for (i=0; i<n; i++) {
        mmult(m1, m2, mm);
    }

    writefln("%d %d %d %d",mm[0][0],mm[2][3],mm[3][2],mm[4][4]);

    return 0;
}

int[][] mkmatrix(int rows, int cols)
{
    int[][] m;
    int count = 1;

    m.length = rows;
    foreach(ref int[] mi; m)
    {
        mi.length = cols;
        foreach(ref int mij; mi)
        {
            mij = count++;
        }
    }

    return(m);
}

void mmult(int[][] m1, int[][] m2, int[][] m3)
{
    foreach(int i, int[] m1i; m1)
    {
        foreach(int j, ref int m3ij; m3[i])
        {
            int val;
            foreach(int k, int[] m2k; m2)
            {
                val += m1i[k] * m2k[j];
            }
            m3ij = val;
        }
    }
}

////// C++ version

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define SIZE 2000

int **mkmatrix(int rows, int cols) {
    int i, j, count = 1;
    int **m = (int **) malloc(rows * sizeof(int *));
    for (i=0; i<rows; i++) {
    m[i] = (int *) malloc(cols * sizeof(int));
    for (j=0; j<cols; j++) {
        m[i][j] = count++;
    }
    }
    return(m);
}

void zeromatrix(int rows, int cols, int **m) {
    int i, j;
    for (i=0; i<rows; i++)
    for (j=0; j<cols; j++)
        m[i][j] = 0;
}

void freematrix(int rows, int **m) {
    while (--rows > -1) { free(m[rows]); }
    free(m);
}

int **mmult(int rows, int cols, int **m1, int **m2, int **m3) {
    int i, j, k, val;
    for (i=0; i<rows; i++) {
    for (j=0; j<cols; j++) {
        val = 0;
        for (k=0; k<cols; k++) {
        val += m1[i][k] * m2[k][j];
        }
        m3[i][j] = val;
    }
    }
    return(m3);
}

int main(int argc, char *argv[]) {
    int i, n = ((argc == 2) ? atoi(argv[1]) : 1);

    int **m1 = mkmatrix(SIZE, SIZE);
    int **m2 = mkmatrix(SIZE, SIZE);
    int **mm = mkmatrix(SIZE, SIZE);

    for (i=0; i<n; i++) {
    mm = mmult(SIZE, SIZE, m1, m2, mm);
    }
    printf("%d %d %d %d\n", mm[0][0], mm[2][3], mm[3][2], mm[4][4]);

    freematrix(SIZE, m1);
    freematrix(SIZE, m2);
    freematrix(SIZE, mm);
    return(0);
}

March 04, 2013
On Monday, 4 March 2013 at 03:48:45 UTC, J wrote:
> Dear D pros,
>
> As a fan of D, I was hoping to be able to get similar results as this fellow on stack overflow, by noting his tuning steps;
> http://stackoverflow.com/questions/5142366/how-fast-is-d-compared-to-c
>
> Sadly however, when I pull out a simple matrix multiplication benchmark from the old language shootout (back when it had D), it is disturbingly slower in D when pit against C++.
>
> Details? I ran with very recent gdc (gcc 4.7.2, gdc on the 4.7.2 branch, pullreq #51, commit b8f5c22b0e7afa7e68a287ed788597e783540063), and the exact same gcc c++ compiler.
>
> How would I tune this to be more competitive?  I'm comparing gdc vs g++ both built using the exact same gcc-4.7.2 back end, so it has to be something in the front end.  I've disabled GC after the matrices are made in D, so that doesn't explain it.
>
> What is going on?  I'm hoping I'm making a silly, naive, obvious beginner mistake, but could that be?  I'm not sure how to apply the 'in' argument advice given on stackoverflow; if that is the answer, could someone summarise the best practice for 'in' use?
>
> Thank you!
>
> - J
>
> $ g++ --version #shows: g++ (GCC) 4.7.2
> $ uname -a
> Linux gofast 2.6.35-24-generic #42-Ubuntu SMP Thu Dec 2 02:41:37 UTC 2010 x86_64 GNU/Linux
>
> # first, g++, two runs:
>
> $ g++  -O3 matrix.cpp -ocppmatrix
> $ time ./cppmatrix
> -1015380632 859379360 -367726792 -1548829944
>
> real    1m31.941s
> user    1m31.920s
> sys 0m0.010s
> $ time ./cppmatrix
> -1015380632 859379360 -367726792 -1548829944
>
> real    1m32.068s
> user    1m32.010s
> sys 0m0.050s
>
>
> # second, gdc, two runs:
>
> $ gdmd -O -inline -release -noboundscheck -m64 matrix.d -ofdmatrix
> $ time ./dmatrix
> -1015380632 859379360 -367726792 -1548829944
>
> real    2m10.677s
> user    2m10.650s
> sys 0m0.020s
> $
> $ time ./dmatrix
> -1015380632 859379360 -367726792 -1548829944
>
> real    2m12.664s
> user    2m12.600s
> sys 0m0.030s
>
> # SIZE = 2000 results:
>
> # It appears D (gdc) is 30% slower that C++ (g++); using the exact same backend compiler.
>
> # it doesn't even appear to help to request O3 directly: it goes slower--
>
> $ gdmd -O -q,-O3 -inline -release -noboundscheck -m64 matrix.d -ofdmatrix
> $ time ./dmatrix
> -1015380632 859379360 -367726792 -1548829944
>
> real    2m17.107s
> user    2m17.080s
> sys 0m0.020s
> jaten@afarm:~/tmp$
>
>
> # Though still beating java, but not by much. (Java code not shown; it's same source as all of these; the historical http://shootout.alioth.debian.org/ code from when D was in the shootout.)
>
> $ time java matrix
> -1015380632 859379360 -367726792 -1548829944
>
> real    2m23.739s
> user    2m23.650s
> sys 0m0.130s
> $
>
>
> Slightly bigger matrix?
>
> SIZE = 2500 results: 25% slower in D
>
> $ time ./cpp.O3.matrix
> -1506465222 -119774408 -1600478274 1285663906
>
> real    3m1.340s
> user    3m1.290s
> sys 0m0.040s
>
> $ time ./dmatrix
> -1506465222 -119774408 -1600478274 1285663906
>
> real    4m2.109s
> user    4m2.050s
> sys 0m0.050s
>
>
> //////// D version
>
> import core.memory;
>
> import std.stdio, std.string, std.array, std.conv;
>
> const int SIZE = 2000;
>
> int main(string[] args)
> {
>     int i, n = args.length > 1 ? to!int(args[1]) : 1;
>
>     int[][] m1 = mkmatrix(SIZE,SIZE);
>     int[][] m2 = mkmatrix(SIZE,SIZE);
>     int[][] mm = mkmatrix(SIZE,SIZE);
>
>     GC.disable;
>
>     for (i=0; i<n; i++) {
>         mmult(m1, m2, mm);
>     }
>
>     writefln("%d %d %d %d",mm[0][0],mm[2][3],mm[3][2],mm[4][4]);
>
>     return 0;
> }
>
> int[][] mkmatrix(int rows, int cols)
> {
>     int[][] m;
>     int count = 1;
>
>     m.length = rows;
>     foreach(ref int[] mi; m)
>     {
>         mi.length = cols;
>         foreach(ref int mij; mi)
>         {
>             mij = count++;
>         }
>     }
>
>     return(m);
> }
>
> void mmult(int[][] m1, int[][] m2, int[][] m3)
> {
>     foreach(int i, int[] m1i; m1)
>     {
>         foreach(int j, ref int m3ij; m3[i])
>         {
>             int val;
>             foreach(int k, int[] m2k; m2)
>             {
>                 val += m1i[k] * m2k[j];
>             }
>             m3ij = val;
>         }
>     }
> }
>
> ////// C++ version
>
> #include <stdio.h>
> #include <stdlib.h>
> #include <unistd.h>
>
> #define SIZE 2000
>
> int **mkmatrix(int rows, int cols) {
>     int i, j, count = 1;
>     int **m = (int **) malloc(rows * sizeof(int *));
>     for (i=0; i<rows; i++) {
>     m[i] = (int *) malloc(cols * sizeof(int));
>     for (j=0; j<cols; j++) {
>         m[i][j] = count++;
>     }
>     }
>     return(m);
> }
>
> void zeromatrix(int rows, int cols, int **m) {
>     int i, j;
>     for (i=0; i<rows; i++)
>     for (j=0; j<cols; j++)
>         m[i][j] = 0;
> }
>
> void freematrix(int rows, int **m) {
>     while (--rows > -1) { free(m[rows]); }
>     free(m);
> }
>
> int **mmult(int rows, int cols, int **m1, int **m2, int **m3) {
>     int i, j, k, val;
>     for (i=0; i<rows; i++) {
>     for (j=0; j<cols; j++) {
>         val = 0;
>         for (k=0; k<cols; k++) {
>         val += m1[i][k] * m2[k][j];
>         }
>         m3[i][j] = val;
>     }
>     }
>     return(m3);
> }
>
> int main(int argc, char *argv[]) {
>     int i, n = ((argc == 2) ? atoi(argv[1]) : 1);
>
>     int **m1 = mkmatrix(SIZE, SIZE);
>     int **m2 = mkmatrix(SIZE, SIZE);
>     int **mm = mkmatrix(SIZE, SIZE);
>
>     for (i=0; i<n; i++) {
>     mm = mmult(SIZE, SIZE, m1, m2, mm);
>     }
>     printf("%d %d %d %d\n", mm[0][0], mm[2][3], mm[3][2], mm[4][4]);
>
>     freematrix(SIZE, m1);
>     freematrix(SIZE, m2);
>     freematrix(SIZE, mm);
>     return(0);
> }

First things first:
You're not just timing the multiplication, you're timing the memory allocation as well. I suggest using http://dlang.org/phobos/std_datetime.html#StopWatch to do some proper timings in D

Also, there is a semi-documented multi-dimensional array allocation syntax that is very neat, see here a simplified version of mkmatrix using it:

int[][] mkmatrix(size_t rows, size_t cols)
{
    int[][] m = new int[][](rows, cols);
    size_t count = 1;

    foreach(ref mi; m)
        foreach(ref mij; mi)
            mij = count++;

    return(m);
}


However, I have found myself that D is slower than C for these sort of intense numerical things. The assembly code should show why quite easily.
March 04, 2013
Your benchmark code updated to D2:

http://codepad.org/WMgu6XQG

Bye,
bearophile
March 04, 2013
John Colvin:

> First things first:
> You're not just timing the multiplication, you're timing the memory allocation as well. I suggest using http://dlang.org/phobos/std_datetime.html#StopWatch to do some proper timings in D

Nope, what matters is the total program runtime.

Bye,
bearophile
March 04, 2013
On Monday, 4 March 2013 at 04:12:18 UTC, bearophile wrote:
> Your benchmark code updated to D2:
>
> http://codepad.org/WMgu6XQG

Sorry, this line:

enum size_t SIZE = 200;

Should be:

enum size_t SIZE = 2_000;

Bye,
bearophile
March 04, 2013
So this should be better:

http://codepad.org/B5b4uyBM

Bye,
bearophile
March 04, 2013
Generally for such matrix benchmarks if you chose the compilation flags really well (including link-time optimization!) I've seen that with LDC you get "good enough" timings.

Bye,
bearophile
March 04, 2013
I suggest that you move this line

GC.disable;

to the first line.

I don't see how you are doing your timings so that part is a wild card.

Also note that when the GC is re-enabled it can add a significant amount of time to the tests. You are not explicitly re-enabling the GC, but I don't know if the GC kicks back as part of program termination, it probably shouldn't but we lack precise documentation and cannot be certain.

I would test timings immediately after the GC is disabled and prior to program termination to see what affects the GC may or may not have on the timings.

--rt
March 04, 2013
On 3/3/13 10:48 PM, J wrote:
> Dear D pros,
>
> As a fan of D, I was hoping to be able to get similar results as this
> fellow on stack overflow, by noting his tuning steps;
> http://stackoverflow.com/questions/5142366/how-fast-is-d-compared-to-c
>
> Sadly however, when I pull out a simple matrix multiplication benchmark
> from the old language shootout (back when it had D), it is disturbingly
> slower in D when pit against C++.

You're measuring the speed of a couple of tight loops. The smallest differences in codegen between them will be on the radar. Use straight for loops or "foreach (i; 0 .. limit)" for those loops, the foreach forms you currently may introduce slight differences in codegen.

Andrei


March 04, 2013
On Monday, 4 March 2013 at 04:22:01 UTC, bearophile wrote:
> So this should be better:
>
> http://codepad.org/B5b4uyBM
>
> Bye,
> bearophile

@bearophile: Thank you!  Unfortunately the http://codepad.org/B5b4uyBM code runs a bit *slower* than the original D code. Yikes!

$  gdmd -O -inline -release -noboundscheck -m64 bear.d -ofdbear
$ time ./dbear
-1015380632 859379360 -367726792 -1548829944

real    2m36.971s
user    2m36.910s
sys 0m0.030s
$ time ./dbear
-1015380632 859379360 -367726792 -1548829944

real    2m34.425s
user    2m34.370s
sys 0m0.020s


@John Colvin: here is the disassembly of mmult() in both languages. Unfortunately I'm not literate in x86_64 assembly.  Perhaps the problem is obvious to you?  All I can really tell is that the g++ version is shorter.

The memory allocation, when timed separately (comment out mmult), is less than 60 msec for either version, so I don't think its a memory issue, although it could be caching issue since the matrix layouts are different.

### gdc version of mmult:

(gdb) disas /m  _D6matrix5mmultFAAiAAiAAiZv
Dump of assembler code for function _D6matrix5mmultFAAiAAiAAiZv:
56	void mmult(int[][] m1, int[][] m2, int[][] m3)
   0x00000000004352a0 <+0>:	push   %r15
   0x00000000004352a5 <+5>:	push   %r14
   0x00000000004352a7 <+7>:	push   %r13
   0x00000000004352a9 <+9>:	push   %r12
   0x00000000004352ab <+11>:	mov    %rdx,%r12
   0x00000000004352ae <+14>:	push   %rbp
   0x00000000004352af <+15>:	mov    %rsi,%rbp
   0x00000000004352b2 <+18>:	push   %rbx
   0x00000000004352b3 <+19>:	mov    %r9,-0x40(%rsp)
   0x00000000004352b8 <+24>:	mov    %rdi,-0x10(%rsp)
   0x00000000004352bd <+29>:	mov    %rsi,-0x8(%rsp)
   0x00000000004352c2 <+34>:	mov    %rdx,-0x20(%rsp)
   0x00000000004352c7 <+39>:	mov    %rcx,-0x18(%rsp)
   0x00000000004352cc <+44>:	mov    %r8,-0x30(%rsp)
   0x00000000004352d1 <+49>:	mov    %r9,-0x28(%rsp)
   0x00000000004352dc <+60>:	add    $0x1,%rdi
   0x00000000004352e0 <+64>:	lea    0x1(%rdx),%rdx
   0x00000000004352e4 <+68>:	mov    $0x1,%r15d
   0x00000000004352ea <+74>:	mov    %rdi,-0x38(%rsp)
   0x0000000000435315 <+117>:	add    $0x1,%r13
   0x0000000000435319 <+121>:	mov    $0x1,%r11d
   0x0000000000435330 <+144>:	mov    $0x1,%esi

57	{
58	    foreach(int i, int[] m1i; m1)
   0x00000000004352a2 <+2>:	test   %rdi,%rdi
   0x00000000004352d6 <+54>:	je     0x4353aa <_D6matrix5mmultFAAiAAiAAiZv+266>
   0x00000000004352ef <+79>:	xor    %esi,%esi
   0x00000000004352f1 <+81>:	mov    %rsi,%rax
   0x00000000004352f4 <+84>:	shl    $0x4,%rax
   0x00000000004352f8 <+88>:	mov    0x8(%rbp,%rax,1),%r10
   0x0000000000435398 <+248>:	cmp    -0x38(%rsp),%rax
   0x000000000043539d <+253>:	mov    %r15,%rsi
   0x00000000004353a0 <+256>:	je     0x4353aa <_D6matrix5mmultFAAiAAiAAiZv+266>
   0x00000000004353a2 <+258>:	mov    %rax,%r15
   0x00000000004353a5 <+261>:	jmpq   0x4352f1 <_D6matrix5mmultFAAiAAiAAiZv+81>

59	    {
60	        foreach(int j, ref int m3ij; m3[i])
   0x00000000004352fd <+93>:	add    -0x40(%rsp),%rax
   0x0000000000435302 <+98>:	mov    (%rax),%r13
   0x0000000000435305 <+101>:	mov    0x8(%rax),%r14
   0x0000000000435309 <+105>:	test   %r13,%r13
   0x000000000043530c <+108>:	je     0x435394 <_D6matrix5mmultFAAiAAiAAiZv+244>
   0x0000000000435312 <+114>:	xor    %r9d,%r9d
   0x000000000043531f <+127>:	shl    $0x2,%r9
   0x0000000000435326 <+134>:	lea    (%r14,%r9,1),%rbx
   0x000000000043536c <+204>:	mov    %r11,%r9
   0x000000000043536f <+207>:	cmp    %r13,%rax
   0x0000000000435372 <+210>:	je     0x435394 <_D6matrix5mmultFAAiAAiAAiZv+244>
   0x0000000000435374 <+212>:	shl    $0x2,%r9
   0x000000000043537b <+219>:	mov    %rax,%r11
   0x000000000043537e <+222>:	lea    (%r14,%r9,1),%rbx
   0x000000000043538a <+234>:	mov    %r11,%r9
   0x000000000043538f <+239>:	cmp    %r13,%rax
   0x0000000000435392 <+242>:	jne    0x435374 <_D6matrix5mmultFAAiAAiAAiZv+212>
   0x0000000000435394 <+244>:	lea    0x1(%r15),%rax

61	        {
62	            int val;
   0x0000000000435337 <+151>:	xor    %edi,%edi
   0x0000000000435339 <+153>:	jmp    0x435343 <_D6matrix5mmultFAAiAAiAAiZv+163>
   0x000000000043533b <+155>:	nopl   0x0(%rax,%rax,1)
   0x0000000000435388 <+232>:	xor    %edi,%edi

63	            foreach(int k, int[] m2k; m2)
   0x0000000000435323 <+131>:	test   %r12,%r12
   0x000000000043532a <+138>:	je     0x435384 <_D6matrix5mmultFAAiAAiAAiZv+228>
   0x000000000043532c <+140>:	nopl   0x0(%rax)
   0x0000000000435335 <+149>:	xor    %eax,%eax
   0x0000000000435340 <+160>:	mov    %r8,%rsi
   0x0000000000435343 <+163>:	mov    %rax,%r8
   0x000000000043534a <+170>:	shl    $0x4,%r8
   0x000000000043535e <+190>:	cmp    %rdx,%r8
   0x0000000000435361 <+193>:	mov    %rsi,%rax
   0x0000000000435364 <+196>:	jne    0x435340 <_D6matrix5mmultFAAiAAiAAiZv+160>
   0x0000000000435366 <+198>:	lea    0x1(%r11),%rax
   0x0000000000435378 <+216>:	test   %r12,%r12
   0x0000000000435382 <+226>:	jne    0x435330 <_D6matrix5mmultFAAiAAiAAiZv+144>
   0x0000000000435384 <+228>:	lea    0x1(%r11),%rax

64	            {
65	                val += m1i[k] * m2k[j];
   0x0000000000435346 <+166>:	mov    (%r10,%rax,4),%eax
   0x000000000043534e <+174>:	mov    0x8(%rcx,%r8,1),%r8
   0x0000000000435353 <+179>:	imul   (%r8,%r9,1),%eax
   0x0000000000435358 <+184>:	lea    0x1(%rsi),%r8
   0x000000000043535c <+188>:	add    %eax,%edi

66	            }
67	            m3ij = val;
   0x000000000043536a <+202>:	mov    %edi,(%rbx)
   0x000000000043538d <+237>:	mov    %edi,(%rbx)

68	        }
69	    }
70	}
   0x00000000004353aa <+266>:	pop    %rbx
   0x00000000004353ab <+267>:	pop    %rbp
   0x00000000004353ac <+268>:	pop    %r12
   0x00000000004353ae <+270>:	pop    %r13
   0x00000000004353b0 <+272>:	pop    %r14
   0x00000000004353b2 <+274>:	pop    %r15
   0x00000000004353b4 <+276>:	retq
   0x00000000004353b5:	data32 nopw %cs:0x0(%rax,%rax,1)

End of assembler dump.
(gdb)


### g++ version of mmult:

(gdb) disas /m mmult
Dump of assembler code for function mmult(int, int, int**, int**, int**):
36	int **mmult(int rows, int cols, int **m1, int **m2, int **m3) {
   0x0000000000400a10 <+0>:	push   %r14
   0x0000000000400a14 <+4>:	push   %r13
   0x0000000000400a16 <+6>:	mov    %r8,%r13
   0x0000000000400a19 <+9>:	push   %r12
   0x0000000000400a1b <+11>:	push   %rbp
   0x0000000000400a1c <+12>:	push   %rbx
   0x0000000000400a1d <+13>:	mov    %edi,%ebx
   0x0000000000400a21 <+17>:	lea    -0x1(%rsi),%eax
   0x0000000000400a24 <+20>:	mov    %rdx,%r12
   0x0000000000400a27 <+23>:	xor    %ebp,%ebp
   0x0000000000400a29 <+25>:	lea    0x4(,%rax,4),%rdi
   0x0000000000400a40 <+48>:	xor    %r9d,%r9d
   0x0000000000400a43 <+51>:	xor    %r11d,%r11d
   0x0000000000400a46 <+54>:	nopw   %cs:0x0(%rax,%rax,1)

37	    int i, j, k, val;
38	    for (i=0; i<rows; i++) {
   0x0000000000400a12 <+2>:	test   %edi,%edi
   0x0000000000400a1f <+15>:	jle    0x400a7e <mmult(int, int, int**, int**, int**)+110>
   0x0000000000400a7a <+106>:	cmp    %ebp,%ebx
   0x0000000000400a7c <+108>:	jg     0x400a31 <mmult(int, int, int**, int**, int**)+33>

39		for (j=0; j<cols; j++) {
   0x0000000000400a31 <+33>:	test   %esi,%esi
   0x0000000000400a33 <+35>:	jle    0x400a76 <mmult(int, int, int**, int**, int**)+102>
   0x0000000000400a35 <+37>:	mov    (%r12,%rbp,8),%r8
   0x0000000000400a39 <+41>:	mov    0x0(%r13,%rbp,8),%rdx
   0x0000000000400a3e <+46>:	xor    %eax,%eax
   0x0000000000400a71 <+97>:	cmp    %rdi,%rax
   0x0000000000400a74 <+100>:	jne    0x400a40 <mmult(int, int, int**, int**, int**)+48>
   0x0000000000400a76 <+102>:	add    $0x1,%rbp

40		    val = 0;
41		    for (k=0; k<cols; k++) {
   0x0000000000400a64 <+84>:	cmp    %r9d,%esi
   0x0000000000400a67 <+87>:	jg     0x400a50 <mmult(int, int, int**, int**, int**)+64>

42			val += m1[i][k] * m2[k][j];
   0x0000000000400a50 <+64>:	mov    (%rcx,%r9,8),%r14
   0x0000000000400a54 <+68>:	mov    (%r8,%r9,4),%r10d
   0x0000000000400a58 <+72>:	add    $0x1,%r9
   0x0000000000400a5c <+76>:	imul   (%r14,%rax,1),%r10d
   0x0000000000400a61 <+81>:	add    %r10d,%r11d

43		    }
44		    m3[i][j] = val;
   0x0000000000400a69 <+89>:	mov    %r11d,(%rdx,%rax,1)
   0x0000000000400a6d <+93>:	add    $0x4,%rax

45		}
46	    }
47	    return(m3);
48	}
   0x0000000000400a7e <+110>:	pop    %rbx
   0x0000000000400a7f <+111>:	pop    %rbp
   0x0000000000400a80 <+112>:	pop    %r12
   0x0000000000400a82 <+114>:	mov    %r13,%rax
   0x0000000000400a85 <+117>:	pop    %r13
   0x0000000000400a87 <+119>:	pop    %r14
   0x0000000000400a89 <+121>:	retq

End of assembler dump.
(gdb)
« First   ‹ Prev
1 2 3