While porting some code from C to D, I found the inline assembler very convenient. This is the C code (using an external NASM file):
// dot_product returns dot product t*w of n elements. n is rounded
// up to a multiple of 8. Result is scaled down by 8 bits.
#ifdef NOASM // no assembly language
// Portable reference implementation (used when NOASM is defined).
//   t, w : input vectors of 16-bit values; both must be readable up to
//          n rounded up to the next multiple of 8.
//   n    : number of elements.
// Returns the sum over adjacent element pairs of
// (t[i]*w[i] + t[i+1]*w[i+1]) >> 8.
int dot_product(short *t, short *w, int n) {
  int sum=0;
  // Round n up to a multiple of 8. The mask must be -8 (all bits set except
  // the low three); masking with 8 keeps a single bit, so the result could
  // only ever be 0 or 8 and e.g. n=16 produced an empty loop.
  n=(n+7)&-8;
  for (int i=0; i<n; i+=2) {
    // Debug trace, kept unchanged; `lol` is not declared in this block.
    if (lol >= 21567) printf("dp %d %d %d %d %d %d\n", n, i, t[i], w[i], t[i+1], w[i+1]);
    // Each pair of products is scaled down by 8 bits before accumulating.
    sum+=(t[i]*w[i]+t[i+1]*w[i+1]) >> 8;
  }
  return sum;
}
#else // The NASM version uses MMX and is about 8 times faster.
extern "C" int dot_product(short *t, short *w, int n); // in NASM
#endif
In D, I can move the ASM inside the function, so there is no need for two declarations:
/**
 * Scaled dot product of two vectors of 16-bit integers.
 *
 * n is rounded up to a multiple of 8, so both buffers must be readable up to
 * that rounded length. Adjacent products are summed in pairs and each pair
 * sum is shifted right by 8 bits before being accumulated.
 *
 * Params:
 *   t = first vector
 *   w = second vector
 *   n = number of elements; n <= 0 yields 0
 * Returns: the sum over even i of (t[i]*w[i] + t[i+1]*w[i+1]) >> 8
 *
 * The inline-asm path reads its arguments from RDI, RSI and EDX, i.e. it
 * relies on the System V AMD64 extern(C) register assignment, and it uses
 * movdqa / pmaddwd with memory operands, so t and w must be 16-byte aligned
 * on that path.
 * NOTE(review): Win64 passes arguments in different registers; confirm the
 * target ABI before building the asm path there.
 */
extern (C) int dot_product(short *t, short *w, const int n) {
    version (D_InlineAsm_X86_64) asm {
        naked;
        xor EAX, EAX;               // result is 0 if we leave early
        mov ECX, EDX;               // n is a 32-bit int: ignore the upper half of RDX
        test ECX, ECX;
        jle done;                   // n <= 0: nothing to sum
        add ECX, 7;
        and ECX, -8;                // round n up to a multiple of 8
        sub RDI, 16;                // bias both pointers by one 16-byte group so that
        sub RSI, 16;                // [ptr + RCX*2] addresses the last unprocessed group
        pxor XMM0, XMM0;            // four 32-bit partial sums = 0
    next8:                          // each iteration consumes 8 elements, walking backwards
        movdqa XMM1, [RDI+RCX*2];   // load 8 elements of t
        pmaddwd XMM1, [RSI+RCX*2];  // 4 dwords, each t[i]*w[i] + t[i+1]*w[i+1]
        psrad XMM1, 8;              // scale each pair sum down by 8 bits
        paddd XMM0, XMM1;
        sub RCX, 8;
        ja next8;                   // RCX is a multiple of 8, so this ends exactly at 0
        movdqa XMM1, XMM0;          // fold the 4 partial sums into lane 0
        psrldq XMM1, 8;
        paddd XMM0, XMM1;
        movdqa XMM1, XMM0;
        psrldq XMM1, 4;
        paddd XMM0, XMM1;
        movq RAX, XMM0;             // int result is the low 32 bits (EAX)
    done:
        ret;
    } else {
        // Portable fallback; rounds n up to a multiple of 8 like the asm path.
        const int len = (n + 7) & -8;
        int sum = 0;
        for (int i = 0; i < len; i += 2) {
            sum += (t[i]*w[i] + t[i+1]*w[i+1]) >> 8;
        }
        return sum;
    }
}
This example also shows why 'naked' should probably not be applied to the function declaration: the function contains non-asm code as well. (It could be "naked asm", though.) For compatibility with GDC (and, in fact, with the original NASM code), I used extern(C) here as the parameter-passing convention.
This may also serve as a practical use case for vector operations.
