April 10, 2015
Optimisation question
void mul(float[] a, float v)
{
    if ((cast(size_t)a.ptr) % 32 == 0
        && a.length == 16)
    {
        foreach (ref el; a)
            el *= v;
    }
}
with
-Ofast -march=broadwell -frelease
becomes
void example.mul(float[], float):
movq %rsi, %rax
andl $31, %eax
jne .L44
cmpq $16, %rdi
jne .L44
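# compute how many scalar iterations are needed to reach 32-byte
# alignment (always 0 here, given the alignment check above)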
shrq $2, %rax
negq %rax
andl $7, %eax
je .L10
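# fully unrolled scalar head: peel up to 7 leading elements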
vmulss (%rsi), %xmm0, %xmm1
vmovss %xmm1, (%rsi)
cmpq $1, %rax
je .L11
vmulss 4(%rsi), %xmm0, %xmm1
vmovss %xmm1, 4(%rsi)
cmpq $2, %rax
je .L12
vmulss 8(%rsi), %xmm0, %xmm1
vmovss %xmm1, 8(%rsi)
cmpq $3, %rax
je .L13
vmulss 12(%rsi), %xmm0, %xmm1
vmovss %xmm1, 12(%rsi)
cmpq $4, %rax
je .L14
vmulss 16(%rsi), %xmm0, %xmm1
vmovss %xmm1, 16(%rsi)
cmpq $5, %rax
je .L15
vmulss 20(%rsi), %xmm0, %xmm1
vmovss %xmm1, 20(%rsi)
cmpq $6, %rax
je .L16
vmulss 24(%rsi), %xmm0, %xmm1
movl $9, %edx
movl $7, %r9d
vmovss %xmm1, 24(%rsi)
.L5:
movl $16, %edi
movl $8, %r8d
movl $1, %r10d
subq %rax, %rdi
.L4:
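# vectorised body: broadcast v, then up to two 8-float AVX multiplies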
leaq (%rsi,%rax,4), %rcx
vbroadcastss %xmm0, %ymm1
vmulps (%rcx), %ymm1, %ymm2
vmovaps %ymm2, (%rcx)
cmpq $1, %r10
je .L6
vmulps 32(%rcx), %ymm1, %ymm1
vmovaps %ymm1, 32(%rcx)
.L6:
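# fully unrolled scalar tail: up to 7 trailing elements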
leaq (%r9,%r8), %rax
subq %r8, %rdx
cmpq %r8, %rdi
je .L43
leaq (%rsi,%rax,4), %rcx
vmulss (%rcx), %xmm0, %xmm1
vmovss %xmm1, (%rcx)
leaq 1(%rax), %rcx
cmpq $1, %rdx
je .L43
leaq (%rsi,%rcx,4), %rcx
vmulss (%rcx), %xmm0, %xmm1
vmovss %xmm1, (%rcx)
leaq 2(%rax), %rcx
cmpq $2, %rdx
je .L43
leaq (%rsi,%rcx,4), %rcx
vmulss (%rcx), %xmm0, %xmm1
vmovss %xmm1, (%rcx)
leaq 3(%rax), %rcx
cmpq $3, %rdx
je .L43
leaq (%rsi,%rcx,4), %rcx
vmulss (%rcx), %xmm0, %xmm1
vmovss %xmm1, (%rcx)
leaq 4(%rax), %rcx
cmpq $4, %rdx
je .L43
leaq (%rsi,%rcx,4), %rcx
vmulss (%rcx), %xmm0, %xmm1
vmovss %xmm1, (%rcx)
leaq 5(%rax), %rcx
cmpq $5, %rdx
je .L43
leaq (%rsi,%rcx,4), %rcx
addq $6, %rax
vmulss (%rcx), %xmm0, %xmm1
vmovss %xmm1, (%rcx)
cmpq $6, %rdx
je .L43
leaq (%rsi,%rax,4), %rax
vmulss (%rax), %xmm0, %xmm0
vmovss %xmm0, (%rax)
vzeroupper
ret
.L43:
vzeroupper
.L44:
ret
.L10:
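# no head peeling needed (the only reachable case here):
# run the vector body over all 16 elements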
movl $16, %r8d
movl $2, %r10d
movl $16, %edi
movl $16, %edx
xorl %r9d, %r9d
jmp .L4
.L11:
movl $15, %edx
movl $1, %r9d
jmp .L5
.L16:
movl $10, %edx
movl $6, %r9d
jmp .L5
.L15:
movl $11, %edx
movl $5, %r9d
jmp .L5
.L14:
movl $12, %edx
movl $4, %r9d
jmp .L5
.L13:
movl $13, %edx
movl $3, %r9d
jmp .L5
.L12:
movl $14, %edx
movl $2, %r9d
jmp .L5
Which seems like an awful lot of code, wouldn't you say? Despite the explicit alignment and length checks, the vectoriser still wraps the two-instruction vector body in a fully unrolled scalar head and tail, plus all the dispatch code to drive them.
I was expecting something along the lines of this (untested):
void example.mul(float[], float):
testb $31, %sil
jne .L44
cmpq $16, %rdi
jne .L44
vbroadcastss %xmm0, %ymm2
vmulps (%rsi), %ymm2, %ymm0
vmulps 32(%rsi), %ymm2, %ymm1
vmovaps %ymm0, (%rsi)
vmovaps %ymm1, 32(%rsi)
vzeroupper
.L44:
ret
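In source terms, that is roughly the following (equally untested; __vector(float[8]) is GDC's built-in vector extension and needs -mavx or better, and the float8/mulSimd names are just mine for illustration):

alias float8 = __vector(float[8]);

void mulSimd(float[] a, float v)
{
    if ((cast(size_t)a.ptr) % 32 == 0
        && a.length == 16)
    {
        auto p = cast(float8*)a.ptr;
        float8 vv = v;  // broadcast v across all 8 lanes (vbroadcastss)
        p[0] *= vv;     // first 8 floats: one vmulps
        p[1] *= vv;     // second 8 floats: one vmulps
    }
}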
Am I being stupid, or is the optimiser making a complete hash of things?
April 10, 2015
Re: Optimisation question
Posted in reply to John Colvin

On 10 April 2015 at 20:18, John Colvin via D.gnu <d.gnu@puremagic.com> wrote:
> void mul(float[] a, float v)
> {
>     if ((cast(size_t)a.ptr) % 32 == 0
>         && a.length == 16)
>     {
>         foreach (ref el; a)
>             el *= v;
>     }
> }
>
> [...]
>
> Am I being stupid, or is the optimiser making a complete hash of things?
I fear I cannot reproduce this on gcc-5; maybe it's a problem specific to your gcc version?
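For reference, a compile line along these lines shows it (the -S/-o usage and the file names are my illustration; the optimisation flags are the ones from your post):

gdc -Ofast -march=broadwell -frelease -S -o mul.s mul.d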
_D6nested3mulFAffZv:
testb $31, %sil
jne .L8
cmpq $16, %rdi
jne .L8
vbroadcastss %xmm0, %ymm0
vmulps (%rsi), %ymm0, %ymm1
vmulps 32(%rsi), %ymm0, %ymm0
vmovaps %ymm1, (%rsi)
vmovaps %ymm0, 32(%rsi)
vzeroupper
.L8:
ret
Iain.