Ilya Yaroshenko
| Mir Algorithm and Mir GLAS (GLAS is experimental) were added to https://d.godbolt.org
by Johan Engelen. Thank you, Johan!
Try it:
1. Select mir-algorithm 0.6.13 from the libraries list (after Intel button)
2. Select LDC 1.4.0
3. Add compiler flags: -O -release -mcpu=cannonlake -linkonce-templates -betterC
4. Add your code
5. Enjoy AVX512 instructions with fused math :)
-------
// Euclidean norm
import mir.ndslice;
import mir.math.common;
// Returns the square root of the sum of squares of the elements of x.
// The fold starts from the seed 0.0 and applies "a + b * b" per element
// (presumably a = running sum, b = current element — verify against the
// mir.ndslice reduce documentation).
@fastmath double norm2(ContiguousVector!double x) {
return 0.0.reduce!"a + b * b"(x).sqrt;
}
--------
Output:
# Compiler-generated code for norm2 (Intel operand order: op dst, src).
# The slice argument is read from the stack: the qword at [rsp + 8] is used
# as the element count (rax) and the qword at [rsp + 16] as the data pointer
# (rcx). The result is returned in xmm0.
double example.norm2(mir.ndslice.slice.Slice!(2, [1], double*).Slice):
mov rax, qword ptr [rsp + 8]
test rax, rax
# Count == 0: return sqrt(0.0) via .LBB0_1.
je .LBB0_1
lea rcx, [rsp + 8]
mov rcx, qword ptr [rcx + 8]
# Scalar accumulator starts at 0.0.
vxorpd xmm0, xmm0, xmm0
# Fewer than 32 elements: skip the vector path and use the scalar loop only.
cmp rax, 32
jb .LBB0_12
# r8 = rsi = count rounded down to a multiple of 32 (elements handled by
# the vector path; 32 doubles = 4 zmm registers of 8 doubles each).
mov r8, rax
and r8, -32
mov rsi, rax
and rsi, -32
je .LBB0_12
# rdx = (rsi - 32) / 32 = number of 32-element chunks beyond the first.
lea rdi, [rsi - 32]
mov rdx, rdi
shr rdx, 5
# The main loop consumes 64 elements per iteration. Bit 5 of (rsi - 32) set
# means the chunk count is even, so no chunk needs peeling: go zero the
# accumulators at .LBB0_5. Otherwise fall through and peel one chunk.
bt edi, 5
jb .LBB0_5
# Peeled first chunk: zmm0..zmm3 = x[0..31] squared; r9 = 32 elements consumed.
vmovupd zmm0, zmmword ptr [rcx]
vmovupd zmm1, zmmword ptr [rcx + 64]
vmovupd zmm2, zmmword ptr [rcx + 128]
vmovupd zmm3, zmmword ptr [rcx + 192]
vmulpd zmm0, zmm0, zmm0
vmulpd zmm1, zmm1, zmm1
vmulpd zmm2, zmm2, zmm2
vmulpd zmm3, zmm3, zmm3
mov r9d, 32
# More chunks remain -> main loop; otherwise go straight to the reduction.
test rdx, rdx
jne .LBB0_8
jmp .LBB0_10
# Empty input: xmm0 = sqrt(0.0).
.LBB0_1:
vxorps xmm0, xmm0, xmm0
vsqrtsd xmm0, xmm0, xmm0
ret
# No peeled chunk: clear the four vector accumulators, r9 = 0 elements consumed.
.LBB0_5:
vxorpd zmm0, zmm0, zmm0
xor r9d, r9d
vxorpd zmm1, zmm1, zmm1
vxorpd zmm2, zmm2, zmm2
vxorpd zmm3, zmm3, zmm3
test rdx, rdx
je .LBB0_10
# Main loop setup: rdi = elements left for the vector loop (rsi - r9);
# rdx = address of element r9 biased by +448 bytes, so the eight 64-byte
# loads below use displacements -448 .. 0.
.LBB0_8:
mov rdi, rsi
sub rdi, r9
lea rdx, [rcx + 8*r9 + 448]
# Main loop: 64 doubles per iteration, as two groups of four zmm loads.
.LBB0_9:
vmovupd zmm4, zmmword ptr [rdx - 448]
vmovupd zmm5, zmmword ptr [rdx - 384]
vmovupd zmm6, zmmword ptr [rdx - 320]
vmovupd zmm7, zmmword ptr [rdx - 256]
# vfmadd213pd d, a, b computes d = a * d + b, so zmm4..7 = x*x + zmm0..3.
vfmadd213pd zmm4, zmm4, zmm0
vfmadd213pd zmm5, zmm5, zmm1
vfmadd213pd zmm6, zmm6, zmm2
vfmadd213pd zmm7, zmm7, zmm3
vmovupd zmm0, zmmword ptr [rdx - 192]
vmovupd zmm1, zmmword ptr [rdx - 128]
vmovupd zmm2, zmmword ptr [rdx - 64]
vmovupd zmm3, zmmword ptr [rdx]
# Second group: zmm0..3 = x*x + zmm4..7, so the running sums end up back in zmm0..3.
vfmadd213pd zmm0, zmm0, zmm4
vfmadd213pd zmm1, zmm1, zmm5
vfmadd213pd zmm2, zmm2, zmm6
vfmadd213pd zmm3, zmm3, zmm7
# Advance 512 bytes (64 doubles) and count down the remaining elements.
add rdx, 512
add rdi, -64
jne .LBB0_9
# Reduction: combine the four accumulators, then fold the 8 lanes of zmm0
# in halves (512 -> 256 -> 128 -> 64 bits) so that lane 0 holds the total.
.LBB0_10:
vaddpd zmm0, zmm0, zmm2
vaddpd zmm1, zmm1, zmm3
vaddpd zmm0, zmm0, zmm1
# imm 14: bring the upper 256 bits of zmm0 down to the lower 256 bits.
vshuff64x2 zmm1, zmm0, zmm0, 14
vaddpd zmm0, zmm0, zmm1
# imm 238: within each 256-bit half, select elements 2,3,2,3.
vpermpd zmm1, zmm0, 238
vaddpd zmm0, zmm0, zmm1
# imm 1: place element 1 in position 0 so the final add sums the last pair.
vpermilpd zmm1, zmm0, 1
vaddpd zmm0, zmm0, zmm1
# If count was an exact multiple of 32 there is no tail: go take the sqrt.
cmp rax, rsi
je .LBB0_13
# Tail setup: rax = leftover element count, rcx = address of first leftover element.
sub rax, r8
lea rcx, [rcx + 8*rsi]
# Scalar loop: xmm0 += x[i] * x[i] (vfmadd231sd d, a, b computes d = a * b + d).
.LBB0_12:
vmovsd xmm1, qword ptr [rcx]
vfmadd231sd xmm0, xmm1, xmm1
add rcx, 8
add rax, -1
jne .LBB0_12
# Final result: square root of the accumulated sum of squares.
.LBB0_13:
vsqrtsd xmm0, xmm0, xmm0
ret
Best regards,
Ilya
|