July 12, 2013
Re: DConf 2013 Day 3 Talk 5: Effective SIMD for modern architectures by Manu Evans
Posted in reply to Manu

Manu:
> This is interesting. I didn't know about this.

I have taken a look at this page:
https://github.com/ispc/ispc

There is a free compiler binary for various operating systems:
http://ispc.github.io/downloads.html

I have tried the Windows compiler on some of the given code examples, and it works! The resulting asm is excellent; normal compilers for the usual languages aren't able to produce asm even nearly as good. To try the example code I compile like this:

ispc.exe --emit-asm stencil.ispc -o stencil.s

Or like this, to see the AVX2 instructions:

ispc.exe --target=avx2 --emit-asm stencil.ispc -o stencil.s

As an example, it compiles a function like this:

static void stencil_step(uniform int x0, uniform int x1,
                         uniform int y0, uniform int y1,
                         uniform int z0, uniform int z1,
                         uniform int Nx, uniform int Ny, uniform int Nz,
                         uniform const float coef[4], uniform const float vsq[],
                         uniform const float Ain[], uniform float Aout[]) {
    const uniform int Nxy = Nx * Ny;

    foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
        int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
        float div = coef[0] * A_cur(0, 0, 0) +
            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
                       A_cur(0, 0, +3) + A_cur(0, 0, -3));

        A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
            vsq[index] * div;
    }
}
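A quick note for readers who haven't seen ispc before: as far as I understand from its docs, "uniform" values are scalars shared by all SIMD lanes of the gang, everything else is "varying" (one value per lane), and foreach spreads the iteration space across the lanes. The stencil_step function above is static, so in a real program it would be reached through an "export" function that C/C++ code can call with plain C linkage. Here is a minimal sketch of that model; the file and function names are made up by me, it's not code from the ispc distribution:

// scale.ispc -- hypothetical minimal example, compiled e.g. with:
//   ispc.exe --target=avx2 --emit-asm scale.ispc -o scale.s
// "export" gives the function C linkage, so C/C++ code can call it directly.
export void scale(uniform float vout[], uniform const float vin[],
                  uniform int count, uniform float k) {
    // count and k are uniform: one scalar value shared by all lanes.
    foreach (i = 0 ... count) {
        // i is varying: each lane gets a different index, so one
        // iteration of this loop body processes a whole vector.
        vout[i] = vin[i] * k;
    }
}

This is essentially the same model as the stencil code above, just without the 3D index arithmetic.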
The stencil function above compiles to asm like this (using SSE4):

        pshufd  $0, %xmm7, %xmm9        # xmm9 = xmm7[0,0,0,0]
        cmpl    156(%rsp), %r8d         # 4-byte Folded Reload
        movups  (%r14,%rdi), %xmm0
        mulps   %xmm6, %xmm5
        pshufd  $0, %xmm2, %xmm2        # xmm2 = xmm2[0,0,0,0]
        movq    424(%rsp), %rdx
        movups  (%rdx,%r15), %xmm6
        mulps   %xmm3, %xmm2
        pshufd  $0, %xmm10, %xmm3       # xmm3 = xmm10[0,0,0,0]
        mulps   %xmm11, %xmm3
        addps   %xmm2, %xmm3
        addps   %xmm5, %xmm3
        addps   %xmm4, %xmm0
        movslq  %eax, %rax
        movups  (%r14,%rax), %xmm2
        addps   %xmm0, %xmm2
        movslq  %ebp, %rax
        movups  (%r14,%rax), %xmm0
        addps   %xmm2, %xmm0
        mulps   %xmm9, %xmm0
        addps   %xmm3, %xmm0
        mulps   %xmm6, %xmm0
        addps   %xmm1, %xmm0
        movups  %xmm0, (%r11,%r15)
        jl      .LBB0_5
.LBB0_6:                                # %partial_inner_all_outer
                                        # in Loop: Header=BB0_4 Depth=2
        movq    %r13, %rbp
        movl    156(%rsp), %r13d        # 4-byte Reload
        movq    424(%rsp), %r9
        cmpl    64(%rsp), %r8d          # 4-byte Folded Reload
        movq    %r11, %rbx
        jge     .LBB0_257
# BB#7:                                 # %partial_inner_only
                                        # in Loop: Header=BB0_4 Depth=2
        movd    %r8d, %xmm0
        movl    84(%rsp), %r15d         # 4-byte Reload
        imull   400(%rsp), %r15d
        pshufd  $0, %xmm0, %xmm0        # xmm0 = xmm0[0,0,0,0]
        paddd   .LCPI0_1(%rip), %xmm0
        movdqa  %xmm8, %xmm1
        pcmpgtd %xmm0, %xmm1
        movmskps %xmm1, %edi
        addl    68(%rsp), %r15d         # 4-byte Folded Reload
        leal    (%r8,%r15), %r10d

Or using AVX2 (this is not exactly the equivalent piece of code; the asm generated with AVX2 is usually significantly shorter):

        movslq  %edx, %rdx
        vaddps  (%rax,%rdx), %ymm5, %ymm5
        movl    52(%rsp), %edx          # 4-byte Reload
        leal    (%rdx,%r15), %edx
        movslq  %edx, %rdx
        vaddps  (%rax,%rdx), %ymm5, %ymm5
        vaddps  %ymm11, %ymm6, %ymm9
        vaddps  %ymm10, %ymm8, %ymm10
        vmovups (%rax,%rdi), %ymm12
        addl    $32, %r15d
        vmovups (%r11,%rcx), %ymm6
        movq    400(%rsp), %rdx
        vbroadcastss 12(%rdx), %ymm7
        vbroadcastss 8(%rdx), %ymm8
        vbroadcastss (%rdx), %ymm11
        vaddps  %ymm12, %ymm10, %ymm10
        cmpl    48(%rsp), %r14d         # 4-byte Folded Reload
        vbroadcastss 4(%rdx), %ymm12
        vmulps  %ymm9, %ymm12, %ymm9
        vfmadd213ps %ymm9, %ymm3, %ymm11
        vfmadd213ps %ymm11, %ymm8, %ymm10
        vfmadd213ps %ymm10, %ymm5, %ymm7
        vfmadd213ps %ymm4, %ymm6, %ymm7
        vmovups %ymm7, (%r8,%rcx)
        jl      .LBB0_8
# BB#4:                                 # %partial_inner_all_outer.us
                                        # in Loop: Header=BB0_7 Depth=2
        cmpl    40(%rsp), %r14d         # 4-byte Folded Reload
        movq    400(%rsp), %r10
        jge     .LBB0_6
# BB#5:                                 # %partial_inner_only.us
                                        # in Loop: Header=BB0_7 Depth=2
        vmovd   %r14d, %xmm3
        vbroadcastss %xmm3, %ymm3
        movl    100(%rsp), %ecx         # 4-byte Reload
        leal    (%rcx,%r15), %ecx
        vpaddd  %ymm2, %ymm3, %ymm3

Sometimes it gives performance warnings:

rt.ispc:257:9: Performance Warning: Scatter required to store value.
        image[offset] = ray.maxt;
        ^^^^^^^^^^^^^
rt.ispc:258:9: Performance Warning: Scatter required to store value.
        id[offset] = ray.hitId;
        ^^^^^^^^^^
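Those scatter warnings appear because the stored-to address is computed per lane (offset differs from one ray to the next), so the compiler can't use a single contiguous vector store. A tiny sketch of the difference, with made-up names (my own example, not code from the ray tracer):

// stores.ispc -- hypothetical sketch: a store at the foreach index stays a
// single packed vector store, while a store through a per-lane index needs
// a scatter (and the load of remap[i] needs a gather).
export void stores(uniform float a[], uniform float b[],
                   uniform const int remap[], uniform int count) {
    foreach (i = 0 ... count) {
        a[i] = (float)i;           // contiguous: one vector store
        b[remap[i]] = (float)i;    // indices vary per lane: scatter
    }
}

So the warning is a hint that the data layout (or the loop) may be worth restructuring if that store is on a hot path.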
A bit larger example, with this function from a little ray tracer:

static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
    uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
    uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
    uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
    uniform float3 e1 = p1 - p0;
    uniform float3 e2 = p2 - p0;

    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
    bool hit = true;

    if (divisor == 0.)
        hit = false;
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
    float3 d = ray.origin - p0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        hit = false;

    // Compute second barycentric coordinate
    float3 s2 = Cross(d, e1);
    float b2 = Dot(ray.dir, s2) * invDivisor;
    if (b2 < 0. || b1 + b2 > 1.)
        hit = false;

    // Compute _t_ to intersection point
    float t = Dot(e2, s2) * invDivisor;
    if (t < ray.mint || t > ray.maxt)
        hit = false;

    if (hit) {
        ray.maxt = t;
        ray.hitId = tri.id;
    }
    return hit;
}

The (more or less) complete asm with AVX2 for that function:

"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]": # @"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]"
# BB#0:                                 # %allocas
        subq    $248, %rsp
        vmovaps %xmm15, 80(%rsp)        # 16-byte Spill
        vmovaps %xmm14, 96(%rsp)        # 16-byte Spill
        vmovaps %xmm13, 112(%rsp)       # 16-byte Spill
        vmovaps %xmm12, 128(%rsp)       # 16-byte Spill
        vmovaps %xmm11, 144(%rsp)       # 16-byte Spill
        vmovaps %xmm10, 160(%rsp)       # 16-byte Spill
        vmovaps %xmm9, 176(%rsp)        # 16-byte Spill
        vmovaps %xmm8, 192(%rsp)        # 16-byte Spill
        vmovaps %xmm7, 208(%rsp)        # 16-byte Spill
        vmovaps %xmm6, 224(%rsp)        # 16-byte Spill
        vmovss  (%rcx), %xmm0
        vmovss  16(%rcx), %xmm2
        vinsertps $16, 4(%rcx), %xmm0, %xmm0
        vinsertps $32, 8(%rcx), %xmm0, %xmm0
        vinsertf128 $1, %xmm0, %ymm0, %ymm10
        vmovaps (%rdx), %ymm0
        vmovaps 32(%rdx), %ymm3
        vmovaps 64(%rdx), %ymm1
        vmovaps 96(%rdx), %ymm9
        vpbroadcastd .LCPI1_0(%rip), %ymm12
        vxorps  %ymm4, %ymm4, %ymm4
        vpermps %ymm10, %ymm4, %ymm4
        vsubps  %ymm4, %ymm0, %ymm0
        vpermps %ymm10, %ymm12, %ymm4
        vsubps  %ymm4, %ymm3, %ymm13
        vmovups %ymm13, (%rsp)          # 32-byte Folded Spill
        vpbroadcastd .LCPI1_1(%rip), %ymm8
        vpermps %ymm10, %ymm8, %ymm3
        vsubps  %ymm3, %ymm1, %ymm6
        vmovss  32(%rcx), %xmm1
        vinsertps $16, 36(%rcx), %xmm1, %xmm1
        vinsertps $32, 40(%rcx), %xmm1, %xmm1
        vinsertf128 $1, %xmm0, %ymm1, %ymm1
        vsubps  %ymm10, %ymm1, %ymm15
        vbroadcastss %xmm15, %ymm3
        vmovups %ymm3, 32(%rsp)         # 32-byte Folded Spill
        vmovaps 128(%rdx), %ymm11
        vpermps %ymm15, %ymm12, %ymm5
        vmulps  %ymm3, %ymm11, %ymm1
        vmovaps %ymm3, %ymm4
        vmovaps %ymm5, %ymm7
        vfmsub213ps %ymm1, %ymm9, %ymm7
        vinsertps $16, 20(%rcx), %xmm2, %xmm1
        vinsertps $32, 24(%rcx), %xmm1, %xmm1
        vinsertf128 $1, %xmm0, %ymm1, %ymm1
        vsubps  %ymm10, %ymm1, %ymm1
        vpermps %ymm1, %ymm12, %ymm14
        vpermps %ymm1, %ymm8, %ymm12
        vmulps  %ymm6, %ymm14, %ymm3
        vmovaps %ymm13, %ymm2
        vfmsub213ps %ymm3, %ymm12, %ymm2
        vbroadcastss %xmm1, %ymm1
        vmulps  %ymm0, %ymm12, %ymm3
        vmovaps %ymm6, %ymm13
        vfmsub213ps %ymm3, %ymm1, %ymm13
        vmulps  %ymm13, %ymm11, %ymm3
        vmovaps %ymm2, %ymm10
        vfmadd213ps %ymm3, %ymm9, %ymm10
        vpermps %ymm15, %ymm8, %ymm8
        vmulps  %ymm8, %ymm9, %ymm3
        vmovaps 160(%rdx), %ymm9
        vmulps  %ymm5, %ymm9, %ymm15
        vfmsub213ps %ymm15, %ymm8, %ymm11
        vmovaps %ymm4, %ymm15
        vfmsub213ps %ymm3, %ymm9, %ymm15
        vmulps  %ymm15, %ymm14, %ymm4
        vmovaps %ymm11, %ymm3
        vfmadd213ps %ymm4, %ymm1, %ymm3
        vfmadd213ps %ymm3, %ymm7, %ymm12
        vmovups (%rsp), %ymm4           # 32-byte Folded Reload
        vmulps  %ymm4, %ymm15, %ymm3
        vfmadd213ps %ymm3, %ymm0, %ymm11
        vfmadd213ps %ymm11, %ymm6, %ymm7
        vmulps  %ymm4, %ymm1, %ymm1
        vfmsub213ps %ymm1, %ymm14, %ymm0
        vbroadcastss .LCPI1_2(%rip), %ymm1
        vrcpps  %ymm12, %ymm3
        vmovaps %ymm12, %ymm4
        vfnmadd213ps %ymm1, %ymm3, %ymm4
        vmulps  %ymm4, %ymm3, %ymm4
        vxorps  %ymm14, %ymm14, %ymm14
        vcmpeqps %ymm14, %ymm12, %ymm1
        vcmpunordps %ymm14, %ymm12, %ymm3
        vorps   %ymm1, %ymm3, %ymm11
        vmulps  %ymm13, %ymm5, %ymm1
        vmulps  %ymm4, %ymm7, %ymm5
        vmovups 32(%rsp), %ymm3         # 32-byte Folded Reload
        vfmadd213ps %ymm1, %ymm3, %ymm2
        vbroadcastss .LCPI1_3(%rip), %ymm1
        vcmpnleps %ymm1, %ymm5, %ymm3
        vcmpnleps %ymm5, %ymm14, %ymm6
        vorps   %ymm3, %ymm6, %ymm6
        vbroadcastss .LCPI1_0(%rip), %ymm3
        vblendvps %ymm11, %ymm14, %ymm3, %ymm7
        vfmadd213ps %ymm10, %ymm0, %ymm9
        vmovaps (%r8), %ymm3
        vmovmskps %ymm3, %eax
        leaq    352(%rdx), %r8
        cmpl    $255, %eax
        vmulps  %ymm9, %ymm4, %ymm9
        vaddps  %ymm9, %ymm5, %ymm10
        vmovups 320(%rdx), %ymm5
        vfmadd213ps %ymm2, %ymm8, %ymm0
        vblendvps %ymm6, %ymm14, %ymm7, %ymm6
        vpcmpeqd %ymm2, %ymm2, %ymm2
        vcmpnleps %ymm1, %ymm10, %ymm1
        vcmpnleps %ymm9, %ymm14, %ymm7
        vorps   %ymm1, %ymm7, %ymm1
        vblendvps %ymm1, %ymm14, %ymm6, %ymm6
        vmulps  %ymm0, %ymm4, %ymm1
        vcmpnleps 352(%rdx), %ymm1, %ymm0
        vcmpnleps %ymm1, %ymm5, %ymm4
        vorps   %ymm0, %ymm4, %ymm0
        vblendvps %ymm0, %ymm14, %ymm6, %ymm0
        vpcmpeqd %ymm14, %ymm0, %ymm4
        vpxor   %ymm2, %ymm4, %ymm2
        je      .LBB1_1
# BB#4:                                 # %some_on
        vpand   %ymm3, %ymm2, %ymm2
.LBB1_1:                                # %all_on
        vmovmskps %ymm2, %eax
        testl   %eax, %eax
        je      .LBB1_3
# BB#2:                                 # %safe_if_run_true356
        vmaskmovps %ymm1, %ymm2, (%r8)
        vpbroadcastd 48(%rcx), %ymm1
        vmaskmovps %ymm1, %ymm2, 384(%rdx)
.LBB1_3:                                # %safe_if_after_true
        vmovaps 224(%rsp), %xmm6        # 16-byte Reload
        vmovaps 208(%rsp), %xmm7        # 16-byte Reload
        vmovaps 192(%rsp), %xmm8        # 16-byte Reload
        vmovaps 176(%rsp), %xmm9        # 16-byte Reload
        vmovaps 160(%rsp), %xmm10       # 16-byte Reload
        vmovaps 144(%rsp), %xmm11       # 16-byte Reload
        vmovaps 128(%rsp), %xmm12       # 16-byte Reload
        vmovaps 112(%rsp), %xmm13       # 16-byte Reload
        vmovaps 96(%rsp), %xmm14        # 16-byte Reload
        vmovaps 80(%rsp), %xmm15        # 16-byte Reload
        addq    $248, %rsp
        ret
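Note how the if statements of TriIntersect don't turn into per-lane branches: ispc keeps an execution mask, the comparisons become the vcmpnleps/vblendvps sequences above, and the final "if (hit)" writes to ray.maxt and ray.hitId come out as masked stores (the vmaskmovps instructions), skipped entirely via vmovmskps/testl when no lane hit. A minimal sketch of the same per-lane control-flow pattern, with a made-up function name (not from the ray tracer):

// clamp.ispc -- hypothetical sketch of per-lane control flow: lanes that
// fail the test are masked off, and the assignment becomes a masked/blended
// vector store instead of a scalar branch.
export void clamp_negative(uniform float a[], uniform int count) {
    foreach (i = 0 ... count) {
        if (a[i] < 0.0f)    // per-lane condition -> execution mask
            a[i] = 0.0f;    // store happens only for the active lanes
    }
}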
Using SSE4, the asm of the same function starts like this:

"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]": # @"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]"
# BB#0:                                 # %allocas
        subq    $248, %rsp
        movaps  %xmm15, 80(%rsp)        # 16-byte Spill
        movaps  %xmm14, 96(%rsp)        # 16-byte Spill
        movaps  %xmm13, 112(%rsp)       # 16-byte Spill
        movaps  %xmm12, 128(%rsp)       # 16-byte Spill
        movaps  %xmm11, 144(%rsp)       # 16-byte Spill
        movaps  %xmm10, 160(%rsp)       # 16-byte Spill
        movaps  %xmm9, 176(%rsp)        # 16-byte Spill
        movaps  %xmm8, 192(%rsp)        # 16-byte Spill
        movaps  %xmm7, 208(%rsp)        # 16-byte Spill
        movaps  %xmm6, 224(%rsp)        # 16-byte Spill
        movss   (%rcx), %xmm1
        movss   16(%rcx), %xmm0
        insertps $16, 4(%rcx), %xmm1
        insertps $32, 8(%rcx), %xmm1
        movss   32(%rcx), %xmm7
        insertps $16, 36(%rcx), %xmm7
        insertps $32, 40(%rcx), %xmm7
        subps   %xmm1, %xmm7
        pshufd  $-86, %xmm1, %xmm2      # xmm2 = xmm1[2,2,2,2]
        pshufd  $85, %xmm1, %xmm3       # xmm3 = xmm1[1,1,1,1]
        movaps  (%rdx), %xmm4
        movaps  16(%rdx), %xmm12
        movaps  32(%rdx), %xmm5
        movaps  48(%rdx), %xmm11
        subps   %xmm3, %xmm12
        subps   %xmm2, %xmm5
        movaps  %xmm5, 32(%rsp)         # 16-byte Spill
        pshufd  $0, %xmm1, %xmm2        # xmm2 = xmm1[0,0,0,0]
        subps   %xmm2, %xmm4
        pshufd  $85, %xmm7, %xmm3       # xmm3 = xmm7[1,1,1,1]
        movdqa  %xmm3, 48(%rsp)         # 16-byte Spill
        movaps  %xmm11, %xmm2
        mulps   %xmm3, %xmm2
        movaps  %xmm3, %xmm6
        pshufd  $-86, %xmm7, %xmm3      # xmm3 = xmm7[2,2,2,2]
        movdqa  %xmm3, 64(%rsp)         # 16-byte Spill
        movaps  %xmm11, %xmm9
        mulps   %xmm3, %xmm9
        movaps  %xmm3, %xmm10
        insertps $16, 20(%rcx), %xmm0
        insertps $32, 24(%rcx), %xmm0
        subps   %xmm1, %xmm0
        pshufd  $85, %xmm0, %xmm3       # xmm3 = xmm0[1,1,1,1]
        movdqa  %xmm3, (%rsp)           # 16-byte Spill
        movdqa  %xmm3, %xmm1
        movdqa  %xmm3, %xmm14
        mulps   %xmm5, %xmm1
        pshufd  $-86, %xmm0, %xmm8      # xmm8 = xmm0[2,2,2,2]
        ...

Even the LDC2 compiler doesn't get anywhere close to such good/efficient usage of SIMD instructions. And ispc is also able to spread the work across multiple cores. I think D is meant to be used for similar numerical code too, so perhaps the small set of ideas contained in this very C-like language is worth stealing and adding to D.

Bye,
bearophile