July 12, 2013
Re: DConf 2013 Day 3 Talk 5: Effective SIMD for modern architectures by Manu Evans
Posted in reply to Manu

Manu:
> This is interesting. I didn't know about this.

I have taken a look at this page:
https://github.com/ispc/ispc

There is a free compiler binary for various operating systems:
http://ispc.github.io/downloads.html

I have tried the Windows compiler on some of the given code examples, and it works! The resulting asm is excellent; normal compilers for the usual languages aren't able to produce asm even nearly as good. To try the example code I compile like this:

ispc.exe --emit-asm stencil.ispc -o stencil.s

Or like this, to see the AVX2 instructions:

ispc.exe --target=avx2 --emit-asm stencil.ispc -o stencil.s

As an example, it compiles a function like this:

static void stencil_step(uniform int x0, uniform int x1,
                         uniform int y0, uniform int y1,
                         uniform int z0, uniform int z1,
                         uniform int Nx, uniform int Ny, uniform int Nz,
                         uniform const float coef[4], uniform const float vsq[],
                         uniform const float Ain[], uniform float Aout[]) {
    const uniform int Nxy = Nx * Ny;

    foreach (z = z0 ... z1, y = y0 ... y1, x = x0 ... x1) {
        int index = (z * Nxy) + (y * Nx) + x;
#define A_cur(x, y, z) Ain[index + (x) + ((y) * Nx) + ((z) * Nxy)]
#define A_next(x, y, z) Aout[index + (x) + ((y) * Nx) + ((z) * Nxy)]
        float div = coef[0] * A_cur(0, 0, 0) +
            coef[1] * (A_cur(+1, 0, 0) + A_cur(-1, 0, 0) +
                       A_cur(0, +1, 0) + A_cur(0, -1, 0) +
                       A_cur(0, 0, +1) + A_cur(0, 0, -1)) +
            coef[2] * (A_cur(+2, 0, 0) + A_cur(-2, 0, 0) +
                       A_cur(0, +2, 0) + A_cur(0, -2, 0) +
                       A_cur(0, 0, +2) + A_cur(0, 0, -2)) +
            coef[3] * (A_cur(+3, 0, 0) + A_cur(-3, 0, 0) +
                       A_cur(0, +3, 0) + A_cur(0, -3, 0) +
                       A_cur(0, 0, +3) + A_cur(0, 0, -3));

        A_next(0, 0, 0) = 2 * A_cur(0, 0, 0) - A_next(0, 0, 0) +
            vsq[index] * div;
    }
}
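A quick note for readers who haven't seen ispc before: as far as I understand from its docs, "uniform" values are scalars shared by all SIMD lanes of the gang, everything else is "varying" (one value per lane), and foreach spreads the iteration space across the lanes. The stencil_step function above is static, so in a real program it would be reached through an "export" function that C/C++ code can call with plain C linkage. Here is a minimal sketch of that model; the file and function names are made up by me, it's not code from the ispc distribution:

// scale.ispc -- hypothetical minimal example, compiled e.g. with:
//   ispc.exe --target=avx2 --emit-asm scale.ispc -o scale.s
// "export" gives the function C linkage, so C/C++ code can call it directly.
export void scale(uniform float vout[], uniform const float vin[],
                  uniform int count, uniform float k) {
    // count and k are uniform: one scalar value shared by all lanes.
    foreach (i = 0 ... count) {
        // i is varying: each lane gets a different index, so one
        // iteration of this loop body processes a whole vector.
        vout[i] = vin[i] * k;
    }
}

This is essentially the same model as the stencil code above, just without the 3D index arithmetic.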
The stencil function above compiles to asm like this (using SSE4):

        pshufd  $0, %xmm7, %xmm9        # xmm9 = xmm7[0,0,0,0]
        cmpl    156(%rsp), %r8d         # 4-byte Folded Reload
        movups  (%r14,%rdi), %xmm0
        mulps   %xmm6, %xmm5
        pshufd  $0, %xmm2, %xmm2        # xmm2 = xmm2[0,0,0,0]
        movq    424(%rsp), %rdx
        movups  (%rdx,%r15), %xmm6
        mulps   %xmm3, %xmm2
        pshufd  $0, %xmm10, %xmm3       # xmm3 = xmm10[0,0,0,0]
        mulps   %xmm11, %xmm3
        addps   %xmm2, %xmm3
        addps   %xmm5, %xmm3
        addps   %xmm4, %xmm0
        movslq  %eax, %rax
        movups  (%r14,%rax), %xmm2
        addps   %xmm0, %xmm2
        movslq  %ebp, %rax
        movups  (%r14,%rax), %xmm0
        addps   %xmm2, %xmm0
        mulps   %xmm9, %xmm0
        addps   %xmm3, %xmm0
        mulps   %xmm6, %xmm0
        addps   %xmm1, %xmm0
        movups  %xmm0, (%r11,%r15)
        jl      .LBB0_5
.LBB0_6:                                # %partial_inner_all_outer
                                        # in Loop: Header=BB0_4 Depth=2
        movq    %r13, %rbp
        movl    156(%rsp), %r13d        # 4-byte Reload
        movq    424(%rsp), %r9
        cmpl    64(%rsp), %r8d          # 4-byte Folded Reload
        movq    %r11, %rbx
        jge     .LBB0_257
# BB#7:                                 # %partial_inner_only
                                        # in Loop: Header=BB0_4 Depth=2
        movd    %r8d, %xmm0
        movl    84(%rsp), %r15d         # 4-byte Reload
        imull   400(%rsp), %r15d
        pshufd  $0, %xmm0, %xmm0        # xmm0 = xmm0[0,0,0,0]
        paddd   .LCPI0_1(%rip), %xmm0
        movdqa  %xmm8, %xmm1
        pcmpgtd %xmm0, %xmm1
        movmskps %xmm1, %edi
        addl    68(%rsp), %r15d         # 4-byte Folded Reload
        leal    (%r8,%r15), %r10d

Or using AVX2 (this is not exactly the equivalent piece of code; the asm generated with AVX2 is usually significantly shorter):

        movslq  %edx, %rdx
        vaddps  (%rax,%rdx), %ymm5, %ymm5
        movl    52(%rsp), %edx          # 4-byte Reload
        leal    (%rdx,%r15), %edx
        movslq  %edx, %rdx
        vaddps  (%rax,%rdx), %ymm5, %ymm5
        vaddps  %ymm11, %ymm6, %ymm9
        vaddps  %ymm10, %ymm8, %ymm10
        vmovups (%rax,%rdi), %ymm12
        addl    $32, %r15d
        vmovups (%r11,%rcx), %ymm6
        movq    400(%rsp), %rdx
        vbroadcastss 12(%rdx), %ymm7
        vbroadcastss 8(%rdx), %ymm8
        vbroadcastss (%rdx), %ymm11
        vaddps  %ymm12, %ymm10, %ymm10
        cmpl    48(%rsp), %r14d         # 4-byte Folded Reload
        vbroadcastss 4(%rdx), %ymm12
        vmulps  %ymm9, %ymm12, %ymm9
        vfmadd213ps %ymm9, %ymm3, %ymm11
        vfmadd213ps %ymm11, %ymm8, %ymm10
        vfmadd213ps %ymm10, %ymm5, %ymm7
        vfmadd213ps %ymm4, %ymm6, %ymm7
        vmovups %ymm7, (%r8,%rcx)
        jl      .LBB0_8
# BB#4:                                 # %partial_inner_all_outer.us
                                        # in Loop: Header=BB0_7 Depth=2
        cmpl    40(%rsp), %r14d         # 4-byte Folded Reload
        movq    400(%rsp), %r10
        jge     .LBB0_6
# BB#5:                                 # %partial_inner_only.us
                                        # in Loop: Header=BB0_7 Depth=2
        vmovd   %r14d, %xmm3
        vbroadcastss %xmm3, %ymm3
        movl    100(%rsp), %ecx         # 4-byte Reload
        leal    (%rcx,%r15), %ecx
        vpaddd  %ymm2, %ymm3, %ymm3

Sometimes it gives performance warnings:

rt.ispc:257:9: Performance Warning: Scatter required to store value.
        image[offset] = ray.maxt;
        ^^^^^^^^^^^^^
rt.ispc:258:9: Performance Warning: Scatter required to store value.
        id[offset] = ray.hitId;
        ^^^^^^^^^^
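Those scatter warnings appear because the stored-to address is computed per lane (offset differs from one ray to the next), so the compiler can't use a single contiguous vector store. A tiny sketch of the difference, with made-up names (my own example, not code from the ray tracer):

// stores.ispc -- hypothetical sketch: a store at the foreach index stays a
// single packed vector store, while a store through a per-lane index needs
// a scatter (and the load of remap[i] needs a gather).
export void stores(uniform float a[], uniform float b[],
                   uniform const int remap[], uniform int count) {
    foreach (i = 0 ... count) {
        a[i] = (float)i;           // contiguous: one vector store
        b[remap[i]] = (float)i;    // indices vary per lane: scatter
    }
}

So the warning is a hint that the data layout (or the loop) may be worth restructuring if that store is on a hot path.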
A bit larger example, with this function from a little ray tracer:

static bool TriIntersect(const uniform Triangle &tri, Ray &ray) {
    uniform float3 p0 = { tri.p[0][0], tri.p[0][1], tri.p[0][2] };
    uniform float3 p1 = { tri.p[1][0], tri.p[1][1], tri.p[1][2] };
    uniform float3 p2 = { tri.p[2][0], tri.p[2][1], tri.p[2][2] };
    uniform float3 e1 = p1 - p0;
    uniform float3 e2 = p2 - p0;

    float3 s1 = Cross(ray.dir, e2);
    float divisor = Dot(s1, e1);
    bool hit = true;

    if (divisor == 0.)
        hit = false;
    float invDivisor = 1.f / divisor;

    // Compute first barycentric coordinate
    float3 d = ray.origin - p0;
    float b1 = Dot(d, s1) * invDivisor;
    if (b1 < 0. || b1 > 1.)
        hit = false;

    // Compute second barycentric coordinate
    float3 s2 = Cross(d, e1);
    float b2 = Dot(ray.dir, s2) * invDivisor;
    if (b2 < 0. || b1 + b2 > 1.)
        hit = false;

    // Compute _t_ to intersection point
    float t = Dot(e2, s2) * invDivisor;
    if (t < ray.mint || t > ray.maxt)
        hit = false;

    if (hit) {
        ray.maxt = t;
        ray.hitId = tri.id;
    }
    return hit;
}

The (more or less) complete asm with AVX2 for that function:

"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]": # @"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]"
# BB#0:                                 # %allocas
        subq    $248, %rsp
        vmovaps %xmm15, 80(%rsp)        # 16-byte Spill
        vmovaps %xmm14, 96(%rsp)        # 16-byte Spill
        vmovaps %xmm13, 112(%rsp)       # 16-byte Spill
        vmovaps %xmm12, 128(%rsp)       # 16-byte Spill
        vmovaps %xmm11, 144(%rsp)       # 16-byte Spill
        vmovaps %xmm10, 160(%rsp)       # 16-byte Spill
        vmovaps %xmm9, 176(%rsp)        # 16-byte Spill
        vmovaps %xmm8, 192(%rsp)        # 16-byte Spill
        vmovaps %xmm7, 208(%rsp)        # 16-byte Spill
        vmovaps %xmm6, 224(%rsp)        # 16-byte Spill
        vmovss  (%rcx), %xmm0
        vmovss  16(%rcx), %xmm2
        vinsertps $16, 4(%rcx), %xmm0, %xmm0
        vinsertps $32, 8(%rcx), %xmm0, %xmm0
        vinsertf128 $1, %xmm0, %ymm0, %ymm10
        vmovaps (%rdx), %ymm0
        vmovaps 32(%rdx), %ymm3
        vmovaps 64(%rdx), %ymm1
        vmovaps 96(%rdx), %ymm9
        vpbroadcastd .LCPI1_0(%rip), %ymm12
        vxorps  %ymm4, %ymm4, %ymm4
        vpermps %ymm10, %ymm4, %ymm4
        vsubps  %ymm4, %ymm0, %ymm0
        vpermps %ymm10, %ymm12, %ymm4
        vsubps  %ymm4, %ymm3, %ymm13
        vmovups %ymm13, (%rsp)          # 32-byte Folded Spill
        vpbroadcastd .LCPI1_1(%rip), %ymm8
        vpermps %ymm10, %ymm8, %ymm3
        vsubps  %ymm3, %ymm1, %ymm6
        vmovss  32(%rcx), %xmm1
        vinsertps $16, 36(%rcx), %xmm1, %xmm1
        vinsertps $32, 40(%rcx), %xmm1, %xmm1
        vinsertf128 $1, %xmm0, %ymm1, %ymm1
        vsubps  %ymm10, %ymm1, %ymm15
        vbroadcastss %xmm15, %ymm3
        vmovups %ymm3, 32(%rsp)         # 32-byte Folded Spill
        vmovaps 128(%rdx), %ymm11
        vpermps %ymm15, %ymm12, %ymm5
        vmulps  %ymm3, %ymm11, %ymm1
        vmovaps %ymm3, %ymm4
        vmovaps %ymm5, %ymm7
        vfmsub213ps %ymm1, %ymm9, %ymm7
        vinsertps $16, 20(%rcx), %xmm2, %xmm1
        vinsertps $32, 24(%rcx), %xmm1, %xmm1
        vinsertf128 $1, %xmm0, %ymm1, %ymm1
        vsubps  %ymm10, %ymm1, %ymm1
        vpermps %ymm1, %ymm12, %ymm14
        vpermps %ymm1, %ymm8, %ymm12
        vmulps  %ymm6, %ymm14, %ymm3
        vmovaps %ymm13, %ymm2
        vfmsub213ps %ymm3, %ymm12, %ymm2
        vbroadcastss %xmm1, %ymm1
        vmulps  %ymm0, %ymm12, %ymm3
        vmovaps %ymm6, %ymm13
        vfmsub213ps %ymm3, %ymm1, %ymm13
        vmulps  %ymm13, %ymm11, %ymm3
        vmovaps %ymm2, %ymm10
        vfmadd213ps %ymm3, %ymm9, %ymm10
        vpermps %ymm15, %ymm8, %ymm8
        vmulps  %ymm8, %ymm9, %ymm3
        vmovaps 160(%rdx), %ymm9
        vmulps  %ymm5, %ymm9, %ymm15
        vfmsub213ps %ymm15, %ymm8, %ymm11
        vmovaps %ymm4, %ymm15
        vfmsub213ps %ymm3, %ymm9, %ymm15
        vmulps  %ymm15, %ymm14, %ymm4
        vmovaps %ymm11, %ymm3
        vfmadd213ps %ymm4, %ymm1, %ymm3
        vfmadd213ps %ymm3, %ymm7, %ymm12
        vmovups (%rsp), %ymm4           # 32-byte Folded Reload
        vmulps  %ymm4, %ymm15, %ymm3
        vfmadd213ps %ymm3, %ymm0, %ymm11
        vfmadd213ps %ymm11, %ymm6, %ymm7
        vmulps  %ymm4, %ymm1, %ymm1
        vfmsub213ps %ymm1, %ymm14, %ymm0
        vbroadcastss .LCPI1_2(%rip), %ymm1
        vrcpps  %ymm12, %ymm3
        vmovaps %ymm12, %ymm4
        vfnmadd213ps %ymm1, %ymm3, %ymm4
        vmulps  %ymm4, %ymm3, %ymm4
        vxorps  %ymm14, %ymm14, %ymm14
        vcmpeqps %ymm14, %ymm12, %ymm1
        vcmpunordps %ymm14, %ymm12, %ymm3
        vorps   %ymm1, %ymm3, %ymm11
        vmulps  %ymm13, %ymm5, %ymm1
        vmulps  %ymm4, %ymm7, %ymm5
        vmovups 32(%rsp), %ymm3         # 32-byte Folded Reload
        vfmadd213ps %ymm1, %ymm3, %ymm2
        vbroadcastss .LCPI1_3(%rip), %ymm1
        vcmpnleps %ymm1, %ymm5, %ymm3
        vcmpnleps %ymm5, %ymm14, %ymm6
        vorps   %ymm3, %ymm6, %ymm6
        vbroadcastss .LCPI1_0(%rip), %ymm3
        vblendvps %ymm11, %ymm14, %ymm3, %ymm7
        vfmadd213ps %ymm10, %ymm0, %ymm9
        vmovaps (%r8), %ymm3
        vmovmskps %ymm3, %eax
        leaq    352(%rdx), %r8
        cmpl    $255, %eax
        vmulps  %ymm9, %ymm4, %ymm9
        vaddps  %ymm9, %ymm5, %ymm10
        vmovups 320(%rdx), %ymm5
        vfmadd213ps %ymm2, %ymm8, %ymm0
        vblendvps %ymm6, %ymm14, %ymm7, %ymm6
        vpcmpeqd %ymm2, %ymm2, %ymm2
        vcmpnleps %ymm1, %ymm10, %ymm1
        vcmpnleps %ymm9, %ymm14, %ymm7
        vorps   %ymm1, %ymm7, %ymm1
        vblendvps %ymm1, %ymm14, %ymm6, %ymm6
        vmulps  %ymm0, %ymm4, %ymm1
        vcmpnleps 352(%rdx), %ymm1, %ymm0
        vcmpnleps %ymm1, %ymm5, %ymm4
        vorps   %ymm0, %ymm4, %ymm0
        vblendvps %ymm0, %ymm14, %ymm6, %ymm0
        vpcmpeqd %ymm14, %ymm0, %ymm4
        vpxor   %ymm2, %ymm4, %ymm2
        je      .LBB1_1
# BB#4:                                 # %some_on
        vpand   %ymm3, %ymm2, %ymm2
.LBB1_1:                                # %all_on
        vmovmskps %ymm2, %eax
        testl   %eax, %eax
        je      .LBB1_3
# BB#2:                                 # %safe_if_run_true356
        vmaskmovps %ymm1, %ymm2, (%r8)
        vpbroadcastd 48(%rcx), %ymm1
        vmaskmovps %ymm1, %ymm2, 384(%rdx)
.LBB1_3:                                # %safe_if_after_true
        vmovaps 224(%rsp), %xmm6        # 16-byte Reload
        vmovaps 208(%rsp), %xmm7        # 16-byte Reload
        vmovaps 192(%rsp), %xmm8        # 16-byte Reload
        vmovaps 176(%rsp), %xmm9        # 16-byte Reload
        vmovaps 160(%rsp), %xmm10       # 16-byte Reload
        vmovaps 144(%rsp), %xmm11       # 16-byte Reload
        vmovaps 128(%rsp), %xmm12       # 16-byte Reload
        vmovaps 112(%rsp), %xmm13       # 16-byte Reload
        vmovaps 96(%rsp), %xmm14        # 16-byte Reload
        vmovaps 80(%rsp), %xmm15        # 16-byte Reload
        addq    $248, %rsp
        ret
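Note how the if statements of TriIntersect don't turn into per-lane branches: ispc keeps an execution mask, the comparisons become the vcmpnleps/vblendvps sequences above, and the final "if (hit)" writes to ray.maxt and ray.hitId come out as masked stores (the vmaskmovps instructions), skipped entirely via vmovmskps/testl when no lane hit. A minimal sketch of the same per-lane control-flow pattern, with a made-up function name (not from the ray tracer):

// clamp.ispc -- hypothetical sketch of per-lane control flow: lanes that
// fail the test are masked off, and the assignment becomes a masked/blended
// vector store instead of a scalar branch.
export void clamp_negative(uniform float a[], uniform int count) {
    foreach (i = 0 ... count) {
        if (a[i] < 0.0f)    // per-lane condition -> execution mask
            a[i] = 0.0f;    // store happens only for the active lanes
    }
}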
Using SSE4, the asm of the same function starts like this:

"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]": # @"TriIntersect___REFs[_c_unTriangle]REFs[vyRay]"
# BB#0:                                 # %allocas
        subq    $248, %rsp
        movaps  %xmm15, 80(%rsp)        # 16-byte Spill
        movaps  %xmm14, 96(%rsp)        # 16-byte Spill
        movaps  %xmm13, 112(%rsp)       # 16-byte Spill
        movaps  %xmm12, 128(%rsp)       # 16-byte Spill
        movaps  %xmm11, 144(%rsp)       # 16-byte Spill
        movaps  %xmm10, 160(%rsp)       # 16-byte Spill
        movaps  %xmm9, 176(%rsp)        # 16-byte Spill
        movaps  %xmm8, 192(%rsp)        # 16-byte Spill
        movaps  %xmm7, 208(%rsp)        # 16-byte Spill
        movaps  %xmm6, 224(%rsp)        # 16-byte Spill
        movss   (%rcx), %xmm1
        movss   16(%rcx), %xmm0
        insertps $16, 4(%rcx), %xmm1
        insertps $32, 8(%rcx), %xmm1
        movss   32(%rcx), %xmm7
        insertps $16, 36(%rcx), %xmm7
        insertps $32, 40(%rcx), %xmm7
        subps   %xmm1, %xmm7
        pshufd  $-86, %xmm1, %xmm2      # xmm2 = xmm1[2,2,2,2]
        pshufd  $85, %xmm1, %xmm3       # xmm3 = xmm1[1,1,1,1]
        movaps  (%rdx), %xmm4
        movaps  16(%rdx), %xmm12
        movaps  32(%rdx), %xmm5
        movaps  48(%rdx), %xmm11
        subps   %xmm3, %xmm12
        subps   %xmm2, %xmm5
        movaps  %xmm5, 32(%rsp)         # 16-byte Spill
        pshufd  $0, %xmm1, %xmm2        # xmm2 = xmm1[0,0,0,0]
        subps   %xmm2, %xmm4
        pshufd  $85, %xmm7, %xmm3       # xmm3 = xmm7[1,1,1,1]
        movdqa  %xmm3, 48(%rsp)         # 16-byte Spill
        movaps  %xmm11, %xmm2
        mulps   %xmm3, %xmm2
        movaps  %xmm3, %xmm6
        pshufd  $-86, %xmm7, %xmm3      # xmm3 = xmm7[2,2,2,2]
        movdqa  %xmm3, 64(%rsp)         # 16-byte Spill
        movaps  %xmm11, %xmm9
        mulps   %xmm3, %xmm9
        movaps  %xmm3, %xmm10
        insertps $16, 20(%rcx), %xmm0
        insertps $32, 24(%rcx), %xmm0
        subps   %xmm1, %xmm0
        pshufd  $85, %xmm0, %xmm3       # xmm3 = xmm0[1,1,1,1]
        movdqa  %xmm3, (%rsp)           # 16-byte Spill
        movdqa  %xmm3, %xmm1
        movdqa  %xmm3, %xmm14
        mulps   %xmm5, %xmm1
        pshufd  $-86, %xmm0, %xmm8      # xmm8 = xmm0[2,2,2,2]
        ...

Even the LDC2 compiler doesn't get anywhere close to such good/efficient usage of SIMD instructions. And ispc is also able to spread the work across multiple cores. I think D is meant to be used for similar numerical code too, so perhaps the small set of ideas contained in this very C-like language is worth stealing and adding to D.

Bye,
bearophile