August 20, 2018
Mini numerical optimizer in D:
https://github.com/S6Regen/Dopt
I also have this Walsh–Hadamard transform code in D for Linux on AMD64:

// Linux AMD64
/**
 * Runs a scaled Walsh-Hadamard butterfly network, in place, over each
 * consecutive group of 16 floats starting at x.
 *
 * Written against the System V AMD64 calling convention (Linux): the body
 * reads x from RDI, n from RSI and scale from the low lane of XMM0.
 *
 * Params:
 *   x     = first float to transform; accessed with unaligned loads/stores
 *   n     = number of floats to process. Must be a non-zero multiple of 16:
 *           the loop exits only when the counter reaches exactly zero.
 *   scale = factor every output value is multiplied by before it is stored
 *
 * Overwrites RSI, RDI, XMM0-XMM6 and the flags.
 */
extern(C) void hsixteen(float* x,ulong n,float scale){
	asm{
		naked; // no compiler-generated prologue/epilogue; the ret below returns to the caller
		shufps XMM0,XMM0,0; // broadcast scale into all four lanes of XMM0
		align 16;
h16:
	// One iteration handles 16 floats (64 bytes).
	// The flags written by this sub are consumed by the jnz at the bottom;
	// none of the instructions in between writes the flags.
	sub RSI,16;
	// XMM1..XMM4 = x[0..3], x[4..7], x[8..11], x[12..15]
	movups XMM1,[RDI];
	movups XMM2,[RDI+16];
	movups XMM3,[RDI+2*16];
	movups XMM4,[RDI+3*16];
	// Round 1: horizontal sums end up in XMM1/XMM3,
	// horizontal differences in XMM5/XMM6.
	movups XMM5,XMM1;
	movups XMM6,XMM3;
	haddps XMM1,XMM2;
	haddps XMM3,XMM4;
	hsubps XMM5,XMM2;
	hsubps XMM6,XMM4;
	// Round 2: sums in XMM1/XMM3, differences in XMM2/XMM4.
	movups XMM2,XMM1;
	movups XMM4,XMM3;
	haddps XMM1,XMM5;
	haddps XMM3,XMM6;
	hsubps XMM2,XMM5;
	hsubps XMM4,XMM6;
	// Round 3: sums in XMM1/XMM3, differences in XMM5/XMM6.
	movups XMM5,XMM1;
	movups XMM6,XMM3;
	haddps XMM1,XMM2;
	haddps XMM3,XMM4;
	hsubps XMM5,XMM2;
	hsubps XMM6,XMM4;
	// Round 4: lane-wise add/sub that combines the first 8 floats
	// (XMM1, XMM5) with the second 8 floats (XMM3, XMM6).
	movups XMM2,XMM1;
	movups XMM4,XMM5;
	addps XMM1,XMM3;
	addps XMM5,XMM6;
	subps XMM2,XMM3;
	subps XMM4,XMM6;
	// Apply the broadcast scale factor to all 16 results.
	mulps XMM1,XMM0;
	mulps XMM5,XMM0;
	mulps XMM2,XMM0;
	mulps XMM4,XMM0;
	// Write back over the input; note the register order XMM1, XMM5, XMM2, XMM4.
	movups [RDI],XMM1;
	movups [RDI+16],XMM5;
	movups [RDI+2*16],XMM2;
	movups [RDI+3*16],XMM4;
	lea RDI,[RDI+64]; // advance to the next 16 floats; lea leaves the flags from sub intact
	jnz h16; // repeat until RSI has counted down to exactly zero
	ret;
	}
}

/**
 * In-place add/subtract butterflies between floats that lie gap elements
 * apart: for every pair (a, b) where b is stored gap floats after a, the sum
 * a+b is written over a and the difference a-b is written over b.
 *
 * The data is walked in chunks of 2*gap floats (gap "low" values followed by
 * gap "high" values) until n floats have been covered.
 *
 * Written against the System V AMD64 calling convention (Linux): the body
 * reads x from RDI, gap from RSI and n from RDX.
 *
 * Params:
 *   x   = first float of the region; accessed with unaligned loads/stores
 *   gap = distance in floats between the two members of a pair. Must be a
 *         non-zero multiple of 16: the inner loop counts down by 16 and
 *         exits only at exactly zero.
 *   n   = number of floats in the region. n/2 must be a non-zero multiple of
 *         gap: the outer loop subtracts gap from n/2 and exits only at
 *         exactly zero.
 *
 * Overwrites RCX, RDX, RDI, R8, XMM0-XMM11 and the flags.
 */
extern(C) void hgap(float* x,ulong gap,ulong n){
	asm{
		naked; // no compiler-generated prologue/epilogue; the ret below returns to the caller
		mov RCX,RSI; // RCX = floats left in the current low half (starts at gap)
		lea R8,[RDI+4*RSI]; // R8 = matching high half, gap floats (4*gap bytes) after RDI
		shr RDX,1; // RDX = n/2 = number of pairs still to process
		align 16;	
	hgaploop:
		// Each pass handles 16 pairs. The flags written by this sub are
		// consumed by the first jnz below; nothing in between writes them.
		sub RCX,16;
		// XMM0-XMM3 = 16 floats of the low half, XMM8-XMM11 = 16 of the high half.
		movups XMM0,[RDI];
		movups XMM1,[RDI+16];
		movups XMM2,[RDI+2*16];
		movups XMM3,[RDI+3*16];
		movups XMM8,[R8];
		movups XMM9,[R8+16];
		movups XMM10,[R8+2*16];
		movups XMM11,[R8+3*16];
		// Keep a copy of the low values so both sum and difference can be formed.
		movups XMM4,XMM0;
		movups XMM5,XMM1;
		movups XMM6,XMM2;
		movups XMM7,XMM3;
		// low + high
		addps XMM0,XMM8;
		addps XMM1,XMM9;
		addps XMM2,XMM10;
		addps XMM3,XMM11;
		// low - high
		subps XMM4,XMM8;
		subps XMM5,XMM9;
		subps XMM6,XMM10;
		subps XMM7,XMM11;
		// Sums replace the low half, differences replace the high half.
		movups [RDI],XMM0;
		movups [RDI+16],XMM1;
		movups [RDI+2*16],XMM2;
		movups [RDI+3*16],XMM3;
		movups [R8],XMM4;
		movups [R8+16],XMM5;
		movups [R8+2*16],XMM6;
		movups [R8+3*16],XMM7;
		// Advance both halves by 16 floats; lea leaves the flags from sub RCX intact.
		lea RDI,[RDI+64];
		lea R8,[R8+64];
		jnz hgaploop; // stay in the current chunk until RCX reaches exactly zero
		// Chunk finished: gap pairs done. The flags from this sub drive the
		// final jnz; the mov/lea instructions in between do not write them.
		sub RDX,RSI;
		mov RCX,RSI; // reset the per-chunk counter to gap
		mov RDI,R8; // R8 now points just past the finished chunk: next low half
		lea R8,[R8+4*RSI]; // next high half, gap floats further on
		jnz hgaploop; // continue until all n/2 pairs have been processed
		ret;
	}
}
/**
 * In-place Walsh-Hadamard transform of vec, scaled by 1/sqrt(vec.length).
 *
 * The work is done in two phases:
 *   1. vec is cut into blocks of at most `lim` floats; inside each block
 *      hsixteen performs the first stages on groups of 16 and hgap performs
 *      the remaining stages with gap = 16, 32, ... up to the block size.
 *   2. the stages whose gap is at least the block size are run over the
 *      whole array.
 *
 * Params:
 *   vec = data to transform. An empty slice is left untouched. Otherwise the
 *         length must be a power of two and at least 16, because the
 *         assembly kernels only stop when their counters hit exactly zero
 *         and would run past the end of the slice for any other length.
 *
 * Throws: Exception if vec.length is non-zero and not a power of two >= 16.
 */
void wht(float[] vec){
	const ulong lim=8192;
	const ulong n=vec.length;
	// Nothing to transform.
	if(n==0) return;
	// hsixteen consumes 16 floats per step and hgap needs n/2 to be a
	// multiple of every gap used, so reject lengths the kernels cannot
	// terminate on instead of letting them overrun the buffer.
	if(n<16 || (n&(n-1))!=0)
		throw new Exception("wht: vec.length must be a power of two >= 16");
	const float scale=1f/sqrt(to!float(n));
	// Block size for phase 1: the whole array if it is smaller than lim.
	const ulong k=n>lim?lim:n;
	// Phase 1: complete all stages that fit inside one block.
	for(ulong i=0;i<n;i+=lim){
		hsixteen(&vec[i],k,scale);
		for(ulong gap=16;gap<k;gap+=gap)
			hgap(&vec[i],gap,k);
	}
	// Phase 2: remaining stages span several blocks; phase 1 ended at gap == k.
	for(ulong gap=k;gap<n;gap+=gap)
		hgap(&vec[0],gap,n);
}

As I like to say, it is the simplest yet least-known algorithm in computer science.
It actually has many uses, e.g.:
https://github.com/FALCONN-LIB/FFHT