Thread overview
struggling with inline assembler
Apr 09, 2015
salsa
Apr 09, 2015
Kai Nacke
Apr 09, 2015
salsa
April 09, 2015
I'm trying to use Intel's AES instruction set for AES encryption. The following piece of code works well with DMD2 but won't compile with LDC. ldc2 tells me this:

Basic Block in function '_D4main48__T21AES_128_KEY_EXPANSIONVAyaa7_656e6372797074Z21AES_128_KEY_EXPANSIONFNaNbNiNexPhPhZv' does not have terminator!
label %endentry
LLVM ERROR: Broken function found, compilation aborted!

Flow control in the asm block might be the problem.

By the way, how could I access arrays (ubyte[]) instead of pointers in inline assembly? Couldn't find a single piece of documentation...
I preferrably omit the use of 'naked' assembler functions. I tried to do it as in biguintx86.d but was confused with the calling conventions. Registers are used in reverse order compared to the C calling convention, aren't they?

******************

module main;

import std.stdio;
import core.cpuid;


void main(string[] args)
{

	assert(sse2 && aes, "hardware does not support sse2 and aes!");


	// test vectors
	immutable ubyte[16] plaintext = cast(const ubyte[])x"6bc1bee22e409f96e93d7e117393172a";
	immutable ubyte[16] ciphertext = cast(const ubyte[])x"3ad77bb40d7a3660a89ecaf32466ef97";
	immutable ubyte[16] userKey = cast(const ubyte[])x"2b7e151628aed2a6abf7158809cf4f3c";

	ubyte[16*11] keySchedule;	// buffer for key schedule
	AES_128_KEY_EXPANSION!"encrypt"(userKey.ptr, keySchedule.ptr);	// initialize encryption key schedule

	ubyte[16] buffer;
	AES_128_ENCRYPT(keySchedule.ptr, plaintext.ptr, buffer.ptr);	// encrypt one 128 bit block


	assert(buffer == ciphertext, "aes encryption failed");
	writeln("200 OK");

}

/// AES128 11 round encryption
/// Params:
/// key = 11*16 byte key schedule
/// plain = 16 bytes plaintext
/// ciphertext = at least 16 bytes output buffer
void AES_128_ENCRYPT(in ubyte* key, in ubyte* plain, ubyte* ciphertext)
in {
	//assert(key.length == 16*ROUNDS, "invalid key size");
	//assert(plain.length == 16, "invalid input block size");
	//assert(ciphertext.length >= 16, "output buffer too small");
}
body {

	asm {
		mov RDX, key;	// pointer to key schedule
		// load key into XMM0-XMM10
		lddqu XMM0, [RDX+0x00];
		lddqu XMM1, [RDX+0x10];
		lddqu XMM2, [RDX+0x20];
		lddqu XMM3, [RDX+0x30];
		lddqu XMM4, [RDX+0x40];
		lddqu XMM5, [RDX+0x50];
		lddqu XMM6, [RDX+0x60];
		lddqu XMM7, [RDX+0x70];
		lddqu XMM8, [RDX+0x80];
		lddqu XMM9, [RDX+0x90];
		lddqu XMM10, [RDX+0xA0];

		// load plaintext into XMM15

		mov RDX, plain;	// pointer to plaintext
		movdqu XMM15, [RDX];	// read plaintext block

		// AES-128 encryption sequence.
		// The data block is in XMM15.
		// Registers XMM0–XMM10 hold the round keys(from 0 to 10 in this order).
		// In the end, XMM15 holds the encryption result.
		pxor XMM15, XMM0; // Whitening step (Round 0)
		aesenc XMM15, XMM1; // Round 1
		aesenc XMM15, XMM2; // Round 2
		aesenc XMM15, XMM3; // Round 3
		aesenc XMM15, XMM4; // Round 4
		aesenc XMM15, XMM5; // Round 5
		aesenc XMM15, XMM6; // Round 6
		aesenc XMM15, XMM7; // Round 7
		aesenc XMM15, XMM8; // Round 8
		aesenc XMM15, XMM9; // Round 9
		aesenclast XMM15, XMM10; // Round 10

		mov RDX, ciphertext;	// pointer to output buffer
		movdqu [RDX], XMM15;	// write processed data to buffer
	}
}


///
/// Expand a 128 bit user key into 11 round keys
///
/// source: http://www.intel.com/content/dam/doc/white-paper/advanced-encryption-standard-new-instructions-set-paper.pdf, Figure 19. AES-128 Key Expansion: Outlined Code Example
///
/// Params:
///
/// decrypt = generate decryption key if set to true. default: false
///
/// userKey = the AES key as given by the user
/// key = 11 round keys
///
///
enum ROUNDS = 11;
@trusted
public void AES_128_KEY_EXPANSION(string mode = "encrypt")(in ubyte* userKey, ubyte* key) nothrow @nogc
	if(mode == "encrypt" || mode == "decrypt")
	in {
		//assertHardwareSupport();
		//assert(userKey.length == 16, "invalid key size");
		//assert(key.length == ROUNDS*16, "invalid key schedule size");
	}
body {
	
	
	asm  {

		mov RDX, userKey; // pointer to user key
		movdqu XMM1, [RDX]; // read user key
		
		xor RCX, RCX; // set index to 0
		
		mov RDX, key;	// pointer to working key
		movdqu [RDX+RCX], XMM1;
		add	RCX, 0x10; // increment by 16 bytes
		
		aeskeygenassist XMM2, XMM1, 0x01;
		call aes_128_assist;
		aeskeygenassist XMM2, XMM1, 0x02;
		call aes_128_assist;
		aeskeygenassist XMM2, XMM1, 0x04;
		call aes_128_assist;
		aeskeygenassist XMM2, XMM1, 0x08;
		call aes_128_assist;
		aeskeygenassist XMM2, XMM1, 0x10;
		call aes_128_assist;
		aeskeygenassist XMM2, XMM1, 0x20;
		call aes_128_assist;
		aeskeygenassist XMM2, XMM1, 0x40;
		call aes_128_assist;
		aeskeygenassist XMM2, XMM1, 0x80;
		call aes_128_assist;
		aeskeygenassist XMM2, XMM1, 0x1b;
		call aes_128_assist;
		aeskeygenassist XMM2, XMM1, 0x36;
		call aes_128_assist;
		
	}
	static if(mode == "decrypt") {
		asm {
			// generate inverse key
			call aesimc128;
		}
	} asm {
		
		jmp END;
		
	aes_128_assist:
		pshufd XMM2, XMM2, 0xff;
		
		//vpslldq XMM3, XMM1, 0x4; // vpslldq requires AVX, pslldq requires only SSE2
		movdqu XMM3, XMM1;
		pslldq XMM3, 0x4;
		
		pxor XMM1, XMM3;
		
		//vpslldq XMM3, XMM1, 0x4;
		movdqu XMM3, XMM1;
		pslldq XMM3, 0x4;
		
		pxor XMM1, XMM3;
		
		//vpslldq XMM3, XMM1, 0x4;
		movdqu XMM3, XMM1;
		pslldq XMM3, 0x4;
		
		pxor XMM1, XMM3;
		pxor XMM1, XMM2;
		
		mov RDX, key;	// pointer to working key
		movdqu [RDX+RCX], XMM1; // store result in keySchedule
		add RCX, 0x10; // increment index by 16 bytes

		ret; // end of key_expansion_128
		
		//
		// do aesimc for all except the first and the last round key
		//
	aesimc128:
		
		mov RDX, key;			// pointer to key output buffer
		add RDX, 0x10;			// dont modify first key
		
		mov RCX, ROUNDS-2;		// set counter to number of rounds - 2
		
	LOOP:
		movdqu XMM1, [RDX];		// load
		aesimc XMM1, XMM1;		// invert
		movdqu [RDX], XMM1;		// store
		
		add RDX, 0x10;			// increment pointer
		
		loop LOOP;				// loop rounds-2 times
		
		ret;
		
		// end aesimc128
		
	END:
		;
	}
}
April 09, 2015
On Thursday, 9 April 2015 at 15:31:34 UTC, salsa wrote:
> I'm trying to use Intel's AES instruction set for AES encryption. The following piece of code works well with DMD2 but won't compile with LDC. ldc2 tells me this:
>
> Basic Block in function '_D4main48__T21AES_128_KEY_EXPANSIONVAyaa7_656e6372797074Z21AES_128_KEY_EXPANSIONFNaNbNiNexPhPhZv' does not have terminator!
> label %endentry
> LLVM ERROR: Broken function found, compilation aborted!
>
> Flow control in the asm block might be the problem.

Hi salsa!

The function has several asm { .. } blocks. Jumping between these blocks is not supported by ldc.

A possible workaround could be to load the target address into a register and do an indirect call. You could also write a mixin for the aes_128_assist and aesimc128 subroutines and replace the calls with the assembler text.

Regards,
Kai
April 09, 2015
Thanks! Replaced the 'static if' with a simple assembler branch.