Building Machine Code at Runtime

CS 301 Lecture, Dr. Lawlor

Here's a little bit of C++ that calls some assembly to do work on floats:

extern "C" float do_work(float x,float y);

int foo(void) {
	float r=do_work(1.7,1000.0);
	std::cout<<"Return value = "<<r<<"\n";
	return 0;
}

(Try this in NetRun now!)

Here's the corresponding assembly:

global do_work
do_work:

addss xmm0,xmm1
ret

(Try this in NetRun now!)

That assembly code is translated into these 5 bytes of machine code:

0000000000000000 <do_work>:
   0:	f3 0f 58 c1          	addss  xmm0,xmm1
   4:	c3                   	ret

Here's an idea: let's skip the assembler, and set up those bytes ourselves!

typedef float (*work_fn)(float x,float y); // function pointer prototype
const char work_mc[]={
	0xf3,0x0f,0x58,0xc1, //	addss  xmm0,xmm1
	0xc3                 //	ret  
};

int foo(void) {
	work_fn f=(work_fn)work_mc; // make machine code array into callable function
	
	float r=f(1.7,1000.0);
	std::cout<<"Return value = "<<r<<"\n";
	return 0;
}

(Try this in NetRun now!)

Note that "work_mc" starts out as just an array of bytes, but we typecast it into a function pointer, and then run those bytes as machine code!

Generalize Across Operations

It's pretty easy to figure out the machine code for all the operations you want. A disassembler makes this easy:

0f 58 c0             	addps  xmm0,xmm0
0f 59 c0             	mulps  xmm0,xmm0
0f 5c c0             	subps  xmm0,xmm0
0f 5e c0             	divps  xmm0,xmm0
0f 5d c0             	minps  xmm0,xmm0
0f 5f c0             	maxps  xmm0,xmm0
0f 50 c0             	movmskps eax,xmm0
0f 51 c0             	sqrtps xmm0,xmm0
0f 52 c0             	rsqrtps xmm0,xmm0
0f 53 c0             	rcpps  xmm0,xmm0
0f 54 c0             	andps  xmm0,xmm0
0f 55 c0             	andnps xmm0,xmm0
0f 56 c0             	orps   xmm0,xmm0
0f 57 c0             	xorps  xmm0,xmm0
0f 28 c0             	movaps xmm0,xmm0

0f 58 c0             	addps  xmm0,xmm0
0f 58 c1             	addps  xmm0,xmm1
0f 58 c2             	addps  xmm0,xmm2
0f 58 c3             	addps  xmm0,xmm3

0f 58 c0             	addps  xmm0,xmm0
0f 58 c8             	addps  xmm1,xmm0
0f 58 d0             	addps  xmm2,xmm0
0f 58 d8             	addps  xmm3,xmm0

f3 0f 58 c0          	addss  xmm0,xmm0
0f 58 c0             	addps  xmm0,xmm0
f2 0f 58 c0          	addsd  xmm0,xmm0
66 0f 58 c0          	addpd  xmm0,xmm0

(Try this in NetRun now!)

This is a pretty simple table, so we can even put together a totally new SSE function from these instructions:

#include <sys/mman.h>

typedef float (*do_work_fn)(float x,float y); 
/* Make this region of memory executable, using "mprotect" */
void executable_region(void *start_ptr,void *end_ptr) {
	long start=(long)start_ptr, end=(long)end_ptr;
	start&=~0xfff; end+=0xfff; end&=~0xfff; /* round to page boundaries */
	if (0!=mprotect((void *)start,end-start,PROT_READ|PROT_WRITE|PROT_EXEC))
		std::cout<<"Unable to mprotect between pointers "<<start<<" and "<<end<<".\n";
}

// Lists x86 SSE instructions' machine code.  Format is F3 0F <this byte> <modrm>
typedef enum {
	addss=0x58, subss=0x5C,
	mulss=0x59, divss=0x5E,
	minss=0x5D, maxss=0x5F,
	sqrtss=0x51, rsqrtss=0x52, rcpss=0x53
} sse_instruction_type;

int foo(void) {
	std::vector<char> mc;
	std::string line;
	while (std::getline(std::cin,line)) {
		int D=line[0]-'0', S=line[3]-'0';
		mc.push_back(0xf3);  // one float (ss)
		mc.push_back(0x0f); // SSE
		switch (line[1]) {
		case '+': mc.push_back(addss); break;
		case '-': mc.push_back(subss); break;
		case '*': mc.push_back(mulss); break;
		case 'm': mc.push_back(minss); break;
		case 'M': mc.push_back(maxss); break;
		default: std::cout<<"OHNO!!!  Unrecognized line "<<line<<"\n";
		}
		mc.push_back(0xC0+(D<<3)+S);   // ModR/M byte (see below) specifying xmmD,xmmS
	}

	mc.push_back(0xc3); // return

	executable_region(&mc[0],&mc[mc.size()-1]);

	do_work_fn f=(do_work_fn)&mc[0];
	float r=f(1.7,10.0);
	std::cout<<"Return value = "<<r<<"\n";
	return 0;
}

(Try this in NetRun now!)

The bottom line is that creating a function's machine code at runtime isn't actually that hard--just really weird!

ModR/M

The last byte of most x86 instructions is called "ModR/M" in the Intel documentation, and it specifies what the source and destination are. It's just a bit field giving a selector called "mod" (which indicates whether r/m is treated as a plain register or a memory address), one register called "reg/opcode" (which is usually the destination register, and determines the column in the ModR/M table), and a final register called "r/m" (usually the source register, which selects the row of the ModR/M table). These are stored in a single byte as shown below.

mod	reg/opcode	r/m
2 bits, selects memory or register access mode: 0: memory at register r/m 1: memory at register r/m+byte offset 2: memory at register r/m + 32-bit offset 3: register r/m itself (not memory)	3 bits, usually a destination register number. For some instructions, this is actually extra opcode bits.	3 bits, usually a source register number. Treated as a pointer for mod!=3, treated as an ordinary register for mod==3. If r/m==4, indicates the real memory source is a SIB byte.

See the "ModR/M" table for meaning of values. ModR/M is much easier to write in octal, not hex, since the 3-bit fields match exactly with octal digits. You can write octal in C/C++ with a leading 0, so "0301" is octal for 011 000 001 (binary) or 0xC1 (hex).

x86 register numbering is about as bizarre as you've come to expect:

Number	0	1	2	3	4	5	6	7
SSE Register	xmm0	xmm1	xmm2	xmm3	xmm4	xmm5	xmm6	xmm7
Int Register	eax	ecx	edx	ebx	esp	ebp	esi	edi

For example, a ModR/M byte like:

ModR/M==0312 (octal again) means mod==3 (so we're only touching registers, not memory), reg/opcode==1 (meaning destination is xmm1 or ecx, register 1), and r/m==2 (meaning the source register is xmm2 or edx, register 2)
ModR/M==0005 (in octal) is composed of mod==0 (so accessing memory), reg/opcode==0 (meaning the normal-destination register is xmm0 or eax, register 0), and r/m==5 (meaning the source is memory pointed to by ebp, register 5).

Interpreters and Dynamic Translation Performance Analysis

People who write assemblers, compilers, and linkers need to know about machine code. But lots of other people do too--people who write fast interpreters.

A typical problem is where we want to do some simple operations depending on the input file, but they're really slow when interpreted normally (read the line of code to interpret, figure out what it's asking, and do it; instead of "just do it"). The solution is to write a version of your interpreter loop that spits out real machine code to solve the problem. If you can make your interpreter use registers (e.g., because you've only got a few variables), you'll get excellent performance--just as good as compiled assembly!

Lots of people do this technique, often called dynamic binary translation:

Java does it to Java bytecode. They call this "Just-In-Time compilation", or JIT.
Microsoft does it to Common Language Runtime bytecode.
Google does it in their fast JavaScript "V8" engine.
Haskell does it in the compiled version of that language.
VMware does dynamic translation to convert ring-0 privileged x86 instructions into ring-3 unprivileged instructions. This is essentially patching the operating system binary at runtime.
Lots of companies do this to run code for one processor on another processor. Apple's done this twice: in 1994 during the transition from 68000 to PowerPC (the "68k emulator"), and again in 2006 during the transition to x86 ("Rosetta").

Here's an example comparing the performance of dynamic binary translation, assembly, and ordinary interpretation for parallel-float SSE instructions. The bottom line is that the dynamic version is about 9x faster than the interpreted version!

#include <sys/mman.h> /* Needed to work around "No eXecute" (NX) heap memory */
#include <xmmintrin.h> /* for SSE parallel float __m128's */

__m128 arr[8]; /* SSE temporary values */

/****************** Assembly support **************/
typedef void (*bar_fn)(void);
extern "C" void bar_asm(void);
bar_fn bar=bar_asm; /* function pointer pointing to current bar routine */

/* GCC-assembly version of bar routine, bar_asm */
__asm__(
".section \".text\"\n"
".globl foo\n"
".type bar_asm,@function\n"
"bar_asm:\n"
"  addps %xmm1,%xmm0\n"
"  mulps %xmm2,%xmm0\n"
"  ret\n"
);

/* Copies values in "arr" into SSE registers, calls bar function pointer, and 
   copies the result in xmm0 back into arr[0]. */
int call_bar(void) {
	__asm__(
	"  mov $arr,%ecx\n" // ecx points to "arr" array
	"  movaps 0x00(%ecx),%xmm0\n" // Fill SSE registers with inputs
	"  movaps 0x10(%ecx),%xmm1\n"
	"  movaps 0x20(%ecx),%xmm2\n"
	"  movaps 0x30(%ecx),%xmm3\n"
	);
	bar(); 
	__asm__(
	"  movaps %xmm0,0x00(%ecx)\n" // Extract outputs
	);
	return 0;
}
/************** Interpreter support ************/
/* One instruction in the bar routine */
class bar_op {
public:
	int i,o; /* input and output registers, 0-7 */
	char op; /* operation to perform (+,-,*, etc.) */
	bar_op(const char *line) { /* HACK: assumes line looks like "r1+=r0..." */
		o=line[1]-'0';
		i=line[5]-'0';
		op=line[2];
	}
};
/* Stored operations to make up the bar routine */
std::vector<bar_op> ops;

/* Interpreted bar routine: switch-and-execute */
int call_interp_bar(void) {
	for (unsigned int i=0;i<ops.size();i++) {
		bar_op &b=ops[i];
		switch(b.op) {
		case '+': arr[b.o]=_mm_add_ps(arr[b.o],arr[b.i]); break;
		case '-': arr[b.o]=_mm_sub_ps(arr[b.o],arr[b.i]); break;
		case '*': arr[b.o]=_mm_mul_ps(arr[b.o],arr[b.i]); break;
		case '/': arr[b.o]=_mm_div_ps(arr[b.o],arr[b.i]); break;
		default: printf("Unknown bar operation!\n"); exit(1); 
		};
	}
	return 0;
}

/* Dynamically assembled x86 bar routine: prepare machine code to run bar */
bar_fn bar_dynamic(void) {
	typedef unsigned char byte;
	std::vector<byte> *out=new std::vector<byte>;  /* stores output machine code */
	for (unsigned int i=0;i<ops.size();i++) {
		out->push_back(0x0F); /* all SSE opcodes start with 0x0F */
		bar_op &b=ops[i];
		switch(b.op) {
		case '+': out->push_back(0x58); break; /* SSE opcode for corresponding operation */
		case '-': out->push_back(0x5C); break;
		case '*': out->push_back(0x59); break;
		case '/': out->push_back(0x5E); break;
		default: printf("Unknown bar operation!\n"); exit(1); 
		};
		/* Prepare ModR/M byte giving input and output registers */
		int mod=3; /* register-register mode (octal 03ds) */
		int destreg=b.o; /* output register */
		int srcreg=b.i; /* source register */
		int ModRM=(mod<<6)+(destreg<<3)+srcreg;
		out->push_back((byte)ModRM);
	}
	out->push_back(0xc3); /* return instruction at end of routine */
/* Work around NX bit, by marking entire page as executable */
	long mc_start=(long)&(*out)[0], mc_end=(long)&(*out)[out->size()-1];
	mc_start&=~0xfff; mc_end=(mc_end+0xfff)&~0xfff;// page size!
	mprotect((void *)mc_start,mc_end-mc_start, 
		PROT_READ+PROT_WRITE+PROT_EXEC);
	return (bar_fn)(&(*out)[0]);  /* typecast bytes to function pointer */
}

int foo(void) {
	int i;
/* Read the user's input bar routine */
	char line[100];
	while (read_string(line)) {
		if (line[0]=='r')
			ops.push_back(bar_op(line));
	}
/* Set up the (hardcoded) inputs */
	const static float in[8][4]={
		{1.0,2.0,3.0,4.0},
		{10.0,20.0,30.0,40.0},
		{100.0,200.0,300.0,400.0}
	};
	float out[4];
	for (i=0;i<8;i++) arr[i]=_mm_loadu_ps(in[i]);
/* Dynamically assemble bar routine, and make a test run */
	bar=bar_dynamic();
	call_bar();
	_mm_storeu_ps(out,arr[0]);
	printf("%f %f %f %f\n",out[0],out[1],out[2],out[3]);

/* Do timings */
	printf("Bar takes about %.2f ns/call in dynamic machine code\n",
		time_function(call_bar)*1.0e9);
	bar=bar_asm;
	printf("Bar takes about %.2f ns/call in real asm\n",
		time_function(call_bar)*1.0e9);
	printf("Bar takes about %.2f ns/call interpreted\n",
		time_function(call_interp_bar)*1.0e9);
	return 0;
}

(Try this in NetRun now!)