switch (the_next_instruction[instruction_counter++]) {How can the branch predictor do its job, when the code itself doesn't know what instruction is coming next? It can't, and so the pipeline has to be flushed and re-flushed, and interpreter performance is quite bad.
case instruction_1: ... do instruction 1 ... break;
case instruction_2: ... and so on ...
}
extern "C" float do_work(float x,float y);Here's the corresponding assembly:
int foo(void) {
float r=do_work(1.7,1000.0);
std::cout<<"Return value = "<<r<<"\n";
return 0;
}
global do_work
do_work:
addss xmm0,xmm1
ret
That assembly code is translated into these 5 bytes of machine code:
0000000000000000 <do_work>:
0: f3 0f 58 c1 addss xmm0,xmm1
4: c3 ret
Here's an idea: let's skip the assembler, and set up those bytes ourselves!
typedef float (*work_fn)(float x,float y); // function pointer prototype
const char work_mc[]={
0xf3,0x0f,0x58,0xc1, // addss xmm0,xmm1
0xc3 // ret
};
int foo(void) {
work_fn f=(work_fn)work_mc; // make machine code array into callable function
float r=f(1.7,1000.0);
std::cout<<"Return value = "<<r<<"\n";
return 0;
}
Note that "work_mc" starts out as just an array of bytes, but we
typecast it into a function pointer, and then run those bytes as
machine code!
0f 58 c0 addps xmm0,xmm0 |
#include <sys/mman.h>
typedef float (*do_work_fn)(float x,float y);
/* Make this region of memory executable, using "mprotect" */
void executable_region(void *start_ptr,void *end_ptr) {
long start=(long)start_ptr, end=(long)end_ptr;
start&=~0xfff; end+=0xfff; end&=~0xfff; /* round to page boundaries */
if (0!=mprotect((void *)start,end-start,PROT_READ|PROT_WRITE|PROT_EXEC))
std::cout<<"Unable to mprotect between pointers "<<start<<" and "<<end<<".\n";
}
// Lists x86 SSE instructions' machine code. Format is F3 0F <this byte> <modrm>
typedef enum {
addss=0x58, subss=0x5C,
mulss=0x59, divss=0x5E,
minss=0x5D, maxss=0x5F,
sqrtss=0x51, rsqrtss=0x52, rcpss=0x53
} sse_instruction_type;
int foo(void) {
std::vector<char> mc;
std::string line;
while (std::getline(std::cin,line)) {
int D=line[0]-'0', S=line[3]-'0';
mc.push_back(0xf3); // one float (ss)
mc.push_back(0x0f); // SSE
switch (line[1]) {
case '+': mc.push_back(addss); break;
case '-': mc.push_back(subss); break;
case '*': mc.push_back(mulss); break;
case 'm': mc.push_back(minss); break;
case 'M': mc.push_back(maxss); break;
default: std::cout<<"OHNO!!! Unrecognized line "<<line<<"\n";
}
mc.push_back(0xC0+(D<<3)+S); // xmmD,xmmS
}
mc.push_back(0xc3); // return
executable_region(&mc[0],&mc[mc.size()-1]);
do_work_fn f=(do_work_fn)&mc[0];
float r=f(1.7,10.0);
std::cout<<"Return value = "<<r<<"\n";
return 0;
}
Here's a more object-oriented example program:
typedef float (*work_fn)(float x,float y); // function pointer prototype
// Lists x86 SSE instructions' machine code. Format is F3 0F <this byte> <modrm>
typedef enum {
addss=0x58, subss=0x5C,
mulss=0x59, divss=0x5E,
minss=0x5D, maxss=0x5F,
sqrtss=0x51, rsqrtss=0x52, rcpss=0x53
} sse_instruction_type;
// Creates a new machine code function at runtime, instruction by instruction
class function_maker {
public:
function_maker() {has_ret=false;}
// Add this instruction to our function. src and dest are register numbers
void instruction(int dest,sse_instruction_type opcode,int src) {
mc.push_back(0xF3); mc.push_back(0x0F); // SSE-float prefix bytes
mc.push_back(opcode);
int modrm=(3<<6) | (dest<<3) | (src);
mc.push_back(modrm);
}
// Run the new function
float run(float x,float y) {
if (!has_ret) { // we need to add a return instruction before running
has_ret=true;
mc.push_back(0xc3); /* "ret" instruction */
}
unsigned char *mc_data=&mc[0]; // get a pointer to machine code
work_fn f=(work_fn)mc_data; // make executable
return f(x,y); // run the new function
}
private:
std::vector<unsigned char> mc; /* new function's machine code */
bool has_ret; /* did we add the "ret" instruction yet? */
};
int foo(void) {
function_maker f;
f.instruction(0,addss,1);
float x=f.run(1.7, 1000.0);
f.print();
std::cout<<"Function returned "<<x<<"\n";
return 0;
}
Depending on the machine's configuration, this may crash instead of
running your new function, because std::vector's heap memory is not
marked as being executable (the "No eXecute" or NX bit is set). We can change the heap's memory permission on a per-page basis using "mprotect",
a function that annoyingly only works on page aligned pointers. We'll talk about the page table stuff later.
#include <sys/mman.h> /* Needed to work around "No eXecute" (NX) heap memory */
...
/* Work around NX bit, by marking entire page as executable */
long mc_start=(long)&mc[0], mc_end=(long)&mc[mc.size()-1];
mc_start&=~0xfff; mc_end=(mc_end+0xfff)&~0xfff; /* round to page size */
mprotect((void *)mc_start,mc_end-mc_start,
PROT_READ+PROT_WRITE+PROT_EXEC);
...
Here's a way to use function_maker to translate a simple line-by-line script into machine code:
int foo(void) {
function_maker f;
std::string line;
while (std::getline(std::cin,line))
{ /* Format of lines:
rD?=rS; character
0123456 array index
*/
sse_instruction_type op; /* which operation to perform */
switch (line[2]) {
case '+': op=addss; break;
case '*': op=mulss; break;
case 'v': op=sqrtss; break;
default: std::cout<<"Unknown operation "<<line<<"!\n"; return 0;
}
f.instruction(line[1]-'0', op, line[5]-'0');
}
f.print();
std::cout<<"Running newly created function...\n";
float x=f.run(1.7, 10000.0);
std::cout<<"Function returned "<<x<<"\n";
return 0;
}
The bottom line is that creating a function's machine code at runtime isn't actually that hard!
mod |
reg/opcode |
r/m |
2 bits, selects memory or register access mode: 0: memory at register r/m 1: memory at register r/m+byte offset 2: memory at register r/m + 32-bit offset 3: register r/m itself (not memory) |
3 bits, usually a register number Usually the destination (but for dyslexic instructions, it's the source register; or occasionally even an extra opcode) |
3 bits, register number Usually defines the data source (except for dyslexic instructions) Treated as a pointer for mod!=3, treated as an ordinary register for mod==3. If r/m==4, indicates the real memory source is a SIB byte. |
Register |
eax |
ecx |
edx |
ebx |
esp |
ebp |
esi |
edi |
Number |
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
Machine Code |
Assembly |
C |
0x03 0312 |
add ecx,edx |
c+=d; |
0x03 0012 | add ecx,[edx] | c+=*d; |
0x01 0012 |
add [edx],ecx |
*d+=c; |
0x01 0312 |
add edx,ecx |
d+=c; |
0xf3, 0x0f, 0x58, 0301"0xf3 0x0f 0x58" is the "add SSE scalar single" opcode, "addss". "0xc1" is the ModR/M byte, which from the "ModR/M" table gives mod==3 (a register-register operation), reg/opcode==0 (the destination register is xmm0), and R/M==1 (the source register is xmm1). So all together this is "addss xmm0,xmm1".
svn co http://v8.googlecode.com/svn/trunk/ v8A typical chunk of dynamic translation generation code is in src/ia32/full-codegen-ia32.cc:
cd v8
svn co http://gyp.googlecode.com/svn/trunk build/gyp
make
Label ok;There are several interesting things happening here:
__ test(ecx, ecx);
__ j(zero, &ok, Label::kNear);
// +1 for return address.
int receiver_offset = (info->scope()->num_parameters() + 1) * kPointerSize;
__ mov(ecx, Operand(esp, receiver_offset));
__ JumpIfSmi(ecx, &ok);
__ CmpObjectType(ecx, JS_GLOBAL_PROXY_TYPE, ecx);
__ j(not_equal, &ok, Label::kNear);
__ mov(Operand(esp, receiver_offset),
Immediate(isolate()->factory()->undefined_value()));
__ bind(&ok);