; Scalar SSE demo (Intel's 1999 SIMD extension): load one float,
; double it, store it back, and print it.
; "ss" = scalar single-precision; the packed "ps" forms below work on 4 floats.
movss xmm0,[somePlace]   ; load one float into the low lane of xmm0
addss xmm0,xmm0          ; scalar add: double that one float
movss [somePlace],xmm0   ; store the result back
push rax                 ; dummy push keeps rsp 16-byte aligned at the call
mov rdi,somePlace        ; arg 1: pointer to the float array
mov rsi,1                ; arg 2: number of floats to print
extern farray_print
call farray_print        ; print the array (external helper)
pop rax
ret

section .data
somePlace:
	dd 12.3
; Packed SSE demo: load 4 floats, double all of them with ONE instruction,
; store them back, and print them.  "ps" = packed single-precision.
movups xmm0,[somePlace]  ; unaligned load of 4 floats (movaps would need a 16-byte aligned address)
addps xmm0,xmm0          ; 4 parallel adds: every float doubled at once
movups [somePlace],xmm0  ; store all 4 results
push rax                 ; dummy push keeps rsp 16-byte aligned at the call
mov rdi,somePlace        ; arg 1: pointer to the float array
mov rsi,4                ; arg 2: print 4 floats
extern farray_print
call farray_print
pop rax
ret

section .data
somePlace:
	dd 12.3
	dd 10.0
	dd 100.0
	dd 1.0
; AVX demo: same idea with 256-bit ymm registers -- 8 floats per instruction.
; Remarkably, each of these functions takes about the same time, ~5 ns.
vmovups ymm0,[somePlace]   ; unaligned load of 8 floats
vaddps ymm0,ymm0,ymm0      ; AVX is 3-operand: dest = src1 + src2 (here, double all 8 floats)
vmovups [somePlace],ymm0   ; store all 8 results
vzeroupper                 ; clear ymm upper halves before calling SSE/scalar code (avoids AVX->SSE transition penalty)
push rax                   ; dummy push keeps rsp 16-byte aligned at the call
mov rdi,somePlace          ; arg 1: pointer to the float array
mov rsi,8                  ; arg 2: print 8 floats
extern farray_print
call farray_print
pop rax
ret

section .data
somePlace:
	dd 12.3
	dd 10.0
	dd 100.0
	dd 1.0
	dd 0.01
	dd 0.1
	dd 0.2
	dd 2.0
Scalar Single-precision (float) 
Scalar Double-precision (double) 
Packed Single-precision (4 floats) 
Packed Double-precision (2 doubles) 
Comments 

add 
addss 
addsd 
addps 
addpd 
sub, mul, div all work the same way 
min 
minss 
minsd 
minps 
minpd 
max works the same way 
sqrt 
sqrtss 
sqrtsd 
sqrtps 
sqrtpd 
Square root (sqrt), reciprocal (rcp), and reciprocal-square-root (rsqrt) all work the same way 
mov 
movss 
movsd 
movaps (aligned) movups (unaligned) 
movapd (aligned) movupd (unaligned) 
Aligned loads are up to 4x
faster, but will crash if given an unaligned address! In 64-bit mode, the stack is always
16-byte aligned, specifically for this instruction. Use the "align 16" directive for static data. 
cvt  cvtss2sd cvtss2si cvttss2si 
cvtsd2ss cvtsd2si cvttsd2si 
cvtps2pd cvtps2dq cvttps2dq 
cvtpd2ps cvtpd2dq cvttpd2dq 
Convert to ("2", get it?) a single
integer (si, stored in an integer register like eax) or four DWORDs (dq, stored in an
xmm register). The "cvtt" versions do truncation (round toward zero); the "cvt"
versions round to nearest. 
com 
ucomiss 
ucomisd 
n/a 
n/a 
Sets CPU flags like normal x86 "cmp" instruction, from SSE registers. 
cmp 
cmpeqss 
cmpeqsd 
cmpeqps 
cmpeqpd 
Compare for equality ("lt",
"le", "neq", "nlt", "nle" versions work the same way). Sets all bits
of float to zero if false (0.0), or all bits to ones if true (a NaN).
Result is used as a bitmask for the bitwise AND and OR operations. 
and 
n/a 
n/a 
andps andnps 
andpd andnpd 
Bitwise AND operation. "andn"
versions are bitwise ANDNOT operations (A=(~A) & B). "or"
version works the same way. 
__m128 _mm_load_ps(float *src) 
Load 4 floats from a 16byte aligned address. WARNING: Segfaults if the address isn't a multiple of 16! 
__m128 _mm_loadu_ps(float *src)  Load 4 floats from an unaligned address (4x slower!) 
__m128 _mm_load1_ps(float *src)  Load 1 individual float into all 4 fields of an __m128 
__m128 _mm_setr_ps(float a,float b,float c,float d) 
Load 4 separate floats from parameters into an __m128 
void _mm_store_ps(float *dest,__m128 src) 
Store 4 floats to an aligned address. 
void _mm_storeu_ps(float *dest,__m128 src)  Store 4 floats to unaligned address 
__m128 _mm_add_ps(__m128 a,__m128 b) 
Add corresponding floats (also "sub") 
__m128 _mm_mul_ps(__m128 a,__m128 b)  Multiply corresponding floats (also "div", but it's slow) 
__m128 _mm_min_ps(__m128 a,__m128 b)  Take corresponding minimum (also "max") 
__m128 _mm_sqrt_ps(__m128 a)  Take square roots of 4 floats (12ns, slow like divide) 
__m128 _mm_rcp_ps(__m128 a)  Compute rough (12bit accuracy) reciprocal of all 4 floats (as fast as an add!) 
__m128 _mm_rsqrt_ps(__m128 a)  Rough (12bit) reciprocalsquareroot of all 4 floats (fast) 
__m128 _mm_shuffle_ps(__m128 lo,__m128 hi, _MM_SHUFFLE(hi3,hi2,lo1,lo0)) 
Interleave inputs into low 2 floats and high 2 floats of output. Basically out[0]=lo[lo0]; out[1]=lo[lo1]; out[2]=hi[hi2]; out[3]=hi[hi3]; For example, _mm_shuffle_ps(a,a,_MM_SHUFFLE(i,i,i,i)) copies the float a[i] into all 4 output floats. 
; Branch-prediction demo: take a branch based on a pseudo-random bit,
; so the CPU's branch predictor cannot guess it.
add r11,1 ; treat r11 as a counter. HACK: assumes r11 is not used by the surrounding timing loop
mov r8,r11 ; copy the counter
imul r8,65747 ; scale the counter by some big number (not a power of two, though!)
and r8,4 ; grab a pseudo-random bit (bit 2) from inside the scaled counter
cmp r8,0 ; check if that bit is zero
je same_thing ; if it's zero, jump -- taken roughly half the time, so it's unpredictable
ret
same_thing: ; basically the same as not jumping, except for performance!
ret
if (x) y=a; else y=b; // conditional -- can be rewritten into any of these branch-free forms:
y=b+x*(a-b); // linear version (assumes a-b works)
y=x*a+(1-x)*b; // rearranged version of the above
y=(x&a)|((~x)&b); // bitwise version (assumes x is all zero bits, or all one bits)
Note that this last one is just a software version of a hardware multiplexer, and it works well in SIMD mode.
#include <immintrin.h> /* Intel SIMD intrinsics header (use emmintrin.h if this fails) */
/* See the bottom of this lecture for a version of "floats" that works with AVX (8 floats) as well as SSE (4 floats). */
class floats; /* forward declaration */

/* This is a SIMD block of boolean flags: one flag per float lane.
   Each lane holds all-zero bits (false) or all-one bits (true),
   exactly as produced by the SSE compare instructions. */
class booleans {
	__m128 v; // per-lane bitmasks
public:
	enum {n=4}; // how many bools we work on at once
	booleans(__m128 m) :v(m) {}
	/* Logical operations on booleans (bitwise on the lane masks).
	   NOTE: unlike built-in && and ||, these do NOT short-circuit. */
	friend booleans operator&&(const booleans &a,const booleans &b) {return _mm_and_ps(a.v,b.v);}
	friend booleans operator||(const booleans &a,const booleans &b) {return _mm_or_ps(a.v,b.v);}
	/*
	  Return "the_then" in lanes where we are true, and "the_else" where we are false.
	  Basically performs a per-element branch, but without branching.
	  Example:
	     booleans m=(a<b);
	     c=m.then_else(3,7);
	*/
	floats then_else(const floats &the_then,const floats &the_else);
};

/* This is a SIMD block of single-precision floats. */
class floats {
	__m128 v; // 4 packed floats
public:
	enum {n=4}; // floats::n is how many floats we work on at once
	floats() {}
	floats(__m128 m) :v(m) {}
	floats(const float *src) {v=_mm_loadu_ps(src);} // unaligned load of 4 floats
	floats(float single) {v=_mm_load1_ps(&single);} // broadcast one float to all 4 lanes
	operator __m128 (void) const {return v;}
	void store(float *dest) const {_mm_storeu_ps(dest,v);} // unaligned store
	/* Caution: this only works if dest is 16-byte aligned. */
	void store_aligned(float *dest) const {_mm_store_ps(dest,v);}
	/* Printing support: writes the 4 lanes, each followed by a space. */
	friend std::ostream &operator<<(std::ostream &o,const floats &a) {
		float out[floats::n];
		a.store(out);
		for (int i=0;i<floats::n;i++) o<<out[i]<<" "; // write to o, not std::cout, so any stream works
		return o;
	}
	/* Arithmetic operations, element-by-element */
	friend floats operator+(const floats &a,const floats &b) {return _mm_add_ps(a.v,b.v);}
	friend floats operator-(const floats &a,const floats &b) {return _mm_sub_ps(a.v,b.v);}
	friend floats operator*(const floats &a,const floats &b) {return _mm_mul_ps(a.v,b.v);}
	friend floats operator/(const floats &a,const floats &b) {return _mm_div_ps(a.v,b.v);}
	friend floats min(const floats &a,const floats &b) {return _mm_min_ps(a.v,b.v);}
	friend floats max(const floats &a,const floats &b) {return _mm_max_ps(a.v,b.v);}
	friend floats sqrt(const floats &a) {return _mm_sqrt_ps(a.v);}
	/* Comparison operators: each produces a per-lane mask (see booleans) */
	friend booleans operator<(const floats &a,const floats &b) {return _mm_cmplt_ps(a.v,b.v);}
	friend booleans operator<=(const floats &a,const floats &b) {return _mm_cmple_ps(a.v,b.v);}
	friend booleans operator>=(const floats &a,const floats &b) {return _mm_cmpge_ps(a.v,b.v);}
	friend booleans operator>(const floats &a,const floats &b) {return _mm_cmpgt_ps(a.v,b.v);}
	friend booleans operator==(const floats &a,const floats &b) {return _mm_cmpeq_ps(a.v,b.v);}
	friend booleans operator!=(const floats &a,const floats &b) {return _mm_cmpneq_ps(a.v,b.v);}
};

/* Branch-free per-lane select: (mask & then) | (~mask & else). */
inline floats booleans::then_else(const floats &the_then,const floats &the_else)
{
	return _mm_or_ps(
		_mm_and_ps(v,the_then),
		_mm_andnot_ps(v,the_else)
	);
}
/* A little test code: */
float a[4]={1.375,1.567,1.876,1.999};
float b[4]={10.0,0.1,0.2,10.3};
float c[4]={5.0,5.1,5.2,5.3};
float d[4]={8.1,8.2,8.3,8.4};
float out[4];
int foo(void) {
floats A(a), B(b);
booleans m=(A<B);
std::cout<<"then_else="<<m.then_else(3,c)<<"\n";
return 0;
}