; SSE (Intel's 1999 SIMD extension) scalar single-precision demo:
; load one float, double it, store it back, and print it.
; "ss" = scalar, single-precision (one float in the low lane of xmm0).
; (The packed "ps" forms below mean "parallel, single-precision".)
movss xmm0,[somePlace] ; xmm0[0] = *somePlace
addss xmm0,xmm0        ; xmm0[0] += xmm0[0]  (double the value)
movss [somePlace],xmm0 ; store the result back
push rax               ; re-align stack to 16 bytes for the call (SysV ABI)
mov rdi,somePlace      ; arg 1: pointer to the float array
mov rsi,1              ; arg 2: number of floats to print
extern farray_print
call farray_print
pop rax                ; undo the alignment push
ret
section .data
somePlace:
dd 12.3
; SSE packed single-precision demo ("ps" = parallel, single-precision):
; load 4 floats, double all 4 with one instruction, store, and print.
movups xmm0,[somePlace] ; unaligned load of 4 packed floats into xmm0
addps xmm0,xmm0         ; double all 4 floats in parallel
movups [somePlace],xmm0 ; unaligned store of the 4 results
push rax                ; re-align stack to 16 bytes for the call (SysV ABI)
mov rdi,somePlace       ; arg 1: pointer to the float array
mov rsi,4               ; arg 2: number of floats to print
extern farray_print
call farray_print
pop rax                 ; undo the alignment push
ret
section .data
somePlace:              ; 4 consecutive floats, doubled in place above
dd 12.3
dd 10.0
dd 100.0
dd 1.0
; AVX demo: 8 floats at a time in a 256-bit ymm register.
; What's really amazing about this is each function (scalar, SSE, AVX)
; takes about the same amount of time, roughly 5 nanoseconds.
vmovups ymm0,[somePlace] ; unaligned load of 8 packed floats into ymm0
vaddps ymm0,ymm0,ymm0    ; VEX ops take 3 operands: dst,src1,src2 — double all 8
vmovups [somePlace],ymm0 ; unaligned store of the 8 results
vzeroupper               ; clear upper ymm state before calling possibly-SSE code
                         ; (avoids the costly SSE/AVX transition penalty)
push rax                 ; re-align stack to 16 bytes for the call (SysV ABI)
mov rdi,somePlace        ; arg 1: pointer to the float array
mov rsi,8                ; arg 2: number of floats to print
extern farray_print
call farray_print
pop rax                  ; undo the alignment push
ret
section .data
somePlace:               ; 8 consecutive floats, doubled in place above
dd 12.3
dd 10.0
dd 100.0
dd 1.0
dd 0.01
dd 0.1
dd 0.2
dd 2.0
Scalar Single-precision (float) |
Scalar Double-precision (double) |
Packed Single-precision (4 floats) |
Packed Double-precision (2 doubles) |
Comments |
|
add |
addss |
addsd |
addps |
addpd |
sub, mul, div all work the same way |
min |
minss |
minsd |
minps |
minpd |
max works the same way |
sqrt |
sqrtss |
sqrtsd |
sqrtps |
sqrtpd |
Square root (sqrt), reciprocal (rcp), and reciprocal-square-root (rsqrt) all work the same way |
mov |
movss |
movsd |
movaps (aligned) movups (unaligned) |
movapd (aligned) movupd (unaligned) |
Aligned loads are up to 4x
faster, but will crash if given an unaligned address! In 64-bit mode, the stack is always
16-byte aligned, specifically for this instruction. Use "align 16" directive for static data. |
cvt | cvtss2sd cvtss2si cvttss2si |
cvtsd2ss cvtsd2si cvttsd2si |
cvtps2pd cvtps2dq cvttps2dq |
cvtpd2ps cvtpd2dq cvttpd2dq |
Convert to ("2", get it?) Single
Integer (si, stored in register like eax) or four DWORDs (dq, stored in
xmm register). "cvtt" versions do truncation (round toward zero); "cvt"
versions round to nearest. |
com |
ucomiss |
ucomisd |
n/a |
n/a |
Sets CPU flags like normal x86 "cmp" instruction, from SSE registers. |
cmp |
cmpeqss |
cmpeqsd |
cmpeqps |
cmpeqpd |
Compare for equality ("lt",
"le", "neq", "nlt", "nle" versions work the same way). Sets all bits
of float to zero if false (0.0), or all bits to ones if true (a NaN).
Result is used as a bitmask for the bitwise AND and OR operations. |
and |
n/a |
n/a |
andps andnps |
andpd andnpd |
Bitwise AND operation. "andn"
versions are bitwise AND-NOT operations (A=(~A) & B). "or"
version works the same way. |
__m128 _mm_load_ps(float *src) |
Load 4 floats from a 16-byte aligned address. WARNING: Segfaults if the address isn't a multiple of 16! |
__m128 _mm_loadu_ps(float *src) | Load 4 floats from an unaligned address (4x slower!) |
__m128 _mm_load1_ps(float *src) | Load 1 individual float into all 4 fields of an __m128 |
__m128 _mm_setr_ps(float a,float b,float c,float d) |
Load 4 separate floats from parameters into an __m128 |
void _mm_store_ps(float *dest,__m128 src) |
Store 4 floats to an aligned address. |
void _mm_storeu_ps(float *dest,__m128 src) | Store 4 floats to unaligned address |
__m128 _mm_add_ps(__m128 a,__m128 b) |
Add corresponding floats (also "sub") |
__m128 _mm_mul_ps(__m128 a,__m128 b) | Multiply corresponding floats (also "div", but it's slow) |
__m128 _mm_min_ps(__m128 a,__m128 b) | Take corresponding minimum (also "max") |
__m128 _mm_sqrt_ps(__m128 a) | Take square roots of 4 floats (12ns, slow like divide) |
__m128 _mm_rcp_ps(__m128 a) | Compute rough (12-bit accuracy) reciprocal of all 4 floats (as fast as an add!) |
__m128 _mm_rsqrt_ps(__m128 a) | Rough (12-bit) reciprocal-square-root of all 4 floats (fast) |
__m128 _mm_shuffle_ps(__m128 lo,__m128 hi, _MM_SHUFFLE(hi3,hi2,lo1,lo0)) |
Interleave inputs into low 2 floats and high 2 floats of output. Basically out[0]=lo[lo0]; out[1]=lo[lo1]; out[2]=hi[hi2]; out[3]=hi[hi3]; For example, _mm_shuffle_ps(a,a,_MM_SHUFFLE(i,i,i,i)) copies the float a[i] into all 4 output floats. |
; Branch-prediction demo: make a branch whose direction depends on a
; pseudo-random bit, so the CPU's branch predictor can't guess it.
add r11,1 ; treat r11 as a persistent counter. HACK: assumes r11 not used by NetRun's timing loop
mov r8,r11 ; copy the counter (so we don't destroy it)
imul r8,65747 ; scale the counter by some big number (not a power of two, though!)
and r8,4 ; grab a "random" bit from inside the scaled counter
cmp r8,0 ; check if that bit is zero
je same_thing ; if it's zero, jump
ret
same_thing: ; basically the same as not jumping, except for performance!
ret
if (x) y=a; else y=b; // conditional — can be compiled into any of these branch-free forms:
y=b+x*(a-b); // linear version (assumes a-b works)
y=x*a+(1-x)*b; // rearranged version of above
y=(x&a)|((~x)&b); // bitwise version (assumes x is all zero bits, or all one bits)
Note that this last bitwise form is just a software version of a hardware multiplexer, and it works well in SIMD mode.
#include <immintrin.h> /* Intel SIMD intrinsics header (use emmintrin.h if this fails) */
/* See the bottom of this lecture for a version of "floats" that works with AVX (8 floats) as well as SSE (4 floats). */
class floats; /* forward declaration (full definition below) */

/* A SIMD bundle of 4 boolean flags.
   Each lane holds the all-zeros/all-ones bitmask that SSE comparison
   instructions produce, so the masks can drive bitwise AND/OR selects. */
class booleans {
	__m128 v; // one 32-bit mask per float lane
public:
	enum {n=4}; // number of booleans handled at once

	booleans(__m128 mask) :v(mask) {}

	/* Lane-wise logical AND of two mask sets. */
	friend booleans operator&&(const booleans &a,const booleans &b) {
		return _mm_and_ps(a.v,b.v);
	}
	/* Lane-wise logical OR of two mask sets. */
	friend booleans operator||(const booleans &a,const booleans &b) {
		return _mm_or_ps(a.v,b.v);
	}

	/*
	  Return "the_then" in lanes where this mask is true, and
	  "the_else" in lanes where it is false — a per-element branch,
	  but without any actual branching.
	  Example:
	    booleans m=(a<b);
	    c=m.then_else(3,7);
	*/
	floats then_else(const floats &the_then,const floats &the_else);
};
/* This is a SIMD block of 4 single-precision floats, wrapping one SSE register. */
class floats {
	__m128 v;
public:
	enum {n=4}; // floats::n is how many floats we work on at once
	floats() {}
	floats(__m128 val) :v(val) {}
	floats(const float *src) {v=_mm_loadu_ps(src);}   // load 4 floats from any alignment
	floats(float single) {v=_mm_load1_ps(&single);}   // broadcast one float to all 4 lanes
	operator __m128 (void) const {return v;}
	void store(float *dest) const {_mm_storeu_ps(dest,v);} // store 4 floats, any alignment
	/* Caution: this only works if dest is 16-byte aligned. */
	void store_aligned(float *dest) const {_mm_store_ps(dest,v);}
	/* Printing support.
	   BUG FIX: write to the stream "o" that was passed in, not directly to
	   std::cout, so printing to files/stringstreams works as expected. */
	friend std::ostream &operator<<(std::ostream &o,const floats &a) {
		float out[floats::n];
		a.store(out);
		for (int i=0;i<floats::n;i++) o<<out[i]<<" ";
		return o;
	}
	/* Arithmetic operations, applied to all 4 lanes at once */
	friend floats operator+(const floats &a,const floats &b) {return _mm_add_ps(a.v,b.v);}
	friend floats operator-(const floats &a,const floats &b) {return _mm_sub_ps(a.v,b.v);}
	friend floats operator*(const floats &a,const floats &b) {return _mm_mul_ps(a.v,b.v);}
	friend floats operator/(const floats &a,const floats &b) {return _mm_div_ps(a.v,b.v);}
	friend floats min(const floats &a,const floats &b) {return _mm_min_ps(a.v,b.v);}
	friend floats max(const floats &a,const floats &b) {return _mm_max_ps(a.v,b.v);}
	friend floats sqrt(const floats &a) {return _mm_sqrt_ps(a.v);}
	/* Comparison operators: each returns a per-lane all-zeros/all-ones mask */
	friend booleans operator<(const floats &a,const floats &b) {return _mm_cmplt_ps(a.v,b.v);}
	friend booleans operator<=(const floats &a,const floats &b) {return _mm_cmple_ps(a.v,b.v);}
	friend booleans operator>=(const floats &a,const floats &b) {return _mm_cmpge_ps(a.v,b.v);}
	friend booleans operator>(const floats &a,const floats &b) {return _mm_cmpgt_ps(a.v,b.v);}
	friend booleans operator==(const floats &a,const floats &b) {return _mm_cmpeq_ps(a.v,b.v);}
	friend booleans operator!=(const floats &a,const floats &b) {return _mm_cmpneq_ps(a.v,b.v);}
};
/* Per-lane select: lanes where the mask is all-ones take "the_then",
   lanes where it is all-zeros take "the_else".
   This is a branch-free software multiplexer built from bitwise ops. */
inline floats booleans::then_else(const floats &the_then,const floats &the_else)
{
	__m128 picked_then = _mm_and_ps(v,the_then);    /*   mask  & then */
	__m128 picked_else = _mm_andnot_ps(v,the_else); /* (~mask) & else */
	return _mm_or_ps(picked_then,picked_else);      /* combine lanes  */
}
/* A little test code: */
float a[4]={1.375,1.567,1.876,1.999};
float b[4]={10.0,0.1,0.2,10.3};
float c[4]={5.0,5.1,5.2,5.3};
float d[4]={8.1,8.2,8.3,8.4};
float out[4];
int foo(void) {
floats A(a), B(b);
booleans m=(A<B);
std::cout<<"then_else="<<m.then_else(3,c)<<"\n";
return 0;
}