movaps xmm0,[a]
addps xmm0,xmm0 ; add each float to itself
movaps [out],xmm0 ; copy floats out to memory
push rax; align stack
extern farray_print ; prototype
mov rdi, out ; print the floats in memory
mov rsi,4 ; count of floats: four to print
call farray_print
pop rax ; clean up stack
ret
section .data align=16
a: dd 3.141592,1.0,10.0,100.0
out: dd 0,0,0,0
single/ serial 
packed/ parallel 

singleprecision "float" 
ss 
ps (4 floats) 
doubleprecision "double" 
sd 
pd (2 doubles) 
enum {n=1024};Staring at the assembly language, there are a number of "cvtss2sd" and back again, due to the doubleprecision constants and singleprecision data. So we can get a good speedup to 1.4ns/float, just by making the constants floating point.
float a[n];
for (int i=0;i<n;i++) a[i]=3.4;
for (int i=0;i<n;i++) a[i]+=1.2;
return a[0];
enum {n=1024};We can run a *lot* faster by using SSE parallel instructions. I'm going to do this the "hard way," making separate functions to do the assembly computation. Essentially, we're converting the loops to run across four iterations at once, so they can operate on blocks of floats.
float a[n];
for (int i=0;i<n;i++) a[i]=3.4f;
for (int i=0;i<n;i++) a[i]+=1.2f;
return a[0];
extern "C" void init_array(float *arr,int n);Here are the two assembly language functions called above. Together, we're down to under 0.5ns/float!
extern "C" void add_array(float *arr,int n);
int foo(void) {
enum {n=1024};
float a[n];
init_array(a,n);
add_array(a,n);
return a[0]*1000;
}
; extern "C" void init_array(float *arr,int n);0.5ns/float is pretty impressive performance for this code, since:
;for (int i=0;i<n;i+=4) {
; a[i]=3.4f;
; a[i+1]=3.4f;
; a[i+2]=3.4f;
; a[i+3]=3.4f;
;}
global init_array
init_array:
; rdi points to arr
; rsi is n, the array length
mov rcx,0 ; i
movaps xmm1,[constant3_4]
jmp loopcompare
loopstart:
movaps [rdi+4*rcx],xmm1 ; init array with xmm1
add rcx,4
loopcompare:
cmp rcx,rsi
jl loopstart
ret
section .data align=16
constant3_4:
dd 3.4,3.4,3.4,3.4 ; movaps!
section .text
; extern "C" void add_array(float *arr,int n);
;for (int i=0;i<n;i++) a[i]+=1.2f;
global add_array
add_array:
; rdi points to arr
; rsi is n, the array length
mov rcx,0 ; i
movaps xmm1,[constant1_2]
jmp loopcompare2
loopstart2:
movaps xmm0,[rdi+4*rcx] ; loads arr[i] through arr[i+3]
addps xmm0,xmm1
movaps [rdi+4*rcx],xmm0
add rcx,4
loopcompare2:
cmp rcx,rsi
jl loopstart2
ret
section .data align=16
constant1_2:
dd 1.2,1.2,1.2,1.2 ; movaps!
Scalar Singleprecision (float) 
Scalar Doubleprecision (double) 
Packed Singleprecision (4 floats) 
Packed Doubleprecision (2 doubles) 
Example 
Comments 

Arithmetic 
addss 
addsd 
addps 
addpd 
addss xmm0,xmm1 
sub, mul, div all work the same way 
Compare 
minss 
minsd 
minps 
minpd 
minps xmm0,xmm1 
max works the same way 
Sqrt 
sqrtss 
sqrtsd 
sqrtps 
sqrtpd 
sqrtss xmm0,xmm1 
Square root (sqrt), reciprocal (rcp), and reciprocalsquareroot (rsqrt) all work the same way 
Move 
movss 
movsd 
movaps (aligned) movups (unaligned) 
movapd (aligned) movupd (unaligned) 
movss xmm0,xmm1 
Aligned loads are up to 4x
faster, but will crash if given an unaligned address! Stack is always
16byte aligned specifically for this instruction. Use "align 16" directive for static data. 
Convert  cvtss2sd cvtss2si cvttss2si 
cvtsd2ss cvtsd2si cvttsd2si 
cvtps2pd cvtps2dq cvttps2dq 
cvtpd2ps cvtpd2dq cvttpd2dq 
cvtsi2ss xmm0,eax 
Convert to ("2", get it?) Single
Integer (si, stored in register like eax) or four DWORDs (dq, stored in
xmm register). "cvtt" versions do truncation (round down); "cvt"
versions round to nearest. 
High Bits 
n/a 
n/a 
movmskps 
movmskpd 
movmskps eax,xmm0 
Extract the sign bits of an xmm
register into an integer register. Often used to see if all the
floats are "done" and you can exit. 
Compare to flags 
ucomiss 
ucomisd 
n/a 
n/a 
ucomiss xmm0,xmm1 jbe dostuff 
Sets CPU flags like normal x86 "cmp" instruction, but from SSE registers.
Use with "jb", "jbe", "je", "jae", or "ja" for normal
comparisons. Sets "pf", the parity flag, if either input is a NaN. 
Compare to bitwise mask 
cmpeqss 
cmpeqsd 
cmpeqps 
cmpeqpd 
cmpleps xmm0,xmm1 
Compare for equality ("lt",
"le", "neq", "nlt", "nle" versions work the same way). There's
also a "cmpunordss" that marks NaN values. Sets all bits
of float to zero if false (0.0), or all bits to ones if true (a
NaN).
Result is used as a bitmask for the bitwise AND and OR operations. 
Bitwise 
n/a 
n/a 
andps andnps 
andpd andnpd 
andps xmm0,xmm1 
Bitwise AND operation. "andn"
versions are bitwise ANDNOT operations (A=(~A) & B). "or"
version works the same way. This is for the "branchless ifthenelse" we covered before: result = (maskV & thenV)  ((~maskV) & elseV); 