Floating-point arithmetic is quite fast when everything works properly. For example, I can add, subtract, and multiply ordinary floats or doubles in about 1ns each:

=: 0.433 ns/floatOther operations are not very fast:

+: 0.845 ns/float

-: 0.847 ns/float

*: 0.848 ns/float

/: 77.051 ns/floatHere's the code I used to measure these values. Note that I'm being very careful in this code to keep the values from exploding to infinity, because (as we see below) such a malfunction can change the speed of the code!

sqrt: 24.162 ns/float

cbrt: 75.031 ns/float

sin: 16.758 ns/float

cos: 21.203 ns/float

tan: 17.882 ns/float

asin: 25.842 ns/float

acos: 29.293 ns/float

atan: 17.715 ns/float

exp: 33.725 ns/float

log: 50.433 ns/float

pow: 105.437 ns/float

enum {n_X=1000};

double X[n_X];

const double k=0.99, k2=1.01, k3=0.000001;

int assign_X(void) { for (int i=0;i<n_X-1;i++) X[i]=k;return 0;}

int add_X(void) { for (int i=0;i<n_X-1;i++) X[i]=X[i]+k;return 0;}

int sub_X(void) { for (int i=0;i<n_X-1;i++) X[i]=X[i]-k;return 0;}

int mul_X(void) { for (int i=0;i<n_X-1;i++) X[i]=X[i]*k;return 0;}

int div_X(void) { for (int i=0;i<n_X-1;i++) X[i]=X[i]/k2;return 0;}

int sqrt_X(void) { for (int i=0;i<n_X-1;i++) X[i]=sqrt(X[i]);return 0;}

int cbrt_X(void) { for (int i=0;i<n_X-1;i++) X[i]=cbrt(X[i]);return 0;}

int sin_X(void) { for (int i=0;i<n_X-1;i++) X[i]=sin(X[i]);return 0;}

int cos_X(void) { for (int i=0;i<n_X-1;i++) X[i]=cos(X[i]);return 0;}

int tan_X(void) { for (int i=0;i<n_X-1;i++) X[i]=tan(X[i]);return 0;}

int asin_X(void) { for (int i=0;i<n_X-1;i++) X[i]=asin(X[i]);return 0;}

int acos_X(void) { for (int i=0;i<n_X-1;i++) X[i]=k3*acos(X[i]);return 0;}

int atan_X(void) { for (int i=0;i<n_X-1;i++) X[i]=atan(X[i]);return 0;}

int exp_X(void) { for (int i=0;i<n_X-1;i++) X[i]=k3*exp(X[i]);return 0;}

int log_X(void) { for (int i=0;i<n_X-1;i++) X[i]=k2+log(X[i]);return 0;}

int pow_X(void) { for (int i=0;i<n_X-1;i++) X[i]=pow(X[i],k);return 0;}

int foo(void) {

int i;

printf(" =: %.3f ns/float\n",time_function(assign_X)/n_X*1.0e9);

printf(" +: %.3f ns/float\n",time_function(add_X)/n_X*1.0e9);

printf(" -: %.3f ns/float\n",time_function(sub_X)/n_X*1.0e9);

for (i=0;i<n_X;i++) X[i]=1.0;

printf(" *: %.3f ns/float\n",time_function(mul_X)/n_X*1.0e9);

printf(" /: %.3f ns/float\n",time_function(div_X)/n_X*1.0e9);

printf("sqrt: %.3f ns/float\n",time_function(sqrt_X)/n_X*1.0e9);

printf("cbrt: %.3f ns/float\n",time_function(cbrt_X)/n_X*1.0e9);

printf(" sin: %.3f ns/float\n",time_function(sin_X)/n_X*1.0e9);

printf(" cos: %.3f ns/float\n",time_function(cos_X)/n_X*1.0e9);

printf(" tan: %.3f ns/float\n",time_function(tan_X)/n_X*1.0e9);

printf("asin: %.3f ns/float\n",time_function(asin_X)/n_X*1.0e9);

printf("acos: %.3f ns/float\n",time_function(acos_X)/n_X*1.0e9);

printf("atan: %.3f ns/float\n",time_function(atan_X)/n_X*1.0e9);

for (i=0;i<n_X;i++) X[i]=0.0;

printf(" exp: %.3f ns/float\n",time_function(exp_X)/n_X*1.0e9);

for (i=0;i<n_X;i++) X[i]=1.0;

printf(" log: %.3f ns/float\n",time_function(log_X)/n_X*1.0e9);

printf(" pow: %.3f ns/float\n",time_function(pow_X)/n_X*1.0e9);

printf("Final X values: %.4f %.4f %.4f\n",X[0],X[n_X/2],X[n_X-2]);

return 0;

}

Sign |
Exponent |
Mantissa (or Fraction) |

1 bit-- 0 for positive 1 for negative |
8 bits-- 127 means 2 ^{0
} 137 means 2^{10
} |
23 bits-- a binary fraction. |

The hardware usually interprets a float as having the value:

value = (-1)

Note that the mantissa normally has an implicit leading 1 applied.

value = (-1)

Supressing the leading 1 allows you to exactly represent 0: the bit pattern for 0.0 is just exponent==0 and fraction==00000000 (that is, everything zero). If you set the sign bit to negative, you have "negative zero", a strange curiosity. Positive and negative zero work the same way in arithmetic operations, and as far as I know there's no reason to prefer one to the other. The "==" operator claims positive and negative zero are the same!

If the fraction field isn't zero, but the exponent field is, you have a "denormalized number"--these are numbers too small to represent with a leading one. You always need denormals to represent zero, but denormals (also known as "subnormal" values) also provide a little more range at the very low end--they can store values down to around 1.0e-40 for "float", and 1.0e-310 for "double".

See below for the performance problem with denormals.

float z=0.0;Arithmetic on infinities works just the way you'd expect:infinity plus 1.0 gives infinity, etc. (See tables below). Positive and negative infinities exist, and work as you'd expect. Note that while divide-by-integer-zero causes a crash (divide by zero error), divide-by-floating-point-zero just happily returns infinity by default.

float f=1.0/z;

std::cout<<f<<"\n";

return (int)f;

- 0.0/0.0 (neither zero nor infinity, because we'd want (x/x)==1.0; but not 1.0 either, because we'd want (2*x)/x==2.0...)
- infinity-infinity (might cancel out to anything)
- infinity*0

float f=sqrt(-1.0);This is a "NaN", which is represented with a huge exponent and a *nonzero* fraction field. Positive and negative nans exist, but like zeros both signs seem to work the same. x86 seems to rewrite the bits of all NaNs to a special pattern it prefers (0x7FC00000 for float, with exponent bits and the leading fraction bit all set to 1).

std::cout<<f<<"\n";

return (int)f;

However, most modern machines *don't* handle denormals, infinities, or NaNs in hardware--instead when one of these special values occurs, they trap out to software which handles the problem and restarts the computation. This trapping process takes time, as shown in the following program:

(Executable NetRun Link)

enum {n_vals=1000};On my P4, this gives 3ns for zeros and ordinary values, 300ns for denormals (a 100x slowdown), and 700ns for infinities and NaNs (a 200x slowdown)!

double vals[n_vals];

int average_vals(void) {

for (int i=0;i<n_vals-1;i++)

vals[i]=0.5*(vals[i]+vals[i+1]);

return 0;

}

int foo(void) {

int i;

for (i=0;i<n_vals;i++) vals[i]=0.0;

printf(" Zeros: %.3f ns/float\n",time_function(average_vals)/n_vals*1.0e9);

for (i=0;i<n_vals;i++) vals[i]=1.0;

printf(" Ones: %.3f ns/float\n",time_function(average_vals)/n_vals*1.0e9);

for (i=0;i<n_vals;i++) vals[i]=1.0e-310;

printf(" Denorm: %.3f ns/float\n",time_function(average_vals)/n_vals*1.0e9);

float x=0.0;

for (i=0;i<n_vals;i++) vals[i]=1.0/x;

printf(" Inf: %.3f ns/float\n",time_function(average_vals)/n_vals*1.0e9);

for (i=0;i<n_vals;i++) vals[i]=x/x;

printf(" NaN: %.3f ns/float\n",time_function(average_vals)/n_vals*1.0e9);

return 0;

}

On my PowerPC 604e, this gives 35ns for zeros, 65ns for denormals (a 2x slowdown), and 35ns for infinities and NaNs (no penalty).

My friends at Illinois and I wrote a paper on this with many more performance details.

enum {n=1000};This takes 2.8ns per float; about one nanosecond float per pass. We get substantially better performance by "fusing" the loops:

float a[n], b[n];

int foo(void) {

for (int i=0;i<n;i++) a[i]=b[i]+3;

for (int i=0;i<n;i++) a[i]+=b[i]*7;

for (int i=0;i<n;i++) a[i]*=4;

return 0;

}

enum {n=1000};This takes just 1.54ns per float. We can do even better with some algebraic transformations:

float a[n], b[n];

int foo(void) {

for (int i=0;i<n;i++) {

a[i]=b[i]+3;

a[i]+=b[i]*7;

a[i]*=4;

}

return 0;

}

enum {n=1000};Now we're down to 1.0ns per iteration.

float a[n], b[n];

int foo(void) {

for (int i=0;i<n;i++) {

a[i]=b[i]*32+12;

}

return 0;

}

n equ 1000 ; NASM macro

mov ecx,0 ; loop counter

loopstart:

movss xmm0,[inarr+4*rcx]

movss xmm1,[thirtytwo]

mulss xmm0,xmm1

movss xmm1,[twelve]

addss xmm0,xmm1

movss [outarr+4*rcx],xmm0

add ecx,1

cmp ecx,n

jl loopstart

ret

section .data

inarr: times n dd 1.0

outarr: times n dd 0.0

twelve: dd 12.0

thirtytwo: dd 32.0

Loading four floats using "movups" doesn't provide much of a speedup, and we've got to replicate the constant values. We are down to 1.2ns per iteration:

n equ 1000 ; NASM macro

mov ecx,0 ; loop counter

loopstart:

movups xmm0,[inarr+4*rcx]

movups xmm1,[thirtytwo]

mulps xmm0,xmm1

movups xmm1,[twelve]

addps xmm0,xmm1

movups [outarr+4*rcx],xmm0

add ecx,4

cmp ecx,n

jl loopstart

ret

section .data

inarr: times n dd 1.0

outarr: times n dd 0.0

twelve: times 4 dd 12.0

thirtytwo: times 4 dd 32.0

Switching to an aligned load, "movaps", drops us to 0.36ns per iteration; almost a perfect 4x speedup!

n equ 1000 ; NASM macro

mov ecx,0 ; loop counter

loopstart:

movaps xmm0,[inarr+4*rcx]

movaps xmm1,[thirtytwo]

mulps xmm0,xmm1

movaps xmm1,[twelve]

addps xmm0,xmm1

movaps [outarr+4*rcx],xmm0

add ecx,4

cmp ecx,n

jl loopstart

ret

section .data

align 16

inarr: times n dd 1.0

outarr: times n dd 0.0

twelve: times 4 dd 12.0

thirtytwo: times 4 dd 32.0

Finally, we can pull the constant loads out of the loop, now down to 0.27ns per float!

n equ 1000 ; NASM macro

mov ecx,0 ; loop counter

movaps xmm2,[thirtytwo]

movaps xmm1,[twelve]

loopstart:

movaps xmm0,[inarr+4*rcx]

mulps xmm0,xmm2

addps xmm0,xmm1

movaps [outarr+4*rcx],xmm0

add ecx,4

cmp ecx,n

jl loopstart

ret

section .data

align 16

inarr: times n dd 1.0

outarr: times n dd 0.0

twelve: times 4 dd 12.0

thirtytwo: times 4 dd 32.0

enum {n=1000000};

float a[n], b[n];

int foo(void) {

#pragma omp parallel for

for (int i=0;i<n;i++) {

a[i]=b[i]*32+12;

}

return 0;

}