Sign |
Exponent |
Fraction (or
"Mantissa") |
1 bit-- 0 for positive 1 for negative |
8 unsigned bits-- 127 means 2^0, 137 means 2^10 |
23 bits-- a binary fraction. Don't forget the implicit leading 1! |
/* IEEE floating-point number's bits: sign exponent mantissa */
(Executable NetRun link)
struct float_bits {
  /* Three bitfields totalling 32 bits, declared from the fraction up to the
     sign.  How bitfields are packed into a word is implementation-defined,
     so the declaration order matters on any given compiler. */
  unsigned fraction : 23; /**< Stored fraction; the value is binary 1.fraction (the "mantissa") */
  unsigned exp : 8;       /**< Biased exponent; the scale factor is 2^(exp-127) */
  unsigned sign : 1;      /**< Sign flag: 0 is positive, 1 is negative */
};
/* A union is a struct where all the fields *overlap* each other */
/* Lets the same storage be viewed either as a float or as its bitfields.
   NOTE(review): reading a union member other than the one last written is
   relied on here; C++ compilers commonly allow it as an extension, but
   confirm for your toolchain. */
union float_dissector {
float f; /* the value, viewed as an ordinary float */
float_bits b; /* the same storage, viewed as sign / exp / fraction bitfields */
};
/* Store a value through the float member, then read the same storage back
   through the bitfield member to print the raw sign/exp/fraction fields. */
float_dissector s;
s.f=8.0; /* 8.0 is 1.0 x 2^3, so exp 127+3=130 and fraction 0 are expected (assumes the bitfields line up with the float's bits -- confirm on your compiler) */
std::cout<<s.f<<"= sign "<<s.b.sign<<" exp "<<s.b.exp<<" fract "<<s.b.fraction<<"\n";
return 0;
C Datatype |
Size |
Approx. Precision |
Approx. Range |
Exponent Bits |
Fraction Bits |
+-1 range |
float |
4 bytes (everywhere) |
1.0x10^-7 |
10^38 |
8 |
23 |
2^24 |
double |
8 bytes (everywhere) |
2.0x10^-15 |
10^308 |
11 |
52 |
2^53 |
long double |
12-16 bytes (if it even exists) |
2.0x10^-20 |
10^4932 |
15 |
64 |
2^65 |
/* A floating-point number, written inside a class (for no real reason) */
One very cool thing about C++ is that because everything we do with the "my_float" class is "inline", the compiler is smart enough to "see through" our my_float class to the double underneath. This means our "my_float" class actually costs nothing at runtime--it's just as fast to use our own wrapper around a "double" as it is to use a plain "double":
class my_float {
public:
  /* The hardware double this object wraps */
  double v;

  /* Wrap an actual hardware floating-point value.  Deliberately not
     "explicit", so a plain double still converts to a my_float. */
  my_float(double value) { v=value; }
};
/** Output operator, for easy cout-style printing */
std::ostream &operator<<(std::ostream &o,const my_float &f) {
  /* Print the wrapped double and hand the stream back so calls can chain */
  return o<<f.v;
}
/** Like "-a". Make this my_float have the opposite sign */
inline my_float operator-(const my_float &a)
{
  /* Negate the wrapped double, then re-wrap it */
  const double negated=-a.v;
  return my_float(negated);
}
/** Like "a+b". Add these two my_floats */
inline my_float operator+(const my_float &a,const my_float &b) {
  /* Add the two wrapped doubles, then re-wrap the sum */
  const double sum=a.v+b.v;
  return my_float(sum);
}
/** Like "a-b". Subtract these two my_floats */
inline my_float operator-(const my_float &a,const my_float &b) {
  /* Subtract the wrapped doubles, then re-wrap the difference */
  const double difference=a.v-b.v;
  return my_float(difference);
}
/* Operands for the wrapper-class benchmark: ma accumulates, mb is the addend */
my_float ma(1.0);
my_float mb(1.0);
/* Benchmark body: 1000 additions through the my_float wrapper.  Always returns 0. */
int my_fadd(void) {
  int remaining=1000;
  while (remaining-->0) {
    ma=ma+mb;
  }
  return 0;
}
/* Operands for the plain-double benchmark: fa accumulates, fb is the addend */
double fa=1.0;
double fb=1.0;
/* Benchmark body: 1000 additions on plain doubles.  Always returns 0. */
int hw_fadd(void) {
  int remaining=1000;
  while (remaining-->0) {
    fa=fa+fb;
  }
  return 0;
}
int foo(void) {
print_time("my_float",my_fadd);
print_time("hw_float",hw_fadd);
my_float a(1.0);
my_float b(0.25);
std::cout<<" a="<<a<<" b="<<b<<" a-b="<<(a-b)<<"\n";
return 0;
}
my_float: 2216.08 ns/call
hw_float: 2211.89 ns/call
a=1 b=0.25 a-b=0.75
Program complete. Return 0 (0x0)
/* A floating-point number, written in software */
Here's how we do output. I'm outputting the mantissa in hex, the exponent in signed decimal (just like printf's new "%a" format!), and then I'm also computing the floating-point value we represent:
class my_float {
public:
  int sign;     /* 0 means positive, 1 means negative */
  int exponent; /* the float is scaled by 2^exponent */
  int mantissa; /* integer value of the float before scaling */

  /* Build a "my_float" directly from its sign, exponent, and mantissa fields */
  my_float(int sign_,int exponent_,int mantissa_) {
    sign=sign_;
    exponent=exponent_;
    mantissa=mantissa_;
  }
};
/** Output operator, for easy cout-style printing */
OK. Let's start with something easy. How do we implement "-x"? Well, let's just flip the sign bit:
/* Prints sign, "0x" + mantissa in hex, "p" + exponent in decimal, and then
   the represented value sign * mantissa * 2^exponent in parentheses.
   Returns the stream so calls can be chained.  The caller's formatting
   flags are restored on exit: without that, the std::hex / std::dec
   manipulators used here would leave the stream switched to decimal. */
std::ostream &operator<<(std::ostream &o,const my_float &f) {
  const std::ios_base::fmtflags saved=o.flags(); /* caller's number base etc. */
  o<<(f.sign?"-":"+")<<
    "0x"<<std::hex<<f.mantissa<<
    "p"<<std::dec<<f.exponent<<
    " ("<<(f.sign?-1.0:+1.0)*f.mantissa*pow(2,f.exponent)<<") ";
  o.flags(saved); /* undo the hex/dec switch made above */
  return o;
}
/** Like "-a". Make this my_float have the opposite sign */
Let's try this out. We'll start with the number +1 times two to the zero power, and negate it:
inline my_float operator-(const my_float &a)
{
  /* Only the sign field changes: nonzero becomes 0, and 0 becomes 1 */
  const int flipped=a.sign?0:1;
  return my_float(flipped,a.exponent,a.mantissa);
}
int foo(void) {This prints out:
my_float a(0,0,1);
std::cout<<" a="<<a<<" -a="<<(-a)<<"\n";
return 0;
}
a=+0x1p0 (1) -a=-0x1p0 (-1)
OK! Looks like we've got "negate" down!
Program complete. Return 0 (0x0)
/** Like "a+b". Add these two my_floats */
(executable NetRun link)
inline my_float operator+(const my_float &a,const my_float &b) {
  /* Deliberately naive first attempt: only the mantissas are added.
     FIXME: the sign is copied from a (what if a.sign!=b.sign?)
     FIXME: the exponent is copied from a (what if a.exponent!=b.exponent?)
     FIXME: the mantissa sum is never renormalized (what about a carry?) */
  return my_float(a.sign,a.exponent,a.mantissa+b.mantissa);
}
int foo(void) {
my_float a(0,0,1), b(0,0,1);
std::cout<<" a="<<a<<" b="<<b<<" a+b="<<(a+b)<<"\n";
return 0;
}
my_float: 639.15 ns/call
hw_float: 2171.61 ns/call
a=+0x1p0 (1) b=+0x1p0 (1) a+b=+0x2p0 (2)
Program complete. Return 0 (0x0)
class my_float { ...
(executable NetRun link)
enum {mantissa_min=1u<<16}; /* <- smallest normalized mantissa, 2^16 (inclusive lower bound) */
enum {mantissa_max=1u<<17}; /* <- 2^17, one past the largest mantissa: normalizing shifts down while value>=mantissa_max (exclusive upper bound) */
/* Create a "my_float" from an integer value */
my_float(int value) {
if (value<0) {sign=1; value=-value;} else {sign=0;}
exponent=0; /* find exponent needed to "normalize" value. */
while (value<mantissa_min) {value*=2;exponent--;}
while (value>=mantissa_max) {value=value>>1;exponent++;}
mantissa=(mantissa_t)value; /*<- value has now been scaled properly */
}
/** Like "a+b". Add these two my_floats */
OK! Now
inline my_float operator+(const my_float &a,const my_float &b) {
  /* FIXME: the sign is copied from a (what if a.sign!=b.sign?)
     FIXME: the exponent starts from a's (what if a.exponent!=b.exponent?) */
  int sum=a.mantissa+b.mantissa;
  int scale=a.exponent;
  /* Mantissa carry: halve the sum and bump the exponent until it fits
     below mantissa_max again. */
  for (; sum>=my_float::mantissa_max; ++scale) {
    sum>>=1;
  }
  return my_float(a.sign,scale,sum);
}
a=+0x10000p-16 (1) b=+0x10000p-16 (1) a+b=+0x10000p-15 (2)
So our mantissas are normalized coming out!
/** Like "a+b". Add these two my_floats */
OK! Now 1 + 2 == 3. I claim this software-floating-point code actually works pretty well, although if you look at the timings you'll notice that now our software version is about 5x slower than hardware floating-point!
/* Adds two my_floats whose mantissas are assumed non-negative magnitudes.
   Both mantissas are shifted right to line up on the larger exponent
   (dropped bits are truncated, not rounded), then:
     - same signs: magnitudes are added and a carry is shifted back out;
     - different signs: the smaller magnitude is subtracted from the larger,
       the result takes the larger operand's sign, and it is shifted back up
       until it reaches mantissa_min again.
   A mantissa of 0 is treated as the value zero. */
inline my_float operator+(const my_float &a,const my_float &b) {
  /* Adding zero changes nothing; returning the other operand also avoids
     lining it up on zero's unrelated exponent and losing its bits. */
  if (a.mantissa==0) return b;
  if (b.mantissa==0) return a;

  int e=std::max(a.exponent,b.exponent); /* exponent of return value */

  /* Shifting an int by its full width or more is undefined, so an operand
     that is too small to matter contributes 0 instead. */
  const int int_bits=(int)sizeof(int)*8;
  const int a_shift=e-a.exponent;
  const int b_shift=e-b.exponent;
  int am=(a_shift<int_bits)?(a.mantissa>>a_shift):0; /* shifted mantissas (lined-up on e) */
  int bm=(b_shift<int_bits)?(b.mantissa>>b_shift):0;

  if (a.sign==b.sign) {
    int m=am+bm; /* outgoing mantissa */
    while (m>=my_float::mantissa_max) {m=m>>1;e++;} /* handle mantissa carry */
    return my_float(a.sign,e,m);
  }

  /* Opposite signs: subtract magnitudes; the larger one decides the sign */
  int s=a.sign;
  int m=am-bm;
  if (m<0) {s=b.sign; m=-m;}
  if (m==0) return my_float(0,0,0); /* exact cancellation gives +0 */
  while (m<my_float::mantissa_min) {m=m<<1;e--;} /* renormalize after cancellation */
  return my_float(s,e,m);
}