A nice C++ interface for SSE operations

CS 301 Lecture, Dr. Lawlor

Here it is:
#include <xmmintrin.h>  /* Standard Intel header, has _mm_... functions */
/* A "vec4" is four floats: one SSE register.
   It is a thin value wrapper around __m128, so the intrinsics can be driven
   with ordinary C++ construction and assignment syntax. */
class vec4 {
public:
    /* Default construction deliberately leaves the register uninitialized,
       so declaring a vec4 costs nothing.  Assign to it before reading it. */
    inline vec4(void) {}

    /* Wrap a raw xmmintrin value.  Implicit on purpose, so the result of an
       intrinsic converts to a vec4 without a cast. */
    inline vec4(__m128 val) :v(val) {}

    /* Load up 4 floats from this 16-BYTE ALIGNED pointer.
       The pointer is const-qualified because loading never writes through
       it; this lets read-only data be loaded as well. */
    inline vec4(const float *a) :v(_mm_load_ps(a)) {}
    inline vec4 &operator=(const float *a) {v=_mm_load_ps(a); return *this;}

    /* Load up 4 copies of 1 float.
       _mm_set1_ps broadcasts the value directly, with no need to take the
       address of the by-value parameter. */
    inline vec4(float a) :v(_mm_set1_ps(a)) {}
    inline vec4 &operator=(float a) {v=_mm_set1_ps(a); return *this;}

    /* Extract the underlying xmmintrin value from this vec4 */
    inline __m128 get(void) const {return v;}

    /* Copy our 4 floats out to this 16-BYTE ALIGNED pointer.
       Declared const: storing does not modify the vec4 itself. */
    inline void write(float *a) const {_mm_store_ps(a,v);}
private:
    __m128 v; // We contain one four-float (128-bit) value.
};

/* Element-wise arithmetic for vec4's.
   Each operator unwraps both operands with get(), applies the matching
   packed-float intrinsic to all four lanes, and wraps the result again. */
inline vec4 operator+(const vec4 &lhs,const vec4 &rhs)
{
    return vec4(_mm_add_ps(lhs.get(), rhs.get()));
}

inline vec4 operator-(const vec4 &lhs,const vec4 &rhs)
{
    return vec4(_mm_sub_ps(lhs.get(), rhs.get()));
}

inline vec4 operator*(const vec4 &lhs,const vec4 &rhs)
{
    return vec4(_mm_mul_ps(lhs.get(), rhs.get()));
}

inline vec4 operator/(const vec4 &lhs,const vec4 &rhs)
{
    return vec4(_mm_div_ps(lhs.get(), rhs.get()));
}

/* User code */
enum {n=1024}; /* number of floats in arr; a multiple of 8 */

/* The array is handed to _mm_load_ps / _mm_store_ps, which require a
   16-byte aligned address, so request that alignment explicitly instead of
   relying on whatever the compiler happens to choose for a float array. */
alignas(16) float arr[n];

/* Offset added to, and factor multiplied into, every element of arr.
   Float literals avoid an implicit double-to-float conversion. */
float a=1.2f;
float b=0.3f;
int bar(void) {
vec4 A=a, B=b;
for (int i=0;i<n;i+=8) /* each loop iteration does 8 floats */
{
vec4 V=&arr[i]; /* load arr[i] .. arr[i+3] */
V=(V+A)*B;
V.write(&arr[i]); /* write back */

/* Unrolled SSE: second set of four floats */
V=&arr[i+4];
V=(V+A)*B;
V.write(&arr[i+4]);
}
return 0;
}

/* Entry point for the example: passes bar to print_time under the label
   "Bar" (print_time is not defined here; presumably the NetRun timing
   helper -- confirm against the environment).  Always returns 0. */
int foo(void)
{
    print_time("Bar", bar);
    return 0;
}

(Try this in NetRun now!)