for (int i=0;i<n_vals;i++) {
vals[i]=vals[i]*a+b;
}
There's actually n_vals-way parallelism in this program, but extracting that parallelism from the C program is really tricky, since n_vals, vals, a, or b could all be changing while the loop runs (because another thread modifies them, because the vals array overlaps one of them in memory, etc.).
for all i simultaneously
vals[i]=vals[i]*a+b;
But that's not what the code above means in C; the C version means "set i to 0; do the loop body; i++; if (i<n_vals) repeat", which is totally different!
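for (int i=1;i<n_vals;i++) {
vals[i]=vals[i-1]*a+b;
}
Now it's not a parallel loop at all: each iteration reads vals[i-1], which the previous iteration just wrote, so the iterations must run in order.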
__m128 _mm_load_ps(float *src) | Load 4 floats from a 16-byte aligned address. |
__m128 _mm_loadu_ps(float *src) | Load from an unaligned address (4x slower!) |
__m128 _mm_load1_ps(float *src) | Load 1 float into all 4 fields of an __m128 |
__m128 _mm_setr_ps(float a,float b,float c,float d) | Load 4 floats from parameters into an __m128 |
void _mm_store_ps(float *dest,__m128 src) | Store 4 floats to an aligned address. |
void _mm_storeu_ps(float *dest,__m128 src) | Store 4 floats to unaligned address |
__m128 _mm_add_ps(__m128 a,__m128 b) | Add corresponding floats (also "sub") |
__m128 _mm_mul_ps(__m128 a,__m128 b) | Multiply corresponding floats (also "div") |
__m128 _mm_min_ps(__m128 a,__m128 b) | Take corresponding minimum (also "max") |
__m128 _mm_sqrt_ps(__m128 a) | Take square roots of 4 floats (12ns, slow like divide) |
__m128 _mm_rcp_ps(__m128 a) | Compute rough (12-bit accuracy) reciprocal of all 4 floats (fast as an add!) |
__m128 _mm_rsqrt_ps(__m128 a) | Rough (12-bit) reciprocal-square-root of all 4 floats (fast) |
__m128 _mm_shuffle_ps(__m128 lo,__m128 hi, _MM_SHUFFLE(hi3,hi2,lo1,lo0)) | Interleave inputs into low 2 floats and high 2 floats of output. Basically out[0]=lo[lo0]; out[1]=lo[lo1]; out[2]=hi[hi2]; out[3]=hi[hi3]; For example, _mm_shuffle_ps(a,a,_MM_SHUFFLE(i,i,i,i)) copies the float a[i] into all 4 output floats. |
__m128 va=_mm_load_ps1(&a); /* va contains 4 copies of a */
__m128 vb=_mm_load_ps1(&b); /* vb contains 4 copies of b */
for (int i=0;i<n_vals;i+=4) { /* careful! n_vals must be multiple of 4! */
__m128 v=_mm_load_ps(&vals[i]); /* careful about alignment! */
v=_mm_mul_ps(v,va);
v=_mm_add_ps(v,vb);
_mm_store_ps(&vals[i],v);
}
This gives us about a 4x speedup over the original, and still a 2x speedup over the unrolled version! (Note that _mm_load_ps1 is just another name for the _mm_load1_ps intrinsic listed in the table above.)