Course Review for Final Exam

CS 441 Lecture, Dr. Lawlor

Pre-midterm content on the final:

You should also understand the following performance pitfalls, and how to analyze an application for high performance, in each of these explicit parallel programming models:

OpenMP: uses threads; invoked with a #pragma before a "for" loop; watch out for global or shared variables; scalability typically 2-8x maximum.

MPI: uses processes; calls appear throughout main; watch out for blocking calls; scalability nearly unlimited.

GLSL or CUDA: uses shader cores; invoked with a func<<<blocks,threads>>> kernel call; watch out for data transfer and startup cost; scalability up to 1000x or so (relative to the CPU).

SSE: uses floats; invoked via intrinsics like the _mm_add_ps macro; watch out for branch decoherence; scalability up to 4x.
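
For example, that _mm_add_ps macro is an intrinsic that adds four floats in a single instruction. Here's a minimal standalone illustration of the four-at-a-time model (my sketch, not one of the lecture's benchmarks):

#include <xmmintrin.h> /* SSE intrinsics */
#include <stdio.h>

int main() {
    __m128 a=_mm_setr_ps(1.0f,2.0f,3.0f,4.0f); /* pack four floats into one register */
    __m128 b=_mm_setr_ps(10.0f,20.0f,30.0f,40.0f);
    __m128 c=_mm_add_ps(a,b); /* one instruction adds all four pairs */
    float out[4];
    _mm_storeu_ps(out,c);
    printf("%g %g %g %g\n",out[0],out[1],out[2],out[3]); /* prints 11 22 33 44 */
    return 0;
}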

Mandelbench

Here's part of the Mandelbrot set fractal, rendered at 512x512 in each of the parallel programming models listed above.
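Every version below iterates the same complex update z ← z²+c; writing z = zr + i·zi and c = cr + i·ci and expanding the square gives the two real update lines that appear in every inner loop:

\[
z \leftarrow z^2 + c
\quad\Longleftrightarrow\quad
nzr = zr^2 - zi^2 + cr,
\qquad
nzi = 2\,zr\,zi + ci
\]

A pixel "escapes" once |z|² = zr² + zi² exceeds 4, and the iteration count at escape becomes the pixel's grey level.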

Sequential C++

#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */

const int wid=512, ht=512;
int render_mandelbrot(int x,int y)
{
    float cr=0.2+x*0.2/wid, ci=0.5+y*0.2/ht;
    float zr=cr, zi=ci;
    int count;
    const int max_count=256;
    for (count=0;count<max_count;count++) {
        // z= z*z+c
        float nzr=zr*zr-zi*zi + cr;
        float nzi=2.0f*zr*zi + ci;
        if ((nzr*nzr+nzi*nzi)>4.0f) break;
        zr=nzr; zi=nzi;
    }
    return count;
}

/* Run on CPU */
int main(int argc,char *argv[]) {
    double start=time_in_seconds(), elapsed;
    unsigned char *arr=new unsigned char[wid*ht];
#define timer (time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n"
    std::cout<<"Allocated array: "<<timer;

    start=time_in_seconds(); // <- reset the timer!
    for (int y=0;y<ht;y++)
        for (int x=0;x<wid;x++)
            arr[y*wid+x]=render_mandelbrot(x,y);
    elapsed=time_in_seconds()-start;
    std::cout<<"Rendered array: "<<timer;

    std::ofstream of("out.ppm",std::ios_base::binary);
    of<<"P5\n"; // greyscale, binary
    of<<wid<<" "<<ht<<"\n"; // image size
    of<<"255\n"; // byte image
    of.write((char *)&arr[0],wid*ht);

    elapsed=time_in_seconds()-start;
    std::cout<<"Render + IO: "<<timer;
    return 0;
}

(Try this in NetRun now!)

Allocated array: 0.255568ns/pixel
Rendered array: 491.817ns/pixel
Render + IO: 493.858ns/pixel
(This is one core of a Quad-core Intel Q6600)

OpenMP

...
#pragma omp parallel for schedule(dynamic,1)
    for (int y=0;y<ht;y++)
        for (int x=0;x<wid;x++)
            arr[y*wid+x]=render_mandelbrot(x,y);
...

(Try this in NetRun now!)

Allocated array: 0.240107ns/pixel
Rendered array: 138.046ns/pixel
Render + IO: 140.446ns/pixel
OpenMP gives about a 3.6x speedup on this Quad-core Intel Q6600.
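
That schedule(dynamic,1) clause matters: rows near the Mandelbrot set run the full 256 iterations while rows that escape immediately are nearly free, so a static schedule leaves threads idling on cheap rows. Here's a small self-contained sketch of the effect (my example with a fake variable-cost "row", not NetRun code):

#include <omp.h>
#include <cstdio>

volatile long sink=0; /* volatile, so the optimizer can't delete the work */
void fake_row(int y) { /* row cost grows with y, like rows near the set */
    long local=0;
    for (long i=0;i<(long)y*2000;i++) local+=i;
    sink+=local; /* benign race; this only exists to defeat the optimizer */
}

int main() {
    for (int pass=0;pass<2;pass++) {
        double start=omp_get_wtime();
        if (pass==0) {
            #pragma omp parallel for schedule(static) /* contiguous chunks: imbalanced */
            for (int y=0;y<512;y++) fake_row(y);
        } else {
            #pragma omp parallel for schedule(dynamic,1) /* hand out rows one at a time */
            for (int y=0;y<512;y++) fake_row(y);
        }
        printf("%s: %.3f s\n",pass?"dynamic,1":"static",omp_get_wtime()-start);
    }
    return 0;
}

With the static schedule, whichever thread draws the last (most expensive) chunk of rows does most of the work; dynamic,1 keeps all cores busy until the queue of rows runs out.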

OpenMP + SSE

#include <xmmintrin.h> /* SSE __m128 header */
#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */

const int wid=512, ht=512;
/* Render 4 adjacent pixels of the mandelbrot set: x+0 through x+3 */
void render_mandelbrot(int x,int y,unsigned char *output)
{
    /* See http://www.cs.uaf.edu/2009/fall/cs441/lecture/11_19_mandelbrot_compare.html */
    float cr[4], ci[4];
    for (int i=0;i<4;i++) {
        cr[i]=0.2+(x+i)*(0.2/wid);
        ci[i]=0.5+y*(0.2/ht);
    }
    float two[4]={2.0,2.0,2.0,2.0};
    __m128 TWO=_mm_load_ps(two);
    float four[4]={4.0,4.0,4.0,4.0};
    __m128 FOUR=_mm_load_ps(four);
    __m128 CR=_mm_load_ps(cr), CI=_mm_load_ps(ci);
    __m128 ZR=CR, ZI=CI;
    int count;
    enum {max_count=256};
    for (count=0;count<max_count;count++) {
        __m128 tZR=ZR*ZR-ZI*ZI+CR; /* subtle: don't overwrite ZR yet! */
        __m128 tZI=TWO*ZR*ZI+CI;
        __m128 LEN=tZR*tZR+tZI*tZI;
        __m128 MASK=_mm_cmplt_ps(LEN,FOUR); /* set if len<4.0 */
        ZR=_mm_or_ps(_mm_and_ps(MASK,tZR),_mm_andnot_ps(MASK,ZR));
        ZI=_mm_or_ps(_mm_and_ps(MASK,tZI),_mm_andnot_ps(MASK,ZI));
        if (_mm_movemask_ps(LEN-FOUR)==0) break; /* everybody's > 4 */
    }
    _mm_store_ps(cr,ZR); _mm_store_ps(ci,ZI);
    for (int i=0;i<4;i++) {
        output[i]=(char)(cr[i]*(256/4.0));
    }
}

/* Run on CPU */
int main(int argc,char *argv[]) {
    double start=time_in_seconds(), elapsed;
    unsigned char *arr=new unsigned char[wid*ht];
#define timer (time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n"
    std::cout<<"Allocated array: "<<timer;

    start=time_in_seconds(); // <- reset the timer!
#pragma omp parallel for schedule(dynamic,1)
    for (int y=0;y<ht;y++)
        for (int x=0;x<wid;x+=4)
            render_mandelbrot(x,y,&arr[y*wid+x]);
    elapsed=time_in_seconds()-start;
    std::cout<<"Rendered array: "<<timer;

    std::ofstream of("out.ppm",std::ios_base::binary);
    of<<"P5\n"; // greyscale, binary
    of<<wid<<" "<<ht<<"\n"; // image size
    of<<"255\n"; // byte image
    of.write((char *)&arr[0],wid*ht);

    elapsed=time_in_seconds()-start;
    std::cout<<"Render + IO: "<<timer;
    return 0;
}

(Try this in NetRun now!)

Allocated array: 0.241016ns/pixel
Rendered array: 85.1824ns/pixel
Render + IO: 87.4979ns/pixel
See my 2009 lecture for the development of the SSE version.  On top of OpenMP's 3.6x speedup, SSE gives about another 1.6x, for a total of nearly 6x over the sequential code.  SSE is crucial for getting the most performance from modern CPUs, but it is by far the ugliest code of the bunch!  I need to rewrite this version using my "float4" class; I'm also cheating a bit by writing out zr instead of "count".
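
For reference, here's a sketch of what such a "float4" wrapper might look like (my guess at the idea, not Dr. Lawlor's actual class): operator overloading hides the intrinsics, so the inner loop could read almost like the scalar version.

#include <xmmintrin.h>
#include <stdio.h>

struct float4 {
    __m128 v;
    float4(__m128 x) :v(x) {}
    float4(float x) :v(_mm_set1_ps(x)) {} /* broadcast one float to all four lanes */
};
inline float4 operator+(float4 a,float4 b) {return _mm_add_ps(a.v,b.v);}
inline float4 operator-(float4 a,float4 b) {return _mm_sub_ps(a.v,b.v);}
inline float4 operator*(float4 a,float4 b) {return _mm_mul_ps(a.v,b.v);}
/* blend: where a mask lane is all-ones take a, else keep b (the and/andnot/or trick above) */
inline float4 blend(float4 mask,float4 a,float4 b) {
    return _mm_or_ps(_mm_and_ps(mask.v,a.v),_mm_andnot_ps(mask.v,b.v));
}

int main() { /* tiny smoke test */
    float4 z=1.0f, c=0.25f;
    z=z*z+c; /* reads like scalar code, but runs on all four lanes */
    float out[4]; _mm_storeu_ps(out,z.v);
    printf("%g %g %g %g\n",out[0],out[1],out[2],out[3]); /* 1.25 in every lane */
    return 0;
}

With a helper like blend, the three-intrinsic mask dance in the inner loop collapses to one call: ZR=blend(MASK,tZR,ZR);.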

fork()+sockets

// Socket-based multicore parallelism (for quad-core machine)
#include "osl/socket.h"
#include "osl/socket.cpp"
#include <sys/wait.h> /* for wait() */
#include <unistd.h> /* for fork() */
#include "lib/inc.c" /* NetRun, for time_in_seconds */
#include <iostream>
#include <fstream>
#include <math.h>
#include <string.h> /* for memcpy */
#include <stdlib.h> /* for exit */

const int wid=512, ht=512;
int render_mandelbrot(int x,int y)
{
    float cr=0.2+x*0.2/wid, ci=0.5+y*0.2/ht;
    float zr=cr, zi=ci;
    int count;
    const int max_count=256;
    for (count=0;count<max_count;count++) {
        // z= z*z+c
        float nzr=zr*zr-zi*zi + cr;
        float nzi=2.0f*zr*zi + ci;
        if ((nzr*nzr+nzi*nzi)>4.0f) break;
        zr=nzr; zi=nzi;
    }
    return count;
}

/* Run as process "rank", one process among "size" others.
   Each socket connects you with another rank: s[0] connects to rank 0. */
void run(int rank,int size,SOCKET *s)
{
    double start=time_in_seconds();
    float ns=1.0e9/(wid*ht); /* convert seconds to ns/pixel */

    enum {n=512}; /* total size of the problem */
    int lht=(ht+size-1)/size; /* local height = ht/size, rounded up */
    char limg[wid*lht];
    /* Render interleaved rows; each processor taking one and skipping size-1 */
    for (int ly=0, gy=rank;gy<ht;ly++, gy+=size)
        for (int x=0;x<wid;x++)
            limg[ly*wid+x]=render_mandelbrot(x,gy);

    if (rank!=0) /* send local data to 0 */
    {
        skt_sendN(s[0],limg,wid*lht);
    } else /* I'm rank 0: receive and unpack rows */
    {
        printf("render: %.3f ns/pixel\n",(time_in_seconds()-start)*ns);
        char img[wid*ht]; /* global image */
        for (int srcrank=0;srcrank<size;srcrank++) {
            if (srcrank>0) { /* receive somebody else's rows */
                skt_recvN(s[srcrank],limg,wid*lht);
            }
            /* Copy out local rows into global image */
            for (int ly=0, gy=srcrank;gy<ht;ly++, gy+=size)
                memcpy(&img[wid*gy],&limg[wid*ly],wid);
        }
        printf("render+net: %.3f ns/pixel\n",(time_in_seconds()-start)*ns);

        /* write the image */
        std::ofstream of("out.ppm",std::ios_base::binary);
        of<<"P5\n"; // greyscale, binary
        of<<wid<<" "<<ht<<"\n"; // image size
        of<<"255\n"; // byte image
        of.write((char *)&img[0],wid*ht);

        printf("render+net+IO: %.3f ns/pixel\n",(time_in_seconds()-start)*ns);
    }
}

int main(void) {
    double start=time_in_seconds();
    unsigned int port=0;
    const int size=4; /* quad-core machine */
    SOCKET s[size];
    SERVER_SOCKET serv=skt_server(&port);
    for (int child=1;child<size;child++) {
        int newpid=fork();
        if (newpid==0) { /* I'm the child */
            s[0]=skt_connect(skt_lookup_ip("127.0.0.1"),port,2);
            run(child,size,s);
            skt_close(s[0]);
            exit(0); /* close out child process when done */
        }
        /* else I'm the parent */
        s[child]=skt_accept(serv,0,0);
    }
    /* Now that all children are created, run as parent */
    run(0,size,s);
    /* Once parent is done, collect all the children */
    for (int child=1;child<size;child++) {
        skt_close(s[child]);
        int status=0;
        wait(&status); /* wait for child to finish */
    }
    printf("total_run: %.3f ms\n",(time_in_seconds()-start)*1.0e3);
    return 0;
}

(Try this in NetRun now!)

render: 132.263 ns/pixel
render+net: 136.078 ns/pixel
render+net+IO: 138.461 ns/pixel
Speedup with sockets is actually a tiny bit better than with OpenMP, but still about 3.6x on this Intel Q6600.
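
The row-interleaving in run() is worth spelling out, since the MPI version below reuses it: rank r renders global rows r, r+size, r+2*size, ..., storing them as local rows 0, 1, 2, ....  A tiny standalone check of the mapping (my example, not lecture code):

#include <stdio.h>

int main() {
    const int size=4, ht=12; /* small stand-in for 4 processes, 512 rows */
    for (int rank=0;rank<size;rank++) {
        printf("rank %d: global rows",rank);
        for (int ly=0, gy=rank; gy<ht; ly++, gy+=size)
            printf(" %d(local %d)",gy,ly);
        printf("\n");
    }
    return 0;
}

Interleaving rows, rather than handing each process one contiguous quarter of the image, also balances load: the expensive rows near the set get spread across all processes.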

MPI


#include <iostream>
#include <fstream>
#include <mpi.h>
#include <math.h>
#include <string.h> /* for memcpy */
#include <stdio.h> /* for printf */

const int wid=512, ht=512;
int render_mandelbrot(int x,int y)
{
    float cr=0.2+x*0.2/wid, ci=0.5+y*0.2/ht;
    float zr=cr, zi=ci;
    int count;
    const int max_count=256;
    for (count=0;count<max_count;count++) {
        // z= z*z+c
        float nzr=zr*zr-zi*zi + cr;
        float nzi=2.0f*zr*zi + ci;
        if ((nzr*nzr+nzi*nzi)>4.0f) break;
        zr=nzr; zi=nzi;
    }
    return count;
}

int main(int argc,char *argv[])
{
    MPI_Init(&argc,&argv);
    int rank,size;
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    double start=MPI_Wtime();
    float ns=1.0e9/(wid*ht); /* convert seconds to ns/pixel */

    enum {n=512}; /* total size of the problem */
    int lht=(ht+size-1)/size; /* local height = ht/size, rounded up */
    char limg[wid*lht];
    /* Render interleaved rows; each processor taking one and skipping size-1 */
    for (int ly=0, gy=rank;gy<ht;ly++, gy+=size)
        for (int x=0;x<wid;x++)
            limg[ly*wid+x]=render_mandelbrot(x,gy);

    int tag=37331; /* MPI tag (arbitrary value) */
    if (rank!=0) /* send local data to 0 */
    {
        MPI_Send(limg,wid*lht,MPI_CHAR, 0,tag,MPI_COMM_WORLD);
    } else /* I'm rank 0: receive and unpack rows */
    {
        printf("render: %.3f ns/pixel\n",(MPI_Wtime()-start)*ns);
        char img[wid*ht]; /* global image */
        for (int srcrank=0;srcrank<size;srcrank++) {
            if (srcrank>0) { /* receive somebody else's rows */
                MPI_Status sts;
                MPI_Recv(limg,wid*lht,MPI_CHAR, srcrank,tag,MPI_COMM_WORLD,&sts);
            }
            /* Copy out local rows into global image */
            for (int ly=0, gy=srcrank;gy<ht;ly++, gy+=size)
                memcpy(&img[wid*gy],&limg[wid*ly],wid);
        }
        printf("render+net: %.3f ns/pixel\n",(MPI_Wtime()-start)*ns);

        /* write the image */
        std::ofstream of("out.ppm",std::ios_base::binary);
        of<<"P5\n"; // greyscale, binary
        of<<wid<<" "<<ht<<"\n"; // image size
        of<<"255\n"; // byte image
        of.write((char *)&img[0],wid*ht);

        printf("render+net+IO: %.3f ns/pixel\n",(MPI_Wtime()-start)*ns);
    }
    MPI_Finalize();
    return 0;
}

(Try this in NetRun now!)

1 process:
render: 695.171 ns/pixel
render+net: 696.270 ns/pixel
render+net+IO: 699.032 ns/pixel

2 processes:
render: 348.187 ns/pixel
render+net: 356.354 ns/pixel
render+net+IO: 359.066 ns/pixel

4 processes:
render: 172.669 ns/pixel
render+net: 186.508 ns/pixel
render+net+IO: 189.327 ns/pixel

10 processes:
render: 70.209 ns/pixel
render+net: 81.886 ns/pixel
render+net+IO: 84.621 ns/pixel
MPI does an excellent job of connecting these Intel Core2 Duos with gigabit Ethernet.  The baseline is a little slower than the newer Q6600 CPU, but we can beat even the hideous OpenMP+SSE version by adding enough processors.
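
Since every rank contributes the same wid*lht bytes, the manual send/receive loop above could also be a single collective.  Here's a hedged sketch (mine, not the lecture's code) using MPI_Gather on a tiny stand-in image:

#include <mpi.h>
#include <string.h>
#include <stdio.h>

int main(int argc,char *argv[]) {
    MPI_Init(&argc,&argv);
    int rank,size;
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);
    const int wid=8, ht=8; /* tiny stand-in for 512x512 */
    int lht=(ht+size-1)/size;
    char limg[wid*lht];
    for (int ly=0, gy=rank; gy<ht; ly++, gy+=size)
        for (int x=0;x<wid;x++) limg[ly*wid+x]=(char)gy; /* fake "render": store row number */
    char *all=0, img[wid*ht];
    if (rank==0) all=new char[size*wid*lht]; /* recv buffer only matters on the root */
    MPI_Gather(limg,wid*lht,MPI_CHAR, all,wid*lht,MPI_CHAR, 0,MPI_COMM_WORLD);
    if (rank==0) { /* unpack: rank r's local row ly is global row r+ly*size */
        for (int r=0;r<size;r++)
            for (int ly=0, gy=r; gy<ht; ly++, gy+=size)
                memcpy(&img[wid*gy],&all[r*wid*lht+wid*ly],wid);
        printf("img row 5 holds %d (should be 5)\n",(int)img[5*wid]);
        delete[] all;
    }
    MPI_Finalize();
    return 0;
}

The collective also gives the MPI library room to use a tree or pipelined gather instead of size-1 serial receives, though with only a handful of ranks the difference is small.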

GLSL

float cr=0.2+0.2*texcoords.x, ci=0.5+0.2*(1.0-texcoords.y);
float zr=cr, zi=ci;
int count;
const int max_count=256;
for (count=0;count<max_count;count++) {
    // z= z*z+c
    float nzr=zr*zr-zi*zi + cr;
    float nzi=2.0*zr*zi + ci;
    if ((nzr*nzr+nzi*nzi)>4.0) break;
    zr=nzr; zi=nzi;
}
gl_FragColor=vec4(fract(count/255.0));

(Try this in NetRun now!)

Rendering: 2.929 ns/pixel
The NVIDIA GeForce GTX 280 just demolishes all the CPUs here: 167x faster than the serial version, and still dozens of times faster than all ten CPUs with MPI.  The code is actually still quite readable, although I'm leaving out the code and time to copy the data off the GPU (shown below).

ogl/gpgpu.h (Orion's OpenGL Libraries)

#include <iostream>
#include <fstream> /* for std::ofstream */
#include "ogl/gpgpu.h" /* Dr. Lawlor's OpenGL wrappers for general-purpose GPU computing */
#include "ogl/glsl.cpp"
#include <GL/glew.c> /* include entire body here, for easy linking */
#include "lib/inc.c" /* NetRun file, for time_in_seconds */

int main() {
    gpu_env e; /* make the environment (OpenGL window &c) */
    const int wid=512, ht=512;

    double start=time_in_seconds();
    gpu_array img(e,"img",wid,ht,0,GL_LUMINANCE32F_ARB);
    std::cout<<"Allocate: "<<(time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n";

    for (int run=0;run<2;run++) { //<- the first run is 30ms slower (compile time)
        start=time_in_seconds(); // <- reset the timer for each run

        GPU_RUN(img,
            float cr=0.2+0.2*location.x; float ci=0.5+0.2*location.y;
            float zr=cr; float zi=ci;
            int count;
            const int max_count=256;
            for (count=0;count<max_count;count++) {
                /* z= z*z+c */
                float nzr=zr*zr-zi*zi + cr;
                float nzi=2.0*zr*zi + ci;
                if ((nzr*nzr+nzi*nzi)>4.0) break;
                zr=nzr; zi=nzi;
            }
            gl_FragColor=vec4(fract(count/255.0));
        )
        glFinish(); // <- needed for accurate timing only
        std::cout<<"Render: "<<(time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n";

        /* Copy data off to CPU */
        char out[wid*ht];
        img.read((float *)out,wid,ht,0,0,GL_UNSIGNED_BYTE);
        std::cout<<"Render + Mem: "<<(time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n";

        /* Write image file */
        std::ofstream of("out.ppm",std::ios_base::binary);
        of<<"P5\n"; // greyscale, binary
        of<<wid<<" "<<ht<<"\n"; // image size
        of<<"255\n"; // byte image
        of.write(out,wid*ht);
        std::cout<<"Render + Mem + IO: "<<(time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n";
    }

    return 0;
}

(Try this in NetRun now!)

Allocate: 37.7193ns/pixel
Render: 3.21184ns/pixel
Render + Mem: 12.081ns/pixel
Render + Mem + IO: 15.0641ns/pixel
At the moment, the slowest thing you can do on a GPU is talk to the CPU: copying the data off the card takes nearly three times as long as calculating it, so "Render + Mem" is almost 4x the render-only time.  The first call to GPU_RUN has to compile the GLSL code, so I call it twice in a loop here.  I think the code is pretty readable, but I wrote the gpu_ library, so I would think that!  A more standard interface, CUDA, is shown below.

CUDA

#include <cuda.h>
#include <iostream>
#include <fstream>
#include "lib/inc.c" /* NetRun utility functions, like time_in_seconds */

#define check(cudacall) { int err=cudacall; if (err!=cudaSuccess) std::cout<<"CUDA ERROR "<<err<<" at line "<<__LINE__<<"'s "<<#cudacall<<"\n";}

const int wid=512, ht=512;

/* GPU Code! */
__global__ void fill_in_array(float *arr) {
    int i=threadIdx.x+blockDim.x*blockIdx.x; // my thread's global number
    int x=i%wid, y=i/wid; // my thread's pixel to work on

    float cr=0.2+x*0.2/wid, ci=0.5+y*0.2/ht;
    float zr=cr, zi=ci;
    int count;
    const int max_count=256;
    for (count=0;count<max_count;count++) {
        // z= z*z+c
        float nzr=zr*zr-zi*zi + cr;
        float nzi=2.0*zr*zi + ci;
        if ((nzr*nzr+nzi*nzi)>4.0) break;
        zr=nzr; zi=nzi;
    }
    arr[y*wid+x]=count;
}

/* Run on CPU */
int main(int argc,char *argv[]) {
    float *arr=0; /* DATA LIVES ON THE GPU!!!! */

    cudaMalloc((void **)&arr,sizeof(float)); /* allocate one float (pay CUDA startup time) */

    /* Allocate memory */
    double start=time_in_seconds(), t;
    check(cudaMalloc((void **)&arr, wid*ht*sizeof(float)));
    t=time_in_seconds()-start;
    std::cout<<"Allocated array: "<<t*1.0e9/(wid*ht)<<"ns/pixel\n";
    start=time_in_seconds(); // <- reset the timer!
    int threadsPerBlock=512;
    int nBlocks=wid*ht/threadsPerBlock;

    /* Call CUDA kernel */
    fill_in_array<<<nBlocks,threadsPerBlock>>>(arr);

    check(cudaThreadSynchronize()); // look for errors in kernel
    t=time_in_seconds()-start;
    std::cout<<"Rendered array: "<<t*1.0e9/(wid*ht)<<"ns/pixel\n";

    /* Copy data back to CPU */
    float harr[wid*ht];
    check(cudaMemcpy(harr,arr,wid*ht*sizeof(float),cudaMemcpyDeviceToHost));
    t=time_in_seconds()-start;
    std::cout<<"Render + Mem: "<<t*1.0e9/(wid*ht)<<"ns/pixel\n";

    /* Convert from float to char */
    char carr[wid*ht];
    for (int i=0;i<wid*ht;i++) carr[i]=(char)harr[i];

    /* Write data to file */
    std::ofstream of("out.ppm",std::ios_base::binary);
    of<<"P5\n"; // greyscale, binary
    of<<wid<<" "<<ht<<"\n"; // image size
    of<<"255\n"; // byte image
    of.write(carr,wid*ht);
    std::cout<<"Render + Mem + IO: "<<(time_in_seconds()-start)*1.0e9/(wid*ht)<<"ns/pixel\n";
    return 0;
}

(Try this in NetRun now!)

Allocated array: 0.328004ns/pixel
Rendered array: 4.31813ns/pixel
Render + Mem: 11.5661ns/pixel
Render + Mem + IO: 32.9589ns/pixel
CUDA actually seems to run the calculation portion more slowly than GLSL for this problem, though it pulls slightly ahead once you include the time to copy the answer off the card, and then falls way behind by the time you finish the I/O.  I'm hitting a weird CUDA 2.2 driver bug if I have the kernel write raw char data, so the I/O time is much larger here because I have to do the float-to-char conversion on the CPU.  There's also a startup effect in CUDA: the first call takes about 40ms to find and initialize the card, which is why the code above pays for a dummy cudaMalloc before starting the timer.
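
For comparison, here's what the byte-writing kernel I wanted would look like: a hedged sketch assuming the same wid and ht constants as above (this is the version the CUDA 2.2 driver bug prevented, so it's untested):

/* Hedged sketch (not the lecture's working code): the same kernel writing
   unsigned chars directly, skipping the CPU-side float-to-char loop. */
__global__ void fill_in_array_bytes(unsigned char *arr) {
    int i=threadIdx.x+blockDim.x*blockIdx.x; // my thread's global number
    int x=i%wid, y=i/wid; // my thread's pixel to work on
    float cr=0.2f+x*0.2f/wid, ci=0.5f+y*0.2f/ht;
    float zr=cr, zi=ci;
    int count;
    for (count=0;count<256;count++) {
        float nzr=zr*zr-zi*zi+cr;
        float nzi=2.0f*zr*zi+ci;
        if ((nzr*nzr+nzi*nzi)>4.0f) break;
        zr=nzr; zi=nzi;
    }
    arr[y*wid+x]=(unsigned char)count; // byte write: 4x less data to copy back
}

Writing one byte per pixel instead of one float would cut the device-to-host copy to a quarter of the data, and would eliminate the conversion loop before the I/O.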