Example Parallel Program: Shallow-Water Wave Equation
CS 441 Lecture, Dr. Lawlor
Serial Code
Here's the serial code (also on the powerwall in ~olawlor/441_demo/shallow_wave/openmp):
/*
Stencil-based 2D array computation: Shallow-Water Wave Equation
Example program.
Dr. Orion Sky Lawlor, olawlor@acm.org, 2008-11-20 (Public Domain)
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <fstream> /* for ofstream */
#include <vector>
#include "vec3.h"
/********************** 2D Field Support **************************/
template <class T>
class field2D {
public:
    int wid,ht;
    field2D(int w,int h,T *data_,int rowsize_)
        :wid(w),ht(h),data(data_),rowsize(rowsize_) {}
    T &operator() (int i,int j) {boundscheck(i,j); return data[i+j*rowsize];}
    T operator() (int i,int j) const {boundscheck(i,j); return data[i+j*rowsize];}
private:
    T *data;
    int rowsize;
    void boundscheck(int i,int j) const {
        if (i<0 || i>=wid) {std::cout<<"Error: i="<<i<<" out of bounds!\n"; exit(1);}
        if (j<0 || j>=ht) {std::cout<<"Error: j="<<j<<" out of bounds!\n"; exit(1);}
    }
};
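/* Note: field2D does not own its storage; it is just a view (pointer, size,
   and row stride) onto a buffer the caller supplies.  That is why the
   std::swap(src,dest) at the end of each timestep below is a cheap swap of
   pointers and sizes, not a copy of the whole field. */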
/*********************** Problem Physics ***********************/
/* Computes shallow-water wave equations:
     x - u, fluid X velocity + 0.5
     y - v, fluid Y velocity + 0.5
     z - h, fluid height

     du_dt = - u*du_dx - v*du_dy - growth*dh_dx
     dv_dt = - u*dv_dx - v*dv_dy - growth*dh_dy
     dh_dt = - u*dh_dx - v*dh_dy - h*(du_dx + dv_dy)

   Produces beautiful barely-stable patterns at growth=0.12.

   Inputs: source field and one location in it.
   Output: new field value at that location.
*/
vec3 physics(const field2D<vec3> &src,int i,int j) {
    vec3 L=src(i-1,j);
    vec3 R=src(i+1,j);
    vec3 T=src(i,j+1);
    vec3 B=src(i,j-1);
    vec3 C=src(i,j);
    vec3 dx=vec3(R-L), dy=vec3(T-B);
    vec3 avg=0.25*(L+R+T+B);
    /* Parameters */
    float bl=0.2, gp=1.2, dt=0.15;
    /* Diffusion */
    C=C*(1-bl)+bl*avg;
    /* Shallow-water dynamics */
    float u=C.x, v=C.y, h=C.z;
    /* 'd': the direction our value will change based on physics */
    vec3 d = -u*dx -v*dy;
    d.x+= -gp*dx.z;
    d.y+= -gp*dy.z;
    d.z+= -h*(dx.x+dy.y);
    /* Take first-order Euler step */
    C += dt*d;
    return C;
}
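/* Note: physics() only reads the src array; its only output is the returned
   value.  That is why the loops below can fill in dest in any order (and
   hence in parallel) and then just swap the src and dest buffers. */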
int main(int argc,char *argv[])
{
    // Figure out how big an image we should render:
    int i,j, wid=300, ht=200;

    // Set up initial conditions
    std::vector<vec3> data1(wid*ht), data2(wid*ht);
    field2D<vec3> src(wid,ht,&data1[0],wid);
    field2D<vec3> dest(wid,ht,&data2[0],wid);
    for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
        float lil=1.0e-6f; /* bias up from zero, to avoid denormal slowdown */
        dest(i,j)=src(i,j)=vec3(lil,lil,(i>100 && i<150 && j>100 && j<150)?1.0f:lil);
    }
    int doprint=0; /* print out debugging info */

    // Run physics for a certain number of timesteps
    for (int t=0;t<1000;t++) {
        /* Compute new interior points in dest */
        for (j=1;j<ht-1;j++) {
            for (i=1;i<wid-1;i++) {
                dest(i,j) = physics(src,i,j);
            }
        }
        /* Swap old and new arrays */
        std::swap(src,dest);
    }

    // Create a PPM output image file header
    char outfile[100]; sprintf(outfile,"out.ppm");
    std::ofstream out(outfile,std::ios_base::binary);
    out<<"P6\n"
       <<wid<<" "<<ht<<"\n"
       <<"255\n";
    // Write out PPM file data
    for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
        vec3 v=src(i,j);
        unsigned char r=(unsigned char)((v.x*0.4+0.5)*255.0);
        unsigned char g=(unsigned char)((v.y*0.4+0.5)*255.0);
        unsigned char b=(unsigned char)((v.z*0.9)*255.0);
        unsigned char buf[3]={r,g,b};
        out.write((char *)buf,3);
    }
    return 0;
}
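The vec3.h header isn't reproduced above. Judging from how the code uses it (public x/y/z floats, componentwise + and -, scalar scaling, and +=), a minimal stand-in would look something like the sketch below; this is an assumption about the interface, and the actual header in the demo directory may differ.

/* Minimal vec3 sketch (assumed from usage; the real vec3.h may differ) */
class vec3 {
public:
    float x,y,z;
    vec3(void) :x(0),y(0),z(0) {}
    vec3(float x_,float y_,float z_) :x(x_),y(y_),z(z_) {}
};
inline vec3 operator+(const vec3 &a,const vec3 &b) {return vec3(a.x+b.x,a.y+b.y,a.z+b.z);}
inline vec3 operator-(const vec3 &a,const vec3 &b) {return vec3(a.x-b.x,a.y-b.y,a.z-b.z);}
inline vec3 operator*(const vec3 &a,float s) {return vec3(a.x*s,a.y*s,a.z*s);}
inline vec3 operator*(float s,const vec3 &a) {return a*s;}
inline vec3 &operator+=(vec3 &a,const vec3 &b) {a.x+=b.x; a.y+=b.y; a.z+=b.z; return a;}

Any layout like this (three packed floats) also fits the MPI version below, which sends each boundary row as wid*3 MPI_FLOATs.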
OpenMP
Here are the changes for the OpenMP code (also on the powerwall in ~olawlor/441_demo/shallow_wave/openmp):
    // Run physics for a certain number of timesteps
    for (int t=0;t<1000;t++) {
        /* Compute new interior points in dest */
        #pragma omp parallel for private(i)
        for (j=1;j<ht-1;j++) {
            for (i=1;i<wid-1;i++) {
                dest(i,j) = physics(src,i,j);
            }
        }
        /* Swap old and new arrays */
        std::swap(src,dest);
    }
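That one pragma does quite a bit of work: it spins up a team of threads and hands each one a block of j rows, which is safe here because every dest(i,j) depends only on src. Hand-rolled, the row division looks roughly like the sketch below; this is a sketch only, it assumes a contiguous block partition like OpenMP's static schedule, and it needs #include <omp.h>. The real runtime handles scheduling and remainders for you.

    /* Sketch: roughly what "#pragma omp parallel for" does with the j loop above */
    #pragma omp parallel
    {
        int nthreads=omp_get_num_threads(); /* threads in this team */
        int me=omp_get_thread_num();        /* my thread number, 0..nthreads-1 */
        int rows=ht-2;                      /* interior rows are j=1..ht-2 */
        int per=(rows+nthreads-1)/nthreads; /* rows per thread, rounded up */
        int jstart=1+me*per, jend=jstart+per;
        if (jend>ht-1) jend=ht-1;           /* last thread may get a short block */
        for (int j=jstart;j<jend;j++)
            for (int i=1;i<wid-1;i++)
                dest(i,j) = physics(src,i,j);
    }

Compiling needs an OpenMP flag (for example, -fopenmp with g++).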
Sequential Performance: 14 seconds elapsed.
2 CPUs: 8 seconds elapsed.
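That's a speedup of 14/8 = 1.75x on two CPUs, or about 88% parallel efficiency.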
MPI
Here's the MPI code (also on the powerwall in ~olawlor/441_demo/shallow_wave/MPI):
/*
Stencil-based 2D array computation: Shallow-Water Wave Equation
Example program.
Dr. Orion Sky Lawlor, olawlor@acm.org, 2008-11-20 (Public Domain)
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <fstream> /* for ofstream */
#include <vector>
#include <mpi.h>
#include "vec3.h"
/* ... the field2D class and physics function are the same as in the serial version ... */
void cleanup(void) {MPI_Finalize();}

int main(int argc,char *argv[])
{
    MPI_Init(&argc,&argv);
    atexit(cleanup);
    int rank,size;
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);

    // Figure out how big an image we should render:
    int i,j, wid=300, ht=200;

    // Figure out how big an image I alone am responsible for:
    int global_ht=ht;
    ht=ht/size; /* number of rows *I* have to make */
    int start_j=rank*ht, end_j=(rank+1)*ht;

    // Set up initial conditions
    std::vector<vec3> data1(wid*ht), data2(wid*ht);
    field2D<vec3> src(wid,ht,&data1[0],wid);
    field2D<vec3> dest(wid,ht,&data2[0],wid);
    for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
        int gj=j+start_j; // initial conditions are in global space
        float lil=1.0e-6f; /* bias up from zero, to avoid denormal slowdown */
        dest(i,j)=src(i,j)=vec3(lil,lil,(i>100 && i<150 && gj>100 && gj<150)?1.0f:lil);
    }
    int doprint=0; /* print out debugging info */
    // Run physics for a certain number of timesteps
    for (int t=0;t<1000;t++) {
        /* Do communication: exchange boundary rows with neighbors */
        std::vector<MPI_Request> reqs; MPI_Request r;
        if (doprint) {printf("Rank %d (%d) entering comm\n",rank,t); fflush(stdout);}
        /* Do all my sends and receives */
        if (rank+1<size) { /* I have a lower neighbor */
            MPI_Isend(&src(0,ht-2),wid*3,MPI_FLOAT,
                rank+1,0,MPI_COMM_WORLD,&r); reqs.push_back(r);
            MPI_Irecv(&src(0,ht-1),wid*3,MPI_FLOAT,
                rank+1,0,MPI_COMM_WORLD,&r); reqs.push_back(r);
        }
        if (rank-1>=0) { /* I have an upper neighbor */
            MPI_Isend(&src(0,1),wid*3,MPI_FLOAT,
                rank-1,0,MPI_COMM_WORLD,&r); reqs.push_back(r);
            MPI_Irecv(&src(0,0),wid*3,MPI_FLOAT,
                rank-1,0,MPI_COMM_WORLD,&r); reqs.push_back(r);
        }
        MPI_Waitall(reqs.size(),&reqs[0],MPI_STATUSES_IGNORE);
        if (doprint) {printf("Rank %d (%d) done comm\n",rank,t); fflush(stdout);}

        /* Compute new interior points in dest */
        for (j=1;j<ht-1;j++) {
            for (i=1;i<wid-1;i++) {
                dest(i,j) = physics(src,i,j);
            }
        }
        /* Swap old and new arrays */
        std::swap(src,dest);
    }
    // Create a PPM output image file header (*per* rank)
    char outfile[100]; sprintf(outfile,"out_%d.ppm",rank);
    std::ofstream out(outfile,std::ios_base::binary);
    out<<"P6\n"
       <<wid<<" "<<ht<<"\n"
       <<"255\n";
    // Write out PPM file data
    for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
        vec3 v=src(i,j);
        unsigned char r=(unsigned char)((v.x*0.4+0.5)*255.0);
        unsigned char g=(unsigned char)((v.y*0.4+0.5)*255.0);
        unsigned char b=(unsigned char)((v.z*0.9)*255.0);
        unsigned char buf[3]={r,g,b};
        out.write((char *)buf,3);
    }
    return 0;
}
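Because each rank writes its own strip, you end up with one out_<rank>.ppm per process and have to stitch them together yourself. One way to get a single image instead is to gather every rank's strip onto rank 0 before writing. The rough sketch below is not part of the code above; it assumes, like the boundary exchange, that a vec3 is three packed floats, and that global_ht divides evenly by size.

    /* Sketch: collect every rank's strip onto rank 0 and write one image there */
    std::vector<vec3> whole;            /* whole image; only used on rank 0 */
    if (rank==0) whole.resize(wid*global_ht);
    MPI_Gather(&src(0,0),wid*ht*3,MPI_FLOAT,  /* my whole local strip */
               rank==0?&whole[0]:NULL,wid*ht*3,MPI_FLOAT,
               0,MPI_COMM_WORLD);
    if (rank==0) {
        field2D<vec3> img(wid,global_ht,&whole[0],wid);
        /* ... write img out as one "out.ppm", exactly like the serial code ... */
    }

Note that this gathers each rank's full ht rows, ghost rows included, which matches what the per-rank files above already contain.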
1 CPU: 23 seconds (mpiCC uses an older, slower compiler).
2 CPUs: 13 seconds
4 CPUs: 7.5 seconds
8 CPUs: 3.9 seconds
10 CPUs: 3.2 seconds
16 CPUs: 2.1 seconds
20 CPUs: 1.98 seconds
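That's a speedup of about 11.6x over the 23-second one-process run (or roughly 7x over the 14-second serial g++ build) on 20 CPUs. The curve flattens at higher CPU counts because each process's strip of rows keeps shrinking while the per-timestep boundary-row exchange stays the same size, so communication takes a growing share of the time.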