Example Parallel Program: Shallow-Water Wave Equation
CS 441 Lecture, Dr. Lawlor
Serial Code
Here's the serial code (also on the powerwall in ~olawlor/441_demo/shallow_wave/openmp):
/*
Stencil-based 2D array computation: Shallow-Water Wave Equation
Example program.
Dr. Orion Sky Lawlor, olawlor@acm.org, 2008-11-20 (Public Domain)
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <fstream> /* for ofstream */
#include <vector>
#include "vec3.h"
/********************** 2D Field Support **************************/
template <class T>
class field2D {
public:
    int wid,ht;
    field2D(int w,int h,T *data_,int rowsize_)
        :wid(w),ht(h),data(data_),rowsize(rowsize_) {}
    T &operator() (int i,int j) {boundscheck(i,j); return data[i+j*rowsize];}
    T operator() (int i,int j) const {boundscheck(i,j); return data[i+j*rowsize];}
private:
    T *data;
    int rowsize;
    void boundscheck(int i,int j) const {
        if (i<0 || i>=wid) {std::cout<<"Error: i="<<i<<" out of bounds!\n"; exit(1);}
        if (j<0 || j>=ht) {std::cout<<"Error: j="<<j<<" out of bounds!\n"; exit(1);}
    }
};
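/* Note: field2D does not own its storage; it is just a view (pointer, size,
   and row stride) onto a buffer the caller supplies.  That is why the
   std::swap(src,dest) at the end of each timestep below is a cheap swap of
   pointers and sizes, not a copy of the whole field. */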
/*********************** Problem Physics ***********************/
/* Computes shallow-water wave equations:
     x - u, fluid X velocity + 0.5
     y - v, fluid Y velocity + 0.5
     z - h, fluid height

     du_dt = - u*du_dx - v*du_dy - growth*dh_dx
     dv_dt = - u*dv_dx - v*dv_dy - growth*dh_dy
     dh_dt = - u*dh_dx - v*dh_dy - h*(du_dx + dv_dy)

   Produces beautiful barely-stable patterns at growth=0.12.

   Inputs: source field and one location in it.
   Output: new field value at that location.
*/
vec3 physics(const field2D<vec3> &src,int i,int j) {
    vec3 L=src(i-1,j);
    vec3 R=src(i+1,j);
    vec3 T=src(i,j+1);
    vec3 B=src(i,j-1);
    vec3 C=src(i,j);
    vec3 dx=vec3(R-L), dy=vec3(T-B);
    vec3 avg=0.25*(L+R+T+B);
    /* Parameters */
    float bl=0.2, gp=1.2, dt=0.15;
    /* Diffusion */
    C=C*(1-bl)+bl*avg;
    /* Shallow-water dynamics */
    float u=C.x, v=C.y, h=C.z;
    /* 'd': the direction our value will change based on physics */
    vec3 d = -u*dx -v*dy;
    d.x+= -gp*dx.z;
    d.y+= -gp*dy.z;
    d.z+= -h*(dx.x+dy.y);
    /* Take first-order Euler step */
    C += dt*d;
    return C;
}
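/* Note: physics() only reads the src array; its only output is the returned
   value.  That is why the loops below can fill in dest in any order (and
   hence in parallel) and then just swap the src and dest buffers. */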
int main(int argc,char *argv[])
{
    // Figure out how big an image we should render:
    int i,j, wid=300, ht=200;

    // Set up initial conditions
    std::vector<vec3> data1(wid*ht), data2(wid*ht);
    field2D<vec3> src(wid,ht,&data1[0],wid);
    field2D<vec3> dest(wid,ht,&data2[0],wid);
    for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
        float lil=1.0e-6f; /* bias up from zero, to avoid denormal slowdown */
        dest(i,j)=src(i,j)=vec3(lil,lil,(i>100 && i<150 && j>100 && j<150)?1.0f:lil);
    }
    int doprint=0; /* print out debugging info */

    // Run physics for a certain number of timesteps
    for (int t=0;t<1000;t++) {
        /* Compute new interior points in dest */
        for (j=1;j<ht-1;j++) {
            for (i=1;i<wid-1;i++) {
                dest(i,j) = physics(src,i,j);
            }
        }
        /* Swap old and new arrays */
        std::swap(src,dest);
    }

    // Create a PPM output image file header
    char outfile[100]; sprintf(outfile,"out.ppm");
    std::ofstream out(outfile,std::ios_base::binary);
    out<<"P6\n"
       <<wid<<" "<<ht<<"\n"
       <<"255\n";
    // Write out PPM file data
    for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
        vec3 v=src(i,j);
        unsigned char r=(unsigned char)((v.x*0.4+0.5)*255.0);
        unsigned char g=(unsigned char)((v.y*0.4+0.5)*255.0);
        unsigned char b=(unsigned char)((v.z*0.9)*255.0);
        unsigned char buf[3]={r,g,b};
        out.write((char *)buf,3);
    }
    return 0;
}
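The vec3.h header isn't reproduced above. Judging from how the code uses it (public x/y/z floats, componentwise + and -, scalar scaling, and +=), a minimal stand-in would look something like the sketch below; this is an assumption about the interface, and the actual header in the demo directory may differ.

/* Minimal vec3 sketch (assumed from usage; the real vec3.h may differ) */
class vec3 {
public:
    float x,y,z;
    vec3(void) :x(0),y(0),z(0) {}
    vec3(float x_,float y_,float z_) :x(x_),y(y_),z(z_) {}
};
inline vec3 operator+(const vec3 &a,const vec3 &b) {return vec3(a.x+b.x,a.y+b.y,a.z+b.z);}
inline vec3 operator-(const vec3 &a,const vec3 &b) {return vec3(a.x-b.x,a.y-b.y,a.z-b.z);}
inline vec3 operator*(const vec3 &a,float s) {return vec3(a.x*s,a.y*s,a.z*s);}
inline vec3 operator*(float s,const vec3 &a) {return a*s;}
inline vec3 &operator+=(vec3 &a,const vec3 &b) {a.x+=b.x; a.y+=b.y; a.z+=b.z; return a;}

Any layout like this (three packed floats) also fits the MPI version below, which sends each boundary row as wid*3 MPI_FLOATs.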
OpenMP
Here are the changes for the OpenMP code (also on the powerwall in ~olawlor/441_demo/shallow_wave/openmp):
    // Run physics for a certain number of timesteps
    for (int t=0;t<1000;t++) {
        /* Compute new interior points in dest */
        #pragma omp parallel for private(i)
        for (j=1;j<ht-1;j++) {
            for (i=1;i<wid-1;i++) {
                dest(i,j) = physics(src,i,j);
            }
        }
        /* Swap old and new arrays */
        std::swap(src,dest);
    }
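That one pragma does quite a bit of work: it spins up a team of threads and hands each one a block of j rows, which is safe here because every dest(i,j) depends only on src. Hand-rolled, the row division looks roughly like the sketch below; this is a sketch only, it assumes a contiguous block partition like OpenMP's static schedule, and it needs #include <omp.h>. The real runtime handles scheduling and remainders for you.

    /* Sketch: roughly what "#pragma omp parallel for" does with the j loop above */
    #pragma omp parallel
    {
        int nthreads=omp_get_num_threads(); /* threads in this team */
        int me=omp_get_thread_num();        /* my thread number, 0..nthreads-1 */
        int rows=ht-2;                      /* interior rows are j=1..ht-2 */
        int per=(rows+nthreads-1)/nthreads; /* rows per thread, rounded up */
        int jstart=1+me*per, jend=jstart+per;
        if (jend>ht-1) jend=ht-1;           /* last thread may get a short block */
        for (int j=jstart;j<jend;j++)
            for (int i=1;i<wid-1;i++)
                dest(i,j) = physics(src,i,j);
    }

Compiling needs an OpenMP flag (for example, -fopenmp with g++).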
Sequential Performance: 14 seconds elapsed.
2 CPUs: 8 seconds elapsed.
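That's a speedup of 14/8 = 1.75x on two CPUs, or about 88% parallel efficiency.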
MPI
Here's the MPI code (also on the powerwall in ~olawlor/441_demo/shallow_wave/MPI):
/*
Stencil-based 2D array computation: Shallow-Water Wave Equation
Example program.
Dr. Orion Sky Lawlor, olawlor@acm.org, 2008-11-20 (Public Domain)
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <fstream> /* for ofstream */
#include <vector>
#include <mpi.h>
#include "vec3.h"
/* ... the field2D class and physics function are the same as in the serial version ... */
void cleanup(void) {MPI_Finalize();}

int main(int argc,char *argv[])
{
    MPI_Init(&argc,&argv);
    atexit(cleanup);
    int rank,size;
    MPI_Comm_rank(MPI_COMM_WORLD,&rank);
    MPI_Comm_size(MPI_COMM_WORLD,&size);

    // Figure out how big an image we should render:
    int i,j, wid=300, ht=200;

    // Figure out how big an image I alone am responsible for:
    int global_ht=ht;
    ht=ht/size; /* number of rows *I* have to make */
    int start_j=rank*ht, end_j=(rank+1)*ht;

    // Set up initial conditions
    std::vector<vec3> data1(wid*ht), data2(wid*ht);
    field2D<vec3> src(wid,ht,&data1[0],wid);
    field2D<vec3> dest(wid,ht,&data2[0],wid);
    for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
        int gj=j+start_j; // initial conditions are in global space
        float lil=1.0e-6f; /* bias up from zero, to avoid denormal slowdown */
        dest(i,j)=src(i,j)=vec3(lil,lil,(i>100 && i<150 && gj>100 && gj<150)?1.0f:lil);
    }
    int doprint=0; /* print out debugging info */
    // Run physics for a certain number of timesteps
    for (int t=0;t<1000;t++) {
        /* Do communication: exchange boundary rows with neighbors */
        std::vector<MPI_Request> reqs; MPI_Request r;
        if (doprint) {printf("Rank %d (%d) entering comm\n",rank,t); fflush(stdout);}
        /* Do all my sends and receives */
        if (rank+1<size) { /* I have a lower neighbor */
            MPI_Isend(&src(0,ht-2),wid*3,MPI_FLOAT,
                rank+1,0,MPI_COMM_WORLD,&r); reqs.push_back(r);
            MPI_Irecv(&src(0,ht-1),wid*3,MPI_FLOAT,
                rank+1,0,MPI_COMM_WORLD,&r); reqs.push_back(r);
        }
        if (rank-1>=0) { /* I have an upper neighbor */
            MPI_Isend(&src(0,1),wid*3,MPI_FLOAT,
                rank-1,0,MPI_COMM_WORLD,&r); reqs.push_back(r);
            MPI_Irecv(&src(0,0),wid*3,MPI_FLOAT,
                rank-1,0,MPI_COMM_WORLD,&r); reqs.push_back(r);
        }
        MPI_Waitall(reqs.size(),&reqs[0],MPI_STATUSES_IGNORE);
        if (doprint) {printf("Rank %d (%d) done comm\n",rank,t); fflush(stdout);}

        /* Compute new interior points in dest */
        for (j=1;j<ht-1;j++) {
            for (i=1;i<wid-1;i++) {
                dest(i,j) = physics(src,i,j);
            }
        }
        /* Swap old and new arrays */
        std::swap(src,dest);
    }
    // Create a PPM output image file header (*per* rank)
    char outfile[100]; sprintf(outfile,"out_%d.ppm",rank);
    std::ofstream out(outfile,std::ios_base::binary);
    out<<"P6\n"
       <<wid<<" "<<ht<<"\n"
       <<"255\n";
    // Write out PPM file data
    for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
        vec3 v=src(i,j);
        unsigned char r=(unsigned char)((v.x*0.4+0.5)*255.0);
        unsigned char g=(unsigned char)((v.y*0.4+0.5)*255.0);
        unsigned char b=(unsigned char)((v.z*0.9)*255.0);
        unsigned char buf[3]={r,g,b};
        out.write((char *)buf,3);
    }
    return 0;
}
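Because each rank writes its own strip, you end up with one out_<rank>.ppm per process and have to stitch them together yourself. One way to get a single image instead is to gather every rank's strip onto rank 0 before writing. The rough sketch below is not part of the code above; it assumes, like the boundary exchange, that a vec3 is three packed floats, and that global_ht divides evenly by size.

    /* Sketch: collect every rank's strip onto rank 0 and write one image there */
    std::vector<vec3> whole;            /* whole image; only used on rank 0 */
    if (rank==0) whole.resize(wid*global_ht);
    MPI_Gather(&src(0,0),wid*ht*3,MPI_FLOAT,  /* my whole local strip */
               rank==0?&whole[0]:NULL,wid*ht*3,MPI_FLOAT,
               0,MPI_COMM_WORLD);
    if (rank==0) {
        field2D<vec3> img(wid,global_ht,&whole[0],wid);
        /* ... write img out as one "out.ppm", exactly like the serial code ... */
    }

Note that this gathers each rank's full ht rows, ghost rows included, which matches what the per-rank files above already contain.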
1 CPU: 23 seconds (mpiCC uses an older, slower compiler).
2 CPUs: 13 seconds
4 CPUs: 7.5 seconds
8 CPUs: 3.9 seconds
10 CPUs: 3.2 seconds
16 CPUs: 2.1 seconds
20 CPUs: 1.98 seconds
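That's a speedup of about 11.6x over the 23-second one-process run (or roughly 7x over the 14-second serial g++ build) on 20 CPUs. The curve flattens at higher CPU counts because each process's strip of rows keeps shrinking while the per-timestep boundary-row exchange stays the same size, so communication takes a growing share of the time.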