# Example Parallel Program: Shallow-Water Wave Equation

CS 441 Lecture, Dr. Lawlor

## Serial Code

Here's the serial code (also on the powerwall in ~olawlor/441_demo/shallow_wave/openmp):
```cpp
/*
Stencil-based 2D array computation: Shallow-Water Wave Equation
Example program.
Dr. Orion Sky Lawlor, olawlor@acm.org, 2008-11-20 (Public Domain)
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <fstream> /* for ofstream */
#include <vector>
#include "vec3.h"

/********************** 2D Field Support **************************/
template <class T>
class field2D {
public:
	int wid,ht;
	field2D(int w,int h,T *data_,int rowsize_)
		:wid(w),ht(h),data(data_), rowsize(rowsize_) {}
	T &operator() (int i,int j) {boundscheck(i,j); return data[i+j*rowsize];}
	T operator() (int i,int j) const {boundscheck(i,j); return data[i+j*rowsize];}
private:
	T *data;
	int rowsize;
	void boundscheck(int i,int j) const {
		if (i<0 || i>=wid) {std::cout<<"Error: i="<<i<<" out of bounds!\n"; exit(1);}
		if (j<0 || j>=ht) {std::cout<<"Error: j="<<j<<" out of bounds!\n"; exit(1);}
	}
};

/*********************** Problem Physics ***********************/
/* Computes shallow-water wave equations:
	x - u, fluid X velocity + 0.5
	y - v, fluid Y velocity + 0.5
	z - h, fluid height

du_dt = - u*du_dx - v * du_dy - growth * dh_dx
dv_dt = - u*dv_dx - v * dv_dy - growth * dh_dy
dh_dt = - u*dh_dx - v * dh_dy - h * (du_dx + dv_dy)

Produces beautiful barely-stable patterns at growth=0.12

Inputs: source field and one location in it.
Output: new field value at that location
*/
vec3 physics(const field2D<vec3> &src,int i,int j) {
	vec3 L=src(i-1,j);
	vec3 R=src(i+1,j);
	vec3 T=src(i,j+1);
	vec3 B=src(i,j-1);
	vec3 C=src(i,j);

	vec3 dx=vec3(R-L), dy=vec3(T-B);
	vec3 avg=0.25*(L+R+T+B);

	/* Parameters */
	float bl=0.2, gp=1.2, dt=0.15;

	/* Diffusion */
	C=C*(1-bl)+(bl)*(0.25*(L+R+T+B));

	/* Shallow-water dynamics */
	float u=C.x, v=C.y, h=C.z;
	/* 'd': the direction our value will change based on physics */
	vec3 d = -u*dx -v*dy;
	d.x+= -gp*dx.z;
	d.y+= -gp*dy.z;
	d.z+= -h*(dx.x+dy.y);

	/* Take first-order Euler step */
	C += dt*d;
	return C;
}

int main(int argc,char *argv[])
{
	// Figure out how big an image we should render:
	int i,j, wid=300, ht=200;

	// Set up initial conditions
	std::vector<vec3> data1(wid*ht), data2(wid*ht);
	field2D<vec3> src(wid,ht,&data1[0],wid);
	field2D<vec3> dest(wid,ht,&data2[0],wid);
	for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
		float lil=1.0e-6f; /* bias up from zero, to avoid denormal slowdown */
		dest(i,j)=src(i,j)=vec3(lil,lil,(i>100 && i<150 && j>100 && j<150)?1.0f:lil);
	}

	int doprint=0; /* print out debugging info */
	// Run physics for a certain number of timesteps
	for (int t=0;t<1000;t++) {
		/* Compute new interior points in dest */
		for (j=1;j<ht-1;j++) {
			for (i=1;i<wid-1;i++) {
				dest(i,j) = physics(src,i,j);
			}
		}

		/* Swap old and new arrays */
		std::swap(src,dest);
	}

	// Create a PPM output image file header
	char outfile[100]; sprintf(outfile,"out.ppm");
	std::ofstream out(outfile,std::ios_base::binary);
	out<<"P6\n"
	   <<wid<<" "<<ht<<"\n"
	   <<"255\n";

	// Write out PPM file data
	for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
		vec3 v=src(i,j);
		unsigned char r=(unsigned char)((v.x*0.4+0.5)*255.0);
		unsigned char g=(unsigned char)((v.y*0.4+0.5)*255.0);
		unsigned char b=(unsigned char)((v.z*0.9)*255.0);
		unsigned char buf[3]={r,g,b};
		out.write((char *)buf,3);
	}

	return 0;
}
```
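The program depends on a small `vec3` class from `vec3.h`, which isn't reproduced here. As a rough guide, here is a minimal sketch of the interface the code above actually uses (three floats with componentwise add, subtract, and scalar multiply); the real header on the powerwall may differ:

```cpp
/* Hypothetical sketch of vec3: three floats with componentwise arithmetic.
   The actual vec3.h used in class may provide more than this. */
class vec3 {
public:
	float x,y,z;
	vec3(void) :x(0),y(0),z(0) {}
	vec3(float x_,float y_,float z_) :x(x_),y(y_),z(z_) {}
	vec3 operator+(const vec3 &o) const {return vec3(x+o.x,y+o.y,z+o.z);}
	vec3 operator-(const vec3 &o) const {return vec3(x-o.x,y-o.y,z-o.z);}
	vec3 &operator+=(const vec3 &o) {x+=o.x; y+=o.y; z+=o.z; return *this;}
	vec3 operator*(float s) const {return vec3(x*s,y*s,z*s);}
	friend vec3 operator*(float s,const vec3 &v) {return v*s;}
};
```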

## OpenMP

Here are the changes for the OpenMP code (also on the powerwall in ~olawlor/441_demo/shallow_wave/openmp):
```cpp
	// Run physics for a certain number of timesteps
	for (int t=0;t<1000;t++) {
		/* Compute new interior points in dest */
#pragma omp parallel for private(i)
		for (j=1;j<ht-1;j++) {
			for (i=1;i<wid-1;i++) {
				dest(i,j) = physics(src,i,j);
			}
		}

		/* Swap old and new arrays */
		std::swap(src,dest);
	}
```
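One detail worth noting: `j` is the loop variable of the `parallel for`, so OpenMP makes it private to each thread automatically, but `i` is declared back at the top of `main()` and must be listed in `private(i)`, or every thread would share (and clobber) the same counter. An equivalent form, sketched below, declares both indices inside the loop so no clause is needed at all:

```cpp
		/* Sketch: declaring the loop indices locally makes them
		   thread-private automatically, so no private() clause is needed.
		   (The course version above keeps the outer i,j and uses private(i).) */
#pragma omp parallel for
		for (int j=1;j<ht-1;j++) {
			for (int i=1;i<wid-1;i++) {
				dest(i,j) = physics(src,i,j);
			}
		}
```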

- Sequential: 14 seconds elapsed.
- 2 CPUs: 8 seconds elapsed (about a 1.75x speedup).
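To reproduce timings like these, the thread count and the wall-clock time can both be controlled from the code. Here is a minimal sketch, assuming `<omp.h>` is included (the thread count can also be set externally with the `OMP_NUM_THREADS` environment variable):

```cpp
	/* Sketch: pin the thread count and time the physics loop.
	   Requires #include <omp.h>; not part of the course version. */
	omp_set_num_threads(2);        /* request exactly 2 threads */
	double start=omp_get_wtime();  /* wall-clock time, in seconds */
	/* ... run the 1000-timestep physics loop from above ... */
	double elapsed=omp_get_wtime()-start;
	std::cout<<"physics loop took "<<elapsed<<" seconds\n";
```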

## MPI

Here's the MPI code (also on the powerwall in ~olawlor/441_demo/shallow_wave/MPI):
```cpp
/*
Stencil-based 2D array computation: Shallow-Water Wave Equation
Example program.
Dr. Orion Sky Lawlor, olawlor@acm.org, 2008-11-20 (Public Domain)
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include <math.h>
#include <iostream>
#include <fstream> /* for ofstream */
#include <vector>
#include <mpi.h>
#include "vec3.h"

... same stuff as before here ...

void cleanup(void) {MPI_Finalize();}
int main(int argc,char *argv[])
{
	MPI_Init(&argc,&argv);
	atexit(cleanup);

	int rank,size;
	MPI_Comm_rank(MPI_COMM_WORLD,&rank);
	MPI_Comm_size(MPI_COMM_WORLD,&size);
	// Figure out how big an image we should render:
	int i,j, wid=300, ht=200;

	// Figure out how big an image I alone am responsible for:
	int global_ht=ht;
	ht=ht/size; /* number of rows *I* have to make */
	int start_j=rank*ht, end_j=(rank+1)*ht;
	// Set up initial conditions
	std::vector<vec3> data1(wid*ht), data2(wid*ht);
	field2D<vec3> src(wid,ht,&data1[0],wid);
	field2D<vec3> dest(wid,ht,&data2[0],wid);
	for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
		int gj=j+start_j; // initial conditions are in global space
		float lil=1.0e-6f; /* bias up from zero, to avoid denormal slowdown */
		dest(i,j)=src(i,j)=vec3(lil,lil,(i>100 && i<150 && gj>100 && gj<150)?1.0f:lil);
	}

	int doprint=0; /* print out debugging info */
	// Run physics for a certain number of timesteps
	for (int t=0;t<1000;t++) {
		/* Do communication: exchange boundary rows with neighbors */
		std::vector<MPI_Request> reqs; MPI_Request r;
		if (doprint) {printf("Rank %d (%d) entering comm\n",rank,t); fflush(stdout);}
		/* Do all my sends and receives */
		if (rank+1<size) { /* I have a lower neighbor */
			MPI_Isend(&src(0,ht-2),wid*3,MPI_FLOAT,
				rank+1,0,MPI_COMM_WORLD,&r);reqs.push_back(r);
			MPI_Irecv(&src(0,ht-1),wid*3,MPI_FLOAT,
				rank+1,0,MPI_COMM_WORLD,&r);reqs.push_back(r);
		}
		if (rank-1>=0) { /* I have an upper neighbor */
			MPI_Isend(&src(0,1),wid*3,MPI_FLOAT,
				rank-1,0,MPI_COMM_WORLD,&r);reqs.push_back(r);
			MPI_Irecv(&src(0,0),wid*3,MPI_FLOAT,
				rank-1,0,MPI_COMM_WORLD,&r);reqs.push_back(r);
		}
		MPI_Waitall(reqs.size(),&reqs[0],MPI_STATUSES_IGNORE);
		if (doprint) {printf("Rank %d (%d) done comm\n",rank,t); fflush(stdout);}

		/* Compute new interior points in dest */
		for (j=1;j<ht-1;j++) {
			for (i=1;i<wid-1;i++) {
				dest(i,j) = physics(src,i,j);
			}
		}

		/* Swap old and new arrays */
		std::swap(src,dest);
	}

	// Create a PPM output image file header (*per* rank)
	char outfile[100]; sprintf(outfile,"out_%d.ppm",rank);
	std::ofstream out(outfile,std::ios_base::binary);
	out<<"P6\n"
	   <<wid<<" "<<ht<<"\n"
	   <<"255\n";

	// Write out PPM file data
	for (j=0;j<ht;j++) for (i=0;i<wid;i++) {
		vec3 v=src(i,j);
		unsigned char r=(unsigned char)((v.x*0.4+0.5)*255.0);
		unsigned char g=(unsigned char)((v.y*0.4+0.5)*255.0);
		unsigned char b=(unsigned char)((v.z*0.9)*255.0);
		unsigned char buf[3]={r,g,b};
		out.write((char *)buf,3);
	}

	return 0;
}
```
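The nonblocking `MPI_Isend`/`MPI_Irecv` pairs plus `MPI_Waitall` keep any rank from blocking while its neighbor is still sending. For a simple paired exchange like this ghost-row swap, `MPI_Sendrecv` is a common alternative that combines each matched send and receive into one deadlock-free blocking call. A sketch, using the same buffers and counts as above (and, like the original, assuming `vec3` is three contiguous floats):

```cpp
		/* Sketch: equivalent ghost-row exchange with MPI_Sendrecv.
		   (The course version uses Isend/Irecv/Waitall instead.) */
		if (rank+1<size) /* exchange with the neighbor below me */
			MPI_Sendrecv(&src(0,ht-2),wid*3,MPI_FLOAT, rank+1,0,
			             &src(0,ht-1),wid*3,MPI_FLOAT, rank+1,0,
			             MPI_COMM_WORLD,MPI_STATUS_IGNORE);
		if (rank-1>=0)   /* exchange with the neighbor above me */
			MPI_Sendrecv(&src(0,1),wid*3,MPI_FLOAT, rank-1,0,
			             &src(0,0),wid*3,MPI_FLOAT, rank-1,0,
			             MPI_COMM_WORLD,MPI_STATUS_IGNORE);
```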

- 1 CPU: 23 seconds (mpiCC uses an older, slower compiler).
- 2 CPUs: 13 seconds
- 4 CPUs: 7.5 seconds
- 8 CPUs: 3.9 seconds
- 10 CPUs: 3.2 seconds
- 16 CPUs: 2.1 seconds
- 20 CPUs: 1.98 seconds

Note how the speedup flattens out at high processor counts: with 20 ranks each strip is only 10 rows tall, so the boundary-row exchange becomes a larger fraction of each timestep.
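One last note on the MPI version: as written, each rank writes its own `out_<rank>.ppm` strip, which you then have to stitch together yourself. A hedged sketch of collecting every rank's block onto rank 0 with `MPI_Gather`, in place of the per-rank output section above, so a single image can be written (block ordering matches `start_j=rank*ht`; again `vec3` is assumed to be three contiguous floats):

```cpp
	/* Sketch: gather all ranks' rows onto rank 0 and write one image.
	   (Not in the course version, which writes one file per rank.) */
	std::vector<vec3> global(wid*global_ht);
	MPI_Gather(&src(0,0),  wid*ht*3, MPI_FLOAT,
	           &global[0], wid*ht*3, MPI_FLOAT,
	           0, MPI_COMM_WORLD);
	if (rank==0) {
		field2D<vec3> whole(wid,global_ht,&global[0],wid);
		std::ofstream out("out.ppm",std::ios_base::binary);
		out<<"P6\n"<<wid<<" "<<global_ht<<"\n255\n";
		for (j=0;j<global_ht;j++) for (i=0;i<wid;i++) {
			vec3 v=whole(i,j);
			unsigned char buf[3]={
				(unsigned char)((v.x*0.4+0.5)*255.0),
				(unsigned char)((v.y*0.4+0.5)*255.0),
				(unsigned char)((v.z*0.9)*255.0)};
			out.write((char *)buf,3);
		}
	}
```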