Multicore Performance Example
Here's a simple example where we simulate the flight path of a baseball for various starting conditions.
#include <iostream>

class baseball {
public:
    float px,py,pz; // position (meters)
    float vx,vy,vz; // velocity (meters/sec)
    // Advance the ball by one timestep of dt seconds:
    void step(float dt) {
        float drag=-1.0; // linear air drag, proportional to velocity
        float ax=drag*vx, ay=drag*vy, az=-9.8+drag*vz; // drag plus gravity
        float coriolis=0.00001; // small sideways deflection from Earth's rotation
        ax+=coriolis*vy; ay-=coriolis*vx;
        vx+=ax*dt; vy+=ay*dt; vz+=az*dt; // Euler step: update velocity...
        px+=vx*dt; py+=vy*dt; pz+=vz*dt; // ...then position
    }
};
float foo(void) {
#define ASCII 0 /* set to 1 to see ASCII art of the ball's path */
#if ASCII
    const int ASCIIART=100; // art is this many chars wide and high
    char buf[ASCIIART][ASCIIART];
    for (int y=0;y<ASCIIART;y++) for (int x=0;x<ASCIIART;x++) buf[y][x]=' ';
#endif
    baseball b;
    for (float angle=0.0;angle<1.0;angle+=0.125)
    {
        b.px=b.py=0.0; b.pz=2.0; // start 2 meters off the ground
        b.vx=35.7; b.vy=0.2; b.vz=2.0+angle; // vary the vertical launch speed
        // Move the baseball:
        for (int i=0;i<1000;i++)
        {
            b.step(0.001);
#if ASCII
            int y=(int)(b.pz*ASCIIART/100*40); // scale position to char cells
            int x=(int)(b.px*ASCIIART/100*4);
            if (y>=0 && y<ASCIIART && x>=0 && x<ASCIIART)
                buf[y][x]='O'+(char)(angle*8); // different char for each angle
#endif
        }
    }
#if ASCII
    for (int y=0;y<ASCIIART;y++) { // print rows top-down
        for (int x=0;x<ASCIIART;x++)
            std::cout<<buf[ASCIIART-1-y][x];
        std::cout<<std::endl;
    }
#endif
    return b.pz; // final height of the last baseball
}
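To get a CPU baseline for the parallel versions below, you can time foo with std::chrono (a minimal sketch of my own; the CUDA example below uses the course's time_in_seconds() helper instead):
#include <chrono>
#include <iostream>
int main(void) {
    auto start=std::chrono::steady_clock::now();
    float z=foo(); // runs all 8 serial trajectories
    std::chrono::duration<double> elapsed=std::chrono::steady_clock::now()-start;
    std::cout<<"foo returned "<<z<<" after "<<elapsed.count()*1.0e9<<" ns\n";
    return 0;
}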
We can convert this to multicore using OpenMP by adding one pragma to parallelize the angle loop. OpenMP needs an integer loop variable, so the float loop becomes an integer loop and we recompute the angle inside:
#pragma omp parallel for
for (int angle_i=0;angle_i<8;angle_i++)
// for (float angle=0.0;angle<1.0;angle+=0.125) // omp can't parallelize float loops
{
    float angle=0.125*angle_i; // recover the launch angle from the integer index
    baseball b; // declared inside the loop, so each thread gets its own copy
    b.px=b.py=0.0; b.pz=2.0;
    b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;
    for (int i=0;i<1000;i++)
    {
        b.step(0.001);
        ...
    }
}
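Compile with g++ -fopenmp; without that flag the pragma is silently ignored and the loop runs serially. Also, because each iteration now has its own private b, the parallel version can't just return the last b.pz the way foo does. One fix is for each iteration to write its final height into its own slot of a shared array, sketched below (the results array is my name, not the original's):
float results[8]; // one slot per angle_i, so threads never write the same element
#pragma omp parallel for
for (int angle_i=0;angle_i<8;angle_i++)
{
    float angle=0.125*angle_i;
    baseball b;
    b.px=b.py=0.0; b.pz=2.0;
    b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;
    for (int i=0;i<1000;i++)
        b.step(0.001);
    results[angle_i]=b.pz; // distinct element per iteration: no race, no lock
}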
Here's the CUDA baseball simulator. The only change to the baseball class is that step is now marked __device__ __host__, so the same method can compile for either the GPU or the CPU:
#include <iostream>
#include <stdio.h>
#include "lib/inc.c" /* course helper code; provides time_in_seconds() */
class baseball {
public:
    float px,py,pz; // position (meters)
    float vx,vy,vz; // velocity (meters/sec)
    __device__ __host__ void step(float dt) {
        float drag=-1.0; // linear air drag, proportional to velocity
        float ax=drag*vx, ay=drag*vy, az=-9.8+drag*vz; // drag plus gravity
        float coriolis=0.00001;
        ax+=coriolis*vy; ay-=coriolis*vx;
        vx+=ax*dt; vy+=ay*dt; vz+=az*dt;
        px+=vx*dt; py+=vy*dt; pz+=vz*dt;
    }
};
__global__ void move_baseball(baseball *b_array) {
    int i=blockIdx.x*blockDim.x+threadIdx.x; // global thread ID
    float angle=0.000125*i; // each thread simulates a slightly different angle
    baseball b;
    b.px=b.py=0.0; b.pz=2.0;
    b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;
    // Move the baseball:
    for (int t=0;t<1000;t++)
    {
        b.step(0.001);
    }
    b_array[i]=b; // write this thread's finished baseball back to GPU memory
}
int main() {
    int threads=512; // threads per block
    for (int num_blocks=1;num_blocks<=8192;num_blocks*=2) {
        baseball *b_array;
        //cudaMallocManaged(&b_array,threads*num_blocks*sizeof(baseball)); // unified memory: simpler, but slow!
        cudaMalloc(&b_array,threads*num_blocks*sizeof(baseball));
        double start=time_in_seconds();
        move_baseball<<< num_blocks,threads >>>(b_array); // spawn threads
        cudaDeviceSynchronize(); // join threads
        // Copy back the first baseball, to show it's possible:
        baseball cpu_b;
        cudaMemcpy(&cpu_b,b_array,sizeof(baseball),cudaMemcpyDeviceToHost);
        double elapsed=time_in_seconds()-start;
        std::cout<<num_blocks<<" blocks took "<<elapsed*1.0e9<<" ns: "
            <<elapsed*1.0e9/(threads*num_blocks)<<" ns/thread\n";
        cudaFree(b_array); // release GPU memory before the next, bigger run
    }
    return 0;
}
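Build this with nvcc, e.g. nvcc baseball.cu -o baseball. One hardening step the example skips is error checking: every CUDA call returns an error code that this code silently ignores. Here's a minimal sketch of the usual checking macro (CUDA_CHECK is my name, not a library macro):
#include <cstdlib> // for std::exit
#define CUDA_CHECK(call) do { \
        cudaError_t err=(call); \
        if (err!=cudaSuccess) { \
            std::cerr<<"CUDA error: "<<cudaGetErrorString(err)<<"\n"; \
            std::exit(1); \
        } \
    } while(0)

// Usage, inside main:
CUDA_CHECK(cudaMalloc(&b_array,threads*num_blocks*sizeof(baseball)));
CUDA_CHECK(cudaDeviceSynchronize()); // also surfaces errors from the kernel launch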
Performance for a single baseball, or a small number of baseballs, is far worse than the CPU: every kernel launch pays a few microseconds of fixed overhead, no matter how little work the kernel does. Performance for many thousands of baseballs is far better than the CPU, because that fixed overhead is amortized across all the threads and the GPU works on many baseballs simultaneously.