# Multicore Performance Example

Here's a simple example where we simulate the flight path of a baseball for various starting conditions.

```class baseball {
public:
float px,py,pz;
float vx,vy,vz;
void step(float dt) {
float drag=-1.0;
float ax=drag*vx, ay=drag*vy, az=-9.8+drag*vz;
float coriolis=0.00001;
ax+=coriolis*vy; ay-=coriolis*vx;
vx+=ax*dt; vy+=ay*dt; vz+=az*dt;
px+=vx*dt; py+=vy*dt; pz+=vz*dt;
}
};

float foo(void) {

#define ASCII 0 /* set to 1 to see ASCII art of the ball's path */
#if ASCII
const int ASCIIART=100; // chars wide and high
char buf[ASCIIART][ASCIIART];
for (int y=0;y<ASCIIART;y++) for (int x=0;x<ASCIIART;x++) buf[y][x]=' ';
#endif

baseball b;
for (float angle=0.0;angle<1.0;angle+=0.125)
{
b.px=b.py=0.0; b.pz=2.0;
b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;

//	Move the baseball:
for (int i=0;i<1000;i++)
{
b.step(0.001);

#if ASCII
int y=(int)(b.pz*ASCIIART/100*40);
int x=(int)(b.px*ASCIIART/100*4);
if (y>=0 && y<ASCIIART && x>=0 && x<ASCIIART)
buf[y][x]='O'+(char)(angle*8);
#endif
}
}

#if ASCII
for (int y=0;y<ASCIIART;y++) {
for (int x=0;x<ASCIIART;x++)
std::cout<<buf[ASCIIART-1-y][x];
std::cout<<std::endl;
}
#endif

return b.pz;
}

```

(Try this in NetRun now!)

We can convert to multicore using OpenMP by adding the pragma to parallelize the angle loop:

```#pragma omp parallel for
for (int angle_i=0;angle_i<8;angle_i++)
//	for (float angle=0.0;angle<1.0;angle+=0.125) // omp can't do float loops
{
float angle=0.125*angle_i;
baseball b;
b.px=b.py=0.0; b.pz=2.0;
b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;

for (int i=0;i<1000;i++)
{
b.step(0.001);```
`			...`
`		}`
`}`

(Try this in NetRun now!)

Here's the CUDA baseball simulator:

```#include <iostream>
#include <stdio.h>
#include "lib/inc.c"

class baseball {
public:
float px,py,pz;
float vx,vy,vz;
__device__ __host__ void step(float dt) {
float drag=-1.0;
float ax=drag*vx, ay=drag*vy, az=-9.8+drag*vz;
float coriolis=0.00001;
ax+=coriolis*vy; ay-=coriolis*vx;
vx+=ax*dt; vy+=ay*dt; vz+=az*dt;
px+=vx*dt; py+=vy*dt; pz+=vz*dt;
}
};

__global__ void move_baseball(baseball *b_array) {
// printf("Hello from GPU: thread %d of block %d, f=%d\n",t,b,f);
float angle=0.000125*i;
baseball b;
b.px=b.py=0.0; b.pz=2.0;
b.vx=35.7; b.vy=0.2; b.vz=2.0+angle;

//	Move the baseball:
for (int i=0;i<1000;i++)
{
b.step(0.001);
}

b_array[i]=b;
}

int main() {
for (int num_blocks=1;num_blocks<=8192;num_blocks*=2) {

baseball *b_array;

double start=time_in_seconds();

// Copy back one random baseball, to show it's possible:
baseball cpu_b;
cudaMemcpy(&cpu_b,b_array,sizeof(baseball),cudaMemcpyDeviceToHost);
double elapsed=time_in_seconds()-start;