#include <iostream>
#include <fstream>
#include <vector>
#include <cmath>
#include <cassert>
#include "lib/inc.c" // course support code (timing, gpu_vec)
/* Class to represent one image pixel */
struct pixel {
    unsigned char r,g,b;
};
void draw_pixel(pixel *image,int x,int y, int wid,int ht) {
    int i=y*wid+x;
    image[i].r=255;
    image[i].g=x*255/wid;
    image[i].b=y*255/ht;
    if (sqrt((float)(x*x+y*y))<200) { // blue circle in top left corner
        image[i].r=0;
        image[i].g=0;
        image[i].b=255;
    }
}
/* Run on CPU */
int main(int argc,char *argv[]) {
    int wid=512,ht=512;
    std::vector<pixel> img(wid*ht); // make space for output data
    double start=time_in_seconds();
    // Draw the image
    for (int y=0;y<ht;y++)
        for (int x=0;x<wid;x++)
            draw_pixel(&img[0],x,y,wid,ht);
    double elapsed=time_in_seconds()-start;
    std::cout<<"Drew image at "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed*1.0e6<<" microseconds\n";
    // Write the image to a file
    std::ofstream imgfile("out.ppm",std::ios_base::binary);
    imgfile<<"P6\n"<<wid<<" "<<ht<<" 255\n";
    imgfile.write((char *)&img[0],wid*ht*sizeof(pixel));
    return 0;
}
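The timing helper time_in_seconds() appears to come from the course's lib/inc.c, included at the top. If you want to try the CPU version without that file, a minimal stand-in using std::chrono (my sketch, not the course's implementation) is:

#include <chrono>
double time_in_seconds(void) {
    // wall-clock seconds since an arbitrary epoch; only differences matter
    return std::chrono::duration<double>(
        std::chrono::steady_clock::now().time_since_epoch()).count();
}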
To compute graphics on the GPU with CUDA, you can assign one thread
per pixel and render the image with a single kernel call. Don't use a loop;
let each thread find its own pixel so they can all run in parallel.
/* Class to represent one image pixel */
struct pixel {
    unsigned char r,g,b;
};
/* GPU Code! */
__global__ void draw_image(pixel *image,int wid,int ht) {
    int i=threadIdx.x+blockIdx.x*blockDim.x; // this thread's pixel number
    int x=i%wid, y=i/wid;
    image[i].r=255;
    image[i].g=x*255/wid;
    image[i].b=y*255/ht;
    if (sqrt((float)(x*x+y*y))<200) { // blue circle in top left corner
        image[i].r=0;
        image[i].g=0;
        image[i].b=255;
    }
}
/* Run on CPU */
int main(int argc,char *argv[]) {
    int wid=512,ht=512, threads_per_block=256;
    gpu_vec<pixel> image(wid*ht); // make space for output data on the GPU
    double start=time_in_seconds();
    draw_image<<<wid*ht/threads_per_block,threads_per_block>>>(image,wid,ht);
    pixel p=image[0]; // read back one pixel, which makes the CPU wait until the kernel actually finished
    double elapsed=time_in_seconds()-start;
    std::cout<<"Drew image at "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed*1.0e6<<" microseconds\n";
    // Copy the image data back to the CPU, and write to file
    std::vector<pixel> img; image.copy_to(img);
    std::ofstream imgfile("out.ppm",std::ios_base::binary);
    imgfile<<"P6\n"<<wid<<" "<<ht<<" 255\n";
    imgfile.write((char *)&img[0],wid*ht*sizeof(pixel));
    return 0;
}
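gpu_vec also appears to come from lib/inc.c. I don't have its source here, but a minimal sketch of a wrapper with the same interface, built on CUDA managed memory (an assumption; the real class may use explicit cudaMemcpy instead), looks roughly like this:

#include <vector>
#include <cuda_runtime.h>
template <class T>
class gpu_vec {
    T *ptr; size_t n;
public:
    gpu_vec(size_t n_) :n(n_) { cudaMallocManaged(&ptr,n*sizeof(T)); }
    ~gpu_vec() { cudaFree(ptr); }
    operator T *() { return ptr; } // pass straight into a kernel call
    T operator[](size_t i) { // reading an element waits for outstanding kernels
        cudaDeviceSynchronize(); return ptr[i];
    }
    void copy_to(std::vector<T> &v) { // copy the whole array back to the CPU
        cudaDeviceSynchronize(); v.assign(ptr,ptr+n);
    }
};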
The CUDA version renders the image at about 0.13 ns/pixel, while the
C++ version takes over 7 ns/pixel. (Of course, the C++ code runs on a
single core and doesn't use SIMD.)
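One caveat about the launch configuration above: wid*ht/threads_per_block only works cleanly because 512*512 is an exact multiple of 256. For sizes that don't divide evenly, the usual CUDA pattern (a general note, not something this particular program needs) is to round the block count up and bounds-check inside the kernel:

    int n=wid*ht;
    int blocks=(n+threads_per_block-1)/threads_per_block; // round up
    draw_image<<<blocks,threads_per_block>>>(image,wid,ht);
    // ...and at the top of the kernel:
    //   if (i>=wid*ht) return; // extra threads in the last block do nothing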
Now let's make the kernel do something expensive, like a chain of trig functions, on most of the pixels (whenever radius%10 >= 1, about 90% of them):

int radius=(int)sqrt((float)(x*x+y*y));
if (radius%10 >= 1) { // blue circles over most of the image
    image[i].r=255.0*tan(sin(cos(sin(0.1f*x))));
    image[i].g=0;
    image[i].b=255;
}

As you'd expect, this slows the renderer down.
But now if we do that expensive trig on only 10% of the pixels (only
when radius%10 == 9), the program barely speeds up, to only
0.175 ns/pixel:

int radius=(int)sqrt((float)(x*x+y*y));
if (radius%10 >= 9) { // blue circles
    image[i].r=255.0*tan(sin(cos(sin(0.1f*x))));
    image[i].g=0;
    image[i].b=255;
}
Yet if we make no pixels do trig, the program runs fast. Even
stranger, if 10% of the pixels still do trig, but that 10% are
clumped together into bigger areas, the program speeds up to 0.15
ns/pixel!

int radius=(int)sqrt((float)(x*x+y*y));
if (radius%100 >= 90) { // big blue circles
    image[i].r=255.0*tan(sin(cos(sin(0.1f*x))));
    image[i].g=0;
    image[i].b=255;
}
This weird performance change is due to branch
divergence: at the hardware level, CUDA threads that live
together in the same 32-wide SIMD register all experience the same
branches. These 32 threads are called a "warp". If all 32 threads
branch the same way, no extra work is needed, so performance is great.
If some of the threads take the branch and some don't, every thread
steps through both sides of the branch, and each thread has to mask off
the results from the side it didn't want before it can continue.
Here are thread numbers laid out over an 8-pixel-wide patch of the image, first in plain row-major order:

| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 |
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 |
| 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 |
| 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 |

and then with 2x2 tiling, which clumps consecutive thread numbers into little square blocks:

| 0 | 1 | 4 | 5 | 8 | 9 | 12 | 13 |
| 2 | 3 | 6 | 7 | 10 | 11 | 14 | 15 |
| 16 | 17 | 20 | 21 | 24 | 25 | 28 | 29 |
| 18 | 19 | 22 | 23 | 26 | 27 | 30 | 31 |

The kernel below applies the same idea at 8x8 scale, remapping each thread's linear number onto a tiled pixel ordering with a bit interchange.
__global__ void draw_image(pixel *image,int wid,int ht) {
    int i=threadIdx.x+blockIdx.x*blockDim.x; // linear thread number
    // Apply 8x8 pixel tiling via bit interchange.
    //   i's bits start off as: YYYYYYXXXXXXyyyxxx  (tile row, tile column, then y and x inside the 8x8 tile)
    //   we want a pixel index: YYYYYYyyyXXXXXXxxx  (row-major: y above x)
    //   YYYYYY and xxx stay put; XXXXXX moves down 3 bits; yyy moves up 6 bits.
    i = (i&~07770) | ((i>>3)&0770) | ((i<<6)&07000);
    int x=i%wid, y=i/wid;
    image[i].r=255;
    image[i].g=x*255/wid;
    image[i].b=y*255/ht;
    int radius=(int)sqrt((float)(x*x+y*y));
    if (radius%100 >= 90) { // big blue circles
        image[i].r=255.0*tan(sin(cos(sin(0.1f*x))));
        image[i].g=0;
        image[i].b=255;
    }
}
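To make the remapping concrete (these particular values just follow from the bit swap above): thread 8 has only bit 3 set, which moves up to bit 9, so it draws pixel (0,1); thread 64 has only bit 6 set, which moves down to bit 3, so it draws pixel (8,0). Threads 0 through 63 therefore cover exactly the 8x8 tile in the top left corner, and each 32-thread warp covers an 8-wide by 4-tall block.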
This looks really wacky, but it improves branch coherence enough to speed the renderer up to 0.14 ns/pixel. For renderers doing more complex operations, like the recursive branching in a raytracer, tiling can be an even bigger performance win.
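To see the divergence directly without timing anything, here is a small CPU-only sketch (my addition, not part of the original program) that walks the image warp by warp and counts how many warps contain a mix of "circle" and "background" pixels, first treating a warp as a 32x1 row-major strip and then as the 8x4 block it covers after tiling:

#include <iostream>
#include <cmath>

bool in_circle(int x,int y,int period,int start) {
    int radius=(int)std::sqrt((float)(x*x+y*y));
    return radius%period >= start;
}
/* Fraction of warps whose pixels disagree on the branch,
   for a warp covering a warp_w x warp_h block of pixels. */
double mixed_fraction(int wid,int ht,int warp_w,int warp_h,int period,int start) {
    int mixed=0, total=0;
    for (int wy=0;wy<ht;wy+=warp_h)
    for (int wx=0;wx<wid;wx+=warp_w) {
        int trues=0;
        for (int dy=0;dy<warp_h;dy++)
        for (int dx=0;dx<warp_w;dx++)
            if (in_circle(wx+dx,wy+dy,period,start)) trues++;
        if (trues!=0 && trues!=warp_w*warp_h) mixed++; // this warp diverges
        total++;
    }
    return (double)mixed/total;
}
int main(void) {
    int wid=512, ht=512;
    std::cout<<"radius%10 >= 9,   32x1 warps: "<<100.0*mixed_fraction(wid,ht,32,1,10,9)<<"% diverge\n";
    std::cout<<"radius%100 >= 90, 32x1 warps: "<<100.0*mixed_fraction(wid,ht,32,1,100,90)<<"% diverge\n";
    std::cout<<"radius%100 >= 90, 8x4 warps:  "<<100.0*mixed_fraction(wid,ht,8,4,100,90)<<"% diverge\n";
    return 0;
}

Comparing the three numbers it prints gives a rough idea of how often each layout forces a warp through both sides of the branch.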