#include <iostream> #include <fstream> #include <cassert> #include "lib/inc.c" /* Class to represent one image pixel */ struct pixel { unsigned char r,g,b; }; void draw_pixel(pixel *image,int x,int y, int wid,int ht) { int i=y*wid+x; image[i].r=255; image[i].g=x*255/wid; image[i].b=y*255/ht; if (sqrt((float)(x*x+y*y))<200) { // blue circle in top left corner image[i].r=0; image[i].g=0; image[i].b=255; } } /* Run on CPU */ int main(int argc,char *argv[]) { int wid=512,ht=512; std::vector<pixel> img(wid*ht); // make space for output data double start=time_in_seconds(); // Draw the image for (int y=0;y<ht;y++) for (int x=0;x<wid;x++) draw_pixel(&img[0],x,y,wid,ht); double elapsed=time_in_seconds()-start; std::cout<<"Drew image at "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed*1.0e6<<" microseconds\n"; // Write the image to a file std::ofstream imgfile("out.ppm",std::ios_base::binary); imgfile<<"P6\n"<<wid<<" "<<ht<<" 255\n"; imgfile.write((char *)&img[0],wid*ht*sizeof(pixel)); return 0; }To compute graphics on the GPU with CUDA, you can assign one thread per pixel, and render the image with a kernel call Don't use a loop, let each thread find its own pixel so they can all run in parallel.
/* Class to represent one image pixel */
struct pixel {
	unsigned char r, g, b;
};

/* GPU Code!  Each thread shades exactly one pixel of the image. */
__global__ void draw_image(pixel *image, int wid, int ht)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x; // global linear thread number
	int x = i%wid, y = i/wid;                    // recover 2D pixel coordinates
	pixel &p = image[i];
	p.r = 255;
	p.g = x*255/wid;
	p.b = y*255/ht;
	if (sqrt((float)(x*x + y*y)) < 200) { // blue circle in top left corner
		p.r = 0;
		p.g = 0;
		p.b = 255;
	}
}

/* Run on CPU */
int main(int argc, char *argv[])
{
	int wid = 512, ht = 512, threads_per_block = 256;
	gpu_vec<pixel> image(wid*ht); // make space for output data

	double start = time_in_seconds();
	// One thread per pixel: wid*ht threads in blocks of threads_per_block
	draw_image<<<wid*ht/threads_per_block, threads_per_block>>>(image, wid, ht);
	pixel p = image[0]; // read one pixel, to make sure process actually finished
	double elapsed = time_in_seconds() - start;
	std::cout << "Drew image at " << elapsed*1.0e9/(wid*ht)
	          << "ns/pixel, time " << elapsed*1.0e6 << " microseconds\n";

	// Copy the image data back to the CPU, and write to file
	std::vector<pixel> img;
	image.copy_to(img);
	std::ofstream imgfile("out.ppm", std::ios_base::binary);
	imgfile << "P6\n" << wid << " " << ht << " 255\n";
	imgfile.write((char *)&img[0], wid*ht*sizeof(pixel));
	return 0;
}
This CUDA code renders the image
at about 0.13 ns/pixel, while the C++ code takes over 7
ns/pixel. (Of course, the C++ code is single core and
doesn't use SIMD.)
int radius=(int)sqrt((float)(x*x+y*y)); if (radius%10 >= 1) { // blue circles image[i].r=255.0*tan(sin(cos(sin(0.1f*x)))); image[i].g=0; image[i].b=255; }But now if we do that expensive trig on only 10% of the pixels (only when radius%10 == 9), the program barely speeds up, to only 0.175ns/pixel:
int radius=(int)sqrt((float)(x*x+y*y)); if (radius%10 >= 9) { // blue circles image[i].r=255.0*tan(sin(cos(sin(0.1f*x)))); image[i].g=0; image[i].b=255; }Yet if we make no pixels do trig, the program runs fast. Even stranger, if 10% of the pixels still do trig, but that 10% are clumped together into bigger areas, the program speeds up to 0.15 ns/pixel!
int radius=(int)sqrt((float)(x*x+y*y)); if (radius%100 >= 90) { // big blue circles image[i].r=255.0*tan(sin(cos(sin(0.1f*x)))); image[i].g=0; image[i].b=255; }This weird performance change is due to branch divergence: at the hardware level, CUDA threads that live together in the same 32-float SIMD register all experience the same branches. The terminology is these 32 threads make a "warp" of threads. If all 32 threads branch the same way, no extra work is needed so performance is great. If some of the threads branch and some don't, all the threads wait through both branches, and they each need to mask off the results from the other branch before they can continue.
0 |
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
0 |
1 |
4 |
5 |
8 |
9 |
12 |
13 |
2 |
3 |
6 |
7 |
10 |
11 |
14 |
15 |
16 |
17 |
20 |
21 |
24 |
25 |
28 |
29 |
18 |
19 |
22 |
23 |
26 |
27 |
30 |
31 |
/* GPU Code!  One thread per pixel, remapped into 8x8 tiles so that
   neighboring threads in a warp shade neighboring pixels in 2D,
   which improves branch coherence. */
__global__ void draw_image(pixel *image, int wid, int ht)
{
	int i = threadIdx.x + blockIdx.x*blockDim.x; // linear thread number
	// Apply 8x8 pixel tiling via bit interchange:
	//  i's bits start off as: YYYYYYyyyXXXXXXxxx
	//  we want to make them:  YYYYYYXXXXXXyyyxxx
	//  octal digits:          \3/\2/\1/\0/
	//  non-moving: xxx (low 3 bits) and the high YYYYYY bits;
	//  yyy moves down 6 bits; XXXXXX moves up 3 bits.
	// Bug fix: the last term was '((i<<3)&&0x07700)' -- logical AND
	// (which collapses to 0 or 1) with a hex mask.  All three masks are
	// octal, and the AND must be bitwise: '&07700' selects bits 6-11.
	i = (i&~07770) | ((i>>6)&070) | ((i<<3)&07700);
	int x = i%wid, y = i/wid;
	image[i].r = 255;
	image[i].g = x*255/wid;
	image[i].b = y*255/ht;
	int radius = (int)sqrt((float)(x*x+y*y));
	if (radius%100 >= 90) { // big blue circles
		image[i].r = 255.0*tan(sin(cos(sin(0.1f*x))));
		image[i].g = 0;
		image[i].b = 255;
	}
}
This looks really wacky, but it improves branch coherence enough to speed the renderer up to 0.14 ns/pixel. For renderers doing more complex operations, like the recursive branching in a raytracer, tiling can be an even bigger performance win.