/** This is an RC5 function, supporting key setup, encryption and decryption. */ template <int N_ROUNDS=20,class WORD=unsigned int> class RC5 { public: enum {n_rounds=N_ROUNDS}; enum {n_keys=2*n_rounds+2}; WORD keys[n_keys]; // expanded key // Circular bit rotate left: rotate v left by count bits inline unsigned int ROTL(unsigned int v,unsigned int count) { count=count&31; // wrap around to word size return (v<<count) | (v>>(32-count)); } inline unsigned int ROTR(unsigned int v,unsigned int count) { count=count&31; // wrap around to word size return (v>>count) | (v<<(32-count)); } RC5(WORD keyseed) { // This is a ridiculously weak way to expand the key: srand(keyseed); for (int k=0;k<n_keys;k++) keys[k]=rand(); // add key schedule here } RC5(const void *K,int nKbytes) { // This is the official way to make the RC5 key schedule, // straight from Rivest's paper WORD w = 8*sizeof(WORD); WORD b=nKbytes; WORD t = n_keys; WORD i, j, k, u = w/8, A, B; // Fill L with little-endian word version of key WORD c=(8*b+w-1)/w; // size of L array if (c<=0) c=1; WORD L[c]; // or WORD *L=new WORD[c]; for (i=0;i<c;i++) L[i]=0; for(i = 0; i<b; i++) L[i/u] = (L[i/u] << 8) + ((unsigned char *)K)[i]; // Fill initial key array with constants const WORD Pw=0xB7E15163; // from e const WORD Qw=0x9E3779B9; // from golden ratio for(keys[0] = Pw, i = 1; i < t; i++) keys[i] = keys[i-1] + Qw; // RC4-inspired mixing of keys if (t<c) k=3*c; else k=3*t; for(A = B = i = j = 0; k>0; k--, i = (i+1) % t, j = (j+1) % c) { A = keys[i] = ROTL(keys[i] + (A + B), 3); B = L[j] = ROTL(L[j] + (A + B), (A + B)); } // delete[] L; // if you used new above } inline void round_enc(WORD &A,WORD &B,int round) { A^=B; A = ROTL(A,B) + keys[2*round]; B^=A; B = ROTL(B,A) + keys[2*round+1]; } // Mix two integers worth of data, A and B, with these keys. void rounds_enc(WORD &A,WORD &B) { A += keys[0]; B += keys[1]; for (int round=1;round<=n_rounds;round++) { //printf(" round %d: %08x %08x\n", round, A,B); round_enc(A,B,round); } } inline void round_dec(WORD &A,WORD &B,int round) { B -= keys[2*round+1]; B = ROTR(B,A); B^=A; A -= keys[2*round]; A = ROTR(A,B); A^=B; } // Mix two integers worth of data, A and B, with these keys. void rounds_dec(WORD &A,WORD &B) { for (int round=n_rounds;round>=1;round--) { round_dec(A,B,round); //printf(" round %d: %08x %08x\n", round, A,B); } A -= keys[0]; B -= keys[1]; } // Print these two words in hex, using RC5's little-endian format void printWord(WORD A) { for (int byte=0;byte<4;byte++) printf("%02x",0xff&(A>>(8*byte))); } void printAB(WORD A,WORD B) { printWord(A); printWord(B); printf("\n"); } }; // This is the ciphertext we're trying to crack unsigned int ciphertextA=0xdaad16a1; // Run some simple tests, encrypting ascending data void foo(void) { // Get cracking! double start=time_in_seconds(); for (int PIN=0;PIN<1000000;PIN++) { char PINstr[8]; snprintf(PINstr,8,"%06d",PIN); RC5<> rc(PINstr,6); unsigned int A=0, B=0; // known plaintext rc.rounds_enc(A,B); if (PIN<10) printf("A=%08x key=%08x\n",A,rc.keys[0]); if (A==ciphertextA) printf("Possible PIN: %06d\n",PIN); } printf("All pins checked in %.3f sec\n",time_in_seconds()-start); }Benchmarking reveals the rounds_enc function only takes 54 nanoseconds each, but setting up the key schedule (in the constructor) takes 760 nanoseconds each. Checking all million PIN codes takes 0.876 seconds with this naive serial CPU implementation.
This is a very contorted and bizarre software implementation, but the speedup is substantial:RC5<> rc("yup",3); RC5<> rc2("yupper",6); RC5<> rc3("yippie",6); unsigned int src=0; int time_enc(void) { // simple single-decode benchmark unsigned int A=src, B=src; rc.rounds_enc(A,B); return A==ciphertextA; } int time_enc2(void) { // crazy superscalar-decode benchmark, 2 at once unsigned int A=src-rc.keys[0], B=src-rc.keys[1]; unsigned int A2=src-rc2.keys[0], B2=src-rc2.keys[1]; for (unsigned int round=1;round<=RC5<>::n_rounds;round++) { rc.round_enc(A,B,round); rc2.round_enc(A2,B2,round); } return A==ciphertextA || A2==ciphertextA; } int time_enc3(void) { // crazy superscalar-decode benchmark, 3 at once unsigned int A=src-rc.keys[0], B=src-rc.keys[1]; unsigned int A2=src-rc2.keys[0], B2=src-rc2.keys[1]; unsigned int A3=src-rc3.keys[0], B3=src-rc3.keys[1]; for (unsigned int round=1;round<=RC5<>::n_rounds;round++) { rc.round_enc(A,B,round); rc2.round_enc(A2,B2,round); rc3.round_enc(A3,B3,round); } return A==ciphertextA || A2==ciphertextA || A3==ciphertextA; }
N |
Raw time |
Time/output |
1 |
53.81 ns |
53.81 ns/output |
2 |
63.77 ns |
31.89 ns/output |
3 |
86.93 ns |
28.98 ns/output |
4 |
110.56 ns |
27.64 ns/output |
#pragma omp parallel for for (int PIN=0;PIN<1000000;PIN++) { ... same check as before ... }This checks 1 million PINs in 0.344 seconds, which is just over a 2.5x speedup. For a 4 core machine, this isn't great, but it was really easy to do.
The big advantage of the graphics processing unit (GPU) is the
entire software interface is designed around supporting millions
of separate threads, modeled after the millions of separate pixels
on a display. The enormous parallelism means you get
excellent performance for arithmetic-limited code. The
downside is you give up lots of things you might be used to:
- The GPU can't allocate its own memory. You call cudaMalloc from the CPU side instead.
- The GPU can't do disk or network I/O. You do the I/O on the CPU, and call cudaMemcpy instead.
In CUDA,
all code is normal C++ for the CPU by default; you add the
__global__ keyword to a function to make GPU code callable by the
CPU, and add the __device__ keyword to make a function callable
from GPU code. Here I make a macro CUDA_ALL to make code
callable from the GPU or CPU.
/* Simple CUDA RC5 key setup */ #include <iostream> #include <fstream> #include "lib/inc.c" #ifdef __CUDACC__ # include <cuda.h> # define check(cudacall) { int err=cudacall; if (err!=cudaSuccess) std::cout<<"CUDA ERROR "<<err<<" at line "<<__LINE__<<"'s "<<#cudacall<<"\n";} # define CUDA_ONLY(code) code # define CUDA_ALL __host__ __device__ #else /* no CUDA, for testing */ # define check(cudacall) /* empty */ # define CUDA_ONLY(code) /* empty */ # define CUDA_ALL /* empty */ #endif /** This is a CUDA-ified RC5 function, supporting key setup, encryption and decryption. */ template <int N_ROUNDS=20,class WORD=unsigned int> class RC5 { public: enum {n_userkey=6}; // bytes in user's key enum {n_userword=(n_userkey+4-1)/4}; // WORDs in user's key enum {n_rounds=N_ROUNDS}; enum {n_keys=2*n_rounds+2}; WORD keys[n_keys]; // expanded key WORD L[n_userword]; // secret key temporary // Circular bit rotate left: rotate v left by count bits CUDA_ALL inline unsigned int ROTL(unsigned int v,unsigned int count) { count=count&31; // wrap around to word size return (v<<count) | (v>>(32-count)); } CUDA_ALL inline unsigned int ROTR(unsigned int v,unsigned int count) { count=count&31; // wrap around to word size return (v>>count) | (v<<(32-count)); } RC5(WORD keyseed) { // This is a ridiculously weak way to expand the key: srand(keyseed); for (int k=0;k<n_keys;k++) keys[k]=rand(); // add key schedule here } CUDA_ALL RC5(const void *K,int nKbytes) { // This is the official way to make the RC5 key schedule, // straight from Rivest's paper WORD w = 8*sizeof(WORD); WORD b=nKbytes; WORD t = n_keys; WORD i, j, k, u = w/8, A, B; // Fill L with little-endian word version of key WORD c=(8*b+w-1)/w; // size of L array if (c<=0) c=1; // L is predeclared; a fixed-length user key HACK for CUDA // WORD *L=new WORD[c]; // <- won't work in CUDA! for (i=0;i<c;i++) L[i]=0; for(i = 0; i<b; i++) L[i/u] = (L[i/u] << 8) + ((unsigned char *)K)[i]; // Fill initial key array with constants const WORD Pw=0xB7E15163; // from e const WORD Qw=0x9E3779B9; // from golden ratio for(keys[0] = Pw, i = 1; i < t; i++) keys[i] = keys[i-1] + Qw; // RC4-inspired mixing of keys if (t<c) k=3*c; else k=3*t; for(A = B = i = j = 0; k>0; k--, i = (i+1) % t, j = (j+1) % c) { A = keys[i] = ROTL(keys[i] + (A + B), 3); B = L[j] = ROTL(L[j] + (A + B), (A + B)); } // delete[] L; // <- not in CUDA! } CUDA_ALL inline void round_enc(WORD &A,WORD &B,int round) { A^=B; A = ROTL(A,B) + keys[2*round]; B^=A; B = ROTL(B,A) + keys[2*round+1]; } // Mix two integers worth of data, A and B, with these keys. CUDA_ALL void rounds_enc(WORD &A,WORD &B) { A += keys[0]; B += keys[1]; for (int round=1;round<=n_rounds;round++) { //printf(" round %d: %08x %08x\n", round, A,B); round_enc(A,B,round); } } CUDA_ALL inline void round_dec(WORD &A,WORD &B,int round) { B -= keys[2*round+1]; B = ROTR(B,A); B^=A; A -= keys[2*round]; A = ROTR(A,B); A^=B; } // Mix two integers worth of data, A and B, with these keys. CUDA_ALL void rounds_dec(WORD &A,WORD &B) { for (int round=n_rounds;round>=1;round--) { round_dec(A,B,round); //printf(" round %d: %08x %08x\n", round, A,B); } A -= keys[0]; B -= keys[1]; } // Print these two words in hex, using RC5's little-endian format void printWord(WORD A) { for (int byte=0;byte<4;byte++) printf("%02x",0xff&(A>>(8*byte))); } void printAB(WORD A,WORD B) { printWord(A); printWord(B); printf("\n"); } }; /* GPU Code! */ __global__ void fill_in_array(int *dest,int *src,int ciphertextA) { int tx=threadIdx.x+blockIdx.x*blockDim.x; // my thread number int PIN=tx; // Convert key to string char PINstr[8]; //snprintf(PINstr,50,"%06d",PIN); for (int digit=5;digit>=0;digit--) { char c=(PIN%10)+'0'; PINstr[digit]=c; PIN=PIN/10; } // Set up and run encryption RC5<> rc(PINstr,6); unsigned int A=0, B=0; // known plaintext rc.rounds_enc(A,B); // Check decryption if (A==ciphertextA) printf("Possible PIN: %06d\n",tx); dest[tx]=A; // debug output } /* Run on CPU */ int main(int argc,char *argv[]) { int wid=256; // threads per block int ht=1000000/wid; // total blocks unsigned char data[wid]; for (int i=0;i<wid;i++) data[i]=i|(i<<4); int *src=0, *dest=0; /* LIVES ON THE GPU!!!! */ check(cudaMalloc((void **)&src, wid*ht*sizeof(int))); check(cudaMemcpy(src,data,wid*sizeof(unsigned char),cudaMemcpyHostToDevice)); check(cudaMalloc((void **)&dest, wid*ht*sizeof(int))); double start=time_in_seconds(); int ciphertextTarget=0xdaad16a1; fill_in_array<<<ht,wid>>>(dest,src,ciphertextTarget); int harr[wid]; check(cudaMemcpy(harr,dest,wid*sizeof(float),cudaMemcpyDeviceToHost)); double elapsed=time_in_seconds()-start; std::cout<<"Copied back array at rate "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed*1.0e6<<" microseconds\n"; for (int i=0;i<20;i++) printf("array %d = %08x\n",i,harr[i]); return 0; }
The
performance? Just 15 milliseconds to scan all million
PINs! This is about 22x faster than multicore (2200%
speedup!), and over 50x faster than single core. I've found
if I combine multicore, SIMD, and superscalar, often the CPU can
be made at least competitive with GPU, but by that point the GPU
code is actually cleaner and easier to understand!
The big advantage of custom silicon is you don't waste chip area
on things like branching. Because a dedicated chip does
exactly one thing, you can fit many more circuits doing that
thing, resulting in about a 10x energy efficiency improvement over
the best programmable implementations (such as GPU), which spends
a lot of energy deciding what to do next.
I don't know of an RC5 ASIC, but the SHA-256 circuits in a
dedicated miner like Antminer
can compute over a billion hashes per second, nearly twice as fast
as the best GPU hashers
(and >500 watt systems), despite only being a tiny USB card
(<2 watts!).