/**
This is an RC5 function, supporting key setup,
encryption and decryption.
*/
template <int N_ROUNDS=20,class WORD=unsigned int>
class RC5 {
public:
enum {n_rounds=N_ROUNDS};
enum {n_keys=2*n_rounds+2};
WORD keys[n_keys]; // expanded key
// Circular bit rotate left: rotate v left by count bits
inline unsigned int ROTL(unsigned int v,unsigned int count)
{
count=count&31; // wrap around to word size
return (v<<count) | (v>>(32-count));
}
inline unsigned int ROTR(unsigned int v,unsigned int count)
{
count=count&31; // wrap around to word size
return (v>>count) | (v<<(32-count));
}
RC5(WORD keyseed) {
// This is a ridiculously weak way to expand the key:
srand(keyseed);
for (int k=0;k<n_keys;k++) keys[k]=rand(); // add key schedule here
}
RC5(const void *K,int nKbytes) {
// This is the official way to make the RC5 key schedule,
// straight from Rivest's paper
WORD w = 8*sizeof(WORD);
WORD b=nKbytes;
WORD t = n_keys;
WORD i, j, k, u = w/8, A, B;
// Fill L with little-endian word version of key
WORD c=(8*b+w-1)/w; // size of L array
if (c<=0) c=1;
WORD L[c]; // or WORD *L=new WORD[c];
for (i=0;i<c;i++) L[i]=0;
for(i = 0; i<b; i++)
L[i/u] = (L[i/u] << 8) + ((unsigned char *)K)[i];
// Fill initial key array with constants
const WORD Pw=0xB7E15163; // from e
const WORD Qw=0x9E3779B9; // from golden ratio
for(keys[0] = Pw, i = 1; i < t; i++)
keys[i] = keys[i-1] + Qw;
// RC4-inspired mixing of keys
if (t<c) k=3*c;
else k=3*t;
for(A = B = i = j = 0; k>0; k--, i = (i+1) % t, j = (j+1) % c)
{
A = keys[i] = ROTL(keys[i] + (A + B), 3);
B = L[j] = ROTL(L[j] + (A + B), (A + B));
}
// delete[] L; // if you used new above
}
inline void round_enc(WORD &A,WORD &B,int round) {
A^=B;
A = ROTL(A,B) + keys[2*round];
B^=A;
B = ROTL(B,A) + keys[2*round+1];
}
// Mix two integers worth of data, A and B, with these keys.
void rounds_enc(WORD &A,WORD &B) {
A += keys[0];
B += keys[1];
for (int round=1;round<=n_rounds;round++) {
//printf(" round %d: %08x %08x\n", round, A,B);
round_enc(A,B,round);
}
}
inline void round_dec(WORD &A,WORD &B,int round) {
B -= keys[2*round+1];
B = ROTR(B,A);
B^=A;
A -= keys[2*round];
A = ROTR(A,B);
A^=B;
}
// Mix two integers worth of data, A and B, with these keys.
void rounds_dec(WORD &A,WORD &B) {
for (int round=n_rounds;round>=1;round--) {
round_dec(A,B,round);
//printf(" round %d: %08x %08x\n", round, A,B);
}
A -= keys[0];
B -= keys[1];
}
// Print these two words in hex, using RC5's little-endian format
void printWord(WORD A) {
for (int byte=0;byte<4;byte++)
printf("%02x",0xff&(A>>(8*byte)));
}
void printAB(WORD A,WORD B) {
printWord(A); printWord(B); printf("\n");
}
};
// This is the ciphertext we're trying to crack
unsigned int ciphertextA=0xdaad16a1;
// Run some simple tests, encrypting ascending data
void foo(void) {
// Get cracking!
double start=time_in_seconds();
for (int PIN=0;PIN<1000000;PIN++) {
char PINstr[8];
snprintf(PINstr,8,"%06d",PIN);
RC5<> rc(PINstr,6);
unsigned int A=0, B=0; // known plaintext
rc.rounds_enc(A,B);
if (PIN<10) printf("A=%08x key=%08x\n",A,rc.keys[0]);
if (A==ciphertextA) printf("Possible PIN: %06d\n",PIN);
}
printf("All pins checked in %.3f sec\n",time_in_seconds()-start);
}
Benchmarking reveals the rounds_enc function only takes 54
nanoseconds each, but setting up the key schedule (in the
constructor) takes 760 nanoseconds each. Checking all million
PIN codes takes 0.876 seconds with this naive serial CPU
implementation.
RC5<> rc("yup",3);
RC5<> rc2("yupper",6);
RC5<> rc3("yippie",6);
unsigned int src=0;
int time_enc(void) { // simple single-decode benchmark
unsigned int A=src, B=src;
rc.rounds_enc(A,B);
return A==ciphertextA;
}
int time_enc2(void) { // crazy superscalar-decode benchmark, 2 at once
unsigned int A=src-rc.keys[0], B=src-rc.keys[1];
unsigned int A2=src-rc2.keys[0], B2=src-rc2.keys[1];
for (unsigned int round=1;round<=RC5<>::n_rounds;round++) {
rc.round_enc(A,B,round);
rc2.round_enc(A2,B2,round);
}
return A==ciphertextA || A2==ciphertextA;
}
int time_enc3(void) { // crazy superscalar-decode benchmark, 3 at once
unsigned int A=src-rc.keys[0], B=src-rc.keys[1];
unsigned int A2=src-rc2.keys[0], B2=src-rc2.keys[1];
unsigned int A3=src-rc3.keys[0], B3=src-rc3.keys[1];
for (unsigned int round=1;round<=RC5<>::n_rounds;round++) {
rc.round_enc(A,B,round);
rc2.round_enc(A2,B2,round);
rc3.round_enc(A3,B3,round);
}
return A==ciphertextA || A2==ciphertextA || A3==ciphertextA;
}
This is a very contorted and bizarre software implementation, but
the speedup is substantial:| N |
Raw time |
Time/output |
| 1 |
53.81 ns |
53.81 ns/output |
| 2 |
63.77 ns |
31.89 ns/output |
| 3 |
86.93 ns |
28.98 ns/output |
| 4 |
110.56 ns |
27.64 ns/output |
#pragma omp parallel for
for (int PIN=0;PIN<1000000;PIN++) {
... same check as before ...
}
This checks 1 million PINs in
0.344 seconds, which is just over a 2.5x speedup. For a 4 core
machine, this isn't great, but it was really easy to do.The big advantage of the graphics processing unit (GPU) is the
entire software interface is designed around supporting millions
of separate threads, modeled after the millions of separate pixels
on a display. The enormous parallelism means you get
excellent performance for arithmetic-limited code. The
downside is you give up lots of things you might be used to:
- The GPU can't allocate its own memory. You call cudaMalloc from the CPU side instead.
- The GPU can't do disk or network I/O. You do the I/O on the CPU, and call cudaMemcpy instead.
In CUDA,
all code is normal C++ for the CPU by default; you add the
__global__ keyword to a function to make GPU code callable by the
CPU, and add the __device__ keyword to make a function callable
from GPU code. Here I make a macro CUDA_ALL to make code
callable from the GPU or CPU.
/*
Simple CUDA RC5 key setup
*/
#include <iostream>
#include <fstream>
#include "lib/inc.c"
#ifdef __CUDACC__
# include <cuda.h>
# define check(cudacall) { int err=cudacall; if (err!=cudaSuccess) std::cout<<"CUDA ERROR "<<err<<" at line "<<__LINE__<<"'s "<<#cudacall<<"\n";}
# define CUDA_ONLY(code) code
# define CUDA_ALL __host__ __device__
#else /* no CUDA, for testing */
# define check(cudacall) /* empty */
# define CUDA_ONLY(code) /* empty */
# define CUDA_ALL /* empty */
#endif
/**
This is a CUDA-ified RC5 function, supporting key setup,
encryption and decryption.
*/
template <int N_ROUNDS=20,class WORD=unsigned int>
class RC5 {
public:
enum {n_userkey=6}; // bytes in user's key
enum {n_userword=(n_userkey+4-1)/4}; // WORDs in user's key
enum {n_rounds=N_ROUNDS};
enum {n_keys=2*n_rounds+2};
WORD keys[n_keys]; // expanded key
WORD L[n_userword]; // secret key temporary
// Circular bit rotate left: rotate v left by count bits
CUDA_ALL inline unsigned int ROTL(unsigned int v,unsigned int count)
{
count=count&31; // wrap around to word size
return (v<<count) | (v>>(32-count));
}
CUDA_ALL inline unsigned int ROTR(unsigned int v,unsigned int count)
{
count=count&31; // wrap around to word size
return (v>>count) | (v<<(32-count));
}
RC5(WORD keyseed) {
// This is a ridiculously weak way to expand the key:
srand(keyseed);
for (int k=0;k<n_keys;k++) keys[k]=rand(); // add key schedule here
}
CUDA_ALL RC5(const void *K,int nKbytes) {
// This is the official way to make the RC5 key schedule,
// straight from Rivest's paper
WORD w = 8*sizeof(WORD);
WORD b=nKbytes;
WORD t = n_keys;
WORD i, j, k, u = w/8, A, B;
// Fill L with little-endian word version of key
WORD c=(8*b+w-1)/w; // size of L array
if (c<=0) c=1;
// L is predeclared; a fixed-length user key HACK for CUDA
// WORD *L=new WORD[c]; // <- won't work in CUDA!
for (i=0;i<c;i++) L[i]=0;
for(i = 0; i<b; i++)
L[i/u] = (L[i/u] << 8) + ((unsigned char *)K)[i];
// Fill initial key array with constants
const WORD Pw=0xB7E15163; // from e
const WORD Qw=0x9E3779B9; // from golden ratio
for(keys[0] = Pw, i = 1; i < t; i++)
keys[i] = keys[i-1] + Qw;
// RC4-inspired mixing of keys
if (t<c) k=3*c;
else k=3*t;
for(A = B = i = j = 0; k>0; k--, i = (i+1) % t, j = (j+1) % c)
{
A = keys[i] = ROTL(keys[i] + (A + B), 3);
B = L[j] = ROTL(L[j] + (A + B), (A + B));
}
// delete[] L; // <- not in CUDA!
}
CUDA_ALL inline void round_enc(WORD &A,WORD &B,int round) {
A^=B;
A = ROTL(A,B) + keys[2*round];
B^=A;
B = ROTL(B,A) + keys[2*round+1];
}
// Mix two integers worth of data, A and B, with these keys.
CUDA_ALL void rounds_enc(WORD &A,WORD &B) {
A += keys[0];
B += keys[1];
for (int round=1;round<=n_rounds;round++) {
//printf(" round %d: %08x %08x\n", round, A,B);
round_enc(A,B,round);
}
}
CUDA_ALL inline void round_dec(WORD &A,WORD &B,int round) {
B -= keys[2*round+1];
B = ROTR(B,A);
B^=A;
A -= keys[2*round];
A = ROTR(A,B);
A^=B;
}
// Mix two integers worth of data, A and B, with these keys.
CUDA_ALL void rounds_dec(WORD &A,WORD &B) {
for (int round=n_rounds;round>=1;round--) {
round_dec(A,B,round);
//printf(" round %d: %08x %08x\n", round, A,B);
}
A -= keys[0];
B -= keys[1];
}
// Print these two words in hex, using RC5's little-endian format
void printWord(WORD A) {
for (int byte=0;byte<4;byte++)
printf("%02x",0xff&(A>>(8*byte)));
}
void printAB(WORD A,WORD B) {
printWord(A); printWord(B); printf("\n");
}
};
/* GPU Code! */
__global__ void fill_in_array(int *dest,int *src,int ciphertextA) {
int tx=threadIdx.x+blockIdx.x*blockDim.x; // my thread number
int PIN=tx;
// Convert key to string
char PINstr[8];
//snprintf(PINstr,50,"%06d",PIN);
for (int digit=5;digit>=0;digit--) {
char c=(PIN%10)+'0';
PINstr[digit]=c;
PIN=PIN/10;
}
// Set up and run encryption
RC5<> rc(PINstr,6);
unsigned int A=0, B=0; // known plaintext
rc.rounds_enc(A,B);
// Check decryption
if (A==ciphertextA) printf("Possible PIN: %06d\n",tx);
dest[tx]=A; // debug output
}
/* Run on CPU */
int main(int argc,char *argv[]) {
int wid=256; // threads per block
int ht=1000000/wid; // total blocks
unsigned char data[wid];
for (int i=0;i<wid;i++) data[i]=i|(i<<4);
int *src=0, *dest=0; /* LIVES ON THE GPU!!!! */
check(cudaMalloc((void **)&src, wid*ht*sizeof(int)));
check(cudaMemcpy(src,data,wid*sizeof(unsigned char),cudaMemcpyHostToDevice));
check(cudaMalloc((void **)&dest, wid*ht*sizeof(int)));
double start=time_in_seconds();
int ciphertextTarget=0xdaad16a1;
fill_in_array<<<ht,wid>>>(dest,src,ciphertextTarget);
int harr[wid];
check(cudaMemcpy(harr,dest,wid*sizeof(float),cudaMemcpyDeviceToHost));
double elapsed=time_in_seconds()-start;
std::cout<<"Copied back array at rate "<<elapsed*1.0e9/(wid*ht)<<"ns/pixel, time "<<elapsed*1.0e6<<" microseconds\n";
for (int i=0;i<20;i++) printf("array %d = %08x\n",i,harr[i]);
return 0;
}
The
performance? Just 15 milliseconds to scan all million
PINs! This is about 22x faster than multicore (2200%
speedup!), and over 50x faster than single core. I've found
if I combine multicore, SIMD, and superscalar, often the CPU can
be made at least competitive with GPU, but by that point the GPU
code is actually cleaner and easier to understand!
The big advantage of custom silicon is you don't waste chip area
on things like branching. Because a dedicated chip does
exactly one thing, you can fit many more circuits doing that
thing, resulting in about a 10x energy efficiency improvement over
the best programmable implementations (such as GPU), which spends
a lot of energy deciding what to do next.
I don't know of an RC5 ASIC, but the SHA-256 circuits in a
dedicated miner like Antminer
can compute over a billion hashes per second, nearly twice as fast
as the best GPU hashers
(and >500 watt systems), despite only being a tiny USB card
(<2 watts!).