/* Secure Hash Algorithm-1 (SHA-1) implementation.
Implemented and placed in the public domain by Steve Reid
Collected by Wei Dai (http://www.eskimo.com/~weidai/cryptlib.html)
Adapted by Orion Sky Lawlor, 7/20/2001
Re-adapted for single-file operation 2012-11.
*/
typedef unsigned int SHA1_word32;
enum {SHA1_nstate=5};
enum {SHA1_ndata=16};
/// Initialize SHA1_hash_words of state.
void SHA1_init(SHA1_word32 *state)
{
state[0] = 0x67452301u;
state[1] = 0xEFCDAB89u;
state[2] = 0x98BADCFEu;
state[3] = 0x10325476u;
state[4] = 0xC3D2E1F0u;
}
/// Circular left shift in 32 bits
inline SHA1_word32 rotlFixed(SHA1_word32 x, SHA1_word32 y)
{
#if defined(_MSC_VER) || defined(__BCPLUSPLUS__)
return y ? _lrotl(x, y) : x;
#elif defined(__MWERKS__) && TARGET_CPU_PPC
return y ? __rlwinm(x,y,0,31) : x;
#else /*Default C version*/
/* works for int, don't and for SIMD
return ((0xFFffFFffu)&(x<<y)) | (((0xFFffFFffu)&x)>>(32-y));
*/
return ((x<<y)) | ((x)>>(32-y));
#endif
}
#define blk0(i) (W[i] = data[i])
#define blk1(i) (W[i&15] = rotlFixed(W[(i+13)&15]^W[(i+8)&15]^W[(i+2)&15]^W[i&15],1))
#define f1(x,y,z) (z^(x&(y^z)))
#define f2(x,y,z) (x^y^z)
#define f3(x,y,z) ((x&y)|(z&(x|y)))
#define f4(x,y,z) (x^y^z)
/* (R0+R1), R2, R3, R4 are the different operations used in SHA1 */
#define R0(v,w,x,y,z,i) z+=f1(w,x,y)+blk0(i)+0x5A827999u+rotlFixed(v,5);w=rotlFixed(w,30);
#define R1(v,w,x,y,z,i) z+=f1(w,x,y)+blk1(i)+0x5A827999u+rotlFixed(v,5);w=rotlFixed(w,30);
#define R2(v,w,x,y,z,i) z+=f2(w,x,y)+blk1(i)+0x6ED9EBA1u+rotlFixed(v,5);w=rotlFixed(w,30);
#define R3(v,w,x,y,z,i) z+=f3(w,x,y)+blk1(i)+0x8F1BBCDCu+rotlFixed(v,5);w=rotlFixed(w,30);
#define R4(v,w,x,y,z,i) z+=f4(w,x,y)+blk1(i)+0xCA62C1D6u+rotlFixed(v,5);w=rotlFixed(w,30);
void SHA1_transform(SHA1_word32 *state, const SHA1_word32 *data)
{
SHA1_word32 W[16];
/* Copy state to working vars */
SHA1_word32 a = state[0];
SHA1_word32 b = state[1];
SHA1_word32 c = state[2];
SHA1_word32 d = state[3];
SHA1_word32 e = state[4];
/* 4 rounds of 20 operations each. Loop unrolled. */
R0(a,b,c,d,e, 0); R0(e,a,b,c,d, 1); R0(d,e,a,b,c, 2); R0(c,d,e,a,b, 3);
R0(b,c,d,e,a, 4); R0(a,b,c,d,e, 5); R0(e,a,b,c,d, 6); R0(d,e,a,b,c, 7);
R0(c,d,e,a,b, 8); R0(b,c,d,e,a, 9); R0(a,b,c,d,e,10); R0(e,a,b,c,d,11);
R0(d,e,a,b,c,12); R0(c,d,e,a,b,13); R0(b,c,d,e,a,14); R0(a,b,c,d,e,15);
R1(e,a,b,c,d,16); R1(d,e,a,b,c,17); R1(c,d,e,a,b,18); R1(b,c,d,e,a,19);
R2(a,b,c,d,e,20); R2(e,a,b,c,d,21); R2(d,e,a,b,c,22); R2(c,d,e,a,b,23);
R2(b,c,d,e,a,24); R2(a,b,c,d,e,25); R2(e,a,b,c,d,26); R2(d,e,a,b,c,27);
R2(c,d,e,a,b,28); R2(b,c,d,e,a,29); R2(a,b,c,d,e,30); R2(e,a,b,c,d,31);
R2(d,e,a,b,c,32); R2(c,d,e,a,b,33); R2(b,c,d,e,a,34); R2(a,b,c,d,e,35);
R2(e,a,b,c,d,36); R2(d,e,a,b,c,37); R2(c,d,e,a,b,38); R2(b,c,d,e,a,39);
R3(a,b,c,d,e,40); R3(e,a,b,c,d,41); R3(d,e,a,b,c,42); R3(c,d,e,a,b,43);
R3(b,c,d,e,a,44); R3(a,b,c,d,e,45); R3(e,a,b,c,d,46); R3(d,e,a,b,c,47);
R3(c,d,e,a,b,48); R3(b,c,d,e,a,49); R3(a,b,c,d,e,50); R3(e,a,b,c,d,51);
R3(d,e,a,b,c,52); R3(c,d,e,a,b,53); R3(b,c,d,e,a,54); R3(a,b,c,d,e,55);
R3(e,a,b,c,d,56); R3(d,e,a,b,c,57); R3(c,d,e,a,b,58); R3(b,c,d,e,a,59);
R4(a,b,c,d,e,60); R4(e,a,b,c,d,61); R4(d,e,a,b,c,62); R4(c,d,e,a,b,63);
R4(b,c,d,e,a,64); R4(a,b,c,d,e,65); R4(e,a,b,c,d,66); R4(d,e,a,b,c,67);
R4(c,d,e,a,b,68); R4(b,c,d,e,a,69); R4(a,b,c,d,e,70); R4(e,a,b,c,d,71);
R4(d,e,a,b,c,72); R4(c,d,e,a,b,73); R4(b,c,d,e,a,74); R4(a,b,c,d,e,75);
R4(e,a,b,c,d,76); R4(d,e,a,b,c,77); R4(c,d,e,a,b,78); R4(b,c,d,e,a,79);
/* Add the working vars back into context.state[] */
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
}
/* Silly single-integer interface.
The real interface takes a variable-length bit string of data, transforms each
SHA1_ndata size chunk, and finally tacks on an end-of-message 0x80 and a bit count.
I'm ignoring all that here.
*/
void hash(unsigned int *src,unsigned int *hashes) {
SHA1_word32 state[SHA1_nstate];
SHA1_init(&state[0]);
SHA1_word32 data[SHA1_ndata];
for (int i=0;i<SHA1_ndata;i++) data[i]=0;
data[0]=src[0];
SHA1_transform(&state[0],&data[0]);
hashes[0]=state[0];
}
/************ Main program ****************/
int foo(void) {
unsigned int matches=0;
int nbatches=1000000;
unsigned int htarget=read_input();
double start=time_in_seconds();
for (int v=0;v<nbatches;v++) {
unsigned int vs[1], hs[1];
for (int s=0;s<1;s++) vs[s]=v+s;
hash(vs,hs);
for (int s=0;s<1;s++) {
unsigned int h=hs[s];
if (h==htarget) { /* we have a match! */
matches++;
std::cout<<"Hash["<<v+s<<"]="<<std::hex<<h<<"\n";
}
}
}
double end=time_in_seconds();
std::cout<<"Total time: "<< (end-start)*1.0e3 <<" milliseconds\n";
std::cout<<"Time/hash: "<< (end-start)/nbatches*1.0e9 <<" nanoseconds\n";
return matches;
}
This runs in 178 nanoseconds per hash on my Sandy Bridge
machine--about 5 million hashes per second, which is fairly
respectable.One simple way to run faster is to use multiple cores. The easiest way to use multiple cores is via OpenMP, a compiler directive that says "make this loop run with multiple threads across the cores".
To use it, add the line "#pragma omp parallel for" above the v
loop in foo, and switch to "OpenMP" mode.
(Try
this in NetRun now!)
This immediately gets us to 55 nanoseconds per hash, not quite a
four-fold speedup due to overheads creating threads, and the
single shared memory subsystem.
In real code, you must be very careful when using threads:
Since threads take a long time to start up, hardware architects
often use an orthogonal way to express parallelism called SIMD:
Single Instruction, Multiple Data. Intel's version of this
is called SSE, the Streaming SIMD Extensions. From C, you
can #include <emmintrin.h>, and
the crypo-friendly integer values are four-integer blocks named
__m128i, and the operations end with "_epi32".
This means you can add two SIMD blocks of integers using:
#include <emmintrin.h> /* Intel SSE header */
int foo(void) {
__m128i a=_mm_setr_epi32(1,2,3,4);
__m128i b=_mm_setr_epi32(10,100,1000,1000);
__m128i c=_mm_add_epi32(a,b);
int out[4];
_mm_store_si128((__m128i *)&out[0],c);
for (int i=0;i<4;i++) std::cout<<out[i]<<" ";
return 0;
}
This is pretty darn ugly, so I built a header named "osl/floats.h"
that defines a class named "ints". This class has overloaded
operators to let you treat SSE values similar to ordinary single
values. In fact, I can switch the typedef around in the SHA1
implementation above, and get a SSE version basically free!
Note that this version computes four totally independent hashes
per run--this is the easiest way to use SIMD, computing unrelated
values, because it eliminates dependencies between the slices of
an SIMD register.
/* Secure Hash Algorithm-1 (SHA-1) implementation. Implemented and placed in the public domain by Steve Reid Collected by Wei Dai (http://www.eskimo.com/~weidai/cryptlib.html) Adapted by Orion Sky Lawlor, 7/20/2001 Re-adapted for single-file operation 2012-11. */ #undef __AVX__ // the SSE integer stuff is faster than AVX #include "osl/floats.h" typedef ints SHA1_word32;
... all SHA1 stuff works exactly the same way ...
void hash(unsigned int *src,unsigned int *hashes) { SHA1_word32 state[SHA1_nstate]; SHA1_init(&state[0]); SHA1_word32 data[SHA1_ndata]; for (int i=0;i<SHA1_ndata;i++) data[i]=0; for (int s=0;s<ints::n;s++) data[0][s]=src[s]; SHA1_transform(&state[0],&data[0]); for (int s=0;s<ints::n;s++) hashes[s]=state[0][s]; } /************ Main program ****************/ int foo(void) { unsigned int matches=0; int nbatches=1000000; unsigned int htarget=read_input(); double start=time_in_seconds(); #pragma omp parallel for for (int v=0;v<nbatches;v+=ints::n) { unsigned int vs[ints::n], hs[ints::n]; for (int s=0;s<ints::n;s++) vs[s]=v+s; hash(vs,hs); for (int s=0;s<ints::n;s++) { unsigned int h=hs[s]; if (h==htarget) { /* we have a match! */ matches++; std::cout<<"Hash["<<v+s<<"]="<<std::hex<<h<<"\n"; } } } double end=time_in_seconds(); std::cout<<"Total time: "<< (end-start)*1.0e3 <<" milliseconds\n"; std::cout<<"Time/hash: "<< (end-start)/nbatches*1.0e9 <<" nanoseconds\n"; return matches; }
The end result is 19ns per hash,
which is over 50 million hashes per second, and nearly 10x faster
than our original sequential code!