1
mirror of https://github.com/hashcat/hashcat synced 2024-11-13 17:28:58 +01:00

Remove automatic unrolling on some ciphers and replace with manually unrolled code

This commit is contained in:
jsteube 2019-03-16 19:00:36 +01:00
parent 6286874b0a
commit a063e9ef62
3 changed files with 572 additions and 325 deletions

View File

@ -700,9 +700,6 @@ DECLSPEC void aes128_ExpandKey (u32 *ks, const u32 *ukey, SHM_TYPE u32 *s_te0, S
ks[2] = ukey[2]; ks[2] = ukey[2];
ks[3] = ukey[3]; ks[3] = ukey[3];
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0, j = 0; i < 10; i += 1, j += 4) for (int i = 0, j = 0; i < 10; i += 1, j += 4)
{ {
u32 temp = ks[j + 3]; u32 temp = ks[j + 3];
@ -748,9 +745,6 @@ DECLSPEC void aes128_InvertKey (u32 *ks, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te
temp = ks[18]; ks[18] = ks[26]; ks[26] = temp; temp = ks[18]; ks[18] = ks[26]; ks[26] = temp;
temp = ks[19]; ks[19] = ks[27]; ks[27] = temp; temp = ks[19]; ks[19] = ks[27]; ks[27] = temp;
#ifdef _unroll
#pragma unroll
#endif
for (int i = 1, j = 4; i < 10; i += 1, j += 4) for (int i = 1, j = 4; i < 10; i += 1, j += 4)
{ {
const u32 x0s0 = (ks[j + 0] >> 0) & 0xff; const u32 x0s0 = (ks[j + 0] >> 0) & 0xff;
@ -827,83 +821,82 @@ DECLSPEC void aes128_set_decrypt_key (u32 *ks, const u32 *ukey, SHM_TYPE u32 *s_
DECLSPEC void aes128_encrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4); DECLSPEC void aes128_encrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4);
DECLSPEC void aes128_encrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) DECLSPEC void aes128_encrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4)
{ {
const u32 in_s0 = swap32_S (in[0]); u32 in_s[4];
const u32 in_s1 = swap32_S (in[1]);
const u32 in_s2 = swap32_S (in[2]);
const u32 in_s3 = swap32_S (in[3]);
u32 t0 = in_s0 ^ ks[0]; in_s[0] = swap32_S (in[0]);
u32 t1 = in_s1 ^ ks[1]; in_s[1] = swap32_S (in[1]);
u32 t2 = in_s2 ^ ks[2]; in_s[2] = swap32_S (in[2]);
u32 t3 = in_s3 ^ ks[3]; in_s[3] = swap32_S (in[3]);
#ifdef _unroll u32 s0 = in_s[0] ^ ks[0];
#pragma unroll u32 s1 = in_s[1] ^ ks[1];
#endif u32 s2 = in_s[2] ^ ks[2];
for (int i = 4; i < 40; i += 4) u32 s3 = in_s[3] ^ ks[3];
{
const u32 x0s0 = (t0 >> 0) & 0xff;
const u32 x0s1 = (t0 >> 8) & 0xff;
const u32 x0s2 = (t0 >> 16) & 0xff;
const u32 x0s3 = (t0 >> 24) & 0xff;
const u32 x1s0 = (t1 >> 0) & 0xff;
const u32 x1s1 = (t1 >> 8) & 0xff;
const u32 x1s2 = (t1 >> 16) & 0xff;
const u32 x1s3 = (t1 >> 24) & 0xff;
const u32 x2s0 = (t2 >> 0) & 0xff;
const u32 x2s1 = (t2 >> 8) & 0xff;
const u32 x2s2 = (t2 >> 16) & 0xff;
const u32 x2s3 = (t2 >> 24) & 0xff;
const u32 x3s0 = (t3 >> 0) & 0xff;
const u32 x3s1 = (t3 >> 8) & 0xff;
const u32 x3s2 = (t3 >> 16) & 0xff;
const u32 x3s3 = (t3 >> 24) & 0xff;
t0 = s_te0[x0s3] ^ s_te1[x1s2] ^ s_te2[x2s1] ^ s_te3[x3s0] ^ ks[i + 0]; u32 t0;
t1 = s_te0[x1s3] ^ s_te1[x2s2] ^ s_te2[x3s1] ^ s_te3[x0s0] ^ ks[i + 1]; u32 t1;
t2 = s_te0[x2s3] ^ s_te1[x3s2] ^ s_te2[x0s1] ^ s_te3[x1s0] ^ ks[i + 2]; u32 t2;
t3 = s_te0[x3s3] ^ s_te1[x0s2] ^ s_te2[x1s1] ^ s_te3[x2s0] ^ ks[i + 3]; u32 t3;
}
const u32 x0s0 = (t0 >> 0) & 0xff; t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[ 4];
const u32 x0s1 = (t0 >> 8) & 0xff; t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[ 5];
const u32 x0s2 = (t0 >> 16) & 0xff; t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[ 6];
const u32 x0s3 = (t0 >> 24) & 0xff; t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[ 7];
const u32 x1s0 = (t1 >> 0) & 0xff; s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ ks[ 8];
const u32 x1s1 = (t1 >> 8) & 0xff; s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ ks[ 9];
const u32 x1s2 = (t1 >> 16) & 0xff; s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ ks[10];
const u32 x1s3 = (t1 >> 24) & 0xff; s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ ks[11];
const u32 x2s0 = (t2 >> 0) & 0xff; t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[12];
const u32 x2s1 = (t2 >> 8) & 0xff; t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[13];
const u32 x2s2 = (t2 >> 16) & 0xff; t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[14];
const u32 x2s3 = (t2 >> 24) & 0xff; t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[15];
const u32 x3s0 = (t3 >> 0) & 0xff; s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ ks[16];
const u32 x3s1 = (t3 >> 8) & 0xff; s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ ks[17];
const u32 x3s2 = (t3 >> 16) & 0xff; s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ ks[18];
const u32 x3s3 = (t3 >> 24) & 0xff; s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ ks[19];
t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[20];
t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[21];
t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[22];
t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[23];
s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ ks[24];
s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ ks[25];
s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ ks[26];
s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ ks[27];
t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[28];
t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[29];
t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[30];
t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[31];
s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ ks[32];
s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ ks[33];
s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ ks[34];
s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ ks[35];
t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[36];
t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[37];
t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[38];
t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[39];
out[0] = (s_te4[x0s3] & 0xff000000) out[0] = (s_te4[(t0 >> 24) & 0xff] & 0xff000000)
^ (s_te4[x1s2] & 0x00ff0000) ^ (s_te4[(t1 >> 16) & 0xff] & 0x00ff0000)
^ (s_te4[x2s1] & 0x0000ff00) ^ (s_te4[(t2 >> 8) & 0xff] & 0x0000ff00)
^ (s_te4[x3s0] & 0x000000ff) ^ (s_te4[(t3 >> 0) & 0xff] & 0x000000ff)
^ ks[40]; ^ ks[40];
out[1] = (s_te4[x1s3] & 0xff000000) out[1] = (s_te4[(t1 >> 24) & 0xff] & 0xff000000)
^ (s_te4[x2s2] & 0x00ff0000) ^ (s_te4[(t2 >> 16) & 0xff] & 0x00ff0000)
^ (s_te4[x3s1] & 0x0000ff00) ^ (s_te4[(t3 >> 8) & 0xff] & 0x0000ff00)
^ (s_te4[x0s0] & 0x000000ff) ^ (s_te4[(t0 >> 0) & 0xff] & 0x000000ff)
^ ks[41]; ^ ks[41];
out[2] = (s_te4[x2s3] & 0xff000000) out[2] = (s_te4[(t2 >> 24) & 0xff] & 0xff000000)
^ (s_te4[x3s2] & 0x00ff0000) ^ (s_te4[(t3 >> 16) & 0xff] & 0x00ff0000)
^ (s_te4[x0s1] & 0x0000ff00) ^ (s_te4[(t0 >> 8) & 0xff] & 0x0000ff00)
^ (s_te4[x1s0] & 0x000000ff) ^ (s_te4[(t1 >> 0) & 0xff] & 0x000000ff)
^ ks[42]; ^ ks[42];
out[3] = (s_te4[x3s3] & 0xff000000) out[3] = (s_te4[(t3 >> 24) & 0xff] & 0xff000000)
^ (s_te4[x0s2] & 0x00ff0000) ^ (s_te4[(t0 >> 16) & 0xff] & 0x00ff0000)
^ (s_te4[x1s1] & 0x0000ff00) ^ (s_te4[(t1 >> 8) & 0xff] & 0x0000ff00)
^ (s_te4[x2s0] & 0x000000ff) ^ (s_te4[(t2 >> 0) & 0xff] & 0x000000ff)
^ ks[43]; ^ ks[43];
out[0] = swap32_S (out[0]); out[0] = swap32_S (out[0]);
@ -915,83 +908,82 @@ DECLSPEC void aes128_encrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u
DECLSPEC void aes128_decrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4); DECLSPEC void aes128_decrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4);
DECLSPEC void aes128_decrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) DECLSPEC void aes128_decrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4)
{ {
const u32 in_s0 = swap32_S (in[0]); u32 in_s[4];
const u32 in_s1 = swap32_S (in[1]);
const u32 in_s2 = swap32_S (in[2]);
const u32 in_s3 = swap32_S (in[3]);
u32 t0 = in_s0 ^ ks[0]; in_s[0] = swap32_S (in[0]);
u32 t1 = in_s1 ^ ks[1]; in_s[1] = swap32_S (in[1]);
u32 t2 = in_s2 ^ ks[2]; in_s[2] = swap32_S (in[2]);
u32 t3 = in_s3 ^ ks[3]; in_s[3] = swap32_S (in[3]);
#ifdef _unroll u32 s0 = in_s[0] ^ ks[0];
#pragma unroll u32 s1 = in_s[1] ^ ks[1];
#endif u32 s2 = in_s[2] ^ ks[2];
for (int i = 4; i < 40; i += 4) u32 s3 = in_s[3] ^ ks[3];
{
const u32 x0s0 = (t0 >> 0) & 0xff;
const u32 x0s1 = (t0 >> 8) & 0xff;
const u32 x0s2 = (t0 >> 16) & 0xff;
const u32 x0s3 = (t0 >> 24) & 0xff;
const u32 x1s0 = (t1 >> 0) & 0xff;
const u32 x1s1 = (t1 >> 8) & 0xff;
const u32 x1s2 = (t1 >> 16) & 0xff;
const u32 x1s3 = (t1 >> 24) & 0xff;
const u32 x2s0 = (t2 >> 0) & 0xff;
const u32 x2s1 = (t2 >> 8) & 0xff;
const u32 x2s2 = (t2 >> 16) & 0xff;
const u32 x2s3 = (t2 >> 24) & 0xff;
const u32 x3s0 = (t3 >> 0) & 0xff;
const u32 x3s1 = (t3 >> 8) & 0xff;
const u32 x3s2 = (t3 >> 16) & 0xff;
const u32 x3s3 = (t3 >> 24) & 0xff;
t0 = s_td0[x0s3] ^ s_td1[x3s2] ^ s_td2[x2s1] ^ s_td3[x1s0] ^ ks[i + 0]; u32 t0;
t1 = s_td0[x1s3] ^ s_td1[x0s2] ^ s_td2[x3s1] ^ s_td3[x2s0] ^ ks[i + 1]; u32 t1;
t2 = s_td0[x2s3] ^ s_td1[x1s2] ^ s_td2[x0s1] ^ s_td3[x3s0] ^ ks[i + 2]; u32 t2;
t3 = s_td0[x3s3] ^ s_td1[x2s2] ^ s_td2[x1s1] ^ s_td3[x0s0] ^ ks[i + 3]; u32 t3;
}
const u32 x0s0 = (t0 >> 0) & 0xff; t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[ 4];
const u32 x0s1 = (t0 >> 8) & 0xff; t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[ 5];
const u32 x0s2 = (t0 >> 16) & 0xff; t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[ 6];
const u32 x0s3 = (t0 >> 24) & 0xff; t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[ 7];
const u32 x1s0 = (t1 >> 0) & 0xff; s0 = s_td0[t0 >> 24] ^ s_td1[(t3 >> 16) & 0xff] ^ s_td2[(t2 >> 8) & 0xff] ^ s_td3[t1 & 0xff] ^ ks[ 8];
const u32 x1s1 = (t1 >> 8) & 0xff; s1 = s_td0[t1 >> 24] ^ s_td1[(t0 >> 16) & 0xff] ^ s_td2[(t3 >> 8) & 0xff] ^ s_td3[t2 & 0xff] ^ ks[ 9];
const u32 x1s2 = (t1 >> 16) & 0xff; s2 = s_td0[t2 >> 24] ^ s_td1[(t1 >> 16) & 0xff] ^ s_td2[(t0 >> 8) & 0xff] ^ s_td3[t3 & 0xff] ^ ks[10];
const u32 x1s3 = (t1 >> 24) & 0xff; s3 = s_td0[t3 >> 24] ^ s_td1[(t2 >> 16) & 0xff] ^ s_td2[(t1 >> 8) & 0xff] ^ s_td3[t0 & 0xff] ^ ks[11];
const u32 x2s0 = (t2 >> 0) & 0xff; t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[12];
const u32 x2s1 = (t2 >> 8) & 0xff; t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[13];
const u32 x2s2 = (t2 >> 16) & 0xff; t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[14];
const u32 x2s3 = (t2 >> 24) & 0xff; t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[15];
const u32 x3s0 = (t3 >> 0) & 0xff; s0 = s_td0[t0 >> 24] ^ s_td1[(t3 >> 16) & 0xff] ^ s_td2[(t2 >> 8) & 0xff] ^ s_td3[t1 & 0xff] ^ ks[16];
const u32 x3s1 = (t3 >> 8) & 0xff; s1 = s_td0[t1 >> 24] ^ s_td1[(t0 >> 16) & 0xff] ^ s_td2[(t3 >> 8) & 0xff] ^ s_td3[t2 & 0xff] ^ ks[17];
const u32 x3s2 = (t3 >> 16) & 0xff; s2 = s_td0[t2 >> 24] ^ s_td1[(t1 >> 16) & 0xff] ^ s_td2[(t0 >> 8) & 0xff] ^ s_td3[t3 & 0xff] ^ ks[18];
const u32 x3s3 = (t3 >> 24) & 0xff; s3 = s_td0[t3 >> 24] ^ s_td1[(t2 >> 16) & 0xff] ^ s_td2[(t1 >> 8) & 0xff] ^ s_td3[t0 & 0xff] ^ ks[19];
t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[20];
t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[21];
t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[22];
t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[23];
s0 = s_td0[t0 >> 24] ^ s_td1[(t3 >> 16) & 0xff] ^ s_td2[(t2 >> 8) & 0xff] ^ s_td3[t1 & 0xff] ^ ks[24];
s1 = s_td0[t1 >> 24] ^ s_td1[(t0 >> 16) & 0xff] ^ s_td2[(t3 >> 8) & 0xff] ^ s_td3[t2 & 0xff] ^ ks[25];
s2 = s_td0[t2 >> 24] ^ s_td1[(t1 >> 16) & 0xff] ^ s_td2[(t0 >> 8) & 0xff] ^ s_td3[t3 & 0xff] ^ ks[26];
s3 = s_td0[t3 >> 24] ^ s_td1[(t2 >> 16) & 0xff] ^ s_td2[(t1 >> 8) & 0xff] ^ s_td3[t0 & 0xff] ^ ks[27];
t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[28];
t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[29];
t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[30];
t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[31];
s0 = s_td0[t0 >> 24] ^ s_td1[(t3 >> 16) & 0xff] ^ s_td2[(t2 >> 8) & 0xff] ^ s_td3[t1 & 0xff] ^ ks[32];
s1 = s_td0[t1 >> 24] ^ s_td1[(t0 >> 16) & 0xff] ^ s_td2[(t3 >> 8) & 0xff] ^ s_td3[t2 & 0xff] ^ ks[33];
s2 = s_td0[t2 >> 24] ^ s_td1[(t1 >> 16) & 0xff] ^ s_td2[(t0 >> 8) & 0xff] ^ s_td3[t3 & 0xff] ^ ks[34];
s3 = s_td0[t3 >> 24] ^ s_td1[(t2 >> 16) & 0xff] ^ s_td2[(t1 >> 8) & 0xff] ^ s_td3[t0 & 0xff] ^ ks[35];
t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[36];
t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[37];
t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[38];
t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[39];
out[0] = (s_td4[x0s3] & 0xff000000) out[0] = (s_td4[(t0 >> 24) & 0xff] & 0xff000000)
^ (s_td4[x3s2] & 0x00ff0000) ^ (s_td4[(t3 >> 16) & 0xff] & 0x00ff0000)
^ (s_td4[x2s1] & 0x0000ff00) ^ (s_td4[(t2 >> 8) & 0xff] & 0x0000ff00)
^ (s_td4[x1s0] & 0x000000ff) ^ (s_td4[(t1 >> 0) & 0xff] & 0x000000ff)
^ ks[40]; ^ ks[40];
out[1] = (s_td4[x1s3] & 0xff000000) out[1] = (s_td4[(t1 >> 24) & 0xff] & 0xff000000)
^ (s_td4[x0s2] & 0x00ff0000) ^ (s_td4[(t0 >> 16) & 0xff] & 0x00ff0000)
^ (s_td4[x3s1] & 0x0000ff00) ^ (s_td4[(t3 >> 8) & 0xff] & 0x0000ff00)
^ (s_td4[x2s0] & 0x000000ff) ^ (s_td4[(t2 >> 0) & 0xff] & 0x000000ff)
^ ks[41]; ^ ks[41];
out[2] = (s_td4[x2s3] & 0xff000000) out[2] = (s_td4[(t2 >> 24) & 0xff] & 0xff000000)
^ (s_td4[x1s2] & 0x00ff0000) ^ (s_td4[(t1 >> 16) & 0xff] & 0x00ff0000)
^ (s_td4[x0s1] & 0x0000ff00) ^ (s_td4[(t0 >> 8) & 0xff] & 0x0000ff00)
^ (s_td4[x3s0] & 0x000000ff) ^ (s_td4[(t3 >> 0) & 0xff] & 0x000000ff)
^ ks[42]; ^ ks[42];
out[3] = (s_td4[x3s3] & 0xff000000) out[3] = (s_td4[(t3 >> 24) & 0xff] & 0xff000000)
^ (s_td4[x2s2] & 0x00ff0000) ^ (s_td4[(t2 >> 16) & 0xff] & 0x00ff0000)
^ (s_td4[x1s1] & 0x0000ff00) ^ (s_td4[(t1 >> 8) & 0xff] & 0x0000ff00)
^ (s_td4[x0s0] & 0x000000ff) ^ (s_td4[(t0 >> 0) & 0xff] & 0x000000ff)
^ ks[43]; ^ ks[43];
out[0] = swap32_S (out[0]); out[0] = swap32_S (out[0]);
@ -1017,9 +1009,6 @@ DECLSPEC void aes256_ExpandKey (u32 *ks, const u32 *ukey, SHM_TYPE u32 *s_te0, S
int i; int i;
int j; int j;
#ifdef _unroll
#pragma unroll
#endif
for (int i = 0, j = 0; i < 7; i += 1, j += 8) for (int i = 0, j = 0; i < 7; i += 1, j += 8)
{ {
const u32 temp1 = ks[j + 7]; const u32 temp1 = ks[j + 7];
@ -1085,9 +1074,6 @@ DECLSPEC void aes256_InvertKey (u32 *ks, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te
temp = ks[26]; ks[26] = ks[34]; ks[34] = temp; temp = ks[26]; ks[26] = ks[34]; ks[34] = temp;
temp = ks[27]; ks[27] = ks[35]; ks[35] = temp; temp = ks[27]; ks[27] = ks[35]; ks[35] = temp;
#ifdef _unroll
#pragma unroll
#endif
for (int i = 1, j = 4; i < 14; i += 1, j += 4) for (int i = 1, j = 4; i < 14; i += 1, j += 4)
{ {
const u32 x0s0 = (ks[j + 0] >> 0) & 0xff; const u32 x0s0 = (ks[j + 0] >> 0) & 0xff;
@ -1172,83 +1158,98 @@ DECLSPEC void aes256_set_decrypt_key (u32 *ks, const u32 *ukey, SHM_TYPE u32 *s_
DECLSPEC void aes256_encrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4); DECLSPEC void aes256_encrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4);
DECLSPEC void aes256_encrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) DECLSPEC void aes256_encrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4)
{ {
const u32 in_s0 = swap32_S (in[0]); u32 in_s[4];
const u32 in_s1 = swap32_S (in[1]);
const u32 in_s2 = swap32_S (in[2]);
const u32 in_s3 = swap32_S (in[3]);
u32 t0 = in_s0 ^ ks[0]; in_s[0] = swap32_S (in[0]);
u32 t1 = in_s1 ^ ks[1]; in_s[1] = swap32_S (in[1]);
u32 t2 = in_s2 ^ ks[2]; in_s[2] = swap32_S (in[2]);
u32 t3 = in_s3 ^ ks[3]; in_s[3] = swap32_S (in[3]);
#ifdef _unroll u32 s0 = in_s[0] ^ ks[0];
#pragma unroll u32 s1 = in_s[1] ^ ks[1];
#endif u32 s2 = in_s[2] ^ ks[2];
for (int i = 4; i < 56; i += 4) u32 s3 = in_s[3] ^ ks[3];
{
const u32 x0s0 = (t0 >> 0) & 0xff;
const u32 x0s1 = (t0 >> 8) & 0xff;
const u32 x0s2 = (t0 >> 16) & 0xff;
const u32 x0s3 = (t0 >> 24) & 0xff;
const u32 x1s0 = (t1 >> 0) & 0xff;
const u32 x1s1 = (t1 >> 8) & 0xff;
const u32 x1s2 = (t1 >> 16) & 0xff;
const u32 x1s3 = (t1 >> 24) & 0xff;
const u32 x2s0 = (t2 >> 0) & 0xff;
const u32 x2s1 = (t2 >> 8) & 0xff;
const u32 x2s2 = (t2 >> 16) & 0xff;
const u32 x2s3 = (t2 >> 24) & 0xff;
const u32 x3s0 = (t3 >> 0) & 0xff;
const u32 x3s1 = (t3 >> 8) & 0xff;
const u32 x3s2 = (t3 >> 16) & 0xff;
const u32 x3s3 = (t3 >> 24) & 0xff;
t0 = s_te0[x0s3] ^ s_te1[x1s2] ^ s_te2[x2s1] ^ s_te3[x3s0] ^ ks[i + 0]; u32 t0;
t1 = s_te0[x1s3] ^ s_te1[x2s2] ^ s_te2[x3s1] ^ s_te3[x0s0] ^ ks[i + 1]; u32 t1;
t2 = s_te0[x2s3] ^ s_te1[x3s2] ^ s_te2[x0s1] ^ s_te3[x1s0] ^ ks[i + 2]; u32 t2;
t3 = s_te0[x3s3] ^ s_te1[x0s2] ^ s_te2[x1s1] ^ s_te3[x2s0] ^ ks[i + 3]; u32 t3;
}
const u32 x0s0 = (t0 >> 0) & 0xff; t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[ 4];
const u32 x0s1 = (t0 >> 8) & 0xff; t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[ 5];
const u32 x0s2 = (t0 >> 16) & 0xff; t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[ 6];
const u32 x0s3 = (t0 >> 24) & 0xff; t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[ 7];
const u32 x1s0 = (t1 >> 0) & 0xff; s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ ks[ 8];
const u32 x1s1 = (t1 >> 8) & 0xff; s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ ks[ 9];
const u32 x1s2 = (t1 >> 16) & 0xff; s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ ks[10];
const u32 x1s3 = (t1 >> 24) & 0xff; s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ ks[11];
const u32 x2s0 = (t2 >> 0) & 0xff; t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[12];
const u32 x2s1 = (t2 >> 8) & 0xff; t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[13];
const u32 x2s2 = (t2 >> 16) & 0xff; t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[14];
const u32 x2s3 = (t2 >> 24) & 0xff; t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[15];
const u32 x3s0 = (t3 >> 0) & 0xff; s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ ks[16];
const u32 x3s1 = (t3 >> 8) & 0xff; s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ ks[17];
const u32 x3s2 = (t3 >> 16) & 0xff; s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ ks[18];
const u32 x3s3 = (t3 >> 24) & 0xff; s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ ks[19];
t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[20];
t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[21];
t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[22];
t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[23];
s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ ks[24];
s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ ks[25];
s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ ks[26];
s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ ks[27];
t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[28];
t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[29];
t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[30];
t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[31];
s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ ks[32];
s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ ks[33];
s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ ks[34];
s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ ks[35];
t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[36];
t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[37];
t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[38];
t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[39];
s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ ks[40];
s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ ks[41];
s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ ks[42];
s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ ks[43];
t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[44];
t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[45];
t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[46];
t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[47];
s0 = s_te0[t0 >> 24] ^ s_te1[(t1 >> 16) & 0xff] ^ s_te2[(t2 >> 8) & 0xff] ^ s_te3[t3 & 0xff] ^ ks[48];
s1 = s_te0[t1 >> 24] ^ s_te1[(t2 >> 16) & 0xff] ^ s_te2[(t3 >> 8) & 0xff] ^ s_te3[t0 & 0xff] ^ ks[49];
s2 = s_te0[t2 >> 24] ^ s_te1[(t3 >> 16) & 0xff] ^ s_te2[(t0 >> 8) & 0xff] ^ s_te3[t1 & 0xff] ^ ks[50];
s3 = s_te0[t3 >> 24] ^ s_te1[(t0 >> 16) & 0xff] ^ s_te2[(t1 >> 8) & 0xff] ^ s_te3[t2 & 0xff] ^ ks[51];
t0 = s_te0[s0 >> 24] ^ s_te1[(s1 >> 16) & 0xff] ^ s_te2[(s2 >> 8) & 0xff] ^ s_te3[s3 & 0xff] ^ ks[52];
t1 = s_te0[s1 >> 24] ^ s_te1[(s2 >> 16) & 0xff] ^ s_te2[(s3 >> 8) & 0xff] ^ s_te3[s0 & 0xff] ^ ks[53];
t2 = s_te0[s2 >> 24] ^ s_te1[(s3 >> 16) & 0xff] ^ s_te2[(s0 >> 8) & 0xff] ^ s_te3[s1 & 0xff] ^ ks[54];
t3 = s_te0[s3 >> 24] ^ s_te1[(s0 >> 16) & 0xff] ^ s_te2[(s1 >> 8) & 0xff] ^ s_te3[s2 & 0xff] ^ ks[55];
out[0] = (s_te4[x0s3] & 0xff000000) out[0] = (s_te4[(t0 >> 24) & 0xff] & 0xff000000)
^ (s_te4[x1s2] & 0x00ff0000) ^ (s_te4[(t1 >> 16) & 0xff] & 0x00ff0000)
^ (s_te4[x2s1] & 0x0000ff00) ^ (s_te4[(t2 >> 8) & 0xff] & 0x0000ff00)
^ (s_te4[x3s0] & 0x000000ff) ^ (s_te4[(t3 >> 0) & 0xff] & 0x000000ff)
^ ks[56]; ^ ks[56];
out[1] = (s_te4[x1s3] & 0xff000000) out[1] = (s_te4[(t1 >> 24) & 0xff] & 0xff000000)
^ (s_te4[x2s2] & 0x00ff0000) ^ (s_te4[(t2 >> 16) & 0xff] & 0x00ff0000)
^ (s_te4[x3s1] & 0x0000ff00) ^ (s_te4[(t3 >> 8) & 0xff] & 0x0000ff00)
^ (s_te4[x0s0] & 0x000000ff) ^ (s_te4[(t0 >> 0) & 0xff] & 0x000000ff)
^ ks[57]; ^ ks[57];
out[2] = (s_te4[x2s3] & 0xff000000) out[2] = (s_te4[(t2 >> 24) & 0xff] & 0xff000000)
^ (s_te4[x3s2] & 0x00ff0000) ^ (s_te4[(t3 >> 16) & 0xff] & 0x00ff0000)
^ (s_te4[x0s1] & 0x0000ff00) ^ (s_te4[(t0 >> 8) & 0xff] & 0x0000ff00)
^ (s_te4[x1s0] & 0x000000ff) ^ (s_te4[(t1 >> 0) & 0xff] & 0x000000ff)
^ ks[58]; ^ ks[58];
out[3] = (s_te4[x3s3] & 0xff000000) out[3] = (s_te4[(t3 >> 24) & 0xff] & 0xff000000)
^ (s_te4[x0s2] & 0x00ff0000) ^ (s_te4[(t0 >> 16) & 0xff] & 0x00ff0000)
^ (s_te4[x1s1] & 0x0000ff00) ^ (s_te4[(t1 >> 8) & 0xff] & 0x0000ff00)
^ (s_te4[x2s0] & 0x000000ff) ^ (s_te4[(t2 >> 0) & 0xff] & 0x000000ff)
^ ks[59]; ^ ks[59];
out[0] = swap32_S (out[0]); out[0] = swap32_S (out[0]);
@ -1260,83 +1261,98 @@ DECLSPEC void aes256_encrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u
DECLSPEC void aes256_decrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4); DECLSPEC void aes256_decrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4);
DECLSPEC void aes256_decrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4) DECLSPEC void aes256_decrypt (const u32 *ks, const u32 *in, u32 *out, SHM_TYPE u32 *s_td0, SHM_TYPE u32 *s_td1, SHM_TYPE u32 *s_td2, SHM_TYPE u32 *s_td3, SHM_TYPE u32 *s_td4)
{ {
const u32 in_s0 = swap32_S (in[0]); u32 in_s[4];
const u32 in_s1 = swap32_S (in[1]);
const u32 in_s2 = swap32_S (in[2]);
const u32 in_s3 = swap32_S (in[3]);
u32 t0 = in_s0 ^ ks[0]; in_s[0] = swap32_S (in[0]);
u32 t1 = in_s1 ^ ks[1]; in_s[1] = swap32_S (in[1]);
u32 t2 = in_s2 ^ ks[2]; in_s[2] = swap32_S (in[2]);
u32 t3 = in_s3 ^ ks[3]; in_s[3] = swap32_S (in[3]);
#ifdef _unroll u32 s0 = in_s[0] ^ ks[0];
#pragma unroll u32 s1 = in_s[1] ^ ks[1];
#endif u32 s2 = in_s[2] ^ ks[2];
for (int i = 4; i < 56; i += 4) u32 s3 = in_s[3] ^ ks[3];
{
const u32 x0s0 = (t0 >> 0) & 0xff;
const u32 x0s1 = (t0 >> 8) & 0xff;
const u32 x0s2 = (t0 >> 16) & 0xff;
const u32 x0s3 = (t0 >> 24) & 0xff;
const u32 x1s0 = (t1 >> 0) & 0xff;
const u32 x1s1 = (t1 >> 8) & 0xff;
const u32 x1s2 = (t1 >> 16) & 0xff;
const u32 x1s3 = (t1 >> 24) & 0xff;
const u32 x2s0 = (t2 >> 0) & 0xff;
const u32 x2s1 = (t2 >> 8) & 0xff;
const u32 x2s2 = (t2 >> 16) & 0xff;
const u32 x2s3 = (t2 >> 24) & 0xff;
const u32 x3s0 = (t3 >> 0) & 0xff;
const u32 x3s1 = (t3 >> 8) & 0xff;
const u32 x3s2 = (t3 >> 16) & 0xff;
const u32 x3s3 = (t3 >> 24) & 0xff;
t0 = s_td0[x0s3] ^ s_td1[x3s2] ^ s_td2[x2s1] ^ s_td3[x1s0] ^ ks[i + 0]; u32 t0;
t1 = s_td0[x1s3] ^ s_td1[x0s2] ^ s_td2[x3s1] ^ s_td3[x2s0] ^ ks[i + 1]; u32 t1;
t2 = s_td0[x2s3] ^ s_td1[x1s2] ^ s_td2[x0s1] ^ s_td3[x3s0] ^ ks[i + 2]; u32 t2;
t3 = s_td0[x3s3] ^ s_td1[x2s2] ^ s_td2[x1s1] ^ s_td3[x0s0] ^ ks[i + 3]; u32 t3;
}
const u32 x0s0 = (t0 >> 0) & 0xff; t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[ 4];
const u32 x0s1 = (t0 >> 8) & 0xff; t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[ 5];
const u32 x0s2 = (t0 >> 16) & 0xff; t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[ 6];
const u32 x0s3 = (t0 >> 24) & 0xff; t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[ 7];
const u32 x1s0 = (t1 >> 0) & 0xff; s0 = s_td0[t0 >> 24] ^ s_td1[(t3 >> 16) & 0xff] ^ s_td2[(t2 >> 8) & 0xff] ^ s_td3[t1 & 0xff] ^ ks[ 8];
const u32 x1s1 = (t1 >> 8) & 0xff; s1 = s_td0[t1 >> 24] ^ s_td1[(t0 >> 16) & 0xff] ^ s_td2[(t3 >> 8) & 0xff] ^ s_td3[t2 & 0xff] ^ ks[ 9];
const u32 x1s2 = (t1 >> 16) & 0xff; s2 = s_td0[t2 >> 24] ^ s_td1[(t1 >> 16) & 0xff] ^ s_td2[(t0 >> 8) & 0xff] ^ s_td3[t3 & 0xff] ^ ks[10];
const u32 x1s3 = (t1 >> 24) & 0xff; s3 = s_td0[t3 >> 24] ^ s_td1[(t2 >> 16) & 0xff] ^ s_td2[(t1 >> 8) & 0xff] ^ s_td3[t0 & 0xff] ^ ks[11];
const u32 x2s0 = (t2 >> 0) & 0xff; t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[12];
const u32 x2s1 = (t2 >> 8) & 0xff; t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[13];
const u32 x2s2 = (t2 >> 16) & 0xff; t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[14];
const u32 x2s3 = (t2 >> 24) & 0xff; t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[15];
const u32 x3s0 = (t3 >> 0) & 0xff; s0 = s_td0[t0 >> 24] ^ s_td1[(t3 >> 16) & 0xff] ^ s_td2[(t2 >> 8) & 0xff] ^ s_td3[t1 & 0xff] ^ ks[16];
const u32 x3s1 = (t3 >> 8) & 0xff; s1 = s_td0[t1 >> 24] ^ s_td1[(t0 >> 16) & 0xff] ^ s_td2[(t3 >> 8) & 0xff] ^ s_td3[t2 & 0xff] ^ ks[17];
const u32 x3s2 = (t3 >> 16) & 0xff; s2 = s_td0[t2 >> 24] ^ s_td1[(t1 >> 16) & 0xff] ^ s_td2[(t0 >> 8) & 0xff] ^ s_td3[t3 & 0xff] ^ ks[18];
const u32 x3s3 = (t3 >> 24) & 0xff; s3 = s_td0[t3 >> 24] ^ s_td1[(t2 >> 16) & 0xff] ^ s_td2[(t1 >> 8) & 0xff] ^ s_td3[t0 & 0xff] ^ ks[19];
t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[20];
t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[21];
t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[22];
t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[23];
s0 = s_td0[t0 >> 24] ^ s_td1[(t3 >> 16) & 0xff] ^ s_td2[(t2 >> 8) & 0xff] ^ s_td3[t1 & 0xff] ^ ks[24];
s1 = s_td0[t1 >> 24] ^ s_td1[(t0 >> 16) & 0xff] ^ s_td2[(t3 >> 8) & 0xff] ^ s_td3[t2 & 0xff] ^ ks[25];
s2 = s_td0[t2 >> 24] ^ s_td1[(t1 >> 16) & 0xff] ^ s_td2[(t0 >> 8) & 0xff] ^ s_td3[t3 & 0xff] ^ ks[26];
s3 = s_td0[t3 >> 24] ^ s_td1[(t2 >> 16) & 0xff] ^ s_td2[(t1 >> 8) & 0xff] ^ s_td3[t0 & 0xff] ^ ks[27];
t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[28];
t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[29];
t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[30];
t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[31];
s0 = s_td0[t0 >> 24] ^ s_td1[(t3 >> 16) & 0xff] ^ s_td2[(t2 >> 8) & 0xff] ^ s_td3[t1 & 0xff] ^ ks[32];
s1 = s_td0[t1 >> 24] ^ s_td1[(t0 >> 16) & 0xff] ^ s_td2[(t3 >> 8) & 0xff] ^ s_td3[t2 & 0xff] ^ ks[33];
s2 = s_td0[t2 >> 24] ^ s_td1[(t1 >> 16) & 0xff] ^ s_td2[(t0 >> 8) & 0xff] ^ s_td3[t3 & 0xff] ^ ks[34];
s3 = s_td0[t3 >> 24] ^ s_td1[(t2 >> 16) & 0xff] ^ s_td2[(t1 >> 8) & 0xff] ^ s_td3[t0 & 0xff] ^ ks[35];
t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[36];
t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[37];
t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[38];
t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[39];
s0 = s_td0[t0 >> 24] ^ s_td1[(t3 >> 16) & 0xff] ^ s_td2[(t2 >> 8) & 0xff] ^ s_td3[t1 & 0xff] ^ ks[40];
s1 = s_td0[t1 >> 24] ^ s_td1[(t0 >> 16) & 0xff] ^ s_td2[(t3 >> 8) & 0xff] ^ s_td3[t2 & 0xff] ^ ks[41];
s2 = s_td0[t2 >> 24] ^ s_td1[(t1 >> 16) & 0xff] ^ s_td2[(t0 >> 8) & 0xff] ^ s_td3[t3 & 0xff] ^ ks[42];
s3 = s_td0[t3 >> 24] ^ s_td1[(t2 >> 16) & 0xff] ^ s_td2[(t1 >> 8) & 0xff] ^ s_td3[t0 & 0xff] ^ ks[43];
t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[44];
t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[45];
t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[46];
t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[47];
s0 = s_td0[t0 >> 24] ^ s_td1[(t3 >> 16) & 0xff] ^ s_td2[(t2 >> 8) & 0xff] ^ s_td3[t1 & 0xff] ^ ks[48];
s1 = s_td0[t1 >> 24] ^ s_td1[(t0 >> 16) & 0xff] ^ s_td2[(t3 >> 8) & 0xff] ^ s_td3[t2 & 0xff] ^ ks[49];
s2 = s_td0[t2 >> 24] ^ s_td1[(t1 >> 16) & 0xff] ^ s_td2[(t0 >> 8) & 0xff] ^ s_td3[t3 & 0xff] ^ ks[50];
s3 = s_td0[t3 >> 24] ^ s_td1[(t2 >> 16) & 0xff] ^ s_td2[(t1 >> 8) & 0xff] ^ s_td3[t0 & 0xff] ^ ks[51];
t0 = s_td0[s0 >> 24] ^ s_td1[(s3 >> 16) & 0xff] ^ s_td2[(s2 >> 8) & 0xff] ^ s_td3[s1 & 0xff] ^ ks[52];
t1 = s_td0[s1 >> 24] ^ s_td1[(s0 >> 16) & 0xff] ^ s_td2[(s3 >> 8) & 0xff] ^ s_td3[s2 & 0xff] ^ ks[53];
t2 = s_td0[s2 >> 24] ^ s_td1[(s1 >> 16) & 0xff] ^ s_td2[(s0 >> 8) & 0xff] ^ s_td3[s3 & 0xff] ^ ks[54];
t3 = s_td0[s3 >> 24] ^ s_td1[(s2 >> 16) & 0xff] ^ s_td2[(s1 >> 8) & 0xff] ^ s_td3[s0 & 0xff] ^ ks[55];
out[0] = (s_td4[x0s3] & 0xff000000) out[0] = (s_td4[(t0 >> 24) & 0xff] & 0xff000000)
^ (s_td4[x3s2] & 0x00ff0000) ^ (s_td4[(t3 >> 16) & 0xff] & 0x00ff0000)
^ (s_td4[x2s1] & 0x0000ff00) ^ (s_td4[(t2 >> 8) & 0xff] & 0x0000ff00)
^ (s_td4[x1s0] & 0x000000ff) ^ (s_td4[(t1 >> 0) & 0xff] & 0x000000ff)
^ ks[56]; ^ ks[56];
out[1] = (s_td4[x1s3] & 0xff000000) out[1] = (s_td4[(t1 >> 24) & 0xff] & 0xff000000)
^ (s_td4[x0s2] & 0x00ff0000) ^ (s_td4[(t0 >> 16) & 0xff] & 0x00ff0000)
^ (s_td4[x3s1] & 0x0000ff00) ^ (s_td4[(t3 >> 8) & 0xff] & 0x0000ff00)
^ (s_td4[x2s0] & 0x000000ff) ^ (s_td4[(t2 >> 0) & 0xff] & 0x000000ff)
^ ks[57]; ^ ks[57];
out[2] = (s_td4[x2s3] & 0xff000000) out[2] = (s_td4[(t2 >> 24) & 0xff] & 0xff000000)
^ (s_td4[x1s2] & 0x00ff0000) ^ (s_td4[(t1 >> 16) & 0xff] & 0x00ff0000)
^ (s_td4[x0s1] & 0x0000ff00) ^ (s_td4[(t0 >> 8) & 0xff] & 0x0000ff00)
^ (s_td4[x3s0] & 0x000000ff) ^ (s_td4[(t3 >> 0) & 0xff] & 0x000000ff)
^ ks[58]; ^ ks[58];
out[3] = (s_td4[x3s3] & 0xff000000) out[3] = (s_td4[(t3 >> 24) & 0xff] & 0xff000000)
^ (s_td4[x2s2] & 0x00ff0000) ^ (s_td4[(t2 >> 16) & 0xff] & 0x00ff0000)
^ (s_td4[x1s1] & 0x0000ff00) ^ (s_td4[(t1 >> 8) & 0xff] & 0x0000ff00)
^ (s_td4[x0s0] & 0x000000ff) ^ (s_td4[(t0 >> 0) & 0xff] & 0x000000ff)
^ ks[59]; ^ ks[59];
out[0] = swap32_S (out[0]); out[0] = swap32_S (out[0]);

View File

@ -366,9 +366,6 @@ DECLSPEC void _des_crypt_encrypt (u32x *iv, u32x *data, u32x *Kc, u32x *Kd, SHM_
r = rotl32 (r, 3u); r = rotl32 (r, 3u);
l = rotl32 (l, 3u); l = rotl32 (l, 3u);
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < 16; i += 2) for (u32 i = 0; i < 16; i += 2)
{ {
u32x u; u32x u;
@ -419,9 +416,6 @@ DECLSPEC void _des_crypt_decrypt (u32x *iv, u32x *data, u32x *Kc, u32x *Kd, SHM_
r = rotl32 (r, 3u); r = rotl32 (r, 3u);
l = rotl32 (l, 3u); l = rotl32 (l, 3u);
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 16; i > 0; i -= 2) for (u32 i = 16; i > 0; i -= 2)
{ {
u32x u; u32x u;
@ -478,9 +472,6 @@ DECLSPEC void _des_crypt_keysetup (u32x c, u32x d, u32x *Kc, u32x *Kd, SHM_TYPE
c = c & 0x0fffffff; c = c & 0x0fffffff;
#ifdef _unroll
#pragma unroll
#endif
for (u32 i = 0; i < 16; i++) for (u32 i = 0; i < 16; i++)
{ {
if ((i < 2) || (i == 8) || (i == 15)) if ((i < 2) || (i == 8) || (i == 15))

View File

@ -406,31 +406,146 @@
DECLSPEC void serpent128_set_key (u32 *ks, const u32 *ukey); DECLSPEC void serpent128_set_key (u32 *ks, const u32 *ukey);
DECLSPEC void serpent128_set_key (u32 *ks, const u32 *ukey) DECLSPEC void serpent128_set_key (u32 *ks, const u32 *ukey)
{ {
#ifdef _unroll ks[ 0] = ukey[0];
#pragma unroll ks[ 1] = ukey[1];
#endif ks[ 2] = ukey[2];
for (int i = 0; i < 4; i++) ks[ 3] = ukey[3];
{ ks[ 4] = 1;
ks[i] = ukey[i]; ks[ 5] = 0;
} ks[ 6] = 0;
ks[ 7] = 0;
#ifdef _unroll ks[ 8] = rotl32_S ((ks[ 7] ^ ks[ 5] ^ ks[ 3] ^ ks[ 0] ^ 0x9e3779b9 ^ 0), 11);
#pragma unroll ks[ 9] = rotl32_S ((ks[ 8] ^ ks[ 6] ^ ks[ 4] ^ ks[ 1] ^ 0x9e3779b9 ^ 1), 11);
#endif ks[ 10] = rotl32_S ((ks[ 9] ^ ks[ 7] ^ ks[ 5] ^ ks[ 2] ^ 0x9e3779b9 ^ 2), 11);
for (int i = 4; i < 8; i++) ks[ 11] = rotl32_S ((ks[ 10] ^ ks[ 8] ^ ks[ 6] ^ ks[ 3] ^ 0x9e3779b9 ^ 3), 11);
{ ks[ 12] = rotl32_S ((ks[ 11] ^ ks[ 9] ^ ks[ 7] ^ ks[ 4] ^ 0x9e3779b9 ^ 4), 11);
ks[i] = 0; ks[ 13] = rotl32_S ((ks[ 12] ^ ks[ 10] ^ ks[ 8] ^ ks[ 5] ^ 0x9e3779b9 ^ 5), 11);
} ks[ 14] = rotl32_S ((ks[ 13] ^ ks[ 11] ^ ks[ 9] ^ ks[ 6] ^ 0x9e3779b9 ^ 6), 11);
ks[ 15] = rotl32_S ((ks[ 14] ^ ks[ 12] ^ ks[ 10] ^ ks[ 7] ^ 0x9e3779b9 ^ 7), 11);
ks[4] = 1; ks[ 16] = rotl32_S ((ks[ 15] ^ ks[ 13] ^ ks[ 11] ^ ks[ 8] ^ 0x9e3779b9 ^ 8), 11);
ks[ 17] = rotl32_S ((ks[ 16] ^ ks[ 14] ^ ks[ 12] ^ ks[ 9] ^ 0x9e3779b9 ^ 9), 11);
#ifdef _unroll ks[ 18] = rotl32_S ((ks[ 17] ^ ks[ 15] ^ ks[ 13] ^ ks[ 10] ^ 0x9e3779b9 ^ 10), 11);
#pragma unroll ks[ 19] = rotl32_S ((ks[ 18] ^ ks[ 16] ^ ks[ 14] ^ ks[ 11] ^ 0x9e3779b9 ^ 11), 11);
#endif ks[ 20] = rotl32_S ((ks[ 19] ^ ks[ 17] ^ ks[ 15] ^ ks[ 12] ^ 0x9e3779b9 ^ 12), 11);
for (int i = 0; i < 132; i++) ks[ 21] = rotl32_S ((ks[ 20] ^ ks[ 18] ^ ks[ 16] ^ ks[ 13] ^ 0x9e3779b9 ^ 13), 11);
{ ks[ 22] = rotl32_S ((ks[ 21] ^ ks[ 19] ^ ks[ 17] ^ ks[ 14] ^ 0x9e3779b9 ^ 14), 11);
ks[i + 8] = rotl32_S (ks[i + 7] ^ ks[i + 5] ^ ks[i + 3] ^ ks[i + 0] ^ 0x9e3779b9 ^ i, 11); ks[ 23] = rotl32_S ((ks[ 22] ^ ks[ 20] ^ ks[ 18] ^ ks[ 15] ^ 0x9e3779b9 ^ 15), 11);
} ks[ 24] = rotl32_S ((ks[ 23] ^ ks[ 21] ^ ks[ 19] ^ ks[ 16] ^ 0x9e3779b9 ^ 16), 11);
ks[ 25] = rotl32_S ((ks[ 24] ^ ks[ 22] ^ ks[ 20] ^ ks[ 17] ^ 0x9e3779b9 ^ 17), 11);
ks[ 26] = rotl32_S ((ks[ 25] ^ ks[ 23] ^ ks[ 21] ^ ks[ 18] ^ 0x9e3779b9 ^ 18), 11);
ks[ 27] = rotl32_S ((ks[ 26] ^ ks[ 24] ^ ks[ 22] ^ ks[ 19] ^ 0x9e3779b9 ^ 19), 11);
ks[ 28] = rotl32_S ((ks[ 27] ^ ks[ 25] ^ ks[ 23] ^ ks[ 20] ^ 0x9e3779b9 ^ 20), 11);
ks[ 29] = rotl32_S ((ks[ 28] ^ ks[ 26] ^ ks[ 24] ^ ks[ 21] ^ 0x9e3779b9 ^ 21), 11);
ks[ 30] = rotl32_S ((ks[ 29] ^ ks[ 27] ^ ks[ 25] ^ ks[ 22] ^ 0x9e3779b9 ^ 22), 11);
ks[ 31] = rotl32_S ((ks[ 30] ^ ks[ 28] ^ ks[ 26] ^ ks[ 23] ^ 0x9e3779b9 ^ 23), 11);
ks[ 32] = rotl32_S ((ks[ 31] ^ ks[ 29] ^ ks[ 27] ^ ks[ 24] ^ 0x9e3779b9 ^ 24), 11);
ks[ 33] = rotl32_S ((ks[ 32] ^ ks[ 30] ^ ks[ 28] ^ ks[ 25] ^ 0x9e3779b9 ^ 25), 11);
ks[ 34] = rotl32_S ((ks[ 33] ^ ks[ 31] ^ ks[ 29] ^ ks[ 26] ^ 0x9e3779b9 ^ 26), 11);
ks[ 35] = rotl32_S ((ks[ 34] ^ ks[ 32] ^ ks[ 30] ^ ks[ 27] ^ 0x9e3779b9 ^ 27), 11);
ks[ 36] = rotl32_S ((ks[ 35] ^ ks[ 33] ^ ks[ 31] ^ ks[ 28] ^ 0x9e3779b9 ^ 28), 11);
ks[ 37] = rotl32_S ((ks[ 36] ^ ks[ 34] ^ ks[ 32] ^ ks[ 29] ^ 0x9e3779b9 ^ 29), 11);
ks[ 38] = rotl32_S ((ks[ 37] ^ ks[ 35] ^ ks[ 33] ^ ks[ 30] ^ 0x9e3779b9 ^ 30), 11);
ks[ 39] = rotl32_S ((ks[ 38] ^ ks[ 36] ^ ks[ 34] ^ ks[ 31] ^ 0x9e3779b9 ^ 31), 11);
ks[ 40] = rotl32_S ((ks[ 39] ^ ks[ 37] ^ ks[ 35] ^ ks[ 32] ^ 0x9e3779b9 ^ 32), 11);
ks[ 41] = rotl32_S ((ks[ 40] ^ ks[ 38] ^ ks[ 36] ^ ks[ 33] ^ 0x9e3779b9 ^ 33), 11);
ks[ 42] = rotl32_S ((ks[ 41] ^ ks[ 39] ^ ks[ 37] ^ ks[ 34] ^ 0x9e3779b9 ^ 34), 11);
ks[ 43] = rotl32_S ((ks[ 42] ^ ks[ 40] ^ ks[ 38] ^ ks[ 35] ^ 0x9e3779b9 ^ 35), 11);
ks[ 44] = rotl32_S ((ks[ 43] ^ ks[ 41] ^ ks[ 39] ^ ks[ 36] ^ 0x9e3779b9 ^ 36), 11);
ks[ 45] = rotl32_S ((ks[ 44] ^ ks[ 42] ^ ks[ 40] ^ ks[ 37] ^ 0x9e3779b9 ^ 37), 11);
ks[ 46] = rotl32_S ((ks[ 45] ^ ks[ 43] ^ ks[ 41] ^ ks[ 38] ^ 0x9e3779b9 ^ 38), 11);
ks[ 47] = rotl32_S ((ks[ 46] ^ ks[ 44] ^ ks[ 42] ^ ks[ 39] ^ 0x9e3779b9 ^ 39), 11);
ks[ 48] = rotl32_S ((ks[ 47] ^ ks[ 45] ^ ks[ 43] ^ ks[ 40] ^ 0x9e3779b9 ^ 40), 11);
ks[ 49] = rotl32_S ((ks[ 48] ^ ks[ 46] ^ ks[ 44] ^ ks[ 41] ^ 0x9e3779b9 ^ 41), 11);
ks[ 50] = rotl32_S ((ks[ 49] ^ ks[ 47] ^ ks[ 45] ^ ks[ 42] ^ 0x9e3779b9 ^ 42), 11);
ks[ 51] = rotl32_S ((ks[ 50] ^ ks[ 48] ^ ks[ 46] ^ ks[ 43] ^ 0x9e3779b9 ^ 43), 11);
ks[ 52] = rotl32_S ((ks[ 51] ^ ks[ 49] ^ ks[ 47] ^ ks[ 44] ^ 0x9e3779b9 ^ 44), 11);
ks[ 53] = rotl32_S ((ks[ 52] ^ ks[ 50] ^ ks[ 48] ^ ks[ 45] ^ 0x9e3779b9 ^ 45), 11);
ks[ 54] = rotl32_S ((ks[ 53] ^ ks[ 51] ^ ks[ 49] ^ ks[ 46] ^ 0x9e3779b9 ^ 46), 11);
ks[ 55] = rotl32_S ((ks[ 54] ^ ks[ 52] ^ ks[ 50] ^ ks[ 47] ^ 0x9e3779b9 ^ 47), 11);
ks[ 56] = rotl32_S ((ks[ 55] ^ ks[ 53] ^ ks[ 51] ^ ks[ 48] ^ 0x9e3779b9 ^ 48), 11);
ks[ 57] = rotl32_S ((ks[ 56] ^ ks[ 54] ^ ks[ 52] ^ ks[ 49] ^ 0x9e3779b9 ^ 49), 11);
ks[ 58] = rotl32_S ((ks[ 57] ^ ks[ 55] ^ ks[ 53] ^ ks[ 50] ^ 0x9e3779b9 ^ 50), 11);
ks[ 59] = rotl32_S ((ks[ 58] ^ ks[ 56] ^ ks[ 54] ^ ks[ 51] ^ 0x9e3779b9 ^ 51), 11);
ks[ 60] = rotl32_S ((ks[ 59] ^ ks[ 57] ^ ks[ 55] ^ ks[ 52] ^ 0x9e3779b9 ^ 52), 11);
ks[ 61] = rotl32_S ((ks[ 60] ^ ks[ 58] ^ ks[ 56] ^ ks[ 53] ^ 0x9e3779b9 ^ 53), 11);
ks[ 62] = rotl32_S ((ks[ 61] ^ ks[ 59] ^ ks[ 57] ^ ks[ 54] ^ 0x9e3779b9 ^ 54), 11);
ks[ 63] = rotl32_S ((ks[ 62] ^ ks[ 60] ^ ks[ 58] ^ ks[ 55] ^ 0x9e3779b9 ^ 55), 11);
ks[ 64] = rotl32_S ((ks[ 63] ^ ks[ 61] ^ ks[ 59] ^ ks[ 56] ^ 0x9e3779b9 ^ 56), 11);
ks[ 65] = rotl32_S ((ks[ 64] ^ ks[ 62] ^ ks[ 60] ^ ks[ 57] ^ 0x9e3779b9 ^ 57), 11);
ks[ 66] = rotl32_S ((ks[ 65] ^ ks[ 63] ^ ks[ 61] ^ ks[ 58] ^ 0x9e3779b9 ^ 58), 11);
ks[ 67] = rotl32_S ((ks[ 66] ^ ks[ 64] ^ ks[ 62] ^ ks[ 59] ^ 0x9e3779b9 ^ 59), 11);
ks[ 68] = rotl32_S ((ks[ 67] ^ ks[ 65] ^ ks[ 63] ^ ks[ 60] ^ 0x9e3779b9 ^ 60), 11);
ks[ 69] = rotl32_S ((ks[ 68] ^ ks[ 66] ^ ks[ 64] ^ ks[ 61] ^ 0x9e3779b9 ^ 61), 11);
ks[ 70] = rotl32_S ((ks[ 69] ^ ks[ 67] ^ ks[ 65] ^ ks[ 62] ^ 0x9e3779b9 ^ 62), 11);
ks[ 71] = rotl32_S ((ks[ 70] ^ ks[ 68] ^ ks[ 66] ^ ks[ 63] ^ 0x9e3779b9 ^ 63), 11);
ks[ 72] = rotl32_S ((ks[ 71] ^ ks[ 69] ^ ks[ 67] ^ ks[ 64] ^ 0x9e3779b9 ^ 64), 11);
ks[ 73] = rotl32_S ((ks[ 72] ^ ks[ 70] ^ ks[ 68] ^ ks[ 65] ^ 0x9e3779b9 ^ 65), 11);
ks[ 74] = rotl32_S ((ks[ 73] ^ ks[ 71] ^ ks[ 69] ^ ks[ 66] ^ 0x9e3779b9 ^ 66), 11);
ks[ 75] = rotl32_S ((ks[ 74] ^ ks[ 72] ^ ks[ 70] ^ ks[ 67] ^ 0x9e3779b9 ^ 67), 11);
ks[ 76] = rotl32_S ((ks[ 75] ^ ks[ 73] ^ ks[ 71] ^ ks[ 68] ^ 0x9e3779b9 ^ 68), 11);
ks[ 77] = rotl32_S ((ks[ 76] ^ ks[ 74] ^ ks[ 72] ^ ks[ 69] ^ 0x9e3779b9 ^ 69), 11);
ks[ 78] = rotl32_S ((ks[ 77] ^ ks[ 75] ^ ks[ 73] ^ ks[ 70] ^ 0x9e3779b9 ^ 70), 11);
ks[ 79] = rotl32_S ((ks[ 78] ^ ks[ 76] ^ ks[ 74] ^ ks[ 71] ^ 0x9e3779b9 ^ 71), 11);
ks[ 80] = rotl32_S ((ks[ 79] ^ ks[ 77] ^ ks[ 75] ^ ks[ 72] ^ 0x9e3779b9 ^ 72), 11);
ks[ 81] = rotl32_S ((ks[ 80] ^ ks[ 78] ^ ks[ 76] ^ ks[ 73] ^ 0x9e3779b9 ^ 73), 11);
ks[ 82] = rotl32_S ((ks[ 81] ^ ks[ 79] ^ ks[ 77] ^ ks[ 74] ^ 0x9e3779b9 ^ 74), 11);
ks[ 83] = rotl32_S ((ks[ 82] ^ ks[ 80] ^ ks[ 78] ^ ks[ 75] ^ 0x9e3779b9 ^ 75), 11);
ks[ 84] = rotl32_S ((ks[ 83] ^ ks[ 81] ^ ks[ 79] ^ ks[ 76] ^ 0x9e3779b9 ^ 76), 11);
ks[ 85] = rotl32_S ((ks[ 84] ^ ks[ 82] ^ ks[ 80] ^ ks[ 77] ^ 0x9e3779b9 ^ 77), 11);
ks[ 86] = rotl32_S ((ks[ 85] ^ ks[ 83] ^ ks[ 81] ^ ks[ 78] ^ 0x9e3779b9 ^ 78), 11);
ks[ 87] = rotl32_S ((ks[ 86] ^ ks[ 84] ^ ks[ 82] ^ ks[ 79] ^ 0x9e3779b9 ^ 79), 11);
ks[ 88] = rotl32_S ((ks[ 87] ^ ks[ 85] ^ ks[ 83] ^ ks[ 80] ^ 0x9e3779b9 ^ 80), 11);
ks[ 89] = rotl32_S ((ks[ 88] ^ ks[ 86] ^ ks[ 84] ^ ks[ 81] ^ 0x9e3779b9 ^ 81), 11);
ks[ 90] = rotl32_S ((ks[ 89] ^ ks[ 87] ^ ks[ 85] ^ ks[ 82] ^ 0x9e3779b9 ^ 82), 11);
ks[ 91] = rotl32_S ((ks[ 90] ^ ks[ 88] ^ ks[ 86] ^ ks[ 83] ^ 0x9e3779b9 ^ 83), 11);
ks[ 92] = rotl32_S ((ks[ 91] ^ ks[ 89] ^ ks[ 87] ^ ks[ 84] ^ 0x9e3779b9 ^ 84), 11);
ks[ 93] = rotl32_S ((ks[ 92] ^ ks[ 90] ^ ks[ 88] ^ ks[ 85] ^ 0x9e3779b9 ^ 85), 11);
ks[ 94] = rotl32_S ((ks[ 93] ^ ks[ 91] ^ ks[ 89] ^ ks[ 86] ^ 0x9e3779b9 ^ 86), 11);
ks[ 95] = rotl32_S ((ks[ 94] ^ ks[ 92] ^ ks[ 90] ^ ks[ 87] ^ 0x9e3779b9 ^ 87), 11);
ks[ 96] = rotl32_S ((ks[ 95] ^ ks[ 93] ^ ks[ 91] ^ ks[ 88] ^ 0x9e3779b9 ^ 88), 11);
ks[ 97] = rotl32_S ((ks[ 96] ^ ks[ 94] ^ ks[ 92] ^ ks[ 89] ^ 0x9e3779b9 ^ 89), 11);
ks[ 98] = rotl32_S ((ks[ 97] ^ ks[ 95] ^ ks[ 93] ^ ks[ 90] ^ 0x9e3779b9 ^ 90), 11);
ks[ 99] = rotl32_S ((ks[ 98] ^ ks[ 96] ^ ks[ 94] ^ ks[ 91] ^ 0x9e3779b9 ^ 91), 11);
ks[100] = rotl32_S ((ks[ 99] ^ ks[ 97] ^ ks[ 95] ^ ks[ 92] ^ 0x9e3779b9 ^ 92), 11);
ks[101] = rotl32_S ((ks[100] ^ ks[ 98] ^ ks[ 96] ^ ks[ 93] ^ 0x9e3779b9 ^ 93), 11);
ks[102] = rotl32_S ((ks[101] ^ ks[ 99] ^ ks[ 97] ^ ks[ 94] ^ 0x9e3779b9 ^ 94), 11);
ks[103] = rotl32_S ((ks[102] ^ ks[100] ^ ks[ 98] ^ ks[ 95] ^ 0x9e3779b9 ^ 95), 11);
ks[104] = rotl32_S ((ks[103] ^ ks[101] ^ ks[ 99] ^ ks[ 96] ^ 0x9e3779b9 ^ 96), 11);
ks[105] = rotl32_S ((ks[104] ^ ks[102] ^ ks[100] ^ ks[ 97] ^ 0x9e3779b9 ^ 97), 11);
ks[106] = rotl32_S ((ks[105] ^ ks[103] ^ ks[101] ^ ks[ 98] ^ 0x9e3779b9 ^ 98), 11);
ks[107] = rotl32_S ((ks[106] ^ ks[104] ^ ks[102] ^ ks[ 99] ^ 0x9e3779b9 ^ 99), 11);
ks[108] = rotl32_S ((ks[107] ^ ks[105] ^ ks[103] ^ ks[100] ^ 0x9e3779b9 ^ 100), 11);
ks[109] = rotl32_S ((ks[108] ^ ks[106] ^ ks[104] ^ ks[101] ^ 0x9e3779b9 ^ 101), 11);
ks[110] = rotl32_S ((ks[109] ^ ks[107] ^ ks[105] ^ ks[102] ^ 0x9e3779b9 ^ 102), 11);
ks[111] = rotl32_S ((ks[110] ^ ks[108] ^ ks[106] ^ ks[103] ^ 0x9e3779b9 ^ 103), 11);
ks[112] = rotl32_S ((ks[111] ^ ks[109] ^ ks[107] ^ ks[104] ^ 0x9e3779b9 ^ 104), 11);
ks[113] = rotl32_S ((ks[112] ^ ks[110] ^ ks[108] ^ ks[105] ^ 0x9e3779b9 ^ 105), 11);
ks[114] = rotl32_S ((ks[113] ^ ks[111] ^ ks[109] ^ ks[106] ^ 0x9e3779b9 ^ 106), 11);
ks[115] = rotl32_S ((ks[114] ^ ks[112] ^ ks[110] ^ ks[107] ^ 0x9e3779b9 ^ 107), 11);
ks[116] = rotl32_S ((ks[115] ^ ks[113] ^ ks[111] ^ ks[108] ^ 0x9e3779b9 ^ 108), 11);
ks[117] = rotl32_S ((ks[116] ^ ks[114] ^ ks[112] ^ ks[109] ^ 0x9e3779b9 ^ 109), 11);
ks[118] = rotl32_S ((ks[117] ^ ks[115] ^ ks[113] ^ ks[110] ^ 0x9e3779b9 ^ 110), 11);
ks[119] = rotl32_S ((ks[118] ^ ks[116] ^ ks[114] ^ ks[111] ^ 0x9e3779b9 ^ 111), 11);
ks[120] = rotl32_S ((ks[119] ^ ks[117] ^ ks[115] ^ ks[112] ^ 0x9e3779b9 ^ 112), 11);
ks[121] = rotl32_S ((ks[120] ^ ks[118] ^ ks[116] ^ ks[113] ^ 0x9e3779b9 ^ 113), 11);
ks[122] = rotl32_S ((ks[121] ^ ks[119] ^ ks[117] ^ ks[114] ^ 0x9e3779b9 ^ 114), 11);
ks[123] = rotl32_S ((ks[122] ^ ks[120] ^ ks[118] ^ ks[115] ^ 0x9e3779b9 ^ 115), 11);
ks[124] = rotl32_S ((ks[123] ^ ks[121] ^ ks[119] ^ ks[116] ^ 0x9e3779b9 ^ 116), 11);
ks[125] = rotl32_S ((ks[124] ^ ks[122] ^ ks[120] ^ ks[117] ^ 0x9e3779b9 ^ 117), 11);
ks[126] = rotl32_S ((ks[125] ^ ks[123] ^ ks[121] ^ ks[118] ^ 0x9e3779b9 ^ 118), 11);
ks[127] = rotl32_S ((ks[126] ^ ks[124] ^ ks[122] ^ ks[119] ^ 0x9e3779b9 ^ 119), 11);
ks[128] = rotl32_S ((ks[127] ^ ks[125] ^ ks[123] ^ ks[120] ^ 0x9e3779b9 ^ 120), 11);
ks[129] = rotl32_S ((ks[128] ^ ks[126] ^ ks[124] ^ ks[121] ^ 0x9e3779b9 ^ 121), 11);
ks[130] = rotl32_S ((ks[129] ^ ks[127] ^ ks[125] ^ ks[122] ^ 0x9e3779b9 ^ 122), 11);
ks[131] = rotl32_S ((ks[130] ^ ks[128] ^ ks[126] ^ ks[123] ^ 0x9e3779b9 ^ 123), 11);
ks[132] = rotl32_S ((ks[131] ^ ks[129] ^ ks[127] ^ ks[124] ^ 0x9e3779b9 ^ 124), 11);
ks[133] = rotl32_S ((ks[132] ^ ks[130] ^ ks[128] ^ ks[125] ^ 0x9e3779b9 ^ 125), 11);
ks[134] = rotl32_S ((ks[133] ^ ks[131] ^ ks[129] ^ ks[126] ^ 0x9e3779b9 ^ 126), 11);
ks[135] = rotl32_S ((ks[134] ^ ks[132] ^ ks[130] ^ ks[127] ^ 0x9e3779b9 ^ 127), 11);
ks[136] = rotl32_S ((ks[135] ^ ks[133] ^ ks[131] ^ ks[128] ^ 0x9e3779b9 ^ 128), 11);
ks[137] = rotl32_S ((ks[136] ^ ks[134] ^ ks[132] ^ ks[129] ^ 0x9e3779b9 ^ 129), 11);
ks[138] = rotl32_S ((ks[137] ^ ks[135] ^ ks[133] ^ ks[130] ^ 0x9e3779b9 ^ 130), 11);
ks[139] = rotl32_S ((ks[138] ^ ks[136] ^ ks[134] ^ ks[131] ^ 0x9e3779b9 ^ 131), 11);
u32 a,b,c,d,e,f,g,h; u32 a,b,c,d,e,f,g,h;
u32 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16; u32 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16;
@ -577,21 +692,146 @@ DECLSPEC void serpent128_decrypt (const u32 *ks, const u32 *in, u32 *out)
DECLSPEC void serpent256_set_key (u32 *ks, const u32 *ukey); DECLSPEC void serpent256_set_key (u32 *ks, const u32 *ukey);
DECLSPEC void serpent256_set_key (u32 *ks, const u32 *ukey) DECLSPEC void serpent256_set_key (u32 *ks, const u32 *ukey)
{ {
#ifdef _unroll ks[ 0] = ukey[0];
#pragma unroll ks[ 1] = ukey[1];
#endif ks[ 2] = ukey[2];
for (int i = 0; i < 8; i++) ks[ 3] = ukey[3];
{ ks[ 4] = ukey[4];
ks[i] = ukey[i]; ks[ 5] = ukey[5];
} ks[ 6] = ukey[6];
ks[ 7] = ukey[7];
#ifdef _unroll ks[ 8] = rotl32_S ((ks[ 7] ^ ks[ 5] ^ ks[ 3] ^ ks[ 0] ^ 0x9e3779b9 ^ 0), 11);
#pragma unroll ks[ 9] = rotl32_S ((ks[ 8] ^ ks[ 6] ^ ks[ 4] ^ ks[ 1] ^ 0x9e3779b9 ^ 1), 11);
#endif ks[ 10] = rotl32_S ((ks[ 9] ^ ks[ 7] ^ ks[ 5] ^ ks[ 2] ^ 0x9e3779b9 ^ 2), 11);
for (int i = 0; i < 132; i++) ks[ 11] = rotl32_S ((ks[ 10] ^ ks[ 8] ^ ks[ 6] ^ ks[ 3] ^ 0x9e3779b9 ^ 3), 11);
{ ks[ 12] = rotl32_S ((ks[ 11] ^ ks[ 9] ^ ks[ 7] ^ ks[ 4] ^ 0x9e3779b9 ^ 4), 11);
ks[i + 8] = rotl32_S (ks[i + 7] ^ ks[i + 5] ^ ks[i + 3] ^ ks[i + 0] ^ 0x9e3779b9 ^ i, 11); ks[ 13] = rotl32_S ((ks[ 12] ^ ks[ 10] ^ ks[ 8] ^ ks[ 5] ^ 0x9e3779b9 ^ 5), 11);
} ks[ 14] = rotl32_S ((ks[ 13] ^ ks[ 11] ^ ks[ 9] ^ ks[ 6] ^ 0x9e3779b9 ^ 6), 11);
ks[ 15] = rotl32_S ((ks[ 14] ^ ks[ 12] ^ ks[ 10] ^ ks[ 7] ^ 0x9e3779b9 ^ 7), 11);
ks[ 16] = rotl32_S ((ks[ 15] ^ ks[ 13] ^ ks[ 11] ^ ks[ 8] ^ 0x9e3779b9 ^ 8), 11);
ks[ 17] = rotl32_S ((ks[ 16] ^ ks[ 14] ^ ks[ 12] ^ ks[ 9] ^ 0x9e3779b9 ^ 9), 11);
ks[ 18] = rotl32_S ((ks[ 17] ^ ks[ 15] ^ ks[ 13] ^ ks[ 10] ^ 0x9e3779b9 ^ 10), 11);
ks[ 19] = rotl32_S ((ks[ 18] ^ ks[ 16] ^ ks[ 14] ^ ks[ 11] ^ 0x9e3779b9 ^ 11), 11);
ks[ 20] = rotl32_S ((ks[ 19] ^ ks[ 17] ^ ks[ 15] ^ ks[ 12] ^ 0x9e3779b9 ^ 12), 11);
ks[ 21] = rotl32_S ((ks[ 20] ^ ks[ 18] ^ ks[ 16] ^ ks[ 13] ^ 0x9e3779b9 ^ 13), 11);
ks[ 22] = rotl32_S ((ks[ 21] ^ ks[ 19] ^ ks[ 17] ^ ks[ 14] ^ 0x9e3779b9 ^ 14), 11);
ks[ 23] = rotl32_S ((ks[ 22] ^ ks[ 20] ^ ks[ 18] ^ ks[ 15] ^ 0x9e3779b9 ^ 15), 11);
ks[ 24] = rotl32_S ((ks[ 23] ^ ks[ 21] ^ ks[ 19] ^ ks[ 16] ^ 0x9e3779b9 ^ 16), 11);
ks[ 25] = rotl32_S ((ks[ 24] ^ ks[ 22] ^ ks[ 20] ^ ks[ 17] ^ 0x9e3779b9 ^ 17), 11);
ks[ 26] = rotl32_S ((ks[ 25] ^ ks[ 23] ^ ks[ 21] ^ ks[ 18] ^ 0x9e3779b9 ^ 18), 11);
ks[ 27] = rotl32_S ((ks[ 26] ^ ks[ 24] ^ ks[ 22] ^ ks[ 19] ^ 0x9e3779b9 ^ 19), 11);
ks[ 28] = rotl32_S ((ks[ 27] ^ ks[ 25] ^ ks[ 23] ^ ks[ 20] ^ 0x9e3779b9 ^ 20), 11);
ks[ 29] = rotl32_S ((ks[ 28] ^ ks[ 26] ^ ks[ 24] ^ ks[ 21] ^ 0x9e3779b9 ^ 21), 11);
ks[ 30] = rotl32_S ((ks[ 29] ^ ks[ 27] ^ ks[ 25] ^ ks[ 22] ^ 0x9e3779b9 ^ 22), 11);
ks[ 31] = rotl32_S ((ks[ 30] ^ ks[ 28] ^ ks[ 26] ^ ks[ 23] ^ 0x9e3779b9 ^ 23), 11);
ks[ 32] = rotl32_S ((ks[ 31] ^ ks[ 29] ^ ks[ 27] ^ ks[ 24] ^ 0x9e3779b9 ^ 24), 11);
ks[ 33] = rotl32_S ((ks[ 32] ^ ks[ 30] ^ ks[ 28] ^ ks[ 25] ^ 0x9e3779b9 ^ 25), 11);
ks[ 34] = rotl32_S ((ks[ 33] ^ ks[ 31] ^ ks[ 29] ^ ks[ 26] ^ 0x9e3779b9 ^ 26), 11);
ks[ 35] = rotl32_S ((ks[ 34] ^ ks[ 32] ^ ks[ 30] ^ ks[ 27] ^ 0x9e3779b9 ^ 27), 11);
ks[ 36] = rotl32_S ((ks[ 35] ^ ks[ 33] ^ ks[ 31] ^ ks[ 28] ^ 0x9e3779b9 ^ 28), 11);
ks[ 37] = rotl32_S ((ks[ 36] ^ ks[ 34] ^ ks[ 32] ^ ks[ 29] ^ 0x9e3779b9 ^ 29), 11);
ks[ 38] = rotl32_S ((ks[ 37] ^ ks[ 35] ^ ks[ 33] ^ ks[ 30] ^ 0x9e3779b9 ^ 30), 11);
ks[ 39] = rotl32_S ((ks[ 38] ^ ks[ 36] ^ ks[ 34] ^ ks[ 31] ^ 0x9e3779b9 ^ 31), 11);
ks[ 40] = rotl32_S ((ks[ 39] ^ ks[ 37] ^ ks[ 35] ^ ks[ 32] ^ 0x9e3779b9 ^ 32), 11);
ks[ 41] = rotl32_S ((ks[ 40] ^ ks[ 38] ^ ks[ 36] ^ ks[ 33] ^ 0x9e3779b9 ^ 33), 11);
ks[ 42] = rotl32_S ((ks[ 41] ^ ks[ 39] ^ ks[ 37] ^ ks[ 34] ^ 0x9e3779b9 ^ 34), 11);
ks[ 43] = rotl32_S ((ks[ 42] ^ ks[ 40] ^ ks[ 38] ^ ks[ 35] ^ 0x9e3779b9 ^ 35), 11);
ks[ 44] = rotl32_S ((ks[ 43] ^ ks[ 41] ^ ks[ 39] ^ ks[ 36] ^ 0x9e3779b9 ^ 36), 11);
ks[ 45] = rotl32_S ((ks[ 44] ^ ks[ 42] ^ ks[ 40] ^ ks[ 37] ^ 0x9e3779b9 ^ 37), 11);
ks[ 46] = rotl32_S ((ks[ 45] ^ ks[ 43] ^ ks[ 41] ^ ks[ 38] ^ 0x9e3779b9 ^ 38), 11);
ks[ 47] = rotl32_S ((ks[ 46] ^ ks[ 44] ^ ks[ 42] ^ ks[ 39] ^ 0x9e3779b9 ^ 39), 11);
ks[ 48] = rotl32_S ((ks[ 47] ^ ks[ 45] ^ ks[ 43] ^ ks[ 40] ^ 0x9e3779b9 ^ 40), 11);
ks[ 49] = rotl32_S ((ks[ 48] ^ ks[ 46] ^ ks[ 44] ^ ks[ 41] ^ 0x9e3779b9 ^ 41), 11);
ks[ 50] = rotl32_S ((ks[ 49] ^ ks[ 47] ^ ks[ 45] ^ ks[ 42] ^ 0x9e3779b9 ^ 42), 11);
ks[ 51] = rotl32_S ((ks[ 50] ^ ks[ 48] ^ ks[ 46] ^ ks[ 43] ^ 0x9e3779b9 ^ 43), 11);
ks[ 52] = rotl32_S ((ks[ 51] ^ ks[ 49] ^ ks[ 47] ^ ks[ 44] ^ 0x9e3779b9 ^ 44), 11);
ks[ 53] = rotl32_S ((ks[ 52] ^ ks[ 50] ^ ks[ 48] ^ ks[ 45] ^ 0x9e3779b9 ^ 45), 11);
ks[ 54] = rotl32_S ((ks[ 53] ^ ks[ 51] ^ ks[ 49] ^ ks[ 46] ^ 0x9e3779b9 ^ 46), 11);
ks[ 55] = rotl32_S ((ks[ 54] ^ ks[ 52] ^ ks[ 50] ^ ks[ 47] ^ 0x9e3779b9 ^ 47), 11);
ks[ 56] = rotl32_S ((ks[ 55] ^ ks[ 53] ^ ks[ 51] ^ ks[ 48] ^ 0x9e3779b9 ^ 48), 11);
ks[ 57] = rotl32_S ((ks[ 56] ^ ks[ 54] ^ ks[ 52] ^ ks[ 49] ^ 0x9e3779b9 ^ 49), 11);
ks[ 58] = rotl32_S ((ks[ 57] ^ ks[ 55] ^ ks[ 53] ^ ks[ 50] ^ 0x9e3779b9 ^ 50), 11);
ks[ 59] = rotl32_S ((ks[ 58] ^ ks[ 56] ^ ks[ 54] ^ ks[ 51] ^ 0x9e3779b9 ^ 51), 11);
ks[ 60] = rotl32_S ((ks[ 59] ^ ks[ 57] ^ ks[ 55] ^ ks[ 52] ^ 0x9e3779b9 ^ 52), 11);
ks[ 61] = rotl32_S ((ks[ 60] ^ ks[ 58] ^ ks[ 56] ^ ks[ 53] ^ 0x9e3779b9 ^ 53), 11);
ks[ 62] = rotl32_S ((ks[ 61] ^ ks[ 59] ^ ks[ 57] ^ ks[ 54] ^ 0x9e3779b9 ^ 54), 11);
ks[ 63] = rotl32_S ((ks[ 62] ^ ks[ 60] ^ ks[ 58] ^ ks[ 55] ^ 0x9e3779b9 ^ 55), 11);
ks[ 64] = rotl32_S ((ks[ 63] ^ ks[ 61] ^ ks[ 59] ^ ks[ 56] ^ 0x9e3779b9 ^ 56), 11);
ks[ 65] = rotl32_S ((ks[ 64] ^ ks[ 62] ^ ks[ 60] ^ ks[ 57] ^ 0x9e3779b9 ^ 57), 11);
ks[ 66] = rotl32_S ((ks[ 65] ^ ks[ 63] ^ ks[ 61] ^ ks[ 58] ^ 0x9e3779b9 ^ 58), 11);
ks[ 67] = rotl32_S ((ks[ 66] ^ ks[ 64] ^ ks[ 62] ^ ks[ 59] ^ 0x9e3779b9 ^ 59), 11);
ks[ 68] = rotl32_S ((ks[ 67] ^ ks[ 65] ^ ks[ 63] ^ ks[ 60] ^ 0x9e3779b9 ^ 60), 11);
ks[ 69] = rotl32_S ((ks[ 68] ^ ks[ 66] ^ ks[ 64] ^ ks[ 61] ^ 0x9e3779b9 ^ 61), 11);
ks[ 70] = rotl32_S ((ks[ 69] ^ ks[ 67] ^ ks[ 65] ^ ks[ 62] ^ 0x9e3779b9 ^ 62), 11);
ks[ 71] = rotl32_S ((ks[ 70] ^ ks[ 68] ^ ks[ 66] ^ ks[ 63] ^ 0x9e3779b9 ^ 63), 11);
ks[ 72] = rotl32_S ((ks[ 71] ^ ks[ 69] ^ ks[ 67] ^ ks[ 64] ^ 0x9e3779b9 ^ 64), 11);
ks[ 73] = rotl32_S ((ks[ 72] ^ ks[ 70] ^ ks[ 68] ^ ks[ 65] ^ 0x9e3779b9 ^ 65), 11);
ks[ 74] = rotl32_S ((ks[ 73] ^ ks[ 71] ^ ks[ 69] ^ ks[ 66] ^ 0x9e3779b9 ^ 66), 11);
ks[ 75] = rotl32_S ((ks[ 74] ^ ks[ 72] ^ ks[ 70] ^ ks[ 67] ^ 0x9e3779b9 ^ 67), 11);
ks[ 76] = rotl32_S ((ks[ 75] ^ ks[ 73] ^ ks[ 71] ^ ks[ 68] ^ 0x9e3779b9 ^ 68), 11);
ks[ 77] = rotl32_S ((ks[ 76] ^ ks[ 74] ^ ks[ 72] ^ ks[ 69] ^ 0x9e3779b9 ^ 69), 11);
ks[ 78] = rotl32_S ((ks[ 77] ^ ks[ 75] ^ ks[ 73] ^ ks[ 70] ^ 0x9e3779b9 ^ 70), 11);
ks[ 79] = rotl32_S ((ks[ 78] ^ ks[ 76] ^ ks[ 74] ^ ks[ 71] ^ 0x9e3779b9 ^ 71), 11);
ks[ 80] = rotl32_S ((ks[ 79] ^ ks[ 77] ^ ks[ 75] ^ ks[ 72] ^ 0x9e3779b9 ^ 72), 11);
ks[ 81] = rotl32_S ((ks[ 80] ^ ks[ 78] ^ ks[ 76] ^ ks[ 73] ^ 0x9e3779b9 ^ 73), 11);
ks[ 82] = rotl32_S ((ks[ 81] ^ ks[ 79] ^ ks[ 77] ^ ks[ 74] ^ 0x9e3779b9 ^ 74), 11);
ks[ 83] = rotl32_S ((ks[ 82] ^ ks[ 80] ^ ks[ 78] ^ ks[ 75] ^ 0x9e3779b9 ^ 75), 11);
ks[ 84] = rotl32_S ((ks[ 83] ^ ks[ 81] ^ ks[ 79] ^ ks[ 76] ^ 0x9e3779b9 ^ 76), 11);
ks[ 85] = rotl32_S ((ks[ 84] ^ ks[ 82] ^ ks[ 80] ^ ks[ 77] ^ 0x9e3779b9 ^ 77), 11);
ks[ 86] = rotl32_S ((ks[ 85] ^ ks[ 83] ^ ks[ 81] ^ ks[ 78] ^ 0x9e3779b9 ^ 78), 11);
ks[ 87] = rotl32_S ((ks[ 86] ^ ks[ 84] ^ ks[ 82] ^ ks[ 79] ^ 0x9e3779b9 ^ 79), 11);
ks[ 88] = rotl32_S ((ks[ 87] ^ ks[ 85] ^ ks[ 83] ^ ks[ 80] ^ 0x9e3779b9 ^ 80), 11);
ks[ 89] = rotl32_S ((ks[ 88] ^ ks[ 86] ^ ks[ 84] ^ ks[ 81] ^ 0x9e3779b9 ^ 81), 11);
ks[ 90] = rotl32_S ((ks[ 89] ^ ks[ 87] ^ ks[ 85] ^ ks[ 82] ^ 0x9e3779b9 ^ 82), 11);
ks[ 91] = rotl32_S ((ks[ 90] ^ ks[ 88] ^ ks[ 86] ^ ks[ 83] ^ 0x9e3779b9 ^ 83), 11);
ks[ 92] = rotl32_S ((ks[ 91] ^ ks[ 89] ^ ks[ 87] ^ ks[ 84] ^ 0x9e3779b9 ^ 84), 11);
ks[ 93] = rotl32_S ((ks[ 92] ^ ks[ 90] ^ ks[ 88] ^ ks[ 85] ^ 0x9e3779b9 ^ 85), 11);
ks[ 94] = rotl32_S ((ks[ 93] ^ ks[ 91] ^ ks[ 89] ^ ks[ 86] ^ 0x9e3779b9 ^ 86), 11);
ks[ 95] = rotl32_S ((ks[ 94] ^ ks[ 92] ^ ks[ 90] ^ ks[ 87] ^ 0x9e3779b9 ^ 87), 11);
ks[ 96] = rotl32_S ((ks[ 95] ^ ks[ 93] ^ ks[ 91] ^ ks[ 88] ^ 0x9e3779b9 ^ 88), 11);
ks[ 97] = rotl32_S ((ks[ 96] ^ ks[ 94] ^ ks[ 92] ^ ks[ 89] ^ 0x9e3779b9 ^ 89), 11);
ks[ 98] = rotl32_S ((ks[ 97] ^ ks[ 95] ^ ks[ 93] ^ ks[ 90] ^ 0x9e3779b9 ^ 90), 11);
ks[ 99] = rotl32_S ((ks[ 98] ^ ks[ 96] ^ ks[ 94] ^ ks[ 91] ^ 0x9e3779b9 ^ 91), 11);
ks[100] = rotl32_S ((ks[ 99] ^ ks[ 97] ^ ks[ 95] ^ ks[ 92] ^ 0x9e3779b9 ^ 92), 11);
ks[101] = rotl32_S ((ks[100] ^ ks[ 98] ^ ks[ 96] ^ ks[ 93] ^ 0x9e3779b9 ^ 93), 11);
ks[102] = rotl32_S ((ks[101] ^ ks[ 99] ^ ks[ 97] ^ ks[ 94] ^ 0x9e3779b9 ^ 94), 11);
ks[103] = rotl32_S ((ks[102] ^ ks[100] ^ ks[ 98] ^ ks[ 95] ^ 0x9e3779b9 ^ 95), 11);
ks[104] = rotl32_S ((ks[103] ^ ks[101] ^ ks[ 99] ^ ks[ 96] ^ 0x9e3779b9 ^ 96), 11);
ks[105] = rotl32_S ((ks[104] ^ ks[102] ^ ks[100] ^ ks[ 97] ^ 0x9e3779b9 ^ 97), 11);
ks[106] = rotl32_S ((ks[105] ^ ks[103] ^ ks[101] ^ ks[ 98] ^ 0x9e3779b9 ^ 98), 11);
ks[107] = rotl32_S ((ks[106] ^ ks[104] ^ ks[102] ^ ks[ 99] ^ 0x9e3779b9 ^ 99), 11);
ks[108] = rotl32_S ((ks[107] ^ ks[105] ^ ks[103] ^ ks[100] ^ 0x9e3779b9 ^ 100), 11);
ks[109] = rotl32_S ((ks[108] ^ ks[106] ^ ks[104] ^ ks[101] ^ 0x9e3779b9 ^ 101), 11);
ks[110] = rotl32_S ((ks[109] ^ ks[107] ^ ks[105] ^ ks[102] ^ 0x9e3779b9 ^ 102), 11);
ks[111] = rotl32_S ((ks[110] ^ ks[108] ^ ks[106] ^ ks[103] ^ 0x9e3779b9 ^ 103), 11);
ks[112] = rotl32_S ((ks[111] ^ ks[109] ^ ks[107] ^ ks[104] ^ 0x9e3779b9 ^ 104), 11);
ks[113] = rotl32_S ((ks[112] ^ ks[110] ^ ks[108] ^ ks[105] ^ 0x9e3779b9 ^ 105), 11);
ks[114] = rotl32_S ((ks[113] ^ ks[111] ^ ks[109] ^ ks[106] ^ 0x9e3779b9 ^ 106), 11);
ks[115] = rotl32_S ((ks[114] ^ ks[112] ^ ks[110] ^ ks[107] ^ 0x9e3779b9 ^ 107), 11);
ks[116] = rotl32_S ((ks[115] ^ ks[113] ^ ks[111] ^ ks[108] ^ 0x9e3779b9 ^ 108), 11);
ks[117] = rotl32_S ((ks[116] ^ ks[114] ^ ks[112] ^ ks[109] ^ 0x9e3779b9 ^ 109), 11);
ks[118] = rotl32_S ((ks[117] ^ ks[115] ^ ks[113] ^ ks[110] ^ 0x9e3779b9 ^ 110), 11);
ks[119] = rotl32_S ((ks[118] ^ ks[116] ^ ks[114] ^ ks[111] ^ 0x9e3779b9 ^ 111), 11);
ks[120] = rotl32_S ((ks[119] ^ ks[117] ^ ks[115] ^ ks[112] ^ 0x9e3779b9 ^ 112), 11);
ks[121] = rotl32_S ((ks[120] ^ ks[118] ^ ks[116] ^ ks[113] ^ 0x9e3779b9 ^ 113), 11);
ks[122] = rotl32_S ((ks[121] ^ ks[119] ^ ks[117] ^ ks[114] ^ 0x9e3779b9 ^ 114), 11);
ks[123] = rotl32_S ((ks[122] ^ ks[120] ^ ks[118] ^ ks[115] ^ 0x9e3779b9 ^ 115), 11);
ks[124] = rotl32_S ((ks[123] ^ ks[121] ^ ks[119] ^ ks[116] ^ 0x9e3779b9 ^ 116), 11);
ks[125] = rotl32_S ((ks[124] ^ ks[122] ^ ks[120] ^ ks[117] ^ 0x9e3779b9 ^ 117), 11);
ks[126] = rotl32_S ((ks[125] ^ ks[123] ^ ks[121] ^ ks[118] ^ 0x9e3779b9 ^ 118), 11);
ks[127] = rotl32_S ((ks[126] ^ ks[124] ^ ks[122] ^ ks[119] ^ 0x9e3779b9 ^ 119), 11);
ks[128] = rotl32_S ((ks[127] ^ ks[125] ^ ks[123] ^ ks[120] ^ 0x9e3779b9 ^ 120), 11);
ks[129] = rotl32_S ((ks[128] ^ ks[126] ^ ks[124] ^ ks[121] ^ 0x9e3779b9 ^ 121), 11);
ks[130] = rotl32_S ((ks[129] ^ ks[127] ^ ks[125] ^ ks[122] ^ 0x9e3779b9 ^ 122), 11);
ks[131] = rotl32_S ((ks[130] ^ ks[128] ^ ks[126] ^ ks[123] ^ 0x9e3779b9 ^ 123), 11);
ks[132] = rotl32_S ((ks[131] ^ ks[129] ^ ks[127] ^ ks[124] ^ 0x9e3779b9 ^ 124), 11);
ks[133] = rotl32_S ((ks[132] ^ ks[130] ^ ks[128] ^ ks[125] ^ 0x9e3779b9 ^ 125), 11);
ks[134] = rotl32_S ((ks[133] ^ ks[131] ^ ks[129] ^ ks[126] ^ 0x9e3779b9 ^ 126), 11);
ks[135] = rotl32_S ((ks[134] ^ ks[132] ^ ks[130] ^ ks[127] ^ 0x9e3779b9 ^ 127), 11);
ks[136] = rotl32_S ((ks[135] ^ ks[133] ^ ks[131] ^ ks[128] ^ 0x9e3779b9 ^ 128), 11);
ks[137] = rotl32_S ((ks[136] ^ ks[134] ^ ks[132] ^ ks[129] ^ 0x9e3779b9 ^ 129), 11);
ks[138] = rotl32_S ((ks[137] ^ ks[135] ^ ks[133] ^ ks[130] ^ 0x9e3779b9 ^ 130), 11);
ks[139] = rotl32_S ((ks[138] ^ ks[136] ^ ks[134] ^ ks[131] ^ 0x9e3779b9 ^ 131), 11);
u32 a,b,c,d,e,f,g,h; u32 a,b,c,d,e,f,g,h;
u32 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16; u32 t1,t2,t3,t4,t5,t6,t7,t8,t9,t10,t11,t12,t13,t14,t15,t16;