1
mirror of https://github.com/hashcat/hashcat synced 2024-12-23 14:13:43 +01:00

Optimize some switch_buffer_* functions for generic OpenCL devices (CPU, various OSX, ...)

This commit is contained in:
jsteube 2019-03-01 14:49:00 +01:00
parent 23917455ef
commit e1fe3e755b
12 changed files with 8345 additions and 8694 deletions

File diff suppressed because it is too large Load Diff

View File

@ -361,16 +361,46 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
return amd_bfe (a, b, c);
}
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const u32x c)
DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const u32 c)
{
return amd_bytealign (a, b, c);
}
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c)
DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const u32 c)
{
return amd_bytealign (a, b, c);
}
// Vector byte-align: merge `a` and `b` at a byte offset taken from `c`.
//
// Only the low two bits of `c` are used. With n = c & 3:
//   n == 0 -> b is returned unchanged
//   n != 0 -> b is shifted left by n bytes and the top n bytes of a are
//             placed in the vacated low bytes
//
// Every shift amount is a literal constant, so no shift by the full word
// width can occur.
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const u32 c)
{
  const u32 byte_shift = c & 3;

  if (byte_shift == 1) return (a >> 24) | (b <<  8);
  if (byte_shift == 2) return (a >> 16) | (b << 16);
  if (byte_shift == 3) return (a >>  8) | (b << 24);

  // byte_shift == 0: nothing to merge
  return b;
}
// Scalar byte-align: merge `a` and `b` at a byte offset taken from `c`.
//
// Only the low two bits of `c` are used. With n = c & 3:
//   n == 0 -> b is returned unchanged
//   n != 0 -> result is (a >> (32 - 8n)) | (b << 8n), written out with
//             literal shift counts for each n
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  const u32 byte_shift = c & 3;

  if (byte_shift == 0)
  {
    return b;
  }

  if (byte_shift == 1)
  {
    return (a >> 24) | (b << 8);
  }

  if (byte_shift == 2)
  {
    return (a >> 16) | (b << 16);
  }

  // byte_shift == 3
  return (a >> 8) | (b << 24);
}
#if HAS_VPERM
DECLSPEC u32x hc_byte_perm (const u32x a, const u32x b, const u32x c)
{
@ -1089,44 +1119,64 @@ DECLSPEC u32 hc_bfe_S (const u32 a, const u32 b, const u32 c)
#undef BFE
}
// Vector byte-align, opposite shift direction to hc_bytealign ().
//
// Only the low two bits of `c` are used. With n = c & 3:
//   n == 0 -> b is returned unchanged
//   n != 0 -> b is shifted right by n bytes and the low n bytes of a are
//             placed in the vacated high bytes
//
// Every shift amount is a literal constant, so no shift by the full word
// width can occur.
DECLSPEC u32x hc_bytealign_be (const u32x a, const u32x b, const u32 c)
{
  const u32 byte_shift = c & 3;

  if (byte_shift == 1) return (a << 24) | (b >>  8);
  if (byte_shift == 2) return (a << 16) | (b >> 16);
  if (byte_shift == 3) return (a <<  8) | (b >> 24);

  // byte_shift == 0: nothing to merge
  return b;
}
// Scalar byte-align, opposite shift direction to hc_bytealign_S ().
//
// Only the low two bits of `c` are used. With n = c & 3:
//   n == 0 -> b is returned unchanged
//   n != 0 -> result is (a << (32 - 8n)) | (b >> 8n), written out with
//             literal shift counts for each n
DECLSPEC u32 hc_bytealign_be_S (const u32 a, const u32 b, const u32 c)
{
  const u32 byte_shift = c & 3;

  if (byte_shift == 0)
  {
    return b;
  }

  if (byte_shift == 1)
  {
    return (a << 24) | (b >> 8);
  }

  if (byte_shift == 2)
  {
    return (a << 16) | (b >> 16);
  }

  // byte_shift == 3
  return (a << 8) | (b >> 24);
}
// Vector byte-align for the generic code path: merge `a` and `b` at a byte
// offset taken from `c`.
//
// Only the low two bits of `c` are used. With n = c & 3:
//   n == 0 -> b is returned unchanged
//   n != 0 -> b is shifted left by n bytes and the top n bytes of a are
//             placed in the vacated low bytes
//
// A single implementation serves every VECT_SIZE: the shifts apply to the
// u32x operands directly, so no per-width 64-bit widening is required.
DECLSPEC u32x hc_bytealign (const u32x a, const u32x b, const u32 c)
{
  u32x r;

  switch (c & 3)
  {
    case 1:  r = (a >> 24) | (b <<  8); break;
    case 2:  r = (a >> 16) | (b << 16); break;
    case 3:  r = (a >>  8) | (b << 24); break;
    default: r = b;                     break; // c & 3 == 0
  }

  return r;
}
// Scalar byte-align for the generic code path: merge `a` and `b` at a byte
// offset taken from `c`.
//
// Only the low two bits of `c` are used. With n = c & 3:
//   n == 0 -> b is returned unchanged
//   n != 0 -> result is (a >> (32 - 8n)) | (b << 8n), written out with
//             literal shift counts for each n
DECLSPEC u32 hc_bytealign_S (const u32 a, const u32 b, const u32 c)
{
  u32 r;

  switch (c & 3)
  {
    case 1:  r = (a >> 24) | (b <<  8); break;
    case 2:  r = (a >> 16) | (b << 16); break;
    case 3:  r = (a >>  8) | (b << 24); break;
    default: r = b;                     break; // c & 3 == 0
  }

  return r;
}
DECLSPEC u32x hc_add3 (const u32x a, const u32x b, const u32x c)

View File

@ -30,22 +30,16 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
@ -143,23 +137,17 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
@ -256,16 +244,12 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in0 = append[0];
u32 in1 = append[1];
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
#endif
#ifdef IS_NV

View File

@ -29,22 +29,16 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
@ -142,23 +136,17 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
@ -255,16 +243,12 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in0 = append[0];
u32 in1 = append[1];
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
#endif
#ifdef IS_NV

View File

@ -2117,11 +2117,11 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = swap32_S (append[4]);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = append[4];
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
@ -2129,13 +2129,6 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp5 = hc_bytealign (in4, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
tmp5 = swap32_S (tmp5);
#endif
#ifdef IS_NV

View File

@ -26,22 +26,16 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
@ -139,23 +133,17 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3,
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
@ -252,16 +240,12 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in0 = append[0];
u32 in1 = append[1];
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
#endif
#ifdef IS_NV

View File

@ -97,22 +97,16 @@ DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u3
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
@ -238,22 +232,16 @@ DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
@ -405,22 +393,16 @@ DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u3
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
@ -554,23 +536,17 @@ DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, cons
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = hc_bytealign ( 0, in0, offset);
tmp1 = hc_bytealign (in0, in1, offset);
tmp2 = hc_bytealign (in1, in2, offset);
tmp3 = hc_bytealign (in2, in3, offset);
tmp4 = hc_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV

View File

@ -200,11 +200,11 @@ DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl,
#if defined IS_AMD || defined IS_GENERIC
for (i = 0; i < pd; i++) sc[idx++] = pw[i];
sc[idx++] = pw[i]
| hc_bytealign (bl[0], 0, pm4);
for (i = 1; i < bd; i++) sc[idx++] = hc_bytealign (bl[i], bl[i - 1], pm4);
sc[idx++] = hc_bytealign (sc[0], bl[i - 1], pm4);
for (i = 1; i < 4; i++) sc[idx++] = hc_bytealign (sc[i], sc[i - 1], pm4);
sc[idx++] = hc_bytealign ( 0, sc[i - 1], pm4);
| hc_bytealign_be (bl[0], 0, pm4);
for (i = 1; i < bd; i++) sc[idx++] = hc_bytealign_be (bl[i], bl[i - 1], pm4);
sc[idx++] = hc_bytealign_be (sc[0], bl[i - 1], pm4);
for (i = 1; i < 4; i++) sc[idx++] = hc_bytealign_be (sc[i], sc[i - 1], pm4);
sc[idx++] = hc_bytealign_be ( 0, sc[i - 1], pm4);
#endif
#ifdef IS_NV
@ -229,10 +229,10 @@ DECLSPEC void make_pt_with_offset (u32 *pt, const u32 offset, const u32 *sc, con
const u32 od = m / 4;
#if defined IS_AMD || defined IS_GENERIC
pt[0] = hc_bytealign (sc[od + 1], sc[od + 0], om);
pt[1] = hc_bytealign (sc[od + 2], sc[od + 1], om);
pt[2] = hc_bytealign (sc[od + 3], sc[od + 2], om);
pt[3] = hc_bytealign (sc[od + 4], sc[od + 3], om);
pt[0] = hc_bytealign_be (sc[od + 1], sc[od + 0], om);
pt[1] = hc_bytealign_be (sc[od + 2], sc[od + 1], om);
pt[2] = hc_bytealign_be (sc[od + 3], sc[od + 2], om);
pt[3] = hc_bytealign_be (sc[od + 4], sc[od + 3], om);
#endif
#ifdef IS_NV

View File

@ -20,6 +20,11 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co
u32 tmp0;
u32 tmp1;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = hc_bytealign_be (0, append, func_len);
tmp1 = hc_bytealign_be (append, 0, func_len);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff;
@ -27,11 +32,6 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co
tmp1 = hc_byte_perm (0, append, selector);
#endif
#if defined IS_AMD || defined IS_GENERIC
tmp0 = hc_bytealign (0, append, func_len);
tmp1 = hc_bytealign (append, 0, func_len);
#endif
u32 carry = 0;
switch (div)

View File

@ -45,23 +45,23 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
u32x tmp16;
#if defined IS_AMD || defined IS_GENERIC
tmp00 = hc_bytealign ( 0, carry[ 0], offset);
tmp01 = hc_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = hc_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = hc_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = hc_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = hc_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = hc_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = hc_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = hc_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = hc_bytealign (carry[ 9], carry[10], offset);
tmp11 = hc_bytealign (carry[10], carry[11], offset);
tmp12 = hc_bytealign (carry[11], carry[12], offset);
tmp13 = hc_bytealign (carry[12], carry[13], offset);
tmp14 = hc_bytealign (carry[13], carry[14], offset);
tmp15 = hc_bytealign (carry[14], carry[15], offset);
tmp16 = hc_bytealign (carry[15], 0, offset);
tmp00 = hc_bytealign_be ( 0, carry[ 0], offset);
tmp01 = hc_bytealign_be (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign_be (carry[ 1], carry[ 2], offset);
tmp03 = hc_bytealign_be (carry[ 2], carry[ 3], offset);
tmp04 = hc_bytealign_be (carry[ 3], carry[ 4], offset);
tmp05 = hc_bytealign_be (carry[ 4], carry[ 5], offset);
tmp06 = hc_bytealign_be (carry[ 5], carry[ 6], offset);
tmp07 = hc_bytealign_be (carry[ 6], carry[ 7], offset);
tmp08 = hc_bytealign_be (carry[ 7], carry[ 8], offset);
tmp09 = hc_bytealign_be (carry[ 8], carry[ 9], offset);
tmp10 = hc_bytealign_be (carry[ 9], carry[10], offset);
tmp11 = hc_bytealign_be (carry[10], carry[11], offset);
tmp12 = hc_bytealign_be (carry[11], carry[12], offset);
tmp13 = hc_bytealign_be (carry[12], carry[13], offset);
tmp14 = hc_bytealign_be (carry[13], carry[14], offset);
tmp15 = hc_bytealign_be (carry[14], carry[15], offset);
tmp16 = hc_bytealign_be (carry[15], 0, offset);
#endif
#ifdef IS_NV

View File

@ -43,23 +43,23 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
u32x tmp16;
#if defined IS_AMD || defined IS_GENERIC
tmp00 = hc_bytealign ( 0, carry[ 0], offset);
tmp01 = hc_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = hc_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = hc_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = hc_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = hc_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = hc_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = hc_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = hc_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = hc_bytealign (carry[ 9], carry[10], offset);
tmp11 = hc_bytealign (carry[10], carry[11], offset);
tmp12 = hc_bytealign (carry[11], carry[12], offset);
tmp13 = hc_bytealign (carry[12], carry[13], offset);
tmp14 = hc_bytealign (carry[13], carry[14], offset);
tmp15 = hc_bytealign (carry[14], carry[15], offset);
tmp16 = hc_bytealign (carry[15], 0, offset);
tmp00 = hc_bytealign_be ( 0, carry[ 0], offset);
tmp01 = hc_bytealign_be (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign_be (carry[ 1], carry[ 2], offset);
tmp03 = hc_bytealign_be (carry[ 2], carry[ 3], offset);
tmp04 = hc_bytealign_be (carry[ 3], carry[ 4], offset);
tmp05 = hc_bytealign_be (carry[ 4], carry[ 5], offset);
tmp06 = hc_bytealign_be (carry[ 5], carry[ 6], offset);
tmp07 = hc_bytealign_be (carry[ 6], carry[ 7], offset);
tmp08 = hc_bytealign_be (carry[ 7], carry[ 8], offset);
tmp09 = hc_bytealign_be (carry[ 8], carry[ 9], offset);
tmp10 = hc_bytealign_be (carry[ 9], carry[10], offset);
tmp11 = hc_bytealign_be (carry[10], carry[11], offset);
tmp12 = hc_bytealign_be (carry[11], carry[12], offset);
tmp13 = hc_bytealign_be (carry[12], carry[13], offset);
tmp14 = hc_bytealign_be (carry[13], carry[14], offset);
tmp15 = hc_bytealign_be (carry[14], carry[15], offset);
tmp16 = hc_bytealign_be (carry[15], 0, offset);
#endif
#ifdef IS_NV

View File

@ -42,23 +42,23 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry)
u32x tmp16;
#if defined IS_AMD || defined IS_GENERIC
tmp00 = hc_bytealign ( 0, carry[ 0], offset);
tmp01 = hc_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = hc_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = hc_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = hc_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = hc_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = hc_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = hc_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = hc_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = hc_bytealign (carry[ 9], carry[10], offset);
tmp11 = hc_bytealign (carry[10], carry[11], offset);
tmp12 = hc_bytealign (carry[11], carry[12], offset);
tmp13 = hc_bytealign (carry[12], carry[13], offset);
tmp14 = hc_bytealign (carry[13], carry[14], offset);
tmp15 = hc_bytealign (carry[14], carry[15], offset);
tmp16 = hc_bytealign (carry[15], 0, offset);
tmp00 = hc_bytealign_be ( 0, carry[ 0], offset);
tmp01 = hc_bytealign_be (carry[ 0], carry[ 1], offset);
tmp02 = hc_bytealign_be (carry[ 1], carry[ 2], offset);
tmp03 = hc_bytealign_be (carry[ 2], carry[ 3], offset);
tmp04 = hc_bytealign_be (carry[ 3], carry[ 4], offset);
tmp05 = hc_bytealign_be (carry[ 4], carry[ 5], offset);
tmp06 = hc_bytealign_be (carry[ 5], carry[ 6], offset);
tmp07 = hc_bytealign_be (carry[ 6], carry[ 7], offset);
tmp08 = hc_bytealign_be (carry[ 7], carry[ 8], offset);
tmp09 = hc_bytealign_be (carry[ 8], carry[ 9], offset);
tmp10 = hc_bytealign_be (carry[ 9], carry[10], offset);
tmp11 = hc_bytealign_be (carry[10], carry[11], offset);
tmp12 = hc_bytealign_be (carry[11], carry[12], offset);
tmp13 = hc_bytealign_be (carry[12], carry[13], offset);
tmp14 = hc_bytealign_be (carry[13], carry[14], offset);
tmp15 = hc_bytealign_be (carry[14], carry[15], offset);
tmp16 = hc_bytealign_be (carry[15], 0, offset);
#endif
#ifdef IS_NV