1
mirror of https://github.com/hashcat/hashcat synced 2024-11-24 14:27:14 +01:00

Rewrite code around amd_bytealign to be of type BE to save a branch afterwards

This commit is contained in:
jsteube 2017-08-05 19:46:56 +02:00
parent 6bafc385dc
commit c5c12f89c1
15 changed files with 23824 additions and 8856 deletions

File diff suppressed because it is too large Load Diff

View File

@ -760,7 +760,6 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
u32 s0 = 0;
u32 s1 = 0;
u32 s2 = 0;
@ -769,64 +768,69 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
u32 s5 = 0;
u32 s6 = 0;
u32 s7 = 0;
u32 s8 = 0;
#if defined IS_AMD || defined IS_GENERIC
const u32 src_r00 = swap32_S (src_r0[0]);
const u32 src_r01 = swap32_S (src_r0[1]);
const u32 src_r02 = swap32_S (src_r0[2]);
const u32 src_r03 = swap32_S (src_r0[3]);
const u32 src_r10 = swap32_S (src_r1[0]);
const u32 src_r11 = swap32_S (src_r1[1]);
const u32 src_r12 = swap32_S (src_r1[2]);
const u32 src_r13 = swap32_S (src_r1[3]);
switch (offset / 4)
{
case 0:
s8 = amd_bytealign_S ( 0, src_r1[3], offset_minus_4);
s7 = amd_bytealign_S (src_r1[3], src_r1[2], offset_minus_4);
s6 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4);
s5 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4);
s4 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
s3 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
s2 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s1 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s0 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s7 = amd_bytealign_S (src_r12, src_r13, offset);
s6 = amd_bytealign_S (src_r11, src_r12, offset);
s5 = amd_bytealign_S (src_r10, src_r11, offset);
s4 = amd_bytealign_S (src_r03, src_r10, offset);
s3 = amd_bytealign_S (src_r02, src_r03, offset);
s2 = amd_bytealign_S (src_r01, src_r02, offset);
s1 = amd_bytealign_S (src_r00, src_r01, offset);
s0 = amd_bytealign_S ( 0, src_r00, offset);
break;
case 1:
s8 = amd_bytealign_S ( 0, src_r1[2], offset_minus_4);
s7 = amd_bytealign_S (src_r1[2], src_r1[1], offset_minus_4);
s6 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4);
s5 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
s4 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
s3 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s2 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s1 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s7 = amd_bytealign_S (src_r11, src_r12, offset);
s6 = amd_bytealign_S (src_r10, src_r11, offset);
s5 = amd_bytealign_S (src_r03, src_r10, offset);
s4 = amd_bytealign_S (src_r02, src_r03, offset);
s3 = amd_bytealign_S (src_r01, src_r02, offset);
s2 = amd_bytealign_S (src_r00, src_r01, offset);
s1 = amd_bytealign_S ( 0, src_r00, offset);
s0 = 0;
break;
case 2:
s8 = amd_bytealign_S ( 0, src_r1[1], offset_minus_4);
s7 = amd_bytealign_S (src_r1[1], src_r1[0], offset_minus_4);
s6 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
s5 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
s4 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s3 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s2 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s7 = amd_bytealign_S (src_r10, src_r11, offset);
s6 = amd_bytealign_S (src_r03, src_r10, offset);
s5 = amd_bytealign_S (src_r02, src_r03, offset);
s4 = amd_bytealign_S (src_r01, src_r02, offset);
s3 = amd_bytealign_S (src_r00, src_r01, offset);
s2 = amd_bytealign_S ( 0, src_r00, offset);
s1 = 0;
s0 = 0;
break;
case 3:
s8 = amd_bytealign_S ( 0, src_r1[0], offset_minus_4);
s7 = amd_bytealign_S (src_r1[0], src_r0[3], offset_minus_4);
s6 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
s5 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s4 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s3 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s7 = amd_bytealign_S (src_r03, src_r10, offset);
s6 = amd_bytealign_S (src_r02, src_r03, offset);
s5 = amd_bytealign_S (src_r01, src_r02, offset);
s4 = amd_bytealign_S (src_r00, src_r01, offset);
s3 = amd_bytealign_S ( 0, src_r00, offset);
s2 = 0;
s1 = 0;
s0 = 0;
break;
case 4:
s8 = amd_bytealign_S ( 0, src_r0[3], offset_minus_4);
s7 = amd_bytealign_S (src_r0[3], src_r0[2], offset_minus_4);
s6 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s5 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s4 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s7 = amd_bytealign_S (src_r02, src_r03, offset);
s6 = amd_bytealign_S (src_r01, src_r02, offset);
s5 = amd_bytealign_S (src_r00, src_r01, offset);
s4 = amd_bytealign_S ( 0, src_r00, offset);
s3 = 0;
s2 = 0;
s1 = 0;
@ -834,10 +838,9 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
break;
case 5:
s8 = amd_bytealign_S ( 0, src_r0[2], offset_minus_4);
s7 = amd_bytealign_S (src_r0[2], src_r0[1], offset_minus_4);
s6 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s5 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s7 = amd_bytealign_S (src_r01, src_r02, offset);
s6 = amd_bytealign_S (src_r00, src_r01, offset);
s5 = amd_bytealign_S ( 0, src_r00, offset);
s4 = 0;
s3 = 0;
s2 = 0;
@ -846,9 +849,8 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
break;
case 6:
s8 = amd_bytealign_S ( 0, src_r0[1], offset_minus_4);
s7 = amd_bytealign_S (src_r0[1], src_r0[0], offset_minus_4);
s6 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s7 = amd_bytealign_S (src_r00, src_r01, offset);
s6 = amd_bytealign_S ( 0, src_r00, offset);
s5 = 0;
s4 = 0;
s3 = 0;
@ -858,8 +860,7 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
break;
case 7:
s8 = amd_bytealign_S ( 0, src_r0[0], offset_minus_4);
s7 = amd_bytealign_S (src_r0[0], 0, offset_minus_4);
s7 = amd_bytealign_S ( 0, src_r00, offset);
s6 = 0;
s5 = 0;
s4 = 0;
@ -870,83 +871,69 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
break;
}
if (offset_mod_4 == 0)
{
buf0[0] = src_l0[0] | s1;
buf0[1] = src_l0[1] | s2;
buf0[2] = src_l0[2] | s3;
buf0[3] = src_l0[3] | s4;
buf1[0] = src_l1[0] | s5;
buf1[1] = src_l1[1] | s6;
buf1[2] = src_l1[2] | s7;
buf1[3] = src_l1[3] | s8;
}
else
{
buf0[0] = src_l0[0] | s0;
buf0[1] = src_l0[1] | s1;
buf0[2] = src_l0[2] | s2;
buf0[3] = src_l0[3] | s3;
buf1[0] = src_l1[0] | s4;
buf1[1] = src_l1[1] | s5;
buf1[2] = src_l1[2] | s6;
buf1[3] = src_l1[3] | s7;
}
s0 = swap32_S (s0);
s1 = swap32_S (s1);
s2 = swap32_S (s2);
s3 = swap32_S (s3);
s4 = swap32_S (s4);
s5 = swap32_S (s5);
s6 = swap32_S (s6);
s7 = swap32_S (s7);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
u32 s0 = 0;
u32 s1 = 0;
u32 s2 = 0;
u32 s3 = 0;
u32 s4 = 0;
u32 s5 = 0;
u32 s6 = 0;
u32 s7 = 0;
const u32 src_r00 = src_r0[0];
const u32 src_r01 = src_r0[1];
const u32 src_r02 = src_r0[2];
const u32 src_r03 = src_r0[3];
const u32 src_r10 = src_r1[0];
const u32 src_r11 = src_r1[1];
const u32 src_r12 = src_r1[2];
const u32 src_r13 = src_r1[3];
switch (offset / 4)
{
case 0:
s7 = __byte_perm_S (src_r1[2], src_r1[3], selector);
s6 = __byte_perm_S (src_r1[1], src_r1[2], selector);
s5 = __byte_perm_S (src_r1[0], src_r1[1], selector);
s4 = __byte_perm_S (src_r0[3], src_r1[0], selector);
s3 = __byte_perm_S (src_r0[2], src_r0[3], selector);
s2 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s1 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s0 = __byte_perm_S ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r12, src_r13, selector);
s6 = __byte_perm_S (src_r11, src_r12, selector);
s5 = __byte_perm_S (src_r10, src_r11, selector);
s4 = __byte_perm_S (src_r03, src_r10, selector);
s3 = __byte_perm_S (src_r02, src_r03, selector);
s2 = __byte_perm_S (src_r01, src_r02, selector);
s1 = __byte_perm_S (src_r00, src_r01, selector);
s0 = __byte_perm_S ( 0, src_r00, selector);
break;
case 1:
s7 = __byte_perm_S (src_r1[1], src_r1[2], selector);
s6 = __byte_perm_S (src_r1[0], src_r1[1], selector);
s5 = __byte_perm_S (src_r0[3], src_r1[0], selector);
s4 = __byte_perm_S (src_r0[2], src_r0[3], selector);
s3 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s2 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s1 = __byte_perm_S ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r11, src_r12, selector);
s6 = __byte_perm_S (src_r10, src_r11, selector);
s5 = __byte_perm_S (src_r03, src_r10, selector);
s4 = __byte_perm_S (src_r02, src_r03, selector);
s3 = __byte_perm_S (src_r01, src_r02, selector);
s2 = __byte_perm_S (src_r00, src_r01, selector);
s1 = __byte_perm_S ( 0, src_r00, selector);
s0 = 0;
break;
case 2:
s7 = __byte_perm_S (src_r1[0], src_r1[1], selector);
s6 = __byte_perm_S (src_r0[3], src_r1[0], selector);
s5 = __byte_perm_S (src_r0[2], src_r0[3], selector);
s4 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s3 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s2 = __byte_perm_S ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r10, src_r11, selector);
s6 = __byte_perm_S (src_r03, src_r10, selector);
s5 = __byte_perm_S (src_r02, src_r03, selector);
s4 = __byte_perm_S (src_r01, src_r02, selector);
s3 = __byte_perm_S (src_r00, src_r01, selector);
s2 = __byte_perm_S ( 0, src_r00, selector);
s1 = 0;
s0 = 0;
break;
case 3:
s7 = __byte_perm_S (src_r0[3], src_r1[0], selector);
s6 = __byte_perm_S (src_r0[2], src_r0[3], selector);
s5 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s4 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s3 = __byte_perm_S ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r03, src_r10, selector);
s6 = __byte_perm_S (src_r02, src_r03, selector);
s5 = __byte_perm_S (src_r01, src_r02, selector);
s4 = __byte_perm_S (src_r00, src_r01, selector);
s3 = __byte_perm_S ( 0, src_r00, selector);
s2 = 0;
s1 = 0;
s0 = 0;
@ -954,10 +941,10 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
break;
case 4:
s7 = __byte_perm_S (src_r0[2], src_r0[3], selector);
s6 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s5 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s4 = __byte_perm_S ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r02, src_r03, selector);
s6 = __byte_perm_S (src_r01, src_r02, selector);
s5 = __byte_perm_S (src_r00, src_r01, selector);
s4 = __byte_perm_S ( 0, src_r00, selector);
s3 = 0;
s2 = 0;
s1 = 0;
@ -965,9 +952,9 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
break;
case 5:
s7 = __byte_perm_S (src_r0[1], src_r0[2], selector);
s6 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s5 = __byte_perm_S ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r01, src_r02, selector);
s6 = __byte_perm_S (src_r00, src_r01, selector);
s5 = __byte_perm_S ( 0, src_r00, selector);
s4 = 0;
s3 = 0;
s2 = 0;
@ -976,8 +963,8 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
break;
case 6:
s7 = __byte_perm_S (src_r0[0], src_r0[1], selector);
s6 = __byte_perm_S ( 0, src_r0[0], selector);
s7 = __byte_perm_S (src_r00, src_r01, selector);
s6 = __byte_perm_S ( 0, src_r00, selector);
s5 = 0;
s4 = 0;
s3 = 0;
@ -987,7 +974,7 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
break;
case 7:
s7 = __byte_perm_S ( 0, src_r0[0], selector);
s7 = __byte_perm_S ( 0, src_r00, selector);
s6 = 0;
s5 = 0;
s4 = 0;
@ -997,6 +984,7 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
s0 = 0;
break;
}
#endif
buf0[0] = src_l0[0] | s0;
buf0[1] = src_l0[1] | s1;
@ -1006,8 +994,6 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0
buf1[1] = src_l1[1] | s5;
buf1[2] = src_l1[2] | s6;
buf1[3] = src_l1[3] | s7;
#endif
}
void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len)

View File

@ -10,110 +10,13 @@
#include "inc_hash_functions.cl"
#include "inc_types.cl"
#include "inc_common.cl"
#include "inc_hash_md5.cl"
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
#define md5crypt_magic 0x00243124u
void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
{
u32 a = digest[0];
u32 b = digest[1];
u32 c = digest[2];
u32 d = digest[3];
u32 w0_t = w0[0];
u32 w1_t = w0[1];
u32 w2_t = w0[2];
u32 w3_t = w0[3];
u32 w4_t = w1[0];
u32 w5_t = w1[1];
u32 w6_t = w1[2];
u32 w7_t = w1[3];
u32 w8_t = w2[0];
u32 w9_t = w2[1];
u32 wa_t = w2[2];
u32 wb_t = w2[3];
u32 wc_t = w3[0];
u32 wd_t = w3[1];
u32 we_t = w3[2];
u32 wf_t = 0;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
digest[0] += a;
digest[1] += b;
digest[2] += c;
digest[3] += d;
}
void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4])
{
u32 tmp0;
@ -127,30 +30,37 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign ( 0, append[3], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
#endif
const u32 div = offset / 4;
@ -233,30 +143,39 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = 0x80;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], 0x80, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, in4, selector);
#endif
const u32 div = offset / 4;
@ -337,24 +256,27 @@ void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign ( 0, append[1], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, 0, selector);
#endif
const u32 div = offset / 4;

View File

@ -8,6 +8,7 @@
#include "inc_hash_functions.cl"
#include "inc_types.cl"
#include "inc_common.cl"
#include "inc_hash_md5.cl"
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
@ -15,104 +16,6 @@
#define md5apr1_magic0 0x72706124u
#define md5apr1_magic1 0x00002431u
void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
{
u32 a = digest[0];
u32 b = digest[1];
u32 c = digest[2];
u32 d = digest[3];
u32 w0_t = w0[0];
u32 w1_t = w0[1];
u32 w2_t = w0[2];
u32 w3_t = w0[3];
u32 w4_t = w1[0];
u32 w5_t = w1[1];
u32 w6_t = w1[2];
u32 w7_t = w1[3];
u32 w8_t = w2[0];
u32 w9_t = w2[1];
u32 wa_t = w2[2];
u32 wb_t = w2[3];
u32 wc_t = w3[0];
u32 wd_t = w3[1];
u32 we_t = w3[2];
u32 wf_t = 0;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
digest[0] += a;
digest[1] += b;
digest[2] += c;
digest[3] += d;
}
void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4])
{
u32 tmp0;
@ -126,30 +29,37 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign ( 0, append[3], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
#endif
const u32 div = offset / 4;
@ -232,30 +142,39 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = 0x80;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], 0x80, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, in4, selector);
#endif
const u32 div = offset / 4;
@ -336,24 +255,27 @@ void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign ( 0, append[1], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, 0, selector);
#endif
const u32 div = offset / 4;

View File

@ -62,7 +62,7 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule
const u32 pw_lenv = ceil ((float) pw_len / 4);
u32 w[64] = { 0 };
u32x w[64] = { 0 };
for (int idx = 0; idx < pw_lenv; idx++)
{
@ -88,13 +88,13 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule
* loop
*/
u32 w0l = w[0];
u32x w0l = w[0];
for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32 w0lr = w0l | w0r;
const u32x w0lr = w0l | w0r;
w[0] = w0lr;
@ -106,10 +106,10 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule
md5_final_vector (&ctx0);
const u32 a = ctx0.h[0];
const u32 b = ctx0.h[1];
const u32 c = ctx0.h[2];
const u32 d = ctx0.h[3];
const u32x a = ctx0.h[0];
const u32x b = ctx0.h[1];
const u32x c = ctx0.h[2];
const u32x d = ctx0.h[3];
md5_ctx_vector_t ctx;
@ -161,10 +161,10 @@ __kernel void m02810_mxx (__global pw_t *pws, __global const kernel_rule_t *rule
md5_transform_vector (ctx.w0, ctx.w1, ctx.w2, ctx.w3, ctx.h);
const u32 r0 = ctx.h[DGST_R0];
const u32 r1 = ctx.h[DGST_R1];
const u32 r2 = ctx.h[DGST_R2];
const u32 r3 = ctx.h[DGST_R3];
const u32x r0 = ctx.h[DGST_R0];
const u32x r1 = ctx.h[DGST_R1];
const u32x r2 = ctx.h[DGST_R2];
const u32x r3 = ctx.h[DGST_R3];
COMPARE_M_SIMD (r0, r1, r2, r3);
}
@ -219,7 +219,7 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule
const u32 pw_lenv = ceil ((float) pw_len / 4);
u32 w[64] = { 0 };
u32x w[64] = { 0 };
for (int idx = 0; idx < pw_lenv; idx++)
{
@ -245,13 +245,13 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule
* loop
*/
u32 w0l = w[0];
u32x w0l = w[0];
for (u32 il_pos = 0; il_pos < il_cnt; il_pos += VECT_SIZE)
{
const u32 w0r = words_buf_r[il_pos / VECT_SIZE];
const u32x w0r = words_buf_r[il_pos / VECT_SIZE];
const u32 w0lr = w0l | w0r;
const u32x w0lr = w0l | w0r;
w[0] = w0lr;
@ -263,10 +263,10 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule
md5_final_vector (&ctx0);
const u32 a = ctx0.h[0];
const u32 b = ctx0.h[1];
const u32 c = ctx0.h[2];
const u32 d = ctx0.h[3];
const u32x a = ctx0.h[0];
const u32x b = ctx0.h[1];
const u32x c = ctx0.h[2];
const u32x d = ctx0.h[3];
md5_ctx_vector_t ctx;
@ -318,10 +318,10 @@ __kernel void m02810_sxx (__global pw_t *pws, __global const kernel_rule_t *rule
md5_transform_vector (ctx.w0, ctx.w1, ctx.w2, ctx.w3, ctx.h);
const u32 r0 = ctx.h[DGST_R0];
const u32 r1 = ctx.h[DGST_R1];
const u32 r2 = ctx.h[DGST_R2];
const u32 r3 = ctx.h[DGST_R3];
const u32x r0 = ctx.h[DGST_R0];
const u32x r1 = ctx.h[DGST_R1];
const u32x r2 = ctx.h[DGST_R2];
const u32x r3 = ctx.h[DGST_R3];
COMPARE_S_SIMD (r0, r1, r2, r3);
}

View File

@ -8,6 +8,7 @@
#include "inc_hash_functions.cl"
#include "inc_types.cl"
#include "inc_common.cl"
#include "inc_hash_sha1.cl"
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
@ -2116,33 +2117,42 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign (append[4], append[3], offset_minus_4);
tmp5 = amd_bytealign ( 0, append[4], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = swap32_S (append[4]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = tmp5;
tmp5 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp5 = amd_bytealign (in4, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
tmp5 = swap32_S (tmp5);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], append[4], selector);
tmp5 = __byte_perm (append[4], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = append[4];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, in4, selector);
tmp5 = __byte_perm (in4, 0, selector);
#endif
const u32 div = offset / 4;
@ -2187,134 +2197,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3
}
}
void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
{
u32 A = digest[0];
u32 B = digest[1];
u32 C = digest[2];
u32 D = digest[3];
u32 E = digest[4];
u32 w0_t = w0[0];
u32 w1_t = w0[1];
u32 w2_t = w0[2];
u32 w3_t = w0[3];
u32 w4_t = w1[0];
u32 w5_t = w1[1];
u32 w6_t = w1[2];
u32 w7_t = w1[3];
u32 w8_t = w2[0];
u32 w9_t = w2[1];
u32 wa_t = w2[2];
u32 wb_t = w2[3];
u32 wc_t = w3[0];
u32 wd_t = w3[1];
u32 we_t = w3[2];
u32 wf_t = w3[3];
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t);
SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t);
SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t);
SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t);
SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t);
SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t);
SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t);
SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t);
SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t);
SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t);
SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t);
SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t);
SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t);
SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t);
SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t);
SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t);
w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t);
w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t);
w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t);
w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t);
#undef K
#define K SHA1C01
w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t);
w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t);
w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t);
w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t);
w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t);
w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t);
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t);
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t);
w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t);
w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t);
w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t);
w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t);
w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t);
w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t);
w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t);
w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t);
#undef K
#define K SHA1C02
w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t);
w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t);
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t);
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t);
w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t);
w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t);
w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t);
w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t);
w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t);
w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t);
w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t);
w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t);
w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t);
w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t);
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t);
#undef K
#define K SHA1C03
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t);
w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t);
w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t);
w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t);
w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t);
w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t);
w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t);
w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t);
w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t);
w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t);
w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t);
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t);
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t);
digest[0] += A;
digest[1] += B;
digest[2] += C;
digest[3] += D;
digest[4] += E;
}
__kernel void m05800_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global androidpin_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**

View File

@ -2117,33 +2117,42 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign (append[4], append[3], offset_minus_4);
tmp5 = amd_bytealign ( 0, append[4], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = swap32_S (append[4]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = tmp5;
tmp5 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp5 = amd_bytealign (in4, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
tmp5 = swap32_S (tmp5);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], append[4], selector);
tmp5 = __byte_perm (append[4], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = append[4];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp3 = __byte_perm (in3, in4, selector);
tmp4 = __byte_perm (in4, 0, selector);
#endif
const u32 div = offset / 4;
@ -2188,134 +2197,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3
}
}
void orig_sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5])
{
u32 A = digest[0];
u32 B = digest[1];
u32 C = digest[2];
u32 D = digest[3];
u32 E = digest[4];
u32 w0_t = w0[0];
u32 w1_t = w0[1];
u32 w2_t = w0[2];
u32 w3_t = w0[3];
u32 w4_t = w1[0];
u32 w5_t = w1[1];
u32 w6_t = w1[2];
u32 w7_t = w1[3];
u32 w8_t = w2[0];
u32 w9_t = w2[1];
u32 wa_t = w2[2];
u32 wb_t = w2[3];
u32 wc_t = w3[0];
u32 wd_t = w3[1];
u32 we_t = w3[2];
u32 wf_t = w3[3];
#undef K
#define K SHA1C00
SHA1_STEP (SHA1_F0o, A, B, C, D, E, w0_t);
SHA1_STEP (SHA1_F0o, E, A, B, C, D, w1_t);
SHA1_STEP (SHA1_F0o, D, E, A, B, C, w2_t);
SHA1_STEP (SHA1_F0o, C, D, E, A, B, w3_t);
SHA1_STEP (SHA1_F0o, B, C, D, E, A, w4_t);
SHA1_STEP (SHA1_F0o, A, B, C, D, E, w5_t);
SHA1_STEP (SHA1_F0o, E, A, B, C, D, w6_t);
SHA1_STEP (SHA1_F0o, D, E, A, B, C, w7_t);
SHA1_STEP (SHA1_F0o, C, D, E, A, B, w8_t);
SHA1_STEP (SHA1_F0o, B, C, D, E, A, w9_t);
SHA1_STEP (SHA1_F0o, A, B, C, D, E, wa_t);
SHA1_STEP (SHA1_F0o, E, A, B, C, D, wb_t);
SHA1_STEP (SHA1_F0o, D, E, A, B, C, wc_t);
SHA1_STEP (SHA1_F0o, C, D, E, A, B, wd_t);
SHA1_STEP (SHA1_F0o, B, C, D, E, A, we_t);
SHA1_STEP (SHA1_F0o, A, B, C, D, E, wf_t);
w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, E, A, B, C, D, w0_t);
w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, D, E, A, B, C, w1_t);
w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, C, D, E, A, B, w2_t);
w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, B, C, D, E, A, w3_t);
#undef K
#define K SHA1C01
w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w4_t);
w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w5_t);
w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w6_t);
w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w7_t);
w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w8_t);
w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w9_t);
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wb_t);
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wf_t);
w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w0_t);
w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w1_t);
w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w2_t);
w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w3_t);
w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w4_t);
w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w5_t);
w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w6_t);
w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w7_t);
#undef K
#define K SHA1C02
w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w8_t);
w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w9_t);
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wb_t);
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, wf_t);
w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w0_t);
w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w1_t);
w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w2_t);
w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w3_t);
w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w4_t);
w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, w5_t);
w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, w6_t);
w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, A, B, C, D, E, w7_t);
w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, E, A, B, C, D, w8_t);
w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, D, E, A, B, C, w9_t);
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, C, D, E, A, B, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, B, C, D, E, A, wb_t);
#undef K
#define K SHA1C03
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, wf_t);
w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w0_t);
w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w1_t);
w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w2_t);
w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w3_t);
w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w4_t);
w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, w5_t);
w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, w6_t);
w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, w7_t);
w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, w8_t);
w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, w9_t);
wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wa_t);
wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, A, B, C, D, E, wb_t);
wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, E, A, B, C, D, wc_t);
wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, D, E, A, B, C, wd_t);
we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, C, D, E, A, B, we_t);
wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, B, C, D, E, A, wf_t);
digest[0] += A;
digest[1] += B;
digest[2] += C;
digest[3] += D;
digest[4] += E;
}
__kernel void m05800_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global androidpin_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
const u32 gid = get_global_id (0);

View File

@ -8,108 +8,11 @@
#include "inc_hash_functions.cl"
#include "inc_types.cl"
#include "inc_common.cl"
#include "inc_hash_md5.cl"
#define COMPARE_S "inc_comp_single.cl"
#define COMPARE_M "inc_comp_multi.cl"
void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4])
{
u32 a = digest[0];
u32 b = digest[1];
u32 c = digest[2];
u32 d = digest[3];
u32 w0_t = w0[0];
u32 w1_t = w0[1];
u32 w2_t = w0[2];
u32 w3_t = w0[3];
u32 w4_t = w1[0];
u32 w5_t = w1[1];
u32 w6_t = w1[2];
u32 w7_t = w1[3];
u32 w8_t = w2[0];
u32 w9_t = w2[1];
u32 wa_t = w2[2];
u32 wb_t = w2[3];
u32 wc_t = w3[0];
u32 wd_t = w3[1];
u32 we_t = w3[2];
u32 wf_t = 0;
MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03);
MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00);
MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01);
MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02);
MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03);
MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13);
MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10);
MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11);
MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12);
MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13);
MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23);
MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20);
MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21);
MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22);
MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23);
MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33);
MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30);
MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31);
MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32);
MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33);
digest[0] += a;
digest[1] += b;
digest[2] += c;
digest[3] += d;
}
void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4])
{
u32 tmp0;
@ -123,30 +26,37 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign ( 0, append[3], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
#endif
const u32 div = offset / 4;
@ -229,30 +139,39 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = 0x80;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], 0x80, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, in4, selector);
#endif
const u32 div = offset / 4;
@ -333,24 +252,27 @@ void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign ( 0, append[1], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, 0, selector);
#endif
const u32 div = offset / 4;

View File

@ -32,8 +32,6 @@ __constant u32a k_sha256[64] =
SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f,
};
#if 1
void sha256_transform (const u32 w[16], u32 digest[8])
{
u32 a = digest[0];
@ -203,30 +201,37 @@ u32 memcat16 (u32 block[16], const u32 offset, const u32 append[4], const u32 ap
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign ( 0, append[3], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
#endif
switch (offset / 4)
@ -337,30 +342,37 @@ u32 memcat16c (u32 block[16], const u32 offset, const u32 append[4], const u32 a
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign ( 0, append[3], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
#endif
u32 carry[4] = { 0, 0, 0, 0 };
@ -484,7 +496,7 @@ u32 memcat16c (u32 block[16], const u32 offset, const u32 append[4], const u32 a
return new_len;
}
u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 append_len)
u32 memcat20 (u32 block[32], const u32 offset, const u32 append[4], const u32 append_len)
{
u32 tmp0;
u32 tmp1;
@ -497,30 +509,37 @@ u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 ap
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign ( 0, append[3], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = 0;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, 0, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], 0, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, 0, selector);
#endif
switch (offset / 4)
@ -626,7 +645,7 @@ u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 ap
return offset + append_len;
}
u32 memcat20_x80 (u32 block[20], const u32 offset, const u32 append[4], const u32 append_len)
u32 memcat20_x80 (u32 block[32], const u32 offset, const u32 append[4], const u32 append_len)
{
u32 tmp0;
u32 tmp1;
@ -639,30 +658,39 @@ u32 memcat20_x80 (u32 block[20], const u32 offset, const u32 append[4], const u3
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD || defined IS_GENERIC
tmp0 = amd_bytealign (append[0], 0, offset_minus_4);
tmp1 = amd_bytealign (append[1], append[0], offset_minus_4);
tmp2 = amd_bytealign (append[2], append[1], offset_minus_4);
tmp3 = amd_bytealign (append[3], append[2], offset_minus_4);
tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4);
u32 in0 = swap32_S (append[0]);
u32 in1 = swap32_S (append[1]);
u32 in2 = swap32_S (append[2]);
u32 in3 = swap32_S (append[3]);
u32 in4 = 0x80000000;
if (offset_mod_4 == 0)
{
tmp0 = tmp1;
tmp1 = tmp2;
tmp2 = tmp3;
tmp3 = tmp4;
tmp4 = 0x80;
}
tmp0 = amd_bytealign ( 0, in0, offset);
tmp1 = amd_bytealign (in0, in1, offset);
tmp2 = amd_bytealign (in1, in2, offset);
tmp3 = amd_bytealign (in2, in3, offset);
tmp4 = amd_bytealign (in3, in4, offset);
tmp0 = swap32_S (tmp0);
tmp1 = swap32_S (tmp1);
tmp2 = swap32_S (tmp2);
tmp3 = swap32_S (tmp3);
tmp4 = swap32_S (tmp4);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
tmp0 = __byte_perm ( 0, append[0], selector);
tmp1 = __byte_perm (append[0], append[1], selector);
tmp2 = __byte_perm (append[1], append[2], selector);
tmp3 = __byte_perm (append[2], append[3], selector);
tmp4 = __byte_perm (append[3], 0x80, selector);
u32 in0 = append[0];
u32 in1 = append[1];
u32 in2 = append[2];
u32 in3 = append[3];
u32 in4 = 0x80;
tmp0 = __byte_perm ( 0, in0, selector);
tmp1 = __byte_perm (in0, in1, selector);
tmp2 = __byte_perm (in1, in2, selector);
tmp3 = __byte_perm (in2, in3, selector);
tmp4 = __byte_perm (in3, in4, selector);
#endif
switch (offset / 4)
@ -1201,543 +1229,3 @@ __kernel void m07400_comp (__global pw_t *pws, __global const kernel_rule_t *rul
#include COMPARE_M
}
#else
// this is basically a much cleaner version, but apparently drops speeds by over 100% :(
#define PUTCHAR32_BE(a,p,c) ((u8 *)(a))[(p) ^ 3] = (u8) (c)
#define GETCHAR32_BE(a,p) ((u8 *)(a))[(p) ^ 3]
typedef struct
{
u32 state[8];
u32 buf[32];
int len;
} sha256_ctx_t;
void sha256_transform (const u32 w[16], u32 digest[8])
{
u32 a = digest[0];
u32 b = digest[1];
u32 c = digest[2];
u32 d = digest[3];
u32 e = digest[4];
u32 f = digest[5];
u32 g = digest[6];
u32 h = digest[7];
u32 w0_t = w[ 0];
u32 w1_t = w[ 1];
u32 w2_t = w[ 2];
u32 w3_t = w[ 3];
u32 w4_t = w[ 4];
u32 w5_t = w[ 5];
u32 w6_t = w[ 6];
u32 w7_t = w[ 7];
u32 w8_t = w[ 8];
u32 w9_t = w[ 9];
u32 wa_t = w[10];
u32 wb_t = w[11];
u32 wc_t = w[12];
u32 wd_t = w[13];
u32 we_t = w[14];
u32 wf_t = w[15];
#define ROUND_EXPAND() \
{ \
w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \
w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); \
w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); \
w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); \
w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); \
w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); \
w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); \
w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); \
w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); \
w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); \
wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); \
wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); \
wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); \
wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); \
we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); \
wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); \
}
#define ROUND_STEP(i) \
{ \
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]); \
SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \
}
ROUND_STEP (0);
#ifdef _unroll
#pragma unroll
#endif
for (int i = 16; i < 64; i += 16)
{
ROUND_EXPAND (); ROUND_STEP (i);
}
digest[0] += a;
digest[1] += b;
digest[2] += c;
digest[3] += d;
digest[4] += e;
digest[5] += f;
digest[6] += g;
digest[7] += h;
}
void sha256_init (sha256_ctx_t *sha256_ctx)
{
sha256_ctx->state[0] = SHA256M_A;
sha256_ctx->state[1] = SHA256M_B;
sha256_ctx->state[2] = SHA256M_C;
sha256_ctx->state[3] = SHA256M_D;
sha256_ctx->state[4] = SHA256M_E;
sha256_ctx->state[5] = SHA256M_F;
sha256_ctx->state[6] = SHA256M_G;
sha256_ctx->state[7] = SHA256M_H;
sha256_ctx->len = 0;
}
void sha256_update (sha256_ctx_t *sha256_ctx, const u32 *buf, int len)
{
int pos = sha256_ctx->len & 0x3f;
sha256_ctx->len += len;
if ((pos + len) < 64)
{
for (int i = 0; i < len; i++)
{
PUTCHAR32_BE (sha256_ctx->buf, pos++, GETCHAR32_BE (buf, i));
}
return;
}
int cnt = 64 - pos;
for (int i = 0; i < cnt; i++)
{
PUTCHAR32_BE (sha256_ctx->buf, pos++, GETCHAR32_BE (buf, i));
}
sha256_transform (sha256_ctx->buf, sha256_ctx->state);
len -= cnt;
for (int i = 0; i < len; i++)
{
PUTCHAR32_BE (sha256_ctx->buf, i, GETCHAR32_BE (buf, cnt + i));
}
}
void sha256_final (sha256_ctx_t *sha256_ctx)
{
int pos = sha256_ctx->len & 0x3f;
for (int i = pos; i < 64; i++)
{
PUTCHAR32_BE (sha256_ctx->buf, i, 0);
}
PUTCHAR32_BE (sha256_ctx->buf, pos, 0x80);
if (pos >= 56)
{
sha256_transform (sha256_ctx->buf, sha256_ctx->state);
sha256_ctx->buf[ 0] = 0;
sha256_ctx->buf[ 1] = 0;
sha256_ctx->buf[ 2] = 0;
sha256_ctx->buf[ 3] = 0;
sha256_ctx->buf[ 4] = 0;
sha256_ctx->buf[ 5] = 0;
sha256_ctx->buf[ 6] = 0;
sha256_ctx->buf[ 7] = 0;
sha256_ctx->buf[ 8] = 0;
sha256_ctx->buf[ 9] = 0;
sha256_ctx->buf[10] = 0;
sha256_ctx->buf[11] = 0;
sha256_ctx->buf[12] = 0;
sha256_ctx->buf[13] = 0;
sha256_ctx->buf[14] = 0;
sha256_ctx->buf[15] = 0;
}
sha256_ctx->buf[15] = sha256_ctx->len * 8;
sha256_transform (sha256_ctx->buf, sha256_ctx->state);
}
__kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
*/
const u32 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 pw[4];
pw[0] = swap32 (pws[gid].i[0]);
pw[1] = swap32 (pws[gid].i[1]);
pw[2] = swap32 (pws[gid].i[2]);
pw[3] = swap32 (pws[gid].i[3]);
const u32 pw_len = pws[gid].pw_len;
/**
* salt
*/
u32 salt[4];
salt[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]);
salt[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]);
salt[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]);
salt[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]);
u32 salt_len = salt_bufs[salt_pos].salt_len;
/**
* begin
*/
sha256_ctx_t sha256_ctx;
sha256_init (&sha256_ctx);
sha256_update (&sha256_ctx, pw, pw_len);
sha256_update (&sha256_ctx, salt, salt_len);
sha256_update (&sha256_ctx, pw, pw_len);
sha256_final (&sha256_ctx);
u32 tmp[8];
tmp[0] = sha256_ctx.state[0];
tmp[1] = sha256_ctx.state[1];
tmp[2] = sha256_ctx.state[2];
tmp[3] = sha256_ctx.state[3];
tmp[4] = sha256_ctx.state[4];
tmp[5] = sha256_ctx.state[5];
tmp[6] = sha256_ctx.state[6];
tmp[7] = sha256_ctx.state[7];
sha256_init (&sha256_ctx);
sha256_update (&sha256_ctx, pw, pw_len);
sha256_update (&sha256_ctx, salt, salt_len);
sha256_update (&sha256_ctx, tmp, pw_len);
for (u32 j = pw_len; j; j >>= 1)
{
if (j & 1)
{
sha256_update (&sha256_ctx, tmp, 32);
}
else
{
sha256_update (&sha256_ctx, pw, pw_len);
}
}
sha256_final (&sha256_ctx);
tmps[gid].alt_result[0] = sha256_ctx.state[0];
tmps[gid].alt_result[1] = sha256_ctx.state[1];
tmps[gid].alt_result[2] = sha256_ctx.state[2];
tmps[gid].alt_result[3] = sha256_ctx.state[3];
tmps[gid].alt_result[4] = sha256_ctx.state[4];
tmps[gid].alt_result[5] = sha256_ctx.state[5];
tmps[gid].alt_result[6] = sha256_ctx.state[6];
tmps[gid].alt_result[7] = sha256_ctx.state[7];
// p_bytes
sha256_init (&sha256_ctx);
for (u32 j = 0; j < pw_len; j++)
{
sha256_update (&sha256_ctx, pw, pw_len);
}
sha256_final (&sha256_ctx);
tmps[gid].p_bytes[0] = sha256_ctx.state[0];
tmps[gid].p_bytes[1] = sha256_ctx.state[1];
tmps[gid].p_bytes[2] = sha256_ctx.state[2];
tmps[gid].p_bytes[3] = sha256_ctx.state[3];
// s_bytes
sha256_init (&sha256_ctx);
for (u32 j = 0; j < 16 + ((tmps[gid].alt_result[0] >> 24) & 0xff); j++)
{
sha256_update (&sha256_ctx, salt, salt_len);
}
sha256_final (&sha256_ctx);
tmps[gid].s_bytes[0] = sha256_ctx.state[0];
tmps[gid].s_bytes[1] = sha256_ctx.state[1];
tmps[gid].s_bytes[2] = sha256_ctx.state[2];
tmps[gid].s_bytes[3] = sha256_ctx.state[3];
}
__kernel void m07400_loop (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
*/
const u32 gid = get_global_id (0);
if (gid >= gid_max) return;
u32 p_bytes0[4];
p_bytes0[0] = tmps[gid].p_bytes[0];
p_bytes0[1] = tmps[gid].p_bytes[1];
p_bytes0[2] = tmps[gid].p_bytes[2];
p_bytes0[3] = tmps[gid].p_bytes[3];
const u32 pw_len = pws[gid].pw_len;
u32 s_bytes0[4];
s_bytes0[0] = tmps[gid].s_bytes[0];
s_bytes0[1] = tmps[gid].s_bytes[1];
s_bytes0[2] = tmps[gid].s_bytes[2];
s_bytes0[3] = tmps[gid].s_bytes[3];
const u32 salt_len = salt_bufs[salt_pos].salt_len;
u32 wpc_len[8];
wpc_len[0] = 32 + 0 + 0 + pw_len;
wpc_len[1] = pw_len + 0 + 0 + 32;
wpc_len[2] = 32 + salt_len + 0 + pw_len;
wpc_len[3] = pw_len + salt_len + 0 + 32;
wpc_len[4] = 32 + 0 + pw_len + pw_len;
wpc_len[5] = pw_len + 0 + pw_len + 32;
wpc_len[6] = 32 + salt_len + pw_len + pw_len;
wpc_len[7] = pw_len + salt_len + pw_len + 32;
u32 wpc[8][32] = { { 0 } };
for (u32 i = 0; i < 8; i++)
{
u32 block_len = 0;
if (i & 1)
{
for (u32 j = 0; j < pw_len; j++)
{
PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j));
}
}
else
{
block_len += 32;
}
if (i & 2)
{
for (u32 j = 0; j < salt_len; j++)
{
PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (s_bytes0, j));
}
}
if (i & 4)
{
for (u32 j = 0; j < pw_len; j++)
{
PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j));
}
}
if (i & 1)
{
block_len += 32;
}
else
{
for (u32 j = 0; j < pw_len; j++)
{
PUTCHAR32_BE (wpc[i], block_len++, GETCHAR32_BE (p_bytes0, j));
}
}
PUTCHAR32_BE (wpc[i], block_len, 0x80);
if (block_len < 56)
{
wpc[i][15] = block_len * 8;
}
else
{
wpc[i][31] = block_len * 8;
}
}
/**
* base
*/
u32 alt_result[8];
alt_result[0] = tmps[gid].alt_result[0];
alt_result[1] = tmps[gid].alt_result[1];
alt_result[2] = tmps[gid].alt_result[2];
alt_result[3] = tmps[gid].alt_result[3];
alt_result[4] = tmps[gid].alt_result[4];
alt_result[5] = tmps[gid].alt_result[5];
alt_result[6] = tmps[gid].alt_result[6];
alt_result[7] = tmps[gid].alt_result[7];
/* Repeatedly run the collected hash value through SHA256 to burn
CPU cycles. */
for (u32 i = 0, j = loop_pos; i < loop_cnt; i++, j++)
{
const u32 j1 = (j & 1) ? 1 : 0;
const u32 j3 = (j % 3) ? 2 : 0;
const u32 j7 = (j % 7) ? 4 : 0;
const u32 pc = j1 + j3 + j7;
u32 block[32];
block[ 0] = wpc[pc][ 0];
block[ 1] = wpc[pc][ 1];
block[ 2] = wpc[pc][ 2];
block[ 3] = wpc[pc][ 3];
block[ 4] = wpc[pc][ 4];
block[ 5] = wpc[pc][ 5];
block[ 6] = wpc[pc][ 6];
block[ 7] = wpc[pc][ 7];
block[ 8] = wpc[pc][ 8];
block[ 9] = wpc[pc][ 9];
block[10] = wpc[pc][10];
block[11] = wpc[pc][11];
block[12] = wpc[pc][12];
block[13] = wpc[pc][13];
block[14] = wpc[pc][14];
block[15] = wpc[pc][15];
block[16] = wpc[pc][16];
block[17] = wpc[pc][17];
block[18] = wpc[pc][18];
block[19] = wpc[pc][19];
block[20] = wpc[pc][20];
block[21] = wpc[pc][21];
block[22] = wpc[pc][22];
block[23] = wpc[pc][23];
block[24] = wpc[pc][24];
block[25] = wpc[pc][25];
block[26] = wpc[pc][26];
block[27] = wpc[pc][27];
block[28] = wpc[pc][28];
block[29] = wpc[pc][29];
block[30] = wpc[pc][30];
block[31] = wpc[pc][31];
const u32 block_len = wpc_len[pc];
if (j1)
{
#ifdef _unroll
#pragma unroll
#endif
for (u32 k = 0, p = block_len - 32; k < 32; k++, p++)
{
PUTCHAR32_BE (block, p, GETCHAR32_BE (alt_result, k));
}
}
else
{
block[0] = alt_result[0];
block[1] = alt_result[1];
block[2] = alt_result[2];
block[3] = alt_result[3];
block[4] = alt_result[4];
block[5] = alt_result[5];
block[6] = alt_result[6];
block[7] = alt_result[7];
}
alt_result[0] = SHA256M_A;
alt_result[1] = SHA256M_B;
alt_result[2] = SHA256M_C;
alt_result[3] = SHA256M_D;
alt_result[4] = SHA256M_E;
alt_result[5] = SHA256M_F;
alt_result[6] = SHA256M_G;
alt_result[7] = SHA256M_H;
sha256_transform (block, alt_result);
if (block_len >= 56)
{
sha256_transform (block + 16, alt_result);
}
}
tmps[gid].alt_result[0] = alt_result[0];
tmps[gid].alt_result[1] = alt_result[1];
tmps[gid].alt_result[2] = alt_result[2];
tmps[gid].alt_result[3] = alt_result[3];
tmps[gid].alt_result[4] = alt_result[4];
tmps[gid].alt_result[5] = alt_result[5];
tmps[gid].alt_result[6] = alt_result[6];
tmps[gid].alt_result[7] = alt_result[7];
}
__kernel void m07400_comp (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)
{
/**
* base
*/
const u32 gid = get_global_id (0);
if (gid >= gid_max) return;
const u32 lid = get_local_id (0);
const u32 r0 = swap32 (tmps[gid].alt_result[0]);
const u32 r1 = swap32 (tmps[gid].alt_result[1]);
const u32 r2 = swap32 (tmps[gid].alt_result[2]);
const u32 r3 = swap32 (tmps[gid].alt_result[3]);
#define il_pos 0
#include COMPARE_M
}
#endif

View File

@ -27,76 +27,79 @@
#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
#endif
u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len)
u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len)
{
const u32 mod = block_len & 3;
const u32 div = block_len / 4;
const u32 mod = offset & 3;
const u32 div = offset / 4;
#if defined IS_AMD || defined IS_GENERIC
const int offset_minus_4 = 4 - mod;
u32x append00 = swap32 (append0[0]);
u32x append01 = swap32 (append0[1]);
u32x append02 = swap32 (append0[2]);
u32x append03 = swap32 (append0[3]);
u32x append10 = swap32 (append1[0]);
u32x append11 = swap32 (append1[1]);
u32x append12 = swap32 (append1[2]);
u32x append13 = swap32 (append1[3]);
u32x append20 = swap32 (append2[0]);
u32x append21 = swap32 (append2[1]);
u32x append22 = swap32 (append2[2]);
u32x append23 = swap32 (append2[3]);
u32x append30 = swap32 (append3[0]);
u32x append31 = swap32 (append3[1]);
u32x append32 = swap32 (append3[2]);
u32x append33 = swap32 (append3[3]);
u32x append0_t[4];
append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4);
append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4);
append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4);
append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4);
u32x append1_t[4];
append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4);
append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4);
append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4);
append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4);
u32x append2_t[4];
append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4);
append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4);
append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4);
append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4);
u32x append3_t[4];
append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4);
append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4);
append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4);
append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4);
u32x append4_t[4];
append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4);
append0_t[0] = amd_bytealign ( 0, append00, offset);
append0_t[1] = amd_bytealign (append00, append01, offset);
append0_t[2] = amd_bytealign (append01, append02, offset);
append0_t[3] = amd_bytealign (append02, append03, offset);
append1_t[0] = amd_bytealign (append03, append10, offset);
append1_t[1] = amd_bytealign (append10, append11, offset);
append1_t[2] = amd_bytealign (append11, append12, offset);
append1_t[3] = amd_bytealign (append12, append13, offset);
append2_t[0] = amd_bytealign (append13, append20, offset);
append2_t[1] = amd_bytealign (append20, append21, offset);
append2_t[2] = amd_bytealign (append21, append22, offset);
append2_t[3] = amd_bytealign (append22, append23, offset);
append3_t[0] = amd_bytealign (append23, append30, offset);
append3_t[1] = amd_bytealign (append30, append31, offset);
append3_t[2] = amd_bytealign (append31, append32, offset);
append3_t[3] = amd_bytealign (append32, append33, offset);
append4_t[0] = amd_bytealign (append33, 0, offset);
append4_t[1] = 0;
append4_t[2] = 0;
append4_t[3] = 0;
if (mod == 0)
{
append0_t[0] = append0[0];
append0_t[1] = append0[1];
append0_t[2] = append0[2];
append0_t[3] = append0[3];
append0_t[0] = swap32 (append0_t[0]);
append0_t[1] = swap32 (append0_t[1]);
append0_t[2] = swap32 (append0_t[2]);
append0_t[3] = swap32 (append0_t[3]);
append1_t[0] = swap32 (append1_t[0]);
append1_t[1] = swap32 (append1_t[1]);
append1_t[2] = swap32 (append1_t[2]);
append1_t[3] = swap32 (append1_t[3]);
append2_t[0] = swap32 (append2_t[0]);
append2_t[1] = swap32 (append2_t[1]);
append2_t[2] = swap32 (append2_t[2]);
append2_t[3] = swap32 (append2_t[3]);
append3_t[0] = swap32 (append3_t[0]);
append3_t[1] = swap32 (append3_t[1]);
append3_t[2] = swap32 (append3_t[2]);
append3_t[3] = swap32 (append3_t[3]);
append4_t[0] = swap32 (append4_t[0]);
append4_t[1] = swap32 (append4_t[1]);
append4_t[2] = swap32 (append4_t[2]);
append4_t[3] = swap32 (append4_t[3]);
append1_t[0] = append1[0];
append1_t[1] = append1[1];
append1_t[2] = append1[2];
append1_t[3] = append1[3];
append2_t[0] = append2[0];
append2_t[1] = append2[1];
append2_t[2] = append2[2];
append2_t[3] = append2[3];
append3_t[0] = append3[0];
append3_t[1] = append3[1];
append3_t[2] = append3[2];
append3_t[3] = append3[3];
append4_t[0] = 0;
append4_t[1] = 0;
append4_t[2] = 0;
append4_t[3] = 0;
}
#endif
#ifdef IS_NV
@ -105,40 +108,50 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
u32x append00 = append0[0];
u32x append01 = append0[1];
u32x append02 = append0[2];
u32x append03 = append0[3];
u32x append10 = append1[0];
u32x append11 = append1[1];
u32x append12 = append1[2];
u32x append13 = append1[3];
u32x append20 = append2[0];
u32x append21 = append2[1];
u32x append22 = append2[2];
u32x append23 = append2[3];
u32x append30 = append3[0];
u32x append31 = append3[1];
u32x append32 = append3[2];
u32x append33 = append3[3];
u32x append0_t[4];
append0_t[0] = __byte_perm ( 0, append0[0], selector);
append0_t[1] = __byte_perm (append0[0], append0[1], selector);
append0_t[2] = __byte_perm (append0[1], append0[2], selector);
append0_t[3] = __byte_perm (append0[2], append0[3], selector);
u32x append1_t[4];
append1_t[0] = __byte_perm (append0[3], append1[0], selector);
append1_t[1] = __byte_perm (append1[0], append1[1], selector);
append1_t[2] = __byte_perm (append1[1], append1[2], selector);
append1_t[3] = __byte_perm (append1[2], append1[3], selector);
u32x append2_t[4];
append2_t[0] = __byte_perm (append1[3], append2[0], selector);
append2_t[1] = __byte_perm (append2[0], append2[1], selector);
append2_t[2] = __byte_perm (append2[1], append2[2], selector);
append2_t[3] = __byte_perm (append2[2], append2[3], selector);
u32x append3_t[4];
append3_t[0] = __byte_perm (append2[3], append3[0], selector);
append3_t[1] = __byte_perm (append3[0], append3[1], selector);
append3_t[2] = __byte_perm (append3[1], append3[2], selector);
append3_t[3] = __byte_perm (append3[2], append3[3], selector);
u32x append4_t[4];
append4_t[0] = __byte_perm (append3[3], 0, selector);
append0_t[0] = __byte_perm ( 0, append00, selector);
append0_t[1] = __byte_perm (append00, append01, selector);
append0_t[2] = __byte_perm (append01, append02, selector);
append0_t[3] = __byte_perm (append02, append03, selector);
append1_t[0] = __byte_perm (append03, append10, selector);
append1_t[1] = __byte_perm (append10, append11, selector);
append1_t[2] = __byte_perm (append11, append12, selector);
append1_t[3] = __byte_perm (append12, append13, selector);
append2_t[0] = __byte_perm (append13, append20, selector);
append2_t[1] = __byte_perm (append20, append21, selector);
append2_t[2] = __byte_perm (append21, append22, selector);
append2_t[3] = __byte_perm (append22, append23, selector);
append3_t[0] = __byte_perm (append23, append30, selector);
append3_t[1] = __byte_perm (append30, append31, selector);
append3_t[2] = __byte_perm (append31, append32, selector);
append3_t[3] = __byte_perm (append32, append33, selector);
append4_t[0] = __byte_perm (append33, 0, selector);
append4_t[1] = 0;
append4_t[2] = 0;
append4_t[3] = 0;
#endif
switch (div)
@ -147,22 +160,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 1] = append0_t[1];
block0[ 2] = append0_t[2];
block0[ 3] = append0_t[3];
block0[ 4] = append1_t[0];
block0[ 5] = append1_t[1];
block0[ 6] = append1_t[2];
block0[ 7] = append1_t[3];
block0[ 8] = append2_t[0];
block0[ 9] = append2_t[1];
block0[10] = append2_t[2];
block0[11] = append2_t[3];
block0[12] = append3_t[0];
block0[13] = append3_t[1];
block0[14] = append3_t[2];
block0[15] = append3_t[3];
block1[ 0] = append4_t[0];
block1[ 1] = append4_t[1];
block1[ 2] = append4_t[2];
@ -173,22 +182,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 2] = append0_t[1];
block0[ 3] = append0_t[2];
block0[ 4] = append0_t[3];
block0[ 5] = append1_t[0];
block0[ 6] = append1_t[1];
block0[ 7] = append1_t[2];
block0[ 8] = append1_t[3];
block0[ 9] = append2_t[0];
block0[10] = append2_t[1];
block0[11] = append2_t[2];
block0[12] = append2_t[3];
block0[13] = append3_t[0];
block0[14] = append3_t[1];
block0[15] = append3_t[2];
block1[ 0] = append3_t[3];
block1[ 1] = append4_t[0];
block1[ 2] = append4_t[1];
block1[ 3] = append4_t[2];
@ -199,22 +204,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 3] = append0_t[1];
block0[ 4] = append0_t[2];
block0[ 5] = append0_t[3];
block0[ 6] = append1_t[0];
block0[ 7] = append1_t[1];
block0[ 8] = append1_t[2];
block0[ 9] = append1_t[3];
block0[10] = append2_t[0];
block0[11] = append2_t[1];
block0[12] = append2_t[2];
block0[13] = append2_t[3];
block0[14] = append3_t[0];
block0[15] = append3_t[1];
block1[ 0] = append3_t[2];
block1[ 1] = append3_t[3];
block1[ 2] = append4_t[0];
block1[ 3] = append4_t[1];
block1[ 4] = append4_t[2];
@ -225,22 +226,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 4] = append0_t[1];
block0[ 5] = append0_t[2];
block0[ 6] = append0_t[3];
block0[ 7] = append1_t[0];
block0[ 8] = append1_t[1];
block0[ 9] = append1_t[2];
block0[10] = append1_t[3];
block0[11] = append2_t[0];
block0[12] = append2_t[1];
block0[13] = append2_t[2];
block0[14] = append2_t[3];
block0[15] = append3_t[0];
block1[ 0] = append3_t[1];
block1[ 1] = append3_t[2];
block1[ 2] = append3_t[3];
block1[ 3] = append4_t[0];
block1[ 4] = append4_t[1];
block1[ 5] = append4_t[2];
@ -251,22 +248,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 5] = append0_t[1];
block0[ 6] = append0_t[2];
block0[ 7] = append0_t[3];
block0[ 8] = append1_t[0];
block0[ 9] = append1_t[1];
block0[10] = append1_t[2];
block0[11] = append1_t[3];
block0[12] = append2_t[0];
block0[13] = append2_t[1];
block0[14] = append2_t[2];
block0[15] = append2_t[3];
block1[ 0] = append3_t[0];
block1[ 1] = append3_t[1];
block1[ 2] = append3_t[2];
block1[ 3] = append3_t[3];
block1[ 4] = append4_t[0];
block1[ 5] = append4_t[1];
block1[ 6] = append4_t[2];
@ -277,22 +270,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 6] = append0_t[1];
block0[ 7] = append0_t[2];
block0[ 8] = append0_t[3];
block0[ 9] = append1_t[0];
block0[10] = append1_t[1];
block0[11] = append1_t[2];
block0[12] = append1_t[3];
block0[13] = append2_t[0];
block0[14] = append2_t[1];
block0[15] = append2_t[2];
block1[ 0] = append2_t[3];
block1[ 1] = append3_t[0];
block1[ 2] = append3_t[1];
block1[ 3] = append3_t[2];
block1[ 4] = append3_t[3];
block1[ 5] = append4_t[0];
block1[ 6] = append4_t[1];
block1[ 7] = append4_t[2];
@ -303,22 +292,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 7] = append0_t[1];
block0[ 8] = append0_t[2];
block0[ 9] = append0_t[3];
block0[10] = append1_t[0];
block0[11] = append1_t[1];
block0[12] = append1_t[2];
block0[13] = append1_t[3];
block0[14] = append2_t[0];
block0[15] = append2_t[1];
block1[ 0] = append2_t[2];
block1[ 1] = append2_t[3];
block1[ 2] = append3_t[0];
block1[ 3] = append3_t[1];
block1[ 4] = append3_t[2];
block1[ 5] = append3_t[3];
block1[ 6] = append4_t[0];
block1[ 7] = append4_t[1];
block1[ 8] = append4_t[2];
@ -329,22 +314,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 8] = append0_t[1];
block0[ 9] = append0_t[2];
block0[10] = append0_t[3];
block0[11] = append1_t[0];
block0[12] = append1_t[1];
block0[13] = append1_t[2];
block0[14] = append1_t[3];
block0[15] = append2_t[0];
block1[ 0] = append2_t[1];
block1[ 1] = append2_t[2];
block1[ 2] = append2_t[3];
block1[ 3] = append3_t[0];
block1[ 4] = append3_t[1];
block1[ 5] = append3_t[2];
block1[ 6] = append3_t[3];
block1[ 7] = append4_t[0];
block1[ 8] = append4_t[1];
block1[ 9] = append4_t[2];
@ -355,22 +336,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 9] = append0_t[1];
block0[10] = append0_t[2];
block0[11] = append0_t[3];
block0[12] = append1_t[0];
block0[13] = append1_t[1];
block0[14] = append1_t[2];
block0[15] = append1_t[3];
block1[ 0] = append2_t[0];
block1[ 1] = append2_t[1];
block1[ 2] = append2_t[2];
block1[ 3] = append2_t[3];
block1[ 4] = append3_t[0];
block1[ 5] = append3_t[1];
block1[ 6] = append3_t[2];
block1[ 7] = append3_t[3];
block1[ 8] = append4_t[0];
block1[ 9] = append4_t[1];
block1[10] = append4_t[2];
@ -381,22 +358,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[10] = append0_t[1];
block0[11] = append0_t[2];
block0[12] = append0_t[3];
block0[13] = append1_t[0];
block0[14] = append1_t[1];
block0[15] = append1_t[2];
block1[ 0] = append1_t[3];
block1[ 1] = append2_t[0];
block1[ 2] = append2_t[1];
block1[ 3] = append2_t[2];
block1[ 4] = append2_t[3];
block1[ 5] = append3_t[0];
block1[ 6] = append3_t[1];
block1[ 7] = append3_t[2];
block1[ 8] = append3_t[3];
block1[ 9] = append4_t[0];
block1[10] = append4_t[1];
block1[11] = append4_t[2];
@ -407,22 +380,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[11] = append0_t[1];
block0[12] = append0_t[2];
block0[13] = append0_t[3];
block0[14] = append1_t[0];
block0[15] = append1_t[1];
block1[ 0] = append1_t[2];
block1[ 1] = append1_t[3];
block1[ 2] = append2_t[0];
block1[ 3] = append2_t[1];
block1[ 4] = append2_t[2];
block1[ 5] = append2_t[3];
block1[ 6] = append3_t[0];
block1[ 7] = append3_t[1];
block1[ 8] = append3_t[2];
block1[ 9] = append3_t[3];
block1[10] = append4_t[0];
block1[11] = append4_t[1];
block1[12] = append4_t[2];
@ -433,22 +402,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[12] = append0_t[1];
block0[13] = append0_t[2];
block0[14] = append0_t[3];
block0[15] = append1_t[0];
block1[ 0] = append1_t[1];
block1[ 1] = append1_t[2];
block1[ 2] = append1_t[3];
block1[ 3] = append2_t[0];
block1[ 4] = append2_t[1];
block1[ 5] = append2_t[2];
block1[ 6] = append2_t[3];
block1[ 7] = append3_t[0];
block1[ 8] = append3_t[1];
block1[ 9] = append3_t[2];
block1[10] = append3_t[3];
block1[11] = append4_t[0];
block1[12] = append4_t[1];
block1[13] = append4_t[2];
@ -459,22 +424,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[13] = append0_t[1];
block0[14] = append0_t[2];
block0[15] = append0_t[3];
block1[ 0] = append1_t[0];
block1[ 1] = append1_t[1];
block1[ 2] = append1_t[2];
block1[ 3] = append1_t[3];
block1[ 4] = append2_t[0];
block1[ 5] = append2_t[1];
block1[ 6] = append2_t[2];
block1[ 7] = append2_t[3];
block1[ 8] = append3_t[0];
block1[ 9] = append3_t[1];
block1[10] = append3_t[2];
block1[11] = append3_t[3];
block1[12] = append4_t[0];
block1[13] = append4_t[1];
block1[14] = append4_t[2];
@ -485,22 +446,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[14] = append0_t[1];
block0[15] = append0_t[2];
block1[ 0] = append0_t[3];
block1[ 1] = append1_t[0];
block1[ 2] = append1_t[1];
block1[ 3] = append1_t[2];
block1[ 4] = append1_t[3];
block1[ 5] = append2_t[0];
block1[ 6] = append2_t[1];
block1[ 7] = append2_t[2];
block1[ 8] = append2_t[3];
block1[ 9] = append3_t[0];
block1[10] = append3_t[1];
block1[11] = append3_t[2];
block1[12] = append3_t[3];
block1[13] = append4_t[0];
block1[14] = append4_t[1];
block1[15] = append4_t[2];
@ -510,22 +467,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[15] = append0_t[1];
block1[ 0] = append0_t[2];
block1[ 1] = append0_t[3];
block1[ 2] = append1_t[0];
block1[ 3] = append1_t[1];
block1[ 4] = append1_t[2];
block1[ 5] = append1_t[3];
block1[ 6] = append2_t[0];
block1[ 7] = append2_t[1];
block1[ 8] = append2_t[2];
block1[ 9] = append2_t[3];
block1[10] = append3_t[0];
block1[11] = append3_t[1];
block1[12] = append3_t[2];
block1[13] = append3_t[3];
block1[14] = append4_t[0];
block1[15] = append4_t[1];
break;
@ -534,22 +487,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 0] = append0_t[1];
block1[ 1] = append0_t[2];
block1[ 2] = append0_t[3];
block1[ 3] = append1_t[1];
block1[ 4] = append1_t[2];
block1[ 5] = append1_t[3];
block1[ 6] = append1_t[0];
block1[ 7] = append2_t[0];
block1[ 8] = append2_t[1];
block1[ 9] = append2_t[2];
block1[10] = append2_t[3];
block1[11] = append3_t[0];
block1[12] = append3_t[1];
block1[13] = append3_t[2];
block1[14] = append3_t[3];
block1[15] = append4_t[0];
break;
@ -557,17 +506,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 1] = append0_t[1];
block1[ 2] = append0_t[2];
block1[ 3] = append0_t[3];
block1[ 4] = append1_t[0];
block1[ 5] = append1_t[1];
block1[ 6] = append1_t[2];
block1[ 7] = append1_t[3];
block1[ 8] = append2_t[0];
block1[ 9] = append2_t[1];
block1[10] = append2_t[2];
block1[11] = append2_t[3];
block1[12] = append3_t[0];
block1[13] = append3_t[1];
block1[14] = append3_t[2];
@ -578,17 +524,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 2] = append0_t[1];
block1[ 3] = append0_t[2];
block1[ 4] = append0_t[3];
block1[ 5] = append1_t[0];
block1[ 6] = append1_t[1];
block1[ 7] = append1_t[2];
block1[ 8] = append1_t[3];
block1[ 9] = append2_t[0];
block1[10] = append2_t[1];
block1[11] = append2_t[2];
block1[12] = append2_t[3];
block1[13] = append3_t[0];
block1[14] = append3_t[1];
block1[15] = append3_t[2];
@ -598,17 +541,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 3] = append0_t[1];
block1[ 4] = append0_t[2];
block1[ 5] = append0_t[3];
block1[ 6] = append1_t[0];
block1[ 7] = append1_t[1];
block1[ 8] = append1_t[2];
block1[ 9] = append1_t[3];
block1[10] = append2_t[0];
block1[11] = append2_t[1];
block1[12] = append2_t[2];
block1[13] = append2_t[3];
block1[14] = append3_t[0];
block1[15] = append3_t[1];
break;
@ -617,17 +557,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 4] = append0_t[1];
block1[ 5] = append0_t[2];
block1[ 6] = append0_t[3];
block1[ 7] = append1_t[0];
block1[ 8] = append1_t[1];
block1[ 9] = append1_t[2];
block1[10] = append1_t[3];
block1[11] = append2_t[0];
block1[12] = append2_t[1];
block1[13] = append2_t[2];
block1[14] = append2_t[3];
block1[15] = append3_t[0];
break;
@ -635,12 +572,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 5] = append0_t[1];
block1[ 6] = append0_t[2];
block1[ 7] = append0_t[3];
block1[ 8] = append1_t[0];
block1[ 9] = append1_t[1];
block1[10] = append1_t[2];
block1[11] = append1_t[3];
block1[12] = append2_t[0];
block1[13] = append2_t[1];
block1[14] = append2_t[2];
@ -651,12 +586,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 6] = append0_t[1];
block1[ 7] = append0_t[2];
block1[ 8] = append0_t[3];
block1[ 9] = append1_t[0];
block1[10] = append1_t[1];
block1[11] = append1_t[2];
block1[12] = append1_t[3];
block1[13] = append2_t[0];
block1[14] = append2_t[1];
block1[15] = append2_t[2];
@ -666,12 +599,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 7] = append0_t[1];
block1[ 8] = append0_t[2];
block1[ 9] = append0_t[3];
block1[10] = append1_t[0];
block1[11] = append1_t[1];
block1[12] = append1_t[2];
block1[13] = append1_t[3];
block1[14] = append2_t[0];
block1[15] = append2_t[1];
break;
@ -680,12 +611,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 8] = append0_t[1];
block1[ 9] = append0_t[2];
block1[10] = append0_t[3];
block1[11] = append1_t[0];
block1[12] = append1_t[1];
block1[13] = append1_t[2];
block1[14] = append1_t[3];
block1[15] = append2_t[0];
break;
@ -693,7 +622,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 9] = append0_t[1];
block1[10] = append0_t[2];
block1[11] = append0_t[3];
block1[12] = append1_t[0];
block1[13] = append1_t[1];
block1[14] = append1_t[2];
@ -704,7 +632,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[10] = append0_t[1];
block1[11] = append0_t[2];
block1[12] = append0_t[3];
block1[13] = append1_t[0];
block1[14] = append1_t[1];
block1[15] = append1_t[2];
@ -714,7 +641,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[11] = append0_t[1];
block1[12] = append0_t[2];
block1[13] = append0_t[3];
block1[14] = append1_t[0];
block1[15] = append1_t[1];
break;
@ -723,7 +649,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[12] = append0_t[1];
block1[13] = append0_t[2];
block1[14] = append0_t[3];
block1[15] = append1_t[0];
break;
@ -743,7 +668,7 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
break;
}
u32 new_len = block_len + append_len;
u32 new_len = offset + append_len;
return new_len;
}

View File

@ -25,76 +25,79 @@
#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
#endif
u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len)
u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len)
{
const u32 mod = block_len & 3;
const u32 div = block_len / 4;
const u32 mod = offset & 3;
const u32 div = offset / 4;
#if defined IS_AMD || defined IS_GENERIC
const int offset_minus_4 = 4 - mod;
u32x append00 = swap32 (append0[0]);
u32x append01 = swap32 (append0[1]);
u32x append02 = swap32 (append0[2]);
u32x append03 = swap32 (append0[3]);
u32x append10 = swap32 (append1[0]);
u32x append11 = swap32 (append1[1]);
u32x append12 = swap32 (append1[2]);
u32x append13 = swap32 (append1[3]);
u32x append20 = swap32 (append2[0]);
u32x append21 = swap32 (append2[1]);
u32x append22 = swap32 (append2[2]);
u32x append23 = swap32 (append2[3]);
u32x append30 = swap32 (append3[0]);
u32x append31 = swap32 (append3[1]);
u32x append32 = swap32 (append3[2]);
u32x append33 = swap32 (append3[3]);
u32x append0_t[4];
append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4);
append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4);
append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4);
append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4);
u32x append1_t[4];
append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4);
append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4);
append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4);
append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4);
u32x append2_t[4];
append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4);
append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4);
append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4);
append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4);
u32x append3_t[4];
append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4);
append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4);
append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4);
append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4);
u32x append4_t[4];
append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4);
append0_t[0] = amd_bytealign ( 0, append00, offset);
append0_t[1] = amd_bytealign (append00, append01, offset);
append0_t[2] = amd_bytealign (append01, append02, offset);
append0_t[3] = amd_bytealign (append02, append03, offset);
append1_t[0] = amd_bytealign (append03, append10, offset);
append1_t[1] = amd_bytealign (append10, append11, offset);
append1_t[2] = amd_bytealign (append11, append12, offset);
append1_t[3] = amd_bytealign (append12, append13, offset);
append2_t[0] = amd_bytealign (append13, append20, offset);
append2_t[1] = amd_bytealign (append20, append21, offset);
append2_t[2] = amd_bytealign (append21, append22, offset);
append2_t[3] = amd_bytealign (append22, append23, offset);
append3_t[0] = amd_bytealign (append23, append30, offset);
append3_t[1] = amd_bytealign (append30, append31, offset);
append3_t[2] = amd_bytealign (append31, append32, offset);
append3_t[3] = amd_bytealign (append32, append33, offset);
append4_t[0] = amd_bytealign (append33, 0, offset);
append4_t[1] = 0;
append4_t[2] = 0;
append4_t[3] = 0;
if (mod == 0)
{
append0_t[0] = append0[0];
append0_t[1] = append0[1];
append0_t[2] = append0[2];
append0_t[3] = append0[3];
append0_t[0] = swap32 (append0_t[0]);
append0_t[1] = swap32 (append0_t[1]);
append0_t[2] = swap32 (append0_t[2]);
append0_t[3] = swap32 (append0_t[3]);
append1_t[0] = swap32 (append1_t[0]);
append1_t[1] = swap32 (append1_t[1]);
append1_t[2] = swap32 (append1_t[2]);
append1_t[3] = swap32 (append1_t[3]);
append2_t[0] = swap32 (append2_t[0]);
append2_t[1] = swap32 (append2_t[1]);
append2_t[2] = swap32 (append2_t[2]);
append2_t[3] = swap32 (append2_t[3]);
append3_t[0] = swap32 (append3_t[0]);
append3_t[1] = swap32 (append3_t[1]);
append3_t[2] = swap32 (append3_t[2]);
append3_t[3] = swap32 (append3_t[3]);
append4_t[0] = swap32 (append4_t[0]);
append4_t[1] = swap32 (append4_t[1]);
append4_t[2] = swap32 (append4_t[2]);
append4_t[3] = swap32 (append4_t[3]);
append1_t[0] = append1[0];
append1_t[1] = append1[1];
append1_t[2] = append1[2];
append1_t[3] = append1[3];
append2_t[0] = append2[0];
append2_t[1] = append2[1];
append2_t[2] = append2[2];
append2_t[3] = append2[3];
append3_t[0] = append3[0];
append3_t[1] = append3[1];
append3_t[2] = append3[2];
append3_t[3] = append3[3];
append4_t[0] = 0;
append4_t[1] = 0;
append4_t[2] = 0;
append4_t[3] = 0;
}
#endif
#ifdef IS_NV
@ -103,40 +106,50 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
u32x append00 = append0[0];
u32x append01 = append0[1];
u32x append02 = append0[2];
u32x append03 = append0[3];
u32x append10 = append1[0];
u32x append11 = append1[1];
u32x append12 = append1[2];
u32x append13 = append1[3];
u32x append20 = append2[0];
u32x append21 = append2[1];
u32x append22 = append2[2];
u32x append23 = append2[3];
u32x append30 = append3[0];
u32x append31 = append3[1];
u32x append32 = append3[2];
u32x append33 = append3[3];
u32x append0_t[4];
append0_t[0] = __byte_perm ( 0, append0[0], selector);
append0_t[1] = __byte_perm (append0[0], append0[1], selector);
append0_t[2] = __byte_perm (append0[1], append0[2], selector);
append0_t[3] = __byte_perm (append0[2], append0[3], selector);
u32x append1_t[4];
append1_t[0] = __byte_perm (append0[3], append1[0], selector);
append1_t[1] = __byte_perm (append1[0], append1[1], selector);
append1_t[2] = __byte_perm (append1[1], append1[2], selector);
append1_t[3] = __byte_perm (append1[2], append1[3], selector);
u32x append2_t[4];
append2_t[0] = __byte_perm (append1[3], append2[0], selector);
append2_t[1] = __byte_perm (append2[0], append2[1], selector);
append2_t[2] = __byte_perm (append2[1], append2[2], selector);
append2_t[3] = __byte_perm (append2[2], append2[3], selector);
u32x append3_t[4];
append3_t[0] = __byte_perm (append2[3], append3[0], selector);
append3_t[1] = __byte_perm (append3[0], append3[1], selector);
append3_t[2] = __byte_perm (append3[1], append3[2], selector);
append3_t[3] = __byte_perm (append3[2], append3[3], selector);
u32x append4_t[4];
append4_t[0] = __byte_perm (append3[3], 0, selector);
append0_t[0] = __byte_perm ( 0, append00, selector);
append0_t[1] = __byte_perm (append00, append01, selector);
append0_t[2] = __byte_perm (append01, append02, selector);
append0_t[3] = __byte_perm (append02, append03, selector);
append1_t[0] = __byte_perm (append03, append10, selector);
append1_t[1] = __byte_perm (append10, append11, selector);
append1_t[2] = __byte_perm (append11, append12, selector);
append1_t[3] = __byte_perm (append12, append13, selector);
append2_t[0] = __byte_perm (append13, append20, selector);
append2_t[1] = __byte_perm (append20, append21, selector);
append2_t[2] = __byte_perm (append21, append22, selector);
append2_t[3] = __byte_perm (append22, append23, selector);
append3_t[0] = __byte_perm (append23, append30, selector);
append3_t[1] = __byte_perm (append30, append31, selector);
append3_t[2] = __byte_perm (append31, append32, selector);
append3_t[3] = __byte_perm (append32, append33, selector);
append4_t[0] = __byte_perm (append33, 0, selector);
append4_t[1] = 0;
append4_t[2] = 0;
append4_t[3] = 0;
#endif
switch (div)
@ -145,22 +158,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 1] = append0_t[1];
block0[ 2] = append0_t[2];
block0[ 3] = append0_t[3];
block0[ 4] = append1_t[0];
block0[ 5] = append1_t[1];
block0[ 6] = append1_t[2];
block0[ 7] = append1_t[3];
block0[ 8] = append2_t[0];
block0[ 9] = append2_t[1];
block0[10] = append2_t[2];
block0[11] = append2_t[3];
block0[12] = append3_t[0];
block0[13] = append3_t[1];
block0[14] = append3_t[2];
block0[15] = append3_t[3];
block1[ 0] = append4_t[0];
block1[ 1] = append4_t[1];
block1[ 2] = append4_t[2];
@ -171,22 +180,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 2] = append0_t[1];
block0[ 3] = append0_t[2];
block0[ 4] = append0_t[3];
block0[ 5] = append1_t[0];
block0[ 6] = append1_t[1];
block0[ 7] = append1_t[2];
block0[ 8] = append1_t[3];
block0[ 9] = append2_t[0];
block0[10] = append2_t[1];
block0[11] = append2_t[2];
block0[12] = append2_t[3];
block0[13] = append3_t[0];
block0[14] = append3_t[1];
block0[15] = append3_t[2];
block1[ 0] = append3_t[3];
block1[ 1] = append4_t[0];
block1[ 2] = append4_t[1];
block1[ 3] = append4_t[2];
@ -197,22 +202,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 3] = append0_t[1];
block0[ 4] = append0_t[2];
block0[ 5] = append0_t[3];
block0[ 6] = append1_t[0];
block0[ 7] = append1_t[1];
block0[ 8] = append1_t[2];
block0[ 9] = append1_t[3];
block0[10] = append2_t[0];
block0[11] = append2_t[1];
block0[12] = append2_t[2];
block0[13] = append2_t[3];
block0[14] = append3_t[0];
block0[15] = append3_t[1];
block1[ 0] = append3_t[2];
block1[ 1] = append3_t[3];
block1[ 2] = append4_t[0];
block1[ 3] = append4_t[1];
block1[ 4] = append4_t[2];
@ -223,22 +224,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 4] = append0_t[1];
block0[ 5] = append0_t[2];
block0[ 6] = append0_t[3];
block0[ 7] = append1_t[0];
block0[ 8] = append1_t[1];
block0[ 9] = append1_t[2];
block0[10] = append1_t[3];
block0[11] = append2_t[0];
block0[12] = append2_t[1];
block0[13] = append2_t[2];
block0[14] = append2_t[3];
block0[15] = append3_t[0];
block1[ 0] = append3_t[1];
block1[ 1] = append3_t[2];
block1[ 2] = append3_t[3];
block1[ 3] = append4_t[0];
block1[ 4] = append4_t[1];
block1[ 5] = append4_t[2];
@ -249,22 +246,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 5] = append0_t[1];
block0[ 6] = append0_t[2];
block0[ 7] = append0_t[3];
block0[ 8] = append1_t[0];
block0[ 9] = append1_t[1];
block0[10] = append1_t[2];
block0[11] = append1_t[3];
block0[12] = append2_t[0];
block0[13] = append2_t[1];
block0[14] = append2_t[2];
block0[15] = append2_t[3];
block1[ 0] = append3_t[0];
block1[ 1] = append3_t[1];
block1[ 2] = append3_t[2];
block1[ 3] = append3_t[3];
block1[ 4] = append4_t[0];
block1[ 5] = append4_t[1];
block1[ 6] = append4_t[2];
@ -275,22 +268,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 6] = append0_t[1];
block0[ 7] = append0_t[2];
block0[ 8] = append0_t[3];
block0[ 9] = append1_t[0];
block0[10] = append1_t[1];
block0[11] = append1_t[2];
block0[12] = append1_t[3];
block0[13] = append2_t[0];
block0[14] = append2_t[1];
block0[15] = append2_t[2];
block1[ 0] = append2_t[3];
block1[ 1] = append3_t[0];
block1[ 2] = append3_t[1];
block1[ 3] = append3_t[2];
block1[ 4] = append3_t[3];
block1[ 5] = append4_t[0];
block1[ 6] = append4_t[1];
block1[ 7] = append4_t[2];
@ -301,22 +290,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 7] = append0_t[1];
block0[ 8] = append0_t[2];
block0[ 9] = append0_t[3];
block0[10] = append1_t[0];
block0[11] = append1_t[1];
block0[12] = append1_t[2];
block0[13] = append1_t[3];
block0[14] = append2_t[0];
block0[15] = append2_t[1];
block1[ 0] = append2_t[2];
block1[ 1] = append2_t[3];
block1[ 2] = append3_t[0];
block1[ 3] = append3_t[1];
block1[ 4] = append3_t[2];
block1[ 5] = append3_t[3];
block1[ 6] = append4_t[0];
block1[ 7] = append4_t[1];
block1[ 8] = append4_t[2];
@ -327,22 +312,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 8] = append0_t[1];
block0[ 9] = append0_t[2];
block0[10] = append0_t[3];
block0[11] = append1_t[0];
block0[12] = append1_t[1];
block0[13] = append1_t[2];
block0[14] = append1_t[3];
block0[15] = append2_t[0];
block1[ 0] = append2_t[1];
block1[ 1] = append2_t[2];
block1[ 2] = append2_t[3];
block1[ 3] = append3_t[0];
block1[ 4] = append3_t[1];
block1[ 5] = append3_t[2];
block1[ 6] = append3_t[3];
block1[ 7] = append4_t[0];
block1[ 8] = append4_t[1];
block1[ 9] = append4_t[2];
@ -353,22 +334,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 9] = append0_t[1];
block0[10] = append0_t[2];
block0[11] = append0_t[3];
block0[12] = append1_t[0];
block0[13] = append1_t[1];
block0[14] = append1_t[2];
block0[15] = append1_t[3];
block1[ 0] = append2_t[0];
block1[ 1] = append2_t[1];
block1[ 2] = append2_t[2];
block1[ 3] = append2_t[3];
block1[ 4] = append3_t[0];
block1[ 5] = append3_t[1];
block1[ 6] = append3_t[2];
block1[ 7] = append3_t[3];
block1[ 8] = append4_t[0];
block1[ 9] = append4_t[1];
block1[10] = append4_t[2];
@ -379,22 +356,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[10] = append0_t[1];
block0[11] = append0_t[2];
block0[12] = append0_t[3];
block0[13] = append1_t[0];
block0[14] = append1_t[1];
block0[15] = append1_t[2];
block1[ 0] = append1_t[3];
block1[ 1] = append2_t[0];
block1[ 2] = append2_t[1];
block1[ 3] = append2_t[2];
block1[ 4] = append2_t[3];
block1[ 5] = append3_t[0];
block1[ 6] = append3_t[1];
block1[ 7] = append3_t[2];
block1[ 8] = append3_t[3];
block1[ 9] = append4_t[0];
block1[10] = append4_t[1];
block1[11] = append4_t[2];
@ -405,22 +378,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[11] = append0_t[1];
block0[12] = append0_t[2];
block0[13] = append0_t[3];
block0[14] = append1_t[0];
block0[15] = append1_t[1];
block1[ 0] = append1_t[2];
block1[ 1] = append1_t[3];
block1[ 2] = append2_t[0];
block1[ 3] = append2_t[1];
block1[ 4] = append2_t[2];
block1[ 5] = append2_t[3];
block1[ 6] = append3_t[0];
block1[ 7] = append3_t[1];
block1[ 8] = append3_t[2];
block1[ 9] = append3_t[3];
block1[10] = append4_t[0];
block1[11] = append4_t[1];
block1[12] = append4_t[2];
@ -431,22 +400,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[12] = append0_t[1];
block0[13] = append0_t[2];
block0[14] = append0_t[3];
block0[15] = append1_t[0];
block1[ 0] = append1_t[1];
block1[ 1] = append1_t[2];
block1[ 2] = append1_t[3];
block1[ 3] = append2_t[0];
block1[ 4] = append2_t[1];
block1[ 5] = append2_t[2];
block1[ 6] = append2_t[3];
block1[ 7] = append3_t[0];
block1[ 8] = append3_t[1];
block1[ 9] = append3_t[2];
block1[10] = append3_t[3];
block1[11] = append4_t[0];
block1[12] = append4_t[1];
block1[13] = append4_t[2];
@ -457,22 +422,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[13] = append0_t[1];
block0[14] = append0_t[2];
block0[15] = append0_t[3];
block1[ 0] = append1_t[0];
block1[ 1] = append1_t[1];
block1[ 2] = append1_t[2];
block1[ 3] = append1_t[3];
block1[ 4] = append2_t[0];
block1[ 5] = append2_t[1];
block1[ 6] = append2_t[2];
block1[ 7] = append2_t[3];
block1[ 8] = append3_t[0];
block1[ 9] = append3_t[1];
block1[10] = append3_t[2];
block1[11] = append3_t[3];
block1[12] = append4_t[0];
block1[13] = append4_t[1];
block1[14] = append4_t[2];
@ -483,22 +444,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[14] = append0_t[1];
block0[15] = append0_t[2];
block1[ 0] = append0_t[3];
block1[ 1] = append1_t[0];
block1[ 2] = append1_t[1];
block1[ 3] = append1_t[2];
block1[ 4] = append1_t[3];
block1[ 5] = append2_t[0];
block1[ 6] = append2_t[1];
block1[ 7] = append2_t[2];
block1[ 8] = append2_t[3];
block1[ 9] = append3_t[0];
block1[10] = append3_t[1];
block1[11] = append3_t[2];
block1[12] = append3_t[3];
block1[13] = append4_t[0];
block1[14] = append4_t[1];
block1[15] = append4_t[2];
@ -508,22 +465,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[15] = append0_t[1];
block1[ 0] = append0_t[2];
block1[ 1] = append0_t[3];
block1[ 2] = append1_t[0];
block1[ 3] = append1_t[1];
block1[ 4] = append1_t[2];
block1[ 5] = append1_t[3];
block1[ 6] = append2_t[0];
block1[ 7] = append2_t[1];
block1[ 8] = append2_t[2];
block1[ 9] = append2_t[3];
block1[10] = append3_t[0];
block1[11] = append3_t[1];
block1[12] = append3_t[2];
block1[13] = append3_t[3];
block1[14] = append4_t[0];
block1[15] = append4_t[1];
break;
@ -532,22 +485,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 0] = append0_t[1];
block1[ 1] = append0_t[2];
block1[ 2] = append0_t[3];
block1[ 3] = append1_t[1];
block1[ 4] = append1_t[2];
block1[ 5] = append1_t[3];
block1[ 6] = append1_t[0];
block1[ 7] = append2_t[0];
block1[ 8] = append2_t[1];
block1[ 9] = append2_t[2];
block1[10] = append2_t[3];
block1[11] = append3_t[0];
block1[12] = append3_t[1];
block1[13] = append3_t[2];
block1[14] = append3_t[3];
block1[15] = append4_t[0];
break;
@ -555,17 +504,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 1] = append0_t[1];
block1[ 2] = append0_t[2];
block1[ 3] = append0_t[3];
block1[ 4] = append1_t[0];
block1[ 5] = append1_t[1];
block1[ 6] = append1_t[2];
block1[ 7] = append1_t[3];
block1[ 8] = append2_t[0];
block1[ 9] = append2_t[1];
block1[10] = append2_t[2];
block1[11] = append2_t[3];
block1[12] = append3_t[0];
block1[13] = append3_t[1];
block1[14] = append3_t[2];
@ -576,17 +522,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 2] = append0_t[1];
block1[ 3] = append0_t[2];
block1[ 4] = append0_t[3];
block1[ 5] = append1_t[0];
block1[ 6] = append1_t[1];
block1[ 7] = append1_t[2];
block1[ 8] = append1_t[3];
block1[ 9] = append2_t[0];
block1[10] = append2_t[1];
block1[11] = append2_t[2];
block1[12] = append2_t[3];
block1[13] = append3_t[0];
block1[14] = append3_t[1];
block1[15] = append3_t[2];
@ -596,17 +539,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 3] = append0_t[1];
block1[ 4] = append0_t[2];
block1[ 5] = append0_t[3];
block1[ 6] = append1_t[0];
block1[ 7] = append1_t[1];
block1[ 8] = append1_t[2];
block1[ 9] = append1_t[3];
block1[10] = append2_t[0];
block1[11] = append2_t[1];
block1[12] = append2_t[2];
block1[13] = append2_t[3];
block1[14] = append3_t[0];
block1[15] = append3_t[1];
break;
@ -615,17 +555,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 4] = append0_t[1];
block1[ 5] = append0_t[2];
block1[ 6] = append0_t[3];
block1[ 7] = append1_t[0];
block1[ 8] = append1_t[1];
block1[ 9] = append1_t[2];
block1[10] = append1_t[3];
block1[11] = append2_t[0];
block1[12] = append2_t[1];
block1[13] = append2_t[2];
block1[14] = append2_t[3];
block1[15] = append3_t[0];
break;
@ -633,12 +570,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 5] = append0_t[1];
block1[ 6] = append0_t[2];
block1[ 7] = append0_t[3];
block1[ 8] = append1_t[0];
block1[ 9] = append1_t[1];
block1[10] = append1_t[2];
block1[11] = append1_t[3];
block1[12] = append2_t[0];
block1[13] = append2_t[1];
block1[14] = append2_t[2];
@ -649,12 +584,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 6] = append0_t[1];
block1[ 7] = append0_t[2];
block1[ 8] = append0_t[3];
block1[ 9] = append1_t[0];
block1[10] = append1_t[1];
block1[11] = append1_t[2];
block1[12] = append1_t[3];
block1[13] = append2_t[0];
block1[14] = append2_t[1];
block1[15] = append2_t[2];
@ -664,12 +597,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 7] = append0_t[1];
block1[ 8] = append0_t[2];
block1[ 9] = append0_t[3];
block1[10] = append1_t[0];
block1[11] = append1_t[1];
block1[12] = append1_t[2];
block1[13] = append1_t[3];
block1[14] = append2_t[0];
block1[15] = append2_t[1];
break;
@ -678,12 +609,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 8] = append0_t[1];
block1[ 9] = append0_t[2];
block1[10] = append0_t[3];
block1[11] = append1_t[0];
block1[12] = append1_t[1];
block1[13] = append1_t[2];
block1[14] = append1_t[3];
block1[15] = append2_t[0];
break;
@ -691,7 +620,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 9] = append0_t[1];
block1[10] = append0_t[2];
block1[11] = append0_t[3];
block1[12] = append1_t[0];
block1[13] = append1_t[1];
block1[14] = append1_t[2];
@ -702,7 +630,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[10] = append0_t[1];
block1[11] = append0_t[2];
block1[12] = append0_t[3];
block1[13] = append1_t[0];
block1[14] = append1_t[1];
block1[15] = append1_t[2];
@ -712,7 +639,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[11] = append0_t[1];
block1[12] = append0_t[2];
block1[13] = append0_t[3];
block1[14] = append1_t[0];
block1[15] = append1_t[1];
break;
@ -721,7 +647,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[12] = append0_t[1];
block1[13] = append0_t[2];
block1[14] = append0_t[3];
block1[15] = append1_t[0];
break;
@ -741,7 +666,7 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
break;
}
u32 new_len = block_len + append_len;
u32 new_len = offset + append_len;
return new_len;
}

View File

@ -25,76 +25,79 @@
#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7], l_bin2asc[(i).s8], l_bin2asc[(i).s9], l_bin2asc[(i).sa], l_bin2asc[(i).sb], l_bin2asc[(i).sc], l_bin2asc[(i).sd], l_bin2asc[(i).se], l_bin2asc[(i).sf])
#endif
u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len)
u32 memcat32 (u32x block0[16], u32x block1[16], const u32 offset, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len)
{
const u32 mod = block_len & 3;
const u32 div = block_len / 4;
const u32 mod = offset & 3;
const u32 div = offset / 4;
#if defined IS_AMD || defined IS_GENERIC
const int offset_minus_4 = 4 - mod;
u32x append00 = swap32 (append0[0]);
u32x append01 = swap32 (append0[1]);
u32x append02 = swap32 (append0[2]);
u32x append03 = swap32 (append0[3]);
u32x append10 = swap32 (append1[0]);
u32x append11 = swap32 (append1[1]);
u32x append12 = swap32 (append1[2]);
u32x append13 = swap32 (append1[3]);
u32x append20 = swap32 (append2[0]);
u32x append21 = swap32 (append2[1]);
u32x append22 = swap32 (append2[2]);
u32x append23 = swap32 (append2[3]);
u32x append30 = swap32 (append3[0]);
u32x append31 = swap32 (append3[1]);
u32x append32 = swap32 (append3[2]);
u32x append33 = swap32 (append3[3]);
u32x append0_t[4];
append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4);
append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4);
append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4);
append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4);
u32x append1_t[4];
append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4);
append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4);
append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4);
append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4);
u32x append2_t[4];
append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4);
append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4);
append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4);
append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4);
u32x append3_t[4];
append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4);
append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4);
append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4);
append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4);
u32x append4_t[4];
append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4);
append0_t[0] = amd_bytealign ( 0, append00, offset);
append0_t[1] = amd_bytealign (append00, append01, offset);
append0_t[2] = amd_bytealign (append01, append02, offset);
append0_t[3] = amd_bytealign (append02, append03, offset);
append1_t[0] = amd_bytealign (append03, append10, offset);
append1_t[1] = amd_bytealign (append10, append11, offset);
append1_t[2] = amd_bytealign (append11, append12, offset);
append1_t[3] = amd_bytealign (append12, append13, offset);
append2_t[0] = amd_bytealign (append13, append20, offset);
append2_t[1] = amd_bytealign (append20, append21, offset);
append2_t[2] = amd_bytealign (append21, append22, offset);
append2_t[3] = amd_bytealign (append22, append23, offset);
append3_t[0] = amd_bytealign (append23, append30, offset);
append3_t[1] = amd_bytealign (append30, append31, offset);
append3_t[2] = amd_bytealign (append31, append32, offset);
append3_t[3] = amd_bytealign (append32, append33, offset);
append4_t[0] = amd_bytealign (append33, 0, offset);
append4_t[1] = 0;
append4_t[2] = 0;
append4_t[3] = 0;
if (mod == 0)
{
append0_t[0] = append0[0];
append0_t[1] = append0[1];
append0_t[2] = append0[2];
append0_t[3] = append0[3];
append0_t[0] = swap32 (append0_t[0]);
append0_t[1] = swap32 (append0_t[1]);
append0_t[2] = swap32 (append0_t[2]);
append0_t[3] = swap32 (append0_t[3]);
append1_t[0] = swap32 (append1_t[0]);
append1_t[1] = swap32 (append1_t[1]);
append1_t[2] = swap32 (append1_t[2]);
append1_t[3] = swap32 (append1_t[3]);
append2_t[0] = swap32 (append2_t[0]);
append2_t[1] = swap32 (append2_t[1]);
append2_t[2] = swap32 (append2_t[2]);
append2_t[3] = swap32 (append2_t[3]);
append3_t[0] = swap32 (append3_t[0]);
append3_t[1] = swap32 (append3_t[1]);
append3_t[2] = swap32 (append3_t[2]);
append3_t[3] = swap32 (append3_t[3]);
append4_t[0] = swap32 (append4_t[0]);
append4_t[1] = swap32 (append4_t[1]);
append4_t[2] = swap32 (append4_t[2]);
append4_t[3] = swap32 (append4_t[3]);
append1_t[0] = append1[0];
append1_t[1] = append1[1];
append1_t[2] = append1[2];
append1_t[3] = append1[3];
append2_t[0] = append2[0];
append2_t[1] = append2[1];
append2_t[2] = append2[2];
append2_t[3] = append2[3];
append3_t[0] = append3[0];
append3_t[1] = append3[1];
append3_t[2] = append3[2];
append3_t[3] = append3[3];
append4_t[0] = 0;
append4_t[1] = 0;
append4_t[2] = 0;
append4_t[3] = 0;
}
#endif
#ifdef IS_NV
@ -103,40 +106,50 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
u32x append00 = append0[0];
u32x append01 = append0[1];
u32x append02 = append0[2];
u32x append03 = append0[3];
u32x append10 = append1[0];
u32x append11 = append1[1];
u32x append12 = append1[2];
u32x append13 = append1[3];
u32x append20 = append2[0];
u32x append21 = append2[1];
u32x append22 = append2[2];
u32x append23 = append2[3];
u32x append30 = append3[0];
u32x append31 = append3[1];
u32x append32 = append3[2];
u32x append33 = append3[3];
u32x append0_t[4];
append0_t[0] = __byte_perm ( 0, append0[0], selector);
append0_t[1] = __byte_perm (append0[0], append0[1], selector);
append0_t[2] = __byte_perm (append0[1], append0[2], selector);
append0_t[3] = __byte_perm (append0[2], append0[3], selector);
u32x append1_t[4];
append1_t[0] = __byte_perm (append0[3], append1[0], selector);
append1_t[1] = __byte_perm (append1[0], append1[1], selector);
append1_t[2] = __byte_perm (append1[1], append1[2], selector);
append1_t[3] = __byte_perm (append1[2], append1[3], selector);
u32x append2_t[4];
append2_t[0] = __byte_perm (append1[3], append2[0], selector);
append2_t[1] = __byte_perm (append2[0], append2[1], selector);
append2_t[2] = __byte_perm (append2[1], append2[2], selector);
append2_t[3] = __byte_perm (append2[2], append2[3], selector);
u32x append3_t[4];
append3_t[0] = __byte_perm (append2[3], append3[0], selector);
append3_t[1] = __byte_perm (append3[0], append3[1], selector);
append3_t[2] = __byte_perm (append3[1], append3[2], selector);
append3_t[3] = __byte_perm (append3[2], append3[3], selector);
u32x append4_t[4];
append4_t[0] = __byte_perm (append3[3], 0, selector);
append0_t[0] = __byte_perm ( 0, append00, selector);
append0_t[1] = __byte_perm (append00, append01, selector);
append0_t[2] = __byte_perm (append01, append02, selector);
append0_t[3] = __byte_perm (append02, append03, selector);
append1_t[0] = __byte_perm (append03, append10, selector);
append1_t[1] = __byte_perm (append10, append11, selector);
append1_t[2] = __byte_perm (append11, append12, selector);
append1_t[3] = __byte_perm (append12, append13, selector);
append2_t[0] = __byte_perm (append13, append20, selector);
append2_t[1] = __byte_perm (append20, append21, selector);
append2_t[2] = __byte_perm (append21, append22, selector);
append2_t[3] = __byte_perm (append22, append23, selector);
append3_t[0] = __byte_perm (append23, append30, selector);
append3_t[1] = __byte_perm (append30, append31, selector);
append3_t[2] = __byte_perm (append31, append32, selector);
append3_t[3] = __byte_perm (append32, append33, selector);
append4_t[0] = __byte_perm (append33, 0, selector);
append4_t[1] = 0;
append4_t[2] = 0;
append4_t[3] = 0;
#endif
switch (div)
@ -145,22 +158,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 1] = append0_t[1];
block0[ 2] = append0_t[2];
block0[ 3] = append0_t[3];
block0[ 4] = append1_t[0];
block0[ 5] = append1_t[1];
block0[ 6] = append1_t[2];
block0[ 7] = append1_t[3];
block0[ 8] = append2_t[0];
block0[ 9] = append2_t[1];
block0[10] = append2_t[2];
block0[11] = append2_t[3];
block0[12] = append3_t[0];
block0[13] = append3_t[1];
block0[14] = append3_t[2];
block0[15] = append3_t[3];
block1[ 0] = append4_t[0];
block1[ 1] = append4_t[1];
block1[ 2] = append4_t[2];
@ -171,22 +180,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 2] = append0_t[1];
block0[ 3] = append0_t[2];
block0[ 4] = append0_t[3];
block0[ 5] = append1_t[0];
block0[ 6] = append1_t[1];
block0[ 7] = append1_t[2];
block0[ 8] = append1_t[3];
block0[ 9] = append2_t[0];
block0[10] = append2_t[1];
block0[11] = append2_t[2];
block0[12] = append2_t[3];
block0[13] = append3_t[0];
block0[14] = append3_t[1];
block0[15] = append3_t[2];
block1[ 0] = append3_t[3];
block1[ 1] = append4_t[0];
block1[ 2] = append4_t[1];
block1[ 3] = append4_t[2];
@ -197,22 +202,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 3] = append0_t[1];
block0[ 4] = append0_t[2];
block0[ 5] = append0_t[3];
block0[ 6] = append1_t[0];
block0[ 7] = append1_t[1];
block0[ 8] = append1_t[2];
block0[ 9] = append1_t[3];
block0[10] = append2_t[0];
block0[11] = append2_t[1];
block0[12] = append2_t[2];
block0[13] = append2_t[3];
block0[14] = append3_t[0];
block0[15] = append3_t[1];
block1[ 0] = append3_t[2];
block1[ 1] = append3_t[3];
block1[ 2] = append4_t[0];
block1[ 3] = append4_t[1];
block1[ 4] = append4_t[2];
@ -223,22 +224,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 4] = append0_t[1];
block0[ 5] = append0_t[2];
block0[ 6] = append0_t[3];
block0[ 7] = append1_t[0];
block0[ 8] = append1_t[1];
block0[ 9] = append1_t[2];
block0[10] = append1_t[3];
block0[11] = append2_t[0];
block0[12] = append2_t[1];
block0[13] = append2_t[2];
block0[14] = append2_t[3];
block0[15] = append3_t[0];
block1[ 0] = append3_t[1];
block1[ 1] = append3_t[2];
block1[ 2] = append3_t[3];
block1[ 3] = append4_t[0];
block1[ 4] = append4_t[1];
block1[ 5] = append4_t[2];
@ -249,22 +246,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 5] = append0_t[1];
block0[ 6] = append0_t[2];
block0[ 7] = append0_t[3];
block0[ 8] = append1_t[0];
block0[ 9] = append1_t[1];
block0[10] = append1_t[2];
block0[11] = append1_t[3];
block0[12] = append2_t[0];
block0[13] = append2_t[1];
block0[14] = append2_t[2];
block0[15] = append2_t[3];
block1[ 0] = append3_t[0];
block1[ 1] = append3_t[1];
block1[ 2] = append3_t[2];
block1[ 3] = append3_t[3];
block1[ 4] = append4_t[0];
block1[ 5] = append4_t[1];
block1[ 6] = append4_t[2];
@ -275,22 +268,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 6] = append0_t[1];
block0[ 7] = append0_t[2];
block0[ 8] = append0_t[3];
block0[ 9] = append1_t[0];
block0[10] = append1_t[1];
block0[11] = append1_t[2];
block0[12] = append1_t[3];
block0[13] = append2_t[0];
block0[14] = append2_t[1];
block0[15] = append2_t[2];
block1[ 0] = append2_t[3];
block1[ 1] = append3_t[0];
block1[ 2] = append3_t[1];
block1[ 3] = append3_t[2];
block1[ 4] = append3_t[3];
block1[ 5] = append4_t[0];
block1[ 6] = append4_t[1];
block1[ 7] = append4_t[2];
@ -301,22 +290,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 7] = append0_t[1];
block0[ 8] = append0_t[2];
block0[ 9] = append0_t[3];
block0[10] = append1_t[0];
block0[11] = append1_t[1];
block0[12] = append1_t[2];
block0[13] = append1_t[3];
block0[14] = append2_t[0];
block0[15] = append2_t[1];
block1[ 0] = append2_t[2];
block1[ 1] = append2_t[3];
block1[ 2] = append3_t[0];
block1[ 3] = append3_t[1];
block1[ 4] = append3_t[2];
block1[ 5] = append3_t[3];
block1[ 6] = append4_t[0];
block1[ 7] = append4_t[1];
block1[ 8] = append4_t[2];
@ -327,22 +312,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 8] = append0_t[1];
block0[ 9] = append0_t[2];
block0[10] = append0_t[3];
block0[11] = append1_t[0];
block0[12] = append1_t[1];
block0[13] = append1_t[2];
block0[14] = append1_t[3];
block0[15] = append2_t[0];
block1[ 0] = append2_t[1];
block1[ 1] = append2_t[2];
block1[ 2] = append2_t[3];
block1[ 3] = append3_t[0];
block1[ 4] = append3_t[1];
block1[ 5] = append3_t[2];
block1[ 6] = append3_t[3];
block1[ 7] = append4_t[0];
block1[ 8] = append4_t[1];
block1[ 9] = append4_t[2];
@ -353,22 +334,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[ 9] = append0_t[1];
block0[10] = append0_t[2];
block0[11] = append0_t[3];
block0[12] = append1_t[0];
block0[13] = append1_t[1];
block0[14] = append1_t[2];
block0[15] = append1_t[3];
block1[ 0] = append2_t[0];
block1[ 1] = append2_t[1];
block1[ 2] = append2_t[2];
block1[ 3] = append2_t[3];
block1[ 4] = append3_t[0];
block1[ 5] = append3_t[1];
block1[ 6] = append3_t[2];
block1[ 7] = append3_t[3];
block1[ 8] = append4_t[0];
block1[ 9] = append4_t[1];
block1[10] = append4_t[2];
@ -379,22 +356,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[10] = append0_t[1];
block0[11] = append0_t[2];
block0[12] = append0_t[3];
block0[13] = append1_t[0];
block0[14] = append1_t[1];
block0[15] = append1_t[2];
block1[ 0] = append1_t[3];
block1[ 1] = append2_t[0];
block1[ 2] = append2_t[1];
block1[ 3] = append2_t[2];
block1[ 4] = append2_t[3];
block1[ 5] = append3_t[0];
block1[ 6] = append3_t[1];
block1[ 7] = append3_t[2];
block1[ 8] = append3_t[3];
block1[ 9] = append4_t[0];
block1[10] = append4_t[1];
block1[11] = append4_t[2];
@ -405,22 +378,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[11] = append0_t[1];
block0[12] = append0_t[2];
block0[13] = append0_t[3];
block0[14] = append1_t[0];
block0[15] = append1_t[1];
block1[ 0] = append1_t[2];
block1[ 1] = append1_t[3];
block1[ 2] = append2_t[0];
block1[ 3] = append2_t[1];
block1[ 4] = append2_t[2];
block1[ 5] = append2_t[3];
block1[ 6] = append3_t[0];
block1[ 7] = append3_t[1];
block1[ 8] = append3_t[2];
block1[ 9] = append3_t[3];
block1[10] = append4_t[0];
block1[11] = append4_t[1];
block1[12] = append4_t[2];
@ -431,22 +400,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[12] = append0_t[1];
block0[13] = append0_t[2];
block0[14] = append0_t[3];
block0[15] = append1_t[0];
block1[ 0] = append1_t[1];
block1[ 1] = append1_t[2];
block1[ 2] = append1_t[3];
block1[ 3] = append2_t[0];
block1[ 4] = append2_t[1];
block1[ 5] = append2_t[2];
block1[ 6] = append2_t[3];
block1[ 7] = append3_t[0];
block1[ 8] = append3_t[1];
block1[ 9] = append3_t[2];
block1[10] = append3_t[3];
block1[11] = append4_t[0];
block1[12] = append4_t[1];
block1[13] = append4_t[2];
@ -457,22 +422,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[13] = append0_t[1];
block0[14] = append0_t[2];
block0[15] = append0_t[3];
block1[ 0] = append1_t[0];
block1[ 1] = append1_t[1];
block1[ 2] = append1_t[2];
block1[ 3] = append1_t[3];
block1[ 4] = append2_t[0];
block1[ 5] = append2_t[1];
block1[ 6] = append2_t[2];
block1[ 7] = append2_t[3];
block1[ 8] = append3_t[0];
block1[ 9] = append3_t[1];
block1[10] = append3_t[2];
block1[11] = append3_t[3];
block1[12] = append4_t[0];
block1[13] = append4_t[1];
block1[14] = append4_t[2];
@ -483,22 +444,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[14] = append0_t[1];
block0[15] = append0_t[2];
block1[ 0] = append0_t[3];
block1[ 1] = append1_t[0];
block1[ 2] = append1_t[1];
block1[ 3] = append1_t[2];
block1[ 4] = append1_t[3];
block1[ 5] = append2_t[0];
block1[ 6] = append2_t[1];
block1[ 7] = append2_t[2];
block1[ 8] = append2_t[3];
block1[ 9] = append3_t[0];
block1[10] = append3_t[1];
block1[11] = append3_t[2];
block1[12] = append3_t[3];
block1[13] = append4_t[0];
block1[14] = append4_t[1];
block1[15] = append4_t[2];
@ -508,22 +465,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block0[15] = append0_t[1];
block1[ 0] = append0_t[2];
block1[ 1] = append0_t[3];
block1[ 2] = append1_t[0];
block1[ 3] = append1_t[1];
block1[ 4] = append1_t[2];
block1[ 5] = append1_t[3];
block1[ 6] = append2_t[0];
block1[ 7] = append2_t[1];
block1[ 8] = append2_t[2];
block1[ 9] = append2_t[3];
block1[10] = append3_t[0];
block1[11] = append3_t[1];
block1[12] = append3_t[2];
block1[13] = append3_t[3];
block1[14] = append4_t[0];
block1[15] = append4_t[1];
break;
@ -532,22 +485,18 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 0] = append0_t[1];
block1[ 1] = append0_t[2];
block1[ 2] = append0_t[3];
block1[ 3] = append1_t[1];
block1[ 4] = append1_t[2];
block1[ 5] = append1_t[3];
block1[ 6] = append1_t[0];
block1[ 7] = append2_t[0];
block1[ 8] = append2_t[1];
block1[ 9] = append2_t[2];
block1[10] = append2_t[3];
block1[11] = append3_t[0];
block1[12] = append3_t[1];
block1[13] = append3_t[2];
block1[14] = append3_t[3];
block1[15] = append4_t[0];
break;
@ -555,17 +504,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 1] = append0_t[1];
block1[ 2] = append0_t[2];
block1[ 3] = append0_t[3];
block1[ 4] = append1_t[0];
block1[ 5] = append1_t[1];
block1[ 6] = append1_t[2];
block1[ 7] = append1_t[3];
block1[ 8] = append2_t[0];
block1[ 9] = append2_t[1];
block1[10] = append2_t[2];
block1[11] = append2_t[3];
block1[12] = append3_t[0];
block1[13] = append3_t[1];
block1[14] = append3_t[2];
@ -576,17 +522,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 2] = append0_t[1];
block1[ 3] = append0_t[2];
block1[ 4] = append0_t[3];
block1[ 5] = append1_t[0];
block1[ 6] = append1_t[1];
block1[ 7] = append1_t[2];
block1[ 8] = append1_t[3];
block1[ 9] = append2_t[0];
block1[10] = append2_t[1];
block1[11] = append2_t[2];
block1[12] = append2_t[3];
block1[13] = append3_t[0];
block1[14] = append3_t[1];
block1[15] = append3_t[2];
@ -596,17 +539,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 3] = append0_t[1];
block1[ 4] = append0_t[2];
block1[ 5] = append0_t[3];
block1[ 6] = append1_t[0];
block1[ 7] = append1_t[1];
block1[ 8] = append1_t[2];
block1[ 9] = append1_t[3];
block1[10] = append2_t[0];
block1[11] = append2_t[1];
block1[12] = append2_t[2];
block1[13] = append2_t[3];
block1[14] = append3_t[0];
block1[15] = append3_t[1];
break;
@ -615,17 +555,14 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 4] = append0_t[1];
block1[ 5] = append0_t[2];
block1[ 6] = append0_t[3];
block1[ 7] = append1_t[0];
block1[ 8] = append1_t[1];
block1[ 9] = append1_t[2];
block1[10] = append1_t[3];
block1[11] = append2_t[0];
block1[12] = append2_t[1];
block1[13] = append2_t[2];
block1[14] = append2_t[3];
block1[15] = append3_t[0];
break;
@ -633,12 +570,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 5] = append0_t[1];
block1[ 6] = append0_t[2];
block1[ 7] = append0_t[3];
block1[ 8] = append1_t[0];
block1[ 9] = append1_t[1];
block1[10] = append1_t[2];
block1[11] = append1_t[3];
block1[12] = append2_t[0];
block1[13] = append2_t[1];
block1[14] = append2_t[2];
@ -649,12 +584,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 6] = append0_t[1];
block1[ 7] = append0_t[2];
block1[ 8] = append0_t[3];
block1[ 9] = append1_t[0];
block1[10] = append1_t[1];
block1[11] = append1_t[2];
block1[12] = append1_t[3];
block1[13] = append2_t[0];
block1[14] = append2_t[1];
block1[15] = append2_t[2];
@ -664,12 +597,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 7] = append0_t[1];
block1[ 8] = append0_t[2];
block1[ 9] = append0_t[3];
block1[10] = append1_t[0];
block1[11] = append1_t[1];
block1[12] = append1_t[2];
block1[13] = append1_t[3];
block1[14] = append2_t[0];
block1[15] = append2_t[1];
break;
@ -678,12 +609,10 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 8] = append0_t[1];
block1[ 9] = append0_t[2];
block1[10] = append0_t[3];
block1[11] = append1_t[0];
block1[12] = append1_t[1];
block1[13] = append1_t[2];
block1[14] = append1_t[3];
block1[15] = append2_t[0];
break;
@ -691,7 +620,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[ 9] = append0_t[1];
block1[10] = append0_t[2];
block1[11] = append0_t[3];
block1[12] = append1_t[0];
block1[13] = append1_t[1];
block1[14] = append1_t[2];
@ -702,7 +630,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[10] = append0_t[1];
block1[11] = append0_t[2];
block1[12] = append0_t[3];
block1[13] = append1_t[0];
block1[14] = append1_t[1];
block1[15] = append1_t[2];
@ -712,7 +639,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[11] = append0_t[1];
block1[12] = append0_t[2];
block1[13] = append0_t[3];
block1[14] = append1_t[0];
block1[15] = append1_t[1];
break;
@ -721,7 +647,6 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
block1[12] = append0_t[1];
block1[13] = append0_t[2];
block1[14] = append0_t[3];
block1[15] = append1_t[0];
break;
@ -741,7 +666,7 @@ u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x
break;
}
u32 new_len = block_len + append_len;
u32 new_len = offset + append_len;
return new_len;
}

View File

@ -146,6 +146,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16])
u32x tmp15;
u32x tmp16;
#if defined IS_AMD || defined IS_GENERIC
tmp00 = amd_bytealign ( 0, carry[ 0], offset);
tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = amd_bytealign (carry[ 9], carry[10], offset);
tmp11 = amd_bytealign (carry[10], carry[11], offset);
tmp12 = amd_bytealign (carry[11], carry[12], offset);
tmp13 = amd_bytealign (carry[12], carry[13], offset);
tmp14 = amd_bytealign (carry[13], carry[14], offset);
tmp15 = amd_bytealign (carry[14], carry[15], offset);
tmp16 = amd_bytealign (carry[15], 0, offset);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
@ -168,26 +188,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16])
tmp16 = __byte_perm ( 0, carry[15], selector);
#endif
#if defined IS_AMD || defined IS_GENERIC
tmp00 = amd_bytealign ( 0, carry[ 0], offset);
tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = amd_bytealign (carry[ 9], carry[10], offset);
tmp11 = amd_bytealign (carry[10], carry[11], offset);
tmp12 = amd_bytealign (carry[11], carry[12], offset);
tmp13 = amd_bytealign (carry[12], carry[13], offset);
tmp14 = amd_bytealign (carry[13], carry[14], offset);
tmp15 = amd_bytealign (carry[14], carry[15], offset);
tmp16 = amd_bytealign (carry[15], 0, offset);
#endif
carry[ 0] = 0;
carry[ 1] = 0;
carry[ 2] = 0;

View File

@ -144,6 +144,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16])
u32x tmp15;
u32x tmp16;
#if defined IS_AMD || defined IS_GENERIC
tmp00 = amd_bytealign ( 0, carry[ 0], offset);
tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = amd_bytealign (carry[ 9], carry[10], offset);
tmp11 = amd_bytealign (carry[10], carry[11], offset);
tmp12 = amd_bytealign (carry[11], carry[12], offset);
tmp13 = amd_bytealign (carry[12], carry[13], offset);
tmp14 = amd_bytealign (carry[13], carry[14], offset);
tmp15 = amd_bytealign (carry[14], carry[15], offset);
tmp16 = amd_bytealign (carry[15], 0, offset);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
@ -166,26 +186,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16])
tmp16 = __byte_perm ( 0, carry[15], selector);
#endif
#if defined IS_AMD || defined IS_GENERIC
tmp00 = amd_bytealign ( 0, carry[ 0], offset);
tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = amd_bytealign (carry[ 9], carry[10], offset);
tmp11 = amd_bytealign (carry[10], carry[11], offset);
tmp12 = amd_bytealign (carry[11], carry[12], offset);
tmp13 = amd_bytealign (carry[12], carry[13], offset);
tmp14 = amd_bytealign (carry[13], carry[14], offset);
tmp15 = amd_bytealign (carry[14], carry[15], offset);
tmp16 = amd_bytealign (carry[15], 0, offset);
#endif
carry[ 0] = 0;
carry[ 1] = 0;
carry[ 2] = 0;

View File

@ -143,6 +143,26 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16])
u32x tmp15;
u32x tmp16;
#if defined IS_AMD || defined IS_GENERIC
tmp00 = amd_bytealign ( 0, carry[ 0], offset);
tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = amd_bytealign (carry[ 9], carry[10], offset);
tmp11 = amd_bytealign (carry[10], carry[11], offset);
tmp12 = amd_bytealign (carry[11], carry[12], offset);
tmp13 = amd_bytealign (carry[12], carry[13], offset);
tmp14 = amd_bytealign (carry[13], carry[14], offset);
tmp15 = amd_bytealign (carry[14], carry[15], offset);
tmp16 = amd_bytealign (carry[15], 0, offset);
#endif
#ifdef IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
@ -165,26 +185,6 @@ void memcat64c_be (u32x block[16], const u32 offset, u32x carry[16])
tmp16 = __byte_perm ( 0, carry[15], selector);
#endif
#if defined IS_AMD || defined IS_GENERIC
tmp00 = amd_bytealign ( 0, carry[ 0], offset);
tmp01 = amd_bytealign (carry[ 0], carry[ 1], offset);
tmp02 = amd_bytealign (carry[ 1], carry[ 2], offset);
tmp03 = amd_bytealign (carry[ 2], carry[ 3], offset);
tmp04 = amd_bytealign (carry[ 3], carry[ 4], offset);
tmp05 = amd_bytealign (carry[ 4], carry[ 5], offset);
tmp06 = amd_bytealign (carry[ 5], carry[ 6], offset);
tmp07 = amd_bytealign (carry[ 6], carry[ 7], offset);
tmp08 = amd_bytealign (carry[ 7], carry[ 8], offset);
tmp09 = amd_bytealign (carry[ 8], carry[ 9], offset);
tmp10 = amd_bytealign (carry[ 9], carry[10], offset);
tmp11 = amd_bytealign (carry[10], carry[11], offset);
tmp12 = amd_bytealign (carry[11], carry[12], offset);
tmp13 = amd_bytealign (carry[12], carry[13], offset);
tmp14 = amd_bytealign (carry[13], carry[14], offset);
tmp15 = amd_bytealign (carry[14], carry[15], offset);
tmp16 = amd_bytealign (carry[15], 0, offset);
#endif
carry[ 0] = 0;
carry[ 1] = 0;
carry[ 2] = 0;