From a53d9e09dea343c4c06665d68db8429786a72df7 Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 4 Aug 2017 14:12:58 +0200 Subject: [PATCH] Fix some issue with offset_minus_4 --- OpenCL/inc_common.cl | 34 ++++++----------- OpenCL/inc_rp.cl | 6 +-- OpenCL/m00500-optimized.cl | 60 +++++++++++------------------- OpenCL/m01600-optimized.cl | 60 +++++++++++------------------- OpenCL/m05800-optimized.cl | 16 +++----- OpenCL/m05800.cl | 16 +++----- OpenCL/m06300-optimized.cl | 60 +++++++++++------------------- OpenCL/m07400-optimized.cl | 76 ++++++++++++++++---------------------- 8 files changed, 119 insertions(+), 209 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index f50e073f9..4673a5bc4 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -2939,11 +2939,11 @@ void append_0x80_1x16 (u32x w[16], const u32 offset) void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: @@ -3469,8 +3469,6 @@ void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -3802,7 +3800,7 @@ void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x { const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; switch (offset / 4) { @@ -6184,11 +6182,11 @@ void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: @@ -7226,8 +7224,6 @@ void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -15044,11 +15040,11 @@ void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: @@ -15574,8 +15570,6 @@ void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], c #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -15907,7 +15901,7 @@ void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3 { const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; switch (offset / 4) { @@ -18289,11 +18283,11 @@ void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3 void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: @@ -19331,8 +19325,6 @@ void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) @@ -25596,11 +25588,11 @@ void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u3 void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { case 0: @@ -32294,8 +32286,6 @@ void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; switch (offset / 4) diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl index c50ec4a67..170ec5385 100644 --- a/OpenCL/inc_rp.cl +++ b/OpenCL/inc_rp.cl @@ -756,11 +756,11 @@ void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0 void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) { - #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; - const int offset_minus_4 = 4 - offset; + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC u32 s0 = 0; u32 s1 = 0; u32 s2 = 0; @@ -895,8 +895,6 @@ void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; u32 s0 = 0; diff --git a/OpenCL/m00500-optimized.cl b/OpenCL/m00500-optimized.cl index 81cd4f8bf..cb89b0b7b 100644 --- a/OpenCL/m00500-optimized.cl +++ b/OpenCL/m00500-optimized.cl @@ -114,7 +114,7 @@ void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 digest[3] += d; } -void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -122,19 +122,18 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -142,13 +141,9 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const tmp3 = tmp4; tmp4 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -156,10 +151,9 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -226,7 +220,7 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const } } -void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -234,19 +228,18 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -254,13 +247,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp3 = tmp4; tmp4 = 0x80; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -268,10 +257,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0x80, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -338,44 +326,38 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c } } -void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) +void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[2]) { u32 tmp0; u32 tmp1; u32 tmp2; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); tmp1 = __byte_perm (append[0], append[1], selector); tmp2 = __byte_perm (append[1], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { diff --git a/OpenCL/m01600-optimized.cl b/OpenCL/m01600-optimized.cl index fd5c4f293..d624b4678 100644 --- a/OpenCL/m01600-optimized.cl +++ b/OpenCL/m01600-optimized.cl @@ -113,7 +113,7 @@ void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 digest[3] += d; } -void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -121,19 +121,18 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -141,13 +140,9 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const tmp3 = tmp4; tmp4 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -155,10 +150,9 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -225,7 +219,7 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const } } -void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -233,19 +227,18 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -253,13 +246,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp3 = tmp4; tmp4 = 0x80; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -267,10 +256,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0x80, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -337,44 +325,38 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c } } -void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) +void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[2]) { u32 tmp0; u32 tmp1; u32 tmp2; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); tmp1 = __byte_perm (append[0], append[1], selector); tmp2 = __byte_perm (append[1], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { diff --git a/OpenCL/m05800-optimized.cl b/OpenCL/m05800-optimized.cl index b84015eef..774b97327 100644 --- a/OpenCL/m05800-optimized.cl +++ b/OpenCL/m05800-optimized.cl @@ -2111,10 +2111,11 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 u32 tmp4; u32 tmp5; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (offset & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); @@ -2122,9 +2123,7 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp4 = amd_bytealign (append[4], append[3], offset_minus_4); tmp5 = amd_bytealign ( 0, append[4], offset_minus_4); - const u32 mod = offset & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -2133,13 +2132,9 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp4 = tmp5; tmp5 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (offset & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -2148,7 +2143,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], append[4], selector); tmp5 = __byte_perm (append[4], 0, selector); - #endif const u32 div = offset / 4; diff --git a/OpenCL/m05800.cl b/OpenCL/m05800.cl index 3e9a9c72d..ec03f6218 100644 --- a/OpenCL/m05800.cl +++ b/OpenCL/m05800.cl @@ -2112,10 +2112,11 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 u32 tmp4; u32 tmp5; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (offset & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); @@ -2123,9 +2124,7 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp4 = amd_bytealign (append[4], append[3], offset_minus_4); tmp5 = amd_bytealign ( 0, append[4], offset_minus_4); - const u32 mod = offset & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -2134,13 +2133,9 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp4 = tmp5; tmp5 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (offset & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -2149,7 +2144,6 @@ void append_salt (u32 w0[4], u32 w1[4], u32 w2[4], const u32 append[5], const u3 tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], append[4], selector); tmp5 = __byte_perm (append[4], 0, selector); - #endif const u32 div = offset / 4; diff --git a/OpenCL/m06300-optimized.cl b/OpenCL/m06300-optimized.cl index 0645a12fd..6b3980270 100644 --- a/OpenCL/m06300-optimized.cl +++ b/OpenCL/m06300-optimized.cl @@ -110,7 +110,7 @@ void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 digest[3] += d; } -void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -118,19 +118,18 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -138,13 +137,9 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const tmp3 = tmp4; tmp4 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -152,10 +147,9 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -222,7 +216,7 @@ void memcat16 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const } } -void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[4]) +void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[4]) { u32 tmp0; u32 tmp1; @@ -230,19 +224,18 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c u32 tmp3; u32 tmp4; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -250,13 +243,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp3 = tmp4; tmp4 = 0x80; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -264,10 +253,9 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c tmp2 = __byte_perm (append[1], append[2], selector); tmp3 = __byte_perm (append[2], append[3], selector); tmp4 = __byte_perm (append[3], 0x80, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { @@ -334,44 +322,38 @@ void memcat16_x80 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], c } } -void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 block_len, const u32 append[2]) +void memcat8 (u32 block0[4], u32 block1[4], u32 block2[4], u32 block3[4], const u32 offset, const u32 append[2]) { u32 tmp0; u32 tmp1; u32 tmp2; + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_AMD || defined IS_GENERIC - - const int offset_minus_4 = 4 - (block_len & 3); - tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign ( 0, append[1], offset_minus_4); - const u32 mod = block_len & 3; - - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; tmp2 = 0; } - #endif #ifdef IS_NV - - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); tmp1 = __byte_perm (append[0], append[1], selector); tmp2 = __byte_perm (append[1], 0, selector); - #endif - const u32 div = block_len / 4; + const u32 div = offset / 4; switch (div) { diff --git a/OpenCL/m07400-optimized.cl b/OpenCL/m07400-optimized.cl index e3a9fbda5..3483c57ec 100644 --- a/OpenCL/m07400-optimized.cl +++ b/OpenCL/m07400-optimized.cl @@ -190,27 +190,26 @@ void bswap8 (u32 block[16]) block[ 7] = swap32 (block[ 7]); } -u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 append_len) +u32 memcat16 (u32 block[16], const u32 offset, const u32 append[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + const int offset_mod_4 = offset & 3; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -221,8 +220,6 @@ u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -232,7 +229,7 @@ u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 tmp4 = __byte_perm (append[3], 0, selector); #endif - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -322,32 +319,31 @@ u32 memcat16 (u32 block[16], const u32 block_len, const u32 append[4], const u32 break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; return new_len; } -u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u32 append_len, u32 digest[8]) +u32 memcat16c (u32 block[16], const u32 offset, const u32 append[4], const u32 append_len, u32 digest[8]) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + const int offset_mod_4 = offset & 3; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -358,8 +354,6 @@ u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u3 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -371,7 +365,7 @@ u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u3 u32 carry[4] = { 0, 0, 0, 0 }; - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -471,7 +465,7 @@ u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u3 break; } - u32 new_len = block_len + append_len; + u32 new_len = offset + append_len; if (new_len >= 64) { @@ -490,27 +484,26 @@ u32 memcat16c (u32 block[16], const u32 block_len, const u32 append[4], const u3 return new_len; } -u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], const u32 append_len) +u32 memcat20 (u32 block[20], const u32 offset, const u32 append[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + const int offset_mod_4 = offset & 3; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0, append[3], offset_minus_4); - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -521,8 +514,6 @@ u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], const u32 #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -532,7 +523,7 @@ u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], const u32 tmp4 = __byte_perm (append[3], 0, selector); #endif - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -632,30 +623,29 @@ u32 memcat20 (u32 block[20], const u32 block_len, const u32 append[4], const u32 break; } - return block_len + append_len; + return offset + append_len; } -u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4], const u32 append_len) +u32 memcat20_x80 (u32 block[20], const u32 offset, const u32 append[4], const u32 append_len) { - const u32 mod = block_len & 3; - const u32 div = block_len / 4; - u32 tmp0; u32 tmp1; u32 tmp2; u32 tmp3; u32 tmp4; - #if defined IS_AMD || defined IS_GENERIC - const int offset_minus_4 = 4 - block_len; + const int offset_mod_4 = offset & 3; + const int offset_minus_4 = 4 - offset_mod_4; + + #if defined IS_AMD || defined IS_GENERIC tmp0 = amd_bytealign (append[0], 0, offset_minus_4); tmp1 = amd_bytealign (append[1], append[0], offset_minus_4); tmp2 = amd_bytealign (append[2], append[1], offset_minus_4); tmp3 = amd_bytealign (append[3], append[2], offset_minus_4); tmp4 = amd_bytealign ( 0x80, append[3], offset_minus_4); - if (mod == 0) + if (offset_mod_4 == 0) { tmp0 = tmp1; tmp1 = tmp2; @@ -666,8 +656,6 @@ u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4], const #endif #ifdef IS_NV - const int offset_minus_4 = 4 - (block_len & 3); - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; tmp0 = __byte_perm ( 0, append[0], selector); @@ -677,7 +665,7 @@ u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4], const tmp4 = __byte_perm (append[3], 0x80, selector); #endif - switch (div) + switch (offset / 4) { case 0: block[ 0] |= tmp0; block[ 1] = tmp1; @@ -777,7 +765,7 @@ u32 memcat20_x80 (u32 block[20], const u32 block_len, const u32 append[4], const break; } - return block_len + append_len; + return offset + append_len; } __kernel void m07400_init (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global sha256crypt_tmp_t *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max)