From 5e34ec348e68a1efae45b32949162a7c2aebd713 Mon Sep 17 00:00:00 2001
From: jsteube
Date: Sat, 22 Jul 2017 18:05:18 +0200
Subject: [PATCH] Optimize kernels for ROCm 1.6

- Remove inline keywords
- Remove volatile keywords where they cause ROCm to slow down
- Replace DES functions (bitselect apparently no longer maps to BFI_INT)
---
 OpenCL/inc_common.cl          | 118 ++---
 OpenCL/inc_hash_md4.cl        |   4 +-
 OpenCL/inc_hash_md5.cl        |   4 +-
 OpenCL/inc_hash_ripemd160.cl  |   4 +-
 OpenCL/inc_hash_sha1.cl       |   4 +-
 OpenCL/inc_hash_sha224.cl     |   4 +-
 OpenCL/inc_hash_sha256.cl     |   4 +-
 OpenCL/inc_hash_sha384.cl     |   4 +-
 OpenCL/inc_hash_sha512.cl     |   4 +-
 OpenCL/inc_hash_whirlpool.cl  |   4 +-
 OpenCL/inc_rp.cl              | 106 ++--
 OpenCL/inc_simd.cl            |   6 +-
 OpenCL/inc_types.cl           | 134 +++--
 OpenCL/inc_vendor.cl          |   6 +-
 OpenCL/m01500_a3.cl           | 950 ++++++++++++++++++----------
 OpenCL/m02501.cl              |   4 +-
 OpenCL/m03000_a3.cl           | 905 ++++++++++++++++----------
 OpenCL/m14000_a3-optimized.cl | 897 ++++++++++++++++----------
 OpenCL/markov_be.cl           |   2 +-
 OpenCL/markov_le.cl           |   2 +-
 20 files changed, 1622 insertions(+), 1544 deletions(-)

diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 8d20c2040..f50e073f9 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -7,7 +7,7 @@ * pure scalar functions */ -inline int ffz (const u32 v) +int ffz (const u32 v) { #ifdef _unroll #pragma unroll @@ -22,7 +22,7 @@ inline int ffz (const u32 v) return -1; } -inline int hash_comp (const u32 d1[4], __global const u32 *d2) +int hash_comp (const u32 d1[4], __global const u32 *d2) { if (d1[3] > d2[DGST_R3]) return ( 1); if (d1[3] < d2[DGST_R3]) return (-1); @@ -36,7 +36,7 @@ inline int hash_comp (const u32 d1[4], __global const u32 *d2) return (0); } -inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global const digest_t *digests_buf) +int find_hash (const u32 digest[4], const u32 digests_cnt, __global const digest_t *digests_buf) { for (u32 l = 0, r = digests_cnt; r; r >>= 1) { @@ -59,12 +59,12 @@ inline int find_hash (const u32 digest[4], const u32 digests_cnt, __global const return (-1); } -inline u32 check_bitmap (__global const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest) +u32 check_bitmap (__global const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest) { return (bitmap[(digest >> bitmap_shift) & bitmap_mask] & (1 << (digest & 0x1f))); } -inline u32 check (const u32 digest[4], __global const u32 *bitmap_s1_a, __global const u32 *bitmap_s1_b, __global const u32 *bitmap_s1_c, __global const u32 *bitmap_s1_d, __global const u32 *bitmap_s2_a, __global const u32 *bitmap_s2_b, __global const u32 *bitmap_s2_c, __global const u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2) +u32 check (const u32 digest[4], __global const u32 *bitmap_s1_a, __global const u32 *bitmap_s1_b, __global const u32 *bitmap_s1_c, __global const u32 *bitmap_s1_d, __global const u32 *bitmap_s2_a, __global const u32 *bitmap_s2_b, __global const u32 *bitmap_s2_c, __global const u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2) { if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0); if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0); @@ -79,7 +79,7 @@ inline u32 check (const u32 digest[4], __global const u32 *bitmap_s1_a, __global return (1); } -inline void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, const u32
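The inc_common.cl hunks above, like most hunks in this patch, apply one mechanical change: drop the inline qualifier, keep the signature and body. Presumably ROCm 1.6 generates slower code for force-inlined helpers while still inlining plain functions where profitable. A minimal sketch of the pattern (helper_example is an illustrative name, not from the patch):

/* before: explicit inline, which ROCm 1.6 handles poorly */
inline u32 helper_example (const u32 v) { return v ^ 0xffffffff; }

/* after: same body, inlining decision left to the compiler */
u32 helper_example (const u32 v) { return v ^ 0xffffffff; }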
salt_pos, const u32 digests_cnt, const u32 digest_pos, const u32 hash_pos, const u32 gid, const u32 il_pos) +void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, const u32 salt_pos, const u32 digests_cnt, const u32 digest_pos, const u32 hash_pos, const u32 gid, const u32 il_pos) { const u32 idx = atomic_inc (d_result); @@ -100,7 +100,7 @@ inline void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, con plains_buf[idx].il_pos = il_pos; } -inline int count_char (const u32 *buf, const int elems, const u32 c) +int count_char (const u32 *buf, const int elems, const u32 c) { int r = 0; @@ -117,7 +117,7 @@ inline int count_char (const u32 *buf, const int elems, const u32 c) return r; } -inline float get_entropy (const u32 *buf, const int elems) +float get_entropy (const u32 *buf, const int elems) { const int length = elems * 4; @@ -144,7 +144,7 @@ inline float get_entropy (const u32 *buf, const int elems) * vector functions */ -inline void truncate_block_4x4_le (u32x w0[4], const u32 len) +void truncate_block_4x4_le (u32x w0[4], const u32 len) { switch (len) { @@ -254,7 +254,7 @@ inline void truncate_block_4x4_le (u32x w0[4], const u32 len) } } -inline void truncate_block_16x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) +void truncate_block_16x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) { switch (len) { @@ -1060,7 +1060,7 @@ inline void truncate_block_16x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[ } } -inline void truncate_block_4x4_be (u32x w0[4], const u32 len) +void truncate_block_4x4_be (u32x w0[4], const u32 len) { switch (len) { @@ -1170,7 +1170,7 @@ inline void truncate_block_4x4_be (u32x w0[4], const u32 len) } } -inline void truncate_block_16x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) +void truncate_block_16x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 len) { switch (len) { @@ -1976,7 +1976,7 @@ inline void truncate_block_16x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[ } } -inline void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) +void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) { #ifdef IS_NV out2[3] = __byte_perm (in[3], 0, 0x3727); @@ -2001,7 +2001,7 @@ inline void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) #endif } -inline void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) +void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) { #ifdef IS_NV out2[3] = __byte_perm (in[3], 0, 0x1707); @@ -2026,7 +2026,7 @@ inline void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) #endif } -inline void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) +void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) { #ifdef IS_NV out2[3] = __byte_perm (in[3], 0, 0x7372); @@ -2051,7 +2051,7 @@ inline void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) #endif } -inline void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) +void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) { #ifdef IS_NV out[0] = __byte_perm (in1[0], in1[1], 0x4602); @@ -2072,7 +2072,7 @@ inline void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) #endif } -inline void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4]) +void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4]) { #ifdef IS_NV out[0] = __byte_perm (in1[0], in1[1], 0x6420); @@ -2093,7 +2093,7 @@ inline void undo_utf16le (const u32x in1[4], const u32x 
in2[4], u32x out[4]) #endif } -inline void append_0x80_1x4 (u32x w0[4], const u32 offset) +void append_0x80_1x4 (u32x w0[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2103,7 +2103,7 @@ inline void append_0x80_1x4 (u32x w0[4], const u32 offset) w0[3] |= (offset >= 12) ? tmp : 0; } -inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) +void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2117,7 +2117,7 @@ inline void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) w1[3] |= (offset >= 28) ? tmp : 0; } -inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) +void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2135,7 +2135,7 @@ inline void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offse w2[3] |= (offset >= 44) ? tmp : 0; } -inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) +void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -2157,7 +2157,7 @@ inline void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], con w3[3] |= (offset >= 60) ? tmp : 0; } -inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) +void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { switch (offset) { @@ -2675,7 +2675,7 @@ inline void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32 } } -inline void append_0x80_1x16 (u32x w[16], const u32 offset) +void append_0x80_1x16 (u32x w[16], const u32 offset) { switch (offset) { @@ -2937,7 +2937,7 @@ inline void append_0x80_1x16 (u32x w[16], const u32 offset) } } -inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) +void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -3798,7 +3798,7 @@ inline void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x #endif } -inline void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) +void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) { const int offset_mod_4 = offset & 3; @@ -4600,7 +4600,7 @@ inline void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4] } } -inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) +void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -5255,7 +5255,7 @@ inline void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x #endif } -inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) +void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) { #if defined IS_AMD || 
defined IS_GENERIC switch (offset / 4) @@ -6182,7 +6182,7 @@ inline void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] #endif } -inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -7795,7 +7795,7 @@ inline void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], #endif } -inline void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -10114,7 +10114,7 @@ inline void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], #endif } -inline void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset) +void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -13489,7 +13489,7 @@ inline void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w #endif } -inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) +void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) { #if defined cl_amd_media_ops switch (salt_len) @@ -13678,7 +13678,7 @@ inline void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) #endif } -inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) +void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) { // would be nice to have optimization based on amd_bytealign as with _le counterpart @@ -13775,7 +13775,7 @@ inline void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) } } -inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) +void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) { #if defined cl_amd_media_ops switch (salt_len) @@ -14140,7 +14140,7 @@ inline void overwrite_at_le_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], #endif } -inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) +void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x wx, const u32 salt_len) { // would be nice to have optimization based on amd_bytealign as with _le counterpart @@ -14329,7 +14329,7 @@ inline void overwrite_at_be_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], * vector functions as scalar (for outer loop usage) */ -inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) { const u32 tmp = 0x01 << ((offset & 3) * 8); @@ -14343,7 +14343,7 @@ 
inline void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) w1[3] |= (offset >= 28) ? tmp : 0; } -inline void append_0x80_1x4_S (u32 w0[4], const u32 offset) +void append_0x80_1x4_S (u32 w0[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14353,7 +14353,7 @@ inline void append_0x80_1x4_S (u32 w0[4], const u32 offset) w0[3] |= (offset >= 12) ? tmp : 0; } -inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14367,7 +14367,7 @@ inline void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) w1[3] |= (offset >= 28) ? tmp : 0; } -inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14385,7 +14385,7 @@ inline void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset w2[3] |= (offset >= 44) ? tmp : 0; } -inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { const u32 tmp = 0x80 << ((offset & 3) * 8); @@ -14407,7 +14407,7 @@ inline void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const w3[3] |= (offset >= 60) ? tmp : 0; } -inline void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { switch (offset) { @@ -14925,7 +14925,7 @@ inline void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w } } -inline void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) +void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #ifdef IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x3727); @@ -14950,7 +14950,7 @@ inline void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) #endif } -inline void make_utf16beN_S (const u32 in[4], u32 out1[4], u32 out2[4]) +void make_utf16beN_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #ifdef IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x1707); @@ -14975,7 +14975,7 @@ inline void make_utf16beN_S (const u32 in[4], u32 out1[4], u32 out2[4]) #endif } -inline void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) +void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #ifdef IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x7372); @@ -15000,7 +15000,7 @@ inline void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) #endif } -inline void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) +void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { #ifdef IS_NV out[0] = __byte_perm_S (in1[0], in1[1], 0x4602); @@ -15021,7 +15021,7 @@ inline void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) #endif } -inline void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) +void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { #ifdef IS_NV out[0] = __byte_perm_S (in1[0], in1[1], 0x6420); @@ -15042,7 +15042,7 @@ inline void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) #endif } -inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const 
u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -15903,7 +15903,7 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w #endif } -inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) +void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { const int offset_mod_4 = offset & 3; @@ -16705,7 +16705,7 @@ inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], } } -inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -17360,7 +17360,7 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w #endif } -inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) +void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -18287,7 +18287,7 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], #endif } -inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -19900,7 +19900,7 @@ inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u #endif } -inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -22219,7 +22219,7 @@ inline void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u #endif } -inline void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset) +void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -25594,7 +25594,7 @@ inline void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2 #endif } -inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) +void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -36655,7 +36655,7 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) #endif } -inline void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 
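The switch_buffer_by_offset_*_S family above shifts multi-word buffers by a byte offset, leaning on amd_bytealign on the AMD path. The IS_GENERIC fallback shown later in inc_types.cl reduces the primitive to a 64-bit shift; a scalar sketch of the semantics these functions depend on (bytealign_sketch is an illustrative name):

/* concatenate a:b into 64 bits, shift right by (c & 3) bytes, and
   return the low 32 bits - matching the IS_GENERIC definition of
   amd_bytealign_S further down in inc_types.cl */
u32 bytealign_sketch (const u32 a, const u32 b, const u32 c)
{
  const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8);

  return (u32) tmp;
}

For example, lshift_block () in inc_rp.cl chains neighbouring words with a shift count of 1 to slide a whole buffer left by one byte.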
offset) +void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) @@ -45438,7 +45438,7 @@ inline void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) PACKSV4 (s6, v6, e); \ PACKSV4 (s7, v7, e); -inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) +void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45498,7 +45498,7 @@ inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u #endif } -inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) +void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45678,7 +45678,7 @@ inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4 #endif } -inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) +void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45736,7 +45736,7 @@ inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) #endif } -inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) +void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) { #if VECT_SIZE == 1 @@ -45794,7 +45794,7 @@ inline void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) #endif } -inline void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) +void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) { #if VECT_SIZE == 1 diff --git a/OpenCL/inc_hash_md4.cl b/OpenCL/inc_hash_md4.cl index 668d0bbc9..a9383a5da 100644 --- a/OpenCL/inc_hash_md4.cl +++ b/OpenCL/inc_hash_md4.cl @@ -111,7 +111,7 @@ void md4_init (md4_ctx_t *ctx) void md4_update_64 (md4_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1234,7 +1234,7 @@ void md4_init_vector_from_scalar (md4_ctx_vector_t *ctx, md4_ctx_t *ctx0) void md4_update_vector_64 (md4_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_md5.cl b/OpenCL/inc_hash_md5.cl index 926bbb2c4..95e06cbef 100644 --- a/OpenCL/inc_hash_md5.cl +++ b/OpenCL/inc_hash_md5.cl @@ -145,7 +145,7 @@ void md5_init (md5_ctx_t *ctx) void md5_update_64 (md5_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1303,7 +1303,7 @@ void md5_init_vector_from_scalar (md5_ctx_vector_t *ctx, md5_ctx_t *ctx0) void md5_update_vector_64 (md5_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_ripemd160.cl b/OpenCL/inc_hash_ripemd160.cl index bf5d2ec42..709ad3eb2 100644 --- a/OpenCL/inc_hash_ripemd160.cl +++ 
b/OpenCL/inc_hash_ripemd160.cl @@ -245,7 +245,7 @@ void ripemd160_init (ripemd160_ctx_t *ctx) void ripemd160_update_64 (ripemd160_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1504,7 +1504,7 @@ void ripemd160_init_vector_from_scalar (ripemd160_ctx_vector_t *ctx, ripemd160_c void ripemd160_update_vector_64 (ripemd160_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha1.cl b/OpenCL/inc_hash_sha1.cl index 9713a02dd..47fe4691d 100644 --- a/OpenCL/inc_hash_sha1.cl +++ b/OpenCL/inc_hash_sha1.cl @@ -177,7 +177,7 @@ void sha1_init (sha1_ctx_t *ctx) void sha1_update_64 (sha1_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1368,7 +1368,7 @@ void sha1_init_vector_from_scalar (sha1_ctx_vector_t *ctx, sha1_ctx_t *ctx0) void sha1_update_vector_64 (sha1_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha224.cl b/OpenCL/inc_hash_sha224.cl index 4f35938a6..553397f6c 100644 --- a/OpenCL/inc_hash_sha224.cl +++ b/OpenCL/inc_hash_sha224.cl @@ -162,7 +162,7 @@ void sha224_init (sha224_ctx_t *ctx) void sha224_update_64 (sha224_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1321,7 +1321,7 @@ void sha224_init_vector_from_scalar (sha224_ctx_vector_t *ctx, sha224_ctx_t *ctx void sha224_update_vector_64 (sha224_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha256.cl b/OpenCL/inc_hash_sha256.cl index 75fd99acf..92b35b579 100644 --- a/OpenCL/inc_hash_sha256.cl +++ b/OpenCL/inc_hash_sha256.cl @@ -162,7 +162,7 @@ void sha256_init (sha256_ctx_t *ctx) void sha256_update_64 (sha256_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -1321,7 +1321,7 @@ void sha256_init_vector_from_scalar (sha256_ctx_vector_t *ctx, sha256_ctx_t *ctx void sha256_update_vector_64 (sha256_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_hash_sha384.cl b/OpenCL/inc_hash_sha384.cl index 8302cd379..0800b253a 100644 --- a/OpenCL/inc_hash_sha384.cl +++ b/OpenCL/inc_hash_sha384.cl @@ -186,7 +186,7 @@ void sha384_init (sha384_ctx_t *ctx) void sha384_update_128 (sha384_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const 
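Every *_update_* function in these hash includes carries the same two-line hunk: the AMD branch used volatile, presumably as a workaround for an older AMD compiler, and on ROCm 1.6 the workaround itself costs speed. After the change both preprocessor branches read identically, so the guard is now redundant; a sketch of the shape this leaves behind:

#ifdef IS_AMD
  const int pos = ctx->len & 63;   /* was: volatile const int pos */
#else
  const int pos = ctx->len & 63;
#endif

  /* both branches now match, so a follow-up cleanup could collapse
     the #ifdef to a single unconditional line:
     const int pos = ctx->len & 63; */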
int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif @@ -2017,7 +2017,7 @@ void sha384_init_vector_from_scalar (sha384_ctx_vector_t *ctx, sha384_ctx_t *ctx void sha384_update_vector_128 (sha384_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif diff --git a/OpenCL/inc_hash_sha512.cl b/OpenCL/inc_hash_sha512.cl index 6c58834eb..61c6e143d 100644 --- a/OpenCL/inc_hash_sha512.cl +++ b/OpenCL/inc_hash_sha512.cl @@ -186,7 +186,7 @@ void sha512_init (sha512_ctx_t *ctx) void sha512_update_128 (sha512_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif @@ -2017,7 +2017,7 @@ void sha512_init_vector_from_scalar (sha512_ctx_vector_t *ctx, sha512_ctx_t *ctx void sha512_update_vector_128 (sha512_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 127; + const int pos = ctx->len & 127; #else const int pos = ctx->len & 127; #endif diff --git a/OpenCL/inc_hash_whirlpool.cl b/OpenCL/inc_hash_whirlpool.cl index a983cefb7..1ec270105 100644 --- a/OpenCL/inc_hash_whirlpool.cl +++ b/OpenCL/inc_hash_whirlpool.cl @@ -1345,7 +1345,7 @@ void whirlpool_init (whirlpool_ctx_t *ctx, __local u32 (*s_Ch)[256], __local u32 void whirlpool_update_64 (whirlpool_ctx_t *ctx, u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif @@ -2608,7 +2608,7 @@ void whirlpool_init_vector_from_scalar (whirlpool_ctx_vector_t *ctx, whirlpool_c void whirlpool_update_vector_64 (whirlpool_ctx_vector_t *ctx, u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const int len) { #ifdef IS_AMD - volatile const int pos = ctx->len & 63; + const int pos = ctx->len & 63; #else const int pos = ctx->len & 63; #endif diff --git a/OpenCL/inc_rp.cl b/OpenCL/inc_rp.cl index 71c926d92..c50ec4a67 100644 --- a/OpenCL/inc_rp.cl +++ b/OpenCL/inc_rp.cl @@ -3,7 +3,7 @@ * License.....: MIT */ -inline u32 generate_cmask (const u32 value) +u32 generate_cmask (const u32 value) { const u32 rmask = ((value & 0x40404040u) >> 1u) & ~((value & 0x80808080u) >> 2u); @@ -14,7 +14,7 @@ inline u32 generate_cmask (const u32 value) return rmask & ~hmask & lmask; } -inline void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset) +void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset) { const u32 tmp = (1u << ((offset & 3u) * 8u)) - 1u; @@ -67,7 +67,7 @@ inline void truncate_right (u32 buf0[4], u32 buf1[4], const u32 offset) } } -inline void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset) +void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset) { const u32 tmp = ~((1u << ((offset & 3u) * 8u)) - 1u); @@ -120,7 +120,7 @@ inline void truncate_left (u32 buf0[4], u32 buf1[4], const u32 offset) } } -inline void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) +void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) { out0[0] = amd_bytealign_S (in0[1], in0[0], 1); out0[1] = amd_bytealign_S (in0[2], in0[1], 1); @@ 
-132,7 +132,7 @@ inline void lshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 o out1[3] = amd_bytealign_S ( 0, in1[3], 1); } -inline void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) +void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4]) { out1[3] = amd_bytealign_S (in1[3], in1[2], 3); out1[2] = amd_bytealign_S (in1[2], in1[1], 3); @@ -144,7 +144,7 @@ inline void rshift_block (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 o out0[0] = amd_bytealign_S (in0[0], 0, 3); } -inline void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) +void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) { switch (num) { @@ -439,7 +439,7 @@ inline void lshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 } } -inline void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) +void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 out1[4], const u32 num) { switch (num) { @@ -734,7 +734,7 @@ inline void rshift_block_N (const u32 in0[4], const u32 in1[4], u32 out0[4], u32 } } -inline void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0) +void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0) { // this version works with 1 byte append only @@ -754,7 +754,7 @@ inline void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 buf1[3] |= (offset >= 28) ? tmp : 0; } -inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) +void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4]) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -1012,7 +1012,7 @@ inline void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 #endif } -inline void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len) +void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], const u32 len) { rshift_block_N (in0, in1, out0, out1, 32 - len); @@ -1038,7 +1038,7 @@ inline void reverse_block (u32 in0[4], u32 in1[4], u32 out0[4], u32 out1[4], con out1[3] = swap32_S (tib41[3]); } -inline u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] |= (generate_cmask (buf0[0])); buf0[1] |= (generate_cmask (buf0[1])); @@ -1052,7 +1052,7 @@ inline u32 rule_op_mangle_lrest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -inline u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] &= ~(generate_cmask (buf0[0])); buf0[1] &= ~(generate_cmask (buf0[1])); @@ -1066,7 +1066,7 @@ inline u32 rule_op_mangle_urest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -inline u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { rule_op_mangle_lrest (p0, p1, buf0, buf1, in_len); @@ -1075,7 +1075,7 @@ inline u32 
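The lrest/urest/trest rules above all derive from generate_cmask, which yields 0x20 in every byte lane holding an ASCII letter and 0x00 elsewhere (the mask derivation itself lies outside the diff context). A worked example on a single word:

/* word holding "Pas5" in little-endian byte order */
u32 w = 0x35736150;            /* '5' 's' 'a' 'P' */
u32 m = generate_cmask (w);    /* 0x00202020 - letter lanes only */

u32 lower  = w | m;            /* "pas5" - rule_op_mangle_lrest */
u32 upper  = w & ~m;           /* "PAS5" - rule_op_mangle_urest */
u32 toggle = w ^ m;            /* "pAS5" - rule_op_mangle_trest */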
rule_op_mangle_lrest_ufirst (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { rule_op_mangle_urest (p0, p1, buf0, buf1, in_len); @@ -1084,7 +1084,7 @@ inline u32 rule_op_mangle_urest_lfirst (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] ^= (generate_cmask (buf0[0])); buf0[1] ^= (generate_cmask (buf0[1])); @@ -1098,7 +1098,7 @@ inline u32 rule_op_mangle_trest (const u32 p0, const u32 p1, u32 buf0[4], u32 bu return in_len; } -inline u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1119,14 +1119,14 @@ inline u32 rule_op_mangle_toggle_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -inline u32 rule_op_mangle_reverse (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_reverse (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { reverse_block (buf0, buf1, buf0, buf1, in_len); return in_len; } -inline u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + in_len) >= 32) return (in_len); @@ -1139,7 +1139,7 @@ inline u32 rule_op_mangle_dupeword (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (((in_len * p0) + in_len) >= 32) return (in_len); @@ -1167,7 +1167,7 @@ inline u32 rule_op_mangle_dupeword_times (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -inline u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + in_len) >= 32) return (in_len); @@ -1185,7 +1185,7 @@ inline u32 rule_op_mangle_reflect (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + 1) >= 32) return (in_len); @@ -1198,7 +1198,7 @@ inline u32 rule_op_mangle_append (const u32 p0, const u32 p1, u32 buf0[4], u32 b return out_len; } -inline u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((in_len + 1) >= 32) return (in_len); @@ -1213,7 +1213,7 @@ inline u32 rule_op_mangle_prepend (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 
rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1237,7 +1237,7 @@ inline u32 rule_op_mangle_rotate_left (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1267,7 +1267,7 @@ inline u32 rule_op_mangle_rotate_right (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1278,7 +1278,7 @@ inline u32 rule_op_mangle_delete_first (const u32 p0, const u32 p1, u32 buf0[4], return in_len1; } -inline u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len == 0) return (in_len); @@ -1298,7 +1298,7 @@ inline u32 rule_op_mangle_delete_last (const u32 p0, const u32 p1, u32 buf0[4], return in_len1; } -inline u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1373,7 +1373,7 @@ inline u32 rule_op_mangle_delete_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return out_len; } -inline u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1388,7 +1388,7 @@ inline u32 rule_op_mangle_extract (const u32 p0, const u32 p1, u32 buf0[4], u32 return out_len; } -inline u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1474,7 +1474,7 @@ inline u32 rule_op_mangle_omit (const u32 p0, const u32 p1, u32 buf0[4], u32 buf return out_len; } -inline u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -1546,7 +1546,7 @@ inline u32 rule_op_mangle_insert (const u32 p0, const u32 p1, u32 buf0[4], u32 b return out_len; } -inline u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1569,7 +1569,7 @@ inline u32 rule_op_mangle_overstrike (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -inline u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_truncate_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -1578,7 +1578,7 @@ inline u32 rule_op_mangle_truncate_at (const u32 p0, const u32 
p1, u32 buf0[4], return p0; } -inline u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { const uchar4 tmp0 = (uchar4) (p0); const uchar4 tmp1 = (uchar4) (p1); @@ -1597,7 +1597,7 @@ inline u32 rule_op_mangle_replace (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -inline u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { u32 out_len = 0; @@ -1638,13 +1638,13 @@ inline u32 rule_op_mangle_purgechar (const u32 p0, const u32 p1, u32 buf0[4], u3 return out_len; } -inline u32 rule_op_mangle_togglecase_rec (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_togglecase_rec (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { // TODO return in_len; } -inline u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + p0) >= 32) return (in_len); @@ -1831,7 +1831,7 @@ inline u32 rule_op_mangle_dupechar_first (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -inline u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + p0) >= 32) return (in_len); @@ -1865,7 +1865,7 @@ inline u32 rule_op_mangle_dupechar_last (const u32 p0, const u32 p1, u32 buf0[4] return out_len; } -inline u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ( in_len == 0) return (in_len); if ((in_len + in_len) >= 32) return (in_len); @@ -1898,7 +1898,7 @@ inline u32 rule_op_mangle_dupechar_all (const u32 p0, const u32 p1, u32 buf0[4], return out_len; } -inline u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len < 2) return (in_len); @@ -1907,7 +1907,7 @@ inline u32 rule_op_mangle_switch_first (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (in_len < 2) return (in_len); @@ -1992,7 +1992,7 @@ inline u32 rule_op_mangle_switch_last (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); if (p1 >= in_len) return (in_len); @@ -2239,7 +2239,7 @@ inline u32 rule_op_mangle_switch_at (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -inline u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 
in_len) +u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2261,7 +2261,7 @@ inline u32 rule_op_mangle_chr_shiftl (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -inline u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2283,7 +2283,7 @@ inline u32 rule_op_mangle_chr_shiftr (const u32 p0, const u32 p1, u32 buf0[4], u return in_len; } -inline u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2307,7 +2307,7 @@ inline u32 rule_op_mangle_chr_incr (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -inline u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 >= in_len) return (in_len); @@ -2331,7 +2331,7 @@ inline u32 rule_op_mangle_chr_decr (const u32 p0, const u32 p1, u32 buf0[4], u32 return in_len; } -inline u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if ((p0 + 1) >= in_len) return (in_len); @@ -2358,7 +2358,7 @@ inline u32 rule_op_mangle_replace_np1 (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 == 0) return (in_len); @@ -2387,7 +2387,7 @@ inline u32 rule_op_mangle_replace_nm1 (const u32 p0, const u32 p1, u32 buf0[4], return in_len; } -inline u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -2425,7 +2425,7 @@ inline u32 rule_op_mangle_dupeblock_first (const u32 p0, const u32 p1, u32 buf0[ return out_len; } -inline u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { if (p0 > in_len) return (in_len); @@ -2454,7 +2454,7 @@ inline u32 rule_op_mangle_dupeblock_last (const u32 p0, const u32 p1, u32 buf0[4 return out_len; } -inline u32 rule_op_mangle_title_sep (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 rule_op_mangle_title_sep (const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { buf0[0] |= (generate_cmask (buf0[0])); buf0[1] |= (generate_cmask (buf0[1])); @@ -2497,7 +2497,7 @@ inline u32 rule_op_mangle_title_sep (const u32 p0, const u32 p1, u32 buf0[4], u3 return in_len; } -inline u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) +u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], u32 buf1[4], const u32 in_len) { u32 out_len = in_len; @@ -2549,7 
+2549,7 @@ inline u32 apply_rule (const u32 name, const u32 p0, const u32 p1, u32 buf0[4], return out_len; } -inline u32 apply_rules (__global const u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len) +u32 apply_rules (__global const u32 *cmds, u32 buf0[4], u32 buf1[4], const u32 len) { u32 out_len = len; @@ -2567,7 +2567,7 @@ inline u32 apply_rules (__global const u32 *cmds, u32 buf0[4], u32 buf1[4], cons return out_len; } -inline u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, __global const kernel_rule_t *rules_buf, const u32 il_pos, u32x buf0[4], u32x buf1[4]) +u32x apply_rules_vect (const u32 pw_buf0[4], const u32 pw_buf1[4], const u32 pw_len, __global const kernel_rule_t *rules_buf, const u32 il_pos, u32x buf0[4], u32x buf1[4]) { #if VECT_SIZE == 1 diff --git a/OpenCL/inc_simd.cl b/OpenCL/inc_simd.cl index 37548b44c..ac9f0410f 100644 --- a/OpenCL/inc_simd.cl +++ b/OpenCL/inc_simd.cl @@ -1054,7 +1054,7 @@ // attack-mode 0 -inline u32x ix_create_bft (__global const bf_t *bfs_buf, const u32 il_pos) +u32x ix_create_bft (__global const bf_t *bfs_buf, const u32 il_pos) { #if VECT_SIZE == 1 const u32x ix = (u32x) (bfs_buf[il_pos + 0].i); @@ -1073,7 +1073,7 @@ inline u32x ix_create_bft (__global const bf_t *bfs_buf, const u32 il_pos) // attack-mode 1 -inline u32x pwlenx_create_combt (__global const pw_t *combs_buf, const u32 il_pos) +u32x pwlenx_create_combt (__global const pw_t *combs_buf, const u32 il_pos) { #if VECT_SIZE == 1 const u32x pw_lenx = (u32x) (combs_buf[il_pos + 0].pw_len); @@ -1090,7 +1090,7 @@ inline u32x pwlenx_create_combt (__global const pw_t *combs_buf, const u32 il_po return pw_lenx; } -inline u32x ix_create_combt (__global const pw_t *combs_buf, const u32 il_pos, const int idx) +u32x ix_create_combt (__global const pw_t *combs_buf, const u32 il_pos, const int idx) { #if VECT_SIZE == 1 const u32x ix = (u32x) (combs_buf[il_pos + 0].i[idx]); diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl index 2bfb641df..78ecd9988 100644 --- a/OpenCL/inc_types.cl +++ b/OpenCL/inc_types.cl @@ -33,14 +33,14 @@ typedef VTYPE(uint, VECT_SIZE) u32x; typedef VTYPE(ulong, VECT_SIZE) u64x; #endif -inline u32 l32_from_64_S (u64 a) +u32 l32_from_64_S (u64 a) { const u32 r = (u32) (a); return r; } -inline u32 h32_from_64_S (u64 a) +u32 h32_from_64_S (u64 a) { a >>= 32; @@ -49,12 +49,12 @@ inline u32 h32_from_64_S (u64 a) return r; } -inline u64 hl32_to_64_S (const u32 a, const u32 b) +u64 hl32_to_64_S (const u32 a, const u32 b) { return as_ulong ((uint2) (b, a)); } -inline u32x l32_from_64 (u64x a) +u32x l32_from_64 (u64x a) { u32x r; @@ -93,7 +93,7 @@ inline u32x l32_from_64 (u64x a) return r; } -inline u32x h32_from_64 (u64x a) +u32x h32_from_64 (u64x a) { a >>= 32; @@ -134,7 +134,7 @@ inline u32x h32_from_64 (u64x a) return r; } -inline u64x hl32_to_64 (const u32x a, const u32x b) +u64x hl32_to_64 (const u32x a, const u32x b) { u64x r; @@ -174,45 +174,37 @@ inline u64x hl32_to_64 (const u32x a, const u32x b) } #ifdef IS_AMD -inline u32 swap32_S (const u32 v) +u32 swap32_S (const u32 v) { return (as_uint (as_uchar4 (v).s3210)); } -inline u64 swap64_S (const u64 v) +u64 swap64_S (const u64 v) { return (as_ulong (as_uchar8 (v).s76543210)); } -inline u32 rotr32_S (const u32 a, const u32 n) +u32 rotr32_S (const u32 a, const u32 n) { return rotate (a, 32 - n); } -inline u32 rotl32_S (const u32 a, const u32 n) +u32 rotl32_S (const u32 a, const u32 n) { return rotate (a, n); } -inline u64 rotr64_S (const u64 a, const u32 n) +u64 rotr64_S (const u64 a, const u32 
n) { - const u32 a0 = h32_from_64_S (a); - const u32 a1 = l32_from_64_S (a); - - const u32 t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n); - const u32 t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n); - - const u64 r = hl32_to_64_S (t0, t1); - - return r; + return rotate (a, (u64) (64 - n)); } -inline u64 rotl64_S (const u64 a, const u32 n) +u64 rotl64_S (const u64 a, const u32 n) { - return rotr64_S (a, 64 - n); + return rotate (a, (u64) n); } -inline u32x swap32 (const u32x v) +u32x swap32 (const u32x v) { return ((v >> 24) & 0x000000ff) | ((v >> 8) & 0x0000ff00) @@ -220,7 +212,7 @@ inline u32x swap32 (const u32x v) | ((v << 24) & 0xff000000); } -inline u64x swap64 (const u64x v) +u64x swap64 (const u64x v) { return ((v >> 56) & 0x00000000000000ff) | ((v >> 40) & 0x000000000000ff00) @@ -232,82 +224,74 @@ inline u64x swap64 (const u64x v) | ((v << 56) & 0xff00000000000000); } -inline u32x rotr32 (const u32x a, const u32 n) +u32x rotr32 (const u32x a, const u32 n) { return rotate (a, 32 - n); } -inline u32x rotl32 (const u32x a, const u32 n) +u32x rotl32 (const u32x a, const u32 n) { return rotate (a, n); } -inline u64x rotr64 (const u64x a, const u32 n) +u64x rotr64 (const u64x a, const u32 n) { - const u32x a0 = h32_from_64 (a); - const u32x a1 = l32_from_64 (a); - - const u32x t0 = (n >= 32) ? amd_bitalign (a0, a1, n - 32) : amd_bitalign (a1, a0, n); - const u32x t1 = (n >= 32) ? amd_bitalign (a1, a0, n - 32) : amd_bitalign (a0, a1, n); - - const u64x r = hl32_to_64 (t0, t1); - - return r; + return rotate (a, (u64x) (64 - n)); } -inline u64x rotl64 (const u64x a, const u32 n) +u64x rotl64 (const u64x a, const u32 n) { - return rotr64 (a, 64 - n); + return rotate (a, (u64x) n); } -inline u32x __bfe (const u32x a, const u32x b, const u32x c) +u32x __bfe (const u32x a, const u32x b, const u32x c) { return amd_bfe (a, b, c); } -inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) +u32 __bfe_S (const u32 a, const u32 b, const u32 c) { return amd_bfe (a, b, c); } -inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) { return amd_bytealign (a, b, c); } #endif #ifdef IS_NV -inline u32 swap32_S (const u32 v) +u32 swap32_S (const u32 v) { return (as_uint (as_uchar4 (v).s3210)); } -inline u64 swap64_S (const u64 v) +u64 swap64_S (const u64 v) { return (as_ulong (as_uchar8 (v).s76543210)); } -inline u32 rotr32_S (const u32 a, const u32 n) +u32 rotr32_S (const u32 a, const u32 n) { return rotate (a, 32 - n); } -inline u32 rotl32_S (const u32 a, const u32 n) +u32 rotl32_S (const u32 a, const u32 n) { return rotate (a, n); } -inline u64 rotr64_S (const u64 a, const u32 n) +u64 rotr64_S (const u64 a, const u32 n) { return rotate (a, (u64) 64 - n); } -inline u64 rotl64_S (const u64 a, const u32 n) +u64 rotl64_S (const u64 a, const u32 n) { return rotr64_S (a, 64 - n); } -inline u32x swap32 (const u32x v) +u32x swap32 (const u32x v) { return ((v >> 24) & 0x000000ff) | ((v >> 8) & 0x0000ff00) @@ -315,7 +299,7 @@ inline u32x swap32 (const u32x v) | ((v << 24) & 0xff000000); } -inline u64x swap64 (const u64x v) +u64x swap64 (const u64x v) { return ((v >> 56) & 0x00000000000000ff) | ((v >> 40) & 0x000000000000ff00) @@ -327,27 +311,27 @@ inline u64x swap64 (const u64x v) | ((v << 56) & 0xff00000000000000); } -inline u32x rotr32 (const u32x a, const u32 n) +u32x rotr32 (const u32x a, const u32 n) { return rotate (a, 32 - n); } -inline u32x rotl32 (const u32x a, const u32 n) +u32x 
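The AMD rotr64_S/rotr64 hunks above replace the two amd_bitalign half-rotates with the native rotate () builtin. OpenCL's rotate () rotates left and reduces the count modulo the bit width, so the right rotate maps onto it directly; as a scalar sketch:

/* rotr64 (a, n) == rotl64 (a, 64 - n); rotate () masks the count to
   the low 6 bits, so n == 0 is also safe */
u64 rotr64_sketch (const u64 a, const u32 n)
{
  return rotate (a, (u64) (64 - n));
}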
rotl32 (const u32x a, const u32 n) { return rotate (a, n); } -inline u64x rotr64 (const u64x a, const u32 n) +u64x rotr64 (const u64x a, const u32 n) { return rotate (a, (u64) 64 - n); } -inline u64x rotl64 (const u64x a, const u32 n) +u64x rotl64 (const u64x a, const u32 n) { return rotate (a, (u64) n); } -inline u32x __byte_perm (const u32x a, const u32x b, const u32x c) +u32x __byte_perm (const u32x a, const u32x b, const u32x c) { u32x r; @@ -386,7 +370,7 @@ inline u32x __byte_perm (const u32x a, const u32x b, const u32x c) return r; } -inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) +u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -395,7 +379,7 @@ inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) return r; } -inline u32x __bfe (const u32x a, const u32x b, const u32x c) +u32x __bfe (const u32x a, const u32x b, const u32x c) { u32x r; @@ -434,7 +418,7 @@ inline u32x __bfe (const u32x a, const u32x b, const u32x c) return r; } -inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) +u32 __bfe_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -443,7 +427,7 @@ inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) return r; } -inline u32x amd_bytealign (const u32x a, const u32x b, const u32x c) +u32x amd_bytealign (const u32x a, const u32x b, const u32x c) { u32x r; @@ -490,7 +474,7 @@ inline u32x amd_bytealign (const u32x a, const u32x b, const u32x c) return r; } -inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -509,37 +493,37 @@ inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) #endif #ifdef IS_GENERIC -inline u32 swap32_S (const u32 v) +u32 swap32_S (const u32 v) { return (as_uint (as_uchar4 (v).s3210)); } -inline u64 swap64_S (const u64 v) +u64 swap64_S (const u64 v) { return (as_ulong (as_uchar8 (v).s76543210)); } -inline u32 rotr32_S (const u32 a, const u32 n) +u32 rotr32_S (const u32 a, const u32 n) { return rotate (a, 32 - n); } -inline u32 rotl32_S (const u32 a, const u32 n) +u32 rotl32_S (const u32 a, const u32 n) { return rotate (a, n); } -inline u64 rotr64_S (const u64 a, const u32 n) +u64 rotr64_S (const u64 a, const u32 n) { return rotate (a, (u64) 64 - n); } -inline u64 rotl64_S (const u64 a, const u32 n) +u64 rotl64_S (const u64 a, const u32 n) { return rotate (a, (u64) n); } -inline u32x swap32 (const u32x v) +u32x swap32 (const u32x v) { return ((v >> 24) & 0x000000ff) | ((v >> 8) & 0x0000ff00) @@ -547,7 +531,7 @@ inline u32x swap32 (const u32x v) | ((v << 24) & 0xff000000); } -inline u64x swap64 (const u64x v) +u64x swap64 (const u64x v) { return ((v >> 56) & 0x00000000000000ff) | ((v >> 40) & 0x000000000000ff00) @@ -559,27 +543,27 @@ inline u64x swap64 (const u64x v) | ((v << 56) & 0xff00000000000000); } -inline u32x rotr32 (const u32x a, const u32 n) +u32x rotr32 (const u32x a, const u32 n) { return rotate (a, 32 - n); } -inline u32x rotl32 (const u32x a, const u32 n) +u32x rotl32 (const u32x a, const u32 n) { return rotate (a, n); } -inline u64x rotr64 (const u64x a, const u32 n) +u64x rotr64 (const u64x a, const u32 n) { return rotate (a, (u64) 64 - n); } -inline u64x rotl64 (const u64x a, const u32 n) +u64x rotl64 (const u64x a, const u32 n) { return rotate (a, (u64) n); } -inline u32x __bfe (const u32x a, const u32x b, const u32x c) +u32x __bfe (const u32x a, const u32x b, const u32x c) { #define BIT(x) ((u32x) (1u) << (x)) #define BIT_MASK(x) (BIT (x) - 1) @@ -592,7 +576,7 @@ inline u32x 
__bfe (const u32x a, const u32x b, const u32x c) #undef BFE } -inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) +u32 __bfe_S (const u32 a, const u32 b, const u32 c) { #define BIT(x) (1u << (x)) #define BIT_MASK(x) (BIT (x) - 1) @@ -605,7 +589,7 @@ inline u32 __bfe_S (const u32 a, const u32 b, const u32 c) #undef BFE } -inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c) +u32x amd_bytealign (const u32x a, const u32x b, const u32 c) { #if VECT_SIZE == 1 const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8); @@ -638,7 +622,7 @@ inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c) #endif } -inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) { const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8); diff --git a/OpenCL/inc_vendor.cl b/OpenCL/inc_vendor.cl index e990b0a31..a238286ea 100644 --- a/OpenCL/inc_vendor.cl +++ b/OpenCL/inc_vendor.cl @@ -153,9 +153,6 @@ #if KERN_TYPE == 13800 #undef _unroll #endif -#if KERN_TYPE == 14100 -#undef _unroll -#endif // nvidia specific @@ -177,6 +174,9 @@ #if KERN_TYPE == 14000 #undef _unroll #endif +#if KERN_TYPE == 14100 +#undef _unroll +#endif #endif #endif diff --git a/OpenCL/m01500_a3.cl b/OpenCL/m01500_a3.cl index f54007b40..e372251c4 100644 --- a/OpenCL/m01500_a3.cl +++ b/OpenCL/m01500_a3.cl @@ -14,7 +14,20 @@ #define COMPARE_S "inc_comp_single_bs.cl" #define COMPARE_M "inc_comp_multi_bs.cl" -#define myselx(a,b,c) ((c) ? (b) : (a)) +#ifdef IS_NV +#define KXX_DECL +#define sXXX_DECL +#endif + +#ifdef IS_AMD +#define KXX_DECL +#define sXXX_DECL +#endif + +#ifdef IS_GENERIC +#define KXX_DECL +#define sXXX_DECL +#endif #ifdef IS_NV @@ -888,11 +901,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes making use of a vector conditional select operation - * (e.g., vsel on PowerPC with AltiVec). + * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC + * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. * - * Gate counts: 36 33 33 26 35 34 34 32 - * Average: 32.875 + * Gate counts: 49 44 46 33 48 46 46 41 + * Average: 44.125 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). 
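The hunks that follow swap every vsel()-based S-box circuit for one built purely from AND, OR, XOR, NOT, and AND-NOT gates. As a minimal sketch of the difference (illustrative function names, not from the patch), the single select gate that the removed vsel macro implemented, and its plain-gate equivalent, are:

u32 sel_bitselect (const u32 a, const u32 b, const u32 c)
{
  // OpenCL built-in: takes bits of b where c is 1, bits of a where c is 0
  return bitselect (a, b, c);
}

u32 sel_plain_gates (const u32 a, const u32 b, const u32 c)
{
  // identical result using only AND, OR and NOT
  return (a & ~c) | (b & c);
}

Note that the replacement circuits below are not gate-by-gate translations of the old ones; they are independent networks (44.125 gates per S-box on average versus 32.875 for the vsel versions) that no longer depend on bitselect() being lowered to a single hardware instruction.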
@@ -911,473 +924,561 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ -#define vnot(d,a) (d) = ~(a) -#define vor(d,a,b) (d) = (a) | (b) -#define vxor(d,a,b) (d) = (a) ^ (b) -#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) - void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; - u32 x55AFD1B7, x3C3C69C3, x6993B874; - u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; - u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; - u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; - u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; - u32 x0DBCE883, x3A25A215, x37994A96; - u32 xC9C93B62, x89490F02, xB96C2D16; - u32 x0, x1, x2, x3; + u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, + x25202160; + u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; + u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; + u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; + u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; + u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; + u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; + u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F0F3333, a3, a2, a5); - vxor(x3C3C3C3C, a2, a3); - vor(x55FF55FF, a1, a4); - vxor(x69C369C3, x3C3C3C3C, x55FF55FF); - vsel(x0903B73F, a5, x0F0F3333, x69C369C3); - vxor(x09FCB7C0, a4, x0903B73F); - vxor(x5CA9E295, a1, x09FCB7C0); + x55005500 = a1 & ~a5; + x5A0F5A0F = a4 ^ x55005500; + x3333FFFF = a3 | a6; + x66666666 = a1 ^ a3; + x22226666 = x3333FFFF & x66666666; + x2D2D6969 = a4 ^ x22226666; + x25202160 = x2D2D6969 & ~x5A0F5A0F; - vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); - vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); - vxor(x6993B874, x55AFD1B7, x3C3C69C3); + x00FFFF00 = a5 ^ a6; + x33CCCC33 = a3 ^ x00FFFF00; + x4803120C = x5A0F5A0F & ~x33CCCC33; + x2222FFFF = a6 | x22226666; + x6A21EDF3 = x4803120C ^ x2222FFFF; + x4A01CC93 = x6A21EDF3 & ~x25202160; - vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); - vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); - vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); - vxor(x529E962D, x0F0F3333, x5D91A51E); + x5555FFFF = a1 | a6; + x7F75FFFF = x6A21EDF3 | x5555FFFF; + x00D20096 = a5 & ~x2D2D6969; + x7FA7FF69 = x7F75FFFF ^ x00D20096; - vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); - vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); - vsel(x428679F3, a5, x4B8771A3, x529E962D); - vxor(x6B68D433, x29EEADC0, x428679F3); + x0A0A0000 = a4 & ~x5555FFFF; + x0AD80096 = x00D20096 ^ x0A0A0000; + x00999900 = x00FFFF00 & ~x66666666; + x0AD99996 = x0AD80096 | x00999900; - vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); - vsel(x026F12F3, a4, x0F0F3333, x529E962D); - vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); - vnot(x94D83B6C, x6B27C493); - vsel(x0, x94D83B6C, x6B68D433, a6); - vxor(*out1, *out1, x0); + x22332233 = a3 & ~x55005500; + x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; + x054885C0 = x257AA5F0 & ~x22332233; + xFAB77A3F = ~x054885C0; + x2221EDF3 = x3333FFFF & x6A21EDF3; + xD89697CC = xFAB77A3F ^ x2221EDF3; + x20 = x7FA7FF69 & ~a2; + x21 = x20 ^ xD89697CC; + *out3 ^= x21; - vsel(x965E0B0F, x94D83B6C, a3, x428679F3); - vsel(x3327A113, x5BA7E193, a2, x69C369C3); - vsel(x847F0A1F, x965E0B0F, a4, x3327A113); - vxor(xD6E19C32, x529E962D, x847F0A1F); - 
vsel(x1, xD6E19C32, x5CA9E295, a6); - vxor(*out2, *out2, x1); + x05B77AC0 = x00FFFF00 ^ x054885C0; + x05F77AD6 = x00D20096 | x05B77AC0; + x36C48529 = x3333FFFF ^ x05F77AD6; + x6391D07C = a1 ^ x36C48529; + xBB0747B0 = xD89697CC ^ x6391D07C; + x00 = x25202160 | a2; + x01 = x00 ^ xBB0747B0; + *out1 ^= x01; - vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); - vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); - vxor(x37994A96, x0DBCE883, x3A25A215); - vsel(x3, x37994A96, x529E962D, a6); - vxor(*out4, *out4, x3); + x4C460000 = x3333FFFF ^ x7F75FFFF; + x4EDF9996 = x0AD99996 | x4C460000; + x2D4E49EA = x6391D07C ^ x4EDF9996; + xBBFFFFB0 = x00FFFF00 | xBB0747B0; + x96B1B65A = x2D4E49EA ^ xBBFFFFB0; + x10 = x4A01CC93 | a2; + x11 = x10 ^ x96B1B65A; + *out2 ^= x11; - vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); - vsel(x89490F02, a3, xC9C93B62, x965E0B0F); - vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); - vsel(x2, xB96C2D16, x6993B874, a6); - vxor(*out3, *out3, x2); + x5AFF5AFF = a5 | x5A0F5A0F; + x52B11215 = x5AFF5AFF & ~x2D4E49EA; + x4201C010 = x4A01CC93 & x6391D07C; + x10B0D205 = x52B11215 ^ x4201C010; + x30 = x10B0D205 | a2; + x31 = x30 ^ x0AD99996; + *out4 ^= x31; } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; - u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; - u32 x0F5AF03C, x6600FF56, x87A5F09C; - u32 xA55A963C, x3C69C30F, xB44BC32D; - u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; - u32 xB46C662D, x278DB412, xB66CB43B; - u32 xD2DC4E52, x27993333, xD2994E33; - u32 x278D0F2D, x2E0E547B, x09976748; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x55550000, x00AA00FF, x33BB33FF; + u32 x33CC0000, x11441144, x11BB11BB, x003311BB; + u32 x00000F0F, x336600FF, x332200FF, x332200F0; + u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; + u32 x0A451047, xBBDFDD7B, xB19ACD3C; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x55553333, a1, a3, a6); - vsel(x0055FF33, a6, x55553333, a5); - vsel(x33270F03, a3, a4, x0055FF33); - vxor(x66725A56, a1, x33270F03); - vxor(x00FFFF00, a5, a6); - vxor(x668DA556, x66725A56, x00FFFF00); + x33CC33CC = a2 ^ a5; - vsel(x0F0F5A56, a4, x66725A56, a6); - vnot(xF0F0A5A9, x0F0F5A56); - vxor(xA5A5969A, x55553333, xF0F0A5A9); - vxor(xA55A699A, x00FFFF00, xA5A5969A); - vsel(x1, xA55A699A, x668DA556, a2); - vxor(*out2, *out2, x1); + x55550000 = a1 & ~a6; + x00AA00FF = a5 & ~x55550000; + x33BB33FF = a2 | x00AA00FF; - vxor(x0F5AF03C, a4, x0055FF33); - vsel(x6600FF56, x66725A56, a6, x00FFFF00); - vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + x33CC0000 = x33CC33CC & ~a6; + x11441144 = a1 & x33CC33CC; + x11BB11BB = a5 ^ x11441144; + x003311BB = x11BB11BB & ~x33CC0000; - vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); - vxor(x3C69C30F, a3, x0F5AF03C); - vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + x00000F0F = a3 & a6; + x336600FF = x00AA00FF ^ x33CC0000; + x332200FF = x33BB33FF & x336600FF; + x332200F0 = x332200FF & ~x00000F0F; - vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); - vsel(x0F4B0F2D, a4, xB44BC32D, a5); - vxor(x699CC37B, x66D7CC56, x0F4B0F2D); - vxor(x996C66D2, xF0F0A5A9, x699CC37B); - vsel(x0, x996C66D2, xB44BC32D, a2); - vxor(*out1, *out1, x0); + x0302000F = a3 & x332200FF; + xAAAAAAAA = ~a1; + 
xA9A8AAA5 = x0302000F ^ xAAAAAAAA; + x33CCCC33 = a6 ^ x33CC33CC; + x33CCC030 = x33CCCC33 & ~x00000F0F; + x9A646A95 = xA9A8AAA5 ^ x33CCC030; + x10 = a4 & ~x332200F0; + x11 = x10 ^ x9A646A95; + *out2 ^= x11; - vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); - vsel(x278DB412, x668DA556, xA5A5969A, a1); - vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + x00333303 = a2 & ~x33CCC030; + x118822B8 = x11BB11BB ^ x00333303; + xA8208805 = xA9A8AAA5 & ~x118822B8; + x3CC3C33C = a3 ^ x33CCCC33; + x94E34B39 = xA8208805 ^ x3CC3C33C; + x00 = x33BB33FF & ~a4; + x01 = x00 ^ x94E34B39; + *out1 ^= x01; - vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); - vsel(x27993333, x278DB412, a3, x0055FF33); - vsel(xD2994E33, xD2DC4E52, x27993333, a5); - vsel(x3, x87A5F09C, xD2994E33, a2); - vxor(*out4, *out4, x3); + x0331330C = x0302000F ^ x00333303; + x3FF3F33C = x3CC3C33C | x0331330C; + xA9DF596A = x33BB33FF ^ x9A646A95; + xA9DF5F6F = x00000F0F | xA9DF596A; + x962CAC53 = x3FF3F33C ^ xA9DF5F6F; - vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); - vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); - vxor(x09976748, x27993333, x2E0E547B); - vsel(x2, xB66CB43B, x09976748, a2); - vxor(*out3, *out3, x2); + xA9466A6A = x332200FF ^ x9A646A95; + x3DA52153 = x94E34B39 ^ xA9466A6A; + x29850143 = xA9DF5F6F & x3DA52153; + x33C0330C = x33CC33CC & x3FF3F33C; + x1A45324F = x29850143 ^ x33C0330C; + x20 = x1A45324F | a4; + x21 = x20 ^ x962CAC53; + *out3 ^= x21; + + x0A451047 = x1A45324F & ~x118822B8; + xBBDFDD7B = x33CCCC33 | xA9DF596A; + xB19ACD3C = x0A451047 ^ xBBDFDD7B; + x30 = x003311BB | a4; + x31 = x30 ^ xB19ACD3C; + *out4 ^= x31; } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F330F33, x0F33F0CC, x5A66A599; - u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; - u32 x556BA09E, x665A93AC, x99A56C53; - u32 x25A1A797, x5713754C, x66559355, x47B135C6; - u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; - u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; - u32 xD069F8B4, x030FF0C3, xD2699876; - u32 xD579DDF4, xD579F0C3, xB32C6396; - u32 x0, x1, x2, x3; + u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; + u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F330F33, a4, a3, a5); - vxor(x0F33F0CC, a6, x0F330F33); - vxor(x5A66A599, a2, x0F33F0CC); + x44444444 = a1 & ~a2; + x0F0FF0F0 = a3 ^ a6; + x4F4FF4F4 = x44444444 | x0F0FF0F0; + x00FFFF00 = a4 ^ a6; + x00AAAA00 = x00FFFF00 & ~a1; + x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; - vsel(x2111B7BB, a3, a6, x5A66A599); - vsel(x03FF3033, a5, a3, x0F33F0CC); - vsel(x05BB50EE, a5, x0F33F0CC, a2); - vsel(x074F201F, x03FF3033, a4, x05BB50EE); - vxor(x265E97A4, x2111B7BB, x074F201F); + x3C3CC3C3 = a2 ^ x0F0FF0F0; + x3C3C0000 = x3C3CC3C3 & ~a6; + x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; + x0C840A00 = x4FE55EF4 & ~x7373F4F4; - vsel(x556BA09E, x5A66A599, x05BB50EE, a4); - vsel(x665A93AC, x556BA09E, x265E97A4, a3); - vnot(x99A56C53, x665A93AC); - vsel(x1, x265E97A4, x99A56C53, a1); - vxor(*out2, *out2, x1); + x00005EF4 = a6 & x4FE55EF4; + x00FF5EFF = a4 | x00005EF4; + x00555455 = a1 & x00FF5EFF; + x3C699796 
= x3C3CC3C3 ^ x00555455; + x30 = x4FE55EF4 & ~a5; + x31 = x30 ^ x3C699796; + *out4 ^= x31; - vxor(x25A1A797, x03FF3033, x265E97A4); - vsel(x5713754C, a2, x0F33F0CC, x074F201F); - vsel(x66559355, x665A93AC, a2, a5); - vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + x000FF000 = x0F0FF0F0 & x00FFFF00; + x55AA55AA = a1 ^ a4; + x26D9A15E = x7373F4F4 ^ x55AA55AA; + x2FDFAF5F = a3 | x26D9A15E; + x2FD00F5F = x2FDFAF5F & ~x000FF000; - vxor(x9A5A5C60, x03FF3033, x99A56C53); - vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); - vxor(x87698DB4, x5713754C, xD07AF8F8); - vxor(xE13C1EE1, x66559355, x87698DB4); + x55AAFFAA = x00AAAA00 | x55AA55AA; + x28410014 = x3C699796 & ~x55AAFFAA; + x000000FF = a4 & a6; + x000000CC = x000000FF & ~a2; + x284100D8 = x28410014 ^ x000000CC; - vsel(x000CFFCF, a4, a6, x0F33F0CC); - vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); - vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); - vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); - vsel(x0, x9E49915E, xE13C1EE1, a1); - vxor(*out1, *out1, x0); + x204100D0 = x7373F4F4 & x284100D8; + x3C3CC3FF = x3C3CC3C3 | x000000FF; + x1C3CC32F = x3C3CC3FF & ~x204100D0; + x4969967A = a1 ^ x1C3CC32F; + x10 = x2FD00F5F & a5; + x11 = x10 ^ x4969967A; + *out2 ^= x11; - vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); - vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); - vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); - vsel(x3, x5A66A599, xD2699876, a1); - vxor(*out4, *out4, x3); + x4CC44CC4 = x4FE55EF4 & ~a2; + x40C040C0 = x4CC44CC4 & ~a3; + xC3C33C3C = ~x3C3CC3C3; + x9669C396 = x55AAFFAA ^ xC3C33C3C; + xD6A98356 = x40C040C0 ^ x9669C396; + x00 = a5 & ~x0C840A00; + x01 = x00 ^ xD6A98356; + *out1 ^= x01; - vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); - vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); - vxor(xB32C6396, x66559355, xD579F0C3); - vsel(x2, xB32C6396, x47B135C6, a1); - vxor(*out3, *out3, x2); + xD6E9C3D6 = x40C040C0 | x9669C396; + x4CEEEEC4 = x00AAAA00 | x4CC44CC4; + x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; + x001A000B = a4 & ~x4FE55EF4; + x9A1F2D1B = x9A072D12 | x001A000B; + x20 = a5 & ~x284100D8; + x21 = x20 ^ x9A1F2D1B; + *out3 ^= x21; } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, - x0AF50F0F, x4CA36B59; - u32 xB35C94A6; - u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; - u32 x56E9861E; - u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; - u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; - u32 x0, x1, x2, x3; + u32 x5A5A5A5A, x0F0FF0F0; + u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, + x52FBCA0F, x61C8F93C; + u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; + u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; + u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505AFAF, a5, a3, a1); - vsel(x0555AF55, x0505AFAF, a1, a4); - vxor(x0A5AA05A, a3, x0555AF55); - vsel(x46566456, a1, x0A5AA05A, a2); - vsel(x0A0A5F5F, a3, a5, a1); - vxor(x0AF55FA0, a4, x0A0A5F5F); - vsel(x0AF50F0F, x0AF55FA0, a3, a5); - vxor(x4CA36B59, x46566456, x0AF50F0F); + x5A5A5A5A = a1 ^ a3; + x0F0FF0F0 = a3 ^ a5; + x33FF33FF = a2 | a4; + x33FFCC00 = a5 ^ x33FF33FF; + x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; + x0C0CC0C0 = x0F0FF0F0 & ~a2; + x0CF3C03F = a4 ^ x0C0CC0C0; + x5EFBDA7F = x5A5A5A5A | x0CF3C03F; + x52FBCA0F = x5EFBDA7F & ~x0C0030F0; + x61C8F93C = a2 ^ x52FBCA0F; - vnot(xB35C94A6, x4CA36B59); + x00C0C03C = x0CF3C03F & 
x61C8F93C; + x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; + x3B92A366 = x5A5A5A5A ^ x61C8F93C; + x30908326 = x3B92A366 & ~x0F0F30C0; + x3C90B3D6 = x0C0030F0 ^ x30908326; - vsel(x01BB23BB, a4, a2, x0555AF55); - vxor(x5050FAFA, a1, x0505AFAF); - vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); - vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + x33CC33CC = a2 ^ a4; + x0C0CFFFF = a5 | x0C0CC0C0; + x379E5C99 = x3B92A366 ^ x0C0CFFFF; + x04124C11 = x379E5C99 & ~x33CC33CC; + x56E9861E = x52FBCA0F ^ x04124C11; + x00 = a6 & ~x3C90B3D6; + x01 = x00 ^ x56E9861E; + *out1 ^= x01; - vnot(x56E9861E, xA91679E1); + xA91679E1 = ~x56E9861E; + x10 = x3C90B3D6 & ~a6; + x11 = x10 ^ xA91679E1; + *out2 ^= x11; - vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); - vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); - vsel(x827D9784, xB35C94A6, x0AF55F00, a2); - vxor(xD2946D9A, x50E9FA1E, x827D9784); - vsel(x2, xD2946D9A, x4CA36B59, a6); - vxor(*out3, *out3, x2); - vsel(x3, xB35C94A6, xD2946D9A, a6); - vxor(*out4, *out4, x3); + x9586CA37 = x3C90B3D6 ^ xA91679E1; + x8402C833 = x9586CA37 & ~x33CC33CC; + x84C2C83F = x00C0C03C | x8402C833; + xB35C94A6 = x379E5C99 ^ x84C2C83F; + x20 = x61C8F93C | a6; + x21 = x20 ^ xB35C94A6; + *out3 ^= x21; - vsel(x31F720B3, a2, a4, x0AF55FA0); - vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); - vxor(x4712A7AD, x56E9861E, x11FB21B3); - vxor(x9586CA37, xD2946D9A, x4712A7AD); - vsel(x0, x56E9861E, x9586CA37, a6); - vxor(*out1, *out1, x0); - vsel(x1, x9586CA37, xA91679E1, a6); - vxor(*out2, *out2, x1); + x30 = a6 & x61C8F93C; + x31 = x30 ^ xB35C94A6; + *out4 ^= x31; } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; - u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; - u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; - u32 xAC81CFB2, xF72577AF, x5BA4B81D; - u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; - u32 x12E6283D, x9E47D3D4, x1A676AB4; - u32 x891556DF, xE5E77F82, x6CF2295D; - u32 x2E3CA5F5, x9697C1C6, x369CC1D6; - u32 x0, x1, x2, x3; + u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; + u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x550F550F, a1, a3, a5); - vnot(xAAF0AAF0, x550F550F); - vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); - vxor(x96C696C6, a2, xA5F5A5F5); - vxor(x00FFFF00, a5, a6); - vxor(x963969C6, x96C696C6, x00FFFF00); + x77777777 = a1 | a3; + x77770000 = x77777777 & ~a6; + x22225555 = a1 ^ x77770000; + x11116666 = a3 ^ x22225555; + x1F1F6F6F = a4 | x11116666; - vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); - vsel(xB73121F7, a2, x963969C6, x96C696C6); - vsel(x1501DF0F, a6, x550F550F, xB73121F7); - vsel(x00558A5F, x1501DF0F, a5, a1); - vxor(x2E69A463, x2E3C2E3C, x00558A5F); + x70700000 = x77770000 & ~a4; + x43433333 = a3 ^ x70700000; + x00430033 = a5 & x43433333; + x55557777 = a1 | x11116666; + x55167744 = x00430033 ^ x55557777; + x5A19784B = a4 ^ x55167744; - vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); - vsel(x045157FD, a6, a1, x0679ED42); - vsel(xB32077FF, xB73121F7, a6, x045157FD); - vxor(x9D49D39C, 
x2E69A463, xB32077FF); - vsel(x2, x9D49D39C, x2E69A463, a4); - vxor(*out3, *out3, x2); + x5A1987B4 = a6 ^ x5A19784B; + x7A3BD7F5 = x22225555 | x5A1987B4; + x003B00F5 = a5 & x7A3BD7F5; + x221955A0 = x22225555 ^ x003B00F5; + x05050707 = a4 & x55557777; + x271C52A7 = x221955A0 ^ x05050707; - vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); - vsel(xF72577AF, xB32077FF, x550F550F, a1); - vxor(x5BA4B81D, xAC81CFB2, xF72577AF); - vsel(x1, x5BA4B81D, x963969C6, a4); - vxor(*out2, *out2, x1); + x2A2A82A0 = x7A3BD7F5 & ~a1; + x6969B193 = x43433333 ^ x2A2A82A0; + x1FE06F90 = a5 ^ x1F1F6F6F; + x16804E00 = x1FE06F90 & ~x6969B193; + xE97FB1FF = ~x16804E00; + x20 = xE97FB1FF & ~a2; + x21 = x20 ^ x5A19784B; + *out3 ^= x21; - vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); - vsel(x4895469F, x5BA477AF, x00558A5F, a2); - vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); - vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + x43403302 = x43433333 & ~x003B00F5; + x35CAED30 = x2A2A82A0 ^ x1FE06F90; + x37DEFFB7 = x271C52A7 | x35CAED30; + x349ECCB5 = x37DEFFB7 & ~x43403302; + x0B01234A = x1F1F6F6F & ~x349ECCB5; - vsel(x12E6283D, a5, x5BA4B81D, x963969C6); - vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); - vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + x101884B4 = x5A1987B4 & x349ECCB5; + x0FF8EB24 = x1FE06F90 ^ x101884B4; + x41413333 = x43433333 & x55557777; + x4FF9FB37 = x0FF8EB24 | x41413333; + x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; + x30 = x4FC2FBC2 & a2; + x31 = x30 ^ x271C52A7; + *out4 ^= x31; - vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); - vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); - vxor(x6CF2295D, x891556DF, xE5E77F82); - vsel(x3, x1A35669A, x6CF2295D, a4); - vxor(*out4, *out4, x3); + x22222222 = a1 ^ x77777777; + x16BCEE97 = x349ECCB5 ^ x22222222; + x0F080B04 = a4 & x0FF8EB24; + x19B4E593 = x16BCEE97 ^ x0F080B04; + x00 = x0B01234A | a2; + x01 = x00 ^ x19B4E593; + *out1 ^= x01; - vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); - vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); - vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); - vsel(x0, x369CC1D6, x1A676AB4, a4); - vxor(*out1, *out1, x0); + x5C5C5C5C = x1F1F6F6F ^ x43433333; + x4448184C = x5C5C5C5C & ~x19B4E593; + x2DDABE71 = x22225555 ^ x0FF8EB24; + x6992A63D = x4448184C ^ x2DDABE71; + x10 = x1F1F6F6F & a2; + x11 = x10 ^ x6992A63D; + *out2 ^= x11; } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; - u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; - u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; - u32 x86CD4C9B, x12E0FFFD, x942D9A67; - u32 x142956AB, x455D45DF, x1C3EE619; - u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; - u32 x840DBB67, x6DA19C1E, x925E63E1; - u32 x9C3CA761, x257A75D5, xB946D2B4; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + u32 x09030C06, x09030000, x336622FF, x3A6522FF; + u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; + u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x555500FF, a1, a4, a5); - vxor(x666633CC, a2, x555500FF); - vsel(x606F30CF, x666633CC, a4, a3); - vxor(x353A659A, a1, x606F30CF); - vxor(x353A9A65, a5, 
x353A659A); - vnot(xCAC5659A, x353A9A65); + x33CC33CC = a2 ^ a5; - vsel(x353A6565, x353A659A, x353A9A65, a4); - vsel(x0A3F0A6F, a3, a4, x353A6565); - vxor(x6C5939A3, x666633CC, x0A3F0A6F); - vxor(x5963A3C6, x353A9A65, x6C5939A3); + x3333FFFF = a2 | a6; + x11115555 = a1 & x3333FFFF; + x22DD6699 = x33CC33CC ^ x11115555; + x22DD9966 = a6 ^ x22DD6699; + x00220099 = a5 & ~x22DD9966; - vsel(x35FF659A, a4, x353A659A, x353A6565); - vxor(x3AF06A95, a3, x35FF659A); - vsel(x05CF0A9F, a4, a3, x353A9A65); - vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + x00551144 = a1 & x22DD9966; + x33662277 = a2 ^ x00551144; + x5A5A5A5A = a1 ^ a3; + x7B7E7A7F = x33662277 | x5A5A5A5A; + x59A31CE6 = x22DD6699 ^ x7B7E7A7F; - vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); - vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); - vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); - vsel(x0, xCAC5659A, x942D9A67, a6); - vxor(*out1, *out1, x0); + x09030C06 = a3 & x59A31CE6; + x09030000 = x09030C06 & ~a6; + x336622FF = x00220099 | x33662277; + x3A6522FF = x09030000 ^ x336622FF; + x30 = x3A6522FF & a4; + x31 = x30 ^ x59A31CE6; + *out4 ^= x31; - vsel(x142956AB, x353A659A, x942D9A67, a2); - vsel(x455D45DF, a1, x86CD4C9B, x142956AB); - vxor(x1C3EE619, x5963A3C6, x455D45DF); - vsel(x3, x5963A3C6, x1C3EE619, a6); - vxor(*out4, *out4, x3); + x484D494C = a2 ^ x7B7E7A7F; + x0000B6B3 = a6 & ~x484D494C; + x0F0FB9BC = a3 ^ x0000B6B3; + x00FC00F9 = a5 & ~x09030C06; + x0FFFB9FD = x0F0FB9BC | x00FC00F9; - vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); - vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); - vxor(x3CF19C86, x1C3EE619, x20CF7A9F); - vxor(x69A49C79, x555500FF, x3CF19C86); + x5DF75DF7 = a1 | x59A31CE6; + x116600F7 = x336622FF & x5DF75DF7; + x1E69B94B = x0F0FB9BC ^ x116600F7; + x1668B94B = x1E69B94B & ~x09030000; + x20 = x00220099 | a4; + x21 = x20 ^ x1668B94B; + *out3 ^= x21; - vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); - vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); - vnot(x925E63E1, x6DA19C1E); - vsel(x1, x925E63E1, x69A49C79, a6); - vxor(*out2, *out2, x1); + x7B7B7B7B = a2 | x5A5A5A5A; + x411E5984 = x3A6522FF ^ x7B7B7B7B; + x1FFFFDFD = x11115555 | x0FFFB9FD; + x5EE1A479 = x411E5984 ^ x1FFFFDFD; - vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); - vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); - vxor(xB946D2B4, x9C3CA761, x257A75D5); - vsel(x2, x16E94A97, xB946D2B4, a6); - vxor(*out3, *out3, x2); + x3CB4DFD2 = x22DD6699 ^ x1E69B94B; + x004B002D = a5 & ~x3CB4DFD2; + xB7B2B6B3 = ~x484D494C; + xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; + xCC82CDE5 = x004B002D ^ xCCC9CDC8; + x10 = xCC82CDE5 & ~a4; + x11 = x10 ^ x5EE1A479; + *out2 ^= x11; + + x0055EEBB = a6 ^ x00551144; + x5A5AECE9 = a1 ^ x0F0FB9BC; + x0050ECA9 = x0055EEBB & x5A5AECE9; + xC5CAC1CE = x09030C06 ^ xCCC9CDC8; + xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; + x00 = x0FFFB9FD & ~a4; + x01 = x00 ^ xC59A2D67; + *out1 ^= x01; } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; - u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; - u32 x738F9C63, x11EF9867, x26DA9867; - u32 x4B4B9C63, x4B666663, x4E639396; - u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; - u32 xD728827B, x6698807B, x699C585B; - u32 x778A8877, xA4A71E18, x74878E78; - u32 x204A5845, x74879639, x8B7869C6; - u32 x0, x1, x2, x3; + u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; 
+ u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; + u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x44447777, a2, a6, a3); - vxor(x4B4B7878, a4, x44447777); - vsel(x22772277, a3, a5, a2); - vsel(x0505F5F5, a6, a2, a4); - vsel(x220522F5, x22772277, x0505F5F5, a5); - vxor(x694E5A8D, x4B4B7878, x220522F5); + x0FF00FF0 = a4 ^ a5; + x3CC33CC3 = a3 ^ x0FF00FF0; + x00003CC3 = a6 & x3CC33CC3; + x0F000F00 = a4 & x0FF00FF0; + x5A555A55 = a2 ^ x0F000F00; + x00001841 = x00003CC3 & x5A555A55; - vxor(x00FFFF00, a5, a6); - vxor(x66666666, a2, a3); - vsel(x32353235, a3, x220522F5, a4); - vsel(x26253636, x66666666, x32353235, x4B4B7878); - vxor(x26DAC936, x00FFFF00, x26253636); - vsel(x0, x26DAC936, x694E5A8D, a1); - vxor(*out1, *out1, x0); + x00000F00 = a6 & x0F000F00; + x33333C33 = a3 ^ x00000F00; + x7B777E77 = x5A555A55 | x33333C33; + x0FF0F00F = a6 ^ x0FF00FF0; + x74878E78 = x7B777E77 ^ x0FF0F00F; + x30 = a1 & ~x00001841; + x31 = x30 ^ x74878E78; + *out4 ^= x31; - vxor(x738F9C63, a2, x26DAC936); - vsel(x11EF9867, x738F9C63, a5, x66666666); - vsel(x26DA9867, x26DAC936, x11EF9867, a6); + x003C003C = a5 & ~x3CC33CC3; + x5A7D5A7D = x5A555A55 | x003C003C; + x333300F0 = x00003CC3 ^ x33333C33; + x694E5A8D = x5A7D5A7D ^ x333300F0; - vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); - vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); - vxor(x4E639396, x0505F5F5, x4B666663); + x0FF0CCCC = x00003CC3 ^ x0FF0F00F; + x000F0303 = a4 & ~x0FF0CCCC; + x5A505854 = x5A555A55 & ~x000F0303; + x33CC000F = a5 ^ x333300F0; + x699C585B = x5A505854 ^ x33CC000F; - vsel(x4E4B393C, x4B4B7878, x4E639396, a2); - vnot(xFF00FF00, a5); - vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); - vxor(xB14EE41D, x4E4B393C, xFF05DD21); - vsel(x1, xB14EE41D, x26DA9867, a1); - vxor(*out2, *out2, x1); + x7F878F78 = x0F000F00 | x74878E78; + x21101013 = a3 & x699C585B; + x7F979F7B = x7F878F78 | x21101013; + x30030CC0 = x3CC33CC3 & ~x0FF0F00F; + x4F9493BB = x7F979F7B ^ x30030CC0; + x00 = x4F9493BB & ~a1; + x01 = x00 ^ x694E5A8D; + *out1 ^= x01; - vxor(xD728827B, x66666666, xB14EE41D); - vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); - vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); - vsel(x2, x699C585B, x4E639396, a1); - vxor(*out3, *out3, x2); + x6F9CDBFB = x699C585B | x4F9493BB; + x0000DBFB = a6 & x6F9CDBFB; + x00005151 = a2 & x0000DBFB; + x26DAC936 = x694E5A8D ^ x4F9493BB; + x26DA9867 = x00005151 ^ x26DAC936; - vsel(x778A8877, x738F9C63, x26DAC936, x26253636); - vxor(xA4A71E18, x738F9C63, xD728827B); - vsel(x74878E78, x778A8877, xA4A71E18, a4); + x27DA9877 = x21101013 | x26DA9867; + x27DA438C = x0000DBFB ^ x27DA9877; + x2625C9C9 = a5 ^ x26DAC936; + x27FFCBCD = x27DA438C | x2625C9C9; + x20 = x27FFCBCD & a1; + x21 = x20 ^ x699C585B; + *out3 ^= x21; - vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); - vsel(x74879639, x74878E78, a3, x204A5845); - vnot(x8B7869C6, x74879639); - vsel(x3, x74878E78, x8B7869C6, a1); - vxor(*out4, *out4, x3); + x27FF1036 = x0000DBFB ^ x27FFCBCD; + x27FF103E = x003C003C | x27FF1036; + xB06B6C44 = ~x4F9493BB; + x97947C7A = x27FF103E ^ xB06B6C44; + x10 = x97947C7A & ~a1; + x11 = x10 ^ x26DA9867; + *out2 ^= x11; } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) 
{ - u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; - u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; - u32 x3001F74E, x30555745, x693CD926; - u32 x0C0CD926, x0C3F25E9, x38D696A5; - u32 xC729695A; - u32 x03D2117B, xC778395B, xCB471CB2; - u32 x5425B13F, x56B3803F, x919AE965; - u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; - u32 x0, x1, x2, x3; + u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; + u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; + u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; + u32 xF700A600, x61008000, x03B7856B, x62B7056B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505F5F5, a5, a1, a3); - vxor(x05FAF50A, a4, x0505F5F5); - vsel(x0F0F00FF, a3, a4, a5); - vsel(x22227777, a2, a5, a1); - vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); - vxor(x34E9B34C, a2, x07DA807F); + x0C0C0C0C = a3 & ~a2; + x0000F0F0 = a5 & ~a3; + x00FFF00F = a4 ^ x0000F0F0; + x00555005 = a1 & x00FFF00F; + x00515001 = x00555005 & ~x0C0C0C0C; - vsel(x00FFF00F, x05FAF50A, a4, a3); - vsel(x0033FCCF, a5, x00FFF00F, a2); - vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); - vsel(x0C0C3F3F, a3, a5, a2); - vxor(x59698E63, x5565B15C, x0C0C3F3F); + x33000330 = a2 & ~x00FFF00F; + x77555775 = a1 | x33000330; + x30303030 = a2 & ~a3; + x3030CFCF = a5 ^ x30303030; + x30104745 = x77555775 & x3030CFCF; + x30555745 = x00555005 | x30104745; - vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); - vsel(x30555745, x3001F74E, a1, x00FFF00F); - vxor(x693CD926, x59698E63, x30555745); - vsel(x2, x693CD926, x59698E63, a6); - vxor(*out3, *out3, x2); + xFF000FF0 = ~x00FFF00F; + xCF1048B5 = x30104745 ^ xFF000FF0; + x080A080A = a3 & ~x77555775; + xC71A40BF = xCF1048B5 ^ x080A080A; + xCB164CB3 = x0C0C0C0C ^ xC71A40BF; + x10 = x00515001 | a6; + x11 = x10 ^ xCB164CB3; + *out2 ^= x11; - vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); - vxor(x0C3F25E9, x0033FCCF, x0C0CD926); - vxor(x38D696A5, x34E9B34C, x0C3F25E9); + x9E4319E6 = a1 ^ xCB164CB3; + x000019E6 = a5 & x9E4319E6; + xF429738C = a2 ^ xC71A40BF; + xF4296A6A = x000019E6 ^ xF429738C; + xC729695A = x33000330 ^ xF4296A6A; - vnot(xC729695A, x38D696A5); + xC47C3D2F = x30555745 ^ xF4296A6A; + xF77F3F3F = a2 | xC47C3D2F; + x9E43E619 = a5 ^ x9E4319E6; + x693CD926 = xF77F3F3F ^ x9E43E619; + x20 = x30555745 & a6; + x21 = x20 ^ x693CD926; + *out3 ^= x21; - vsel(x03D2117B, x07DA807F, a2, x0C0CD926); - vsel(xC778395B, xC729695A, x03D2117B, x30555745); - vxor(xCB471CB2, x0C3F25E9, xC778395B); - vsel(x1, xCB471CB2, x34E9B34C, a6); - vxor(*out2, *out2, x1); + xF719A695 = x3030CFCF ^ xC729695A; + xF4FF73FF = a4 | xF429738C; + x03E6D56A = xF719A695 ^ xF4FF73FF; + x56B3803F = a1 ^ x03E6D56A; + x30 = x56B3803F & a6; + x31 = x30 ^ xC729695A; + *out4 ^= x31; - vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); - vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); - vxor(x919AE965, xC729695A, x56B3803F); - vsel(x3, xC729695A, x919AE965, a6); - vxor(*out4, *out4, x3); - - vsel(x17B3023F, x07DA807F, a2, x59698E63); - vor(x75555755, a1, x30555745); - vxor(x62E6556A, x17B3023F, x75555755); - vxor(xA59E6C31, xC778395B, x62E6556A); - vsel(x0, xA59E6C31, x38D696A5, a6); - vxor(*out1, *out1, x0); + xF700A600 = xF719A695 & ~a4; + x61008000 = x693CD926 & xF700A600; + x03B7856B = x00515001 ^ x03E6D56A; + x62B7056B = x61008000 ^ x03B7856B; + x00 = x62B7056B | a6; + x01 = x00 ^ xC729695A; + *out1 
^= x01; } #endif +//#define SWAP(a, b) { u32 tmp=*a;*a=*b;*b=tmp; } #define SWAP(a, b) { u32 tmp=*a;*a=*b;*b=tmp; } #define DATASWAP \ @@ -1431,37 +1532,24 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #define KEYSET07 { k00 = K31; k01 = K35; k02 = K52; k03 = K43; k04 = K08; k05 = K37; k06 = K51; k07 = K15; k08 = K49; k09 = K30; k10 = K07; k11 = K02; k12 = K50; k13 = K21; k14 = K45; k15 = K44; k16 = K29; k17 = K16; k18 = K42; k19 = K23; k20 = K22; k21 = K14; k22 = K38; k23 = K01; k24 = K10; k25 = K47; k26 = K53; k27 = K11; k28 = K27; k29 = K26; k30 = K05; k31 = K17; k32 = K54; k33 = K41; k34 = K39; k35 = K20; k36 = K48; k37 = K13; k38 = K24; k39 = K19; k40 = K32; k41 = K40; k42 = K34; k43 = K03; k44 = K06; k45 = K18; k46 = K12; k47 = K46; } #define KEYSET17 { k00 = K15; k01 = K51; k02 = K36; k03 = K02; k04 = K49; k05 = K21; k06 = K35; k07 = K31; k08 = K08; k09 = K14; k10 = K23; k11 = K43; k12 = K09; k13 = K37; k14 = K29; k15 = K28; k16 = K45; k17 = K00; k18 = K01; k19 = K07; k20 = K38; k21 = K30; k22 = K22; k23 = K42; k24 = K26; k25 = K04; k26 = K41; k27 = K54; k28 = K39; k29 = K10; k30 = K48; k31 = K33; k32 = K11; k33 = K53; k34 = K27; k35 = K32; k36 = K05; k37 = K25; k38 = K40; k39 = K03; k40 = K20; k41 = K24; k42 = K46; k43 = K19; k44 = K18; k45 = K06; k46 = K55; k47 = K34; } -#ifdef IS_NV -#define KXX_DECL -#define sXXX_DECL -#endif - -#ifdef IS_AMD -#define KXX_DECL -#define sXXX_DECL -#endif - -#ifdef IS_GENERIC -#define KXX_DECL -#define sXXX_DECL -#endif +#define myselx(a,b,c) ((c) ? (b) : (a)) #ifdef DESCRYPT_SALT void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 K04, const u32 K05, const u32 K06, const u32 K07, const u32 K08, const u32 K09, const u32 K10, const u32 K11, const u32 K12, const u32 K13, const u32 K14, const u32 K15, const u32 K16, const u32 K17, const u32 K18, const u32 K19, const u32 K20, const u32 K21, const u32 K22, const u32 K23, const u32 K24, const u32 K25, const u32 K26, const u32 K27, const u32 K28, const u32 K29, const u32 K30, const u32 K31, const u32 K32, const u32 K33, const u32 K34, const u32 K35, const u32 K36, const u32 K37, const u32 K38, const u32 K39, const u32 K40, const u32 K41, const u32 K42, const u32 K43, const u32 K44, const u32 K45, const u32 K46, const u32 K47, const u32 K48, const u32 K49, const u32 K50, const u32 K51, const u32 K52, const u32 K53, const u32 K54, const u32 K55, u32 *D00, u32 *D01, u32 *D02, u32 *D03, u32 *D04, u32 *D05, u32 *D06, u32 *D07, u32 *D08, u32 *D09, u32 *D10, u32 *D11, u32 *D12, u32 *D13, u32 *D14, u32 *D15, u32 *D16, u32 *D17, u32 *D18, u32 *D19, u32 *D20, u32 *D21, u32 *D22, u32 *D23, u32 *D24, u32 *D25, u32 *D26, u32 *D27, u32 *D28, u32 *D29, u32 *D30, u32 *D31, u32 *D32, u32 *D33, u32 *D34, u32 *D35, u32 *D36, u32 *D37, u32 *D38, u32 *D39, u32 *D40, u32 *D41, u32 *D42, u32 *D43, u32 *D44, u32 *D45, u32 *D46, u32 *D47, u32 *D48, u32 *D49, u32 *D50, u32 *D51, u32 *D52, u32 *D53, u32 *D54, u32 *D55, u32 *D56, u32 *D57, u32 *D58, u32 *D59, u32 *D60, u32 *D61, u32 *D62, u32 *D63) { - sXXX_DECL u32 s001 = (0x001 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s002 = (0x002 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s004 = (0x004 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s008 = (0x008 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s010 = (0x010 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s020 = (0x020 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s040 = (0x040 & DESCRYPT_SALT) ? 
0xffffffff : 0; - sXXX_DECL u32 s080 = (0x080 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s100 = (0x100 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s200 = (0x200 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s400 = (0x400 & DESCRYPT_SALT) ? 0xffffffff : 0; - sXXX_DECL u32 s800 = (0x800 & DESCRYPT_SALT) ? 0xffffffff : 0; + sXXX_DECL u32 s001 = (0x001 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s002 = (0x002 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s004 = (0x004 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s008 = (0x008 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s010 = (0x010 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s020 = (0x020 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s040 = (0x040 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s080 = (0x080 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s100 = (0x100 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s200 = (0x200 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s400 = (0x400 & DESCRYPT_SALT) ? 1 : 0; + sXXX_DECL u32 s800 = (0x800 & DESCRYPT_SALT) ? 1 : 0; KXX_DECL u32 k00, k01, k02, k03, k04, k05; KXX_DECL u32 k06, k07, k08, k09, k10, k11; @@ -1474,60 +1562,6 @@ void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, cons for (u32 ii = 0; ii < 25; ii++) { - #if defined IS_AMD || defined IS_GENERIC - - #ifdef _unroll - #pragma unroll - #endif - for (u32 i = 0; i < 8; i++) - { - switch (i) - { - case 0: KEYSET00; break; - case 1: KEYSET02; break; - case 2: KEYSET04; break; - case 3: KEYSET06; break; - case 4: KEYSET10; break; - case 5: KEYSET12; break; - case 6: KEYSET14; break; - case 7: KEYSET16; break; - } - - s1(myselx (*D63, *D47, s001) ^ k00, myselx (*D32, *D48, s002) ^ k01, myselx (*D33, *D49, s004) ^ k02, myselx (*D34, *D50, s008) ^ k03, myselx (*D35, *D51, s010) ^ k04, myselx (*D36, *D52, s020) ^ k05, D08, D16, D22, D30); - s2(myselx (*D35, *D51, s040) ^ k06, myselx (*D36, *D52, s080) ^ k07, myselx (*D37, *D53, s100) ^ k08, myselx (*D38, *D54, s200) ^ k09, myselx (*D39, *D55, s400) ^ k10, myselx (*D40, *D56, s800) ^ k11, D12, D27, D01, D17); - s3( *D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05); - s4( *D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00); - s5(myselx (*D47, *D63, s001) ^ k24, myselx (*D48, *D32, s002) ^ k25, myselx (*D49, *D33, s004) ^ k26, myselx (*D50, *D34, s008) ^ k27, myselx (*D51, *D35, s010) ^ k28, myselx (*D52, *D36, s020) ^ k29, D07, D13, D24, D02); - s6(myselx (*D51, *D35, s040) ^ k30, myselx (*D52, *D36, s080) ^ k31, myselx (*D53, *D37, s100) ^ k32, myselx (*D54, *D38, s200) ^ k33, myselx (*D55, *D39, s400) ^ k34, myselx (*D56, *D40, s800) ^ k35, D03, D28, D10, D18); - s7( *D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06); - s8( *D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20); - - switch (i) - { - case 0: KEYSET01; break; - case 1: KEYSET03; break; - case 2: KEYSET05; break; - case 3: KEYSET07; break; - case 4: KEYSET11; break; - case 5: KEYSET13; break; - case 6: KEYSET15; break; - case 7: KEYSET17; break; - } - - s1(myselx (*D31, *D15, s001) ^ k00, myselx (*D00, *D16, s002) ^ k01, myselx (*D01, *D17, s004) ^ k02, myselx (*D02, *D18, s008) ^ k03, myselx (*D03, *D19, s010) ^ k04, myselx (*D04, *D20, s020) ^ k05, D40, D48, D54, D62); - s2(myselx (*D03, *D19, s040) ^ k06, myselx (*D04, *D20, s080) ^ k07, myselx (*D05, *D21, s100) ^ k08, myselx (*D06, *D22, s200) ^ k09, myselx (*D07, *D23, s400) ^ k10, myselx (*D08, *D24, s800) ^ 
k11, D44, D59, D33, D49); - s3( *D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37); - s4( *D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32); - s5(myselx (*D15, *D31, s001) ^ k24, myselx (*D16, *D00, s002) ^ k25, myselx (*D17, *D01, s004) ^ k26, myselx (*D18, *D02, s008) ^ k27, myselx (*D19, *D03, s010) ^ k28, myselx (*D20, *D04, s020) ^ k29, D39, D45, D56, D34); - s6(myselx (*D19, *D03, s040) ^ k30, myselx (*D20, *D04, s080) ^ k31, myselx (*D21, *D05, s100) ^ k32, myselx (*D22, *D06, s200) ^ k33, myselx (*D23, *D07, s400) ^ k34, myselx (*D24, *D08, s800) ^ k35, D35, D60, D42, D50); - s7( *D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); - s8( *D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); - } - - #endif - - #if defined IS_NV - #ifdef _unroll #pragma unroll #endif @@ -1622,8 +1656,6 @@ void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, cons s8( *D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); } - #endif - DATASWAP; } diff --git a/OpenCL/m02501.cl b/OpenCL/m02501.cl index d506bc389..dbfd507ba 100644 --- a/OpenCL/m02501.cl +++ b/OpenCL/m02501.cl @@ -17,12 +17,12 @@ #define COMPARE_S "inc_comp_single.cl" #define COMPARE_M "inc_comp_multi.cl" -inline u8 hex_convert (const u8 c) +u8 hex_convert (const u8 c) { return (c & 15) + (c >> 6) * 9; } -inline u8 hex_to_u8 (const u8 hex[2]) +u8 hex_to_u8 (const u8 hex[2]) { u8 v = 0; diff --git a/OpenCL/m03000_a3.cl b/OpenCL/m03000_a3.cl index 9817c3828..26dea4196 100644 --- a/OpenCL/m03000_a3.cl +++ b/OpenCL/m03000_a3.cl @@ -19,7 +19,7 @@ #endif #ifdef IS_AMD -#define KXX_DECL volatile +#define KXX_DECL #endif #ifdef IS_GENERIC @@ -898,11 +898,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes making use of a vector conditional select operation - * (e.g., vsel on PowerPC with AltiVec). + * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC + * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. * - * Gate counts: 36 33 33 26 35 34 34 32 - * Average: 32.875 + * Gate counts: 49 44 46 33 48 46 46 41 + * Average: 44.125 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). 
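The same substitution is applied to m03000_a3.cl below. For orientation, the s1()..s8() functions operate on a bitsliced state: bit i of each input word a1..a6 belongs to the i-th of 32 DES instances evaluated in parallel, so one call computes an S-box for all 32 instances using only word-wide gates and no table lookups. A hedged usage sketch, with arbitrary illustrative input values:

// bit i of each word is input bit j of S-box 1 for instance i (values arbitrary)
const u32 a1 = 0x01234567, a2 = 0x89abcdef, a3 = 0xdeadbeef,
          a4 = 0xcafebabe, a5 = 0x0f0f0f0f, a6 = 0x33333333;

u32 o1 = 0, o2 = 0, o3 = 0, o4 = 0;

// one call evaluates DES S-box 1 for all 32 instances at once; each output
// word carries one of the four S-box output bits and is XORed into o1..o4
s1 (a1, a2, a3, a4, a5, a6, &o1, &o2, &o3, &o4);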
@@ -921,469 +921,556 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ -#define vnot(d,a) (d) = ~(a) -#define vor(d,a,b) (d) = (a) | (b) -#define vxor(d,a,b) (d) = (a) ^ (b) -#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) - void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; - u32 x55AFD1B7, x3C3C69C3, x6993B874; - u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; - u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; - u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; - u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; - u32 x0DBCE883, x3A25A215, x37994A96; - u32 xC9C93B62, x89490F02, xB96C2D16; - u32 x0, x1, x2, x3; + u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, + x25202160; + u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; + u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; + u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; + u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; + u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; + u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; + u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F0F3333, a3, a2, a5); - vxor(x3C3C3C3C, a2, a3); - vor(x55FF55FF, a1, a4); - vxor(x69C369C3, x3C3C3C3C, x55FF55FF); - vsel(x0903B73F, a5, x0F0F3333, x69C369C3); - vxor(x09FCB7C0, a4, x0903B73F); - vxor(x5CA9E295, a1, x09FCB7C0); + x55005500 = a1 & ~a5; + x5A0F5A0F = a4 ^ x55005500; + x3333FFFF = a3 | a6; + x66666666 = a1 ^ a3; + x22226666 = x3333FFFF & x66666666; + x2D2D6969 = a4 ^ x22226666; + x25202160 = x2D2D6969 & ~x5A0F5A0F; - vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); - vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); - vxor(x6993B874, x55AFD1B7, x3C3C69C3); + x00FFFF00 = a5 ^ a6; + x33CCCC33 = a3 ^ x00FFFF00; + x4803120C = x5A0F5A0F & ~x33CCCC33; + x2222FFFF = a6 | x22226666; + x6A21EDF3 = x4803120C ^ x2222FFFF; + x4A01CC93 = x6A21EDF3 & ~x25202160; - vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); - vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); - vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); - vxor(x529E962D, x0F0F3333, x5D91A51E); + x5555FFFF = a1 | a6; + x7F75FFFF = x6A21EDF3 | x5555FFFF; + x00D20096 = a5 & ~x2D2D6969; + x7FA7FF69 = x7F75FFFF ^ x00D20096; - vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); - vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); - vsel(x428679F3, a5, x4B8771A3, x529E962D); - vxor(x6B68D433, x29EEADC0, x428679F3); + x0A0A0000 = a4 & ~x5555FFFF; + x0AD80096 = x00D20096 ^ x0A0A0000; + x00999900 = x00FFFF00 & ~x66666666; + x0AD99996 = x0AD80096 | x00999900; - vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); - vsel(x026F12F3, a4, x0F0F3333, x529E962D); - vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); - vnot(x94D83B6C, x6B27C493); - vsel(x0, x94D83B6C, x6B68D433, a6); - vxor(*out1, *out1, x0); + x22332233 = a3 & ~x55005500; + x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; + x054885C0 = x257AA5F0 & ~x22332233; + xFAB77A3F = ~x054885C0; + x2221EDF3 = x3333FFFF & x6A21EDF3; + xD89697CC = xFAB77A3F ^ x2221EDF3; + x20 = x7FA7FF69 & ~a2; + x21 = x20 ^ xD89697CC; + *out3 ^= x21; - vsel(x965E0B0F, x94D83B6C, a3, x428679F3); - vsel(x3327A113, x5BA7E193, a2, x69C369C3); - vsel(x847F0A1F, x965E0B0F, a4, x3327A113); - vxor(xD6E19C32, x529E962D, x847F0A1F); - 
vsel(x1, xD6E19C32, x5CA9E295, a6); - vxor(*out2, *out2, x1); + x05B77AC0 = x00FFFF00 ^ x054885C0; + x05F77AD6 = x00D20096 | x05B77AC0; + x36C48529 = x3333FFFF ^ x05F77AD6; + x6391D07C = a1 ^ x36C48529; + xBB0747B0 = xD89697CC ^ x6391D07C; + x00 = x25202160 | a2; + x01 = x00 ^ xBB0747B0; + *out1 ^= x01; - vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); - vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); - vxor(x37994A96, x0DBCE883, x3A25A215); - vsel(x3, x37994A96, x529E962D, a6); - vxor(*out4, *out4, x3); + x4C460000 = x3333FFFF ^ x7F75FFFF; + x4EDF9996 = x0AD99996 | x4C460000; + x2D4E49EA = x6391D07C ^ x4EDF9996; + xBBFFFFB0 = x00FFFF00 | xBB0747B0; + x96B1B65A = x2D4E49EA ^ xBBFFFFB0; + x10 = x4A01CC93 | a2; + x11 = x10 ^ x96B1B65A; + *out2 ^= x11; - vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); - vsel(x89490F02, a3, xC9C93B62, x965E0B0F); - vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); - vsel(x2, xB96C2D16, x6993B874, a6); - vxor(*out3, *out3, x2); + x5AFF5AFF = a5 | x5A0F5A0F; + x52B11215 = x5AFF5AFF & ~x2D4E49EA; + x4201C010 = x4A01CC93 & x6391D07C; + x10B0D205 = x52B11215 ^ x4201C010; + x30 = x10B0D205 | a2; + x31 = x30 ^ x0AD99996; + *out4 ^= x31; } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; - u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; - u32 x0F5AF03C, x6600FF56, x87A5F09C; - u32 xA55A963C, x3C69C30F, xB44BC32D; - u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; - u32 xB46C662D, x278DB412, xB66CB43B; - u32 xD2DC4E52, x27993333, xD2994E33; - u32 x278D0F2D, x2E0E547B, x09976748; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x55550000, x00AA00FF, x33BB33FF; + u32 x33CC0000, x11441144, x11BB11BB, x003311BB; + u32 x00000F0F, x336600FF, x332200FF, x332200F0; + u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; + u32 x0A451047, xBBDFDD7B, xB19ACD3C; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x55553333, a1, a3, a6); - vsel(x0055FF33, a6, x55553333, a5); - vsel(x33270F03, a3, a4, x0055FF33); - vxor(x66725A56, a1, x33270F03); - vxor(x00FFFF00, a5, a6); - vxor(x668DA556, x66725A56, x00FFFF00); + x33CC33CC = a2 ^ a5; - vsel(x0F0F5A56, a4, x66725A56, a6); - vnot(xF0F0A5A9, x0F0F5A56); - vxor(xA5A5969A, x55553333, xF0F0A5A9); - vxor(xA55A699A, x00FFFF00, xA5A5969A); - vsel(x1, xA55A699A, x668DA556, a2); - vxor(*out2, *out2, x1); + x55550000 = a1 & ~a6; + x00AA00FF = a5 & ~x55550000; + x33BB33FF = a2 | x00AA00FF; - vxor(x0F5AF03C, a4, x0055FF33); - vsel(x6600FF56, x66725A56, a6, x00FFFF00); - vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + x33CC0000 = x33CC33CC & ~a6; + x11441144 = a1 & x33CC33CC; + x11BB11BB = a5 ^ x11441144; + x003311BB = x11BB11BB & ~x33CC0000; - vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); - vxor(x3C69C30F, a3, x0F5AF03C); - vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + x00000F0F = a3 & a6; + x336600FF = x00AA00FF ^ x33CC0000; + x332200FF = x33BB33FF & x336600FF; + x332200F0 = x332200FF & ~x00000F0F; - vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); - vsel(x0F4B0F2D, a4, xB44BC32D, a5); - vxor(x699CC37B, x66D7CC56, x0F4B0F2D); - vxor(x996C66D2, xF0F0A5A9, x699CC37B); - vsel(x0, x996C66D2, xB44BC32D, a2); - vxor(*out1, *out1, x0); + x0302000F = a3 & x332200FF; + xAAAAAAAA = ~a1; + 
xA9A8AAA5 = x0302000F ^ xAAAAAAAA; + x33CCCC33 = a6 ^ x33CC33CC; + x33CCC030 = x33CCCC33 & ~x00000F0F; + x9A646A95 = xA9A8AAA5 ^ x33CCC030; + x10 = a4 & ~x332200F0; + x11 = x10 ^ x9A646A95; + *out2 ^= x11; - vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); - vsel(x278DB412, x668DA556, xA5A5969A, a1); - vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + x00333303 = a2 & ~x33CCC030; + x118822B8 = x11BB11BB ^ x00333303; + xA8208805 = xA9A8AAA5 & ~x118822B8; + x3CC3C33C = a3 ^ x33CCCC33; + x94E34B39 = xA8208805 ^ x3CC3C33C; + x00 = x33BB33FF & ~a4; + x01 = x00 ^ x94E34B39; + *out1 ^= x01; - vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); - vsel(x27993333, x278DB412, a3, x0055FF33); - vsel(xD2994E33, xD2DC4E52, x27993333, a5); - vsel(x3, x87A5F09C, xD2994E33, a2); - vxor(*out4, *out4, x3); + x0331330C = x0302000F ^ x00333303; + x3FF3F33C = x3CC3C33C | x0331330C; + xA9DF596A = x33BB33FF ^ x9A646A95; + xA9DF5F6F = x00000F0F | xA9DF596A; + x962CAC53 = x3FF3F33C ^ xA9DF5F6F; - vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); - vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); - vxor(x09976748, x27993333, x2E0E547B); - vsel(x2, xB66CB43B, x09976748, a2); - vxor(*out3, *out3, x2); + xA9466A6A = x332200FF ^ x9A646A95; + x3DA52153 = x94E34B39 ^ xA9466A6A; + x29850143 = xA9DF5F6F & x3DA52153; + x33C0330C = x33CC33CC & x3FF3F33C; + x1A45324F = x29850143 ^ x33C0330C; + x20 = x1A45324F | a4; + x21 = x20 ^ x962CAC53; + *out3 ^= x21; + + x0A451047 = x1A45324F & ~x118822B8; + xBBDFDD7B = x33CCCC33 | xA9DF596A; + xB19ACD3C = x0A451047 ^ xBBDFDD7B; + x30 = x003311BB | a4; + x31 = x30 ^ xB19ACD3C; + *out4 ^= x31; } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F330F33, x0F33F0CC, x5A66A599; - u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; - u32 x556BA09E, x665A93AC, x99A56C53; - u32 x25A1A797, x5713754C, x66559355, x47B135C6; - u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; - u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; - u32 xD069F8B4, x030FF0C3, xD2699876; - u32 xD579DDF4, xD579F0C3, xB32C6396; - u32 x0, x1, x2, x3; + u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; + u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F330F33, a4, a3, a5); - vxor(x0F33F0CC, a6, x0F330F33); - vxor(x5A66A599, a2, x0F33F0CC); + x44444444 = a1 & ~a2; + x0F0FF0F0 = a3 ^ a6; + x4F4FF4F4 = x44444444 | x0F0FF0F0; + x00FFFF00 = a4 ^ a6; + x00AAAA00 = x00FFFF00 & ~a1; + x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; - vsel(x2111B7BB, a3, a6, x5A66A599); - vsel(x03FF3033, a5, a3, x0F33F0CC); - vsel(x05BB50EE, a5, x0F33F0CC, a2); - vsel(x074F201F, x03FF3033, a4, x05BB50EE); - vxor(x265E97A4, x2111B7BB, x074F201F); + x3C3CC3C3 = a2 ^ x0F0FF0F0; + x3C3C0000 = x3C3CC3C3 & ~a6; + x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; + x0C840A00 = x4FE55EF4 & ~x7373F4F4; - vsel(x556BA09E, x5A66A599, x05BB50EE, a4); - vsel(x665A93AC, x556BA09E, x265E97A4, a3); - vnot(x99A56C53, x665A93AC); - vsel(x1, x265E97A4, x99A56C53, a1); - vxor(*out2, *out2, x1); + x00005EF4 = a6 & x4FE55EF4; + x00FF5EFF = a4 | x00005EF4; + x00555455 = a1 & x00FF5EFF; + x3C699796 
= x3C3CC3C3 ^ x00555455; + x30 = x4FE55EF4 & ~a5; + x31 = x30 ^ x3C699796; + *out4 ^= x31; - vxor(x25A1A797, x03FF3033, x265E97A4); - vsel(x5713754C, a2, x0F33F0CC, x074F201F); - vsel(x66559355, x665A93AC, a2, a5); - vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + x000FF000 = x0F0FF0F0 & x00FFFF00; + x55AA55AA = a1 ^ a4; + x26D9A15E = x7373F4F4 ^ x55AA55AA; + x2FDFAF5F = a3 | x26D9A15E; + x2FD00F5F = x2FDFAF5F & ~x000FF000; - vxor(x9A5A5C60, x03FF3033, x99A56C53); - vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); - vxor(x87698DB4, x5713754C, xD07AF8F8); - vxor(xE13C1EE1, x66559355, x87698DB4); + x55AAFFAA = x00AAAA00 | x55AA55AA; + x28410014 = x3C699796 & ~x55AAFFAA; + x000000FF = a4 & a6; + x000000CC = x000000FF & ~a2; + x284100D8 = x28410014 ^ x000000CC; - vsel(x000CFFCF, a4, a6, x0F33F0CC); - vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); - vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); - vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); - vsel(x0, x9E49915E, xE13C1EE1, a1); - vxor(*out1, *out1, x0); + x204100D0 = x7373F4F4 & x284100D8; + x3C3CC3FF = x3C3CC3C3 | x000000FF; + x1C3CC32F = x3C3CC3FF & ~x204100D0; + x4969967A = a1 ^ x1C3CC32F; + x10 = x2FD00F5F & a5; + x11 = x10 ^ x4969967A; + *out2 ^= x11; - vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); - vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); - vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); - vsel(x3, x5A66A599, xD2699876, a1); - vxor(*out4, *out4, x3); + x4CC44CC4 = x4FE55EF4 & ~a2; + x40C040C0 = x4CC44CC4 & ~a3; + xC3C33C3C = ~x3C3CC3C3; + x9669C396 = x55AAFFAA ^ xC3C33C3C; + xD6A98356 = x40C040C0 ^ x9669C396; + x00 = a5 & ~x0C840A00; + x01 = x00 ^ xD6A98356; + *out1 ^= x01; - vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); - vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); - vxor(xB32C6396, x66559355, xD579F0C3); - vsel(x2, xB32C6396, x47B135C6, a1); - vxor(*out3, *out3, x2); + xD6E9C3D6 = x40C040C0 | x9669C396; + x4CEEEEC4 = x00AAAA00 | x4CC44CC4; + x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; + x001A000B = a4 & ~x4FE55EF4; + x9A1F2D1B = x9A072D12 | x001A000B; + x20 = a5 & ~x284100D8; + x21 = x20 ^ x9A1F2D1B; + *out3 ^= x21; } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, - x0AF50F0F, x4CA36B59; - u32 xB35C94A6; - u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; - u32 x56E9861E; - u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; - u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; - u32 x0, x1, x2, x3; + u32 x5A5A5A5A, x0F0FF0F0; + u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, + x52FBCA0F, x61C8F93C; + u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; + u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; + u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505AFAF, a5, a3, a1); - vsel(x0555AF55, x0505AFAF, a1, a4); - vxor(x0A5AA05A, a3, x0555AF55); - vsel(x46566456, a1, x0A5AA05A, a2); - vsel(x0A0A5F5F, a3, a5, a1); - vxor(x0AF55FA0, a4, x0A0A5F5F); - vsel(x0AF50F0F, x0AF55FA0, a3, a5); - vxor(x4CA36B59, x46566456, x0AF50F0F); + x5A5A5A5A = a1 ^ a3; + x0F0FF0F0 = a3 ^ a5; + x33FF33FF = a2 | a4; + x33FFCC00 = a5 ^ x33FF33FF; + x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; + x0C0CC0C0 = x0F0FF0F0 & ~a2; + x0CF3C03F = a4 ^ x0C0CC0C0; + x5EFBDA7F = x5A5A5A5A | x0CF3C03F; + x52FBCA0F = x5EFBDA7F & ~x0C0030F0; + x61C8F93C = a2 ^ x52FBCA0F; - vnot(xB35C94A6, x4CA36B59); + x00C0C03C = x0CF3C03F & 
x61C8F93C; + x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; + x3B92A366 = x5A5A5A5A ^ x61C8F93C; + x30908326 = x3B92A366 & ~x0F0F30C0; + x3C90B3D6 = x0C0030F0 ^ x30908326; - vsel(x01BB23BB, a4, a2, x0555AF55); - vxor(x5050FAFA, a1, x0505AFAF); - vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); - vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + x33CC33CC = a2 ^ a4; + x0C0CFFFF = a5 | x0C0CC0C0; + x379E5C99 = x3B92A366 ^ x0C0CFFFF; + x04124C11 = x379E5C99 & ~x33CC33CC; + x56E9861E = x52FBCA0F ^ x04124C11; + x00 = a6 & ~x3C90B3D6; + x01 = x00 ^ x56E9861E; + *out1 ^= x01; - vnot(x56E9861E, xA91679E1); + xA91679E1 = ~x56E9861E; + x10 = x3C90B3D6 & ~a6; + x11 = x10 ^ xA91679E1; + *out2 ^= x11; - vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); - vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); - vsel(x827D9784, xB35C94A6, x0AF55F00, a2); - vxor(xD2946D9A, x50E9FA1E, x827D9784); - vsel(x2, xD2946D9A, x4CA36B59, a6); - vxor(*out3, *out3, x2); - vsel(x3, xB35C94A6, xD2946D9A, a6); - vxor(*out4, *out4, x3); + x9586CA37 = x3C90B3D6 ^ xA91679E1; + x8402C833 = x9586CA37 & ~x33CC33CC; + x84C2C83F = x00C0C03C | x8402C833; + xB35C94A6 = x379E5C99 ^ x84C2C83F; + x20 = x61C8F93C | a6; + x21 = x20 ^ xB35C94A6; + *out3 ^= x21; - vsel(x31F720B3, a2, a4, x0AF55FA0); - vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); - vxor(x4712A7AD, x56E9861E, x11FB21B3); - vxor(x9586CA37, xD2946D9A, x4712A7AD); - vsel(x0, x56E9861E, x9586CA37, a6); - vxor(*out1, *out1, x0); - vsel(x1, x9586CA37, xA91679E1, a6); - vxor(*out2, *out2, x1); + x30 = a6 & x61C8F93C; + x31 = x30 ^ xB35C94A6; + *out4 ^= x31; } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; - u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; - u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; - u32 xAC81CFB2, xF72577AF, x5BA4B81D; - u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; - u32 x12E6283D, x9E47D3D4, x1A676AB4; - u32 x891556DF, xE5E77F82, x6CF2295D; - u32 x2E3CA5F5, x9697C1C6, x369CC1D6; - u32 x0, x1, x2, x3; + u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; + u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x550F550F, a1, a3, a5); - vnot(xAAF0AAF0, x550F550F); - vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); - vxor(x96C696C6, a2, xA5F5A5F5); - vxor(x00FFFF00, a5, a6); - vxor(x963969C6, x96C696C6, x00FFFF00); + x77777777 = a1 | a3; + x77770000 = x77777777 & ~a6; + x22225555 = a1 ^ x77770000; + x11116666 = a3 ^ x22225555; + x1F1F6F6F = a4 | x11116666; - vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); - vsel(xB73121F7, a2, x963969C6, x96C696C6); - vsel(x1501DF0F, a6, x550F550F, xB73121F7); - vsel(x00558A5F, x1501DF0F, a5, a1); - vxor(x2E69A463, x2E3C2E3C, x00558A5F); + x70700000 = x77770000 & ~a4; + x43433333 = a3 ^ x70700000; + x00430033 = a5 & x43433333; + x55557777 = a1 | x11116666; + x55167744 = x00430033 ^ x55557777; + x5A19784B = a4 ^ x55167744; - vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); - vsel(x045157FD, a6, a1, x0679ED42); - vsel(xB32077FF, xB73121F7, a6, x045157FD); - vxor(x9D49D39C, 
x2E69A463, xB32077FF); - vsel(x2, x9D49D39C, x2E69A463, a4); - vxor(*out3, *out3, x2); + x5A1987B4 = a6 ^ x5A19784B; + x7A3BD7F5 = x22225555 | x5A1987B4; + x003B00F5 = a5 & x7A3BD7F5; + x221955A0 = x22225555 ^ x003B00F5; + x05050707 = a4 & x55557777; + x271C52A7 = x221955A0 ^ x05050707; - vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); - vsel(xF72577AF, xB32077FF, x550F550F, a1); - vxor(x5BA4B81D, xAC81CFB2, xF72577AF); - vsel(x1, x5BA4B81D, x963969C6, a4); - vxor(*out2, *out2, x1); + x2A2A82A0 = x7A3BD7F5 & ~a1; + x6969B193 = x43433333 ^ x2A2A82A0; + x1FE06F90 = a5 ^ x1F1F6F6F; + x16804E00 = x1FE06F90 & ~x6969B193; + xE97FB1FF = ~x16804E00; + x20 = xE97FB1FF & ~a2; + x21 = x20 ^ x5A19784B; + *out3 ^= x21; - vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); - vsel(x4895469F, x5BA477AF, x00558A5F, a2); - vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); - vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + x43403302 = x43433333 & ~x003B00F5; + x35CAED30 = x2A2A82A0 ^ x1FE06F90; + x37DEFFB7 = x271C52A7 | x35CAED30; + x349ECCB5 = x37DEFFB7 & ~x43403302; + x0B01234A = x1F1F6F6F & ~x349ECCB5; - vsel(x12E6283D, a5, x5BA4B81D, x963969C6); - vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); - vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + x101884B4 = x5A1987B4 & x349ECCB5; + x0FF8EB24 = x1FE06F90 ^ x101884B4; + x41413333 = x43433333 & x55557777; + x4FF9FB37 = x0FF8EB24 | x41413333; + x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; + x30 = x4FC2FBC2 & a2; + x31 = x30 ^ x271C52A7; + *out4 ^= x31; - vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); - vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); - vxor(x6CF2295D, x891556DF, xE5E77F82); - vsel(x3, x1A35669A, x6CF2295D, a4); - vxor(*out4, *out4, x3); + x22222222 = a1 ^ x77777777; + x16BCEE97 = x349ECCB5 ^ x22222222; + x0F080B04 = a4 & x0FF8EB24; + x19B4E593 = x16BCEE97 ^ x0F080B04; + x00 = x0B01234A | a2; + x01 = x00 ^ x19B4E593; + *out1 ^= x01; - vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); - vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); - vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); - vsel(x0, x369CC1D6, x1A676AB4, a4); - vxor(*out1, *out1, x0); + x5C5C5C5C = x1F1F6F6F ^ x43433333; + x4448184C = x5C5C5C5C & ~x19B4E593; + x2DDABE71 = x22225555 ^ x0FF8EB24; + x6992A63D = x4448184C ^ x2DDABE71; + x10 = x1F1F6F6F & a2; + x11 = x10 ^ x6992A63D; + *out2 ^= x11; } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; - u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; - u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; - u32 x86CD4C9B, x12E0FFFD, x942D9A67; - u32 x142956AB, x455D45DF, x1C3EE619; - u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; - u32 x840DBB67, x6DA19C1E, x925E63E1; - u32 x9C3CA761, x257A75D5, xB946D2B4; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + u32 x09030C06, x09030000, x336622FF, x3A6522FF; + u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; + u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x555500FF, a1, a4, a5); - vxor(x666633CC, a2, x555500FF); - vsel(x606F30CF, x666633CC, a4, a3); - vxor(x353A659A, a1, x606F30CF); - vxor(x353A9A65, a5, 
x353A659A); - vnot(xCAC5659A, x353A9A65); + x33CC33CC = a2 ^ a5; - vsel(x353A6565, x353A659A, x353A9A65, a4); - vsel(x0A3F0A6F, a3, a4, x353A6565); - vxor(x6C5939A3, x666633CC, x0A3F0A6F); - vxor(x5963A3C6, x353A9A65, x6C5939A3); + x3333FFFF = a2 | a6; + x11115555 = a1 & x3333FFFF; + x22DD6699 = x33CC33CC ^ x11115555; + x22DD9966 = a6 ^ x22DD6699; + x00220099 = a5 & ~x22DD9966; - vsel(x35FF659A, a4, x353A659A, x353A6565); - vxor(x3AF06A95, a3, x35FF659A); - vsel(x05CF0A9F, a4, a3, x353A9A65); - vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + x00551144 = a1 & x22DD9966; + x33662277 = a2 ^ x00551144; + x5A5A5A5A = a1 ^ a3; + x7B7E7A7F = x33662277 | x5A5A5A5A; + x59A31CE6 = x22DD6699 ^ x7B7E7A7F; - vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); - vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); - vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); - vsel(x0, xCAC5659A, x942D9A67, a6); - vxor(*out1, *out1, x0); + x09030C06 = a3 & x59A31CE6; + x09030000 = x09030C06 & ~a6; + x336622FF = x00220099 | x33662277; + x3A6522FF = x09030000 ^ x336622FF; + x30 = x3A6522FF & a4; + x31 = x30 ^ x59A31CE6; + *out4 ^= x31; - vsel(x142956AB, x353A659A, x942D9A67, a2); - vsel(x455D45DF, a1, x86CD4C9B, x142956AB); - vxor(x1C3EE619, x5963A3C6, x455D45DF); - vsel(x3, x5963A3C6, x1C3EE619, a6); - vxor(*out4, *out4, x3); + x484D494C = a2 ^ x7B7E7A7F; + x0000B6B3 = a6 & ~x484D494C; + x0F0FB9BC = a3 ^ x0000B6B3; + x00FC00F9 = a5 & ~x09030C06; + x0FFFB9FD = x0F0FB9BC | x00FC00F9; - vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); - vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); - vxor(x3CF19C86, x1C3EE619, x20CF7A9F); - vxor(x69A49C79, x555500FF, x3CF19C86); + x5DF75DF7 = a1 | x59A31CE6; + x116600F7 = x336622FF & x5DF75DF7; + x1E69B94B = x0F0FB9BC ^ x116600F7; + x1668B94B = x1E69B94B & ~x09030000; + x20 = x00220099 | a4; + x21 = x20 ^ x1668B94B; + *out3 ^= x21; - vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); - vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); - vnot(x925E63E1, x6DA19C1E); - vsel(x1, x925E63E1, x69A49C79, a6); - vxor(*out2, *out2, x1); + x7B7B7B7B = a2 | x5A5A5A5A; + x411E5984 = x3A6522FF ^ x7B7B7B7B; + x1FFFFDFD = x11115555 | x0FFFB9FD; + x5EE1A479 = x411E5984 ^ x1FFFFDFD; - vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); - vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); - vxor(xB946D2B4, x9C3CA761, x257A75D5); - vsel(x2, x16E94A97, xB946D2B4, a6); - vxor(*out3, *out3, x2); + x3CB4DFD2 = x22DD6699 ^ x1E69B94B; + x004B002D = a5 & ~x3CB4DFD2; + xB7B2B6B3 = ~x484D494C; + xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; + xCC82CDE5 = x004B002D ^ xCCC9CDC8; + x10 = xCC82CDE5 & ~a4; + x11 = x10 ^ x5EE1A479; + *out2 ^= x11; + + x0055EEBB = a6 ^ x00551144; + x5A5AECE9 = a1 ^ x0F0FB9BC; + x0050ECA9 = x0055EEBB & x5A5AECE9; + xC5CAC1CE = x09030C06 ^ xCCC9CDC8; + xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; + x00 = x0FFFB9FD & ~a4; + x01 = x00 ^ xC59A2D67; + *out1 ^= x01; } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; - u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; - u32 x738F9C63, x11EF9867, x26DA9867; - u32 x4B4B9C63, x4B666663, x4E639396; - u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; - u32 xD728827B, x6698807B, x699C585B; - u32 x778A8877, xA4A71E18, x74878E78; - u32 x204A5845, x74879639, x8B7869C6; - u32 x0, x1, x2, x3; + u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; 
+ u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; + u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x44447777, a2, a6, a3); - vxor(x4B4B7878, a4, x44447777); - vsel(x22772277, a3, a5, a2); - vsel(x0505F5F5, a6, a2, a4); - vsel(x220522F5, x22772277, x0505F5F5, a5); - vxor(x694E5A8D, x4B4B7878, x220522F5); + x0FF00FF0 = a4 ^ a5; + x3CC33CC3 = a3 ^ x0FF00FF0; + x00003CC3 = a6 & x3CC33CC3; + x0F000F00 = a4 & x0FF00FF0; + x5A555A55 = a2 ^ x0F000F00; + x00001841 = x00003CC3 & x5A555A55; - vxor(x00FFFF00, a5, a6); - vxor(x66666666, a2, a3); - vsel(x32353235, a3, x220522F5, a4); - vsel(x26253636, x66666666, x32353235, x4B4B7878); - vxor(x26DAC936, x00FFFF00, x26253636); - vsel(x0, x26DAC936, x694E5A8D, a1); - vxor(*out1, *out1, x0); + x00000F00 = a6 & x0F000F00; + x33333C33 = a3 ^ x00000F00; + x7B777E77 = x5A555A55 | x33333C33; + x0FF0F00F = a6 ^ x0FF00FF0; + x74878E78 = x7B777E77 ^ x0FF0F00F; + x30 = a1 & ~x00001841; + x31 = x30 ^ x74878E78; + *out4 ^= x31; - vxor(x738F9C63, a2, x26DAC936); - vsel(x11EF9867, x738F9C63, a5, x66666666); - vsel(x26DA9867, x26DAC936, x11EF9867, a6); + x003C003C = a5 & ~x3CC33CC3; + x5A7D5A7D = x5A555A55 | x003C003C; + x333300F0 = x00003CC3 ^ x33333C33; + x694E5A8D = x5A7D5A7D ^ x333300F0; - vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); - vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); - vxor(x4E639396, x0505F5F5, x4B666663); + x0FF0CCCC = x00003CC3 ^ x0FF0F00F; + x000F0303 = a4 & ~x0FF0CCCC; + x5A505854 = x5A555A55 & ~x000F0303; + x33CC000F = a5 ^ x333300F0; + x699C585B = x5A505854 ^ x33CC000F; - vsel(x4E4B393C, x4B4B7878, x4E639396, a2); - vnot(xFF00FF00, a5); - vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); - vxor(xB14EE41D, x4E4B393C, xFF05DD21); - vsel(x1, xB14EE41D, x26DA9867, a1); - vxor(*out2, *out2, x1); + x7F878F78 = x0F000F00 | x74878E78; + x21101013 = a3 & x699C585B; + x7F979F7B = x7F878F78 | x21101013; + x30030CC0 = x3CC33CC3 & ~x0FF0F00F; + x4F9493BB = x7F979F7B ^ x30030CC0; + x00 = x4F9493BB & ~a1; + x01 = x00 ^ x694E5A8D; + *out1 ^= x01; - vxor(xD728827B, x66666666, xB14EE41D); - vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); - vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); - vsel(x2, x699C585B, x4E639396, a1); - vxor(*out3, *out3, x2); + x6F9CDBFB = x699C585B | x4F9493BB; + x0000DBFB = a6 & x6F9CDBFB; + x00005151 = a2 & x0000DBFB; + x26DAC936 = x694E5A8D ^ x4F9493BB; + x26DA9867 = x00005151 ^ x26DAC936; - vsel(x778A8877, x738F9C63, x26DAC936, x26253636); - vxor(xA4A71E18, x738F9C63, xD728827B); - vsel(x74878E78, x778A8877, xA4A71E18, a4); + x27DA9877 = x21101013 | x26DA9867; + x27DA438C = x0000DBFB ^ x27DA9877; + x2625C9C9 = a5 ^ x26DAC936; + x27FFCBCD = x27DA438C | x2625C9C9; + x20 = x27FFCBCD & a1; + x21 = x20 ^ x699C585B; + *out3 ^= x21; - vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); - vsel(x74879639, x74878E78, a3, x204A5845); - vnot(x8B7869C6, x74879639); - vsel(x3, x74878E78, x8B7869C6, a1); - vxor(*out4, *out4, x3); + x27FF1036 = x0000DBFB ^ x27FFCBCD; + x27FF103E = x003C003C | x27FF1036; + xB06B6C44 = ~x4F9493BB; + x97947C7A = x27FF103E ^ xB06B6C44; + x10 = x97947C7A & ~a1; + x11 = x10 ^ x26DA9867; + *out2 ^= x11; } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) 
{ - u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C; - u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63; - u32 x3001F74E, x30555745, x693CD926; - u32 x0C0CD926, x0C3F25E9, x38D696A5; - u32 xC729695A; - u32 x03D2117B, xC778395B, xCB471CB2; - u32 x5425B13F, x56B3803F, x919AE965; - u32 x17B3023F, x75555755, x62E6556A, xA59E6C31; - u32 x0, x1, x2, x3; + u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001; + u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745; + u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3; + u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A; + u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926; + u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F; + u32 xF700A600, x61008000, x03B7856B, x62B7056B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505F5F5, a5, a1, a3); - vxor(x05FAF50A, a4, x0505F5F5); - vsel(x0F0F00FF, a3, a4, a5); - vsel(x22227777, a2, a5, a1); - vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777); - vxor(x34E9B34C, a2, x07DA807F); + x0C0C0C0C = a3 & ~a2; + x0000F0F0 = a5 & ~a3; + x00FFF00F = a4 ^ x0000F0F0; + x00555005 = a1 & x00FFF00F; + x00515001 = x00555005 & ~x0C0C0C0C; - vsel(x00FFF00F, x05FAF50A, a4, a3); - vsel(x0033FCCF, a5, x00FFF00F, a2); - vsel(x5565B15C, a1, x34E9B34C, x0033FCCF); - vsel(x0C0C3F3F, a3, a5, a2); - vxor(x59698E63, x5565B15C, x0C0C3F3F); + x33000330 = a2 & ~x00FFF00F; + x77555775 = a1 | x33000330; + x30303030 = a2 & ~a3; + x3030CFCF = a5 ^ x30303030; + x30104745 = x77555775 & x3030CFCF; + x30555745 = x00555005 | x30104745; - vsel(x3001F74E, x34E9B34C, a5, x05FAF50A); - vsel(x30555745, x3001F74E, a1, x00FFF00F); - vxor(x693CD926, x59698E63, x30555745); - vsel(x2, x693CD926, x59698E63, a6); - vxor(*out3, *out3, x2); + xFF000FF0 = ~x00FFF00F; + xCF1048B5 = x30104745 ^ xFF000FF0; + x080A080A = a3 & ~x77555775; + xC71A40BF = xCF1048B5 ^ x080A080A; + xCB164CB3 = x0C0C0C0C ^ xC71A40BF; + x10 = x00515001 | a6; + x11 = x10 ^ xCB164CB3; + *out2 ^= x11; - vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5); - vxor(x0C3F25E9, x0033FCCF, x0C0CD926); - vxor(x38D696A5, x34E9B34C, x0C3F25E9); + x9E4319E6 = a1 ^ xCB164CB3; + x000019E6 = a5 & x9E4319E6; + xF429738C = a2 ^ xC71A40BF; + xF4296A6A = x000019E6 ^ xF429738C; + xC729695A = x33000330 ^ xF4296A6A; - vnot(xC729695A, x38D696A5); + xC47C3D2F = x30555745 ^ xF4296A6A; + xF77F3F3F = a2 | xC47C3D2F; + x9E43E619 = a5 ^ x9E4319E6; + x693CD926 = xF77F3F3F ^ x9E43E619; + x20 = x30555745 & a6; + x21 = x20 ^ x693CD926; + *out3 ^= x21; - vsel(x03D2117B, x07DA807F, a2, x0C0CD926); - vsel(xC778395B, xC729695A, x03D2117B, x30555745); - vxor(xCB471CB2, x0C3F25E9, xC778395B); - vsel(x1, xCB471CB2, x34E9B34C, a6); - vxor(*out2, *out2, x1); + xF719A695 = x3030CFCF ^ xC729695A; + xF4FF73FF = a4 | xF429738C; + x03E6D56A = xF719A695 ^ xF4FF73FF; + x56B3803F = a1 ^ x03E6D56A; + x30 = x56B3803F & a6; + x31 = x30 ^ xC729695A; + *out4 ^= x31; - vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B); - vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63); - vxor(x919AE965, xC729695A, x56B3803F); - vsel(x3, xC729695A, x919AE965, a6); - vxor(*out4, *out4, x3); - - vsel(x17B3023F, x07DA807F, a2, x59698E63); - vor(x75555755, a1, x30555745); - vxor(x62E6556A, x17B3023F, x75555755); - vxor(xA59E6C31, xC778395B, x62E6556A); - vsel(x0, xA59E6C31, x38D696A5, a6); - vxor(*out1, *out1, x0); + xF700A600 = xF719A695 & ~a4; + x61008000 = x693CD926 & xF700A600; + x03B7856B = x00515001 ^ x03E6D56A; + x62B7056B = x61008000 ^ x03B7856B; + x00 = x62B7056B | a6; + x01 = x00 ^ xC729695A; + *out1 
^= x01; } #endif @@ -1452,60 +1539,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 KXX_DECL u32 k36, k37, k38, k39, k40, k41; KXX_DECL u32 k42, k43, k44, k45, k46, k47; - #if defined IS_AMD || defined IS_GENERIC - - #ifdef _unroll - #pragma unroll - #endif - for (u32 i = 0; i < 8; i++) - { - switch (i) - { - case 0: KEYSET00; break; - case 1: KEYSET02; break; - case 2: KEYSET04; break; - case 3: KEYSET06; break; - case 4: KEYSET10; break; - case 5: KEYSET12; break; - case 6: KEYSET14; break; - case 7: KEYSET16; break; - } - - s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30); - s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17); - s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05); - s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00); - s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02); - s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18); - s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06); - s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20); - - switch (i) - { - case 0: KEYSET01; break; - case 1: KEYSET03; break; - case 2: KEYSET05; break; - case 3: KEYSET07; break; - case 4: KEYSET11; break; - case 5: KEYSET13; break; - case 6: KEYSET15; break; - case 7: KEYSET17; break; - } - - s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62); - s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49); - s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37); - s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32); - s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34); - s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50); - s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); - s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); - } - - #endif - - #if defined IS_NV - #ifdef _unroll #pragma unroll #endif @@ -1599,8 +1632,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38); s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52); } - - #endif } void transpose32c (u32 data[32]) @@ -1694,7 +1725,7 @@ void transpose32c (u32 data[32]) swap (data[30], data[31], 1, 0x55555555); } -void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global 
u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * base @@ -2066,7 +2097,7 @@ void m03000m (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __glo } } -void m03000s (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) +void m03000s (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * base @@ -2481,7 +2512,7 @@ __kernel void m03000_tm (__global u32 *mod, __global 
bs_word_t *words_buf_r) } } -__kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -2499,7 +2530,7 @@ __kernel void m03000_mxx (__global pw_t *pws, __global const kernel_rule_t *rule m03000m (pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV0_buf, d_scryptV1_buf, d_scryptV2_buf, d_scryptV3_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, il_cnt, digests_cnt, digests_offset); } -__kernel void m03000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const 
u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03000_sxx (__global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __constant bs_word_t * words_buf_r, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m14000_a3-optimized.cl b/OpenCL/m14000_a3-optimized.cl index 4a1b81cd3..2a33d1f60 100644 --- a/OpenCL/m14000_a3-optimized.cl +++ b/OpenCL/m14000_a3-optimized.cl @@ -19,7 +19,7 @@ #endif #ifdef IS_AMD -#define KXX_DECL volatile +#define KXX_DECL #endif #ifdef IS_GENERIC @@ -898,11 +898,11 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c #if defined IS_AMD || defined IS_GENERIC /* - * Bitslice DES S-boxes making use of a vector conditional select operation - * (e.g., vsel on PowerPC with AltiVec). + * Bitslice DES S-boxes for x86 with MMX/SSE2/AVX and for typical RISC + * architectures. These use AND, OR, XOR, NOT, and AND-NOT gates. * - * Gate counts: 36 33 33 26 35 34 34 32 - * Average: 32.875 + * Gate counts: 49 44 46 33 48 46 46 41 + * Average: 44.125 * * Several same-gate-count expressions for each S-box are included (for use on * different CPUs/GPUs). 
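A minimal host-side sketch (plain C, not part of the patch; the helper name vsel_ref is hypothetical) of the selection that the vsel macro removed in the hunk below computed via bitselect(): each result bit is taken from a where the mask c is 0 and from b where c is 1. The rewritten S-box bodies do not emulate the select at all; they express each output directly with the AND, OR, XOR, NOT, and AND-NOT gates named in the comment above, which is why the per-S-box gate counts rise while the select operation disappears.

#include <assert.h>
#include <stdint.h>

/* Reference semantics of OpenCL bitselect (a, b, c) on 32-bit words:
 * bits of a where the corresponding bit of c is 0, bits of b where it is 1. */
static uint32_t vsel_ref (const uint32_t a, const uint32_t b, const uint32_t c)
{
  return (a & ~c) | (b & c);
}

int main (void)
{
  /* Each bit position is independent, so checking all 8 combinations of
   * (a, b, c) bits verifies the identity for every 32-bit input. */
  for (unsigned int i = 0; i < 8; i++)
  {
    const uint32_t a = (i & 1) ? 0xffffffff : 0;
    const uint32_t b = (i & 2) ? 0xffffffff : 0;
    const uint32_t c = (i & 4) ? 0xffffffff : 0;

    /* One gate-only equivalent of the select, using XOR and AND only. */
    assert (vsel_ref (a, b, c) == (a ^ ((a ^ b) & c)));
  }

  return 0;
}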
@@ -921,469 +921,556 @@ void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, c * The effort has been sponsored by Rapid7: http://www.rapid7.com */ -#define vnot(d,a) (d) = ~(a) -#define vor(d,a,b) (d) = (a) | (b) -#define vxor(d,a,b) (d) = (a) ^ (b) -#define vsel(d,a,b,c) (d) = bitselect ((a), (b), (c)) - void s1 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F0F3333, x3C3C3C3C, x55FF55FF, x69C369C3, x0903B73F, x09FCB7C0, x5CA9E295; - u32 x55AFD1B7, x3C3C69C3, x6993B874; - u32 x5CEDE59F, x09FCE295, x5D91A51E, x529E962D; - u32 x29EEADC0, x4B8771A3, x428679F3, x6B68D433; - u32 x5BA7E193, x026F12F3, x6B27C493, x94D83B6C; - u32 x965E0B0F, x3327A113, x847F0A1F, xD6E19C32; - u32 x0DBCE883, x3A25A215, x37994A96; - u32 xC9C93B62, x89490F02, xB96C2D16; - u32 x0, x1, x2, x3; + u32 x55005500, x5A0F5A0F, x3333FFFF, x66666666, x22226666, x2D2D6969, + x25202160; + u32 x00FFFF00, x33CCCC33, x4803120C, x2222FFFF, x6A21EDF3, x4A01CC93; + u32 x5555FFFF, x7F75FFFF, x00D20096, x7FA7FF69; + u32 x0A0A0000, x0AD80096, x00999900, x0AD99996; + u32 x22332233, x257AA5F0, x054885C0, xFAB77A3F, x2221EDF3, xD89697CC; + u32 x05B77AC0, x05F77AD6, x36C48529, x6391D07C, xBB0747B0; + u32 x4C460000, x4EDF9996, x2D4E49EA, xBBFFFFB0, x96B1B65A; + u32 x5AFF5AFF, x52B11215, x4201C010, x10B0D205; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F0F3333, a3, a2, a5); - vxor(x3C3C3C3C, a2, a3); - vor(x55FF55FF, a1, a4); - vxor(x69C369C3, x3C3C3C3C, x55FF55FF); - vsel(x0903B73F, a5, x0F0F3333, x69C369C3); - vxor(x09FCB7C0, a4, x0903B73F); - vxor(x5CA9E295, a1, x09FCB7C0); + x55005500 = a1 & ~a5; + x5A0F5A0F = a4 ^ x55005500; + x3333FFFF = a3 | a6; + x66666666 = a1 ^ a3; + x22226666 = x3333FFFF & x66666666; + x2D2D6969 = a4 ^ x22226666; + x25202160 = x2D2D6969 & ~x5A0F5A0F; - vsel(x55AFD1B7, x5CA9E295, x55FF55FF, x0F0F3333); - vsel(x3C3C69C3, x3C3C3C3C, x69C369C3, a5); - vxor(x6993B874, x55AFD1B7, x3C3C69C3); + x00FFFF00 = a5 ^ a6; + x33CCCC33 = a3 ^ x00FFFF00; + x4803120C = x5A0F5A0F & ~x33CCCC33; + x2222FFFF = a6 | x22226666; + x6A21EDF3 = x4803120C ^ x2222FFFF; + x4A01CC93 = x6A21EDF3 & ~x25202160; - vsel(x5CEDE59F, x55FF55FF, x5CA9E295, x6993B874); - vsel(x09FCE295, x09FCB7C0, x5CA9E295, a5); - vsel(x5D91A51E, x5CEDE59F, x6993B874, x09FCE295); - vxor(x529E962D, x0F0F3333, x5D91A51E); + x5555FFFF = a1 | a6; + x7F75FFFF = x6A21EDF3 | x5555FFFF; + x00D20096 = a5 & ~x2D2D6969; + x7FA7FF69 = x7F75FFFF ^ x00D20096; - vsel(x29EEADC0, x69C369C3, x09FCB7C0, x5CEDE59F); - vsel(x4B8771A3, x0F0F3333, x69C369C3, x5CA9E295); - vsel(x428679F3, a5, x4B8771A3, x529E962D); - vxor(x6B68D433, x29EEADC0, x428679F3); + x0A0A0000 = a4 & ~x5555FFFF; + x0AD80096 = x00D20096 ^ x0A0A0000; + x00999900 = x00FFFF00 & ~x66666666; + x0AD99996 = x0AD80096 | x00999900; - vsel(x5BA7E193, x5CA9E295, x4B8771A3, a3); - vsel(x026F12F3, a4, x0F0F3333, x529E962D); - vsel(x6B27C493, x6B68D433, x5BA7E193, x026F12F3); - vnot(x94D83B6C, x6B27C493); - vsel(x0, x94D83B6C, x6B68D433, a6); - vxor(*out1, *out1, x0); + x22332233 = a3 & ~x55005500; + x257AA5F0 = x5A0F5A0F ^ x7F75FFFF; + x054885C0 = x257AA5F0 & ~x22332233; + xFAB77A3F = ~x054885C0; + x2221EDF3 = x3333FFFF & x6A21EDF3; + xD89697CC = xFAB77A3F ^ x2221EDF3; + x20 = x7FA7FF69 & ~a2; + x21 = x20 ^ xD89697CC; + *out3 ^= x21; - vsel(x965E0B0F, x94D83B6C, a3, x428679F3); - vsel(x3327A113, x5BA7E193, a2, x69C369C3); - vsel(x847F0A1F, x965E0B0F, a4, x3327A113); - vxor(xD6E19C32, x529E962D, x847F0A1F); - 
vsel(x1, xD6E19C32, x5CA9E295, a6); - vxor(*out2, *out2, x1); + x05B77AC0 = x00FFFF00 ^ x054885C0; + x05F77AD6 = x00D20096 | x05B77AC0; + x36C48529 = x3333FFFF ^ x05F77AD6; + x6391D07C = a1 ^ x36C48529; + xBB0747B0 = xD89697CC ^ x6391D07C; + x00 = x25202160 | a2; + x01 = x00 ^ xBB0747B0; + *out1 ^= x01; - vsel(x0DBCE883, x09FCE295, x3C3C69C3, x847F0A1F); - vsel(x3A25A215, x3327A113, x5CA9E295, x0903B73F); - vxor(x37994A96, x0DBCE883, x3A25A215); - vsel(x3, x37994A96, x529E962D, a6); - vxor(*out4, *out4, x3); + x4C460000 = x3333FFFF ^ x7F75FFFF; + x4EDF9996 = x0AD99996 | x4C460000; + x2D4E49EA = x6391D07C ^ x4EDF9996; + xBBFFFFB0 = x00FFFF00 | xBB0747B0; + x96B1B65A = x2D4E49EA ^ xBBFFFFB0; + x10 = x4A01CC93 | a2; + x11 = x10 ^ x96B1B65A; + *out2 ^= x11; - vsel(xC9C93B62, x94D83B6C, x69C369C3, x5D91A51E); - vsel(x89490F02, a3, xC9C93B62, x965E0B0F); - vsel(xB96C2D16, x89490F02, x3C3C3C3C, x3A25A215); - vsel(x2, xB96C2D16, x6993B874, a6); - vxor(*out3, *out3, x2); + x5AFF5AFF = a5 | x5A0F5A0F; + x52B11215 = x5AFF5AFF & ~x2D4E49EA; + x4201C010 = x4A01CC93 & x6391D07C; + x10B0D205 = x52B11215 ^ x4201C010; + x30 = x10B0D205 | a2; + x31 = x30 ^ x0AD99996; + *out4 ^= x31; } void s2 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x55553333, x0055FF33, x33270F03, x66725A56, x00FFFF00, x668DA556; - u32 x0F0F5A56, xF0F0A5A9, xA5A5969A, xA55A699A; - u32 x0F5AF03C, x6600FF56, x87A5F09C; - u32 xA55A963C, x3C69C30F, xB44BC32D; - u32 x66D7CC56, x0F4B0F2D, x699CC37B, x996C66D2; - u32 xB46C662D, x278DB412, xB66CB43B; - u32 xD2DC4E52, x27993333, xD2994E33; - u32 x278D0F2D, x2E0E547B, x09976748; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x55550000, x00AA00FF, x33BB33FF; + u32 x33CC0000, x11441144, x11BB11BB, x003311BB; + u32 x00000F0F, x336600FF, x332200FF, x332200F0; + u32 x0302000F, xAAAAAAAA, xA9A8AAA5, x33CCCC33, x33CCC030, x9A646A95; + u32 x00333303, x118822B8, xA8208805, x3CC3C33C, x94E34B39; + u32 x0331330C, x3FF3F33C, xA9DF596A, xA9DF5F6F, x962CAC53; + u32 xA9466A6A, x3DA52153, x29850143, x33C0330C, x1A45324F; + u32 x0A451047, xBBDFDD7B, xB19ACD3C; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x55553333, a1, a3, a6); - vsel(x0055FF33, a6, x55553333, a5); - vsel(x33270F03, a3, a4, x0055FF33); - vxor(x66725A56, a1, x33270F03); - vxor(x00FFFF00, a5, a6); - vxor(x668DA556, x66725A56, x00FFFF00); + x33CC33CC = a2 ^ a5; - vsel(x0F0F5A56, a4, x66725A56, a6); - vnot(xF0F0A5A9, x0F0F5A56); - vxor(xA5A5969A, x55553333, xF0F0A5A9); - vxor(xA55A699A, x00FFFF00, xA5A5969A); - vsel(x1, xA55A699A, x668DA556, a2); - vxor(*out2, *out2, x1); + x55550000 = a1 & ~a6; + x00AA00FF = a5 & ~x55550000; + x33BB33FF = a2 | x00AA00FF; - vxor(x0F5AF03C, a4, x0055FF33); - vsel(x6600FF56, x66725A56, a6, x00FFFF00); - vsel(x87A5F09C, xA5A5969A, x0F5AF03C, x6600FF56); + x33CC0000 = x33CC33CC & ~a6; + x11441144 = a1 & x33CC33CC; + x11BB11BB = a5 ^ x11441144; + x003311BB = x11BB11BB & ~x33CC0000; - vsel(xA55A963C, xA5A5969A, x0F5AF03C, a5); - vxor(x3C69C30F, a3, x0F5AF03C); - vsel(xB44BC32D, xA55A963C, x3C69C30F, a1); + x00000F0F = a3 & a6; + x336600FF = x00AA00FF ^ x33CC0000; + x332200FF = x33BB33FF & x336600FF; + x332200F0 = x332200FF & ~x00000F0F; - vsel(x66D7CC56, x66725A56, x668DA556, xA5A5969A); - vsel(x0F4B0F2D, a4, xB44BC32D, a5); - vxor(x699CC37B, x66D7CC56, x0F4B0F2D); - vxor(x996C66D2, xF0F0A5A9, x699CC37B); - vsel(x0, x996C66D2, xB44BC32D, a2); - vxor(*out1, *out1, x0); + x0302000F = a3 & x332200FF; + xAAAAAAAA = ~a1; + 
xA9A8AAA5 = x0302000F ^ xAAAAAAAA; + x33CCCC33 = a6 ^ x33CC33CC; + x33CCC030 = x33CCCC33 & ~x00000F0F; + x9A646A95 = xA9A8AAA5 ^ x33CCC030; + x10 = a4 & ~x332200F0; + x11 = x10 ^ x9A646A95; + *out2 ^= x11; - vsel(xB46C662D, xB44BC32D, x996C66D2, x00FFFF00); - vsel(x278DB412, x668DA556, xA5A5969A, a1); - vsel(xB66CB43B, xB46C662D, x278DB412, x6600FF56); + x00333303 = a2 & ~x33CCC030; + x118822B8 = x11BB11BB ^ x00333303; + xA8208805 = xA9A8AAA5 & ~x118822B8; + x3CC3C33C = a3 ^ x33CCCC33; + x94E34B39 = xA8208805 ^ x3CC3C33C; + x00 = x33BB33FF & ~a4; + x01 = x00 ^ x94E34B39; + *out1 ^= x01; - vsel(xD2DC4E52, x66D7CC56, x996C66D2, xB44BC32D); - vsel(x27993333, x278DB412, a3, x0055FF33); - vsel(xD2994E33, xD2DC4E52, x27993333, a5); - vsel(x3, x87A5F09C, xD2994E33, a2); - vxor(*out4, *out4, x3); + x0331330C = x0302000F ^ x00333303; + x3FF3F33C = x3CC3C33C | x0331330C; + xA9DF596A = x33BB33FF ^ x9A646A95; + xA9DF5F6F = x00000F0F | xA9DF596A; + x962CAC53 = x3FF3F33C ^ xA9DF5F6F; - vsel(x278D0F2D, x278DB412, x0F4B0F2D, a6); - vsel(x2E0E547B, x0F0F5A56, xB66CB43B, x278D0F2D); - vxor(x09976748, x27993333, x2E0E547B); - vsel(x2, xB66CB43B, x09976748, a2); - vxor(*out3, *out3, x2); + xA9466A6A = x332200FF ^ x9A646A95; + x3DA52153 = x94E34B39 ^ xA9466A6A; + x29850143 = xA9DF5F6F & x3DA52153; + x33C0330C = x33CC33CC & x3FF3F33C; + x1A45324F = x29850143 ^ x33C0330C; + x20 = x1A45324F | a4; + x21 = x20 ^ x962CAC53; + *out3 ^= x21; + + x0A451047 = x1A45324F & ~x118822B8; + xBBDFDD7B = x33CCCC33 | xA9DF596A; + xB19ACD3C = x0A451047 ^ xBBDFDD7B; + x30 = x003311BB | a4; + x31 = x30 ^ xB19ACD3C; + *out4 ^= x31; } void s3 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0F330F33, x0F33F0CC, x5A66A599; - u32 x2111B7BB, x03FF3033, x05BB50EE, x074F201F, x265E97A4; - u32 x556BA09E, x665A93AC, x99A56C53; - u32 x25A1A797, x5713754C, x66559355, x47B135C6; - u32 x9A5A5C60, xD07AF8F8, x87698DB4, xE13C1EE1; - u32 x000CFFCF, x9A485CCE, x0521DDF4, x9E49915E; - u32 xD069F8B4, x030FF0C3, xD2699876; - u32 xD579DDF4, xD579F0C3, xB32C6396; - u32 x0, x1, x2, x3; + u32 x44444444, x0F0FF0F0, x4F4FF4F4, x00FFFF00, x00AAAA00, x4FE55EF4; + u32 x3C3CC3C3, x3C3C0000, x7373F4F4, x0C840A00; + u32 x00005EF4, x00FF5EFF, x00555455, x3C699796; + u32 x000FF000, x55AA55AA, x26D9A15E, x2FDFAF5F, x2FD00F5F; + u32 x55AAFFAA, x28410014, x000000FF, x000000CC, x284100D8; + u32 x204100D0, x3C3CC3FF, x1C3CC32F, x4969967A; + u32 x4CC44CC4, x40C040C0, xC3C33C3C, x9669C396, xD6A98356; + u32 xD6E9C3D6, x4CEEEEC4, x9A072D12, x001A000B, x9A1F2D1B; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0F330F33, a4, a3, a5); - vxor(x0F33F0CC, a6, x0F330F33); - vxor(x5A66A599, a2, x0F33F0CC); + x44444444 = a1 & ~a2; + x0F0FF0F0 = a3 ^ a6; + x4F4FF4F4 = x44444444 | x0F0FF0F0; + x00FFFF00 = a4 ^ a6; + x00AAAA00 = x00FFFF00 & ~a1; + x4FE55EF4 = x4F4FF4F4 ^ x00AAAA00; - vsel(x2111B7BB, a3, a6, x5A66A599); - vsel(x03FF3033, a5, a3, x0F33F0CC); - vsel(x05BB50EE, a5, x0F33F0CC, a2); - vsel(x074F201F, x03FF3033, a4, x05BB50EE); - vxor(x265E97A4, x2111B7BB, x074F201F); + x3C3CC3C3 = a2 ^ x0F0FF0F0; + x3C3C0000 = x3C3CC3C3 & ~a6; + x7373F4F4 = x4F4FF4F4 ^ x3C3C0000; + x0C840A00 = x4FE55EF4 & ~x7373F4F4; - vsel(x556BA09E, x5A66A599, x05BB50EE, a4); - vsel(x665A93AC, x556BA09E, x265E97A4, a3); - vnot(x99A56C53, x665A93AC); - vsel(x1, x265E97A4, x99A56C53, a1); - vxor(*out2, *out2, x1); + x00005EF4 = a6 & x4FE55EF4; + x00FF5EFF = a4 | x00005EF4; + x00555455 = a1 & x00FF5EFF; + x3C699796 
= x3C3CC3C3 ^ x00555455; + x30 = x4FE55EF4 & ~a5; + x31 = x30 ^ x3C699796; + *out4 ^= x31; - vxor(x25A1A797, x03FF3033, x265E97A4); - vsel(x5713754C, a2, x0F33F0CC, x074F201F); - vsel(x66559355, x665A93AC, a2, a5); - vsel(x47B135C6, x25A1A797, x5713754C, x66559355); + x000FF000 = x0F0FF0F0 & x00FFFF00; + x55AA55AA = a1 ^ a4; + x26D9A15E = x7373F4F4 ^ x55AA55AA; + x2FDFAF5F = a3 | x26D9A15E; + x2FD00F5F = x2FDFAF5F & ~x000FF000; - vxor(x9A5A5C60, x03FF3033, x99A56C53); - vsel(xD07AF8F8, x9A5A5C60, x556BA09E, x5A66A599); - vxor(x87698DB4, x5713754C, xD07AF8F8); - vxor(xE13C1EE1, x66559355, x87698DB4); + x55AAFFAA = x00AAAA00 | x55AA55AA; + x28410014 = x3C699796 & ~x55AAFFAA; + x000000FF = a4 & a6; + x000000CC = x000000FF & ~a2; + x284100D8 = x28410014 ^ x000000CC; - vsel(x000CFFCF, a4, a6, x0F33F0CC); - vsel(x9A485CCE, x9A5A5C60, x000CFFCF, x05BB50EE); - vsel(x0521DDF4, x87698DB4, a6, x9A5A5C60); - vsel(x9E49915E, x9A485CCE, x66559355, x0521DDF4); - vsel(x0, x9E49915E, xE13C1EE1, a1); - vxor(*out1, *out1, x0); + x204100D0 = x7373F4F4 & x284100D8; + x3C3CC3FF = x3C3CC3C3 | x000000FF; + x1C3CC32F = x3C3CC3FF & ~x204100D0; + x4969967A = a1 ^ x1C3CC32F; + x10 = x2FD00F5F & a5; + x11 = x10 ^ x4969967A; + *out2 ^= x11; - vsel(xD069F8B4, xD07AF8F8, x87698DB4, a5); - vsel(x030FF0C3, x000CFFCF, x03FF3033, a4); - vsel(xD2699876, xD069F8B4, x9E49915E, x030FF0C3); - vsel(x3, x5A66A599, xD2699876, a1); - vxor(*out4, *out4, x3); + x4CC44CC4 = x4FE55EF4 & ~a2; + x40C040C0 = x4CC44CC4 & ~a3; + xC3C33C3C = ~x3C3CC3C3; + x9669C396 = x55AAFFAA ^ xC3C33C3C; + xD6A98356 = x40C040C0 ^ x9669C396; + x00 = a5 & ~x0C840A00; + x01 = x00 ^ xD6A98356; + *out1 ^= x01; - vsel(xD579DDF4, xD07AF8F8, a2, x5713754C); - vsel(xD579F0C3, xD579DDF4, x030FF0C3, a6); - vxor(xB32C6396, x66559355, xD579F0C3); - vsel(x2, xB32C6396, x47B135C6, a1); - vxor(*out3, *out3, x2); + xD6E9C3D6 = x40C040C0 | x9669C396; + x4CEEEEC4 = x00AAAA00 | x4CC44CC4; + x9A072D12 = xD6E9C3D6 ^ x4CEEEEC4; + x001A000B = a4 & ~x4FE55EF4; + x9A1F2D1B = x9A072D12 | x001A000B; + x20 = a5 & ~x284100D8; + x21 = x20 ^ x9A1F2D1B; + *out3 ^= x21; } void s4 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x0505AFAF, x0555AF55, x0A5AA05A, x46566456, x0A0A5F5F, x0AF55FA0, - x0AF50F0F, x4CA36B59; - u32 xB35C94A6; - u32 x01BB23BB, x5050FAFA, xA31C26BE, xA91679E1; - u32 x56E9861E; - u32 x50E9FA1E, x0AF55F00, x827D9784, xD2946D9A; - u32 x31F720B3, x11FB21B3, x4712A7AD, x9586CA37; - u32 x0, x1, x2, x3; + u32 x5A5A5A5A, x0F0FF0F0; + u32 x33FF33FF, x33FFCC00, x0C0030F0, x0C0CC0C0, x0CF3C03F, x5EFBDA7F, + x52FBCA0F, x61C8F93C; + u32 x00C0C03C, x0F0F30C0, x3B92A366, x30908326, x3C90B3D6; + u32 x33CC33CC, x0C0CFFFF, x379E5C99, x04124C11, x56E9861E, xA91679E1; + u32 x9586CA37, x8402C833, x84C2C83F, xB35C94A6; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x0505AFAF, a5, a3, a1); - vsel(x0555AF55, x0505AFAF, a1, a4); - vxor(x0A5AA05A, a3, x0555AF55); - vsel(x46566456, a1, x0A5AA05A, a2); - vsel(x0A0A5F5F, a3, a5, a1); - vxor(x0AF55FA0, a4, x0A0A5F5F); - vsel(x0AF50F0F, x0AF55FA0, a3, a5); - vxor(x4CA36B59, x46566456, x0AF50F0F); + x5A5A5A5A = a1 ^ a3; + x0F0FF0F0 = a3 ^ a5; + x33FF33FF = a2 | a4; + x33FFCC00 = a5 ^ x33FF33FF; + x0C0030F0 = x0F0FF0F0 & ~x33FFCC00; + x0C0CC0C0 = x0F0FF0F0 & ~a2; + x0CF3C03F = a4 ^ x0C0CC0C0; + x5EFBDA7F = x5A5A5A5A | x0CF3C03F; + x52FBCA0F = x5EFBDA7F & ~x0C0030F0; + x61C8F93C = a2 ^ x52FBCA0F; - vnot(xB35C94A6, x4CA36B59); + x00C0C03C = x0CF3C03F & 
x61C8F93C; + x0F0F30C0 = x0F0FF0F0 & ~x00C0C03C; + x3B92A366 = x5A5A5A5A ^ x61C8F93C; + x30908326 = x3B92A366 & ~x0F0F30C0; + x3C90B3D6 = x0C0030F0 ^ x30908326; - vsel(x01BB23BB, a4, a2, x0555AF55); - vxor(x5050FAFA, a1, x0505AFAF); - vsel(xA31C26BE, xB35C94A6, x01BB23BB, x5050FAFA); - vxor(xA91679E1, x0A0A5F5F, xA31C26BE); + x33CC33CC = a2 ^ a4; + x0C0CFFFF = a5 | x0C0CC0C0; + x379E5C99 = x3B92A366 ^ x0C0CFFFF; + x04124C11 = x379E5C99 & ~x33CC33CC; + x56E9861E = x52FBCA0F ^ x04124C11; + x00 = a6 & ~x3C90B3D6; + x01 = x00 ^ x56E9861E; + *out1 ^= x01; - vnot(x56E9861E, xA91679E1); + xA91679E1 = ~x56E9861E; + x10 = x3C90B3D6 & ~a6; + x11 = x10 ^ xA91679E1; + *out2 ^= x11; - vsel(x50E9FA1E, x5050FAFA, x56E9861E, a4); - vsel(x0AF55F00, x0AF50F0F, x0AF55FA0, x0A0A5F5F); - vsel(x827D9784, xB35C94A6, x0AF55F00, a2); - vxor(xD2946D9A, x50E9FA1E, x827D9784); - vsel(x2, xD2946D9A, x4CA36B59, a6); - vxor(*out3, *out3, x2); - vsel(x3, xB35C94A6, xD2946D9A, a6); - vxor(*out4, *out4, x3); + x9586CA37 = x3C90B3D6 ^ xA91679E1; + x8402C833 = x9586CA37 & ~x33CC33CC; + x84C2C83F = x00C0C03C | x8402C833; + xB35C94A6 = x379E5C99 ^ x84C2C83F; + x20 = x61C8F93C | a6; + x21 = x20 ^ xB35C94A6; + *out3 ^= x21; - vsel(x31F720B3, a2, a4, x0AF55FA0); - vsel(x11FB21B3, x01BB23BB, x31F720B3, x5050FAFA); - vxor(x4712A7AD, x56E9861E, x11FB21B3); - vxor(x9586CA37, xD2946D9A, x4712A7AD); - vsel(x0, x56E9861E, x9586CA37, a6); - vxor(*out1, *out1, x0); - vsel(x1, x9586CA37, xA91679E1, a6); - vxor(*out2, *out2, x1); + x30 = a6 & x61C8F93C; + x31 = x30 ^ xB35C94A6; + *out4 ^= x31; } void s5 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x550F550F, xAAF0AAF0, xA5F5A5F5, x96C696C6, x00FFFF00, x963969C6; - u32 x2E3C2E3C, xB73121F7, x1501DF0F, x00558A5F, x2E69A463; - u32 x0679ED42, x045157FD, xB32077FF, x9D49D39C; - u32 xAC81CFB2, xF72577AF, x5BA4B81D; - u32 x5BA477AF, x4895469F, x3A35273A, x1A35669A; - u32 x12E6283D, x9E47D3D4, x1A676AB4; - u32 x891556DF, xE5E77F82, x6CF2295D; - u32 x2E3CA5F5, x9697C1C6, x369CC1D6; - u32 x0, x1, x2, x3; + u32 x77777777, x77770000, x22225555, x11116666, x1F1F6F6F; + u32 x70700000, x43433333, x00430033, x55557777, x55167744, x5A19784B; + u32 x5A1987B4, x7A3BD7F5, x003B00F5, x221955A0, x05050707, x271C52A7; + u32 x2A2A82A0, x6969B193, x1FE06F90, x16804E00, xE97FB1FF; + u32 x43403302, x35CAED30, x37DEFFB7, x349ECCB5, x0B01234A; + u32 x101884B4, x0FF8EB24, x41413333, x4FF9FB37, x4FC2FBC2; + u32 x22222222, x16BCEE97, x0F080B04, x19B4E593; + u32 x5C5C5C5C, x4448184C, x2DDABE71, x6992A63D; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x550F550F, a1, a3, a5); - vnot(xAAF0AAF0, x550F550F); - vsel(xA5F5A5F5, xAAF0AAF0, a1, a3); - vxor(x96C696C6, a2, xA5F5A5F5); - vxor(x00FFFF00, a5, a6); - vxor(x963969C6, x96C696C6, x00FFFF00); + x77777777 = a1 | a3; + x77770000 = x77777777 & ~a6; + x22225555 = a1 ^ x77770000; + x11116666 = a3 ^ x22225555; + x1F1F6F6F = a4 | x11116666; - vsel(x2E3C2E3C, a3, xAAF0AAF0, a2); - vsel(xB73121F7, a2, x963969C6, x96C696C6); - vsel(x1501DF0F, a6, x550F550F, xB73121F7); - vsel(x00558A5F, x1501DF0F, a5, a1); - vxor(x2E69A463, x2E3C2E3C, x00558A5F); + x70700000 = x77770000 & ~a4; + x43433333 = a3 ^ x70700000; + x00430033 = a5 & x43433333; + x55557777 = a1 | x11116666; + x55167744 = x00430033 ^ x55557777; + x5A19784B = a4 ^ x55167744; - vsel(x0679ED42, x00FFFF00, x2E69A463, x96C696C6); - vsel(x045157FD, a6, a1, x0679ED42); - vsel(xB32077FF, xB73121F7, a6, x045157FD); - vxor(x9D49D39C, 
x2E69A463, xB32077FF); - vsel(x2, x9D49D39C, x2E69A463, a4); - vxor(*out3, *out3, x2); + x5A1987B4 = a6 ^ x5A19784B; + x7A3BD7F5 = x22225555 | x5A1987B4; + x003B00F5 = a5 & x7A3BD7F5; + x221955A0 = x22225555 ^ x003B00F5; + x05050707 = a4 & x55557777; + x271C52A7 = x221955A0 ^ x05050707; - vsel(xAC81CFB2, xAAF0AAF0, x1501DF0F, x0679ED42); - vsel(xF72577AF, xB32077FF, x550F550F, a1); - vxor(x5BA4B81D, xAC81CFB2, xF72577AF); - vsel(x1, x5BA4B81D, x963969C6, a4); - vxor(*out2, *out2, x1); + x2A2A82A0 = x7A3BD7F5 & ~a1; + x6969B193 = x43433333 ^ x2A2A82A0; + x1FE06F90 = a5 ^ x1F1F6F6F; + x16804E00 = x1FE06F90 & ~x6969B193; + xE97FB1FF = ~x16804E00; + x20 = xE97FB1FF & ~a2; + x21 = x20 ^ x5A19784B; + *out3 ^= x21; - vsel(x5BA477AF, x5BA4B81D, xF72577AF, a6); - vsel(x4895469F, x5BA477AF, x00558A5F, a2); - vsel(x3A35273A, x2E3C2E3C, a2, x963969C6); - vsel(x1A35669A, x4895469F, x3A35273A, x5BA4B81D); + x43403302 = x43433333 & ~x003B00F5; + x35CAED30 = x2A2A82A0 ^ x1FE06F90; + x37DEFFB7 = x271C52A7 | x35CAED30; + x349ECCB5 = x37DEFFB7 & ~x43403302; + x0B01234A = x1F1F6F6F & ~x349ECCB5; - vsel(x12E6283D, a5, x5BA4B81D, x963969C6); - vsel(x9E47D3D4, x96C696C6, x9D49D39C, xAC81CFB2); - vsel(x1A676AB4, x12E6283D, x9E47D3D4, x4895469F); + x101884B4 = x5A1987B4 & x349ECCB5; + x0FF8EB24 = x1FE06F90 ^ x101884B4; + x41413333 = x43433333 & x55557777; + x4FF9FB37 = x0FF8EB24 | x41413333; + x4FC2FBC2 = x003B00F5 ^ x4FF9FB37; + x30 = x4FC2FBC2 & a2; + x31 = x30 ^ x271C52A7; + *out4 ^= x31; - vsel(x891556DF, xB32077FF, x4895469F, x3A35273A); - vsel(xE5E77F82, xF72577AF, x00FFFF00, x12E6283D); - vxor(x6CF2295D, x891556DF, xE5E77F82); - vsel(x3, x1A35669A, x6CF2295D, a4); - vxor(*out4, *out4, x3); + x22222222 = a1 ^ x77777777; + x16BCEE97 = x349ECCB5 ^ x22222222; + x0F080B04 = a4 & x0FF8EB24; + x19B4E593 = x16BCEE97 ^ x0F080B04; + x00 = x0B01234A | a2; + x01 = x00 ^ x19B4E593; + *out1 ^= x01; - vsel(x2E3CA5F5, x2E3C2E3C, xA5F5A5F5, a6); - vsel(x9697C1C6, x96C696C6, x963969C6, x045157FD); - vsel(x369CC1D6, x2E3CA5F5, x9697C1C6, x5BA477AF); - vsel(x0, x369CC1D6, x1A676AB4, a4); - vxor(*out1, *out1, x0); + x5C5C5C5C = x1F1F6F6F ^ x43433333; + x4448184C = x5C5C5C5C & ~x19B4E593; + x2DDABE71 = x22225555 ^ x0FF8EB24; + x6992A63D = x4448184C ^ x2DDABE71; + x10 = x1F1F6F6F & a2; + x11 = x10 ^ x6992A63D; + *out2 ^= x11; } void s6 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x555500FF, x666633CC, x606F30CF, x353A659A, x353A9A65, xCAC5659A; - u32 x353A6565, x0A3F0A6F, x6C5939A3, x5963A3C6; - u32 x35FF659A, x3AF06A95, x05CF0A9F, x16E94A97; - u32 x86CD4C9B, x12E0FFFD, x942D9A67; - u32 x142956AB, x455D45DF, x1C3EE619; - u32 x2AEA70D5, x20CF7A9F, x3CF19C86, x69A49C79; - u32 x840DBB67, x6DA19C1E, x925E63E1; - u32 x9C3CA761, x257A75D5, xB946D2B4; - u32 x0, x1, x2, x3; + u32 x33CC33CC; + u32 x3333FFFF, x11115555, x22DD6699, x22DD9966, x00220099; + u32 x00551144, x33662277, x5A5A5A5A, x7B7E7A7F, x59A31CE6; + u32 x09030C06, x09030000, x336622FF, x3A6522FF; + u32 x484D494C, x0000B6B3, x0F0FB9BC, x00FC00F9, x0FFFB9FD; + u32 x5DF75DF7, x116600F7, x1E69B94B, x1668B94B; + u32 x7B7B7B7B, x411E5984, x1FFFFDFD, x5EE1A479; + u32 x3CB4DFD2, x004B002D, xB7B2B6B3, xCCC9CDC8, xCC82CDE5; + u32 x0055EEBB, x5A5AECE9, x0050ECA9, xC5CAC1CE, xC59A2D67; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x555500FF, a1, a4, a5); - vxor(x666633CC, a2, x555500FF); - vsel(x606F30CF, x666633CC, a4, a3); - vxor(x353A659A, a1, x606F30CF); - vxor(x353A9A65, a5, 
x353A659A); - vnot(xCAC5659A, x353A9A65); + x33CC33CC = a2 ^ a5; - vsel(x353A6565, x353A659A, x353A9A65, a4); - vsel(x0A3F0A6F, a3, a4, x353A6565); - vxor(x6C5939A3, x666633CC, x0A3F0A6F); - vxor(x5963A3C6, x353A9A65, x6C5939A3); + x3333FFFF = a2 | a6; + x11115555 = a1 & x3333FFFF; + x22DD6699 = x33CC33CC ^ x11115555; + x22DD9966 = a6 ^ x22DD6699; + x00220099 = a5 & ~x22DD9966; - vsel(x35FF659A, a4, x353A659A, x353A6565); - vxor(x3AF06A95, a3, x35FF659A); - vsel(x05CF0A9F, a4, a3, x353A9A65); - vsel(x16E94A97, x3AF06A95, x05CF0A9F, x6C5939A3); + x00551144 = a1 & x22DD9966; + x33662277 = a2 ^ x00551144; + x5A5A5A5A = a1 ^ a3; + x7B7E7A7F = x33662277 | x5A5A5A5A; + x59A31CE6 = x22DD6699 ^ x7B7E7A7F; - vsel(x86CD4C9B, xCAC5659A, x05CF0A9F, x6C5939A3); - vsel(x12E0FFFD, a5, x3AF06A95, x16E94A97); - vsel(x942D9A67, x86CD4C9B, x353A9A65, x12E0FFFD); - vsel(x0, xCAC5659A, x942D9A67, a6); - vxor(*out1, *out1, x0); + x09030C06 = a3 & x59A31CE6; + x09030000 = x09030C06 & ~a6; + x336622FF = x00220099 | x33662277; + x3A6522FF = x09030000 ^ x336622FF; + x30 = x3A6522FF & a4; + x31 = x30 ^ x59A31CE6; + *out4 ^= x31; - vsel(x142956AB, x353A659A, x942D9A67, a2); - vsel(x455D45DF, a1, x86CD4C9B, x142956AB); - vxor(x1C3EE619, x5963A3C6, x455D45DF); - vsel(x3, x5963A3C6, x1C3EE619, a6); - vxor(*out4, *out4, x3); + x484D494C = a2 ^ x7B7E7A7F; + x0000B6B3 = a6 & ~x484D494C; + x0F0FB9BC = a3 ^ x0000B6B3; + x00FC00F9 = a5 & ~x09030C06; + x0FFFB9FD = x0F0FB9BC | x00FC00F9; - vsel(x2AEA70D5, x3AF06A95, x606F30CF, x353A9A65); - vsel(x20CF7A9F, x2AEA70D5, x05CF0A9F, x0A3F0A6F); - vxor(x3CF19C86, x1C3EE619, x20CF7A9F); - vxor(x69A49C79, x555500FF, x3CF19C86); + x5DF75DF7 = a1 | x59A31CE6; + x116600F7 = x336622FF & x5DF75DF7; + x1E69B94B = x0F0FB9BC ^ x116600F7; + x1668B94B = x1E69B94B & ~x09030000; + x20 = x00220099 | a4; + x21 = x20 ^ x1668B94B; + *out3 ^= x21; - vsel(x840DBB67, a5, x942D9A67, x86CD4C9B); - vsel(x6DA19C1E, x69A49C79, x3CF19C86, x840DBB67); - vnot(x925E63E1, x6DA19C1E); - vsel(x1, x925E63E1, x69A49C79, a6); - vxor(*out2, *out2, x1); + x7B7B7B7B = a2 | x5A5A5A5A; + x411E5984 = x3A6522FF ^ x7B7B7B7B; + x1FFFFDFD = x11115555 | x0FFFB9FD; + x5EE1A479 = x411E5984 ^ x1FFFFDFD; - vsel(x9C3CA761, x840DBB67, x1C3EE619, x3CF19C86); - vsel(x257A75D5, x455D45DF, x2AEA70D5, x606F30CF); - vxor(xB946D2B4, x9C3CA761, x257A75D5); - vsel(x2, x16E94A97, xB946D2B4, a6); - vxor(*out3, *out3, x2); + x3CB4DFD2 = x22DD6699 ^ x1E69B94B; + x004B002D = a5 & ~x3CB4DFD2; + xB7B2B6B3 = ~x484D494C; + xCCC9CDC8 = x7B7B7B7B ^ xB7B2B6B3; + xCC82CDE5 = x004B002D ^ xCCC9CDC8; + x10 = xCC82CDE5 & ~a4; + x11 = x10 ^ x5EE1A479; + *out2 ^= x11; + + x0055EEBB = a6 ^ x00551144; + x5A5AECE9 = a1 ^ x0F0FB9BC; + x0050ECA9 = x0055EEBB & x5A5AECE9; + xC5CAC1CE = x09030C06 ^ xCCC9CDC8; + xC59A2D67 = x0050ECA9 ^ xC5CAC1CE; + x00 = x0FFFB9FD & ~a4; + x01 = x00 ^ xC59A2D67; + *out1 ^= x01; } void s7 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) { - u32 x44447777, x4B4B7878, x22772277, x0505F5F5, x220522F5, x694E5A8D; - u32 x00FFFF00, x66666666, x32353235, x26253636, x26DAC936; - u32 x738F9C63, x11EF9867, x26DA9867; - u32 x4B4B9C63, x4B666663, x4E639396; - u32 x4E4B393C, xFF00FF00, xFF05DD21, xB14EE41D; - u32 xD728827B, x6698807B, x699C585B; - u32 x778A8877, xA4A71E18, x74878E78; - u32 x204A5845, x74879639, x8B7869C6; - u32 x0, x1, x2, x3; + u32 x0FF00FF0, x3CC33CC3, x00003CC3, x0F000F00, x5A555A55, x00001841; + u32 x00000F00, x33333C33, x7B777E77, x0FF0F00F, x74878E78; 
+ u32 x003C003C, x5A7D5A7D, x333300F0, x694E5A8D; + u32 x0FF0CCCC, x000F0303, x5A505854, x33CC000F, x699C585B; + u32 x7F878F78, x21101013, x7F979F7B, x30030CC0, x4F9493BB; + u32 x6F9CDBFB, x0000DBFB, x00005151, x26DAC936, x26DA9867; + u32 x27DA9877, x27DA438C, x2625C9C9, x27FFCBCD; + u32 x27FF1036, x27FF103E, xB06B6C44, x97947C7A; + u32 x00, x01, x10, x11, x20, x21, x30, x31; - vsel(x44447777, a2, a6, a3); - vxor(x4B4B7878, a4, x44447777); - vsel(x22772277, a3, a5, a2); - vsel(x0505F5F5, a6, a2, a4); - vsel(x220522F5, x22772277, x0505F5F5, a5); - vxor(x694E5A8D, x4B4B7878, x220522F5); + x0FF00FF0 = a4 ^ a5; + x3CC33CC3 = a3 ^ x0FF00FF0; + x00003CC3 = a6 & x3CC33CC3; + x0F000F00 = a4 & x0FF00FF0; + x5A555A55 = a2 ^ x0F000F00; + x00001841 = x00003CC3 & x5A555A55; - vxor(x00FFFF00, a5, a6); - vxor(x66666666, a2, a3); - vsel(x32353235, a3, x220522F5, a4); - vsel(x26253636, x66666666, x32353235, x4B4B7878); - vxor(x26DAC936, x00FFFF00, x26253636); - vsel(x0, x26DAC936, x694E5A8D, a1); - vxor(*out1, *out1, x0); + x00000F00 = a6 & x0F000F00; + x33333C33 = a3 ^ x00000F00; + x7B777E77 = x5A555A55 | x33333C33; + x0FF0F00F = a6 ^ x0FF00FF0; + x74878E78 = x7B777E77 ^ x0FF0F00F; + x30 = a1 & ~x00001841; + x31 = x30 ^ x74878E78; + *out4 ^= x31; - vxor(x738F9C63, a2, x26DAC936); - vsel(x11EF9867, x738F9C63, a5, x66666666); - vsel(x26DA9867, x26DAC936, x11EF9867, a6); + x003C003C = a5 & ~x3CC33CC3; + x5A7D5A7D = x5A555A55 | x003C003C; + x333300F0 = x00003CC3 ^ x33333C33; + x694E5A8D = x5A7D5A7D ^ x333300F0; - vsel(x4B4B9C63, x4B4B7878, x738F9C63, a6); - vsel(x4B666663, x4B4B9C63, x66666666, x00FFFF00); - vxor(x4E639396, x0505F5F5, x4B666663); + x0FF0CCCC = x00003CC3 ^ x0FF0F00F; + x000F0303 = a4 & ~x0FF0CCCC; + x5A505854 = x5A555A55 & ~x000F0303; + x33CC000F = a5 ^ x333300F0; + x699C585B = x5A505854 ^ x33CC000F; - vsel(x4E4B393C, x4B4B7878, x4E639396, a2); - vnot(xFF00FF00, a5); - vsel(xFF05DD21, xFF00FF00, x738F9C63, x32353235); - vxor(xB14EE41D, x4E4B393C, xFF05DD21); - vsel(x1, xB14EE41D, x26DA9867, a1); - vxor(*out2, *out2, x1); + x7F878F78 = x0F000F00 | x74878E78; + x21101013 = a3 & x699C585B; + x7F979F7B = x7F878F78 | x21101013; + x30030CC0 = x3CC33CC3 & ~x0FF0F00F; + x4F9493BB = x7F979F7B ^ x30030CC0; + x00 = x4F9493BB & ~a1; + x01 = x00 ^ x694E5A8D; + *out1 ^= x01; - vxor(xD728827B, x66666666, xB14EE41D); - vsel(x6698807B, x26DA9867, xD728827B, x4E4B393C); - vsel(x699C585B, x6698807B, x694E5A8D, xFF05DD21); - vsel(x2, x699C585B, x4E639396, a1); - vxor(*out3, *out3, x2); + x6F9CDBFB = x699C585B | x4F9493BB; + x0000DBFB = a6 & x6F9CDBFB; + x00005151 = a2 & x0000DBFB; + x26DAC936 = x694E5A8D ^ x4F9493BB; + x26DA9867 = x00005151 ^ x26DAC936; - vsel(x778A8877, x738F9C63, x26DAC936, x26253636); - vxor(xA4A71E18, x738F9C63, xD728827B); - vsel(x74878E78, x778A8877, xA4A71E18, a4); + x27DA9877 = x21101013 | x26DA9867; + x27DA438C = x0000DBFB ^ x27DA9877; + x2625C9C9 = a5 ^ x26DAC936; + x27FFCBCD = x27DA438C | x2625C9C9; + x20 = x27FFCBCD & a1; + x21 = x20 ^ x699C585B; + *out3 ^= x21; - vsel(x204A5845, x26DA9867, x694E5A8D, x26DAC936); - vsel(x74879639, x74878E78, a3, x204A5845); - vnot(x8B7869C6, x74879639); - vsel(x3, x74878E78, x8B7869C6, a1); - vxor(*out4, *out4, x3); + x27FF1036 = x0000DBFB ^ x27FFCBCD; + x27FF103E = x003C003C | x27FF1036; + xB06B6C44 = ~x4F9493BB; + x97947C7A = x27FF103E ^ xB06B6C44; + x10 = x97947C7A & ~a1; + x11 = x10 ^ x26DA9867; + *out2 ^= x11; } void s8 (const u32 a1, const u32 a2, const u32 a3, const u32 a4, const u32 a5, const u32 a6, u32 *out1, u32 *out2, u32 *out3, u32 *out4) 
 {
-  u32 x0505F5F5, x05FAF50A, x0F0F00FF, x22227777, x07DA807F, x34E9B34C;
-  u32 x00FFF00F, x0033FCCF, x5565B15C, x0C0C3F3F, x59698E63;
-  u32 x3001F74E, x30555745, x693CD926;
-  u32 x0C0CD926, x0C3F25E9, x38D696A5;
-  u32 xC729695A;
-  u32 x03D2117B, xC778395B, xCB471CB2;
-  u32 x5425B13F, x56B3803F, x919AE965;
-  u32 x17B3023F, x75555755, x62E6556A, xA59E6C31;
-  u32 x0, x1, x2, x3;
+  u32 x0C0C0C0C, x0000F0F0, x00FFF00F, x00555005, x00515001;
+  u32 x33000330, x77555775, x30303030, x3030CFCF, x30104745, x30555745;
+  u32 xFF000FF0, xCF1048B5, x080A080A, xC71A40BF, xCB164CB3;
+  u32 x9E4319E6, x000019E6, xF429738C, xF4296A6A, xC729695A;
+  u32 xC47C3D2F, xF77F3F3F, x9E43E619, x693CD926;
+  u32 xF719A695, xF4FF73FF, x03E6D56A, x56B3803F;
+  u32 xF700A600, x61008000, x03B7856B, x62B7056B;
+  u32 x00, x01, x10, x11, x20, x21, x30, x31;
 
-  vsel(x0505F5F5, a5, a1, a3);
-  vxor(x05FAF50A, a4, x0505F5F5);
-  vsel(x0F0F00FF, a3, a4, a5);
-  vsel(x22227777, a2, a5, a1);
-  vsel(x07DA807F, x05FAF50A, x0F0F00FF, x22227777);
-  vxor(x34E9B34C, a2, x07DA807F);
+  x0C0C0C0C = a3 & ~a2;
+  x0000F0F0 = a5 & ~a3;
+  x00FFF00F = a4 ^ x0000F0F0;
+  x00555005 = a1 & x00FFF00F;
+  x00515001 = x00555005 & ~x0C0C0C0C;
 
-  vsel(x00FFF00F, x05FAF50A, a4, a3);
-  vsel(x0033FCCF, a5, x00FFF00F, a2);
-  vsel(x5565B15C, a1, x34E9B34C, x0033FCCF);
-  vsel(x0C0C3F3F, a3, a5, a2);
-  vxor(x59698E63, x5565B15C, x0C0C3F3F);
+  x33000330 = a2 & ~x00FFF00F;
+  x77555775 = a1 | x33000330;
+  x30303030 = a2 & ~a3;
+  x3030CFCF = a5 ^ x30303030;
+  x30104745 = x77555775 & x3030CFCF;
+  x30555745 = x00555005 | x30104745;
 
-  vsel(x3001F74E, x34E9B34C, a5, x05FAF50A);
-  vsel(x30555745, x3001F74E, a1, x00FFF00F);
-  vxor(x693CD926, x59698E63, x30555745);
-  vsel(x2, x693CD926, x59698E63, a6);
-  vxor(*out3, *out3, x2);
+  xFF000FF0 = ~x00FFF00F;
+  xCF1048B5 = x30104745 ^ xFF000FF0;
+  x080A080A = a3 & ~x77555775;
+  xC71A40BF = xCF1048B5 ^ x080A080A;
+  xCB164CB3 = x0C0C0C0C ^ xC71A40BF;
+  x10 = x00515001 | a6;
+  x11 = x10 ^ xCB164CB3;
+  *out2 ^= x11;
 
-  vsel(x0C0CD926, x0C0C3F3F, x693CD926, a5);
-  vxor(x0C3F25E9, x0033FCCF, x0C0CD926);
-  vxor(x38D696A5, x34E9B34C, x0C3F25E9);
+  x9E4319E6 = a1 ^ xCB164CB3;
+  x000019E6 = a5 & x9E4319E6;
+  xF429738C = a2 ^ xC71A40BF;
+  xF4296A6A = x000019E6 ^ xF429738C;
+  xC729695A = x33000330 ^ xF4296A6A;
 
-  vnot(xC729695A, x38D696A5);
+  xC47C3D2F = x30555745 ^ xF4296A6A;
+  xF77F3F3F = a2 | xC47C3D2F;
+  x9E43E619 = a5 ^ x9E4319E6;
+  x693CD926 = xF77F3F3F ^ x9E43E619;
+  x20 = x30555745 & a6;
+  x21 = x20 ^ x693CD926;
+  *out3 ^= x21;
 
-  vsel(x03D2117B, x07DA807F, a2, x0C0CD926);
-  vsel(xC778395B, xC729695A, x03D2117B, x30555745);
-  vxor(xCB471CB2, x0C3F25E9, xC778395B);
-  vsel(x1, xCB471CB2, x34E9B34C, a6);
-  vxor(*out2, *out2, x1);
+  xF719A695 = x3030CFCF ^ xC729695A;
+  xF4FF73FF = a4 | xF429738C;
+  x03E6D56A = xF719A695 ^ xF4FF73FF;
+  x56B3803F = a1 ^ x03E6D56A;
+  x30 = x56B3803F & a6;
+  x31 = x30 ^ xC729695A;
+  *out4 ^= x31;
 
-  vsel(x5425B13F, x5565B15C, x0C0C3F3F, x03D2117B);
-  vsel(x56B3803F, x07DA807F, x5425B13F, x59698E63);
-  vxor(x919AE965, xC729695A, x56B3803F);
-  vsel(x3, xC729695A, x919AE965, a6);
-  vxor(*out4, *out4, x3);
-
-  vsel(x17B3023F, x07DA807F, a2, x59698E63);
-  vor(x75555755, a1, x30555745);
-  vxor(x62E6556A, x17B3023F, x75555755);
-  vxor(xA59E6C31, xC778395B, x62E6556A);
-  vsel(x0, xA59E6C31, x38D696A5, a6);
-  vxor(*out1, *out1, x0);
+  xF700A600 = xF719A695 & ~a4;
+  x61008000 = x693CD926 & xF700A600;
+  x03B7856B = x00515001 ^ x03E6D56A;
+  x62B7056B = x61008000 ^ x03B7856B;
+  x00 = x62B7056B | a6;
+  x01 = x00 ^ xC729695A;
+  *out1 ^= x01;
 }
 
 #endif
 
@@ -1452,60 +1539,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32
   KXX_DECL u32 k36, k37, k38, k39, k40, k41;
   KXX_DECL u32 k42, k43, k44, k45, k46, k47;
 
-  #if defined IS_AMD || defined IS_GENERIC
-
-  #ifdef _unroll
-  #pragma unroll
-  #endif
-  for (u32 i = 0; i < 8; i++)
-  {
-    switch (i)
-    {
-      case 0: KEYSET00; break;
-      case 1: KEYSET02; break;
-      case 2: KEYSET04; break;
-      case 3: KEYSET06; break;
-      case 4: KEYSET10; break;
-      case 5: KEYSET12; break;
-      case 6: KEYSET14; break;
-      case 7: KEYSET16; break;
-    }
-
-    s1(*D63 ^ k00, *D32 ^ k01, *D33 ^ k02, *D34 ^ k03, *D35 ^ k04, *D36 ^ k05, D08, D16, D22, D30);
-    s2(*D35 ^ k06, *D36 ^ k07, *D37 ^ k08, *D38 ^ k09, *D39 ^ k10, *D40 ^ k11, D12, D27, D01, D17);
-    s3(*D39 ^ k12, *D40 ^ k13, *D41 ^ k14, *D42 ^ k15, *D43 ^ k16, *D44 ^ k17, D23, D15, D29, D05);
-    s4(*D43 ^ k18, *D44 ^ k19, *D45 ^ k20, *D46 ^ k21, *D47 ^ k22, *D48 ^ k23, D25, D19, D09, D00);
-    s5(*D47 ^ k24, *D48 ^ k25, *D49 ^ k26, *D50 ^ k27, *D51 ^ k28, *D52 ^ k29, D07, D13, D24, D02);
-    s6(*D51 ^ k30, *D52 ^ k31, *D53 ^ k32, *D54 ^ k33, *D55 ^ k34, *D56 ^ k35, D03, D28, D10, D18);
-    s7(*D55 ^ k36, *D56 ^ k37, *D57 ^ k38, *D58 ^ k39, *D59 ^ k40, *D60 ^ k41, D31, D11, D21, D06);
-    s8(*D59 ^ k42, *D60 ^ k43, *D61 ^ k44, *D62 ^ k45, *D63 ^ k46, *D32 ^ k47, D04, D26, D14, D20);
-
-    switch (i)
-    {
-      case 0: KEYSET01; break;
-      case 1: KEYSET03; break;
-      case 2: KEYSET05; break;
-      case 3: KEYSET07; break;
-      case 4: KEYSET11; break;
-      case 5: KEYSET13; break;
-      case 6: KEYSET15; break;
-      case 7: KEYSET17; break;
-    }
-
-    s1(*D31 ^ k00, *D00 ^ k01, *D01 ^ k02, *D02 ^ k03, *D03 ^ k04, *D04 ^ k05, D40, D48, D54, D62);
-    s2(*D03 ^ k06, *D04 ^ k07, *D05 ^ k08, *D06 ^ k09, *D07 ^ k10, *D08 ^ k11, D44, D59, D33, D49);
-    s3(*D07 ^ k12, *D08 ^ k13, *D09 ^ k14, *D10 ^ k15, *D11 ^ k16, *D12 ^ k17, D55, D47, D61, D37);
-    s4(*D11 ^ k18, *D12 ^ k19, *D13 ^ k20, *D14 ^ k21, *D15 ^ k22, *D16 ^ k23, D57, D51, D41, D32);
-    s5(*D15 ^ k24, *D16 ^ k25, *D17 ^ k26, *D18 ^ k27, *D19 ^ k28, *D20 ^ k29, D39, D45, D56, D34);
-    s6(*D19 ^ k30, *D20 ^ k31, *D21 ^ k32, *D22 ^ k33, *D23 ^ k34, *D24 ^ k35, D35, D60, D42, D50);
-    s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38);
-    s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52);
-  }
-
-  #endif
-
-  #if defined IS_NV
-
   #ifdef _unroll
   #pragma unroll
   #endif
@@ -1599,8 +1632,6 @@ void DES (const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32
     s7(*D23 ^ k36, *D24 ^ k37, *D25 ^ k38, *D26 ^ k39, *D27 ^ k40, *D28 ^ k41, D63, D43, D53, D38);
     s8(*D27 ^ k42, *D28 ^ k43, *D29 ^ k44, *D30 ^ k45, *D31 ^ k46, *D00 ^ k47, D36, D58, D46, D52);
   }
-
-  #endif
 }
 
 void transpose32c (u32 data[32])
diff --git a/OpenCL/markov_be.cl b/OpenCL/markov_be.cl
index b178259ed..b62775c43 100644
--- a/OpenCL/markov_be.cl
+++ b/OpenCL/markov_be.cl
@@ -9,7 +9,7 @@
 
 #include "inc_types.cl"
 
-inline void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val)
+void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val)
 {
   __global const cs_t *cs = &root_css_buf[pw_r_len];
 
diff --git a/OpenCL/markov_le.cl b/OpenCL/markov_le.cl
index a90fc489c..2d7babff4 100644
--- a/OpenCL/markov_le.cl
+++ b/OpenCL/markov_le.cl
@@ -9,7 +9,7 @@
 
 #include "inc_types.cl"
 
-inline void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val)
+void generate_pw (u32 pw_buf[64], __global const cs_t *root_css_buf, __global const cs_t *markov_css_buf, const u32 pw_l_len, const u32 pw_r_len, const u32 mask80, const u32 bits14, const u32 bits15, u64 val)
 {
   __global const cs_t *cs = &root_css_buf[pw_r_len];
 
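For reference, the pattern behind the S-box rewrites above, as a minimal
sketch (not part of the patch). The old kernels built the S-boxes from
helper macros, where vsel() is assumed to wrap OpenCL bitselect(), the
call AMD's earlier compilers pattern-matched to the BFI_INT instruction.
The new s7/s8 bodies are not a mechanical expansion of the old vsel()
chains; they are a different gate decomposition of the same S-boxes using
only plain AND/OR/XOR/NOT, so the ROCm compiler no longer depends on that
mapping (which is also why every intermediate variable name changes):

    /* Assumed expansions of the old helper macros: */
    #define vxor(r, a, b)    (r) = (a) ^ (b)
    #define vor(r, a, b)     (r) = (a) | (b)
    #define vnot(r, a)       (r) = ~(a)
    #define vsel(r, a, b, c) (r) = bitselect ((a), (b), (c)) /* per bit: c ? b : a */

    /* Toy device function contrasting one old-style step from s8 with its
     * plain-ops equivalent; u32 is the kernels' uint typedef: */
    void vsel_demo (const u32 a3, const u32 a4, const u32 a5, u32 *out)
    {
      u32 x0F0F00FF;
      u32 y0F0F00FF;

      vsel (x0F0F00FF, a3, a4, a5);        /* relies on bitselect() -> BFI_INT */

      y0F0F00FF = (a3 & ~a5) | (a4 & a5);  /* same result from plain AND/OR/NOT */

      *out = x0F0F00FF ^ y0F0F00FF;        /* always zero: both forms agree */
    }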